diff --git a/Makefile b/Makefile index 855b42a8c..5f00462dd 100644 --- a/Makefile +++ b/Makefile @@ -94,6 +94,7 @@ o/$(MODE): \ rwc:/dev/shm \ rx:build/bootstrap \ rx:o/third_party/gcc \ + r:build/portcosmo.h \ /proc/stat \ rw:/dev/null \ w:o/stack.log \ diff --git a/build/definitions.mk b/build/definitions.mk index fdc598244..1fef28bff 100644 --- a/build/definitions.mk +++ b/build/definitions.mk @@ -88,11 +88,15 @@ ARCH = x86_64 HOSTS ?= freebsd openbsd netbsd rhel7 rhel5 xnu win10 endif +PORTCOSMO_CCFLAGS = -fportcosmo -include build/portcosmo.h + ifneq ("$(wildcard o/third_party/gcc/bin/x86_64-pc-linux-gnu-*)","") PREFIX = o/third_party/gcc/bin/x86_64-pc-linux-gnu- +DEFAULT_CPPFLAGS += $(PORTCOSMO_CCFLAGS) else IGNORE := $(shell build/bootstrap/unbundle.com) PREFIX = o/third_party/gcc/bin/x86_64-linux-musl- +DEFAULT_CPPFLAGS += $(PORTCOSMO_CCFLAGS) endif ifeq ($(ARCH), aarch64) PREFIX = o/third_party/gcc/bin/aarch64-linux-musl- @@ -163,7 +167,7 @@ TRADITIONAL = \ -Wno-return-type \ -Wno-pointer-sign -DEFAULT_CCFLAGS = \ +DEFAULT_CCFLAGS += \ -Wall \ -Werror \ -fdebug-prefix-map='$(PWD)'= \ @@ -206,7 +210,7 @@ MATHEMATICAL = \ -O3 \ -fwrapv -DEFAULT_CPPFLAGS = \ +DEFAULT_CPPFLAGS += \ -DCOSMO \ -DMODE='"$(MODE)"' \ -DIMAGE_BASE_VIRTUAL=$(IMAGE_BASE_VIRTUAL) \ diff --git a/build/portcosmo.h b/build/portcosmo.h new file mode 100644 index 000000000..49245010a --- /dev/null +++ b/build/portcosmo.h @@ -0,0 +1,361 @@ +#ifndef ACTUALLY_MODS +#define ACTUALLY_MODS +#if !(__ASSEMBLER__ + __LINKER__ + 0) +static const int __tmpcosmo_AF_ALG = -15823936; +static const int __tmpcosmo_AF_APPLETALK = -15823820; +static const int __tmpcosmo_AF_ASH = -15823924; +static const int __tmpcosmo_AF_ATMPVC = -15824070; +static const int __tmpcosmo_AF_ATMSVC = -15824056; +static const int __tmpcosmo_AF_AX25 = -15824014; +static const int __tmpcosmo_AF_BLUETOOTH = -15823992; +static const int __tmpcosmo_AF_BRIDGE = -15823812; +static const int __tmpcosmo_AF_CAIF = -15823850; +static const int __tmpcosmo_AF_CAN = -15823868; +static const int __tmpcosmo_AF_ECONET = -15823852; +static const int __tmpcosmo_AF_FILE = -15824118; +static const int __tmpcosmo_AF_IB = -15823966; +static const int __tmpcosmo_AF_IEEE802154 = -15823906; +static const int __tmpcosmo_AF_IPX = -15824002; +static const int __tmpcosmo_AF_IRDA = -15823860; +static const int __tmpcosmo_AF_ISDN = -15823978; +static const int __tmpcosmo_AF_IUCV = -15824106; +static const int __tmpcosmo_AF_KCM = -15824024; +static const int __tmpcosmo_AF_KEY = -15823948; +static const int __tmpcosmo_AF_LINK = -15823878; +static const int __tmpcosmo_AF_LLC = -15823824; +static const int __tmpcosmo_AF_LOCAL = -15823928; +static const int __tmpcosmo_AF_MAX = -15824082; +static const int __tmpcosmo_AF_MPLS = -15824026; +static const int __tmpcosmo_AF_NETBEUI = -15824124; +static const int __tmpcosmo_AF_NETLINK = -15824004; +static const int __tmpcosmo_AF_NETROM = -15823886; +static const int __tmpcosmo_AF_NFC = -15824142; +static const int __tmpcosmo_AF_PACKET = -15824028; +static const int __tmpcosmo_AF_PHONET = -15823830; +static const int __tmpcosmo_AF_PPPOX = -15823876; +static const int __tmpcosmo_AF_ROSE = -15824016; +static const int __tmpcosmo_AF_ROUTE = -15824100; +static const int __tmpcosmo_AF_RXRPC = -15823926; +static const int __tmpcosmo_AF_SECURITY = -15824136; +static const int __tmpcosmo_AF_SNA = -15823950; +static const int __tmpcosmo_AF_TIPC = -15824034; +static const int __tmpcosmo_AF_VSOCK = -15824146; +static const int __tmpcosmo_AF_WANPIPE = 
-15823960; +static const int __tmpcosmo_AF_X25 = -15823864; +static const int __tmpcosmo_E2BIG = -15823698; +static const int __tmpcosmo_EACCES = -15823580; +static const int __tmpcosmo_EADDRINUSE = -15823756; +static const int __tmpcosmo_EADDRNOTAVAIL = -15823592; +static const int __tmpcosmo_EADV = -15823574; +static const int __tmpcosmo_EAFNOSUPPORT = -15823748; +static const int __tmpcosmo_EAGAIN = -15823506; +static const int __tmpcosmo_EALREADY = -15823530; +static const int __tmpcosmo_EAUTH = -15823702; +static const int __tmpcosmo_EBADARCH = -15823738; +static const int __tmpcosmo_EBADE = -15823740; +static const int __tmpcosmo_EBADEXEC = -15823684; +static const int __tmpcosmo_EBADF = -15823744; +static const int __tmpcosmo_EBADFD = -15823554; +static const int __tmpcosmo_EBADMACHO = -15823618; +static const int __tmpcosmo_EBADMSG = -15823650; +static const int __tmpcosmo_EBADR = -15823570; +static const int __tmpcosmo_EBADRPC = -15823626; +static const int __tmpcosmo_EBADRQC = -15823688; +static const int __tmpcosmo_EBADSLT = -15823788; +static const int __tmpcosmo_EBUSY = -15823550; +static const int __tmpcosmo_ECANCELED = -15823676; +static const int __tmpcosmo_ECHILD = -15823662; +static const int __tmpcosmo_ECHRNG = -15823722; +static const int __tmpcosmo_ECOMM = -15823634; +static const int __tmpcosmo_ECONNABORTED = -15823616; +static const int __tmpcosmo_ECONNREFUSED = -15823556; +static const int __tmpcosmo_ECONNRESET = -15823548; +static const int __tmpcosmo_EDEADLK = -15823718; +static const int __tmpcosmo_EDESTADDRREQ = -15823658; +static const int __tmpcosmo_EDEVERR = -15823518; +static const int __tmpcosmo_EDOM = -15823798; +static const int __tmpcosmo_EDOTDOT = -15823726; +static const int __tmpcosmo_EDQUOT = -15823620; +static const int __tmpcosmo_EEXIST = -15823594; +static const int __tmpcosmo_EFAULT = -15823686; +static const int __tmpcosmo_EFBIG = -15823768; +static const int __tmpcosmo_EFTYPE = -15823568; +static const int __tmpcosmo_EHOSTDOWN = -15823596; +static const int __tmpcosmo_EHOSTUNREACH = -15823742; +static const int __tmpcosmo_EHWPOISON = -15823680; +static const int __tmpcosmo_EIDRM = -15823644; +static const int __tmpcosmo_EILSEQ = -15823540; +static const int __tmpcosmo_EINPROGRESS = -15823720; +static const int __tmpcosmo_EINTR = -15823710; +static const int __tmpcosmo_EINVAL = -15823624; +static const int __tmpcosmo_EIO = -15823544; +static const int __tmpcosmo_EISCONN = -15823704; +static const int __tmpcosmo_EISDIR = -15823758; +static const int __tmpcosmo_EISNAM = -15823682; +static const int __tmpcosmo_EKEYEXPIRED = -15823520; +static const int __tmpcosmo_EKEYREJECTED = -15823712; +static const int __tmpcosmo_EKEYREVOKED = -15823780; +static const int __tmpcosmo_EL2HLT = -15823510; +static const int __tmpcosmo_EL2NSYNC = -15823670; +static const int __tmpcosmo_EL3HLT = -15823792; +static const int __tmpcosmo_EL3RST = -15823654; +static const int __tmpcosmo_ELIBACC = -15823708; +static const int __tmpcosmo_ELIBBAD = -15823564; +static const int __tmpcosmo_ELIBEXEC = -15823696; +static const int __tmpcosmo_ELIBMAX = -15823724; +static const int __tmpcosmo_ELIBSCN = -15823786; +static const int __tmpcosmo_ELNRNG = -15823732; +static const int __tmpcosmo_ELOOP = -15823672; +static const int __tmpcosmo_EMEDIUMTYPE = -15823508; +static const int __tmpcosmo_EMFILE = -15823762; +static const int __tmpcosmo_EMLINK = -15823694; +static const int __tmpcosmo_EMSGSIZE = -15823536; +static const int __tmpcosmo_EMULTIHOP = -15823750; +static const int 
__tmpcosmo_ENAMETOOLONG = -15823600; +static const int __tmpcosmo_ENAVAIL = -15823656; +static const int __tmpcosmo_ENEEDAUTH = -15823766; +static const int __tmpcosmo_ENETDOWN = -15823730; +static const int __tmpcosmo_ENETRESET = -15823604; +static const int __tmpcosmo_ENETUNREACH = -15823524; +static const int __tmpcosmo_ENFILE = -15823700; +static const int __tmpcosmo_ENOANO = -15823734; +static const int __tmpcosmo_ENOATTR = -15823606; +static const int __tmpcosmo_ENOBUFS = -15823628; +static const int __tmpcosmo_ENOCSI = -15823760; +static const int __tmpcosmo_ENODATA = -15823516; +static const int __tmpcosmo_ENODEV = -15823774; +static const int __tmpcosmo_ENOENT = -15823590; +static const int __tmpcosmo_ENOEXEC = -15823512; +static const int __tmpcosmo_ENOKEY = -15823764; +static const int __tmpcosmo_ENOLCK = -15823782; +static const int __tmpcosmo_ENOLINK = -15823538; +static const int __tmpcosmo_ENOMEDIUM = -15823598; +static const int __tmpcosmo_ENOMEM = -15823514; +static const int __tmpcosmo_ENOMSG = -15823796; +static const int __tmpcosmo_ENONET = -15823642; +static const int __tmpcosmo_ENOPKG = -15823664; +static const int __tmpcosmo_ENOPOLICY = -15823716; +static const int __tmpcosmo_ENOPROTOOPT = -15823608; +static const int __tmpcosmo_ENOSPC = -15823646; +static const int __tmpcosmo_ENOSR = -15823558; +static const int __tmpcosmo_ENOSTR = -15823706; +static const int __tmpcosmo_ENOSYS = -15823636; +static const int __tmpcosmo_ENOTBLK = -15823640; +static const int __tmpcosmo_ENOTCONN = -15823778; +static const int __tmpcosmo_ENOTDIR = -15823648; +static const int __tmpcosmo_ENOTEMPTY = -15823552; +static const int __tmpcosmo_ENOTNAM = -15823532; +static const int __tmpcosmo_ENOTRECOVERABLE = -15823746; +static const int __tmpcosmo_ENOTSOCK = -15823582; +static const int __tmpcosmo_ENOTSUP = -15823602; +static const int __tmpcosmo_ENOTTY = -15823528; +static const int __tmpcosmo_ENOTUNIQ = -15823790; +static const int __tmpcosmo_ENXIO = -15823622; +static const int __tmpcosmo_EOPNOTSUPP = -15823588; +static const int __tmpcosmo_EOVERFLOW = -15823736; +static const int __tmpcosmo_EOWNERDEAD = -15823562; +static const int __tmpcosmo_EPERM = -15823754; +static const int __tmpcosmo_EPFNOSUPPORT = -15823690; +static const int __tmpcosmo_EPIPE = -15823534; +static const int __tmpcosmo_EPROCLIM = -15823610; +static const int __tmpcosmo_EPROCUNAVAIL = -15823546; +static const int __tmpcosmo_EPROGMISMATCH = -15823572; +static const int __tmpcosmo_EPROGUNAVAIL = -15823526; +static const int __tmpcosmo_EPROTO = -15823678; +static const int __tmpcosmo_EPROTONOSUPPORT = -15823576; +static const int __tmpcosmo_EPROTOTYPE = -15823614; +static const int __tmpcosmo_EPWROFF = -15823692; +static const int __tmpcosmo_ERANGE = -15823772; +static const int __tmpcosmo_EREMCHG = -15823666; +static const int __tmpcosmo_EREMOTE = -15823560; +static const int __tmpcosmo_EREMOTEIO = -15823794; +static const int __tmpcosmo_ERESTART = -15823728; +static const int __tmpcosmo_ERFKILL = -15823612; +static const int __tmpcosmo_EROFS = -15823566; +static const int __tmpcosmo_ERPCMISMATCH = -15823542; +static const int __tmpcosmo_ESHLIBVERS = -15823584; +static const int __tmpcosmo_ESHUTDOWN = -15823660; +static const int __tmpcosmo_ESOCKTNOSUPPORT = -15823776; +static const int __tmpcosmo_ESPIPE = -15823652; +static const int __tmpcosmo_ESRCH = -15823674; +static const int __tmpcosmo_ESRMNT = -15823714; +static const int __tmpcosmo_ESTALE = -15823632; +static const int __tmpcosmo_ESTRPIPE = -15823770; 
+static const int __tmpcosmo_ETIME = -15823630; +static const int __tmpcosmo_ETIMEDOUT = -15823522; +static const int __tmpcosmo_ETOOMANYREFS = -15823586; +static const int __tmpcosmo_ETXTBSY = -15823638; +static const int __tmpcosmo_EUCLEAN = -15823578; +static const int __tmpcosmo_EUNATCH = -15823504; +static const int __tmpcosmo_EUSERS = -15823668; +static const int __tmpcosmo_EXDEV = -15823752; +static const int __tmpcosmo_EXFULL = -15823784; +static const int __tmpcosmo_F_DUPFD_CLOEXEC = -15823938; +static const int __tmpcosmo_F_GETLEASE = -15823862; +static const int __tmpcosmo_F_GETLK = -15823916; +static const int __tmpcosmo_F_GETLK64 = -15823846; +static const int __tmpcosmo_F_GETOWN = -15824116; +static const int __tmpcosmo_F_GETPATH = -15824128; +static const int __tmpcosmo_F_GETPIPE_SZ = -15824006; +static const int __tmpcosmo_F_GETSIG = -15824112; +static const int __tmpcosmo_F_MAXFD = -15823896; +static const int __tmpcosmo_F_NOCACHE = -15824048; +static const int __tmpcosmo_F_NOTIFY = -15823898; +static const int __tmpcosmo_F_RDLCK = -15823826; +static const int __tmpcosmo_F_SETLEASE = -15823884; +static const int __tmpcosmo_F_SETLK = -15824088; +static const int __tmpcosmo_F_SETLK64 = -15824154; +static const int __tmpcosmo_F_SETLKW = -15824096; +static const int __tmpcosmo_F_SETLKW64 = -15824104; +static const int __tmpcosmo_F_SETOWN = -15823874; +static const int __tmpcosmo_F_SETPIPE_SZ = -15823958; +static const int __tmpcosmo_F_SETSIG = -15823832; +static const int __tmpcosmo_F_UNLCK = -15824148; +static const int __tmpcosmo_F_WRLCK = -15824058; +static const int __tmpcosmo_IFF_ALLMULTI = -15824140; +static const int __tmpcosmo_IFF_AUTOMEDIA = -15823962; +static const int __tmpcosmo_IFF_DYNAMIC = -15823848; +static const int __tmpcosmo_IFF_MASTER = -15823900; +static const int __tmpcosmo_IFF_MULTICAST = -15824000; +static const int __tmpcosmo_IFF_NOARP = -15823802; +static const int __tmpcosmo_IFF_NOTRAILERS = -15824130; +static const int __tmpcosmo_IFF_POINTOPOINT = -15824138; +static const int __tmpcosmo_IFF_PORTSEL = -15824150; +static const int __tmpcosmo_IFF_PROMISC = -15824010; +static const int __tmpcosmo_IFF_RUNNING = -15824080; +static const int __tmpcosmo_IFF_SLAVE = -15824022; +static const int __tmpcosmo_LOCAL_PEERCRED = -15823986; +static const int __tmpcosmo_SIGBUS = -15824132; +static const int __tmpcosmo_SIGCHLD = -15824036; +static const int __tmpcosmo_SIGCONT = -15823836; +static const int __tmpcosmo_SIGEMT = -15823972; +static const int __tmpcosmo_SIGINFO = -15824086; +static const int __tmpcosmo_SIGIO = -15823912; +static const int __tmpcosmo_SIGPOLL = -15823854; +static const int __tmpcosmo_SIGPWR = -15824114; +static const int __tmpcosmo_SIGRTMAX = -15824040; +static const int __tmpcosmo_SIGRTMIN = -15824134; +static const int __tmpcosmo_SIGSTKFLT = -15823934; +static const int __tmpcosmo_SIGSTOP = -15824158; +static const int __tmpcosmo_SIGSYS = -15823922; +static const int __tmpcosmo_SIGTHR = -15823902; +static const int __tmpcosmo_SIGTSTP = -15823988; +static const int __tmpcosmo_SIGUNUSED = -15823970; +static const int __tmpcosmo_SIGURG = -15823952; +static const int __tmpcosmo_SIGUSR1 = -15824018; +static const int __tmpcosmo_SIGUSR2 = -15823998; +static const int __tmpcosmo_SIG_BLOCK = -15823800; +static const int __tmpcosmo_SIG_SETMASK = -15824090; +static const int __tmpcosmo_SIG_UNBLOCK = -15824078; +static const int __tmpcosmo_SOL_AAL = -15823976; +static const int __tmpcosmo_SOL_ALG = -15823956; +static const int __tmpcosmo_SOL_ATM = 
-15823914; +static const int __tmpcosmo_SOL_BLUETOOTH = -15824062; +static const int __tmpcosmo_SOL_CAIF = -15823904; +static const int __tmpcosmo_SOL_DCCP = -15823814; +static const int __tmpcosmo_SOL_DECNET = -15823842; +static const int __tmpcosmo_SOL_ICMPV6 = -15823908; +static const int __tmpcosmo_SOL_IPV6 = -15823808; +static const int __tmpcosmo_SOL_IRDA = -15823880; +static const int __tmpcosmo_SOL_IUCV = -15824156; +static const int __tmpcosmo_SOL_KCM = -15824092; +static const int __tmpcosmo_SOL_LLC = -15823930; +static const int __tmpcosmo_SOL_NETBEUI = -15823894; +static const int __tmpcosmo_SOL_NETLINK = -15824012; +static const int __tmpcosmo_SOL_NFC = -15823942; +static const int __tmpcosmo_SOL_PACKET = -15823806; +static const int __tmpcosmo_SOL_PNPIPE = -15823968; +static const int __tmpcosmo_SOL_PPPOL2TP = -15823816; +static const int __tmpcosmo_SOL_RAW = -15824044; +static const int __tmpcosmo_SOL_RDS = -15824020; +static const int __tmpcosmo_SOL_RXRPC = -15823984; +static const int __tmpcosmo_SOL_SOCKET = -15824050; +static const int __tmpcosmo_SOL_TIPC = -15823940; +static const int __tmpcosmo_SOL_X25 = -15823856; +static const int __tmpcosmo_SO_ACCEPTCONN = -15823872; +static const int __tmpcosmo_SO_ATTACH_BPF = -15824072; +static const int __tmpcosmo_SO_ATTACH_FILTER = -15824094; +static const int __tmpcosmo_SO_ATTACH_REUSEPORT_CBPF = -15823964; +static const int __tmpcosmo_SO_ATTACH_REUSEPORT_EBPF = -15824060; +static const int __tmpcosmo_SO_BINDTODEVICE = -15823990; +static const int __tmpcosmo_SO_BPF_EXTENSIONS = -15824030; +static const int __tmpcosmo_SO_BROADCAST = -15823882; +static const int __tmpcosmo_SO_BSDCOMPAT = -15824038; +static const int __tmpcosmo_SO_BUSY_POLL = -15823944; +static const int __tmpcosmo_SO_CNX_ADVICE = -15823828; +static const int __tmpcosmo_SO_DETACH_BPF = -15824068; +static const int __tmpcosmo_SO_DETACH_FILTER = -15824032; +static const int __tmpcosmo_SO_DOMAIN = -15823980; +static const int __tmpcosmo_SO_DONTROUTE = -15823918; +static const int __tmpcosmo_SO_ERROR = -15823892; +static const int __tmpcosmo_SO_EXCLUSIVEADDRUSE = -15823858; +static const int __tmpcosmo_SO_GET_FILTER = -15823834; +static const int __tmpcosmo_SO_INCOMING_CPU = -15824074; +static const int __tmpcosmo_SO_KEEPALIVE = -15823890; +static const int __tmpcosmo_SO_LINGER = -15824084; +static const int __tmpcosmo_SO_LOCK_FILTER = -15823804; +static const int __tmpcosmo_SO_MARK = -15824008; +static const int __tmpcosmo_SO_MAX_PACING_RATE = -15824120; +static const int __tmpcosmo_SO_NOFCS = -15823818; +static const int __tmpcosmo_SO_NO_CHECK = -15824152; +static const int __tmpcosmo_SO_OOBINLINE = -15823838; +static const int __tmpcosmo_SO_PASSCRED = -15823888; +static const int __tmpcosmo_SO_PASSSEC = -15823866; +static const int __tmpcosmo_SO_PEEK_OFF = -15823870; +static const int __tmpcosmo_SO_PEERCRED = -15823954; +static const int __tmpcosmo_SO_PEERNAME = -15824042; +static const int __tmpcosmo_SO_PEERSEC = -15823844; +static const int __tmpcosmo_SO_PRIORITY = -15824122; +static const int __tmpcosmo_SO_PROTOCOL = -15823982; +static const int __tmpcosmo_SO_RCVBUF = -15823974; +static const int __tmpcosmo_SO_RCVBUFFORCE = -15823994; +static const int __tmpcosmo_SO_RCVLOWAT = -15824076; +static const int __tmpcosmo_SO_RCVTIMEO = -15824046; +static const int __tmpcosmo_SO_REUSEADDR = -15823810; +static const int __tmpcosmo_SO_REUSEPORT = -15823822; +static const int __tmpcosmo_SO_RXQ_OVFL = -15824066; +static const int __tmpcosmo_SO_SECURITY_AUTHENTICATION = 
-15824098; +static const int __tmpcosmo_SO_SECURITY_ENCRYPTION_NETWORK = -15824126; +static const int __tmpcosmo_SO_SELECT_ERR_QUEUE = -15824052; +static const int __tmpcosmo_SO_SETFIB = -15823920; +static const int __tmpcosmo_SO_SNDBUF = -15824102; +static const int __tmpcosmo_SO_SNDBUFFORCE = -15823840; +static const int __tmpcosmo_SO_SNDLOWAT = -15823946; +static const int __tmpcosmo_SO_SNDTIMEO = -15824064; +static const int __tmpcosmo_SO_TIMESTAMP = -15823932; +static const int __tmpcosmo_SO_TIMESTAMPING = -15824054; +static const int __tmpcosmo_SO_TIMESTAMPNS = -15823910; +static const int __tmpcosmo_SO_TYPE = -15824144; +static const int __tmpcosmo_SO_USELOOPBACK = -15824110; +static const int __tmpcosmo_SO_WIFI_STATUS = -15824108; +static const unsigned int __tmpcosmo_B1000000 = 15823512; +static const unsigned int __tmpcosmo_B110 = 15823518; +static const unsigned int __tmpcosmo_B115200 = 15823540; +static const unsigned int __tmpcosmo_B1152000 = 15823538; +static const unsigned int __tmpcosmo_B1200 = 15823548; +static const unsigned int __tmpcosmo_B134 = 15823510; +static const unsigned int __tmpcosmo_B150 = 15823542; +static const unsigned int __tmpcosmo_B1500000 = 15823508; +static const unsigned int __tmpcosmo_B1800 = 15823522; +static const unsigned int __tmpcosmo_B19200 = 15823546; +static const unsigned int __tmpcosmo_B200 = 15823528; +static const unsigned int __tmpcosmo_B2000000 = 15823524; +static const unsigned int __tmpcosmo_B230400 = 15823516; +static const unsigned int __tmpcosmo_B2400 = 15823526; +static const unsigned int __tmpcosmo_B2500000 = 15823558; +static const unsigned int __tmpcosmo_B300 = 15823534; +static const unsigned int __tmpcosmo_B3000000 = 15823530; +static const unsigned int __tmpcosmo_B3500000 = 15823544; +static const unsigned int __tmpcosmo_B38400 = 15823514; +static const unsigned int __tmpcosmo_B4000000 = 15823520; +static const unsigned int __tmpcosmo_B4800 = 15823556; +static const unsigned int __tmpcosmo_B50 = 15823532; +static const unsigned int __tmpcosmo_B500000 = 15823550; +static const unsigned int __tmpcosmo_B57600 = 15823552; +static const unsigned int __tmpcosmo_B576000 = 15823506; +static const unsigned int __tmpcosmo_B600 = 15823554; +static const unsigned int __tmpcosmo_B75 = 15823536; +static const unsigned int __tmpcosmo_B9600 = 15823504; +static const unsigned short __tmpcosmo_AF_INET6 = 58236; +#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */ +#endif /* ACTUALLY_MODS */ diff --git a/libc/calls/struct/timespec.h b/libc/calls/struct/timespec.h index 4d8404f74..8aa213a42 100644 --- a/libc/calls/struct/timespec.h +++ b/libc/calls/struct/timespec.h @@ -1,11 +1,27 @@ #ifndef COSMOPOLITAN_LIBC_CALLS_STRUCT_TIMESPEC_H_ #define COSMOPOLITAN_LIBC_CALLS_STRUCT_TIMESPEC_H_ + +#ifdef COSMO +#define timespec_get __timespec_get +#define timespec_getres __timespec_getres +#define timespec_cmp __timespec_cmp +#define timespec_tomicros __timespec_tomicros +#define timespec_tomillis __timespec_tomillis +#define timespec_tonanos __timespec_tonanos +#define timespec_add __timespec_add +#define timespec_fromnanos __timespec_fromnanos +#define timespec_frommicros __timespec_frommicros +#define timespec_frommillis __timespec_frommillis +#define timespec_real __timespec_real +#define timespec_mono __timespec_mono +#define timespec_sleep __timespec_sleep +#define timespec_sleep_until __timespec_sleep_until +#define timespec_sub __timespec_sub +#endif /* COSMO */ + #if !(__ASSEMBLER__ + __LINKER__ + 0) COSMOPOLITAN_C_START_ -#define timespec_zero 
((struct timespec){0}) -#define timespec_max ((struct timespec){0x7fffffffffffffff, 999999999}) - struct timespec { int64_t tv_sec; int64_t tv_nsec; /* nanoseconds */ @@ -18,9 +34,14 @@ int futimens(int, const struct timespec[2]); int nanosleep(const struct timespec *, struct timespec *); int sys_futex(int *, int, int, const struct timespec *, int *); int utimensat(int, const char *, const struct timespec[2], int); + +#ifdef COSMO +/* cosmopolitan libc's non-posix timespec library + removed by default due to emacs codebase clash */ +#define timespec_zero ((struct timespec){0}) +#define timespec_max ((struct timespec){0x7fffffffffffffff, 999999999}) int timespec_get(struct timespec *, int); int timespec_getres(struct timespec *, int); - int timespec_cmp(struct timespec, struct timespec) pureconst; int64_t timespec_tomicros(struct timespec) pureconst; int64_t timespec_tomillis(struct timespec) pureconst; @@ -34,6 +55,7 @@ struct timespec timespec_mono(void); struct timespec timespec_sleep(struct timespec); int timespec_sleep_until(struct timespec); struct timespec timespec_sub(struct timespec, struct timespec) pureconst; +#endif /* COSMO */ COSMOPOLITAN_C_END_ #endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */ diff --git a/libc/calls/struct/timeval.h b/libc/calls/struct/timeval.h index 01e6f4638..5c31ddeae 100644 --- a/libc/calls/struct/timeval.h +++ b/libc/calls/struct/timeval.h @@ -2,6 +2,16 @@ #define COSMOPOLITAN_LIBC_CALLS_STRUCT_TIMEVAL_H_ #include "libc/calls/struct/timespec.h" #include "libc/time/struct/timezone.h" + +#ifdef COSMO +#define timeval_cmp __timeval_cmp +#define timeval_frommicros __timeval_frommicros +#define timeval_frommillis __timeval_frommillis +#define timeval_add __timeval_add +#define timeval_sub __timeval_sub +#define timeval_totimespec __timeval_totimespec +#endif /* COSMO */ + #if !(__ASSEMBLER__ + __LINKER__ + 0) COSMOPOLITAN_C_START_ @@ -16,6 +26,9 @@ int gettimeofday(struct timeval *, struct timezone *); int lutimes(const char *, const struct timeval[2]); int utimes(const char *, const struct timeval[2]); +#ifdef COSMO +/* cosmopolitan libc's non-posix timevals library + removed by default due to emacs codebase clash */ int timeval_cmp(struct timeval, struct timeval) pureconst; struct timeval timeval_frommicros(int64_t) pureconst; struct timeval timeval_frommillis(int64_t) pureconst; @@ -23,6 +36,7 @@ struct timeval timeval_add(struct timeval, struct timeval) pureconst; struct timeval timeval_sub(struct timeval, struct timeval) pureconst; struct timeval timespec_totimeval(struct timespec) pureconst; struct timespec timeval_totimespec(struct timeval) pureconst; +#endif /* COSMO */ COSMOPOLITAN_C_END_ #endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */ diff --git a/libc/integral/c.inc b/libc/integral/c.inc index 28619eb49..9c89e06d3 100644 --- a/libc/integral/c.inc +++ b/libc/integral/c.inc @@ -725,6 +725,9 @@ void abort(void) wontreturn; #endif /* GCC8+ */ #if __GNUC__ + 0 >= 9 #pragma GCC diagnostic ignored /* "always true" breaks dce */ "-Waddress" +#if __GNUC__ >= 11 +#pragma GCC diagnostic ignored /* orwellian */ "-Wold-style-definition" +#endif /* GCC11+ */ #endif /* GCC9+ */ #endif /* !C++ */ #endif /* GCC && !LLVM */ diff --git a/third_party/gcc/README.cosmo b/third_party/gcc/README.cosmo index 0b2e73410..dbe836485 100644 --- a/third_party/gcc/README.cosmo +++ b/third_party/gcc/README.cosmo @@ -1,232 +1,28 @@ -This is a modern statically-linked GNU C2X toolchain. 
+DESCRIPTION -You have the freedom to obtain the original sources to these binaries, -and build ones just like them, by visiting: + Cosmopolitan GCC + Prebuilt x86_64-linux binaries + An APE-friendly C/C++ compiler - https://www.gnu.org/ - https://github.com/richfelker/musl-cross-make +LICENSE -The musl-cross-make tool also produces libraries and header files. We've -only vendored the statically-linked executable files, since Cosmopolitan -won't depend on GPL-licensed headers / runtime libraries. + GPLv3 and other licenses (see LICENSE.txt) -We haven't made any modifications to the original software. The versions -we chose are documented in $PKG/LICENSE.txt. Here's our Musl -build config for maximum transparency: +ORIGIN -commit 38e52db8358c043ae82b346a2e6e66bc86a53bc1 -Author: Rich Felker -Date: Wed Dec 18 14:29:07 2019 -0500 + @ahgamut's musl-cross-make fork + https://github.com/ahgamut/musl-cross-make/ + d0f33e2162cf5e5b30cdf3b3accc0d0f7756830c - switch linux kernel headers to 4.19.88 by default - - using slim headers-only version. this change is needed to support all - future versions of musl on 32-bit archs, since prior to 4.16 the - kernel headers had incompatibility with userspace time_t not matching - the kernel's old (32-bit) time_t. support for older headers will be - dropped entirely soon. +MODIFICATIONS -TARGET = x86_64-linux-musl -OUTPUT = /opt/cross9 -GCC_VER = 9.2.0 -export LANG=en_US.UTF-8 -export LC_CTYPE=en_US.UTF-8 -COMMON_CONFIG += CC="/opt/cross9/bin/x86_64-linux-musl-cc -static --static -g -Os -ftree-vectorize -fvect-cost-model=unlimited -mstringop-strategy=vector_loop -save-temps -fno-ident" -COMMON_CONFIG += CXX="/opt/cross9/bin/x86_64-linux-musl-c++ -static --static -g -Os -ftree-vectorize -fvect-cost-model=unlimited -mstringop-strategy=vector_loop -save-temps -fno-ident" -COMMON_CONFIG += LD="/opt/cross9/bin/x86_64-linux-musl-ld --build-id=none" -COMMON_CONFIG += NM="/opt/cross9/bin/x86_64-linux-musl-nm" -COMMON_CONFIG += LDFLAGS="-Wl,--build-id=none" -COMMON_CONFIG += OBJCOPY="/opt/cross9/bin/x86_64-linux-musl-objcopy" -COMMON_CONFIG += --disable-nls --disable-lto -GCC_CONFIG += --enable-languages=c,c++ -GCC_CONFIG += --disable-multilib -GCC_CONFIG += --with-gnu-as -GCC_CONFIG += --with-gnu-ld -GCC_CONFIG += --disable-multilib -GCC_CONFIG += --enable-sjlj-exceptions -GCC_CONFIG += --disable-threads -GCC_CONFIG += --disable-tls -COMMON_CONFIG += --with-debug-prefix-map=$(CURDIR)= + ahgamut's musl-cross-make fork includes a 2kLOC patch that modifies + GCC so it'll compile C code like `switch(errno){case EINVAL: etc.}` -#!/bin/sh -set -e -export LC_ALL=C -export GUNZ="/bin/gzip --rsyncable -9 -c" -BASE=/opt/cross9 -PKG=third_party/gcc -VERS=9.2.0 +SEE ALSO -if [ ! 
-d $BASE ]; then - echo error: run make install >&2 - exit 1 -fi + third_party/gcc/portcosmo.patch -if [ -d $BASE/$PKG ]; then - rm -rf $BASE/$PKG -fi +NOTES -mkdir -p $BASE/$PKG/bin -mkdir -p $BASE/$PKG/libexec/gcc/x86_64-linux-musl/$VERS -mkdir -p $BASE/$PKG/x86_64-linux-musl/bin - -cp $BASE/bin/x86_64-linux-musl-gcov-dump $BASE/$PKG/bin/x86_64-linux-musl-gcov-dump -cp $BASE/bin/x86_64-linux-musl-cc $BASE/$PKG/bin/x86_64-linux-musl-gcc -cp $BASE/bin/x86_64-linux-musl-addr2line $BASE/$PKG/bin/x86_64-linux-musl-addr2line -cp $BASE/bin/x86_64-linux-musl-ar $BASE/$PKG/bin/x86_64-linux-musl-ar -cp $BASE/libexec/gcc/x86_64-linux-musl/9.2.0/cc1plus $BASE/$PKG/libexec/gcc/x86_64-linux-musl/9.2.0/cc1plus -cp $BASE/bin/x86_64-linux-musl-c++ $BASE/$PKG/bin/x86_64-linux-musl-g++ -cp $BASE/libexec/gcc/x86_64-linux-musl/9.2.0/collect2 $BASE/$PKG/libexec/gcc/x86_64-linux-musl/9.2.0/collect2 -cp $BASE/bin/x86_64-linux-musl-gcc-nm $BASE/$PKG/bin/x86_64-linux-musl-gcc-nm -cp $BASE/bin/x86_64-linux-musl-c++filt $BASE/$PKG/bin/x86_64-linux-musl-c++filt -cp $BASE/bin/x86_64-linux-musl-elfedit $BASE/$PKG/bin/x86_64-linux-musl-elfedit -cp $BASE/bin/x86_64-linux-musl-ld $BASE/$PKG/x86_64-linux-musl/bin/ld.bfd -cp $BASE/bin/x86_64-linux-musl-size $BASE/$PKG/bin/x86_64-linux-musl-size -cp $BASE/bin/x86_64-linux-musl-strings $BASE/$PKG/bin/x86_64-linux-musl-strings -cp $BASE/bin/x86_64-linux-musl-objcopy $BASE/$PKG/bin/x86_64-linux-musl-objcopy -cp $BASE/bin/x86_64-linux-musl-nm $BASE/$PKG/bin/x86_64-linux-musl-nm -cp $BASE/libexec/gcc/x86_64-linux-musl/9.2.0/cc1 $BASE/$PKG/libexec/gcc/x86_64-linux-musl/9.2.0/cc1 -cp $BASE/bin/x86_64-linux-musl-readelf $BASE/$PKG/bin/x86_64-linux-musl-readelf -cp $BASE/bin/x86_64-linux-musl-objdump $BASE/$PKG/bin/x86_64-linux-musl-objdump -cp $BASE/bin/x86_64-linux-musl-gcc-ar $BASE/$PKG/bin/x86_64-linux-musl-gcc-ar -cp $BASE/bin/x86_64-linux-musl-gcov $BASE/$PKG/bin/x86_64-linux-musl-gcov -cp $BASE/bin/x86_64-linux-musl-ranlib $BASE/$PKG/bin/x86_64-linux-musl-ranlib -cp $BASE/bin/x86_64-linux-musl-as $BASE/$PKG/bin/x86_64-linux-musl-as -cp $BASE/bin/x86_64-linux-musl-gcc-ranlib $BASE/$PKG/bin/x86_64-linux-musl-gcc-ranlib -cp $BASE/bin/x86_64-linux-musl-cpp $BASE/$PKG/bin/x86_64-linux-musl-cpp -cp $BASE/bin/x86_64-linux-musl-strip $BASE/$PKG/bin/x86_64-linux-musl-strip -cp $BASE/bin/x86_64-linux-musl-gprof $BASE/$PKG/bin/x86_64-linux-musl-gprof -cp $BASE/bin/x86_64-linux-musl-gcov-tool $BASE/$PKG/bin/x86_64-linux-musl-gcov-tool - -$BASE/bin/x86_64-linux-musl-strip $BASE/$PKG/bin/x86_64-linux-musl-gcov-dump -$BASE/bin/x86_64-linux-musl-strip $BASE/$PKG/bin/x86_64-linux-musl-gcc -$BASE/bin/x86_64-linux-musl-strip $BASE/$PKG/bin/x86_64-linux-musl-addr2line -$BASE/bin/x86_64-linux-musl-strip $BASE/$PKG/bin/x86_64-linux-musl-ar -$BASE/bin/x86_64-linux-musl-strip $BASE/$PKG/libexec/gcc/x86_64-linux-musl/9.2.0/cc1plus -$BASE/bin/x86_64-linux-musl-strip $BASE/$PKG/bin/x86_64-linux-musl-g++ -$BASE/bin/x86_64-linux-musl-strip $BASE/$PKG/libexec/gcc/x86_64-linux-musl/9.2.0/collect2 -$BASE/bin/x86_64-linux-musl-strip $BASE/$PKG/bin/x86_64-linux-musl-gcc-nm -$BASE/bin/x86_64-linux-musl-strip $BASE/$PKG/bin/x86_64-linux-musl-c++filt -$BASE/bin/x86_64-linux-musl-strip $BASE/$PKG/bin/x86_64-linux-musl-elfedit -$BASE/bin/x86_64-linux-musl-strip $BASE/$PKG/x86_64-linux-musl/bin/ld.bfd -$BASE/bin/x86_64-linux-musl-strip $BASE/$PKG/bin/x86_64-linux-musl-size -$BASE/bin/x86_64-linux-musl-strip $BASE/$PKG/bin/x86_64-linux-musl-strings -$BASE/bin/x86_64-linux-musl-strip 
$BASE/$PKG/bin/x86_64-linux-musl-objcopy -$BASE/bin/x86_64-linux-musl-strip $BASE/$PKG/bin/x86_64-linux-musl-nm -$BASE/bin/x86_64-linux-musl-strip $BASE/$PKG/libexec/gcc/x86_64-linux-musl/9.2.0/cc1 -$BASE/bin/x86_64-linux-musl-strip $BASE/$PKG/bin/x86_64-linux-musl-readelf -$BASE/bin/x86_64-linux-musl-strip $BASE/$PKG/bin/x86_64-linux-musl-objdump -$BASE/bin/x86_64-linux-musl-strip $BASE/$PKG/bin/x86_64-linux-musl-gcc-ar -$BASE/bin/x86_64-linux-musl-strip $BASE/$PKG/bin/x86_64-linux-musl-gcov -$BASE/bin/x86_64-linux-musl-strip $BASE/$PKG/bin/x86_64-linux-musl-ranlib -$BASE/bin/x86_64-linux-musl-strip $BASE/$PKG/bin/x86_64-linux-musl-as -$BASE/bin/x86_64-linux-musl-strip $BASE/$PKG/bin/x86_64-linux-musl-gcc-ranlib -$BASE/bin/x86_64-linux-musl-strip $BASE/$PKG/bin/x86_64-linux-musl-cpp -$BASE/bin/x86_64-linux-musl-strip $BASE/$PKG/bin/x86_64-linux-musl-strip -$BASE/bin/x86_64-linux-musl-strip $BASE/$PKG/bin/x86_64-linux-musl-gprof -$BASE/bin/x86_64-linux-musl-strip $BASE/$PKG/bin/x86_64-linux-musl-gcov-tool - -$GUNZ $BASE/$PKG/bin/x86_64-linux-musl-gcov-dump >$BASE/$PKG/bin/x86_64-linux-musl-gcov-dump.gz -$GUNZ $BASE/$PKG/bin/x86_64-linux-musl-gcc >$BASE/$PKG/bin/x86_64-linux-musl-gcc.gz -$GUNZ $BASE/$PKG/bin/x86_64-linux-musl-addr2line >$BASE/$PKG/bin/x86_64-linux-musl-addr2line.gz -$GUNZ $BASE/$PKG/bin/x86_64-linux-musl-ar >$BASE/$PKG/bin/x86_64-linux-musl-ar.gz -$GUNZ $BASE/$PKG/libexec/gcc/x86_64-linux-musl/9.2.0/cc1plus >$BASE/$PKG/libexec/gcc/x86_64-linux-musl/9.2.0/cc1plus.gz -$GUNZ $BASE/$PKG/bin/x86_64-linux-musl-g++ >$BASE/$PKG/bin/x86_64-linux-musl-g++.gz -$GUNZ $BASE/$PKG/libexec/gcc/x86_64-linux-musl/9.2.0/collect2 >$BASE/$PKG/libexec/gcc/x86_64-linux-musl/9.2.0/collect2.gz -$GUNZ $BASE/$PKG/bin/x86_64-linux-musl-gcc-nm >$BASE/$PKG/bin/x86_64-linux-musl-gcc-nm.gz -$GUNZ $BASE/$PKG/bin/x86_64-linux-musl-c++filt >$BASE/$PKG/bin/x86_64-linux-musl-c++filt.gz -$GUNZ $BASE/$PKG/bin/x86_64-linux-musl-elfedit >$BASE/$PKG/bin/x86_64-linux-musl-elfedit.gz -$GUNZ $BASE/$PKG/x86_64-linux-musl/bin/ld.bfd >$BASE/$PKG/x86_64-linux-musl/bin/ld.bfd.gz -$GUNZ $BASE/$PKG/bin/x86_64-linux-musl-size >$BASE/$PKG/bin/x86_64-linux-musl-size.gz -$GUNZ $BASE/$PKG/bin/x86_64-linux-musl-strings >$BASE/$PKG/bin/x86_64-linux-musl-strings.gz -$GUNZ $BASE/$PKG/bin/x86_64-linux-musl-objcopy >$BASE/$PKG/bin/x86_64-linux-musl-objcopy.gz -$GUNZ $BASE/$PKG/bin/x86_64-linux-musl-nm >$BASE/$PKG/bin/x86_64-linux-musl-nm.gz -$GUNZ $BASE/$PKG/libexec/gcc/x86_64-linux-musl/9.2.0/cc1 >$BASE/$PKG/libexec/gcc/x86_64-linux-musl/9.2.0/cc1.gz -$GUNZ $BASE/$PKG/bin/x86_64-linux-musl-readelf >$BASE/$PKG/bin/x86_64-linux-musl-readelf.gz -$GUNZ $BASE/$PKG/bin/x86_64-linux-musl-objdump >$BASE/$PKG/bin/x86_64-linux-musl-objdump.gz -$GUNZ $BASE/$PKG/bin/x86_64-linux-musl-gcc-ar >$BASE/$PKG/bin/x86_64-linux-musl-gcc-ar.gz -$GUNZ $BASE/$PKG/bin/x86_64-linux-musl-gcov >$BASE/$PKG/bin/x86_64-linux-musl-gcov.gz -$GUNZ $BASE/$PKG/bin/x86_64-linux-musl-ranlib >$BASE/$PKG/bin/x86_64-linux-musl-ranlib.gz -$GUNZ $BASE/$PKG/bin/x86_64-linux-musl-as >$BASE/$PKG/bin/x86_64-linux-musl-as.gz -$GUNZ $BASE/$PKG/bin/x86_64-linux-musl-gcc-ranlib >$BASE/$PKG/bin/x86_64-linux-musl-gcc-ranlib.gz -$GUNZ $BASE/$PKG/bin/x86_64-linux-musl-cpp >$BASE/$PKG/bin/x86_64-linux-musl-cpp.gz -$GUNZ $BASE/$PKG/bin/x86_64-linux-musl-strip >$BASE/$PKG/bin/x86_64-linux-musl-strip.gz -$GUNZ $BASE/$PKG/bin/x86_64-linux-musl-gprof >$BASE/$PKG/bin/x86_64-linux-musl-gprof.gz -$GUNZ $BASE/$PKG/bin/x86_64-linux-musl-gcov-tool >$BASE/$PKG/bin/x86_64-linux-musl-gcov-tool.gz - -rm 
$BASE/$PKG/bin/x86_64-linux-musl-gcov-dump -rm $BASE/$PKG/bin/x86_64-linux-musl-gcc -rm $BASE/$PKG/bin/x86_64-linux-musl-addr2line -rm $BASE/$PKG/bin/x86_64-linux-musl-ar -rm $BASE/$PKG/libexec/gcc/x86_64-linux-musl/9.2.0/cc1plus -rm $BASE/$PKG/bin/x86_64-linux-musl-g++ -rm $BASE/$PKG/libexec/gcc/x86_64-linux-musl/9.2.0/collect2 -rm $BASE/$PKG/bin/x86_64-linux-musl-gcc-nm -rm $BASE/$PKG/bin/x86_64-linux-musl-c++filt -rm $BASE/$PKG/bin/x86_64-linux-musl-elfedit -rm $BASE/$PKG/x86_64-linux-musl/bin/ld.bfd -rm $BASE/$PKG/bin/x86_64-linux-musl-size -rm $BASE/$PKG/bin/x86_64-linux-musl-strings -rm $BASE/$PKG/bin/x86_64-linux-musl-objcopy -rm $BASE/$PKG/bin/x86_64-linux-musl-nm -rm $BASE/$PKG/libexec/gcc/x86_64-linux-musl/9.2.0/cc1 -rm $BASE/$PKG/bin/x86_64-linux-musl-readelf -rm $BASE/$PKG/bin/x86_64-linux-musl-objdump -rm $BASE/$PKG/bin/x86_64-linux-musl-gcc-ar -rm $BASE/$PKG/bin/x86_64-linux-musl-gcov -rm $BASE/$PKG/bin/x86_64-linux-musl-ranlib -rm $BASE/$PKG/bin/x86_64-linux-musl-as -rm $BASE/$PKG/bin/x86_64-linux-musl-gcc-ranlib -rm $BASE/$PKG/bin/x86_64-linux-musl-cpp -rm $BASE/$PKG/bin/x86_64-linux-musl-strip -rm $BASE/$PKG/bin/x86_64-linux-musl-gprof -rm $BASE/$PKG/bin/x86_64-linux-musl-gcov-tool - -ln -s x86_64-linux-musl-gcc $BASE/$PKG/bin/x86_64-linux-musl-cc -ln -s x86_64-linux-musl-gcc $BASE/$PKG/bin/x86_64-linux-musl-gcc-9.2.0 -ln -s ../../bin/x86_64-linux-musl-ar $BASE/$PKG/x86_64-linux-musl/bin/ar -ln -s x86_64-linux-musl-g++ $BASE/$PKG/bin/x86_64-linux-musl-c++ -ln -s ld.bfd $BASE/$PKG/x86_64-linux-musl/bin/ld -ln -s ../x86_64-linux-musl/bin/ld.bfd $BASE/$PKG/bin/x86_64-linux-musl-ld.bfd -ln -s ../x86_64-linux-musl/bin/ld.bfd $BASE/$PKG/bin/x86_64-linux-musl-ld -ln -s ../../bin/x86_64-linux-musl-objcopy $BASE/$PKG/x86_64-linux-musl/bin/objcopy -ln -s ../../bin/x86_64-linux-musl-nm $BASE/$PKG/x86_64-linux-musl/bin/nm -ln -s ../../bin/x86_64-linux-musl-readelf $BASE/$PKG/x86_64-linux-musl/bin/readelf -ln -s ../../bin/x86_64-linux-musl-objdump $BASE/$PKG/x86_64-linux-musl/bin/objdump -ln -s ../../bin/x86_64-linux-musl-ranlib $BASE/$PKG/x86_64-linux-musl/bin/ranlib -ln -s ../../bin/x86_64-linux-musl-as $BASE/$PKG/x86_64-linux-musl/bin/as -ln -s ../../bin/x86_64-linux-musl-strip $BASE/$PKG/x86_64-linux-musl/bin/strip - -{ - cat <<'EOF' -This is a modern statically-linked GNU C2X toolchain. - -You have the freedom to obtain the original sources to these binaries, -and build ones just like them, by visiting: - - https://www.gnu.org/ - https://github.com/richfelker/musl-cross-make - -The musl-cross-make tool also produces libraries and header files. We've -only vendored the statically-linked executable files, since Cosmopolitan -won't depend on GPL-licensed headers / runtime libraries. - -We haven't made any modifications to the original software. The versions -we chose are documented in $PKG/LICENSE.txt. Here's our Musl -build config for maximum transparency: - -EOF - git show --quiet - echo - cat config.mak - echo - cat bundle.sh -} >$BASE/$PKG/README.cosmo - -{ - for f in $(find . -iname \*copying\* -or -iname \*license\* | sort); do - printf '\n' - printf '%s\n' "$f" - printf '========================================================================\n' - cat "$f" - done -} >$BASE/$PKG/LICENSE.txt + My name is Justine Tunney and I approve of these binaries. 
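NOTE (illustrative, not part of the patch): to make the MODIFICATIONS
section of README.cosmo concrete, in Cosmopolitan Libc constants such
as EINVAL and EBADF are extern variables whose values are only settled
at runtime to match the host OS, so a stock compiler rejects them as
case labels. A minimal sketch of the kind of code the patched GCC
accepts, assuming a Cosmopolitan toolchain invoked with -fportcosmo:

    #include <errno.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void) {
      if (write(-1, "hi", 2) == -1) {
        switch (errno) {      /* the labels below are not integer
                                 constant expressions under Cosmo;
                                 -fportcosmo makes this build */
          case EBADF:
            puts("bad file descriptor");
            break;
          case EINVAL:
            puts("invalid argument");
            break;
          default:
            puts("some other error");
        }
      }
      return 0;
    }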
diff --git a/third_party/gcc/bin/x86_64-linux-musl-addr2line.gz b/third_party/gcc/bin/x86_64-linux-musl-addr2line.gz index 6022f8df8..2b004c7e6 100644 Binary files a/third_party/gcc/bin/x86_64-linux-musl-addr2line.gz and b/third_party/gcc/bin/x86_64-linux-musl-addr2line.gz differ diff --git a/third_party/gcc/bin/x86_64-linux-musl-ar.gz b/third_party/gcc/bin/x86_64-linux-musl-ar.gz index 40448bd0b..ed5976591 100644 Binary files a/third_party/gcc/bin/x86_64-linux-musl-ar.gz and b/third_party/gcc/bin/x86_64-linux-musl-ar.gz differ diff --git a/third_party/gcc/bin/x86_64-linux-musl-as.gz b/third_party/gcc/bin/x86_64-linux-musl-as.gz index 65d3eb1a1..9250c8232 100644 Binary files a/third_party/gcc/bin/x86_64-linux-musl-as.gz and b/third_party/gcc/bin/x86_64-linux-musl-as.gz differ diff --git a/third_party/gcc/bin/x86_64-linux-musl-c++filt.gz b/third_party/gcc/bin/x86_64-linux-musl-c++filt.gz index 2cd6fdaed..a532e2835 100644 Binary files a/third_party/gcc/bin/x86_64-linux-musl-c++filt.gz and b/third_party/gcc/bin/x86_64-linux-musl-c++filt.gz differ diff --git a/third_party/gcc/bin/x86_64-linux-musl-cpp.gz b/third_party/gcc/bin/x86_64-linux-musl-cpp.gz index 06ad513f5..3729c7808 100644 Binary files a/third_party/gcc/bin/x86_64-linux-musl-cpp.gz and b/third_party/gcc/bin/x86_64-linux-musl-cpp.gz differ diff --git a/third_party/gcc/bin/x86_64-linux-musl-elfedit.gz b/third_party/gcc/bin/x86_64-linux-musl-elfedit.gz index ebbbca3ef..cd59cfd05 100644 Binary files a/third_party/gcc/bin/x86_64-linux-musl-elfedit.gz and b/third_party/gcc/bin/x86_64-linux-musl-elfedit.gz differ diff --git a/third_party/gcc/bin/x86_64-linux-musl-g++.gz b/third_party/gcc/bin/x86_64-linux-musl-g++.gz index c680cf2db..60011f1d7 100644 Binary files a/third_party/gcc/bin/x86_64-linux-musl-g++.gz and b/third_party/gcc/bin/x86_64-linux-musl-g++.gz differ diff --git a/third_party/gcc/bin/x86_64-linux-musl-gcc-ar.gz b/third_party/gcc/bin/x86_64-linux-musl-gcc-ar.gz index c79d4f0bd..500f06473 100644 Binary files a/third_party/gcc/bin/x86_64-linux-musl-gcc-ar.gz and b/third_party/gcc/bin/x86_64-linux-musl-gcc-ar.gz differ diff --git a/third_party/gcc/bin/x86_64-linux-musl-gcc-nm.gz b/third_party/gcc/bin/x86_64-linux-musl-gcc-nm.gz index 5c36e2aee..708fcda5a 100644 Binary files a/third_party/gcc/bin/x86_64-linux-musl-gcc-nm.gz and b/third_party/gcc/bin/x86_64-linux-musl-gcc-nm.gz differ diff --git a/third_party/gcc/bin/x86_64-linux-musl-gcc-ranlib.gz b/third_party/gcc/bin/x86_64-linux-musl-gcc-ranlib.gz index fd0e4bcd8..221031f0b 100644 Binary files a/third_party/gcc/bin/x86_64-linux-musl-gcc-ranlib.gz and b/third_party/gcc/bin/x86_64-linux-musl-gcc-ranlib.gz differ diff --git a/third_party/gcc/bin/x86_64-linux-musl-gcc.gz b/third_party/gcc/bin/x86_64-linux-musl-gcc.gz index 6fd25cdce..c8c5e0e22 100644 Binary files a/third_party/gcc/bin/x86_64-linux-musl-gcc.gz and b/third_party/gcc/bin/x86_64-linux-musl-gcc.gz differ diff --git a/third_party/gcc/bin/x86_64-linux-musl-gcov-dump.gz b/third_party/gcc/bin/x86_64-linux-musl-gcov-dump.gz index 630a86768..2113e00a5 100644 Binary files a/third_party/gcc/bin/x86_64-linux-musl-gcov-dump.gz and b/third_party/gcc/bin/x86_64-linux-musl-gcov-dump.gz differ diff --git a/third_party/gcc/bin/x86_64-linux-musl-gcov-tool.gz b/third_party/gcc/bin/x86_64-linux-musl-gcov-tool.gz index 6411347ae..290e9cb77 100644 Binary files a/third_party/gcc/bin/x86_64-linux-musl-gcov-tool.gz and b/third_party/gcc/bin/x86_64-linux-musl-gcov-tool.gz differ diff --git a/third_party/gcc/bin/x86_64-linux-musl-gcov.gz 
b/third_party/gcc/bin/x86_64-linux-musl-gcov.gz index 70dcebdb5..0cfa9a289 100644 Binary files a/third_party/gcc/bin/x86_64-linux-musl-gcov.gz and b/third_party/gcc/bin/x86_64-linux-musl-gcov.gz differ diff --git a/third_party/gcc/bin/x86_64-linux-musl-gprof.gz b/third_party/gcc/bin/x86_64-linux-musl-gprof.gz index 99363407f..b14edf58c 100644 Binary files a/third_party/gcc/bin/x86_64-linux-musl-gprof.gz and b/third_party/gcc/bin/x86_64-linux-musl-gprof.gz differ diff --git a/third_party/gcc/bin/x86_64-linux-musl-nm.gz b/third_party/gcc/bin/x86_64-linux-musl-nm.gz index ade64c9b1..75d42614c 100644 Binary files a/third_party/gcc/bin/x86_64-linux-musl-nm.gz and b/third_party/gcc/bin/x86_64-linux-musl-nm.gz differ diff --git a/third_party/gcc/bin/x86_64-linux-musl-objcopy.gz b/third_party/gcc/bin/x86_64-linux-musl-objcopy.gz index 9dfed872b..345bc7b53 100644 Binary files a/third_party/gcc/bin/x86_64-linux-musl-objcopy.gz and b/third_party/gcc/bin/x86_64-linux-musl-objcopy.gz differ diff --git a/third_party/gcc/bin/x86_64-linux-musl-objdump.gz b/third_party/gcc/bin/x86_64-linux-musl-objdump.gz index 13415520a..2c76e9c37 100644 Binary files a/third_party/gcc/bin/x86_64-linux-musl-objdump.gz and b/third_party/gcc/bin/x86_64-linux-musl-objdump.gz differ diff --git a/third_party/gcc/bin/x86_64-linux-musl-ranlib.gz b/third_party/gcc/bin/x86_64-linux-musl-ranlib.gz index 39e9e7273..4fd1fb81f 100644 Binary files a/third_party/gcc/bin/x86_64-linux-musl-ranlib.gz and b/third_party/gcc/bin/x86_64-linux-musl-ranlib.gz differ diff --git a/third_party/gcc/bin/x86_64-linux-musl-readelf.gz b/third_party/gcc/bin/x86_64-linux-musl-readelf.gz index 4b7db889d..da326da56 100644 Binary files a/third_party/gcc/bin/x86_64-linux-musl-readelf.gz and b/third_party/gcc/bin/x86_64-linux-musl-readelf.gz differ diff --git a/third_party/gcc/bin/x86_64-linux-musl-size.gz b/third_party/gcc/bin/x86_64-linux-musl-size.gz index 55c3386ac..d7accb1a6 100644 Binary files a/third_party/gcc/bin/x86_64-linux-musl-size.gz and b/third_party/gcc/bin/x86_64-linux-musl-size.gz differ diff --git a/third_party/gcc/bin/x86_64-linux-musl-strings.gz b/third_party/gcc/bin/x86_64-linux-musl-strings.gz index 1a77aa767..d6a602a6b 100644 Binary files a/third_party/gcc/bin/x86_64-linux-musl-strings.gz and b/third_party/gcc/bin/x86_64-linux-musl-strings.gz differ diff --git a/third_party/gcc/bin/x86_64-linux-musl-strip.gz b/third_party/gcc/bin/x86_64-linux-musl-strip.gz index 54b42ca0e..c0ffca2c0 100644 Binary files a/third_party/gcc/bin/x86_64-linux-musl-strip.gz and b/third_party/gcc/bin/x86_64-linux-musl-strip.gz differ diff --git a/third_party/gcc/config.mak b/third_party/gcc/config.mak new file mode 100644 index 000000000..2fa28bfbb --- /dev/null +++ b/third_party/gcc/config.mak @@ -0,0 +1,97 @@ +# +# config.mak.dist - sample musl-cross-make configuration +# +# Copy to config.mak and edit as desired. +# + +# There is no default TARGET; you must select one here or on the make +# command line. Some examples: + +# TARGET = i486-linux-musl +TARGET = aarch64-linux-musl +# TARGET = arm-linux-musleabi +# TARGET = arm-linux-musleabihf +# TARGET = sh2eb-linux-muslfdpic +# TARGET = powerpc64le-linux-musl +# TARGET = aarch64-linux-musl + +# By default, cross compilers are installed to ./output under the top-level +# musl-cross-make directory and can later be moved wherever you want them. +# To install directly to a specific location, set it here. Multiple targets +# can safely be installed in the same location. 
Some examples: + +OUTPUT = /opt/cross11portcosmo +# OUTPUT = /usr/local + +# By default, latest supported release versions of musl and the toolchain +# components are used. You can override those here, but the version selected +# must be supported (under hashes/ and patches/) to work. For musl, you +# can use "git-refname" (e.g. git-master) instead of a release. Setting a +# blank version for gmp, mpc, mpfr and isl will suppress download and +# in-tree build of these libraries and instead depend on pre-installed +# libraries when available (isl is optional and not set by default). +# Setting a blank version for linux will suppress installation of kernel +# headers, which are not needed unless compiling programs that use them. + +# BINUTILS_VER = 2.25.1 +GCC_VER = 11.2.0 +# MUSL_VER = git-master +# GMP_VER = +# MPC_VER = +# MPFR_VER = +# ISL_VER = +# LINUX_VER = + +# By default source archives are downloaded with wget. curl is also an option. + +# DL_CMD = wget -c -O +# DL_CMD = curl -C - -L -o + +# Check sha-1 hashes of downloaded source archives. On gnu systems this is +# usually done with sha1sum. + +# SHA1_CMD = sha1sum -c +# SHA1_CMD = sha1 -c +# SHA1_CMD = shasum -a 1 -c + +# Something like the following can be used to produce a static-linked +# toolchain that's deployable to any system with matching arch, using +# an existing musl-targeted cross compiler. This only works if the +# system you build on can natively (or via binfmt_misc and qemu) run +# binaries produced by the existing toolchain (in this example, i486). + +# MUSL_CONFIG += --enable-debug +# MUSL_CONFIG += CFLAGS="-Os -fno-omit-frame-pointer -fno-optimize-sibling-calls -mno-omit-leaf-frame-pointer" +MUSL_CONFIG += CFLAGS="-Os" + +COMMON_CONFIG += CC="/opt/cross/bin/x86_64-linux-musl-gcc -static --static" +COMMON_CONFIG += CXX="/opt/cross/bin/x86_64-linux-musl-g++ -static --static" +# COMMON_CONFIG += CC="gcc -static --static" +# COMMON_CONFIG += CXX="g++ -static --static" + +# Recommended options for smaller build for deploying binaries: + +COMMON_CONFIG += CFLAGS="-Os -g0" +COMMON_CONFIG += CXXFLAGS="-Os -g0" +COMMON_CONFIG += LDFLAGS="-s" + +# Options you can add for faster/simpler build at the expense of features: + +COMMON_CONFIG += --disable-nls +GCC_CONFIG += --disable-libquadmath --disable-decimal-float +GCC_CONFIG += --disable-libitm +GCC_CONFIG += --disable-fixed-point +GCC_CONFIG += --disable-lto + +# By default C and C++ are the only languages enabled, and these are +# the only ones tested and known to be supported. You can uncomment the +# following and add other languages if you want to try getting them to +# work too. + +GCC_CONFIG += --enable-languages=c,c++ #--enable-plugin + +# You can keep the local build path out of your toolchain binaries and +# target libraries with the following, but then gdb needs to be told +# where to look for source files. 
+ +# COMMON_CONFIG += --with-debug-prefix-map=$(CURDIR)= diff --git a/third_party/gcc/lib/gcc/x86_64-linux-musl/11.2.0/specs b/third_party/gcc/lib/gcc/x86_64-linux-musl/11.2.0/specs new file mode 100644 index 000000000..e69de29bb diff --git a/third_party/gcc/lib/gcc/x86_64-linux-musl/9.2.0/specs b/third_party/gcc/lib/gcc/x86_64-linux-musl/9.2.0/specs deleted file mode 100644 index ae7116a07..000000000 --- a/third_party/gcc/lib/gcc/x86_64-linux-musl/9.2.0/specs +++ /dev/null @@ -1,141 +0,0 @@ -*asm: -%{m16|m32:--32} %{m16|m32:;:--64} %{msse2avx:%{!mavx:-msse2avx}} - -*asm_debug: -%{%:debug-level-gt(0):%{gstabs*:--gstabs}%{!gstabs*:%{g*:--gdwarf2}}} %{fdebug-prefix-map=*:--debug-prefix-map %*} - -*asm_final: -%{gsplit-dwarf: - objcopy --extract-dwo %{c:%{o*:%*}%{!o*:%b%O}}%{!c:%U%O} %{c:%{o*:%:replace-extension(%{o*:%*} .dwo)}%{!o*:%b.dwo}}%{!c:%b.dwo} - objcopy --strip-dwo %{c:%{o*:%*}%{!o*:%b%O}}%{!c:%U%O} } - -*asm_options: -%{-target-help:%:print-asm-header()} %{v} %{w:-W} %{I*} %{gz|gz=zlib:--compress-debug-sections=zlib} %{gz=none:--compress-debug-sections=none} %{gz=zlib-gnu:--compress-debug-sections=zlib-gnu} %a %Y %{c:%W{o*}%{!o*:-o %w%b%O}}%{!c:-o %d%w%u%O} - -*invoke_as: -%{!fwpa*: %{fcompare-debug=*|fdump-final-insns=*:%:compare-debug-dump-opt()} %{!S:-o %|.s | - as %(asm_options) %m.s %A } } - -*cpp: -%{posix:-D_POSIX_SOURCE} %{pthread:-D_REENTRANT} - -*cpp_options: -%(cpp_unique_options) %1 %{m*} %{std*&ansi&trigraphs} %{W*&pedantic*} %{w} %{f*} %{g*:%{%:debug-level-gt(0):%{g*} %{!fno-working-directory:-fworking-directory}}} %{O*} %{undef} %{save-temps*:-fpch-preprocess} - -*cpp_debug_options: -%{d*} - -*cpp_unique_options: -%{!Q:-quiet} %{nostdinc*} %{C} %{CC} %{v} %@{I*&F*} %{P} %I %{MD:-MD %{!o:%b.d}%{o*:%.d%*}} %{MMD:-MMD %{!o:%b.d}%{o*:%.d%*}} %{M} %{MM} %{MF*} %{MG} %{MP} %{MQ*} %{MT*} %{!E:%{!M:%{!MM:%{!MT:%{!MQ:%{MD|MMD:%{o*:-MQ %*}}}}}}} %{remap} %{g3|ggdb3|gstabs3|gxcoff3|gvms3:-dD} %{!iplugindir*:%{fplugin*:%:find-plugindir()}} %{H} %C %{D*&U*&A*} %{i*} %Z %i %{E|M|MM:%W{o*}} - -*trad_capable_cpp: -cc1 -E %{traditional|traditional-cpp:-traditional-cpp} - -*cc1: -%{!mandroid|tno-android-cc:%(cc1_cpu) %{profile:-p};:%(cc1_cpu) %{profile:-p} %{!fno-pic:%{!fno-PIC:%{!fpic:%{!fPIC: -fPIC}}}}} - -*cc1_options: -%{pg:%{fomit-frame-pointer:%e-pg and -fomit-frame-pointer are incompatible}} %{!iplugindir*:%{fplugin*:%:find-plugindir()}} %1 %{!Q:-quiet} %{!dumpbase:-dumpbase %B} %{d*} %{m*} %{aux-info*} %{fcompare-debug-second:%:compare-debug-auxbase-opt(%b)} %{!fcompare-debug-second:%{c|S:%{o*:-auxbase-strip %*}%{!o*:-auxbase %b}}}%{!c:%{!S:-auxbase %b}} %{g*} %{O*} %{W*&pedantic*} %{w} %{std*&ansi&trigraphs} %{v:-version} %{pg:-p} %{p} %{f*} %{undef} %{Qn:-fno-ident} %{Qy:} %{-help:--help} %{-target-help:--target-help} %{-version:--version} %{-help=*:--help=%*} %{!fsyntax-only:%{S:%W{o*}%{!o*:-o %b.s}}} %{fsyntax-only:-o %j} %{-param*} %{coverage:-fprofile-arcs -ftest-coverage} %{fprofile-arcs|fprofile-generate*|coverage: %{!fprofile-update=single: %{pthread:-fprofile-update=prefer-atomic}}} - -*cc1plus: - - -*link_gcc_c_sequence: -%{static|static-pie:--start-group} %G %{!nolibc:%L} %{static|static-pie:--end-group}%{!static:%{!static-pie:%G}} - -*link_ssp: -%{fstack-protector|fstack-protector-all|fstack-protector-strong|fstack-protector-explicit:-lssp_nonshared} - -*endfile: ---push-state --pop-state - -*link: -%{!mandroid|tno-android-ld:%{m16|m32:;:-m elf_x86_64} %{m16|m32:-m elf_i386} %{shared:-shared} %{!shared: %{!static: %{!static-pie: %{rdynamic:-export-dynamic} }} 
%{static:-static} %{static-pie:-static -pie --no-dynamic-linker -z text}};:%{m16|m32:;:-m elf_x86_64} %{m16|m32:-m elf_i386} %{mx32:-m elf32_x86_64} %{shared:-shared} %{!shared: %{!static: %{!static-pie: %{rdynamic:-export-dynamic} %{m16|m32:-dynamic-linker } %{m16|m32:;:-dynamic-linker} }} %{static:-static} %{static-pie:-static -pie --no-dynamic-linker -z text}} %{shared: -Bsymbolic}} - -*lib: ---push-state --pop-state - -*link_gomp: - - -*libgcc: ---push-state --pop-state - -*startfile: ---push-state --pop-state - -*cross_compile: -1 - -*version: -9.2.0 - -*multilib: -. ; - -*multilib_defaults: -m64 - -*multilib_extra: - - -*multilib_matches: - - -*multilib_exclusions: - - -*multilib_options: - - -*multilib_reuse: - - -*linker: -collect2 - -*linker_plugin_file: - - -*lto_wrapper: - - -*lto_gcc: - - -*post_link: - - -*link_libgcc: -%D - -*md_exec_prefix: - - -*md_startfile_prefix: - - -*md_startfile_prefix_1: - - -*startfile_prefix_spec: - - -*sysroot_spec: ---sysroot=%R - -*sysroot_suffix_spec: - - -*sysroot_hdrs_suffix_spec: - - -*self_spec: - - -*cc1_cpu: -%{march=native:%>march=native %:local_cpu_detect(arch) %{!mtune=*:%>mtune=native %:local_cpu_detect(tune)}} %{mtune=native:%>mtune=native %:local_cpu_detect(tune)} - -*link_command: -%{!fsyntax-only:%{!c:%{!M:%{!MM:%{!E:%{!S: %(linker) %{fuse-linker-plugin: %e-fuse-linker-plugin is not supported in this configuration}%{flto|flto=*:% +Date: Sun Jun 4 22:42:41 2023 -0500 + + enable patching errno constants with -fportcosmo + + https://github.com/ahgamut/gcc/tree/portcosmo-11.2 + + diff the above fork with gcc-11.2.0 to get the diff in this commit. the + patched gcc allows using Cosmopolitan magic numbers in switch cases and + struct initializations in the following way: + + - build gcc using this patch + + - if ENOSYS is in your switch case, define a value in a separate header + as follows: + + static const int __tmpcosmo_ENOSYS = -23486391; // any value + + - pass the -fportcosmo flag when compiling your file, and include the + header containing the above temporary values + + - the patched gcc will do the necessary AST transformation so that the + actual ENOSYS value is used. + +diff --git a/patches/gcc-11.2.0/0006-portcosmo.diff b/patches/gcc-11.2.0/0006-portcosmo.diff +new file mode 100644 +index 0000000..1700611 +--- /dev/null ++++ b/patches/gcc-11.2.0/0006-portcosmo.diff +@@ -0,0 +1,1838 @@ ++diff --git a/gcc/Makefile.in b/gcc/Makefile.in ++index 8a5fb3fd9..3a7498db8 100644 ++--- a/gcc/Makefile.in +++++ b/gcc/Makefile.in ++@@ -1231,6 +1231,10 @@ GCC_OBJS = gcc.o gcc-main.o ggc-none.o ++ ++ c-family-warn = $(STRICT_WARN) ++ +++PORTCOSMO_OBJS = c-family/portcosmo.o c-family/subcontext.o\ +++ c-family/initstruct.o c-family/ifswitch.o \ +++ c-family/unpatch_int.o c-family/unpatch_ast.o +++ ++ # Language-specific object files shared by all C-family front ends. 
++ C_COMMON_OBJS = c-family/c-common.o c-family/c-cppbuiltin.o c-family/c-dump.o \
++   c-family/c-format.o c-family/c-gimplify.o c-family/c-indentation.o \
++@@ -1238,7 +1242,8 @@ C_COMMON_OBJS = c-family/c-common.o c-family/c-cppbuiltin.o c-family/c-dump.o \
++   c-family/c-ppoutput.o c-family/c-pragma.o c-family/c-pretty-print.o \
++   c-family/c-semantics.o c-family/c-ada-spec.o \
++   c-family/c-ubsan.o c-family/known-headers.o \
++-  c-family/c-attribs.o c-family/c-warn.o c-family/c-spellcheck.o
+++  c-family/c-attribs.o c-family/c-warn.o c-family/c-spellcheck.o \
+++  $(PORTCOSMO_OBJS)
++ 
++ # Analyzer object files
++ ANALYZER_OBJS = \
++diff --git a/gcc/c-family/c-common.c b/gcc/c-family/c-common.c
++index d227686a0..a1d4c81ec 100644
++--- a/gcc/c-family/c-common.c
+++++ b/gcc/c-family/c-common.c
++@@ -51,6 +51,7 @@ along with GCC; see the file COPYING3.  If not see
++ #include "c-spellcheck.h"
++ #include "selftest.h"
++ #include "debug.h"
+++#include "c-family/portcosmo.h"
++ 
++ cpp_reader *parse_in;  /* Declared in c-pragma.h.  */
++ 
++@@ -2112,8 +2113,16 @@ check_case_value (location_t loc, tree value)
++     value = perform_integral_promotions (value);
++   else if (value != error_mark_node)
++     {
++-      error_at (loc, "case label does not reduce to an integer constant");
++-      value = error_mark_node;
+++      if (flag_portcosmo) {
+++        value = patch_case_nonconst(loc, value);
+++        if (value == NULL_TREE) {
+++          error_at (loc, "case label does not reduce to an integer constant");
+++          value = error_mark_node;
+++        }
+++      } else {
+++        error_at (loc, "case label does not reduce to an integer constant");
+++        value = error_mark_node;
+++      }
++     }
++ 
++   constant_expression_warning (value);
++diff --git a/gcc/c-family/c-opts.c b/gcc/c-family/c-opts.c
++index 89e05a4c5..bf1d8b445 100644
++--- a/gcc/c-family/c-opts.c
+++++ b/gcc/c-family/c-opts.c
++@@ -41,6 +41,7 @@ along with GCC; see the file COPYING3.  If not see
++ #include "mkdeps.h"
++ #include "dumpfile.h"
++ #include "file-prefix-map.h" /* add_*_prefix_map() */
+++#include "c-family/portcosmo.h"
++ 
++ #ifndef DOLLARS_IN_IDENTIFIERS
++ # define DOLLARS_IN_IDENTIFIERS true
++@@ -1196,6 +1197,11 @@ c_common_init (void)
++       return false;
++     }
++ 
+++  if (flag_portcosmo)
+++    {
+++      portcosmo_setup();
+++    }
+++
++   return true;
++ }
++ 
++@@ -1281,6 +1287,10 @@ c_common_finish (void)
++   /* For performance, avoid tearing down cpplib's internal structures
++      with cpp_destroy ().  */
++   cpp_finish (parse_in, deps_stream);
+++  if(flag_portcosmo)
+++    {
+++      portcosmo_teardown();
+++    }
++ 
++   if (deps_stream && deps_stream != out_stream && deps_stream != stdout
++       && (ferror (deps_stream) || fclose (deps_stream)))
++@@ -1288,6 +1298,7 @@ c_common_finish (void)
++ 
++   if (out_stream && (ferror (out_stream) || fclose (out_stream)))
++     fatal_error (input_location, "when writing output to %s: %m", out_fname);
+++ 
++ }
++ 
++ /* Either of two environment variables can specify output of
++diff --git a/gcc/c-family/c.opt b/gcc/c-family/c.opt
++index 2005b783c..a69a9a349 100644
++--- a/gcc/c-family/c.opt
+++++ b/gcc/c-family/c.opt
++@@ -1881,6 +1881,10 @@ fopenmp-simd
++ C ObjC C++ ObjC++ Var(flag_openmp_simd)
++ Enable OpenMP's SIMD directives.
++ 
+++fportcosmo
+++C C++ RejectNegative Var(flag_portcosmo)
+++Enable AST rewriting for Cosmopolitan Libc magic numbers.
+++
++ foperator-names
++ C++ ObjC++
++ Recognize C++ keywords like \"compl\" and \"xor\".
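NOTE (illustrative): the c-common.c hunk above is the trigger point.
When a case label fails to fold to an integer constant and -fportcosmo
is active, patch_case_nonconst() substitutes the matching __tmpcosmo_*
placeholder value and records the location so a later pass can swap
the real symbol back in. A sketch of the user-facing recipe from the
commit message; the file name placeholders.h is hypothetical:

    /* placeholders.h -- passed via -include, cf. build/portcosmo.h;
       the value just needs to be unique among the placeholders */
    static const int __tmpcosmo_ENOSYS = -23486391;

    /* foo.c -- compiled roughly as:
         gcc -fportcosmo -include placeholders.h -c foo.c
       ENOSYS is an extern variable in Cosmopolitan, yet this parses,
       because the placeholder stands in for it at parse time; the AST
       rewrite then ensures the real runtime value of ENOSYS is what
       actually gets compared */
    int is_unsupported(int err) {
      switch (err) {
        case ENOSYS:
          return 1;
        default:
          return 0;
      }
    }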
++diff --git a/gcc/c-family/ifswitch.cc b/gcc/c-family/ifswitch.cc ++new file mode 100644 ++index 000000000..148b2c2b4 ++--- /dev/null +++++ b/gcc/c-family/ifswitch.cc ++@@ -0,0 +1,236 @@ +++/*- mode:c++;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│ +++│vi: set net ft=c++ ts=2 sts=2 sw=2 fenc=utf-8 :vi│ +++╞══════════════════════════════════════════════════════════════════════════════╡ +++│ Copyright © 2022, Gautham Venkatasubramanian │ +++│ │ +++│ Permission to use, copy, modify, and/or distribute this software for │ +++│ any purpose with or without fee is hereby granted, provided that the │ +++│ above copyright notice and this permission notice appear in all copies. │ +++│ │ +++│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │ +++│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │ +++│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │ +++│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │ +++│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │ +++│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │ +++│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │ +++│ PERFORMANCE OF THIS SOFTWARE. │ +++╚─────────────────────────────────────────────────────────────────────────────*/ +++#include "c-family/ifswitch.h" +++ +++static tree get_switch_body(tree swexpr) { +++ auto body = SWITCH_STMT_BODY(swexpr); +++ if (TREE_CODE(body) == BIND_EXPR) { +++ body = BIND_EXPR_BODY(body); +++ } +++ return body; +++} +++ +++source_range get_switch_bounds(tree sws) { +++ auto body = get_switch_body(sws); +++ source_range rng; +++ rng.m_start = MAX_LOCATION_T; +++ rng.m_finish = MAX_LOCATION_T; +++ if (STATEMENT_LIST_HEAD(body) && STATEMENT_LIST_TAIL(body)) { +++ /* otherwise this is an empty switch statement */ +++ auto rng1 = EXPR_LOCATION_RANGE(STATEMENT_LIST_HEAD(body)->stmt); +++ auto rng2 = EXPR_LOCATION_RANGE(STATEMENT_LIST_TAIL(body)->stmt); +++ rng.m_start = rng1.m_start; +++ rng.m_finish = rng2.m_finish; +++ } +++ return rng; +++} +++ +++unsigned int count_mods_in_switch(tree swexpr, subu_list *list) { +++ tree body = get_switch_body(swexpr); +++ tree t = NULL_TREE; +++ tree replacement = NULL_TREE; +++ subu_node *use = NULL; +++ unsigned int count = 0; +++ for (auto i = tsi_start(body); !tsi_end_p(i); tsi_next(&i)) { +++ t = tsi_stmt(i); +++ if (TREE_CODE(t) == CASE_LABEL_EXPR) { +++ if (get_subu_elem(list, EXPR_LOCATION(t), +++ &use) /* on a line we substituted */ +++ && CASE_LOW(t) != NULL_TREE /* not a x..y range */ +++ && CASE_HIGH(t) == NULL_TREE /* not a default */ +++ && arg_should_be_unpatched(CASE_LOW(t), use, &replacement) +++ /* the case is the one we substituted */) { +++ DEBUGF("we substituted a case label at %u,%u\n", EXPR_LOC_LINE(t), +++ EXPR_LOC_COL(t)); +++ // debug_tree(CASE_LOW(t)); +++ count += 1; +++ } +++ } +++ } +++ return count; +++} +++ +++tree build_modded_label(unsigned int swcount, const char *case_str, +++ location_t loc = UNKNOWN_LOCATION) { +++ char dest[STRING_BUFFER_SIZE] = {0}; +++ snprintf(dest, sizeof(dest), "__tmpcosmo_%u_%s", swcount, case_str); +++ tree lab = build_decl(loc, LABEL_DECL, get_identifier(dest), void_type_node); +++ /* gcc's GIMPLE needs to know that this label +++ * is within the current function declaration */ +++ DECL_CONTEXT(lab) = current_function_decl; +++ return build1(LABEL_EXPR, void_type_node, lab); +++} +++ +++tree build_modded_exit_label(unsigned int swcount) { +++ return build_modded_label(swcount, 
"__end"); +++} +++ +++static inline tree build_modded_if_stmt(tree condition, tree then_clause, +++ tree else_clause = NULL_TREE) { +++ return build3(COND_EXPR, void_type_node, condition, then_clause, else_clause); +++} +++ +++tree modded_case_label(tree t, unsigned int i, tree swcond, vec *&ifs, +++ SubContext *ctx, tree *default_label) { +++ // debug_tree(t); +++ tree result; +++ tree replacement = NULL_TREE; +++ subu_node *use = NULL; +++ char case_str[STRING_BUFFER_SIZE] = {0}; +++ +++ if (CASE_LOW(t) == NULL_TREE) { +++ DEBUGF("default case\n"); +++ /* default label of the switch case, needs to be last */ +++ result = build_modded_label(ctx->switchcount, "__dflt", EXPR_LOCATION(t)); +++ *default_label = result; +++ } else if (CASE_LOW(t) != NULL_TREE && CASE_HIGH(t) == NULL_TREE) { +++ /* a case label */ +++ if (get_subu_elem(ctx->mods, EXPR_LOCATION(t), &use) +++ /* the case is on a line we substituted */ +++ && arg_should_be_unpatched(CASE_LOW(t), use, &replacement) +++ /* the case value is the one we substituted */) { +++ DEBUGF("modded case\n"); +++ result = +++ build_modded_label(ctx->switchcount, use->name, EXPR_LOCATION(t)); +++ ifs->safe_push(build_modded_if_stmt( +++ build2(EQ_EXPR, integer_type_node, swcond, replacement), +++ build1(GOTO_EXPR, void_type_node, LABEL_EXPR_LABEL(result)))); +++ remove_subu_elem(ctx->mods, use); +++ replacement = NULL_TREE; +++ } else { +++ /* a case label that we didn't substitute */ +++ DEBUGF("unmodded case\n"); +++ snprintf(case_str, sizeof(case_str), "%x_", i); +++ result = build_modded_label(ctx->switchcount, case_str, EXPR_LOCATION(t)); +++ ifs->safe_push(build_modded_if_stmt( +++ build2(EQ_EXPR, integer_type_node, swcond, CASE_LOW(t)), +++ build1(GOTO_EXPR, void_type_node, LABEL_EXPR_LABEL(result)))); +++ } +++ } else { +++ DEBUGF("unmodded case range\n"); +++ /* CASE_LOW(t) != NULL_TREE && CASE_HIGH(t) != NULL_TREE */ +++ /* this is a case x .. 
y sort of range */ +++ snprintf(case_str, sizeof(case_str), "%x_", i); +++ result = build_modded_label(ctx->switchcount, case_str, EXPR_LOCATION(t)); +++ ifs->safe_push(build_modded_if_stmt( +++ build2(TRUTH_ANDIF_EXPR, integer_type_node, +++ build2(GE_EXPR, integer_type_node, swcond, CASE_LOW(t)), +++ build2(LE_EXPR, integer_type_node, swcond, CASE_HIGH(t))), +++ build1(GOTO_EXPR, void_type_node, LABEL_EXPR_LABEL(result)))); +++ } +++ return result; +++} +++ +++tree build_modded_switch_stmt(tree swexpr, SubContext *ctx) { +++ int case_count = 0, break_count = 0; +++ int has_default = 0; +++ +++ tree swcond = save_expr(SWITCH_STMT_COND(swexpr)); +++ tree swbody = get_switch_body(swexpr); +++ tree *tp = NULL; +++ char dest[STRING_BUFFER_SIZE] = {0}; +++ +++ vec *ifs; +++ vec_alloc(ifs, 0); +++ +++ tree exit_label = build_modded_exit_label(ctx->switchcount); +++ tree default_label = NULL_TREE; +++ +++ for (auto it = tsi_start(swbody); !tsi_end_p(it); tsi_next(&it)) { +++ tp = tsi_stmt_ptr(it); +++ if (TREE_CODE(*tp) == CASE_LABEL_EXPR) { +++ case_count += 1; +++ has_default = has_default || (CASE_LOW(*tp) == NULL_TREE); +++ /* replace the case statement with a goto */ +++ *tp = +++ modded_case_label(*tp, case_count, swcond, ifs, ctx, &default_label); +++ } else if (TREE_CODE(*tp) == BREAK_STMT) { +++ break_count += 1; +++ /* replace the break statement with a goto to the end */ +++ *tp = build1(GOTO_EXPR, void_type_node, LABEL_EXPR_LABEL(exit_label)); +++ } else if (TREE_CODE(*tp) == BIND_EXPR) { +++ for (auto it2 = tsi_start(BIND_EXPR_BODY(*tp)); !tsi_end_p(it2); +++ tsi_next(&it2)) { +++ auto tp2 = tsi_stmt_ptr(it2); +++ if (TREE_CODE(*tp2) == BREAK_STMT) { +++ break_count += 1; +++ /* replace the break statement with a goto to the end */ +++ *tp2 = +++ build1(GOTO_EXPR, void_type_node, LABEL_EXPR_LABEL(exit_label)); +++ } +++ } +++ } +++ } +++ /* add all the if statements to the start of the switch body */ +++ /* TODO: do we have to combine them via COND_EXPR_ELSE? why, +++ * is it not possible to just them as a list one after the other? */ +++ tree res; +++ unsigned int zz = 0; +++ if (ifs->length() > 0) { +++ res = (*ifs)[0]; +++ for (zz = 1; zz < ifs->length(); ++zz) { +++ COND_EXPR_ELSE(res) = (*ifs)[zz]; +++ res = (*ifs)[zz]; +++ } +++ /* if we have a valid default for the switch, +++ * it should be the final else branch */ +++ if (default_label && default_label != NULL_TREE) { +++ COND_EXPR_ELSE(res) = +++ build1(GOTO_EXPR, void_type_node, LABEL_EXPR_LABEL(default_label)); +++ } else { +++ /* if we don't have a default, then the final else branch +++ * should just jump to after the switch */ +++ COND_EXPR_ELSE(res) = +++ build1(GOTO_EXPR, void_type_node, LABEL_EXPR_LABEL(exit_label)); +++ } +++ /* reset to the start of the if-else tree */ +++ res = (*ifs)[0]; +++ } else if (has_default && default_label != NULL_TREE) { +++ /* this switch has only a default? ok... */ +++ res = build1(GOTO_EXPR, void_type_node, LABEL_EXPR_LABEL(default_label)); +++ } else { +++ /* this switch has no cases, and no default?! 
*/ +++ warning_at(EXPR_LOCATION(swcond), 0, "switch without cases or default?"); +++ res = build1(GOTO_EXPR, void_type_node, LABEL_EXPR_LABEL(exit_label)); +++ } +++ auto it = tsi_start(swbody); +++ tsi_link_before(&it, res, TSI_SAME_STMT); +++ tsi_link_before(&it, build_empty_stmt(UNKNOWN_LOCATION), TSI_SAME_STMT); +++ +++ /* add the 'outside' of the switch, ie the 'finally' +++ * aka the target of the break statements, the 'exit_label', +++ * to the end of the switch body */ +++ append_to_statement_list(build_empty_stmt(UNKNOWN_LOCATION), &swbody); +++ append_to_statement_list(exit_label, &swbody); +++ append_to_statement_list(build_empty_stmt(UNKNOWN_LOCATION), &swbody); +++ /* +++ snprintf(dest, sizeof(dest), +++ "above switch had %d cases replaced and %d breaks\n", case_count, +++ break_count); +++ append_to_statement_list(build_call_expr(VAR_NAME_AS_TREE("printf"), 1, +++ BUILD_STRING_AS_TREE(dest)), +++ &swbody); */ +++ +++ /* debug_tree(swbody); */ +++ /* we are returning SWITCH_STMT_BODY(swexpr), +++ * instead of just swbody, because sometimes, +++ * SWITCH_STMT_BODY(swexpr) may be a BIND_EXPR +++ * that has some scoping-related information. */ +++ return SWITCH_STMT_BODY(swexpr); +++} ++diff --git a/gcc/c-family/ifswitch.h b/gcc/c-family/ifswitch.h ++new file mode 100644 ++index 000000000..ccf78067f ++--- /dev/null +++++ b/gcc/c-family/ifswitch.h ++@@ -0,0 +1,10 @@ +++#ifndef IFSWITCH_H +++#define IFSWITCH_H +++#include "c-family/portcosmo.internal.h" +++#include "c-family/subcontext.h" +++ +++source_range get_switch_bounds(tree); +++unsigned int count_mods_in_switch(tree, subu_list *); +++tree build_modded_switch_stmt(tree, SubContext *); +++ +++#endif /* IFSWITCH_H */ ++diff --git a/gcc/c-family/initstruct.cc b/gcc/c-family/initstruct.cc ++new file mode 100644 ++index 000000000..6a0d4a78e ++--- /dev/null +++++ b/gcc/c-family/initstruct.cc ++@@ -0,0 +1,422 @@ +++/*- mode:c++;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│ +++│vi: set net ft=c++ ts=2 sts=2 sw=2 fenc=utf-8 :vi│ +++╞══════════════════════════════════════════════════════════════════════════════╡ +++│ Copyright © 2022, Gautham Venkatasubramanian │ +++│ │ +++│ Permission to use, copy, modify, and/or distribute this software for │ +++│ any purpose with or without fee is hereby granted, provided that the │ +++│ above copyright notice and this permission notice appear in all copies. │ +++│ │ +++│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │ +++│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │ +++│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │ +++│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │ +++│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │ +++│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │ +++│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │ +++│ PERFORMANCE OF THIS SOFTWARE. 
│ +++╚─────────────────────────────────────────────────────────────────────────────*/ +++#include "c-family/initstruct.h" +++ +++void portcosmo_finish_decl(void *gcc_data) { +++ handle_decl(gcc_data, (void *)(&cosmo_ctx)); +++} +++ +++void set_values_based_on_ctor(tree ctor, subu_list *list, tree body, tree lhs, +++ location_t bound) { +++ subu_node *use = NULL; +++ unsigned int iprev = 0; +++ bool started = true; +++ tree replacement = NULL_TREE; +++ +++ while (list->count > 0 && LOCATION_BEFORE2(list->start, bound)) { +++ tree index = NULL_TREE; +++ tree val = NULL_TREE; +++ unsigned int i = 0; +++ int found = 0; +++ FOR_EACH_CONSTRUCTOR_ELT(CONSTRUCTOR_ELTS(ctor), i, index, val) { +++ DEBUGF("value %u is %s\n", i, get_tree_code_str(val)); +++ if (!started && i <= iprev) continue; +++ if (TREE_CODE(val) == INTEGER_CST) { +++ for (use = list->head; use; use = use->next) { +++ found = arg_should_be_unpatched(val, use, &replacement); +++ if (found) break; +++ } +++ if (found) { +++ iprev = i; +++ started = false; +++ break; +++ } +++ } else if (TREE_CODE(val) == CONSTRUCTOR) { +++ auto sub = access_at(lhs, index); +++ // debug_tree(sub); +++ set_values_based_on_ctor(val, list, body, sub, bound); +++ use = NULL; /* might've gotten stomped */ +++ if (list->count == 0) return; +++ get_subu_elem(list, list->start, &use); +++ } +++ } +++ if (found) { +++ auto modexpr = build2(MODIFY_EXPR, TREE_TYPE(index), +++ access_at(lhs, index), replacement); +++ // debug_tree(modexpr); +++ append_to_statement_list(modexpr, &body); +++ remove_subu_elem(list, use); +++ replacement = NULL_TREE; +++ DEBUGF("found; %d left\n", list->count); +++ } else { +++ /* we did not find any (more) substitutions to fix */ +++ DEBUGF("exiting; %d left\n", list->count); +++ break; +++ } +++ } +++} +++ +++/* initstruct/global.cc */ +++ +++void update_global_decls(tree dcl, SubContext *ctx) { +++ tree body = alloc_stmt_list(); +++ subu_node *use = NULL; +++ char chk[STRING_BUFFER_SIZE]; +++ +++ /* dcl, the global declaration we have is like these: +++ * +++ * static int foo = __tmpcosmo_VAR; +++ * static struct toy myvalue = {.x=1, .y=__tmpcosmo_VAR}; +++ * +++ * we're going to add functions as follows: +++ * +++ * static int foo = __tmpcosmo_VAR; +++ * __attribute__((constructor)) __hidden_ctor1() { +++ * foo = VAR; +++ * } +++ * static struct toy myvalue = {.x=1, .y=__tmpcosmo_VAR}; +++ * __attribute__((constructor)) __hidden_ctor2() { +++ * myvalue.y = VAR; +++ * } +++ * +++ * the modifier functions have the constructor attribute, +++ * so it they run before anything uses the static. it +++ * works recursively too: you can have a struct of structs, +++ * an array of structs, whatever, and it will figure out +++ * where the substitutions are and attempt to mod them. +++ * +++ * a unique constructor for each declaration. probably +++ * we could have a common constructor for the entire +++ * file, but that's left as an exercise for the reader. 
*/ +++ if (INTEGRAL_TYPE_P(TREE_TYPE(dcl)) && +++ get_subu_elem(ctx->mods, ctx->mods->start, &use) && +++ /* use is non-NULL if get_subu_elem succeeds */ +++ check_magic_equal(DECL_INITIAL(dcl), use->name)) { +++ if (TREE_READONLY(dcl)) { +++ error_at(EXPR_LOCATION(dcl), "cannot substitute this constant\n"); +++ /* actually I can, but the issue is if one of gcc's optimizations +++ * perform constant folding(and they do), I don't know all the spots +++ * where this variable has been folded, so I can't substitute there */ +++ ctx->active = 0; +++ return; +++ } +++ append_to_statement_list( +++ build2(MODIFY_EXPR, void_type_node, dcl, VAR_NAME_AS_TREE(use->name)), +++ &body); +++ /* +++ append_to_statement_list( +++ build_call_expr(VAR_NAME_AS_TREE("printf"), 2, +++ BUILD_STRING_AS_TREE("ctor initstruct %s\n"), +++ BUILD_STRING_AS_TREE(IDENTIFIER_NAME(dcl))), +++ &body); +++ */ +++ remove_subu_elem(ctx->mods, use); +++ cgraph_build_static_cdtor('I', body, 0); +++ } else if ((RECORD_TYPE == TREE_CODE(TREE_TYPE(dcl)) || +++ ARRAY_TYPE == TREE_CODE(TREE_TYPE(dcl))) && +++ DECL_INITIAL(dcl) != NULL_TREE) { +++ if (TREE_READONLY(dcl)) { +++ warning_at(DECL_SOURCE_LOCATION(dcl), 0, +++ "not sure if modding const structs is good\n"); +++ TREE_READONLY(dcl) = 0; +++ } +++ if (LOCATION_BEFORE2(ctx->mods->end, input_location)) { +++ set_values_based_on_ctor(DECL_INITIAL(dcl), ctx->mods, body, dcl, +++ input_location); +++ } else { +++ set_values_based_on_ctor(DECL_INITIAL(dcl), ctx->mods, body, dcl, +++ ctx->mods->end); +++ } +++ /* +++ append_to_statement_list( +++ build_call_expr(VAR_NAME_AS_TREE("printf"), 2, +++ BUILD_STRING_AS_TREE("ctor initstruct %s\n"), +++ BUILD_STRING_AS_TREE(IDENTIFIER_NAME(dcl))), +++ &body); +++ */ +++ cgraph_build_static_cdtor('I', body, 0); +++ DEBUGF("uploaded ctor\n"); +++ } +++} +++ +++void handle_decl(void *gcc_data, void *user_data) { +++ tree t = (tree)gcc_data; +++ SubContext *ctx = (SubContext *)user_data; +++ if (ctx->active && ctx->mods->count > 0 && DECL_INITIAL(t) != NULL && +++ DECL_CONTEXT(t) == NULL_TREE) { +++ int internal_use = +++ !strncmp(IDENTIFIER_NAME(t), "__tmpcosmo_", strlen("__tmpcosmo_")); +++ if (internal_use || DECL_EXTERNAL(t)) { +++ error_at(input_location, "the ACTUALLY is before the declaration!\n"); +++ ctx->active = 0; +++ return; +++ } +++ auto rng = EXPR_LOCATION_RANGE(t); +++ rng.m_start = DECL_SOURCE_LOCATION(t); +++ rng.m_finish = input_location; +++ +++ DEBUGF("handle_decl with %s %u,%u - %u-%u\n", IDENTIFIER_NAME(t), +++ LOCATION_LINE(rng.m_start), LOCATION_COLUMN(rng.m_start), +++ LOCATION_LINE(rng.m_finish), LOCATION_COLUMN(rng.m_finish)); +++ ctx->initcount += ctx->mods->count; +++ update_global_decls(t, ctx); +++ /* now at this stage, all uses of our macros have been +++ * fixed, INCLUDING case labels. 
Let's confirm that: */ +++ check_context_clear(ctx, MAX_LOCATION_T); +++ } +++} +++ +++/* initstruct/local.cc */ +++ +++static inline tree build_modded_if_stmt(tree condition, tree then_clause, +++ tree else_clause = NULL_TREE) { +++ return build3(COND_EXPR, void_type_node, condition, then_clause, else_clause); +++} +++ +++int build_modded_int_declaration(tree *dxpr, SubContext *ctx, subu_node *use) { +++ char chk[STRING_BUFFER_SIZE]; +++ tree dcl = DECL_EXPR_DECL(*dxpr); +++ tree replacement = NULL_TREE; +++ +++ if (INTEGRAL_TYPE_P(TREE_TYPE(dcl)) && +++ arg_should_be_unpatched(DECL_INITIAL(dcl), use, &replacement)) { +++ if (TREE_READONLY(dcl)) { +++ error_at(EXPR_LOCATION(dcl), "cannot substitute this constant\n"); +++ /* actually I can, but the issue is if one of gcc's optimizations +++ * perform constant folding(and they do), I don't know all the spots +++ * where this variable has been folded, so I can't substitute there */ +++ ctx->active = 0; +++ return 0; +++ } +++ +++ if (!TREE_STATIC(dcl)) { +++ DECL_INITIAL(dcl) = replacement; +++ remove_subu_elem(ctx->mods, use); +++ replacement = NULL_TREE; +++ return 1; +++ } +++ +++ DEBUGF("fixing decl for a static integer\n"); +++ /* (*dxpr), the statement we have is this: +++ * +++ * static int myvalue = __tmpcosmo_VAR; +++ * +++ * we're going to modify it to this: +++ * +++ * static int myvalue = __tmpcosmo_VAR; +++ * static uint8 __chk_ifs_myvalue = 0; +++ * if(__chk_ifs_myvalue != 1) { +++ * __chk_ifs_myvalue = 1; +++ * myvalue = VAR; +++ * } +++ * +++ * so the modified statement runs exactly once, +++ * whenever the function is first called, right +++ * after the initialization of the variable we +++ * wanted to modify. */ +++ +++ /* build __chk_ifs_myvalue */ +++ snprintf(chk, sizeof(chk), "__chk_ifs_%s", IDENTIFIER_NAME(dcl)); +++ tree chknode = build_decl(DECL_SOURCE_LOCATION(dcl), VAR_DECL, +++ get_identifier(chk), uint8_type_node); +++ DECL_INITIAL(chknode) = build_int_cst(uint8_type_node, 0); +++ TREE_STATIC(chknode) = TREE_STATIC(dcl); +++ TREE_USED(chknode) = TREE_USED(dcl); +++ DECL_READ_P(chknode) = DECL_READ_P(dcl); +++ DECL_CONTEXT(chknode) = DECL_CONTEXT(dcl); +++ DECL_CHAIN(chknode) = DECL_CHAIN(dcl); +++ DECL_CHAIN(dcl) = chknode; +++ +++ /* create the then clause of the if statement */ +++ tree then_clause = alloc_stmt_list(); +++ append_to_statement_list(build2(MODIFY_EXPR, void_type_node, chknode, +++ build_int_cst(uint8_type_node, 1)), +++ &then_clause); +++ append_to_statement_list( +++ build2(MODIFY_EXPR, void_type_node, dcl, replacement), +++ &then_clause); +++ /* +++ append_to_statement_list( +++ build_call_expr(VAR_NAME_AS_TREE("printf"), 1, +++ BUILD_STRING_AS_TREE("initstruct magic\n")), +++ &then_clause); +++ */ +++ +++ /* create the if statement into the overall result mentioned above */ +++ tree res = alloc_stmt_list(); +++ append_to_statement_list(*dxpr, &res); +++ append_to_statement_list(build1(DECL_EXPR, void_type_node, chknode), &res); +++ append_to_statement_list( +++ build_modded_if_stmt(build2(NE_EXPR, void_type_node, chknode, +++ build_int_cst(uint8_type_node, 1)), +++ then_clause), +++ &res); +++ /* overwrite the input tree with our new statements */ +++ *dxpr = res; +++ // debug_tree(res); +++ remove_subu_elem(ctx->mods, use); +++ replacement = NULL_TREE; +++ return 1; +++ } +++ return 0; +++} +++ +++void modify_local_struct_ctor(tree ctor, subu_list *list, location_t bound) { +++ subu_node *use = NULL; +++ unsigned int iprev = 0; +++ bool started = true; +++ tree replacement = NULL_TREE; +++ 
+++ while (list->count > 0 && LOCATION_BEFORE2(list->start, bound)) { +++ tree val = NULL_TREE; +++ unsigned int i = 0; +++ int found = 0; +++ FOR_EACH_CONSTRUCTOR_VALUE(CONSTRUCTOR_ELTS(ctor), i, val) { +++ DEBUGF("value %u is %s\n", i, get_tree_code_str(val)); +++ // debug_tree(val); +++ if (TREE_CODE(val) == INTEGER_CST) { +++ for (use = list->head; use; use = use->next) { +++ found = arg_should_be_unpatched(val, use, &replacement); +++ if (found) break; +++ } +++ if (found) { +++ iprev = i; +++ started = false; +++ break; +++ } +++ } else if (TREE_CODE(val) == CONSTRUCTOR) { +++ modify_local_struct_ctor(val, list, bound); +++ use = NULL; /* might've gotten stomped */ +++ if (list->count == 0 || LOCATION_AFTER2(list->start, bound)) return; +++ } +++ } +++ if (found) { +++ DEBUGF("found\n"); +++ // debug_tree(CONSTRUCTOR_ELT(ctor, i)->index); +++ CONSTRUCTOR_ELT(ctor, i)->value = replacement; +++ // debug_tree(CONSTRUCTOR_ELT(ctor, i)->value); +++ remove_subu_elem(list, use); +++ replacement = NULL_TREE; +++ } else { +++ /* we did not find any (more) substitutions to fix */ +++ break; +++ } +++ } +++} +++ +++void build_modded_declaration(tree *dxpr, SubContext *ctx, location_t bound) { +++ char chk[STRING_BUFFER_SIZE]; +++ tree dcl = DECL_EXPR_DECL(*dxpr); +++ subu_node *use = NULL; +++ subu_list *list = ctx->mods; +++ unsigned int oldcount = list->count; +++ +++ // debug_tree(DECL_INITIAL(dcl)); +++ +++ if (INTEGRAL_TYPE_P(TREE_TYPE(dcl))) { +++ get_subu_elem(list, list->start, &use); +++ if (build_modded_int_declaration(dxpr, ctx, use)) { +++ use = NULL; +++ ctx->initcount += 1; +++ } +++ return; +++ } +++ +++ if ((RECORD_TYPE == TREE_CODE(TREE_TYPE(dcl)) || +++ ARRAY_TYPE == TREE_CODE(TREE_TYPE(dcl))) && +++ DECL_INITIAL(dcl) != NULL_TREE) { +++ if (TREE_READONLY(dcl)) { +++ warning_at(EXPR_LOCATION(*dxpr), 0, +++ "not sure if modding const structs is good\n"); +++ TREE_READONLY(dcl) = 0; +++ build_modded_declaration(dxpr, ctx, bound); +++ return; +++ } else if (TREE_STATIC(dcl)) { +++ DEBUGF("fixing decl for a static struct\n"); +++ /* (*dxpr), the statement we have is this: +++ * +++ * static struct toy myvalue = {.x=1, .y=__tmpcosmo_VAR}; +++ * +++ * we're going to modify it to this: +++ * +++ * static struct toy myvalue = {.x=1, .y=__tmpcosmo_VAR}; +++ * static uint8 __chk_ifs_myvalue = 0; +++ * if(__chk_ifs_myvalue != 1) { +++ * __chk_ifs_myvalue = 1; +++ * myvalue.y = VAR; +++ * } +++ * +++ * so the modified statement runs exactly once, +++ * whenever the function is first called, right +++ * after the initialization of the variable we +++ * wanted to modify. 
*/ +++ +++ /* build __chk_ifs_myvalue */ +++ snprintf(chk, sizeof(chk), "__chk_ifs_%s", IDENTIFIER_NAME(dcl)); +++ tree chknode = build_decl(DECL_SOURCE_LOCATION(dcl), VAR_DECL, +++ get_identifier(chk), uint8_type_node); +++ DECL_INITIAL(chknode) = build_int_cst(uint8_type_node, 0); +++ TREE_STATIC(chknode) = TREE_STATIC(dcl); +++ TREE_USED(chknode) = TREE_USED(dcl); +++ DECL_READ_P(chknode) = DECL_READ_P(dcl); +++ DECL_CONTEXT(chknode) = DECL_CONTEXT(dcl); +++ DECL_CHAIN(chknode) = DECL_CHAIN(dcl); +++ DECL_CHAIN(dcl) = chknode; +++ +++ /* build a scope block for the temporary value */ +++ tree tmpscope = build0(BLOCK, void_type_node); +++ BLOCK_SUPERCONTEXT(tmpscope) = TREE_BLOCK(*dxpr); +++ // debug_tree(BLOCK_SUPERCONTEXT(tmpscope)); +++ +++ /* create the then clause of the if statement */ +++ tree then_clause = alloc_stmt_list(); +++ append_to_statement_list(build2(MODIFY_EXPR, void_type_node, chknode, +++ build_int_cst(uint8_type_node, 1)), +++ &then_clause); +++ set_values_based_on_ctor(DECL_INITIAL(dcl), ctx->mods, then_clause, dcl, +++ bound); +++ /* +++ append_to_statement_list( +++ build_call_expr(VAR_NAME_AS_TREE("printf"), 2, +++ BUILD_STRING_AS_TREE("initstruct magic %lu bytes\n"), +++ DECL_SIZE_UNIT(dcl)), +++ &then_clause); +++ */ +++ +++ /* create the if statement into the overall result mentioned above */ +++ tree res = alloc_stmt_list(); +++ append_to_statement_list(*dxpr, &res); +++ append_to_statement_list(build1(DECL_EXPR, void_type_node, chknode), +++ &res); +++ append_to_statement_list( +++ build_modded_if_stmt(build2(NE_EXPR, void_type_node, chknode, +++ build_int_cst(uint8_type_node, 1)), +++ then_clause), +++ &res); +++ /* overwrite the input tree with our new statements */ +++ *dxpr = res; +++ } else { +++ /* if it's a local struct, we can +++ * just mod the constructor itself */ +++ auto ctor = DECL_INITIAL(dcl); +++ modify_local_struct_ctor(ctor, list, bound); +++ } +++ } +++ ctx->initcount += (oldcount - list->count); +++} ++diff --git a/gcc/c-family/initstruct.h b/gcc/c-family/initstruct.h ++new file mode 100644 ++index 000000000..f77fba707 ++--- /dev/null +++++ b/gcc/c-family/initstruct.h ++@@ -0,0 +1,15 @@ +++#ifndef INITSTRUCT_H +++#define INITSTRUCT_H +++#include "c-family/portcosmo.internal.h" +++/* gcc utils first */ +++#include "c-family/subcontext.h" +++ +++void build_modded_declaration(tree *, SubContext *, location_t); +++int build_modded_int_declaration(tree *, SubContext *, subu_node *); +++tree copy_struct_ctor(tree); +++void modify_local_struct_ctor(tree, subu_list *, location_t); +++ +++void set_values_based_on_ctor(tree, subu_list *, tree, tree, location_t); +++void handle_decl(void *, void *); +++tree access_at(tree, tree); +++#endif /* INITSTRUCT_H */ ++diff --git a/gcc/c-family/portcosmo.cc b/gcc/c-family/portcosmo.cc ++new file mode 100644 ++index 000000000..4716500b5 ++--- /dev/null +++++ b/gcc/c-family/portcosmo.cc ++@@ -0,0 +1,171 @@ +++/*- mode:c++;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│ +++│vi: set net ft=c++ ts=2 sts=2 sw=2 fenc=utf-8 :vi│ +++╞══════════════════════════════════════════════════════════════════════════════╡ +++│ Copyright © 2022, Gautham Venkatasubramanian │ +++│ │ +++│ Permission to use, copy, modify, and/or distribute this software for │ +++│ any purpose with or without fee is hereby granted, provided that the │ +++│ above copyright notice and this permission notice appear in all copies. 
│ +++│ │ +++│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │ +++│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │ +++│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │ +++│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │ +++│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │ +++│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │ +++│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │ +++│ PERFORMANCE OF THIS SOFTWARE. │ +++╚─────────────────────────────────────────────────────────────────────────────*/ +++#include "c-family/portcosmo.internal.h" +++#include "c-family/subcontext.h" +++#include "c-family/ifswitch.h" +++#include "c-family/initstruct.h" +++ +++static tree maybe_get_ifsw_identifier(const char *); +++static tree patch_int_nonconst(location_t, tree, const char **); +++ +++struct SubContext cosmo_ctx; +++static int ctx_inited = 0; +++ +++void portcosmo_setup() { +++ if (flag_portcosmo && 0 == ctx_inited) { +++ construct_context(&cosmo_ctx); +++ ctx_inited = 1; +++ } +++} +++ +++void portcosmo_teardown() { +++ if (flag_portcosmo && 1 == ctx_inited) { +++ cleanup_context(&cosmo_ctx); +++ ctx_inited = 0; +++ } +++} +++ +++void portcosmo_show_tree(location_t loc, tree t) { +++ INFORM(loc, "attempting case substitution at: line %u, col %u\n", +++ LOCATION_LINE(loc), LOCATION_COLUMN(loc)); +++ debug_tree(t); +++} +++ +++tree patch_case_nonconst(location_t loc, tree t) { +++ INFORM(loc, "attempting case substitution at: line %u, col %u\n", +++ LOCATION_LINE(loc), LOCATION_COLUMN(loc)); +++ tree subs = NULL_TREE; +++ const char *name = NULL; +++ if (cosmo_ctx.active) { +++ subs = patch_int_nonconst(loc, t, &name); +++ if (subs != NULL_TREE) { +++ DEBUGF("folding...\n"); +++ subs = c_fully_fold(subs, false, NULL, false); +++ /* this substitution was successful, so record +++ * the location for rewriting the thing later */ +++ add_context_subu(&cosmo_ctx, loc, name, strlen(name), +++ PORTCOSMO_SWCASE); +++ } +++ } +++ return subs; +++} +++ +++tree patch_init_nonconst(location_t loc, tree t) { +++ INFORM(loc, "attempting init substitution at: line %u, col %u\n", +++ LOCATION_LINE(loc), LOCATION_COLUMN(loc)); +++ tree subs = NULL_TREE; +++ const char *name = NULL; +++ if (cosmo_ctx.active) { +++ subs = patch_int_nonconst(loc, t, &name); +++ if (subs != NULL_TREE) { +++ DEBUGF("folding...\n"); +++ subs = c_fully_fold(subs, false, NULL, false); +++ /* this substitution was successful, so record +++ * the location for rewriting the thing later */ +++ add_context_subu(&cosmo_ctx, loc, name, strlen(name), +++ PORTCOSMO_INITVAL); +++ } +++ } +++ return subs; +++} +++ +++/* internal functions */ +++ +++static tree patch_int_nonconst(location_t loc, tree t, const char **res) { +++ /* t may be an integer inside a case label, or +++ * t may be an integer inside an initializer */ +++ tree subs = NULL_TREE; +++ switch (TREE_CODE(t)) { +++ case VAR_DECL: +++ subs = maybe_get_ifsw_identifier(IDENTIFIER_NAME(t)); +++ if (subs != NULL_TREE && TREE_STATIC(subs) && TREE_READONLY(subs)) { +++ subs = DECL_INITIAL(subs); +++ *res = IDENTIFIER_NAME(t); +++ DEBUGF("substitution exists %s\n", *res); +++ } +++ break; +++ case NOP_EXPR: +++ subs = patch_int_nonconst(loc, TREE_OPERAND(t, 0), res); +++ if (subs != NULL_TREE) { +++ subs = build1(NOP_EXPR, integer_type_node, subs); +++ } +++ break; +++ case NEGATE_EXPR: +++ subs = patch_int_nonconst(loc, TREE_OPERAND(t, 0), res); +++ if (subs != 
NULL_TREE) { +++ subs = build1(NEGATE_EXPR, integer_type_node, subs); +++ } +++ break; +++ case BIT_NOT_EXPR: +++ subs = patch_int_nonconst(loc, TREE_OPERAND(t, 0), res); +++ if (subs != NULL_TREE) { +++ subs = build1(BIT_NOT_EXPR, integer_type_node, subs); +++ } +++ break; +++ default: +++ subs = NULL_TREE; +++ } +++ return subs; +++} +++ +++const char *get_tree_code_str(tree expr) { +++#define END_OF_BASE_TREE_CODES +++#define DEFTREECODE(a, b, c, d) \ +++ case a: \ +++ return b; +++ switch (TREE_CODE(expr)) { +++#include "all-tree.def" +++ default: +++ return ""; +++ } +++#undef DEFTREECODE +++#undef END_OF_BASE_TREE_CODES +++} +++ +++static tree maybe_get_ifsw_identifier(const char *s) { +++ char *result = (char *)xmalloc(strlen("__tmpcosmo_") + strlen(s) + 1); +++ strcpy(result, "__tmpcosmo_"); +++ strcat(result, s); +++ tree t = maybe_get_identifier(result); +++ free(result); +++ if (t != NULL_TREE && lookup_name(t) != NULL_TREE) { +++ return lookup_name(t); +++ } +++ return NULL_TREE; +++} +++ +++tree get_ifsw_identifier(char *s) { +++ char *result = (char *)xmalloc(strlen("__tmpcosmo_") + strlen(s) + 1); +++ strcpy(result, "__tmpcosmo_"); +++ strcat(result, s); +++ tree t = lookup_name(get_identifier(result)); +++ free(result); +++ return t; +++} +++ +++int get_value_of_const(char *name) { +++ tree vx = get_ifsw_identifier(name); +++ int z = tree_to_shwi(DECL_INITIAL(vx)); +++ return z; +++} +++ +++int check_magic_equal(tree value, char *varname) { +++ tree vx = get_ifsw_identifier(varname); +++ return tree_int_cst_equal(value, DECL_INITIAL(vx)); +++} ++diff --git a/gcc/c-family/portcosmo.h b/gcc/c-family/portcosmo.h ++new file mode 100644 ++index 000000000..b25d8257b ++--- /dev/null +++++ b/gcc/c-family/portcosmo.h ++@@ -0,0 +1,13 @@ +++#ifndef PORTCOSMO_H +++#define PORTCOSMO_H +++#include +++ +++void portcosmo_setup(); +++void portcosmo_teardown(); +++void portcosmo_pre_genericize(void*); +++void portcosmo_finish_decl(void*); +++void portcosmo_show_tree(location_t, tree); +++tree patch_case_nonconst(location_t, tree); +++tree patch_init_nonconst(location_t, tree); +++ +++#endif /* PORTCOSMO_H */ ++diff --git a/gcc/c-family/portcosmo.internal.h b/gcc/c-family/portcosmo.internal.h ++new file mode 100644 ++index 000000000..1d361bfa7 ++--- /dev/null +++++ b/gcc/c-family/portcosmo.internal.h ++@@ -0,0 +1,55 @@ +++#ifndef PORTCOSMO_INTERNAL_H +++#define PORTCOSMO_INTERNAL_H +++/* first stdlib headers */ +++#include +++/* now all the plugin headers */ +++#include +++/* first gcc-plugin, then the others */ +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++ +++const char *get_tree_code_str(tree); +++int get_value_of_const(char *); +++tree get_ifsw_identifier(char *); +++int check_magic_equal(tree, char *); +++ +++#define EXPR_LOC_LINE(x) LOCATION_LINE(EXPR_LOCATION((x))) +++#define EXPR_LOC_COL(x) LOCATION_COLUMN(EXPR_LOCATION((x))) +++#define LOCATION_APPROX(x, y) (LOCATION_LINE((x)) == LOCATION_LINE((y))) +++#define LOCATION_BEFORE(x, y) (LOCATION_LINE((x)) <= LOCATION_LINE((y))) +++#define LOCATION_AFTER(x, y) (LOCATION_LINE((x)) >= LOCATION_LINE((y))) +++ +++#define LOCATION_BEFORE2(x, y) \ +++ (LOCATION_LINE((x)) < LOCATION_LINE((y)) || \ +++ (LOCATION_LINE((x)) == LOCATION_LINE((y)) && \ +++ LOCATION_COLUMN((x)) <= LOCATION_COLUMN((y)))) +++#define LOCATION_AFTER2(x, y) \ +++ (LOCATION_LINE((x)) > LOCATION_LINE((y)) || \ +++ (LOCATION_LINE((x)) == LOCATION_LINE((y)) && \ +++ LOCATION_COLUMN((x)) >= 
LOCATION_COLUMN((y)))) +++ +++#define VAR_NAME_AS_TREE(fname) lookup_name(get_identifier((fname))) +++#define IDENTIFIER_NAME(z) IDENTIFIER_POINTER(DECL_NAME((z))) +++#define BUILD_STRING_AS_TREE(str) build_string_literal(strlen((str)) + 1, (str)) +++ +++#if 0 +++#define DEBUGF(...) fprintf(stderr, " " __VA_ARGS__) +++#define INFORM(...) inform(__VA_ARGS__) +++#else +++#define DEBUGF(...) +++#define INFORM(...) +++#endif +++ +++#define STRING_BUFFER_SIZE 192 +++ +++void handle_pre_genericize(void *, void *); +++ +++#endif /* PORTCOSMO.INTERNAL_H */ ++diff --git a/gcc/c-family/subcontext.cc b/gcc/c-family/subcontext.cc ++new file mode 100644 ++index 000000000..e3e02800d ++--- /dev/null +++++ b/gcc/c-family/subcontext.cc ++@@ -0,0 +1,241 @@ +++/*- mode:c++;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│ +++│vi: set net ft=c++ ts=2 sts=2 sw=2 fenc=utf-8 :vi│ +++╞══════════════════════════════════════════════════════════════════════════════╡ +++│ Copyright © 2022, Gautham Venkatasubramanian │ +++│ │ +++│ Permission to use, copy, modify, and/or distribute this software for │ +++│ any purpose with or without fee is hereby granted, provided that the │ +++│ above copyright notice and this permission notice appear in all copies. │ +++│ │ +++│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │ +++│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │ +++│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │ +++│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │ +++│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │ +++│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │ +++│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │ +++│ PERFORMANCE OF THIS SOFTWARE. │ +++╚─────────────────────────────────────────────────────────────────────────────*/ +++#include "c-family/subcontext.h" +++ +++subu_node *build_subu(const location_t loc, const char *name, +++ unsigned int namelen, SubstType tp) { +++ /* xmalloc because malloc is poisoned by gcc-plugin's system.h */ +++ subu_node *res = (subu_node *)xmalloc(sizeof(subu_node)); +++ res->next = NULL; +++ res->loc = loc; +++ res->name = xstrndup(name, namelen); +++ res->tp = tp; +++ DEBUGF("allocated subu_node at %p\n", res); +++ return res; +++}; +++ +++void delete_subu(subu_node *node) { +++ DEBUGF("freeing subu_node at %p, %u,%u\n", node, LOCATION_LINE(node->loc), +++ LOCATION_COLUMN(node->loc)); +++ node->loc = 0x0; +++ free(node->name); +++ node->next = NULL; +++ node->tp = PORTCOSMO_UNKNOWN; +++ free(node); +++} +++ +++subu_list *init_subu_list() { +++ subu_list *res = (subu_list *)xmalloc(sizeof(subu_list)); +++ res->head = NULL; +++ res->count = 0; +++ res->start = 0; +++ res->end = 0; +++ DEBUGF("allocated subu_list at %p\n", res); +++ return res; +++} +++ +++static void recount_subu_list(subu_list *list) { +++ int i = 0; +++ location_t s = MAX_LOCATION_T; +++ location_t e = 0; +++ subu_node *it; +++ for (it = list->head; it != NULL; it = it->next) { +++ i += 1; +++ /* is it possible to compare for s and e? 
*/ +++ if (s == MAX_LOCATION_T || LOCATION_BEFORE2(it->loc, s)) s = it->loc; +++ if (LOCATION_AFTER2(it->loc, e)) e = it->loc; +++ } +++ if (LOCATION_AFTER2(s, e)) { +++ s = e; +++ } +++ list->start = s; +++ list->end = e; +++ list->count = i; +++ DEBUGF("list with %d subus, start = %u,%u end = %u,%u\n", list->count, +++ LOCATION_LINE(list->start), LOCATION_COLUMN(list->start), +++ LOCATION_LINE(list->end), LOCATION_COLUMN(list->end)); +++} +++ +++void add_subu_elem(subu_list *list, subu_node *node) { +++ subu_node *tmp; +++ if (list->head == NULL) { +++ list->head = node; +++ } else { +++ for (tmp = list->head; tmp->next != NULL; tmp = tmp->next) +++ ; +++ tmp->next = node; +++ node->next = NULL; +++ } +++ recount_subu_list(list); +++} +++ +++void pop_subu_list(subu_list *list) { +++ if (list->head != NULL) { +++ subu_node *tmp = list->head; +++ list->head = list->head->next; +++ delete_subu(tmp); +++ } +++ recount_subu_list(list); +++} +++ +++int valid_subu_bounds(subu_list *list, location_t start, location_t end) { +++ /* return 1 if the bounds of list and provided bounds overlap */ +++ if (LOCATION_BEFORE(list->start, end) && LOCATION_AFTER(list->start, start)) +++ return 1; +++ if (LOCATION_BEFORE(start, list->end) && LOCATION_AFTER(start, list->start)) +++ return 1; +++ return 0; +++} +++ +++int check_loc_in_bound(subu_list *list, location_t loc) { +++ /* return 1 if loc is within the bounds */ +++ if (LOCATION_BEFORE(list->start, loc) && LOCATION_AFTER(list->end, loc)) { +++ return 1; +++ } else { +++ return 0; +++ } +++} +++ +++int get_subu_elem(subu_list *list, location_t loc, subu_node **node) { +++ /* *node is overwritten on returning 1 ie success */ +++ subu_node *it = list->head; +++ for (; it != NULL; it = it->next) { +++ if (LOCATION_APPROX(it->loc, loc)) { +++ *node = it; +++ return 1; +++ } +++ } +++ return 0; +++} +++ +++int get_subu_elem2(subu_list *list, source_range rng, subu_node **node) { +++ /* *node is overwritten on returning 1 ie success */ +++ /* returns the first node found within rng's bounds */ +++ subu_node *it = list->head; +++ for (; it != NULL; it = it->next) { +++ if (LOCATION_BEFORE(rng.m_start, it->loc) && +++ LOCATION_AFTER(rng.m_finish, it->loc)) { +++ *node = it; +++ return 1; +++ } +++ } +++ return 0; +++} +++ +++void remove_subu_elem(subu_list *list, subu_node *node) { +++ subu_node *cur, *prev; +++ if (list->head != NULL) { +++ if (list->head == node) { +++ cur = list->head; +++ list->head = list->head->next; +++ delete_subu(cur); +++ } else { +++ prev = list->head; +++ cur = list->head->next; +++ for (; cur != NULL; prev = cur, cur = cur->next) { +++ if (cur == node) { +++ prev->next = cur->next; +++ delete_subu(cur); +++ break; +++ } +++ } +++ } +++ recount_subu_list(list); +++ } +++} +++ +++void clear_subu_list(subu_list *list) { +++ subu_node *it, *tmp; +++ for (it = list->head; it != NULL;) { +++ tmp = it; +++ it = it->next; +++ delete_subu(tmp); +++ } +++ list->head = NULL; +++ list->count = 0; +++ list->start = 0; +++ list->end = 0; +++} +++ +++void delete_subu_list(subu_list *list) { +++ clear_subu_list(list); +++ free(list); +++ DEBUGF("freeing subu_list at %p\n", list); +++} +++ +++int check_empty_subu_list(subu_list *list, location_t start) { +++ /* we should have modded all locations before start, and so +++ * list should not contain any entries which have a location +++ * before start */ +++ int errcount = 0; +++ for (auto it = list->head; it; it = it->next) { +++ if (start == MAX_LOCATION_T || LOCATION_BEFORE2(it->loc, start)) { +++ 
error_at(it->loc, "unable to substitute constant\n"); +++ errcount += 1; +++ } +++ } +++ if (errcount != 0) { +++ /* DON'T DELETE! */ +++ clear_subu_list(list); +++ } +++ return errcount == 0; +++} +++ +++void construct_context(SubContext *ctx) { +++ ctx->active = 1; +++ ctx->mods = init_subu_list(); +++ ctx->prev = NULL; +++ ctx->switchcount = 0; +++ ctx->initcount = 0; +++ ctx->subcount = 0; +++} +++ +++void add_context_subu(SubContext *ctx, const location_t loc, const char *defn, +++ unsigned int at, SubstType st) { +++ if (ctx->mods == NULL) return; +++ add_subu_elem(ctx->mods, build_subu(loc, defn, at, st)); +++} +++ +++void check_context_clear(SubContext *ctx, location_t start) { +++ if (ctx->mods) { +++ ctx->active = check_empty_subu_list(ctx->mods, start); +++ } +++} +++ +++void cleanup_context(SubContext *ctx) { +++ check_context_clear(ctx, MAX_LOCATION_T); +++ if (ctx->mods) { +++ delete_subu_list(ctx->mods); +++ ctx->mods = NULL; +++ } +++ ctx->prev = NULL; +++ if (ctx->switchcount > 0) { +++ inform(UNKNOWN_LOCATION, "rewrote %u switch statements", ctx->switchcount); +++ } +++ ctx->switchcount = 0; +++ if (ctx->initcount > 0) { +++ inform(UNKNOWN_LOCATION, "modified %u initializations", ctx->initcount); +++ } +++ ctx->initcount = 0; +++ if (ctx->subcount > 0) { +++ inform(UNKNOWN_LOCATION, "modified %u other macro uses", ctx->subcount); +++ } +++ ctx->subcount = 0; +++ ctx->active = 0; +++} ++diff --git a/gcc/c-family/subcontext.h b/gcc/c-family/subcontext.h ++new file mode 100644 ++index 000000000..558e4fd06 ++--- /dev/null +++++ b/gcc/c-family/subcontext.h ++@@ -0,0 +1,81 @@ +++#ifndef SUBCONTEXT_H +++#define SUBCONTEXT_H +++#include "c-family/portcosmo.internal.h" +++ +++enum SubstType { +++ PORTCOSMO_UNKNOWN = 0, +++ PORTCOSMO_SWCASE = 1, +++ PORTCOSMO_INITVAL = 2 +++}; +++ +++struct _subu_node { +++ /* a node indicating that an ifswitch substitution has occurred. +++ * +++ * Details include: +++ * +++ * - location_t of the substitution +++ * - char* of name of the macro that was substituted (alloc'd) +++ * - whether the substitution was inside a switch statement +++ * - _subu_node* pointer to the next element in the list (NULL if last) +++ * +++ * the idea is that every time one of our modified macros is used, +++ * we record the substitution, and then we delete this record if +++ * we find the appropriate location_t during pre-genericize and +++ * construct the necessary parse trees at that point. +++ * +++ * at the end of compilation (ie PLUGIN_FINISH), there should be +++ * no subu_nodes allocated. 
+++ */ +++ location_t loc; +++ SubstType tp; +++ char *name; +++ struct _subu_node *next; +++}; +++ +++typedef struct _subu_node subu_node; +++ +++struct _subu_list { +++ subu_node *head; +++ /* inclusive bounds, range containing all recorded substitutions */ +++ location_t start, end; +++ /* number of substitutions */ +++ int count; +++}; +++typedef struct _subu_list subu_list; +++ +++int check_loc_in_bound(subu_list *, location_t); +++int valid_subu_bounds(subu_list *, location_t, location_t); +++int get_subu_elem(subu_list *, location_t, subu_node **); +++int get_subu_elem2(subu_list *, source_range, subu_node **); +++void remove_subu_elem(subu_list *, subu_node *); +++ +++/* Substitution Context */ +++struct SubContext { +++ /* record all macro uses */ +++ subu_list *mods; +++ /* address of the previous statement we walked through, +++ * in case we missed modding it and have to retry */ +++ tree *prev; +++ /* count number of switch statements rewritten */ +++ unsigned int switchcount; +++ /* count number of initializations rewritten */ +++ unsigned int initcount; +++ /* count number of other substitutions rewritten */ +++ unsigned int subcount; +++ /* if zero, it means we haven't started or something +++ * went wrong somewhere */ +++ int active; +++}; +++ +++void add_context_subu(SubContext *, const location_t, const char *, +++ unsigned int, SubstType); +++void construct_context(SubContext *); +++void check_context_clear(SubContext *, location_t); +++void cleanup_context(SubContext *); +++ +++int arg_should_be_unpatched(tree, const subu_node *, tree *); +++ +++/* declaring cosmo_ctx here so initstruct knows it exists */ +++extern struct SubContext cosmo_ctx; +++ +++#endif /* SUBCONTEXT_H */ ++diff --git a/gcc/c-family/unpatch_ast.cc b/gcc/c-family/unpatch_ast.cc ++new file mode 100644 ++index 000000000..56b5b9a01 ++--- /dev/null +++++ b/gcc/c-family/unpatch_ast.cc ++@@ -0,0 +1,115 @@ +++/*- mode:c++;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│ +++│vi: set net ft=c++ ts=2 sts=2 sw=2 fenc=utf-8 :vi│ +++╞══════════════════════════════════════════════════════════════════════════════╡ +++│ Copyright © 2022, Gautham Venkatasubramanian │ +++│ │ +++│ Permission to use, copy, modify, and/or distribute this software for │ +++│ any purpose with or without fee is hereby granted, provided that the │ +++│ above copyright notice and this permission notice appear in all copies. │ +++│ │ +++│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │ +++│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │ +++│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │ +++│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │ +++│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │ +++│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │ +++│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │ +++│ PERFORMANCE OF THIS SOFTWARE. 
│ +++╚─────────────────────────────────────────────────────────────────────────────*/ +++#include "c-family/ifswitch.h" +++#include "c-family/initstruct.h" +++ +++tree check_usage(tree *tp, int *check_subtree, void *data) { +++ SubContext *ctx = (SubContext *)(data); +++ tree t = *tp; +++ tree z; +++ subu_node *use = NULL; +++ location_t loc = EXPR_LOCATION(t); +++ source_range rng = EXPR_LOCATION_RANGE(t); +++ +++ if (ctx->active == 0 || ctx->mods->count == 0) { +++ /* DEBUGF("substitutions complete\n"); */ +++ *check_subtree = 0; +++ return NULL_TREE; +++ } +++ +++ if (LOCATION_AFTER2(loc, rng.m_start)) { +++ loc = rng.m_start; +++ } else { +++ rng.m_start = loc; +++ } +++ +++ if (ctx->prev && LOCATION_BEFORE2(ctx->mods->start, rng.m_start)) { +++ auto vloc = DECL_SOURCE_LOCATION(DECL_EXPR_DECL(*(ctx->prev))); +++ /* below inequality holds inside this if condition: +++ * vloc <= ctx->mods->start <= rng.m_start +++ * this means that there was a macro substitution +++ * between vloc and rng.m_start, which was not +++ * eliminated when we went through the other parts +++ * of the parse tree earlier. thus, the decl_expr +++ * that we have stored in ctx->prev needs to be +++ * checked for possible macro substitutions */ +++ DEBUGF( +++ "did we miss a decl? vloc=%u,%u, loc=%u,%u, rng.mstart=%u,%u, " +++ "start=%u,%u\n", +++ LOCATION_LINE(vloc), LOCATION_COLUMN(vloc), // +++ LOCATION_LINE(loc), LOCATION_COLUMN(loc), // +++ LOCATION_LINE(rng.m_start), LOCATION_COLUMN(rng.m_start), +++ LOCATION_LINE(ctx->mods->start), LOCATION_COLUMN(ctx->mods->start)); +++ auto z = ctx->initcount; +++ build_modded_declaration(ctx->prev, ctx, rng.m_start); +++ if (z != ctx->initcount) { +++ ctx->prev = NULL; +++ check_context_clear(ctx, loc); +++ } +++ } +++ +++ if (TREE_CODE(t) == DECL_EXPR && TREE_STATIC(DECL_EXPR_DECL(t))) { +++ INFORM(loc, "should we mod this?\n"); +++ ctx->prev = tp; +++ } +++ +++ if (TREE_CODE(t) == SWITCH_STMT) { +++ rng = get_switch_bounds(t); +++ if (valid_subu_bounds(ctx->mods, rng.m_start, rng.m_finish) && +++ count_mods_in_switch(t, ctx->mods) > 0) { +++ /* this is one of the switch statements +++ * where we modified a case label */ +++ DEBUGF("modding the switch \n"); +++ *tp = build_modded_switch_stmt(t, ctx); +++ DEBUGF("we modded it??\n"); +++ walk_tree_without_duplicates(tp, check_usage, ctx); +++ /* due to the above call, I don't need to check +++ * any subtrees from this current location */ +++ *check_subtree = 0; +++ ctx->switchcount += 1; +++ return NULL_TREE; +++ } +++ } +++ +++ return NULL_TREE; +++} +++ +++void handle_pre_genericize(void *gcc_data, void *user_data) { +++ tree t = (tree)gcc_data; +++ SubContext *ctx = (SubContext *)user_data; +++ tree t2; +++ if (ctx->active && TREE_CODE(t) == FUNCTION_DECL && +++ DECL_INITIAL(t) != NULL && TREE_STATIC(t)) { +++ /* this function is defined within the file I'm processing */ +++ if (ctx->mods->count == 0) { +++ // DEBUGF("no substitutions were made in %s\n", IDENTIFIER_NAME(t)); +++ return; +++ } +++ t2 = DECL_SAVED_TREE(t); +++ ctx->prev = NULL; +++ walk_tree_without_duplicates(&t2, check_usage, ctx); +++ /* now at this stage, all uses of our macros have been +++ * fixed, INCLUDING case labels. 
Let's confirm that: */ +++ check_context_clear(ctx, MAX_LOCATION_T); +++ } +++} +++ +++void portcosmo_pre_genericize(void *gcc_data) { +++ handle_pre_genericize(gcc_data, (void *)(&cosmo_ctx)); +++} ++diff --git a/gcc/c-family/unpatch_int.cc b/gcc/c-family/unpatch_int.cc ++new file mode 100644 ++index 000000000..08c96b544 ++--- /dev/null +++++ b/gcc/c-family/unpatch_int.cc ++@@ -0,0 +1,63 @@ +++/*- mode:c++;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│ +++│vi: set net ft=c++ ts=2 sts=2 sw=2 fenc=utf-8 :vi│ +++╞══════════════════════════════════════════════════════════════════════════════╡ +++│ Copyright © 2022, Gautham Venkatasubramanian │ +++│ │ +++│ Permission to use, copy, modify, and/or distribute this software for │ +++│ any purpose with or without fee is hereby granted, provided that the │ +++│ above copyright notice and this permission notice appear in all copies. │ +++│ │ +++│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │ +++│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │ +++│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │ +++│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │ +++│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │ +++│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │ +++│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │ +++│ PERFORMANCE OF THIS SOFTWARE. │ +++╚─────────────────────────────────────────────────────────────────────────────*/ +++#include "c-family/subcontext.h" +++ +++int arg_should_be_unpatched(tree arg, const subu_node *use, tree *rep_ptr) { +++ /* if we are returning 1, rep_ptr has been set. +++ * if we are returning 0, rep_ptr is unchanged. +++ * use is not affected! */ +++ if (TREE_CODE(arg) == INTEGER_CST) { +++ tree vx = DECL_INITIAL(get_ifsw_identifier(use->name)); +++ if (tree_int_cst_equal(arg, vx)) { +++ /* if this is an integer constant, AND its +++ * value is equal to the macro we substituted, +++ * then we replace the correct variable here */ +++ *rep_ptr = +++ build1(NOP_EXPR, integer_type_node, VAR_NAME_AS_TREE(use->name)); +++ INFORM(use->loc, "unpatched an integer here with %s\n", use->name); +++ return 1; +++ } +++ /* here you might want to handle some +++ * minimal constant folding algebra, +++ * like -VAR or ~VAR */ +++ if (tree_fits_poly_int64_p(vx) && tree_fits_poly_int64_p(arg)) { +++ auto v1 = tree_to_poly_int64(vx); +++ auto v2 = tree_to_poly_int64(arg); +++ +++ /* handle the -VAR case */ +++ if (known_eq(v1, -v2)) { +++ INFORM(use->loc, "unpatched an integer here with -%s\n", use->name); +++ *rep_ptr = +++ build1(NEGATE_EXPR, integer_type_node, VAR_NAME_AS_TREE(use->name)); +++ return 1; +++ } +++ +++ /* handle the ~VAR case */ +++ if (known_eq(v1, ~v2)) { +++ INFORM(use->loc, "unpatched an integer here with ~%s\n", use->name); +++ *rep_ptr = build1(BIT_NOT_EXPR, integer_type_node, +++ VAR_NAME_AS_TREE(use->name)); +++ return 1; +++ } +++ } +++ return 0; +++ } +++ +++ return 0; +++} ++diff --git a/gcc/c/Make-lang.in b/gcc/c/Make-lang.in ++index a1cdee872..ce57e8a27 100644 ++--- a/gcc/c/Make-lang.in +++++ b/gcc/c/Make-lang.in ++@@ -55,8 +55,10 @@ C_AND_OBJC_OBJS = attribs.o c/c-errors.o c/c-decl.o c/c-typeck.o \ ++ c/c-fold.o c/gimple-parser.o \ ++ $(C_COMMON_OBJS) $(C_TARGET_OBJS) ++ +++PORTCOSMO_C_OBJS = c/portcosmo_bcref.o +++ ++ # Language-specific object files for C. 
++-C_OBJS = c/c-lang.o c-family/stub-objc.o $(C_AND_OBJC_OBJS) +++C_OBJS = c/c-lang.o c-family/stub-objc.o $(C_AND_OBJC_OBJS) $(PORTCOSMO_C_OBJS) ++ c_OBJS = $(C_OBJS) cc1-checksum.o c/gccspec.o ++ ++ # Use strict warnings for this front end. ++diff --git a/gcc/c/c-decl.c b/gcc/c/c-decl.c ++index 53b2b5b63..51f3a72e5 100644 ++--- a/gcc/c/c-decl.c +++++ b/gcc/c/c-decl.c ++@@ -58,6 +58,7 @@ along with GCC; see the file COPYING3. If not see ++ #include "c-family/name-hint.h" ++ #include "c-family/known-headers.h" ++ #include "c-family/c-spellcheck.h" +++#include "c-family/portcosmo.h" ++ #include "context.h" /* For 'g'. */ ++ #include "omp-general.h" ++ #include "omp-offload.h" /* For offload_vars. */ ++@@ -5685,6 +5686,9 @@ finish_decl (tree decl, location_t init_loc, tree init, ++ && !DECL_HARD_REGISTER (decl)) ++ targetm.lower_local_decl_alignment (decl); ++ +++ if(flag_portcosmo) { +++ portcosmo_finish_decl(decl); +++ } ++ invoke_plugin_callbacks (PLUGIN_FINISH_DECL, decl); ++ } ++ ++@@ -10277,6 +10281,10 @@ finish_function (location_t end_loc) ++ { ++ if (!decl_function_context (fndecl)) ++ { +++ if (flag_portcosmo) +++ { +++ portcosmo_pre_genericize(fndecl); +++ } ++ invoke_plugin_callbacks (PLUGIN_PRE_GENERICIZE, fndecl); ++ c_genericize (fndecl); ++ ++diff --git a/gcc/c/c-typeck.c b/gcc/c/c-typeck.c ++index b5d139e5d..2df309eba 100644 ++--- a/gcc/c/c-typeck.c +++++ b/gcc/c/c-typeck.c ++@@ -46,6 +46,7 @@ along with GCC; see the file COPYING3. If not see ++ #include "omp-general.h" ++ #include "c-family/c-objc.h" ++ #include "c-family/c-ubsan.h" +++#include "c-family/portcosmo.h" ++ #include "gomp-constants.h" ++ #include "spellcheck-tree.h" ++ #include "gcc-rich-location.h" ++@@ -8170,8 +8171,17 @@ digest_init (location_t init_loc, tree type, tree init, tree origtype, ++ && !initializer_constant_valid_p (inside_init, ++ TREE_TYPE (inside_init))) ++ { ++- error_init (init_loc, "initializer element is not constant"); ++- inside_init = error_mark_node; +++ if (flag_portcosmo) { +++ inside_init = patch_init_nonconst(init_loc, inside_init); +++ if (inside_init == NULL_TREE) { +++ error_init (init_loc, "initializer element is not constant"); +++ inside_init = error_mark_node; +++ } +++ } +++ else { +++ error_init (init_loc, "initializer element is not constant"); +++ inside_init = error_mark_node; +++ } ++ } ++ else if (require_constant && !maybe_const) ++ pedwarn_init (init_loc, OPT_Wpedantic, ++@@ -9787,7 +9797,7 @@ output_init_element (location_t loc, tree value, tree origtype, ++ /* Proceed to check the constness of the original initializer. */ ++ if (!initializer_constant_valid_p (value, TREE_TYPE (value))) ++ { ++- if (require_constant_value) +++ if (require_constant_value && !flag_portcosmo) ++ { ++ error_init (loc, "initializer element is not constant"); ++ value = error_mark_node; ++diff --git a/gcc/c/portcosmo_bcref.cc b/gcc/c/portcosmo_bcref.cc ++new file mode 100644 ++index 000000000..2c6a58296 ++--- /dev/null +++++ b/gcc/c/portcosmo_bcref.cc ++@@ -0,0 +1,30 @@ +++/*- mode:c++;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│ +++│vi: set net ft=c++ ts=2 sts=2 sw=2 fenc=utf-8 :vi│ +++╞══════════════════════════════════════════════════════════════════════════════╡ +++│ Copyright © 2022, Gautham Venkatasubramanian │ +++│ │ +++│ Permission to use, copy, modify, and/or distribute this software for │ +++│ any purpose with or without fee is hereby granted, provided that the │ +++│ above copyright notice and this permission notice appear in all copies. 
│ +++│ │ +++│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │ +++│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │ +++│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │ +++│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │ +++│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │ +++│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │ +++│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │ +++│ PERFORMANCE OF THIS SOFTWARE. │ +++╚─────────────────────────────────────────────────────────────────────────────*/ +++#include "c-family/initstruct.h" +++ +++/* initstruct/common.cc */ +++ +++tree access_at(tree obj, tree ind) { +++ if (TREE_CODE(TREE_TYPE(obj)) == ARRAY_TYPE) { +++ return build_array_ref(input_location, obj, ind); +++ } +++ return build_component_ref(input_location, obj, +++ get_identifier(IDENTIFIER_NAME(ind)), +++ DECL_SOURCE_LOCATION(ind)); +++} ++diff --git a/gcc/cp/Make-lang.in b/gcc/cp/Make-lang.in ++index 155be74ef..2fe9484ff 100644 ++--- a/gcc/cp/Make-lang.in +++++ b/gcc/cp/Make-lang.in ++@@ -84,6 +84,10 @@ g++-cross$(exeext): xg++$(exeext) ++ CXX_C_OBJS = attribs.o incpath.o \ ++ $(C_COMMON_OBJS) $(CXX_TARGET_OBJS) ++ +++# initstruct has some issues building with cc1plus, +++# so we provide nothing for now +++PORTCOSMO_CXX_OBJS = cp/portcosmo_bcref_cp.o +++ ++ # Language-specific object files for C++ and Objective C++. ++ CXX_AND_OBJCXX_OBJS = \ ++ cp/call.o cp/class.o cp/constexpr.o cp/constraint.o \ ++@@ -101,7 +105,7 @@ CXX_AND_OBJCXX_OBJS = \ ++ cp/rtti.o \ ++ cp/search.o cp/semantics.o \ ++ cp/tree.o cp/typeck.o cp/typeck2.o \ ++- cp/vtable-class-hierarchy.o $(CXX_C_OBJS) +++ cp/vtable-class-hierarchy.o $(CXX_C_OBJS) $(PORTCOSMO_CXX_OBJS) ++ ++ ifeq ($(if $(wildcard ../stage_current),$(shell cat \ ++ ../stage_current)),stageautofeedback) ++diff --git a/gcc/cp/decl.c b/gcc/cp/decl.c ++index 5e101ffb8..1f68dddbf 100644 ++--- a/gcc/cp/decl.c +++++ b/gcc/cp/decl.c ++@@ -56,6 +56,7 @@ along with GCC; see the file COPYING3. If not see ++ #include "context.h" /* For 'g'. */ ++ #include "omp-general.h" ++ #include "omp-offload.h" /* For offload_vars. */ +++#include "c-family/portcosmo.h" ++ ++ /* Possible cases of bad specifiers type used by bad_specifiers. */ ++ enum bad_spec_place { ++@@ -8246,6 +8247,9 @@ cp_finish_decl (tree decl, tree init, bool init_const_expr_p, ++ && !DECL_HARD_REGISTER (decl)) ++ targetm.lower_local_decl_alignment (decl); ++ +++ if (flag_portcosmo) { +++ portcosmo_finish_decl(decl); +++ } ++ invoke_plugin_callbacks (PLUGIN_FINISH_DECL, decl); ++ } ++ ++@@ -17459,8 +17463,13 @@ finish_function (bool inline_p) ++ maybe_save_constexpr_fundef (fndecl); ++ ++ /* Invoke the pre-genericize plugin before we start munging things. */ ++- if (!processing_template_decl) +++ if (!processing_template_decl) { +++ if (flag_portcosmo) +++ { +++ portcosmo_pre_genericize(fndecl); +++ } ++ invoke_plugin_callbacks (PLUGIN_PRE_GENERICIZE, fndecl); +++ } ++ ++ /* Perform delayed folding before NRV transformation. 
*/ ++ if (!processing_template_decl ++diff --git a/gcc/cp/portcosmo_bcref_cp.cc b/gcc/cp/portcosmo_bcref_cp.cc ++new file mode 100644 ++index 000000000..bf01d9aeb ++--- /dev/null +++++ b/gcc/cp/portcosmo_bcref_cp.cc ++@@ -0,0 +1,42 @@ +++/*- mode:c++;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│ +++│vi: set net ft=c++ ts=2 sts=2 sw=2 fenc=utf-8 :vi│ +++╞══════════════════════════════════════════════════════════════════════════════╡ +++│ Copyright © 2022, Gautham Venkatasubramanian │ +++│ │ +++│ Permission to use, copy, modify, and/or distribute this software for │ +++│ any purpose with or without fee is hereby granted, provided that the │ +++│ above copyright notice and this permission notice appear in all copies. │ +++│ │ +++│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │ +++│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │ +++│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │ +++│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │ +++│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │ +++│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │ +++│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │ +++│ PERFORMANCE OF THIS SOFTWARE. │ +++╚─────────────────────────────────────────────────────────────────────────────*/ +++#include "config.h" +++#include "system.h" +++#include "coretypes.h" +++#include "target.h" +++#include "c-family/c-target.h" +++#include "cp-tree.h" +++#include "tree.h" +++#include "stringpool.h" +++ +++#define IDENTIFIER_NAME(z) IDENTIFIER_POINTER(DECL_NAME((z))) +++ +++/* initstruct/common.cc */ +++ +++tree access_at(tree obj, tree ind) { +++ return cp_build_addr_expr(ind, 0); +++ /* +++ if (TREE_CODE(TREE_TYPE(obj)) == ARRAY_TYPE) { +++ return build_array_ref(input_location, obj, ind); +++ } +++ return build_component_ref(input_location, obj, +++ get_identifier(IDENTIFIER_NAME(ind)), +++ DECL_SOURCE_LOCATION(ind)); +++ */ +++} ++ diff --git a/third_party/gcc/upgrade-cosmo-gcc.sh b/third_party/gcc/upgrade-cosmo-gcc.sh new file mode 100755 index 000000000..e91f7e369 --- /dev/null +++ b/third_party/gcc/upgrade-cosmo-gcc.sh @@ -0,0 +1,46 @@ +#!/bin/sh + +ARCH=${1:-x86_64} +IMPORT=${2:-/opt/cross11portcosmo} +PREFIX=third_party/gcc/ +OLDVERSION=9.2.0 +NEWVERSION=11.2.0 + +rm -rf o/third_party/gcc +mv $PREFIX/libexec/gcc/$ARCH-linux-musl/$OLDVERSION $PREFIX/libexec/gcc/$ARCH-linux-musl/$NEWVERSION +mv $PREFIX/lib/gcc/$ARCH-linux-musl/$OLDVERSION $PREFIX/lib/gcc/$ARCH-linux-musl/$NEWVERSION +sed -i -e "s/$OLDVERSION/$NEWVERSION/g" $(find $PREFIX -name \*.sym | grep $ARCH) + +FILES=" +$ARCH-linux-musl/bin/ld.bfd +libexec/gcc/$ARCH-linux-musl/$NEWVERSION/collect2 +libexec/gcc/$ARCH-linux-musl/$NEWVERSION/cc1 +libexec/gcc/$ARCH-linux-musl/$NEWVERSION/cc1plus +bin/$ARCH-linux-musl-elfedit +bin/$ARCH-linux-musl-nm +bin/$ARCH-linux-musl-objcopy +bin/$ARCH-linux-musl-gcc +bin/$ARCH-linux-musl-c++filt +bin/$ARCH-linux-musl-gcc-ranlib +bin/$ARCH-linux-musl-addr2line +bin/$ARCH-linux-musl-objdump +bin/$ARCH-linux-musl-gcov +bin/$ARCH-linux-musl-ranlib +bin/$ARCH-linux-musl-gcc-nm +bin/$ARCH-linux-musl-strip +bin/$ARCH-linux-musl-gcov-tool +bin/$ARCH-linux-musl-gprof +bin/$ARCH-linux-musl-strings +bin/$ARCH-linux-musl-gcov-dump +bin/$ARCH-linux-musl-cpp +bin/$ARCH-linux-musl-ar +bin/$ARCH-linux-musl-readelf +bin/$ARCH-linux-musl-size +bin/$ARCH-linux-musl-as +bin/$ARCH-linux-musl-g++ +bin/$ARCH-linux-musl-gcc-ar +" + +for f in $FILES; do + gzip 
-9 <$IMPORT/$f >$PREFIX/$f.gz || exit +done diff --git a/third_party/gcc/x86_64-linux-musl/bin/ld.bfd.gz b/third_party/gcc/x86_64-linux-musl/bin/ld.bfd.gz index ea7e9fb5c..b3791a535 100644 Binary files a/third_party/gcc/x86_64-linux-musl/bin/ld.bfd.gz and b/third_party/gcc/x86_64-linux-musl/bin/ld.bfd.gz differ diff --git a/third_party/intel/adxintrin.internal.h b/third_party/intel/adxintrin.internal.h index b0f4e9b02..3689b9a0e 100644 --- a/third_party/intel/adxintrin.internal.h +++ b/third_party/intel/adxintrin.internal.h @@ -1,43 +1,53 @@ -#if !defined _IMMINTRIN_H_INCLUDED -#error "Never use directly; include instead." +/* clang-format off */ +#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0) +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use directly; include instead." #endif - #ifndef _ADXINTRIN_H_INCLUDED #define _ADXINTRIN_H_INCLUDED - -__funline unsigned char _subborrow_u32(unsigned char __CF, unsigned int __X, - unsigned int __Y, unsigned int *__P) { - return __builtin_ia32_sbb_u32(__CF, __X, __Y, __P); +extern __inline unsigned char +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_subborrow_u32 (unsigned char __CF, unsigned int __X, + unsigned int __Y, unsigned int *__P) +{ + return __builtin_ia32_sbb_u32 (__CF, __X, __Y, __P); } - -__funline unsigned char _addcarry_u32(unsigned char __CF, unsigned int __X, - unsigned int __Y, unsigned int *__P) { - return __builtin_ia32_addcarryx_u32(__CF, __X, __Y, __P); +extern __inline unsigned char +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_addcarry_u32 (unsigned char __CF, unsigned int __X, + unsigned int __Y, unsigned int *__P) +{ + return __builtin_ia32_addcarryx_u32 (__CF, __X, __Y, __P); } - -__funline unsigned char _addcarryx_u32(unsigned char __CF, unsigned int __X, - unsigned int __Y, unsigned int *__P) { - return __builtin_ia32_addcarryx_u32(__CF, __X, __Y, __P); +extern __inline unsigned char +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_addcarryx_u32 (unsigned char __CF, unsigned int __X, + unsigned int __Y, unsigned int *__P) +{ + return __builtin_ia32_addcarryx_u32 (__CF, __X, __Y, __P); } - #ifdef __x86_64__ -__funline unsigned char _subborrow_u64(unsigned char __CF, unsigned long long __X, - unsigned long long __Y, - unsigned long long *__P) { - return __builtin_ia32_sbb_u64(__CF, __X, __Y, __P); +extern __inline unsigned char +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_subborrow_u64 (unsigned char __CF, unsigned long long __X, + unsigned long long __Y, unsigned long long *__P) +{ + return __builtin_ia32_sbb_u64 (__CF, __X, __Y, __P); } - -__funline unsigned char _addcarry_u64(unsigned char __CF, unsigned long long __X, - unsigned long long __Y, - unsigned long long *__P) { - return __builtin_ia32_addcarryx_u64(__CF, __X, __Y, __P); +extern __inline unsigned char +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_addcarry_u64 (unsigned char __CF, unsigned long long __X, + unsigned long long __Y, unsigned long long *__P) +{ + return __builtin_ia32_addcarryx_u64 (__CF, __X, __Y, __P); } - -__funline unsigned char _addcarryx_u64(unsigned char __CF, unsigned long long __X, - unsigned long long __Y, - unsigned long long *__P) { - return __builtin_ia32_addcarryx_u64(__CF, __X, __Y, __P); +extern __inline unsigned char +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_addcarryx_u64 (unsigned char __CF, unsigned long long __X, + unsigned long long __Y, unsigned long long *__P) +{ + 
return __builtin_ia32_addcarryx_u64 (__CF, __X, __Y, __P); } #endif - -#endif /* _ADXINTRIN_H_INCLUDED */ +#endif +#endif diff --git a/third_party/intel/ammintrin.internal.h b/third_party/intel/ammintrin.internal.h index 66045f832..3b41f0e9b 100644 --- a/third_party/intel/ammintrin.internal.h +++ b/third_party/intel/ammintrin.internal.h @@ -1,58 +1,54 @@ +/* clang-format off */ +#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0) #ifndef _AMMINTRIN_H_INCLUDED #define _AMMINTRIN_H_INCLUDED -#ifdef __x86_64__ #include "third_party/intel/pmmintrin.internal.h" - #ifndef __SSE4A__ #pragma GCC push_options #pragma GCC target("sse4a") #define __DISABLE_SSE4A__ -#endif /* __SSE4A__ */ - -__funline void _mm_stream_sd(double* __P, __m128d __Y) { - __builtin_ia32_movntsd(__P, (__v2df)__Y); +#endif +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_stream_sd (double * __P, __m128d __Y) +{ + __builtin_ia32_movntsd (__P, (__v2df) __Y); } - -__funline void _mm_stream_ss(float* __P, __m128 __Y) { - __builtin_ia32_movntss(__P, (__v4sf)__Y); +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_stream_ss (float * __P, __m128 __Y) +{ + __builtin_ia32_movntss (__P, (__v4sf) __Y); } - -__funline __m128i _mm_extract_si64(__m128i __X, __m128i __Y) { - return (__m128i)__builtin_ia32_extrq((__v2di)__X, (__v16qi)__Y); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_extract_si64 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_extrq ((__v2di) __X, (__v16qi) __Y); } - #ifdef __OPTIMIZE__ -__funline __m128i _mm_extracti_si64(__m128i __X, unsigned const int __I, - unsigned const int __L) { - return (__m128i)__builtin_ia32_extrqi((__v2di)__X, __I, __L); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_extracti_si64 (__m128i __X, unsigned const int __I, unsigned const int __L) +{ + return (__m128i) __builtin_ia32_extrqi ((__v2di) __X, __I, __L); } #else -#define _mm_extracti_si64(X, I, L) \ - ((__m128i)__builtin_ia32_extrqi((__v2di)(__m128i)(X), (unsigned int)(I), \ - (unsigned int)(L))) +#define _mm_extracti_si64(X, I, L) ((__m128i) __builtin_ia32_extrqi ((__v2di)(__m128i)(X), (unsigned int)(I), (unsigned int)(L))) #endif - -__funline __m128i _mm_insert_si64(__m128i __X, __m128i __Y) { - return (__m128i)__builtin_ia32_insertq((__v2di)__X, (__v2di)__Y); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_insert_si64 (__m128i __X,__m128i __Y) +{ + return (__m128i) __builtin_ia32_insertq ((__v2di)__X, (__v2di)__Y); } - #ifdef __OPTIMIZE__ -__funline __m128i _mm_inserti_si64(__m128i __X, __m128i __Y, - unsigned const int __I, - unsigned const int __L) { - return (__m128i)__builtin_ia32_insertqi((__v2di)__X, (__v2di)__Y, __I, __L); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_inserti_si64(__m128i __X, __m128i __Y, unsigned const int __I, unsigned const int __L) +{ + return (__m128i) __builtin_ia32_insertqi ((__v2di)__X, (__v2di)__Y, __I, __L); } #else -#define _mm_inserti_si64(X, Y, I, L) \ - ((__m128i)__builtin_ia32_insertqi((__v2di)(__m128i)(X), \ - (__v2di)(__m128i)(Y), (unsigned int)(I), \ - (unsigned int)(L))) +#define _mm_inserti_si64(X, Y, I, L) ((__m128i) __builtin_ia32_insertqi ((__v2di)(__m128i)(X), (__v2di)(__m128i)(Y), (unsigned int)(I), (unsigned int)(L))) #endif - #ifdef __DISABLE_SSE4A__ #undef __DISABLE_SSE4A__ 
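The adxintrin rewrite above now matches the upstream GCC 11 spelling of these carry-chain wrappers verbatim. A usage sketch (the <immintrin.h> spelling is an assumption; in this tree the header actually lives at third_party/intel/adxintrin.internal.h):

    #include <immintrin.h>

    /* 128-bit addition out of 64-bit limbs: _addcarry_u64 returns
       the carry-out, which feeds the next limb's carry-in. */
    static void add128(const unsigned long long a[2],
                       const unsigned long long b[2],
                       unsigned long long out[2]) {
      unsigned char c = _addcarry_u64(0, a[0], b[0], &out[0]);
      (void)_addcarry_u64(c, a[1], b[1], &out[1]);
    }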
#pragma GCC pop_options -#endif /* __DISABLE_SSE4A__ */ - -#endif /* __x86_64__ */ -#endif /* _AMMINTRIN_H_INCLUDED */ +#endif +#endif +#endif diff --git a/third_party/intel/amxbf16intrin.internal.h b/third_party/intel/amxbf16intrin.internal.h new file mode 100644 index 000000000..1d60e0c15 --- /dev/null +++ b/third_party/intel/amxbf16intrin.internal.h @@ -0,0 +1,22 @@ +/* clang-format off */ +#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0) +#if !defined _IMMINTRIN_H_INCLUDED +#error "Never use directly; include instead." +#endif +#ifndef _AMXBF16INTRIN_H_INCLUDED +#define _AMXBF16INTRIN_H_INCLUDED +#if !defined(__AMX_BF16__) +#pragma GCC push_options +#pragma GCC target("amx-bf16") +#define __DISABLE_AMX_BF16__ +#endif +#if defined(__x86_64__) && defined(__AMX_BF16__) +#define _tile_dpbf16ps_internal(dst,src1,src2) __asm__ volatile ("{tdpbf16ps\t%%tmm"#src2", %%tmm"#src1", %%tmm"#dst"|tdpbf16ps\t%%tmm"#dst", %%tmm"#src1", %%tmm"#src2"}" ::) +#define _tile_dpbf16ps(dst,src1,src2) _tile_dpbf16ps_internal (dst, src1, src2) +#endif +#ifdef __DISABLE_AMX_BF16__ +#undef __DISABLE_AMX_BF16__ +#pragma GCC pop_options +#endif +#endif +#endif diff --git a/third_party/intel/amxint8intrin.internal.h b/third_party/intel/amxint8intrin.internal.h new file mode 100644 index 000000000..9ae0b506a --- /dev/null +++ b/third_party/intel/amxint8intrin.internal.h @@ -0,0 +1,25 @@ +/* clang-format off */ +#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0) +#if !defined _IMMINTRIN_H_INCLUDED +#error "Never use directly; include instead." +#endif +#ifndef _AMXINT8INTRIN_H_INCLUDED +#define _AMXINT8INTRIN_H_INCLUDED +#if !defined(__AMX_INT8__) +#pragma GCC push_options +#pragma GCC target("amx-int8") +#define __DISABLE_AMX_INT8__ +#endif +#if defined(__x86_64__) && defined(__AMX_INT8__) +#define _tile_int8_dp_internal(name,dst,src1,src2) __asm__ volatile ("{"#name"\t%%tmm"#src2", %%tmm"#src1", %%tmm"#dst"|"#name"\t%%tmm"#dst", %%tmm"#src1", %%tmm"#src2"}" ::) +#define _tile_dpbssd(dst,src1,src2) _tile_int8_dp_internal (tdpbssd, dst, src1, src2) +#define _tile_dpbsud(dst,src1,src2) _tile_int8_dp_internal (tdpbsud, dst, src1, src2) +#define _tile_dpbusd(dst,src1,src2) _tile_int8_dp_internal (tdpbusd, dst, src1, src2) +#define _tile_dpbuud(dst,src1,src2) _tile_int8_dp_internal (tdpbuud, dst, src1, src2) +#endif +#ifdef __DISABLE_AMX_INT8__ +#undef __DISABLE_AMX_INT8__ +#pragma GCC pop_options +#endif +#endif +#endif diff --git a/third_party/intel/amxtileintrin.internal.h b/third_party/intel/amxtileintrin.internal.h new file mode 100644 index 000000000..c7dd73554 --- /dev/null +++ b/third_party/intel/amxtileintrin.internal.h @@ -0,0 +1,46 @@ +/* clang-format off */ +#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0) +#if !defined _IMMINTRIN_H_INCLUDED +#error "Never use directly; include instead." 
+#endif +#ifndef _AMXTILEINTRIN_H_INCLUDED +#define _AMXTILEINTRIN_H_INCLUDED +#if !defined(__AMX_TILE__) +#pragma GCC push_options +#pragma GCC target("amx-tile") +#define __DISABLE_AMX_TILE__ +#endif +#if defined(__x86_64__) && defined(__AMX_TILE__) +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_tile_loadconfig (const void *__config) +{ + __asm__ volatile ("ldtilecfg\t%X0" :: "m" (*((const void **)__config))); +} +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_tile_storeconfig (void *__config) +{ + __asm__ volatile ("sttilecfg\t%X0" : "=m" (*((void **)__config))); +} +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_tile_release (void) +{ + __asm__ volatile ("tilerelease" ::); +} +#define _tile_loadd(dst,base,stride) _tile_loadd_internal (dst, base, stride) +#define _tile_loadd_internal(dst,base,stride) __asm__ volatile ("{tileloadd\t(%0,%1,1), %%tmm"#dst"|tileloadd\t%%tmm"#dst", [%0+%1*1]}" :: "r" ((const void*) base), "r" ((long) stride)) +#define _tile_stream_loadd(dst,base,stride) _tile_stream_loadd_internal (dst, base, stride) +#define _tile_stream_loadd_internal(dst,base,stride) __asm__ volatile ("{tileloaddt1\t(%0,%1,1), %%tmm"#dst"|tileloaddt1\t%%tmm"#dst", [%0+%1*1]}" :: "r" ((const void*) base), "r" ((long) stride)) +#define _tile_stored(dst,base,stride) _tile_stored_internal (dst, base, stride) +#define _tile_stored_internal(src,base,stride) __asm__ volatile ("{tilestored\t%%tmm"#src", (%0,%1,1)|tilestored\t[%0+%1*1], %%tmm"#src"}" :: "r" ((void*) base), "r" ((long) stride) : "memory") +#define _tile_zero(dst) _tile_zero_internal (dst) +#define _tile_zero_internal(dst) __asm__ volatile ("tilezero\t%%tmm"#dst ::) +#endif +#ifdef __DISABLE_AMX_TILE__ +#undef __DISABLE_AMX_TILE__ +#pragma GCC pop_options +#endif +#endif +#endif diff --git a/third_party/intel/avx2intrin.internal.h b/third_party/intel/avx2intrin.internal.h index 6806f5ef6..a44141ac5 100644 --- a/third_party/intel/avx2intrin.internal.h +++ b/third_party/intel/avx2intrin.internal.h @@ -1,1140 +1,1452 @@ +/* clang-format off */ +#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0) #ifndef _IMMINTRIN_H_INCLUDED -#error "Never use directly; include instead." +# error "Never use directly; include instead." 
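The three AMX headers above are new imports. Note that the tile dot-product "intrinsics" are macros around inline asm — the tile numbers are pasted into the mnemonic, so they must be integer literals; _tile_dpbssd(2, 0, 1), for example, emits tdpbssd %tmm1, %tmm0, %tmm2. A sketch under stated assumptions: the 64-byte tile-configuration layout (palette byte at offset 0, per-tile colsb at 16, rows at 48) comes from the Intel SDM, not from this diff, and building it would need -mamx-tile -mamx-int8:

    #include <immintrin.h>

    static void int8_dot_tile(const void *a, const void *b, void *acc) {
      unsigned char cfg[64] = {0};
      cfg[0] = 1;                  /* palette 1 (assumed SDM layout) */
      cfg[16] = 64; cfg[48] = 16;  /* tile 0: 64 bytes/row, 16 rows */
      cfg[18] = 64; cfg[49] = 16;  /* tile 1 */
      cfg[20] = 64; cfg[50] = 16;  /* tile 2 */
      _tile_loadconfig(cfg);
      _tile_loadd(0, a, 64);       /* tmm0 <- A */
      _tile_loadd(1, b, 64);       /* tmm1 <- B */
      _tile_dpbssd(2, 0, 1);       /* tmm2 += int8 dot(tmm0, tmm1) */
      _tile_stored(2, acc, 64);
      _tile_release();
    }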
#endif - #ifndef _AVX2INTRIN_H_INCLUDED #define _AVX2INTRIN_H_INCLUDED - #ifndef __AVX2__ #pragma GCC push_options #pragma GCC target("avx2") #define __DISABLE_AVX2__ -#endif /* __AVX2__ */ - +#endif #ifdef __OPTIMIZE__ -__funline __m256i _mm256_mpsadbw_epu8(__m256i __X, __m256i __Y, const int __M) { - return (__m256i)__builtin_ia32_mpsadbw256((__v32qi)__X, (__v32qi)__Y, __M); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mpsadbw_epu8 (__m256i __X, __m256i __Y, const int __M) +{ + return (__m256i) __builtin_ia32_mpsadbw256 ((__v32qi)__X, + (__v32qi)__Y, __M); } #else -#define _mm256_mpsadbw_epu8(X, Y, M) \ - ((__m256i)__builtin_ia32_mpsadbw256((__v32qi)(__m256i)(X), \ - (__v32qi)(__m256i)(Y), (int)(M))) +#define _mm256_mpsadbw_epu8(X, Y, M) ((__m256i) __builtin_ia32_mpsadbw256 ((__v32qi)(__m256i)(X), (__v32qi)(__m256i)(Y), (int)(M))) #endif - -__funline __m256i _mm256_abs_epi8(__m256i __A) { - return (__m256i)__builtin_ia32_pabsb256((__v32qi)__A); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_abs_epi8 (__m256i __A) +{ + return (__m256i)__builtin_ia32_pabsb256 ((__v32qi)__A); } - -__funline __m256i _mm256_abs_epi16(__m256i __A) { - return (__m256i)__builtin_ia32_pabsw256((__v16hi)__A); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_abs_epi16 (__m256i __A) +{ + return (__m256i)__builtin_ia32_pabsw256 ((__v16hi)__A); } - -__funline __m256i _mm256_abs_epi32(__m256i __A) { - return (__m256i)__builtin_ia32_pabsd256((__v8si)__A); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_abs_epi32 (__m256i __A) +{ + return (__m256i)__builtin_ia32_pabsd256 ((__v8si)__A); } - -__funline __m256i _mm256_packs_epi32(__m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_packssdw256((__v8si)__A, (__v8si)__B); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_packs_epi32 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_packssdw256 ((__v8si)__A, (__v8si)__B); } - -__funline __m256i _mm256_packs_epi16(__m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_packsswb256((__v16hi)__A, (__v16hi)__B); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_packs_epi16 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_packsswb256 ((__v16hi)__A, (__v16hi)__B); } - -__funline __m256i _mm256_packus_epi32(__m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_packusdw256((__v8si)__A, (__v8si)__B); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_packus_epi32 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_packusdw256 ((__v8si)__A, (__v8si)__B); } - -__funline __m256i _mm256_packus_epi16(__m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_packuswb256((__v16hi)__A, (__v16hi)__B); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_packus_epi16 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_packuswb256 ((__v16hi)__A, (__v16hi)__B); } - -__funline __m256i _mm256_add_epi8(__m256i __A, __m256i __B) { - return (__m256i)((__v32qu)__A + (__v32qu)__B); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_add_epi8 (__m256i __A, __m256i __B) +{ + return (__m256i) ((__v32qu)__A + (__v32qu)__B); 
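From here on the avx2intrin rewrite replaces cosmo's __funline spelling with the upstream one, and the shape matters: __gnu_inline__ on an extern inline gives GNU89 semantics (no out-of-line definition is ever emitted), so the header is safe to include from any number of translation units, while __artificial__ keeps debuggers from stepping into the one-line wrappers. The recurring #ifdef __OPTIMIZE__ split exists because builtins like __builtin_ia32_mpsadbw256 demand an immediate operand, and at -O0 an inline function's parameter never folds to a constant, so the macro form passes the literal straight through. One genuine behavior fix hides in the churn: _mm256_cmpgt_epi8 below now casts through __v32qs, making the comparison explicitly signed even under -funsigned-char. The wrapper pattern in isolation (add1 is a made-up name):

    extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    add1(int x) { return x + 1; }  /* inlined everywhere, never emitted */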
} - -__funline __m256i _mm256_add_epi16(__m256i __A, __m256i __B) { - return (__m256i)((__v16hu)__A + (__v16hu)__B); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_add_epi16 (__m256i __A, __m256i __B) +{ + return (__m256i) ((__v16hu)__A + (__v16hu)__B); } - -__funline __m256i _mm256_add_epi32(__m256i __A, __m256i __B) { - return (__m256i)((__v8su)__A + (__v8su)__B); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_add_epi32 (__m256i __A, __m256i __B) +{ + return (__m256i) ((__v8su)__A + (__v8su)__B); } - -__funline __m256i _mm256_add_epi64(__m256i __A, __m256i __B) { - return (__m256i)((__v4du)__A + (__v4du)__B); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_add_epi64 (__m256i __A, __m256i __B) +{ + return (__m256i) ((__v4du)__A + (__v4du)__B); } - -__funline __m256i _mm256_adds_epi8(__m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_paddsb256((__v32qi)__A, (__v32qi)__B); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_adds_epi8 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_paddsb256 ((__v32qi)__A, (__v32qi)__B); } - -__funline __m256i _mm256_adds_epi16(__m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_paddsw256((__v16hi)__A, (__v16hi)__B); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_adds_epi16 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_paddsw256 ((__v16hi)__A, (__v16hi)__B); } - -__funline __m256i _mm256_adds_epu8(__m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_paddusb256((__v32qi)__A, (__v32qi)__B); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_adds_epu8 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_paddusb256 ((__v32qi)__A, (__v32qi)__B); } - -__funline __m256i _mm256_adds_epu16(__m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_paddusw256((__v16hi)__A, (__v16hi)__B); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_adds_epu16 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_paddusw256 ((__v16hi)__A, (__v16hi)__B); } - #ifdef __OPTIMIZE__ -__funline __m256i _mm256_alignr_epi8(__m256i __A, __m256i __B, const int __N) { - return (__m256i)__builtin_ia32_palignr256((__v4di)__A, (__v4di)__B, __N * 8); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_alignr_epi8 (__m256i __A, __m256i __B, const int __N) +{ + return (__m256i) __builtin_ia32_palignr256 ((__v4di)__A, + (__v4di)__B, + __N * 8); } #else -#define _mm256_alignr_epi8(A, B, N) \ - ((__m256i)__builtin_ia32_palignr256((__v4di)(__m256i)(A), \ - (__v4di)(__m256i)(B), (int)(N)*8)) +#define _mm256_alignr_epi8(A, B, N) ((__m256i) __builtin_ia32_palignr256 ((__v4di)(__m256i)(A), (__v4di)(__m256i)(B), (int)(N) * 8)) #endif - -__funline __m256i _mm256_and_si256(__m256i __A, __m256i __B) { - return (__m256i)((__v4du)__A & (__v4du)__B); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_and_si256 (__m256i __A, __m256i __B) +{ + return (__m256i) ((__v4du)__A & (__v4du)__B); } - -__funline __m256i _mm256_andnot_si256(__m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_andnotsi256((__v4di)__A, (__v4di)__B); +extern __inline __m256i +__attribute__ 
((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_andnot_si256 (__m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_andnotsi256 ((__v4di)__A, (__v4di)__B); } - -__funline __m256i _mm256_avg_epu8(__m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_pavgb256((__v32qi)__A, (__v32qi)__B); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_avg_epu8 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_pavgb256 ((__v32qi)__A, (__v32qi)__B); } - -__funline __m256i _mm256_avg_epu16(__m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_pavgw256((__v16hi)__A, (__v16hi)__B); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_avg_epu16 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_pavgw256 ((__v16hi)__A, (__v16hi)__B); } - -__funline __m256i _mm256_blendv_epi8(__m256i __X, __m256i __Y, __m256i __M) { - return (__m256i)__builtin_ia32_pblendvb256((__v32qi)__X, (__v32qi)__Y, - (__v32qi)__M); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_blendv_epi8 (__m256i __X, __m256i __Y, __m256i __M) +{ + return (__m256i) __builtin_ia32_pblendvb256 ((__v32qi)__X, + (__v32qi)__Y, + (__v32qi)__M); } - #ifdef __OPTIMIZE__ -__funline __m256i _mm256_blend_epi16(__m256i __X, __m256i __Y, const int __M) { - return (__m256i)__builtin_ia32_pblendw256((__v16hi)__X, (__v16hi)__Y, __M); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_blend_epi16 (__m256i __X, __m256i __Y, const int __M) +{ + return (__m256i) __builtin_ia32_pblendw256 ((__v16hi)__X, + (__v16hi)__Y, + __M); } #else -#define _mm256_blend_epi16(X, Y, M) \ - ((__m256i)__builtin_ia32_pblendw256((__v16hi)(__m256i)(X), \ - (__v16hi)(__m256i)(Y), (int)(M))) +#define _mm256_blend_epi16(X, Y, M) ((__m256i) __builtin_ia32_pblendw256 ((__v16hi)(__m256i)(X), (__v16hi)(__m256i)(Y), (int)(M))) #endif - -__funline __m256i _mm256_cmpeq_epi8(__m256i __A, __m256i __B) { - return (__m256i)((__v32qi)__A == (__v32qi)__B); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpeq_epi8 (__m256i __A, __m256i __B) +{ + return (__m256i) ((__v32qi)__A == (__v32qi)__B); } - -__funline __m256i _mm256_cmpeq_epi16(__m256i __A, __m256i __B) { - return (__m256i)((__v16hi)__A == (__v16hi)__B); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpeq_epi16 (__m256i __A, __m256i __B) +{ + return (__m256i) ((__v16hi)__A == (__v16hi)__B); } - -__funline __m256i _mm256_cmpeq_epi32(__m256i __A, __m256i __B) { - return (__m256i)((__v8si)__A == (__v8si)__B); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpeq_epi32 (__m256i __A, __m256i __B) +{ + return (__m256i) ((__v8si)__A == (__v8si)__B); } - -__funline __m256i _mm256_cmpeq_epi64(__m256i __A, __m256i __B) { - return (__m256i)((__v4di)__A == (__v4di)__B); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpeq_epi64 (__m256i __A, __m256i __B) +{ + return (__m256i) ((__v4di)__A == (__v4di)__B); } - -__funline __m256i _mm256_cmpgt_epi8(__m256i __A, __m256i __B) { - return (__m256i)((__v32qi)__A > (__v32qi)__B); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpgt_epi8 (__m256i __A, __m256i __B) +{ + return (__m256i) 
((__v32qs)__A > (__v32qs)__B); } - -__funline __m256i _mm256_cmpgt_epi16(__m256i __A, __m256i __B) { - return (__m256i)((__v16hi)__A > (__v16hi)__B); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpgt_epi16 (__m256i __A, __m256i __B) +{ + return (__m256i) ((__v16hi)__A > (__v16hi)__B); } - -__funline __m256i _mm256_cmpgt_epi32(__m256i __A, __m256i __B) { - return (__m256i)((__v8si)__A > (__v8si)__B); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpgt_epi32 (__m256i __A, __m256i __B) +{ + return (__m256i) ((__v8si)__A > (__v8si)__B); } - -__funline __m256i _mm256_cmpgt_epi64(__m256i __A, __m256i __B) { - return (__m256i)((__v4di)__A > (__v4di)__B); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpgt_epi64 (__m256i __A, __m256i __B) +{ + return (__m256i) ((__v4di)__A > (__v4di)__B); } - -__funline __m256i _mm256_hadd_epi16(__m256i __X, __m256i __Y) { - return (__m256i)__builtin_ia32_phaddw256((__v16hi)__X, (__v16hi)__Y); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_hadd_epi16 (__m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_phaddw256 ((__v16hi)__X, + (__v16hi)__Y); } - -__funline __m256i _mm256_hadd_epi32(__m256i __X, __m256i __Y) { - return (__m256i)__builtin_ia32_phaddd256((__v8si)__X, (__v8si)__Y); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_hadd_epi32 (__m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_phaddd256 ((__v8si)__X, (__v8si)__Y); } - -__funline __m256i _mm256_hadds_epi16(__m256i __X, __m256i __Y) { - return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__X, (__v16hi)__Y); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_hadds_epi16 (__m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_phaddsw256 ((__v16hi)__X, + (__v16hi)__Y); } - -__funline __m256i _mm256_hsub_epi16(__m256i __X, __m256i __Y) { - return (__m256i)__builtin_ia32_phsubw256((__v16hi)__X, (__v16hi)__Y); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_hsub_epi16 (__m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_phsubw256 ((__v16hi)__X, + (__v16hi)__Y); } - -__funline __m256i _mm256_hsub_epi32(__m256i __X, __m256i __Y) { - return (__m256i)__builtin_ia32_phsubd256((__v8si)__X, (__v8si)__Y); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_hsub_epi32 (__m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_phsubd256 ((__v8si)__X, (__v8si)__Y); } - -__funline __m256i _mm256_hsubs_epi16(__m256i __X, __m256i __Y) { - return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__X, (__v16hi)__Y); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_hsubs_epi16 (__m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_phsubsw256 ((__v16hi)__X, + (__v16hi)__Y); } - -__funline __m256i _mm256_maddubs_epi16(__m256i __X, __m256i __Y) { - return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__X, (__v32qi)__Y); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maddubs_epi16 (__m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_pmaddubsw256 ((__v32qi)__X, + (__v32qi)__Y); } - -__funline __m256i _mm256_madd_epi16(__m256i __A, __m256i 
__B) { - return (__m256i)__builtin_ia32_pmaddwd256((__v16hi)__A, (__v16hi)__B); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_madd_epi16 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_pmaddwd256 ((__v16hi)__A, + (__v16hi)__B); } - -__funline __m256i _mm256_max_epi8(__m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_pmaxsb256((__v32qi)__A, (__v32qi)__B); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_max_epi8 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_pmaxsb256 ((__v32qi)__A, (__v32qi)__B); } - -__funline __m256i _mm256_max_epi16(__m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_pmaxsw256((__v16hi)__A, (__v16hi)__B); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_max_epi16 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_pmaxsw256 ((__v16hi)__A, (__v16hi)__B); } - -__funline __m256i _mm256_max_epi32(__m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_pmaxsd256((__v8si)__A, (__v8si)__B); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_max_epi32 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_pmaxsd256 ((__v8si)__A, (__v8si)__B); } - -__funline __m256i _mm256_max_epu8(__m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_pmaxub256((__v32qi)__A, (__v32qi)__B); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_max_epu8 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_pmaxub256 ((__v32qi)__A, (__v32qi)__B); } - -__funline __m256i _mm256_max_epu16(__m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_pmaxuw256((__v16hi)__A, (__v16hi)__B); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_max_epu16 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_pmaxuw256 ((__v16hi)__A, (__v16hi)__B); } - -__funline __m256i _mm256_max_epu32(__m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_pmaxud256((__v8si)__A, (__v8si)__B); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_max_epu32 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_pmaxud256 ((__v8si)__A, (__v8si)__B); } - -__funline __m256i _mm256_min_epi8(__m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_pminsb256((__v32qi)__A, (__v32qi)__B); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_min_epi8 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_pminsb256 ((__v32qi)__A, (__v32qi)__B); } - -__funline __m256i _mm256_min_epi16(__m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_pminsw256((__v16hi)__A, (__v16hi)__B); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_min_epi16 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_pminsw256 ((__v16hi)__A, (__v16hi)__B); } - -__funline __m256i _mm256_min_epi32(__m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_pminsd256((__v8si)__A, (__v8si)__B); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_min_epi32 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_pminsd256 ((__v8si)__A, (__v8si)__B); } - -__funline __m256i _mm256_min_epu8(__m256i __A, __m256i __B) { - return 
(__m256i)__builtin_ia32_pminub256((__v32qi)__A, (__v32qi)__B); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_min_epu8 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_pminub256 ((__v32qi)__A, (__v32qi)__B); } - -__funline __m256i _mm256_min_epu16(__m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_pminuw256((__v16hi)__A, (__v16hi)__B); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_min_epu16 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_pminuw256 ((__v16hi)__A, (__v16hi)__B); } - -__funline __m256i _mm256_min_epu32(__m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_pminud256((__v8si)__A, (__v8si)__B); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_min_epu32 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_pminud256 ((__v8si)__A, (__v8si)__B); } - -__funline int _mm256_movemask_epi8(__m256i __A) { - return __builtin_ia32_pmovmskb256((__v32qi)__A); +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_movemask_epi8 (__m256i __A) +{ + return __builtin_ia32_pmovmskb256 ((__v32qi)__A); } - -__funline __m256i _mm256_cvtepi8_epi16(__m128i __X) { - return (__m256i)__builtin_ia32_pmovsxbw256((__v16qi)__X); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepi8_epi16 (__m128i __X) +{ + return (__m256i) __builtin_ia32_pmovsxbw256 ((__v16qi)__X); } - -__funline __m256i _mm256_cvtepi8_epi32(__m128i __X) { - return (__m256i)__builtin_ia32_pmovsxbd256((__v16qi)__X); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepi8_epi32 (__m128i __X) +{ + return (__m256i) __builtin_ia32_pmovsxbd256 ((__v16qi)__X); } - -__funline __m256i _mm256_cvtepi8_epi64(__m128i __X) { - return (__m256i)__builtin_ia32_pmovsxbq256((__v16qi)__X); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepi8_epi64 (__m128i __X) +{ + return (__m256i) __builtin_ia32_pmovsxbq256 ((__v16qi)__X); } - -__funline __m256i _mm256_cvtepi16_epi32(__m128i __X) { - return (__m256i)__builtin_ia32_pmovsxwd256((__v8hi)__X); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepi16_epi32 (__m128i __X) +{ + return (__m256i) __builtin_ia32_pmovsxwd256 ((__v8hi)__X); } - -__funline __m256i _mm256_cvtepi16_epi64(__m128i __X) { - return (__m256i)__builtin_ia32_pmovsxwq256((__v8hi)__X); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepi16_epi64 (__m128i __X) +{ + return (__m256i) __builtin_ia32_pmovsxwq256 ((__v8hi)__X); } - -__funline __m256i _mm256_cvtepi32_epi64(__m128i __X) { - return (__m256i)__builtin_ia32_pmovsxdq256((__v4si)__X); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepi32_epi64 (__m128i __X) +{ + return (__m256i) __builtin_ia32_pmovsxdq256 ((__v4si)__X); } - -__funline __m256i _mm256_cvtepu8_epi16(__m128i __X) { - return (__m256i)__builtin_ia32_pmovzxbw256((__v16qi)__X); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepu8_epi16 (__m128i __X) +{ + return (__m256i) __builtin_ia32_pmovzxbw256 ((__v16qi)__X); } - -__funline __m256i _mm256_cvtepu8_epi32(__m128i __X) { - return 
(__m256i)__builtin_ia32_pmovzxbd256((__v16qi)__X); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepu8_epi32 (__m128i __X) +{ + return (__m256i) __builtin_ia32_pmovzxbd256 ((__v16qi)__X); } - -__funline __m256i _mm256_cvtepu8_epi64(__m128i __X) { - return (__m256i)__builtin_ia32_pmovzxbq256((__v16qi)__X); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepu8_epi64 (__m128i __X) +{ + return (__m256i) __builtin_ia32_pmovzxbq256 ((__v16qi)__X); } - -__funline __m256i _mm256_cvtepu16_epi32(__m128i __X) { - return (__m256i)__builtin_ia32_pmovzxwd256((__v8hi)__X); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepu16_epi32 (__m128i __X) +{ + return (__m256i) __builtin_ia32_pmovzxwd256 ((__v8hi)__X); } - -__funline __m256i _mm256_cvtepu16_epi64(__m128i __X) { - return (__m256i)__builtin_ia32_pmovzxwq256((__v8hi)__X); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepu16_epi64 (__m128i __X) +{ + return (__m256i) __builtin_ia32_pmovzxwq256 ((__v8hi)__X); } - -__funline __m256i _mm256_cvtepu32_epi64(__m128i __X) { - return (__m256i)__builtin_ia32_pmovzxdq256((__v4si)__X); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepu32_epi64 (__m128i __X) +{ + return (__m256i) __builtin_ia32_pmovzxdq256 ((__v4si)__X); } - -__funline __m256i _mm256_mul_epi32(__m256i __X, __m256i __Y) { - return (__m256i)__builtin_ia32_pmuldq256((__v8si)__X, (__v8si)__Y); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mul_epi32 (__m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_pmuldq256 ((__v8si)__X, (__v8si)__Y); } - -__funline __m256i _mm256_mulhrs_epi16(__m256i __X, __m256i __Y) { - return (__m256i)__builtin_ia32_pmulhrsw256((__v16hi)__X, (__v16hi)__Y); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mulhrs_epi16 (__m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_pmulhrsw256 ((__v16hi)__X, + (__v16hi)__Y); } - -__funline __m256i _mm256_mulhi_epu16(__m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_pmulhuw256((__v16hi)__A, (__v16hi)__B); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mulhi_epu16 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_pmulhuw256 ((__v16hi)__A, (__v16hi)__B); } - -__funline __m256i _mm256_mulhi_epi16(__m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_pmulhw256((__v16hi)__A, (__v16hi)__B); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mulhi_epi16 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_pmulhw256 ((__v16hi)__A, (__v16hi)__B); } - -__funline __m256i _mm256_mullo_epi16(__m256i __A, __m256i __B) { - return (__m256i)((__v16hu)__A * (__v16hu)__B); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mullo_epi16 (__m256i __A, __m256i __B) +{ + return (__m256i) ((__v16hu)__A * (__v16hu)__B); } - -__funline __m256i _mm256_mullo_epi32(__m256i __A, __m256i __B) { - return (__m256i)((__v8su)__A * (__v8su)__B); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mullo_epi32 (__m256i __A, __m256i __B) +{ + return (__m256i) ((__v8su)__A * 
(__v8su)__B); } - -__funline __m256i _mm256_mul_epu32(__m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_pmuludq256((__v8si)__A, (__v8si)__B); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mul_epu32 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_pmuludq256 ((__v8si)__A, (__v8si)__B); } - -__funline __m256i _mm256_or_si256(__m256i __A, __m256i __B) { - return (__m256i)((__v4du)__A | (__v4du)__B); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_or_si256 (__m256i __A, __m256i __B) +{ + return (__m256i) ((__v4du)__A | (__v4du)__B); } - -__funline __m256i _mm256_sad_epu8(__m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_psadbw256((__v32qi)__A, (__v32qi)__B); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_sad_epu8 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_psadbw256 ((__v32qi)__A, (__v32qi)__B); } - -__funline __m256i _mm256_shuffle_epi8(__m256i __X, __m256i __Y) { - return (__m256i)__builtin_ia32_pshufb256((__v32qi)__X, (__v32qi)__Y); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_shuffle_epi8 (__m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_pshufb256 ((__v32qi)__X, + (__v32qi)__Y); } - #ifdef __OPTIMIZE__ -__funline __m256i _mm256_shuffle_epi32(__m256i __A, const int __mask) { - return (__m256i)__builtin_ia32_pshufd256((__v8si)__A, __mask); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_shuffle_epi32 (__m256i __A, const int __mask) +{ + return (__m256i)__builtin_ia32_pshufd256 ((__v8si)__A, __mask); } - -__funline __m256i _mm256_shufflehi_epi16(__m256i __A, const int __mask) { - return (__m256i)__builtin_ia32_pshufhw256((__v16hi)__A, __mask); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_shufflehi_epi16 (__m256i __A, const int __mask) +{ + return (__m256i)__builtin_ia32_pshufhw256 ((__v16hi)__A, __mask); } - -__funline __m256i _mm256_shufflelo_epi16(__m256i __A, const int __mask) { - return (__m256i)__builtin_ia32_pshuflw256((__v16hi)__A, __mask); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_shufflelo_epi16 (__m256i __A, const int __mask) +{ + return (__m256i)__builtin_ia32_pshuflw256 ((__v16hi)__A, __mask); } #else -#define _mm256_shuffle_epi32(A, N) \ - ((__m256i)__builtin_ia32_pshufd256((__v8si)(__m256i)(A), (int)(N))) -#define _mm256_shufflehi_epi16(A, N) \ - ((__m256i)__builtin_ia32_pshufhw256((__v16hi)(__m256i)(A), (int)(N))) -#define _mm256_shufflelo_epi16(A, N) \ - ((__m256i)__builtin_ia32_pshuflw256((__v16hi)(__m256i)(A), (int)(N))) +#define _mm256_shuffle_epi32(A, N) ((__m256i)__builtin_ia32_pshufd256 ((__v8si)(__m256i)(A), (int)(N))) +#define _mm256_shufflehi_epi16(A, N) ((__m256i)__builtin_ia32_pshufhw256 ((__v16hi)(__m256i)(A), (int)(N))) +#define _mm256_shufflelo_epi16(A, N) ((__m256i)__builtin_ia32_pshuflw256 ((__v16hi)(__m256i)(A), (int)(N))) #endif - -__funline __m256i _mm256_sign_epi8(__m256i __X, __m256i __Y) { - return (__m256i)__builtin_ia32_psignb256((__v32qi)__X, (__v32qi)__Y); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_sign_epi8 (__m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_psignb256 ((__v32qi)__X, (__v32qi)__Y); } - -__funline __m256i 
_mm256_sign_epi16(__m256i __X, __m256i __Y) { - return (__m256i)__builtin_ia32_psignw256((__v16hi)__X, (__v16hi)__Y); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_sign_epi16 (__m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_psignw256 ((__v16hi)__X, (__v16hi)__Y); } - -__funline __m256i _mm256_sign_epi32(__m256i __X, __m256i __Y) { - return (__m256i)__builtin_ia32_psignd256((__v8si)__X, (__v8si)__Y); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_sign_epi32 (__m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_psignd256 ((__v8si)__X, (__v8si)__Y); } - #ifdef __OPTIMIZE__ -__funline __m256i _mm256_bslli_epi128(__m256i __A, const int __N) { - return (__m256i)__builtin_ia32_pslldqi256(__A, __N * 8); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_bslli_epi128 (__m256i __A, const int __N) +{ + return (__m256i)__builtin_ia32_pslldqi256 (__A, __N * 8); } - -__funline __m256i _mm256_slli_si256(__m256i __A, const int __N) { - return (__m256i)__builtin_ia32_pslldqi256(__A, __N * 8); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_slli_si256 (__m256i __A, const int __N) +{ + return (__m256i)__builtin_ia32_pslldqi256 (__A, __N * 8); } #else -#define _mm256_bslli_epi128(A, N) \ - ((__m256i)__builtin_ia32_pslldqi256((__m256i)(A), (int)(N)*8)) -#define _mm256_slli_si256(A, N) \ - ((__m256i)__builtin_ia32_pslldqi256((__m256i)(A), (int)(N)*8)) +#define _mm256_bslli_epi128(A, N) ((__m256i)__builtin_ia32_pslldqi256 ((__m256i)(A), (int)(N) * 8)) +#define _mm256_slli_si256(A, N) ((__m256i)__builtin_ia32_pslldqi256 ((__m256i)(A), (int)(N) * 8)) #endif - -__funline __m256i _mm256_slli_epi16(__m256i __A, int __B) { - return (__m256i)__builtin_ia32_psllwi256((__v16hi)__A, __B); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_slli_epi16 (__m256i __A, int __B) +{ + return (__m256i)__builtin_ia32_psllwi256 ((__v16hi)__A, __B); } - -__funline __m256i _mm256_sll_epi16(__m256i __A, __m128i __B) { +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_sll_epi16 (__m256i __A, __m128i __B) +{ return (__m256i)__builtin_ia32_psllw256((__v16hi)__A, (__v8hi)__B); } - -__funline __m256i _mm256_slli_epi32(__m256i __A, int __B) { - return (__m256i)__builtin_ia32_pslldi256((__v8si)__A, __B); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_slli_epi32 (__m256i __A, int __B) +{ + return (__m256i)__builtin_ia32_pslldi256 ((__v8si)__A, __B); } - -__funline __m256i _mm256_sll_epi32(__m256i __A, __m128i __B) { +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_sll_epi32 (__m256i __A, __m128i __B) +{ return (__m256i)__builtin_ia32_pslld256((__v8si)__A, (__v4si)__B); } - -__funline __m256i _mm256_slli_epi64(__m256i __A, int __B) { - return (__m256i)__builtin_ia32_psllqi256((__v4di)__A, __B); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_slli_epi64 (__m256i __A, int __B) +{ + return (__m256i)__builtin_ia32_psllqi256 ((__v4di)__A, __B); } - -__funline __m256i _mm256_sll_epi64(__m256i __A, __m128i __B) { +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_sll_epi64 (__m256i __A, __m128i __B) +{ 
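The byte-shift pair here shows the immediate-operand rule most clearly: __builtin_ia32_pslldqi256 takes its count in bits, so both the inline and macro forms scale __N by 8, and __N itself must be a compile-time constant in either build mode. For instance:

    #include <immintrin.h>

    /* Shift each 128-bit lane left by 4 bytes; the 4 must be a
       literal (or constant-foldable) for the builtin to accept it. */
    __m256i shift_left_4_bytes(__m256i v) {
      return _mm256_bslli_epi128(v, 4);
    }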
return (__m256i)__builtin_ia32_psllq256((__v4di)__A, (__v2di)__B); } - -__funline __m256i _mm256_srai_epi16(__m256i __A, int __B) { - return (__m256i)__builtin_ia32_psrawi256((__v16hi)__A, __B); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_srai_epi16 (__m256i __A, int __B) +{ + return (__m256i)__builtin_ia32_psrawi256 ((__v16hi)__A, __B); } - -__funline __m256i _mm256_sra_epi16(__m256i __A, __m128i __B) { - return (__m256i)__builtin_ia32_psraw256((__v16hi)__A, (__v8hi)__B); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_sra_epi16 (__m256i __A, __m128i __B) +{ + return (__m256i)__builtin_ia32_psraw256 ((__v16hi)__A, (__v8hi)__B); } - -__funline __m256i _mm256_srai_epi32(__m256i __A, int __B) { - return (__m256i)__builtin_ia32_psradi256((__v8si)__A, __B); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_srai_epi32 (__m256i __A, int __B) +{ + return (__m256i)__builtin_ia32_psradi256 ((__v8si)__A, __B); } - -__funline __m256i _mm256_sra_epi32(__m256i __A, __m128i __B) { - return (__m256i)__builtin_ia32_psrad256((__v8si)__A, (__v4si)__B); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_sra_epi32 (__m256i __A, __m128i __B) +{ + return (__m256i)__builtin_ia32_psrad256 ((__v8si)__A, (__v4si)__B); } - #ifdef __OPTIMIZE__ -__funline __m256i _mm256_bsrli_epi128(__m256i __A, const int __N) { - return (__m256i)__builtin_ia32_psrldqi256(__A, __N * 8); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_bsrli_epi128 (__m256i __A, const int __N) +{ + return (__m256i)__builtin_ia32_psrldqi256 (__A, __N * 8); } - -__funline __m256i _mm256_srli_si256(__m256i __A, const int __N) { - return (__m256i)__builtin_ia32_psrldqi256(__A, __N * 8); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_srli_si256 (__m256i __A, const int __N) +{ + return (__m256i)__builtin_ia32_psrldqi256 (__A, __N * 8); } #else -#define _mm256_bsrli_epi128(A, N) \ - ((__m256i)__builtin_ia32_psrldqi256((__m256i)(A), (int)(N)*8)) -#define _mm256_srli_si256(A, N) \ - ((__m256i)__builtin_ia32_psrldqi256((__m256i)(A), (int)(N)*8)) +#define _mm256_bsrli_epi128(A, N) ((__m256i)__builtin_ia32_psrldqi256 ((__m256i)(A), (int)(N) * 8)) +#define _mm256_srli_si256(A, N) ((__m256i)__builtin_ia32_psrldqi256 ((__m256i)(A), (int)(N) * 8)) #endif - -__funline __m256i _mm256_srli_epi16(__m256i __A, int __B) { - return (__m256i)__builtin_ia32_psrlwi256((__v16hi)__A, __B); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_srli_epi16 (__m256i __A, int __B) +{ + return (__m256i)__builtin_ia32_psrlwi256 ((__v16hi)__A, __B); } - -__funline __m256i _mm256_srl_epi16(__m256i __A, __m128i __B) { +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_srl_epi16 (__m256i __A, __m128i __B) +{ return (__m256i)__builtin_ia32_psrlw256((__v16hi)__A, (__v8hi)__B); } - -__funline __m256i _mm256_srli_epi32(__m256i __A, int __B) { - return (__m256i)__builtin_ia32_psrldi256((__v8si)__A, __B); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_srli_epi32 (__m256i __A, int __B) +{ + return (__m256i)__builtin_ia32_psrldi256 ((__v8si)__A, __B); } - -__funline __m256i _mm256_srl_epi32(__m256i __A, __m128i __B) { +extern 
__inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srl_epi32 (__m256i __A, __m128i __B)
+{
   return (__m256i)__builtin_ia32_psrld256((__v8si)__A, (__v4si)__B);
 }
-
-__funline __m256i _mm256_srli_epi64(__m256i __A, int __B) {
-  return (__m256i)__builtin_ia32_psrlqi256((__v4di)__A, __B);
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srli_epi64 (__m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_psrlqi256 ((__v4di)__A, __B);
 }
-
-__funline __m256i _mm256_srl_epi64(__m256i __A, __m128i __B) {
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srl_epi64 (__m256i __A, __m128i __B)
+{
   return (__m256i)__builtin_ia32_psrlq256((__v4di)__A, (__v2di)__B);
 }
-
-__funline __m256i _mm256_sub_epi8(__m256i __A, __m256i __B) {
-  return (__m256i)((__v32qu)__A - (__v32qu)__B);
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sub_epi8 (__m256i __A, __m256i __B)
+{
+  return (__m256i) ((__v32qu)__A - (__v32qu)__B);
 }
-
-__funline __m256i _mm256_sub_epi16(__m256i __A, __m256i __B) {
-  return (__m256i)((__v16hu)__A - (__v16hu)__B);
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sub_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i) ((__v16hu)__A - (__v16hu)__B);
 }
-
-__funline __m256i _mm256_sub_epi32(__m256i __A, __m256i __B) {
-  return (__m256i)((__v8su)__A - (__v8su)__B);
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sub_epi32 (__m256i __A, __m256i __B)
+{
+  return (__m256i) ((__v8su)__A - (__v8su)__B);
 }
-
-__funline __m256i _mm256_sub_epi64(__m256i __A, __m256i __B) {
-  return (__m256i)((__v4du)__A - (__v4du)__B);
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sub_epi64 (__m256i __A, __m256i __B)
+{
+  return (__m256i) ((__v4du)__A - (__v4du)__B);
 }
-
-__funline __m256i _mm256_subs_epi8(__m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_psubsb256((__v32qi)__A, (__v32qi)__B);
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_subs_epi8 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_psubsb256 ((__v32qi)__A, (__v32qi)__B);
 }
-
-__funline __m256i _mm256_subs_epi16(__m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_psubsw256((__v16hi)__A, (__v16hi)__B);
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_subs_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_psubsw256 ((__v16hi)__A, (__v16hi)__B);
 }
-
-__funline __m256i _mm256_subs_epu8(__m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_psubusb256((__v32qi)__A, (__v32qi)__B);
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_subs_epu8 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_psubusb256 ((__v32qi)__A, (__v32qi)__B);
 }
-
-__funline __m256i _mm256_subs_epu16(__m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_psubusw256((__v16hi)__A, (__v16hi)__B);
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_subs_epu16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_psubusw256 ((__v16hi)__A, (__v16hi)__B);
 }
-
-__funline __m256i _mm256_unpackhi_epi8(__m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_punpckhbw256((__v32qi)__A, (__v32qi)__B);
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_unpackhi_epi8 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_punpckhbw256 ((__v32qi)__A, (__v32qi)__B);
 }
-
-__funline __m256i _mm256_unpackhi_epi16(__m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_punpckhwd256((__v16hi)__A, (__v16hi)__B);
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_unpackhi_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_punpckhwd256 ((__v16hi)__A, (__v16hi)__B);
 }
-
-__funline __m256i _mm256_unpackhi_epi32(__m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_punpckhdq256((__v8si)__A, (__v8si)__B);
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_unpackhi_epi32 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_punpckhdq256 ((__v8si)__A, (__v8si)__B);
 }
-
-__funline __m256i _mm256_unpackhi_epi64(__m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_punpckhqdq256((__v4di)__A, (__v4di)__B);
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_unpackhi_epi64 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_punpckhqdq256 ((__v4di)__A, (__v4di)__B);
 }
-
-__funline __m256i _mm256_unpacklo_epi8(__m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_punpcklbw256((__v32qi)__A, (__v32qi)__B);
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_unpacklo_epi8 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_punpcklbw256 ((__v32qi)__A, (__v32qi)__B);
 }
-
-__funline __m256i _mm256_unpacklo_epi16(__m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_punpcklwd256((__v16hi)__A, (__v16hi)__B);
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_unpacklo_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_punpcklwd256 ((__v16hi)__A, (__v16hi)__B);
 }
-
-__funline __m256i _mm256_unpacklo_epi32(__m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_punpckldq256((__v8si)__A, (__v8si)__B);
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_unpacklo_epi32 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_punpckldq256 ((__v8si)__A, (__v8si)__B);
 }
-
-__funline __m256i _mm256_unpacklo_epi64(__m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_punpcklqdq256((__v4di)__A, (__v4di)__B);
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_unpacklo_epi64 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_punpcklqdq256 ((__v4di)__A, (__v4di)__B);
 }
-
-__funline __m256i _mm256_xor_si256(__m256i __A, __m256i __B) {
-  return (__m256i)((__v4du)__A ^ (__v4du)__B);
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_xor_si256 (__m256i __A, __m256i __B)
+{
+  return (__m256i) ((__v4du)__A ^ (__v4du)__B);
 }
-
-__funline __m256i _mm256_stream_load_si256(__m256i const *__X) {
-  return (__m256i)__builtin_ia32_movntdqa256((__v4di *)__X);
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_stream_load_si256 (__m256i const *__X)
+{
+  return (__m256i) __builtin_ia32_movntdqa256 ((__v4di *) __X);
 }
-
-__funline __m128 _mm_broadcastss_ps(__m128 __X) {
-  return (__m128)__builtin_ia32_vbroadcastss_ps((__v4sf)__X);
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_broadcastss_ps (__m128 __X)
+{
+  return (__m128) __builtin_ia32_vbroadcastss_ps ((__v4sf)__X);
 }
-
-__funline __m256 _mm256_broadcastss_ps(__m128 __X) {
-  return (__m256)__builtin_ia32_vbroadcastss_ps256((__v4sf)__X);
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcastss_ps (__m128 __X)
+{
+  return (__m256) __builtin_ia32_vbroadcastss_ps256 ((__v4sf)__X);
 }
-
-__funline __m256d _mm256_broadcastsd_pd(__m128d __X) {
-  return (__m256d)__builtin_ia32_vbroadcastsd_pd256((__v2df)__X);
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcastsd_pd (__m128d __X)
+{
+  return (__m256d) __builtin_ia32_vbroadcastsd_pd256 ((__v2df)__X);
 }
-
-__funline __m256i _mm256_broadcastsi128_si256(__m128i __X) {
-  return (__m256i)__builtin_ia32_vbroadcastsi256((__v2di)__X);
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcastsi128_si256 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_vbroadcastsi256 ((__v2di)__X);
 }
-
+#define _mm_broadcastsi128_si256(X) _mm256_broadcastsi128_si256(X)
+#define _mm_broadcastsd_pd(X) _mm_movedup_pd(X)
 #ifdef __OPTIMIZE__
-__funline __m128i _mm_blend_epi32(__m128i __X, __m128i __Y, const int __M) {
-  return (__m128i)__builtin_ia32_pblendd128((__v4si)__X, (__v4si)__Y, __M);
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_blend_epi32 (__m128i __X, __m128i __Y, const int __M)
+{
+  return (__m128i) __builtin_ia32_pblendd128 ((__v4si)__X,
+                                              (__v4si)__Y,
+                                              __M);
 }
 #else
-#define _mm_blend_epi32(X, Y, M)                            \
-  ((__m128i)__builtin_ia32_pblendd128((__v4si)(__m128i)(X), \
-                                      (__v4si)(__m128i)(Y), (int)(M)))
+#define _mm_blend_epi32(X, Y, M) ((__m128i) __builtin_ia32_pblendd128 ((__v4si)(__m128i)(X), (__v4si)(__m128i)(Y), (int)(M)))
 #endif
-
 #ifdef __OPTIMIZE__
-__funline __m256i _mm256_blend_epi32(__m256i __X, __m256i __Y, const int __M) {
-  return (__m256i)__builtin_ia32_pblendd256((__v8si)__X, (__v8si)__Y, __M);
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_blend_epi32 (__m256i __X, __m256i __Y, const int __M)
+{
+  return (__m256i) __builtin_ia32_pblendd256 ((__v8si)__X,
+                                              (__v8si)__Y,
+                                              __M);
 }
 #else
-#define _mm256_blend_epi32(X, Y, M)                         \
-  ((__m256i)__builtin_ia32_pblendd256((__v8si)(__m256i)(X), \
-                                      (__v8si)(__m256i)(Y), (int)(M)))
+#define _mm256_blend_epi32(X, Y, M) ((__m256i) __builtin_ia32_pblendd256 ((__v8si)(__m256i)(X), (__v8si)(__m256i)(Y), (int)(M)))
 #endif
-
-__funline __m256i _mm256_broadcastb_epi8(__m128i __X) {
-  return (__m256i)__builtin_ia32_pbroadcastb256((__v16qi)__X);
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcastb_epi8 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pbroadcastb256 ((__v16qi)__X);
 }
-
-__funline __m256i _mm256_broadcastw_epi16(__m128i __X) {
-  return (__m256i)__builtin_ia32_pbroadcastw256((__v8hi)__X);
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcastw_epi16 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pbroadcastw256 ((__v8hi)__X);
 }
-
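The broadcast intrinsics above splat the low element of a 128-bit source across every lane of a wider destination. A minimal usage sketch, illustrative only and not part of the patch (assumes AVX2 hardware and immintrin.h):

#include <immintrin.h>

/* Splat x[0] into all eight float lanes of a 256-bit vector. */
__m256 splat_low(__m128 x) {
  return _mm256_broadcastss_ps(x);
}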
-__funline __m256i _mm256_broadcastd_epi32(__m128i __X) {
-  return (__m256i)__builtin_ia32_pbroadcastd256((__v4si)__X);
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcastd_epi32 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pbroadcastd256 ((__v4si)__X);
 }
-
-__funline __m256i _mm256_broadcastq_epi64(__m128i __X) {
-  return (__m256i)__builtin_ia32_pbroadcastq256((__v2di)__X);
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcastq_epi64 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pbroadcastq256 ((__v2di)__X);
 }
-
-__funline __m128i _mm_broadcastb_epi8(__m128i __X) {
-  return (__m128i)__builtin_ia32_pbroadcastb128((__v16qi)__X);
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_broadcastb_epi8 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pbroadcastb128 ((__v16qi)__X);
 }
-
-__funline __m128i _mm_broadcastw_epi16(__m128i __X) {
-  return (__m128i)__builtin_ia32_pbroadcastw128((__v8hi)__X);
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_broadcastw_epi16 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pbroadcastw128 ((__v8hi)__X);
 }
-
-__funline __m128i _mm_broadcastd_epi32(__m128i __X) {
-  return (__m128i)__builtin_ia32_pbroadcastd128((__v4si)__X);
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_broadcastd_epi32 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pbroadcastd128 ((__v4si)__X);
 }
-
-__funline __m128i _mm_broadcastq_epi64(__m128i __X) {
-  return (__m128i)__builtin_ia32_pbroadcastq128((__v2di)__X);
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_broadcastq_epi64 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pbroadcastq128 ((__v2di)__X);
 }
-
-__funline __m256i _mm256_permutevar8x32_epi32(__m256i __X, __m256i __Y) {
-  return (__m256i)__builtin_ia32_permvarsi256((__v8si)__X, (__v8si)__Y);
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permutevar8x32_epi32 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_permvarsi256 ((__v8si)__X, (__v8si)__Y);
 }
-
 #ifdef __OPTIMIZE__
-__funline __m256d _mm256_permute4x64_pd(__m256d __X, const int __M) {
-  return (__m256d)__builtin_ia32_permdf256((__v4df)__X, __M);
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permute4x64_pd (__m256d __X, const int __M)
+{
+  return (__m256d) __builtin_ia32_permdf256 ((__v4df)__X, __M);
 }
 #else
-#define _mm256_permute4x64_pd(X, M) \
-  ((__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(X), (int)(M)))
+#define _mm256_permute4x64_pd(X, M) ((__m256d) __builtin_ia32_permdf256 ((__v4df)(__m256d)(X), (int)(M)))
 #endif
-
-__funline __m256 _mm256_permutevar8x32_ps(__m256 __X, __m256i __Y) {
-  return (__m256)__builtin_ia32_permvarsf256((__v8sf)__X, (__v8si)__Y);
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permutevar8x32_ps (__m256 __X, __m256i __Y)
+{
+  return (__m256) __builtin_ia32_permvarsf256 ((__v8sf)__X, (__v8si)__Y);
 }
-
 #ifdef __OPTIMIZE__
-__funline __m256i _mm256_permute4x64_epi64(__m256i __X, const int __M) {
-  return (__m256i)__builtin_ia32_permdi256((__v4di)__X, __M);
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permute4x64_epi64 (__m256i __X, const int __M)
+{
+  return (__m256i) __builtin_ia32_permdi256 ((__v4di)__X, __M);
 }
 #else
-#define _mm256_permute4x64_epi64(X, M) \
-  ((__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(X), (int)(M)))
+#define _mm256_permute4x64_epi64(X, M) ((__m256i) __builtin_ia32_permdi256 ((__v4di)(__m256i)(X), (int)(M)))
 #endif
-
 #ifdef __OPTIMIZE__
-__funline __m256i _mm256_permute2x128_si256(__m256i __X, __m256i __Y,
-                                           const int __M) {
-  return (__m256i)__builtin_ia32_permti256((__v4di)__X, (__v4di)__Y, __M);
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permute2x128_si256 (__m256i __X, __m256i __Y, const int __M)
+{
+  return (__m256i) __builtin_ia32_permti256 ((__v4di)__X, (__v4di)__Y, __M);
 }
 #else
-#define _mm256_permute2x128_si256(X, Y, M)                 \
-  ((__m256i)__builtin_ia32_permti256((__v4di)(__m256i)(X), \
-                                     (__v4di)(__m256i)(Y), (int)(M)))
+#define _mm256_permute2x128_si256(X, Y, M) ((__m256i) __builtin_ia32_permti256 ((__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)(M)))
 #endif
-
 #ifdef __OPTIMIZE__
-__funline __m128i _mm256_extracti128_si256(__m256i __X, const int __M) {
-  return (__m128i)__builtin_ia32_extract128i256((__v4di)__X, __M);
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_extracti128_si256 (__m256i __X, const int __M)
+{
+  return (__m128i) __builtin_ia32_extract128i256 ((__v4di)__X, __M);
 }
 #else
-#define _mm256_extracti128_si256(X, M) \
-  ((__m128i)__builtin_ia32_extract128i256((__v4di)(__m256i)(X), (int)(M)))
+#define _mm256_extracti128_si256(X, M) ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(X), (int)(M)))
 #endif
-
 #ifdef __OPTIMIZE__
-__funline __m256i _mm256_inserti128_si256(__m256i __X, __m128i __Y,
-                                         const int __M) {
-  return (__m256i)__builtin_ia32_insert128i256((__v4di)__X, (__v2di)__Y, __M);
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_inserti128_si256 (__m256i __X, __m128i __Y, const int __M)
+{
+  return (__m256i) __builtin_ia32_insert128i256 ((__v4di)__X, (__v2di)__Y, __M);
 }
 #else
-#define _mm256_inserti128_si256(X, Y, M)                       \
-  ((__m256i)__builtin_ia32_insert128i256((__v4di)(__m256i)(X), \
-                                         (__v2di)(__m128i)(Y), (int)(M)))
+#define _mm256_inserti128_si256(X, Y, M) ((__m256i) __builtin_ia32_insert128i256 ((__v4di)(__m256i)(X), (__v2di)(__m128i)(Y), (int)(M)))
 #endif
-
-__funline __m256i _mm256_maskload_epi32(int const *__X, __m256i __M) {
-  return (__m256i)__builtin_ia32_maskloadd256((const __v8si *)__X, (__v8si)__M);
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskload_epi32 (int const *__X, __m256i __M )
+{
+  return (__m256i) __builtin_ia32_maskloadd256 ((const __v8si *)__X,
+                                                (__v8si)__M);
 }
-
-__funline __m256i _mm256_maskload_epi64(long long const *__X, __m256i __M) {
-  return (__m256i)__builtin_ia32_maskloadq256((const __v4di *)__X, (__v4di)__M);
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskload_epi64 (long long const *__X, __m256i __M )
+{
+  return (__m256i) __builtin_ia32_maskloadq256 ((const __v4di *)__X,
+                                                (__v4di)__M);
 }
-
-__funline __m128i _mm_maskload_epi32(int const *__X, __m128i __M) {
-  return (__m128i)__builtin_ia32_maskloadd((const __v4si *)__X, (__v4si)__M);
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskload_epi32 (int const *__X, __m128i __M )
+{
+  return (__m128i) __builtin_ia32_maskloadd ((const __v4si *)__X,
+                                             (__v4si)__M);
 }
-
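The maskload intrinsics read only the lanes whose mask element has its sign bit set and zero the rest, which makes them useful for loop tails that would otherwise read past the end of an array. A sketch under that assumption, illustrative only and not part of the patch:

#include <immintrin.h>

/* Load a[0..n-1] for n < 8 without touching a[n..7] in memory. */
__m256i load_tail(const int *a, int n) {
  __m256i lane = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
  __m256i mask = _mm256_cmpgt_epi32(_mm256_set1_epi32(n), lane);
  return _mm256_maskload_epi32(a, mask);
}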
-__funline __m128i _mm_maskload_epi64(long long const *__X, __m128i __M) {
-  return (__m128i)__builtin_ia32_maskloadq((const __v2di *)__X, (__v2di)__M);
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskload_epi64 (long long const *__X, __m128i __M )
+{
+  return (__m128i) __builtin_ia32_maskloadq ((const __v2di *)__X,
+                                             (__v2di)__M);
 }
-
-__funline void _mm256_maskstore_epi32(int *__X, __m256i __M, __m256i __Y) {
-  __builtin_ia32_maskstored256((__v8si *)__X, (__v8si)__M, (__v8si)__Y);
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskstore_epi32 (int *__X, __m256i __M, __m256i __Y )
+{
+  __builtin_ia32_maskstored256 ((__v8si *)__X, (__v8si)__M, (__v8si)__Y);
 }
-
-__funline void _mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y) {
-  __builtin_ia32_maskstoreq256((__v4di *)__X, (__v4di)__M, (__v4di)__Y);
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskstore_epi64 (long long *__X, __m256i __M, __m256i __Y )
+{
+  __builtin_ia32_maskstoreq256 ((__v4di *)__X, (__v4di)__M, (__v4di)__Y);
 }
-
-__funline void _mm_maskstore_epi32(int *__X, __m128i __M, __m128i __Y) {
-  __builtin_ia32_maskstored((__v4si *)__X, (__v4si)__M, (__v4si)__Y);
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskstore_epi32 (int *__X, __m128i __M, __m128i __Y )
+{
+  __builtin_ia32_maskstored ((__v4si *)__X, (__v4si)__M, (__v4si)__Y);
 }
-
-__funline void _mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y) {
-  __builtin_ia32_maskstoreq((__v2di *)__X, (__v2di)__M, (__v2di)__Y);
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskstore_epi64 (long long *__X, __m128i __M, __m128i __Y )
+{
+  __builtin_ia32_maskstoreq (( __v2di *)__X, (__v2di)__M, (__v2di)__Y);
 }
-
-__funline __m256i _mm256_sllv_epi32(__m256i __X, __m256i __Y) {
-  return (__m256i)__builtin_ia32_psllv8si((__v8si)__X, (__v8si)__Y);
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sllv_epi32 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_psllv8si ((__v8si)__X, (__v8si)__Y);
 }
-
-__funline __m128i _mm_sllv_epi32(__m128i __X, __m128i __Y) {
-  return (__m128i)__builtin_ia32_psllv4si((__v4si)__X, (__v4si)__Y);
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sllv_epi32 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_psllv4si ((__v4si)__X, (__v4si)__Y);
 }
-
-__funline __m256i _mm256_sllv_epi64(__m256i __X, __m256i __Y) {
-  return (__m256i)__builtin_ia32_psllv4di((__v4di)__X, (__v4di)__Y);
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sllv_epi64 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_psllv4di ((__v4di)__X, (__v4di)__Y);
 }
-
-__funline __m128i _mm_sllv_epi64(__m128i __X, __m128i __Y) {
-  return (__m128i)__builtin_ia32_psllv2di((__v2di)__X, (__v2di)__Y);
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sllv_epi64 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_psllv2di ((__v2di)__X, (__v2di)__Y);
 }
-
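Unlike the immediate-count shifts earlier in this header, the sllv/srav/srlv family shifts each lane by its own count taken from the second vector. A small usage sketch, illustrative only:

#include <immintrin.h>

/* Multiply each 32-bit lane of v by its own power of two. */
__m256i scale_lanes(__m256i v, __m256i log2_scale) {
  return _mm256_sllv_epi32(v, log2_scale);
}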
-__funline __m256i _mm256_srav_epi32(__m256i __X, __m256i __Y) {
-  return (__m256i)__builtin_ia32_psrav8si((__v8si)__X, (__v8si)__Y);
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srav_epi32 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_psrav8si ((__v8si)__X, (__v8si)__Y);
 }
-
-__funline __m128i _mm_srav_epi32(__m128i __X, __m128i __Y) {
-  return (__m128i)__builtin_ia32_psrav4si((__v4si)__X, (__v4si)__Y);
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srav_epi32 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_psrav4si ((__v4si)__X, (__v4si)__Y);
 }
-
-__funline __m256i _mm256_srlv_epi32(__m256i __X, __m256i __Y) {
-  return (__m256i)__builtin_ia32_psrlv8si((__v8si)__X, (__v8si)__Y);
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srlv_epi32 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_psrlv8si ((__v8si)__X, (__v8si)__Y);
 }
-
-__funline __m128i _mm_srlv_epi32(__m128i __X, __m128i __Y) {
-  return (__m128i)__builtin_ia32_psrlv4si((__v4si)__X, (__v4si)__Y);
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srlv_epi32 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_psrlv4si ((__v4si)__X, (__v4si)__Y);
 }
-
-__funline __m256i _mm256_srlv_epi64(__m256i __X, __m256i __Y) {
-  return (__m256i)__builtin_ia32_psrlv4di((__v4di)__X, (__v4di)__Y);
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srlv_epi64 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_psrlv4di ((__v4di)__X, (__v4di)__Y);
 }
-
-__funline __m128i _mm_srlv_epi64(__m128i __X, __m128i __Y) {
-  return (__m128i)__builtin_ia32_psrlv2di((__v2di)__X, (__v2di)__Y);
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srlv_epi64 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_psrlv2di ((__v2di)__X, (__v2di)__Y);
 }
-
 #ifdef __OPTIMIZE__
-__funline __m128d _mm_i32gather_pd(double const *__base, __m128i __index,
-                                  const int __scale) {
-  __v2df __zero = _mm_setzero_pd();
-  __v2df __mask = _mm_cmpeq_pd(__zero, __zero);
-
-  return (__m128d)__builtin_ia32_gathersiv2df(_mm_undefined_pd(), __base,
-                                              (__v4si)__index, __mask, __scale);
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i32gather_pd (double const *__base, __m128i __index, const int __scale)
+{
+  __v2df __zero = _mm_setzero_pd ();
+  __v2df __mask = _mm_cmpeq_pd (__zero, __zero);
+  return (__m128d) __builtin_ia32_gathersiv2df (_mm_undefined_pd (),
+                                                __base,
+                                                (__v4si)__index,
+                                                __mask,
+                                                __scale);
 }
-
-__funline __m128d _mm_mask_i32gather_pd(__m128d __src, double const *__base,
-                                       __m128i __index, __m128d __mask,
-                                       const int __scale) {
-  return (__m128d)__builtin_ia32_gathersiv2df(
-      (__v2df)__src, __base, (__v4si)__index, (__v2df)__mask, __scale);
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i32gather_pd (__m128d __src, double const *__base, __m128i __index,
+                       __m128d __mask, const int __scale)
+{
+  return (__m128d) __builtin_ia32_gathersiv2df ((__v2df)__src,
+                                                __base,
+                                                (__v4si)__index,
+                                                (__v2df)__mask,
+                                                __scale);
 }
-
-__funline __m256d _mm256_i32gather_pd(double const *__base, __m128i __index,
-                                     const int __scale) {
-  __v4df __zero = _mm256_setzero_pd();
-  __v4df __mask = _mm256_cmp_pd(__zero, __zero, _CMP_EQ_OQ);
-  return (__m256d)__builtin_ia32_gathersiv4df(_mm256_undefined_pd(), __base,
-                                              (__v4si)__index, __mask, __scale);
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i32gather_pd (double const *__base, __m128i __index, const int __scale)
+{
+  __v4df __zero = _mm256_setzero_pd ();
+  __v4df __mask = _mm256_cmp_pd (__zero, __zero, _CMP_EQ_OQ);
+  return (__m256d) __builtin_ia32_gathersiv4df (_mm256_undefined_pd (),
+                                                __base,
+                                                (__v4si)__index,
+                                                __mask,
+                                                __scale);
 }
-
-__funline __m256d _mm256_mask_i32gather_pd(__m256d __src, double const *__base,
-                                          __m128i __index, __m256d __mask,
-                                          const int __scale) {
-  return (__m256d)__builtin_ia32_gathersiv4df(
-      (__v4df)__src, __base, (__v4si)__index, (__v4df)__mask, __scale);
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i32gather_pd (__m256d __src, double const *__base,
+                          __m128i __index, __m256d __mask, const int __scale)
+{
+  return (__m256d) __builtin_ia32_gathersiv4df ((__v4df)__src,
+                                                __base,
+                                                (__v4si)__index,
+                                                (__v4df)__mask,
+                                                __scale);
 }
-
-__funline __m128d _mm_i64gather_pd(double const *__base, __m128i __index,
-                                  const int __scale) {
-  __v2df __src = _mm_setzero_pd();
-  __v2df __mask = _mm_cmpeq_pd(__src, __src);
-  return (__m128d)__builtin_ia32_gatherdiv2df(__src, __base, (__v2di)__index,
-                                              __mask, __scale);
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i64gather_pd (double const *__base, __m128i __index, const int __scale)
+{
+  __v2df __src = _mm_setzero_pd ();
+  __v2df __mask = _mm_cmpeq_pd (__src, __src);
+  return (__m128d) __builtin_ia32_gatherdiv2df (__src,
+                                                __base,
+                                                (__v2di)__index,
+                                                __mask,
+                                                __scale);
 }
-
-__funline __m128d _mm_mask_i64gather_pd(__m128d __src, double const *__base,
-                                       __m128i __index, __m128d __mask,
-                                       const int __scale) {
-  return (__m128d)__builtin_ia32_gatherdiv2df(
-      (__v2df)__src, __base, (__v2di)__index, (__v2df)__mask, __scale);
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i64gather_pd (__m128d __src, double const *__base, __m128i __index,
+                       __m128d __mask, const int __scale)
+{
+  return (__m128d) __builtin_ia32_gatherdiv2df ((__v2df)__src,
+                                                __base,
+                                                (__v2di)__index,
+                                                (__v2df)__mask,
+                                                __scale);
 }
-
-__funline __m256d _mm256_i64gather_pd(double const *__base, __m256i __index,
-                                     const int __scale) {
-  __v4df __src = _mm256_setzero_pd();
-  __v4df __mask = _mm256_cmp_pd(__src, __src, _CMP_EQ_OQ);
-  return (__m256d)__builtin_ia32_gatherdiv4df(__src, __base, (__v4di)__index,
-                                              __mask, __scale);
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i64gather_pd (double const *__base, __m256i __index, const int __scale)
+{
+  __v4df __src = _mm256_setzero_pd ();
+  __v4df __mask = _mm256_cmp_pd (__src, __src, _CMP_EQ_OQ);
+  return (__m256d) __builtin_ia32_gatherdiv4df (__src,
+                                                __base,
+                                                (__v4di)__index,
+                                                __mask,
+                                                __scale);
 }
-
-__funline __m256d _mm256_mask_i64gather_pd(__m256d __src, double const *__base,
-                                          __m256i __index, __m256d __mask,
-                                          const int __scale) {
-  return (__m256d)__builtin_ia32_gatherdiv4df(
-      (__v4df)__src, __base, (__v4di)__index, (__v4df)__mask, __scale);
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i64gather_pd (__m256d __src, double const *__base,
+                          __m256i __index, __m256d __mask, const int __scale)
+{
+  return (__m256d) __builtin_ia32_gatherdiv4df ((__v4df)__src,
+                                                __base,
+                                                (__v4di)__index,
+                                                (__v4df)__mask,
+                                                __scale);
 }
-
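Each gather reads lane i from base + index[i]*scale; the unmasked forms synthesize an all-ones mask (here via _mm_cmpeq_pd of a value with itself) so every lane is fetched. A usage sketch, illustrative only; the scale must be a compile-time 1, 2, 4, or 8:

#include <immintrin.h>

/* Fetch table[idx32[0]] and table[idx32[1]] in one gather. */
__m128d gather2(const double *table, __m128i idx32) {
  return _mm_i32gather_pd(table, idx32, 8);
}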
-__funline __m128 _mm_i32gather_ps(float const *__base, __m128i __index,
-                                 const int __scale) {
-  __v4sf __src = _mm_setzero_ps();
-  __v4sf __mask = _mm_cmpeq_ps(__src, __src);
-  return (__m128)__builtin_ia32_gathersiv4sf(__src, __base, (__v4si)__index,
-                                             __mask, __scale);
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i32gather_ps (float const *__base, __m128i __index, const int __scale)
+{
+  __v4sf __src = _mm_setzero_ps ();
+  __v4sf __mask = _mm_cmpeq_ps (__src, __src);
+  return (__m128) __builtin_ia32_gathersiv4sf (__src,
+                                               __base,
+                                               (__v4si)__index,
+                                               __mask,
+                                               __scale);
 }
-
-__funline __m128 _mm_mask_i32gather_ps(__m128 __src, float const *__base,
-                                      __m128i __index, __m128 __mask,
-                                      const int __scale) {
-  return (__m128)__builtin_ia32_gathersiv4sf(
-      (__v4sf)__src, __base, (__v4si)__index, (__v4sf)__mask, __scale);
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i32gather_ps (__m128 __src, float const *__base, __m128i __index,
+                       __m128 __mask, const int __scale)
+{
+  return (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)__src,
+                                               __base,
+                                               (__v4si)__index,
+                                               (__v4sf)__mask,
+                                               __scale);
 }
-
-__funline __m256 _mm256_i32gather_ps(float const *__base, __m256i __index,
-                                    const int __scale) {
-  __v8sf __src = _mm256_setzero_ps();
-  __v8sf __mask = _mm256_cmp_ps(__src, __src, _CMP_EQ_OQ);
-  return (__m256)__builtin_ia32_gathersiv8sf(__src, __base, (__v8si)__index,
-                                             __mask, __scale);
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i32gather_ps (float const *__base, __m256i __index, const int __scale)
+{
+  __v8sf __src = _mm256_setzero_ps ();
+  __v8sf __mask = _mm256_cmp_ps (__src, __src, _CMP_EQ_OQ);
+  return (__m256) __builtin_ia32_gathersiv8sf (__src,
+                                               __base,
+                                               (__v8si)__index,
+                                               __mask,
+                                               __scale);
 }
-
-__funline __m256 _mm256_mask_i32gather_ps(__m256 __src, float const *__base,
-                                         __m256i __index, __m256 __mask,
-                                         const int __scale) {
-  return (__m256)__builtin_ia32_gathersiv8sf(
-      (__v8sf)__src, __base, (__v8si)__index, (__v8sf)__mask, __scale);
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i32gather_ps (__m256 __src, float const *__base,
+                          __m256i __index, __m256 __mask, const int __scale)
+{
+  return (__m256) __builtin_ia32_gathersiv8sf ((__v8sf)__src,
+                                               __base,
+                                               (__v8si)__index,
+                                               (__v8sf)__mask,
+                                               __scale);
 }
-
-__funline __m128 _mm_i64gather_ps(float const *__base, __m128i __index,
-                                 const int __scale) {
-  __v4sf __src = _mm_setzero_ps();
-  __v4sf __mask = _mm_cmpeq_ps(__src, __src);
-  return (__m128)__builtin_ia32_gatherdiv4sf(__src, __base, (__v2di)__index,
-                                             __mask, __scale);
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i64gather_ps (float const *__base, __m128i __index, const int __scale)
+{
+  __v4sf __src = _mm_setzero_ps ();
+  __v4sf __mask = _mm_cmpeq_ps (__src, __src);
+  return (__m128) __builtin_ia32_gatherdiv4sf (__src,
+                                               __base,
+                                               (__v2di)__index,
+                                               __mask,
+                                               __scale);
 }
-
-__funline __m128 _mm_mask_i64gather_ps(__m128 __src, float const *__base,
-                                      __m128i __index, __m128 __mask,
-                                      const int __scale) {
-  return (__m128)__builtin_ia32_gatherdiv4sf(
-      (__v4sf)__src, __base, (__v2di)__index, (__v4sf)__mask, __scale);
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i64gather_ps (__m128 __src, float const *__base, __m128i __index,
+                       __m128 __mask, const int __scale)
+{
+  return (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)__src,
+                                               __base,
+                                               (__v2di)__index,
+                                               (__v4sf)__mask,
+                                               __scale);
 }
-
-__funline __m128 _mm256_i64gather_ps(float const *__base, __m256i __index,
-                                    const int __scale) {
-  __v4sf __src = _mm_setzero_ps();
-  __v4sf __mask = _mm_cmpeq_ps(__src, __src);
-  return (__m128)__builtin_ia32_gatherdiv4sf256(__src, __base, (__v4di)__index,
-                                                __mask, __scale);
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i64gather_ps (float const *__base, __m256i __index, const int __scale)
+{
+  __v4sf __src = _mm_setzero_ps ();
+  __v4sf __mask = _mm_cmpeq_ps (__src, __src);
+  return (__m128) __builtin_ia32_gatherdiv4sf256 (__src,
+                                                  __base,
+                                                  (__v4di)__index,
+                                                  __mask,
+                                                  __scale);
 }
-
-__funline __m128 _mm256_mask_i64gather_ps(__m128 __src, float const *__base,
-                                         __m256i __index, __m128 __mask,
-                                         const int __scale) {
-  return (__m128)__builtin_ia32_gatherdiv4sf256(
-      (__v4sf)__src, __base, (__v4di)__index, (__v4sf)__mask, __scale);
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i64gather_ps (__m128 __src, float const *__base,
+                          __m256i __index, __m128 __mask, const int __scale)
+{
+  return (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)__src,
+                                                  __base,
+                                                  (__v4di)__index,
+                                                  (__v4sf)__mask,
+                                                  __scale);
 }
-
-__funline __m128i _mm_i32gather_epi64(long long int const *__base,
-                                     __m128i __index, const int __scale) {
-  __v2di __src = __extension__(__v2di){0, 0};
-  __v2di __mask = __extension__(__v2di){~0, ~0};
-  return (__m128i)__builtin_ia32_gathersiv2di(__src, __base, (__v4si)__index,
-                                              __mask, __scale);
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i32gather_epi64 (long long int const *__base,
+                     __m128i __index, const int __scale)
+{
+  __v2di __src = __extension__ (__v2di){ 0, 0 };
+  __v2di __mask = __extension__ (__v2di){ ~0, ~0 };
+  return (__m128i) __builtin_ia32_gathersiv2di (__src,
+                                                __base,
+                                                (__v4si)__index,
+                                                __mask,
+                                                __scale);
 }
-
-__funline __m128i _mm_mask_i32gather_epi64(__m128i __src,
-                                          long long int const *__base,
-                                          __m128i __index, __m128i __mask,
-                                          const int __scale) {
-  return (__m128i)__builtin_ia32_gathersiv2di(
-      (__v2di)__src, __base, (__v4si)__index, (__v2di)__mask, __scale);
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i32gather_epi64 (__m128i __src, long long int const *__base,
+                          __m128i __index, __m128i __mask, const int __scale)
+{
+  return (__m128i) __builtin_ia32_gathersiv2di ((__v2di)__src,
+                                                __base,
+                                                (__v4si)__index,
+                                                (__v2di)__mask,
+                                                __scale);
 }
-
-__funline __m256i _mm256_i32gather_epi64(long long int const *__base,
-                                        __m128i __index, const int __scale) {
-  __v4di __src = __extension__(__v4di){0, 0, 0, 0};
-  __v4di __mask = __extension__(__v4di){~0, ~0, ~0, ~0};
-  return (__m256i)__builtin_ia32_gathersiv4di(__src, __base, (__v4si)__index,
-                                              __mask, __scale);
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i32gather_epi64 (long long int const *__base,
+                        __m128i __index, const int __scale)
+{
+  __v4di __src = __extension__ (__v4di){ 0, 0, 0, 0 };
+  __v4di __mask = __extension__ (__v4di){ ~0, ~0, ~0, ~0 };
+  return (__m256i) __builtin_ia32_gathersiv4di (__src,
+                                                __base,
+                                                (__v4si)__index,
+                                                __mask,
+                                                __scale);
 }
-
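For the integer gathers there is no cmpeq trick, so the all-ones mask is spelled as a vector literal such as __extension__ (__v2di){ ~0, ~0 }. The masked forms let the caller gather only a subset, with unselected lanes passing through from src. An illustrative sketch, not part of the patch:

#include <immintrin.h>

/* Gather only the lanes whose mask sign bit is set;
   the other lanes keep their values from src. */
__m256i gather_some(__m256i src, const long long *base,
                    __m128i idx, __m256i mask) {
  return _mm256_mask_i32gather_epi64(src, base, idx, mask, 8);
}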
-__funline __m256i _mm256_mask_i32gather_epi64(__m256i __src,
-                                             long long int const *__base,
-                                             __m128i __index, __m256i __mask,
-                                             const int __scale) {
-  return (__m256i)__builtin_ia32_gathersiv4di(
-      (__v4di)__src, __base, (__v4si)__index, (__v4di)__mask, __scale);
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i32gather_epi64 (__m256i __src, long long int const *__base,
+                             __m128i __index, __m256i __mask,
+                             const int __scale)
+{
+  return (__m256i) __builtin_ia32_gathersiv4di ((__v4di)__src,
+                                                __base,
+                                                (__v4si)__index,
+                                                (__v4di)__mask,
+                                                __scale);
 }
-
-__funline __m128i _mm_i64gather_epi64(long long int const *__base,
-                                     __m128i __index, const int __scale) {
-  __v2di __src = __extension__(__v2di){0, 0};
-  __v2di __mask = __extension__(__v2di){~0, ~0};
-  return (__m128i)__builtin_ia32_gatherdiv2di(__src, __base, (__v2di)__index,
-                                              __mask, __scale);
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i64gather_epi64 (long long int const *__base,
+                     __m128i __index, const int __scale)
+{
+  __v2di __src = __extension__ (__v2di){ 0, 0 };
+  __v2di __mask = __extension__ (__v2di){ ~0, ~0 };
+  return (__m128i) __builtin_ia32_gatherdiv2di (__src,
+                                                __base,
+                                                (__v2di)__index,
+                                                __mask,
+                                                __scale);
 }
-
-__funline __m128i _mm_mask_i64gather_epi64(__m128i __src,
-                                          long long int const *__base,
-                                          __m128i __index, __m128i __mask,
-                                          const int __scale) {
-  return (__m128i)__builtin_ia32_gatherdiv2di(
-      (__v2di)__src, __base, (__v2di)__index, (__v2di)__mask, __scale);
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i64gather_epi64 (__m128i __src, long long int const *__base,
+                          __m128i __index, __m128i __mask, const int __scale)
+{
+  return (__m128i) __builtin_ia32_gatherdiv2di ((__v2di)__src,
+                                                __base,
+                                                (__v2di)__index,
+                                                (__v2di)__mask,
+                                                __scale);
 }
-
-__funline __m256i _mm256_i64gather_epi64(long long int const *__base,
-                                        __m256i __index, const int __scale) {
-  __v4di __src = __extension__(__v4di){0, 0, 0, 0};
-  __v4di __mask = __extension__(__v4di){~0, ~0, ~0, ~0};
-  return (__m256i)__builtin_ia32_gatherdiv4di(__src, __base, (__v4di)__index,
-                                              __mask, __scale);
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i64gather_epi64 (long long int const *__base,
+                        __m256i __index, const int __scale)
+{
+  __v4di __src = __extension__ (__v4di){ 0, 0, 0, 0 };
+  __v4di __mask = __extension__ (__v4di){ ~0, ~0, ~0, ~0 };
+  return (__m256i) __builtin_ia32_gatherdiv4di (__src,
+                                                __base,
+                                                (__v4di)__index,
+                                                __mask,
+                                                __scale);
 }
-
-__funline __m256i _mm256_mask_i64gather_epi64(__m256i __src,
-                                             long long int const *__base,
-                                             __m256i __index, __m256i __mask,
-                                             const int __scale) {
-  return (__m256i)__builtin_ia32_gatherdiv4di(
-      (__v4di)__src, __base, (__v4di)__index, (__v4di)__mask, __scale);
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i64gather_epi64 (__m256i __src, long long int const *__base,
+                             __m256i __index, __m256i __mask,
+                             const int __scale)
+{
+  return (__m256i) __builtin_ia32_gatherdiv4di ((__v4di)__src,
+                                                __base,
+                                                (__v4di)__index,
+                                                (__v4di)__mask,
+                                                __scale);
 }
-
-__funline __m128i _mm_i32gather_epi32(int const *__base, __m128i __index,
-                                     const int __scale) {
-  __v4si __src = __extension__(__v4si){0, 0, 0, 0};
-  __v4si __mask = __extension__(__v4si){~0, ~0, ~0, ~0};
-  return (__m128i)__builtin_ia32_gathersiv4si(__src, __base, (__v4si)__index,
-                                              __mask, __scale);
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i32gather_epi32 (int const *__base, __m128i __index, const int __scale)
+{
+  __v4si __src = __extension__ (__v4si){ 0, 0, 0, 0 };
+  __v4si __mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };
+  return (__m128i) __builtin_ia32_gathersiv4si (__src,
+                                                __base,
+                                                (__v4si)__index,
+                                                __mask,
+                                                __scale);
 }
-
-__funline __m128i _mm_mask_i32gather_epi32(__m128i __src, int const *__base,
-                                          __m128i __index, __m128i __mask,
-                                          const int __scale) {
-  return (__m128i)__builtin_ia32_gathersiv4si(
-      (__v4si)__src, __base, (__v4si)__index, (__v4si)__mask, __scale);
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i32gather_epi32 (__m128i __src, int const *__base, __m128i __index,
+                          __m128i __mask, const int __scale)
+{
+  return (__m128i) __builtin_ia32_gathersiv4si ((__v4si)__src,
+                                                __base,
+                                                (__v4si)__index,
+                                                (__v4si)__mask,
+                                                __scale);
 }
-
-__funline __m256i _mm256_i32gather_epi32(int const *__base, __m256i __index,
-                                        const int __scale) {
-  __v8si __src = __extension__(__v8si){0, 0, 0, 0, 0, 0, 0, 0};
-  __v8si __mask = __extension__(__v8si){~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0};
-  return (__m256i)__builtin_ia32_gathersiv8si(__src, __base, (__v8si)__index,
-                                              __mask, __scale);
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i32gather_epi32 (int const *__base, __m256i __index, const int __scale)
+{
+  __v8si __src = __extension__ (__v8si){ 0, 0, 0, 0, 0, 0, 0, 0 };
+  __v8si __mask = __extension__ (__v8si){ ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0 };
+  return (__m256i) __builtin_ia32_gathersiv8si (__src,
+                                                __base,
+                                                (__v8si)__index,
+                                                __mask,
+                                                __scale);
 }
-
-__funline __m256i _mm256_mask_i32gather_epi32(__m256i __src, int const *__base,
-                                             __m256i __index, __m256i __mask,
-                                             const int __scale) {
-  return (__m256i)__builtin_ia32_gathersiv8si(
-      (__v8si)__src, __base, (__v8si)__index, (__v8si)__mask, __scale);
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i32gather_epi32 (__m256i __src, int const *__base,
+                             __m256i __index, __m256i __mask,
+                             const int __scale)
+{
+  return (__m256i) __builtin_ia32_gathersiv8si ((__v8si)__src,
+                                                __base,
+                                                (__v8si)__index,
+                                                (__v8si)__mask,
+                                                __scale);
 }
-
-__funline __m128i _mm_i64gather_epi32(int const *__base, __m128i __index,
-                                     const int __scale) {
-  __v4si __src = __extension__(__v4si){0, 0, 0, 0};
-  __v4si __mask = __extension__(__v4si){~0, ~0, ~0, ~0};
-  return (__m128i)__builtin_ia32_gatherdiv4si(__src, __base, (__v2di)__index,
-                                              __mask, __scale);
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i64gather_epi32 (int const *__base, __m128i __index, const int __scale)
+{
+  __v4si __src = __extension__ (__v4si){ 0, 0, 0, 0 };
+  __v4si __mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };
+  return (__m128i) __builtin_ia32_gatherdiv4si (__src,
+                                                __base,
+                                                (__v2di)__index,
+                                                __mask,
+                                                __scale);
 }
-
-__funline __m128i _mm_mask_i64gather_epi32(__m128i __src, int const *__base,
-                                          __m128i __index, __m128i __mask,
-                                          const int __scale) {
-  return (__m128i)__builtin_ia32_gatherdiv4si(
-      (__v4si)__src, __base, (__v2di)__index, (__v4si)__mask, __scale);
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i64gather_epi32 (__m128i __src, int const *__base, __m128i __index,
+                          __m128i __mask, const int __scale)
+{
+  return (__m128i) __builtin_ia32_gatherdiv4si ((__v4si)__src,
+                                                __base,
+                                                (__v2di)__index,
+                                                (__v4si)__mask,
+                                                __scale);
 }
-
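Note the shapes here: with 64-bit indices and 32-bit elements only half as many values can be produced, so _mm256_i64gather_epi32 takes a 256-bit index vector but returns a 128-bit result. An illustrative sketch, not part of the patch:

#include <immintrin.h>

/* Four 64-bit indices yield four 32-bit values in a __m128i. */
__m128i gather_narrow(const int *base, __m256i idx64) {
  return _mm256_i64gather_epi32(base, idx64, 4);
}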
-__funline __m128i _mm256_i64gather_epi32(int const *__base, __m256i __index,
-                                        const int __scale) {
-  __v4si __src = __extension__(__v4si){0, 0, 0, 0};
-  __v4si __mask = __extension__(__v4si){~0, ~0, ~0, ~0};
-  return (__m128i)__builtin_ia32_gatherdiv4si256(__src, __base, (__v4di)__index,
-                                                 __mask, __scale);
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i64gather_epi32 (int const *__base, __m256i __index, const int __scale)
+{
+  __v4si __src = __extension__ (__v4si){ 0, 0, 0, 0 };
+  __v4si __mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };
+  return (__m128i) __builtin_ia32_gatherdiv4si256 (__src,
+                                                   __base,
+                                                   (__v4di)__index,
+                                                   __mask,
+                                                   __scale);
 }
-
-__funline __m128i _mm256_mask_i64gather_epi32(__m128i __src, int const *__base,
-                                             __m256i __index, __m128i __mask,
-                                             const int __scale) {
-  return (__m128i)__builtin_ia32_gatherdiv4si256(
-      (__v4si)__src, __base, (__v4di)__index, (__v4si)__mask, __scale);
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i64gather_epi32 (__m128i __src, int const *__base,
+                             __m256i __index, __m128i __mask,
+                             const int __scale)
+{
+  return (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si)__src,
+                                                   __base,
+                                                   (__v4di)__index,
+                                                   (__v4si)__mask,
+                                                   __scale);
 }
-#else /* __OPTIMIZE__ */
-#define _mm_i32gather_pd(BASE, INDEX, SCALE)                                 \
-  (__m128d) __builtin_ia32_gathersiv2df(                                     \
-      (__v2df)_mm_setzero_pd(), (double const *)BASE, (__v4si)(__m128i)INDEX, \
-      (__v2df)_mm_set1_pd((double)(long long int)-1), (int)SCALE)
-
-#define _mm_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE)                 \
-  (__m128d) __builtin_ia32_gathersiv2df(                                     \
-      (__v2df)(__m128d)SRC, (double const *)BASE, (__v4si)(__m128i)INDEX,    \
-      (__v2df)(__m128d)MASK, (int)SCALE)
-
-#define _mm256_i32gather_pd(BASE, INDEX, SCALE)                              \
-  (__m256d) __builtin_ia32_gathersiv4df(                                     \
-      (__v4df)_mm256_setzero_pd(), (double const *)BASE,                     \
-      (__v4si)(__m128i)INDEX,                                                \
-      (__v4df)_mm256_set1_pd((double)(long long int)-1), (int)SCALE)
-
-#define _mm256_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE)              \
-  (__m256d) __builtin_ia32_gathersiv4df(                                     \
-      (__v4df)(__m256d)SRC, (double const *)BASE, (__v4si)(__m128i)INDEX,    \
-      (__v4df)(__m256d)MASK, (int)SCALE)
-
-#define _mm_i64gather_pd(BASE, INDEX, SCALE)                                 \
-  (__m128d) __builtin_ia32_gatherdiv2df(                                     \
-      (__v2df)_mm_setzero_pd(), (double const *)BASE, (__v2di)(__m128i)INDEX, \
-      (__v2df)_mm_set1_pd((double)(long long int)-1), (int)SCALE)
-
-#define _mm_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE)                 \
-  (__m128d) __builtin_ia32_gatherdiv2df(                                     \
-      (__v2df)(__m128d)SRC, (double const *)BASE, (__v2di)(__m128i)INDEX,    \
-      (__v2df)(__m128d)MASK, (int)SCALE)
-
-#define _mm256_i64gather_pd(BASE, INDEX, SCALE)                              \
-  (__m256d) __builtin_ia32_gatherdiv4df(                                     \
-      (__v4df)_mm256_setzero_pd(), (double const *)BASE,                     \
-      (__v4di)(__m256i)INDEX,                                                \
-      (__v4df)_mm256_set1_pd((double)(long long int)-1), (int)SCALE)
-
-#define _mm256_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE)              \
-  (__m256d) __builtin_ia32_gatherdiv4df(                                     \
-      (__v4df)(__m256d)SRC, (double const *)BASE, (__v4di)(__m256i)INDEX,    \
-      (__v4df)(__m256d)MASK, (int)SCALE)
-
-#define _mm_i32gather_ps(BASE, INDEX, SCALE)                                 \
-  (__m128) __builtin_ia32_gathersiv4sf(                                      \
-      (__v4sf)_mm_setzero_ps(), (float const *)BASE, (__v4si)(__m128i)INDEX, \
-      _mm_set1_ps((float)(int)-1), (int)SCALE)
-
-#define _mm_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE)                 \
-  (__m128) __builtin_ia32_gathersiv4sf(                                      \
-      (__v4sf)(__m128d)SRC, (float const *)BASE, (__v4si)(__m128i)INDEX,     \
-      (__v4sf)(__m128d)MASK, (int)SCALE)
-
-#define _mm256_i32gather_ps(BASE, INDEX, SCALE)                              \
-  (__m256) __builtin_ia32_gathersiv8sf(                                      \
-      (__v8sf)_mm256_setzero_ps(), (float const *)BASE,                      \
-      (__v8si)(__m256i)INDEX, (__v8sf)_mm256_set1_ps((float)(int)-1),        \
-      (int)SCALE)
-
-#define _mm256_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE)              \
-  (__m256) __builtin_ia32_gathersiv8sf(                                      \
-      (__v8sf)(__m256)SRC, (float const *)BASE, (__v8si)(__m256i)INDEX,      \
-      (__v8sf)(__m256d)MASK, (int)SCALE)
-
-#define _mm_i64gather_ps(BASE, INDEX, SCALE)                                 \
-  (__m128) __builtin_ia32_gatherdiv4sf(                                      \
-      (__v4sf)_mm_setzero_pd(), (float const *)BASE, (__v2di)(__m128i)INDEX, \
-      (__v4sf)_mm_set1_ps((float)(int)-1), (int)SCALE)
-
-#define _mm_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE)                 \
-  (__m128) __builtin_ia32_gatherdiv4sf(                                      \
-      (__v4sf)(__m128)SRC, (float const *)BASE, (__v2di)(__m128i)INDEX,      \
-      (__v4sf)(__m128d)MASK, (int)SCALE)
-
-#define _mm256_i64gather_ps(BASE, INDEX, SCALE)                              \
-  (__m128) __builtin_ia32_gatherdiv4sf256(                                   \
-      (__v4sf)_mm_setzero_ps(), (float const *)BASE, (__v4di)(__m256i)INDEX, \
-      (__v4sf)_mm_set1_ps((float)(int)-1), (int)SCALE)
-
-#define _mm256_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE)              \
-  (__m128) __builtin_ia32_gatherdiv4sf256(                                   \
-      (__v4sf)(__m128)SRC, (float const *)BASE, (__v4di)(__m256i)INDEX,      \
-      (__v4sf)(__m128)MASK, (int)SCALE)
-
-#define _mm_i32gather_epi64(BASE, INDEX, SCALE)                              \
-  (__m128i) __builtin_ia32_gathersiv2di(                                     \
-      (__v2di)_mm_setzero_si128(), (long long const *)BASE,                  \
-      (__v4si)(__m128i)INDEX, (__v2di)_mm_set1_epi64x(-1), (int)SCALE)
-
-#define _mm_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE)              \
-  (__m128i) __builtin_ia32_gathersiv2di(                                     \
-      (__v2di)(__m128i)SRC, (long long const *)BASE, (__v4si)(__m128i)INDEX, \
-      (__v2di)(__m128i)MASK, (int)SCALE)
-
-#define _mm256_i32gather_epi64(BASE, INDEX, SCALE)                           \
-  (__m256i) __builtin_ia32_gathersiv4di(                                     \
-      (__v4di)_mm256_setzero_si256(), (long long const *)BASE,               \
-      (__v4si)(__m128i)INDEX, (__v4di)_mm256_set1_epi64x(-1), (int)SCALE)
-
-#define _mm256_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE)           \
-  (__m256i) __builtin_ia32_gathersiv4di(                                     \
-      (__v4di)(__m256i)SRC, (long long const *)BASE, (__v4si)(__m128i)INDEX, \
-      (__v4di)(__m256i)MASK, (int)SCALE)
-
-#define _mm_i64gather_epi64(BASE, INDEX, SCALE)                              \
-  (__m128i) __builtin_ia32_gatherdiv2di(                                     \
-      (__v2di)_mm_setzero_si128(), (long long const *)BASE,                  \
-      (__v2di)(__m128i)INDEX, (__v2di)_mm_set1_epi64x(-1), (int)SCALE)
-
-#define _mm_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE)              \
-  (__m128i) __builtin_ia32_gatherdiv2di(                                     \
-      (__v2di)(__m128i)SRC, (long long const *)BASE, (__v2di)(__m128i)INDEX, \
-      (__v2di)(__m128i)MASK, (int)SCALE)
-
-#define _mm256_i64gather_epi64(BASE, INDEX, SCALE)                           \
-  (__m256i) __builtin_ia32_gatherdiv4di(                                     \
-      (__v4di)_mm256_setzero_si256(), (long long const *)BASE,               \
-      (__v4di)(__m256i)INDEX, (__v4di)_mm256_set1_epi64x(-1), (int)SCALE)
-
-#define _mm256_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE)           \
-  (__m256i) __builtin_ia32_gatherdiv4di(                                     \
-      (__v4di)(__m256i)SRC, (long long const *)BASE, (__v4di)(__m256i)INDEX, \
-      (__v4di)(__m256i)MASK, (int)SCALE)
-
-#define _mm_i32gather_epi32(BASE, INDEX, SCALE)                              \
-  (__m128i) __builtin_ia32_gathersiv4si(                                     \
-      (__v4si)_mm_setzero_si128(), (int const *)BASE, (__v4si)(__m128i)INDEX, \
-      (__v4si)_mm_set1_epi32(-1), (int)SCALE)
-
-#define _mm_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE)              \
-  (__m128i) __builtin_ia32_gathersiv4si(                                     \
-      (__v4si)(__m128i)SRC, (int const *)BASE, (__v4si)(__m128i)INDEX,       \
-      (__v4si)(__m128i)MASK, (int)SCALE)
-
-#define _mm256_i32gather_epi32(BASE, INDEX, SCALE)                           \
-  (__m256i) __builtin_ia32_gathersiv8si(                                     \
-      (__v8si)_mm256_setzero_si256(), (int const *)BASE,                     \
-      (__v8si)(__m256i)INDEX, (__v8si)_mm256_set1_epi32(-1), (int)SCALE)
-
-#define _mm256_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE)           \
-  (__m256i) __builtin_ia32_gathersiv8si(                                     \
-      (__v8si)(__m256i)SRC, (int const *)BASE, (__v8si)(__m256i)INDEX,       \
-      (__v8si)(__m256i)MASK, (int)SCALE)
-
-#define _mm_i64gather_epi32(BASE, INDEX, SCALE)                              \
-  (__m128i) __builtin_ia32_gatherdiv4si(                                     \
-      (__v4si)_mm_setzero_si128(), (int const *)BASE, (__v2di)(__m128i)INDEX, \
-      (__v4si)_mm_set1_epi32(-1), (int)SCALE)
-
-#define _mm_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE)              \
-  (__m128i) __builtin_ia32_gatherdiv4si(                                     \
-      (__v4si)(__m128i)SRC, (int const *)BASE, (__v2di)(__m128i)INDEX,       \
-      (__v4si)(__m128i)MASK, (int)SCALE)
-
-#define _mm256_i64gather_epi32(BASE, INDEX, SCALE)                           \
-  (__m128i) __builtin_ia32_gatherdiv4si256(                                  \
-      (__v4si)_mm_setzero_si128(), (int const *)BASE, (__v4di)(__m256i)INDEX, \
-      (__v4si)_mm_set1_epi32(-1), (int)SCALE)
-
-#define _mm256_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE)           \
-  (__m128i) __builtin_ia32_gatherdiv4si256(                                  \
-      (__v4si)(__m128i)SRC, (int const *)BASE, (__v4di)(__m256i)INDEX,       \
-      (__v4si)(__m128i)MASK, (int)SCALE)
-#endif /* __OPTIMIZE__ */
-
+#else
+#define _mm_i32gather_pd(BASE, INDEX, SCALE) (__m128d) __builtin_ia32_gathersiv2df ((__v2df) _mm_setzero_pd (), (double const *) (BASE), (__v4si)(__m128i) (INDEX), (__v2df) _mm_cmpeq_pd (_mm_setzero_pd (), _mm_setzero_pd ()), (int) (SCALE))
+#define _mm_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE) (__m128d) __builtin_ia32_gathersiv2df ((__v2df)(__m128d) (SRC), (double const *) (BASE), (__v4si)(__m128i) (INDEX), (__v2df)(__m128d) (MASK), (int) (SCALE))
+#define _mm256_i32gather_pd(BASE, INDEX, SCALE) (__m256d) __builtin_ia32_gathersiv4df ((__v4df) _mm256_setzero_pd (), (double const *) (BASE), (__v4si)(__m128i) (INDEX), (__v4df) _mm256_cmp_pd (_mm256_setzero_pd (), _mm256_setzero_pd (), _CMP_EQ_OQ), (int) (SCALE))
+#define _mm256_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE) (__m256d) __builtin_ia32_gathersiv4df ((__v4df)(__m256d) (SRC), (double const *) (BASE), (__v4si)(__m128i) (INDEX), (__v4df)(__m256d) (MASK), (int) (SCALE))
+#define _mm_i64gather_pd(BASE, INDEX, SCALE) (__m128d) __builtin_ia32_gatherdiv2df ((__v2df) _mm_setzero_pd (), (double const *) (BASE), (__v2di)(__m128i) (INDEX), (__v2df) _mm_cmpeq_pd (_mm_setzero_pd (), _mm_setzero_pd ()), (int) (SCALE))
+#define _mm_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE) (__m128d) __builtin_ia32_gatherdiv2df ((__v2df)(__m128d) (SRC), (double const *) (BASE), (__v2di)(__m128i) (INDEX), (__v2df)(__m128d) (MASK), (int) (SCALE))
+#define _mm256_i64gather_pd(BASE, INDEX, SCALE) (__m256d) __builtin_ia32_gatherdiv4df ((__v4df) _mm256_setzero_pd (), (double const *) (BASE), (__v4di)(__m256i) (INDEX), (__v4df) _mm256_cmp_pd (_mm256_setzero_pd (), _mm256_setzero_pd (), _CMP_EQ_OQ), (int) (SCALE))
+#define _mm256_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE) (__m256d) __builtin_ia32_gatherdiv4df ((__v4df)(__m256d) (SRC), (double const *) (BASE), (__v4di)(__m256i) (INDEX), (__v4df)(__m256d) (MASK), (int) (SCALE))
+#define _mm_i32gather_ps(BASE, INDEX, SCALE) (__m128) __builtin_ia32_gathersiv4sf ((__v4sf) _mm_setzero_ps (), (float const *) (BASE), (__v4si)(__m128i) (INDEX), (__v4sf) _mm_cmpeq_ps (_mm_setzero_ps (), _mm_setzero_ps ()), (int) (SCALE))
+#define _mm_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE) (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)(__m128) (SRC), (float const *) (BASE), (__v4si)(__m128i) (INDEX), (__v4sf)(__m128) (MASK), (int) (SCALE))
+#define _mm256_i32gather_ps(BASE, INDEX, SCALE) (__m256) __builtin_ia32_gathersiv8sf ((__v8sf) _mm256_setzero_ps (), (float const *) (BASE), (__v8si)(__m256i) (INDEX), (__v8sf) _mm256_cmp_ps (_mm256_setzero_ps (), _mm256_setzero_ps (), _CMP_EQ_OQ), (int) (SCALE))
+#define _mm256_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE) (__m256) __builtin_ia32_gathersiv8sf ((__v8sf)(__m256) (SRC), (float const *) (BASE), (__v8si)(__m256i) (INDEX), (__v8sf)(__m256) (MASK), (int) (SCALE))
+#define _mm_i64gather_ps(BASE, INDEX, SCALE) (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf) _mm_setzero_pd (), (float const *) (BASE), (__v2di)(__m128i) (INDEX), (__v4sf) _mm_cmpeq_ps (_mm_setzero_ps (), _mm_setzero_ps ()), (int) (SCALE))
+#define _mm_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE) (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)(__m128) (SRC), (float const *) (BASE), (__v2di)(__m128i) (INDEX), (__v4sf)(__m128) (MASK), (int) (SCALE))
+#define _mm256_i64gather_ps(BASE, INDEX, SCALE) (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf) _mm_setzero_ps (), (float const *) (BASE), (__v4di)(__m256i) (INDEX), (__v4sf) _mm_cmpeq_ps (_mm_setzero_ps (), _mm_setzero_ps ()), (int) (SCALE))
+#define _mm256_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE) (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)(__m128) (SRC), (float const *) (BASE), (__v4di)(__m256i) (INDEX), (__v4sf)(__m128) (MASK), (int) (SCALE))
+#define _mm_i32gather_epi64(BASE, INDEX, SCALE) (__m128i) __builtin_ia32_gathersiv2di ((__v2di) _mm_setzero_si128 (), (long long const *) (BASE), (__v4si)(__m128i) (INDEX), (__v2di)_mm_set1_epi64x (-1), (int) (SCALE))
+#define _mm_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE) (__m128i) __builtin_ia32_gathersiv2di ((__v2di)(__m128i) (SRC), (long long const *) (BASE), (__v4si)(__m128i) (INDEX), (__v2di)(__m128i) (MASK), (int) (SCALE))
+#define _mm256_i32gather_epi64(BASE, INDEX, SCALE) (__m256i) __builtin_ia32_gathersiv4di ((__v4di) _mm256_setzero_si256 (), (long long const *) (BASE), (__v4si)(__m128i) (INDEX), (__v4di)_mm256_set1_epi64x (-1), (int) (SCALE))
+#define _mm256_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE) (__m256i) __builtin_ia32_gathersiv4di ((__v4di)(__m256i) (SRC), (long long const *) (BASE), (__v4si)(__m128i) (INDEX), (__v4di)(__m256i) (MASK), (int) (SCALE))
+#define _mm_i64gather_epi64(BASE, INDEX, SCALE) (__m128i) __builtin_ia32_gatherdiv2di ((__v2di) _mm_setzero_si128 (), (long long const *) (BASE), (__v2di)(__m128i) (INDEX), (__v2di)_mm_set1_epi64x (-1), (int) (SCALE))
+#define _mm_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE) (__m128i) __builtin_ia32_gatherdiv2di ((__v2di)(__m128i) (SRC), (long long const *) (BASE), (__v2di)(__m128i) (INDEX), (__v2di)(__m128i) (MASK), (int) (SCALE))
+#define _mm256_i64gather_epi64(BASE, INDEX, SCALE) (__m256i) __builtin_ia32_gatherdiv4di ((__v4di) _mm256_setzero_si256 (), (long long const *) (BASE), (__v4di)(__m256i) (INDEX), (__v4di)_mm256_set1_epi64x (-1), (int) (SCALE))
+#define _mm256_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE) (__m256i) __builtin_ia32_gatherdiv4di ((__v4di)(__m256i) (SRC), (long long const *) (BASE), (__v4di)(__m256i) (INDEX), (__v4di)(__m256i) (MASK), (int) (SCALE))
+#define _mm_i32gather_epi32(BASE, INDEX, SCALE) (__m128i) __builtin_ia32_gathersiv4si ((__v4si) _mm_setzero_si128 (), (int const *) (BASE), (__v4si)(__m128i) (INDEX), (__v4si)_mm_set1_epi32 (-1), (int) (SCALE))
+#define _mm_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE) (__m128i) __builtin_ia32_gathersiv4si ((__v4si)(__m128i) (SRC), (int const *) (BASE), (__v4si)(__m128i) (INDEX), (__v4si)(__m128i) (MASK), (int) (SCALE))
+#define _mm256_i32gather_epi32(BASE, INDEX, SCALE) (__m256i) __builtin_ia32_gathersiv8si ((__v8si) _mm256_setzero_si256 (), (int const *) (BASE), (__v8si)(__m256i) (INDEX), (__v8si)_mm256_set1_epi32 (-1), (int) (SCALE))
+#define _mm256_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE) (__m256i) __builtin_ia32_gathersiv8si ((__v8si)(__m256i) (SRC), (int const *) (BASE), (__v8si)(__m256i) (INDEX), (__v8si)(__m256i) (MASK), (int) (SCALE))
+#define _mm_i64gather_epi32(BASE, INDEX, SCALE) (__m128i) __builtin_ia32_gatherdiv4si ((__v4si) _mm_setzero_si128 (), (int const *) (BASE), (__v2di)(__m128i) (INDEX), (__v4si)_mm_set1_epi32 (-1), (int) (SCALE))
+#define _mm_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE) (__m128i) __builtin_ia32_gatherdiv4si ((__v4si)(__m128i) (SRC), (int const *) (BASE), (__v2di)(__m128i) (INDEX), (__v4si)(__m128i) (MASK), (int) (SCALE))
+#define _mm256_i64gather_epi32(BASE, INDEX, SCALE) (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si) _mm_setzero_si128 (), (int const *) (BASE), (__v4di)(__m256i) (INDEX), (__v4si)_mm_set1_epi32(-1), (int) (SCALE))
+#define _mm256_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE) (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si)(__m128i) (SRC), (int const *) (BASE), (__v4di)(__m256i) (INDEX), (__v4si)(__m128i) (MASK), (int) (SCALE))
+#endif
 #ifdef __DISABLE_AVX2__
 #undef __DISABLE_AVX2__
 #pragma GCC pop_options
-#endif /* __DISABLE_AVX2__ */
-
-#endif /* _AVX2INTRIN_H_INCLUDED */
+#endif
+#endif
+#endif
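As throughout these headers, the patch replaces Cosmopolitan's __funline shorthand with the spelled-out GNU inline form, which keeps the files textually close to the stock GCC intrinsics headers. The two spellings are roughly equivalent; a sketch of the correspondence (the exact __funline definition lives elsewhere in the tree, so treat this expansion as an approximation, not the verbatim definition):

/* Approximate expansion of the old shorthand. */
#define __funline \
  extern __inline \
  __attribute__((__gnu_inline__, __always_inline__, __artificial__))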
#endif - #ifndef _AVX5124FMAPSINTRIN_H_INCLUDED #define _AVX5124FMAPSINTRIN_H_INCLUDED - #ifndef __AVX5124FMAPS__ #pragma GCC push_options #pragma GCC target("avx5124fmaps") #define __DISABLE_AVX5124FMAPS__ -#endif /* __AVX5124FMAPS__ */ - -__funline __m512 _mm512_4fmadd_ps(__m512 __A, __m512 __B, __m512 __C, __m512 __D, - __m512 __E, __m128 *__F) { - return (__m512)__builtin_ia32_4fmaddps((__v16sf)__B, (__v16sf)__C, - (__v16sf)__D, (__v16sf)__E, - (__v16sf)__A, (const __v4sf *)__F); +#endif +extern __inline __m512 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_4fmadd_ps (__m512 __A, __m512 __B, __m512 __C, + __m512 __D, __m512 __E, __m128 *__F) +{ + return (__m512) __builtin_ia32_4fmaddps ((__v16sf) __B, + (__v16sf) __C, + (__v16sf) __D, + (__v16sf) __E, + (__v16sf) __A, + (const __v4sf *) __F); } - -__funline __m512 _mm512_mask_4fmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, - __m512 __C, __m512 __D, __m512 __E, - __m128 *__F) { - return (__m512)__builtin_ia32_4fmaddps_mask( - (__v16sf)__B, (__v16sf)__C, (__v16sf)__D, (__v16sf)__E, (__v16sf)__A, - (const __v4sf *)__F, (__v16sf)__A, (__mmask16)__U); +extern __inline __m512 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_4fmadd_ps (__m512 __A, __mmask16 __U, __m512 __B, + __m512 __C, __m512 __D, __m512 __E, __m128 *__F) +{ + return (__m512) __builtin_ia32_4fmaddps_mask ((__v16sf) __B, + (__v16sf) __C, + (__v16sf) __D, + (__v16sf) __E, + (__v16sf) __A, + (const __v4sf *) __F, + (__v16sf) __A, + (__mmask16) __U); } - -__funline __m512 _mm512_maskz_4fmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, - __m512 __C, __m512 __D, __m512 __E, - __m128 *__F) { - return (__m512)__builtin_ia32_4fmaddps_mask( - (__v16sf)__B, (__v16sf)__C, (__v16sf)__D, (__v16sf)__E, (__v16sf)__A, - (const __v4sf *)__F, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U); +extern __inline __m512 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_4fmadd_ps (__mmask16 __U, + __m512 __A, __m512 __B, __m512 __C, + __m512 __D, __m512 __E, __m128 *__F) +{ + return (__m512) __builtin_ia32_4fmaddps_mask ((__v16sf) __B, + (__v16sf) __C, + (__v16sf) __D, + (__v16sf) __E, + (__v16sf) __A, + (const __v4sf *) __F, + (__v16sf) _mm512_setzero_ps (), + (__mmask16) __U); } - -__funline __m128 _mm_4fmadd_ss(__m128 __A, __m128 __B, __m128 __C, __m128 __D, - __m128 __E, __m128 *__F) { - return (__m128)__builtin_ia32_4fmaddss((__v4sf)__B, (__v4sf)__C, (__v4sf)__D, - (__v4sf)__E, (__v4sf)__A, - (const __v4sf *)__F); +extern __inline __m128 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_4fmadd_ss (__m128 __A, __m128 __B, __m128 __C, + __m128 __D, __m128 __E, __m128 *__F) +{ + return (__m128) __builtin_ia32_4fmaddss ((__v4sf) __B, + (__v4sf) __C, + (__v4sf) __D, + (__v4sf) __E, + (__v4sf) __A, + (const __v4sf *) __F); } - -__funline __m128 _mm_mask_4fmadd_ss(__m128 __A, __mmask8 __U, __m128 __B, - __m128 __C, __m128 __D, __m128 __E, - __m128 *__F) { - return (__m128)__builtin_ia32_4fmaddss_mask( - (__v4sf)__B, (__v4sf)__C, (__v4sf)__D, (__v4sf)__E, (__v4sf)__A, - (const __v4sf *)__F, (__v4sf)__A, (__mmask8)__U); +extern __inline __m128 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_4fmadd_ss (__m128 __A, __mmask8 __U, __m128 __B, __m128 __C, + __m128 __D, __m128 __E, __m128 *__F) +{ + return (__m128) __builtin_ia32_4fmaddss_mask ((__v4sf) __B, + (__v4sf) __C, + (__v4sf) __D, + (__v4sf) __E, + (__v4sf) __A, + (const __v4sf *) __F, + (__v4sf) __A, + 
(__mmask8) __U); } - -__funline __m128 _mm_maskz_4fmadd_ss(__mmask8 __U, __m128 __A, __m128 __B, - __m128 __C, __m128 __D, __m128 __E, - __m128 *__F) { - return (__m128)__builtin_ia32_4fmaddss_mask( - (__v4sf)__B, (__v4sf)__C, (__v4sf)__D, (__v4sf)__E, (__v4sf)__A, - (const __v4sf *)__F, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); +extern __inline __m128 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_4fmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C, + __m128 __D, __m128 __E, __m128 *__F) +{ + return (__m128) __builtin_ia32_4fmaddss_mask ((__v4sf) __B, + (__v4sf) __C, + (__v4sf) __D, + (__v4sf) __E, + (__v4sf) __A, + (const __v4sf *) __F, + (__v4sf) _mm_setzero_ps (), + (__mmask8) __U); } - -__funline __m512 _mm512_4fnmadd_ps(__m512 __A, __m512 __B, __m512 __C, __m512 __D, - __m512 __E, __m128 *__F) { - return (__m512)__builtin_ia32_4fnmaddps((__v16sf)__B, (__v16sf)__C, - (__v16sf)__D, (__v16sf)__E, - (__v16sf)__A, (const __v4sf *)__F); +extern __inline __m512 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_4fnmadd_ps (__m512 __A, __m512 __B, __m512 __C, + __m512 __D, __m512 __E, __m128 *__F) +{ + return (__m512) __builtin_ia32_4fnmaddps ((__v16sf) __B, + (__v16sf) __C, + (__v16sf) __D, + (__v16sf) __E, + (__v16sf) __A, + (const __v4sf *) __F); } - -__funline __m512 _mm512_mask_4fnmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, - __m512 __C, __m512 __D, __m512 __E, - __m128 *__F) { - return (__m512)__builtin_ia32_4fnmaddps_mask( - (__v16sf)__B, (__v16sf)__C, (__v16sf)__D, (__v16sf)__E, (__v16sf)__A, - (const __v4sf *)__F, (__v16sf)__A, (__mmask16)__U); +extern __inline __m512 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_4fnmadd_ps (__m512 __A, __mmask16 __U, __m512 __B, + __m512 __C, __m512 __D, __m512 __E, __m128 *__F) +{ + return (__m512) __builtin_ia32_4fnmaddps_mask ((__v16sf) __B, + (__v16sf) __C, + (__v16sf) __D, + (__v16sf) __E, + (__v16sf) __A, + (const __v4sf *) __F, + (__v16sf) __A, + (__mmask16) __U); } - -__funline __m512 _mm512_maskz_4fnmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, - __m512 __C, __m512 __D, __m512 __E, - __m128 *__F) { - return (__m512)__builtin_ia32_4fnmaddps_mask( - (__v16sf)__B, (__v16sf)__C, (__v16sf)__D, (__v16sf)__E, (__v16sf)__A, - (const __v4sf *)__F, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U); +extern __inline __m512 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_4fnmadd_ps (__mmask16 __U, + __m512 __A, __m512 __B, __m512 __C, + __m512 __D, __m512 __E, __m128 *__F) +{ + return (__m512) __builtin_ia32_4fnmaddps_mask ((__v16sf) __B, + (__v16sf) __C, + (__v16sf) __D, + (__v16sf) __E, + (__v16sf) __A, + (const __v4sf *) __F, + (__v16sf) _mm512_setzero_ps (), + (__mmask16) __U); } - -__funline __m128 _mm_4fnmadd_ss(__m128 __A, __m128 __B, __m128 __C, __m128 __D, - __m128 __E, __m128 *__F) { - return (__m128)__builtin_ia32_4fnmaddss((__v4sf)__B, (__v4sf)__C, (__v4sf)__D, - (__v4sf)__E, (__v4sf)__A, - (const __v4sf *)__F); +extern __inline __m128 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_4fnmadd_ss (__m128 __A, __m128 __B, __m128 __C, + __m128 __D, __m128 __E, __m128 *__F) +{ + return (__m128) __builtin_ia32_4fnmaddss ((__v4sf) __B, + (__v4sf) __C, + (__v4sf) __D, + (__v4sf) __E, + (__v4sf) __A, + (const __v4sf *) __F); } - -__funline __m128 _mm_mask_4fnmadd_ss(__m128 __A, __mmask8 __U, __m128 __B, - __m128 __C, __m128 __D, __m128 __E, - __m128 *__F) { - return 
-  return (__m128)__builtin_ia32_4fnmaddss_mask(
-      (__v4sf)__B, (__v4sf)__C, (__v4sf)__D, (__v4sf)__E, (__v4sf)__A,
-      (const __v4sf *)__F, (__v4sf)__A, (__mmask8)__U);
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_4fnmadd_ss (__m128 __A, __mmask8 __U, __m128 __B, __m128 __C,
+                     __m128 __D, __m128 __E, __m128 *__F)
+{
+  return (__m128) __builtin_ia32_4fnmaddss_mask ((__v4sf) __B, (__v4sf) __C,
+      (__v4sf) __D, (__v4sf) __E, (__v4sf) __A, (const __v4sf *) __F,
+      (__v4sf) __A, (__mmask8) __U);
 }
-
-__funline __m128 _mm_maskz_4fnmadd_ss(__mmask8 __U, __m128 __A, __m128 __B,
-    __m128 __C, __m128 __D, __m128 __E, __m128 *__F) {
-  return (__m128)__builtin_ia32_4fnmaddss_mask(
-      (__v4sf)__B, (__v4sf)__C, (__v4sf)__D, (__v4sf)__E, (__v4sf)__A,
-      (const __v4sf *)__F, (__v4sf)_mm_setzero_ps(), (__mmask8)__U);
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_4fnmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C,
+                      __m128 __D, __m128 __E, __m128 *__F)
+{
+  return (__m128) __builtin_ia32_4fnmaddss_mask ((__v4sf) __B, (__v4sf) __C,
+      (__v4sf) __D, (__v4sf) __E, (__v4sf) __A, (const __v4sf *) __F,
+      (__v4sf) _mm_setzero_ps (), (__mmask8) __U);
 }
-
 #ifdef __DISABLE_AVX5124FMAPS__
 #undef __DISABLE_AVX5124FMAPS__
 #pragma GCC pop_options
-#endif /* __DISABLE_AVX5124FMAPS__ */
-
-#endif /* _AVX5124FMAPSINTRIN_H_INCLUDED */
+#endif
+#endif
+#endif
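/*
 * Editor's sketch, not part of the patch: a hypothetical use of the
 * AVX5124FMAPS intrinsics from the header above. `mul4_accum` is an
 * invented name; the build is assumed to define __AVX5124FMAPS__
 * (e.g. via -mavx5124fmaps).
 */
#include <immintrin.h>
#ifdef __AVX5124FMAPS__
/* Roughly: acc += b*f[0] + c*f[1] + d*f[2] + e*f[3], where each f[i]
   is a 32-bit float broadcast across all sixteen lanes. */
static __m512
mul4_accum (__m512 acc, __m512 b, __m512 c, __m512 d, __m512 e, __m128 *f)
{
  return _mm512_4fmadd_ps (acc, b, c, d, e, f);
}
#endif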
diff --git a/third_party/intel/avx5124vnniwintrin.internal.h b/third_party/intel/avx5124vnniwintrin.internal.h
index 71ea91c09..ff6a97762 100644
--- a/third_party/intel/avx5124vnniwintrin.internal.h
+++ b/third_party/intel/avx5124vnniwintrin.internal.h
@@ -1,69 +1,102 @@
+/* clang-format off */
+#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0)
 #if !defined _IMMINTRIN_H_INCLUDED
-#error \
-    "Never use <avx5124vnniwintrin.h> directly; include <immintrin.h> instead."
+# error "Never use <avx5124vnniwintrin.h> directly; include <immintrin.h> instead."
 #endif
-
 #ifndef _AVX5124VNNIWINTRIN_H_INCLUDED
 #define _AVX5124VNNIWINTRIN_H_INCLUDED
-
 #ifndef __AVX5124VNNIW__
 #pragma GCC push_options
 #pragma GCC target("avx5124vnniw")
 #define __DISABLE_AVX5124VNNIW__
-#endif /* __AVX5124VNNIW__ */
-
-__funline __m512i _mm512_4dpwssd_epi32(__m512i __A, __m512i __B, __m512i __C,
-    __m512i __D, __m512i __E, __m128i *__F) {
-  return (__m512i)__builtin_ia32_vp4dpwssd((__v16si)__B, (__v16si)__C,
-      (__v16si)__D, (__v16si)__E, (__v16si)__A, (const __v4si *)__F);
+#endif
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_4dpwssd_epi32 (__m512i __A, __m512i __B, __m512i __C,
+                      __m512i __D, __m512i __E, __m128i *__F)
+{
+  return (__m512i) __builtin_ia32_vp4dpwssd ((__v16si) __B, (__v16si) __C,
+      (__v16si) __D, (__v16si) __E, (__v16si) __A, (const __v4si *) __F);
 }
-
-__funline __m512i _mm512_mask_4dpwssd_epi32(__m512i __A, __mmask16 __U,
-    __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m128i *__F) {
-  return (__m512i)__builtin_ia32_vp4dpwssd_mask(
-      (__v16si)__B, (__v16si)__C, (__v16si)__D, (__v16si)__E, (__v16si)__A,
-      (const __v4si *)__F, (__v16si)__A, (__mmask16)__U);
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_4dpwssd_epi32 (__m512i __A, __mmask16 __U, __m512i __B,
+                           __m512i __C, __m512i __D, __m512i __E,
+                           __m128i *__F)
+{
+  return (__m512i) __builtin_ia32_vp4dpwssd_mask ((__v16si) __B,
+      (__v16si) __C, (__v16si) __D, (__v16si) __E, (__v16si) __A,
+      (const __v4si *) __F, (__v16si) __A, (__mmask16) __U);
 }
-
-__funline __m512i _mm512_maskz_4dpwssd_epi32(__mmask16 __U, __m512i __A,
-    __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m128i *__F) {
-  return (__m512i)__builtin_ia32_vp4dpwssd_mask(
-      (__v16si)__B, (__v16si)__C, (__v16si)__D, (__v16si)__E, (__v16si)__A,
-      (const __v4si *)__F, (__v16si)_mm512_setzero_ps(), (__mmask16)__U);
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_4dpwssd_epi32 (__mmask16 __U, __m512i __A, __m512i __B,
+                            __m512i __C, __m512i __D, __m512i __E,
+                            __m128i *__F)
+{
+  return (__m512i) __builtin_ia32_vp4dpwssd_mask ((__v16si) __B,
+      (__v16si) __C, (__v16si) __D, (__v16si) __E, (__v16si) __A,
+      (const __v4si *) __F, (__v16si) _mm512_setzero_ps (), (__mmask16) __U);
 }
-
-__funline __m512i _mm512_4dpwssds_epi32(__m512i __A, __m512i __B, __m512i __C,
-    __m512i __D, __m512i __E, __m128i *__F) {
-  return (__m512i)__builtin_ia32_vp4dpwssds((__v16si)__B, (__v16si)__C,
-      (__v16si)__D, (__v16si)__E, (__v16si)__A, (const __v4si *)__F);
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_4dpwssds_epi32 (__m512i __A, __m512i __B, __m512i __C,
+                       __m512i __D, __m512i __E, __m128i *__F)
+{
+  return (__m512i) __builtin_ia32_vp4dpwssds ((__v16si) __B, (__v16si) __C,
+      (__v16si) __D, (__v16si) __E, (__v16si) __A, (const __v4si *) __F);
 }
-
-__funline __m512i _mm512_mask_4dpwssds_epi32(__m512i __A, __mmask16 __U,
-    __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m128i *__F) {
-  return (__m512i)__builtin_ia32_vp4dpwssds_mask(
-      (__v16si)__B, (__v16si)__C, (__v16si)__D, (__v16si)__E, (__v16si)__A,
-      (const __v4si *)__F, (__v16si)__A, (__mmask16)__U);
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_4dpwssds_epi32 (__m512i __A, __mmask16 __U, __m512i __B,
+                            __m512i __C, __m512i __D, __m512i __E,
+                            __m128i *__F)
+{
+  return (__m512i) __builtin_ia32_vp4dpwssds_mask ((__v16si) __B,
+      (__v16si) __C, (__v16si) __D, (__v16si) __E, (__v16si) __A,
+      (const __v4si *) __F, (__v16si) __A, (__mmask16) __U);
 }
-
-__funline __m512i _mm512_maskz_4dpwssds_epi32(__mmask16 __U, __m512i __A,
-    __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m128i *__F) {
-  return (__m512i)__builtin_ia32_vp4dpwssds_mask(
-      (__v16si)__B, (__v16si)__C, (__v16si)__D, (__v16si)__E, (__v16si)__A,
-      (const __v4si *)__F, (__v16si)_mm512_setzero_ps(), (__mmask16)__U);
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_4dpwssds_epi32 (__mmask16 __U, __m512i __A, __m512i __B,
+                             __m512i __C, __m512i __D, __m512i __E,
+                             __m128i *__F)
+{
+  return (__m512i) __builtin_ia32_vp4dpwssds_mask ((__v16si) __B,
+      (__v16si) __C, (__v16si) __D, (__v16si) __E, (__v16si) __A,
+      (const __v4si *) __F, (__v16si) _mm512_setzero_ps (), (__mmask16) __U);
 }
-
 #ifdef __DISABLE_AVX5124VNNIW__
 #undef __DISABLE_AVX5124VNNIW__
 #pragma GCC pop_options
-#endif /* __DISABLE_AVX5124VNNIW__ */
-
-#endif /* _AVX5124VNNIWINTRIN_H_INCLUDED */
+#endif
+#endif
+#endif
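/*
 * Editor's sketch, not part of the patch: the 4VNNIW intrinsics above chain
 * four VPDPWSSD steps. `dot4_accum` is an invented helper; it assumes a
 * build with __AVX5124VNNIW__ defined (-mavx5124vnniw).
 */
#include <immintrin.h>
#ifdef __AVX5124VNNIW__
/* Each 32-bit lane of acc accumulates signed 16-bit pair products drawn
   from b..e against the four doubleword multiplier groups at f. */
static __m512i
dot4_accum (__m512i acc, __m512i b, __m512i c, __m512i d, __m512i e,
            __m128i *f)
{
  return _mm512_4dpwssd_epi32 (acc, b, c, d, e, f);
}
#endif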
diff --git a/third_party/intel/avx512bf16intrin.internal.h b/third_party/intel/avx512bf16intrin.internal.h
new file mode 100644
index 000000000..7da74bec0
--- /dev/null
+++ b/third_party/intel/avx512bf16intrin.internal.h
@@ -0,0 +1,74 @@
+/* clang-format off */
+#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0)
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avx512bf16intrin.h> directly; include <immintrin.h> instead."
+#endif
+#ifndef _AVX512BF16INTRIN_H_INCLUDED
+#define _AVX512BF16INTRIN_H_INCLUDED
+#ifndef __AVX512BF16__
+#pragma GCC push_options
+#pragma GCC target("avx512bf16")
+#define __DISABLE_AVX512BF16__
+#endif
+typedef short __v32bh __attribute__ ((__vector_size__ (64)));
+typedef short __m512bh __attribute__ ((__vector_size__ (64), __may_alias__));
+extern __inline __m512bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtne2ps_pbh (__m512 __A, __m512 __B)
+{
+  return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32hi(__A, __B);
+}
+extern __inline __m512bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtne2ps_pbh (__m512bh __A, __mmask32 __B, __m512 __C, __m512 __D)
+{
+  return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32hi_mask(__C, __D, __A, __B);
+}
+extern __inline __m512bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtne2ps_pbh (__mmask32 __A, __m512 __B, __m512 __C)
+{
+  return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32hi_maskz(__B, __C, __A);
+}
+extern __inline __m256bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtneps_pbh (__m512 __A)
+{
+  return (__m256bh)__builtin_ia32_cvtneps2bf16_v16sf(__A);
+}
+extern __inline __m256bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtneps_pbh (__m256bh __A, __mmask16 __B, __m512 __C)
+{
+  return (__m256bh)__builtin_ia32_cvtneps2bf16_v16sf_mask(__C, __A, __B);
+}
+extern __inline __m256bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtneps_pbh (__mmask16 __A, __m512 __B)
+{
+  return (__m256bh)__builtin_ia32_cvtneps2bf16_v16sf_maskz(__B, __A);
+}
+extern __inline __m512
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_dpbf16_ps (__m512 __A, __m512bh __B, __m512bh __C)
+{
+  return (__m512)__builtin_ia32_dpbf16ps_v16sf(__A, __B, __C);
+}
+extern __inline __m512
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_dpbf16_ps (__m512 __A, __mmask16 __B, __m512bh __C, __m512bh __D)
+{
+  return (__m512)__builtin_ia32_dpbf16ps_v16sf_mask(__A, __C, __D, __B);
+}
+extern __inline __m512
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_dpbf16_ps (__mmask16 __A, __m512 __B, __m512bh __C, __m512bh __D)
+{
+  return (__m512)__builtin_ia32_dpbf16ps_v16sf_maskz(__B, __C, __D, __A);
+}
+#ifdef __DISABLE_AVX512BF16__
+#undef __DISABLE_AVX512BF16__
+#pragma GCC pop_options
+#endif
+#endif
+#endif
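/*
 * Editor's sketch, not part of the patch: the new AVX512BF16 header above is
 * typically used to down-convert fp32 pairs and accumulate bfloat16 dot
 * products. `bf16_dot_accum` is an invented name; assumes __AVX512BF16__.
 */
#include <immintrin.h>
#ifdef __AVX512BF16__
static __m512
bf16_dot_accum (__m512 acc, __m512 x0, __m512 x1, __m512 y0, __m512 y1)
{
  __m512bh x = _mm512_cvtne2ps_pbh (x0, x1); /* 2x16 fp32 -> 32 bf16 */
  __m512bh y = _mm512_cvtne2ps_pbh (y0, y1);
  return _mm512_dpbf16_ps (acc, x, y); /* acc += adjacent-pair bf16 dots */
}
#endif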
diff --git a/third_party/intel/avx512bf16vlintrin.internal.h b/third_party/intel/avx512bf16vlintrin.internal.h
new file mode 100644
index 000000000..f5e13a35a
--- /dev/null
+++ b/third_party/intel/avx512bf16vlintrin.internal.h
@@ -0,0 +1,130 @@
+/* clang-format off */
+#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0)
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avx512bf16vlintrin.h> directly; include <immintrin.h> instead."
+#endif
+#ifndef _AVX512BF16VLINTRIN_H_INCLUDED
+#define _AVX512BF16VLINTRIN_H_INCLUDED
+#if !defined(__AVX512VL__) || !defined(__AVX512BF16__)
+#pragma GCC push_options
+#pragma GCC target("avx512bf16,avx512vl")
+#define __DISABLE_AVX512BF16VL__
+#endif
+typedef short __v16bh __attribute__ ((__vector_size__ (32)));
+typedef short __v8bh __attribute__ ((__vector_size__ (16)));
+typedef short __m256bh __attribute__ ((__vector_size__ (32), __may_alias__));
+typedef short __m128bh __attribute__ ((__vector_size__ (16), __may_alias__));
+extern __inline __m256bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtne2ps_pbh (__m256 __A, __m256 __B)
+{
+  return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16hi(__A, __B);
+}
+extern __inline __m256bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtne2ps_pbh (__m256bh __A, __mmask16 __B, __m256 __C, __m256 __D)
+{
+  return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16hi_mask(__C, __D, __A, __B);
+}
+extern __inline __m256bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtne2ps_pbh (__mmask16 __A, __m256 __B, __m256 __C)
+{
+  return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16hi_maskz(__B, __C, __A);
+}
+extern __inline __m128bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtne2ps_pbh (__m128 __A, __m128 __B)
+{
+  return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8hi(__A, __B);
+}
+extern __inline __m128bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtne2ps_pbh (__m128bh __A, __mmask8 __B, __m128 __C, __m128 __D)
+{
+  return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8hi_mask(__C, __D, __A, __B);
+}
+extern __inline __m128bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtne2ps_pbh (__mmask8 __A, __m128 __B, __m128 __C)
+{
+  return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8hi_maskz(__B, __C, __A);
+}
+extern __inline __m128bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtneps_pbh (__m256 __A)
+{
+  return (__m128bh)__builtin_ia32_cvtneps2bf16_v8sf(__A);
+}
+extern __inline __m128bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtneps_pbh (__m128bh __A, __mmask8 __B, __m256 __C)
+{
+  return (__m128bh)__builtin_ia32_cvtneps2bf16_v8sf_mask(__C, __A, __B);
+}
+extern __inline __m128bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtneps_pbh (__mmask8 __A, __m256 __B)
+{
+  return (__m128bh)__builtin_ia32_cvtneps2bf16_v8sf_maskz(__B, __A);
+}
+extern __inline __m128bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtneps_pbh (__m128 __A)
+{
+  return (__m128bh)__builtin_ia32_cvtneps2bf16_v4sf(__A);
+}
+extern __inline __m128bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtneps_pbh (__m128bh __A, __mmask8 __B, __m128 __C)
+{
+  return (__m128bh)__builtin_ia32_cvtneps2bf16_v4sf_mask(__C, __A, __B);
+}
+extern __inline __m128bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtneps_pbh (__mmask8 __A, __m128 __B)
+{
+  return (__m128bh)__builtin_ia32_cvtneps2bf16_v4sf_maskz(__B, __A);
+}
+extern __inline __m256
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_dpbf16_ps (__m256 __A, __m256bh __B, __m256bh __C)
+{
+  return (__m256)__builtin_ia32_dpbf16ps_v8sf(__A, __B, __C);
+}
+extern __inline __m256
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_dpbf16_ps (__m256 __A, __mmask8 __B, __m256bh __C, __m256bh __D)
+{
+  return (__m256)__builtin_ia32_dpbf16ps_v8sf_mask(__A, __C, __D, __B);
+}
+extern __inline __m256
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_dpbf16_ps (__mmask8 __A, __m256 __B, __m256bh __C, __m256bh __D)
+{
+  return (__m256)__builtin_ia32_dpbf16ps_v8sf_maskz(__B, __C, __D, __A);
+}
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_dpbf16_ps (__m128 __A, __m128bh __B, __m128bh __C)
+{
+  return (__m128)__builtin_ia32_dpbf16ps_v4sf(__A, __B, __C);
+}
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_dpbf16_ps (__m128 __A, __mmask8 __B, __m128bh __C, __m128bh __D)
+{
+  return (__m128)__builtin_ia32_dpbf16ps_v4sf_mask(__A, __C, __D, __B);
+}
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_dpbf16_ps (__mmask8 __A, __m128 __B, __m128bh __C, __m128bh __D)
+{
+  return (__m128)__builtin_ia32_dpbf16ps_v4sf_maskz(__B, __C, __D, __A);
+}
+#ifdef __DISABLE_AVX512BF16VL__
+#undef __DISABLE_AVX512BF16VL__
+#pragma GCC pop_options
+#endif
+#endif
+#endif
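/*
 * Editor's sketch, not part of the patch: the VL variants above expose the
 * same bfloat16 conversions at 128/256-bit width. `to_bf16_128` is an
 * invented name; assumes both __AVX512BF16__ and __AVX512VL__.
 */
#include <immintrin.h>
#if defined(__AVX512BF16__) && defined(__AVX512VL__)
static __m128bh
to_bf16_128 (__m128 x)
{
  /* Four fp32 values narrowed to bf16 with round-to-nearest-even. */
  return _mm_cvtneps_pbh (x);
}
#endif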
diff --git a/third_party/intel/avx512bitalgintrin.internal.h b/third_party/intel/avx512bitalgintrin.internal.h
index 7abb14c7b..eb63af884 100644
--- a/third_party/intel/avx512bitalgintrin.internal.h
+++ b/third_party/intel/avx512bitalgintrin.internal.h
@@ -1,172 +1,231 @@
+/* clang-format off */
+#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0)
 #if !defined _IMMINTRIN_H_INCLUDED
-#error \
-    "Never use <avx512bitalgintrin.h> directly; include <immintrin.h> instead."
+# error "Never use <avx512bitalgintrin.h> directly; include <immintrin.h> instead."
 #endif
-
 #ifndef _AVX512BITALGINTRIN_H_INCLUDED
 #define _AVX512BITALGINTRIN_H_INCLUDED
-
 #ifndef __AVX512BITALG__
 #pragma GCC push_options
 #pragma GCC target("avx512bitalg")
 #define __DISABLE_AVX512BITALG__
-#endif /* __AVX512BITALG__ */
-
-__funline __m512i _mm512_popcnt_epi8(__m512i __A) {
-  return (__m512i)__builtin_ia32_vpopcountb_v64qi((__v64qi)__A);
+#endif
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_popcnt_epi8 (__m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpopcountb_v64qi ((__v64qi) __A);
 }
-
-__funline __m512i _mm512_popcnt_epi16(__m512i __A) {
-  return (__m512i)__builtin_ia32_vpopcountw_v32hi((__v32hi)__A);
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_popcnt_epi16 (__m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpopcountw_v32hi ((__v32hi) __A);
 }
-
 #ifdef __DISABLE_AVX512BITALG__
 #undef __DISABLE_AVX512BITALG__
 #pragma GCC pop_options
-#endif /* __DISABLE_AVX512BITALG__ */
-
+#endif
 #if !defined(__AVX512BITALG__) || !defined(__AVX512BW__)
 #pragma GCC push_options
 #pragma GCC target("avx512bitalg,avx512bw")
 #define __DISABLE_AVX512BITALGBW__
-#endif /* __AVX512VLBW__ */
-
-__funline __m512i _mm512_mask_popcnt_epi8(__m512i __A, __mmask64 __U,
-    __m512i __B) {
-  return (__m512i)__builtin_ia32_vpopcountb_v64qi_mask(
-      (__v64qi)__A, (__v64qi)__B, (__mmask64)__U);
+#endif
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_popcnt_epi8 (__m512i __W, __mmask64 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpopcountb_v64qi_mask ((__v64qi) __A,
+      (__v64qi) __W, (__mmask64) __U);
 }
-
-__funline __m512i _mm512_maskz_popcnt_epi8(__mmask64 __U, __m512i __A) {
-  return (__m512i)__builtin_ia32_vpopcountb_v64qi_mask(
-      (__v64qi)__A, (__v64qi)_mm512_setzero_si512(), (__mmask64)__U);
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_popcnt_epi8 (__mmask64 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpopcountb_v64qi_mask ((__v64qi) __A,
+      (__v64qi) _mm512_setzero_si512 (), (__mmask64) __U);
 }
-__funline __m512i _mm512_mask_popcnt_epi16(__m512i __A, __mmask32 __U,
-    __m512i __B) {
-  return (__m512i)__builtin_ia32_vpopcountw_v32hi_mask(
-      (__v32hi)__A, (__v32hi)__B, (__mmask32)__U);
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_popcnt_epi16 (__m512i __W, __mmask32 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpopcountw_v32hi_mask ((__v32hi) __A,
+      (__v32hi) __W, (__mmask32) __U);
 }
-
-__funline __m512i _mm512_maskz_popcnt_epi16(__mmask32 __U, __m512i __A) {
-  return (__m512i)__builtin_ia32_vpopcountw_v32hi_mask(
-      (__v32hi)__A, (__v32hi)_mm512_setzero_si512(), (__mmask32)__U);
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_popcnt_epi16 (__mmask32 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpopcountw_v32hi_mask ((__v32hi) __A,
+      (__v32hi) _mm512_setzero_si512 (), (__mmask32) __U);
 }
-
-__funline __mmask64 _mm512_bitshuffle_epi64_mask(__m512i __A, __m512i __B) {
-  return (__mmask64)__builtin_ia32_vpshufbitqmb512_mask(
-      (__v64qi)__A, (__v64qi)__B, (__mmask64)-1);
+extern __inline __mmask64
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_bitshuffle_epi64_mask (__m512i __A, __m512i __B)
+{
+  return (__mmask64) __builtin_ia32_vpshufbitqmb512_mask ((__v64qi) __A,
+      (__v64qi) __B, (__mmask64) -1);
 }
-
-__funline __mmask64 _mm512_mask_bitshuffle_epi64_mask(__mmask64 __M, __m512i __A,
-    __m512i __B) {
-  return (__mmask64)__builtin_ia32_vpshufbitqmb512_mask(
-      (__v64qi)__A, (__v64qi)__B, (__mmask64)__M);
+extern __inline __mmask64
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_bitshuffle_epi64_mask (__mmask64 __M, __m512i __A, __m512i __B)
+{
+  return (__mmask64) __builtin_ia32_vpshufbitqmb512_mask ((__v64qi) __A,
+      (__v64qi) __B, (__mmask64) __M);
 }
-
 #ifdef __DISABLE_AVX512BITALGBW__
 #undef __DISABLE_AVX512BITALGBW__
 #pragma GCC pop_options
-#endif /* __DISABLE_AVX512BITALGBW__ */
-
-#if !defined(__AVX512BITALG__) || !defined(__AVX512VL__) || \
-    !defined(__AVX512BW__)
+#endif
+#if !defined(__AVX512BITALG__) || !defined(__AVX512VL__) || !defined(__AVX512BW__)
 #pragma GCC push_options
 #pragma GCC target("avx512bitalg,avx512vl,avx512bw")
 #define __DISABLE_AVX512BITALGVLBW__
-#endif /* __AVX512VLBW__ */
-
-__funline __m256i _mm256_mask_popcnt_epi8(__m256i __A, __mmask32 __U,
-    __m256i __B) {
-  return (__m256i)__builtin_ia32_vpopcountb_v32qi_mask(
-      (__v32qi)__A, (__v32qi)__B, (__mmask32)__U);
+#endif
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_popcnt_epi8 (__m256i __W, __mmask32 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpopcountb_v32qi_mask ((__v32qi) __A,
+      (__v32qi) __W, (__mmask32) __U);
 }
-
-__funline __m256i _mm256_maskz_popcnt_epi8(__mmask32 __U, __m256i __A) {
-  return (__m256i)__builtin_ia32_vpopcountb_v32qi_mask(
-      (__v32qi)__A, (__v32qi)_mm256_setzero_si256(), (__mmask32)__U);
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_popcnt_epi8 (__mmask32 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpopcountb_v32qi_mask ((__v32qi) __A,
+      (__v32qi) _mm256_setzero_si256 (), (__mmask32) __U);
 }
-
-__funline __mmask32 _mm256_bitshuffle_epi64_mask(__m256i __A, __m256i __B) {
-  return (__mmask32)__builtin_ia32_vpshufbitqmb256_mask(
-      (__v32qi)__A, (__v32qi)__B, (__mmask32)-1);
+extern __inline __mmask32
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_bitshuffle_epi64_mask (__m256i __A, __m256i __B)
+{
+  return (__mmask32) __builtin_ia32_vpshufbitqmb256_mask ((__v32qi) __A,
+      (__v32qi) __B, (__mmask32) -1);
 }
-
-__funline __mmask32 _mm256_mask_bitshuffle_epi64_mask(__mmask32 __M, __m256i __A,
-    __m256i __B) {
-  return (__mmask32)__builtin_ia32_vpshufbitqmb256_mask(
-      (__v32qi)__A, (__v32qi)__B, (__mmask32)__M);
+extern __inline __mmask32
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_bitshuffle_epi64_mask (__mmask32 __M, __m256i __A, __m256i __B)
+{
+  return (__mmask32) __builtin_ia32_vpshufbitqmb256_mask ((__v32qi) __A,
+      (__v32qi) __B, (__mmask32) __M);
 }
-
 #ifdef __DISABLE_AVX512BITALGVLBW__
 #undef __DISABLE_AVX512BITALGVLBW__
 #pragma GCC pop_options
-#endif /* __DISABLE_AVX512BITALGVLBW__ */
-
+#endif
 #if !defined(__AVX512BITALG__) || !defined(__AVX512VL__)
 #pragma GCC push_options
 #pragma GCC target("avx512bitalg,avx512vl")
 #define __DISABLE_AVX512BITALGVL__
-#endif /* __AVX512VLBW__ */
-
-__funline __mmask16 _mm_bitshuffle_epi64_mask(__m128i __A, __m128i __B) {
-  return (__mmask16)__builtin_ia32_vpshufbitqmb128_mask(
-      (__v16qi)__A, (__v16qi)__B, (__mmask16)-1);
+#endif
+extern __inline __mmask16
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_bitshuffle_epi64_mask (__m128i __A, __m128i __B)
+{
+  return (__mmask16) __builtin_ia32_vpshufbitqmb128_mask ((__v16qi) __A,
+      (__v16qi) __B, (__mmask16) -1);
 }
-
-__funline __mmask16 _mm_mask_bitshuffle_epi64_mask(__mmask16 __M, __m128i __A,
-    __m128i __B) {
-  return (__mmask16)__builtin_ia32_vpshufbitqmb128_mask(
-      (__v16qi)__A, (__v16qi)__B, (__mmask16)__M);
+extern __inline __mmask16
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_bitshuffle_epi64_mask (__mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__mmask16) __builtin_ia32_vpshufbitqmb128_mask ((__v16qi) __A,
+      (__v16qi) __B, (__mmask16) __M);
 }
-
-__funline __m256i _mm256_popcnt_epi8(__m256i __A) {
-  return (__m256i)__builtin_ia32_vpopcountb_v32qi((__v32qi)__A);
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_popcnt_epi8 (__m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpopcountb_v32qi ((__v32qi) __A);
 }
-
-__funline __m256i _mm256_popcnt_epi16(__m256i __A) {
-  return (__m256i)__builtin_ia32_vpopcountw_v16hi((__v16hi)__A);
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_popcnt_epi16 (__m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpopcountw_v16hi ((__v16hi) __A);
 }
-
-__funline __m128i _mm_popcnt_epi8(__m128i __A) {
-  return (__m128i)__builtin_ia32_vpopcountb_v16qi((__v16qi)__A);
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_popcnt_epi8 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpopcountb_v16qi ((__v16qi) __A);
 }
-
-__funline __m128i _mm_popcnt_epi16(__m128i __A) {
-  return (__m128i)__builtin_ia32_vpopcountw_v8hi((__v8hi)__A);
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_popcnt_epi16 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpopcountw_v8hi ((__v8hi) __A);
 }
-
-__funline __m256i _mm256_mask_popcnt_epi16(__m256i __A, __mmask16 __U,
-    __m256i __B) {
-  return (__m256i)__builtin_ia32_vpopcountw_v16hi_mask(
-      (__v16hi)__A, (__v16hi)__B, (__mmask16)__U);
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_popcnt_epi16 (__m256i __W, __mmask16 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpopcountw_v16hi_mask ((__v16hi) __A,
+      (__v16hi) __W, (__mmask16) __U);
 }
-
-__funline __m256i _mm256_maskz_popcnt_epi16(__mmask16 __U, __m256i __A) {
-  return (__m256i)__builtin_ia32_vpopcountw_v16hi_mask(
-      (__v16hi)__A, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U);
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_popcnt_epi16 (__mmask16 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpopcountw_v16hi_mask ((__v16hi) __A,
+      (__v16hi) _mm256_setzero_si256 (), (__mmask16) __U);
 }
-
-__funline __m128i _mm_mask_popcnt_epi8(__m128i __A, __mmask16 __U, __m128i __B) {
-  return (__m128i)__builtin_ia32_vpopcountb_v16qi_mask(
-      (__v16qi)__A, (__v16qi)__B, (__mmask16)__U);
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_popcnt_epi8 (__m128i __W, __mmask16 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpopcountb_v16qi_mask ((__v16qi) __A,
+      (__v16qi) __W, (__mmask16) __U);
 }
-
-__funline __m128i _mm_maskz_popcnt_epi8(__mmask16 __U, __m128i __A) {
-  return (__m128i)__builtin_ia32_vpopcountb_v16qi_mask(
-      (__v16qi)__A, (__v16qi)_mm_setzero_si128(), (__mmask16)__U);
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_popcnt_epi8 (__mmask16 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpopcountb_v16qi_mask ((__v16qi) __A,
+      (__v16qi) _mm_setzero_si128 (), (__mmask16) __U);
 }
-__funline __m128i _mm_mask_popcnt_epi16(__m128i __A, __mmask8 __U, __m128i __B) {
-  return (__m128i)__builtin_ia32_vpopcountw_v8hi_mask((__v8hi)__A, (__v8hi)__B,
-      (__mmask8)__U);
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_popcnt_epi16 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpopcountw_v8hi_mask ((__v8hi) __A,
+      (__v8hi) __W, (__mmask8) __U);
 }
-
-__funline __m128i _mm_maskz_popcnt_epi16(__mmask8 __U, __m128i __A) {
-  return (__m128i)__builtin_ia32_vpopcountw_v8hi_mask(
-      (__v8hi)__A, (__v8hi)_mm_setzero_si128(), (__mmask8)__U);
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_popcnt_epi16 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpopcountw_v8hi_mask ((__v8hi) __A,
+      (__v8hi) _mm_setzero_si128 (), (__mmask8) __U);
 }
 #ifdef __DISABLE_AVX512BITALGVL__
 #undef __DISABLE_AVX512BITALGVL__
 #pragma GCC pop_options
-#endif /* __DISABLE_AVX512BITALGBW__ */
-
-#endif /* _AVX512BITALGINTRIN_H_INCLUDED */
+#endif
+#endif
+#endif
diff --git a/third_party/intel/avx512bwintrin.internal.h b/third_party/intel/avx512bwintrin.internal.h
index 235bb541b..ddf888c7b 100644
--- a/third_party/intel/avx512bwintrin.internal.h
+++ b/third_party/intel/avx512bwintrin.internal.h
@@ -1,1954 +1,2842 @@
+/* clang-format off */
+#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0)
 #ifndef _IMMINTRIN_H_INCLUDED
 #error "Never use <avx512bwintrin.h> directly; include <immintrin.h> instead."
 #endif
-
 #ifndef _AVX512BWINTRIN_H_INCLUDED
 #define _AVX512BWINTRIN_H_INCLUDED
-
 #ifndef __AVX512BW__
 #pragma GCC push_options
 #pragma GCC target("avx512bw")
 #define __DISABLE_AVX512BW__
-#endif /* __AVX512BW__ */
-
-typedef short __v32hi __attribute__((__vector_size__(64)));
-typedef char __v64qi __attribute__((__vector_size__(64)));
-
-typedef unsigned long long __mmask64;
-
-__funline unsigned char _ktest_mask32_u8(__mmask32 __A, __mmask32 __B,
-    unsigned char *__CF) {
-  *__CF = (unsigned char)__builtin_ia32_ktestcsi(__A, __B);
-  return (unsigned char)__builtin_ia32_ktestzsi(__A, __B);
-}
-
-__funline unsigned char _ktest_mask64_u8(__mmask64 __A, __mmask64 __B,
-    unsigned char *__CF) {
-  *__CF = (unsigned char)__builtin_ia32_ktestcdi(__A, __B);
-  return (unsigned char)__builtin_ia32_ktestzdi(__A, __B);
-}
-
-__funline unsigned char _ktestz_mask32_u8(__mmask32 __A, __mmask32 __B) {
-  return (unsigned char)__builtin_ia32_ktestzsi(__A, __B);
-}
-
-__funline unsigned char _ktestz_mask64_u8(__mmask64 __A, __mmask64 __B) {
-  return (unsigned char)__builtin_ia32_ktestzdi(__A, __B);
-}
-
-__funline unsigned char _ktestc_mask32_u8(__mmask32 __A, __mmask32 __B) {
-  return (unsigned char)__builtin_ia32_ktestcsi(__A, __B);
-}
-
-__funline unsigned char _ktestc_mask64_u8(__mmask64 __A, __mmask64 __B) {
-  return (unsigned char)__builtin_ia32_ktestcdi(__A, __B);
-}
-
-__funline unsigned char _kortest_mask32_u8(__mmask32 __A, __mmask32 __B,
-    unsigned char *__CF) {
-  *__CF = (unsigned char)__builtin_ia32_kortestcsi(__A, __B);
-  return (unsigned char)__builtin_ia32_kortestzsi(__A, __B);
-}
-
-__funline unsigned char _kortest_mask64_u8(__mmask64 __A, __mmask64 __B,
-    unsigned char *__CF) {
-  *__CF = (unsigned char)__builtin_ia32_kortestcdi(__A, __B);
-  return (unsigned
char)__builtin_ia32_kortestzdi(__A, __B); -} - -__funline unsigned char _kortestz_mask32_u8(__mmask32 __A, __mmask32 __B) { - return (unsigned char)__builtin_ia32_kortestzsi(__A, __B); -} - -__funline unsigned char _kortestz_mask64_u8(__mmask64 __A, __mmask64 __B) { - return (unsigned char)__builtin_ia32_kortestzdi(__A, __B); -} - -__funline unsigned char _kortestc_mask32_u8(__mmask32 __A, __mmask32 __B) { - return (unsigned char)__builtin_ia32_kortestcsi(__A, __B); -} - -__funline unsigned char _kortestc_mask64_u8(__mmask64 __A, __mmask64 __B) { - return (unsigned char)__builtin_ia32_kortestcdi(__A, __B); -} - -__funline __mmask32 _kadd_mask32(__mmask32 __A, __mmask32 __B) { - return (__mmask32)__builtin_ia32_kaddsi((__mmask32)__A, (__mmask32)__B); -} - -__funline __mmask64 _kadd_mask64(__mmask64 __A, __mmask64 __B) { - return (__mmask64)__builtin_ia32_kadddi((__mmask64)__A, (__mmask64)__B); -} - -__funline unsigned int _cvtmask32_u32(__mmask32 __A) { - return (unsigned int)__builtin_ia32_kmovd((__mmask32)__A); -} - -__funline unsigned long long _cvtmask64_u64(__mmask64 __A) { - return (unsigned long long)__builtin_ia32_kmovq((__mmask64)__A); -} - -__funline __mmask32 _cvtu32_mask32(unsigned int __A) { - return (__mmask32)__builtin_ia32_kmovd((__mmask32)__A); -} - -__funline __mmask64 _cvtu64_mask64(unsigned long long __A) { - return (__mmask64)__builtin_ia32_kmovq((__mmask64)__A); -} - -__funline __mmask32 _load_mask32(__mmask32 *__A) { - return (__mmask32)__builtin_ia32_kmovd(*__A); -} - -__funline __mmask64 _load_mask64(__mmask64 *__A) { - return (__mmask64)__builtin_ia32_kmovq(*(__mmask64 *)__A); -} - -__funline void _store_mask32(__mmask32 *__A, __mmask32 __B) { - *(__mmask32 *)__A = __builtin_ia32_kmovd(__B); -} - -__funline void _store_mask64(__mmask64 *__A, __mmask64 __B) { - *(__mmask64 *)__A = __builtin_ia32_kmovq(__B); -} - -__funline __mmask32 _knot_mask32(__mmask32 __A) { - return (__mmask32)__builtin_ia32_knotsi((__mmask32)__A); -} - -__funline __mmask64 _knot_mask64(__mmask64 __A) { - return (__mmask64)__builtin_ia32_knotdi((__mmask64)__A); -} - -__funline __mmask32 _kor_mask32(__mmask32 __A, __mmask32 __B) { - return (__mmask32)__builtin_ia32_korsi((__mmask32)__A, (__mmask32)__B); -} - -__funline __mmask64 _kor_mask64(__mmask64 __A, __mmask64 __B) { - return (__mmask64)__builtin_ia32_kordi((__mmask64)__A, (__mmask64)__B); -} - -__funline __mmask32 _kxnor_mask32(__mmask32 __A, __mmask32 __B) { - return (__mmask32)__builtin_ia32_kxnorsi((__mmask32)__A, (__mmask32)__B); -} - -__funline __mmask64 _kxnor_mask64(__mmask64 __A, __mmask64 __B) { - return (__mmask64)__builtin_ia32_kxnordi((__mmask64)__A, (__mmask64)__B); -} - -__funline __mmask32 _kxor_mask32(__mmask32 __A, __mmask32 __B) { - return (__mmask32)__builtin_ia32_kxorsi((__mmask32)__A, (__mmask32)__B); -} - -__funline __mmask64 _kxor_mask64(__mmask64 __A, __mmask64 __B) { - return (__mmask64)__builtin_ia32_kxordi((__mmask64)__A, (__mmask64)__B); -} - -__funline __mmask32 _kand_mask32(__mmask32 __A, __mmask32 __B) { - return (__mmask32)__builtin_ia32_kandsi((__mmask32)__A, (__mmask32)__B); -} - -__funline __mmask64 _kand_mask64(__mmask64 __A, __mmask64 __B) { - return (__mmask64)__builtin_ia32_kanddi((__mmask64)__A, (__mmask64)__B); -} - -__funline __mmask32 _kandn_mask32(__mmask32 __A, __mmask32 __B) { - return (__mmask32)__builtin_ia32_kandnsi((__mmask32)__A, (__mmask32)__B); -} - -__funline __mmask64 _kandn_mask64(__mmask64 __A, __mmask64 __B) { - return (__mmask64)__builtin_ia32_kandndi((__mmask64)__A, 
(__mmask64)__B); -} - -__funline __m512i _mm512_mask_mov_epi16(__m512i __W, __mmask32 __U, __m512i __A) { - return (__m512i)__builtin_ia32_movdquhi512_mask((__v32hi)__A, (__v32hi)__W, - (__mmask32)__U); -} - -__funline __m512i _mm512_maskz_mov_epi16(__mmask32 __U, __m512i __A) { - return (__m512i)__builtin_ia32_movdquhi512_mask( - (__v32hi)__A, (__v32hi)_mm512_setzero_si512(), (__mmask32)__U); -} - -__funline __m512i _mm512_mask_loadu_epi16(__m512i __W, __mmask32 __U, - void const *__P) { - return (__m512i)__builtin_ia32_loaddquhi512_mask( - (const short *)__P, (__v32hi)__W, (__mmask32)__U); -} - -__funline __m512i _mm512_maskz_loadu_epi16(__mmask32 __U, void const *__P) { - return (__m512i)__builtin_ia32_loaddquhi512_mask( - (const short *)__P, (__v32hi)_mm512_setzero_si512(), (__mmask32)__U); -} - -__funline void _mm512_mask_storeu_epi16(void *__P, __mmask32 __U, __m512i __A) { - __builtin_ia32_storedquhi512_mask((short *)__P, (__v32hi)__A, (__mmask32)__U); -} - -__funline __m512i _mm512_mask_mov_epi8(__m512i __W, __mmask64 __U, __m512i __A) { - return (__m512i)__builtin_ia32_movdquqi512_mask((__v64qi)__A, (__v64qi)__W, - (__mmask64)__U); -} - -__funline __m512i _mm512_maskz_mov_epi8(__mmask64 __U, __m512i __A) { - return (__m512i)__builtin_ia32_movdquqi512_mask( - (__v64qi)__A, (__v64qi)_mm512_setzero_si512(), (__mmask64)__U); -} - -__funline __mmask32 _mm512_kunpackw(__mmask32 __A, __mmask32 __B) { - return (__mmask32)__builtin_ia32_kunpcksi((__mmask32)__A, (__mmask32)__B); -} - -__funline __mmask32 _kunpackw_mask32(__mmask16 __A, __mmask16 __B) { - return (__mmask32)__builtin_ia32_kunpcksi((__mmask32)__A, (__mmask32)__B); -} - -__funline __mmask64 _mm512_kunpackd(__mmask64 __A, __mmask64 __B) { - return (__mmask64)__builtin_ia32_kunpckdi((__mmask64)__A, (__mmask64)__B); -} - -__funline __mmask64 _kunpackd_mask64(__mmask32 __A, __mmask32 __B) { - return (__mmask64)__builtin_ia32_kunpckdi((__mmask64)__A, (__mmask64)__B); -} - -__funline __m512i _mm512_mask_loadu_epi8(__m512i __W, __mmask64 __U, - void const *__P) { - return (__m512i)__builtin_ia32_loaddquqi512_mask( - (const char *)__P, (__v64qi)__W, (__mmask64)__U); -} - -__funline __m512i _mm512_maskz_loadu_epi8(__mmask64 __U, void const *__P) { - return (__m512i)__builtin_ia32_loaddquqi512_mask( - (const char *)__P, (__v64qi)_mm512_setzero_si512(), (__mmask64)__U); -} - -__funline void _mm512_mask_storeu_epi8(void *__P, __mmask64 __U, __m512i __A) { - __builtin_ia32_storedquqi512_mask((char *)__P, (__v64qi)__A, (__mmask64)__U); -} - -__funline __m512i _mm512_sad_epu8(__m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_psadbw512((__v64qi)__A, (__v64qi)__B); -} - -__funline __m256i _mm512_cvtepi16_epi8(__m512i __A) { - return (__m256i)__builtin_ia32_pmovwb512_mask( - (__v32hi)__A, (__v32qi)_mm256_undefined_si256(), (__mmask32)-1); -} - -__funline void _mm512_mask_cvtepi16_storeu_epi8(void *__P, __mmask32 __M, - __m512i __A) { - __builtin_ia32_pmovwb512mem_mask((__v32qi *)__P, (__v32hi)__A, __M); -} - -__funline __m256i _mm512_mask_cvtepi16_epi8(__m256i __O, __mmask32 __M, - __m512i __A) { - return (__m256i)__builtin_ia32_pmovwb512_mask((__v32hi)__A, (__v32qi)__O, - __M); -} - -__funline __m256i _mm512_maskz_cvtepi16_epi8(__mmask32 __M, __m512i __A) { - return (__m256i)__builtin_ia32_pmovwb512_mask( - (__v32hi)__A, (__v32qi)_mm256_setzero_si256(), __M); -} - -__funline __m256i _mm512_cvtsepi16_epi8(__m512i __A) { - return (__m256i)__builtin_ia32_pmovswb512_mask( - (__v32hi)__A, (__v32qi)_mm256_undefined_si256(), (__mmask32)-1); 
-} - -__funline void _mm512_mask_cvtsepi16_storeu_epi8(void *__P, __mmask32 __M, - __m512i __A) { - __builtin_ia32_pmovswb512mem_mask((__v32qi *)__P, (__v32hi)__A, __M); -} - -__funline __m256i _mm512_mask_cvtsepi16_epi8(__m256i __O, __mmask32 __M, - __m512i __A) { - return (__m256i)__builtin_ia32_pmovswb512_mask((__v32hi)__A, (__v32qi)__O, - __M); -} - -__funline __m256i _mm512_maskz_cvtsepi16_epi8(__mmask32 __M, __m512i __A) { - return (__m256i)__builtin_ia32_pmovswb512_mask( - (__v32hi)__A, (__v32qi)_mm256_setzero_si256(), __M); -} - -__funline __m256i _mm512_cvtusepi16_epi8(__m512i __A) { - return (__m256i)__builtin_ia32_pmovuswb512_mask( - (__v32hi)__A, (__v32qi)_mm256_undefined_si256(), (__mmask32)-1); -} - -__funline __m256i _mm512_mask_cvtusepi16_epi8(__m256i __O, __mmask32 __M, - __m512i __A) { - return (__m256i)__builtin_ia32_pmovuswb512_mask((__v32hi)__A, (__v32qi)__O, - __M); -} - -__funline void _mm512_mask_cvtusepi16_storeu_epi8(void *__P, __mmask32 __M, - __m512i __A) { - __builtin_ia32_pmovuswb512mem_mask((__v32qi *)__P, (__v32hi)__A, __M); -} - -__funline __m256i _mm512_maskz_cvtusepi16_epi8(__mmask32 __M, __m512i __A) { - return (__m256i)__builtin_ia32_pmovuswb512_mask( - (__v32hi)__A, (__v32qi)_mm256_setzero_si256(), __M); -} - -__funline __m512i _mm512_broadcastb_epi8(__m128i __A) { - return (__m512i)__builtin_ia32_pbroadcastb512_mask( - (__v16qi)__A, (__v64qi)_mm512_undefined_epi32(), (__mmask64)-1); -} - -__funline __m512i _mm512_mask_broadcastb_epi8(__m512i __O, __mmask64 __M, - __m128i __A) { - return (__m512i)__builtin_ia32_pbroadcastb512_mask((__v16qi)__A, (__v64qi)__O, - __M); -} - -__funline __m512i _mm512_maskz_broadcastb_epi8(__mmask64 __M, __m128i __A) { - return (__m512i)__builtin_ia32_pbroadcastb512_mask( - (__v16qi)__A, (__v64qi)_mm512_setzero_si512(), __M); -} - -__funline __m512i _mm512_mask_set1_epi8(__m512i __O, __mmask64 __M, char __A) { - return (__m512i)__builtin_ia32_pbroadcastb512_gpr_mask(__A, (__v64qi)__O, - __M); -} - -__funline __m512i _mm512_maskz_set1_epi8(__mmask64 __M, char __A) { - return (__m512i)__builtin_ia32_pbroadcastb512_gpr_mask( - __A, (__v64qi)_mm512_setzero_si512(), __M); -} - -__funline __m512i _mm512_broadcastw_epi16(__m128i __A) { - return (__m512i)__builtin_ia32_pbroadcastw512_mask( - (__v8hi)__A, (__v32hi)_mm512_undefined_epi32(), (__mmask32)-1); -} - -__funline __m512i _mm512_mask_broadcastw_epi16(__m512i __O, __mmask32 __M, - __m128i __A) { - return (__m512i)__builtin_ia32_pbroadcastw512_mask((__v8hi)__A, (__v32hi)__O, - __M); -} - -__funline __m512i _mm512_maskz_broadcastw_epi16(__mmask32 __M, __m128i __A) { - return (__m512i)__builtin_ia32_pbroadcastw512_mask( - (__v8hi)__A, (__v32hi)_mm512_setzero_si512(), __M); -} - -__funline __m512i _mm512_mask_set1_epi16(__m512i __O, __mmask32 __M, short __A) { - return (__m512i)__builtin_ia32_pbroadcastw512_gpr_mask(__A, (__v32hi)__O, - __M); -} - -__funline __m512i _mm512_maskz_set1_epi16(__mmask32 __M, short __A) { - return (__m512i)__builtin_ia32_pbroadcastw512_gpr_mask( - __A, (__v32hi)_mm512_setzero_si512(), __M); -} - -__funline __m512i _mm512_mulhrs_epi16(__m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_pmulhrsw512_mask( - (__v32hi)__A, (__v32hi)__B, (__v32hi)_mm512_setzero_si512(), - (__mmask32)-1); -} - -__funline __m512i _mm512_mask_mulhrs_epi16(__m512i __W, __mmask32 __U, - __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_pmulhrsw512_mask((__v32hi)__A, (__v32hi)__B, - (__v32hi)__W, (__mmask32)__U); -} - -__funline __m512i 
_mm512_maskz_mulhrs_epi16(__mmask32 __U, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_pmulhrsw512_mask( - (__v32hi)__A, (__v32hi)__B, (__v32hi)_mm512_setzero_si512(), - (__mmask32)__U); -} - -__funline __m512i _mm512_mulhi_epi16(__m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_pmulhw512_mask((__v32hi)__A, (__v32hi)__B, - (__v32hi)_mm512_setzero_si512(), - (__mmask32)-1); -} - -__funline __m512i _mm512_mask_mulhi_epi16(__m512i __W, __mmask32 __U, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_pmulhw512_mask((__v32hi)__A, (__v32hi)__B, - (__v32hi)__W, (__mmask32)__U); -} - -__funline __m512i _mm512_maskz_mulhi_epi16(__mmask32 __U, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_pmulhw512_mask((__v32hi)__A, (__v32hi)__B, - (__v32hi)_mm512_setzero_si512(), - (__mmask32)__U); -} - -__funline __m512i _mm512_mulhi_epu16(__m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_pmulhuw512_mask( - (__v32hi)__A, (__v32hi)__B, (__v32hi)_mm512_setzero_si512(), - (__mmask32)-1); -} - -__funline __m512i _mm512_mask_mulhi_epu16(__m512i __W, __mmask32 __U, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_pmulhuw512_mask((__v32hi)__A, (__v32hi)__B, - (__v32hi)__W, (__mmask32)__U); -} - -__funline __m512i _mm512_maskz_mulhi_epu16(__mmask32 __U, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_pmulhuw512_mask( - (__v32hi)__A, (__v32hi)__B, (__v32hi)_mm512_setzero_si512(), - (__mmask32)__U); -} - -__funline __m512i _mm512_mullo_epi16(__m512i __A, __m512i __B) { - return (__m512i)((__v32hu)__A * (__v32hu)__B); -} - -__funline __m512i _mm512_mask_mullo_epi16(__m512i __W, __mmask32 __U, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_pmullw512_mask((__v32hi)__A, (__v32hi)__B, - (__v32hi)__W, (__mmask32)__U); -} - -__funline __m512i _mm512_maskz_mullo_epi16(__mmask32 __U, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_pmullw512_mask((__v32hi)__A, (__v32hi)__B, - (__v32hi)_mm512_setzero_si512(), - (__mmask32)__U); -} - -__funline __m512i _mm512_cvtepi8_epi16(__m256i __A) { - return (__m512i)__builtin_ia32_pmovsxbw512_mask( - (__v32qi)__A, (__v32hi)_mm512_setzero_si512(), (__mmask32)-1); -} - -__funline __m512i _mm512_mask_cvtepi8_epi16(__m512i __W, __mmask32 __U, - __m256i __A) { - return (__m512i)__builtin_ia32_pmovsxbw512_mask((__v32qi)__A, (__v32hi)__W, - (__mmask32)__U); -} - -__funline __m512i _mm512_maskz_cvtepi8_epi16(__mmask32 __U, __m256i __A) { - return (__m512i)__builtin_ia32_pmovsxbw512_mask( - (__v32qi)__A, (__v32hi)_mm512_setzero_si512(), (__mmask32)__U); -} - -__funline __m512i _mm512_cvtepu8_epi16(__m256i __A) { - return (__m512i)__builtin_ia32_pmovzxbw512_mask( - (__v32qi)__A, (__v32hi)_mm512_setzero_si512(), (__mmask32)-1); -} - -__funline __m512i _mm512_mask_cvtepu8_epi16(__m512i __W, __mmask32 __U, - __m256i __A) { - return (__m512i)__builtin_ia32_pmovzxbw512_mask((__v32qi)__A, (__v32hi)__W, - (__mmask32)__U); -} - -__funline __m512i _mm512_maskz_cvtepu8_epi16(__mmask32 __U, __m256i __A) { - return (__m512i)__builtin_ia32_pmovzxbw512_mask( - (__v32qi)__A, (__v32hi)_mm512_setzero_si512(), (__mmask32)__U); -} - -__funline __m512i _mm512_permutexvar_epi16(__m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_permvarhi512_mask( - (__v32hi)__B, (__v32hi)__A, (__v32hi)_mm512_setzero_si512(), - (__mmask32)-1); -} - -__funline __m512i _mm512_maskz_permutexvar_epi16(__mmask32 __M, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_permvarhi512_mask( - 
(__v32hi)__B, (__v32hi)__A, (__v32hi)_mm512_setzero_si512(), - (__mmask32)__M); -} - -__funline __m512i _mm512_mask_permutexvar_epi16(__m512i __W, __mmask32 __M, - __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_permvarhi512_mask( - (__v32hi)__B, (__v32hi)__A, (__v32hi)__W, (__mmask32)__M); -} - -__funline __m512i _mm512_permutex2var_epi16(__m512i __A, __m512i __I, - __m512i __B) { - return (__m512i)__builtin_ia32_vpermt2varhi512_mask( - (__v32hi)__I - /* idx */, - (__v32hi)__A, (__v32hi)__B, (__mmask32)-1); -} - -__funline __m512i _mm512_mask_permutex2var_epi16(__m512i __A, __mmask32 __U, - __m512i __I, __m512i __B) { - return (__m512i)__builtin_ia32_vpermt2varhi512_mask( - (__v32hi)__I - /* idx */, - (__v32hi)__A, (__v32hi)__B, (__mmask32)__U); -} - -__funline __m512i _mm512_mask2_permutex2var_epi16(__m512i __A, __m512i __I, - __mmask32 __U, __m512i __B) { - return (__m512i)__builtin_ia32_vpermi2varhi512_mask((__v32hi)__A, - (__v32hi)__I - /* idx */, - (__v32hi)__B, - (__mmask32)__U); -} - -__funline __m512i _mm512_maskz_permutex2var_epi16(__mmask32 __U, __m512i __A, - __m512i __I, __m512i __B) { - return (__m512i)__builtin_ia32_vpermt2varhi512_maskz( - (__v32hi)__I - /* idx */, - (__v32hi)__A, (__v32hi)__B, (__mmask32)__U); -} - -__funline __m512i _mm512_avg_epu8(__m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_pavgb512_mask((__v64qi)__A, (__v64qi)__B, - (__v64qi)_mm512_setzero_si512(), - (__mmask64)-1); -} - -__funline __m512i _mm512_mask_avg_epu8(__m512i __W, __mmask64 __U, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_pavgb512_mask((__v64qi)__A, (__v64qi)__B, - (__v64qi)__W, (__mmask64)__U); -} - -__funline __m512i _mm512_maskz_avg_epu8(__mmask64 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_pavgb512_mask((__v64qi)__A, (__v64qi)__B, - (__v64qi)_mm512_setzero_si512(), - (__mmask64)__U); -} - -__funline __m512i _mm512_add_epi8(__m512i __A, __m512i __B) { - return (__m512i)((__v64qu)__A + (__v64qu)__B); -} - -__funline __m512i _mm512_mask_add_epi8(__m512i __W, __mmask64 __U, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_paddb512_mask((__v64qi)__A, (__v64qi)__B, - (__v64qi)__W, (__mmask64)__U); -} - -__funline __m512i _mm512_maskz_add_epi8(__mmask64 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_paddb512_mask((__v64qi)__A, (__v64qi)__B, - (__v64qi)_mm512_setzero_si512(), - (__mmask64)__U); -} - -__funline __m512i _mm512_sub_epi8(__m512i __A, __m512i __B) { - return (__m512i)((__v64qu)__A - (__v64qu)__B); -} - -__funline __m512i _mm512_mask_sub_epi8(__m512i __W, __mmask64 __U, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_psubb512_mask((__v64qi)__A, (__v64qi)__B, - (__v64qi)__W, (__mmask64)__U); -} - -__funline __m512i _mm512_maskz_sub_epi8(__mmask64 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_psubb512_mask((__v64qi)__A, (__v64qi)__B, - (__v64qi)_mm512_setzero_si512(), - (__mmask64)__U); -} - -__funline __m512i _mm512_avg_epu16(__m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_pavgw512_mask((__v32hi)__A, (__v32hi)__B, - (__v32hi)_mm512_setzero_si512(), - (__mmask32)-1); -} - -__funline __m512i _mm512_mask_avg_epu16(__m512i __W, __mmask32 __U, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_pavgw512_mask((__v32hi)__A, (__v32hi)__B, - (__v32hi)__W, (__mmask32)__U); -} - -__funline __m512i _mm512_maskz_avg_epu16(__mmask32 __U, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_pavgw512_mask((__v32hi)__A, (__v32hi)__B, - 
(__v32hi)_mm512_setzero_si512(), - (__mmask32)__U); -} - -__funline __m512i _mm512_subs_epi8(__m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_psubsb512_mask((__v64qi)__A, (__v64qi)__B, - (__v64qi)_mm512_setzero_si512(), - (__mmask64)-1); -} - -__funline __m512i _mm512_mask_subs_epi8(__m512i __W, __mmask64 __U, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_psubsb512_mask((__v64qi)__A, (__v64qi)__B, - (__v64qi)__W, (__mmask64)__U); -} - -__funline __m512i _mm512_maskz_subs_epi8(__mmask64 __U, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_psubsb512_mask((__v64qi)__A, (__v64qi)__B, - (__v64qi)_mm512_setzero_si512(), - (__mmask64)__U); -} - -__funline __m512i _mm512_subs_epu8(__m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_psubusb512_mask( - (__v64qi)__A, (__v64qi)__B, (__v64qi)_mm512_setzero_si512(), - (__mmask64)-1); -} - -__funline __m512i _mm512_mask_subs_epu8(__m512i __W, __mmask64 __U, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_psubusb512_mask((__v64qi)__A, (__v64qi)__B, - (__v64qi)__W, (__mmask64)__U); -} - -__funline __m512i _mm512_maskz_subs_epu8(__mmask64 __U, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_psubusb512_mask( - (__v64qi)__A, (__v64qi)__B, (__v64qi)_mm512_setzero_si512(), - (__mmask64)__U); -} - -__funline __m512i _mm512_adds_epi8(__m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_paddsb512_mask((__v64qi)__A, (__v64qi)__B, - (__v64qi)_mm512_setzero_si512(), - (__mmask64)-1); -} - -__funline __m512i _mm512_mask_adds_epi8(__m512i __W, __mmask64 __U, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_paddsb512_mask((__v64qi)__A, (__v64qi)__B, - (__v64qi)__W, (__mmask64)__U); -} - -__funline __m512i _mm512_maskz_adds_epi8(__mmask64 __U, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_paddsb512_mask((__v64qi)__A, (__v64qi)__B, - (__v64qi)_mm512_setzero_si512(), - (__mmask64)__U); -} - -__funline __m512i _mm512_adds_epu8(__m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_paddusb512_mask( - (__v64qi)__A, (__v64qi)__B, (__v64qi)_mm512_setzero_si512(), - (__mmask64)-1); -} - -__funline __m512i _mm512_mask_adds_epu8(__m512i __W, __mmask64 __U, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_paddusb512_mask((__v64qi)__A, (__v64qi)__B, - (__v64qi)__W, (__mmask64)__U); -} - -__funline __m512i _mm512_maskz_adds_epu8(__mmask64 __U, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_paddusb512_mask( - (__v64qi)__A, (__v64qi)__B, (__v64qi)_mm512_setzero_si512(), - (__mmask64)__U); -} - -__funline __m512i _mm512_sub_epi16(__m512i __A, __m512i __B) { - return (__m512i)((__v32hu)__A - (__v32hu)__B); -} - -__funline __m512i _mm512_mask_sub_epi16(__m512i __W, __mmask32 __U, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_psubw512_mask((__v32hi)__A, (__v32hi)__B, - (__v32hi)__W, (__mmask32)__U); -} - -__funline __m512i _mm512_maskz_sub_epi16(__mmask32 __U, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_psubw512_mask((__v32hi)__A, (__v32hi)__B, - (__v32hi)_mm512_setzero_si512(), - (__mmask32)__U); -} - -__funline __m512i _mm512_subs_epi16(__m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_psubsw512_mask((__v32hi)__A, (__v32hi)__B, - (__v32hi)_mm512_setzero_si512(), - (__mmask32)-1); -} - -__funline __m512i _mm512_mask_subs_epi16(__m512i __W, __mmask32 __U, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_psubsw512_mask((__v32hi)__A, (__v32hi)__B, - (__v32hi)__W, 
(__mmask32)__U); -} - -__funline __m512i _mm512_maskz_subs_epi16(__mmask32 __U, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_psubsw512_mask((__v32hi)__A, (__v32hi)__B, - (__v32hi)_mm512_setzero_si512(), - (__mmask32)__U); -} - -__funline __m512i _mm512_subs_epu16(__m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_psubusw512_mask( - (__v32hi)__A, (__v32hi)__B, (__v32hi)_mm512_setzero_si512(), - (__mmask32)-1); -} - -__funline __m512i _mm512_mask_subs_epu16(__m512i __W, __mmask32 __U, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_psubusw512_mask((__v32hi)__A, (__v32hi)__B, - (__v32hi)__W, (__mmask32)__U); -} - -__funline __m512i _mm512_maskz_subs_epu16(__mmask32 __U, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_psubusw512_mask( - (__v32hi)__A, (__v32hi)__B, (__v32hi)_mm512_setzero_si512(), - (__mmask32)__U); -} - -__funline __m512i _mm512_add_epi16(__m512i __A, __m512i __B) { - return (__m512i)((__v32hu)__A + (__v32hu)__B); -} - -__funline __m512i _mm512_mask_add_epi16(__m512i __W, __mmask32 __U, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_paddw512_mask((__v32hi)__A, (__v32hi)__B, - (__v32hi)__W, (__mmask32)__U); -} - -__funline __m512i _mm512_maskz_add_epi16(__mmask32 __U, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_paddw512_mask((__v32hi)__A, (__v32hi)__B, - (__v32hi)_mm512_setzero_si512(), - (__mmask32)__U); -} - -__funline __m512i _mm512_adds_epi16(__m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_paddsw512_mask((__v32hi)__A, (__v32hi)__B, - (__v32hi)_mm512_setzero_si512(), - (__mmask32)-1); -} - -__funline __m512i _mm512_mask_adds_epi16(__m512i __W, __mmask32 __U, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_paddsw512_mask((__v32hi)__A, (__v32hi)__B, - (__v32hi)__W, (__mmask32)__U); -} - -__funline __m512i _mm512_maskz_adds_epi16(__mmask32 __U, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_paddsw512_mask((__v32hi)__A, (__v32hi)__B, - (__v32hi)_mm512_setzero_si512(), - (__mmask32)__U); -} - -__funline __m512i _mm512_adds_epu16(__m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_paddusw512_mask( - (__v32hi)__A, (__v32hi)__B, (__v32hi)_mm512_setzero_si512(), - (__mmask32)-1); -} - -__funline __m512i _mm512_mask_adds_epu16(__m512i __W, __mmask32 __U, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_paddusw512_mask((__v32hi)__A, (__v32hi)__B, - (__v32hi)__W, (__mmask32)__U); -} - -__funline __m512i _mm512_maskz_adds_epu16(__mmask32 __U, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_paddusw512_mask( - (__v32hi)__A, (__v32hi)__B, (__v32hi)_mm512_setzero_si512(), - (__mmask32)__U); -} - -__funline __m512i _mm512_srl_epi16(__m512i __A, __m128i __B) { - return (__m512i)__builtin_ia32_psrlw512_mask((__v32hi)__A, (__v8hi)__B, - (__v32hi)_mm512_setzero_si512(), - (__mmask32)-1); -} - -__funline __m512i _mm512_mask_srl_epi16(__m512i __W, __mmask32 __U, __m512i __A, - __m128i __B) { - return (__m512i)__builtin_ia32_psrlw512_mask((__v32hi)__A, (__v8hi)__B, - (__v32hi)__W, (__mmask32)__U); -} - -__funline __m512i _mm512_maskz_srl_epi16(__mmask32 __U, __m512i __A, - __m128i __B) { - return (__m512i)__builtin_ia32_psrlw512_mask((__v32hi)__A, (__v8hi)__B, - (__v32hi)_mm512_setzero_si512(), - (__mmask32)__U); -} - -__funline __m512i _mm512_packs_epi16(__m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_packsswb512_mask( - (__v32hi)__A, (__v32hi)__B, (__v64qi)_mm512_setzero_si512(), - (__mmask64)-1); -} - -__funline 
__m512i _mm512_sll_epi16(__m512i __A, __m128i __B) { - return (__m512i)__builtin_ia32_psllw512_mask((__v32hi)__A, (__v8hi)__B, - (__v32hi)_mm512_setzero_si512(), - (__mmask32)-1); -} - -__funline __m512i _mm512_mask_sll_epi16(__m512i __W, __mmask32 __U, __m512i __A, - __m128i __B) { - return (__m512i)__builtin_ia32_psllw512_mask((__v32hi)__A, (__v8hi)__B, - (__v32hi)__W, (__mmask32)__U); -} - -__funline __m512i _mm512_maskz_sll_epi16(__mmask32 __U, __m512i __A, - __m128i __B) { - return (__m512i)__builtin_ia32_psllw512_mask((__v32hi)__A, (__v8hi)__B, - (__v32hi)_mm512_setzero_si512(), - (__mmask32)__U); -} - -__funline __m512i _mm512_maddubs_epi16(__m512i __X, __m512i __Y) { - return (__m512i)__builtin_ia32_pmaddubsw512_mask( - (__v64qi)__X, (__v64qi)__Y, (__v32hi)_mm512_setzero_si512(), - (__mmask32)-1); -} - -__funline __m512i _mm512_mask_maddubs_epi16(__m512i __W, __mmask32 __U, - __m512i __X, __m512i __Y) { - return (__m512i)__builtin_ia32_pmaddubsw512_mask( - (__v64qi)__X, (__v64qi)__Y, (__v32hi)__W, (__mmask32)__U); -} - -__funline __m512i _mm512_maskz_maddubs_epi16(__mmask32 __U, __m512i __X, - __m512i __Y) { - return (__m512i)__builtin_ia32_pmaddubsw512_mask( - (__v64qi)__X, (__v64qi)__Y, (__v32hi)_mm512_setzero_si512(), - (__mmask32)__U); -} - -__funline __m512i _mm512_madd_epi16(__m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_pmaddwd512_mask( - (__v32hi)__A, (__v32hi)__B, (__v16si)_mm512_setzero_si512(), - (__mmask16)-1); -} - -__funline __m512i _mm512_mask_madd_epi16(__m512i __W, __mmask16 __U, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_pmaddwd512_mask((__v32hi)__A, (__v32hi)__B, - (__v16si)__W, (__mmask16)__U); -} - -__funline __m512i _mm512_maskz_madd_epi16(__mmask16 __U, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_pmaddwd512_mask( - (__v32hi)__A, (__v32hi)__B, (__v16si)_mm512_setzero_si512(), - (__mmask16)__U); -} - -__funline __m512i _mm512_unpackhi_epi8(__m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_punpckhbw512_mask( - (__v64qi)__A, (__v64qi)__B, (__v64qi)_mm512_setzero_si512(), - (__mmask64)-1); -} - -__funline __m512i _mm512_mask_unpackhi_epi8(__m512i __W, __mmask64 __U, - __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_punpckhbw512_mask( - (__v64qi)__A, (__v64qi)__B, (__v64qi)__W, (__mmask64)__U); -} - -__funline __m512i _mm512_maskz_unpackhi_epi8(__mmask64 __U, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_punpckhbw512_mask( - (__v64qi)__A, (__v64qi)__B, (__v64qi)_mm512_setzero_si512(), - (__mmask64)__U); -} - -__funline __m512i _mm512_unpackhi_epi16(__m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_punpckhwd512_mask( - (__v32hi)__A, (__v32hi)__B, (__v32hi)_mm512_setzero_si512(), - (__mmask32)-1); -} - -__funline __m512i _mm512_mask_unpackhi_epi16(__m512i __W, __mmask32 __U, - __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_punpckhwd512_mask( - (__v32hi)__A, (__v32hi)__B, (__v32hi)__W, (__mmask32)__U); -} - -__funline __m512i _mm512_maskz_unpackhi_epi16(__mmask32 __U, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_punpckhwd512_mask( - (__v32hi)__A, (__v32hi)__B, (__v32hi)_mm512_setzero_si512(), - (__mmask32)__U); -} - -__funline __m512i _mm512_unpacklo_epi8(__m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_punpcklbw512_mask( - (__v64qi)__A, (__v64qi)__B, (__v64qi)_mm512_setzero_si512(), - (__mmask64)-1); -} - -__funline __m512i _mm512_mask_unpacklo_epi8(__m512i __W, __mmask64 __U, - __m512i __A, __m512i __B) { - return 
(__m512i)__builtin_ia32_punpcklbw512_mask( - (__v64qi)__A, (__v64qi)__B, (__v64qi)__W, (__mmask64)__U); -} - -__funline __m512i _mm512_maskz_unpacklo_epi8(__mmask64 __U, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_punpcklbw512_mask( - (__v64qi)__A, (__v64qi)__B, (__v64qi)_mm512_setzero_si512(), - (__mmask64)__U); -} - -__funline __m512i _mm512_unpacklo_epi16(__m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_punpcklwd512_mask( - (__v32hi)__A, (__v32hi)__B, (__v32hi)_mm512_setzero_si512(), - (__mmask32)-1); -} - -__funline __m512i _mm512_mask_unpacklo_epi16(__m512i __W, __mmask32 __U, - __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_punpcklwd512_mask( - (__v32hi)__A, (__v32hi)__B, (__v32hi)__W, (__mmask32)__U); -} - -__funline __m512i _mm512_maskz_unpacklo_epi16(__mmask32 __U, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_punpcklwd512_mask( - (__v32hi)__A, (__v32hi)__B, (__v32hi)_mm512_setzero_si512(), - (__mmask32)__U); -} - -__funline __mmask64 _mm512_cmpeq_epu8_mask(__m512i __A, __m512i __B) { - return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__A, (__v64qi)__B, 0, - (__mmask64)-1); -} - -__funline __mmask64 _mm512_cmpeq_epi8_mask(__m512i __A, __m512i __B) { - return (__mmask64)__builtin_ia32_pcmpeqb512_mask((__v64qi)__A, (__v64qi)__B, - (__mmask64)-1); -} - -__funline __mmask64 _mm512_mask_cmpeq_epu8_mask(__mmask64 __U, __m512i __A, - __m512i __B) { - return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__A, (__v64qi)__B, 0, - __U); -} - -__funline __mmask64 _mm512_mask_cmpeq_epi8_mask(__mmask64 __U, __m512i __A, - __m512i __B) { - return (__mmask64)__builtin_ia32_pcmpeqb512_mask((__v64qi)__A, (__v64qi)__B, - __U); -} - -__funline __mmask32 _mm512_cmpeq_epu16_mask(__m512i __A, __m512i __B) { - return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__A, (__v32hi)__B, 0, - (__mmask32)-1); -} - -__funline __mmask32 _mm512_cmpeq_epi16_mask(__m512i __A, __m512i __B) { - return (__mmask32)__builtin_ia32_pcmpeqw512_mask((__v32hi)__A, (__v32hi)__B, - (__mmask32)-1); -} - -__funline __mmask32 _mm512_mask_cmpeq_epu16_mask(__mmask32 __U, __m512i __A, - __m512i __B) { - return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__A, (__v32hi)__B, 0, - __U); -} - -__funline __mmask32 _mm512_mask_cmpeq_epi16_mask(__mmask32 __U, __m512i __A, - __m512i __B) { - return (__mmask32)__builtin_ia32_pcmpeqw512_mask((__v32hi)__A, (__v32hi)__B, - __U); -} - -__funline __mmask64 _mm512_cmpgt_epu8_mask(__m512i __A, __m512i __B) { - return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__A, (__v64qi)__B, 6, - (__mmask64)-1); -} - -__funline __mmask64 _mm512_cmpgt_epi8_mask(__m512i __A, __m512i __B) { - return (__mmask64)__builtin_ia32_pcmpgtb512_mask((__v64qi)__A, (__v64qi)__B, - (__mmask64)-1); -} - -__funline __mmask64 _mm512_mask_cmpgt_epu8_mask(__mmask64 __U, __m512i __A, - __m512i __B) { - return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__A, (__v64qi)__B, 6, - __U); -} - -__funline __mmask64 _mm512_mask_cmpgt_epi8_mask(__mmask64 __U, __m512i __A, - __m512i __B) { - return (__mmask64)__builtin_ia32_pcmpgtb512_mask((__v64qi)__A, (__v64qi)__B, - __U); -} - -__funline __mmask32 _mm512_cmpgt_epu16_mask(__m512i __A, __m512i __B) { - return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__A, (__v32hi)__B, 6, - (__mmask32)-1); -} - -__funline __mmask32 _mm512_cmpgt_epi16_mask(__m512i __A, __m512i __B) { - return (__mmask32)__builtin_ia32_pcmpgtw512_mask((__v32hi)__A, (__v32hi)__B, - (__mmask32)-1); -} - -__funline __mmask32 
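For reference, the cmpeq/cmpgt family above returns its result in a k-register bitmask rather than a vector. A minimal usage sketch (not part of this patch), assuming an AVX512BW target (-mavx512bw) and a hypothetical find_byte64 helper:

#include <immintrin.h>
#include <stddef.h>

/* Index of the first byte equal to c in the 64-byte block at p, or -1. */
static ptrdiff_t find_byte64(const void *p, char c) {
  __m512i block = _mm512_loadu_si512(p);
  __mmask64 hits = _mm512_cmpeq_epi8_mask(block, _mm512_set1_epi8(c));
  return hits ? (ptrdiff_t)__builtin_ctzll((unsigned long long)hits) : -1;
}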
_mm512_mask_cmpgt_epu16_mask(__mmask32 __U, __m512i __A, - __m512i __B) { - return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__A, (__v32hi)__B, 6, - __U); -} - -__funline __mmask32 _mm512_mask_cmpgt_epi16_mask(__mmask32 __U, __m512i __A, - __m512i __B) { - return (__mmask32)__builtin_ia32_pcmpgtw512_mask((__v32hi)__A, (__v32hi)__B, - __U); -} - -__funline __mmask64 _mm512_movepi8_mask(__m512i __A) { - return (__mmask64)__builtin_ia32_cvtb2mask512((__v64qi)__A); -} - -__funline __mmask32 _mm512_movepi16_mask(__m512i __A) { - return (__mmask32)__builtin_ia32_cvtw2mask512((__v32hi)__A); -} - -__funline __m512i _mm512_movm_epi8(__mmask64 __A) { - return (__m512i)__builtin_ia32_cvtmask2b512(__A); -} - -__funline __m512i _mm512_movm_epi16(__mmask32 __A) { - return (__m512i)__builtin_ia32_cvtmask2w512(__A); -} - -__funline __mmask64 _mm512_test_epi8_mask(__m512i __A, __m512i __B) { - return (__mmask64)__builtin_ia32_ptestmb512((__v64qi)__A, (__v64qi)__B, - (__mmask64)-1); -} - -__funline __mmask64 _mm512_mask_test_epi8_mask(__mmask64 __U, __m512i __A, - __m512i __B) { - return (__mmask64)__builtin_ia32_ptestmb512((__v64qi)__A, (__v64qi)__B, __U); -} - -__funline __mmask32 _mm512_test_epi16_mask(__m512i __A, __m512i __B) { - return (__mmask32)__builtin_ia32_ptestmw512((__v32hi)__A, (__v32hi)__B, - (__mmask32)-1); -} - -__funline __mmask32 _mm512_mask_test_epi16_mask(__mmask32 __U, __m512i __A, - __m512i __B) { - return (__mmask32)__builtin_ia32_ptestmw512((__v32hi)__A, (__v32hi)__B, __U); -} - -__funline __mmask64 _mm512_testn_epi8_mask(__m512i __A, __m512i __B) { - return (__mmask64)__builtin_ia32_ptestnmb512((__v64qi)__A, (__v64qi)__B, - (__mmask64)-1); -} - -__funline __mmask64 _mm512_mask_testn_epi8_mask(__mmask64 __U, __m512i __A, - __m512i __B) { - return (__mmask64)__builtin_ia32_ptestnmb512((__v64qi)__A, (__v64qi)__B, __U); -} - -__funline __mmask32 _mm512_testn_epi16_mask(__m512i __A, __m512i __B) { - return (__mmask32)__builtin_ia32_ptestnmw512((__v32hi)__A, (__v32hi)__B, - (__mmask32)-1); -} - -__funline __mmask32 _mm512_mask_testn_epi16_mask(__mmask32 __U, __m512i __A, - __m512i __B) { - return (__mmask32)__builtin_ia32_ptestnmw512((__v32hi)__A, (__v32hi)__B, __U); -} - -__funline __m512i _mm512_shuffle_epi8(__m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_pshufb512_mask((__v64qi)__A, (__v64qi)__B, - (__v64qi)_mm512_setzero_si512(), - (__mmask64)-1); -} - -__funline __m512i _mm512_mask_shuffle_epi8(__m512i __W, __mmask64 __U, - __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_pshufb512_mask((__v64qi)__A, (__v64qi)__B, - (__v64qi)__W, (__mmask64)__U); -} - -__funline __m512i _mm512_maskz_shuffle_epi8(__mmask64 __U, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_pshufb512_mask((__v64qi)__A, (__v64qi)__B, - (__v64qi)_mm512_setzero_si512(), - (__mmask64)__U); -} - -__funline __m512i _mm512_min_epu16(__m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_pminuw512_mask((__v32hi)__A, (__v32hi)__B, - (__v32hi)_mm512_setzero_si512(), - (__mmask32)-1); -} - -__funline __m512i _mm512_maskz_min_epu16(__mmask32 __M, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_pminuw512_mask((__v32hi)__A, (__v32hi)__B, - (__v32hi)_mm512_setzero_si512(), - (__mmask32)__M); -} - -__funline __m512i _mm512_mask_min_epu16(__m512i __W, __mmask32 __M, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_pminuw512_mask((__v32hi)__A, (__v32hi)__B, - (__v32hi)__W, (__mmask32)__M); -} - -__funline __m512i _mm512_min_epi16(__m512i __A, __m512i 
__B) { - return (__m512i)__builtin_ia32_pminsw512_mask((__v32hi)__A, (__v32hi)__B, - (__v32hi)_mm512_setzero_si512(), - (__mmask32)-1); -} - -__funline __m512i _mm512_maskz_min_epi16(__mmask32 __M, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_pminsw512_mask((__v32hi)__A, (__v32hi)__B, - (__v32hi)_mm512_setzero_si512(), - (__mmask32)__M); -} - -__funline __m512i _mm512_mask_min_epi16(__m512i __W, __mmask32 __M, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_pminsw512_mask((__v32hi)__A, (__v32hi)__B, - (__v32hi)__W, (__mmask32)__M); -} - -__funline __m512i _mm512_max_epu8(__m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_pmaxub512_mask((__v64qi)__A, (__v64qi)__B, - (__v64qi)_mm512_setzero_si512(), - (__mmask64)-1); -} - -__funline __m512i _mm512_maskz_max_epu8(__mmask64 __M, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_pmaxub512_mask((__v64qi)__A, (__v64qi)__B, - (__v64qi)_mm512_setzero_si512(), - (__mmask64)__M); -} - -__funline __m512i _mm512_mask_max_epu8(__m512i __W, __mmask64 __M, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_pmaxub512_mask((__v64qi)__A, (__v64qi)__B, - (__v64qi)__W, (__mmask64)__M); -} - -__funline __m512i _mm512_max_epi8(__m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_pmaxsb512_mask((__v64qi)__A, (__v64qi)__B, - (__v64qi)_mm512_setzero_si512(), - (__mmask64)-1); -} - -__funline __m512i _mm512_maskz_max_epi8(__mmask64 __M, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_pmaxsb512_mask((__v64qi)__A, (__v64qi)__B, - (__v64qi)_mm512_setzero_si512(), - (__mmask64)__M); -} - -__funline __m512i _mm512_mask_max_epi8(__m512i __W, __mmask64 __M, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_pmaxsb512_mask((__v64qi)__A, (__v64qi)__B, - (__v64qi)__W, (__mmask64)__M); -} - -__funline __m512i _mm512_min_epu8(__m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_pminub512_mask((__v64qi)__A, (__v64qi)__B, - (__v64qi)_mm512_setzero_si512(), - (__mmask64)-1); -} - -__funline __m512i _mm512_maskz_min_epu8(__mmask64 __M, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_pminub512_mask((__v64qi)__A, (__v64qi)__B, - (__v64qi)_mm512_setzero_si512(), - (__mmask64)__M); -} - -__funline __m512i _mm512_mask_min_epu8(__m512i __W, __mmask64 __M, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_pminub512_mask((__v64qi)__A, (__v64qi)__B, - (__v64qi)__W, (__mmask64)__M); -} - -__funline __m512i _mm512_min_epi8(__m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_pminsb512_mask((__v64qi)__A, (__v64qi)__B, - (__v64qi)_mm512_setzero_si512(), - (__mmask64)-1); -} - -__funline __m512i _mm512_maskz_min_epi8(__mmask64 __M, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_pminsb512_mask((__v64qi)__A, (__v64qi)__B, - (__v64qi)_mm512_setzero_si512(), - (__mmask64)__M); -} - -__funline __m512i _mm512_mask_min_epi8(__m512i __W, __mmask64 __M, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_pminsb512_mask((__v64qi)__A, (__v64qi)__B, - (__v64qi)__W, (__mmask64)__M); -} - -__funline __m512i _mm512_max_epi16(__m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_pmaxsw512_mask((__v32hi)__A, (__v32hi)__B, - (__v32hi)_mm512_setzero_si512(), - (__mmask32)-1); -} - -__funline __m512i _mm512_maskz_max_epi16(__mmask32 __M, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_pmaxsw512_mask((__v32hi)__A, (__v32hi)__B, - (__v32hi)_mm512_setzero_si512(), - (__mmask32)__M); -} - -__funline __m512i 
_mm512_mask_max_epi16(__m512i __W, __mmask32 __M, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_pmaxsw512_mask((__v32hi)__A, (__v32hi)__B, - (__v32hi)__W, (__mmask32)__M); -} - -__funline __m512i _mm512_max_epu16(__m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_pmaxuw512_mask((__v32hi)__A, (__v32hi)__B, - (__v32hi)_mm512_setzero_si512(), - (__mmask32)-1); -} - -__funline __m512i _mm512_maskz_max_epu16(__mmask32 __M, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_pmaxuw512_mask((__v32hi)__A, (__v32hi)__B, - (__v32hi)_mm512_setzero_si512(), - (__mmask32)__M); -} - -__funline __m512i _mm512_mask_max_epu16(__m512i __W, __mmask32 __M, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_pmaxuw512_mask((__v32hi)__A, (__v32hi)__B, - (__v32hi)__W, (__mmask32)__M); -} - -__funline __m512i _mm512_sra_epi16(__m512i __A, __m128i __B) { - return (__m512i)__builtin_ia32_psraw512_mask((__v32hi)__A, (__v8hi)__B, - (__v32hi)_mm512_setzero_si512(), - (__mmask32)-1); -} - -__funline __m512i _mm512_mask_sra_epi16(__m512i __W, __mmask32 __U, __m512i __A, - __m128i __B) { - return (__m512i)__builtin_ia32_psraw512_mask((__v32hi)__A, (__v8hi)__B, - (__v32hi)__W, (__mmask32)__U); -} - -__funline __m512i _mm512_maskz_sra_epi16(__mmask32 __U, __m512i __A, - __m128i __B) { - return (__m512i)__builtin_ia32_psraw512_mask((__v32hi)__A, (__v8hi)__B, - (__v32hi)_mm512_setzero_si512(), - (__mmask32)__U); -} - -__funline __m512i _mm512_srav_epi16(__m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_psrav32hi_mask((__v32hi)__A, (__v32hi)__B, - (__v32hi)_mm512_setzero_si512(), - (__mmask32)-1); -} - -__funline __m512i _mm512_mask_srav_epi16(__m512i __W, __mmask32 __U, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_psrav32hi_mask((__v32hi)__A, (__v32hi)__B, - (__v32hi)__W, (__mmask32)__U); -} - -__funline __m512i _mm512_maskz_srav_epi16(__mmask32 __U, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_psrav32hi_mask((__v32hi)__A, (__v32hi)__B, - (__v32hi)_mm512_setzero_si512(), - (__mmask32)__U); -} - -__funline __m512i _mm512_srlv_epi16(__m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_psrlv32hi_mask((__v32hi)__A, (__v32hi)__B, - (__v32hi)_mm512_setzero_si512(), - (__mmask32)-1); -} - -__funline __m512i _mm512_mask_srlv_epi16(__m512i __W, __mmask32 __U, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_psrlv32hi_mask((__v32hi)__A, (__v32hi)__B, - (__v32hi)__W, (__mmask32)__U); -} - -__funline __m512i _mm512_maskz_srlv_epi16(__mmask32 __U, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_psrlv32hi_mask((__v32hi)__A, (__v32hi)__B, - (__v32hi)_mm512_setzero_si512(), - (__mmask32)__U); -} - -__funline __m512i _mm512_sllv_epi16(__m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_psllv32hi_mask((__v32hi)__A, (__v32hi)__B, - (__v32hi)_mm512_setzero_si512(), - (__mmask32)-1); -} - -__funline __m512i _mm512_mask_sllv_epi16(__m512i __W, __mmask32 __U, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_psllv32hi_mask((__v32hi)__A, (__v32hi)__B, - (__v32hi)__W, (__mmask32)__U); -} - -__funline __m512i _mm512_maskz_sllv_epi16(__mmask32 __U, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_psllv32hi_mask((__v32hi)__A, (__v32hi)__B, - (__v32hi)_mm512_setzero_si512(), - (__mmask32)__U); -} - -__funline __m512i _mm512_mask_packs_epi16(__m512i __W, __mmask64 __M, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_packsswb512_mask((__v32hi)__A, (__v32hi)__B, - 
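The srav/srlv/sllv forms above take a per-lane shift count, unlike sra/srl/sll, which broadcast a single count from an __m128i. A minimal sketch (not part of this patch), assuming -mavx512bw:

#include <immintrin.h>

/* Shift each 16-bit lane right by its own count; counts >= 16 yield 0,
   matching VPSRLVW semantics. */
static __m512i scale_down(__m512i values, __m512i counts) {
  return _mm512_srlv_epi16(values, counts);
}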
(__v64qi)__W, (__mmask64)__M); -} - -__funline __m512i _mm512_maskz_packs_epi16(__mmask64 __M, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_packsswb512_mask( - (__v32hi)__A, (__v32hi)__B, (__v64qi)_mm512_setzero_si512(), __M); -} - -__funline __m512i _mm512_packus_epi16(__m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_packuswb512_mask( - (__v32hi)__A, (__v32hi)__B, (__v64qi)_mm512_setzero_si512(), - (__mmask64)-1); -} - -__funline __m512i _mm512_mask_packus_epi16(__m512i __W, __mmask64 __M, - __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_packuswb512_mask((__v32hi)__A, (__v32hi)__B, - (__v64qi)__W, (__mmask64)__M); -} - -__funline __m512i _mm512_maskz_packus_epi16(__mmask64 __M, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_packuswb512_mask( - (__v32hi)__A, (__v32hi)__B, (__v64qi)_mm512_setzero_si512(), - (__mmask64)__M); -} - -__funline __m512i _mm512_abs_epi8(__m512i __A) { - return (__m512i)__builtin_ia32_pabsb512_mask( - (__v64qi)__A, (__v64qi)_mm512_setzero_si512(), (__mmask64)-1); -} - -__funline __m512i _mm512_mask_abs_epi8(__m512i __W, __mmask64 __U, __m512i __A) { - return (__m512i)__builtin_ia32_pabsb512_mask((__v64qi)__A, (__v64qi)__W, - (__mmask64)__U); -} - -__funline __m512i _mm512_maskz_abs_epi8(__mmask64 __U, __m512i __A) { - return (__m512i)__builtin_ia32_pabsb512_mask( - (__v64qi)__A, (__v64qi)_mm512_setzero_si512(), (__mmask64)__U); -} - -__funline __m512i _mm512_abs_epi16(__m512i __A) { - return (__m512i)__builtin_ia32_pabsw512_mask( - (__v32hi)__A, (__v32hi)_mm512_setzero_si512(), (__mmask32)-1); -} - -__funline __m512i _mm512_mask_abs_epi16(__m512i __W, __mmask32 __U, __m512i __A) { - return (__m512i)__builtin_ia32_pabsw512_mask((__v32hi)__A, (__v32hi)__W, - (__mmask32)__U); -} - -__funline __m512i _mm512_maskz_abs_epi16(__mmask32 __U, __m512i __A) { - return (__m512i)__builtin_ia32_pabsw512_mask( - (__v32hi)__A, (__v32hi)_mm512_setzero_si512(), (__mmask32)__U); -} - -__funline __mmask64 _mm512_mask_cmpneq_epu8_mask(__mmask64 __M, __m512i __X, - __m512i __Y) { - return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__X, (__v64qi)__Y, 4, - (__mmask64)__M); -} - -__funline __mmask64 _mm512_mask_cmplt_epu8_mask(__mmask64 __M, __m512i __X, - __m512i __Y) { - return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__X, (__v64qi)__Y, 1, - (__mmask64)__M); -} - -__funline __mmask64 _mm512_mask_cmpge_epu8_mask(__mmask64 __M, __m512i __X, - __m512i __Y) { - return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__X, (__v64qi)__Y, 5, - (__mmask64)__M); -} - -__funline __mmask64 _mm512_mask_cmple_epu8_mask(__mmask64 __M, __m512i __X, - __m512i __Y) { - return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__X, (__v64qi)__Y, 2, - (__mmask64)__M); -} - -__funline __mmask32 _mm512_mask_cmpneq_epu16_mask(__mmask32 __M, __m512i __X, - __m512i __Y) { - return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__X, (__v32hi)__Y, 4, - (__mmask32)__M); -} - -__funline __mmask32 _mm512_mask_cmplt_epu16_mask(__mmask32 __M, __m512i __X, - __m512i __Y) { - return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__X, (__v32hi)__Y, 1, - (__mmask32)__M); -} - -__funline __mmask32 _mm512_mask_cmpge_epu16_mask(__mmask32 __M, __m512i __X, - __m512i __Y) { - return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__X, (__v32hi)__Y, 5, - (__mmask32)__M); -} - -__funline __mmask32 _mm512_mask_cmple_epu16_mask(__mmask32 __M, __m512i __X, - __m512i __Y) { - return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__X, (__v32hi)__Y, 2, - 
(__mmask32)__M); -} - -__funline __mmask64 _mm512_mask_cmpneq_epi8_mask(__mmask64 __M, __m512i __X, - __m512i __Y) { - return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__X, (__v64qi)__Y, 4, - (__mmask64)__M); -} - -__funline __mmask64 _mm512_mask_cmplt_epi8_mask(__mmask64 __M, __m512i __X, - __m512i __Y) { - return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__X, (__v64qi)__Y, 1, - (__mmask64)__M); -} - -__funline __mmask64 _mm512_mask_cmpge_epi8_mask(__mmask64 __M, __m512i __X, - __m512i __Y) { - return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__X, (__v64qi)__Y, 5, - (__mmask64)__M); -} - -__funline __mmask64 _mm512_mask_cmple_epi8_mask(__mmask64 __M, __m512i __X, - __m512i __Y) { - return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__X, (__v64qi)__Y, 2, - (__mmask64)__M); -} - -__funline __mmask32 _mm512_mask_cmpneq_epi16_mask(__mmask32 __M, __m512i __X, - __m512i __Y) { - return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__X, (__v32hi)__Y, 4, - (__mmask32)__M); -} - -__funline __mmask32 _mm512_mask_cmplt_epi16_mask(__mmask32 __M, __m512i __X, - __m512i __Y) { - return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__X, (__v32hi)__Y, 1, - (__mmask32)__M); -} - -__funline __mmask32 _mm512_mask_cmpge_epi16_mask(__mmask32 __M, __m512i __X, - __m512i __Y) { - return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__X, (__v32hi)__Y, 5, - (__mmask32)__M); -} - -__funline __mmask32 _mm512_mask_cmple_epi16_mask(__mmask32 __M, __m512i __X, - __m512i __Y) { - return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__X, (__v32hi)__Y, 2, - (__mmask32)__M); -} - -__funline __mmask64 _mm512_cmpneq_epu8_mask(__m512i __X, __m512i __Y) { - return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__X, (__v64qi)__Y, 4, - (__mmask64)-1); -} - -__funline __mmask64 _mm512_cmplt_epu8_mask(__m512i __X, __m512i __Y) { - return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__X, (__v64qi)__Y, 1, - (__mmask64)-1); -} - -__funline __mmask64 _mm512_cmpge_epu8_mask(__m512i __X, __m512i __Y) { - return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__X, (__v64qi)__Y, 5, - (__mmask64)-1); -} - -__funline __mmask64 _mm512_cmple_epu8_mask(__m512i __X, __m512i __Y) { - return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__X, (__v64qi)__Y, 2, - (__mmask64)-1); -} - -__funline __mmask32 _mm512_cmpneq_epu16_mask(__m512i __X, __m512i __Y) { - return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__X, (__v32hi)__Y, 4, - (__mmask32)-1); -} - -__funline __mmask32 _mm512_cmplt_epu16_mask(__m512i __X, __m512i __Y) { - return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__X, (__v32hi)__Y, 1, - (__mmask32)-1); -} - -__funline __mmask32 _mm512_cmpge_epu16_mask(__m512i __X, __m512i __Y) { - return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__X, (__v32hi)__Y, 5, - (__mmask32)-1); -} - -__funline __mmask32 _mm512_cmple_epu16_mask(__m512i __X, __m512i __Y) { - return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__X, (__v32hi)__Y, 2, - (__mmask32)-1); -} - -__funline __mmask64 _mm512_cmpneq_epi8_mask(__m512i __X, __m512i __Y) { - return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__X, (__v64qi)__Y, 4, - (__mmask64)-1); -} - -__funline __mmask64 _mm512_cmplt_epi8_mask(__m512i __X, __m512i __Y) { - return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__X, (__v64qi)__Y, 1, - (__mmask64)-1); -} - -__funline __mmask64 _mm512_cmpge_epi8_mask(__m512i __X, __m512i __Y) { - return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__X, (__v64qi)__Y, 5, - (__mmask64)-1); -} - -__funline __mmask64 
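The third argument of the cmp/ucmp builtins above is the VPCMP predicate immediate: 0 = EQ, 1 = LT, 2 = LE, 4 = NE, 5 = NLT (>=), 6 = NLE (>), which is why cmpneq passes 4, cmplt passes 1, cmpge passes 5, and cmple passes 2. A small sketch (not part of this patch) using the unsigned LE form, assuming -mavx512bw:

#include <immintrin.h>

/* Number of bytes in the vector that are <= the per-lane limit. */
static int count_le(__m512i bytes, __m512i limit) {
  __mmask64 m = _mm512_cmple_epu8_mask(bytes, limit); /* predicate 2 */
  return (int)__builtin_popcountll((unsigned long long)m);
}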
_mm512_cmple_epi8_mask(__m512i __X, __m512i __Y) { - return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__X, (__v64qi)__Y, 2, - (__mmask64)-1); -} - -__funline __mmask32 _mm512_cmpneq_epi16_mask(__m512i __X, __m512i __Y) { - return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__X, (__v32hi)__Y, 4, - (__mmask32)-1); -} - -__funline __mmask32 _mm512_cmplt_epi16_mask(__m512i __X, __m512i __Y) { - return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__X, (__v32hi)__Y, 1, - (__mmask32)-1); -} - -__funline __mmask32 _mm512_cmpge_epi16_mask(__m512i __X, __m512i __Y) { - return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__X, (__v32hi)__Y, 5, - (__mmask32)-1); -} - -__funline __mmask32 _mm512_cmple_epi16_mask(__m512i __X, __m512i __Y) { - return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__X, (__v32hi)__Y, 2, - (__mmask32)-1); -} - -__funline __m512i _mm512_packs_epi32(__m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_packssdw512_mask( - (__v16si)__A, (__v16si)__B, (__v32hi)_mm512_setzero_si512(), - (__mmask32)-1); -} - -__funline __m512i _mm512_maskz_packs_epi32(__mmask32 __M, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_packssdw512_mask( - (__v16si)__A, (__v16si)__B, (__v32hi)_mm512_setzero_si512(), __M); -} - -__funline __m512i _mm512_mask_packs_epi32(__m512i __W, __mmask32 __M, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_packssdw512_mask((__v16si)__A, (__v16si)__B, - (__v32hi)__W, __M); -} - -__funline __m512i _mm512_packus_epi32(__m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_packusdw512_mask( - (__v16si)__A, (__v16si)__B, (__v32hi)_mm512_setzero_si512(), - (__mmask32)-1); -} - -__funline __m512i _mm512_maskz_packus_epi32(__mmask32 __M, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_packusdw512_mask( - (__v16si)__A, (__v16si)__B, (__v32hi)_mm512_setzero_si512(), __M); -} - -__funline __m512i _mm512_mask_packus_epi32(__m512i __W, __mmask32 __M, - __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_packusdw512_mask((__v16si)__A, (__v16si)__B, - (__v32hi)__W, __M); -} - -#ifdef __OPTIMIZE__ -__funline __mmask32 _kshiftli_mask32(__mmask32 __A, unsigned int __B) { - return (__mmask32)__builtin_ia32_kshiftlisi((__mmask32)__A, (__mmask8)__B); -} - -__funline __mmask64 _kshiftli_mask64(__mmask64 __A, unsigned int __B) { - return (__mmask64)__builtin_ia32_kshiftlidi((__mmask64)__A, (__mmask8)__B); -} - -__funline __mmask32 _kshiftri_mask32(__mmask32 __A, unsigned int __B) { - return (__mmask32)__builtin_ia32_kshiftrisi((__mmask32)__A, (__mmask8)__B); -} - -__funline __mmask64 _kshiftri_mask64(__mmask64 __A, unsigned int __B) { - return (__mmask64)__builtin_ia32_kshiftridi((__mmask64)__A, (__mmask8)__B); -} - -__funline __m512i _mm512_alignr_epi8(__m512i __A, __m512i __B, const int __N) { - return (__m512i)__builtin_ia32_palignr512((__v8di)__A, (__v8di)__B, __N * 8); -} - -__funline __m512i _mm512_mask_alignr_epi8(__m512i __W, __mmask64 __U, __m512i __A, - __m512i __B, const int __N) { - return (__m512i)__builtin_ia32_palignr512_mask( - (__v8di)__A, (__v8di)__B, __N * 8, (__v8di)__W, (__mmask64)__U); -} - -__funline __m512i _mm512_maskz_alignr_epi8(__mmask64 __U, __m512i __A, - __m512i __B, const int __N) { - return (__m512i)__builtin_ia32_palignr512_mask( - (__v8di)__A, (__v8di)__B, __N * 8, (__v8di)_mm512_setzero_si512(), - (__mmask64)__U); -} - -__funline __m512i _mm512_dbsad_epu8(__m512i __A, __m512i __B, const int __imm) { - return (__m512i)__builtin_ia32_dbpsadbw512_mask( - (__v64qi)__A, 
(__v64qi)__B, __imm, (__v32hi)_mm512_setzero_si512(), - (__mmask32)-1); -} - -__funline __m512i _mm512_mask_dbsad_epu8(__m512i __W, __mmask32 __U, __m512i __A, - __m512i __B, const int __imm) { - return (__m512i)__builtin_ia32_dbpsadbw512_mask( - (__v64qi)__A, (__v64qi)__B, __imm, (__v32hi)__W, (__mmask32)__U); -} - -__funline __m512i _mm512_maskz_dbsad_epu8(__mmask32 __U, __m512i __A, __m512i __B, - const int __imm) { - return (__m512i)__builtin_ia32_dbpsadbw512_mask( - (__v64qi)__A, (__v64qi)__B, __imm, (__v32hi)_mm512_setzero_si512(), - (__mmask32)__U); -} - -__funline __m512i _mm512_srli_epi16(__m512i __A, const int __imm) { - return (__m512i)__builtin_ia32_psrlwi512_mask( - (__v32hi)__A, __imm, (__v32hi)_mm512_setzero_si512(), (__mmask32)-1); -} - -__funline __m512i _mm512_mask_srli_epi16(__m512i __W, __mmask32 __U, __m512i __A, - const int __imm) { - return (__m512i)__builtin_ia32_psrlwi512_mask((__v32hi)__A, __imm, - (__v32hi)__W, (__mmask32)__U); -} - -__funline __m512i _mm512_maskz_srli_epi16(__mmask32 __U, __m512i __A, - const int __imm) { - return (__m512i)__builtin_ia32_psrlwi512_mask( - (__v32hi)__A, __imm, (__v32hi)_mm512_setzero_si512(), (__mmask32)__U); -} - -__funline __m512i _mm512_slli_epi16(__m512i __A, const int __B) { - return (__m512i)__builtin_ia32_psllwi512_mask( - (__v32hi)__A, __B, (__v32hi)_mm512_setzero_si512(), (__mmask32)-1); -} - -__funline __m512i _mm512_mask_slli_epi16(__m512i __W, __mmask32 __U, __m512i __A, - const int __B) { - return (__m512i)__builtin_ia32_psllwi512_mask((__v32hi)__A, __B, (__v32hi)__W, - (__mmask32)__U); -} - -__funline __m512i _mm512_maskz_slli_epi16(__mmask32 __U, __m512i __A, - const int __B) { - return (__m512i)__builtin_ia32_psllwi512_mask( - (__v32hi)__A, __B, (__v32hi)_mm512_setzero_si512(), (__mmask32)__U); -} - -__funline __m512i _mm512_shufflehi_epi16(__m512i __A, const int __imm) { - return (__m512i)__builtin_ia32_pshufhw512_mask( - (__v32hi)__A, __imm, (__v32hi)_mm512_setzero_si512(), (__mmask32)-1); -} - -__funline __m512i _mm512_mask_shufflehi_epi16(__m512i __W, __mmask32 __U, - __m512i __A, const int __imm) { - return (__m512i)__builtin_ia32_pshufhw512_mask((__v32hi)__A, __imm, - (__v32hi)__W, (__mmask32)__U); -} - -__funline __m512i _mm512_maskz_shufflehi_epi16(__mmask32 __U, __m512i __A, - const int __imm) { - return (__m512i)__builtin_ia32_pshufhw512_mask( - (__v32hi)__A, __imm, (__v32hi)_mm512_setzero_si512(), (__mmask32)__U); -} - -__funline __m512i _mm512_shufflelo_epi16(__m512i __A, const int __imm) { - return (__m512i)__builtin_ia32_pshuflw512_mask( - (__v32hi)__A, __imm, (__v32hi)_mm512_setzero_si512(), (__mmask32)-1); -} - -__funline __m512i _mm512_mask_shufflelo_epi16(__m512i __W, __mmask32 __U, - __m512i __A, const int __imm) { - return (__m512i)__builtin_ia32_pshuflw512_mask((__v32hi)__A, __imm, - (__v32hi)__W, (__mmask32)__U); -} - -__funline __m512i _mm512_maskz_shufflelo_epi16(__mmask32 __U, __m512i __A, - const int __imm) { - return (__m512i)__builtin_ia32_pshuflw512_mask( - (__v32hi)__A, __imm, (__v32hi)_mm512_setzero_si512(), (__mmask32)__U); -} - -__funline __m512i _mm512_srai_epi16(__m512i __A, const int __imm) { - return (__m512i)__builtin_ia32_psrawi512_mask( - (__v32hi)__A, __imm, (__v32hi)_mm512_setzero_si512(), (__mmask32)-1); -} - -__funline __m512i _mm512_mask_srai_epi16(__m512i __W, __mmask32 __U, __m512i __A, - const int __imm) { - return (__m512i)__builtin_ia32_psrawi512_mask((__v32hi)__A, __imm, - (__v32hi)__W, (__mmask32)__U); -} - -__funline __m512i _mm512_maskz_srai_epi16(__mmask32 
__U, __m512i __A, - const int __imm) { - return (__m512i)__builtin_ia32_psrawi512_mask( - (__v32hi)__A, __imm, (__v32hi)_mm512_setzero_si512(), (__mmask32)__U); -} - -__funline __m512i _mm512_mask_blend_epi16(__mmask32 __U, __m512i __A, - __m512i __W) { - return (__m512i)__builtin_ia32_blendmw_512_mask((__v32hi)__A, (__v32hi)__W, - (__mmask32)__U); -} - -__funline __m512i _mm512_mask_blend_epi8(__mmask64 __U, __m512i __A, - __m512i __W) { - return (__m512i)__builtin_ia32_blendmb_512_mask((__v64qi)__A, (__v64qi)__W, - (__mmask64)__U); -} - -__funline __mmask32 _mm512_mask_cmp_epi16_mask(__mmask32 __U, __m512i __X, - __m512i __Y, const int __P) { - return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__X, (__v32hi)__Y, __P, - (__mmask32)__U); -} - -__funline __mmask32 _mm512_cmp_epi16_mask(__m512i __X, __m512i __Y, - const int __P) { - return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__X, (__v32hi)__Y, __P, - (__mmask32)-1); -} - -__funline __mmask64 _mm512_mask_cmp_epi8_mask(__mmask64 __U, __m512i __X, - __m512i __Y, const int __P) { - return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__X, (__v64qi)__Y, __P, - (__mmask64)__U); -} - -__funline __mmask64 _mm512_cmp_epi8_mask(__m512i __X, __m512i __Y, - const int __P) { - return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__X, (__v64qi)__Y, __P, - (__mmask64)-1); -} - -__funline __mmask32 _mm512_mask_cmp_epu16_mask(__mmask32 __U, __m512i __X, - __m512i __Y, const int __P) { - return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__X, (__v32hi)__Y, - __P, (__mmask32)__U); -} - -__funline __mmask32 _mm512_cmp_epu16_mask(__m512i __X, __m512i __Y, - const int __P) { - return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__X, (__v32hi)__Y, - __P, (__mmask32)-1); -} - -__funline __mmask64 _mm512_mask_cmp_epu8_mask(__mmask64 __U, __m512i __X, - __m512i __Y, const int __P) { - return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__X, (__v64qi)__Y, - __P, (__mmask64)__U); -} - -__funline __mmask64 _mm512_cmp_epu8_mask(__m512i __X, __m512i __Y, - const int __P) { - return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__X, (__v64qi)__Y, - __P, (__mmask64)-1); -} - -__funline __m512i _mm512_bslli_epi128(__m512i __A, const int __N) { - return (__m512i)__builtin_ia32_pslldq512(__A, __N * 8); -} - -__funline __m512i _mm512_bsrli_epi128(__m512i __A, const int __N) { - return (__m512i)__builtin_ia32_psrldq512(__A, __N * 8); -} - -#else -#define _kshiftli_mask32(X, Y) \ - ((__mmask32)__builtin_ia32_kshiftlisi((__mmask32)(X), (__mmask8)(Y))) - -#define _kshiftli_mask64(X, Y) \ - ((__mmask64)__builtin_ia32_kshiftlidi((__mmask64)(X), (__mmask8)(Y))) - -#define _kshiftri_mask32(X, Y) \ - ((__mmask32)__builtin_ia32_kshiftrisi((__mmask32)(X), (__mmask8)(Y))) - -#define _kshiftri_mask64(X, Y) \ - ((__mmask64)__builtin_ia32_kshiftridi((__mmask64)(X), (__mmask8)(Y))) - -#define _mm512_alignr_epi8(X, Y, N) \ - ((__m512i)__builtin_ia32_palignr512((__v8di)(__m512i)(X), \ - (__v8di)(__m512i)(Y), (int)(N * 8))) - -#define _mm512_mask_alignr_epi8(W, U, X, Y, N) \ - ((__m512i)__builtin_ia32_palignr512_mask( \ - (__v8di)(__m512i)(X), (__v8di)(__m512i)(Y), (int)(N * 8), \ - (__v8di)(__m512i)(W), (__mmask64)(U))) - -#define _mm512_maskz_alignr_epi8(U, X, Y, N) \ - ((__m512i)__builtin_ia32_palignr512_mask( \ - (__v8di)(__m512i)(X), (__v8di)(__m512i)(Y), (int)(N * 8), \ - (__v8di)(__m512i)_mm512_setzero_si512(), (__mmask64)(U))) - -#define _mm512_dbsad_epu8(X, Y, C) \ - ((__m512i)__builtin_ia32_dbpsadbw512_mask( \ - (__v64qi)(__m512i)(X), 
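The intrinsics in the #ifdef __OPTIMIZE__ branch take immediate operands, and without optimization GCC cannot fold an inline function's argument down to the constant the builtin requires, so the #else branch redefines them as macros. Either way the operand must be a compile-time constant, as in this sketch (not part of this patch, assuming -mavx512bw):

#include <immintrin.h>

/* Doubles each 16-bit lane; the shift count is a literal immediate. */
static __m512i double_words(__m512i v) {
  return _mm512_slli_epi16(v, 1);
}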
(__v64qi)(__m512i)(Y), (int)(C), \ - (__v32hi)(__m512i)_mm512_setzero_si512(), (__mmask32)-1)) - -#define _mm512_mask_dbsad_epu8(W, U, X, Y, C) \ - ((__m512i)__builtin_ia32_dbpsadbw512_mask( \ - (__v64qi)(__m512i)(X), (__v64qi)(__m512i)(Y), (int)(C), \ - (__v32hi)(__m512i)(W), (__mmask32)(U))) - -#define _mm512_maskz_dbsad_epu8(U, X, Y, C) \ - ((__m512i)__builtin_ia32_dbpsadbw512_mask( \ - (__v64qi)(__m512i)(X), (__v64qi)(__m512i)(Y), (int)(C), \ - (__v32hi)(__m512i)_mm512_setzero_si512(), (__mmask32)(U))) - -#define _mm512_srli_epi16(A, B) \ - ((__m512i)__builtin_ia32_psrlwi512_mask((__v32hi)(__m512i)(A), (int)(B), \ - (__v32hi)_mm512_setzero_si512(), \ - (__mmask32)-1)) - -#define _mm512_mask_srli_epi16(W, U, A, B) \ - ((__m512i)__builtin_ia32_psrlwi512_mask( \ - (__v32hi)(__m512i)(A), (int)(B), (__v32hi)(__m512i)(W), (__mmask32)(U))) - -#define _mm512_maskz_srli_epi16(U, A, B) \ - ((__m512i)__builtin_ia32_psrlwi512_mask((__v32hi)(__m512i)(A), (int)(B), \ - (__v32hi)_mm512_setzero_si512(), \ - (__mmask32)(U))) - -#define _mm512_slli_epi16(X, C) \ - ((__m512i)__builtin_ia32_psllwi512_mask( \ - (__v32hi)(__m512i)(X), (int)(C), \ - (__v32hi)(__m512i)_mm512_setzero_si512(), (__mmask32)-1)) - -#define _mm512_mask_slli_epi16(W, U, X, C) \ - ((__m512i)__builtin_ia32_psllwi512_mask( \ - (__v32hi)(__m512i)(X), (int)(C), (__v32hi)(__m512i)(W), (__mmask32)(U))) - -#define _mm512_maskz_slli_epi16(U, X, C) \ - ((__m512i)__builtin_ia32_psllwi512_mask( \ - (__v32hi)(__m512i)(X), (int)(C), \ - (__v32hi)(__m512i)_mm512_setzero_si512(), (__mmask32)(U))) - -#define _mm512_shufflehi_epi16(A, B) \ - ((__m512i)__builtin_ia32_pshufhw512_mask( \ - (__v32hi)(__m512i)(A), (int)(B), \ - (__v32hi)(__m512i)_mm512_setzero_si512(), (__mmask32)-1)) - -#define _mm512_mask_shufflehi_epi16(W, U, A, B) \ - ((__m512i)__builtin_ia32_pshufhw512_mask( \ - (__v32hi)(__m512i)(A), (int)(B), (__v32hi)(__m512i)(W), (__mmask32)(U))) - -#define _mm512_maskz_shufflehi_epi16(U, A, B) \ - ((__m512i)__builtin_ia32_pshufhw512_mask( \ - (__v32hi)(__m512i)(A), (int)(B), \ - (__v32hi)(__m512i)_mm512_setzero_si512(), (__mmask32)(U))) - -#define _mm512_shufflelo_epi16(A, B) \ - ((__m512i)__builtin_ia32_pshuflw512_mask( \ - (__v32hi)(__m512i)(A), (int)(B), \ - (__v32hi)(__m512i)_mm512_setzero_si512(), (__mmask32)-1)) - -#define _mm512_mask_shufflelo_epi16(W, U, A, B) \ - ((__m512i)__builtin_ia32_pshuflw512_mask( \ - (__v32hi)(__m512i)(A), (int)(B), (__v32hi)(__m512i)(W), (__mmask32)(U))) - -#define _mm512_maskz_shufflelo_epi16(U, A, B) \ - ((__m512i)__builtin_ia32_pshuflw512_mask( \ - (__v32hi)(__m512i)(A), (int)(B), \ - (__v32hi)(__m512i)_mm512_setzero_si512(), (__mmask32)(U))) - -#define _mm512_srai_epi16(A, B) \ - ((__m512i)__builtin_ia32_psrawi512_mask((__v32hi)(__m512i)(A), (int)(B), \ - (__v32hi)_mm512_setzero_si512(), \ - (__mmask32)-1)) - -#define _mm512_mask_srai_epi16(W, U, A, B) \ - ((__m512i)__builtin_ia32_psrawi512_mask( \ - (__v32hi)(__m512i)(A), (int)(B), (__v32hi)(__m512i)(W), (__mmask32)(U))) - -#define _mm512_maskz_srai_epi16(U, A, B) \ - ((__m512i)__builtin_ia32_psrawi512_mask((__v32hi)(__m512i)(A), (int)(B), \ - (__v32hi)_mm512_setzero_si512(), \ - (__mmask32)(U))) - -#define _mm512_mask_blend_epi16(__U, __A, __W) \ - ((__m512i)__builtin_ia32_blendmw_512_mask((__v32hi)(__A), (__v32hi)(__W), \ - (__mmask32)(__U))) - -#define _mm512_mask_blend_epi8(__U, __A, __W) \ - ((__m512i)__builtin_ia32_blendmb_512_mask((__v64qi)(__A), (__v64qi)(__W), \ - (__mmask64)(__U))) - -#define _mm512_cmp_epi16_mask(X, Y, P) \ - 
-  ((__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(X), \
-                                          (__v32hi)(__m512i)(Y), (int)(P), \
-                                          (__mmask32)(-1)))
-
-#define _mm512_cmp_epi8_mask(X, Y, P) \
-  ((__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(X), \
-                                          (__v64qi)(__m512i)(Y), (int)(P), \
-                                          (__mmask64)(-1)))
-
-#define _mm512_cmp_epu16_mask(X, Y, P) \
-  ((__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(X), \
-                                           (__v32hi)(__m512i)(Y), (int)(P), \
-                                           (__mmask32)(-1)))
-
-#define _mm512_cmp_epu8_mask(X, Y, P) \
-  ((__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(X), \
-                                           (__v64qi)(__m512i)(Y), (int)(P), \
-                                           (__mmask64)(-1)))
-
-#define _mm512_mask_cmp_epi16_mask(M, X, Y, P) \
-  ((__mmask32)__builtin_ia32_cmpw512_mask( \
-      (__v32hi)(__m512i)(X), (__v32hi)(__m512i)(Y), (int)(P), (__mmask32)(M)))
-
-#define _mm512_mask_cmp_epi8_mask(M, X, Y, P) \
-  ((__mmask64)__builtin_ia32_cmpb512_mask( \
-      (__v64qi)(__m512i)(X), (__v64qi)(__m512i)(Y), (int)(P), (__mmask64)(M)))
-
-#define _mm512_mask_cmp_epu16_mask(M, X, Y, P) \
-  ((__mmask32)__builtin_ia32_ucmpw512_mask( \
-      (__v32hi)(__m512i)(X), (__v32hi)(__m512i)(Y), (int)(P), (__mmask32)(M)))
-
-#define _mm512_mask_cmp_epu8_mask(M, X, Y, P) \
-  ((__mmask64)__builtin_ia32_ucmpb512_mask( \
-      (__v64qi)(__m512i)(X), (__v64qi)(__m512i)(Y), (int)(P), (__mmask64)(M)))
-
-#define _mm512_bslli_epi128(A, N) \
-  ((__m512i)__builtin_ia32_pslldq512((__m512i)(A), (int)(N)*8))
-
-#define _mm512_bsrli_epi128(A, N) \
-  ((__m512i)__builtin_ia32_psrldq512((__m512i)(A), (int)(N)*8))
-
 #endif
-
+typedef short __v32hi __attribute__ ((__vector_size__ (64)));
+typedef short __v32hi_u __attribute__ ((__vector_size__ (64), __may_alias__, __aligned__ (1)));
+typedef char __v64qi __attribute__ ((__vector_size__ (64)));
+typedef char __v64qi_u __attribute__ ((__vector_size__ (64), __may_alias__, __aligned__ (1)));
+typedef unsigned long long __mmask64;
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_ktest_mask32_u8 (__mmask32 __A, __mmask32 __B, unsigned char *__CF)
+{
+  *__CF = (unsigned char) __builtin_ia32_ktestcsi (__A, __B);
+  return (unsigned char) __builtin_ia32_ktestzsi (__A, __B);
+}
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_ktest_mask64_u8 (__mmask64 __A, __mmask64 __B, unsigned char *__CF)
+{
+  *__CF = (unsigned char) __builtin_ia32_ktestcdi (__A, __B);
+  return (unsigned char) __builtin_ia32_ktestzdi (__A, __B);
+}
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_ktestz_mask32_u8 (__mmask32 __A, __mmask32 __B)
+{
+  return (unsigned char) __builtin_ia32_ktestzsi (__A, __B);
+}
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_ktestz_mask64_u8 (__mmask64 __A, __mmask64 __B)
+{
+  return (unsigned char) __builtin_ia32_ktestzdi (__A, __B);
+}
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_ktestc_mask32_u8 (__mmask32 __A, __mmask32 __B)
+{
+  return (unsigned char) __builtin_ia32_ktestcsi (__A, __B);
+}
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_ktestc_mask64_u8 (__mmask64 __A, __mmask64 __B)
+{
+  return (unsigned char) __builtin_ia32_ktestcdi (__A, __B);
+}
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kortest_mask32_u8 (__mmask32 __A, __mmask32 __B, unsigned char *__CF)
+{
+  *__CF = (unsigned char)
__builtin_ia32_kortestcsi (__A, __B); + return (unsigned char) __builtin_ia32_kortestzsi (__A, __B); +} +extern __inline unsigned char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kortest_mask64_u8 (__mmask64 __A, __mmask64 __B, unsigned char *__CF) +{ + *__CF = (unsigned char) __builtin_ia32_kortestcdi (__A, __B); + return (unsigned char) __builtin_ia32_kortestzdi (__A, __B); +} +extern __inline unsigned char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kortestz_mask32_u8 (__mmask32 __A, __mmask32 __B) +{ + return (unsigned char) __builtin_ia32_kortestzsi (__A, __B); +} +extern __inline unsigned char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kortestz_mask64_u8 (__mmask64 __A, __mmask64 __B) +{ + return (unsigned char) __builtin_ia32_kortestzdi (__A, __B); +} +extern __inline unsigned char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kortestc_mask32_u8 (__mmask32 __A, __mmask32 __B) +{ + return (unsigned char) __builtin_ia32_kortestcsi (__A, __B); +} +extern __inline unsigned char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kortestc_mask64_u8 (__mmask64 __A, __mmask64 __B) +{ + return (unsigned char) __builtin_ia32_kortestcdi (__A, __B); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kadd_mask32 (__mmask32 __A, __mmask32 __B) +{ + return (__mmask32) __builtin_ia32_kaddsi ((__mmask32) __A, (__mmask32) __B); +} +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kadd_mask64 (__mmask64 __A, __mmask64 __B) +{ + return (__mmask64) __builtin_ia32_kadddi ((__mmask64) __A, (__mmask64) __B); +} +extern __inline unsigned int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_cvtmask32_u32 (__mmask32 __A) +{ + return (unsigned int) __builtin_ia32_kmovd ((__mmask32) __A); +} +extern __inline unsigned long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_cvtmask64_u64 (__mmask64 __A) +{ + return (unsigned long long) __builtin_ia32_kmovq ((__mmask64) __A); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_cvtu32_mask32 (unsigned int __A) +{ + return (__mmask32) __builtin_ia32_kmovd ((__mmask32) __A); +} +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_cvtu64_mask64 (unsigned long long __A) +{ + return (__mmask64) __builtin_ia32_kmovq ((__mmask64) __A); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_load_mask32 (__mmask32 *__A) +{ + return (__mmask32) __builtin_ia32_kmovd (*__A); +} +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_load_mask64 (__mmask64 *__A) +{ + return (__mmask64) __builtin_ia32_kmovq (*(__mmask64 *) __A); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_store_mask32 (__mmask32 *__A, __mmask32 __B) +{ + *(__mmask32 *) __A = __builtin_ia32_kmovd (__B); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_store_mask64 (__mmask64 *__A, __mmask64 __B) +{ + *(__mmask64 *) __A = __builtin_ia32_kmovq (__B); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_knot_mask32 (__mmask32 __A) +{ + return (__mmask32) __builtin_ia32_knotsi ((__mmask32) __A); +} +extern __inline __mmask64 
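The k-register helpers above (_kortestz_mask64_u8 and friends) let mask results be combined and tested without a round trip through a general-purpose register. A minimal sketch (not part of this patch), assuming -mavx512bw and a hypothetical has_either helper:

#include <immintrin.h>

/* True when any byte of the block equals a or b; the OR and the zero
   test both happen in mask registers via KORTEST. */
static int has_either(__m512i block, char a, char b) {
  __mmask64 ma = _mm512_cmpeq_epi8_mask(block, _mm512_set1_epi8(a));
  __mmask64 mb = _mm512_cmpeq_epi8_mask(block, _mm512_set1_epi8(b));
  return !_kortestz_mask64_u8(ma, mb);
}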
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_knot_mask64 (__mmask64 __A) +{ + return (__mmask64) __builtin_ia32_knotdi ((__mmask64) __A); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kor_mask32 (__mmask32 __A, __mmask32 __B) +{ + return (__mmask32) __builtin_ia32_korsi ((__mmask32) __A, (__mmask32) __B); +} +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kor_mask64 (__mmask64 __A, __mmask64 __B) +{ + return (__mmask64) __builtin_ia32_kordi ((__mmask64) __A, (__mmask64) __B); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kxnor_mask32 (__mmask32 __A, __mmask32 __B) +{ + return (__mmask32) __builtin_ia32_kxnorsi ((__mmask32) __A, (__mmask32) __B); +} +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kxnor_mask64 (__mmask64 __A, __mmask64 __B) +{ + return (__mmask64) __builtin_ia32_kxnordi ((__mmask64) __A, (__mmask64) __B); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kxor_mask32 (__mmask32 __A, __mmask32 __B) +{ + return (__mmask32) __builtin_ia32_kxorsi ((__mmask32) __A, (__mmask32) __B); +} +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kxor_mask64 (__mmask64 __A, __mmask64 __B) +{ + return (__mmask64) __builtin_ia32_kxordi ((__mmask64) __A, (__mmask64) __B); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kand_mask32 (__mmask32 __A, __mmask32 __B) +{ + return (__mmask32) __builtin_ia32_kandsi ((__mmask32) __A, (__mmask32) __B); +} +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kand_mask64 (__mmask64 __A, __mmask64 __B) +{ + return (__mmask64) __builtin_ia32_kanddi ((__mmask64) __A, (__mmask64) __B); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kandn_mask32 (__mmask32 __A, __mmask32 __B) +{ + return (__mmask32) __builtin_ia32_kandnsi ((__mmask32) __A, (__mmask32) __B); +} +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kandn_mask64 (__mmask64 __A, __mmask64 __B) +{ + return (__mmask64) __builtin_ia32_kandndi ((__mmask64) __A, (__mmask64) __B); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_mov_epi16 (__m512i __W, __mmask32 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_movdquhi512_mask ((__v32hi) __A, + (__v32hi) __W, + (__mmask32) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_mov_epi16 (__mmask32 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_movdquhi512_mask ((__v32hi) __A, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_loadu_epi16 (void const *__P) +{ + return (__m512i) (*(__v32hi_u *) __P); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_loadu_epi16 (__m512i __W, __mmask32 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_loaddquhi512_mask ((const short *) __P, + (__v32hi) __W, + (__mmask32) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm512_maskz_loadu_epi16 (__mmask32 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_loaddquhi512_mask ((const short *) __P, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_storeu_epi16 (void *__P, __m512i __A) +{ + *(__v32hi_u *) __P = (__v32hi_u) __A; +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_storeu_epi16 (void *__P, __mmask32 __U, __m512i __A) +{ + __builtin_ia32_storedquhi512_mask ((short *) __P, + (__v32hi) __A, + (__mmask32) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_mov_epi8 (__m512i __W, __mmask64 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_movdquqi512_mask ((__v64qi) __A, + (__v64qi) __W, + (__mmask64) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_mov_epi8 (__mmask64 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_movdquqi512_mask ((__v64qi) __A, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) __U); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_kunpackw (__mmask32 __A, __mmask32 __B) +{ + return (__mmask32) __builtin_ia32_kunpcksi ((__mmask32) __A, + (__mmask32) __B); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kunpackw_mask32 (__mmask16 __A, __mmask16 __B) +{ + return (__mmask32) __builtin_ia32_kunpcksi ((__mmask32) __A, + (__mmask32) __B); +} +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_kunpackd (__mmask64 __A, __mmask64 __B) +{ + return (__mmask64) __builtin_ia32_kunpckdi ((__mmask64) __A, + (__mmask64) __B); +} +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kunpackd_mask64 (__mmask32 __A, __mmask32 __B) +{ + return (__mmask64) __builtin_ia32_kunpckdi ((__mmask64) __A, + (__mmask64) __B); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_loadu_epi8 (void const *__P) +{ + return (__m512i) (*(__v64qi_u *) __P); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_loadu_epi8 (__m512i __W, __mmask64 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_loaddquqi512_mask ((const char *) __P, + (__v64qi) __W, + (__mmask64) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_loadu_epi8 (__mmask64 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_loaddquqi512_mask ((const char *) __P, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) __U); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_storeu_epi8 (void *__P, __m512i __A) +{ + *(__v64qi_u *) __P = (__v64qi_u) __A; +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_storeu_epi8 (void *__P, __mmask64 __U, __m512i __A) +{ + __builtin_ia32_storedquqi512_mask ((char *) __P, + (__v64qi) __A, + (__mmask64) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sad_epu8 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_psadbw512 ((__v64qi) __A, + (__v64qi) __B); +} +extern __inline __m256i +__attribute__ 
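The masked loadu/storeu forms above make buffer tails safe: lanes whose mask bit is clear are neither read nor written. A sketch of a sub-64-byte copy (not part of this patch), assuming -mavx512bw; copy_tail is a hypothetical helper:

#include <immintrin.h>
#include <stddef.h>

/* Copies n (up to 64) bytes with one masked load/store pair. */
static void copy_tail(void *dst, const void *src, size_t n) {
  __mmask64 m = _cvtu64_mask64(n >= 64 ? ~0ULL : (1ULL << n) - 1);
  _mm512_mask_storeu_epi8(dst, m, _mm512_maskz_loadu_epi8(m, src));
}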
((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepi16_epi8 (__m512i __A) +{ + return (__m256i) __builtin_ia32_pmovwb512_mask ((__v32hi) __A, + (__v32qi) _mm256_undefined_si256(), + (__mmask32) -1); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepi16_storeu_epi8 (void * __P, __mmask32 __M, __m512i __A) +{ + __builtin_ia32_pmovwb512mem_mask ((__v32qi *) __P, (__v32hi) __A, __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepi16_epi8 (__m256i __O, __mmask32 __M, __m512i __A) +{ + return (__m256i) __builtin_ia32_pmovwb512_mask ((__v32hi) __A, + (__v32qi) __O, __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepi16_epi8 (__mmask32 __M, __m512i __A) +{ + return (__m256i) __builtin_ia32_pmovwb512_mask ((__v32hi) __A, + (__v32qi) + _mm256_setzero_si256 (), + __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtsepi16_epi8 (__m512i __A) +{ + return (__m256i) __builtin_ia32_pmovswb512_mask ((__v32hi) __A, + (__v32qi)_mm256_undefined_si256(), + (__mmask32) -1); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtsepi16_storeu_epi8 (void * __P, __mmask32 __M, __m512i __A) +{ + __builtin_ia32_pmovswb512mem_mask ((__v32qi *) __P, (__v32hi) __A, __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtsepi16_epi8 (__m256i __O, __mmask32 __M, __m512i __A) +{ + return (__m256i) __builtin_ia32_pmovswb512_mask ((__v32hi) __A, + (__v32qi)__O, + __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtsepi16_epi8 (__mmask32 __M, __m512i __A) +{ + return (__m256i) __builtin_ia32_pmovswb512_mask ((__v32hi) __A, + (__v32qi) + _mm256_setzero_si256 (), + __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtusepi16_epi8 (__m512i __A) +{ + return (__m256i) __builtin_ia32_pmovuswb512_mask ((__v32hi) __A, + (__v32qi)_mm256_undefined_si256(), + (__mmask32) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtusepi16_epi8 (__m256i __O, __mmask32 __M, __m512i __A) +{ + return (__m256i) __builtin_ia32_pmovuswb512_mask ((__v32hi) __A, + (__v32qi) __O, + __M); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtusepi16_storeu_epi8 (void * __P, __mmask32 __M, __m512i __A) +{ + __builtin_ia32_pmovuswb512mem_mask ((__v32qi *) __P, (__v32hi) __A, __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtusepi16_epi8 (__mmask32 __M, __m512i __A) +{ + return (__m256i) __builtin_ia32_pmovuswb512_mask ((__v32hi) __A, + (__v32qi) + _mm256_setzero_si256 (), + __M); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_broadcastb_epi8 (__m128i __A) +{ + return (__m512i) __builtin_ia32_pbroadcastb512_mask ((__v16qi) __A, + (__v64qi)_mm512_undefined_epi32(), + (__mmask64) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_broadcastb_epi8 (__m512i __O, __mmask64 __M, __m128i __A) +{ + return (__m512i) 
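Unlike the packs/packus instructions, which interleave their two sources 128 bits at a time, the cvt*epi16_epi8 family above narrows in element order; the s/us variants saturate instead of truncating. A minimal sketch (not part of this patch), assuming -mavx512bw:

#include <immintrin.h>

/* VPMOVSWB: clamp 32 signed words into [-128, 127] and emit 32 bytes. */
static __m256i shorts_to_bytes_sat(__m512i words) {
  return _mm512_cvtsepi16_epi8(words);
}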
__builtin_ia32_pbroadcastb512_mask ((__v16qi) __A, + (__v64qi) __O, + __M); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_broadcastb_epi8 (__mmask64 __M, __m128i __A) +{ + return (__m512i) __builtin_ia32_pbroadcastb512_mask ((__v16qi) __A, + (__v64qi) + _mm512_setzero_si512 (), + __M); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_set1_epi8 (__m512i __O, __mmask64 __M, char __A) +{ + return (__m512i) __builtin_ia32_pbroadcastb512_gpr_mask (__A, + (__v64qi) __O, + __M); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_set1_epi8 (__mmask64 __M, char __A) +{ + return (__m512i) + __builtin_ia32_pbroadcastb512_gpr_mask (__A, + (__v64qi) + _mm512_setzero_si512 (), + __M); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_broadcastw_epi16 (__m128i __A) +{ + return (__m512i) __builtin_ia32_pbroadcastw512_mask ((__v8hi) __A, + (__v32hi)_mm512_undefined_epi32(), + (__mmask32) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_broadcastw_epi16 (__m512i __O, __mmask32 __M, __m128i __A) +{ + return (__m512i) __builtin_ia32_pbroadcastw512_mask ((__v8hi) __A, + (__v32hi) __O, + __M); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_broadcastw_epi16 (__mmask32 __M, __m128i __A) +{ + return (__m512i) __builtin_ia32_pbroadcastw512_mask ((__v8hi) __A, + (__v32hi) + _mm512_setzero_si512 (), + __M); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_set1_epi16 (__m512i __O, __mmask32 __M, short __A) +{ + return (__m512i) __builtin_ia32_pbroadcastw512_gpr_mask (__A, + (__v32hi) __O, + __M); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_set1_epi16 (__mmask32 __M, short __A) +{ + return (__m512i) + __builtin_ia32_pbroadcastw512_gpr_mask (__A, + (__v32hi) + _mm512_setzero_si512 (), + __M); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mulhrs_epi16 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmulhrsw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_mulhrs_epi16 (__m512i __W, __mmask32 __U, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_pmulhrsw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) __W, + (__mmask32) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_mulhrs_epi16 (__mmask32 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmulhrsw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mulhi_epi16 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmulhw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_mulhi_epi16 (__m512i __W, __mmask32 __U, __m512i __A, + 
__m512i __B) +{ + return (__m512i) __builtin_ia32_pmulhw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) __W, + (__mmask32) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_mulhi_epi16 (__mmask32 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmulhw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mulhi_epu16 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmulhuw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_mulhi_epu16 (__m512i __W, __mmask32 __U, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_pmulhuw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) __W, + (__mmask32) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_mulhi_epu16 (__mmask32 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmulhuw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mullo_epi16 (__m512i __A, __m512i __B) +{ + return (__m512i) ((__v32hu) __A * (__v32hu) __B); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_mullo_epi16 (__m512i __W, __mmask32 __U, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_pmullw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) __W, + (__mmask32) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_mullo_epi16 (__mmask32 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmullw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepi8_epi16 (__m256i __A) +{ + return (__m512i) __builtin_ia32_pmovsxbw512_mask ((__v32qi) __A, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepi8_epi16 (__m512i __W, __mmask32 __U, __m256i __A) +{ + return (__m512i) __builtin_ia32_pmovsxbw512_mask ((__v32qi) __A, + (__v32hi) __W, + (__mmask32) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepi8_epi16 (__mmask32 __U, __m256i __A) +{ + return (__m512i) __builtin_ia32_pmovsxbw512_mask ((__v32qi) __A, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepu8_epi16 (__m256i __A) +{ + return (__m512i) __builtin_ia32_pmovzxbw512_mask ((__v32qi) __A, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepu8_epi16 (__m512i __W, __mmask32 __U, __m256i __A) +{ + return (__m512i) __builtin_ia32_pmovzxbw512_mask ((__v32qi) __A, + (__v32hi) __W, + (__mmask32) __U); +} +extern __inline __m512i +__attribute__ 
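_mm512_mulhrs_epi16 above computes (a*b + 0x4000) >> 15 per lane, i.e. a rounded Q15 fixed-point product, while mulhi/mullo return the plain high and low halves of the 32-bit product. A sketch (not part of this patch), assuming -mavx512bw:

#include <immintrin.h>

/* Rounded Q15 multiply: inputs and result are 1.15 fixed point. */
static __m512i q15_mul(__m512i a, __m512i b) {
  return _mm512_mulhrs_epi16(a, b);
}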
((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepu8_epi16 (__mmask32 __U, __m256i __A) +{ + return (__m512i) __builtin_ia32_pmovzxbw512_mask ((__v32qi) __A, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_permutexvar_epi16 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_permvarhi512_mask ((__v32hi) __B, + (__v32hi) __A, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_permutexvar_epi16 (__mmask32 __M, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_permvarhi512_mask ((__v32hi) __B, + (__v32hi) __A, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __M); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_permutexvar_epi16 (__m512i __W, __mmask32 __M, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_permvarhi512_mask ((__v32hi) __B, + (__v32hi) __A, + (__v32hi) __W, + (__mmask32) __M); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_permutex2var_epi16 (__m512i __A, __m512i __I, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpermt2varhi512_mask ((__v32hi) __I + , + (__v32hi) __A, + (__v32hi) __B, + (__mmask32) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_permutex2var_epi16 (__m512i __A, __mmask32 __U, + __m512i __I, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpermt2varhi512_mask ((__v32hi) __I + , + (__v32hi) __A, + (__v32hi) __B, + (__mmask32) + __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask2_permutex2var_epi16 (__m512i __A, __m512i __I, + __mmask32 __U, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpermi2varhi512_mask ((__v32hi) __A, + (__v32hi) __I + , + (__v32hi) __B, + (__mmask32) + __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_permutex2var_epi16 (__mmask32 __U, __m512i __A, + __m512i __I, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpermt2varhi512_maskz ((__v32hi) __I + , + (__v32hi) __A, + (__v32hi) __B, + (__mmask32) + __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_avg_epu8 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pavgb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_avg_epu8 (__m512i __W, __mmask64 __U, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_pavgb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) __W, + (__mmask64) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_avg_epu8 (__mmask64 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pavgb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_add_epi8 (__m512i __A, __m512i __B) +{ + return (__m512i) ((__v64qu) __A + (__v64qu) __B); +} +extern __inline __m512i +__attribute__ 
((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_add_epi8 (__m512i __W, __mmask64 __U, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_paddb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) __W, + (__mmask64) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_add_epi8 (__mmask64 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_paddb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sub_epi8 (__m512i __A, __m512i __B) +{ + return (__m512i) ((__v64qu) __A - (__v64qu) __B); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sub_epi8 (__m512i __W, __mmask64 __U, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_psubb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) __W, + (__mmask64) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sub_epi8 (__mmask64 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_psubb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_avg_epu16 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pavgw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_avg_epu16 (__m512i __W, __mmask32 __U, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_pavgw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) __W, + (__mmask32) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_avg_epu16 (__mmask32 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pavgw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_subs_epi8 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_psubsb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_subs_epi8 (__m512i __W, __mmask64 __U, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_psubsb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) __W, + (__mmask64) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_subs_epi8 (__mmask64 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_psubsb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_subs_epu8 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_psubusb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_subs_epu8 (__m512i 
__W, __mmask64 __U, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_psubusb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) __W, + (__mmask64) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_subs_epu8 (__mmask64 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_psubusb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_adds_epi8 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_paddsb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_adds_epi8 (__m512i __W, __mmask64 __U, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_paddsb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) __W, + (__mmask64) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_adds_epi8 (__mmask64 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_paddsb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_adds_epu8 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_paddusb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_adds_epu8 (__m512i __W, __mmask64 __U, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_paddusb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) __W, + (__mmask64) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_adds_epu8 (__mmask64 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_paddusb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sub_epi16 (__m512i __A, __m512i __B) +{ + return (__m512i) ((__v32hu) __A - (__v32hu) __B); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sub_epi16 (__m512i __W, __mmask32 __U, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_psubw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) __W, + (__mmask32) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sub_epi16 (__mmask32 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_psubw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_subs_epi16 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_psubsw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_subs_epi16 (__m512i __W, __mmask32 __U, __m512i __A, + __m512i __B) +{ + return (__m512i) 
__builtin_ia32_psubsw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) __W, + (__mmask32) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_subs_epi16 (__mmask32 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_psubsw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_subs_epu16 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_psubusw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_subs_epu16 (__m512i __W, __mmask32 __U, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_psubusw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) __W, + (__mmask32) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_subs_epu16 (__mmask32 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_psubusw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_add_epi16 (__m512i __A, __m512i __B) +{ + return (__m512i) ((__v32hu) __A + (__v32hu) __B); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_add_epi16 (__m512i __W, __mmask32 __U, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_paddw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) __W, + (__mmask32) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_add_epi16 (__mmask32 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_paddw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_adds_epi16 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_paddsw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_adds_epi16 (__m512i __W, __mmask32 __U, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_paddsw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) __W, + (__mmask32) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_adds_epi16 (__mmask32 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_paddsw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_adds_epu16 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_paddusw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_adds_epu16 (__m512i __W, __mmask32 __U, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_paddusw512_mask ((__v32hi) __A, + (__v32hi) __B, + 
(__v32hi) __W, + (__mmask32) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_adds_epu16 (__mmask32 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_paddusw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_srl_epi16 (__m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_psrlw512_mask ((__v32hi) __A, + (__v8hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_srl_epi16 (__m512i __W, __mmask32 __U, __m512i __A, + __m128i __B) +{ + return (__m512i) __builtin_ia32_psrlw512_mask ((__v32hi) __A, + (__v8hi) __B, + (__v32hi) __W, + (__mmask32) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_srl_epi16 (__mmask32 __U, __m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_psrlw512_mask ((__v32hi) __A, + (__v8hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_packs_epi16 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_packsswb512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sll_epi16 (__m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_psllw512_mask ((__v32hi) __A, + (__v8hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sll_epi16 (__m512i __W, __mmask32 __U, __m512i __A, + __m128i __B) +{ + return (__m512i) __builtin_ia32_psllw512_mask ((__v32hi) __A, + (__v8hi) __B, + (__v32hi) __W, + (__mmask32) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sll_epi16 (__mmask32 __U, __m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_psllw512_mask ((__v32hi) __A, + (__v8hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maddubs_epi16 (__m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_pmaddubsw512_mask ((__v64qi) __X, + (__v64qi) __Y, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_maddubs_epi16 (__m512i __W, __mmask32 __U, __m512i __X, + __m512i __Y) +{ + return (__m512i) __builtin_ia32_pmaddubsw512_mask ((__v64qi) __X, + (__v64qi) __Y, + (__v32hi) __W, + (__mmask32) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_maddubs_epi16 (__mmask32 __U, __m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_pmaddubsw512_mask ((__v64qi) __X, + (__v64qi) __Y, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_madd_epi16 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmaddwd512_mask ((__v32hi) __A, + (__v32hi) __B, + 
(__v16si) _mm512_setzero_si512 (), (__mmask16) -1);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_madd_epi16 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaddwd512_mask ((__v32hi) __A, (__v32hi) __B, (__v16si) __W, (__mmask16) __U);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_madd_epi16 (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaddwd512_mask ((__v32hi) __A, (__v32hi) __B, (__v16si) _mm512_setzero_si512 (), (__mmask16) __U);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_unpackhi_epi8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_punpckhbw512_mask ((__v64qi) __A, (__v64qi) __B, (__v64qi) _mm512_setzero_si512 (), (__mmask64) -1);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_unpackhi_epi8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_punpckhbw512_mask ((__v64qi) __A, (__v64qi) __B, (__v64qi) __W, (__mmask64) __U);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_unpackhi_epi8 (__mmask64 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_punpckhbw512_mask ((__v64qi) __A, (__v64qi) __B, (__v64qi) _mm512_setzero_si512 (), (__mmask64) __U);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_unpackhi_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_punpckhwd512_mask ((__v32hi) __A, (__v32hi) __B, (__v32hi) _mm512_setzero_si512 (), (__mmask32) -1);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_unpackhi_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_punpckhwd512_mask ((__v32hi) __A, (__v32hi) __B, (__v32hi) __W, (__mmask32) __U);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_unpackhi_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_punpckhwd512_mask ((__v32hi) __A, (__v32hi) __B, (__v32hi) _mm512_setzero_si512 (), (__mmask32) __U);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_unpacklo_epi8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_punpcklbw512_mask ((__v64qi) __A, (__v64qi) __B, (__v64qi) _mm512_setzero_si512 (), (__mmask64) -1);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_unpacklo_epi8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_punpcklbw512_mask ((__v64qi) __A, (__v64qi) __B, (__v64qi) __W, (__mmask64) __U);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_unpacklo_epi8 (__mmask64 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_punpcklbw512_mask ((__v64qi) __A, (__v64qi) __B, (__v64qi) _mm512_setzero_si512 (), (__mmask64) __U);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_unpacklo_epi16 (__m512i __A, __m512i __B)
+{
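+  /* [editor's sketch, not part of the vendored GCC header] The unpack
+     intrinsics interleave elements from the low (unpacklo) or high
+     (unpackhi) half of each 128-bit lane, i.e. the SSE2 pattern widened
+     to four lanes. A hedged usage example, assuming AVX512BW is enabled:
+       __m512i a  = _mm512_set1_epi16 (1);
+       __m512i b  = _mm512_set1_epi16 (2);
+       __m512i lo = _mm512_unpacklo_epi16 (a, b);  // 1,2,1,2,... per lane
+  */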
+ return (__m512i) __builtin_ia32_punpcklwd512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_unpacklo_epi16 (__m512i __W, __mmask32 __U, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_punpcklwd512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) __W, + (__mmask32) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_unpacklo_epi16 (__mmask32 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_punpcklwd512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); +} +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpeq_epu8_mask (__m512i __A, __m512i __B) +{ + return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __A, + (__v64qi) __B, 0, + (__mmask64) -1); +} +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpeq_epi8_mask (__m512i __A, __m512i __B) +{ + return (__mmask64) __builtin_ia32_pcmpeqb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__mmask64) -1); +} +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpeq_epu8_mask (__mmask64 __U, __m512i __A, __m512i __B) +{ + return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __A, + (__v64qi) __B, 0, + __U); +} +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpeq_epi8_mask (__mmask64 __U, __m512i __A, __m512i __B) +{ + return (__mmask64) __builtin_ia32_pcmpeqb512_mask ((__v64qi) __A, + (__v64qi) __B, + __U); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpeq_epu16_mask (__m512i __A, __m512i __B) +{ + return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __A, + (__v32hi) __B, 0, + (__mmask32) -1); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpeq_epi16_mask (__m512i __A, __m512i __B) +{ + return (__mmask32) __builtin_ia32_pcmpeqw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__mmask32) -1); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpeq_epu16_mask (__mmask32 __U, __m512i __A, __m512i __B) +{ + return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __A, + (__v32hi) __B, 0, + __U); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpeq_epi16_mask (__mmask32 __U, __m512i __A, __m512i __B) +{ + return (__mmask32) __builtin_ia32_pcmpeqw512_mask ((__v32hi) __A, + (__v32hi) __B, + __U); +} +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpgt_epu8_mask (__m512i __A, __m512i __B) +{ + return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __A, + (__v64qi) __B, 6, + (__mmask64) -1); +} +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpgt_epi8_mask (__m512i __A, __m512i __B) +{ + return (__mmask64) __builtin_ia32_pcmpgtb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__mmask64) -1); +} +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpgt_epu8_mask (__mmask64 __U, __m512i __A, __m512i __B) +{ + 
return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __A, + (__v64qi) __B, 6, + __U); +} +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpgt_epi8_mask (__mmask64 __U, __m512i __A, __m512i __B) +{ + return (__mmask64) __builtin_ia32_pcmpgtb512_mask ((__v64qi) __A, + (__v64qi) __B, + __U); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpgt_epu16_mask (__m512i __A, __m512i __B) +{ + return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __A, + (__v32hi) __B, 6, + (__mmask32) -1); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpgt_epi16_mask (__m512i __A, __m512i __B) +{ + return (__mmask32) __builtin_ia32_pcmpgtw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__mmask32) -1); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpgt_epu16_mask (__mmask32 __U, __m512i __A, __m512i __B) +{ + return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __A, + (__v32hi) __B, 6, + __U); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpgt_epi16_mask (__mmask32 __U, __m512i __A, __m512i __B) +{ + return (__mmask32) __builtin_ia32_pcmpgtw512_mask ((__v32hi) __A, + (__v32hi) __B, + __U); +} +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_movepi8_mask (__m512i __A) +{ + return (__mmask64) __builtin_ia32_cvtb2mask512 ((__v64qi) __A); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_movepi16_mask (__m512i __A) +{ + return (__mmask32) __builtin_ia32_cvtw2mask512 ((__v32hi) __A); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_movm_epi8 (__mmask64 __A) +{ + return (__m512i) __builtin_ia32_cvtmask2b512 (__A); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_movm_epi16 (__mmask32 __A) +{ + return (__m512i) __builtin_ia32_cvtmask2w512 (__A); +} +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_test_epi8_mask (__m512i __A, __m512i __B) +{ + return (__mmask64) __builtin_ia32_ptestmb512 ((__v64qi) __A, + (__v64qi) __B, + (__mmask64) -1); +} +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_test_epi8_mask (__mmask64 __U, __m512i __A, __m512i __B) +{ + return (__mmask64) __builtin_ia32_ptestmb512 ((__v64qi) __A, + (__v64qi) __B, __U); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_test_epi16_mask (__m512i __A, __m512i __B) +{ + return (__mmask32) __builtin_ia32_ptestmw512 ((__v32hi) __A, + (__v32hi) __B, + (__mmask32) -1); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_test_epi16_mask (__mmask32 __U, __m512i __A, __m512i __B) +{ + return (__mmask32) __builtin_ia32_ptestmw512 ((__v32hi) __A, + (__v32hi) __B, __U); +} +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_testn_epi8_mask (__m512i __A, __m512i __B) +{ + return (__mmask64) __builtin_ia32_ptestnmb512 ((__v64qi) __A, + (__v64qi) __B, + (__mmask64) -1); +} +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, 
__artificial__)) +_mm512_mask_testn_epi8_mask (__mmask64 __U, __m512i __A, __m512i __B) +{ + return (__mmask64) __builtin_ia32_ptestnmb512 ((__v64qi) __A, + (__v64qi) __B, __U); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_testn_epi16_mask (__m512i __A, __m512i __B) +{ + return (__mmask32) __builtin_ia32_ptestnmw512 ((__v32hi) __A, + (__v32hi) __B, + (__mmask32) -1); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_testn_epi16_mask (__mmask32 __U, __m512i __A, __m512i __B) +{ + return (__mmask32) __builtin_ia32_ptestnmw512 ((__v32hi) __A, + (__v32hi) __B, __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_shuffle_epi8 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pshufb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_shuffle_epi8 (__m512i __W, __mmask64 __U, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_pshufb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) __W, + (__mmask64) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_shuffle_epi8 (__mmask64 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pshufb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_min_epu16 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pminuw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_min_epu16 (__mmask32 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pminuw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __M); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_min_epu16 (__m512i __W, __mmask32 __M, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_pminuw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) __W, + (__mmask32) __M); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_min_epi16 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pminsw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_min_epi16 (__mmask32 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pminsw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __M); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_min_epi16 (__m512i __W, __mmask32 __M, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_pminsw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) __W, + (__mmask32) __M); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_max_epu8 (__m512i __A, __m512i __B) +{ + return (__m512i) 
__builtin_ia32_pmaxub512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_max_epu8 (__mmask64 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmaxub512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) __M); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_max_epu8 (__m512i __W, __mmask64 __M, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_pmaxub512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) __W, + (__mmask64) __M); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_max_epi8 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmaxsb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_max_epi8 (__mmask64 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmaxsb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) __M); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_max_epi8 (__m512i __W, __mmask64 __M, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_pmaxsb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) __W, + (__mmask64) __M); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_min_epu8 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pminub512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_min_epu8 (__mmask64 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pminub512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) __M); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_min_epu8 (__m512i __W, __mmask64 __M, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_pminub512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) __W, + (__mmask64) __M); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_min_epi8 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pminsb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_min_epi8 (__mmask64 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pminsb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) __M); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_min_epi8 (__m512i __W, __mmask64 __M, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_pminsb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) __W, + (__mmask64) __M); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_max_epi16 (__m512i __A, __m512i __B) +{ + return (__m512i) 
__builtin_ia32_pmaxsw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_max_epi16 (__mmask32 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmaxsw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __M); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_max_epi16 (__m512i __W, __mmask32 __M, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_pmaxsw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) __W, + (__mmask32) __M); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_max_epu16 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmaxuw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_max_epu16 (__mmask32 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmaxuw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __M); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_max_epu16 (__m512i __W, __mmask32 __M, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_pmaxuw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) __W, + (__mmask32) __M); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sra_epi16 (__m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_psraw512_mask ((__v32hi) __A, + (__v8hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sra_epi16 (__m512i __W, __mmask32 __U, __m512i __A, + __m128i __B) +{ + return (__m512i) __builtin_ia32_psraw512_mask ((__v32hi) __A, + (__v8hi) __B, + (__v32hi) __W, + (__mmask32) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sra_epi16 (__mmask32 __U, __m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_psraw512_mask ((__v32hi) __A, + (__v8hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_srav_epi16 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_psrav32hi_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_srav_epi16 (__m512i __W, __mmask32 __U, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_psrav32hi_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) __W, + (__mmask32) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_srav_epi16 (__mmask32 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_psrav32hi_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_srlv_epi16 (__m512i __A, __m512i __B) +{ + return 
(__m512i) __builtin_ia32_psrlv32hi_mask ((__v32hi) __A, (__v32hi) __B, (__v32hi) _mm512_setzero_si512 (), (__mmask32) -1);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_srlv_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psrlv32hi_mask ((__v32hi) __A, (__v32hi) __B, (__v32hi) __W, (__mmask32) __U);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_srlv_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psrlv32hi_mask ((__v32hi) __A, (__v32hi) __B, (__v32hi) _mm512_setzero_si512 (), (__mmask32) __U);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sllv_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psllv32hi_mask ((__v32hi) __A, (__v32hi) __B, (__v32hi) _mm512_setzero_si512 (), (__mmask32) -1);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sllv_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psllv32hi_mask ((__v32hi) __A, (__v32hi) __B, (__v32hi) __W, (__mmask32) __U);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sllv_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psllv32hi_mask ((__v32hi) __A, (__v32hi) __B, (__v32hi) _mm512_setzero_si512 (), (__mmask32) __U);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_packs_epi16 (__m512i __W, __mmask64 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packsswb512_mask ((__v32hi) __A, (__v32hi) __B, (__v64qi) __W, (__mmask64) __M);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_packs_epi16 (__mmask64 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packsswb512_mask ((__v32hi) __A, (__v32hi) __B, (__v64qi) _mm512_setzero_si512 (), __M);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_packus_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packuswb512_mask ((__v32hi) __A, (__v32hi) __B, (__v64qi) _mm512_setzero_si512 (), (__mmask64) -1);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_packus_epi16 (__m512i __W, __mmask64 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packuswb512_mask ((__v32hi) __A, (__v32hi) __B, (__v64qi) __W, (__mmask64) __M);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_packus_epi16 (__mmask64 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packuswb512_mask ((__v32hi) __A, (__v32hi) __B, (__v64qi) _mm512_setzero_si512 (), (__mmask64) __M);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_abs_epi8 (__m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsb512_mask ((__v64qi) __A, (__v64qi) _mm512_setzero_si512 (), (__mmask64) -1);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_abs_epi8 (__m512i __W, __mmask64 __U, __m512i __A)
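+/* [editor's sketch, not part of the vendored GCC header] Throughout this
+   file the _mm512_mask_* forms merge: lanes whose mask bit is clear keep
+   the value from __W, while the _mm512_maskz_* forms zero those lanes.
+   A hedged example with illustrative variables w, m, x, assuming
+   AVX512BW is enabled:
+     __mmask64 m  = 0xF;                           // select bytes 0..3
+     __m512i   r1 = _mm512_maskz_abs_epi8 (m, x);  // |x| in 0..3, 0 elsewhere
+     __m512i   r2 = _mm512_mask_abs_epi8 (w, m, x); // |x| in 0..3, w elsewhere
+*/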
+{ + return (__m512i) __builtin_ia32_pabsb512_mask ((__v64qi) __A, + (__v64qi) __W, + (__mmask64) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_abs_epi8 (__mmask64 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_pabsb512_mask ((__v64qi) __A, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_abs_epi16 (__m512i __A) +{ + return (__m512i) __builtin_ia32_pabsw512_mask ((__v32hi) __A, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_abs_epi16 (__m512i __W, __mmask32 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_pabsw512_mask ((__v32hi) __A, + (__v32hi) __W, + (__mmask32) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_abs_epi16 (__mmask32 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_pabsw512_mask ((__v32hi) __A, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); +} +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpneq_epu8_mask (__mmask64 __M, __m512i __X, __m512i __Y) +{ + return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X, + (__v64qi) __Y, 4, + (__mmask64) __M); +} +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmplt_epu8_mask (__mmask64 __M, __m512i __X, __m512i __Y) +{ + return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X, + (__v64qi) __Y, 1, + (__mmask64) __M); +} +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpge_epu8_mask (__mmask64 __M, __m512i __X, __m512i __Y) +{ + return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X, + (__v64qi) __Y, 5, + (__mmask64) __M); +} +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmple_epu8_mask (__mmask64 __M, __m512i __X, __m512i __Y) +{ + return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X, + (__v64qi) __Y, 2, + (__mmask64) __M); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpneq_epu16_mask (__mmask32 __M, __m512i __X, __m512i __Y) +{ + return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X, + (__v32hi) __Y, 4, + (__mmask32) __M); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmplt_epu16_mask (__mmask32 __M, __m512i __X, __m512i __Y) +{ + return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X, + (__v32hi) __Y, 1, + (__mmask32) __M); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpge_epu16_mask (__mmask32 __M, __m512i __X, __m512i __Y) +{ + return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X, + (__v32hi) __Y, 5, + (__mmask32) __M); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmple_epu16_mask (__mmask32 __M, __m512i __X, __m512i __Y) +{ + return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X, + (__v32hi) __Y, 2, + (__mmask32) __M); +} +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpneq_epi8_mask (__mmask64 
__M, __m512i __X, __m512i __Y) +{ + return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X, + (__v64qi) __Y, 4, + (__mmask64) __M); +} +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmplt_epi8_mask (__mmask64 __M, __m512i __X, __m512i __Y) +{ + return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X, + (__v64qi) __Y, 1, + (__mmask64) __M); +} +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpge_epi8_mask (__mmask64 __M, __m512i __X, __m512i __Y) +{ + return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X, + (__v64qi) __Y, 5, + (__mmask64) __M); +} +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmple_epi8_mask (__mmask64 __M, __m512i __X, __m512i __Y) +{ + return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X, + (__v64qi) __Y, 2, + (__mmask64) __M); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpneq_epi16_mask (__mmask32 __M, __m512i __X, __m512i __Y) +{ + return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X, + (__v32hi) __Y, 4, + (__mmask32) __M); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmplt_epi16_mask (__mmask32 __M, __m512i __X, __m512i __Y) +{ + return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X, + (__v32hi) __Y, 1, + (__mmask32) __M); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpge_epi16_mask (__mmask32 __M, __m512i __X, __m512i __Y) +{ + return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X, + (__v32hi) __Y, 5, + (__mmask32) __M); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmple_epi16_mask (__mmask32 __M, __m512i __X, __m512i __Y) +{ + return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X, + (__v32hi) __Y, 2, + (__mmask32) __M); +} +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpneq_epu8_mask (__m512i __X, __m512i __Y) +{ + return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X, + (__v64qi) __Y, 4, + (__mmask64) -1); +} +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmplt_epu8_mask (__m512i __X, __m512i __Y) +{ + return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X, + (__v64qi) __Y, 1, + (__mmask64) -1); +} +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpge_epu8_mask (__m512i __X, __m512i __Y) +{ + return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X, + (__v64qi) __Y, 5, + (__mmask64) -1); +} +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmple_epu8_mask (__m512i __X, __m512i __Y) +{ + return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X, + (__v64qi) __Y, 2, + (__mmask64) -1); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpneq_epu16_mask (__m512i __X, __m512i __Y) +{ + return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X, + (__v32hi) __Y, 4, + (__mmask32) -1); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmplt_epu16_mask (__m512i __X, __m512i __Y) +{ + return 
(__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X, (__v32hi) __Y, 1, (__mmask32) -1);
+}
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpge_epu16_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X, (__v32hi) __Y, 5, (__mmask32) -1);
+}
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmple_epu16_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X, (__v32hi) __Y, 2, (__mmask32) -1);
+}
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpneq_epi8_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X, (__v64qi) __Y, 4, (__mmask64) -1);
+}
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmplt_epi8_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X, (__v64qi) __Y, 1, (__mmask64) -1);
+}
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpge_epi8_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X, (__v64qi) __Y, 5, (__mmask64) -1);
+}
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmple_epi8_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X, (__v64qi) __Y, 2, (__mmask64) -1);
+}
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpneq_epi16_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X, (__v32hi) __Y, 4, (__mmask32) -1);
+}
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmplt_epi16_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X, (__v32hi) __Y, 1, (__mmask32) -1);
+}
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpge_epi16_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X, (__v32hi) __Y, 5, (__mmask32) -1);
+}
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmple_epi16_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X, (__v32hi) __Y, 2, (__mmask32) -1);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_packs_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packssdw512_mask ((__v16si) __A, (__v16si) __B, (__v32hi) _mm512_setzero_si512 (), (__mmask32) -1);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_packs_epi32 (__mmask32 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packssdw512_mask ((__v16si) __A, (__v16si) __B, (__v32hi) _mm512_setzero_si512 (), __M);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_packs_epi32 (__m512i __W, __mmask32 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packssdw512_mask ((__v16si) __A, (__v16si) __B, (__v32hi) __W, __M);
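+  /* [editor's aside, not part of the vendored GCC header] The integer
+     predicates hard-coded in the cmp/ucmp intrinsics above follow the
+     _MM_CMPINT_* encoding: 0 = EQ, 1 = LT, 2 = LE, 4 = NE, 5 = NLT (>=),
+     6 = NLE (>). So, for example, _mm512_cmpgt_epu8_mask (a, b) is
+     equivalent to _mm512_cmp_epu8_mask (a, b, 6).  */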
+} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_packus_epi32 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_packusdw512_mask ((__v16si) __A, + (__v16si) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_packus_epi32 (__mmask32 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_packusdw512_mask ((__v16si) __A, + (__v16si) __B, + (__v32hi) + _mm512_setzero_si512 (), + __M); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_packus_epi32 (__m512i __W, __mmask32 __M, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_packusdw512_mask ((__v16si) __A, + (__v16si) __B, + (__v32hi) __W, + __M); +} +#ifdef __OPTIMIZE__ +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kshiftli_mask32 (__mmask32 __A, unsigned int __B) +{ + return (__mmask32) __builtin_ia32_kshiftlisi ((__mmask32) __A, + (__mmask8) __B); +} +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kshiftli_mask64 (__mmask64 __A, unsigned int __B) +{ + return (__mmask64) __builtin_ia32_kshiftlidi ((__mmask64) __A, + (__mmask8) __B); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kshiftri_mask32 (__mmask32 __A, unsigned int __B) +{ + return (__mmask32) __builtin_ia32_kshiftrisi ((__mmask32) __A, + (__mmask8) __B); +} +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kshiftri_mask64 (__mmask64 __A, unsigned int __B) +{ + return (__mmask64) __builtin_ia32_kshiftridi ((__mmask64) __A, + (__mmask8) __B); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_alignr_epi8 (__m512i __A, __m512i __B, const int __N) +{ + return (__m512i) __builtin_ia32_palignr512 ((__v8di) __A, + (__v8di) __B, __N * 8); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_alignr_epi8 (__m512i __W, __mmask64 __U, __m512i __A, + __m512i __B, const int __N) +{ + return (__m512i) __builtin_ia32_palignr512_mask ((__v8di) __A, + (__v8di) __B, + __N * 8, + (__v8di) __W, + (__mmask64) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_alignr_epi8 (__mmask64 __U, __m512i __A, __m512i __B, + const int __N) +{ + return (__m512i) __builtin_ia32_palignr512_mask ((__v8di) __A, + (__v8di) __B, + __N * 8, + (__v8di) + _mm512_setzero_si512 (), + (__mmask64) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_dbsad_epu8 (__m512i __A, __m512i __B, const int __imm) +{ + return (__m512i) __builtin_ia32_dbpsadbw512_mask ((__v64qi) __A, + (__v64qi) __B, + __imm, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_dbsad_epu8 (__m512i __W, __mmask32 __U, __m512i __A, + __m512i __B, const int __imm) +{ + return (__m512i) __builtin_ia32_dbpsadbw512_mask ((__v64qi) __A, + (__v64qi) __B, + __imm, + (__v32hi) __W, + (__mmask32) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_dbsad_epu8 (__mmask32 __U, __m512i 
__A, __m512i __B, + const int __imm) +{ + return (__m512i) __builtin_ia32_dbpsadbw512_mask ((__v64qi) __A, + (__v64qi) __B, + __imm, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_srli_epi16 (__m512i __A, const int __imm) +{ + return (__m512i) __builtin_ia32_psrlwi512_mask ((__v32hi) __A, __imm, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_srli_epi16 (__m512i __W, __mmask32 __U, __m512i __A, + const int __imm) +{ + return (__m512i) __builtin_ia32_psrlwi512_mask ((__v32hi) __A, __imm, + (__v32hi) __W, + (__mmask32) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_srli_epi16 (__mmask32 __U, __m512i __A, const int __imm) +{ + return (__m512i) __builtin_ia32_psrlwi512_mask ((__v32hi) __A, __imm, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_slli_epi16 (__m512i __A, const int __B) +{ + return (__m512i) __builtin_ia32_psllwi512_mask ((__v32hi) __A, __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_slli_epi16 (__m512i __W, __mmask32 __U, __m512i __A, + const int __B) +{ + return (__m512i) __builtin_ia32_psllwi512_mask ((__v32hi) __A, __B, + (__v32hi) __W, + (__mmask32) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_slli_epi16 (__mmask32 __U, __m512i __A, const int __B) +{ + return (__m512i) __builtin_ia32_psllwi512_mask ((__v32hi) __A, __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_shufflehi_epi16 (__m512i __A, const int __imm) +{ + return (__m512i) __builtin_ia32_pshufhw512_mask ((__v32hi) __A, + __imm, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_shufflehi_epi16 (__m512i __W, __mmask32 __U, __m512i __A, + const int __imm) +{ + return (__m512i) __builtin_ia32_pshufhw512_mask ((__v32hi) __A, + __imm, + (__v32hi) __W, + (__mmask32) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_shufflehi_epi16 (__mmask32 __U, __m512i __A, + const int __imm) +{ + return (__m512i) __builtin_ia32_pshufhw512_mask ((__v32hi) __A, + __imm, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_shufflelo_epi16 (__m512i __A, const int __imm) +{ + return (__m512i) __builtin_ia32_pshuflw512_mask ((__v32hi) __A, + __imm, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_shufflelo_epi16 (__m512i __W, __mmask32 __U, __m512i __A, + const int __imm) +{ + return (__m512i) __builtin_ia32_pshuflw512_mask ((__v32hi) __A, + __imm, + (__v32hi) __W, + (__mmask32) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_shufflelo_epi16 
(__mmask32 __U, __m512i __A, + const int __imm) +{ + return (__m512i) __builtin_ia32_pshuflw512_mask ((__v32hi) __A, + __imm, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_srai_epi16 (__m512i __A, const int __imm) +{ + return (__m512i) __builtin_ia32_psrawi512_mask ((__v32hi) __A, __imm, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_srai_epi16 (__m512i __W, __mmask32 __U, __m512i __A, + const int __imm) +{ + return (__m512i) __builtin_ia32_psrawi512_mask ((__v32hi) __A, __imm, + (__v32hi) __W, + (__mmask32) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_srai_epi16 (__mmask32 __U, __m512i __A, const int __imm) +{ + return (__m512i) __builtin_ia32_psrawi512_mask ((__v32hi) __A, __imm, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_blend_epi16 (__mmask32 __U, __m512i __A, __m512i __W) +{ + return (__m512i) __builtin_ia32_blendmw_512_mask ((__v32hi) __A, + (__v32hi) __W, + (__mmask32) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_blend_epi8 (__mmask64 __U, __m512i __A, __m512i __W) +{ + return (__m512i) __builtin_ia32_blendmb_512_mask ((__v64qi) __A, + (__v64qi) __W, + (__mmask64) __U); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmp_epi16_mask (__mmask32 __U, __m512i __X, __m512i __Y, + const int __P) +{ + return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X, + (__v32hi) __Y, __P, + (__mmask32) __U); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmp_epi16_mask (__m512i __X, __m512i __Y, const int __P) +{ + return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X, + (__v32hi) __Y, __P, + (__mmask32) -1); +} +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmp_epi8_mask (__mmask64 __U, __m512i __X, __m512i __Y, + const int __P) +{ + return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X, + (__v64qi) __Y, __P, + (__mmask64) __U); +} +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmp_epi8_mask (__m512i __X, __m512i __Y, const int __P) +{ + return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X, + (__v64qi) __Y, __P, + (__mmask64) -1); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmp_epu16_mask (__mmask32 __U, __m512i __X, __m512i __Y, + const int __P) +{ + return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X, + (__v32hi) __Y, __P, + (__mmask32) __U); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmp_epu16_mask (__m512i __X, __m512i __Y, const int __P) +{ + return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X, + (__v32hi) __Y, __P, + (__mmask32) -1); +} +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmp_epu8_mask (__mmask64 __U, __m512i __X, __m512i __Y, + const int __P) +{ + return (__mmask64) __builtin_ia32_ucmpb512_mask 
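/* Editor's aside (not part of the upstream header): the __P argument of the
   _mm512_cmp_* intrinsics selects a predicate from the _MM_CMPINT_* enum
   (EQ, LT, LE, NE, ...) and must be a compile-time constant, which is why
   the #else branch just below re-expresses every immediate-taking intrinsic
   as a macro when __OPTIMIZE__ is off. A minimal sketch, assuming a
   -mavx512bw target:

       // count the 16-bit lanes of a that compare less-than b
       int count_lt(__m512i a, __m512i b) {
         __mmask32 m = _mm512_cmp_epi16_mask(a, b, _MM_CMPINT_LT);
         return __builtin_popcount(m);   // one mask bit per lane
       }
*/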
((__v64qi) __X, + (__v64qi) __Y, __P, + (__mmask64) __U); +} +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmp_epu8_mask (__m512i __X, __m512i __Y, const int __P) +{ + return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X, + (__v64qi) __Y, __P, + (__mmask64) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_bslli_epi128 (__m512i __A, const int __N) +{ + return (__m512i) __builtin_ia32_pslldq512 (__A, __N * 8); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_bsrli_epi128 (__m512i __A, const int __N) +{ + return (__m512i) __builtin_ia32_psrldq512 (__A, __N * 8); +} +#else +#define _kshiftli_mask32(X, Y) ((__mmask32) __builtin_ia32_kshiftlisi ((__mmask32)(X), (__mmask8)(Y))) +#define _kshiftli_mask64(X, Y) ((__mmask64) __builtin_ia32_kshiftlidi ((__mmask64)(X), (__mmask8)(Y))) +#define _kshiftri_mask32(X, Y) ((__mmask32) __builtin_ia32_kshiftrisi ((__mmask32)(X), (__mmask8)(Y))) +#define _kshiftri_mask64(X, Y) ((__mmask64) __builtin_ia32_kshiftridi ((__mmask64)(X), (__mmask8)(Y))) +#define _mm512_alignr_epi8(X, Y, N) ((__m512i) __builtin_ia32_palignr512 ((__v8di)(__m512i)(X), (__v8di)(__m512i)(Y), (int)((N) * 8))) +#define _mm512_mask_alignr_epi8(W, U, X, Y, N) ((__m512i) __builtin_ia32_palignr512_mask ((__v8di)(__m512i)(X), (__v8di)(__m512i)(Y), (int)((N) * 8), (__v8di)(__m512i)(W), (__mmask64)(U))) +#define _mm512_maskz_alignr_epi8(U, X, Y, N) ((__m512i) __builtin_ia32_palignr512_mask ((__v8di)(__m512i)(X), (__v8di)(__m512i)(Y), (int)((N) * 8), (__v8di)(__m512i) _mm512_setzero_si512 (), (__mmask64)(U))) +#define _mm512_dbsad_epu8(X, Y, C) ((__m512i) __builtin_ia32_dbpsadbw512_mask ((__v64qi)(__m512i) (X), (__v64qi)(__m512i) (Y), (int) (C), (__v32hi)(__m512i) _mm512_setzero_si512 (), (__mmask32)-1)) +#define _mm512_mask_dbsad_epu8(W, U, X, Y, C) ((__m512i) __builtin_ia32_dbpsadbw512_mask ((__v64qi)(__m512i) (X), (__v64qi)(__m512i) (Y), (int) (C), (__v32hi)(__m512i)(W), (__mmask32)(U))) +#define _mm512_maskz_dbsad_epu8(U, X, Y, C) ((__m512i) __builtin_ia32_dbpsadbw512_mask ((__v64qi)(__m512i) (X), (__v64qi)(__m512i) (Y), (int) (C), (__v32hi)(__m512i) _mm512_setzero_si512 (), (__mmask32)(U))) +#define _mm512_srli_epi16(A, B) ((__m512i) __builtin_ia32_psrlwi512_mask ((__v32hi)(__m512i)(A), (int)(B), (__v32hi)_mm512_setzero_si512 (), (__mmask32)-1)) +#define _mm512_mask_srli_epi16(W, U, A, B) ((__m512i) __builtin_ia32_psrlwi512_mask ((__v32hi)(__m512i)(A), (int)(B), (__v32hi)(__m512i)(W), (__mmask32)(U))) +#define _mm512_maskz_srli_epi16(U, A, B) ((__m512i) __builtin_ia32_psrlwi512_mask ((__v32hi)(__m512i)(A), (int)(B), (__v32hi)_mm512_setzero_si512 (), (__mmask32)(U))) +#define _mm512_slli_epi16(X, C) ((__m512i)__builtin_ia32_psllwi512_mask ((__v32hi)(__m512i)(X), (int)(C), (__v32hi)(__m512i)_mm512_setzero_si512 (), (__mmask32)-1)) +#define _mm512_mask_slli_epi16(W, U, X, C) ((__m512i)__builtin_ia32_psllwi512_mask ((__v32hi)(__m512i)(X), (int)(C), (__v32hi)(__m512i)(W), (__mmask32)(U))) +#define _mm512_maskz_slli_epi16(U, X, C) ((__m512i)__builtin_ia32_psllwi512_mask ((__v32hi)(__m512i)(X), (int)(C), (__v32hi)(__m512i)_mm512_setzero_si512 (), (__mmask32)(U))) +#define _mm512_shufflehi_epi16(A, B) ((__m512i) __builtin_ia32_pshufhw512_mask ((__v32hi)(__m512i)(A), (int)(B), (__v32hi)(__m512i) _mm512_setzero_si512 (), (__mmask32)-1)) +#define _mm512_mask_shufflehi_epi16(W, U, A, B) ((__m512i) 
__builtin_ia32_pshufhw512_mask ((__v32hi)(__m512i)(A), (int)(B), (__v32hi)(__m512i)(W), (__mmask32)(U))) +#define _mm512_maskz_shufflehi_epi16(U, A, B) ((__m512i) __builtin_ia32_pshufhw512_mask ((__v32hi)(__m512i)(A), (int)(B), (__v32hi)(__m512i) _mm512_setzero_si512 (), (__mmask32)(U))) +#define _mm512_shufflelo_epi16(A, B) ((__m512i) __builtin_ia32_pshuflw512_mask ((__v32hi)(__m512i)(A), (int)(B), (__v32hi)(__m512i) _mm512_setzero_si512 (), (__mmask32)-1)) +#define _mm512_mask_shufflelo_epi16(W, U, A, B) ((__m512i) __builtin_ia32_pshuflw512_mask ((__v32hi)(__m512i)(A), (int)(B), (__v32hi)(__m512i)(W), (__mmask32)(U))) +#define _mm512_maskz_shufflelo_epi16(U, A, B) ((__m512i) __builtin_ia32_pshuflw512_mask ((__v32hi)(__m512i)(A), (int)(B), (__v32hi)(__m512i) _mm512_setzero_si512 (), (__mmask32)(U))) +#define _mm512_srai_epi16(A, B) ((__m512i) __builtin_ia32_psrawi512_mask ((__v32hi)(__m512i)(A), (int)(B), (__v32hi)_mm512_setzero_si512 (), (__mmask32)-1)) +#define _mm512_mask_srai_epi16(W, U, A, B) ((__m512i) __builtin_ia32_psrawi512_mask ((__v32hi)(__m512i)(A), (int)(B), (__v32hi)(__m512i)(W), (__mmask32)(U))) +#define _mm512_maskz_srai_epi16(U, A, B) ((__m512i) __builtin_ia32_psrawi512_mask ((__v32hi)(__m512i)(A), (int)(B), (__v32hi)_mm512_setzero_si512 (), (__mmask32)(U))) +#define _mm512_mask_blend_epi16(__U, __A, __W) ((__m512i) __builtin_ia32_blendmw_512_mask ((__v32hi) (__A), (__v32hi) (__W), (__mmask32) (__U))) +#define _mm512_mask_blend_epi8(__U, __A, __W) ((__m512i) __builtin_ia32_blendmb_512_mask ((__v64qi) (__A), (__v64qi) (__W), (__mmask64) (__U))) +#define _mm512_cmp_epi16_mask(X, Y, P) ((__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi)(__m512i)(X), (__v32hi)(__m512i)(Y), (int)(P), (__mmask32)(-1))) +#define _mm512_cmp_epi8_mask(X, Y, P) ((__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi)(__m512i)(X), (__v64qi)(__m512i)(Y), (int)(P), (__mmask64)(-1))) +#define _mm512_cmp_epu16_mask(X, Y, P) ((__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi)(__m512i)(X), (__v32hi)(__m512i)(Y), (int)(P), (__mmask32)(-1))) +#define _mm512_cmp_epu8_mask(X, Y, P) ((__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi)(__m512i)(X), (__v64qi)(__m512i)(Y), (int)(P), (__mmask64)(-1))) +#define _mm512_mask_cmp_epi16_mask(M, X, Y, P) ((__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi)(__m512i)(X), (__v32hi)(__m512i)(Y), (int)(P), (__mmask32)(M))) +#define _mm512_mask_cmp_epi8_mask(M, X, Y, P) ((__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi)(__m512i)(X), (__v64qi)(__m512i)(Y), (int)(P), (__mmask64)(M))) +#define _mm512_mask_cmp_epu16_mask(M, X, Y, P) ((__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi)(__m512i)(X), (__v32hi)(__m512i)(Y), (int)(P), (__mmask32)(M))) +#define _mm512_mask_cmp_epu8_mask(M, X, Y, P) ((__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi)(__m512i)(X), (__v64qi)(__m512i)(Y), (int)(P), (__mmask64)(M))) +#define _mm512_bslli_epi128(A, N) ((__m512i)__builtin_ia32_pslldq512 ((__m512i)(A), (int)(N) * 8)) +#define _mm512_bsrli_epi128(A, N) ((__m512i)__builtin_ia32_psrldq512 ((__m512i)(A), (int)(N) * 8)) +#endif #ifdef __DISABLE_AVX512BW__ #undef __DISABLE_AVX512BW__ #pragma GCC pop_options -#endif /* __DISABLE_AVX512BW__ */ - -#endif /* _AVX512BWINTRIN_H_INCLUDED */ +#endif +#endif +#endif diff --git a/third_party/intel/avx512cdintrin.internal.h b/third_party/intel/avx512cdintrin.internal.h index 990347cff..1aaa7ac4d 100644 --- a/third_party/intel/avx512cdintrin.internal.h +++ b/third_party/intel/avx512cdintrin.internal.h @@ -1,100 +1,140 @@ +/* clang-format off */ +#if 
defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0) #ifndef _IMMINTRIN_H_INCLUDED #error "Never use directly; include instead." #endif - #ifndef _AVX512CDINTRIN_H_INCLUDED #define _AVX512CDINTRIN_H_INCLUDED - #ifndef __AVX512CD__ #pragma GCC push_options #pragma GCC target("avx512cd") #define __DISABLE_AVX512CD__ -#endif /* __AVX512CD__ */ - -typedef long long __v8di __attribute__((__vector_size__(64))); -typedef int __v16si __attribute__((__vector_size__(64))); - -typedef long long __m512i __attribute__((__vector_size__(64), __may_alias__)); -typedef double __m512d __attribute__((__vector_size__(64), __may_alias__)); - +#endif +typedef long long __v8di __attribute__ ((__vector_size__ (64))); +typedef int __v16si __attribute__ ((__vector_size__ (64))); +typedef long long __m512i __attribute__ ((__vector_size__ (64), __may_alias__)); +typedef double __m512d __attribute__ ((__vector_size__ (64), __may_alias__)); typedef unsigned char __mmask8; typedef unsigned short __mmask16; - -__funline __m512i _mm512_conflict_epi32(__m512i __A) { - return (__m512i)__builtin_ia32_vpconflictsi_512_mask( - (__v16si)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)-1); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_conflict_epi32 (__m512i __A) +{ + return (__m512i) + __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A, + (__v16si) _mm512_setzero_si512 (), + (__mmask16) -1); } - -__funline __m512i _mm512_mask_conflict_epi32(__m512i __W, __mmask16 __U, - __m512i __A) { - return (__m512i)__builtin_ia32_vpconflictsi_512_mask( - (__v16si)__A, (__v16si)__W, (__mmask16)__U); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_conflict_epi32 (__m512i __W, __mmask16 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A, + (__v16si) __W, + (__mmask16) __U); } - -__funline __m512i _mm512_maskz_conflict_epi32(__mmask16 __U, __m512i __A) { - return (__m512i)__builtin_ia32_vpconflictsi_512_mask( - (__v16si)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_conflict_epi32 (__mmask16 __U, __m512i __A) +{ + return (__m512i) + __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A, + (__v16si) _mm512_setzero_si512 (), + (__mmask16) __U); } - -__funline __m512i _mm512_conflict_epi64(__m512i __A) { - return (__m512i)__builtin_ia32_vpconflictdi_512_mask( - (__v8di)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)-1); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_conflict_epi64 (__m512i __A) +{ + return (__m512i) + __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A, + (__v8di) _mm512_setzero_si512 (), + (__mmask8) -1); } - -__funline __m512i _mm512_mask_conflict_epi64(__m512i __W, __mmask8 __U, - __m512i __A) { - return (__m512i)__builtin_ia32_vpconflictdi_512_mask((__v8di)__A, (__v8di)__W, - (__mmask8)__U); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_conflict_epi64 (__m512i __W, __mmask8 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A, + (__v8di) __W, + (__mmask8) __U); } - -__funline __m512i _mm512_maskz_conflict_epi64(__mmask8 __U, __m512i __A) { - return (__m512i)__builtin_ia32_vpconflictdi_512_mask( - (__v8di)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); +extern __inline __m512i +__attribute__ 
((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_conflict_epi64 (__mmask8 __U, __m512i __A) +{ + return (__m512i) + __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A, + (__v8di) _mm512_setzero_si512 (), + (__mmask8) __U); } - -__funline __m512i _mm512_lzcnt_epi64(__m512i __A) { - return (__m512i)__builtin_ia32_vplzcntq_512_mask( - (__v8di)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)-1); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_lzcnt_epi64 (__m512i __A) +{ + return (__m512i) + __builtin_ia32_vplzcntq_512_mask ((__v8di) __A, + (__v8di) _mm512_setzero_si512 (), + (__mmask8) -1); } - -__funline __m512i _mm512_mask_lzcnt_epi64(__m512i __W, __mmask8 __U, - __m512i __A) { - return (__m512i)__builtin_ia32_vplzcntq_512_mask((__v8di)__A, (__v8di)__W, - (__mmask8)__U); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_lzcnt_epi64 (__m512i __W, __mmask8 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_vplzcntq_512_mask ((__v8di) __A, + (__v8di) __W, + (__mmask8) __U); } - -__funline __m512i _mm512_maskz_lzcnt_epi64(__mmask8 __U, __m512i __A) { - return (__m512i)__builtin_ia32_vplzcntq_512_mask( - (__v8di)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_lzcnt_epi64 (__mmask8 __U, __m512i __A) +{ + return (__m512i) + __builtin_ia32_vplzcntq_512_mask ((__v8di) __A, + (__v8di) _mm512_setzero_si512 (), + (__mmask8) __U); } - -__funline __m512i _mm512_lzcnt_epi32(__m512i __A) { - return (__m512i)__builtin_ia32_vplzcntd_512_mask( - (__v16si)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)-1); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_lzcnt_epi32 (__m512i __A) +{ + return (__m512i) + __builtin_ia32_vplzcntd_512_mask ((__v16si) __A, + (__v16si) _mm512_setzero_si512 (), + (__mmask16) -1); } - -__funline __m512i _mm512_mask_lzcnt_epi32(__m512i __W, __mmask16 __U, - __m512i __A) { - return (__m512i)__builtin_ia32_vplzcntd_512_mask((__v16si)__A, (__v16si)__W, - (__mmask16)__U); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_lzcnt_epi32 (__m512i __W, __mmask16 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_vplzcntd_512_mask ((__v16si) __A, + (__v16si) __W, + (__mmask16) __U); } - -__funline __m512i _mm512_maskz_lzcnt_epi32(__mmask16 __U, __m512i __A) { - return (__m512i)__builtin_ia32_vplzcntd_512_mask( - (__v16si)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_lzcnt_epi32 (__mmask16 __U, __m512i __A) +{ + return (__m512i) + __builtin_ia32_vplzcntd_512_mask ((__v16si) __A, + (__v16si) _mm512_setzero_si512 (), + (__mmask16) __U); } - -__funline __m512i _mm512_broadcastmb_epi64(__mmask8 __A) { - return (__m512i)__builtin_ia32_broadcastmb512(__A); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_broadcastmb_epi64 (__mmask8 __A) +{ + return (__m512i) __builtin_ia32_broadcastmb512 (__A); } - -__funline __m512i _mm512_broadcastmw_epi32(__mmask16 __A) { - return (__m512i)__builtin_ia32_broadcastmw512(__A); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_broadcastmw_epi32 (__mmask16 __A) +{ + return (__m512i) 
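/* Editor's aside (not part of the upstream header): vpconflictd gives each
   lane a bitmask of the earlier lanes that hold the same value, so duplicate
   scatter indices can be detected before a store. A minimal sketch, assuming
   an AVX512CD+AVX512F target:

       __m512i idx = _mm512_loadu_si512(indices);
       __m512i dup = _mm512_conflict_epi32(idx);            // bit j of lane i set iff idx[i]==idx[j], j<i
       __mmask16 clash = _mm512_test_epi32_mask(dup, dup);  // lanes repeating an earlier index
*/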
__builtin_ia32_broadcastmw512 (__A); } - #ifdef __DISABLE_AVX512CD__ #undef __DISABLE_AVX512CD__ #pragma GCC pop_options -#endif /* __DISABLE_AVX512CD__ */ - -#endif /* _AVX512CDINTRIN_H_INCLUDED */ +#endif +#endif +#endif diff --git a/third_party/intel/avx512dqintrin.internal.h b/third_party/intel/avx512dqintrin.internal.h index f6d2bc07f..5563c8a55 100644 --- a/third_party/intel/avx512dqintrin.internal.h +++ b/third_party/intel/avx512dqintrin.internal.h @@ -1,1647 +1,2273 @@ +/* clang-format off */ +#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0) #ifndef _IMMINTRIN_H_INCLUDED #error "Never use directly; include instead." #endif - #ifndef _AVX512DQINTRIN_H_INCLUDED #define _AVX512DQINTRIN_H_INCLUDED - #ifndef __AVX512DQ__ #pragma GCC push_options #pragma GCC target("avx512dq") #define __DISABLE_AVX512DQ__ -#endif /* __AVX512DQ__ */ - -__funline unsigned char _ktest_mask8_u8(__mmask8 __A, __mmask8 __B, - unsigned char *__CF) { - *__CF = (unsigned char)__builtin_ia32_ktestcqi(__A, __B); - return (unsigned char)__builtin_ia32_ktestzqi(__A, __B); -} - -__funline unsigned char _ktestz_mask8_u8(__mmask8 __A, __mmask8 __B) { - return (unsigned char)__builtin_ia32_ktestzqi(__A, __B); -} - -__funline unsigned char _ktestc_mask8_u8(__mmask8 __A, __mmask8 __B) { - return (unsigned char)__builtin_ia32_ktestcqi(__A, __B); -} - -__funline unsigned char _ktest_mask16_u8(__mmask16 __A, __mmask16 __B, - unsigned char *__CF) { - *__CF = (unsigned char)__builtin_ia32_ktestchi(__A, __B); - return (unsigned char)__builtin_ia32_ktestzhi(__A, __B); -} - -__funline unsigned char _ktestz_mask16_u8(__mmask16 __A, __mmask16 __B) { - return (unsigned char)__builtin_ia32_ktestzhi(__A, __B); -} - -__funline unsigned char _ktestc_mask16_u8(__mmask16 __A, __mmask16 __B) { - return (unsigned char)__builtin_ia32_ktestchi(__A, __B); -} - -__funline unsigned char _kortest_mask8_u8(__mmask8 __A, __mmask8 __B, - unsigned char *__CF) { - *__CF = (unsigned char)__builtin_ia32_kortestcqi(__A, __B); - return (unsigned char)__builtin_ia32_kortestzqi(__A, __B); -} - -__funline unsigned char _kortestz_mask8_u8(__mmask8 __A, __mmask8 __B) { - return (unsigned char)__builtin_ia32_kortestzqi(__A, __B); -} - -__funline unsigned char _kortestc_mask8_u8(__mmask8 __A, __mmask8 __B) { - return (unsigned char)__builtin_ia32_kortestcqi(__A, __B); -} - -__funline __mmask8 _kadd_mask8(__mmask8 __A, __mmask8 __B) { - return (__mmask8)__builtin_ia32_kaddqi((__mmask8)__A, (__mmask8)__B); -} - -__funline __mmask16 _kadd_mask16(__mmask16 __A, __mmask16 __B) { - return (__mmask16)__builtin_ia32_kaddhi((__mmask16)__A, (__mmask16)__B); -} - -__funline unsigned int _cvtmask8_u32(__mmask8 __A) { - return (unsigned int)__builtin_ia32_kmovb((__mmask8)__A); -} - -__funline __mmask8 _cvtu32_mask8(unsigned int __A) { - return (__mmask8)__builtin_ia32_kmovb((__mmask8)__A); -} - -__funline __mmask8 _load_mask8(__mmask8 *__A) { - return (__mmask8)__builtin_ia32_kmovb(*(__mmask8 *)__A); -} - -__funline void _store_mask8(__mmask8 *__A, __mmask8 __B) { - *(__mmask8 *)__A = __builtin_ia32_kmovb(__B); -} - -__funline __mmask8 _knot_mask8(__mmask8 __A) { - return (__mmask8)__builtin_ia32_knotqi((__mmask8)__A); -} - -__funline __mmask8 _kor_mask8(__mmask8 __A, __mmask8 __B) { - return (__mmask8)__builtin_ia32_korqi((__mmask8)__A, (__mmask8)__B); -} - -__funline __mmask8 _kxnor_mask8(__mmask8 __A, __mmask8 __B) { - return (__mmask8)__builtin_ia32_kxnorqi((__mmask8)__A, (__mmask8)__B); -} - -__funline __mmask8 _kxor_mask8(__mmask8 __A, __mmask8 __B) { - 
return (__mmask8)__builtin_ia32_kxorqi((__mmask8)__A, (__mmask8)__B); -} - -__funline __mmask8 _kand_mask8(__mmask8 __A, __mmask8 __B) { - return (__mmask8)__builtin_ia32_kandqi((__mmask8)__A, (__mmask8)__B); -} - -__funline __mmask8 _kandn_mask8(__mmask8 __A, __mmask8 __B) { - return (__mmask8)__builtin_ia32_kandnqi((__mmask8)__A, (__mmask8)__B); -} - -__funline __m512d _mm512_broadcast_f64x2(__m128d __A) { - return (__m512d)__builtin_ia32_broadcastf64x2_512_mask( - (__v2df)__A, _mm512_undefined_pd(), (__mmask8)-1); -} - -__funline __m512d _mm512_mask_broadcast_f64x2(__m512d __O, __mmask8 __M, - __m128d __A) { - return (__m512d)__builtin_ia32_broadcastf64x2_512_mask((__v2df)__A, - (__v8df)__O, __M); -} - -__funline __m512d _mm512_maskz_broadcast_f64x2(__mmask8 __M, __m128d __A) { - return (__m512d)__builtin_ia32_broadcastf64x2_512_mask( - (__v2df)__A, (__v8df)_mm512_setzero_ps(), __M); -} - -__funline __m512i _mm512_broadcast_i64x2(__m128i __A) { - return (__m512i)__builtin_ia32_broadcasti64x2_512_mask( - (__v2di)__A, _mm512_undefined_epi32(), (__mmask8)-1); -} - -__funline __m512i _mm512_mask_broadcast_i64x2(__m512i __O, __mmask8 __M, - __m128i __A) { - return (__m512i)__builtin_ia32_broadcasti64x2_512_mask((__v2di)__A, - (__v8di)__O, __M); -} - -__funline __m512i _mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A) { - return (__m512i)__builtin_ia32_broadcasti64x2_512_mask( - (__v2di)__A, (__v8di)_mm512_setzero_si512(), __M); -} - -__funline __m512 _mm512_broadcast_f32x2(__m128 __A) { - return (__m512)__builtin_ia32_broadcastf32x2_512_mask( - (__v4sf)__A, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1); -} - -__funline __m512 _mm512_mask_broadcast_f32x2(__m512 __O, __mmask16 __M, - __m128 __A) { - return (__m512)__builtin_ia32_broadcastf32x2_512_mask((__v4sf)__A, - (__v16sf)__O, __M); -} - -__funline __m512 _mm512_maskz_broadcast_f32x2(__mmask16 __M, __m128 __A) { - return (__m512)__builtin_ia32_broadcastf32x2_512_mask( - (__v4sf)__A, (__v16sf)_mm512_setzero_ps(), __M); -} - -__funline __m512i _mm512_broadcast_i32x2(__m128i __A) { - return (__m512i)__builtin_ia32_broadcasti32x2_512_mask( - (__v4si)__A, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1); -} - -__funline __m512i _mm512_mask_broadcast_i32x2(__m512i __O, __mmask16 __M, - __m128i __A) { - return (__m512i)__builtin_ia32_broadcasti32x2_512_mask((__v4si)__A, - (__v16si)__O, __M); -} - -__funline __m512i _mm512_maskz_broadcast_i32x2(__mmask16 __M, __m128i __A) { - return (__m512i)__builtin_ia32_broadcasti32x2_512_mask( - (__v4si)__A, (__v16si)_mm512_setzero_si512(), __M); -} - -__funline __m512 _mm512_broadcast_f32x8(__m256 __A) { - return (__m512)__builtin_ia32_broadcastf32x8_512_mask( - (__v8sf)__A, _mm512_undefined_ps(), (__mmask16)-1); -} - -__funline __m512 _mm512_mask_broadcast_f32x8(__m512 __O, __mmask16 __M, - __m256 __A) { - return (__m512)__builtin_ia32_broadcastf32x8_512_mask((__v8sf)__A, - (__v16sf)__O, __M); -} - -__funline __m512 _mm512_maskz_broadcast_f32x8(__mmask16 __M, __m256 __A) { - return (__m512)__builtin_ia32_broadcastf32x8_512_mask( - (__v8sf)__A, (__v16sf)_mm512_setzero_ps(), __M); -} - -__funline __m512i _mm512_broadcast_i32x8(__m256i __A) { - return (__m512i)__builtin_ia32_broadcasti32x8_512_mask( - (__v8si)__A, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1); -} - -__funline __m512i _mm512_mask_broadcast_i32x8(__m512i __O, __mmask16 __M, - __m256i __A) { - return (__m512i)__builtin_ia32_broadcasti32x8_512_mask((__v8si)__A, - (__v16si)__O, __M); -} - -__funline __m512i 
_mm512_maskz_broadcast_i32x8(__mmask16 __M, __m256i __A) { - return (__m512i)__builtin_ia32_broadcasti32x8_512_mask( - (__v8si)__A, (__v16si)_mm512_setzero_si512(), __M); -} - -__funline __m512i _mm512_mullo_epi64(__m512i __A, __m512i __B) { - return (__m512i)((__v8du)__A * (__v8du)__B); -} - -__funline __m512i _mm512_mask_mullo_epi64(__m512i __W, __mmask8 __U, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_pmullq512_mask((__v8di)__A, (__v8di)__B, - (__v8di)__W, (__mmask8)__U); -} - -__funline __m512i _mm512_maskz_mullo_epi64(__mmask8 __U, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_pmullq512_mask( - (__v8di)__A, (__v8di)__B, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); -} - -__funline __m512d _mm512_xor_pd(__m512d __A, __m512d __B) { - return (__m512d)__builtin_ia32_xorpd512_mask( - (__v8df)__A, (__v8df)__B, (__v8df)_mm512_setzero_pd(), (__mmask8)-1); -} - -__funline __m512d _mm512_mask_xor_pd(__m512d __W, __mmask8 __U, __m512d __A, - __m512d __B) { - return (__m512d)__builtin_ia32_xorpd512_mask((__v8df)__A, (__v8df)__B, - (__v8df)__W, (__mmask8)__U); -} - -__funline __m512d _mm512_maskz_xor_pd(__mmask8 __U, __m512d __A, __m512d __B) { - return (__m512d)__builtin_ia32_xorpd512_mask( - (__v8df)__A, (__v8df)__B, (__v8df)_mm512_setzero_pd(), (__mmask8)__U); -} - -__funline __m512 _mm512_xor_ps(__m512 __A, __m512 __B) { - return (__m512)__builtin_ia32_xorps512_mask( - (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ps(), (__mmask16)-1); -} - -__funline __m512 _mm512_mask_xor_ps(__m512 __W, __mmask16 __U, __m512 __A, - __m512 __B) { - return (__m512)__builtin_ia32_xorps512_mask((__v16sf)__A, (__v16sf)__B, - (__v16sf)__W, (__mmask16)__U); -} - -__funline __m512 _mm512_maskz_xor_ps(__mmask16 __U, __m512 __A, __m512 __B) { - return (__m512)__builtin_ia32_xorps512_mask( - (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U); -} - -__funline __m512d _mm512_or_pd(__m512d __A, __m512d __B) { - return (__m512d)__builtin_ia32_orpd512_mask( - (__v8df)__A, (__v8df)__B, (__v8df)_mm512_setzero_pd(), (__mmask8)-1); -} - -__funline __m512d _mm512_mask_or_pd(__m512d __W, __mmask8 __U, __m512d __A, - __m512d __B) { - return (__m512d)__builtin_ia32_orpd512_mask((__v8df)__A, (__v8df)__B, - (__v8df)__W, (__mmask8)__U); -} - -__funline __m512d _mm512_maskz_or_pd(__mmask8 __U, __m512d __A, __m512d __B) { - return (__m512d)__builtin_ia32_orpd512_mask( - (__v8df)__A, (__v8df)__B, (__v8df)_mm512_setzero_pd(), (__mmask8)__U); -} - -__funline __m512 _mm512_or_ps(__m512 __A, __m512 __B) { - return (__m512)__builtin_ia32_orps512_mask( - (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ps(), (__mmask16)-1); -} - -__funline __m512 _mm512_mask_or_ps(__m512 __W, __mmask16 __U, __m512 __A, - __m512 __B) { - return (__m512)__builtin_ia32_orps512_mask((__v16sf)__A, (__v16sf)__B, - (__v16sf)__W, (__mmask16)__U); -} - -__funline __m512 _mm512_maskz_or_ps(__mmask16 __U, __m512 __A, __m512 __B) { - return (__m512)__builtin_ia32_orps512_mask( - (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U); -} - -__funline __m512d _mm512_and_pd(__m512d __A, __m512d __B) { - return (__m512d)__builtin_ia32_andpd512_mask( - (__v8df)__A, (__v8df)__B, (__v8df)_mm512_setzero_pd(), (__mmask8)-1); -} - -__funline __m512d _mm512_mask_and_pd(__m512d __W, __mmask8 __U, __m512d __A, - __m512d __B) { - return (__m512d)__builtin_ia32_andpd512_mask((__v8df)__A, (__v8df)__B, - (__v8df)__W, (__mmask8)__U); -} - -__funline __m512d _mm512_maskz_and_pd(__mmask8 __U, __m512d 
__A, __m512d __B) { - return (__m512d)__builtin_ia32_andpd512_mask( - (__v8df)__A, (__v8df)__B, (__v8df)_mm512_setzero_pd(), (__mmask8)__U); -} - -__funline __m512 _mm512_and_ps(__m512 __A, __m512 __B) { - return (__m512)__builtin_ia32_andps512_mask( - (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ps(), (__mmask16)-1); -} - -__funline __m512 _mm512_mask_and_ps(__m512 __W, __mmask16 __U, __m512 __A, - __m512 __B) { - return (__m512)__builtin_ia32_andps512_mask((__v16sf)__A, (__v16sf)__B, - (__v16sf)__W, (__mmask16)__U); -} - -__funline __m512 _mm512_maskz_and_ps(__mmask16 __U, __m512 __A, __m512 __B) { - return (__m512)__builtin_ia32_andps512_mask( - (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U); -} - -__funline __m512d _mm512_andnot_pd(__m512d __A, __m512d __B) { - return (__m512d)__builtin_ia32_andnpd512_mask( - (__v8df)__A, (__v8df)__B, (__v8df)_mm512_setzero_pd(), (__mmask8)-1); -} - -__funline __m512d _mm512_mask_andnot_pd(__m512d __W, __mmask8 __U, __m512d __A, - __m512d __B) { - return (__m512d)__builtin_ia32_andnpd512_mask((__v8df)__A, (__v8df)__B, - (__v8df)__W, (__mmask8)__U); -} - -__funline __m512d _mm512_maskz_andnot_pd(__mmask8 __U, __m512d __A, __m512d __B) { - return (__m512d)__builtin_ia32_andnpd512_mask( - (__v8df)__A, (__v8df)__B, (__v8df)_mm512_setzero_pd(), (__mmask8)__U); -} - -__funline __m512 _mm512_andnot_ps(__m512 __A, __m512 __B) { - return (__m512)__builtin_ia32_andnps512_mask( - (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ps(), (__mmask16)-1); -} - -__funline __m512 _mm512_mask_andnot_ps(__m512 __W, __mmask16 __U, __m512 __A, - __m512 __B) { - return (__m512)__builtin_ia32_andnps512_mask((__v16sf)__A, (__v16sf)__B, - (__v16sf)__W, (__mmask16)__U); -} - -__funline __m512 _mm512_maskz_andnot_ps(__mmask16 __U, __m512 __A, __m512 __B) { - return (__m512)__builtin_ia32_andnps512_mask( - (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U); -} - -__funline __mmask16 _mm512_movepi32_mask(__m512i __A) { - return (__mmask16)__builtin_ia32_cvtd2mask512((__v16si)__A); -} - -__funline __mmask8 _mm512_movepi64_mask(__m512i __A) { - return (__mmask8)__builtin_ia32_cvtq2mask512((__v8di)__A); -} - -__funline __m512i _mm512_movm_epi32(__mmask16 __A) { - return (__m512i)__builtin_ia32_cvtmask2d512(__A); -} - -__funline __m512i _mm512_movm_epi64(__mmask8 __A) { - return (__m512i)__builtin_ia32_cvtmask2q512(__A); -} - -__funline __m512i _mm512_cvttpd_epi64(__m512d __A) { - return (__m512i)__builtin_ia32_cvttpd2qq512_mask( - (__v8df)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)-1, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512i _mm512_mask_cvttpd_epi64(__m512i __W, __mmask8 __U, - __m512d __A) { - return (__m512i)__builtin_ia32_cvttpd2qq512_mask( - (__v8df)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512i _mm512_maskz_cvttpd_epi64(__mmask8 __U, __m512d __A) { - return (__m512i)__builtin_ia32_cvttpd2qq512_mask( - (__v8df)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512i _mm512_cvttpd_epu64(__m512d __A) { - return (__m512i)__builtin_ia32_cvttpd2uqq512_mask( - (__v8df)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)-1, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512i _mm512_mask_cvttpd_epu64(__m512i __W, __mmask8 __U, - __m512d __A) { - return (__m512i)__builtin_ia32_cvttpd2uqq512_mask( - (__v8df)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512i _mm512_maskz_cvttpd_epu64(__mmask8 __U, __m512d 
__A) { - return (__m512i)__builtin_ia32_cvttpd2uqq512_mask( - (__v8df)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512i _mm512_cvttps_epi64(__m256 __A) { - return (__m512i)__builtin_ia32_cvttps2qq512_mask( - (__v8sf)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)-1, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512i _mm512_mask_cvttps_epi64(__m512i __W, __mmask8 __U, - __m256 __A) { - return (__m512i)__builtin_ia32_cvttps2qq512_mask( - (__v8sf)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512i _mm512_maskz_cvttps_epi64(__mmask8 __U, __m256 __A) { - return (__m512i)__builtin_ia32_cvttps2qq512_mask( - (__v8sf)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512i _mm512_cvttps_epu64(__m256 __A) { - return (__m512i)__builtin_ia32_cvttps2uqq512_mask( - (__v8sf)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)-1, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512i _mm512_mask_cvttps_epu64(__m512i __W, __mmask8 __U, - __m256 __A) { - return (__m512i)__builtin_ia32_cvttps2uqq512_mask( - (__v8sf)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512i _mm512_maskz_cvttps_epu64(__mmask8 __U, __m256 __A) { - return (__m512i)__builtin_ia32_cvttps2uqq512_mask( - (__v8sf)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512i _mm512_cvtpd_epi64(__m512d __A) { - return (__m512i)__builtin_ia32_cvtpd2qq512_mask( - (__v8df)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)-1, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512i _mm512_mask_cvtpd_epi64(__m512i __W, __mmask8 __U, - __m512d __A) { - return (__m512i)__builtin_ia32_cvtpd2qq512_mask( - (__v8df)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512i _mm512_maskz_cvtpd_epi64(__mmask8 __U, __m512d __A) { - return (__m512i)__builtin_ia32_cvtpd2qq512_mask( - (__v8df)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512i _mm512_cvtpd_epu64(__m512d __A) { - return (__m512i)__builtin_ia32_cvtpd2uqq512_mask( - (__v8df)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)-1, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512i _mm512_mask_cvtpd_epu64(__m512i __W, __mmask8 __U, - __m512d __A) { - return (__m512i)__builtin_ia32_cvtpd2uqq512_mask( - (__v8df)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512i _mm512_maskz_cvtpd_epu64(__mmask8 __U, __m512d __A) { - return (__m512i)__builtin_ia32_cvtpd2uqq512_mask( - (__v8df)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512i _mm512_cvtps_epi64(__m256 __A) { - return (__m512i)__builtin_ia32_cvtps2qq512_mask( - (__v8sf)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)-1, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512i _mm512_mask_cvtps_epi64(__m512i __W, __mmask8 __U, __m256 __A) { - return (__m512i)__builtin_ia32_cvtps2qq512_mask( - (__v8sf)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512i _mm512_maskz_cvtps_epi64(__mmask8 __U, __m256 __A) { - return (__m512i)__builtin_ia32_cvtps2qq512_mask( - (__v8sf)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512i _mm512_cvtps_epu64(__m256 __A) { - return (__m512i)__builtin_ia32_cvtps2uqq512_mask( - (__v8sf)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)-1, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512i 
_mm512_mask_cvtps_epu64(__m512i __W, __mmask8 __U, __m256 __A) { - return (__m512i)__builtin_ia32_cvtps2uqq512_mask( - (__v8sf)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512i _mm512_maskz_cvtps_epu64(__mmask8 __U, __m256 __A) { - return (__m512i)__builtin_ia32_cvtps2uqq512_mask( - (__v8sf)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m256 _mm512_cvtepi64_ps(__m512i __A) { - return (__m256)__builtin_ia32_cvtqq2ps512_mask( - (__v8di)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)-1, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m256 _mm512_mask_cvtepi64_ps(__m256 __W, __mmask8 __U, __m512i __A) { - return (__m256)__builtin_ia32_cvtqq2ps512_mask( - (__v8di)__A, (__v8sf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); -} - -__funline __m256 _mm512_maskz_cvtepi64_ps(__mmask8 __U, __m512i __A) { - return (__m256)__builtin_ia32_cvtqq2ps512_mask( - (__v8di)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m256 _mm512_cvtepu64_ps(__m512i __A) { - return (__m256)__builtin_ia32_cvtuqq2ps512_mask( - (__v8di)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)-1, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m256 _mm512_mask_cvtepu64_ps(__m256 __W, __mmask8 __U, __m512i __A) { - return (__m256)__builtin_ia32_cvtuqq2ps512_mask( - (__v8di)__A, (__v8sf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); -} - -__funline __m256 _mm512_maskz_cvtepu64_ps(__mmask8 __U, __m512i __A) { - return (__m256)__builtin_ia32_cvtuqq2ps512_mask( - (__v8di)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512d _mm512_cvtepi64_pd(__m512i __A) { - return (__m512d)__builtin_ia32_cvtqq2pd512_mask( - (__v8di)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)-1, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512d _mm512_mask_cvtepi64_pd(__m512d __W, __mmask8 __U, - __m512i __A) { - return (__m512d)__builtin_ia32_cvtqq2pd512_mask( - (__v8di)__A, (__v8df)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512d _mm512_maskz_cvtepi64_pd(__mmask8 __U, __m512i __A) { - return (__m512d)__builtin_ia32_cvtqq2pd512_mask( - (__v8di)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512d _mm512_cvtepu64_pd(__m512i __A) { - return (__m512d)__builtin_ia32_cvtuqq2pd512_mask( - (__v8di)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)-1, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512d _mm512_mask_cvtepu64_pd(__m512d __W, __mmask8 __U, - __m512i __A) { - return (__m512d)__builtin_ia32_cvtuqq2pd512_mask( - (__v8di)__A, (__v8df)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512d _mm512_maskz_cvtepu64_pd(__mmask8 __U, __m512i __A) { - return (__m512d)__builtin_ia32_cvtuqq2pd512_mask( - (__v8di)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#ifdef __OPTIMIZE__ -__funline __mmask8 _kshiftli_mask8(__mmask8 __A, unsigned int __B) { - return (__mmask8)__builtin_ia32_kshiftliqi((__mmask8)__A, (__mmask8)__B); -} - -__funline __mmask8 _kshiftri_mask8(__mmask8 __A, unsigned int __B) { - return (__mmask8)__builtin_ia32_kshiftriqi((__mmask8)__A, (__mmask8)__B); -} - -__funline __m512d _mm512_range_pd(__m512d __A, __m512d __B, int __C) { - return (__m512d)__builtin_ia32_rangepd512_mask( - (__v8df)__A, (__v8df)__B, __C, (__v8df)_mm512_setzero_pd(), (__mmask8)-1, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512d _mm512_mask_range_pd(__m512d __W, __mmask8 __U, __m512d __A, - __m512d __B, 
int __C) { - return (__m512d)__builtin_ia32_rangepd512_mask((__v8df)__A, (__v8df)__B, __C, - (__v8df)__W, (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512d _mm512_maskz_range_pd(__mmask8 __U, __m512d __A, __m512d __B, - int __C) { - return (__m512d)__builtin_ia32_rangepd512_mask( - (__v8df)__A, (__v8df)__B, __C, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512 _mm512_range_ps(__m512 __A, __m512 __B, int __C) { - return (__m512)__builtin_ia32_rangeps512_mask( - (__v16sf)__A, (__v16sf)__B, __C, (__v16sf)_mm512_setzero_ps(), - (__mmask16)-1, _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512 _mm512_mask_range_ps(__m512 __W, __mmask16 __U, __m512 __A, - __m512 __B, int __C) { - return (__m512)__builtin_ia32_rangeps512_mask((__v16sf)__A, (__v16sf)__B, __C, - (__v16sf)__W, (__mmask16)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512 _mm512_maskz_range_ps(__mmask16 __U, __m512 __A, __m512 __B, - int __C) { - return (__m512)__builtin_ia32_rangeps512_mask( - (__v16sf)__A, (__v16sf)__B, __C, (__v16sf)_mm512_setzero_ps(), - (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); -} - -__funline __m128d _mm_reduce_sd(__m128d __A, __m128d __B, int __C) { - return (__m128d)__builtin_ia32_reducesd_mask( - (__v2df)__A, (__v2df)__B, __C, (__v2df)_mm_setzero_pd(), (__mmask8)-1); -} - -__funline __m128d _mm_mask_reduce_sd(__m128d __W, __mmask8 __U, __m128d __A, - __m128d __B, int __C) { - return (__m128d)__builtin_ia32_reducesd_mask((__v2df)__A, (__v2df)__B, __C, - (__v2df)__W, (__mmask8)__U); -} - -__funline __m128d _mm_maskz_reduce_sd(__mmask8 __U, __m128d __A, __m128d __B, - int __C) { - return (__m128d)__builtin_ia32_reducesd_mask( - (__v2df)__A, (__v2df)__B, __C, (__v2df)_mm_setzero_pd(), (__mmask8)__U); -} - -__funline __m128 _mm_reduce_ss(__m128 __A, __m128 __B, int __C) { - return (__m128)__builtin_ia32_reducess_mask( - (__v4sf)__A, (__v4sf)__B, __C, (__v4sf)_mm_setzero_ps(), (__mmask8)-1); -} - -__funline __m128 _mm_mask_reduce_ss(__m128 __W, __mmask8 __U, __m128 __A, - __m128 __B, int __C) { - return (__m128)__builtin_ia32_reducess_mask((__v4sf)__A, (__v4sf)__B, __C, - (__v4sf)__W, (__mmask8)__U); -} - -__funline __m128 _mm_maskz_reduce_ss(__mmask8 __U, __m128 __A, __m128 __B, - int __C) { - return (__m128)__builtin_ia32_reducess_mask( - (__v4sf)__A, (__v4sf)__B, __C, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); -} - -__funline __m128d _mm_range_sd(__m128d __A, __m128d __B, int __C) { - return (__m128d)__builtin_ia32_rangesd128_mask_round( - (__v2df)__A, (__v2df)__B, __C, (__v2df)_mm_setzero_pd(), (__mmask8)-1, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m128d _mm_mask_range_sd(__m128d __W, __mmask8 __U, __m128d __A, - __m128d __B, int __C) { - return (__m128d)__builtin_ia32_rangesd128_mask_round( - (__v2df)__A, (__v2df)__B, __C, (__v2df)__W, (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m128d _mm_maskz_range_sd(__mmask8 __U, __m128d __A, __m128d __B, - int __C) { - return (__m128d)__builtin_ia32_rangesd128_mask_round( - (__v2df)__A, (__v2df)__B, __C, (__v2df)_mm_setzero_pd(), (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m128 _mm_range_ss(__m128 __A, __m128 __B, int __C) { - return (__m128)__builtin_ia32_rangess128_mask_round( - (__v4sf)__A, (__v4sf)__B, __C, (__v4sf)_mm_setzero_ps(), (__mmask8)-1, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m128 _mm_mask_range_ss(__m128 __W, __mmask8 __U, __m128 __A, - __m128 __B, int __C) { - return (__m128)__builtin_ia32_rangess128_mask_round( - (__v4sf)__A, 
(__v4sf)__B, __C, (__v4sf)__W, (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m128 _mm_maskz_range_ss(__mmask8 __U, __m128 __A, __m128 __B, - int __C) { - return (__m128)__builtin_ia32_rangess128_mask_round( - (__v4sf)__A, (__v4sf)__B, __C, (__v4sf)_mm_setzero_ps(), (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m128d _mm_range_round_sd(__m128d __A, __m128d __B, int __C, - const int __R) { - return (__m128d)__builtin_ia32_rangesd128_mask_round( - (__v2df)__A, (__v2df)__B, __C, (__v2df)_mm_setzero_pd(), (__mmask8)-1, - __R); -} - -__funline __m128d _mm_mask_range_round_sd(__m128d __W, __mmask8 __U, __m128d __A, - __m128d __B, int __C, const int __R) { - return (__m128d)__builtin_ia32_rangesd128_mask_round( - (__v2df)__A, (__v2df)__B, __C, (__v2df)__W, (__mmask8)__U, __R); -} - -__funline __m128d _mm_maskz_range_round_sd(__mmask8 __U, __m128d __A, __m128d __B, - int __C, const int __R) { - return (__m128d)__builtin_ia32_rangesd128_mask_round( - (__v2df)__A, (__v2df)__B, __C, (__v2df)_mm_setzero_pd(), (__mmask8)__U, - __R); -} - -__funline __m128 _mm_range_round_ss(__m128 __A, __m128 __B, int __C, - const int __R) { - return (__m128)__builtin_ia32_rangess128_mask_round( - (__v4sf)__A, (__v4sf)__B, __C, (__v4sf)_mm_setzero_ps(), (__mmask8)-1, - __R); -} - -__funline __m128 _mm_mask_range_round_ss(__m128 __W, __mmask8 __U, __m128 __A, - __m128 __B, int __C, const int __R) { - return (__m128)__builtin_ia32_rangess128_mask_round( - (__v4sf)__A, (__v4sf)__B, __C, (__v4sf)__W, (__mmask8)__U, __R); -} - -__funline __m128 _mm_maskz_range_round_ss(__mmask8 __U, __m128 __A, __m128 __B, - int __C, const int __R) { - return (__m128)__builtin_ia32_rangess128_mask_round( - (__v4sf)__A, (__v4sf)__B, __C, (__v4sf)_mm_setzero_ps(), (__mmask8)__U, - __R); -} - -__funline __mmask8 _mm_fpclass_ss_mask(__m128 __A, const int __imm) { - return (__mmask8)__builtin_ia32_fpclassss((__v4sf)__A, __imm); -} - -__funline __mmask8 _mm_fpclass_sd_mask(__m128d __A, const int __imm) { - return (__mmask8)__builtin_ia32_fpclasssd((__v2df)__A, __imm); -} - -__funline __m512i _mm512_cvtt_roundpd_epi64(__m512d __A, const int __R) { - return (__m512i)__builtin_ia32_cvttpd2qq512_mask( - (__v8df)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)-1, __R); -} - -__funline __m512i _mm512_mask_cvtt_roundpd_epi64(__m512i __W, __mmask8 __U, - __m512d __A, const int __R) { - return (__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)__A, (__v8di)__W, - (__mmask8)__U, __R); -} - -__funline __m512i _mm512_maskz_cvtt_roundpd_epi64(__mmask8 __U, __m512d __A, - const int __R) { - return (__m512i)__builtin_ia32_cvttpd2qq512_mask( - (__v8df)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U, __R); -} - -__funline __m512i _mm512_cvtt_roundpd_epu64(__m512d __A, const int __R) { - return (__m512i)__builtin_ia32_cvttpd2uqq512_mask( - (__v8df)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)-1, __R); -} - -__funline __m512i _mm512_mask_cvtt_roundpd_epu64(__m512i __W, __mmask8 __U, - __m512d __A, const int __R) { - return (__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)__A, (__v8di)__W, - (__mmask8)__U, __R); -} - -__funline __m512i _mm512_maskz_cvtt_roundpd_epu64(__mmask8 __U, __m512d __A, - const int __R) { - return (__m512i)__builtin_ia32_cvttpd2uqq512_mask( - (__v8df)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U, __R); -} - -__funline __m512i _mm512_cvtt_roundps_epi64(__m256 __A, const int __R) { - return (__m512i)__builtin_ia32_cvttps2qq512_mask( - (__v8sf)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)-1, __R); -} - 
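(Editor's aside, not part of the patch: the cvt_round/cvtt_round family being deleted here in its clang-formatted `__funline` form, and re-added elsewhere in this patch in GNU style, takes an explicit rounding-mode immediate instead of reading MXCSR, and each operation comes in plain, mask (`__W`, `__U`) and maskz (`__U`) forms that merge into `__W` or zero the lanes cleared in `__U`. A minimal sketch of the convention, assuming a gcc -mavx512dq target; names are illustrative only:)

#include <immintrin.h>

/* Round to nearest and suppress exceptions; lanes cleared in `keep`
   take their value from `fallback` (mask-form merge semantics). */
__m512i round_to_i64(__m512d x, __m512i fallback, __mmask8 keep) {
  return _mm512_mask_cvt_roundpd_epi64(
      fallback, keep, x, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}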
-__funline __m512i _mm512_mask_cvtt_roundps_epi64(__m512i __W, __mmask8 __U, - __m256 __A, const int __R) { - return (__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)__A, (__v8di)__W, - (__mmask8)__U, __R); -} - -__funline __m512i _mm512_maskz_cvtt_roundps_epi64(__mmask8 __U, __m256 __A, - const int __R) { - return (__m512i)__builtin_ia32_cvttps2qq512_mask( - (__v8sf)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U, __R); -} - -__funline __m512i _mm512_cvtt_roundps_epu64(__m256 __A, const int __R) { - return (__m512i)__builtin_ia32_cvttps2uqq512_mask( - (__v8sf)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)-1, __R); -} - -__funline __m512i _mm512_mask_cvtt_roundps_epu64(__m512i __W, __mmask8 __U, - __m256 __A, const int __R) { - return (__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)__A, (__v8di)__W, - (__mmask8)__U, __R); -} - -__funline __m512i _mm512_maskz_cvtt_roundps_epu64(__mmask8 __U, __m256 __A, - const int __R) { - return (__m512i)__builtin_ia32_cvttps2uqq512_mask( - (__v8sf)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U, __R); -} - -__funline __m512i _mm512_cvt_roundpd_epi64(__m512d __A, const int __R) { - return (__m512i)__builtin_ia32_cvtpd2qq512_mask( - (__v8df)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)-1, __R); -} - -__funline __m512i _mm512_mask_cvt_roundpd_epi64(__m512i __W, __mmask8 __U, - __m512d __A, const int __R) { - return (__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)__A, (__v8di)__W, - (__mmask8)__U, __R); -} - -__funline __m512i _mm512_maskz_cvt_roundpd_epi64(__mmask8 __U, __m512d __A, - const int __R) { - return (__m512i)__builtin_ia32_cvtpd2qq512_mask( - (__v8df)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U, __R); -} - -__funline __m512i _mm512_cvt_roundpd_epu64(__m512d __A, const int __R) { - return (__m512i)__builtin_ia32_cvtpd2uqq512_mask( - (__v8df)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)-1, __R); -} - -__funline __m512i _mm512_mask_cvt_roundpd_epu64(__m512i __W, __mmask8 __U, - __m512d __A, const int __R) { - return (__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)__A, (__v8di)__W, - (__mmask8)__U, __R); -} - -__funline __m512i _mm512_maskz_cvt_roundpd_epu64(__mmask8 __U, __m512d __A, - const int __R) { - return (__m512i)__builtin_ia32_cvtpd2uqq512_mask( - (__v8df)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U, __R); -} - -__funline __m512i _mm512_cvt_roundps_epi64(__m256 __A, const int __R) { - return (__m512i)__builtin_ia32_cvtps2qq512_mask( - (__v8sf)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)-1, __R); -} - -__funline __m512i _mm512_mask_cvt_roundps_epi64(__m512i __W, __mmask8 __U, - __m256 __A, const int __R) { - return (__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)__A, (__v8di)__W, - (__mmask8)__U, __R); -} - -__funline __m512i _mm512_maskz_cvt_roundps_epi64(__mmask8 __U, __m256 __A, - const int __R) { - return (__m512i)__builtin_ia32_cvtps2qq512_mask( - (__v8sf)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U, __R); -} - -__funline __m512i _mm512_cvt_roundps_epu64(__m256 __A, const int __R) { - return (__m512i)__builtin_ia32_cvtps2uqq512_mask( - (__v8sf)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)-1, __R); -} - -__funline __m512i _mm512_mask_cvt_roundps_epu64(__m512i __W, __mmask8 __U, - __m256 __A, const int __R) { - return (__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)__A, (__v8di)__W, - (__mmask8)__U, __R); -} - -__funline __m512i _mm512_maskz_cvt_roundps_epu64(__mmask8 __U, __m256 __A, - const int __R) { - return (__m512i)__builtin_ia32_cvtps2uqq512_mask( - (__v8sf)__A, 
(__v8di)_mm512_setzero_si512(), (__mmask8)__U, __R); -} - -__funline __m256 _mm512_cvt_roundepi64_ps(__m512i __A, const int __R) { - return (__m256)__builtin_ia32_cvtqq2ps512_mask( - (__v8di)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)-1, __R); -} - -__funline __m256 _mm512_mask_cvt_roundepi64_ps(__m256 __W, __mmask8 __U, - __m512i __A, const int __R) { - return (__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)__A, (__v8sf)__W, - (__mmask8)__U, __R); -} - -__funline __m256 _mm512_maskz_cvt_roundepi64_ps(__mmask8 __U, __m512i __A, - const int __R) { - return (__m256)__builtin_ia32_cvtqq2ps512_mask( - (__v8di)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U, __R); -} - -__funline __m256 _mm512_cvt_roundepu64_ps(__m512i __A, const int __R) { - return (__m256)__builtin_ia32_cvtuqq2ps512_mask( - (__v8di)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)-1, __R); -} - -__funline __m256 _mm512_mask_cvt_roundepu64_ps(__m256 __W, __mmask8 __U, - __m512i __A, const int __R) { - return (__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)__A, (__v8sf)__W, - (__mmask8)__U, __R); -} - -__funline __m256 _mm512_maskz_cvt_roundepu64_ps(__mmask8 __U, __m512i __A, - const int __R) { - return (__m256)__builtin_ia32_cvtuqq2ps512_mask( - (__v8di)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U, __R); -} - -__funline __m512d _mm512_cvt_roundepi64_pd(__m512i __A, const int __R) { - return (__m512d)__builtin_ia32_cvtqq2pd512_mask( - (__v8di)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)-1, __R); -} - -__funline __m512d _mm512_mask_cvt_roundepi64_pd(__m512d __W, __mmask8 __U, - __m512i __A, const int __R) { - return (__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)__A, (__v8df)__W, - (__mmask8)__U, __R); -} - -__funline __m512d _mm512_maskz_cvt_roundepi64_pd(__mmask8 __U, __m512i __A, - const int __R) { - return (__m512d)__builtin_ia32_cvtqq2pd512_mask( - (__v8di)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, __R); -} - -__funline __m512d _mm512_cvt_roundepu64_pd(__m512i __A, const int __R) { - return (__m512d)__builtin_ia32_cvtuqq2pd512_mask( - (__v8di)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)-1, __R); -} - -__funline __m512d _mm512_mask_cvt_roundepu64_pd(__m512d __W, __mmask8 __U, - __m512i __A, const int __R) { - return (__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)__A, (__v8df)__W, - (__mmask8)__U, __R); -} - -__funline __m512d _mm512_maskz_cvt_roundepu64_pd(__mmask8 __U, __m512i __A, - const int __R) { - return (__m512d)__builtin_ia32_cvtuqq2pd512_mask( - (__v8di)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, __R); -} - -__funline __m512d _mm512_reduce_pd(__m512d __A, int __B) { - return (__m512d)__builtin_ia32_reducepd512_mask( - (__v8df)__A, __B, (__v8df)_mm512_setzero_pd(), (__mmask8)-1); -} - -__funline __m512d _mm512_mask_reduce_pd(__m512d __W, __mmask8 __U, __m512d __A, - int __B) { - return (__m512d)__builtin_ia32_reducepd512_mask((__v8df)__A, __B, (__v8df)__W, - (__mmask8)__U); -} - -__funline __m512d _mm512_maskz_reduce_pd(__mmask8 __U, __m512d __A, int __B) { - return (__m512d)__builtin_ia32_reducepd512_mask( - (__v8df)__A, __B, (__v8df)_mm512_setzero_pd(), (__mmask8)__U); -} - -__funline __m512 _mm512_reduce_ps(__m512 __A, int __B) { - return (__m512)__builtin_ia32_reduceps512_mask( - (__v16sf)__A, __B, (__v16sf)_mm512_setzero_ps(), (__mmask16)-1); -} - -__funline __m512 _mm512_mask_reduce_ps(__m512 __W, __mmask16 __U, __m512 __A, - int __B) { - return (__m512)__builtin_ia32_reduceps512_mask((__v16sf)__A, __B, - (__v16sf)__W, (__mmask16)__U); -} - -__funline __m512 _mm512_maskz_reduce_ps(__mmask16 __U, 
__m512 __A, int __B) {
- return (__m512)__builtin_ia32_reduceps512_mask(
- (__v16sf)__A, __B, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U);
-}
-
-__funline __m256 _mm512_extractf32x8_ps(__m512 __A, const int __imm) {
- return (__m256)__builtin_ia32_extractf32x8_mask(
- (__v16sf)__A, __imm, (__v8sf)_mm256_setzero_ps(), (__mmask8)-1);
-}
-
-__funline __m256 _mm512_mask_extractf32x8_ps(__m256 __W, __mmask8 __U, __m512 __A,
- const int __imm) {
- return (__m256)__builtin_ia32_extractf32x8_mask((__v16sf)__A, __imm,
- (__v8sf)__W, (__mmask8)__U);
-}
-
-__funline __m256 _mm512_maskz_extractf32x8_ps(__mmask8 __U, __m512 __A,
- const int __imm) {
- return (__m256)__builtin_ia32_extractf32x8_mask(
- (__v16sf)__A, __imm, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U);
-}
-
-__funline __m128d _mm512_extractf64x2_pd(__m512d __A, const int __imm) {
- return (__m128d)__builtin_ia32_extractf64x2_512_mask(
- (__v8df)__A, __imm, (__v2df)_mm_setzero_pd(), (__mmask8)-1);
-}
-
-__funline __m128d _mm512_mask_extractf64x2_pd(__m128d __W, __mmask8 __U,
- __m512d __A, const int __imm) {
- return (__m128d)__builtin_ia32_extractf64x2_512_mask(
- (__v8df)__A, __imm, (__v2df)__W, (__mmask8)__U);
-}
-
-__funline __m128d _mm512_maskz_extractf64x2_pd(__mmask8 __U, __m512d __A,
- const int __imm) {
- return (__m128d)__builtin_ia32_extractf64x2_512_mask(
- (__v8df)__A, __imm, (__v2df)_mm_setzero_pd(), (__mmask8)__U);
-}
-
-__funline __m256i _mm512_extracti32x8_epi32(__m512i __A, const int __imm) {
- return (__m256i)__builtin_ia32_extracti32x8_mask(
- (__v16si)__A, __imm, (__v8si)_mm256_setzero_si256(), (__mmask8)-1);
-}
-
-__funline __m256i _mm512_mask_extracti32x8_epi32(__m256i __W, __mmask8 __U,
- __m512i __A, const int __imm) {
- return (__m256i)__builtin_ia32_extracti32x8_mask((__v16si)__A, __imm,
- (__v8si)__W, (__mmask8)__U);
-}
-
-__funline __m256i _mm512_maskz_extracti32x8_epi32(__mmask8 __U, __m512i __A,
- const int __imm) {
- return (__m256i)__builtin_ia32_extracti32x8_mask(
- (__v16si)__A, __imm, (__v8si)_mm256_setzero_si256(), (__mmask8)__U);
-}
-
-__funline __m128i _mm512_extracti64x2_epi64(__m512i __A, const int __imm) {
- return (__m128i)__builtin_ia32_extracti64x2_512_mask(
- (__v8di)__A, __imm, (__v2di)_mm_setzero_si128(), (__mmask8)-1);
-}
-
-__funline __m128i _mm512_mask_extracti64x2_epi64(__m128i __W, __mmask8 __U,
- __m512i __A, const int __imm) {
- return (__m128i)__builtin_ia32_extracti64x2_512_mask(
- (__v8di)__A, __imm, (__v2di)__W, (__mmask8)__U);
-}
-
-__funline __m128i _mm512_maskz_extracti64x2_epi64(__mmask8 __U, __m512i __A,
- const int __imm) {
- return (__m128i)__builtin_ia32_extracti64x2_512_mask(
- (__v8di)__A, __imm, (__v2di)_mm_setzero_si128(), (__mmask8)__U);
-}
-
-__funline __m512d _mm512_range_round_pd(__m512d __A, __m512d __B, int __C,
- const int __R) {
- return (__m512d)__builtin_ia32_rangepd512_mask((__v8df)__A, (__v8df)__B, __C,
- (__v8df)_mm512_setzero_pd(),
- (__mmask8)-1, __R);
-}
-
-__funline __m512d _mm512_mask_range_round_pd(__m512d __W, __mmask8 __U,
- __m512d __A, __m512d __B, int __C,
- const int __R) {
- return (__m512d)__builtin_ia32_rangepd512_mask(
- (__v8df)__A, (__v8df)__B, __C, (__v8df)__W, (__mmask8)__U, __R);
-}
-
-__funline __m512d _mm512_maskz_range_round_pd(__mmask8 __U, __m512d __A,
- __m512d __B, int __C,
- const int __R) {
- return (__m512d)__builtin_ia32_rangepd512_mask((__v8df)__A, (__v8df)__B, __C,
- (__v8df)_mm512_setzero_pd(),
- (__mmask8)__U, __R);
-}
-
-__funline __m512 _mm512_range_round_ps(__m512 __A, __m512 __B, int __C,
- const int __R) {
- return (__m512)__builtin_ia32_rangeps512_mask((__v16sf)__A, (__v16sf)__B, __C,
- (__v16sf)_mm512_setzero_ps(),
- (__mmask16)-1, __R);
-}
-
-__funline __m512 _mm512_mask_range_round_ps(__m512 __W, __mmask16 __U, __m512 __A,
- __m512 __B, int __C, const int __R) {
- return (__m512)__builtin_ia32_rangeps512_mask(
- (__v16sf)__A, (__v16sf)__B, __C, (__v16sf)__W, (__mmask16)__U, __R);
-}
-
-__funline __m512 _mm512_maskz_range_round_ps(__mmask16 __U, __m512 __A,
- __m512 __B, int __C, const int __R) {
- return (__m512)__builtin_ia32_rangeps512_mask((__v16sf)__A, (__v16sf)__B, __C,
- (__v16sf)_mm512_setzero_ps(),
- (__mmask16)__U, __R);
-}
-
-__funline __m512i _mm512_inserti32x8(__m512i __A, __m256i __B, const int __imm) {
- return (__m512i)__builtin_ia32_inserti32x8_mask(
- (__v16si)__A, (__v8si)__B, __imm, (__v16si)_mm512_setzero_si512(),
- (__mmask16)-1);
-}
-
-__funline __m512i _mm512_mask_inserti32x8(__m512i __W, __mmask16 __U, __m512i __A,
- __m256i __B, const int __imm) {
- return (__m512i)__builtin_ia32_inserti32x8_mask(
- (__v16si)__A, (__v8si)__B, __imm, (__v16si)__W, (__mmask16)__U);
-}
-
-__funline __m512i _mm512_maskz_inserti32x8(__mmask16 __U, __m512i __A,
- __m256i __B, const int __imm) {
- return (__m512i)__builtin_ia32_inserti32x8_mask(
- (__v16si)__A, (__v8si)__B, __imm, (__v16si)_mm512_setzero_si512(),
- (__mmask16)__U);
-}
-
-__funline __m512 _mm512_insertf32x8(__m512 __A, __m256 __B, const int __imm) {
- return (__m512)__builtin_ia32_insertf32x8_mask(
- (__v16sf)__A, (__v8sf)__B, __imm, (__v16sf)_mm512_setzero_ps(),
- (__mmask16)-1);
-}
-
-__funline __m512 _mm512_mask_insertf32x8(__m512 __W, __mmask16 __U, __m512 __A,
- __m256 __B, const int __imm) {
- return (__m512)__builtin_ia32_insertf32x8_mask(
- (__v16sf)__A, (__v8sf)__B, __imm, (__v16sf)__W, (__mmask16)__U);
-}
-
-__funline __m512 _mm512_maskz_insertf32x8(__mmask16 __U, __m512 __A, __m256 __B,
- const int __imm) {
- return (__m512)__builtin_ia32_insertf32x8_mask(
- (__v16sf)__A, (__v8sf)__B, __imm, (__v16sf)_mm512_setzero_ps(),
- (__mmask16)__U);
-}
-
-__funline __m512i _mm512_inserti64x2(__m512i __A, __m128i __B, const int __imm) {
- return (__m512i)__builtin_ia32_inserti64x2_512_mask(
- (__v8di)__A, (__v2di)__B, __imm, (__v8di)_mm512_setzero_si512(),
- (__mmask8)-1);
-}
-
-__funline __m512i _mm512_mask_inserti64x2(__m512i __W, __mmask8 __U, __m512i __A,
- __m128i __B, const int __imm) {
- return (__m512i)__builtin_ia32_inserti64x2_512_mask(
- (__v8di)__A, (__v2di)__B, __imm, (__v8di)__W, (__mmask8)__U);
-}
-
-__funline __m512i _mm512_maskz_inserti64x2(__mmask8 __U, __m512i __A, __m128i __B,
- const int __imm) {
- return (__m512i)__builtin_ia32_inserti64x2_512_mask(
- (__v8di)__A, (__v2di)__B, __imm, (__v8di)_mm512_setzero_si512(),
- (__mmask8)__U);
-}
-
-__funline __m512d _mm512_insertf64x2(__m512d __A, __m128d __B, const int __imm) {
- return (__m512d)__builtin_ia32_insertf64x2_512_mask(
- (__v8df)__A, (__v2df)__B, __imm, (__v8df)_mm512_setzero_pd(),
- (__mmask8)-1);
-}
-
-__funline __m512d _mm512_mask_insertf64x2(__m512d __W, __mmask8 __U, __m512d __A,
- __m128d __B, const int __imm) {
- return (__m512d)__builtin_ia32_insertf64x2_512_mask(
- (__v8df)__A, (__v2df)__B, __imm, (__v8df)__W, (__mmask8)__U);
-}
-
-__funline __m512d _mm512_maskz_insertf64x2(__mmask8 __U, __m512d __A, __m128d __B,
- const int __imm) {
- return (__m512d)__builtin_ia32_insertf64x2_512_mask(
- (__v8df)__A, (__v2df)__B, __imm, (__v8df)_mm512_setzero_pd(),
- (__mmask8)__U);
-}
-
-__funline __mmask8 _mm512_mask_fpclass_pd_mask(__mmask8 __U, __m512d __A,
- const int __imm) {
- return (__mmask8)__builtin_ia32_fpclasspd512_mask((__v8df)__A, __imm, __U);
-}
-
-__funline __mmask8 _mm512_fpclass_pd_mask(__m512d __A, const int __imm) {
- return (__mmask8)__builtin_ia32_fpclasspd512_mask((__v8df)__A, __imm,
- (__mmask8)-1);
-}
-
-__funline __mmask16 _mm512_mask_fpclass_ps_mask(__mmask16 __U, __m512 __A,
- const int __imm) {
- return (__mmask16)__builtin_ia32_fpclassps512_mask((__v16sf)__A, __imm, __U);
-}
-
-__funline __mmask16 _mm512_fpclass_ps_mask(__m512 __A, const int __imm) {
- return (__mmask16)__builtin_ia32_fpclassps512_mask((__v16sf)__A, __imm,
- (__mmask16)-1);
-}
-
-#else
-#define _kshiftli_mask8(X, Y) \
- ((__mmask8)__builtin_ia32_kshiftliqi((__mmask8)(X), (__mmask8)(Y)))
-
-#define _kshiftri_mask8(X, Y) \
- ((__mmask8)__builtin_ia32_kshiftriqi((__mmask8)(X), (__mmask8)(Y)))
-
-#define _mm_range_sd(A, B, C) \
- ((__m128d)__builtin_ia32_rangesd128_mask_round( \
- (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
- (__v2df)_mm_setzero_pd(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_mask_range_sd(W, U, A, B, C) \
- ((__m128d)__builtin_ia32_rangesd128_mask_round( \
- (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
- (__v2df)(__m128d)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_maskz_range_sd(U, A, B, C) \
- ((__m128d)__builtin_ia32_rangesd128_mask_round( \
- (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
- (__v2df)_mm_setzero_pd(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_range_ss(A, B, C) \
- ((__m128)__builtin_ia32_rangess128_mask_round( \
- (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), \
- (__v4sf)_mm_setzero_ps(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_mask_range_ss(W, U, A, B, C) \
- ((__m128)__builtin_ia32_rangess128_mask_round( \
- (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf)(__m128)(W), \
- (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_maskz_range_ss(U, A, B, C) \
- ((__m128)__builtin_ia32_rangess128_mask_round( \
- (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), \
- (__v4sf)_mm_setzero_ps(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_range_round_sd(A, B, C, R) \
- ((__m128d)__builtin_ia32_rangesd128_mask_round( \
- (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
- (__v2df)_mm_setzero_pd(), (__mmask8)-1, (R)))
-
-#define _mm_mask_range_round_sd(W, U, A, B, C, R) \
- ((__m128d)__builtin_ia32_rangesd128_mask_round( \
- (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
- (__v2df)(__m128d)(W), (__mmask8)(U), (R)))
-
-#define _mm_maskz_range_round_sd(U, A, B, C, R) \
- ((__m128d)__builtin_ia32_rangesd128_mask_round( \
- (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
- (__v2df)_mm_setzero_pd(), (__mmask8)(U), (R)))
-
-#define _mm_range_round_ss(A, B, C, R) \
- ((__m128)__builtin_ia32_rangess128_mask_round( \
- (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), \
- (__v4sf)_mm_setzero_ps(), (__mmask8)-1, (R)))
-
-#define _mm_mask_range_round_ss(W, U, A, B, C, R) \
- ((__m128)__builtin_ia32_rangess128_mask_round( \
- (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf)(__m128)(W), \
- (__mmask8)(U), (R)))
-
-#define _mm_maskz_range_round_ss(U, A, B, C, R) \
- ((__m128)__builtin_ia32_rangess128_mask_round( \
- (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), \
- (__v4sf)_mm_setzero_ps(), (__mmask8)(U), (R)))
-
-#define _mm512_cvtt_roundpd_epi64(A, B) \
- ((__m512i)__builtin_ia32_cvttpd2qq512_mask( \
- (A), (__v8di)_mm512_setzero_si512(), -1, (B)))
-
-#define _mm512_mask_cvtt_roundpd_epi64(W, U, A, B) \
- ((__m512i)__builtin_ia32_cvttpd2qq512_mask((A), (__v8di)(W), (U), (B)))
-
-#define _mm512_maskz_cvtt_roundpd_epi64(U, A, B) \
- ((__m512i)__builtin_ia32_cvttpd2qq512_mask( \
- (A), (__v8di)_mm512_setzero_si512(), (U), (B)))
-
-#define _mm512_cvtt_roundpd_epu64(A, B) \
- ((__m512i)__builtin_ia32_cvttpd2uqq512_mask( \
- (A), (__v8di)_mm512_setzero_si512(), -1, (B)))
-
-#define _mm512_mask_cvtt_roundpd_epu64(W, U, A, B) \
- ((__m512i)__builtin_ia32_cvttpd2uqq512_mask((A), (__v8di)(W), (U), (B)))
-
-#define _mm512_maskz_cvtt_roundpd_epu64(U, A, B) \
- ((__m512i)__builtin_ia32_cvttpd2uqq512_mask( \
- (A), (__v8di)_mm512_setzero_si512(), (U), (B)))
-
-#define _mm512_cvtt_roundps_epi64(A, B) \
- ((__m512i)__builtin_ia32_cvttps2qq512_mask( \
- (A), (__v8di)_mm512_setzero_si512(), -1, (B)))
-
-#define _mm512_mask_cvtt_roundps_epi64(W, U, A, B) \
- ((__m512i)__builtin_ia32_cvttps2qq512_mask((A), (__v8di)(W), (U), (B)))
-
-#define _mm512_maskz_cvtt_roundps_epi64(U, A, B) \
- ((__m512i)__builtin_ia32_cvttps2qq512_mask( \
- (A), (__v8di)_mm512_setzero_si512(), (U), (B)))
-
-#define _mm512_cvtt_roundps_epu64(A, B) \
- ((__m512i)__builtin_ia32_cvttps2uqq512_mask( \
- (A), (__v8di)_mm512_setzero_si512(), -1, (B)))
-
-#define _mm512_mask_cvtt_roundps_epu64(W, U, A, B) \
- ((__m512i)__builtin_ia32_cvttps2uqq512_mask((A), (__v8di)(W), (U), (B)))
-
-#define _mm512_maskz_cvtt_roundps_epu64(U, A, B) \
- ((__m512i)__builtin_ia32_cvttps2uqq512_mask( \
- (A), (__v8di)_mm512_setzero_si512(), (U), (B)))
-
-#define _mm512_cvt_roundpd_epi64(A, B) \
- ((__m512i)__builtin_ia32_cvtpd2qq512_mask( \
- (A), (__v8di)_mm512_setzero_si512(), -1, (B)))
-
-#define _mm512_mask_cvt_roundpd_epi64(W, U, A, B) \
- ((__m512i)__builtin_ia32_cvtpd2qq512_mask((A), (__v8di)(W), (U), (B)))
-
-#define _mm512_maskz_cvt_roundpd_epi64(U, A, B) \
- ((__m512i)__builtin_ia32_cvtpd2qq512_mask( \
- (A), (__v8di)_mm512_setzero_si512(), (U), (B)))
-
-#define _mm512_cvt_roundpd_epu64(A, B) \
- ((__m512i)__builtin_ia32_cvtpd2uqq512_mask( \
- (A), (__v8di)_mm512_setzero_si512(), -1, (B)))
-
-#define _mm512_mask_cvt_roundpd_epu64(W, U, A, B) \
- ((__m512i)__builtin_ia32_cvtpd2uqq512_mask((A), (__v8di)(W), (U), (B)))
-
-#define _mm512_maskz_cvt_roundpd_epu64(U, A, B) \
- ((__m512i)__builtin_ia32_cvtpd2uqq512_mask( \
- (A), (__v8di)_mm512_setzero_si512(), (U), (B)))
-
-#define _mm512_cvt_roundps_epi64(A, B) \
- ((__m512i)__builtin_ia32_cvtps2qq512_mask( \
- (A), (__v8di)_mm512_setzero_si512(), -1, (B)))
-
-#define _mm512_mask_cvt_roundps_epi64(W, U, A, B) \
- ((__m512i)__builtin_ia32_cvtps2qq512_mask((A), (__v8di)(W), (U), (B)))
-
-#define _mm512_maskz_cvt_roundps_epi64(U, A, B) \
- ((__m512i)__builtin_ia32_cvtps2qq512_mask( \
- (A), (__v8di)_mm512_setzero_si512(), (U), (B)))
-
-#define _mm512_cvt_roundps_epu64(A, B) \
- ((__m512i)__builtin_ia32_cvtps2uqq512_mask( \
- (A), (__v8di)_mm512_setzero_si512(), -1, (B)))
-
-#define _mm512_mask_cvt_roundps_epu64(W, U, A, B) \
- ((__m512i)__builtin_ia32_cvtps2uqq512_mask((A), (__v8di)(W), (U), (B)))
-
-#define _mm512_maskz_cvt_roundps_epu64(U, A, B) \
- ((__m512i)__builtin_ia32_cvtps2uqq512_mask( \
- (A), (__v8di)_mm512_setzero_si512(), (U), (B)))
-
-#define _mm512_cvt_roundepi64_ps(A, B) \
- ((__m256)__builtin_ia32_cvtqq2ps512_mask( \
- (__v8di)(A), (__v8sf)_mm256_setzero_ps(), -1, (B)))
-
-#define _mm512_mask_cvt_roundepi64_ps(W, U, A, B) \
- ((__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(A), (W), (U), (B)))
-
-#define _mm512_maskz_cvt_roundepi64_ps(U, A, B) \
- ((__m256)__builtin_ia32_cvtqq2ps512_mask( \
- (__v8di)(A), (__v8sf)_mm256_setzero_ps(), (U), (B)))
-
-#define _mm512_cvt_roundepu64_ps(A, B) \
- ((__m256)__builtin_ia32_cvtuqq2ps512_mask( \
- (__v8di)(A), (__v8sf)_mm256_setzero_ps(), -1, (B)))
-
-#define _mm512_mask_cvt_roundepu64_ps(W, U, A, B) \
- ((__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)(A), (W), (U), (B)))
-
-#define _mm512_maskz_cvt_roundepu64_ps(U, A, B) \
- ((__m256)__builtin_ia32_cvtuqq2ps512_mask( \
- (__v8di)(A), (__v8sf)_mm256_setzero_ps(), (U), (B)))
-
-#define _mm512_cvt_roundepi64_pd(A, B) \
- ((__m512d)__builtin_ia32_cvtqq2pd512_mask( \
- (__v8di)(A), (__v8df)_mm512_setzero_pd(), -1, (B)))
-
-#define _mm512_mask_cvt_roundepi64_pd(W, U, A, B) \
- ((__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(A), (W), (U), (B)))
-
-#define _mm512_maskz_cvt_roundepi64_pd(U, A, B) \
- ((__m512d)__builtin_ia32_cvtqq2pd512_mask( \
- (__v8di)(A), (__v8df)_mm512_setzero_pd(), (U), (B)))
-
-#define _mm512_cvt_roundepu64_pd(A, B) \
- ((__m512d)__builtin_ia32_cvtuqq2pd512_mask( \
- (__v8di)(A), (__v8df)_mm512_setzero_pd(), -1, (B)))
-
-#define _mm512_mask_cvt_roundepu64_pd(W, U, A, B) \
- ((__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(A), (W), (U), (B)))
-
-#define _mm512_maskz_cvt_roundepu64_pd(U, A, B) \
- ((__m512d)__builtin_ia32_cvtuqq2pd512_mask( \
- (__v8di)(A), (__v8df)_mm512_setzero_pd(), (U), (B)))
-
-#define _mm512_reduce_pd(A, B) \
- ((__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
- (__v8df)_mm512_setzero_pd(), \
- (__mmask8)-1))
-
-#define _mm512_mask_reduce_pd(W, U, A, B) \
- ((__m512d)__builtin_ia32_reducepd512_mask( \
- (__v8df)(__m512d)(A), (int)(B), (__v8df)(__m512d)(W), (__mmask8)(U)))
-
-#define _mm512_maskz_reduce_pd(U, A, B) \
- ((__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
- (__v8df)_mm512_setzero_pd(), \
- (__mmask8)(U)))
-
-#define _mm512_reduce_ps(A, B) \
- ((__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
- (__v16sf)_mm512_setzero_ps(), \
- (__mmask16)-1))
-
-#define _mm512_mask_reduce_ps(W, U, A, B) \
- ((__m512)__builtin_ia32_reduceps512_mask( \
- (__v16sf)(__m512)(A), (int)(B), (__v16sf)(__m512)(W), (__mmask16)(U)))
-
-#define _mm512_maskz_reduce_ps(U, A, B) \
- ((__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
- (__v16sf)_mm512_setzero_ps(), \
- (__mmask16)(U)))
-
-#define _mm512_extractf32x8_ps(X, C) \
- ((__m256)__builtin_ia32_extractf32x8_mask( \
- (__v16sf)(__m512)(X), (int)(C), (__v8sf)(__m256)_mm256_setzero_ps(), \
- (__mmask8)-1))
-
-#define _mm512_mask_extractf32x8_ps(W, U, X, C) \
- ((__m256)__builtin_ia32_extractf32x8_mask( \
- (__v16sf)(__m512)(X), (int)(C), (__v8sf)(__m256)(W), (__mmask8)(U)))
-
-#define _mm512_maskz_extractf32x8_ps(U, X, C) \
- ((__m256)__builtin_ia32_extractf32x8_mask( \
- (__v16sf)(__m512)(X), (int)(C), (__v8sf)(__m256)_mm256_setzero_ps(), \
- (__mmask8)(U)))
-
-#define _mm512_extractf64x2_pd(X, C) \
- ((__m128d)__builtin_ia32_extractf64x2_512_mask( \
- (__v8df)(__m512d)(X), (int)(C), (__v2df)(__m128d)_mm_setzero_pd(), \
- (__mmask8)-1))
-
-#define _mm512_mask_extractf64x2_pd(W, U, X, C) \
- ((__m128d)__builtin_ia32_extractf64x2_512_mask( \
- (__v8df)(__m512d)(X), (int)(C), (__v2df)(__m128d)(W), (__mmask8)(U)))
-
-#define _mm512_maskz_extractf64x2_pd(U, X, C) \
- ((__m128d)__builtin_ia32_extractf64x2_512_mask( \
- (__v8df)(__m512d)(X), (int)(C), (__v2df)(__m128d)_mm_setzero_pd(), \
- (__mmask8)(U)))
-
-#define _mm512_extracti32x8_epi32(X, C) \
- ((__m256i)__builtin_ia32_extracti32x8_mask( \
- (__v16si)(__m512i)(X), (int)(C), \
- (__v8si)(__m256i)_mm256_setzero_si256(), (__mmask8)-1))
-
-#define _mm512_mask_extracti32x8_epi32(W, U, X, C) \
- ((__m256i)__builtin_ia32_extracti32x8_mask( \
- (__v16si)(__m512i)(X), (int)(C), (__v8si)(__m256i)(W), (__mmask8)(U)))
-
-#define _mm512_maskz_extracti32x8_epi32(U, X, C) \
- ((__m256i)__builtin_ia32_extracti32x8_mask( \
- (__v16si)(__m512i)(X), (int)(C), \
- (__v8si)(__m256i)_mm256_setzero_si256(), (__mmask8)(U)))
-
-#define _mm512_extracti64x2_epi64(X, C) \
- ((__m128i)__builtin_ia32_extracti64x2_512_mask( \
- (__v8di)(__m512i)(X), (int)(C), (__v2di)(__m128i)_mm_setzero_si128(), \
- (__mmask8)-1))
-
-#define _mm512_mask_extracti64x2_epi64(W, U, X, C) \
- ((__m128i)__builtin_ia32_extracti64x2_512_mask( \
- (__v8di)(__m512i)(X), (int)(C), (__v2di)(__m128i)(W), (__mmask8)(U)))
-
-#define _mm512_maskz_extracti64x2_epi64(U, X, C) \
- ((__m128i)__builtin_ia32_extracti64x2_512_mask( \
- (__v8di)(__m512i)(X), (int)(C), (__v2di)(__m128i)_mm_setzero_si128(), \
- (__mmask8)(U)))
-
-#define _mm512_range_pd(A, B, C) \
- ((__m512d)__builtin_ia32_rangepd512_mask( \
- (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), \
- (__v8df)_mm512_setzero_pd(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_mask_range_pd(W, U, A, B, C) \
- ((__m512d)__builtin_ia32_rangepd512_mask( \
- (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), \
- (__v8df)(__m512d)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_maskz_range_pd(U, A, B, C) \
- ((__m512d)__builtin_ia32_rangepd512_mask( \
- (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), \
- (__v8df)_mm512_setzero_pd(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_range_ps(A, B, C) \
- ((__m512)__builtin_ia32_rangeps512_mask( \
- (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), \
- (__v16sf)_mm512_setzero_ps(), (__mmask16)-1, _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_mask_range_ps(W, U, A, B, C) \
- ((__m512)__builtin_ia32_rangeps512_mask( \
- (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), \
- (__v16sf)(__m512)(W), (__mmask16)(U), _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_maskz_range_ps(U, A, B, C) \
- ((__m512)__builtin_ia32_rangeps512_mask( \
- (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), \
- (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_range_round_pd(A, B, C, R) \
- ((__m512d)__builtin_ia32_rangepd512_mask( \
- (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), \
- (__v8df)_mm512_setzero_pd(), (__mmask8)-1, (R)))
-
-#define _mm512_mask_range_round_pd(W, U, A, B, C, R) \
- ((__m512d)__builtin_ia32_rangepd512_mask( \
- (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), \
- (__v8df)(__m512d)(W), (__mmask8)(U), (R)))
-
-#define _mm512_maskz_range_round_pd(U, A, B, C, R) \
- ((__m512d)__builtin_ia32_rangepd512_mask( \
- (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), \
- (__v8df)_mm512_setzero_pd(), (__mmask8)(U), (R)))
-
-#define _mm512_range_round_ps(A, B, C, R) \
- ((__m512)__builtin_ia32_rangeps512_mask( \
- (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), \
- (__v16sf)_mm512_setzero_ps(), (__mmask16)-1, (R)))
-
-#define _mm512_mask_range_round_ps(W, U, A, B, C, R) \
- ((__m512)__builtin_ia32_rangeps512_mask( \
- (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), \
- (__v16sf)(__m512)(W), (__mmask16)(U), (R)))
-
-#define _mm512_maskz_range_round_ps(U, A, B, C, R) \
- ((__m512)__builtin_ia32_rangeps512_mask( \
- (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), \
- (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), (R)))
-
-#define _mm512_insertf64x2(X, Y, C) \
- ((__m512d)__builtin_ia32_insertf64x2_512_mask( \
- (__v8df)(__m512d)(X), (__v2df)(__m128d)(Y), (int)(C), \
- (__v8df)(__m512d)(X), (__mmask8)-1))
-
-#define _mm512_mask_insertf64x2(W, U, X, Y, C) \
- ((__m512d)__builtin_ia32_insertf64x2_512_mask( \
- (__v8df)(__m512d)(X), (__v2df)(__m128d)(Y), (int)(C), \
- (__v8df)(__m512d)(W), (__mmask8)(U)))
-
-#define _mm512_maskz_insertf64x2(U, X, Y, C) \
- ((__m512d)__builtin_ia32_insertf64x2_512_mask( \
- (__v8df)(__m512d)(X), (__v2df)(__m128d)(Y), (int)(C), \
- (__v8df)(__m512d)_mm512_setzero_pd(), (__mmask8)(U)))
-
-#define _mm512_inserti64x2(X, Y, C) \
- ((__m512i)__builtin_ia32_inserti64x2_512_mask( \
- (__v8di)(__m512i)(X), (__v2di)(__m128i)(Y), (int)(C), \
- (__v8di)(__m512i)(X), (__mmask8)-1))
-
-#define _mm512_mask_inserti64x2(W, U, X, Y, C) \
- ((__m512i)__builtin_ia32_inserti64x2_512_mask( \
- (__v8di)(__m512i)(X), (__v2di)(__m128i)(Y), (int)(C), \
- (__v8di)(__m512i)(W), (__mmask8)(U)))
-
-#define _mm512_maskz_inserti64x2(U, X, Y, C) \
- ((__m512i)__builtin_ia32_inserti64x2_512_mask( \
- (__v8di)(__m512i)(X), (__v2di)(__m128i)(Y), (int)(C), \
- (__v8di)(__m512i)_mm512_setzero_si512(), (__mmask8)(U)))
-
-#define _mm512_insertf32x8(X, Y, C) \
- ((__m512)__builtin_ia32_insertf32x8_mask( \
- (__v16sf)(__m512)(X), (__v8sf)(__m256)(Y), (int)(C), \
- (__v16sf)(__m512)_mm512_setzero_ps(), (__mmask16)-1))
-
-#define _mm512_mask_insertf32x8(W, U, X, Y, C) \
- ((__m512)__builtin_ia32_insertf32x8_mask( \
- (__v16sf)(__m512)(X), (__v8sf)(__m256)(Y), (int)(C), \
- (__v16sf)(__m512)(W), (__mmask16)(U)))
-
-#define _mm512_maskz_insertf32x8(U, X, Y, C) \
- ((__m512)__builtin_ia32_insertf32x8_mask( \
- (__v16sf)(__m512)(X), (__v8sf)(__m256)(Y), (int)(C), \
- (__v16sf)(__m512)_mm512_setzero_ps(), (__mmask16)(U)))
-
-#define _mm512_inserti32x8(X, Y, C) \
- ((__m512i)__builtin_ia32_inserti32x8_mask( \
- (__v16si)(__m512i)(X), (__v8si)(__m256i)(Y), (int)(C), \
- (__v16si)(__m512i)_mm512_setzero_si512(), (__mmask16)-1))
-
-#define _mm512_mask_inserti32x8(W, U, X, Y, C) \
- ((__m512i)__builtin_ia32_inserti32x8_mask( \
- (__v16si)(__m512i)(X), (__v8si)(__m256i)(Y), (int)(C), \
- (__v16si)(__m512i)(W), (__mmask16)(U)))
-
-#define _mm512_maskz_inserti32x8(U, X, Y, C) \
- ((__m512i)__builtin_ia32_inserti32x8_mask( \
- (__v16si)(__m512i)(X), (__v8si)(__m256i)(Y), (int)(C), \
- (__v16si)(__m512i)_mm512_setzero_si512(), (__mmask16)(U)))
-
-#define _mm_fpclass_ss_mask(X, C) \
- ((__mmask8)__builtin_ia32_fpclassss((__v4sf)(__m128)(X), (int)(C)))
-
-#define _mm_fpclass_sd_mask(X, C) \
- ((__mmask8)__builtin_ia32_fpclasssd((__v2df)(__m128d)(X), (int)(C)))
-
-#define _mm512_mask_fpclass_pd_mask(u, X, C) \
- ((__mmask8)__builtin_ia32_fpclasspd512_mask((__v8df)(__m512d)(X), (int)(C), \
- (__mmask8)(u)))
-
-#define _mm512_mask_fpclass_ps_mask(u, x, c) \
- ((__mmask16)__builtin_ia32_fpclassps512_mask((__v16sf)(__m512)(x), (int)(c), \
- (__mmask8)(u)))
-
-#define _mm512_fpclass_pd_mask(X, C) \
- ((__mmask8)__builtin_ia32_fpclasspd512_mask((__v8df)(__m512d)(X), (int)(C), \
- (__mmask8)-1))
-
-#define _mm512_fpclass_ps_mask(x, c) \
- ((__mmask16)__builtin_ia32_fpclassps512_mask((__v16sf)(__m512)(x), (int)(c), \
- (__mmask8)-1))
-
-#define _mm_reduce_sd(A, B, C) \
- ((__m128d)__builtin_ia32_reducesd_mask( \
- (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
- (__v2df)_mm_setzero_pd(), (__mmask8)-1))
-
-#define _mm_mask_reduce_sd(W, U, A, B, C) \
- ((__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), (int)(C), \
- (__v2df)(__m128d)(W), (__mmask8)(U)))
-
-#define _mm_maskz_reduce_sd(U, A, B, C) \
- ((__m128d)__builtin_ia32_reducesd_mask( \
- (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
- (__v2df)_mm_setzero_pd(), (__mmask8)(U)))
-
-#define _mm_reduce_ss(A, B, C) \
- ((__m128)__builtin_ia32_reducess_mask( \
- (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), \
- (__v4sf)_mm_setzero_ps(), (__mmask8)-1))
-
-#define _mm_mask_reduce_ss(W, U, A, B, C) \
- ((__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), (int)(C), \
- (__v4sf)(__m128)(W), (__mmask8)(U)))
-
-#define _mm_maskz_reduce_ss(U, A, B, C) \
- ((__m128)__builtin_ia32_reducess_mask( \
- (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), \
- (__v4sf)_mm_setzero_ps(), (__mmask8)(U)))
-
 #endif
-
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_ktest_mask8_u8 (__mmask8 __A, __mmask8 __B, unsigned char *__CF)
+{
+ *__CF = (unsigned char) __builtin_ia32_ktestcqi (__A, __B);
+ return (unsigned char) __builtin_ia32_ktestzqi (__A, __B);
+}
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_ktestz_mask8_u8 (__mmask8 __A, __mmask8 __B)
+{
+ return (unsigned char) __builtin_ia32_ktestzqi (__A, __B);
+}
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_ktestc_mask8_u8 (__mmask8 __A, __mmask8 __B)
+{
+ return (unsigned char) __builtin_ia32_ktestcqi (__A, __B);
+}
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_ktest_mask16_u8 (__mmask16 __A, __mmask16 __B, unsigned char *__CF)
+{
+ *__CF = (unsigned char) __builtin_ia32_ktestchi (__A, __B);
+ return (unsigned char) __builtin_ia32_ktestzhi (__A, __B);
+}
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_ktestz_mask16_u8 (__mmask16 __A, __mmask16 __B)
+{
+ return (unsigned char) __builtin_ia32_ktestzhi (__A, __B);
+}
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_ktestc_mask16_u8 (__mmask16 __A, __mmask16 __B)
+{
+ return (unsigned char) __builtin_ia32_ktestchi (__A, __B);
+}
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kortest_mask8_u8 (__mmask8 __A, __mmask8 __B, unsigned char *__CF)
+{
+ *__CF = (unsigned char) __builtin_ia32_kortestcqi (__A, __B);
+ return (unsigned char) __builtin_ia32_kortestzqi (__A, __B);
+}
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kortestz_mask8_u8 (__mmask8 __A, __mmask8 __B)
+{
+ return (unsigned char) __builtin_ia32_kortestzqi (__A, __B);
+}
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kortestc_mask8_u8 (__mmask8 __A, __mmask8 __B)
+{
+ return (unsigned char) __builtin_ia32_kortestcqi (__A, __B);
+}
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kadd_mask8 (__mmask8 __A, __mmask8 __B)
+{
+ return (__mmask8) __builtin_ia32_kaddqi ((__mmask8) __A, (__mmask8) __B);
+}
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kadd_mask16 (__mmask16 __A, __mmask16 __B)
+{
+ return (__mmask16) __builtin_ia32_kaddhi ((__mmask16) __A, (__mmask16) __B);
+}
+extern __inline unsigned int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_cvtmask8_u32 (__mmask8 __A)
+{
+ return (unsigned int) __builtin_ia32_kmovb ((__mmask8 ) __A);
+}
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_cvtu32_mask8 (unsigned int __A)
+{
+ return (__mmask8) __builtin_ia32_kmovb ((__mmask8) __A);
+}
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_load_mask8 (__mmask8 *__A)
+{
+ return (__mmask8) __builtin_ia32_kmovb (*(__mmask8 *) __A);
+}
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_store_mask8 (__mmask8 *__A, __mmask8 __B)
+{
+ *(__mmask8 *) __A = __builtin_ia32_kmovb (__B);
+}
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_knot_mask8 (__mmask8 __A)
+{
+ return (__mmask8) __builtin_ia32_knotqi ((__mmask8) __A);
+}
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kor_mask8 (__mmask8 __A, __mmask8 __B)
+{
+ return (__mmask8) __builtin_ia32_korqi ((__mmask8) __A, (__mmask8) __B);
+}
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kxnor_mask8 (__mmask8 __A, __mmask8 __B)
+{
+ return (__mmask8) __builtin_ia32_kxnorqi ((__mmask8) __A, (__mmask8) __B);
+}
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kxor_mask8 (__mmask8 __A, __mmask8 __B)
+{
+ return (__mmask8) __builtin_ia32_kxorqi ((__mmask8) __A, (__mmask8) __B);
+}
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kand_mask8 (__mmask8 __A, __mmask8 __B)
+{
+ return (__mmask8) __builtin_ia32_kandqi ((__mmask8) __A, (__mmask8) __B);
+}
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kandn_mask8 (__mmask8 __A, __mmask8 __B)
+{
+ return (__mmask8) __builtin_ia32_kandnqi ((__mmask8) __A, (__mmask8) __B);
+}
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_broadcast_f64x2 (__m128d __A)
+{
+ return (__m512d)
+ __builtin_ia32_broadcastf64x2_512_mask ((__v2df) __A,
+ _mm512_undefined_pd (),
+ (__mmask8) -1);
+}
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_broadcast_f64x2 (__m512d __O, __mmask8 __M, __m128d __A)
+{
+ return (__m512d) __builtin_ia32_broadcastf64x2_512_mask ((__v2df)
+ __A,
+ (__v8df)
+ __O, __M);
+}
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_broadcast_f64x2 (__mmask8 __M, __m128d __A)
+{
+ return (__m512d) __builtin_ia32_broadcastf64x2_512_mask ((__v2df)
+ __A,
+ (__v8df)
+ _mm512_setzero_ps (),
+ __M);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_broadcast_i64x2 (__m128i __A)
+{
+ return (__m512i)
+ __builtin_ia32_broadcasti64x2_512_mask ((__v2di) __A,
+ _mm512_undefined_epi32 (),
+ (__mmask8) -1);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_broadcast_i64x2 (__m512i __O, __mmask8 __M, __m128i __A)
+{
+ return (__m512i) __builtin_ia32_broadcasti64x2_512_mask ((__v2di)
+ __A,
+ (__v8di)
+ __O, __M);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A)
+{
+ return (__m512i) __builtin_ia32_broadcasti64x2_512_mask ((__v2di)
+ __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ __M);
+}
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_broadcast_f32x2 (__m128 __A)
+{
+ return (__m512)
+ __builtin_ia32_broadcastf32x2_512_mask ((__v4sf) __A,
+ (__v16sf)_mm512_undefined_ps (),
+ (__mmask16) -1);
+}
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_broadcast_f32x2 (__m512 __O, __mmask16 __M, __m128 __A)
+{
+ return (__m512) __builtin_ia32_broadcastf32x2_512_mask ((__v4sf) __A,
+ (__v16sf)
+ __O, __M);
+}
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_broadcast_f32x2 (__mmask16 __M, __m128 __A)
+{
+ return (__m512) __builtin_ia32_broadcastf32x2_512_mask ((__v4sf) __A,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ __M);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_broadcast_i32x2 (__m128i __A)
+{
+ return (__m512i)
+ __builtin_ia32_broadcasti32x2_512_mask ((__v4si) __A,
+ (__v16si)
+ _mm512_undefined_epi32 (),
+ (__mmask16) -1);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_broadcast_i32x2 (__m512i __O, __mmask16 __M, __m128i __A)
+{
+ return (__m512i) __builtin_ia32_broadcasti32x2_512_mask ((__v4si)
+ __A,
+ (__v16si)
+ __O, __M);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_broadcast_i32x2 (__mmask16 __M, __m128i __A)
+{
+ return (__m512i) __builtin_ia32_broadcasti32x2_512_mask ((__v4si)
+ __A,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ __M);
+}
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_broadcast_f32x8 (__m256 __A)
+{
+ return (__m512)
+ __builtin_ia32_broadcastf32x8_512_mask ((__v8sf) __A,
+ _mm512_undefined_ps (),
+ (__mmask16) -1);
+}
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_broadcast_f32x8 (__m512 __O, __mmask16 __M, __m256 __A)
+{
+ return (__m512) __builtin_ia32_broadcastf32x8_512_mask ((__v8sf) __A,
+ (__v16sf)__O,
+ __M);
+}
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_broadcast_f32x8 (__mmask16 __M, __m256 __A)
+{
+ return (__m512) __builtin_ia32_broadcastf32x8_512_mask ((__v8sf) __A,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ __M);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_broadcast_i32x8 (__m256i __A)
+{
+ return (__m512i)
+ __builtin_ia32_broadcasti32x8_512_mask ((__v8si) __A,
+ (__v16si)
+ _mm512_undefined_epi32 (),
+ (__mmask16) -1);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_broadcast_i32x8 (__m512i __O, __mmask16 __M, __m256i __A)
+{
+ return (__m512i) __builtin_ia32_broadcasti32x8_512_mask ((__v8si)
+ __A,
+ (__v16si)__O,
+ __M);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_broadcast_i32x8 (__mmask16 __M, __m256i __A)
+{
+ return (__m512i) __builtin_ia32_broadcasti32x8_512_mask ((__v8si)
+ __A,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ __M);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mullo_epi64 (__m512i __A, __m512i __B)
+{
+ return (__m512i) ((__v8du) __A * (__v8du) __B);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_mullo_epi64 (__m512i __W, __mmask8 __U, __m512i __A,
+ __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pmullq512_mask ((__v8di) __A,
+ (__v8di) __B,
+ (__v8di) __W,
+ (__mmask8) __U);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_mullo_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pmullq512_mask ((__v8di) __A,
+ (__v8di) __B,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U);
+}
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_xor_pd (__m512d __A, __m512d __B)
+{
+ return (__m512d) __builtin_ia32_xorpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) -1);
+}
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_xor_pd (__m512d __W, __mmask8 __U, __m512d __A,
+ __m512d __B)
+{
+ return (__m512d) __builtin_ia32_xorpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __W,
+ (__mmask8) __U);
+}
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_xor_pd (__mmask8 __U, __m512d __A, __m512d __B)
+{
+ return (__m512d) __builtin_ia32_xorpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U);
+}
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_xor_ps (__m512 __A, __m512 __B)
+{
+ return (__m512) __builtin_ia32_xorps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) -1);
+}
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_xor_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
+{
+ return (__m512) __builtin_ia32_xorps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __W,
+ (__mmask16) __U);
+}
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_xor_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+ return (__m512) __builtin_ia32_xorps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) __U);
+}
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_or_pd (__m512d __A, __m512d __B)
+{
+ return (__m512d) __builtin_ia32_orpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) -1);
+}
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_or_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
+{
+ return (__m512d) __builtin_ia32_orpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __W,
+ (__mmask8) __U);
+}
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_or_pd (__mmask8 __U, __m512d __A, __m512d __B)
+{
+ return (__m512d) __builtin_ia32_orpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U);
+}
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_or_ps (__m512 __A, __m512 __B)
+{
+ return (__m512) __builtin_ia32_orps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) -1);
+}
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_or_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
+{
+ return (__m512) __builtin_ia32_orps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __W,
+ (__mmask16) __U);
+}
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_or_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+ return (__m512) __builtin_ia32_orps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) __U);
+}
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_and_pd (__m512d __A, __m512d __B)
+{
+ return (__m512d) __builtin_ia32_andpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) -1);
+}
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_and_pd (__m512d __W, __mmask8 __U, __m512d __A,
+ __m512d __B)
+{
+ return (__m512d) __builtin_ia32_andpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __W,
+ (__mmask8) __U);
+}
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_and_pd (__mmask8 __U, __m512d __A, __m512d __B)
+{
+ return (__m512d) __builtin_ia32_andpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U);
+}
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_and_ps (__m512 __A, __m512 __B)
+{
+ return (__m512) __builtin_ia32_andps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) -1);
+}
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_and_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
+{
+ return (__m512) __builtin_ia32_andps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __W,
+ (__mmask16) __U);
+}
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_and_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+ return (__m512) __builtin_ia32_andps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) __U);
+}
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_andnot_pd (__m512d __A, __m512d __B)
+{
+ return (__m512d) __builtin_ia32_andnpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) -1);
+}
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_andnot_pd (__m512d __W, __mmask8 __U, __m512d __A,
+ __m512d __B)
+{
+ return (__m512d) __builtin_ia32_andnpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __W,
+ (__mmask8) __U);
+}
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_andnot_pd (__mmask8 __U, __m512d __A, __m512d __B)
+{
+ return (__m512d) __builtin_ia32_andnpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U);
+}
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_andnot_ps (__m512 __A, __m512 __B)
+{
+ return (__m512) __builtin_ia32_andnps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) -1);
+}
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_andnot_ps (__m512 __W, __mmask16 __U, __m512 __A,
+ __m512 __B)
+{
+ return (__m512) __builtin_ia32_andnps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __W,
+ (__mmask16) __U);
+}
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_andnot_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+ return (__m512) __builtin_ia32_andnps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) __U);
+}
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_movepi32_mask (__m512i __A)
+{
+ return (__mmask16) __builtin_ia32_cvtd2mask512 ((__v16si) __A);
+}
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_movepi64_mask (__m512i __A)
+{
+ return (__mmask8) __builtin_ia32_cvtq2mask512 ((__v8di) __A);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_movm_epi32 (__mmask16 __A)
+{
+ return (__m512i) __builtin_ia32_cvtmask2d512 (__A);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_movm_epi64 (__mmask8 __A)
+{
+ return (__m512i) __builtin_ia32_cvtmask2q512 (__A);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvttpd_epi64 (__m512d __A)
+{
+ return (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvttpd_epi64 (__m512i __W, __mmask8 __U, __m512d __A)
+{
+ return (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A,
+ (__v8di) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvttpd_epi64 (__mmask8 __U, __m512d __A)
+{
+ return (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvttpd_epu64 (__m512d __A)
+{
+ return (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvttpd_epu64 (__m512i __W, __mmask8 __U, __m512d __A)
+{
+ return (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A,
+ (__v8di) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvttpd_epu64 (__mmask8 __U, __m512d __A)
+{
+ return (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvttps_epi64 (__m256 __A)
+{
+ return (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvttps_epi64 (__m512i __W, __mmask8 __U, __m256 __A)
+{
+ return (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A,
+ (__v8di) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvttps_epi64 (__mmask8 __U, __m256 __A)
+{
+ return (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvttps_epu64 (__m256 __A)
+{
+ return (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvttps_epu64 (__m512i __W, __mmask8 __U, __m256 __A)
+{
+ return (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A,
+ (__v8di) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvttps_epu64 (__mmask8 __U, __m256 __A)
+{
+ return (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtpd_epi64 (__m512d __A)
+{
+ return (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtpd_epi64 (__m512i __W, __mmask8 __U, __m512d __A)
+{
+ return (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A,
+ (__v8di) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtpd_epi64 (__mmask8 __U, __m512d __A)
+{
+ return (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtpd_epu64 (__m512d __A)
+{
+ return (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtpd_epu64 (__m512i __W, __mmask8 __U, __m512d __A)
+{
+ return (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A,
+ (__v8di) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtpd_epu64 (__mmask8 __U, __m512d __A)
+{
+ return (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtps_epi64 (__m256 __A)
+{
+ return (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtps_epi64 (__m512i __W, __mmask8 __U, __m256 __A)
+{
+ return (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A,
+ (__v8di) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtps_epi64 (__mmask8 __U, __m256 __A)
+{
+ return (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtps_epu64 (__m256 __A)
+{
+ return (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtps_epu64 (__m512i __W, __mmask8 __U, __m256 __A)
+{
+ return (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A,
+ (__v8di) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtps_epu64 (__mmask8 __U, __m256 __A)
+{
+ return (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepi64_ps (__m512i __A)
+{
+ return (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi64_ps (__m256 __W, __mmask8 __U, __m512i __A)
+{
+ return (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A,
+ (__v8sf) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepi64_ps (__mmask8 __U, __m512i __A)
+{
+ return (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepu64_ps (__m512i __A)
+{
+ return (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepu64_ps (__m256 __W, __mmask8 __U, __m512i __A)
+{
+ return (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A,
+ (__v8sf) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepu64_ps (__mmask8 __U, __m512i __A)
+{
+ return (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepi64_pd (__m512i __A)
+{
+ return (__m512d) __builtin_ia32_cvtqq2pd512_mask ((__v8di) __A,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi64_pd (__m512d __W, __mmask8 __U, __m512i __A)
+{
+ return (__m512d) __builtin_ia32_cvtqq2pd512_mask ((__v8di) __A,
+ (__v8df) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepi64_pd (__mmask8 __U, __m512i __A)
+{
+ return (__m512d) __builtin_ia32_cvtqq2pd512_mask ((__v8di) __A,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepu64_pd (__m512i __A)
+{
+ return (__m512d) __builtin_ia32_cvtuqq2pd512_mask ((__v8di) __A,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepu64_pd (__m512d __W, __mmask8 __U, __m512i __A)
+{
+ return (__m512d) __builtin_ia32_cvtuqq2pd512_mask ((__v8di) __A,
+ (__v8df) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepu64_pd (__mmask8 __U, __m512i __A)
+{
+ return (__m512d) __builtin_ia32_cvtuqq2pd512_mask ((__v8di) __A,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+#ifdef __OPTIMIZE__
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kshiftli_mask8 (__mmask8 __A, unsigned int __B)
+{
+ return (__mmask8) __builtin_ia32_kshiftliqi ((__mmask8) __A, (__mmask8) __B);
+}
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kshiftri_mask8 (__mmask8 __A, unsigned int __B)
+{
+ return (__mmask8) __builtin_ia32_kshiftriqi ((__mmask8) __A, (__mmask8) __B);
+}
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_range_pd (__m512d __A, __m512d __B, int __C)
+{
+ return (__m512d) __builtin_ia32_rangepd512_mask ((__v8df) __A,
+ (__v8df) __B, __C,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_range_pd (__m512d __W, __mmask8 __U,
+ __m512d __A, __m512d __B, int __C)
+{
+ return (__m512d) __builtin_ia32_rangepd512_mask ((__v8df) __A,
+ (__v8df) __B, __C,
+ (__v8df) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_range_pd (__mmask8 __U, __m512d __A, __m512d __B, int __C)
+{
+ return (__m512d) __builtin_ia32_rangepd512_mask ((__v8df) __A,
+ (__v8df) __B, __C,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_range_ps (__m512 __A, __m512 __B, int __C)
+{
+ return (__m512) __builtin_ia32_rangeps512_mask ((__v16sf) __A,
+ (__v16sf) __B, __C,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_range_ps (__m512 __W, __mmask16 __U,
+ __m512 __A, __m512 __B, int __C)
+{
+ return (__m512) __builtin_ia32_rangeps512_mask ((__v16sf) __A,
+ (__v16sf) __B, __C,
+ (__v16sf) __W,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_range_ps (__mmask16 __U, __m512 __A, __m512 __B, int __C)
+{
+ return (__m512) __builtin_ia32_rangeps512_mask ((__v16sf) __A,
+ (__v16sf) __B, __C,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_reduce_sd (__m128d __A, __m128d __B, int __C)
+{
+ return (__m128d) __builtin_ia32_reducesd_mask ((__v2df) __A,
+ (__v2df) __B, __C,
+ (__v2df) _mm_setzero_pd (),
+ (__mmask8) -1);
+}
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_reduce_round_sd (__m128d __A, __m128d __B, int __C, const int __R)
+{
+ return (__m128d) __builtin_ia32_reducesd_mask_round ((__v2df) __A,
+ (__v2df) __B, __C,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) -1, __R);
+}
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_reduce_sd (__m128d __W, __mmask8 __U, __m128d __A,
+ __m128d __B, int __C)
+{
+ return (__m128d) __builtin_ia32_reducesd_mask ((__v2df) __A,
+ (__v2df) __B, __C,
+ (__v2df) __W,
+ (__mmask8) __U);
+}
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_reduce_round_sd (__m128d __W, __mmask8 __U, __m128d __A,
+ __m128d __B, int __C, const int __R)
+{
+ return (__m128d) __builtin_ia32_reducesd_mask_round ((__v2df) __A,
+ (__v2df) __B, __C,
+ (__v2df) __W,
+ __U, __R);
+}
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_reduce_sd (__mmask8 __U, __m128d __A, __m128d __B, int __C)
+{
+ return (__m128d) __builtin_ia32_reducesd_mask ((__v2df) __A,
+ (__v2df) __B, __C,
+ (__v2df) _mm_setzero_pd (),
+ (__mmask8) __U);
+}
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_reduce_round_sd (__mmask8 __U, __m128d __A, __m128d __B,
+ int __C, const int __R)
+{
+ return (__m128d) __builtin_ia32_reducesd_mask_round ((__v2df) __A,
+ (__v2df) __B, __C,
+ (__v2df)
+ _mm_setzero_pd (),
+ __U, __R);
+}
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_reduce_ss (__m128 __A, __m128 __B, int __C)
+{
+ return (__m128) __builtin_ia32_reducess_mask ((__v4sf) __A,
+ (__v4sf) __B, __C,
+ (__v4sf) _mm_setzero_ps (),
+ (__mmask8) -1);
+}
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_reduce_round_ss (__m128 __A, __m128 __B, int __C, const int __R)
+{
+ return (__m128) __builtin_ia32_reducess_mask_round ((__v4sf) __A,
+ (__v4sf) __B, __C,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) -1, __R);
+}
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_reduce_ss (__m128 __W, __mmask8 __U, __m128 __A,
+ __m128 __B, int __C)
+{
+ return (__m128) __builtin_ia32_reducess_mask ((__v4sf) __A,
+ (__v4sf) __B, __C,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_reduce_round_ss (__m128 __W, __mmask8 __U, __m128 __A,
+ __m128 __B, int __C, const int __R)
+{
+ return (__m128) __builtin_ia32_reducess_mask_round ((__v4sf) __A,
+ (__v4sf) __B, __C,
+ (__v4sf) __W,
+ __U, __R);
+}
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_reduce_ss (__mmask8 __U, __m128 __A, __m128 __B, int __C)
+{
+ return (__m128) __builtin_ia32_reducess_mask ((__v4sf) __A,
+ (__v4sf) __B, __C,
+ (__v4sf) _mm_setzero_ps (),
+ (__mmask8) __U);
+}
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_reduce_round_ss (__mmask8 __U, __m128 __A, __m128 __B,
+ int __C, const int __R)
+{
+ return (__m128) __builtin_ia32_reducess_mask_round ((__v4sf) __A,
+ (__v4sf) __B, __C,
+ (__v4sf)
+ _mm_setzero_ps (),
+ __U, __R);
+}
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_range_sd (__m128d __A, __m128d __B, int __C)
+{
+ return (__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df) __A,
+ (__v2df) __B, __C,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_range_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B, int __C)
+{
+ return (__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df) __A,
+ (__v2df) __B, __C,
+ (__v2df) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_range_sd (__mmask8 __U, __m128d __A, __m128d __B, int __C)
+{
+ return (__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df) __A,
+ (__v2df) __B, __C,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_range_ss (__m128 __A, __m128 __B, int __C)
+{
+ return (__m128) __builtin_ia32_rangess128_mask_round ((__v4sf) __A,
+ (__v4sf) __B, __C,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_range_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B, int __C)
+{
+ return (__m128) __builtin_ia32_rangess128_mask_round ((__v4sf) __A,
+ (__v4sf) __B, __C,
+ (__v4sf) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_range_ss (__mmask8 __U, __m128 __A, __m128 __B, int __C)
+{
+ return (__m128) __builtin_ia32_rangess128_mask_round ((__v4sf) __A,
+ (__v4sf) __B, __C,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_range_round_sd (__m128d __A, __m128d __B, int __C, const int __R)
+{
+ return (__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df) __A,
+ (__v2df) __B, __C,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) -1, __R);
+}
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_range_round_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B,
+ int __C, const int __R)
+{
+ return (__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df) __A,
+ (__v2df) __B, __C,
+ (__v2df) __W,
+ (__mmask8) __U, __R);
+}
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_range_round_sd (__mmask8 __U, __m128d __A, __m128d __B, int __C,
+ const int __R)
+{
+ return (__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df) __A,
+ (__v2df) __B, __C,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U, __R);
+}
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_range_round_ss (__m128 __A, __m128 __B, int __C, const int __R)
+{
+ return (__m128) __builtin_ia32_rangess128_mask_round ((__v4sf) __A,
+ (__v4sf) __B, __C,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) -1, __R);
+}
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_range_round_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B,
+ int __C, const int __R)
+{
+ return (__m128) __builtin_ia32_rangess128_mask_round ((__v4sf) __A,
+ (__v4sf) __B, __C,
+ (__v4sf) __W,
+ (__mmask8) __U, __R);
+}
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_range_round_ss (__mmask8 __U, __m128 __A, __m128 __B, int __C,
+ const int __R)
+{
+ return (__m128) __builtin_ia32_rangess128_mask_round ((__v4sf) __A,
+ (__v4sf) __B, __C,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U, __R);
+}
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fpclass_ss_mask (__m128 __A, const int __imm)
+{
+ return (__mmask8) __builtin_ia32_fpclassss_mask ((__v4sf) __A, __imm,
+ (__mmask8) -1);
+}
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fpclass_sd_mask (__m128d __A, const int __imm)
+{
+ return (__mmask8) __builtin_ia32_fpclasssd_mask ((__v2df) __A, __imm,
+ (__mmask8) -1);
+}
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fpclass_ss_mask (__mmask8 __U, __m128 __A, const int __imm)
+{
+ return (__mmask8) __builtin_ia32_fpclassss_mask ((__v4sf) __A, __imm, __U);
+}
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fpclass_sd_mask (__mmask8 __U, __m128d __A, const int __imm)
+{
+ return (__mmask8) __builtin_ia32_fpclasssd_mask ((__v2df) __A, __imm, __U);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtt_roundpd_epi64 (__m512d __A, const int __R)
+{
+ return (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) -1,
+ __R);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtt_roundpd_epi64 (__m512i __W, __mmask8 __U, __m512d __A,
+ const int __R)
+{
+ return (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A,
+ (__v8di) __W,
+ (__mmask8) __U,
+ __R);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtt_roundpd_epi64 (__mmask8 __U, __m512d __A,
+ const int __R)
+{
+ return (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U,
+ __R);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtt_roundpd_epu64 (__m512d __A, const int __R)
+{
+ return (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) -1,
+ __R);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtt_roundpd_epu64 (__m512i __W, __mmask8 __U, __m512d __A,
+ const int __R)
+{
+ return (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A,
+ (__v8di) __W,
+ (__mmask8) __U,
+ __R);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtt_roundpd_epu64 (__mmask8 __U, __m512d __A,
+ const int __R)
+{
+ return (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U,
+ __R);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtt_roundps_epi64 (__m256 __A, const int __R)
+{
+ return (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) -1,
+ __R);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtt_roundps_epi64 (__m512i __W, __mmask8 __U, __m256 __A,
+ const int __R)
+{
+ return (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A,
+ (__v8di) __W,
+ (__mmask8) __U,
+ __R);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtt_roundps_epi64 (__mmask8 __U, __m256 __A,
+ const int __R)
+{
+ return (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U,
+ __R);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtt_roundps_epu64 (__m256 __A, const int __R)
+{
+ return (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) -1,
+ __R);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtt_roundps_epu64 (__m512i __W, __mmask8 __U, __m256 __A,
+ const int __R)
+{
+ return (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A,
+ (__v8di) __W,
+ (__mmask8) __U,
+ __R);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtt_roundps_epu64 (__mmask8 __U, __m256 __A,
+ const int __R)
+{
+ return (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U,
+ __R);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundpd_epi64 (__m512d __A, const int __R)
+{
+ return (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) -1,
+ __R);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundpd_epi64 (__m512i __W, __mmask8 __U, __m512d __A,
+ const int __R)
+{
+ return (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A,
+ (__v8di) __W,
+ (__mmask8) __U,
+ __R);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundpd_epi64 (__mmask8 __U, __m512d __A,
+ const int __R)
+{
+ return (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U,
+ __R);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundpd_epu64 (__m512d __A, const int __R)
+{
+ return (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) -1,
+ __R);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundpd_epu64 (__m512i __W, __mmask8 __U, __m512d __A,
+ const int __R)
+{
+ return (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A,
+ (__v8di) __W,
+ (__mmask8) __U,
+ __R);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundpd_epu64 (__mmask8 __U, __m512d __A,
+ const int __R)
+{
+ return (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U,
+ __R);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundps_epi64 (__m256 __A, const int __R)
+{
+ return (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) -1, + __R); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundps_epi64 (__m512i __W, __mmask8 __U, __m256 __A, + const int __R) +{ + return (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A, + (__v8di) __W, + (__mmask8) __U, + __R); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundps_epi64 (__mmask8 __U, __m256 __A, + const int __R) +{ + return (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U, + __R); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundps_epu64 (__m256 __A, const int __R) +{ + return (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) -1, + __R); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundps_epu64 (__m512i __W, __mmask8 __U, __m256 __A, + const int __R) +{ + return (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A, + (__v8di) __W, + (__mmask8) __U, + __R); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundps_epu64 (__mmask8 __U, __m256 __A, + const int __R) +{ + return (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U, + __R); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundepi64_ps (__m512i __A, const int __R) +{ + return (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) -1, + __R); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundepi64_ps (__m256 __W, __mmask8 __U, __m512i __A, + const int __R) +{ + return (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A, + (__v8sf) __W, + (__mmask8) __U, + __R); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundepi64_ps (__mmask8 __U, __m512i __A, + const int __R) +{ + return (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U, + __R); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundepu64_ps (__m512i __A, const int __R) +{ + return (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) -1, + __R); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundepu64_ps (__m256 __W, __mmask8 __U, __m512i __A, + const int __R) +{ + return (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A, + (__v8sf) __W, + (__mmask8) __U, + __R); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundepu64_ps (__mmask8 __U, __m512i __A, + const int __R) +{ + return (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U, + __R); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundepi64_pd (__m512i __A, const int __R) +{ + return (__m512d) 
__builtin_ia32_cvtqq2pd512_mask ((__v8di) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1, + __R); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundepi64_pd (__m512d __W, __mmask8 __U, __m512i __A, + const int __R) +{ + return (__m512d) __builtin_ia32_cvtqq2pd512_mask ((__v8di) __A, + (__v8df) __W, + (__mmask8) __U, + __R); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundepi64_pd (__mmask8 __U, __m512i __A, + const int __R) +{ + return (__m512d) __builtin_ia32_cvtqq2pd512_mask ((__v8di) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, + __R); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundepu64_pd (__m512i __A, const int __R) +{ + return (__m512d) __builtin_ia32_cvtuqq2pd512_mask ((__v8di) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1, + __R); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundepu64_pd (__m512d __W, __mmask8 __U, __m512i __A, + const int __R) +{ + return (__m512d) __builtin_ia32_cvtuqq2pd512_mask ((__v8di) __A, + (__v8df) __W, + (__mmask8) __U, + __R); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundepu64_pd (__mmask8 __U, __m512i __A, + const int __R) +{ + return (__m512d) __builtin_ia32_cvtuqq2pd512_mask ((__v8di) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, + __R); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_pd (__m512d __A, int __B) +{ + return (__m512d) __builtin_ia32_reducepd512_mask ((__v8df) __A, __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_round_pd (__m512d __A, int __B, const int __R) +{ + return (__m512d) __builtin_ia32_reducepd512_mask_round ((__v8df) __A, + __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1, __R); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_pd (__m512d __W, __mmask8 __U, __m512d __A, int __B) +{ + return (__m512d) __builtin_ia32_reducepd512_mask ((__v8df) __A, __B, + (__v8df) __W, + (__mmask8) __U); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_round_pd (__m512d __W, __mmask8 __U, __m512d __A, + int __B, const int __R) +{ + return (__m512d) __builtin_ia32_reducepd512_mask_round ((__v8df) __A, + __B, + (__v8df) __W, + __U, __R); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_reduce_pd (__mmask8 __U, __m512d __A, int __B) +{ + return (__m512d) __builtin_ia32_reducepd512_mask ((__v8df) __A, __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_reduce_round_pd (__mmask8 __U, __m512d __A, int __B, + const int __R) +{ + return (__m512d) __builtin_ia32_reducepd512_mask_round ((__v8df) __A, + __B, + (__v8df) + _mm512_setzero_pd (), + __U, __R); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_ps (__m512 __A, int __B) +{ + return (__m512) __builtin_ia32_reduceps512_mask ((__v16sf) __A, __B, + 
(__v16sf) + _mm512_setzero_ps (), + (__mmask16) -1); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_round_ps (__m512 __A, int __B, const int __R) +{ + return (__m512) __builtin_ia32_reduceps512_mask_round ((__v16sf) __A, + __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) -1, __R); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_ps (__m512 __W, __mmask16 __U, __m512 __A, int __B) +{ + return (__m512) __builtin_ia32_reduceps512_mask ((__v16sf) __A, __B, + (__v16sf) __W, + (__mmask16) __U); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_round_ps (__m512 __W, __mmask16 __U, __m512 __A, int __B, + const int __R) +{ + return (__m512) __builtin_ia32_reduceps512_mask_round ((__v16sf) __A, + __B, + (__v16sf) __W, + __U, __R); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_reduce_ps (__mmask16 __U, __m512 __A, int __B) +{ + return (__m512) __builtin_ia32_reduceps512_mask ((__v16sf) __A, __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_reduce_round_ps (__mmask16 __U, __m512 __A, int __B, + const int __R) +{ + return (__m512) __builtin_ia32_reduceps512_mask_round ((__v16sf) __A, + __B, + (__v16sf) + _mm512_setzero_ps (), + __U, __R); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_extractf32x8_ps (__m512 __A, const int __imm) +{ + return (__m256) __builtin_ia32_extractf32x8_mask ((__v16sf) __A, + __imm, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) -1); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_extractf32x8_ps (__m256 __W, __mmask8 __U, __m512 __A, + const int __imm) +{ + return (__m256) __builtin_ia32_extractf32x8_mask ((__v16sf) __A, + __imm, + (__v8sf) __W, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_extractf32x8_ps (__mmask8 __U, __m512 __A, + const int __imm) +{ + return (__m256) __builtin_ia32_extractf32x8_mask ((__v16sf) __A, + __imm, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_extractf64x2_pd (__m512d __A, const int __imm) +{ + return (__m128d) __builtin_ia32_extractf64x2_512_mask ((__v8df) __A, + __imm, + (__v2df) + _mm_setzero_pd (), + (__mmask8) -1); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_extractf64x2_pd (__m128d __W, __mmask8 __U, __m512d __A, + const int __imm) +{ + return (__m128d) __builtin_ia32_extractf64x2_512_mask ((__v8df) __A, + __imm, + (__v2df) __W, + (__mmask8) + __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_extractf64x2_pd (__mmask8 __U, __m512d __A, + const int __imm) +{ + return (__m128d) __builtin_ia32_extractf64x2_512_mask ((__v8df) __A, + __imm, + (__v2df) + _mm_setzero_pd (), + (__mmask8) + __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_extracti32x8_epi32 (__m512i __A, const int __imm) +{ + return (__m256i) __builtin_ia32_extracti32x8_mask ((__v16si) __A, + __imm, + 
(__v8si) + _mm256_setzero_si256 (), + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_extracti32x8_epi32 (__m256i __W, __mmask8 __U, __m512i __A, + const int __imm) +{ + return (__m256i) __builtin_ia32_extracti32x8_mask ((__v16si) __A, + __imm, + (__v8si) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_extracti32x8_epi32 (__mmask8 __U, __m512i __A, + const int __imm) +{ + return (__m256i) __builtin_ia32_extracti32x8_mask ((__v16si) __A, + __imm, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_extracti64x2_epi64 (__m512i __A, const int __imm) +{ + return (__m128i) __builtin_ia32_extracti64x2_512_mask ((__v8di) __A, + __imm, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) -1); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_extracti64x2_epi64 (__m128i __W, __mmask8 __U, __m512i __A, + const int __imm) +{ + return (__m128i) __builtin_ia32_extracti64x2_512_mask ((__v8di) __A, + __imm, + (__v2di) __W, + (__mmask8) + __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_extracti64x2_epi64 (__mmask8 __U, __m512i __A, + const int __imm) +{ + return (__m128i) __builtin_ia32_extracti64x2_512_mask ((__v8di) __A, + __imm, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) + __U); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_range_round_pd (__m512d __A, __m512d __B, int __C, + const int __R) +{ + return (__m512d) __builtin_ia32_rangepd512_mask ((__v8df) __A, + (__v8df) __B, __C, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1, + __R); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_range_round_pd (__m512d __W, __mmask8 __U, + __m512d __A, __m512d __B, int __C, + const int __R) +{ + return (__m512d) __builtin_ia32_rangepd512_mask ((__v8df) __A, + (__v8df) __B, __C, + (__v8df) __W, + (__mmask8) __U, + __R); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_range_round_pd (__mmask8 __U, __m512d __A, __m512d __B, + int __C, const int __R) +{ + return (__m512d) __builtin_ia32_rangepd512_mask ((__v8df) __A, + (__v8df) __B, __C, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, + __R); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_range_round_ps (__m512 __A, __m512 __B, int __C, const int __R) +{ + return (__m512) __builtin_ia32_rangeps512_mask ((__v16sf) __A, + (__v16sf) __B, __C, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) -1, + __R); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_range_round_ps (__m512 __W, __mmask16 __U, + __m512 __A, __m512 __B, int __C, + const int __R) +{ + return (__m512) __builtin_ia32_rangeps512_mask ((__v16sf) __A, + (__v16sf) __B, __C, + (__v16sf) __W, + (__mmask16) __U, + __R); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_range_round_ps (__mmask16 __U, __m512 __A, __m512 __B, + int __C, const int __R) +{ + return (__m512) __builtin_ia32_rangeps512_mask ((__v16sf) __A, + (__v16sf) __B, __C, + (__v16sf) + 
_mm512_setzero_ps (), + (__mmask16) __U, + __R); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_inserti32x8 (__m512i __A, __m256i __B, const int __imm) +{ + return (__m512i) __builtin_ia32_inserti32x8_mask ((__v16si) __A, + (__v8si) __B, + __imm, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_inserti32x8 (__m512i __W, __mmask16 __U, __m512i __A, + __m256i __B, const int __imm) +{ + return (__m512i) __builtin_ia32_inserti32x8_mask ((__v16si) __A, + (__v8si) __B, + __imm, + (__v16si) __W, + (__mmask16) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_inserti32x8 (__mmask16 __U, __m512i __A, __m256i __B, + const int __imm) +{ + return (__m512i) __builtin_ia32_inserti32x8_mask ((__v16si) __A, + (__v8si) __B, + __imm, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_insertf32x8 (__m512 __A, __m256 __B, const int __imm) +{ + return (__m512) __builtin_ia32_insertf32x8_mask ((__v16sf) __A, + (__v8sf) __B, + __imm, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) -1); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_insertf32x8 (__m512 __W, __mmask16 __U, __m512 __A, + __m256 __B, const int __imm) +{ + return (__m512) __builtin_ia32_insertf32x8_mask ((__v16sf) __A, + (__v8sf) __B, + __imm, + (__v16sf) __W, + (__mmask16) __U); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_insertf32x8 (__mmask16 __U, __m512 __A, __m256 __B, + const int __imm) +{ + return (__m512) __builtin_ia32_insertf32x8_mask ((__v16sf) __A, + (__v8sf) __B, + __imm, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_inserti64x2 (__m512i __A, __m128i __B, const int __imm) +{ + return (__m512i) __builtin_ia32_inserti64x2_512_mask ((__v8di) __A, + (__v2di) __B, + __imm, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_inserti64x2 (__m512i __W, __mmask8 __U, __m512i __A, + __m128i __B, const int __imm) +{ + return (__m512i) __builtin_ia32_inserti64x2_512_mask ((__v8di) __A, + (__v2di) __B, + __imm, + (__v8di) __W, + (__mmask8) + __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_inserti64x2 (__mmask8 __U, __m512i __A, __m128i __B, + const int __imm) +{ + return (__m512i) __builtin_ia32_inserti64x2_512_mask ((__v8di) __A, + (__v2di) __B, + __imm, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) + __U); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_insertf64x2 (__m512d __A, __m128d __B, const int __imm) +{ + return (__m512d) __builtin_ia32_insertf64x2_512_mask ((__v8df) __A, + (__v2df) __B, + __imm, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_insertf64x2 (__m512d __W, __mmask8 __U, __m512d __A, + __m128d __B, const int __imm) +{ + return (__m512d) __builtin_ia32_insertf64x2_512_mask ((__v8df) __A, 
+ (__v2df) __B, + __imm, + (__v8df) __W, + (__mmask8) + __U); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_insertf64x2 (__mmask8 __U, __m512d __A, __m128d __B, + const int __imm) +{ + return (__m512d) __builtin_ia32_insertf64x2_512_mask ((__v8df) __A, + (__v2df) __B, + __imm, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) + __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fpclass_pd_mask (__mmask8 __U, __m512d __A, + const int __imm) +{ + return (__mmask8) __builtin_ia32_fpclasspd512_mask ((__v8df) __A, + __imm, __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fpclass_pd_mask (__m512d __A, const int __imm) +{ + return (__mmask8) __builtin_ia32_fpclasspd512_mask ((__v8df) __A, + __imm, + (__mmask8) -1); +} +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fpclass_ps_mask (__mmask16 __U, __m512 __A, + const int __imm) +{ + return (__mmask16) __builtin_ia32_fpclassps512_mask ((__v16sf) __A, + __imm, __U); +} +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fpclass_ps_mask (__m512 __A, const int __imm) +{ + return (__mmask16) __builtin_ia32_fpclassps512_mask ((__v16sf) __A, + __imm, + (__mmask16) -1); +} +#else +#define _kshiftli_mask8(X, Y) ((__mmask8) __builtin_ia32_kshiftliqi ((__mmask8)(X), (__mmask8)(Y))) +#define _kshiftri_mask8(X, Y) ((__mmask8) __builtin_ia32_kshiftriqi ((__mmask8)(X), (__mmask8)(Y))) +#define _mm_range_sd(A, B, C) ((__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_setzero_pd (), (__mmask8) -1, _MM_FROUND_CUR_DIRECTION)) +#define _mm_mask_range_sd(W, U, A, B, C) ((__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), (__v2df)(__m128d)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) +#define _mm_maskz_range_sd(U, A, B, C) ((__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_setzero_pd (), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) +#define _mm_range_ss(A, B, C) ((__m128) __builtin_ia32_rangess128_mask_round ((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_setzero_ps (), (__mmask8) -1, _MM_FROUND_CUR_DIRECTION)) +#define _mm_mask_range_ss(W, U, A, B, C) ((__m128) __builtin_ia32_rangess128_mask_round ((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf)(__m128)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) +#define _mm_maskz_range_ss(U, A, B, C) ((__m128) __builtin_ia32_rangess128_mask_round ((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_setzero_ps (), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) +#define _mm_range_round_sd(A, B, C, R) ((__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_setzero_pd (), (__mmask8) -1, (R))) +#define _mm_mask_range_round_sd(W, U, A, B, C, R) ((__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), (__v2df)(__m128d)(W), (__mmask8)(U), (R))) +#define _mm_maskz_range_round_sd(U, A, B, C, R) ((__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_setzero_pd (), (__mmask8)(U), (R))) +#define _mm_range_round_ss(A, B, C, R) ((__m128) 
__builtin_ia32_rangess128_mask_round ((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_setzero_ps (), (__mmask8) -1, (R))) +#define _mm_mask_range_round_ss(W, U, A, B, C, R) ((__m128) __builtin_ia32_rangess128_mask_round ((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf)(__m128)(W), (__mmask8)(U), (R))) +#define _mm_maskz_range_round_ss(U, A, B, C, R) ((__m128) __builtin_ia32_rangess128_mask_round ((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_setzero_ps (), (__mmask8)(U), (R))) +#define _mm512_cvtt_roundpd_epi64(A, B) ((__m512i)__builtin_ia32_cvttpd2qq512_mask ((A), (__v8di) _mm512_setzero_si512 (), -1, (B))) +#define _mm512_mask_cvtt_roundpd_epi64(W, U, A, B) ((__m512i)__builtin_ia32_cvttpd2qq512_mask ((A), (__v8di)(W), (U), (B))) +#define _mm512_maskz_cvtt_roundpd_epi64(U, A, B) ((__m512i)__builtin_ia32_cvttpd2qq512_mask ((A), (__v8di)_mm512_setzero_si512 (), (U), (B))) +#define _mm512_cvtt_roundpd_epu64(A, B) ((__m512i)__builtin_ia32_cvttpd2uqq512_mask ((A), (__v8di)_mm512_setzero_si512 (), -1, (B))) +#define _mm512_mask_cvtt_roundpd_epu64(W, U, A, B) ((__m512i)__builtin_ia32_cvttpd2uqq512_mask ((A), (__v8di)(W), (U), (B))) +#define _mm512_maskz_cvtt_roundpd_epu64(U, A, B) ((__m512i)__builtin_ia32_cvttpd2uqq512_mask ((A), (__v8di)_mm512_setzero_si512 (), (U), (B))) +#define _mm512_cvtt_roundps_epi64(A, B) ((__m512i)__builtin_ia32_cvttps2qq512_mask ((A), (__v8di)_mm512_setzero_si512 (), -1, (B))) +#define _mm512_mask_cvtt_roundps_epi64(W, U, A, B) ((__m512i)__builtin_ia32_cvttps2qq512_mask ((A), (__v8di)(W), (U), (B))) +#define _mm512_maskz_cvtt_roundps_epi64(U, A, B) ((__m512i)__builtin_ia32_cvttps2qq512_mask ((A), (__v8di)_mm512_setzero_si512 (), (U), (B))) +#define _mm512_cvtt_roundps_epu64(A, B) ((__m512i)__builtin_ia32_cvttps2uqq512_mask ((A), (__v8di)_mm512_setzero_si512 (), -1, (B))) +#define _mm512_mask_cvtt_roundps_epu64(W, U, A, B) ((__m512i)__builtin_ia32_cvttps2uqq512_mask ((A), (__v8di)(W), (U), (B))) +#define _mm512_maskz_cvtt_roundps_epu64(U, A, B) ((__m512i)__builtin_ia32_cvttps2uqq512_mask ((A), (__v8di)_mm512_setzero_si512 (), (U), (B))) +#define _mm512_cvt_roundpd_epi64(A, B) ((__m512i)__builtin_ia32_cvtpd2qq512_mask ((A), (__v8di)_mm512_setzero_si512 (), -1, (B))) +#define _mm512_mask_cvt_roundpd_epi64(W, U, A, B) ((__m512i)__builtin_ia32_cvtpd2qq512_mask ((A), (__v8di)(W), (U), (B))) +#define _mm512_maskz_cvt_roundpd_epi64(U, A, B) ((__m512i)__builtin_ia32_cvtpd2qq512_mask ((A), (__v8di)_mm512_setzero_si512 (), (U), (B))) +#define _mm512_cvt_roundpd_epu64(A, B) ((__m512i)__builtin_ia32_cvtpd2uqq512_mask ((A), (__v8di)_mm512_setzero_si512 (), -1, (B))) +#define _mm512_mask_cvt_roundpd_epu64(W, U, A, B) ((__m512i)__builtin_ia32_cvtpd2uqq512_mask ((A), (__v8di)(W), (U), (B))) +#define _mm512_maskz_cvt_roundpd_epu64(U, A, B) ((__m512i)__builtin_ia32_cvtpd2uqq512_mask ((A), (__v8di)_mm512_setzero_si512 (), (U), (B))) +#define _mm512_cvt_roundps_epi64(A, B) ((__m512i)__builtin_ia32_cvtps2qq512_mask ((A), (__v8di)_mm512_setzero_si512 (), -1, (B))) +#define _mm512_mask_cvt_roundps_epi64(W, U, A, B) ((__m512i)__builtin_ia32_cvtps2qq512_mask ((A), (__v8di)(W), (U), (B))) +#define _mm512_maskz_cvt_roundps_epi64(U, A, B) ((__m512i)__builtin_ia32_cvtps2qq512_mask ((A), (__v8di)_mm512_setzero_si512 (), (U), (B))) +#define _mm512_cvt_roundps_epu64(A, B) ((__m512i)__builtin_ia32_cvtps2uqq512_mask ((A), (__v8di)_mm512_setzero_si512 (), -1, (B))) +#define _mm512_mask_cvt_roundps_epu64(W, U, A, B) ((__m512i)__builtin_ia32_cvtps2uqq512_mask 
((A), (__v8di)(W), (U), (B))) +#define _mm512_maskz_cvt_roundps_epu64(U, A, B) ((__m512i)__builtin_ia32_cvtps2uqq512_mask ((A), (__v8di)_mm512_setzero_si512 (), (U), (B))) +#define _mm512_cvt_roundepi64_ps(A, B) ((__m256)__builtin_ia32_cvtqq2ps512_mask ((__v8di)(A), (__v8sf)_mm256_setzero_ps (), -1, (B))) +#define _mm512_mask_cvt_roundepi64_ps(W, U, A, B) ((__m256)__builtin_ia32_cvtqq2ps512_mask ((__v8di)(A), (W), (U), (B))) +#define _mm512_maskz_cvt_roundepi64_ps(U, A, B) ((__m256)__builtin_ia32_cvtqq2ps512_mask ((__v8di)(A), (__v8sf)_mm256_setzero_ps (), (U), (B))) +#define _mm512_cvt_roundepu64_ps(A, B) ((__m256)__builtin_ia32_cvtuqq2ps512_mask ((__v8di)(A), (__v8sf)_mm256_setzero_ps (), -1, (B))) +#define _mm512_mask_cvt_roundepu64_ps(W, U, A, B) ((__m256)__builtin_ia32_cvtuqq2ps512_mask ((__v8di)(A), (W), (U), (B))) +#define _mm512_maskz_cvt_roundepu64_ps(U, A, B) ((__m256)__builtin_ia32_cvtuqq2ps512_mask ((__v8di)(A), (__v8sf)_mm256_setzero_ps (), (U), (B))) +#define _mm512_cvt_roundepi64_pd(A, B) ((__m512d)__builtin_ia32_cvtqq2pd512_mask ((__v8di)(A), (__v8df)_mm512_setzero_pd (), -1, (B))) +#define _mm512_mask_cvt_roundepi64_pd(W, U, A, B) ((__m512d)__builtin_ia32_cvtqq2pd512_mask ((__v8di)(A), (W), (U), (B))) +#define _mm512_maskz_cvt_roundepi64_pd(U, A, B) ((__m512d)__builtin_ia32_cvtqq2pd512_mask ((__v8di)(A), (__v8df)_mm512_setzero_pd (), (U), (B))) +#define _mm512_cvt_roundepu64_pd(A, B) ((__m512d)__builtin_ia32_cvtuqq2pd512_mask ((__v8di)(A), (__v8df)_mm512_setzero_pd (), -1, (B))) +#define _mm512_mask_cvt_roundepu64_pd(W, U, A, B) ((__m512d)__builtin_ia32_cvtuqq2pd512_mask ((__v8di)(A), (W), (U), (B))) +#define _mm512_maskz_cvt_roundepu64_pd(U, A, B) ((__m512d)__builtin_ia32_cvtuqq2pd512_mask ((__v8di)(A), (__v8df)_mm512_setzero_pd (), (U), (B))) +#define _mm512_reduce_pd(A, B) ((__m512d) __builtin_ia32_reducepd512_mask ((__v8df)(__m512d)(A), (int)(B), (__v8df)_mm512_setzero_pd (), (__mmask8)-1)) +#define _mm512_reduce_round_pd(A, B, R) ((__m512d) __builtin_ia32_reducepd512_mask_round ((__v8df)(__m512d)(A), (int)(B), (__v8df)_mm512_setzero_pd (), (__mmask8)-1, (R))) +#define _mm512_mask_reduce_pd(W, U, A, B) ((__m512d) __builtin_ia32_reducepd512_mask ((__v8df)(__m512d)(A), (int)(B), (__v8df)(__m512d)(W), (__mmask8)(U))) +#define _mm512_mask_reduce_round_pd(W, U, A, B, R) ((__m512d) __builtin_ia32_reducepd512_mask_round ((__v8df)(__m512d)(A), (int)(B), (__v8df)(__m512d)(W), (U), (R))) +#define _mm512_maskz_reduce_pd(U, A, B) ((__m512d) __builtin_ia32_reducepd512_mask ((__v8df)(__m512d)(A), (int)(B), (__v8df)_mm512_setzero_pd (), (__mmask8)(U))) +#define _mm512_maskz_reduce_round_pd(U, A, B, R) ((__m512d) __builtin_ia32_reducepd512_mask_round ((__v8df)(__m512d)(A), (int)(B), (__v8df)_mm512_setzero_pd (), (U), (R))) +#define _mm512_reduce_ps(A, B) ((__m512) __builtin_ia32_reduceps512_mask ((__v16sf)(__m512)(A), (int)(B), (__v16sf)_mm512_setzero_ps (), (__mmask16)-1)) +#define _mm512_reduce_round_ps(A, B, R) ((__m512) __builtin_ia32_reduceps512_mask_round ((__v16sf)(__m512)(A), (int)(B), (__v16sf)_mm512_setzero_ps (), (__mmask16)-1, (R))) +#define _mm512_mask_reduce_ps(W, U, A, B) ((__m512) __builtin_ia32_reduceps512_mask ((__v16sf)(__m512)(A), (int)(B), (__v16sf)(__m512)(W), (__mmask16)(U))) +#define _mm512_mask_reduce_round_ps(W, U, A, B, R) ((__m512) __builtin_ia32_reduceps512_mask_round ((__v16sf)(__m512)(A), (int)(B), (__v16sf)(__m512)(W), (U), (R))) +#define _mm512_maskz_reduce_ps(U, A, B) ((__m512) __builtin_ia32_reduceps512_mask ((__v16sf)(__m512)(A), (int)(B), 
(__v16sf)_mm512_setzero_ps (), (__mmask16)(U))) +#define _mm512_maskz_reduce_round_ps(U, A, B, R) ((__m512) __builtin_ia32_reduceps512_mask_round ((__v16sf)(__m512)(A), (int)(B), (__v16sf)_mm512_setzero_ps (), (__mmask16)(U), (R))) +#define _mm512_extractf32x8_ps(X, C) ((__m256) __builtin_ia32_extractf32x8_mask ((__v16sf)(__m512) (X), (int) (C), (__v8sf)(__m256) _mm256_setzero_ps (), (__mmask8)-1)) +#define _mm512_mask_extractf32x8_ps(W, U, X, C) ((__m256) __builtin_ia32_extractf32x8_mask ((__v16sf)(__m512) (X), (int) (C), (__v8sf)(__m256) (W), (__mmask8) (U))) +#define _mm512_maskz_extractf32x8_ps(U, X, C) ((__m256) __builtin_ia32_extractf32x8_mask ((__v16sf)(__m512) (X), (int) (C), (__v8sf)(__m256) _mm256_setzero_ps (), (__mmask8) (U))) +#define _mm512_extractf64x2_pd(X, C) ((__m128d) __builtin_ia32_extractf64x2_512_mask ((__v8df)(__m512d) (X), (int) (C), (__v2df)(__m128d) _mm_setzero_pd (), (__mmask8)-1)) +#define _mm512_mask_extractf64x2_pd(W, U, X, C) ((__m128d) __builtin_ia32_extractf64x2_512_mask ((__v8df)(__m512d) (X), (int) (C), (__v2df)(__m128d) (W), (__mmask8) (U))) +#define _mm512_maskz_extractf64x2_pd(U, X, C) ((__m128d) __builtin_ia32_extractf64x2_512_mask ((__v8df)(__m512d) (X), (int) (C), (__v2df)(__m128d) _mm_setzero_pd (), (__mmask8) (U))) +#define _mm512_extracti32x8_epi32(X, C) ((__m256i) __builtin_ia32_extracti32x8_mask ((__v16si)(__m512i) (X), (int) (C), (__v8si)(__m256i) _mm256_setzero_si256 (), (__mmask8)-1)) +#define _mm512_mask_extracti32x8_epi32(W, U, X, C) ((__m256i) __builtin_ia32_extracti32x8_mask ((__v16si)(__m512i) (X), (int) (C), (__v8si)(__m256i) (W), (__mmask8) (U))) +#define _mm512_maskz_extracti32x8_epi32(U, X, C) ((__m256i) __builtin_ia32_extracti32x8_mask ((__v16si)(__m512i) (X), (int) (C), (__v8si)(__m256i) _mm256_setzero_si256 (), (__mmask8) (U))) +#define _mm512_extracti64x2_epi64(X, C) ((__m128i) __builtin_ia32_extracti64x2_512_mask ((__v8di)(__m512i) (X), (int) (C), (__v2di)(__m128i) _mm_setzero_si128 (), (__mmask8)-1)) +#define _mm512_mask_extracti64x2_epi64(W, U, X, C) ((__m128i) __builtin_ia32_extracti64x2_512_mask ((__v8di)(__m512i) (X), (int) (C), (__v2di)(__m128i) (W), (__mmask8) (U))) +#define _mm512_maskz_extracti64x2_epi64(U, X, C) ((__m128i) __builtin_ia32_extracti64x2_512_mask ((__v8di)(__m512i) (X), (int) (C), (__v2di)(__m128i) _mm_setzero_si128 (), (__mmask8) (U))) +#define _mm512_range_pd(A, B, C) ((__m512d) __builtin_ia32_rangepd512_mask ((__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), (__v8df)_mm512_setzero_pd (), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION)) +#define _mm512_mask_range_pd(W, U, A, B, C) ((__m512d) __builtin_ia32_rangepd512_mask ((__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), (__v8df)(__m512d)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) +#define _mm512_maskz_range_pd(U, A, B, C) ((__m512d) __builtin_ia32_rangepd512_mask ((__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), (__v8df)_mm512_setzero_pd (), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) +#define _mm512_range_ps(A, B, C) ((__m512) __builtin_ia32_rangeps512_mask ((__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), (__v16sf)_mm512_setzero_ps (), (__mmask16)-1, _MM_FROUND_CUR_DIRECTION)) +#define _mm512_mask_range_ps(W, U, A, B, C) ((__m512) __builtin_ia32_rangeps512_mask ((__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), (__v16sf)(__m512)(W), (__mmask16)(U), _MM_FROUND_CUR_DIRECTION)) +#define _mm512_maskz_range_ps(U, A, B, C) ((__m512) __builtin_ia32_rangeps512_mask ((__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), 
(__v16sf)_mm512_setzero_ps (), (__mmask16)(U), _MM_FROUND_CUR_DIRECTION)) +#define _mm512_range_round_pd(A, B, C, R) ((__m512d) __builtin_ia32_rangepd512_mask ((__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), (__v8df)_mm512_setzero_pd (), (__mmask8)-1, (R))) +#define _mm512_mask_range_round_pd(W, U, A, B, C, R) ((__m512d) __builtin_ia32_rangepd512_mask ((__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), (__v8df)(__m512d)(W), (__mmask8)(U), (R))) +#define _mm512_maskz_range_round_pd(U, A, B, C, R) ((__m512d) __builtin_ia32_rangepd512_mask ((__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), (__v8df)_mm512_setzero_pd (), (__mmask8)(U), (R))) +#define _mm512_range_round_ps(A, B, C, R) ((__m512) __builtin_ia32_rangeps512_mask ((__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), (__v16sf)_mm512_setzero_ps (), (__mmask16)-1, (R))) +#define _mm512_mask_range_round_ps(W, U, A, B, C, R) ((__m512) __builtin_ia32_rangeps512_mask ((__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), (__v16sf)(__m512)(W), (__mmask16)(U), (R))) +#define _mm512_maskz_range_round_ps(U, A, B, C, R) ((__m512) __builtin_ia32_rangeps512_mask ((__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), (__v16sf)_mm512_setzero_ps (), (__mmask16)(U), (R))) +#define _mm512_insertf64x2(X, Y, C) ((__m512d) __builtin_ia32_insertf64x2_512_mask ((__v8df)(__m512d) (X), (__v2df)(__m128d) (Y), (int) (C), (__v8df)(__m512d) (X), (__mmask8)-1)) +#define _mm512_mask_insertf64x2(W, U, X, Y, C) ((__m512d) __builtin_ia32_insertf64x2_512_mask ((__v8df)(__m512d) (X), (__v2df)(__m128d) (Y), (int) (C), (__v8df)(__m512d) (W), (__mmask8) (U))) +#define _mm512_maskz_insertf64x2(U, X, Y, C) ((__m512d) __builtin_ia32_insertf64x2_512_mask ((__v8df)(__m512d) (X), (__v2df)(__m128d) (Y), (int) (C), (__v8df)(__m512d) _mm512_setzero_pd (), (__mmask8) (U))) +#define _mm512_inserti64x2(X, Y, C) ((__m512i) __builtin_ia32_inserti64x2_512_mask ((__v8di)(__m512i) (X), (__v2di)(__m128i) (Y), (int) (C), (__v8di)(__m512i) (X), (__mmask8)-1)) +#define _mm512_mask_inserti64x2(W, U, X, Y, C) ((__m512i) __builtin_ia32_inserti64x2_512_mask ((__v8di)(__m512i) (X), (__v2di)(__m128i) (Y), (int) (C), (__v8di)(__m512i) (W), (__mmask8) (U))) +#define _mm512_maskz_inserti64x2(U, X, Y, C) ((__m512i) __builtin_ia32_inserti64x2_512_mask ((__v8di)(__m512i) (X), (__v2di)(__m128i) (Y), (int) (C), (__v8di)(__m512i) _mm512_setzero_si512 (), (__mmask8) (U))) +#define _mm512_insertf32x8(X, Y, C) ((__m512) __builtin_ia32_insertf32x8_mask ((__v16sf)(__m512) (X), (__v8sf)(__m256) (Y), (int) (C), (__v16sf)(__m512)_mm512_setzero_ps (), (__mmask16)-1)) +#define _mm512_mask_insertf32x8(W, U, X, Y, C) ((__m512) __builtin_ia32_insertf32x8_mask ((__v16sf)(__m512) (X), (__v8sf)(__m256) (Y), (int) (C), (__v16sf)(__m512)(W), (__mmask16)(U))) +#define _mm512_maskz_insertf32x8(U, X, Y, C) ((__m512) __builtin_ia32_insertf32x8_mask ((__v16sf)(__m512) (X), (__v8sf)(__m256) (Y), (int) (C), (__v16sf)(__m512)_mm512_setzero_ps (), (__mmask16)(U))) +#define _mm512_inserti32x8(X, Y, C) ((__m512i) __builtin_ia32_inserti32x8_mask ((__v16si)(__m512i) (X), (__v8si)(__m256i) (Y), (int) (C), (__v16si)(__m512i)_mm512_setzero_si512 (), (__mmask16)-1)) +#define _mm512_mask_inserti32x8(W, U, X, Y, C) ((__m512i) __builtin_ia32_inserti32x8_mask ((__v16si)(__m512i) (X), (__v8si)(__m256i) (Y), (int) (C), (__v16si)(__m512i)(W), (__mmask16)(U))) +#define _mm512_maskz_inserti32x8(U, X, Y, C) ((__m512i) __builtin_ia32_inserti32x8_mask ((__v16si)(__m512i) (X), (__v8si)(__m256i) (Y), (int) (C), 
(__v16si)(__m512i)_mm512_setzero_si512 (), (__mmask16)(U))) +#define _mm_fpclass_ss_mask(X, C) ((__mmask8) __builtin_ia32_fpclassss_mask ((__v4sf) (__m128) (X), (int) (C), (__mmask8) (-1))) +#define _mm_fpclass_sd_mask(X, C) ((__mmask8) __builtin_ia32_fpclasssd_mask ((__v2df) (__m128d) (X), (int) (C), (__mmask8) (-1))) +#define _mm_mask_fpclass_ss_mask(U, X, C) ((__mmask8) __builtin_ia32_fpclassss_mask ((__v4sf) (__m128) (X), (int) (C), (__mmask8) (U))) +#define _mm_mask_fpclass_sd_mask(U, X, C) ((__mmask8) __builtin_ia32_fpclasssd_mask ((__v2df) (__m128d) (X), (int) (C), (__mmask8) (U))) +#define _mm512_mask_fpclass_pd_mask(u, X, C) ((__mmask8) __builtin_ia32_fpclasspd512_mask ((__v8df) (__m512d) (X), (int) (C), (__mmask8)(u))) +#define _mm512_mask_fpclass_ps_mask(u, x, c) ((__mmask16) __builtin_ia32_fpclassps512_mask ((__v16sf) (__m512) (x), (int) (c),(__mmask16)(u))) +#define _mm512_fpclass_pd_mask(X, C) ((__mmask8) __builtin_ia32_fpclasspd512_mask ((__v8df) (__m512d) (X), (int) (C), (__mmask8)-1)) +#define _mm512_fpclass_ps_mask(x, c) ((__mmask16) __builtin_ia32_fpclassps512_mask ((__v16sf) (__m512) (x), (int) (c),(__mmask16)-1)) +#define _mm_reduce_sd(A, B, C) ((__m128d) __builtin_ia32_reducesd_mask ((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_setzero_pd (), (__mmask8)-1)) +#define _mm_mask_reduce_sd(W, U, A, B, C) ((__m128d) __builtin_ia32_reducesd_mask ((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), (__v2df)(__m128d)(W), (__mmask8)(U))) +#define _mm_maskz_reduce_sd(U, A, B, C) ((__m128d) __builtin_ia32_reducesd_mask ((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_setzero_pd (), (__mmask8)(U))) +#define _mm_reduce_round_sd(A, B, C, R) ((__m128d) __builtin_ia32_reducesd_mask_round ((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_setzero_pd (), (__mmask8)-1, (int)(R))) +#define _mm_mask_reduce_round_sd(W, U, A, B, C, R) ((__m128d) __builtin_ia32_reducesd_mask_round ((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), (__v2df)(__m128d)(W), (__mmask8)(U), (int)(R))) +#define _mm_maskz_reduce_round_sd(U, A, B, C, R) ((__m128d) __builtin_ia32_reducesd_mask_round ((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_setzero_pd (), (__mmask8)(U), (int)(R))) +#define _mm_reduce_ss(A, B, C) ((__m128) __builtin_ia32_reducess_mask ((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_setzero_ps (), (__mmask8)-1)) +#define _mm_mask_reduce_ss(W, U, A, B, C) ((__m128) __builtin_ia32_reducess_mask ((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf)(__m128)(W), (__mmask8)(U))) +#define _mm_maskz_reduce_ss(U, A, B, C) ((__m128) __builtin_ia32_reducess_mask ((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_setzero_ps (), (__mmask8)(U))) +#define _mm_reduce_round_ss(A, B, C, R) ((__m128) __builtin_ia32_reducess_mask_round ((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_setzero_ps (), (__mmask8)-1, (int)(R))) +#define _mm_mask_reduce_round_ss(W, U, A, B, C, R) ((__m128) __builtin_ia32_reducess_mask_round ((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf)(__m128)(W), (__mmask8)(U), (int)(R))) +#define _mm_maskz_reduce_round_ss(U, A, B, C, R) ((__m128) __builtin_ia32_reducess_mask_round ((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_setzero_ps (), (__mmask8)(U), (int)(R))) +#endif #ifdef __DISABLE_AVX512DQ__ #undef __DISABLE_AVX512DQ__ #pragma GCC pop_options -#endif /* __DISABLE_AVX512DQ__ */ - -#endif /* _AVX512DQINTRIN_H_INCLUDED */ +#endif +#endif +#endif
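Before the next file, a short sketch of how the AVX512DQ fpclass macros above are exercised may help review. The file name fpclass_test.c is hypothetical and not part of this patch; it assumes a standard toolchain immintrin.h and an AVX512DQ-capable CPU (build with e.g. cc -mavx512dq). Building at -O0 goes through the macro branch above, while -O2 takes the __OPTIMIZE__ inline branch, so both definitions must accept the same argument order.

    #include <immintrin.h>
    #include <math.h>
    #include <stdio.h>

    int main(void) {
      /* Element 5 (the third argument of _mm512_set_pd) is a NaN. */
      __m512d v = _mm512_set_pd(1.0, 2.0, NAN, 3.0, 4.0, 5.0, 6.0, 7.0);
      /* 0x81 = QNaN (0x01) | SNaN (0x80) per the VFPCLASSPD immediate
         encoding: one mask bit is set per NaN lane. */
      __mmask8 k = _mm512_fpclass_pd_mask(v, 0x81);
      printf("nan lanes: 0x%02x\n", (unsigned)k); /* expect 0x20 */
      return 0;
    }

diff --git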
a/third_party/intel/avx512erintrin.internal.h b/third_party/intel/avx512erintrin.internal.h index d59af79bb..d963eb2bd 100644 --- a/third_party/intel/avx512erintrin.internal.h +++ b/third_party/intel/avx512erintrin.internal.h @@ -1,281 +1,357 @@ +/* clang-format off */ +#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0) #ifndef _IMMINTRIN_H_INCLUDED #error "Never use directly; include instead." #endif - #ifndef _AVX512ERINTRIN_H_INCLUDED #define _AVX512ERINTRIN_H_INCLUDED - #ifndef __AVX512ER__ #pragma GCC push_options #pragma GCC target("avx512er") #define __DISABLE_AVX512ER__ -#endif /* __AVX512ER__ */ - -typedef double __v8df __attribute__((__vector_size__(64))); -typedef float __v16sf __attribute__((__vector_size__(64))); - -typedef float __m512 __attribute__((__vector_size__(64), __may_alias__)); -typedef double __m512d __attribute__((__vector_size__(64), __may_alias__)); - +#endif +typedef double __v8df __attribute__ ((__vector_size__ (64))); +typedef float __v16sf __attribute__ ((__vector_size__ (64))); +typedef float __m512 __attribute__ ((__vector_size__ (64), __may_alias__)); +typedef double __m512d __attribute__ ((__vector_size__ (64), __may_alias__)); typedef unsigned char __mmask8; typedef unsigned short __mmask16; - #ifdef __OPTIMIZE__ -__funline __m512d _mm512_exp2a23_round_pd(__m512d __A, int __R) { +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_exp2a23_round_pd (__m512d __A, int __R) +{ __m512d __W; - return (__m512d)__builtin_ia32_exp2pd_mask((__v8df)__A, (__v8df)__W, - (__mmask8)-1, __R); + return (__m512d) __builtin_ia32_exp2pd_mask ((__v8df) __A, + (__v8df) __W, + (__mmask8) -1, __R); } - -__funline __m512d _mm512_mask_exp2a23_round_pd(__m512d __W, __mmask8 __U, - __m512d __A, int __R) { - return (__m512d)__builtin_ia32_exp2pd_mask((__v8df)__A, (__v8df)__W, - (__mmask8)__U, __R); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_exp2a23_round_pd (__m512d __W, __mmask8 __U, __m512d __A, int __R) +{ + return (__m512d) __builtin_ia32_exp2pd_mask ((__v8df) __A, + (__v8df) __W, + (__mmask8) __U, __R); } - -__funline __m512d _mm512_maskz_exp2a23_round_pd(__mmask8 __U, __m512d __A, - int __R) { - return (__m512d)__builtin_ia32_exp2pd_mask( - (__v8df)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, __R); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_exp2a23_round_pd (__mmask8 __U, __m512d __A, int __R) +{ + return (__m512d) __builtin_ia32_exp2pd_mask ((__v8df) __A, + (__v8df) _mm512_setzero_pd (), + (__mmask8) __U, __R); } - -__funline __m512 _mm512_exp2a23_round_ps(__m512 __A, int __R) { +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_exp2a23_round_ps (__m512 __A, int __R) +{ __m512 __W; - return (__m512)__builtin_ia32_exp2ps_mask((__v16sf)__A, (__v16sf)__W, - (__mmask16)-1, __R); + return (__m512) __builtin_ia32_exp2ps_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) -1, __R); } - -__funline __m512 _mm512_mask_exp2a23_round_ps(__m512 __W, __mmask16 __U, - __m512 __A, int __R) { - return (__m512)__builtin_ia32_exp2ps_mask((__v16sf)__A, (__v16sf)__W, - (__mmask16)__U, __R); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_exp2a23_round_ps (__m512 __W, __mmask16 __U, __m512 __A, int __R) +{ + return (__m512) __builtin_ia32_exp2ps_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) __U, 
__R); } - -__funline __m512 _mm512_maskz_exp2a23_round_ps(__mmask16 __U, __m512 __A, - int __R) { - return (__m512)__builtin_ia32_exp2ps_mask( - (__v16sf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U, __R); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_exp2a23_round_ps (__mmask16 __U, __m512 __A, int __R) +{ + return (__m512) __builtin_ia32_exp2ps_mask ((__v16sf) __A, + (__v16sf) _mm512_setzero_ps (), + (__mmask16) __U, __R); } - -__funline __m512d _mm512_rcp28_round_pd(__m512d __A, int __R) { +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_rcp28_round_pd (__m512d __A, int __R) +{ __m512d __W; - return (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)__A, (__v8df)__W, - (__mmask8)-1, __R); + return (__m512d) __builtin_ia32_rcp28pd_mask ((__v8df) __A, + (__v8df) __W, + (__mmask8) -1, __R); } - -__funline __m512d _mm512_mask_rcp28_round_pd(__m512d __W, __mmask8 __U, - __m512d __A, int __R) { - return (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)__A, (__v8df)__W, - (__mmask8)__U, __R); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_rcp28_round_pd (__m512d __W, __mmask8 __U, __m512d __A, int __R) +{ + return (__m512d) __builtin_ia32_rcp28pd_mask ((__v8df) __A, + (__v8df) __W, + (__mmask8) __U, __R); } - -__funline __m512d _mm512_maskz_rcp28_round_pd(__mmask8 __U, __m512d __A, - int __R) { - return (__m512d)__builtin_ia32_rcp28pd_mask( - (__v8df)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, __R); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_rcp28_round_pd (__mmask8 __U, __m512d __A, int __R) +{ + return (__m512d) __builtin_ia32_rcp28pd_mask ((__v8df) __A, + (__v8df) _mm512_setzero_pd (), + (__mmask8) __U, __R); } - -__funline __m512 _mm512_rcp28_round_ps(__m512 __A, int __R) { +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_rcp28_round_ps (__m512 __A, int __R) +{ __m512 __W; - return (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)__A, (__v16sf)__W, - (__mmask16)-1, __R); + return (__m512) __builtin_ia32_rcp28ps_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) -1, __R); } - -__funline __m512 _mm512_mask_rcp28_round_ps(__m512 __W, __mmask16 __U, __m512 __A, - int __R) { - return (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)__A, (__v16sf)__W, - (__mmask16)__U, __R); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_rcp28_round_ps (__m512 __W, __mmask16 __U, __m512 __A, int __R) +{ + return (__m512) __builtin_ia32_rcp28ps_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) __U, __R); } - -__funline __m512 _mm512_maskz_rcp28_round_ps(__mmask16 __U, __m512 __A, int __R) { - return (__m512)__builtin_ia32_rcp28ps_mask( - (__v16sf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U, __R); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_rcp28_round_ps (__mmask16 __U, __m512 __A, int __R) +{ + return (__m512) __builtin_ia32_rcp28ps_mask ((__v16sf) __A, + (__v16sf) _mm512_setzero_ps (), + (__mmask16) __U, __R); } - -__funline __m128d _mm_rcp28_round_sd(__m128d __A, __m128d __B, int __R) { - return (__m128d)__builtin_ia32_rcp28sd_round((__v2df)__B, (__v2df)__A, __R); +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rcp28_round_sd (__m128d __A, __m128d __B, 
int __R) +{ + return (__m128d) __builtin_ia32_rcp28sd_round ((__v2df) __B, + (__v2df) __A, + __R); } - -__funline __m128 _mm_rcp28_round_ss(__m128 __A, __m128 __B, int __R) { - return (__m128)__builtin_ia32_rcp28ss_round((__v4sf)__B, (__v4sf)__A, __R); +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_rcp28_round_sd (__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B, int __R) +{ + return (__m128d) __builtin_ia32_rcp28sd_mask_round ((__v2df) __B, + (__v2df) __A, + (__v2df) __W, + __U, + __R); } - -__funline __m512d _mm512_rsqrt28_round_pd(__m512d __A, int __R) { +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_rcp28_round_sd (__mmask8 __U, __m128d __A, __m128d __B, int __R) +{ + return (__m128d) __builtin_ia32_rcp28sd_mask_round ((__v2df) __B, + (__v2df) __A, + (__v2df) + _mm_setzero_pd (), + __U, + __R); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rcp28_round_ss (__m128 __A, __m128 __B, int __R) +{ + return (__m128) __builtin_ia32_rcp28ss_round ((__v4sf) __B, + (__v4sf) __A, + __R); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_rcp28_round_ss (__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B, int __R) +{ + return (__m128) __builtin_ia32_rcp28ss_mask_round ((__v4sf) __B, + (__v4sf) __A, + (__v4sf) __W, + __U, + __R); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_rcp28_round_ss (__mmask8 __U, __m128 __A, __m128 __B, int __R) +{ + return (__m128) __builtin_ia32_rcp28ss_mask_round ((__v4sf) __B, + (__v4sf) __A, + (__v4sf) + _mm_setzero_ps (), + __U, + __R); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_rsqrt28_round_pd (__m512d __A, int __R) +{ __m512d __W; - return (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)__A, (__v8df)__W, - (__mmask8)-1, __R); + return (__m512d) __builtin_ia32_rsqrt28pd_mask ((__v8df) __A, + (__v8df) __W, + (__mmask8) -1, __R); } - -__funline __m512d _mm512_mask_rsqrt28_round_pd(__m512d __W, __mmask8 __U, - __m512d __A, int __R) { - return (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)__A, (__v8df)__W, - (__mmask8)__U, __R); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_rsqrt28_round_pd (__m512d __W, __mmask8 __U, __m512d __A, int __R) +{ + return (__m512d) __builtin_ia32_rsqrt28pd_mask ((__v8df) __A, + (__v8df) __W, + (__mmask8) __U, __R); } - -__funline __m512d _mm512_maskz_rsqrt28_round_pd(__mmask8 __U, __m512d __A, - int __R) { - return (__m512d)__builtin_ia32_rsqrt28pd_mask( - (__v8df)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, __R); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_rsqrt28_round_pd (__mmask8 __U, __m512d __A, int __R) +{ + return (__m512d) __builtin_ia32_rsqrt28pd_mask ((__v8df) __A, + (__v8df) _mm512_setzero_pd (), + (__mmask8) __U, __R); } - -__funline __m512 _mm512_rsqrt28_round_ps(__m512 __A, int __R) { +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_rsqrt28_round_ps (__m512 __A, int __R) +{ __m512 __W; - return (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)__A, (__v16sf)__W, - (__mmask16)-1, __R); + return (__m512) __builtin_ia32_rsqrt28ps_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) -1, __R); } - -__funline 
__m512 _mm512_mask_rsqrt28_round_ps(__m512 __W, __mmask16 __U, - __m512 __A, int __R) { - return (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)__A, (__v16sf)__W, - (__mmask16)__U, __R); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_rsqrt28_round_ps (__m512 __W, __mmask16 __U, __m512 __A, int __R) +{ + return (__m512) __builtin_ia32_rsqrt28ps_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) __U, __R); } - -__funline __m512 _mm512_maskz_rsqrt28_round_ps(__mmask16 __U, __m512 __A, - int __R) { - return (__m512)__builtin_ia32_rsqrt28ps_mask( - (__v16sf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U, __R); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_rsqrt28_round_ps (__mmask16 __U, __m512 __A, int __R) +{ + return (__m512) __builtin_ia32_rsqrt28ps_mask ((__v16sf) __A, + (__v16sf) _mm512_setzero_ps (), + (__mmask16) __U, __R); } - -__funline __m128d _mm_rsqrt28_round_sd(__m128d __A, __m128d __B, int __R) { - return (__m128d)__builtin_ia32_rsqrt28sd_round((__v2df)__B, (__v2df)__A, __R); +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rsqrt28_round_sd (__m128d __A, __m128d __B, int __R) +{ + return (__m128d) __builtin_ia32_rsqrt28sd_round ((__v2df) __B, + (__v2df) __A, + __R); } - -__funline __m128 _mm_rsqrt28_round_ss(__m128 __A, __m128 __B, int __R) { - return (__m128)__builtin_ia32_rsqrt28ss_round((__v4sf)__B, (__v4sf)__A, __R); +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_rsqrt28_round_sd (__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B, int __R) +{ + return (__m128d) __builtin_ia32_rsqrt28sd_mask_round ((__v2df) __B, + (__v2df) __A, + (__v2df) __W, + __U, + __R); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_rsqrt28_round_sd (__mmask8 __U, __m128d __A, __m128d __B, int __R) +{ + return (__m128d) __builtin_ia32_rsqrt28sd_mask_round ((__v2df) __B, + (__v2df) __A, + (__v2df) + _mm_setzero_pd (), + __U, + __R); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rsqrt28_round_ss (__m128 __A, __m128 __B, int __R) +{ + return (__m128) __builtin_ia32_rsqrt28ss_round ((__v4sf) __B, + (__v4sf) __A, + __R); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_rsqrt28_round_ss (__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B, int __R) +{ + return (__m128) __builtin_ia32_rsqrt28ss_mask_round ((__v4sf) __B, + (__v4sf) __A, + (__v4sf) __W, + __U, + __R); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_rsqrt28_round_ss (__mmask8 __U, __m128 __A, __m128 __B, int __R) +{ + return (__m128) __builtin_ia32_rsqrt28ss_mask_round ((__v4sf) __B, + (__v4sf) __A, + (__v4sf) + _mm_setzero_ps (), + __U, + __R); } - #else -#define _mm512_exp2a23_round_pd(A, C) \ - __builtin_ia32_exp2pd_mask(A, (__v8df)_mm512_setzero_pd(), -1, C) - -#define _mm512_mask_exp2a23_round_pd(W, U, A, C) \ - __builtin_ia32_exp2pd_mask(A, W, U, C) - -#define _mm512_maskz_exp2a23_round_pd(U, A, C) \ - __builtin_ia32_exp2pd_mask(A, (__v8df)_mm512_setzero_pd(), U, C) - -#define _mm512_exp2a23_round_ps(A, C) \ - __builtin_ia32_exp2ps_mask(A, (__v16sf)_mm512_setzero_ps(), -1, C) - -#define _mm512_mask_exp2a23_round_ps(W, U, A, C) \ - __builtin_ia32_exp2ps_mask(A, W, U, C) - -#define 
_mm512_maskz_exp2a23_round_ps(U, A, C) \ - __builtin_ia32_exp2ps_mask(A, (__v16sf)_mm512_setzero_ps(), U, C) - -#define _mm512_rcp28_round_pd(A, C) \ - __builtin_ia32_rcp28pd_mask(A, (__v8df)_mm512_setzero_pd(), -1, C) - -#define _mm512_mask_rcp28_round_pd(W, U, A, C) \ - __builtin_ia32_rcp28pd_mask(A, W, U, C) - -#define _mm512_maskz_rcp28_round_pd(U, A, C) \ - __builtin_ia32_rcp28pd_mask(A, (__v8df)_mm512_setzero_pd(), U, C) - -#define _mm512_rcp28_round_ps(A, C) \ - __builtin_ia32_rcp28ps_mask(A, (__v16sf)_mm512_setzero_ps(), -1, C) - -#define _mm512_mask_rcp28_round_ps(W, U, A, C) \ - __builtin_ia32_rcp28ps_mask(A, W, U, C) - -#define _mm512_maskz_rcp28_round_ps(U, A, C) \ - __builtin_ia32_rcp28ps_mask(A, (__v16sf)_mm512_setzero_ps(), U, C) - -#define _mm512_rsqrt28_round_pd(A, C) \ - __builtin_ia32_rsqrt28pd_mask(A, (__v8df)_mm512_setzero_pd(), -1, C) - -#define _mm512_mask_rsqrt28_round_pd(W, U, A, C) \ - __builtin_ia32_rsqrt28pd_mask(A, W, U, C) - -#define _mm512_maskz_rsqrt28_round_pd(U, A, C) \ - __builtin_ia32_rsqrt28pd_mask(A, (__v8df)_mm512_setzero_pd(), U, C) - -#define _mm512_rsqrt28_round_ps(A, C) \ - __builtin_ia32_rsqrt28ps_mask(A, (__v16sf)_mm512_setzero_ps(), -1, C) - -#define _mm512_mask_rsqrt28_round_ps(W, U, A, C) \ - __builtin_ia32_rsqrt28ps_mask(A, W, U, C) - -#define _mm512_maskz_rsqrt28_round_ps(U, A, C) \ - __builtin_ia32_rsqrt28ps_mask(A, (__v16sf)_mm512_setzero_ps(), U, C) - +#define _mm512_exp2a23_round_pd(A, C) __builtin_ia32_exp2pd_mask(A, (__v8df)_mm512_setzero_pd(), -1, C) +#define _mm512_mask_exp2a23_round_pd(W, U, A, C) __builtin_ia32_exp2pd_mask(A, W, U, C) +#define _mm512_maskz_exp2a23_round_pd(U, A, C) __builtin_ia32_exp2pd_mask(A, (__v8df)_mm512_setzero_pd(), U, C) +#define _mm512_exp2a23_round_ps(A, C) __builtin_ia32_exp2ps_mask(A, (__v16sf)_mm512_setzero_ps(), -1, C) +#define _mm512_mask_exp2a23_round_ps(W, U, A, C) __builtin_ia32_exp2ps_mask(A, W, U, C) +#define _mm512_maskz_exp2a23_round_ps(U, A, C) __builtin_ia32_exp2ps_mask(A, (__v16sf)_mm512_setzero_ps(), U, C) +#define _mm512_rcp28_round_pd(A, C) __builtin_ia32_rcp28pd_mask(A, (__v8df)_mm512_setzero_pd(), -1, C) +#define _mm512_mask_rcp28_round_pd(W, U, A, C) __builtin_ia32_rcp28pd_mask(A, W, U, C) +#define _mm512_maskz_rcp28_round_pd(U, A, C) __builtin_ia32_rcp28pd_mask(A, (__v8df)_mm512_setzero_pd(), U, C) +#define _mm512_rcp28_round_ps(A, C) __builtin_ia32_rcp28ps_mask(A, (__v16sf)_mm512_setzero_ps(), -1, C) +#define _mm512_mask_rcp28_round_ps(W, U, A, C) __builtin_ia32_rcp28ps_mask(A, W, U, C) +#define _mm512_maskz_rcp28_round_ps(U, A, C) __builtin_ia32_rcp28ps_mask(A, (__v16sf)_mm512_setzero_ps(), U, C) +#define _mm512_rsqrt28_round_pd(A, C) __builtin_ia32_rsqrt28pd_mask(A, (__v8df)_mm512_setzero_pd(), -1, C) +#define _mm512_mask_rsqrt28_round_pd(W, U, A, C) __builtin_ia32_rsqrt28pd_mask(A, W, U, C) +#define _mm512_maskz_rsqrt28_round_pd(U, A, C) __builtin_ia32_rsqrt28pd_mask(A, (__v8df)_mm512_setzero_pd(), U, C) +#define _mm512_rsqrt28_round_ps(A, C) __builtin_ia32_rsqrt28ps_mask(A, (__v16sf)_mm512_setzero_ps(), -1, C) +#define _mm512_mask_rsqrt28_round_ps(W, U, A, C) __builtin_ia32_rsqrt28ps_mask(A, W, U, C) +#define _mm512_maskz_rsqrt28_round_ps(U, A, C) __builtin_ia32_rsqrt28ps_mask(A, (__v16sf)_mm512_setzero_ps(), U, C) #define _mm_rcp28_round_sd(A, B, R) __builtin_ia32_rcp28sd_round(A, B, R) - +#define _mm_mask_rcp28_round_sd(W, U, A, B, R) __builtin_ia32_rcp28sd_mask_round ((B), (A), (W), (U), (R)) +#define _mm_maskz_rcp28_round_sd(U, A, B, R) __builtin_ia32_rcp28sd_mask_round
((B), (A), (__v2df) _mm_setzero_pd (), (U), (R)) #define _mm_rcp28_round_ss(A, B, R) __builtin_ia32_rcp28ss_round(A, B, R) - +#define _mm_mask_rcp28_round_ss(W, U, A, B, R) __builtin_ia32_rcp28ss_mask_round ((B), (A), (W), (U), (R)) +#define _mm_maskz_rcp28_round_ss(U, A, B, R) __builtin_ia32_rcp28ss_mask_round ((B), (A), (__v4sf) _mm_setzero_ps (), (U), (R)) #define _mm_rsqrt28_round_sd(A, B, R) __builtin_ia32_rsqrt28sd_round(A, B, R) - +#define _mm_mask_rsqrt28_round_sd(W, U, A, B, R) __builtin_ia32_rsqrt28sd_mask_round ((B), (A), (W), (U), (R)) +#define _mm_maskz_rsqrt28_round_sd(U, A, B, R) __builtin_ia32_rsqrt28sd_mask_round ((B), (A), (__v2df) _mm_setzero_pd (), (U), (R)) #define _mm_rsqrt28_round_ss(A, B, R) __builtin_ia32_rsqrt28ss_round(A, B, R) - +#define _mm_mask_rsqrt28_round_ss(W, U, A, B, R) __builtin_ia32_rsqrt28ss_mask_round ((B), (A), (W), (U), (R)) +#define _mm_maskz_rsqrt28_round_ss(U, A, B, R) __builtin_ia32_rsqrt28ss_mask_round ((B), (A), (__v4sf) _mm_setzero_ps (), (U), (R)) #endif - -#define _mm512_exp2a23_pd(A) \ - _mm512_exp2a23_round_pd(A, _MM_FROUND_CUR_DIRECTION) - -#define _mm512_mask_exp2a23_pd(W, U, A) \ - _mm512_mask_exp2a23_round_pd(W, U, A, _MM_FROUND_CUR_DIRECTION) - -#define _mm512_maskz_exp2a23_pd(U, A) \ - _mm512_maskz_exp2a23_round_pd(U, A, _MM_FROUND_CUR_DIRECTION) - -#define _mm512_exp2a23_ps(A) \ - _mm512_exp2a23_round_ps(A, _MM_FROUND_CUR_DIRECTION) - -#define _mm512_mask_exp2a23_ps(W, U, A) \ - _mm512_mask_exp2a23_round_ps(W, U, A, _MM_FROUND_CUR_DIRECTION) - -#define _mm512_maskz_exp2a23_ps(U, A) \ - _mm512_maskz_exp2a23_round_ps(U, A, _MM_FROUND_CUR_DIRECTION) - +#define _mm_mask_rcp28_sd(W, U, A, B) _mm_mask_rcp28_round_sd ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION) +#define _mm_maskz_rcp28_sd(U, A, B) _mm_maskz_rcp28_round_sd ((U), (A), (B), _MM_FROUND_CUR_DIRECTION) +#define _mm_mask_rcp28_ss(W, U, A, B) _mm_mask_rcp28_round_ss ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION) +#define _mm_maskz_rcp28_ss(U, A, B) _mm_maskz_rcp28_round_ss ((U), (A), (B), _MM_FROUND_CUR_DIRECTION) +#define _mm_mask_rsqrt28_sd(W, U, A, B) _mm_mask_rsqrt28_round_sd ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION) +#define _mm_maskz_rsqrt28_sd(U, A, B) _mm_maskz_rsqrt28_round_sd ((U), (A), (B), _MM_FROUND_CUR_DIRECTION) +#define _mm_mask_rsqrt28_ss(W, U, A, B) _mm_mask_rsqrt28_round_ss ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION) +#define _mm_maskz_rsqrt28_ss(U, A, B) _mm_maskz_rsqrt28_round_ss ((U), (A), (B), _MM_FROUND_CUR_DIRECTION) +#define _mm512_exp2a23_pd(A) _mm512_exp2a23_round_pd(A, _MM_FROUND_CUR_DIRECTION) +#define _mm512_mask_exp2a23_pd(W, U, A) _mm512_mask_exp2a23_round_pd(W, U, A, _MM_FROUND_CUR_DIRECTION) +#define _mm512_maskz_exp2a23_pd(U, A) _mm512_maskz_exp2a23_round_pd(U, A, _MM_FROUND_CUR_DIRECTION) +#define _mm512_exp2a23_ps(A) _mm512_exp2a23_round_ps(A, _MM_FROUND_CUR_DIRECTION) +#define _mm512_mask_exp2a23_ps(W, U, A) _mm512_mask_exp2a23_round_ps(W, U, A, _MM_FROUND_CUR_DIRECTION) +#define _mm512_maskz_exp2a23_ps(U, A) _mm512_maskz_exp2a23_round_ps(U, A, _MM_FROUND_CUR_DIRECTION) #define _mm512_rcp28_pd(A) _mm512_rcp28_round_pd(A, _MM_FROUND_CUR_DIRECTION) - -#define _mm512_mask_rcp28_pd(W, U, A) \ - _mm512_mask_rcp28_round_pd(W, U, A, _MM_FROUND_CUR_DIRECTION) - -#define _mm512_maskz_rcp28_pd(U, A) \ - _mm512_maskz_rcp28_round_pd(U, A,
_MM_FROUND_CUR_DIRECTION) #define _mm512_rcp28_ps(A) _mm512_rcp28_round_ps(A, _MM_FROUND_CUR_DIRECTION) - -#define _mm512_mask_rcp28_ps(W, U, A) \ - _mm512_mask_rcp28_round_ps(W, U, A, _MM_FROUND_CUR_DIRECTION) - -#define _mm512_maskz_rcp28_ps(U, A) \ - _mm512_maskz_rcp28_round_ps(U, A, _MM_FROUND_CUR_DIRECTION) - -#define _mm512_rsqrt28_pd(A) \ - _mm512_rsqrt28_round_pd(A, _MM_FROUND_CUR_DIRECTION) - -#define _mm512_mask_rsqrt28_pd(W, U, A) \ - _mm512_mask_rsqrt28_round_pd(W, U, A, _MM_FROUND_CUR_DIRECTION) - -#define _mm512_maskz_rsqrt28_pd(U, A) \ - _mm512_maskz_rsqrt28_round_pd(U, A, _MM_FROUND_CUR_DIRECTION) - -#define _mm512_rsqrt28_ps(A) \ - _mm512_rsqrt28_round_ps(A, _MM_FROUND_CUR_DIRECTION) - -#define _mm512_mask_rsqrt28_ps(W, U, A) \ - _mm512_mask_rsqrt28_round_ps(W, U, A, _MM_FROUND_CUR_DIRECTION) - -#define _mm512_maskz_rsqrt28_ps(U, A) \ - _mm512_maskz_rsqrt28_round_ps(U, A, _MM_FROUND_CUR_DIRECTION) - -#define _mm_rcp28_sd(A, B) \ - __builtin_ia32_rcp28sd_round(B, A, _MM_FROUND_CUR_DIRECTION) - -#define _mm_rcp28_ss(A, B) \ - __builtin_ia32_rcp28ss_round(B, A, _MM_FROUND_CUR_DIRECTION) - -#define _mm_rsqrt28_sd(A, B) \ - __builtin_ia32_rsqrt28sd_round(B, A, _MM_FROUND_CUR_DIRECTION) - -#define _mm_rsqrt28_ss(A, B) \ - __builtin_ia32_rsqrt28ss_round(B, A, _MM_FROUND_CUR_DIRECTION) - +#define _mm512_mask_rcp28_ps(W, U, A) _mm512_mask_rcp28_round_ps(W, U, A, _MM_FROUND_CUR_DIRECTION) +#define _mm512_maskz_rcp28_ps(U, A) _mm512_maskz_rcp28_round_ps(U, A, _MM_FROUND_CUR_DIRECTION) +#define _mm512_rsqrt28_pd(A) _mm512_rsqrt28_round_pd(A, _MM_FROUND_CUR_DIRECTION) +#define _mm512_mask_rsqrt28_pd(W, U, A) _mm512_mask_rsqrt28_round_pd(W, U, A, _MM_FROUND_CUR_DIRECTION) +#define _mm512_maskz_rsqrt28_pd(U, A) _mm512_maskz_rsqrt28_round_pd(U, A, _MM_FROUND_CUR_DIRECTION) +#define _mm512_rsqrt28_ps(A) _mm512_rsqrt28_round_ps(A, _MM_FROUND_CUR_DIRECTION) +#define _mm512_mask_rsqrt28_ps(W, U, A) _mm512_mask_rsqrt28_round_ps(W, U, A, _MM_FROUND_CUR_DIRECTION) +#define _mm512_maskz_rsqrt28_ps(U, A) _mm512_maskz_rsqrt28_round_ps(U, A, _MM_FROUND_CUR_DIRECTION) +#define _mm_rcp28_sd(A, B) __builtin_ia32_rcp28sd_round(B, A, _MM_FROUND_CUR_DIRECTION) +#define _mm_rcp28_ss(A, B) __builtin_ia32_rcp28ss_round(B, A, _MM_FROUND_CUR_DIRECTION) +#define _mm_rsqrt28_sd(A, B) __builtin_ia32_rsqrt28sd_round(B, A, _MM_FROUND_CUR_DIRECTION) +#define _mm_rsqrt28_ss(A, B) __builtin_ia32_rsqrt28ss_round(B, A, _MM_FROUND_CUR_DIRECTION) #ifdef __DISABLE_AVX512ER__ #undef __DISABLE_AVX512ER__ #pragma GCC pop_options -#endif /* __DISABLE_AVX512ER__ */ - -#endif /* _AVX512ERINTRIN_H_INCLUDED */ +#endif +#endif +#endif diff --git a/third_party/intel/avx512fintrin.internal.h b/third_party/intel/avx512fintrin.internal.h index d959242ec..c3005f8be 100644 --- a/third_party/intel/avx512fintrin.internal.h +++ b/third_party/intel/avx512fintrin.internal.h @@ -1,10326 +1,13224 @@ +/* clang-format off */ +#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0) #ifndef _IMMINTRIN_H_INCLUDED #error "Never use directly; include instead." 
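/* Editorial annotation, not part of the vendored GCC header: nearly every
   intrinsic in this file follows one naming convention.  _mm512_OP(...)
   computes all lanes; _mm512_mask_OP(W, U, ...) writes lane i only where
   bit i of the write-mask U is set, keeping lane i of W otherwise; and
   _mm512_maskz_OP(U, ...) zeroes the unselected lanes instead.  A minimal
   sketch, assuming an AVX-512F target (values are arbitrary):

     __m512i a = _mm512_set1_epi32(7);
     __m512i b = _mm512_set1_epi32(1);
     __m512i w = _mm512_setzero_si512();
     __m512i m = _mm512_mask_add_epi32(w, 0x00FF, a, b);  // low 8 lanes = 8, rest kept from w (0)
     __m512i z = _mm512_maskz_add_epi32(0x00FF, a, b);    // low 8 lanes = 8, rest forced to 0
 */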
#endif - #ifndef _AVX512FINTRIN_H_INCLUDED #define _AVX512FINTRIN_H_INCLUDED - #ifndef __AVX512F__ #pragma GCC push_options #pragma GCC target("avx512f") #define __DISABLE_AVX512F__ -#endif /* __AVX512F__ */ - -typedef double __v8df __attribute__((__vector_size__(64))); -typedef float __v16sf __attribute__((__vector_size__(64))); -typedef long long __v8di __attribute__((__vector_size__(64))); -typedef unsigned long long __v8du __attribute__((__vector_size__(64))); -typedef int __v16si __attribute__((__vector_size__(64))); -typedef unsigned int __v16su __attribute__((__vector_size__(64))); -typedef short __v32hi __attribute__((__vector_size__(64))); -typedef unsigned short __v32hu __attribute__((__vector_size__(64))); -typedef char __v64qi __attribute__((__vector_size__(64))); -typedef unsigned char __v64qu __attribute__((__vector_size__(64))); - -typedef float __m512 __attribute__((__vector_size__(64), __may_alias__)); -typedef long long __m512i __attribute__((__vector_size__(64), __may_alias__)); -typedef double __m512d __attribute__((__vector_size__(64), __may_alias__)); - -typedef float __m512_u - __attribute__((__vector_size__(64), __may_alias__, __aligned__(1))); -typedef long long __m512i_u - __attribute__((__vector_size__(64), __may_alias__, __aligned__(1))); -typedef double __m512d_u - __attribute__((__vector_size__(64), __may_alias__, __aligned__(1))); - +#endif +typedef double __v8df __attribute__ ((__vector_size__ (64))); +typedef float __v16sf __attribute__ ((__vector_size__ (64))); +typedef long long __v8di __attribute__ ((__vector_size__ (64))); +typedef unsigned long long __v8du __attribute__ ((__vector_size__ (64))); +typedef int __v16si __attribute__ ((__vector_size__ (64))); +typedef unsigned int __v16su __attribute__ ((__vector_size__ (64))); +typedef short __v32hi __attribute__ ((__vector_size__ (64))); +typedef unsigned short __v32hu __attribute__ ((__vector_size__ (64))); +typedef char __v64qi __attribute__ ((__vector_size__ (64))); +typedef unsigned char __v64qu __attribute__ ((__vector_size__ (64))); +typedef float __m512 __attribute__ ((__vector_size__ (64), __may_alias__)); +typedef long long __m512i __attribute__ ((__vector_size__ (64), __may_alias__)); +typedef double __m512d __attribute__ ((__vector_size__ (64), __may_alias__)); +typedef float __m512_u __attribute__ ((__vector_size__ (64), __may_alias__, __aligned__ (1))); +typedef long long __m512i_u __attribute__ ((__vector_size__ (64), __may_alias__, __aligned__ (1))); +typedef double __m512d_u __attribute__ ((__vector_size__ (64), __may_alias__, __aligned__ (1))); typedef unsigned char __mmask8; typedef unsigned short __mmask16; - -__funline __mmask16 _mm512_int2mask(int __M) { - return (__mmask16)__M; +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_int2mask (int __M) +{ + return (__mmask16) __M; } - -__funline int _mm512_mask2int(__mmask16 __M) { - return (int)__M; +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask2int (__mmask16 __M) +{ + return (int) __M; } - -__funline __m512i _mm512_set_epi64(long long __A, long long __B, long long __C, - long long __D, long long __E, long long __F, - long long __G, long long __H) { - return __extension__(__m512i)(__v8di){__H, __G, __F, __E, __D, __C, __B, __A}; +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_set_epi64 (long long __A, long long __B, long long __C, + long long __D, long long __E, long long __F, 
+ long long __G, long long __H) +{ + return __extension__ (__m512i) (__v8di) + { __H, __G, __F, __E, __D, __C, __B, __A }; } - -__funline __m512i _mm512_set_epi32(int __A, int __B, int __C, int __D, int __E, - int __F, int __G, int __H, int __I, int __J, - int __K, int __L, int __M, int __N, int __O, - int __P) { - return __extension__(__m512i)(__v16si){__P, __O, __N, __M, __L, __K, - __J, __I, __H, __G, __F, __E, - __D, __C, __B, __A}; +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_set_epi32 (int __A, int __B, int __C, int __D, + int __E, int __F, int __G, int __H, + int __I, int __J, int __K, int __L, + int __M, int __N, int __O, int __P) +{ + return __extension__ (__m512i)(__v16si) + { __P, __O, __N, __M, __L, __K, __J, __I, + __H, __G, __F, __E, __D, __C, __B, __A }; } - -__funline __m512i _mm512_set_epi16( - short __q31, short __q30, short __q29, short __q28, short __q27, - short __q26, short __q25, short __q24, short __q23, short __q22, - short __q21, short __q20, short __q19, short __q18, short __q17, - short __q16, short __q15, short __q14, short __q13, short __q12, - short __q11, short __q10, short __q09, short __q08, short __q07, - short __q06, short __q05, short __q04, short __q03, short __q02, - short __q01, short __q00) { - return __extension__(__m512i)(__v32hi){ - __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07, - __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15, - __q16, __q17, __q18, __q19, __q20, __q21, __q22, __q23, - __q24, __q25, __q26, __q27, __q28, __q29, __q30, __q31}; +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_set_epi16 (short __q31, short __q30, short __q29, short __q28, + short __q27, short __q26, short __q25, short __q24, + short __q23, short __q22, short __q21, short __q20, + short __q19, short __q18, short __q17, short __q16, + short __q15, short __q14, short __q13, short __q12, + short __q11, short __q10, short __q09, short __q08, + short __q07, short __q06, short __q05, short __q04, + short __q03, short __q02, short __q01, short __q00) +{ + return __extension__ (__m512i)(__v32hi){ + __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07, + __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15, + __q16, __q17, __q18, __q19, __q20, __q21, __q22, __q23, + __q24, __q25, __q26, __q27, __q28, __q29, __q30, __q31 + }; } - -__funline __m512i _mm512_set_epi8( - char __q63, char __q62, char __q61, char __q60, char __q59, char __q58, - char __q57, char __q56, char __q55, char __q54, char __q53, char __q52, - char __q51, char __q50, char __q49, char __q48, char __q47, char __q46, - char __q45, char __q44, char __q43, char __q42, char __q41, char __q40, - char __q39, char __q38, char __q37, char __q36, char __q35, char __q34, - char __q33, char __q32, char __q31, char __q30, char __q29, char __q28, - char __q27, char __q26, char __q25, char __q24, char __q23, char __q22, - char __q21, char __q20, char __q19, char __q18, char __q17, char __q16, - char __q15, char __q14, char __q13, char __q12, char __q11, char __q10, - char __q09, char __q08, char __q07, char __q06, char __q05, char __q04, - char __q03, char __q02, char __q01, char __q00) { - return __extension__(__m512i)(__v64qi){ - __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07, __q08, __q09, - __q10, __q11, __q12, __q13, __q14, __q15, __q16, __q17, __q18, __q19, - __q20, __q21, __q22, __q23, __q24, __q25, __q26, __q27, __q28, __q29, - __q30, __q31, __q32, __q33, __q34, __q35, __q36, 
__q37, __q38, __q39, - __q40, __q41, __q42, __q43, __q44, __q45, __q46, __q47, __q48, __q49, - __q50, __q51, __q52, __q53, __q54, __q55, __q56, __q57, __q58, __q59, - __q60, __q61, __q62, __q63}; +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_set_epi8 (char __q63, char __q62, char __q61, char __q60, + char __q59, char __q58, char __q57, char __q56, + char __q55, char __q54, char __q53, char __q52, + char __q51, char __q50, char __q49, char __q48, + char __q47, char __q46, char __q45, char __q44, + char __q43, char __q42, char __q41, char __q40, + char __q39, char __q38, char __q37, char __q36, + char __q35, char __q34, char __q33, char __q32, + char __q31, char __q30, char __q29, char __q28, + char __q27, char __q26, char __q25, char __q24, + char __q23, char __q22, char __q21, char __q20, + char __q19, char __q18, char __q17, char __q16, + char __q15, char __q14, char __q13, char __q12, + char __q11, char __q10, char __q09, char __q08, + char __q07, char __q06, char __q05, char __q04, + char __q03, char __q02, char __q01, char __q00) +{ + return __extension__ (__m512i)(__v64qi){ + __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07, + __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15, + __q16, __q17, __q18, __q19, __q20, __q21, __q22, __q23, + __q24, __q25, __q26, __q27, __q28, __q29, __q30, __q31, + __q32, __q33, __q34, __q35, __q36, __q37, __q38, __q39, + __q40, __q41, __q42, __q43, __q44, __q45, __q46, __q47, + __q48, __q49, __q50, __q51, __q52, __q53, __q54, __q55, + __q56, __q57, __q58, __q59, __q60, __q61, __q62, __q63 + }; } - -__funline __m512d _mm512_set_pd(double __A, double __B, double __C, double __D, - double __E, double __F, double __G, double __H) { - return __extension__(__m512d){__H, __G, __F, __E, __D, __C, __B, __A}; +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_set_pd (double __A, double __B, double __C, double __D, + double __E, double __F, double __G, double __H) +{ + return __extension__ (__m512d) + { __H, __G, __F, __E, __D, __C, __B, __A }; } - -__funline __m512 _mm512_set_ps(float __A, float __B, float __C, float __D, - float __E, float __F, float __G, float __H, - float __I, float __J, float __K, float __L, - float __M, float __N, float __O, float __P) { - return __extension__(__m512){__P, __O, __N, __M, __L, __K, __J, __I, - __H, __G, __F, __E, __D, __C, __B, __A}; +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_set_ps (float __A, float __B, float __C, float __D, + float __E, float __F, float __G, float __H, + float __I, float __J, float __K, float __L, + float __M, float __N, float __O, float __P) +{ + return __extension__ (__m512) + { __P, __O, __N, __M, __L, __K, __J, __I, + __H, __G, __F, __E, __D, __C, __B, __A }; } - -#define _mm512_setr_epi64(e0, e1, e2, e3, e4, e5, e6, e7) \ - _mm512_set_epi64(e7, e6, e5, e4, e3, e2, e1, e0) - -#define _mm512_setr_epi32(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, \ - e12, e13, e14, e15) \ - _mm512_set_epi32(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, \ - e2, e1, e0) - -#define _mm512_setr_pd(e0, e1, e2, e3, e4, e5, e6, e7) \ - _mm512_set_pd(e7, e6, e5, e4, e3, e2, e1, e0) - -#define _mm512_setr_ps(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, \ - e13, e14, e15) \ - _mm512_set_ps(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, \ - e1, e0) - -__funline __m512 _mm512_undefined_ps(void) { +#define 
_mm512_setr_epi64(e0,e1,e2,e3,e4,e5,e6,e7) _mm512_set_epi64(e7,e6,e5,e4,e3,e2,e1,e0) +#define _mm512_setr_epi32(e0,e1,e2,e3,e4,e5,e6,e7, e8,e9,e10,e11,e12,e13,e14,e15) _mm512_set_epi32(e15,e14,e13,e12,e11,e10,e9,e8,e7,e6,e5,e4,e3,e2,e1,e0) +#define _mm512_setr_pd(e0,e1,e2,e3,e4,e5,e6,e7) _mm512_set_pd(e7,e6,e5,e4,e3,e2,e1,e0) +#define _mm512_setr_ps(e0,e1,e2,e3,e4,e5,e6,e7,e8,e9,e10,e11,e12,e13,e14,e15) _mm512_set_ps(e15,e14,e13,e12,e11,e10,e9,e8,e7,e6,e5,e4,e3,e2,e1,e0) +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_undefined_ps (void) +{ __m512 __Y = __Y; return __Y; } - #define _mm512_undefined _mm512_undefined_ps - -__funline __m512d _mm512_undefined_pd(void) { +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_undefined_pd (void) +{ __m512d __Y = __Y; return __Y; } - -__funline __m512i _mm512_undefined_epi32(void) { +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_undefined_epi32 (void) +{ __m512i __Y = __Y; return __Y; } - #define _mm512_undefined_si512 _mm512_undefined_epi32 - -__funline __m512i _mm512_set1_epi8(char __A) { - return __extension__(__m512i)(__v64qi){ - __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, - __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, - __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, - __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, - __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A}; -} - -__funline __m512i _mm512_set1_epi16(short __A) { - return __extension__(__m512i)(__v32hi){ - __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, - __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, - __A, __A, __A, __A, __A, __A, __A, __A, __A, __A}; -} - -__funline __m512d _mm512_set1_pd(double __A) { - return (__m512d)__builtin_ia32_broadcastsd512( - __extension__(__v2df){ - __A, - }, - (__v8df)_mm512_undefined_pd(), (__mmask8)-1); -} - -__funline __m512 _mm512_set1_ps(float __A) { - return (__m512)__builtin_ia32_broadcastss512( - __extension__(__v4sf){ - __A, - }, - (__v16sf)_mm512_undefined_ps(), (__mmask16)-1); -} - -__funline __m512i _mm512_set4_epi32(int __A, int __B, int __C, int __D) { - return __extension__(__m512i)(__v16si){__D, __C, __B, __A, __D, __C, - __B, __A, __D, __C, __B, __A, - __D, __C, __B, __A}; -} - -__funline __m512i _mm512_set4_epi64(long long __A, long long __B, long long __C, - long long __D) { - return __extension__(__m512i)(__v8di){__D, __C, __B, __A, __D, __C, __B, __A}; -} - -__funline __m512d _mm512_set4_pd(double __A, double __B, double __C, double __D) { - return __extension__(__m512d){__D, __C, __B, __A, __D, __C, __B, __A}; -} - -__funline __m512 _mm512_set4_ps(float __A, float __B, float __C, float __D) { - return __extension__(__m512){__D, __C, __B, __A, __D, __C, __B, __A, - __D, __C, __B, __A, __D, __C, __B, __A}; -} - -#define _mm512_setr4_epi64(e0, e1, e2, e3) _mm512_set4_epi64(e3, e2, e1, e0) - -#define _mm512_setr4_epi32(e0, e1, e2, e3) _mm512_set4_epi32(e3, e2, e1, e0) - -#define _mm512_setr4_pd(e0, e1, e2, e3) _mm512_set4_pd(e3, e2, e1, e0) - -#define _mm512_setr4_ps(e0, e1, e2, e3) _mm512_set4_ps(e3, e2, e1, e0) - -__funline __m512 _mm512_setzero_ps(void) { - return __extension__(__m512){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, - 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; -} - -__funline __m512 _mm512_setzero(void) { - return _mm512_setzero_ps(); -} - -__funline __m512d 
_mm512_setzero_pd(void) { - return __extension__(__m512d){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; -} - -__funline __m512i _mm512_setzero_epi32(void) { - return __extension__(__m512i)(__v8di){0, 0, 0, 0, 0, 0, 0, 0}; -} - -__funline __m512i _mm512_setzero_si512(void) { - return __extension__(__m512i)(__v8di){0, 0, 0, 0, 0, 0, 0, 0}; -} - -__funline __m512d _mm512_mask_mov_pd(__m512d __W, __mmask8 __U, __m512d __A) { - return (__m512d)__builtin_ia32_movapd512_mask((__v8df)__A, (__v8df)__W, - (__mmask8)__U); -} - -__funline __m512d _mm512_maskz_mov_pd(__mmask8 __U, __m512d __A) { - return (__m512d)__builtin_ia32_movapd512_mask( - (__v8df)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U); -} - -__funline __m512 _mm512_mask_mov_ps(__m512 __W, __mmask16 __U, __m512 __A) { - return (__m512)__builtin_ia32_movaps512_mask((__v16sf)__A, (__v16sf)__W, - (__mmask16)__U); -} - -__funline __m512 _mm512_maskz_mov_ps(__mmask16 __U, __m512 __A) { - return (__m512)__builtin_ia32_movaps512_mask( - (__v16sf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U); -} - -__funline __m512d _mm512_load_pd(void const *__P) { - return *(__m512d *)__P; -} - -__funline __m512d _mm512_mask_load_pd(__m512d __W, __mmask8 __U, - void const *__P) { - return (__m512d)__builtin_ia32_loadapd512_mask((const __v8df *)__P, - (__v8df)__W, (__mmask8)__U); -} - -__funline __m512d _mm512_maskz_load_pd(__mmask8 __U, void const *__P) { - return (__m512d)__builtin_ia32_loadapd512_mask( - (const __v8df *)__P, (__v8df)_mm512_setzero_pd(), (__mmask8)__U); -} - -__funline void _mm512_store_pd(void *__P, __m512d __A) { - *(__m512d *)__P = __A; -} - -__funline void _mm512_mask_store_pd(void *__P, __mmask8 __U, __m512d __A) { - __builtin_ia32_storeapd512_mask((__v8df *)__P, (__v8df)__A, (__mmask8)__U); -} - -__funline __m512 _mm512_load_ps(void const *__P) { - return *(__m512 *)__P; -} - -__funline __m512 _mm512_mask_load_ps(__m512 __W, __mmask16 __U, void const *__P) { - return (__m512)__builtin_ia32_loadaps512_mask((const __v16sf *)__P, - (__v16sf)__W, (__mmask16)__U); -} - -__funline __m512 _mm512_maskz_load_ps(__mmask16 __U, void const *__P) { - return (__m512)__builtin_ia32_loadaps512_mask( - (const __v16sf *)__P, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U); -} - -__funline void _mm512_store_ps(void *__P, __m512 __A) { - *(__m512 *)__P = __A; -} - -__funline void _mm512_mask_store_ps(void *__P, __mmask16 __U, __m512 __A) { - __builtin_ia32_storeaps512_mask((__v16sf *)__P, (__v16sf)__A, (__mmask16)__U); -} - -__funline __m512i _mm512_mask_mov_epi64(__m512i __W, __mmask8 __U, __m512i __A) { - return (__m512i)__builtin_ia32_movdqa64_512_mask((__v8di)__A, (__v8di)__W, - (__mmask8)__U); -} - -__funline __m512i _mm512_maskz_mov_epi64(__mmask8 __U, __m512i __A) { - return (__m512i)__builtin_ia32_movdqa64_512_mask( - (__v8di)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); -} - -__funline __m512i _mm512_load_epi64(void const *__P) { - return *(__m512i *)__P; -} - -__funline __m512i _mm512_mask_load_epi64(__m512i __W, __mmask8 __U, - void const *__P) { - return (__m512i)__builtin_ia32_movdqa64load512_mask( - (const __v8di *)__P, (__v8di)__W, (__mmask8)__U); -} - -__funline __m512i _mm512_maskz_load_epi64(__mmask8 __U, void const *__P) { - return (__m512i)__builtin_ia32_movdqa64load512_mask( - (const __v8di *)__P, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); -} - -__funline void _mm512_store_epi64(void *__P, __m512i __A) { - *(__m512i *)__P = __A; -} - -__funline void _mm512_mask_store_epi64(void *__P, __mmask8 __U, __m512i __A) { - 
__builtin_ia32_movdqa64store512_mask((__v8di *)__P, (__v8di)__A, - (__mmask8)__U); -} - -__funline __m512i _mm512_mask_mov_epi32(__m512i __W, __mmask16 __U, __m512i __A) { - return (__m512i)__builtin_ia32_movdqa32_512_mask((__v16si)__A, (__v16si)__W, - (__mmask16)__U); -} - -__funline __m512i _mm512_maskz_mov_epi32(__mmask16 __U, __m512i __A) { - return (__m512i)__builtin_ia32_movdqa32_512_mask( - (__v16si)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); -} - -__funline __m512i _mm512_load_si512(void const *__P) { - return *(__m512i *)__P; -} - -__funline __m512i _mm512_load_epi32(void const *__P) { - return *(__m512i *)__P; -} - -__funline __m512i _mm512_mask_load_epi32(__m512i __W, __mmask16 __U, - void const *__P) { - return (__m512i)__builtin_ia32_movdqa32load512_mask( - (const __v16si *)__P, (__v16si)__W, (__mmask16)__U); -} - -__funline __m512i _mm512_maskz_load_epi32(__mmask16 __U, void const *__P) { - return (__m512i)__builtin_ia32_movdqa32load512_mask( - (const __v16si *)__P, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); -} - -__funline void _mm512_store_si512(void *__P, __m512i __A) { - *(__m512i *)__P = __A; -} - -__funline void _mm512_store_epi32(void *__P, __m512i __A) { - *(__m512i *)__P = __A; -} - -__funline void _mm512_mask_store_epi32(void *__P, __mmask16 __U, __m512i __A) { - __builtin_ia32_movdqa32store512_mask((__v16si *)__P, (__v16si)__A, - (__mmask16)__U); -} - -__funline __m512i _mm512_mullo_epi32(__m512i __A, __m512i __B) { - return (__m512i)((__v16su)__A * (__v16su)__B); -} - -__funline __m512i _mm512_maskz_mullo_epi32(__mmask16 __M, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_pmulld512_mask( - (__v16si)__A, (__v16si)__B, (__v16si)_mm512_setzero_si512(), __M); -} - -__funline __m512i _mm512_mask_mullo_epi32(__m512i __W, __mmask16 __M, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_pmulld512_mask((__v16si)__A, (__v16si)__B, - (__v16si)__W, __M); -} - -__funline __m512i _mm512_mullox_epi64(__m512i __A, __m512i __B) { - return (__m512i)((__v8du)__A * (__v8du)__B); -} - -__funline __m512i _mm512_mask_mullox_epi64(__m512i __W, __mmask8 __M, __m512i __A, - __m512i __B) { - return _mm512_mask_mov_epi64(__W, __M, _mm512_mullox_epi64(__A, __B)); -} - -__funline __m512i _mm512_sllv_epi32(__m512i __X, __m512i __Y) { - return (__m512i)__builtin_ia32_psllv16si_mask( - (__v16si)__X, (__v16si)__Y, (__v16si)_mm512_undefined_epi32(), - (__mmask16)-1); -} - -__funline __m512i _mm512_mask_sllv_epi32(__m512i __W, __mmask16 __U, __m512i __X, - __m512i __Y) { - return (__m512i)__builtin_ia32_psllv16si_mask((__v16si)__X, (__v16si)__Y, - (__v16si)__W, (__mmask16)__U); -} - -__funline __m512i _mm512_maskz_sllv_epi32(__mmask16 __U, __m512i __X, - __m512i __Y) { - return (__m512i)__builtin_ia32_psllv16si_mask((__v16si)__X, (__v16si)__Y, - (__v16si)_mm512_setzero_si512(), - (__mmask16)__U); -} - -__funline __m512i _mm512_srav_epi32(__m512i __X, __m512i __Y) { - return (__m512i)__builtin_ia32_psrav16si_mask( - (__v16si)__X, (__v16si)__Y, (__v16si)_mm512_undefined_epi32(), - (__mmask16)-1); -} - -__funline __m512i _mm512_mask_srav_epi32(__m512i __W, __mmask16 __U, __m512i __X, - __m512i __Y) { - return (__m512i)__builtin_ia32_psrav16si_mask((__v16si)__X, (__v16si)__Y, - (__v16si)__W, (__mmask16)__U); -} - -__funline __m512i _mm512_maskz_srav_epi32(__mmask16 __U, __m512i __X, - __m512i __Y) { - return (__m512i)__builtin_ia32_psrav16si_mask((__v16si)__X, (__v16si)__Y, - (__v16si)_mm512_setzero_si512(), - (__mmask16)__U); -} - -__funline __m512i 
_mm512_srlv_epi32(__m512i __X, __m512i __Y) { - return (__m512i)__builtin_ia32_psrlv16si_mask( - (__v16si)__X, (__v16si)__Y, (__v16si)_mm512_undefined_epi32(), - (__mmask16)-1); -} - -__funline __m512i _mm512_mask_srlv_epi32(__m512i __W, __mmask16 __U, __m512i __X, - __m512i __Y) { - return (__m512i)__builtin_ia32_psrlv16si_mask((__v16si)__X, (__v16si)__Y, - (__v16si)__W, (__mmask16)__U); -} - -__funline __m512i _mm512_maskz_srlv_epi32(__mmask16 __U, __m512i __X, - __m512i __Y) { - return (__m512i)__builtin_ia32_psrlv16si_mask((__v16si)__X, (__v16si)__Y, - (__v16si)_mm512_setzero_si512(), - (__mmask16)__U); -} - -__funline __m512i _mm512_add_epi64(__m512i __A, __m512i __B) { - return (__m512i)((__v8du)__A + (__v8du)__B); -} - -__funline __m512i _mm512_mask_add_epi64(__m512i __W, __mmask8 __U, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_paddq512_mask((__v8di)__A, (__v8di)__B, - (__v8di)__W, (__mmask8)__U); -} - -__funline __m512i _mm512_maskz_add_epi64(__mmask8 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_paddq512_mask( - (__v8di)__A, (__v8di)__B, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); -} - -__funline __m512i _mm512_sub_epi64(__m512i __A, __m512i __B) { - return (__m512i)((__v8du)__A - (__v8du)__B); -} - -__funline __m512i _mm512_mask_sub_epi64(__m512i __W, __mmask8 __U, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_psubq512_mask((__v8di)__A, (__v8di)__B, - (__v8di)__W, (__mmask8)__U); -} - -__funline __m512i _mm512_maskz_sub_epi64(__mmask8 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_psubq512_mask( - (__v8di)__A, (__v8di)__B, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); -} - -__funline __m512i _mm512_sllv_epi64(__m512i __X, __m512i __Y) { - return (__m512i)__builtin_ia32_psllv8di_mask( - (__v8di)__X, (__v8di)__Y, (__v8di)_mm512_undefined_pd(), (__mmask8)-1); -} - -__funline __m512i _mm512_mask_sllv_epi64(__m512i __W, __mmask8 __U, __m512i __X, - __m512i __Y) { - return (__m512i)__builtin_ia32_psllv8di_mask((__v8di)__X, (__v8di)__Y, - (__v8di)__W, (__mmask8)__U); -} - -__funline __m512i _mm512_maskz_sllv_epi64(__mmask8 __U, __m512i __X, - __m512i __Y) { - return (__m512i)__builtin_ia32_psllv8di_mask( - (__v8di)__X, (__v8di)__Y, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); -} - -__funline __m512i _mm512_srav_epi64(__m512i __X, __m512i __Y) { - return (__m512i)__builtin_ia32_psrav8di_mask( - (__v8di)__X, (__v8di)__Y, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); -} - -__funline __m512i _mm512_mask_srav_epi64(__m512i __W, __mmask8 __U, __m512i __X, - __m512i __Y) { - return (__m512i)__builtin_ia32_psrav8di_mask((__v8di)__X, (__v8di)__Y, - (__v8di)__W, (__mmask8)__U); -} - -__funline __m512i _mm512_maskz_srav_epi64(__mmask8 __U, __m512i __X, - __m512i __Y) { - return (__m512i)__builtin_ia32_psrav8di_mask( - (__v8di)__X, (__v8di)__Y, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); -} - -__funline __m512i _mm512_srlv_epi64(__m512i __X, __m512i __Y) { - return (__m512i)__builtin_ia32_psrlv8di_mask( - (__v8di)__X, (__v8di)__Y, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); -} - -__funline __m512i _mm512_mask_srlv_epi64(__m512i __W, __mmask8 __U, __m512i __X, - __m512i __Y) { - return (__m512i)__builtin_ia32_psrlv8di_mask((__v8di)__X, (__v8di)__Y, - (__v8di)__W, (__mmask8)__U); -} - -__funline __m512i _mm512_maskz_srlv_epi64(__mmask8 __U, __m512i __X, - __m512i __Y) { - return (__m512i)__builtin_ia32_psrlv8di_mask( - (__v8di)__X, (__v8di)__Y, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); -} - 
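/* Editorial sketch, not upstream code: the sllv/srlv/srav intrinsics in
   this block shift each lane by a per-lane count taken from the second
   operand, unlike the slli/sll forms later in the file, which apply one
   scalar count to every lane.  Assuming an AVX-512F target:

     __m512i one = _mm512_set1_epi32(1);
     __m512i cnt = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7,
                                     8, 9, 10, 11, 12, 13, 14, 15);
     __m512i pw2 = _mm512_sllv_epi32(one, cnt);  // lane i holds 1 << i

   srav propagates the sign bit (arithmetic shift); srlv shifts in zeros. */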
-__funline __m512i _mm512_add_epi32(__m512i __A, __m512i __B) { - return (__m512i)((__v16su)__A + (__v16su)__B); -} - -__funline __m512i _mm512_mask_add_epi32(__m512i __W, __mmask16 __U, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_paddd512_mask((__v16si)__A, (__v16si)__B, - (__v16si)__W, (__mmask16)__U); -} - -__funline __m512i _mm512_maskz_add_epi32(__mmask16 __U, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_paddd512_mask((__v16si)__A, (__v16si)__B, - (__v16si)_mm512_setzero_si512(), - (__mmask16)__U); -} - -__funline __m512i _mm512_mul_epi32(__m512i __X, __m512i __Y) { - return (__m512i)__builtin_ia32_pmuldq512_mask( - (__v16si)__X, (__v16si)__Y, (__v8di)_mm512_undefined_epi32(), - (__mmask8)-1); -} - -__funline __m512i _mm512_mask_mul_epi32(__m512i __W, __mmask8 __M, __m512i __X, - __m512i __Y) { - return (__m512i)__builtin_ia32_pmuldq512_mask((__v16si)__X, (__v16si)__Y, - (__v8di)__W, __M); -} - -__funline __m512i _mm512_maskz_mul_epi32(__mmask8 __M, __m512i __X, __m512i __Y) { - return (__m512i)__builtin_ia32_pmuldq512_mask( - (__v16si)__X, (__v16si)__Y, (__v8di)_mm512_setzero_si512(), __M); -} - -__funline __m512i _mm512_sub_epi32(__m512i __A, __m512i __B) { - return (__m512i)((__v16su)__A - (__v16su)__B); -} - -__funline __m512i _mm512_mask_sub_epi32(__m512i __W, __mmask16 __U, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_psubd512_mask((__v16si)__A, (__v16si)__B, - (__v16si)__W, (__mmask16)__U); -} - -__funline __m512i _mm512_maskz_sub_epi32(__mmask16 __U, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_psubd512_mask((__v16si)__A, (__v16si)__B, - (__v16si)_mm512_setzero_si512(), - (__mmask16)__U); -} - -__funline __m512i _mm512_mul_epu32(__m512i __X, __m512i __Y) { - return (__m512i)__builtin_ia32_pmuludq512_mask( - (__v16si)__X, (__v16si)__Y, (__v8di)_mm512_undefined_epi32(), - (__mmask8)-1); -} - -__funline __m512i _mm512_mask_mul_epu32(__m512i __W, __mmask8 __M, __m512i __X, - __m512i __Y) { - return (__m512i)__builtin_ia32_pmuludq512_mask((__v16si)__X, (__v16si)__Y, - (__v8di)__W, __M); -} - -__funline __m512i _mm512_maskz_mul_epu32(__mmask8 __M, __m512i __X, __m512i __Y) { - return (__m512i)__builtin_ia32_pmuludq512_mask( - (__v16si)__X, (__v16si)__Y, (__v8di)_mm512_setzero_si512(), __M); -} - +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_set1_epi8 (char __A) +{ + return __extension__ (__m512i)(__v64qi) + { __A, __A, __A, __A, __A, __A, __A, __A, + __A, __A, __A, __A, __A, __A, __A, __A, + __A, __A, __A, __A, __A, __A, __A, __A, + __A, __A, __A, __A, __A, __A, __A, __A, + __A, __A, __A, __A, __A, __A, __A, __A, + __A, __A, __A, __A, __A, __A, __A, __A, + __A, __A, __A, __A, __A, __A, __A, __A, + __A, __A, __A, __A, __A, __A, __A, __A }; +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_set1_epi16 (short __A) +{ + return __extension__ (__m512i)(__v32hi) + { __A, __A, __A, __A, __A, __A, __A, __A, + __A, __A, __A, __A, __A, __A, __A, __A, + __A, __A, __A, __A, __A, __A, __A, __A, + __A, __A, __A, __A, __A, __A, __A, __A }; +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_set1_pd (double __A) +{ + return __extension__ (__m512d)(__v8df) + { __A, __A, __A, __A, __A, __A, __A, __A }; +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_set1_ps (float __A) +{ + return __extension__ (__m512)(__v16sf) + 
{ __A, __A, __A, __A, __A, __A, __A, __A, + __A, __A, __A, __A, __A, __A, __A, __A }; +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_set4_epi32 (int __A, int __B, int __C, int __D) +{ + return __extension__ (__m512i)(__v16si) + { __D, __C, __B, __A, __D, __C, __B, __A, + __D, __C, __B, __A, __D, __C, __B, __A }; +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_set4_epi64 (long long __A, long long __B, long long __C, + long long __D) +{ + return __extension__ (__m512i) (__v8di) + { __D, __C, __B, __A, __D, __C, __B, __A }; +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_set4_pd (double __A, double __B, double __C, double __D) +{ + return __extension__ (__m512d) + { __D, __C, __B, __A, __D, __C, __B, __A }; +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_set4_ps (float __A, float __B, float __C, float __D) +{ + return __extension__ (__m512) + { __D, __C, __B, __A, __D, __C, __B, __A, + __D, __C, __B, __A, __D, __C, __B, __A }; +} +#define _mm512_setr4_epi64(e0,e1,e2,e3) _mm512_set4_epi64(e3,e2,e1,e0) +#define _mm512_setr4_epi32(e0,e1,e2,e3) _mm512_set4_epi32(e3,e2,e1,e0) +#define _mm512_setr4_pd(e0,e1,e2,e3) _mm512_set4_pd(e3,e2,e1,e0) +#define _mm512_setr4_ps(e0,e1,e2,e3) _mm512_set4_ps(e3,e2,e1,e0) +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_setzero_ps (void) +{ + return __extension__ (__m512){ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 }; +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_setzero (void) +{ + return _mm512_setzero_ps (); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_setzero_pd (void) +{ + return __extension__ (__m512d) { 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 }; +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_setzero_epi32 (void) +{ + return __extension__ (__m512i)(__v8di){ 0, 0, 0, 0, 0, 0, 0, 0 }; +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_setzero_si512 (void) +{ + return __extension__ (__m512i)(__v8di){ 0, 0, 0, 0, 0, 0, 0, 0 }; +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_mov_pd (__m512d __W, __mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_movapd512_mask ((__v8df) __A, + (__v8df) __W, + (__mmask8) __U); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_mov_pd (__mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_movapd512_mask ((__v8df) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_mov_ps (__m512 __W, __mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_movaps512_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) __U); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_mov_ps (__mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_movaps512_mask ((__v16sf) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, 
__always_inline__, __artificial__)) +_mm512_load_pd (void const *__P) +{ + return *(__m512d *) __P; +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_load_pd (__m512d __W, __mmask8 __U, void const *__P) +{ + return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *) __P, + (__v8df) __W, + (__mmask8) __U); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_load_pd (__mmask8 __U, void const *__P) +{ + return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *) __P, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_store_pd (void *__P, __m512d __A) +{ + *(__m512d *) __P = __A; +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_store_pd (void *__P, __mmask8 __U, __m512d __A) +{ + __builtin_ia32_storeapd512_mask ((__v8df *) __P, (__v8df) __A, + (__mmask8) __U); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_load_ps (void const *__P) +{ + return *(__m512 *) __P; +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_load_ps (__m512 __W, __mmask16 __U, void const *__P) +{ + return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *) __P, + (__v16sf) __W, + (__mmask16) __U); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_load_ps (__mmask16 __U, void const *__P) +{ + return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *) __P, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_store_ps (void *__P, __m512 __A) +{ + *(__m512 *) __P = __A; +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_store_ps (void *__P, __mmask16 __U, __m512 __A) +{ + __builtin_ia32_storeaps512_mask ((__v16sf *) __P, (__v16sf) __A, + (__mmask16) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_mov_epi64 (__m512i __W, __mmask8 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_movdqa64_512_mask ((__v8di) __A, + (__v8di) __W, + (__mmask8) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_mov_epi64 (__mmask8 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_movdqa64_512_mask ((__v8di) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_load_epi64 (void const *__P) +{ + return *(__m512i *) __P; +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_load_epi64 (__m512i __W, __mmask8 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_movdqa64load512_mask ((const __v8di *) __P, + (__v8di) __W, + (__mmask8) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_load_epi64 (__mmask8 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_movdqa64load512_mask ((const __v8di *) __P, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, 
__artificial__)) +_mm512_store_epi64 (void *__P, __m512i __A) +{ + *(__m512i *) __P = __A; +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_store_epi64 (void *__P, __mmask8 __U, __m512i __A) +{ + __builtin_ia32_movdqa64store512_mask ((__v8di *) __P, (__v8di) __A, + (__mmask8) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_mov_epi32 (__m512i __W, __mmask16 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_movdqa32_512_mask ((__v16si) __A, + (__v16si) __W, + (__mmask16) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_mov_epi32 (__mmask16 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_movdqa32_512_mask ((__v16si) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_load_si512 (void const *__P) +{ + return *(__m512i *) __P; +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_load_epi32 (void const *__P) +{ + return *(__m512i *) __P; +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_load_epi32 (__m512i __W, __mmask16 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_movdqa32load512_mask ((const __v16si *) __P, + (__v16si) __W, + (__mmask16) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_load_epi32 (__mmask16 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_movdqa32load512_mask ((const __v16si *) __P, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_store_si512 (void *__P, __m512i __A) +{ + *(__m512i *) __P = __A; +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_store_epi32 (void *__P, __m512i __A) +{ + *(__m512i *) __P = __A; +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_store_epi32 (void *__P, __mmask16 __U, __m512i __A) +{ + __builtin_ia32_movdqa32store512_mask ((__v16si *) __P, (__v16si) __A, + (__mmask16) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mullo_epi32 (__m512i __A, __m512i __B) +{ + return (__m512i) ((__v16su) __A * (__v16su) __B); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_mullo_epi32 (__mmask16 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmulld512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + __M); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_mullo_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmulld512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __W, __M); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mullox_epi64 (__m512i __A, __m512i __B) +{ + return (__m512i) ((__v8du) __A * (__v8du) __B); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_mullox_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B) +{ + 
return _mm512_mask_mov_epi64 (__W, __M, _mm512_mullox_epi64 (__A, __B)); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sllv_epi32 (__m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_psllv16si_mask ((__v16si) __X, + (__v16si) __Y, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sllv_epi32 (__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_psllv16si_mask ((__v16si) __X, + (__v16si) __Y, + (__v16si) __W, + (__mmask16) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sllv_epi32 (__mmask16 __U, __m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_psllv16si_mask ((__v16si) __X, + (__v16si) __Y, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_srav_epi32 (__m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_psrav16si_mask ((__v16si) __X, + (__v16si) __Y, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_srav_epi32 (__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_psrav16si_mask ((__v16si) __X, + (__v16si) __Y, + (__v16si) __W, + (__mmask16) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_srav_epi32 (__mmask16 __U, __m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_psrav16si_mask ((__v16si) __X, + (__v16si) __Y, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_srlv_epi32 (__m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_psrlv16si_mask ((__v16si) __X, + (__v16si) __Y, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_srlv_epi32 (__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_psrlv16si_mask ((__v16si) __X, + (__v16si) __Y, + (__v16si) __W, + (__mmask16) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_srlv_epi32 (__mmask16 __U, __m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_psrlv16si_mask ((__v16si) __X, + (__v16si) __Y, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_add_epi64 (__m512i __A, __m512i __B) +{ + return (__m512i) ((__v8du) __A + (__v8du) __B); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_add_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_paddq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) __W, + (__mmask8) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_add_epi64 (__mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_paddq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} +extern 
__inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sub_epi64 (__m512i __A, __m512i __B) +{ + return (__m512i) ((__v8du) __A - (__v8du) __B); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sub_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_psubq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) __W, + (__mmask8) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sub_epi64 (__mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_psubq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sllv_epi64 (__m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_psllv8di_mask ((__v8di) __X, + (__v8di) __Y, + (__v8di) + _mm512_undefined_pd (), + (__mmask8) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sllv_epi64 (__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_psllv8di_mask ((__v8di) __X, + (__v8di) __Y, + (__v8di) __W, + (__mmask8) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sllv_epi64 (__mmask8 __U, __m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_psllv8di_mask ((__v8di) __X, + (__v8di) __Y, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_srav_epi64 (__m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_psrav8di_mask ((__v8di) __X, + (__v8di) __Y, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_srav_epi64 (__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_psrav8di_mask ((__v8di) __X, + (__v8di) __Y, + (__v8di) __W, + (__mmask8) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_srav_epi64 (__mmask8 __U, __m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_psrav8di_mask ((__v8di) __X, + (__v8di) __Y, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_srlv_epi64 (__m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_psrlv8di_mask ((__v8di) __X, + (__v8di) __Y, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_srlv_epi64 (__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_psrlv8di_mask ((__v8di) __X, + (__v8di) __Y, + (__v8di) __W, + (__mmask8) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_srlv_epi64 (__mmask8 __U, __m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_psrlv8di_mask ((__v8di) __X, + (__v8di) __Y, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_add_epi32 (__m512i __A, __m512i __B) +{ + 
return (__m512i) ((__v16su) __A + (__v16su) __B); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_add_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_paddd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __W, + (__mmask16) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_add_epi32 (__mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_paddd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mul_epi32 (__m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_pmuldq512_mask ((__v16si) __X, + (__v16si) __Y, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_mul_epi32 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_pmuldq512_mask ((__v16si) __X, + (__v16si) __Y, + (__v8di) __W, __M); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_mul_epi32 (__mmask8 __M, __m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_pmuldq512_mask ((__v16si) __X, + (__v16si) __Y, + (__v8di) + _mm512_setzero_si512 (), + __M); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sub_epi32 (__m512i __A, __m512i __B) +{ + return (__m512i) ((__v16su) __A - (__v16su) __B); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sub_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_psubd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __W, + (__mmask16) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sub_epi32 (__mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_psubd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mul_epu32 (__m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_pmuludq512_mask ((__v16si) __X, + (__v16si) __Y, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_mul_epu32 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_pmuludq512_mask ((__v16si) __X, + (__v16si) __Y, + (__v8di) __W, __M); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_mul_epu32 (__mmask8 __M, __m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_pmuludq512_mask ((__v16si) __X, + (__v16si) __Y, + (__v8di) + _mm512_setzero_si512 (), + __M); +} #ifdef __OPTIMIZE__ -__funline __m512i _mm512_slli_epi64(__m512i __A, unsigned int __B) { - return (__m512i)__builtin_ia32_psllqi512_mask( - (__v8di)__A, __B, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_slli_epi64 (__m512i __A, unsigned int __B) +{ + return (__m512i) 
__builtin_ia32_psllqi512_mask ((__v8di) __A, __B, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); } - -__funline __m512i _mm512_mask_slli_epi64(__m512i __W, __mmask8 __U, __m512i __A, - unsigned int __B) { - return (__m512i)__builtin_ia32_psllqi512_mask((__v8di)__A, __B, (__v8di)__W, - (__mmask8)__U); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_slli_epi64 (__m512i __W, __mmask8 __U, __m512i __A, + unsigned int __B) +{ + return (__m512i) __builtin_ia32_psllqi512_mask ((__v8di) __A, __B, + (__v8di) __W, + (__mmask8) __U); } - -__funline __m512i _mm512_maskz_slli_epi64(__mmask8 __U, __m512i __A, - unsigned int __B) { - return (__m512i)__builtin_ia32_psllqi512_mask( - (__v8di)__A, __B, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_slli_epi64 (__mmask8 __U, __m512i __A, unsigned int __B) +{ + return (__m512i) __builtin_ia32_psllqi512_mask ((__v8di) __A, __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); } #else -#define _mm512_slli_epi64(X, C) \ - ((__m512i)__builtin_ia32_psllqi512_mask( \ - (__v8di)(__m512i)(X), (int)(C), \ - (__v8di)(__m512i)_mm512_undefined_epi32(), (__mmask8)-1)) - -#define _mm512_mask_slli_epi64(W, U, X, C) \ - ((__m512i)__builtin_ia32_psllqi512_mask( \ - (__v8di)(__m512i)(X), (int)(C), (__v8di)(__m512i)(W), (__mmask8)(U))) - -#define _mm512_maskz_slli_epi64(U, X, C) \ - ((__m512i)__builtin_ia32_psllqi512_mask( \ - (__v8di)(__m512i)(X), (int)(C), (__v8di)(__m512i)_mm512_setzero_si512(), \ - (__mmask8)(U))) +#define _mm512_slli_epi64(X, C) ((__m512i) __builtin_ia32_psllqi512_mask ((__v8di)(__m512i)(X), (int)(C), (__v8di)(__m512i)_mm512_undefined_epi32 (), (__mmask8)-1)) +#define _mm512_mask_slli_epi64(W, U, X, C) ((__m512i) __builtin_ia32_psllqi512_mask ((__v8di)(__m512i)(X), (int)(C), (__v8di)(__m512i)(W), (__mmask8)(U))) +#define _mm512_maskz_slli_epi64(U, X, C) ((__m512i) __builtin_ia32_psllqi512_mask ((__v8di)(__m512i)(X), (int)(C), (__v8di)(__m512i)_mm512_setzero_si512 (), (__mmask8)(U))) #endif - -__funline __m512i _mm512_sll_epi64(__m512i __A, __m128i __B) { - return (__m512i)__builtin_ia32_psllq512_mask( - (__v8di)__A, (__v2di)__B, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sll_epi64 (__m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_psllq512_mask ((__v8di) __A, + (__v2di) __B, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); } - -__funline __m512i _mm512_mask_sll_epi64(__m512i __W, __mmask8 __U, __m512i __A, - __m128i __B) { - return (__m512i)__builtin_ia32_psllq512_mask((__v8di)__A, (__v2di)__B, - (__v8di)__W, (__mmask8)__U); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sll_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_psllq512_mask ((__v8di) __A, + (__v2di) __B, + (__v8di) __W, + (__mmask8) __U); } - -__funline __m512i _mm512_maskz_sll_epi64(__mmask8 __U, __m512i __A, __m128i __B) { - return (__m512i)__builtin_ia32_psllq512_mask( - (__v8di)__A, (__v2di)__B, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sll_epi64 (__mmask8 __U, __m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_psllq512_mask ((__v8di) __A, + 
(__v2di) __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); } - #ifdef __OPTIMIZE__ -__funline __m512i _mm512_srli_epi64(__m512i __A, unsigned int __B) { - return (__m512i)__builtin_ia32_psrlqi512_mask( - (__v8di)__A, __B, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_srli_epi64 (__m512i __A, unsigned int __B) +{ + return (__m512i) __builtin_ia32_psrlqi512_mask ((__v8di) __A, __B, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); } - -__funline __m512i _mm512_mask_srli_epi64(__m512i __W, __mmask8 __U, __m512i __A, - unsigned int __B) { - return (__m512i)__builtin_ia32_psrlqi512_mask((__v8di)__A, __B, (__v8di)__W, - (__mmask8)__U); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_srli_epi64 (__m512i __W, __mmask8 __U, + __m512i __A, unsigned int __B) +{ + return (__m512i) __builtin_ia32_psrlqi512_mask ((__v8di) __A, __B, + (__v8di) __W, + (__mmask8) __U); } - -__funline __m512i _mm512_maskz_srli_epi64(__mmask8 __U, __m512i __A, - unsigned int __B) { - return (__m512i)__builtin_ia32_psrlqi512_mask( - (__v8di)__A, __B, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_srli_epi64 (__mmask8 __U, __m512i __A, unsigned int __B) +{ + return (__m512i) __builtin_ia32_psrlqi512_mask ((__v8di) __A, __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); } #else -#define _mm512_srli_epi64(X, C) \ - ((__m512i)__builtin_ia32_psrlqi512_mask( \ - (__v8di)(__m512i)(X), (int)(C), \ - (__v8di)(__m512i)_mm512_undefined_epi32(), (__mmask8)-1)) - -#define _mm512_mask_srli_epi64(W, U, X, C) \ - ((__m512i)__builtin_ia32_psrlqi512_mask( \ - (__v8di)(__m512i)(X), (int)(C), (__v8di)(__m512i)(W), (__mmask8)(U))) - -#define _mm512_maskz_srli_epi64(U, X, C) \ - ((__m512i)__builtin_ia32_psrlqi512_mask( \ - (__v8di)(__m512i)(X), (int)(C), (__v8di)(__m512i)_mm512_setzero_si512(), \ - (__mmask8)(U))) +#define _mm512_srli_epi64(X, C) ((__m512i) __builtin_ia32_psrlqi512_mask ((__v8di)(__m512i)(X), (int)(C), (__v8di)(__m512i)_mm512_undefined_epi32 (), (__mmask8)-1)) +#define _mm512_mask_srli_epi64(W, U, X, C) ((__m512i) __builtin_ia32_psrlqi512_mask ((__v8di)(__m512i)(X), (int)(C), (__v8di)(__m512i)(W), (__mmask8)(U))) +#define _mm512_maskz_srli_epi64(U, X, C) ((__m512i) __builtin_ia32_psrlqi512_mask ((__v8di)(__m512i)(X), (int)(C), (__v8di)(__m512i)_mm512_setzero_si512 (), (__mmask8)(U))) #endif - -__funline __m512i _mm512_srl_epi64(__m512i __A, __m128i __B) { - return (__m512i)__builtin_ia32_psrlq512_mask( - (__v8di)__A, (__v2di)__B, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_srl_epi64 (__m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_psrlq512_mask ((__v8di) __A, + (__v2di) __B, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); } - -__funline __m512i _mm512_mask_srl_epi64(__m512i __W, __mmask8 __U, __m512i __A, - __m128i __B) { - return (__m512i)__builtin_ia32_psrlq512_mask((__v8di)__A, (__v2di)__B, - (__v8di)__W, (__mmask8)__U); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_srl_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_psrlq512_mask ((__v8di) __A, + (__v2di) __B, + (__v8di) __W, + (__mmask8) 
__U); } - -__funline __m512i _mm512_maskz_srl_epi64(__mmask8 __U, __m512i __A, __m128i __B) { - return (__m512i)__builtin_ia32_psrlq512_mask( - (__v8di)__A, (__v2di)__B, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_srl_epi64 (__mmask8 __U, __m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_psrlq512_mask ((__v8di) __A, + (__v2di) __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); } - #ifdef __OPTIMIZE__ -__funline __m512i _mm512_srai_epi64(__m512i __A, unsigned int __B) { - return (__m512i)__builtin_ia32_psraqi512_mask( - (__v8di)__A, __B, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_srai_epi64 (__m512i __A, unsigned int __B) +{ + return (__m512i) __builtin_ia32_psraqi512_mask ((__v8di) __A, __B, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); } - -__funline __m512i _mm512_mask_srai_epi64(__m512i __W, __mmask8 __U, __m512i __A, - unsigned int __B) { - return (__m512i)__builtin_ia32_psraqi512_mask((__v8di)__A, __B, (__v8di)__W, - (__mmask8)__U); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_srai_epi64 (__m512i __W, __mmask8 __U, __m512i __A, + unsigned int __B) +{ + return (__m512i) __builtin_ia32_psraqi512_mask ((__v8di) __A, __B, + (__v8di) __W, + (__mmask8) __U); } - -__funline __m512i _mm512_maskz_srai_epi64(__mmask8 __U, __m512i __A, - unsigned int __B) { - return (__m512i)__builtin_ia32_psraqi512_mask( - (__v8di)__A, __B, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_srai_epi64 (__mmask8 __U, __m512i __A, unsigned int __B) +{ + return (__m512i) __builtin_ia32_psraqi512_mask ((__v8di) __A, __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); } #else -#define _mm512_srai_epi64(X, C) \ - ((__m512i)__builtin_ia32_psraqi512_mask( \ - (__v8di)(__m512i)(X), (int)(C), \ - (__v8di)(__m512i)_mm512_undefined_epi32(), (__mmask8)-1)) - -#define _mm512_mask_srai_epi64(W, U, X, C) \ - ((__m512i)__builtin_ia32_psraqi512_mask( \ - (__v8di)(__m512i)(X), (int)(C), (__v8di)(__m512i)(W), (__mmask8)(U))) - -#define _mm512_maskz_srai_epi64(U, X, C) \ - ((__m512i)__builtin_ia32_psraqi512_mask( \ - (__v8di)(__m512i)(X), (int)(C), (__v8di)(__m512i)_mm512_setzero_si512(), \ - (__mmask8)(U))) +#define _mm512_srai_epi64(X, C) ((__m512i) __builtin_ia32_psraqi512_mask ((__v8di)(__m512i)(X), (int)(C), (__v8di)(__m512i)_mm512_undefined_epi32 (), (__mmask8)-1)) +#define _mm512_mask_srai_epi64(W, U, X, C) ((__m512i) __builtin_ia32_psraqi512_mask ((__v8di)(__m512i)(X), (int)(C), (__v8di)(__m512i)(W), (__mmask8)(U))) +#define _mm512_maskz_srai_epi64(U, X, C) ((__m512i) __builtin_ia32_psraqi512_mask ((__v8di)(__m512i)(X), (int)(C), (__v8di)(__m512i)_mm512_setzero_si512 (), (__mmask8)(U))) #endif - -__funline __m512i _mm512_sra_epi64(__m512i __A, __m128i __B) { - return (__m512i)__builtin_ia32_psraq512_mask( - (__v8di)__A, (__v2di)__B, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sra_epi64 (__m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_psraq512_mask ((__v8di) __A, + (__v2di) __B, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); } - -__funline __m512i 
_mm512_mask_sra_epi64(__m512i __W, __mmask8 __U, __m512i __A, - __m128i __B) { - return (__m512i)__builtin_ia32_psraq512_mask((__v8di)__A, (__v2di)__B, - (__v8di)__W, (__mmask8)__U); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sra_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_psraq512_mask ((__v8di) __A, + (__v2di) __B, + (__v8di) __W, + (__mmask8) __U); } - -__funline __m512i _mm512_maskz_sra_epi64(__mmask8 __U, __m512i __A, __m128i __B) { - return (__m512i)__builtin_ia32_psraq512_mask( - (__v8di)__A, (__v2di)__B, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sra_epi64 (__mmask8 __U, __m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_psraq512_mask ((__v8di) __A, + (__v2di) __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); } - #ifdef __OPTIMIZE__ -__funline __m512i _mm512_slli_epi32(__m512i __A, unsigned int __B) { - return (__m512i)__builtin_ia32_pslldi512_mask( - (__v16si)__A, __B, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_slli_epi32 (__m512i __A, unsigned int __B) +{ + return (__m512i) __builtin_ia32_pslldi512_mask ((__v16si) __A, __B, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); } - -__funline __m512i _mm512_mask_slli_epi32(__m512i __W, __mmask16 __U, __m512i __A, - unsigned int __B) { - return (__m512i)__builtin_ia32_pslldi512_mask((__v16si)__A, __B, (__v16si)__W, - (__mmask16)__U); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_slli_epi32 (__m512i __W, __mmask16 __U, __m512i __A, + unsigned int __B) +{ + return (__m512i) __builtin_ia32_pslldi512_mask ((__v16si) __A, __B, + (__v16si) __W, + (__mmask16) __U); } - -__funline __m512i _mm512_maskz_slli_epi32(__mmask16 __U, __m512i __A, - unsigned int __B) { - return (__m512i)__builtin_ia32_pslldi512_mask( - (__v16si)__A, __B, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_slli_epi32 (__mmask16 __U, __m512i __A, unsigned int __B) +{ + return (__m512i) __builtin_ia32_pslldi512_mask ((__v16si) __A, __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); } #else -#define _mm512_slli_epi32(X, C) \ - ((__m512i)__builtin_ia32_pslldi512_mask( \ - (__v16si)(__m512i)(X), (int)(C), \ - (__v16si)(__m512i)_mm512_undefined_epi32(), (__mmask16)-1)) - -#define _mm512_mask_slli_epi32(W, U, X, C) \ - ((__m512i)__builtin_ia32_pslldi512_mask( \ - (__v16si)(__m512i)(X), (int)(C), (__v16si)(__m512i)(W), (__mmask16)(U))) - -#define _mm512_maskz_slli_epi32(U, X, C) \ - ((__m512i)__builtin_ia32_pslldi512_mask( \ - (__v16si)(__m512i)(X), (int)(C), \ - (__v16si)(__m512i)_mm512_setzero_si512(), (__mmask16)(U))) +#define _mm512_slli_epi32(X, C) ((__m512i) __builtin_ia32_pslldi512_mask ((__v16si)(__m512i)(X), (int)(C), (__v16si)(__m512i)_mm512_undefined_epi32 (), (__mmask16)-1)) +#define _mm512_mask_slli_epi32(W, U, X, C) ((__m512i) __builtin_ia32_pslldi512_mask ((__v16si)(__m512i)(X), (int)(C), (__v16si)(__m512i)(W), (__mmask16)(U))) +#define _mm512_maskz_slli_epi32(U, X, C) ((__m512i) __builtin_ia32_pslldi512_mask ((__v16si)(__m512i)(X), (int)(C), (__v16si)(__m512i)_mm512_setzero_si512 (), (__mmask16)(U))) #endif - 
-__funline __m512i _mm512_sll_epi32(__m512i __A, __m128i __B) { - return (__m512i)__builtin_ia32_pslld512_mask( - (__v16si)__A, (__v4si)__B, (__v16si)_mm512_undefined_epi32(), - (__mmask16)-1); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sll_epi32 (__m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_pslld512_mask ((__v16si) __A, + (__v4si) __B, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); } - -__funline __m512i _mm512_mask_sll_epi32(__m512i __W, __mmask16 __U, __m512i __A, - __m128i __B) { - return (__m512i)__builtin_ia32_pslld512_mask((__v16si)__A, (__v4si)__B, - (__v16si)__W, (__mmask16)__U); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sll_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_pslld512_mask ((__v16si) __A, + (__v4si) __B, + (__v16si) __W, + (__mmask16) __U); } - -__funline __m512i _mm512_maskz_sll_epi32(__mmask16 __U, __m512i __A, - __m128i __B) { - return (__m512i)__builtin_ia32_pslld512_mask((__v16si)__A, (__v4si)__B, - (__v16si)_mm512_setzero_si512(), - (__mmask16)__U); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sll_epi32 (__mmask16 __U, __m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_pslld512_mask ((__v16si) __A, + (__v4si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); } - #ifdef __OPTIMIZE__ -__funline __m512i _mm512_srli_epi32(__m512i __A, unsigned int __B) { - return (__m512i)__builtin_ia32_psrldi512_mask( - (__v16si)__A, __B, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_srli_epi32 (__m512i __A, unsigned int __B) +{ + return (__m512i) __builtin_ia32_psrldi512_mask ((__v16si) __A, __B, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); } - -__funline __m512i _mm512_mask_srli_epi32(__m512i __W, __mmask16 __U, __m512i __A, - unsigned int __B) { - return (__m512i)__builtin_ia32_psrldi512_mask((__v16si)__A, __B, (__v16si)__W, - (__mmask16)__U); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_srli_epi32 (__m512i __W, __mmask16 __U, + __m512i __A, unsigned int __B) +{ + return (__m512i) __builtin_ia32_psrldi512_mask ((__v16si) __A, __B, + (__v16si) __W, + (__mmask16) __U); } - -__funline __m512i _mm512_maskz_srli_epi32(__mmask16 __U, __m512i __A, - unsigned int __B) { - return (__m512i)__builtin_ia32_psrldi512_mask( - (__v16si)__A, __B, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_srli_epi32 (__mmask16 __U, __m512i __A, unsigned int __B) +{ + return (__m512i) __builtin_ia32_psrldi512_mask ((__v16si) __A, __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); } #else -#define _mm512_srli_epi32(X, C) \ - ((__m512i)__builtin_ia32_psrldi512_mask( \ - (__v16si)(__m512i)(X), (int)(C), \ - (__v16si)(__m512i)_mm512_undefined_epi32(), (__mmask16)-1)) - -#define _mm512_mask_srli_epi32(W, U, X, C) \ - ((__m512i)__builtin_ia32_psrldi512_mask( \ - (__v16si)(__m512i)(X), (int)(C), (__v16si)(__m512i)(W), (__mmask16)(U))) - -#define _mm512_maskz_srli_epi32(U, X, C) \ - ((__m512i)__builtin_ia32_psrldi512_mask( \ - (__v16si)(__m512i)(X), (int)(C), \ - (__v16si)(__m512i)_mm512_setzero_si512(), (__mmask16)(U))) 
+#define _mm512_srli_epi32(X, C) ((__m512i) __builtin_ia32_psrldi512_mask ((__v16si)(__m512i)(X), (int)(C), (__v16si)(__m512i)_mm512_undefined_epi32 (), (__mmask16)-1)) +#define _mm512_mask_srli_epi32(W, U, X, C) ((__m512i) __builtin_ia32_psrldi512_mask ((__v16si)(__m512i)(X), (int)(C), (__v16si)(__m512i)(W), (__mmask16)(U))) +#define _mm512_maskz_srli_epi32(U, X, C) ((__m512i) __builtin_ia32_psrldi512_mask ((__v16si)(__m512i)(X), (int)(C), (__v16si)(__m512i)_mm512_setzero_si512 (), (__mmask16)(U))) #endif - -__funline __m512i _mm512_srl_epi32(__m512i __A, __m128i __B) { - return (__m512i)__builtin_ia32_psrld512_mask( - (__v16si)__A, (__v4si)__B, (__v16si)_mm512_undefined_epi32(), - (__mmask16)-1); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_srl_epi32 (__m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_psrld512_mask ((__v16si) __A, + (__v4si) __B, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); } - -__funline __m512i _mm512_mask_srl_epi32(__m512i __W, __mmask16 __U, __m512i __A, - __m128i __B) { - return (__m512i)__builtin_ia32_psrld512_mask((__v16si)__A, (__v4si)__B, - (__v16si)__W, (__mmask16)__U); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_srl_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_psrld512_mask ((__v16si) __A, + (__v4si) __B, + (__v16si) __W, + (__mmask16) __U); } - -__funline __m512i _mm512_maskz_srl_epi32(__mmask16 __U, __m512i __A, - __m128i __B) { - return (__m512i)__builtin_ia32_psrld512_mask((__v16si)__A, (__v4si)__B, - (__v16si)_mm512_setzero_si512(), - (__mmask16)__U); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_srl_epi32 (__mmask16 __U, __m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_psrld512_mask ((__v16si) __A, + (__v4si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); } - #ifdef __OPTIMIZE__ -__funline __m512i _mm512_srai_epi32(__m512i __A, unsigned int __B) { - return (__m512i)__builtin_ia32_psradi512_mask( - (__v16si)__A, __B, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_srai_epi32 (__m512i __A, unsigned int __B) +{ + return (__m512i) __builtin_ia32_psradi512_mask ((__v16si) __A, __B, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); } - -__funline __m512i _mm512_mask_srai_epi32(__m512i __W, __mmask16 __U, __m512i __A, - unsigned int __B) { - return (__m512i)__builtin_ia32_psradi512_mask((__v16si)__A, __B, (__v16si)__W, - (__mmask16)__U); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_srai_epi32 (__m512i __W, __mmask16 __U, __m512i __A, + unsigned int __B) +{ + return (__m512i) __builtin_ia32_psradi512_mask ((__v16si) __A, __B, + (__v16si) __W, + (__mmask16) __U); } - -__funline __m512i _mm512_maskz_srai_epi32(__mmask16 __U, __m512i __A, - unsigned int __B) { - return (__m512i)__builtin_ia32_psradi512_mask( - (__v16si)__A, __B, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_srai_epi32 (__mmask16 __U, __m512i __A, unsigned int __B) +{ + return (__m512i) __builtin_ia32_psradi512_mask ((__v16si) __A, __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); } #else -#define 
_mm512_srai_epi32(X, C) \ - ((__m512i)__builtin_ia32_psradi512_mask( \ - (__v16si)(__m512i)(X), (int)(C), \ - (__v16si)(__m512i)_mm512_undefined_epi32(), (__mmask16)-1)) - -#define _mm512_mask_srai_epi32(W, U, X, C) \ - ((__m512i)__builtin_ia32_psradi512_mask( \ - (__v16si)(__m512i)(X), (int)(C), (__v16si)(__m512i)(W), (__mmask16)(U))) - -#define _mm512_maskz_srai_epi32(U, X, C) \ - ((__m512i)__builtin_ia32_psradi512_mask( \ - (__v16si)(__m512i)(X), (int)(C), \ - (__v16si)(__m512i)_mm512_setzero_si512(), (__mmask16)(U))) +#define _mm512_srai_epi32(X, C) ((__m512i) __builtin_ia32_psradi512_mask ((__v16si)(__m512i)(X), (int)(C), (__v16si)(__m512i)_mm512_undefined_epi32 (), (__mmask16)-1)) +#define _mm512_mask_srai_epi32(W, U, X, C) ((__m512i) __builtin_ia32_psradi512_mask ((__v16si)(__m512i)(X), (int)(C), (__v16si)(__m512i)(W), (__mmask16)(U))) +#define _mm512_maskz_srai_epi32(U, X, C) ((__m512i) __builtin_ia32_psradi512_mask ((__v16si)(__m512i)(X), (int)(C), (__v16si)(__m512i)_mm512_setzero_si512 (), (__mmask16)(U))) #endif - -__funline __m512i _mm512_sra_epi32(__m512i __A, __m128i __B) { - return (__m512i)__builtin_ia32_psrad512_mask( - (__v16si)__A, (__v4si)__B, (__v16si)_mm512_undefined_epi32(), - (__mmask16)-1); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sra_epi32 (__m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_psrad512_mask ((__v16si) __A, + (__v4si) __B, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); } - -__funline __m512i _mm512_mask_sra_epi32(__m512i __W, __mmask16 __U, __m512i __A, - __m128i __B) { - return (__m512i)__builtin_ia32_psrad512_mask((__v16si)__A, (__v4si)__B, - (__v16si)__W, (__mmask16)__U); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sra_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_psrad512_mask ((__v16si) __A, + (__v4si) __B, + (__v16si) __W, + (__mmask16) __U); } - -__funline __m512i _mm512_maskz_sra_epi32(__mmask16 __U, __m512i __A, - __m128i __B) { - return (__m512i)__builtin_ia32_psrad512_mask((__v16si)__A, (__v4si)__B, - (__v16si)_mm512_setzero_si512(), - (__mmask16)__U); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sra_epi32 (__mmask16 __U, __m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_psrad512_mask ((__v16si) __A, + (__v4si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); } - #ifdef __OPTIMIZE__ -__funline __m128d _mm_add_round_sd(__m128d __A, __m128d __B, const int __R) { - return (__m128d)__builtin_ia32_addsd_round((__v2df)__A, (__v2df)__B, __R); +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_round_sd (__m128d __A, __m128d __B, const int __R) +{ + return (__m128d) __builtin_ia32_addsd_round ((__v2df) __A, + (__v2df) __B, + __R); } - -__funline __m128d _mm_mask_add_round_sd(__m128d __W, __mmask8 __U, __m128d __A, - __m128d __B, const int __R) { - return (__m128d)__builtin_ia32_addsd_mask_round( - (__v2df)__A, (__v2df)__B, (__v2df)__W, (__mmask8)__U, __R); +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_add_round_sd (__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B, const int __R) +{ + return (__m128d) __builtin_ia32_addsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U, __R); } - -__funline __m128d 
_mm_maskz_add_round_sd(__mmask8 __U, __m128d __A, __m128d __B, - const int __R) { - return (__m128d)__builtin_ia32_addsd_mask_round( - (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U, __R); +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_add_round_sd (__mmask8 __U, __m128d __A, __m128d __B, + const int __R) +{ + return (__m128d) __builtin_ia32_addsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U, __R); } - -__funline __m128 _mm_add_round_ss(__m128 __A, __m128 __B, const int __R) { - return (__m128)__builtin_ia32_addss_round((__v4sf)__A, (__v4sf)__B, __R); +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_round_ss (__m128 __A, __m128 __B, const int __R) +{ + return (__m128) __builtin_ia32_addss_round ((__v4sf) __A, + (__v4sf) __B, + __R); } - -__funline __m128 _mm_mask_add_round_ss(__m128 __W, __mmask8 __U, __m128 __A, - __m128 __B, const int __R) { - return (__m128)__builtin_ia32_addss_mask_round( - (__v4sf)__A, (__v4sf)__B, (__v4sf)__W, (__mmask8)__U, __R); +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_add_round_ss (__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B, const int __R) +{ + return (__m128) __builtin_ia32_addss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U, __R); } - -__funline __m128 _mm_maskz_add_round_ss(__mmask8 __U, __m128 __A, __m128 __B, - const int __R) { - return (__m128)__builtin_ia32_addss_mask_round( - (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U, __R); +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_add_round_ss (__mmask8 __U, __m128 __A, __m128 __B, + const int __R) +{ + return (__m128) __builtin_ia32_addss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U, __R); } - -__funline __m128d _mm_sub_round_sd(__m128d __A, __m128d __B, const int __R) { - return (__m128d)__builtin_ia32_subsd_round((__v2df)__A, (__v2df)__B, __R); +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_round_sd (__m128d __A, __m128d __B, const int __R) +{ + return (__m128d) __builtin_ia32_subsd_round ((__v2df) __A, + (__v2df) __B, + __R); } - -__funline __m128d _mm_mask_sub_round_sd(__m128d __W, __mmask8 __U, __m128d __A, - __m128d __B, const int __R) { - return (__m128d)__builtin_ia32_subsd_mask_round( - (__v2df)__A, (__v2df)__B, (__v2df)__W, (__mmask8)__U, __R); +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sub_round_sd (__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B, const int __R) +{ + return (__m128d) __builtin_ia32_subsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U, __R); } - -__funline __m128d _mm_maskz_sub_round_sd(__mmask8 __U, __m128d __A, __m128d __B, - const int __R) { - return (__m128d)__builtin_ia32_subsd_mask_round( - (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U, __R); +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sub_round_sd (__mmask8 __U, __m128d __A, __m128d __B, + const int __R) +{ + return (__m128d) __builtin_ia32_subsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U, __R); } - -__funline __m128 _mm_sub_round_ss(__m128 __A, __m128 __B, const 
int __R) { - return (__m128)__builtin_ia32_subss_round((__v4sf)__A, (__v4sf)__B, __R); +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_round_ss (__m128 __A, __m128 __B, const int __R) +{ + return (__m128) __builtin_ia32_subss_round ((__v4sf) __A, + (__v4sf) __B, + __R); } - -__funline __m128 _mm_mask_sub_round_ss(__m128 __W, __mmask8 __U, __m128 __A, - __m128 __B, const int __R) { - return (__m128)__builtin_ia32_subss_mask_round( - (__v4sf)__A, (__v4sf)__B, (__v4sf)__W, (__mmask8)__U, __R); +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sub_round_ss (__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B, const int __R) +{ + return (__m128) __builtin_ia32_subss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U, __R); } - -__funline __m128 _mm_maskz_sub_round_ss(__mmask8 __U, __m128 __A, __m128 __B, - const int __R) { - return (__m128)__builtin_ia32_subss_mask_round( - (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U, __R); -} - -#else -#define _mm_add_round_sd(A, B, C) (__m128d) __builtin_ia32_addsd_round(A, B, C) - -#define _mm_mask_add_round_sd(W, U, A, B, C) \ - (__m128d) __builtin_ia32_addsd_mask_round(A, B, W, U, C) - -#define _mm_maskz_add_round_sd(U, A, B, C) \ - (__m128d) \ - __builtin_ia32_addsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C) - -#define _mm_add_round_ss(A, B, C) (__m128) __builtin_ia32_addss_round(A, B, C) - -#define _mm_mask_add_round_ss(W, U, A, B, C) \ - (__m128) __builtin_ia32_addss_mask_round(A, B, W, U, C) - -#define _mm_maskz_add_round_ss(U, A, B, C) \ - (__m128) __builtin_ia32_addss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C) - -#define _mm_sub_round_sd(A, B, C) (__m128d) __builtin_ia32_subsd_round(A, B, C) - -#define _mm_mask_sub_round_sd(W, U, A, B, C) \ - (__m128d) __builtin_ia32_subsd_mask_round(A, B, W, U, C) - -#define _mm_maskz_sub_round_sd(U, A, B, C) \ - (__m128d) \ - __builtin_ia32_subsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C) - -#define _mm_sub_round_ss(A, B, C) (__m128) __builtin_ia32_subss_round(A, B, C) - -#define _mm_mask_sub_round_ss(W, U, A, B, C) \ - (__m128) __builtin_ia32_subss_mask_round(A, B, W, U, C) - -#define _mm_maskz_sub_round_ss(U, A, B, C) \ - (__m128) __builtin_ia32_subss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C) - -#endif - -#ifdef __OPTIMIZE__ -__funline __m512i _mm512_ternarylogic_epi64(__m512i __A, __m512i __B, __m512i __C, - const int __imm) { - return (__m512i)__builtin_ia32_pternlogq512_mask( - (__v8di)__A, (__v8di)__B, (__v8di)__C, __imm, (__mmask8)-1); -} - -__funline __m512i _mm512_mask_ternarylogic_epi64(__m512i __A, __mmask8 __U, - __m512i __B, __m512i __C, - const int __imm) { - return (__m512i)__builtin_ia32_pternlogq512_mask( - (__v8di)__A, (__v8di)__B, (__v8di)__C, __imm, (__mmask8)__U); -} - -__funline __m512i _mm512_maskz_ternarylogic_epi64(__mmask8 __U, __m512i __A, - __m512i __B, __m512i __C, - const int __imm) { - return (__m512i)__builtin_ia32_pternlogq512_maskz( - (__v8di)__A, (__v8di)__B, (__v8di)__C, __imm, (__mmask8)__U); -} - -__funline __m512i _mm512_ternarylogic_epi32(__m512i __A, __m512i __B, __m512i __C, - const int __imm) { - return (__m512i)__builtin_ia32_pternlogd512_mask( - (__v16si)__A, (__v16si)__B, (__v16si)__C, __imm, (__mmask16)-1); -} - -__funline __m512i _mm512_mask_ternarylogic_epi32(__m512i __A, __mmask16 __U, - __m512i __B, __m512i __C, - const int __imm) { - return (__m512i)__builtin_ia32_pternlogd512_mask( - 
(__v16si)__A, (__v16si)__B, (__v16si)__C, __imm, (__mmask16)__U); -} - -__funline __m512i _mm512_maskz_ternarylogic_epi32(__mmask16 __U, __m512i __A, - __m512i __B, __m512i __C, - const int __imm) { - return (__m512i)__builtin_ia32_pternlogd512_maskz( - (__v16si)__A, (__v16si)__B, (__v16si)__C, __imm, (__mmask16)__U); +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sub_round_ss (__mmask8 __U, __m128 __A, __m128 __B, + const int __R) +{ + return (__m128) __builtin_ia32_subss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U, __R); } #else -#define _mm512_ternarylogic_epi64(A, B, C, I) \ - ((__m512i)__builtin_ia32_pternlogq512_mask( \ - (__v8di)(__m512i)(A), (__v8di)(__m512i)(B), (__v8di)(__m512i)(C), \ - (int)(I), (__mmask8)-1)) -#define _mm512_mask_ternarylogic_epi64(A, U, B, C, I) \ - ((__m512i)__builtin_ia32_pternlogq512_mask( \ - (__v8di)(__m512i)(A), (__v8di)(__m512i)(B), (__v8di)(__m512i)(C), \ - (int)(I), (__mmask8)(U))) -#define _mm512_maskz_ternarylogic_epi64(U, A, B, C, I) \ - ((__m512i)__builtin_ia32_pternlogq512_maskz( \ - (__v8di)(__m512i)(A), (__v8di)(__m512i)(B), (__v8di)(__m512i)(C), \ - (int)(I), (__mmask8)(U))) -#define _mm512_ternarylogic_epi32(A, B, C, I) \ - ((__m512i)__builtin_ia32_pternlogd512_mask( \ - (__v16si)(__m512i)(A), (__v16si)(__m512i)(B), (__v16si)(__m512i)(C), \ - (int)(I), (__mmask16)-1)) -#define _mm512_mask_ternarylogic_epi32(A, U, B, C, I) \ - ((__m512i)__builtin_ia32_pternlogd512_mask( \ - (__v16si)(__m512i)(A), (__v16si)(__m512i)(B), (__v16si)(__m512i)(C), \ - (int)(I), (__mmask16)(U))) -#define _mm512_maskz_ternarylogic_epi32(U, A, B, C, I) \ - ((__m512i)__builtin_ia32_pternlogd512_maskz( \ - (__v16si)(__m512i)(A), (__v16si)(__m512i)(B), (__v16si)(__m512i)(C), \ - (int)(I), (__mmask16)(U))) +#define _mm_add_round_sd(A, B, C) (__m128d)__builtin_ia32_addsd_round(A, B, C) +#define _mm_mask_add_round_sd(W, U, A, B, C) (__m128d)__builtin_ia32_addsd_mask_round(A, B, W, U, C) +#define _mm_maskz_add_round_sd(U, A, B, C) (__m128d)__builtin_ia32_addsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C) +#define _mm_add_round_ss(A, B, C) (__m128)__builtin_ia32_addss_round(A, B, C) +#define _mm_mask_add_round_ss(W, U, A, B, C) (__m128)__builtin_ia32_addss_mask_round(A, B, W, U, C) +#define _mm_maskz_add_round_ss(U, A, B, C) (__m128)__builtin_ia32_addss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C) +#define _mm_sub_round_sd(A, B, C) (__m128d)__builtin_ia32_subsd_round(A, B, C) +#define _mm_mask_sub_round_sd(W, U, A, B, C) (__m128d)__builtin_ia32_subsd_mask_round(A, B, W, U, C) +#define _mm_maskz_sub_round_sd(U, A, B, C) (__m128d)__builtin_ia32_subsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C) +#define _mm_sub_round_ss(A, B, C) (__m128)__builtin_ia32_subss_round(A, B, C) +#define _mm_mask_sub_round_ss(W, U, A, B, C) (__m128)__builtin_ia32_subss_mask_round(A, B, W, U, C) +#define _mm_maskz_sub_round_ss(U, A, B, C) (__m128)__builtin_ia32_subss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C) #endif - -__funline __m512d _mm512_rcp14_pd(__m512d __A) { - return (__m512d)__builtin_ia32_rcp14pd512_mask( - (__v8df)__A, (__v8df)_mm512_undefined_pd(), (__mmask8)-1); -} - -__funline __m512d _mm512_mask_rcp14_pd(__m512d __W, __mmask8 __U, __m512d __A) { - return (__m512d)__builtin_ia32_rcp14pd512_mask((__v8df)__A, (__v8df)__W, - (__mmask8)__U); -} - -__funline __m512d _mm512_maskz_rcp14_pd(__mmask8 __U, __m512d __A) { - return (__m512d)__builtin_ia32_rcp14pd512_mask( - 
(__v8df)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U); -} - -__funline __m512 _mm512_rcp14_ps(__m512 __A) { - return (__m512)__builtin_ia32_rcp14ps512_mask( - (__v16sf)__A, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1); -} - -__funline __m512 _mm512_mask_rcp14_ps(__m512 __W, __mmask16 __U, __m512 __A) { - return (__m512)__builtin_ia32_rcp14ps512_mask((__v16sf)__A, (__v16sf)__W, - (__mmask16)__U); -} - -__funline __m512 _mm512_maskz_rcp14_ps(__mmask16 __U, __m512 __A) { - return (__m512)__builtin_ia32_rcp14ps512_mask( - (__v16sf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U); -} - -__funline __m128d _mm_rcp14_sd(__m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_rcp14sd((__v2df)__B, (__v2df)__A); -} - -__funline __m128d _mm_mask_rcp14_sd(__m128d __W, __mmask8 __U, __m128d __A, - __m128d __B) { - return (__m128d)__builtin_ia32_rcp14sd_mask((__v2df)__B, (__v2df)__A, - (__v2df)__W, (__mmask8)__U); -} - -__funline __m128d _mm_maskz_rcp14_sd(__mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_rcp14sd_mask( - (__v2df)__B, (__v2df)__A, (__v2df)_mm_setzero_ps(), (__mmask8)__U); -} - -__funline __m128 _mm_rcp14_ss(__m128 __A, __m128 __B) { - return (__m128)__builtin_ia32_rcp14ss((__v4sf)__B, (__v4sf)__A); -} - -__funline __m128 _mm_mask_rcp14_ss(__m128 __W, __mmask8 __U, __m128 __A, - __m128 __B) { - return (__m128)__builtin_ia32_rcp14ss_mask((__v4sf)__B, (__v4sf)__A, - (__v4sf)__W, (__mmask8)__U); -} - -__funline __m128 _mm_maskz_rcp14_ss(__mmask8 __U, __m128 __A, __m128 __B) { - return (__m128)__builtin_ia32_rcp14ss_mask( - (__v4sf)__B, (__v4sf)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); -} - -__funline __m512d _mm512_rsqrt14_pd(__m512d __A) { - return (__m512d)__builtin_ia32_rsqrt14pd512_mask( - (__v8df)__A, (__v8df)_mm512_undefined_pd(), (__mmask8)-1); -} - -__funline __m512d _mm512_mask_rsqrt14_pd(__m512d __W, __mmask8 __U, __m512d __A) { - return (__m512d)__builtin_ia32_rsqrt14pd512_mask((__v8df)__A, (__v8df)__W, - (__mmask8)__U); -} - -__funline __m512d _mm512_maskz_rsqrt14_pd(__mmask8 __U, __m512d __A) { - return (__m512d)__builtin_ia32_rsqrt14pd512_mask( - (__v8df)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U); -} - -__funline __m512 _mm512_rsqrt14_ps(__m512 __A) { - return (__m512)__builtin_ia32_rsqrt14ps512_mask( - (__v16sf)__A, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1); -} - -__funline __m512 _mm512_mask_rsqrt14_ps(__m512 __W, __mmask16 __U, __m512 __A) { - return (__m512)__builtin_ia32_rsqrt14ps512_mask((__v16sf)__A, (__v16sf)__W, - (__mmask16)__U); -} - -__funline __m512 _mm512_maskz_rsqrt14_ps(__mmask16 __U, __m512 __A) { - return (__m512)__builtin_ia32_rsqrt14ps512_mask( - (__v16sf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U); -} - -__funline __m128d _mm_rsqrt14_sd(__m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_rsqrt14sd((__v2df)__B, (__v2df)__A); -} - -__funline __m128d _mm_mask_rsqrt14_sd(__m128d __W, __mmask8 __U, __m128d __A, - __m128d __B) { - return (__m128d)__builtin_ia32_rsqrt14sd_mask((__v2df)__B, (__v2df)__A, - (__v2df)__W, (__mmask8)__U); -} - -__funline __m128d _mm_maskz_rsqrt14_sd(__mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_rsqrt14sd_mask( - (__v2df)__B, (__v2df)__A, (__v2df)_mm_setzero_pd(), (__mmask8)__U); -} - -__funline __m128 _mm_rsqrt14_ss(__m128 __A, __m128 __B) { - return (__m128)__builtin_ia32_rsqrt14ss((__v4sf)__B, (__v4sf)__A); -} - -__funline __m128 _mm_mask_rsqrt14_ss(__m128 __W, __mmask8 __U, __m128 __A, - __m128 __B) { - return 
(__m128)__builtin_ia32_rsqrt14ss_mask((__v4sf)__B, (__v4sf)__A, - (__v4sf)__W, (__mmask8)__U); -} - -__funline __m128 _mm_maskz_rsqrt14_ss(__mmask8 __U, __m128 __A, __m128 __B) { - return (__m128)__builtin_ia32_rsqrt14ss_mask( - (__v4sf)__B, (__v4sf)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); -} - #ifdef __OPTIMIZE__ -__funline __m512d _mm512_sqrt_round_pd(__m512d __A, const int __R) { - return (__m512d)__builtin_ia32_sqrtpd512_mask( - (__v8df)__A, (__v8df)_mm512_undefined_pd(), (__mmask8)-1, __R); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_ternarylogic_epi64 (__m512i __A, __m512i __B, __m512i __C, + const int __imm) +{ + return (__m512i) __builtin_ia32_pternlogq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) __C, __imm, + (__mmask8) -1); } - -__funline __m512d _mm512_mask_sqrt_round_pd(__m512d __W, __mmask8 __U, - __m512d __A, const int __R) { - return (__m512d)__builtin_ia32_sqrtpd512_mask((__v8df)__A, (__v8df)__W, - (__mmask8)__U, __R); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_ternarylogic_epi64 (__m512i __A, __mmask8 __U, __m512i __B, + __m512i __C, const int __imm) +{ + return (__m512i) __builtin_ia32_pternlogq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) __C, __imm, + (__mmask8) __U); } - -__funline __m512d _mm512_maskz_sqrt_round_pd(__mmask8 __U, __m512d __A, - const int __R) { - return (__m512d)__builtin_ia32_sqrtpd512_mask( - (__v8df)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, __R); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_ternarylogic_epi64 (__mmask8 __U, __m512i __A, __m512i __B, + __m512i __C, const int __imm) +{ + return (__m512i) __builtin_ia32_pternlogq512_maskz ((__v8di) __A, + (__v8di) __B, + (__v8di) __C, + __imm, (__mmask8) __U); } - -__funline __m512 _mm512_sqrt_round_ps(__m512 __A, const int __R) { - return (__m512)__builtin_ia32_sqrtps512_mask( - (__v16sf)__A, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, __R); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_ternarylogic_epi32 (__m512i __A, __m512i __B, __m512i __C, + const int __imm) +{ + return (__m512i) __builtin_ia32_pternlogd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __C, + __imm, (__mmask16) -1); } - -__funline __m512 _mm512_mask_sqrt_round_ps(__m512 __W, __mmask16 __U, __m512 __A, - const int __R) { - return (__m512)__builtin_ia32_sqrtps512_mask((__v16sf)__A, (__v16sf)__W, - (__mmask16)__U, __R); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_ternarylogic_epi32 (__m512i __A, __mmask16 __U, __m512i __B, + __m512i __C, const int __imm) +{ + return (__m512i) __builtin_ia32_pternlogd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __C, + __imm, (__mmask16) __U); } - -__funline __m512 _mm512_maskz_sqrt_round_ps(__mmask16 __U, __m512 __A, - const int __R) { - return (__m512)__builtin_ia32_sqrtps512_mask( - (__v16sf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U, __R); -} - -__funline __m128d _mm_sqrt_round_sd(__m128d __A, __m128d __B, const int __R) { - return (__m128d)__builtin_ia32_sqrtsd_mask_round( - (__v2df)__B, (__v2df)__A, (__v2df)_mm_setzero_pd(), (__mmask8)-1, __R); -} - -__funline __m128d _mm_mask_sqrt_round_sd(__m128d __W, __mmask8 __U, __m128d __A, - __m128d __B, const int __R) { - return (__m128d)__builtin_ia32_sqrtsd_mask_round( - (__v2df)__B, (__v2df)__A, 
(__v2df)__W, (__mmask8)__U, __R); -} - -__funline __m128d _mm_maskz_sqrt_round_sd(__mmask8 __U, __m128d __A, __m128d __B, - const int __R) { - return (__m128d)__builtin_ia32_sqrtsd_mask_round( - (__v2df)__B, (__v2df)__A, (__v2df)_mm_setzero_pd(), (__mmask8)__U, __R); -} - -__funline __m128 _mm_sqrt_round_ss(__m128 __A, __m128 __B, const int __R) { - return (__m128)__builtin_ia32_sqrtss_mask_round( - (__v4sf)__B, (__v4sf)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)-1, __R); -} - -__funline __m128 _mm_mask_sqrt_round_ss(__m128 __W, __mmask8 __U, __m128 __A, - __m128 __B, const int __R) { - return (__m128)__builtin_ia32_sqrtss_mask_round( - (__v4sf)__B, (__v4sf)__A, (__v4sf)__W, (__mmask8)__U, __R); -} - -__funline __m128 _mm_maskz_sqrt_round_ss(__mmask8 __U, __m128 __A, __m128 __B, - const int __R) { - return (__m128)__builtin_ia32_sqrtss_mask_round( - (__v4sf)__B, (__v4sf)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U, __R); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_ternarylogic_epi32 (__mmask16 __U, __m512i __A, __m512i __B, + __m512i __C, const int __imm) +{ + return (__m512i) __builtin_ia32_pternlogd512_maskz ((__v16si) __A, + (__v16si) __B, + (__v16si) __C, + __imm, (__mmask16) __U); } #else -#define _mm512_sqrt_round_pd(A, C) \ - (__m512d) \ - __builtin_ia32_sqrtpd512_mask(A, (__v8df)_mm512_undefined_pd(), -1, C) - -#define _mm512_mask_sqrt_round_pd(W, U, A, C) \ - (__m512d) __builtin_ia32_sqrtpd512_mask(A, W, U, C) - -#define _mm512_maskz_sqrt_round_pd(U, A, C) \ - (__m512d) __builtin_ia32_sqrtpd512_mask(A, (__v8df)_mm512_setzero_pd(), U, C) - -#define _mm512_sqrt_round_ps(A, C) \ - (__m512) \ - __builtin_ia32_sqrtps512_mask(A, (__v16sf)_mm512_undefined_ps(), -1, C) - -#define _mm512_mask_sqrt_round_ps(W, U, A, C) \ - (__m512) __builtin_ia32_sqrtps512_mask(A, W, U, C) - -#define _mm512_maskz_sqrt_round_ps(U, A, C) \ - (__m512) __builtin_ia32_sqrtps512_mask(A, (__v16sf)_mm512_setzero_ps(), U, C) - -#define _mm_sqrt_round_sd(A, B, C) \ - (__m128d) \ - __builtin_ia32_sqrtsd_mask_round(B, A, (__v2df)_mm_setzero_pd(), -1, C) - -#define _mm_mask_sqrt_round_sd(W, U, A, B, C) \ - (__m128d) __builtin_ia32_sqrtsd_mask_round(B, A, W, U, C) - -#define _mm_maskz_sqrt_round_sd(U, A, B, C) \ - (__m128d) \ - __builtin_ia32_sqrtsd_mask_round(B, A, (__v2df)_mm_setzero_pd(), U, C) - -#define _mm_sqrt_round_ss(A, B, C) \ - (__m128) \ - __builtin_ia32_sqrtss_mask_round(B, A, (__v4sf)_mm_setzero_ps(), -1, C) - -#define _mm_mask_sqrt_round_ss(W, U, A, B, C) \ - (__m128) __builtin_ia32_sqrtss_mask_round(B, A, W, U, C) - -#define _mm_maskz_sqrt_round_ss(U, A, B, C) \ - (__m128) \ - __builtin_ia32_sqrtss_mask_round(B, A, (__v4sf)_mm_setzero_ps(), U, C) +#define _mm512_ternarylogic_epi64(A, B, C, I) ((__m512i) __builtin_ia32_pternlogq512_mask ((__v8di)(__m512i)(A), (__v8di)(__m512i)(B), (__v8di)(__m512i)(C), (int)(I), (__mmask8)-1)) +#define _mm512_mask_ternarylogic_epi64(A, U, B, C, I) ((__m512i) __builtin_ia32_pternlogq512_mask ((__v8di)(__m512i)(A), (__v8di)(__m512i)(B), (__v8di)(__m512i)(C), (int)(I), (__mmask8)(U))) +#define _mm512_maskz_ternarylogic_epi64(U, A, B, C, I) ((__m512i) __builtin_ia32_pternlogq512_maskz ((__v8di)(__m512i)(A), (__v8di)(__m512i)(B), (__v8di)(__m512i)(C), (int)(I), (__mmask8)(U))) +#define _mm512_ternarylogic_epi32(A, B, C, I) ((__m512i) __builtin_ia32_pternlogd512_mask ((__v16si)(__m512i)(A), (__v16si)(__m512i)(B), (__v16si)(__m512i)(C), (int)(I), (__mmask16)-1)) +#define _mm512_mask_ternarylogic_epi32(A, U, B, C, I) 
((__m512i) __builtin_ia32_pternlogd512_mask ((__v16si)(__m512i)(A), (__v16si)(__m512i)(B), (__v16si)(__m512i)(C), (int)(I), (__mmask16)(U))) +#define _mm512_maskz_ternarylogic_epi32(U, A, B, C, I) ((__m512i) __builtin_ia32_pternlogd512_maskz ((__v16si)(__m512i)(A), (__v16si)(__m512i)(B), (__v16si)(__m512i)(C), (int)(I), (__mmask16)(U))) #endif - -__funline __m512i _mm512_cvtepi8_epi32(__m128i __A) { - return (__m512i)__builtin_ia32_pmovsxbd512_mask( - (__v16qi)__A, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_rcp14_pd (__m512d __A) +{ + return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1); } - -__funline __m512i _mm512_mask_cvtepi8_epi32(__m512i __W, __mmask16 __U, - __m128i __A) { - return (__m512i)__builtin_ia32_pmovsxbd512_mask((__v16qi)__A, (__v16si)__W, - (__mmask16)__U); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_rcp14_pd (__m512d __W, __mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A, + (__v8df) __W, + (__mmask8) __U); } - -__funline __m512i _mm512_maskz_cvtepi8_epi32(__mmask16 __U, __m128i __A) { - return (__m512i)__builtin_ia32_pmovsxbd512_mask( - (__v16qi)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_rcp14_pd (__mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); } - -__funline __m512i _mm512_cvtepi8_epi64(__m128i __A) { - return (__m512i)__builtin_ia32_pmovsxbq512_mask( - (__v16qi)__A, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_rcp14_ps (__m512 __A) +{ + return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1); } - -__funline __m512i _mm512_mask_cvtepi8_epi64(__m512i __W, __mmask8 __U, - __m128i __A) { - return (__m512i)__builtin_ia32_pmovsxbq512_mask((__v16qi)__A, (__v8di)__W, - (__mmask8)__U); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_rcp14_ps (__m512 __W, __mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) __U); } - -__funline __m512i _mm512_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A) { - return (__m512i)__builtin_ia32_pmovsxbq512_mask( - (__v16qi)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_rcp14_ps (__mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); } - -__funline __m512i _mm512_cvtepi16_epi32(__m256i __A) { - return (__m512i)__builtin_ia32_pmovsxwd512_mask( - (__v16hi)__A, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1); +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rcp14_sd (__m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_rcp14sd ((__v2df) __B, + (__v2df) __A); } - -__funline __m512i _mm512_mask_cvtepi16_epi32(__m512i __W, __mmask16 __U, - __m256i __A) { - return 
(__m512i)__builtin_ia32_pmovsxwd512_mask((__v16hi)__A, (__v16si)__W, - (__mmask16)__U); +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_rcp14_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_rcp14sd_mask ((__v2df) __B, + (__v2df) __A, + (__v2df) __W, + (__mmask8) __U); } - -__funline __m512i _mm512_maskz_cvtepi16_epi32(__mmask16 __U, __m256i __A) { - return (__m512i)__builtin_ia32_pmovsxwd512_mask( - (__v16hi)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_rcp14_sd (__mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_rcp14sd_mask ((__v2df) __B, + (__v2df) __A, + (__v2df) _mm_setzero_ps (), + (__mmask8) __U); } - -__funline __m512i _mm512_cvtepi16_epi64(__m128i __A) { - return (__m512i)__builtin_ia32_pmovsxwq512_mask( - (__v8hi)__A, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rcp14_ss (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_rcp14ss ((__v4sf) __B, + (__v4sf) __A); } - -__funline __m512i _mm512_mask_cvtepi16_epi64(__m512i __W, __mmask8 __U, - __m128i __A) { - return (__m512i)__builtin_ia32_pmovsxwq512_mask((__v8hi)__A, (__v8di)__W, - (__mmask8)__U); +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_rcp14_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __B, + (__v4sf) __A, + (__v4sf) __W, + (__mmask8) __U); } - -__funline __m512i _mm512_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A) { - return (__m512i)__builtin_ia32_pmovsxwq512_mask( - (__v8hi)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_rcp14_ss (__mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __B, + (__v4sf) __A, + (__v4sf) _mm_setzero_ps (), + (__mmask8) __U); } - -__funline __m512i _mm512_cvtepi32_epi64(__m256i __X) { - return (__m512i)__builtin_ia32_pmovsxdq512_mask( - (__v8si)__X, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_rsqrt14_pd (__m512d __A) +{ + return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1); } - -__funline __m512i _mm512_mask_cvtepi32_epi64(__m512i __W, __mmask8 __U, - __m256i __X) { - return (__m512i)__builtin_ia32_pmovsxdq512_mask((__v8si)__X, (__v8di)__W, - (__mmask8)__U); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_rsqrt14_pd (__m512d __W, __mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A, + (__v8df) __W, + (__mmask8) __U); } - -__funline __m512i _mm512_maskz_cvtepi32_epi64(__mmask8 __U, __m256i __X) { - return (__m512i)__builtin_ia32_pmovsxdq512_mask( - (__v8si)__X, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_rsqrt14_pd (__mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); } - -__funline __m512i 
_mm512_cvtepu8_epi32(__m128i __A) { - return (__m512i)__builtin_ia32_pmovzxbd512_mask( - (__v16qi)__A, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_rsqrt14_ps (__m512 __A) +{ + return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1); } - -__funline __m512i _mm512_mask_cvtepu8_epi32(__m512i __W, __mmask16 __U, - __m128i __A) { - return (__m512i)__builtin_ia32_pmovzxbd512_mask((__v16qi)__A, (__v16si)__W, - (__mmask16)__U); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_rsqrt14_ps (__m512 __W, __mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) __U); } - -__funline __m512i _mm512_maskz_cvtepu8_epi32(__mmask16 __U, __m128i __A) { - return (__m512i)__builtin_ia32_pmovzxbd512_mask( - (__v16qi)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_rsqrt14_ps (__mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); } - -__funline __m512i _mm512_cvtepu8_epi64(__m128i __A) { - return (__m512i)__builtin_ia32_pmovzxbq512_mask( - (__v16qi)__A, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rsqrt14_sd (__m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_rsqrt14sd ((__v2df) __B, + (__v2df) __A); } - -__funline __m512i _mm512_mask_cvtepu8_epi64(__m512i __W, __mmask8 __U, - __m128i __A) { - return (__m512i)__builtin_ia32_pmovzxbq512_mask((__v16qi)__A, (__v8di)__W, - (__mmask8)__U); +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_rsqrt14_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_rsqrt14sd_mask ((__v2df) __B, + (__v2df) __A, + (__v2df) __W, + (__mmask8) __U); } - -__funline __m512i _mm512_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A) { - return (__m512i)__builtin_ia32_pmovzxbq512_mask( - (__v16qi)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_rsqrt14_sd (__mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_rsqrt14sd_mask ((__v2df) __B, + (__v2df) __A, + (__v2df) _mm_setzero_pd (), + (__mmask8) __U); } - -__funline __m512i _mm512_cvtepu16_epi32(__m256i __A) { - return (__m512i)__builtin_ia32_pmovzxwd512_mask( - (__v16hi)__A, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1); +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rsqrt14_ss (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_rsqrt14ss ((__v4sf) __B, + (__v4sf) __A); } - -__funline __m512i _mm512_mask_cvtepu16_epi32(__m512i __W, __mmask16 __U, - __m256i __A) { - return (__m512i)__builtin_ia32_pmovzxwd512_mask((__v16hi)__A, (__v16si)__W, - (__mmask16)__U); +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_rsqrt14_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __B, + (__v4sf) __A, + (__v4sf) __W, + (__mmask8) __U); } - 
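The rcp14/rsqrt14 intrinsics rewritten above return reciprocal and reciprocal-square-root approximations whose relative error is bounded by 2^-14; when more precision is needed, the usual refinement is one Newton-Raphson step, which roughly doubles the number of accurate bits. A hedged sketch of that pattern — illustrative only, not part of the patch, assuming -mavx512f (the fused-multiply intrinsic used here is part of AVX-512F):

    #include <immintrin.h>

    /* One Newton-Raphson step on top of rcp14: x' = x * (2 - a*x). */
    __m512 fast_recip(__m512 a)
    {
      __m512 x   = _mm512_rcp14_ps(a);                   /* |err| < 2^-14 */
      __m512 two = _mm512_set1_ps(2.0f);
      return _mm512_mul_ps(x, _mm512_fnmadd_ps(a, x, two)); /* 2 - a*x */
    }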
-__funline __m512i _mm512_maskz_cvtepu16_epi32(__mmask16 __U, __m256i __A) { - return (__m512i)__builtin_ia32_pmovzxwd512_mask( - (__v16hi)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_rsqrt14_ss (__mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __B, + (__v4sf) __A, + (__v4sf) _mm_setzero_ps (), + (__mmask8) __U); } - -__funline __m512i _mm512_cvtepu16_epi64(__m128i __A) { - return (__m512i)__builtin_ia32_pmovzxwq512_mask( - (__v8hi)__A, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); -} - -__funline __m512i _mm512_mask_cvtepu16_epi64(__m512i __W, __mmask8 __U, - __m128i __A) { - return (__m512i)__builtin_ia32_pmovzxwq512_mask((__v8hi)__A, (__v8di)__W, - (__mmask8)__U); -} - -__funline __m512i _mm512_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A) { - return (__m512i)__builtin_ia32_pmovzxwq512_mask( - (__v8hi)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); -} - -__funline __m512i _mm512_cvtepu32_epi64(__m256i __X) { - return (__m512i)__builtin_ia32_pmovzxdq512_mask( - (__v8si)__X, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); -} - -__funline __m512i _mm512_mask_cvtepu32_epi64(__m512i __W, __mmask8 __U, - __m256i __X) { - return (__m512i)__builtin_ia32_pmovzxdq512_mask((__v8si)__X, (__v8di)__W, - (__mmask8)__U); -} - -__funline __m512i _mm512_maskz_cvtepu32_epi64(__mmask8 __U, __m256i __X) { - return (__m512i)__builtin_ia32_pmovzxdq512_mask( - (__v8si)__X, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); -} - #ifdef __OPTIMIZE__ -__funline __m512d _mm512_add_round_pd(__m512d __A, __m512d __B, const int __R) { - return (__m512d)__builtin_ia32_addpd512_mask((__v8df)__A, (__v8df)__B, - (__v8df)_mm512_undefined_pd(), - (__mmask8)-1, __R); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sqrt_round_pd (__m512d __A, const int __R) +{ + return (__m512d) __builtin_ia32_sqrtpd512_mask ((__v8df) __A, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1, __R); } - -__funline __m512d _mm512_mask_add_round_pd(__m512d __W, __mmask8 __U, __m512d __A, - __m512d __B, const int __R) { - return (__m512d)__builtin_ia32_addpd512_mask((__v8df)__A, (__v8df)__B, - (__v8df)__W, (__mmask8)__U, __R); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sqrt_round_pd (__m512d __W, __mmask8 __U, __m512d __A, + const int __R) +{ + return (__m512d) __builtin_ia32_sqrtpd512_mask ((__v8df) __A, + (__v8df) __W, + (__mmask8) __U, __R); } - -__funline __m512d _mm512_maskz_add_round_pd(__mmask8 __U, __m512d __A, - __m512d __B, const int __R) { - return (__m512d)__builtin_ia32_addpd512_mask((__v8df)__A, (__v8df)__B, - (__v8df)_mm512_setzero_pd(), - (__mmask8)__U, __R); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sqrt_round_pd (__mmask8 __U, __m512d __A, const int __R) +{ + return (__m512d) __builtin_ia32_sqrtpd512_mask ((__v8df) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, __R); } - -__funline __m512 _mm512_add_round_ps(__m512 __A, __m512 __B, const int __R) { - return (__m512)__builtin_ia32_addps512_mask((__v16sf)__A, (__v16sf)__B, - (__v16sf)_mm512_undefined_ps(), - (__mmask16)-1, __R); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sqrt_round_ps (__m512 __A, const int __R) +{ + return (__m512) __builtin_ia32_sqrtps512_mask 
((__v16sf) __A, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1, __R); } - -__funline __m512 _mm512_mask_add_round_ps(__m512 __W, __mmask16 __U, __m512 __A, - __m512 __B, const int __R) { - return (__m512)__builtin_ia32_addps512_mask( - (__v16sf)__A, (__v16sf)__B, (__v16sf)__W, (__mmask16)__U, __R); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sqrt_round_ps (__m512 __W, __mmask16 __U, __m512 __A, const int __R) +{ + return (__m512) __builtin_ia32_sqrtps512_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) __U, __R); } - -__funline __m512 _mm512_maskz_add_round_ps(__mmask16 __U, __m512 __A, __m512 __B, - const int __R) { - return (__m512)__builtin_ia32_addps512_mask((__v16sf)__A, (__v16sf)__B, - (__v16sf)_mm512_setzero_ps(), - (__mmask16)__U, __R); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sqrt_round_ps (__mmask16 __U, __m512 __A, const int __R) +{ + return (__m512) __builtin_ia32_sqrtps512_mask ((__v16sf) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, __R); } - -__funline __m512d _mm512_sub_round_pd(__m512d __A, __m512d __B, const int __R) { - return (__m512d)__builtin_ia32_subpd512_mask((__v8df)__A, (__v8df)__B, - (__v8df)_mm512_undefined_pd(), - (__mmask8)-1, __R); +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sqrt_round_sd (__m128d __A, __m128d __B, const int __R) +{ + return (__m128d) __builtin_ia32_sqrtsd_mask_round ((__v2df) __B, + (__v2df) __A, + (__v2df) + _mm_setzero_pd (), + (__mmask8) -1, __R); } - -__funline __m512d _mm512_mask_sub_round_pd(__m512d __W, __mmask8 __U, __m512d __A, - __m512d __B, const int __R) { - return (__m512d)__builtin_ia32_subpd512_mask((__v8df)__A, (__v8df)__B, - (__v8df)__W, (__mmask8)__U, __R); +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sqrt_round_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B, + const int __R) +{ + return (__m128d) __builtin_ia32_sqrtsd_mask_round ((__v2df) __B, + (__v2df) __A, + (__v2df) __W, + (__mmask8) __U, __R); } - -__funline __m512d _mm512_maskz_sub_round_pd(__mmask8 __U, __m512d __A, - __m512d __B, const int __R) { - return (__m512d)__builtin_ia32_subpd512_mask((__v8df)__A, (__v8df)__B, - (__v8df)_mm512_setzero_pd(), - (__mmask8)__U, __R); +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sqrt_round_sd (__mmask8 __U, __m128d __A, __m128d __B, const int __R) +{ + return (__m128d) __builtin_ia32_sqrtsd_mask_round ((__v2df) __B, + (__v2df) __A, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U, __R); } - -__funline __m512 _mm512_sub_round_ps(__m512 __A, __m512 __B, const int __R) { - return (__m512)__builtin_ia32_subps512_mask((__v16sf)__A, (__v16sf)__B, - (__v16sf)_mm512_undefined_ps(), - (__mmask16)-1, __R); +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sqrt_round_ss (__m128 __A, __m128 __B, const int __R) +{ + return (__m128) __builtin_ia32_sqrtss_mask_round ((__v4sf) __B, + (__v4sf) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) -1, __R); } - -__funline __m512 _mm512_mask_sub_round_ps(__m512 __W, __mmask16 __U, __m512 __A, - __m512 __B, const int __R) { - return (__m512)__builtin_ia32_subps512_mask( - (__v16sf)__A, (__v16sf)__B, (__v16sf)__W, (__mmask16)__U, __R); +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm_mask_sqrt_round_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B,
+            const int __R)
+{
+  return (__m128) __builtin_ia32_sqrtss_mask_round ((__v4sf) __B,
+      (__v4sf) __A,
+      (__v4sf) __W,
+      (__mmask8) __U, __R);
 }
-
-__funline __m512 _mm512_maskz_sub_round_ps(__mmask16 __U, __m512 __A, __m512 __B,
-                                           const int __R) {
-  return (__m512)__builtin_ia32_subps512_mask((__v16sf)__A, (__v16sf)__B,
-                                              (__v16sf)_mm512_setzero_ps(),
-                                              (__mmask16)__U, __R);
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sqrt_round_ss (__mmask8 __U, __m128 __A, __m128 __B, const int __R)
+{
+  return (__m128) __builtin_ia32_sqrtss_mask_round ((__v4sf) __B,
+      (__v4sf) __A,
+      (__v4sf)
+      _mm_setzero_ps (),
+      (__mmask8) __U, __R);
 }
 #else
-#define _mm512_add_round_pd(A, B, C) \
-  (__m512d) \
-      __builtin_ia32_addpd512_mask(A, B, (__v8df)_mm512_undefined_pd(), -1, C)
-
-#define _mm512_mask_add_round_pd(W, U, A, B, C) \
-  (__m512d) __builtin_ia32_addpd512_mask(A, B, W, U, C)
-
-#define _mm512_maskz_add_round_pd(U, A, B, C) \
-  (__m512d) \
-      __builtin_ia32_addpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, C)
-
-#define _mm512_add_round_ps(A, B, C) \
-  (__m512) __builtin_ia32_addps512_mask(A, B, (__v16sf)_mm512_undefined_ps(), \
-                                        -1, C)
-
-#define _mm512_mask_add_round_ps(W, U, A, B, C) \
-  (__m512) __builtin_ia32_addps512_mask(A, B, W, U, C)
-
-#define _mm512_maskz_add_round_ps(U, A, B, C) \
-  (__m512) \
-      __builtin_ia32_addps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, C)
-
-#define _mm512_sub_round_pd(A, B, C) \
-  (__m512d) \
-      __builtin_ia32_subpd512_mask(A, B, (__v8df)_mm512_undefined_pd(), -1, C)
-
-#define _mm512_mask_sub_round_pd(W, U, A, B, C) \
-  (__m512d) __builtin_ia32_subpd512_mask(A, B, W, U, C)
-
-#define _mm512_maskz_sub_round_pd(U, A, B, C) \
-  (__m512d) \
-      __builtin_ia32_subpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, C)
-
-#define _mm512_sub_round_ps(A, B, C) \
-  (__m512) __builtin_ia32_subps512_mask(A, B, (__v16sf)_mm512_undefined_ps(), \
-                                        -1, C)
-
-#define _mm512_mask_sub_round_ps(W, U, A, B, C) \
-  (__m512) __builtin_ia32_subps512_mask(A, B, W, U, C)
-
-#define _mm512_maskz_sub_round_ps(U, A, B, C) \
-  (__m512) \
-      __builtin_ia32_subps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, C)
+#define _mm512_sqrt_round_pd(A, C) (__m512d)__builtin_ia32_sqrtpd512_mask(A, (__v8df)_mm512_undefined_pd(), -1, C)
+#define _mm512_mask_sqrt_round_pd(W, U, A, C) (__m512d)__builtin_ia32_sqrtpd512_mask(A, W, U, C)
+#define _mm512_maskz_sqrt_round_pd(U, A, C) (__m512d)__builtin_ia32_sqrtpd512_mask(A, (__v8df)_mm512_setzero_pd(), U, C)
+#define _mm512_sqrt_round_ps(A, C) (__m512)__builtin_ia32_sqrtps512_mask(A, (__v16sf)_mm512_undefined_ps(), -1, C)
+#define _mm512_mask_sqrt_round_ps(W, U, A, C) (__m512)__builtin_ia32_sqrtps512_mask(A, W, U, C)
+#define _mm512_maskz_sqrt_round_ps(U, A, C) (__m512)__builtin_ia32_sqrtps512_mask(A, (__v16sf)_mm512_setzero_ps(), U, C)
+#define _mm_sqrt_round_sd(A, B, C) (__m128d)__builtin_ia32_sqrtsd_mask_round (B, A, (__v2df) _mm_setzero_pd (), -1, C)
+#define _mm_mask_sqrt_round_sd(W, U, A, B, C) (__m128d)__builtin_ia32_sqrtsd_mask_round (B, A, W, U, C)
+#define _mm_maskz_sqrt_round_sd(U, A, B, C) (__m128d)__builtin_ia32_sqrtsd_mask_round (B, A, (__v2df) _mm_setzero_pd (), U, C)
+#define _mm_sqrt_round_ss(A, B, C) (__m128)__builtin_ia32_sqrtss_mask_round (B, A, (__v4sf) _mm_setzero_ps (), -1, C)
+#define _mm_mask_sqrt_round_ss(W, U, A, B, C) (__m128)__builtin_ia32_sqrtss_mask_round (B, A, W, U, C)
+#define _mm_maskz_sqrt_round_ss(U, A, B, C) (__m128)__builtin_ia32_sqrtss_mask_round (B, A, (__v4sf) _mm_setzero_ps (), U, C)
 #endif
-
+#define _mm_mask_sqrt_sd(W, U, A, B) _mm_mask_sqrt_round_sd ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION)
+#define _mm_maskz_sqrt_sd(U, A, B) _mm_maskz_sqrt_round_sd ((U), (A), (B), _MM_FROUND_CUR_DIRECTION)
+#define _mm_mask_sqrt_ss(W, U, A, B) _mm_mask_sqrt_round_ss ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION)
+#define _mm_maskz_sqrt_ss(U, A, B) _mm_maskz_sqrt_round_ss ((U), (A), (B), _MM_FROUND_CUR_DIRECTION)
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepi8_epi32 (__m128i __A)
+{
+  return (__m512i) __builtin_ia32_pmovsxbd512_mask ((__v16qi) __A,
+      (__v16si)
+      _mm512_undefined_epi32 (),
+      (__mmask16) -1);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi8_epi32 (__m512i __W, __mmask16 __U, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_pmovsxbd512_mask ((__v16qi) __A,
+      (__v16si) __W,
+      (__mmask16) __U);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepi8_epi32 (__mmask16 __U, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_pmovsxbd512_mask ((__v16qi) __A,
+      (__v16si)
+      _mm512_setzero_si512 (),
+      (__mmask16) __U);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepi8_epi64 (__m128i __A)
+{
+  return (__m512i) __builtin_ia32_pmovsxbq512_mask ((__v16qi) __A,
+      (__v8di)
+      _mm512_undefined_epi32 (),
+      (__mmask8) -1);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi8_epi64 (__m512i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_pmovsxbq512_mask ((__v16qi) __A,
+      (__v8di) __W,
+      (__mmask8) __U);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepi8_epi64 (__mmask8 __U, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_pmovsxbq512_mask ((__v16qi) __A,
+      (__v8di)
+      _mm512_setzero_si512 (),
+      (__mmask8) __U);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepi16_epi32 (__m256i __A)
+{
+  return (__m512i) __builtin_ia32_pmovsxwd512_mask ((__v16hi) __A,
+      (__v16si)
+      _mm512_undefined_epi32 (),
+      (__mmask16) -1);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi16_epi32 (__m512i __W, __mmask16 __U, __m256i __A)
+{
+  return (__m512i) __builtin_ia32_pmovsxwd512_mask ((__v16hi) __A,
+      (__v16si) __W,
+      (__mmask16) __U);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepi16_epi32 (__mmask16 __U, __m256i __A)
+{
+  return (__m512i) __builtin_ia32_pmovsxwd512_mask ((__v16hi) __A,
+      (__v16si)
+      _mm512_setzero_si512 (),
+      (__mmask16) __U);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepi16_epi64 (__m128i __A)
+{
+  return (__m512i) __builtin_ia32_pmovsxwq512_mask ((__v8hi) __A,
+      (__v8di)
+      _mm512_undefined_epi32 (),
+      (__mmask8) -1);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi16_epi64 (__m512i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_pmovsxwq512_mask ((__v8hi) __A,
+      (__v8di) __W,
+      (__mmask8) __U);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepi16_epi64 (__mmask8 __U, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_pmovsxwq512_mask ((__v8hi) __A,
+      (__v8di)
+      _mm512_setzero_si512 (),
+      (__mmask8) __U);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepi32_epi64 (__m256i __X)
+{
+  return (__m512i) __builtin_ia32_pmovsxdq512_mask ((__v8si) __X,
+      (__v8di)
+      _mm512_undefined_epi32 (),
+      (__mmask8) -1);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi32_epi64 (__m512i __W, __mmask8 __U, __m256i __X)
+{
+  return (__m512i) __builtin_ia32_pmovsxdq512_mask ((__v8si) __X,
+      (__v8di) __W,
+      (__mmask8) __U);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepi32_epi64 (__mmask8 __U, __m256i __X)
+{
+  return (__m512i) __builtin_ia32_pmovsxdq512_mask ((__v8si) __X,
+      (__v8di)
+      _mm512_setzero_si512 (),
+      (__mmask8) __U);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepu8_epi32 (__m128i __A)
+{
+  return (__m512i) __builtin_ia32_pmovzxbd512_mask ((__v16qi) __A,
+      (__v16si)
+      _mm512_undefined_epi32 (),
+      (__mmask16) -1);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepu8_epi32 (__m512i __W, __mmask16 __U, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_pmovzxbd512_mask ((__v16qi) __A,
+      (__v16si) __W,
+      (__mmask16) __U);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepu8_epi32 (__mmask16 __U, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_pmovzxbd512_mask ((__v16qi) __A,
+      (__v16si)
+      _mm512_setzero_si512 (),
+      (__mmask16) __U);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepu8_epi64 (__m128i __A)
+{
+  return (__m512i) __builtin_ia32_pmovzxbq512_mask ((__v16qi) __A,
+      (__v8di)
+      _mm512_undefined_epi32 (),
+      (__mmask8) -1);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepu8_epi64 (__m512i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_pmovzxbq512_mask ((__v16qi) __A,
+      (__v8di) __W,
+      (__mmask8) __U);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepu8_epi64 (__mmask8 __U, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_pmovzxbq512_mask ((__v16qi) __A,
+      (__v8di)
+      _mm512_setzero_si512 (),
+      (__mmask8) __U);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepu16_epi32 (__m256i __A)
+{
+  return (__m512i) __builtin_ia32_pmovzxwd512_mask ((__v16hi) __A,
+      (__v16si)
+      _mm512_undefined_epi32 (),
+      (__mmask16) -1);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepu16_epi32 (__m512i __W, __mmask16 __U, __m256i __A)
+{
+  return (__m512i) __builtin_ia32_pmovzxwd512_mask ((__v16hi) __A,
+      (__v16si) __W,
+      (__mmask16) __U);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepu16_epi32 (__mmask16 __U, __m256i __A)
+{
+  return (__m512i) __builtin_ia32_pmovzxwd512_mask ((__v16hi) __A,
+      (__v16si)
+      _mm512_setzero_si512 (),
+      (__mmask16) __U);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepu16_epi64 (__m128i __A)
+{
+  return (__m512i) __builtin_ia32_pmovzxwq512_mask ((__v8hi) __A,
+      (__v8di)
+      _mm512_undefined_epi32 (),
+      (__mmask8) -1);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepu16_epi64 (__m512i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_pmovzxwq512_mask ((__v8hi) __A,
+      (__v8di) __W,
+      (__mmask8) __U);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepu16_epi64 (__mmask8 __U, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_pmovzxwq512_mask ((__v8hi) __A,
+      (__v8di)
+      _mm512_setzero_si512 (),
+      (__mmask8) __U);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepu32_epi64 (__m256i __X)
+{
+  return (__m512i) __builtin_ia32_pmovzxdq512_mask ((__v8si) __X,
+      (__v8di)
+      _mm512_undefined_epi32 (),
+      (__mmask8) -1);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepu32_epi64 (__m512i __W, __mmask8 __U, __m256i __X)
+{
+  return (__m512i) __builtin_ia32_pmovzxdq512_mask ((__v8si) __X,
+      (__v8di) __W,
+      (__mmask8) __U);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepu32_epi64 (__mmask8 __U, __m256i __X)
+{
+  return (__m512i) __builtin_ia32_pmovzxdq512_mask ((__v8si) __X,
+      (__v8di)
+      _mm512_setzero_si512 (),
+      (__mmask8) __U);
+}
 #ifdef __OPTIMIZE__
-__funline __m512d _mm512_mul_round_pd(__m512d __A, __m512d __B, const int __R) {
-  return (__m512d)__builtin_ia32_mulpd512_mask((__v8df)__A, (__v8df)__B,
-                                               (__v8df)_mm512_undefined_pd(),
-                                               (__mmask8)-1, __R);
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_add_round_pd (__m512d __A, __m512d __B, const int __R)
+{
+  return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A,
+      (__v8df) __B,
+      (__v8df)
+      _mm512_undefined_pd (),
+      (__mmask8) -1, __R);
 }
-
-__funline __m512d _mm512_mask_mul_round_pd(__m512d __W, __mmask8 __U, __m512d __A,
-                                           __m512d __B, const int __R) {
-  return (__m512d)__builtin_ia32_mulpd512_mask((__v8df)__A, (__v8df)__B,
-                                               (__v8df)__W, (__mmask8)__U, __R);
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_add_round_pd (__m512d __W, __mmask8 __U, __m512d __A,
+            __m512d __B, const int __R)
+{
+  return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A,
+      (__v8df) __B,
+      (__v8df) __W,
+      (__mmask8) __U, __R);
 }
-
-__funline __m512d _mm512_maskz_mul_round_pd(__mmask8 __U, __m512d __A,
-                                            __m512d __B, const int __R) {
-  return (__m512d)__builtin_ia32_mulpd512_mask((__v8df)__A, (__v8df)__B,
-                                               (__v8df)_mm512_setzero_pd(),
-                                               (__mmask8)__U, __R);
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_add_round_pd (__mmask8 __U, __m512d __A, __m512d __B,
+            const int __R)
+{
+  return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A,
+      (__v8df) __B,
+      (__v8df)
+      _mm512_setzero_pd (),
+      (__mmask8) __U, __R);
 }
-
-__funline __m512 _mm512_mul_round_ps(__m512 __A, __m512 __B, const int __R) {
-  return (__m512)__builtin_ia32_mulps512_mask((__v16sf)__A, (__v16sf)__B,
-                                              (__v16sf)_mm512_undefined_ps(),
-                                              (__mmask16)-1, __R);
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_add_round_ps (__m512 __A, __m512 __B, const int __R)
+{
+  return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A,
+      (__v16sf) __B,
+      (__v16sf)
+      _mm512_undefined_ps (),
+      (__mmask16) -1, __R);
 }
-
-__funline __m512 _mm512_mask_mul_round_ps(__m512 __W, __mmask16 __U, __m512 __A,
-                                          __m512 __B, const int __R) {
-  return (__m512)__builtin_ia32_mulps512_mask(
-      (__v16sf)__A, (__v16sf)__B, (__v16sf)__W, (__mmask16)__U, __R);
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_add_round_ps (__m512 __W, __mmask16 __U, __m512 __A,
+            __m512 __B, const int __R)
+{
+  return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A,
+      (__v16sf) __B,
+      (__v16sf) __W,
+      (__mmask16) __U, __R);
 }
-
-__funline __m512 _mm512_maskz_mul_round_ps(__mmask16 __U, __m512 __A, __m512 __B,
-                                           const int __R) {
-  return (__m512)__builtin_ia32_mulps512_mask((__v16sf)__A, (__v16sf)__B,
-                                              (__v16sf)_mm512_setzero_ps(),
-                                              (__mmask16)__U, __R);
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_add_round_ps (__mmask16 __U, __m512 __A, __m512 __B, const int __R)
+{
+  return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A,
+      (__v16sf) __B,
+      (__v16sf)
+      _mm512_setzero_ps (),
+      (__mmask16) __U, __R);
 }
-
-__funline __m512d _mm512_div_round_pd(__m512d __M, __m512d __V, const int __R) {
-  return (__m512d)__builtin_ia32_divpd512_mask((__v8df)__M, (__v8df)__V,
-                                               (__v8df)_mm512_undefined_pd(),
-                                               (__mmask8)-1, __R);
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sub_round_pd (__m512d __A, __m512d __B, const int __R)
+{
+  return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A,
+      (__v8df) __B,
+      (__v8df)
+      _mm512_undefined_pd (),
+      (__mmask8) -1, __R);
 }
-
-__funline __m512d _mm512_mask_div_round_pd(__m512d __W, __mmask8 __U, __m512d __M,
-                                           __m512d __V, const int __R) {
-  return (__m512d)__builtin_ia32_divpd512_mask((__v8df)__M, (__v8df)__V,
-                                               (__v8df)__W, (__mmask8)__U, __R);
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sub_round_pd (__m512d __W, __mmask8 __U, __m512d __A,
+            __m512d __B, const int __R)
+{
+  return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A,
+      (__v8df) __B,
+      (__v8df) __W,
+      (__mmask8) __U, __R);
 }
-
-__funline __m512d _mm512_maskz_div_round_pd(__mmask8 __U, __m512d __M,
-                                            __m512d __V, const int __R) {
-  return (__m512d)__builtin_ia32_divpd512_mask((__v8df)__M, (__v8df)__V,
-                                               (__v8df)_mm512_setzero_pd(),
-                                               (__mmask8)__U, __R);
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sub_round_pd (__mmask8 __U, __m512d __A, __m512d __B,
+            const int __R)
+{
+  return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A,
+      (__v8df) __B,
+      (__v8df)
+      _mm512_setzero_pd (),
+      (__mmask8) __U, __R);
 }
-
-__funline __m512 _mm512_div_round_ps(__m512 __A, __m512 __B, const int __R) {
-  return (__m512)__builtin_ia32_divps512_mask((__v16sf)__A, (__v16sf)__B,
-                                              (__v16sf)_mm512_undefined_ps(),
-                                              (__mmask16)-1, __R);
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sub_round_ps (__m512 __A, __m512 __B, const int __R)
+{
+  return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A,
+      (__v16sf) __B,
+      (__v16sf)
+      _mm512_undefined_ps (),
+      (__mmask16) -1, __R);
 }
-
-__funline __m512 _mm512_mask_div_round_ps(__m512 __W, __mmask16 __U, __m512 __A,
-                                          __m512 __B, const int __R) {
-  return (__m512)__builtin_ia32_divps512_mask(
-      (__v16sf)__A, (__v16sf)__B, (__v16sf)__W, (__mmask16)__U, __R);
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sub_round_ps (__m512 __W, __mmask16 __U, __m512 __A,
+            __m512 __B, const int __R)
+{
+  return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A,
+      (__v16sf) __B,
+      (__v16sf) __W,
+      (__mmask16) __U, __R);
 }
-
-__funline __m512 _mm512_maskz_div_round_ps(__mmask16 __U, __m512 __A, __m512 __B,
-                                           const int __R) {
-  return (__m512)__builtin_ia32_divps512_mask((__v16sf)__A, (__v16sf)__B,
-                                              (__v16sf)_mm512_setzero_ps(),
-                                              (__mmask16)__U, __R);
-}
-
-__funline __m128d _mm_mul_round_sd(__m128d __A, __m128d __B, const int __R) {
-  return (__m128d)__builtin_ia32_mulsd_round((__v2df)__A, (__v2df)__B, __R);
-}
-
-__funline __m128d _mm_mask_mul_round_sd(__m128d __W, __mmask8 __U, __m128d __A,
-                                        __m128d __B, const int __R) {
-  return (__m128d)__builtin_ia32_mulsd_mask_round(
-      (__v2df)__A, (__v2df)__B, (__v2df)__W, (__mmask8)__U, __R);
-}
-
-__funline __m128d _mm_maskz_mul_round_sd(__mmask8 __U, __m128d __A, __m128d __B,
-                                         const int __R) {
-  return (__m128d)__builtin_ia32_mulsd_mask_round(
-      (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U, __R);
-}
-
-__funline __m128 _mm_mul_round_ss(__m128 __A, __m128 __B, const int __R) {
-  return (__m128)__builtin_ia32_mulss_round((__v4sf)__A, (__v4sf)__B, __R);
-}
-
-__funline __m128 _mm_mask_mul_round_ss(__m128 __W, __mmask8 __U, __m128 __A,
-                                       __m128 __B, const int __R) {
-  return (__m128)__builtin_ia32_mulss_mask_round(
-      (__v4sf)__A, (__v4sf)__B, (__v4sf)__W, (__mmask8)__U, __R);
-}
-
-__funline __m128 _mm_maskz_mul_round_ss(__mmask8 __U, __m128 __A, __m128 __B,
-                                        const int __R) {
-  return (__m128)__builtin_ia32_mulss_mask_round(
-      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U, __R);
-}
-
-__funline __m128d _mm_div_round_sd(__m128d __A, __m128d __B, const int __R) {
-  return (__m128d)__builtin_ia32_divsd_round((__v2df)__A, (__v2df)__B, __R);
-}
-
-__funline __m128d _mm_mask_div_round_sd(__m128d __W, __mmask8 __U, __m128d __A,
-                                        __m128d __B, const int __R) {
-  return (__m128d)__builtin_ia32_divsd_mask_round(
-      (__v2df)__A, (__v2df)__B, (__v2df)__W, (__mmask8)__U, __R);
-}
-
-__funline __m128d _mm_maskz_div_round_sd(__mmask8 __U, __m128d __A, __m128d __B,
-                                         const int __R) {
-  return (__m128d)__builtin_ia32_divsd_mask_round(
-      (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U, __R);
-}
-
-__funline __m128 _mm_div_round_ss(__m128 __A, __m128 __B, const int __R) {
-  return (__m128)__builtin_ia32_divss_round((__v4sf)__A, (__v4sf)__B, __R);
-}
-
-__funline __m128 _mm_mask_div_round_ss(__m128 __W, __mmask8 __U, __m128 __A,
-                                       __m128 __B, const int __R) {
-  return (__m128)__builtin_ia32_divss_mask_round(
-      (__v4sf)__A, (__v4sf)__B, (__v4sf)__W, (__mmask8)__U, __R);
-}
-
-__funline __m128 _mm_maskz_div_round_ss(__mmask8 __U, __m128 __A, __m128 __B,
-                                        const int __R) {
-  return (__m128)__builtin_ia32_divss_mask_round(
-      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U, __R);
-}
-
-#else
-#define _mm512_mul_round_pd(A, B, C) \
-  (__m512d) \
-      __builtin_ia32_mulpd512_mask(A, B, (__v8df)_mm512_undefined_pd(), -1, C)
-
-#define _mm512_mask_mul_round_pd(W, U, A, B, C) \
-  (__m512d) __builtin_ia32_mulpd512_mask(A, B, W, U, C)
-
-#define _mm512_maskz_mul_round_pd(U, A, B, C) \
-  (__m512d) \
-      __builtin_ia32_mulpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, C)
-
-#define _mm512_mul_round_ps(A, B, C) \
-  (__m512) __builtin_ia32_mulps512_mask(A, B, (__v16sf)_mm512_undefined_ps(), \
-                                        -1, C)
-
-#define _mm512_mask_mul_round_ps(W, U, A, B, C) \
-  (__m512) __builtin_ia32_mulps512_mask(A, B, W, U, C)
-
-#define _mm512_maskz_mul_round_ps(U, A, B, C) \
-  (__m512) \
-      __builtin_ia32_mulps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, C)
-
-#define _mm512_div_round_pd(A, B, C) \
-  (__m512d) \
-      __builtin_ia32_divpd512_mask(A, B, (__v8df)_mm512_undefined_pd(), -1, C)
-
-#define _mm512_mask_div_round_pd(W, U, A, B, C) \
-  (__m512d) __builtin_ia32_divpd512_mask(A, B, W, U, C)
-
-#define _mm512_maskz_div_round_pd(U, A, B, C) \
-  (__m512d) \
-      __builtin_ia32_divpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, C)
-
-#define _mm512_div_round_ps(A, B, C) \
-  (__m512) __builtin_ia32_divps512_mask(A, B, (__v16sf)_mm512_undefined_ps(), \
-                                        -1, C)
-
-#define _mm512_mask_div_round_ps(W, U, A, B, C) \
-  (__m512) __builtin_ia32_divps512_mask(A, B, W, U, C)
-
-#define _mm512_maskz_div_round_ps(U, A, B, C) \
-  (__m512) \
-      __builtin_ia32_divps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, C)
-
-#define _mm_mul_round_sd(A, B, C) (__m128d) __builtin_ia32_mulsd_round(A, B, C)
-
-#define _mm_mask_mul_round_sd(W, U, A, B, C) \
-  (__m128d) __builtin_ia32_mulsd_mask_round(A, B, W, U, C)
-
-#define _mm_maskz_mul_round_sd(U, A, B, C) \
-  (__m128d) \
-      __builtin_ia32_mulsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C)
-
-#define _mm_mul_round_ss(A, B, C) (__m128) __builtin_ia32_mulss_round(A, B, C)
-
-#define _mm_mask_mul_round_ss(W, U, A, B, C) \
-  (__m128) __builtin_ia32_mulss_mask_round(A, B, W, U, C)
-
-#define _mm_maskz_mul_round_ss(U, A, B, C) \
-  (__m128) __builtin_ia32_mulss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C)
-
-#define _mm_div_round_sd(A, B, C) (__m128d) __builtin_ia32_divsd_round(A, B, C)
-
-#define _mm_mask_div_round_sd(W, U, A, B, C) \
-  (__m128d) __builtin_ia32_divsd_mask_round(A, B, W, U, C)
-
-#define _mm_maskz_div_round_sd(U, A, B, C) \
-  (__m128d) \
-      __builtin_ia32_divsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C)
-
-#define _mm_div_round_ss(A, B, C) (__m128) __builtin_ia32_divss_round(A, B, C)
-
-#define _mm_mask_div_round_ss(W, U, A, B, C) \
-  (__m128) __builtin_ia32_divss_mask_round(A, B, W, U, C)
-
-#define _mm_maskz_div_round_ss(U, A, B, C) \
-  (__m128) __builtin_ia32_divss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C)
-
-#endif
-
-#ifdef __OPTIMIZE__
-__funline __m512d _mm512_max_round_pd(__m512d __A, __m512d __B, const int __R) {
-  return (__m512d)__builtin_ia32_maxpd512_mask((__v8df)__A, (__v8df)__B,
-                                               (__v8df)_mm512_undefined_pd(),
-                                               (__mmask8)-1, __R);
-}
-
-__funline __m512d _mm512_mask_max_round_pd(__m512d __W, __mmask8 __U, __m512d __A,
-                                           __m512d __B, const int __R) {
-  return (__m512d)__builtin_ia32_maxpd512_mask((__v8df)__A, (__v8df)__B,
-                                               (__v8df)__W, (__mmask8)__U, __R);
-}
-
-__funline __m512d _mm512_maskz_max_round_pd(__mmask8 __U, __m512d __A,
-                                            __m512d __B, const int __R) {
-  return (__m512d)__builtin_ia32_maxpd512_mask((__v8df)__A, (__v8df)__B,
-                                               (__v8df)_mm512_setzero_pd(),
-                                               (__mmask8)__U, __R);
-}
-
-__funline __m512 _mm512_max_round_ps(__m512 __A, __m512 __B, const int __R) {
-  return (__m512)__builtin_ia32_maxps512_mask((__v16sf)__A, (__v16sf)__B,
-                                              (__v16sf)_mm512_undefined_ps(),
-                                              (__mmask16)-1, __R);
-}
-
-__funline __m512 _mm512_mask_max_round_ps(__m512 __W, __mmask16 __U, __m512 __A,
-                                          __m512 __B, const int __R) {
-  return (__m512)__builtin_ia32_maxps512_mask(
-      (__v16sf)__A, (__v16sf)__B, (__v16sf)__W, (__mmask16)__U, __R);
-}
-
-__funline __m512 _mm512_maskz_max_round_ps(__mmask16 __U, __m512 __A, __m512 __B,
-                                           const int __R) {
-  return (__m512)__builtin_ia32_maxps512_mask((__v16sf)__A, (__v16sf)__B,
-                                              (__v16sf)_mm512_setzero_ps(),
-                                              (__mmask16)__U, __R);
-}
-
-__funline __m512d _mm512_min_round_pd(__m512d __A, __m512d __B, const int __R) {
-  return (__m512d)__builtin_ia32_minpd512_mask((__v8df)__A, (__v8df)__B,
-                                               (__v8df)_mm512_undefined_pd(),
-                                               (__mmask8)-1, __R);
-}
-
-__funline __m512d _mm512_mask_min_round_pd(__m512d __W, __mmask8 __U, __m512d __A,
-                                           __m512d __B, const int __R) {
-  return (__m512d)__builtin_ia32_minpd512_mask((__v8df)__A, (__v8df)__B,
-                                               (__v8df)__W, (__mmask8)__U, __R);
-}
-
-__funline __m512d _mm512_maskz_min_round_pd(__mmask8 __U, __m512d __A,
-                                            __m512d __B, const int __R) {
-  return (__m512d)__builtin_ia32_minpd512_mask((__v8df)__A, (__v8df)__B,
-                                               (__v8df)_mm512_setzero_pd(),
-                                               (__mmask8)__U, __R);
-}
-
-__funline __m512 _mm512_min_round_ps(__m512 __A, __m512 __B, const int __R) {
-  return (__m512)__builtin_ia32_minps512_mask((__v16sf)__A, (__v16sf)__B,
-                                              (__v16sf)_mm512_undefined_ps(),
-                                              (__mmask16)-1, __R);
-}
-
-__funline __m512 _mm512_mask_min_round_ps(__m512 __W, __mmask16 __U, __m512 __A,
-                                          __m512 __B, const int __R) {
-  return (__m512)__builtin_ia32_minps512_mask(
-      (__v16sf)__A, (__v16sf)__B, (__v16sf)__W, (__mmask16)__U, __R);
-}
-
-__funline __m512 _mm512_maskz_min_round_ps(__mmask16 __U, __m512 __A, __m512 __B,
-                                           const int __R) {
-  return (__m512)__builtin_ia32_minps512_mask((__v16sf)__A, (__v16sf)__B,
-                                              (__v16sf)_mm512_setzero_ps(),
-                                              (__mmask16)__U, __R);
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sub_round_ps (__mmask16 __U, __m512 __A, __m512 __B, const int __R)
+{
+  return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A,
+      (__v16sf) __B,
+      (__v16sf)
+      _mm512_setzero_ps (),
+      (__mmask16) __U, __R);
 }
 #else
-#define _mm512_max_round_pd(A, B, R) \
-  (__m512d) \
-      __builtin_ia32_maxpd512_mask(A, B, (__v8df)_mm512_undefined_pd(), -1, R)
-
-#define _mm512_mask_max_round_pd(W, U, A, B, R) \
-  (__m512d) __builtin_ia32_maxpd512_mask(A, B, W, U, R)
-
-#define _mm512_maskz_max_round_pd(U, A, B, R) \
-  (__m512d) \
-      __builtin_ia32_maxpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, R)
-
-#define _mm512_max_round_ps(A, B, R) \
-  (__m512) __builtin_ia32_maxps512_mask(A, B, (__v16sf)_mm512_undefined_pd(), \
-                                        -1, R)
-
-#define _mm512_mask_max_round_ps(W, U, A, B, R) \
-  (__m512) __builtin_ia32_maxps512_mask(A, B, W, U, R)
-
-#define _mm512_maskz_max_round_ps(U, A, B, R) \
-  (__m512) \
-      __builtin_ia32_maxps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, R)
-
-#define _mm512_min_round_pd(A, B, R) \
-  (__m512d) \
-      __builtin_ia32_minpd512_mask(A, B, (__v8df)_mm512_undefined_pd(), -1, R)
-
-#define _mm512_mask_min_round_pd(W, U, A, B, R) \
-  (__m512d) __builtin_ia32_minpd512_mask(A, B, W, U, R)
-
-#define _mm512_maskz_min_round_pd(U, A, B, R) \
-  (__m512d) \
-      __builtin_ia32_minpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, R)
-
-#define _mm512_min_round_ps(A, B, R) \
-  (__m512) __builtin_ia32_minps512_mask(A, B, (__v16sf)_mm512_undefined_ps(), \
-                                        -1, R)
-
-#define _mm512_mask_min_round_ps(W, U, A, B, R) \
-  (__m512) __builtin_ia32_minps512_mask(A, B, W, U, R)
-
-#define _mm512_maskz_min_round_ps(U, A, B, R) \
-  (__m512) \
-      __builtin_ia32_minps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, R)
+#define _mm512_add_round_pd(A, B, C) (__m512d)__builtin_ia32_addpd512_mask(A, B, (__v8df)_mm512_undefined_pd(), -1, C)
+#define _mm512_mask_add_round_pd(W, U, A, B, C) (__m512d)__builtin_ia32_addpd512_mask(A, B, W, U, C)
+#define _mm512_maskz_add_round_pd(U, A, B, C) (__m512d)__builtin_ia32_addpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, C)
+#define _mm512_add_round_ps(A, B, C) (__m512)__builtin_ia32_addps512_mask(A, B, (__v16sf)_mm512_undefined_ps(), -1, C)
+#define _mm512_mask_add_round_ps(W, U, A, B, C) (__m512)__builtin_ia32_addps512_mask(A, B, W, U, C)
+#define _mm512_maskz_add_round_ps(U, A, B, C) (__m512)__builtin_ia32_addps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, C)
+#define _mm512_sub_round_pd(A, B, C) (__m512d)__builtin_ia32_subpd512_mask(A, B, (__v8df)_mm512_undefined_pd(), -1, C)
+#define _mm512_mask_sub_round_pd(W, U, A, B, C) (__m512d)__builtin_ia32_subpd512_mask(A, B, W, U, C)
+#define _mm512_maskz_sub_round_pd(U, A, B, C) (__m512d)__builtin_ia32_subpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, C)
+#define _mm512_sub_round_ps(A, B, C) (__m512)__builtin_ia32_subps512_mask(A, B, (__v16sf)_mm512_undefined_ps(), -1, C)
+#define _mm512_mask_sub_round_ps(W, U, A, B, C) (__m512)__builtin_ia32_subps512_mask(A, B, W, U, C)
+#define _mm512_maskz_sub_round_ps(U, A, B, C) (__m512)__builtin_ia32_subps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, C)
 #endif
-
 #ifdef __OPTIMIZE__
-__funline __m512d _mm512_scalef_round_pd(__m512d __A, __m512d __B,
-                                         const int __R) {
-  return (__m512d)__builtin_ia32_scalefpd512_mask((__v8df)__A, (__v8df)__B,
-                                                  (__v8df)_mm512_undefined_pd(),
-                                                  (__mmask8)-1, __R);
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mul_round_pd (__m512d __A, __m512d __B, const int __R)
+{
+  return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A,
+      (__v8df) __B,
+      (__v8df)
+      _mm512_undefined_pd (),
+      (__mmask8) -1, __R);
 }
-
-__funline __m512d _mm512_mask_scalef_round_pd(__m512d __W, __mmask8 __U,
-                                              __m512d __A, __m512d __B,
-                                              const int __R) {
-  return (__m512d)__builtin_ia32_scalefpd512_mask(
-      (__v8df)__A, (__v8df)__B, (__v8df)__W, (__mmask8)__U, __R);
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_mul_round_pd (__m512d __W, __mmask8 __U, __m512d __A,
+            __m512d __B, const int __R)
+{
+  return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A,
+      (__v8df) __B,
+      (__v8df) __W,
+      (__mmask8) __U, __R);
 }
-
-__funline __m512d _mm512_maskz_scalef_round_pd(__mmask8 __U, __m512d __A,
-                                               __m512d __B, const int __R) {
-  return (__m512d)__builtin_ia32_scalefpd512_mask((__v8df)__A, (__v8df)__B,
-                                                  (__v8df)_mm512_setzero_pd(),
-                                                  (__mmask8)__U, __R);
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_mul_round_pd (__mmask8 __U, __m512d __A, __m512d __B,
+            const int __R)
+{
+  return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A,
+      (__v8df) __B,
+      (__v8df)
+      _mm512_setzero_pd (),
+      (__mmask8) __U, __R);
 }
-
-__funline __m512 _mm512_scalef_round_ps(__m512 __A, __m512 __B, const int __R) {
-  return (__m512)__builtin_ia32_scalefps512_mask((__v16sf)__A, (__v16sf)__B,
-                                                 (__v16sf)_mm512_undefined_ps(),
-                                                 (__mmask16)-1, __R);
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mul_round_ps (__m512 __A, __m512 __B, const int __R)
+{
+  return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A,
+      (__v16sf) __B,
+      (__v16sf)
+      _mm512_undefined_ps (),
+      (__mmask16) -1, __R);
 }
-
-__funline __m512 _mm512_mask_scalef_round_ps(__m512 __W, __mmask16 __U,
-                                             __m512 __A, __m512 __B,
-                                             const int __R) {
-  return (__m512)__builtin_ia32_scalefps512_mask(
-      (__v16sf)__A, (__v16sf)__B, (__v16sf)__W, (__mmask16)__U, __R);
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_mul_round_ps (__m512 __W, __mmask16 __U, __m512 __A,
+            __m512 __B, const int __R)
+{
+  return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A,
+      (__v16sf) __B,
+      (__v16sf) __W,
+      (__mmask16) __U, __R);
 }
-
-__funline __m512 _mm512_maskz_scalef_round_ps(__mmask16 __U, __m512 __A,
-                                              __m512 __B, const int __R) {
-  return (__m512)__builtin_ia32_scalefps512_mask((__v16sf)__A, (__v16sf)__B,
-                                                 (__v16sf)_mm512_setzero_ps(),
-                                                 (__mmask16)__U, __R);
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_mul_round_ps (__mmask16 __U, __m512 __A, __m512 __B, const int __R)
+{
+  return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A,
+      (__v16sf) __B,
+      (__v16sf)
+      _mm512_setzero_ps (),
+      (__mmask16) __U, __R);
 }
-
-__funline __m128d _mm_scalef_round_sd(__m128d __A, __m128d __B, const int __R) {
-  return (__m128d)__builtin_ia32_scalefsd_mask_round(
-      (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)-1, __R);
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_div_round_pd (__m512d __M, __m512d __V, const int __R)
+{
+  return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __M,
+      (__v8df) __V,
+      (__v8df)
+      _mm512_undefined_pd (),
+      (__mmask8) -1, __R);
 }
-
-__funline __m128d _mm_mask_scalef_round_sd(__m128d __W, __mmask8 __U, __m128d __A,
-                                           __m128d __B, const int __R) {
-  return (__m128d)__builtin_ia32_scalefsd_mask_round(
-      (__v2df)__A, (__v2df)__B, (__v2df)__W, (__mmask8)__U, __R);
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_div_round_pd (__m512d __W, __mmask8 __U, __m512d __M,
+            __m512d __V, const int __R)
+{
+  return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __M,
+      (__v8df) __V,
+      (__v8df) __W,
+      (__mmask8) __U, __R);
 }
-
-__funline __m128d _mm_maskz_scalef_round_sd(__mmask8 __U, __m128d __A,
-                                            __m128d __B, const int __R) {
-  return (__m128d)__builtin_ia32_scalefsd_mask_round(
-      (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U, __R);
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_div_round_pd (__mmask8 __U, __m512d __M, __m512d __V,
+            const int __R)
+{
+  return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __M,
+      (__v8df) __V,
+      (__v8df)
+      _mm512_setzero_pd (),
+      (__mmask8) __U, __R);
 }
-
-__funline __m128 _mm_scalef_round_ss(__m128 __A, __m128 __B, const int __R) {
-  return (__m128)__builtin_ia32_scalefss_mask_round(
-      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)-1, __R);
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_div_round_ps (__m512 __A, __m512 __B, const int __R)
+{
+  return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A,
+      (__v16sf) __B,
+      (__v16sf)
+      _mm512_undefined_ps (),
+      (__mmask16) -1, __R);
 }
-
-__funline __m128 _mm_mask_scalef_round_ss(__m128 __W, __mmask8 __U, __m128 __A,
-                                          __m128 __B, const int __R) {
-  return (__m128)__builtin_ia32_scalefss_mask_round(
-      (__v4sf)__A, (__v4sf)__B, (__v4sf)__W, (__mmask8)__U, __R);
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_div_round_ps (__m512 __W, __mmask16 __U, __m512 __A,
+            __m512 __B, const int __R)
+{
+  return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A,
+      (__v16sf) __B,
+      (__v16sf) __W,
+      (__mmask16) __U, __R);
 }
-
-__funline __m128 _mm_maskz_scalef_round_ss(__mmask8 __U, __m128 __A, __m128 __B,
-                                           const int __R) {
-  return (__m128)__builtin_ia32_scalefss_mask_round(
-      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U, __R);
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_div_round_ps (__mmask16 __U, __m512 __A, __m512 __B, const int __R)
+{
+  return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A,
+      (__v16sf) __B,
+      (__v16sf)
+      _mm512_setzero_ps (),
+      (__mmask16) __U, __R);
+}
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mul_round_sd (__m128d __A, __m128d __B, const int __R)
+{
+  return (__m128d) __builtin_ia32_mulsd_round ((__v2df) __A,
+      (__v2df) __B,
+      __R);
+}
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_mul_round_sd (__m128d __W, __mmask8 __U, __m128d __A,
+            __m128d __B, const int __R)
+{
+  return (__m128d) __builtin_ia32_mulsd_mask_round ((__v2df) __A,
+      (__v2df) __B,
+      (__v2df) __W,
+      (__mmask8) __U, __R);
+}
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_mul_round_sd (__mmask8 __U, __m128d __A, __m128d __B,
+            const int __R)
+{
+  return (__m128d) __builtin_ia32_mulsd_mask_round ((__v2df) __A,
+      (__v2df) __B,
+      (__v2df)
+      _mm_setzero_pd (),
+      (__mmask8) __U, __R);
+}
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mul_round_ss (__m128 __A, __m128 __B, const int __R)
+{
+  return (__m128) __builtin_ia32_mulss_round ((__v4sf) __A,
+      (__v4sf) __B,
+      __R);
+}
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_mul_round_ss (__m128 __W, __mmask8 __U, __m128 __A,
+            __m128 __B, const int __R)
+{
+  return (__m128) __builtin_ia32_mulss_mask_round ((__v4sf) __A,
+      (__v4sf) __B,
+      (__v4sf) __W,
+      (__mmask8) __U, __R);
+}
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_mul_round_ss (__mmask8 __U, __m128 __A, __m128 __B,
+            const int __R)
+{
+  return (__m128) __builtin_ia32_mulss_mask_round ((__v4sf) __A,
+      (__v4sf) __B,
+      (__v4sf)
+      _mm_setzero_ps (),
+      (__mmask8) __U, __R);
+}
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_div_round_sd (__m128d __A, __m128d __B, const int __R)
+{
+  return (__m128d) __builtin_ia32_divsd_round ((__v2df) __A,
+      (__v2df) __B,
+      __R);
+}
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_div_round_sd (__m128d __W, __mmask8 __U, __m128d __A,
+            __m128d __B, const int __R)
+{
+  return (__m128d) __builtin_ia32_divsd_mask_round ((__v2df) __A,
+      (__v2df) __B,
+      (__v2df) __W,
+      (__mmask8) __U, __R);
+}
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_div_round_sd (__mmask8 __U, __m128d __A, __m128d __B,
+            const int __R)
+{
+  return (__m128d) __builtin_ia32_divsd_mask_round ((__v2df) __A,
+      (__v2df) __B,
+      (__v2df)
+      _mm_setzero_pd (),
+      (__mmask8) __U, __R);
+}
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_div_round_ss (__m128 __A, __m128 __B, const int __R)
+{
+  return (__m128) __builtin_ia32_divss_round ((__v4sf) __A,
+      (__v4sf) __B,
+      __R);
+}
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_div_round_ss (__m128 __W, __mmask8 __U, __m128 __A,
+            __m128 __B, const int __R)
+{
+  return (__m128) __builtin_ia32_divss_mask_round ((__v4sf) __A,
+      (__v4sf) __B,
+      (__v4sf) __W,
+      (__mmask8) __U, __R);
+}
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_div_round_ss (__mmask8 __U, __m128 __A, __m128 __B,
+            const int __R)
+{
+  return (__m128) __builtin_ia32_divss_mask_round ((__v4sf) __A,
+      (__v4sf) __B,
+      (__v4sf)
+      _mm_setzero_ps (),
+      (__mmask8) __U, __R);
}
 }
 #else
-#define _mm512_scalef_round_pd(A, B, C) \
-  (__m512d) __builtin_ia32_scalefpd512_mask( \
-      A, B, (__v8df)_mm512_undefined_pd(), -1, C)
-
-#define _mm512_mask_scalef_round_pd(W, U, A, B, C) \
-  (__m512d) __builtin_ia32_scalefpd512_mask(A, B, W, U, C)
-
-#define _mm512_maskz_scalef_round_pd(U, A, B, C) \
-  (__m512d) \
-      __builtin_ia32_scalefpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, C)
-
-#define _mm512_scalef_round_ps(A, B, C) \
-  (__m512) __builtin_ia32_scalefps512_mask( \
-      A, B, (__v16sf)_mm512_undefined_ps(), -1, C)
-
-#define _mm512_mask_scalef_round_ps(W, U, A, B, C) \
-  (__m512) __builtin_ia32_scalefps512_mask(A, B, W, U, C)
-
-#define _mm512_maskz_scalef_round_ps(U, A, B, C) \
-  (__m512) __builtin_ia32_scalefps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), \
-                                           U, C)
-
-#define _mm_scalef_round_sd(A, B, C) \
-  (__m128d) __builtin_ia32_scalefsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), \
-                                               -1, C)
-
-#define _mm_scalef_round_ss(A, B, C) \
-  (__m128) __builtin_ia32_scalefss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), \
-                                              -1, C)
+#define _mm512_mul_round_pd(A, B, C) (__m512d)__builtin_ia32_mulpd512_mask(A, B, (__v8df)_mm512_undefined_pd(), -1, C)
+#define _mm512_mask_mul_round_pd(W, U, A, B, C) (__m512d)__builtin_ia32_mulpd512_mask(A, B, W, U, C)
+#define _mm512_maskz_mul_round_pd(U, A, B, C) (__m512d)__builtin_ia32_mulpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, C)
+#define _mm512_mul_round_ps(A, B, C) (__m512)__builtin_ia32_mulps512_mask(A, B, (__v16sf)_mm512_undefined_ps(), -1, C)
+#define _mm512_mask_mul_round_ps(W, U, A, B, C) (__m512)__builtin_ia32_mulps512_mask(A, B, W, U, C)
+#define _mm512_maskz_mul_round_ps(U, A, B, C) (__m512)__builtin_ia32_mulps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, C)
+#define _mm512_div_round_pd(A, B, C) (__m512d)__builtin_ia32_divpd512_mask(A, B, (__v8df)_mm512_undefined_pd(), -1, C)
+#define _mm512_mask_div_round_pd(W, U, A, B, C) (__m512d)__builtin_ia32_divpd512_mask(A, B, W, U, C)
+#define _mm512_maskz_div_round_pd(U, A, B, C) (__m512d)__builtin_ia32_divpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, C)
+#define _mm512_div_round_ps(A, B, C) (__m512)__builtin_ia32_divps512_mask(A, B, (__v16sf)_mm512_undefined_ps(), -1, C)
+#define _mm512_mask_div_round_ps(W, U, A, B, C) (__m512)__builtin_ia32_divps512_mask(A, B, W, U, C)
+#define _mm512_maskz_div_round_ps(U, A, B, C) (__m512)__builtin_ia32_divps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, C)
+#define _mm_mul_round_sd(A, B, C) (__m128d)__builtin_ia32_mulsd_round(A, B, C)
+#define _mm_mask_mul_round_sd(W, U, A, B, C) (__m128d)__builtin_ia32_mulsd_mask_round(A, B, W, U, C)
+#define _mm_maskz_mul_round_sd(U, A, B, C) (__m128d)__builtin_ia32_mulsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C)
+#define _mm_mul_round_ss(A, B, C) (__m128)__builtin_ia32_mulss_round(A, B, C)
+#define _mm_mask_mul_round_ss(W, U, A, B, C) (__m128)__builtin_ia32_mulss_mask_round(A, B, W, U, C)
+#define _mm_maskz_mul_round_ss(U, A, B, C) (__m128)__builtin_ia32_mulss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C) +#define _mm_div_round_sd(A, B, C) (__m128d)__builtin_ia32_divsd_round(A, B, C) +#define _mm_mask_div_round_sd(W, U, A, B, C) (__m128d)__builtin_ia32_divsd_mask_round(A, B, W, U, C) +#define _mm_maskz_div_round_sd(U, A, B, C) (__m128d)__builtin_ia32_divsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C) +#define _mm_div_round_ss(A, B, C) (__m128)__builtin_ia32_divss_round(A, B, C) +#define _mm_mask_div_round_ss(W, U, A, B, C) (__m128)__builtin_ia32_divss_mask_round(A, B, W, U, C) +#define _mm_maskz_div_round_ss(U, A, B, C) (__m128)__builtin_ia32_divss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C) #endif - #ifdef __OPTIMIZE__ -__funline __m512d _mm512_fmadd_round_pd(__m512d __A, __m512d __B, __m512d __C, - const int __R) { - return (__m512d)__builtin_ia32_vfmaddpd512_mask( - (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)-1, __R); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_max_round_pd (__m512d __A, __m512d __B, const int __R) +{ + return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1, __R); } - -__funline __m512d _mm512_mask_fmadd_round_pd(__m512d __A, __mmask8 __U, - __m512d __B, __m512d __C, - const int __R) { - return (__m512d)__builtin_ia32_vfmaddpd512_mask( - (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, __R); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_max_round_pd (__m512d __W, __mmask8 __U, __m512d __A, + __m512d __B, const int __R) +{ + return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __W, + (__mmask8) __U, __R); } - -__funline __m512d _mm512_mask3_fmadd_round_pd(__m512d __A, __m512d __B, - __m512d __C, __mmask8 __U, - const int __R) { - return (__m512d)__builtin_ia32_vfmaddpd512_mask3( - (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, __R); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_max_round_pd (__mmask8 __U, __m512d __A, __m512d __B, + const int __R) +{ + return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, __R); } - -__funline __m512d _mm512_maskz_fmadd_round_pd(__mmask8 __U, __m512d __A, - __m512d __B, __m512d __C, - const int __R) { - return (__m512d)__builtin_ia32_vfmaddpd512_maskz( - (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, __R); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_max_round_ps (__m512 __A, __m512 __B, const int __R) +{ + return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1, __R); } - -__funline __m512 _mm512_fmadd_round_ps(__m512 __A, __m512 __B, __m512 __C, - const int __R) { - return (__m512)__builtin_ia32_vfmaddps512_mask( - (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)-1, __R); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_max_round_ps (__m512 __W, __mmask16 __U, __m512 __A, + __m512 __B, const int __R) +{ + return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U, __R); } - -__funline __m512 _mm512_mask_fmadd_round_ps(__m512 __A, __mmask16 __U, __m512 __B, - __m512 __C, const int 
__R) { - return (__m512)__builtin_ia32_vfmaddps512_mask( - (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, __R); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_max_round_ps (__mmask16 __U, __m512 __A, __m512 __B, const int __R) +{ + return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, __R); } - -__funline __m512 _mm512_mask3_fmadd_round_ps(__m512 __A, __m512 __B, __m512 __C, - __mmask16 __U, const int __R) { - return (__m512)__builtin_ia32_vfmaddps512_mask3( - (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, __R); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_min_round_pd (__m512d __A, __m512d __B, const int __R) +{ + return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1, __R); } - -__funline __m512 _mm512_maskz_fmadd_round_ps(__mmask16 __U, __m512 __A, - __m512 __B, __m512 __C, - const int __R) { - return (__m512)__builtin_ia32_vfmaddps512_maskz( - (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, __R); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_min_round_pd (__m512d __W, __mmask8 __U, __m512d __A, + __m512d __B, const int __R) +{ + return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __W, + (__mmask8) __U, __R); } - -__funline __m512d _mm512_fmsub_round_pd(__m512d __A, __m512d __B, __m512d __C, - const int __R) { - return (__m512d)__builtin_ia32_vfmsubpd512_mask( - (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)-1, __R); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_min_round_pd (__mmask8 __U, __m512d __A, __m512d __B, + const int __R) +{ + return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, __R); } - -__funline __m512d _mm512_mask_fmsub_round_pd(__m512d __A, __mmask8 __U, - __m512d __B, __m512d __C, - const int __R) { - return (__m512d)__builtin_ia32_vfmsubpd512_mask( - (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, __R); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_min_round_ps (__m512 __A, __m512 __B, const int __R) +{ + return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1, __R); } - -__funline __m512d _mm512_mask3_fmsub_round_pd(__m512d __A, __m512d __B, - __m512d __C, __mmask8 __U, - const int __R) { - return (__m512d)__builtin_ia32_vfmsubpd512_mask3( - (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, __R); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_min_round_ps (__m512 __W, __mmask16 __U, __m512 __A, + __m512 __B, const int __R) +{ + return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U, __R); } - -__funline __m512d _mm512_maskz_fmsub_round_pd(__mmask8 __U, __m512d __A, - __m512d __B, __m512d __C, - const int __R) { - return (__m512d)__builtin_ia32_vfmsubpd512_maskz( - (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, __R); -} - -__funline __m512 _mm512_fmsub_round_ps(__m512 __A, __m512 __B, __m512 __C, - const int __R) { - return (__m512)__builtin_ia32_vfmsubps512_mask( - 
(__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)-1, __R); -} - -__funline __m512 _mm512_mask_fmsub_round_ps(__m512 __A, __mmask16 __U, __m512 __B, - __m512 __C, const int __R) { - return (__m512)__builtin_ia32_vfmsubps512_mask( - (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, __R); -} - -__funline __m512 _mm512_mask3_fmsub_round_ps(__m512 __A, __m512 __B, __m512 __C, - __mmask16 __U, const int __R) { - return (__m512)__builtin_ia32_vfmsubps512_mask3( - (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, __R); -} - -__funline __m512 _mm512_maskz_fmsub_round_ps(__mmask16 __U, __m512 __A, - __m512 __B, __m512 __C, - const int __R) { - return (__m512)__builtin_ia32_vfmsubps512_maskz( - (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, __R); -} - -__funline __m512d _mm512_fmaddsub_round_pd(__m512d __A, __m512d __B, __m512d __C, - const int __R) { - return (__m512d)__builtin_ia32_vfmaddsubpd512_mask( - (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)-1, __R); -} - -__funline __m512d _mm512_mask_fmaddsub_round_pd(__m512d __A, __mmask8 __U, - __m512d __B, __m512d __C, - const int __R) { - return (__m512d)__builtin_ia32_vfmaddsubpd512_mask( - (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, __R); -} - -__funline __m512d _mm512_mask3_fmaddsub_round_pd(__m512d __A, __m512d __B, - __m512d __C, __mmask8 __U, - const int __R) { - return (__m512d)__builtin_ia32_vfmaddsubpd512_mask3( - (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, __R); -} - -__funline __m512d _mm512_maskz_fmaddsub_round_pd(__mmask8 __U, __m512d __A, - __m512d __B, __m512d __C, - const int __R) { - return (__m512d)__builtin_ia32_vfmaddsubpd512_maskz( - (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, __R); -} - -__funline __m512 _mm512_fmaddsub_round_ps(__m512 __A, __m512 __B, __m512 __C, - const int __R) { - return (__m512)__builtin_ia32_vfmaddsubps512_mask( - (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)-1, __R); -} - -__funline __m512 _mm512_mask_fmaddsub_round_ps(__m512 __A, __mmask16 __U, - __m512 __B, __m512 __C, - const int __R) { - return (__m512)__builtin_ia32_vfmaddsubps512_mask( - (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, __R); -} - -__funline __m512 _mm512_mask3_fmaddsub_round_ps(__m512 __A, __m512 __B, - __m512 __C, __mmask16 __U, - const int __R) { - return (__m512)__builtin_ia32_vfmaddsubps512_mask3( - (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, __R); -} - -__funline __m512 _mm512_maskz_fmaddsub_round_ps(__mmask16 __U, __m512 __A, - __m512 __B, __m512 __C, - const int __R) { - return (__m512)__builtin_ia32_vfmaddsubps512_maskz( - (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, __R); -} - -__funline __m512d _mm512_fmsubadd_round_pd(__m512d __A, __m512d __B, __m512d __C, - const int __R) { - return (__m512d)__builtin_ia32_vfmaddsubpd512_mask( - (__v8df)__A, (__v8df)__B, -(__v8df)__C, (__mmask8)-1, __R); -} - -__funline __m512d _mm512_mask_fmsubadd_round_pd(__m512d __A, __mmask8 __U, - __m512d __B, __m512d __C, - const int __R) { - return (__m512d)__builtin_ia32_vfmaddsubpd512_mask( - (__v8df)__A, (__v8df)__B, -(__v8df)__C, (__mmask8)__U, __R); -} - -__funline __m512d _mm512_mask3_fmsubadd_round_pd(__m512d __A, __m512d __B, - __m512d __C, __mmask8 __U, - const int __R) { - return (__m512d)__builtin_ia32_vfmsubaddpd512_mask3( - (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, __R); -} - -__funline __m512d _mm512_maskz_fmsubadd_round_pd(__mmask8 __U, __m512d __A, - __m512d __B, __m512d __C, - const int __R) { - return 
(__m512d)__builtin_ia32_vfmaddsubpd512_maskz( - (__v8df)__A, (__v8df)__B, -(__v8df)__C, (__mmask8)__U, __R); -} - -__funline __m512 _mm512_fmsubadd_round_ps(__m512 __A, __m512 __B, __m512 __C, - const int __R) { - return (__m512)__builtin_ia32_vfmaddsubps512_mask( - (__v16sf)__A, (__v16sf)__B, -(__v16sf)__C, (__mmask16)-1, __R); -} - -__funline __m512 _mm512_mask_fmsubadd_round_ps(__m512 __A, __mmask16 __U, - __m512 __B, __m512 __C, - const int __R) { - return (__m512)__builtin_ia32_vfmaddsubps512_mask( - (__v16sf)__A, (__v16sf)__B, -(__v16sf)__C, (__mmask16)__U, __R); -} - -__funline __m512 _mm512_mask3_fmsubadd_round_ps(__m512 __A, __m512 __B, - __m512 __C, __mmask16 __U, - const int __R) { - return (__m512)__builtin_ia32_vfmsubaddps512_mask3( - (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, __R); -} - -__funline __m512 _mm512_maskz_fmsubadd_round_ps(__mmask16 __U, __m512 __A, - __m512 __B, __m512 __C, - const int __R) { - return (__m512)__builtin_ia32_vfmaddsubps512_maskz( - (__v16sf)__A, (__v16sf)__B, -(__v16sf)__C, (__mmask16)__U, __R); -} - -__funline __m512d _mm512_fnmadd_round_pd(__m512d __A, __m512d __B, __m512d __C, - const int __R) { - return (__m512d)__builtin_ia32_vfnmaddpd512_mask( - (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)-1, __R); -} - -__funline __m512d _mm512_mask_fnmadd_round_pd(__m512d __A, __mmask8 __U, - __m512d __B, __m512d __C, - const int __R) { - return (__m512d)__builtin_ia32_vfnmaddpd512_mask( - (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, __R); -} - -__funline __m512d _mm512_mask3_fnmadd_round_pd(__m512d __A, __m512d __B, - __m512d __C, __mmask8 __U, - const int __R) { - return (__m512d)__builtin_ia32_vfnmaddpd512_mask3( - (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, __R); -} - -__funline __m512d _mm512_maskz_fnmadd_round_pd(__mmask8 __U, __m512d __A, - __m512d __B, __m512d __C, - const int __R) { - return (__m512d)__builtin_ia32_vfnmaddpd512_maskz( - (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, __R); -} - -__funline __m512 _mm512_fnmadd_round_ps(__m512 __A, __m512 __B, __m512 __C, - const int __R) { - return (__m512)__builtin_ia32_vfnmaddps512_mask( - (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)-1, __R); -} - -__funline __m512 _mm512_mask_fnmadd_round_ps(__m512 __A, __mmask16 __U, - __m512 __B, __m512 __C, - const int __R) { - return (__m512)__builtin_ia32_vfnmaddps512_mask( - (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, __R); -} - -__funline __m512 _mm512_mask3_fnmadd_round_ps(__m512 __A, __m512 __B, __m512 __C, - __mmask16 __U, const int __R) { - return (__m512)__builtin_ia32_vfnmaddps512_mask3( - (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, __R); -} - -__funline __m512 _mm512_maskz_fnmadd_round_ps(__mmask16 __U, __m512 __A, - __m512 __B, __m512 __C, - const int __R) { - return (__m512)__builtin_ia32_vfnmaddps512_maskz( - (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, __R); -} - -__funline __m512d _mm512_fnmsub_round_pd(__m512d __A, __m512d __B, __m512d __C, - const int __R) { - return (__m512d)__builtin_ia32_vfnmsubpd512_mask( - (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)-1, __R); -} - -__funline __m512d _mm512_mask_fnmsub_round_pd(__m512d __A, __mmask8 __U, - __m512d __B, __m512d __C, - const int __R) { - return (__m512d)__builtin_ia32_vfnmsubpd512_mask( - (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, __R); -} - -__funline __m512d _mm512_mask3_fnmsub_round_pd(__m512d __A, __m512d __B, - __m512d __C, __mmask8 __U, - const int __R) { - return 
(__m512d)__builtin_ia32_vfnmsubpd512_mask3( - (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, __R); -} - -__funline __m512d _mm512_maskz_fnmsub_round_pd(__mmask8 __U, __m512d __A, - __m512d __B, __m512d __C, - const int __R) { - return (__m512d)__builtin_ia32_vfnmsubpd512_maskz( - (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, __R); -} - -__funline __m512 _mm512_fnmsub_round_ps(__m512 __A, __m512 __B, __m512 __C, - const int __R) { - return (__m512)__builtin_ia32_vfnmsubps512_mask( - (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)-1, __R); -} - -__funline __m512 _mm512_mask_fnmsub_round_ps(__m512 __A, __mmask16 __U, - __m512 __B, __m512 __C, - const int __R) { - return (__m512)__builtin_ia32_vfnmsubps512_mask( - (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, __R); -} - -__funline __m512 _mm512_mask3_fnmsub_round_ps(__m512 __A, __m512 __B, __m512 __C, - __mmask16 __U, const int __R) { - return (__m512)__builtin_ia32_vfnmsubps512_mask3( - (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, __R); -} - -__funline __m512 _mm512_maskz_fnmsub_round_ps(__mmask16 __U, __m512 __A, - __m512 __B, __m512 __C, - const int __R) { - return (__m512)__builtin_ia32_vfnmsubps512_maskz( - (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, __R); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_min_round_ps (__mmask16 __U, __m512 __A, __m512 __B, const int __R) +{ + return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, __R); } #else -#define _mm512_fmadd_round_pd(A, B, C, R) \ - (__m512d) __builtin_ia32_vfmaddpd512_mask(A, B, C, -1, R) - -#define _mm512_mask_fmadd_round_pd(A, U, B, C, R) \ - (__m512d) __builtin_ia32_vfmaddpd512_mask(A, B, C, U, R) - -#define _mm512_mask3_fmadd_round_pd(A, B, C, U, R) \ - (__m512d) __builtin_ia32_vfmaddpd512_mask3(A, B, C, U, R) - -#define _mm512_maskz_fmadd_round_pd(U, A, B, C, R) \ - (__m512d) __builtin_ia32_vfmaddpd512_maskz(A, B, C, U, R) - -#define _mm512_fmadd_round_ps(A, B, C, R) \ - (__m512) __builtin_ia32_vfmaddps512_mask(A, B, C, -1, R) - -#define _mm512_mask_fmadd_round_ps(A, U, B, C, R) \ - (__m512) __builtin_ia32_vfmaddps512_mask(A, B, C, U, R) - -#define _mm512_mask3_fmadd_round_ps(A, B, C, U, R) \ - (__m512) __builtin_ia32_vfmaddps512_mask3(A, B, C, U, R) - -#define _mm512_maskz_fmadd_round_ps(U, A, B, C, R) \ - (__m512) __builtin_ia32_vfmaddps512_maskz(A, B, C, U, R) - -#define _mm512_fmsub_round_pd(A, B, C, R) \ - (__m512d) __builtin_ia32_vfmsubpd512_mask(A, B, C, -1, R) - -#define _mm512_mask_fmsub_round_pd(A, U, B, C, R) \ - (__m512d) __builtin_ia32_vfmsubpd512_mask(A, B, C, U, R) - -#define _mm512_mask3_fmsub_round_pd(A, B, C, U, R) \ - (__m512d) __builtin_ia32_vfmsubpd512_mask3(A, B, C, U, R) - -#define _mm512_maskz_fmsub_round_pd(U, A, B, C, R) \ - (__m512d) __builtin_ia32_vfmsubpd512_maskz(A, B, C, U, R) - -#define _mm512_fmsub_round_ps(A, B, C, R) \ - (__m512) __builtin_ia32_vfmsubps512_mask(A, B, C, -1, R) - -#define _mm512_mask_fmsub_round_ps(A, U, B, C, R) \ - (__m512) __builtin_ia32_vfmsubps512_mask(A, B, C, U, R) - -#define _mm512_mask3_fmsub_round_ps(A, B, C, U, R) \ - (__m512) __builtin_ia32_vfmsubps512_mask3(A, B, C, U, R) - -#define _mm512_maskz_fmsub_round_ps(U, A, B, C, R) \ - (__m512) __builtin_ia32_vfmsubps512_maskz(A, B, C, U, R) - -#define _mm512_fmaddsub_round_pd(A, B, C, R) \ - (__m512d) __builtin_ia32_vfmaddsubpd512_mask(A, B, C, -1, R) - -#define 
_mm512_mask_fmaddsub_round_pd(A, U, B, C, R) \ - (__m512d) __builtin_ia32_vfmaddsubpd512_mask(A, B, C, U, R) - -#define _mm512_mask3_fmaddsub_round_pd(A, B, C, U, R) \ - (__m512d) __builtin_ia32_vfmaddsubpd512_mask3(A, B, C, U, R) - -#define _mm512_maskz_fmaddsub_round_pd(U, A, B, C, R) \ - (__m512d) __builtin_ia32_vfmaddsubpd512_maskz(A, B, C, U, R) - -#define _mm512_fmaddsub_round_ps(A, B, C, R) \ - (__m512) __builtin_ia32_vfmaddsubps512_mask(A, B, C, -1, R) - -#define _mm512_mask_fmaddsub_round_ps(A, U, B, C, R) \ - (__m512) __builtin_ia32_vfmaddsubps512_mask(A, B, C, U, R) - -#define _mm512_mask3_fmaddsub_round_ps(A, B, C, U, R) \ - (__m512) __builtin_ia32_vfmaddsubps512_mask3(A, B, C, U, R) - -#define _mm512_maskz_fmaddsub_round_ps(U, A, B, C, R) \ - (__m512) __builtin_ia32_vfmaddsubps512_maskz(A, B, C, U, R) - -#define _mm512_fmsubadd_round_pd(A, B, C, R) \ - (__m512d) __builtin_ia32_vfmaddsubpd512_mask(A, B, -(C), -1, R) - -#define _mm512_mask_fmsubadd_round_pd(A, U, B, C, R) \ - (__m512d) __builtin_ia32_vfmaddsubpd512_mask(A, B, -(C), U, R) - -#define _mm512_mask3_fmsubadd_round_pd(A, B, C, U, R) \ - (__m512d) __builtin_ia32_vfmsubaddpd512_mask3(A, B, C, U, R) - -#define _mm512_maskz_fmsubadd_round_pd(U, A, B, C, R) \ - (__m512d) __builtin_ia32_vfmaddsubpd512_maskz(A, B, -(C), U, R) - -#define _mm512_fmsubadd_round_ps(A, B, C, R) \ - (__m512) __builtin_ia32_vfmaddsubps512_mask(A, B, -(C), -1, R) - -#define _mm512_mask_fmsubadd_round_ps(A, U, B, C, R) \ - (__m512) __builtin_ia32_vfmaddsubps512_mask(A, B, -(C), U, R) - -#define _mm512_mask3_fmsubadd_round_ps(A, B, C, U, R) \ - (__m512) __builtin_ia32_vfmsubaddps512_mask3(A, B, C, U, R) - -#define _mm512_maskz_fmsubadd_round_ps(U, A, B, C, R) \ - (__m512) __builtin_ia32_vfmaddsubps512_maskz(A, B, -(C), U, R) - -#define _mm512_fnmadd_round_pd(A, B, C, R) \ - (__m512d) __builtin_ia32_vfnmaddpd512_mask(A, B, C, -1, R) - -#define _mm512_mask_fnmadd_round_pd(A, U, B, C, R) \ - (__m512d) __builtin_ia32_vfnmaddpd512_mask(A, B, C, U, R) - -#define _mm512_mask3_fnmadd_round_pd(A, B, C, U, R) \ - (__m512d) __builtin_ia32_vfnmaddpd512_mask3(A, B, C, U, R) - -#define _mm512_maskz_fnmadd_round_pd(U, A, B, C, R) \ - (__m512d) __builtin_ia32_vfnmaddpd512_maskz(A, B, C, U, R) - -#define _mm512_fnmadd_round_ps(A, B, C, R) \ - (__m512) __builtin_ia32_vfnmaddps512_mask(A, B, C, -1, R) - -#define _mm512_mask_fnmadd_round_ps(A, U, B, C, R) \ - (__m512) __builtin_ia32_vfnmaddps512_mask(A, B, C, U, R) - -#define _mm512_mask3_fnmadd_round_ps(A, B, C, U, R) \ - (__m512) __builtin_ia32_vfnmaddps512_mask3(A, B, C, U, R) - -#define _mm512_maskz_fnmadd_round_ps(U, A, B, C, R) \ - (__m512) __builtin_ia32_vfnmaddps512_maskz(A, B, C, U, R) - -#define _mm512_fnmsub_round_pd(A, B, C, R) \ - (__m512d) __builtin_ia32_vfnmsubpd512_mask(A, B, C, -1, R) - -#define _mm512_mask_fnmsub_round_pd(A, U, B, C, R) \ - (__m512d) __builtin_ia32_vfnmsubpd512_mask(A, B, C, U, R) - -#define _mm512_mask3_fnmsub_round_pd(A, B, C, U, R) \ - (__m512d) __builtin_ia32_vfnmsubpd512_mask3(A, B, C, U, R) - -#define _mm512_maskz_fnmsub_round_pd(U, A, B, C, R) \ - (__m512d) __builtin_ia32_vfnmsubpd512_maskz(A, B, C, U, R) - -#define _mm512_fnmsub_round_ps(A, B, C, R) \ - (__m512) __builtin_ia32_vfnmsubps512_mask(A, B, C, -1, R) - -#define _mm512_mask_fnmsub_round_ps(A, U, B, C, R) \ - (__m512) __builtin_ia32_vfnmsubps512_mask(A, B, C, U, R) - -#define _mm512_mask3_fnmsub_round_ps(A, B, C, U, R) \ - (__m512) __builtin_ia32_vfnmsubps512_mask3(A, B, C, U, R) - -#define 
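A detail worth noting in the macro fallbacks above: `fmsubadd` is derived by negating `C` and reusing the `vfmaddsub` builtin, and only the `mask3` forms, which must merge results from the unnegated `C`, get a dedicated `vfmsubadd` builtin. The alternating pattern the pair implements, as a scalar model (illustrative only):

/* fmaddsub: even lanes compute a*b - c, odd lanes a*b + c; fmsubadd is the reverse. */
double fmaddsub_lane(double a, double b, double c, int lane) {
  return (lane & 1) ? a * b + c : a * b - c;
}
double fmsubadd_lane(double a, double b, double c, int lane) {
  return (lane & 1) ? a * b - c : a * b + c;
}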
_mm512_maskz_fnmsub_round_ps(U, A, B, C, R) \ - (__m512) __builtin_ia32_vfnmsubps512_maskz(A, B, C, U, R) +#define _mm512_max_round_pd(A, B, R) (__m512d)__builtin_ia32_maxpd512_mask(A, B, (__v8df)_mm512_undefined_pd(), -1, R) +#define _mm512_mask_max_round_pd(W, U, A, B, R) (__m512d)__builtin_ia32_maxpd512_mask(A, B, W, U, R) +#define _mm512_maskz_max_round_pd(U, A, B, R) (__m512d)__builtin_ia32_maxpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, R) +#define _mm512_max_round_ps(A, B, R) (__m512)__builtin_ia32_maxps512_mask(A, B, (__v16sf)_mm512_undefined_ps(), -1, R) +#define _mm512_mask_max_round_ps(W, U, A, B, R) (__m512)__builtin_ia32_maxps512_mask(A, B, W, U, R) +#define _mm512_maskz_max_round_ps(U, A, B, R) (__m512)__builtin_ia32_maxps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, R) +#define _mm512_min_round_pd(A, B, R) (__m512d)__builtin_ia32_minpd512_mask(A, B, (__v8df)_mm512_undefined_pd(), -1, R) +#define _mm512_mask_min_round_pd(W, U, A, B, R) (__m512d)__builtin_ia32_minpd512_mask(A, B, W, U, R) +#define _mm512_maskz_min_round_pd(U, A, B, R) (__m512d)__builtin_ia32_minpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, R) +#define _mm512_min_round_ps(A, B, R) (__m512)__builtin_ia32_minps512_mask(A, B, (__v16sf)_mm512_undefined_ps(), -1, R) +#define _mm512_mask_min_round_ps(W, U, A, B, R) (__m512)__builtin_ia32_minps512_mask(A, B, W, U, R) +#define _mm512_maskz_min_round_ps(U, A, B, R) (__m512)__builtin_ia32_minps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, R) #endif - -__funline __m512i _mm512_abs_epi64(__m512i __A) { - return (__m512i)__builtin_ia32_pabsq512_mask( - (__v8di)__A, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); +#ifdef __OPTIMIZE__ +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_scalef_round_pd (__m512d __A, __m512d __B, const int __R) +{ + return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1, __R); } - -__funline __m512i _mm512_mask_abs_epi64(__m512i __W, __mmask8 __U, __m512i __A) { - return (__m512i)__builtin_ia32_pabsq512_mask((__v8di)__A, (__v8di)__W, - (__mmask8)__U); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_scalef_round_pd (__m512d __W, __mmask8 __U, __m512d __A, + __m512d __B, const int __R) +{ + return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __W, + (__mmask8) __U, __R); } - -__funline __m512i _mm512_maskz_abs_epi64(__mmask8 __U, __m512i __A) { - return (__m512i)__builtin_ia32_pabsq512_mask( - (__v8di)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_scalef_round_pd (__mmask8 __U, __m512d __A, __m512d __B, + const int __R) +{ + return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, __R); } - -__funline __m512i _mm512_abs_epi32(__m512i __A) { - return (__m512i)__builtin_ia32_pabsd512_mask( - (__v16si)__A, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_scalef_round_ps (__m512 __A, __m512 __B, const int __R) +{ + return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1, __R); } - -__funline __m512i _mm512_mask_abs_epi32(__m512i __W, __mmask16 __U, 
__m512i __A) { - return (__m512i)__builtin_ia32_pabsd512_mask((__v16si)__A, (__v16si)__W, - (__mmask16)__U); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_scalef_round_ps (__m512 __W, __mmask16 __U, __m512 __A, + __m512 __B, const int __R) +{ + return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U, __R); } - -__funline __m512i _mm512_maskz_abs_epi32(__mmask16 __U, __m512i __A) { - return (__m512i)__builtin_ia32_pabsd512_mask( - (__v16si)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_scalef_round_ps (__mmask16 __U, __m512 __A, __m512 __B, + const int __R) +{ + return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, __R); } - -__funline __m512 _mm512_broadcastss_ps(__m128 __A) { - return (__m512)__builtin_ia32_broadcastss512( - (__v4sf)__A, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1); +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_scalef_round_sd (__m128d __A, __m128d __B, const int __R) +{ + return (__m128d) __builtin_ia32_scalefsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) -1, __R); } - -__funline __m512 _mm512_mask_broadcastss_ps(__m512 __O, __mmask16 __M, - __m128 __A) { - return (__m512)__builtin_ia32_broadcastss512((__v4sf)__A, (__v16sf)__O, __M); +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_scalef_round_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B, + const int __R) +{ + return (__m128d) __builtin_ia32_scalefsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U, __R); } - -__funline __m512 _mm512_maskz_broadcastss_ps(__mmask16 __M, __m128 __A) { - return (__m512)__builtin_ia32_broadcastss512( - (__v4sf)__A, (__v16sf)_mm512_setzero_ps(), __M); +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_scalef_round_sd (__mmask8 __U, __m128d __A, __m128d __B, + const int __R) +{ + return (__m128d) __builtin_ia32_scalefsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U, __R); } - -__funline __m512d _mm512_broadcastsd_pd(__m128d __A) { - return (__m512d)__builtin_ia32_broadcastsd512( - (__v2df)__A, (__v8df)_mm512_undefined_pd(), (__mmask8)-1); +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_scalef_round_ss (__m128 __A, __m128 __B, const int __R) +{ + return (__m128) __builtin_ia32_scalefss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) -1, __R); } - -__funline __m512d _mm512_mask_broadcastsd_pd(__m512d __O, __mmask8 __M, - __m128d __A) { - return (__m512d)__builtin_ia32_broadcastsd512((__v2df)__A, (__v8df)__O, __M); +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_scalef_round_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B, + const int __R) +{ + return (__m128) __builtin_ia32_scalefss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U, __R); } - -__funline __m512d _mm512_maskz_broadcastsd_pd(__mmask8 __M, __m128d __A) { - return (__m512d)__builtin_ia32_broadcastsd512( - (__v2df)__A, (__v8df)_mm512_setzero_pd(), __M); +extern 
__inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_scalef_round_ss (__mmask8 __U, __m128 __A, __m128 __B, const int __R) +{ + return (__m128) __builtin_ia32_scalefss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U, __R); } - -__funline __m512i _mm512_broadcastd_epi32(__m128i __A) { - return (__m512i)__builtin_ia32_pbroadcastd512( - (__v4si)__A, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1); +#else +#define _mm512_scalef_round_pd(A, B, C) (__m512d)__builtin_ia32_scalefpd512_mask(A, B, (__v8df)_mm512_undefined_pd(), -1, C) +#define _mm512_mask_scalef_round_pd(W, U, A, B, C) (__m512d)__builtin_ia32_scalefpd512_mask(A, B, W, U, C) +#define _mm512_maskz_scalef_round_pd(U, A, B, C) (__m512d)__builtin_ia32_scalefpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, C) +#define _mm512_scalef_round_ps(A, B, C) (__m512)__builtin_ia32_scalefps512_mask(A, B, (__v16sf)_mm512_undefined_ps(), -1, C) +#define _mm512_mask_scalef_round_ps(W, U, A, B, C) (__m512)__builtin_ia32_scalefps512_mask(A, B, W, U, C) +#define _mm512_maskz_scalef_round_ps(U, A, B, C) (__m512)__builtin_ia32_scalefps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, C) +#define _mm_scalef_round_sd(A, B, C) (__m128d)__builtin_ia32_scalefsd_mask_round (A, B, (__v2df)_mm_setzero_pd (), -1, C) +#define _mm_scalef_round_ss(A, B, C) (__m128)__builtin_ia32_scalefss_mask_round (A, B, (__v4sf)_mm_setzero_ps (), -1, C) +#endif +#define _mm_mask_scalef_sd(W, U, A, B) _mm_mask_scalef_round_sd ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION) +#define _mm_maskz_scalef_sd(U, A, B) _mm_maskz_scalef_round_sd ((U), (A), (B), _MM_FROUND_CUR_DIRECTION) +#define _mm_mask_scalef_ss(W, U, A, B) _mm_mask_scalef_round_ss ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION) +#define _mm_maskz_scalef_ss(U, A, B) _mm_maskz_scalef_round_ss ((U), (A), (B), _MM_FROUND_CUR_DIRECTION) +#ifdef __OPTIMIZE__ +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fmadd_round_pd (__m512d __A, __m512d __B, __m512d __C, const int __R) +{ + return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) -1, __R); } - -__funline __m512i _mm512_mask_broadcastd_epi32(__m512i __O, __mmask16 __M, - __m128i __A) { - return (__m512i)__builtin_ia32_pbroadcastd512((__v4si)__A, (__v16si)__O, __M); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmadd_round_pd (__m512d __A, __mmask8 __U, __m512d __B, + __m512d __C, const int __R) +{ + return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, __R); } - -__funline __m512i _mm512_maskz_broadcastd_epi32(__mmask16 __M, __m128i __A) { - return (__m512i)__builtin_ia32_pbroadcastd512( - (__v4si)__A, (__v16si)_mm512_setzero_si512(), __M); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fmadd_round_pd (__m512d __A, __m512d __B, __m512d __C, + __mmask8 __U, const int __R) +{ + return (__m512d) __builtin_ia32_vfmaddpd512_mask3 ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, __R); } - -__funline __m512i _mm512_set1_epi32(int __A) { - return (__m512i)__builtin_ia32_pbroadcastd512_gpr_mask( - __A, (__v16si)_mm512_undefined_epi32(), (__mmask16)(-1)); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fmadd_round_pd (__mmask8 __U, __m512d __A, 
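`vscalef` computes `__A * 2^floor(__B)` per element, essentially a vectorized `scalbn`. A usage sketch under the same `-mavx512f` assumption (helper name illustrative):

#include <immintrin.h>

/* Scale each lane of x by 2^e, with per-lane exponents in e. */
__m512d ldexp_lanes(__m512d x, __m512d e) {
  return _mm512_scalef_round_pd(x, e, _MM_FROUND_CUR_DIRECTION);
}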
__m512d __B, + __m512d __C, const int __R) +{ + return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, __R); } - -__funline __m512i _mm512_mask_set1_epi32(__m512i __O, __mmask16 __M, int __A) { - return (__m512i)__builtin_ia32_pbroadcastd512_gpr_mask(__A, (__v16si)__O, - __M); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fmadd_round_ps (__m512 __A, __m512 __B, __m512 __C, const int __R) +{ + return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) -1, __R); } - -__funline __m512i _mm512_maskz_set1_epi32(__mmask16 __M, int __A) { - return (__m512i)__builtin_ia32_pbroadcastd512_gpr_mask( - __A, (__v16si)_mm512_setzero_si512(), __M); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmadd_round_ps (__m512 __A, __mmask16 __U, __m512 __B, + __m512 __C, const int __R) +{ + return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, __R); } - -__funline __m512i _mm512_broadcastq_epi64(__m128i __A) { - return (__m512i)__builtin_ia32_pbroadcastq512( - (__v2di)__A, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fmadd_round_ps (__m512 __A, __m512 __B, __m512 __C, + __mmask16 __U, const int __R) +{ + return (__m512) __builtin_ia32_vfmaddps512_mask3 ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, __R); } - -__funline __m512i _mm512_mask_broadcastq_epi64(__m512i __O, __mmask8 __M, - __m128i __A) { - return (__m512i)__builtin_ia32_pbroadcastq512((__v2di)__A, (__v8di)__O, __M); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fmadd_round_ps (__mmask16 __U, __m512 __A, __m512 __B, + __m512 __C, const int __R) +{ + return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, __R); } - -__funline __m512i _mm512_maskz_broadcastq_epi64(__mmask8 __M, __m128i __A) { - return (__m512i)__builtin_ia32_pbroadcastq512( - (__v2di)__A, (__v8di)_mm512_setzero_si512(), __M); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fmsub_round_pd (__m512d __A, __m512d __B, __m512d __C, const int __R) +{ + return (__m512d) __builtin_ia32_vfmsubpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) -1, __R); } - -__funline __m512i _mm512_set1_epi64(long long __A) { - return (__m512i)__builtin_ia32_pbroadcastq512_gpr_mask( - __A, (__v8di)_mm512_undefined_epi32(), (__mmask8)(-1)); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmsub_round_pd (__m512d __A, __mmask8 __U, __m512d __B, + __m512d __C, const int __R) +{ + return (__m512d) __builtin_ia32_vfmsubpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, __R); } - -__funline __m512i _mm512_mask_set1_epi64(__m512i __O, __mmask8 __M, - long long __A) { - return (__m512i)__builtin_ia32_pbroadcastq512_gpr_mask(__A, (__v8di)__O, __M); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fmsub_round_pd (__m512d __A, __m512d __B, __m512d __C, + __mmask8 __U, const int __R) +{ + return (__m512d) __builtin_ia32_vfmsubpd512_mask3 ((__v8df) __A, + (__v8df) __B, + (__v8df) 
__C, + (__mmask8) __U, __R); } - -__funline __m512i _mm512_maskz_set1_epi64(__mmask8 __M, long long __A) { - return (__m512i)__builtin_ia32_pbroadcastq512_gpr_mask( - __A, (__v8di)_mm512_setzero_si512(), __M); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fmsub_round_pd (__mmask8 __U, __m512d __A, __m512d __B, + __m512d __C, const int __R) +{ + return (__m512d) __builtin_ia32_vfmsubpd512_maskz ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, __R); } - -__funline __m512 _mm512_broadcast_f32x4(__m128 __A) { - return (__m512)__builtin_ia32_broadcastf32x4_512( - (__v4sf)__A, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fmsub_round_ps (__m512 __A, __m512 __B, __m512 __C, const int __R) +{ + return (__m512) __builtin_ia32_vfmsubps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) -1, __R); } - -__funline __m512 _mm512_mask_broadcast_f32x4(__m512 __O, __mmask16 __M, - __m128 __A) { - return (__m512)__builtin_ia32_broadcastf32x4_512((__v4sf)__A, (__v16sf)__O, - __M); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmsub_round_ps (__m512 __A, __mmask16 __U, __m512 __B, + __m512 __C, const int __R) +{ + return (__m512) __builtin_ia32_vfmsubps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, __R); } - -__funline __m512 _mm512_maskz_broadcast_f32x4(__mmask16 __M, __m128 __A) { - return (__m512)__builtin_ia32_broadcastf32x4_512( - (__v4sf)__A, (__v16sf)_mm512_setzero_ps(), __M); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fmsub_round_ps (__m512 __A, __m512 __B, __m512 __C, + __mmask16 __U, const int __R) +{ + return (__m512) __builtin_ia32_vfmsubps512_mask3 ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, __R); } - -__funline __m512i _mm512_broadcast_i32x4(__m128i __A) { - return (__m512i)__builtin_ia32_broadcasti32x4_512( - (__v4si)__A, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fmsub_round_ps (__mmask16 __U, __m512 __A, __m512 __B, + __m512 __C, const int __R) +{ + return (__m512) __builtin_ia32_vfmsubps512_maskz ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, __R); } - -__funline __m512i _mm512_mask_broadcast_i32x4(__m512i __O, __mmask16 __M, - __m128i __A) { - return (__m512i)__builtin_ia32_broadcasti32x4_512((__v4si)__A, (__v16si)__O, - __M); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fmaddsub_round_pd (__m512d __A, __m512d __B, __m512d __C, const int __R) +{ + return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) -1, __R); } - -__funline __m512i _mm512_maskz_broadcast_i32x4(__mmask16 __M, __m128i __A) { - return (__m512i)__builtin_ia32_broadcasti32x4_512( - (__v4si)__A, (__v16si)_mm512_setzero_si512(), __M); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmaddsub_round_pd (__m512d __A, __mmask8 __U, __m512d __B, + __m512d __C, const int __R) +{ + return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, __R); } - -__funline __m512d _mm512_broadcast_f64x4(__m256d 
__A) { - return (__m512d)__builtin_ia32_broadcastf64x4_512( - (__v4df)__A, (__v8df)_mm512_undefined_pd(), (__mmask8)-1); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fmaddsub_round_pd (__m512d __A, __m512d __B, __m512d __C, + __mmask8 __U, const int __R) +{ + return (__m512d) __builtin_ia32_vfmaddsubpd512_mask3 ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, __R); } - -__funline __m512d _mm512_mask_broadcast_f64x4(__m512d __O, __mmask8 __M, - __m256d __A) { - return (__m512d)__builtin_ia32_broadcastf64x4_512((__v4df)__A, (__v8df)__O, - __M); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fmaddsub_round_pd (__mmask8 __U, __m512d __A, __m512d __B, + __m512d __C, const int __R) +{ + return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, __R); } - -__funline __m512d _mm512_maskz_broadcast_f64x4(__mmask8 __M, __m256d __A) { - return (__m512d)__builtin_ia32_broadcastf64x4_512( - (__v4df)__A, (__v8df)_mm512_setzero_pd(), __M); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fmaddsub_round_ps (__m512 __A, __m512 __B, __m512 __C, const int __R) +{ + return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) -1, __R); } - -__funline __m512i _mm512_broadcast_i64x4(__m256i __A) { - return (__m512i)__builtin_ia32_broadcasti64x4_512( - (__v4di)__A, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmaddsub_round_ps (__m512 __A, __mmask16 __U, __m512 __B, + __m512 __C, const int __R) +{ + return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, __R); } - -__funline __m512i _mm512_mask_broadcast_i64x4(__m512i __O, __mmask8 __M, - __m256i __A) { - return (__m512i)__builtin_ia32_broadcasti64x4_512((__v4di)__A, (__v8di)__O, - __M); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fmaddsub_round_ps (__m512 __A, __m512 __B, __m512 __C, + __mmask16 __U, const int __R) +{ + return (__m512) __builtin_ia32_vfmaddsubps512_mask3 ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, __R); } - -__funline __m512i _mm512_maskz_broadcast_i64x4(__mmask8 __M, __m256i __A) { - return (__m512i)__builtin_ia32_broadcasti64x4_512( - (__v4di)__A, (__v8di)_mm512_setzero_si512(), __M); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fmaddsub_round_ps (__mmask16 __U, __m512 __A, __m512 __B, + __m512 __C, const int __R) +{ + return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, __R); } - -typedef enum { - _MM_PERM_AAAA = 0x00, - _MM_PERM_AAAB = 0x01, - _MM_PERM_AAAC = 0x02, - _MM_PERM_AAAD = 0x03, - _MM_PERM_AABA = 0x04, - _MM_PERM_AABB = 0x05, - _MM_PERM_AABC = 0x06, - _MM_PERM_AABD = 0x07, - _MM_PERM_AACA = 0x08, - _MM_PERM_AACB = 0x09, - _MM_PERM_AACC = 0x0A, - _MM_PERM_AACD = 0x0B, - _MM_PERM_AADA = 0x0C, - _MM_PERM_AADB = 0x0D, - _MM_PERM_AADC = 0x0E, - _MM_PERM_AADD = 0x0F, - _MM_PERM_ABAA = 0x10, - _MM_PERM_ABAB = 0x11, - _MM_PERM_ABAC = 0x12, - _MM_PERM_ABAD = 0x13, - _MM_PERM_ABBA = 0x14, - _MM_PERM_ABBB = 0x15, - _MM_PERM_ABBC = 0x16, - _MM_PERM_ABBD = 
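The `mask`/`mask3`/`maskz` suffixes that recur throughout these definitions differ only in what happens to lanes whose mask bit is clear: `mask` keeps the corresponding lane of the first operand, `mask3` keeps the lane of the addend `__C`, and `maskz` zeroes it. A sketch of the three flavors (assumes `-mavx512f`; names illustrative):

#include <immintrin.h>

__m512d merge_from_a(__m512d a, __mmask8 m, __m512d b, __m512d c) {
  return _mm512_mask_fmadd_pd(a, m, b, c);   /* clear lanes keep a */
}
__m512d merge_from_c(__m512d a, __m512d b, __m512d c, __mmask8 m) {
  return _mm512_mask3_fmadd_pd(a, b, c, m);  /* clear lanes keep c */
}
__m512d zero_fill(__mmask8 m, __m512d a, __m512d b, __m512d c) {
  return _mm512_maskz_fmadd_pd(m, a, b, c);  /* clear lanes become 0.0 */
}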
0x17, - _MM_PERM_ABCA = 0x18, - _MM_PERM_ABCB = 0x19, - _MM_PERM_ABCC = 0x1A, - _MM_PERM_ABCD = 0x1B, - _MM_PERM_ABDA = 0x1C, - _MM_PERM_ABDB = 0x1D, - _MM_PERM_ABDC = 0x1E, - _MM_PERM_ABDD = 0x1F, - _MM_PERM_ACAA = 0x20, - _MM_PERM_ACAB = 0x21, - _MM_PERM_ACAC = 0x22, - _MM_PERM_ACAD = 0x23, - _MM_PERM_ACBA = 0x24, - _MM_PERM_ACBB = 0x25, - _MM_PERM_ACBC = 0x26, - _MM_PERM_ACBD = 0x27, - _MM_PERM_ACCA = 0x28, - _MM_PERM_ACCB = 0x29, - _MM_PERM_ACCC = 0x2A, - _MM_PERM_ACCD = 0x2B, - _MM_PERM_ACDA = 0x2C, - _MM_PERM_ACDB = 0x2D, - _MM_PERM_ACDC = 0x2E, - _MM_PERM_ACDD = 0x2F, - _MM_PERM_ADAA = 0x30, - _MM_PERM_ADAB = 0x31, - _MM_PERM_ADAC = 0x32, - _MM_PERM_ADAD = 0x33, - _MM_PERM_ADBA = 0x34, - _MM_PERM_ADBB = 0x35, - _MM_PERM_ADBC = 0x36, - _MM_PERM_ADBD = 0x37, - _MM_PERM_ADCA = 0x38, - _MM_PERM_ADCB = 0x39, - _MM_PERM_ADCC = 0x3A, - _MM_PERM_ADCD = 0x3B, - _MM_PERM_ADDA = 0x3C, - _MM_PERM_ADDB = 0x3D, - _MM_PERM_ADDC = 0x3E, - _MM_PERM_ADDD = 0x3F, - _MM_PERM_BAAA = 0x40, - _MM_PERM_BAAB = 0x41, - _MM_PERM_BAAC = 0x42, - _MM_PERM_BAAD = 0x43, - _MM_PERM_BABA = 0x44, - _MM_PERM_BABB = 0x45, - _MM_PERM_BABC = 0x46, - _MM_PERM_BABD = 0x47, - _MM_PERM_BACA = 0x48, - _MM_PERM_BACB = 0x49, - _MM_PERM_BACC = 0x4A, - _MM_PERM_BACD = 0x4B, - _MM_PERM_BADA = 0x4C, - _MM_PERM_BADB = 0x4D, - _MM_PERM_BADC = 0x4E, - _MM_PERM_BADD = 0x4F, - _MM_PERM_BBAA = 0x50, - _MM_PERM_BBAB = 0x51, - _MM_PERM_BBAC = 0x52, - _MM_PERM_BBAD = 0x53, - _MM_PERM_BBBA = 0x54, - _MM_PERM_BBBB = 0x55, - _MM_PERM_BBBC = 0x56, - _MM_PERM_BBBD = 0x57, - _MM_PERM_BBCA = 0x58, - _MM_PERM_BBCB = 0x59, - _MM_PERM_BBCC = 0x5A, - _MM_PERM_BBCD = 0x5B, - _MM_PERM_BBDA = 0x5C, - _MM_PERM_BBDB = 0x5D, - _MM_PERM_BBDC = 0x5E, - _MM_PERM_BBDD = 0x5F, - _MM_PERM_BCAA = 0x60, - _MM_PERM_BCAB = 0x61, - _MM_PERM_BCAC = 0x62, - _MM_PERM_BCAD = 0x63, - _MM_PERM_BCBA = 0x64, - _MM_PERM_BCBB = 0x65, - _MM_PERM_BCBC = 0x66, - _MM_PERM_BCBD = 0x67, - _MM_PERM_BCCA = 0x68, - _MM_PERM_BCCB = 0x69, - _MM_PERM_BCCC = 0x6A, - _MM_PERM_BCCD = 0x6B, - _MM_PERM_BCDA = 0x6C, - _MM_PERM_BCDB = 0x6D, - _MM_PERM_BCDC = 0x6E, - _MM_PERM_BCDD = 0x6F, - _MM_PERM_BDAA = 0x70, - _MM_PERM_BDAB = 0x71, - _MM_PERM_BDAC = 0x72, - _MM_PERM_BDAD = 0x73, - _MM_PERM_BDBA = 0x74, - _MM_PERM_BDBB = 0x75, - _MM_PERM_BDBC = 0x76, - _MM_PERM_BDBD = 0x77, - _MM_PERM_BDCA = 0x78, - _MM_PERM_BDCB = 0x79, - _MM_PERM_BDCC = 0x7A, - _MM_PERM_BDCD = 0x7B, - _MM_PERM_BDDA = 0x7C, - _MM_PERM_BDDB = 0x7D, - _MM_PERM_BDDC = 0x7E, - _MM_PERM_BDDD = 0x7F, - _MM_PERM_CAAA = 0x80, - _MM_PERM_CAAB = 0x81, - _MM_PERM_CAAC = 0x82, - _MM_PERM_CAAD = 0x83, - _MM_PERM_CABA = 0x84, - _MM_PERM_CABB = 0x85, - _MM_PERM_CABC = 0x86, - _MM_PERM_CABD = 0x87, - _MM_PERM_CACA = 0x88, - _MM_PERM_CACB = 0x89, - _MM_PERM_CACC = 0x8A, - _MM_PERM_CACD = 0x8B, - _MM_PERM_CADA = 0x8C, - _MM_PERM_CADB = 0x8D, - _MM_PERM_CADC = 0x8E, - _MM_PERM_CADD = 0x8F, - _MM_PERM_CBAA = 0x90, - _MM_PERM_CBAB = 0x91, - _MM_PERM_CBAC = 0x92, - _MM_PERM_CBAD = 0x93, - _MM_PERM_CBBA = 0x94, - _MM_PERM_CBBB = 0x95, - _MM_PERM_CBBC = 0x96, - _MM_PERM_CBBD = 0x97, - _MM_PERM_CBCA = 0x98, - _MM_PERM_CBCB = 0x99, - _MM_PERM_CBCC = 0x9A, - _MM_PERM_CBCD = 0x9B, - _MM_PERM_CBDA = 0x9C, - _MM_PERM_CBDB = 0x9D, - _MM_PERM_CBDC = 0x9E, - _MM_PERM_CBDD = 0x9F, - _MM_PERM_CCAA = 0xA0, - _MM_PERM_CCAB = 0xA1, - _MM_PERM_CCAC = 0xA2, - _MM_PERM_CCAD = 0xA3, - _MM_PERM_CCBA = 0xA4, - _MM_PERM_CCBB = 0xA5, - _MM_PERM_CCBC = 0xA6, - _MM_PERM_CCBD = 0xA7, - _MM_PERM_CCCA = 0xA8, - _MM_PERM_CCCB = 0xA9, - _MM_PERM_CCCC = 0xAA, - _MM_PERM_CCCD = 
0xAB, - _MM_PERM_CCDA = 0xAC, - _MM_PERM_CCDB = 0xAD, - _MM_PERM_CCDC = 0xAE, - _MM_PERM_CCDD = 0xAF, - _MM_PERM_CDAA = 0xB0, - _MM_PERM_CDAB = 0xB1, - _MM_PERM_CDAC = 0xB2, - _MM_PERM_CDAD = 0xB3, - _MM_PERM_CDBA = 0xB4, - _MM_PERM_CDBB = 0xB5, - _MM_PERM_CDBC = 0xB6, - _MM_PERM_CDBD = 0xB7, - _MM_PERM_CDCA = 0xB8, - _MM_PERM_CDCB = 0xB9, - _MM_PERM_CDCC = 0xBA, - _MM_PERM_CDCD = 0xBB, - _MM_PERM_CDDA = 0xBC, - _MM_PERM_CDDB = 0xBD, - _MM_PERM_CDDC = 0xBE, - _MM_PERM_CDDD = 0xBF, - _MM_PERM_DAAA = 0xC0, - _MM_PERM_DAAB = 0xC1, - _MM_PERM_DAAC = 0xC2, - _MM_PERM_DAAD = 0xC3, - _MM_PERM_DABA = 0xC4, - _MM_PERM_DABB = 0xC5, - _MM_PERM_DABC = 0xC6, - _MM_PERM_DABD = 0xC7, - _MM_PERM_DACA = 0xC8, - _MM_PERM_DACB = 0xC9, - _MM_PERM_DACC = 0xCA, - _MM_PERM_DACD = 0xCB, - _MM_PERM_DADA = 0xCC, - _MM_PERM_DADB = 0xCD, - _MM_PERM_DADC = 0xCE, - _MM_PERM_DADD = 0xCF, - _MM_PERM_DBAA = 0xD0, - _MM_PERM_DBAB = 0xD1, - _MM_PERM_DBAC = 0xD2, - _MM_PERM_DBAD = 0xD3, - _MM_PERM_DBBA = 0xD4, - _MM_PERM_DBBB = 0xD5, - _MM_PERM_DBBC = 0xD6, - _MM_PERM_DBBD = 0xD7, - _MM_PERM_DBCA = 0xD8, - _MM_PERM_DBCB = 0xD9, - _MM_PERM_DBCC = 0xDA, - _MM_PERM_DBCD = 0xDB, - _MM_PERM_DBDA = 0xDC, - _MM_PERM_DBDB = 0xDD, - _MM_PERM_DBDC = 0xDE, - _MM_PERM_DBDD = 0xDF, - _MM_PERM_DCAA = 0xE0, - _MM_PERM_DCAB = 0xE1, - _MM_PERM_DCAC = 0xE2, - _MM_PERM_DCAD = 0xE3, - _MM_PERM_DCBA = 0xE4, - _MM_PERM_DCBB = 0xE5, - _MM_PERM_DCBC = 0xE6, - _MM_PERM_DCBD = 0xE7, - _MM_PERM_DCCA = 0xE8, - _MM_PERM_DCCB = 0xE9, - _MM_PERM_DCCC = 0xEA, - _MM_PERM_DCCD = 0xEB, - _MM_PERM_DCDA = 0xEC, - _MM_PERM_DCDB = 0xED, - _MM_PERM_DCDC = 0xEE, - _MM_PERM_DCDD = 0xEF, - _MM_PERM_DDAA = 0xF0, - _MM_PERM_DDAB = 0xF1, - _MM_PERM_DDAC = 0xF2, - _MM_PERM_DDAD = 0xF3, - _MM_PERM_DDBA = 0xF4, - _MM_PERM_DDBB = 0xF5, - _MM_PERM_DDBC = 0xF6, - _MM_PERM_DDBD = 0xF7, - _MM_PERM_DDCA = 0xF8, - _MM_PERM_DDCB = 0xF9, - _MM_PERM_DDCC = 0xFA, - _MM_PERM_DDCD = 0xFB, - _MM_PERM_DDDA = 0xFC, - _MM_PERM_DDDB = 0xFD, - _MM_PERM_DDDC = 0xFE, +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fmsubadd_round_pd (__m512d __A, __m512d __B, __m512d __C, const int __R) +{ + return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + (__mmask8) -1, __R); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmsubadd_round_pd (__m512d __A, __mmask8 __U, __m512d __B, + __m512d __C, const int __R) +{ + return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + (__mmask8) __U, __R); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fmsubadd_round_pd (__m512d __A, __m512d __B, __m512d __C, + __mmask8 __U, const int __R) +{ + return (__m512d) __builtin_ia32_vfmsubaddpd512_mask3 ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, __R); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fmsubadd_round_pd (__mmask8 __U, __m512d __A, __m512d __B, + __m512d __C, const int __R) +{ + return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + (__mmask8) __U, __R); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fmsubadd_round_ps (__m512 __A, __m512 __B, __m512 __C, const int __R) +{ + return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, + (__v16sf) __B, + 
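The `fnm*` family negates the product rather than the addend: `fnmadd` is `-(a*b) + c` and `fnmsub` is `-(a*b) - c`. As a scalar model (illustrative only):

double fnmadd_lane(double a, double b, double c) { return -(a * b) + c; }
double fnmsub_lane(double a, double b, double c) { return -(a * b) - c; }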
-(__v16sf) __C, + (__mmask16) -1, __R); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmsubadd_round_ps (__m512 __A, __mmask16 __U, __m512 __B, + __m512 __C, const int __R) +{ + return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + (__mmask16) __U, __R); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fmsubadd_round_ps (__m512 __A, __m512 __B, __m512 __C, + __mmask16 __U, const int __R) +{ + return (__m512) __builtin_ia32_vfmsubaddps512_mask3 ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, __R); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fmsubadd_round_ps (__mmask16 __U, __m512 __A, __m512 __B, + __m512 __C, const int __R) +{ + return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + (__mmask16) __U, __R); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fnmadd_round_pd (__m512d __A, __m512d __B, __m512d __C, const int __R) +{ + return (__m512d) __builtin_ia32_vfnmaddpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) -1, __R); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fnmadd_round_pd (__m512d __A, __mmask8 __U, __m512d __B, + __m512d __C, const int __R) +{ + return (__m512d) __builtin_ia32_vfnmaddpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, __R); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fnmadd_round_pd (__m512d __A, __m512d __B, __m512d __C, + __mmask8 __U, const int __R) +{ + return (__m512d) __builtin_ia32_vfnmaddpd512_mask3 ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, __R); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fnmadd_round_pd (__mmask8 __U, __m512d __A, __m512d __B, + __m512d __C, const int __R) +{ + return (__m512d) __builtin_ia32_vfnmaddpd512_maskz ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, __R); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fnmadd_round_ps (__m512 __A, __m512 __B, __m512 __C, const int __R) +{ + return (__m512) __builtin_ia32_vfnmaddps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) -1, __R); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fnmadd_round_ps (__m512 __A, __mmask16 __U, __m512 __B, + __m512 __C, const int __R) +{ + return (__m512) __builtin_ia32_vfnmaddps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, __R); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fnmadd_round_ps (__m512 __A, __m512 __B, __m512 __C, + __mmask16 __U, const int __R) +{ + return (__m512) __builtin_ia32_vfnmaddps512_mask3 ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, __R); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fnmadd_round_ps (__mmask16 __U, __m512 __A, __m512 __B, + __m512 __C, const int __R) +{ + return (__m512) __builtin_ia32_vfnmaddps512_maskz ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + 
(__mmask16) __U, __R); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fnmsub_round_pd (__m512d __A, __m512d __B, __m512d __C, const int __R) +{ + return (__m512d) __builtin_ia32_vfnmsubpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) -1, __R); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fnmsub_round_pd (__m512d __A, __mmask8 __U, __m512d __B, + __m512d __C, const int __R) +{ + return (__m512d) __builtin_ia32_vfnmsubpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, __R); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fnmsub_round_pd (__m512d __A, __m512d __B, __m512d __C, + __mmask8 __U, const int __R) +{ + return (__m512d) __builtin_ia32_vfnmsubpd512_mask3 ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, __R); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fnmsub_round_pd (__mmask8 __U, __m512d __A, __m512d __B, + __m512d __C, const int __R) +{ + return (__m512d) __builtin_ia32_vfnmsubpd512_maskz ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, __R); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fnmsub_round_ps (__m512 __A, __m512 __B, __m512 __C, const int __R) +{ + return (__m512) __builtin_ia32_vfnmsubps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) -1, __R); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fnmsub_round_ps (__m512 __A, __mmask16 __U, __m512 __B, + __m512 __C, const int __R) +{ + return (__m512) __builtin_ia32_vfnmsubps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, __R); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fnmsub_round_ps (__m512 __A, __m512 __B, __m512 __C, + __mmask16 __U, const int __R) +{ + return (__m512) __builtin_ia32_vfnmsubps512_mask3 ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, __R); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fnmsub_round_ps (__mmask16 __U, __m512 __A, __m512 __B, + __m512 __C, const int __R) +{ + return (__m512) __builtin_ia32_vfnmsubps512_maskz ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, __R); +} +#else +#define _mm512_fmadd_round_pd(A, B, C, R) (__m512d)__builtin_ia32_vfmaddpd512_mask(A, B, C, -1, R) +#define _mm512_mask_fmadd_round_pd(A, U, B, C, R) (__m512d)__builtin_ia32_vfmaddpd512_mask(A, B, C, U, R) +#define _mm512_mask3_fmadd_round_pd(A, B, C, U, R) (__m512d)__builtin_ia32_vfmaddpd512_mask3(A, B, C, U, R) +#define _mm512_maskz_fmadd_round_pd(U, A, B, C, R) (__m512d)__builtin_ia32_vfmaddpd512_maskz(A, B, C, U, R) +#define _mm512_fmadd_round_ps(A, B, C, R) (__m512)__builtin_ia32_vfmaddps512_mask(A, B, C, -1, R) +#define _mm512_mask_fmadd_round_ps(A, U, B, C, R) (__m512)__builtin_ia32_vfmaddps512_mask(A, B, C, U, R) +#define _mm512_mask3_fmadd_round_ps(A, B, C, U, R) (__m512)__builtin_ia32_vfmaddps512_mask3(A, B, C, U, R) +#define _mm512_maskz_fmadd_round_ps(U, A, B, C, R) (__m512)__builtin_ia32_vfmaddps512_maskz(A, B, C, U, R) +#define _mm512_fmsub_round_pd(A, B, C, R) (__m512d)__builtin_ia32_vfmsubpd512_mask(A, B, C, -1, R) +#define 
_mm512_mask_fmsub_round_pd(A, U, B, C, R) (__m512d)__builtin_ia32_vfmsubpd512_mask(A, B, C, U, R) +#define _mm512_mask3_fmsub_round_pd(A, B, C, U, R) (__m512d)__builtin_ia32_vfmsubpd512_mask3(A, B, C, U, R) +#define _mm512_maskz_fmsub_round_pd(U, A, B, C, R) (__m512d)__builtin_ia32_vfmsubpd512_maskz(A, B, C, U, R) +#define _mm512_fmsub_round_ps(A, B, C, R) (__m512)__builtin_ia32_vfmsubps512_mask(A, B, C, -1, R) +#define _mm512_mask_fmsub_round_ps(A, U, B, C, R) (__m512)__builtin_ia32_vfmsubps512_mask(A, B, C, U, R) +#define _mm512_mask3_fmsub_round_ps(A, B, C, U, R) (__m512)__builtin_ia32_vfmsubps512_mask3(A, B, C, U, R) +#define _mm512_maskz_fmsub_round_ps(U, A, B, C, R) (__m512)__builtin_ia32_vfmsubps512_maskz(A, B, C, U, R) +#define _mm512_fmaddsub_round_pd(A, B, C, R) (__m512d)__builtin_ia32_vfmaddsubpd512_mask(A, B, C, -1, R) +#define _mm512_mask_fmaddsub_round_pd(A, U, B, C, R) (__m512d)__builtin_ia32_vfmaddsubpd512_mask(A, B, C, U, R) +#define _mm512_mask3_fmaddsub_round_pd(A, B, C, U, R) (__m512d)__builtin_ia32_vfmaddsubpd512_mask3(A, B, C, U, R) +#define _mm512_maskz_fmaddsub_round_pd(U, A, B, C, R) (__m512d)__builtin_ia32_vfmaddsubpd512_maskz(A, B, C, U, R) +#define _mm512_fmaddsub_round_ps(A, B, C, R) (__m512)__builtin_ia32_vfmaddsubps512_mask(A, B, C, -1, R) +#define _mm512_mask_fmaddsub_round_ps(A, U, B, C, R) (__m512)__builtin_ia32_vfmaddsubps512_mask(A, B, C, U, R) +#define _mm512_mask3_fmaddsub_round_ps(A, B, C, U, R) (__m512)__builtin_ia32_vfmaddsubps512_mask3(A, B, C, U, R) +#define _mm512_maskz_fmaddsub_round_ps(U, A, B, C, R) (__m512)__builtin_ia32_vfmaddsubps512_maskz(A, B, C, U, R) +#define _mm512_fmsubadd_round_pd(A, B, C, R) (__m512d)__builtin_ia32_vfmaddsubpd512_mask(A, B, -(C), -1, R) +#define _mm512_mask_fmsubadd_round_pd(A, U, B, C, R) (__m512d)__builtin_ia32_vfmaddsubpd512_mask(A, B, -(C), U, R) +#define _mm512_mask3_fmsubadd_round_pd(A, B, C, U, R) (__m512d)__builtin_ia32_vfmsubaddpd512_mask3(A, B, C, U, R) +#define _mm512_maskz_fmsubadd_round_pd(U, A, B, C, R) (__m512d)__builtin_ia32_vfmaddsubpd512_maskz(A, B, -(C), U, R) +#define _mm512_fmsubadd_round_ps(A, B, C, R) (__m512)__builtin_ia32_vfmaddsubps512_mask(A, B, -(C), -1, R) +#define _mm512_mask_fmsubadd_round_ps(A, U, B, C, R) (__m512)__builtin_ia32_vfmaddsubps512_mask(A, B, -(C), U, R) +#define _mm512_mask3_fmsubadd_round_ps(A, B, C, U, R) (__m512)__builtin_ia32_vfmsubaddps512_mask3(A, B, C, U, R) +#define _mm512_maskz_fmsubadd_round_ps(U, A, B, C, R) (__m512)__builtin_ia32_vfmaddsubps512_maskz(A, B, -(C), U, R) +#define _mm512_fnmadd_round_pd(A, B, C, R) (__m512d)__builtin_ia32_vfnmaddpd512_mask(A, B, C, -1, R) +#define _mm512_mask_fnmadd_round_pd(A, U, B, C, R) (__m512d)__builtin_ia32_vfnmaddpd512_mask(A, B, C, U, R) +#define _mm512_mask3_fnmadd_round_pd(A, B, C, U, R) (__m512d)__builtin_ia32_vfnmaddpd512_mask3(A, B, C, U, R) +#define _mm512_maskz_fnmadd_round_pd(U, A, B, C, R) (__m512d)__builtin_ia32_vfnmaddpd512_maskz(A, B, C, U, R) +#define _mm512_fnmadd_round_ps(A, B, C, R) (__m512)__builtin_ia32_vfnmaddps512_mask(A, B, C, -1, R) +#define _mm512_mask_fnmadd_round_ps(A, U, B, C, R) (__m512)__builtin_ia32_vfnmaddps512_mask(A, B, C, U, R) +#define _mm512_mask3_fnmadd_round_ps(A, B, C, U, R) (__m512)__builtin_ia32_vfnmaddps512_mask3(A, B, C, U, R) +#define _mm512_maskz_fnmadd_round_ps(U, A, B, C, R) (__m512)__builtin_ia32_vfnmaddps512_maskz(A, B, C, U, R) +#define _mm512_fnmsub_round_pd(A, B, C, R) (__m512d)__builtin_ia32_vfnmsubpd512_mask(A, B, C, -1, R) +#define _mm512_mask_fnmsub_round_pd(A, U, B, 
C, R) (__m512d)__builtin_ia32_vfnmsubpd512_mask(A, B, C, U, R) +#define _mm512_mask3_fnmsub_round_pd(A, B, C, U, R) (__m512d)__builtin_ia32_vfnmsubpd512_mask3(A, B, C, U, R) +#define _mm512_maskz_fnmsub_round_pd(U, A, B, C, R) (__m512d)__builtin_ia32_vfnmsubpd512_maskz(A, B, C, U, R) +#define _mm512_fnmsub_round_ps(A, B, C, R) (__m512)__builtin_ia32_vfnmsubps512_mask(A, B, C, -1, R) +#define _mm512_mask_fnmsub_round_ps(A, U, B, C, R) (__m512)__builtin_ia32_vfnmsubps512_mask(A, B, C, U, R) +#define _mm512_mask3_fnmsub_round_ps(A, B, C, U, R) (__m512)__builtin_ia32_vfnmsubps512_mask3(A, B, C, U, R) +#define _mm512_maskz_fnmsub_round_ps(U, A, B, C, R) (__m512)__builtin_ia32_vfnmsubps512_maskz(A, B, C, U, R) +#endif +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_abs_epi64 (__m512i __A) +{ + return (__m512i) __builtin_ia32_pabsq512_mask ((__v8di) __A, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_abs_epi64 (__m512i __W, __mmask8 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_pabsq512_mask ((__v8di) __A, + (__v8di) __W, + (__mmask8) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_abs_epi64 (__mmask8 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_pabsq512_mask ((__v8di) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_abs_epi32 (__m512i __A) +{ + return (__m512i) __builtin_ia32_pabsd512_mask ((__v16si) __A, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_abs_epi32 (__m512i __W, __mmask16 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_pabsd512_mask ((__v16si) __A, + (__v16si) __W, + (__mmask16) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_abs_epi32 (__mmask16 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_pabsd512_mask ((__v16si) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_broadcastss_ps (__m128 __A) +{ + return (__m512) __builtin_ia32_broadcastss512 ((__v4sf) __A, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_broadcastss_ps (__m512 __O, __mmask16 __M, __m128 __A) +{ + return (__m512) __builtin_ia32_broadcastss512 ((__v4sf) __A, + (__v16sf) __O, __M); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_broadcastss_ps (__mmask16 __M, __m128 __A) +{ + return (__m512) __builtin_ia32_broadcastss512 ((__v4sf) __A, + (__v16sf) + _mm512_setzero_ps (), + __M); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_broadcastsd_pd (__m128d __A) +{ + return (__m512d) __builtin_ia32_broadcastsd512 ((__v2df) __A, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_broadcastsd_pd (__m512d __O, __mmask8 __M, __m128d __A) +{ + return (__m512d) __builtin_ia32_broadcastsd512 
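For the broadcast intrinsics being redefined here, the mask argument selects which destination lanes receive the broadcast scalar; the remainder merge from `__O` or, in the `maskz` forms, become zero. For example (assumes `-mavx512f`; name illustrative):

#include <immintrin.h>

/* Put the low float of a into the odd lanes of src, keep the even lanes. */
__m512 splat_odd_lanes(__m512 src, __m128 a) {
  return _mm512_mask_broadcastss_ps(src, (__mmask16)0xAAAA, a);
}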
((__v2df) __A, + (__v8df) __O, __M); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_broadcastsd_pd (__mmask8 __M, __m128d __A) +{ + return (__m512d) __builtin_ia32_broadcastsd512 ((__v2df) __A, + (__v8df) + _mm512_setzero_pd (), + __M); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_broadcastd_epi32 (__m128i __A) +{ + return (__m512i) __builtin_ia32_pbroadcastd512 ((__v4si) __A, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_broadcastd_epi32 (__m512i __O, __mmask16 __M, __m128i __A) +{ + return (__m512i) __builtin_ia32_pbroadcastd512 ((__v4si) __A, + (__v16si) __O, __M); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_broadcastd_epi32 (__mmask16 __M, __m128i __A) +{ + return (__m512i) __builtin_ia32_pbroadcastd512 ((__v4si) __A, + (__v16si) + _mm512_setzero_si512 (), + __M); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_set1_epi32 (int __A) +{ + return (__m512i)(__v16si) + { __A, __A, __A, __A, __A, __A, __A, __A, + __A, __A, __A, __A, __A, __A, __A, __A }; +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_set1_epi32 (__m512i __O, __mmask16 __M, int __A) +{ + return (__m512i) __builtin_ia32_pbroadcastd512_gpr_mask (__A, (__v16si) __O, + __M); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_set1_epi32 (__mmask16 __M, int __A) +{ + return (__m512i) + __builtin_ia32_pbroadcastd512_gpr_mask (__A, + (__v16si) _mm512_setzero_si512 (), + __M); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_broadcastq_epi64 (__m128i __A) +{ + return (__m512i) __builtin_ia32_pbroadcastq512 ((__v2di) __A, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_broadcastq_epi64 (__m512i __O, __mmask8 __M, __m128i __A) +{ + return (__m512i) __builtin_ia32_pbroadcastq512 ((__v2di) __A, + (__v8di) __O, __M); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A) +{ + return (__m512i) __builtin_ia32_pbroadcastq512 ((__v2di) __A, + (__v8di) + _mm512_setzero_si512 (), + __M); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_set1_epi64 (long long __A) +{ + return (__m512i)(__v8di) { __A, __A, __A, __A, __A, __A, __A, __A }; +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_set1_epi64 (__m512i __O, __mmask8 __M, long long __A) +{ + return (__m512i) __builtin_ia32_pbroadcastq512_gpr_mask (__A, (__v8di) __O, + __M); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_set1_epi64 (__mmask8 __M, long long __A) +{ + return (__m512i) + __builtin_ia32_pbroadcastq512_gpr_mask (__A, + (__v8di) _mm512_setzero_si512 (), + __M); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_broadcast_f32x4 (__m128 __A) +{ + return (__m512) __builtin_ia32_broadcastf32x4_512 
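Note that `_mm512_set1_epi32` and `_mm512_set1_epi64` now expand to vector literals instead of the `pbroadcast*_gpr_mask` builtins, which (presumably the motivation for the upstream change) lets constant arguments fold to constant vectors at compile time; the masked variants still go through the builtin. E.g.:

#include <immintrin.h>

/* With the literal form this is a compile-time constant vector. */
__m512i all_ones_epi32(void) {
  return _mm512_set1_epi32(1);
}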
((__v4sf) __A, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_broadcast_f32x4 (__m512 __O, __mmask16 __M, __m128 __A) +{ + return (__m512) __builtin_ia32_broadcastf32x4_512 ((__v4sf) __A, + (__v16sf) __O, + __M); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_broadcast_f32x4 (__mmask16 __M, __m128 __A) +{ + return (__m512) __builtin_ia32_broadcastf32x4_512 ((__v4sf) __A, + (__v16sf) + _mm512_setzero_ps (), + __M); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_broadcast_i32x4 (__m128i __A) +{ + return (__m512i) __builtin_ia32_broadcasti32x4_512 ((__v4si) __A, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_broadcast_i32x4 (__m512i __O, __mmask16 __M, __m128i __A) +{ + return (__m512i) __builtin_ia32_broadcasti32x4_512 ((__v4si) __A, + (__v16si) __O, + __M); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_broadcast_i32x4 (__mmask16 __M, __m128i __A) +{ + return (__m512i) __builtin_ia32_broadcasti32x4_512 ((__v4si) __A, + (__v16si) + _mm512_setzero_si512 (), + __M); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_broadcast_f64x4 (__m256d __A) +{ + return (__m512d) __builtin_ia32_broadcastf64x4_512 ((__v4df) __A, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_broadcast_f64x4 (__m512d __O, __mmask8 __M, __m256d __A) +{ + return (__m512d) __builtin_ia32_broadcastf64x4_512 ((__v4df) __A, + (__v8df) __O, + __M); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_broadcast_f64x4 (__mmask8 __M, __m256d __A) +{ + return (__m512d) __builtin_ia32_broadcastf64x4_512 ((__v4df) __A, + (__v8df) + _mm512_setzero_pd (), + __M); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_broadcast_i64x4 (__m256i __A) +{ + return (__m512i) __builtin_ia32_broadcasti64x4_512 ((__v4di) __A, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_broadcast_i64x4 (__m512i __O, __mmask8 __M, __m256i __A) +{ + return (__m512i) __builtin_ia32_broadcasti64x4_512 ((__v4di) __A, + (__v8di) __O, + __M); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_broadcast_i64x4 (__mmask8 __M, __m256i __A) +{ + return (__m512i) __builtin_ia32_broadcasti64x4_512 ((__v4di) __A, + (__v8di) + _mm512_setzero_si512 (), + __M); +} +typedef enum +{ + _MM_PERM_AAAA = 0x00, _MM_PERM_AAAB = 0x01, _MM_PERM_AAAC = 0x02, + _MM_PERM_AAAD = 0x03, _MM_PERM_AABA = 0x04, _MM_PERM_AABB = 0x05, + _MM_PERM_AABC = 0x06, _MM_PERM_AABD = 0x07, _MM_PERM_AACA = 0x08, + _MM_PERM_AACB = 0x09, _MM_PERM_AACC = 0x0A, _MM_PERM_AACD = 0x0B, + _MM_PERM_AADA = 0x0C, _MM_PERM_AADB = 0x0D, _MM_PERM_AADC = 0x0E, + _MM_PERM_AADD = 0x0F, _MM_PERM_ABAA = 0x10, _MM_PERM_ABAB = 0x11, + _MM_PERM_ABAC = 0x12, _MM_PERM_ABAD = 0x13, _MM_PERM_ABBA = 0x14, + _MM_PERM_ABBB = 0x15, _MM_PERM_ABBC = 0x16, _MM_PERM_ABBD = 
0x17, + _MM_PERM_ABCA = 0x18, _MM_PERM_ABCB = 0x19, _MM_PERM_ABCC = 0x1A, + _MM_PERM_ABCD = 0x1B, _MM_PERM_ABDA = 0x1C, _MM_PERM_ABDB = 0x1D, + _MM_PERM_ABDC = 0x1E, _MM_PERM_ABDD = 0x1F, _MM_PERM_ACAA = 0x20, + _MM_PERM_ACAB = 0x21, _MM_PERM_ACAC = 0x22, _MM_PERM_ACAD = 0x23, + _MM_PERM_ACBA = 0x24, _MM_PERM_ACBB = 0x25, _MM_PERM_ACBC = 0x26, + _MM_PERM_ACBD = 0x27, _MM_PERM_ACCA = 0x28, _MM_PERM_ACCB = 0x29, + _MM_PERM_ACCC = 0x2A, _MM_PERM_ACCD = 0x2B, _MM_PERM_ACDA = 0x2C, + _MM_PERM_ACDB = 0x2D, _MM_PERM_ACDC = 0x2E, _MM_PERM_ACDD = 0x2F, + _MM_PERM_ADAA = 0x30, _MM_PERM_ADAB = 0x31, _MM_PERM_ADAC = 0x32, + _MM_PERM_ADAD = 0x33, _MM_PERM_ADBA = 0x34, _MM_PERM_ADBB = 0x35, + _MM_PERM_ADBC = 0x36, _MM_PERM_ADBD = 0x37, _MM_PERM_ADCA = 0x38, + _MM_PERM_ADCB = 0x39, _MM_PERM_ADCC = 0x3A, _MM_PERM_ADCD = 0x3B, + _MM_PERM_ADDA = 0x3C, _MM_PERM_ADDB = 0x3D, _MM_PERM_ADDC = 0x3E, + _MM_PERM_ADDD = 0x3F, _MM_PERM_BAAA = 0x40, _MM_PERM_BAAB = 0x41, + _MM_PERM_BAAC = 0x42, _MM_PERM_BAAD = 0x43, _MM_PERM_BABA = 0x44, + _MM_PERM_BABB = 0x45, _MM_PERM_BABC = 0x46, _MM_PERM_BABD = 0x47, + _MM_PERM_BACA = 0x48, _MM_PERM_BACB = 0x49, _MM_PERM_BACC = 0x4A, + _MM_PERM_BACD = 0x4B, _MM_PERM_BADA = 0x4C, _MM_PERM_BADB = 0x4D, + _MM_PERM_BADC = 0x4E, _MM_PERM_BADD = 0x4F, _MM_PERM_BBAA = 0x50, + _MM_PERM_BBAB = 0x51, _MM_PERM_BBAC = 0x52, _MM_PERM_BBAD = 0x53, + _MM_PERM_BBBA = 0x54, _MM_PERM_BBBB = 0x55, _MM_PERM_BBBC = 0x56, + _MM_PERM_BBBD = 0x57, _MM_PERM_BBCA = 0x58, _MM_PERM_BBCB = 0x59, + _MM_PERM_BBCC = 0x5A, _MM_PERM_BBCD = 0x5B, _MM_PERM_BBDA = 0x5C, + _MM_PERM_BBDB = 0x5D, _MM_PERM_BBDC = 0x5E, _MM_PERM_BBDD = 0x5F, + _MM_PERM_BCAA = 0x60, _MM_PERM_BCAB = 0x61, _MM_PERM_BCAC = 0x62, + _MM_PERM_BCAD = 0x63, _MM_PERM_BCBA = 0x64, _MM_PERM_BCBB = 0x65, + _MM_PERM_BCBC = 0x66, _MM_PERM_BCBD = 0x67, _MM_PERM_BCCA = 0x68, + _MM_PERM_BCCB = 0x69, _MM_PERM_BCCC = 0x6A, _MM_PERM_BCCD = 0x6B, + _MM_PERM_BCDA = 0x6C, _MM_PERM_BCDB = 0x6D, _MM_PERM_BCDC = 0x6E, + _MM_PERM_BCDD = 0x6F, _MM_PERM_BDAA = 0x70, _MM_PERM_BDAB = 0x71, + _MM_PERM_BDAC = 0x72, _MM_PERM_BDAD = 0x73, _MM_PERM_BDBA = 0x74, + _MM_PERM_BDBB = 0x75, _MM_PERM_BDBC = 0x76, _MM_PERM_BDBD = 0x77, + _MM_PERM_BDCA = 0x78, _MM_PERM_BDCB = 0x79, _MM_PERM_BDCC = 0x7A, + _MM_PERM_BDCD = 0x7B, _MM_PERM_BDDA = 0x7C, _MM_PERM_BDDB = 0x7D, + _MM_PERM_BDDC = 0x7E, _MM_PERM_BDDD = 0x7F, _MM_PERM_CAAA = 0x80, + _MM_PERM_CAAB = 0x81, _MM_PERM_CAAC = 0x82, _MM_PERM_CAAD = 0x83, + _MM_PERM_CABA = 0x84, _MM_PERM_CABB = 0x85, _MM_PERM_CABC = 0x86, + _MM_PERM_CABD = 0x87, _MM_PERM_CACA = 0x88, _MM_PERM_CACB = 0x89, + _MM_PERM_CACC = 0x8A, _MM_PERM_CACD = 0x8B, _MM_PERM_CADA = 0x8C, + _MM_PERM_CADB = 0x8D, _MM_PERM_CADC = 0x8E, _MM_PERM_CADD = 0x8F, + _MM_PERM_CBAA = 0x90, _MM_PERM_CBAB = 0x91, _MM_PERM_CBAC = 0x92, + _MM_PERM_CBAD = 0x93, _MM_PERM_CBBA = 0x94, _MM_PERM_CBBB = 0x95, + _MM_PERM_CBBC = 0x96, _MM_PERM_CBBD = 0x97, _MM_PERM_CBCA = 0x98, + _MM_PERM_CBCB = 0x99, _MM_PERM_CBCC = 0x9A, _MM_PERM_CBCD = 0x9B, + _MM_PERM_CBDA = 0x9C, _MM_PERM_CBDB = 0x9D, _MM_PERM_CBDC = 0x9E, + _MM_PERM_CBDD = 0x9F, _MM_PERM_CCAA = 0xA0, _MM_PERM_CCAB = 0xA1, + _MM_PERM_CCAC = 0xA2, _MM_PERM_CCAD = 0xA3, _MM_PERM_CCBA = 0xA4, + _MM_PERM_CCBB = 0xA5, _MM_PERM_CCBC = 0xA6, _MM_PERM_CCBD = 0xA7, + _MM_PERM_CCCA = 0xA8, _MM_PERM_CCCB = 0xA9, _MM_PERM_CCCC = 0xAA, + _MM_PERM_CCCD = 0xAB, _MM_PERM_CCDA = 0xAC, _MM_PERM_CCDB = 0xAD, + _MM_PERM_CCDC = 0xAE, _MM_PERM_CCDD = 0xAF, _MM_PERM_CDAA = 0xB0, + _MM_PERM_CDAB = 0xB1, _MM_PERM_CDAC = 0xB2, _MM_PERM_CDAD = 0xB3, + 
_MM_PERM_CDBA = 0xB4, _MM_PERM_CDBB = 0xB5, _MM_PERM_CDBC = 0xB6, + _MM_PERM_CDBD = 0xB7, _MM_PERM_CDCA = 0xB8, _MM_PERM_CDCB = 0xB9, + _MM_PERM_CDCC = 0xBA, _MM_PERM_CDCD = 0xBB, _MM_PERM_CDDA = 0xBC, + _MM_PERM_CDDB = 0xBD, _MM_PERM_CDDC = 0xBE, _MM_PERM_CDDD = 0xBF, + _MM_PERM_DAAA = 0xC0, _MM_PERM_DAAB = 0xC1, _MM_PERM_DAAC = 0xC2, + _MM_PERM_DAAD = 0xC3, _MM_PERM_DABA = 0xC4, _MM_PERM_DABB = 0xC5, + _MM_PERM_DABC = 0xC6, _MM_PERM_DABD = 0xC7, _MM_PERM_DACA = 0xC8, + _MM_PERM_DACB = 0xC9, _MM_PERM_DACC = 0xCA, _MM_PERM_DACD = 0xCB, + _MM_PERM_DADA = 0xCC, _MM_PERM_DADB = 0xCD, _MM_PERM_DADC = 0xCE, + _MM_PERM_DADD = 0xCF, _MM_PERM_DBAA = 0xD0, _MM_PERM_DBAB = 0xD1, + _MM_PERM_DBAC = 0xD2, _MM_PERM_DBAD = 0xD3, _MM_PERM_DBBA = 0xD4, + _MM_PERM_DBBB = 0xD5, _MM_PERM_DBBC = 0xD6, _MM_PERM_DBBD = 0xD7, + _MM_PERM_DBCA = 0xD8, _MM_PERM_DBCB = 0xD9, _MM_PERM_DBCC = 0xDA, + _MM_PERM_DBCD = 0xDB, _MM_PERM_DBDA = 0xDC, _MM_PERM_DBDB = 0xDD, + _MM_PERM_DBDC = 0xDE, _MM_PERM_DBDD = 0xDF, _MM_PERM_DCAA = 0xE0, + _MM_PERM_DCAB = 0xE1, _MM_PERM_DCAC = 0xE2, _MM_PERM_DCAD = 0xE3, + _MM_PERM_DCBA = 0xE4, _MM_PERM_DCBB = 0xE5, _MM_PERM_DCBC = 0xE6, + _MM_PERM_DCBD = 0xE7, _MM_PERM_DCCA = 0xE8, _MM_PERM_DCCB = 0xE9, + _MM_PERM_DCCC = 0xEA, _MM_PERM_DCCD = 0xEB, _MM_PERM_DCDA = 0xEC, + _MM_PERM_DCDB = 0xED, _MM_PERM_DCDC = 0xEE, _MM_PERM_DCDD = 0xEF, + _MM_PERM_DDAA = 0xF0, _MM_PERM_DDAB = 0xF1, _MM_PERM_DDAC = 0xF2, + _MM_PERM_DDAD = 0xF3, _MM_PERM_DDBA = 0xF4, _MM_PERM_DDBB = 0xF5, + _MM_PERM_DDBC = 0xF6, _MM_PERM_DDBD = 0xF7, _MM_PERM_DDCA = 0xF8, + _MM_PERM_DDCB = 0xF9, _MM_PERM_DDCC = 0xFA, _MM_PERM_DDCD = 0xFB, + _MM_PERM_DDDA = 0xFC, _MM_PERM_DDDB = 0xFD, _MM_PERM_DDDC = 0xFE, _MM_PERM_DDDD = 0xFF } _MM_PERM_ENUM; - #ifdef __OPTIMIZE__ -__funline __m512i _mm512_shuffle_epi32(__m512i __A, _MM_PERM_ENUM __mask) { - return (__m512i)__builtin_ia32_pshufd512_mask( - (__v16si)__A, __mask, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_shuffle_epi32 (__m512i __A, _MM_PERM_ENUM __mask) +{ + return (__m512i) __builtin_ia32_pshufd512_mask ((__v16si) __A, + __mask, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); } - -__funline __m512i _mm512_mask_shuffle_epi32(__m512i __W, __mmask16 __U, - __m512i __A, _MM_PERM_ENUM __mask) { - return (__m512i)__builtin_ia32_pshufd512_mask((__v16si)__A, __mask, - (__v16si)__W, (__mmask16)__U); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_shuffle_epi32 (__m512i __W, __mmask16 __U, __m512i __A, + _MM_PERM_ENUM __mask) +{ + return (__m512i) __builtin_ia32_pshufd512_mask ((__v16si) __A, + __mask, + (__v16si) __W, + (__mmask16) __U); } - -__funline __m512i _mm512_maskz_shuffle_epi32(__mmask16 __U, __m512i __A, - _MM_PERM_ENUM __mask) { - return (__m512i)__builtin_ia32_pshufd512_mask( - (__v16si)__A, __mask, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_shuffle_epi32 (__mmask16 __U, __m512i __A, _MM_PERM_ENUM __mask) +{ + return (__m512i) __builtin_ia32_pshufd512_mask ((__v16si) __A, + __mask, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); } - -__funline __m512i _mm512_shuffle_i64x2(__m512i __A, __m512i __B, - const int __imm) { - return (__m512i)__builtin_ia32_shuf_i64x2_mask( - (__v8di)__A, (__v8di)__B, __imm, (__v8di)_mm512_undefined_epi32(), - (__mmask8)-1); +extern __inline 
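`_MM_PERM_ENUM` packs four 2-bit source selectors, one per destination dword within each 128-bit lane, with `A` naming element 0 through `D` naming element 3 and the letters written from the highest destination element down, so `_MM_PERM_DCBA` (0xE4) is the identity permutation. A sketch (assumes `-mavx512f`):

#include <immintrin.h>

/* Reverse the four dwords inside each 128-bit lane. */
__m512i reverse_dwords(__m512i x) {
  return _mm512_shuffle_epi32(x, _MM_PERM_ABCD);  /* 0x1B = 00'01'10'11 */
}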
__m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_shuffle_i64x2 (__m512i __A, __m512i __B, const int __imm) +{ + return (__m512i) __builtin_ia32_shuf_i64x2_mask ((__v8di) __A, + (__v8di) __B, __imm, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); } - -__funline __m512i _mm512_mask_shuffle_i64x2(__m512i __W, __mmask8 __U, - __m512i __A, __m512i __B, - const int __imm) { - return (__m512i)__builtin_ia32_shuf_i64x2_mask( - (__v8di)__A, (__v8di)__B, __imm, (__v8di)__W, (__mmask8)__U); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_shuffle_i64x2 (__m512i __W, __mmask8 __U, __m512i __A, + __m512i __B, const int __imm) +{ + return (__m512i) __builtin_ia32_shuf_i64x2_mask ((__v8di) __A, + (__v8di) __B, __imm, + (__v8di) __W, + (__mmask8) __U); } - -__funline __m512i _mm512_maskz_shuffle_i64x2(__mmask8 __U, __m512i __A, - __m512i __B, const int __imm) { - return (__m512i)__builtin_ia32_shuf_i64x2_mask( - (__v8di)__A, (__v8di)__B, __imm, (__v8di)_mm512_setzero_si512(), - (__mmask8)__U); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_shuffle_i64x2 (__mmask8 __U, __m512i __A, __m512i __B, + const int __imm) +{ + return (__m512i) __builtin_ia32_shuf_i64x2_mask ((__v8di) __A, + (__v8di) __B, __imm, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); } - -__funline __m512i _mm512_shuffle_i32x4(__m512i __A, __m512i __B, - const int __imm) { - return (__m512i)__builtin_ia32_shuf_i32x4_mask( - (__v16si)__A, (__v16si)__B, __imm, (__v16si)_mm512_undefined_epi32(), - (__mmask16)-1); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_shuffle_i32x4 (__m512i __A, __m512i __B, const int __imm) +{ + return (__m512i) __builtin_ia32_shuf_i32x4_mask ((__v16si) __A, + (__v16si) __B, + __imm, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); } - -__funline __m512i _mm512_mask_shuffle_i32x4(__m512i __W, __mmask16 __U, - __m512i __A, __m512i __B, - const int __imm) { - return (__m512i)__builtin_ia32_shuf_i32x4_mask( - (__v16si)__A, (__v16si)__B, __imm, (__v16si)__W, (__mmask16)__U); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_shuffle_i32x4 (__m512i __W, __mmask16 __U, __m512i __A, + __m512i __B, const int __imm) +{ + return (__m512i) __builtin_ia32_shuf_i32x4_mask ((__v16si) __A, + (__v16si) __B, + __imm, + (__v16si) __W, + (__mmask16) __U); } - -__funline __m512i _mm512_maskz_shuffle_i32x4(__mmask16 __U, __m512i __A, - __m512i __B, const int __imm) { - return (__m512i)__builtin_ia32_shuf_i32x4_mask( - (__v16si)__A, (__v16si)__B, __imm, (__v16si)_mm512_setzero_si512(), - (__mmask16)__U); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_shuffle_i32x4 (__mmask16 __U, __m512i __A, __m512i __B, + const int __imm) +{ + return (__m512i) __builtin_ia32_shuf_i32x4_mask ((__v16si) __A, + (__v16si) __B, + __imm, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); } - -__funline __m512d _mm512_shuffle_f64x2(__m512d __A, __m512d __B, - const int __imm) { - return (__m512d)__builtin_ia32_shuf_f64x2_mask( - (__v8df)__A, (__v8df)__B, __imm, (__v8df)_mm512_undefined_pd(), - (__mmask8)-1); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_shuffle_f64x2 (__m512d __A, __m512d __B, const int __imm) +{ + return (__m512d) 
__builtin_ia32_shuf_f64x2_mask ((__v8df) __A, + (__v8df) __B, __imm, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1); } - -__funline __m512d _mm512_mask_shuffle_f64x2(__m512d __W, __mmask8 __U, - __m512d __A, __m512d __B, - const int __imm) { - return (__m512d)__builtin_ia32_shuf_f64x2_mask( - (__v8df)__A, (__v8df)__B, __imm, (__v8df)__W, (__mmask8)__U); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_shuffle_f64x2 (__m512d __W, __mmask8 __U, __m512d __A, + __m512d __B, const int __imm) +{ + return (__m512d) __builtin_ia32_shuf_f64x2_mask ((__v8df) __A, + (__v8df) __B, __imm, + (__v8df) __W, + (__mmask8) __U); } - -__funline __m512d _mm512_maskz_shuffle_f64x2(__mmask8 __U, __m512d __A, - __m512d __B, const int __imm) { - return (__m512d)__builtin_ia32_shuf_f64x2_mask( - (__v8df)__A, (__v8df)__B, __imm, (__v8df)_mm512_setzero_pd(), - (__mmask8)__U); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_shuffle_f64x2 (__mmask8 __U, __m512d __A, __m512d __B, + const int __imm) +{ + return (__m512d) __builtin_ia32_shuf_f64x2_mask ((__v8df) __A, + (__v8df) __B, __imm, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); } - -__funline __m512 _mm512_shuffle_f32x4(__m512 __A, __m512 __B, const int __imm) { - return (__m512)__builtin_ia32_shuf_f32x4_mask( - (__v16sf)__A, (__v16sf)__B, __imm, (__v16sf)_mm512_undefined_ps(), - (__mmask16)-1); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_shuffle_f32x4 (__m512 __A, __m512 __B, const int __imm) +{ + return (__m512) __builtin_ia32_shuf_f32x4_mask ((__v16sf) __A, + (__v16sf) __B, __imm, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1); } - -__funline __m512 _mm512_mask_shuffle_f32x4(__m512 __W, __mmask16 __U, __m512 __A, - __m512 __B, const int __imm) { - return (__m512)__builtin_ia32_shuf_f32x4_mask( - (__v16sf)__A, (__v16sf)__B, __imm, (__v16sf)__W, (__mmask16)__U); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_shuffle_f32x4 (__m512 __W, __mmask16 __U, __m512 __A, + __m512 __B, const int __imm) +{ + return (__m512) __builtin_ia32_shuf_f32x4_mask ((__v16sf) __A, + (__v16sf) __B, __imm, + (__v16sf) __W, + (__mmask16) __U); } - -__funline __m512 _mm512_maskz_shuffle_f32x4(__mmask16 __U, __m512 __A, __m512 __B, - const int __imm) { - return (__m512)__builtin_ia32_shuf_f32x4_mask( - (__v16sf)__A, (__v16sf)__B, __imm, (__v16sf)_mm512_setzero_ps(), - (__mmask16)__U); -} - -#else -#define _mm512_shuffle_epi32(X, C) \ - ((__m512i)__builtin_ia32_pshufd512_mask( \ - (__v16si)(__m512i)(X), (int)(C), \ - (__v16si)(__m512i)_mm512_undefined_epi32(), (__mmask16)-1)) - -#define _mm512_mask_shuffle_epi32(W, U, X, C) \ - ((__m512i)__builtin_ia32_pshufd512_mask( \ - (__v16si)(__m512i)(X), (int)(C), (__v16si)(__m512i)(W), (__mmask16)(U))) - -#define _mm512_maskz_shuffle_epi32(U, X, C) \ - ((__m512i)__builtin_ia32_pshufd512_mask( \ - (__v16si)(__m512i)(X), (int)(C), \ - (__v16si)(__m512i)_mm512_setzero_si512(), (__mmask16)(U))) - -#define _mm512_shuffle_i64x2(X, Y, C) \ - ((__m512i)__builtin_ia32_shuf_i64x2_mask( \ - (__v8di)(__m512i)(X), (__v8di)(__m512i)(Y), (int)(C), \ - (__v8di)(__m512i)_mm512_undefined_epi32(), (__mmask8)-1)) - -#define _mm512_mask_shuffle_i64x2(W, U, X, Y, C) \ - ((__m512i)__builtin_ia32_shuf_i64x2_mask( \ - (__v8di)(__m512i)(X), (__v8di)(__m512i)(Y), (int)(C), \ - (__v8di)(__m512i)(W), (__mmask8)(U))) - -#define 
_mm512_maskz_shuffle_i64x2(U, X, Y, C) \ - ((__m512i)__builtin_ia32_shuf_i64x2_mask( \ - (__v8di)(__m512i)(X), (__v8di)(__m512i)(Y), (int)(C), \ - (__v8di)(__m512i)_mm512_setzero_si512(), (__mmask8)(U))) - -#define _mm512_shuffle_i32x4(X, Y, C) \ - ((__m512i)__builtin_ia32_shuf_i32x4_mask( \ - (__v16si)(__m512i)(X), (__v16si)(__m512i)(Y), (int)(C), \ - (__v16si)(__m512i)_mm512_undefined_epi32(), (__mmask16)-1)) - -#define _mm512_mask_shuffle_i32x4(W, U, X, Y, C) \ - ((__m512i)__builtin_ia32_shuf_i32x4_mask( \ - (__v16si)(__m512i)(X), (__v16si)(__m512i)(Y), (int)(C), \ - (__v16si)(__m512i)(W), (__mmask16)(U))) - -#define _mm512_maskz_shuffle_i32x4(U, X, Y, C) \ - ((__m512i)__builtin_ia32_shuf_i32x4_mask( \ - (__v16si)(__m512i)(X), (__v16si)(__m512i)(Y), (int)(C), \ - (__v16si)(__m512i)_mm512_setzero_si512(), (__mmask16)(U))) - -#define _mm512_shuffle_f64x2(X, Y, C) \ - ((__m512d)__builtin_ia32_shuf_f64x2_mask( \ - (__v8df)(__m512d)(X), (__v8df)(__m512d)(Y), (int)(C), \ - (__v8df)(__m512d)_mm512_undefined_pd(), (__mmask8)-1)) - -#define _mm512_mask_shuffle_f64x2(W, U, X, Y, C) \ - ((__m512d)__builtin_ia32_shuf_f64x2_mask( \ - (__v8df)(__m512d)(X), (__v8df)(__m512d)(Y), (int)(C), \ - (__v8df)(__m512d)(W), (__mmask8)(U))) - -#define _mm512_maskz_shuffle_f64x2(U, X, Y, C) \ - ((__m512d)__builtin_ia32_shuf_f64x2_mask( \ - (__v8df)(__m512d)(X), (__v8df)(__m512d)(Y), (int)(C), \ - (__v8df)(__m512d)_mm512_setzero_pd(), (__mmask8)(U))) - -#define _mm512_shuffle_f32x4(X, Y, C) \ - ((__m512)__builtin_ia32_shuf_f32x4_mask( \ - (__v16sf)(__m512)(X), (__v16sf)(__m512)(Y), (int)(C), \ - (__v16sf)(__m512)_mm512_undefined_ps(), (__mmask16)-1)) - -#define _mm512_mask_shuffle_f32x4(W, U, X, Y, C) \ - ((__m512)__builtin_ia32_shuf_f32x4_mask( \ - (__v16sf)(__m512)(X), (__v16sf)(__m512)(Y), (int)(C), \ - (__v16sf)(__m512)(W), (__mmask16)(U))) - -#define _mm512_maskz_shuffle_f32x4(U, X, Y, C) \ - ((__m512)__builtin_ia32_shuf_f32x4_mask( \ - (__v16sf)(__m512)(X), (__v16sf)(__m512)(Y), (int)(C), \ - (__v16sf)(__m512)_mm512_setzero_ps(), (__mmask16)(U))) -#endif - -__funline __m512i _mm512_rolv_epi32(__m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_prolvd512_mask( - (__v16si)__A, (__v16si)__B, (__v16si)_mm512_undefined_epi32(), - (__mmask16)-1); -} - -__funline __m512i _mm512_mask_rolv_epi32(__m512i __W, __mmask16 __U, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_prolvd512_mask((__v16si)__A, (__v16si)__B, - (__v16si)__W, (__mmask16)__U); -} - -__funline __m512i _mm512_maskz_rolv_epi32(__mmask16 __U, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_prolvd512_mask((__v16si)__A, (__v16si)__B, - (__v16si)_mm512_setzero_si512(), - (__mmask16)__U); -} - -__funline __m512i _mm512_rorv_epi32(__m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_prorvd512_mask( - (__v16si)__A, (__v16si)__B, (__v16si)_mm512_undefined_epi32(), - (__mmask16)-1); -} - -__funline __m512i _mm512_mask_rorv_epi32(__m512i __W, __mmask16 __U, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_prorvd512_mask((__v16si)__A, (__v16si)__B, - (__v16si)__W, (__mmask16)__U); -} - -__funline __m512i _mm512_maskz_rorv_epi32(__mmask16 __U, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_prorvd512_mask((__v16si)__A, (__v16si)__B, - (__v16si)_mm512_setzero_si512(), - (__mmask16)__U); -} - -__funline __m512i _mm512_rolv_epi64(__m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_prolvq512_mask( - (__v8di)__A, (__v8di)__B, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); -} - 
-__funline __m512i _mm512_mask_rolv_epi64(__m512i __W, __mmask8 __U, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_prolvq512_mask((__v8di)__A, (__v8di)__B, - (__v8di)__W, (__mmask8)__U); -} - -__funline __m512i _mm512_maskz_rolv_epi64(__mmask8 __U, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_prolvq512_mask( - (__v8di)__A, (__v8di)__B, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); -} - -__funline __m512i _mm512_rorv_epi64(__m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_prorvq512_mask( - (__v8di)__A, (__v8di)__B, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); -} - -__funline __m512i _mm512_mask_rorv_epi64(__m512i __W, __mmask8 __U, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_prorvq512_mask((__v8di)__A, (__v8di)__B, - (__v8di)__W, (__mmask8)__U); -} - -__funline __m512i _mm512_maskz_rorv_epi64(__mmask8 __U, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_prorvq512_mask( - (__v8di)__A, (__v8di)__B, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); -} - -#ifdef __OPTIMIZE__ -__funline __m256i _mm512_cvtt_roundpd_epi32(__m512d __A, const int __R) { - return (__m256i)__builtin_ia32_cvttpd2dq512_mask( - (__v8df)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1, __R); -} - -__funline __m256i _mm512_mask_cvtt_roundpd_epi32(__m256i __W, __mmask8 __U, - __m512d __A, const int __R) { - return (__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)__A, (__v8si)__W, - (__mmask8)__U, __R); -} - -__funline __m256i _mm512_maskz_cvtt_roundpd_epi32(__mmask8 __U, __m512d __A, - const int __R) { - return (__m256i)__builtin_ia32_cvttpd2dq512_mask( - (__v8df)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U, __R); -} - -__funline __m256i _mm512_cvtt_roundpd_epu32(__m512d __A, const int __R) { - return (__m256i)__builtin_ia32_cvttpd2udq512_mask( - (__v8df)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1, __R); -} - -__funline __m256i _mm512_mask_cvtt_roundpd_epu32(__m256i __W, __mmask8 __U, - __m512d __A, const int __R) { - return (__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)__A, (__v8si)__W, - (__mmask8)__U, __R); -} - -__funline __m256i _mm512_maskz_cvtt_roundpd_epu32(__mmask8 __U, __m512d __A, - const int __R) { - return (__m256i)__builtin_ia32_cvttpd2udq512_mask( - (__v8df)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U, __R); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_shuffle_f32x4 (__mmask16 __U, __m512 __A, __m512 __B, + const int __imm) +{ + return (__m512) __builtin_ia32_shuf_f32x4_mask ((__v16sf) __A, + (__v16sf) __B, __imm, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); } #else -#define _mm512_cvtt_roundpd_epi32(A, B) \ - ((__m256i)__builtin_ia32_cvttpd2dq512_mask( \ - A, (__v8si)_mm256_undefined_si256(), -1, B)) - -#define _mm512_mask_cvtt_roundpd_epi32(W, U, A, B) \ - ((__m256i)__builtin_ia32_cvttpd2dq512_mask(A, (__v8si)(W), U, B)) - -#define _mm512_maskz_cvtt_roundpd_epi32(U, A, B) \ - ((__m256i)__builtin_ia32_cvttpd2dq512_mask( \ - A, (__v8si)_mm256_setzero_si256(), U, B)) - -#define _mm512_cvtt_roundpd_epu32(A, B) \ - ((__m256i)__builtin_ia32_cvttpd2udq512_mask( \ - A, (__v8si)_mm256_undefined_si256(), -1, B)) - -#define _mm512_mask_cvtt_roundpd_epu32(W, U, A, B) \ - ((__m256i)__builtin_ia32_cvttpd2udq512_mask(A, (__v8si)(W), U, B)) - -#define _mm512_maskz_cvtt_roundpd_epu32(U, A, B) \ - ((__m256i)__builtin_ia32_cvttpd2udq512_mask( \ - A, (__v8si)_mm256_setzero_si256(), U, B)) +#define _mm512_shuffle_epi32(X, C) ((__m512i) 
__builtin_ia32_pshufd512_mask ((__v16si)(__m512i)(X), (int)(C), (__v16si)(__m512i)_mm512_undefined_epi32 (), (__mmask16)-1)) +#define _mm512_mask_shuffle_epi32(W, U, X, C) ((__m512i) __builtin_ia32_pshufd512_mask ((__v16si)(__m512i)(X), (int)(C), (__v16si)(__m512i)(W), (__mmask16)(U))) +#define _mm512_maskz_shuffle_epi32(U, X, C) ((__m512i) __builtin_ia32_pshufd512_mask ((__v16si)(__m512i)(X), (int)(C), (__v16si)(__m512i)_mm512_setzero_si512 (), (__mmask16)(U))) +#define _mm512_shuffle_i64x2(X, Y, C) ((__m512i) __builtin_ia32_shuf_i64x2_mask ((__v8di)(__m512i)(X), (__v8di)(__m512i)(Y), (int)(C), (__v8di)(__m512i)_mm512_undefined_epi32 (), (__mmask8)-1)) +#define _mm512_mask_shuffle_i64x2(W, U, X, Y, C) ((__m512i) __builtin_ia32_shuf_i64x2_mask ((__v8di)(__m512i)(X), (__v8di)(__m512i)(Y), (int)(C), (__v8di)(__m512i)(W), (__mmask8)(U))) +#define _mm512_maskz_shuffle_i64x2(U, X, Y, C) ((__m512i) __builtin_ia32_shuf_i64x2_mask ((__v8di)(__m512i)(X), (__v8di)(__m512i)(Y), (int)(C), (__v8di)(__m512i)_mm512_setzero_si512 (), (__mmask8)(U))) +#define _mm512_shuffle_i32x4(X, Y, C) ((__m512i) __builtin_ia32_shuf_i32x4_mask ((__v16si)(__m512i)(X), (__v16si)(__m512i)(Y), (int)(C), (__v16si)(__m512i)_mm512_undefined_epi32 (), (__mmask16)-1)) +#define _mm512_mask_shuffle_i32x4(W, U, X, Y, C) ((__m512i) __builtin_ia32_shuf_i32x4_mask ((__v16si)(__m512i)(X), (__v16si)(__m512i)(Y), (int)(C), (__v16si)(__m512i)(W), (__mmask16)(U))) +#define _mm512_maskz_shuffle_i32x4(U, X, Y, C) ((__m512i) __builtin_ia32_shuf_i32x4_mask ((__v16si)(__m512i)(X), (__v16si)(__m512i)(Y), (int)(C), (__v16si)(__m512i)_mm512_setzero_si512 (), (__mmask16)(U))) +#define _mm512_shuffle_f64x2(X, Y, C) ((__m512d) __builtin_ia32_shuf_f64x2_mask ((__v8df)(__m512d)(X), (__v8df)(__m512d)(Y), (int)(C), (__v8df)(__m512d)_mm512_undefined_pd(), (__mmask8)-1)) +#define _mm512_mask_shuffle_f64x2(W, U, X, Y, C) ((__m512d) __builtin_ia32_shuf_f64x2_mask ((__v8df)(__m512d)(X), (__v8df)(__m512d)(Y), (int)(C), (__v8df)(__m512d)(W), (__mmask8)(U))) +#define _mm512_maskz_shuffle_f64x2(U, X, Y, C) ((__m512d) __builtin_ia32_shuf_f64x2_mask ((__v8df)(__m512d)(X), (__v8df)(__m512d)(Y), (int)(C), (__v8df)(__m512d)_mm512_setzero_pd(), (__mmask8)(U))) +#define _mm512_shuffle_f32x4(X, Y, C) ((__m512) __builtin_ia32_shuf_f32x4_mask ((__v16sf)(__m512)(X), (__v16sf)(__m512)(Y), (int)(C), (__v16sf)(__m512)_mm512_undefined_ps(), (__mmask16)-1)) +#define _mm512_mask_shuffle_f32x4(W, U, X, Y, C) ((__m512) __builtin_ia32_shuf_f32x4_mask ((__v16sf)(__m512)(X), (__v16sf)(__m512)(Y), (int)(C), (__v16sf)(__m512)(W), (__mmask16)(U))) +#define _mm512_maskz_shuffle_f32x4(U, X, Y, C) ((__m512) __builtin_ia32_shuf_f32x4_mask ((__v16sf)(__m512)(X), (__v16sf)(__m512)(Y), (int)(C), (__v16sf)(__m512)_mm512_setzero_ps(), (__mmask16)(U))) #endif - +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_rolv_epi32 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_prolvd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_rolv_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_prolvd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __W, + (__mmask16) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_rolv_epi32 (__mmask16 __U, __m512i __A, __m512i __B) +{ 
+ return (__m512i) __builtin_ia32_prolvd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_rorv_epi32 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_prorvd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_rorv_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_prorvd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __W, + (__mmask16) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_rorv_epi32 (__mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_prorvd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_rolv_epi64 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_prolvq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_rolv_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_prolvq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) __W, + (__mmask8) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_rolv_epi64 (__mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_prolvq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_rorv_epi64 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_prorvq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_rorv_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_prorvq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) __W, + (__mmask8) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_rorv_epi64 (__mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_prorvq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} #ifdef __OPTIMIZE__ -__funline __m256i _mm512_cvt_roundpd_epi32(__m512d __A, const int __R) { - return (__m256i)__builtin_ia32_cvtpd2dq512_mask( - (__v8df)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1, __R); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtt_roundpd_epi32 (__m512d __A, const int __R) +{ + return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A, + (__v8si) + _mm256_undefined_si256 (), + (__mmask8) -1, __R); } - -__funline __m256i _mm512_mask_cvt_roundpd_epi32(__m256i __W, __mmask8 __U, - __m512d __A, const int __R) { - return (__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)__A, (__v8si)__W, - (__mmask8)__U, __R); +extern __inline __m256i +__attribute__ 
((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtt_roundpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A, + const int __R) +{ + return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A, + (__v8si) __W, + (__mmask8) __U, __R); } - -__funline __m256i _mm512_maskz_cvt_roundpd_epi32(__mmask8 __U, __m512d __A, - const int __R) { - return (__m256i)__builtin_ia32_cvtpd2dq512_mask( - (__v8df)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U, __R); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtt_roundpd_epi32 (__mmask8 __U, __m512d __A, const int __R) +{ + return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U, __R); } - -__funline __m256i _mm512_cvt_roundpd_epu32(__m512d __A, const int __R) { - return (__m256i)__builtin_ia32_cvtpd2udq512_mask( - (__v8df)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1, __R); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtt_roundpd_epu32 (__m512d __A, const int __R) +{ + return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A, + (__v8si) + _mm256_undefined_si256 (), + (__mmask8) -1, __R); } - -__funline __m256i _mm512_mask_cvt_roundpd_epu32(__m256i __W, __mmask8 __U, - __m512d __A, const int __R) { - return (__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)__A, (__v8si)__W, - (__mmask8)__U, __R); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtt_roundpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A, + const int __R) +{ + return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A, + (__v8si) __W, + (__mmask8) __U, __R); } - -__funline __m256i _mm512_maskz_cvt_roundpd_epu32(__mmask8 __U, __m512d __A, - const int __R) { - return (__m256i)__builtin_ia32_cvtpd2udq512_mask( - (__v8df)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U, __R); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtt_roundpd_epu32 (__mmask8 __U, __m512d __A, const int __R) +{ + return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U, __R); } #else -#define _mm512_cvt_roundpd_epi32(A, B) \ - ((__m256i)__builtin_ia32_cvtpd2dq512_mask( \ - A, (__v8si)_mm256_undefined_si256(), -1, B)) - -#define _mm512_mask_cvt_roundpd_epi32(W, U, A, B) \ - ((__m256i)__builtin_ia32_cvtpd2dq512_mask(A, (__v8si)(W), U, B)) - -#define _mm512_maskz_cvt_roundpd_epi32(U, A, B) \ - ((__m256i)__builtin_ia32_cvtpd2dq512_mask(A, (__v8si)_mm256_setzero_si256(), \ - U, B)) - -#define _mm512_cvt_roundpd_epu32(A, B) \ - ((__m256i)__builtin_ia32_cvtpd2udq512_mask( \ - A, (__v8si)_mm256_undefined_si256(), -1, B)) - -#define _mm512_mask_cvt_roundpd_epu32(W, U, A, B) \ - ((__m256i)__builtin_ia32_cvtpd2udq512_mask(A, (__v8si)(W), U, B)) - -#define _mm512_maskz_cvt_roundpd_epu32(U, A, B) \ - ((__m256i)__builtin_ia32_cvtpd2udq512_mask( \ - A, (__v8si)_mm256_setzero_si256(), U, B)) +#define _mm512_cvtt_roundpd_epi32(A, B) ((__m256i)__builtin_ia32_cvttpd2dq512_mask(A, (__v8si)_mm256_undefined_si256(), -1, B)) +#define _mm512_mask_cvtt_roundpd_epi32(W, U, A, B) ((__m256i)__builtin_ia32_cvttpd2dq512_mask(A, (__v8si)(W), U, B)) +#define _mm512_maskz_cvtt_roundpd_epi32(U, A, B) ((__m256i)__builtin_ia32_cvttpd2dq512_mask(A, (__v8si)_mm256_setzero_si256(), U, B)) +#define _mm512_cvtt_roundpd_epu32(A, B) 
((__m256i)__builtin_ia32_cvttpd2udq512_mask(A, (__v8si)_mm256_undefined_si256(), -1, B)) +#define _mm512_mask_cvtt_roundpd_epu32(W, U, A, B) ((__m256i)__builtin_ia32_cvttpd2udq512_mask(A, (__v8si)(W), U, B)) +#define _mm512_maskz_cvtt_roundpd_epu32(U, A, B) ((__m256i)__builtin_ia32_cvttpd2udq512_mask(A, (__v8si)_mm256_setzero_si256(), U, B)) #endif - #ifdef __OPTIMIZE__ -__funline __m512i _mm512_cvtt_roundps_epi32(__m512 __A, const int __R) { - return (__m512i)__builtin_ia32_cvttps2dq512_mask( - (__v16sf)__A, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1, __R); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundpd_epi32 (__m512d __A, const int __R) +{ + return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A, + (__v8si) + _mm256_undefined_si256 (), + (__mmask8) -1, __R); } - -__funline __m512i _mm512_mask_cvtt_roundps_epi32(__m512i __W, __mmask16 __U, - __m512 __A, const int __R) { - return (__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)__A, (__v16si)__W, - (__mmask16)__U, __R); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A, + const int __R) +{ + return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A, + (__v8si) __W, + (__mmask8) __U, __R); } - -__funline __m512i _mm512_maskz_cvtt_roundps_epi32(__mmask16 __U, __m512 __A, - const int __R) { - return (__m512i)__builtin_ia32_cvttps2dq512_mask( - (__v16sf)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)__U, __R); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundpd_epi32 (__mmask8 __U, __m512d __A, const int __R) +{ + return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U, __R); } - -__funline __m512i _mm512_cvtt_roundps_epu32(__m512 __A, const int __R) { - return (__m512i)__builtin_ia32_cvttps2udq512_mask( - (__v16sf)__A, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1, __R); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundpd_epu32 (__m512d __A, const int __R) +{ + return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A, + (__v8si) + _mm256_undefined_si256 (), + (__mmask8) -1, __R); } - -__funline __m512i _mm512_mask_cvtt_roundps_epu32(__m512i __W, __mmask16 __U, - __m512 __A, const int __R) { - return (__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)__A, (__v16si)__W, - (__mmask16)__U, __R); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A, + const int __R) +{ + return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A, + (__v8si) __W, + (__mmask8) __U, __R); } - -__funline __m512i _mm512_maskz_cvtt_roundps_epu32(__mmask16 __U, __m512 __A, - const int __R) { - return (__m512i)__builtin_ia32_cvttps2udq512_mask( - (__v16sf)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)__U, __R); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundpd_epu32 (__mmask8 __U, __m512d __A, const int __R) +{ + return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U, __R); } #else -#define _mm512_cvtt_roundps_epi32(A, B) \ - ((__m512i)__builtin_ia32_cvttps2dq512_mask( \ - A, (__v16si)_mm512_undefined_epi32(), -1, B)) - 
-#define _mm512_mask_cvtt_roundps_epi32(W, U, A, B) \ - ((__m512i)__builtin_ia32_cvttps2dq512_mask(A, (__v16si)(W), U, B)) - -#define _mm512_maskz_cvtt_roundps_epi32(U, A, B) \ - ((__m512i)__builtin_ia32_cvttps2dq512_mask( \ - A, (__v16si)_mm512_setzero_si512(), U, B)) - -#define _mm512_cvtt_roundps_epu32(A, B) \ - ((__m512i)__builtin_ia32_cvttps2udq512_mask( \ - A, (__v16si)_mm512_undefined_epi32(), -1, B)) - -#define _mm512_mask_cvtt_roundps_epu32(W, U, A, B) \ - ((__m512i)__builtin_ia32_cvttps2udq512_mask(A, (__v16si)(W), U, B)) - -#define _mm512_maskz_cvtt_roundps_epu32(U, A, B) \ - ((__m512i)__builtin_ia32_cvttps2udq512_mask( \ - A, (__v16si)_mm512_setzero_si512(), U, B)) +#define _mm512_cvt_roundpd_epi32(A, B) ((__m256i)__builtin_ia32_cvtpd2dq512_mask(A, (__v8si)_mm256_undefined_si256(), -1, B)) +#define _mm512_mask_cvt_roundpd_epi32(W, U, A, B) ((__m256i)__builtin_ia32_cvtpd2dq512_mask(A, (__v8si)(W), U, B)) +#define _mm512_maskz_cvt_roundpd_epi32(U, A, B) ((__m256i)__builtin_ia32_cvtpd2dq512_mask(A, (__v8si)_mm256_setzero_si256(), U, B)) +#define _mm512_cvt_roundpd_epu32(A, B) ((__m256i)__builtin_ia32_cvtpd2udq512_mask(A, (__v8si)_mm256_undefined_si256(), -1, B)) +#define _mm512_mask_cvt_roundpd_epu32(W, U, A, B) ((__m256i)__builtin_ia32_cvtpd2udq512_mask(A, (__v8si)(W), U, B)) +#define _mm512_maskz_cvt_roundpd_epu32(U, A, B) ((__m256i)__builtin_ia32_cvtpd2udq512_mask(A, (__v8si)_mm256_setzero_si256(), U, B)) #endif - #ifdef __OPTIMIZE__ -__funline __m512i _mm512_cvt_roundps_epi32(__m512 __A, const int __R) { - return (__m512i)__builtin_ia32_cvtps2dq512_mask( - (__v16sf)__A, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1, __R); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtt_roundps_epi32 (__m512 __A, const int __R) +{ + return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1, __R); } - -__funline __m512i _mm512_mask_cvt_roundps_epi32(__m512i __W, __mmask16 __U, - __m512 __A, const int __R) { - return (__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)__A, (__v16si)__W, - (__mmask16)__U, __R); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtt_roundps_epi32 (__m512i __W, __mmask16 __U, __m512 __A, + const int __R) +{ + return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A, + (__v16si) __W, + (__mmask16) __U, __R); } - -__funline __m512i _mm512_maskz_cvt_roundps_epi32(__mmask16 __U, __m512 __A, - const int __R) { - return (__m512i)__builtin_ia32_cvtps2dq512_mask( - (__v16sf)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)__U, __R); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtt_roundps_epi32 (__mmask16 __U, __m512 __A, const int __R) +{ + return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U, __R); } - -__funline __m512i _mm512_cvt_roundps_epu32(__m512 __A, const int __R) { - return (__m512i)__builtin_ia32_cvtps2udq512_mask( - (__v16sf)__A, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1, __R); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtt_roundps_epu32 (__m512 __A, const int __R) +{ + return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1, __R); } - -__funline __m512i _mm512_mask_cvt_roundps_epu32(__m512i __W, __mmask16 
__U, - __m512 __A, const int __R) { - return (__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)__A, (__v16si)__W, - (__mmask16)__U, __R); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtt_roundps_epu32 (__m512i __W, __mmask16 __U, __m512 __A, + const int __R) +{ + return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A, + (__v16si) __W, + (__mmask16) __U, __R); } - -__funline __m512i _mm512_maskz_cvt_roundps_epu32(__mmask16 __U, __m512 __A, - const int __R) { - return (__m512i)__builtin_ia32_cvtps2udq512_mask( - (__v16sf)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)__U, __R); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtt_roundps_epu32 (__mmask16 __U, __m512 __A, const int __R) +{ + return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U, __R); } #else -#define _mm512_cvt_roundps_epi32(A, B) \ - ((__m512i)__builtin_ia32_cvtps2dq512_mask( \ - A, (__v16si)_mm512_undefined_epi32(), -1, B)) - -#define _mm512_mask_cvt_roundps_epi32(W, U, A, B) \ - ((__m512i)__builtin_ia32_cvtps2dq512_mask(A, (__v16si)(W), U, B)) - -#define _mm512_maskz_cvt_roundps_epi32(U, A, B) \ - ((__m512i)__builtin_ia32_cvtps2dq512_mask( \ - A, (__v16si)_mm512_setzero_si512(), U, B)) - -#define _mm512_cvt_roundps_epu32(A, B) \ - ((__m512i)__builtin_ia32_cvtps2udq512_mask( \ - A, (__v16si)_mm512_undefined_epi32(), -1, B)) - -#define _mm512_mask_cvt_roundps_epu32(W, U, A, B) \ - ((__m512i)__builtin_ia32_cvtps2udq512_mask(A, (__v16si)(W), U, B)) - -#define _mm512_maskz_cvt_roundps_epu32(U, A, B) \ - ((__m512i)__builtin_ia32_cvtps2udq512_mask( \ - A, (__v16si)_mm512_setzero_si512(), U, B)) +#define _mm512_cvtt_roundps_epi32(A, B) ((__m512i)__builtin_ia32_cvttps2dq512_mask(A, (__v16si)_mm512_undefined_epi32 (), -1, B)) +#define _mm512_mask_cvtt_roundps_epi32(W, U, A, B) ((__m512i)__builtin_ia32_cvttps2dq512_mask(A, (__v16si)(W), U, B)) +#define _mm512_maskz_cvtt_roundps_epi32(U, A, B) ((__m512i)__builtin_ia32_cvttps2dq512_mask(A, (__v16si)_mm512_setzero_si512 (), U, B)) +#define _mm512_cvtt_roundps_epu32(A, B) ((__m512i)__builtin_ia32_cvttps2udq512_mask(A, (__v16si)_mm512_undefined_epi32 (), -1, B)) +#define _mm512_mask_cvtt_roundps_epu32(W, U, A, B) ((__m512i)__builtin_ia32_cvttps2udq512_mask(A, (__v16si)(W), U, B)) +#define _mm512_maskz_cvtt_roundps_epu32(U, A, B) ((__m512i)__builtin_ia32_cvttps2udq512_mask(A, (__v16si)_mm512_setzero_si512 (), U, B)) #endif - -__funline __m128d _mm_cvtu32_sd(__m128d __A, unsigned __B) { - return (__m128d)__builtin_ia32_cvtusi2sd32((__v2df)__A, __B); +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundps_epi32 (__m512 __A, const int __R) +{ + return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1, __R); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundps_epi32 (__m512i __W, __mmask16 __U, __m512 __A, + const int __R) +{ + return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A, + (__v16si) __W, + (__mmask16) __U, __R); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundps_epi32 (__mmask16 __U, __m512 __A, const int __R) +{ + return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A, + 
(__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U, __R); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundps_epu32 (__m512 __A, const int __R) +{ + return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1, __R); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundps_epu32 (__m512i __W, __mmask16 __U, __m512 __A, + const int __R) +{ + return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A, + (__v16si) __W, + (__mmask16) __U, __R); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundps_epu32 (__mmask16 __U, __m512 __A, const int __R) +{ + return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U, __R); +} +#else +#define _mm512_cvt_roundps_epi32(A, B) ((__m512i)__builtin_ia32_cvtps2dq512_mask(A, (__v16si)_mm512_undefined_epi32 (), -1, B)) +#define _mm512_mask_cvt_roundps_epi32(W, U, A, B) ((__m512i)__builtin_ia32_cvtps2dq512_mask(A, (__v16si)(W), U, B)) +#define _mm512_maskz_cvt_roundps_epi32(U, A, B) ((__m512i)__builtin_ia32_cvtps2dq512_mask(A, (__v16si)_mm512_setzero_si512 (), U, B)) +#define _mm512_cvt_roundps_epu32(A, B) ((__m512i)__builtin_ia32_cvtps2udq512_mask(A, (__v16si)_mm512_undefined_epi32 (), -1, B)) +#define _mm512_mask_cvt_roundps_epu32(W, U, A, B) ((__m512i)__builtin_ia32_cvtps2udq512_mask(A, (__v16si)(W), U, B)) +#define _mm512_maskz_cvt_roundps_epu32(U, A, B) ((__m512i)__builtin_ia32_cvtps2udq512_mask(A, (__v16si)_mm512_setzero_si512 (), U, B)) +#endif +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtu32_sd (__m128d __A, unsigned __B) +{ + return (__m128d) __builtin_ia32_cvtusi2sd32 ((__v2df) __A, __B); } - #ifdef __x86_64__ #ifdef __OPTIMIZE__ -__funline __m128d _mm_cvt_roundu64_sd(__m128d __A, unsigned long long __B, - const int __R) { - return (__m128d)__builtin_ia32_cvtusi2sd64((__v2df)__A, __B, __R); +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundu64_sd (__m128d __A, unsigned long long __B, const int __R) +{ + return (__m128d) __builtin_ia32_cvtusi2sd64 ((__v2df) __A, __B, __R); } - -__funline __m128d _mm_cvt_roundi64_sd(__m128d __A, long long __B, const int __R) { - return (__m128d)__builtin_ia32_cvtsi2sd64((__v2df)__A, __B, __R); +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundi64_sd (__m128d __A, long long __B, const int __R) +{ + return (__m128d) __builtin_ia32_cvtsi2sd64 ((__v2df) __A, __B, __R); } - -__funline __m128d _mm_cvt_roundsi64_sd(__m128d __A, long long __B, - const int __R) { - return (__m128d)__builtin_ia32_cvtsi2sd64((__v2df)__A, __B, __R); +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundsi64_sd (__m128d __A, long long __B, const int __R) +{ + return (__m128d) __builtin_ia32_cvtsi2sd64 ((__v2df) __A, __B, __R); } #else -#define _mm_cvt_roundu64_sd(A, B, C) \ - (__m128d) __builtin_ia32_cvtusi2sd64(A, B, C) - -#define _mm_cvt_roundi64_sd(A, B, C) \ - (__m128d) __builtin_ia32_cvtsi2sd64(A, B, C) - -#define _mm_cvt_roundsi64_sd(A, B, C) \ - (__m128d) __builtin_ia32_cvtsi2sd64(A, B, C) +#define _mm_cvt_roundu64_sd(A, B, C) (__m128d)__builtin_ia32_cvtusi2sd64(A, B, C) +#define 
_mm_cvt_roundi64_sd(A, B, C) (__m128d)__builtin_ia32_cvtsi2sd64(A, B, C) +#define _mm_cvt_roundsi64_sd(A, B, C) (__m128d)__builtin_ia32_cvtsi2sd64(A, B, C) #endif - #endif - #ifdef __OPTIMIZE__ -__funline __m128 _mm_cvt_roundu32_ss(__m128 __A, unsigned __B, const int __R) { - return (__m128)__builtin_ia32_cvtusi2ss32((__v4sf)__A, __B, __R); +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundu32_ss (__m128 __A, unsigned __B, const int __R) +{ + return (__m128) __builtin_ia32_cvtusi2ss32 ((__v4sf) __A, __B, __R); } - -__funline __m128 _mm_cvt_roundsi32_ss(__m128 __A, int __B, const int __R) { - return (__m128)__builtin_ia32_cvtsi2ss32((__v4sf)__A, __B, __R); +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundsi32_ss (__m128 __A, int __B, const int __R) +{ + return (__m128) __builtin_ia32_cvtsi2ss32 ((__v4sf) __A, __B, __R); } - -__funline __m128 _mm_cvt_roundi32_ss(__m128 __A, int __B, const int __R) { - return (__m128)__builtin_ia32_cvtsi2ss32((__v4sf)__A, __B, __R); +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundi32_ss (__m128 __A, int __B, const int __R) +{ + return (__m128) __builtin_ia32_cvtsi2ss32 ((__v4sf) __A, __B, __R); } #else -#define _mm_cvt_roundu32_ss(A, B, C) \ - (__m128) __builtin_ia32_cvtusi2ss32(A, B, C) - -#define _mm_cvt_roundi32_ss(A, B, C) (__m128) __builtin_ia32_cvtsi2ss32(A, B, C) - -#define _mm_cvt_roundsi32_ss(A, B, C) \ - (__m128) __builtin_ia32_cvtsi2ss32(A, B, C) +#define _mm_cvt_roundu32_ss(A, B, C) (__m128)__builtin_ia32_cvtusi2ss32(A, B, C) +#define _mm_cvt_roundi32_ss(A, B, C) (__m128)__builtin_ia32_cvtsi2ss32(A, B, C) +#define _mm_cvt_roundsi32_ss(A, B, C) (__m128)__builtin_ia32_cvtsi2ss32(A, B, C) #endif - #ifdef __x86_64__ #ifdef __OPTIMIZE__ -__funline __m128 _mm_cvt_roundu64_ss(__m128 __A, unsigned long long __B, - const int __R) { - return (__m128)__builtin_ia32_cvtusi2ss64((__v4sf)__A, __B, __R); +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundu64_ss (__m128 __A, unsigned long long __B, const int __R) +{ + return (__m128) __builtin_ia32_cvtusi2ss64 ((__v4sf) __A, __B, __R); } - -__funline __m128 _mm_cvt_roundsi64_ss(__m128 __A, long long __B, const int __R) { - return (__m128)__builtin_ia32_cvtsi2ss64((__v4sf)__A, __B, __R); +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundsi64_ss (__m128 __A, long long __B, const int __R) +{ + return (__m128) __builtin_ia32_cvtsi2ss64 ((__v4sf) __A, __B, __R); } - -__funline __m128 _mm_cvt_roundi64_ss(__m128 __A, long long __B, const int __R) { - return (__m128)__builtin_ia32_cvtsi2ss64((__v4sf)__A, __B, __R); +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundi64_ss (__m128 __A, long long __B, const int __R) +{ + return (__m128) __builtin_ia32_cvtsi2ss64 ((__v4sf) __A, __B, __R); } #else -#define _mm_cvt_roundu64_ss(A, B, C) \ - (__m128) __builtin_ia32_cvtusi2ss64(A, B, C) - -#define _mm_cvt_roundi64_ss(A, B, C) (__m128) __builtin_ia32_cvtsi2ss64(A, B, C) - -#define _mm_cvt_roundsi64_ss(A, B, C) \ - (__m128) __builtin_ia32_cvtsi2ss64(A, B, C) +#define _mm_cvt_roundu64_ss(A, B, C) (__m128)__builtin_ia32_cvtusi2ss64(A, B, C) +#define _mm_cvt_roundi64_ss(A, B, C) (__m128)__builtin_ia32_cvtsi2ss64(A, B, C) +#define _mm_cvt_roundsi64_ss(A, B, C) (__m128)__builtin_ia32_cvtsi2ss64(A, 
B, C) #endif - #endif - -__funline __m128i _mm512_cvtepi32_epi8(__m512i __A) { - return (__m128i)__builtin_ia32_pmovdb512_mask( - (__v16si)__A, (__v16qi)_mm_undefined_si128(), (__mmask16)-1); -} - -__funline void _mm512_mask_cvtepi32_storeu_epi8(void *__P, __mmask16 __M, - __m512i __A) { - __builtin_ia32_pmovdb512mem_mask((__v16qi *)__P, (__v16si)__A, __M); -} - -__funline __m128i _mm512_mask_cvtepi32_epi8(__m128i __O, __mmask16 __M, - __m512i __A) { - return (__m128i)__builtin_ia32_pmovdb512_mask((__v16si)__A, (__v16qi)__O, - __M); -} - -__funline __m128i _mm512_maskz_cvtepi32_epi8(__mmask16 __M, __m512i __A) { - return (__m128i)__builtin_ia32_pmovdb512_mask( - (__v16si)__A, (__v16qi)_mm_setzero_si128(), __M); -} - -__funline __m128i _mm512_cvtsepi32_epi8(__m512i __A) { - return (__m128i)__builtin_ia32_pmovsdb512_mask( - (__v16si)__A, (__v16qi)_mm_undefined_si128(), (__mmask16)-1); -} - -__funline void _mm512_mask_cvtsepi32_storeu_epi8(void *__P, __mmask16 __M, - __m512i __A) { - __builtin_ia32_pmovsdb512mem_mask((__v16qi *)__P, (__v16si)__A, __M); -} - -__funline __m128i _mm512_mask_cvtsepi32_epi8(__m128i __O, __mmask16 __M, - __m512i __A) { - return (__m128i)__builtin_ia32_pmovsdb512_mask((__v16si)__A, (__v16qi)__O, - __M); -} - -__funline __m128i _mm512_maskz_cvtsepi32_epi8(__mmask16 __M, __m512i __A) { - return (__m128i)__builtin_ia32_pmovsdb512_mask( - (__v16si)__A, (__v16qi)_mm_setzero_si128(), __M); -} - -__funline __m128i _mm512_cvtusepi32_epi8(__m512i __A) { - return (__m128i)__builtin_ia32_pmovusdb512_mask( - (__v16si)__A, (__v16qi)_mm_undefined_si128(), (__mmask16)-1); -} - -__funline void _mm512_mask_cvtusepi32_storeu_epi8(void *__P, __mmask16 __M, - __m512i __A) { - __builtin_ia32_pmovusdb512mem_mask((__v16qi *)__P, (__v16si)__A, __M); -} - -__funline __m128i _mm512_mask_cvtusepi32_epi8(__m128i __O, __mmask16 __M, - __m512i __A) { - return (__m128i)__builtin_ia32_pmovusdb512_mask((__v16si)__A, (__v16qi)__O, - __M); -} - -__funline __m128i _mm512_maskz_cvtusepi32_epi8(__mmask16 __M, __m512i __A) { - return (__m128i)__builtin_ia32_pmovusdb512_mask( - (__v16si)__A, (__v16qi)_mm_setzero_si128(), __M); -} - -__funline __m256i _mm512_cvtepi32_epi16(__m512i __A) { - return (__m256i)__builtin_ia32_pmovdw512_mask( - (__v16si)__A, (__v16hi)_mm256_undefined_si256(), (__mmask16)-1); -} - -__funline void _mm512_mask_cvtepi32_storeu_epi16(void *__P, __mmask16 __M, - __m512i __A) { - __builtin_ia32_pmovdw512mem_mask((__v16hi *)__P, (__v16si)__A, __M); -} - -__funline __m256i _mm512_mask_cvtepi32_epi16(__m256i __O, __mmask16 __M, - __m512i __A) { - return (__m256i)__builtin_ia32_pmovdw512_mask((__v16si)__A, (__v16hi)__O, - __M); -} - -__funline __m256i _mm512_maskz_cvtepi32_epi16(__mmask16 __M, __m512i __A) { - return (__m256i)__builtin_ia32_pmovdw512_mask( - (__v16si)__A, (__v16hi)_mm256_setzero_si256(), __M); -} - -__funline __m256i _mm512_cvtsepi32_epi16(__m512i __A) { - return (__m256i)__builtin_ia32_pmovsdw512_mask( - (__v16si)__A, (__v16hi)_mm256_undefined_si256(), (__mmask16)-1); -} - -__funline void _mm512_mask_cvtsepi32_storeu_epi16(void *__P, __mmask16 __M, - __m512i __A) { - __builtin_ia32_pmovsdw512mem_mask((__v16hi *)__P, (__v16si)__A, __M); -} - -__funline __m256i _mm512_mask_cvtsepi32_epi16(__m256i __O, __mmask16 __M, - __m512i __A) { - return (__m256i)__builtin_ia32_pmovsdw512_mask((__v16si)__A, (__v16hi)__O, - __M); -} - -__funline __m256i _mm512_maskz_cvtsepi32_epi16(__mmask16 __M, __m512i __A) { - return (__m256i)__builtin_ia32_pmovsdw512_mask( - (__v16si)__A, 
(__v16hi)_mm256_setzero_si256(), __M); -} - -__funline __m256i _mm512_cvtusepi32_epi16(__m512i __A) { - return (__m256i)__builtin_ia32_pmovusdw512_mask( - (__v16si)__A, (__v16hi)_mm256_undefined_si256(), (__mmask16)-1); -} - -__funline void _mm512_mask_cvtusepi32_storeu_epi16(void *__P, __mmask16 __M, - __m512i __A) { - __builtin_ia32_pmovusdw512mem_mask((__v16hi *)__P, (__v16si)__A, __M); -} - -__funline __m256i _mm512_mask_cvtusepi32_epi16(__m256i __O, __mmask16 __M, - __m512i __A) { - return (__m256i)__builtin_ia32_pmovusdw512_mask((__v16si)__A, (__v16hi)__O, - __M); -} - -__funline __m256i _mm512_maskz_cvtusepi32_epi16(__mmask16 __M, __m512i __A) { - return (__m256i)__builtin_ia32_pmovusdw512_mask( - (__v16si)__A, (__v16hi)_mm256_setzero_si256(), __M); -} - -__funline __m256i _mm512_cvtepi64_epi32(__m512i __A) { - return (__m256i)__builtin_ia32_pmovqd512_mask( - (__v8di)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1); -} - -__funline void _mm512_mask_cvtepi64_storeu_epi32(void *__P, __mmask8 __M, - __m512i __A) { - __builtin_ia32_pmovqd512mem_mask((__v8si *)__P, (__v8di)__A, __M); -} - -__funline __m256i _mm512_mask_cvtepi64_epi32(__m256i __O, __mmask8 __M, - __m512i __A) { - return (__m256i)__builtin_ia32_pmovqd512_mask((__v8di)__A, (__v8si)__O, __M); -} - -__funline __m256i _mm512_maskz_cvtepi64_epi32(__mmask8 __M, __m512i __A) { - return (__m256i)__builtin_ia32_pmovqd512_mask( - (__v8di)__A, (__v8si)_mm256_setzero_si256(), __M); -} - -__funline __m256i _mm512_cvtsepi64_epi32(__m512i __A) { - return (__m256i)__builtin_ia32_pmovsqd512_mask( - (__v8di)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1); -} - -__funline void _mm512_mask_cvtsepi64_storeu_epi32(void *__P, __mmask8 __M, - __m512i __A) { - __builtin_ia32_pmovsqd512mem_mask((__v8si *)__P, (__v8di)__A, __M); -} - -__funline __m256i _mm512_mask_cvtsepi64_epi32(__m256i __O, __mmask8 __M, - __m512i __A) { - return (__m256i)__builtin_ia32_pmovsqd512_mask((__v8di)__A, (__v8si)__O, __M); -} - -__funline __m256i _mm512_maskz_cvtsepi64_epi32(__mmask8 __M, __m512i __A) { - return (__m256i)__builtin_ia32_pmovsqd512_mask( - (__v8di)__A, (__v8si)_mm256_setzero_si256(), __M); -} - -__funline __m256i _mm512_cvtusepi64_epi32(__m512i __A) { - return (__m256i)__builtin_ia32_pmovusqd512_mask( - (__v8di)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1); -} - -__funline void _mm512_mask_cvtusepi64_storeu_epi32(void *__P, __mmask8 __M, - __m512i __A) { - __builtin_ia32_pmovusqd512mem_mask((__v8si *)__P, (__v8di)__A, __M); -} - -__funline __m256i _mm512_mask_cvtusepi64_epi32(__m256i __O, __mmask8 __M, - __m512i __A) { - return (__m256i)__builtin_ia32_pmovusqd512_mask((__v8di)__A, (__v8si)__O, - __M); -} - -__funline __m256i _mm512_maskz_cvtusepi64_epi32(__mmask8 __M, __m512i __A) { - return (__m256i)__builtin_ia32_pmovusqd512_mask( - (__v8di)__A, (__v8si)_mm256_setzero_si256(), __M); -} - -__funline __m128i _mm512_cvtepi64_epi16(__m512i __A) { - return (__m128i)__builtin_ia32_pmovqw512_mask( - (__v8di)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1); -} - -__funline void _mm512_mask_cvtepi64_storeu_epi16(void *__P, __mmask8 __M, - __m512i __A) { - __builtin_ia32_pmovqw512mem_mask((__v8hi *)__P, (__v8di)__A, __M); -} - -__funline __m128i _mm512_mask_cvtepi64_epi16(__m128i __O, __mmask8 __M, - __m512i __A) { - return (__m128i)__builtin_ia32_pmovqw512_mask((__v8di)__A, (__v8hi)__O, __M); -} - -__funline __m128i _mm512_maskz_cvtepi64_epi16(__mmask8 __M, __m512i __A) { - return (__m128i)__builtin_ia32_pmovqw512_mask( - (__v8di)__A, 
(__v8hi)_mm_setzero_si128(), __M); -} - -__funline __m128i _mm512_cvtsepi64_epi16(__m512i __A) { - return (__m128i)__builtin_ia32_pmovsqw512_mask( - (__v8di)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1); -} - -__funline void _mm512_mask_cvtsepi64_storeu_epi16(void *__P, __mmask8 __M, - __m512i __A) { - __builtin_ia32_pmovsqw512mem_mask((__v8hi *)__P, (__v8di)__A, __M); -} - -__funline __m128i _mm512_mask_cvtsepi64_epi16(__m128i __O, __mmask8 __M, - __m512i __A) { - return (__m128i)__builtin_ia32_pmovsqw512_mask((__v8di)__A, (__v8hi)__O, __M); -} - -__funline __m128i _mm512_maskz_cvtsepi64_epi16(__mmask8 __M, __m512i __A) { - return (__m128i)__builtin_ia32_pmovsqw512_mask( - (__v8di)__A, (__v8hi)_mm_setzero_si128(), __M); -} - -__funline __m128i _mm512_cvtusepi64_epi16(__m512i __A) { - return (__m128i)__builtin_ia32_pmovusqw512_mask( - (__v8di)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1); -} - -__funline void _mm512_mask_cvtusepi64_storeu_epi16(void *__P, __mmask8 __M, - __m512i __A) { - __builtin_ia32_pmovusqw512mem_mask((__v8hi *)__P, (__v8di)__A, __M); -} - -__funline __m128i _mm512_mask_cvtusepi64_epi16(__m128i __O, __mmask8 __M, - __m512i __A) { - return (__m128i)__builtin_ia32_pmovusqw512_mask((__v8di)__A, (__v8hi)__O, - __M); -} - -__funline __m128i _mm512_maskz_cvtusepi64_epi16(__mmask8 __M, __m512i __A) { - return (__m128i)__builtin_ia32_pmovusqw512_mask( - (__v8di)__A, (__v8hi)_mm_setzero_si128(), __M); -} - -__funline __m128i _mm512_cvtepi64_epi8(__m512i __A) { - return (__m128i)__builtin_ia32_pmovqb512_mask( - (__v8di)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1); -} - -__funline void _mm512_mask_cvtepi64_storeu_epi8(void *__P, __mmask8 __M, - __m512i __A) { - __builtin_ia32_pmovqb512mem_mask((__v16qi *)__P, (__v8di)__A, __M); -} - -__funline __m128i _mm512_mask_cvtepi64_epi8(__m128i __O, __mmask8 __M, - __m512i __A) { - return (__m128i)__builtin_ia32_pmovqb512_mask((__v8di)__A, (__v16qi)__O, __M); -} - -__funline __m128i _mm512_maskz_cvtepi64_epi8(__mmask8 __M, __m512i __A) { - return (__m128i)__builtin_ia32_pmovqb512_mask( - (__v8di)__A, (__v16qi)_mm_setzero_si128(), __M); -} - -__funline __m128i _mm512_cvtsepi64_epi8(__m512i __A) { - return (__m128i)__builtin_ia32_pmovsqb512_mask( - (__v8di)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1); -} - -__funline void _mm512_mask_cvtsepi64_storeu_epi8(void *__P, __mmask8 __M, - __m512i __A) { - __builtin_ia32_pmovsqb512mem_mask((__v16qi *)__P, (__v8di)__A, __M); -} - -__funline __m128i _mm512_mask_cvtsepi64_epi8(__m128i __O, __mmask8 __M, - __m512i __A) { - return (__m128i)__builtin_ia32_pmovsqb512_mask((__v8di)__A, (__v16qi)__O, - __M); -} - -__funline __m128i _mm512_maskz_cvtsepi64_epi8(__mmask8 __M, __m512i __A) { - return (__m128i)__builtin_ia32_pmovsqb512_mask( - (__v8di)__A, (__v16qi)_mm_setzero_si128(), __M); -} - -__funline __m128i _mm512_cvtusepi64_epi8(__m512i __A) { - return (__m128i)__builtin_ia32_pmovusqb512_mask( - (__v8di)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1); -} - -__funline void _mm512_mask_cvtusepi64_storeu_epi8(void *__P, __mmask8 __M, - __m512i __A) { - __builtin_ia32_pmovusqb512mem_mask((__v16qi *)__P, (__v8di)__A, __M); -} - -__funline __m128i _mm512_mask_cvtusepi64_epi8(__m128i __O, __mmask8 __M, - __m512i __A) { - return (__m128i)__builtin_ia32_pmovusqb512_mask((__v8di)__A, (__v16qi)__O, - __M); -} - -__funline __m128i _mm512_maskz_cvtusepi64_epi8(__mmask8 __M, __m512i __A) { - return (__m128i)__builtin_ia32_pmovusqb512_mask( - (__v8di)__A, (__v16qi)_mm_setzero_si128(), __M); -} - 
-__funline __m512d _mm512_cvtepi32_pd(__m256i __A) { - return (__m512d)__builtin_ia32_cvtdq2pd512_mask( - (__v8si)__A, (__v8df)_mm512_undefined_pd(), (__mmask8)-1); -} - -__funline __m512d _mm512_mask_cvtepi32_pd(__m512d __W, __mmask8 __U, - __m256i __A) { - return (__m512d)__builtin_ia32_cvtdq2pd512_mask((__v8si)__A, (__v8df)__W, - (__mmask8)__U); -} - -__funline __m512d _mm512_maskz_cvtepi32_pd(__mmask8 __U, __m256i __A) { - return (__m512d)__builtin_ia32_cvtdq2pd512_mask( - (__v8si)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U); -} - -__funline __m512d _mm512_cvtepu32_pd(__m256i __A) { - return (__m512d)__builtin_ia32_cvtudq2pd512_mask( - (__v8si)__A, (__v8df)_mm512_undefined_pd(), (__mmask8)-1); -} - -__funline __m512d _mm512_mask_cvtepu32_pd(__m512d __W, __mmask8 __U, - __m256i __A) { - return (__m512d)__builtin_ia32_cvtudq2pd512_mask((__v8si)__A, (__v8df)__W, - (__mmask8)__U); -} - -__funline __m512d _mm512_maskz_cvtepu32_pd(__mmask8 __U, __m256i __A) { - return (__m512d)__builtin_ia32_cvtudq2pd512_mask( - (__v8si)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U); -} - +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepi32_epi8 (__m512i __A) +{ + return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A, + (__v16qi) + _mm_undefined_si128 (), + (__mmask16) -1); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A) +{ + __builtin_ia32_pmovdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A, + (__v16qi) __O, __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepi32_epi8 (__mmask16 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A, + (__v16qi) + _mm_setzero_si128 (), + __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtsepi32_epi8 (__m512i __A) +{ + return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A, + (__v16qi) + _mm_undefined_si128 (), + (__mmask16) -1); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtsepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A) +{ + __builtin_ia32_pmovsdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtsepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A, + (__v16qi) __O, __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtsepi32_epi8 (__mmask16 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A, + (__v16qi) + _mm_setzero_si128 (), + __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtusepi32_epi8 (__m512i __A) +{ + return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A, + (__v16qi) + _mm_undefined_si128 (), + (__mmask16) -1); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtusepi32_storeu_epi8 (void * 
__P, __mmask16 __M, __m512i __A) +{ + __builtin_ia32_pmovusdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtusepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A, + (__v16qi) __O, + __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtusepi32_epi8 (__mmask16 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A, + (__v16qi) + _mm_setzero_si128 (), + __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepi32_epi16 (__m512i __A) +{ + return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A, + (__v16hi) + _mm256_undefined_si256 (), + (__mmask16) -1); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepi32_storeu_epi16 (void * __P, __mmask16 __M, __m512i __A) +{ + __builtin_ia32_pmovdw512mem_mask ((__v16hi *) __P, (__v16si) __A, __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A) +{ + return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A, + (__v16hi) __O, __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepi32_epi16 (__mmask16 __M, __m512i __A) +{ + return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A, + (__v16hi) + _mm256_setzero_si256 (), + __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtsepi32_epi16 (__m512i __A) +{ + return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A, + (__v16hi) + _mm256_undefined_si256 (), + (__mmask16) -1); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtsepi32_storeu_epi16 (void *__P, __mmask16 __M, __m512i __A) +{ + __builtin_ia32_pmovsdw512mem_mask ((__v16hi*) __P, (__v16si) __A, __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtsepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A) +{ + return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A, + (__v16hi) __O, __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtsepi32_epi16 (__mmask16 __M, __m512i __A) +{ + return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A, + (__v16hi) + _mm256_setzero_si256 (), + __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtusepi32_epi16 (__m512i __A) +{ + return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A, + (__v16hi) + _mm256_undefined_si256 (), + (__mmask16) -1); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtusepi32_storeu_epi16 (void *__P, __mmask16 __M, __m512i __A) +{ + __builtin_ia32_pmovusdw512mem_mask ((__v16hi*) __P, (__v16si) __A, __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtusepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A) +{ + return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A, + (__v16hi) __O, + __M); +} +extern __inline __m256i +__attribute__ 
((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtusepi32_epi16 (__mmask16 __M, __m512i __A) +{ + return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A, + (__v16hi) + _mm256_setzero_si256 (), + __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepi64_epi32 (__m512i __A) +{ + return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A, + (__v8si) + _mm256_undefined_si256 (), + (__mmask8) -1); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepi64_storeu_epi32 (void* __P, __mmask8 __M, __m512i __A) +{ + __builtin_ia32_pmovqd512mem_mask ((__v8si *) __P, (__v8di) __A, __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A) +{ + return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A, + (__v8si) __O, __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepi64_epi32 (__mmask8 __M, __m512i __A) +{ + return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A, + (__v8si) + _mm256_setzero_si256 (), + __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtsepi64_epi32 (__m512i __A) +{ + return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A, + (__v8si) + _mm256_undefined_si256 (), + (__mmask8) -1); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtsepi64_storeu_epi32 (void *__P, __mmask8 __M, __m512i __A) +{ + __builtin_ia32_pmovsqd512mem_mask ((__v8si *) __P, (__v8di) __A, __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtsepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A) +{ + return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A, + (__v8si) __O, __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtsepi64_epi32 (__mmask8 __M, __m512i __A) +{ + return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A, + (__v8si) + _mm256_setzero_si256 (), + __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtusepi64_epi32 (__m512i __A) +{ + return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A, + (__v8si) + _mm256_undefined_si256 (), + (__mmask8) -1); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtusepi64_storeu_epi32 (void* __P, __mmask8 __M, __m512i __A) +{ + __builtin_ia32_pmovusqd512mem_mask ((__v8si*) __P, (__v8di) __A, __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtusepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A) +{ + return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A, + (__v8si) __O, __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtusepi64_epi32 (__mmask8 __M, __m512i __A) +{ + return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A, + (__v8si) + _mm256_setzero_si256 (), + __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepi64_epi16 (__m512i __A) +{ + return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A, + (__v8hi) + 
_mm_undefined_si128 (), + (__mmask8) -1); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A) +{ + __builtin_ia32_pmovqw512mem_mask ((__v8hi *) __P, (__v8di) __A, __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A, + (__v8hi) __O, __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepi64_epi16 (__mmask8 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A, + (__v8hi) + _mm_setzero_si128 (), + __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtsepi64_epi16 (__m512i __A) +{ + return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A, + (__v8hi) + _mm_undefined_si128 (), + (__mmask8) -1); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtsepi64_storeu_epi16 (void * __P, __mmask8 __M, __m512i __A) +{ + __builtin_ia32_pmovsqw512mem_mask ((__v8hi *) __P, (__v8di) __A, __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtsepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A, + (__v8hi) __O, __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtsepi64_epi16 (__mmask8 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A, + (__v8hi) + _mm_setzero_si128 (), + __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtusepi64_epi16 (__m512i __A) +{ + return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A, + (__v8hi) + _mm_undefined_si128 (), + (__mmask8) -1); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtusepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A) +{ + __builtin_ia32_pmovusqw512mem_mask ((__v8hi*) __P, (__v8di) __A, __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtusepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A, + (__v8hi) __O, __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtusepi64_epi16 (__mmask8 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A, + (__v8hi) + _mm_setzero_si128 (), + __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepi64_epi8 (__m512i __A) +{ + return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A, + (__v16qi) + _mm_undefined_si128 (), + (__mmask8) -1); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A) +{ + __builtin_ia32_pmovqb512mem_mask ((unsigned long long *) __P, + (__v8di) __A, __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A) +{ + return 
(__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A, + (__v16qi) __O, __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepi64_epi8 (__mmask8 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A, + (__v16qi) + _mm_setzero_si128 (), + __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtsepi64_epi8 (__m512i __A) +{ + return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A, + (__v16qi) + _mm_undefined_si128 (), + (__mmask8) -1); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtsepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A) +{ + __builtin_ia32_pmovsqb512mem_mask ((unsigned long long *) __P, (__v8di) __A, __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtsepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A, + (__v16qi) __O, __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtsepi64_epi8 (__mmask8 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A, + (__v16qi) + _mm_setzero_si128 (), + __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtusepi64_epi8 (__m512i __A) +{ + return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A, + (__v16qi) + _mm_undefined_si128 (), + (__mmask8) -1); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtusepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A) +{ + __builtin_ia32_pmovusqb512mem_mask ((unsigned long long *) __P, (__v8di) __A, __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtusepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A, + (__v16qi) __O, + __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtusepi64_epi8 (__mmask8 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A, + (__v16qi) + _mm_setzero_si128 (), + __M); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepi32_pd (__m256i __A) +{ + return (__m512d) __builtin_ia32_cvtdq2pd512_mask ((__v8si) __A, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepi32_pd (__m512d __W, __mmask8 __U, __m256i __A) +{ + return (__m512d) __builtin_ia32_cvtdq2pd512_mask ((__v8si) __A, + (__v8df) __W, + (__mmask8) __U); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepi32_pd (__mmask8 __U, __m256i __A) +{ + return (__m512d) __builtin_ia32_cvtdq2pd512_mask ((__v8si) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepu32_pd (__m256i __A) +{ + return (__m512d) __builtin_ia32_cvtudq2pd512_mask ((__v8si) __A, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, 
__always_inline__, __artificial__)) +_mm512_mask_cvtepu32_pd (__m512d __W, __mmask8 __U, __m256i __A) +{ + return (__m512d) __builtin_ia32_cvtudq2pd512_mask ((__v8si) __A, + (__v8df) __W, + (__mmask8) __U); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepu32_pd (__mmask8 __U, __m256i __A) +{ + return (__m512d) __builtin_ia32_cvtudq2pd512_mask ((__v8si) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); +} #ifdef __OPTIMIZE__ -__funline __m512 _mm512_cvt_roundepi32_ps(__m512i __A, const int __R) { - return (__m512)__builtin_ia32_cvtdq2ps512_mask( - (__v16si)__A, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, __R); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundepi32_ps (__m512i __A, const int __R) +{ + return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1, __R); } - -__funline __m512 _mm512_mask_cvt_roundepi32_ps(__m512 __W, __mmask16 __U, - __m512i __A, const int __R) { - return (__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)__A, (__v16sf)__W, - (__mmask16)__U, __R); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundepi32_ps (__m512 __W, __mmask16 __U, __m512i __A, + const int __R) +{ + return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A, + (__v16sf) __W, + (__mmask16) __U, __R); } - -__funline __m512 _mm512_maskz_cvt_roundepi32_ps(__mmask16 __U, __m512i __A, - const int __R) { - return (__m512)__builtin_ia32_cvtdq2ps512_mask( - (__v16si)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U, __R); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundepi32_ps (__mmask16 __U, __m512i __A, const int __R) +{ + return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, __R); } - -__funline __m512 _mm512_cvt_roundepu32_ps(__m512i __A, const int __R) { - return (__m512)__builtin_ia32_cvtudq2ps512_mask( - (__v16si)__A, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, __R); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundepu32_ps (__m512i __A, const int __R) +{ + return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1, __R); } - -__funline __m512 _mm512_mask_cvt_roundepu32_ps(__m512 __W, __mmask16 __U, - __m512i __A, const int __R) { - return (__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)__A, (__v16sf)__W, - (__mmask16)__U, __R); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundepu32_ps (__m512 __W, __mmask16 __U, __m512i __A, + const int __R) +{ + return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A, + (__v16sf) __W, + (__mmask16) __U, __R); } - -__funline __m512 _mm512_maskz_cvt_roundepu32_ps(__mmask16 __U, __m512i __A, - const int __R) { - return (__m512)__builtin_ia32_cvtudq2ps512_mask( - (__v16si)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U, __R); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundepu32_ps (__mmask16 __U, __m512i __A, const int __R) +{ + return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, __R); } - #else -#define _mm512_cvt_roundepi32_ps(A, B) \ - 
(__m512) __builtin_ia32_cvtdq2ps512_mask( \ - (__v16si)(A), (__v16sf)_mm512_undefined_ps(), -1, B) - -#define _mm512_mask_cvt_roundepi32_ps(W, U, A, B) \ - (__m512) __builtin_ia32_cvtdq2ps512_mask((__v16si)(A), W, U, B) - -#define _mm512_maskz_cvt_roundepi32_ps(U, A, B) \ - (__m512) __builtin_ia32_cvtdq2ps512_mask((__v16si)(A), \ - (__v16sf)_mm512_setzero_ps(), U, B) - -#define _mm512_cvt_roundepu32_ps(A, B) \ - (__m512) __builtin_ia32_cvtudq2ps512_mask( \ - (__v16si)(A), (__v16sf)_mm512_undefined_ps(), -1, B) - -#define _mm512_mask_cvt_roundepu32_ps(W, U, A, B) \ - (__m512) __builtin_ia32_cvtudq2ps512_mask((__v16si)(A), W, U, B) - -#define _mm512_maskz_cvt_roundepu32_ps(U, A, B) \ - (__m512) __builtin_ia32_cvtudq2ps512_mask( \ - (__v16si)(A), (__v16sf)_mm512_setzero_ps(), U, B) +#define _mm512_cvt_roundepi32_ps(A, B) (__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(A), (__v16sf)_mm512_undefined_ps(), -1, B) +#define _mm512_mask_cvt_roundepi32_ps(W, U, A, B) (__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(A), W, U, B) +#define _mm512_maskz_cvt_roundepi32_ps(U, A, B) (__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(A), (__v16sf)_mm512_setzero_ps(), U, B) +#define _mm512_cvt_roundepu32_ps(A, B) (__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(A), (__v16sf)_mm512_undefined_ps(), -1, B) +#define _mm512_mask_cvt_roundepu32_ps(W, U, A, B) (__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(A), W, U, B) +#define _mm512_maskz_cvt_roundepu32_ps(U, A, B) (__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(A), (__v16sf)_mm512_setzero_ps(), U, B) #endif - #ifdef __OPTIMIZE__ -__funline __m256d _mm512_extractf64x4_pd(__m512d __A, const int __imm) { - return (__m256d)__builtin_ia32_extractf64x4_mask( - (__v8df)__A, __imm, (__v4df)_mm256_undefined_pd(), (__mmask8)-1); +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_extractf64x4_pd (__m512d __A, const int __imm) +{ + return (__m256d) __builtin_ia32_extractf64x4_mask ((__v8df) __A, + __imm, + (__v4df) + _mm256_undefined_pd (), + (__mmask8) -1); } - -__funline __m256d _mm512_mask_extractf64x4_pd(__m256d __W, __mmask8 __U, - __m512d __A, const int __imm) { - return (__m256d)__builtin_ia32_extractf64x4_mask((__v8df)__A, __imm, - (__v4df)__W, (__mmask8)__U); +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_extractf64x4_pd (__m256d __W, __mmask8 __U, __m512d __A, + const int __imm) +{ + return (__m256d) __builtin_ia32_extractf64x4_mask ((__v8df) __A, + __imm, + (__v4df) __W, + (__mmask8) __U); } - -__funline __m256d _mm512_maskz_extractf64x4_pd(__mmask8 __U, __m512d __A, - const int __imm) { - return (__m256d)__builtin_ia32_extractf64x4_mask( - (__v8df)__A, __imm, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_extractf64x4_pd (__mmask8 __U, __m512d __A, const int __imm) +{ + return (__m256d) __builtin_ia32_extractf64x4_mask ((__v8df) __A, + __imm, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); } - -__funline __m128 _mm512_extractf32x4_ps(__m512 __A, const int __imm) { - return (__m128)__builtin_ia32_extractf32x4_mask( - (__v16sf)__A, __imm, (__v4sf)_mm_undefined_ps(), (__mmask8)-1); +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_extractf32x4_ps (__m512 __A, const int __imm) +{ + return (__m128) __builtin_ia32_extractf32x4_mask ((__v16sf) __A, + __imm, + (__v4sf) + _mm_undefined_ps 
(), + (__mmask8) -1); } - -__funline __m128 _mm512_mask_extractf32x4_ps(__m128 __W, __mmask8 __U, __m512 __A, - const int __imm) { - return (__m128)__builtin_ia32_extractf32x4_mask((__v16sf)__A, __imm, - (__v4sf)__W, (__mmask8)__U); +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_extractf32x4_ps (__m128 __W, __mmask8 __U, __m512 __A, + const int __imm) +{ + return (__m128) __builtin_ia32_extractf32x4_mask ((__v16sf) __A, + __imm, + (__v4sf) __W, + (__mmask8) __U); } - -__funline __m128 _mm512_maskz_extractf32x4_ps(__mmask8 __U, __m512 __A, - const int __imm) { - return (__m128)__builtin_ia32_extractf32x4_mask( - (__v16sf)__A, __imm, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_extractf32x4_ps (__mmask8 __U, __m512 __A, const int __imm) +{ + return (__m128) __builtin_ia32_extractf32x4_mask ((__v16sf) __A, + __imm, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); } - -__funline __m256i _mm512_extracti64x4_epi64(__m512i __A, const int __imm) { - return (__m256i)__builtin_ia32_extracti64x4_mask( - (__v8di)__A, __imm, (__v4di)_mm256_undefined_si256(), (__mmask8)-1); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_extracti64x4_epi64 (__m512i __A, const int __imm) +{ + return (__m256i) __builtin_ia32_extracti64x4_mask ((__v8di) __A, + __imm, + (__v4di) + _mm256_undefined_si256 (), + (__mmask8) -1); } - -__funline __m256i _mm512_mask_extracti64x4_epi64(__m256i __W, __mmask8 __U, - __m512i __A, const int __imm) { - return (__m256i)__builtin_ia32_extracti64x4_mask((__v8di)__A, __imm, - (__v4di)__W, (__mmask8)__U); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_extracti64x4_epi64 (__m256i __W, __mmask8 __U, __m512i __A, + const int __imm) +{ + return (__m256i) __builtin_ia32_extracti64x4_mask ((__v8di) __A, + __imm, + (__v4di) __W, + (__mmask8) __U); } - -__funline __m256i _mm512_maskz_extracti64x4_epi64(__mmask8 __U, __m512i __A, - const int __imm) { - return (__m256i)__builtin_ia32_extracti64x4_mask( - (__v8di)__A, __imm, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_extracti64x4_epi64 (__mmask8 __U, __m512i __A, const int __imm) +{ + return (__m256i) __builtin_ia32_extracti64x4_mask ((__v8di) __A, + __imm, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); } - -__funline __m128i _mm512_extracti32x4_epi32(__m512i __A, const int __imm) { - return (__m128i)__builtin_ia32_extracti32x4_mask( - (__v16si)__A, __imm, (__v4si)_mm_undefined_si128(), (__mmask8)-1); +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_extracti32x4_epi32 (__m512i __A, const int __imm) +{ + return (__m128i) __builtin_ia32_extracti32x4_mask ((__v16si) __A, + __imm, + (__v4si) + _mm_undefined_si128 (), + (__mmask8) -1); } - -__funline __m128i _mm512_mask_extracti32x4_epi32(__m128i __W, __mmask8 __U, - __m512i __A, const int __imm) { - return (__m128i)__builtin_ia32_extracti32x4_mask((__v16si)__A, __imm, - (__v4si)__W, (__mmask8)__U); +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_extracti32x4_epi32 (__m128i __W, __mmask8 __U, __m512i __A, + const int __imm) +{ + return (__m128i) __builtin_ia32_extracti32x4_mask ((__v16si) __A, + __imm, 
+ (__v4si) __W, + (__mmask8) __U); } - -__funline __m128i _mm512_maskz_extracti32x4_epi32(__mmask8 __U, __m512i __A, - const int __imm) { - return (__m128i)__builtin_ia32_extracti32x4_mask( - (__v16si)__A, __imm, (__v4si)_mm_setzero_si128(), (__mmask8)__U); +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_extracti32x4_epi32 (__mmask8 __U, __m512i __A, const int __imm) +{ + return (__m128i) __builtin_ia32_extracti32x4_mask ((__v16si) __A, + __imm, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); } #else - -#define _mm512_extractf64x4_pd(X, C) \ - ((__m256d)__builtin_ia32_extractf64x4_mask( \ - (__v8df)(__m512d)(X), (int)(C), (__v4df)(__m256d)_mm256_undefined_pd(), \ - (__mmask8)-1)) - -#define _mm512_mask_extractf64x4_pd(W, U, X, C) \ - ((__m256d)__builtin_ia32_extractf64x4_mask( \ - (__v8df)(__m512d)(X), (int)(C), (__v4df)(__m256d)(W), (__mmask8)(U))) - -#define _mm512_maskz_extractf64x4_pd(U, X, C) \ - ((__m256d)__builtin_ia32_extractf64x4_mask( \ - (__v8df)(__m512d)(X), (int)(C), (__v4df)(__m256d)_mm256_setzero_pd(), \ - (__mmask8)(U))) - -#define _mm512_extractf32x4_ps(X, C) \ - ((__m128)__builtin_ia32_extractf32x4_mask( \ - (__v16sf)(__m512)(X), (int)(C), (__v4sf)(__m128)_mm_undefined_ps(), \ - (__mmask8)-1)) - -#define _mm512_mask_extractf32x4_ps(W, U, X, C) \ - ((__m128)__builtin_ia32_extractf32x4_mask( \ - (__v16sf)(__m512)(X), (int)(C), (__v4sf)(__m128)(W), (__mmask8)(U))) - -#define _mm512_maskz_extractf32x4_ps(U, X, C) \ - ((__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(X), (int)(C), \ - (__v4sf)(__m128)_mm_setzero_ps(), \ - (__mmask8)(U))) - -#define _mm512_extracti64x4_epi64(X, C) \ - ((__m256i)__builtin_ia32_extracti64x4_mask( \ - (__v8di)(__m512i)(X), (int)(C), \ - (__v4di)(__m256i)_mm256_undefined_si256(), (__mmask8)-1)) - -#define _mm512_mask_extracti64x4_epi64(W, U, X, C) \ - ((__m256i)__builtin_ia32_extracti64x4_mask( \ - (__v8di)(__m512i)(X), (int)(C), (__v4di)(__m256i)(W), (__mmask8)(U))) - -#define _mm512_maskz_extracti64x4_epi64(U, X, C) \ - ((__m256i)__builtin_ia32_extracti64x4_mask( \ - (__v8di)(__m512i)(X), (int)(C), (__v4di)(__m256i)_mm256_setzero_si256(), \ - (__mmask8)(U))) - -#define _mm512_extracti32x4_epi32(X, C) \ - ((__m128i)__builtin_ia32_extracti32x4_mask( \ - (__v16si)(__m512i)(X), (int)(C), (__v4si)(__m128i)_mm_undefined_si128(), \ - (__mmask8)-1)) - -#define _mm512_mask_extracti32x4_epi32(W, U, X, C) \ - ((__m128i)__builtin_ia32_extracti32x4_mask( \ - (__v16si)(__m512i)(X), (int)(C), (__v4si)(__m128i)(W), (__mmask8)(U))) - -#define _mm512_maskz_extracti32x4_epi32(U, X, C) \ - ((__m128i)__builtin_ia32_extracti32x4_mask( \ - (__v16si)(__m512i)(X), (int)(C), (__v4si)(__m128i)_mm_setzero_si128(), \ - (__mmask8)(U))) +#define _mm512_extractf64x4_pd(X, C) ((__m256d) __builtin_ia32_extractf64x4_mask ((__v8df)(__m512d) (X), (int) (C), (__v4df)(__m256d)_mm256_undefined_pd(), (__mmask8)-1)) +#define _mm512_mask_extractf64x4_pd(W, U, X, C) ((__m256d) __builtin_ia32_extractf64x4_mask ((__v8df)(__m512d) (X), (int) (C), (__v4df)(__m256d)(W), (__mmask8)(U))) +#define _mm512_maskz_extractf64x4_pd(U, X, C) ((__m256d) __builtin_ia32_extractf64x4_mask ((__v8df)(__m512d) (X), (int) (C), (__v4df)(__m256d)_mm256_setzero_pd(), (__mmask8)(U))) +#define _mm512_extractf32x4_ps(X, C) ((__m128) __builtin_ia32_extractf32x4_mask ((__v16sf)(__m512) (X), (int) (C), (__v4sf)(__m128)_mm_undefined_ps(), (__mmask8)-1)) +#define _mm512_mask_extractf32x4_ps(W, U, X, C) ((__m128) __builtin_ia32_extractf32x4_mask 
((__v16sf)(__m512) (X), (int) (C), (__v4sf)(__m128)(W), (__mmask8)(U))) +#define _mm512_maskz_extractf32x4_ps(U, X, C) ((__m128) __builtin_ia32_extractf32x4_mask ((__v16sf)(__m512) (X), (int) (C), (__v4sf)(__m128)_mm_setzero_ps(), (__mmask8)(U))) +#define _mm512_extracti64x4_epi64(X, C) ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (X), (int) (C), (__v4di)(__m256i)_mm256_undefined_si256 (), (__mmask8)-1)) +#define _mm512_mask_extracti64x4_epi64(W, U, X, C) ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (X), (int) (C), (__v4di)(__m256i)(W), (__mmask8)(U))) +#define _mm512_maskz_extracti64x4_epi64(U, X, C) ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (X), (int) (C), (__v4di)(__m256i)_mm256_setzero_si256 (), (__mmask8)(U))) +#define _mm512_extracti32x4_epi32(X, C) ((__m128i) __builtin_ia32_extracti32x4_mask ((__v16si)(__m512i) (X), (int) (C), (__v4si)(__m128i)_mm_undefined_si128 (), (__mmask8)-1)) +#define _mm512_mask_extracti32x4_epi32(W, U, X, C) ((__m128i) __builtin_ia32_extracti32x4_mask ((__v16si)(__m512i) (X), (int) (C), (__v4si)(__m128i)(W), (__mmask8)(U))) +#define _mm512_maskz_extracti32x4_epi32(U, X, C) ((__m128i) __builtin_ia32_extracti32x4_mask ((__v16si)(__m512i) (X), (int) (C), (__v4si)(__m128i)_mm_setzero_si128 (), (__mmask8)(U))) #endif - #ifdef __OPTIMIZE__ -__funline __m512i _mm512_inserti32x4(__m512i __A, __m128i __B, const int __imm) { - return (__m512i)__builtin_ia32_inserti32x4_mask((__v16si)__A, (__v4si)__B, - __imm, (__v16si)__A, -1); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_inserti32x4 (__m512i __A, __m128i __B, const int __imm) +{ + return (__m512i) __builtin_ia32_inserti32x4_mask ((__v16si) __A, + (__v4si) __B, + __imm, + (__v16si) __A, -1); } - -__funline __m512 _mm512_insertf32x4(__m512 __A, __m128 __B, const int __imm) { - return (__m512)__builtin_ia32_insertf32x4_mask((__v16sf)__A, (__v4sf)__B, - __imm, (__v16sf)__A, -1); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_insertf32x4 (__m512 __A, __m128 __B, const int __imm) +{ + return (__m512) __builtin_ia32_insertf32x4_mask ((__v16sf) __A, + (__v4sf) __B, + __imm, + (__v16sf) __A, -1); } - -__funline __m512i _mm512_inserti64x4(__m512i __A, __m256i __B, const int __imm) { - return (__m512i)__builtin_ia32_inserti64x4_mask( - (__v8di)__A, (__v4di)__B, __imm, (__v8di)_mm512_undefined_epi32(), - (__mmask8)-1); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_inserti64x4 (__m512i __A, __m256i __B, const int __imm) +{ + return (__m512i) __builtin_ia32_inserti64x4_mask ((__v8di) __A, + (__v4di) __B, + __imm, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); } - -__funline __m512i _mm512_mask_inserti64x4(__m512i __W, __mmask8 __U, __m512i __A, - __m256i __B, const int __imm) { - return (__m512i)__builtin_ia32_inserti64x4_mask( - (__v8di)__A, (__v4di)__B, __imm, (__v8di)__W, (__mmask8)__U); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_inserti64x4 (__m512i __W, __mmask8 __U, __m512i __A, + __m256i __B, const int __imm) +{ + return (__m512i) __builtin_ia32_inserti64x4_mask ((__v8di) __A, + (__v4di) __B, + __imm, + (__v8di) __W, + (__mmask8) __U); } - -__funline __m512i _mm512_maskz_inserti64x4(__mmask8 __U, __m512i __A, __m256i __B, - const int __imm) { - return (__m512i)__builtin_ia32_inserti64x4_mask( - (__v8di)__A, (__v4di)__B, __imm, 
(__v8di)_mm512_setzero_si512(), - (__mmask8)__U); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_inserti64x4 (__mmask8 __U, __m512i __A, __m256i __B, + const int __imm) +{ + return (__m512i) __builtin_ia32_inserti64x4_mask ((__v8di) __A, + (__v4di) __B, + __imm, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); } - -__funline __m512d _mm512_insertf64x4(__m512d __A, __m256d __B, const int __imm) { - return (__m512d)__builtin_ia32_insertf64x4_mask( - (__v8df)__A, (__v4df)__B, __imm, (__v8df)_mm512_undefined_pd(), - (__mmask8)-1); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_insertf64x4 (__m512d __A, __m256d __B, const int __imm) +{ + return (__m512d) __builtin_ia32_insertf64x4_mask ((__v8df) __A, + (__v4df) __B, + __imm, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1); } - -__funline __m512d _mm512_mask_insertf64x4(__m512d __W, __mmask8 __U, __m512d __A, - __m256d __B, const int __imm) { - return (__m512d)__builtin_ia32_insertf64x4_mask( - (__v8df)__A, (__v4df)__B, __imm, (__v8df)__W, (__mmask8)__U); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_insertf64x4 (__m512d __W, __mmask8 __U, __m512d __A, + __m256d __B, const int __imm) +{ + return (__m512d) __builtin_ia32_insertf64x4_mask ((__v8df) __A, + (__v4df) __B, + __imm, + (__v8df) __W, + (__mmask8) __U); } - -__funline __m512d _mm512_maskz_insertf64x4(__mmask8 __U, __m512d __A, __m256d __B, - const int __imm) { - return (__m512d)__builtin_ia32_insertf64x4_mask( - (__v8df)__A, (__v4df)__B, __imm, (__v8df)_mm512_setzero_pd(), - (__mmask8)__U); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_insertf64x4 (__mmask8 __U, __m512d __A, __m256d __B, + const int __imm) +{ + return (__m512d) __builtin_ia32_insertf64x4_mask ((__v8df) __A, + (__v4df) __B, + __imm, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); } #else -#define _mm512_insertf32x4(X, Y, C) \ - ((__m512)__builtin_ia32_insertf32x4_mask( \ - (__v16sf)(__m512)(X), (__v4sf)(__m128)(Y), (int)(C), \ - (__v16sf)(__m512)(X), (__mmask16)(-1))) - -#define _mm512_inserti32x4(X, Y, C) \ - ((__m512i)__builtin_ia32_inserti32x4_mask( \ - (__v16si)(__m512i)(X), (__v4si)(__m128i)(Y), (int)(C), \ - (__v16si)(__m512i)(X), (__mmask16)(-1))) - -#define _mm512_insertf64x4(X, Y, C) \ - ((__m512d)__builtin_ia32_insertf64x4_mask( \ - (__v8df)(__m512d)(X), (__v4df)(__m256d)(Y), (int)(C), \ - (__v8df)(__m512d)_mm512_undefined_pd(), (__mmask8)-1)) - -#define _mm512_mask_insertf64x4(W, U, X, Y, C) \ - ((__m512d)__builtin_ia32_insertf64x4_mask( \ - (__v8df)(__m512d)(X), (__v4df)(__m256d)(Y), (int)(C), \ - (__v8df)(__m512d)(W), (__mmask8)(U))) - -#define _mm512_maskz_insertf64x4(U, X, Y, C) \ - ((__m512d)__builtin_ia32_insertf64x4_mask( \ - (__v8df)(__m512d)(X), (__v4df)(__m256d)(Y), (int)(C), \ - (__v8df)(__m512d)_mm512_setzero_pd(), (__mmask8)(U))) - -#define _mm512_inserti64x4(X, Y, C) \ - ((__m512i)__builtin_ia32_inserti64x4_mask( \ - (__v8di)(__m512i)(X), (__v4di)(__m256i)(Y), (int)(C), \ - (__v8di)(__m512i)_mm512_undefined_epi32(), (__mmask8)-1)) - -#define _mm512_mask_inserti64x4(W, U, X, Y, C) \ - ((__m512i)__builtin_ia32_inserti64x4_mask( \ - (__v8di)(__m512i)(X), (__v4di)(__m256i)(Y), (int)(C), \ - (__v8di)(__m512i)(W), (__mmask8)(U))) - -#define _mm512_maskz_inserti64x4(U, X, Y, C) \ - ((__m512i)__builtin_ia32_inserti64x4_mask( \ - (__v8di)(__m512i)(X), 
(__v4di)(__m256i)(Y), (int)(C), \ - (__v8di)(__m512i)_mm512_setzero_si512(), (__mmask8)(U))) +#define _mm512_insertf32x4(X, Y, C) ((__m512) __builtin_ia32_insertf32x4_mask ((__v16sf)(__m512) (X), (__v4sf)(__m128) (Y), (int) (C), (__v16sf)(__m512) (X), (__mmask16)(-1))) +#define _mm512_inserti32x4(X, Y, C) ((__m512i) __builtin_ia32_inserti32x4_mask ((__v16si)(__m512i) (X), (__v4si)(__m128i) (Y), (int) (C), (__v16si)(__m512i) (X), (__mmask16)(-1))) +#define _mm512_insertf64x4(X, Y, C) ((__m512d) __builtin_ia32_insertf64x4_mask ((__v8df)(__m512d) (X), (__v4df)(__m256d) (Y), (int) (C), (__v8df)(__m512d)_mm512_undefined_pd(), (__mmask8)-1)) +#define _mm512_mask_insertf64x4(W, U, X, Y, C) ((__m512d) __builtin_ia32_insertf64x4_mask ((__v8df)(__m512d) (X), (__v4df)(__m256d) (Y), (int) (C), (__v8df)(__m512d)(W), (__mmask8)(U))) +#define _mm512_maskz_insertf64x4(U, X, Y, C) ((__m512d) __builtin_ia32_insertf64x4_mask ((__v8df)(__m512d) (X), (__v4df)(__m256d) (Y), (int) (C), (__v8df)(__m512d)_mm512_setzero_pd(), (__mmask8)(U))) +#define _mm512_inserti64x4(X, Y, C) ((__m512i) __builtin_ia32_inserti64x4_mask ((__v8di)(__m512i) (X), (__v4di)(__m256i) (Y), (int) (C), (__v8di)(__m512i)_mm512_undefined_epi32 (), (__mmask8)-1)) +#define _mm512_mask_inserti64x4(W, U, X, Y, C) ((__m512i) __builtin_ia32_inserti64x4_mask ((__v8di)(__m512i) (X), (__v4di)(__m256i) (Y), (int) (C), (__v8di)(__m512i)(W), (__mmask8)(U))) +#define _mm512_maskz_inserti64x4(U, X, Y, C) ((__m512i) __builtin_ia32_inserti64x4_mask ((__v8di)(__m512i) (X), (__v4di)(__m256i) (Y), (int) (C), (__v8di)(__m512i)_mm512_setzero_si512 (), (__mmask8)(U))) #endif - -__funline __m512d _mm512_loadu_pd(void const *__P) { +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_loadu_pd (void const *__P) +{ return *(__m512d_u *)__P; } - -__funline __m512d _mm512_mask_loadu_pd(__m512d __W, __mmask8 __U, - void const *__P) { - return (__m512d)__builtin_ia32_loadupd512_mask((const double *)__P, - (__v8df)__W, (__mmask8)__U); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_loadu_pd (__m512d __W, __mmask8 __U, void const *__P) +{ + return (__m512d) __builtin_ia32_loadupd512_mask ((const double *) __P, + (__v8df) __W, + (__mmask8) __U); } - -__funline __m512d _mm512_maskz_loadu_pd(__mmask8 __U, void const *__P) { - return (__m512d)__builtin_ia32_loadupd512_mask( - (const double *)__P, (__v8df)_mm512_setzero_pd(), (__mmask8)__U); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_loadu_pd (__mmask8 __U, void const *__P) +{ + return (__m512d) __builtin_ia32_loadupd512_mask ((const double *) __P, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); } - -__funline void _mm512_storeu_pd(void *__P, __m512d __A) { +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_storeu_pd (void *__P, __m512d __A) +{ *(__m512d_u *)__P = __A; } - -__funline void _mm512_mask_storeu_pd(void *__P, __mmask8 __U, __m512d __A) { - __builtin_ia32_storeupd512_mask((double *)__P, (__v8df)__A, (__mmask8)__U); +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_storeu_pd (void *__P, __mmask8 __U, __m512d __A) +{ + __builtin_ia32_storeupd512_mask ((double *) __P, (__v8df) __A, + (__mmask8) __U); } - -__funline __m512 _mm512_loadu_ps(void const *__P) { +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, 
__artificial__)) +_mm512_loadu_ps (void const *__P) +{ return *(__m512_u *)__P; } - -__funline __m512 _mm512_mask_loadu_ps(__m512 __W, __mmask16 __U, - void const *__P) { - return (__m512)__builtin_ia32_loadups512_mask((const float *)__P, - (__v16sf)__W, (__mmask16)__U); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_loadu_ps (__m512 __W, __mmask16 __U, void const *__P) +{ + return (__m512) __builtin_ia32_loadups512_mask ((const float *) __P, + (__v16sf) __W, + (__mmask16) __U); } - -__funline __m512 _mm512_maskz_loadu_ps(__mmask16 __U, void const *__P) { - return (__m512)__builtin_ia32_loadups512_mask( - (const float *)__P, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_loadu_ps (__mmask16 __U, void const *__P) +{ + return (__m512) __builtin_ia32_loadups512_mask ((const float *) __P, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); } - -__funline void _mm512_storeu_ps(void *__P, __m512 __A) { +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_storeu_ps (void *__P, __m512 __A) +{ *(__m512_u *)__P = __A; } - -__funline void _mm512_mask_storeu_ps(void *__P, __mmask16 __U, __m512 __A) { - __builtin_ia32_storeups512_mask((float *)__P, (__v16sf)__A, (__mmask16)__U); +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_storeu_ps (void *__P, __mmask16 __U, __m512 __A) +{ + __builtin_ia32_storeups512_mask ((float *) __P, (__v16sf) __A, + (__mmask16) __U); } - -__funline __m128 _mm_mask_load_ss(__m128 __W, __mmask8 __U, const float *__P) { - return (__m128)__builtin_ia32_loadss_mask(__P, (__v4sf)__W, __U); +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_load_ss (__m128 __W, __mmask8 __U, const float *__P) +{ + return (__m128) __builtin_ia32_loadss_mask (__P, (__v4sf) __W, __U); } - -__funline __m128 _mm_maskz_load_ss(__mmask8 __U, const float *__P) { - return (__m128)__builtin_ia32_loadss_mask(__P, (__v4sf)_mm_setzero_ps(), __U); +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_load_ss (__mmask8 __U, const float *__P) +{ + return (__m128) __builtin_ia32_loadss_mask (__P, (__v4sf) _mm_setzero_ps (), + __U); } - -__funline __m128d _mm_mask_load_sd(__m128d __W, __mmask8 __U, const double *__P) { - return (__m128d)__builtin_ia32_loadsd_mask(__P, (__v2df)__W, __U); +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_load_sd (__m128d __W, __mmask8 __U, const double *__P) +{ + return (__m128d) __builtin_ia32_loadsd_mask (__P, (__v2df) __W, __U); } - -__funline __m128d _mm_maskz_load_sd(__mmask8 __U, const double *__P) { - return (__m128d)__builtin_ia32_loadsd_mask(__P, (__v2df)_mm_setzero_pd(), - __U); +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_load_sd (__mmask8 __U, const double *__P) +{ + return (__m128d) __builtin_ia32_loadsd_mask (__P, (__v2df) _mm_setzero_pd (), + __U); } - -__funline __m128 _mm_mask_move_ss(__m128 __W, __mmask8 __U, __m128 __A, - __m128 __B) { - return (__m128)__builtin_ia32_movess_mask((__v4sf)__A, (__v4sf)__B, - (__v4sf)__W, __U); +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_move_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + 
return (__m128) __builtin_ia32_movess_mask ((__v4sf) __A, (__v4sf) __B, + (__v4sf) __W, __U); } - -__funline __m128 _mm_maskz_move_ss(__mmask8 __U, __m128 __A, __m128 __B) { - return (__m128)__builtin_ia32_movess_mask((__v4sf)__A, (__v4sf)__B, - (__v4sf)_mm_setzero_ps(), __U); +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_move_ss (__mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_movess_mask ((__v4sf) __A, (__v4sf) __B, + (__v4sf) _mm_setzero_ps (), __U); } - -__funline __m128d _mm_mask_move_sd(__m128d __W, __mmask8 __U, __m128d __A, - __m128d __B) { - return (__m128d)__builtin_ia32_movesd_mask((__v2df)__A, (__v2df)__B, - (__v2df)__W, __U); +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_move_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_movesd_mask ((__v2df) __A, (__v2df) __B, + (__v2df) __W, __U); } - -__funline __m128d _mm_maskz_move_sd(__mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_movesd_mask((__v2df)__A, (__v2df)__B, - (__v2df)_mm_setzero_pd(), __U); +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_move_sd (__mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_movesd_mask ((__v2df) __A, (__v2df) __B, + (__v2df) _mm_setzero_pd (), + __U); } - -__funline void _mm_mask_store_ss(float *__P, __mmask8 __U, __m128 __A) { - __builtin_ia32_storess_mask(__P, (__v4sf)__A, (__mmask8)__U); +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_store_ss (float *__P, __mmask8 __U, __m128 __A) +{ + __builtin_ia32_storess_mask (__P, (__v4sf) __A, (__mmask8) __U); } - -__funline void _mm_mask_store_sd(double *__P, __mmask8 __U, __m128d __A) { - __builtin_ia32_storesd_mask(__P, (__v2df)__A, (__mmask8)__U); +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_store_sd (double *__P, __mmask8 __U, __m128d __A) +{ + __builtin_ia32_storesd_mask (__P, (__v2df) __A, (__mmask8) __U); } - -__funline __m512i _mm512_mask_loadu_epi64(__m512i __W, __mmask8 __U, - void const *__P) { - return (__m512i)__builtin_ia32_loaddqudi512_mask((const long long *)__P, - (__v8di)__W, (__mmask8)__U); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_loadu_epi64 (void const *__P) +{ + return *(__m512i_u *) __P; } - -__funline __m512i _mm512_maskz_loadu_epi64(__mmask8 __U, void const *__P) { - return (__m512i)__builtin_ia32_loaddqudi512_mask( - (const long long *)__P, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_loadu_epi64 (__m512i __W, __mmask8 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_loaddqudi512_mask ((const long long *) __P, + (__v8di) __W, + (__mmask8) __U); } - -__funline void _mm512_mask_storeu_epi64(void *__P, __mmask8 __U, __m512i __A) { - __builtin_ia32_storedqudi512_mask((long long *)__P, (__v8di)__A, - (__mmask8)__U); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_loadu_epi64 (__mmask8 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_loaddqudi512_mask ((const long long *) __P, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); } - -__funline __m512i _mm512_loadu_si512(void const *__P) { 
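/*
 * Editorial aside (not part of the patch): the hunks in this region replace
 * cosmopolitan's condensed `__funline` definitions with the upstream GCC
 * spelling, `extern __inline` plus the `__gnu_inline__`, `__always_inline__`
 * and `__artificial__` attributes. The spelling of the inline qualifier
 * changes, but each intrinsic keeps its signature and semantics, so existing
 * callers compile unchanged; the conversion also picks up a few upstream
 * additions such as _mm512_loadu_epi64 / _mm512_storeu_epi64, visible below
 * as `+` lines with no `-` counterpart. A minimal usage sketch of the masked
 * load/store intrinsics converted here (the function, buffer names, and the
 * 0x0F mask are hypothetical; build with -mavx512f):
 */
#include <immintrin.h>

/* Copy the low four 64-bit lanes of src into dst, leaving dst[4..7] alone. */
static void copy_low_lanes(long long dst[8], const long long src[8]) {
  /* masked-zero load: lanes 0-3 come from src, lanes 4-7 are zeroed */
  __m512i v = _mm512_maskz_loadu_epi64((__mmask8)0x0F, src);
  /* masked store: only lanes 0-3 are written back */
  _mm512_mask_storeu_epi64(dst, (__mmask8)0x0F, v);
}
/* End of aside; the diff resumes below. */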
+extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_storeu_epi64 (void *__P, __m512i __A) +{ + *(__m512i_u *) __P = (__m512i_u) __A; +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_storeu_epi64 (void *__P, __mmask8 __U, __m512i __A) +{ + __builtin_ia32_storedqudi512_mask ((long long *) __P, (__v8di) __A, + (__mmask8) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_loadu_si512 (void const *__P) +{ return *(__m512i_u *)__P; } - -__funline __m512i _mm512_mask_loadu_epi32(__m512i __W, __mmask16 __U, - void const *__P) { - return (__m512i)__builtin_ia32_loaddqusi512_mask( - (const int *)__P, (__v16si)__W, (__mmask16)__U); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_loadu_epi32 (void const *__P) +{ + return *(__m512i_u *) __P; } - -__funline __m512i _mm512_maskz_loadu_epi32(__mmask16 __U, void const *__P) { - return (__m512i)__builtin_ia32_loaddqusi512_mask( - (const int *)__P, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_loadu_epi32 (__m512i __W, __mmask16 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_loaddqusi512_mask ((const int *) __P, + (__v16si) __W, + (__mmask16) __U); } - -__funline void _mm512_storeu_si512(void *__P, __m512i __A) { +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_loadu_epi32 (__mmask16 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_loaddqusi512_mask ((const int *) __P, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_storeu_si512 (void *__P, __m512i __A) +{ *(__m512i_u *)__P = __A; } - -__funline void _mm512_mask_storeu_epi32(void *__P, __mmask16 __U, __m512i __A) { - __builtin_ia32_storedqusi512_mask((int *)__P, (__v16si)__A, (__mmask16)__U); +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_storeu_epi32 (void *__P, __m512i __A) +{ + *(__m512i_u *) __P = (__m512i_u) __A; } - -__funline __m512d _mm512_permutevar_pd(__m512d __A, __m512i __C) { - return (__m512d)__builtin_ia32_vpermilvarpd512_mask( - (__v8df)__A, (__v8di)__C, (__v8df)_mm512_undefined_pd(), (__mmask8)-1); +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_storeu_epi32 (void *__P, __mmask16 __U, __m512i __A) +{ + __builtin_ia32_storedqusi512_mask ((int *) __P, (__v16si) __A, + (__mmask16) __U); } - -__funline __m512d _mm512_mask_permutevar_pd(__m512d __W, __mmask8 __U, - __m512d __A, __m512i __C) { - return (__m512d)__builtin_ia32_vpermilvarpd512_mask( - (__v8df)__A, (__v8di)__C, (__v8df)__W, (__mmask8)__U); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_permutevar_pd (__m512d __A, __m512i __C) +{ + return (__m512d) __builtin_ia32_vpermilvarpd512_mask ((__v8df) __A, + (__v8di) __C, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1); } - -__funline __m512d _mm512_maskz_permutevar_pd(__mmask8 __U, __m512d __A, - __m512i __C) { - return (__m512d)__builtin_ia32_vpermilvarpd512_mask( - (__v8df)__A, (__v8di)__C, (__v8df)_mm512_setzero_pd(), (__mmask8)__U); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, 
__artificial__)) +_mm512_mask_permutevar_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512i __C) +{ + return (__m512d) __builtin_ia32_vpermilvarpd512_mask ((__v8df) __A, + (__v8di) __C, + (__v8df) __W, + (__mmask8) __U); } - -__funline __m512 _mm512_permutevar_ps(__m512 __A, __m512i __C) { - return (__m512)__builtin_ia32_vpermilvarps512_mask( - (__v16sf)__A, (__v16si)__C, (__v16sf)_mm512_undefined_ps(), - (__mmask16)-1); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_permutevar_pd (__mmask8 __U, __m512d __A, __m512i __C) +{ + return (__m512d) __builtin_ia32_vpermilvarpd512_mask ((__v8df) __A, + (__v8di) __C, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); } - -__funline __m512 _mm512_mask_permutevar_ps(__m512 __W, __mmask16 __U, __m512 __A, - __m512i __C) { - return (__m512)__builtin_ia32_vpermilvarps512_mask( - (__v16sf)__A, (__v16si)__C, (__v16sf)__W, (__mmask16)__U); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_permutevar_ps (__m512 __A, __m512i __C) +{ + return (__m512) __builtin_ia32_vpermilvarps512_mask ((__v16sf) __A, + (__v16si) __C, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1); } - -__funline __m512 _mm512_maskz_permutevar_ps(__mmask16 __U, __m512 __A, - __m512i __C) { - return (__m512)__builtin_ia32_vpermilvarps512_mask( - (__v16sf)__A, (__v16si)__C, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_permutevar_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512i __C) +{ + return (__m512) __builtin_ia32_vpermilvarps512_mask ((__v16sf) __A, + (__v16si) __C, + (__v16sf) __W, + (__mmask16) __U); } - -__funline __m512i _mm512_permutex2var_epi64(__m512i __A, __m512i __I, - __m512i __B) { - return (__m512i)__builtin_ia32_vpermt2varq512_mask((__v8di)__I - /* idx */, - (__v8di)__A, (__v8di)__B, - (__mmask8)-1); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_permutevar_ps (__mmask16 __U, __m512 __A, __m512i __C) +{ + return (__m512) __builtin_ia32_vpermilvarps512_mask ((__v16sf) __A, + (__v16si) __C, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); } - -__funline __m512i _mm512_mask_permutex2var_epi64(__m512i __A, __mmask8 __U, - __m512i __I, __m512i __B) { - return (__m512i)__builtin_ia32_vpermt2varq512_mask((__v8di)__I - /* idx */, - (__v8di)__A, (__v8di)__B, - (__mmask8)__U); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_permutex2var_epi64 (__m512i __A, __m512i __I, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpermt2varq512_mask ((__v8di) __I + , + (__v8di) __A, + (__v8di) __B, + (__mmask8) -1); } - -__funline __m512i _mm512_mask2_permutex2var_epi64(__m512i __A, __m512i __I, - __mmask8 __U, __m512i __B) { - return (__m512i)__builtin_ia32_vpermi2varq512_mask((__v8di)__A, - (__v8di)__I - /* idx */, - (__v8di)__B, - (__mmask8)__U); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_permutex2var_epi64 (__m512i __A, __mmask8 __U, __m512i __I, + __m512i __B) +{ + return (__m512i) __builtin_ia32_vpermt2varq512_mask ((__v8di) __I + , + (__v8di) __A, + (__v8di) __B, + (__mmask8) __U); } - -__funline __m512i _mm512_maskz_permutex2var_epi64(__mmask8 __U, __m512i __A, - __m512i __I, __m512i __B) { - return (__m512i)__builtin_ia32_vpermt2varq512_maskz((__v8di)__I - /* idx */, - 
(__v8di)__A, (__v8di)__B, - (__mmask8)__U); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask2_permutex2var_epi64 (__m512i __A, __m512i __I, + __mmask8 __U, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpermi2varq512_mask ((__v8di) __A, + (__v8di) __I + , + (__v8di) __B, + (__mmask8) __U); } - -__funline __m512i _mm512_permutex2var_epi32(__m512i __A, __m512i __I, - __m512i __B) { - return (__m512i)__builtin_ia32_vpermt2vard512_mask((__v16si)__I - /* idx */, - (__v16si)__A, (__v16si)__B, - (__mmask16)-1); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_permutex2var_epi64 (__mmask8 __U, __m512i __A, + __m512i __I, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpermt2varq512_maskz ((__v8di) __I + , + (__v8di) __A, + (__v8di) __B, + (__mmask8) __U); } - -__funline __m512i _mm512_mask_permutex2var_epi32(__m512i __A, __mmask16 __U, - __m512i __I, __m512i __B) { - return (__m512i)__builtin_ia32_vpermt2vard512_mask((__v16si)__I - /* idx */, - (__v16si)__A, (__v16si)__B, - (__mmask16)__U); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_permutex2var_epi32 (__m512i __A, __m512i __I, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpermt2vard512_mask ((__v16si) __I + , + (__v16si) __A, + (__v16si) __B, + (__mmask16) -1); } - -__funline __m512i _mm512_mask2_permutex2var_epi32(__m512i __A, __m512i __I, - __mmask16 __U, __m512i __B) { - return (__m512i)__builtin_ia32_vpermi2vard512_mask((__v16si)__A, - (__v16si)__I - /* idx */, - (__v16si)__B, - (__mmask16)__U); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_permutex2var_epi32 (__m512i __A, __mmask16 __U, + __m512i __I, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpermt2vard512_mask ((__v16si) __I + , + (__v16si) __A, + (__v16si) __B, + (__mmask16) __U); } - -__funline __m512i _mm512_maskz_permutex2var_epi32(__mmask16 __U, __m512i __A, - __m512i __I, __m512i __B) { - return (__m512i)__builtin_ia32_vpermt2vard512_maskz( - (__v16si)__I - /* idx */, - (__v16si)__A, (__v16si)__B, (__mmask16)__U); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask2_permutex2var_epi32 (__m512i __A, __m512i __I, + __mmask16 __U, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpermi2vard512_mask ((__v16si) __A, + (__v16si) __I + , + (__v16si) __B, + (__mmask16) __U); } - -__funline __m512d _mm512_permutex2var_pd(__m512d __A, __m512i __I, __m512d __B) { - return (__m512d)__builtin_ia32_vpermt2varpd512_mask((__v8di)__I - /* idx */, - (__v8df)__A, (__v8df)__B, - (__mmask8)-1); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_permutex2var_epi32 (__mmask16 __U, __m512i __A, + __m512i __I, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpermt2vard512_maskz ((__v16si) __I + , + (__v16si) __A, + (__v16si) __B, + (__mmask16) __U); } - -__funline __m512d _mm512_mask_permutex2var_pd(__m512d __A, __mmask8 __U, - __m512i __I, __m512d __B) { - return (__m512d)__builtin_ia32_vpermt2varpd512_mask((__v8di)__I - /* idx */, - (__v8df)__A, (__v8df)__B, - (__mmask8)__U); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_permutex2var_pd (__m512d __A, __m512i __I, __m512d __B) +{ + return (__m512d) __builtin_ia32_vpermt2varpd512_mask ((__v8di) __I + , + (__v8df) __A, + (__v8df) __B, + 
(__mmask8) -1); } - -__funline __m512d _mm512_mask2_permutex2var_pd(__m512d __A, __m512i __I, - __mmask8 __U, __m512d __B) { - return (__m512d)__builtin_ia32_vpermi2varpd512_mask((__v8df)__A, - (__v8di)__I - /* idx */, - (__v8df)__B, - (__mmask8)__U); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_permutex2var_pd (__m512d __A, __mmask8 __U, __m512i __I, + __m512d __B) +{ + return (__m512d) __builtin_ia32_vpermt2varpd512_mask ((__v8di) __I + , + (__v8df) __A, + (__v8df) __B, + (__mmask8) __U); } - -__funline __m512d _mm512_maskz_permutex2var_pd(__mmask8 __U, __m512d __A, - __m512i __I, __m512d __B) { - return (__m512d)__builtin_ia32_vpermt2varpd512_maskz((__v8di)__I - /* idx */, - (__v8df)__A, (__v8df)__B, - (__mmask8)__U); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask2_permutex2var_pd (__m512d __A, __m512i __I, __mmask8 __U, + __m512d __B) +{ + return (__m512d) __builtin_ia32_vpermi2varpd512_mask ((__v8df) __A, + (__v8di) __I + , + (__v8df) __B, + (__mmask8) __U); } - -__funline __m512 _mm512_permutex2var_ps(__m512 __A, __m512i __I, __m512 __B) { - return (__m512)__builtin_ia32_vpermt2varps512_mask((__v16si)__I - /* idx */, - (__v16sf)__A, (__v16sf)__B, - (__mmask16)-1); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_permutex2var_pd (__mmask8 __U, __m512d __A, __m512i __I, + __m512d __B) +{ + return (__m512d) __builtin_ia32_vpermt2varpd512_maskz ((__v8di) __I + , + (__v8df) __A, + (__v8df) __B, + (__mmask8) __U); } - -__funline __m512 _mm512_mask_permutex2var_ps(__m512 __A, __mmask16 __U, - __m512i __I, __m512 __B) { - return (__m512)__builtin_ia32_vpermt2varps512_mask((__v16si)__I - /* idx */, - (__v16sf)__A, (__v16sf)__B, - (__mmask16)__U); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_permutex2var_ps (__m512 __A, __m512i __I, __m512 __B) +{ + return (__m512) __builtin_ia32_vpermt2varps512_mask ((__v16si) __I + , + (__v16sf) __A, + (__v16sf) __B, + (__mmask16) -1); } - -__funline __m512 _mm512_mask2_permutex2var_ps(__m512 __A, __m512i __I, - __mmask16 __U, __m512 __B) { - return (__m512)__builtin_ia32_vpermi2varps512_mask((__v16sf)__A, - (__v16si)__I - /* idx */, - (__v16sf)__B, - (__mmask16)__U); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_permutex2var_ps (__m512 __A, __mmask16 __U, __m512i __I, __m512 __B) +{ + return (__m512) __builtin_ia32_vpermt2varps512_mask ((__v16si) __I + , + (__v16sf) __A, + (__v16sf) __B, + (__mmask16) __U); } - -__funline __m512 _mm512_maskz_permutex2var_ps(__mmask16 __U, __m512 __A, - __m512i __I, __m512 __B) { - return (__m512)__builtin_ia32_vpermt2varps512_maskz( - (__v16si)__I - /* idx */, - (__v16sf)__A, (__v16sf)__B, (__mmask16)__U); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask2_permutex2var_ps (__m512 __A, __m512i __I, __mmask16 __U, + __m512 __B) +{ + return (__m512) __builtin_ia32_vpermi2varps512_mask ((__v16sf) __A, + (__v16si) __I + , + (__v16sf) __B, + (__mmask16) __U); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_permutex2var_ps (__mmask16 __U, __m512 __A, __m512i __I, + __m512 __B) +{ + return (__m512) __builtin_ia32_vpermt2varps512_maskz ((__v16si) __I + , + (__v16sf) __A, + (__v16sf) __B, + (__mmask16) __U); } - #ifdef 
-__funline __m512d _mm512_permute_pd(__m512d __X, const int __C) {
-  return (__m512d)__builtin_ia32_vpermilpd512_mask(
-      (__v8df)__X, __C, (__v8df)_mm512_undefined_pd(), (__mmask8)-1);
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permute_pd (__m512d __X, const int __C)
+{
+  return (__m512d) __builtin_ia32_vpermilpd512_mask ((__v8df) __X, __C,
+                                                     (__v8df)
+                                                     _mm512_undefined_pd (),
+                                                     (__mmask8) -1);
 }
-
-__funline __m512d _mm512_mask_permute_pd(__m512d __W, __mmask8 __U, __m512d __X,
-                                         const int __C) {
-  return (__m512d)__builtin_ia32_vpermilpd512_mask((__v8df)__X, __C,
-                                                   (__v8df)__W, (__mmask8)__U);
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_permute_pd (__m512d __W, __mmask8 __U, __m512d __X, const int __C)
+{
+  return (__m512d) __builtin_ia32_vpermilpd512_mask ((__v8df) __X, __C,
+                                                     (__v8df) __W,
+                                                     (__mmask8) __U);
 }
-
-__funline __m512d _mm512_maskz_permute_pd(__mmask8 __U, __m512d __X,
-                                          const int __C) {
-  return (__m512d)__builtin_ia32_vpermilpd512_mask(
-      (__v8df)__X, __C, (__v8df)_mm512_setzero_pd(), (__mmask8)__U);
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_permute_pd (__mmask8 __U, __m512d __X, const int __C)
+{
+  return (__m512d) __builtin_ia32_vpermilpd512_mask ((__v8df) __X, __C,
+                                                     (__v8df)
+                                                     _mm512_setzero_pd (),
+                                                     (__mmask8) __U);
 }
-
-__funline __m512 _mm512_permute_ps(__m512 __X, const int __C) {
-  return (__m512)__builtin_ia32_vpermilps512_mask(
-      (__v16sf)__X, __C, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1);
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permute_ps (__m512 __X, const int __C)
+{
+  return (__m512) __builtin_ia32_vpermilps512_mask ((__v16sf) __X, __C,
+                                                    (__v16sf)
+                                                    _mm512_undefined_ps (),
+                                                    (__mmask16) -1);
 }
-
-__funline __m512 _mm512_mask_permute_ps(__m512 __W, __mmask16 __U, __m512 __X,
-                                        const int __C) {
-  return (__m512)__builtin_ia32_vpermilps512_mask((__v16sf)__X, __C,
-                                                  (__v16sf)__W, (__mmask16)__U);
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_permute_ps (__m512 __W, __mmask16 __U, __m512 __X, const int __C)
+{
+  return (__m512) __builtin_ia32_vpermilps512_mask ((__v16sf) __X, __C,
+                                                    (__v16sf) __W,
+                                                    (__mmask16) __U);
 }
-
-__funline __m512 _mm512_maskz_permute_ps(__mmask16 __U, __m512 __X,
-                                         const int __C) {
-  return (__m512)__builtin_ia32_vpermilps512_mask(
-      (__v16sf)__X, __C, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U);
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_permute_ps (__mmask16 __U, __m512 __X, const int __C)
+{
+  return (__m512) __builtin_ia32_vpermilps512_mask ((__v16sf) __X, __C,
+                                                    (__v16sf)
+                                                    _mm512_setzero_ps (),
+                                                    (__mmask16) __U);
 }
 #else
-#define _mm512_permute_pd(X, C) \
-  ((__m512d)__builtin_ia32_vpermilpd512_mask( \
-      (__v8df)(__m512d)(X), (int)(C), (__v8df)(__m512d)_mm512_undefined_pd(), \
-      (__mmask8)(-1)))
-
-#define _mm512_mask_permute_pd(W, U, X, C) \
-  ((__m512d)__builtin_ia32_vpermilpd512_mask( \
-      (__v8df)(__m512d)(X), (int)(C), (__v8df)(__m512d)(W), (__mmask8)(U)))
-
-#define _mm512_maskz_permute_pd(U, X, C) \
-  ((__m512d)__builtin_ia32_vpermilpd512_mask( \
-      (__v8df)(__m512d)(X), (int)(C), (__v8df)(__m512d)_mm512_setzero_pd(), \
-      (__mmask8)(U)))
-
-#define _mm512_permute_ps(X, C) \
-  ((__m512)__builtin_ia32_vpermilps512_mask( \
-      (__v16sf)(__m512)(X), (int)(C), (__v16sf)(__m512)_mm512_undefined_ps(), \
-      (__mmask16)(-1)))
-
-#define _mm512_mask_permute_ps(W, U, X, C) \
-  ((__m512)__builtin_ia32_vpermilps512_mask( \
-      (__v16sf)(__m512)(X), (int)(C), (__v16sf)(__m512)(W), (__mmask16)(U)))
-
-#define _mm512_maskz_permute_ps(U, X, C) \
-  ((__m512)__builtin_ia32_vpermilps512_mask( \
-      (__v16sf)(__m512)(X), (int)(C), (__v16sf)(__m512)_mm512_setzero_ps(), \
-      (__mmask16)(U)))
+#define _mm512_permute_pd(X, C) ((__m512d) __builtin_ia32_vpermilpd512_mask ((__v8df)(__m512d)(X), (int)(C), (__v8df)(__m512d)_mm512_undefined_pd(), (__mmask8)(-1)))
+#define _mm512_mask_permute_pd(W, U, X, C) ((__m512d) __builtin_ia32_vpermilpd512_mask ((__v8df)(__m512d)(X), (int)(C), (__v8df)(__m512d)(W), (__mmask8)(U)))
+#define _mm512_maskz_permute_pd(U, X, C) ((__m512d) __builtin_ia32_vpermilpd512_mask ((__v8df)(__m512d)(X), (int)(C), (__v8df)(__m512d)_mm512_setzero_pd(), (__mmask8)(U)))
+#define _mm512_permute_ps(X, C) ((__m512) __builtin_ia32_vpermilps512_mask ((__v16sf)(__m512)(X), (int)(C), (__v16sf)(__m512)_mm512_undefined_ps(), (__mmask16)(-1)))
+#define _mm512_mask_permute_ps(W, U, X, C) ((__m512) __builtin_ia32_vpermilps512_mask ((__v16sf)(__m512)(X), (int)(C), (__v16sf)(__m512)(W), (__mmask16)(U)))
+#define _mm512_maskz_permute_ps(U, X, C) ((__m512) __builtin_ia32_vpermilps512_mask ((__v16sf)(__m512)(X), (int)(C), (__v16sf)(__m512)_mm512_setzero_ps(), (__mmask16)(U)))
 #endif
-
 #ifdef __OPTIMIZE__
-__funline __m512i _mm512_permutex_epi64(__m512i __X, const int __I) {
-  return (__m512i)__builtin_ia32_permdi512_mask(
-      (__v8di)__X, __I, (__v8di)_mm512_undefined_epi32(), (__mmask8)(-1));
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permutex_epi64 (__m512i __X, const int __I)
+{
+  return (__m512i) __builtin_ia32_permdi512_mask ((__v8di) __X, __I,
+                                                  (__v8di)
+                                                  _mm512_undefined_epi32 (),
+                                                  (__mmask8) (-1));
 }
-
-__funline __m512i _mm512_mask_permutex_epi64(__m512i __W, __mmask8 __M,
-                                             __m512i __X, const int __I) {
-  return (__m512i)__builtin_ia32_permdi512_mask((__v8di)__X, __I, (__v8di)__W,
-                                                (__mmask8)__M);
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_permutex_epi64 (__m512i __W, __mmask8 __M,
+                            __m512i __X, const int __I)
+{
+  return (__m512i) __builtin_ia32_permdi512_mask ((__v8di) __X, __I,
+                                                  (__v8di) __W,
+                                                  (__mmask8) __M);
 }
-
-__funline __m512i _mm512_maskz_permutex_epi64(__mmask8 __M, __m512i __X,
-                                              const int __I) {
-  return (__m512i)__builtin_ia32_permdi512_mask(
-      (__v8di)__X, __I, (__v8di)_mm512_setzero_si512(), (__mmask8)__M);
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_permutex_epi64 (__mmask8 __M, __m512i __X, const int __I)
+{
+  return (__m512i) __builtin_ia32_permdi512_mask ((__v8di) __X, __I,
+                                                  (__v8di)
+                                                  _mm512_setzero_si512 (),
+                                                  (__mmask8) __M);
 }
-
-__funline __m512d _mm512_permutex_pd(__m512d __X, const int __M) {
-  return (__m512d)__builtin_ia32_permdf512_mask(
-      (__v8df)__X, __M, (__v8df)_mm512_undefined_pd(), (__mmask8)-1);
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permutex_pd (__m512d __X, const int __M)
+{
+  return (__m512d) __builtin_ia32_permdf512_mask ((__v8df) __X, __M,
+                                                  (__v8df)
+                                                  _mm512_undefined_pd (),
+                                                  (__mmask8) -1);
 }
-
-__funline __m512d _mm512_mask_permutex_pd(__m512d __W, __mmask8 __U, __m512d __X,
-                                          const int __M) {
-  return (__m512d)__builtin_ia32_permdf512_mask((__v8df)__X, __M, (__v8df)__W,
-                                                (__mmask8)__U);
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_permutex_pd (__m512d __W, __mmask8 __U, __m512d __X, const int __M)
+{
+  return (__m512d) __builtin_ia32_permdf512_mask ((__v8df) __X, __M,
+                                                  (__v8df) __W,
+                                                  (__mmask8) __U);
 }
-
-__funline __m512d _mm512_maskz_permutex_pd(__mmask8 __U, __m512d __X,
-                                           const int __M) {
-  return (__m512d)__builtin_ia32_permdf512_mask(
-      (__v8df)__X, __M, (__v8df)_mm512_setzero_pd(), (__mmask8)__U);
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_permutex_pd (__mmask8 __U, __m512d __X, const int __M)
+{
+  return (__m512d) __builtin_ia32_permdf512_mask ((__v8df) __X, __M,
+                                                  (__v8df)
+                                                  _mm512_setzero_pd (),
+                                                  (__mmask8) __U);
 }
 #else
-#define _mm512_permutex_pd(X, M) \
-  ((__m512d)__builtin_ia32_permdf512_mask( \
-      (__v8df)(__m512d)(X), (int)(M), (__v8df)(__m512d)_mm512_undefined_pd(), \
-      (__mmask8)-1))
-
-#define _mm512_mask_permutex_pd(W, U, X, M) \
-  ((__m512d)__builtin_ia32_permdf512_mask( \
-      (__v8df)(__m512d)(X), (int)(M), (__v8df)(__m512d)(W), (__mmask8)(U)))
-
-#define _mm512_maskz_permutex_pd(U, X, M) \
-  ((__m512d)__builtin_ia32_permdf512_mask( \
-      (__v8df)(__m512d)(X), (int)(M), (__v8df)(__m512d)_mm512_setzero_pd(), \
-      (__mmask8)(U)))
-
-#define _mm512_permutex_epi64(X, I) \
-  ((__m512i)__builtin_ia32_permdi512_mask( \
-      (__v8di)(__m512i)(X), (int)(I), \
-      (__v8di)(__m512i)(_mm512_undefined_epi32()), (__mmask8)(-1)))
-
-#define _mm512_maskz_permutex_epi64(M, X, I) \
-  ((__m512i)__builtin_ia32_permdi512_mask( \
-      (__v8di)(__m512i)(X), (int)(I), \
-      (__v8di)(__m512i)(_mm512_setzero_si512()), (__mmask8)(M)))
-
-#define _mm512_mask_permutex_epi64(W, M, X, I) \
-  ((__m512i)__builtin_ia32_permdi512_mask( \
-      (__v8di)(__m512i)(X), (int)(I), (__v8di)(__m512i)(W), (__mmask8)(M)))
+#define _mm512_permutex_pd(X, M) ((__m512d) __builtin_ia32_permdf512_mask ((__v8df)(__m512d)(X), (int)(M), (__v8df)(__m512d)_mm512_undefined_pd(), (__mmask8)-1))
+#define _mm512_mask_permutex_pd(W, U, X, M) ((__m512d) __builtin_ia32_permdf512_mask ((__v8df)(__m512d)(X), (int)(M), (__v8df)(__m512d)(W), (__mmask8)(U)))
+#define _mm512_maskz_permutex_pd(U, X, M) ((__m512d) __builtin_ia32_permdf512_mask ((__v8df)(__m512d)(X), (int)(M), (__v8df)(__m512d)_mm512_setzero_pd(), (__mmask8)(U)))
+#define _mm512_permutex_epi64(X, I) ((__m512i) __builtin_ia32_permdi512_mask ((__v8di)(__m512i)(X), (int)(I), (__v8di)(__m512i) (_mm512_undefined_epi32 ()), (__mmask8)(-1)))
+#define _mm512_maskz_permutex_epi64(M, X, I) ((__m512i) __builtin_ia32_permdi512_mask ((__v8di)(__m512i)(X), (int)(I), (__v8di)(__m512i) (_mm512_setzero_si512 ()), (__mmask8)(M)))
+#define _mm512_mask_permutex_epi64(W, M, X, I) ((__m512i) __builtin_ia32_permdi512_mask ((__v8di)(__m512i)(X), (int)(I), (__v8di)(__m512i)(W), (__mmask8)(M)))
 #endif
-
-__funline __m512i _mm512_maskz_permutexvar_epi64(__mmask8 __M, __m512i __X,
-                                                 __m512i __Y) {
-  return (__m512i)__builtin_ia32_permvardi512_mask(
-      (__v8di)__Y, (__v8di)__X, (__v8di)_mm512_setzero_si512(), __M);
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_permutexvar_epi64 (__mmask8 __M, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_permvardi512_mask ((__v8di) __Y,
+                                                     (__v8di) __X,
+                                                     (__v8di)
+                                                     _mm512_setzero_si512 (),
+                                                     __M);
 }
-
-__funline __m512i _mm512_permutexvar_epi64(__m512i __X, __m512i __Y) {
-  return (__m512i)__builtin_ia32_permvardi512_mask(
-      (__v8di)__Y, (__v8di)__X, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1);
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permutexvar_epi64 (__m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_permvardi512_mask ((__v8di) __Y,
+                                                     (__v8di) __X,
+                                                     (__v8di)
+                                                     _mm512_undefined_epi32 (),
+                                                     (__mmask8) -1);
 }
-
-__funline __m512i _mm512_mask_permutexvar_epi64(__m512i __W, __mmask8 __M,
-                                                __m512i __X, __m512i __Y) {
-  return (__m512i)__builtin_ia32_permvardi512_mask((__v8di)__Y, (__v8di)__X,
-                                                   (__v8di)__W, __M);
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_permutexvar_epi64 (__m512i __W, __mmask8 __M, __m512i __X,
+                               __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_permvardi512_mask ((__v8di) __Y,
+                                                     (__v8di) __X,
+                                                     (__v8di) __W,
+                                                     __M);
 }
-
-__funline __m512i _mm512_maskz_permutexvar_epi32(__mmask16 __M, __m512i __X,
-                                                 __m512i __Y) {
-  return (__m512i)__builtin_ia32_permvarsi512_mask(
-      (__v16si)__Y, (__v16si)__X, (__v16si)_mm512_setzero_si512(), __M);
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_permutexvar_epi32 (__mmask16 __M, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_permvarsi512_mask ((__v16si) __Y,
+                                                     (__v16si) __X,
+                                                     (__v16si)
+                                                     _mm512_setzero_si512 (),
+                                                     __M);
 }
-
-__funline __m512i _mm512_permutexvar_epi32(__m512i __X, __m512i __Y) {
-  return (__m512i)__builtin_ia32_permvarsi512_mask(
-      (__v16si)__Y, (__v16si)__X, (__v16si)_mm512_undefined_epi32(),
-      (__mmask16)-1);
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permutexvar_epi32 (__m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_permvarsi512_mask ((__v16si) __Y,
+                                                     (__v16si) __X,
+                                                     (__v16si)
+                                                     _mm512_undefined_epi32 (),
+                                                     (__mmask16) -1);
 }
-
-__funline __m512i _mm512_mask_permutexvar_epi32(__m512i __W, __mmask16 __M,
-                                                __m512i __X, __m512i __Y) {
-  return (__m512i)__builtin_ia32_permvarsi512_mask((__v16si)__Y, (__v16si)__X,
-                                                   (__v16si)__W, __M);
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_permutexvar_epi32 (__m512i __W, __mmask16 __M, __m512i __X,
+                               __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_permvarsi512_mask ((__v16si) __Y,
+                                                     (__v16si) __X,
+                                                     (__v16si) __W,
+                                                     __M);
 }
-
-__funline __m512d _mm512_permutexvar_pd(__m512i __X, __m512d __Y) {
-  return (__m512d)__builtin_ia32_permvardf512_mask(
-      (__v8df)__Y, (__v8di)__X, (__v8df)_mm512_undefined_pd(), (__mmask8)-1);
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permutexvar_pd (__m512i __X, __m512d __Y)
+{
+  return (__m512d) __builtin_ia32_permvardf512_mask ((__v8df) __Y,
+                                                     (__v8di) __X,
+                                                     (__v8df)
+                                                     _mm512_undefined_pd (),
+                                                     (__mmask8) -1);
 }
-
-__funline __m512d _mm512_mask_permutexvar_pd(__m512d __W, __mmask8 __U,
-                                             __m512i __X, __m512d __Y) {
-  return (__m512d)__builtin_ia32_permvardf512_mask((__v8df)__Y, (__v8di)__X,
-                                                   (__v8df)__W, (__mmask8)__U);
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_permutexvar_pd (__m512d __W, __mmask8 __U, __m512i __X, __m512d __Y)
+{
+  return (__m512d) __builtin_ia32_permvardf512_mask ((__v8df) __Y,
+                                                     (__v8di) __X,
+                                                     (__v8df) __W,
+                                                     (__mmask8) __U);
 }
-
-__funline __m512d _mm512_maskz_permutexvar_pd(__mmask8 __U, __m512i __X,
-                                              __m512d __Y) {
-  return (__m512d)__builtin_ia32_permvardf512_mask(
-      (__v8df)__Y, (__v8di)__X, (__v8df)_mm512_setzero_pd(), (__mmask8)__U);
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_permutexvar_pd (__mmask8 __U, __m512i __X, __m512d __Y)
+{
+  return (__m512d) __builtin_ia32_permvardf512_mask ((__v8df) __Y,
+                                                     (__v8di) __X,
+                                                     (__v8df)
+                                                     _mm512_setzero_pd (),
+                                                     (__mmask8) __U);
 }
-
-__funline __m512 _mm512_permutexvar_ps(__m512i __X, __m512 __Y) {
-  return (__m512)__builtin_ia32_permvarsf512_mask(
-      (__v16sf)__Y, (__v16si)__X, (__v16sf)_mm512_undefined_ps(),
-      (__mmask16)-1);
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permutexvar_ps (__m512i __X, __m512 __Y)
+{
+  return (__m512) __builtin_ia32_permvarsf512_mask ((__v16sf) __Y,
+                                                    (__v16si) __X,
+                                                    (__v16sf)
+                                                    _mm512_undefined_ps (),
+                                                    (__mmask16) -1);
 }
-
-__funline __m512 _mm512_mask_permutexvar_ps(__m512 __W, __mmask16 __U,
-                                            __m512i __X, __m512 __Y) {
-  return (__m512)__builtin_ia32_permvarsf512_mask((__v16sf)__Y, (__v16si)__X,
-                                                  (__v16sf)__W, (__mmask16)__U);
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_permutexvar_ps (__m512 __W, __mmask16 __U, __m512i __X, __m512 __Y)
+{
+  return (__m512) __builtin_ia32_permvarsf512_mask ((__v16sf) __Y,
+                                                    (__v16si) __X,
+                                                    (__v16sf) __W,
+                                                    (__mmask16) __U);
 }
-
-__funline __m512 _mm512_maskz_permutexvar_ps(__mmask16 __U, __m512i __X,
-                                             __m512 __Y) {
-  return (__m512)__builtin_ia32_permvarsf512_mask(
-      (__v16sf)__Y, (__v16si)__X, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U);
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_permutexvar_ps (__mmask16 __U, __m512i __X, __m512 __Y)
+{
+  return (__m512) __builtin_ia32_permvarsf512_mask ((__v16sf) __Y,
+                                                    (__v16si) __X,
+                                                    (__v16sf)
+                                                    _mm512_setzero_ps (),
+                                                    (__mmask16) __U);
 }
-
 #ifdef __OPTIMIZE__
-__funline __m512 _mm512_shuffle_ps(__m512 __M, __m512 __V, const int __imm) {
-  return (__m512)__builtin_ia32_shufps512_mask(
-      (__v16sf)__M, (__v16sf)__V, __imm, (__v16sf)_mm512_undefined_ps(),
-      (__mmask16)-1);
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_shuffle_ps (__m512 __M, __m512 __V, const int __imm)
+{
+  return (__m512) __builtin_ia32_shufps512_mask ((__v16sf) __M,
+                                                 (__v16sf) __V, __imm,
+                                                 (__v16sf)
+                                                 _mm512_undefined_ps (),
+                                                 (__mmask16) -1);
 }
-
-__funline __m512 _mm512_mask_shuffle_ps(__m512 __W, __mmask16 __U, __m512 __M,
-                                        __m512 __V, const int __imm) {
-  return (__m512)__builtin_ia32_shufps512_mask(
-      (__v16sf)__M, (__v16sf)__V, __imm, (__v16sf)__W, (__mmask16)__U);
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_shuffle_ps (__m512 __W, __mmask16 __U, __m512 __M,
+                        __m512 __V, const int __imm)
+{
+  return (__m512) __builtin_ia32_shufps512_mask ((__v16sf) __M,
+                                                 (__v16sf) __V, __imm,
+                                                 (__v16sf) __W,
+                                                 (__mmask16) __U);
 }
-
-__funline __m512 _mm512_maskz_shuffle_ps(__mmask16 __U, __m512 __M, __m512 __V,
-                                         const int __imm) {
-  return (__m512)__builtin_ia32_shufps512_mask(
-      (__v16sf)__M, (__v16sf)__V, __imm, (__v16sf)_mm512_setzero_ps(),
-      (__mmask16)__U);
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_shuffle_ps (__mmask16 __U, __m512 __M, __m512 __V, const int __imm)
+{
+  return (__m512) __builtin_ia32_shufps512_mask ((__v16sf) __M,
+                                                 (__v16sf) __V, __imm,
+                                                 (__v16sf)
+                                                 _mm512_setzero_ps (),
+                                                 (__mmask16) __U);
 }
-
-__funline __m512d _mm512_shuffle_pd(__m512d __M, __m512d __V, const int __imm) {
-  return (__m512d)__builtin_ia32_shufpd512_mask((__v8df)__M, (__v8df)__V, __imm,
-                                                (__v8df)_mm512_undefined_pd(),
-                                                (__mmask8)-1);
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_shuffle_pd (__m512d __M, __m512d __V, const int __imm)
+{
+  return (__m512d) __builtin_ia32_shufpd512_mask ((__v8df) __M,
+                                                  (__v8df) __V, __imm,
+                                                  (__v8df)
+                                                  _mm512_undefined_pd (),
+                                                  (__mmask8) -1);
 }
-
-__funline __m512d _mm512_mask_shuffle_pd(__m512d __W, __mmask8 __U, __m512d __M,
-                                         __m512d __V, const int __imm) {
-  return (__m512d)__builtin_ia32_shufpd512_mask((__v8df)__M, (__v8df)__V, __imm,
-                                                (__v8df)__W, (__mmask8)__U);
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_shuffle_pd (__m512d __W, __mmask8 __U, __m512d __M,
+                        __m512d __V, const int __imm)
+{
+  return (__m512d) __builtin_ia32_shufpd512_mask ((__v8df) __M,
+                                                  (__v8df) __V, __imm,
+                                                  (__v8df) __W,
+                                                  (__mmask8) __U);
 }
-
-__funline __m512d _mm512_maskz_shuffle_pd(__mmask8 __U, __m512d __M, __m512d __V,
-                                          const int __imm) {
-  return (__m512d)__builtin_ia32_shufpd512_mask((__v8df)__M, (__v8df)__V, __imm,
-                                                (__v8df)_mm512_setzero_pd(),
-                                                (__mmask8)__U);
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_shuffle_pd (__mmask8 __U, __m512d __M, __m512d __V,
+                         const int __imm)
+{
+  return (__m512d) __builtin_ia32_shufpd512_mask ((__v8df) __M,
+                                                  (__v8df) __V, __imm,
+                                                  (__v8df)
+                                                  _mm512_setzero_pd (),
+                                                  (__mmask8) __U);
 }
-
-__funline __m512d _mm512_fixupimm_round_pd(__m512d __A, __m512d __B, __m512i __C,
-                                           const int __imm, const int __R) {
-  return (__m512d)__builtin_ia32_fixupimmpd512_mask(
-      (__v8df)__A, (__v8df)__B, (__v8di)__C, __imm, (__mmask8)-1, __R);
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fixupimm_round_pd (__m512d __A, __m512d __B, __m512i __C,
+                          const int __imm, const int __R)
+{
+  return (__m512d) __builtin_ia32_fixupimmpd512_mask ((__v8df) __A,
+                                                      (__v8df) __B,
+                                                      (__v8di) __C,
+                                                      __imm,
+                                                      (__mmask8) -1, __R);
 }
-
-__funline __m512d _mm512_mask_fixupimm_round_pd(__m512d __A, __mmask8 __U,
-                                                __m512d __B, __m512i __C,
-                                                const int __imm, const int __R) {
-  return (__m512d)__builtin_ia32_fixupimmpd512_mask(
-      (__v8df)__A, (__v8df)__B, (__v8di)__C, __imm, (__mmask8)__U, __R);
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fixupimm_round_pd (__m512d __A, __mmask8 __U, __m512d __B,
+                               __m512i __C, const int __imm, const int __R)
+{
+  return (__m512d) __builtin_ia32_fixupimmpd512_mask ((__v8df) __A,
+                                                      (__v8df) __B,
+                                                      (__v8di) __C,
+                                                      __imm,
+                                                      (__mmask8) __U, __R);
 }
-
-__funline __m512d _mm512_maskz_fixupimm_round_pd(__mmask8 __U, __m512d __A,
-                                                 __m512d __B, __m512i __C,
-                                                 const int __imm, const int __R) {
-  return (__m512d)__builtin_ia32_fixupimmpd512_maskz(
-      (__v8df)__A, (__v8df)__B, (__v8di)__C, __imm, (__mmask8)__U, __R);
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fixupimm_round_pd (__mmask8 __U, __m512d __A, __m512d __B,
+                                __m512i __C, const int __imm, const int __R)
+{
+  return (__m512d) __builtin_ia32_fixupimmpd512_maskz ((__v8df) __A,
+                                                       (__v8df) __B,
+                                                       (__v8di) __C,
+                                                       __imm,
+                                                       (__mmask8) __U, __R);
 }
-
-__funline __m512 _mm512_fixupimm_round_ps(__m512 __A, __m512 __B, __m512i __C,
-                                          const int __imm, const int __R) {
-  return (__m512)__builtin_ia32_fixupimmps512_mask(
-      (__v16sf)__A, (__v16sf)__B, (__v16si)__C, __imm, (__mmask16)-1, __R);
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fixupimm_round_ps (__m512 __A, __m512 __B, __m512i __C,
+                          const int __imm, const int __R)
+{
+  return (__m512) __builtin_ia32_fixupimmps512_mask ((__v16sf) __A,
+                                                     (__v16sf) __B,
+                                                     (__v16si) __C,
+                                                     __imm,
+                                                     (__mmask16) -1, __R);
 }
-
-__funline __m512 _mm512_mask_fixupimm_round_ps(__m512 __A, __mmask16 __U,
-                                               __m512 __B, __m512i __C,
-                                               const int __imm, const int __R) {
-  return (__m512)__builtin_ia32_fixupimmps512_mask(
-      (__v16sf)__A, (__v16sf)__B, (__v16si)__C, __imm, (__mmask16)__U, __R);
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fixupimm_round_ps (__m512 __A, __mmask16 __U, __m512 __B,
+                               __m512i __C, const int __imm, const int __R)
+{
+  return (__m512) __builtin_ia32_fixupimmps512_mask ((__v16sf) __A,
+                                                     (__v16sf) __B,
+                                                     (__v16si) __C,
+                                                     __imm,
+                                                     (__mmask16) __U, __R);
 }
-
-__funline __m512 _mm512_maskz_fixupimm_round_ps(__mmask16 __U, __m512 __A,
-                                                __m512 __B, __m512i __C,
-                                                const int __imm, const int __R) {
-  return (__m512)__builtin_ia32_fixupimmps512_maskz(
-      (__v16sf)__A, (__v16sf)__B, (__v16si)__C, __imm, (__mmask16)__U, __R);
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fixupimm_round_ps (__mmask16 __U, __m512 __A, __m512 __B,
+                                __m512i __C, const int __imm, const int __R)
+{
+  return (__m512) __builtin_ia32_fixupimmps512_maskz ((__v16sf) __A,
+                                                      (__v16sf) __B,
+                                                      (__v16si) __C,
+                                                      __imm,
+                                                      (__mmask16) __U, __R);
 }
-
-__funline __m128d _mm_fixupimm_round_sd(__m128d __A, __m128d __B, __m128i __C,
-                                        const int __imm, const int __R) {
-  return (__m128d)__builtin_ia32_fixupimmsd_mask(
-      (__v2df)__A, (__v2df)__B, (__v2di)__C, __imm, (__mmask8)-1, __R);
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fixupimm_round_sd (__m128d __A, __m128d __B, __m128i __C,
+                       const int __imm, const int __R)
+{
+  return (__m128d) __builtin_ia32_fixupimmsd_mask ((__v2df) __A,
+                                                   (__v2df) __B,
+                                                   (__v2di) __C, __imm,
+                                                   (__mmask8) -1, __R);
 }
-
-__funline __m128d _mm_mask_fixupimm_round_sd(__m128d __A, __mmask8 __U,
-                                             __m128d __B, __m128i __C,
-                                             const int __imm, const int __R) {
-  return (__m128d)__builtin_ia32_fixupimmsd_mask(
-      (__v2df)__A, (__v2df)__B, (__v2di)__C, __imm, (__mmask8)__U, __R);
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fixupimm_round_sd (__m128d __A, __mmask8 __U, __m128d __B,
+                            __m128i __C, const int __imm, const int __R)
+{
+  return (__m128d) __builtin_ia32_fixupimmsd_mask ((__v2df) __A,
+                                                   (__v2df) __B,
+                                                   (__v2di) __C, __imm,
+                                                   (__mmask8) __U, __R);
 }
-
-__funline __m128d _mm_maskz_fixupimm_round_sd(__mmask8 __U, __m128d __A,
-                                              __m128d __B, __m128i __C,
-                                              const int __imm, const int __R) {
-  return (__m128d)__builtin_ia32_fixupimmsd_maskz(
-      (__v2df)__A, (__v2df)__B, (__v2di)__C, __imm, (__mmask8)__U, __R);
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fixupimm_round_sd (__mmask8 __U, __m128d __A, __m128d __B,
+                             __m128i __C, const int __imm, const int __R)
+{
+  return (__m128d) __builtin_ia32_fixupimmsd_maskz ((__v2df) __A,
+                                                    (__v2df) __B,
+                                                    (__v2di) __C,
+                                                    __imm,
+                                                    (__mmask8) __U, __R);
 }
-
-__funline __m128 _mm_fixupimm_round_ss(__m128 __A, __m128 __B, __m128i __C,
-                                       const int __imm, const int __R) {
-  return (__m128)__builtin_ia32_fixupimmss_mask(
-      (__v4sf)__A, (__v4sf)__B, (__v4si)__C, __imm, (__mmask8)-1, __R);
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fixupimm_round_ss (__m128 __A, __m128 __B, __m128i __C,
+                       const int __imm, const int __R)
+{
+  return (__m128) __builtin_ia32_fixupimmss_mask ((__v4sf) __A,
+                                                  (__v4sf) __B,
+                                                  (__v4si) __C, __imm,
+                                                  (__mmask8) -1, __R);
 }
-
-__funline __m128 _mm_mask_fixupimm_round_ss(__m128 __A, __mmask8 __U, __m128 __B,
-                                            __m128i __C, const int __imm,
-                                            const int __R) {
-  return (__m128)__builtin_ia32_fixupimmss_mask(
-      (__v4sf)__A, (__v4sf)__B, (__v4si)__C, __imm, (__mmask8)__U, __R);
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fixupimm_round_ss (__m128 __A, __mmask8 __U, __m128 __B,
+                            __m128i __C, const int __imm, const int __R)
+{
+  return (__m128) __builtin_ia32_fixupimmss_mask ((__v4sf) __A,
+                                                  (__v4sf) __B,
+                                                  (__v4si) __C, __imm,
+                                                  (__mmask8) __U, __R);
 }
-
-__funline __m128 _mm_maskz_fixupimm_round_ss(__mmask8 __U, __m128 __A, __m128 __B,
-                                             __m128i __C, const int __imm,
-                                             const int __R) {
-  return (__m128)__builtin_ia32_fixupimmss_maskz(
-      (__v4sf)__A, (__v4sf)__B, (__v4si)__C, __imm, (__mmask8)__U, __R);
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fixupimm_round_ss (__mmask8 __U, __m128 __A, __m128 __B,
+                             __m128i __C, const int __imm, const int __R)
+{
+  return (__m128) __builtin_ia32_fixupimmss_maskz ((__v4sf) __A,
+                                                   (__v4sf) __B,
+                                                   (__v4si) __C, __imm,
+                                                   (__mmask8) __U, __R);
 }
-
 #else
-#define _mm512_shuffle_pd(X, Y, C) \
-  ((__m512d)__builtin_ia32_shufpd512_mask( \
-      (__v8df)(__m512d)(X), (__v8df)(__m512d)(Y), (int)(C), \
-      (__v8df)(__m512d)_mm512_undefined_pd(), (__mmask8)-1))
-
-#define _mm512_mask_shuffle_pd(W, U, X, Y, C) \
-  ((__m512d)__builtin_ia32_shufpd512_mask( \
-      (__v8df)(__m512d)(X), (__v8df)(__m512d)(Y), (int)(C), \
-      (__v8df)(__m512d)(W), (__mmask8)(U)))
-
-#define _mm512_maskz_shuffle_pd(U, X, Y, C) \
-  ((__m512d)__builtin_ia32_shufpd512_mask( \
-      (__v8df)(__m512d)(X), (__v8df)(__m512d)(Y), (int)(C), \
-      (__v8df)(__m512d)_mm512_setzero_pd(), (__mmask8)(U)))
-
-#define _mm512_shuffle_ps(X, Y, C) \
-  ((__m512)__builtin_ia32_shufps512_mask( \
-      (__v16sf)(__m512)(X), (__v16sf)(__m512)(Y), (int)(C), \
-      (__v16sf)(__m512)_mm512_undefined_ps(), (__mmask16)-1))
-
-#define _mm512_mask_shuffle_ps(W, U, X, Y, C) \
-  ((__m512)__builtin_ia32_shufps512_mask( \
-      (__v16sf)(__m512)(X), (__v16sf)(__m512)(Y), (int)(C), \
-      (__v16sf)(__m512)(W), (__mmask16)(U)))
-
-#define _mm512_maskz_shuffle_ps(U, X, Y, C) \
-  ((__m512)__builtin_ia32_shufps512_mask( \
-      (__v16sf)(__m512)(X), (__v16sf)(__m512)(Y), (int)(C), \
-      (__v16sf)(__m512)_mm512_setzero_ps(), (__mmask16)(U)))
-
-#define _mm512_fixupimm_round_pd(X, Y, Z, C, R) \
-  ((__m512d)__builtin_ia32_fixupimmpd512_mask( \
-      (__v8df)(__m512d)(X), (__v8df)(__m512d)(Y), (__v8di)(__m512i)(Z), \
-      (int)(C), (__mmask8)(-1), (R)))
-
-#define _mm512_mask_fixupimm_round_pd(X, U, Y, Z, C, R) \
-  ((__m512d)__builtin_ia32_fixupimmpd512_mask( \
-      (__v8df)(__m512d)(X), (__v8df)(__m512d)(Y), (__v8di)(__m512i)(Z), \
-      (int)(C), (__mmask8)(U), (R)))
-
-#define _mm512_maskz_fixupimm_round_pd(U, X, Y, Z, C, R) \
-  ((__m512d)__builtin_ia32_fixupimmpd512_maskz( \
-      (__v8df)(__m512d)(X), (__v8df)(__m512d)(Y), (__v8di)(__m512i)(Z), \
-      (int)(C), (__mmask8)(U), (R)))
-
-#define _mm512_fixupimm_round_ps(X, Y, Z, C, R) \
-  ((__m512)__builtin_ia32_fixupimmps512_mask( \
-      (__v16sf)(__m512)(X), (__v16sf)(__m512)(Y), (__v16si)(__m512i)(Z), \
-      (int)(C), (__mmask16)(-1), (R)))
-
-#define _mm512_mask_fixupimm_round_ps(X, U, Y, Z, C, R) \
-  ((__m512)__builtin_ia32_fixupimmps512_mask( \
-      (__v16sf)(__m512)(X), (__v16sf)(__m512)(Y), (__v16si)(__m512i)(Z), \
-      (int)(C), (__mmask16)(U), (R)))
-
-#define _mm512_maskz_fixupimm_round_ps(U, X, Y, Z, C, R) \
-  ((__m512)__builtin_ia32_fixupimmps512_maskz( \
-      (__v16sf)(__m512)(X), (__v16sf)(__m512)(Y), (__v16si)(__m512i)(Z), \
-      (int)(C), (__mmask16)(U), (R)))
-
-#define _mm_fixupimm_round_sd(X, Y, Z, C, R) \
-  ((__m128d)__builtin_ia32_fixupimmsd_mask( \
-      (__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), \
-      (int)(C), (__mmask8)(-1), (R)))
-
-#define _mm_mask_fixupimm_round_sd(X, U, Y, Z, C, R) \
-  ((__m128d)__builtin_ia32_fixupimmsd_mask( \
-      (__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), \
-      (int)(C), (__mmask8)(U), (R)))
-
-#define _mm_maskz_fixupimm_round_sd(U, X, Y, Z, C, R) \
-  ((__m128d)__builtin_ia32_fixupimmsd_maskz( \
-      (__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), \
-      (int)(C), (__mmask8)(U), (R)))
-
-#define _mm_fixupimm_round_ss(X, Y, Z, C, R) \
-  ((__m128)__builtin_ia32_fixupimmss_mask( \
-      (__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), \
-      (int)(C), (__mmask8)(-1), (R)))
-
-#define _mm_mask_fixupimm_round_ss(X, U, Y, Z, C, R) \
-  ((__m128)__builtin_ia32_fixupimmss_mask( \
-      (__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), \
-      (int)(C), (__mmask8)(U), (R)))
-
-#define _mm_maskz_fixupimm_round_ss(U, X, Y, Z, C, R) \
-  ((__m128)__builtin_ia32_fixupimmss_maskz( \
-      (__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), \
-      (int)(C), (__mmask8)(U), (R)))
+#define _mm512_shuffle_pd(X, Y, C) ((__m512d)__builtin_ia32_shufpd512_mask ((__v8df)(__m512d)(X), (__v8df)(__m512d)(Y), (int)(C), (__v8df)(__m512d)_mm512_undefined_pd(), (__mmask8)-1))
+#define _mm512_mask_shuffle_pd(W, U, X, Y, C) ((__m512d)__builtin_ia32_shufpd512_mask ((__v8df)(__m512d)(X), (__v8df)(__m512d)(Y), (int)(C), (__v8df)(__m512d)(W), (__mmask8)(U)))
+#define _mm512_maskz_shuffle_pd(U, X, Y, C) ((__m512d)__builtin_ia32_shufpd512_mask ((__v8df)(__m512d)(X), (__v8df)(__m512d)(Y), (int)(C), (__v8df)(__m512d)_mm512_setzero_pd(), (__mmask8)(U)))
+#define _mm512_shuffle_ps(X, Y, C) ((__m512)__builtin_ia32_shufps512_mask ((__v16sf)(__m512)(X), (__v16sf)(__m512)(Y), (int)(C), (__v16sf)(__m512)_mm512_undefined_ps(), (__mmask16)-1))
+#define _mm512_mask_shuffle_ps(W, U, X, Y, C) ((__m512)__builtin_ia32_shufps512_mask ((__v16sf)(__m512)(X), (__v16sf)(__m512)(Y), (int)(C), (__v16sf)(__m512)(W), (__mmask16)(U)))
+#define _mm512_maskz_shuffle_ps(U, X, Y, C) ((__m512)__builtin_ia32_shufps512_mask ((__v16sf)(__m512)(X), (__v16sf)(__m512)(Y), (int)(C), (__v16sf)(__m512)_mm512_setzero_ps(), (__mmask16)(U)))
+#define _mm512_fixupimm_round_pd(X, Y, Z, C, R) ((__m512d)__builtin_ia32_fixupimmpd512_mask ((__v8df)(__m512d)(X), (__v8df)(__m512d)(Y), (__v8di)(__m512i)(Z), (int)(C), (__mmask8)(-1), (R)))
+#define _mm512_mask_fixupimm_round_pd(X, U, Y, Z, C, R) ((__m512d)__builtin_ia32_fixupimmpd512_mask ((__v8df)(__m512d)(X), (__v8df)(__m512d)(Y), (__v8di)(__m512i)(Z), (int)(C), (__mmask8)(U), (R)))
+#define _mm512_maskz_fixupimm_round_pd(U, X, Y, Z, C, R) ((__m512d)__builtin_ia32_fixupimmpd512_maskz ((__v8df)(__m512d)(X), (__v8df)(__m512d)(Y), (__v8di)(__m512i)(Z), (int)(C), (__mmask8)(U), (R)))
+#define _mm512_fixupimm_round_ps(X, Y, Z, C, R) ((__m512)__builtin_ia32_fixupimmps512_mask ((__v16sf)(__m512)(X), (__v16sf)(__m512)(Y), (__v16si)(__m512i)(Z), (int)(C), (__mmask16)(-1), (R)))
+#define _mm512_mask_fixupimm_round_ps(X, U, Y, Z, C, R) ((__m512)__builtin_ia32_fixupimmps512_mask ((__v16sf)(__m512)(X), (__v16sf)(__m512)(Y), (__v16si)(__m512i)(Z), (int)(C), (__mmask16)(U), (R)))
+#define _mm512_maskz_fixupimm_round_ps(U, X, Y, Z, C, R) ((__m512)__builtin_ia32_fixupimmps512_maskz ((__v16sf)(__m512)(X), (__v16sf)(__m512)(Y), (__v16si)(__m512i)(Z), (int)(C), (__mmask16)(U), (R)))
+#define _mm_fixupimm_round_sd(X, Y, Z, C, R) ((__m128d)__builtin_ia32_fixupimmsd_mask ((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C), (__mmask8)(-1), (R)))
+#define _mm_mask_fixupimm_round_sd(X, U, Y, Z, C, R) ((__m128d)__builtin_ia32_fixupimmsd_mask ((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C), (__mmask8)(U), (R)))
+#define _mm_maskz_fixupimm_round_sd(U, X, Y, Z, C, R) ((__m128d)__builtin_ia32_fixupimmsd_maskz ((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C), (__mmask8)(U), (R)))
+#define _mm_fixupimm_round_ss(X, Y, Z, C, R) ((__m128)__builtin_ia32_fixupimmss_mask ((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C), (__mmask8)(-1), (R)))
+#define _mm_mask_fixupimm_round_ss(X, U, Y, Z, C, R) ((__m128)__builtin_ia32_fixupimmss_mask ((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C), (__mmask8)(U), (R)))
+#define _mm_maskz_fixupimm_round_ss(U, X, Y, Z, C, R) ((__m128)__builtin_ia32_fixupimmss_maskz ((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C), (__mmask8)(U), (R)))
 #endif
-
-__funline __m512 _mm512_movehdup_ps(__m512 __A) {
-  return (__m512)__builtin_ia32_movshdup512_mask(
-      (__v16sf)__A, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1);
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_movehdup_ps (__m512 __A)
+{
+  return (__m512) __builtin_ia32_movshdup512_mask ((__v16sf) __A,
+                                                   (__v16sf)
+                                                   _mm512_undefined_ps (),
+                                                   (__mmask16) -1);
 }
-
-__funline __m512 _mm512_mask_movehdup_ps(__m512 __W, __mmask16 __U, __m512 __A) {
-  return (__m512)__builtin_ia32_movshdup512_mask((__v16sf)__A, (__v16sf)__W,
-                                                 (__mmask16)__U);
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_movehdup_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_movshdup512_mask ((__v16sf) __A,
+                                                   (__v16sf) __W,
+                                                   (__mmask16) __U);
 }
-
-__funline __m512 _mm512_maskz_movehdup_ps(__mmask16 __U, __m512 __A) {
-  return (__m512)__builtin_ia32_movshdup512_mask(
-      (__v16sf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U);
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_movehdup_ps (__mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_movshdup512_mask ((__v16sf) __A,
+                                                   (__v16sf)
+                                                   _mm512_setzero_ps (),
+                                                   (__mmask16) __U);
 }
-
-__funline __m512 _mm512_moveldup_ps(__m512 __A) {
-  return (__m512)__builtin_ia32_movsldup512_mask(
-      (__v16sf)__A, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1);
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_moveldup_ps (__m512 __A)
+{
+  return (__m512) __builtin_ia32_movsldup512_mask ((__v16sf) __A,
+                                                   (__v16sf)
+                                                   _mm512_undefined_ps (),
+                                                   (__mmask16) -1);
 }
-
-__funline __m512 _mm512_mask_moveldup_ps(__m512 __W, __mmask16 __U, __m512 __A) {
-  return (__m512)__builtin_ia32_movsldup512_mask((__v16sf)__A, (__v16sf)__W,
-                                                 (__mmask16)__U);
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_moveldup_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_movsldup512_mask ((__v16sf) __A,
+                                                   (__v16sf) __W,
+                                                   (__mmask16) __U);
 }
-
-__funline __m512 _mm512_maskz_moveldup_ps(__mmask16 __U, __m512 __A) {
-  return (__m512)__builtin_ia32_movsldup512_mask(
-      (__v16sf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U);
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_moveldup_ps (__mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_movsldup512_mask ((__v16sf) __A,
+                                                   (__v16sf)
+                                                   _mm512_setzero_ps (),
+                                                   (__mmask16) __U);
 }
-
-__funline __m512i _mm512_or_si512(__m512i __A, __m512i __B) {
-  return (__m512i)((__v16su)__A | (__v16su)__B);
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_or_si512 (__m512i __A, __m512i __B)
+{
+  return (__m512i) ((__v16su) __A | (__v16su) __B);
 }
-
-__funline __m512i _mm512_or_epi32(__m512i __A, __m512i __B) {
-  return (__m512i)((__v16su)__A | (__v16su)__B);
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_or_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) ((__v16su) __A | (__v16su) __B);
 }
-
-__funline __m512i _mm512_mask_or_epi32(__m512i __W, __mmask16 __U, __m512i __A,
-                                       __m512i __B) {
-  return (__m512i)__builtin_ia32_pord512_mask((__v16si)__A, (__v16si)__B,
-                                              (__v16si)__W, (__mmask16)__U);
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_or_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pord512_mask ((__v16si) __A,
+                                                (__v16si) __B,
+                                                (__v16si) __W,
+                                                (__mmask16) __U);
 }
-
-__funline __m512i _mm512_maskz_or_epi32(__mmask16 __U, __m512i __A, __m512i __B) {
-  return (__m512i)__builtin_ia32_pord512_mask((__v16si)__A, (__v16si)__B,
-                                              (__v16si)_mm512_setzero_si512(),
-                                              (__mmask16)__U);
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_or_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pord512_mask ((__v16si) __A,
+                                                (__v16si) __B,
+                                                (__v16si)
+                                                _mm512_setzero_si512 (),
+                                                (__mmask16) __U);
 }
-
-__funline __m512i _mm512_or_epi64(__m512i __A, __m512i __B) {
-  return (__m512i)((__v8du)__A | (__v8du)__B);
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_or_epi64 (__m512i __A, __m512i __B)
+{
+  return (__m512i) ((__v8du) __A | (__v8du) __B);
 }
-
-__funline __m512i _mm512_mask_or_epi64(__m512i __W, __mmask8 __U, __m512i __A,
-                                       __m512i __B) {
-  return (__m512i)__builtin_ia32_porq512_mask((__v8di)__A, (__v8di)__B,
-                                              (__v8di)__W, (__mmask8)__U);
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_or_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_porq512_mask ((__v8di) __A,
+                                                (__v8di) __B,
+                                                (__v8di) __W,
+                                                (__mmask8) __U);
 }
-
-__funline __m512i _mm512_maskz_or_epi64(__mmask8 __U, __m512i __A, __m512i __B) {
-  return (__m512i)__builtin_ia32_porq512_mask(
-      (__v8di)__A, (__v8di)__B, (__v8di)_mm512_setzero_si512(), (__mmask8)__U);
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_or_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_porq512_mask ((__v8di) __A,
+                                                (__v8di) __B,
+                                                (__v8di)
+                                                _mm512_setzero_si512 (),
+                                                (__mmask8) __U);
 }
-
-__funline __m512i _mm512_xor_si512(__m512i __A, __m512i __B) {
-  return (__m512i)((__v16su)__A ^ (__v16su)__B);
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_xor_si512 (__m512i __A, __m512i __B)
+{
+  return (__m512i) ((__v16su) __A ^ (__v16su) __B);
 }
-
-__funline __m512i _mm512_xor_epi32(__m512i __A, __m512i __B) {
-  return (__m512i)((__v16su)__A ^ (__v16su)__B);
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_xor_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) ((__v16su) __A ^ (__v16su) __B);
 }
-
-__funline __m512i _mm512_mask_xor_epi32(__m512i __W, __mmask16 __U, __m512i __A,
-                                        __m512i __B) {
-  return (__m512i)__builtin_ia32_pxord512_mask((__v16si)__A, (__v16si)__B,
-                                               (__v16si)__W, (__mmask16)__U);
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_xor_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pxord512_mask ((__v16si) __A,
+                                                 (__v16si) __B,
+                                                 (__v16si) __W,
+                                                 (__mmask16) __U);
 }
-
-__funline __m512i _mm512_maskz_xor_epi32(__mmask16 __U, __m512i __A,
-                                         __m512i __B) {
-  return (__m512i)__builtin_ia32_pxord512_mask((__v16si)__A, (__v16si)__B,
-                                               (__v16si)_mm512_setzero_si512(),
-                                               (__mmask16)__U);
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_xor_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pxord512_mask ((__v16si) __A,
+                                                 (__v16si) __B,
+                                                 (__v16si)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask16) __U);
 }
-
-__funline __m512i _mm512_xor_epi64(__m512i __A, __m512i __B) {
-  return (__m512i)((__v8du)__A ^ (__v8du)__B);
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_xor_epi64 (__m512i __A, __m512i __B)
+{
+  return (__m512i) ((__v8du) __A ^ (__v8du) __B);
 }
-
-__funline __m512i _mm512_mask_xor_epi64(__m512i __W, __mmask8 __U, __m512i __A,
-                                        __m512i __B) {
-  return (__m512i)__builtin_ia32_pxorq512_mask((__v8di)__A, (__v8di)__B,
-                                               (__v8di)__W, (__mmask8)__U);
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_xor_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pxorq512_mask ((__v8di) __A,
+                                                 (__v8di) __B,
+                                                 (__v8di) __W,
+                                                 (__mmask8) __U);
 }
-
-__funline __m512i _mm512_maskz_xor_epi64(__mmask8 __U, __m512i __A, __m512i __B) {
-  return (__m512i)__builtin_ia32_pxorq512_mask(
-      (__v8di)__A, (__v8di)__B, (__v8di)_mm512_setzero_si512(), (__mmask8)__U);
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_xor_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pxorq512_mask ((__v8di) __A,
+                                                 (__v8di) __B,
+                                                 (__v8di)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask8) __U);
 }
-
 #ifdef __OPTIMIZE__
-__funline __m512i _mm512_rol_epi32(__m512i __A, const int __B) {
-  return (__m512i)__builtin_ia32_prold512_mask(
-      (__v16si)__A, __B, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1);
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_rol_epi32 (__m512i __A, const int __B)
+{
+  return (__m512i) __builtin_ia32_prold512_mask ((__v16si) __A, __B,
+                                                 (__v16si)
+                                                 _mm512_undefined_epi32 (),
+                                                 (__mmask16) -1);
 }
-
-__funline __m512i _mm512_mask_rol_epi32(__m512i __W, __mmask16 __U, __m512i __A,
-                                        const int __B) {
-  return (__m512i)__builtin_ia32_prold512_mask((__v16si)__A, __B, (__v16si)__W,
-                                               (__mmask16)__U);
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_rol_epi32 (__m512i __W, __mmask16 __U, __m512i __A, const int __B)
+{
+  return (__m512i) __builtin_ia32_prold512_mask ((__v16si) __A, __B,
+                                                 (__v16si) __W,
+                                                 (__mmask16) __U);
 }
-
-__funline __m512i _mm512_maskz_rol_epi32(__mmask16 __U, __m512i __A,
-                                         const int __B) {
-  return (__m512i)__builtin_ia32_prold512_mask(
-      (__v16si)__A, __B, (__v16si)_mm512_setzero_si512(), (__mmask16)__U);
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_rol_epi32 (__mmask16 __U, __m512i __A, const int __B)
+{
+  return (__m512i) __builtin_ia32_prold512_mask ((__v16si) __A, __B,
+                                                 (__v16si)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask16) __U);
 }
-
-__funline __m512i _mm512_ror_epi32(__m512i __A, int __B) {
-  return (__m512i)__builtin_ia32_prord512_mask(
-      (__v16si)__A, __B, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1);
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_ror_epi32 (__m512i __A, int __B)
+{
+  return (__m512i) __builtin_ia32_prord512_mask ((__v16si) __A, __B,
+                                                 (__v16si)
+                                                 _mm512_undefined_epi32 (),
+                                                 (__mmask16) -1);
 }
-
-__funline __m512i _mm512_mask_ror_epi32(__m512i __W, __mmask16 __U, __m512i __A,
-                                        int __B) {
-  return (__m512i)__builtin_ia32_prord512_mask((__v16si)__A, __B, (__v16si)__W,
-                                               (__mmask16)__U);
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_ror_epi32 (__m512i __W, __mmask16 __U, __m512i __A, int __B)
+{
+  return (__m512i) __builtin_ia32_prord512_mask ((__v16si) __A, __B,
+                                                 (__v16si) __W,
+                                                 (__mmask16) __U);
 }
-
-__funline __m512i _mm512_maskz_ror_epi32(__mmask16 __U, __m512i __A, int __B) {
-  return (__m512i)__builtin_ia32_prord512_mask(
-      (__v16si)__A, __B, (__v16si)_mm512_setzero_si512(), (__mmask16)__U);
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_ror_epi32 (__mmask16 __U, __m512i __A, int __B)
+{
+  return (__m512i) __builtin_ia32_prord512_mask ((__v16si) __A, __B,
+                                                 (__v16si)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask16) __U);
 }
-
-__funline __m512i _mm512_rol_epi64(__m512i __A, const int __B) {
-  return (__m512i)__builtin_ia32_prolq512_mask(
-      (__v8di)__A, __B, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1);
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_rol_epi64 (__m512i __A, const int __B)
+{
+  return (__m512i) __builtin_ia32_prolq512_mask ((__v8di) __A, __B,
+                                                 (__v8di)
+                                                 _mm512_undefined_epi32 (),
+                                                 (__mmask8) -1);
 }
-
-__funline __m512i _mm512_mask_rol_epi64(__m512i __W, __mmask8 __U, __m512i __A,
-                                        const int __B) {
-  return (__m512i)__builtin_ia32_prolq512_mask((__v8di)__A, __B, (__v8di)__W,
-                                               (__mmask8)__U);
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_rol_epi64 (__m512i __W, __mmask8 __U, __m512i __A, const int __B)
+{
+  return (__m512i) __builtin_ia32_prolq512_mask ((__v8di) __A, __B,
+                                                 (__v8di) __W,
+                                                 (__mmask8) __U);
 }
-
-__funline __m512i _mm512_maskz_rol_epi64(__mmask8 __U, __m512i __A,
-                                         const int __B) {
-  return (__m512i)__builtin_ia32_prolq512_mask(
-      (__v8di)__A, __B, (__v8di)_mm512_setzero_si512(), (__mmask8)__U);
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_rol_epi64 (__mmask8 __U, __m512i __A, const int __B)
+{
+  return (__m512i) __builtin_ia32_prolq512_mask ((__v8di) __A, __B,
+                                                 (__v8di)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask8) __U);
 }
-
-__funline __m512i _mm512_ror_epi64(__m512i __A, int __B) {
-  return (__m512i)__builtin_ia32_prorq512_mask(
-      (__v8di)__A, __B, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1);
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_ror_epi64 (__m512i __A, int __B)
+{
+  return (__m512i) __builtin_ia32_prorq512_mask ((__v8di) __A, __B,
+                                                 (__v8di)
+                                                 _mm512_undefined_epi32 (),
+                                                 (__mmask8) -1);
 }
-
-__funline __m512i _mm512_mask_ror_epi64(__m512i __W, __mmask8 __U, __m512i __A,
-                                        int __B) {
-  return (__m512i)__builtin_ia32_prorq512_mask((__v8di)__A, __B, (__v8di)__W,
-                                               (__mmask8)__U);
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_ror_epi64 (__m512i __W, __mmask8 __U, __m512i __A, int __B)
+{
+  return (__m512i) __builtin_ia32_prorq512_mask ((__v8di) __A, __B,
+                                                 (__v8di) __W,
+                                                 (__mmask8) __U);
 }
-
-__funline __m512i _mm512_maskz_ror_epi64(__mmask8 __U, __m512i __A, int __B) {
-  return (__m512i)__builtin_ia32_prorq512_mask(
-      (__v8di)__A, __B, (__v8di)_mm512_setzero_si512(), (__mmask8)__U);
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_ror_epi64 (__mmask8 __U, __m512i __A, int __B)
+{
+  return (__m512i) __builtin_ia32_prorq512_mask ((__v8di) __A, __B,
+                                                 (__v8di)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask8) __U);
 }
-
 #else
-#define _mm512_rol_epi32(A, B) \
-  ((__m512i)__builtin_ia32_prold512_mask((__v16si)(__m512i)(A), (int)(B), \
-                                         (__v16si)_mm512_undefined_epi32(), \
-                                         (__mmask16)(-1)))
-#define _mm512_mask_rol_epi32(W, U, A, B) \
-  ((__m512i)__builtin_ia32_prold512_mask( \
-      (__v16si)(__m512i)(A), (int)(B), (__v16si)(__m512i)(W), (__mmask16)(U)))
-#define _mm512_maskz_rol_epi32(U, A, B) \
-  ((__m512i)__builtin_ia32_prold512_mask((__v16si)(__m512i)(A), (int)(B), \
-                                         (__v16si)_mm512_setzero_si512(), \
-                                         (__mmask16)(U)))
-#define _mm512_ror_epi32(A, B) \
-  ((__m512i)__builtin_ia32_prord512_mask((__v16si)(__m512i)(A), (int)(B), \
-                                         (__v16si)_mm512_undefined_epi32(), \
-                                         (__mmask16)(-1)))
-#define _mm512_mask_ror_epi32(W, U, A, B) \
-  ((__m512i)__builtin_ia32_prord512_mask( \
-      (__v16si)(__m512i)(A), (int)(B), (__v16si)(__m512i)(W), (__mmask16)(U)))
-#define _mm512_maskz_ror_epi32(U, A, B) \
-  ((__m512i)__builtin_ia32_prord512_mask((__v16si)(__m512i)(A), (int)(B), \
-                                         (__v16si)_mm512_setzero_si512(), \
-                                         (__mmask16)(U)))
-#define _mm512_rol_epi64(A, B) \
-  ((__m512i)__builtin_ia32_prolq512_mask((__v8di)(__m512i)(A), (int)(B), \
-                                         (__v8di)_mm512_undefined_epi32(), \
-                                         (__mmask8)(-1)))
-#define _mm512_mask_rol_epi64(W, U, A, B) \
-  ((__m512i)__builtin_ia32_prolq512_mask((__v8di)(__m512i)(A), (int)(B), \
-                                         (__v8di)(__m512i)(W), (__mmask8)(U)))
-#define _mm512_maskz_rol_epi64(U, A, B) \
-  ((__m512i)__builtin_ia32_prolq512_mask((__v8di)(__m512i)(A), (int)(B), \
-                                         (__v8di)_mm512_setzero_si512(), \
-                                         (__mmask8)(U)))
-
-#define _mm512_ror_epi64(A, B) \
-  ((__m512i)__builtin_ia32_prorq512_mask((__v8di)(__m512i)(A), (int)(B), \
-                                         (__v8di)_mm512_undefined_epi32(), \
-                                         (__mmask8)(-1)))
-#define _mm512_mask_ror_epi64(W, U, A, B) \
-  ((__m512i)__builtin_ia32_prorq512_mask((__v8di)(__m512i)(A), (int)(B), \
-                                         (__v8di)(__m512i)(W), (__mmask8)(U)))
-#define _mm512_maskz_ror_epi64(U, A, B) \
-  ((__m512i)__builtin_ia32_prorq512_mask((__v8di)(__m512i)(A), (int)(B), \
-                                         (__v8di)_mm512_setzero_si512(), \
-                                         (__mmask8)(U)))
+#define _mm512_rol_epi32(A, B) ((__m512i)__builtin_ia32_prold512_mask ((__v16si)(__m512i)(A), (int)(B), (__v16si)_mm512_undefined_epi32 (), (__mmask16)(-1)))
+#define _mm512_mask_rol_epi32(W, U, A, B) ((__m512i)__builtin_ia32_prold512_mask ((__v16si)(__m512i)(A), (int)(B), (__v16si)(__m512i)(W), (__mmask16)(U)))
+#define _mm512_maskz_rol_epi32(U, A, B) ((__m512i)__builtin_ia32_prold512_mask ((__v16si)(__m512i)(A), (int)(B), (__v16si)_mm512_setzero_si512 (), (__mmask16)(U)))
+#define _mm512_ror_epi32(A, B) ((__m512i)__builtin_ia32_prord512_mask ((__v16si)(__m512i)(A), (int)(B), (__v16si)_mm512_undefined_epi32 (), (__mmask16)(-1)))
+#define _mm512_mask_ror_epi32(W, U, A, B) ((__m512i)__builtin_ia32_prord512_mask ((__v16si)(__m512i)(A), (int)(B), (__v16si)(__m512i)(W), (__mmask16)(U)))
+#define _mm512_maskz_ror_epi32(U, A, B) ((__m512i)__builtin_ia32_prord512_mask ((__v16si)(__m512i)(A), (int)(B), (__v16si)_mm512_setzero_si512 (), (__mmask16)(U)))
+#define _mm512_rol_epi64(A, B) ((__m512i)__builtin_ia32_prolq512_mask ((__v8di)(__m512i)(A), (int)(B), (__v8di)_mm512_undefined_epi32 (), (__mmask8)(-1)))
+#define _mm512_mask_rol_epi64(W, U, A, B) ((__m512i)__builtin_ia32_prolq512_mask ((__v8di)(__m512i)(A), (int)(B), (__v8di)(__m512i)(W), (__mmask8)(U)))
+#define _mm512_maskz_rol_epi64(U, A, B) ((__m512i)__builtin_ia32_prolq512_mask ((__v8di)(__m512i)(A), (int)(B), (__v8di)_mm512_setzero_si512 (), (__mmask8)(U)))
+#define _mm512_ror_epi64(A, B) ((__m512i)__builtin_ia32_prorq512_mask ((__v8di)(__m512i)(A), (int)(B), (__v8di)_mm512_undefined_epi32 (), (__mmask8)(-1)))
+#define _mm512_mask_ror_epi64(W, U, A, B) ((__m512i)__builtin_ia32_prorq512_mask ((__v8di)(__m512i)(A), (int)(B), (__v8di)(__m512i)(W), (__mmask8)(U)))
+#define _mm512_maskz_ror_epi64(U, A, B) ((__m512i)__builtin_ia32_prorq512_mask ((__v8di)(__m512i)(A), (int)(B), (__v8di)_mm512_setzero_si512 (), (__mmask8)(U)))
 #endif
-
-__funline __m512i _mm512_and_si512(__m512i __A, __m512i __B) {
-  return (__m512i)((__v16su)__A & (__v16su)__B);
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_and_si512 (__m512i __A, __m512i __B)
+{
+  return (__m512i) ((__v16su) __A & (__v16su) __B);
 }
-
-__funline __m512i _mm512_and_epi32(__m512i __A, __m512i __B) {
-  return (__m512i)((__v16su)__A & (__v16su)__B);
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_and_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) ((__v16su) __A & (__v16su) __B);
 }
-
-__funline __m512i _mm512_mask_and_epi32(__m512i __W, __mmask16 __U, __m512i __A,
-                                        __m512i __B) {
-  return (__m512i)__builtin_ia32_pandd512_mask((__v16si)__A, (__v16si)__B,
-                                               (__v16si)__W, (__mmask16)__U);
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_and_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pandd512_mask ((__v16si) __A,
+                                                 (__v16si) __B,
+                                                 (__v16si) __W,
+                                                 (__mmask16) __U);
 }
-
-__funline __m512i _mm512_maskz_and_epi32(__mmask16 __U, __m512i __A,
-                                         __m512i __B) {
-  return (__m512i)__builtin_ia32_pandd512_mask((__v16si)__A, (__v16si)__B,
-                                               (__v16si)_mm512_setzero_si512(),
-                                               (__mmask16)__U);
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_and_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pandd512_mask ((__v16si) __A,
+                                                 (__v16si) __B,
+                                                 (__v16si)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask16) __U);
 }
-
-__funline __m512i _mm512_and_epi64(__m512i __A, __m512i __B) {
-  return (__m512i)((__v8du)__A & (__v8du)__B);
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_and_epi64 (__m512i __A, __m512i __B)
+{
+  return (__m512i) ((__v8du) __A & (__v8du) __B);
 }
-
-__funline __m512i _mm512_mask_and_epi64(__m512i __W, __mmask8 __U, __m512i __A,
-                                        __m512i __B) {
-  return (__m512i)__builtin_ia32_pandq512_mask((__v8di)__A, (__v8di)__B,
-                                               (__v8di)__W, __U);
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_and_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pandq512_mask ((__v8di) __A,
+                                                 (__v8di) __B,
+                                                 (__v8di) __W, __U);
 }
-
-__funline __m512i _mm512_maskz_and_epi64(__mmask8 __U, __m512i __A, __m512i __B) {
-  return (__m512i)__builtin_ia32_pandq512_mask(
-      (__v8di)__A, (__v8di)__B, (__v8di)_mm512_setzero_pd(), __U);
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_and_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pandq512_mask ((__v8di) __A,
+                                                 (__v8di) __B,
+                                                 (__v8di)
+                                                 _mm512_setzero_pd (),
+                                                 __U);
 }
-
-__funline __m512i _mm512_andnot_si512(__m512i __A, __m512i __B) {
-  return (__m512i)__builtin_ia32_pandnd512_mask(
-      (__v16si)__A, (__v16si)__B, (__v16si)_mm512_undefined_epi32(),
-      (__mmask16)-1);
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_andnot_si512 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pandnd512_mask ((__v16si) __A,
+                                                  (__v16si) __B,
+                                                  (__v16si)
+                                                  _mm512_undefined_epi32 (),
+                                                  (__mmask16) -1);
 }
-
-__funline __m512i _mm512_andnot_epi32(__m512i __A, __m512i __B) {
-  return (__m512i)__builtin_ia32_pandnd512_mask(
-      (__v16si)__A, (__v16si)__B, (__v16si)_mm512_undefined_epi32(),
-      (__mmask16)-1);
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_andnot_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pandnd512_mask ((__v16si) __A,
+                                                  (__v16si) __B,
+                                                  (__v16si)
+                                                  _mm512_undefined_epi32 (),
+                                                  (__mmask16) -1);
 }
-
-__funline __m512i _mm512_mask_andnot_epi32(__m512i __W, __mmask16 __U,
-                                           __m512i __A, __m512i __B) {
-  return (__m512i)__builtin_ia32_pandnd512_mask((__v16si)__A, (__v16si)__B,
-                                                (__v16si)__W, (__mmask16)__U);
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_andnot_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pandnd512_mask ((__v16si) __A,
+                                                  (__v16si) __B,
+                                                  (__v16si) __W,
+                                                  (__mmask16) __U);
 }
-
-__funline __m512i _mm512_maskz_andnot_epi32(__mmask16 __U, __m512i __A,
-                                            __m512i __B) {
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_andnot_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pandnd512_mask ((__v16si) __A,
+                                                  (__v16si) __B,
+                                                  (__v16si)
+                                                  _mm512_setzero_si512 (),
+                                                  (__mmask16) __U);
 }
-
-__funline __m512i _mm512_andnot_epi64(__m512i __A, __m512i __B) {
-  return (__m512i)__builtin_ia32_pandnq512_mask(
-      (__v8di)__A, (__v8di)__B, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1);
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_andnot_epi64 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pandnq512_mask ((__v8di) __A,
+                                                  (__v8di) __B,
+                                                  (__v8di)
+                                                  _mm512_undefined_epi32 (),
+                                                  (__mmask8) -1);
 }
-
-__funline __m512i _mm512_mask_andnot_epi64(__m512i __W, __mmask8 __U, __m512i __A,
-                                           __m512i __B) {
-  return (__m512i)__builtin_ia32_pandnq512_mask((__v8di)__A, (__v8di)__B,
-                                                (__v8di)__W, __U);
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_andnot_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pandnq512_mask ((__v8di) __A,
+                                                  (__v8di) __B,
+                                                  (__v8di) __W, __U);
 }
-
-__funline __m512i _mm512_maskz_andnot_epi64(__mmask8 __U, __m512i __A,
-                                            __m512i __B) {
-  return (__m512i)__builtin_ia32_pandnq512_mask(
-      (__v8di)__A, (__v8di)__B, (__v8di)_mm512_setzero_pd(), __U);
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_andnot_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pandnq512_mask ((__v8di) __A,
+                                                  (__v8di) __B,
+                                                  (__v8di)
+                                                  _mm512_setzero_pd (),
+                                                  __U);
 }
-
-__funline __mmask16 _mm512_test_epi32_mask(__m512i __A, __m512i __B) {
-  return (__mmask16)__builtin_ia32_ptestmd512((__v16si)__A, (__v16si)__B,
-                                              (__mmask16)-1);
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_test_epi32_mask (__m512i __A, __m512i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestmd512 ((__v16si) __A,
+                                                (__v16si) __B,
+                                                (__mmask16) -1);
 }
-
-__funline __mmask16 _mm512_mask_test_epi32_mask(__mmask16 __U, __m512i __A,
-                                                __m512i __B) {
-  return (__mmask16)__builtin_ia32_ptestmd512((__v16si)__A, (__v16si)__B, __U);
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_test_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestmd512 ((__v16si) __A,
+                                                (__v16si) __B, __U);
}
-
-__funline __mmask8 _mm512_test_epi64_mask(__m512i __A, __m512i __B) {
-  return (__mmask8)__builtin_ia32_ptestmq512((__v8di)__A, (__v8di)__B,
-                                             (__mmask8)-1);
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_test_epi64_mask (__m512i __A, __m512i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestmq512 ((__v8di) __A,
+                                               (__v8di) __B,
+                                               (__mmask8) -1);
 }
-
-__funline __mmask8 _mm512_mask_test_epi64_mask(__mmask8 __U, __m512i __A,
-                                               __m512i __B) {
-  return (__mmask8)__builtin_ia32_ptestmq512((__v8di)__A, (__v8di)__B, __U);
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_test_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestmq512 ((__v8di) __A, (__v8di) __B, __U);
 }
-
-__funline __mmask16 _mm512_testn_epi32_mask(__m512i __A, __m512i __B) {
-  return (__mmask16)__builtin_ia32_ptestnmd512((__v16si)__A, (__v16si)__B,
-                                               (__mmask16)-1);
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_testn_epi32_mask (__m512i __A, __m512i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestnmd512 ((__v16si) __A,
+                                                 (__v16si) __B,
+                                                 (__mmask16) -1);
 }
-
-__funline __mmask16 _mm512_mask_testn_epi32_mask(__mmask16 __U, __m512i __A,
-                                                 __m512i __B) {
-  return (__mmask16)__builtin_ia32_ptestnmd512((__v16si)__A, (__v16si)__B, __U);
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_testn_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestnmd512 ((__v16si) __A,
+                                                 (__v16si) __B, __U);
 }
-
-__funline __mmask8 _mm512_testn_epi64_mask(__m512i __A, __m512i __B) {
-  return (__mmask8)__builtin_ia32_ptestnmq512((__v8di)__A, (__v8di)__B,
-                                              (__mmask8)-1);
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_testn_epi64_mask (__m512i __A, __m512i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestnmq512 ((__v8di) __A,
+                                                (__v8di) __B,
+                                                (__mmask8) -1);
 }
-
-__funline __mmask8 _mm512_mask_testn_epi64_mask(__mmask8 __U, __m512i __A,
-                                                __m512i __B) {
-  return (__mmask8)__builtin_ia32_ptestnmq512((__v8di)__A, (__v8di)__B, __U);
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_testn_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestnmq512 ((__v8di) __A,
+                                                (__v8di) __B, __U);
 }
-
-__funline __m512 _mm512_abs_ps(__m512 __A) {
-  return (__m512)_mm512_and_epi32((__m512i)__A, _mm512_set1_epi32(0x7fffffff));
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_abs_ps (__m512 __A)
+{
+  return (__m512) _mm512_and_epi32 ((__m512i) __A,
+                                    _mm512_set1_epi32 (0x7fffffff));
 }
-
-__funline __m512 _mm512_mask_abs_ps(__m512 __W, __mmask16 __U, __m512 __A) {
-  return (__m512)_mm512_mask_and_epi32((__m512i)__W, __U, (__m512i)__A,
-                                       _mm512_set1_epi32(0x7fffffff));
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_abs_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512) _mm512_mask_and_epi32 ((__m512i) __W, __U, (__m512i) __A,
+                                         _mm512_set1_epi32 (0x7fffffff));
 }
-
-__funline __m512d _mm512_abs_pd(__m512d __A) {
-  return (__m512d)_mm512_and_epi64((__m512i)__A,
-                                   _mm512_set1_epi64(0x7fffffffffffffffLL));
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_abs_pd (__m512d __A)
+{
+  return (__m512d) _mm512_and_epi64 ((__m512i) __A,
+                                     _mm512_set1_epi64 (0x7fffffffffffffffLL));
 }
-
-__funline __m512d _mm512_mask_abs_pd(__m512d __W, __mmask8 __U, __m512d __A) {
-  return (__m512d)_mm512_mask_and_epi64(
-      (__m512i)__W, __U, (__m512i)__A, _mm512_set1_epi64(0x7fffffffffffffffLL));
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_abs_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+  return (__m512d)
+    _mm512_mask_and_epi64 ((__m512i) __W, __U, (__m512i) __A,
+                           _mm512_set1_epi64 (0x7fffffffffffffffLL));
 }
-
-__funline __m512i _mm512_unpackhi_epi32(__m512i __A, __m512i __B) {
-  return (__m512i)__builtin_ia32_punpckhdq512_mask(
-      (__v16si)__A, (__v16si)__B, (__v16si)_mm512_undefined_epi32(),
-      (__mmask16)-1);
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_unpackhi_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_punpckhdq512_mask ((__v16si) __A,
+                                                     (__v16si) __B,
+                                                     (__v16si)
+                                                     _mm512_undefined_epi32 (),
+                                                     (__mmask16) -1);
 }
-
-__funline __m512i _mm512_mask_unpackhi_epi32(__m512i __W, __mmask16 __U,
-                                             __m512i __A, __m512i __B) {
-  return (__m512i)__builtin_ia32_punpckhdq512_mask(
-      (__v16si)__A, (__v16si)__B, (__v16si)__W, (__mmask16)__U);
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_unpackhi_epi32 (__m512i __W, __mmask16 __U, __m512i __A,
+                            __m512i __B)
+{
+  return (__m512i) __builtin_ia32_punpckhdq512_mask ((__v16si) __A,
+                                                     (__v16si) __B,
+                                                     (__v16si) __W,
+                                                     (__mmask16) __U);
 }
-
-__funline __m512i _mm512_maskz_unpackhi_epi32(__mmask16 __U, __m512i __A,
-                                              __m512i __B) {
-  return (__m512i)__builtin_ia32_punpckhdq512_mask(
-      (__v16si)__A, (__v16si)__B, (__v16si)_mm512_setzero_si512(),
-      (__mmask16)__U);
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_unpackhi_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_punpckhdq512_mask ((__v16si) __A,
+                                                     (__v16si) __B,
+                                                     (__v16si)
+                                                     _mm512_setzero_si512 (),
+                                                     (__mmask16) __U);
 }
-
-__funline __m512i _mm512_unpackhi_epi64(__m512i __A, __m512i __B) {
-  return (__m512i)__builtin_ia32_punpckhqdq512_mask(
-      (__v8di)__A, (__v8di)__B, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1);
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_unpackhi_epi64 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_punpckhqdq512_mask ((__v8di) __A,
+                                                      (__v8di) __B,
+                                                      (__v8di)
+                                                      _mm512_undefined_epi32 (),
+                                                      (__mmask8) -1);
 }
-
-__funline __m512i _mm512_mask_unpackhi_epi64(__m512i __W, __mmask8 __U,
-                                             __m512i __A, __m512i __B) {
-  return (__m512i)__builtin_ia32_punpckhqdq512_mask((__v8di)__A, (__v8di)__B,
-                                                    (__v8di)__W, (__mmask8)__U);
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_unpackhi_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_punpckhqdq512_mask ((__v8di) __A,
+                                                      (__v8di) __B,
+                                                      (__v8di) __W,
+                                                      (__mmask8) __U);
 }
-
-__funline __m512i _mm512_maskz_unpackhi_epi64(__mmask8 __U, __m512i __A,
-                                              __m512i __B) {
-  return (__m512i)__builtin_ia32_punpckhqdq512_mask(
-      (__v8di)__A, (__v8di)__B, (__v8di)_mm512_setzero_si512(), (__mmask8)__U);
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_unpackhi_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_punpckhqdq512_mask ((__v8di) __A,
+                                                      (__v8di) __B,
+                                                      (__v8di)
+                                                      _mm512_setzero_si512 (),
+                                                      (__mmask8) __U);
 }
-
-__funline __m512i _mm512_unpacklo_epi32(__m512i __A, __m512i __B) {
-  return (__m512i)__builtin_ia32_punpckldq512_mask(
-      (__v16si)__A, (__v16si)__B, (__v16si)_mm512_undefined_epi32(),
-      (__mmask16)-1);
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_unpacklo_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_punpckldq512_mask ((__v16si) __A,
+                                                     (__v16si) __B,
+                                                     (__v16si)
+                                                     _mm512_undefined_epi32 (),
+                                                     (__mmask16) -1);
 }
-
-__funline __m512i _mm512_mask_unpacklo_epi32(__m512i __W, __mmask16 __U,
-                                             __m512i __A, __m512i __B) {
-  return (__m512i)__builtin_ia32_punpckldq512_mask(
-      (__v16si)__A, (__v16si)__B, (__v16si)__W, (__mmask16)__U);
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_unpacklo_epi32 (__m512i __W, __mmask16 __U, __m512i __A,
+                            __m512i __B)
+{
+  return (__m512i) __builtin_ia32_punpckldq512_mask ((__v16si) __A,
+
(__v16si) __B, + (__v16si) __W, + (__mmask16) __U); } - -__funline __m512i _mm512_maskz_unpacklo_epi32(__mmask16 __U, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_punpckldq512_mask( - (__v16si)__A, (__v16si)__B, (__v16si)_mm512_setzero_si512(), - (__mmask16)__U); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_unpacklo_epi32 (__mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_punpckldq512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); } - -__funline __m512i _mm512_unpacklo_epi64(__m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_punpcklqdq512_mask( - (__v8di)__A, (__v8di)__B, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_unpacklo_epi64 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_punpcklqdq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); } - -__funline __m512i _mm512_mask_unpacklo_epi64(__m512i __W, __mmask8 __U, - __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_punpcklqdq512_mask((__v8di)__A, (__v8di)__B, - (__v8di)__W, (__mmask8)__U); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_unpacklo_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_punpcklqdq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) __W, + (__mmask8) __U); } - -__funline __m512i _mm512_maskz_unpacklo_epi64(__mmask8 __U, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_punpcklqdq512_mask( - (__v8di)__A, (__v8di)__B, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_unpacklo_epi64 (__mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_punpcklqdq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); } - #ifdef __x86_64__ #ifdef __OPTIMIZE__ -__funline unsigned long long _mm_cvt_roundss_u64(__m128 __A, const int __R) { - return (unsigned long long)__builtin_ia32_vcvtss2usi64((__v4sf)__A, __R); +extern __inline unsigned long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundss_u64 (__m128 __A, const int __R) +{ + return (unsigned long long) __builtin_ia32_vcvtss2usi64 ((__v4sf) __A, __R); } - -__funline long long _mm_cvt_roundss_si64(__m128 __A, const int __R) { - return (long long)__builtin_ia32_vcvtss2si64((__v4sf)__A, __R); +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundss_si64 (__m128 __A, const int __R) +{ + return (long long) __builtin_ia32_vcvtss2si64 ((__v4sf) __A, __R); } - -__funline long long _mm_cvt_roundss_i64(__m128 __A, const int __R) { - return (long long)__builtin_ia32_vcvtss2si64((__v4sf)__A, __R); +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundss_i64 (__m128 __A, const int __R) +{ + return (long long) __builtin_ia32_vcvtss2si64 ((__v4sf) __A, __R); } - -__funline unsigned long long _mm_cvtt_roundss_u64(__m128 __A, const int __R) { - return (unsigned long long)__builtin_ia32_vcvttss2usi64((__v4sf)__A, __R); +extern __inline unsigned long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm_cvtt_roundss_u64 (__m128 __A, const int __R) +{ + return (unsigned long long) __builtin_ia32_vcvttss2usi64 ((__v4sf) __A, __R); } - -__funline long long _mm_cvtt_roundss_i64(__m128 __A, const int __R) { - return (long long)__builtin_ia32_vcvttss2si64((__v4sf)__A, __R); +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtt_roundss_i64 (__m128 __A, const int __R) +{ + return (long long) __builtin_ia32_vcvttss2si64 ((__v4sf) __A, __R); } - -__funline long long _mm_cvtt_roundss_si64(__m128 __A, const int __R) { - return (long long)__builtin_ia32_vcvttss2si64((__v4sf)__A, __R); +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtt_roundss_si64 (__m128 __A, const int __R) +{ + return (long long) __builtin_ia32_vcvttss2si64 ((__v4sf) __A, __R); } #else -#define _mm_cvt_roundss_u64(A, B) \ - ((unsigned long long)__builtin_ia32_vcvtss2usi64(A, B)) - +#define _mm_cvt_roundss_u64(A, B) ((unsigned long long)__builtin_ia32_vcvtss2usi64(A, B)) #define _mm_cvt_roundss_si64(A, B) ((long long)__builtin_ia32_vcvtss2si64(A, B)) - #define _mm_cvt_roundss_i64(A, B) ((long long)__builtin_ia32_vcvtss2si64(A, B)) - -#define _mm_cvtt_roundss_u64(A, B) \ - ((unsigned long long)__builtin_ia32_vcvttss2usi64(A, B)) - -#define _mm_cvtt_roundss_i64(A, B) \ - ((long long)__builtin_ia32_vcvttss2si64(A, B)) - -#define _mm_cvtt_roundss_si64(A, B) \ - ((long long)__builtin_ia32_vcvttss2si64(A, B)) +#define _mm_cvtt_roundss_u64(A, B) ((unsigned long long)__builtin_ia32_vcvttss2usi64(A, B)) +#define _mm_cvtt_roundss_i64(A, B) ((long long)__builtin_ia32_vcvttss2si64(A, B)) +#define _mm_cvtt_roundss_si64(A, B) ((long long)__builtin_ia32_vcvttss2si64(A, B)) #endif #endif - #ifdef __OPTIMIZE__ -__funline unsigned _mm_cvt_roundss_u32(__m128 __A, const int __R) { - return (unsigned)__builtin_ia32_vcvtss2usi32((__v4sf)__A, __R); +extern __inline unsigned +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundss_u32 (__m128 __A, const int __R) +{ + return (unsigned) __builtin_ia32_vcvtss2usi32 ((__v4sf) __A, __R); } - -__funline int _mm_cvt_roundss_si32(__m128 __A, const int __R) { - return (int)__builtin_ia32_vcvtss2si32((__v4sf)__A, __R); +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundss_si32 (__m128 __A, const int __R) +{ + return (int) __builtin_ia32_vcvtss2si32 ((__v4sf) __A, __R); } - -__funline int _mm_cvt_roundss_i32(__m128 __A, const int __R) { - return (int)__builtin_ia32_vcvtss2si32((__v4sf)__A, __R); +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundss_i32 (__m128 __A, const int __R) +{ + return (int) __builtin_ia32_vcvtss2si32 ((__v4sf) __A, __R); } - -__funline unsigned _mm_cvtt_roundss_u32(__m128 __A, const int __R) { - return (unsigned)__builtin_ia32_vcvttss2usi32((__v4sf)__A, __R); +extern __inline unsigned +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtt_roundss_u32 (__m128 __A, const int __R) +{ + return (unsigned) __builtin_ia32_vcvttss2usi32 ((__v4sf) __A, __R); } - -__funline int _mm_cvtt_roundss_i32(__m128 __A, const int __R) { - return (int)__builtin_ia32_vcvttss2si32((__v4sf)__A, __R); +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtt_roundss_i32 (__m128 __A, const int __R) +{ + return (int) __builtin_ia32_vcvttss2si32 ((__v4sf) __A, __R); } - -__funline int _mm_cvtt_roundss_si32(__m128 
__A, const int __R) { - return (int)__builtin_ia32_vcvttss2si32((__v4sf)__A, __R); +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtt_roundss_si32 (__m128 __A, const int __R) +{ + return (int) __builtin_ia32_vcvttss2si32 ((__v4sf) __A, __R); } #else #define _mm_cvt_roundss_u32(A, B) ((unsigned)__builtin_ia32_vcvtss2usi32(A, B)) - #define _mm_cvt_roundss_si32(A, B) ((int)__builtin_ia32_vcvtss2si32(A, B)) - #define _mm_cvt_roundss_i32(A, B) ((int)__builtin_ia32_vcvtss2si32(A, B)) - -#define _mm_cvtt_roundss_u32(A, B) \ - ((unsigned)__builtin_ia32_vcvttss2usi32(A, B)) - +#define _mm_cvtt_roundss_u32(A, B) ((unsigned)__builtin_ia32_vcvttss2usi32(A, B)) #define _mm_cvtt_roundss_si32(A, B) ((int)__builtin_ia32_vcvttss2si32(A, B)) - #define _mm_cvtt_roundss_i32(A, B) ((int)__builtin_ia32_vcvttss2si32(A, B)) #endif - #ifdef __x86_64__ #ifdef __OPTIMIZE__ -__funline unsigned long long _mm_cvt_roundsd_u64(__m128d __A, const int __R) { - return (unsigned long long)__builtin_ia32_vcvtsd2usi64((__v2df)__A, __R); +extern __inline unsigned long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundsd_u64 (__m128d __A, const int __R) +{ + return (unsigned long long) __builtin_ia32_vcvtsd2usi64 ((__v2df) __A, __R); } - -__funline long long _mm_cvt_roundsd_si64(__m128d __A, const int __R) { - return (long long)__builtin_ia32_vcvtsd2si64((__v2df)__A, __R); +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundsd_si64 (__m128d __A, const int __R) +{ + return (long long) __builtin_ia32_vcvtsd2si64 ((__v2df) __A, __R); } - -__funline long long _mm_cvt_roundsd_i64(__m128d __A, const int __R) { - return (long long)__builtin_ia32_vcvtsd2si64((__v2df)__A, __R); +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundsd_i64 (__m128d __A, const int __R) +{ + return (long long) __builtin_ia32_vcvtsd2si64 ((__v2df) __A, __R); } - -__funline unsigned long long _mm_cvtt_roundsd_u64(__m128d __A, const int __R) { - return (unsigned long long)__builtin_ia32_vcvttsd2usi64((__v2df)__A, __R); +extern __inline unsigned long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtt_roundsd_u64 (__m128d __A, const int __R) +{ + return (unsigned long long) __builtin_ia32_vcvttsd2usi64 ((__v2df) __A, __R); } - -__funline long long _mm_cvtt_roundsd_si64(__m128d __A, const int __R) { - return (long long)__builtin_ia32_vcvttsd2si64((__v2df)__A, __R); +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtt_roundsd_si64 (__m128d __A, const int __R) +{ + return (long long) __builtin_ia32_vcvttsd2si64 ((__v2df) __A, __R); } - -__funline long long _mm_cvtt_roundsd_i64(__m128d __A, const int __R) { - return (long long)__builtin_ia32_vcvttsd2si64((__v2df)__A, __R); +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtt_roundsd_i64 (__m128d __A, const int __R) +{ + return (long long) __builtin_ia32_vcvttsd2si64 ((__v2df) __A, __R); } #else -#define _mm_cvt_roundsd_u64(A, B) \ - ((unsigned long long)__builtin_ia32_vcvtsd2usi64(A, B)) - +#define _mm_cvt_roundsd_u64(A, B) ((unsigned long long)__builtin_ia32_vcvtsd2usi64(A, B)) #define _mm_cvt_roundsd_si64(A, B) ((long long)__builtin_ia32_vcvtsd2si64(A, B)) - #define _mm_cvt_roundsd_i64(A, B) ((long long)__builtin_ia32_vcvtsd2si64(A, B)) - -#define _mm_cvtt_roundsd_u64(A, B) \ - 
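Throughout these conversion groups the pattern is the same: under `__OPTIMIZE__` each operation is an inline function taking `const int __R`, otherwise a macro, because the rounding/SAE argument has to reach the instruction as an immediate. A short usage sketch, with illustrative values and a hypothetical `demo_cvt` wrapper (x86-64 and AVX-512F assumed):

    #include <immintrin.h>

    static long long demo_cvt (void)
    {
      __m128 x = _mm_set_ss (2.5f);
      /* explicit round-to-nearest-even, exceptions suppressed: 2.5f -> 2 */
      long long n = _mm_cvt_roundss_si64 (x, _MM_FROUND_TO_NEAREST_INT
                                             | _MM_FROUND_NO_EXC);
      /* the cvtt forms always truncate; only SAE may be passed: 2.5f -> 2 */
      long long t = _mm_cvtt_roundss_si64 (x, _MM_FROUND_NO_EXC);
      return n + t;
    }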
((unsigned long long)__builtin_ia32_vcvttsd2usi64(A, B)) - -#define _mm_cvtt_roundsd_si64(A, B) \ - ((long long)__builtin_ia32_vcvttsd2si64(A, B)) - -#define _mm_cvtt_roundsd_i64(A, B) \ - ((long long)__builtin_ia32_vcvttsd2si64(A, B)) +#define _mm_cvtt_roundsd_u64(A, B) ((unsigned long long)__builtin_ia32_vcvttsd2usi64(A, B)) +#define _mm_cvtt_roundsd_si64(A, B) ((long long)__builtin_ia32_vcvttsd2si64(A, B)) +#define _mm_cvtt_roundsd_i64(A, B) ((long long)__builtin_ia32_vcvttsd2si64(A, B)) #endif #endif - #ifdef __OPTIMIZE__ -__funline unsigned _mm_cvt_roundsd_u32(__m128d __A, const int __R) { - return (unsigned)__builtin_ia32_vcvtsd2usi32((__v2df)__A, __R); +extern __inline unsigned +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundsd_u32 (__m128d __A, const int __R) +{ + return (unsigned) __builtin_ia32_vcvtsd2usi32 ((__v2df) __A, __R); } - -__funline int _mm_cvt_roundsd_si32(__m128d __A, const int __R) { - return (int)__builtin_ia32_vcvtsd2si32((__v2df)__A, __R); +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundsd_si32 (__m128d __A, const int __R) +{ + return (int) __builtin_ia32_vcvtsd2si32 ((__v2df) __A, __R); } - -__funline int _mm_cvt_roundsd_i32(__m128d __A, const int __R) { - return (int)__builtin_ia32_vcvtsd2si32((__v2df)__A, __R); +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundsd_i32 (__m128d __A, const int __R) +{ + return (int) __builtin_ia32_vcvtsd2si32 ((__v2df) __A, __R); } - -__funline unsigned _mm_cvtt_roundsd_u32(__m128d __A, const int __R) { - return (unsigned)__builtin_ia32_vcvttsd2usi32((__v2df)__A, __R); +extern __inline unsigned +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtt_roundsd_u32 (__m128d __A, const int __R) +{ + return (unsigned) __builtin_ia32_vcvttsd2usi32 ((__v2df) __A, __R); } - -__funline int _mm_cvtt_roundsd_i32(__m128d __A, const int __R) { - return (int)__builtin_ia32_vcvttsd2si32((__v2df)__A, __R); +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtt_roundsd_i32 (__m128d __A, const int __R) +{ + return (int) __builtin_ia32_vcvttsd2si32 ((__v2df) __A, __R); } - -__funline int _mm_cvtt_roundsd_si32(__m128d __A, const int __R) { - return (int)__builtin_ia32_vcvttsd2si32((__v2df)__A, __R); +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtt_roundsd_si32 (__m128d __A, const int __R) +{ + return (int) __builtin_ia32_vcvttsd2si32 ((__v2df) __A, __R); } #else #define _mm_cvt_roundsd_u32(A, B) ((unsigned)__builtin_ia32_vcvtsd2usi32(A, B)) - #define _mm_cvt_roundsd_si32(A, B) ((int)__builtin_ia32_vcvtsd2si32(A, B)) - #define _mm_cvt_roundsd_i32(A, B) ((int)__builtin_ia32_vcvtsd2si32(A, B)) - -#define _mm_cvtt_roundsd_u32(A, B) \ - ((unsigned)__builtin_ia32_vcvttsd2usi32(A, B)) - +#define _mm_cvtt_roundsd_u32(A, B) ((unsigned)__builtin_ia32_vcvttsd2usi32(A, B)) #define _mm_cvtt_roundsd_si32(A, B) ((int)__builtin_ia32_vcvttsd2si32(A, B)) - #define _mm_cvtt_roundsd_i32(A, B) ((int)__builtin_ia32_vcvttsd2si32(A, B)) #endif - -__funline __m512d _mm512_movedup_pd(__m512d __A) { - return (__m512d)__builtin_ia32_movddup512_mask( - (__v8df)__A, (__v8df)_mm512_undefined_pd(), (__mmask8)-1); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_movedup_pd (__m512d __A) +{ + return (__m512d) __builtin_ia32_movddup512_mask ((__v8df) __A, + (__v8df) + 
_mm512_undefined_pd (), + (__mmask8) -1); } - -__funline __m512d _mm512_mask_movedup_pd(__m512d __W, __mmask8 __U, __m512d __A) { - return (__m512d)__builtin_ia32_movddup512_mask((__v8df)__A, (__v8df)__W, - (__mmask8)__U); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_movedup_pd (__m512d __W, __mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_movddup512_mask ((__v8df) __A, + (__v8df) __W, + (__mmask8) __U); } - -__funline __m512d _mm512_maskz_movedup_pd(__mmask8 __U, __m512d __A) { - return (__m512d)__builtin_ia32_movddup512_mask( - (__v8df)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_movedup_pd (__mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_movddup512_mask ((__v8df) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); } - -__funline __m512d _mm512_unpacklo_pd(__m512d __A, __m512d __B) { - return (__m512d)__builtin_ia32_unpcklpd512_mask( - (__v8df)__A, (__v8df)__B, (__v8df)_mm512_undefined_pd(), (__mmask8)-1); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_unpacklo_pd (__m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_unpcklpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1); } - -__funline __m512d _mm512_mask_unpacklo_pd(__m512d __W, __mmask8 __U, __m512d __A, - __m512d __B) { - return (__m512d)__builtin_ia32_unpcklpd512_mask((__v8df)__A, (__v8df)__B, - (__v8df)__W, (__mmask8)__U); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_unpacklo_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_unpcklpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __W, + (__mmask8) __U); } - -__funline __m512d _mm512_maskz_unpacklo_pd(__mmask8 __U, __m512d __A, - __m512d __B) { - return (__m512d)__builtin_ia32_unpcklpd512_mask( - (__v8df)__A, (__v8df)__B, (__v8df)_mm512_setzero_pd(), (__mmask8)__U); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_unpacklo_pd (__mmask8 __U, __m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_unpcklpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); } - -__funline __m512d _mm512_unpackhi_pd(__m512d __A, __m512d __B) { - return (__m512d)__builtin_ia32_unpckhpd512_mask( - (__v8df)__A, (__v8df)__B, (__v8df)_mm512_undefined_pd(), (__mmask8)-1); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_unpackhi_pd (__m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_unpckhpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1); } - -__funline __m512d _mm512_mask_unpackhi_pd(__m512d __W, __mmask8 __U, __m512d __A, - __m512d __B) { - return (__m512d)__builtin_ia32_unpckhpd512_mask((__v8df)__A, (__v8df)__B, - (__v8df)__W, (__mmask8)__U); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_unpackhi_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_unpckhpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __W, + (__mmask8) __U); } - -__funline __m512d _mm512_maskz_unpackhi_pd(__mmask8 __U, __m512d __A, - __m512d __B) { - return 
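The 512-bit unpacks here operate per 128-bit lane, not across the whole register: `_mm512_unpacklo_pd` interleaves the low double of each 128-bit chunk of the two sources, `_mm512_unpackhi_pd` the high one. A sketch of the lane pattern (`demo_unpack` is illustrative):

    #include <immintrin.h>

    static __m512d demo_unpack (__m512d a, __m512d b)
    {
      /* per 128-bit lane i: result lane = { a.lane[i].lo, b.lane[i].lo } */
      return _mm512_unpacklo_pd (a, b);
    }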
(__m512d)__builtin_ia32_unpckhpd512_mask( - (__v8df)__A, (__v8df)__B, (__v8df)_mm512_setzero_pd(), (__mmask8)__U); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_unpackhi_pd (__mmask8 __U, __m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_unpckhpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); } - -__funline __m512 _mm512_unpackhi_ps(__m512 __A, __m512 __B) { - return (__m512)__builtin_ia32_unpckhps512_mask((__v16sf)__A, (__v16sf)__B, - (__v16sf)_mm512_undefined_ps(), - (__mmask16)-1); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_unpackhi_ps (__m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_unpckhps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1); } - -__funline __m512 _mm512_mask_unpackhi_ps(__m512 __W, __mmask16 __U, __m512 __A, - __m512 __B) { - return (__m512)__builtin_ia32_unpckhps512_mask((__v16sf)__A, (__v16sf)__B, - (__v16sf)__W, (__mmask16)__U); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_unpackhi_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_unpckhps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U); } - -__funline __m512 _mm512_maskz_unpackhi_ps(__mmask16 __U, __m512 __A, __m512 __B) { - return (__m512)__builtin_ia32_unpckhps512_mask( - (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_unpackhi_ps (__mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_unpckhps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); } - #ifdef __OPTIMIZE__ -__funline __m512d _mm512_cvt_roundps_pd(__m256 __A, const int __R) { - return (__m512d)__builtin_ia32_cvtps2pd512_mask( - (__v8sf)__A, (__v8df)_mm512_undefined_pd(), (__mmask8)-1, __R); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundps_pd (__m256 __A, const int __R) +{ + return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1, __R); } - -__funline __m512d _mm512_mask_cvt_roundps_pd(__m512d __W, __mmask8 __U, - __m256 __A, const int __R) { - return (__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)__A, (__v8df)__W, - (__mmask8)__U, __R); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundps_pd (__m512d __W, __mmask8 __U, __m256 __A, + const int __R) +{ + return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A, + (__v8df) __W, + (__mmask8) __U, __R); } - -__funline __m512d _mm512_maskz_cvt_roundps_pd(__mmask8 __U, __m256 __A, - const int __R) { - return (__m512d)__builtin_ia32_cvtps2pd512_mask( - (__v8sf)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, __R); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundps_pd (__mmask8 __U, __m256 __A, const int __R) +{ + return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, __R); } - -__funline __m512 _mm512_cvt_roundph_ps(__m256i __A, const int __R) { - return (__m512)__builtin_ia32_vcvtph2ps512_mask( - (__v16hi)__A, 
(__v16sf)_mm512_undefined_ps(), (__mmask16)-1, __R); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundph_ps (__m256i __A, const int __R) +{ + return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1, __R); } - -__funline __m512 _mm512_mask_cvt_roundph_ps(__m512 __W, __mmask16 __U, - __m256i __A, const int __R) { - return (__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)__A, (__v16sf)__W, - (__mmask16)__U, __R); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundph_ps (__m512 __W, __mmask16 __U, __m256i __A, + const int __R) +{ + return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A, + (__v16sf) __W, + (__mmask16) __U, __R); } - -__funline __m512 _mm512_maskz_cvt_roundph_ps(__mmask16 __U, __m256i __A, - const int __R) { - return (__m512)__builtin_ia32_vcvtph2ps512_mask( - (__v16hi)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U, __R); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundph_ps (__mmask16 __U, __m256i __A, const int __R) +{ + return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, __R); } - -__funline __m256i _mm512_cvt_roundps_ph(__m512 __A, const int __I) { - return (__m256i)__builtin_ia32_vcvtps2ph512_mask( - (__v16sf)__A, __I, (__v16hi)_mm256_undefined_si256(), -1); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundps_ph (__m512 __A, const int __I) +{ + return (__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf) __A, + __I, + (__v16hi) + _mm256_undefined_si256 (), + -1); } - -__funline __m256i _mm512_cvtps_ph(__m512 __A, const int __I) { - return (__m256i)__builtin_ia32_vcvtps2ph512_mask( - (__v16sf)__A, __I, (__v16hi)_mm256_undefined_si256(), -1); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtps_ph (__m512 __A, const int __I) +{ + return (__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf) __A, + __I, + (__v16hi) + _mm256_undefined_si256 (), + -1); } - -__funline __m256i _mm512_mask_cvt_roundps_ph(__m256i __U, __mmask16 __W, - __m512 __A, const int __I) { - return (__m256i)__builtin_ia32_vcvtps2ph512_mask( - (__v16sf)__A, __I, (__v16hi)__U, (__mmask16)__W); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundps_ph (__m256i __U, __mmask16 __W, __m512 __A, + const int __I) +{ + return (__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf) __A, + __I, + (__v16hi) __U, + (__mmask16) __W); } - -__funline __m256i _mm512_mask_cvtps_ph(__m256i __U, __mmask16 __W, __m512 __A, - const int __I) { - return (__m256i)__builtin_ia32_vcvtps2ph512_mask( - (__v16sf)__A, __I, (__v16hi)__U, (__mmask16)__W); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtps_ph (__m256i __U, __mmask16 __W, __m512 __A, const int __I) +{ + return (__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf) __A, + __I, + (__v16hi) __U, + (__mmask16) __W); } - -__funline __m256i _mm512_maskz_cvt_roundps_ph(__mmask16 __W, __m512 __A, - const int __I) { - return (__m256i)__builtin_ia32_vcvtps2ph512_mask( - (__v16sf)__A, __I, (__v16hi)_mm256_setzero_si256(), (__mmask16)__W); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, 
__artificial__)) +_mm512_maskz_cvt_roundps_ph (__mmask16 __W, __m512 __A, const int __I) +{ + return (__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf) __A, + __I, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __W); } - -__funline __m256i _mm512_maskz_cvtps_ph(__mmask16 __W, __m512 __A, - const int __I) { - return (__m256i)__builtin_ia32_vcvtps2ph512_mask( - (__v16sf)__A, __I, (__v16hi)_mm256_setzero_si256(), (__mmask16)__W); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtps_ph (__mmask16 __W, __m512 __A, const int __I) +{ + return (__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf) __A, + __I, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __W); } #else -#define _mm512_cvt_roundps_pd(A, B) \ - (__m512d) \ - __builtin_ia32_cvtps2pd512_mask(A, (__v8df)_mm512_undefined_pd(), -1, B) - -#define _mm512_mask_cvt_roundps_pd(W, U, A, B) \ - (__m512d) __builtin_ia32_cvtps2pd512_mask(A, (__v8df)(W), U, B) - -#define _mm512_maskz_cvt_roundps_pd(U, A, B) \ - (__m512d) \ - __builtin_ia32_cvtps2pd512_mask(A, (__v8df)_mm512_setzero_pd(), U, B) - -#define _mm512_cvt_roundph_ps(A, B) \ - (__m512) __builtin_ia32_vcvtph2ps512_mask( \ - (__v16hi)(A), (__v16sf)_mm512_undefined_ps(), -1, B) - -#define _mm512_mask_cvt_roundph_ps(W, U, A, B) \ - (__m512) __builtin_ia32_vcvtph2ps512_mask((__v16hi)(A), (__v16sf)(W), U, B) - -#define _mm512_maskz_cvt_roundph_ps(U, A, B) \ - (__m512) __builtin_ia32_vcvtph2ps512_mask( \ - (__v16hi)(A), (__v16sf)_mm512_setzero_ps(), U, B) - -#define _mm512_cvt_roundps_ph(A, I) \ - ((__m256i)__builtin_ia32_vcvtps2ph512_mask( \ - (__v16sf)(__m512)A, (int)(I), (__v16hi)_mm256_undefined_si256(), -1)) -#define _mm512_cvtps_ph(A, I) \ - ((__m256i)__builtin_ia32_vcvtps2ph512_mask( \ - (__v16sf)(__m512)A, (int)(I), (__v16hi)_mm256_undefined_si256(), -1)) -#define _mm512_mask_cvt_roundps_ph(U, W, A, I) \ - ((__m256i)__builtin_ia32_vcvtps2ph512_mask( \ - (__v16sf)(__m512)A, (int)(I), (__v16hi)(__m256i)(U), (__mmask16)(W))) -#define _mm512_mask_cvtps_ph(U, W, A, I) \ - ((__m256i)__builtin_ia32_vcvtps2ph512_mask( \ - (__v16sf)(__m512)A, (int)(I), (__v16hi)(__m256i)(U), (__mmask16)(W))) -#define _mm512_maskz_cvt_roundps_ph(W, A, I) \ - ((__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)A, (int)(I), \ - (__v16hi)_mm256_setzero_si256(), \ - (__mmask16)(W))) -#define _mm512_maskz_cvtps_ph(W, A, I) \ - ((__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)A, (int)(I), \ - (__v16hi)_mm256_setzero_si256(), \ - (__mmask16)(W))) +#define _mm512_cvt_roundps_pd(A, B) (__m512d)__builtin_ia32_cvtps2pd512_mask(A, (__v8df)_mm512_undefined_pd(), -1, B) +#define _mm512_mask_cvt_roundps_pd(W, U, A, B) (__m512d)__builtin_ia32_cvtps2pd512_mask(A, (__v8df)(W), U, B) +#define _mm512_maskz_cvt_roundps_pd(U, A, B) (__m512d)__builtin_ia32_cvtps2pd512_mask(A, (__v8df)_mm512_setzero_pd(), U, B) +#define _mm512_cvt_roundph_ps(A, B) (__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(A), (__v16sf)_mm512_undefined_ps(), -1, B) +#define _mm512_mask_cvt_roundph_ps(W, U, A, B) (__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(A), (__v16sf)(W), U, B) +#define _mm512_maskz_cvt_roundph_ps(U, A, B) (__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(A), (__v16sf)_mm512_setzero_ps(), U, B) +#define _mm512_cvt_roundps_ph(A, I) ((__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf)(__m512) (A), (int) (I), (__v16hi)_mm256_undefined_si256 (), -1)) +#define _mm512_cvtps_ph(A, I) ((__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf)(__m512) (A), 
(int) (I), (__v16hi)_mm256_undefined_si256 (), -1)) +#define _mm512_mask_cvt_roundps_ph(U, W, A, I) ((__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf)(__m512) (A), (int) (I), (__v16hi)(__m256i)(U), (__mmask16) (W))) +#define _mm512_mask_cvtps_ph(U, W, A, I) ((__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf)(__m512) (A), (int) (I), (__v16hi)(__m256i)(U), (__mmask16) (W))) +#define _mm512_maskz_cvt_roundps_ph(W, A, I) ((__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf)(__m512) (A), (int) (I), (__v16hi)_mm256_setzero_si256 (), (__mmask16) (W))) +#define _mm512_maskz_cvtps_ph(W, A, I) ((__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf)(__m512) (A), (int) (I), (__v16hi)_mm256_setzero_si256 (), (__mmask16) (W))) #endif - #ifdef __OPTIMIZE__ -__funline __m256 _mm512_cvt_roundpd_ps(__m512d __A, const int __R) { - return (__m256)__builtin_ia32_cvtpd2ps512_mask( - (__v8df)__A, (__v8sf)_mm256_undefined_ps(), (__mmask8)-1, __R); +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundpd_ps (__m512d __A, const int __R) +{ + return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A, + (__v8sf) + _mm256_undefined_ps (), + (__mmask8) -1, __R); } - -__funline __m256 _mm512_mask_cvt_roundpd_ps(__m256 __W, __mmask8 __U, __m512d __A, - const int __R) { - return (__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)__A, (__v8sf)__W, - (__mmask8)__U, __R); +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundpd_ps (__m256 __W, __mmask8 __U, __m512d __A, + const int __R) +{ + return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A, + (__v8sf) __W, + (__mmask8) __U, __R); } - -__funline __m256 _mm512_maskz_cvt_roundpd_ps(__mmask8 __U, __m512d __A, - const int __R) { - return (__m256)__builtin_ia32_cvtpd2ps512_mask( - (__v8df)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U, __R); +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundpd_ps (__mmask8 __U, __m512d __A, const int __R) +{ + return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U, __R); } - -__funline __m128 _mm_cvt_roundsd_ss(__m128 __A, __m128d __B, const int __R) { - return (__m128)__builtin_ia32_cvtsd2ss_round((__v4sf)__A, (__v2df)__B, __R); +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundsd_ss (__m128 __A, __m128d __B, const int __R) +{ + return (__m128) __builtin_ia32_cvtsd2ss_round ((__v4sf) __A, + (__v2df) __B, + __R); } - -__funline __m128d _mm_cvt_roundss_sd(__m128d __A, __m128 __B, const int __R) { - return (__m128d)__builtin_ia32_cvtss2sd_round((__v2df)__A, (__v4sf)__B, __R); +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvt_roundsd_ss (__m128 __W, __mmask8 __U, __m128 __A, + __m128d __B, const int __R) +{ + return (__m128) __builtin_ia32_cvtsd2ss_mask_round ((__v4sf) __A, + (__v2df) __B, + (__v4sf) __W, + __U, + __R); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvt_roundsd_ss (__mmask8 __U, __m128 __A, + __m128d __B, const int __R) +{ + return (__m128) __builtin_ia32_cvtsd2ss_mask_round ((__v4sf) __A, + (__v2df) __B, + _mm_setzero_ps (), + __U, + __R); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundss_sd (__m128d __A, __m128 __B, const int __R) +{ + 
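The `ps`/`ph` conversions above are the float16 storage path: `_mm512_cvtps_ph` narrows sixteen floats into a `__m256i` of IEEE half-precision values, and `_mm512_cvtph_ps` widens them back. A round-trip sketch; `demo_f16` is illustrative and assumes the widening intrinsic defined earlier in this header:

    #include <immintrin.h>

    static __m512 demo_f16 (__m512 v)
    {
      /* narrow to 16 x fp16 with round-to-nearest-even */
      __m256i h = _mm512_cvtps_ph (v, _MM_FROUND_TO_NEAREST_INT);
      /* widen back; anything representable in fp16 survives exactly */
      return _mm512_cvtph_ps (h);
    }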
return (__m128d) __builtin_ia32_cvtss2sd_round ((__v2df) __A, + (__v4sf) __B, + __R); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvt_roundss_sd (__m128d __W, __mmask8 __U, __m128d __A, + __m128 __B, const int __R) +{ + return (__m128d) __builtin_ia32_cvtss2sd_mask_round ((__v2df) __A, + (__v4sf) __B, + (__v2df) __W, + __U, + __R); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvt_roundss_sd (__mmask8 __U, __m128d __A, + __m128 __B, const int __R) +{ + return (__m128d) __builtin_ia32_cvtss2sd_mask_round ((__v2df) __A, + (__v4sf) __B, + _mm_setzero_pd (), + __U, + __R); } #else -#define _mm512_cvt_roundpd_ps(A, B) \ - (__m256) \ - __builtin_ia32_cvtpd2ps512_mask(A, (__v8sf)_mm256_undefined_ps(), -1, B) - -#define _mm512_mask_cvt_roundpd_ps(W, U, A, B) \ - (__m256) __builtin_ia32_cvtpd2ps512_mask(A, (__v8sf)(W), U, B) - -#define _mm512_maskz_cvt_roundpd_ps(U, A, B) \ - (__m256) __builtin_ia32_cvtpd2ps512_mask(A, (__v8sf)_mm256_setzero_ps(), U, B) - -#define _mm_cvt_roundsd_ss(A, B, C) \ - (__m128) __builtin_ia32_cvtsd2ss_round(A, B, C) - -#define _mm_cvt_roundss_sd(A, B, C) \ - (__m128d) __builtin_ia32_cvtss2sd_round(A, B, C) +#define _mm512_cvt_roundpd_ps(A, B) (__m256)__builtin_ia32_cvtpd2ps512_mask(A, (__v8sf)_mm256_undefined_ps(), -1, B) +#define _mm512_mask_cvt_roundpd_ps(W, U, A, B) (__m256)__builtin_ia32_cvtpd2ps512_mask(A, (__v8sf)(W), U, B) +#define _mm512_maskz_cvt_roundpd_ps(U, A, B) (__m256)__builtin_ia32_cvtpd2ps512_mask(A, (__v8sf)_mm256_setzero_ps(), U, B) +#define _mm_cvt_roundsd_ss(A, B, C) (__m128)__builtin_ia32_cvtsd2ss_round(A, B, C) +#define _mm_mask_cvt_roundsd_ss(W, U, A, B, C) (__m128)__builtin_ia32_cvtsd2ss_mask_round ((A), (B), (W), (U), (C)) +#define _mm_maskz_cvt_roundsd_ss(U, A, B, C) (__m128)__builtin_ia32_cvtsd2ss_mask_round ((A), (B), _mm_setzero_ps (), (U), (C)) +#define _mm_cvt_roundss_sd(A, B, C) (__m128d)__builtin_ia32_cvtss2sd_round(A, B, C) +#define _mm_mask_cvt_roundss_sd(W, U, A, B, C) (__m128d)__builtin_ia32_cvtss2sd_mask_round ((A), (B), (W), (U), (C)) +#define _mm_maskz_cvt_roundss_sd(U, A, B, C) (__m128d)__builtin_ia32_cvtss2sd_mask_round ((A), (B), _mm_setzero_pd (), (U), (C)) #endif - -__funline void _mm512_stream_si512(__m512i *__P, __m512i __A) { - __builtin_ia32_movntdq512((__v8di *)__P, (__v8di)__A); +#define _mm_mask_cvtss_sd(W, U, A, B) _mm_mask_cvt_roundss_sd ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION) +#define _mm_maskz_cvtss_sd(U, A, B) _mm_maskz_cvt_roundss_sd ((U), (A), (B), _MM_FROUND_CUR_DIRECTION) +#define _mm_mask_cvtsd_ss(W, U, A, B) _mm_mask_cvt_roundsd_ss ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION) +#define _mm_maskz_cvtsd_ss(U, A, B) _mm_maskz_cvt_roundsd_ss ((U), (A), (B), _MM_FROUND_CUR_DIRECTION) +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_stream_si512 (__m512i * __P, __m512i __A) +{ + __builtin_ia32_movntdq512 ((__v8di *) __P, (__v8di) __A); } - -__funline void _mm512_stream_ps(float *__P, __m512 __A) { - __builtin_ia32_movntps512(__P, (__v16sf)__A); +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_stream_ps (float *__P, __m512 __A) +{ + __builtin_ia32_movntps512 (__P, (__v16sf) __A); } - -__funline void _mm512_stream_pd(double *__P, __m512d __A) { - __builtin_ia32_movntpd512(__P, (__v8df)__A); +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
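The `_mm_mask_cvt_roundsd_ss`/`_mm_maskz_cvt_roundss_sd` group and the four `_MM_FROUND_CUR_DIRECTION` wrappers are new relative to the removed `__funline` header, which carried only the unmasked scalar conversions. Merge-masking keeps the destination lane when the mask bit is clear; a sketch with illustrative values:

    #include <immintrin.h>

    static __m128 demo_mask_cvt (void)
    {
      __m128  w = _mm_set_ss (9.0f);  /* fallback for lane 0 */
      __m128  a = _mm_set_ss (1.0f);  /* supplies lanes 1..3 of the result */
      __m128d b = _mm_set_sd (2.5);   /* value that would be converted */
      /* mask bit 0 is clear, so lane 0 stays 9.0f instead of becoming 2.5f */
      return _mm_mask_cvtsd_ss (w, (__mmask8) 0, a, b);
    }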
+_mm512_stream_pd (double *__P, __m512d __A) +{ + __builtin_ia32_movntpd512 (__P, (__v8df) __A); } - -__funline __m512i _mm512_stream_load_si512(void *__P) { - return __builtin_ia32_movntdqa512((__v8di *)__P); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_stream_load_si512 (void *__P) +{ + return __builtin_ia32_movntdqa512 ((__v8di *)__P); } - -typedef enum { +typedef enum +{ _MM_MANT_NORM_1_2, _MM_MANT_NORM_p5_2, _MM_MANT_NORM_p5_1, _MM_MANT_NORM_p75_1p5 } _MM_MANTISSA_NORM_ENUM; - -typedef enum { +typedef enum +{ _MM_MANT_SIGN_src, _MM_MANT_SIGN_zero, _MM_MANT_SIGN_nan } _MM_MANTISSA_SIGN_ENUM; - #ifdef __OPTIMIZE__ -__funline __m128 _mm_getexp_round_ss(__m128 __A, __m128 __B, const int __R) { - return (__m128)__builtin_ia32_getexpss128_round((__v4sf)__A, (__v4sf)__B, - __R); +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_getexp_round_ss (__m128 __A, __m128 __B, const int __R) +{ + return (__m128) __builtin_ia32_getexpss128_round ((__v4sf) __A, + (__v4sf) __B, + __R); } - -__funline __m128 _mm_mask_getexp_round_ss(__m128 __W, __mmask8 __U, __m128 __A, - __m128 __B, const int __R) { - return (__m128)__builtin_ia32_getexpss_mask_round( - (__v4sf)__A, (__v4sf)__B, (__v4sf)__W, (__mmask8)__U, __R); +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_getexp_round_ss (__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B, const int __R) +{ + return (__m128) __builtin_ia32_getexpss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U, __R); } - -__funline __m128 _mm_maskz_getexp_round_ss(__mmask8 __U, __m128 __A, __m128 __B, - const int __R) { - return (__m128)__builtin_ia32_getexpss_mask_round( - (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U, __R); +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_getexp_round_ss (__mmask8 __U, __m128 __A, __m128 __B, + const int __R) +{ + return (__m128) __builtin_ia32_getexpss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U, __R); } - -__funline __m128d _mm_getexp_round_sd(__m128d __A, __m128d __B, const int __R) { - return (__m128d)__builtin_ia32_getexpsd128_round((__v2df)__A, (__v2df)__B, - __R); +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_getexp_round_sd (__m128d __A, __m128d __B, const int __R) +{ + return (__m128d) __builtin_ia32_getexpsd128_round ((__v2df) __A, + (__v2df) __B, + __R); } - -__funline __m128d _mm_mask_getexp_round_sd(__m128d __W, __mmask8 __U, __m128d __A, - __m128d __B, const int __R) { - return (__m128d)__builtin_ia32_getexpsd_mask_round( - (__v2df)__A, (__v2df)__B, (__v2df)__W, (__mmask8)__U, __R); +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_getexp_round_sd (__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B, const int __R) +{ + return (__m128d) __builtin_ia32_getexpsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U, __R); } - -__funline __m128d _mm_maskz_getexp_round_sd(__mmask8 __U, __m128d __A, - __m128d __B, const int __R) { - return (__m128d)__builtin_ia32_getexpsd_mask_round( - (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U, __R); +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_getexp_round_sd (__mmask8 __U, __m128d __A, 
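The `_mm512_stream_*` stores above write with a non-temporal hint, so the target lines are not pulled into the cache; they require 64-byte-aligned pointers and are weakly ordered, so an `_mm_sfence` is needed before the data can safely be published to another thread. A copy-loop sketch (`demo_stream_copy` is illustrative; both pointers are assumed 64-byte aligned):

    #include <stddef.h>
    #include <immintrin.h>

    static void demo_stream_copy (float *dst, const float *src, size_t n)
    {
      size_t i;
      for (i = 0; i + 16 <= n; i += 16)
        _mm512_stream_ps (dst + i, _mm512_load_ps (src + i));
      _mm_sfence ();  /* order the NT stores before any later publication */
    }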
__m128d __B, + const int __R) +{ + return (__m128d) __builtin_ia32_getexpsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U, __R); } - -__funline __m512 _mm512_getexp_round_ps(__m512 __A, const int __R) { - return (__m512)__builtin_ia32_getexpps512_mask( - (__v16sf)__A, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, __R); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_getexp_round_ps (__m512 __A, const int __R) +{ + return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1, __R); } - -__funline __m512 _mm512_mask_getexp_round_ps(__m512 __W, __mmask16 __U, - __m512 __A, const int __R) { - return (__m512)__builtin_ia32_getexpps512_mask((__v16sf)__A, (__v16sf)__W, - (__mmask16)__U, __R); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_getexp_round_ps (__m512 __W, __mmask16 __U, __m512 __A, + const int __R) +{ + return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) __U, __R); } - -__funline __m512 _mm512_maskz_getexp_round_ps(__mmask16 __U, __m512 __A, - const int __R) { - return (__m512)__builtin_ia32_getexpps512_mask( - (__v16sf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U, __R); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_getexp_round_ps (__mmask16 __U, __m512 __A, const int __R) +{ + return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, __R); } - -__funline __m512d _mm512_getexp_round_pd(__m512d __A, const int __R) { - return (__m512d)__builtin_ia32_getexppd512_mask( - (__v8df)__A, (__v8df)_mm512_undefined_pd(), (__mmask8)-1, __R); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_getexp_round_pd (__m512d __A, const int __R) +{ + return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1, __R); } - -__funline __m512d _mm512_mask_getexp_round_pd(__m512d __W, __mmask8 __U, - __m512d __A, const int __R) { - return (__m512d)__builtin_ia32_getexppd512_mask((__v8df)__A, (__v8df)__W, - (__mmask8)__U, __R); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_getexp_round_pd (__m512d __W, __mmask8 __U, __m512d __A, + const int __R) +{ + return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A, + (__v8df) __W, + (__mmask8) __U, __R); } - -__funline __m512d _mm512_maskz_getexp_round_pd(__mmask8 __U, __m512d __A, - const int __R) { - return (__m512d)__builtin_ia32_getexppd512_mask( - (__v8df)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, __R); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_getexp_round_pd (__mmask8 __U, __m512d __A, const int __R) +{ + return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, __R); } - -__funline __m512d _mm512_getmant_round_pd(__m512d __A, _MM_MANTISSA_NORM_ENUM __B, - _MM_MANTISSA_SIGN_ENUM __C, - const int __R) { - return (__m512d)__builtin_ia32_getmantpd512_mask( - (__v8df)__A, (__C << 2) | __B, _mm512_undefined_pd(), (__mmask8)-1, __R); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_getmant_round_pd (__m512d __A, _MM_MANTISSA_NORM_ENUM 
__B, + _MM_MANTISSA_SIGN_ENUM __C, const int __R) +{ + return (__m512d) __builtin_ia32_getmantpd512_mask ((__v8df) __A, + (__C << 2) | __B, + _mm512_undefined_pd (), + (__mmask8) -1, __R); } - -__funline __m512d _mm512_mask_getmant_round_pd(__m512d __W, __mmask8 __U, - __m512d __A, - _MM_MANTISSA_NORM_ENUM __B, - _MM_MANTISSA_SIGN_ENUM __C, - const int __R) { - return (__m512d)__builtin_ia32_getmantpd512_mask( - (__v8df)__A, (__C << 2) | __B, (__v8df)__W, __U, __R); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_getmant_round_pd (__m512d __W, __mmask8 __U, __m512d __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C, const int __R) +{ + return (__m512d) __builtin_ia32_getmantpd512_mask ((__v8df) __A, + (__C << 2) | __B, + (__v8df) __W, __U, + __R); } - -__funline __m512d _mm512_maskz_getmant_round_pd(__mmask8 __U, __m512d __A, - _MM_MANTISSA_NORM_ENUM __B, - _MM_MANTISSA_SIGN_ENUM __C, - const int __R) { - return (__m512d)__builtin_ia32_getmantpd512_mask( - (__v8df)__A, (__C << 2) | __B, (__v8df)_mm512_setzero_pd(), __U, __R); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_getmant_round_pd (__mmask8 __U, __m512d __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C, const int __R) +{ + return (__m512d) __builtin_ia32_getmantpd512_mask ((__v8df) __A, + (__C << 2) | __B, + (__v8df) + _mm512_setzero_pd (), + __U, __R); } - -__funline __m512 _mm512_getmant_round_ps(__m512 __A, _MM_MANTISSA_NORM_ENUM __B, - _MM_MANTISSA_SIGN_ENUM __C, - const int __R) { - return (__m512)__builtin_ia32_getmantps512_mask( - (__v16sf)__A, (__C << 2) | __B, _mm512_undefined_ps(), (__mmask16)-1, - __R); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_getmant_round_ps (__m512 __A, _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C, const int __R) +{ + return (__m512) __builtin_ia32_getmantps512_mask ((__v16sf) __A, + (__C << 2) | __B, + _mm512_undefined_ps (), + (__mmask16) -1, __R); } - -__funline __m512 _mm512_mask_getmant_round_ps(__m512 __W, __mmask16 __U, - __m512 __A, - _MM_MANTISSA_NORM_ENUM __B, - _MM_MANTISSA_SIGN_ENUM __C, - const int __R) { - return (__m512)__builtin_ia32_getmantps512_mask( - (__v16sf)__A, (__C << 2) | __B, (__v16sf)__W, __U, __R); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_getmant_round_ps (__m512 __W, __mmask16 __U, __m512 __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C, const int __R) +{ + return (__m512) __builtin_ia32_getmantps512_mask ((__v16sf) __A, + (__C << 2) | __B, + (__v16sf) __W, __U, + __R); } - -__funline __m512 _mm512_maskz_getmant_round_ps(__mmask16 __U, __m512 __A, - _MM_MANTISSA_NORM_ENUM __B, - _MM_MANTISSA_SIGN_ENUM __C, - const int __R) { - return (__m512)__builtin_ia32_getmantps512_mask( - (__v16sf)__A, (__C << 2) | __B, (__v16sf)_mm512_setzero_ps(), __U, __R); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_getmant_round_ps (__mmask16 __U, __m512 __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C, const int __R) +{ + return (__m512) __builtin_ia32_getmantps512_mask ((__v16sf) __A, + (__C << 2) | __B, + (__v16sf) + _mm512_setzero_ps (), + __U, __R); } - -__funline __m128d _mm_getmant_round_sd(__m128d __A, __m128d __B, - _MM_MANTISSA_NORM_ENUM __C, - _MM_MANTISSA_SIGN_ENUM __D, - const int __R) { - return 
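`getexp` and `getmant` form a frexp-style decomposition: for finite nonzero x, `getmant (x, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_src) * 2^getexp (x) == x`, with the mantissa normalized into [1, 2). A worked sketch using the `_round` forms defined here with `_MM_FROUND_CUR_DIRECTION` (`demo_getmant` is illustrative):

    #include <immintrin.h>

    static void demo_getmant (void)
    {
      __m512d x = _mm512_set1_pd (24.0);  /* 24 == 1.5 * 2^4 */
      __m512d e = _mm512_getexp_round_pd (x, _MM_FROUND_CUR_DIRECTION);
      __m512d m = _mm512_getmant_round_pd (x, _MM_MANT_NORM_1_2,
                                           _MM_MANT_SIGN_src,
                                           _MM_FROUND_CUR_DIRECTION);
      /* every lane: e == 4.0, m == 1.5, so m * 2^e reconstructs x */
      (void) e; (void) m;
    }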
(__m128d)__builtin_ia32_getmantsd_round((__v2df)__A, (__v2df)__B, - (__D << 2) | __C, __R); +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_getmant_round_sd (__m128d __A, __m128d __B, + _MM_MANTISSA_NORM_ENUM __C, + _MM_MANTISSA_SIGN_ENUM __D, const int __R) +{ + return (__m128d) __builtin_ia32_getmantsd_round ((__v2df) __A, + (__v2df) __B, + (__D << 2) | __C, + __R); } - -__funline __m128d _mm_mask_getmant_round_sd(__m128d __W, __mmask8 __U, - __m128d __A, __m128d __B, - _MM_MANTISSA_NORM_ENUM __C, - _MM_MANTISSA_SIGN_ENUM __D, - const int __R) { - return (__m128d)__builtin_ia32_getmantsd_mask_round( - (__v2df)__A, (__v2df)__B, (__D << 2) | __C, (__v2df)__W, __U, __R); +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_getmant_round_sd (__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B, _MM_MANTISSA_NORM_ENUM __C, + _MM_MANTISSA_SIGN_ENUM __D, const int __R) +{ + return (__m128d) __builtin_ia32_getmantsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__D << 2) | __C, + (__v2df) __W, + __U, __R); } - -__funline __m128d _mm_maskz_getmant_round_sd(__mmask8 __U, __m128d __A, - __m128d __B, - _MM_MANTISSA_NORM_ENUM __C, - _MM_MANTISSA_SIGN_ENUM __D, - const int __R) { - return (__m128d)__builtin_ia32_getmantsd_mask_round( - (__v2df)__A, (__v2df)__B, (__D << 2) | __C, (__v2df)_mm_setzero_pd(), __U, - __R); +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_getmant_round_sd (__mmask8 __U, __m128d __A, __m128d __B, + _MM_MANTISSA_NORM_ENUM __C, + _MM_MANTISSA_SIGN_ENUM __D, const int __R) +{ + return (__m128d) __builtin_ia32_getmantsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__D << 2) | __C, + (__v2df) + _mm_setzero_pd(), + __U, __R); } - -__funline __m128 _mm_getmant_round_ss(__m128 __A, __m128 __B, - _MM_MANTISSA_NORM_ENUM __C, - _MM_MANTISSA_SIGN_ENUM __D, const int __R) { - return (__m128)__builtin_ia32_getmantss_round((__v4sf)__A, (__v4sf)__B, - (__D << 2) | __C, __R); +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_getmant_round_ss (__m128 __A, __m128 __B, + _MM_MANTISSA_NORM_ENUM __C, + _MM_MANTISSA_SIGN_ENUM __D, const int __R) +{ + return (__m128) __builtin_ia32_getmantss_round ((__v4sf) __A, + (__v4sf) __B, + (__D << 2) | __C, + __R); } - -__funline __m128 _mm_mask_getmant_round_ss(__m128 __W, __mmask8 __U, __m128 __A, - __m128 __B, _MM_MANTISSA_NORM_ENUM __C, - _MM_MANTISSA_SIGN_ENUM __D, - const int __R) { - return (__m128)__builtin_ia32_getmantss_mask_round( - (__v4sf)__A, (__v4sf)__B, (__D << 2) | __C, (__v4sf)__W, __U, __R); +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_getmant_round_ss (__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B, _MM_MANTISSA_NORM_ENUM __C, + _MM_MANTISSA_SIGN_ENUM __D, const int __R) +{ + return (__m128) __builtin_ia32_getmantss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__D << 2) | __C, + (__v4sf) __W, + __U, __R); } - -__funline __m128 _mm_maskz_getmant_round_ss(__mmask8 __U, __m128 __A, __m128 __B, - _MM_MANTISSA_NORM_ENUM __C, - _MM_MANTISSA_SIGN_ENUM __D, - const int __R) { - return (__m128)__builtin_ia32_getmantss_mask_round( - (__v4sf)__A, (__v4sf)__B, (__D << 2) | __C, (__v4sf)_mm_setzero_ps(), __U, - __R); -} - -#else -#define _mm512_getmant_round_pd(X, B, C, R) \ - ((__m512d)__builtin_ia32_getmantpd512_mask( \ - (__v8df)(__m512d)(X), (int)(((C) << 2) | (B)), \ - 
(__v8df)(__m512d)_mm512_undefined_pd(), (__mmask8)-1, (R))) - -#define _mm512_mask_getmant_round_pd(W, U, X, B, C, R) \ - ((__m512d)__builtin_ia32_getmantpd512_mask( \ - (__v8df)(__m512d)(X), (int)(((C) << 2) | (B)), (__v8df)(__m512d)(W), \ - (__mmask8)(U), (R))) - -#define _mm512_maskz_getmant_round_pd(U, X, B, C, R) \ - ((__m512d)__builtin_ia32_getmantpd512_mask( \ - (__v8df)(__m512d)(X), (int)(((C) << 2) | (B)), \ - (__v8df)(__m512d)_mm512_setzero_pd(), (__mmask8)(U), (R))) -#define _mm512_getmant_round_ps(X, B, C, R) \ - ((__m512)__builtin_ia32_getmantps512_mask( \ - (__v16sf)(__m512)(X), (int)(((C) << 2) | (B)), \ - (__v16sf)(__m512)_mm512_undefined_ps(), (__mmask16)-1, (R))) - -#define _mm512_mask_getmant_round_ps(W, U, X, B, C, R) \ - ((__m512)__builtin_ia32_getmantps512_mask( \ - (__v16sf)(__m512)(X), (int)(((C) << 2) | (B)), (__v16sf)(__m512)(W), \ - (__mmask16)(U), (R))) - -#define _mm512_maskz_getmant_round_ps(U, X, B, C, R) \ - ((__m512)__builtin_ia32_getmantps512_mask( \ - (__v16sf)(__m512)(X), (int)(((C) << 2) | (B)), \ - (__v16sf)(__m512)_mm512_setzero_ps(), (__mmask16)(U), (R))) -#define _mm_getmant_round_sd(X, Y, C, D, R) \ - ((__m128d)__builtin_ia32_getmantsd_round((__v2df)(__m128d)(X), \ - (__v2df)(__m128d)(Y), \ - (int)(((D) << 2) | (C)), (R))) - -#define _mm_mask_getmant_round_sd(W, U, X, Y, C, D, R) \ - ((__m128d)__builtin_ia32_getmantsd_mask_round( \ - (__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (int)(((D) << 2) | (C)), \ - (__v2df)(__m128d)(W), (__mmask8)(U), (R))) - -#define _mm_maskz_getmant_round_sd(U, X, Y, C, D, R) \ - ((__m128d)__builtin_ia32_getmantsd_mask_round( \ - (__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (int)(((D) << 2) | (C)), \ - (__v2df)(__m128d)_mm_setzero_pd(), (__mmask8)(U), (R))) - -#define _mm_getmant_round_ss(X, Y, C, D, R) \ - ((__m128)__builtin_ia32_getmantss_round( \ - (__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (int)(((D) << 2) | (C)), (R))) - -#define _mm_mask_getmant_round_ss(W, U, X, Y, C, D, R) \ - ((__m128)__builtin_ia32_getmantss_mask_round( \ - (__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (int)(((D) << 2) | (C)), \ - (__v4sf)(__m128)(W), (__mmask8)(U), (R))) - -#define _mm_maskz_getmant_round_ss(U, X, Y, C, D, R) \ - ((__m128)__builtin_ia32_getmantss_mask_round( \ - (__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (int)(((D) << 2) | (C)), \ - (__v4sf)(__m128)_mm_setzero_ps(), (__mmask8)(U), (R))) - -#define _mm_getexp_round_ss(A, B, R) \ - ((__m128)__builtin_ia32_getexpss128_round((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), R)) - -#define _mm_mask_getexp_round_ss(W, U, A, B, C) \ - (__m128) __builtin_ia32_getexpss_mask_round(A, B, W, U, C) - -#define _mm_maskz_getexp_round_ss(U, A, B, C) \ - (__m128) \ - __builtin_ia32_getexpss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C) - -#define _mm_getexp_round_sd(A, B, R) \ - ((__m128d)__builtin_ia32_getexpsd128_round((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), R)) - -#define _mm_mask_getexp_round_sd(W, U, A, B, C) \ - (__m128d) __builtin_ia32_getexpsd_mask_round(A, B, W, U, C) - -#define _mm_maskz_getexp_round_sd(U, A, B, C) \ - (__m128d) \ - __builtin_ia32_getexpsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C) - -#define _mm512_getexp_round_ps(A, R) \ - ((__m512)__builtin_ia32_getexpps512_mask( \ - (__v16sf)(__m512)(A), (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, R)) - -#define _mm512_mask_getexp_round_ps(W, U, A, R) \ - ((__m512)__builtin_ia32_getexpps512_mask( \ - (__v16sf)(__m512)(A), (__v16sf)(__m512)(W), (__mmask16)(U), R)) - -#define _mm512_maskz_getexp_round_ps(U, A, R) \ - 
((__m512)__builtin_ia32_getexpps512_mask( \ - (__v16sf)(__m512)(A), (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), R)) - -#define _mm512_getexp_round_pd(A, R) \ - ((__m512d)__builtin_ia32_getexppd512_mask( \ - (__v8df)(__m512d)(A), (__v8df)_mm512_undefined_pd(), (__mmask8)-1, R)) - -#define _mm512_mask_getexp_round_pd(W, U, A, R) \ - ((__m512d)__builtin_ia32_getexppd512_mask( \ - (__v8df)(__m512d)(A), (__v8df)(__m512d)(W), (__mmask8)(U), R)) - -#define _mm512_maskz_getexp_round_pd(U, A, R) \ - ((__m512d)__builtin_ia32_getexppd512_mask( \ - (__v8df)(__m512d)(A), (__v8df)_mm512_setzero_pd(), (__mmask8)(U), R)) -#endif - -#ifdef __OPTIMIZE__ -__funline __m512 _mm512_roundscale_round_ps(__m512 __A, const int __imm, - const int __R) { - return (__m512)__builtin_ia32_rndscaleps_mask( - (__v16sf)__A, __imm, (__v16sf)_mm512_undefined_ps(), -1, __R); -} - -__funline __m512 _mm512_mask_roundscale_round_ps(__m512 __A, __mmask16 __B, - __m512 __C, const int __imm, - const int __R) { - return (__m512)__builtin_ia32_rndscaleps_mask( - (__v16sf)__C, __imm, (__v16sf)__A, (__mmask16)__B, __R); -} - -__funline __m512 _mm512_maskz_roundscale_round_ps(__mmask16 __A, __m512 __B, - const int __imm, - const int __R) { - return (__m512)__builtin_ia32_rndscaleps_mask( - (__v16sf)__B, __imm, (__v16sf)_mm512_setzero_ps(), (__mmask16)__A, __R); -} - -__funline __m512d _mm512_roundscale_round_pd(__m512d __A, const int __imm, - const int __R) { - return (__m512d)__builtin_ia32_rndscalepd_mask( - (__v8df)__A, __imm, (__v8df)_mm512_undefined_pd(), -1, __R); -} - -__funline __m512d _mm512_mask_roundscale_round_pd(__m512d __A, __mmask8 __B, - __m512d __C, const int __imm, - const int __R) { - return (__m512d)__builtin_ia32_rndscalepd_mask( - (__v8df)__C, __imm, (__v8df)__A, (__mmask8)__B, __R); -} - -__funline __m512d _mm512_maskz_roundscale_round_pd(__mmask8 __A, __m512d __B, - const int __imm, - const int __R) { - return (__m512d)__builtin_ia32_rndscalepd_mask( - (__v8df)__B, __imm, (__v8df)_mm512_setzero_pd(), (__mmask8)__A, __R); -} - -__funline __m128 _mm_roundscale_round_ss(__m128 __A, __m128 __B, const int __imm, - const int __R) { - return (__m128)__builtin_ia32_rndscaless_round((__v4sf)__A, (__v4sf)__B, - __imm, __R); -} - -__funline __m128d _mm_roundscale_round_sd(__m128d __A, __m128d __B, - const int __imm, const int __R) { - return (__m128d)__builtin_ia32_rndscalesd_round((__v2df)__A, (__v2df)__B, - __imm, __R); -} - -#else -#define _mm512_roundscale_round_ps(A, B, R) \ - ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(A), (int)(B), \ - (__v16sf)_mm512_undefined_ps(), \ - (__mmask16)(-1), R)) -#define _mm512_mask_roundscale_round_ps(A, B, C, D, R) \ - ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(C), (int)(D), \ - (__v16sf)(__m512)(A), \ - (__mmask16)(B), R)) -#define _mm512_maskz_roundscale_round_ps(A, B, C, R) \ - ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(B), (int)(C), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(A), R)) -#define _mm512_roundscale_round_pd(A, B, R) \ - ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(A), (int)(B), \ - (__v8df)_mm512_undefined_pd(), \ - (__mmask8)(-1), R)) -#define _mm512_mask_roundscale_round_pd(A, B, C, D, R) \ - ((__m512d)__builtin_ia32_rndscalepd_mask( \ - (__v8df)(__m512d)(C), (int)(D), (__v8df)(__m512d)(A), (__mmask8)(B), R)) -#define _mm512_maskz_roundscale_round_pd(A, B, C, R) \ - ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(B), (int)(C), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(A), R)) -#define 
-#define _mm_roundscale_round_ss(A, B, C, R) \
-  ((__m128)__builtin_ia32_rndscaless_round((__v4sf)(__m128)(A), \
-      (__v4sf)(__m128)(B), (int)(C), R))
-#define _mm_roundscale_round_sd(A, B, C, R) \
-  ((__m128d)__builtin_ia32_rndscalesd_round( \
-      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), R))
-#endif
-
-__funline __m512 _mm512_floor_ps(__m512 __A) {
-  return (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)__A, _MM_FROUND_FLOOR,
-      (__v16sf)__A, -1,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m512d _mm512_floor_pd(__m512d __A) {
-  return (__m512d)__builtin_ia32_rndscalepd_mask(
-      (__v8df)__A, _MM_FROUND_FLOOR, (__v8df)__A, -1, _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m512 _mm512_ceil_ps(__m512 __A) {
-  return (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)__A, _MM_FROUND_CEIL,
-      (__v16sf)__A, -1,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m512d _mm512_ceil_pd(__m512d __A) {
-  return (__m512d)__builtin_ia32_rndscalepd_mask(
-      (__v8df)__A, _MM_FROUND_CEIL, (__v8df)__A, -1, _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m512 _mm512_mask_floor_ps(__m512 __W, __mmask16 __U, __m512 __A) {
-  return (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)__A, _MM_FROUND_FLOOR,
-      (__v16sf)__W, __U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m512d _mm512_mask_floor_pd(__m512d __W, __mmask8 __U, __m512d __A) {
-  return (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)__A, _MM_FROUND_FLOOR,
-      (__v8df)__W, __U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m512 _mm512_mask_ceil_ps(__m512 __W, __mmask16 __U, __m512 __A) {
-  return (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)__A, _MM_FROUND_CEIL,
-      (__v16sf)__W, __U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m512d _mm512_mask_ceil_pd(__m512d __W, __mmask8 __U, __m512d __A) {
-  return (__m512d)__builtin_ia32_rndscalepd_mask(
-      (__v8df)__A, _MM_FROUND_CEIL, (__v8df)__W, __U, _MM_FROUND_CUR_DIRECTION);
-}
-
-#ifdef __OPTIMIZE__
-__funline __m512i _mm512_alignr_epi32(__m512i __A, __m512i __B, const int __imm) {
-  return (__m512i)__builtin_ia32_alignd512_mask(
-      (__v16si)__A, (__v16si)__B, __imm, (__v16si)_mm512_undefined_epi32(),
-      (__mmask16)-1);
-}
-
-__funline __m512i _mm512_mask_alignr_epi32(__m512i __W, __mmask16 __U,
-    __m512i __A, __m512i __B,
-    const int __imm) {
-  return (__m512i)__builtin_ia32_alignd512_mask(
-      (__v16si)__A, (__v16si)__B, __imm, (__v16si)__W, (__mmask16)__U);
-}
-
-__funline __m512i _mm512_maskz_alignr_epi32(__mmask16 __U, __m512i __A,
-    __m512i __B, const int __imm) {
-  return (__m512i)__builtin_ia32_alignd512_mask(
-      (__v16si)__A, (__v16si)__B, __imm, (__v16si)_mm512_setzero_si512(),
-      (__mmask16)__U);
-}
-
-__funline __m512i _mm512_alignr_epi64(__m512i __A, __m512i __B, const int __imm) {
-  return (__m512i)__builtin_ia32_alignq512_mask(
-      (__v8di)__A, (__v8di)__B, __imm, (__v8di)_mm512_undefined_epi32(),
-      (__mmask8)-1);
-}
-
-__funline __m512i _mm512_mask_alignr_epi64(__m512i __W, __mmask8 __U, __m512i __A,
-    __m512i __B, const int __imm) {
-  return (__m512i)__builtin_ia32_alignq512_mask((__v8di)__A, (__v8di)__B, __imm,
-      (__v8di)__W, (__mmask8)__U);
-}
-
-__funline __m512i _mm512_maskz_alignr_epi64(__mmask8 __U, __m512i __A,
-    __m512i __B, const int __imm) {
-  return (__m512i)__builtin_ia32_alignq512_mask((__v8di)__A, (__v8di)__B, __imm,
-      (__v8di)_mm512_setzero_si512(),
-      (__mmask8)__U);
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_getmant_round_ss (__mmask8 __U, __m128 __A, __m128 __B,
+      _MM_MANTISSA_NORM_ENUM __C,
+      _MM_MANTISSA_SIGN_ENUM __D, const int __R)
+{
+  return (__m128) __builtin_ia32_getmantss_mask_round ((__v4sf) __A,
+        (__v4sf) __B,
+        (__D << 2) | __C,
+        (__v4sf)
+        _mm_setzero_ps(),
+        __U, __R);
 }
 #else
-#define _mm512_alignr_epi32(X, Y, C) \
-  ((__m512i)__builtin_ia32_alignd512_mask( \
-      (__v16si)(__m512i)(X), (__v16si)(__m512i)(Y), (int)(C), \
-      (__v16si)_mm512_undefined_epi32(), (__mmask16)-1))
-
-#define _mm512_mask_alignr_epi32(W, U, X, Y, C) \
-  ((__m512i)__builtin_ia32_alignd512_mask( \
-      (__v16si)(__m512i)(X), (__v16si)(__m512i)(Y), (int)(C), \
-      (__v16si)(__m512i)(W), (__mmask16)(U)))
-
-#define _mm512_maskz_alignr_epi32(U, X, Y, C) \
-  ((__m512i)__builtin_ia32_alignd512_mask( \
-      (__v16si)(__m512i)(X), (__v16si)(__m512i)(Y), (int)(C), \
-      (__v16si)_mm512_setzero_si512(), (__mmask16)(U)))
-
-#define _mm512_alignr_epi64(X, Y, C) \
-  ((__m512i)__builtin_ia32_alignq512_mask( \
-      (__v8di)(__m512i)(X), (__v8di)(__m512i)(Y), (int)(C), \
-      (__v8di)_mm512_undefined_epi32(), (__mmask8)-1))
-
-#define _mm512_mask_alignr_epi64(W, U, X, Y, C) \
-  ((__m512i)__builtin_ia32_alignq512_mask( \
-      (__v8di)(__m512i)(X), (__v8di)(__m512i)(Y), (int)(C), \
-      (__v8di)(__m512i)(W), (__mmask8)(U)))
-
-#define _mm512_maskz_alignr_epi64(U, X, Y, C) \
-  ((__m512i)__builtin_ia32_alignq512_mask( \
-      (__v8di)(__m512i)(X), (__v8di)(__m512i)(Y), (int)(C), \
-      (__v8di)_mm512_setzero_si512(), (__mmask8)(U)))
+#define _mm512_getmant_round_pd(X, B, C, R) ((__m512d)__builtin_ia32_getmantpd512_mask ((__v8df)(__m512d)(X), (int)(((C)<<2) | (B)), (__v8df)(__m512d)_mm512_undefined_pd(), (__mmask8)-1, (R)))
+#define _mm512_mask_getmant_round_pd(W, U, X, B, C, R) ((__m512d)__builtin_ia32_getmantpd512_mask ((__v8df)(__m512d)(X), (int)(((C)<<2) | (B)), (__v8df)(__m512d)(W), (__mmask8)(U), (R)))
+#define _mm512_maskz_getmant_round_pd(U, X, B, C, R) ((__m512d)__builtin_ia32_getmantpd512_mask ((__v8df)(__m512d)(X), (int)(((C)<<2) | (B)), (__v8df)(__m512d)_mm512_setzero_pd(), (__mmask8)(U), (R)))
+#define _mm512_getmant_round_ps(X, B, C, R) ((__m512)__builtin_ia32_getmantps512_mask ((__v16sf)(__m512)(X), (int)(((C)<<2) | (B)), (__v16sf)(__m512)_mm512_undefined_ps(), (__mmask16)-1, (R)))
+#define _mm512_mask_getmant_round_ps(W, U, X, B, C, R) ((__m512)__builtin_ia32_getmantps512_mask ((__v16sf)(__m512)(X), (int)(((C)<<2) | (B)), (__v16sf)(__m512)(W), (__mmask16)(U), (R)))
+#define _mm512_maskz_getmant_round_ps(U, X, B, C, R) ((__m512)__builtin_ia32_getmantps512_mask ((__v16sf)(__m512)(X), (int)(((C)<<2) | (B)), (__v16sf)(__m512)_mm512_setzero_ps(), (__mmask16)(U), (R)))
+#define _mm_getmant_round_sd(X, Y, C, D, R) ((__m128d)__builtin_ia32_getmantsd_round ((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (int)(((D)<<2) | (C)), (R)))
+#define _mm_mask_getmant_round_sd(W, U, X, Y, C, D, R) ((__m128d)__builtin_ia32_getmantsd_mask_round ((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (int)(((D)<<2) | (C)), (__v2df)(__m128d)(W), (__mmask8)(U), (R)))
+#define _mm_maskz_getmant_round_sd(U, X, Y, C, D, R) ((__m128d)__builtin_ia32_getmantsd_mask_round ((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (int)(((D)<<2) | (C)), (__v2df)(__m128d)_mm_setzero_pd(), (__mmask8)(U), (R)))
+#define _mm_getmant_round_ss(X, Y, C, D, R) ((__m128)__builtin_ia32_getmantss_round ((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (int)(((D)<<2) | (C)), (R)))
+#define _mm_mask_getmant_round_ss(W, U, X, Y, C, D, R) ((__m128)__builtin_ia32_getmantss_mask_round ((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (int)(((D)<<2) | (C)), (__v4sf)(__m128)(W), (__mmask8)(U), (R)))
+#define _mm_maskz_getmant_round_ss(U, X, Y, C, D, R) ((__m128)__builtin_ia32_getmantss_mask_round ((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (int)(((D)<<2) | (C)), (__v4sf)(__m128)_mm_setzero_ps(), (__mmask8)(U), (R)))
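+/* In the getmant intrinsics above, the immediate is assembled as
+   (((D) << 2) | (C)) (or (((C) << 2) | (B)) in the 512-bit forms): the
+   _MM_MANTISSA_SIGN_ENUM lands in bits 3:2 and the _MM_MANTISSA_NORM_ENUM
+   in bits 1:0 of the GETMANT imm8 operand. */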
+#define _mm_getexp_round_ss(A, B, R) ((__m128)__builtin_ia32_getexpss128_round((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), R))
+#define _mm_mask_getexp_round_ss(W, U, A, B, C) (__m128)__builtin_ia32_getexpss_mask_round(A, B, W, U, C)
+#define _mm_maskz_getexp_round_ss(U, A, B, C) (__m128)__builtin_ia32_getexpss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C)
+#define _mm_getexp_round_sd(A, B, R) ((__m128d)__builtin_ia32_getexpsd128_round((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), R))
+#define _mm_mask_getexp_round_sd(W, U, A, B, C) (__m128d)__builtin_ia32_getexpsd_mask_round(A, B, W, U, C)
+#define _mm_maskz_getexp_round_sd(U, A, B, C) (__m128d)__builtin_ia32_getexpsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C)
+#define _mm512_getexp_round_ps(A, R) ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, R))
+#define _mm512_mask_getexp_round_ps(W, U, A, R) ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), (__v16sf)(__m512)(W), (__mmask16)(U), R))
+#define _mm512_maskz_getexp_round_ps(U, A, R) ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), R))
+#define _mm512_getexp_round_pd(A, R) ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), (__v8df)_mm512_undefined_pd(), (__mmask8)-1, R))
+#define _mm512_mask_getexp_round_pd(W, U, A, R) ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), (__v8df)(__m512d)(W), (__mmask8)(U), R))
+#define _mm512_maskz_getexp_round_pd(U, A, R) ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), (__v8df)_mm512_setzero_pd(), (__mmask8)(U), R))
 #endif
-
-__funline __mmask16 _mm512_cmpeq_epi32_mask(__m512i __A, __m512i __B) {
-  return (__mmask16)__builtin_ia32_pcmpeqd512_mask((__v16si)__A, (__v16si)__B,
-      (__mmask16)-1);
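+/* As elsewhere in this header, the extern __inline definitions that follow
+   are only provided under __OPTIMIZE__, since their immediate arguments
+   must fold to compile-time constants; the #else branches supply macro
+   equivalents for unoptimized builds. */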
+#ifdef __OPTIMIZE__
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_roundscale_round_ps (__m512 __A, const int __imm, const int __R)
+{
+  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, __imm,
+        (__v16sf)
+        _mm512_undefined_ps (),
+        -1, __R);
 }
-
-__funline __mmask16 _mm512_mask_cmpeq_epi32_mask(__mmask16 __U, __m512i __A,
-    __m512i __B) {
-  return (__mmask16)__builtin_ia32_pcmpeqd512_mask((__v16si)__A, (__v16si)__B,
-      __U);
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_roundscale_round_ps (__m512 __A, __mmask16 __B, __m512 __C,
+      const int __imm, const int __R)
+{
+  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __C, __imm,
+        (__v16sf) __A,
+        (__mmask16) __B, __R);
 }
-
-__funline __mmask8 _mm512_mask_cmpeq_epi64_mask(__mmask8 __U, __m512i __A,
-    __m512i __B) {
-  return (__mmask8)__builtin_ia32_pcmpeqq512_mask((__v8di)__A, (__v8di)__B,
-      __U);
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_roundscale_round_ps (__mmask16 __A, __m512 __B,
+      const int __imm, const int __R)
+{
+  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __B,
+        __imm,
+        (__v16sf)
+        _mm512_setzero_ps (),
+        (__mmask16) __A, __R);
 }
-
-__funline __mmask8 _mm512_cmpeq_epi64_mask(__m512i __A, __m512i __B) {
-  return (__mmask8)__builtin_ia32_pcmpeqq512_mask((__v8di)__A, (__v8di)__B,
-      (__mmask8)-1);
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_roundscale_round_pd (__m512d __A, const int __imm, const int __R)
+{
+  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, __imm,
+        (__v8df)
+        _mm512_undefined_pd (),
+        -1, __R);
 }
-
-__funline __mmask16 _mm512_cmpgt_epi32_mask(__m512i __A, __m512i __B) {
-  return (__mmask16)__builtin_ia32_pcmpgtd512_mask((__v16si)__A, (__v16si)__B,
-      (__mmask16)-1);
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_roundscale_round_pd (__m512d __A, __mmask8 __B,
+      __m512d __C, const int __imm, const int __R)
+{
+  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __C, __imm,
+        (__v8df) __A,
+        (__mmask8) __B, __R);
 }
-
-__funline __mmask16 _mm512_mask_cmpgt_epi32_mask(__mmask16 __U, __m512i __A,
-    __m512i __B) {
-  return (__mmask16)__builtin_ia32_pcmpgtd512_mask((__v16si)__A, (__v16si)__B,
-      __U);
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_roundscale_round_pd (__mmask8 __A, __m512d __B,
+      const int __imm, const int __R)
+{
+  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __B,
+        __imm,
+        (__v8df)
+        _mm512_setzero_pd (),
+        (__mmask8) __A, __R);
 }
-
-__funline __mmask8 _mm512_mask_cmpgt_epi64_mask(__mmask8 __U, __m512i __A,
-    __m512i __B) {
-  return (__mmask8)__builtin_ia32_pcmpgtq512_mask((__v8di)__A, (__v8di)__B,
-      __U);
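+/* Note a behavioral detail visible in this hunk: the old
+   __builtin_ia32_rndscaless_round/rndscalesd_round builtins are gone, and
+   even the unmasked scalar roundscale wrappers below now lower through the
+   masked __builtin_ia32_rndscaless_mask_round/rndscalesd_mask_round
+   builtins with an all-ones (__mmask8)-1 mask and a zeroed merge source. */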
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_roundscale_round_ss (__m128 __A, __m128 __B, const int __imm,
+      const int __R)
+{
+  return (__m128)
+  __builtin_ia32_rndscaless_mask_round ((__v4sf) __A,
+        (__v4sf) __B, __imm,
+        (__v4sf)
+        _mm_setzero_ps (),
+        (__mmask8) -1,
+        __R);
 }
-
-__funline __mmask8 _mm512_cmpgt_epi64_mask(__m512i __A, __m512i __B) {
-  return (__mmask8)__builtin_ia32_pcmpgtq512_mask((__v8di)__A, (__v8di)__B,
-      (__mmask8)-1);
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_roundscale_round_ss (__m128 __A, __mmask8 __B, __m128 __C,
+      __m128 __D, const int __imm, const int __R)
+{
+  return (__m128)
+  __builtin_ia32_rndscaless_mask_round ((__v4sf) __C,
+        (__v4sf) __D, __imm,
+        (__v4sf) __A,
+        (__mmask8) __B,
+        __R);
 }
-
-__funline __mmask16 _mm512_cmpge_epi32_mask(__m512i __X, __m512i __Y) {
-  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__X, (__v16si)__Y, 5,
-      (__mmask16)-1);
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_roundscale_round_ss (__mmask8 __A, __m128 __B, __m128 __C,
+      const int __imm, const int __R)
+{
+  return (__m128)
+  __builtin_ia32_rndscaless_mask_round ((__v4sf) __B,
+        (__v4sf) __C, __imm,
+        (__v4sf)
+        _mm_setzero_ps (),
+        (__mmask8) __A,
+        __R);
 }
-
-__funline __mmask16 _mm512_mask_cmpge_epi32_mask(__mmask16 __M, __m512i __X,
-    __m512i __Y) {
-  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__X, (__v16si)__Y, 5,
-      (__mmask16)__M);
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_roundscale_round_sd (__m128d __A, __m128d __B, const int __imm,
+      const int __R)
+{
+  return (__m128d)
+  __builtin_ia32_rndscalesd_mask_round ((__v2df) __A,
+        (__v2df) __B, __imm,
+        (__v2df)
+        _mm_setzero_pd (),
+        (__mmask8) -1,
+        __R);
 }
-
-__funline __mmask16 _mm512_mask_cmpge_epu32_mask(__mmask16 __M, __m512i __X,
-    __m512i __Y) {
-  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__X, (__v16si)__Y, 5,
-      (__mmask16)__M);
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_roundscale_round_sd (__m128d __A, __mmask8 __B, __m128d __C,
+      __m128d __D, const int __imm, const int __R)
+{
+  return (__m128d)
+  __builtin_ia32_rndscalesd_mask_round ((__v2df) __C,
+        (__v2df) __D, __imm,
+        (__v2df) __A,
+        (__mmask8) __B,
+        __R);
 }
-
-__funline __mmask16 _mm512_cmpge_epu32_mask(__m512i __X, __m512i __Y) {
-  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__X, (__v16si)__Y, 5,
-      (__mmask16)-1);
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_roundscale_round_sd (__mmask8 __A, __m128d __B, __m128d __C,
+      const int __imm, const int __R)
+{
+  return (__m128d)
+  __builtin_ia32_rndscalesd_mask_round ((__v2df) __B,
+        (__v2df) __C, __imm,
+        (__v2df)
+        _mm_setzero_pd (),
+        (__mmask8) __A,
+        __R);
 }
-
-__funline __mmask8 _mm512_mask_cmpge_epi64_mask(__mmask8 __M, __m512i __X,
-    __m512i __Y) {
-  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__X, (__v8di)__Y, 5,
-      (__mmask8)__M);
+#else
+#define _mm512_roundscale_round_ps(A, B, R) ((__m512) __builtin_ia32_rndscaleps_mask ((__v16sf)(__m512)(A), (int)(B), (__v16sf)_mm512_undefined_ps(), (__mmask16)(-1), R))
+#define _mm512_mask_roundscale_round_ps(A, B, C, D, R) ((__m512) __builtin_ia32_rndscaleps_mask ((__v16sf)(__m512)(C), (int)(D), (__v16sf)(__m512)(A), (__mmask16)(B), R))
+#define _mm512_maskz_roundscale_round_ps(A, B, C, R) ((__m512) __builtin_ia32_rndscaleps_mask ((__v16sf)(__m512)(B), (int)(C), (__v16sf)_mm512_setzero_ps(), (__mmask16)(A), R))
+#define _mm512_roundscale_round_pd(A, B, R) ((__m512d) __builtin_ia32_rndscalepd_mask ((__v8df)(__m512d)(A), (int)(B), (__v8df)_mm512_undefined_pd(), (__mmask8)(-1), R))
+#define _mm512_mask_roundscale_round_pd(A, B, C, D, R) ((__m512d) __builtin_ia32_rndscalepd_mask ((__v8df)(__m512d)(C), (int)(D), (__v8df)(__m512d)(A), (__mmask8)(B), R))
+#define _mm512_maskz_roundscale_round_pd(A, B, C, R) ((__m512d) __builtin_ia32_rndscalepd_mask ((__v8df)(__m512d)(B), (int)(C), (__v8df)_mm512_setzero_pd(), (__mmask8)(A), R))
+#define _mm_roundscale_round_ss(A, B, I, R) ((__m128) __builtin_ia32_rndscaless_mask_round ((__v4sf) (__m128) (A), (__v4sf) (__m128) (B), (int) (I), (__v4sf) _mm_setzero_ps (), (__mmask8) (-1), (int) (R)))
+#define _mm_mask_roundscale_round_ss(A, U, B, C, I, R) ((__m128) __builtin_ia32_rndscaless_mask_round ((__v4sf) (__m128) (B), (__v4sf) (__m128) (C), (int) (I), (__v4sf) (__m128) (A), (__mmask8) (U), (int) (R)))
+#define _mm_maskz_roundscale_round_ss(U, A, B, I, R) ((__m128) __builtin_ia32_rndscaless_mask_round ((__v4sf) (__m128) (A), (__v4sf) (__m128) (B), (int) (I), (__v4sf) _mm_setzero_ps (), (__mmask8) (U), (int) (R)))
+#define _mm_roundscale_round_sd(A, B, I, R) ((__m128d) __builtin_ia32_rndscalesd_mask_round ((__v2df) (__m128d) (A), (__v2df) (__m128d) (B), (int) (I), (__v2df) _mm_setzero_pd (), (__mmask8) (-1), (int) (R)))
+#define _mm_mask_roundscale_round_sd(A, U, B, C, I, R) ((__m128d) __builtin_ia32_rndscalesd_mask_round ((__v2df) (__m128d) (B), (__v2df) (__m128d) (C), (int) (I), (__v2df) (__m128d) (A), (__mmask8) (U), (int) (R)))
+#define _mm_maskz_roundscale_round_sd(U, A, B, I, R) ((__m128d) __builtin_ia32_rndscalesd_mask_round ((__v2df) (__m128d) (A), (__v2df) (__m128d) (B), (int) (I), (__v2df) _mm_setzero_pd (), (__mmask8) (U), (int) (R)))
+#endif
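+/* The floor/ceil wrappers below need no builtin of their own: they reuse
+   the roundscale builtins with _MM_FROUND_FLOOR or _MM_FROUND_CEIL as the
+   immediate and _MM_FROUND_CUR_DIRECTION as the rounding argument, which
+   is why they sit outside the __OPTIMIZE__ conditional. */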
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_floor_ps (__m512 __A)
+{
+  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
+        _MM_FROUND_FLOOR,
+        (__v16sf) __A, -1,
+        _MM_FROUND_CUR_DIRECTION);
 }
-
-__funline __mmask8 _mm512_cmpge_epi64_mask(__m512i __X, __m512i __Y) {
-  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__X, (__v8di)__Y, 5,
-      (__mmask8)-1);
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_floor_pd (__m512d __A)
+{
+  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
+        _MM_FROUND_FLOOR,
+        (__v8df) __A, -1,
+        _MM_FROUND_CUR_DIRECTION);
 }
-
-__funline __mmask8 _mm512_mask_cmpge_epu64_mask(__mmask8 __M, __m512i __X,
-    __m512i __Y) {
-  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__X, (__v8di)__Y, 5,
-      (__mmask8)__M);
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_ceil_ps (__m512 __A)
+{
+  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
+        _MM_FROUND_CEIL,
+        (__v16sf) __A, -1,
+        _MM_FROUND_CUR_DIRECTION);
 }
-
-__funline __mmask8 _mm512_cmpge_epu64_mask(__m512i __X, __m512i __Y) {
-  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__X, (__v8di)__Y, 5,
-      (__mmask8)-1);
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_ceil_pd (__m512d __A)
+{
+  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
+        _MM_FROUND_CEIL,
+        (__v8df) __A, -1,
+        _MM_FROUND_CUR_DIRECTION);
 }
-
-__funline __mmask16 _mm512_mask_cmple_epi32_mask(__mmask16 __M, __m512i __X,
-    __m512i __Y) {
-  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__X, (__v16si)__Y, 2,
-      (__mmask16)__M);
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_floor_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
+        _MM_FROUND_FLOOR,
+        (__v16sf) __W, __U,
+        _MM_FROUND_CUR_DIRECTION);
 }
-
-__funline __mmask16 _mm512_cmple_epi32_mask(__m512i __X, __m512i __Y) {
-  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__X, (__v16si)__Y, 2,
-      (__mmask16)-1);
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_floor_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
+        _MM_FROUND_FLOOR,
+        (__v8df) __W, __U,
+        _MM_FROUND_CUR_DIRECTION);
 }
-
-__funline __mmask16 _mm512_mask_cmple_epu32_mask(__mmask16 __M, __m512i __X,
-    __m512i __Y) {
-  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__X, (__v16si)__Y, 2,
-      (__mmask16)__M);
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_ceil_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
+        _MM_FROUND_CEIL,
+        (__v16sf) __W, __U,
+        _MM_FROUND_CUR_DIRECTION);
 }
-
-__funline __mmask16 _mm512_cmple_epu32_mask(__m512i __X, __m512i __Y) {
-  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__X, (__v16si)__Y, 2,
-      (__mmask16)-1);
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_ceil_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
+        _MM_FROUND_CEIL,
+        (__v8df) __W, __U,
+        _MM_FROUND_CUR_DIRECTION);
 }
-
-__funline __mmask8 _mm512_mask_cmple_epi64_mask(__mmask8 __M, __m512i __X,
-    __m512i __Y) {
-  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__X, (__v8di)__Y, 2,
-      (__mmask8)__M);
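+/* valignd/valignq semantics for the alignr intrinsics that follow: the two
+   source vectors are concatenated and 16 dwords (alignr_epi32) or 8 qwords
+   (alignr_epi64) are extracted starting at the element offset given by the
+   immediate. */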
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_alignr_epi32 (__m512i __A, __m512i __B, const int __imm)
+{
+  return (__m512i) __builtin_ia32_alignd512_mask ((__v16si) __A,
+        (__v16si) __B, __imm,
+        (__v16si)
+        _mm512_undefined_epi32 (),
+        (__mmask16) -1);
 }
-
-__funline __mmask8 _mm512_cmple_epi64_mask(__m512i __X, __m512i __Y) {
-  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__X, (__v8di)__Y, 2,
-      (__mmask8)-1);
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_alignr_epi32 (__m512i __W, __mmask16 __U, __m512i __A,
+      __m512i __B, const int __imm)
+{
+  return (__m512i) __builtin_ia32_alignd512_mask ((__v16si) __A,
+        (__v16si) __B, __imm,
+        (__v16si) __W,
+        (__mmask16) __U);
 }
-
-__funline __mmask8 _mm512_mask_cmple_epu64_mask(__mmask8 __M, __m512i __X,
-    __m512i __Y) {
-  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__X, (__v8di)__Y, 2,
-      (__mmask8)__M);
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_alignr_epi32 (__mmask16 __U, __m512i __A, __m512i __B,
+      const int __imm)
+{
+  return (__m512i) __builtin_ia32_alignd512_mask ((__v16si) __A,
+        (__v16si) __B, __imm,
+        (__v16si)
+        _mm512_setzero_si512 (),
+        (__mmask16) __U);
 }
-
-__funline __mmask8 _mm512_cmple_epu64_mask(__m512i __X, __m512i __Y) {
-  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__X, (__v8di)__Y, 2,
-      (__mmask8)-1);
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_alignr_epi64 (__m512i __A, __m512i __B, const int __imm)
+{
+  return (__m512i) __builtin_ia32_alignq512_mask ((__v8di) __A,
+        (__v8di) __B, __imm,
+        (__v8di)
+        _mm512_undefined_epi32 (),
+        (__mmask8) -1);
 }
-
-__funline __mmask16 _mm512_mask_cmplt_epi32_mask(__mmask16 __M, __m512i __X,
-    __m512i __Y) {
-  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__X, (__v16si)__Y, 1,
-      (__mmask16)__M);
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_alignr_epi64 (__m512i __W, __mmask8 __U, __m512i __A,
+      __m512i __B, const int __imm)
+{
+  return (__m512i) __builtin_ia32_alignq512_mask ((__v8di) __A,
+        (__v8di) __B, __imm,
+        (__v8di) __W,
+        (__mmask8) __U);
 }
-
-__funline __mmask16 _mm512_cmplt_epi32_mask(__m512i __X, __m512i __Y) {
-  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__X, (__v16si)__Y, 1,
-      (__mmask16)-1);
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_alignr_epi64 (__mmask8 __U, __m512i __A, __m512i __B,
+      const int __imm)
+{
+  return (__m512i) __builtin_ia32_alignq512_mask ((__v8di) __A,
+        (__v8di) __B, __imm,
+        (__v8di)
+        _mm512_setzero_si512 (),
+        (__mmask8) __U);
 }
-
-__funline __mmask16 _mm512_mask_cmplt_epu32_mask(__mmask16 __M, __m512i __X,
-    __m512i __Y) {
-  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__X, (__v16si)__Y, 1,
-      (__mmask16)__M);
+#else
+#define _mm512_alignr_epi32(X, Y, C) ((__m512i)__builtin_ia32_alignd512_mask ((__v16si)(__m512i)(X), (__v16si)(__m512i)(Y), (int)(C), (__v16si)_mm512_undefined_epi32 (), (__mmask16)-1))
+#define _mm512_mask_alignr_epi32(W, U, X, Y, C) ((__m512i)__builtin_ia32_alignd512_mask ((__v16si)(__m512i)(X), (__v16si)(__m512i)(Y), (int)(C), (__v16si)(__m512i)(W), (__mmask16)(U)))
+#define _mm512_maskz_alignr_epi32(U, X, Y, C) ((__m512i)__builtin_ia32_alignd512_mask ((__v16si)(__m512i)(X), (__v16si)(__m512i)(Y), (int)(C), (__v16si)_mm512_setzero_si512 (), (__mmask16)(U)))
+#define _mm512_alignr_epi64(X, Y, C) ((__m512i)__builtin_ia32_alignq512_mask ((__v8di)(__m512i)(X), (__v8di)(__m512i)(Y), (int)(C), (__v8di)_mm512_undefined_epi32 (), (__mmask8)-1))
+#define _mm512_mask_alignr_epi64(W, U, X, Y, C) ((__m512i)__builtin_ia32_alignq512_mask ((__v8di)(__m512i)(X), (__v8di)(__m512i)(Y), (int)(C), (__v8di)(__m512i)(W), (__mmask8)(U)))
+#define _mm512_maskz_alignr_epi64(U, X, Y, C) ((__m512i)__builtin_ia32_alignq512_mask ((__v8di)(__m512i)(X), (__v8di)(__m512i)(Y), (int)(C), (__v8di)_mm512_setzero_si512 (), (__mmask8)(U)))
+#endif
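+/* The integer comparisons below come in two flavors: eq/gt use dedicated
+   pcmpeq/pcmpgt builtins, while ge/le/lt/neq pass an explicit predicate
+   immediate (5, 2, 1 and 4 respectively) to the generic cmp/ucmp builtins;
+   those predicate values are the _MM_CMPINT_* codes defined further down. */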
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpeq_epi32_mask (__m512i __A, __m512i __B)
+{
+  return (__mmask16) __builtin_ia32_pcmpeqd512_mask ((__v16si) __A,
+        (__v16si) __B,
+        (__mmask16) -1);
 }
-
-__funline __mmask16 _mm512_cmplt_epu32_mask(__m512i __X, __m512i __Y) {
-  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__X, (__v16si)__Y, 1,
-      (__mmask16)-1);
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpeq_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask16) __builtin_ia32_pcmpeqd512_mask ((__v16si) __A,
+        (__v16si) __B, __U);
 }
-
-__funline __mmask8 _mm512_mask_cmplt_epi64_mask(__mmask8 __M, __m512i __X,
-    __m512i __Y) {
-  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__X, (__v8di)__Y, 1,
-      (__mmask8)__M);
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpeq_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask8) __builtin_ia32_pcmpeqq512_mask ((__v8di) __A,
+        (__v8di) __B, __U);
 }
-
-__funline __mmask8 _mm512_cmplt_epi64_mask(__m512i __X, __m512i __Y) {
-  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__X, (__v8di)__Y, 1,
-      (__mmask8)-1);
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpeq_epi64_mask (__m512i __A, __m512i __B)
+{
+  return (__mmask8) __builtin_ia32_pcmpeqq512_mask ((__v8di) __A,
+        (__v8di) __B,
+        (__mmask8) -1);
 }
-
-__funline __mmask8 _mm512_mask_cmplt_epu64_mask(__mmask8 __M, __m512i __X,
-    __m512i __Y) {
-  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__X, (__v8di)__Y, 1,
-      (__mmask8)__M);
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpgt_epi32_mask (__m512i __A, __m512i __B)
+{
+  return (__mmask16) __builtin_ia32_pcmpgtd512_mask ((__v16si) __A,
+        (__v16si) __B,
+        (__mmask16) -1);
 }
-
-__funline __mmask8 _mm512_cmplt_epu64_mask(__m512i __X, __m512i __Y) {
-  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__X, (__v8di)__Y, 1,
-      (__mmask8)-1);
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpgt_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask16) __builtin_ia32_pcmpgtd512_mask ((__v16si) __A,
+        (__v16si) __B, __U);
 }
-
-__funline __mmask16 _mm512_cmpneq_epi32_mask(__m512i __X, __m512i __Y) {
-  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__X, (__v16si)__Y, 4,
-      (__mmask16)-1);
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpgt_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask8) __builtin_ia32_pcmpgtq512_mask ((__v8di) __A,
+        (__v8di) __B, __U);
 }
-
-__funline __mmask16 _mm512_mask_cmpneq_epi32_mask(__mmask16 __M, __m512i __X,
-    __m512i __Y) {
-  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__X, (__v16si)__Y, 4,
-      (__mmask16)__M);
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpgt_epi64_mask (__m512i __A, __m512i __B)
+{
+  return (__mmask8) __builtin_ia32_pcmpgtq512_mask ((__v8di) __A,
+        (__v8di) __B,
+        (__mmask8) -1);
 }
-
-__funline __mmask16 _mm512_mask_cmpneq_epu32_mask(__mmask16 __M, __m512i __X,
-    __m512i __Y) {
-  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__X, (__v16si)__Y, 4,
-      (__mmask16)__M);
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpge_epi32_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X,
+        (__v16si) __Y, 5,
+        (__mmask16) -1);
 }
-
-__funline __mmask16 _mm512_cmpneq_epu32_mask(__m512i __X, __m512i __Y) {
-  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__X, (__v16si)__Y, 4,
-      (__mmask16)-1);
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpge_epi32_mask (__mmask16 __M, __m512i __X, __m512i __Y)
+{
+  return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X,
+        (__v16si) __Y, 5,
+        (__mmask16) __M);
 }
-
-__funline __mmask8 _mm512_mask_cmpneq_epi64_mask(__mmask8 __M, __m512i __X,
-    __m512i __Y) {
-  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__X, (__v8di)__Y, 4,
-      (__mmask8)__M);
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpge_epu32_mask (__mmask16 __M, __m512i __X, __m512i __Y)
+{
+  return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X,
+        (__v16si) __Y, 5,
+        (__mmask16) __M);
 }
-
-__funline __mmask8 _mm512_cmpneq_epi64_mask(__m512i __X, __m512i __Y) {
-  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__X, (__v8di)__Y, 4,
-      (__mmask8)-1);
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpge_epu32_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X,
+        (__v16si) __Y, 5,
+        (__mmask16) -1);
 }
-
-__funline __mmask8 _mm512_mask_cmpneq_epu64_mask(__mmask8 __M, __m512i __X,
-    __m512i __Y) {
-  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__X, (__v8di)__Y, 4,
-      (__mmask8)__M);
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpge_epi64_mask (__mmask8 __M, __m512i __X, __m512i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X,
+        (__v8di) __Y, 5,
+        (__mmask8) __M);
 }
-
-__funline __mmask8 _mm512_cmpneq_epu64_mask(__m512i __X, __m512i __Y) {
-  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__X, (__v8di)__Y, 4,
-      (__mmask8)-1);
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpge_epi64_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X,
+        (__v8di) __Y, 5,
+        (__mmask8) -1);
 }
-
-#define _MM_CMPINT_EQ 0x0
-#define _MM_CMPINT_LT 0x1
-#define _MM_CMPINT_LE 0x2
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpge_epu64_mask (__mmask8 __M, __m512i __X, __m512i __Y)
+{
+  return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X,
+        (__v8di) __Y, 5,
+        (__mmask8) __M);
+}
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpge_epu64_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X,
+        (__v8di) __Y, 5,
+        (__mmask8) -1);
+}
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmple_epi32_mask (__mmask16 __M, __m512i __X, __m512i __Y)
+{
+  return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X,
+        (__v16si) __Y, 2,
+        (__mmask16) __M);
+}
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmple_epi32_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X,
+        (__v16si) __Y, 2,
+        (__mmask16) -1);
+}
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmple_epu32_mask (__mmask16 __M, __m512i __X, __m512i __Y)
+{
+  return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X,
+        (__v16si) __Y, 2,
+        (__mmask16) __M);
+}
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmple_epu32_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X,
+        (__v16si) __Y, 2,
+        (__mmask16) -1);
+}
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmple_epi64_mask (__mmask8 __M, __m512i __X, __m512i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X,
+        (__v8di) __Y, 2,
+        (__mmask8) __M);
+}
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmple_epi64_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X,
+        (__v8di) __Y, 2,
+        (__mmask8) -1);
+}
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmple_epu64_mask (__mmask8 __M, __m512i __X, __m512i __Y)
+{
+  return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X,
+        (__v8di) __Y, 2,
+        (__mmask8) __M);
+}
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmple_epu64_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X,
+        (__v8di) __Y, 2,
+        (__mmask8) -1);
+}
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmplt_epi32_mask (__mmask16 __M, __m512i __X, __m512i __Y)
+{
+  return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X,
+        (__v16si) __Y, 1,
+        (__mmask16) __M);
+}
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmplt_epi32_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X,
+        (__v16si) __Y, 1,
+        (__mmask16) -1);
+}
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmplt_epu32_mask (__mmask16 __M, __m512i __X, __m512i __Y)
+{
+  return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X,
+        (__v16si) __Y, 1,
+        (__mmask16) __M);
+}
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmplt_epu32_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X,
+        (__v16si) __Y, 1,
+        (__mmask16) -1);
+}
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmplt_epi64_mask (__mmask8 __M, __m512i __X, __m512i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X,
+        (__v8di) __Y, 1,
+        (__mmask8) __M);
+}
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmplt_epi64_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X,
+        (__v8di) __Y, 1,
+        (__mmask8) -1);
+}
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmplt_epu64_mask (__mmask8 __M, __m512i __X, __m512i __Y)
+{
+  return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X,
+        (__v8di) __Y, 1,
+        (__mmask8) __M);
+}
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmplt_epu64_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X,
+        (__v8di) __Y, 1,
+        (__mmask8) -1);
+}
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpneq_epi32_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X,
+        (__v16si) __Y, 4,
+        (__mmask16) -1);
+}
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpneq_epi32_mask (__mmask16 __M, __m512i __X, __m512i __Y)
+{
+  return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X,
+        (__v16si) __Y, 4,
+        (__mmask16) __M);
+}
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpneq_epu32_mask (__mmask16 __M, __m512i __X, __m512i __Y)
+{
+  return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X,
+        (__v16si) __Y, 4,
+        (__mmask16) __M);
+}
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpneq_epu32_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X,
+        (__v16si) __Y, 4,
+        (__mmask16) -1);
+}
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpneq_epi64_mask (__mmask8 __M, __m512i __X, __m512i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X,
+        (__v8di) __Y, 4,
+        (__mmask8) __M);
+}
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpneq_epi64_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X,
+        (__v8di) __Y, 4,
+        (__mmask8) -1);
+}
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpneq_epu64_mask (__mmask8 __M, __m512i __X, __m512i __Y)
+{
+  return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X,
+        (__v8di) __Y, 4,
+        (__mmask8) __M);
+}
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpneq_epu64_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X,
+        (__v8di) __Y, 4,
+        (__mmask8) -1);
+}
+#define _MM_CMPINT_EQ 0x0
+#define _MM_CMPINT_LT 0x1
+#define _MM_CMPINT_LE 0x2
 #define _MM_CMPINT_UNUSED 0x3
-#define _MM_CMPINT_NE 0x4
-#define _MM_CMPINT_NLT 0x5
-#define _MM_CMPINT_GE 0x5
-#define _MM_CMPINT_NLE 0x6
-#define _MM_CMPINT_GT 0x6
-
+#define _MM_CMPINT_NE 0x4
+#define _MM_CMPINT_NLT 0x5
+#define _MM_CMPINT_GE 0x5
+#define _MM_CMPINT_NLE 0x6
+#define _MM_CMPINT_GT 0x6
 #ifdef __OPTIMIZE__
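+/* _kshiftli_mask16/_kshiftri_mask16 below shift a 16-bit mask register
+   left or right by __B bit positions, filling with zeros (the kshiftlw/
+   kshiftrw instructions). */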
-__funline __mmask16 _kshiftli_mask16(__mmask16 __A, unsigned int __B) {
-  return (__mmask16)__builtin_ia32_kshiftlihi((__mmask16)__A, (__mmask8)__B);
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kshiftli_mask16 (__mmask16 __A, unsigned int __B)
+{
+  return (__mmask16) __builtin_ia32_kshiftlihi ((__mmask16) __A,
+        (__mmask8) __B);
 }
-
-__funline __mmask16 _kshiftri_mask16(__mmask16 __A, unsigned int __B) {
-  return (__mmask16)__builtin_ia32_kshiftrihi((__mmask16)__A, (__mmask8)__B);
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kshiftri_mask16 (__mmask16 __A, unsigned int __B)
+{
+  return (__mmask16) __builtin_ia32_kshiftrihi ((__mmask16) __A,
+        (__mmask8) __B);
 }
-
-__funline __mmask8 _mm512_cmp_epi64_mask(__m512i __X, __m512i __Y,
-    const int __P) {
-  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__X, (__v8di)__Y, __P,
-      (__mmask8)-1);
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmp_epi64_mask (__m512i __X, __m512i __Y, const int __P)
+{
+  return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X,
+        (__v8di) __Y, __P,
+        (__mmask8) -1);
 }
-
-__funline __mmask16 _mm512_cmp_epi32_mask(__m512i __X, __m512i __Y,
-    const int __P) {
-  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__X, (__v16si)__Y, __P,
-      (__mmask16)-1);
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmp_epi32_mask (__m512i __X, __m512i __Y, const int __P)
+{
+  return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X,
+        (__v16si) __Y, __P,
+        (__mmask16) -1);
 }
-
-__funline __mmask8 _mm512_cmp_epu64_mask(__m512i __X, __m512i __Y,
-    const int __P) {
-  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__X, (__v8di)__Y, __P,
-      (__mmask8)-1);
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmp_epu64_mask (__m512i __X, __m512i __Y, const int __P)
+{
+  return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X,
+        (__v8di) __Y, __P,
+        (__mmask8) -1);
 }
-
-__funline __mmask16 _mm512_cmp_epu32_mask(__m512i __X, __m512i __Y,
-    const int __P) {
-  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__X, (__v16si)__Y,
-      __P, (__mmask16)-1);
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmp_epu32_mask (__m512i __X, __m512i __Y, const int __P)
+{
+  return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X,
+        (__v16si) __Y, __P,
+        (__mmask16) -1);
 }
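+/* For the compare intrinsics that take a rounding argument, __R is not a
+   rounding mode (comparisons do not round); it is the SAE slot, where
+   _MM_FROUND_NO_EXC suppresses floating-point exceptions and
+   _MM_FROUND_CUR_DIRECTION is the default. */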
-
-__funline __mmask8 _mm512_cmp_round_pd_mask(__m512d __X, __m512d __Y,
-    const int __P, const int __R) {
-  return (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)__X, (__v8df)__Y, __P,
-      (__mmask8)-1, __R);
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmp_round_pd_mask (__m512d __X, __m512d __Y, const int __P,
+      const int __R)
+{
+  return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
+        (__v8df) __Y, __P,
+        (__mmask8) -1, __R);
 }
-
-__funline __mmask16 _mm512_cmp_round_ps_mask(__m512 __X, __m512 __Y,
-    const int __P, const int __R) {
-  return (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)__X, (__v16sf)__Y,
-      __P, (__mmask16)-1, __R);
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmp_round_ps_mask (__m512 __X, __m512 __Y, const int __P, const int __R)
+{
+  return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X,
+        (__v16sf) __Y, __P,
+        (__mmask16) -1, __R);
 }
-
-__funline __mmask8 _mm512_mask_cmp_epi64_mask(__mmask8 __U, __m512i __X,
-    __m512i __Y, const int __P) {
-  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__X, (__v8di)__Y, __P,
-      (__mmask8)__U);
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmp_epi64_mask (__mmask8 __U, __m512i __X, __m512i __Y,
+      const int __P)
+{
+  return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X,
+        (__v8di) __Y, __P,
+        (__mmask8) __U);
 }
-
-__funline __mmask16 _mm512_mask_cmp_epi32_mask(__mmask16 __U, __m512i __X,
-    __m512i __Y, const int __P) {
-  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__X, (__v16si)__Y, __P,
-      (__mmask16)__U);
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmp_epi32_mask (__mmask16 __U, __m512i __X, __m512i __Y,
+      const int __P)
+{
+  return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X,
+        (__v16si) __Y, __P,
+        (__mmask16) __U);
 }
-
-__funline __mmask8 _mm512_mask_cmp_epu64_mask(__mmask8 __U, __m512i __X,
-    __m512i __Y, const int __P) {
-  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__X, (__v8di)__Y, __P,
-      (__mmask8)__U);
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmp_epu64_mask (__mmask8 __U, __m512i __X, __m512i __Y,
+      const int __P)
+{
+  return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X,
+        (__v8di) __Y, __P,
+        (__mmask8) __U);
 }
-
-__funline __mmask16 _mm512_mask_cmp_epu32_mask(__mmask16 __U, __m512i __X,
-    __m512i __Y, const int __P) {
-  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__X, (__v16si)__Y,
-      __P, (__mmask16)__U);
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmp_epu32_mask (__mmask16 __U, __m512i __X, __m512i __Y,
+      const int __P)
+{
+  return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X,
+        (__v16si) __Y, __P,
+        (__mmask16) __U);
 }
-
-__funline __mmask8 _mm512_mask_cmp_round_pd_mask(__mmask8 __U, __m512d __X,
-    __m512d __Y, const int __P,
-    const int __R) {
-  return (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)__X, (__v8df)__Y, __P,
-      (__mmask8)__U, __R);
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmp_round_pd_mask (__mmask8 __U, __m512d __X, __m512d __Y,
+      const int __P, const int __R)
+{
+  return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
+        (__v8df) __Y, __P,
+        (__mmask8) __U, __R);
 }
-
-__funline __mmask16 _mm512_mask_cmp_round_ps_mask(__mmask16 __U, __m512 __X,
-    __m512 __Y, const int __P,
-    const int __R) {
-  return (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)__X, (__v16sf)__Y,
-      __P, (__mmask16)__U, __R);
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmp_round_ps_mask (__mmask16 __U, __m512 __X, __m512 __Y,
+      const int __P, const int __R)
+{
+  return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X,
+        (__v16sf) __Y, __P,
+        (__mmask16) __U, __R);
 }
-
-__funline __mmask8 _mm_cmp_round_sd_mask(__m128d __X, __m128d __Y, const int __P,
-    const int __R) {
-  return (__mmask8)__builtin_ia32_cmpsd_mask((__v2df)__X, (__v2df)__Y, __P,
-      (__mmask8)-1, __R);
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_round_sd_mask (__m128d __X, __m128d __Y, const int __P, const int __R)
+{
+  return (__mmask8) __builtin_ia32_cmpsd_mask ((__v2df) __X,
+        (__v2df) __Y, __P,
+        (__mmask8) -1, __R);
 }
-
-__funline __mmask8 _mm_mask_cmp_round_sd_mask(__mmask8 __M, __m128d __X,
-    __m128d __Y, const int __P,
-    const int __R) {
-  return (__mmask8)__builtin_ia32_cmpsd_mask((__v2df)__X, (__v2df)__Y, __P,
-      (__mmask8)__M, __R);
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmp_round_sd_mask (__mmask8 __M, __m128d __X, __m128d __Y,
+      const int __P, const int __R)
+{
+  return (__mmask8) __builtin_ia32_cmpsd_mask ((__v2df) __X,
+        (__v2df) __Y, __P,
+        (__mmask8) __M, __R);
 }
-
-__funline __mmask8 _mm_cmp_round_ss_mask(__m128 __X, __m128 __Y, const int __P,
-    const int __R) {
-  return (__mmask8)__builtin_ia32_cmpss_mask((__v4sf)__X, (__v4sf)__Y, __P,
-      (__mmask8)-1, __R);
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_round_ss_mask (__m128 __X, __m128 __Y, const int __P, const int __R)
+{
+  return (__mmask8) __builtin_ia32_cmpss_mask ((__v4sf) __X,
+        (__v4sf) __Y, __P,
+        (__mmask8) -1, __R);
 }
-
-__funline __mmask8 _mm_mask_cmp_round_ss_mask(__mmask8 __M, __m128 __X,
-    __m128 __Y, const int __P,
-    const int __R) {
-  return (__mmask8)__builtin_ia32_cmpss_mask((__v4sf)__X, (__v4sf)__Y, __P,
-      (__mmask8)__M, __R);
-}
-
-#else
-#define _kshiftli_mask16(X, Y) \
-  ((__mmask16)__builtin_ia32_kshiftlihi((__mmask16)(X), (__mmask8)(Y)))
-
-#define _kshiftri_mask16(X, Y) \
-  ((__mmask16)__builtin_ia32_kshiftrihi((__mmask16)(X), (__mmask8)(Y)))
-
-#define _mm512_cmp_epi64_mask(X, Y, P) \
-  ((__mmask8)__builtin_ia32_cmpq512_mask( \
-      (__v8di)(__m512i)(X), (__v8di)(__m512i)(Y), (int)(P), (__mmask8)-1))
-
-#define _mm512_cmp_epi32_mask(X, Y, P) \
-  ((__mmask16)__builtin_ia32_cmpd512_mask( \
-      (__v16si)(__m512i)(X), (__v16si)(__m512i)(Y), (int)(P), (__mmask16)-1))
-
-#define _mm512_cmp_epu64_mask(X, Y, P) \
-  ((__mmask8)__builtin_ia32_ucmpq512_mask( \
-      (__v8di)(__m512i)(X), (__v8di)(__m512i)(Y), (int)(P), (__mmask8)-1))
-
-#define _mm512_cmp_epu32_mask(X, Y, P) \
-  ((__mmask16)__builtin_ia32_ucmpd512_mask( \
-      (__v16si)(__m512i)(X), (__v16si)(__m512i)(Y), (int)(P), (__mmask16)-1))
-
-#define _mm512_cmp_round_pd_mask(X, Y, P, R) \
-  ((__mmask8)__builtin_ia32_cmppd512_mask( \
-      (__v8df)(__m512d)(X), (__v8df)(__m512d)(Y), (int)(P), (__mmask8)-1, R))
-
-#define _mm512_cmp_round_ps_mask(X, Y, P, R) \
-  ((__mmask16)__builtin_ia32_cmpps512_mask( \
-      (__v16sf)(__m512)(X), (__v16sf)(__m512)(Y), (int)(P), (__mmask16)-1, R))
-
-#define _mm512_mask_cmp_epi64_mask(M, X, Y, P) \
-  ((__mmask8)__builtin_ia32_cmpq512_mask( \
-      (__v8di)(__m512i)(X), (__v8di)(__m512i)(Y), (int)(P), (__mmask8)M))
-
-#define _mm512_mask_cmp_epi32_mask(M, X, Y, P) \
-  ((__mmask16)__builtin_ia32_cmpd512_mask( \
-      (__v16si)(__m512i)(X), (__v16si)(__m512i)(Y), (int)(P), (__mmask16)M))
-
-#define _mm512_mask_cmp_epu64_mask(M, X, Y, P) \
-  ((__mmask8)__builtin_ia32_ucmpq512_mask( \
-      (__v8di)(__m512i)(X), (__v8di)(__m512i)(Y), (int)(P), (__mmask8)M))
-
-#define _mm512_mask_cmp_epu32_mask(M, X, Y, P) \
-  ((__mmask16)__builtin_ia32_ucmpd512_mask( \
-      (__v16si)(__m512i)(X), (__v16si)(__m512i)(Y), (int)(P), (__mmask16)M))
-
-#define _mm512_mask_cmp_round_pd_mask(M, X, Y, P, R) \
-  ((__mmask8)__builtin_ia32_cmppd512_mask( \
-      (__v8df)(__m512d)(X), (__v8df)(__m512d)(Y), (int)(P), (__mmask8)M, R))
-
-#define _mm512_mask_cmp_round_ps_mask(M, X, Y, P, R) \
-  ((__mmask16)__builtin_ia32_cmpps512_mask( \
-      (__v16sf)(__m512)(X), (__v16sf)(__m512)(Y), (int)(P), (__mmask16)M, R))
-
-#define _mm_cmp_round_sd_mask(X, Y, P, R) \
-  ((__mmask8)__builtin_ia32_cmpsd_mask( \
-      (__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (int)(P), (__mmask8)-1, R))
-
-#define _mm_mask_cmp_round_sd_mask(M, X, Y, P, R) \
-  ((__mmask8)__builtin_ia32_cmpsd_mask( \
-      (__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (int)(P), (M), R))
-
-#define _mm_cmp_round_ss_mask(X, Y, P, R) \
-  ((__mmask8)__builtin_ia32_cmpss_mask( \
-      (__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (int)(P), (__mmask8)-1, R))
-
-#define _mm_mask_cmp_round_ss_mask(M, X, Y, P, R) \
-  ((__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \
-      (__v4sf)(__m128)(Y), (int)(P), (M), R))
-#endif
-
-#ifdef __OPTIMIZE__
-__funline __m512 _mm512_i32gather_ps(__m512i __index, void const *__addr,
-    int __scale) {
-  __m512 __v1_old = _mm512_undefined_ps();
-  __mmask16 __mask = 0xFFFF;
-
-  return (__m512)__builtin_ia32_gathersiv16sf(
-      (__v16sf)__v1_old, __addr, (__v16si)__index, __mask, __scale);
-}
-
-__funline __m512 _mm512_mask_i32gather_ps(__m512 __v1_old, __mmask16 __mask,
-    __m512i __index, void const *__addr,
-    int __scale) {
-  return (__m512)__builtin_ia32_gathersiv16sf(
-      (__v16sf)__v1_old, __addr, (__v16si)__index, __mask, __scale);
-}
-
-__funline __m512d _mm512_i32gather_pd(__m256i __index, void const *__addr,
-    int __scale) {
-  __m512d __v1_old = _mm512_undefined_pd();
-  __mmask8 __mask = 0xFF;
-
-  return (__m512d)__builtin_ia32_gathersiv8df((__v8df)__v1_old, __addr,
-      (__v8si)__index, __mask, __scale);
-}
-
-__funline __m512d _mm512_mask_i32gather_pd(__m512d __v1_old, __mmask8 __mask,
-    __m256i __index, void const *__addr,
-    int __scale) {
-  return (__m512d)__builtin_ia32_gathersiv8df((__v8df)__v1_old, __addr,
-      (__v8si)__index, __mask, __scale);
-}
-
-__funline __m256 _mm512_i64gather_ps(__m512i __index, void const *__addr,
-    int __scale) {
-  __m256 __v1_old = _mm256_undefined_ps();
-  __mmask8 __mask = 0xFF;
-
-  return (__m256)__builtin_ia32_gatherdiv16sf((__v8sf)__v1_old, __addr,
-      (__v8di)__index, __mask, __scale);
-}
-
-__funline __m256 _mm512_mask_i64gather_ps(__m256 __v1_old, __mmask8 __mask,
-    __m512i __index, void const *__addr,
-    int __scale) {
-  return (__m256)__builtin_ia32_gatherdiv16sf((__v8sf)__v1_old, __addr,
-      (__v8di)__index, __mask, __scale);
-}
-
-__funline __m512d _mm512_i64gather_pd(__m512i __index, void const *__addr,
-    int __scale) {
-  __m512d __v1_old = _mm512_undefined_pd();
-  __mmask8 __mask = 0xFF;
-
-  return (__m512d)__builtin_ia32_gatherdiv8df((__v8df)__v1_old, __addr,
-      (__v8di)__index, __mask, __scale);
-}
-
-__funline __m512d _mm512_mask_i64gather_pd(__m512d __v1_old, __mmask8 __mask,
-    __m512i __index, void const *__addr,
-    int __scale) {
-  return (__m512d)__builtin_ia32_gatherdiv8df((__v8df)__v1_old, __addr,
-      (__v8di)__index, __mask, __scale);
-}
-
-__funline __m512i _mm512_i32gather_epi32(__m512i __index, void const *__addr,
-    int __scale) {
-  __m512i __v1_old = _mm512_undefined_epi32();
-  __mmask16 __mask = 0xFFFF;
-
-  return (__m512i)__builtin_ia32_gathersiv16si(
-      (__v16si)__v1_old, __addr, (__v16si)__index, __mask, __scale);
-}
-
-__funline __m512i _mm512_mask_i32gather_epi32(__m512i __v1_old, __mmask16 __mask,
-    __m512i __index, void const *__addr,
-    int __scale) {
-  return (__m512i)__builtin_ia32_gathersiv16si(
-      (__v16si)__v1_old, __addr, (__v16si)__index, __mask, __scale);
-}
-
-__funline __m512i _mm512_i32gather_epi64(__m256i __index, void const *__addr,
-    int __scale) {
-  __m512i __v1_old = _mm512_undefined_epi32();
-  __mmask8 __mask = 0xFF;
-
-  return (__m512i)__builtin_ia32_gathersiv8di((__v8di)__v1_old, __addr,
-      (__v8si)__index, __mask, __scale);
-}
-
-__funline __m512i _mm512_mask_i32gather_epi64(__m512i __v1_old, __mmask8 __mask,
-    __m256i __index, void const *__addr,
-    int __scale) {
-  return (__m512i)__builtin_ia32_gathersiv8di((__v8di)__v1_old, __addr,
-      (__v8si)__index, __mask, __scale);
-}
-
-__funline __m256i _mm512_i64gather_epi32(__m512i __index, void const *__addr,
-    int __scale) {
-  __m256i __v1_old = _mm256_undefined_si256();
-  __mmask8 __mask = 0xFF;
-
-  return (__m256i)__builtin_ia32_gatherdiv16si(
-      (__v8si)__v1_old, __addr, (__v8di)__index, __mask, __scale);
-}
-
-__funline __m256i _mm512_mask_i64gather_epi32(__m256i __v1_old, __mmask8 __mask,
-    __m512i __index, void const *__addr,
-    int __scale) {
-  return (__m256i)__builtin_ia32_gatherdiv16si(
-      (__v8si)__v1_old, __addr, (__v8di)__index, __mask, __scale);
-}
-
-__funline __m512i _mm512_i64gather_epi64(__m512i __index, void const *__addr,
-    int __scale) {
-  __m512i __v1_old = _mm512_undefined_epi32();
-  __mmask8 __mask = 0xFF;
-
-  return (__m512i)__builtin_ia32_gatherdiv8di((__v8di)__v1_old, __addr,
-      (__v8di)__index, __mask, __scale);
-}
-
-__funline __m512i _mm512_mask_i64gather_epi64(__m512i __v1_old, __mmask8 __mask,
-    __m512i __index, void const *__addr,
-    int __scale) {
-  return (__m512i)__builtin_ia32_gatherdiv8di((__v8di)__v1_old, __addr,
-      (__v8di)__index, __mask, __scale);
-}
-
-__funline void _mm512_i32scatter_ps(void *__addr, __m512i __index, __m512 __v1,
-    int __scale) {
-  __builtin_ia32_scattersiv16sf(__addr, (__mmask16)0xFFFF, (__v16si)__index,
-      (__v16sf)__v1, __scale);
-}
-
-__funline void _mm512_mask_i32scatter_ps(void *__addr, __mmask16 __mask,
-    __m512i __index, __m512 __v1,
-    int __scale) {
-  __builtin_ia32_scattersiv16sf(__addr, __mask, (__v16si)__index, (__v16sf)__v1,
-      __scale);
-}
-
-__funline void _mm512_i32scatter_pd(void *__addr, __m256i __index, __m512d __v1,
-    int __scale) {
-  __builtin_ia32_scattersiv8df(__addr, (__mmask8)0xFF, (__v8si)__index,
-      (__v8df)__v1, __scale);
-}
-
-__funline void _mm512_mask_i32scatter_pd(void *__addr, __mmask8 __mask,
-    __m256i __index, __m512d __v1,
-    int __scale) {
-  __builtin_ia32_scattersiv8df(__addr, __mask, (__v8si)__index, (__v8df)__v1,
-      __scale);
-}
-
-__funline void _mm512_i64scatter_ps(void *__addr, __m512i __index, __m256 __v1,
-    int __scale) {
-  __builtin_ia32_scatterdiv16sf(__addr, (__mmask8)0xFF, (__v8di)__index,
-      (__v8sf)__v1, __scale);
-}
-
-__funline void _mm512_mask_i64scatter_ps(void *__addr, __mmask8 __mask,
-    __m512i __index, __m256 __v1,
-    int __scale) {
-  __builtin_ia32_scatterdiv16sf(__addr, __mask, (__v8di)__index, (__v8sf)__v1,
-      __scale);
-}
-
-__funline void _mm512_i64scatter_pd(void *__addr, __m512i __index, __m512d __v1,
-    int __scale) {
-  __builtin_ia32_scatterdiv8df(__addr, (__mmask8)0xFF, (__v8di)__index,
-      (__v8df)__v1, __scale);
-}
-
-__funline void _mm512_mask_i64scatter_pd(void *__addr, __mmask8 __mask,
-    __m512i __index, __m512d __v1,
-    int __scale) {
-  __builtin_ia32_scatterdiv8df(__addr, __mask, (__v8di)__index, (__v8df)__v1,
-      __scale);
-}
-
-__funline void _mm512_i32scatter_epi32(void *__addr, __m512i __index,
-    __m512i __v1, int __scale) {
-  __builtin_ia32_scattersiv16si(__addr, (__mmask16)0xFFFF, (__v16si)__index,
-      (__v16si)__v1, __scale);
-}
-
-__funline void _mm512_mask_i32scatter_epi32(void *__addr, __mmask16 __mask,
-    __m512i __index, __m512i __v1,
-    int __scale) {
-  __builtin_ia32_scattersiv16si(__addr, __mask, (__v16si)__index, (__v16si)__v1,
-      __scale);
-}
-
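+/* For all of the i32/i64 gather and scatter forms here, lane i of the
+   index vector addresses memory at ADDR + INDEX[i] * SCALE, where SCALE
+   must be 1, 2, 4 or 8; the masked variants touch only lanes whose mask
+   bit is set. */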
-__funline void _mm512_i32scatter_epi64(void *__addr, __m256i __index,
-    __m512i __v1, int __scale) {
-  __builtin_ia32_scattersiv8di(__addr, (__mmask8)0xFF, (__v8si)__index,
-      (__v8di)__v1, __scale);
-}
-
-__funline void _mm512_mask_i32scatter_epi64(void *__addr, __mmask8 __mask,
-    __m256i __index, __m512i __v1,
-    int __scale) {
-  __builtin_ia32_scattersiv8di(__addr, __mask, (__v8si)__index, (__v8di)__v1,
-      __scale);
-}
-
-__funline void _mm512_i64scatter_epi32(void *__addr, __m512i __index,
-    __m256i __v1, int __scale) {
-  __builtin_ia32_scatterdiv16si(__addr, (__mmask8)0xFF, (__v8di)__index,
-      (__v8si)__v1, __scale);
-}
-
-__funline void _mm512_mask_i64scatter_epi32(void *__addr, __mmask8 __mask,
-    __m512i __index, __m256i __v1,
-    int __scale) {
-  __builtin_ia32_scatterdiv16si(__addr, __mask, (__v8di)__index, (__v8si)__v1,
-      __scale);
-}
-
-__funline void _mm512_i64scatter_epi64(void *__addr, __m512i __index,
-    __m512i __v1, int __scale) {
-  __builtin_ia32_scatterdiv8di(__addr, (__mmask8)0xFF, (__v8di)__index,
-      (__v8di)__v1, __scale);
-}
-
-__funline void _mm512_mask_i64scatter_epi64(void *__addr, __mmask8 __mask,
-    __m512i __index, __m512i __v1,
-    int __scale) {
-  __builtin_ia32_scatterdiv8di(__addr, __mask, (__v8di)__index, (__v8di)__v1,
-      __scale);
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmp_round_ss_mask (__mmask8 __M, __m128 __X, __m128 __Y,
+      const int __P, const int __R)
+{
+  return (__mmask8) __builtin_ia32_cmpss_mask ((__v4sf) __X,
+        (__v4sf) __Y, __P,
+        (__mmask8) __M, __R);
 }
 #else
-#define _mm512_i32gather_ps(INDEX, ADDR, SCALE) \
-  (__m512) __builtin_ia32_gathersiv16sf( \
-      (__v16sf)_mm512_undefined_ps(), (void const *)ADDR, \
-      (__v16si)(__m512i)INDEX, (__mmask16)0xFFFF, (int)SCALE)
-
-#define _mm512_mask_i32gather_ps(V1OLD, MASK, INDEX, ADDR, SCALE) \
-  (__m512) __builtin_ia32_gathersiv16sf( \
-      (__v16sf)(__m512)V1OLD, (void const *)ADDR, (__v16si)(__m512i)INDEX, \
-      (__mmask16)MASK, (int)SCALE)
-
-#define _mm512_i32gather_pd(INDEX, ADDR, SCALE) \
-  (__m512d) __builtin_ia32_gathersiv8df( \
-      (__v8df)_mm512_undefined_pd(), (void const *)ADDR, \
-      (__v8si)(__m256i)INDEX, (__mmask8)0xFF, (int)SCALE)
-
-#define _mm512_mask_i32gather_pd(V1OLD, MASK, INDEX, ADDR, SCALE) \
-  (__m512d) __builtin_ia32_gathersiv8df( \
-      (__v8df)(__m512d)V1OLD, (void const *)ADDR, (__v8si)(__m256i)INDEX, \
-      (__mmask8)MASK, (int)SCALE)
-
-#define _mm512_i64gather_ps(INDEX, ADDR, SCALE) \
-  (__m256) __builtin_ia32_gatherdiv16sf( \
-      (__v8sf)_mm256_undefined_ps(), (void const *)ADDR, \
-      (__v8di)(__m512i)INDEX, (__mmask8)0xFF, (int)SCALE)
-
-#define _mm512_mask_i64gather_ps(V1OLD, MASK, INDEX, ADDR, SCALE) \
-  (__m256) __builtin_ia32_gatherdiv16sf( \
-      (__v8sf)(__m256)V1OLD, (void const *)ADDR, (__v8di)(__m512i)INDEX, \
-      (__mmask8)MASK, (int)SCALE)
-
-#define _mm512_i64gather_pd(INDEX, ADDR, SCALE) \
-  (__m512d) __builtin_ia32_gatherdiv8df( \
-      (__v8df)_mm512_undefined_pd(), (void const *)ADDR, \
-      (__v8di)(__m512i)INDEX, (__mmask8)0xFF, (int)SCALE)
-
-#define _mm512_mask_i64gather_pd(V1OLD, MASK, INDEX, ADDR, SCALE) \
-  (__m512d) __builtin_ia32_gatherdiv8df( \
-      (__v8df)(__m512d)V1OLD, (void const *)ADDR, (__v8di)(__m512i)INDEX, \
-      (__mmask8)MASK, (int)SCALE)
-
-#define _mm512_i32gather_epi32(INDEX, ADDR, SCALE) \
-  (__m512i) __builtin_ia32_gathersiv16si( \
-      (__v16si)_mm512_undefined_epi32(), (void const *)ADDR, \
-      (__v16si)(__m512i)INDEX, (__mmask16)0xFFFF, (int)SCALE)
-
-#define _mm512_mask_i32gather_epi32(V1OLD, MASK, INDEX, ADDR, SCALE) \
-  (__m512i) __builtin_ia32_gathersiv16si( \
(__v16si)(__m512i)V1OLD, (void const *)ADDR, (__v16si)(__m512i)INDEX, \ - (__mmask16)MASK, (int)SCALE) - -#define _mm512_i32gather_epi64(INDEX, ADDR, SCALE) \ - (__m512i) __builtin_ia32_gathersiv8di( \ - (__v8di)_mm512_undefined_epi32(), (void const *)ADDR, \ - (__v8si)(__m256i)INDEX, (__mmask8)0xFF, (int)SCALE) - -#define _mm512_mask_i32gather_epi64(V1OLD, MASK, INDEX, ADDR, SCALE) \ - (__m512i) __builtin_ia32_gathersiv8di( \ - (__v8di)(__m512i)V1OLD, (void const *)ADDR, (__v8si)(__m256i)INDEX, \ - (__mmask8)MASK, (int)SCALE) - -#define _mm512_i64gather_epi32(INDEX, ADDR, SCALE) \ - (__m256i) __builtin_ia32_gatherdiv16si( \ - (__v8si)_mm256_undefined_si256(), (void const *)ADDR, \ - (__v8di)(__m512i)INDEX, (__mmask8)0xFF, (int)SCALE) - -#define _mm512_mask_i64gather_epi32(V1OLD, MASK, INDEX, ADDR, SCALE) \ - (__m256i) __builtin_ia32_gatherdiv16si( \ - (__v8si)(__m256i)V1OLD, (void const *)ADDR, (__v8di)(__m512i)INDEX, \ - (__mmask8)MASK, (int)SCALE) - -#define _mm512_i64gather_epi64(INDEX, ADDR, SCALE) \ - (__m512i) __builtin_ia32_gatherdiv8di( \ - (__v8di)_mm512_undefined_epi32(), (void const *)ADDR, \ - (__v8di)(__m512i)INDEX, (__mmask8)0xFF, (int)SCALE) - -#define _mm512_mask_i64gather_epi64(V1OLD, MASK, INDEX, ADDR, SCALE) \ - (__m512i) __builtin_ia32_gatherdiv8di( \ - (__v8di)(__m512i)V1OLD, (void const *)ADDR, (__v8di)(__m512i)INDEX, \ - (__mmask8)MASK, (int)SCALE) - -#define _mm512_i32scatter_ps(ADDR, INDEX, V1, SCALE) \ - __builtin_ia32_scattersiv16sf((void *)ADDR, (__mmask16)0xFFFF, \ - (__v16si)(__m512i)INDEX, (__v16sf)(__m512)V1, \ - (int)SCALE) - -#define _mm512_mask_i32scatter_ps(ADDR, MASK, INDEX, V1, SCALE) \ - __builtin_ia32_scattersiv16sf((void *)ADDR, (__mmask16)MASK, \ - (__v16si)(__m512i)INDEX, (__v16sf)(__m512)V1, \ - (int)SCALE) - -#define _mm512_i32scatter_pd(ADDR, INDEX, V1, SCALE) \ - __builtin_ia32_scattersiv8df((void *)ADDR, (__mmask8)0xFF, \ - (__v8si)(__m256i)INDEX, (__v8df)(__m512d)V1, \ - (int)SCALE) - -#define _mm512_mask_i32scatter_pd(ADDR, MASK, INDEX, V1, SCALE) \ - __builtin_ia32_scattersiv8df((void *)ADDR, (__mmask8)MASK, \ - (__v8si)(__m256i)INDEX, (__v8df)(__m512d)V1, \ - (int)SCALE) - -#define _mm512_i64scatter_ps(ADDR, INDEX, V1, SCALE) \ - __builtin_ia32_scatterdiv16sf((void *)ADDR, (__mmask8)0xFF, \ - (__v8di)(__m512i)INDEX, (__v8sf)(__m256)V1, \ - (int)SCALE) - -#define _mm512_mask_i64scatter_ps(ADDR, MASK, INDEX, V1, SCALE) \ - __builtin_ia32_scatterdiv16sf((void *)ADDR, (__mmask16)MASK, \ - (__v8di)(__m512i)INDEX, (__v8sf)(__m256)V1, \ - (int)SCALE) - -#define _mm512_i64scatter_pd(ADDR, INDEX, V1, SCALE) \ - __builtin_ia32_scatterdiv8df((void *)ADDR, (__mmask8)0xFF, \ - (__v8di)(__m512i)INDEX, (__v8df)(__m512d)V1, \ - (int)SCALE) - -#define _mm512_mask_i64scatter_pd(ADDR, MASK, INDEX, V1, SCALE) \ - __builtin_ia32_scatterdiv8df((void *)ADDR, (__mmask8)MASK, \ - (__v8di)(__m512i)INDEX, (__v8df)(__m512d)V1, \ - (int)SCALE) - -#define _mm512_i32scatter_epi32(ADDR, INDEX, V1, SCALE) \ - __builtin_ia32_scattersiv16si((void *)ADDR, (__mmask16)0xFFFF, \ - (__v16si)(__m512i)INDEX, (__v16si)(__m512i)V1, \ - (int)SCALE) - -#define _mm512_mask_i32scatter_epi32(ADDR, MASK, INDEX, V1, SCALE) \ - __builtin_ia32_scattersiv16si((void *)ADDR, (__mmask16)MASK, \ - (__v16si)(__m512i)INDEX, (__v16si)(__m512i)V1, \ - (int)SCALE) - -#define _mm512_i32scatter_epi64(ADDR, INDEX, V1, SCALE) \ - __builtin_ia32_scattersiv8di((void *)ADDR, (__mmask8)0xFF, \ - (__v8si)(__m256i)INDEX, (__v8di)(__m512i)V1, \ - (int)SCALE) - -#define _mm512_mask_i32scatter_epi64(ADDR, MASK, 
INDEX, V1, SCALE) \ - __builtin_ia32_scattersiv8di((void *)ADDR, (__mmask8)MASK, \ - (__v8si)(__m256i)INDEX, (__v8di)(__m512i)V1, \ - (int)SCALE) - -#define _mm512_i64scatter_epi32(ADDR, INDEX, V1, SCALE) \ - __builtin_ia32_scatterdiv16si((void *)ADDR, (__mmask8)0xFF, \ - (__v8di)(__m512i)INDEX, (__v8si)(__m256i)V1, \ - (int)SCALE) - -#define _mm512_mask_i64scatter_epi32(ADDR, MASK, INDEX, V1, SCALE) \ - __builtin_ia32_scatterdiv16si((void *)ADDR, (__mmask8)MASK, \ - (__v8di)(__m512i)INDEX, (__v8si)(__m256i)V1, \ - (int)SCALE) - -#define _mm512_i64scatter_epi64(ADDR, INDEX, V1, SCALE) \ - __builtin_ia32_scatterdiv8di((void *)ADDR, (__mmask8)0xFF, \ - (__v8di)(__m512i)INDEX, (__v8di)(__m512i)V1, \ - (int)SCALE) - -#define _mm512_mask_i64scatter_epi64(ADDR, MASK, INDEX, V1, SCALE) \ - __builtin_ia32_scatterdiv8di((void *)ADDR, (__mmask8)MASK, \ - (__v8di)(__m512i)INDEX, (__v8di)(__m512i)V1, \ - (int)SCALE) +#define _kshiftli_mask16(X, Y) ((__mmask16) __builtin_ia32_kshiftlihi ((__mmask16)(X), (__mmask8)(Y))) +#define _kshiftri_mask16(X, Y) ((__mmask16) __builtin_ia32_kshiftrihi ((__mmask16)(X), (__mmask8)(Y))) +#define _mm512_cmp_epi64_mask(X, Y, P) ((__mmask8) __builtin_ia32_cmpq512_mask ((__v8di)(__m512i)(X), (__v8di)(__m512i)(Y), (int)(P), (__mmask8)-1)) +#define _mm512_cmp_epi32_mask(X, Y, P) ((__mmask16) __builtin_ia32_cmpd512_mask ((__v16si)(__m512i)(X), (__v16si)(__m512i)(Y), (int)(P), (__mmask16)-1)) +#define _mm512_cmp_epu64_mask(X, Y, P) ((__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di)(__m512i)(X), (__v8di)(__m512i)(Y), (int)(P), (__mmask8)-1)) +#define _mm512_cmp_epu32_mask(X, Y, P) ((__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si)(__m512i)(X), (__v16si)(__m512i)(Y), (int)(P), (__mmask16)-1)) +#define _mm512_cmp_round_pd_mask(X, Y, P, R) ((__mmask8) __builtin_ia32_cmppd512_mask ((__v8df)(__m512d)(X), (__v8df)(__m512d)(Y), (int)(P), (__mmask8)-1, R)) +#define _mm512_cmp_round_ps_mask(X, Y, P, R) ((__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf)(__m512)(X), (__v16sf)(__m512)(Y), (int)(P), (__mmask16)-1, R)) +#define _mm512_mask_cmp_epi64_mask(M, X, Y, P) ((__mmask8) __builtin_ia32_cmpq512_mask ((__v8di)(__m512i)(X), (__v8di)(__m512i)(Y), (int)(P), (__mmask8)(M))) +#define _mm512_mask_cmp_epi32_mask(M, X, Y, P) ((__mmask16) __builtin_ia32_cmpd512_mask ((__v16si)(__m512i)(X), (__v16si)(__m512i)(Y), (int)(P), (__mmask16)(M))) +#define _mm512_mask_cmp_epu64_mask(M, X, Y, P) ((__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di)(__m512i)(X), (__v8di)(__m512i)(Y), (int)(P), (__mmask8)(M))) +#define _mm512_mask_cmp_epu32_mask(M, X, Y, P) ((__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si)(__m512i)(X), (__v16si)(__m512i)(Y), (int)(P), (__mmask16)(M))) +#define _mm512_mask_cmp_round_pd_mask(M, X, Y, P, R) ((__mmask8) __builtin_ia32_cmppd512_mask ((__v8df)(__m512d)(X), (__v8df)(__m512d)(Y), (int)(P), (__mmask8)(M), R)) +#define _mm512_mask_cmp_round_ps_mask(M, X, Y, P, R) ((__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf)(__m512)(X), (__v16sf)(__m512)(Y), (int)(P), (__mmask16)(M), R)) +#define _mm_cmp_round_sd_mask(X, Y, P, R) ((__mmask8) __builtin_ia32_cmpsd_mask ((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (int)(P), (__mmask8)-1, R)) +#define _mm_mask_cmp_round_sd_mask(M, X, Y, P, R) ((__mmask8) __builtin_ia32_cmpsd_mask ((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (int)(P), (M), R)) +#define _mm_cmp_round_ss_mask(X, Y, P, R) ((__mmask8) __builtin_ia32_cmpss_mask ((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (int)(P), (__mmask8)-1, R)) +#define _mm_mask_cmp_round_ss_mask(M, X, Y, P, R) 
((__mmask8) __builtin_ia32_cmpss_mask ((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (int)(P), (M), R)) #endif - -__funline __m512d _mm512_mask_compress_pd(__m512d __W, __mmask8 __U, - __m512d __A) { - return (__m512d)__builtin_ia32_compressdf512_mask((__v8df)__A, (__v8df)__W, - (__mmask8)__U); +#ifdef __OPTIMIZE__ +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_i32gather_ps (__m512i __index, void const *__addr, int __scale) +{ + __m512 __v1_old = _mm512_undefined_ps (); + __mmask16 __mask = 0xFFFF; + return (__m512) __builtin_ia32_gathersiv16sf ((__v16sf) __v1_old, + __addr, + (__v16si) __index, + __mask, __scale); } - -__funline __m512d _mm512_maskz_compress_pd(__mmask8 __U, __m512d __A) { - return (__m512d)__builtin_ia32_compressdf512_mask( - (__v8df)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_i32gather_ps (__m512 __v1_old, __mmask16 __mask, + __m512i __index, void const *__addr, int __scale) +{ + return (__m512) __builtin_ia32_gathersiv16sf ((__v16sf) __v1_old, + __addr, + (__v16si) __index, + __mask, __scale); } - -__funline void _mm512_mask_compressstoreu_pd(void *__P, __mmask8 __U, - __m512d __A) { - __builtin_ia32_compressstoredf512_mask((__v8df *)__P, (__v8df)__A, - (__mmask8)__U); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_i32gather_pd (__m256i __index, void const *__addr, int __scale) +{ + __m512d __v1_old = _mm512_undefined_pd (); + __mmask8 __mask = 0xFF; + return (__m512d) __builtin_ia32_gathersiv8df ((__v8df) __v1_old, + __addr, + (__v8si) __index, __mask, + __scale); } - -__funline __m512 _mm512_mask_compress_ps(__m512 __W, __mmask16 __U, __m512 __A) { - return (__m512)__builtin_ia32_compresssf512_mask((__v16sf)__A, (__v16sf)__W, - (__mmask16)__U); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_i32gather_pd (__m512d __v1_old, __mmask8 __mask, + __m256i __index, void const *__addr, int __scale) +{ + return (__m512d) __builtin_ia32_gathersiv8df ((__v8df) __v1_old, + __addr, + (__v8si) __index, + __mask, __scale); } - -__funline __m512 _mm512_maskz_compress_ps(__mmask16 __U, __m512 __A) { - return (__m512)__builtin_ia32_compresssf512_mask( - (__v16sf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U); +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_i64gather_ps (__m512i __index, void const *__addr, int __scale) +{ + __m256 __v1_old = _mm256_undefined_ps (); + __mmask8 __mask = 0xFF; + return (__m256) __builtin_ia32_gatherdiv16sf ((__v8sf) __v1_old, + __addr, + (__v8di) __index, __mask, + __scale); } - -__funline void _mm512_mask_compressstoreu_ps(void *__P, __mmask16 __U, - __m512 __A) { - __builtin_ia32_compressstoresf512_mask((__v16sf *)__P, (__v16sf)__A, - (__mmask16)__U); +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_i64gather_ps (__m256 __v1_old, __mmask8 __mask, + __m512i __index, void const *__addr, int __scale) +{ + return (__m256) __builtin_ia32_gatherdiv16sf ((__v8sf) __v1_old, + __addr, + (__v8di) __index, + __mask, __scale); } - -__funline __m512i _mm512_mask_compress_epi64(__m512i __W, __mmask8 __U, - __m512i __A) { - return (__m512i)__builtin_ia32_compressdi512_mask((__v8di)__A, (__v8di)__W, - (__mmask8)__U); +extern __inline __m512d +__attribute__ ((__gnu_inline__, 
__always_inline__, __artificial__)) +_mm512_i64gather_pd (__m512i __index, void const *__addr, int __scale) +{ + __m512d __v1_old = _mm512_undefined_pd (); + __mmask8 __mask = 0xFF; + return (__m512d) __builtin_ia32_gatherdiv8df ((__v8df) __v1_old, + __addr, + (__v8di) __index, __mask, + __scale); } - -__funline __m512i _mm512_maskz_compress_epi64(__mmask8 __U, __m512i __A) { - return (__m512i)__builtin_ia32_compressdi512_mask( - (__v8di)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_i64gather_pd (__m512d __v1_old, __mmask8 __mask, + __m512i __index, void const *__addr, int __scale) +{ + return (__m512d) __builtin_ia32_gatherdiv8df ((__v8df) __v1_old, + __addr, + (__v8di) __index, + __mask, __scale); } - -__funline void _mm512_mask_compressstoreu_epi64(void *__P, __mmask8 __U, - __m512i __A) { - __builtin_ia32_compressstoredi512_mask((__v8di *)__P, (__v8di)__A, - (__mmask8)__U); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_i32gather_epi32 (__m512i __index, void const *__addr, int __scale) +{ + __m512i __v1_old = _mm512_undefined_epi32 (); + __mmask16 __mask = 0xFFFF; + return (__m512i) __builtin_ia32_gathersiv16si ((__v16si) __v1_old, + __addr, + (__v16si) __index, + __mask, __scale); } - -__funline __m512i _mm512_mask_compress_epi32(__m512i __W, __mmask16 __U, - __m512i __A) { - return (__m512i)__builtin_ia32_compresssi512_mask((__v16si)__A, (__v16si)__W, - (__mmask16)__U); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_i32gather_epi32 (__m512i __v1_old, __mmask16 __mask, + __m512i __index, void const *__addr, int __scale) +{ + return (__m512i) __builtin_ia32_gathersiv16si ((__v16si) __v1_old, + __addr, + (__v16si) __index, + __mask, __scale); } - -__funline __m512i _mm512_maskz_compress_epi32(__mmask16 __U, __m512i __A) { - return (__m512i)__builtin_ia32_compresssi512_mask( - (__v16si)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_i32gather_epi64 (__m256i __index, void const *__addr, int __scale) +{ + __m512i __v1_old = _mm512_undefined_epi32 (); + __mmask8 __mask = 0xFF; + return (__m512i) __builtin_ia32_gathersiv8di ((__v8di) __v1_old, + __addr, + (__v8si) __index, __mask, + __scale); } - -__funline void _mm512_mask_compressstoreu_epi32(void *__P, __mmask16 __U, - __m512i __A) { - __builtin_ia32_compressstoresi512_mask((__v16si *)__P, (__v16si)__A, - (__mmask16)__U); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_i32gather_epi64 (__m512i __v1_old, __mmask8 __mask, + __m256i __index, void const *__addr, + int __scale) +{ + return (__m512i) __builtin_ia32_gathersiv8di ((__v8di) __v1_old, + __addr, + (__v8si) __index, + __mask, __scale); } - -__funline __m512d _mm512_mask_expand_pd(__m512d __W, __mmask8 __U, __m512d __A) { - return (__m512d)__builtin_ia32_expanddf512_mask((__v8df)__A, (__v8df)__W, - (__mmask8)__U); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_i64gather_epi32 (__m512i __index, void const *__addr, int __scale) +{ + __m256i __v1_old = _mm256_undefined_si256 (); + __mmask8 __mask = 0xFF; + return (__m256i) __builtin_ia32_gatherdiv16si ((__v8si) __v1_old, + __addr, + (__v8di) __index, + __mask, __scale); } - -__funline __m512d 
_mm512_maskz_expand_pd(__mmask8 __U, __m512d __A) { - return (__m512d)__builtin_ia32_expanddf512_maskz( - (__v8df)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_i64gather_epi32 (__m256i __v1_old, __mmask8 __mask, + __m512i __index, void const *__addr, int __scale) +{ + return (__m256i) __builtin_ia32_gatherdiv16si ((__v8si) __v1_old, + __addr, + (__v8di) __index, + __mask, __scale); } - -__funline __m512d _mm512_mask_expandloadu_pd(__m512d __W, __mmask8 __U, - void const *__P) { - return (__m512d)__builtin_ia32_expandloaddf512_mask( - (const __v8df *)__P, (__v8df)__W, (__mmask8)__U); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_i64gather_epi64 (__m512i __index, void const *__addr, int __scale) +{ + __m512i __v1_old = _mm512_undefined_epi32 (); + __mmask8 __mask = 0xFF; + return (__m512i) __builtin_ia32_gatherdiv8di ((__v8di) __v1_old, + __addr, + (__v8di) __index, __mask, + __scale); } - -__funline __m512d _mm512_maskz_expandloadu_pd(__mmask8 __U, void const *__P) { - return (__m512d)__builtin_ia32_expandloaddf512_maskz( - (const __v8df *)__P, (__v8df)_mm512_setzero_pd(), (__mmask8)__U); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_i64gather_epi64 (__m512i __v1_old, __mmask8 __mask, + __m512i __index, void const *__addr, + int __scale) +{ + return (__m512i) __builtin_ia32_gatherdiv8di ((__v8di) __v1_old, + __addr, + (__v8di) __index, + __mask, __scale); } - -__funline __m512 _mm512_mask_expand_ps(__m512 __W, __mmask16 __U, __m512 __A) { - return (__m512)__builtin_ia32_expandsf512_mask((__v16sf)__A, (__v16sf)__W, - (__mmask16)__U); +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_i32scatter_ps (void *__addr, __m512i __index, __m512 __v1, int __scale) +{ + __builtin_ia32_scattersiv16sf (__addr, (__mmask16) 0xFFFF, + (__v16si) __index, (__v16sf) __v1, __scale); } - -__funline __m512 _mm512_maskz_expand_ps(__mmask16 __U, __m512 __A) { - return (__m512)__builtin_ia32_expandsf512_maskz( - (__v16sf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U); +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_i32scatter_ps (void *__addr, __mmask16 __mask, + __m512i __index, __m512 __v1, int __scale) +{ + __builtin_ia32_scattersiv16sf (__addr, __mask, (__v16si) __index, + (__v16sf) __v1, __scale); } - -__funline __m512 _mm512_mask_expandloadu_ps(__m512 __W, __mmask16 __U, - void const *__P) { - return (__m512)__builtin_ia32_expandloadsf512_mask( - (const __v16sf *)__P, (__v16sf)__W, (__mmask16)__U); +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_i32scatter_pd (void *__addr, __m256i __index, __m512d __v1, + int __scale) +{ + __builtin_ia32_scattersiv8df (__addr, (__mmask8) 0xFF, + (__v8si) __index, (__v8df) __v1, __scale); } - -__funline __m512 _mm512_maskz_expandloadu_ps(__mmask16 __U, void const *__P) { - return (__m512)__builtin_ia32_expandloadsf512_maskz( - (const __v16sf *)__P, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U); +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_i32scatter_pd (void *__addr, __mmask8 __mask, + __m256i __index, __m512d __v1, int __scale) +{ + __builtin_ia32_scattersiv8df (__addr, __mask, (__v8si) __index, + (__v8df) __v1, __scale); } - -__funline 
__m512i _mm512_mask_expand_epi64(__m512i __W, __mmask8 __U, - __m512i __A) { - return (__m512i)__builtin_ia32_expanddi512_mask((__v8di)__A, (__v8di)__W, - (__mmask8)__U); +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_i64scatter_ps (void *__addr, __m512i __index, __m256 __v1, int __scale) +{ + __builtin_ia32_scatterdiv16sf (__addr, (__mmask8) 0xFF, + (__v8di) __index, (__v8sf) __v1, __scale); } - -__funline __m512i _mm512_maskz_expand_epi64(__mmask8 __U, __m512i __A) { - return (__m512i)__builtin_ia32_expanddi512_maskz( - (__v8di)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_i64scatter_ps (void *__addr, __mmask8 __mask, + __m512i __index, __m256 __v1, int __scale) +{ + __builtin_ia32_scatterdiv16sf (__addr, __mask, (__v8di) __index, + (__v8sf) __v1, __scale); } - -__funline __m512i _mm512_mask_expandloadu_epi64(__m512i __W, __mmask8 __U, - void const *__P) { - return (__m512i)__builtin_ia32_expandloaddi512_mask( - (const __v8di *)__P, (__v8di)__W, (__mmask8)__U); +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_i64scatter_pd (void *__addr, __m512i __index, __m512d __v1, + int __scale) +{ + __builtin_ia32_scatterdiv8df (__addr, (__mmask8) 0xFF, + (__v8di) __index, (__v8df) __v1, __scale); } - -__funline __m512i _mm512_maskz_expandloadu_epi64(__mmask8 __U, void const *__P) { - return (__m512i)__builtin_ia32_expandloaddi512_maskz( - (const __v8di *)__P, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_i64scatter_pd (void *__addr, __mmask8 __mask, + __m512i __index, __m512d __v1, int __scale) +{ + __builtin_ia32_scatterdiv8df (__addr, __mask, (__v8di) __index, + (__v8df) __v1, __scale); } - -__funline __m512i _mm512_mask_expand_epi32(__m512i __W, __mmask16 __U, - __m512i __A) { - return (__m512i)__builtin_ia32_expandsi512_mask((__v16si)__A, (__v16si)__W, - (__mmask16)__U); +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_i32scatter_epi32 (void *__addr, __m512i __index, + __m512i __v1, int __scale) +{ + __builtin_ia32_scattersiv16si (__addr, (__mmask16) 0xFFFF, + (__v16si) __index, (__v16si) __v1, __scale); } - -__funline __m512i _mm512_maskz_expand_epi32(__mmask16 __U, __m512i __A) { - return (__m512i)__builtin_ia32_expandsi512_maskz( - (__v16si)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_i32scatter_epi32 (void *__addr, __mmask16 __mask, + __m512i __index, __m512i __v1, int __scale) +{ + __builtin_ia32_scattersiv16si (__addr, __mask, (__v16si) __index, + (__v16si) __v1, __scale); } - -__funline __m512i _mm512_mask_expandloadu_epi32(__m512i __W, __mmask16 __U, - void const *__P) { - return (__m512i)__builtin_ia32_expandloadsi512_mask( - (const __v16si *)__P, (__v16si)__W, (__mmask16)__U); +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_i32scatter_epi64 (void *__addr, __m256i __index, + __m512i __v1, int __scale) +{ + __builtin_ia32_scattersiv8di (__addr, (__mmask8) 0xFF, + (__v8si) __index, (__v8di) __v1, __scale); } - -__funline __m512i _mm512_maskz_expandloadu_epi32(__mmask16 __U, void const *__P) { - return (__m512i)__builtin_ia32_expandloadsi512_maskz( - (const __v16si 
*)__P, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_i32scatter_epi64 (void *__addr, __mmask8 __mask, + __m256i __index, __m512i __v1, int __scale) +{ + __builtin_ia32_scattersiv8di (__addr, __mask, (__v8si) __index, + (__v8di) __v1, __scale); } - -/* Mask arithmetic operations */ -#define _kand_mask16 _mm512_kand +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_i64scatter_epi32 (void *__addr, __m512i __index, + __m256i __v1, int __scale) +{ + __builtin_ia32_scatterdiv16si (__addr, (__mmask8) 0xFF, + (__v8di) __index, (__v8si) __v1, __scale); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_i64scatter_epi32 (void *__addr, __mmask8 __mask, + __m512i __index, __m256i __v1, int __scale) +{ + __builtin_ia32_scatterdiv16si (__addr, __mask, (__v8di) __index, + (__v8si) __v1, __scale); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_i64scatter_epi64 (void *__addr, __m512i __index, + __m512i __v1, int __scale) +{ + __builtin_ia32_scatterdiv8di (__addr, (__mmask8) 0xFF, + (__v8di) __index, (__v8di) __v1, __scale); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_i64scatter_epi64 (void *__addr, __mmask8 __mask, + __m512i __index, __m512i __v1, int __scale) +{ + __builtin_ia32_scatterdiv8di (__addr, __mask, (__v8di) __index, + (__v8di) __v1, __scale); +} +#else +#define _mm512_i32gather_ps(INDEX, ADDR, SCALE) (__m512) __builtin_ia32_gathersiv16sf ((__v16sf)_mm512_undefined_ps(), (void const *) (ADDR), (__v16si)(__m512i) (INDEX), (__mmask16)0xFFFF, (int) (SCALE)) +#define _mm512_mask_i32gather_ps(V1OLD, MASK, INDEX, ADDR, SCALE) (__m512) __builtin_ia32_gathersiv16sf ((__v16sf)(__m512) (V1OLD), (void const *) (ADDR), (__v16si)(__m512i) (INDEX), (__mmask16) (MASK), (int) (SCALE)) +#define _mm512_i32gather_pd(INDEX, ADDR, SCALE) (__m512d) __builtin_ia32_gathersiv8df ((__v8df)_mm512_undefined_pd(), (void const *) (ADDR), (__v8si)(__m256i) (INDEX), (__mmask8)0xFF, (int) (SCALE)) +#define _mm512_mask_i32gather_pd(V1OLD, MASK, INDEX, ADDR, SCALE) (__m512d) __builtin_ia32_gathersiv8df ((__v8df)(__m512d) (V1OLD), (void const *) (ADDR), (__v8si)(__m256i) (INDEX), (__mmask8) (MASK), (int) (SCALE)) +#define _mm512_i64gather_ps(INDEX, ADDR, SCALE) (__m256) __builtin_ia32_gatherdiv16sf ((__v8sf)_mm256_undefined_ps(), (void const *) (ADDR), (__v8di)(__m512i) (INDEX), (__mmask8)0xFF, (int) (SCALE)) +#define _mm512_mask_i64gather_ps(V1OLD, MASK, INDEX, ADDR, SCALE) (__m256) __builtin_ia32_gatherdiv16sf ((__v8sf)(__m256) (V1OLD), (void const *) (ADDR), (__v8di)(__m512i) (INDEX), (__mmask8) (MASK), (int) (SCALE)) +#define _mm512_i64gather_pd(INDEX, ADDR, SCALE) (__m512d) __builtin_ia32_gatherdiv8df ((__v8df)_mm512_undefined_pd(), (void const *) (ADDR), (__v8di)(__m512i) (INDEX), (__mmask8)0xFF, (int) (SCALE)) +#define _mm512_mask_i64gather_pd(V1OLD, MASK, INDEX, ADDR, SCALE) (__m512d) __builtin_ia32_gatherdiv8df ((__v8df)(__m512d) (V1OLD), (void const *) (ADDR), (__v8di)(__m512i) (INDEX), (__mmask8) (MASK), (int) (SCALE)) +#define _mm512_i32gather_epi32(INDEX, ADDR, SCALE) (__m512i) __builtin_ia32_gathersiv16si ((__v16si)_mm512_undefined_epi32 (), (void const *) (ADDR), (__v16si)(__m512i) (INDEX), (__mmask16)0xFFFF, (int) (SCALE)) +#define _mm512_mask_i32gather_epi32(V1OLD, MASK, INDEX, ADDR, 
SCALE) (__m512i) __builtin_ia32_gathersiv16si ((__v16si)(__m512i) (V1OLD), (void const *) (ADDR), (__v16si)(__m512i) (INDEX), (__mmask16) (MASK), (int) (SCALE)) +#define _mm512_i32gather_epi64(INDEX, ADDR, SCALE) (__m512i) __builtin_ia32_gathersiv8di ((__v8di)_mm512_undefined_epi32 (), (void const *) (ADDR), (__v8si)(__m256i) (INDEX), (__mmask8)0xFF, (int) (SCALE)) +#define _mm512_mask_i32gather_epi64(V1OLD, MASK, INDEX, ADDR, SCALE) (__m512i) __builtin_ia32_gathersiv8di ((__v8di)(__m512i) (V1OLD), (void const *) (ADDR), (__v8si)(__m256i) (INDEX), (__mmask8) (MASK), (int) (SCALE)) +#define _mm512_i64gather_epi32(INDEX, ADDR, SCALE) (__m256i) __builtin_ia32_gatherdiv16si ((__v8si)_mm256_undefined_si256(), (void const *) (ADDR), (__v8di)(__m512i) (INDEX), (__mmask8)0xFF, (int) (SCALE)) +#define _mm512_mask_i64gather_epi32(V1OLD, MASK, INDEX, ADDR, SCALE) (__m256i) __builtin_ia32_gatherdiv16si ((__v8si)(__m256i) (V1OLD), (void const *) (ADDR), (__v8di)(__m512i) (INDEX), (__mmask8) (MASK), (int) (SCALE)) +#define _mm512_i64gather_epi64(INDEX, ADDR, SCALE) (__m512i) __builtin_ia32_gatherdiv8di ((__v8di)_mm512_undefined_epi32 (), (void const *) (ADDR), (__v8di)(__m512i) (INDEX), (__mmask8)0xFF, (int) (SCALE)) +#define _mm512_mask_i64gather_epi64(V1OLD, MASK, INDEX, ADDR, SCALE) (__m512i) __builtin_ia32_gatherdiv8di ((__v8di)(__m512i) (V1OLD), (void const *) (ADDR), (__v8di)(__m512i) (INDEX), (__mmask8) (MASK), (int) (SCALE)) +#define _mm512_i32scatter_ps(ADDR, INDEX, V1, SCALE) __builtin_ia32_scattersiv16sf ((void *) (ADDR), (__mmask16)0xFFFF, (__v16si)(__m512i) (INDEX), (__v16sf)(__m512) (V1), (int) (SCALE)) +#define _mm512_mask_i32scatter_ps(ADDR, MASK, INDEX, V1, SCALE) __builtin_ia32_scattersiv16sf ((void *) (ADDR), (__mmask16) (MASK), (__v16si)(__m512i) (INDEX), (__v16sf)(__m512) (V1), (int) (SCALE)) +#define _mm512_i32scatter_pd(ADDR, INDEX, V1, SCALE) __builtin_ia32_scattersiv8df ((void *) (ADDR), (__mmask8)0xFF, (__v8si)(__m256i) (INDEX), (__v8df)(__m512d) (V1), (int) (SCALE)) +#define _mm512_mask_i32scatter_pd(ADDR, MASK, INDEX, V1, SCALE) __builtin_ia32_scattersiv8df ((void *) (ADDR), (__mmask8) (MASK), (__v8si)(__m256i) (INDEX), (__v8df)(__m512d) (V1), (int) (SCALE)) +#define _mm512_i64scatter_ps(ADDR, INDEX, V1, SCALE) __builtin_ia32_scatterdiv16sf ((void *) (ADDR), (__mmask8)0xFF, (__v8di)(__m512i) (INDEX), (__v8sf)(__m256) (V1), (int) (SCALE)) +#define _mm512_mask_i64scatter_ps(ADDR, MASK, INDEX, V1, SCALE) __builtin_ia32_scatterdiv16sf ((void *) (ADDR), (__mmask16) (MASK), (__v8di)(__m512i) (INDEX), (__v8sf)(__m256) (V1), (int) (SCALE)) +#define _mm512_i64scatter_pd(ADDR, INDEX, V1, SCALE) __builtin_ia32_scatterdiv8df ((void *) (ADDR), (__mmask8)0xFF, (__v8di)(__m512i) (INDEX), (__v8df)(__m512d) (V1), (int) (SCALE)) +#define _mm512_mask_i64scatter_pd(ADDR, MASK, INDEX, V1, SCALE) __builtin_ia32_scatterdiv8df ((void *) (ADDR), (__mmask8) (MASK), (__v8di)(__m512i) (INDEX), (__v8df)(__m512d) (V1), (int) (SCALE)) +#define _mm512_i32scatter_epi32(ADDR, INDEX, V1, SCALE) __builtin_ia32_scattersiv16si ((void *) (ADDR), (__mmask16)0xFFFF, (__v16si)(__m512i) (INDEX), (__v16si)(__m512i) (V1), (int) (SCALE)) +#define _mm512_mask_i32scatter_epi32(ADDR, MASK, INDEX, V1, SCALE) __builtin_ia32_scattersiv16si ((void *) (ADDR), (__mmask16) (MASK), (__v16si)(__m512i) (INDEX), (__v16si)(__m512i) (V1), (int) (SCALE)) +#define _mm512_i32scatter_epi64(ADDR, INDEX, V1, SCALE) __builtin_ia32_scattersiv8di ((void *) (ADDR), (__mmask8)0xFF, (__v8si)(__m256i) (INDEX), (__v8di)(__m512i) (V1), (int) (SCALE)) 
+#define _mm512_mask_i32scatter_epi64(ADDR, MASK, INDEX, V1, SCALE) __builtin_ia32_scattersiv8di ((void *) (ADDR), (__mmask8) (MASK), (__v8si)(__m256i) (INDEX), (__v8di)(__m512i) (V1), (int) (SCALE)) +#define _mm512_i64scatter_epi32(ADDR, INDEX, V1, SCALE) __builtin_ia32_scatterdiv16si ((void *) (ADDR), (__mmask8)0xFF, (__v8di)(__m512i) (INDEX), (__v8si)(__m256i) (V1), (int) (SCALE)) +#define _mm512_mask_i64scatter_epi32(ADDR, MASK, INDEX, V1, SCALE) __builtin_ia32_scatterdiv16si ((void *) (ADDR), (__mmask8) (MASK), (__v8di)(__m512i) (INDEX), (__v8si)(__m256i) (V1), (int) (SCALE)) +#define _mm512_i64scatter_epi64(ADDR, INDEX, V1, SCALE) __builtin_ia32_scatterdiv8di ((void *) (ADDR), (__mmask8)0xFF, (__v8di)(__m512i) (INDEX), (__v8di)(__m512i) (V1), (int) (SCALE)) +#define _mm512_mask_i64scatter_epi64(ADDR, MASK, INDEX, V1, SCALE) __builtin_ia32_scatterdiv8di ((void *) (ADDR), (__mmask8) (MASK), (__v8di)(__m512i) (INDEX), (__v8di)(__m512i) (V1), (int) (SCALE)) +#endif +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_compress_pd (__m512d __W, __mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_compressdf512_mask ((__v8df) __A, + (__v8df) __W, + (__mmask8) __U); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_compress_pd (__mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_compressdf512_mask ((__v8df) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m512d __A) +{ + __builtin_ia32_compressstoredf512_mask ((__v8df *) __P, (__v8df) __A, + (__mmask8) __U); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_compress_ps (__m512 __W, __mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_compresssf512_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) __U); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_compress_ps (__mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_compresssf512_mask ((__v16sf) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_compressstoreu_ps (void *__P, __mmask16 __U, __m512 __A) +{ + __builtin_ia32_compressstoresf512_mask ((__v16sf *) __P, (__v16sf) __A, + (__mmask16) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_compress_epi64 (__m512i __W, __mmask8 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_compressdi512_mask ((__v8di) __A, + (__v8di) __W, + (__mmask8) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_compress_epi64 (__mmask8 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_compressdi512_mask ((__v8di) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m512i __A) +{ + __builtin_ia32_compressstoredi512_mask ((__v8di *) __P, (__v8di) __A, + (__mmask8) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_compress_epi32 (__m512i __W, __mmask16 
__U, __m512i __A) +{ + return (__m512i) __builtin_ia32_compresssi512_mask ((__v16si) __A, + (__v16si) __W, + (__mmask16) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_compress_epi32 (__mmask16 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_compresssi512_mask ((__v16si) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_compressstoreu_epi32 (void *__P, __mmask16 __U, __m512i __A) +{ + __builtin_ia32_compressstoresi512_mask ((__v16si *) __P, (__v16si) __A, + (__mmask16) __U); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_expand_pd (__m512d __W, __mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_expanddf512_mask ((__v8df) __A, + (__v8df) __W, + (__mmask8) __U); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_expand_pd (__mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_expanddf512_maskz ((__v8df) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_expandloadu_pd (__m512d __W, __mmask8 __U, void const *__P) +{ + return (__m512d) __builtin_ia32_expandloaddf512_mask ((const __v8df *) __P, + (__v8df) __W, + (__mmask8) __U); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_expandloadu_pd (__mmask8 __U, void const *__P) +{ + return (__m512d) __builtin_ia32_expandloaddf512_maskz ((const __v8df *) __P, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_expand_ps (__m512 __W, __mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_expandsf512_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) __U); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_expand_ps (__mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_expandsf512_maskz ((__v16sf) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_expandloadu_ps (__m512 __W, __mmask16 __U, void const *__P) +{ + return (__m512) __builtin_ia32_expandloadsf512_mask ((const __v16sf *) __P, + (__v16sf) __W, + (__mmask16) __U); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_expandloadu_ps (__mmask16 __U, void const *__P) +{ + return (__m512) __builtin_ia32_expandloadsf512_maskz ((const __v16sf *) __P, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_expand_epi64 (__m512i __W, __mmask8 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_expanddi512_mask ((__v8di) __A, + (__v8di) __W, + (__mmask8) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_expand_epi64 (__mmask8 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_expanddi512_maskz ((__v8di) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, 
__artificial__)) +_mm512_mask_expandloadu_epi64 (__m512i __W, __mmask8 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_expandloaddi512_mask ((const __v8di *) __P, + (__v8di) __W, + (__mmask8) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_expandloadu_epi64 (__mmask8 __U, void const *__P) +{ + return (__m512i) + __builtin_ia32_expandloaddi512_maskz ((const __v8di *) __P, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_expand_epi32 (__m512i __W, __mmask16 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_expandsi512_mask ((__v16si) __A, + (__v16si) __W, + (__mmask16) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_expand_epi32 (__mmask16 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_expandsi512_maskz ((__v16si) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_expandloadu_epi32 (__m512i __W, __mmask16 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_expandloadsi512_mask ((const __v16si *) __P, + (__v16si) __W, + (__mmask16) __U); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_expandloadu_epi32 (__mmask16 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_expandloadsi512_maskz ((const __v16si *) __P, + (__v16si) + _mm512_setzero_si512 + (), (__mmask16) __U); +} +#define _kand_mask16 _mm512_kand #define _kandn_mask16 _mm512_kandn -#define _knot_mask16 _mm512_knot -#define _kor_mask16 _mm512_kor +#define _knot_mask16 _mm512_knot +#define _kor_mask16 _mm512_kor #define _kxnor_mask16 _mm512_kxnor -#define _kxor_mask16 _mm512_kxor - -__funline unsigned char _kortest_mask16_u8(__mmask16 __A, __mmask16 __B, - unsigned char *__CF) { - *__CF = (unsigned char)__builtin_ia32_kortestchi(__A, __B); - return (unsigned char)__builtin_ia32_kortestzhi(__A, __B); +#define _kxor_mask16 _mm512_kxor +extern __inline unsigned char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kortest_mask16_u8 (__mmask16 __A, __mmask16 __B, unsigned char *__CF) +{ + *__CF = (unsigned char) __builtin_ia32_kortestchi (__A, __B); + return (unsigned char) __builtin_ia32_kortestzhi (__A, __B); } - -__funline unsigned char _kortestz_mask16_u8(__mmask16 __A, __mmask16 __B) { - return (unsigned char)__builtin_ia32_kortestzhi((__mmask16)__A, - (__mmask16)__B); +extern __inline unsigned char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kortestz_mask16_u8 (__mmask16 __A, __mmask16 __B) +{ + return (unsigned char) __builtin_ia32_kortestzhi ((__mmask16) __A, + (__mmask16) __B); } - -__funline unsigned char _kortestc_mask16_u8(__mmask16 __A, __mmask16 __B) { - return (unsigned char)__builtin_ia32_kortestchi((__mmask16)__A, - (__mmask16)__B); +extern __inline unsigned char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kortestc_mask16_u8 (__mmask16 __A, __mmask16 __B) +{ + return (unsigned char) __builtin_ia32_kortestchi ((__mmask16) __A, + (__mmask16) __B); } - -__funline unsigned int _cvtmask16_u32(__mmask16 __A) { - return (unsigned int)__builtin_ia32_kmovw((__mmask16)__A); +extern __inline unsigned int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_cvtmask16_u32 
(__mmask16 __A) +{ + return (unsigned int) __builtin_ia32_kmovw ((__mmask16 ) __A); } - -__funline __mmask16 _cvtu32_mask16(unsigned int __A) { - return (__mmask16)__builtin_ia32_kmovw((__mmask16)__A); +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_cvtu32_mask16 (unsigned int __A) +{ + return (__mmask16) __builtin_ia32_kmovw ((__mmask16 ) __A); } - -__funline __mmask16 _load_mask16(__mmask16 *__A) { - return (__mmask16)__builtin_ia32_kmovw(*(__mmask16 *)__A); +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_load_mask16 (__mmask16 *__A) +{ + return (__mmask16) __builtin_ia32_kmovw (*(__mmask16 *) __A); } - -__funline void _store_mask16(__mmask16 *__A, __mmask16 __B) { - *(__mmask16 *)__A = __builtin_ia32_kmovw(__B); +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_store_mask16 (__mmask16 *__A, __mmask16 __B) +{ + *(__mmask16 *) __A = __builtin_ia32_kmovw (__B); } - -__funline __mmask16 _mm512_kand(__mmask16 __A, __mmask16 __B) { - return (__mmask16)__builtin_ia32_kandhi((__mmask16)__A, (__mmask16)__B); +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_kand (__mmask16 __A, __mmask16 __B) +{ + return (__mmask16) __builtin_ia32_kandhi ((__mmask16) __A, (__mmask16) __B); } - -__funline __mmask16 _mm512_kandn(__mmask16 __A, __mmask16 __B) { - return (__mmask16)__builtin_ia32_kandnhi((__mmask16)__A, (__mmask16)__B); +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_kandn (__mmask16 __A, __mmask16 __B) +{ + return (__mmask16) __builtin_ia32_kandnhi ((__mmask16) __A, + (__mmask16) __B); } - -__funline __mmask16 _mm512_kor(__mmask16 __A, __mmask16 __B) { - return (__mmask16)__builtin_ia32_korhi((__mmask16)__A, (__mmask16)__B); +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_kor (__mmask16 __A, __mmask16 __B) +{ + return (__mmask16) __builtin_ia32_korhi ((__mmask16) __A, (__mmask16) __B); } - -__funline int _mm512_kortestz(__mmask16 __A, __mmask16 __B) { - return (__mmask16)__builtin_ia32_kortestzhi((__mmask16)__A, (__mmask16)__B); +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_kortestz (__mmask16 __A, __mmask16 __B) +{ + return (__mmask16) __builtin_ia32_kortestzhi ((__mmask16) __A, + (__mmask16) __B); } - -__funline int _mm512_kortestc(__mmask16 __A, __mmask16 __B) { - return (__mmask16)__builtin_ia32_kortestchi((__mmask16)__A, (__mmask16)__B); +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_kortestc (__mmask16 __A, __mmask16 __B) +{ + return (__mmask16) __builtin_ia32_kortestchi ((__mmask16) __A, + (__mmask16) __B); } - -__funline __mmask16 _mm512_kxnor(__mmask16 __A, __mmask16 __B) { - return (__mmask16)__builtin_ia32_kxnorhi((__mmask16)__A, (__mmask16)__B); +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_kxnor (__mmask16 __A, __mmask16 __B) +{ + return (__mmask16) __builtin_ia32_kxnorhi ((__mmask16) __A, (__mmask16) __B); } - -__funline __mmask16 _mm512_kxor(__mmask16 __A, __mmask16 __B) { - return (__mmask16)__builtin_ia32_kxorhi((__mmask16)__A, (__mmask16)__B); +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_kxor (__mmask16 __A, __mmask16 __B) +{ + return (__mmask16) 
__builtin_ia32_kxorhi ((__mmask16) __A, (__mmask16) __B); } - -__funline __mmask16 _mm512_knot(__mmask16 __A) { - return (__mmask16)__builtin_ia32_knothi((__mmask16)__A); +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_knot (__mmask16 __A) +{ + return (__mmask16) __builtin_ia32_knothi ((__mmask16) __A); } - -__funline __mmask16 _mm512_kunpackb(__mmask16 __A, __mmask16 __B) { - return (__mmask16)__builtin_ia32_kunpckhi((__mmask16)__A, (__mmask16)__B); +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_kunpackb (__mmask16 __A, __mmask16 __B) +{ + return (__mmask16) __builtin_ia32_kunpckhi ((__mmask16) __A, (__mmask16) __B); } - -__funline __mmask16 _kunpackb_mask16(__mmask8 __A, __mmask8 __B) { - return (__mmask16)__builtin_ia32_kunpckhi((__mmask16)__A, (__mmask16)__B); +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kunpackb_mask16 (__mmask8 __A, __mmask8 __B) +{ + return (__mmask16) __builtin_ia32_kunpckhi ((__mmask16) __A, (__mmask16) __B); } - #ifdef __OPTIMIZE__ -__funline __m512i _mm512_maskz_inserti32x4(__mmask16 __B, __m512i __C, - __m128i __D, const int __imm) { - return (__m512i)__builtin_ia32_inserti32x4_mask( - (__v16si)__C, (__v4si)__D, __imm, (__v16si)_mm512_setzero_si512(), __B); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_inserti32x4 (__mmask16 __B, __m512i __C, __m128i __D, + const int __imm) +{ + return (__m512i) __builtin_ia32_inserti32x4_mask ((__v16si) __C, + (__v4si) __D, + __imm, + (__v16si) + _mm512_setzero_si512 (), + __B); } - -__funline __m512 _mm512_maskz_insertf32x4(__mmask16 __B, __m512 __C, __m128 __D, - const int __imm) { - return (__m512)__builtin_ia32_insertf32x4_mask( - (__v16sf)__C, (__v4sf)__D, __imm, (__v16sf)_mm512_setzero_ps(), __B); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_insertf32x4 (__mmask16 __B, __m512 __C, __m128 __D, + const int __imm) +{ + return (__m512) __builtin_ia32_insertf32x4_mask ((__v16sf) __C, + (__v4sf) __D, + __imm, + (__v16sf) + _mm512_setzero_ps (), __B); } - -__funline __m512i _mm512_mask_inserti32x4(__m512i __A, __mmask16 __B, __m512i __C, - __m128i __D, const int __imm) { - return (__m512i)__builtin_ia32_inserti32x4_mask((__v16si)__C, (__v4si)__D, - __imm, (__v16si)__A, __B); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_inserti32x4 (__m512i __A, __mmask16 __B, __m512i __C, + __m128i __D, const int __imm) +{ + return (__m512i) __builtin_ia32_inserti32x4_mask ((__v16si) __C, + (__v4si) __D, + __imm, + (__v16si) __A, + __B); } - -__funline __m512 _mm512_mask_insertf32x4(__m512 __A, __mmask16 __B, __m512 __C, - __m128 __D, const int __imm) { - return (__m512)__builtin_ia32_insertf32x4_mask((__v16sf)__C, (__v4sf)__D, - __imm, (__v16sf)__A, __B); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_insertf32x4 (__m512 __A, __mmask16 __B, __m512 __C, + __m128 __D, const int __imm) +{ + return (__m512) __builtin_ia32_insertf32x4_mask ((__v16sf) __C, + (__v4sf) __D, + __imm, + (__v16sf) __A, __B); } #else -#define _mm512_maskz_insertf32x4(A, X, Y, C) \ - ((__m512)__builtin_ia32_insertf32x4_mask( \ - (__v16sf)(__m512)(X), (__v4sf)(__m128)(Y), (int)(C), \ - (__v16sf)_mm512_setzero_ps(), (__mmask16)(A))) - -#define _mm512_maskz_inserti32x4(A, 
X, Y, C) \ - ((__m512i)__builtin_ia32_inserti32x4_mask( \ - (__v16si)(__m512i)(X), (__v4si)(__m128i)(Y), (int)(C), \ - (__v16si)_mm512_setzero_si512(), (__mmask16)(A))) - -#define _mm512_mask_insertf32x4(A, B, X, Y, C) \ - ((__m512)__builtin_ia32_insertf32x4_mask( \ - (__v16sf)(__m512)(X), (__v4sf)(__m128)(Y), (int)(C), \ - (__v16sf)(__m512)(A), (__mmask16)(B))) - -#define _mm512_mask_inserti32x4(A, B, X, Y, C) \ - ((__m512i)__builtin_ia32_inserti32x4_mask( \ - (__v16si)(__m512i)(X), (__v4si)(__m128i)(Y), (int)(C), \ - (__v16si)(__m512i)(A), (__mmask16)(B))) +#define _mm512_maskz_insertf32x4(A, X, Y, C) ((__m512) __builtin_ia32_insertf32x4_mask ((__v16sf)(__m512) (X), (__v4sf)(__m128) (Y), (int) (C), (__v16sf)_mm512_setzero_ps(), (__mmask16)(A))) +#define _mm512_maskz_inserti32x4(A, X, Y, C) ((__m512i) __builtin_ia32_inserti32x4_mask ((__v16si)(__m512i) (X), (__v4si)(__m128i) (Y), (int) (C), (__v16si)_mm512_setzero_si512 (), (__mmask16)(A))) +#define _mm512_mask_insertf32x4(A, B, X, Y, C) ((__m512) __builtin_ia32_insertf32x4_mask ((__v16sf)(__m512) (X), (__v4sf)(__m128) (Y), (int) (C), (__v16sf)(__m512) (A), (__mmask16)(B))) +#define _mm512_mask_inserti32x4(A, B, X, Y, C) ((__m512i) __builtin_ia32_inserti32x4_mask ((__v16si)(__m512i) (X), (__v4si)(__m128i) (Y), (int) (C), (__v16si)(__m512i) (A), (__mmask16)(B))) #endif - -__funline __m512i _mm512_max_epi64(__m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_pmaxsq512_mask( - (__v8di)__A, (__v8di)__B, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_max_epi64 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmaxsq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); } - -__funline __m512i _mm512_maskz_max_epi64(__mmask8 __M, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_pmaxsq512_mask( - (__v8di)__A, (__v8di)__B, (__v8di)_mm512_setzero_si512(), __M); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_max_epi64 (__mmask8 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmaxsq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_si512 (), + __M); } - -__funline __m512i _mm512_mask_max_epi64(__m512i __W, __mmask8 __M, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_pmaxsq512_mask((__v8di)__A, (__v8di)__B, - (__v8di)__W, __M); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_max_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmaxsq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) __W, __M); } - -__funline __m512i _mm512_min_epi64(__m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_pminsq512_mask( - (__v8di)__A, (__v8di)__B, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_min_epi64 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pminsq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); } - -__funline __m512i _mm512_mask_min_epi64(__m512i __W, __mmask8 __M, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_pminsq512_mask((__v8di)__A, (__v8di)__B, - (__v8di)__W, __M); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm512_mask_min_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pminsq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) __W, __M); } - -__funline __m512i _mm512_maskz_min_epi64(__mmask8 __M, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_pminsq512_mask( - (__v8di)__A, (__v8di)__B, (__v8di)_mm512_setzero_si512(), __M); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_min_epi64 (__mmask8 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pminsq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_si512 (), + __M); } - -__funline __m512i _mm512_max_epu64(__m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_pmaxuq512_mask( - (__v8di)__A, (__v8di)__B, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_max_epu64 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmaxuq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); } - -__funline __m512i _mm512_maskz_max_epu64(__mmask8 __M, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_pmaxuq512_mask( - (__v8di)__A, (__v8di)__B, (__v8di)_mm512_setzero_si512(), __M); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_max_epu64 (__mmask8 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmaxuq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_si512 (), + __M); } - -__funline __m512i _mm512_mask_max_epu64(__m512i __W, __mmask8 __M, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_pmaxuq512_mask((__v8di)__A, (__v8di)__B, - (__v8di)__W, __M); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_max_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmaxuq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) __W, __M); } - -__funline __m512i _mm512_min_epu64(__m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_pminuq512_mask( - (__v8di)__A, (__v8di)__B, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_min_epu64 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pminuq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); } - -__funline __m512i _mm512_mask_min_epu64(__m512i __W, __mmask8 __M, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_pminuq512_mask((__v8di)__A, (__v8di)__B, - (__v8di)__W, __M); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_min_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pminuq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) __W, __M); } - -__funline __m512i _mm512_maskz_min_epu64(__mmask8 __M, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_pminuq512_mask( - (__v8di)__A, (__v8di)__B, (__v8di)_mm512_setzero_si512(), __M); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_min_epu64 (__mmask8 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pminuq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_si512 (), + __M); } - 
-__funline __m512i _mm512_max_epi32(__m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_pmaxsd512_mask( - (__v16si)__A, (__v16si)__B, (__v16si)_mm512_undefined_epi32(), - (__mmask16)-1); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_max_epi32 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmaxsd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); } - -__funline __m512i _mm512_maskz_max_epi32(__mmask16 __M, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_pmaxsd512_mask( - (__v16si)__A, (__v16si)__B, (__v16si)_mm512_setzero_si512(), __M); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_max_epi32 (__mmask16 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmaxsd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + __M); } - -__funline __m512i _mm512_mask_max_epi32(__m512i __W, __mmask16 __M, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_pmaxsd512_mask((__v16si)__A, (__v16si)__B, - (__v16si)__W, __M); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_max_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmaxsd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __W, __M); } - -__funline __m512i _mm512_min_epi32(__m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_pminsd512_mask( - (__v16si)__A, (__v16si)__B, (__v16si)_mm512_undefined_epi32(), - (__mmask16)-1); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_min_epi32 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pminsd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); } - -__funline __m512i _mm512_maskz_min_epi32(__mmask16 __M, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_pminsd512_mask( - (__v16si)__A, (__v16si)__B, (__v16si)_mm512_setzero_si512(), __M); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_min_epi32 (__mmask16 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pminsd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + __M); } - -__funline __m512i _mm512_mask_min_epi32(__m512i __W, __mmask16 __M, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_pminsd512_mask((__v16si)__A, (__v16si)__B, - (__v16si)__W, __M); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_min_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pminsd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __W, __M); } - -__funline __m512i _mm512_max_epu32(__m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_pmaxud512_mask( - (__v16si)__A, (__v16si)__B, (__v16si)_mm512_undefined_epi32(), - (__mmask16)-1); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_max_epu32 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmaxud512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); } - -__funline __m512i _mm512_maskz_max_epu32(__mmask16 __M, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_pmaxud512_mask( - 
(__v16si)__A, (__v16si)__B, (__v16si)_mm512_setzero_si512(), __M); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_max_epu32 (__mmask16 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmaxud512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + __M); } - -__funline __m512i _mm512_mask_max_epu32(__m512i __W, __mmask16 __M, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_pmaxud512_mask((__v16si)__A, (__v16si)__B, - (__v16si)__W, __M); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_max_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmaxud512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __W, __M); } - -__funline __m512i _mm512_min_epu32(__m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_pminud512_mask( - (__v16si)__A, (__v16si)__B, (__v16si)_mm512_undefined_epi32(), - (__mmask16)-1); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_min_epu32 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pminud512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); } - -__funline __m512i _mm512_maskz_min_epu32(__mmask16 __M, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_pminud512_mask( - (__v16si)__A, (__v16si)__B, (__v16si)_mm512_setzero_si512(), __M); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_min_epu32 (__mmask16 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pminud512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + __M); } - -__funline __m512i _mm512_mask_min_epu32(__m512i __W, __mmask16 __M, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_pminud512_mask((__v16si)__A, (__v16si)__B, - (__v16si)__W, __M); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_min_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pminud512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __W, __M); } - -__funline __m512 _mm512_unpacklo_ps(__m512 __A, __m512 __B) { - return (__m512)__builtin_ia32_unpcklps512_mask((__v16sf)__A, (__v16sf)__B, - (__v16sf)_mm512_undefined_ps(), - (__mmask16)-1); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_unpacklo_ps (__m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_unpcklps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1); } - -__funline __m512 _mm512_mask_unpacklo_ps(__m512 __W, __mmask16 __U, __m512 __A, - __m512 __B) { - return (__m512)__builtin_ia32_unpcklps512_mask((__v16sf)__A, (__v16sf)__B, - (__v16sf)__W, (__mmask16)__U); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_unpacklo_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_unpcklps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U); } - -__funline __m512 _mm512_maskz_unpacklo_ps(__mmask16 __U, __m512 __A, __m512 __B) { - return (__m512)__builtin_ia32_unpcklps512_mask( - (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U); +extern __inline __m512 +__attribute__ 
+_mm512_maskz_unpacklo_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_unpcklps512_mask ((__v16sf) __A, (__v16sf) __B,
+      (__v16sf) _mm512_setzero_ps (), (__mmask16) __U);
 }
-
 #ifdef __OPTIMIZE__
-__funline __m128d _mm_max_round_sd(__m128d __A, __m128d __B, const int __R) {
-  return (__m128d)__builtin_ia32_maxsd_round((__v2df)__A, (__v2df)__B, __R);
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_round_sd (__m128d __A, __m128d __B, const int __R)
+{
+  return (__m128d) __builtin_ia32_maxsd_round ((__v2df) __A, (__v2df) __B,
+      __R);
 }
-
-__funline __m128d _mm_mask_max_round_sd(__m128d __W, __mmask8 __U, __m128d __A,
-        __m128d __B, const int __R) {
-  return (__m128d)__builtin_ia32_maxsd_mask_round(
-      (__v2df)__A, (__v2df)__B, (__v2df)__W, (__mmask8)__U, __R);
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_max_round_sd (__m128d __W, __mmask8 __U, __m128d __A,
+        __m128d __B, const int __R)
+{
+  return (__m128d) __builtin_ia32_maxsd_mask_round ((__v2df) __A, (__v2df) __B,
+      (__v2df) __W, (__mmask8) __U, __R);
 }
-
-__funline __m128d _mm_maskz_max_round_sd(__mmask8 __U, __m128d __A, __m128d __B,
-        const int __R) {
-  return (__m128d)__builtin_ia32_maxsd_mask_round(
-      (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U, __R);
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_max_round_sd (__mmask8 __U, __m128d __A, __m128d __B,
+        const int __R)
+{
+  return (__m128d) __builtin_ia32_maxsd_mask_round ((__v2df) __A, (__v2df) __B,
+      (__v2df) _mm_setzero_pd (), (__mmask8) __U, __R);
 }
-
-__funline __m128 _mm_max_round_ss(__m128 __A, __m128 __B, const int __R) {
-  return (__m128)__builtin_ia32_maxss_round((__v4sf)__A, (__v4sf)__B, __R);
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_round_ss (__m128 __A, __m128 __B, const int __R)
+{
+  return (__m128) __builtin_ia32_maxss_round ((__v4sf) __A, (__v4sf) __B,
+      __R);
 }
-
-__funline __m128 _mm_mask_max_round_ss(__m128 __W, __mmask8 __U, __m128 __A,
-        __m128 __B, const int __R) {
-  return (__m128)__builtin_ia32_maxss_mask_round(
-      (__v4sf)__A, (__v4sf)__B, (__v4sf)__W, (__mmask8)__U, __R);
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_max_round_ss (__m128 __W, __mmask8 __U, __m128 __A,
+        __m128 __B, const int __R)
+{
+  return (__m128) __builtin_ia32_maxss_mask_round ((__v4sf) __A, (__v4sf) __B,
+      (__v4sf) __W, (__mmask8) __U, __R);
 }
-
-__funline __m128 _mm_maskz_max_round_ss(__mmask8 __U, __m128 __A, __m128 __B,
-        const int __R) {
-  return (__m128)__builtin_ia32_maxss_mask_round(
-      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U, __R);
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_max_round_ss (__mmask8 __U, __m128 __A, __m128 __B,
+        const int __R)
+{
+  return (__m128) __builtin_ia32_maxss_mask_round ((__v4sf) __A, (__v4sf) __B,
+      (__v4sf) _mm_setzero_ps (), (__mmask8) __U, __R);
 }
-
-__funline __m128d _mm_min_round_sd(__m128d __A, __m128d __B, const int __R) {
-  return (__m128d)__builtin_ia32_minsd_round((__v2df)__A, (__v2df)__B, __R);
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_round_sd (__m128d __A, __m128d __B, const int __R)
+{
+  return (__m128d) __builtin_ia32_minsd_round ((__v2df) __A, (__v2df) __B,
+      __R);
 }
-
-__funline __m128d _mm_mask_min_round_sd(__m128d __W, __mmask8 __U, __m128d __A,
-        __m128d __B, const int __R) {
-  return (__m128d)__builtin_ia32_minsd_mask_round(
-      (__v2df)__A, (__v2df)__B, (__v2df)__W, (__mmask8)__U, __R);
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_min_round_sd (__m128d __W, __mmask8 __U, __m128d __A,
+        __m128d __B, const int __R)
+{
+  return (__m128d) __builtin_ia32_minsd_mask_round ((__v2df) __A, (__v2df) __B,
+      (__v2df) __W, (__mmask8) __U, __R);
 }
-
-__funline __m128d _mm_maskz_min_round_sd(__mmask8 __U, __m128d __A, __m128d __B,
-        const int __R) {
-  return (__m128d)__builtin_ia32_minsd_mask_round(
-      (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U, __R);
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_min_round_sd (__mmask8 __U, __m128d __A, __m128d __B,
+        const int __R)
+{
+  return (__m128d) __builtin_ia32_minsd_mask_round ((__v2df) __A, (__v2df) __B,
+      (__v2df) _mm_setzero_pd (), (__mmask8) __U, __R);
 }
-
-__funline __m128 _mm_min_round_ss(__m128 __A, __m128 __B, const int __R) {
-  return (__m128)__builtin_ia32_minss_round((__v4sf)__A, (__v4sf)__B, __R);
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_round_ss (__m128 __A, __m128 __B, const int __R)
+{
+  return (__m128) __builtin_ia32_minss_round ((__v4sf) __A, (__v4sf) __B,
+      __R);
 }
-
-__funline __m128 _mm_mask_min_round_ss(__m128 __W, __mmask8 __U, __m128 __A,
-        __m128 __B, const int __R) {
-  return (__m128)__builtin_ia32_minss_mask_round(
-      (__v4sf)__A, (__v4sf)__B, (__v4sf)__W, (__mmask8)__U, __R);
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_min_round_ss (__m128 __W, __mmask8 __U, __m128 __A,
+        __m128 __B, const int __R)
+{
+  return (__m128) __builtin_ia32_minss_mask_round ((__v4sf) __A, (__v4sf) __B,
+      (__v4sf) __W, (__mmask8) __U, __R);
 }
-
-__funline __m128 _mm_maskz_min_round_ss(__mmask8 __U, __m128 __A, __m128 __B,
-        const int __R) {
-  return (__m128)__builtin_ia32_minss_mask_round(
-      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U, __R);
-}
-
-#else
-#define _mm_max_round_sd(A, B, C) (__m128d) __builtin_ia32_maxsd_round(A, B, C)
-
-#define _mm_mask_max_round_sd(W, U, A, B, C) \
-  (__m128d) __builtin_ia32_maxsd_mask_round(A, B, W, U, C)
-
-#define _mm_maskz_max_round_sd(U, A, B, C) \
-  (__m128d) \
-  __builtin_ia32_maxsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C)
-
-#define _mm_max_round_ss(A, B, C) (__m128) __builtin_ia32_maxss_round(A, B, C)
-
-#define _mm_mask_max_round_ss(W, U, A, B, C) \
-  (__m128) __builtin_ia32_maxss_mask_round(A, B, W, U, C)
-
-#define _mm_maskz_max_round_ss(U, A, B, C) \
-  (__m128) __builtin_ia32_maxss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C)
-
-#define _mm_min_round_sd(A, B, C) (__m128d) __builtin_ia32_minsd_round(A, B, C)
-
-#define _mm_mask_min_round_sd(W, U, A, B, C) \
-  (__m128d) __builtin_ia32_minsd_mask_round(A, B, W, U, C)
-
-#define _mm_maskz_min_round_sd(U, A, B, C) \
-  (__m128d) \
-  __builtin_ia32_minsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C)
-
-#define _mm_min_round_ss(A, B, C) (__m128) __builtin_ia32_minss_round(A, B, C)
-
-#define _mm_mask_min_round_ss(W, U, A, B, C) \
-  (__m128) __builtin_ia32_minss_mask_round(A, B, W, U, C)
-
-#define _mm_maskz_min_round_ss(U, A, B, C) \
-  (__m128) __builtin_ia32_minss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C)
-
-#endif
-
-__funline __m512d _mm512_mask_blend_pd(__mmask8 __U, __m512d __A, __m512d __W) {
-  return (__m512d)__builtin_ia32_blendmpd_512_mask((__v8df)__A, (__v8df)__W,
-      (__mmask8)__U);
-}
-
-__funline __m512 _mm512_mask_blend_ps(__mmask16 __U, __m512 __A, __m512 __W) {
-  return (__m512)__builtin_ia32_blendmps_512_mask((__v16sf)__A, (__v16sf)__W,
-      (__mmask16)__U);
-}
-
-__funline __m512i _mm512_mask_blend_epi64(__mmask8 __U, __m512i __A,
-        __m512i __W) {
-  return (__m512i)__builtin_ia32_blendmq_512_mask((__v8di)__A, (__v8di)__W,
-      (__mmask8)__U);
-}
-
-__funline __m512i _mm512_mask_blend_epi32(__mmask16 __U, __m512i __A,
-        __m512i __W) {
-  return (__m512i)__builtin_ia32_blendmd_512_mask((__v16si)__A, (__v16si)__W,
-      (__mmask16)__U);
-}
-
-#ifdef __OPTIMIZE__
-__funline __m128d _mm_fmadd_round_sd(__m128d __W, __m128d __A, __m128d __B,
-        const int __R) {
-  return (__m128d)__builtin_ia32_vfmaddsd3_round((__v2df)__W, (__v2df)__A,
-      (__v2df)__B, __R);
-}
-
-__funline __m128 _mm_fmadd_round_ss(__m128 __W, __m128 __A, __m128 __B,
-        const int __R) {
-  return (__m128)__builtin_ia32_vfmaddss3_round((__v4sf)__W, (__v4sf)__A,
-      (__v4sf)__B, __R);
-}
-
-__funline __m128d _mm_fmsub_round_sd(__m128d __W, __m128d __A, __m128d __B,
-        const int __R) {
-  return (__m128d)__builtin_ia32_vfmaddsd3_round((__v2df)__W, (__v2df)__A,
-      -(__v2df)__B, __R);
-}
-
-__funline __m128 _mm_fmsub_round_ss(__m128 __W, __m128 __A, __m128 __B,
-        const int __R) {
-  return (__m128)__builtin_ia32_vfmaddss3_round((__v4sf)__W, (__v4sf)__A,
-      -(__v4sf)__B, __R);
-}
-
-__funline __m128d _mm_fnmadd_round_sd(__m128d __W, __m128d __A, __m128d __B,
-        const int __R) {
-  return (__m128d)__builtin_ia32_vfmaddsd3_round((__v2df)__W, -(__v2df)__A,
-      (__v2df)__B, __R);
-}
-
-__funline __m128 _mm_fnmadd_round_ss(__m128 __W, __m128 __A, __m128 __B,
-        const int __R) {
-  return (__m128)__builtin_ia32_vfmaddss3_round((__v4sf)__W, -(__v4sf)__A,
-      (__v4sf)__B, __R);
-}
-
-__funline __m128d _mm_fnmsub_round_sd(__m128d __W, __m128d __A, __m128d __B,
-        const int __R) {
-  return (__m128d)__builtin_ia32_vfmaddsd3_round((__v2df)__W, -(__v2df)__A,
-      -(__v2df)__B, __R);
-}
-
-__funline __m128 _mm_fnmsub_round_ss(__m128 __W, __m128 __A, __m128 __B,
-        const int __R) {
-  return (__m128)__builtin_ia32_vfmaddss3_round((__v4sf)__W, -(__v4sf)__A,
-      -(__v4sf)__B, __R);
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_min_round_ss (__mmask8 __U, __m128 __A, __m128 __B,
+        const int __R)
+{
+  return (__m128) __builtin_ia32_minss_mask_round ((__v4sf) __A, (__v4sf) __B,
+      (__v4sf) _mm_setzero_ps (), (__mmask8) __U, __R);
 }
 #else
-#define _mm_fmadd_round_sd(A, B, C, R) \
-  (__m128d) __builtin_ia32_vfmaddsd3_round(A, B, C, R)
-
-#define _mm_fmadd_round_ss(A, B, C, R) \
-  (__m128) __builtin_ia32_vfmaddss3_round(A, B, C, R)
-
-#define _mm_fmsub_round_sd(A, B, C, R) \
-  (__m128d) __builtin_ia32_vfmaddsd3_round(A, B, -(C), R)
-
-#define _mm_fmsub_round_ss(A, B, C, R) \
-  (__m128) __builtin_ia32_vfmaddss3_round(A, B, -(C), R)
-
-#define _mm_fnmadd_round_sd(A, B, C, R) \
-  (__m128d) __builtin_ia32_vfmaddsd3_round(A, -(B), C, R)
-
-#define _mm_fnmadd_round_ss(A, B, C, R) \
-  (__m128) __builtin_ia32_vfmaddss3_round(A, -(B), C, R)
-
-#define _mm_fnmsub_round_sd(A, B, C, R) \
-  (__m128d) __builtin_ia32_vfmaddsd3_round(A, -(B), -(C), R)
-
-#define _mm_fnmsub_round_ss(A, B, C, R) \
-  (__m128) __builtin_ia32_vfmaddss3_round(A, -(B), -(C), R)
+#define _mm_max_round_sd(A, B, C) (__m128d)__builtin_ia32_maxsd_round(A, B, C)
+#define _mm_mask_max_round_sd(W, U, A, B, C) (__m128d)__builtin_ia32_maxsd_mask_round(A, B, W, U, C)
+#define _mm_maskz_max_round_sd(U, A, B, C) (__m128d)__builtin_ia32_maxsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C)
+#define _mm_max_round_ss(A, B, C) (__m128)__builtin_ia32_maxss_round(A, B, C)
+#define _mm_mask_max_round_ss(W, U, A, B, C) (__m128)__builtin_ia32_maxss_mask_round(A, B, W, U, C)
+#define _mm_maskz_max_round_ss(U, A, B, C) (__m128)__builtin_ia32_maxss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C)
+#define _mm_min_round_sd(A, B, C) (__m128d)__builtin_ia32_minsd_round(A, B, C)
+#define _mm_mask_min_round_sd(W, U, A, B, C) (__m128d)__builtin_ia32_minsd_mask_round(A, B, W, U, C)
+#define _mm_maskz_min_round_sd(U, A, B, C) (__m128d)__builtin_ia32_minsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C)
+#define _mm_min_round_ss(A, B, C) (__m128)__builtin_ia32_minss_round(A, B, C)
+#define _mm_mask_min_round_ss(W, U, A, B, C) (__m128)__builtin_ia32_minss_mask_round(A, B, W, U, C)
+#define _mm_maskz_min_round_ss(U, A, B, C) (__m128)__builtin_ia32_minss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C)
 #endif
-
-__funline __m128d _mm_mask_fmadd_sd(__m128d __W, __mmask8 __U, __m128d __A,
-        __m128d __B) {
-  return (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)__W, (__v2df)__A,
-      (__v2df)__B, (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_blend_pd (__mmask8 __U, __m512d __A, __m512d __W)
+{
+  return (__m512d) __builtin_ia32_blendmpd_512_mask ((__v8df) __A,
+      (__v8df) __W, (__mmask8) __U);
 }
-
-__funline __m128 _mm_mask_fmadd_ss(__m128 __W, __mmask8 __U, __m128 __A,
-        __m128 __B) {
-  return (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)__W, (__v4sf)__A,
-      (__v4sf)__B, (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_blend_ps (__mmask16 __U, __m512 __A, __m512 __W)
+{
+  return (__m512) __builtin_ia32_blendmps_512_mask ((__v16sf) __A,
+      (__v16sf) __W, (__mmask16) __U);
 }
-
-__funline __m128d _mm_mask3_fmadd_sd(__m128d __W, __m128d __A, __m128d __B,
-        __mmask8 __U) {
-  return (__m128d)__builtin_ia32_vfmaddsd3_mask3((__v2df)__W, (__v2df)__A,
-      (__v2df)__B, (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_blend_epi64 (__mmask8 __U, __m512i __A, __m512i __W)
+{
+  return (__m512i) __builtin_ia32_blendmq_512_mask ((__v8di) __A,
+      (__v8di) __W, (__mmask8) __U);
 }
-
-__funline __m128 _mm_mask3_fmadd_ss(__m128 __W, __m128 __A, __m128 __B,
-        __mmask8 __U) {
-  return (__m128)__builtin_ia32_vfmaddss3_mask3((__v4sf)__W, (__v4sf)__A,
-      (__v4sf)__B, (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_blend_epi32 (__mmask16 __U, __m512i __A, __m512i __W)
+{
+  return (__m512i) __builtin_ia32_blendmd_512_mask ((__v16si) __A,
+      (__v16si) __W, (__mmask16) __U);
 }
-
-__funline __m128d _mm_maskz_fmadd_sd(__mmask8 __U, __m128d __W, __m128d __A,
-        __m128d __B) {
-  return (__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)__W, (__v2df)__A,
-      (__v2df)__B, (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m128 _mm_maskz_fmadd_ss(__mmask8 __U, __m128 __W, __m128 __A,
-        __m128 __B) {
-  return (__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)__W, (__v4sf)__A,
-      (__v4sf)__B, (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m128d _mm_mask_fmsub_sd(__m128d __W, __mmask8 __U, __m128d __A,
-        __m128d __B) {
-  return (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)__W, (__v2df)__A,
-      -(__v2df)__B, (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m128 _mm_mask_fmsub_ss(__m128 __W, __mmask8 __U, __m128 __A,
-        __m128 __B) {
-  return (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)__W, (__v4sf)__A,
-      -(__v4sf)__B, (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m128d _mm_mask3_fmsub_sd(__m128d __W, __m128d __A, __m128d __B,
-        __mmask8 __U) {
-  return (__m128d)__builtin_ia32_vfmsubsd3_mask3((__v2df)__W, (__v2df)__A,
-      (__v2df)__B, (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m128 _mm_mask3_fmsub_ss(__m128 __W, __m128 __A, __m128 __B,
-        __mmask8 __U) {
-  return (__m128)__builtin_ia32_vfmsubss3_mask3((__v4sf)__W, (__v4sf)__A,
-      (__v4sf)__B, (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m128d _mm_maskz_fmsub_sd(__mmask8 __U, __m128d __W, __m128d __A,
-        __m128d __B) {
-  return (__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)__W, (__v2df)__A,
-      -(__v2df)__B, (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m128 _mm_maskz_fmsub_ss(__mmask8 __U, __m128 __W, __m128 __A,
-        __m128 __B) {
-  return (__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)__W, (__v4sf)__A,
-      -(__v4sf)__B, (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m128d _mm_mask_fnmadd_sd(__m128d __W, __mmask8 __U, __m128d __A,
-        __m128d __B) {
-  return (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)__W, -(__v2df)__A,
-      (__v2df)__B, (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m128 _mm_mask_fnmadd_ss(__m128 __W, __mmask8 __U, __m128 __A,
-        __m128 __B) {
-  return (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)__W, -(__v4sf)__A,
-      (__v4sf)__B, (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m128d _mm_mask3_fnmadd_sd(__m128d __W, __m128d __A, __m128d __B,
-        __mmask8 __U) {
-  return (__m128d)__builtin_ia32_vfmaddsd3_mask3((__v2df)__W, -(__v2df)__A,
-      (__v2df)__B, (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m128 _mm_mask3_fnmadd_ss(__m128 __W, __m128 __A, __m128 __B,
-        __mmask8 __U) {
-  return (__m128)__builtin_ia32_vfmaddss3_mask3((__v4sf)__W, -(__v4sf)__A,
-      (__v4sf)__B, (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m128d _mm_maskz_fnmadd_sd(__mmask8 __U, __m128d __W, __m128d __A,
-        __m128d __B) {
-  return (__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)__W, -(__v2df)__A,
-      (__v2df)__B, (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m128 _mm_maskz_fnmadd_ss(__mmask8 __U, __m128 __W, __m128 __A,
-        __m128 __B) {
-  return (__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)__W, -(__v4sf)__A,
-      (__v4sf)__B, (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m128d _mm_mask_fnmsub_sd(__m128d __W, __mmask8 __U, __m128d __A,
-        __m128d __B) {
-  return (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)__W, -(__v2df)__A,
-      -(__v2df)__B, (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m128 _mm_mask_fnmsub_ss(__m128 __W, __mmask8 __U, __m128 __A,
-        __m128 __B) {
-  return (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)__W, -(__v4sf)__A,
-      -(__v4sf)__B, (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m128d _mm_mask3_fnmsub_sd(__m128d __W, __m128d __A, __m128d __B,
-        __mmask8 __U) {
-  return (__m128d)__builtin_ia32_vfmsubsd3_mask3((__v2df)__W, -(__v2df)__A,
-      (__v2df)__B, (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m128 _mm_mask3_fnmsub_ss(__m128 __W, __m128 __A, __m128 __B,
-        __mmask8 __U) {
-  return (__m128)__builtin_ia32_vfmsubss3_mask3((__v4sf)__W, -(__v4sf)__A,
-      (__v4sf)__B, (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m128d _mm_maskz_fnmsub_sd(__mmask8 __U, __m128d __W, __m128d __A,
-        __m128d __B) {
-  return (__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)__W, -(__v2df)__A,
-      -(__v2df)__B, (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m128 _mm_maskz_fnmsub_ss(__mmask8 __U, __m128 __W, __m128 __A,
-        __m128 __B) {
-  return (__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)__W, -(__v4sf)__A,
-      -(__v4sf)__B, (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
 #ifdef __OPTIMIZE__
-__funline __m128d _mm_mask_fmadd_round_sd(__m128d __W, __mmask8 __U, __m128d __A,
-        __m128d __B, const int __R) {
-  return (__m128d)__builtin_ia32_vfmaddsd3_mask(
-      (__v2df)__W, (__v2df)__A, (__v2df)__B, (__mmask8)__U, __R);
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmadd_round_sd (__m128d __W, __m128d __A, __m128d __B, const int __R)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd3_round ((__v2df) __W, (__v2df) __A,
+      (__v2df) __B, __R);
 }
-
-__funline __m128 _mm_mask_fmadd_round_ss(__m128 __W, __mmask8 __U, __m128 __A,
-        __m128 __B, const int __R) {
-  return (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)__W, (__v4sf)__A,
-      (__v4sf)__B, (__mmask8)__U, __R);
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmadd_round_ss (__m128 __W, __m128 __A, __m128 __B, const int __R)
+{
+  return (__m128) __builtin_ia32_vfmaddss3_round ((__v4sf) __W, (__v4sf) __A,
+      (__v4sf) __B, __R);
 }
-
-__funline __m128d _mm_mask3_fmadd_round_sd(__m128d __W, __m128d __A, __m128d __B,
-        __mmask8 __U, const int __R) {
-  return (__m128d)__builtin_ia32_vfmaddsd3_mask3(
-      (__v2df)__W, (__v2df)__A, (__v2df)__B, (__mmask8)__U, __R);
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmsub_round_sd (__m128d __W, __m128d __A, __m128d __B, const int __R)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd3_round ((__v2df) __W, (__v2df) __A,
+      -(__v2df) __B, __R);
 }
-
-__funline __m128 _mm_mask3_fmadd_round_ss(__m128 __W, __m128 __A, __m128 __B,
-        __mmask8 __U, const int __R) {
-  return (__m128)__builtin_ia32_vfmaddss3_mask3(
-      (__v4sf)__W, (__v4sf)__A, (__v4sf)__B, (__mmask8)__U, __R);
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmsub_round_ss (__m128 __W, __m128 __A, __m128 __B, const int __R)
+{
+  return (__m128) __builtin_ia32_vfmaddss3_round ((__v4sf) __W, (__v4sf) __A,
+      -(__v4sf) __B, __R);
 }
-
-__funline __m128d _mm_maskz_fmadd_round_sd(__mmask8 __U, __m128d __W, __m128d __A,
-        __m128d __B, const int __R) {
-  return (__m128d)__builtin_ia32_vfmaddsd3_maskz(
-      (__v2df)__W, (__v2df)__A, (__v2df)__B, (__mmask8)__U, __R);
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fnmadd_round_sd (__m128d __W, __m128d __A, __m128d __B, const int __R)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd3_round ((__v2df) __W, -(__v2df) __A,
+      (__v2df) __B, __R);
 }
-
-__funline __m128 _mm_maskz_fmadd_round_ss(__mmask8 __U, __m128 __W, __m128 __A,
-        __m128 __B, const int __R) {
-  return (__m128)__builtin_ia32_vfmaddss3_maskz(
-      (__v4sf)__W, (__v4sf)__A, (__v4sf)__B, (__mmask8)__U, __R);
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fnmadd_round_ss (__m128 __W, __m128 __A, __m128 __B, const int __R)
+{
+  return (__m128) __builtin_ia32_vfmaddss3_round ((__v4sf) __W, -(__v4sf) __A,
+      (__v4sf) __B, __R);
 }
-
-__funline __m128d _mm_mask_fmsub_round_sd(__m128d __W, __mmask8 __U, __m128d __A,
-        __m128d __B, const int __R) {
-  return (__m128d)__builtin_ia32_vfmaddsd3_mask(
-      (__v2df)__W, (__v2df)__A, -(__v2df)__B, (__mmask8)__U, __R);
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fnmsub_round_sd (__m128d __W, __m128d __A, __m128d __B, const int __R)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd3_round ((__v2df) __W, -(__v2df) __A,
+      -(__v2df) __B, __R);
 }
-
-__funline __m128 _mm_mask_fmsub_round_ss(__m128 __W, __mmask8 __U, __m128 __A,
-        __m128 __B, const int __R) {
-  return (__m128)__builtin_ia32_vfmaddss3_mask(
-      (__v4sf)__W, (__v4sf)__A, -(__v4sf)__B, (__mmask8)__U, __R);
-}
-
-__funline __m128d _mm_mask3_fmsub_round_sd(__m128d __W, __m128d __A, __m128d __B,
-        __mmask8 __U, const int __R) {
-  return (__m128d)__builtin_ia32_vfmsubsd3_mask3(
-      (__v2df)__W, (__v2df)__A, (__v2df)__B, (__mmask8)__U, __R);
-}
-
-__funline __m128 _mm_mask3_fmsub_round_ss(__m128 __W, __m128 __A, __m128 __B,
-        __mmask8 __U, const int __R) {
-  return (__m128)__builtin_ia32_vfmsubss3_mask3(
-      (__v4sf)__W, (__v4sf)__A, (__v4sf)__B, (__mmask8)__U, __R);
-}
-
-__funline __m128d _mm_maskz_fmsub_round_sd(__mmask8 __U, __m128d __W, __m128d __A,
-        __m128d __B, const int __R) {
-  return (__m128d)__builtin_ia32_vfmaddsd3_maskz(
-      (__v2df)__W, (__v2df)__A, -(__v2df)__B, (__mmask8)__U, __R);
-}
-
-__funline __m128 _mm_maskz_fmsub_round_ss(__mmask8 __U, __m128 __W, __m128 __A,
-        __m128 __B, const int __R) {
-  return (__m128)__builtin_ia32_vfmaddss3_maskz(
-      (__v4sf)__W, (__v4sf)__A, -(__v4sf)__B, (__mmask8)__U, __R);
-}
-
-__funline __m128d _mm_mask_fnmadd_round_sd(__m128d __W, __mmask8 __U, __m128d __A,
-        __m128d __B, const int __R) {
-  return (__m128d)__builtin_ia32_vfmaddsd3_mask(
-      (__v2df)__W, -(__v2df)__A, (__v2df)__B, (__mmask8)__U, __R);
-}
-
-__funline __m128 _mm_mask_fnmadd_round_ss(__m128 __W, __mmask8 __U, __m128 __A,
-        __m128 __B, const int __R) {
-  return (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)__W, -(__v4sf)__A,
-      (__v4sf)__B, (__mmask8)__U, __R);
-}
-
-__funline __m128d _mm_mask3_fnmadd_round_sd(__m128d __W, __m128d __A, __m128d __B,
-        __mmask8 __U, const int __R) {
-  return (__m128d)__builtin_ia32_vfmaddsd3_mask3(
-      (__v2df)__W, -(__v2df)__A, (__v2df)__B, (__mmask8)__U, __R);
-}
-
-__funline __m128 _mm_mask3_fnmadd_round_ss(__m128 __W, __m128 __A, __m128 __B,
-        __mmask8 __U, const int __R) {
-  return (__m128)__builtin_ia32_vfmaddss3_mask3(
-      (__v4sf)__W, -(__v4sf)__A, (__v4sf)__B, (__mmask8)__U, __R);
-}
-
-__funline __m128d _mm_maskz_fnmadd_round_sd(__mmask8 __U, __m128d __W,
-        __m128d __A, __m128d __B,
-        const int __R) {
-  return (__m128d)__builtin_ia32_vfmaddsd3_maskz(
-      (__v2df)__W, -(__v2df)__A, (__v2df)__B, (__mmask8)__U, __R);
-}
-
-__funline __m128 _mm_maskz_fnmadd_round_ss(__mmask8 __U, __m128 __W, __m128 __A,
-        __m128 __B, const int __R) {
-  return (__m128)__builtin_ia32_vfmaddss3_maskz(
-      (__v4sf)__W, -(__v4sf)__A, (__v4sf)__B, (__mmask8)__U, __R);
-}
-
-__funline __m128d _mm_mask_fnmsub_round_sd(__m128d __W, __mmask8 __U, __m128d __A,
-        __m128d __B, const int __R) {
-  return (__m128d)__builtin_ia32_vfmaddsd3_mask(
-      (__v2df)__W, -(__v2df)__A, -(__v2df)__B, (__mmask8)__U, __R);
-}
-
-__funline __m128 _mm_mask_fnmsub_round_ss(__m128 __W, __mmask8 __U, __m128 __A,
-        __m128 __B, const int __R) {
-  return (__m128)__builtin_ia32_vfmaddss3_mask(
-      (__v4sf)__W, -(__v4sf)__A, -(__v4sf)__B, (__mmask8)__U, __R);
-}
-
-__funline __m128d _mm_mask3_fnmsub_round_sd(__m128d __W, __m128d __A, __m128d __B,
-        __mmask8 __U, const int __R) {
-  return (__m128d)__builtin_ia32_vfmsubsd3_mask3(
-      (__v2df)__W, -(__v2df)__A, (__v2df)__B, (__mmask8)__U, __R);
-}
-
-__funline __m128 _mm_mask3_fnmsub_round_ss(__m128 __W, __m128 __A, __m128 __B,
-        __mmask8 __U, const int __R) {
-  return (__m128)__builtin_ia32_vfmsubss3_mask3(
-      (__v4sf)__W, -(__v4sf)__A, (__v4sf)__B, (__mmask8)__U, __R);
-}
-
-__funline __m128d _mm_maskz_fnmsub_round_sd(__mmask8 __U, __m128d __W,
-        __m128d __A, __m128d __B,
-        const int __R) {
-  return (__m128d)__builtin_ia32_vfmaddsd3_maskz(
-      (__v2df)__W, -(__v2df)__A, -(__v2df)__B, (__mmask8)__U, __R);
-}
-
-__funline __m128 _mm_maskz_fnmsub_round_ss(__mmask8 __U, __m128 __W, __m128 __A,
-        __m128 __B, const int __R) {
-  return (__m128)__builtin_ia32_vfmaddss3_maskz(
-      (__v4sf)__W, -(__v4sf)__A, -(__v4sf)__B, (__mmask8)__U, __R);
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fnmsub_round_ss (__m128 __W, __m128 __A, __m128 __B, const int __R)
+{
+  return (__m128) __builtin_ia32_vfmaddss3_round ((__v4sf) __W, -(__v4sf) __A,
+      -(__v4sf) __B, __R);
 }
 #else
-#define _mm_mask_fmadd_round_sd(A, U, B, C, R) \
-  (__m128d) __builtin_ia32_vfmaddsd3_mask(A, B, C, U, R)
-
-#define _mm_mask_fmadd_round_ss(A, U, B, C, R) \
-  (__m128) __builtin_ia32_vfmaddss3_mask(A, B, C, U, R)
-
-#define _mm_mask3_fmadd_round_sd(A, B, C, U, R) \
-  (__m128d) __builtin_ia32_vfmaddsd3_mask3(A, B, C, U, R)
-
-#define _mm_mask3_fmadd_round_ss(A, B, C, U, R) \
-  (__m128) __builtin_ia32_vfmaddss3_mask3(A, B, C, U, R)
-
-#define _mm_maskz_fmadd_round_sd(U, A, B, C, R) \
-  (__m128d) __builtin_ia32_vfmaddsd3_maskz(A, B, C, U, R)
-
-#define _mm_maskz_fmadd_round_ss(U, A, B, C, R) \
-  (__m128) __builtin_ia32_vfmaddss3_maskz(A, B, C, U, R)
-
-#define _mm_mask_fmsub_round_sd(A, U, B, C, R) \
-  (__m128d) __builtin_ia32_vfmaddsd3_mask(A, B, -(C), U, R)
-
-#define _mm_mask_fmsub_round_ss(A, U, B, C, R) \
-  (__m128) __builtin_ia32_vfmaddss3_mask(A, B, -(C), U, R)
-
-#define _mm_mask3_fmsub_round_sd(A, B, C, U, R) \
-  (__m128d) __builtin_ia32_vfmsubsd3_mask3(A, B, C, U, R)
-
-#define _mm_mask3_fmsub_round_ss(A, B, C, U, R) \
-  (__m128) __builtin_ia32_vfmsubss3_mask3(A, B, C, U, R)
-
-#define _mm_maskz_fmsub_round_sd(U, A, B, C, R) \
-  (__m128d) __builtin_ia32_vfmaddsd3_maskz(A, B, -(C), U, R)
-
-#define _mm_maskz_fmsub_round_ss(U, A, B, C, R) \
-  (__m128) __builtin_ia32_vfmaddss3_maskz(A, B, -(C), U, R)
-
-#define _mm_mask_fnmadd_round_sd(A, U, B, C, R) \
-  (__m128d) __builtin_ia32_vfmaddsd3_mask(A, -(B), C, U, R)
-
-#define _mm_mask_fnmadd_round_ss(A, U, B, C, R) \
-  (__m128) __builtin_ia32_vfmaddss3_mask(A, -(B), C, U, R)
-
-#define _mm_mask3_fnmadd_round_sd(A, B, C, U, R) \
-  (__m128d) __builtin_ia32_vfmaddsd3_mask3(A, -(B), C, U, R)
-
-#define _mm_mask3_fnmadd_round_ss(A, B, C, U, R) \
-  (__m128) __builtin_ia32_vfmaddss3_mask3(A, -(B), C, U, R)
-
-#define _mm_maskz_fnmadd_round_sd(U, A, B, C, R) \
-  (__m128d) __builtin_ia32_vfmaddsd3_maskz(A, -(B), C, U, R)
-
-#define _mm_maskz_fnmadd_round_ss(U, A, B, C, R) \
-  (__m128) __builtin_ia32_vfmaddss3_maskz(A, -(B), C, U, R)
-
-#define _mm_mask_fnmsub_round_sd(A, U, B, C, R) \
-  (__m128d) __builtin_ia32_vfmaddsd3_mask(A, -(B), -(C), U, R)
-
-#define _mm_mask_fnmsub_round_ss(A, U, B, C, R) \
-  (__m128) __builtin_ia32_vfmaddss3_mask(A, -(B), -(C), U, R)
-
-#define _mm_mask3_fnmsub_round_sd(A, B, C, U, R) \
-  (__m128d) __builtin_ia32_vfmsubsd3_mask3(A, -(B), C, U, R)
-
-#define _mm_mask3_fnmsub_round_ss(A, B, C, U, R) \
-  (__m128) __builtin_ia32_vfmsubss3_mask3(A, -(B), C, U, R)
-
-#define _mm_maskz_fnmsub_round_sd(U, A, B, C, R) \
-  (__m128d) __builtin_ia32_vfmaddsd3_maskz(A, -(B), -(C), U, R)
-
-#define _mm_maskz_fnmsub_round_ss(U, A, B, C, R) \
-  (__m128) __builtin_ia32_vfmaddss3_maskz(A, -(B), -(C), U, R)
+#define _mm_fmadd_round_sd(A, B, C, R) (__m128d)__builtin_ia32_vfmaddsd3_round(A, B, C, R)
+#define _mm_fmadd_round_ss(A, B, C, R) (__m128)__builtin_ia32_vfmaddss3_round(A, B, C, R)
+#define _mm_fmsub_round_sd(A, B, C, R) (__m128d)__builtin_ia32_vfmaddsd3_round(A, B, -(C), R)
+#define _mm_fmsub_round_ss(A, B, C, R) (__m128)__builtin_ia32_vfmaddss3_round(A, B, -(C), R)
+#define _mm_fnmadd_round_sd(A, B, C, R) (__m128d)__builtin_ia32_vfmaddsd3_round(A, -(B), C, R)
+#define _mm_fnmadd_round_ss(A, B, C, R) (__m128)__builtin_ia32_vfmaddss3_round(A, -(B), C, R)
+#define _mm_fnmsub_round_sd(A, B, C, R) (__m128d)__builtin_ia32_vfmaddsd3_round(A, -(B), -(C), R)
+#define _mm_fnmsub_round_ss(A, B, C, R) (__m128)__builtin_ia32_vfmaddss3_round(A, -(B), -(C), R)
 #endif
-
-#ifdef __OPTIMIZE__
-__funline int _mm_comi_round_ss(__m128 __A, __m128 __B, const int __P,
-        const int __R) {
-  return __builtin_ia32_vcomiss((__v4sf)__A, (__v4sf)__B, __P, __R);
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd3_mask ((__v2df) __W, (__v2df) __A,
+      (__v2df) __B, (__mmask8) __U, _MM_FROUND_CUR_DIRECTION);
 }
-
-__funline int _mm_comi_round_sd(__m128d __A, __m128d __B, const int __P,
-        const int __R) {
-  return __builtin_ia32_vcomisd((__v2df)__A, (__v2df)__B, __P, __R);
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W, (__v4sf) __A,
+      (__v4sf) __B, (__mmask8) __U, _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fmadd_sd (__m128d __W, __m128d __A, __m128d __B, __mmask8 __U)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd3_mask3 ((__v2df) __W, (__v2df) __A,
+      (__v2df) __B, (__mmask8) __U, _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fmadd_ss (__m128 __W, __m128 __A, __m128 __B, __mmask8 __U)
+{
+  return (__m128) __builtin_ia32_vfmaddss3_mask3 ((__v4sf) __W, (__v4sf) __A,
+      (__v4sf) __B, (__mmask8) __U, _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmadd_sd (__mmask8 __U, __m128d __W, __m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd3_maskz ((__v2df) __W, (__v2df) __A,
+      (__v2df) __B, (__mmask8) __U, _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmadd_ss (__mmask8 __U, __m128 __W, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __W, (__v4sf) __A,
+      (__v4sf) __B, (__mmask8) __U, _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd3_mask ((__v2df) __W, (__v2df) __A,
+      -(__v2df) __B, (__mmask8) __U, _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W, (__v4sf) __A,
+      -(__v4sf) __B, (__mmask8) __U, _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fmsub_sd (__m128d __W, __m128d __A, __m128d __B, __mmask8 __U)
+{
+  return (__m128d) __builtin_ia32_vfmsubsd3_mask3 ((__v2df) __W, (__v2df) __A,
+      (__v2df) __B, (__mmask8) __U, _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fmsub_ss (__m128 __W, __m128 __A, __m128 __B, __mmask8 __U)
+{
+  return (__m128) __builtin_ia32_vfmsubss3_mask3 ((__v4sf) __W, (__v4sf) __A,
+      (__v4sf) __B, (__mmask8) __U, _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmsub_sd (__mmask8 __U, __m128d __W, __m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd3_maskz ((__v2df) __W, (__v2df) __A,
+      -(__v2df) __B, (__mmask8) __U, _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmsub_ss (__mmask8 __U, __m128 __W, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __W, (__v4sf) __A,
+      -(__v4sf) __B, (__mmask8) __U, _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fnmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd3_mask ((__v2df) __W, -(__v2df) __A,
+      (__v2df) __B, (__mmask8) __U, _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fnmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W, -(__v4sf) __A,
+      (__v4sf) __B, (__mmask8) __U, _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fnmadd_sd (__m128d __W, __m128d __A, __m128d __B, __mmask8 __U)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd3_mask3 ((__v2df) __W, -(__v2df) __A,
+      (__v2df) __B, (__mmask8) __U, _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fnmadd_ss (__m128 __W, __m128 __A, __m128 __B, __mmask8 __U)
+{
+  return (__m128) __builtin_ia32_vfmaddss3_mask3 ((__v4sf) __W, -(__v4sf) __A,
+      (__v4sf) __B, (__mmask8) __U, _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fnmadd_sd (__mmask8 __U, __m128d __W, __m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd3_maskz ((__v2df) __W, -(__v2df) __A,
+      (__v2df) __B, (__mmask8) __U, _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fnmadd_ss (__mmask8 __U, __m128 __W, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __W, -(__v4sf) __A,
+      (__v4sf) __B, (__mmask8) __U, _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fnmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd3_mask ((__v2df) __W, -(__v2df) __A,
+      -(__v2df) __B, (__mmask8) __U, _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fnmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W, -(__v4sf) __A,
+      -(__v4sf) __B, (__mmask8) __U, _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fnmsub_sd (__m128d __W, __m128d __A, __m128d __B, __mmask8 __U)
+{
+  return (__m128d) __builtin_ia32_vfmsubsd3_mask3 ((__v2df) __W, -(__v2df) __A,
+      (__v2df) __B, (__mmask8) __U, _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fnmsub_ss (__m128 __W, __m128 __A, __m128 __B, __mmask8 __U)
+{
+  return (__m128) __builtin_ia32_vfmsubss3_mask3 ((__v4sf) __W, -(__v4sf) __A,
+      (__v4sf) __B, (__mmask8) __U, _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fnmsub_sd (__mmask8 __U, __m128d __W, __m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd3_maskz ((__v2df) __W, -(__v2df) __A,
+      -(__v2df) __B, (__mmask8) __U, _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fnmsub_ss (__mmask8 __U, __m128 __W, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __W, -(__v4sf) __A,
+      -(__v4sf) __B, (__mmask8) __U, _MM_FROUND_CUR_DIRECTION);
+}
+#ifdef __OPTIMIZE__
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmadd_round_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B,
+        const int __R)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd3_mask ((__v2df) __W, (__v2df) __A,
+      (__v2df) __B, (__mmask8) __U, __R);
+}
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmadd_round_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B,
+        const int __R)
+{
+  return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W, (__v4sf) __A,
+      (__v4sf) __B, (__mmask8) __U, __R);
+}
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fmadd_round_sd (__m128d __W, __m128d __A, __m128d __B, __mmask8 __U,
+        const int __R)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd3_mask3 ((__v2df) __W, (__v2df) __A,
+      (__v2df) __B, (__mmask8) __U, __R);
+}
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fmadd_round_ss (__m128 __W, __m128 __A, __m128 __B, __mmask8 __U,
+        const int __R)
+{
+  return (__m128) __builtin_ia32_vfmaddss3_mask3 ((__v4sf) __W, (__v4sf) __A,
+      (__v4sf) __B, (__mmask8) __U, __R);
+}
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmadd_round_sd (__mmask8 __U, __m128d __W, __m128d __A, __m128d __B,
+        const int __R)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd3_maskz ((__v2df) __W, (__v2df) __A,
+      (__v2df) __B, (__mmask8) __U, __R);
+}
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmadd_round_ss (__mmask8 __U, __m128 __W, __m128 __A, __m128 __B,
+        const int __R)
+{
+  return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __W, (__v4sf) __A,
+      (__v4sf) __B, (__mmask8) __U, __R);
+}
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmsub_round_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B,
+        const int __R)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd3_mask ((__v2df) __W, (__v2df) __A,
+      -(__v2df) __B, (__mmask8) __U, __R);
+}
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmsub_round_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B,
+        const int __R)
+{
+  return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W, (__v4sf) __A,
+      -(__v4sf) __B, (__mmask8) __U, __R);
+}
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fmsub_round_sd (__m128d __W, __m128d __A, __m128d __B, __mmask8 __U,
+        const int __R)
+{
+  return (__m128d) __builtin_ia32_vfmsubsd3_mask3 ((__v2df) __W, (__v2df) __A,
+      (__v2df) __B, (__mmask8) __U, __R);
+}
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fmsub_round_ss (__m128 __W, __m128 __A, __m128 __B, __mmask8 __U,
+        const int __R)
+{
+  return (__m128) __builtin_ia32_vfmsubss3_mask3 ((__v4sf) __W, (__v4sf) __A,
+      (__v4sf) __B, (__mmask8) __U, __R);
+}
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmsub_round_sd (__mmask8 __U, __m128d __W, __m128d __A, __m128d __B,
+        const int __R)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd3_maskz ((__v2df) __W, (__v2df) __A,
+      -(__v2df) __B, (__mmask8) __U, __R);
+}
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmsub_round_ss (__mmask8 __U, __m128 __W, __m128 __A, __m128 __B,
+        const int __R)
+{
+  return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __W, (__v4sf) __A,
+      -(__v4sf) __B, (__mmask8) __U, __R);
+}
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fnmadd_round_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B,
+        const int __R)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd3_mask ((__v2df) __W, -(__v2df) __A,
+      (__v2df) __B, (__mmask8) __U, __R);
+}
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fnmadd_round_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B,
+        const int __R)
+{
+  return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W, -(__v4sf) __A,
+      (__v4sf) __B, (__mmask8) __U, __R);
+}
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fnmadd_round_sd (__m128d __W, __m128d __A, __m128d __B, __mmask8 __U,
+        const int __R)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd3_mask3 ((__v2df) __W, -(__v2df) __A,
+      (__v2df) __B, (__mmask8) __U, __R);
+}
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fnmadd_round_ss (__m128 __W, __m128 __A, __m128 __B, __mmask8 __U,
+        const int __R)
+{
+  return (__m128) __builtin_ia32_vfmaddss3_mask3 ((__v4sf) __W, -(__v4sf) __A,
+      (__v4sf) __B, (__mmask8) __U, __R);
+}
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fnmadd_round_sd (__mmask8 __U, __m128d __W, __m128d __A, __m128d __B,
+        const int __R)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd3_maskz ((__v2df) __W, -(__v2df) __A,
+      (__v2df) __B, (__mmask8) __U, __R);
+}
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fnmadd_round_ss (__mmask8 __U, __m128 __W, __m128 __A, __m128 __B,
+        const int __R)
+{
+  return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __W, -(__v4sf) __A,
+      (__v4sf) __B, (__mmask8) __U, __R);
+}
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fnmsub_round_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B,
+        const int __R)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd3_mask ((__v2df) __W, -(__v2df) __A,
+      -(__v2df) __B, (__mmask8) __U, __R);
+}
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fnmsub_round_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B,
+        const int __R)
+{
+  return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W, -(__v4sf) __A,
+      -(__v4sf) __B, (__mmask8) __U, __R);
+}
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fnmsub_round_sd (__m128d __W, __m128d __A, __m128d __B, __mmask8 __U,
+        const int __R)
+{
+  return (__m128d) __builtin_ia32_vfmsubsd3_mask3 ((__v2df) __W, -(__v2df) __A,
+      (__v2df) __B, (__mmask8) __U, __R);
+}
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fnmsub_round_ss (__m128 __W, __m128 __A, __m128 __B, __mmask8 __U,
+        const int __R)
+{
+  return (__m128) __builtin_ia32_vfmsubss3_mask3 ((__v4sf) __W, -(__v4sf) __A,
+      (__v4sf) __B, (__mmask8) __U, __R);
+}
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fnmsub_round_sd (__mmask8 __U, __m128d __W, __m128d __A, __m128d __B,
+        const int __R)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd3_maskz ((__v2df) __W, -(__v2df) __A,
+      -(__v2df) __B, (__mmask8) __U, __R);
+}
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fnmsub_round_ss (__mmask8 __U, __m128 __W, __m128 __A, __m128 __B,
+        const int __R)
+{
+  return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __W, -(__v4sf) __A,
+      -(__v4sf) __B, (__mmask8) __U, __R);
 }
 #else
-#define _mm_comi_round_ss(A, B, C, D) __builtin_ia32_vcomiss(A, B, C, D)
-#define _mm_comi_round_sd(A, B, C, D) __builtin_ia32_vcomisd(A, B, C, D)
+#define _mm_mask_fmadd_round_sd(A, U, B, C, R) (__m128d) __builtin_ia32_vfmaddsd3_mask (A, B, C, U, R)
+#define _mm_mask_fmadd_round_ss(A, U, B, C, R) (__m128) __builtin_ia32_vfmaddss3_mask (A, B, C, U, R)
+#define _mm_mask3_fmadd_round_sd(A, B, C, U, R) (__m128d) __builtin_ia32_vfmaddsd3_mask3 (A, B, C, U, R)
+#define _mm_mask3_fmadd_round_ss(A, B, C, U, R) (__m128) __builtin_ia32_vfmaddss3_mask3 (A, B, C, U, R)
+#define _mm_maskz_fmadd_round_sd(U, A, B, C, R) (__m128d) __builtin_ia32_vfmaddsd3_maskz (A, B, C, U, R)
+#define _mm_maskz_fmadd_round_ss(U, A, B, C, R) (__m128) __builtin_ia32_vfmaddss3_maskz (A, B, C, U, R)
+#define _mm_mask_fmsub_round_sd(A, U, B, C, R) (__m128d) __builtin_ia32_vfmaddsd3_mask (A, B, -(C), U, R)
+#define _mm_mask_fmsub_round_ss(A, U, B, C, R) (__m128) __builtin_ia32_vfmaddss3_mask (A, B, -(C), U, R)
+#define _mm_mask3_fmsub_round_sd(A, B, C, U, R) (__m128d) __builtin_ia32_vfmsubsd3_mask3 (A, B, C, U, R)
+#define _mm_mask3_fmsub_round_ss(A, B, C, U, R) (__m128) __builtin_ia32_vfmsubss3_mask3 (A, B, C, U, R)
+#define _mm_maskz_fmsub_round_sd(U, A, B, C, R) (__m128d) __builtin_ia32_vfmaddsd3_maskz (A, B, -(C), U, R)
+#define _mm_maskz_fmsub_round_ss(U, A, B, C, R) (__m128) __builtin_ia32_vfmaddss3_maskz (A, B, -(C), U, R)
+#define _mm_mask_fnmadd_round_sd(A, U, B, C, R) (__m128d) __builtin_ia32_vfmaddsd3_mask (A, -(B), C, U, R)
+#define _mm_mask_fnmadd_round_ss(A, U, B, C, R) (__m128) __builtin_ia32_vfmaddss3_mask (A, -(B), C, U, R)
+#define _mm_mask3_fnmadd_round_sd(A, B, C, U, R) (__m128d) __builtin_ia32_vfmaddsd3_mask3 (A, -(B), C, U, R)
+#define _mm_mask3_fnmadd_round_ss(A, B, C, U, R) (__m128) __builtin_ia32_vfmaddss3_mask3 (A, -(B), C, U, R)
+#define _mm_maskz_fnmadd_round_sd(U, A, B, C, R) (__m128d) __builtin_ia32_vfmaddsd3_maskz (A, -(B), C, U, R)
+#define _mm_maskz_fnmadd_round_ss(U, A, B, C, R) (__m128) __builtin_ia32_vfmaddss3_maskz (A, -(B), C, U, R)
+#define _mm_mask_fnmsub_round_sd(A, U, B, C, R) (__m128d) __builtin_ia32_vfmaddsd3_mask (A, -(B), -(C), U, R)
+#define _mm_mask_fnmsub_round_ss(A, U, B, C, R) (__m128) __builtin_ia32_vfmaddss3_mask (A, -(B), -(C), U, R)
+#define _mm_mask3_fnmsub_round_sd(A, B, C, U, R) (__m128d) __builtin_ia32_vfmsubsd3_mask3 (A, -(B), C, U, R)
+#define _mm_mask3_fnmsub_round_ss(A, B, C, U, R) (__m128) __builtin_ia32_vfmsubss3_mask3 (A, -(B), C, U, R)
+#define _mm_maskz_fnmsub_round_sd(U, A, B, C, R) (__m128d) __builtin_ia32_vfmaddsd3_maskz (A, -(B), -(C), U, R)
+#define _mm_maskz_fnmsub_round_ss(U, A, B, C, R) (__m128) __builtin_ia32_vfmaddss3_maskz (A, -(B), -(C), U, R)
 #endif
-
-__funline __m512d _mm512_sqrt_pd(__m512d __A) {
-  return (__m512d)__builtin_ia32_sqrtpd512_mask(
-      (__v8df)__A, (__v8df)_mm512_undefined_pd(), (__mmask8)-1,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m512d _mm512_mask_sqrt_pd(__m512d __W, __mmask8 __U, __m512d __A) {
-  return (__m512d)__builtin_ia32_sqrtpd512_mask(
-      (__v8df)__A, (__v8df)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m512d _mm512_maskz_sqrt_pd(__mmask8 __U, __m512d __A) {
-  return (__m512d)__builtin_ia32_sqrtpd512_mask(
-      (__v8df)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m512 _mm512_sqrt_ps(__m512 __A) {
-  return (__m512)__builtin_ia32_sqrtps512_mask(
-      (__v16sf)__A, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m512 _mm512_mask_sqrt_ps(__m512 __W, __mmask16 __U, __m512 __A) {
-  return (__m512)__builtin_ia32_sqrtps512_mask(
-      (__v16sf)__A, (__v16sf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m512 _mm512_maskz_sqrt_ps(__mmask16 __U, __m512 __A) {
-  return (__m512)__builtin_ia32_sqrtps512_mask(
-      (__v16sf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m512d _mm512_add_pd(__m512d __A, __m512d __B) {
-  return (__m512d)((__v8df)__A + (__v8df)__B);
-}
-
-__funline __m512d _mm512_mask_add_pd(__m512d __W, __mmask8 __U, __m512d __A,
-        __m512d __B) {
-  return (__m512d)__builtin_ia32_addpd512_mask((__v8df)__A, (__v8df)__B,
-      (__v8df)__W, (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m512d _mm512_maskz_add_pd(__mmask8 __U, __m512d __A, __m512d __B) {
-  return (__m512d)__builtin_ia32_addpd512_mask(
-      (__v8df)__A, (__v8df)__B, (__v8df)_mm512_setzero_pd(), (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m512 _mm512_add_ps(__m512 __A, __m512 __B) {
-  return (__m512)((__v16sf)__A + (__v16sf)__B);
-}
-
-__funline __m512 _mm512_mask_add_ps(__m512 __W, __mmask16 __U, __m512 __A,
-        __m512 __B) {
-  return (__m512)__builtin_ia32_addps512_mask((__v16sf)__A, (__v16sf)__B,
-      (__v16sf)__W, (__mmask16)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m512 _mm512_maskz_add_ps(__mmask16 __U, __m512 __A, __m512 __B) {
-  return (__m512)__builtin_ia32_addps512_mask(
-      (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m128d _mm_mask_add_sd(__m128d __W, __mmask8 __U, __m128d __A,
-        __m128d __B) {
-  return (__m128d)__builtin_ia32_addsd_mask_round((__v2df)__A, (__v2df)__B,
-      (__v2df)__W, (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m128d _mm_maskz_add_sd(__mmask8 __U, __m128d __A, __m128d __B) {
-  return (__m128d)__builtin_ia32_addsd_mask_round(
-      (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m128 _mm_mask_add_ss(__m128 __W, __mmask8 __U, __m128 __A,
-        __m128 __B) {
-  return (__m128)__builtin_ia32_addss_mask_round((__v4sf)__A, (__v4sf)__B,
-      (__v4sf)__W, (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m128 _mm_maskz_add_ss(__mmask8 __U, __m128 __A, __m128 __B) {
-  return (__m128)__builtin_ia32_addss_mask_round(
-      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m512d _mm512_sub_pd(__m512d __A, __m512d __B) {
-  return (__m512d)((__v8df)__A - (__v8df)__B);
-}
-
-__funline __m512d _mm512_mask_sub_pd(__m512d __W, __mmask8 __U, __m512d __A,
-        __m512d __B) {
-  return (__m512d)__builtin_ia32_subpd512_mask((__v8df)__A, (__v8df)__B,
-      (__v8df)__W, (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m512d _mm512_maskz_sub_pd(__mmask8 __U, __m512d __A, __m512d __B) {
-  return (__m512d)__builtin_ia32_subpd512_mask(
-      (__v8df)__A, (__v8df)__B, (__v8df)_mm512_setzero_pd(), (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m512 _mm512_sub_ps(__m512 __A, __m512 __B) {
-  return (__m512)((__v16sf)__A - (__v16sf)__B);
-}
-
-__funline __m512 _mm512_mask_sub_ps(__m512 __W, __mmask16 __U, __m512 __A,
-        __m512 __B) {
-  return (__m512)__builtin_ia32_subps512_mask((__v16sf)__A, (__v16sf)__B,
-      (__v16sf)__W, (__mmask16)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m512 _mm512_maskz_sub_ps(__mmask16 __U, __m512 __A, __m512 __B) {
-  return (__m512)__builtin_ia32_subps512_mask(
-      (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m128d _mm_mask_sub_sd(__m128d __W, __mmask8 __U, __m128d __A,
-        __m128d __B) {
-  return (__m128d)__builtin_ia32_subsd_mask_round((__v2df)__A, (__v2df)__B,
-      (__v2df)__W, (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m128d _mm_maskz_sub_sd(__mmask8 __U, __m128d __A, __m128d __B) {
-  return (__m128d)__builtin_ia32_subsd_mask_round(
-      (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m128 _mm_mask_sub_ss(__m128 __W, __mmask8 __U, __m128 __A,
-        __m128 __B) {
-  return (__m128)__builtin_ia32_subss_mask_round((__v4sf)__A, (__v4sf)__B,
-      (__v4sf)__W, (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m128 _mm_maskz_sub_ss(__mmask8 __U, __m128 __A, __m128 __B) {
-  return (__m128)__builtin_ia32_subss_mask_round(
-      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m512d _mm512_mul_pd(__m512d __A, __m512d __B) {
-  return (__m512d)((__v8df)__A * (__v8df)__B);
-}
-
-__funline __m512d _mm512_mask_mul_pd(__m512d __W, __mmask8 __U, __m512d __A,
-        __m512d __B) {
-  return (__m512d)__builtin_ia32_mulpd512_mask((__v8df)__A, (__v8df)__B,
-      (__v8df)__W, (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m512d _mm512_maskz_mul_pd(__mmask8 __U, __m512d __A, __m512d __B) {
-  return (__m512d)__builtin_ia32_mulpd512_mask(
-      (__v8df)__A, (__v8df)__B, (__v8df)_mm512_setzero_pd(), (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m512 _mm512_mul_ps(__m512 __A, __m512 __B) {
-  return (__m512)((__v16sf)__A * (__v16sf)__B);
-}
-
-__funline __m512 _mm512_mask_mul_ps(__m512 __W, __mmask16 __U, __m512 __A,
-        __m512 __B) {
-  return (__m512)__builtin_ia32_mulps512_mask((__v16sf)__A, (__v16sf)__B,
-      (__v16sf)__W, (__mmask16)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m512 _mm512_maskz_mul_ps(__mmask16 __U, __m512 __A, __m512 __B) {
-  return (__m512)__builtin_ia32_mulps512_mask(
-      (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m128d _mm_mask_mul_sd(__m128d __W, __mmask8 __U, __m128d __A,
-        __m128d __B) {
-  return (__m128d)__builtin_ia32_mulsd_mask_round((__v2df)__A, (__v2df)__B,
-      (__v2df)__W, (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m128d _mm_maskz_mul_sd(__mmask8 __U, __m128d __A, __m128d __B) {
-  return (__m128d)__builtin_ia32_mulsd_mask_round(
-      (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m128 _mm_mask_mul_ss(__m128 __W, __mmask8 __U, __m128 __A,
-        __m128 __B) {
-  return (__m128)__builtin_ia32_mulss_mask_round((__v4sf)__A, (__v4sf)__B,
-      (__v4sf)__W, (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m128 _mm_maskz_mul_ss(__mmask8 __U, __m128 __A, __m128 __B) {
-  return (__m128)__builtin_ia32_mulss_mask_round(
-      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m512d _mm512_div_pd(__m512d __M, __m512d __V) {
-  return (__m512d)((__v8df)__M / (__v8df)__V);
-}
-
-__funline __m512d _mm512_mask_div_pd(__m512d __W, __mmask8 __U, __m512d __M,
-        __m512d __V) {
-  return (__m512d)__builtin_ia32_divpd512_mask((__v8df)__M, (__v8df)__V,
-      (__v8df)__W, (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m512d _mm512_maskz_div_pd(__mmask8 __U, __m512d __M, __m512d __V) {
-  return (__m512d)__builtin_ia32_divpd512_mask(
-      (__v8df)__M, (__v8df)__V, (__v8df)_mm512_setzero_pd(), (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m512 _mm512_div_ps(__m512 __A, __m512 __B) {
-  return (__m512)((__v16sf)__A / (__v16sf)__B);
-}
-
-__funline __m512 _mm512_mask_div_ps(__m512 __W, __mmask16 __U, __m512 __A,
-        __m512 __B) {
-  return (__m512)__builtin_ia32_divps512_mask((__v16sf)__A, (__v16sf)__B,
-      (__v16sf)__W, (__mmask16)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m512 _mm512_maskz_div_ps(__mmask16 __U, __m512 __A, __m512 __B) {
-  return (__m512)__builtin_ia32_divps512_mask(
-      (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m128d _mm_mask_div_sd(__m128d __W, __mmask8 __U, __m128d __A,
-        __m128d __B) {
-  return (__m128d)__builtin_ia32_divsd_mask_round((__v2df)__A, (__v2df)__B,
-      (__v2df)__W, (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m128d _mm_maskz_div_sd(__mmask8 __U, __m128d __A, __m128d __B) {
-  return (__m128d)__builtin_ia32_divsd_mask_round(
-      (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m128 _mm_mask_div_ss(__m128 __W, __mmask8 __U, __m128 __A,
-        __m128 __B) {
-  return (__m128)__builtin_ia32_divss_mask_round((__v4sf)__A, (__v4sf)__B,
-      (__v4sf)__W, (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m128 _mm_maskz_div_ss(__mmask8 __U, __m128 __A, __m128 __B) {
-  return (__m128)__builtin_ia32_divss_mask_round(
-      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m512d _mm512_max_pd(__m512d __A, __m512d __B) {
-  return (__m512d)__builtin_ia32_maxpd512_mask(
-      (__v8df)__A, (__v8df)__B, (__v8df)_mm512_undefined_pd(), (__mmask8)-1,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m512d _mm512_mask_max_pd(__m512d __W, __mmask8 __U, __m512d __A,
-        __m512d __B) {
-  return (__m512d)__builtin_ia32_maxpd512_mask((__v8df)__A, (__v8df)__B,
-      (__v8df)__W, (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m512d _mm512_maskz_max_pd(__mmask8 __U, __m512d __A, __m512d __B) {
-  return (__m512d)__builtin_ia32_maxpd512_mask(
-      (__v8df)__A, (__v8df)__B, (__v8df)_mm512_setzero_pd(), (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m512 _mm512_max_ps(__m512 __A, __m512 __B) {
-  return (__m512)__builtin_ia32_maxps512_mask(
-      (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m512 _mm512_mask_max_ps(__m512 __W, __mmask16 __U, __m512 __A,
-        __m512 __B) {
-  return (__m512)__builtin_ia32_maxps512_mask((__v16sf)__A, (__v16sf)__B,
-      (__v16sf)__W, (__mmask16)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m512 _mm512_maskz_max_ps(__mmask16 __U, __m512 __A, __m512 __B) {
-  return (__m512)__builtin_ia32_maxps512_mask(
-      (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m128d _mm_mask_max_sd(__m128d __W, __mmask8 __U, __m128d __A,
-        __m128d __B) {
-  return (__m128d)__builtin_ia32_maxsd_mask_round((__v2df)__A, (__v2df)__B,
-      (__v2df)__W, (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m128d _mm_maskz_max_sd(__mmask8 __U, __m128d __A, __m128d __B) {
-  return (__m128d)__builtin_ia32_maxsd_mask_round(
-      (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m128 _mm_mask_max_ss(__m128 __W, __mmask8 __U, __m128 __A,
-        __m128 __B) {
-  return (__m128)__builtin_ia32_maxss_mask_round((__v4sf)__A, (__v4sf)__B,
-      (__v4sf)__W, (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m128 _mm_maskz_max_ss(__mmask8 __U, __m128 __A, __m128 __B) {
-  return (__m128)__builtin_ia32_maxss_mask_round(
-      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m512d _mm512_min_pd(__m512d __A, __m512d __B) {
-  return (__m512d)__builtin_ia32_minpd512_mask(
-      (__v8df)__A, (__v8df)__B, (__v8df)_mm512_undefined_pd(), (__mmask8)-1,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m512d _mm512_mask_min_pd(__m512d __W, __mmask8 __U, __m512d __A,
-        __m512d __B) {
-  return (__m512d)__builtin_ia32_minpd512_mask((__v8df)__A, (__v8df)__B,
-      (__v8df)__W, (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-__funline __m512d _mm512_maskz_min_pd(__mmask8 __U, __m512d __A, __m512d __B) { - return (__m512d)__builtin_ia32_minpd512_mask( - (__v8df)__A, (__v8df)__B, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512 _mm512_min_ps(__m512 __A, __m512 __B) { - return (__m512)__builtin_ia32_minps512_mask( - (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512 _mm512_mask_min_ps(__m512 __W, __mmask16 __U, __m512 __A, - __m512 __B) { - return (__m512)__builtin_ia32_minps512_mask((__v16sf)__A, (__v16sf)__B, - (__v16sf)__W, (__mmask16)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512 _mm512_maskz_min_ps(__mmask16 __U, __m512 __A, __m512 __B) { - return (__m512)__builtin_ia32_minps512_mask( - (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m128d _mm_mask_min_sd(__m128d __W, __mmask8 __U, __m128d __A, - __m128d __B) { - return (__m128d)__builtin_ia32_minsd_mask_round((__v2df)__A, (__v2df)__B, - (__v2df)__W, (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m128d _mm_maskz_min_sd(__mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_minsd_mask_round( - (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m128 _mm_mask_min_ss(__m128 __W, __mmask8 __U, __m128 __A, - __m128 __B) { - return (__m128)__builtin_ia32_minss_mask_round((__v4sf)__A, (__v4sf)__B, - (__v4sf)__W, (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m128 _mm_maskz_min_ss(__mmask8 __U, __m128 __A, __m128 __B) { - return (__m128)__builtin_ia32_minss_mask_round( - (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512d _mm512_scalef_pd(__m512d __A, __m512d __B) { - return (__m512d)__builtin_ia32_scalefpd512_mask( - (__v8df)__A, (__v8df)__B, (__v8df)_mm512_undefined_pd(), (__mmask8)-1, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512d _mm512_mask_scalef_pd(__m512d __W, __mmask8 __U, __m512d __A, - __m512d __B) { - return (__m512d)__builtin_ia32_scalefpd512_mask((__v8df)__A, (__v8df)__B, - (__v8df)__W, (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512d _mm512_maskz_scalef_pd(__mmask8 __U, __m512d __A, __m512d __B) { - return (__m512d)__builtin_ia32_scalefpd512_mask( - (__v8df)__A, (__v8df)__B, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512 _mm512_scalef_ps(__m512 __A, __m512 __B) { - return (__m512)__builtin_ia32_scalefps512_mask( - (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512 _mm512_mask_scalef_ps(__m512 __W, __mmask16 __U, __m512 __A, - __m512 __B) { - return (__m512)__builtin_ia32_scalefps512_mask((__v16sf)__A, (__v16sf)__B, - (__v16sf)__W, (__mmask16)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512 _mm512_maskz_scalef_ps(__mmask16 __U, __m512 __A, __m512 __B) { - return (__m512)__builtin_ia32_scalefps512_mask( - (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m128d _mm_scalef_sd(__m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_scalefsd_mask_round( - (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)-1, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m128 _mm_scalef_ss(__m128 __A, __m128 __B) { - return 
(__m128)__builtin_ia32_scalefss_mask_round( - (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)-1, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512d _mm512_fmadd_pd(__m512d __A, __m512d __B, __m512d __C) { - return (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)__A, (__v8df)__B, - (__v8df)__C, (__mmask8)-1, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512d _mm512_mask_fmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, - __m512d __C) { - return (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)__A, (__v8df)__B, - (__v8df)__C, (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512d _mm512_mask3_fmadd_pd(__m512d __A, __m512d __B, __m512d __C, - __mmask8 __U) { - return (__m512d)__builtin_ia32_vfmaddpd512_mask3((__v8df)__A, (__v8df)__B, - (__v8df)__C, (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512d _mm512_maskz_fmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, - __m512d __C) { - return (__m512d)__builtin_ia32_vfmaddpd512_maskz((__v8df)__A, (__v8df)__B, - (__v8df)__C, (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512 _mm512_fmadd_ps(__m512 __A, __m512 __B, __m512 __C) { - return (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)__A, (__v16sf)__B, - (__v16sf)__C, (__mmask16)-1, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512 _mm512_mask_fmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, - __m512 __C) { - return (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)__A, (__v16sf)__B, - (__v16sf)__C, (__mmask16)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512 _mm512_mask3_fmadd_ps(__m512 __A, __m512 __B, __m512 __C, - __mmask16 __U) { - return (__m512)__builtin_ia32_vfmaddps512_mask3((__v16sf)__A, (__v16sf)__B, - (__v16sf)__C, (__mmask16)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512 _mm512_maskz_fmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, - __m512 __C) { - return (__m512)__builtin_ia32_vfmaddps512_maskz((__v16sf)__A, (__v16sf)__B, - (__v16sf)__C, (__mmask16)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512d _mm512_fmsub_pd(__m512d __A, __m512d __B, __m512d __C) { - return (__m512d)__builtin_ia32_vfmsubpd512_mask((__v8df)__A, (__v8df)__B, - (__v8df)__C, (__mmask8)-1, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512d _mm512_mask_fmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, - __m512d __C) { - return (__m512d)__builtin_ia32_vfmsubpd512_mask((__v8df)__A, (__v8df)__B, - (__v8df)__C, (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512d _mm512_mask3_fmsub_pd(__m512d __A, __m512d __B, __m512d __C, - __mmask8 __U) { - return (__m512d)__builtin_ia32_vfmsubpd512_mask3((__v8df)__A, (__v8df)__B, - (__v8df)__C, (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512d _mm512_maskz_fmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, - __m512d __C) { - return (__m512d)__builtin_ia32_vfmsubpd512_maskz((__v8df)__A, (__v8df)__B, - (__v8df)__C, (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512 _mm512_fmsub_ps(__m512 __A, __m512 __B, __m512 __C) { - return (__m512)__builtin_ia32_vfmsubps512_mask((__v16sf)__A, (__v16sf)__B, - (__v16sf)__C, (__mmask16)-1, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512 _mm512_mask_fmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, - __m512 __C) { - return (__m512)__builtin_ia32_vfmsubps512_mask((__v16sf)__A, (__v16sf)__B, - (__v16sf)__C, (__mmask16)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512 _mm512_mask3_fmsub_ps(__m512 __A, __m512 __B, __m512 __C, - __mmask16 __U) { - return (__m512)__builtin_ia32_vfmsubps512_mask3((__v16sf)__A, 
(__v16sf)__B, - (__v16sf)__C, (__mmask16)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512 _mm512_maskz_fmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, - __m512 __C) { - return (__m512)__builtin_ia32_vfmsubps512_maskz((__v16sf)__A, (__v16sf)__B, - (__v16sf)__C, (__mmask16)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512d _mm512_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C) { - return (__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)__A, (__v8df)__B, - (__v8df)__C, (__mmask8)-1, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512d _mm512_mask_fmaddsub_pd(__m512d __A, __mmask8 __U, __m512d __B, - __m512d __C) { - return (__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)__A, (__v8df)__B, - (__v8df)__C, (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512d _mm512_mask3_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C, - __mmask8 __U) { - return (__m512d)__builtin_ia32_vfmaddsubpd512_mask3( - (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512d _mm512_maskz_fmaddsub_pd(__mmask8 __U, __m512d __A, __m512d __B, - __m512d __C) { - return (__m512d)__builtin_ia32_vfmaddsubpd512_maskz( - (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512 _mm512_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C) { - return (__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)__A, (__v16sf)__B, - (__v16sf)__C, (__mmask16)-1, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512 _mm512_mask_fmaddsub_ps(__m512 __A, __mmask16 __U, __m512 __B, - __m512 __C) { - return (__m512)__builtin_ia32_vfmaddsubps512_mask( - (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512 _mm512_mask3_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C, - __mmask16 __U) { - return (__m512)__builtin_ia32_vfmaddsubps512_mask3( - (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512 _mm512_maskz_fmaddsub_ps(__mmask16 __U, __m512 __A, __m512 __B, - __m512 __C) { - return (__m512)__builtin_ia32_vfmaddsubps512_maskz( - (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512d _mm512_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C) { - return (__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)__A, (__v8df)__B, - -(__v8df)__C, (__mmask8)-1, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512d _mm512_mask_fmsubadd_pd(__m512d __A, __mmask8 __U, __m512d __B, - __m512d __C) { - return (__m512d)__builtin_ia32_vfmaddsubpd512_mask( - (__v8df)__A, (__v8df)__B, -(__v8df)__C, (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512d _mm512_mask3_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C, - __mmask8 __U) { - return (__m512d)__builtin_ia32_vfmsubaddpd512_mask3( - (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512d _mm512_maskz_fmsubadd_pd(__mmask8 __U, __m512d __A, __m512d __B, - __m512d __C) { - return (__m512d)__builtin_ia32_vfmaddsubpd512_maskz( - (__v8df)__A, (__v8df)__B, -(__v8df)__C, (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512 _mm512_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C) { - return (__m512)__builtin_ia32_vfmaddsubps512_mask( - (__v16sf)__A, (__v16sf)__B, -(__v16sf)__C, (__mmask16)-1, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512 _mm512_mask_fmsubadd_ps(__m512 __A, __mmask16 __U, __m512 __B, - __m512 __C) { - return 
(__m512)__builtin_ia32_vfmaddsubps512_mask( - (__v16sf)__A, (__v16sf)__B, -(__v16sf)__C, (__mmask16)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512 _mm512_mask3_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C, - __mmask16 __U) { - return (__m512)__builtin_ia32_vfmsubaddps512_mask3( - (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512 _mm512_maskz_fmsubadd_ps(__mmask16 __U, __m512 __A, __m512 __B, - __m512 __C) { - return (__m512)__builtin_ia32_vfmaddsubps512_maskz( - (__v16sf)__A, (__v16sf)__B, -(__v16sf)__C, (__mmask16)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512d _mm512_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C) { - return (__m512d)__builtin_ia32_vfnmaddpd512_mask((__v8df)__A, (__v8df)__B, - (__v8df)__C, (__mmask8)-1, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512d _mm512_mask_fnmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, - __m512d __C) { - return (__m512d)__builtin_ia32_vfnmaddpd512_mask((__v8df)__A, (__v8df)__B, - (__v8df)__C, (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512d _mm512_mask3_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C, - __mmask8 __U) { - return (__m512d)__builtin_ia32_vfnmaddpd512_mask3((__v8df)__A, (__v8df)__B, - (__v8df)__C, (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512d _mm512_maskz_fnmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, - __m512d __C) { - return (__m512d)__builtin_ia32_vfnmaddpd512_maskz((__v8df)__A, (__v8df)__B, - (__v8df)__C, (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512 _mm512_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C) { - return (__m512)__builtin_ia32_vfnmaddps512_mask((__v16sf)__A, (__v16sf)__B, - (__v16sf)__C, (__mmask16)-1, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512 _mm512_mask_fnmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, - __m512 __C) { - return (__m512)__builtin_ia32_vfnmaddps512_mask((__v16sf)__A, (__v16sf)__B, - (__v16sf)__C, (__mmask16)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512 _mm512_mask3_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C, - __mmask16 __U) { - return (__m512)__builtin_ia32_vfnmaddps512_mask3((__v16sf)__A, (__v16sf)__B, - (__v16sf)__C, (__mmask16)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512 _mm512_maskz_fnmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, - __m512 __C) { - return (__m512)__builtin_ia32_vfnmaddps512_maskz((__v16sf)__A, (__v16sf)__B, - (__v16sf)__C, (__mmask16)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512d _mm512_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C) { - return (__m512d)__builtin_ia32_vfnmsubpd512_mask((__v8df)__A, (__v8df)__B, - (__v8df)__C, (__mmask8)-1, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512d _mm512_mask_fnmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, - __m512d __C) { - return (__m512d)__builtin_ia32_vfnmsubpd512_mask((__v8df)__A, (__v8df)__B, - (__v8df)__C, (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512d _mm512_mask3_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C, - __mmask8 __U) { - return (__m512d)__builtin_ia32_vfnmsubpd512_mask3((__v8df)__A, (__v8df)__B, - (__v8df)__C, (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512d _mm512_maskz_fnmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, - __m512d __C) { - return (__m512d)__builtin_ia32_vfnmsubpd512_maskz((__v8df)__A, (__v8df)__B, - (__v8df)__C, (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512 _mm512_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C) { - return 
(__m512)__builtin_ia32_vfnmsubps512_mask((__v16sf)__A, (__v16sf)__B, - (__v16sf)__C, (__mmask16)-1, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512 _mm512_mask_fnmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, - __m512 __C) { - return (__m512)__builtin_ia32_vfnmsubps512_mask((__v16sf)__A, (__v16sf)__B, - (__v16sf)__C, (__mmask16)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512 _mm512_mask3_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C, - __mmask16 __U) { - return (__m512)__builtin_ia32_vfnmsubps512_mask3((__v16sf)__A, (__v16sf)__B, - (__v16sf)__C, (__mmask16)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512 _mm512_maskz_fnmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, - __m512 __C) { - return (__m512)__builtin_ia32_vfnmsubps512_maskz((__v16sf)__A, (__v16sf)__B, - (__v16sf)__C, (__mmask16)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m256i _mm512_cvttpd_epi32(__m512d __A) { - return (__m256i)__builtin_ia32_cvttpd2dq512_mask( - (__v8df)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m256i _mm512_mask_cvttpd_epi32(__m256i __W, __mmask8 __U, - __m512d __A) { - return (__m256i)__builtin_ia32_cvttpd2dq512_mask( - (__v8df)__A, (__v8si)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); -} - -__funline __m256i _mm512_maskz_cvttpd_epi32(__mmask8 __U, __m512d __A) { - return (__m256i)__builtin_ia32_cvttpd2dq512_mask( - (__v8df)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m256i _mm512_cvttpd_epu32(__m512d __A) { - return (__m256i)__builtin_ia32_cvttpd2udq512_mask( - (__v8df)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m256i _mm512_mask_cvttpd_epu32(__m256i __W, __mmask8 __U, - __m512d __A) { - return (__m256i)__builtin_ia32_cvttpd2udq512_mask( - (__v8df)__A, (__v8si)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); -} - -__funline __m256i _mm512_maskz_cvttpd_epu32(__mmask8 __U, __m512d __A) { - return (__m256i)__builtin_ia32_cvttpd2udq512_mask( - (__v8df)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m256i _mm512_cvtpd_epi32(__m512d __A) { - return (__m256i)__builtin_ia32_cvtpd2dq512_mask( - (__v8df)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m256i _mm512_mask_cvtpd_epi32(__m256i __W, __mmask8 __U, - __m512d __A) { - return (__m256i)__builtin_ia32_cvtpd2dq512_mask( - (__v8df)__A, (__v8si)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); -} - -__funline __m256i _mm512_maskz_cvtpd_epi32(__mmask8 __U, __m512d __A) { - return (__m256i)__builtin_ia32_cvtpd2dq512_mask( - (__v8df)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m256i _mm512_cvtpd_epu32(__m512d __A) { - return (__m256i)__builtin_ia32_cvtpd2udq512_mask( - (__v8df)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m256i _mm512_mask_cvtpd_epu32(__m256i __W, __mmask8 __U, - __m512d __A) { - return (__m256i)__builtin_ia32_cvtpd2udq512_mask( - (__v8df)__A, (__v8si)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); -} - -__funline __m256i _mm512_maskz_cvtpd_epu32(__mmask8 __U, __m512d __A) { - return (__m256i)__builtin_ia32_cvtpd2udq512_mask( - (__v8df)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512i _mm512_cvttps_epi32(__m512 __A) { - return (__m512i)__builtin_ia32_cvttps2dq512_mask( - (__v16sf)__A, 
(__v16si)_mm512_undefined_epi32(), (__mmask16)-1, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512i _mm512_mask_cvttps_epi32(__m512i __W, __mmask16 __U, - __m512 __A) { - return (__m512i)__builtin_ia32_cvttps2dq512_mask( - (__v16sf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512i _mm512_maskz_cvttps_epi32(__mmask16 __U, __m512 __A) { - return (__m512i)__builtin_ia32_cvttps2dq512_mask( - (__v16sf)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512i _mm512_cvttps_epu32(__m512 __A) { - return (__m512i)__builtin_ia32_cvttps2udq512_mask( - (__v16sf)__A, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512i _mm512_mask_cvttps_epu32(__m512i __W, __mmask16 __U, - __m512 __A) { - return (__m512i)__builtin_ia32_cvttps2udq512_mask( - (__v16sf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512i _mm512_maskz_cvttps_epu32(__mmask16 __U, __m512 __A) { - return (__m512i)__builtin_ia32_cvttps2udq512_mask( - (__v16sf)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512i _mm512_cvtps_epi32(__m512 __A) { - return (__m512i)__builtin_ia32_cvtps2dq512_mask( - (__v16sf)__A, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512i _mm512_mask_cvtps_epi32(__m512i __W, __mmask16 __U, - __m512 __A) { - return (__m512i)__builtin_ia32_cvtps2dq512_mask( - (__v16sf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512i _mm512_maskz_cvtps_epi32(__mmask16 __U, __m512 __A) { - return (__m512i)__builtin_ia32_cvtps2dq512_mask( - (__v16sf)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512i _mm512_cvtps_epu32(__m512 __A) { - return (__m512i)__builtin_ia32_cvtps2udq512_mask( - (__v16sf)__A, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512i _mm512_mask_cvtps_epu32(__m512i __W, __mmask16 __U, - __m512 __A) { - return (__m512i)__builtin_ia32_cvtps2udq512_mask( - (__v16sf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512i _mm512_maskz_cvtps_epu32(__mmask16 __U, __m512 __A) { - return (__m512i)__builtin_ia32_cvtps2udq512_mask( - (__v16sf)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline double _mm512_cvtsd_f64(__m512d __A) { +#ifdef __OPTIMIZE__ +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comi_round_ss (__m128 __A, __m128 __B, const int __P, const int __R) +{ + return __builtin_ia32_vcomiss ((__v4sf) __A, (__v4sf) __B, __P, __R); +} +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comi_round_sd (__m128d __A, __m128d __B, const int __P, const int __R) +{ + return __builtin_ia32_vcomisd ((__v2df) __A, (__v2df) __B, __P, __R); +} +#else +#define _mm_comi_round_ss(A, B, C, D) __builtin_ia32_vcomiss(A, B, C, D) +#define _mm_comi_round_sd(A, B, C, D) __builtin_ia32_vcomisd(A, B, C, D) +#endif +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sqrt_pd (__m512d __A) +{ + return (__m512d) __builtin_ia32_sqrtpd512_mask ((__v8df) __A, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sqrt_pd (__m512d __W, __mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_sqrtpd512_mask ((__v8df) __A, + (__v8df) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sqrt_pd (__mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_sqrtpd512_mask ((__v8df) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sqrt_ps (__m512 __A) +{ + return (__m512) __builtin_ia32_sqrtps512_mask ((__v16sf) __A, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sqrt_ps (__m512 __W, __mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_sqrtps512_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sqrt_ps (__mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_sqrtps512_mask ((__v16sf) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_add_pd (__m512d __A, __m512d __B) +{ + return (__m512d) ((__v8df)__A + (__v8df)__B); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_add_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_add_pd (__mmask8 __U, __m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_add_ps (__m512 __A, __m512 __B) +{ + return (__m512) ((__v16sf)__A + (__v16sf)__B); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_add_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_add_ps (__mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_add_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_addsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_add_sd (__mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_addsd_mask_round ((__v2df) __A, + 
(__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_add_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_addss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_add_ss (__mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_addss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sub_pd (__m512d __A, __m512d __B) +{ + return (__m512d) ((__v8df)__A - (__v8df)__B); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sub_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sub_pd (__mmask8 __U, __m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sub_ps (__m512 __A, __m512 __B) +{ + return (__m512) ((__v16sf)__A - (__v16sf)__B); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sub_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sub_ps (__mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_subsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sub_sd (__mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_subsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_subss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sub_ss (__mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_subss_mask_round ((__v4sf) __A, + 
(__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mul_pd (__m512d __A, __m512d __B) +{ + return (__m512d) ((__v8df)__A * (__v8df)__B); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_mul_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_mul_pd (__mmask8 __U, __m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mul_ps (__m512 __A, __m512 __B) +{ + return (__m512) ((__v16sf)__A * (__v16sf)__B); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_mul_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_mul_ps (__mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_mul_sd (__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B) +{ + return (__m128d) __builtin_ia32_mulsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_mul_sd (__mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_mulsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_mul_ss (__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B) +{ + return (__m128) __builtin_ia32_mulss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_mul_ss (__mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_mulss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_div_pd (__m512d __M, __m512d __V) +{ + return (__m512d) ((__v8df)__M / (__v8df)__V); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_div_pd (__m512d __W, __mmask8 __U, __m512d __M, __m512d __V) +{ + return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __M, + (__v8df) __V, + (__v8df) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512d 
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_div_pd (__mmask8 __U, __m512d __M, __m512d __V) +{ + return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __M, + (__v8df) __V, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_div_ps (__m512 __A, __m512 __B) +{ + return (__m512) ((__v16sf)__A / (__v16sf)__B); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_div_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_div_ps (__mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_div_sd (__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B) +{ + return (__m128d) __builtin_ia32_divsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_div_sd (__mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_divsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_div_ss (__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B) +{ + return (__m128) __builtin_ia32_divss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_div_ss (__mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_divss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_max_pd (__m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_max_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_max_pd (__mmask8 __U, __m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_max_ps (__m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A, + (__v16sf) __B, + 
(__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_max_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_max_ps (__mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_max_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_maxsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_max_sd (__mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_maxsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_max_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_maxss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_max_ss (__mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_maxss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_min_pd (__m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_min_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_min_pd (__mmask8 __U, __m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_min_ps (__m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_min_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} 
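Every wrapper in this hunk passes _MM_FROUND_CUR_DIRECTION, which tells the builtin to round according to the current MXCSR mode and to report floating-point exceptions as usual. Code that needs a specific mode for a single operation can use the *_round_* variants instead; a small sketch, again assuming an AVX-512F target (round_down_add is a hypothetical name, not part of the patch):

#include <immintrin.h>

/* Round toward -inf for this one add, without modifying MXCSR,
   and suppress exception reporting via _MM_FROUND_NO_EXC. */
__m512d round_down_add(__m512d a, __m512d b) {
  return _mm512_add_round_pd(a, b, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
}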
+extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_min_ps (__mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_min_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_minsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_min_sd (__mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_minsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_min_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_minss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_min_ss (__mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_minss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_scalef_pd (__m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_scalef_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_scalef_pd (__mmask8 __U, __m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_scalef_ps (__m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_scalef_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_scalef_ps (__mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m128d +__attribute__ 
((__gnu_inline__, __always_inline__, __artificial__)) +_mm_scalef_sd (__m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_scalefsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_scalef_ss (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_scalefss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fmadd_pd (__m512d __A, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmadd_pd (__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fmadd_pd (__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) +{ + return (__m512d) __builtin_ia32_vfmaddpd512_mask3 ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fmadd_pd (__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fmadd_ps (__m512 __A, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmadd_ps (__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fmadd_ps (__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) +{ + return (__m512) __builtin_ia32_vfmaddps512_mask3 ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fmadd_ps (__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fmsub_pd (__m512d __A, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfmsubpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmsub_pd (__m512d __A, __mmask8 __U, __m512d 
__B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfmsubpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fmsub_pd (__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) +{ + return (__m512d) __builtin_ia32_vfmsubpd512_mask3 ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fmsub_pd (__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfmsubpd512_maskz ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fmsub_ps (__m512 __A, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfmsubps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmsub_ps (__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfmsubps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fmsub_ps (__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) +{ + return (__m512) __builtin_ia32_vfmsubps512_mask3 ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fmsub_ps (__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfmsubps512_maskz ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fmaddsub_pd (__m512d __A, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmaddsub_pd (__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fmaddsub_pd (__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) +{ + return (__m512d) __builtin_ia32_vfmaddsubpd512_mask3 ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fmaddsub_pd (__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fmaddsub_ps (__m512 __A, __m512 __B, __m512 __C) +{ + return (__m512) 
__builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmaddsub_ps (__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fmaddsub_ps (__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) +{ + return (__m512) __builtin_ia32_vfmaddsubps512_mask3 ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fmaddsub_ps (__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fmsubadd_pd (__m512d __A, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmsubadd_pd (__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fmsubadd_pd (__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) +{ + return (__m512d) __builtin_ia32_vfmsubaddpd512_mask3 ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fmsubadd_pd (__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fmsubadd_ps (__m512 __A, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmsubadd_ps (__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fmsubadd_ps (__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) +{ + return (__m512) __builtin_ia32_vfmsubaddps512_mask3 ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fmsubadd_ps (__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) +{ + 
return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fnmadd_pd (__m512d __A, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfnmaddpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fnmadd_pd (__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfnmaddpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fnmadd_pd (__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) +{ + return (__m512d) __builtin_ia32_vfnmaddpd512_mask3 ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fnmadd_pd (__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfnmaddpd512_maskz ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fnmadd_ps (__m512 __A, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfnmaddps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fnmadd_ps (__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfnmaddps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fnmadd_ps (__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) +{ + return (__m512) __builtin_ia32_vfnmaddps512_mask3 ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fnmadd_ps (__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfnmaddps512_maskz ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fnmsub_pd (__m512d __A, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfnmsubpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fnmsub_pd (__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfnmsubpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fnmsub_pd (__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) +{ + return (__m512d) 
__builtin_ia32_vfnmsubpd512_mask3 ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fnmsub_pd (__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfnmsubpd512_maskz ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fnmsub_ps (__m512 __A, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfnmsubps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fnmsub_ps (__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfnmsubps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fnmsub_ps (__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) +{ + return (__m512) __builtin_ia32_vfnmsubps512_mask3 ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fnmsub_ps (__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfnmsubps512_maskz ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvttpd_epi32 (__m512d __A) +{ + return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A, + (__v8si) + _mm256_undefined_si256 (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvttpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A) +{ + return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A, + (__v8si) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvttpd_epi32 (__mmask8 __U, __m512d __A) +{ + return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvttpd_epu32 (__m512d __A) +{ + return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A, + (__v8si) + _mm256_undefined_si256 (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvttpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A) +{ + return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A, + (__v8si) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvttpd_epu32 (__mmask8 __U, __m512d __A) +{ + return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m256i 
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtpd_epi32 (__m512d __A) +{ + return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A, + (__v8si) + _mm256_undefined_si256 (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A) +{ + return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A, + (__v8si) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtpd_epi32 (__mmask8 __U, __m512d __A) +{ + return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtpd_epu32 (__m512d __A) +{ + return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A, + (__v8si) + _mm256_undefined_si256 (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A) +{ + return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A, + (__v8si) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtpd_epu32 (__mmask8 __U, __m512d __A) +{ + return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvttps_epi32 (__m512 __A) +{ + return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvttps_epi32 (__m512i __W, __mmask16 __U, __m512 __A) +{ + return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A, + (__v16si) __W, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvttps_epi32 (__mmask16 __U, __m512 __A) +{ + return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvttps_epu32 (__m512 __A) +{ + return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvttps_epu32 (__m512i __W, __mmask16 __U, __m512 __A) +{ + return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A, + (__v16si) __W, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvttps_epu32 (__mmask16 __U, __m512 __A) +{ + return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} 
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtps_epi32 (__m512 __A)
+{
+  return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
+      (__v16si)
+      _mm512_undefined_epi32 (),
+      (__mmask16) -1,
+      _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtps_epi32 (__m512i __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
+      (__v16si) __W,
+      (__mmask16) __U,
+      _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtps_epi32 (__mmask16 __U, __m512 __A)
+{
+  return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
+      (__v16si)
+      _mm512_setzero_si512 (),
+      (__mmask16) __U,
+      _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtps_epu32 (__m512 __A)
+{
+  return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
+      (__v16si)
+      _mm512_undefined_epi32 (),
+      (__mmask16) -1,
+      _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtps_epu32 (__m512i __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
+      (__v16si) __W,
+      (__mmask16) __U,
+      _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtps_epu32 (__mmask16 __U, __m512 __A)
+{
+  return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
+      (__v16si)
+      _mm512_setzero_si512 (),
+      (__mmask16) __U,
+      _MM_FROUND_CUR_DIRECTION);
+}
+extern __inline double
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtsd_f64 (__m512d __A)
+{
   return __A[0];
 }
-
-__funline float _mm512_cvtss_f32(__m512 __A) {
+extern __inline float
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtss_f32 (__m512 __A)
+{
   return __A[0];
 }
-
 #ifdef __x86_64__
-__funline __m128 _mm_cvtu64_ss(__m128 __A, unsigned long long __B) {
-  return (__m128)__builtin_ia32_cvtusi2ss64((__v4sf)__A, __B,
-      _MM_FROUND_CUR_DIRECTION);
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtu64_ss (__m128 __A, unsigned long long __B)
+{
+  return (__m128) __builtin_ia32_cvtusi2ss64 ((__v4sf) __A, __B,
+      _MM_FROUND_CUR_DIRECTION);
 }
-
-__funline __m128d _mm_cvtu64_sd(__m128d __A, unsigned long long __B) {
-  return (__m128d)__builtin_ia32_cvtusi2sd64((__v2df)__A, __B,
-      _MM_FROUND_CUR_DIRECTION);
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtu64_sd (__m128d __A, unsigned long long __B)
+{
+  return (__m128d) __builtin_ia32_cvtusi2sd64 ((__v2df) __A, __B,
+      _MM_FROUND_CUR_DIRECTION);
 }
 #endif
-
-__funline __m128 _mm_cvtu32_ss(__m128 __A, unsigned __B) {
-  return (__m128)__builtin_ia32_cvtusi2ss32((__v4sf)__A, __B,
-      _MM_FROUND_CUR_DIRECTION);
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtu32_ss (__m128 __A, unsigned __B)
+{
+  return (__m128) __builtin_ia32_cvtusi2ss32 ((__v4sf) __A, __B,
+      _MM_FROUND_CUR_DIRECTION);
 }
-
-__funline __m512 _mm512_cvtepi32_ps(__m512i __A) {
-  return (__m512)__builtin_ia32_cvtdq2ps512_mask(
-      (__v16si)__A, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1,
-      _MM_FROUND_CUR_DIRECTION);
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepi32_ps (__m512i __A)
+{
+  return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A,
+      (__v16sf)
+      _mm512_undefined_ps (),
+      (__mmask16) -1,
+      _MM_FROUND_CUR_DIRECTION);
 }
-
-__funline __m512 _mm512_mask_cvtepi32_ps(__m512 __W, __mmask16 __U, __m512i __A) {
-  return (__m512)__builtin_ia32_cvtdq2ps512_mask(
-      (__v16si)__A, (__v16sf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi32_ps (__m512 __W, __mmask16 __U, __m512i __A)
+{
+  return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A,
+      (__v16sf) __W,
+      (__mmask16) __U,
+      _MM_FROUND_CUR_DIRECTION);
 }
-
-__funline __m512 _mm512_maskz_cvtepi32_ps(__mmask16 __U, __m512i __A) {
-  return (__m512)__builtin_ia32_cvtdq2ps512_mask(
-      (__v16si)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U,
-      _MM_FROUND_CUR_DIRECTION);
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepi32_ps (__mmask16 __U, __m512i __A)
+{
+  return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A,
+      (__v16sf)
+      _mm512_setzero_ps (),
+      (__mmask16) __U,
+      _MM_FROUND_CUR_DIRECTION);
 }
-
-__funline __m512 _mm512_cvtepu32_ps(__m512i __A) {
-  return (__m512)__builtin_ia32_cvtudq2ps512_mask(
-      (__v16si)__A, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1,
-      _MM_FROUND_CUR_DIRECTION);
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepu32_ps (__m512i __A)
+{
+  return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A,
+      (__v16sf)
+      _mm512_undefined_ps (),
+      (__mmask16) -1,
+      _MM_FROUND_CUR_DIRECTION);
 }
-
-__funline __m512 _mm512_mask_cvtepu32_ps(__m512 __W, __mmask16 __U, __m512i __A) {
-  return (__m512)__builtin_ia32_cvtudq2ps512_mask(
-      (__v16si)__A, (__v16sf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepu32_ps (__m512 __W, __mmask16 __U, __m512i __A)
+{
+  return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A,
+      (__v16sf) __W,
+      (__mmask16) __U,
+      _MM_FROUND_CUR_DIRECTION);
 }
-
-__funline __m512 _mm512_maskz_cvtepu32_ps(__mmask16 __U, __m512i __A) {
-  return (__m512)__builtin_ia32_cvtudq2ps512_mask(
-      (__v16si)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U,
-      _MM_FROUND_CUR_DIRECTION);
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepu32_ps (__mmask16 __U, __m512i __A)
+{
+  return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A,
+      (__v16sf)
+      _mm512_setzero_ps (),
+      (__mmask16) __U,
+      _MM_FROUND_CUR_DIRECTION);
 }
-
 #ifdef __OPTIMIZE__
-__funline __m512d _mm512_fixupimm_pd(__m512d __A, __m512d __B, __m512i __C,
-      const int __imm) {
-  return (__m512d)__builtin_ia32_fixupimmpd512_mask(
-      (__v8df)__A, (__v8df)__B, (__v8di)__C, __imm, (__mmask8)-1,
-      _MM_FROUND_CUR_DIRECTION);
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fixupimm_pd (__m512d __A, __m512d __B, __m512i __C, const int __imm)
+{
+  return (__m512d) __builtin_ia32_fixupimmpd512_mask ((__v8df) __A,
+      (__v8df) __B,
+      (__v8di) __C,
+      __imm,
+      (__mmask8) -1,
+      _MM_FROUND_CUR_DIRECTION);
 }
-
-__funline __m512d _mm512_mask_fixupimm_pd(__m512d __A, __mmask8 __U, __m512d __B,
-      __m512i __C, const int __imm) {
-  return (__m512d)__builtin_ia32_fixupimmpd512_mask(
-      (__v8df)__A, (__v8df)__B, (__v8di)__C, __imm, (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fixupimm_pd (__m512d __A, __mmask8 __U, __m512d __B,
+      __m512i __C, const int __imm)
+{
+  return (__m512d) __builtin_ia32_fixupimmpd512_mask ((__v8df) __A,
+      (__v8df) __B,
+      (__v8di) __C,
+      __imm,
+      (__mmask8) __U,
+      _MM_FROUND_CUR_DIRECTION);
 }
-
-__funline __m512d _mm512_maskz_fixupimm_pd(__mmask8 __U, __m512d __A, __m512d __B,
-      __m512i __C, const int __imm) {
-  return (__m512d)__builtin_ia32_fixupimmpd512_maskz(
-      (__v8df)__A, (__v8df)__B, (__v8di)__C, __imm, (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fixupimm_pd (__mmask8 __U, __m512d __A, __m512d __B,
+      __m512i __C, const int __imm)
+{
+  return (__m512d) __builtin_ia32_fixupimmpd512_maskz ((__v8df) __A,
+      (__v8df) __B,
+      (__v8di) __C,
+      __imm,
+      (__mmask8) __U,
+      _MM_FROUND_CUR_DIRECTION);
 }
-
-__funline __m512 _mm512_fixupimm_ps(__m512 __A, __m512 __B, __m512i __C,
-      const int __imm) {
-  return (__m512)__builtin_ia32_fixupimmps512_mask(
-      (__v16sf)__A, (__v16sf)__B, (__v16si)__C, __imm, (__mmask16)-1,
-      _MM_FROUND_CUR_DIRECTION);
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fixupimm_ps (__m512 __A, __m512 __B, __m512i __C, const int __imm)
+{
+  return (__m512) __builtin_ia32_fixupimmps512_mask ((__v16sf) __A,
+      (__v16sf) __B,
+      (__v16si) __C,
+      __imm,
+      (__mmask16) -1,
+      _MM_FROUND_CUR_DIRECTION);
 }
-
-__funline __m512 _mm512_mask_fixupimm_ps(__m512 __A, __mmask16 __U, __m512 __B,
-      __m512i __C, const int __imm) {
-  return (__m512)__builtin_ia32_fixupimmps512_mask(
-      (__v16sf)__A, (__v16sf)__B, (__v16si)__C, __imm, (__mmask16)__U,
-      _MM_FROUND_CUR_DIRECTION);
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fixupimm_ps (__m512 __A, __mmask16 __U, __m512 __B,
+      __m512i __C, const int __imm)
+{
+  return (__m512) __builtin_ia32_fixupimmps512_mask ((__v16sf) __A,
+      (__v16sf) __B,
+      (__v16si) __C,
+      __imm,
+      (__mmask16) __U,
+      _MM_FROUND_CUR_DIRECTION);
 }
-
-__funline __m512 _mm512_maskz_fixupimm_ps(__mmask16 __U, __m512 __A, __m512 __B,
-      __m512i __C, const int __imm) {
-  return (__m512)__builtin_ia32_fixupimmps512_maskz(
-      (__v16sf)__A, (__v16sf)__B, (__v16si)__C, __imm, (__mmask16)__U,
-      _MM_FROUND_CUR_DIRECTION);
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fixupimm_ps (__mmask16 __U, __m512 __A, __m512 __B,
+      __m512i __C, const int __imm)
+{
+  return (__m512) __builtin_ia32_fixupimmps512_maskz ((__v16sf) __A,
+      (__v16sf) __B,
+      (__v16si) __C,
+      __imm,
+      (__mmask16) __U,
+      _MM_FROUND_CUR_DIRECTION);
 }
-
-__funline __m128d _mm_fixupimm_sd(__m128d __A, __m128d __B, __m128i __C,
-      const int __imm) {
-  return (__m128d)__builtin_ia32_fixupimmsd_mask(
-      (__v2df)__A, (__v2df)__B, (__v2di)__C, __imm, (__mmask8)-1,
-      _MM_FROUND_CUR_DIRECTION);
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fixupimm_sd (__m128d __A, __m128d __B, __m128i __C, const int __imm)
+{
+  return (__m128d) __builtin_ia32_fixupimmsd_mask ((__v2df) __A,
+      (__v2df) __B,
+      (__v2di) __C, __imm,
+      (__mmask8) -1,
+      _MM_FROUND_CUR_DIRECTION);
 }
-
-__funline __m128d _mm_mask_fixupimm_sd(__m128d __A, __mmask8 __U, __m128d __B,
-      __m128i __C, const int __imm) {
-  return (__m128d)__builtin_ia32_fixupimmsd_mask(
-      (__v2df)__A, (__v2df)__B, (__v2di)__C, __imm, (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fixupimm_sd (__m128d __A, __mmask8 __U, __m128d __B,
+      __m128i __C, const int __imm)
+{
+  return (__m128d) __builtin_ia32_fixupimmsd_mask ((__v2df) __A,
+      (__v2df) __B,
+      (__v2di) __C, __imm,
+      (__mmask8) __U,
+      _MM_FROUND_CUR_DIRECTION);
 }
-
-__funline __m128d _mm_maskz_fixupimm_sd(__mmask8 __U, __m128d __A, __m128d __B,
-      __m128i __C, const int __imm) {
-  return (__m128d)__builtin_ia32_fixupimmsd_maskz(
-      (__v2df)__A, (__v2df)__B, (__v2di)__C, __imm, (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fixupimm_sd (__mmask8 __U, __m128d __A, __m128d __B,
+      __m128i __C, const int __imm)
+{
+  return (__m128d) __builtin_ia32_fixupimmsd_maskz ((__v2df) __A,
+      (__v2df) __B,
+      (__v2di) __C,
+      __imm,
+      (__mmask8) __U,
+      _MM_FROUND_CUR_DIRECTION);
 }
-
-__funline __m128 _mm_fixupimm_ss(__m128 __A, __m128 __B, __m128i __C,
-      const int __imm) {
-  return (__m128)__builtin_ia32_fixupimmss_mask(
-      (__v4sf)__A, (__v4sf)__B, (__v4si)__C, __imm, (__mmask8)-1,
-      _MM_FROUND_CUR_DIRECTION);
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fixupimm_ss (__m128 __A, __m128 __B, __m128i __C, const int __imm)
+{
+  return (__m128) __builtin_ia32_fixupimmss_mask ((__v4sf) __A,
+      (__v4sf) __B,
+      (__v4si) __C, __imm,
+      (__mmask8) -1,
+      _MM_FROUND_CUR_DIRECTION);
 }
-
-__funline __m128 _mm_mask_fixupimm_ss(__m128 __A, __mmask8 __U, __m128 __B,
-      __m128i __C, const int __imm) {
-  return (__m128)__builtin_ia32_fixupimmss_mask(
-      (__v4sf)__A, (__v4sf)__B, (__v4si)__C, __imm, (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fixupimm_ss (__m128 __A, __mmask8 __U, __m128 __B,
+      __m128i __C, const int __imm)
+{
+  return (__m128) __builtin_ia32_fixupimmss_mask ((__v4sf) __A,
+      (__v4sf) __B,
+      (__v4si) __C, __imm,
+      (__mmask8) __U,
+      _MM_FROUND_CUR_DIRECTION);
 }
-
-__funline __m128 _mm_maskz_fixupimm_ss(__mmask8 __U, __m128 __A, __m128 __B,
-      __m128i __C, const int __imm) {
-  return (__m128)__builtin_ia32_fixupimmss_maskz(
-      (__v4sf)__A, (__v4sf)__B, (__v4si)__C, __imm, (__mmask8)__U,
-      _MM_FROUND_CUR_DIRECTION);
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fixupimm_ss (__mmask8 __U, __m128 __A, __m128 __B,
+      __m128i __C, const int __imm)
+{
+  return (__m128) __builtin_ia32_fixupimmss_maskz ((__v4sf) __A,
+      (__v4sf) __B,
+      (__v4si) __C, __imm,
+      (__mmask8) __U,
+      _MM_FROUND_CUR_DIRECTION);
 }
 #else
-#define _mm512_fixupimm_pd(X, Y, Z, C) \
-  ((__m512d)__builtin_ia32_fixupimmpd512_mask( \
-      (__v8df)(__m512d)(X), (__v8df)(__m512d)(Y), (__v8di)(__m512i)(Z), \
-      (int)(C), (__mmask8)(-1), _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_mask_fixupimm_pd(X, U, Y, Z, C) \
-  ((__m512d)__builtin_ia32_fixupimmpd512_mask( \
-      (__v8df)(__m512d)(X), (__v8df)(__m512d)(Y), (__v8di)(__m512i)(Z), \
-      (int)(C), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_maskz_fixupimm_pd(U, X, Y, Z, C) \
-  ((__m512d)__builtin_ia32_fixupimmpd512_maskz( \
-      (__v8df)(__m512d)(X), (__v8df)(__m512d)(Y), (__v8di)(__m512i)(Z), \
-      (int)(C), (__mmask8)(U),
_MM_FROUND_CUR_DIRECTION)) - -#define _mm512_fixupimm_ps(X, Y, Z, C) \ - ((__m512)__builtin_ia32_fixupimmps512_mask( \ - (__v16sf)(__m512)(X), (__v16sf)(__m512)(Y), (__v16si)(__m512i)(Z), \ - (int)(C), (__mmask16)(-1), _MM_FROUND_CUR_DIRECTION)) - -#define _mm512_mask_fixupimm_ps(X, U, Y, Z, C) \ - ((__m512)__builtin_ia32_fixupimmps512_mask( \ - (__v16sf)(__m512)(X), (__v16sf)(__m512)(Y), (__v16si)(__m512i)(Z), \ - (int)(C), (__mmask16)(U), _MM_FROUND_CUR_DIRECTION)) - -#define _mm512_maskz_fixupimm_ps(U, X, Y, Z, C) \ - ((__m512)__builtin_ia32_fixupimmps512_maskz( \ - (__v16sf)(__m512)(X), (__v16sf)(__m512)(Y), (__v16si)(__m512i)(Z), \ - (int)(C), (__mmask16)(U), _MM_FROUND_CUR_DIRECTION)) - -#define _mm_fixupimm_sd(X, Y, Z, C) \ - ((__m128d)__builtin_ia32_fixupimmsd_mask( \ - (__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), \ - (int)(C), (__mmask8)(-1), _MM_FROUND_CUR_DIRECTION)) - -#define _mm_mask_fixupimm_sd(X, U, Y, Z, C) \ - ((__m128d)__builtin_ia32_fixupimmsd_mask( \ - (__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), \ - (int)(C), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) - -#define _mm_maskz_fixupimm_sd(U, X, Y, Z, C) \ - ((__m128d)__builtin_ia32_fixupimmsd_maskz( \ - (__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), \ - (int)(C), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) - -#define _mm_fixupimm_ss(X, Y, Z, C) \ - ((__m128)__builtin_ia32_fixupimmss_mask( \ - (__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), \ - (int)(C), (__mmask8)(-1), _MM_FROUND_CUR_DIRECTION)) - -#define _mm_mask_fixupimm_ss(X, U, Y, Z, C) \ - ((__m128)__builtin_ia32_fixupimmss_mask( \ - (__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), \ - (int)(C), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) - -#define _mm_maskz_fixupimm_ss(U, X, Y, Z, C) \ - ((__m128)__builtin_ia32_fixupimmss_maskz( \ - (__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), \ - (int)(C), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) +#define _mm512_fixupimm_pd(X, Y, Z, C) ((__m512d)__builtin_ia32_fixupimmpd512_mask ((__v8df)(__m512d)(X), (__v8df)(__m512d)(Y), (__v8di)(__m512i)(Z), (int)(C), (__mmask8)(-1), _MM_FROUND_CUR_DIRECTION)) +#define _mm512_mask_fixupimm_pd(X, U, Y, Z, C) ((__m512d)__builtin_ia32_fixupimmpd512_mask ((__v8df)(__m512d)(X), (__v8df)(__m512d)(Y), (__v8di)(__m512i)(Z), (int)(C), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) +#define _mm512_maskz_fixupimm_pd(U, X, Y, Z, C) ((__m512d)__builtin_ia32_fixupimmpd512_maskz ((__v8df)(__m512d)(X), (__v8df)(__m512d)(Y), (__v8di)(__m512i)(Z), (int)(C), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) +#define _mm512_fixupimm_ps(X, Y, Z, C) ((__m512)__builtin_ia32_fixupimmps512_mask ((__v16sf)(__m512)(X), (__v16sf)(__m512)(Y), (__v16si)(__m512i)(Z), (int)(C), (__mmask16)(-1), _MM_FROUND_CUR_DIRECTION)) +#define _mm512_mask_fixupimm_ps(X, U, Y, Z, C) ((__m512)__builtin_ia32_fixupimmps512_mask ((__v16sf)(__m512)(X), (__v16sf)(__m512)(Y), (__v16si)(__m512i)(Z), (int)(C), (__mmask16)(U), _MM_FROUND_CUR_DIRECTION)) +#define _mm512_maskz_fixupimm_ps(U, X, Y, Z, C) ((__m512)__builtin_ia32_fixupimmps512_maskz ((__v16sf)(__m512)(X), (__v16sf)(__m512)(Y), (__v16si)(__m512i)(Z), (int)(C), (__mmask16)(U), _MM_FROUND_CUR_DIRECTION)) +#define _mm_fixupimm_sd(X, Y, Z, C) ((__m128d)__builtin_ia32_fixupimmsd_mask ((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C), (__mmask8)(-1), _MM_FROUND_CUR_DIRECTION)) +#define _mm_mask_fixupimm_sd(X, U, Y, Z, C) ((__m128d)__builtin_ia32_fixupimmsd_mask 
((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) +#define _mm_maskz_fixupimm_sd(U, X, Y, Z, C) ((__m128d)__builtin_ia32_fixupimmsd_maskz ((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) +#define _mm_fixupimm_ss(X, Y, Z, C) ((__m128)__builtin_ia32_fixupimmss_mask ((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C), (__mmask8)(-1), _MM_FROUND_CUR_DIRECTION)) +#define _mm_mask_fixupimm_ss(X, U, Y, Z, C) ((__m128)__builtin_ia32_fixupimmss_mask ((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) +#define _mm_maskz_fixupimm_ss(U, X, Y, Z, C) ((__m128)__builtin_ia32_fixupimmss_maskz ((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) #endif - #ifdef __x86_64__ -__funline unsigned long long _mm_cvtss_u64(__m128 __A) { - return (unsigned long long)__builtin_ia32_vcvtss2usi64( - (__v4sf)__A, _MM_FROUND_CUR_DIRECTION); +extern __inline unsigned long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtss_u64 (__m128 __A) +{ + return (unsigned long long) __builtin_ia32_vcvtss2usi64 ((__v4sf) + __A, + _MM_FROUND_CUR_DIRECTION); } - -__funline unsigned long long _mm_cvttss_u64(__m128 __A) { - return (unsigned long long)__builtin_ia32_vcvttss2usi64( - (__v4sf)__A, _MM_FROUND_CUR_DIRECTION); +extern __inline unsigned long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttss_u64 (__m128 __A) +{ + return (unsigned long long) __builtin_ia32_vcvttss2usi64 ((__v4sf) + __A, + _MM_FROUND_CUR_DIRECTION); } - -__funline long long _mm_cvttss_i64(__m128 __A) { - return (long long)__builtin_ia32_vcvttss2si64((__v4sf)__A, - _MM_FROUND_CUR_DIRECTION); +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttss_i64 (__m128 __A) +{ + return (long long) __builtin_ia32_vcvttss2si64 ((__v4sf) __A, + _MM_FROUND_CUR_DIRECTION); } -#endif /* __x86_64__ */ - -__funline unsigned _mm_cvtss_u32(__m128 __A) { - return (unsigned)__builtin_ia32_vcvtss2usi32((__v4sf)__A, - _MM_FROUND_CUR_DIRECTION); +#endif +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtsi512_si32 (__m512i __A) +{ + __v16si __B = (__v16si) __A; + return __B[0]; } - -__funline unsigned _mm_cvttss_u32(__m128 __A) { - return (unsigned)__builtin_ia32_vcvttss2usi32((__v4sf)__A, - _MM_FROUND_CUR_DIRECTION); +extern __inline unsigned +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtss_u32 (__m128 __A) +{ + return (unsigned) __builtin_ia32_vcvtss2usi32 ((__v4sf) __A, + _MM_FROUND_CUR_DIRECTION); } - -__funline int _mm_cvttss_i32(__m128 __A) { - return (int)__builtin_ia32_vcvttss2si32((__v4sf)__A, - _MM_FROUND_CUR_DIRECTION); +extern __inline unsigned +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttss_u32 (__m128 __A) +{ + return (unsigned) __builtin_ia32_vcvttss2usi32 ((__v4sf) __A, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttss_i32 (__m128 __A) +{ + return (int) __builtin_ia32_vcvttss2si32 ((__v4sf) __A, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsd_i32 (__m128d __A) +{ + return (int) 
__builtin_ia32_cvtsd2si ((__v2df) __A); +} +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtss_i32 (__m128 __A) +{ + return (int) __builtin_ia32_cvtss2si ((__v4sf) __A); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvti32_sd (__m128d __A, int __B) +{ + return (__m128d) __builtin_ia32_cvtsi2sd ((__v2df) __A, __B); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvti32_ss (__m128 __A, int __B) +{ + return (__m128) __builtin_ia32_cvtsi2ss ((__v4sf) __A, __B); } - #ifdef __x86_64__ -__funline unsigned long long _mm_cvtsd_u64(__m128d __A) { - return (unsigned long long)__builtin_ia32_vcvtsd2usi64( - (__v2df)__A, _MM_FROUND_CUR_DIRECTION); +extern __inline unsigned long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsd_u64 (__m128d __A) +{ + return (unsigned long long) __builtin_ia32_vcvtsd2usi64 ((__v2df) + __A, + _MM_FROUND_CUR_DIRECTION); } - -__funline unsigned long long _mm_cvttsd_u64(__m128d __A) { - return (unsigned long long)__builtin_ia32_vcvttsd2usi64( - (__v2df)__A, _MM_FROUND_CUR_DIRECTION); +extern __inline unsigned long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttsd_u64 (__m128d __A) +{ + return (unsigned long long) __builtin_ia32_vcvttsd2usi64 ((__v2df) + __A, + _MM_FROUND_CUR_DIRECTION); } - -__funline long long _mm_cvttsd_i64(__m128d __A) { - return (long long)__builtin_ia32_vcvttsd2si64((__v2df)__A, - _MM_FROUND_CUR_DIRECTION); +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttsd_i64 (__m128d __A) +{ + return (long long) __builtin_ia32_vcvttsd2si64 ((__v2df) __A, + _MM_FROUND_CUR_DIRECTION); } -#endif /* __x86_64__ */ - -__funline unsigned _mm_cvtsd_u32(__m128d __A) { - return (unsigned)__builtin_ia32_vcvtsd2usi32((__v2df)__A, - _MM_FROUND_CUR_DIRECTION); +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsd_i64 (__m128d __A) +{ + return (long long) __builtin_ia32_cvtsd2si64 ((__v2df) __A); } - -__funline unsigned _mm_cvttsd_u32(__m128d __A) { - return (unsigned)__builtin_ia32_vcvttsd2usi32((__v2df)__A, - _MM_FROUND_CUR_DIRECTION); +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtss_i64 (__m128 __A) +{ + return (long long) __builtin_ia32_cvtss2si64 ((__v4sf) __A); } - -__funline int _mm_cvttsd_i32(__m128d __A) { - return (int)__builtin_ia32_vcvttsd2si32((__v2df)__A, - _MM_FROUND_CUR_DIRECTION); +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvti64_sd (__m128d __A, long long __B) +{ + return (__m128d) __builtin_ia32_cvtsi642sd ((__v2df) __A, __B); } - -__funline __m512d _mm512_cvtps_pd(__m256 __A) { - return (__m512d)__builtin_ia32_cvtps2pd512_mask( - (__v8sf)__A, (__v8df)_mm512_undefined_pd(), (__mmask8)-1, - _MM_FROUND_CUR_DIRECTION); +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvti64_ss (__m128 __A, long long __B) +{ + return (__m128) __builtin_ia32_cvtsi642ss ((__v4sf) __A, __B); } - -__funline __m512d _mm512_mask_cvtps_pd(__m512d __W, __mmask8 __U, __m256 __A) { - return (__m512d)__builtin_ia32_cvtps2pd512_mask( - (__v8sf)__A, (__v8df)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512d _mm512_maskz_cvtps_pd(__mmask8 __U, __m256 __A) { - return 
(__m512d)__builtin_ia32_cvtps2pd512_mask( - (__v8sf)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512 _mm512_cvtph_ps(__m256i __A) { - return (__m512)__builtin_ia32_vcvtph2ps512_mask( - (__v16hi)__A, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512 _mm512_mask_cvtph_ps(__m512 __W, __mmask16 __U, __m256i __A) { - return (__m512)__builtin_ia32_vcvtph2ps512_mask( - (__v16hi)__A, (__v16sf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512 _mm512_maskz_cvtph_ps(__mmask16 __U, __m256i __A) { - return (__m512)__builtin_ia32_vcvtph2ps512_mask( - (__v16hi)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m256 _mm512_cvtpd_ps(__m512d __A) { - return (__m256)__builtin_ia32_cvtpd2ps512_mask( - (__v8df)__A, (__v8sf)_mm256_undefined_ps(), (__mmask8)-1, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m256 _mm512_mask_cvtpd_ps(__m256 __W, __mmask8 __U, __m512d __A) { - return (__m256)__builtin_ia32_cvtpd2ps512_mask( - (__v8df)__A, (__v8sf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); -} - -__funline __m256 _mm512_maskz_cvtpd_ps(__mmask8 __U, __m512d __A) { - return (__m256)__builtin_ia32_cvtpd2ps512_mask( - (__v8df)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#ifdef __OPTIMIZE__ -__funline __m512 _mm512_getexp_ps(__m512 __A) { - return (__m512)__builtin_ia32_getexpps512_mask( - (__v16sf)__A, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512 _mm512_mask_getexp_ps(__m512 __W, __mmask16 __U, __m512 __A) { - return (__m512)__builtin_ia32_getexpps512_mask( - (__v16sf)__A, (__v16sf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512 _mm512_maskz_getexp_ps(__mmask16 __U, __m512 __A) { - return (__m512)__builtin_ia32_getexpps512_mask( - (__v16sf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512d _mm512_getexp_pd(__m512d __A) { - return (__m512d)__builtin_ia32_getexppd512_mask( - (__v8df)__A, (__v8df)_mm512_undefined_pd(), (__mmask8)-1, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512d _mm512_mask_getexp_pd(__m512d __W, __mmask8 __U, __m512d __A) { - return (__m512d)__builtin_ia32_getexppd512_mask( - (__v8df)__A, (__v8df)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512d _mm512_maskz_getexp_pd(__mmask8 __U, __m512d __A) { - return (__m512d)__builtin_ia32_getexppd512_mask( - (__v8df)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m128 _mm_getexp_ss(__m128 __A, __m128 __B) { - return (__m128)__builtin_ia32_getexpss128_round((__v4sf)__A, (__v4sf)__B, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m128 _mm_mask_getexp_ss(__m128 __W, __mmask8 __U, __m128 __A, - __m128 __B) { - return (__m128)__builtin_ia32_getexpss_mask_round((__v4sf)__A, (__v4sf)__B, - (__v4sf)__W, (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m128 _mm_maskz_getexp_ss(__mmask8 __U, __m128 __A, __m128 __B) { - return (__m128)__builtin_ia32_getexpss_mask_round( - (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m128d _mm_getexp_sd(__m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_getexpsd128_round((__v2df)__A, (__v2df)__B, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m128d _mm_mask_getexp_sd(__m128d __W, __mmask8 __U, __m128d __A, - __m128d __B) { - return 
(__m128d)__builtin_ia32_getexpsd_mask_round((__v2df)__A, (__v2df)__B, - (__v2df)__W, (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m128d _mm_maskz_getexp_sd(__mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_getexpsd_mask_round( - (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512d _mm512_getmant_pd(__m512d __A, _MM_MANTISSA_NORM_ENUM __B, - _MM_MANTISSA_SIGN_ENUM __C) { - return (__m512d)__builtin_ia32_getmantpd512_mask( - (__v8df)__A, (__C << 2) | __B, _mm512_undefined_pd(), (__mmask8)-1, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512d _mm512_mask_getmant_pd(__m512d __W, __mmask8 __U, __m512d __A, - _MM_MANTISSA_NORM_ENUM __B, - _MM_MANTISSA_SIGN_ENUM __C) { - return (__m512d)__builtin_ia32_getmantpd512_mask( - (__v8df)__A, (__C << 2) | __B, (__v8df)__W, __U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512d _mm512_maskz_getmant_pd(__mmask8 __U, __m512d __A, - _MM_MANTISSA_NORM_ENUM __B, - _MM_MANTISSA_SIGN_ENUM __C) { - return (__m512d)__builtin_ia32_getmantpd512_mask( - (__v8df)__A, (__C << 2) | __B, (__v8df)_mm512_setzero_pd(), __U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512 _mm512_getmant_ps(__m512 __A, _MM_MANTISSA_NORM_ENUM __B, - _MM_MANTISSA_SIGN_ENUM __C) { - return (__m512)__builtin_ia32_getmantps512_mask( - (__v16sf)__A, (__C << 2) | __B, _mm512_undefined_ps(), (__mmask16)-1, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512 _mm512_mask_getmant_ps(__m512 __W, __mmask16 __U, __m512 __A, - _MM_MANTISSA_NORM_ENUM __B, - _MM_MANTISSA_SIGN_ENUM __C) { - return (__m512)__builtin_ia32_getmantps512_mask( - (__v16sf)__A, (__C << 2) | __B, (__v16sf)__W, __U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512 _mm512_maskz_getmant_ps(__mmask16 __U, __m512 __A, - _MM_MANTISSA_NORM_ENUM __B, - _MM_MANTISSA_SIGN_ENUM __C) { - return (__m512)__builtin_ia32_getmantps512_mask( - (__v16sf)__A, (__C << 2) | __B, (__v16sf)_mm512_setzero_ps(), __U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m128d _mm_getmant_sd(__m128d __A, __m128d __B, - _MM_MANTISSA_NORM_ENUM __C, - _MM_MANTISSA_SIGN_ENUM __D) { - return (__m128d)__builtin_ia32_getmantsd_round( - (__v2df)__A, (__v2df)__B, (__D << 2) | __C, _MM_FROUND_CUR_DIRECTION); -} - -__funline __m128d _mm_mask_getmant_sd(__m128d __W, __mmask8 __U, __m128d __A, - __m128d __B, _MM_MANTISSA_NORM_ENUM __C, - _MM_MANTISSA_SIGN_ENUM __D) { - return (__m128d)__builtin_ia32_getmantsd_mask_round( - (__v2df)__A, (__v2df)__B, (__D << 2) | __C, (__v2df)__W, __U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m128d _mm_maskz_getmant_sd(__mmask8 __U, __m128d __A, __m128d __B, - _MM_MANTISSA_NORM_ENUM __C, - _MM_MANTISSA_SIGN_ENUM __D) { - return (__m128d)__builtin_ia32_getmantsd_mask_round( - (__v2df)__A, (__v2df)__B, (__D << 2) | __C, (__v2df)_mm_setzero_pd(), __U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m128 _mm_getmant_ss(__m128 __A, __m128 __B, - _MM_MANTISSA_NORM_ENUM __C, - _MM_MANTISSA_SIGN_ENUM __D) { - return (__m128)__builtin_ia32_getmantss_round( - (__v4sf)__A, (__v4sf)__B, (__D << 2) | __C, _MM_FROUND_CUR_DIRECTION); -} - -__funline __m128 _mm_mask_getmant_ss(__m128 __W, __mmask8 __U, __m128 __A, - __m128 __B, _MM_MANTISSA_NORM_ENUM __C, - _MM_MANTISSA_SIGN_ENUM __D) { - return (__m128)__builtin_ia32_getmantss_mask_round( - (__v4sf)__A, (__v4sf)__B, (__D << 2) | __C, (__v4sf)__W, __U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __m128 _mm_maskz_getmant_ss(__mmask8 __U, __m128 __A, __m128 __B, - _MM_MANTISSA_NORM_ENUM __C, 
- _MM_MANTISSA_SIGN_ENUM __D) { - return (__m128)__builtin_ia32_getmantss_mask_round( - (__v4sf)__A, (__v4sf)__B, (__D << 2) | __C, (__v4sf)_mm_setzero_ps(), __U, - _MM_FROUND_CUR_DIRECTION); -} - -#else -#define _mm512_getmant_pd(X, B, C) \ - ((__m512d)__builtin_ia32_getmantpd512_mask( \ - (__v8df)(__m512d)(X), (int)(((C) << 2) | (B)), \ - (__v8df)_mm512_undefined_pd(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION)) - -#define _mm512_mask_getmant_pd(W, U, X, B, C) \ - ((__m512d)__builtin_ia32_getmantpd512_mask( \ - (__v8df)(__m512d)(X), (int)(((C) << 2) | (B)), (__v8df)(__m512d)(W), \ - (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) - -#define _mm512_maskz_getmant_pd(U, X, B, C) \ - ((__m512d)__builtin_ia32_getmantpd512_mask( \ - (__v8df)(__m512d)(X), (int)(((C) << 2) | (B)), \ - (__v8df)_mm512_setzero_pd(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) -#define _mm512_getmant_ps(X, B, C) \ - ((__m512)__builtin_ia32_getmantps512_mask( \ - (__v16sf)(__m512)(X), (int)(((C) << 2) | (B)), \ - (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm512_mask_getmant_ps(W, U, X, B, C) \ - ((__m512)__builtin_ia32_getmantps512_mask( \ - (__v16sf)(__m512)(X), (int)(((C) << 2) | (B)), (__v16sf)(__m512)(W), \ - (__mmask16)(U), _MM_FROUND_CUR_DIRECTION)) - -#define _mm512_maskz_getmant_ps(U, X, B, C) \ - ((__m512)__builtin_ia32_getmantps512_mask( \ - (__v16sf)(__m512)(X), (int)(((C) << 2) | (B)), \ - (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), _MM_FROUND_CUR_DIRECTION)) -#define _mm_getmant_sd(X, Y, C, D) \ - ((__m128d)__builtin_ia32_getmantsd_round( \ - (__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (int)(((D) << 2) | (C)), \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm_mask_getmant_sd(W, U, X, Y, C, D) \ - ((__m128d)__builtin_ia32_getmantsd_mask_round( \ - (__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (int)(((D) << 2) | (C)), \ - (__v2df)(__m128d)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) - -#define _mm_maskz_getmant_sd(U, X, Y, C, D) \ - ((__m128d)__builtin_ia32_getmantsd_mask_round( \ - (__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (int)(((D) << 2) | (C)), \ - (__v2df)_mm_setzero_pd(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) - -#define _mm_getmant_ss(X, Y, C, D) \ - ((__m128)__builtin_ia32_getmantss_round( \ - (__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (int)(((D) << 2) | (C)), \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm_mask_getmant_ss(W, U, X, Y, C, D) \ - ((__m128)__builtin_ia32_getmantss_mask_round( \ - (__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (int)(((D) << 2) | (C)), \ - (__v4sf)(__m128)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) - -#define _mm_maskz_getmant_ss(U, X, Y, C, D) \ - ((__m128)__builtin_ia32_getmantss_mask_round( \ - (__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (int)(((D) << 2) | (C)), \ - (__v4sf)_mm_setzero_ps(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) - -#define _mm_getexp_ss(A, B) \ - ((__m128)__builtin_ia32_getexpss128_round( \ - (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), _MM_FROUND_CUR_DIRECTION)) - -#define _mm_mask_getexp_ss(W, U, A, B) \ - (__m128) \ - __builtin_ia32_getexpss_mask_round(A, B, W, U, _MM_FROUND_CUR_DIRECTION) - -#define _mm_maskz_getexp_ss(U, A, B) \ - (__m128) __builtin_ia32_getexpss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), \ - U, _MM_FROUND_CUR_DIRECTION) - -#define _mm_getexp_sd(A, B) \ - ((__m128d)__builtin_ia32_getexpsd128_round( \ - (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), _MM_FROUND_CUR_DIRECTION)) - -#define _mm_mask_getexp_sd(W, U, A, B) \ - (__m128d) \ - __builtin_ia32_getexpsd_mask_round(A, B, W, U, 
_MM_FROUND_CUR_DIRECTION) - -#define _mm_maskz_getexp_sd(U, A, B) \ - (__m128d) __builtin_ia32_getexpsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), \ - U, _MM_FROUND_CUR_DIRECTION) - -#define _mm512_getexp_ps(A) \ - ((__m512)__builtin_ia32_getexpps512_mask( \ - (__v16sf)(__m512)(A), (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm512_mask_getexp_ps(W, U, A) \ - ((__m512)__builtin_ia32_getexpps512_mask( \ - (__v16sf)(__m512)(A), (__v16sf)(__m512)(W), (__mmask16)(U), \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm512_maskz_getexp_ps(U, A) \ - ((__m512)__builtin_ia32_getexpps512_mask( \ - (__v16sf)(__m512)(A), (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm512_getexp_pd(A) \ - ((__m512d)__builtin_ia32_getexppd512_mask( \ - (__v8df)(__m512d)(A), (__v8df)_mm512_undefined_pd(), (__mmask8)-1, \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm512_mask_getexp_pd(W, U, A) \ - ((__m512d)__builtin_ia32_getexppd512_mask( \ - (__v8df)(__m512d)(A), (__v8df)(__m512d)(W), (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm512_maskz_getexp_pd(U, A) \ - ((__m512d)__builtin_ia32_getexppd512_mask( \ - (__v8df)(__m512d)(A), (__v8df)_mm512_setzero_pd(), (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION)) #endif - +extern __inline unsigned +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsd_u32 (__m128d __A) +{ + return (unsigned) __builtin_ia32_vcvtsd2usi32 ((__v2df) __A, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline unsigned +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttsd_u32 (__m128d __A) +{ + return (unsigned) __builtin_ia32_vcvttsd2usi32 ((__v2df) __A, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttsd_i32 (__m128d __A) +{ + return (int) __builtin_ia32_vcvttsd2si32 ((__v2df) __A, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtps_pd (__m256 __A) +{ + return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtps_pd (__m512d __W, __mmask8 __U, __m256 __A) +{ + return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A, + (__v8df) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtps_pd (__mmask8 __U, __m256 __A) +{ + return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtph_ps (__m256i __A) +{ + return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtph_ps (__m512 __W, __mmask16 __U, __m256i __A) +{ + return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A, + (__v16sf) __W, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtph_ps (__mmask16 __U, __m256i __A) +{ + return 
(__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtpd_ps (__m512d __A) +{ + return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A, + (__v8sf) + _mm256_undefined_ps (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtpd_ps (__m256 __W, __mmask8 __U, __m512d __A) +{ + return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A, + (__v8sf) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtpd_ps (__mmask8 __U, __m512d __A) +{ + return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} #ifdef __OPTIMIZE__ -__funline __m512 _mm512_roundscale_ps(__m512 __A, const int __imm) { - return (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)__A, __imm, - (__v16sf)_mm512_undefined_ps(), - -1, _MM_FROUND_CUR_DIRECTION); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_getexp_ps (__m512 __A) +{ + return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); } - -__funline __m512 _mm512_mask_roundscale_ps(__m512 __A, __mmask16 __B, __m512 __C, - const int __imm) { - return (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)__C, __imm, - (__v16sf)__A, (__mmask16)__B, - _MM_FROUND_CUR_DIRECTION); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_getexp_ps (__m512 __W, __mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); } - -__funline __m512 _mm512_maskz_roundscale_ps(__mmask16 __A, __m512 __B, - const int __imm) { - return (__m512)__builtin_ia32_rndscaleps_mask( - (__v16sf)__B, __imm, (__v16sf)_mm512_setzero_ps(), (__mmask16)__A, +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_getexp_ps (__mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_getexp_pd (__m512d __A) +{ + return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_getexp_pd (__m512d __W, __mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A, + (__v8df) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_getexp_pd (__mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_getexp_ss (__m128 __A, 
__m128 __B) +{ + return (__m128) __builtin_ia32_getexpss128_round ((__v4sf) __A, + (__v4sf) __B, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_getexp_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_getexpss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U, _MM_FROUND_CUR_DIRECTION); } - -__funline __m512d _mm512_roundscale_pd(__m512d __A, const int __imm) { - return (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)__A, __imm, - (__v8df)_mm512_undefined_pd(), - -1, _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512d _mm512_mask_roundscale_pd(__m512d __A, __mmask8 __B, - __m512d __C, const int __imm) { - return (__m512d)__builtin_ia32_rndscalepd_mask( - (__v8df)__C, __imm, (__v8df)__A, (__mmask8)__B, _MM_FROUND_CUR_DIRECTION); -} - -__funline __m512d _mm512_maskz_roundscale_pd(__mmask8 __A, __m512d __B, - const int __imm) { - return (__m512d)__builtin_ia32_rndscalepd_mask( - (__v8df)__B, __imm, (__v8df)_mm512_setzero_pd(), (__mmask8)__A, +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_getexp_ss (__mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_getexpss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U, _MM_FROUND_CUR_DIRECTION); } - -__funline __m128 _mm_roundscale_ss(__m128 __A, __m128 __B, const int __imm) { - return (__m128)__builtin_ia32_rndscaless_round( - (__v4sf)__A, (__v4sf)__B, __imm, _MM_FROUND_CUR_DIRECTION); +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_getexp_sd (__m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_getexpsd128_round ((__v2df) __A, + (__v2df) __B, + _MM_FROUND_CUR_DIRECTION); } - -__funline __m128d _mm_roundscale_sd(__m128d __A, __m128d __B, const int __imm) { - return (__m128d)__builtin_ia32_rndscalesd_round( - (__v2df)__A, (__v2df)__B, __imm, _MM_FROUND_CUR_DIRECTION); +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_getexp_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_getexpsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_getexp_sd (__mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_getexpsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_getmant_pd (__m512d __A, _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m512d) __builtin_ia32_getmantpd512_mask ((__v8df) __A, + (__C << 2) | __B, + _mm512_undefined_pd (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_getmant_pd (__m512d __W, __mmask8 __U, __m512d __A, + _MM_MANTISSA_NORM_ENUM __B, _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m512d) __builtin_ia32_getmantpd512_mask ((__v8df) __A, + (__C << 2) | __B, + (__v8df) __W, __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_getmant_pd (__mmask8 
__U, __m512d __A, + _MM_MANTISSA_NORM_ENUM __B, _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m512d) __builtin_ia32_getmantpd512_mask ((__v8df) __A, + (__C << 2) | __B, + (__v8df) + _mm512_setzero_pd (), + __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_getmant_ps (__m512 __A, _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m512) __builtin_ia32_getmantps512_mask ((__v16sf) __A, + (__C << 2) | __B, + _mm512_undefined_ps (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_getmant_ps (__m512 __W, __mmask16 __U, __m512 __A, + _MM_MANTISSA_NORM_ENUM __B, _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m512) __builtin_ia32_getmantps512_mask ((__v16sf) __A, + (__C << 2) | __B, + (__v16sf) __W, __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_getmant_ps (__mmask16 __U, __m512 __A, + _MM_MANTISSA_NORM_ENUM __B, _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m512) __builtin_ia32_getmantps512_mask ((__v16sf) __A, + (__C << 2) | __B, + (__v16sf) + _mm512_setzero_ps (), + __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_getmant_sd (__m128d __A, __m128d __B, _MM_MANTISSA_NORM_ENUM __C, + _MM_MANTISSA_SIGN_ENUM __D) +{ + return (__m128d) __builtin_ia32_getmantsd_round ((__v2df) __A, + (__v2df) __B, + (__D << 2) | __C, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_getmant_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B, + _MM_MANTISSA_NORM_ENUM __C, _MM_MANTISSA_SIGN_ENUM __D) +{ + return (__m128d) __builtin_ia32_getmantsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__D << 2) | __C, + (__v2df) __W, + __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_getmant_sd (__mmask8 __U, __m128d __A, __m128d __B, + _MM_MANTISSA_NORM_ENUM __C, _MM_MANTISSA_SIGN_ENUM __D) +{ + return (__m128d) __builtin_ia32_getmantsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__D << 2) | __C, + (__v2df) + _mm_setzero_pd(), + __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_getmant_ss (__m128 __A, __m128 __B, _MM_MANTISSA_NORM_ENUM __C, + _MM_MANTISSA_SIGN_ENUM __D) +{ + return (__m128) __builtin_ia32_getmantss_round ((__v4sf) __A, + (__v4sf) __B, + (__D << 2) | __C, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_getmant_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B, + _MM_MANTISSA_NORM_ENUM __C, _MM_MANTISSA_SIGN_ENUM __D) +{ + return (__m128) __builtin_ia32_getmantss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__D << 2) | __C, + (__v4sf) __W, + __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_getmant_ss (__mmask8 __U, __m128 __A, __m128 __B, + _MM_MANTISSA_NORM_ENUM __C, _MM_MANTISSA_SIGN_ENUM __D) +{ + return (__m128) __builtin_ia32_getmantss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__D << 2) | __C, + (__v4sf) + _mm_setzero_ps(), + __U, + _MM_FROUND_CUR_DIRECTION); } - #else -#define 
_mm512_roundscale_ps(A, B) \ - ((__m512)__builtin_ia32_rndscaleps_mask( \ - (__v16sf)(__m512)(A), (int)(B), (__v16sf)_mm512_undefined_ps(), \ - (__mmask16)(-1), _MM_FROUND_CUR_DIRECTION)) -#define _mm512_mask_roundscale_ps(A, B, C, D) \ - ((__m512)__builtin_ia32_rndscaleps_mask( \ - (__v16sf)(__m512)(C), (int)(D), (__v16sf)(__m512)(A), (__mmask16)(B), \ - _MM_FROUND_CUR_DIRECTION)) -#define _mm512_maskz_roundscale_ps(A, B, C) \ - ((__m512)__builtin_ia32_rndscaleps_mask( \ - (__v16sf)(__m512)(B), (int)(C), (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(A), _MM_FROUND_CUR_DIRECTION)) -#define _mm512_roundscale_pd(A, B) \ - ((__m512d)__builtin_ia32_rndscalepd_mask( \ - (__v8df)(__m512d)(A), (int)(B), (__v8df)_mm512_undefined_pd(), \ - (__mmask8)(-1), _MM_FROUND_CUR_DIRECTION)) -#define _mm512_mask_roundscale_pd(A, B, C, D) \ - ((__m512d)__builtin_ia32_rndscalepd_mask( \ - (__v8df)(__m512d)(C), (int)(D), (__v8df)(__m512d)(A), (__mmask8)(B), \ - _MM_FROUND_CUR_DIRECTION)) -#define _mm512_maskz_roundscale_pd(A, B, C) \ - ((__m512d)__builtin_ia32_rndscalepd_mask( \ - (__v8df)(__m512d)(B), (int)(C), (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(A), _MM_FROUND_CUR_DIRECTION)) -#define _mm_roundscale_ss(A, B, C) \ - ((__m128)__builtin_ia32_rndscaless_round((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), (int)(C), \ - _MM_FROUND_CUR_DIRECTION)) -#define _mm_roundscale_sd(A, B, C) \ - ((__m128d)__builtin_ia32_rndscalesd_round((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), (int)(C), \ - _MM_FROUND_CUR_DIRECTION)) +#define _mm512_getmant_pd(X, B, C) ((__m512d)__builtin_ia32_getmantpd512_mask ((__v8df)(__m512d)(X), (int)(((C)<<2) | (B)), (__v8df)_mm512_undefined_pd(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION)) +#define _mm512_mask_getmant_pd(W, U, X, B, C) ((__m512d)__builtin_ia32_getmantpd512_mask ((__v8df)(__m512d)(X), (int)(((C)<<2) | (B)), (__v8df)(__m512d)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) +#define _mm512_maskz_getmant_pd(U, X, B, C) ((__m512d)__builtin_ia32_getmantpd512_mask ((__v8df)(__m512d)(X), (int)(((C)<<2) | (B)), (__v8df)_mm512_setzero_pd(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) +#define _mm512_getmant_ps(X, B, C) ((__m512)__builtin_ia32_getmantps512_mask ((__v16sf)(__m512)(X), (int)(((C)<<2) | (B)), (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, _MM_FROUND_CUR_DIRECTION)) +#define _mm512_mask_getmant_ps(W, U, X, B, C) ((__m512)__builtin_ia32_getmantps512_mask ((__v16sf)(__m512)(X), (int)(((C)<<2) | (B)), (__v16sf)(__m512)(W), (__mmask16)(U), _MM_FROUND_CUR_DIRECTION)) +#define _mm512_maskz_getmant_ps(U, X, B, C) ((__m512)__builtin_ia32_getmantps512_mask ((__v16sf)(__m512)(X), (int)(((C)<<2) | (B)), (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), _MM_FROUND_CUR_DIRECTION)) +#define _mm_getmant_sd(X, Y, C, D) ((__m128d)__builtin_ia32_getmantsd_round ((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (int)(((D)<<2) | (C)), _MM_FROUND_CUR_DIRECTION)) +#define _mm_mask_getmant_sd(W, U, X, Y, C, D) ((__m128d)__builtin_ia32_getmantsd_mask_round ((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (int)(((D)<<2) | (C)), (__v2df)(__m128d)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) +#define _mm_maskz_getmant_sd(U, X, Y, C, D) ((__m128d)__builtin_ia32_getmantsd_mask_round ((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (int)(((D)<<2) | (C)), (__v2df)_mm_setzero_pd(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) +#define _mm_getmant_ss(X, Y, C, D) ((__m128)__builtin_ia32_getmantss_round ((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (int)(((D)<<2) | (C)), _MM_FROUND_CUR_DIRECTION)) +#define _mm_mask_getmant_ss(W, 
U, X, Y, C, D) ((__m128)__builtin_ia32_getmantss_mask_round ((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (int)(((D)<<2) | (C)), (__v4sf)(__m128)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) +#define _mm_maskz_getmant_ss(U, X, Y, C, D) ((__m128)__builtin_ia32_getmantss_mask_round ((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (int)(((D)<<2) | (C)), (__v4sf)_mm_setzero_ps(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) +#define _mm_getexp_ss(A, B) ((__m128)__builtin_ia32_getexpss128_round((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), _MM_FROUND_CUR_DIRECTION)) +#define _mm_mask_getexp_ss(W, U, A, B) (__m128)__builtin_ia32_getexpss_mask_round(A, B, W, U, _MM_FROUND_CUR_DIRECTION) +#define _mm_maskz_getexp_ss(U, A, B) (__m128)__builtin_ia32_getexpss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, _MM_FROUND_CUR_DIRECTION) +#define _mm_getexp_sd(A, B) ((__m128d)__builtin_ia32_getexpsd128_round((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), _MM_FROUND_CUR_DIRECTION)) +#define _mm_mask_getexp_sd(W, U, A, B) (__m128d)__builtin_ia32_getexpsd_mask_round(A, B, W, U, _MM_FROUND_CUR_DIRECTION) +#define _mm_maskz_getexp_sd(U, A, B) (__m128d)__builtin_ia32_getexpsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, _MM_FROUND_CUR_DIRECTION) +#define _mm512_getexp_ps(A) ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, _MM_FROUND_CUR_DIRECTION)) +#define _mm512_mask_getexp_ps(W, U, A) ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), (__v16sf)(__m512)(W), (__mmask16)(U), _MM_FROUND_CUR_DIRECTION)) +#define _mm512_maskz_getexp_ps(U, A) ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), _MM_FROUND_CUR_DIRECTION)) +#define _mm512_getexp_pd(A) ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), (__v8df)_mm512_undefined_pd(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION)) +#define _mm512_mask_getexp_pd(W, U, A) ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), (__v8df)(__m512d)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) +#define _mm512_maskz_getexp_pd(U, A) ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), (__v8df)_mm512_setzero_pd(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) #endif - #ifdef __OPTIMIZE__ -__funline __mmask8 _mm512_cmp_pd_mask(__m512d __X, __m512d __Y, const int __P) { - return (__mmask8)__builtin_ia32_cmppd512_mask( - (__v8df)__X, (__v8df)__Y, __P, (__mmask8)-1, _MM_FROUND_CUR_DIRECTION); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_roundscale_ps (__m512 __A, const int __imm) +{ + return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, __imm, + (__v16sf) + _mm512_undefined_ps (), + -1, + _MM_FROUND_CUR_DIRECTION); } - -__funline __mmask16 _mm512_cmp_ps_mask(__m512 __X, __m512 __Y, const int __P) { - return (__mmask16)__builtin_ia32_cmpps512_mask( - (__v16sf)__X, (__v16sf)__Y, __P, (__mmask16)-1, _MM_FROUND_CUR_DIRECTION); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_roundscale_ps (__m512 __A, __mmask16 __B, __m512 __C, + const int __imm) +{ + return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __C, __imm, + (__v16sf) __A, + (__mmask16) __B, + _MM_FROUND_CUR_DIRECTION); } - -__funline __mmask16 _mm512_mask_cmp_ps_mask(__mmask16 __U, __m512 __X, __m512 __Y, - const int __P) { - return (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)__X, (__v16sf)__Y, - __P, (__mmask16)__U, - _MM_FROUND_CUR_DIRECTION); +extern __inline 
__m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_roundscale_ps (__mmask16 __A, __m512 __B, const int __imm) +{ + return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __B, + __imm, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __A, + _MM_FROUND_CUR_DIRECTION); } - -__funline __mmask8 _mm512_mask_cmp_pd_mask(__mmask8 __U, __m512d __X, __m512d __Y, - const int __P) { - return (__mmask8)__builtin_ia32_cmppd512_mask( - (__v8df)__X, (__v8df)__Y, __P, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_roundscale_pd (__m512d __A, const int __imm) +{ + return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, __imm, + (__v8df) + _mm512_undefined_pd (), + -1, + _MM_FROUND_CUR_DIRECTION); } - -__funline __mmask8 _mm512_cmpeq_pd_mask(__m512d __X, __m512d __Y) { - return (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)__X, (__v8df)__Y, - _CMP_EQ_OQ, (__mmask8)-1, - _MM_FROUND_CUR_DIRECTION); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_roundscale_pd (__m512d __A, __mmask8 __B, __m512d __C, + const int __imm) +{ + return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __C, __imm, + (__v8df) __A, + (__mmask8) __B, + _MM_FROUND_CUR_DIRECTION); } - -__funline __mmask8 _mm512_mask_cmpeq_pd_mask(__mmask8 __U, __m512d __X, - __m512d __Y) { - return (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)__X, (__v8df)__Y, - _CMP_EQ_OQ, (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_roundscale_pd (__mmask8 __A, __m512d __B, const int __imm) +{ + return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __B, + __imm, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __A, + _MM_FROUND_CUR_DIRECTION); } - -__funline __mmask8 _mm512_cmplt_pd_mask(__m512d __X, __m512d __Y) { - return (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)__X, (__v8df)__Y, - _CMP_LT_OS, (__mmask8)-1, - _MM_FROUND_CUR_DIRECTION); +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_roundscale_ss (__m128 __A, __m128 __B, const int __imm) +{ + return (__m128) + __builtin_ia32_rndscaless_mask_round ((__v4sf) __A, + (__v4sf) __B, __imm, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); } - -__funline __mmask8 _mm512_mask_cmplt_pd_mask(__mmask8 __U, __m512d __X, - __m512d __Y) { - return (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)__X, (__v8df)__Y, - _CMP_LT_OS, (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_roundscale_ss (__m128 __A, __mmask8 __B, __m128 __C, __m128 __D, + const int __imm) +{ + return (__m128) + __builtin_ia32_rndscaless_mask_round ((__v4sf) __C, + (__v4sf) __D, __imm, + (__v4sf) __A, + (__mmask8) __B, + _MM_FROUND_CUR_DIRECTION); } - -__funline __mmask8 _mm512_cmple_pd_mask(__m512d __X, __m512d __Y) { - return (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)__X, (__v8df)__Y, - _CMP_LE_OS, (__mmask8)-1, - _MM_FROUND_CUR_DIRECTION); +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_roundscale_ss (__mmask8 __A, __m128 __B, __m128 __C, + const int __imm) +{ + return (__m128) + __builtin_ia32_rndscaless_mask_round ((__v4sf) __B, + (__v4sf) __C, __imm, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __A, + 
_MM_FROUND_CUR_DIRECTION); } - -__funline __mmask8 _mm512_mask_cmple_pd_mask(__mmask8 __U, __m512d __X, - __m512d __Y) { - return (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)__X, (__v8df)__Y, - _CMP_LE_OS, (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_roundscale_sd (__m128d __A, __m128d __B, const int __imm) +{ + return (__m128d) + __builtin_ia32_rndscalesd_mask_round ((__v2df) __A, + (__v2df) __B, __imm, + (__v2df) + _mm_setzero_pd (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); } - -__funline __mmask8 _mm512_cmpunord_pd_mask(__m512d __X, __m512d __Y) { - return (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)__X, (__v8df)__Y, - _CMP_UNORD_Q, (__mmask8)-1, - _MM_FROUND_CUR_DIRECTION); +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_roundscale_sd (__m128d __A, __mmask8 __B, __m128d __C, __m128d __D, + const int __imm) +{ + return (__m128d) + __builtin_ia32_rndscalesd_mask_round ((__v2df) __C, + (__v2df) __D, __imm, + (__v2df) __A, + (__mmask8) __B, + _MM_FROUND_CUR_DIRECTION); } - -__funline __mmask8 _mm512_mask_cmpunord_pd_mask(__mmask8 __U, __m512d __X, - __m512d __Y) { - return (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)__X, (__v8df)__Y, - _CMP_UNORD_Q, (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_roundscale_sd (__mmask8 __A, __m128d __B, __m128d __C, + const int __imm) +{ + return (__m128d) + __builtin_ia32_rndscalesd_mask_round ((__v2df) __B, + (__v2df) __C, __imm, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __A, + _MM_FROUND_CUR_DIRECTION); } - -__funline __mmask8 _mm512_cmpneq_pd_mask(__m512d __X, __m512d __Y) { - return (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)__X, (__v8df)__Y, - _CMP_NEQ_UQ, (__mmask8)-1, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __mmask8 _mm512_mask_cmpneq_pd_mask(__mmask8 __U, __m512d __X, - __m512d __Y) { - return (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)__X, (__v8df)__Y, - _CMP_NEQ_UQ, (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __mmask8 _mm512_cmpnlt_pd_mask(__m512d __X, __m512d __Y) { - return (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)__X, (__v8df)__Y, - _CMP_NLT_US, (__mmask8)-1, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __mmask8 _mm512_mask_cmpnlt_pd_mask(__mmask8 __U, __m512d __X, - __m512d __Y) { - return (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)__X, (__v8df)__Y, - _CMP_NLT_US, (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __mmask8 _mm512_cmpnle_pd_mask(__m512d __X, __m512d __Y) { - return (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)__X, (__v8df)__Y, - _CMP_NLE_US, (__mmask8)-1, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __mmask8 _mm512_mask_cmpnle_pd_mask(__mmask8 __U, __m512d __X, - __m512d __Y) { - return (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)__X, (__v8df)__Y, - _CMP_NLE_US, (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __mmask8 _mm512_cmpord_pd_mask(__m512d __X, __m512d __Y) { - return (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)__X, (__v8df)__Y, - _CMP_ORD_Q, (__mmask8)-1, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __mmask8 _mm512_mask_cmpord_pd_mask(__mmask8 __U, __m512d __X, - __m512d __Y) { - return (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)__X, (__v8df)__Y, - _CMP_ORD_Q, (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __mmask16 _mm512_cmpeq_ps_mask(__m512 __X, __m512 __Y) { - 
return (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)__X, (__v16sf)__Y, - _CMP_EQ_OQ, (__mmask16)-1, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __mmask16 _mm512_mask_cmpeq_ps_mask(__mmask16 __U, __m512 __X, - __m512 __Y) { - return (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)__X, (__v16sf)__Y, - _CMP_EQ_OQ, (__mmask16)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __mmask16 _mm512_cmplt_ps_mask(__m512 __X, __m512 __Y) { - return (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)__X, (__v16sf)__Y, - _CMP_LT_OS, (__mmask16)-1, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __mmask16 _mm512_mask_cmplt_ps_mask(__mmask16 __U, __m512 __X, - __m512 __Y) { - return (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)__X, (__v16sf)__Y, - _CMP_LT_OS, (__mmask16)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __mmask16 _mm512_cmple_ps_mask(__m512 __X, __m512 __Y) { - return (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)__X, (__v16sf)__Y, - _CMP_LE_OS, (__mmask16)-1, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __mmask16 _mm512_mask_cmple_ps_mask(__mmask16 __U, __m512 __X, - __m512 __Y) { - return (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)__X, (__v16sf)__Y, - _CMP_LE_OS, (__mmask16)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __mmask16 _mm512_cmpunord_ps_mask(__m512 __X, __m512 __Y) { - return (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)__X, (__v16sf)__Y, - _CMP_UNORD_Q, (__mmask16)-1, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __mmask16 _mm512_mask_cmpunord_ps_mask(__mmask16 __U, __m512 __X, - __m512 __Y) { - return (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)__X, (__v16sf)__Y, - _CMP_UNORD_Q, (__mmask16)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __mmask16 _mm512_cmpneq_ps_mask(__m512 __X, __m512 __Y) { - return (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)__X, (__v16sf)__Y, - _CMP_NEQ_UQ, (__mmask16)-1, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __mmask16 _mm512_mask_cmpneq_ps_mask(__mmask16 __U, __m512 __X, - __m512 __Y) { - return (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)__X, (__v16sf)__Y, - _CMP_NEQ_UQ, (__mmask16)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __mmask16 _mm512_cmpnlt_ps_mask(__m512 __X, __m512 __Y) { - return (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)__X, (__v16sf)__Y, - _CMP_NLT_US, (__mmask16)-1, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __mmask16 _mm512_mask_cmpnlt_ps_mask(__mmask16 __U, __m512 __X, - __m512 __Y) { - return (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)__X, (__v16sf)__Y, - _CMP_NLT_US, (__mmask16)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __mmask16 _mm512_cmpnle_ps_mask(__m512 __X, __m512 __Y) { - return (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)__X, (__v16sf)__Y, - _CMP_NLE_US, (__mmask16)-1, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __mmask16 _mm512_mask_cmpnle_ps_mask(__mmask16 __U, __m512 __X, - __m512 __Y) { - return (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)__X, (__v16sf)__Y, - _CMP_NLE_US, (__mmask16)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __mmask16 _mm512_cmpord_ps_mask(__m512 __X, __m512 __Y) { - return (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)__X, (__v16sf)__Y, - _CMP_ORD_Q, (__mmask16)-1, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __mmask16 _mm512_mask_cmpord_ps_mask(__mmask16 __U, __m512 __X, - __m512 __Y) { - return (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)__X, (__v16sf)__Y, - _CMP_ORD_Q, (__mmask16)__U, - _MM_FROUND_CUR_DIRECTION); -} - -__funline __mmask8 _mm_cmp_sd_mask(__m128d __X, __m128d __Y, const int __P) { - return 
(__mmask8)__builtin_ia32_cmpsd_mask( - (__v2df)__X, (__v2df)__Y, __P, (__mmask8)-1, _MM_FROUND_CUR_DIRECTION); -} - -__funline __mmask8 _mm_mask_cmp_sd_mask(__mmask8 __M, __m128d __X, __m128d __Y, - const int __P) { - return (__mmask8)__builtin_ia32_cmpsd_mask( - (__v2df)__X, (__v2df)__Y, __P, (__mmask8)__M, _MM_FROUND_CUR_DIRECTION); -} - -__funline __mmask8 _mm_cmp_ss_mask(__m128 __X, __m128 __Y, const int __P) { - return (__mmask8)__builtin_ia32_cmpss_mask( - (__v4sf)__X, (__v4sf)__Y, __P, (__mmask8)-1, _MM_FROUND_CUR_DIRECTION); -} - -__funline __mmask8 _mm_mask_cmp_ss_mask(__mmask8 __M, __m128 __X, __m128 __Y, - const int __P) { - return (__mmask8)__builtin_ia32_cmpss_mask( - (__v4sf)__X, (__v4sf)__Y, __P, (__mmask8)__M, _MM_FROUND_CUR_DIRECTION); -} - #else -#define _mm512_cmp_pd_mask(X, Y, P) \ - ((__mmask8)__builtin_ia32_cmppd512_mask( \ - (__v8df)(__m512d)(X), (__v8df)(__m512d)(Y), (int)(P), (__mmask8)-1, \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm512_cmp_ps_mask(X, Y, P) \ - ((__mmask16)__builtin_ia32_cmpps512_mask( \ - (__v16sf)(__m512)(X), (__v16sf)(__m512)(Y), (int)(P), (__mmask16)-1, \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm512_mask_cmp_pd_mask(M, X, Y, P) \ - ((__mmask8)__builtin_ia32_cmppd512_mask( \ - (__v8df)(__m512d)(X), (__v8df)(__m512d)(Y), (int)(P), (__mmask8)M, \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm512_mask_cmp_ps_mask(M, X, Y, P) \ - ((__mmask16)__builtin_ia32_cmpps512_mask( \ - (__v16sf)(__m512)(X), (__v16sf)(__m512)(Y), (int)(P), (__mmask16)M, \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm_cmp_sd_mask(X, Y, P) \ - ((__mmask8)__builtin_ia32_cmpsd_mask( \ - (__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (int)(P), (__mmask8)-1, \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm_mask_cmp_sd_mask(M, X, Y, P) \ - ((__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \ - (__v2df)(__m128d)(Y), (int)(P), M, \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm_cmp_ss_mask(X, Y, P) \ - ((__mmask8)__builtin_ia32_cmpss_mask( \ - (__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (int)(P), (__mmask8)-1, \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm_mask_cmp_ss_mask(M, X, Y, P) \ - ((__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \ - (__v4sf)(__m128)(Y), (int)(P), M, \ - _MM_FROUND_CUR_DIRECTION)) +#define _mm512_roundscale_ps(A, B) ((__m512) __builtin_ia32_rndscaleps_mask ((__v16sf)(__m512)(A), (int)(B), (__v16sf)_mm512_undefined_ps(), (__mmask16)(-1), _MM_FROUND_CUR_DIRECTION)) +#define _mm512_mask_roundscale_ps(A, B, C, D) ((__m512) __builtin_ia32_rndscaleps_mask ((__v16sf)(__m512)(C), (int)(D), (__v16sf)(__m512)(A), (__mmask16)(B), _MM_FROUND_CUR_DIRECTION)) +#define _mm512_maskz_roundscale_ps(A, B, C) ((__m512) __builtin_ia32_rndscaleps_mask ((__v16sf)(__m512)(B), (int)(C), (__v16sf)_mm512_setzero_ps(), (__mmask16)(A), _MM_FROUND_CUR_DIRECTION)) +#define _mm512_roundscale_pd(A, B) ((__m512d) __builtin_ia32_rndscalepd_mask ((__v8df)(__m512d)(A), (int)(B), (__v8df)_mm512_undefined_pd(), (__mmask8)(-1), _MM_FROUND_CUR_DIRECTION)) +#define _mm512_mask_roundscale_pd(A, B, C, D) ((__m512d) __builtin_ia32_rndscalepd_mask ((__v8df)(__m512d)(C), (int)(D), (__v8df)(__m512d)(A), (__mmask8)(B), _MM_FROUND_CUR_DIRECTION)) +#define _mm512_maskz_roundscale_pd(A, B, C) ((__m512d) __builtin_ia32_rndscalepd_mask ((__v8df)(__m512d)(B), (int)(C), (__v8df)_mm512_setzero_pd(), (__mmask8)(A), _MM_FROUND_CUR_DIRECTION)) +#define _mm_roundscale_ss(A, B, I) ((__m128) __builtin_ia32_rndscaless_mask_round ((__v4sf) (__m128) (A), (__v4sf) (__m128) (B), (int) (I), (__v4sf) _mm_setzero_ps (), 
(__mmask8) (-1), _MM_FROUND_CUR_DIRECTION)) +#define _mm_mask_roundscale_ss(A, U, B, C, I) ((__m128) __builtin_ia32_rndscaless_mask_round ((__v4sf) (__m128) (B), (__v4sf) (__m128) (C), (int) (I), (__v4sf) (__m128) (A), (__mmask8) (U), _MM_FROUND_CUR_DIRECTION)) +#define _mm_maskz_roundscale_ss(U, A, B, I) ((__m128) __builtin_ia32_rndscaless_mask_round ((__v4sf) (__m128) (A), (__v4sf) (__m128) (B), (int) (I), (__v4sf) _mm_setzero_ps (), (__mmask8) (U), _MM_FROUND_CUR_DIRECTION)) +#define _mm_roundscale_sd(A, B, I) ((__m128d) __builtin_ia32_rndscalesd_mask_round ((__v2df) (__m128d) (A), (__v2df) (__m128d) (B), (int) (I), (__v2df) _mm_setzero_pd (), (__mmask8) (-1), _MM_FROUND_CUR_DIRECTION)) +#define _mm_mask_roundscale_sd(A, U, B, C, I) ((__m128d) __builtin_ia32_rndscalesd_mask_round ((__v2df) (__m128d) (B), (__v2df) (__m128d) (C), (int) (I), (__v2df) (__m128d) (A), (__mmask8) (U), _MM_FROUND_CUR_DIRECTION)) +#define _mm_maskz_roundscale_sd(U, A, B, I) ((__m128d) __builtin_ia32_rndscalesd_mask_round ((__v2df) (__m128d) (A), (__v2df) (__m128d) (B), (int) (I), (__v2df) _mm_setzero_pd (), (__mmask8) (U), _MM_FROUND_CUR_DIRECTION)) #endif - -__funline __mmask16 _mm512_kmov(__mmask16 __A) { - return __builtin_ia32_kmovw(__A); +#ifdef __OPTIMIZE__ +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmp_pd_mask (__m512d __X, __m512d __Y, const int __P) +{ + return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X, + (__v8df) __Y, __P, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); } - -__funline __m512 _mm512_castpd_ps(__m512d __A) { - return (__m512)(__A); +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmp_ps_mask (__m512 __X, __m512 __Y, const int __P) +{ + return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X, + (__v16sf) __Y, __P, + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); } - -__funline __m512i _mm512_castpd_si512(__m512d __A) { - return (__m512i)(__A); +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmp_ps_mask (__mmask16 __U, __m512 __X, __m512 __Y, const int __P) +{ + return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X, + (__v16sf) __Y, __P, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); } - -__funline __m512d _mm512_castps_pd(__m512 __A) { - return (__m512d)(__A); +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmp_pd_mask (__mmask8 __U, __m512d __X, __m512d __Y, const int __P) +{ + return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X, + (__v8df) __Y, __P, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); } - -__funline __m512i _mm512_castps_si512(__m512 __A) { - return (__m512i)(__A); +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmp_sd_mask (__m128d __X, __m128d __Y, const int __P) +{ + return (__mmask8) __builtin_ia32_cmpsd_mask ((__v2df) __X, + (__v2df) __Y, __P, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); } - -__funline __m512 _mm512_castsi512_ps(__m512i __A) { - return (__m512)(__A); +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmp_sd_mask (__mmask8 __M, __m128d __X, __m128d __Y, const int __P) +{ + return (__mmask8) __builtin_ia32_cmpsd_mask ((__v2df) __X, + (__v2df) __Y, __P, + (__mmask8) __M, + _MM_FROUND_CUR_DIRECTION); } - -__funline __m512d _mm512_castsi512_pd(__m512i __A) { - return (__m512d)(__A); 
+extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmp_ss_mask (__m128 __X, __m128 __Y, const int __P) +{ + return (__mmask8) __builtin_ia32_cmpss_mask ((__v4sf) __X, + (__v4sf) __Y, __P, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); } - -__funline __m128d _mm512_castpd512_pd128(__m512d __A) { +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmp_ss_mask (__mmask8 __M, __m128 __X, __m128 __Y, const int __P) +{ + return (__mmask8) __builtin_ia32_cmpss_mask ((__v4sf) __X, + (__v4sf) __Y, __P, + (__mmask8) __M, + _MM_FROUND_CUR_DIRECTION); +} +#else +#define _mm512_cmp_pd_mask(X, Y, P) ((__mmask8) __builtin_ia32_cmppd512_mask ((__v8df)(__m512d)(X), (__v8df)(__m512d)(Y), (int)(P), (__mmask8)-1,_MM_FROUND_CUR_DIRECTION)) +#define _mm512_cmp_ps_mask(X, Y, P) ((__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf)(__m512)(X), (__v16sf)(__m512)(Y), (int)(P), (__mmask16)-1,_MM_FROUND_CUR_DIRECTION)) +#define _mm512_mask_cmp_pd_mask(M, X, Y, P) ((__mmask8) __builtin_ia32_cmppd512_mask ((__v8df)(__m512d)(X), (__v8df)(__m512d)(Y), (int)(P), (__mmask8)(M), _MM_FROUND_CUR_DIRECTION)) +#define _mm512_mask_cmp_ps_mask(M, X, Y, P) ((__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf)(__m512)(X), (__v16sf)(__m512)(Y), (int)(P), (__mmask16)(M),_MM_FROUND_CUR_DIRECTION)) +#define _mm_cmp_sd_mask(X, Y, P) ((__mmask8) __builtin_ia32_cmpsd_mask ((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (int)(P), (__mmask8)-1,_MM_FROUND_CUR_DIRECTION)) +#define _mm_mask_cmp_sd_mask(M, X, Y, P) ((__mmask8) __builtin_ia32_cmpsd_mask ((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (int)(P), M,_MM_FROUND_CUR_DIRECTION)) +#define _mm_cmp_ss_mask(X, Y, P) ((__mmask8) __builtin_ia32_cmpss_mask ((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (int)(P), (__mmask8)-1,_MM_FROUND_CUR_DIRECTION)) +#define _mm_mask_cmp_ss_mask(M, X, Y, P) ((__mmask8) __builtin_ia32_cmpss_mask ((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (int)(P), M,_MM_FROUND_CUR_DIRECTION)) +#endif +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpeq_pd_mask (__m512d __X, __m512d __Y) +{ + return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X, + (__v8df) __Y, _CMP_EQ_OQ, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpeq_pd_mask (__mmask8 __U, __m512d __X, __m512d __Y) +{ + return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X, + (__v8df) __Y, _CMP_EQ_OQ, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmplt_pd_mask (__m512d __X, __m512d __Y) +{ + return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X, + (__v8df) __Y, _CMP_LT_OS, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmplt_pd_mask (__mmask8 __U, __m512d __X, __m512d __Y) +{ + return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X, + (__v8df) __Y, _CMP_LT_OS, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmple_pd_mask (__m512d __X, __m512d __Y) +{ + return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X, + (__v8df) __Y, _CMP_LE_OS, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __mmask8 
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmple_pd_mask (__mmask8 __U, __m512d __X, __m512d __Y) +{ + return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X, + (__v8df) __Y, _CMP_LE_OS, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpunord_pd_mask (__m512d __X, __m512d __Y) +{ + return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X, + (__v8df) __Y, _CMP_UNORD_Q, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpunord_pd_mask (__mmask8 __U, __m512d __X, __m512d __Y) +{ + return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X, + (__v8df) __Y, _CMP_UNORD_Q, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpneq_pd_mask (__m512d __X, __m512d __Y) +{ + return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X, + (__v8df) __Y, _CMP_NEQ_UQ, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpneq_pd_mask (__mmask8 __U, __m512d __X, __m512d __Y) +{ + return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X, + (__v8df) __Y, _CMP_NEQ_UQ, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpnlt_pd_mask (__m512d __X, __m512d __Y) +{ + return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X, + (__v8df) __Y, _CMP_NLT_US, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpnlt_pd_mask (__mmask8 __U, __m512d __X, __m512d __Y) +{ + return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X, + (__v8df) __Y, _CMP_NLT_US, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpnle_pd_mask (__m512d __X, __m512d __Y) +{ + return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X, + (__v8df) __Y, _CMP_NLE_US, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpnle_pd_mask (__mmask8 __U, __m512d __X, __m512d __Y) +{ + return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X, + (__v8df) __Y, _CMP_NLE_US, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpord_pd_mask (__m512d __X, __m512d __Y) +{ + return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X, + (__v8df) __Y, _CMP_ORD_Q, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpord_pd_mask (__mmask8 __U, __m512d __X, __m512d __Y) +{ + return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X, + (__v8df) __Y, _CMP_ORD_Q, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpeq_ps_mask (__m512 __X, __m512 __Y) +{ + return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X, + (__v16sf) __Y, _CMP_EQ_OQ, + (__mmask16) -1, + 
_MM_FROUND_CUR_DIRECTION); +} +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpeq_ps_mask (__mmask16 __U, __m512 __X, __m512 __Y) +{ + return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X, + (__v16sf) __Y, _CMP_EQ_OQ, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmplt_ps_mask (__m512 __X, __m512 __Y) +{ + return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X, + (__v16sf) __Y, _CMP_LT_OS, + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmplt_ps_mask (__mmask16 __U, __m512 __X, __m512 __Y) +{ + return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X, + (__v16sf) __Y, _CMP_LT_OS, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmple_ps_mask (__m512 __X, __m512 __Y) +{ + return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X, + (__v16sf) __Y, _CMP_LE_OS, + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmple_ps_mask (__mmask16 __U, __m512 __X, __m512 __Y) +{ + return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X, + (__v16sf) __Y, _CMP_LE_OS, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpunord_ps_mask (__m512 __X, __m512 __Y) +{ + return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X, + (__v16sf) __Y, _CMP_UNORD_Q, + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpunord_ps_mask (__mmask16 __U, __m512 __X, __m512 __Y) +{ + return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X, + (__v16sf) __Y, _CMP_UNORD_Q, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpneq_ps_mask (__m512 __X, __m512 __Y) +{ + return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X, + (__v16sf) __Y, _CMP_NEQ_UQ, + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpneq_ps_mask (__mmask16 __U, __m512 __X, __m512 __Y) +{ + return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X, + (__v16sf) __Y, _CMP_NEQ_UQ, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpnlt_ps_mask (__m512 __X, __m512 __Y) +{ + return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X, + (__v16sf) __Y, _CMP_NLT_US, + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpnlt_ps_mask (__mmask16 __U, __m512 __X, __m512 __Y) +{ + return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X, + (__v16sf) __Y, _CMP_NLT_US, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpnle_ps_mask (__m512 __X, __m512 __Y) +{ + return (__mmask16) 
__builtin_ia32_cmpps512_mask ((__v16sf) __X, + (__v16sf) __Y, _CMP_NLE_US, + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpnle_ps_mask (__mmask16 __U, __m512 __X, __m512 __Y) +{ + return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X, + (__v16sf) __Y, _CMP_NLE_US, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpord_ps_mask (__m512 __X, __m512 __Y) +{ + return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X, + (__v16sf) __Y, _CMP_ORD_Q, + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpord_ps_mask (__mmask16 __U, __m512 __X, __m512 __Y) +{ + return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X, + (__v16sf) __Y, _CMP_ORD_Q, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_kmov (__mmask16 __A) +{ + return __builtin_ia32_kmovw (__A); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_castpd_ps (__m512d __A) +{ + return (__m512) (__A); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_castpd_si512 (__m512d __A) +{ + return (__m512i) (__A); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_castps_pd (__m512 __A) +{ + return (__m512d) (__A); +} +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_castps_si512 (__m512 __A) +{ + return (__m512i) (__A); +} +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_castsi512_ps (__m512i __A) +{ + return (__m512) (__A); +} +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_castsi512_pd (__m512i __A) +{ + return (__m512d) (__A); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_castpd512_pd128 (__m512d __A) +{ return (__m128d)_mm512_extractf32x4_ps((__m512)__A, 0); } - -__funline __m128 _mm512_castps512_ps128(__m512 __A) { +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_castps512_ps128 (__m512 __A) +{ return _mm512_extractf32x4_ps(__A, 0); } - -__funline __m128i _mm512_castsi512_si128(__m512i __A) { +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_castsi512_si128 (__m512i __A) +{ return (__m128i)_mm512_extracti32x4_epi32((__m512i)__A, 0); } - -__funline __m256d _mm512_castpd512_pd256(__m512d __A) { +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_castpd512_pd256 (__m512d __A) +{ return _mm512_extractf64x4_pd(__A, 0); } - -__funline __m256 _mm512_castps512_ps256(__m512 __A) { +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_castps512_ps256 (__m512 __A) +{ return (__m256)_mm512_extractf64x4_pd((__m512d)__A, 0); } - -__funline __m256i _mm512_castsi512_si256(__m512i __A) { +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_castsi512_si256 (__m512i __A) +{ return 
(__m256i)_mm512_extractf64x4_pd((__m512d)__A, 0); } - -__funline __m512d _mm512_castpd128_pd512(__m128d __A) { - return (__m512d)__builtin_ia32_pd512_pd((__m128d)__A); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_castpd128_pd512 (__m128d __A) +{ + return (__m512d) __builtin_ia32_pd512_pd((__m128d)__A); } - -__funline __m512 _mm512_castps128_ps512(__m128 __A) { - return (__m512)__builtin_ia32_ps512_ps((__m128)__A); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_castps128_ps512 (__m128 __A) +{ + return (__m512) __builtin_ia32_ps512_ps((__m128)__A); } - -__funline __m512i _mm512_castsi128_si512(__m128i __A) { - return (__m512i)__builtin_ia32_si512_si((__v4si)__A); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_castsi128_si512 (__m128i __A) +{ + return (__m512i) __builtin_ia32_si512_si((__v4si)__A); } - -__funline __m512d _mm512_castpd256_pd512(__m256d __A) { - return __builtin_ia32_pd512_256pd(__A); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_castpd256_pd512 (__m256d __A) +{ + return __builtin_ia32_pd512_256pd (__A); } - -__funline __m512 _mm512_castps256_ps512(__m256 __A) { - return __builtin_ia32_ps512_256ps(__A); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_castps256_ps512 (__m256 __A) +{ + return __builtin_ia32_ps512_256ps (__A); } - -__funline __m512i _mm512_castsi256_si512(__m256i __A) { - return (__m512i)__builtin_ia32_si512_256si((__v8si)__A); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_castsi256_si512 (__m256i __A) +{ + return (__m512i)__builtin_ia32_si512_256si ((__v8si)__A); } - -__funline __mmask16 _mm512_cmpeq_epu32_mask(__m512i __A, __m512i __B) { - return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__A, (__v16si)__B, 0, - (__mmask16)-1); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_zextpd128_pd512 (__m128d __A) +{ + return (__m512d) _mm512_insertf32x4 (_mm512_setzero_ps (), (__m128) __A, 0); } - -__funline __mmask16 _mm512_mask_cmpeq_epu32_mask(__mmask16 __U, __m512i __A, - __m512i __B) { - return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__A, (__v16si)__B, 0, - __U); +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_zextps128_ps512 (__m128 __A) +{ + return _mm512_insertf32x4 (_mm512_setzero_ps (), __A, 0); } - -__funline __mmask8 _mm512_mask_cmpeq_epu64_mask(__mmask8 __U, __m512i __A, - __m512i __B) { - return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__A, (__v8di)__B, 0, - __U); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_zextsi128_si512 (__m128i __A) +{ + return _mm512_inserti32x4 (_mm512_setzero_si512 (), __A, 0); } - -__funline __mmask8 _mm512_cmpeq_epu64_mask(__m512i __A, __m512i __B) { - return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__A, (__v8di)__B, 0, - (__mmask8)-1); +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_zextpd256_pd512 (__m256d __A) +{ + return _mm512_insertf64x4 (_mm512_setzero_pd (), __A, 0); } - -__funline __mmask16 _mm512_cmpgt_epu32_mask(__m512i __A, __m512i __B) { - return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__A, (__v16si)__B, 6, - (__mmask16)-1); +extern __inline 
__m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_zextps256_ps512 (__m256 __A) +{ + return (__m512) _mm512_insertf64x4 (_mm512_setzero_pd (), (__m256d) __A, 0); } - -__funline __mmask16 _mm512_mask_cmpgt_epu32_mask(__mmask16 __U, __m512i __A, - __m512i __B) { - return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__A, (__v16si)__B, 6, - __U); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_zextsi256_si512 (__m256i __A) +{ + return _mm512_inserti64x4 (_mm512_setzero_si512 (), __A, 0); } - -__funline __mmask8 _mm512_mask_cmpgt_epu64_mask(__mmask8 __U, __m512i __A, - __m512i __B) { - return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__A, (__v8di)__B, 6, - __U); +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpeq_epu32_mask (__m512i __A, __m512i __B) +{ + return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __A, + (__v16si) __B, 0, + (__mmask16) -1); } - -__funline __mmask8 _mm512_cmpgt_epu64_mask(__m512i __A, __m512i __B) { - return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__A, (__v8di)__B, 6, - (__mmask8)-1); +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpeq_epu32_mask (__mmask16 __U, __m512i __A, __m512i __B) +{ + return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __A, + (__v16si) __B, 0, __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpeq_epu64_mask (__mmask8 __U, __m512i __A, __m512i __B) +{ + return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __A, + (__v8di) __B, 0, __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpeq_epu64_mask (__m512i __A, __m512i __B) +{ + return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __A, + (__v8di) __B, 0, + (__mmask8) -1); +} +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpgt_epu32_mask (__m512i __A, __m512i __B) +{ + return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __A, + (__v16si) __B, 6, + (__mmask16) -1); +} +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpgt_epu32_mask (__mmask16 __U, __m512i __A, __m512i __B) +{ + return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __A, + (__v16si) __B, 6, __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpgt_epu64_mask (__mmask8 __U, __m512i __A, __m512i __B) +{ + return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __A, + (__v8di) __B, 6, __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpgt_epu64_mask (__m512i __A, __m512i __B) +{ + return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __A, + (__v8di) __B, 6, + (__mmask8) -1); } - #undef __MM512_REDUCE_OP -#define __MM512_REDUCE_OP(op) \ - __v8si __T1 = (__v8si)_mm512_extracti64x4_epi64(__A, 1); \ - __v8si __T2 = (__v8si)_mm512_extracti64x4_epi64(__A, 0); \ - __m256i __T3 = (__m256i)(__T1 op __T2); \ - __v4si __T4 = (__v4si)_mm256_extracti128_si256(__T3, 1); \ - __v4si __T5 = (__v4si)_mm256_extracti128_si256(__T3, 0); \ - __v4si __T6 = __T4 op __T5; \ - __v4si __T7 = __builtin_shuffle(__T6, (__v4si){2, 3, 0, 1}); \ - __v4si __T8 = __T6 op __T7; \ - return __T8[0] op __T8[1] - -__funline int 
_mm512_reduce_add_epi32(__m512i __A) { - __MM512_REDUCE_OP(+); +#define __MM512_REDUCE_OP(op) __v8si __T1 = (__v8si) _mm512_extracti64x4_epi64 (__A, 1); __v8si __T2 = (__v8si) _mm512_extracti64x4_epi64 (__A, 0); __m256i __T3 = (__m256i) (__T1 op __T2); __v4si __T4 = (__v4si) _mm256_extracti128_si256 (__T3, 1); __v4si __T5 = (__v4si) _mm256_extracti128_si256 (__T3, 0); __v4si __T6 = __T4 op __T5; __v4si __T7 = __builtin_shuffle (__T6, (__v4si) { 2, 3, 0, 1 }); __v4si __T8 = __T6 op __T7; return __T8[0] op __T8[1] +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_add_epi32 (__m512i __A) +{ + __MM512_REDUCE_OP (+); } - -__funline int _mm512_reduce_mul_epi32(__m512i __A) { - __MM512_REDUCE_OP(*); +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_mul_epi32 (__m512i __A) +{ + __MM512_REDUCE_OP (*); } - -__funline int _mm512_reduce_and_epi32(__m512i __A) { - __MM512_REDUCE_OP(&); +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_and_epi32 (__m512i __A) +{ + __MM512_REDUCE_OP (&); } - -__funline int _mm512_reduce_or_epi32(__m512i __A) { - __MM512_REDUCE_OP(|); +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_or_epi32 (__m512i __A) +{ + __MM512_REDUCE_OP (|); } - -__funline int _mm512_mask_reduce_add_epi32(__mmask16 __U, __m512i __A) { - __A = _mm512_maskz_mov_epi32(__U, __A); - __MM512_REDUCE_OP(+); +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_add_epi32 (__mmask16 __U, __m512i __A) +{ + __A = _mm512_maskz_mov_epi32 (__U, __A); + __MM512_REDUCE_OP (+); } - -__funline int _mm512_mask_reduce_mul_epi32(__mmask16 __U, __m512i __A) { - __A = _mm512_mask_mov_epi32(_mm512_set1_epi32(1), __U, __A); - __MM512_REDUCE_OP(*); +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_mul_epi32 (__mmask16 __U, __m512i __A) +{ + __A = _mm512_mask_mov_epi32 (_mm512_set1_epi32 (1), __U, __A); + __MM512_REDUCE_OP (*); } - -__funline int _mm512_mask_reduce_and_epi32(__mmask16 __U, __m512i __A) { - __A = _mm512_mask_mov_epi32(_mm512_set1_epi32(~0), __U, __A); - __MM512_REDUCE_OP(&); +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_and_epi32 (__mmask16 __U, __m512i __A) +{ + __A = _mm512_mask_mov_epi32 (_mm512_set1_epi32 (~0), __U, __A); + __MM512_REDUCE_OP (&); } - -__funline int _mm512_mask_reduce_or_epi32(__mmask16 __U, __m512i __A) { - __A = _mm512_maskz_mov_epi32(__U, __A); - __MM512_REDUCE_OP(|); +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_or_epi32 (__mmask16 __U, __m512i __A) +{ + __A = _mm512_maskz_mov_epi32 (__U, __A); + __MM512_REDUCE_OP (|); } - #undef __MM512_REDUCE_OP -#define __MM512_REDUCE_OP(op) \ - __m256i __T1 = (__m256i)_mm512_extracti64x4_epi64(__A, 1); \ - __m256i __T2 = (__m256i)_mm512_extracti64x4_epi64(__A, 0); \ - __m256i __T3 = _mm256_##op(__T1, __T2); \ - __m128i __T4 = (__m128i)_mm256_extracti128_si256(__T3, 1); \ - __m128i __T5 = (__m128i)_mm256_extracti128_si256(__T3, 0); \ - __m128i __T6 = _mm_##op(__T4, __T5); \ - __m128i __T7 = \ - (__m128i)__builtin_shuffle((__v4si)__T6, (__v4si){2, 3, 0, 1}); \ - __m128i __T8 = _mm_##op(__T6, __T7); \ - __m128i __T9 = \ - (__m128i)__builtin_shuffle((__v4si)__T8, (__v4si){1, 0, 1, 0}); \ - __v4si 
__T10 = (__v4si)_mm_##op(__T8, __T9); \ - return __T10[0] - -__funline int _mm512_reduce_min_epi32(__m512i __A) { - __MM512_REDUCE_OP(min_epi32); +#define __MM512_REDUCE_OP(op) __m256i __T1 = (__m256i) _mm512_extracti64x4_epi64 (__A, 1); __m256i __T2 = (__m256i) _mm512_extracti64x4_epi64 (__A, 0); __m256i __T3 = _mm256_##op (__T1, __T2); __m128i __T4 = (__m128i) _mm256_extracti128_si256 (__T3, 1); __m128i __T5 = (__m128i) _mm256_extracti128_si256 (__T3, 0); __m128i __T6 = _mm_##op (__T4, __T5); __m128i __T7 = (__m128i) __builtin_shuffle ((__v4si) __T6, (__v4si) { 2, 3, 0, 1 }); __m128i __T8 = _mm_##op (__T6, __T7); __m128i __T9 = (__m128i) __builtin_shuffle ((__v4si) __T8, (__v4si) { 1, 0, 1, 0 }); __v4si __T10 = (__v4si) _mm_##op (__T8, __T9); return __T10[0] +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_min_epi32 (__m512i __A) +{ + __MM512_REDUCE_OP (min_epi32); } - -__funline int _mm512_reduce_max_epi32(__m512i __A) { - __MM512_REDUCE_OP(max_epi32); +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_max_epi32 (__m512i __A) +{ + __MM512_REDUCE_OP (max_epi32); } - -__funline unsigned int _mm512_reduce_min_epu32(__m512i __A) { - __MM512_REDUCE_OP(min_epu32); +extern __inline unsigned int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_min_epu32 (__m512i __A) +{ + __MM512_REDUCE_OP (min_epu32); } - -__funline unsigned int _mm512_reduce_max_epu32(__m512i __A) { - __MM512_REDUCE_OP(max_epu32); +extern __inline unsigned int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_max_epu32 (__m512i __A) +{ + __MM512_REDUCE_OP (max_epu32); } - -__funline int _mm512_mask_reduce_min_epi32(__mmask16 __U, __m512i __A) { - __A = _mm512_mask_mov_epi32(_mm512_set1_epi32(__INT_MAX__), __U, __A); - __MM512_REDUCE_OP(min_epi32); +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_min_epi32 (__mmask16 __U, __m512i __A) +{ + __A = _mm512_mask_mov_epi32 (_mm512_set1_epi32 (__INT_MAX__), __U, __A); + __MM512_REDUCE_OP (min_epi32); } - -__funline int _mm512_mask_reduce_max_epi32(__mmask16 __U, __m512i __A) { - __A = _mm512_mask_mov_epi32(_mm512_set1_epi32(-__INT_MAX__ - 1), __U, __A); - __MM512_REDUCE_OP(max_epi32); +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_max_epi32 (__mmask16 __U, __m512i __A) +{ + __A = _mm512_mask_mov_epi32 (_mm512_set1_epi32 (-__INT_MAX__ - 1), __U, __A); + __MM512_REDUCE_OP (max_epi32); } - -__funline unsigned int _mm512_mask_reduce_min_epu32(__mmask16 __U, __m512i __A) { - __A = _mm512_mask_mov_epi32(_mm512_set1_epi32(~0), __U, __A); - __MM512_REDUCE_OP(min_epu32); +extern __inline unsigned int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_min_epu32 (__mmask16 __U, __m512i __A) +{ + __A = _mm512_mask_mov_epi32 (_mm512_set1_epi32 (~0), __U, __A); + __MM512_REDUCE_OP (min_epu32); } - -__funline unsigned int _mm512_mask_reduce_max_epu32(__mmask16 __U, __m512i __A) { - __A = _mm512_maskz_mov_epi32(__U, __A); - __MM512_REDUCE_OP(max_epu32); +extern __inline unsigned int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_max_epu32 (__mmask16 __U, __m512i __A) +{ + __A = _mm512_maskz_mov_epi32 (__U, __A); + __MM512_REDUCE_OP (max_epu32); } - #undef __MM512_REDUCE_OP -#define __MM512_REDUCE_OP(op) \ - __m256 __T1 = 
(__m256)_mm512_extractf64x4_pd((__m512d)__A, 1); \ - __m256 __T2 = (__m256)_mm512_extractf64x4_pd((__m512d)__A, 0); \ - __m256 __T3 = __T1 op __T2; \ - __m128 __T4 = _mm256_extractf128_ps(__T3, 1); \ - __m128 __T5 = _mm256_extractf128_ps(__T3, 0); \ - __m128 __T6 = __T4 op __T5; \ - __m128 __T7 = __builtin_shuffle(__T6, (__v4si){2, 3, 0, 1}); \ - __m128 __T8 = __T6 op __T7; \ - return __T8[0] op __T8[1] - -__funline float _mm512_reduce_add_ps(__m512 __A) { - __MM512_REDUCE_OP(+); +#define __MM512_REDUCE_OP(op) __m256 __T1 = (__m256) _mm512_extractf64x4_pd ((__m512d) __A, 1); __m256 __T2 = (__m256) _mm512_extractf64x4_pd ((__m512d) __A, 0); __m256 __T3 = __T1 op __T2; __m128 __T4 = _mm256_extractf128_ps (__T3, 1); __m128 __T5 = _mm256_extractf128_ps (__T3, 0); __m128 __T6 = __T4 op __T5; __m128 __T7 = __builtin_shuffle (__T6, (__v4si) { 2, 3, 0, 1 }); __m128 __T8 = __T6 op __T7; return __T8[0] op __T8[1] +extern __inline float +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_add_ps (__m512 __A) +{ + __MM512_REDUCE_OP (+); } - -__funline float _mm512_reduce_mul_ps(__m512 __A) { - __MM512_REDUCE_OP(*); +extern __inline float +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_mul_ps (__m512 __A) +{ + __MM512_REDUCE_OP (*); } - -__funline float _mm512_mask_reduce_add_ps(__mmask16 __U, __m512 __A) { - __A = _mm512_maskz_mov_ps(__U, __A); - __MM512_REDUCE_OP(+); +extern __inline float +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_add_ps (__mmask16 __U, __m512 __A) +{ + __A = _mm512_maskz_mov_ps (__U, __A); + __MM512_REDUCE_OP (+); } - -__funline float _mm512_mask_reduce_mul_ps(__mmask16 __U, __m512 __A) { - __A = _mm512_mask_mov_ps(_mm512_set1_ps(1.0f), __U, __A); - __MM512_REDUCE_OP(*); +extern __inline float +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_mul_ps (__mmask16 __U, __m512 __A) +{ + __A = _mm512_mask_mov_ps (_mm512_set1_ps (1.0f), __U, __A); + __MM512_REDUCE_OP (*); } - #undef __MM512_REDUCE_OP -#define __MM512_REDUCE_OP(op) \ - __m256 __T1 = (__m256)_mm512_extractf64x4_pd((__m512d)__A, 1); \ - __m256 __T2 = (__m256)_mm512_extractf64x4_pd((__m512d)__A, 0); \ - __m256 __T3 = _mm256_##op(__T1, __T2); \ - __m128 __T4 = _mm256_extractf128_ps(__T3, 1); \ - __m128 __T5 = _mm256_extractf128_ps(__T3, 0); \ - __m128 __T6 = _mm_##op(__T4, __T5); \ - __m128 __T7 = __builtin_shuffle(__T6, (__v4si){2, 3, 0, 1}); \ - __m128 __T8 = _mm_##op(__T6, __T7); \ - __m128 __T9 = __builtin_shuffle(__T8, (__v4si){1, 0, 1, 0}); \ - __m128 __T10 = _mm_##op(__T8, __T9); \ - return __T10[0] - -__funline float _mm512_reduce_min_ps(__m512 __A) { - __MM512_REDUCE_OP(min_ps); +#define __MM512_REDUCE_OP(op) __m256 __T1 = (__m256) _mm512_extractf64x4_pd ((__m512d) __A, 1); __m256 __T2 = (__m256) _mm512_extractf64x4_pd ((__m512d) __A, 0); __m256 __T3 = _mm256_##op (__T1, __T2); __m128 __T4 = _mm256_extractf128_ps (__T3, 1); __m128 __T5 = _mm256_extractf128_ps (__T3, 0); __m128 __T6 = _mm_##op (__T4, __T5); __m128 __T7 = __builtin_shuffle (__T6, (__v4si) { 2, 3, 0, 1 }); __m128 __T8 = _mm_##op (__T6, __T7); __m128 __T9 = __builtin_shuffle (__T8, (__v4si) { 1, 0, 1, 0 }); __m128 __T10 = _mm_##op (__T8, __T9); return __T10[0] +extern __inline float +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_min_ps (__m512 __A) +{ + __MM512_REDUCE_OP (min_ps); } - -__funline float _mm512_reduce_max_ps(__m512 __A) { - __MM512_REDUCE_OP(max_ps); 
+extern __inline float +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_max_ps (__m512 __A) +{ + __MM512_REDUCE_OP (max_ps); } - -__funline float _mm512_mask_reduce_min_ps(__mmask16 __U, __m512 __A) { - __A = _mm512_mask_mov_ps(_mm512_set1_ps(__builtin_inff()), __U, __A); - __MM512_REDUCE_OP(min_ps); +extern __inline float +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_min_ps (__mmask16 __U, __m512 __A) +{ + __A = _mm512_mask_mov_ps (_mm512_set1_ps (__builtin_inff ()), __U, __A); + __MM512_REDUCE_OP (min_ps); } - -__funline float _mm512_mask_reduce_max_ps(__mmask16 __U, __m512 __A) { - __A = _mm512_mask_mov_ps(_mm512_set1_ps(-__builtin_inff()), __U, __A); - __MM512_REDUCE_OP(max_ps); +extern __inline float +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_max_ps (__mmask16 __U, __m512 __A) +{ + __A = _mm512_mask_mov_ps (_mm512_set1_ps (-__builtin_inff ()), __U, __A); + __MM512_REDUCE_OP (max_ps); } - #undef __MM512_REDUCE_OP -#define __MM512_REDUCE_OP(op) \ - __v4di __T1 = (__v4di)_mm512_extracti64x4_epi64(__A, 1); \ - __v4di __T2 = (__v4di)_mm512_extracti64x4_epi64(__A, 0); \ - __m256i __T3 = (__m256i)(__T1 op __T2); \ - __v2di __T4 = (__v2di)_mm256_extracti128_si256(__T3, 1); \ - __v2di __T5 = (__v2di)_mm256_extracti128_si256(__T3, 0); \ - __v2di __T6 = __T4 op __T5; \ - return __T6[0] op __T6[1] - -__funline long long _mm512_reduce_add_epi64(__m512i __A) { - __MM512_REDUCE_OP(+); +#define __MM512_REDUCE_OP(op) __v4di __T1 = (__v4di) _mm512_extracti64x4_epi64 (__A, 1); __v4di __T2 = (__v4di) _mm512_extracti64x4_epi64 (__A, 0); __m256i __T3 = (__m256i) (__T1 op __T2); __v2di __T4 = (__v2di) _mm256_extracti128_si256 (__T3, 1); __v2di __T5 = (__v2di) _mm256_extracti128_si256 (__T3, 0); __v2di __T6 = __T4 op __T5; return __T6[0] op __T6[1] +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_add_epi64 (__m512i __A) +{ + __MM512_REDUCE_OP (+); } - -__funline long long _mm512_reduce_mul_epi64(__m512i __A) { - __MM512_REDUCE_OP(*); +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_mul_epi64 (__m512i __A) +{ + __MM512_REDUCE_OP (*); } - -__funline long long _mm512_reduce_and_epi64(__m512i __A) { - __MM512_REDUCE_OP(&); +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_and_epi64 (__m512i __A) +{ + __MM512_REDUCE_OP (&); } - -__funline long long _mm512_reduce_or_epi64(__m512i __A) { - __MM512_REDUCE_OP(|); +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_or_epi64 (__m512i __A) +{ + __MM512_REDUCE_OP (|); } - -__funline long long _mm512_mask_reduce_add_epi64(__mmask8 __U, __m512i __A) { - __A = _mm512_maskz_mov_epi64(__U, __A); - __MM512_REDUCE_OP(+); +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_add_epi64 (__mmask8 __U, __m512i __A) +{ + __A = _mm512_maskz_mov_epi64 (__U, __A); + __MM512_REDUCE_OP (+); } - -__funline long long _mm512_mask_reduce_mul_epi64(__mmask8 __U, __m512i __A) { - __A = _mm512_mask_mov_epi64(_mm512_set1_epi64(1LL), __U, __A); - __MM512_REDUCE_OP(*); +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_mul_epi64 (__mmask8 __U, __m512i __A) +{ + __A = _mm512_mask_mov_epi64 (_mm512_set1_epi64 
(1LL), __U, __A); + __MM512_REDUCE_OP (*); } - -__funline long long _mm512_mask_reduce_and_epi64(__mmask8 __U, __m512i __A) { - __A = _mm512_mask_mov_epi64(_mm512_set1_epi64(~0LL), __U, __A); - __MM512_REDUCE_OP(&); +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_and_epi64 (__mmask8 __U, __m512i __A) +{ + __A = _mm512_mask_mov_epi64 (_mm512_set1_epi64 (~0LL), __U, __A); + __MM512_REDUCE_OP (&); } - -__funline long long _mm512_mask_reduce_or_epi64(__mmask8 __U, __m512i __A) { - __A = _mm512_maskz_mov_epi64(__U, __A); - __MM512_REDUCE_OP(|); +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_or_epi64 (__mmask8 __U, __m512i __A) +{ + __A = _mm512_maskz_mov_epi64 (__U, __A); + __MM512_REDUCE_OP (|); } - #undef __MM512_REDUCE_OP -#define __MM512_REDUCE_OP(op) \ - __m512i __T1 = _mm512_shuffle_i64x2(__A, __A, 0x4e); \ - __m512i __T2 = _mm512_##op(__A, __T1); \ - __m512i __T3 = (__m512i)__builtin_shuffle((__v8di)__T2, \ - (__v8di){2, 3, 0, 1, 6, 7, 4, 5}); \ - __m512i __T4 = _mm512_##op(__T2, __T3); \ - __m512i __T5 = (__m512i)__builtin_shuffle((__v8di)__T4, \ - (__v8di){1, 0, 3, 2, 5, 4, 7, 6}); \ - __v8di __T6 = (__v8di)_mm512_##op(__T4, __T5); \ - return __T6[0] - -__funline long long _mm512_reduce_min_epi64(__m512i __A) { - __MM512_REDUCE_OP(min_epi64); +#define __MM512_REDUCE_OP(op) __m512i __T1 = _mm512_shuffle_i64x2 (__A, __A, 0x4e); __m512i __T2 = _mm512_##op (__A, __T1); __m512i __T3 = (__m512i) __builtin_shuffle ((__v8di) __T2, (__v8di) { 2, 3, 0, 1, 6, 7, 4, 5 }); __m512i __T4 = _mm512_##op (__T2, __T3); __m512i __T5 = (__m512i) __builtin_shuffle ((__v8di) __T4, (__v8di) { 1, 0, 3, 2, 5, 4, 7, 6 }); __v8di __T6 = (__v8di) _mm512_##op (__T4, __T5); return __T6[0] +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_min_epi64 (__m512i __A) +{ + __MM512_REDUCE_OP (min_epi64); } - -__funline long long _mm512_reduce_max_epi64(__m512i __A) { - __MM512_REDUCE_OP(max_epi64); +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_max_epi64 (__m512i __A) +{ + __MM512_REDUCE_OP (max_epi64); } - -__funline long long _mm512_mask_reduce_min_epi64(__mmask8 __U, __m512i __A) { - __A = _mm512_mask_mov_epi64(_mm512_set1_epi64(__LONG_LONG_MAX__), __U, __A); - __MM512_REDUCE_OP(min_epi64); +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_min_epi64 (__mmask8 __U, __m512i __A) +{ + __A = _mm512_mask_mov_epi64 (_mm512_set1_epi64 (__LONG_LONG_MAX__), + __U, __A); + __MM512_REDUCE_OP (min_epi64); } - -__funline long long _mm512_mask_reduce_max_epi64(__mmask8 __U, __m512i __A) { - __A = _mm512_mask_mov_epi64(_mm512_set1_epi64(-__LONG_LONG_MAX__ - 1), __U, - __A); - __MM512_REDUCE_OP(max_epi64); +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_max_epi64 (__mmask8 __U, __m512i __A) +{ + __A = _mm512_mask_mov_epi64 (_mm512_set1_epi64 (-__LONG_LONG_MAX__ - 1), + __U, __A); + __MM512_REDUCE_OP (max_epi64); } - -__funline unsigned long long _mm512_reduce_min_epu64(__m512i __A) { - __MM512_REDUCE_OP(min_epu64); +extern __inline unsigned long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_min_epu64 (__m512i __A) +{ + __MM512_REDUCE_OP (min_epu64); } - -__funline unsigned long long 
_mm512_reduce_max_epu64(__m512i __A) { - __MM512_REDUCE_OP(max_epu64); +extern __inline unsigned long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_max_epu64 (__m512i __A) +{ + __MM512_REDUCE_OP (max_epu64); } - -__funline unsigned long long _mm512_mask_reduce_min_epu64(__mmask8 __U, - __m512i __A) { - __A = _mm512_mask_mov_epi64(_mm512_set1_epi64(~0LL), __U, __A); - __MM512_REDUCE_OP(min_epu64); +extern __inline unsigned long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_min_epu64 (__mmask8 __U, __m512i __A) +{ + __A = _mm512_mask_mov_epi64 (_mm512_set1_epi64 (~0LL), __U, __A); + __MM512_REDUCE_OP (min_epu64); } - -__funline unsigned long long _mm512_mask_reduce_max_epu64(__mmask8 __U, - __m512i __A) { - __A = _mm512_maskz_mov_epi64(__U, __A); - __MM512_REDUCE_OP(max_epu64); +extern __inline unsigned long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_max_epu64 (__mmask8 __U, __m512i __A) +{ + __A = _mm512_maskz_mov_epi64 (__U, __A); + __MM512_REDUCE_OP (max_epu64); } - #undef __MM512_REDUCE_OP -#define __MM512_REDUCE_OP(op) \ - __m256d __T1 = (__m256d)_mm512_extractf64x4_pd(__A, 1); \ - __m256d __T2 = (__m256d)_mm512_extractf64x4_pd(__A, 0); \ - __m256d __T3 = __T1 op __T2; \ - __m128d __T4 = _mm256_extractf128_pd(__T3, 1); \ - __m128d __T5 = _mm256_extractf128_pd(__T3, 0); \ - __m128d __T6 = __T4 op __T5; \ - return __T6[0] op __T6[1] - -__funline double _mm512_reduce_add_pd(__m512d __A) { - __MM512_REDUCE_OP(+); +#define __MM512_REDUCE_OP(op) __m256d __T1 = (__m256d) _mm512_extractf64x4_pd (__A, 1); __m256d __T2 = (__m256d) _mm512_extractf64x4_pd (__A, 0); __m256d __T3 = __T1 op __T2; __m128d __T4 = _mm256_extractf128_pd (__T3, 1); __m128d __T5 = _mm256_extractf128_pd (__T3, 0); __m128d __T6 = __T4 op __T5; return __T6[0] op __T6[1] +extern __inline double +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_add_pd (__m512d __A) +{ + __MM512_REDUCE_OP (+); } - -__funline double _mm512_reduce_mul_pd(__m512d __A) { - __MM512_REDUCE_OP(*); +extern __inline double +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_mul_pd (__m512d __A) +{ + __MM512_REDUCE_OP (*); } - -__funline double _mm512_mask_reduce_add_pd(__mmask8 __U, __m512d __A) { - __A = _mm512_maskz_mov_pd(__U, __A); - __MM512_REDUCE_OP(+); +extern __inline double +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_add_pd (__mmask8 __U, __m512d __A) +{ + __A = _mm512_maskz_mov_pd (__U, __A); + __MM512_REDUCE_OP (+); } - -__funline double _mm512_mask_reduce_mul_pd(__mmask8 __U, __m512d __A) { - __A = _mm512_mask_mov_pd(_mm512_set1_pd(1.0), __U, __A); - __MM512_REDUCE_OP(*); +extern __inline double +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_mul_pd (__mmask8 __U, __m512d __A) +{ + __A = _mm512_mask_mov_pd (_mm512_set1_pd (1.0), __U, __A); + __MM512_REDUCE_OP (*); } - #undef __MM512_REDUCE_OP -#define __MM512_REDUCE_OP(op) \ - __m256d __T1 = (__m256d)_mm512_extractf64x4_pd(__A, 1); \ - __m256d __T2 = (__m256d)_mm512_extractf64x4_pd(__A, 0); \ - __m256d __T3 = _mm256_##op(__T1, __T2); \ - __m128d __T4 = _mm256_extractf128_pd(__T3, 1); \ - __m128d __T5 = _mm256_extractf128_pd(__T3, 0); \ - __m128d __T6 = _mm_##op(__T4, __T5); \ - __m128d __T7 = (__m128d)__builtin_shuffle(__T6, (__v2di){1, 0}); \ - __m128d __T8 = _mm_##op(__T6, __T7); \ - return __T8[0] - 
-__funline double _mm512_reduce_min_pd(__m512d __A) { - __MM512_REDUCE_OP(min_pd); +#define __MM512_REDUCE_OP(op) __m256d __T1 = (__m256d) _mm512_extractf64x4_pd (__A, 1); __m256d __T2 = (__m256d) _mm512_extractf64x4_pd (__A, 0); __m256d __T3 = _mm256_##op (__T1, __T2); __m128d __T4 = _mm256_extractf128_pd (__T3, 1); __m128d __T5 = _mm256_extractf128_pd (__T3, 0); __m128d __T6 = _mm_##op (__T4, __T5); __m128d __T7 = (__m128d) __builtin_shuffle (__T6, (__v2di) { 1, 0 }); __m128d __T8 = _mm_##op (__T6, __T7); return __T8[0] +extern __inline double +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_min_pd (__m512d __A) +{ + __MM512_REDUCE_OP (min_pd); } - -__funline double _mm512_reduce_max_pd(__m512d __A) { - __MM512_REDUCE_OP(max_pd); +extern __inline double +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_max_pd (__m512d __A) +{ + __MM512_REDUCE_OP (max_pd); } - -__funline double _mm512_mask_reduce_min_pd(__mmask8 __U, __m512d __A) { - __A = _mm512_mask_mov_pd(_mm512_set1_pd(__builtin_inf()), __U, __A); - __MM512_REDUCE_OP(min_pd); +extern __inline double +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_min_pd (__mmask8 __U, __m512d __A) +{ + __A = _mm512_mask_mov_pd (_mm512_set1_pd (__builtin_inf ()), __U, __A); + __MM512_REDUCE_OP (min_pd); } - -__funline double _mm512_mask_reduce_max_pd(__mmask8 __U, __m512d __A) { - __A = _mm512_mask_mov_pd(_mm512_set1_pd(-__builtin_inf()), __U, __A); - __MM512_REDUCE_OP(max_pd); +extern __inline double +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_max_pd (__mmask8 __U, __m512d __A) +{ + __A = _mm512_mask_mov_pd (_mm512_set1_pd (-__builtin_inf ()), __U, __A); + __MM512_REDUCE_OP (max_pd); } - #undef __MM512_REDUCE_OP - #ifdef __DISABLE_AVX512F__ #undef __DISABLE_AVX512F__ #pragma GCC pop_options -#endif /* __DISABLE_AVX512F__ */ - -#endif /* _AVX512FINTRIN_H_INCLUDED */ +#endif +#endif +#endif diff --git a/third_party/intel/avx512ifmaintrin.internal.h b/third_party/intel/avx512ifmaintrin.internal.h index 48ea3859b..5d10237e1 100644 --- a/third_party/intel/avx512ifmaintrin.internal.h +++ b/third_party/intel/avx512ifmaintrin.internal.h @@ -1,53 +1,74 @@ +/* clang-format off */ +#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0) #ifndef _IMMINTRIN_H_INCLUDED #error "Never use directly; include instead." 
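/* Editor's note: an illustrative sketch, not part of the diff, for the
   madd52 intrinsics this header defines just below. _mm512_madd52lo_epu64
   (X, Y, Z) multiplies the low 52 bits of each unsigned 64-bit lane of Y
   and Z and adds the low 52 bits of the 104-bit product to the matching
   lane of X; _mm512_madd52hi_epu64 adds the high 52 bits instead. Together
   they form one step of a 52-bit-limb big-integer multiply. Assumes an
   avx512ifma target. */
#include <immintrin.h>

static inline void madd52_step(__m512i a, __m512i b,
                               __m512i *acc_lo, __m512i *acc_hi) {
  /* Accumulate the full 104-bit lane products into two 52-bit limbs. */
  *acc_lo = _mm512_madd52lo_epu64(*acc_lo, a, b);
  *acc_hi = _mm512_madd52hi_epu64(*acc_hi, a, b);
}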
#endif - #ifndef _AVX512IFMAINTRIN_H_INCLUDED #define _AVX512IFMAINTRIN_H_INCLUDED - #ifndef __AVX512IFMA__ #pragma GCC push_options #pragma GCC target("avx512ifma") #define __DISABLE_AVX512IFMA__ -#endif /* __AVX512IFMA__ */ - -__funline __m512i _mm512_madd52lo_epu64(__m512i __X, __m512i __Y, __m512i __Z) { - return (__m512i)__builtin_ia32_vpmadd52luq512_mask((__v8di)__X, (__v8di)__Y, - (__v8di)__Z, (__mmask8)-1); +#endif +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_madd52lo_epu64 (__m512i __X, __m512i __Y, __m512i __Z) +{ + return (__m512i) __builtin_ia32_vpmadd52luq512_mask ((__v8di) __X, + (__v8di) __Y, + (__v8di) __Z, + (__mmask8) -1); } - -__funline __m512i _mm512_madd52hi_epu64(__m512i __X, __m512i __Y, __m512i __Z) { - return (__m512i)__builtin_ia32_vpmadd52huq512_mask((__v8di)__X, (__v8di)__Y, - (__v8di)__Z, (__mmask8)-1); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_madd52hi_epu64 (__m512i __X, __m512i __Y, __m512i __Z) +{ + return (__m512i) __builtin_ia32_vpmadd52huq512_mask ((__v8di) __X, + (__v8di) __Y, + (__v8di) __Z, + (__mmask8) -1); } - -__funline __m512i _mm512_mask_madd52lo_epu64(__m512i __W, __mmask8 __M, - __m512i __X, __m512i __Y) { - return (__m512i)__builtin_ia32_vpmadd52luq512_mask( - (__v8di)__W, (__v8di)__X, (__v8di)__Y, (__mmask8)__M); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_madd52lo_epu64 (__m512i __W, __mmask8 __M, __m512i __X, + __m512i __Y) +{ + return (__m512i) __builtin_ia32_vpmadd52luq512_mask ((__v8di) __W, + (__v8di) __X, + (__v8di) __Y, + (__mmask8) __M); } - -__funline __m512i _mm512_mask_madd52hi_epu64(__m512i __W, __mmask8 __M, - __m512i __X, __m512i __Y) { - return (__m512i)__builtin_ia32_vpmadd52huq512_mask( - (__v8di)__W, (__v8di)__X, (__v8di)__Y, (__mmask8)__M); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_madd52hi_epu64 (__m512i __W, __mmask8 __M, __m512i __X, + __m512i __Y) +{ + return (__m512i) __builtin_ia32_vpmadd52huq512_mask ((__v8di) __W, + (__v8di) __X, + (__v8di) __Y, + (__mmask8) __M); } - -__funline __m512i _mm512_maskz_madd52lo_epu64(__mmask8 __M, __m512i __X, - __m512i __Y, __m512i __Z) { - return (__m512i)__builtin_ia32_vpmadd52luq512_maskz( - (__v8di)__X, (__v8di)__Y, (__v8di)__Z, (__mmask8)__M); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_madd52lo_epu64 (__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z) +{ + return (__m512i) __builtin_ia32_vpmadd52luq512_maskz ((__v8di) __X, + (__v8di) __Y, + (__v8di) __Z, + (__mmask8) __M); } - -__funline __m512i _mm512_maskz_madd52hi_epu64(__mmask8 __M, __m512i __X, - __m512i __Y, __m512i __Z) { - return (__m512i)__builtin_ia32_vpmadd52huq512_maskz( - (__v8di)__X, (__v8di)__Y, (__v8di)__Z, (__mmask8)__M); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_madd52hi_epu64 (__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z) +{ + return (__m512i) __builtin_ia32_vpmadd52huq512_maskz ((__v8di) __X, + (__v8di) __Y, + (__v8di) __Z, + (__mmask8) __M); } - #ifdef __DISABLE_AVX512IFMA__ #undef __DISABLE_AVX512IFMA__ #pragma GCC pop_options -#endif /* __DISABLE_AVX512IFMA__ */ - -#endif /* _AVX512IFMAINTRIN_H_INCLUDED */ +#endif +#endif +#endif diff --git a/third_party/intel/avx512ifmavlintrin.internal.h 
b/third_party/intel/avx512ifmavlintrin.internal.h index 7bc9d68ef..c5cff3ebd 100644 --- a/third_party/intel/avx512ifmavlintrin.internal.h +++ b/third_party/intel/avx512ifmavlintrin.internal.h @@ -1,88 +1,128 @@ +/* clang-format off */ +#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0) #ifndef _IMMINTRIN_H_INCLUDED -#error \ - "Never use directly; include instead." +#error "Never use directly; include instead." #endif - #ifndef _AVX512IFMAVLINTRIN_H_INCLUDED #define _AVX512IFMAVLINTRIN_H_INCLUDED - #if !defined(__AVX512VL__) || !defined(__AVX512IFMA__) #pragma GCC push_options #pragma GCC target("avx512ifma,avx512vl") #define __DISABLE_AVX512IFMAVL__ -#endif /* __AVX512IFMAVL__ */ - -__funline __m128i _mm_madd52lo_epu64(__m128i __X, __m128i __Y, __m128i __Z) { - return (__m128i)__builtin_ia32_vpmadd52luq128_mask((__v2di)__X, (__v2di)__Y, - (__v2di)__Z, (__mmask8)-1); +#endif +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_madd52lo_epu64 (__m128i __X, __m128i __Y, __m128i __Z) +{ + return (__m128i) __builtin_ia32_vpmadd52luq128_mask ((__v2di) __X, + (__v2di) __Y, + (__v2di) __Z, + (__mmask8) -1); } - -__funline __m128i _mm_madd52hi_epu64(__m128i __X, __m128i __Y, __m128i __Z) { - return (__m128i)__builtin_ia32_vpmadd52huq128_mask((__v2di)__X, (__v2di)__Y, - (__v2di)__Z, (__mmask8)-1); +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_madd52hi_epu64 (__m128i __X, __m128i __Y, __m128i __Z) +{ + return (__m128i) __builtin_ia32_vpmadd52huq128_mask ((__v2di) __X, + (__v2di) __Y, + (__v2di) __Z, + (__mmask8) -1); } - -__funline __m256i _mm256_madd52lo_epu64(__m256i __X, __m256i __Y, __m256i __Z) { - return (__m256i)__builtin_ia32_vpmadd52luq256_mask((__v4di)__X, (__v4di)__Y, - (__v4di)__Z, (__mmask8)-1); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_madd52lo_epu64 (__m256i __X, __m256i __Y, __m256i __Z) +{ + return (__m256i) __builtin_ia32_vpmadd52luq256_mask ((__v4di) __X, + (__v4di) __Y, + (__v4di) __Z, + (__mmask8) -1); } - -__funline __m256i _mm256_madd52hi_epu64(__m256i __X, __m256i __Y, __m256i __Z) { - return (__m256i)__builtin_ia32_vpmadd52huq256_mask((__v4di)__X, (__v4di)__Y, - (__v4di)__Z, (__mmask8)-1); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_madd52hi_epu64 (__m256i __X, __m256i __Y, __m256i __Z) +{ + return (__m256i) __builtin_ia32_vpmadd52huq256_mask ((__v4di) __X, + (__v4di) __Y, + (__v4di) __Z, + (__mmask8) -1); } - -__funline __m128i _mm_mask_madd52lo_epu64(__m128i __W, __mmask8 __M, __m128i __X, - __m128i __Y) { - return (__m128i)__builtin_ia32_vpmadd52luq128_mask( - (__v2di)__W, (__v2di)__X, (__v2di)__Y, (__mmask8)__M); +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_madd52lo_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_vpmadd52luq128_mask ((__v2di) __W, + (__v2di) __X, + (__v2di) __Y, + (__mmask8) __M); } - -__funline __m128i _mm_mask_madd52hi_epu64(__m128i __W, __mmask8 __M, __m128i __X, - __m128i __Y) { - return (__m128i)__builtin_ia32_vpmadd52huq128_mask( - (__v2di)__W, (__v2di)__X, (__v2di)__Y, (__mmask8)__M); +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_madd52hi_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_vpmadd52huq128_mask 
((__v2di) __W, + (__v2di) __X, + (__v2di) __Y, + (__mmask8) __M); } - -__funline __m256i _mm256_mask_madd52lo_epu64(__m256i __W, __mmask8 __M, - __m256i __X, __m256i __Y) { - return (__m256i)__builtin_ia32_vpmadd52luq256_mask( - (__v4di)__W, (__v4di)__X, (__v4di)__Y, (__mmask8)__M); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_madd52lo_epu64 (__m256i __W, __mmask8 __M, __m256i __X, + __m256i __Y) +{ + return (__m256i) __builtin_ia32_vpmadd52luq256_mask ((__v4di) __W, + (__v4di) __X, + (__v4di) __Y, + (__mmask8) __M); } - -__funline __m256i _mm256_mask_madd52hi_epu64(__m256i __W, __mmask8 __M, - __m256i __X, __m256i __Y) { - return (__m256i)__builtin_ia32_vpmadd52huq256_mask( - (__v4di)__W, (__v4di)__X, (__v4di)__Y, (__mmask8)__M); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_madd52hi_epu64 (__m256i __W, __mmask8 __M, __m256i __X, + __m256i __Y) +{ + return (__m256i) __builtin_ia32_vpmadd52huq256_mask ((__v4di) __W, + (__v4di) __X, + (__v4di) __Y, + (__mmask8) __M); } - -__funline __m128i _mm_maskz_madd52lo_epu64(__mmask8 __M, __m128i __X, __m128i __Y, - __m128i __Z) { - return (__m128i)__builtin_ia32_vpmadd52luq128_maskz( - (__v2di)__X, (__v2di)__Y, (__v2di)__Z, (__mmask8)__M); +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_madd52lo_epu64 (__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z) +{ + return (__m128i) __builtin_ia32_vpmadd52luq128_maskz ((__v2di) __X, + (__v2di) __Y, + (__v2di) __Z, + (__mmask8) __M); } - -__funline __m128i _mm_maskz_madd52hi_epu64(__mmask8 __M, __m128i __X, __m128i __Y, - __m128i __Z) { - return (__m128i)__builtin_ia32_vpmadd52huq128_maskz( - (__v2di)__X, (__v2di)__Y, (__v2di)__Z, (__mmask8)__M); +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_madd52hi_epu64 (__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z) +{ + return (__m128i) __builtin_ia32_vpmadd52huq128_maskz ((__v2di) __X, + (__v2di) __Y, + (__v2di) __Z, + (__mmask8) __M); } - -__funline __m256i _mm256_maskz_madd52lo_epu64(__mmask8 __M, __m256i __X, - __m256i __Y, __m256i __Z) { - return (__m256i)__builtin_ia32_vpmadd52luq256_maskz( - (__v4di)__X, (__v4di)__Y, (__v4di)__Z, (__mmask8)__M); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_madd52lo_epu64 (__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z) +{ + return (__m256i) __builtin_ia32_vpmadd52luq256_maskz ((__v4di) __X, + (__v4di) __Y, + (__v4di) __Z, + (__mmask8) __M); } - -__funline __m256i _mm256_maskz_madd52hi_epu64(__mmask8 __M, __m256i __X, - __m256i __Y, __m256i __Z) { - return (__m256i)__builtin_ia32_vpmadd52huq256_maskz( - (__v4di)__X, (__v4di)__Y, (__v4di)__Z, (__mmask8)__M); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_madd52hi_epu64 (__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z) +{ + return (__m256i) __builtin_ia32_vpmadd52huq256_maskz ((__v4di) __X, + (__v4di) __Y, + (__v4di) __Z, + (__mmask8) __M); } - #ifdef __DISABLE_AVX512IFMAVL__ #undef __DISABLE_AVX512IFMAVL__ #pragma GCC pop_options -#endif /* __DISABLE_AVX512IFMAVL__ */ - -#endif /* _AVX512IFMAVLINTRIN_H_INCLUDED */ +#endif +#endif +#endif diff --git a/third_party/intel/avx512pfintrin.internal.h b/third_party/intel/avx512pfintrin.internal.h index 4401d24f6..0704fe6fc 100644 --- 
a/third_party/intel/avx512pfintrin.internal.h +++ b/third_party/intel/avx512pfintrin.internal.h @@ -1,190 +1,170 @@ +/* clang-format off */ +#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0) #ifndef _IMMINTRIN_H_INCLUDED #error "Never use directly; include instead." #endif - #ifndef _AVX512PFINTRIN_H_INCLUDED #define _AVX512PFINTRIN_H_INCLUDED - #ifndef __AVX512PF__ #pragma GCC push_options #pragma GCC target("avx512pf") #define __DISABLE_AVX512PF__ -#endif /* __AVX512PF__ */ - -typedef long long __v8di __attribute__((__vector_size__(64))); -typedef int __v16si __attribute__((__vector_size__(64))); -typedef long long __m512i __attribute__((__vector_size__(64), __may_alias__)); +#endif +typedef long long __v8di __attribute__ ((__vector_size__ (64))); +typedef int __v16si __attribute__ ((__vector_size__ (64))); +typedef long long __m512i __attribute__ ((__vector_size__ (64), __may_alias__)); typedef unsigned char __mmask8; typedef unsigned short __mmask16; - #ifdef __OPTIMIZE__ -__funline void _mm512_prefetch_i32gather_pd(__m256i __index, void const *__addr, - int __scale, int __hint) { - __builtin_ia32_gatherpfdpd((__mmask8)0xFF, (__v8si)__index, __addr, __scale, - __hint); +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_prefetch_i32gather_pd (__m256i __index, void const *__addr, + int __scale, int __hint) +{ + __builtin_ia32_gatherpfdpd ((__mmask8) 0xFF, (__v8si) __index, __addr, + __scale, __hint); } - -__funline void _mm512_prefetch_i32gather_ps(__m512i __index, void const *__addr, - int __scale, int __hint) { - __builtin_ia32_gatherpfdps((__mmask16)0xFFFF, (__v16si)__index, __addr, - __scale, __hint); +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_prefetch_i32gather_ps (__m512i __index, void const *__addr, + int __scale, int __hint) +{ + __builtin_ia32_gatherpfdps ((__mmask16) 0xFFFF, (__v16si) __index, __addr, + __scale, __hint); } - -__funline void _mm512_mask_prefetch_i32gather_pd(__m256i __index, __mmask8 __mask, - void const *__addr, int __scale, - int __hint) { - __builtin_ia32_gatherpfdpd(__mask, (__v8si)__index, __addr, __scale, __hint); +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_prefetch_i32gather_pd (__m256i __index, __mmask8 __mask, + void const *__addr, int __scale, int __hint) +{ + __builtin_ia32_gatherpfdpd (__mask, (__v8si) __index, __addr, __scale, + __hint); } - -__funline void _mm512_mask_prefetch_i32gather_ps(__m512i __index, - __mmask16 __mask, - void const *__addr, int __scale, - int __hint) { - __builtin_ia32_gatherpfdps(__mask, (__v16si)__index, __addr, __scale, __hint); +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_prefetch_i32gather_ps (__m512i __index, __mmask16 __mask, + void const *__addr, int __scale, int __hint) +{ + __builtin_ia32_gatherpfdps (__mask, (__v16si) __index, __addr, __scale, + __hint); } - -__funline void _mm512_prefetch_i64gather_pd(__m512i __index, void const *__addr, - int __scale, int __hint) { - __builtin_ia32_gatherpfqpd((__mmask8)0xFF, (__v8di)__index, __addr, __scale, - __hint); +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_prefetch_i64gather_pd (__m512i __index, void const *__addr, + int __scale, int __hint) +{ + __builtin_ia32_gatherpfqpd ((__mmask8) 0xFF, (__v8di) __index, __addr, + __scale, __hint); } - -__funline void 
_mm512_prefetch_i64gather_ps(__m512i __index, void const *__addr, - int __scale, int __hint) { - __builtin_ia32_gatherpfqps((__mmask8)0xFF, (__v8di)__index, __addr, __scale, - __hint); +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_prefetch_i64gather_ps (__m512i __index, void const *__addr, + int __scale, int __hint) +{ + __builtin_ia32_gatherpfqps ((__mmask8) 0xFF, (__v8di) __index, __addr, + __scale, __hint); } - -__funline void _mm512_mask_prefetch_i64gather_pd(__m512i __index, __mmask8 __mask, - void const *__addr, int __scale, - int __hint) { - __builtin_ia32_gatherpfqpd(__mask, (__v8di)__index, __addr, __scale, __hint); +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_prefetch_i64gather_pd (__m512i __index, __mmask8 __mask, + void const *__addr, int __scale, int __hint) +{ + __builtin_ia32_gatherpfqpd (__mask, (__v8di) __index, __addr, __scale, + __hint); } - -__funline void _mm512_mask_prefetch_i64gather_ps(__m512i __index, __mmask8 __mask, - void const *__addr, int __scale, - int __hint) { - __builtin_ia32_gatherpfqps(__mask, (__v8di)__index, __addr, __scale, __hint); +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_prefetch_i64gather_ps (__m512i __index, __mmask8 __mask, + void const *__addr, int __scale, int __hint) +{ + __builtin_ia32_gatherpfqps (__mask, (__v8di) __index, __addr, __scale, + __hint); } - -__funline void _mm512_prefetch_i32scatter_pd(void *__addr, __m256i __index, - int __scale, int __hint) { - __builtin_ia32_scatterpfdpd((__mmask8)0xFF, (__v8si)__index, __addr, __scale, - __hint); +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_prefetch_i32scatter_pd (void *__addr, __m256i __index, int __scale, + int __hint) +{ + __builtin_ia32_scatterpfdpd ((__mmask8) 0xFF, (__v8si) __index, __addr, + __scale, __hint); } - -__funline void _mm512_prefetch_i32scatter_ps(void *__addr, __m512i __index, - int __scale, int __hint) { - __builtin_ia32_scatterpfdps((__mmask16)0xFFFF, (__v16si)__index, __addr, - __scale, __hint); +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_prefetch_i32scatter_ps (void *__addr, __m512i __index, int __scale, + int __hint) +{ + __builtin_ia32_scatterpfdps ((__mmask16) 0xFFFF, (__v16si) __index, __addr, + __scale, __hint); } - -__funline void _mm512_mask_prefetch_i32scatter_pd(void *__addr, __mmask8 __mask, - __m256i __index, int __scale, - int __hint) { - __builtin_ia32_scatterpfdpd(__mask, (__v8si)__index, __addr, __scale, __hint); +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_prefetch_i32scatter_pd (void *__addr, __mmask8 __mask, + __m256i __index, int __scale, int __hint) +{ + __builtin_ia32_scatterpfdpd (__mask, (__v8si) __index, __addr, __scale, + __hint); } - -__funline void _mm512_mask_prefetch_i32scatter_ps(void *__addr, __mmask16 __mask, - __m512i __index, int __scale, - int __hint) { - __builtin_ia32_scatterpfdps(__mask, (__v16si)__index, __addr, __scale, - __hint); +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_prefetch_i32scatter_ps (void *__addr, __mmask16 __mask, + __m512i __index, int __scale, int __hint) +{ + __builtin_ia32_scatterpfdps (__mask, (__v16si) __index, __addr, __scale, + __hint); } - -__funline void _mm512_prefetch_i64scatter_pd(void *__addr, __m512i 
__index, - int __scale, int __hint) { - __builtin_ia32_scatterpfqpd((__mmask8)0xFF, (__v8di)__index, __addr, __scale, - __hint); +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_prefetch_i64scatter_pd (void *__addr, __m512i __index, int __scale, + int __hint) +{ + __builtin_ia32_scatterpfqpd ((__mmask8) 0xFF, (__v8di) __index,__addr, + __scale, __hint); } - -__funline void _mm512_prefetch_i64scatter_ps(void *__addr, __m512i __index, - int __scale, int __hint) { - __builtin_ia32_scatterpfqps((__mmask8)0xFF, (__v8di)__index, __addr, __scale, - __hint); +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_prefetch_i64scatter_ps (void *__addr, __m512i __index, int __scale, + int __hint) +{ + __builtin_ia32_scatterpfqps ((__mmask8) 0xFF, (__v8di) __index, __addr, + __scale, __hint); } - -__funline void _mm512_mask_prefetch_i64scatter_pd(void *__addr, __mmask8 __mask, - __m512i __index, int __scale, - int __hint) { - __builtin_ia32_scatterpfqpd(__mask, (__v8di)__index, __addr, __scale, __hint); +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_prefetch_i64scatter_pd (void *__addr, __mmask8 __mask, + __m512i __index, int __scale, int __hint) +{ + __builtin_ia32_scatterpfqpd (__mask, (__v8di) __index, __addr, __scale, + __hint); } - -__funline void _mm512_mask_prefetch_i64scatter_ps(void *__addr, __mmask8 __mask, - __m512i __index, int __scale, - int __hint) { - __builtin_ia32_scatterpfqps(__mask, (__v8di)__index, __addr, __scale, __hint); +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_prefetch_i64scatter_ps (void *__addr, __mmask8 __mask, + __m512i __index, int __scale, int __hint) +{ + __builtin_ia32_scatterpfqps (__mask, (__v8di) __index, __addr, __scale, + __hint); } - #else -#define _mm512_prefetch_i32gather_pd(INDEX, ADDR, SCALE, HINT) \ - __builtin_ia32_gatherpfdpd((__mmask8)0xFF, (__v8si)(__m256i)INDEX, \ - (void const *)ADDR, (int)SCALE, (int)HINT) - -#define _mm512_prefetch_i32gather_ps(INDEX, ADDR, SCALE, HINT) \ - __builtin_ia32_gatherpfdps((__mmask16)0xFFFF, (__v16si)(__m512i)INDEX, \ - (void const *)ADDR, (int)SCALE, (int)HINT) - -#define _mm512_mask_prefetch_i32gather_pd(INDEX, MASK, ADDR, SCALE, HINT) \ - __builtin_ia32_gatherpfdpd((__mmask8)MASK, (__v8si)(__m256i)INDEX, \ - (void const *)ADDR, (int)SCALE, (int)HINT) - -#define _mm512_mask_prefetch_i32gather_ps(INDEX, MASK, ADDR, SCALE, HINT) \ - __builtin_ia32_gatherpfdps((__mmask16)MASK, (__v16si)(__m512i)INDEX, \ - (void const *)ADDR, (int)SCALE, (int)HINT) - -#define _mm512_prefetch_i64gather_pd(INDEX, ADDR, SCALE, HINT) \ - __builtin_ia32_gatherpfqpd((__mmask8)0xFF, (__v8di)(__m512i)INDEX, \ - (void *)ADDR, (int)SCALE, (int)HINT) - -#define _mm512_prefetch_i64gather_ps(INDEX, ADDR, SCALE, HINT) \ - __builtin_ia32_gatherpfqps((__mmask8)0xFF, (__v8di)(__m512i)INDEX, \ - (void *)ADDR, (int)SCALE, (int)HINT) - -#define _mm512_mask_prefetch_i64gather_pd(INDEX, MASK, ADDR, SCALE, HINT) \ - __builtin_ia32_gatherpfqpd((__mmask8)MASK, (__v8di)(__m512i)INDEX, \ - (void *)ADDR, (int)SCALE, (int)HINT) - -#define _mm512_mask_prefetch_i64gather_ps(INDEX, MASK, ADDR, SCALE, HINT) \ - __builtin_ia32_gatherpfqps((__mmask8)MASK, (__v8di)(__m512i)INDEX, \ - (void *)ADDR, (int)SCALE, (int)HINT) - -#define _mm512_prefetch_i32scatter_pd(ADDR, INDEX, SCALE, HINT) \ - __builtin_ia32_scatterpfdpd((__mmask8)0xFF, (__v8si)(__m256i)INDEX, \ - (void 
*)ADDR, (int)SCALE, (int)HINT) - -#define _mm512_prefetch_i32scatter_ps(ADDR, INDEX, SCALE, HINT) \ - __builtin_ia32_scatterpfdps((__mmask16)0xFFFF, (__v16si)(__m512i)INDEX, \ - (void *)ADDR, (int)SCALE, (int)HINT) - -#define _mm512_mask_prefetch_i32scatter_pd(ADDR, MASK, INDEX, SCALE, HINT) \ - __builtin_ia32_scatterpfdpd((__mmask8)MASK, (__v8si)(__m256i)INDEX, \ - (void *)ADDR, (int)SCALE, (int)HINT) - -#define _mm512_mask_prefetch_i32scatter_ps(ADDR, MASK, INDEX, SCALE, HINT) \ - __builtin_ia32_scatterpfdps((__mmask16)MASK, (__v16si)(__m512i)INDEX, \ - (void *)ADDR, (int)SCALE, (int)HINT) - -#define _mm512_prefetch_i64scatter_pd(ADDR, INDEX, SCALE, HINT) \ - __builtin_ia32_scatterpfqpd((__mmask8)0xFF, (__v8di)(__m512i)INDEX, \ - (void *)ADDR, (int)SCALE, (int)HINT) - -#define _mm512_prefetch_i64scatter_ps(ADDR, INDEX, SCALE, HINT) \ - __builtin_ia32_scatterpfqps((__mmask8)0xFF, (__v8di)(__m512i)INDEX, \ - (void *)ADDR, (int)SCALE, (int)HINT) - -#define _mm512_mask_prefetch_i64scatter_pd(ADDR, MASK, INDEX, SCALE, HINT) \ - __builtin_ia32_scatterpfqpd((__mmask8)MASK, (__v8di)(__m512i)INDEX, \ - (void *)ADDR, (int)SCALE, (int)HINT) - -#define _mm512_mask_prefetch_i64scatter_ps(ADDR, MASK, INDEX, SCALE, HINT) \ - __builtin_ia32_scatterpfqps((__mmask8)MASK, (__v8di)(__m512i)INDEX, \ - (void *)ADDR, (int)SCALE, (int)HINT) +#define _mm512_prefetch_i32gather_pd(INDEX, ADDR, SCALE, HINT) __builtin_ia32_gatherpfdpd ((__mmask8)0xFF, (__v8si)(__m256i) (INDEX), (void const *) (ADDR), (int) (SCALE), (int) (HINT)) +#define _mm512_prefetch_i32gather_ps(INDEX, ADDR, SCALE, HINT) __builtin_ia32_gatherpfdps ((__mmask16)0xFFFF, (__v16si)(__m512i) (INDEX), (void const *) (ADDR), (int) (SCALE), (int) (HINT)) +#define _mm512_mask_prefetch_i32gather_pd(INDEX, MASK, ADDR, SCALE, HINT) __builtin_ia32_gatherpfdpd ((__mmask8) (MASK), (__v8si)(__m256i) (INDEX), (void const *) (ADDR), (int) (SCALE), (int) (HINT)) +#define _mm512_mask_prefetch_i32gather_ps(INDEX, MASK, ADDR, SCALE, HINT) __builtin_ia32_gatherpfdps ((__mmask16) (MASK), (__v16si)(__m512i) (INDEX), (void const *) (ADDR), (int) (SCALE), (int) (HINT)) +#define _mm512_prefetch_i64gather_pd(INDEX, ADDR, SCALE, HINT) __builtin_ia32_gatherpfqpd ((__mmask8)0xFF, (__v8di)(__m512i) (INDEX), (void *) (ADDR), (int) (SCALE), (int) (HINT)) +#define _mm512_prefetch_i64gather_ps(INDEX, ADDR, SCALE, HINT) __builtin_ia32_gatherpfqps ((__mmask8)0xFF, (__v8di)(__m512i) (INDEX), (void *) (ADDR), (int) (SCALE), (int) (HINT)) +#define _mm512_mask_prefetch_i64gather_pd(INDEX, MASK, ADDR, SCALE, HINT) __builtin_ia32_gatherpfqpd ((__mmask8) (MASK), (__v8di)(__m512i) (INDEX), (void *) (ADDR), (int) (SCALE), (int) (HINT)) +#define _mm512_mask_prefetch_i64gather_ps(INDEX, MASK, ADDR, SCALE, HINT) __builtin_ia32_gatherpfqps ((__mmask8) (MASK), (__v8di)(__m512i) (INDEX), (void *) (ADDR), (int) (SCALE), (int) (HINT)) +#define _mm512_prefetch_i32scatter_pd(ADDR, INDEX, SCALE, HINT) __builtin_ia32_scatterpfdpd ((__mmask8)0xFF, (__v8si)(__m256i) (INDEX), (void *) (ADDR), (int) (SCALE), (int) (HINT)) +#define _mm512_prefetch_i32scatter_ps(ADDR, INDEX, SCALE, HINT) __builtin_ia32_scatterpfdps ((__mmask16)0xFFFF, (__v16si)(__m512i) (INDEX), (void *) (ADDR), (int) (SCALE), (int) (HINT)) +#define _mm512_mask_prefetch_i32scatter_pd(ADDR, MASK, INDEX, SCALE, HINT) __builtin_ia32_scatterpfdpd ((__mmask8) (MASK), (__v8si)(__m256i) (INDEX), (void *) (ADDR), (int) (SCALE), (int) (HINT)) +#define _mm512_mask_prefetch_i32scatter_ps(ADDR, MASK, INDEX, SCALE, HINT) __builtin_ia32_scatterpfdps 
((__mmask16) (MASK), (__v16si)(__m512i) (INDEX), (void *) (ADDR), (int) (SCALE), (int) (HINT)) +#define _mm512_prefetch_i64scatter_pd(ADDR, INDEX, SCALE, HINT) __builtin_ia32_scatterpfqpd ((__mmask8)0xFF, (__v8di)(__m512i) (INDEX), (void *) (ADDR), (int) (SCALE), (int) (HINT)) +#define _mm512_prefetch_i64scatter_ps(ADDR, INDEX, SCALE, HINT) __builtin_ia32_scatterpfqps ((__mmask8)0xFF, (__v8di)(__m512i) (INDEX), (void *) (ADDR), (int) (SCALE), (int) (HINT)) +#define _mm512_mask_prefetch_i64scatter_pd(ADDR, MASK, INDEX, SCALE, HINT) __builtin_ia32_scatterpfqpd ((__mmask8) (MASK), (__v8di)(__m512i) (INDEX), (void *) (ADDR), (int) (SCALE), (int) (HINT)) +#define _mm512_mask_prefetch_i64scatter_ps(ADDR, MASK, INDEX, SCALE, HINT) __builtin_ia32_scatterpfqps ((__mmask8) (MASK), (__v8di)(__m512i) (INDEX), (void *) (ADDR), (int) (SCALE), (int) (HINT)) #endif - #ifdef __DISABLE_AVX512PF__ #undef __DISABLE_AVX512PF__ #pragma GCC pop_options -#endif /* __DISABLE_AVX512PF__ */ - -#endif /* _AVX512PFINTRIN_H_INCLUDED */ +#endif +#endif +#endif diff --git a/third_party/intel/avx512vbmi2intrin.internal.h b/third_party/intel/avx512vbmi2intrin.internal.h index 1ab952209..0085907c2 100644 --- a/third_party/intel/avx512vbmi2intrin.internal.h +++ b/third_party/intel/avx512vbmi2intrin.internal.h @@ -1,381 +1,407 @@ +/* clang-format off */ +#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0) #ifndef _IMMINTRIN_H_INCLUDED -#error \ - "Never use directly; include instead." +#error "Never use directly; include instead." #endif - #ifndef __AVX512VBMI2INTRIN_H_INCLUDED #define __AVX512VBMI2INTRIN_H_INCLUDED - #if !defined(__AVX512VBMI2__) #pragma GCC push_options #pragma GCC target("avx512vbmi2") #define __DISABLE_AVX512VBMI2__ -#endif /* __AVX512VBMI2__ */ - +#endif #ifdef __OPTIMIZE__ -__funline __m512i _mm512_shrdi_epi16(__m512i __A, __m512i __B, int __C) { - return (__m512i)__builtin_ia32_vpshrd_v32hi((__v32hi)__A, (__v32hi)__B, __C); +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_shrdi_epi16 (__m512i __A, __m512i __B, int __C) +{ + return (__m512i) __builtin_ia32_vpshrd_v32hi ((__v32hi)__A, (__v32hi) __B, + __C); } - -__funline __m512i _mm512_shrdi_epi32(__m512i __A, __m512i __B, int __C) { - return (__m512i)__builtin_ia32_vpshrd_v16si((__v16si)__A, (__v16si)__B, __C); +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_shrdi_epi32 (__m512i __A, __m512i __B, int __C) +{ + return (__m512i) __builtin_ia32_vpshrd_v16si ((__v16si)__A, (__v16si) __B, + __C); } - -__funline __m512i _mm512_mask_shrdi_epi32(__m512i __A, __mmask16 __B, __m512i __C, - __m512i __D, int __E) { - return (__m512i)__builtin_ia32_vpshrd_v16si_mask( - (__v16si)__C, (__v16si)__D, __E, (__v16si)__A, (__mmask16)__B); +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_shrdi_epi32 (__m512i __A, __mmask16 __B, __m512i __C, __m512i __D, + int __E) +{ + return (__m512i)__builtin_ia32_vpshrd_v16si_mask ((__v16si)__C, + (__v16si) __D, __E, (__v16si) __A, (__mmask16)__B); } - -__funline __m512i _mm512_maskz_shrdi_epi32(__mmask16 __A, __m512i __B, - __m512i __C, int __D) { - return (__m512i)__builtin_ia32_vpshrd_v16si_mask( - (__v16si)__B, (__v16si)__C, __D, (__v16si)_mm512_setzero_si512(), - (__mmask16)__A); +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_shrdi_epi32 (__mmask16 __A, __m512i __B, __m512i __C, int __D) +{ + return 
(__m512i)__builtin_ia32_vpshrd_v16si_mask ((__v16si)__B, + (__v16si) __C, __D, (__v16si) _mm512_setzero_si512 (), (__mmask16)__A); } - -__funline __m512i _mm512_shrdi_epi64(__m512i __A, __m512i __B, int __C) { - return (__m512i)__builtin_ia32_vpshrd_v8di((__v8di)__A, (__v8di)__B, __C); +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_shrdi_epi64 (__m512i __A, __m512i __B, int __C) +{ + return (__m512i) __builtin_ia32_vpshrd_v8di ((__v8di)__A, (__v8di) __B, __C); } - -__funline __m512i _mm512_mask_shrdi_epi64(__m512i __A, __mmask8 __B, __m512i __C, - __m512i __D, int __E) { - return (__m512i)__builtin_ia32_vpshrd_v8di_mask((__v8di)__C, (__v8di)__D, __E, - (__v8di)__A, (__mmask8)__B); +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_shrdi_epi64 (__m512i __A, __mmask8 __B, __m512i __C, __m512i __D, + int __E) +{ + return (__m512i)__builtin_ia32_vpshrd_v8di_mask ((__v8di)__C, (__v8di) __D, + __E, (__v8di) __A, (__mmask8)__B); } - -__funline __m512i _mm512_maskz_shrdi_epi64(__mmask8 __A, __m512i __B, __m512i __C, - int __D) { - return (__m512i)__builtin_ia32_vpshrd_v8di_mask( - (__v8di)__B, (__v8di)__C, __D, (__v8di)_mm512_setzero_si512(), - (__mmask8)__A); +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_shrdi_epi64 (__mmask8 __A, __m512i __B, __m512i __C, int __D) +{ + return (__m512i)__builtin_ia32_vpshrd_v8di_mask ((__v8di)__B, (__v8di) __C, + __D, (__v8di) _mm512_setzero_si512 (), (__mmask8)__A); } - -__funline __m512i _mm512_shldi_epi16(__m512i __A, __m512i __B, int __C) { - return (__m512i)__builtin_ia32_vpshld_v32hi((__v32hi)__A, (__v32hi)__B, __C); +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_shldi_epi16 (__m512i __A, __m512i __B, int __C) +{ + return (__m512i) __builtin_ia32_vpshld_v32hi ((__v32hi)__A, (__v32hi) __B, + __C); } - -__funline __m512i _mm512_shldi_epi32(__m512i __A, __m512i __B, int __C) { - return (__m512i)__builtin_ia32_vpshld_v16si((__v16si)__A, (__v16si)__B, __C); +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_shldi_epi32 (__m512i __A, __m512i __B, int __C) +{ + return (__m512i) __builtin_ia32_vpshld_v16si ((__v16si)__A, (__v16si) __B, + __C); } - -__funline __m512i _mm512_mask_shldi_epi32(__m512i __A, __mmask16 __B, __m512i __C, - __m512i __D, int __E) { - return (__m512i)__builtin_ia32_vpshld_v16si_mask( - (__v16si)__C, (__v16si)__D, __E, (__v16si)__A, (__mmask16)__B); +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_shldi_epi32 (__m512i __A, __mmask16 __B, __m512i __C, __m512i __D, + int __E) +{ + return (__m512i)__builtin_ia32_vpshld_v16si_mask ((__v16si)__C, + (__v16si) __D, __E, (__v16si) __A, (__mmask16)__B); } - -__funline __m512i _mm512_maskz_shldi_epi32(__mmask16 __A, __m512i __B, - __m512i __C, int __D) { - return (__m512i)__builtin_ia32_vpshld_v16si_mask( - (__v16si)__B, (__v16si)__C, __D, (__v16si)_mm512_setzero_si512(), - (__mmask16)__A); +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_shldi_epi32 (__mmask16 __A, __m512i __B, __m512i __C, int __D) +{ + return (__m512i)__builtin_ia32_vpshld_v16si_mask ((__v16si)__B, + (__v16si) __C, __D, (__v16si) _mm512_setzero_si512 (), (__mmask16)__A); } - -__funline __m512i _mm512_shldi_epi64(__m512i __A, __m512i __B, int __C) { - 
return (__m512i)__builtin_ia32_vpshld_v8di((__v8di)__A, (__v8di)__B, __C); +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_shldi_epi64 (__m512i __A, __m512i __B, int __C) +{ + return (__m512i) __builtin_ia32_vpshld_v8di ((__v8di)__A, (__v8di) __B, __C); } - -__funline __m512i _mm512_mask_shldi_epi64(__m512i __A, __mmask8 __B, __m512i __C, - __m512i __D, int __E) { - return (__m512i)__builtin_ia32_vpshld_v8di_mask((__v8di)__C, (__v8di)__D, __E, - (__v8di)__A, (__mmask8)__B); +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_shldi_epi64 (__m512i __A, __mmask8 __B, __m512i __C, __m512i __D, + int __E) +{ + return (__m512i)__builtin_ia32_vpshld_v8di_mask ((__v8di)__C, (__v8di) __D, + __E, (__v8di) __A, (__mmask8)__B); } - -__funline __m512i _mm512_maskz_shldi_epi64(__mmask8 __A, __m512i __B, __m512i __C, - int __D) { - return (__m512i)__builtin_ia32_vpshld_v8di_mask( - (__v8di)__B, (__v8di)__C, __D, (__v8di)_mm512_setzero_si512(), - (__mmask8)__A); +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_shldi_epi64 (__mmask8 __A, __m512i __B, __m512i __C, int __D) +{ + return (__m512i)__builtin_ia32_vpshld_v8di_mask ((__v8di)__B, (__v8di) __C, + __D, (__v8di) _mm512_setzero_si512 (), (__mmask8)__A); } #else -#define _mm512_shrdi_epi16(A, B, C) \ - ((__m512i) __builtin_ia32_vpshrd_v32hi ((__v32hi)(__m512i)(A), \ - (__v32hi)(__m512i)(B),(int)(C)) -#define _mm512_shrdi_epi32(A, B, C) \ - ((__m512i) __builtin_ia32_vpshrd_v16si ((__v16si)(__m512i)(A), \ - (__v16si)(__m512i)(B),(int)(C)) -#define _mm512_mask_shrdi_epi32(A, B, C, D, E) \ - ((__m512i) __builtin_ia32_vpshrd_v16si_mask ((__v16si)(__m512i)(C), \ - (__v16si)(__m512i)(D), (int)(E), (__v16si)(__m512i)(A),(__mmask16)(B)) -#define _mm512_maskz_shrdi_epi32(A, B, C, D) \ - ((__m512i) __builtin_ia32_vpshrd_v16si_mask ((__v16si)(__m512i)(B), \ - (__v16si)(__m512i)(C),(int)(D), \ - (__v16si)(__m512i)_mm512_setzero_si512 (), (__mmask16)(A)) -#define _mm512_shrdi_epi64(A, B, C) \ - ((__m512i) __builtin_ia32_vpshrd_v8di ((__v8di)(__m512i)(A), \ - (__v8di)(__m512i)(B),(int)(C)) -#define _mm512_mask_shrdi_epi64(A, B, C, D, E) \ - ((__m512i) __builtin_ia32_vpshrd_v8di_mask ((__v8di)(__m512i)(C), \ - (__v8di)(__m512i)(D), (int)(E), (__v8di)(__m512i)(A),(__mmask8)(B)) -#define _mm512_maskz_shrdi_epi64(A, B, C, D) \ - ((__m512i) __builtin_ia32_vpshrd_v8di_mask ((__v8di)(__m512i)(B), \ - (__v8di)(__m512i)(C),(int)(D), \ - (__v8di)(__m512i)_mm512_setzero_si512 (), (__mmask8)(A)) -#define _mm512_shldi_epi16(A, B, C) \ - ((__m512i) __builtin_ia32_vpshld_v32hi ((__v32hi)(__m512i)(A), \ - (__v32hi)(__m512i)(B),(int)(C)) -#define _mm512_shldi_epi32(A, B, C) \ - ((__m512i) __builtin_ia32_vpshld_v16si ((__v16si)(__m512i)(A), \ - (__v16si)(__m512i)(B),(int)(C)) -#define _mm512_mask_shldi_epi32(A, B, C, D, E) \ - ((__m512i) __builtin_ia32_vpshld_v16si_mask ((__v16si)(__m512i)(C), \ - (__v16si)(__m512i)(D), (int)(E), (__v16si)(__m512i)(A),(__mmask16)(B)) -#define _mm512_maskz_shldi_epi32(A, B, C, D) \ - ((__m512i) __builtin_ia32_vpshld_v16si_mask ((__v16si)(__m512i)(B), \ - (__v16si)(__m512i)(C),(int)(D), \ - (__v16si)(__m512i)_mm512_setzero_si512 (), (__mmask16)(A)) -#define _mm512_shldi_epi64(A, B, C) \ - ((__m512i) __builtin_ia32_vpshld_v8di ((__v8di)(__m512i)(A), \ - (__v8di)(__m512i)(B),(int)(C)) -#define _mm512_mask_shldi_epi64(A, B, C, D, E) \ - ((__m512i) __builtin_ia32_vpshld_v8di_mask 
((__v8di)(__m512i)(C), \ - (__v8di)(__m512i)(D), (int)(E), (__v8di)(__m512i)(A),(__mmask8)(B)) -#define _mm512_maskz_shldi_epi64(A, B, C, D) \ - ((__m512i) __builtin_ia32_vpshld_v8di_mask ((__v8di)(__m512i)(B), \ - (__v8di)(__m512i)(C),(int)(D), \ - (__v8di)(__m512i)_mm512_setzero_si512 (), (__mmask8)(A)) +#define _mm512_shrdi_epi16(A, B, C) ((__m512i) __builtin_ia32_vpshrd_v32hi ((__v32hi)(__m512i)(A), (__v32hi)(__m512i)(B),(int)(C))) +#define _mm512_shrdi_epi32(A, B, C) ((__m512i) __builtin_ia32_vpshrd_v16si ((__v16si)(__m512i)(A), (__v16si)(__m512i)(B),(int)(C))) +#define _mm512_mask_shrdi_epi32(A, B, C, D, E) ((__m512i) __builtin_ia32_vpshrd_v16si_mask ((__v16si)(__m512i)(C), (__v16si)(__m512i)(D), (int)(E), (__v16si)(__m512i)(A), (__mmask16)(B))) +#define _mm512_maskz_shrdi_epi32(A, B, C, D) ((__m512i) __builtin_ia32_vpshrd_v16si_mask ((__v16si)(__m512i)(B), (__v16si)(__m512i)(C),(int)(D), (__v16si)(__m512i)_mm512_setzero_si512 (), (__mmask16)(A))) +#define _mm512_shrdi_epi64(A, B, C) ((__m512i) __builtin_ia32_vpshrd_v8di ((__v8di)(__m512i)(A), (__v8di)(__m512i)(B),(int)(C))) +#define _mm512_mask_shrdi_epi64(A, B, C, D, E) ((__m512i) __builtin_ia32_vpshrd_v8di_mask ((__v8di)(__m512i)(C), (__v8di)(__m512i)(D), (int)(E), (__v8di)(__m512i)(A), (__mmask8)(B))) +#define _mm512_maskz_shrdi_epi64(A, B, C, D) ((__m512i) __builtin_ia32_vpshrd_v8di_mask ((__v8di)(__m512i)(B), (__v8di)(__m512i)(C),(int)(D), (__v8di)(__m512i)_mm512_setzero_si512 (), (__mmask8)(A))) +#define _mm512_shldi_epi16(A, B, C) ((__m512i) __builtin_ia32_vpshld_v32hi ((__v32hi)(__m512i)(A), (__v32hi)(__m512i)(B),(int)(C))) +#define _mm512_shldi_epi32(A, B, C) ((__m512i) __builtin_ia32_vpshld_v16si ((__v16si)(__m512i)(A), (__v16si)(__m512i)(B),(int)(C))) +#define _mm512_mask_shldi_epi32(A, B, C, D, E) ((__m512i) __builtin_ia32_vpshld_v16si_mask ((__v16si)(__m512i)(C), (__v16si)(__m512i)(D), (int)(E), (__v16si)(__m512i)(A), (__mmask16)(B))) +#define _mm512_maskz_shldi_epi32(A, B, C, D) ((__m512i) __builtin_ia32_vpshld_v16si_mask ((__v16si)(__m512i)(B), (__v16si)(__m512i)(C),(int)(D), (__v16si)(__m512i)_mm512_setzero_si512 (), (__mmask16)(A))) +#define _mm512_shldi_epi64(A, B, C) ((__m512i) __builtin_ia32_vpshld_v8di ((__v8di)(__m512i)(A), (__v8di)(__m512i)(B), (int)(C))) +#define _mm512_mask_shldi_epi64(A, B, C, D, E) ((__m512i) __builtin_ia32_vpshld_v8di_mask ((__v8di)(__m512i)(C), (__v8di)(__m512i)(D), (int)(E), (__v8di)(__m512i)(A), (__mmask8)(B))) +#define _mm512_maskz_shldi_epi64(A, B, C, D) ((__m512i) __builtin_ia32_vpshld_v8di_mask ((__v8di)(__m512i)(B), (__v8di)(__m512i)(C),(int)(D), (__v8di)(__m512i)_mm512_setzero_si512 (), (__mmask8)(A))) #endif - -__funline __m512i _mm512_shrdv_epi16(__m512i __A, __m512i __B, __m512i __C) { - return (__m512i)__builtin_ia32_vpshrdv_v32hi((__v32hi)__A, (__v32hi)__B, - (__v32hi)__C); +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_shrdv_epi16 (__m512i __A, __m512i __B, __m512i __C) +{ + return (__m512i) __builtin_ia32_vpshrdv_v32hi ((__v32hi)__A, (__v32hi) __B, + (__v32hi) __C); } - -__funline __m512i _mm512_shrdv_epi32(__m512i __A, __m512i __B, __m512i __C) { - return (__m512i)__builtin_ia32_vpshrdv_v16si((__v16si)__A, (__v16si)__B, - (__v16si)__C); +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_shrdv_epi32 (__m512i __A, __m512i __B, __m512i __C) +{ + return (__m512i) __builtin_ia32_vpshrdv_v16si ((__v16si)__A, (__v16si) __B, + (__v16si) __C); } - -__funline __m512i 
_mm512_mask_shrdv_epi32(__m512i __A, __mmask16 __B, __m512i __C, - __m512i __D) { - return (__m512i)__builtin_ia32_vpshrdv_v16si_mask( - (__v16si)__A, (__v16si)__C, (__v16si)__D, (__mmask16)__B); +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_shrdv_epi32 (__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) +{ + return (__m512i)__builtin_ia32_vpshrdv_v16si_mask ((__v16si)__A, + (__v16si) __C, (__v16si) __D, (__mmask16)__B); } - -__funline __m512i _mm512_maskz_shrdv_epi32(__mmask16 __A, __m512i __B, - __m512i __C, __m512i __D) { - return (__m512i)__builtin_ia32_vpshrdv_v16si_maskz( - (__v16si)__B, (__v16si)__C, (__v16si)__D, (__mmask16)__A); +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_shrdv_epi32 (__mmask16 __A, __m512i __B, __m512i __C, __m512i __D) +{ + return (__m512i)__builtin_ia32_vpshrdv_v16si_maskz ((__v16si)__B, + (__v16si) __C, (__v16si) __D, (__mmask16)__A); } - -__funline __m512i _mm512_shrdv_epi64(__m512i __A, __m512i __B, __m512i __C) { - return (__m512i)__builtin_ia32_vpshrdv_v8di((__v8di)__A, (__v8di)__B, - (__v8di)__C); +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_shrdv_epi64 (__m512i __A, __m512i __B, __m512i __C) +{ + return (__m512i) __builtin_ia32_vpshrdv_v8di ((__v8di)__A, (__v8di) __B, + (__v8di) __C); } - -__funline __m512i _mm512_mask_shrdv_epi64(__m512i __A, __mmask8 __B, __m512i __C, - __m512i __D) { - return (__m512i)__builtin_ia32_vpshrdv_v8di_mask((__v8di)__A, (__v8di)__C, - (__v8di)__D, (__mmask8)__B); +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_shrdv_epi64 (__m512i __A, __mmask8 __B, __m512i __C, __m512i __D) +{ + return (__m512i)__builtin_ia32_vpshrdv_v8di_mask ((__v8di)__A, (__v8di) __C, + (__v8di) __D, (__mmask8)__B); } - -__funline __m512i _mm512_maskz_shrdv_epi64(__mmask8 __A, __m512i __B, __m512i __C, - __m512i __D) { - return (__m512i)__builtin_ia32_vpshrdv_v8di_maskz((__v8di)__B, (__v8di)__C, - (__v8di)__D, (__mmask8)__A); +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_shrdv_epi64 (__mmask8 __A, __m512i __B, __m512i __C, __m512i __D) +{ + return (__m512i)__builtin_ia32_vpshrdv_v8di_maskz ((__v8di)__B, (__v8di) __C, + (__v8di) __D, (__mmask8)__A); } -__funline __m512i _mm512_shldv_epi16(__m512i __A, __m512i __B, __m512i __C) { - return (__m512i)__builtin_ia32_vpshldv_v32hi((__v32hi)__A, (__v32hi)__B, - (__v32hi)__C); +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_shldv_epi16 (__m512i __A, __m512i __B, __m512i __C) +{ + return (__m512i) __builtin_ia32_vpshldv_v32hi ((__v32hi)__A, (__v32hi) __B, + (__v32hi) __C); } - -__funline __m512i _mm512_shldv_epi32(__m512i __A, __m512i __B, __m512i __C) { - return (__m512i)__builtin_ia32_vpshldv_v16si((__v16si)__A, (__v16si)__B, - (__v16si)__C); +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_shldv_epi32 (__m512i __A, __m512i __B, __m512i __C) +{ + return (__m512i) __builtin_ia32_vpshldv_v16si ((__v16si)__A, (__v16si) __B, + (__v16si) __C); } - -__funline __m512i _mm512_mask_shldv_epi32(__m512i __A, __mmask16 __B, __m512i __C, - __m512i __D) { - return (__m512i)__builtin_ia32_vpshldv_v16si_mask( - (__v16si)__A, (__v16si)__C, (__v16si)__D, (__mmask16)__B); +extern __inline __m512i +__attribute__((__gnu_inline__, 
__always_inline__, __artificial__)) +_mm512_mask_shldv_epi32 (__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) +{ + return (__m512i)__builtin_ia32_vpshldv_v16si_mask ((__v16si)__A, + (__v16si) __C, (__v16si) __D, (__mmask16)__B); } - -__funline __m512i _mm512_maskz_shldv_epi32(__mmask16 __A, __m512i __B, - __m512i __C, __m512i __D) { - return (__m512i)__builtin_ia32_vpshldv_v16si_maskz( - (__v16si)__B, (__v16si)__C, (__v16si)__D, (__mmask16)__A); +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_shldv_epi32 (__mmask16 __A, __m512i __B, __m512i __C, __m512i __D) +{ + return (__m512i)__builtin_ia32_vpshldv_v16si_maskz ((__v16si)__B, + (__v16si) __C, (__v16si) __D, (__mmask16)__A); } - -__funline __m512i _mm512_shldv_epi64(__m512i __A, __m512i __B, __m512i __C) { - return (__m512i)__builtin_ia32_vpshldv_v8di((__v8di)__A, (__v8di)__B, - (__v8di)__C); +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_shldv_epi64 (__m512i __A, __m512i __B, __m512i __C) +{ + return (__m512i) __builtin_ia32_vpshldv_v8di ((__v8di)__A, (__v8di) __B, + (__v8di) __C); } - -__funline __m512i _mm512_mask_shldv_epi64(__m512i __A, __mmask8 __B, __m512i __C, - __m512i __D) { - return (__m512i)__builtin_ia32_vpshldv_v8di_mask((__v8di)__A, (__v8di)__C, - (__v8di)__D, (__mmask8)__B); +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_shldv_epi64 (__m512i __A, __mmask8 __B, __m512i __C, __m512i __D) +{ + return (__m512i)__builtin_ia32_vpshldv_v8di_mask ((__v8di)__A, (__v8di) __C, + (__v8di) __D, (__mmask8)__B); } - -__funline __m512i _mm512_maskz_shldv_epi64(__mmask8 __A, __m512i __B, __m512i __C, - __m512i __D) { - return (__m512i)__builtin_ia32_vpshldv_v8di_maskz((__v8di)__B, (__v8di)__C, - (__v8di)__D, (__mmask8)__A); +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_shldv_epi64 (__mmask8 __A, __m512i __B, __m512i __C, __m512i __D) +{ + return (__m512i)__builtin_ia32_vpshldv_v8di_maskz ((__v8di)__B, (__v8di) __C, + (__v8di) __D, (__mmask8)__A); } - #ifdef __DISABLE_AVX512VBMI2__ #undef __DISABLE_AVX512VBMI2__ - #pragma GCC pop_options -#endif /* __DISABLE_AVX512VBMI2__ */ - +#endif #if !defined(__AVX512VBMI2__) || !defined(__AVX512BW__) #pragma GCC push_options #pragma GCC target("avx512vbmi2,avx512bw") #define __DISABLE_AVX512VBMI2BW__ -#endif /* __AVX512VBMI2BW__ */ - -__funline __m512i _mm512_mask_compress_epi8(__m512i __A, __mmask64 __B, - __m512i __C) { - return (__m512i)__builtin_ia32_compressqi512_mask((__v64qi)__C, (__v64qi)__A, - (__mmask64)__B); -} - -__funline __m512i _mm512_maskz_compress_epi8(__mmask64 __A, __m512i __B) { - return (__m512i)__builtin_ia32_compressqi512_mask( - (__v64qi)__B, (__v64qi)_mm512_setzero_si512(), (__mmask64)__A); -} - -__funline void _mm512_mask_compressstoreu_epi8(void *__A, __mmask64 __B, - __m512i __C) { - __builtin_ia32_compressstoreuqi512_mask((__v64qi *)__A, (__v64qi)__C, - (__mmask64)__B); -} - -__funline __m512i _mm512_mask_compress_epi16(__m512i __A, __mmask32 __B, - __m512i __C) { - return (__m512i)__builtin_ia32_compresshi512_mask((__v32hi)__C, (__v32hi)__A, - (__mmask32)__B); -} - -__funline __m512i _mm512_maskz_compress_epi16(__mmask32 __A, __m512i __B) { - return (__m512i)__builtin_ia32_compresshi512_mask( - (__v32hi)__B, (__v32hi)_mm512_setzero_si512(), (__mmask32)__A); -} - -__funline void _mm512_mask_compressstoreu_epi16(void *__A, __mmask32 __B, - 
__m512i __C) { - __builtin_ia32_compressstoreuhi512_mask((__v32hi *)__A, (__v32hi)__C, - (__mmask32)__B); -} - -__funline __m512i _mm512_mask_expand_epi8(__m512i __A, __mmask64 __B, - __m512i __C) { - return (__m512i)__builtin_ia32_expandqi512_mask((__v64qi)__C, (__v64qi)__A, - (__mmask64)__B); -} - -__funline __m512i _mm512_maskz_expand_epi8(__mmask64 __A, __m512i __B) { - return (__m512i)__builtin_ia32_expandqi512_maskz( - (__v64qi)__B, (__v64qi)_mm512_setzero_si512(), (__mmask64)__A); -} - -__funline __m512i _mm512_mask_expandloadu_epi8(__m512i __A, __mmask64 __B, - const void *__C) { - return (__m512i)__builtin_ia32_expandloadqi512_mask( - (const __v64qi *)__C, (__v64qi)__A, (__mmask64)__B); -} - -__funline __m512i _mm512_maskz_expandloadu_epi8(__mmask64 __A, const void *__B) { - return (__m512i)__builtin_ia32_expandloadqi512_maskz( - (const __v64qi *)__B, (__v64qi)_mm512_setzero_si512(), (__mmask64)__A); -} - -__funline __m512i _mm512_mask_expand_epi16(__m512i __A, __mmask32 __B, - __m512i __C) { - return (__m512i)__builtin_ia32_expandhi512_mask((__v32hi)__C, (__v32hi)__A, - (__mmask32)__B); -} - -__funline __m512i _mm512_maskz_expand_epi16(__mmask32 __A, __m512i __B) { - return (__m512i)__builtin_ia32_expandhi512_maskz( - (__v32hi)__B, (__v32hi)_mm512_setzero_si512(), (__mmask32)__A); -} - -__funline __m512i _mm512_mask_expandloadu_epi16(__m512i __A, __mmask32 __B, - const void *__C) { - return (__m512i)__builtin_ia32_expandloadhi512_mask( - (const __v32hi *)__C, (__v32hi)__A, (__mmask32)__B); -} - -__funline __m512i _mm512_maskz_expandloadu_epi16(__mmask32 __A, const void *__B) { - return (__m512i)__builtin_ia32_expandloadhi512_maskz( - (const __v32hi *)__B, (__v32hi)_mm512_setzero_si512(), (__mmask32)__A); -} - -#ifdef __OPTIMIZE__ -__funline __m512i _mm512_mask_shrdi_epi16(__m512i __A, __mmask32 __B, __m512i __C, - __m512i __D, int __E) { - return (__m512i)__builtin_ia32_vpshrd_v32hi_mask( - (__v32hi)__C, (__v32hi)__D, __E, (__v32hi)__A, (__mmask32)__B); -} - -__funline __m512i _mm512_maskz_shrdi_epi16(__mmask32 __A, __m512i __B, - __m512i __C, int __D) { - return (__m512i)__builtin_ia32_vpshrd_v32hi_mask( - (__v32hi)__B, (__v32hi)__C, __D, (__v32hi)_mm512_setzero_si512(), - (__mmask32)__A); -} - -__funline __m512i _mm512_mask_shldi_epi16(__m512i __A, __mmask32 __B, __m512i __C, - __m512i __D, int __E) { - return (__m512i)__builtin_ia32_vpshld_v32hi_mask( - (__v32hi)__C, (__v32hi)__D, __E, (__v32hi)__A, (__mmask32)__B); -} - -__funline __m512i _mm512_maskz_shldi_epi16(__mmask32 __A, __m512i __B, - __m512i __C, int __D) { - return (__m512i)__builtin_ia32_vpshld_v32hi_mask( - (__v32hi)__B, (__v32hi)__C, __D, (__v32hi)_mm512_setzero_si512(), - (__mmask32)__A); -} - -#else -#define _mm512_mask_shrdi_epi16(A, B, C, D, E) \ - ((__m512i) __builtin_ia32_vpshrd_v32hi_mask ((__v32hi)(__m512i)(C), \ - (__v32hi)(__m512i)(D), (int)(E), (__v32hi)(__m512i)(A),(__mmask32)(B)) -#define _mm512_maskz_shrdi_epi16(A, B, C, D) \ - ((__m512i) __builtin_ia32_vpshrd_v32hi_mask ((__v32hi)(__m512i)(B), \ - (__v32hi)(__m512i)(C),(int)(D), \ - (__v32hi)(__m512i)_mm512_setzero_si512 (), (__mmask32)(A)) -#define _mm512_mask_shldi_epi16(A, B, C, D, E) \ - ((__m512i) __builtin_ia32_vpshld_v32hi_mask ((__v32hi)(__m512i)(C), \ - (__v32hi)(__m512i)(D), (int)(E), (__v32hi)(__m512i)(A),(__mmask32)(B)) -#define _mm512_maskz_shldi_epi16(A, B, C, D) \ - ((__m512i) __builtin_ia32_vpshld_v32hi_mask ((__v32hi)(__m512i)(B), \ - (__v32hi)(__m512i)(C),(int)(D), \ - (__v32hi)(__m512i)_mm512_setzero_si512 (), (__mmask32)(A)) 
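/* Editor's note: a hedged sketch, not part of the diff, of what the
   shldi/shrdi intrinsics above compute. Each 16-bit result lane is cut
   from the 32-bit concatenation of the corresponding source lanes, so
   _mm512_shldi_epi16(a, b, n) yields ((a << n) | (b >> (16 - n))) per
   lane, and passing the same vector twice rotates each lane. Assumes an
   avx512vbmi2 + avx512bw target; the shift count must be a compile-time
   constant (hence the macro forms when __OPTIMIZE__ is off). */
#include <immintrin.h>

static inline __m512i rol7_epi16(__m512i a) {
  /* Funnel-shifting a lane with itself is a rotate-left by 7. */
  return _mm512_shldi_epi16(a, a, 7);
}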
#endif - -__funline __m512i _mm512_mask_shrdv_epi16(__m512i __A, __mmask32 __B, __m512i __C, - __m512i __D) { - return (__m512i)__builtin_ia32_vpshrdv_v32hi_mask( - (__v32hi)__A, (__v32hi)__C, (__v32hi)__D, (__mmask32)__B); +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_compress_epi8 (__m512i __A, __mmask64 __B, __m512i __C) +{ + return (__m512i) __builtin_ia32_compressqi512_mask ((__v64qi)__C, + (__v64qi)__A, (__mmask64)__B); } - -__funline __m512i _mm512_maskz_shrdv_epi16(__mmask32 __A, __m512i __B, - __m512i __C, __m512i __D) { - return (__m512i)__builtin_ia32_vpshrdv_v32hi_maskz( - (__v32hi)__B, (__v32hi)__C, (__v32hi)__D, (__mmask32)__A); +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_compress_epi8 (__mmask64 __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_compressqi512_mask ((__v64qi)__B, + (__v64qi)_mm512_setzero_si512 (), (__mmask64)__A); } - -__funline __m512i _mm512_mask_shldv_epi16(__m512i __A, __mmask32 __B, __m512i __C, - __m512i __D) { - return (__m512i)__builtin_ia32_vpshldv_v32hi_mask( - (__v32hi)__A, (__v32hi)__C, (__v32hi)__D, (__mmask32)__B); +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_compressstoreu_epi8 (void * __A, __mmask64 __B, __m512i __C) +{ + __builtin_ia32_compressstoreuqi512_mask ((__v64qi *) __A, (__v64qi) __C, + (__mmask64) __B); } - -__funline __m512i _mm512_maskz_shldv_epi16(__mmask32 __A, __m512i __B, - __m512i __C, __m512i __D) { - return (__m512i)__builtin_ia32_vpshldv_v32hi_maskz( - (__v32hi)__B, (__v32hi)__C, (__v32hi)__D, (__mmask32)__A); +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_compress_epi16 (__m512i __A, __mmask32 __B, __m512i __C) +{ + return (__m512i) __builtin_ia32_compresshi512_mask ((__v32hi)__C, + (__v32hi)__A, (__mmask32)__B); +} +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_compress_epi16 (__mmask32 __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_compresshi512_mask ((__v32hi)__B, + (__v32hi)_mm512_setzero_si512 (), (__mmask32)__A); +} +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_compressstoreu_epi16 (void * __A, __mmask32 __B, __m512i __C) +{ + __builtin_ia32_compressstoreuhi512_mask ((__v32hi *) __A, (__v32hi) __C, + (__mmask32) __B); +} +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_expand_epi8 (__m512i __A, __mmask64 __B, __m512i __C) +{ + return (__m512i) __builtin_ia32_expandqi512_mask ((__v64qi) __C, + (__v64qi) __A, + (__mmask64) __B); +} +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_expand_epi8 (__mmask64 __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_expandqi512_maskz ((__v64qi) __B, + (__v64qi) _mm512_setzero_si512 (), (__mmask64) __A); +} +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_expandloadu_epi8 (__m512i __A, __mmask64 __B, const void * __C) +{ + return (__m512i) __builtin_ia32_expandloadqi512_mask ((const __v64qi *) __C, + (__v64qi) __A, (__mmask64) __B); +} +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_expandloadu_epi8 (__mmask64 __A, const void * __B) +{ + return (__m512i) __builtin_ia32_expandloadqi512_maskz 
((const __v64qi *) __B, + (__v64qi) _mm512_setzero_si512 (), (__mmask64) __A); +} +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_expand_epi16 (__m512i __A, __mmask32 __B, __m512i __C) +{ + return (__m512i) __builtin_ia32_expandhi512_mask ((__v32hi) __C, + (__v32hi) __A, + (__mmask32) __B); +} +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_expand_epi16 (__mmask32 __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_expandhi512_maskz ((__v32hi) __B, + (__v32hi) _mm512_setzero_si512 (), (__mmask32) __A); +} +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_expandloadu_epi16 (__m512i __A, __mmask32 __B, const void * __C) +{ + return (__m512i) __builtin_ia32_expandloadhi512_mask ((const __v32hi *) __C, + (__v32hi) __A, (__mmask32) __B); +} +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_expandloadu_epi16 (__mmask32 __A, const void * __B) +{ + return (__m512i) __builtin_ia32_expandloadhi512_maskz ((const __v32hi *) __B, + (__v32hi) _mm512_setzero_si512 (), (__mmask32) __A); +} +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_shrdi_epi16 (__m512i __A, __mmask32 __B, __m512i __C, __m512i __D, + int __E) +{ + return (__m512i)__builtin_ia32_vpshrd_v32hi_mask ((__v32hi)__C, + (__v32hi) __D, __E, (__v32hi) __A, (__mmask32)__B); +} +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_shrdi_epi16 (__mmask32 __A, __m512i __B, __m512i __C, int __D) +{ + return (__m512i)__builtin_ia32_vpshrd_v32hi_mask ((__v32hi)__B, + (__v32hi) __C, __D, (__v32hi) _mm512_setzero_si512 (), (__mmask32)__A); +} +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_shldi_epi16 (__m512i __A, __mmask32 __B, __m512i __C, __m512i __D, + int __E) +{ + return (__m512i)__builtin_ia32_vpshld_v32hi_mask ((__v32hi)__C, + (__v32hi) __D, __E, (__v32hi) __A, (__mmask32)__B); +} +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_shldi_epi16 (__mmask32 __A, __m512i __B, __m512i __C, int __D) +{ + return (__m512i)__builtin_ia32_vpshld_v32hi_mask ((__v32hi)__B, + (__v32hi) __C, __D, (__v32hi) _mm512_setzero_si512 (), (__mmask32)__A); +} +#else +#define _mm512_mask_shrdi_epi16(A, B, C, D, E) ((__m512i) __builtin_ia32_vpshrd_v32hi_mask ((__v32hi)(__m512i)(C), (__v32hi)(__m512i)(D), (int)(E), (__v32hi)(__m512i)(A), (__mmask32)(B))) +#define _mm512_maskz_shrdi_epi16(A, B, C, D) ((__m512i) __builtin_ia32_vpshrd_v32hi_mask ((__v32hi)(__m512i)(B), (__v32hi)(__m512i)(C),(int)(D), (__v32hi)(__m512i)_mm512_setzero_si512 (), (__mmask32)(A))) +#define _mm512_mask_shldi_epi16(A, B, C, D, E) ((__m512i) __builtin_ia32_vpshld_v32hi_mask ((__v32hi)(__m512i)(C), (__v32hi)(__m512i)(D), (int)(E), (__v32hi)(__m512i)(A), (__mmask32)(B))) +#define _mm512_maskz_shldi_epi16(A, B, C, D) ((__m512i) __builtin_ia32_vpshld_v32hi_mask ((__v32hi)(__m512i)(B), (__v32hi)(__m512i)(C),(int)(D), (__v32hi)(__m512i)_mm512_setzero_si512 (), (__mmask32)(A))) +#endif +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_shrdv_epi16 (__m512i __A, __mmask32 __B, __m512i __C, __m512i __D) +{ + return (__m512i)__builtin_ia32_vpshrdv_v32hi_mask ((__v32hi)__A, + (__v32hi) 
__C, (__v32hi) __D, (__mmask32)__B); +} +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_shrdv_epi16 (__mmask32 __A, __m512i __B, __m512i __C, __m512i __D) +{ + return (__m512i)__builtin_ia32_vpshrdv_v32hi_maskz ((__v32hi)__B, + (__v32hi) __C, (__v32hi) __D, (__mmask32)__A); +} +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_shldv_epi16 (__m512i __A, __mmask32 __B, __m512i __C, __m512i __D) +{ + return (__m512i)__builtin_ia32_vpshldv_v32hi_mask ((__v32hi)__A, + (__v32hi) __C, (__v32hi) __D, (__mmask32)__B); +} +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_shldv_epi16 (__mmask32 __A, __m512i __B, __m512i __C, __m512i __D) +{ + return (__m512i)__builtin_ia32_vpshldv_v32hi_maskz ((__v32hi)__B, + (__v32hi) __C, (__v32hi) __D, (__mmask32)__A); } - #ifdef __DISABLE_AVX512VBMI2BW__ #undef __DISABLE_AVX512VBMI2BW__ - #pragma GCC pop_options -#endif /* __DISABLE_AVX512VBMI2BW__ */ - -#endif /* __AVX512VBMI2INTRIN_H_INCLUDED */ +#endif +#endif +#endif diff --git a/third_party/intel/avx512vbmi2vlintrin.internal.h b/third_party/intel/avx512vbmi2vlintrin.internal.h index 92bda6c13..88ec57aab 100644 --- a/third_party/intel/avx512vbmi2vlintrin.internal.h +++ b/third_party/intel/avx512vbmi2vlintrin.internal.h @@ -1,716 +1,772 @@ +/* clang-format off */ +#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0) #ifndef _IMMINTRIN_H_INCLUDED -#error \ - "Never use directly; include instead." +#error "Never use directly; include instead." #endif - #ifndef _AVX512VBMI2VLINTRIN_H_INCLUDED #define _AVX512VBMI2VLINTRIN_H_INCLUDED - #if !defined(__AVX512VL__) || !defined(__AVX512VBMI2__) #pragma GCC push_options #pragma GCC target("avx512vbmi2,avx512vl") #define __DISABLE_AVX512VBMI2VL__ -#endif /* __AVX512VBMIVL__ */ - -__funline __m128i _mm_mask_compress_epi8(__m128i __A, __mmask16 __B, - __m128i __C) { - return (__m128i)__builtin_ia32_compressqi128_mask((__v16qi)__C, (__v16qi)__A, - (__mmask16)__B); +#endif +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_compress_epi8 (__m128i __A, __mmask16 __B, __m128i __C) +{ + return (__m128i) __builtin_ia32_compressqi128_mask ((__v16qi)__C, + (__v16qi)__A, (__mmask16)__B); } - -__funline __m128i _mm_maskz_compress_epi8(__mmask16 __A, __m128i __B) { - return (__m128i)__builtin_ia32_compressqi128_mask( - (__v16qi)__B, (__v16qi)_mm_setzero_si128(), (__mmask16)__A); +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_compress_epi8 (__mmask16 __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_compressqi128_mask ((__v16qi) __B, + (__v16qi) _mm_setzero_si128 (), (__mmask16) __A); } - -__funline void _mm256_mask_compressstoreu_epi16(void *__A, __mmask16 __B, - __m256i __C) { - __builtin_ia32_compressstoreuhi256_mask((__v16hi *)__A, (__v16hi)__C, - (__mmask16)__B); +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_compressstoreu_epi16 (void * __A, __mmask16 __B, __m256i __C) +{ + __builtin_ia32_compressstoreuhi256_mask ((__v16hi *) __A, (__v16hi) __C, + (__mmask16) __B); } - -__funline __m128i _mm_mask_compress_epi16(__m128i __A, __mmask8 __B, - __m128i __C) { - return (__m128i)__builtin_ia32_compresshi128_mask((__v8hi)__C, (__v8hi)__A, - (__mmask8)__B); +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, 
__artificial__)) +_mm_mask_compress_epi16 (__m128i __A, __mmask8 __B, __m128i __C) +{ + return (__m128i) __builtin_ia32_compresshi128_mask ((__v8hi)__C, (__v8hi)__A, + (__mmask8)__B); } - -__funline __m128i _mm_maskz_compress_epi16(__mmask8 __A, __m128i __B) { - return (__m128i)__builtin_ia32_compresshi128_mask( - (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__A); +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_compress_epi16 (__mmask8 __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_compresshi128_mask ((__v8hi) __B, + (__v8hi) _mm_setzero_si128 (), (__mmask8) __A); } - -__funline __m256i _mm256_mask_compress_epi16(__m256i __A, __mmask16 __B, - __m256i __C) { - return (__m256i)__builtin_ia32_compresshi256_mask((__v16hi)__C, (__v16hi)__A, - (__mmask16)__B); +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_compress_epi16 (__m256i __A, __mmask16 __B, __m256i __C) +{ + return (__m256i) __builtin_ia32_compresshi256_mask ((__v16hi)__C, + (__v16hi)__A, (__mmask16)__B); } - -__funline __m256i _mm256_maskz_compress_epi16(__mmask16 __A, __m256i __B) { - return (__m256i)__builtin_ia32_compresshi256_mask( - (__v16hi)__B, (__v16hi)_mm256_setzero_si256(), (__mmask16)__A); +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_compress_epi16 (__mmask16 __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_compresshi256_mask ((__v16hi) __B, + (__v16hi) _mm256_setzero_si256 (), (__mmask16) __A); } - -__funline void _mm_mask_compressstoreu_epi8(void *__A, __mmask16 __B, - __m128i __C) { - __builtin_ia32_compressstoreuqi128_mask((__v16qi *)__A, (__v16qi)__C, - (__mmask16)__B); +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_compressstoreu_epi8 (void * __A, __mmask16 __B, __m128i __C) +{ + __builtin_ia32_compressstoreuqi128_mask ((__v16qi *) __A, (__v16qi) __C, + (__mmask16) __B); } - -__funline void _mm_mask_compressstoreu_epi16(void *__A, __mmask8 __B, - __m128i __C) { - __builtin_ia32_compressstoreuhi128_mask((__v8hi *)__A, (__v8hi)__C, - (__mmask8)__B); +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_compressstoreu_epi16 (void * __A, __mmask8 __B, __m128i __C) +{ + __builtin_ia32_compressstoreuhi128_mask ((__v8hi *) __A, (__v8hi) __C, + (__mmask8) __B); } - -__funline __m128i _mm_mask_expand_epi8(__m128i __A, __mmask16 __B, __m128i __C) { - return (__m128i)__builtin_ia32_expandqi128_mask((__v16qi)__C, (__v16qi)__A, - (__mmask16)__B); +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_expand_epi8 (__m128i __A, __mmask16 __B, __m128i __C) +{ + return (__m128i) __builtin_ia32_expandqi128_mask ((__v16qi) __C, + (__v16qi) __A, + (__mmask16) __B); } - -__funline __m128i _mm_maskz_expand_epi8(__mmask16 __A, __m128i __B) { - return (__m128i)__builtin_ia32_expandqi128_maskz( - (__v16qi)__B, (__v16qi)_mm_setzero_si128(), (__mmask16)__A); +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_expand_epi8 (__mmask16 __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_expandqi128_maskz ((__v16qi) __B, + (__v16qi) _mm_setzero_si128 (), (__mmask16) __A); } - -__funline __m128i _mm_mask_expandloadu_epi8(__m128i __A, __mmask16 __B, - const void *__C) { - return (__m128i)__builtin_ia32_expandloadqi128_mask( - (const __v16qi *)__C, 
(__v16qi)__A, (__mmask16)__B); +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_expandloadu_epi8 (__m128i __A, __mmask16 __B, const void * __C) +{ + return (__m128i) __builtin_ia32_expandloadqi128_mask ((const __v16qi *) __C, + (__v16qi) __A, (__mmask16) __B); } - -__funline __m128i _mm_maskz_expandloadu_epi8(__mmask16 __A, const void *__B) { - return (__m128i)__builtin_ia32_expandloadqi128_maskz( - (const __v16qi *)__B, (__v16qi)_mm_setzero_si128(), (__mmask16)__A); +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_expandloadu_epi8 (__mmask16 __A, const void * __B) +{ + return (__m128i) __builtin_ia32_expandloadqi128_maskz ((const __v16qi *) __B, + (__v16qi) _mm_setzero_si128 (), (__mmask16) __A); } - -__funline __m128i _mm_mask_expand_epi16(__m128i __A, __mmask8 __B, __m128i __C) { - return (__m128i)__builtin_ia32_expandhi128_mask((__v8hi)__C, (__v8hi)__A, - (__mmask8)__B); +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_expand_epi16 (__m128i __A, __mmask8 __B, __m128i __C) +{ + return (__m128i) __builtin_ia32_expandhi128_mask ((__v8hi) __C, + (__v8hi) __A, + (__mmask8) __B); } - -__funline __m128i _mm_maskz_expand_epi16(__mmask8 __A, __m128i __B) { - return (__m128i)__builtin_ia32_expandhi128_maskz( - (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__A); +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_expand_epi16 (__mmask8 __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_expandhi128_maskz ((__v8hi) __B, + (__v8hi) _mm_setzero_si128 (), (__mmask8) __A); } - -__funline __m128i _mm_mask_expandloadu_epi16(__m128i __A, __mmask8 __B, - const void *__C) { - return (__m128i)__builtin_ia32_expandloadhi128_mask( - (const __v8hi *)__C, (__v8hi)__A, (__mmask8)__B); +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_expandloadu_epi16 (__m128i __A, __mmask8 __B, const void * __C) +{ + return (__m128i) __builtin_ia32_expandloadhi128_mask ((const __v8hi *) __C, + (__v8hi) __A, (__mmask8) __B); } - -__funline __m128i _mm_maskz_expandloadu_epi16(__mmask8 __A, const void *__B) { - return (__m128i)__builtin_ia32_expandloadhi128_maskz( - (const __v8hi *)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__A); +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_expandloadu_epi16 (__mmask8 __A, const void * __B) +{ + return (__m128i) __builtin_ia32_expandloadhi128_maskz ((const __v8hi *) __B, + (__v8hi) _mm_setzero_si128 (), (__mmask8) __A); } -__funline __m256i _mm256_mask_expand_epi16(__m256i __A, __mmask16 __B, - __m256i __C) { - return (__m256i)__builtin_ia32_expandhi256_mask((__v16hi)__C, (__v16hi)__A, - (__mmask16)__B); +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_expand_epi16 (__m256i __A, __mmask16 __B, __m256i __C) +{ + return (__m256i) __builtin_ia32_expandhi256_mask ((__v16hi) __C, + (__v16hi) __A, + (__mmask16) __B); } - -__funline __m256i _mm256_maskz_expand_epi16(__mmask16 __A, __m256i __B) { - return (__m256i)__builtin_ia32_expandhi256_maskz( - (__v16hi)__B, (__v16hi)_mm256_setzero_si256(), (__mmask16)__A); +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_expand_epi16 (__mmask16 __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_expandhi256_maskz 
((__v16hi) __B, + (__v16hi) _mm256_setzero_si256 (), (__mmask16) __A); } - -__funline __m256i _mm256_mask_expandloadu_epi16(__m256i __A, __mmask16 __B, - const void *__C) { - return (__m256i)__builtin_ia32_expandloadhi256_mask( - (const __v16hi *)__C, (__v16hi)__A, (__mmask16)__B); +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_expandloadu_epi16 (__m256i __A, __mmask16 __B, const void * __C) +{ + return (__m256i) __builtin_ia32_expandloadhi256_mask ((const __v16hi *) __C, + (__v16hi) __A, (__mmask16) __B); } - -__funline __m256i _mm256_maskz_expandloadu_epi16(__mmask16 __A, const void *__B) { - return (__m256i)__builtin_ia32_expandloadhi256_maskz( - (const __v16hi *)__B, (__v16hi)_mm256_setzero_si256(), (__mmask16)__A); +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_expandloadu_epi16 (__mmask16 __A, const void * __B) +{ + return (__m256i) __builtin_ia32_expandloadhi256_maskz ((const __v16hi *) __B, + (__v16hi) _mm256_setzero_si256 (), (__mmask16) __A); } - #ifdef __OPTIMIZE__ -__funline __m256i _mm256_shrdi_epi16(__m256i __A, __m256i __B, int __C) { - return (__m256i)__builtin_ia32_vpshrd_v16hi((__v16hi)__A, (__v16hi)__B, __C); +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_shrdi_epi16 (__m256i __A, __m256i __B, int __C) +{ + return (__m256i) __builtin_ia32_vpshrd_v16hi ((__v16hi)__A, (__v16hi) __B, + __C); } - -__funline __m256i _mm256_mask_shrdi_epi16(__m256i __A, __mmask16 __B, __m256i __C, - __m256i __D, int __E) { - return (__m256i)__builtin_ia32_vpshrd_v16hi_mask( - (__v16hi)__C, (__v16hi)__D, __E, (__v16hi)__A, (__mmask16)__B); +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_shrdi_epi16 (__m256i __A, __mmask16 __B, __m256i __C, __m256i __D, + int __E) +{ + return (__m256i)__builtin_ia32_vpshrd_v16hi_mask ((__v16hi)__C, + (__v16hi) __D, __E, (__v16hi) __A, (__mmask16)__B); } - -__funline __m256i _mm256_maskz_shrdi_epi16(__mmask16 __A, __m256i __B, - __m256i __C, int __D) { - return (__m256i)__builtin_ia32_vpshrd_v16hi_mask( - (__v16hi)__B, (__v16hi)__C, __D, (__v16hi)_mm256_setzero_si256(), - (__mmask16)__A); +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_shrdi_epi16 (__mmask16 __A, __m256i __B, __m256i __C, int __D) +{ + return (__m256i)__builtin_ia32_vpshrd_v16hi_mask ((__v16hi)__B, + (__v16hi) __C, __D, (__v16hi) _mm256_setzero_si256 (), (__mmask16)__A); } - -__funline __m256i _mm256_mask_shrdi_epi32(__m256i __A, __mmask8 __B, __m256i __C, - __m256i __D, int __E) { - return (__m256i)__builtin_ia32_vpshrd_v8si_mask((__v8si)__C, (__v8si)__D, __E, - (__v8si)__A, (__mmask8)__B); +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_shrdi_epi32 (__m256i __A, __mmask8 __B, __m256i __C, __m256i __D, + int __E) +{ + return (__m256i)__builtin_ia32_vpshrd_v8si_mask ((__v8si)__C, (__v8si) __D, + __E, (__v8si) __A, (__mmask8)__B); } - -__funline __m256i _mm256_maskz_shrdi_epi32(__mmask8 __A, __m256i __B, __m256i __C, - int __D) { - return (__m256i)__builtin_ia32_vpshrd_v8si_mask( - (__v8si)__B, (__v8si)__C, __D, (__v8si)_mm256_setzero_si256(), - (__mmask8)__A); +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_shrdi_epi32 (__mmask8 __A, __m256i __B, __m256i __C, int __D) +{ + return 
(__m256i)__builtin_ia32_vpshrd_v8si_mask ((__v8si)__B, (__v8si) __C, + __D, (__v8si) _mm256_setzero_si256 (), (__mmask8)__A); } - -__funline __m256i _mm256_shrdi_epi32(__m256i __A, __m256i __B, int __C) { - return (__m256i)__builtin_ia32_vpshrd_v8si((__v8si)__A, (__v8si)__B, __C); +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_shrdi_epi32 (__m256i __A, __m256i __B, int __C) +{ + return (__m256i) __builtin_ia32_vpshrd_v8si ((__v8si)__A, (__v8si) __B, __C); } - -__funline __m256i _mm256_mask_shrdi_epi64(__m256i __A, __mmask8 __B, __m256i __C, - __m256i __D, int __E) { - return (__m256i)__builtin_ia32_vpshrd_v4di_mask((__v4di)__C, (__v4di)__D, __E, - (__v4di)__A, (__mmask8)__B); +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_shrdi_epi64 (__m256i __A, __mmask8 __B, __m256i __C, __m256i __D, + int __E) +{ + return (__m256i)__builtin_ia32_vpshrd_v4di_mask ((__v4di)__C, (__v4di) __D, + __E, (__v4di) __A, (__mmask8)__B); } - -__funline __m256i _mm256_maskz_shrdi_epi64(__mmask8 __A, __m256i __B, __m256i __C, - int __D) { - return (__m256i)__builtin_ia32_vpshrd_v4di_mask( - (__v4di)__B, (__v4di)__C, __D, (__v4di)_mm256_setzero_si256(), - (__mmask8)__A); +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_shrdi_epi64 (__mmask8 __A, __m256i __B, __m256i __C, int __D) +{ + return (__m256i)__builtin_ia32_vpshrd_v4di_mask ((__v4di)__B, (__v4di) __C, + __D, (__v4di) _mm256_setzero_si256 (), (__mmask8)__A); } - -__funline __m256i _mm256_shrdi_epi64(__m256i __A, __m256i __B, int __C) { - return (__m256i)__builtin_ia32_vpshrd_v4di((__v4di)__A, (__v4di)__B, __C); +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_shrdi_epi64 (__m256i __A, __m256i __B, int __C) +{ + return (__m256i) __builtin_ia32_vpshrd_v4di ((__v4di)__A, (__v4di) __B, __C); } - -__funline __m128i _mm_mask_shrdi_epi16(__m128i __A, __mmask8 __B, __m128i __C, - __m128i __D, int __E) { - return (__m128i)__builtin_ia32_vpshrd_v8hi_mask((__v8hi)__C, (__v8hi)__D, __E, - (__v8hi)__A, (__mmask8)__B); +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_shrdi_epi16 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D, + int __E) +{ + return (__m128i)__builtin_ia32_vpshrd_v8hi_mask ((__v8hi)__C, (__v8hi) __D, + __E, (__v8hi) __A, (__mmask8)__B); } - -__funline __m128i _mm_maskz_shrdi_epi16(__mmask8 __A, __m128i __B, __m128i __C, - int __D) { - return (__m128i)__builtin_ia32_vpshrd_v8hi_mask((__v8hi)__B, (__v8hi)__C, __D, - (__v8hi)_mm_setzero_si128(), - (__mmask8)__A); +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_shrdi_epi16 (__mmask8 __A, __m128i __B, __m128i __C, int __D) +{ + return (__m128i)__builtin_ia32_vpshrd_v8hi_mask ((__v8hi)__B, (__v8hi) __C, + __D, (__v8hi) _mm_setzero_si128 (), (__mmask8)__A); } - -__funline __m128i _mm_shrdi_epi16(__m128i __A, __m128i __B, int __C) { - return (__m128i)__builtin_ia32_vpshrd_v8hi((__v8hi)__A, (__v8hi)__B, __C); +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shrdi_epi16 (__m128i __A, __m128i __B, int __C) +{ + return (__m128i) __builtin_ia32_vpshrd_v8hi ((__v8hi)__A, (__v8hi) __B, __C); } - -__funline __m128i _mm_mask_shrdi_epi32(__m128i __A, __mmask8 __B, __m128i __C, - __m128i __D, int __E) { - return 
(__m128i)__builtin_ia32_vpshrd_v4si_mask((__v4si)__C, (__v4si)__D, __E, - (__v4si)__A, (__mmask8)__B); +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_shrdi_epi32 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D, + int __E) +{ + return (__m128i)__builtin_ia32_vpshrd_v4si_mask ((__v4si)__C, (__v4si) __D, + __E, (__v4si) __A, (__mmask8)__B); } - -__funline __m128i _mm_maskz_shrdi_epi32(__mmask8 __A, __m128i __B, __m128i __C, - int __D) { - return (__m128i)__builtin_ia32_vpshrd_v4si_mask((__v4si)__B, (__v4si)__C, __D, - (__v4si)_mm_setzero_si128(), - (__mmask8)__A); +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_shrdi_epi32 (__mmask8 __A, __m128i __B, __m128i __C, int __D) +{ + return (__m128i)__builtin_ia32_vpshrd_v4si_mask ((__v4si)__B, (__v4si) __C, + __D, (__v4si) _mm_setzero_si128 (), (__mmask8)__A); } - -__funline __m128i _mm_shrdi_epi32(__m128i __A, __m128i __B, int __C) { - return (__m128i)__builtin_ia32_vpshrd_v4si((__v4si)__A, (__v4si)__B, __C); +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shrdi_epi32 (__m128i __A, __m128i __B, int __C) +{ + return (__m128i) __builtin_ia32_vpshrd_v4si ((__v4si)__A, (__v4si) __B, __C); } - -__funline __m128i _mm_mask_shrdi_epi64(__m128i __A, __mmask8 __B, __m128i __C, - __m128i __D, int __E) { - return (__m128i)__builtin_ia32_vpshrd_v2di_mask((__v2di)__C, (__v2di)__D, __E, - (__v2di)__A, (__mmask8)__B); +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_shrdi_epi64 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D, + int __E) +{ + return (__m128i)__builtin_ia32_vpshrd_v2di_mask ((__v2di)__C, (__v2di) __D, + __E, (__v2di) __A, (__mmask8)__B); } - -__funline __m128i _mm_maskz_shrdi_epi64(__mmask8 __A, __m128i __B, __m128i __C, - int __D) { - return (__m128i)__builtin_ia32_vpshrd_v2di_mask((__v2di)__B, (__v2di)__C, __D, - (__v2di)_mm_setzero_si128(), - (__mmask8)__A); +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_shrdi_epi64 (__mmask8 __A, __m128i __B, __m128i __C, int __D) +{ + return (__m128i)__builtin_ia32_vpshrd_v2di_mask ((__v2di)__B, (__v2di) __C, + __D, (__v2di) _mm_setzero_si128 (), (__mmask8)__A); } - -__funline __m128i _mm_shrdi_epi64(__m128i __A, __m128i __B, int __C) { - return (__m128i)__builtin_ia32_vpshrd_v2di((__v2di)__A, (__v2di)__B, __C); +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shrdi_epi64 (__m128i __A, __m128i __B, int __C) +{ + return (__m128i) __builtin_ia32_vpshrd_v2di ((__v2di)__A, (__v2di) __B, __C); } - -__funline __m256i _mm256_shldi_epi16(__m256i __A, __m256i __B, int __C) { - return (__m256i)__builtin_ia32_vpshld_v16hi((__v16hi)__A, (__v16hi)__B, __C); +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_shldi_epi16 (__m256i __A, __m256i __B, int __C) +{ + return (__m256i) __builtin_ia32_vpshld_v16hi ((__v16hi)__A, (__v16hi) __B, + __C); } - -__funline __m256i _mm256_mask_shldi_epi16(__m256i __A, __mmask16 __B, __m256i __C, - __m256i __D, int __E) { - return (__m256i)__builtin_ia32_vpshld_v16hi_mask( - (__v16hi)__C, (__v16hi)__D, __E, (__v16hi)__A, (__mmask16)__B); +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_shldi_epi16 (__m256i __A, __mmask16 __B, __m256i __C, __m256i __D, + int __E) 
+{ + return (__m256i)__builtin_ia32_vpshld_v16hi_mask ((__v16hi)__C, + (__v16hi) __D, __E, (__v16hi) __A, (__mmask16)__B); } - -__funline __m256i _mm256_maskz_shldi_epi16(__mmask16 __A, __m256i __B, - __m256i __C, int __D) { - return (__m256i)__builtin_ia32_vpshld_v16hi_mask( - (__v16hi)__B, (__v16hi)__C, __D, (__v16hi)_mm256_setzero_si256(), - (__mmask16)__A); +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_shldi_epi16 (__mmask16 __A, __m256i __B, __m256i __C, int __D) +{ + return (__m256i)__builtin_ia32_vpshld_v16hi_mask ((__v16hi)__B, + (__v16hi) __C, __D, (__v16hi) _mm256_setzero_si256 (), (__mmask16)__A); } - -__funline __m256i _mm256_mask_shldi_epi32(__m256i __A, __mmask8 __B, __m256i __C, - __m256i __D, int __E) { - return (__m256i)__builtin_ia32_vpshld_v8si_mask((__v8si)__C, (__v8si)__D, __E, - (__v8si)__A, (__mmask8)__B); +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_shldi_epi32 (__m256i __A, __mmask8 __B, __m256i __C, __m256i __D, + int __E) +{ + return (__m256i)__builtin_ia32_vpshld_v8si_mask ((__v8si)__C, (__v8si) __D, + __E, (__v8si) __A, (__mmask8)__B); } - -__funline __m256i _mm256_maskz_shldi_epi32(__mmask8 __A, __m256i __B, __m256i __C, - int __D) { - return (__m256i)__builtin_ia32_vpshld_v8si_mask( - (__v8si)__B, (__v8si)__C, __D, (__v8si)_mm256_setzero_si256(), - (__mmask8)__A); +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_shldi_epi32 (__mmask8 __A, __m256i __B, __m256i __C, int __D) +{ + return (__m256i)__builtin_ia32_vpshld_v8si_mask ((__v8si)__B, (__v8si) __C, + __D, (__v8si) _mm256_setzero_si256 (), (__mmask8)__A); } - -__funline __m256i _mm256_shldi_epi32(__m256i __A, __m256i __B, int __C) { - return (__m256i)__builtin_ia32_vpshld_v8si((__v8si)__A, (__v8si)__B, __C); +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_shldi_epi32 (__m256i __A, __m256i __B, int __C) +{ + return (__m256i) __builtin_ia32_vpshld_v8si ((__v8si)__A, (__v8si) __B, __C); } - -__funline __m256i _mm256_mask_shldi_epi64(__m256i __A, __mmask8 __B, __m256i __C, - __m256i __D, int __E) { - return (__m256i)__builtin_ia32_vpshld_v4di_mask((__v4di)__C, (__v4di)__D, __E, - (__v4di)__A, (__mmask8)__B); +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_shldi_epi64 (__m256i __A, __mmask8 __B, __m256i __C, __m256i __D, + int __E) +{ + return (__m256i)__builtin_ia32_vpshld_v4di_mask ((__v4di)__C, (__v4di) __D, + __E, (__v4di) __A, (__mmask8)__B); } - -__funline __m256i _mm256_maskz_shldi_epi64(__mmask8 __A, __m256i __B, __m256i __C, - int __D) { - return (__m256i)__builtin_ia32_vpshld_v4di_mask( - (__v4di)__B, (__v4di)__C, __D, (__v4di)_mm256_setzero_si256(), - (__mmask8)__A); +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_shldi_epi64 (__mmask8 __A, __m256i __B, __m256i __C, int __D) +{ + return (__m256i)__builtin_ia32_vpshld_v4di_mask ((__v4di)__B, (__v4di) __C, + __D, (__v4di) _mm256_setzero_si256 (), (__mmask8)__A); } - -__funline __m256i _mm256_shldi_epi64(__m256i __A, __m256i __B, int __C) { - return (__m256i)__builtin_ia32_vpshld_v4di((__v4di)__A, (__v4di)__B, __C); +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_shldi_epi64 (__m256i __A, __m256i __B, int __C) +{ + return (__m256i) __builtin_ia32_vpshld_v4di 
((__v4di)__A, (__v4di) __B, __C); } - -__funline __m128i _mm_mask_shldi_epi16(__m128i __A, __mmask8 __B, __m128i __C, - __m128i __D, int __E) { - return (__m128i)__builtin_ia32_vpshld_v8hi_mask((__v8hi)__C, (__v8hi)__D, __E, - (__v8hi)__A, (__mmask8)__B); +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_shldi_epi16 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D, + int __E) +{ + return (__m128i)__builtin_ia32_vpshld_v8hi_mask ((__v8hi)__C, (__v8hi) __D, + __E, (__v8hi) __A, (__mmask8)__B); } - -__funline __m128i _mm_maskz_shldi_epi16(__mmask8 __A, __m128i __B, __m128i __C, - int __D) { - return (__m128i)__builtin_ia32_vpshld_v8hi_mask((__v8hi)__B, (__v8hi)__C, __D, - (__v8hi)_mm_setzero_si128(), - (__mmask8)__A); +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_shldi_epi16 (__mmask8 __A, __m128i __B, __m128i __C, int __D) +{ + return (__m128i)__builtin_ia32_vpshld_v8hi_mask ((__v8hi)__B, (__v8hi) __C, + __D, (__v8hi) _mm_setzero_si128 (), (__mmask8)__A); } - -__funline __m128i _mm_shldi_epi16(__m128i __A, __m128i __B, int __C) { - return (__m128i)__builtin_ia32_vpshld_v8hi((__v8hi)__A, (__v8hi)__B, __C); +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shldi_epi16 (__m128i __A, __m128i __B, int __C) +{ + return (__m128i) __builtin_ia32_vpshld_v8hi ((__v8hi)__A, (__v8hi) __B, __C); } - -__funline __m128i _mm_mask_shldi_epi32(__m128i __A, __mmask8 __B, __m128i __C, - __m128i __D, int __E) { - return (__m128i)__builtin_ia32_vpshld_v4si_mask((__v4si)__C, (__v4si)__D, __E, - (__v4si)__A, (__mmask8)__B); +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_shldi_epi32 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D, + int __E) +{ + return (__m128i)__builtin_ia32_vpshld_v4si_mask ((__v4si)__C, (__v4si) __D, + __E, (__v4si) __A, (__mmask8)__B); } - -__funline __m128i _mm_maskz_shldi_epi32(__mmask8 __A, __m128i __B, __m128i __C, - int __D) { - return (__m128i)__builtin_ia32_vpshld_v4si_mask((__v4si)__B, (__v4si)__C, __D, - (__v4si)_mm_setzero_si128(), - (__mmask8)__A); +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_shldi_epi32 (__mmask8 __A, __m128i __B, __m128i __C, int __D) +{ + return (__m128i)__builtin_ia32_vpshld_v4si_mask ((__v4si)__B, (__v4si) __C, + __D, (__v4si) _mm_setzero_si128 (), (__mmask8)__A); } - -__funline __m128i _mm_shldi_epi32(__m128i __A, __m128i __B, int __C) { - return (__m128i)__builtin_ia32_vpshld_v4si((__v4si)__A, (__v4si)__B, __C); +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shldi_epi32 (__m128i __A, __m128i __B, int __C) +{ + return (__m128i) __builtin_ia32_vpshld_v4si ((__v4si)__A, (__v4si) __B, __C); } - -__funline __m128i _mm_mask_shldi_epi64(__m128i __A, __mmask8 __B, __m128i __C, - __m128i __D, int __E) { - return (__m128i)__builtin_ia32_vpshld_v2di_mask((__v2di)__C, (__v2di)__D, __E, - (__v2di)__A, (__mmask8)__B); +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_shldi_epi64 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D, + int __E) +{ + return (__m128i)__builtin_ia32_vpshld_v2di_mask ((__v2di)__C, (__v2di) __D, + __E, (__v2di) __A, (__mmask8)__B); } - -__funline __m128i _mm_maskz_shldi_epi64(__mmask8 __A, __m128i __B, __m128i __C, - int __D) { - return 
(__m128i)__builtin_ia32_vpshld_v2di_mask((__v2di)__B, (__v2di)__C, __D, - (__v2di)_mm_setzero_si128(), - (__mmask8)__A); +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_shldi_epi64 (__mmask8 __A, __m128i __B, __m128i __C, int __D) +{ + return (__m128i)__builtin_ia32_vpshld_v2di_mask ((__v2di)__B, (__v2di) __C, + __D, (__v2di) _mm_setzero_si128 (), (__mmask8)__A); } - -__funline __m128i _mm_shldi_epi64(__m128i __A, __m128i __B, int __C) { - return (__m128i)__builtin_ia32_vpshld_v2di((__v2di)__A, (__v2di)__B, __C); +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shldi_epi64 (__m128i __A, __m128i __B, int __C) +{ + return (__m128i) __builtin_ia32_vpshld_v2di ((__v2di)__A, (__v2di) __B, __C); } #else -#define _mm256_shrdi_epi16(A, B, C) \ - ((__m256i) __builtin_ia32_vpshrd_v16hi ((__v16hi)(__m256i)(A), \ - (__v16hi)(__m256i)(B),(int)(C)) -#define _mm256_mask_shrdi_epi16(A, B, C, D, E) \ - ((__m256i) __builtin_ia32_vpshrd_v16hi_mask ((__v16hi)(__m256i)(C), \ - (__v16hi)(__m256i)(D), (int)(E), (__v16hi)(__m256i)(A),(__mmask16)(B)) -#define _mm256_maskz_shrdi_epi16(A, B, C, D) \ - ((__m256i) __builtin_ia32_vpshrd_v16hi_mask ((__v16hi)(__m256i)(B), \ - (__v16hi)(__m256i)(C),(int)(D), \ - (__v16hi)(__m256i)_mm256_setzero_si256 (), (__mmask16)(A)) -#define _mm256_shrdi_epi32(A, B, C) \ - ((__m256i) __builtin_ia32_vpshrd_v8si ((__v8si)(__m256i)(A), \ - (__v8si)(__m256i)(B),(int)(C)) -#define _mm256_mask_shrdi_epi32(A, B, C, D, E) \ - ((__m256i) __builtin_ia32_vpshrd_v8si_mask ((__v8si)(__m256i)(C), \ - (__v8si)(__m256i)(D), (int)(E), (__v8si)(__m256i)(A),(__mmask8)(B)) -#define _mm256_maskz_shrdi_epi32(A, B, C, D) \ - ((__m256i) __builtin_ia32_vpshrd_v8si_mask ((__v8si)(__m256i)(B), \ - (__v8si)(__m256i)(C),(int)(D), \ - (__v8si)(__m256i)_mm256_setzero_si256 (), (__mmask8)(A)) -#define _mm256_shrdi_epi64(A, B, C) \ - ((__m256i) __builtin_ia32_vpshrd_v4di ((__v4di)(__m256i)(A), \ - (__v4di)(__m256i)(B),(int)(C)) -#define _mm256_mask_shrdi_epi64(A, B, C, D, E) \ - ((__m256i) __builtin_ia32_vpshrd_v4di_mask ((__v4di)(__m256i)(C), \ - (__v4di)(__m256i)(D), (int)(E), (__v4di)(__m256i)(A),(__mmask8)(B)) -#define _mm256_maskz_shrdi_epi64(A, B, C, D) \ - ((__m256i) __builtin_ia32_vpshrd_v4di_mask ((__v4di)(__m256i)(B), \ - (__v4di)(__m256i)(C),(int)(D), \ - (__v4di)(__m256i)_mm256_setzero_si256 (), (__mmask8)(A)) -#define _mm_shrdi_epi16(A, B, C) \ - ((__m128i) __builtin_ia32_vpshrd_v8hi ((__v8hi)(__m128i)(A), \ - (__v8hi)(__m128i)(B),(int)(C)) -#define _mm_mask_shrdi_epi16(A, B, C, D, E) \ - ((__m128i) __builtin_ia32_vpshrd_v8hi_mask ((__v8hi)(__m128i)(C), \ - (__v8hi)(__m128i)(D), (int)(E), (__v8hi)(__m128i)(A),(__mmask8)(B)) -#define _mm_maskz_shrdi_epi16(A, B, C, D) \ - ((__m128i) __builtin_ia32_vpshrd_v8hi_mask ((__v8hi)(__m128i)(B), \ - (__v8hi)(__m128i)(C),(int)(D), \ - (__v8hi)(__m128i)_mm_setzero_si128 (), (__mmask8)(A)) -#define _mm_shrdi_epi32(A, B, C) \ - ((__m128i) __builtin_ia32_vpshrd_v4si ((__v4si)(__m128i)(A), \ - (__v4si)(__m128i)(B),(int)(C)) -#define _mm_mask_shrdi_epi32(A, B, C, D, E) \ - ((__m128i) __builtin_ia32_vpshrd_v4si_mask ((__v4si)(__m128i)(C), \ - (__v4si)(__m128i)(D), (int)(E), (__v4si)(__m128i)(A),(__mmask8)(B)) -#define _mm_maskz_shrdi_epi32(A, B, C, D) \ - ((__m128i) __builtin_ia32_vpshrd_v4si_mask ((__v4si)(__m128i)(B), \ - (__v4si)(__m128i)(C),(int)(D), \ - (__v4si)(__m128i)_mm_setzero_si128 (), (__mmask8)(A)) -#define _mm_shrdi_epi64(A, B, C) \ - ((__m128i) 
__builtin_ia32_vpshrd_v2di ((__v2di)(__m128i)(A), \ - (__v2di)(__m128i)(B),(int)(C)) -#define _mm_mask_shrdi_epi64(A, B, C, D, E) \ - ((__m128i) __builtin_ia32_vpshrd_v2di_mask ((__v2di)(__m128i)(C), \ - (__v2di)(__m128i)(D), (int)(E), (__v2di)(__m128i)(A),(__mmask8)(B)) -#define _mm_maskz_shrdi_epi64(A, B, C, D) \ - ((__m128i) __builtin_ia32_vpshrd_v2di_mask ((__v2di)(__m128i)(B), \ - (__v2di)(__m128i)(C),(int)(D), \ - (__v2di)(__m128i)_mm_setzero_si128 (), (__mmask8)(A)) -#define _mm256_shldi_epi16(A, B, C) \ - ((__m256i) __builtin_ia32_vpshld_v16hi ((__v16hi)(__m256i)(A), \ - (__v16hi)(__m256i)(B),(int)(C)) -#define _mm256_mask_shldi_epi16(A, B, C, D, E) \ - ((__m256i) __builtin_ia32_vpshld_v16hi_mask ((__v16hi)(__m256i)(C), \ - (__v16hi)(__m256i)(D), (int)(E), (__v16hi)(__m256i)(A),(__mmask16)(B)) -#define _mm256_maskz_shldi_epi16(A, B, C, D) \ - ((__m256i) __builtin_ia32_vpshld_v16hi_mask ((__v16hi)(__m256i)(B), \ - (__v16hi)(__m256i)(C),(int)(D), \ - (__v16hi)(__m256i)_mm256_setzero_si256 (), (__mmask16)(A)) -#define _mm256_shldi_epi32(A, B, C) \ - ((__m256i) __builtin_ia32_vpshld_v8si ((__v8si)(__m256i)(A), \ - (__v8si)(__m256i)(B),(int)(C)) -#define _mm256_mask_shldi_epi32(A, B, C, D, E) \ - ((__m256i) __builtin_ia32_vpshld_v8si_mask ((__v8si)(__m256i)(C), \ - (__v8si)(__m256i)(D), (int)(E), (__v8si)(__m256i)(A),(__mmask8)(B)) -#define _mm256_maskz_shldi_epi32(A, B, C, D) \ - ((__m256i) __builtin_ia32_vpshld_v8si_mask ((__v8si)(__m256i)(B), \ - (__v8si)(__m256i)(C),(int)(D), \ - (__v8si)(__m256i)_mm256_setzero_si256 (), (__mmask8)(A)) -#define _mm256_shldi_epi64(A, B, C) \ - ((__m256i) __builtin_ia32_vpshld_v4di ((__v4di)(__m256i)(A), \ - (__v4di)(__m256i)(B),(int)(C)) -#define _mm256_mask_shldi_epi64(A, B, C, D, E) \ - ((__m256i) __builtin_ia32_vpshld_v4di_mask ((__v4di)(__m256i)(C), \ - (__v4di)(__m256i)(D), (int)(E), (__v4di)(__m256i)(A),(__mmask8)(B)) -#define _mm256_maskz_shldi_epi64(A, B, C, D) \ - ((__m256i) __builtin_ia32_vpshld_v4di_mask ((__v4di)(__m256i)(B), \ - (__v4di)(__m256i)(C),(int)(D), \ - (__v4di)(__m256i)_mm256_setzero_si256 (), (__mmask8)(A)) -#define _mm_shldi_epi16(A, B, C) \ - ((__m128i) __builtin_ia32_vpshld_v8hi ((__v8hi)(__m128i)(A), \ - (__v8hi)(__m128i)(B),(int)(C)) -#define _mm_mask_shldi_epi16(A, B, C, D, E) \ - ((__m128i) __builtin_ia32_vpshld_v8hi_mask ((__v8hi)(__m128i)(C), \ - (__v8hi)(__m128i)(D), (int)(E), (__v8hi)(__m128i)(A),(__mmask8)(B)) -#define _mm_maskz_shldi_epi16(A, B, C, D) \ - ((__m128i) __builtin_ia32_vpshld_v8hi_mask ((__v8hi)(__m128i)(B), \ - (__v8hi)(__m128i)(C),(int)(D), \ - (__v8hi)(__m128i)_mm_setzero_si128 (), (__mmask8)(A)) -#define _mm_shldi_epi32(A, B, C) \ - ((__m128i) __builtin_ia32_vpshld_v4si ((__v4si)(__m128i)(A), \ - (__v4si)(__m128i)(B),(int)(C)) -#define _mm_mask_shldi_epi32(A, B, C, D, E) \ - ((__m128i) __builtin_ia32_vpshld_v4si_mask ((__v4si)(__m128i)(C), \ - (__v4si)(__m128i)(D), (int)(E), (__v4si)(__m128i)(A),(__mmask8)(B)) -#define _mm_maskz_shldi_epi32(A, B, C, D) \ - ((__m128i) __builtin_ia32_vpshld_v4si_mask ((__v4si)(__m128i)(B), \ - (__v4si)(__m128i)(C),(int)(D), \ - (__v4si)(__m128i)_mm_setzero_si128 (), (__mmask8)(A)) -#define _mm_shldi_epi64(A, B, C) \ - ((__m128i) __builtin_ia32_vpshld_v2di ((__v2di)(__m128i)(A), \ - (__v2di)(__m128i)(B),(int)(C)) -#define _mm_mask_shldi_epi64(A, B, C, D, E) \ - ((__m128i) __builtin_ia32_vpshld_v2di_mask ((__v2di)(__m128i)(C), \ - (__v2di)(__m128i)(D), (int)(E), (__v2di)(__m128i)(A),(__mmask8)(B)) -#define _mm_maskz_shldi_epi64(A, B, C, D) \ - ((__m128i) 
__builtin_ia32_vpshld_v2di_mask ((__v2di)(__m128i)(B), \ - (__v2di)(__m128i)(C),(int)(D), \ - (__v2di)(__m128i)_mm_setzero_si128 (), (__mmask8)(A)) +#define _mm256_shrdi_epi16(A, B, C) ((__m256i) __builtin_ia32_vpshrd_v16hi ((__v16hi)(__m256i)(A), (__v16hi)(__m256i)(B),(int)(C))) +#define _mm256_mask_shrdi_epi16(A, B, C, D, E) ((__m256i) __builtin_ia32_vpshrd_v16hi_mask ((__v16hi)(__m256i)(C), (__v16hi)(__m256i)(D), (int)(E), (__v16hi)(__m256i)(A), (__mmask16)(B))) +#define _mm256_maskz_shrdi_epi16(A, B, C, D) ((__m256i) __builtin_ia32_vpshrd_v16hi_mask ((__v16hi)(__m256i)(B), (__v16hi)(__m256i)(C),(int)(D), (__v16hi)(__m256i)_mm256_setzero_si256 (), (__mmask16)(A))) +#define _mm256_shrdi_epi32(A, B, C) ((__m256i) __builtin_ia32_vpshrd_v8si ((__v8si)(__m256i)(A), (__v8si)(__m256i)(B),(int)(C))) +#define _mm256_mask_shrdi_epi32(A, B, C, D, E) ((__m256i) __builtin_ia32_vpshrd_v8si_mask ((__v8si)(__m256i)(C), (__v8si)(__m256i)(D), (int)(E), (__v8si)(__m256i)(A), (__mmask8)(B))) +#define _mm256_maskz_shrdi_epi32(A, B, C, D) ((__m256i) __builtin_ia32_vpshrd_v8si_mask ((__v8si)(__m256i)(B), (__v8si)(__m256i)(C),(int)(D), (__v8si)(__m256i)_mm256_setzero_si256 (), (__mmask8)(A))) +#define _mm256_shrdi_epi64(A, B, C) ((__m256i) __builtin_ia32_vpshrd_v4di ((__v4di)(__m256i)(A), (__v4di)(__m256i)(B),(int)(C))) +#define _mm256_mask_shrdi_epi64(A, B, C, D, E) ((__m256i) __builtin_ia32_vpshrd_v4di_mask ((__v4di)(__m256i)(C), (__v4di)(__m256i)(D), (int)(E), (__v4di)(__m256i)(A), (__mmask8)(B))) +#define _mm256_maskz_shrdi_epi64(A, B, C, D) ((__m256i) __builtin_ia32_vpshrd_v4di_mask ((__v4di)(__m256i)(B), (__v4di)(__m256i)(C),(int)(D), (__v4di)(__m256i)_mm256_setzero_si256 (), (__mmask8)(A))) +#define _mm_shrdi_epi16(A, B, C) ((__m128i) __builtin_ia32_vpshrd_v8hi ((__v8hi)(__m128i)(A), (__v8hi)(__m128i)(B),(int)(C))) +#define _mm_mask_shrdi_epi16(A, B, C, D, E) ((__m128i) __builtin_ia32_vpshrd_v8hi_mask ((__v8hi)(__m128i)(C), (__v8hi)(__m128i)(D), (int)(E), (__v8hi)(__m128i)(A), (__mmask8)(B))) +#define _mm_maskz_shrdi_epi16(A, B, C, D) ((__m128i) __builtin_ia32_vpshrd_v8hi_mask ((__v8hi)(__m128i)(B), (__v8hi)(__m128i)(C),(int)(D), (__v8hi)(__m128i)_mm_setzero_si128 (), (__mmask8)(A))) +#define _mm_shrdi_epi32(A, B, C) ((__m128i) __builtin_ia32_vpshrd_v4si ((__v4si)(__m128i)(A), (__v4si)(__m128i)(B),(int)(C))) +#define _mm_mask_shrdi_epi32(A, B, C, D, E) ((__m128i) __builtin_ia32_vpshrd_v4si_mask ((__v4si)(__m128i)(C), (__v4si)(__m128i)(D), (int)(E), (__v4si)(__m128i)(A), (__mmask8)(B))) +#define _mm_maskz_shrdi_epi32(A, B, C, D) ((__m128i) __builtin_ia32_vpshrd_v4si_mask ((__v4si)(__m128i)(B), (__v4si)(__m128i)(C),(int)(D), (__v4si)(__m128i)_mm_setzero_si128 (), (__mmask8)(A))) +#define _mm_shrdi_epi64(A, B, C) ((__m128i) __builtin_ia32_vpshrd_v2di ((__v2di)(__m128i)(A), (__v2di)(__m128i)(B),(int)(C))) +#define _mm_mask_shrdi_epi64(A, B, C, D, E) ((__m128i) __builtin_ia32_vpshrd_v2di_mask ((__v2di)(__m128i)(C), (__v2di)(__m128i)(D), (int)(E), (__v2di)(__m128i)(A), (__mmask8)(B))) +#define _mm_maskz_shrdi_epi64(A, B, C, D) ((__m128i) __builtin_ia32_vpshrd_v2di_mask ((__v2di)(__m128i)(B), (__v2di)(__m128i)(C),(int)(D), (__v2di)(__m128i)_mm_setzero_si128 (), (__mmask8)(A))) +#define _mm256_shldi_epi16(A, B, C) ((__m256i) __builtin_ia32_vpshld_v16hi ((__v16hi)(__m256i)(A), (__v16hi)(__m256i)(B),(int)(C))) +#define _mm256_mask_shldi_epi16(A, B, C, D, E) ((__m256i) __builtin_ia32_vpshld_v16hi_mask ((__v16hi)(__m256i)(C), (__v16hi)(__m256i)(D), (int)(E), (__v16hi)(__m256i)(A), (__mmask16)(B))) +#define 
_mm256_maskz_shldi_epi16(A, B, C, D) ((__m256i) __builtin_ia32_vpshld_v16hi_mask ((__v16hi)(__m256i)(B), (__v16hi)(__m256i)(C),(int)(D), (__v16hi)(__m256i)_mm256_setzero_si256 (), (__mmask16)(A))) +#define _mm256_shldi_epi32(A, B, C) ((__m256i) __builtin_ia32_vpshld_v8si ((__v8si)(__m256i)(A), (__v8si)(__m256i)(B),(int)(C))) +#define _mm256_mask_shldi_epi32(A, B, C, D, E) ((__m256i) __builtin_ia32_vpshld_v8si_mask ((__v8si)(__m256i)(C), (__v8si)(__m256i)(D), (int)(E), (__v8si)(__m256i)(A), (__mmask8)(B))) +#define _mm256_maskz_shldi_epi32(A, B, C, D) ((__m256i) __builtin_ia32_vpshld_v8si_mask ((__v8si)(__m256i)(B), (__v8si)(__m256i)(C),(int)(D), (__v8si)(__m256i)_mm256_setzero_si256 (), (__mmask8)(A))) +#define _mm256_shldi_epi64(A, B, C) ((__m256i) __builtin_ia32_vpshld_v4di ((__v4di)(__m256i)(A), (__v4di)(__m256i)(B),(int)(C))) +#define _mm256_mask_shldi_epi64(A, B, C, D, E) ((__m256i) __builtin_ia32_vpshld_v4di_mask ((__v4di)(__m256i)(C), (__v4di)(__m256i)(D), (int)(E), (__v4di)(__m256i)(A), (__mmask8)(B))) +#define _mm256_maskz_shldi_epi64(A, B, C, D) ((__m256i) __builtin_ia32_vpshld_v4di_mask ((__v4di)(__m256i)(B), (__v4di)(__m256i)(C),(int)(D), (__v4di)(__m256i)_mm256_setzero_si256 (), (__mmask8)(A))) +#define _mm_shldi_epi16(A, B, C) ((__m128i) __builtin_ia32_vpshld_v8hi ((__v8hi)(__m128i)(A), (__v8hi)(__m128i)(B),(int)(C))) +#define _mm_mask_shldi_epi16(A, B, C, D, E) ((__m128i) __builtin_ia32_vpshld_v8hi_mask ((__v8hi)(__m128i)(C), (__v8hi)(__m128i)(D), (int)(E), (__v8hi)(__m128i)(A), (__mmask8)(B))) +#define _mm_maskz_shldi_epi16(A, B, C, D) ((__m128i) __builtin_ia32_vpshld_v8hi_mask ((__v8hi)(__m128i)(B), (__v8hi)(__m128i)(C),(int)(D), (__v8hi)(__m128i)_mm_setzero_si128 (), (__mmask8)(A))) +#define _mm_shldi_epi32(A, B, C) ((__m128i) __builtin_ia32_vpshld_v4si ((__v4si)(__m128i)(A), (__v4si)(__m128i)(B),(int)(C))) +#define _mm_mask_shldi_epi32(A, B, C, D, E) ((__m128i) __builtin_ia32_vpshld_v4si_mask ((__v4si)(__m128i)(C), (__v4si)(__m128i)(D), (int)(E), (__v4si)(__m128i)(A), (__mmask8)(B))) +#define _mm_maskz_shldi_epi32(A, B, C, D) ((__m128i) __builtin_ia32_vpshld_v4si_mask ((__v4si)(__m128i)(B), (__v4si)(__m128i)(C),(int)(D), (__v4si)(__m128i)_mm_setzero_si128 (), (__mmask8)(A))) +#define _mm_shldi_epi64(A, B, C) ((__m128i) __builtin_ia32_vpshld_v2di ((__v2di)(__m128i)(A), (__v2di)(__m128i)(B),(int)(C))) +#define _mm_mask_shldi_epi64(A, B, C, D, E) ((__m128i) __builtin_ia32_vpshld_v2di_mask ((__v2di)(__m128i)(C), (__v2di)(__m128i)(D), (int)(E), (__v2di)(__m128i)(A), (__mmask8)(B))) +#define _mm_maskz_shldi_epi64(A, B, C, D) ((__m128i) __builtin_ia32_vpshld_v2di_mask ((__v2di)(__m128i)(B), (__v2di)(__m128i)(C),(int)(D), (__v2di)(__m128i)_mm_setzero_si128 (), (__mmask8)(A))) #endif - -__funline __m256i _mm256_shrdv_epi16(__m256i __A, __m256i __B, __m256i __C) { - return (__m256i)__builtin_ia32_vpshrdv_v16hi((__v16hi)__A, (__v16hi)__B, - (__v16hi)__C); +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_shrdv_epi16 (__m256i __A, __m256i __B, __m256i __C) +{ + return (__m256i) __builtin_ia32_vpshrdv_v16hi ((__v16hi)__A, (__v16hi) __B, + (__v16hi) __C); } - -__funline __m256i _mm256_mask_shrdv_epi16(__m256i __A, __mmask16 __B, __m256i __C, - __m256i __D) { - return (__m256i)__builtin_ia32_vpshrdv_v16hi_mask( - (__v16hi)__A, (__v16hi)__C, (__v16hi)__D, (__mmask16)__B); +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_shrdv_epi16 (__m256i __A, __mmask16 __B, __m256i __C, __m256i 
__D) +{ + return (__m256i)__builtin_ia32_vpshrdv_v16hi_mask ((__v16hi)__A, + (__v16hi) __C, (__v16hi) __D, (__mmask16)__B); } - -__funline __m256i _mm256_maskz_shrdv_epi16(__mmask16 __A, __m256i __B, - __m256i __C, __m256i __D) { - return (__m256i)__builtin_ia32_vpshrdv_v16hi_maskz( - (__v16hi)__B, (__v16hi)__C, (__v16hi)__D, (__mmask16)__A); +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_shrdv_epi16 (__mmask16 __A, __m256i __B, __m256i __C, __m256i __D) +{ + return (__m256i)__builtin_ia32_vpshrdv_v16hi_maskz ((__v16hi)__B, + (__v16hi) __C, (__v16hi) __D, (__mmask16)__A); } - -__funline __m256i _mm256_shrdv_epi32(__m256i __A, __m256i __B, __m256i __C) { - return (__m256i)__builtin_ia32_vpshrdv_v8si((__v8si)__A, (__v8si)__B, - (__v8si)__C); +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_shrdv_epi32 (__m256i __A, __m256i __B, __m256i __C) +{ + return (__m256i) __builtin_ia32_vpshrdv_v8si ((__v8si)__A, (__v8si) __B, + (__v8si) __C); } - -__funline __m256i _mm256_mask_shrdv_epi32(__m256i __A, __mmask8 __B, __m256i __C, - __m256i __D) { - return (__m256i)__builtin_ia32_vpshrdv_v8si_mask((__v8si)__A, (__v8si)__C, - (__v8si)__D, (__mmask8)__B); +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_shrdv_epi32 (__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) +{ + return (__m256i)__builtin_ia32_vpshrdv_v8si_mask ((__v8si)__A, (__v8si) __C, + (__v8si) __D, (__mmask8)__B); } - -__funline __m256i _mm256_maskz_shrdv_epi32(__mmask8 __A, __m256i __B, __m256i __C, - __m256i __D) { - return (__m256i)__builtin_ia32_vpshrdv_v8si_maskz((__v8si)__B, (__v8si)__C, - (__v8si)__D, (__mmask8)__A); +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_shrdv_epi32 (__mmask8 __A, __m256i __B, __m256i __C, __m256i __D) +{ + return (__m256i)__builtin_ia32_vpshrdv_v8si_maskz ((__v8si)__B, (__v8si) __C, + (__v8si) __D, (__mmask8)__A); } - -__funline __m256i _mm256_shrdv_epi64(__m256i __A, __m256i __B, __m256i __C) { - return (__m256i)__builtin_ia32_vpshrdv_v4di((__v4di)__A, (__v4di)__B, - (__v4di)__C); +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_shrdv_epi64 (__m256i __A, __m256i __B, __m256i __C) +{ + return (__m256i) __builtin_ia32_vpshrdv_v4di ((__v4di)__A, (__v4di) __B, + (__v4di) __C); } - -__funline __m256i _mm256_mask_shrdv_epi64(__m256i __A, __mmask8 __B, __m256i __C, - __m256i __D) { - return (__m256i)__builtin_ia32_vpshrdv_v4di_mask((__v4di)__A, (__v4di)__C, - (__v4di)__D, (__mmask8)__B); +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_shrdv_epi64 (__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) +{ + return (__m256i)__builtin_ia32_vpshrdv_v4di_mask ((__v4di)__A, (__v4di) __C, + (__v4di) __D, (__mmask8)__B); } - -__funline __m256i _mm256_maskz_shrdv_epi64(__mmask8 __A, __m256i __B, __m256i __C, - __m256i __D) { - return (__m256i)__builtin_ia32_vpshrdv_v4di_maskz((__v4di)__B, (__v4di)__C, - (__v4di)__D, (__mmask8)__A); +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_shrdv_epi64 (__mmask8 __A, __m256i __B, __m256i __C, __m256i __D) +{ + return (__m256i)__builtin_ia32_vpshrdv_v4di_maskz ((__v4di)__B, (__v4di) __C, + (__v4di) __D, (__mmask8)__A); } - -__funline __m128i _mm_shrdv_epi16(__m128i __A, __m128i __B, __m128i __C) 
{ - return (__m128i)__builtin_ia32_vpshrdv_v8hi((__v8hi)__A, (__v8hi)__B, - (__v8hi)__C); +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shrdv_epi16 (__m128i __A, __m128i __B, __m128i __C) +{ + return (__m128i) __builtin_ia32_vpshrdv_v8hi ((__v8hi)__A, (__v8hi) __B, + (__v8hi) __C); } - -__funline __m128i _mm_mask_shrdv_epi16(__m128i __A, __mmask8 __B, __m128i __C, - __m128i __D) { - return (__m128i)__builtin_ia32_vpshrdv_v8hi_mask((__v8hi)__A, (__v8hi)__C, - (__v8hi)__D, (__mmask8)__B); +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_shrdv_epi16 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) +{ + return (__m128i)__builtin_ia32_vpshrdv_v8hi_mask ((__v8hi)__A, (__v8hi) __C, + (__v8hi) __D, (__mmask8)__B); } - -__funline __m128i _mm_maskz_shrdv_epi16(__mmask8 __A, __m128i __B, __m128i __C, - __m128i __D) { - return (__m128i)__builtin_ia32_vpshrdv_v8hi_maskz((__v8hi)__B, (__v8hi)__C, - (__v8hi)__D, (__mmask8)__A); +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_shrdv_epi16 (__mmask8 __A, __m128i __B, __m128i __C, __m128i __D) +{ + return (__m128i)__builtin_ia32_vpshrdv_v8hi_maskz ((__v8hi)__B, (__v8hi) __C, + (__v8hi) __D, (__mmask8)__A); } - -__funline __m128i _mm_shrdv_epi32(__m128i __A, __m128i __B, __m128i __C) { - return (__m128i)__builtin_ia32_vpshrdv_v4si((__v4si)__A, (__v4si)__B, - (__v4si)__C); +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shrdv_epi32 (__m128i __A, __m128i __B, __m128i __C) +{ + return (__m128i) __builtin_ia32_vpshrdv_v4si ((__v4si)__A, (__v4si) __B, + (__v4si) __C); } - -__funline __m128i _mm_mask_shrdv_epi32(__m128i __A, __mmask8 __B, __m128i __C, - __m128i __D) { - return (__m128i)__builtin_ia32_vpshrdv_v4si_mask((__v4si)__A, (__v4si)__C, - (__v4si)__D, (__mmask8)__B); +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_shrdv_epi32 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) +{ + return (__m128i)__builtin_ia32_vpshrdv_v4si_mask ((__v4si)__A, (__v4si) __C, + (__v4si) __D, (__mmask8)__B); } - -__funline __m128i _mm_maskz_shrdv_epi32(__mmask8 __A, __m128i __B, __m128i __C, - __m128i __D) { - return (__m128i)__builtin_ia32_vpshrdv_v4si_maskz((__v4si)__B, (__v4si)__C, - (__v4si)__D, (__mmask8)__A); +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_shrdv_epi32 (__mmask8 __A, __m128i __B, __m128i __C, __m128i __D) +{ + return (__m128i)__builtin_ia32_vpshrdv_v4si_maskz ((__v4si)__B, (__v4si) __C, + (__v4si) __D, (__mmask8)__A); } - -__funline __m128i _mm_shrdv_epi64(__m128i __A, __m128i __B, __m128i __C) { - return (__m128i)__builtin_ia32_vpshrdv_v2di((__v2di)__A, (__v2di)__B, - (__v2di)__C); +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shrdv_epi64 (__m128i __A, __m128i __B, __m128i __C) +{ + return (__m128i) __builtin_ia32_vpshrdv_v2di ((__v2di)__A, (__v2di) __B, + (__v2di) __C); } - -__funline __m128i _mm_mask_shrdv_epi64(__m128i __A, __mmask8 __B, __m128i __C, - __m128i __D) { - return (__m128i)__builtin_ia32_vpshrdv_v2di_mask((__v2di)__A, (__v2di)__C, - (__v2di)__D, (__mmask8)__B); +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_shrdv_epi64 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) +{ + return 
(__m128i)__builtin_ia32_vpshrdv_v2di_mask ((__v2di)__A, (__v2di) __C, + (__v2di) __D, (__mmask8)__B); } - -__funline __m128i _mm_maskz_shrdv_epi64(__mmask8 __A, __m128i __B, __m128i __C, - __m128i __D) { - return (__m128i)__builtin_ia32_vpshrdv_v2di_maskz((__v2di)__B, (__v2di)__C, - (__v2di)__D, (__mmask8)__A); +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_shrdv_epi64 (__mmask8 __A, __m128i __B, __m128i __C, __m128i __D) +{ + return (__m128i)__builtin_ia32_vpshrdv_v2di_maskz ((__v2di)__B, (__v2di) __C, + (__v2di) __D, (__mmask8)__A); } - -__funline __m256i _mm256_shldv_epi16(__m256i __A, __m256i __B, __m256i __C) { - return (__m256i)__builtin_ia32_vpshldv_v16hi((__v16hi)__A, (__v16hi)__B, - (__v16hi)__C); +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_shldv_epi16 (__m256i __A, __m256i __B, __m256i __C) +{ + return (__m256i) __builtin_ia32_vpshldv_v16hi ((__v16hi)__A, (__v16hi) __B, + (__v16hi) __C); } - -__funline __m256i _mm256_mask_shldv_epi16(__m256i __A, __mmask16 __B, __m256i __C, - __m256i __D) { - return (__m256i)__builtin_ia32_vpshldv_v16hi_mask( - (__v16hi)__A, (__v16hi)__C, (__v16hi)__D, (__mmask16)__B); +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_shldv_epi16 (__m256i __A, __mmask16 __B, __m256i __C, __m256i __D) +{ + return (__m256i)__builtin_ia32_vpshldv_v16hi_mask ((__v16hi)__A, + (__v16hi) __C, (__v16hi) __D, (__mmask16)__B); } - -__funline __m256i _mm256_maskz_shldv_epi16(__mmask16 __A, __m256i __B, - __m256i __C, __m256i __D) { - return (__m256i)__builtin_ia32_vpshldv_v16hi_maskz( - (__v16hi)__B, (__v16hi)__C, (__v16hi)__D, (__mmask16)__A); +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_shldv_epi16 (__mmask16 __A, __m256i __B, __m256i __C, __m256i __D) +{ + return (__m256i)__builtin_ia32_vpshldv_v16hi_maskz ((__v16hi)__B, + (__v16hi) __C, (__v16hi) __D, (__mmask16)__A); } - -__funline __m256i _mm256_shldv_epi32(__m256i __A, __m256i __B, __m256i __C) { - return (__m256i)__builtin_ia32_vpshldv_v8si((__v8si)__A, (__v8si)__B, - (__v8si)__C); +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_shldv_epi32 (__m256i __A, __m256i __B, __m256i __C) +{ + return (__m256i) __builtin_ia32_vpshldv_v8si ((__v8si)__A, (__v8si) __B, + (__v8si) __C); } - -__funline __m256i _mm256_mask_shldv_epi32(__m256i __A, __mmask8 __B, __m256i __C, - __m256i __D) { - return (__m256i)__builtin_ia32_vpshldv_v8si_mask((__v8si)__A, (__v8si)__C, - (__v8si)__D, (__mmask8)__B); +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_shldv_epi32 (__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) +{ + return (__m256i)__builtin_ia32_vpshldv_v8si_mask ((__v8si)__A, (__v8si) __C, + (__v8si) __D, (__mmask8)__B) ; } - -__funline __m256i _mm256_maskz_shldv_epi32(__mmask8 __A, __m256i __B, __m256i __C, - __m256i __D) { - return (__m256i)__builtin_ia32_vpshldv_v8si_maskz((__v8si)__B, (__v8si)__C, - (__v8si)__D, (__mmask8)__A); +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_shldv_epi32 (__mmask8 __A, __m256i __B, __m256i __C, __m256i __D) +{ + return (__m256i)__builtin_ia32_vpshldv_v8si_maskz ((__v8si)__B, (__v8si) __C, + (__v8si) __D, (__mmask8)__A); } - -__funline __m256i _mm256_shldv_epi64(__m256i __A, __m256i __B, __m256i __C) { 
- return (__m256i)__builtin_ia32_vpshldv_v4di((__v4di)__A, (__v4di)__B, - (__v4di)__C); +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_shldv_epi64 (__m256i __A, __m256i __B, __m256i __C) +{ + return (__m256i) __builtin_ia32_vpshldv_v4di ((__v4di)__A, (__v4di) __B, + (__v4di) __C); } - -__funline __m256i _mm256_mask_shldv_epi64(__m256i __A, __mmask8 __B, __m256i __C, - __m256i __D) { - return (__m256i)__builtin_ia32_vpshldv_v4di_mask((__v4di)__A, (__v4di)__C, - (__v4di)__D, (__mmask8)__B); +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_shldv_epi64 (__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) +{ + return (__m256i)__builtin_ia32_vpshldv_v4di_mask ((__v4di)__A, (__v4di) __C, + (__v4di) __D, (__mmask8)__B); } - -__funline __m256i _mm256_maskz_shldv_epi64(__mmask8 __A, __m256i __B, __m256i __C, - __m256i __D) { - return (__m256i)__builtin_ia32_vpshldv_v4di_maskz((__v4di)__B, (__v4di)__C, - (__v4di)__D, (__mmask8)__A); +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_shldv_epi64 (__mmask8 __A, __m256i __B, __m256i __C, __m256i __D) +{ + return (__m256i)__builtin_ia32_vpshldv_v4di_maskz ((__v4di)__B, (__v4di) __C, + (__v4di) __D, (__mmask8)__A); } - -__funline __m128i _mm_shldv_epi16(__m128i __A, __m128i __B, __m128i __C) { - return (__m128i)__builtin_ia32_vpshldv_v8hi((__v8hi)__A, (__v8hi)__B, - (__v8hi)__C); +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shldv_epi16 (__m128i __A, __m128i __B, __m128i __C) +{ + return (__m128i) __builtin_ia32_vpshldv_v8hi ((__v8hi)__A, (__v8hi) __B, + (__v8hi) __C); } - -__funline __m128i _mm_mask_shldv_epi16(__m128i __A, __mmask8 __B, __m128i __C, - __m128i __D) { - return (__m128i)__builtin_ia32_vpshldv_v8hi_mask((__v8hi)__A, (__v8hi)__C, - (__v8hi)__D, (__mmask8)__B); +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_shldv_epi16 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) +{ + return (__m128i)__builtin_ia32_vpshldv_v8hi_mask ((__v8hi)__A, (__v8hi) __C, + (__v8hi) __D, (__mmask8)__B); } - -__funline __m128i _mm_maskz_shldv_epi16(__mmask8 __A, __m128i __B, __m128i __C, - __m128i __D) { - return (__m128i)__builtin_ia32_vpshldv_v8hi_maskz((__v8hi)__B, (__v8hi)__C, - (__v8hi)__D, (__mmask8)__A); +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_shldv_epi16 (__mmask8 __A, __m128i __B, __m128i __C, __m128i __D) +{ + return (__m128i)__builtin_ia32_vpshldv_v8hi_maskz ((__v8hi)__B, (__v8hi) __C, + (__v8hi) __D, (__mmask8)__A); } - -__funline __m128i _mm_shldv_epi32(__m128i __A, __m128i __B, __m128i __C) { - return (__m128i)__builtin_ia32_vpshldv_v4si((__v4si)__A, (__v4si)__B, - (__v4si)__C); +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shldv_epi32 (__m128i __A, __m128i __B, __m128i __C) +{ + return (__m128i) __builtin_ia32_vpshldv_v4si ((__v4si)__A, (__v4si) __B, + (__v4si) __C); } - -__funline __m128i _mm_mask_shldv_epi32(__m128i __A, __mmask8 __B, __m128i __C, - __m128i __D) { - return (__m128i)__builtin_ia32_vpshldv_v4si_mask((__v4si)__A, (__v4si)__C, - (__v4si)__D, (__mmask8)__B); +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_shldv_epi32 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) +{ + return 
(__m128i)__builtin_ia32_vpshldv_v4si_mask ((__v4si)__A, (__v4si) __C, + (__v4si) __D, (__mmask8)__B); } - -__funline __m128i _mm_maskz_shldv_epi32(__mmask8 __A, __m128i __B, __m128i __C, - __m128i __D) { - return (__m128i)__builtin_ia32_vpshldv_v4si_maskz((__v4si)__B, (__v4si)__C, - (__v4si)__D, (__mmask8)__A); +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_shldv_epi32 (__mmask8 __A, __m128i __B, __m128i __C, __m128i __D) +{ + return (__m128i)__builtin_ia32_vpshldv_v4si_maskz ((__v4si)__B, (__v4si) __C, + (__v4si) __D, (__mmask8)__A); } - -__funline __m128i _mm_shldv_epi64(__m128i __A, __m128i __B, __m128i __C) { - return (__m128i)__builtin_ia32_vpshldv_v2di((__v2di)__A, (__v2di)__B, - (__v2di)__C); +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shldv_epi64 (__m128i __A, __m128i __B, __m128i __C) +{ + return (__m128i) __builtin_ia32_vpshldv_v2di ((__v2di)__A, (__v2di) __B, + (__v2di) __C); } - -__funline __m128i _mm_mask_shldv_epi64(__m128i __A, __mmask8 __B, __m128i __C, - __m128i __D) { - return (__m128i)__builtin_ia32_vpshldv_v2di_mask((__v2di)__A, (__v2di)__C, - (__v2di)__D, (__mmask8)__B); +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_shldv_epi64 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) +{ + return (__m128i)__builtin_ia32_vpshldv_v2di_mask ((__v2di)__A, (__v2di) __C, + (__v2di) __D, (__mmask8)__B); } - -__funline __m128i _mm_maskz_shldv_epi64(__mmask8 __A, __m128i __B, __m128i __C, - __m128i __D) { - return (__m128i)__builtin_ia32_vpshldv_v2di_maskz((__v2di)__B, (__v2di)__C, - (__v2di)__D, (__mmask8)__A); +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_shldv_epi64 (__mmask8 __A, __m128i __B, __m128i __C, __m128i __D) +{ + return (__m128i)__builtin_ia32_vpshldv_v2di_maskz ((__v2di)__B, (__v2di) __C, + (__v2di) __D, (__mmask8)__A); } - #ifdef __DISABLE_AVX512VBMI2VL__ #undef __DISABLE_AVX512VBMI2VL__ #pragma GCC pop_options -#endif /* __DISABLE_AVX512VBMIVL__ */ - -#if !defined(__AVX512VL__) || !defined(__AVX512VBMI2__) || \ - !defined(__AVX512BW__) +#endif +#if !defined(__AVX512VL__) || !defined(__AVX512VBMI2__) || !defined(__AVX512BW__) #pragma GCC push_options #pragma GCC target("avx512vbmi2,avx512vl,avx512bw") #define __DISABLE_AVX512VBMI2VLBW__ -#endif /* __AVX512VBMIVLBW__ */ - -__funline __m256i _mm256_mask_compress_epi8(__m256i __A, __mmask32 __B, - __m256i __C) { - return (__m256i)__builtin_ia32_compressqi256_mask((__v32qi)__C, (__v32qi)__A, - (__mmask32)__B); +#endif +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_compress_epi8 (__m256i __A, __mmask32 __B, __m256i __C) +{ + return (__m256i) __builtin_ia32_compressqi256_mask ((__v32qi)__C, + (__v32qi)__A, (__mmask32)__B); } - -__funline __m256i _mm256_maskz_compress_epi8(__mmask32 __A, __m256i __B) { - return (__m256i)__builtin_ia32_compressqi256_mask( - (__v32qi)__B, (__v32qi)_mm256_setzero_si256(), (__mmask32)__A); +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_compress_epi8 (__mmask32 __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_compressqi256_mask ((__v32qi) __B, + (__v32qi) _mm256_setzero_si256 (), (__mmask32) __A); } - -__funline void _mm256_mask_compressstoreu_epi8(void *__A, __mmask32 __B, - __m256i __C) { - __builtin_ia32_compressstoreuqi256_mask((__v32qi *)__A, 
(__v32qi)__C, - (__mmask32)__B); +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_compressstoreu_epi8 (void * __A, __mmask32 __B, __m256i __C) +{ + __builtin_ia32_compressstoreuqi256_mask ((__v32qi *) __A, (__v32qi) __C, + (__mmask32) __B); } - -__funline __m256i _mm256_mask_expand_epi8(__m256i __A, __mmask32 __B, - __m256i __C) { - return (__m256i)__builtin_ia32_expandqi256_mask((__v32qi)__C, (__v32qi)__A, - (__mmask32)__B); +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_expand_epi8 (__m256i __A, __mmask32 __B, __m256i __C) +{ + return (__m256i) __builtin_ia32_expandqi256_mask ((__v32qi) __C, + (__v32qi) __A, + (__mmask32) __B); } - -__funline __m256i _mm256_maskz_expand_epi8(__mmask32 __A, __m256i __B) { - return (__m256i)__builtin_ia32_expandqi256_maskz( - (__v32qi)__B, (__v32qi)_mm256_setzero_si256(), (__mmask32)__A); +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_expand_epi8 (__mmask32 __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_expandqi256_maskz ((__v32qi) __B, + (__v32qi) _mm256_setzero_si256 (), (__mmask32) __A); } - -__funline __m256i _mm256_mask_expandloadu_epi8(__m256i __A, __mmask32 __B, - const void *__C) { - return (__m256i)__builtin_ia32_expandloadqi256_mask( - (const __v32qi *)__C, (__v32qi)__A, (__mmask32)__B); +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_expandloadu_epi8 (__m256i __A, __mmask32 __B, const void * __C) +{ + return (__m256i) __builtin_ia32_expandloadqi256_mask ((const __v32qi *) __C, + (__v32qi) __A, (__mmask32) __B); } - -__funline __m256i _mm256_maskz_expandloadu_epi8(__mmask32 __A, const void *__B) { - return (__m256i)__builtin_ia32_expandloadqi256_maskz( - (const __v32qi *)__B, (__v32qi)_mm256_setzero_si256(), (__mmask32)__A); +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_expandloadu_epi8 (__mmask32 __A, const void * __B) +{ + return (__m256i) __builtin_ia32_expandloadqi256_maskz ((const __v32qi *) __B, + (__v32qi) _mm256_setzero_si256 (), (__mmask32) __A); } - #ifdef __DISABLE_AVX512VBMI2VLBW__ #undef __DISABLE_AVX512VBMI2VLBW__ #pragma GCC pop_options -#endif /* __DISABLE_AVX512VBMIVLBW__ */ - -#endif /* _AVX512VBMIVLINTRIN_H_INCLUDED */ +#endif +#endif +#endif diff --git a/third_party/intel/avx512vbmiintrin.internal.h b/third_party/intel/avx512vbmiintrin.internal.h index dad021826..cf5756939 100644 --- a/third_party/intel/avx512vbmiintrin.internal.h +++ b/third_party/intel/avx512vbmiintrin.internal.h @@ -1,90 +1,124 @@ +/* clang-format off */ +#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0) #ifndef _IMMINTRIN_H_INCLUDED #error "Never use directly; include instead." 
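/*
 * Note on the inline idiom used throughout these headers: under GNU C,
 * `extern __inline` combined with `__attribute__((__gnu_inline__))`
 * selects GNU89 inline semantics, i.e. the body is used only for
 * inlining and no out-of-line symbol is emitted, so any number of
 * translation units can include the header without multiple-definition
 * link errors. `__always_inline__` forces inlining even at -O0, and
 * `__artificial__` lets debuggers treat the wrapper as a single step.
 * The `__funline` macro on the minus side of these hunks appears to be
 * Cosmopolitan's shorthand for that same attribute set; a minimal
 * sketch of the assumed expansion (illustrative, not quoted from the
 * tree) is:
 */
#define __funline_sketch /* hypothetical stand-in for __funline */ \
  extern __inline                                                  \
  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
/* If that assumption holds, the rewrite changes formatting only,
 * not semantics. */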
#endif - #ifndef _AVX512VBMIINTRIN_H_INCLUDED #define _AVX512VBMIINTRIN_H_INCLUDED - #ifndef __AVX512VBMI__ #pragma GCC push_options #pragma GCC target("avx512vbmi") #define __DISABLE_AVX512VBMI__ -#endif /* __AVX512VBMI__ */ - -__funline __m512i _mm512_mask_multishift_epi64_epi8(__m512i __W, __mmask64 __M, - __m512i __X, __m512i __Y) { - return (__m512i)__builtin_ia32_vpmultishiftqb512_mask( - (__v64qi)__X, (__v64qi)__Y, (__v64qi)__W, (__mmask64)__M); +#endif +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_multishift_epi64_epi8 (__m512i __W, __mmask64 __M, __m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_vpmultishiftqb512_mask ((__v64qi) __X, + (__v64qi) __Y, + (__v64qi) __W, + (__mmask64) __M); } - -__funline __m512i _mm512_maskz_multishift_epi64_epi8(__mmask64 __M, __m512i __X, - __m512i __Y) { - return (__m512i)__builtin_ia32_vpmultishiftqb512_mask( - (__v64qi)__X, (__v64qi)__Y, (__v64qi)_mm512_setzero_si512(), - (__mmask64)__M); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_multishift_epi64_epi8 (__mmask64 __M, __m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_vpmultishiftqb512_mask ((__v64qi) __X, + (__v64qi) __Y, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) __M); } - -__funline __m512i _mm512_multishift_epi64_epi8(__m512i __X, __m512i __Y) { - return (__m512i)__builtin_ia32_vpmultishiftqb512_mask( - (__v64qi)__X, (__v64qi)__Y, (__v64qi)_mm512_undefined_epi32(), - (__mmask64)-1); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_multishift_epi64_epi8 (__m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_vpmultishiftqb512_mask ((__v64qi) __X, + (__v64qi) __Y, + (__v64qi) + _mm512_undefined_epi32 (), + (__mmask64) -1); } - -__funline __m512i _mm512_permutexvar_epi8(__m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_permvarqi512_mask( - (__v64qi)__B, (__v64qi)__A, (__v64qi)_mm512_undefined_epi32(), - (__mmask64)-1); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_permutexvar_epi8 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_permvarqi512_mask ((__v64qi) __B, + (__v64qi) __A, + (__v64qi) + _mm512_undefined_epi32 (), + (__mmask64) -1); } - -__funline __m512i _mm512_maskz_permutexvar_epi8(__mmask64 __M, __m512i __A, - __m512i __B) { - return (__m512i)__builtin_ia32_permvarqi512_mask( - (__v64qi)__B, (__v64qi)__A, (__v64qi)_mm512_setzero_si512(), - (__mmask64)__M); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_permutexvar_epi8 (__mmask64 __M, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_permvarqi512_mask ((__v64qi) __B, + (__v64qi) __A, + (__v64qi) + _mm512_setzero_si512(), + (__mmask64) __M); } - -__funline __m512i _mm512_mask_permutexvar_epi8(__m512i __W, __mmask64 __M, - __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_permvarqi512_mask( - (__v64qi)__B, (__v64qi)__A, (__v64qi)__W, (__mmask64)__M); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_permutexvar_epi8 (__m512i __W, __mmask64 __M, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_permvarqi512_mask ((__v64qi) __B, + (__v64qi) __A, + (__v64qi) __W, + (__mmask64) __M); } - -__funline __m512i _mm512_permutex2var_epi8(__m512i __A, __m512i __I, - __m512i __B) { - return 
(__m512i)__builtin_ia32_vpermt2varqi512_mask( - (__v64qi)__I - /* idx */, - (__v64qi)__A, (__v64qi)__B, (__mmask64)-1); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_permutex2var_epi8 (__m512i __A, __m512i __I, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpermt2varqi512_mask ((__v64qi) __I + , + (__v64qi) __A, + (__v64qi) __B, + (__mmask64) -1); } - -__funline __m512i _mm512_mask_permutex2var_epi8(__m512i __A, __mmask64 __U, - __m512i __I, __m512i __B) { - return (__m512i)__builtin_ia32_vpermt2varqi512_mask( - (__v64qi)__I - /* idx */, - (__v64qi)__A, (__v64qi)__B, (__mmask64)__U); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_permutex2var_epi8 (__m512i __A, __mmask64 __U, + __m512i __I, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpermt2varqi512_mask ((__v64qi) __I + , + (__v64qi) __A, + (__v64qi) __B, + (__mmask64) + __U); } - -__funline __m512i _mm512_mask2_permutex2var_epi8(__m512i __A, __m512i __I, - __mmask64 __U, __m512i __B) { - return (__m512i)__builtin_ia32_vpermi2varqi512_mask((__v64qi)__A, - (__v64qi)__I - /* idx */, - (__v64qi)__B, - (__mmask64)__U); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask2_permutex2var_epi8 (__m512i __A, __m512i __I, + __mmask64 __U, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpermi2varqi512_mask ((__v64qi) __A, + (__v64qi) __I + , + (__v64qi) __B, + (__mmask64) + __U); } - -__funline __m512i _mm512_maskz_permutex2var_epi8(__mmask64 __U, __m512i __A, - __m512i __I, __m512i __B) { - return (__m512i)__builtin_ia32_vpermt2varqi512_maskz( - (__v64qi)__I - /* idx */, - (__v64qi)__A, (__v64qi)__B, (__mmask64)__U); +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_permutex2var_epi8 (__mmask64 __U, __m512i __A, + __m512i __I, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpermt2varqi512_maskz ((__v64qi) __I + , + (__v64qi) __A, + (__v64qi) __B, + (__mmask64) + __U); } - #ifdef __DISABLE_AVX512VBMI__ #undef __DISABLE_AVX512VBMI__ #pragma GCC pop_options -#endif /* __DISABLE_AVX512VBMI__ */ - -#endif /* _AVX512VBMIINTRIN_H_INCLUDED */ +#endif +#endif +#endif diff --git a/third_party/intel/avx512vbmivlintrin.internal.h b/third_party/intel/avx512vbmivlintrin.internal.h index a7ff671e6..6ba410426 100644 --- a/third_party/intel/avx512vbmivlintrin.internal.h +++ b/third_party/intel/avx512vbmivlintrin.internal.h @@ -1,159 +1,229 @@ +/* clang-format off */ +#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0) #ifndef _IMMINTRIN_H_INCLUDED -#error \ - "Never use directly; include instead." +#error "Never use directly; include instead." 
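/*
 * Usage note for the target-gated block below: the
 * `#pragma GCC push_options` / `#pragma GCC target("avx512vbmi,avx512vl")`
 * pair lets these intrinsics compile even in translation units built
 * without -mavx512vbmi; the caller remains responsible for a runtime
 * CPU capability check before executing them. Naming convention:
 * `_mm*_mask_*` forms merge inactive lanes from a passthrough operand,
 * while `_mm*_maskz_*` forms zero them. A hedged caller-side sketch
 * (the __builtin_cpu_supports gate is illustrative; Cosmopolitan code
 * may use its own CPU-feature macros instead):
 */
#include <immintrin.h>
__attribute__((__target__("avx512vbmi,avx512vl")))
static inline __m256i pick_bytes (__m256i table, __m256i idx, __mmask32 live)
{
  /* zero-masking: result bytes are cleared wherever `live` has a 0 bit */
  return _mm256_maskz_permutexvar_epi8 (live, idx, table);
}
/*
 * A caller would gate on __builtin_cpu_supports ("avx512vbmi") &&
 * __builtin_cpu_supports ("avx512vl") before invoking pick_bytes().
 */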
#endif - #ifndef _AVX512VBMIVLINTRIN_H_INCLUDED #define _AVX512VBMIVLINTRIN_H_INCLUDED - #if !defined(__AVX512VL__) || !defined(__AVX512VBMI__) #pragma GCC push_options #pragma GCC target("avx512vbmi,avx512vl") #define __DISABLE_AVX512VBMIVL__ -#endif /* __AVX512VBMIVL__ */ - -__funline __m256i _mm256_mask_multishift_epi64_epi8(__m256i __W, __mmask32 __M, - __m256i __X, __m256i __Y) { - return (__m256i)__builtin_ia32_vpmultishiftqb256_mask( - (__v32qi)__X, (__v32qi)__Y, (__v32qi)__W, (__mmask32)__M); +#endif +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_multishift_epi64_epi8 (__m256i __W, __mmask32 __M, __m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_vpmultishiftqb256_mask ((__v32qi) __X, + (__v32qi) __Y, + (__v32qi) __W, + (__mmask32) __M); } - -__funline __m256i _mm256_maskz_multishift_epi64_epi8(__mmask32 __M, __m256i __X, - __m256i __Y) { - return (__m256i)__builtin_ia32_vpmultishiftqb256_mask( - (__v32qi)__X, (__v32qi)__Y, (__v32qi)_mm256_setzero_si256(), - (__mmask32)__M); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_multishift_epi64_epi8 (__mmask32 __M, __m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_vpmultishiftqb256_mask ((__v32qi) __X, + (__v32qi) __Y, + (__v32qi) + _mm256_setzero_si256 (), + (__mmask32) __M); } - -__funline __m256i _mm256_multishift_epi64_epi8(__m256i __X, __m256i __Y) { - return (__m256i)__builtin_ia32_vpmultishiftqb256_mask( - (__v32qi)__X, (__v32qi)__Y, (__v32qi)_mm256_undefined_si256(), - (__mmask32)-1); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_multishift_epi64_epi8 (__m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_vpmultishiftqb256_mask ((__v32qi) __X, + (__v32qi) __Y, + (__v32qi) + _mm256_undefined_si256 (), + (__mmask32) -1); } - -__funline __m128i _mm_mask_multishift_epi64_epi8(__m128i __W, __mmask16 __M, - __m128i __X, __m128i __Y) { - return (__m128i)__builtin_ia32_vpmultishiftqb128_mask( - (__v16qi)__X, (__v16qi)__Y, (__v16qi)__W, (__mmask16)__M); +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_multishift_epi64_epi8 (__m128i __W, __mmask16 __M, __m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_vpmultishiftqb128_mask ((__v16qi) __X, + (__v16qi) __Y, + (__v16qi) __W, + (__mmask16) __M); } - -__funline __m128i _mm_maskz_multishift_epi64_epi8(__mmask16 __M, __m128i __X, - __m128i __Y) { - return (__m128i)__builtin_ia32_vpmultishiftqb128_mask( - (__v16qi)__X, (__v16qi)__Y, (__v16qi)_mm_setzero_si128(), (__mmask16)__M); +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_multishift_epi64_epi8 (__mmask16 __M, __m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_vpmultishiftqb128_mask ((__v16qi) __X, + (__v16qi) __Y, + (__v16qi) + _mm_setzero_si128 (), + (__mmask16) __M); } - -__funline __m128i _mm_multishift_epi64_epi8(__m128i __X, __m128i __Y) { - return (__m128i)__builtin_ia32_vpmultishiftqb128_mask( - (__v16qi)__X, (__v16qi)__Y, (__v16qi)_mm_undefined_si128(), - (__mmask16)-1); +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_multishift_epi64_epi8 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_vpmultishiftqb128_mask ((__v16qi) __X, + (__v16qi) __Y, + (__v16qi) + _mm_undefined_si128 (), + (__mmask16) -1); } - -__funline __m256i 
_mm256_permutexvar_epi8(__m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_permvarqi256_mask( - (__v32qi)__B, (__v32qi)__A, (__v32qi)_mm256_undefined_si256(), - (__mmask32)-1); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permutexvar_epi8 (__m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_permvarqi256_mask ((__v32qi) __B, + (__v32qi) __A, + (__v32qi) + _mm256_undefined_si256 (), + (__mmask32) -1); } - -__funline __m256i _mm256_maskz_permutexvar_epi8(__mmask32 __M, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_permvarqi256_mask( - (__v32qi)__B, (__v32qi)__A, (__v32qi)_mm256_setzero_si256(), - (__mmask32)__M); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_permutexvar_epi8 (__mmask32 __M, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_permvarqi256_mask ((__v32qi) __B, + (__v32qi) __A, + (__v32qi) + _mm256_setzero_si256 (), + (__mmask32) __M); } - -__funline __m256i _mm256_mask_permutexvar_epi8(__m256i __W, __mmask32 __M, - __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_permvarqi256_mask( - (__v32qi)__B, (__v32qi)__A, (__v32qi)__W, (__mmask32)__M); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_permutexvar_epi8 (__m256i __W, __mmask32 __M, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_permvarqi256_mask ((__v32qi) __B, + (__v32qi) __A, + (__v32qi) __W, + (__mmask32) __M); } - -__funline __m128i _mm_permutexvar_epi8(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_permvarqi128_mask( - (__v16qi)__B, (__v16qi)__A, (__v16qi)_mm_undefined_si128(), - (__mmask16)-1); +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_permutexvar_epi8 (__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_permvarqi128_mask ((__v16qi) __B, + (__v16qi) __A, + (__v16qi) + _mm_undefined_si128 (), + (__mmask16) -1); } - -__funline __m128i _mm_maskz_permutexvar_epi8(__mmask16 __M, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_permvarqi128_mask( - (__v16qi)__B, (__v16qi)__A, (__v16qi)_mm_setzero_si128(), (__mmask16)__M); +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_permutexvar_epi8 (__mmask16 __M, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_permvarqi128_mask ((__v16qi) __B, + (__v16qi) __A, + (__v16qi) + _mm_setzero_si128 (), + (__mmask16) __M); } - -__funline __m128i _mm_mask_permutexvar_epi8(__m128i __W, __mmask16 __M, - __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_permvarqi128_mask( - (__v16qi)__B, (__v16qi)__A, (__v16qi)__W, (__mmask16)__M); +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_permutexvar_epi8 (__m128i __W, __mmask16 __M, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_permvarqi128_mask ((__v16qi) __B, + (__v16qi) __A, + (__v16qi) __W, + (__mmask16) __M); } - -__funline __m256i _mm256_permutex2var_epi8(__m256i __A, __m256i __I, - __m256i __B) { - return (__m256i)__builtin_ia32_vpermt2varqi256_mask( - (__v32qi)__I - /* idx */, - (__v32qi)__A, (__v32qi)__B, (__mmask32)-1); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permutex2var_epi8 (__m256i __A, __m256i __I, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpermt2varqi256_mask ((__v32qi) 
__I + , + (__v32qi) __A, + (__v32qi) __B, + (__mmask32) -1); } - -__funline __m256i _mm256_mask_permutex2var_epi8(__m256i __A, __mmask32 __U, - __m256i __I, __m256i __B) { - return (__m256i)__builtin_ia32_vpermt2varqi256_mask( - (__v32qi)__I - /* idx */, - (__v32qi)__A, (__v32qi)__B, (__mmask32)__U); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_permutex2var_epi8 (__m256i __A, __mmask32 __U, + __m256i __I, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpermt2varqi256_mask ((__v32qi) __I + , + (__v32qi) __A, + (__v32qi) __B, + (__mmask32) + __U); } - -__funline __m256i _mm256_mask2_permutex2var_epi8(__m256i __A, __m256i __I, - __mmask32 __U, __m256i __B) { - return (__m256i)__builtin_ia32_vpermi2varqi256_mask((__v32qi)__A, - (__v32qi)__I - /* idx */, - (__v32qi)__B, - (__mmask32)__U); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask2_permutex2var_epi8 (__m256i __A, __m256i __I, + __mmask32 __U, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpermi2varqi256_mask ((__v32qi) __A, + (__v32qi) __I + , + (__v32qi) __B, + (__mmask32) + __U); } - -__funline __m256i _mm256_maskz_permutex2var_epi8(__mmask32 __U, __m256i __A, - __m256i __I, __m256i __B) { - return (__m256i)__builtin_ia32_vpermt2varqi256_maskz( - (__v32qi)__I - /* idx */, - (__v32qi)__A, (__v32qi)__B, (__mmask32)__U); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_permutex2var_epi8 (__mmask32 __U, __m256i __A, + __m256i __I, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpermt2varqi256_maskz ((__v32qi) __I + , + (__v32qi) __A, + (__v32qi) __B, + (__mmask32) + __U); } - -__funline __m128i _mm_permutex2var_epi8(__m128i __A, __m128i __I, __m128i __B) { - return (__m128i)__builtin_ia32_vpermt2varqi128_mask( - (__v16qi)__I - /* idx */, - (__v16qi)__A, (__v16qi)__B, (__mmask16)-1); +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_permutex2var_epi8 (__m128i __A, __m128i __I, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpermt2varqi128_mask ((__v16qi) __I + , + (__v16qi) __A, + (__v16qi) __B, + (__mmask16) -1); } - -__funline __m128i _mm_mask_permutex2var_epi8(__m128i __A, __mmask16 __U, - __m128i __I, __m128i __B) { - return (__m128i)__builtin_ia32_vpermt2varqi128_mask( - (__v16qi)__I - /* idx */, - (__v16qi)__A, (__v16qi)__B, (__mmask16)__U); +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_permutex2var_epi8 (__m128i __A, __mmask16 __U, __m128i __I, + __m128i __B) +{ + return (__m128i) __builtin_ia32_vpermt2varqi128_mask ((__v16qi) __I + , + (__v16qi) __A, + (__v16qi) __B, + (__mmask16) + __U); } - -__funline __m128i _mm_mask2_permutex2var_epi8(__m128i __A, __m128i __I, - __mmask16 __U, __m128i __B) { - return (__m128i)__builtin_ia32_vpermi2varqi128_mask((__v16qi)__A, - (__v16qi)__I - /* idx */, - (__v16qi)__B, - (__mmask16)__U); +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask2_permutex2var_epi8 (__m128i __A, __m128i __I, __mmask16 __U, + __m128i __B) +{ + return (__m128i) __builtin_ia32_vpermi2varqi128_mask ((__v16qi) __A, + (__v16qi) __I + , + (__v16qi) __B, + (__mmask16) + __U); } - -__funline __m128i _mm_maskz_permutex2var_epi8(__mmask16 __U, __m128i __A, - __m128i __I, __m128i __B) { - return (__m128i)__builtin_ia32_vpermt2varqi128_maskz( - (__v16qi)__I - /* idx */, - (__v16qi)__A, 
(__v16qi)__B, (__mmask16)__U); +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_permutex2var_epi8 (__mmask16 __U, __m128i __A, __m128i __I, + __m128i __B) +{ + return (__m128i) __builtin_ia32_vpermt2varqi128_maskz ((__v16qi) __I + , + (__v16qi) __A, + (__v16qi) __B, + (__mmask16) + __U); } - #ifdef __DISABLE_AVX512VBMIVL__ #undef __DISABLE_AVX512VBMIVL__ #pragma GCC pop_options -#endif /* __DISABLE_AVX512VBMIVL__ */ - -#endif /* _AVX512VBMIVLINTRIN_H_INCLUDED */ +#endif +#endif +#endif diff --git a/third_party/intel/avx512vlbwintrin.internal.h b/third_party/intel/avx512vlbwintrin.internal.h index 474b39b49..808abdc8d 100644 --- a/third_party/intel/avx512vlbwintrin.internal.h +++ b/third_party/intel/avx512vlbwintrin.internal.h @@ -1,2740 +1,4075 @@ +/* clang-format off */ +#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0) #ifndef _IMMINTRIN_H_INCLUDED #error "Never use directly; include instead." #endif - #ifndef _AVX512VLBWINTRIN_H_INCLUDED #define _AVX512VLBWINTRIN_H_INCLUDED - #if !defined(__AVX512VL__) || !defined(__AVX512BW__) #pragma GCC push_options #pragma GCC target("avx512vl,avx512bw") #define __DISABLE_AVX512VLBW__ -#endif /* __AVX512VLBW__ */ - -__funline __m256i _mm256_mask_mov_epi8(__m256i __W, __mmask32 __U, __m256i __A) { - return (__m256i)__builtin_ia32_movdquqi256_mask((__v32qi)__A, (__v32qi)__W, - (__mmask32)__U); -} - -__funline __m256i _mm256_maskz_mov_epi8(__mmask32 __U, __m256i __A) { - return (__m256i)__builtin_ia32_movdquqi256_mask( - (__v32qi)__A, (__v32qi)_mm256_setzero_si256(), (__mmask32)__U); -} - -__funline __m128i _mm_mask_mov_epi8(__m128i __W, __mmask16 __U, __m128i __A) { - return (__m128i)__builtin_ia32_movdquqi128_mask((__v16qi)__A, (__v16qi)__W, - (__mmask16)__U); -} - -__funline __m128i _mm_maskz_mov_epi8(__mmask16 __U, __m128i __A) { - return (__m128i)__builtin_ia32_movdquqi128_mask( - (__v16qi)__A, (__v16qi)_mm_setzero_si128(), (__mmask16)__U); -} - -__funline void _mm256_mask_storeu_epi8(void *__P, __mmask32 __U, __m256i __A) { - __builtin_ia32_storedquqi256_mask((char *)__P, (__v32qi)__A, (__mmask32)__U); -} - -__funline void _mm_mask_storeu_epi8(void *__P, __mmask16 __U, __m128i __A) { - __builtin_ia32_storedquqi128_mask((char *)__P, (__v16qi)__A, (__mmask16)__U); -} - -__funline __m256i _mm256_mask_loadu_epi16(__m256i __W, __mmask16 __U, - void const *__P) { - return (__m256i)__builtin_ia32_loaddquhi256_mask( - (const short *)__P, (__v16hi)__W, (__mmask16)__U); -} - -__funline __m256i _mm256_maskz_loadu_epi16(__mmask16 __U, void const *__P) { - return (__m256i)__builtin_ia32_loaddquhi256_mask( - (const short *)__P, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U); -} - -__funline __m128i _mm_mask_loadu_epi16(__m128i __W, __mmask8 __U, - void const *__P) { - return (__m128i)__builtin_ia32_loaddquhi128_mask((const short *)__P, - (__v8hi)__W, (__mmask8)__U); -} - -__funline __m128i _mm_maskz_loadu_epi16(__mmask8 __U, void const *__P) { - return (__m128i)__builtin_ia32_loaddquhi128_mask( - (const short *)__P, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m256i _mm256_mask_mov_epi16(__m256i __W, __mmask16 __U, __m256i __A) { - return (__m256i)__builtin_ia32_movdquhi256_mask((__v16hi)__A, (__v16hi)__W, - (__mmask16)__U); -} - -__funline __m256i _mm256_maskz_mov_epi16(__mmask16 __U, __m256i __A) { - return (__m256i)__builtin_ia32_movdquhi256_mask( - (__v16hi)__A, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U); -} - -__funline __m128i _mm_mask_mov_epi16(__m128i __W, 
__mmask8 __U, __m128i __A) { - return (__m128i)__builtin_ia32_movdquhi128_mask((__v8hi)__A, (__v8hi)__W, - (__mmask8)__U); -} - -__funline __m128i _mm_maskz_mov_epi16(__mmask8 __U, __m128i __A) { - return (__m128i)__builtin_ia32_movdquhi128_mask( - (__v8hi)__A, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m256i _mm256_mask_loadu_epi8(__m256i __W, __mmask32 __U, - void const *__P) { - return (__m256i)__builtin_ia32_loaddquqi256_mask( - (const char *)__P, (__v32qi)__W, (__mmask32)__U); -} - -__funline __m256i _mm256_maskz_loadu_epi8(__mmask32 __U, void const *__P) { - return (__m256i)__builtin_ia32_loaddquqi256_mask( - (const char *)__P, (__v32qi)_mm256_setzero_si256(), (__mmask32)__U); -} - -__funline __m128i _mm_mask_loadu_epi8(__m128i __W, __mmask16 __U, - void const *__P) { - return (__m128i)__builtin_ia32_loaddquqi128_mask( - (const char *)__P, (__v16qi)__W, (__mmask16)__U); -} - -__funline __m128i _mm_maskz_loadu_epi8(__mmask16 __U, void const *__P) { - return (__m128i)__builtin_ia32_loaddquqi128_mask( - (const char *)__P, (__v16qi)_mm_setzero_si128(), (__mmask16)__U); -} - -__funline __m128i _mm256_cvtepi16_epi8(__m256i __A) { - - return (__m128i)__builtin_ia32_pmovwb256_mask( - (__v16hi)__A, (__v16qi)_mm_undefined_si128(), (__mmask16)-1); -} - -__funline void _mm256_mask_cvtepi16_storeu_epi8(void *__P, __mmask16 __M, - __m256i __A) { - __builtin_ia32_pmovwb256mem_mask((__v16qi *)__P, (__v16hi)__A, __M); -} - -__funline __m128i _mm256_mask_cvtepi16_epi8(__m128i __O, __mmask16 __M, - __m256i __A) { - return (__m128i)__builtin_ia32_pmovwb256_mask((__v16hi)__A, (__v16qi)__O, - __M); -} - -__funline __m128i _mm256_maskz_cvtepi16_epi8(__mmask16 __M, __m256i __A) { - return (__m128i)__builtin_ia32_pmovwb256_mask( - (__v16hi)__A, (__v16qi)_mm_setzero_si128(), __M); -} - -__funline __m128i _mm_cvtsepi16_epi8(__m128i __A) { - - return (__m128i)__builtin_ia32_pmovswb128_mask( - (__v8hi)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1); -} - -__funline void _mm_mask_cvtsepi16_storeu_epi8(void *__P, __mmask8 __M, - __m128i __A) { - __builtin_ia32_pmovswb128mem_mask((__v8qi *)__P, (__v8hi)__A, __M); -} - -__funline __m128i _mm_mask_cvtsepi16_epi8(__m128i __O, __mmask8 __M, - __m128i __A) { - return (__m128i)__builtin_ia32_pmovswb128_mask((__v8hi)__A, (__v16qi)__O, - __M); -} - -__funline __m128i _mm_maskz_cvtsepi16_epi8(__mmask8 __M, __m128i __A) { - return (__m128i)__builtin_ia32_pmovswb128_mask( - (__v8hi)__A, (__v16qi)_mm_setzero_si128(), __M); -} - -__funline __m128i _mm256_cvtsepi16_epi8(__m256i __A) { - - return (__m128i)__builtin_ia32_pmovswb256_mask( - (__v16hi)__A, (__v16qi)_mm_undefined_si128(), (__mmask16)-1); -} - -__funline void _mm256_mask_cvtsepi16_storeu_epi8(void *__P, __mmask16 __M, - __m256i __A) { - __builtin_ia32_pmovswb256mem_mask((__v16qi *)__P, (__v16hi)__A, __M); -} - -__funline __m128i _mm256_mask_cvtsepi16_epi8(__m128i __O, __mmask16 __M, - __m256i __A) { - return (__m128i)__builtin_ia32_pmovswb256_mask((__v16hi)__A, (__v16qi)__O, - __M); -} - -__funline __m128i _mm256_maskz_cvtsepi16_epi8(__mmask16 __M, __m256i __A) { - return (__m128i)__builtin_ia32_pmovswb256_mask( - (__v16hi)__A, (__v16qi)_mm_setzero_si128(), __M); -} - -__funline __m128i _mm_cvtusepi16_epi8(__m128i __A) { - - return (__m128i)__builtin_ia32_pmovuswb128_mask( - (__v8hi)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1); -} - -__funline void _mm_mask_cvtusepi16_storeu_epi8(void *__P, __mmask8 __M, - __m128i __A) { - __builtin_ia32_pmovuswb128mem_mask((__v8qi *)__P, (__v8hi)__A, __M); 
-} - -__funline __m128i _mm_mask_cvtusepi16_epi8(__m128i __O, __mmask8 __M, - __m128i __A) { - return (__m128i)__builtin_ia32_pmovuswb128_mask((__v8hi)__A, (__v16qi)__O, - __M); -} - -__funline __m128i _mm_maskz_cvtusepi16_epi8(__mmask8 __M, __m128i __A) { - return (__m128i)__builtin_ia32_pmovuswb128_mask( - (__v8hi)__A, (__v16qi)_mm_setzero_si128(), __M); -} - -__funline __m128i _mm256_cvtusepi16_epi8(__m256i __A) { - - return (__m128i)__builtin_ia32_pmovuswb256_mask( - (__v16hi)__A, (__v16qi)_mm_undefined_si128(), (__mmask16)-1); -} - -__funline void _mm256_mask_cvtusepi16_storeu_epi8(void *__P, __mmask16 __M, - __m256i __A) { - __builtin_ia32_pmovuswb256mem_mask((__v16qi *)__P, (__v16hi)__A, __M); -} - -__funline __m128i _mm256_mask_cvtusepi16_epi8(__m128i __O, __mmask16 __M, - __m256i __A) { - return (__m128i)__builtin_ia32_pmovuswb256_mask((__v16hi)__A, (__v16qi)__O, - __M); -} - -__funline __m128i _mm256_maskz_cvtusepi16_epi8(__mmask16 __M, __m256i __A) { - return (__m128i)__builtin_ia32_pmovuswb256_mask( - (__v16hi)__A, (__v16qi)_mm_setzero_si128(), __M); -} - -__funline __m256i _mm256_mask_broadcastb_epi8(__m256i __O, __mmask32 __M, - __m128i __A) { - return (__m256i)__builtin_ia32_pbroadcastb256_mask((__v16qi)__A, (__v32qi)__O, - __M); -} - -__funline __m256i _mm256_maskz_broadcastb_epi8(__mmask32 __M, __m128i __A) { - return (__m256i)__builtin_ia32_pbroadcastb256_mask( - (__v16qi)__A, (__v32qi)_mm256_setzero_si256(), __M); -} - -__funline __m256i _mm256_mask_set1_epi8(__m256i __O, __mmask32 __M, char __A) { - return (__m256i)__builtin_ia32_pbroadcastb256_gpr_mask(__A, (__v32qi)__O, - __M); -} - -__funline __m256i _mm256_maskz_set1_epi8(__mmask32 __M, char __A) { - return (__m256i)__builtin_ia32_pbroadcastb256_gpr_mask( - __A, (__v32qi)_mm256_setzero_si256(), __M); -} - -__funline __m128i _mm_mask_broadcastb_epi8(__m128i __O, __mmask16 __M, - __m128i __A) { - return (__m128i)__builtin_ia32_pbroadcastb128_mask((__v16qi)__A, (__v16qi)__O, - __M); -} - -__funline __m128i _mm_maskz_broadcastb_epi8(__mmask16 __M, __m128i __A) { - return (__m128i)__builtin_ia32_pbroadcastb128_mask( - (__v16qi)__A, (__v16qi)_mm_setzero_si128(), __M); -} - -__funline __m128i _mm_mask_set1_epi8(__m128i __O, __mmask16 __M, char __A) { - return (__m128i)__builtin_ia32_pbroadcastb128_gpr_mask(__A, (__v16qi)__O, - __M); -} - -__funline __m128i _mm_maskz_set1_epi8(__mmask16 __M, char __A) { - return (__m128i)__builtin_ia32_pbroadcastb128_gpr_mask( - __A, (__v16qi)_mm_setzero_si128(), __M); -} - -__funline __m256i _mm256_mask_broadcastw_epi16(__m256i __O, __mmask16 __M, - __m128i __A) { - return (__m256i)__builtin_ia32_pbroadcastw256_mask((__v8hi)__A, (__v16hi)__O, - __M); -} - -__funline __m256i _mm256_maskz_broadcastw_epi16(__mmask16 __M, __m128i __A) { - return (__m256i)__builtin_ia32_pbroadcastw256_mask( - (__v8hi)__A, (__v16hi)_mm256_setzero_si256(), __M); -} - -__funline __m256i _mm256_mask_set1_epi16(__m256i __O, __mmask16 __M, short __A) { - return (__m256i)__builtin_ia32_pbroadcastw256_gpr_mask(__A, (__v16hi)__O, - __M); -} - -__funline __m256i _mm256_maskz_set1_epi16(__mmask16 __M, short __A) { - return (__m256i)__builtin_ia32_pbroadcastw256_gpr_mask( - __A, (__v16hi)_mm256_setzero_si256(), __M); -} - -__funline __m128i _mm_mask_broadcastw_epi16(__m128i __O, __mmask8 __M, - __m128i __A) { - return (__m128i)__builtin_ia32_pbroadcastw128_mask((__v8hi)__A, (__v8hi)__O, - __M); -} - -__funline __m128i _mm_maskz_broadcastw_epi16(__mmask8 __M, __m128i __A) { - return 
(__m128i)__builtin_ia32_pbroadcastw128_mask( - (__v8hi)__A, (__v8hi)_mm_setzero_si128(), __M); -} - -__funline __m128i _mm_mask_set1_epi16(__m128i __O, __mmask8 __M, short __A) { - return (__m128i)__builtin_ia32_pbroadcastw128_gpr_mask(__A, (__v8hi)__O, __M); -} - -__funline __m128i _mm_maskz_set1_epi16(__mmask8 __M, short __A) { - return (__m128i)__builtin_ia32_pbroadcastw128_gpr_mask( - __A, (__v8hi)_mm_setzero_si128(), __M); -} - -__funline __m256i _mm256_permutexvar_epi16(__m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_permvarhi256_mask( - (__v16hi)__B, (__v16hi)__A, (__v16hi)_mm256_setzero_si256(), - (__mmask16)-1); -} - -__funline __m256i _mm256_maskz_permutexvar_epi16(__mmask16 __M, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_permvarhi256_mask( - (__v16hi)__B, (__v16hi)__A, (__v16hi)_mm256_setzero_si256(), - (__mmask16)__M); -} - -__funline __m256i _mm256_mask_permutexvar_epi16(__m256i __W, __mmask16 __M, - __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_permvarhi256_mask( - (__v16hi)__B, (__v16hi)__A, (__v16hi)__W, (__mmask16)__M); -} - -__funline __m128i _mm_permutexvar_epi16(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_permvarhi128_mask( - (__v8hi)__B, (__v8hi)__A, (__v8hi)_mm_setzero_si128(), (__mmask8)-1); -} - -__funline __m128i _mm_maskz_permutexvar_epi16(__mmask8 __M, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_permvarhi128_mask( - (__v8hi)__B, (__v8hi)__A, (__v8hi)_mm_setzero_si128(), (__mmask8)__M); -} - -__funline __m128i _mm_mask_permutexvar_epi16(__m128i __W, __mmask8 __M, - __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_permvarhi128_mask((__v8hi)__B, (__v8hi)__A, - (__v8hi)__W, (__mmask8)__M); -} - -__funline __m256i _mm256_permutex2var_epi16(__m256i __A, __m256i __I, - __m256i __B) { - return (__m256i)__builtin_ia32_vpermt2varhi256_mask( - (__v16hi)__I - /* idx */, - (__v16hi)__A, (__v16hi)__B, (__mmask16)-1); -} - -__funline __m256i _mm256_mask_permutex2var_epi16(__m256i __A, __mmask16 __U, - __m256i __I, __m256i __B) { - return (__m256i)__builtin_ia32_vpermt2varhi256_mask( - (__v16hi)__I - /* idx */, - (__v16hi)__A, (__v16hi)__B, (__mmask16)__U); -} - -__funline __m256i _mm256_mask2_permutex2var_epi16(__m256i __A, __m256i __I, - __mmask16 __U, __m256i __B) { - return (__m256i)__builtin_ia32_vpermi2varhi256_mask((__v16hi)__A, - (__v16hi)__I - /* idx */, - (__v16hi)__B, - (__mmask16)__U); -} - -__funline __m256i _mm256_maskz_permutex2var_epi16(__mmask16 __U, __m256i __A, - __m256i __I, __m256i __B) { - return (__m256i)__builtin_ia32_vpermt2varhi256_maskz( - (__v16hi)__I - /* idx */, - (__v16hi)__A, (__v16hi)__B, (__mmask16)__U); -} - -__funline __m128i _mm_permutex2var_epi16(__m128i __A, __m128i __I, __m128i __B) { - return (__m128i)__builtin_ia32_vpermt2varhi128_mask((__v8hi)__I - /* idx */, - (__v8hi)__A, (__v8hi)__B, - (__mmask8)-1); -} - -__funline __m128i _mm_mask_permutex2var_epi16(__m128i __A, __mmask8 __U, - __m128i __I, __m128i __B) { - return (__m128i)__builtin_ia32_vpermt2varhi128_mask((__v8hi)__I - /* idx */, - (__v8hi)__A, (__v8hi)__B, - (__mmask8)__U); -} - -__funline __m128i _mm_mask2_permutex2var_epi16(__m128i __A, __m128i __I, - __mmask8 __U, __m128i __B) { - return (__m128i)__builtin_ia32_vpermi2varhi128_mask((__v8hi)__A, - (__v8hi)__I - /* idx */, - (__v8hi)__B, - (__mmask8)__U); -} - -__funline __m128i _mm_maskz_permutex2var_epi16(__mmask8 __U, __m128i __A, - __m128i __I, __m128i __B) { - return (__m128i)__builtin_ia32_vpermt2varhi128_maskz((__v8hi)__I - 
/* idx */, - (__v8hi)__A, (__v8hi)__B, - (__mmask8)__U); -} - -__funline __m256i _mm256_mask_maddubs_epi16(__m256i __W, __mmask16 __U, - __m256i __X, __m256i __Y) { - return (__m256i)__builtin_ia32_pmaddubsw256_mask( - (__v32qi)__X, (__v32qi)__Y, (__v16hi)__W, (__mmask16)__U); -} - -__funline __m256i _mm256_maskz_maddubs_epi16(__mmask16 __U, __m256i __X, - __m256i __Y) { - return (__m256i)__builtin_ia32_pmaddubsw256_mask( - (__v32qi)__X, (__v32qi)__Y, (__v16hi)_mm256_setzero_si256(), - (__mmask16)__U); -} - -__funline __m128i _mm_mask_maddubs_epi16(__m128i __W, __mmask8 __U, __m128i __X, - __m128i __Y) { - return (__m128i)__builtin_ia32_pmaddubsw128_mask((__v16qi)__X, (__v16qi)__Y, - (__v8hi)__W, (__mmask8)__U); -} - -__funline __m128i _mm_maskz_maddubs_epi16(__mmask8 __U, __m128i __X, - __m128i __Y) { - return (__m128i)__builtin_ia32_pmaddubsw128_mask( - (__v16qi)__X, (__v16qi)__Y, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m256i _mm256_mask_madd_epi16(__m256i __W, __mmask8 __U, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_pmaddwd256_mask((__v16hi)__A, (__v16hi)__B, - (__v8si)__W, (__mmask8)__U); -} - -__funline __m256i _mm256_maskz_madd_epi16(__mmask8 __U, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_pmaddwd256_mask((__v16hi)__A, (__v16hi)__B, - (__v8si)_mm256_setzero_si256(), - (__mmask8)__U); -} - -__funline __m128i _mm_mask_madd_epi16(__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_pmaddwd128_mask((__v8hi)__A, (__v8hi)__B, - (__v4si)__W, (__mmask8)__U); -} - -__funline __m128i _mm_maskz_madd_epi16(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_pmaddwd128_mask( - (__v8hi)__A, (__v8hi)__B, (__v4si)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __mmask16 _mm_movepi8_mask(__m128i __A) { - return (__mmask16)__builtin_ia32_cvtb2mask128((__v16qi)__A); -} - -__funline __mmask32 _mm256_movepi8_mask(__m256i __A) { - return (__mmask32)__builtin_ia32_cvtb2mask256((__v32qi)__A); -} - -__funline __mmask8 _mm_movepi16_mask(__m128i __A) { - return (__mmask8)__builtin_ia32_cvtw2mask128((__v8hi)__A); -} - -__funline __mmask16 _mm256_movepi16_mask(__m256i __A) { - return (__mmask16)__builtin_ia32_cvtw2mask256((__v16hi)__A); -} - -__funline __m128i _mm_movm_epi8(__mmask16 __A) { - return (__m128i)__builtin_ia32_cvtmask2b128(__A); -} - -__funline __m256i _mm256_movm_epi8(__mmask32 __A) { - return (__m256i)__builtin_ia32_cvtmask2b256(__A); -} - -__funline __m128i _mm_movm_epi16(__mmask8 __A) { - return (__m128i)__builtin_ia32_cvtmask2w128(__A); -} - -__funline __m256i _mm256_movm_epi16(__mmask16 __A) { - return (__m256i)__builtin_ia32_cvtmask2w256(__A); -} - -__funline __mmask16 _mm_test_epi8_mask(__m128i __A, __m128i __B) { - return (__mmask16)__builtin_ia32_ptestmb128((__v16qi)__A, (__v16qi)__B, - (__mmask16)-1); -} - -__funline __mmask16 _mm_mask_test_epi8_mask(__mmask16 __U, __m128i __A, - __m128i __B) { - return (__mmask16)__builtin_ia32_ptestmb128((__v16qi)__A, (__v16qi)__B, __U); -} - -__funline __mmask32 _mm256_test_epi8_mask(__m256i __A, __m256i __B) { - return (__mmask32)__builtin_ia32_ptestmb256((__v32qi)__A, (__v32qi)__B, - (__mmask32)-1); -} - -__funline __mmask32 _mm256_mask_test_epi8_mask(__mmask32 __U, __m256i __A, - __m256i __B) { - return (__mmask32)__builtin_ia32_ptestmb256((__v32qi)__A, (__v32qi)__B, __U); -} - -__funline __mmask8 _mm_test_epi16_mask(__m128i __A, __m128i __B) { - return (__mmask8)__builtin_ia32_ptestmw128((__v8hi)__A, (__v8hi)__B, - 
(__mmask8)-1); -} - -__funline __mmask8 _mm_mask_test_epi16_mask(__mmask8 __U, __m128i __A, - __m128i __B) { - return (__mmask8)__builtin_ia32_ptestmw128((__v8hi)__A, (__v8hi)__B, __U); -} - -__funline __mmask16 _mm256_test_epi16_mask(__m256i __A, __m256i __B) { - return (__mmask16)__builtin_ia32_ptestmw256((__v16hi)__A, (__v16hi)__B, - (__mmask16)-1); -} - -__funline __mmask16 _mm256_mask_test_epi16_mask(__mmask16 __U, __m256i __A, - __m256i __B) { - return (__mmask16)__builtin_ia32_ptestmw256((__v16hi)__A, (__v16hi)__B, __U); -} - -__funline __m256i _mm256_maskz_min_epu16(__mmask16 __M, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_pminuw256_mask((__v16hi)__A, (__v16hi)__B, - (__v16hi)_mm256_setzero_si256(), - (__mmask16)__M); -} - -__funline __m256i _mm256_mask_min_epu16(__m256i __W, __mmask16 __M, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_pminuw256_mask((__v16hi)__A, (__v16hi)__B, - (__v16hi)__W, (__mmask16)__M); -} - -__funline __m128i _mm_maskz_min_epu16(__mmask8 __M, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_pminuw128_mask( - (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__M); -} - -__funline __m128i _mm_mask_min_epu16(__m128i __W, __mmask8 __M, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_pminuw128_mask((__v8hi)__A, (__v8hi)__B, - (__v8hi)__W, (__mmask8)__M); -} - -__funline __m256i _mm256_maskz_min_epi16(__mmask16 __M, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_pminsw256_mask((__v16hi)__A, (__v16hi)__B, - (__v16hi)_mm256_setzero_si256(), - (__mmask16)__M); -} - -__funline __m256i _mm256_mask_min_epi16(__m256i __W, __mmask16 __M, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_pminsw256_mask((__v16hi)__A, (__v16hi)__B, - (__v16hi)__W, (__mmask16)__M); -} - -__funline __m256i _mm256_maskz_max_epu8(__mmask32 __M, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_pmaxub256_mask((__v32qi)__A, (__v32qi)__B, - (__v32qi)_mm256_setzero_si256(), - (__mmask32)__M); -} - -__funline __m256i _mm256_mask_max_epu8(__m256i __W, __mmask32 __M, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_pmaxub256_mask((__v32qi)__A, (__v32qi)__B, - (__v32qi)__W, (__mmask32)__M); -} - -__funline __m128i _mm_maskz_max_epu8(__mmask16 __M, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_pmaxub128_mask( - (__v16qi)__A, (__v16qi)__B, (__v16qi)_mm_setzero_si128(), (__mmask16)__M); -} - -__funline __m128i _mm_mask_max_epu8(__m128i __W, __mmask16 __M, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_pmaxub128_mask((__v16qi)__A, (__v16qi)__B, - (__v16qi)__W, (__mmask16)__M); -} - -__funline __m256i _mm256_maskz_max_epi8(__mmask32 __M, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_pmaxsb256_mask((__v32qi)__A, (__v32qi)__B, - (__v32qi)_mm256_setzero_si256(), - (__mmask32)__M); -} - -__funline __m256i _mm256_mask_max_epi8(__m256i __W, __mmask32 __M, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_pmaxsb256_mask((__v32qi)__A, (__v32qi)__B, - (__v32qi)__W, (__mmask32)__M); -} - -__funline __m128i _mm_maskz_max_epi8(__mmask16 __M, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_pmaxsb128_mask( - (__v16qi)__A, (__v16qi)__B, (__v16qi)_mm_setzero_si128(), (__mmask16)__M); -} - -__funline __m128i _mm_mask_max_epi8(__m128i __W, __mmask16 __M, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_pmaxsb128_mask((__v16qi)__A, (__v16qi)__B, - (__v16qi)__W, (__mmask16)__M); -} - -__funline __m256i 
_mm256_maskz_min_epu8(__mmask32 __M, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_pminub256_mask((__v32qi)__A, (__v32qi)__B, - (__v32qi)_mm256_setzero_si256(), - (__mmask32)__M); -} - -__funline __m256i _mm256_mask_min_epu8(__m256i __W, __mmask32 __M, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_pminub256_mask((__v32qi)__A, (__v32qi)__B, - (__v32qi)__W, (__mmask32)__M); -} - -__funline __m128i _mm_maskz_min_epu8(__mmask16 __M, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_pminub128_mask( - (__v16qi)__A, (__v16qi)__B, (__v16qi)_mm_setzero_si128(), (__mmask16)__M); -} - -__funline __m128i _mm_mask_min_epu8(__m128i __W, __mmask16 __M, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_pminub128_mask((__v16qi)__A, (__v16qi)__B, - (__v16qi)__W, (__mmask16)__M); -} - -__funline __m256i _mm256_maskz_min_epi8(__mmask32 __M, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_pminsb256_mask((__v32qi)__A, (__v32qi)__B, - (__v32qi)_mm256_setzero_si256(), - (__mmask32)__M); -} - -__funline __m256i _mm256_mask_min_epi8(__m256i __W, __mmask32 __M, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_pminsb256_mask((__v32qi)__A, (__v32qi)__B, - (__v32qi)__W, (__mmask32)__M); -} - -__funline __m128i _mm_maskz_min_epi8(__mmask16 __M, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_pminsb128_mask( - (__v16qi)__A, (__v16qi)__B, (__v16qi)_mm_setzero_si128(), (__mmask16)__M); -} - -__funline __m128i _mm_mask_min_epi8(__m128i __W, __mmask16 __M, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_pminsb128_mask((__v16qi)__A, (__v16qi)__B, - (__v16qi)__W, (__mmask16)__M); -} - -__funline __m256i _mm256_maskz_max_epi16(__mmask16 __M, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_pmaxsw256_mask((__v16hi)__A, (__v16hi)__B, - (__v16hi)_mm256_setzero_si256(), - (__mmask16)__M); -} - -__funline __m256i _mm256_mask_max_epi16(__m256i __W, __mmask16 __M, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_pmaxsw256_mask((__v16hi)__A, (__v16hi)__B, - (__v16hi)__W, (__mmask16)__M); -} - -__funline __m128i _mm_maskz_max_epi16(__mmask8 __M, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_pmaxsw128_mask( - (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__M); -} - -__funline __m128i _mm_mask_max_epi16(__m128i __W, __mmask8 __M, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_pmaxsw128_mask((__v8hi)__A, (__v8hi)__B, - (__v8hi)__W, (__mmask8)__M); -} - -__funline __m256i _mm256_maskz_max_epu16(__mmask16 __M, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_pmaxuw256_mask((__v16hi)__A, (__v16hi)__B, - (__v16hi)_mm256_setzero_si256(), - (__mmask16)__M); -} - -__funline __m256i _mm256_mask_max_epu16(__m256i __W, __mmask16 __M, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_pmaxuw256_mask((__v16hi)__A, (__v16hi)__B, - (__v16hi)__W, (__mmask16)__M); -} - -__funline __m128i _mm_maskz_max_epu16(__mmask8 __M, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_pmaxuw128_mask( - (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__M); -} - -__funline __m128i _mm_mask_max_epu16(__m128i __W, __mmask8 __M, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_pmaxuw128_mask((__v8hi)__A, (__v8hi)__B, - (__v8hi)__W, (__mmask8)__M); -} - -__funline __m128i _mm_maskz_min_epi16(__mmask8 __M, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_pminsw128_mask( - (__v8hi)__A, (__v8hi)__B, 
(__v8hi)_mm_setzero_si128(), (__mmask8)__M); -} - -__funline __m128i _mm_mask_min_epi16(__m128i __W, __mmask8 __M, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_pminsw128_mask((__v8hi)__A, (__v8hi)__B, - (__v8hi)__W, (__mmask8)__M); -} - -#ifdef __OPTIMIZE__ -__funline __m256i _mm256_mask_alignr_epi8(__m256i __W, __mmask32 __U, __m256i __A, - __m256i __B, const int __N) { - return (__m256i)__builtin_ia32_palignr256_mask( - (__v4di)__A, (__v4di)__B, __N * 8, (__v4di)__W, (__mmask32)__U); -} - -__funline __m256i _mm256_maskz_alignr_epi8(__mmask32 __U, __m256i __A, - __m256i __B, const int __N) { - return (__m256i)__builtin_ia32_palignr256_mask( - (__v4di)__A, (__v4di)__B, __N * 8, (__v4di)_mm256_setzero_si256(), - (__mmask32)__U); -} - -__funline __m128i _mm_mask_alignr_epi8(__m128i __W, __mmask16 __U, __m128i __A, - __m128i __B, const int __N) { - return (__m128i)__builtin_ia32_palignr128_mask( - (__v2di)__A, (__v2di)__B, __N * 8, (__v2di)__W, (__mmask16)__U); -} - -__funline __m128i _mm_maskz_alignr_epi8(__mmask16 __U, __m128i __A, __m128i __B, - const int __N) { - return (__m128i)__builtin_ia32_palignr128_mask( - (__v2di)__A, (__v2di)__B, __N * 8, (__v2di)_mm_setzero_si128(), - (__mmask16)__U); -} - -__funline __m256i _mm256_dbsad_epu8(__m256i __A, __m256i __B, const int __imm) { - return (__m256i)__builtin_ia32_dbpsadbw256_mask( - (__v32qi)__A, (__v32qi)__B, __imm, (__v16hi)_mm256_setzero_si256(), - (__mmask16)-1); -} - -__funline __m256i _mm256_mask_dbsad_epu8(__m256i __W, __mmask16 __U, __m256i __A, - __m256i __B, const int __imm) { - return (__m256i)__builtin_ia32_dbpsadbw256_mask( - (__v32qi)__A, (__v32qi)__B, __imm, (__v16hi)__W, (__mmask16)__U); -} - -__funline __m256i _mm256_maskz_dbsad_epu8(__mmask16 __U, __m256i __A, __m256i __B, - const int __imm) { - return (__m256i)__builtin_ia32_dbpsadbw256_mask( - (__v32qi)__A, (__v32qi)__B, __imm, (__v16hi)_mm256_setzero_si256(), - (__mmask16)__U); -} - -__funline __m128i _mm_dbsad_epu8(__m128i __A, __m128i __B, const int __imm) { - return (__m128i)__builtin_ia32_dbpsadbw128_mask( - (__v16qi)__A, (__v16qi)__B, __imm, (__v8hi)_mm_setzero_si128(), - (__mmask8)-1); -} - -__funline __m128i _mm_mask_dbsad_epu8(__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B, const int __imm) { - return (__m128i)__builtin_ia32_dbpsadbw128_mask( - (__v16qi)__A, (__v16qi)__B, __imm, (__v8hi)__W, (__mmask8)__U); -} - -__funline __m128i _mm_maskz_dbsad_epu8(__mmask8 __U, __m128i __A, __m128i __B, - const int __imm) { - return (__m128i)__builtin_ia32_dbpsadbw128_mask( - (__v16qi)__A, (__v16qi)__B, __imm, (__v8hi)_mm_setzero_si128(), - (__mmask8)__U); -} - -__funline __m128i _mm_mask_blend_epi16(__mmask8 __U, __m128i __A, __m128i __W) { - return (__m128i)__builtin_ia32_blendmw_128_mask((__v8hi)__A, (__v8hi)__W, - (__mmask8)__U); -} - -__funline __m128i _mm_mask_blend_epi8(__mmask16 __U, __m128i __A, __m128i __W) { - return (__m128i)__builtin_ia32_blendmb_128_mask((__v16qi)__A, (__v16qi)__W, - (__mmask16)__U); -} - -__funline __m256i _mm256_mask_blend_epi16(__mmask16 __U, __m256i __A, - __m256i __W) { - return (__m256i)__builtin_ia32_blendmw_256_mask((__v16hi)__A, (__v16hi)__W, - (__mmask16)__U); -} - -__funline __m256i _mm256_mask_blend_epi8(__mmask32 __U, __m256i __A, - __m256i __W) { - return (__m256i)__builtin_ia32_blendmb_256_mask((__v32qi)__A, (__v32qi)__W, - (__mmask32)__U); -} - -__funline __mmask8 _mm_mask_cmp_epi16_mask(__mmask8 __U, __m128i __X, __m128i __Y, - const int __P) { - return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__X, 
(__v8hi)__Y, __P, - (__mmask8)__U); -} - -__funline __mmask8 _mm_cmp_epi16_mask(__m128i __X, __m128i __Y, const int __P) { - return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__X, (__v8hi)__Y, __P, - (__mmask8)-1); -} - -__funline __mmask16 _mm256_mask_cmp_epi16_mask(__mmask16 __U, __m256i __X, - __m256i __Y, const int __P) { - return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__X, (__v16hi)__Y, __P, - (__mmask16)__U); -} - -__funline __mmask16 _mm256_cmp_epi16_mask(__m256i __X, __m256i __Y, - const int __P) { - return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__X, (__v16hi)__Y, __P, - (__mmask16)-1); -} - -__funline __mmask16 _mm_mask_cmp_epi8_mask(__mmask16 __U, __m128i __X, - __m128i __Y, const int __P) { - return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__X, (__v16qi)__Y, __P, - (__mmask16)__U); -} - -__funline __mmask16 _mm_cmp_epi8_mask(__m128i __X, __m128i __Y, const int __P) { - return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__X, (__v16qi)__Y, __P, - (__mmask16)-1); -} - -__funline __mmask32 _mm256_mask_cmp_epi8_mask(__mmask32 __U, __m256i __X, - __m256i __Y, const int __P) { - return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__X, (__v32qi)__Y, __P, - (__mmask32)__U); -} - -__funline __mmask32 _mm256_cmp_epi8_mask(__m256i __X, __m256i __Y, - const int __P) { - return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__X, (__v32qi)__Y, __P, - (__mmask32)-1); -} - -__funline __mmask8 _mm_mask_cmp_epu16_mask(__mmask8 __U, __m128i __X, __m128i __Y, - const int __P) { - return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__X, (__v8hi)__Y, __P, - (__mmask8)__U); -} - -__funline __mmask8 _mm_cmp_epu16_mask(__m128i __X, __m128i __Y, const int __P) { - return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__X, (__v8hi)__Y, __P, - (__mmask8)-1); -} - -__funline __mmask16 _mm256_mask_cmp_epu16_mask(__mmask16 __U, __m256i __X, - __m256i __Y, const int __P) { - return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__X, (__v16hi)__Y, - __P, (__mmask16)__U); -} - -__funline __mmask16 _mm256_cmp_epu16_mask(__m256i __X, __m256i __Y, - const int __P) { - return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__X, (__v16hi)__Y, - __P, (__mmask16)-1); -} - -__funline __mmask16 _mm_mask_cmp_epu8_mask(__mmask16 __U, __m128i __X, - __m128i __Y, const int __P) { - return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__X, (__v16qi)__Y, - __P, (__mmask16)__U); -} - -__funline __mmask16 _mm_cmp_epu8_mask(__m128i __X, __m128i __Y, const int __P) { - return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__X, (__v16qi)__Y, - __P, (__mmask16)-1); -} - -__funline __mmask32 _mm256_mask_cmp_epu8_mask(__mmask32 __U, __m256i __X, - __m256i __Y, const int __P) { - return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__X, (__v32qi)__Y, - __P, (__mmask32)__U); -} - -__funline __mmask32 _mm256_cmp_epu8_mask(__m256i __X, __m256i __Y, - const int __P) { - return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__X, (__v32qi)__Y, - __P, (__mmask32)-1); -} - -__funline __m256i _mm256_mask_srli_epi16(__m256i __W, __mmask16 __U, __m256i __A, - const int __imm) { - return (__m256i)__builtin_ia32_psrlwi256_mask((__v16hi)__A, __imm, - (__v16hi)__W, (__mmask16)__U); -} - -__funline __m256i _mm256_maskz_srli_epi16(__mmask16 __U, __m256i __A, - const int __imm) { - return (__m256i)__builtin_ia32_psrlwi256_mask( - (__v16hi)__A, __imm, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U); -} - -__funline __m128i _mm_mask_srli_epi16(__m128i __W, __mmask8 __U, __m128i __A, - const int __imm) { - 
return (__m128i)__builtin_ia32_psrlwi128_mask((__v8hi)__A, __imm, (__v8hi)__W, - (__mmask8)__U); -} - -__funline __m128i _mm_maskz_srli_epi16(__mmask8 __U, __m128i __A, - const int __imm) { - return (__m128i)__builtin_ia32_psrlwi128_mask( - (__v8hi)__A, __imm, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m256i _mm256_mask_shufflehi_epi16(__m256i __W, __mmask16 __U, - __m256i __A, const int __imm) { - return (__m256i)__builtin_ia32_pshufhw256_mask((__v16hi)__A, __imm, - (__v16hi)__W, (__mmask16)__U); -} - -__funline __m256i _mm256_maskz_shufflehi_epi16(__mmask16 __U, __m256i __A, - const int __imm) { - return (__m256i)__builtin_ia32_pshufhw256_mask( - (__v16hi)__A, __imm, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U); -} - -__funline __m128i _mm_mask_shufflehi_epi16(__m128i __W, __mmask8 __U, __m128i __A, - const int __imm) { - return (__m128i)__builtin_ia32_pshufhw128_mask((__v8hi)__A, __imm, - (__v8hi)__W, (__mmask8)__U); -} - -__funline __m128i _mm_maskz_shufflehi_epi16(__mmask8 __U, __m128i __A, - const int __imm) { - return (__m128i)__builtin_ia32_pshufhw128_mask( - (__v8hi)__A, __imm, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m256i _mm256_mask_shufflelo_epi16(__m256i __W, __mmask16 __U, - __m256i __A, const int __imm) { - return (__m256i)__builtin_ia32_pshuflw256_mask((__v16hi)__A, __imm, - (__v16hi)__W, (__mmask16)__U); -} - -__funline __m256i _mm256_maskz_shufflelo_epi16(__mmask16 __U, __m256i __A, - const int __imm) { - return (__m256i)__builtin_ia32_pshuflw256_mask( - (__v16hi)__A, __imm, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U); -} - -__funline __m128i _mm_mask_shufflelo_epi16(__m128i __W, __mmask8 __U, __m128i __A, - const int __imm) { - return (__m128i)__builtin_ia32_pshuflw128_mask((__v8hi)__A, __imm, - (__v8hi)__W, (__mmask8)__U); -} - -__funline __m128i _mm_maskz_shufflelo_epi16(__mmask8 __U, __m128i __A, - const int __imm) { - return (__m128i)__builtin_ia32_pshuflw128_mask( - (__v8hi)__A, __imm, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m256i _mm256_mask_srai_epi16(__m256i __W, __mmask16 __U, __m256i __A, - const int __imm) { - return (__m256i)__builtin_ia32_psrawi256_mask((__v16hi)__A, __imm, - (__v16hi)__W, (__mmask16)__U); -} - -__funline __m256i _mm256_maskz_srai_epi16(__mmask16 __U, __m256i __A, - const int __imm) { - return (__m256i)__builtin_ia32_psrawi256_mask( - (__v16hi)__A, __imm, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U); -} - -__funline __m128i _mm_mask_srai_epi16(__m128i __W, __mmask8 __U, __m128i __A, - const int __imm) { - return (__m128i)__builtin_ia32_psrawi128_mask((__v8hi)__A, __imm, (__v8hi)__W, - (__mmask8)__U); -} - -__funline __m128i _mm_maskz_srai_epi16(__mmask8 __U, __m128i __A, - const int __imm) { - return (__m128i)__builtin_ia32_psrawi128_mask( - (__v8hi)__A, __imm, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m256i _mm256_mask_slli_epi16(__m256i __W, __mmask16 __U, __m256i __A, - int __B) { - return (__m256i)__builtin_ia32_psllwi256_mask((__v16hi)__A, __B, (__v16hi)__W, - (__mmask16)__U); -} - -__funline __m256i _mm256_maskz_slli_epi16(__mmask16 __U, __m256i __A, int __B) { - return (__m256i)__builtin_ia32_psllwi256_mask( - (__v16hi)__A, __B, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U); -} - -__funline __m128i _mm_mask_slli_epi16(__m128i __W, __mmask8 __U, __m128i __A, - int __B) { - return (__m128i)__builtin_ia32_psllwi128_mask((__v8hi)__A, __B, (__v8hi)__W, - (__mmask8)__U); -} - -__funline __m128i _mm_maskz_slli_epi16(__mmask8 __U, __m128i 
__A, int __B) { - return (__m128i)__builtin_ia32_psllwi128_mask( - (__v8hi)__A, __B, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); -} - -#else -#define _mm256_mask_alignr_epi8(W, U, X, Y, N) \ - ((__m256i)__builtin_ia32_palignr256_mask( \ - (__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)(N * 8), \ - (__v4di)(__m256i)(X), (__mmask32)(U))) - -#define _mm256_mask_srli_epi16(W, U, A, B) \ - ((__m256i)__builtin_ia32_psrlwi256_mask( \ - (__v16hi)(__m256i)(A), (int)(B), (__v16hi)(__m256i)(W), (__mmask16)(U))) - -#define _mm256_maskz_srli_epi16(U, A, B) \ - ((__m256i)__builtin_ia32_psrlwi256_mask((__v16hi)(__m256i)(A), (int)(B), \ - (__v16hi)_mm256_setzero_si256(), \ - (__mmask16)(U))) - -#define _mm_mask_srli_epi16(W, U, A, B) \ - ((__m128i)__builtin_ia32_psrlwi128_mask( \ - (__v8hi)(__m128i)(A), (int)(B), (__v8hi)(__m128i)(W), (__mmask8)(U))) - -#define _mm_maskz_srli_epi16(U, A, B) \ - ((__m128i)__builtin_ia32_psrlwi128_mask((__v8hi)(__m128i)(A), (int)(B), \ - (__v8hi)_mm_setzero_si128(), \ - (__mmask8)(U))) - -#define _mm256_mask_srai_epi16(W, U, A, B) \ - ((__m256i)__builtin_ia32_psrawi256_mask( \ - (__v16hi)(__m256i)(A), (int)(B), (__v16hi)(__m256i)(W), (__mmask16)(U))) - -#define _mm256_maskz_srai_epi16(U, A, B) \ - ((__m256i)__builtin_ia32_psrawi256_mask((__v16hi)(__m256i)(A), (int)(B), \ - (__v16hi)_mm256_setzero_si256(), \ - (__mmask16)(U))) - -#define _mm_mask_srai_epi16(W, U, A, B) \ - ((__m128i)__builtin_ia32_psrawi128_mask( \ - (__v8hi)(__m128i)(A), (int)(B), (__v8hi)(__m128i)(W), (__mmask8)(U))) - -#define _mm_maskz_srai_epi16(U, A, B) \ - ((__m128i)__builtin_ia32_psrawi128_mask((__v8hi)(__m128i)(A), (int)(B), \ - (__v8hi)_mm_setzero_si128(), \ - (__mmask8)(U))) - -#define _mm256_mask_shufflehi_epi16(W, U, A, B) \ - ((__m256i)__builtin_ia32_pshufhw256_mask( \ - (__v16hi)(__m256i)(A), (int)(B), (__v16hi)(__m256i)(W), (__mmask16)(U))) - -#define _mm256_maskz_shufflehi_epi16(U, A, B) \ - ((__m256i)__builtin_ia32_pshufhw256_mask( \ - (__v16hi)(__m256i)(A), (int)(B), \ - (__v16hi)(__m256i)_mm256_setzero_si256(), (__mmask16)(U))) - -#define _mm_mask_shufflehi_epi16(W, U, A, B) \ - ((__m128i)__builtin_ia32_pshufhw128_mask( \ - (__v8hi)(__m128i)(A), (int)(B), (__v8hi)(__m128i)(W), (__mmask8)(U))) - -#define _mm_maskz_shufflehi_epi16(U, A, B) \ - ((__m128i)__builtin_ia32_pshufhw128_mask( \ - (__v8hi)(__m128i)(A), (int)(B), (__v8hi)(__m128i)_mm_setzero_si128(), \ - (__mmask8)(U))) - -#define _mm256_mask_shufflelo_epi16(W, U, A, B) \ - ((__m256i)__builtin_ia32_pshuflw256_mask( \ - (__v16hi)(__m256i)(A), (int)(B), (__v16hi)(__m256i)(W), (__mmask16)(U))) - -#define _mm256_maskz_shufflelo_epi16(U, A, B) \ - ((__m256i)__builtin_ia32_pshuflw256_mask( \ - (__v16hi)(__m256i)(A), (int)(B), \ - (__v16hi)(__m256i)_mm256_setzero_si256(), (__mmask16)(U))) - -#define _mm_mask_shufflelo_epi16(W, U, A, B) \ - ((__m128i)__builtin_ia32_pshuflw128_mask( \ - (__v8hi)(__m128i)(A), (int)(B), (__v8hi)(__m128i)(W), (__mmask8)(U))) - -#define _mm_maskz_shufflelo_epi16(U, A, B) \ - ((__m128i)__builtin_ia32_pshuflw128_mask( \ - (__v8hi)(__m128i)(A), (int)(B), (__v8hi)(__m128i)_mm_setzero_si128(), \ - (__mmask8)(U))) - -#define _mm256_maskz_alignr_epi8(U, X, Y, N) \ - ((__m256i)__builtin_ia32_palignr256_mask( \ - (__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)(N * 8), \ - (__v4di)(__m256i)_mm256_setzero_si256(), (__mmask32)(U))) - -#define _mm_mask_alignr_epi8(W, U, X, Y, N) \ - ((__m128i)__builtin_ia32_palignr128_mask( \ - (__v2di)(__m128i)(X), (__v2di)(__m128i)(Y), (int)(N * 8), \ - (__v2di)(__m128i)(X), 
(__mmask16)(U))) - -#define _mm_maskz_alignr_epi8(U, X, Y, N) \ - ((__m128i)__builtin_ia32_palignr128_mask( \ - (__v2di)(__m128i)(X), (__v2di)(__m128i)(Y), (int)(N * 8), \ - (__v2di)(__m128i)_mm_setzero_si128(), (__mmask16)(U))) - -#define _mm_mask_slli_epi16(W, U, X, C) \ - ((__m128i)__builtin_ia32_psllwi128_mask( \ - (__v8hi)(__m128i)(X), (int)(C), (__v8hi)(__m128i)(W), (__mmask8)(U))) - -#define _mm_maskz_slli_epi16(U, X, C) \ - ((__m128i)__builtin_ia32_psllwi128_mask( \ - (__v8hi)(__m128i)(X), (int)(C), (__v8hi)(__m128i)_mm_setzero_si128(), \ - (__mmask8)(U))) - -#define _mm256_dbsad_epu8(X, Y, C) \ - ((__m256i)__builtin_ia32_dbpsadbw256_mask( \ - (__v32qi)(__m256i)(X), (__v32qi)(__m256i)(Y), (int)(C), \ - (__v16hi)(__m256i)_mm256_setzero_si256(), (__mmask16)-1)) - -#define _mm256_mask_slli_epi16(W, U, X, C) \ - ((__m256i)__builtin_ia32_psllwi256_mask( \ - (__v16hi)(__m256i)(X), (int)(C), (__v16hi)(__m256i)(W), (__mmask16)(U))) - -#define _mm256_maskz_slli_epi16(U, X, C) \ - ((__m256i)__builtin_ia32_psllwi256_mask( \ - (__v16hi)(__m256i)(X), (int)(C), \ - (__v16hi)(__m256i)_mm256_setzero_si256(), (__mmask16)(U))) - -#define _mm256_mask_dbsad_epu8(W, U, X, Y, C) \ - ((__m256i)__builtin_ia32_dbpsadbw256_mask( \ - (__v32qi)(__m256i)(X), (__v32qi)(__m256i)(Y), (int)(C), \ - (__v16hi)(__m256i)(W), (__mmask16)(U))) - -#define _mm256_maskz_dbsad_epu8(U, X, Y, C) \ - ((__m256i)__builtin_ia32_dbpsadbw256_mask( \ - (__v32qi)(__m256i)(X), (__v32qi)(__m256i)(Y), (int)(C), \ - (__v16hi)(__m256i)_mm256_setzero_si256(), (__mmask16)(U))) - -#define _mm_dbsad_epu8(X, Y, C) \ - ((__m128i)__builtin_ia32_dbpsadbw128_mask( \ - (__v16qi)(__m128i)(X), (__v16qi)(__m128i)(Y), (int)(C), \ - (__v8hi)(__m128i)_mm_setzero_si128(), (__mmask8)-1)) - -#define _mm_mask_dbsad_epu8(W, U, X, Y, C) \ - ((__m128i)__builtin_ia32_dbpsadbw128_mask( \ - (__v16qi)(__m128i)(X), (__v16qi)(__m128i)(Y), (int)(C), \ - (__v8hi)(__m128i)(W), (__mmask8)(U))) - -#define _mm_maskz_dbsad_epu8(U, X, Y, C) \ - ((__m128i)__builtin_ia32_dbpsadbw128_mask( \ - (__v16qi)(__m128i)(X), (__v16qi)(__m128i)(Y), (int)(C), \ - (__v8hi)(__m128i)_mm_setzero_si128(), (__mmask8)(U))) - -#define _mm_mask_blend_epi16(__U, __A, __W) \ - ((__m128i)__builtin_ia32_blendmw_128_mask((__v8hi)(__A), (__v8hi)(__W), \ - (__mmask8)(__U))) - -#define _mm_mask_blend_epi8(__U, __A, __W) \ - ((__m128i)__builtin_ia32_blendmb_128_mask((__v16qi)(__A), (__v16qi)(__W), \ - (__mmask16)(__U))) - -#define _mm256_mask_blend_epi16(__U, __A, __W) \ - ((__m256i)__builtin_ia32_blendmw_256_mask((__v16hi)(__A), (__v16hi)(__W), \ - (__mmask16)(__U))) - -#define _mm256_mask_blend_epi8(__U, __A, __W) \ - ((__m256i)__builtin_ia32_blendmb_256_mask((__v32qi)(__A), (__v32qi)(__W), \ - (__mmask32)(__U))) - -#define _mm_cmp_epi16_mask(X, Y, P) \ - ((__mmask8)__builtin_ia32_cmpw128_mask( \ - (__v8hi)(__m128i)(X), (__v8hi)(__m128i)(Y), (int)(P), (__mmask8)(-1))) - -#define _mm_cmp_epi8_mask(X, Y, P) \ - ((__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(X), \ - (__v16qi)(__m128i)(Y), (int)(P), \ - (__mmask16)(-1))) - -#define _mm256_cmp_epi16_mask(X, Y, P) \ - ((__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)(__m256i)(X), \ - (__v16hi)(__m256i)(Y), (int)(P), \ - (__mmask16)(-1))) - -#define _mm256_cmp_epi8_mask(X, Y, P) \ - ((__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)(__m256i)(X), \ - (__v32qi)(__m256i)(Y), (int)(P), \ - (__mmask32)(-1))) - -#define _mm_cmp_epu16_mask(X, Y, P) \ - ((__mmask8)__builtin_ia32_ucmpw128_mask( \ - (__v8hi)(__m128i)(X), (__v8hi)(__m128i)(Y), (int)(P), 
(__mmask8)(-1))) - -#define _mm_cmp_epu8_mask(X, Y, P) \ - ((__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)(__m128i)(X), \ - (__v16qi)(__m128i)(Y), (int)(P), \ - (__mmask16)(-1))) - -#define _mm256_cmp_epu16_mask(X, Y, P) \ - ((__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)(__m256i)(X), \ - (__v16hi)(__m256i)(Y), (int)(P), \ - (__mmask16)(-1))) - -#define _mm256_cmp_epu8_mask(X, Y, P) \ - ((__mmask32)__builtin_ia32_ucmpb256_mask( \ - (__v32qi)(__m256i)(X), (__v32qi)(__m256i)(Y), (int)(P), (__mmask32)-1)) - -#define _mm_mask_cmp_epi16_mask(M, X, Y, P) \ - ((__mmask8)__builtin_ia32_cmpw128_mask( \ - (__v8hi)(__m128i)(X), (__v8hi)(__m128i)(Y), (int)(P), (__mmask8)(M))) - -#define _mm_mask_cmp_epi8_mask(M, X, Y, P) \ - ((__mmask16)__builtin_ia32_cmpb128_mask( \ - (__v16qi)(__m128i)(X), (__v16qi)(__m128i)(Y), (int)(P), (__mmask16)(M))) - -#define _mm256_mask_cmp_epi16_mask(M, X, Y, P) \ - ((__mmask16)__builtin_ia32_cmpw256_mask( \ - (__v16hi)(__m256i)(X), (__v16hi)(__m256i)(Y), (int)(P), (__mmask16)(M))) - -#define _mm256_mask_cmp_epi8_mask(M, X, Y, P) \ - ((__mmask32)__builtin_ia32_cmpb256_mask( \ - (__v32qi)(__m256i)(X), (__v32qi)(__m256i)(Y), (int)(P), (__mmask32)(M))) - -#define _mm_mask_cmp_epu16_mask(M, X, Y, P) \ - ((__mmask8)__builtin_ia32_ucmpw128_mask( \ - (__v8hi)(__m128i)(X), (__v8hi)(__m128i)(Y), (int)(P), (__mmask8)(M))) - -#define _mm_mask_cmp_epu8_mask(M, X, Y, P) \ - ((__mmask16)__builtin_ia32_ucmpb128_mask( \ - (__v16qi)(__m128i)(X), (__v16qi)(__m128i)(Y), (int)(P), (__mmask16)(M))) - -#define _mm256_mask_cmp_epu16_mask(M, X, Y, P) \ - ((__mmask16)__builtin_ia32_ucmpw256_mask( \ - (__v16hi)(__m256i)(X), (__v16hi)(__m256i)(Y), (int)(P), (__mmask16)(M))) - -#define _mm256_mask_cmp_epu8_mask(M, X, Y, P) \ - ((__mmask32)__builtin_ia32_ucmpb256_mask( \ - (__v32qi)(__m256i)(X), (__v32qi)(__m256i)(Y), (int)(P), (__mmask32)M)) #endif - -__funline __mmask32 _mm256_cmpneq_epi8_mask(__m256i __X, __m256i __Y) { - return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__X, (__v32qi)__Y, 4, - (__mmask32)-1); -} - -__funline __mmask32 _mm256_cmplt_epi8_mask(__m256i __X, __m256i __Y) { - return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__X, (__v32qi)__Y, 1, - (__mmask32)-1); -} - -__funline __mmask32 _mm256_cmpge_epi8_mask(__m256i __X, __m256i __Y) { - return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__X, (__v32qi)__Y, 5, - (__mmask32)-1); -} - -__funline __mmask32 _mm256_cmple_epi8_mask(__m256i __X, __m256i __Y) { - return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__X, (__v32qi)__Y, 2, - (__mmask32)-1); -} - -__funline __mmask16 _mm256_cmpneq_epi16_mask(__m256i __X, __m256i __Y) { - return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__X, (__v16hi)__Y, 4, - (__mmask16)-1); -} - -__funline __mmask16 _mm256_cmplt_epi16_mask(__m256i __X, __m256i __Y) { - return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__X, (__v16hi)__Y, 1, - (__mmask16)-1); -} - -__funline __mmask16 _mm256_cmpge_epi16_mask(__m256i __X, __m256i __Y) { - return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__X, (__v16hi)__Y, 5, - (__mmask16)-1); -} - -__funline __mmask16 _mm256_cmple_epi16_mask(__m256i __X, __m256i __Y) { - return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__X, (__v16hi)__Y, 2, - (__mmask16)-1); -} - -__funline __mmask16 _mm_cmpneq_epu8_mask(__m128i __X, __m128i __Y) { - return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__X, (__v16qi)__Y, 4, - (__mmask16)-1); -} - -__funline __mmask16 _mm_cmplt_epu8_mask(__m128i __X, __m128i __Y) { - return 
(__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__X, (__v16qi)__Y, 1, - (__mmask16)-1); -} - -__funline __mmask16 _mm_cmpge_epu8_mask(__m128i __X, __m128i __Y) { - return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__X, (__v16qi)__Y, 5, - (__mmask16)-1); -} - -__funline __mmask16 _mm_cmple_epu8_mask(__m128i __X, __m128i __Y) { - return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__X, (__v16qi)__Y, 2, - (__mmask16)-1); -} - -__funline __mmask8 _mm_cmpneq_epu16_mask(__m128i __X, __m128i __Y) { - return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__X, (__v8hi)__Y, 4, - (__mmask8)-1); -} - -__funline __mmask8 _mm_cmplt_epu16_mask(__m128i __X, __m128i __Y) { - return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__X, (__v8hi)__Y, 1, - (__mmask8)-1); -} - -__funline __mmask8 _mm_cmpge_epu16_mask(__m128i __X, __m128i __Y) { - return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__X, (__v8hi)__Y, 5, - (__mmask8)-1); -} - -__funline __mmask8 _mm_cmple_epu16_mask(__m128i __X, __m128i __Y) { - return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__X, (__v8hi)__Y, 2, - (__mmask8)-1); -} - -__funline __mmask16 _mm_cmpneq_epi8_mask(__m128i __X, __m128i __Y) { - return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__X, (__v16qi)__Y, 4, - (__mmask16)-1); -} - -__funline __mmask16 _mm_cmplt_epi8_mask(__m128i __X, __m128i __Y) { - return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__X, (__v16qi)__Y, 1, - (__mmask16)-1); -} - -__funline __mmask16 _mm_cmpge_epi8_mask(__m128i __X, __m128i __Y) { - return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__X, (__v16qi)__Y, 5, - (__mmask16)-1); -} - -__funline __mmask16 _mm_cmple_epi8_mask(__m128i __X, __m128i __Y) { - return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__X, (__v16qi)__Y, 2, - (__mmask16)-1); -} - -__funline __mmask8 _mm_cmpneq_epi16_mask(__m128i __X, __m128i __Y) { - return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__X, (__v8hi)__Y, 4, - (__mmask8)-1); -} - -__funline __mmask8 _mm_cmplt_epi16_mask(__m128i __X, __m128i __Y) { - return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__X, (__v8hi)__Y, 1, - (__mmask8)-1); -} - -__funline __mmask8 _mm_cmpge_epi16_mask(__m128i __X, __m128i __Y) { - return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__X, (__v8hi)__Y, 5, - (__mmask8)-1); -} - -__funline __mmask8 _mm_cmple_epi16_mask(__m128i __X, __m128i __Y) { - return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__X, (__v8hi)__Y, 2, - (__mmask8)-1); -} - -__funline __m256i _mm256_mask_mulhrs_epi16(__m256i __W, __mmask16 __U, - __m256i __X, __m256i __Y) { - return (__m256i)__builtin_ia32_pmulhrsw256_mask((__v16hi)__X, (__v16hi)__Y, - (__v16hi)__W, (__mmask16)__U); -} - -__funline __m256i _mm256_maskz_mulhrs_epi16(__mmask16 __U, __m256i __X, - __m256i __Y) { - return (__m256i)__builtin_ia32_pmulhrsw256_mask( - (__v16hi)__X, (__v16hi)__Y, (__v16hi)_mm256_setzero_si256(), - (__mmask16)__U); -} - -__funline __m256i _mm256_mask_mulhi_epu16(__m256i __W, __mmask16 __U, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_pmulhuw256_mask((__v16hi)__A, (__v16hi)__B, - (__v16hi)__W, (__mmask16)__U); -} - -__funline __m256i _mm256_maskz_mulhi_epu16(__mmask16 __U, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_pmulhuw256_mask( - (__v16hi)__A, (__v16hi)__B, (__v16hi)_mm256_setzero_si256(), - (__mmask16)__U); -} - -__funline __m256i _mm256_mask_mulhi_epi16(__m256i __W, __mmask16 __U, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_pmulhw256_mask((__v16hi)__A, (__v16hi)__B, - (__v16hi)__W, 
(__mmask16)__U); -} - -__funline __m256i _mm256_maskz_mulhi_epi16(__mmask16 __U, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_pmulhw256_mask((__v16hi)__A, (__v16hi)__B, - (__v16hi)_mm256_setzero_si256(), - (__mmask16)__U); -} - -__funline __m128i _mm_mask_mulhi_epi16(__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_pmulhw128_mask((__v8hi)__A, (__v8hi)__B, - (__v8hi)__W, (__mmask8)__U); -} - -__funline __m128i _mm_maskz_mulhi_epi16(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_pmulhw128_mask( - (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m128i _mm_mask_mulhi_epu16(__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_pmulhuw128_mask((__v8hi)__A, (__v8hi)__B, - (__v8hi)__W, (__mmask8)__U); -} - -__funline __m128i _mm_maskz_mulhi_epu16(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_pmulhuw128_mask( - (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m128i _mm_mask_mulhrs_epi16(__m128i __W, __mmask8 __U, __m128i __X, - __m128i __Y) { - return (__m128i)__builtin_ia32_pmulhrsw128_mask((__v8hi)__X, (__v8hi)__Y, - (__v8hi)__W, (__mmask8)__U); -} - -__funline __m128i _mm_maskz_mulhrs_epi16(__mmask8 __U, __m128i __X, __m128i __Y) { - return (__m128i)__builtin_ia32_pmulhrsw128_mask( - (__v8hi)__X, (__v8hi)__Y, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m256i _mm256_mask_mullo_epi16(__m256i __W, __mmask16 __U, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_pmullw256_mask((__v16hi)__A, (__v16hi)__B, - (__v16hi)__W, (__mmask16)__U); -} - -__funline __m256i _mm256_maskz_mullo_epi16(__mmask16 __U, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_pmullw256_mask((__v16hi)__A, (__v16hi)__B, - (__v16hi)_mm256_setzero_si256(), - (__mmask16)__U); -} - -__funline __m128i _mm_mask_mullo_epi16(__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_pmullw128_mask((__v8hi)__A, (__v8hi)__B, - (__v8hi)__W, (__mmask8)__U); -} - -__funline __m128i _mm_maskz_mullo_epi16(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_pmullw128_mask( - (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m256i _mm256_mask_cvtepi8_epi16(__m256i __W, __mmask16 __U, - __m128i __A) { - return (__m256i)__builtin_ia32_pmovsxbw256_mask((__v16qi)__A, (__v16hi)__W, - (__mmask16)__U); -} - -__funline __m256i _mm256_maskz_cvtepi8_epi16(__mmask16 __U, __m128i __A) { - return (__m256i)__builtin_ia32_pmovsxbw256_mask( - (__v16qi)__A, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U); -} - -__funline __m128i _mm_mask_cvtepi8_epi16(__m128i __W, __mmask8 __U, __m128i __A) { - return (__m128i)__builtin_ia32_pmovsxbw128_mask((__v16qi)__A, (__v8hi)__W, - (__mmask8)__U); -} - -__funline __m128i _mm_maskz_cvtepi8_epi16(__mmask8 __U, __m128i __A) { - return (__m128i)__builtin_ia32_pmovsxbw128_mask( - (__v16qi)__A, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m256i _mm256_mask_cvtepu8_epi16(__m256i __W, __mmask16 __U, - __m128i __A) { - return (__m256i)__builtin_ia32_pmovzxbw256_mask((__v16qi)__A, (__v16hi)__W, - (__mmask16)__U); -} - -__funline __m256i _mm256_maskz_cvtepu8_epi16(__mmask16 __U, __m128i __A) { - return (__m256i)__builtin_ia32_pmovzxbw256_mask( - (__v16qi)__A, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U); -} - -__funline __m128i 
_mm_mask_cvtepu8_epi16(__m128i __W, __mmask8 __U, __m128i __A) { - return (__m128i)__builtin_ia32_pmovzxbw128_mask((__v16qi)__A, (__v8hi)__W, - (__mmask8)__U); -} - -__funline __m128i _mm_maskz_cvtepu8_epi16(__mmask8 __U, __m128i __A) { - return (__m128i)__builtin_ia32_pmovzxbw128_mask( - (__v16qi)__A, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m256i _mm256_mask_avg_epu8(__m256i __W, __mmask32 __U, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_pavgb256_mask((__v32qi)__A, (__v32qi)__B, - (__v32qi)__W, (__mmask32)__U); -} - -__funline __m256i _mm256_maskz_avg_epu8(__mmask32 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_pavgb256_mask((__v32qi)__A, (__v32qi)__B, - (__v32qi)_mm256_setzero_si256(), - (__mmask32)__U); -} - -__funline __m128i _mm_mask_avg_epu8(__m128i __W, __mmask16 __U, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_pavgb128_mask((__v16qi)__A, (__v16qi)__B, - (__v16qi)__W, (__mmask16)__U); -} - -__funline __m128i _mm_maskz_avg_epu8(__mmask16 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_pavgb128_mask( - (__v16qi)__A, (__v16qi)__B, (__v16qi)_mm_setzero_si128(), (__mmask16)__U); -} - -__funline __m256i _mm256_mask_avg_epu16(__m256i __W, __mmask16 __U, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_pavgw256_mask((__v16hi)__A, (__v16hi)__B, - (__v16hi)__W, (__mmask16)__U); -} - -__funline __m256i _mm256_maskz_avg_epu16(__mmask16 __U, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_pavgw256_mask((__v16hi)__A, (__v16hi)__B, - (__v16hi)_mm256_setzero_si256(), - (__mmask16)__U); -} - -__funline __m128i _mm_mask_avg_epu16(__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_pavgw128_mask((__v8hi)__A, (__v8hi)__B, - (__v8hi)__W, (__mmask8)__U); -} - -__funline __m128i _mm_maskz_avg_epu16(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_pavgw128_mask( - (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m256i _mm256_mask_add_epi8(__m256i __W, __mmask32 __U, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_paddb256_mask((__v32qi)__A, (__v32qi)__B, - (__v32qi)__W, (__mmask32)__U); -} - -__funline __m256i _mm256_maskz_add_epi8(__mmask32 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_paddb256_mask((__v32qi)__A, (__v32qi)__B, - (__v32qi)_mm256_setzero_si256(), - (__mmask32)__U); -} - -__funline __m256i _mm256_mask_add_epi16(__m256i __W, __mmask16 __U, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_paddw256_mask((__v16hi)__A, (__v16hi)__B, - (__v16hi)__W, (__mmask16)__U); -} - -__funline __m256i _mm256_maskz_add_epi16(__mmask16 __U, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_paddw256_mask((__v16hi)__A, (__v16hi)__B, - (__v16hi)_mm256_setzero_si256(), - (__mmask16)__U); -} - -__funline __m256i _mm256_mask_adds_epi8(__m256i __W, __mmask32 __U, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_paddsb256_mask((__v32qi)__A, (__v32qi)__B, - (__v32qi)__W, (__mmask32)__U); -} - -__funline __m256i _mm256_maskz_adds_epi8(__mmask32 __U, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_paddsb256_mask((__v32qi)__A, (__v32qi)__B, - (__v32qi)_mm256_setzero_si256(), - (__mmask32)__U); -} - -__funline __m256i _mm256_mask_adds_epi16(__m256i __W, __mmask16 __U, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_paddsw256_mask((__v16hi)__A, (__v16hi)__B, - (__v16hi)__W, (__mmask16)__U); -} 
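/* Illustrative usage sketch (editorial, not part of the patch): the masked
 * arithmetic intrinsics being moved around here come in two flavors. With
 * _mask_ (merge-masking), lanes whose mask bit is 1 receive the operation's
 * result and the rest keep the value from __W; with _maskz_ (zero-masking),
 * the unselected lanes become zero. A minimal sketch, assuming a target
 * compiled with -mavx512bw -mavx512vl; main/printf are only for illustration.
 */
#include <immintrin.h>
#include <stdio.h>

int main(void) {
  __m256i a = _mm256_set1_epi16(10);
  __m256i b = _mm256_set1_epi16(5);
  __m256i w = _mm256_set1_epi16(-1);
  __mmask16 u = 0x00FF;                    /* select the low 8 of 16 word lanes */
  __m256i merged = _mm256_mask_add_epi16(w, u, a, b);  /* lane: 15 if selected, else -1 */
  __m256i zeroed = _mm256_maskz_add_epi16(u, a, b);    /* lane: 15 if selected, else 0 */
  short m[16], z[16];
  _mm256_storeu_si256((__m256i *)m, merged);
  _mm256_storeu_si256((__m256i *)z, zeroed);
  printf("merged: %d %d  zeroed: %d %d\n", m[0], m[15], z[0], z[15]);
  return 0;
}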
- -__funline __m256i _mm256_maskz_adds_epi16(__mmask16 __U, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_paddsw256_mask((__v16hi)__A, (__v16hi)__B, - (__v16hi)_mm256_setzero_si256(), - (__mmask16)__U); -} - -__funline __m256i _mm256_mask_adds_epu8(__m256i __W, __mmask32 __U, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_paddusb256_mask((__v32qi)__A, (__v32qi)__B, - (__v32qi)__W, (__mmask32)__U); -} - -__funline __m256i _mm256_maskz_adds_epu8(__mmask32 __U, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_paddusb256_mask( - (__v32qi)__A, (__v32qi)__B, (__v32qi)_mm256_setzero_si256(), - (__mmask32)__U); -} - -__funline __m256i _mm256_mask_adds_epu16(__m256i __W, __mmask16 __U, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_paddusw256_mask((__v16hi)__A, (__v16hi)__B, - (__v16hi)__W, (__mmask16)__U); -} - -__funline __m256i _mm256_maskz_adds_epu16(__mmask16 __U, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_paddusw256_mask( - (__v16hi)__A, (__v16hi)__B, (__v16hi)_mm256_setzero_si256(), - (__mmask16)__U); -} - -__funline __m256i _mm256_mask_sub_epi8(__m256i __W, __mmask32 __U, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_psubb256_mask((__v32qi)__A, (__v32qi)__B, - (__v32qi)__W, (__mmask32)__U); -} - -__funline __m256i _mm256_maskz_sub_epi8(__mmask32 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_psubb256_mask((__v32qi)__A, (__v32qi)__B, - (__v32qi)_mm256_setzero_si256(), - (__mmask32)__U); -} - -__funline __m256i _mm256_mask_sub_epi16(__m256i __W, __mmask16 __U, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_psubw256_mask((__v16hi)__A, (__v16hi)__B, - (__v16hi)__W, (__mmask16)__U); -} - -__funline __m256i _mm256_maskz_sub_epi16(__mmask16 __U, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_psubw256_mask((__v16hi)__A, (__v16hi)__B, - (__v16hi)_mm256_setzero_si256(), - (__mmask16)__U); -} - -__funline __m256i _mm256_mask_subs_epi8(__m256i __W, __mmask32 __U, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_psubsb256_mask((__v32qi)__A, (__v32qi)__B, - (__v32qi)__W, (__mmask32)__U); -} - -__funline __m256i _mm256_maskz_subs_epi8(__mmask32 __U, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_psubsb256_mask((__v32qi)__A, (__v32qi)__B, - (__v32qi)_mm256_setzero_si256(), - (__mmask32)__U); -} - -__funline __m256i _mm256_mask_subs_epi16(__m256i __W, __mmask16 __U, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_psubsw256_mask((__v16hi)__A, (__v16hi)__B, - (__v16hi)__W, (__mmask16)__U); -} - -__funline __m256i _mm256_maskz_subs_epi16(__mmask16 __U, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_psubsw256_mask((__v16hi)__A, (__v16hi)__B, - (__v16hi)_mm256_setzero_si256(), - (__mmask16)__U); -} - -__funline __m256i _mm256_mask_subs_epu8(__m256i __W, __mmask32 __U, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_psubusb256_mask((__v32qi)__A, (__v32qi)__B, - (__v32qi)__W, (__mmask32)__U); -} - -__funline __m256i _mm256_maskz_subs_epu8(__mmask32 __U, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_psubusb256_mask( - (__v32qi)__A, (__v32qi)__B, (__v32qi)_mm256_setzero_si256(), - (__mmask32)__U); -} - -__funline __m256i _mm256_mask_subs_epu16(__m256i __W, __mmask16 __U, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_psubusw256_mask((__v16hi)__A, (__v16hi)__B, - (__v16hi)__W, (__mmask16)__U); -} - -__funline __m256i _mm256_maskz_subs_epu16(__mmask16 __U, 
__m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_psubusw256_mask( - (__v16hi)__A, (__v16hi)__B, (__v16hi)_mm256_setzero_si256(), - (__mmask16)__U); -} - -__funline __m128i _mm_mask_add_epi8(__m128i __W, __mmask16 __U, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_paddb128_mask((__v16qi)__A, (__v16qi)__B, - (__v16qi)__W, (__mmask16)__U); -} - -__funline __m128i _mm_maskz_add_epi8(__mmask16 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_paddb128_mask( - (__v16qi)__A, (__v16qi)__B, (__v16qi)_mm_setzero_si128(), (__mmask16)__U); -} - -__funline __m128i _mm_mask_add_epi16(__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_paddw128_mask((__v8hi)__A, (__v8hi)__B, - (__v8hi)__W, (__mmask8)__U); -} - -__funline __m128i _mm_maskz_add_epi16(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_paddw128_mask( - (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m256i _mm256_mask_unpackhi_epi8(__m256i __W, __mmask32 __U, - __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_punpckhbw256_mask( - (__v32qi)__A, (__v32qi)__B, (__v32qi)__W, (__mmask32)__U); -} - -__funline __m256i _mm256_maskz_unpackhi_epi8(__mmask32 __U, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_punpckhbw256_mask( - (__v32qi)__A, (__v32qi)__B, (__v32qi)_mm256_setzero_si256(), - (__mmask32)__U); -} - -__funline __m128i _mm_mask_unpackhi_epi8(__m128i __W, __mmask16 __U, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_punpckhbw128_mask( - (__v16qi)__A, (__v16qi)__B, (__v16qi)__W, (__mmask16)__U); -} - -__funline __m128i _mm_maskz_unpackhi_epi8(__mmask16 __U, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_punpckhbw128_mask( - (__v16qi)__A, (__v16qi)__B, (__v16qi)_mm_setzero_si128(), (__mmask16)__U); -} - -__funline __m256i _mm256_mask_unpackhi_epi16(__m256i __W, __mmask16 __U, - __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_punpckhwd256_mask( - (__v16hi)__A, (__v16hi)__B, (__v16hi)__W, (__mmask16)__U); -} - -__funline __m256i _mm256_maskz_unpackhi_epi16(__mmask16 __U, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_punpckhwd256_mask( - (__v16hi)__A, (__v16hi)__B, (__v16hi)_mm256_setzero_si256(), - (__mmask16)__U); -} - -__funline __m128i _mm_mask_unpackhi_epi16(__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_punpckhwd128_mask((__v8hi)__A, (__v8hi)__B, - (__v8hi)__W, (__mmask8)__U); -} - -__funline __m128i _mm_maskz_unpackhi_epi16(__mmask8 __U, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_punpckhwd128_mask( - (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m256i _mm256_mask_unpacklo_epi8(__m256i __W, __mmask32 __U, - __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_punpcklbw256_mask( - (__v32qi)__A, (__v32qi)__B, (__v32qi)__W, (__mmask32)__U); -} - -__funline __m256i _mm256_maskz_unpacklo_epi8(__mmask32 __U, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_punpcklbw256_mask( - (__v32qi)__A, (__v32qi)__B, (__v32qi)_mm256_setzero_si256(), - (__mmask32)__U); -} - -__funline __m128i _mm_mask_unpacklo_epi8(__m128i __W, __mmask16 __U, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_punpcklbw128_mask( - (__v16qi)__A, (__v16qi)__B, (__v16qi)__W, (__mmask16)__U); -} - -__funline __m128i _mm_maskz_unpacklo_epi8(__mmask16 __U, __m128i __A, - __m128i __B) { - return 
(__m128i)__builtin_ia32_punpcklbw128_mask( - (__v16qi)__A, (__v16qi)__B, (__v16qi)_mm_setzero_si128(), (__mmask16)__U); -} - -__funline __m256i _mm256_mask_unpacklo_epi16(__m256i __W, __mmask16 __U, - __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_punpcklwd256_mask( - (__v16hi)__A, (__v16hi)__B, (__v16hi)__W, (__mmask16)__U); -} - -__funline __m256i _mm256_maskz_unpacklo_epi16(__mmask16 __U, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_punpcklwd256_mask( - (__v16hi)__A, (__v16hi)__B, (__v16hi)_mm256_setzero_si256(), - (__mmask16)__U); -} - -__funline __m128i _mm_mask_unpacklo_epi16(__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_punpcklwd128_mask((__v8hi)__A, (__v8hi)__B, - (__v8hi)__W, (__mmask8)__U); -} - -__funline __m128i _mm_maskz_unpacklo_epi16(__mmask8 __U, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_punpcklwd128_mask( - (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __mmask16 _mm_cmpeq_epi8_mask(__m128i __A, __m128i __B) { - return (__mmask16)__builtin_ia32_pcmpeqb128_mask((__v16qi)__A, (__v16qi)__B, - (__mmask16)-1); -} - -__funline __mmask16 _mm_cmpeq_epu8_mask(__m128i __A, __m128i __B) { - return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__A, (__v16qi)__B, 0, - (__mmask16)-1); -} - -__funline __mmask16 _mm_mask_cmpeq_epu8_mask(__mmask16 __U, __m128i __A, - __m128i __B) { - return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__A, (__v16qi)__B, 0, - __U); -} - -__funline __mmask16 _mm_mask_cmpeq_epi8_mask(__mmask16 __U, __m128i __A, - __m128i __B) { - return (__mmask16)__builtin_ia32_pcmpeqb128_mask((__v16qi)__A, (__v16qi)__B, - __U); -} - -__funline __mmask32 _mm256_cmpeq_epu8_mask(__m256i __A, __m256i __B) { - return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__A, (__v32qi)__B, 0, - (__mmask32)-1); -} - -__funline __mmask32 _mm256_cmpeq_epi8_mask(__m256i __A, __m256i __B) { - return (__mmask32)__builtin_ia32_pcmpeqb256_mask((__v32qi)__A, (__v32qi)__B, - (__mmask32)-1); -} - -__funline __mmask32 _mm256_mask_cmpeq_epu8_mask(__mmask32 __U, __m256i __A, - __m256i __B) { - return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__A, (__v32qi)__B, 0, - __U); -} - -__funline __mmask32 _mm256_mask_cmpeq_epi8_mask(__mmask32 __U, __m256i __A, - __m256i __B) { - return (__mmask32)__builtin_ia32_pcmpeqb256_mask((__v32qi)__A, (__v32qi)__B, - __U); -} - -__funline __mmask8 _mm_cmpeq_epu16_mask(__m128i __A, __m128i __B) { - return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__A, (__v8hi)__B, 0, - (__mmask8)-1); -} - -__funline __mmask8 _mm_cmpeq_epi16_mask(__m128i __A, __m128i __B) { - return (__mmask8)__builtin_ia32_pcmpeqw128_mask((__v8hi)__A, (__v8hi)__B, - (__mmask8)-1); -} - -__funline __mmask8 _mm_mask_cmpeq_epu16_mask(__mmask8 __U, __m128i __A, - __m128i __B) { - return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__A, (__v8hi)__B, 0, - __U); -} - -__funline __mmask8 _mm_mask_cmpeq_epi16_mask(__mmask8 __U, __m128i __A, - __m128i __B) { - return (__mmask8)__builtin_ia32_pcmpeqw128_mask((__v8hi)__A, (__v8hi)__B, - __U); -} - -__funline __mmask16 _mm256_cmpeq_epu16_mask(__m256i __A, __m256i __B) { - return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__A, (__v16hi)__B, 0, - (__mmask16)-1); -} - -__funline __mmask16 _mm256_cmpeq_epi16_mask(__m256i __A, __m256i __B) { - return (__mmask16)__builtin_ia32_pcmpeqw256_mask((__v16hi)__A, (__v16hi)__B, - (__mmask16)-1); -} - -__funline __mmask16 _mm256_mask_cmpeq_epu16_mask(__mmask16 __U, 
__m256i __A, - __m256i __B) { - return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__A, (__v16hi)__B, 0, - __U); -} - -__funline __mmask16 _mm256_mask_cmpeq_epi16_mask(__mmask16 __U, __m256i __A, - __m256i __B) { - return (__mmask16)__builtin_ia32_pcmpeqw256_mask((__v16hi)__A, (__v16hi)__B, - __U); -} - -__funline __mmask16 _mm_cmpgt_epu8_mask(__m128i __A, __m128i __B) { - return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__A, (__v16qi)__B, 6, - (__mmask16)-1); -} - -__funline __mmask16 _mm_cmpgt_epi8_mask(__m128i __A, __m128i __B) { - return (__mmask16)__builtin_ia32_pcmpgtb128_mask((__v16qi)__A, (__v16qi)__B, - (__mmask16)-1); -} - -__funline __mmask16 _mm_mask_cmpgt_epu8_mask(__mmask16 __U, __m128i __A, - __m128i __B) { - return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__A, (__v16qi)__B, 6, - __U); -} - -__funline __mmask16 _mm_mask_cmpgt_epi8_mask(__mmask16 __U, __m128i __A, - __m128i __B) { - return (__mmask16)__builtin_ia32_pcmpgtb128_mask((__v16qi)__A, (__v16qi)__B, - __U); -} - -__funline __mmask32 _mm256_cmpgt_epu8_mask(__m256i __A, __m256i __B) { - return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__A, (__v32qi)__B, 6, - (__mmask32)-1); -} - -__funline __mmask32 _mm256_cmpgt_epi8_mask(__m256i __A, __m256i __B) { - return (__mmask32)__builtin_ia32_pcmpgtb256_mask((__v32qi)__A, (__v32qi)__B, - (__mmask32)-1); -} - -__funline __mmask32 _mm256_mask_cmpgt_epu8_mask(__mmask32 __U, __m256i __A, - __m256i __B) { - return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__A, (__v32qi)__B, 6, - __U); -} - -__funline __mmask32 _mm256_mask_cmpgt_epi8_mask(__mmask32 __U, __m256i __A, - __m256i __B) { - return (__mmask32)__builtin_ia32_pcmpgtb256_mask((__v32qi)__A, (__v32qi)__B, - __U); -} - -__funline __mmask8 _mm_cmpgt_epu16_mask(__m128i __A, __m128i __B) { - return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__A, (__v8hi)__B, 6, - (__mmask8)-1); -} - -__funline __mmask8 _mm_cmpgt_epi16_mask(__m128i __A, __m128i __B) { - return (__mmask8)__builtin_ia32_pcmpgtw128_mask((__v8hi)__A, (__v8hi)__B, - (__mmask8)-1); -} - -__funline __mmask8 _mm_mask_cmpgt_epu16_mask(__mmask8 __U, __m128i __A, - __m128i __B) { - return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__A, (__v8hi)__B, 6, - __U); -} - -__funline __mmask8 _mm_mask_cmpgt_epi16_mask(__mmask8 __U, __m128i __A, - __m128i __B) { - return (__mmask8)__builtin_ia32_pcmpgtw128_mask((__v8hi)__A, (__v8hi)__B, - __U); -} - -__funline __mmask16 _mm256_cmpgt_epu16_mask(__m256i __A, __m256i __B) { - return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__A, (__v16hi)__B, 6, - (__mmask16)-1); -} - -__funline __mmask16 _mm256_cmpgt_epi16_mask(__m256i __A, __m256i __B) { - return (__mmask16)__builtin_ia32_pcmpgtw256_mask((__v16hi)__A, (__v16hi)__B, - (__mmask16)-1); -} - -__funline __mmask16 _mm256_mask_cmpgt_epu16_mask(__mmask16 __U, __m256i __A, - __m256i __B) { - return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__A, (__v16hi)__B, 6, - __U); -} - -__funline __mmask16 _mm256_mask_cmpgt_epi16_mask(__mmask16 __U, __m256i __A, - __m256i __B) { - return (__mmask16)__builtin_ia32_pcmpgtw256_mask((__v16hi)__A, (__v16hi)__B, - __U); -} - -__funline __mmask16 _mm_testn_epi8_mask(__m128i __A, __m128i __B) { - return (__mmask16)__builtin_ia32_ptestnmb128((__v16qi)__A, (__v16qi)__B, - (__mmask16)-1); -} - -__funline __mmask16 _mm_mask_testn_epi8_mask(__mmask16 __U, __m128i __A, - __m128i __B) { - return (__mmask16)__builtin_ia32_ptestnmb128((__v16qi)__A, (__v16qi)__B, __U); -} - -__funline __mmask32 
_mm256_testn_epi8_mask(__m256i __A, __m256i __B) { - return (__mmask32)__builtin_ia32_ptestnmb256((__v32qi)__A, (__v32qi)__B, - (__mmask32)-1); -} - -__funline __mmask32 _mm256_mask_testn_epi8_mask(__mmask32 __U, __m256i __A, - __m256i __B) { - return (__mmask32)__builtin_ia32_ptestnmb256((__v32qi)__A, (__v32qi)__B, __U); -} - -__funline __mmask8 _mm_testn_epi16_mask(__m128i __A, __m128i __B) { - return (__mmask8)__builtin_ia32_ptestnmw128((__v8hi)__A, (__v8hi)__B, - (__mmask8)-1); -} - -__funline __mmask8 _mm_mask_testn_epi16_mask(__mmask8 __U, __m128i __A, - __m128i __B) { - return (__mmask8)__builtin_ia32_ptestnmw128((__v8hi)__A, (__v8hi)__B, __U); -} - -__funline __mmask16 _mm256_testn_epi16_mask(__m256i __A, __m256i __B) { - return (__mmask16)__builtin_ia32_ptestnmw256((__v16hi)__A, (__v16hi)__B, - (__mmask16)-1); -} - -__funline __mmask16 _mm256_mask_testn_epi16_mask(__mmask16 __U, __m256i __A, - __m256i __B) { - return (__mmask16)__builtin_ia32_ptestnmw256((__v16hi)__A, (__v16hi)__B, __U); -} - -__funline __m256i _mm256_mask_shuffle_epi8(__m256i __W, __mmask32 __U, - __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_pshufb256_mask((__v32qi)__A, (__v32qi)__B, - (__v32qi)__W, (__mmask32)__U); -} - -__funline __m256i _mm256_maskz_shuffle_epi8(__mmask32 __U, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_pshufb256_mask((__v32qi)__A, (__v32qi)__B, - (__v32qi)_mm256_setzero_si256(), - (__mmask32)__U); -} - -__funline __m128i _mm_mask_shuffle_epi8(__m128i __W, __mmask16 __U, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_pshufb128_mask((__v16qi)__A, (__v16qi)__B, - (__v16qi)__W, (__mmask16)__U); -} - -__funline __m128i _mm_maskz_shuffle_epi8(__mmask16 __U, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_pshufb128_mask( - (__v16qi)__A, (__v16qi)__B, (__v16qi)_mm_setzero_si128(), (__mmask16)__U); -} - -__funline __m256i _mm256_maskz_packs_epi16(__mmask32 __M, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_packsswb256_mask( - (__v16hi)__A, (__v16hi)__B, (__v32qi)_mm256_setzero_si256(), __M); -} - -__funline __m256i _mm256_mask_packs_epi16(__m256i __W, __mmask32 __M, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_packsswb256_mask((__v16hi)__A, (__v16hi)__B, - (__v32qi)__W, __M); -} - -__funline __m128i _mm_maskz_packs_epi16(__mmask16 __M, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_packsswb128_mask( - (__v8hi)__A, (__v8hi)__B, (__v16qi)_mm_setzero_si128(), __M); -} - -__funline __m128i _mm_mask_packs_epi16(__m128i __W, __mmask16 __M, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_packsswb128_mask((__v8hi)__A, (__v8hi)__B, - (__v16qi)__W, __M); -} - -__funline __m256i _mm256_maskz_packus_epi16(__mmask32 __M, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_packuswb256_mask( - (__v16hi)__A, (__v16hi)__B, (__v32qi)_mm256_setzero_si256(), __M); -} - -__funline __m256i _mm256_mask_packus_epi16(__m256i __W, __mmask32 __M, - __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_packuswb256_mask((__v16hi)__A, (__v16hi)__B, - (__v32qi)__W, __M); -} - -__funline __m128i _mm_maskz_packus_epi16(__mmask16 __M, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_packuswb128_mask( - (__v8hi)__A, (__v8hi)__B, (__v16qi)_mm_setzero_si128(), __M); -} - -__funline __m128i _mm_mask_packus_epi16(__m128i __W, __mmask16 __M, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_packuswb128_mask((__v8hi)__A, (__v8hi)__B, - (__v16qi)__W, __M); -} - 
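/* A second illustrative sketch (editorial, not part of the patch): unlike the
 * SSE/AVX2 compares, the AVX-512VL/BW cmp intrinsics above return a plain
 * integer bitmask (__mmask8/__mmask16/__mmask32) rather than a vector of
 * all-ones/all-zero lanes, so the result can be inspected with ordinary
 * integer code or fed straight into a masked operation. Assumes a target
 * compiled with -mavx512bw -mavx512vl.
 */
#include <immintrin.h>
#include <stdio.h>

int main(void) {
  __m128i x = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
  __m128i y = _mm_set1_epi16(4);
  __mmask8 gt = _mm_cmpgt_epi16_mask(x, y);   /* lanes 4..7 exceed 4 -> 0xF0 */
  __m128i kept = _mm_maskz_mov_epi16(gt, x);  /* keep greater lanes, zero the rest */
  short out[8];
  _mm_storeu_si128((__m128i *)out, kept);
  printf("mask=0x%02X first=%d last=%d\n", (unsigned)gt, out[0], out[7]);
  return 0;
}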
-__funline __m256i _mm256_mask_abs_epi8(__m256i __W, __mmask32 __U, __m256i __A) { - return (__m256i)__builtin_ia32_pabsb256_mask((__v32qi)__A, (__v32qi)__W, - (__mmask32)__U); -} - -__funline __m256i _mm256_maskz_abs_epi8(__mmask32 __U, __m256i __A) { - return (__m256i)__builtin_ia32_pabsb256_mask( - (__v32qi)__A, (__v32qi)_mm256_setzero_si256(), (__mmask32)__U); -} - -__funline __m128i _mm_mask_abs_epi8(__m128i __W, __mmask16 __U, __m128i __A) { - return (__m128i)__builtin_ia32_pabsb128_mask((__v16qi)__A, (__v16qi)__W, - (__mmask16)__U); -} - -__funline __m128i _mm_maskz_abs_epi8(__mmask16 __U, __m128i __A) { - return (__m128i)__builtin_ia32_pabsb128_mask( - (__v16qi)__A, (__v16qi)_mm_setzero_si128(), (__mmask16)__U); -} - -__funline __m256i _mm256_mask_abs_epi16(__m256i __W, __mmask16 __U, __m256i __A) { - return (__m256i)__builtin_ia32_pabsw256_mask((__v16hi)__A, (__v16hi)__W, - (__mmask16)__U); -} - -__funline __m256i _mm256_maskz_abs_epi16(__mmask16 __U, __m256i __A) { - return (__m256i)__builtin_ia32_pabsw256_mask( - (__v16hi)__A, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U); -} - -__funline __m128i _mm_mask_abs_epi16(__m128i __W, __mmask8 __U, __m128i __A) { - return (__m128i)__builtin_ia32_pabsw128_mask((__v8hi)__A, (__v8hi)__W, - (__mmask8)__U); -} - -__funline __m128i _mm_maskz_abs_epi16(__mmask8 __U, __m128i __A) { - return (__m128i)__builtin_ia32_pabsw128_mask( - (__v8hi)__A, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __mmask32 _mm256_cmpneq_epu8_mask(__m256i __X, __m256i __Y) { - return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__X, (__v32qi)__Y, 4, - (__mmask32)-1); -} - -__funline __mmask32 _mm256_cmplt_epu8_mask(__m256i __X, __m256i __Y) { - return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__X, (__v32qi)__Y, 1, - (__mmask32)-1); -} - -__funline __mmask32 _mm256_cmpge_epu8_mask(__m256i __X, __m256i __Y) { - return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__X, (__v32qi)__Y, 5, - (__mmask32)-1); -} - -__funline __mmask32 _mm256_cmple_epu8_mask(__m256i __X, __m256i __Y) { - return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__X, (__v32qi)__Y, 2, - (__mmask32)-1); -} - -__funline __mmask16 _mm256_cmpneq_epu16_mask(__m256i __X, __m256i __Y) { - return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__X, (__v16hi)__Y, 4, - (__mmask16)-1); -} - -__funline __mmask16 _mm256_cmplt_epu16_mask(__m256i __X, __m256i __Y) { - return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__X, (__v16hi)__Y, 1, - (__mmask16)-1); -} - -__funline __mmask16 _mm256_cmpge_epu16_mask(__m256i __X, __m256i __Y) { - return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__X, (__v16hi)__Y, 5, - (__mmask16)-1); -} - -__funline __mmask16 _mm256_cmple_epu16_mask(__m256i __X, __m256i __Y) { - return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__X, (__v16hi)__Y, 2, - (__mmask16)-1); -} - -__funline void _mm256_mask_storeu_epi16(void *__P, __mmask16 __U, __m256i __A) { - __builtin_ia32_storedquhi256_mask((short *)__P, (__v16hi)__A, (__mmask16)__U); -} - -__funline void _mm_mask_storeu_epi16(void *__P, __mmask8 __U, __m128i __A) { - __builtin_ia32_storedquhi128_mask((short *)__P, (__v8hi)__A, (__mmask8)__U); -} - -__funline __m128i _mm_mask_adds_epi16(__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_paddsw128_mask((__v8hi)__A, (__v8hi)__B, - (__v8hi)__W, (__mmask8)__U); -} - -__funline __m128i _mm_mask_subs_epi8(__m128i __W, __mmask16 __U, __m128i __A, - __m128i __B) { - return 
(__m128i)__builtin_ia32_psubsb128_mask((__v16qi)__A, (__v16qi)__B, - (__v16qi)__W, (__mmask16)__U); -} - -__funline __m128i _mm_maskz_subs_epi8(__mmask16 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_psubsb128_mask( - (__v16qi)__A, (__v16qi)__B, (__v16qi)_mm_setzero_si128(), (__mmask16)__U); -} - -__funline __m128i _mm_mask_subs_epi16(__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_psubsw128_mask((__v8hi)__A, (__v8hi)__B, - (__v8hi)__W, (__mmask8)__U); -} - -__funline __m128i _mm_maskz_subs_epi16(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_psubsw128_mask( - (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m128i _mm_mask_subs_epu8(__m128i __W, __mmask16 __U, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_psubusb128_mask((__v16qi)__A, (__v16qi)__B, - (__v16qi)__W, (__mmask16)__U); -} - -__funline __m128i _mm_maskz_subs_epu8(__mmask16 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_psubusb128_mask( - (__v16qi)__A, (__v16qi)__B, (__v16qi)_mm_setzero_si128(), (__mmask16)__U); -} - -__funline __m128i _mm_mask_subs_epu16(__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_psubusw128_mask((__v8hi)__A, (__v8hi)__B, - (__v8hi)__W, (__mmask8)__U); -} - -__funline __m128i _mm_maskz_subs_epu16(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_psubusw128_mask( - (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m256i _mm256_mask_srl_epi16(__m256i __W, __mmask16 __U, __m256i __A, - __m128i __B) { - return (__m256i)__builtin_ia32_psrlw256_mask((__v16hi)__A, (__v8hi)__B, - (__v16hi)__W, (__mmask16)__U); -} - -__funline __m256i _mm256_maskz_srl_epi16(__mmask16 __U, __m256i __A, - __m128i __B) { - return (__m256i)__builtin_ia32_psrlw256_mask((__v16hi)__A, (__v8hi)__B, - (__v16hi)_mm256_setzero_si256(), - (__mmask16)__U); -} - -__funline __m128i _mm_mask_srl_epi16(__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_psrlw128_mask((__v8hi)__A, (__v8hi)__B, - (__v8hi)__W, (__mmask8)__U); -} - -__funline __m128i _mm_maskz_srl_epi16(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_psrlw128_mask( - (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m256i _mm256_mask_sra_epi16(__m256i __W, __mmask16 __U, __m256i __A, - __m128i __B) { - return (__m256i)__builtin_ia32_psraw256_mask((__v16hi)__A, (__v8hi)__B, - (__v16hi)__W, (__mmask16)__U); -} - -__funline __m256i _mm256_maskz_sra_epi16(__mmask16 __U, __m256i __A, - __m128i __B) { - return (__m256i)__builtin_ia32_psraw256_mask((__v16hi)__A, (__v8hi)__B, - (__v16hi)_mm256_setzero_si256(), - (__mmask16)__U); -} - -__funline __m128i _mm_mask_sra_epi16(__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_psraw128_mask((__v8hi)__A, (__v8hi)__B, - (__v8hi)__W, (__mmask8)__U); -} - -__funline __m128i _mm_maskz_sra_epi16(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_psraw128_mask( - (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m128i _mm_maskz_adds_epi16(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_paddsw128_mask( - (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m128i _mm_mask_adds_epu8(__m128i __W, __mmask16 __U, __m128i __A, - 
__m128i __B) { - return (__m128i)__builtin_ia32_paddusb128_mask((__v16qi)__A, (__v16qi)__B, - (__v16qi)__W, (__mmask16)__U); -} - -__funline __m128i _mm_maskz_adds_epu8(__mmask16 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_paddusb128_mask( - (__v16qi)__A, (__v16qi)__B, (__v16qi)_mm_setzero_si128(), (__mmask16)__U); -} - -__funline __m128i _mm_mask_adds_epu16(__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_paddusw128_mask((__v8hi)__A, (__v8hi)__B, - (__v8hi)__W, (__mmask8)__U); -} - -__funline __m128i _mm_maskz_adds_epu16(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_paddusw128_mask( - (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m128i _mm_mask_sub_epi8(__m128i __W, __mmask16 __U, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_psubb128_mask((__v16qi)__A, (__v16qi)__B, - (__v16qi)__W, (__mmask16)__U); -} - -__funline __m128i _mm_maskz_sub_epi8(__mmask16 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_psubb128_mask( - (__v16qi)__A, (__v16qi)__B, (__v16qi)_mm_setzero_si128(), (__mmask16)__U); -} - -__funline __m128i _mm_mask_sub_epi16(__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_psubw128_mask((__v8hi)__A, (__v8hi)__B, - (__v8hi)__W, (__mmask8)__U); -} - -__funline __m128i _mm_maskz_sub_epi16(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_psubw128_mask( - (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m128i _mm_mask_adds_epi8(__m128i __W, __mmask16 __U, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_paddsb128_mask((__v16qi)__A, (__v16qi)__B, - (__v16qi)__W, (__mmask16)__U); -} - -__funline __m128i _mm_maskz_adds_epi8(__mmask16 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_paddsb128_mask( - (__v16qi)__A, (__v16qi)__B, (__v16qi)_mm_setzero_si128(), (__mmask16)__U); -} - -__funline __m128i _mm_cvtepi16_epi8(__m128i __A) { - - return (__m128i)__builtin_ia32_pmovwb128_mask( - (__v8hi)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1); -} - -__funline void _mm_mask_cvtepi16_storeu_epi8(void *__P, __mmask8 __M, - __m128i __A) { - __builtin_ia32_pmovwb128mem_mask((__v8qi *)__P, (__v8hi)__A, __M); -} - -__funline __m128i _mm_mask_cvtepi16_epi8(__m128i __O, __mmask8 __M, __m128i __A) { - return (__m128i)__builtin_ia32_pmovwb128_mask((__v8hi)__A, (__v16qi)__O, __M); -} - -__funline __m128i _mm_maskz_cvtepi16_epi8(__mmask8 __M, __m128i __A) { - return (__m128i)__builtin_ia32_pmovwb128_mask( - (__v8hi)__A, (__v16qi)_mm_setzero_si128(), __M); -} - -__funline __m256i _mm256_srav_epi16(__m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_psrav16hi_mask((__v16hi)__A, (__v16hi)__B, - (__v16hi)_mm256_setzero_si256(), - (__mmask16)-1); -} - -__funline __m256i _mm256_mask_srav_epi16(__m256i __W, __mmask16 __U, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_psrav16hi_mask((__v16hi)__A, (__v16hi)__B, - (__v16hi)__W, (__mmask16)__U); -} - -__funline __m256i _mm256_maskz_srav_epi16(__mmask16 __U, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_psrav16hi_mask((__v16hi)__A, (__v16hi)__B, - (__v16hi)_mm256_setzero_si256(), - (__mmask16)__U); -} - -__funline __m128i _mm_srav_epi16(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_psrav8hi_mask( - (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)-1); -} - -__funline __m128i 
_mm_mask_srav_epi16(__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_psrav8hi_mask((__v8hi)__A, (__v8hi)__B, - (__v8hi)__W, (__mmask8)__U); -} - -__funline __m128i _mm_maskz_srav_epi16(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_psrav8hi_mask( - (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m256i _mm256_srlv_epi16(__m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_psrlv16hi_mask((__v16hi)__A, (__v16hi)__B, - (__v16hi)_mm256_setzero_si256(), - (__mmask16)-1); -} - -__funline __m256i _mm256_mask_srlv_epi16(__m256i __W, __mmask16 __U, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_psrlv16hi_mask((__v16hi)__A, (__v16hi)__B, - (__v16hi)__W, (__mmask16)__U); -} - -__funline __m256i _mm256_maskz_srlv_epi16(__mmask16 __U, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_psrlv16hi_mask((__v16hi)__A, (__v16hi)__B, - (__v16hi)_mm256_setzero_si256(), - (__mmask16)__U); -} - -__funline __m128i _mm_srlv_epi16(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_psrlv8hi_mask( - (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)-1); -} - -__funline __m128i _mm_mask_srlv_epi16(__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_psrlv8hi_mask((__v8hi)__A, (__v8hi)__B, - (__v8hi)__W, (__mmask8)__U); -} - -__funline __m128i _mm_maskz_srlv_epi16(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_psrlv8hi_mask( - (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m256i _mm256_sllv_epi16(__m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_psllv16hi_mask((__v16hi)__A, (__v16hi)__B, - (__v16hi)_mm256_setzero_si256(), - (__mmask16)-1); -} - -__funline __m256i _mm256_mask_sllv_epi16(__m256i __W, __mmask16 __U, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_psllv16hi_mask((__v16hi)__A, (__v16hi)__B, - (__v16hi)__W, (__mmask16)__U); -} - -__funline __m256i _mm256_maskz_sllv_epi16(__mmask16 __U, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_psllv16hi_mask((__v16hi)__A, (__v16hi)__B, - (__v16hi)_mm256_setzero_si256(), - (__mmask16)__U); -} - -__funline __m128i _mm_sllv_epi16(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_psllv8hi_mask( - (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)-1); -} - -__funline __m128i _mm_mask_sllv_epi16(__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_psllv8hi_mask((__v8hi)__A, (__v8hi)__B, - (__v8hi)__W, (__mmask8)__U); -} - -__funline __m128i _mm_maskz_sllv_epi16(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_psllv8hi_mask( - (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m128i _mm_mask_sll_epi16(__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_psllw128_mask((__v8hi)__A, (__v8hi)__B, - (__v8hi)__W, (__mmask8)__U); -} - -__funline __m128i _mm_maskz_sll_epi16(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_psllw128_mask( - (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m256i _mm256_mask_sll_epi16(__m256i __W, __mmask16 __U, __m256i __A, - __m128i __B) { - return (__m256i)__builtin_ia32_psllw256_mask((__v16hi)__A, (__v8hi)__B, - (__v16hi)__W, (__mmask16)__U); -} - -__funline __m256i _mm256_maskz_sll_epi16(__mmask16 __U, __m256i 
__A, - __m128i __B) { - return (__m256i)__builtin_ia32_psllw256_mask((__v16hi)__A, (__v8hi)__B, - (__v16hi)_mm256_setzero_si256(), - (__mmask16)__U); -} - -__funline __m256i _mm256_maskz_packus_epi32(__mmask16 __M, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_packusdw256_mask( - (__v8si)__A, (__v8si)__B, (__v16hi)_mm256_setzero_si256(), __M); -} - -__funline __m256i _mm256_mask_packus_epi32(__m256i __W, __mmask16 __M, - __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_packusdw256_mask((__v8si)__A, (__v8si)__B, - (__v16hi)__W, __M); -} - -__funline __m128i _mm_maskz_packus_epi32(__mmask8 __M, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_packusdw128_mask( - (__v4si)__A, (__v4si)__B, (__v8hi)_mm_setzero_si128(), __M); -} - -__funline __m128i _mm_mask_packus_epi32(__m128i __W, __mmask8 __M, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_packusdw128_mask((__v4si)__A, (__v4si)__B, - (__v8hi)__W, __M); -} - -__funline __m256i _mm256_maskz_packs_epi32(__mmask16 __M, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_packssdw256_mask( - (__v8si)__A, (__v8si)__B, (__v16hi)_mm256_setzero_si256(), __M); -} - -__funline __m256i _mm256_mask_packs_epi32(__m256i __W, __mmask16 __M, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_packssdw256_mask((__v8si)__A, (__v8si)__B, - (__v16hi)__W, __M); -} - -__funline __m128i _mm_maskz_packs_epi32(__mmask8 __M, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_packssdw128_mask( - (__v4si)__A, (__v4si)__B, (__v8hi)_mm_setzero_si128(), __M); -} - -__funline __m128i _mm_mask_packs_epi32(__m128i __W, __mmask8 __M, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_packssdw128_mask((__v4si)__A, (__v4si)__B, - (__v8hi)__W, __M); -} - -__funline __mmask16 _mm_mask_cmpneq_epu8_mask(__mmask16 __M, __m128i __X, - __m128i __Y) { - return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__X, (__v16qi)__Y, 4, - (__mmask16)__M); -} - -__funline __mmask16 _mm_mask_cmplt_epu8_mask(__mmask16 __M, __m128i __X, - __m128i __Y) { - return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__X, (__v16qi)__Y, 1, - (__mmask16)__M); -} - -__funline __mmask16 _mm_mask_cmpge_epu8_mask(__mmask16 __M, __m128i __X, - __m128i __Y) { - return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__X, (__v16qi)__Y, 5, - (__mmask16)__M); -} - -__funline __mmask16 _mm_mask_cmple_epu8_mask(__mmask16 __M, __m128i __X, - __m128i __Y) { - return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__X, (__v16qi)__Y, 2, - (__mmask16)__M); -} - -__funline __mmask8 _mm_mask_cmpneq_epu16_mask(__mmask8 __M, __m128i __X, - __m128i __Y) { - return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__X, (__v8hi)__Y, 4, - (__mmask8)__M); -} - -__funline __mmask8 _mm_mask_cmplt_epu16_mask(__mmask8 __M, __m128i __X, - __m128i __Y) { - return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__X, (__v8hi)__Y, 1, - (__mmask8)__M); -} - -__funline __mmask8 _mm_mask_cmpge_epu16_mask(__mmask8 __M, __m128i __X, - __m128i __Y) { - return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__X, (__v8hi)__Y, 5, - (__mmask8)__M); -} - -__funline __mmask8 _mm_mask_cmple_epu16_mask(__mmask8 __M, __m128i __X, - __m128i __Y) { - return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__X, (__v8hi)__Y, 2, - (__mmask8)__M); -} - -__funline __mmask16 _mm_mask_cmpneq_epi8_mask(__mmask16 __M, __m128i __X, - __m128i __Y) { - return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__X, (__v16qi)__Y, 4, - (__mmask16)__M); -} - -__funline __mmask16 
_mm_mask_cmplt_epi8_mask(__mmask16 __M, __m128i __X, - __m128i __Y) { - return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__X, (__v16qi)__Y, 1, - (__mmask16)__M); -} - -__funline __mmask16 _mm_mask_cmpge_epi8_mask(__mmask16 __M, __m128i __X, - __m128i __Y) { - return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__X, (__v16qi)__Y, 5, - (__mmask16)__M); -} - -__funline __mmask16 _mm_mask_cmple_epi8_mask(__mmask16 __M, __m128i __X, - __m128i __Y) { - return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__X, (__v16qi)__Y, 2, - (__mmask16)__M); -} - -__funline __mmask8 _mm_mask_cmpneq_epi16_mask(__mmask8 __M, __m128i __X, - __m128i __Y) { - return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__X, (__v8hi)__Y, 4, - (__mmask8)__M); -} - -__funline __mmask8 _mm_mask_cmplt_epi16_mask(__mmask8 __M, __m128i __X, - __m128i __Y) { - return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__X, (__v8hi)__Y, 1, - (__mmask8)__M); -} - -__funline __mmask8 _mm_mask_cmpge_epi16_mask(__mmask8 __M, __m128i __X, - __m128i __Y) { - return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__X, (__v8hi)__Y, 5, - (__mmask8)__M); -} - -__funline __mmask8 _mm_mask_cmple_epi16_mask(__mmask8 __M, __m128i __X, - __m128i __Y) { - return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__X, (__v8hi)__Y, 2, - (__mmask8)__M); -} - -__funline __mmask32 _mm256_mask_cmpneq_epu8_mask(__mmask32 __M, __m256i __X, - __m256i __Y) { - return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__X, (__v32qi)__Y, 4, - (__mmask32)__M); -} - -__funline __mmask32 _mm256_mask_cmplt_epu8_mask(__mmask32 __M, __m256i __X, - __m256i __Y) { - return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__X, (__v32qi)__Y, 1, - (__mmask32)__M); -} - -__funline __mmask32 _mm256_mask_cmpge_epu8_mask(__mmask32 __M, __m256i __X, - __m256i __Y) { - return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__X, (__v32qi)__Y, 5, - (__mmask32)__M); -} - -__funline __mmask32 _mm256_mask_cmple_epu8_mask(__mmask32 __M, __m256i __X, - __m256i __Y) { - return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__X, (__v32qi)__Y, 2, - (__mmask32)__M); -} - -__funline __mmask16 _mm256_mask_cmpneq_epu16_mask(__mmask16 __M, __m256i __X, - __m256i __Y) { - return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__X, (__v16hi)__Y, 4, - (__mmask16)__M); -} - -__funline __mmask16 _mm256_mask_cmplt_epu16_mask(__mmask16 __M, __m256i __X, - __m256i __Y) { - return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__X, (__v16hi)__Y, 1, - (__mmask16)__M); -} - -__funline __mmask16 _mm256_mask_cmpge_epu16_mask(__mmask16 __M, __m256i __X, - __m256i __Y) { - return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__X, (__v16hi)__Y, 5, - (__mmask16)__M); -} - -__funline __mmask16 _mm256_mask_cmple_epu16_mask(__mmask16 __M, __m256i __X, - __m256i __Y) { - return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__X, (__v16hi)__Y, 2, - (__mmask16)__M); -} - -__funline __mmask32 _mm256_mask_cmpneq_epi8_mask(__mmask32 __M, __m256i __X, - __m256i __Y) { - return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__X, (__v32qi)__Y, 4, - (__mmask32)__M); -} - -__funline __mmask32 _mm256_mask_cmplt_epi8_mask(__mmask32 __M, __m256i __X, - __m256i __Y) { - return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__X, (__v32qi)__Y, 1, - (__mmask32)__M); -} - -__funline __mmask32 _mm256_mask_cmpge_epi8_mask(__mmask32 __M, __m256i __X, - __m256i __Y) { - return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__X, (__v32qi)__Y, 5, - (__mmask32)__M); -} - -__funline __mmask32 
_mm256_mask_cmple_epi8_mask(__mmask32 __M, __m256i __X, - __m256i __Y) { - return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__X, (__v32qi)__Y, 2, - (__mmask32)__M); -} - -__funline __mmask16 _mm256_mask_cmpneq_epi16_mask(__mmask16 __M, __m256i __X, - __m256i __Y) { - return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__X, (__v16hi)__Y, 4, - (__mmask16)__M); -} - -__funline __mmask16 _mm256_mask_cmplt_epi16_mask(__mmask16 __M, __m256i __X, - __m256i __Y) { - return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__X, (__v16hi)__Y, 1, - (__mmask16)__M); -} - -__funline __mmask16 _mm256_mask_cmpge_epi16_mask(__mmask16 __M, __m256i __X, - __m256i __Y) { - return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__X, (__v16hi)__Y, 5, - (__mmask16)__M); -} - -__funline __mmask16 _mm256_mask_cmple_epi16_mask(__mmask16 __M, __m256i __X, - __m256i __Y) { - return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__X, (__v16hi)__Y, 2, - (__mmask16)__M); -} - +typedef short __v16hi_u __attribute__ ((__vector_size__ (32), __may_alias__, __aligned__ (1))); +typedef short __v8hi_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1))); +typedef char __v32qi_u __attribute__ ((__vector_size__ (32), __may_alias__, __aligned__ (1))); +typedef char __v16qi_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1))); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_mov_epi8 (__m256i __W, __mmask32 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_movdquqi256_mask ((__v32qi) __A, + (__v32qi) __W, + (__mmask32) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_mov_epi8 (__mmask32 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_movdquqi256_mask ((__v32qi) __A, + (__v32qi) + _mm256_setzero_si256 (), + (__mmask32) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_mov_epi8 (__m128i __W, __mmask16 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_movdquqi128_mask ((__v16qi) __A, + (__v16qi) __W, + (__mmask16) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_mov_epi8 (__mmask16 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_movdquqi128_mask ((__v16qi) __A, + (__v16qi) + _mm_setzero_si128 (), + (__mmask16) __U); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_storeu_epi8 (void *__P, __m256i __A) +{ + *(__v32qi_u *) __P = (__v32qi_u) __A; +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_storeu_epi8 (void *__P, __mmask32 __U, __m256i __A) +{ + __builtin_ia32_storedquqi256_mask ((char *) __P, + (__v32qi) __A, + (__mmask32) __U); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storeu_epi8 (void *__P, __m128i __A) +{ + *(__v16qi_u *) __P = (__v16qi_u) __A; +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_storeu_epi8 (void *__P, __mmask16 __U, __m128i __A) +{ + __builtin_ia32_storedquqi128_mask ((char *) __P, + (__v16qi) __A, + (__mmask16) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_loadu_epi16 (void const *__P) +{ + return (__m256i) (*(__v16hi_u *) __P); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, 
+typedef short __v16hi_u __attribute__ ((__vector_size__ (32), __may_alias__, __aligned__ (1)));
+typedef short __v8hi_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
+typedef char __v32qi_u __attribute__ ((__vector_size__ (32), __may_alias__, __aligned__ (1)));
+typedef char __v16qi_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_mov_epi8 (__m256i __W, __mmask32 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_movdquqi256_mask ((__v32qi) __A, (__v32qi) __W, (__mmask32) __U);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_mov_epi8 (__mmask32 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_movdquqi256_mask ((__v32qi) __A, (__v32qi) _mm256_setzero_si256 (), (__mmask32) __U);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_mov_epi8 (__m128i __W, __mmask16 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_movdquqi128_mask ((__v16qi) __A, (__v16qi) __W, (__mmask16) __U);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_mov_epi8 (__mmask16 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_movdquqi128_mask ((__v16qi) __A, (__v16qi) _mm_setzero_si128 (), (__mmask16) __U);
+}
+extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_storeu_epi8 (void *__P, __m256i __A)
+{
+  *(__v32qi_u *) __P = (__v32qi_u) __A;
+}
+extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_storeu_epi8 (void *__P, __mmask32 __U, __m256i __A)
+{
+  __builtin_ia32_storedquqi256_mask ((char *) __P, (__v32qi) __A, (__mmask32) __U);
+}
+extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storeu_epi8 (void *__P, __m128i __A)
+{
+  *(__v16qi_u *) __P = (__v16qi_u) __A;
+}
+extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_storeu_epi8 (void *__P, __mmask16 __U, __m128i __A)
+{
+  __builtin_ia32_storedquqi128_mask ((char *) __P, (__v16qi) __A, (__mmask16) __U);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_loadu_epi16 (void const *__P)
+{
+  return (__m256i) (*(__v16hi_u *) __P);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_loadu_epi16 (__m256i __W, __mmask16 __U, void const *__P)
+{
+  return (__m256i) __builtin_ia32_loaddquhi256_mask ((const short *) __P, (__v16hi) __W, (__mmask16) __U);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_loadu_epi16 (__mmask16 __U, void const *__P)
+{
+  return (__m256i) __builtin_ia32_loaddquhi256_mask ((const short *) __P, (__v16hi) _mm256_setzero_si256 (), (__mmask16) __U);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadu_epi16 (void const *__P)
+{
+  return (__m128i) (*(__v8hi_u *) __P);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_loadu_epi16 (__m128i __W, __mmask8 __U, void const *__P)
+{
+  return (__m128i) __builtin_ia32_loaddquhi128_mask ((const short *) __P, (__v8hi) __W, (__mmask8) __U);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_loadu_epi16 (__mmask8 __U, void const *__P)
+{
+  return (__m128i) __builtin_ia32_loaddquhi128_mask ((const short *) __P, (__v8hi) _mm_setzero_si128 (), (__mmask8) __U);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_mov_epi16 (__m256i __W, __mmask16 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_movdquhi256_mask ((__v16hi) __A, (__v16hi) __W, (__mmask16) __U);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_mov_epi16 (__mmask16 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_movdquhi256_mask ((__v16hi) __A, (__v16hi) _mm256_setzero_si256 (), (__mmask16) __U);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_mov_epi16 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_movdquhi128_mask ((__v8hi) __A, (__v8hi) __W, (__mmask8) __U);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_mov_epi16 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_movdquhi128_mask ((__v8hi) __A, (__v8hi) _mm_setzero_si128 (), (__mmask8) __U);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_loadu_epi8 (void const *__P)
+{
+  return (__m256i) (*(__v32qi_u *) __P);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_loadu_epi8 (__m256i __W, __mmask32 __U, void const *__P)
+{
+  return (__m256i) __builtin_ia32_loaddquqi256_mask ((const char *) __P, (__v32qi) __W, (__mmask32) __U);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_loadu_epi8 (__mmask32 __U, void const *__P)
+{
+  return (__m256i) __builtin_ia32_loaddquqi256_mask ((const char *) __P, (__v32qi) _mm256_setzero_si256 (), (__mmask32) __U);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadu_epi8 (void const *__P)
+{
+  return (__m128i) (*(__v16qi_u *) __P);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_loadu_epi8 (__m128i __W, __mmask16 __U, void const *__P)
+{
+  return (__m128i) __builtin_ia32_loaddquqi128_mask ((const char *) __P, (__v16qi) __W, (__mmask16) __U);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_loadu_epi8 (__mmask16 __U, void const *__P)
+{
+  return (__m128i) __builtin_ia32_loaddquqi128_mask ((const char *) __P, (__v16qi) _mm_setzero_si128 (), (__mmask16) __U);
+}
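+/* Narrow 16-bit lanes to 8-bit: cvtepi16 truncates, cvtsepi16 saturates signed, cvtusepi16 saturates unsigned.  */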
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepi16_epi8 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovwb256_mask ((__v16hi) __A, (__v16qi) _mm_undefined_si128 (), (__mmask16) -1);
+}
+extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepi16_storeu_epi8 (void *__P, __mmask16 __M, __m256i __A)
+{
+  __builtin_ia32_pmovwb256mem_mask ((__v16qi *) __P, (__v16hi) __A, __M);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepi16_epi8 (__m128i __O, __mmask16 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovwb256_mask ((__v16hi) __A, (__v16qi) __O, __M);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepi16_epi8 (__mmask16 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovwb256_mask ((__v16hi) __A, (__v16qi) _mm_setzero_si128 (), __M);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsepi16_epi8 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovswb128_mask ((__v8hi) __A, (__v16qi) _mm_undefined_si128 (), (__mmask8) -1);
+}
+extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtsepi16_storeu_epi8 (void *__P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovswb128mem_mask ((unsigned long long *) __P, (__v8hi) __A, __M);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtsepi16_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovswb128_mask ((__v8hi) __A, (__v16qi) __O, __M);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtsepi16_epi8 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovswb128_mask ((__v8hi) __A, (__v16qi) _mm_setzero_si128 (), __M);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtsepi16_epi8 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovswb256_mask ((__v16hi) __A, (__v16qi) _mm_undefined_si128 (), (__mmask16) -1);
+}
+extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtsepi16_storeu_epi8 (void *__P, __mmask16 __M, __m256i __A)
+{
+  __builtin_ia32_pmovswb256mem_mask ((__v16qi *) __P, (__v16hi) __A, __M);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtsepi16_epi8 (__m128i __O, __mmask16 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovswb256_mask ((__v16hi) __A, (__v16qi) __O, __M);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtsepi16_epi8 (__mmask16 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovswb256_mask ((__v16hi) __A, (__v16qi) _mm_setzero_si128 (), __M);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtusepi16_epi8 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovuswb128_mask ((__v8hi) __A, (__v16qi) _mm_undefined_si128 (), (__mmask8) -1);
+}
+extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtusepi16_storeu_epi8 (void *__P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovuswb128mem_mask ((unsigned long long *) __P, (__v8hi) __A, __M);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtusepi16_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovuswb128_mask ((__v8hi) __A, (__v16qi) __O, __M);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtusepi16_epi8 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovuswb128_mask ((__v8hi) __A, (__v16qi) _mm_setzero_si128 (), __M);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtusepi16_epi8 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovuswb256_mask ((__v16hi) __A, (__v16qi) _mm_undefined_si128 (), (__mmask16) -1);
+}
+extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtusepi16_storeu_epi8 (void *__P, __mmask16 __M, __m256i __A)
+{
+  __builtin_ia32_pmovuswb256mem_mask ((__v16qi *) __P, (__v16hi) __A, __M);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtusepi16_epi8 (__m128i __O, __mmask16 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovuswb256_mask ((__v16hi) __A, (__v16qi) __O, __M);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtusepi16_epi8 (__mmask16 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovuswb256_mask ((__v16hi) __A, (__v16qi) _mm_setzero_si128 (), __M);
+}
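+/* Broadcast one byte/word element, or a scalar from a GPR, across the destination under a write mask.  */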
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_broadcastb_epi8 (__m256i __O, __mmask32 __M, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_pbroadcastb256_mask ((__v16qi) __A, (__v32qi) __O, __M);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_broadcastb_epi8 (__mmask32 __M, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_pbroadcastb256_mask ((__v16qi) __A, (__v32qi) _mm256_setzero_si256 (), __M);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_set1_epi8 (__m256i __O, __mmask32 __M, char __A)
+{
+  return (__m256i) __builtin_ia32_pbroadcastb256_gpr_mask (__A, (__v32qi) __O, __M);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_set1_epi8 (__mmask32 __M, char __A)
+{
+  return (__m256i) __builtin_ia32_pbroadcastb256_gpr_mask (__A, (__v32qi) _mm256_setzero_si256 (), __M);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_broadcastb_epi8 (__m128i __O, __mmask16 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pbroadcastb128_mask ((__v16qi) __A, (__v16qi) __O, __M);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_broadcastb_epi8 (__mmask16 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pbroadcastb128_mask ((__v16qi) __A, (__v16qi) _mm_setzero_si128 (), __M);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_set1_epi8 (__m128i __O, __mmask16 __M, char __A)
+{
+  return (__m128i) __builtin_ia32_pbroadcastb128_gpr_mask (__A, (__v16qi) __O, __M);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_set1_epi8 (__mmask16 __M, char __A)
+{
+  return (__m128i) __builtin_ia32_pbroadcastb128_gpr_mask (__A, (__v16qi) _mm_setzero_si128 (), __M);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_broadcastw_epi16 (__m256i __O, __mmask16 __M, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_pbroadcastw256_mask ((__v8hi) __A, (__v16hi) __O, __M);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_broadcastw_epi16 (__mmask16 __M, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_pbroadcastw256_mask ((__v8hi) __A, (__v16hi) _mm256_setzero_si256 (), __M);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_set1_epi16 (__m256i __O, __mmask16 __M, short __A)
+{
+  return (__m256i) __builtin_ia32_pbroadcastw256_gpr_mask (__A, (__v16hi) __O, __M);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_set1_epi16 (__mmask16 __M, short __A)
+{
+  return (__m256i) __builtin_ia32_pbroadcastw256_gpr_mask (__A, (__v16hi) _mm256_setzero_si256 (), __M);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_broadcastw_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pbroadcastw128_mask ((__v8hi) __A, (__v8hi) __O, __M);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_broadcastw_epi16 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pbroadcastw128_mask ((__v8hi) __A, (__v8hi) _mm_setzero_si128 (), __M);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_set1_epi16 (__m128i __O, __mmask8 __M, short __A)
+{
+  return (__m128i) __builtin_ia32_pbroadcastw128_gpr_mask (__A, (__v8hi) __O, __M);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_set1_epi16 (__mmask8 __M, short __A)
+{
+  return (__m128i) __builtin_ia32_pbroadcastw128_gpr_mask (__A, (__v8hi) _mm_setzero_si128 (), __M);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permutexvar_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_permvarhi256_mask ((__v16hi) __B, (__v16hi) __A, (__v16hi) _mm256_setzero_si256 (), (__mmask16) -1);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_permutexvar_epi16 (__mmask16 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_permvarhi256_mask ((__v16hi) __B, (__v16hi) __A, (__v16hi) _mm256_setzero_si256 (), (__mmask16) __M);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_permutexvar_epi16 (__m256i __W, __mmask16 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_permvarhi256_mask ((__v16hi) __B, (__v16hi) __A, (__v16hi) __W, (__mmask16) __M);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_permutexvar_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_permvarhi128_mask ((__v8hi) __B, (__v8hi) __A, (__v8hi) _mm_setzero_si128 (), (__mmask8) -1);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_permutexvar_epi16 (__mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_permvarhi128_mask ((__v8hi) __B, (__v8hi) __A, (__v8hi) _mm_setzero_si128 (), (__mmask8) __M);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_permutexvar_epi16 (__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_permvarhi128_mask ((__v8hi) __B, (__v8hi) __A, (__v8hi) __W, (__mmask8) __M);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permutex2var_epi16 (__m256i __A, __m256i __I, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpermt2varhi256_mask ((__v16hi) __I, (__v16hi) __A, (__v16hi) __B, (__mmask16) -1);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_permutex2var_epi16 (__m256i __A, __mmask16 __U, __m256i __I, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpermt2varhi256_mask ((__v16hi) __I, (__v16hi) __A, (__v16hi) __B, (__mmask16) __U);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask2_permutex2var_epi16 (__m256i __A, __m256i __I, __mmask16 __U, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpermi2varhi256_mask ((__v16hi) __A, (__v16hi) __I, (__v16hi) __B, (__mmask16) __U);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_permutex2var_epi16 (__mmask16 __U, __m256i __A, __m256i __I, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpermt2varhi256_maskz ((__v16hi) __I, (__v16hi) __A, (__v16hi) __B, (__mmask16) __U);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_permutex2var_epi16 (__m128i __A, __m128i __I, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpermt2varhi128_mask ((__v8hi) __I, (__v8hi) __A, (__v8hi) __B, (__mmask8) -1);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_permutex2var_epi16 (__m128i __A, __mmask8 __U, __m128i __I, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpermt2varhi128_mask ((__v8hi) __I, (__v8hi) __A, (__v8hi) __B, (__mmask8) __U);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask2_permutex2var_epi16 (__m128i __A, __m128i __I, __mmask8 __U, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpermi2varhi128_mask ((__v8hi) __A, (__v8hi) __I, (__v8hi) __B, (__mmask8) __U);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_permutex2var_epi16 (__mmask8 __U, __m128i __A, __m128i __I, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpermt2varhi128_maskz ((__v8hi) __I, (__v8hi) __A, (__v8hi) __B, (__mmask8) __U);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_maddubs_epi16 (__m256i __W, __mmask16 __U, __m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_pmaddubsw256_mask ((__v32qi) __X, (__v32qi) __Y, (__v16hi) __W, (__mmask16) __U);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_maddubs_epi16 (__mmask16 __U, __m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_pmaddubsw256_mask ((__v32qi) __X, (__v32qi) __Y, (__v16hi) _mm256_setzero_si256 (), (__mmask16) __U);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_maddubs_epi16 (__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_pmaddubsw128_mask ((__v16qi) __X, (__v16qi) __Y, (__v8hi) __W, (__mmask8) __U);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_maddubs_epi16 (__mmask8 __U, __m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_pmaddubsw128_mask ((__v16qi) __X, (__v16qi) __Y, (__v8hi) _mm_setzero_si128 (), (__mmask8) __U);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_madd_epi16 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmaddwd256_mask ((__v16hi) __A, (__v16hi) __B, (__v8si) __W, (__mmask8) __U);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_madd_epi16 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmaddwd256_mask ((__v16hi) __A, (__v16hi) __B, (__v8si) _mm256_setzero_si256 (), (__mmask8) __U);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_madd_epi16 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmaddwd128_mask ((__v8hi) __A, (__v8hi) __B, (__v4si) __W, (__mmask8) __U);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_madd_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmaddwd128_mask ((__v8hi) __A, (__v8hi) __B, (__v4si) _mm_setzero_si128 (), (__mmask8) __U);
+}
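+/* movepi: element sign bits -> mask register; movm: mask -> all-ones/all-zero elements; ptestm: mask of lanes where (A & B) != 0.  */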
+extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movepi8_mask (__m128i __A)
+{
+  return (__mmask16) __builtin_ia32_cvtb2mask128 ((__v16qi) __A);
+}
+extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_movepi8_mask (__m256i __A)
+{
+  return (__mmask32) __builtin_ia32_cvtb2mask256 ((__v32qi) __A);
+}
+extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movepi16_mask (__m128i __A)
+{
+  return (__mmask8) __builtin_ia32_cvtw2mask128 ((__v8hi) __A);
+}
+extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_movepi16_mask (__m256i __A)
+{
+  return (__mmask16) __builtin_ia32_cvtw2mask256 ((__v16hi) __A);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movm_epi8 (__mmask16 __A)
+{
+  return (__m128i) __builtin_ia32_cvtmask2b128 (__A);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_movm_epi8 (__mmask32 __A)
+{
+  return (__m256i) __builtin_ia32_cvtmask2b256 (__A);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movm_epi16 (__mmask8 __A)
+{
+  return (__m128i) __builtin_ia32_cvtmask2w128 (__A);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_movm_epi16 (__mmask16 __A)
+{
+  return (__m256i) __builtin_ia32_cvtmask2w256 (__A);
+}
+extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_test_epi8_mask (__m128i __A, __m128i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestmb128 ((__v16qi) __A, (__v16qi) __B, (__mmask16) -1);
+}
+extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_test_epi8_mask (__mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestmb128 ((__v16qi) __A, (__v16qi) __B, __U);
+}
+extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_test_epi8_mask (__m256i __A, __m256i __B)
+{
+  return (__mmask32) __builtin_ia32_ptestmb256 ((__v32qi) __A, (__v32qi) __B, (__mmask32) -1);
+}
+extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_test_epi8_mask (__mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__mmask32) __builtin_ia32_ptestmb256 ((__v32qi) __A, (__v32qi) __B, __U);
+}
+extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_test_epi16_mask (__m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestmw128 ((__v8hi) __A, (__v8hi) __B, (__mmask8) -1);
+}
+extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_test_epi16_mask (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestmw128 ((__v8hi) __A, (__v8hi) __B, __U);
+}
+extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_test_epi16_mask (__m256i __A, __m256i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestmw256 ((__v16hi) __A, (__v16hi) __B, (__mmask16) -1);
+}
+extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_test_epi16_mask (__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestmw256 ((__v16hi) __A, (__v16hi) __B, __U);
+}
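+/* Masked element-wise min/max over signed and unsigned 8/16-bit lanes.  */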
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_min_epu16 (__mmask16 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pminuw256_mask ((__v16hi) __A, (__v16hi) __B, (__v16hi) _mm256_setzero_si256 (), (__mmask16) __M);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_min_epu16 (__m256i __W, __mmask16 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pminuw256_mask ((__v16hi) __A, (__v16hi) __B, (__v16hi) __W, (__mmask16) __M);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_min_epu16 (__mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pminuw128_mask ((__v8hi) __A, (__v8hi) __B, (__v8hi) _mm_setzero_si128 (), (__mmask8) __M);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_min_epu16 (__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pminuw128_mask ((__v8hi) __A, (__v8hi) __B, (__v8hi) __W, (__mmask8) __M);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_min_epi16 (__mmask16 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pminsw256_mask ((__v16hi) __A, (__v16hi) __B, (__v16hi) _mm256_setzero_si256 (), (__mmask16) __M);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_min_epi16 (__m256i __W, __mmask16 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pminsw256_mask ((__v16hi) __A, (__v16hi) __B, (__v16hi) __W, (__mmask16) __M);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_max_epu8 (__mmask32 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmaxub256_mask ((__v32qi) __A, (__v32qi) __B, (__v32qi) _mm256_setzero_si256 (), (__mmask32) __M);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_max_epu8 (__m256i __W, __mmask32 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmaxub256_mask ((__v32qi) __A, (__v32qi) __B, (__v32qi) __W, (__mmask32) __M);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_max_epu8 (__mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmaxub128_mask ((__v16qi) __A, (__v16qi) __B, (__v16qi) _mm_setzero_si128 (), (__mmask16) __M);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_max_epu8 (__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmaxub128_mask ((__v16qi) __A, (__v16qi) __B, (__v16qi) __W, (__mmask16) __M);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_max_epi8 (__mmask32 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmaxsb256_mask ((__v32qi) __A, (__v32qi) __B, (__v32qi) _mm256_setzero_si256 (), (__mmask32) __M);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_max_epi8 (__m256i __W, __mmask32 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmaxsb256_mask ((__v32qi) __A, (__v32qi) __B, (__v32qi) __W, (__mmask32) __M);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_max_epi8 (__mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmaxsb128_mask ((__v16qi) __A, (__v16qi) __B, (__v16qi) _mm_setzero_si128 (), (__mmask16) __M);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_max_epi8 (__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmaxsb128_mask ((__v16qi) __A, (__v16qi) __B, (__v16qi) __W, (__mmask16) __M);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_min_epu8 (__mmask32 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pminub256_mask ((__v32qi) __A, (__v32qi) __B, (__v32qi) _mm256_setzero_si256 (), (__mmask32) __M);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_min_epu8 (__m256i __W, __mmask32 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pminub256_mask ((__v32qi) __A, (__v32qi) __B, (__v32qi) __W, (__mmask32) __M);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_min_epu8 (__mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pminub128_mask ((__v16qi) __A, (__v16qi) __B, (__v16qi) _mm_setzero_si128 (), (__mmask16) __M);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_min_epu8 (__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pminub128_mask ((__v16qi) __A, (__v16qi) __B, (__v16qi) __W, (__mmask16) __M);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_min_epi8 (__mmask32 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pminsb256_mask ((__v32qi) __A, (__v32qi) __B, (__v32qi) _mm256_setzero_si256 (), (__mmask32) __M);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_min_epi8 (__m256i __W, __mmask32 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pminsb256_mask ((__v32qi) __A, (__v32qi) __B, (__v32qi) __W, (__mmask32) __M);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_min_epi8 (__mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pminsb128_mask ((__v16qi) __A, (__v16qi) __B, (__v16qi) _mm_setzero_si128 (), (__mmask16) __M);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_min_epi8 (__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pminsb128_mask ((__v16qi) __A, (__v16qi) __B, (__v16qi) __W, (__mmask16) __M);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_max_epi16 (__mmask16 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmaxsw256_mask ((__v16hi) __A, (__v16hi) __B, (__v16hi) _mm256_setzero_si256 (), (__mmask16) __M);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_max_epi16 (__m256i __W, __mmask16 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmaxsw256_mask ((__v16hi) __A, (__v16hi) __B, (__v16hi) __W, (__mmask16) __M);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_max_epi16 (__mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmaxsw128_mask ((__v8hi) __A, (__v8hi) __B, (__v8hi) _mm_setzero_si128 (), (__mmask8) __M);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_max_epi16 (__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmaxsw128_mask ((__v8hi) __A, (__v8hi) __B, (__v8hi) __W, (__mmask8) __M);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_max_epu16 (__mmask16 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmaxuw256_mask ((__v16hi) __A, (__v16hi) __B, (__v16hi) _mm256_setzero_si256 (), (__mmask16) __M);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_max_epu16 (__m256i __W, __mmask16 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmaxuw256_mask ((__v16hi) __A, (__v16hi) __B, (__v16hi) __W, (__mmask16) __M);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_max_epu16 (__mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmaxuw128_mask ((__v8hi) __A, (__v8hi) __B, (__v8hi) _mm_setzero_si128 (), (__mmask8) __M);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_max_epu16 (__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmaxuw128_mask ((__v8hi) __A, (__v8hi) __B, (__v8hi) __W, (__mmask8) __M);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_min_epi16 (__mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pminsw128_mask ((__v8hi) __A, (__v8hi) __B, (__v8hi) _mm_setzero_si128 (), (__mmask8) __M);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_min_epi16 (__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pminsw128_mask ((__v8hi) __A, (__v8hi) __B, (__v8hi) __W, (__mmask8) __M);
+}
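+/* Immediate-operand intrinsics: real inline functions under __OPTIMIZE__, macros otherwise so the immediate remains a compile-time constant.  */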
+#ifdef __OPTIMIZE__
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_alignr_epi8 (__m256i __W, __mmask32 __U, __m256i __A, __m256i __B, const int __N)
+{
+  return (__m256i) __builtin_ia32_palignr256_mask ((__v4di) __A, (__v4di) __B, __N * 8, (__v4di) __W, (__mmask32) __U);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_alignr_epi8 (__mmask32 __U, __m256i __A, __m256i __B, const int __N)
+{
+  return (__m256i) __builtin_ia32_palignr256_mask ((__v4di) __A, (__v4di) __B, __N * 8, (__v4di) _mm256_setzero_si256 (), (__mmask32) __U);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_alignr_epi8 (__m128i __W, __mmask16 __U, __m128i __A, __m128i __B, const int __N)
+{
+  return (__m128i) __builtin_ia32_palignr128_mask ((__v2di) __A, (__v2di) __B, __N * 8, (__v2di) __W, (__mmask16) __U);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_alignr_epi8 (__mmask16 __U, __m128i __A, __m128i __B, const int __N)
+{
+  return (__m128i) __builtin_ia32_palignr128_mask ((__v2di) __A, (__v2di) __B, __N * 8, (__v2di) _mm_setzero_si128 (), (__mmask16) __U);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_dbsad_epu8 (__m256i __A, __m256i __B, const int __imm)
+{
+  return (__m256i) __builtin_ia32_dbpsadbw256_mask ((__v32qi) __A, (__v32qi) __B, __imm, (__v16hi) _mm256_setzero_si256 (), (__mmask16) -1);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_dbsad_epu8 (__m256i __W, __mmask16 __U, __m256i __A, __m256i __B, const int __imm)
+{
+  return (__m256i) __builtin_ia32_dbpsadbw256_mask ((__v32qi) __A, (__v32qi) __B, __imm, (__v16hi) __W, (__mmask16) __U);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_dbsad_epu8 (__mmask16 __U, __m256i __A, __m256i __B, const int __imm)
+{
+  return (__m256i) __builtin_ia32_dbpsadbw256_mask ((__v32qi) __A, (__v32qi) __B, __imm, (__v16hi) _mm256_setzero_si256 (), (__mmask16) __U);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_dbsad_epu8 (__m128i __A, __m128i __B, const int __imm)
+{
+  return (__m128i) __builtin_ia32_dbpsadbw128_mask ((__v16qi) __A, (__v16qi) __B, __imm, (__v8hi) _mm_setzero_si128 (), (__mmask8) -1);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_dbsad_epu8 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B, const int __imm)
+{
+  return (__m128i) __builtin_ia32_dbpsadbw128_mask ((__v16qi) __A, (__v16qi) __B, __imm, (__v8hi) __W, (__mmask8) __U);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_dbsad_epu8 (__mmask8 __U, __m128i __A, __m128i __B, const int __imm)
+{
+  return (__m128i) __builtin_ia32_dbpsadbw128_mask ((__v16qi) __A, (__v16qi) __B, __imm, (__v8hi) _mm_setzero_si128 (), (__mmask8) __U);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_blend_epi16 (__mmask8 __U, __m128i __A, __m128i __W)
+{
+  return (__m128i) __builtin_ia32_blendmw_128_mask ((__v8hi) __A, (__v8hi) __W, (__mmask8) __U);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_blend_epi8 (__mmask16 __U, __m128i __A, __m128i __W)
+{
+  return (__m128i) __builtin_ia32_blendmb_128_mask ((__v16qi) __A, (__v16qi) __W, (__mmask16) __U);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_blend_epi16 (__mmask16 __U, __m256i __A, __m256i __W)
+{
+  return (__m256i) __builtin_ia32_blendmw_256_mask ((__v16hi) __A, (__v16hi) __W, (__mmask16) __U);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_blend_epi8 (__mmask32 __U, __m256i __A, __m256i __W)
+{
+  return (__m256i) __builtin_ia32_blendmb_256_mask ((__v32qi) __A, (__v32qi) __W, (__mmask32) __U);
+}
+extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmp_epi16_mask (__mmask8 __U, __m128i __X, __m128i __Y, const int __P)
+{
+  return (__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi) __X, (__v8hi) __Y, __P, (__mmask8) __U);
+}
+extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_epi16_mask (__m128i __X, __m128i __Y, const int __P)
+{
+  return (__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi) __X, (__v8hi) __Y, __P, (__mmask8) -1);
+}
+extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmp_epi16_mask (__mmask16 __U, __m256i __X, __m256i __Y, const int __P)
+{
+  return (__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi) __X, (__v16hi) __Y, __P, (__mmask16) __U);
+}
+extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmp_epi16_mask (__m256i __X, __m256i __Y, const int __P)
+{
+  return (__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi) __X, (__v16hi) __Y, __P, (__mmask16) -1);
+}
+extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmp_epi8_mask (__mmask16 __U, __m128i __X, __m128i __Y, const int __P)
+{
+  return (__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi) __X, (__v16qi) __Y, __P, (__mmask16) __U);
+}
+extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_epi8_mask (__m128i __X, __m128i __Y, const int __P)
+{
+  return (__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi) __X, (__v16qi) __Y, __P, (__mmask16) -1);
+}
+extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmp_epi8_mask (__mmask32 __U, __m256i __X, __m256i __Y, const int __P)
+{
+  return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X, (__v32qi) __Y, __P, (__mmask32) __U);
+}
+extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmp_epi8_mask (__m256i __X, __m256i __Y, const int __P)
+{
+  return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X, (__v32qi) __Y, __P, (__mmask32) -1);
+}
+extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmp_epu16_mask (__mmask8 __U, __m128i __X, __m128i __Y, const int __P)
+{
+  return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __X, (__v8hi) __Y, __P, (__mmask8) __U);
+}
+extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_epu16_mask (__m128i __X, __m128i __Y, const int __P)
+{
+  return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __X, (__v8hi) __Y, __P, (__mmask8) -1);
+}
+extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmp_epu16_mask (__mmask16 __U, __m256i __X, __m256i __Y, const int __P)
+{
+  return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __X, (__v16hi) __Y, __P, (__mmask16) __U);
+}
+extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmp_epu16_mask (__m256i __X, __m256i __Y, const int __P)
+{
+  return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __X, (__v16hi) __Y, __P, (__mmask16) -1);
+}
+extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmp_epu8_mask (__mmask16 __U, __m128i __X, __m128i __Y, const int __P)
+{
+  return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __X, (__v16qi) __Y, __P, (__mmask16) __U);
+}
+extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_epu8_mask (__m128i __X, __m128i __Y, const int __P)
+{
+  return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __X, (__v16qi) __Y, __P, (__mmask16) -1);
+}
+extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmp_epu8_mask (__mmask32 __U, __m256i __X, __m256i __Y, const int __P)
+{
+  return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X, (__v32qi) __Y, __P, (__mmask32) __U);
+}
+extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmp_epu8_mask (__m256i __X, __m256i __Y, const int __P)
+{
+  return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X, (__v32qi) __Y, __P, (__mmask32) -1);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_srli_epi16 (__m256i __W, __mmask16 __U, __m256i __A, const int __imm)
+{
+  return (__m256i) __builtin_ia32_psrlwi256_mask ((__v16hi) __A, __imm, (__v16hi) __W, (__mmask16) __U);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_srli_epi16 (__mmask16 __U, __m256i __A, const int __imm)
+{
+  return (__m256i) __builtin_ia32_psrlwi256_mask ((__v16hi) __A, __imm, (__v16hi) _mm256_setzero_si256 (), (__mmask16) __U);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_srli_epi16 (__m128i __W, __mmask8 __U, __m128i __A, const int __imm)
+{
+  return (__m128i) __builtin_ia32_psrlwi128_mask ((__v8hi) __A, __imm, (__v8hi) __W, (__mmask8) __U);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_srli_epi16 (__mmask8 __U, __m128i __A, const int __imm)
+{
+  return (__m128i) __builtin_ia32_psrlwi128_mask ((__v8hi) __A, __imm, (__v8hi) _mm_setzero_si128 (), (__mmask8) __U);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_shufflehi_epi16 (__m256i __W, __mmask16 __U, __m256i __A, const int __imm)
+{
+  return (__m256i) __builtin_ia32_pshufhw256_mask ((__v16hi) __A, __imm, (__v16hi) __W, (__mmask16) __U);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_shufflehi_epi16 (__mmask16 __U, __m256i __A, const int __imm)
+{
+  return (__m256i) __builtin_ia32_pshufhw256_mask ((__v16hi) __A, __imm, (__v16hi) _mm256_setzero_si256 (), (__mmask16) __U);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_shufflehi_epi16 (__m128i __W, __mmask8 __U, __m128i __A, const int __imm)
+{
+  return (__m128i) __builtin_ia32_pshufhw128_mask ((__v8hi) __A, __imm, (__v8hi) __W, (__mmask8) __U);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_shufflehi_epi16 (__mmask8 __U, __m128i __A, const int __imm)
+{
+  return (__m128i) __builtin_ia32_pshufhw128_mask ((__v8hi) __A, __imm, (__v8hi) _mm_setzero_si128 (), (__mmask8) __U);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_shufflelo_epi16 (__m256i __W, __mmask16 __U, __m256i __A, const int __imm)
+{
+  return (__m256i) __builtin_ia32_pshuflw256_mask ((__v16hi) __A, __imm, (__v16hi) __W, (__mmask16) __U);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_shufflelo_epi16 (__mmask16 __U, __m256i __A, const int __imm)
+{
+  return (__m256i) __builtin_ia32_pshuflw256_mask ((__v16hi) __A, __imm, (__v16hi) _mm256_setzero_si256 (), (__mmask16) __U);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_shufflelo_epi16 (__m128i __W, __mmask8 __U, __m128i __A, const int __imm)
+{
+  return (__m128i) __builtin_ia32_pshuflw128_mask ((__v8hi) __A, __imm, (__v8hi) __W, (__mmask8) __U);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_shufflelo_epi16 (__mmask8 __U, __m128i __A, const int __imm)
+{
+  return (__m128i) __builtin_ia32_pshuflw128_mask ((__v8hi) __A, __imm, (__v8hi) _mm_setzero_si128 (), (__mmask8) __U);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_srai_epi16 (__m256i __W, __mmask16 __U, __m256i __A, const int __imm)
+{
+  return (__m256i) __builtin_ia32_psrawi256_mask ((__v16hi) __A, __imm, (__v16hi) __W, (__mmask16) __U);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_srai_epi16 (__mmask16 __U, __m256i __A, const int __imm)
+{
+  return (__m256i) __builtin_ia32_psrawi256_mask ((__v16hi) __A, __imm, (__v16hi) _mm256_setzero_si256 (), (__mmask16) __U);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_srai_epi16 (__m128i __W, __mmask8 __U, __m128i __A, const int __imm)
+{
+  return (__m128i) __builtin_ia32_psrawi128_mask ((__v8hi) __A, __imm, (__v8hi) __W, (__mmask8) __U);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_srai_epi16 (__mmask8 __U, __m128i __A, const int __imm)
+{
+  return (__m128i) __builtin_ia32_psrawi128_mask ((__v8hi) __A, __imm, (__v8hi) _mm_setzero_si128 (), (__mmask8) __U);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_slli_epi16 (__m256i __W, __mmask16 __U, __m256i __A, int __B)
+{
+  return (__m256i) __builtin_ia32_psllwi256_mask ((__v16hi) __A, __B, (__v16hi) __W, (__mmask16) __U);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_slli_epi16 (__mmask16 __U, __m256i __A, int __B)
+{
+  return (__m256i) __builtin_ia32_psllwi256_mask ((__v16hi) __A, __B, (__v16hi) _mm256_setzero_si256 (), (__mmask16) __U);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_slli_epi16 (__m128i __W, __mmask8 __U, __m128i __A, int __B)
+{
+  return (__m128i) __builtin_ia32_psllwi128_mask ((__v8hi) __A, __B, (__v8hi) __W, (__mmask8) __U);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_slli_epi16 (__mmask8 __U, __m128i __A, int __B)
+{
+  return (__m128i) __builtin_ia32_psllwi128_mask ((__v8hi) __A, __B, (__v8hi) _mm_setzero_si128 (), (__mmask8) __U);
+}
+#else
+#define _mm256_mask_alignr_epi8(W, U, X, Y, N) ((__m256i) __builtin_ia32_palignr256_mask ((__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)((N) * 8), (__v4di)(__m256i)(X), (__mmask32)(U)))
+#define _mm256_mask_srli_epi16(W, U, A, B) ((__m256i) __builtin_ia32_psrlwi256_mask ((__v16hi)(__m256i)(A), (int)(B), (__v16hi)(__m256i)(W), (__mmask16)(U)))
+#define _mm256_maskz_srli_epi16(U, A, B) ((__m256i) __builtin_ia32_psrlwi256_mask ((__v16hi)(__m256i)(A), (int)(B), (__v16hi)_mm256_setzero_si256 (), (__mmask16)(U)))
+#define _mm_mask_srli_epi16(W, U, A, B) ((__m128i) __builtin_ia32_psrlwi128_mask ((__v8hi)(__m128i)(A), (int)(B), (__v8hi)(__m128i)(W), (__mmask8)(U)))
+#define _mm_maskz_srli_epi16(U, A, B) ((__m128i) __builtin_ia32_psrlwi128_mask ((__v8hi)(__m128i)(A), (int)(B), (__v8hi)_mm_setzero_si128(), (__mmask8)(U)))
+#define _mm256_mask_srai_epi16(W, U, A, B) ((__m256i) __builtin_ia32_psrawi256_mask ((__v16hi)(__m256i)(A), (int)(B), (__v16hi)(__m256i)(W), (__mmask16)(U)))
+#define _mm256_maskz_srai_epi16(U, A, B) ((__m256i) __builtin_ia32_psrawi256_mask ((__v16hi)(__m256i)(A), (int)(B), (__v16hi)_mm256_setzero_si256 (), (__mmask16)(U)))
+#define _mm_mask_srai_epi16(W, U, A, B) ((__m128i) __builtin_ia32_psrawi128_mask ((__v8hi)(__m128i)(A), (int)(B), (__v8hi)(__m128i)(W), (__mmask8)(U)))
+#define _mm_maskz_srai_epi16(U, A, B) ((__m128i) __builtin_ia32_psrawi128_mask ((__v8hi)(__m128i)(A), (int)(B), (__v8hi)_mm_setzero_si128(), (__mmask8)(U)))
+#define _mm256_mask_shufflehi_epi16(W, U, A, B) ((__m256i) __builtin_ia32_pshufhw256_mask ((__v16hi)(__m256i)(A), (int)(B), (__v16hi)(__m256i)(W), (__mmask16)(U)))
+#define _mm256_maskz_shufflehi_epi16(U, A, B) ((__m256i) __builtin_ia32_pshufhw256_mask ((__v16hi)(__m256i)(A), (int)(B), (__v16hi)(__m256i)_mm256_setzero_si256 (), (__mmask16)(U)))
+#define _mm_mask_shufflehi_epi16(W, U, A, B) ((__m128i) __builtin_ia32_pshufhw128_mask ((__v8hi)(__m128i)(A), (int)(B), (__v8hi)(__m128i)(W), (__mmask8)(U)))
+#define _mm_maskz_shufflehi_epi16(U, A, B) ((__m128i) __builtin_ia32_pshufhw128_mask ((__v8hi)(__m128i)(A), (int)(B), (__v8hi)(__m128i)_mm_setzero_si128 (), (__mmask8)(U)))
+#define _mm256_mask_shufflelo_epi16(W, U, A, B) ((__m256i) __builtin_ia32_pshuflw256_mask ((__v16hi)(__m256i)(A), (int)(B), (__v16hi)(__m256i)(W), (__mmask16)(U)))
+#define _mm256_maskz_shufflelo_epi16(U, A, B) ((__m256i) __builtin_ia32_pshuflw256_mask ((__v16hi)(__m256i)(A), (int)(B), (__v16hi)(__m256i)_mm256_setzero_si256 (), (__mmask16)(U)))
+#define _mm_mask_shufflelo_epi16(W, U, A, B) ((__m128i) __builtin_ia32_pshuflw128_mask ((__v8hi)(__m128i)(A), (int)(B), (__v8hi)(__m128i)(W), (__mmask8)(U)))
+#define _mm_maskz_shufflelo_epi16(U, A, B) ((__m128i) __builtin_ia32_pshuflw128_mask ((__v8hi)(__m128i)(A), (int)(B), (__v8hi)(__m128i)_mm_setzero_si128 (), (__mmask8)(U)))
+#define _mm256_maskz_alignr_epi8(U, X, Y, N) ((__m256i) __builtin_ia32_palignr256_mask ((__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)((N) * 8), (__v4di)(__m256i)_mm256_setzero_si256 (), (__mmask32)(U)))
+#define _mm_mask_alignr_epi8(W, U, X, Y, N) ((__m128i) __builtin_ia32_palignr128_mask ((__v2di)(__m128i)(X), (__v2di)(__m128i)(Y), (int)((N) * 8), (__v2di)(__m128i)(X), (__mmask16)(U)))
+#define _mm_maskz_alignr_epi8(U, X, Y, N) ((__m128i) __builtin_ia32_palignr128_mask ((__v2di)(__m128i)(X), (__v2di)(__m128i)(Y), (int)((N) * 8), (__v2di)(__m128i)_mm_setzero_si128 (), (__mmask16)(U)))
+#define _mm_mask_slli_epi16(W, U, X, C) ((__m128i)__builtin_ia32_psllwi128_mask ((__v8hi)(__m128i)(X), (int)(C), (__v8hi)(__m128i)(W), (__mmask8)(U)))
+#define _mm_maskz_slli_epi16(U, X, C) ((__m128i)__builtin_ia32_psllwi128_mask ((__v8hi)(__m128i)(X), (int)(C), (__v8hi)(__m128i)_mm_setzero_si128 (), (__mmask8)(U)))
+#define _mm256_dbsad_epu8(X, Y, C) ((__m256i) __builtin_ia32_dbpsadbw256_mask ((__v32qi)(__m256i) (X), (__v32qi)(__m256i) (Y), (int) (C), (__v16hi)(__m256i)_mm256_setzero_si256(), (__mmask16)-1))
+#define _mm256_mask_slli_epi16(W, U, X, C) ((__m256i)__builtin_ia32_psllwi256_mask ((__v16hi)(__m256i)(X), (int)(C), (__v16hi)(__m256i)(W), (__mmask16)(U)))
+#define _mm256_maskz_slli_epi16(U, X, C) ((__m256i)__builtin_ia32_psllwi256_mask ((__v16hi)(__m256i)(X), (int)(C), (__v16hi)(__m256i)_mm256_setzero_si256 (), (__mmask16)(U)))
+#define _mm256_mask_dbsad_epu8(W, U, X, Y, C) ((__m256i) __builtin_ia32_dbpsadbw256_mask ((__v32qi)(__m256i) (X), (__v32qi)(__m256i) (Y), (int) (C), (__v16hi)(__m256i)(W), (__mmask16)(U)))
+#define _mm256_maskz_dbsad_epu8(U, X, Y, C) ((__m256i) __builtin_ia32_dbpsadbw256_mask ((__v32qi)(__m256i) (X), (__v32qi)(__m256i) (Y), (int) (C), (__v16hi)(__m256i)_mm256_setzero_si256(), (__mmask16)(U)))
+#define _mm_dbsad_epu8(X, Y, C) ((__m128i) __builtin_ia32_dbpsadbw128_mask ((__v16qi)(__m128i) (X), (__v16qi)(__m128i) (Y), (int) (C), (__v8hi)(__m128i)_mm_setzero_si128(), (__mmask8)-1))
+#define _mm_mask_dbsad_epu8(W, U, X, Y, C) ((__m128i) __builtin_ia32_dbpsadbw128_mask ((__v16qi)(__m128i) (X), (__v16qi)(__m128i) (Y), (int) (C), (__v8hi)(__m128i)(W), (__mmask8)(U)))
+#define _mm_maskz_dbsad_epu8(U, X, Y, C) ((__m128i) __builtin_ia32_dbpsadbw128_mask ((__v16qi)(__m128i) (X), (__v16qi)(__m128i) (Y), (int) (C), (__v8hi)(__m128i)_mm_setzero_si128(), (__mmask8)(U)))
+#define _mm_mask_blend_epi16(__U, __A, __W) ((__m128i) __builtin_ia32_blendmw_128_mask ((__v8hi) (__A), (__v8hi) (__W), (__mmask8) (__U)))
+#define _mm_mask_blend_epi8(__U, __A, __W) ((__m128i) __builtin_ia32_blendmb_128_mask ((__v16qi) (__A), (__v16qi) (__W), (__mmask16) (__U)))
+#define _mm256_mask_blend_epi16(__U, __A, __W) ((__m256i) __builtin_ia32_blendmw_256_mask ((__v16hi) (__A), (__v16hi) (__W), (__mmask16) (__U)))
+#define _mm256_mask_blend_epi8(__U, __A, __W) ((__m256i) __builtin_ia32_blendmb_256_mask ((__v32qi) (__A), (__v32qi) (__W), (__mmask32) (__U)))
+#define _mm_cmp_epi16_mask(X, Y, P) ((__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi)(__m128i)(X), (__v8hi)(__m128i)(Y), (int)(P), (__mmask8)(-1)))
+#define _mm_cmp_epi8_mask(X, Y, P) ((__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi)(__m128i)(X), (__v16qi)(__m128i)(Y), (int)(P), (__mmask16)(-1)))
+#define _mm256_cmp_epi16_mask(X, Y, P) ((__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi)(__m256i)(X), (__v16hi)(__m256i)(Y), (int)(P), (__mmask16)(-1)))
+#define _mm256_cmp_epi8_mask(X, Y, P) ((__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi)(__m256i)(X), (__v32qi)(__m256i)(Y), (int)(P), (__mmask32)(-1)))
+#define _mm_cmp_epu16_mask(X, Y, P) ((__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi)(__m128i)(X), (__v8hi)(__m128i)(Y), (int)(P), (__mmask8)(-1)))
+#define _mm_cmp_epu8_mask(X, Y, P) ((__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi)(__m128i)(X), (__v16qi)(__m128i)(Y), (int)(P), (__mmask16)(-1)))
+#define _mm256_cmp_epu16_mask(X, Y, P) ((__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi)(__m256i)(X), (__v16hi)(__m256i)(Y), (int)(P), (__mmask16)(-1)))
+#define _mm256_cmp_epu8_mask(X, Y, P) ((__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi)(__m256i)(X), (__v32qi)(__m256i)(Y), (int)(P), (__mmask32)-1))
+#define _mm_mask_cmp_epi16_mask(M, X, Y, P) ((__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi)(__m128i)(X), (__v8hi)(__m128i)(Y), (int)(P), (__mmask8)(M)))
+#define _mm_mask_cmp_epi8_mask(M, X, Y, P) ((__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi)(__m128i)(X), (__v16qi)(__m128i)(Y), (int)(P), (__mmask16)(M)))
+#define _mm256_mask_cmp_epi16_mask(M, X, Y, P) ((__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi)(__m256i)(X), (__v16hi)(__m256i)(Y), (int)(P), (__mmask16)(M)))
+#define _mm256_mask_cmp_epi8_mask(M, X, Y, P) ((__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi)(__m256i)(X), (__v32qi)(__m256i)(Y), (int)(P), (__mmask32)(M)))
+#define _mm_mask_cmp_epu16_mask(M, X, Y, P) ((__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi)(__m128i)(X), (__v8hi)(__m128i)(Y), (int)(P), (__mmask8)(M)))
+#define _mm_mask_cmp_epu8_mask(M, X, Y, P) ((__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi)(__m128i)(X), (__v16qi)(__m128i)(Y), (int)(P), (__mmask16)(M)))
+#define _mm256_mask_cmp_epu16_mask(M, X, Y, P) ((__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi)(__m256i)(X), (__v16hi)(__m256i)(Y), (int)(P), (__mmask16)(M)))
+#define _mm256_mask_cmp_epu8_mask(M, X, Y, P) ((__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi)(__m256i)(X), (__v32qi)(__m256i)(Y), (int)(P), (__mmask32)(M)))
+#endif
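+/* Predicate shorthands built on the cmp builtins (imm 1 = LT, 2 = LE, 4 = NE, 5 = NLT/GE).  */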
+extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpneq_epi8_mask (__m256i __X, __m256i __Y)
+{
+  return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X, (__v32qi) __Y, 4, (__mmask32) -1);
+}
+extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmplt_epi8_mask (__m256i __X, __m256i __Y)
+{
+  return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X, (__v32qi) __Y, 1, (__mmask32) -1);
+}
+extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpge_epi8_mask (__m256i __X, __m256i __Y)
+{
+  return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X, (__v32qi) __Y, 5, (__mmask32) -1);
+}
+extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmple_epi8_mask (__m256i __X, __m256i __Y)
+{
+  return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X, (__v32qi) __Y, 2, (__mmask32) -1);
+}
+extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpneq_epi16_mask (__m256i __X, __m256i __Y)
+{
+  return (__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi) __X, (__v16hi) __Y, 4, (__mmask16) -1);
+}
+extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmplt_epi16_mask (__m256i __X, __m256i __Y)
+{
+  return (__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi) __X, (__v16hi) __Y, 1, (__mmask16) -1);
+}
+extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpge_epi16_mask (__m256i __X, __m256i __Y)
+{
+  return (__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi) __X, (__v16hi) __Y, 5, (__mmask16) -1);
+}
+extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmple_epi16_mask (__m256i __X, __m256i __Y)
+{
+  return (__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi) __X, (__v16hi) __Y, 2, (__mmask16) -1);
+}
+extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpneq_epu8_mask (__m128i __X, __m128i __Y)
+{
+  return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __X, (__v16qi) __Y, 4, (__mmask16) -1);
+}
+extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmplt_epu8_mask (__m128i __X, __m128i __Y)
+{
+  return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __X, (__v16qi) __Y, 1, (__mmask16) -1);
+}
+extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpge_epu8_mask (__m128i __X, __m128i __Y)
+{
+  return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __X, (__v16qi) __Y, 5, (__mmask16) -1);
+}
+extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmple_epu8_mask (__m128i __X, __m128i __Y)
+{
+  return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __X, (__v16qi) __Y, 2, (__mmask16) -1);
+}
+extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpneq_epu16_mask (__m128i __X, __m128i __Y)
+{
+  return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __X, (__v8hi) __Y, 4, (__mmask8) -1);
+}
+extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmplt_epu16_mask (__m128i __X, __m128i __Y)
+{
+  return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __X, (__v8hi) __Y, 1, (__mmask8) -1);
+}
+extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpge_epu16_mask (__m128i __X, __m128i __Y)
+{
+  return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __X, (__v8hi) __Y, 5, (__mmask8) -1);
+}
+extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmple_epu16_mask (__m128i __X, __m128i __Y)
+{
+  return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __X, (__v8hi) __Y, 2, (__mmask8) -1);
+}
+extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpneq_epi8_mask (__m128i __X, __m128i __Y)
+{
+  return (__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi) __X, (__v16qi) __Y, 4, (__mmask16) -1);
+}
+extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmplt_epi8_mask (__m128i __X, __m128i __Y)
+{
+  return (__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi) __X, (__v16qi) __Y, 1, (__mmask16) -1);
+}
+extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpge_epi8_mask (__m128i __X, __m128i __Y)
+{
+  return (__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi) __X, (__v16qi) __Y, 5, (__mmask16) -1);
+}
+extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmple_epi8_mask (__m128i __X, __m128i __Y)
+{
+  return (__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi) __X, (__v16qi) __Y, 2, (__mmask16) -1);
+}
+extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpneq_epi16_mask (__m128i __X, __m128i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi) __X, (__v8hi) __Y, 4, (__mmask8) -1);
+}
+extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmplt_epi16_mask (__m128i __X, __m128i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi) __X, (__v8hi) __Y, 1, (__mmask8) -1);
+}
+extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpge_epi16_mask (__m128i __X, __m128i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi) __X, (__v8hi) __Y, 5, (__mmask8) -1);
+}
+extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmple_epi16_mask (__m128i __X, __m128i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi) __X, (__v8hi) __Y, 2, (__mmask8) -1);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_mulhrs_epi16 (__m256i __W, __mmask16 __U, __m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_pmulhrsw256_mask ((__v16hi) __X, (__v16hi) __Y, (__v16hi) __W, (__mmask16) __U);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_mulhrs_epi16 (__mmask16 __U, __m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_pmulhrsw256_mask ((__v16hi) __X, (__v16hi) __Y, (__v16hi) _mm256_setzero_si256 (), (__mmask16) __U);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_mulhi_epu16 (__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmulhuw256_mask ((__v16hi) __A, (__v16hi) __B, (__v16hi) __W, (__mmask16) __U);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_mulhi_epu16 (__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmulhuw256_mask ((__v16hi) __A, (__v16hi) __B, (__v16hi) _mm256_setzero_si256 (), (__mmask16) __U);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_mulhi_epi16 (__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmulhw256_mask ((__v16hi) __A, (__v16hi) __B, (__v16hi) __W, (__mmask16) __U);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_mulhi_epi16 (__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmulhw256_mask ((__v16hi) __A, (__v16hi) __B, (__v16hi) _mm256_setzero_si256 (), (__mmask16) __U);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_mulhi_epi16 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmulhw128_mask ((__v8hi) __A, (__v8hi) __B, (__v8hi) __W, (__mmask8) __U);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_mulhi_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmulhw128_mask ((__v8hi) __A, (__v8hi) __B, (__v8hi) _mm_setzero_si128 (), (__mmask8) __U);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_mulhi_epu16 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmulhuw128_mask ((__v8hi) __A, (__v8hi) __B, (__v8hi) __W, (__mmask8) __U);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_mulhi_epu16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmulhuw128_mask ((__v8hi) __A, (__v8hi) __B, (__v8hi) _mm_setzero_si128 (), (__mmask8) __U);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_mulhrs_epi16 (__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_pmulhrsw128_mask ((__v8hi) __X, (__v8hi) __Y, (__v8hi) __W, (__mmask8) __U);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_mulhrs_epi16 (__mmask8 __U, __m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_pmulhrsw128_mask ((__v8hi) __X, (__v8hi) __Y, (__v8hi) _mm_setzero_si128 (), (__mmask8) __U);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_mullo_epi16 (__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmullw256_mask ((__v16hi) __A, (__v16hi) __B, (__v16hi) __W, (__mmask16) __U);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_mullo_epi16 (__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmullw256_mask ((__v16hi) __A, (__v16hi) __B, (__v16hi) _mm256_setzero_si256 (), (__mmask16) __U);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_mullo_epi16 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmullw128_mask ((__v8hi) __A, (__v8hi) __B, (__v8hi) __W, (__mmask8) __U);
+}
+extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_mullo_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmullw128_mask ((__v8hi) __A, (__v8hi) __B, (__v8hi) _mm_setzero_si128 (), (__mmask8) __U);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepi8_epi16 (__m256i __W, __mmask16 __U, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_pmovsxbw256_mask ((__v16qi) __A, (__v16hi) __W, (__mmask16) __U);
+}
+extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepi8_epi16
(__mmask16 __U, __m128i __A) +{ + return (__m256i) __builtin_ia32_pmovsxbw256_mask ((__v16qi) __A, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepi8_epi16 (__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovsxbw128_mask ((__v16qi) __A, + (__v8hi) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtepi8_epi16 (__mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovsxbw128_mask ((__v16qi) __A, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepu8_epi16 (__m256i __W, __mmask16 __U, __m128i __A) +{ + return (__m256i) __builtin_ia32_pmovzxbw256_mask ((__v16qi) __A, + (__v16hi) __W, + (__mmask16) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtepu8_epi16 (__mmask16 __U, __m128i __A) +{ + return (__m256i) __builtin_ia32_pmovzxbw256_mask ((__v16qi) __A, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepu8_epi16 (__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovzxbw128_mask ((__v16qi) __A, + (__v8hi) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtepu8_epi16 (__mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovzxbw128_mask ((__v16qi) __A, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_avg_epu8 (__m256i __W, __mmask32 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pavgb256_mask ((__v32qi) __A, + (__v32qi) __B, + (__v32qi) __W, + (__mmask32) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_avg_epu8 (__mmask32 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pavgb256_mask ((__v32qi) __A, + (__v32qi) __B, + (__v32qi) + _mm256_setzero_si256 (), + (__mmask32) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_avg_epu8 (__m128i __W, __mmask16 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pavgb128_mask ((__v16qi) __A, + (__v16qi) __B, + (__v16qi) __W, + (__mmask16) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_avg_epu8 (__mmask16 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pavgb128_mask ((__v16qi) __A, + (__v16qi) __B, + (__v16qi) + _mm_setzero_si128 (), + (__mmask16) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_avg_epu16 (__m256i __W, __mmask16 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pavgw256_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) __W, + (__mmask16) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_avg_epu16 (__mmask16 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pavgw256_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) + 
_mm256_setzero_si256 (), + (__mmask16) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_avg_epu16 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pavgw128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_avg_epu16 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pavgw128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_add_epi8 (__m256i __W, __mmask32 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_paddb256_mask ((__v32qi) __A, + (__v32qi) __B, + (__v32qi) __W, + (__mmask32) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_add_epi8 (__mmask32 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_paddb256_mask ((__v32qi) __A, + (__v32qi) __B, + (__v32qi) + _mm256_setzero_si256 (), + (__mmask32) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_add_epi16 (__m256i __W, __mmask16 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_paddw256_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) __W, + (__mmask16) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_add_epi16 (__mmask16 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_paddw256_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_adds_epi8 (__m256i __W, __mmask32 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_paddsb256_mask ((__v32qi) __A, + (__v32qi) __B, + (__v32qi) __W, + (__mmask32) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_adds_epi8 (__mmask32 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_paddsb256_mask ((__v32qi) __A, + (__v32qi) __B, + (__v32qi) + _mm256_setzero_si256 (), + (__mmask32) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_adds_epi16 (__m256i __W, __mmask16 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_paddsw256_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) __W, + (__mmask16) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_adds_epi16 (__mmask16 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_paddsw256_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_adds_epu8 (__m256i __W, __mmask32 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_paddusb256_mask ((__v32qi) __A, + (__v32qi) __B, + (__v32qi) __W, + (__mmask32) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_adds_epu8 (__mmask32 __U, __m256i __A, __m256i __B) +{ + return (__m256i) 
__builtin_ia32_paddusb256_mask ((__v32qi) __A, + (__v32qi) __B, + (__v32qi) + _mm256_setzero_si256 (), + (__mmask32) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_adds_epu16 (__m256i __W, __mmask16 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_paddusw256_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) __W, + (__mmask16) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_adds_epu16 (__mmask16 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_paddusw256_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_sub_epi8 (__m256i __W, __mmask32 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_psubb256_mask ((__v32qi) __A, + (__v32qi) __B, + (__v32qi) __W, + (__mmask32) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_sub_epi8 (__mmask32 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_psubb256_mask ((__v32qi) __A, + (__v32qi) __B, + (__v32qi) + _mm256_setzero_si256 (), + (__mmask32) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_sub_epi16 (__m256i __W, __mmask16 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_psubw256_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) __W, + (__mmask16) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_sub_epi16 (__mmask16 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_psubw256_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_subs_epi8 (__m256i __W, __mmask32 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_psubsb256_mask ((__v32qi) __A, + (__v32qi) __B, + (__v32qi) __W, + (__mmask32) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_subs_epi8 (__mmask32 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_psubsb256_mask ((__v32qi) __A, + (__v32qi) __B, + (__v32qi) + _mm256_setzero_si256 (), + (__mmask32) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_subs_epi16 (__m256i __W, __mmask16 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_psubsw256_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) __W, + (__mmask16) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_subs_epi16 (__mmask16 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_psubsw256_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_subs_epu8 (__m256i __W, __mmask32 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_psubusb256_mask ((__v32qi) __A, + (__v32qi) __B, + (__v32qi) __W, + (__mmask32) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm256_maskz_subs_epu8 (__mmask32 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_psubusb256_mask ((__v32qi) __A, + (__v32qi) __B, + (__v32qi) + _mm256_setzero_si256 (), + (__mmask32) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_subs_epu16 (__m256i __W, __mmask16 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_psubusw256_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) __W, + (__mmask16) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_subs_epu16 (__mmask16 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_psubusw256_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_add_epi8 (__m128i __W, __mmask16 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_paddb128_mask ((__v16qi) __A, + (__v16qi) __B, + (__v16qi) __W, + (__mmask16) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_add_epi8 (__mmask16 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_paddb128_mask ((__v16qi) __A, + (__v16qi) __B, + (__v16qi) + _mm_setzero_si128 (), + (__mmask16) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_add_epi16 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_paddw128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_add_epi16 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_paddw128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_unpackhi_epi8 (__m256i __W, __mmask32 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_punpckhbw256_mask ((__v32qi) __A, + (__v32qi) __B, + (__v32qi) __W, + (__mmask32) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_unpackhi_epi8 (__mmask32 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_punpckhbw256_mask ((__v32qi) __A, + (__v32qi) __B, + (__v32qi) + _mm256_setzero_si256 (), + (__mmask32) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_unpackhi_epi8 (__m128i __W, __mmask16 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_punpckhbw128_mask ((__v16qi) __A, + (__v16qi) __B, + (__v16qi) __W, + (__mmask16) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_unpackhi_epi8 (__mmask16 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_punpckhbw128_mask ((__v16qi) __A, + (__v16qi) __B, + (__v16qi) + _mm_setzero_si128 (), + (__mmask16) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_unpackhi_epi16 (__m256i __W, __mmask16 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_punpckhwd256_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) __W, + (__mmask16) __U); +} +extern 
__inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_unpackhi_epi16 (__mmask16 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_punpckhwd256_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_unpackhi_epi16 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_punpckhwd128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_unpackhi_epi16 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_punpckhwd128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_unpacklo_epi8 (__m256i __W, __mmask32 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_punpcklbw256_mask ((__v32qi) __A, + (__v32qi) __B, + (__v32qi) __W, + (__mmask32) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_unpacklo_epi8 (__mmask32 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_punpcklbw256_mask ((__v32qi) __A, + (__v32qi) __B, + (__v32qi) + _mm256_setzero_si256 (), + (__mmask32) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_unpacklo_epi8 (__m128i __W, __mmask16 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_punpcklbw128_mask ((__v16qi) __A, + (__v16qi) __B, + (__v16qi) __W, + (__mmask16) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_unpacklo_epi8 (__mmask16 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_punpcklbw128_mask ((__v16qi) __A, + (__v16qi) __B, + (__v16qi) + _mm_setzero_si128 (), + (__mmask16) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_unpacklo_epi16 (__m256i __W, __mmask16 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_punpcklwd256_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) __W, + (__mmask16) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_unpacklo_epi16 (__mmask16 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_punpcklwd256_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_unpacklo_epi16 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_punpcklwd128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_unpacklo_epi16 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_punpcklwd128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_epi8_mask (__m128i __A, __m128i __B) +{ + return (__mmask16) 
__builtin_ia32_pcmpeqb128_mask ((__v16qi) __A, + (__v16qi) __B, + (__mmask16) -1); +} +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_epu8_mask (__m128i __A, __m128i __B) +{ + return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __A, + (__v16qi) __B, 0, + (__mmask16) -1); +} +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpeq_epu8_mask (__mmask16 __U, __m128i __A, __m128i __B) +{ + return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __A, + (__v16qi) __B, 0, + __U); +} +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpeq_epi8_mask (__mmask16 __U, __m128i __A, __m128i __B) +{ + return (__mmask16) __builtin_ia32_pcmpeqb128_mask ((__v16qi) __A, + (__v16qi) __B, + __U); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpeq_epu8_mask (__m256i __A, __m256i __B) +{ + return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __A, + (__v32qi) __B, 0, + (__mmask32) -1); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpeq_epi8_mask (__m256i __A, __m256i __B) +{ + return (__mmask32) __builtin_ia32_pcmpeqb256_mask ((__v32qi) __A, + (__v32qi) __B, + (__mmask32) -1); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpeq_epu8_mask (__mmask32 __U, __m256i __A, __m256i __B) +{ + return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __A, + (__v32qi) __B, 0, + __U); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpeq_epi8_mask (__mmask32 __U, __m256i __A, __m256i __B) +{ + return (__mmask32) __builtin_ia32_pcmpeqb256_mask ((__v32qi) __A, + (__v32qi) __B, + __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_epu16_mask (__m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __A, + (__v8hi) __B, 0, + (__mmask8) -1); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_epi16_mask (__m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_pcmpeqw128_mask ((__v8hi) __A, + (__v8hi) __B, + (__mmask8) -1); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpeq_epu16_mask (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __A, + (__v8hi) __B, 0, __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpeq_epi16_mask (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_pcmpeqw128_mask ((__v8hi) __A, + (__v8hi) __B, __U); +} +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpeq_epu16_mask (__m256i __A, __m256i __B) +{ + return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __A, + (__v16hi) __B, 0, + (__mmask16) -1); +} +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpeq_epi16_mask (__m256i __A, __m256i __B) +{ + return (__mmask16) __builtin_ia32_pcmpeqw256_mask ((__v16hi) __A, + (__v16hi) __B, + (__mmask16) -1); +} +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm256_mask_cmpeq_epu16_mask (__mmask16 __U, __m256i __A, __m256i __B) +{ + return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __A, + (__v16hi) __B, 0, + __U); +} +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpeq_epi16_mask (__mmask16 __U, __m256i __A, __m256i __B) +{ + return (__mmask16) __builtin_ia32_pcmpeqw256_mask ((__v16hi) __A, + (__v16hi) __B, + __U); +} +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_epu8_mask (__m128i __A, __m128i __B) +{ + return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __A, + (__v16qi) __B, 6, + (__mmask16) -1); +} +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_epi8_mask (__m128i __A, __m128i __B) +{ + return (__mmask16) __builtin_ia32_pcmpgtb128_mask ((__v16qi) __A, + (__v16qi) __B, + (__mmask16) -1); +} +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpgt_epu8_mask (__mmask16 __U, __m128i __A, __m128i __B) +{ + return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __A, + (__v16qi) __B, 6, + __U); +} +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpgt_epi8_mask (__mmask16 __U, __m128i __A, __m128i __B) +{ + return (__mmask16) __builtin_ia32_pcmpgtb128_mask ((__v16qi) __A, + (__v16qi) __B, + __U); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpgt_epu8_mask (__m256i __A, __m256i __B) +{ + return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __A, + (__v32qi) __B, 6, + (__mmask32) -1); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpgt_epi8_mask (__m256i __A, __m256i __B) +{ + return (__mmask32) __builtin_ia32_pcmpgtb256_mask ((__v32qi) __A, + (__v32qi) __B, + (__mmask32) -1); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpgt_epu8_mask (__mmask32 __U, __m256i __A, __m256i __B) +{ + return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __A, + (__v32qi) __B, 6, + __U); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpgt_epi8_mask (__mmask32 __U, __m256i __A, __m256i __B) +{ + return (__mmask32) __builtin_ia32_pcmpgtb256_mask ((__v32qi) __A, + (__v32qi) __B, + __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_epu16_mask (__m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __A, + (__v8hi) __B, 6, + (__mmask8) -1); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_epi16_mask (__m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_pcmpgtw128_mask ((__v8hi) __A, + (__v8hi) __B, + (__mmask8) -1); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpgt_epu16_mask (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __A, + (__v8hi) __B, 6, __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpgt_epi16_mask (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_pcmpgtw128_mask ((__v8hi) __A, + (__v8hi) __B, __U); +} +extern 
__inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpgt_epu16_mask (__m256i __A, __m256i __B) +{ + return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __A, + (__v16hi) __B, 6, + (__mmask16) -1); +} +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpgt_epi16_mask (__m256i __A, __m256i __B) +{ + return (__mmask16) __builtin_ia32_pcmpgtw256_mask ((__v16hi) __A, + (__v16hi) __B, + (__mmask16) -1); +} +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpgt_epu16_mask (__mmask16 __U, __m256i __A, __m256i __B) +{ + return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __A, + (__v16hi) __B, 6, + __U); +} +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpgt_epi16_mask (__mmask16 __U, __m256i __A, __m256i __B) +{ + return (__mmask16) __builtin_ia32_pcmpgtw256_mask ((__v16hi) __A, + (__v16hi) __B, + __U); +} +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_testn_epi8_mask (__m128i __A, __m128i __B) +{ + return (__mmask16) __builtin_ia32_ptestnmb128 ((__v16qi) __A, + (__v16qi) __B, + (__mmask16) -1); +} +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_testn_epi8_mask (__mmask16 __U, __m128i __A, __m128i __B) +{ + return (__mmask16) __builtin_ia32_ptestnmb128 ((__v16qi) __A, + (__v16qi) __B, __U); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_testn_epi8_mask (__m256i __A, __m256i __B) +{ + return (__mmask32) __builtin_ia32_ptestnmb256 ((__v32qi) __A, + (__v32qi) __B, + (__mmask32) -1); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_testn_epi8_mask (__mmask32 __U, __m256i __A, __m256i __B) +{ + return (__mmask32) __builtin_ia32_ptestnmb256 ((__v32qi) __A, + (__v32qi) __B, __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_testn_epi16_mask (__m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_ptestnmw128 ((__v8hi) __A, + (__v8hi) __B, + (__mmask8) -1); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_testn_epi16_mask (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_ptestnmw128 ((__v8hi) __A, + (__v8hi) __B, __U); +} +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_testn_epi16_mask (__m256i __A, __m256i __B) +{ + return (__mmask16) __builtin_ia32_ptestnmw256 ((__v16hi) __A, + (__v16hi) __B, + (__mmask16) -1); +} +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_testn_epi16_mask (__mmask16 __U, __m256i __A, __m256i __B) +{ + return (__mmask16) __builtin_ia32_ptestnmw256 ((__v16hi) __A, + (__v16hi) __B, __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_shuffle_epi8 (__m256i __W, __mmask32 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pshufb256_mask ((__v32qi) __A, + (__v32qi) __B, + (__v32qi) __W, + (__mmask32) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_shuffle_epi8 (__mmask32 __U, __m256i __A, __m256i __B) +{ + return 
(__m256i) __builtin_ia32_pshufb256_mask ((__v32qi) __A, + (__v32qi) __B, + (__v32qi) + _mm256_setzero_si256 (), + (__mmask32) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_shuffle_epi8 (__m128i __W, __mmask16 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pshufb128_mask ((__v16qi) __A, + (__v16qi) __B, + (__v16qi) __W, + (__mmask16) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_shuffle_epi8 (__mmask16 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pshufb128_mask ((__v16qi) __A, + (__v16qi) __B, + (__v16qi) + _mm_setzero_si128 (), + (__mmask16) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_packs_epi16 (__mmask32 __M, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_packsswb256_mask ((__v16hi) __A, + (__v16hi) __B, + (__v32qi) + _mm256_setzero_si256 (), + __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_packs_epi16 (__m256i __W, __mmask32 __M, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_packsswb256_mask ((__v16hi) __A, + (__v16hi) __B, + (__v32qi) __W, + __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_packs_epi16 (__mmask16 __M, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_packsswb128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v16qi) + _mm_setzero_si128 (), + __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_packs_epi16 (__m128i __W, __mmask16 __M, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_packsswb128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v16qi) __W, + __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_packus_epi16 (__mmask32 __M, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_packuswb256_mask ((__v16hi) __A, + (__v16hi) __B, + (__v32qi) + _mm256_setzero_si256 (), + __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_packus_epi16 (__m256i __W, __mmask32 __M, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_packuswb256_mask ((__v16hi) __A, + (__v16hi) __B, + (__v32qi) __W, + __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_packus_epi16 (__mmask16 __M, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_packuswb128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v16qi) + _mm_setzero_si128 (), + __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_packus_epi16 (__m128i __W, __mmask16 __M, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_packuswb128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v16qi) __W, + __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_abs_epi8 (__m256i __W, __mmask32 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_pabsb256_mask ((__v32qi) __A, + (__v32qi) __W, + (__mmask32) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_abs_epi8 (__mmask32 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_pabsb256_mask ((__v32qi) __A, 
+ (__v32qi) + _mm256_setzero_si256 (), + (__mmask32) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_abs_epi8 (__m128i __W, __mmask16 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_pabsb128_mask ((__v16qi) __A, + (__v16qi) __W, + (__mmask16) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_abs_epi8 (__mmask16 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_pabsb128_mask ((__v16qi) __A, + (__v16qi) + _mm_setzero_si128 (), + (__mmask16) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_abs_epi16 (__m256i __W, __mmask16 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_pabsw256_mask ((__v16hi) __A, + (__v16hi) __W, + (__mmask16) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_abs_epi16 (__mmask16 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_pabsw256_mask ((__v16hi) __A, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_abs_epi16 (__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_pabsw128_mask ((__v8hi) __A, + (__v8hi) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_abs_epi16 (__mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_pabsw128_mask ((__v8hi) __A, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __mmask32 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpneq_epu8_mask (__m256i __X, __m256i __Y) +{ + return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X, + (__v32qi) __Y, 4, + (__mmask32) -1); +} +extern __inline __mmask32 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmplt_epu8_mask (__m256i __X, __m256i __Y) +{ + return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X, + (__v32qi) __Y, 1, + (__mmask32) -1); +} +extern __inline __mmask32 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpge_epu8_mask (__m256i __X, __m256i __Y) +{ + return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X, + (__v32qi) __Y, 5, + (__mmask32) -1); +} +extern __inline __mmask32 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmple_epu8_mask (__m256i __X, __m256i __Y) +{ + return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X, + (__v32qi) __Y, 2, + (__mmask32) -1); +} +extern __inline __mmask16 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpneq_epu16_mask (__m256i __X, __m256i __Y) +{ + return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __X, + (__v16hi) __Y, 4, + (__mmask16) -1); +} +extern __inline __mmask16 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmplt_epu16_mask (__m256i __X, __m256i __Y) +{ + return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __X, + (__v16hi) __Y, 1, + (__mmask16) -1); +} +extern __inline __mmask16 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpge_epu16_mask (__m256i __X, __m256i __Y) +{ + return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __X, + (__v16hi) __Y, 5, + (__mmask16) -1); +} +extern __inline __mmask16 + __attribute__ ((__gnu_inline__, 
__always_inline__, __artificial__)) +_mm256_cmple_epu16_mask (__m256i __X, __m256i __Y) +{ + return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __X, + (__v16hi) __Y, 2, + (__mmask16) -1); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_storeu_epi16 (void *__P, __m256i __A) +{ + *(__v16hi_u *) __P = (__v16hi_u) __A; +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_storeu_epi16 (void *__P, __mmask16 __U, __m256i __A) +{ + __builtin_ia32_storedquhi256_mask ((short *) __P, + (__v16hi) __A, + (__mmask16) __U); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storeu_epi16 (void *__P, __m128i __A) +{ + *(__v8hi_u *) __P = (__v8hi_u) __A; +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_storeu_epi16 (void *__P, __mmask8 __U, __m128i __A) +{ + __builtin_ia32_storedquhi128_mask ((short *) __P, + (__v8hi) __A, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_adds_epi16 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_paddsw128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_subs_epi8 (__m128i __W, __mmask16 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_psubsb128_mask ((__v16qi) __A, + (__v16qi) __B, + (__v16qi) __W, + (__mmask16) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_subs_epi8 (__mmask16 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_psubsb128_mask ((__v16qi) __A, + (__v16qi) __B, + (__v16qi) + _mm_setzero_si128 (), + (__mmask16) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_subs_epi16 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_psubsw128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_subs_epi16 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_psubsw128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_subs_epu8 (__m128i __W, __mmask16 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_psubusb128_mask ((__v16qi) __A, + (__v16qi) __B, + (__v16qi) __W, + (__mmask16) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_subs_epu8 (__mmask16 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_psubusb128_mask ((__v16qi) __A, + (__v16qi) __B, + (__v16qi) + _mm_setzero_si128 (), + (__mmask16) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_subs_epu16 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_psubusw128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_subs_epu16 (__mmask8 
__U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_psubusw128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_srl_epi16 (__m256i __W, __mmask16 __U, __m256i __A, + __m128i __B) +{ + return (__m256i) __builtin_ia32_psrlw256_mask ((__v16hi) __A, + (__v8hi) __B, + (__v16hi) __W, + (__mmask16) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_srl_epi16 (__mmask16 __U, __m256i __A, __m128i __B) +{ + return (__m256i) __builtin_ia32_psrlw256_mask ((__v16hi) __A, + (__v8hi) __B, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_srl_epi16 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_psrlw128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_srl_epi16 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_psrlw128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_sra_epi16 (__m256i __W, __mmask16 __U, __m256i __A, + __m128i __B) +{ + return (__m256i) __builtin_ia32_psraw256_mask ((__v16hi) __A, + (__v8hi) __B, + (__v16hi) __W, + (__mmask16) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_sra_epi16 (__mmask16 __U, __m256i __A, __m128i __B) +{ + return (__m256i) __builtin_ia32_psraw256_mask ((__v16hi) __A, + (__v8hi) __B, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sra_epi16 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_psraw128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sra_epi16 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_psraw128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_adds_epi16 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_paddsw128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_adds_epu8 (__m128i __W, __mmask16 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_paddusb128_mask ((__v16qi) __A, + (__v16qi) __B, + (__v16qi) __W, + (__mmask16) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_adds_epu8 (__mmask16 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_paddusb128_mask ((__v16qi) __A, + (__v16qi) __B, + (__v16qi) + _mm_setzero_si128 (), + (__mmask16) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm_mask_adds_epu16 (__m128i __W, __mmask8 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_paddusw128_mask ((__v8hi) __A,
+ (__v8hi) __B,
+ (__v8hi) __W,
+ (__mmask8) __U);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_adds_epu16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_paddusw128_mask ((__v8hi) __A,
+ (__v8hi) __B,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sub_epi8 (__m128i __W, __mmask16 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_psubb128_mask ((__v16qi) __A,
+ (__v16qi) __B,
+ (__v16qi) __W,
+ (__mmask16) __U);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sub_epi8 (__mmask16 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_psubb128_mask ((__v16qi) __A,
+ (__v16qi) __B,
+ (__v16qi)
+ _mm_setzero_si128 (),
+ (__mmask16) __U);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sub_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_psubw128_mask ((__v8hi) __A,
+ (__v8hi) __B,
+ (__v8hi) __W,
+ (__mmask8) __U);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sub_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_psubw128_mask ((__v8hi) __A,
+ (__v8hi) __B,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_adds_epi8 (__m128i __W, __mmask16 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_paddsb128_mask ((__v16qi) __A,
+ (__v16qi) __B,
+ (__v16qi) __W,
+ (__mmask16) __U);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_adds_epi8 (__mmask16 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_paddsb128_mask ((__v16qi) __A,
+ (__v16qi) __B,
+ (__v16qi)
+ _mm_setzero_si128 (),
+ (__mmask16) __U);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi16_epi8 (__m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovwb128_mask ((__v8hi) __A,
+ (__v16qi) _mm_undefined_si128 (),
+ (__mmask8) -1);
+}
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepi16_storeu_epi8 (void *__P, __mmask8 __M, __m128i __A)
+{
+ __builtin_ia32_pmovwb128mem_mask ((unsigned long long *) __P, (__v8hi) __A, __M);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepi16_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovwb128_mask ((__v8hi) __A,
+ (__v16qi) __O, __M);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepi16_epi8 (__mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovwb128_mask ((__v8hi) __A,
+ (__v16qi)
+ _mm_setzero_si128 (),
+ __M);
+}
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srav_epi16 (__m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_psrav16hi_mask ((__v16hi) __A,
+ (__v16hi) __B,
+ (__v16hi)
+ _mm256_setzero_si256 (),
+
(__mmask16) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_srav_epi16 (__m256i __W, __mmask16 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_psrav16hi_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) __W, + (__mmask16) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_srav_epi16 (__mmask16 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_psrav16hi_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srav_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_psrav8hi_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) -1); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_srav_epi16 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_psrav8hi_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_srav_epi16 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_psrav8hi_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_srlv_epi16 (__m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_psrlv16hi_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_srlv_epi16 (__m256i __W, __mmask16 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_psrlv16hi_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) __W, + (__mmask16) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_srlv_epi16 (__mmask16 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_psrlv16hi_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srlv_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_psrlv8hi_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) -1); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_srlv_epi16 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_psrlv8hi_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_srlv_epi16 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_psrlv8hi_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_sllv_epi16 (__m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_psllv16hi_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) -1); +} +extern __inline 
__m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_sllv_epi16 (__m256i __W, __mmask16 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_psllv16hi_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) __W, + (__mmask16) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_sllv_epi16 (__mmask16 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_psllv16hi_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sllv_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_psllv8hi_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) -1); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sllv_epi16 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_psllv8hi_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sllv_epi16 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_psllv8hi_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sll_epi16 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_psllw128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sll_epi16 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_psllw128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_sll_epi16 (__m256i __W, __mmask16 __U, __m256i __A, + __m128i __B) +{ + return (__m256i) __builtin_ia32_psllw256_mask ((__v16hi) __A, + (__v8hi) __B, + (__v16hi) __W, + (__mmask16) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_sll_epi16 (__mmask16 __U, __m256i __A, __m128i __B) +{ + return (__m256i) __builtin_ia32_psllw256_mask ((__v16hi) __A, + (__v8hi) __B, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_packus_epi32 (__mmask16 __M, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_packusdw256_mask ((__v8si) __A, + (__v8si) __B, + (__v16hi) + _mm256_setzero_si256 (), + __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_packus_epi32 (__m256i __W, __mmask16 __M, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_packusdw256_mask ((__v8si) __A, + (__v8si) __B, + (__v16hi) __W, + __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_packus_epi32 (__mmask8 __M, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_packusdw128_mask ((__v4si) __A, + (__v4si) __B, + (__v8hi) + _mm_setzero_si128 (), + __M); +} +extern __inline __m128i 
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_packus_epi32 (__m128i __W, __mmask8 __M, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_packusdw128_mask ((__v4si) __A, + (__v4si) __B, + (__v8hi) __W, __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_packs_epi32 (__mmask16 __M, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_packssdw256_mask ((__v8si) __A, + (__v8si) __B, + (__v16hi) + _mm256_setzero_si256 (), + __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_packs_epi32 (__m256i __W, __mmask16 __M, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_packssdw256_mask ((__v8si) __A, + (__v8si) __B, + (__v16hi) __W, + __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_packs_epi32 (__mmask8 __M, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_packssdw128_mask ((__v4si) __A, + (__v4si) __B, + (__v8hi) + _mm_setzero_si128 (), + __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_packs_epi32 (__m128i __W, __mmask8 __M, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_packssdw128_mask ((__v4si) __A, + (__v4si) __B, + (__v8hi) __W, __M); +} +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpneq_epu8_mask (__mmask16 __M, __m128i __X, __m128i __Y) +{ + return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __X, + (__v16qi) __Y, 4, + (__mmask16) __M); +} +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmplt_epu8_mask (__mmask16 __M, __m128i __X, __m128i __Y) +{ + return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __X, + (__v16qi) __Y, 1, + (__mmask16) __M); +} +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpge_epu8_mask (__mmask16 __M, __m128i __X, __m128i __Y) +{ + return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __X, + (__v16qi) __Y, 5, + (__mmask16) __M); +} +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmple_epu8_mask (__mmask16 __M, __m128i __X, __m128i __Y) +{ + return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __X, + (__v16qi) __Y, 2, + (__mmask16) __M); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpneq_epu16_mask (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __X, + (__v8hi) __Y, 4, + (__mmask8) __M); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmplt_epu16_mask (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __X, + (__v8hi) __Y, 1, + (__mmask8) __M); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpge_epu16_mask (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __X, + (__v8hi) __Y, 5, + (__mmask8) __M); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmple_epu16_mask (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpw128_mask 
((__v8hi) __X, + (__v8hi) __Y, 2, + (__mmask8) __M); +} +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpneq_epi8_mask (__mmask16 __M, __m128i __X, __m128i __Y) +{ + return (__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi) __X, + (__v16qi) __Y, 4, + (__mmask16) __M); +} +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmplt_epi8_mask (__mmask16 __M, __m128i __X, __m128i __Y) +{ + return (__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi) __X, + (__v16qi) __Y, 1, + (__mmask16) __M); +} +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpge_epi8_mask (__mmask16 __M, __m128i __X, __m128i __Y) +{ + return (__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi) __X, + (__v16qi) __Y, 5, + (__mmask16) __M); +} +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmple_epi8_mask (__mmask16 __M, __m128i __X, __m128i __Y) +{ + return (__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi) __X, + (__v16qi) __Y, 2, + (__mmask16) __M); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpneq_epi16_mask (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi) __X, + (__v8hi) __Y, 4, + (__mmask8) __M); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmplt_epi16_mask (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi) __X, + (__v8hi) __Y, 1, + (__mmask8) __M); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpge_epi16_mask (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi) __X, + (__v8hi) __Y, 5, + (__mmask8) __M); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmple_epi16_mask (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi) __X, + (__v8hi) __Y, 2, + (__mmask8) __M); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpneq_epu8_mask (__mmask32 __M, __m256i __X, __m256i __Y) +{ + return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X, + (__v32qi) __Y, 4, + (__mmask32) __M); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmplt_epu8_mask (__mmask32 __M, __m256i __X, __m256i __Y) +{ + return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X, + (__v32qi) __Y, 1, + (__mmask32) __M); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpge_epu8_mask (__mmask32 __M, __m256i __X, __m256i __Y) +{ + return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X, + (__v32qi) __Y, 5, + (__mmask32) __M); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmple_epu8_mask (__mmask32 __M, __m256i __X, __m256i __Y) +{ + return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X, + (__v32qi) __Y, 2, + (__mmask32) __M); +} +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpneq_epu16_mask (__mmask16 __M, __m256i __X, __m256i __Y) +{ + return (__mmask16) 
__builtin_ia32_ucmpw256_mask ((__v16hi) __X, + (__v16hi) __Y, 4, + (__mmask16) __M); +} +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmplt_epu16_mask (__mmask16 __M, __m256i __X, __m256i __Y) +{ + return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __X, + (__v16hi) __Y, 1, + (__mmask16) __M); +} +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpge_epu16_mask (__mmask16 __M, __m256i __X, __m256i __Y) +{ + return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __X, + (__v16hi) __Y, 5, + (__mmask16) __M); +} +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmple_epu16_mask (__mmask16 __M, __m256i __X, __m256i __Y) +{ + return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __X, + (__v16hi) __Y, 2, + (__mmask16) __M); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpneq_epi8_mask (__mmask32 __M, __m256i __X, __m256i __Y) +{ + return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X, + (__v32qi) __Y, 4, + (__mmask32) __M); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmplt_epi8_mask (__mmask32 __M, __m256i __X, __m256i __Y) +{ + return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X, + (__v32qi) __Y, 1, + (__mmask32) __M); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpge_epi8_mask (__mmask32 __M, __m256i __X, __m256i __Y) +{ + return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X, + (__v32qi) __Y, 5, + (__mmask32) __M); +} +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmple_epi8_mask (__mmask32 __M, __m256i __X, __m256i __Y) +{ + return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X, + (__v32qi) __Y, 2, + (__mmask32) __M); +} +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpneq_epi16_mask (__mmask16 __M, __m256i __X, __m256i __Y) +{ + return (__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi) __X, + (__v16hi) __Y, 4, + (__mmask16) __M); +} +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmplt_epi16_mask (__mmask16 __M, __m256i __X, __m256i __Y) +{ + return (__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi) __X, + (__v16hi) __Y, 1, + (__mmask16) __M); +} +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpge_epi16_mask (__mmask16 __M, __m256i __X, __m256i __Y) +{ + return (__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi) __X, + (__v16hi) __Y, 5, + (__mmask16) __M); +} +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmple_epi16_mask (__mmask16 __M, __m256i __X, __m256i __Y) +{ + return (__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi) __X, + (__v16hi) __Y, 2, + (__mmask16) __M); +} #ifdef __DISABLE_AVX512VLBW__ #undef __DISABLE_AVX512VLBW__ #pragma GCC pop_options -#endif /* __DISABLE_AVX512VLBW__ */ - -#endif /* _AVX512VLBWINTRIN_H_INCLUDED */ +#endif +#endif +#endif diff --git a/third_party/intel/avx512vldqintrin.internal.h b/third_party/intel/avx512vldqintrin.internal.h index c3ac74c67..2e7d56993 100644 --- a/third_party/intel/avx512vldqintrin.internal.h +++ 
b/third_party/intel/avx512vldqintrin.internal.h @@ -1,1159 +1,1658 @@ +/* clang-format off */ +#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0) #ifndef _IMMINTRIN_H_INCLUDED #error "Never use directly; include instead." #endif - #ifndef _AVX512VLDQINTRIN_H_INCLUDED #define _AVX512VLDQINTRIN_H_INCLUDED - #if !defined(__AVX512VL__) || !defined(__AVX512DQ__) #pragma GCC push_options #pragma GCC target("avx512vl,avx512dq") #define __DISABLE_AVX512VLDQ__ -#endif /* __AVX512VLDQ__ */ - -__funline __m256i _mm256_cvttpd_epi64(__m256d __A) { - return (__m256i)__builtin_ia32_cvttpd2qq256_mask( - (__v4df)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)-1); -} - -__funline __m256i _mm256_mask_cvttpd_epi64(__m256i __W, __mmask8 __U, - __m256d __A) { - return (__m256i)__builtin_ia32_cvttpd2qq256_mask((__v4df)__A, (__v4di)__W, - (__mmask8)__U); -} - -__funline __m256i _mm256_maskz_cvttpd_epi64(__mmask8 __U, __m256d __A) { - return (__m256i)__builtin_ia32_cvttpd2qq256_mask( - (__v4df)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); -} - -__funline __m128i _mm_cvttpd_epi64(__m128d __A) { - return (__m128i)__builtin_ia32_cvttpd2qq128_mask( - (__v2df)__A, (__v2di)_mm_setzero_si128(), (__mmask8)-1); -} - -__funline __m128i _mm_mask_cvttpd_epi64(__m128i __W, __mmask8 __U, __m128d __A) { - return (__m128i)__builtin_ia32_cvttpd2qq128_mask((__v2df)__A, (__v2di)__W, - (__mmask8)__U); -} - -__funline __m128i _mm_maskz_cvttpd_epi64(__mmask8 __U, __m128d __A) { - return (__m128i)__builtin_ia32_cvttpd2qq128_mask( - (__v2df)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m256i _mm256_cvttpd_epu64(__m256d __A) { - return (__m256i)__builtin_ia32_cvttpd2uqq256_mask( - (__v4df)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)-1); -} - -__funline __m256i _mm256_mask_cvttpd_epu64(__m256i __W, __mmask8 __U, - __m256d __A) { - return (__m256i)__builtin_ia32_cvttpd2uqq256_mask((__v4df)__A, (__v4di)__W, - (__mmask8)__U); -} - -__funline __m256i _mm256_maskz_cvttpd_epu64(__mmask8 __U, __m256d __A) { - return (__m256i)__builtin_ia32_cvttpd2uqq256_mask( - (__v4df)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); -} - -__funline __m128i _mm_cvttpd_epu64(__m128d __A) { - return (__m128i)__builtin_ia32_cvttpd2uqq128_mask( - (__v2df)__A, (__v2di)_mm_setzero_si128(), (__mmask8)-1); -} - -__funline __m128i _mm_mask_cvttpd_epu64(__m128i __W, __mmask8 __U, __m128d __A) { - return (__m128i)__builtin_ia32_cvttpd2uqq128_mask((__v2df)__A, (__v2di)__W, - (__mmask8)__U); -} - -__funline __m128i _mm_maskz_cvttpd_epu64(__mmask8 __U, __m128d __A) { - return (__m128i)__builtin_ia32_cvttpd2uqq128_mask( - (__v2df)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m256i _mm256_cvtpd_epi64(__m256d __A) { - return (__m256i)__builtin_ia32_cvtpd2qq256_mask( - (__v4df)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)-1); -} - -__funline __m256i _mm256_mask_cvtpd_epi64(__m256i __W, __mmask8 __U, - __m256d __A) { - return (__m256i)__builtin_ia32_cvtpd2qq256_mask((__v4df)__A, (__v4di)__W, - (__mmask8)__U); -} - -__funline __m256i _mm256_maskz_cvtpd_epi64(__mmask8 __U, __m256d __A) { - return (__m256i)__builtin_ia32_cvtpd2qq256_mask( - (__v4df)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); -} - -__funline __m128i _mm_cvtpd_epi64(__m128d __A) { - return (__m128i)__builtin_ia32_cvtpd2qq128_mask( - (__v2df)__A, (__v2di)_mm_setzero_si128(), (__mmask8)-1); -} - -__funline __m128i _mm_mask_cvtpd_epi64(__m128i __W, __mmask8 __U, __m128d __A) { - return (__m128i)__builtin_ia32_cvtpd2qq128_mask((__v2df)__A, 
(__v2di)__W, - (__mmask8)__U); -} - -__funline __m128i _mm_maskz_cvtpd_epi64(__mmask8 __U, __m128d __A) { - return (__m128i)__builtin_ia32_cvtpd2qq128_mask( - (__v2df)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m256i _mm256_cvtpd_epu64(__m256d __A) { - return (__m256i)__builtin_ia32_cvtpd2uqq256_mask( - (__v4df)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)-1); -} - -__funline __m256i _mm256_mask_cvtpd_epu64(__m256i __W, __mmask8 __U, - __m256d __A) { - return (__m256i)__builtin_ia32_cvtpd2uqq256_mask((__v4df)__A, (__v4di)__W, - (__mmask8)__U); -} - -__funline __m256i _mm256_maskz_cvtpd_epu64(__mmask8 __U, __m256d __A) { - return (__m256i)__builtin_ia32_cvtpd2uqq256_mask( - (__v4df)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); -} - -__funline __m128i _mm_cvtpd_epu64(__m128d __A) { - return (__m128i)__builtin_ia32_cvtpd2uqq128_mask( - (__v2df)__A, (__v2di)_mm_setzero_si128(), (__mmask8)-1); -} - -__funline __m128i _mm_mask_cvtpd_epu64(__m128i __W, __mmask8 __U, __m128d __A) { - return (__m128i)__builtin_ia32_cvtpd2uqq128_mask((__v2df)__A, (__v2di)__W, - (__mmask8)__U); -} - -__funline __m128i _mm_maskz_cvtpd_epu64(__mmask8 __U, __m128d __A) { - return (__m128i)__builtin_ia32_cvtpd2uqq128_mask( - (__v2df)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m256i _mm256_cvttps_epi64(__m128 __A) { - return (__m256i)__builtin_ia32_cvttps2qq256_mask( - (__v4sf)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)-1); -} - -__funline __m256i _mm256_mask_cvttps_epi64(__m256i __W, __mmask8 __U, - __m128 __A) { - return (__m256i)__builtin_ia32_cvttps2qq256_mask((__v4sf)__A, (__v4di)__W, - (__mmask8)__U); -} - -__funline __m256i _mm256_maskz_cvttps_epi64(__mmask8 __U, __m128 __A) { - return (__m256i)__builtin_ia32_cvttps2qq256_mask( - (__v4sf)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); -} - -__funline __m128i _mm_cvttps_epi64(__m128 __A) { - return (__m128i)__builtin_ia32_cvttps2qq128_mask( - (__v4sf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)-1); -} - -__funline __m128i _mm_mask_cvttps_epi64(__m128i __W, __mmask8 __U, __m128 __A) { - return (__m128i)__builtin_ia32_cvttps2qq128_mask((__v4sf)__A, (__v2di)__W, - (__mmask8)__U); -} - -__funline __m128i _mm_maskz_cvttps_epi64(__mmask8 __U, __m128 __A) { - return (__m128i)__builtin_ia32_cvttps2qq128_mask( - (__v4sf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m256i _mm256_cvttps_epu64(__m128 __A) { - return (__m256i)__builtin_ia32_cvttps2uqq256_mask( - (__v4sf)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)-1); -} - -__funline __m256i _mm256_mask_cvttps_epu64(__m256i __W, __mmask8 __U, - __m128 __A) { - return (__m256i)__builtin_ia32_cvttps2uqq256_mask((__v4sf)__A, (__v4di)__W, - (__mmask8)__U); -} - -__funline __m256i _mm256_maskz_cvttps_epu64(__mmask8 __U, __m128 __A) { - return (__m256i)__builtin_ia32_cvttps2uqq256_mask( - (__v4sf)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); -} - -__funline __m128i _mm_cvttps_epu64(__m128 __A) { - return (__m128i)__builtin_ia32_cvttps2uqq128_mask( - (__v4sf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)-1); -} - -__funline __m128i _mm_mask_cvttps_epu64(__m128i __W, __mmask8 __U, __m128 __A) { - return (__m128i)__builtin_ia32_cvttps2uqq128_mask((__v4sf)__A, (__v2di)__W, - (__mmask8)__U); -} - -__funline __m128i _mm_maskz_cvttps_epu64(__mmask8 __U, __m128 __A) { - return (__m128i)__builtin_ia32_cvttps2uqq128_mask( - (__v4sf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m256d _mm256_broadcast_f64x2(__m128d __A) { - return 
(__m256d)__builtin_ia32_broadcastf64x2_256_mask( - (__v2df)__A, (__v4df)_mm256_undefined_pd(), (__mmask8)-1); -} - -__funline __m256d _mm256_mask_broadcast_f64x2(__m256d __O, __mmask8 __M, - __m128d __A) { - return (__m256d)__builtin_ia32_broadcastf64x2_256_mask((__v2df)__A, - (__v4df)__O, __M); -} - -__funline __m256d _mm256_maskz_broadcast_f64x2(__mmask8 __M, __m128d __A) { - return (__m256d)__builtin_ia32_broadcastf64x2_256_mask( - (__v2df)__A, (__v4df)_mm256_setzero_ps(), __M); -} - -__funline __m256i _mm256_broadcast_i64x2(__m128i __A) { - return (__m256i)__builtin_ia32_broadcasti64x2_256_mask( - (__v2di)__A, (__v4di)_mm256_undefined_si256(), (__mmask8)-1); -} - -__funline __m256i _mm256_mask_broadcast_i64x2(__m256i __O, __mmask8 __M, - __m128i __A) { - return (__m256i)__builtin_ia32_broadcasti64x2_256_mask((__v2di)__A, - (__v4di)__O, __M); -} - -__funline __m256i _mm256_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A) { - return (__m256i)__builtin_ia32_broadcasti64x2_256_mask( - (__v2di)__A, (__v4di)_mm256_setzero_si256(), __M); -} - -__funline __m256 _mm256_broadcast_f32x2(__m128 __A) { - return (__m256)__builtin_ia32_broadcastf32x2_256_mask( - (__v4sf)__A, (__v8sf)_mm256_undefined_ps(), (__mmask8)-1); -} - -__funline __m256 _mm256_mask_broadcast_f32x2(__m256 __O, __mmask8 __M, - __m128 __A) { - return (__m256)__builtin_ia32_broadcastf32x2_256_mask((__v4sf)__A, - (__v8sf)__O, __M); -} - -__funline __m256 _mm256_maskz_broadcast_f32x2(__mmask8 __M, __m128 __A) { - return (__m256)__builtin_ia32_broadcastf32x2_256_mask( - (__v4sf)__A, (__v8sf)_mm256_setzero_ps(), __M); -} - -__funline __m256i _mm256_broadcast_i32x2(__m128i __A) { - return (__m256i)__builtin_ia32_broadcasti32x2_256_mask( - (__v4si)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1); -} - -__funline __m256i _mm256_mask_broadcast_i32x2(__m256i __O, __mmask8 __M, - __m128i __A) { - return (__m256i)__builtin_ia32_broadcasti32x2_256_mask((__v4si)__A, - (__v8si)__O, __M); -} - -__funline __m256i _mm256_maskz_broadcast_i32x2(__mmask8 __M, __m128i __A) { - return (__m256i)__builtin_ia32_broadcasti32x2_256_mask( - (__v4si)__A, (__v8si)_mm256_setzero_si256(), __M); -} - -__funline __m128i _mm_broadcast_i32x2(__m128i __A) { - return (__m128i)__builtin_ia32_broadcasti32x2_128_mask( - (__v4si)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1); -} - -__funline __m128i _mm_mask_broadcast_i32x2(__m128i __O, __mmask8 __M, - __m128i __A) { - return (__m128i)__builtin_ia32_broadcasti32x2_128_mask((__v4si)__A, - (__v4si)__O, __M); -} - -__funline __m128i _mm_maskz_broadcast_i32x2(__mmask8 __M, __m128i __A) { - return (__m128i)__builtin_ia32_broadcasti32x2_128_mask( - (__v4si)__A, (__v4si)_mm_setzero_si128(), __M); -} - -__funline __m256i _mm256_mullo_epi64(__m256i __A, __m256i __B) { - return (__m256i)((__v4du)__A * (__v4du)__B); -} - -__funline __m256i _mm256_mask_mullo_epi64(__m256i __W, __mmask8 __U, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_pmullq256_mask((__v4di)__A, (__v4di)__B, - (__v4di)__W, (__mmask8)__U); -} - -__funline __m256i _mm256_maskz_mullo_epi64(__mmask8 __U, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_pmullq256_mask( - (__v4di)__A, (__v4di)__B, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); -} - -__funline __m128i _mm_mullo_epi64(__m128i __A, __m128i __B) { - return (__m128i)((__v2du)__A * (__v2du)__B); -} - -__funline __m128i _mm_mask_mullo_epi64(__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_pmullq128_mask((__v2di)__A, (__v2di)__B, - 
(__v2di)__W, (__mmask8)__U); -} - -__funline __m128i _mm_maskz_mullo_epi64(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_pmullq128_mask( - (__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m256d _mm256_mask_andnot_pd(__m256d __W, __mmask8 __U, __m256d __A, - __m256d __B) { - return (__m256d)__builtin_ia32_andnpd256_mask((__v4df)__A, (__v4df)__B, - (__v4df)__W, (__mmask8)__U); -} - -__funline __m256d _mm256_maskz_andnot_pd(__mmask8 __U, __m256d __A, __m256d __B) { - return (__m256d)__builtin_ia32_andnpd256_mask( - (__v4df)__A, (__v4df)__B, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); -} - -__funline __m128d _mm_mask_andnot_pd(__m128d __W, __mmask8 __U, __m128d __A, - __m128d __B) { - return (__m128d)__builtin_ia32_andnpd128_mask((__v2df)__A, (__v2df)__B, - (__v2df)__W, (__mmask8)__U); -} - -__funline __m128d _mm_maskz_andnot_pd(__mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_andnpd128_mask( - (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U); -} - -__funline __m256 _mm256_mask_andnot_ps(__m256 __W, __mmask8 __U, __m256 __A, - __m256 __B) { - return (__m256)__builtin_ia32_andnps256_mask((__v8sf)__A, (__v8sf)__B, - (__v8sf)__W, (__mmask8)__U); -} - -__funline __m256 _mm256_maskz_andnot_ps(__mmask8 __U, __m256 __A, __m256 __B) { - return (__m256)__builtin_ia32_andnps256_mask( - (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); -} - -__funline __m128 _mm_mask_andnot_ps(__m128 __W, __mmask8 __U, __m128 __A, - __m128 __B) { - return (__m128)__builtin_ia32_andnps128_mask((__v4sf)__A, (__v4sf)__B, - (__v4sf)__W, (__mmask8)__U); -} - -__funline __m128 _mm_maskz_andnot_ps(__mmask8 __U, __m128 __A, __m128 __B) { - return (__m128)__builtin_ia32_andnps128_mask( - (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); -} - -__funline __m256i _mm256_cvtps_epi64(__m128 __A) { - return (__m256i)__builtin_ia32_cvtps2qq256_mask( - (__v4sf)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)-1); -} - -__funline __m256i _mm256_mask_cvtps_epi64(__m256i __W, __mmask8 __U, __m128 __A) { - return (__m256i)__builtin_ia32_cvtps2qq256_mask((__v4sf)__A, (__v4di)__W, - (__mmask8)__U); -} - -__funline __m256i _mm256_maskz_cvtps_epi64(__mmask8 __U, __m128 __A) { - return (__m256i)__builtin_ia32_cvtps2qq256_mask( - (__v4sf)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); -} - -__funline __m128i _mm_cvtps_epi64(__m128 __A) { - return (__m128i)__builtin_ia32_cvtps2qq128_mask( - (__v4sf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)-1); -} - -__funline __m128i _mm_mask_cvtps_epi64(__m128i __W, __mmask8 __U, __m128 __A) { - return (__m128i)__builtin_ia32_cvtps2qq128_mask((__v4sf)__A, (__v2di)__W, - (__mmask8)__U); -} - -__funline __m128i _mm_maskz_cvtps_epi64(__mmask8 __U, __m128 __A) { - return (__m128i)__builtin_ia32_cvtps2qq128_mask( - (__v4sf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m256i _mm256_cvtps_epu64(__m128 __A) { - return (__m256i)__builtin_ia32_cvtps2uqq256_mask( - (__v4sf)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)-1); -} - -__funline __m256i _mm256_mask_cvtps_epu64(__m256i __W, __mmask8 __U, __m128 __A) { - return (__m256i)__builtin_ia32_cvtps2uqq256_mask((__v4sf)__A, (__v4di)__W, - (__mmask8)__U); -} - -__funline __m256i _mm256_maskz_cvtps_epu64(__mmask8 __U, __m128 __A) { - return (__m256i)__builtin_ia32_cvtps2uqq256_mask( - (__v4sf)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); -} - -__funline __m128i _mm_cvtps_epu64(__m128 __A) { - 
return (__m128i)__builtin_ia32_cvtps2uqq128_mask( - (__v4sf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)-1); -} - -__funline __m128i _mm_mask_cvtps_epu64(__m128i __W, __mmask8 __U, __m128 __A) { - return (__m128i)__builtin_ia32_cvtps2uqq128_mask((__v4sf)__A, (__v2di)__W, - (__mmask8)__U); -} - -__funline __m128i _mm_maskz_cvtps_epu64(__mmask8 __U, __m128 __A) { - return (__m128i)__builtin_ia32_cvtps2uqq128_mask( - (__v4sf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m128 _mm256_cvtepi64_ps(__m256i __A) { - return (__m128)__builtin_ia32_cvtqq2ps256_mask( - (__v4di)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)-1); -} - -__funline __m128 _mm256_mask_cvtepi64_ps(__m128 __W, __mmask8 __U, __m256i __A) { - return (__m128)__builtin_ia32_cvtqq2ps256_mask((__v4di)__A, (__v4sf)__W, - (__mmask8)__U); -} - -__funline __m128 _mm256_maskz_cvtepi64_ps(__mmask8 __U, __m256i __A) { - return (__m128)__builtin_ia32_cvtqq2ps256_mask( - (__v4di)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); -} - -__funline __m128 _mm_cvtepi64_ps(__m128i __A) { - return (__m128)__builtin_ia32_cvtqq2ps128_mask( - (__v2di)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)-1); -} - -__funline __m128 _mm_mask_cvtepi64_ps(__m128 __W, __mmask8 __U, __m128i __A) { - return (__m128)__builtin_ia32_cvtqq2ps128_mask((__v2di)__A, (__v4sf)__W, - (__mmask8)__U); -} - -__funline __m128 _mm_maskz_cvtepi64_ps(__mmask8 __U, __m128i __A) { - return (__m128)__builtin_ia32_cvtqq2ps128_mask( - (__v2di)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); -} - -__funline __m128 _mm256_cvtepu64_ps(__m256i __A) { - return (__m128)__builtin_ia32_cvtuqq2ps256_mask( - (__v4di)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)-1); -} - -__funline __m128 _mm256_mask_cvtepu64_ps(__m128 __W, __mmask8 __U, __m256i __A) { - return (__m128)__builtin_ia32_cvtuqq2ps256_mask((__v4di)__A, (__v4sf)__W, - (__mmask8)__U); -} - -__funline __m128 _mm256_maskz_cvtepu64_ps(__mmask8 __U, __m256i __A) { - return (__m128)__builtin_ia32_cvtuqq2ps256_mask( - (__v4di)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); -} - -__funline __m128 _mm_cvtepu64_ps(__m128i __A) { - return (__m128)__builtin_ia32_cvtuqq2ps128_mask( - (__v2di)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)-1); -} - -__funline __m128 _mm_mask_cvtepu64_ps(__m128 __W, __mmask8 __U, __m128i __A) { - return (__m128)__builtin_ia32_cvtuqq2ps128_mask((__v2di)__A, (__v4sf)__W, - (__mmask8)__U); -} - -__funline __m128 _mm_maskz_cvtepu64_ps(__mmask8 __U, __m128i __A) { - return (__m128)__builtin_ia32_cvtuqq2ps128_mask( - (__v2di)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); -} - -__funline __m256d _mm256_cvtepi64_pd(__m256i __A) { - return (__m256d)__builtin_ia32_cvtqq2pd256_mask( - (__v4di)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)-1); -} - -__funline __m256d _mm256_mask_cvtepi64_pd(__m256d __W, __mmask8 __U, - __m256i __A) { - return (__m256d)__builtin_ia32_cvtqq2pd256_mask((__v4di)__A, (__v4df)__W, - (__mmask8)__U); -} - -__funline __m256d _mm256_maskz_cvtepi64_pd(__mmask8 __U, __m256i __A) { - return (__m256d)__builtin_ia32_cvtqq2pd256_mask( - (__v4di)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); -} - -__funline __m128d _mm_cvtepi64_pd(__m128i __A) { - return (__m128d)__builtin_ia32_cvtqq2pd128_mask( - (__v2di)__A, (__v2df)_mm_setzero_pd(), (__mmask8)-1); -} - -__funline __m128d _mm_mask_cvtepi64_pd(__m128d __W, __mmask8 __U, __m128i __A) { - return (__m128d)__builtin_ia32_cvtqq2pd128_mask((__v2di)__A, (__v2df)__W, - (__mmask8)__U); -} - -__funline __m128d _mm_maskz_cvtepi64_pd(__mmask8 __U, __m128i __A) { - return 
(__m128d)__builtin_ia32_cvtqq2pd128_mask( - (__v2di)__A, (__v2df)_mm_setzero_pd(), (__mmask8)__U); -} - -__funline __m256d _mm256_cvtepu64_pd(__m256i __A) { - return (__m256d)__builtin_ia32_cvtuqq2pd256_mask( - (__v4di)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)-1); -} - -__funline __m256d _mm256_mask_cvtepu64_pd(__m256d __W, __mmask8 __U, - __m256i __A) { - return (__m256d)__builtin_ia32_cvtuqq2pd256_mask((__v4di)__A, (__v4df)__W, - (__mmask8)__U); -} - -__funline __m256d _mm256_maskz_cvtepu64_pd(__mmask8 __U, __m256i __A) { - return (__m256d)__builtin_ia32_cvtuqq2pd256_mask( - (__v4di)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); -} - -__funline __m256d _mm256_mask_and_pd(__m256d __W, __mmask8 __U, __m256d __A, - __m256d __B) { - return (__m256d)__builtin_ia32_andpd256_mask((__v4df)__A, (__v4df)__B, - (__v4df)__W, (__mmask8)__U); -} - -__funline __m256d _mm256_maskz_and_pd(__mmask8 __U, __m256d __A, __m256d __B) { - return (__m256d)__builtin_ia32_andpd256_mask( - (__v4df)__A, (__v4df)__B, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); -} - -__funline __m128d _mm_mask_and_pd(__m128d __W, __mmask8 __U, __m128d __A, - __m128d __B) { - return (__m128d)__builtin_ia32_andpd128_mask((__v2df)__A, (__v2df)__B, - (__v2df)__W, (__mmask8)__U); -} - -__funline __m128d _mm_maskz_and_pd(__mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_andpd128_mask( - (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U); -} - -__funline __m256 _mm256_mask_and_ps(__m256 __W, __mmask8 __U, __m256 __A, - __m256 __B) { - return (__m256)__builtin_ia32_andps256_mask((__v8sf)__A, (__v8sf)__B, - (__v8sf)__W, (__mmask8)__U); -} - -__funline __m256 _mm256_maskz_and_ps(__mmask8 __U, __m256 __A, __m256 __B) { - return (__m256)__builtin_ia32_andps256_mask( - (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); -} - -__funline __m128 _mm_mask_and_ps(__m128 __W, __mmask8 __U, __m128 __A, - __m128 __B) { - return (__m128)__builtin_ia32_andps128_mask((__v4sf)__A, (__v4sf)__B, - (__v4sf)__W, (__mmask8)__U); -} - -__funline __m128 _mm_maskz_and_ps(__mmask8 __U, __m128 __A, __m128 __B) { - return (__m128)__builtin_ia32_andps128_mask( - (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); -} - -__funline __m128d _mm_cvtepu64_pd(__m128i __A) { - return (__m128d)__builtin_ia32_cvtuqq2pd128_mask( - (__v2di)__A, (__v2df)_mm_setzero_pd(), (__mmask8)-1); -} - -__funline __m128d _mm_mask_cvtepu64_pd(__m128d __W, __mmask8 __U, __m128i __A) { - return (__m128d)__builtin_ia32_cvtuqq2pd128_mask((__v2di)__A, (__v2df)__W, - (__mmask8)__U); -} - -__funline __m128d _mm_maskz_cvtepu64_pd(__mmask8 __U, __m128i __A) { - return (__m128d)__builtin_ia32_cvtuqq2pd128_mask( - (__v2di)__A, (__v2df)_mm_setzero_pd(), (__mmask8)__U); -} - -__funline __m256d _mm256_mask_xor_pd(__m256d __W, __mmask8 __U, __m256d __A, - __m256d __B) { - return (__m256d)__builtin_ia32_xorpd256_mask((__v4df)__A, (__v4df)__B, - (__v4df)__W, (__mmask8)__U); -} - -__funline __m256d _mm256_maskz_xor_pd(__mmask8 __U, __m256d __A, __m256d __B) { - return (__m256d)__builtin_ia32_xorpd256_mask( - (__v4df)__A, (__v4df)__B, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); -} - -__funline __m128d _mm_mask_xor_pd(__m128d __W, __mmask8 __U, __m128d __A, - __m128d __B) { - return (__m128d)__builtin_ia32_xorpd128_mask((__v2df)__A, (__v2df)__B, - (__v2df)__W, (__mmask8)__U); -} - -__funline __m128d _mm_maskz_xor_pd(__mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_xorpd128_mask( - (__v2df)__A, (__v2df)__B, 
(__v2df)_mm_setzero_pd(), (__mmask8)__U); -} - -__funline __m256 _mm256_mask_xor_ps(__m256 __W, __mmask8 __U, __m256 __A, - __m256 __B) { - return (__m256)__builtin_ia32_xorps256_mask((__v8sf)__A, (__v8sf)__B, - (__v8sf)__W, (__mmask8)__U); -} - -__funline __m256 _mm256_maskz_xor_ps(__mmask8 __U, __m256 __A, __m256 __B) { - return (__m256)__builtin_ia32_xorps256_mask( - (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); -} - -__funline __m128 _mm_mask_xor_ps(__m128 __W, __mmask8 __U, __m128 __A, - __m128 __B) { - return (__m128)__builtin_ia32_xorps128_mask((__v4sf)__A, (__v4sf)__B, - (__v4sf)__W, (__mmask8)__U); -} - -__funline __m128 _mm_maskz_xor_ps(__mmask8 __U, __m128 __A, __m128 __B) { - return (__m128)__builtin_ia32_xorps128_mask( - (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); -} - -__funline __m256d _mm256_mask_or_pd(__m256d __W, __mmask8 __U, __m256d __A, - __m256d __B) { - return (__m256d)__builtin_ia32_orpd256_mask((__v4df)__A, (__v4df)__B, - (__v4df)__W, (__mmask8)__U); -} - -__funline __m256d _mm256_maskz_or_pd(__mmask8 __U, __m256d __A, __m256d __B) { - return (__m256d)__builtin_ia32_orpd256_mask( - (__v4df)__A, (__v4df)__B, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); -} - -__funline __m128d _mm_mask_or_pd(__m128d __W, __mmask8 __U, __m128d __A, - __m128d __B) { - return (__m128d)__builtin_ia32_orpd128_mask((__v2df)__A, (__v2df)__B, - (__v2df)__W, (__mmask8)__U); -} - -__funline __m128d _mm_maskz_or_pd(__mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_orpd128_mask( - (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U); -} - -__funline __m256 _mm256_mask_or_ps(__m256 __W, __mmask8 __U, __m256 __A, - __m256 __B) { - return (__m256)__builtin_ia32_orps256_mask((__v8sf)__A, (__v8sf)__B, - (__v8sf)__W, (__mmask8)__U); -} - -__funline __m256 _mm256_maskz_or_ps(__mmask8 __U, __m256 __A, __m256 __B) { - return (__m256)__builtin_ia32_orps256_mask( - (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); -} - -__funline __m128 _mm_mask_or_ps(__m128 __W, __mmask8 __U, __m128 __A, - __m128 __B) { - return (__m128)__builtin_ia32_orps128_mask((__v4sf)__A, (__v4sf)__B, - (__v4sf)__W, (__mmask8)__U); -} - -__funline __m128 _mm_maskz_or_ps(__mmask8 __U, __m128 __A, __m128 __B) { - return (__m128)__builtin_ia32_orps128_mask( - (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); -} - -__funline __m128i _mm_movm_epi32(__mmask8 __A) { - return (__m128i)__builtin_ia32_cvtmask2d128(__A); -} - -__funline __m256i _mm256_movm_epi32(__mmask8 __A) { - return (__m256i)__builtin_ia32_cvtmask2d256(__A); -} - -__funline __m128i _mm_movm_epi64(__mmask8 __A) { - return (__m128i)__builtin_ia32_cvtmask2q128(__A); -} - -__funline __m256i _mm256_movm_epi64(__mmask8 __A) { - return (__m256i)__builtin_ia32_cvtmask2q256(__A); -} - -__funline __mmask8 _mm_movepi32_mask(__m128i __A) { - return (__mmask8)__builtin_ia32_cvtd2mask128((__v4si)__A); -} - -__funline __mmask8 _mm256_movepi32_mask(__m256i __A) { - return (__mmask8)__builtin_ia32_cvtd2mask256((__v8si)__A); -} - -__funline __mmask8 _mm_movepi64_mask(__m128i __A) { - return (__mmask8)__builtin_ia32_cvtq2mask128((__v2di)__A); -} - -__funline __mmask8 _mm256_movepi64_mask(__m256i __A) { - return (__mmask8)__builtin_ia32_cvtq2mask256((__v4di)__A); -} - -#ifdef __OPTIMIZE__ -__funline __m128d _mm256_extractf64x2_pd(__m256d __A, const int __imm) { - return (__m128d)__builtin_ia32_extractf64x2_256_mask( - (__v4df)__A, __imm, (__v2df)_mm_setzero_pd(), 
(__mmask8)-1); -} - -__funline __m128d _mm256_mask_extractf64x2_pd(__m128d __W, __mmask8 __U, - __m256d __A, const int __imm) { - return (__m128d)__builtin_ia32_extractf64x2_256_mask( - (__v4df)__A, __imm, (__v2df)__W, (__mmask8)__U); -} - -__funline __m128d _mm256_maskz_extractf64x2_pd(__mmask8 __U, __m256d __A, - const int __imm) { - return (__m128d)__builtin_ia32_extractf64x2_256_mask( - (__v4df)__A, __imm, (__v2df)_mm_setzero_pd(), (__mmask8)__U); -} - -__funline __m128i _mm256_extracti64x2_epi64(__m256i __A, const int __imm) { - return (__m128i)__builtin_ia32_extracti64x2_256_mask( - (__v4di)__A, __imm, (__v2di)_mm_setzero_si128(), (__mmask8)-1); -} - -__funline __m128i _mm256_mask_extracti64x2_epi64(__m128i __W, __mmask8 __U, - __m256i __A, const int __imm) { - return (__m128i)__builtin_ia32_extracti64x2_256_mask( - (__v4di)__A, __imm, (__v2di)__W, (__mmask8)__U); -} - -__funline __m128i _mm256_maskz_extracti64x2_epi64(__mmask8 __U, __m256i __A, - const int __imm) { - return (__m128i)__builtin_ia32_extracti64x2_256_mask( - (__v4di)__A, __imm, (__v2di)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m256d _mm256_reduce_pd(__m256d __A, int __B) { - return (__m256d)__builtin_ia32_reducepd256_mask( - (__v4df)__A, __B, (__v4df)_mm256_setzero_pd(), (__mmask8)-1); -} - -__funline __m256d _mm256_mask_reduce_pd(__m256d __W, __mmask8 __U, __m256d __A, - int __B) { - return (__m256d)__builtin_ia32_reducepd256_mask((__v4df)__A, __B, (__v4df)__W, - (__mmask8)__U); -} - -__funline __m256d _mm256_maskz_reduce_pd(__mmask8 __U, __m256d __A, int __B) { - return (__m256d)__builtin_ia32_reducepd256_mask( - (__v4df)__A, __B, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); -} - -__funline __m128d _mm_reduce_pd(__m128d __A, int __B) { - return (__m128d)__builtin_ia32_reducepd128_mask( - (__v2df)__A, __B, (__v2df)_mm_setzero_pd(), (__mmask8)-1); -} - -__funline __m128d _mm_mask_reduce_pd(__m128d __W, __mmask8 __U, __m128d __A, - int __B) { - return (__m128d)__builtin_ia32_reducepd128_mask((__v2df)__A, __B, (__v2df)__W, - (__mmask8)__U); -} - -__funline __m128d _mm_maskz_reduce_pd(__mmask8 __U, __m128d __A, int __B) { - return (__m128d)__builtin_ia32_reducepd128_mask( - (__v2df)__A, __B, (__v2df)_mm_setzero_pd(), (__mmask8)__U); -} - -__funline __m256 _mm256_reduce_ps(__m256 __A, int __B) { - return (__m256)__builtin_ia32_reduceps256_mask( - (__v8sf)__A, __B, (__v8sf)_mm256_setzero_ps(), (__mmask8)-1); -} - -__funline __m256 _mm256_mask_reduce_ps(__m256 __W, __mmask8 __U, __m256 __A, - int __B) { - return (__m256)__builtin_ia32_reduceps256_mask((__v8sf)__A, __B, (__v8sf)__W, - (__mmask8)__U); -} - -__funline __m256 _mm256_maskz_reduce_ps(__mmask8 __U, __m256 __A, int __B) { - return (__m256)__builtin_ia32_reduceps256_mask( - (__v8sf)__A, __B, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); -} - -__funline __m128 _mm_reduce_ps(__m128 __A, int __B) { - return (__m128)__builtin_ia32_reduceps128_mask( - (__v4sf)__A, __B, (__v4sf)_mm_setzero_ps(), (__mmask8)-1); -} - -__funline __m128 _mm_mask_reduce_ps(__m128 __W, __mmask8 __U, __m128 __A, - int __B) { - return (__m128)__builtin_ia32_reduceps128_mask((__v4sf)__A, __B, (__v4sf)__W, - (__mmask8)__U); -} - -__funline __m128 _mm_maskz_reduce_ps(__mmask8 __U, __m128 __A, int __B) { - return (__m128)__builtin_ia32_reduceps128_mask( - (__v4sf)__A, __B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); -} - -__funline __m256d _mm256_range_pd(__m256d __A, __m256d __B, int __C) { - return (__m256d)__builtin_ia32_rangepd256_mask( - (__v4df)__A, (__v4df)__B, __C, 
(__v4df)_mm256_setzero_pd(), (__mmask8)-1); -} - -__funline __m256d _mm256_mask_range_pd(__m256d __W, __mmask8 __U, __m256d __A, - __m256d __B, int __C) { - return (__m256d)__builtin_ia32_rangepd256_mask((__v4df)__A, (__v4df)__B, __C, - (__v4df)__W, (__mmask8)__U); -} - -__funline __m256d _mm256_maskz_range_pd(__mmask8 __U, __m256d __A, __m256d __B, - int __C) { - return (__m256d)__builtin_ia32_rangepd256_mask((__v4df)__A, (__v4df)__B, __C, - (__v4df)_mm256_setzero_pd(), - (__mmask8)__U); -} - -__funline __m128d _mm_range_pd(__m128d __A, __m128d __B, int __C) { - return (__m128d)__builtin_ia32_rangepd128_mask( - (__v2df)__A, (__v2df)__B, __C, (__v2df)_mm_setzero_pd(), (__mmask8)-1); -} - -__funline __m128d _mm_mask_range_pd(__m128d __W, __mmask8 __U, __m128d __A, - __m128d __B, int __C) { - return (__m128d)__builtin_ia32_rangepd128_mask((__v2df)__A, (__v2df)__B, __C, - (__v2df)__W, (__mmask8)__U); -} - -__funline __m128d _mm_maskz_range_pd(__mmask8 __U, __m128d __A, __m128d __B, - int __C) { - return (__m128d)__builtin_ia32_rangepd128_mask( - (__v2df)__A, (__v2df)__B, __C, (__v2df)_mm_setzero_pd(), (__mmask8)__U); -} - -__funline __m256 _mm256_range_ps(__m256 __A, __m256 __B, int __C) { - return (__m256)__builtin_ia32_rangeps256_mask( - (__v8sf)__A, (__v8sf)__B, __C, (__v8sf)_mm256_setzero_ps(), (__mmask8)-1); -} - -__funline __m256 _mm256_mask_range_ps(__m256 __W, __mmask8 __U, __m256 __A, - __m256 __B, int __C) { - return (__m256)__builtin_ia32_rangeps256_mask((__v8sf)__A, (__v8sf)__B, __C, - (__v8sf)__W, (__mmask8)__U); -} - -__funline __m256 _mm256_maskz_range_ps(__mmask8 __U, __m256 __A, __m256 __B, - int __C) { - return (__m256)__builtin_ia32_rangeps256_mask((__v8sf)__A, (__v8sf)__B, __C, - (__v8sf)_mm256_setzero_ps(), - (__mmask8)__U); -} - -__funline __m128 _mm_range_ps(__m128 __A, __m128 __B, int __C) { - return (__m128)__builtin_ia32_rangeps128_mask( - (__v4sf)__A, (__v4sf)__B, __C, (__v4sf)_mm_setzero_ps(), (__mmask8)-1); -} - -__funline __m128 _mm_mask_range_ps(__m128 __W, __mmask8 __U, __m128 __A, - __m128 __B, int __C) { - return (__m128)__builtin_ia32_rangeps128_mask((__v4sf)__A, (__v4sf)__B, __C, - (__v4sf)__W, (__mmask8)__U); -} - -__funline __m128 _mm_maskz_range_ps(__mmask8 __U, __m128 __A, __m128 __B, - int __C) { - return (__m128)__builtin_ia32_rangeps128_mask( - (__v4sf)__A, (__v4sf)__B, __C, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); -} - -__funline __mmask8 _mm256_mask_fpclass_pd_mask(__mmask8 __U, __m256d __A, - const int __imm) { - return (__mmask8)__builtin_ia32_fpclasspd256_mask((__v4df)__A, __imm, __U); -} - -__funline __mmask8 _mm256_fpclass_pd_mask(__m256d __A, const int __imm) { - return (__mmask8)__builtin_ia32_fpclasspd256_mask((__v4df)__A, __imm, - (__mmask8)-1); -} - -__funline __mmask8 _mm256_mask_fpclass_ps_mask(__mmask8 __U, __m256 __A, - const int __imm) { - return (__mmask8)__builtin_ia32_fpclassps256_mask((__v8sf)__A, __imm, __U); -} - -__funline __mmask8 _mm256_fpclass_ps_mask(__m256 __A, const int __imm) { - return (__mmask8)__builtin_ia32_fpclassps256_mask((__v8sf)__A, __imm, - (__mmask8)-1); -} - -__funline __mmask8 _mm_mask_fpclass_pd_mask(__mmask8 __U, __m128d __A, - const int __imm) { - return (__mmask8)__builtin_ia32_fpclasspd128_mask((__v2df)__A, __imm, __U); -} - -__funline __mmask8 _mm_fpclass_pd_mask(__m128d __A, const int __imm) { - return (__mmask8)__builtin_ia32_fpclasspd128_mask((__v2df)__A, __imm, - (__mmask8)-1); -} - -__funline __mmask8 _mm_mask_fpclass_ps_mask(__mmask8 __U, __m128 __A, - const int __imm) { - return 
(__mmask8)__builtin_ia32_fpclassps128_mask((__v4sf)__A, __imm, __U); -} - -__funline __mmask8 _mm_fpclass_ps_mask(__m128 __A, const int __imm) { - return (__mmask8)__builtin_ia32_fpclassps128_mask((__v4sf)__A, __imm, - (__mmask8)-1); -} - -__funline __m256i _mm256_inserti64x2(__m256i __A, __m128i __B, const int __imm) { - return (__m256i)__builtin_ia32_inserti64x2_256_mask( - (__v4di)__A, (__v2di)__B, __imm, (__v4di)_mm256_setzero_si256(), - (__mmask8)-1); -} - -__funline __m256i _mm256_mask_inserti64x2(__m256i __W, __mmask8 __U, __m256i __A, - __m128i __B, const int __imm) { - return (__m256i)__builtin_ia32_inserti64x2_256_mask( - (__v4di)__A, (__v2di)__B, __imm, (__v4di)__W, (__mmask8)__U); -} - -__funline __m256i _mm256_maskz_inserti64x2(__mmask8 __U, __m256i __A, __m128i __B, - const int __imm) { - return (__m256i)__builtin_ia32_inserti64x2_256_mask( - (__v4di)__A, (__v2di)__B, __imm, (__v4di)_mm256_setzero_si256(), - (__mmask8)__U); -} - -__funline __m256d _mm256_insertf64x2(__m256d __A, __m128d __B, const int __imm) { - return (__m256d)__builtin_ia32_insertf64x2_256_mask( - (__v4df)__A, (__v2df)__B, __imm, (__v4df)_mm256_setzero_pd(), - (__mmask8)-1); -} - -__funline __m256d _mm256_mask_insertf64x2(__m256d __W, __mmask8 __U, __m256d __A, - __m128d __B, const int __imm) { - return (__m256d)__builtin_ia32_insertf64x2_256_mask( - (__v4df)__A, (__v2df)__B, __imm, (__v4df)__W, (__mmask8)__U); -} - -__funline __m256d _mm256_maskz_insertf64x2(__mmask8 __U, __m256d __A, __m128d __B, - const int __imm) { - return (__m256d)__builtin_ia32_insertf64x2_256_mask( - (__v4df)__A, (__v2df)__B, __imm, (__v4df)_mm256_setzero_pd(), - (__mmask8)__U); -} - -#else -#define _mm256_insertf64x2(X, Y, C) \ - ((__m256d)__builtin_ia32_insertf64x2_256_mask( \ - (__v4df)(__m256d)(X), (__v2df)(__m128d)(Y), (int)(C), \ - (__v4df)(__m256d)_mm256_setzero_pd(), (__mmask8)-1)) - -#define _mm256_mask_insertf64x2(W, U, X, Y, C) \ - ((__m256d)__builtin_ia32_insertf64x2_256_mask( \ - (__v4df)(__m256d)(X), (__v2df)(__m128d)(Y), (int)(C), \ - (__v4df)(__m256d)(W), (__mmask8)(U))) - -#define _mm256_maskz_insertf64x2(U, X, Y, C) \ - ((__m256d)__builtin_ia32_insertf64x2_256_mask( \ - (__v4df)(__m256d)(X), (__v2df)(__m128d)(Y), (int)(C), \ - (__v4df)(__m256d)_mm256_setzero_pd(), (__mmask8)(U))) - -#define _mm256_inserti64x2(X, Y, C) \ - ((__m256i)__builtin_ia32_inserti64x2_256_mask( \ - (__v4di)(__m256i)(X), (__v2di)(__m128i)(Y), (int)(C), \ - (__v4di)(__m256i)_mm256_setzero_si256(), (__mmask8)-1)) - -#define _mm256_mask_inserti64x2(W, U, X, Y, C) \ - ((__m256i)__builtin_ia32_inserti64x2_256_mask( \ - (__v4di)(__m256i)(X), (__v2di)(__m128i)(Y), (int)(C), \ - (__v4di)(__m256i)(W), (__mmask8)(U))) - -#define _mm256_maskz_inserti64x2(U, X, Y, C) \ - ((__m256i)__builtin_ia32_inserti64x2_256_mask( \ - (__v4di)(__m256i)(X), (__v2di)(__m128i)(Y), (int)(C), \ - (__v4di)(__m256i)_mm256_setzero_si256(), (__mmask8)(U))) - -#define _mm256_extractf64x2_pd(X, C) \ - ((__m128d)__builtin_ia32_extractf64x2_256_mask( \ - (__v4df)(__m256d)(X), (int)(C), (__v2df)(__m128d)_mm_setzero_pd(), \ - (__mmask8)-1)) - -#define _mm256_mask_extractf64x2_pd(W, U, X, C) \ - ((__m128d)__builtin_ia32_extractf64x2_256_mask( \ - (__v4df)(__m256d)(X), (int)(C), (__v2df)(__m128d)(W), (__mmask8)(U))) - -#define _mm256_maskz_extractf64x2_pd(U, X, C) \ - ((__m128d)__builtin_ia32_extractf64x2_256_mask( \ - (__v4df)(__m256d)(X), (int)(C), (__v2df)(__m128d)_mm_setzero_pd(), \ - (__mmask8)(U))) - -#define _mm256_extracti64x2_epi64(X, C) \ - 
((__m128i)__builtin_ia32_extracti64x2_256_mask( \ - (__v4di)(__m256i)(X), (int)(C), (__v2di)(__m128i)_mm_setzero_si128(), \ - (__mmask8)-1)) - -#define _mm256_mask_extracti64x2_epi64(W, U, X, C) \ - ((__m128i)__builtin_ia32_extracti64x2_256_mask( \ - (__v4di)(__m256i)(X), (int)(C), (__v2di)(__m128i)(W), (__mmask8)(U))) - -#define _mm256_maskz_extracti64x2_epi64(U, X, C) \ - ((__m128i)__builtin_ia32_extracti64x2_256_mask( \ - (__v4di)(__m256i)(X), (int)(C), (__v2di)(__m128i)_mm_setzero_si128(), \ - (__mmask8)(U))) - -#define _mm256_reduce_pd(A, B) \ - ((__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \ - (__v4df)_mm256_setzero_pd(), \ - (__mmask8)-1)) - -#define _mm256_mask_reduce_pd(W, U, A, B) \ - ((__m256d)__builtin_ia32_reducepd256_mask( \ - (__v4df)(__m256d)(A), (int)(B), (__v4df)(__m256d)(W), (__mmask8)(U))) - -#define _mm256_maskz_reduce_pd(U, A, B) \ - ((__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \ - (__v4df)_mm256_setzero_pd(), \ - (__mmask8)(U))) - -#define _mm_reduce_pd(A, B) \ - ((__m128d)__builtin_ia32_reducepd128_mask( \ - (__v2df)(__m128d)(A), (int)(B), (__v2df)_mm_setzero_pd(), (__mmask8)-1)) - -#define _mm_mask_reduce_pd(W, U, A, B) \ - ((__m128d)__builtin_ia32_reducepd128_mask( \ - (__v2df)(__m128d)(A), (int)(B), (__v2df)(__m128d)(W), (__mmask8)(U))) - -#define _mm_maskz_reduce_pd(U, A, B) \ - ((__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U))) - -#define _mm256_reduce_ps(A, B) \ - ((__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \ - (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)-1)) - -#define _mm256_mask_reduce_ps(W, U, A, B) \ - ((__m256)__builtin_ia32_reduceps256_mask( \ - (__v8sf)(__m256)(A), (int)(B), (__v8sf)(__m256)(W), (__mmask8)(U))) - -#define _mm256_maskz_reduce_ps(U, A, B) \ - ((__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \ - (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)(U))) - -#define _mm_reduce_ps(A, B) \ - ((__m128)__builtin_ia32_reduceps128_mask( \ - (__v4sf)(__m128)(A), (int)(B), (__v4sf)_mm_setzero_ps(), (__mmask8)-1)) - -#define _mm_mask_reduce_ps(W, U, A, B) \ - ((__m128)__builtin_ia32_reduceps128_mask( \ - (__v4sf)(__m128)(A), (int)(B), (__v4sf)(__m128)(W), (__mmask8)(U))) - -#define _mm_maskz_reduce_ps(U, A, B) \ - ((__m128)__builtin_ia32_reduceps128_mask( \ - (__v4sf)(__m128)(A), (int)(B), (__v4sf)_mm_setzero_ps(), (__mmask8)(U))) - -#define _mm256_range_pd(A, B, C) \ - ((__m256d)__builtin_ia32_rangepd256_mask( \ - (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), \ - (__v4df)_mm256_setzero_pd(), (__mmask8)-1)) - -#define _mm256_maskz_range_pd(U, A, B, C) \ - ((__m256d)__builtin_ia32_rangepd256_mask( \ - (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), \ - (__v4df)_mm256_setzero_pd(), (__mmask8)(U))) - -#define _mm_range_pd(A, B, C) \ - ((__m128d)__builtin_ia32_rangepd128_mask( \ - (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \ - (__v2df)_mm_setzero_pd(), (__mmask8)-1)) - -#define _mm256_range_ps(A, B, C) \ - ((__m256)__builtin_ia32_rangeps256_mask( \ - (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), \ - (__v8sf)_mm256_setzero_ps(), (__mmask8)-1)) - -#define _mm256_mask_range_ps(W, U, A, B, C) \ - ((__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \ - (__v8sf)(__m256)(B), (int)(C), \ - (__v8sf)(__m256)(W), (__mmask8)(U))) - -#define _mm256_maskz_range_ps(U, A, B, C) \ - ((__m256)__builtin_ia32_rangeps256_mask( \ - (__v8sf)(__m256)(A), 
(__v8sf)(__m256)(B), (int)(C), \ - (__v8sf)_mm256_setzero_ps(), (__mmask8)(U))) - -#define _mm_range_ps(A, B, C) \ - ((__m128)__builtin_ia32_rangeps128_mask( \ - (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), \ - (__v4sf)_mm_setzero_ps(), (__mmask8)-1)) - -#define _mm_mask_range_ps(W, U, A, B, C) \ - ((__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), (int)(C), \ - (__v4sf)(__m128)(W), (__mmask8)(U))) - -#define _mm_maskz_range_ps(U, A, B, C) \ - ((__m128)__builtin_ia32_rangeps128_mask( \ - (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), \ - (__v4sf)_mm_setzero_ps(), (__mmask8)(U))) - -#define _mm256_mask_range_pd(W, U, A, B, C) \ - ((__m256d)__builtin_ia32_rangepd256_mask( \ - (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), \ - (__v4df)(__m256d)(W), (__mmask8)(U))) - -#define _mm_mask_range_pd(W, U, A, B, C) \ - ((__m128d)__builtin_ia32_rangepd128_mask( \ - (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \ - (__v2df)(__m128d)(W), (__mmask8)(U))) - -#define _mm_maskz_range_pd(U, A, B, C) \ - ((__m128d)__builtin_ia32_rangepd128_mask( \ - (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \ - (__v2df)_mm_setzero_pd(), (__mmask8)(U))) - -#define _mm256_mask_fpclass_pd_mask(u, X, C) \ - ((__mmask8)__builtin_ia32_fpclasspd256_mask((__v4df)(__m256d)(X), (int)(C), \ - (__mmask8)(u))) - -#define _mm256_mask_fpclass_ps_mask(u, X, C) \ - ((__mmask8)__builtin_ia32_fpclassps256_mask((__v8sf)(__m256)(X), (int)(C), \ - (__mmask8)(u))) - -#define _mm_mask_fpclass_pd_mask(u, X, C) \ - ((__mmask8)__builtin_ia32_fpclasspd128_mask((__v2df)(__m128d)(X), (int)(C), \ - (__mmask8)(u))) - -#define _mm_mask_fpclass_ps_mask(u, X, C) \ - ((__mmask8)__builtin_ia32_fpclassps128_mask((__v4sf)(__m128)(X), (int)(C), \ - (__mmask8)(u))) - -#define _mm256_fpclass_pd_mask(X, C) \ - ((__mmask8)__builtin_ia32_fpclasspd256_mask((__v4df)(__m256d)(X), (int)(C), \ - (__mmask8)-1)) - -#define _mm256_fpclass_ps_mask(X, C) \ - ((__mmask8)__builtin_ia32_fpclassps256_mask((__v8sf)(__m256)(X), (int)(C), \ - (__mmask8)-1)) - -#define _mm_fpclass_pd_mask(X, C) \ - ((__mmask8)__builtin_ia32_fpclasspd128_mask((__v2df)(__m128d)(X), (int)(C), \ - (__mmask8)-1)) - -#define _mm_fpclass_ps_mask(X, C) \ - ((__mmask8)__builtin_ia32_fpclassps128_mask((__v4sf)(__m128)(X), (int)(C), \ - (__mmask8)-1)) - #endif - +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvttpd_epi64 (__m256d __A) +{ + return (__m256i) __builtin_ia32_cvttpd2qq256_mask ((__v4df) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvttpd_epi64 (__m256i __W, __mmask8 __U, __m256d __A) +{ + return (__m256i) __builtin_ia32_cvttpd2qq256_mask ((__v4df) __A, + (__v4di) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvttpd_epi64 (__mmask8 __U, __m256d __A) +{ + return (__m256i) __builtin_ia32_cvttpd2qq256_mask ((__v4df) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttpd_epi64 (__m128d __A) +{ + return (__m128i) __builtin_ia32_cvttpd2qq128_mask ((__v2df) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) -1); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvttpd_epi64 (__m128i __W, __mmask8 __U, 
__m128d __A) +{ + return (__m128i) __builtin_ia32_cvttpd2qq128_mask ((__v2df) __A, + (__v2di) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvttpd_epi64 (__mmask8 __U, __m128d __A) +{ + return (__m128i) __builtin_ia32_cvttpd2qq128_mask ((__v2df) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvttpd_epu64 (__m256d __A) +{ + return (__m256i) __builtin_ia32_cvttpd2uqq256_mask ((__v4df) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvttpd_epu64 (__m256i __W, __mmask8 __U, __m256d __A) +{ + return (__m256i) __builtin_ia32_cvttpd2uqq256_mask ((__v4df) __A, + (__v4di) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvttpd_epu64 (__mmask8 __U, __m256d __A) +{ + return (__m256i) __builtin_ia32_cvttpd2uqq256_mask ((__v4df) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttpd_epu64 (__m128d __A) +{ + return (__m128i) __builtin_ia32_cvttpd2uqq128_mask ((__v2df) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) -1); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvttpd_epu64 (__m128i __W, __mmask8 __U, __m128d __A) +{ + return (__m128i) __builtin_ia32_cvttpd2uqq128_mask ((__v2df) __A, + (__v2di) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvttpd_epu64 (__mmask8 __U, __m128d __A) +{ + return (__m128i) __builtin_ia32_cvttpd2uqq128_mask ((__v2df) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtpd_epi64 (__m256d __A) +{ + return (__m256i) __builtin_ia32_cvtpd2qq256_mask ((__v4df) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtpd_epi64 (__m256i __W, __mmask8 __U, __m256d __A) +{ + return (__m256i) __builtin_ia32_cvtpd2qq256_mask ((__v4df) __A, + (__v4di) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtpd_epi64 (__mmask8 __U, __m256d __A) +{ + return (__m256i) __builtin_ia32_cvtpd2qq256_mask ((__v4df) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtpd_epi64 (__m128d __A) +{ + return (__m128i) __builtin_ia32_cvtpd2qq128_mask ((__v2df) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) -1); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtpd_epi64 (__m128i __W, __mmask8 __U, __m128d __A) +{ + return (__m128i) __builtin_ia32_cvtpd2qq128_mask ((__v2df) __A, + (__v2di) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtpd_epi64 (__mmask8 __U, __m128d __A) +{ + return (__m128i) __builtin_ia32_cvtpd2qq128_mask ((__v2df) __A, + (__v2di) + 
_mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtpd_epu64 (__m256d __A) +{ + return (__m256i) __builtin_ia32_cvtpd2uqq256_mask ((__v4df) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtpd_epu64 (__m256i __W, __mmask8 __U, __m256d __A) +{ + return (__m256i) __builtin_ia32_cvtpd2uqq256_mask ((__v4df) __A, + (__v4di) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtpd_epu64 (__mmask8 __U, __m256d __A) +{ + return (__m256i) __builtin_ia32_cvtpd2uqq256_mask ((__v4df) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtpd_epu64 (__m128d __A) +{ + return (__m128i) __builtin_ia32_cvtpd2uqq128_mask ((__v2df) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) -1); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtpd_epu64 (__m128i __W, __mmask8 __U, __m128d __A) +{ + return (__m128i) __builtin_ia32_cvtpd2uqq128_mask ((__v2df) __A, + (__v2di) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtpd_epu64 (__mmask8 __U, __m128d __A) +{ + return (__m128i) __builtin_ia32_cvtpd2uqq128_mask ((__v2df) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvttps_epi64 (__m128 __A) +{ + return (__m256i) __builtin_ia32_cvttps2qq256_mask ((__v4sf) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvttps_epi64 (__m256i __W, __mmask8 __U, __m128 __A) +{ + return (__m256i) __builtin_ia32_cvttps2qq256_mask ((__v4sf) __A, + (__v4di) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvttps_epi64 (__mmask8 __U, __m128 __A) +{ + return (__m256i) __builtin_ia32_cvttps2qq256_mask ((__v4sf) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttps_epi64 (__m128 __A) +{ + return (__m128i) __builtin_ia32_cvttps2qq128_mask ((__v4sf) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) -1); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvttps_epi64 (__m128i __W, __mmask8 __U, __m128 __A) +{ + return (__m128i) __builtin_ia32_cvttps2qq128_mask ((__v4sf) __A, + (__v2di) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvttps_epi64 (__mmask8 __U, __m128 __A) +{ + return (__m128i) __builtin_ia32_cvttps2qq128_mask ((__v4sf) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvttps_epu64 (__m128 __A) +{ + return (__m256i) __builtin_ia32_cvttps2uqq256_mask ((__v4sf) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, 
__always_inline__, __artificial__)) +_mm256_mask_cvttps_epu64 (__m256i __W, __mmask8 __U, __m128 __A) +{ + return (__m256i) __builtin_ia32_cvttps2uqq256_mask ((__v4sf) __A, + (__v4di) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvttps_epu64 (__mmask8 __U, __m128 __A) +{ + return (__m256i) __builtin_ia32_cvttps2uqq256_mask ((__v4sf) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttps_epu64 (__m128 __A) +{ + return (__m128i) __builtin_ia32_cvttps2uqq128_mask ((__v4sf) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) -1); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvttps_epu64 (__m128i __W, __mmask8 __U, __m128 __A) +{ + return (__m128i) __builtin_ia32_cvttps2uqq128_mask ((__v4sf) __A, + (__v2di) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvttps_epu64 (__mmask8 __U, __m128 __A) +{ + return (__m128i) __builtin_ia32_cvttps2uqq128_mask ((__v4sf) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_broadcast_f64x2 (__m128d __A) +{ + return (__m256d) __builtin_ia32_broadcastf64x2_256_mask ((__v2df) + __A, + (__v4df)_mm256_undefined_pd(), + (__mmask8) -1); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_broadcast_f64x2 (__m256d __O, __mmask8 __M, __m128d __A) +{ + return (__m256d) __builtin_ia32_broadcastf64x2_256_mask ((__v2df) + __A, + (__v4df) + __O, __M); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_broadcast_f64x2 (__mmask8 __M, __m128d __A) +{ + return (__m256d) __builtin_ia32_broadcastf64x2_256_mask ((__v2df) + __A, + (__v4df) + _mm256_setzero_ps (), + __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_broadcast_i64x2 (__m128i __A) +{ + return (__m256i) __builtin_ia32_broadcasti64x2_256_mask ((__v2di) + __A, + (__v4di)_mm256_undefined_si256(), + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_broadcast_i64x2 (__m256i __O, __mmask8 __M, __m128i __A) +{ + return (__m256i) __builtin_ia32_broadcasti64x2_256_mask ((__v2di) + __A, + (__v4di) + __O, __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A) +{ + return (__m256i) __builtin_ia32_broadcasti64x2_256_mask ((__v2di) + __A, + (__v4di) + _mm256_setzero_si256 (), + __M); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_broadcast_f32x2 (__m128 __A) +{ + return (__m256) __builtin_ia32_broadcastf32x2_256_mask ((__v4sf) __A, + (__v8sf)_mm256_undefined_ps(), + (__mmask8) -1); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_broadcast_f32x2 (__m256 __O, __mmask8 __M, __m128 __A) +{ + return (__m256) __builtin_ia32_broadcastf32x2_256_mask ((__v4sf) __A, + (__v8sf) __O, + __M); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_broadcast_f32x2 
(__mmask8 __M, __m128 __A) +{ + return (__m256) __builtin_ia32_broadcastf32x2_256_mask ((__v4sf) __A, + (__v8sf) + _mm256_setzero_ps (), + __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_broadcast_i32x2 (__m128i __A) +{ + return (__m256i) __builtin_ia32_broadcasti32x2_256_mask ((__v4si) + __A, + (__v8si)_mm256_undefined_si256(), + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_broadcast_i32x2 (__m256i __O, __mmask8 __M, __m128i __A) +{ + return (__m256i) __builtin_ia32_broadcasti32x2_256_mask ((__v4si) + __A, + (__v8si) + __O, __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_broadcast_i32x2 (__mmask8 __M, __m128i __A) +{ + return (__m256i) __builtin_ia32_broadcasti32x2_256_mask ((__v4si) + __A, + (__v8si) + _mm256_setzero_si256 (), + __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_broadcast_i32x2 (__m128i __A) +{ + return (__m128i) __builtin_ia32_broadcasti32x2_128_mask ((__v4si) + __A, + (__v4si)_mm_undefined_si128(), + (__mmask8) -1); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_broadcast_i32x2 (__m128i __O, __mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_broadcasti32x2_128_mask ((__v4si) + __A, + (__v4si) + __O, __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_broadcast_i32x2 (__mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_broadcasti32x2_128_mask ((__v4si) + __A, + (__v4si) + _mm_setzero_si128 (), + __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mullo_epi64 (__m256i __A, __m256i __B) +{ + return (__m256i) ((__v4du) __A * (__v4du) __B); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_mullo_epi64 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pmullq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_mullo_epi64 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pmullq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mullo_epi64 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v2du) __A * (__v2du) __B); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_mullo_epi64 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pmullq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_mullo_epi64 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pmullq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_andnot_pd (__m256d __W, __mmask8 __U, __m256d __A, + __m256d __B) +{ + return (__m256d) 
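/*
 * [Editorial aside, not part of the patch] Unlike their masked siblings,
 * the plain _mm256_mullo_epi64/_mm_mullo_epi64 defined above are written
 * as vector multiply expressions rather than builtin calls; with AVX512DQ
 * enabled the compiler can lower them to VPMULLQ. Illustrative use:
 *
 *   __m256i a = _mm256_set1_epi64x(3);
 *   __m256i b = _mm256_set1_epi64x(5);
 *   __m256i p = _mm256_mullo_epi64(a, b);  // every 64-bit lane == 15
 */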
__builtin_ia32_andnpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) __W, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_andnot_pd (__mmask8 __U, __m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_andnpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_andnot_pd (__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B) +{ + return (__m128d) __builtin_ia32_andnpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_andnot_pd (__mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_andnpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_andnot_ps (__m256 __W, __mmask8 __U, __m256 __A, + __m256 __B) +{ + return (__m256) __builtin_ia32_andnps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __W, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_andnot_ps (__mmask8 __U, __m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_andnps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_andnot_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_andnps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_andnot_ps (__mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_andnps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtps_epi64 (__m128 __A) +{ + return (__m256i) __builtin_ia32_cvtps2qq256_mask ((__v4sf) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtps_epi64 (__m256i __W, __mmask8 __U, __m128 __A) +{ + return (__m256i) __builtin_ia32_cvtps2qq256_mask ((__v4sf) __A, + (__v4di) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtps_epi64 (__mmask8 __U, __m128 __A) +{ + return (__m256i) __builtin_ia32_cvtps2qq256_mask ((__v4sf) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtps_epi64 (__m128 __A) +{ + return (__m128i) __builtin_ia32_cvtps2qq128_mask ((__v4sf) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) -1); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtps_epi64 (__m128i __W, __mmask8 __U, __m128 __A) +{ + return (__m128i) __builtin_ia32_cvtps2qq128_mask ((__v4sf) __A, + (__v2di) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm_maskz_cvtps_epi64 (__mmask8 __U, __m128 __A) +{ + return (__m128i) __builtin_ia32_cvtps2qq128_mask ((__v4sf) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtps_epu64 (__m128 __A) +{ + return (__m256i) __builtin_ia32_cvtps2uqq256_mask ((__v4sf) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtps_epu64 (__m256i __W, __mmask8 __U, __m128 __A) +{ + return (__m256i) __builtin_ia32_cvtps2uqq256_mask ((__v4sf) __A, + (__v4di) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtps_epu64 (__mmask8 __U, __m128 __A) +{ + return (__m256i) __builtin_ia32_cvtps2uqq256_mask ((__v4sf) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtps_epu64 (__m128 __A) +{ + return (__m128i) __builtin_ia32_cvtps2uqq128_mask ((__v4sf) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) -1); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtps_epu64 (__m128i __W, __mmask8 __U, __m128 __A) +{ + return (__m128i) __builtin_ia32_cvtps2uqq128_mask ((__v4sf) __A, + (__v2di) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtps_epu64 (__mmask8 __U, __m128 __A) +{ + return (__m128i) __builtin_ia32_cvtps2uqq128_mask ((__v4sf) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepi64_ps (__m256i __A) +{ + return (__m128) __builtin_ia32_cvtqq2ps256_mask ((__v4di) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) -1); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepi64_ps (__m128 __W, __mmask8 __U, __m256i __A) +{ + return (__m128) __builtin_ia32_cvtqq2ps256_mask ((__v4di) __A, + (__v4sf) __W, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtepi64_ps (__mmask8 __U, __m256i __A) +{ + return (__m128) __builtin_ia32_cvtqq2ps256_mask ((__v4di) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi64_ps (__m128i __A) +{ + return (__m128) __builtin_ia32_cvtqq2ps128_mask ((__v2di) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) -1); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepi64_ps (__m128 __W, __mmask8 __U, __m128i __A) +{ + return (__m128) __builtin_ia32_cvtqq2ps128_mask ((__v2di) __A, + (__v4sf) __W, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtepi64_ps (__mmask8 __U, __m128i __A) +{ + return (__m128) __builtin_ia32_cvtqq2ps128_mask ((__v2di) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepu64_ps (__m256i __A) +{ + return (__m128) __builtin_ia32_cvtuqq2ps256_mask ((__v4di) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) 
-1); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepu64_ps (__m128 __W, __mmask8 __U, __m256i __A) +{ + return (__m128) __builtin_ia32_cvtuqq2ps256_mask ((__v4di) __A, + (__v4sf) __W, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtepu64_ps (__mmask8 __U, __m256i __A) +{ + return (__m128) __builtin_ia32_cvtuqq2ps256_mask ((__v4di) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepu64_ps (__m128i __A) +{ + return (__m128) __builtin_ia32_cvtuqq2ps128_mask ((__v2di) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) -1); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepu64_ps (__m128 __W, __mmask8 __U, __m128i __A) +{ + return (__m128) __builtin_ia32_cvtuqq2ps128_mask ((__v2di) __A, + (__v4sf) __W, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtepu64_ps (__mmask8 __U, __m128i __A) +{ + return (__m128) __builtin_ia32_cvtuqq2ps128_mask ((__v2di) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepi64_pd (__m256i __A) +{ + return (__m256d) __builtin_ia32_cvtqq2pd256_mask ((__v4di) __A, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) -1); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepi64_pd (__m256d __W, __mmask8 __U, __m256i __A) +{ + return (__m256d) __builtin_ia32_cvtqq2pd256_mask ((__v4di) __A, + (__v4df) __W, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtepi64_pd (__mmask8 __U, __m256i __A) +{ + return (__m256d) __builtin_ia32_cvtqq2pd256_mask ((__v4di) __A, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi64_pd (__m128i __A) +{ + return (__m128d) __builtin_ia32_cvtqq2pd128_mask ((__v2di) __A, + (__v2df) + _mm_setzero_pd (), + (__mmask8) -1); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepi64_pd (__m128d __W, __mmask8 __U, __m128i __A) +{ + return (__m128d) __builtin_ia32_cvtqq2pd128_mask ((__v2di) __A, + (__v2df) __W, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtepi64_pd (__mmask8 __U, __m128i __A) +{ + return (__m128d) __builtin_ia32_cvtqq2pd128_mask ((__v2di) __A, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepu64_pd (__m256i __A) +{ + return (__m256d) __builtin_ia32_cvtuqq2pd256_mask ((__v4di) __A, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) -1); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepu64_pd (__m256d __W, __mmask8 __U, __m256i __A) +{ + return (__m256d) __builtin_ia32_cvtuqq2pd256_mask ((__v4di) __A, + (__v4df) __W, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtepu64_pd (__mmask8 __U, 
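/*
 * [Editorial aside, not part of the patch] The int64-to-float conversions
 * above narrow: four 64-bit lanes of a __m256i come back as four floats in
 * a __m128, while the int64-to-double forms keep the full 256-bit width,
 * as the signatures show. Sketch (values illustrative):
 *
 *   __m256i q = _mm256_set1_epi64x(7);
 *   __m128  f = _mm256_cvtepi64_ps(q);  // four floats  {7, 7, 7, 7}
 *   __m256d d = _mm256_cvtepi64_pd(q);  // four doubles {7, 7, 7, 7}
 */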
__m256i __A) +{ + return (__m256d) __builtin_ia32_cvtuqq2pd256_mask ((__v4di) __A, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_and_pd (__m256d __W, __mmask8 __U, __m256d __A, + __m256d __B) +{ + return (__m256d) __builtin_ia32_andpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) __W, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_and_pd (__mmask8 __U, __m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_andpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_and_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_andpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_and_pd (__mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_andpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_and_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_andps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __W, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_and_ps (__mmask8 __U, __m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_andps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_and_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_andps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_and_ps (__mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_andps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepu64_pd (__m128i __A) +{ + return (__m128d) __builtin_ia32_cvtuqq2pd128_mask ((__v2di) __A, + (__v2df) + _mm_setzero_pd (), + (__mmask8) -1); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepu64_pd (__m128d __W, __mmask8 __U, __m128i __A) +{ + return (__m128d) __builtin_ia32_cvtuqq2pd128_mask ((__v2di) __A, + (__v2df) __W, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtepu64_pd (__mmask8 __U, __m128i __A) +{ + return (__m128d) __builtin_ia32_cvtuqq2pd128_mask ((__v2di) __A, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_xor_pd (__m256d __W, __mmask8 __U, __m256d __A, + __m256d __B) +{ + return (__m256d) __builtin_ia32_xorpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) __W, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ 
((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_xor_pd (__mmask8 __U, __m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_xorpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_xor_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_xorpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_xor_pd (__mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_xorpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_xor_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_xorps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __W, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_xor_ps (__mmask8 __U, __m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_xorps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_xor_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_xorps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_xor_ps (__mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_xorps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_or_pd (__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_orpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) __W, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_or_pd (__mmask8 __U, __m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_orpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_or_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_orpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_or_pd (__mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_orpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_or_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_orps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __W, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_or_ps (__mmask8 __U, __m256 __A, __m256 __B) +{ + 
return (__m256) __builtin_ia32_orps256_mask ((__v8sf) __A,
+                                              (__v8sf) __B,
+                                              (__v8sf)
+                                              _mm256_setzero_ps (),
+                                              (__mmask8) __U);
+}
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_or_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_orps128_mask ((__v4sf) __A,
+                                               (__v4sf) __B,
+                                               (__v4sf) __W,
+                                               (__mmask8) __U);
+}
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_or_ps (__mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_orps128_mask ((__v4sf) __A,
+                                               (__v4sf) __B,
+                                               (__v4sf)
+                                               _mm_setzero_ps (),
+                                               (__mmask8) __U);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movm_epi32 (__mmask8 __A)
+{
+  return (__m128i) __builtin_ia32_cvtmask2d128 (__A);
+}
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_movm_epi32 (__mmask8 __A)
+{
+  return (__m256i) __builtin_ia32_cvtmask2d256 (__A);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movm_epi64 (__mmask8 __A)
+{
+  return (__m128i) __builtin_ia32_cvtmask2q128 (__A);
+}
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_movm_epi64 (__mmask8 __A)
+{
+  return (__m256i) __builtin_ia32_cvtmask2q256 (__A);
+}
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movepi32_mask (__m128i __A)
+{
+  return (__mmask8) __builtin_ia32_cvtd2mask128 ((__v4si) __A);
+}
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_movepi32_mask (__m256i __A)
+{
+  return (__mmask8) __builtin_ia32_cvtd2mask256 ((__v8si) __A);
+}
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movepi64_mask (__m128i __A)
+{
+  return (__mmask8) __builtin_ia32_cvtq2mask128 ((__v2di) __A);
+}
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_movepi64_mask (__m256i __A)
+{
+  return (__mmask8) __builtin_ia32_cvtq2mask256 ((__v4di) __A);
+}
+#ifdef __OPTIMIZE__
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_extractf64x2_pd (__m256d __A, const int __imm)
+{
+  return (__m128d) __builtin_ia32_extractf64x2_256_mask ((__v4df) __A,
+                                                         __imm,
+                                                         (__v2df)
+                                                         _mm_setzero_pd (),
+                                                         (__mmask8) -1);
+}
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_extractf64x2_pd (__m128d __W, __mmask8 __U, __m256d __A,
+                             const int __imm)
+{
+  return (__m128d) __builtin_ia32_extractf64x2_256_mask ((__v4df) __A,
+                                                         __imm,
+                                                         (__v2df) __W,
+                                                         (__mmask8)
+                                                         __U);
+}
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_extractf64x2_pd (__mmask8 __U, __m256d __A,
+                              const int __imm)
+{
+  return (__m128d) __builtin_ia32_extractf64x2_256_mask ((__v4df) __A,
+                                                         __imm,
+                                                         (__v2df)
+                                                         _mm_setzero_pd (),
+                                                         (__mmask8)
+                                                         __U);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_extracti64x2_epi64 (__m256i __A, const int __imm)
+{
+  return (__m128i) __builtin_ia32_extracti64x2_256_mask ((__v4di) __A,
+                                                         __imm,
+                                                         (__v2di)
+                                                         _mm_setzero_si128 (),
+                                                         (__mmask8) -1);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
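/*
 * [Editorial aside, not part of the patch] The movm/movepi pairs above
 * convert between mask registers and vectors: _mm*_movm_epi{32,64} turn
 * each set mask bit into an all-ones lane, and _mm*_movepi{32,64}_mask
 * read the lanes' sign bits back out, so the two round-trip:
 *
 *   __m256i v  = _mm256_movm_epi32(0xA5);   // lanes 0,2,5,7 == -1
 *   __mmask8 m = _mm256_movepi32_mask(v);   // m == 0xA5
 */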
+_mm256_mask_extracti64x2_epi64 (__m128i __W, __mmask8 __U, __m256i __A, + const int __imm) +{ + return (__m128i) __builtin_ia32_extracti64x2_256_mask ((__v4di) __A, + __imm, + (__v2di) __W, + (__mmask8) + __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_extracti64x2_epi64 (__mmask8 __U, __m256i __A, + const int __imm) +{ + return (__m128i) __builtin_ia32_extracti64x2_256_mask ((__v4di) __A, + __imm, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) + __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_reduce_pd (__m256d __A, int __B) +{ + return (__m256d) __builtin_ia32_reducepd256_mask ((__v4df) __A, __B, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) -1); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_reduce_pd (__m256d __W, __mmask8 __U, __m256d __A, int __B) +{ + return (__m256d) __builtin_ia32_reducepd256_mask ((__v4df) __A, __B, + (__v4df) __W, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_reduce_pd (__mmask8 __U, __m256d __A, int __B) +{ + return (__m256d) __builtin_ia32_reducepd256_mask ((__v4df) __A, __B, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_reduce_pd (__m128d __A, int __B) +{ + return (__m128d) __builtin_ia32_reducepd128_mask ((__v2df) __A, __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) -1); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_reduce_pd (__m128d __W, __mmask8 __U, __m128d __A, int __B) +{ + return (__m128d) __builtin_ia32_reducepd128_mask ((__v2df) __A, __B, + (__v2df) __W, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_reduce_pd (__mmask8 __U, __m128d __A, int __B) +{ + return (__m128d) __builtin_ia32_reducepd128_mask ((__v2df) __A, __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_reduce_ps (__m256 __A, int __B) +{ + return (__m256) __builtin_ia32_reduceps256_mask ((__v8sf) __A, __B, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) -1); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_reduce_ps (__m256 __W, __mmask8 __U, __m256 __A, int __B) +{ + return (__m256) __builtin_ia32_reduceps256_mask ((__v8sf) __A, __B, + (__v8sf) __W, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_reduce_ps (__mmask8 __U, __m256 __A, int __B) +{ + return (__m256) __builtin_ia32_reduceps256_mask ((__v8sf) __A, __B, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_reduce_ps (__m128 __A, int __B) +{ + return (__m128) __builtin_ia32_reduceps128_mask ((__v4sf) __A, __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) -1); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_reduce_ps (__m128 __W, __mmask8 __U, __m128 __A, int __B) +{ + return (__m128) __builtin_ia32_reduceps128_mask ((__v4sf) __A, __B, + (__v4sf) __W, + (__mmask8) __U); +} +extern __inline __m128 
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_reduce_ps (__mmask8 __U, __m128 __A, int __B) +{ + return (__m128) __builtin_ia32_reduceps128_mask ((__v4sf) __A, __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_range_pd (__m256d __A, __m256d __B, int __C) +{ + return (__m256d) __builtin_ia32_rangepd256_mask ((__v4df) __A, + (__v4df) __B, __C, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) -1); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_range_pd (__m256d __W, __mmask8 __U, + __m256d __A, __m256d __B, int __C) +{ + return (__m256d) __builtin_ia32_rangepd256_mask ((__v4df) __A, + (__v4df) __B, __C, + (__v4df) __W, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_range_pd (__mmask8 __U, __m256d __A, __m256d __B, int __C) +{ + return (__m256d) __builtin_ia32_rangepd256_mask ((__v4df) __A, + (__v4df) __B, __C, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_range_pd (__m128d __A, __m128d __B, int __C) +{ + return (__m128d) __builtin_ia32_rangepd128_mask ((__v2df) __A, + (__v2df) __B, __C, + (__v2df) + _mm_setzero_pd (), + (__mmask8) -1); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_range_pd (__m128d __W, __mmask8 __U, + __m128d __A, __m128d __B, int __C) +{ + return (__m128d) __builtin_ia32_rangepd128_mask ((__v2df) __A, + (__v2df) __B, __C, + (__v2df) __W, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_range_pd (__mmask8 __U, __m128d __A, __m128d __B, int __C) +{ + return (__m128d) __builtin_ia32_rangepd128_mask ((__v2df) __A, + (__v2df) __B, __C, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_range_ps (__m256 __A, __m256 __B, int __C) +{ + return (__m256) __builtin_ia32_rangeps256_mask ((__v8sf) __A, + (__v8sf) __B, __C, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) -1); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_range_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B, + int __C) +{ + return (__m256) __builtin_ia32_rangeps256_mask ((__v8sf) __A, + (__v8sf) __B, __C, + (__v8sf) __W, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_range_ps (__mmask8 __U, __m256 __A, __m256 __B, int __C) +{ + return (__m256) __builtin_ia32_rangeps256_mask ((__v8sf) __A, + (__v8sf) __B, __C, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_range_ps (__m128 __A, __m128 __B, int __C) +{ + return (__m128) __builtin_ia32_rangeps128_mask ((__v4sf) __A, + (__v4sf) __B, __C, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) -1); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_range_ps (__m128 __W, __mmask8 __U, + __m128 __A, __m128 __B, int __C) +{ + return (__m128) __builtin_ia32_rangeps128_mask ((__v4sf) __A, + (__v4sf) __B, __C, + (__v4sf) __W, + (__mmask8) __U); +} +extern 
__inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_range_ps (__mmask8 __U, __m128 __A, __m128 __B, int __C) +{ + return (__m128) __builtin_ia32_rangeps128_mask ((__v4sf) __A, + (__v4sf) __B, __C, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fpclass_pd_mask (__mmask8 __U, __m256d __A, + const int __imm) +{ + return (__mmask8) __builtin_ia32_fpclasspd256_mask ((__v4df) __A, + __imm, __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_fpclass_pd_mask (__m256d __A, const int __imm) +{ + return (__mmask8) __builtin_ia32_fpclasspd256_mask ((__v4df) __A, + __imm, + (__mmask8) -1); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fpclass_ps_mask (__mmask8 __U, __m256 __A, const int __imm) +{ + return (__mmask8) __builtin_ia32_fpclassps256_mask ((__v8sf) __A, + __imm, __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_fpclass_ps_mask (__m256 __A, const int __imm) +{ + return (__mmask8) __builtin_ia32_fpclassps256_mask ((__v8sf) __A, + __imm, + (__mmask8) -1); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fpclass_pd_mask (__mmask8 __U, __m128d __A, const int __imm) +{ + return (__mmask8) __builtin_ia32_fpclasspd128_mask ((__v2df) __A, + __imm, __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fpclass_pd_mask (__m128d __A, const int __imm) +{ + return (__mmask8) __builtin_ia32_fpclasspd128_mask ((__v2df) __A, + __imm, + (__mmask8) -1); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fpclass_ps_mask (__mmask8 __U, __m128 __A, const int __imm) +{ + return (__mmask8) __builtin_ia32_fpclassps128_mask ((__v4sf) __A, + __imm, __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fpclass_ps_mask (__m128 __A, const int __imm) +{ + return (__mmask8) __builtin_ia32_fpclassps128_mask ((__v4sf) __A, + __imm, + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_inserti64x2 (__m256i __A, __m128i __B, const int __imm) +{ + return (__m256i) __builtin_ia32_inserti64x2_256_mask ((__v4di) __A, + (__v2di) __B, + __imm, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_inserti64x2 (__m256i __W, __mmask8 __U, __m256i __A, + __m128i __B, const int __imm) +{ + return (__m256i) __builtin_ia32_inserti64x2_256_mask ((__v4di) __A, + (__v2di) __B, + __imm, + (__v4di) __W, + (__mmask8) + __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_inserti64x2 (__mmask8 __U, __m256i __A, __m128i __B, + const int __imm) +{ + return (__m256i) __builtin_ia32_inserti64x2_256_mask ((__v4di) __A, + (__v2di) __B, + __imm, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) + __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_insertf64x2 (__m256d __A, __m128d __B, const int __imm) +{ + return (__m256d) __builtin_ia32_insertf64x2_256_mask ((__v4df) __A, + (__v2df) __B, + 
__imm, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) -1); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_insertf64x2 (__m256d __W, __mmask8 __U, __m256d __A, + __m128d __B, const int __imm) +{ + return (__m256d) __builtin_ia32_insertf64x2_256_mask ((__v4df) __A, + (__v2df) __B, + __imm, + (__v4df) __W, + (__mmask8) + __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_insertf64x2 (__mmask8 __U, __m256d __A, __m128d __B, + const int __imm) +{ + return (__m256d) __builtin_ia32_insertf64x2_256_mask ((__v4df) __A, + (__v2df) __B, + __imm, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) + __U); +} +#else +#define _mm256_insertf64x2(X, Y, C) ((__m256d) __builtin_ia32_insertf64x2_256_mask ((__v4df)(__m256d) (X), (__v2df)(__m128d) (Y), (int) (C), (__v4df)(__m256d)_mm256_setzero_pd(), (__mmask8)-1)) +#define _mm256_mask_insertf64x2(W, U, X, Y, C) ((__m256d) __builtin_ia32_insertf64x2_256_mask ((__v4df)(__m256d) (X), (__v2df)(__m128d) (Y), (int) (C), (__v4df)(__m256d)(W), (__mmask8)(U))) +#define _mm256_maskz_insertf64x2(U, X, Y, C) ((__m256d) __builtin_ia32_insertf64x2_256_mask ((__v4df)(__m256d) (X), (__v2df)(__m128d) (Y), (int) (C), (__v4df)(__m256d)_mm256_setzero_pd(), (__mmask8)(U))) +#define _mm256_inserti64x2(X, Y, C) ((__m256i) __builtin_ia32_inserti64x2_256_mask ((__v4di)(__m256i) (X), (__v2di)(__m128i) (Y), (int) (C), (__v4di)(__m256i)_mm256_setzero_si256 (), (__mmask8)-1)) +#define _mm256_mask_inserti64x2(W, U, X, Y, C) ((__m256i) __builtin_ia32_inserti64x2_256_mask ((__v4di)(__m256i) (X), (__v2di)(__m128i) (Y), (int) (C), (__v4di)(__m256i)(W), (__mmask8)(U))) +#define _mm256_maskz_inserti64x2(U, X, Y, C) ((__m256i) __builtin_ia32_inserti64x2_256_mask ((__v4di)(__m256i) (X), (__v2di)(__m128i) (Y), (int) (C), (__v4di)(__m256i)_mm256_setzero_si256 (), (__mmask8)(U))) +#define _mm256_extractf64x2_pd(X, C) ((__m128d) __builtin_ia32_extractf64x2_256_mask ((__v4df)(__m256d) (X), (int) (C), (__v2df)(__m128d) _mm_setzero_pd(), (__mmask8)-1)) +#define _mm256_mask_extractf64x2_pd(W, U, X, C) ((__m128d) __builtin_ia32_extractf64x2_256_mask ((__v4df)(__m256d) (X), (int) (C), (__v2df)(__m128d) (W), (__mmask8) (U))) +#define _mm256_maskz_extractf64x2_pd(U, X, C) ((__m128d) __builtin_ia32_extractf64x2_256_mask ((__v4df)(__m256d) (X), (int) (C), (__v2df)(__m128d) _mm_setzero_pd(), (__mmask8) (U))) +#define _mm256_extracti64x2_epi64(X, C) ((__m128i) __builtin_ia32_extracti64x2_256_mask ((__v4di)(__m256i) (X), (int) (C), (__v2di)(__m128i) _mm_setzero_si128 (), (__mmask8)-1)) +#define _mm256_mask_extracti64x2_epi64(W, U, X, C) ((__m128i) __builtin_ia32_extracti64x2_256_mask ((__v4di)(__m256i) (X), (int) (C), (__v2di)(__m128i) (W), (__mmask8) (U))) +#define _mm256_maskz_extracti64x2_epi64(U, X, C) ((__m128i) __builtin_ia32_extracti64x2_256_mask ((__v4di)(__m256i) (X), (int) (C), (__v2di)(__m128i) _mm_setzero_si128 (), (__mmask8) (U))) +#define _mm256_reduce_pd(A, B) ((__m256d) __builtin_ia32_reducepd256_mask ((__v4df)(__m256d)(A), (int)(B), (__v4df)_mm256_setzero_pd(), (__mmask8)-1)) +#define _mm256_mask_reduce_pd(W, U, A, B) ((__m256d) __builtin_ia32_reducepd256_mask ((__v4df)(__m256d)(A), (int)(B), (__v4df)(__m256d)(W), (__mmask8)(U))) +#define _mm256_maskz_reduce_pd(U, A, B) ((__m256d) __builtin_ia32_reducepd256_mask ((__v4df)(__m256d)(A), (int)(B), (__v4df)_mm256_setzero_pd(), (__mmask8)(U))) +#define _mm_reduce_pd(A, B) ((__m128d) __builtin_ia32_reducepd128_mask 
((__v2df)(__m128d)(A), (int)(B), (__v2df)_mm_setzero_pd(), (__mmask8)-1)) +#define _mm_mask_reduce_pd(W, U, A, B) ((__m128d) __builtin_ia32_reducepd128_mask ((__v2df)(__m128d)(A), (int)(B), (__v2df)(__m128d)(W), (__mmask8)(U))) +#define _mm_maskz_reduce_pd(U, A, B) ((__m128d) __builtin_ia32_reducepd128_mask ((__v2df)(__m128d)(A), (int)(B), (__v2df)_mm_setzero_pd(), (__mmask8)(U))) +#define _mm256_reduce_ps(A, B) ((__m256) __builtin_ia32_reduceps256_mask ((__v8sf)(__m256)(A), (int)(B), (__v8sf)_mm256_setzero_ps(), (__mmask8)-1)) +#define _mm256_mask_reduce_ps(W, U, A, B) ((__m256) __builtin_ia32_reduceps256_mask ((__v8sf)(__m256)(A), (int)(B), (__v8sf)(__m256)(W), (__mmask8)(U))) +#define _mm256_maskz_reduce_ps(U, A, B) ((__m256) __builtin_ia32_reduceps256_mask ((__v8sf)(__m256)(A), (int)(B), (__v8sf)_mm256_setzero_ps(), (__mmask8)(U))) +#define _mm_reduce_ps(A, B) ((__m128) __builtin_ia32_reduceps128_mask ((__v4sf)(__m128)(A), (int)(B), (__v4sf)_mm_setzero_ps(), (__mmask8)-1)) +#define _mm_mask_reduce_ps(W, U, A, B) ((__m128) __builtin_ia32_reduceps128_mask ((__v4sf)(__m128)(A), (int)(B), (__v4sf)(__m128)(W), (__mmask8)(U))) +#define _mm_maskz_reduce_ps(U, A, B) ((__m128) __builtin_ia32_reduceps128_mask ((__v4sf)(__m128)(A), (int)(B), (__v4sf)_mm_setzero_ps(), (__mmask8)(U))) +#define _mm256_range_pd(A, B, C) ((__m256d) __builtin_ia32_rangepd256_mask ((__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), (__v4df)_mm256_setzero_pd(), (__mmask8)-1)) +#define _mm256_maskz_range_pd(U, A, B, C) ((__m256d) __builtin_ia32_rangepd256_mask ((__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), (__v4df)_mm256_setzero_pd(), (__mmask8)(U))) +#define _mm_range_pd(A, B, C) ((__m128d) __builtin_ia32_rangepd128_mask ((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), (__v2df)_mm_setzero_pd(), (__mmask8)-1)) +#define _mm256_range_ps(A, B, C) ((__m256) __builtin_ia32_rangeps256_mask ((__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), (__v8sf)_mm256_setzero_ps(), (__mmask8)-1)) +#define _mm256_mask_range_ps(W, U, A, B, C) ((__m256) __builtin_ia32_rangeps256_mask ((__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), (__v8sf)(__m256)(W), (__mmask8)(U))) +#define _mm256_maskz_range_ps(U, A, B, C) ((__m256) __builtin_ia32_rangeps256_mask ((__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), (__v8sf)_mm256_setzero_ps(), (__mmask8)(U))) +#define _mm_range_ps(A, B, C) ((__m128) __builtin_ia32_rangeps128_mask ((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf)_mm_setzero_ps(), (__mmask8)-1)) +#define _mm_mask_range_ps(W, U, A, B, C) ((__m128) __builtin_ia32_rangeps128_mask ((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf)(__m128)(W), (__mmask8)(U))) +#define _mm_maskz_range_ps(U, A, B, C) ((__m128) __builtin_ia32_rangeps128_mask ((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf)_mm_setzero_ps(), (__mmask8)(U))) +#define _mm256_mask_range_pd(W, U, A, B, C) ((__m256d) __builtin_ia32_rangepd256_mask ((__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), (__v4df)(__m256d)(W), (__mmask8)(U))) +#define _mm_mask_range_pd(W, U, A, B, C) ((__m128d) __builtin_ia32_rangepd128_mask ((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), (__v2df)(__m128d)(W), (__mmask8)(U))) +#define _mm_maskz_range_pd(U, A, B, C) ((__m128d) __builtin_ia32_rangepd128_mask ((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), (__v2df)_mm_setzero_pd(), (__mmask8)(U))) +#define _mm256_mask_fpclass_pd_mask(u, X, C) ((__mmask8) __builtin_ia32_fpclasspd256_mask ((__v4df) (__m256d) (X), (int) (C),(__mmask8)(u))) 
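/*
 * [Editorial aside, not part of the patch] These builtins require their
 * immediate argument to be a compile-time constant. Under __OPTIMIZE__
 * the inline-function forms work because the constant survives inlining;
 * without optimization that is not guaranteed, so this #else branch
 * restates the same intrinsics as macros that paste the literal straight
 * into the builtin call. Call sites look identical either way, e.g.:
 *
 *   __m128d lo = _mm256_extractf64x2_pd(v, 0);  // low 128-bit half of v
 */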
+#define _mm256_mask_fpclass_ps_mask(u, X, C) ((__mmask8) __builtin_ia32_fpclassps256_mask ((__v8sf) (__m256) (X), (int) (C),(__mmask8)(u)))
+#define _mm_mask_fpclass_pd_mask(u, X, C) ((__mmask8) __builtin_ia32_fpclasspd128_mask ((__v2df) (__m128d) (X), (int) (C),(__mmask8)(u)))
+#define _mm_mask_fpclass_ps_mask(u, X, C) ((__mmask8) __builtin_ia32_fpclassps128_mask ((__v4sf) (__m128) (X), (int) (C),(__mmask8)(u)))
+#define _mm256_fpclass_pd_mask(X, C) ((__mmask8) __builtin_ia32_fpclasspd256_mask ((__v4df) (__m256d) (X), (int) (C),(__mmask8)-1))
+#define _mm256_fpclass_ps_mask(X, C) ((__mmask8) __builtin_ia32_fpclassps256_mask ((__v8sf) (__m256) (X), (int) (C),(__mmask8)-1))
+#define _mm_fpclass_pd_mask(X, C) ((__mmask8) __builtin_ia32_fpclasspd128_mask ((__v2df) (__m128d) (X), (int) (C),(__mmask8)-1))
+#define _mm_fpclass_ps_mask(X, C) ((__mmask8) __builtin_ia32_fpclassps128_mask ((__v4sf) (__m128) (X), (int) (C),(__mmask8)-1))
+#endif
 #ifdef __DISABLE_AVX512VLDQ__
 #undef __DISABLE_AVX512VLDQ__
 #pragma GCC pop_options
-#endif /* __DISABLE_AVX512VLDQ__ */
-
-#endif /* _AVX512VLDQINTRIN_H_INCLUDED */
+#endif
+#endif
+#endif
diff --git a/third_party/intel/avx512vlintrin.internal.h b/third_party/intel/avx512vlintrin.internal.h
index 534d1fdf8..756e1c767 100644
--- a/third_party/intel/avx512vlintrin.internal.h
+++ b/third_party/intel/avx512vlintrin.internal.h
@@ -1,8044 +1,11530 @@
+/* clang-format off */
+#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0)
 #ifndef _IMMINTRIN_H_INCLUDED
 #error "Never use directly; include instead."
 #endif
-
 #ifndef _AVX512VLINTRIN_H_INCLUDED
 #define _AVX512VLINTRIN_H_INCLUDED
-
 #ifndef __AVX512VL__
 #pragma GCC push_options
 #pragma GCC target("avx512vl")
 #define __DISABLE_AVX512VL__
-#endif /* __AVX512VL__ */
-
+#endif
 typedef unsigned int __mmask32;
-
-__funline __m256d _mm256_mask_mov_pd(__m256d __W, __mmask8 __U, __m256d __A) {
-  return (__m256d)__builtin_ia32_movapd256_mask((__v4df)__A, (__v4df)__W,
-                                                (__mmask8)__U);
-}
-
-__funline __m256d _mm256_maskz_mov_pd(__mmask8 __U, __m256d __A) {
-  return (__m256d)__builtin_ia32_movapd256_mask(
-      (__v4df)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)__U);
-}
-
-__funline __m128d _mm_mask_mov_pd(__m128d __W, __mmask8 __U, __m128d __A) {
-  return (__m128d)__builtin_ia32_movapd128_mask((__v2df)__A, (__v2df)__W,
-                                                (__mmask8)__U);
-}
-
-__funline __m128d _mm_maskz_mov_pd(__mmask8 __U, __m128d __A) {
-  return (__m128d)__builtin_ia32_movapd128_mask(
-      (__v2df)__A, (__v2df)_mm_setzero_pd(), (__mmask8)__U);
-}
-
-__funline __m256d _mm256_mask_load_pd(__m256d __W, __mmask8 __U,
-                                     void const *__P) {
-  return (__m256d)__builtin_ia32_loadapd256_mask((__v4df *)__P, (__v4df)__W,
-                                                 (__mmask8)__U);
-}
-
-__funline __m256d _mm256_maskz_load_pd(__mmask8 __U, void const *__P) {
-  return (__m256d)__builtin_ia32_loadapd256_mask(
-      (__v4df *)__P, (__v4df)_mm256_setzero_pd(), (__mmask8)__U);
-}
-
-__funline __m128d _mm_mask_load_pd(__m128d __W, __mmask8 __U, void const *__P) {
-  return (__m128d)__builtin_ia32_loadapd128_mask((__v2df *)__P, (__v2df)__W,
-                                                 (__mmask8)__U);
-}
-
-__funline __m128d _mm_maskz_load_pd(__mmask8 __U, void const *__P) {
-  return (__m128d)__builtin_ia32_loadapd128_mask(
-      (__v2df *)__P, (__v2df)_mm_setzero_pd(), (__mmask8)__U);
-}
-
-__funline void _mm256_mask_store_pd(void *__P, __mmask8 __U, __m256d __A) {
-  __builtin_ia32_storeapd256_mask((__v4df *)__P, (__v4df)__A, (__mmask8)__U);
-}
-
-__funline void _mm_mask_store_pd(void *__P, __mmask8 __U, __m128d __A) {
-  __builtin_ia32_storeapd128_mask((__v2df *)__P,
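/*
 * [Editorial aside, not part of the patch] From this hunk onward the
 * patch rewrites avx512vlintrin.internal.h wholesale: the repo-local
 * __funline shorthand (presumably a macro expanding to extern __inline
 * plus the gnu_inline/always_inline/artificial attributes, matching the
 * replacement text) gives way to the fully spelled-out GCC form, the
 * header gains a #if defined(__x86_64__) && !(__ASSEMBLER__ +
 * __LINKER__ + 0) guard so non-x86_64 targets and assembler/linker
 * passes skip it, and clang-format is switched off because the body is
 * vendored GCC code.
 */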
(__v2df)__A, (__mmask8)__U); -} - -__funline __m256 _mm256_mask_mov_ps(__m256 __W, __mmask8 __U, __m256 __A) { - return (__m256)__builtin_ia32_movaps256_mask((__v8sf)__A, (__v8sf)__W, - (__mmask8)__U); -} - -__funline __m256 _mm256_maskz_mov_ps(__mmask8 __U, __m256 __A) { - return (__m256)__builtin_ia32_movaps256_mask( - (__v8sf)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); -} - -__funline __m128 _mm_mask_mov_ps(__m128 __W, __mmask8 __U, __m128 __A) { - return (__m128)__builtin_ia32_movaps128_mask((__v4sf)__A, (__v4sf)__W, - (__mmask8)__U); -} - -__funline __m128 _mm_maskz_mov_ps(__mmask8 __U, __m128 __A) { - return (__m128)__builtin_ia32_movaps128_mask( - (__v4sf)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); -} - -__funline __m256 _mm256_mask_load_ps(__m256 __W, __mmask8 __U, void const *__P) { - return (__m256)__builtin_ia32_loadaps256_mask((__v8sf *)__P, (__v8sf)__W, - (__mmask8)__U); -} - -__funline __m256 _mm256_maskz_load_ps(__mmask8 __U, void const *__P) { - return (__m256)__builtin_ia32_loadaps256_mask( - (__v8sf *)__P, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); -} - -__funline __m128 _mm_mask_load_ps(__m128 __W, __mmask8 __U, void const *__P) { - return (__m128)__builtin_ia32_loadaps128_mask((__v4sf *)__P, (__v4sf)__W, - (__mmask8)__U); -} - -__funline __m128 _mm_maskz_load_ps(__mmask8 __U, void const *__P) { - return (__m128)__builtin_ia32_loadaps128_mask( - (__v4sf *)__P, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); -} - -__funline void _mm256_mask_store_ps(void *__P, __mmask8 __U, __m256 __A) { - __builtin_ia32_storeaps256_mask((__v8sf *)__P, (__v8sf)__A, (__mmask8)__U); -} - -__funline void _mm_mask_store_ps(void *__P, __mmask8 __U, __m128 __A) { - __builtin_ia32_storeaps128_mask((__v4sf *)__P, (__v4sf)__A, (__mmask8)__U); -} - -__funline __m256i _mm256_mask_mov_epi64(__m256i __W, __mmask8 __U, __m256i __A) { - return (__m256i)__builtin_ia32_movdqa64_256_mask((__v4di)__A, (__v4di)__W, - (__mmask8)__U); -} - -__funline __m256i _mm256_maskz_mov_epi64(__mmask8 __U, __m256i __A) { - return (__m256i)__builtin_ia32_movdqa64_256_mask( - (__v4di)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); -} - -__funline __m128i _mm_mask_mov_epi64(__m128i __W, __mmask8 __U, __m128i __A) { - return (__m128i)__builtin_ia32_movdqa64_128_mask((__v2di)__A, (__v2di)__W, - (__mmask8)__U); -} - -__funline __m128i _mm_maskz_mov_epi64(__mmask8 __U, __m128i __A) { - return (__m128i)__builtin_ia32_movdqa64_128_mask( - (__v2di)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m256i _mm256_mask_load_epi64(__m256i __W, __mmask8 __U, - void const *__P) { - return (__m256i)__builtin_ia32_movdqa64load256_mask( - (__v4di *)__P, (__v4di)__W, (__mmask8)__U); -} - -__funline __m256i _mm256_maskz_load_epi64(__mmask8 __U, void const *__P) { - return (__m256i)__builtin_ia32_movdqa64load256_mask( - (__v4di *)__P, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); -} - -__funline __m128i _mm_mask_load_epi64(__m128i __W, __mmask8 __U, - void const *__P) { - return (__m128i)__builtin_ia32_movdqa64load128_mask( - (__v2di *)__P, (__v2di)__W, (__mmask8)__U); -} - -__funline __m128i _mm_maskz_load_epi64(__mmask8 __U, void const *__P) { - return (__m128i)__builtin_ia32_movdqa64load128_mask( - (__v2di *)__P, (__v2di)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline void _mm256_mask_store_epi64(void *__P, __mmask8 __U, __m256i __A) { - __builtin_ia32_movdqa64store256_mask((__v4di *)__P, (__v4di)__A, - (__mmask8)__U); -} - -__funline void _mm_mask_store_epi64(void *__P, __mmask8 __U, __m128i __A) { - 
__builtin_ia32_movdqa64store128_mask((__v2di *)__P, (__v2di)__A, (__mmask8)__U);
-}
-
-__funline __m256i _mm256_mask_mov_epi32(__m256i __W, __mmask8 __U, __m256i __A) {
-  return (__m256i)__builtin_ia32_movdqa32_256_mask((__v8si)__A, (__v8si)__W, (__mmask8)__U);
-}
-
-__funline __m256i _mm256_maskz_mov_epi32(__mmask8 __U, __m256i __A) {
-  return (__m256i)__builtin_ia32_movdqa32_256_mask((__v8si)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U);
-}
-
-__funline __m128i _mm_mask_mov_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
-  return (__m128i)__builtin_ia32_movdqa32_128_mask((__v4si)__A, (__v4si)__W, (__mmask8)__U);
-}
-
-__funline __m128i _mm_maskz_mov_epi32(__mmask8 __U, __m128i __A) {
-  return (__m128i)__builtin_ia32_movdqa32_128_mask((__v4si)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
-}
-
-__funline __m256i _mm256_mask_load_epi32(__m256i __W, __mmask8 __U, void const *__P) {
-  return (__m256i)__builtin_ia32_movdqa32load256_mask((__v8si *)__P, (__v8si)__W, (__mmask8)__U);
-}
-
-__funline __m256i _mm256_maskz_load_epi32(__mmask8 __U, void const *__P) {
-  return (__m256i)__builtin_ia32_movdqa32load256_mask((__v8si *)__P, (__v8si)_mm256_setzero_si256(), (__mmask8)__U);
-}
-
-__funline __m128i _mm_mask_load_epi32(__m128i __W, __mmask8 __U, void const *__P) {
-  return (__m128i)__builtin_ia32_movdqa32load128_mask((__v4si *)__P, (__v4si)__W, (__mmask8)__U);
-}
-
-__funline __m128i _mm_maskz_load_epi32(__mmask8 __U, void const *__P) {
-  return (__m128i)__builtin_ia32_movdqa32load128_mask((__v4si *)__P, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
-}
-
-__funline void _mm256_mask_store_epi32(void *__P, __mmask8 __U, __m256i __A) {
-  __builtin_ia32_movdqa32store256_mask((__v8si *)__P, (__v8si)__A, (__mmask8)__U);
-}
-
-__funline void _mm_mask_store_epi32(void *__P, __mmask8 __U, __m128i __A) {
-  __builtin_ia32_movdqa32store128_mask((__v4si *)__P, (__v4si)__A, (__mmask8)__U);
-}
-
-__funline __m128d _mm_mask_add_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
-  return (__m128d)__builtin_ia32_addpd128_mask((__v2df)__A, (__v2df)__B, (__v2df)__W, (__mmask8)__U);
-}
-
-__funline __m128d _mm_maskz_add_pd(__mmask8 __U, __m128d __A, __m128d __B) {
-  return (__m128d)__builtin_ia32_addpd128_mask((__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U);
-}
-
-__funline __m256d _mm256_mask_add_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
-  return (__m256d)__builtin_ia32_addpd256_mask((__v4df)__A, (__v4df)__B, (__v4df)__W, (__mmask8)__U);
-}
-
-__funline __m256d _mm256_maskz_add_pd(__mmask8 __U, __m256d __A, __m256d __B) {
-  return (__m256d)__builtin_ia32_addpd256_mask((__v4df)__A, (__v4df)__B, (__v4df)_mm256_setzero_pd(), (__mmask8)__U);
-}
-
-__funline __m128 _mm_mask_add_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
-  return (__m128)__builtin_ia32_addps128_mask((__v4sf)__A, (__v4sf)__B, (__v4sf)__W, (__mmask8)__U);
-}
-
-__funline __m128 _mm_maskz_add_ps(__mmask8 __U, __m128 __A, __m128 __B) {
-  return (__m128)__builtin_ia32_addps128_mask((__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U);
-}
-
-__funline __m256 _mm256_mask_add_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
-  return (__m256)__builtin_ia32_addps256_mask((__v8sf)__A, (__v8sf)__B, (__v8sf)__W, (__mmask8)__U);
-}
-
-__funline __m256 _mm256_maskz_add_ps(__mmask8 __U, __m256 __A, __m256 __B) {
-  return (__m256)__builtin_ia32_addps256_mask((__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U);
-}
-
-__funline __m128d _mm_mask_sub_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
-  return (__m128d)__builtin_ia32_subpd128_mask((__v2df)__A, (__v2df)__B, (__v2df)__W, (__mmask8)__U);
-}
-
-__funline __m128d _mm_maskz_sub_pd(__mmask8 __U, __m128d __A, __m128d __B) {
-  return (__m128d)__builtin_ia32_subpd128_mask((__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U);
-}
-
-__funline __m256d _mm256_mask_sub_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
-  return (__m256d)__builtin_ia32_subpd256_mask((__v4df)__A, (__v4df)__B, (__v4df)__W, (__mmask8)__U);
-}
-
-__funline __m256d _mm256_maskz_sub_pd(__mmask8 __U, __m256d __A, __m256d __B) {
-  return (__m256d)__builtin_ia32_subpd256_mask((__v4df)__A, (__v4df)__B, (__v4df)_mm256_setzero_pd(), (__mmask8)__U);
-}
-
-__funline __m128 _mm_mask_sub_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
-  return (__m128)__builtin_ia32_subps128_mask((__v4sf)__A, (__v4sf)__B, (__v4sf)__W, (__mmask8)__U);
-}
-
-__funline __m128 _mm_maskz_sub_ps(__mmask8 __U, __m128 __A, __m128 __B) {
-  return (__m128)__builtin_ia32_subps128_mask((__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U);
-}
-
-__funline __m256 _mm256_mask_sub_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
-  return (__m256)__builtin_ia32_subps256_mask((__v8sf)__A, (__v8sf)__B, (__v8sf)__W, (__mmask8)__U);
-}
-
-__funline __m256 _mm256_maskz_sub_ps(__mmask8 __U, __m256 __A, __m256 __B) {
-  return (__m256)__builtin_ia32_subps256_mask((__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U);
-}
-
-__funline void _mm256_store_epi64(void *__P, __m256i __A) {
-  *(__m256i *)__P = __A;
-}
-
-__funline void _mm_store_epi64(void *__P, __m128i __A) {
-  *(__m128i *)__P = __A;
-}
-
-__funline __m256d _mm256_mask_loadu_pd(__m256d __W, __mmask8 __U, void const *__P) {
-  return (__m256d)__builtin_ia32_loadupd256_mask((const double *)__P, (__v4df)__W, (__mmask8)__U);
-}
-
-__funline __m256d _mm256_maskz_loadu_pd(__mmask8 __U, void const *__P) {
-  return (__m256d)__builtin_ia32_loadupd256_mask((const double *)__P, (__v4df)_mm256_setzero_pd(), (__mmask8)__U);
-}
-
-__funline __m128d _mm_mask_loadu_pd(__m128d __W, __mmask8 __U, void const *__P) {
-  return (__m128d)__builtin_ia32_loadupd128_mask((const double *)__P, (__v2df)__W, (__mmask8)__U);
-}
-
-__funline __m128d _mm_maskz_loadu_pd(__mmask8 __U, void const *__P) {
-  return (__m128d)__builtin_ia32_loadupd128_mask((const double *)__P, (__v2df)_mm_setzero_pd(), (__mmask8)__U);
-}
-
-__funline void _mm256_mask_storeu_pd(void *__P, __mmask8 __U, __m256d __A) {
-  __builtin_ia32_storeupd256_mask((double *)__P, (__v4df)__A, (__mmask8)__U);
-}
-
-__funline void _mm_mask_storeu_pd(void *__P, __mmask8 __U, __m128d __A) {
-  __builtin_ia32_storeupd128_mask((double *)__P, (__v2df)__A, (__mmask8)__U);
-}
-
-__funline __m256 _mm256_mask_loadu_ps(__m256 __W, __mmask8 __U, void const *__P) {
-  return (__m256)__builtin_ia32_loadups256_mask((const float *)__P, (__v8sf)__W, (__mmask8)__U);
-}
-
-__funline __m256 _mm256_maskz_loadu_ps(__mmask8 __U, void const *__P) {
-  return (__m256)__builtin_ia32_loadups256_mask((const float *)__P, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U);
-}
-
-__funline __m128 _mm_mask_loadu_ps(__m128 __W, __mmask8 __U, void const *__P) {
-  return (__m128)__builtin_ia32_loadups128_mask((const float *)__P, (__v4sf)__W, (__mmask8)__U);
-}
-
-__funline __m128 _mm_maskz_loadu_ps(__mmask8 __U, void const *__P) {
-  return (__m128)__builtin_ia32_loadups128_mask((const float *)__P, (__v4sf)_mm_setzero_ps(), (__mmask8)__U);
-}
-
-__funline void _mm256_mask_storeu_ps(void *__P, __mmask8 __U, __m256 __A) {
-  __builtin_ia32_storeups256_mask((float *)__P, (__v8sf)__A, (__mmask8)__U);
-}
-
-__funline void _mm_mask_storeu_ps(void *__P, __mmask8 __U, __m128 __A) {
-  __builtin_ia32_storeups128_mask((float *)__P, (__v4sf)__A, (__mmask8)__U);
-}
-
-__funline __m256i _mm256_mask_loadu_epi64(__m256i __W, __mmask8 __U, void const *__P) {
-  return (__m256i)__builtin_ia32_loaddqudi256_mask((const long long *)__P, (__v4di)__W, (__mmask8)__U);
-}
-
-__funline __m256i _mm256_maskz_loadu_epi64(__mmask8 __U, void const *__P) {
-  return (__m256i)__builtin_ia32_loaddqudi256_mask((const long long *)__P, (__v4di)_mm256_setzero_si256(), (__mmask8)__U);
-}
-
-__funline __m128i _mm_mask_loadu_epi64(__m128i __W, __mmask8 __U, void const *__P) {
-  return (__m128i)__builtin_ia32_loaddqudi128_mask((const long long *)__P, (__v2di)__W, (__mmask8)__U);
-}
-
-__funline __m128i _mm_maskz_loadu_epi64(__mmask8 __U, void const *__P) {
-  return (__m128i)__builtin_ia32_loaddqudi128_mask((const long long *)__P, (__v2di)_mm_setzero_si128(), (__mmask8)__U);
-}
-
-__funline void _mm256_mask_storeu_epi64(void *__P, __mmask8 __U, __m256i __A) {
-  __builtin_ia32_storedqudi256_mask((long long *)__P, (__v4di)__A, (__mmask8)__U);
-}
-
-__funline void _mm_mask_storeu_epi64(void *__P, __mmask8 __U, __m128i __A) {
-  __builtin_ia32_storedqudi128_mask((long long *)__P, (__v2di)__A, (__mmask8)__U);
-}
-
-__funline __m256i _mm256_mask_loadu_epi32(__m256i __W, __mmask8 __U, void const *__P) {
-  return (__m256i)__builtin_ia32_loaddqusi256_mask((const int *)__P, (__v8si)__W, (__mmask8)__U);
-}
-
-__funline __m256i _mm256_maskz_loadu_epi32(__mmask8 __U, void const *__P) {
-  return (__m256i)__builtin_ia32_loaddqusi256_mask((const int *)__P, (__v8si)_mm256_setzero_si256(), (__mmask8)__U);
-}
-
-__funline __m128i _mm_mask_loadu_epi32(__m128i __W, __mmask8 __U, void const *__P) {
-  return (__m128i)__builtin_ia32_loaddqusi128_mask((const int *)__P, (__v4si)__W, (__mmask8)__U);
-}
-
-__funline __m128i _mm_maskz_loadu_epi32(__mmask8 __U, void const *__P) {
-  return (__m128i)__builtin_ia32_loaddqusi128_mask((const int *)__P, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
-}
-
-__funline void _mm256_mask_storeu_epi32(void *__P, __mmask8 __U, __m256i __A) {
-  __builtin_ia32_storedqusi256_mask((int *)__P, (__v8si)__A, (__mmask8)__U);
-}
-
-__funline void _mm_mask_storeu_epi32(void *__P, __mmask8 __U, __m128i __A) {
-  __builtin_ia32_storedqusi128_mask((int *)__P, (__v4si)__A, (__mmask8)__U);
-}
-
-__funline __m256i _mm256_mask_abs_epi32(__m256i __W, __mmask8 __U, __m256i __A) {
-  return (__m256i)__builtin_ia32_pabsd256_mask((__v8si)__A, (__v8si)__W, (__mmask8)__U);
-}
-
-__funline __m256i _mm256_maskz_abs_epi32(__mmask8 __U, __m256i __A) {
-  return (__m256i)__builtin_ia32_pabsd256_mask((__v8si)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U);
-}
-
-__funline __m128i _mm_mask_abs_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
-  return (__m128i)__builtin_ia32_pabsd128_mask((__v4si)__A, (__v4si)__W, (__mmask8)__U);
-}
-
-__funline __m128i _mm_maskz_abs_epi32(__mmask8 __U, __m128i __A) {
-  return (__m128i)__builtin_ia32_pabsd128_mask((__v4si)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
-}
-
-__funline __m256i _mm256_abs_epi64(__m256i __A) {
-  return (__m256i)__builtin_ia32_pabsq256_mask((__v4di)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)-1);
-}
-
-__funline __m256i _mm256_mask_abs_epi64(__m256i __W, __mmask8 __U, __m256i __A) {
-  return (__m256i)__builtin_ia32_pabsq256_mask((__v4di)__A, (__v4di)__W, (__mmask8)__U);
-}
-
-__funline __m256i _mm256_maskz_abs_epi64(__mmask8 __U, __m256i __A) {
-  return (__m256i)__builtin_ia32_pabsq256_mask((__v4di)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U);
-}
-
-__funline __m128i _mm_abs_epi64(__m128i __A) {
-  return (__m128i)__builtin_ia32_pabsq128_mask((__v2di)__A, (__v2di)_mm_setzero_si128(), (__mmask8)-1);
-}
-
-__funline __m128i _mm_mask_abs_epi64(__m128i __W, __mmask8 __U, __m128i __A) {
-  return (__m128i)__builtin_ia32_pabsq128_mask((__v2di)__A, (__v2di)__W, (__mmask8)__U);
-}
-
-__funline __m128i _mm_maskz_abs_epi64(__mmask8 __U, __m128i __A) {
-  return (__m128i)__builtin_ia32_pabsq128_mask((__v2di)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U);
-}
-
-__funline __m128i _mm256_cvtpd_epu32(__m256d __A) {
-  return (__m128i)__builtin_ia32_cvtpd2udq256_mask((__v4df)__A, (__v4si)_mm_setzero_si128(), (__mmask8)-1);
-}
-
-__funline __m128i _mm256_mask_cvtpd_epu32(__m128i __W, __mmask8 __U, __m256d __A) {
-  return (__m128i)__builtin_ia32_cvtpd2udq256_mask((__v4df)__A, (__v4si)__W, (__mmask8)__U);
-}
-
-__funline __m128i _mm256_maskz_cvtpd_epu32(__mmask8 __U, __m256d __A) {
-  return (__m128i)__builtin_ia32_cvtpd2udq256_mask((__v4df)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
-}
-
-__funline __m128i _mm_cvtpd_epu32(__m128d __A) {
-  return (__m128i)__builtin_ia32_cvtpd2udq128_mask((__v2df)__A, (__v4si)_mm_setzero_si128(), (__mmask8)-1);
-}
-
-__funline __m128i _mm_mask_cvtpd_epu32(__m128i __W, __mmask8 __U, __m128d __A) {
-  return (__m128i)__builtin_ia32_cvtpd2udq128_mask((__v2df)__A, (__v4si)__W, (__mmask8)__U);
-}
-
-__funline __m128i _mm_maskz_cvtpd_epu32(__mmask8 __U, __m128d __A) {
-  return (__m128i)__builtin_ia32_cvtpd2udq128_mask((__v2df)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
-}
-
-__funline __m256i _mm256_mask_cvttps_epi32(__m256i __W, __mmask8 __U, __m256 __A) {
-  return (__m256i)__builtin_ia32_cvttps2dq256_mask((__v8sf)__A, (__v8si)__W, (__mmask8)__U);
-}
-
-__funline __m256i _mm256_maskz_cvttps_epi32(__mmask8 __U, __m256 __A) {
-  return (__m256i)__builtin_ia32_cvttps2dq256_mask((__v8sf)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U);
-}
-
-__funline __m128i _mm_mask_cvttps_epi32(__m128i __W, __mmask8 __U, __m128 __A) {
-  return (__m128i)__builtin_ia32_cvttps2dq128_mask((__v4sf)__A, (__v4si)__W, (__mmask8)__U);
-}
-
-__funline __m128i _mm_maskz_cvttps_epi32(__mmask8 __U, __m128 __A) {
-  return (__m128i)__builtin_ia32_cvttps2dq128_mask((__v4sf)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
-}
-
-__funline __m256i _mm256_cvttps_epu32(__m256 __A) {
-  return (__m256i)__builtin_ia32_cvttps2udq256_mask((__v8sf)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)-1);
-}
-
-__funline __m256i _mm256_mask_cvttps_epu32(__m256i __W, __mmask8 __U, __m256 __A) {
-  return (__m256i)__builtin_ia32_cvttps2udq256_mask((__v8sf)__A, (__v8si)__W, (__mmask8)__U);
-}
-
-__funline __m256i _mm256_maskz_cvttps_epu32(__mmask8 __U, __m256 __A) {
-  return (__m256i)__builtin_ia32_cvttps2udq256_mask((__v8sf)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U);
-}
-
-__funline __m128i _mm_cvttps_epu32(__m128 __A) {
-  return (__m128i)__builtin_ia32_cvttps2udq128_mask((__v4sf)__A, (__v4si)_mm_setzero_si128(), (__mmask8)-1);
-}
-
-__funline __m128i _mm_mask_cvttps_epu32(__m128i __W, __mmask8 __U, __m128 __A) {
-  return (__m128i)__builtin_ia32_cvttps2udq128_mask((__v4sf)__A, (__v4si)__W, (__mmask8)__U);
-}
-
-__funline __m128i _mm_maskz_cvttps_epu32(__mmask8 __U, __m128 __A) {
-  return (__m128i)__builtin_ia32_cvttps2udq128_mask((__v4sf)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
-}
-
-__funline __m128i _mm256_mask_cvttpd_epi32(__m128i __W, __mmask8 __U, __m256d __A) {
-  return (__m128i)__builtin_ia32_cvttpd2dq256_mask((__v4df)__A, (__v4si)__W, (__mmask8)__U);
-}
-
-__funline __m128i _mm256_maskz_cvttpd_epi32(__mmask8 __U, __m256d __A) {
-  return (__m128i)__builtin_ia32_cvttpd2dq256_mask((__v4df)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
-}
-
-__funline __m128i _mm_mask_cvttpd_epi32(__m128i __W, __mmask8 __U, __m128d __A) {
-  return (__m128i)__builtin_ia32_cvttpd2dq128_mask((__v2df)__A, (__v4si)__W, (__mmask8)__U);
-}
-
-__funline __m128i _mm_maskz_cvttpd_epi32(__mmask8 __U, __m128d __A) {
-  return (__m128i)__builtin_ia32_cvttpd2dq128_mask((__v2df)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
-}
-
-__funline __m128i _mm256_cvttpd_epu32(__m256d __A) {
-  return (__m128i)__builtin_ia32_cvttpd2udq256_mask((__v4df)__A, (__v4si)_mm_setzero_si128(), (__mmask8)-1);
-}
-
-__funline __m128i _mm256_mask_cvttpd_epu32(__m128i __W, __mmask8 __U, __m256d __A) {
-  return (__m128i)__builtin_ia32_cvttpd2udq256_mask((__v4df)__A, (__v4si)__W, (__mmask8)__U);
-}
-
-__funline __m128i _mm256_maskz_cvttpd_epu32(__mmask8 __U, __m256d __A) {
-  return (__m128i)__builtin_ia32_cvttpd2udq256_mask((__v4df)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
-}
-
-__funline __m128i _mm_cvttpd_epu32(__m128d __A) {
-  return (__m128i)__builtin_ia32_cvttpd2udq128_mask((__v2df)__A, (__v4si)_mm_setzero_si128(), (__mmask8)-1);
-}
-
-__funline __m128i _mm_mask_cvttpd_epu32(__m128i __W, __mmask8 __U, __m128d __A) {
-  return (__m128i)__builtin_ia32_cvttpd2udq128_mask((__v2df)__A, (__v4si)__W, (__mmask8)__U);
-}
-
-__funline __m128i _mm_maskz_cvttpd_epu32(__mmask8 __U, __m128d __A) {
-  return (__m128i)__builtin_ia32_cvttpd2udq128_mask((__v2df)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
-}
-
-__funline __m128i _mm256_mask_cvtpd_epi32(__m128i __W, __mmask8 __U, __m256d __A) {
-  return (__m128i)__builtin_ia32_cvtpd2dq256_mask((__v4df)__A, (__v4si)__W, (__mmask8)__U);
-}
-
-__funline __m128i _mm256_maskz_cvtpd_epi32(__mmask8 __U, __m256d __A) {
-  return (__m128i)__builtin_ia32_cvtpd2dq256_mask((__v4df)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
-}
-
-__funline __m128i _mm_mask_cvtpd_epi32(__m128i __W, __mmask8 __U, __m128d __A) {
-  return (__m128i)__builtin_ia32_cvtpd2dq128_mask((__v2df)__A, (__v4si)__W, (__mmask8)__U);
-}
-
-__funline __m128i _mm_maskz_cvtpd_epi32(__mmask8 __U, __m128d __A) {
-  return (__m128i)__builtin_ia32_cvtpd2dq128_mask((__v2df)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
-}
-
-__funline __m256d _mm256_mask_cvtepi32_pd(__m256d __W, __mmask8 __U, __m128i __A) {
-  return (__m256d)__builtin_ia32_cvtdq2pd256_mask((__v4si)__A, (__v4df)__W, (__mmask8)__U);
-}
-
-__funline __m256d _mm256_maskz_cvtepi32_pd(__mmask8 __U, __m128i __A) {
-  return (__m256d)__builtin_ia32_cvtdq2pd256_mask((__v4si)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)__U);
-}
-
-__funline __m128d _mm_mask_cvtepi32_pd(__m128d __W, __mmask8 __U, __m128i __A) {
-  return (__m128d)__builtin_ia32_cvtdq2pd128_mask((__v4si)__A, (__v2df)__W, (__mmask8)__U);
-}
-
-__funline __m128d _mm_maskz_cvtepi32_pd(__mmask8 __U, __m128i __A) {
-  return (__m128d)__builtin_ia32_cvtdq2pd128_mask((__v4si)__A, (__v2df)_mm_setzero_pd(), (__mmask8)__U);
-}
-
-__funline __m256d _mm256_cvtepu32_pd(__m128i __A) {
-  return (__m256d)__builtin_ia32_cvtudq2pd256_mask((__v4si)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)-1);
-}
-
-__funline __m256d _mm256_mask_cvtepu32_pd(__m256d __W, __mmask8 __U, __m128i __A) {
-  return (__m256d)__builtin_ia32_cvtudq2pd256_mask((__v4si)__A, (__v4df)__W, (__mmask8)__U);
-}
-
-__funline __m256d _mm256_maskz_cvtepu32_pd(__mmask8 __U, __m128i __A) {
-  return (__m256d)__builtin_ia32_cvtudq2pd256_mask((__v4si)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)__U);
-}
-
-__funline __m128d _mm_cvtepu32_pd(__m128i __A) {
-  return (__m128d)__builtin_ia32_cvtudq2pd128_mask((__v4si)__A, (__v2df)_mm_setzero_pd(), (__mmask8)-1);
-}
-
-__funline __m128d _mm_mask_cvtepu32_pd(__m128d __W, __mmask8 __U, __m128i __A) {
-  return (__m128d)__builtin_ia32_cvtudq2pd128_mask((__v4si)__A, (__v2df)__W, (__mmask8)__U);
-}
-
-__funline __m128d _mm_maskz_cvtepu32_pd(__mmask8 __U, __m128i __A) {
-  return (__m128d)__builtin_ia32_cvtudq2pd128_mask((__v4si)__A, (__v2df)_mm_setzero_pd(), (__mmask8)__U);
-}
-
-__funline __m256 _mm256_mask_cvtepi32_ps(__m256 __W, __mmask8 __U, __m256i __A) {
-  return (__m256)__builtin_ia32_cvtdq2ps256_mask((__v8si)__A, (__v8sf)__W, (__mmask8)__U);
-}
-
-__funline __m256 _mm256_maskz_cvtepi32_ps(__mmask8 __U, __m256i __A) {
-  return (__m256)__builtin_ia32_cvtdq2ps256_mask((__v8si)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U);
-}
-
-__funline __m128 _mm_mask_cvtepi32_ps(__m128 __W, __mmask8 __U, __m128i __A) {
-  return (__m128)__builtin_ia32_cvtdq2ps128_mask((__v4si)__A, (__v4sf)__W, (__mmask8)__U);
-}
-
-__funline __m128 _mm_maskz_cvtepi32_ps(__mmask8 __U, __m128i __A) {
-  return (__m128)__builtin_ia32_cvtdq2ps128_mask((__v4si)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U);
-}
-
-__funline __m256 _mm256_cvtepu32_ps(__m256i __A) {
-  return (__m256)__builtin_ia32_cvtudq2ps256_mask((__v8si)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)-1);
-}
-
-__funline __m256 _mm256_mask_cvtepu32_ps(__m256 __W, __mmask8 __U, __m256i __A) {
-  return (__m256)__builtin_ia32_cvtudq2ps256_mask((__v8si)__A, (__v8sf)__W, (__mmask8)__U);
-}
-
-__funline __m256 _mm256_maskz_cvtepu32_ps(__mmask8 __U, __m256i __A) {
-  return (__m256)__builtin_ia32_cvtudq2ps256_mask((__v8si)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U);
-}
-
-__funline __m128 _mm_cvtepu32_ps(__m128i __A) {
-  return (__m128)__builtin_ia32_cvtudq2ps128_mask((__v4si)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)-1);
-}
-
-__funline __m128 _mm_mask_cvtepu32_ps(__m128 __W, __mmask8 __U, __m128i __A) {
-  return (__m128)__builtin_ia32_cvtudq2ps128_mask((__v4si)__A, (__v4sf)__W, (__mmask8)__U);
-}
-
-__funline __m128 _mm_maskz_cvtepu32_ps(__mmask8 __U, __m128i __A) {
-  return (__m128)__builtin_ia32_cvtudq2ps128_mask((__v4si)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U);
-}
-
-__funline __m256d _mm256_mask_cvtps_pd(__m256d __W, __mmask8 __U, __m128 __A) {
-  return (__m256d)__builtin_ia32_cvtps2pd256_mask((__v4sf)__A, (__v4df)__W, (__mmask8)__U);
-}
-
-__funline __m256d _mm256_maskz_cvtps_pd(__mmask8 __U, __m128 __A) {
-  return (__m256d)__builtin_ia32_cvtps2pd256_mask((__v4sf)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)__U);
-}
-
-__funline __m128d _mm_mask_cvtps_pd(__m128d __W, __mmask8 __U, __m128 __A) {
-  return (__m128d)__builtin_ia32_cvtps2pd128_mask((__v4sf)__A, (__v2df)__W, (__mmask8)__U);
-}
-
-__funline __m128d _mm_maskz_cvtps_pd(__mmask8 __U, __m128 __A) {
-  return (__m128d)__builtin_ia32_cvtps2pd128_mask((__v4sf)__A, (__v2df)_mm_setzero_pd(), (__mmask8)__U);
-}
-
-__funline __m128i _mm_cvtepi32_epi8(__m128i __A) {
-  return (__m128i)__builtin_ia32_pmovdb128_mask((__v4si)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1);
-}
-
-__funline void _mm_mask_cvtepi32_storeu_epi8(void *__P, __mmask8 __M, __m128i __A) {
-  __builtin_ia32_pmovdb128mem_mask((__v16qi *)__P, (__v4si)__A, __M);
-}
-
-__funline __m128i _mm_mask_cvtepi32_epi8(__m128i __O, __mmask8 __M, __m128i __A) {
-  return (__m128i)__builtin_ia32_pmovdb128_mask((__v4si)__A, (__v16qi)__O, __M);
-}
-
-__funline __m128i _mm_maskz_cvtepi32_epi8(__mmask8 __M, __m128i __A) {
-  return (__m128i)__builtin_ia32_pmovdb128_mask((__v4si)__A, (__v16qi)_mm_setzero_si128(), __M);
-}
-
-__funline __m128i _mm256_cvtepi32_epi8(__m256i __A) {
-  return (__m128i)__builtin_ia32_pmovdb256_mask((__v8si)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1);
-}
-
-__funline __m128i _mm256_mask_cvtepi32_epi8(__m128i __O, __mmask8 __M, __m256i __A) {
-  return (__m128i)__builtin_ia32_pmovdb256_mask((__v8si)__A, (__v16qi)__O, __M);
-}
-
-__funline void _mm256_mask_cvtepi32_storeu_epi8(void *__P, __mmask8 __M, __m256i __A) {
-  __builtin_ia32_pmovdb256mem_mask((__v16qi *)__P, (__v8si)__A, __M);
-}
-
-__funline __m128i _mm256_maskz_cvtepi32_epi8(__mmask8 __M, __m256i __A) {
-  return (__m128i)__builtin_ia32_pmovdb256_mask((__v8si)__A, (__v16qi)_mm_setzero_si128(), __M);
-}
-
-__funline __m128i _mm_cvtsepi32_epi8(__m128i __A) {
-  return (__m128i)__builtin_ia32_pmovsdb128_mask((__v4si)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1);
-}
-
-__funline void _mm_mask_cvtsepi32_storeu_epi8(void *__P, __mmask8 __M, __m128i __A) {
-  __builtin_ia32_pmovsdb128mem_mask((__v16qi *)__P, (__v4si)__A, __M);
-}
-
-__funline __m128i _mm_mask_cvtsepi32_epi8(__m128i __O, __mmask8 __M, __m128i __A) {
-  return (__m128i)__builtin_ia32_pmovsdb128_mask((__v4si)__A, (__v16qi)__O, __M);
-}
-
-__funline __m128i _mm_maskz_cvtsepi32_epi8(__mmask8 __M, __m128i __A) {
-  return (__m128i)__builtin_ia32_pmovsdb128_mask((__v4si)__A, (__v16qi)_mm_setzero_si128(), __M);
-}
-
-__funline __m128i _mm256_cvtsepi32_epi8(__m256i __A) {
-  return (__m128i)__builtin_ia32_pmovsdb256_mask((__v8si)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1);
-}
-
-__funline void _mm256_mask_cvtsepi32_storeu_epi8(void *__P, __mmask8 __M, __m256i __A) {
-  __builtin_ia32_pmovsdb256mem_mask((__v16qi *)__P, (__v8si)__A, __M);
-}
-
-__funline __m128i _mm256_mask_cvtsepi32_epi8(__m128i __O, __mmask8 __M, __m256i __A) {
-  return (__m128i)__builtin_ia32_pmovsdb256_mask((__v8si)__A, (__v16qi)__O, __M);
-}
-
-__funline __m128i _mm256_maskz_cvtsepi32_epi8(__mmask8 __M, __m256i __A) {
-  return (__m128i)__builtin_ia32_pmovsdb256_mask((__v8si)__A, (__v16qi)_mm_setzero_si128(), __M);
-}
-
-__funline __m128i _mm_cvtusepi32_epi8(__m128i __A) {
-  return (__m128i)__builtin_ia32_pmovusdb128_mask((__v4si)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1);
-}
-
-__funline void _mm_mask_cvtusepi32_storeu_epi8(void *__P, __mmask8 __M, __m128i __A) {
-  __builtin_ia32_pmovusdb128mem_mask((__v16qi *)__P, (__v4si)__A, __M);
-}
-
-__funline __m128i _mm_mask_cvtusepi32_epi8(__m128i __O, __mmask8 __M, __m128i __A) {
-  return (__m128i)__builtin_ia32_pmovusdb128_mask((__v4si)__A, (__v16qi)__O, __M);
-}
-
-__funline __m128i _mm_maskz_cvtusepi32_epi8(__mmask8 __M, __m128i __A) {
-  return (__m128i)__builtin_ia32_pmovusdb128_mask((__v4si)__A, (__v16qi)_mm_setzero_si128(), __M);
-}
-
-__funline __m128i _mm256_cvtusepi32_epi8(__m256i __A) {
-  return (__m128i)__builtin_ia32_pmovusdb256_mask((__v8si)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1);
-}
-
-__funline void _mm256_mask_cvtusepi32_storeu_epi8(void *__P, __mmask8 __M, __m256i __A) {
-  __builtin_ia32_pmovusdb256mem_mask((__v16qi *)__P, (__v8si)__A, __M);
-}
-
-__funline __m128i _mm256_mask_cvtusepi32_epi8(__m128i __O, __mmask8 __M, __m256i __A) {
-  return (__m128i)__builtin_ia32_pmovusdb256_mask((__v8si)__A, (__v16qi)__O, __M);
-}
-
-__funline __m128i _mm256_maskz_cvtusepi32_epi8(__mmask8 __M, __m256i __A) {
-  return (__m128i)__builtin_ia32_pmovusdb256_mask((__v8si)__A, (__v16qi)_mm_setzero_si128(), __M);
-}
-
-__funline __m128i _mm_cvtepi32_epi16(__m128i __A) {
-  return (__m128i)__builtin_ia32_pmovdw128_mask((__v4si)__A, (__v8hi)_mm_setzero_si128(), (__mmask8)-1);
-}
-
-__funline void _mm_mask_cvtepi32_storeu_epi16(void *__P, __mmask8 __M, __m128i __A) {
-  __builtin_ia32_pmovdw128mem_mask((__v8hi *)__P, (__v4si)__A, __M);
-}
-
-__funline __m128i _mm_mask_cvtepi32_epi16(__m128i __O, __mmask8 __M, __m128i __A) {
-  return (__m128i)__builtin_ia32_pmovdw128_mask((__v4si)__A, (__v8hi)__O, __M);
-}
-
-__funline __m128i _mm_maskz_cvtepi32_epi16(__mmask8 __M, __m128i __A) {
-  return (__m128i)__builtin_ia32_pmovdw128_mask((__v4si)__A, (__v8hi)_mm_setzero_si128(), __M);
-}
-
-__funline __m128i _mm256_cvtepi32_epi16(__m256i __A) {
-  return (__m128i)__builtin_ia32_pmovdw256_mask((__v8si)__A, (__v8hi)_mm_setzero_si128(), (__mmask8)-1);
-}
-
-__funline void _mm256_mask_cvtepi32_storeu_epi16(void *__P, __mmask8 __M, __m256i __A) {
-  __builtin_ia32_pmovdw256mem_mask((__v8hi *)__P, (__v8si)__A, __M);
-}
-
-__funline __m128i _mm256_mask_cvtepi32_epi16(__m128i __O, __mmask8 __M, __m256i __A) {
-  return (__m128i)__builtin_ia32_pmovdw256_mask((__v8si)__A, (__v8hi)__O, __M);
-}
-
-__funline __m128i _mm256_maskz_cvtepi32_epi16(__mmask8 __M, __m256i __A) {
-  return (__m128i)__builtin_ia32_pmovdw256_mask((__v8si)__A, (__v8hi)_mm_setzero_si128(), __M);
-}
-
-__funline __m128i _mm_cvtsepi32_epi16(__m128i __A) {
-  return (__m128i)__builtin_ia32_pmovsdw128_mask((__v4si)__A, (__v8hi)_mm_setzero_si128(), (__mmask8)-1);
-}
-
-__funline void _mm_mask_cvtsepi32_storeu_epi16(void *__P, __mmask8 __M, __m128i __A) {
-  __builtin_ia32_pmovsdw128mem_mask((__v8hi *)__P, (__v4si)__A, __M);
-}
-
-__funline __m128i _mm_mask_cvtsepi32_epi16(__m128i __O, __mmask8 __M, __m128i __A) {
-  return (__m128i)__builtin_ia32_pmovsdw128_mask((__v4si)__A, (__v8hi)__O, __M);
-}
-
-__funline __m128i _mm_maskz_cvtsepi32_epi16(__mmask8 __M, __m128i __A) {
-  return (__m128i)__builtin_ia32_pmovsdw128_mask((__v4si)__A, (__v8hi)_mm_setzero_si128(), __M);
-}
-
-__funline __m128i _mm256_cvtsepi32_epi16(__m256i __A) {
-  return (__m128i)__builtin_ia32_pmovsdw256_mask((__v8si)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1);
-}
-
-__funline void _mm256_mask_cvtsepi32_storeu_epi16(void *__P, __mmask8 __M, __m256i __A) {
-  __builtin_ia32_pmovsdw256mem_mask((__v8hi *)__P, (__v8si)__A, __M);
-}
-
-__funline __m128i _mm256_mask_cvtsepi32_epi16(__m128i __O, __mmask8 __M, __m256i __A) {
-  return (__m128i)__builtin_ia32_pmovsdw256_mask((__v8si)__A, (__v8hi)__O, __M);
-}
-
-__funline __m128i _mm256_maskz_cvtsepi32_epi16(__mmask8 __M, __m256i __A) {
-  return (__m128i)__builtin_ia32_pmovsdw256_mask((__v8si)__A, (__v8hi)_mm_setzero_si128(), __M);
-}
-
-__funline __m128i _mm_cvtusepi32_epi16(__m128i __A) {
-  return (__m128i)__builtin_ia32_pmovusdw128_mask((__v4si)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1);
-}
-
-__funline void _mm_mask_cvtusepi32_storeu_epi16(void *__P, __mmask8 __M, __m128i __A) {
-  __builtin_ia32_pmovusdw128mem_mask((__v8hi *)__P, (__v4si)__A, __M);
-}
-
-__funline __m128i _mm_mask_cvtusepi32_epi16(__m128i __O, __mmask8 __M, __m128i __A) {
-  return (__m128i)__builtin_ia32_pmovusdw128_mask((__v4si)__A, (__v8hi)__O, __M);
-}
-
-__funline __m128i _mm_maskz_cvtusepi32_epi16(__mmask8 __M, __m128i __A) {
-  return (__m128i)__builtin_ia32_pmovusdw128_mask((__v4si)__A, (__v8hi)_mm_setzero_si128(), __M);
-}
-
-__funline __m128i _mm256_cvtusepi32_epi16(__m256i __A) {
-  return (__m128i)__builtin_ia32_pmovusdw256_mask((__v8si)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1);
-}
-
-__funline void _mm256_mask_cvtusepi32_storeu_epi16(void *__P, __mmask8 __M, __m256i __A) {
-  __builtin_ia32_pmovusdw256mem_mask((__v8hi *)__P, (__v8si)__A, __M);
-}
-
-__funline __m128i _mm256_mask_cvtusepi32_epi16(__m128i __O, __mmask8 __M, __m256i __A) {
-  return (__m128i)__builtin_ia32_pmovusdw256_mask((__v8si)__A, (__v8hi)__O, __M);
-}
-
-__funline __m128i _mm256_maskz_cvtusepi32_epi16(__mmask8 __M, __m256i __A) {
-  return (__m128i)__builtin_ia32_pmovusdw256_mask((__v8si)__A, (__v8hi)_mm_setzero_si128(), __M);
-}
-
-__funline __m128i _mm_cvtepi64_epi8(__m128i __A) {
-  return (__m128i)__builtin_ia32_pmovqb128_mask((__v2di)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1);
-}
-
-__funline void _mm_mask_cvtepi64_storeu_epi8(void *__P, __mmask8 __M, __m128i __A) {
-  __builtin_ia32_pmovqb128mem_mask((__v16qi *)__P, (__v2di)__A, __M);
-}
-
-__funline __m128i _mm_mask_cvtepi64_epi8(__m128i __O, __mmask8 __M, __m128i __A) {
-  return (__m128i)__builtin_ia32_pmovqb128_mask((__v2di)__A, (__v16qi)__O, __M);
-}
-
-__funline __m128i _mm_maskz_cvtepi64_epi8(__mmask8 __M, __m128i __A) {
-  return (__m128i)__builtin_ia32_pmovqb128_mask((__v2di)__A, (__v16qi)_mm_setzero_si128(), __M);
-}
-
-__funline __m128i _mm256_cvtepi64_epi8(__m256i __A) {
-  return (__m128i)__builtin_ia32_pmovqb256_mask((__v4di)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1);
-}
-
-__funline void _mm256_mask_cvtepi64_storeu_epi8(void *__P, __mmask8 __M, __m256i __A) {
-  __builtin_ia32_pmovqb256mem_mask((__v16qi *)__P, (__v4di)__A, __M);
-}
-
-__funline __m128i _mm256_mask_cvtepi64_epi8(__m128i __O, __mmask8 __M, __m256i __A) {
-  return (__m128i)__builtin_ia32_pmovqb256_mask((__v4di)__A, (__v16qi)__O, __M);
-}
-
-__funline __m128i _mm256_maskz_cvtepi64_epi8(__mmask8 __M, __m256i __A) {
-  return (__m128i)__builtin_ia32_pmovqb256_mask((__v4di)__A, (__v16qi)_mm_setzero_si128(), __M);
-}
-
-__funline __m128i _mm_cvtsepi64_epi8(__m128i __A) {
-  return (__m128i)__builtin_ia32_pmovsqb128_mask((__v2di)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1);
-}
-
-__funline void _mm_mask_cvtsepi64_storeu_epi8(void *__P, __mmask8 __M, __m128i __A) {
-  __builtin_ia32_pmovsqb128mem_mask((__v16qi *)__P, (__v2di)__A, __M);
-}
-
-__funline __m128i _mm_mask_cvtsepi64_epi8(__m128i __O, __mmask8 __M, __m128i __A) {
-  return (__m128i)__builtin_ia32_pmovsqb128_mask((__v2di)__A, (__v16qi)__O, __M);
-}
-
-__funline __m128i _mm_maskz_cvtsepi64_epi8(__mmask8 __M, __m128i __A) {
-  return (__m128i)__builtin_ia32_pmovsqb128_mask((__v2di)__A, (__v16qi)_mm_setzero_si128(), __M);
-}
-
-__funline __m128i _mm256_cvtsepi64_epi8(__m256i __A) {
-  return (__m128i)__builtin_ia32_pmovsqb256_mask((__v4di)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1);
-}
-
-__funline void _mm256_mask_cvtsepi64_storeu_epi8(void *__P, __mmask8 __M, __m256i __A) {
-  __builtin_ia32_pmovsqb256mem_mask((__v16qi *)__P, (__v4di)__A, __M);
-}
-
-__funline __m128i _mm256_mask_cvtsepi64_epi8(__m128i __O, __mmask8 __M, __m256i __A) {
-  return (__m128i)__builtin_ia32_pmovsqb256_mask((__v4di)__A, (__v16qi)__O, __M);
-}
-
-__funline __m128i _mm256_maskz_cvtsepi64_epi8(__mmask8 __M, __m256i __A) {
-  return (__m128i)__builtin_ia32_pmovsqb256_mask((__v4di)__A, (__v16qi)_mm_setzero_si128(), __M);
-}
-
-__funline __m128i _mm_cvtusepi64_epi8(__m128i __A) {
-  return (__m128i)__builtin_ia32_pmovusqb128_mask((__v2di)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1);
-}
-
-__funline void _mm_mask_cvtusepi64_storeu_epi8(void *__P, __mmask8 __M, __m128i __A) {
-  __builtin_ia32_pmovusqb128mem_mask((__v16qi *)__P, (__v2di)__A, __M);
-}
-
-__funline __m128i _mm_mask_cvtusepi64_epi8(__m128i __O, __mmask8 __M, __m128i __A) {
-  return (__m128i)__builtin_ia32_pmovusqb128_mask((__v2di)__A, (__v16qi)__O, __M);
-}
-
-__funline __m128i _mm_maskz_cvtusepi64_epi8(__mmask8 __M, __m128i __A) {
-  return (__m128i)__builtin_ia32_pmovusqb128_mask((__v2di)__A, (__v16qi)_mm_setzero_si128(), __M);
-}
-
-__funline __m128i _mm256_cvtusepi64_epi8(__m256i __A) {
-  return (__m128i)__builtin_ia32_pmovusqb256_mask((__v4di)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1);
-}
-
-__funline void _mm256_mask_cvtusepi64_storeu_epi8(void *__P, __mmask8 __M, __m256i __A) {
-  __builtin_ia32_pmovusqb256mem_mask((__v16qi *)__P, (__v4di)__A, __M);
-}
-
-__funline __m128i _mm256_mask_cvtusepi64_epi8(__m128i __O, __mmask8 __M, __m256i __A) {
-  return (__m128i)__builtin_ia32_pmovusqb256_mask((__v4di)__A, (__v16qi)__O, __M);
-}
-
-__funline __m128i _mm256_maskz_cvtusepi64_epi8(__mmask8 __M, __m256i __A) {
-  return (__m128i)__builtin_ia32_pmovusqb256_mask((__v4di)__A, (__v16qi)_mm_setzero_si128(), __M);
-}
-
-__funline __m128i _mm_cvtepi64_epi16(__m128i __A) {
-  return (__m128i)__builtin_ia32_pmovqw128_mask((__v2di)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1);
-}
-
-__funline void _mm_mask_cvtepi64_storeu_epi16(void *__P, __mmask8 __M, __m128i __A) {
-  __builtin_ia32_pmovqw128mem_mask((__v8hi *)__P, (__v2di)__A, __M);
-}
-
-__funline __m128i _mm_mask_cvtepi64_epi16(__m128i __O, __mmask8 __M, __m128i __A) {
-  return (__m128i)__builtin_ia32_pmovqw128_mask((__v2di)__A, (__v8hi)__O, __M);
-}
-
-__funline __m128i _mm_maskz_cvtepi64_epi16(__mmask8 __M, __m128i __A) {
-  return (__m128i)__builtin_ia32_pmovqw128_mask((__v2di)__A, (__v8hi)_mm_setzero_si128(), __M);
-}
-
-__funline __m128i _mm256_cvtepi64_epi16(__m256i __A) {
-  return (__m128i)__builtin_ia32_pmovqw256_mask((__v4di)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1);
-}
-
-__funline void _mm256_mask_cvtepi64_storeu_epi16(void *__P, __mmask8 __M, __m256i __A) {
-  __builtin_ia32_pmovqw256mem_mask((__v8hi *)__P, (__v4di)__A, __M);
-}
-
-__funline __m128i _mm256_mask_cvtepi64_epi16(__m128i __O, __mmask8 __M, __m256i __A) {
-  return (__m128i)__builtin_ia32_pmovqw256_mask((__v4di)__A, (__v8hi)__O, __M);
-}
-
-__funline __m128i _mm256_maskz_cvtepi64_epi16(__mmask8 __M, __m256i __A) {
-  return (__m128i)__builtin_ia32_pmovqw256_mask((__v4di)__A, (__v8hi)_mm_setzero_si128(), __M);
-}
-
-__funline __m128i _mm_cvtsepi64_epi16(__m128i __A) {
-  return (__m128i)__builtin_ia32_pmovsqw128_mask((__v2di)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1);
-}
-
-__funline void _mm_mask_cvtsepi64_storeu_epi16(void *__P, __mmask8 __M, __m128i __A) {
-  __builtin_ia32_pmovsqw128mem_mask((__v8hi *)__P, (__v2di)__A, __M);
-}
-
-__funline __m128i _mm_mask_cvtsepi64_epi16(__m128i __O, __mmask8 __M, __m128i __A) {
-  return (__m128i)__builtin_ia32_pmovsqw128_mask((__v2di)__A, (__v8hi)__O, __M);
-}
-
-__funline __m128i _mm_maskz_cvtsepi64_epi16(__mmask8 __M, __m128i __A) {
-  return (__m128i)__builtin_ia32_pmovsqw128_mask((__v2di)__A, (__v8hi)_mm_setzero_si128(), __M);
-}
-
-__funline __m128i _mm256_cvtsepi64_epi16(__m256i __A) {
-  return (__m128i)__builtin_ia32_pmovsqw256_mask((__v4di)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1);
-}
-
-__funline void _mm256_mask_cvtsepi64_storeu_epi16(void *__P, __mmask8 __M, __m256i __A) {
-  __builtin_ia32_pmovsqw256mem_mask((__v8hi *)__P, (__v4di)__A, __M);
-}
-
-__funline __m128i _mm256_mask_cvtsepi64_epi16(__m128i __O, __mmask8 __M, __m256i __A) {
-  return (__m128i)__builtin_ia32_pmovsqw256_mask((__v4di)__A, (__v8hi)__O, __M);
-}
-
-__funline __m128i _mm256_maskz_cvtsepi64_epi16(__mmask8 __M, __m256i __A) {
-  return (__m128i)__builtin_ia32_pmovsqw256_mask((__v4di)__A, (__v8hi)_mm_setzero_si128(), __M);
-}
-
-__funline __m128i _mm_cvtusepi64_epi16(__m128i __A) {
-  return (__m128i)__builtin_ia32_pmovusqw128_mask((__v2di)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1);
-}
-
-__funline void _mm_mask_cvtusepi64_storeu_epi16(void *__P, __mmask8 __M, __m128i __A) {
-  __builtin_ia32_pmovusqw128mem_mask((__v8hi *)__P, (__v2di)__A, __M);
-}
-
-__funline __m128i _mm_mask_cvtusepi64_epi16(__m128i __O, __mmask8 __M, __m128i __A) {
-  return (__m128i)__builtin_ia32_pmovusqw128_mask((__v2di)__A, (__v8hi)__O, __M);
-}
-
-__funline __m128i _mm_maskz_cvtusepi64_epi16(__mmask8 __M, __m128i __A) {
-  return (__m128i)__builtin_ia32_pmovusqw128_mask((__v2di)__A, (__v8hi)_mm_setzero_si128(), __M);
-}
-
-__funline __m128i _mm256_cvtusepi64_epi16(__m256i __A) {
-  return (__m128i)__builtin_ia32_pmovusqw256_mask((__v4di)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1);
-}
-
-__funline void _mm256_mask_cvtusepi64_storeu_epi16(void *__P, __mmask8 __M, __m256i __A) {
-  __builtin_ia32_pmovusqw256mem_mask((__v8hi *)__P, (__v4di)__A, __M);
-}
-
-__funline __m128i _mm256_mask_cvtusepi64_epi16(__m128i __O, __mmask8 __M, __m256i __A) {
-  return (__m128i)__builtin_ia32_pmovusqw256_mask((__v4di)__A, (__v8hi)__O, __M);
-}
-
-__funline __m128i _mm256_maskz_cvtusepi64_epi16(__mmask8 __M, __m256i __A) {
-  return (__m128i)__builtin_ia32_pmovusqw256_mask((__v4di)__A, (__v8hi)_mm_setzero_si128(), __M);
-}
-
-__funline __m128i _mm_cvtepi64_epi32(__m128i __A) {
-  return (__m128i)__builtin_ia32_pmovqd128_mask((__v2di)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1);
-}
-
-__funline void _mm_mask_cvtepi64_storeu_epi32(void *__P, __mmask8 __M, __m128i __A) {
-  __builtin_ia32_pmovqd128mem_mask((__v4si *)__P, (__v2di)__A, __M);
-}
-
-__funline __m128i _mm_mask_cvtepi64_epi32(__m128i __O, __mmask8 __M, __m128i __A) {
-  return (__m128i)__builtin_ia32_pmovqd128_mask((__v2di)__A, (__v4si)__O, __M);
-}
-
-__funline __m128i _mm_maskz_cvtepi64_epi32(__mmask8 __M, __m128i __A) {
-  return (__m128i)__builtin_ia32_pmovqd128_mask((__v2di)__A, (__v4si)_mm_setzero_si128(), __M);
-}
-
-__funline __m128i _mm256_cvtepi64_epi32(__m256i __A) {
-  return (__m128i)__builtin_ia32_pmovqd256_mask((__v4di)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1);
-}
-
-__funline void _mm256_mask_cvtepi64_storeu_epi32(void *__P, __mmask8 __M, __m256i __A) {
-  __builtin_ia32_pmovqd256mem_mask((__v4si *)__P, (__v4di)__A, __M);
-}
-
-__funline __m128i _mm256_mask_cvtepi64_epi32(__m128i __O, __mmask8 __M, __m256i __A) {
-  return (__m128i)__builtin_ia32_pmovqd256_mask((__v4di)__A, (__v4si)__O, __M);
-}
-
-__funline __m128i _mm256_maskz_cvtepi64_epi32(__mmask8 __M, __m256i __A) {
-  return (__m128i)__builtin_ia32_pmovqd256_mask((__v4di)__A, (__v4si)_mm_setzero_si128(), __M);
-}
-
-__funline __m128i _mm_cvtsepi64_epi32(__m128i __A) {
-  return (__m128i)__builtin_ia32_pmovsqd128_mask((__v2di)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1);
-}
-
-__funline void _mm_mask_cvtsepi64_storeu_epi32(void *__P, __mmask8 __M, __m128i __A) {
-  __builtin_ia32_pmovsqd128mem_mask((__v4si *)__P, (__v2di)__A, __M);
-}
-
-__funline __m128i _mm_mask_cvtsepi64_epi32(__m128i __O, __mmask8 __M, __m128i __A) {
-  return (__m128i)__builtin_ia32_pmovsqd128_mask((__v2di)__A, (__v4si)__O, __M);
-}
-
-__funline __m128i _mm_maskz_cvtsepi64_epi32(__mmask8 __M, __m128i __A) {
-  return (__m128i)__builtin_ia32_pmovsqd128_mask((__v2di)__A, (__v4si)_mm_setzero_si128(), __M);
-}
-
-__funline __m128i _mm256_cvtsepi64_epi32(__m256i __A) {
-  return (__m128i)__builtin_ia32_pmovsqd256_mask((__v4di)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1);
-}
-
-__funline void _mm256_mask_cvtsepi64_storeu_epi32(void *__P, __mmask8 __M, __m256i __A) {
-  __builtin_ia32_pmovsqd256mem_mask((__v4si *)__P, (__v4di)__A, __M);
-}
-
-__funline __m128i _mm256_mask_cvtsepi64_epi32(__m128i __O, __mmask8 __M, __m256i __A) {
-  return (__m128i)__builtin_ia32_pmovsqd256_mask((__v4di)__A, (__v4si)__O, __M);
-}
-
-__funline __m128i _mm256_maskz_cvtsepi64_epi32(__mmask8 __M, __m256i __A) {
-  return (__m128i)__builtin_ia32_pmovsqd256_mask((__v4di)__A, (__v4si)_mm_setzero_si128(), __M);
-}
-
-__funline __m128i _mm_cvtusepi64_epi32(__m128i __A) {
-  return (__m128i)__builtin_ia32_pmovusqd128_mask((__v2di)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1);
-}
-
-__funline void _mm_mask_cvtusepi64_storeu_epi32(void *__P, __mmask8 __M, __m128i __A) {
-  __builtin_ia32_pmovusqd128mem_mask((__v4si *)__P, (__v2di)__A, __M);
-}
-
-__funline __m128i _mm_mask_cvtusepi64_epi32(__m128i __O, __mmask8 __M, __m128i __A) {
-  return (__m128i)__builtin_ia32_pmovusqd128_mask((__v2di)__A, (__v4si)__O, __M);
-}
-
-__funline __m128i _mm_maskz_cvtusepi64_epi32(__mmask8 __M, __m128i __A) {
-  return (__m128i)__builtin_ia32_pmovusqd128_mask((__v2di)__A, (__v4si)_mm_setzero_si128(), __M);
-}
-
-__funline __m128i _mm256_cvtusepi64_epi32(__m256i __A) {
-  return (__m128i)__builtin_ia32_pmovusqd256_mask((__v4di)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1);
-}
-
-__funline void _mm256_mask_cvtusepi64_storeu_epi32(void *__P, __mmask8 __M, __m256i __A) {
-  __builtin_ia32_pmovusqd256mem_mask((__v4si *)__P, (__v4di)__A, __M);
-}
-
-__funline __m128i _mm256_mask_cvtusepi64_epi32(__m128i __O, __mmask8 __M, __m256i __A) {
-  return (__m128i)__builtin_ia32_pmovusqd256_mask((__v4di)__A, (__v4si)__O, __M);
-}
-
-__funline __m128i _mm256_maskz_cvtusepi64_epi32(__mmask8 __M, __m256i __A) {
-  return (__m128i)__builtin_ia32_pmovusqd256_mask((__v4di)__A, (__v4si)_mm_setzero_si128(), __M);
-}
-
-__funline __m256 _mm256_mask_broadcastss_ps(__m256 __O, __mmask8 __M, __m128 __A) {
-  return (__m256)__builtin_ia32_broadcastss256_mask((__v4sf)__A, (__v8sf)__O, __M);
-}
-
-__funline __m256 _mm256_maskz_broadcastss_ps(__mmask8 __M, __m128 __A) {
-  return (__m256)__builtin_ia32_broadcastss256_mask((__v4sf)__A, (__v8sf)_mm256_setzero_ps(), __M);
-}
-
-__funline __m128 _mm_mask_broadcastss_ps(__m128 __O, __mmask8 __M, __m128 __A) {
-  return (__m128)__builtin_ia32_broadcastss128_mask((__v4sf)__A, (__v4sf)__O, __M);
-}
-
-__funline __m128 _mm_maskz_broadcastss_ps(__mmask8 __M, __m128 __A) {
-  return (__m128)__builtin_ia32_broadcastss128_mask((__v4sf)__A, (__v4sf)_mm_setzero_ps(), __M);
-}
-
-__funline __m256d _mm256_mask_broadcastsd_pd(__m256d __O, __mmask8 __M, __m128d __A) {
-  return (__m256d)__builtin_ia32_broadcastsd256_mask((__v2df)__A, (__v4df)__O, __M);
-}
-
-__funline __m256d _mm256_maskz_broadcastsd_pd(__mmask8 __M, __m128d __A) {
-  return (__m256d)__builtin_ia32_broadcastsd256_mask((__v2df)__A, (__v4df)_mm256_setzero_pd(), __M);
-}
-
-__funline __m256i _mm256_mask_broadcastd_epi32(__m256i __O, __mmask8 __M, __m128i __A) {
-  return (__m256i)__builtin_ia32_pbroadcastd256_mask((__v4si)__A, (__v8si)__O, __M);
-}
-
-__funline __m256i _mm256_maskz_broadcastd_epi32(__mmask8 __M, __m128i __A) {
-  return (__m256i)__builtin_ia32_pbroadcastd256_mask((__v4si)__A, (__v8si)_mm256_setzero_si256(), __M);
-}
-
-__funline __m256i _mm256_mask_set1_epi32(__m256i __O, __mmask8 __M, int __A) {
-  return (__m256i)__builtin_ia32_pbroadcastd256_gpr_mask(__A, (__v8si)__O, __M);
-}
-
-__funline __m256i _mm256_maskz_set1_epi32(__mmask8 __M, int __A) {
-  return (__m256i)__builtin_ia32_pbroadcastd256_gpr_mask(__A, (__v8si)_mm256_setzero_si256(), __M);
-}
-
-__funline __m128i _mm_mask_broadcastd_epi32(__m128i __O, __mmask8 __M, __m128i __A) {
-  return (__m128i)__builtin_ia32_pbroadcastd128_mask((__v4si)__A, (__v4si)__O, __M);
-}
-
-__funline __m128i _mm_maskz_broadcastd_epi32(__mmask8 __M, __m128i __A) {
-  return (__m128i)__builtin_ia32_pbroadcastd128_mask((__v4si)__A, (__v4si)_mm_setzero_si128(), __M);
-}
-
-__funline __m128i _mm_mask_set1_epi32(__m128i __O, __mmask8 __M, int __A) {
-  return (__m128i)__builtin_ia32_pbroadcastd128_gpr_mask(__A, (__v4si)__O, __M);
-}
-
-__funline __m128i _mm_maskz_set1_epi32(__mmask8 __M, int __A) {
-  return (__m128i)__builtin_ia32_pbroadcastd128_gpr_mask(__A, (__v4si)_mm_setzero_si128(), __M);
-}
-
-__funline __m256i _mm256_mask_broadcastq_epi64(__m256i __O, __mmask8 __M, __m128i __A) {
-  return (__m256i)__builtin_ia32_pbroadcastq256_mask((__v2di)__A, (__v4di)__O, __M);
-}
-
-__funline __m256i _mm256_maskz_broadcastq_epi64(__mmask8 __M, __m128i __A) {
-  return (__m256i)__builtin_ia32_pbroadcastq256_mask((__v2di)__A, (__v4di)_mm256_setzero_si256(), __M);
-}
-
-__funline __m256i _mm256_mask_set1_epi64(__m256i __O, __mmask8 __M, long long __A) {
-  return (__m256i)__builtin_ia32_pbroadcastq256_gpr_mask(__A, (__v4di)__O, __M);
-}
-
-__funline __m256i _mm256_maskz_set1_epi64(__mmask8 __M, long long __A) {
-  return (__m256i)__builtin_ia32_pbroadcastq256_gpr_mask(__A, (__v4di)_mm256_setzero_si256(), __M);
-}
-
-__funline __m128i _mm_mask_broadcastq_epi64(__m128i __O, __mmask8 __M, __m128i __A) {
-  return (__m128i)__builtin_ia32_pbroadcastq128_mask((__v2di)__A, (__v2di)__O, __M);
-}
-
-__funline __m128i _mm_maskz_broadcastq_epi64(__mmask8 __M, __m128i __A) {
-  return (__m128i)__builtin_ia32_pbroadcastq128_mask((__v2di)__A, (__v2di)_mm_setzero_si128(), __M);
-}
-
-__funline __m128i _mm_mask_set1_epi64(__m128i __O, __mmask8 __M, long long __A) {
-  return (__m128i)__builtin_ia32_pbroadcastq128_gpr_mask(__A, (__v2di)__O, __M);
-}
-
-__funline __m128i _mm_maskz_set1_epi64(__mmask8 __M, long long __A) {
-  return (__m128i)__builtin_ia32_pbroadcastq128_gpr_mask(__A, (__v2di)_mm_setzero_si128(), __M);
-}
-
-__funline __m256 _mm256_broadcast_f32x4(__m128 __A) {
-  return (__m256)__builtin_ia32_broadcastf32x4_256_mask((__v4sf)__A, (__v8sf)_mm256_undefined_pd(), (__mmask8)-1);
-}
-
-__funline __m256 _mm256_mask_broadcast_f32x4(__m256 __O, __mmask8 __M, __m128 __A) {
-  return (__m256)__builtin_ia32_broadcastf32x4_256_mask((__v4sf)__A, (__v8sf)__O, __M);
-}
-
-__funline __m256 _mm256_maskz_broadcast_f32x4(__mmask8 __M, __m128 __A) {
-  return (__m256)__builtin_ia32_broadcastf32x4_256_mask((__v4sf)__A, (__v8sf)_mm256_setzero_ps(), __M);
-}
-
-__funline __m256i _mm256_broadcast_i32x4(__m128i __A) {
-  return (__m256i)__builtin_ia32_broadcasti32x4_256_mask((__v4si)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1);
-}
-
-__funline __m256i _mm256_mask_broadcast_i32x4(__m256i __O, __mmask8 __M, __m128i __A) {
-  return (__m256i)__builtin_ia32_broadcasti32x4_256_mask((__v4si)__A, (__v8si)__O, __M);
-}
-
-__funline __m256i _mm256_maskz_broadcast_i32x4(__mmask8 __M, __m128i __A) {
-  return (__m256i)__builtin_ia32_broadcasti32x4_256_mask((__v4si)__A, (__v8si)_mm256_setzero_si256(), __M);
-}
-
-__funline __m256i _mm256_mask_cvtepi8_epi32(__m256i __W, __mmask8 __U, __m128i __A) {
-  return (__m256i)__builtin_ia32_pmovsxbd256_mask((__v16qi)__A, (__v8si)__W, (__mmask8)__U);
-}
-
-__funline __m256i _mm256_maskz_cvtepi8_epi32(__mmask8 __U, __m128i __A) {
-  return (__m256i)__builtin_ia32_pmovsxbd256_mask((__v16qi)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U);
-}
-
-__funline __m128i _mm_mask_cvtepi8_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
-  return (__m128i)__builtin_ia32_pmovsxbd128_mask((__v16qi)__A, (__v4si)__W, (__mmask8)__U);
-}
-
-__funline __m128i _mm_maskz_cvtepi8_epi32(__mmask8 __U, __m128i __A) {
-  return (__m128i)__builtin_ia32_pmovsxbd128_mask((__v16qi)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
-}
-
-__funline __m256i _mm256_mask_cvtepi8_epi64(__m256i __W, __mmask8 __U, __m128i __A) {
-  return (__m256i)__builtin_ia32_pmovsxbq256_mask((__v16qi)__A, (__v4di)__W, (__mmask8)__U);
-}
-
-__funline __m256i _mm256_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A) {
-  return (__m256i)__builtin_ia32_pmovsxbq256_mask((__v16qi)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U);
-}
-
-__funline __m128i _mm_mask_cvtepi8_epi64(__m128i __W, __mmask8 __U, __m128i __A) {
-  return (__m128i)__builtin_ia32_pmovsxbq128_mask((__v16qi)__A, (__v2di)__W, (__mmask8)__U);
-}
-
-__funline __m128i _mm_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A) {
-  return (__m128i)__builtin_ia32_pmovsxbq128_mask((__v16qi)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U);
-}
-
-__funline __m256i _mm256_mask_cvtepi16_epi32(__m256i __W, __mmask8 __U, __m128i __A) {
-  return (__m256i)__builtin_ia32_pmovsxwd256_mask((__v8hi)__A, (__v8si)__W, (__mmask8)__U);
-}
-
-__funline __m256i _mm256_maskz_cvtepi16_epi32(__mmask8 __U, __m128i __A) {
-  return (__m256i)__builtin_ia32_pmovsxwd256_mask((__v8hi)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U);
-}
-
-__funline __m128i _mm_mask_cvtepi16_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
-  return (__m128i)__builtin_ia32_pmovsxwd128_mask((__v8hi)__A, (__v4si)__W, (__mmask8)__U);
-}
-
-__funline __m128i _mm_maskz_cvtepi16_epi32(__mmask8 __U, __m128i __A) {
-  return (__m128i)__builtin_ia32_pmovsxwd128_mask((__v8hi)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
-}
-
-__funline __m256i _mm256_mask_cvtepi16_epi64(__m256i __W, __mmask8 __U, __m128i __A) {
-  return (__m256i)__builtin_ia32_pmovsxwq256_mask((__v8hi)__A, (__v4di)__W, (__mmask8)__U);
-}
-
-__funline __m256i _mm256_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A) {
-  return (__m256i)__builtin_ia32_pmovsxwq256_mask((__v8hi)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U);
-}
-
-__funline __m128i _mm_mask_cvtepi16_epi64(__m128i __W, __mmask8 __U, __m128i __A) {
-  return (__m128i)__builtin_ia32_pmovsxwq128_mask((__v8hi)__A, (__v2di)__W, (__mmask8)__U);
-}
-
-__funline __m128i _mm_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A) {
-  return (__m128i)__builtin_ia32_pmovsxwq128_mask((__v8hi)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U);
-}
-
-__funline __m256i _mm256_mask_cvtepi32_epi64(__m256i __W, __mmask8 __U, __m128i __X) {
-  return (__m256i)__builtin_ia32_pmovsxdq256_mask((__v4si)__X, (__v4di)__W, (__mmask8)__U);
-}
-
-__funline __m256i _mm256_maskz_cvtepi32_epi64(__mmask8 __U, __m128i __X) {
-  return (__m256i)__builtin_ia32_pmovsxdq256_mask((__v4si)__X, (__v4di)_mm256_setzero_si256(), (__mmask8)__U);
-}
-
-__funline __m128i _mm_mask_cvtepi32_epi64(__m128i __W, __mmask8 __U, __m128i __X) {
-  return (__m128i)__builtin_ia32_pmovsxdq128_mask((__v4si)__X, (__v2di)__W, (__mmask8)__U);
-}
-
-__funline __m128i _mm_maskz_cvtepi32_epi64(__mmask8 __U, __m128i __X) {
-  return (__m128i)__builtin_ia32_pmovsxdq128_mask((__v4si)__X, (__v2di)_mm_setzero_si128(), (__mmask8)__U);
-}
-
-__funline __m256i _mm256_mask_cvtepu8_epi32(__m256i __W, __mmask8 __U, __m128i __A) {
-  return (__m256i)__builtin_ia32_pmovzxbd256_mask((__v16qi)__A, (__v8si)__W, (__mmask8)__U);
-}
-
-__funline __m256i _mm256_maskz_cvtepu8_epi32(__mmask8 __U, __m128i __A) {
-  return (__m256i)__builtin_ia32_pmovzxbd256_mask((__v16qi)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U);
-}
-
-__funline __m128i _mm_mask_cvtepu8_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
-  return (__m128i)__builtin_ia32_pmovzxbd128_mask((__v16qi)__A, (__v4si)__W, (__mmask8)__U);
-}
-
-__funline __m128i _mm_maskz_cvtepu8_epi32(__mmask8 __U, __m128i __A) {
-  return (__m128i)__builtin_ia32_pmovzxbd128_mask((__v16qi)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
-}
-
-__funline __m256i _mm256_mask_cvtepu8_epi64(__m256i __W, __mmask8 __U, __m128i __A) {
-  return (__m256i)__builtin_ia32_pmovzxbq256_mask((__v16qi)__A, (__v4di)__W, (__mmask8)__U);
-}
-
-__funline __m256i _mm256_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A) {
-  return (__m256i)__builtin_ia32_pmovzxbq256_mask((__v16qi)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U);
-}
-
-__funline __m128i _mm_mask_cvtepu8_epi64(__m128i __W, __mmask8 __U, __m128i __A) {
-  return (__m128i)__builtin_ia32_pmovzxbq128_mask((__v16qi)__A, (__v2di)__W, (__mmask8)__U);
-}
-
-__funline __m128i _mm_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A) {
-  return (__m128i)__builtin_ia32_pmovzxbq128_mask((__v16qi)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U);
-}
-
-__funline __m256i _mm256_mask_cvtepu16_epi32(__m256i __W, __mmask8 __U, __m128i __A) {
-  return (__m256i)__builtin_ia32_pmovzxwd256_mask((__v8hi)__A, (__v8si)__W, (__mmask8)__U);
-}
-
-__funline __m256i _mm256_maskz_cvtepu16_epi32(__mmask8 __U, __m128i __A) {
-  return (__m256i)__builtin_ia32_pmovzxwd256_mask((__v8hi)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U);
-}
-
-__funline __m128i _mm_mask_cvtepu16_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
-  return (__m128i)__builtin_ia32_pmovzxwd128_mask((__v8hi)__A, (__v4si)__W, (__mmask8)__U);
-}
-
-__funline __m128i _mm_maskz_cvtepu16_epi32(__mmask8 __U, __m128i __A) {
-  return (__m128i)__builtin_ia32_pmovzxwd128_mask((__v8hi)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
-}
-
-__funline __m256i _mm256_mask_cvtepu16_epi64(__m256i __W, __mmask8 __U, __m128i __A) {
-  return (__m256i)__builtin_ia32_pmovzxwq256_mask((__v8hi)__A, (__v4di)__W, (__mmask8)__U);
-}
-
-__funline __m256i _mm256_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A) {
-  return (__m256i)__builtin_ia32_pmovzxwq256_mask((__v8hi)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U);
-}
-
-__funline __m128i _mm_mask_cvtepu16_epi64(__m128i __W, __mmask8 __U, __m128i __A) {
-  return (__m128i)__builtin_ia32_pmovzxwq128_mask((__v8hi)__A, (__v2di)__W, (__mmask8)__U);
-}
-
-__funline __m128i _mm_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A) {
-  return (__m128i)__builtin_ia32_pmovzxwq128_mask((__v8hi)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U);
-}
-
-__funline __m256i _mm256_mask_cvtepu32_epi64(__m256i __W, __mmask8 __U, __m128i __X) {
-  return (__m256i)__builtin_ia32_pmovzxdq256_mask((__v4si)__X, (__v4di)__W, (__mmask8)__U);
-}
-
-__funline __m256i _mm256_maskz_cvtepu32_epi64(__mmask8 __U, __m128i __X) {
-  return (__m256i)__builtin_ia32_pmovzxdq256_mask((__v4si)__X, (__v4di)_mm256_setzero_si256(), (__mmask8)__U);
-}
-
-__funline __m128i _mm_mask_cvtepu32_epi64(__m128i __W, __mmask8 __U, __m128i __X) {
-  return (__m128i)__builtin_ia32_pmovzxdq128_mask((__v4si)__X, (__v2di)__W, (__mmask8)__U);
-}
-
-__funline __m128i _mm_maskz_cvtepu32_epi64(__mmask8 __U, __m128i __X) {
-  return (__m128i)__builtin_ia32_pmovzxdq128_mask((__v4si)__X, (__v2di)_mm_setzero_si128(), (__mmask8)__U);
-}
-
-__funline __m256d _mm256_rcp14_pd(__m256d __A) {
-  return (__m256d)__builtin_ia32_rcp14pd256_mask((__v4df)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)-1);
-}
-
-__funline __m256d _mm256_mask_rcp14_pd(__m256d __W, __mmask8 __U, __m256d __A) {
-  return (__m256d)__builtin_ia32_rcp14pd256_mask((__v4df)__A, (__v4df)__W, (__mmask8)__U);
-}
-
-__funline __m256d _mm256_maskz_rcp14_pd(__mmask8 __U, __m256d __A) {
-  return (__m256d)__builtin_ia32_rcp14pd256_mask((__v4df)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)__U);
-}
-
-__funline __m128d _mm_rcp14_pd(__m128d __A) {
-  return (__m128d)__builtin_ia32_rcp14pd128_mask((__v2df)__A, (__v2df)_mm_setzero_pd(), (__mmask8)-1);
-}
-
-__funline __m128d _mm_mask_rcp14_pd(__m128d __W, __mmask8 __U, __m128d __A) {
-  return (__m128d)__builtin_ia32_rcp14pd128_mask((__v2df)__A, (__v2df)__W, (__mmask8)__U);
-}
-
-__funline __m128d _mm_maskz_rcp14_pd(__mmask8 __U, __m128d __A) {
-  return (__m128d)__builtin_ia32_rcp14pd128_mask((__v2df)__A, (__v2df)_mm_setzero_pd(), (__mmask8)__U);
-}
-
-__funline __m256 _mm256_rcp14_ps(__m256 __A) {
-  return (__m256)__builtin_ia32_rcp14ps256_mask((__v8sf)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)-1);
-}
-
-__funline __m256 _mm256_mask_rcp14_ps(__m256 __W, __mmask8 __U, __m256 __A) {
-  return (__m256)__builtin_ia32_rcp14ps256_mask((__v8sf)__A, (__v8sf)__W, (__mmask8)__U);
-}
-
-__funline __m256 _mm256_maskz_rcp14_ps(__mmask8 __U, __m256 __A) {
-  return (__m256)__builtin_ia32_rcp14ps256_mask((__v8sf)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U);
-}
-
-__funline __m128 _mm_rcp14_ps(__m128 __A) {
-  return (__m128)__builtin_ia32_rcp14ps128_mask((__v4sf)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)-1);
-}
-
-__funline __m128 _mm_mask_rcp14_ps(__m128 __W, __mmask8 __U, __m128 __A) {
-  return (__m128)__builtin_ia32_rcp14ps128_mask((__v4sf)__A, (__v4sf)__W, (__mmask8)__U);
-}
-
-__funline __m128 _mm_maskz_rcp14_ps(__mmask8 __U, __m128 __A) {
-  return (__m128)__builtin_ia32_rcp14ps128_mask((__v4sf)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U);
-}
-
-__funline __m256d _mm256_rsqrt14_pd(__m256d __A) {
-  return (__m256d)__builtin_ia32_rsqrt14pd256_mask((__v4df)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)-1);
-}
-
-__funline __m256d _mm256_mask_rsqrt14_pd(__m256d __W, __mmask8 __U, __m256d __A) {
-  return (__m256d)__builtin_ia32_rsqrt14pd256_mask((__v4df)__A, (__v4df)__W, (__mmask8)__U);
-}
-
-__funline __m256d _mm256_maskz_rsqrt14_pd(__mmask8 __U, __m256d __A) {
-  return (__m256d)__builtin_ia32_rsqrt14pd256_mask((__v4df)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)__U);
-}
-
-__funline __m128d _mm_rsqrt14_pd(__m128d __A) {
-  return (__m128d)__builtin_ia32_rsqrt14pd128_mask((__v2df)__A, (__v2df)_mm_setzero_pd(), (__mmask8)-1);
-}
-
-__funline __m128d _mm_mask_rsqrt14_pd(__m128d __W, __mmask8 __U, __m128d __A) {
-  return (__m128d)__builtin_ia32_rsqrt14pd128_mask((__v2df)__A, (__v2df)__W, (__mmask8)__U);
-}
-
-__funline __m128d _mm_maskz_rsqrt14_pd(__mmask8 __U, __m128d __A) {
-  return (__m128d)__builtin_ia32_rsqrt14pd128_mask((__v2df)__A, (__v2df)_mm_setzero_pd(), (__mmask8)__U);
-}
-
-__funline __m256 _mm256_rsqrt14_ps(__m256 __A) {
-  return (__m256)__builtin_ia32_rsqrt14ps256_mask((__v8sf)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)-1);
-}
-
-__funline __m256 _mm256_mask_rsqrt14_ps(__m256 __W, __mmask8 __U, __m256 __A) {
-  return (__m256)__builtin_ia32_rsqrt14ps256_mask((__v8sf)__A, (__v8sf)__W, (__mmask8)__U);
-}
-
-__funline __m256 _mm256_maskz_rsqrt14_ps(__mmask8 __U, __m256 __A) {
-  return (__m256)__builtin_ia32_rsqrt14ps256_mask((__v8sf)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U);
-}
-
-__funline __m128 _mm_rsqrt14_ps(__m128 __A) {
-  return (__m128)__builtin_ia32_rsqrt14ps128_mask((__v4sf)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)-1);
-}
-
-__funline __m128 _mm_mask_rsqrt14_ps(__m128 __W, __mmask8 __U, __m128 __A) {
-  return (__m128)__builtin_ia32_rsqrt14ps128_mask((__v4sf)__A, (__v4sf)__W, (__mmask8)__U);
-}
-
-__funline __m128 _mm_maskz_rsqrt14_ps(__mmask8 __U, __m128 __A) {
-  return (__m128)__builtin_ia32_rsqrt14ps128_mask((__v4sf)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U);
-}
-
-__funline __m256d _mm256_mask_sqrt_pd(__m256d __W, __mmask8 __U, __m256d __A) {
-  return (__m256d)__builtin_ia32_sqrtpd256_mask((__v4df)__A, (__v4df)__W, (__mmask8)__U);
-}
-
-__funline __m256d _mm256_maskz_sqrt_pd(__mmask8 __U, __m256d __A) {
-  return (__m256d)__builtin_ia32_sqrtpd256_mask((__v4df)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)__U);
-}
-
-__funline __m128d _mm_mask_sqrt_pd(__m128d __W, __mmask8 __U, __m128d __A) {
-  return (__m128d)__builtin_ia32_sqrtpd128_mask((__v2df)__A, (__v2df)__W, (__mmask8)__U);
-}
-
-__funline __m128d _mm_maskz_sqrt_pd(__mmask8 __U, __m128d __A) {
-  return (__m128d)__builtin_ia32_sqrtpd128_mask((__v2df)__A, (__v2df)_mm_setzero_pd(), (__mmask8)__U);
-}
-
-__funline __m256 _mm256_mask_sqrt_ps(__m256 __W, __mmask8 __U, __m256 __A) {
-  return (__m256)__builtin_ia32_sqrtps256_mask((__v8sf)__A, (__v8sf)__W, (__mmask8)__U);
-}
-
-__funline __m256 _mm256_maskz_sqrt_ps(__mmask8 __U, __m256 __A) {
-  return (__m256)__builtin_ia32_sqrtps256_mask((__v8sf)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U);
-}
-
-__funline __m128 _mm_mask_sqrt_ps(__m128 __W, __mmask8 __U, __m128 __A) {
-  return (__m128)__builtin_ia32_sqrtps128_mask((__v4sf)__A, (__v4sf)__W, (__mmask8)__U);
-}
-
-__funline __m128 _mm_maskz_sqrt_ps(__mmask8 __U, __m128 __A) {
-  return (__m128)__builtin_ia32_sqrtps128_mask((__v4sf)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U);
-}
-
-__funline __m256i _mm256_mask_add_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_paddd256_mask((__v8si)__A, (__v8si)__B, (__v8si)__W, (__mmask8)__U);
-}
-
-__funline __m256i _mm256_maskz_add_epi32(__mmask8 __U, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_paddd256_mask((__v8si)__A, (__v8si)__B, (__v8si)_mm256_setzero_si256(), (__mmask8)__U);
-}
-
-__funline __m256i _mm256_mask_add_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_paddq256_mask((__v4di)__A, (__v4di)__B, (__v4di)__W, (__mmask8)__U);
-}
-
-__funline __m256i _mm256_maskz_add_epi64(__mmask8 __U, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_paddq256_mask((__v4di)__A, (__v4di)__B, (__v4di)_mm256_setzero_si256(), (__mmask8)__U);
-}
-
-__funline __m256i _mm256_mask_sub_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_psubd256_mask((__v8si)__A, (__v8si)__B, (__v8si)__W, (__mmask8)__U);
-}
-
-__funline __m256i _mm256_maskz_sub_epi32(__mmask8 __U, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_psubd256_mask((__v8si)__A, (__v8si)__B, (__v8si)_mm256_setzero_si256(), (__mmask8)__U);
-}
-
-__funline __m256i _mm256_mask_sub_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_psubq256_mask((__v4di)__A, (__v4di)__B, (__v4di)__W, (__mmask8)__U);
-}
-
-__funline __m256i _mm256_maskz_sub_epi64(__mmask8 __U, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_psubq256_mask((__v4di)__A, (__v4di)__B, (__v4di)_mm256_setzero_si256(), (__mmask8)__U);
-}
-
-__funline __m128i _mm_mask_add_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_paddd128_mask((__v4si)__A, (__v4si)__B, (__v4si)__W, (__mmask8)__U);
-}
-
-__funline __m128i _mm_maskz_add_epi32(__mmask8 __U, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_paddd128_mask((__v4si)__A, (__v4si)__B, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
-}
-
-__funline __m128i _mm_mask_add_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_paddq128_mask((__v2di)__A, (__v2di)__B, (__v2di)__W, (__mmask8)__U);
-}
-
-__funline __m128i _mm_maskz_add_epi64(__mmask8 __U, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_paddq128_mask((__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), (__mmask8)__U);
-}
-
-__funline __m128i _mm_mask_sub_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_psubd128_mask((__v4si)__A, (__v4si)__B, (__v4si)__W, (__mmask8)__U);
-}
-
-__funline __m128i _mm_maskz_sub_epi32(__mmask8 __U, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_psubd128_mask((__v4si)__A, (__v4si)__B, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
-}
-
-__funline __m128i _mm_mask_sub_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_psubq128_mask((__v2di)__A, (__v2di)__B, (__v2di)__W, (__mmask8)__U);
-}
-
-__funline __m128i _mm_maskz_sub_epi64(__mmask8 __U, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_psubq128_mask((__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), (__mmask8)__U);
-}
-
-__funline __m256 _mm256_getexp_ps(__m256 __A) {
-  return (__m256)__builtin_ia32_getexpps256_mask((__v8sf)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)-1);
-}
-
-__funline __m256 _mm256_mask_getexp_ps(__m256 __W, __mmask8 __U, __m256 __A) {
-  return (__m256)__builtin_ia32_getexpps256_mask((__v8sf)__A, (__v8sf)__W, (__mmask8)__U);
-}
-
-__funline __m256 _mm256_maskz_getexp_ps(__mmask8 __U, __m256 __A) {
-  return (__m256)__builtin_ia32_getexpps256_mask((__v8sf)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U);
-}
-
-__funline __m256d _mm256_getexp_pd(__m256d __A) {
-  return (__m256d)__builtin_ia32_getexppd256_mask((__v4df)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)-1);
-}
-
-__funline __m256d _mm256_mask_getexp_pd(__m256d __W, __mmask8 __U, __m256d __A) {
-  return (__m256d)__builtin_ia32_getexppd256_mask((__v4df)__A, (__v4df)__W, (__mmask8)__U);
-}
-
-__funline __m256d _mm256_maskz_getexp_pd(__mmask8 __U, __m256d __A) {
-  return (__m256d)__builtin_ia32_getexppd256_mask((__v4df)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)__U);
-}
-
-__funline __m128 _mm_getexp_ps(__m128 __A) {
-  return (__m128)__builtin_ia32_getexpps128_mask((__v4sf)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)-1);
-}
-
-__funline __m128 _mm_mask_getexp_ps(__m128 __W, __mmask8 __U, __m128 __A) {
-  return (__m128)__builtin_ia32_getexpps128_mask((__v4sf)__A, (__v4sf)__W, (__mmask8)__U);
-}
-
-__funline __m128 _mm_maskz_getexp_ps(__mmask8 __U, __m128 __A) {
-  return (__m128)__builtin_ia32_getexpps128_mask((__v4sf)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U);
-}
-
-__funline __m128d _mm_getexp_pd(__m128d __A) {
-  return (__m128d)__builtin_ia32_getexppd128_mask((__v2df)__A, (__v2df)_mm_setzero_pd(), (__mmask8)-1);
-}
-
-__funline __m128d _mm_mask_getexp_pd(__m128d __W, __mmask8 __U, __m128d __A) {
-  return (__m128d)__builtin_ia32_getexppd128_mask((__v2df)__A, (__v2df)__W, (__mmask8)__U);
-}
-
-__funline __m128d _mm_maskz_getexp_pd(__mmask8 __U, __m128d __A) {
-  return (__m128d)__builtin_ia32_getexppd128_mask((__v2df)__A, (__v2df)_mm_setzero_pd(), (__mmask8)__U);
-}
-
-__funline __m256i _mm256_mask_srl_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) {
-  return (__m256i)__builtin_ia32_psrld256_mask((__v8si)__A, (__v4si)__B, (__v8si)__W, (__mmask8)__U);
-}
-
-__funline __m256i _mm256_maskz_srl_epi32(__mmask8 __U, __m256i __A, __m128i __B) {
-  return (__m256i)__builtin_ia32_psrld256_mask((__v8si)__A, (__v4si)__B, (__v8si)_mm256_setzero_si256(), (__mmask8)__U);
-}
-
-__funline __m128i _mm_mask_srl_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_psrld128_mask((__v4si)__A, (__v4si)__B, (__v4si)__W, (__mmask8)__U);
-}
-
-__funline __m128i _mm_maskz_srl_epi32(__mmask8 __U, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_psrld128_mask((__v4si)__A, (__v4si)__B, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
-}
-
-__funline __m256i _mm256_mask_srl_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) {
-  return (__m256i)__builtin_ia32_psrlq256_mask((__v4di)__A, (__v2di)__B, (__v4di)__W, (__mmask8)__U);
-}
-
-__funline __m256i _mm256_maskz_srl_epi64(__mmask8 __U, __m256i __A, __m128i __B) {
-  return (__m256i)__builtin_ia32_psrlq256_mask((__v4di)__A, (__v2di)__B, (__v4di)_mm256_setzero_si256(), (__mmask8)__U);
-}
-
-__funline __m128i _mm_mask_srl_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_psrlq128_mask((__v2di)__A, (__v2di)__B, (__v2di)__W, (__mmask8)__U);
-}
-
-__funline __m128i _mm_maskz_srl_epi64(__mmask8 __U, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_psrlq128_mask((__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), (__mmask8)__U);
-}
-
-__funline __m256i _mm256_mask_and_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_pandd256_mask((__v8si)__A, (__v8si)__B, (__v8si)__W, (__mmask8)__U);
-}
-
-__funline __m256i _mm256_maskz_and_epi32(__mmask8 __U, __m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_pandd256_mask((__v8si)__A, (__v8si)__B, (__v8si)_mm256_setzero_si256(), (__mmask8)__U);
-}
-
-__funline __m256d _mm256_scalef_pd(__m256d __A, __m256d __B) {
-  return (__m256d)__builtin_ia32_scalefpd256_mask((__v4df)__A, (__v4df)__B, (__v4df)_mm256_setzero_pd(), (__mmask8)-1);
-}
-
-__funline __m256d _mm256_mask_scalef_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
-  return (__m256d)__builtin_ia32_scalefpd256_mask((__v4df)__A, (__v4df)__B, (__v4df)__W, (__mmask8)__U);
-}
-
-__funline __m256d _mm256_maskz_scalef_pd(__mmask8 __U, __m256d __A, __m256d __B) {
-  return (__m256d)__builtin_ia32_scalefpd256_mask((__v4df)__A, (__v4df)__B, (__v4df)_mm256_setzero_pd(), (__mmask8)__U);
-}
-
-__funline __m256 _mm256_scalef_ps(__m256 __A, __m256 __B) {
-  return (__m256)__builtin_ia32_scalefps256_mask((__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ps(), (__mmask8)-1);
-}
-
-__funline __m256 _mm256_mask_scalef_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
-  return (__m256)__builtin_ia32_scalefps256_mask((__v8sf)__A, (__v8sf)__B, (__v8sf)__W, (__mmask8)__U);
-}
-
-__funline __m256 _mm256_maskz_scalef_ps(__mmask8 __U, __m256 __A, __m256 __B) {
-  return (__m256)__builtin_ia32_scalefps256_mask((__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U);
-}
-
-__funline __m128d _mm_scalef_pd(__m128d __A, __m128d __B) {
-  return (__m128d)__builtin_ia32_scalefpd128_mask((__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)-1);
-}
-
-__funline __m128d _mm_mask_scalef_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
-  return (__m128d)__builtin_ia32_scalefpd128_mask((__v2df)__A, (__v2df)__B, (__v2df)__W, (__mmask8)__U);
-}
-
-__funline __m128d _mm_maskz_scalef_pd(__mmask8 __U, __m128d __A, __m128d __B) {
-  return (__m128d)__builtin_ia32_scalefpd128_mask((__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U);
-}
-
-__funline __m128 _mm_scalef_ps(__m128 __A, __m128 __B) {
-  return (__m128)__builtin_ia32_scalefps128_mask((__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)-1);
-}
-
-__funline __m128 _mm_mask_scalef_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
-  return (__m128)__builtin_ia32_scalefps128_mask((__v4sf)__A, (__v4sf)__B, (__v4sf)__W, (__mmask8)__U);
-}
-
-__funline __m128 _mm_maskz_scalef_ps(__mmask8 __U, __m128 __A, __m128 __B) {
-  return (__m128)__builtin_ia32_scalefps128_mask((__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U);
-}
-
-__funline __m256d _mm256_mask_fmadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) {
-  return (__m256d)__builtin_ia32_vfmaddpd256_mask((__v4df)__A, (__v4df)__B, (__v4df)__C, (__mmask8)__U);
-}
-
-__funline __m256d _mm256_mask3_fmadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) {
-  return (__m256d)__builtin_ia32_vfmaddpd256_mask3((__v4df)__A, (__v4df)__B, (__v4df)__C, (__mmask8)__U);
-}
-
-__funline __m256d _mm256_maskz_fmadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) {
-  return (__m256d)__builtin_ia32_vfmaddpd256_maskz((__v4df)__A, (__v4df)__B, (__v4df)__C, (__mmask8)__U);
-}
-
-__funline __m128d _mm_mask_fmadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) {
-  return (__m128d)__builtin_ia32_vfmaddpd128_mask((__v2df)__A, (__v2df)__B, (__v2df)__C, (__mmask8)__U);
-}
-
-__funline __m128d _mm_mask3_fmadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) {
-  return (__m128d)__builtin_ia32_vfmaddpd128_mask3((__v2df)__A, (__v2df)__B, (__v2df)__C, (__mmask8)__U);
-}
-
-__funline __m128d _mm_maskz_fmadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) {
-  return (__m128d)__builtin_ia32_vfmaddpd128_maskz((__v2df)__A, (__v2df)__B, (__v2df)__C, (__mmask8)__U);
-}
-
-__funline __m256 _mm256_mask_fmadd_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C) {
-  return (__m256)__builtin_ia32_vfmaddps256_mask((__v8sf)__A, (__v8sf)__B, (__v8sf)__C, (__mmask8)__U);
-}
-
-__funline __m256 _mm256_mask3_fmadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U) {
-  return (__m256)__builtin_ia32_vfmaddps256_mask3((__v8sf)__A, (__v8sf)__B, (__v8sf)__C, (__mmask8)__U);
-}
-
-__funline __m256 _mm256_maskz_fmadd_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C) {
-  return (__m256)__builtin_ia32_vfmaddps256_maskz((__v8sf)__A, (__v8sf)__B, (__v8sf)__C, (__mmask8)__U);
-}
-
-__funline __m128 _mm_mask_fmadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) {
-  return (__m128)__builtin_ia32_vfmaddps128_mask((__v4sf)__A, (__v4sf)__B, (__v4sf)__C, (__mmask8)__U);
-}
-
-__funline __m128 _mm_mask3_fmadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) {
-  return (__m128)__builtin_ia32_vfmaddps128_mask3((__v4sf)__A, (__v4sf)__B, (__v4sf)__C, (__mmask8)__U);
-}
-
-__funline __m128 _mm_maskz_fmadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) {
-  return (__m128)__builtin_ia32_vfmaddps128_maskz((__v4sf)__A, (__v4sf)__B, (__v4sf)__C, (__mmask8)__U);
-}
-
-__funline __m256d _mm256_mask_fmsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) {
-  return (__m256d)__builtin_ia32_vfmsubpd256_mask((__v4df)__A, (__v4df)__B, (__v4df)__C, (__mmask8)__U);
-}
-
-__funline __m256d _mm256_mask3_fmsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) {
-  return (__m256d)__builtin_ia32_vfmsubpd256_mask3((__v4df)__A, (__v4df)__B, (__v4df)__C, (__mmask8)__U);
-}
-
-__funline __m256d _mm256_maskz_fmsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) {
-  return (__m256d)__builtin_ia32_vfmsubpd256_maskz((__v4df)__A, (__v4df)__B, (__v4df)__C, (__mmask8)__U);
-}
-
-__funline __m128d _mm_mask_fmsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) {
-  return (__m128d)__builtin_ia32_vfmsubpd128_mask((__v2df)__A, (__v2df)__B, (__v2df)__C, (__mmask8)__U);
-}
-
-__funline __m128d _mm_mask3_fmsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) {
-  return (__m128d)__builtin_ia32_vfmsubpd128_mask3((__v2df)__A, (__v2df)__B, (__v2df)__C, (__mmask8)__U);
-}
-
-__funline __m128d _mm_maskz_fmsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) {
-  return (__m128d)__builtin_ia32_vfmsubpd128_maskz((__v2df)__A, (__v2df)__B, (__v2df)__C, (__mmask8)__U);
-}
-
-__funline __m256 _mm256_mask_fmsub_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C) {
-  return (__m256)__builtin_ia32_vfmsubps256_mask((__v8sf)__A, (__v8sf)__B, (__v8sf)__C, (__mmask8)__U);
-}
-
-__funline __m256 _mm256_mask3_fmsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U) {
-  return (__m256)__builtin_ia32_vfmsubps256_mask3((__v8sf)__A, (__v8sf)__B, (__v8sf)__C, (__mmask8)__U);
-}
- -__funline __m256 _mm256_maskz_fmsub_ps(__mmask8 __U, __m256 __A, __m256 __B, - __m256 __C) { - return (__m256)__builtin_ia32_vfmsubps256_maskz((__v8sf)__A, (__v8sf)__B, - (__v8sf)__C, (__mmask8)__U); -} - -__funline __m128 _mm_mask_fmsub_ps(__m128 __A, __mmask8 __U, __m128 __B, - __m128 __C) { - return (__m128)__builtin_ia32_vfmsubps128_mask((__v4sf)__A, (__v4sf)__B, - (__v4sf)__C, (__mmask8)__U); -} - -__funline __m128 _mm_mask3_fmsub_ps(__m128 __A, __m128 __B, __m128 __C, - __mmask8 __U) { - return (__m128)__builtin_ia32_vfmsubps128_mask3((__v4sf)__A, (__v4sf)__B, - (__v4sf)__C, (__mmask8)__U); -} - -__funline __m128 _mm_maskz_fmsub_ps(__mmask8 __U, __m128 __A, __m128 __B, - __m128 __C) { - return (__m128)__builtin_ia32_vfmsubps128_maskz((__v4sf)__A, (__v4sf)__B, - (__v4sf)__C, (__mmask8)__U); -} - -__funline __m256d _mm256_mask_fmaddsub_pd(__m256d __A, __mmask8 __U, __m256d __B, - __m256d __C) { - return (__m256d)__builtin_ia32_vfmaddsubpd256_mask( - (__v4df)__A, (__v4df)__B, (__v4df)__C, (__mmask8)__U); -} - -__funline __m256d _mm256_mask3_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C, - __mmask8 __U) { - return (__m256d)__builtin_ia32_vfmaddsubpd256_mask3( - (__v4df)__A, (__v4df)__B, (__v4df)__C, (__mmask8)__U); -} - -__funline __m256d _mm256_maskz_fmaddsub_pd(__mmask8 __U, __m256d __A, __m256d __B, - __m256d __C) { - return (__m256d)__builtin_ia32_vfmaddsubpd256_maskz( - (__v4df)__A, (__v4df)__B, (__v4df)__C, (__mmask8)__U); -} - -__funline __m128d _mm_mask_fmaddsub_pd(__m128d __A, __mmask8 __U, __m128d __B, - __m128d __C) { - return (__m128d)__builtin_ia32_vfmaddsubpd128_mask( - (__v2df)__A, (__v2df)__B, (__v2df)__C, (__mmask8)__U); -} - -__funline __m128d _mm_mask3_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C, - __mmask8 __U) { - return (__m128d)__builtin_ia32_vfmaddsubpd128_mask3( - (__v2df)__A, (__v2df)__B, (__v2df)__C, (__mmask8)__U); -} - -__funline __m128d _mm_maskz_fmaddsub_pd(__mmask8 __U, __m128d __A, __m128d __B, - __m128d __C) { - return (__m128d)__builtin_ia32_vfmaddsubpd128_maskz( - (__v2df)__A, (__v2df)__B, (__v2df)__C, (__mmask8)__U); -} - -__funline __m256 _mm256_mask_fmaddsub_ps(__m256 __A, __mmask8 __U, __m256 __B, - __m256 __C) { - return (__m256)__builtin_ia32_vfmaddsubps256_mask((__v8sf)__A, (__v8sf)__B, - (__v8sf)__C, (__mmask8)__U); -} - -__funline __m256 _mm256_mask3_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C, - __mmask8 __U) { - return (__m256)__builtin_ia32_vfmaddsubps256_mask3( - (__v8sf)__A, (__v8sf)__B, (__v8sf)__C, (__mmask8)__U); -} - -__funline __m256 _mm256_maskz_fmaddsub_ps(__mmask8 __U, __m256 __A, __m256 __B, - __m256 __C) { - return (__m256)__builtin_ia32_vfmaddsubps256_maskz( - (__v8sf)__A, (__v8sf)__B, (__v8sf)__C, (__mmask8)__U); -} - -__funline __m128 _mm_mask_fmaddsub_ps(__m128 __A, __mmask8 __U, __m128 __B, - __m128 __C) { - return (__m128)__builtin_ia32_vfmaddsubps128_mask((__v4sf)__A, (__v4sf)__B, - (__v4sf)__C, (__mmask8)__U); -} - -__funline __m128 _mm_mask3_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C, - __mmask8 __U) { - return (__m128)__builtin_ia32_vfmaddsubps128_mask3( - (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, (__mmask8)__U); -} - -__funline __m128 _mm_maskz_fmaddsub_ps(__mmask8 __U, __m128 __A, __m128 __B, - __m128 __C) { - return (__m128)__builtin_ia32_vfmaddsubps128_maskz( - (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, (__mmask8)__U); -} - -__funline __m256d _mm256_mask_fmsubadd_pd(__m256d __A, __mmask8 __U, __m256d __B, - __m256d __C) { - return (__m256d)__builtin_ia32_vfmaddsubpd256_mask( - (__v4df)__A, (__v4df)__B, 
-(__v4df)__C, (__mmask8)__U); -} - -__funline __m256d _mm256_mask3_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C, - __mmask8 __U) { - return (__m256d)__builtin_ia32_vfmsubaddpd256_mask3( - (__v4df)__A, (__v4df)__B, (__v4df)__C, (__mmask8)__U); -} - -__funline __m256d _mm256_maskz_fmsubadd_pd(__mmask8 __U, __m256d __A, __m256d __B, - __m256d __C) { - return (__m256d)__builtin_ia32_vfmaddsubpd256_maskz( - (__v4df)__A, (__v4df)__B, -(__v4df)__C, (__mmask8)__U); -} - -__funline __m128d _mm_mask_fmsubadd_pd(__m128d __A, __mmask8 __U, __m128d __B, - __m128d __C) { - return (__m128d)__builtin_ia32_vfmaddsubpd128_mask( - (__v2df)__A, (__v2df)__B, -(__v2df)__C, (__mmask8)__U); -} - -__funline __m128d _mm_mask3_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C, - __mmask8 __U) { - return (__m128d)__builtin_ia32_vfmsubaddpd128_mask3( - (__v2df)__A, (__v2df)__B, (__v2df)__C, (__mmask8)__U); -} - -__funline __m128d _mm_maskz_fmsubadd_pd(__mmask8 __U, __m128d __A, __m128d __B, - __m128d __C) { - return (__m128d)__builtin_ia32_vfmaddsubpd128_maskz( - (__v2df)__A, (__v2df)__B, -(__v2df)__C, (__mmask8)__U); -} - -__funline __m256 _mm256_mask_fmsubadd_ps(__m256 __A, __mmask8 __U, __m256 __B, - __m256 __C) { - return (__m256)__builtin_ia32_vfmaddsubps256_mask( - (__v8sf)__A, (__v8sf)__B, -(__v8sf)__C, (__mmask8)__U); -} - -__funline __m256 _mm256_mask3_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C, - __mmask8 __U) { - return (__m256)__builtin_ia32_vfmsubaddps256_mask3( - (__v8sf)__A, (__v8sf)__B, (__v8sf)__C, (__mmask8)__U); -} - -__funline __m256 _mm256_maskz_fmsubadd_ps(__mmask8 __U, __m256 __A, __m256 __B, - __m256 __C) { - return (__m256)__builtin_ia32_vfmaddsubps256_maskz( - (__v8sf)__A, (__v8sf)__B, -(__v8sf)__C, (__mmask8)__U); -} - -__funline __m128 _mm_mask_fmsubadd_ps(__m128 __A, __mmask8 __U, __m128 __B, - __m128 __C) { - return (__m128)__builtin_ia32_vfmaddsubps128_mask( - (__v4sf)__A, (__v4sf)__B, -(__v4sf)__C, (__mmask8)__U); -} - -__funline __m128 _mm_mask3_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C, - __mmask8 __U) { - return (__m128)__builtin_ia32_vfmsubaddps128_mask3( - (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, (__mmask8)__U); -} - -__funline __m128 _mm_maskz_fmsubadd_ps(__mmask8 __U, __m128 __A, __m128 __B, - __m128 __C) { - return (__m128)__builtin_ia32_vfmaddsubps128_maskz( - (__v4sf)__A, (__v4sf)__B, -(__v4sf)__C, (__mmask8)__U); -} - -__funline __m256d _mm256_mask_fnmadd_pd(__m256d __A, __mmask8 __U, __m256d __B, - __m256d __C) { - return (__m256d)__builtin_ia32_vfnmaddpd256_mask((__v4df)__A, (__v4df)__B, - (__v4df)__C, (__mmask8)__U); -} - -__funline __m256d _mm256_mask3_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C, - __mmask8 __U) { - return (__m256d)__builtin_ia32_vfnmaddpd256_mask3((__v4df)__A, (__v4df)__B, - (__v4df)__C, (__mmask8)__U); -} - -__funline __m256d _mm256_maskz_fnmadd_pd(__mmask8 __U, __m256d __A, __m256d __B, - __m256d __C) { - return (__m256d)__builtin_ia32_vfnmaddpd256_maskz((__v4df)__A, (__v4df)__B, - (__v4df)__C, (__mmask8)__U); -} - -__funline __m128d _mm_mask_fnmadd_pd(__m128d __A, __mmask8 __U, __m128d __B, - __m128d __C) { - return (__m128d)__builtin_ia32_vfnmaddpd128_mask((__v2df)__A, (__v2df)__B, - (__v2df)__C, (__mmask8)__U); -} - -__funline __m128d _mm_mask3_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C, - __mmask8 __U) { - return (__m128d)__builtin_ia32_vfnmaddpd128_mask3((__v2df)__A, (__v2df)__B, - (__v2df)__C, (__mmask8)__U); -} - -__funline __m128d _mm_maskz_fnmadd_pd(__mmask8 __U, __m128d __A, __m128d __B, - __m128d __C) { - return 
(__m128d)__builtin_ia32_vfnmaddpd128_maskz((__v2df)__A, (__v2df)__B, - (__v2df)__C, (__mmask8)__U); -} - -__funline __m256 _mm256_mask_fnmadd_ps(__m256 __A, __mmask8 __U, __m256 __B, - __m256 __C) { - return (__m256)__builtin_ia32_vfnmaddps256_mask((__v8sf)__A, (__v8sf)__B, - (__v8sf)__C, (__mmask8)__U); -} - -__funline __m256 _mm256_mask3_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C, - __mmask8 __U) { - return (__m256)__builtin_ia32_vfnmaddps256_mask3((__v8sf)__A, (__v8sf)__B, - (__v8sf)__C, (__mmask8)__U); -} - -__funline __m256 _mm256_maskz_fnmadd_ps(__mmask8 __U, __m256 __A, __m256 __B, - __m256 __C) { - return (__m256)__builtin_ia32_vfnmaddps256_maskz((__v8sf)__A, (__v8sf)__B, - (__v8sf)__C, (__mmask8)__U); -} - -__funline __m128 _mm_mask_fnmadd_ps(__m128 __A, __mmask8 __U, __m128 __B, - __m128 __C) { - return (__m128)__builtin_ia32_vfnmaddps128_mask((__v4sf)__A, (__v4sf)__B, - (__v4sf)__C, (__mmask8)__U); -} - -__funline __m128 _mm_mask3_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C, - __mmask8 __U) { - return (__m128)__builtin_ia32_vfnmaddps128_mask3((__v4sf)__A, (__v4sf)__B, - (__v4sf)__C, (__mmask8)__U); -} - -__funline __m128 _mm_maskz_fnmadd_ps(__mmask8 __U, __m128 __A, __m128 __B, - __m128 __C) { - return (__m128)__builtin_ia32_vfnmaddps128_maskz((__v4sf)__A, (__v4sf)__B, - (__v4sf)__C, (__mmask8)__U); -} - -__funline __m256d _mm256_mask_fnmsub_pd(__m256d __A, __mmask8 __U, __m256d __B, - __m256d __C) { - return (__m256d)__builtin_ia32_vfnmsubpd256_mask((__v4df)__A, (__v4df)__B, - (__v4df)__C, (__mmask8)__U); -} - -__funline __m256d _mm256_mask3_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C, - __mmask8 __U) { - return (__m256d)__builtin_ia32_vfnmsubpd256_mask3((__v4df)__A, (__v4df)__B, - (__v4df)__C, (__mmask8)__U); -} - -__funline __m256d _mm256_maskz_fnmsub_pd(__mmask8 __U, __m256d __A, __m256d __B, - __m256d __C) { - return (__m256d)__builtin_ia32_vfnmsubpd256_maskz((__v4df)__A, (__v4df)__B, - (__v4df)__C, (__mmask8)__U); -} - -__funline __m128d _mm_mask_fnmsub_pd(__m128d __A, __mmask8 __U, __m128d __B, - __m128d __C) { - return (__m128d)__builtin_ia32_vfnmsubpd128_mask((__v2df)__A, (__v2df)__B, - (__v2df)__C, (__mmask8)__U); -} - -__funline __m128d _mm_mask3_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C, - __mmask8 __U) { - return (__m128d)__builtin_ia32_vfnmsubpd128_mask3((__v2df)__A, (__v2df)__B, - (__v2df)__C, (__mmask8)__U); -} - -__funline __m128d _mm_maskz_fnmsub_pd(__mmask8 __U, __m128d __A, __m128d __B, - __m128d __C) { - return (__m128d)__builtin_ia32_vfnmsubpd128_maskz((__v2df)__A, (__v2df)__B, - (__v2df)__C, (__mmask8)__U); -} - -__funline __m256 _mm256_mask_fnmsub_ps(__m256 __A, __mmask8 __U, __m256 __B, - __m256 __C) { - return (__m256)__builtin_ia32_vfnmsubps256_mask((__v8sf)__A, (__v8sf)__B, - (__v8sf)__C, (__mmask8)__U); -} - -__funline __m256 _mm256_mask3_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C, - __mmask8 __U) { - return (__m256)__builtin_ia32_vfnmsubps256_mask3((__v8sf)__A, (__v8sf)__B, - (__v8sf)__C, (__mmask8)__U); -} - -__funline __m256 _mm256_maskz_fnmsub_ps(__mmask8 __U, __m256 __A, __m256 __B, - __m256 __C) { - return (__m256)__builtin_ia32_vfnmsubps256_maskz((__v8sf)__A, (__v8sf)__B, - (__v8sf)__C, (__mmask8)__U); -} - -__funline __m128 _mm_mask_fnmsub_ps(__m128 __A, __mmask8 __U, __m128 __B, - __m128 __C) { - return (__m128)__builtin_ia32_vfnmsubps128_mask((__v4sf)__A, (__v4sf)__B, - (__v4sf)__C, (__mmask8)__U); -} - -__funline __m128 _mm_mask3_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C, - __mmask8 __U) { - return 
(__m128)__builtin_ia32_vfnmsubps128_mask3((__v4sf)__A, (__v4sf)__B, - (__v4sf)__C, (__mmask8)__U); -} - -__funline __m128 _mm_maskz_fnmsub_ps(__mmask8 __U, __m128 __A, __m128 __B, - __m128 __C) { - return (__m128)__builtin_ia32_vfnmsubps128_maskz((__v4sf)__A, (__v4sf)__B, - (__v4sf)__C, (__mmask8)__U); -} - -__funline __m128i _mm_mask_and_epi32(__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_pandd128_mask((__v4si)__A, (__v4si)__B, - (__v4si)__W, (__mmask8)__U); -} - -__funline __m128i _mm_maskz_and_epi32(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_pandd128_mask( - (__v4si)__A, (__v4si)__B, (__v4si)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m256i _mm256_mask_andnot_epi32(__m256i __W, __mmask8 __U, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_pandnd256_mask((__v8si)__A, (__v8si)__B, - (__v8si)__W, (__mmask8)__U); -} - -__funline __m256i _mm256_maskz_andnot_epi32(__mmask8 __U, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_pandnd256_mask( - (__v8si)__A, (__v8si)__B, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); -} - -__funline __m128i _mm_mask_andnot_epi32(__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_pandnd128_mask((__v4si)__A, (__v4si)__B, - (__v4si)__W, (__mmask8)__U); -} - -__funline __m128i _mm_maskz_andnot_epi32(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_pandnd128_mask( - (__v4si)__A, (__v4si)__B, (__v4si)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m256i _mm256_mask_or_epi32(__m256i __W, __mmask8 __U, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_pord256_mask((__v8si)__A, (__v8si)__B, - (__v8si)__W, (__mmask8)__U); -} - -__funline __m256i _mm256_maskz_or_epi32(__mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_pord256_mask( - (__v8si)__A, (__v8si)__B, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); -} - -__funline __m256i _mm256_or_epi32(__m256i __A, __m256i __B) { - return (__m256i)((__v8su)__A | (__v8su)__B); -} - -__funline __m128i _mm_mask_or_epi32(__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_pord128_mask((__v4si)__A, (__v4si)__B, - (__v4si)__W, (__mmask8)__U); -} - -__funline __m128i _mm_maskz_or_epi32(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_pord128_mask( - (__v4si)__A, (__v4si)__B, (__v4si)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m128i _mm_or_epi32(__m128i __A, __m128i __B) { - return (__m128i)((__v4su)__A | (__v4su)__B); -} - -__funline __m256i _mm256_mask_xor_epi32(__m256i __W, __mmask8 __U, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_pxord256_mask((__v8si)__A, (__v8si)__B, - (__v8si)__W, (__mmask8)__U); -} - -__funline __m256i _mm256_maskz_xor_epi32(__mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_pxord256_mask( - (__v8si)__A, (__v8si)__B, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); -} - -__funline __m256i _mm256_xor_epi32(__m256i __A, __m256i __B) { - return (__m256i)((__v8su)__A ^ (__v8su)__B); -} - -__funline __m128i _mm_mask_xor_epi32(__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_pxord128_mask((__v4si)__A, (__v4si)__B, - (__v4si)__W, (__mmask8)__U); -} - -__funline __m128i _mm_maskz_xor_epi32(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_pxord128_mask( - (__v4si)__A, (__v4si)__B, (__v4si)_mm_setzero_si128(), 
(__mmask8)__U); -} - -__funline __m128i _mm_xor_epi32(__m128i __A, __m128i __B) { - return (__m128i)((__v4su)__A ^ (__v4su)__B); -} - -__funline __m128 _mm_mask_cvtpd_ps(__m128 __W, __mmask8 __U, __m128d __A) { - return (__m128)__builtin_ia32_cvtpd2ps_mask((__v2df)__A, (__v4sf)__W, - (__mmask8)__U); -} - -__funline __m128 _mm_maskz_cvtpd_ps(__mmask8 __U, __m128d __A) { - return (__m128)__builtin_ia32_cvtpd2ps_mask( - (__v2df)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); -} - -__funline __m128 _mm256_mask_cvtpd_ps(__m128 __W, __mmask8 __U, __m256d __A) { - return (__m128)__builtin_ia32_cvtpd2ps256_mask((__v4df)__A, (__v4sf)__W, - (__mmask8)__U); -} - -__funline __m128 _mm256_maskz_cvtpd_ps(__mmask8 __U, __m256d __A) { - return (__m128)__builtin_ia32_cvtpd2ps256_mask( - (__v4df)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); -} - -__funline __m256i _mm256_mask_cvtps_epi32(__m256i __W, __mmask8 __U, __m256 __A) { - return (__m256i)__builtin_ia32_cvtps2dq256_mask((__v8sf)__A, (__v8si)__W, - (__mmask8)__U); -} - -__funline __m256i _mm256_maskz_cvtps_epi32(__mmask8 __U, __m256 __A) { - return (__m256i)__builtin_ia32_cvtps2dq256_mask( - (__v8sf)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); -} - -__funline __m128i _mm_mask_cvtps_epi32(__m128i __W, __mmask8 __U, __m128 __A) { - return (__m128i)__builtin_ia32_cvtps2dq128_mask((__v4sf)__A, (__v4si)__W, - (__mmask8)__U); -} - -__funline __m128i _mm_maskz_cvtps_epi32(__mmask8 __U, __m128 __A) { - return (__m128i)__builtin_ia32_cvtps2dq128_mask( - (__v4sf)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m256i _mm256_cvtps_epu32(__m256 __A) { - return (__m256i)__builtin_ia32_cvtps2udq256_mask( - (__v8sf)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)-1); -} - -__funline __m256i _mm256_mask_cvtps_epu32(__m256i __W, __mmask8 __U, __m256 __A) { - return (__m256i)__builtin_ia32_cvtps2udq256_mask((__v8sf)__A, (__v8si)__W, - (__mmask8)__U); -} - -__funline __m256i _mm256_maskz_cvtps_epu32(__mmask8 __U, __m256 __A) { - return (__m256i)__builtin_ia32_cvtps2udq256_mask( - (__v8sf)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); -} - -__funline __m128i _mm_cvtps_epu32(__m128 __A) { - return (__m128i)__builtin_ia32_cvtps2udq128_mask( - (__v4sf)__A, (__v4si)_mm_setzero_si128(), (__mmask8)-1); -} - -__funline __m128i _mm_mask_cvtps_epu32(__m128i __W, __mmask8 __U, __m128 __A) { - return (__m128i)__builtin_ia32_cvtps2udq128_mask((__v4sf)__A, (__v4si)__W, - (__mmask8)__U); -} - -__funline __m128i _mm_maskz_cvtps_epu32(__mmask8 __U, __m128 __A) { - return (__m128i)__builtin_ia32_cvtps2udq128_mask( - (__v4sf)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m256d _mm256_mask_movedup_pd(__m256d __W, __mmask8 __U, __m256d __A) { - return (__m256d)__builtin_ia32_movddup256_mask((__v4df)__A, (__v4df)__W, - (__mmask8)__U); -} - -__funline __m256d _mm256_maskz_movedup_pd(__mmask8 __U, __m256d __A) { - return (__m256d)__builtin_ia32_movddup256_mask( - (__v4df)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); -} - -__funline __m128d _mm_mask_movedup_pd(__m128d __W, __mmask8 __U, __m128d __A) { - return (__m128d)__builtin_ia32_movddup128_mask((__v2df)__A, (__v2df)__W, - (__mmask8)__U); -} - -__funline __m128d _mm_maskz_movedup_pd(__mmask8 __U, __m128d __A) { - return (__m128d)__builtin_ia32_movddup128_mask( - (__v2df)__A, (__v2df)_mm_setzero_pd(), (__mmask8)__U); -} - -__funline __m256 _mm256_mask_movehdup_ps(__m256 __W, __mmask8 __U, __m256 __A) { - return (__m256)__builtin_ia32_movshdup256_mask((__v8sf)__A, (__v8sf)__W, - 
(__mmask8)__U); -} - -__funline __m256 _mm256_maskz_movehdup_ps(__mmask8 __U, __m256 __A) { - return (__m256)__builtin_ia32_movshdup256_mask( - (__v8sf)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); -} - -__funline __m128 _mm_mask_movehdup_ps(__m128 __W, __mmask8 __U, __m128 __A) { - return (__m128)__builtin_ia32_movshdup128_mask((__v4sf)__A, (__v4sf)__W, - (__mmask8)__U); -} - -__funline __m128 _mm_maskz_movehdup_ps(__mmask8 __U, __m128 __A) { - return (__m128)__builtin_ia32_movshdup128_mask( - (__v4sf)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); -} - -__funline __m256 _mm256_mask_moveldup_ps(__m256 __W, __mmask8 __U, __m256 __A) { - return (__m256)__builtin_ia32_movsldup256_mask((__v8sf)__A, (__v8sf)__W, - (__mmask8)__U); -} - -__funline __m256 _mm256_maskz_moveldup_ps(__mmask8 __U, __m256 __A) { - return (__m256)__builtin_ia32_movsldup256_mask( - (__v8sf)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); -} - -__funline __m128 _mm_mask_moveldup_ps(__m128 __W, __mmask8 __U, __m128 __A) { - return (__m128)__builtin_ia32_movsldup128_mask((__v4sf)__A, (__v4sf)__W, - (__mmask8)__U); -} - -__funline __m128 _mm_maskz_moveldup_ps(__mmask8 __U, __m128 __A) { - return (__m128)__builtin_ia32_movsldup128_mask( - (__v4sf)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); -} - -__funline __m128i _mm_mask_unpackhi_epi32(__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_punpckhdq128_mask((__v4si)__A, (__v4si)__B, - (__v4si)__W, (__mmask8)__U); -} - -__funline __m128i _mm_maskz_unpackhi_epi32(__mmask8 __U, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_punpckhdq128_mask( - (__v4si)__A, (__v4si)__B, (__v4si)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m256i _mm256_mask_unpackhi_epi32(__m256i __W, __mmask8 __U, - __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_punpckhdq256_mask((__v8si)__A, (__v8si)__B, - (__v8si)__W, (__mmask8)__U); -} - -__funline __m256i _mm256_maskz_unpackhi_epi32(__mmask8 __U, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_punpckhdq256_mask( - (__v8si)__A, (__v8si)__B, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); -} - -__funline __m128i _mm_mask_unpackhi_epi64(__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_punpckhqdq128_mask((__v2di)__A, (__v2di)__B, - (__v2di)__W, (__mmask8)__U); -} - -__funline __m128i _mm_maskz_unpackhi_epi64(__mmask8 __U, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_punpckhqdq128_mask( - (__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m256i _mm256_mask_unpackhi_epi64(__m256i __W, __mmask8 __U, - __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_punpckhqdq256_mask((__v4di)__A, (__v4di)__B, - (__v4di)__W, (__mmask8)__U); -} - -__funline __m256i _mm256_maskz_unpackhi_epi64(__mmask8 __U, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_punpckhqdq256_mask( - (__v4di)__A, (__v4di)__B, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); -} - -__funline __m128i _mm_mask_unpacklo_epi32(__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_punpckldq128_mask((__v4si)__A, (__v4si)__B, - (__v4si)__W, (__mmask8)__U); -} - -__funline __m128i _mm_maskz_unpacklo_epi32(__mmask8 __U, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_punpckldq128_mask( - (__v4si)__A, (__v4si)__B, (__v4si)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m256i _mm256_mask_unpacklo_epi32(__m256i __W, __mmask8 __U, - __m256i __A, 
__m256i __B) { - return (__m256i)__builtin_ia32_punpckldq256_mask((__v8si)__A, (__v8si)__B, - (__v8si)__W, (__mmask8)__U); -} - -__funline __m256i _mm256_maskz_unpacklo_epi32(__mmask8 __U, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_punpckldq256_mask( - (__v8si)__A, (__v8si)__B, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); -} - -__funline __m128i _mm_mask_unpacklo_epi64(__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_punpcklqdq128_mask((__v2di)__A, (__v2di)__B, - (__v2di)__W, (__mmask8)__U); -} - -__funline __m128i _mm_maskz_unpacklo_epi64(__mmask8 __U, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_punpcklqdq128_mask( - (__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m256i _mm256_mask_unpacklo_epi64(__m256i __W, __mmask8 __U, - __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_punpcklqdq256_mask((__v4di)__A, (__v4di)__B, - (__v4di)__W, (__mmask8)__U); -} - -__funline __m256i _mm256_maskz_unpacklo_epi64(__mmask8 __U, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_punpcklqdq256_mask( - (__v4di)__A, (__v4di)__B, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); -} - -__funline __mmask8 _mm_cmpeq_epu32_mask(__m128i __A, __m128i __B) { - return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__A, (__v4si)__B, 0, - (__mmask8)-1); -} - -__funline __mmask8 _mm_cmpeq_epi32_mask(__m128i __A, __m128i __B) { - return (__mmask8)__builtin_ia32_pcmpeqd128_mask((__v4si)__A, (__v4si)__B, - (__mmask8)-1); -} - -__funline __mmask8 _mm_mask_cmpeq_epu32_mask(__mmask8 __U, __m128i __A, - __m128i __B) { - return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__A, (__v4si)__B, 0, - __U); -} - -__funline __mmask8 _mm_mask_cmpeq_epi32_mask(__mmask8 __U, __m128i __A, - __m128i __B) { - return (__mmask8)__builtin_ia32_pcmpeqd128_mask((__v4si)__A, (__v4si)__B, - __U); -} - -__funline __mmask8 _mm256_cmpeq_epu32_mask(__m256i __A, __m256i __B) { - return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__A, (__v8si)__B, 0, - (__mmask8)-1); -} - -__funline __mmask8 _mm256_cmpeq_epi32_mask(__m256i __A, __m256i __B) { - return (__mmask8)__builtin_ia32_pcmpeqd256_mask((__v8si)__A, (__v8si)__B, - (__mmask8)-1); -} - -__funline __mmask8 _mm256_mask_cmpeq_epu32_mask(__mmask8 __U, __m256i __A, - __m256i __B) { - return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__A, (__v8si)__B, 0, - __U); -} - -__funline __mmask8 _mm256_mask_cmpeq_epi32_mask(__mmask8 __U, __m256i __A, - __m256i __B) { - return (__mmask8)__builtin_ia32_pcmpeqd256_mask((__v8si)__A, (__v8si)__B, - __U); -} - -__funline __mmask8 _mm_cmpeq_epu64_mask(__m128i __A, __m128i __B) { - return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__A, (__v2di)__B, 0, - (__mmask8)-1); -} - -__funline __mmask8 _mm_cmpeq_epi64_mask(__m128i __A, __m128i __B) { - return (__mmask8)__builtin_ia32_pcmpeqq128_mask((__v2di)__A, (__v2di)__B, - (__mmask8)-1); -} - -__funline __mmask8 _mm_mask_cmpeq_epu64_mask(__mmask8 __U, __m128i __A, - __m128i __B) { - return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__A, (__v2di)__B, 0, - __U); -} - -__funline __mmask8 _mm_mask_cmpeq_epi64_mask(__mmask8 __U, __m128i __A, - __m128i __B) { - return (__mmask8)__builtin_ia32_pcmpeqq128_mask((__v2di)__A, (__v2di)__B, - __U); -} - -__funline __mmask8 _mm256_cmpeq_epu64_mask(__m256i __A, __m256i __B) { - return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__A, (__v4di)__B, 0, - (__mmask8)-1); -} - -__funline __mmask8 _mm256_cmpeq_epi64_mask(__m256i __A, 
__m256i __B) { - return (__mmask8)__builtin_ia32_pcmpeqq256_mask((__v4di)__A, (__v4di)__B, - (__mmask8)-1); -} - -__funline __mmask8 _mm256_mask_cmpeq_epu64_mask(__mmask8 __U, __m256i __A, - __m256i __B) { - return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__A, (__v4di)__B, 0, - __U); -} - -__funline __mmask8 _mm256_mask_cmpeq_epi64_mask(__mmask8 __U, __m256i __A, - __m256i __B) { - return (__mmask8)__builtin_ia32_pcmpeqq256_mask((__v4di)__A, (__v4di)__B, - __U); -} - -__funline __mmask8 _mm_cmpgt_epu32_mask(__m128i __A, __m128i __B) { - return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__A, (__v4si)__B, 6, - (__mmask8)-1); -} - -__funline __mmask8 _mm_cmpgt_epi32_mask(__m128i __A, __m128i __B) { - return (__mmask8)__builtin_ia32_pcmpgtd128_mask((__v4si)__A, (__v4si)__B, - (__mmask8)-1); -} - -__funline __mmask8 _mm_mask_cmpgt_epu32_mask(__mmask8 __U, __m128i __A, - __m128i __B) { - return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__A, (__v4si)__B, 6, - __U); -} - -__funline __mmask8 _mm_mask_cmpgt_epi32_mask(__mmask8 __U, __m128i __A, - __m128i __B) { - return (__mmask8)__builtin_ia32_pcmpgtd128_mask((__v4si)__A, (__v4si)__B, - __U); -} - -__funline __mmask8 _mm256_cmpgt_epu32_mask(__m256i __A, __m256i __B) { - return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__A, (__v8si)__B, 6, - (__mmask8)-1); -} - -__funline __mmask8 _mm256_cmpgt_epi32_mask(__m256i __A, __m256i __B) { - return (__mmask8)__builtin_ia32_pcmpgtd256_mask((__v8si)__A, (__v8si)__B, - (__mmask8)-1); -} - -__funline __mmask8 _mm256_mask_cmpgt_epu32_mask(__mmask8 __U, __m256i __A, - __m256i __B) { - return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__A, (__v8si)__B, 6, - __U); -} - -__funline __mmask8 _mm256_mask_cmpgt_epi32_mask(__mmask8 __U, __m256i __A, - __m256i __B) { - return (__mmask8)__builtin_ia32_pcmpgtd256_mask((__v8si)__A, (__v8si)__B, - __U); -} - -__funline __mmask8 _mm_cmpgt_epu64_mask(__m128i __A, __m128i __B) { - return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__A, (__v2di)__B, 6, - (__mmask8)-1); -} - -__funline __mmask8 _mm_cmpgt_epi64_mask(__m128i __A, __m128i __B) { - return (__mmask8)__builtin_ia32_pcmpgtq128_mask((__v2di)__A, (__v2di)__B, - (__mmask8)-1); -} - -__funline __mmask8 _mm_mask_cmpgt_epu64_mask(__mmask8 __U, __m128i __A, - __m128i __B) { - return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__A, (__v2di)__B, 6, - __U); -} - -__funline __mmask8 _mm_mask_cmpgt_epi64_mask(__mmask8 __U, __m128i __A, - __m128i __B) { - return (__mmask8)__builtin_ia32_pcmpgtq128_mask((__v2di)__A, (__v2di)__B, - __U); -} - -__funline __mmask8 _mm256_cmpgt_epu64_mask(__m256i __A, __m256i __B) { - return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__A, (__v4di)__B, 6, - (__mmask8)-1); -} - -__funline __mmask8 _mm256_cmpgt_epi64_mask(__m256i __A, __m256i __B) { - return (__mmask8)__builtin_ia32_pcmpgtq256_mask((__v4di)__A, (__v4di)__B, - (__mmask8)-1); -} - -__funline __mmask8 _mm256_mask_cmpgt_epu64_mask(__mmask8 __U, __m256i __A, - __m256i __B) { - return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__A, (__v4di)__B, 6, - __U); -} - -__funline __mmask8 _mm256_mask_cmpgt_epi64_mask(__mmask8 __U, __m256i __A, - __m256i __B) { - return (__mmask8)__builtin_ia32_pcmpgtq256_mask((__v4di)__A, (__v4di)__B, - __U); -} - -__funline __mmask8 _mm_test_epi32_mask(__m128i __A, __m128i __B) { - return (__mmask8)__builtin_ia32_ptestmd128((__v4si)__A, (__v4si)__B, - (__mmask8)-1); -} - -__funline __mmask8 _mm_mask_test_epi32_mask(__mmask8 __U, __m128i __A, - __m128i __B) { - return 
(__mmask8)__builtin_ia32_ptestmd128((__v4si)__A, (__v4si)__B, __U); -} - -__funline __mmask8 _mm256_test_epi32_mask(__m256i __A, __m256i __B) { - return (__mmask8)__builtin_ia32_ptestmd256((__v8si)__A, (__v8si)__B, - (__mmask8)-1); -} - -__funline __mmask8 _mm256_mask_test_epi32_mask(__mmask8 __U, __m256i __A, - __m256i __B) { - return (__mmask8)__builtin_ia32_ptestmd256((__v8si)__A, (__v8si)__B, __U); -} - -__funline __mmask8 _mm_test_epi64_mask(__m128i __A, __m128i __B) { - return (__mmask8)__builtin_ia32_ptestmq128((__v2di)__A, (__v2di)__B, - (__mmask8)-1); -} - -__funline __mmask8 _mm_mask_test_epi64_mask(__mmask8 __U, __m128i __A, - __m128i __B) { - return (__mmask8)__builtin_ia32_ptestmq128((__v2di)__A, (__v2di)__B, __U); -} - -__funline __mmask8 _mm256_test_epi64_mask(__m256i __A, __m256i __B) { - return (__mmask8)__builtin_ia32_ptestmq256((__v4di)__A, (__v4di)__B, - (__mmask8)-1); -} - -__funline __mmask8 _mm256_mask_test_epi64_mask(__mmask8 __U, __m256i __A, - __m256i __B) { - return (__mmask8)__builtin_ia32_ptestmq256((__v4di)__A, (__v4di)__B, __U); -} - -__funline __mmask8 _mm_testn_epi32_mask(__m128i __A, __m128i __B) { - return (__mmask8)__builtin_ia32_ptestnmd128((__v4si)__A, (__v4si)__B, - (__mmask8)-1); -} - -__funline __mmask8 _mm_mask_testn_epi32_mask(__mmask8 __U, __m128i __A, - __m128i __B) { - return (__mmask8)__builtin_ia32_ptestnmd128((__v4si)__A, (__v4si)__B, __U); -} - -__funline __mmask8 _mm256_testn_epi32_mask(__m256i __A, __m256i __B) { - return (__mmask8)__builtin_ia32_ptestnmd256((__v8si)__A, (__v8si)__B, - (__mmask8)-1); -} - -__funline __mmask8 _mm256_mask_testn_epi32_mask(__mmask8 __U, __m256i __A, - __m256i __B) { - return (__mmask8)__builtin_ia32_ptestnmd256((__v8si)__A, (__v8si)__B, __U); -} - -__funline __mmask8 _mm_testn_epi64_mask(__m128i __A, __m128i __B) { - return (__mmask8)__builtin_ia32_ptestnmq128((__v2di)__A, (__v2di)__B, - (__mmask8)-1); -} - -__funline __mmask8 _mm_mask_testn_epi64_mask(__mmask8 __U, __m128i __A, - __m128i __B) { - return (__mmask8)__builtin_ia32_ptestnmq128((__v2di)__A, (__v2di)__B, __U); -} - -__funline __mmask8 _mm256_testn_epi64_mask(__m256i __A, __m256i __B) { - return (__mmask8)__builtin_ia32_ptestnmq256((__v4di)__A, (__v4di)__B, - (__mmask8)-1); -} - -__funline __mmask8 _mm256_mask_testn_epi64_mask(__mmask8 __U, __m256i __A, - __m256i __B) { - return (__mmask8)__builtin_ia32_ptestnmq256((__v4di)__A, (__v4di)__B, __U); -} - -__funline __m256d _mm256_mask_compress_pd(__m256d __W, __mmask8 __U, - __m256d __A) { - return (__m256d)__builtin_ia32_compressdf256_mask((__v4df)__A, (__v4df)__W, - (__mmask8)__U); -} - -__funline __m256d _mm256_maskz_compress_pd(__mmask8 __U, __m256d __A) { - return (__m256d)__builtin_ia32_compressdf256_mask( - (__v4df)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); -} - -__funline void _mm256_mask_compressstoreu_pd(void *__P, __mmask8 __U, - __m256d __A) { - __builtin_ia32_compressstoredf256_mask((__v4df *)__P, (__v4df)__A, - (__mmask8)__U); -} - -__funline __m128d _mm_mask_compress_pd(__m128d __W, __mmask8 __U, __m128d __A) { - return (__m128d)__builtin_ia32_compressdf128_mask((__v2df)__A, (__v2df)__W, - (__mmask8)__U); -} - -__funline __m128d _mm_maskz_compress_pd(__mmask8 __U, __m128d __A) { - return (__m128d)__builtin_ia32_compressdf128_mask( - (__v2df)__A, (__v2df)_mm_setzero_pd(), (__mmask8)__U); -} - -__funline void _mm_mask_compressstoreu_pd(void *__P, __mmask8 __U, __m128d __A) { - __builtin_ia32_compressstoredf128_mask((__v2df *)__P, (__v2df)__A, - (__mmask8)__U); -} - -__funline 
__m256 _mm256_mask_compress_ps(__m256 __W, __mmask8 __U, __m256 __A) { - return (__m256)__builtin_ia32_compresssf256_mask((__v8sf)__A, (__v8sf)__W, - (__mmask8)__U); -} - -__funline __m256 _mm256_maskz_compress_ps(__mmask8 __U, __m256 __A) { - return (__m256)__builtin_ia32_compresssf256_mask( - (__v8sf)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); -} - -__funline void _mm256_mask_compressstoreu_ps(void *__P, __mmask8 __U, - __m256 __A) { - __builtin_ia32_compressstoresf256_mask((__v8sf *)__P, (__v8sf)__A, - (__mmask8)__U); -} - -__funline __m128 _mm_mask_compress_ps(__m128 __W, __mmask8 __U, __m128 __A) { - return (__m128)__builtin_ia32_compresssf128_mask((__v4sf)__A, (__v4sf)__W, - (__mmask8)__U); -} - -__funline __m128 _mm_maskz_compress_ps(__mmask8 __U, __m128 __A) { - return (__m128)__builtin_ia32_compresssf128_mask( - (__v4sf)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); -} - -__funline void _mm_mask_compressstoreu_ps(void *__P, __mmask8 __U, __m128 __A) { - __builtin_ia32_compressstoresf128_mask((__v4sf *)__P, (__v4sf)__A, - (__mmask8)__U); -} - -__funline __m256i _mm256_mask_compress_epi64(__m256i __W, __mmask8 __U, - __m256i __A) { - return (__m256i)__builtin_ia32_compressdi256_mask((__v4di)__A, (__v4di)__W, - (__mmask8)__U); -} - -__funline __m256i _mm256_maskz_compress_epi64(__mmask8 __U, __m256i __A) { - return (__m256i)__builtin_ia32_compressdi256_mask( - (__v4di)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); -} - -__funline void _mm256_mask_compressstoreu_epi64(void *__P, __mmask8 __U, - __m256i __A) { - __builtin_ia32_compressstoredi256_mask((__v4di *)__P, (__v4di)__A, - (__mmask8)__U); -} - -__funline __m128i _mm_mask_compress_epi64(__m128i __W, __mmask8 __U, - __m128i __A) { - return (__m128i)__builtin_ia32_compressdi128_mask((__v2di)__A, (__v2di)__W, - (__mmask8)__U); -} - -__funline __m128i _mm_maskz_compress_epi64(__mmask8 __U, __m128i __A) { - return (__m128i)__builtin_ia32_compressdi128_mask( - (__v2di)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline void _mm_mask_compressstoreu_epi64(void *__P, __mmask8 __U, - __m128i __A) { - __builtin_ia32_compressstoredi128_mask((__v2di *)__P, (__v2di)__A, - (__mmask8)__U); -} - -__funline __m256i _mm256_mask_compress_epi32(__m256i __W, __mmask8 __U, - __m256i __A) { - return (__m256i)__builtin_ia32_compresssi256_mask((__v8si)__A, (__v8si)__W, - (__mmask8)__U); -} - -__funline __m256i _mm256_maskz_compress_epi32(__mmask8 __U, __m256i __A) { - return (__m256i)__builtin_ia32_compresssi256_mask( - (__v8si)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); -} - -__funline void _mm256_mask_compressstoreu_epi32(void *__P, __mmask8 __U, - __m256i __A) { - __builtin_ia32_compressstoresi256_mask((__v8si *)__P, (__v8si)__A, - (__mmask8)__U); -} - -__funline __m128i _mm_mask_compress_epi32(__m128i __W, __mmask8 __U, - __m128i __A) { - return (__m128i)__builtin_ia32_compresssi128_mask((__v4si)__A, (__v4si)__W, - (__mmask8)__U); -} - -__funline __m128i _mm_maskz_compress_epi32(__mmask8 __U, __m128i __A) { - return (__m128i)__builtin_ia32_compresssi128_mask( - (__v4si)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline void _mm_mask_compressstoreu_epi32(void *__P, __mmask8 __U, - __m128i __A) { - __builtin_ia32_compressstoresi128_mask((__v4si *)__P, (__v4si)__A, - (__mmask8)__U); -} - -__funline __m256d _mm256_mask_expand_pd(__m256d __W, __mmask8 __U, __m256d __A) { - return (__m256d)__builtin_ia32_expanddf256_mask((__v4df)__A, (__v4df)__W, - (__mmask8)__U); -} - -__funline __m256d 
_mm256_maskz_expand_pd(__mmask8 __U, __m256d __A) { - return (__m256d)__builtin_ia32_expanddf256_maskz( - (__v4df)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); -} - -__funline __m256d _mm256_mask_expandloadu_pd(__m256d __W, __mmask8 __U, - void const *__P) { - return (__m256d)__builtin_ia32_expandloaddf256_mask( - (__v4df *)__P, (__v4df)__W, (__mmask8)__U); -} - -__funline __m256d _mm256_maskz_expandloadu_pd(__mmask8 __U, void const *__P) { - return (__m256d)__builtin_ia32_expandloaddf256_maskz( - (__v4df *)__P, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); -} - -__funline __m128d _mm_mask_expand_pd(__m128d __W, __mmask8 __U, __m128d __A) { - return (__m128d)__builtin_ia32_expanddf128_mask((__v2df)__A, (__v2df)__W, - (__mmask8)__U); -} - -__funline __m128d _mm_maskz_expand_pd(__mmask8 __U, __m128d __A) { - return (__m128d)__builtin_ia32_expanddf128_maskz( - (__v2df)__A, (__v2df)_mm_setzero_pd(), (__mmask8)__U); -} - -__funline __m128d _mm_mask_expandloadu_pd(__m128d __W, __mmask8 __U, - void const *__P) { - return (__m128d)__builtin_ia32_expandloaddf128_mask( - (__v2df *)__P, (__v2df)__W, (__mmask8)__U); -} - -__funline __m128d _mm_maskz_expandloadu_pd(__mmask8 __U, void const *__P) { - return (__m128d)__builtin_ia32_expandloaddf128_maskz( - (__v2df *)__P, (__v2df)_mm_setzero_pd(), (__mmask8)__U); -} - -__funline __m256 _mm256_mask_expand_ps(__m256 __W, __mmask8 __U, __m256 __A) { - return (__m256)__builtin_ia32_expandsf256_mask((__v8sf)__A, (__v8sf)__W, - (__mmask8)__U); -} - -__funline __m256 _mm256_maskz_expand_ps(__mmask8 __U, __m256 __A) { - return (__m256)__builtin_ia32_expandsf256_maskz( - (__v8sf)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); -} - -__funline __m256 _mm256_mask_expandloadu_ps(__m256 __W, __mmask8 __U, - void const *__P) { - return (__m256)__builtin_ia32_expandloadsf256_mask((__v8sf *)__P, (__v8sf)__W, - (__mmask8)__U); -} - -__funline __m256 _mm256_maskz_expandloadu_ps(__mmask8 __U, void const *__P) { - return (__m256)__builtin_ia32_expandloadsf256_maskz( - (__v8sf *)__P, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); -} - -__funline __m128 _mm_mask_expand_ps(__m128 __W, __mmask8 __U, __m128 __A) { - return (__m128)__builtin_ia32_expandsf128_mask((__v4sf)__A, (__v4sf)__W, - (__mmask8)__U); -} - -__funline __m128 _mm_maskz_expand_ps(__mmask8 __U, __m128 __A) { - return (__m128)__builtin_ia32_expandsf128_maskz( - (__v4sf)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); -} - -__funline __m128 _mm_mask_expandloadu_ps(__m128 __W, __mmask8 __U, - void const *__P) { - return (__m128)__builtin_ia32_expandloadsf128_mask((__v4sf *)__P, (__v4sf)__W, - (__mmask8)__U); -} - -__funline __m128 _mm_maskz_expandloadu_ps(__mmask8 __U, void const *__P) { - return (__m128)__builtin_ia32_expandloadsf128_maskz( - (__v4sf *)__P, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); -} - -__funline __m256i _mm256_mask_expand_epi64(__m256i __W, __mmask8 __U, - __m256i __A) { - return (__m256i)__builtin_ia32_expanddi256_mask((__v4di)__A, (__v4di)__W, - (__mmask8)__U); -} - -__funline __m256i _mm256_maskz_expand_epi64(__mmask8 __U, __m256i __A) { - return (__m256i)__builtin_ia32_expanddi256_maskz( - (__v4di)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); -} - -__funline __m256i _mm256_mask_expandloadu_epi64(__m256i __W, __mmask8 __U, - void const *__P) { - return (__m256i)__builtin_ia32_expandloaddi256_mask( - (__v4di *)__P, (__v4di)__W, (__mmask8)__U); -} - -__funline __m256i _mm256_maskz_expandloadu_epi64(__mmask8 __U, void const *__P) { - return (__m256i)__builtin_ia32_expandloaddi256_maskz( - 
(__v4di *)__P, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); -} - -__funline __m128i _mm_mask_expand_epi64(__m128i __W, __mmask8 __U, __m128i __A) { - return (__m128i)__builtin_ia32_expanddi128_mask((__v2di)__A, (__v2di)__W, - (__mmask8)__U); -} - -__funline __m128i _mm_maskz_expand_epi64(__mmask8 __U, __m128i __A) { - return (__m128i)__builtin_ia32_expanddi128_maskz( - (__v2di)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m128i _mm_mask_expandloadu_epi64(__m128i __W, __mmask8 __U, - void const *__P) { - return (__m128i)__builtin_ia32_expandloaddi128_mask( - (__v2di *)__P, (__v2di)__W, (__mmask8)__U); -} - -__funline __m128i _mm_maskz_expandloadu_epi64(__mmask8 __U, void const *__P) { - return (__m128i)__builtin_ia32_expandloaddi128_maskz( - (__v2di *)__P, (__v2di)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m256i _mm256_mask_expand_epi32(__m256i __W, __mmask8 __U, - __m256i __A) { - return (__m256i)__builtin_ia32_expandsi256_mask((__v8si)__A, (__v8si)__W, - (__mmask8)__U); -} - -__funline __m256i _mm256_maskz_expand_epi32(__mmask8 __U, __m256i __A) { - return (__m256i)__builtin_ia32_expandsi256_maskz( - (__v8si)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); -} - -__funline __m256i _mm256_mask_expandloadu_epi32(__m256i __W, __mmask8 __U, - void const *__P) { - return (__m256i)__builtin_ia32_expandloadsi256_mask( - (__v8si *)__P, (__v8si)__W, (__mmask8)__U); -} - -__funline __m256i _mm256_maskz_expandloadu_epi32(__mmask8 __U, void const *__P) { - return (__m256i)__builtin_ia32_expandloadsi256_maskz( - (__v8si *)__P, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); -} - -__funline __m128i _mm_mask_expand_epi32(__m128i __W, __mmask8 __U, __m128i __A) { - return (__m128i)__builtin_ia32_expandsi128_mask((__v4si)__A, (__v4si)__W, - (__mmask8)__U); -} - -__funline __m128i _mm_maskz_expand_epi32(__mmask8 __U, __m128i __A) { - return (__m128i)__builtin_ia32_expandsi128_maskz( - (__v4si)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m128i _mm_mask_expandloadu_epi32(__m128i __W, __mmask8 __U, - void const *__P) { - return (__m128i)__builtin_ia32_expandloadsi128_mask( - (__v4si *)__P, (__v4si)__W, (__mmask8)__U); -} - -__funline __m128i _mm_maskz_expandloadu_epi32(__mmask8 __U, void const *__P) { - return (__m128i)__builtin_ia32_expandloadsi128_maskz( - (__v4si *)__P, (__v4si)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m256d _mm256_permutex2var_pd(__m256d __A, __m256i __I, __m256d __B) { - return (__m256d)__builtin_ia32_vpermt2varpd256_mask((__v4di)__I - /* idx */, - (__v4df)__A, (__v4df)__B, - (__mmask8)-1); -} - -__funline __m256d _mm256_mask_permutex2var_pd(__m256d __A, __mmask8 __U, - __m256i __I, __m256d __B) { - return (__m256d)__builtin_ia32_vpermt2varpd256_mask((__v4di)__I - /* idx */, - (__v4df)__A, (__v4df)__B, - (__mmask8)__U); -} - -__funline __m256d _mm256_mask2_permutex2var_pd(__m256d __A, __m256i __I, - __mmask8 __U, __m256d __B) { - return (__m256d)__builtin_ia32_vpermi2varpd256_mask((__v4df)__A, - (__v4di)__I - /* idx */, - (__v4df)__B, - (__mmask8)__U); -} - -__funline __m256d _mm256_maskz_permutex2var_pd(__mmask8 __U, __m256d __A, - __m256i __I, __m256d __B) { - return (__m256d)__builtin_ia32_vpermt2varpd256_maskz((__v4di)__I - /* idx */, - (__v4df)__A, (__v4df)__B, - (__mmask8)__U); -} - -__funline __m256 _mm256_permutex2var_ps(__m256 __A, __m256i __I, __m256 __B) { - return (__m256)__builtin_ia32_vpermt2varps256_mask((__v8si)__I - /* idx */, - (__v8sf)__A, (__v8sf)__B, - (__mmask8)-1); -} - -__funline __m256 
_mm256_mask_permutex2var_ps(__m256 __A, __mmask8 __U, - __m256i __I, __m256 __B) { - return (__m256)__builtin_ia32_vpermt2varps256_mask((__v8si)__I - /* idx */, - (__v8sf)__A, (__v8sf)__B, - (__mmask8)__U); -} - -__funline __m256 _mm256_mask2_permutex2var_ps(__m256 __A, __m256i __I, - __mmask8 __U, __m256 __B) { - return (__m256)__builtin_ia32_vpermi2varps256_mask((__v8sf)__A, - (__v8si)__I - /* idx */, - (__v8sf)__B, - (__mmask8)__U); -} - -__funline __m256 _mm256_maskz_permutex2var_ps(__mmask8 __U, __m256 __A, - __m256i __I, __m256 __B) { - return (__m256)__builtin_ia32_vpermt2varps256_maskz((__v8si)__I - /* idx */, - (__v8sf)__A, (__v8sf)__B, - (__mmask8)__U); -} - -__funline __m128i _mm_permutex2var_epi64(__m128i __A, __m128i __I, __m128i __B) { - return (__m128i)__builtin_ia32_vpermt2varq128_mask((__v2di)__I - /* idx */, - (__v2di)__A, (__v2di)__B, - (__mmask8)-1); -} - -__funline __m128i _mm_mask_permutex2var_epi64(__m128i __A, __mmask8 __U, - __m128i __I, __m128i __B) { - return (__m128i)__builtin_ia32_vpermt2varq128_mask((__v2di)__I - /* idx */, - (__v2di)__A, (__v2di)__B, - (__mmask8)__U); -} - -__funline __m128i _mm_mask2_permutex2var_epi64(__m128i __A, __m128i __I, - __mmask8 __U, __m128i __B) { - return (__m128i)__builtin_ia32_vpermi2varq128_mask((__v2di)__A, - (__v2di)__I - /* idx */, - (__v2di)__B, - (__mmask8)__U); -} - -__funline __m128i _mm_maskz_permutex2var_epi64(__mmask8 __U, __m128i __A, - __m128i __I, __m128i __B) { - return (__m128i)__builtin_ia32_vpermt2varq128_maskz((__v2di)__I - /* idx */, - (__v2di)__A, (__v2di)__B, - (__mmask8)__U); -} - -__funline __m128i _mm_permutex2var_epi32(__m128i __A, __m128i __I, __m128i __B) { - return (__m128i)__builtin_ia32_vpermt2vard128_mask((__v4si)__I - /* idx */, - (__v4si)__A, (__v4si)__B, - (__mmask8)-1); -} - -__funline __m128i _mm_mask_permutex2var_epi32(__m128i __A, __mmask8 __U, - __m128i __I, __m128i __B) { - return (__m128i)__builtin_ia32_vpermt2vard128_mask((__v4si)__I - /* idx */, - (__v4si)__A, (__v4si)__B, - (__mmask8)__U); -} - -__funline __m128i _mm_mask2_permutex2var_epi32(__m128i __A, __m128i __I, - __mmask8 __U, __m128i __B) { - return (__m128i)__builtin_ia32_vpermi2vard128_mask((__v4si)__A, - (__v4si)__I - /* idx */, - (__v4si)__B, - (__mmask8)__U); -} - -__funline __m128i _mm_maskz_permutex2var_epi32(__mmask8 __U, __m128i __A, - __m128i __I, __m128i __B) { - return (__m128i)__builtin_ia32_vpermt2vard128_maskz((__v4si)__I - /* idx */, - (__v4si)__A, (__v4si)__B, - (__mmask8)__U); -} - -__funline __m256i _mm256_permutex2var_epi64(__m256i __A, __m256i __I, - __m256i __B) { - return (__m256i)__builtin_ia32_vpermt2varq256_mask((__v4di)__I - /* idx */, - (__v4di)__A, (__v4di)__B, - (__mmask8)-1); -} - -__funline __m256i _mm256_mask_permutex2var_epi64(__m256i __A, __mmask8 __U, - __m256i __I, __m256i __B) { - return (__m256i)__builtin_ia32_vpermt2varq256_mask((__v4di)__I - /* idx */, - (__v4di)__A, (__v4di)__B, - (__mmask8)__U); -} - -__funline __m256i _mm256_mask2_permutex2var_epi64(__m256i __A, __m256i __I, - __mmask8 __U, __m256i __B) { - return (__m256i)__builtin_ia32_vpermi2varq256_mask((__v4di)__A, - (__v4di)__I - /* idx */, - (__v4di)__B, - (__mmask8)__U); -} - -__funline __m256i _mm256_maskz_permutex2var_epi64(__mmask8 __U, __m256i __A, - __m256i __I, __m256i __B) { - return (__m256i)__builtin_ia32_vpermt2varq256_maskz((__v4di)__I - /* idx */, - (__v4di)__A, (__v4di)__B, - (__mmask8)__U); -} - -__funline __m256i _mm256_permutex2var_epi32(__m256i __A, __m256i __I, - __m256i __B) { - return 
(__m256i)__builtin_ia32_vpermt2vard256_mask((__v8si)__I
-      /* idx */,
-      (__v8si)__A, (__v8si)__B,
-      (__mmask8)-1);
-}
-
-__funline __m256i _mm256_mask_permutex2var_epi32(__m256i __A, __mmask8 __U,
-      __m256i __I, __m256i __B) {
-  return (__m256i)__builtin_ia32_vpermt2vard256_mask((__v8si)__I
-      /* idx */,
-      (__v8si)__A, (__v8si)__B,
-      (__mmask8)__U);
-}
-
-__funline __m256i _mm256_mask2_permutex2var_epi32(__m256i __A, __m256i __I,
-      __mmask8 __U, __m256i __B) {
-  return (__m256i)__builtin_ia32_vpermi2vard256_mask((__v8si)__A,
-      (__v8si)__I
-      /* idx */,
-      (__v8si)__B,
-      (__mmask8)__U);
-}
-
-__funline __m256i _mm256_maskz_permutex2var_epi32(__mmask8 __U, __m256i __A,
-      __m256i __I, __m256i __B) {
-  return (__m256i)__builtin_ia32_vpermt2vard256_maskz((__v8si)__I
-      /* idx */,
-      (__v8si)__A, (__v8si)__B,
-      (__mmask8)__U);
-}
-
-__funline __m128d _mm_permutex2var_pd(__m128d __A, __m128i __I, __m128d __B) {
-  return (__m128d)__builtin_ia32_vpermt2varpd128_mask((__v2di)__I
-      /* idx */,
-      (__v2df)__A, (__v2df)__B,
-      (__mmask8)-1);
-}
-
-__funline __m128d _mm_mask_permutex2var_pd(__m128d __A, __mmask8 __U, __m128i __I,
-      __m128d __B) {
-  return (__m128d)__builtin_ia32_vpermt2varpd128_mask((__v2di)__I
-      /* idx */,
-      (__v2df)__A, (__v2df)__B,
-      (__mmask8)__U);
-}
-
-__funline __m128d _mm_mask2_permutex2var_pd(__m128d __A, __m128i __I,
-      __mmask8 __U, __m128d __B) {
-  return (__m128d)__builtin_ia32_vpermi2varpd128_mask((__v2df)__A,
-      (__v2di)__I
-      /* idx */,
-      (__v2df)__B,
-      (__mmask8)__U);
-}
-
-__funline __m128d _mm_maskz_permutex2var_pd(__mmask8 __U, __m128d __A,
-      __m128i __I, __m128d __B) {
-  return (__m128d)__builtin_ia32_vpermt2varpd128_maskz((__v2di)__I
-      /* idx */,
-      (__v2df)__A, (__v2df)__B,
-      (__mmask8)__U);
-}
-
-__funline __m128 _mm_permutex2var_ps(__m128 __A, __m128i __I, __m128 __B) {
-  return (__m128)__builtin_ia32_vpermt2varps128_mask((__v4si)__I
-      /* idx */,
-      (__v4sf)__A, (__v4sf)__B,
-      (__mmask8)-1);
-}
-
-__funline __m128 _mm_mask_permutex2var_ps(__m128 __A, __mmask8 __U, __m128i __I,
-      __m128 __B) {
-  return (__m128)__builtin_ia32_vpermt2varps128_mask((__v4si)__I
-      /* idx */,
-      (__v4sf)__A, (__v4sf)__B,
-      (__mmask8)__U);
-}
-
-__funline __m128 _mm_mask2_permutex2var_ps(__m128 __A, __m128i __I, __mmask8 __U,
-      __m128 __B) {
-  return (__m128)__builtin_ia32_vpermi2varps128_mask((__v4sf)__A,
-      (__v4si)__I
-      /* idx */,
-      (__v4sf)__B,
-      (__mmask8)__U);
-}
-
-__funline __m128 _mm_maskz_permutex2var_ps(__mmask8 __U, __m128 __A, __m128i __I,
-      __m128 __B) {
-  return (__m128)__builtin_ia32_vpermt2varps128_maskz((__v4si)__I
-      /* idx */,
-      (__v4sf)__A, (__v4sf)__B,
-      (__mmask8)__U);
-}
-
-__funline __m128i _mm_srav_epi64(__m128i __X, __m128i __Y) {
-  return (__m128i)__builtin_ia32_psravq128_mask(
-      (__v2di)__X, (__v2di)__Y, (__v2di)_mm_setzero_si128(), (__mmask8)-1);
-}
-
-__funline __m128i _mm_mask_srav_epi64(__m128i __W, __mmask8 __U, __m128i __X,
-      __m128i __Y) {
-  return (__m128i)__builtin_ia32_psravq128_mask((__v2di)__X, (__v2di)__Y,
-      (__v2di)__W, (__mmask8)__U);
-}
-
-__funline __m128i _mm_maskz_srav_epi64(__mmask8 __U, __m128i __X, __m128i __Y) {
-  return (__m128i)__builtin_ia32_psravq128_mask(
-      (__v2di)__X, (__v2di)__Y, (__v2di)_mm_setzero_si128(), (__mmask8)__U);
-}
-
-__funline __m256i _mm256_mask_sllv_epi32(__m256i __W, __mmask8 __U, __m256i __X,
-      __m256i __Y) {
-  return (__m256i)__builtin_ia32_psllv8si_mask((__v8si)__X, (__v8si)__Y,
-      (__v8si)__W, (__mmask8)__U);
-}
-
-__funline __m256i _mm256_maskz_sllv_epi32(__mmask8 __U, __m256i __X,
-      __m256i __Y) {
-  return (__m256i)__builtin_ia32_psllv8si_mask(
-      (__v8si)__X, (__v8si)__Y, (__v8si)_mm256_setzero_si256(), (__mmask8)__U);
-}
-
-__funline __m128i _mm_mask_sllv_epi32(__m128i __W, __mmask8 __U, __m128i __X,
-      __m128i __Y) {
-  return (__m128i)__builtin_ia32_psllv4si_mask((__v4si)__X, (__v4si)__Y,
-      (__v4si)__W, (__mmask8)__U);
-}
-
-__funline __m128i _mm_maskz_sllv_epi32(__mmask8 __U, __m128i __X, __m128i __Y) {
-  return (__m128i)__builtin_ia32_psllv4si_mask(
-      (__v4si)__X, (__v4si)__Y, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
-}
-
-__funline __m256i _mm256_mask_sllv_epi64(__m256i __W, __mmask8 __U, __m256i __X,
-      __m256i __Y) {
-  return (__m256i)__builtin_ia32_psllv4di_mask((__v4di)__X, (__v4di)__Y,
-      (__v4di)__W, (__mmask8)__U);
-}
-
-__funline __m256i _mm256_maskz_sllv_epi64(__mmask8 __U, __m256i __X,
-      __m256i __Y) {
-  return (__m256i)__builtin_ia32_psllv4di_mask(
-      (__v4di)__X, (__v4di)__Y, (__v4di)_mm256_setzero_si256(), (__mmask8)__U);
-}
-
-__funline __m128i _mm_mask_sllv_epi64(__m128i __W, __mmask8 __U, __m128i __X,
-      __m128i __Y) {
-  return (__m128i)__builtin_ia32_psllv2di_mask((__v2di)__X, (__v2di)__Y,
-      (__v2di)__W, (__mmask8)__U);
-}
-
-__funline __m128i _mm_maskz_sllv_epi64(__mmask8 __U, __m128i __X, __m128i __Y) {
-  return (__m128i)__builtin_ia32_psllv2di_mask(
-      (__v2di)__X, (__v2di)__Y, (__v2di)_mm_setzero_si128(), (__mmask8)__U);
-}
-
-__funline __m256i _mm256_mask_srav_epi32(__m256i __W, __mmask8 __U, __m256i __X,
-      __m256i __Y) {
-  return (__m256i)__builtin_ia32_psrav8si_mask((__v8si)__X, (__v8si)__Y,
-      (__v8si)__W, (__mmask8)__U);
-}
-
-__funline __m256i _mm256_maskz_srav_epi32(__mmask8 __U, __m256i __X,
-      __m256i __Y) {
-  return (__m256i)__builtin_ia32_psrav8si_mask(
-      (__v8si)__X, (__v8si)__Y, (__v8si)_mm256_setzero_si256(), (__mmask8)__U);
-}
-
-__funline __m128i _mm_mask_srav_epi32(__m128i __W, __mmask8 __U, __m128i __X,
-      __m128i __Y) {
-  return (__m128i)__builtin_ia32_psrav4si_mask((__v4si)__X, (__v4si)__Y,
-      (__v4si)__W, (__mmask8)__U);
-}
-
-__funline __m128i _mm_maskz_srav_epi32(__mmask8 __U, __m128i __X, __m128i __Y) {
-  return (__m128i)__builtin_ia32_psrav4si_mask(
-      (__v4si)__X, (__v4si)__Y, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
-}
-
-__funline __m256i _mm256_mask_srlv_epi32(__m256i __W, __mmask8 __U, __m256i __X,
-      __m256i __Y) {
-  return (__m256i)__builtin_ia32_psrlv8si_mask((__v8si)__X, (__v8si)__Y,
-      (__v8si)__W, (__mmask8)__U);
-}
-
-__funline __m256i _mm256_maskz_srlv_epi32(__mmask8 __U, __m256i __X,
-      __m256i __Y) {
-  return (__m256i)__builtin_ia32_psrlv8si_mask(
-      (__v8si)__X, (__v8si)__Y, (__v8si)_mm256_setzero_si256(), (__mmask8)__U);
-}
-
-__funline __m128i _mm_mask_srlv_epi32(__m128i __W, __mmask8 __U, __m128i __X,
-      __m128i __Y) {
-  return (__m128i)__builtin_ia32_psrlv4si_mask((__v4si)__X, (__v4si)__Y,
-      (__v4si)__W, (__mmask8)__U);
-}
-
-__funline __m128i _mm_maskz_srlv_epi32(__mmask8 __U, __m128i __X, __m128i __Y) {
-  return (__m128i)__builtin_ia32_psrlv4si_mask(
-      (__v4si)__X, (__v4si)__Y, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
-}
-
-__funline __m256i _mm256_mask_srlv_epi64(__m256i __W, __mmask8 __U, __m256i __X,
-      __m256i __Y) {
-  return (__m256i)__builtin_ia32_psrlv4di_mask((__v4di)__X, (__v4di)__Y,
-      (__v4di)__W, (__mmask8)__U);
-}
-
-__funline __m256i _mm256_maskz_srlv_epi64(__mmask8 __U, __m256i __X,
-      __m256i __Y) {
-  return (__m256i)__builtin_ia32_psrlv4di_mask(
-      (__v4di)__X, (__v4di)__Y, (__v4di)_mm256_setzero_si256(), (__mmask8)__U);
-}
-
-__funline __m128i _mm_mask_srlv_epi64(__m128i __W, __mmask8 __U, __m128i __X,
-      __m128i __Y) {
-  return (__m128i)__builtin_ia32_psrlv2di_mask((__v2di)__X, (__v2di)__Y,
-      (__v2di)__W, (__mmask8)__U);
-}
-
-__funline __m128i _mm_maskz_srlv_epi64(__mmask8 __U, __m128i __X, __m128i __Y) {
-  return (__m128i)__builtin_ia32_psrlv2di_mask(
-      (__v2di)__X, (__v2di)__Y, (__v2di)_mm_setzero_si128(), (__mmask8)__U);
-}
-
-__funline __m256i _mm256_rolv_epi32(__m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_prolvd256_mask(
-      (__v8si)__A, (__v8si)__B, (__v8si)_mm256_setzero_si256(), (__mmask8)-1);
-}
-
-__funline __m256i _mm256_mask_rolv_epi32(__m256i __W, __mmask8 __U, __m256i __A,
-      __m256i __B) {
-  return (__m256i)__builtin_ia32_prolvd256_mask((__v8si)__A, (__v8si)__B,
-      (__v8si)__W, (__mmask8)__U);
-}
-
-__funline __m256i _mm256_maskz_rolv_epi32(__mmask8 __U, __m256i __A,
-      __m256i __B) {
-  return (__m256i)__builtin_ia32_prolvd256_mask(
-      (__v8si)__A, (__v8si)__B, (__v8si)_mm256_setzero_si256(), (__mmask8)__U);
-}
-
-__funline __m128i _mm_rolv_epi32(__m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_prolvd128_mask(
-      (__v4si)__A, (__v4si)__B, (__v4si)_mm_setzero_si128(), (__mmask8)-1);
-}
-
-__funline __m128i _mm_mask_rolv_epi32(__m128i __W, __mmask8 __U, __m128i __A,
-      __m128i __B) {
-  return (__m128i)__builtin_ia32_prolvd128_mask((__v4si)__A, (__v4si)__B,
-      (__v4si)__W, (__mmask8)__U);
-}
-
-__funline __m128i _mm_maskz_rolv_epi32(__mmask8 __U, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_prolvd128_mask(
-      (__v4si)__A, (__v4si)__B, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
-}
-
-__funline __m256i _mm256_rorv_epi32(__m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_prorvd256_mask(
-      (__v8si)__A, (__v8si)__B, (__v8si)_mm256_setzero_si256(), (__mmask8)-1);
-}
-
-__funline __m256i _mm256_mask_rorv_epi32(__m256i __W, __mmask8 __U, __m256i __A,
-      __m256i __B) {
-  return (__m256i)__builtin_ia32_prorvd256_mask((__v8si)__A, (__v8si)__B,
-      (__v8si)__W, (__mmask8)__U);
-}
-
-__funline __m256i _mm256_maskz_rorv_epi32(__mmask8 __U, __m256i __A,
-      __m256i __B) {
-  return (__m256i)__builtin_ia32_prorvd256_mask(
-      (__v8si)__A, (__v8si)__B, (__v8si)_mm256_setzero_si256(), (__mmask8)__U);
-}
-
-__funline __m128i _mm_rorv_epi32(__m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_prorvd128_mask(
-      (__v4si)__A, (__v4si)__B, (__v4si)_mm_setzero_si128(), (__mmask8)-1);
-}
-
-__funline __m128i _mm_mask_rorv_epi32(__m128i __W, __mmask8 __U, __m128i __A,
-      __m128i __B) {
-  return (__m128i)__builtin_ia32_prorvd128_mask((__v4si)__A, (__v4si)__B,
-      (__v4si)__W, (__mmask8)__U);
-}
-
-__funline __m128i _mm_maskz_rorv_epi32(__mmask8 __U, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_prorvd128_mask(
-      (__v4si)__A, (__v4si)__B, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
-}
-
-__funline __m256i _mm256_rolv_epi64(__m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_prolvq256_mask(
-      (__v4di)__A, (__v4di)__B, (__v4di)_mm256_setzero_si256(), (__mmask8)-1);
-}
-
-__funline __m256i _mm256_mask_rolv_epi64(__m256i __W, __mmask8 __U, __m256i __A,
-      __m256i __B) {
-  return (__m256i)__builtin_ia32_prolvq256_mask((__v4di)__A, (__v4di)__B,
-      (__v4di)__W, (__mmask8)__U);
-}
-
-__funline __m256i _mm256_maskz_rolv_epi64(__mmask8 __U, __m256i __A,
-      __m256i __B) {
-  return (__m256i)__builtin_ia32_prolvq256_mask(
-      (__v4di)__A, (__v4di)__B, (__v4di)_mm256_setzero_si256(), (__mmask8)__U);
-}
-
-__funline __m128i _mm_rolv_epi64(__m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_prolvq128_mask(
-      (__v2di)__A, (__v2di)__B,
(__v2di)_mm_setzero_si128(), (__mmask8)-1); -} - -__funline __m128i _mm_mask_rolv_epi64(__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_prolvq128_mask((__v2di)__A, (__v2di)__B, - (__v2di)__W, (__mmask8)__U); -} - -__funline __m128i _mm_maskz_rolv_epi64(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_prolvq128_mask( - (__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m256i _mm256_rorv_epi64(__m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_prorvq256_mask( - (__v4di)__A, (__v4di)__B, (__v4di)_mm256_setzero_si256(), (__mmask8)-1); -} - -__funline __m256i _mm256_mask_rorv_epi64(__m256i __W, __mmask8 __U, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_prorvq256_mask((__v4di)__A, (__v4di)__B, - (__v4di)__W, (__mmask8)__U); -} - -__funline __m256i _mm256_maskz_rorv_epi64(__mmask8 __U, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_prorvq256_mask( - (__v4di)__A, (__v4di)__B, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); -} - -__funline __m128i _mm_rorv_epi64(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_prorvq128_mask( - (__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), (__mmask8)-1); -} - -__funline __m128i _mm_mask_rorv_epi64(__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_prorvq128_mask((__v2di)__A, (__v2di)__B, - (__v2di)__W, (__mmask8)__U); -} - -__funline __m128i _mm_maskz_rorv_epi64(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_prorvq128_mask( - (__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m256i _mm256_srav_epi64(__m256i __X, __m256i __Y) { - return (__m256i)__builtin_ia32_psravq256_mask( - (__v4di)__X, (__v4di)__Y, (__v4di)_mm256_setzero_si256(), (__mmask8)-1); -} - -__funline __m256i _mm256_mask_srav_epi64(__m256i __W, __mmask8 __U, __m256i __X, - __m256i __Y) { - return (__m256i)__builtin_ia32_psravq256_mask((__v4di)__X, (__v4di)__Y, - (__v4di)__W, (__mmask8)__U); -} - -__funline __m256i _mm256_maskz_srav_epi64(__mmask8 __U, __m256i __X, - __m256i __Y) { - return (__m256i)__builtin_ia32_psravq256_mask( - (__v4di)__X, (__v4di)__Y, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); -} - -__funline __m256i _mm256_mask_and_epi64(__m256i __W, __mmask8 __U, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_pandq256_mask((__v4di)__A, (__v4di)__B, - (__v4di)__W, __U); -} - -__funline __m256i _mm256_maskz_and_epi64(__mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_pandq256_mask( - (__v4di)__A, (__v4di)__B, (__v4di)_mm256_setzero_pd(), __U); -} - -__funline __m128i _mm_mask_and_epi64(__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_pandq128_mask((__v2di)__A, (__v2di)__B, - (__v2di)__W, __U); -} - -__funline __m128i _mm_maskz_and_epi64(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_pandq128_mask((__v2di)__A, (__v2di)__B, - (__v2di)_mm_setzero_pd(), __U); -} - -__funline __m256i _mm256_mask_andnot_epi64(__m256i __W, __mmask8 __U, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_pandnq256_mask((__v4di)__A, (__v4di)__B, - (__v4di)__W, __U); -} - -__funline __m256i _mm256_maskz_andnot_epi64(__mmask8 __U, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_pandnq256_mask( - (__v4di)__A, (__v4di)__B, (__v4di)_mm256_setzero_pd(), __U); -} - -__funline __m128i _mm_mask_andnot_epi64(__m128i __W, 
__mmask8 __U, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_pandnq128_mask((__v2di)__A, (__v2di)__B, - (__v2di)__W, __U); -} - -__funline __m128i _mm_maskz_andnot_epi64(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_pandnq128_mask((__v2di)__A, (__v2di)__B, - (__v2di)_mm_setzero_pd(), __U); -} - -__funline __m256i _mm256_mask_or_epi64(__m256i __W, __mmask8 __U, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_porq256_mask((__v4di)__A, (__v4di)__B, - (__v4di)__W, (__mmask8)__U); -} - -__funline __m256i _mm256_maskz_or_epi64(__mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_porq256_mask( - (__v4di)__A, (__v4di)__B, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); -} - -__funline __m256i _mm256_or_epi64(__m256i __A, __m256i __B) { - return (__m256i)((__v4du)__A | (__v4du)__B); -} - -__funline __m128i _mm_mask_or_epi64(__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_porq128_mask((__v2di)__A, (__v2di)__B, - (__v2di)__W, (__mmask8)__U); -} - -__funline __m128i _mm_maskz_or_epi64(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_porq128_mask( - (__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m128i _mm_or_epi64(__m128i __A, __m128i __B) { - return (__m128i)((__v2du)__A | (__v2du)__B); -} - -__funline __m256i _mm256_mask_xor_epi64(__m256i __W, __mmask8 __U, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_pxorq256_mask((__v4di)__A, (__v4di)__B, - (__v4di)__W, (__mmask8)__U); -} - -__funline __m256i _mm256_maskz_xor_epi64(__mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_pxorq256_mask( - (__v4di)__A, (__v4di)__B, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); -} - -__funline __m256i _mm256_xor_epi64(__m256i __A, __m256i __B) { - return (__m256i)((__v4du)__A ^ (__v4du)__B); -} - -__funline __m128i _mm_mask_xor_epi64(__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_pxorq128_mask((__v2di)__A, (__v2di)__B, - (__v2di)__W, (__mmask8)__U); -} - -__funline __m128i _mm_maskz_xor_epi64(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_pxorq128_mask( - (__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m128i _mm_xor_epi64(__m128i __A, __m128i __B) { - return (__m128i)((__v2du)__A ^ (__v2du)__B); -} - -__funline __m256d _mm256_mask_max_pd(__m256d __W, __mmask8 __U, __m256d __A, - __m256d __B) { - return (__m256d)__builtin_ia32_maxpd256_mask((__v4df)__A, (__v4df)__B, - (__v4df)__W, (__mmask8)__U); -} - -__funline __m256d _mm256_maskz_max_pd(__mmask8 __U, __m256d __A, __m256d __B) { - return (__m256d)__builtin_ia32_maxpd256_mask( - (__v4df)__A, (__v4df)__B, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); -} - -__funline __m256 _mm256_mask_max_ps(__m256 __W, __mmask8 __U, __m256 __A, - __m256 __B) { - return (__m256)__builtin_ia32_maxps256_mask((__v8sf)__A, (__v8sf)__B, - (__v8sf)__W, (__mmask8)__U); -} - -__funline __m256 _mm256_maskz_max_ps(__mmask8 __U, __m256 __A, __m256 __B) { - return (__m256)__builtin_ia32_maxps256_mask( - (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); -} - -__funline __m128 _mm_mask_div_ps(__m128 __W, __mmask8 __U, __m128 __A, - __m128 __B) { - return (__m128)__builtin_ia32_divps_mask((__v4sf)__A, (__v4sf)__B, - (__v4sf)__W, (__mmask8)__U); -} - -__funline __m128 _mm_maskz_div_ps(__mmask8 __U, __m128 __A, __m128 __B) { - return 
(__m128)__builtin_ia32_divps_mask( - (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); -} - -__funline __m128d _mm_mask_div_pd(__m128d __W, __mmask8 __U, __m128d __A, - __m128d __B) { - return (__m128d)__builtin_ia32_divpd_mask((__v2df)__A, (__v2df)__B, - (__v2df)__W, (__mmask8)__U); -} - -__funline __m128d _mm_maskz_div_pd(__mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_divpd_mask( - (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U); -} - -__funline __m256d _mm256_mask_min_pd(__m256d __W, __mmask8 __U, __m256d __A, - __m256d __B) { - return (__m256d)__builtin_ia32_minpd256_mask((__v4df)__A, (__v4df)__B, - (__v4df)__W, (__mmask8)__U); -} - -__funline __m256d _mm256_mask_div_pd(__m256d __W, __mmask8 __U, __m256d __A, - __m256d __B) { - return (__m256d)__builtin_ia32_divpd256_mask((__v4df)__A, (__v4df)__B, - (__v4df)__W, (__mmask8)__U); -} - -__funline __m256d _mm256_maskz_min_pd(__mmask8 __U, __m256d __A, __m256d __B) { - return (__m256d)__builtin_ia32_minpd256_mask( - (__v4df)__A, (__v4df)__B, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); -} - -__funline __m256 _mm256_mask_min_ps(__m256 __W, __mmask8 __U, __m256 __A, - __m256 __B) { - return (__m256)__builtin_ia32_minps256_mask((__v8sf)__A, (__v8sf)__B, - (__v8sf)__W, (__mmask8)__U); -} - -__funline __m256d _mm256_maskz_div_pd(__mmask8 __U, __m256d __A, __m256d __B) { - return (__m256d)__builtin_ia32_divpd256_mask( - (__v4df)__A, (__v4df)__B, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); -} - -__funline __m256 _mm256_mask_div_ps(__m256 __W, __mmask8 __U, __m256 __A, - __m256 __B) { - return (__m256)__builtin_ia32_divps256_mask((__v8sf)__A, (__v8sf)__B, - (__v8sf)__W, (__mmask8)__U); -} - -__funline __m256 _mm256_maskz_min_ps(__mmask8 __U, __m256 __A, __m256 __B) { - return (__m256)__builtin_ia32_minps256_mask( - (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); -} - -__funline __m256 _mm256_maskz_div_ps(__mmask8 __U, __m256 __A, __m256 __B) { - return (__m256)__builtin_ia32_divps256_mask( - (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); -} - -__funline __m128 _mm_mask_min_ps(__m128 __W, __mmask8 __U, __m128 __A, - __m128 __B) { - return (__m128)__builtin_ia32_minps_mask((__v4sf)__A, (__v4sf)__B, - (__v4sf)__W, (__mmask8)__U); -} - -__funline __m128 _mm_mask_mul_ps(__m128 __W, __mmask8 __U, __m128 __A, - __m128 __B) { - return (__m128)__builtin_ia32_mulps_mask((__v4sf)__A, (__v4sf)__B, - (__v4sf)__W, (__mmask8)__U); -} - -__funline __m128 _mm_maskz_min_ps(__mmask8 __U, __m128 __A, __m128 __B) { - return (__m128)__builtin_ia32_minps_mask( - (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); -} - -__funline __m128 _mm_maskz_mul_ps(__mmask8 __U, __m128 __A, __m128 __B) { - return (__m128)__builtin_ia32_mulps_mask( - (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); -} - -__funline __m128 _mm_mask_max_ps(__m128 __W, __mmask8 __U, __m128 __A, - __m128 __B) { - return (__m128)__builtin_ia32_maxps_mask((__v4sf)__A, (__v4sf)__B, - (__v4sf)__W, (__mmask8)__U); -} - -__funline __m128 _mm_maskz_max_ps(__mmask8 __U, __m128 __A, __m128 __B) { - return (__m128)__builtin_ia32_maxps_mask( - (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); -} - -__funline __m128d _mm_mask_min_pd(__m128d __W, __mmask8 __U, __m128d __A, - __m128d __B) { - return (__m128d)__builtin_ia32_minpd_mask((__v2df)__A, (__v2df)__B, - (__v2df)__W, (__mmask8)__U); -} - -__funline __m128d _mm_maskz_min_pd(__mmask8 __U, __m128d __A, 
__m128d __B) { - return (__m128d)__builtin_ia32_minpd_mask( - (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U); -} - -__funline __m128d _mm_mask_max_pd(__m128d __W, __mmask8 __U, __m128d __A, - __m128d __B) { - return (__m128d)__builtin_ia32_maxpd_mask((__v2df)__A, (__v2df)__B, - (__v2df)__W, (__mmask8)__U); -} - -__funline __m128d _mm_maskz_max_pd(__mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_maxpd_mask( - (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U); -} - -__funline __m128d _mm_mask_mul_pd(__m128d __W, __mmask8 __U, __m128d __A, - __m128d __B) { - return (__m128d)__builtin_ia32_mulpd_mask((__v2df)__A, (__v2df)__B, - (__v2df)__W, (__mmask8)__U); -} - -__funline __m128d _mm_maskz_mul_pd(__mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_mulpd_mask( - (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U); -} - -__funline __m256 _mm256_mask_mul_ps(__m256 __W, __mmask8 __U, __m256 __A, - __m256 __B) { - return (__m256)__builtin_ia32_mulps256_mask((__v8sf)__A, (__v8sf)__B, - (__v8sf)__W, (__mmask8)__U); -} - -__funline __m256 _mm256_maskz_mul_ps(__mmask8 __U, __m256 __A, __m256 __B) { - return (__m256)__builtin_ia32_mulps256_mask( - (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); -} - -__funline __m256d _mm256_mask_mul_pd(__m256d __W, __mmask8 __U, __m256d __A, - __m256d __B) { - return (__m256d)__builtin_ia32_mulpd256_mask((__v4df)__A, (__v4df)__B, - (__v4df)__W, (__mmask8)__U); -} - -__funline __m256d _mm256_maskz_mul_pd(__mmask8 __U, __m256d __A, __m256d __B) { - return (__m256d)__builtin_ia32_mulpd256_mask( - (__v4df)__A, (__v4df)__B, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); -} - -__funline __m256i _mm256_maskz_max_epi64(__mmask8 __M, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_pmaxsq256_mask( - (__v4di)__A, (__v4di)__B, (__v4di)_mm256_setzero_si256(), __M); -} - -__funline __m256i _mm256_mask_max_epi64(__m256i __W, __mmask8 __M, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_pmaxsq256_mask((__v4di)__A, (__v4di)__B, - (__v4di)__W, __M); -} - -__funline __m256i _mm256_min_epi64(__m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_pminsq256_mask( - (__v4di)__A, (__v4di)__B, (__v4di)_mm256_setzero_si256(), (__mmask8)-1); -} - -__funline __m256i _mm256_mask_min_epi64(__m256i __W, __mmask8 __M, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_pminsq256_mask((__v4di)__A, (__v4di)__B, - (__v4di)__W, __M); -} - -__funline __m256i _mm256_maskz_min_epi64(__mmask8 __M, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_pminsq256_mask( - (__v4di)__A, (__v4di)__B, (__v4di)_mm256_setzero_si256(), __M); -} - -__funline __m256i _mm256_maskz_max_epu64(__mmask8 __M, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_pmaxuq256_mask( - (__v4di)__A, (__v4di)__B, (__v4di)_mm256_setzero_si256(), __M); -} - -__funline __m256i _mm256_max_epi64(__m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_pmaxsq256_mask( - (__v4di)__A, (__v4di)__B, (__v4di)_mm256_setzero_si256(), (__mmask8)-1); -} - -__funline __m256i _mm256_max_epu64(__m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_pmaxuq256_mask( - (__v4di)__A, (__v4di)__B, (__v4di)_mm256_setzero_si256(), (__mmask8)-1); -} - -__funline __m256i _mm256_mask_max_epu64(__m256i __W, __mmask8 __M, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_pmaxuq256_mask((__v4di)__A, (__v4di)__B, - (__v4di)__W, __M); -} - -__funline __m256i 
_mm256_min_epu64(__m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_pminuq256_mask( - (__v4di)__A, (__v4di)__B, (__v4di)_mm256_setzero_si256(), (__mmask8)-1); -} - -__funline __m256i _mm256_mask_min_epu64(__m256i __W, __mmask8 __M, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_pminuq256_mask((__v4di)__A, (__v4di)__B, - (__v4di)__W, __M); -} - -__funline __m256i _mm256_maskz_min_epu64(__mmask8 __M, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_pminuq256_mask( - (__v4di)__A, (__v4di)__B, (__v4di)_mm256_setzero_si256(), __M); -} - -__funline __m256i _mm256_maskz_max_epi32(__mmask8 __M, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_pmaxsd256_mask( - (__v8si)__A, (__v8si)__B, (__v8si)_mm256_setzero_si256(), __M); -} - -__funline __m256i _mm256_mask_max_epi32(__m256i __W, __mmask8 __M, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_pmaxsd256_mask((__v8si)__A, (__v8si)__B, - (__v8si)__W, __M); -} - -__funline __m256i _mm256_maskz_min_epi32(__mmask8 __M, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_pminsd256_mask( - (__v8si)__A, (__v8si)__B, (__v8si)_mm256_setzero_si256(), __M); -} - -__funline __m256i _mm256_mask_min_epi32(__m256i __W, __mmask8 __M, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_pminsd256_mask((__v8si)__A, (__v8si)__B, - (__v8si)__W, __M); -} - -__funline __m256i _mm256_maskz_max_epu32(__mmask8 __M, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_pmaxud256_mask( - (__v8si)__A, (__v8si)__B, (__v8si)_mm256_setzero_si256(), __M); -} - -__funline __m256i _mm256_mask_max_epu32(__m256i __W, __mmask8 __M, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_pmaxud256_mask((__v8si)__A, (__v8si)__B, - (__v8si)__W, __M); -} - -__funline __m256i _mm256_maskz_min_epu32(__mmask8 __M, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_pminud256_mask( - (__v8si)__A, (__v8si)__B, (__v8si)_mm256_setzero_si256(), __M); -} - -__funline __m256i _mm256_mask_min_epu32(__m256i __W, __mmask8 __M, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_pminud256_mask((__v8si)__A, (__v8si)__B, - (__v8si)__W, __M); -} - -__funline __m128i _mm_maskz_max_epi64(__mmask8 __M, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_pmaxsq128_mask( - (__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), __M); -} - -__funline __m128i _mm_mask_max_epi64(__m128i __W, __mmask8 __M, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_pmaxsq128_mask((__v2di)__A, (__v2di)__B, - (__v2di)__W, __M); -} - -__funline __m128i _mm_min_epi64(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_pminsq128_mask( - (__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), (__mmask8)-1); -} - -__funline __m128i _mm_mask_min_epi64(__m128i __W, __mmask8 __M, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_pminsq128_mask((__v2di)__A, (__v2di)__B, - (__v2di)__W, __M); -} - -__funline __m128i _mm_maskz_min_epi64(__mmask8 __M, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_pminsq128_mask( - (__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), __M); -} - -__funline __m128i _mm_maskz_max_epu64(__mmask8 __M, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_pmaxuq128_mask( - (__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), __M); -} - -__funline __m128i _mm_max_epi64(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_pmaxsq128_mask( - (__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), (__mmask8)-1); -} - -__funline 
__m128i _mm_max_epu64(__m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_pmaxuq128_mask(
-      (__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), (__mmask8)-1);
-}
-
-__funline __m128i _mm_mask_max_epu64(__m128i __W, __mmask8 __M, __m128i __A,
-      __m128i __B) {
-  return (__m128i)__builtin_ia32_pmaxuq128_mask((__v2di)__A, (__v2di)__B,
-      (__v2di)__W, __M);
-}
-
-__funline __m128i _mm_min_epu64(__m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_pminuq128_mask(
-      (__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), (__mmask8)-1);
-}
-
-__funline __m128i _mm_mask_min_epu64(__m128i __W, __mmask8 __M, __m128i __A,
-      __m128i __B) {
-  return (__m128i)__builtin_ia32_pminuq128_mask((__v2di)__A, (__v2di)__B,
-      (__v2di)__W, __M);
-}
-
-__funline __m128i _mm_maskz_min_epu64(__mmask8 __M, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_pminuq128_mask(
-      (__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), __M);
-}
-
-__funline __m128i _mm_maskz_max_epi32(__mmask8 __M, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_pmaxsd128_mask(
-      (__v4si)__A, (__v4si)__B, (__v4si)_mm_setzero_si128(), __M);
-}
-
-__funline __m128i _mm_mask_max_epi32(__m128i __W, __mmask8 __M, __m128i __A,
-      __m128i __B) {
-  return (__m128i)__builtin_ia32_pmaxsd128_mask((__v4si)__A, (__v4si)__B,
-      (__v4si)__W, __M);
-}
-
-__funline __m128i _mm_maskz_min_epi32(__mmask8 __M, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_pminsd128_mask(
-      (__v4si)__A, (__v4si)__B, (__v4si)_mm_setzero_si128(), __M);
-}
-
-__funline __m128i _mm_mask_min_epi32(__m128i __W, __mmask8 __M, __m128i __A,
-      __m128i __B) {
-  return (__m128i)__builtin_ia32_pminsd128_mask((__v4si)__A, (__v4si)__B,
-      (__v4si)__W, __M);
-}
-
-__funline __m128i _mm_maskz_max_epu32(__mmask8 __M, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_pmaxud128_mask(
-      (__v4si)__A, (__v4si)__B, (__v4si)_mm_setzero_si128(), __M);
-}
-
-__funline __m128i _mm_mask_max_epu32(__m128i __W, __mmask8 __M, __m128i __A,
-      __m128i __B) {
-  return (__m128i)__builtin_ia32_pmaxud128_mask((__v4si)__A, (__v4si)__B,
-      (__v4si)__W, __M);
-}
-
-__funline __m128i _mm_maskz_min_epu32(__mmask8 __M, __m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_pminud128_mask(
-      (__v4si)__A, (__v4si)__B, (__v4si)_mm_setzero_si128(), __M);
-}
-
-__funline __m128i _mm_mask_min_epu32(__m128i __W, __mmask8 __M, __m128i __A,
-      __m128i __B) {
-  return (__m128i)__builtin_ia32_pminud128_mask((__v4si)__A, (__v4si)__B,
-      (__v4si)__W, __M);
-}
-
+typedef int __v4si_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
+typedef int __v8si_u __attribute__ ((__vector_size__ (32), __may_alias__, __aligned__ (1)));
+typedef long long __v2di_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
+typedef long long __v4di_u __attribute__ ((__vector_size__ (32), __may_alias__, __aligned__ (1)));
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_mov_pd (__m256d __W, __mmask8 __U, __m256d __A)
+{
+  return (__m256d) __builtin_ia32_movapd256_mask ((__v4df) __A,
+      (__v4df) __W,
+      (__mmask8) __U);
+}
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_mov_pd (__mmask8 __U, __m256d __A)
+{
+  return (__m256d) __builtin_ia32_movapd256_mask ((__v4df) __A,
+      (__v4df)
+      _mm256_setzero_pd (),
+      (__mmask8) __U);
+}
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_mov_pd
(__m128d __W, __mmask8 __U, __m128d __A) +{ + return (__m128d) __builtin_ia32_movapd128_mask ((__v2df) __A, + (__v2df) __W, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_mov_pd (__mmask8 __U, __m128d __A) +{ + return (__m128d) __builtin_ia32_movapd128_mask ((__v2df) __A, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_load_pd (__m256d __W, __mmask8 __U, void const *__P) +{ + return (__m256d) __builtin_ia32_loadapd256_mask ((__v4df *) __P, + (__v4df) __W, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_load_pd (__mmask8 __U, void const *__P) +{ + return (__m256d) __builtin_ia32_loadapd256_mask ((__v4df *) __P, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_load_pd (__m128d __W, __mmask8 __U, void const *__P) +{ + return (__m128d) __builtin_ia32_loadapd128_mask ((__v2df *) __P, + (__v2df) __W, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_load_pd (__mmask8 __U, void const *__P) +{ + return (__m128d) __builtin_ia32_loadapd128_mask ((__v2df *) __P, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_store_pd (void *__P, __mmask8 __U, __m256d __A) +{ + __builtin_ia32_storeapd256_mask ((__v4df *) __P, + (__v4df) __A, + (__mmask8) __U); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_store_pd (void *__P, __mmask8 __U, __m128d __A) +{ + __builtin_ia32_storeapd128_mask ((__v2df *) __P, + (__v2df) __A, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_mov_ps (__m256 __W, __mmask8 __U, __m256 __A) +{ + return (__m256) __builtin_ia32_movaps256_mask ((__v8sf) __A, + (__v8sf) __W, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_mov_ps (__mmask8 __U, __m256 __A) +{ + return (__m256) __builtin_ia32_movaps256_mask ((__v8sf) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_mov_ps (__m128 __W, __mmask8 __U, __m128 __A) +{ + return (__m128) __builtin_ia32_movaps128_mask ((__v4sf) __A, + (__v4sf) __W, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_mov_ps (__mmask8 __U, __m128 __A) +{ + return (__m128) __builtin_ia32_movaps128_mask ((__v4sf) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_load_ps (__m256 __W, __mmask8 __U, void const *__P) +{ + return (__m256) __builtin_ia32_loadaps256_mask ((__v8sf *) __P, + (__v8sf) __W, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_load_ps (__mmask8 __U, void const *__P) +{ + return (__m256) __builtin_ia32_loadaps256_mask ((__v8sf *) __P, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} +extern __inline 
__m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_load_ps (__m128 __W, __mmask8 __U, void const *__P) +{ + return (__m128) __builtin_ia32_loadaps128_mask ((__v4sf *) __P, + (__v4sf) __W, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_load_ps (__mmask8 __U, void const *__P) +{ + return (__m128) __builtin_ia32_loadaps128_mask ((__v4sf *) __P, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_store_ps (void *__P, __mmask8 __U, __m256 __A) +{ + __builtin_ia32_storeaps256_mask ((__v8sf *) __P, + (__v8sf) __A, + (__mmask8) __U); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_store_ps (void *__P, __mmask8 __U, __m128 __A) +{ + __builtin_ia32_storeaps128_mask ((__v4sf *) __P, + (__v4sf) __A, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_mov_epi64 (__m256i __W, __mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_movdqa64_256_mask ((__v4di) __A, + (__v4di) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_mov_epi64 (__mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_movdqa64_256_mask ((__v4di) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_mov_epi64 (__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_movdqa64_128_mask ((__v2di) __A, + (__v2di) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_mov_epi64 (__mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_movdqa64_128_mask ((__v2di) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_load_epi64 (void const *__P) +{ + return (__m256i) (*(__v4di *) __P); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_load_epi64 (__m256i __W, __mmask8 __U, void const *__P) +{ + return (__m256i) __builtin_ia32_movdqa64load256_mask ((__v4di *) __P, + (__v4di) __W, + (__mmask8) + __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_load_epi64 (__mmask8 __U, void const *__P) +{ + return (__m256i) __builtin_ia32_movdqa64load256_mask ((__v4di *) __P, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) + __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_load_epi64 (void const *__P) +{ + return (__m128i) (*(__v2di *) __P); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_load_epi64 (__m128i __W, __mmask8 __U, void const *__P) +{ + return (__m128i) __builtin_ia32_movdqa64load128_mask ((__v2di *) __P, + (__v2di) __W, + (__mmask8) + __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_load_epi64 (__mmask8 __U, void const *__P) +{ + return (__m128i) __builtin_ia32_movdqa64load128_mask ((__v2di *) __P, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) + __U); +} +extern 
__inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_store_epi64 (void *__P, __mmask8 __U, __m256i __A) +{ + __builtin_ia32_movdqa64store256_mask ((__v4di *) __P, + (__v4di) __A, + (__mmask8) __U); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_store_epi64 (void *__P, __mmask8 __U, __m128i __A) +{ + __builtin_ia32_movdqa64store128_mask ((__v2di *) __P, + (__v2di) __A, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_mov_epi32 (__m256i __W, __mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_movdqa32_256_mask ((__v8si) __A, + (__v8si) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_mov_epi32 (__mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_movdqa32_256_mask ((__v8si) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_mov_epi32 (__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_movdqa32_128_mask ((__v4si) __A, + (__v4si) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_mov_epi32 (__mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_movdqa32_128_mask ((__v4si) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_load_epi32 (void const *__P) +{ + return (__m256i) (*(__v8si *) __P); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_load_epi32 (__m256i __W, __mmask8 __U, void const *__P) +{ + return (__m256i) __builtin_ia32_movdqa32load256_mask ((__v8si *) __P, + (__v8si) __W, + (__mmask8) + __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_load_epi32 (__mmask8 __U, void const *__P) +{ + return (__m256i) __builtin_ia32_movdqa32load256_mask ((__v8si *) __P, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) + __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_load_epi32 (void const *__P) +{ + return (__m128i) (*(__v4si *) __P); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_load_epi32 (__m128i __W, __mmask8 __U, void const *__P) +{ + return (__m128i) __builtin_ia32_movdqa32load128_mask ((__v4si *) __P, + (__v4si) __W, + (__mmask8) + __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_load_epi32 (__mmask8 __U, void const *__P) +{ + return (__m128i) __builtin_ia32_movdqa32load128_mask ((__v4si *) __P, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) + __U); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_store_epi32 (void *__P, __m256i __A) +{ + *(__v8si *) __P = (__v8si) __A; +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_store_epi32 (void *__P, __mmask8 __U, __m256i __A) +{ + __builtin_ia32_movdqa32store256_mask ((__v8si *) __P, + (__v8si) __A, + (__mmask8) __U); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, 
__artificial__)) +_mm_store_epi32 (void *__P, __m128i __A) +{ + *(__v4si *) __P = (__v4si) __A; +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_store_epi32 (void *__P, __mmask8 __U, __m128i __A) +{ + __builtin_ia32_movdqa32store128_mask ((__v4si *) __P, + (__v4si) __A, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_add_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_addpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_add_pd (__mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_addpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_add_pd (__m256d __W, __mmask8 __U, __m256d __A, + __m256d __B) +{ + return (__m256d) __builtin_ia32_addpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) __W, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_add_pd (__mmask8 __U, __m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_addpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_add_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_addps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_add_ps (__mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_addps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_add_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_addps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __W, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_add_ps (__mmask8 __U, __m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_addps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sub_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_subpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sub_pd (__mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_subpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_sub_pd (__m256d __W, __mmask8 __U, __m256d __A, + __m256d __B) +{ + return (__m256d) __builtin_ia32_subpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) __W, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ 
((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_sub_pd (__mmask8 __U, __m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_subpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sub_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_subps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sub_ps (__mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_subps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_sub_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_subps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __W, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_sub_ps (__mmask8 __U, __m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_subps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_store_epi64 (void *__P, __m256i __A) +{ + *(__m256i *) __P = __A; +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_store_epi64 (void *__P, __m128i __A) +{ + *(__m128i *) __P = __A; +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_loadu_pd (__m256d __W, __mmask8 __U, void const *__P) +{ + return (__m256d) __builtin_ia32_loadupd256_mask ((const double *) __P, + (__v4df) __W, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_loadu_pd (__mmask8 __U, void const *__P) +{ + return (__m256d) __builtin_ia32_loadupd256_mask ((const double *) __P, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_loadu_pd (__m128d __W, __mmask8 __U, void const *__P) +{ + return (__m128d) __builtin_ia32_loadupd128_mask ((const double *) __P, + (__v2df) __W, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_loadu_pd (__mmask8 __U, void const *__P) +{ + return (__m128d) __builtin_ia32_loadupd128_mask ((const double *) __P, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_storeu_pd (void *__P, __mmask8 __U, __m256d __A) +{ + __builtin_ia32_storeupd256_mask ((double *) __P, + (__v4df) __A, + (__mmask8) __U); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_storeu_pd (void *__P, __mmask8 __U, __m128d __A) +{ + __builtin_ia32_storeupd128_mask ((double *) __P, + (__v2df) __A, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_loadu_ps (__m256 __W, __mmask8 __U, void const *__P) +{ + return (__m256) __builtin_ia32_loadups256_mask ((const float *) __P, + 
(__v8sf) __W, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_loadu_ps (__mmask8 __U, void const *__P) +{ + return (__m256) __builtin_ia32_loadups256_mask ((const float *) __P, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_loadu_ps (__m128 __W, __mmask8 __U, void const *__P) +{ + return (__m128) __builtin_ia32_loadups128_mask ((const float *) __P, + (__v4sf) __W, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_loadu_ps (__mmask8 __U, void const *__P) +{ + return (__m128) __builtin_ia32_loadups128_mask ((const float *) __P, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_storeu_ps (void *__P, __mmask8 __U, __m256 __A) +{ + __builtin_ia32_storeups256_mask ((float *) __P, + (__v8sf) __A, + (__mmask8) __U); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_storeu_ps (void *__P, __mmask8 __U, __m128 __A) +{ + __builtin_ia32_storeups128_mask ((float *) __P, + (__v4sf) __A, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_loadu_epi64 (void const *__P) +{ + return (__m256i) (*(__v4di_u *) __P); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_loadu_epi64 (__m256i __W, __mmask8 __U, void const *__P) +{ + return (__m256i) __builtin_ia32_loaddqudi256_mask ((const long long *) __P, + (__v4di) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_loadu_epi64 (__mmask8 __U, void const *__P) +{ + return (__m256i) __builtin_ia32_loaddqudi256_mask ((const long long *) __P, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadu_epi64 (void const *__P) +{ + return (__m128i) (*(__v2di_u *) __P); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_loadu_epi64 (__m128i __W, __mmask8 __U, void const *__P) +{ + return (__m128i) __builtin_ia32_loaddqudi128_mask ((const long long *) __P, + (__v2di) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_loadu_epi64 (__mmask8 __U, void const *__P) +{ + return (__m128i) __builtin_ia32_loaddqudi128_mask ((const long long *) __P, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_storeu_epi64 (void *__P, __m256i __A) +{ + *(__m256i_u *) __P = (__m256i_u) __A; +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_storeu_epi64 (void *__P, __mmask8 __U, __m256i __A) +{ + __builtin_ia32_storedqudi256_mask ((long long *) __P, + (__v4di) __A, + (__mmask8) __U); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storeu_epi64 (void *__P, __m128i __A) +{ + *(__m128i_u *) __P = (__m128i_u) __A; +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm_mask_storeu_epi64 (void *__P, __mmask8 __U, __m128i __A) +{ + __builtin_ia32_storedqudi128_mask ((long long *) __P, + (__v2di) __A, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_loadu_epi32 (void const *__P) +{ + return (__m256i) (*(__v8si_u *) __P); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_loadu_epi32 (__m256i __W, __mmask8 __U, void const *__P) +{ + return (__m256i) __builtin_ia32_loaddqusi256_mask ((const int *) __P, + (__v8si) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_loadu_epi32 (__mmask8 __U, void const *__P) +{ + return (__m256i) __builtin_ia32_loaddqusi256_mask ((const int *) __P, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadu_epi32 (void const *__P) +{ + return (__m128i) (*(__v4si_u *) __P); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_loadu_epi32 (__m128i __W, __mmask8 __U, void const *__P) +{ + return (__m128i) __builtin_ia32_loaddqusi128_mask ((const int *) __P, + (__v4si) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_loadu_epi32 (__mmask8 __U, void const *__P) +{ + return (__m128i) __builtin_ia32_loaddqusi128_mask ((const int *) __P, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_storeu_epi32 (void *__P, __m256i __A) +{ + *(__m256i_u *) __P = (__m256i_u) __A; +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_storeu_epi32 (void *__P, __mmask8 __U, __m256i __A) +{ + __builtin_ia32_storedqusi256_mask ((int *) __P, + (__v8si) __A, + (__mmask8) __U); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storeu_epi32 (void *__P, __m128i __A) +{ + *(__m128i_u *) __P = (__m128i_u) __A; +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_storeu_epi32 (void *__P, __mmask8 __U, __m128i __A) +{ + __builtin_ia32_storedqusi128_mask ((int *) __P, + (__v4si) __A, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_abs_epi32 (__m256i __W, __mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_pabsd256_mask ((__v8si) __A, + (__v8si) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_abs_epi32 (__mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_pabsd256_mask ((__v8si) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_abs_epi32 (__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_pabsd128_mask ((__v4si) __A, + (__v4si) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_abs_epi32 (__mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_pabsd128_mask ((__v4si) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern 
__inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_abs_epi64 (__m256i __A) +{ + return (__m256i) __builtin_ia32_pabsq256_mask ((__v4di) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_abs_epi64 (__m256i __W, __mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_pabsq256_mask ((__v4di) __A, + (__v4di) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_abs_epi64 (__mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_pabsq256_mask ((__v4di) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_abs_epi64 (__m128i __A) +{ + return (__m128i) __builtin_ia32_pabsq128_mask ((__v2di) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) -1); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_abs_epi64 (__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_pabsq128_mask ((__v2di) __A, + (__v2di) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_abs_epi64 (__mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_pabsq128_mask ((__v2di) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtpd_epu32 (__m256d __A) +{ + return (__m128i) __builtin_ia32_cvtpd2udq256_mask ((__v4df) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) -1); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtpd_epu32 (__m128i __W, __mmask8 __U, __m256d __A) +{ + return (__m128i) __builtin_ia32_cvtpd2udq256_mask ((__v4df) __A, + (__v4si) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtpd_epu32 (__mmask8 __U, __m256d __A) +{ + return (__m128i) __builtin_ia32_cvtpd2udq256_mask ((__v4df) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtpd_epu32 (__m128d __A) +{ + return (__m128i) __builtin_ia32_cvtpd2udq128_mask ((__v2df) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) -1); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtpd_epu32 (__m128i __W, __mmask8 __U, __m128d __A) +{ + return (__m128i) __builtin_ia32_cvtpd2udq128_mask ((__v2df) __A, + (__v4si) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtpd_epu32 (__mmask8 __U, __m128d __A) +{ + return (__m128i) __builtin_ia32_cvtpd2udq128_mask ((__v2df) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvttps_epi32 (__m256i __W, __mmask8 __U, __m256 __A) +{ + return (__m256i) __builtin_ia32_cvttps2dq256_mask ((__v8sf) __A, + (__v8si) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvttps_epi32 (__mmask8 __U, __m256 __A) +{ + 
return (__m256i) __builtin_ia32_cvttps2dq256_mask ((__v8sf) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvttps_epi32 (__m128i __W, __mmask8 __U, __m128 __A) +{ + return (__m128i) __builtin_ia32_cvttps2dq128_mask ((__v4sf) __A, + (__v4si) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvttps_epi32 (__mmask8 __U, __m128 __A) +{ + return (__m128i) __builtin_ia32_cvttps2dq128_mask ((__v4sf) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvttps_epu32 (__m256 __A) +{ + return (__m256i) __builtin_ia32_cvttps2udq256_mask ((__v8sf) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvttps_epu32 (__m256i __W, __mmask8 __U, __m256 __A) +{ + return (__m256i) __builtin_ia32_cvttps2udq256_mask ((__v8sf) __A, + (__v8si) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvttps_epu32 (__mmask8 __U, __m256 __A) +{ + return (__m256i) __builtin_ia32_cvttps2udq256_mask ((__v8sf) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttps_epu32 (__m128 __A) +{ + return (__m128i) __builtin_ia32_cvttps2udq128_mask ((__v4sf) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) -1); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvttps_epu32 (__m128i __W, __mmask8 __U, __m128 __A) +{ + return (__m128i) __builtin_ia32_cvttps2udq128_mask ((__v4sf) __A, + (__v4si) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvttps_epu32 (__mmask8 __U, __m128 __A) +{ + return (__m128i) __builtin_ia32_cvttps2udq128_mask ((__v4sf) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvttpd_epi32 (__m128i __W, __mmask8 __U, __m256d __A) +{ + return (__m128i) __builtin_ia32_cvttpd2dq256_mask ((__v4df) __A, + (__v4si) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvttpd_epi32 (__mmask8 __U, __m256d __A) +{ + return (__m128i) __builtin_ia32_cvttpd2dq256_mask ((__v4df) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvttpd_epi32 (__m128i __W, __mmask8 __U, __m128d __A) +{ + return (__m128i) __builtin_ia32_cvttpd2dq128_mask ((__v2df) __A, + (__v4si) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvttpd_epi32 (__mmask8 __U, __m128d __A) +{ + return (__m128i) __builtin_ia32_cvttpd2dq128_mask ((__v2df) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvttpd_epu32 (__m256d __A) +{ + return (__m128i) __builtin_ia32_cvttpd2udq256_mask ((__v4df) __A, + (__v4si) 
+ _mm_setzero_si128 (), + (__mmask8) -1); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvttpd_epu32 (__m128i __W, __mmask8 __U, __m256d __A) +{ + return (__m128i) __builtin_ia32_cvttpd2udq256_mask ((__v4df) __A, + (__v4si) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvttpd_epu32 (__mmask8 __U, __m256d __A) +{ + return (__m128i) __builtin_ia32_cvttpd2udq256_mask ((__v4df) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttpd_epu32 (__m128d __A) +{ + return (__m128i) __builtin_ia32_cvttpd2udq128_mask ((__v2df) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) -1); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvttpd_epu32 (__m128i __W, __mmask8 __U, __m128d __A) +{ + return (__m128i) __builtin_ia32_cvttpd2udq128_mask ((__v2df) __A, + (__v4si) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvttpd_epu32 (__mmask8 __U, __m128d __A) +{ + return (__m128i) __builtin_ia32_cvttpd2udq128_mask ((__v2df) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtpd_epi32 (__m128i __W, __mmask8 __U, __m256d __A) +{ + return (__m128i) __builtin_ia32_cvtpd2dq256_mask ((__v4df) __A, + (__v4si) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtpd_epi32 (__mmask8 __U, __m256d __A) +{ + return (__m128i) __builtin_ia32_cvtpd2dq256_mask ((__v4df) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtpd_epi32 (__m128i __W, __mmask8 __U, __m128d __A) +{ + return (__m128i) __builtin_ia32_cvtpd2dq128_mask ((__v2df) __A, + (__v4si) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtpd_epi32 (__mmask8 __U, __m128d __A) +{ + return (__m128i) __builtin_ia32_cvtpd2dq128_mask ((__v2df) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepi32_pd (__m256d __W, __mmask8 __U, __m128i __A) +{ + return (__m256d) __builtin_ia32_cvtdq2pd256_mask ((__v4si) __A, + (__v4df) __W, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtepi32_pd (__mmask8 __U, __m128i __A) +{ + return (__m256d) __builtin_ia32_cvtdq2pd256_mask ((__v4si) __A, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepi32_pd (__m128d __W, __mmask8 __U, __m128i __A) +{ + return (__m128d) __builtin_ia32_cvtdq2pd128_mask ((__v4si) __A, + (__v2df) __W, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtepi32_pd (__mmask8 __U, __m128i __A) +{ + return (__m128d) __builtin_ia32_cvtdq2pd128_mask ((__v4si) __A, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} +extern __inline 
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepu32_pd (__m128i __A)
+{
+  return (__m256d) __builtin_ia32_cvtudq2pd256_mask ((__v4si) __A,
+      (__v4df) _mm256_setzero_pd (), (__mmask8) -1);
+}
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepu32_pd (__m256d __W, __mmask8 __U, __m128i __A)
+{
+  return (__m256d) __builtin_ia32_cvtudq2pd256_mask ((__v4si) __A,
+      (__v4df) __W, (__mmask8) __U);
+}
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepu32_pd (__mmask8 __U, __m128i __A)
+{
+  return (__m256d) __builtin_ia32_cvtudq2pd256_mask ((__v4si) __A,
+      (__v4df) _mm256_setzero_pd (), (__mmask8) __U);
+}
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepu32_pd (__m128i __A)
+{
+  return (__m128d) __builtin_ia32_cvtudq2pd128_mask ((__v4si) __A,
+      (__v2df) _mm_setzero_pd (), (__mmask8) -1);
+}
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepu32_pd (__m128d __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128d) __builtin_ia32_cvtudq2pd128_mask ((__v4si) __A,
+      (__v2df) __W, (__mmask8) __U);
+}
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepu32_pd (__mmask8 __U, __m128i __A)
+{
+  return (__m128d) __builtin_ia32_cvtudq2pd128_mask ((__v4si) __A,
+      (__v2df) _mm_setzero_pd (), (__mmask8) __U);
+}
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepi32_ps (__m256 __W, __mmask8 __U, __m256i __A)
+{
+  return (__m256) __builtin_ia32_cvtdq2ps256_mask ((__v8si) __A,
+      (__v8sf) __W, (__mmask8) __U);
+}
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepi32_ps (__mmask8 __U, __m256i __A)
+{
+  return (__m256) __builtin_ia32_cvtdq2ps256_mask ((__v8si) __A,
+      (__v8sf) _mm256_setzero_ps (), (__mmask8) __U);
+}
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepi32_ps (__m128 __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128) __builtin_ia32_cvtdq2ps128_mask ((__v4si) __A,
+      (__v4sf) __W, (__mmask8) __U);
+}
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepi32_ps (__mmask8 __U, __m128i __A)
+{
+  return (__m128) __builtin_ia32_cvtdq2ps128_mask ((__v4si) __A,
+      (__v4sf) _mm_setzero_ps (), (__mmask8) __U);
+}
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepu32_ps (__m256i __A)
+{
+  return (__m256) __builtin_ia32_cvtudq2ps256_mask ((__v8si) __A,
+      (__v8sf) _mm256_setzero_ps (), (__mmask8) -1);
+}
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepu32_ps (__m256 __W, __mmask8 __U, __m256i __A)
+{
+  return (__m256) __builtin_ia32_cvtudq2ps256_mask ((__v8si) __A,
+      (__v8sf) __W, (__mmask8) __U);
+}
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepu32_ps (__mmask8 __U, __m256i __A)
+{
+  return (__m256) __builtin_ia32_cvtudq2ps256_mask ((__v8si) __A,
+      (__v8sf) _mm256_setzero_ps (), (__mmask8) __U);
+}
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepu32_ps (__m128i __A)
+{
+  return (__m128) __builtin_ia32_cvtudq2ps128_mask ((__v4si) __A,
+      (__v4sf) _mm_setzero_ps (), (__mmask8) -1);
+}
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepu32_ps (__m128 __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128) __builtin_ia32_cvtudq2ps128_mask ((__v4si) __A,
+      (__v4sf) __W, (__mmask8) __U);
+}
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepu32_ps (__mmask8 __U, __m128i __A)
+{
+  return (__m128) __builtin_ia32_cvtudq2ps128_mask ((__v4si) __A,
+      (__v4sf) _mm_setzero_ps (), (__mmask8) __U);
+}
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtps_pd (__m256d __W, __mmask8 __U, __m128 __A)
+{
+  return (__m256d) __builtin_ia32_cvtps2pd256_mask ((__v4sf) __A,
+      (__v4df) __W, (__mmask8) __U);
+}
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtps_pd (__mmask8 __U, __m128 __A)
+{
+  return (__m256d) __builtin_ia32_cvtps2pd256_mask ((__v4sf) __A,
+      (__v4df) _mm256_setzero_pd (), (__mmask8) __U);
+}
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtps_pd (__m128d __W, __mmask8 __U, __m128 __A)
+{
+  return (__m128d) __builtin_ia32_cvtps2pd128_mask ((__v4sf) __A,
+      (__v2df) __W, (__mmask8) __U);
+}
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtps_pd (__mmask8 __U, __m128 __A)
+{
+  return (__m128d) __builtin_ia32_cvtps2pd128_mask ((__v4sf) __A,
+      (__v2df) _mm_setzero_pd (), (__mmask8) __U);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi32_epi8 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdb128_mask ((__v4si) __A,
+      (__v16qi) _mm_undefined_si128 (), (__mmask8) -1);
+}
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepi32_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovdb128mem_mask ((unsigned int *) __P, (__v4si) __A, __M);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepi32_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdb128_mask ((__v4si) __A,
+      (__v16qi) __O, __M);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepi32_epi8 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdb128_mask ((__v4si) __A,
+      (__v16qi) _mm_setzero_si128 (), __M);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepi32_epi8 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdb256_mask ((__v8si) __A,
+      (__v16qi) _mm_undefined_si128 (), (__mmask8) -1);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepi32_epi8 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdb256_mask ((__v8si) __A,
+      (__v16qi) __O, __M);
+}
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepi32_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovdb256mem_mask ((unsigned long long *) __P, (__v8si) __A, __M);
+}
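+/* [Editorial note, not part of the upstream header: the pmov family
+   narrows each 32-bit lane by plain truncation, and the _storeu_
+   variants write only the bytes selected by the mask.  Sketch,
+   assuming an AVX-512VL target:
+
+     unsigned char buf[4] = {0};
+     __m128i v = _mm_setr_epi32 (0x101, 2, 3, 4);
+     _mm_mask_cvtepi32_storeu_epi8 (buf, 0x5, v);
+
+   stores 0x01 to buf[0] (0x101 truncates to its low byte) and 3 to
+   buf[2]; buf[1] and buf[3] are left untouched.] */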
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepi32_epi8 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdb256_mask ((__v8si) __A,
+      (__v16qi) _mm_setzero_si128 (), __M);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsepi32_epi8 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdb128_mask ((__v4si) __A,
+      (__v16qi) _mm_undefined_si128 (), (__mmask8) -1);
+}
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtsepi32_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovsdb128mem_mask ((unsigned int *) __P, (__v4si) __A, __M);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtsepi32_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdb128_mask ((__v4si) __A,
+      (__v16qi) __O, __M);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtsepi32_epi8 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdb128_mask ((__v4si) __A,
+      (__v16qi) _mm_setzero_si128 (), __M);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtsepi32_epi8 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdb256_mask ((__v8si) __A,
+      (__v16qi) _mm_undefined_si128 (), (__mmask8) -1);
+}
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtsepi32_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovsdb256mem_mask ((unsigned long long *) __P, (__v8si) __A, __M);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtsepi32_epi8 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdb256_mask ((__v8si) __A,
+      (__v16qi) __O, __M);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtsepi32_epi8 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdb256_mask ((__v8si) __A,
+      (__v16qi) _mm_setzero_si128 (), __M);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtusepi32_epi8 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdb128_mask ((__v4si) __A,
+      (__v16qi) _mm_undefined_si128 (), (__mmask8) -1);
+}
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtusepi32_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovusdb128mem_mask ((unsigned int *) __P, (__v4si) __A, __M);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtusepi32_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdb128_mask ((__v4si) __A,
+      (__v16qi) __O, __M);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtusepi32_epi8 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdb128_mask ((__v4si) __A,
+      (__v16qi) _mm_setzero_si128 (), __M);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtusepi32_epi8 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdb256_mask ((__v8si) __A,
+      (__v16qi) _mm_undefined_si128 (), (__mmask8) -1);
+}
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtusepi32_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovusdb256mem_mask ((unsigned long long *) __P, (__v8si) __A, __M);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtusepi32_epi8 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdb256_mask ((__v8si) __A,
+      (__v16qi) __O, __M);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtusepi32_epi8 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdb256_mask ((__v8si) __A,
+      (__v16qi) _mm_setzero_si128 (), __M);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi32_epi16 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdw128_mask ((__v4si) __A,
+      (__v8hi) _mm_setzero_si128 (), (__mmask8) -1);
+}
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepi32_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovdw128mem_mask ((unsigned long long *) __P, (__v4si) __A, __M);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepi32_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdw128_mask ((__v4si) __A,
+      (__v8hi) __O, __M);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepi32_epi16 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdw128_mask ((__v4si) __A,
+      (__v8hi) _mm_setzero_si128 (), __M);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepi32_epi16 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdw256_mask ((__v8si) __A,
+      (__v8hi) _mm_setzero_si128 (), (__mmask8) -1);
+}
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepi32_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovdw256mem_mask ((__v8hi *) __P, (__v8si) __A, __M);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepi32_epi16 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdw256_mask ((__v8si) __A,
+      (__v8hi) __O, __M);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepi32_epi16 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdw256_mask ((__v8si) __A,
+      (__v8hi) _mm_setzero_si128 (), __M);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsepi32_epi16 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdw128_mask ((__v4si) __A,
+      (__v8hi) _mm_setzero_si128 (), (__mmask8) -1);
+}
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtsepi32_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovsdw128mem_mask ((unsigned long long *) __P, (__v4si) __A, __M);
+}
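+/* [Editorial note, not part of the upstream header: the cvtsepi32 and
+   cvtusepi32 forms saturate instead of truncating -- vpmovsdw clamps
+   each dword to [-32768, 32767], vpmovusdw to [0, 65535].  Sketch,
+   assuming an AVX-512VL target:
+
+     __m128i v = _mm_setr_epi32 (70000, -70000, 1, 0);
+     __m128i s = _mm_cvtsepi32_epi16 (v);
+
+   yields 32767, -32768, 1, 0 in the low four words, with the upper
+   64 bits of the result zeroed.] */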
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtsepi32_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdw128_mask ((__v4si) __A,
+      (__v8hi) __O, __M);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtsepi32_epi16 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdw128_mask ((__v4si) __A,
+      (__v8hi) _mm_setzero_si128 (), __M);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtsepi32_epi16 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdw256_mask ((__v8si) __A,
+      (__v8hi) _mm_undefined_si128 (), (__mmask8) -1);
+}
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtsepi32_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovsdw256mem_mask ((__v8hi *) __P, (__v8si) __A, __M);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtsepi32_epi16 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdw256_mask ((__v8si) __A,
+      (__v8hi) __O, __M);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtsepi32_epi16 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdw256_mask ((__v8si) __A,
+      (__v8hi) _mm_setzero_si128 (), __M);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtusepi32_epi16 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdw128_mask ((__v4si) __A,
+      (__v8hi) _mm_undefined_si128 (), (__mmask8) -1);
+}
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtusepi32_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovusdw128mem_mask ((unsigned long long *) __P, (__v4si) __A, __M);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtusepi32_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdw128_mask ((__v4si) __A,
+      (__v8hi) __O, __M);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtusepi32_epi16 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdw128_mask ((__v4si) __A,
+      (__v8hi) _mm_setzero_si128 (), __M);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtusepi32_epi16 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdw256_mask ((__v8si) __A,
+      (__v8hi) _mm_undefined_si128 (), (__mmask8) -1);
+}
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtusepi32_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovusdw256mem_mask ((__v8hi *) __P, (__v8si) __A, __M);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtusepi32_epi16 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdw256_mask ((__v8si) __A,
+      (__v8hi) __O, __M);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtusepi32_epi16 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdw256_mask ((__v8si) __A,
+      (__v8hi) _mm_setzero_si128 (), __M);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi64_epi8 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqb128_mask ((__v2di) __A,
+      (__v16qi) _mm_undefined_si128 (), (__mmask8) -1);
+}
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepi64_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovqb128mem_mask ((unsigned short *) __P, (__v2di) __A, __M);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepi64_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqb128_mask ((__v2di) __A,
+      (__v16qi) __O, __M);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepi64_epi8 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqb128_mask ((__v2di) __A,
+      (__v16qi) _mm_setzero_si128 (), __M);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepi64_epi8 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqb256_mask ((__v4di) __A,
+      (__v16qi) _mm_undefined_si128 (), (__mmask8) -1);
+}
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepi64_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovqb256mem_mask ((unsigned int *) __P, (__v4di) __A, __M);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepi64_epi8 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqb256_mask ((__v4di) __A,
+      (__v16qi) __O, __M);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepi64_epi8 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqb256_mask ((__v4di) __A,
+      (__v16qi) _mm_setzero_si128 (), __M);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsepi64_epi8 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqb128_mask ((__v2di) __A,
+      (__v16qi) _mm_undefined_si128 (), (__mmask8) -1);
+}
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtsepi64_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovsqb128mem_mask ((unsigned short *) __P, (__v2di) __A, __M);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtsepi64_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqb128_mask ((__v2di) __A,
+      (__v16qi) __O, __M);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtsepi64_epi8 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqb128_mask ((__v2di) __A,
+      (__v16qi) _mm_setzero_si128 (), __M);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtsepi64_epi8 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqb256_mask ((__v4di) __A,
+      (__v16qi) _mm_undefined_si128 (), (__mmask8) -1);
+}
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtsepi64_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovsqb256mem_mask ((unsigned int *) __P, (__v4di) __A, __M);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtsepi64_epi8 (__m128i
__O, __mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovsqb256_mask ((__v4di) __A, + (__v16qi) __O, __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtsepi64_epi8 (__mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovsqb256_mask ((__v4di) __A, + (__v16qi) + _mm_setzero_si128 (), + __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtusepi64_epi8 (__m128i __A) +{ + return (__m128i) __builtin_ia32_pmovusqb128_mask ((__v2di) __A, + (__v16qi) + _mm_undefined_si128 (), + (__mmask8) -1); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtusepi64_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A) +{ + __builtin_ia32_pmovusqb128mem_mask ((unsigned short *) __P, (__v2di) __A, __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtusepi64_epi8 (__m128i __O, __mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovusqb128_mask ((__v2di) __A, + (__v16qi) __O, + __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtusepi64_epi8 (__mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovusqb128_mask ((__v2di) __A, + (__v16qi) + _mm_setzero_si128 (), + __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtusepi64_epi8 (__m256i __A) +{ + return (__m128i) __builtin_ia32_pmovusqb256_mask ((__v4di) __A, + (__v16qi) + _mm_undefined_si128 (), + (__mmask8) -1); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtusepi64_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A) +{ + __builtin_ia32_pmovusqb256mem_mask ((unsigned int *) __P, (__v4di) __A, __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtusepi64_epi8 (__m128i __O, __mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovusqb256_mask ((__v4di) __A, + (__v16qi) __O, + __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtusepi64_epi8 (__mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovusqb256_mask ((__v4di) __A, + (__v16qi) + _mm_setzero_si128 (), + __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi64_epi16 (__m128i __A) +{ + return (__m128i) __builtin_ia32_pmovqw128_mask ((__v2di) __A, + (__v8hi) + _mm_undefined_si128 (), + (__mmask8) -1); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepi64_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A) +{ + __builtin_ia32_pmovqw128mem_mask ((unsigned int *) __P, (__v2di) __A, __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepi64_epi16 (__m128i __O, __mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovqw128_mask ((__v2di) __A, + (__v8hi)__O, + __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtepi64_epi16 (__mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovqw128_mask ((__v2di) __A, + (__v8hi) + _mm_setzero_si128 (), + __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, 
__always_inline__, __artificial__)) +_mm256_cvtepi64_epi16 (__m256i __A) +{ + return (__m128i) __builtin_ia32_pmovqw256_mask ((__v4di) __A, + (__v8hi) + _mm_undefined_si128 (), + (__mmask8) -1); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A) +{ + __builtin_ia32_pmovqw256mem_mask ((unsigned long long *) __P, (__v4di) __A, __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepi64_epi16 (__m128i __O, __mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovqw256_mask ((__v4di) __A, + (__v8hi) __O, __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtepi64_epi16 (__mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovqw256_mask ((__v4di) __A, + (__v8hi) + _mm_setzero_si128 (), + __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsepi64_epi16 (__m128i __A) +{ + return (__m128i) __builtin_ia32_pmovsqw128_mask ((__v2di) __A, + (__v8hi) + _mm_undefined_si128 (), + (__mmask8) -1); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtsepi64_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A) +{ + __builtin_ia32_pmovsqw128mem_mask ((unsigned int *) __P, (__v2di) __A, __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtsepi64_epi16 (__m128i __O, __mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovsqw128_mask ((__v2di) __A, + (__v8hi) __O, __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtsepi64_epi16 (__mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovsqw128_mask ((__v2di) __A, + (__v8hi) + _mm_setzero_si128 (), + __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtsepi64_epi16 (__m256i __A) +{ + return (__m128i) __builtin_ia32_pmovsqw256_mask ((__v4di) __A, + (__v8hi) + _mm_undefined_si128 (), + (__mmask8) -1); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtsepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A) +{ + __builtin_ia32_pmovsqw256mem_mask ((unsigned long long *) __P, (__v4di) __A, __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtsepi64_epi16 (__m128i __O, __mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovsqw256_mask ((__v4di) __A, + (__v8hi) __O, __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtsepi64_epi16 (__mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovsqw256_mask ((__v4di) __A, + (__v8hi) + _mm_setzero_si128 (), + __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtusepi64_epi16 (__m128i __A) +{ + return (__m128i) __builtin_ia32_pmovusqw128_mask ((__v2di) __A, + (__v8hi) + _mm_undefined_si128 (), + (__mmask8) -1); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtusepi64_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A) +{ + __builtin_ia32_pmovusqw128mem_mask ((unsigned int *) __P, (__v2di) __A, __M); +} +extern 
__inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtusepi64_epi16 (__m128i __O, __mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovusqw128_mask ((__v2di) __A, + (__v8hi) __O, __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtusepi64_epi16 (__mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovusqw128_mask ((__v2di) __A, + (__v8hi) + _mm_setzero_si128 (), + __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtusepi64_epi16 (__m256i __A) +{ + return (__m128i) __builtin_ia32_pmovusqw256_mask ((__v4di) __A, + (__v8hi) + _mm_undefined_si128 (), + (__mmask8) -1); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtusepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A) +{ + __builtin_ia32_pmovusqw256mem_mask ((unsigned long long *) __P, (__v4di) __A, __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtusepi64_epi16 (__m128i __O, __mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovusqw256_mask ((__v4di) __A, + (__v8hi) __O, __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtusepi64_epi16 (__mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovusqw256_mask ((__v4di) __A, + (__v8hi) + _mm_setzero_si128 (), + __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi64_epi32 (__m128i __A) +{ + return (__m128i) __builtin_ia32_pmovqd128_mask ((__v2di) __A, + (__v4si) + _mm_undefined_si128 (), + (__mmask8) -1); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepi64_storeu_epi32 (void * __P, __mmask8 __M, __m128i __A) +{ + __builtin_ia32_pmovqd128mem_mask ((unsigned long long *) __P, + (__v2di) __A, __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepi64_epi32 (__m128i __O, __mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovqd128_mask ((__v2di) __A, + (__v4si) __O, __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtepi64_epi32 (__mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovqd128_mask ((__v2di) __A, + (__v4si) + _mm_setzero_si128 (), + __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepi64_epi32 (__m256i __A) +{ + return (__m128i) __builtin_ia32_pmovqd256_mask ((__v4di) __A, + (__v4si) + _mm_undefined_si128 (), + (__mmask8) -1); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepi64_storeu_epi32 (void * __P, __mmask8 __M, __m256i __A) +{ + __builtin_ia32_pmovqd256mem_mask ((__v4si *) __P, (__v4di) __A, __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepi64_epi32 (__m128i __O, __mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovqd256_mask ((__v4di) __A, + (__v4si) __O, __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtepi64_epi32 (__mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovqd256_mask ((__v4di) __A, 
+ (__v4si) + _mm_setzero_si128 (), + __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsepi64_epi32 (__m128i __A) +{ + return (__m128i) __builtin_ia32_pmovsqd128_mask ((__v2di) __A, + (__v4si) + _mm_undefined_si128 (), + (__mmask8) -1); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtsepi64_storeu_epi32 (void * __P, __mmask8 __M, __m128i __A) +{ + __builtin_ia32_pmovsqd128mem_mask ((unsigned long long *) __P, (__v2di) __A, __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtsepi64_epi32 (__m128i __O, __mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovsqd128_mask ((__v2di) __A, + (__v4si) __O, __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtsepi64_epi32 (__mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovsqd128_mask ((__v2di) __A, + (__v4si) + _mm_setzero_si128 (), + __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtsepi64_epi32 (__m256i __A) +{ + return (__m128i) __builtin_ia32_pmovsqd256_mask ((__v4di) __A, + (__v4si) + _mm_undefined_si128 (), + (__mmask8) -1); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtsepi64_storeu_epi32 (void * __P, __mmask8 __M, __m256i __A) +{ + __builtin_ia32_pmovsqd256mem_mask ((__v4si *) __P, (__v4di) __A, __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtsepi64_epi32 (__m128i __O, __mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovsqd256_mask ((__v4di) __A, + (__v4si)__O, + __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtsepi64_epi32 (__mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovsqd256_mask ((__v4di) __A, + (__v4si) + _mm_setzero_si128 (), + __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtusepi64_epi32 (__m128i __A) +{ + return (__m128i) __builtin_ia32_pmovusqd128_mask ((__v2di) __A, + (__v4si) + _mm_undefined_si128 (), + (__mmask8) -1); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtusepi64_storeu_epi32 (void * __P, __mmask8 __M, __m128i __A) +{ + __builtin_ia32_pmovusqd128mem_mask ((unsigned long long *) __P, (__v2di) __A, __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtusepi64_epi32 (__m128i __O, __mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovusqd128_mask ((__v2di) __A, + (__v4si) __O, __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtusepi64_epi32 (__mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovusqd128_mask ((__v2di) __A, + (__v4si) + _mm_setzero_si128 (), + __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtusepi64_epi32 (__m256i __A) +{ + return (__m128i) __builtin_ia32_pmovusqd256_mask ((__v4di) __A, + (__v4si) + _mm_undefined_si128 (), + (__mmask8) -1); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtusepi64_storeu_epi32 (void * __P, __mmask8 __M, 
__m256i __A) +{ + __builtin_ia32_pmovusqd256mem_mask ((__v4si *) __P, (__v4di) __A, __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtusepi64_epi32 (__m128i __O, __mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovusqd256_mask ((__v4di) __A, + (__v4si) __O, __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtusepi64_epi32 (__mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovusqd256_mask ((__v4di) __A, + (__v4si) + _mm_setzero_si128 (), + __M); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_broadcastss_ps (__m256 __O, __mmask8 __M, __m128 __A) +{ + return (__m256) __builtin_ia32_broadcastss256_mask ((__v4sf) __A, + (__v8sf) __O, + __M); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_broadcastss_ps (__mmask8 __M, __m128 __A) +{ + return (__m256) __builtin_ia32_broadcastss256_mask ((__v4sf) __A, + (__v8sf) + _mm256_setzero_ps (), + __M); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_broadcastss_ps (__m128 __O, __mmask8 __M, __m128 __A) +{ + return (__m128) __builtin_ia32_broadcastss128_mask ((__v4sf) __A, + (__v4sf) __O, + __M); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_broadcastss_ps (__mmask8 __M, __m128 __A) +{ + return (__m128) __builtin_ia32_broadcastss128_mask ((__v4sf) __A, + (__v4sf) + _mm_setzero_ps (), + __M); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_broadcastsd_pd (__m256d __O, __mmask8 __M, __m128d __A) +{ + return (__m256d) __builtin_ia32_broadcastsd256_mask ((__v2df) __A, + (__v4df) __O, + __M); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_broadcastsd_pd (__mmask8 __M, __m128d __A) +{ + return (__m256d) __builtin_ia32_broadcastsd256_mask ((__v2df) __A, + (__v4df) + _mm256_setzero_pd (), + __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_broadcastd_epi32 (__m256i __O, __mmask8 __M, __m128i __A) +{ + return (__m256i) __builtin_ia32_pbroadcastd256_mask ((__v4si) __A, + (__v8si) __O, + __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_broadcastd_epi32 (__mmask8 __M, __m128i __A) +{ + return (__m256i) __builtin_ia32_pbroadcastd256_mask ((__v4si) __A, + (__v8si) + _mm256_setzero_si256 (), + __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_set1_epi32 (__m256i __O, __mmask8 __M, int __A) +{ + return (__m256i) __builtin_ia32_pbroadcastd256_gpr_mask (__A, (__v8si) __O, + __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_set1_epi32 (__mmask8 __M, int __A) +{ + return (__m256i) __builtin_ia32_pbroadcastd256_gpr_mask (__A, + (__v8si) + _mm256_setzero_si256 (), + __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_broadcastd_epi32 (__m128i __O, __mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pbroadcastd128_mask ((__v4si) __A, + (__v4si) __O, + __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, 
__always_inline__, __artificial__)) +_mm_maskz_broadcastd_epi32 (__mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pbroadcastd128_mask ((__v4si) __A, + (__v4si) + _mm_setzero_si128 (), + __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_set1_epi32 (__m128i __O, __mmask8 __M, int __A) +{ + return (__m128i) __builtin_ia32_pbroadcastd128_gpr_mask (__A, (__v4si) __O, + __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_set1_epi32 (__mmask8 __M, int __A) +{ + return (__m128i) + __builtin_ia32_pbroadcastd128_gpr_mask (__A, + (__v4si) _mm_setzero_si128 (), + __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_broadcastq_epi64 (__m256i __O, __mmask8 __M, __m128i __A) +{ + return (__m256i) __builtin_ia32_pbroadcastq256_mask ((__v2di) __A, + (__v4di) __O, + __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A) +{ + return (__m256i) __builtin_ia32_pbroadcastq256_mask ((__v2di) __A, + (__v4di) + _mm256_setzero_si256 (), + __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_set1_epi64 (__m256i __O, __mmask8 __M, long long __A) +{ + return (__m256i) __builtin_ia32_pbroadcastq256_gpr_mask (__A, (__v4di) __O, + __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_set1_epi64 (__mmask8 __M, long long __A) +{ + return (__m256i) __builtin_ia32_pbroadcastq256_gpr_mask (__A, + (__v4di) + _mm256_setzero_si256 (), + __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_broadcastq_epi64 (__m128i __O, __mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pbroadcastq128_mask ((__v2di) __A, + (__v2di) __O, + __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pbroadcastq128_mask ((__v2di) __A, + (__v2di) + _mm_setzero_si128 (), + __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_set1_epi64 (__m128i __O, __mmask8 __M, long long __A) +{ + return (__m128i) __builtin_ia32_pbroadcastq128_gpr_mask (__A, (__v2di) __O, + __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_set1_epi64 (__mmask8 __M, long long __A) +{ + return (__m128i) + __builtin_ia32_pbroadcastq128_gpr_mask (__A, + (__v2di) _mm_setzero_si128 (), + __M); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_broadcast_f32x4 (__m128 __A) +{ + return (__m256) __builtin_ia32_broadcastf32x4_256_mask ((__v4sf) __A, + (__v8sf)_mm256_undefined_pd (), + (__mmask8) -1); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_broadcast_f32x4 (__m256 __O, __mmask8 __M, __m128 __A) +{ + return (__m256) __builtin_ia32_broadcastf32x4_256_mask ((__v4sf) __A, + (__v8sf) __O, + __M); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_broadcast_f32x4 (__mmask8 __M, __m128 __A) +{ + return (__m256) __builtin_ia32_broadcastf32x4_256_mask ((__v4sf) __A, + 
(__v8sf) + _mm256_setzero_ps (), + __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_broadcast_i32x4 (__m128i __A) +{ + return (__m256i) __builtin_ia32_broadcasti32x4_256_mask ((__v4si) + __A, + (__v8si)_mm256_undefined_si256 (), + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_broadcast_i32x4 (__m256i __O, __mmask8 __M, __m128i __A) +{ + return (__m256i) __builtin_ia32_broadcasti32x4_256_mask ((__v4si) + __A, + (__v8si) + __O, __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_broadcast_i32x4 (__mmask8 __M, __m128i __A) +{ + return (__m256i) __builtin_ia32_broadcasti32x4_256_mask ((__v4si) + __A, + (__v8si) + _mm256_setzero_si256 (), + __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepi8_epi32 (__m256i __W, __mmask8 __U, __m128i __A) +{ + return (__m256i) __builtin_ia32_pmovsxbd256_mask ((__v16qi) __A, + (__v8si) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtepi8_epi32 (__mmask8 __U, __m128i __A) +{ + return (__m256i) __builtin_ia32_pmovsxbd256_mask ((__v16qi) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepi8_epi32 (__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovsxbd128_mask ((__v16qi) __A, + (__v4si) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtepi8_epi32 (__mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovsxbd128_mask ((__v16qi) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepi8_epi64 (__m256i __W, __mmask8 __U, __m128i __A) +{ + return (__m256i) __builtin_ia32_pmovsxbq256_mask ((__v16qi) __A, + (__v4di) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtepi8_epi64 (__mmask8 __U, __m128i __A) +{ + return (__m256i) __builtin_ia32_pmovsxbq256_mask ((__v16qi) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepi8_epi64 (__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovsxbq128_mask ((__v16qi) __A, + (__v2di) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtepi8_epi64 (__mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovsxbq128_mask ((__v16qi) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepi16_epi32 (__m256i __W, __mmask8 __U, __m128i __A) +{ + return (__m256i) __builtin_ia32_pmovsxwd256_mask ((__v8hi) __A, + (__v8si) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtepi16_epi32 (__mmask8 __U, __m128i __A) +{ + return (__m256i) __builtin_ia32_pmovsxwd256_mask ((__v8hi) __A, + (__v8si) + 
_mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepi16_epi32 (__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovsxwd128_mask ((__v8hi) __A, + (__v4si) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtepi16_epi32 (__mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovsxwd128_mask ((__v8hi) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepi16_epi64 (__m256i __W, __mmask8 __U, __m128i __A) +{ + return (__m256i) __builtin_ia32_pmovsxwq256_mask ((__v8hi) __A, + (__v4di) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtepi16_epi64 (__mmask8 __U, __m128i __A) +{ + return (__m256i) __builtin_ia32_pmovsxwq256_mask ((__v8hi) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepi16_epi64 (__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovsxwq128_mask ((__v8hi) __A, + (__v2di) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtepi16_epi64 (__mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovsxwq128_mask ((__v8hi) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepi32_epi64 (__m256i __W, __mmask8 __U, __m128i __X) +{ + return (__m256i) __builtin_ia32_pmovsxdq256_mask ((__v4si) __X, + (__v4di) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtepi32_epi64 (__mmask8 __U, __m128i __X) +{ + return (__m256i) __builtin_ia32_pmovsxdq256_mask ((__v4si) __X, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepi32_epi64 (__m128i __W, __mmask8 __U, __m128i __X) +{ + return (__m128i) __builtin_ia32_pmovsxdq128_mask ((__v4si) __X, + (__v2di) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtepi32_epi64 (__mmask8 __U, __m128i __X) +{ + return (__m128i) __builtin_ia32_pmovsxdq128_mask ((__v4si) __X, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepu8_epi32 (__m256i __W, __mmask8 __U, __m128i __A) +{ + return (__m256i) __builtin_ia32_pmovzxbd256_mask ((__v16qi) __A, + (__v8si) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtepu8_epi32 (__mmask8 __U, __m128i __A) +{ + return (__m256i) __builtin_ia32_pmovzxbd256_mask ((__v16qi) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepu8_epi32 (__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovzxbd128_mask ((__v16qi) __A, + 
(__v4si) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtepu8_epi32 (__mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovzxbd128_mask ((__v16qi) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepu8_epi64 (__m256i __W, __mmask8 __U, __m128i __A) +{ + return (__m256i) __builtin_ia32_pmovzxbq256_mask ((__v16qi) __A, + (__v4di) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtepu8_epi64 (__mmask8 __U, __m128i __A) +{ + return (__m256i) __builtin_ia32_pmovzxbq256_mask ((__v16qi) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepu8_epi64 (__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovzxbq128_mask ((__v16qi) __A, + (__v2di) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtepu8_epi64 (__mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovzxbq128_mask ((__v16qi) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepu16_epi32 (__m256i __W, __mmask8 __U, __m128i __A) +{ + return (__m256i) __builtin_ia32_pmovzxwd256_mask ((__v8hi) __A, + (__v8si) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtepu16_epi32 (__mmask8 __U, __m128i __A) +{ + return (__m256i) __builtin_ia32_pmovzxwd256_mask ((__v8hi) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepu16_epi32 (__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovzxwd128_mask ((__v8hi) __A, + (__v4si) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtepu16_epi32 (__mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovzxwd128_mask ((__v8hi) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepu16_epi64 (__m256i __W, __mmask8 __U, __m128i __A) +{ + return (__m256i) __builtin_ia32_pmovzxwq256_mask ((__v8hi) __A, + (__v4di) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtepu16_epi64 (__mmask8 __U, __m128i __A) +{ + return (__m256i) __builtin_ia32_pmovzxwq256_mask ((__v8hi) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepu16_epi64 (__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovzxwq128_mask ((__v8hi) __A, + (__v2di) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtepu16_epi64 (__mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovzxwq128_mask ((__v8hi) __A, + (__v2di) + 
_mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepu32_epi64 (__m256i __W, __mmask8 __U, __m128i __X) +{ + return (__m256i) __builtin_ia32_pmovzxdq256_mask ((__v4si) __X, + (__v4di) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtepu32_epi64 (__mmask8 __U, __m128i __X) +{ + return (__m256i) __builtin_ia32_pmovzxdq256_mask ((__v4si) __X, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepu32_epi64 (__m128i __W, __mmask8 __U, __m128i __X) +{ + return (__m128i) __builtin_ia32_pmovzxdq128_mask ((__v4si) __X, + (__v2di) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtepu32_epi64 (__mmask8 __U, __m128i __X) +{ + return (__m128i) __builtin_ia32_pmovzxdq128_mask ((__v4si) __X, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_rcp14_pd (__m256d __A) +{ + return (__m256d) __builtin_ia32_rcp14pd256_mask ((__v4df) __A, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) -1); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_rcp14_pd (__m256d __W, __mmask8 __U, __m256d __A) +{ + return (__m256d) __builtin_ia32_rcp14pd256_mask ((__v4df) __A, + (__v4df) __W, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_rcp14_pd (__mmask8 __U, __m256d __A) +{ + return (__m256d) __builtin_ia32_rcp14pd256_mask ((__v4df) __A, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rcp14_pd (__m128d __A) +{ + return (__m128d) __builtin_ia32_rcp14pd128_mask ((__v2df) __A, + (__v2df) + _mm_setzero_pd (), + (__mmask8) -1); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_rcp14_pd (__m128d __W, __mmask8 __U, __m128d __A) +{ + return (__m128d) __builtin_ia32_rcp14pd128_mask ((__v2df) __A, + (__v2df) __W, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_rcp14_pd (__mmask8 __U, __m128d __A) +{ + return (__m128d) __builtin_ia32_rcp14pd128_mask ((__v2df) __A, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_rcp14_ps (__m256 __A) +{ + return (__m256) __builtin_ia32_rcp14ps256_mask ((__v8sf) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) -1); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_rcp14_ps (__m256 __W, __mmask8 __U, __m256 __A) +{ + return (__m256) __builtin_ia32_rcp14ps256_mask ((__v8sf) __A, + (__v8sf) __W, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_rcp14_ps (__mmask8 __U, __m256 __A) +{ + return (__m256) __builtin_ia32_rcp14ps256_mask ((__v8sf) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm_rcp14_ps (__m128 __A) +{ + return (__m128) __builtin_ia32_rcp14ps128_mask ((__v4sf) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) -1); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_rcp14_ps (__m128 __W, __mmask8 __U, __m128 __A) +{ + return (__m128) __builtin_ia32_rcp14ps128_mask ((__v4sf) __A, + (__v4sf) __W, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_rcp14_ps (__mmask8 __U, __m128 __A) +{ + return (__m128) __builtin_ia32_rcp14ps128_mask ((__v4sf) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_rsqrt14_pd (__m256d __A) +{ + return (__m256d) __builtin_ia32_rsqrt14pd256_mask ((__v4df) __A, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) -1); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_rsqrt14_pd (__m256d __W, __mmask8 __U, __m256d __A) +{ + return (__m256d) __builtin_ia32_rsqrt14pd256_mask ((__v4df) __A, + (__v4df) __W, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_rsqrt14_pd (__mmask8 __U, __m256d __A) +{ + return (__m256d) __builtin_ia32_rsqrt14pd256_mask ((__v4df) __A, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rsqrt14_pd (__m128d __A) +{ + return (__m128d) __builtin_ia32_rsqrt14pd128_mask ((__v2df) __A, + (__v2df) + _mm_setzero_pd (), + (__mmask8) -1); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_rsqrt14_pd (__m128d __W, __mmask8 __U, __m128d __A) +{ + return (__m128d) __builtin_ia32_rsqrt14pd128_mask ((__v2df) __A, + (__v2df) __W, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_rsqrt14_pd (__mmask8 __U, __m128d __A) +{ + return (__m128d) __builtin_ia32_rsqrt14pd128_mask ((__v2df) __A, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_rsqrt14_ps (__m256 __A) +{ + return (__m256) __builtin_ia32_rsqrt14ps256_mask ((__v8sf) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) -1); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_rsqrt14_ps (__m256 __W, __mmask8 __U, __m256 __A) +{ + return (__m256) __builtin_ia32_rsqrt14ps256_mask ((__v8sf) __A, + (__v8sf) __W, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_rsqrt14_ps (__mmask8 __U, __m256 __A) +{ + return (__m256) __builtin_ia32_rsqrt14ps256_mask ((__v8sf) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rsqrt14_ps (__m128 __A) +{ + return (__m128) __builtin_ia32_rsqrt14ps128_mask ((__v4sf) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) -1); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_rsqrt14_ps (__m128 __W, __mmask8 __U, __m128 __A) +{ + return (__m128) __builtin_ia32_rsqrt14ps128_mask ((__v4sf) __A, + (__v4sf) __W, + (__mmask8) __U); +} +extern __inline __m128 
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_rsqrt14_ps (__mmask8 __U, __m128 __A) +{ + return (__m128) __builtin_ia32_rsqrt14ps128_mask ((__v4sf) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_sqrt_pd (__m256d __W, __mmask8 __U, __m256d __A) +{ + return (__m256d) __builtin_ia32_sqrtpd256_mask ((__v4df) __A, + (__v4df) __W, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_sqrt_pd (__mmask8 __U, __m256d __A) +{ + return (__m256d) __builtin_ia32_sqrtpd256_mask ((__v4df) __A, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sqrt_pd (__m128d __W, __mmask8 __U, __m128d __A) +{ + return (__m128d) __builtin_ia32_sqrtpd128_mask ((__v2df) __A, + (__v2df) __W, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sqrt_pd (__mmask8 __U, __m128d __A) +{ + return (__m128d) __builtin_ia32_sqrtpd128_mask ((__v2df) __A, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_sqrt_ps (__m256 __W, __mmask8 __U, __m256 __A) +{ + return (__m256) __builtin_ia32_sqrtps256_mask ((__v8sf) __A, + (__v8sf) __W, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_sqrt_ps (__mmask8 __U, __m256 __A) +{ + return (__m256) __builtin_ia32_sqrtps256_mask ((__v8sf) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sqrt_ps (__m128 __W, __mmask8 __U, __m128 __A) +{ + return (__m128) __builtin_ia32_sqrtps128_mask ((__v4sf) __A, + (__v4sf) __W, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sqrt_ps (__mmask8 __U, __m128 __A) +{ + return (__m128) __builtin_ia32_sqrtps128_mask ((__v4sf) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_add_epi32 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_paddd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_add_epi32 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_paddd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_add_epi64 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_paddq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_add_epi64 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_paddq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ 
((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_sub_epi32 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_psubd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_sub_epi32 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_psubd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_sub_epi64 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_psubq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_sub_epi64 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_psubq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_add_epi32 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_paddd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_add_epi32 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_paddd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_add_epi64 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_paddq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_add_epi64 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_paddq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sub_epi32 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_psubd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sub_epi32 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_psubd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sub_epi64 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_psubq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sub_epi64 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_psubq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, 
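/* The masked add/sub forms turn per-lane conditional arithmetic into a
 * single instruction instead of a compare-and-blend sequence. A sketch
 * of a conditional accumulate (hypothetical values, AVX512VL assumed):
 *
 *   __m256i acc  = _mm256_set1_epi32(100);
 *   __m256i inc  = _mm256_set1_epi32(1);
 *   __mmask8 live = 0x0F;                     // update lanes 0..3 only
 *   acc = _mm256_mask_add_epi32(acc, live, acc, inc);
 *   // lanes 0..3 -> 101, lanes 4..7 stay 100
 */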
__always_inline__, __artificial__)) +_mm256_getexp_ps (__m256 __A) +{ + return (__m256) __builtin_ia32_getexpps256_mask ((__v8sf) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) -1); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_getexp_ps (__m256 __W, __mmask8 __U, __m256 __A) +{ + return (__m256) __builtin_ia32_getexpps256_mask ((__v8sf) __A, + (__v8sf) __W, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_getexp_ps (__mmask8 __U, __m256 __A) +{ + return (__m256) __builtin_ia32_getexpps256_mask ((__v8sf) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_getexp_pd (__m256d __A) +{ + return (__m256d) __builtin_ia32_getexppd256_mask ((__v4df) __A, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) -1); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_getexp_pd (__m256d __W, __mmask8 __U, __m256d __A) +{ + return (__m256d) __builtin_ia32_getexppd256_mask ((__v4df) __A, + (__v4df) __W, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_getexp_pd (__mmask8 __U, __m256d __A) +{ + return (__m256d) __builtin_ia32_getexppd256_mask ((__v4df) __A, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_getexp_ps (__m128 __A) +{ + return (__m128) __builtin_ia32_getexpps128_mask ((__v4sf) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) -1); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_getexp_ps (__m128 __W, __mmask8 __U, __m128 __A) +{ + return (__m128) __builtin_ia32_getexpps128_mask ((__v4sf) __A, + (__v4sf) __W, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_getexp_ps (__mmask8 __U, __m128 __A) +{ + return (__m128) __builtin_ia32_getexpps128_mask ((__v4sf) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_getexp_pd (__m128d __A) +{ + return (__m128d) __builtin_ia32_getexppd128_mask ((__v2df) __A, + (__v2df) + _mm_setzero_pd (), + (__mmask8) -1); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_getexp_pd (__m128d __W, __mmask8 __U, __m128d __A) +{ + return (__m128d) __builtin_ia32_getexppd128_mask ((__v2df) __A, + (__v2df) __W, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_getexp_pd (__mmask8 __U, __m128d __A) +{ + return (__m128d) __builtin_ia32_getexppd128_mask ((__v2df) __A, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_srl_epi32 (__m256i __W, __mmask8 __U, __m256i __A, + __m128i __B) +{ + return (__m256i) __builtin_ia32_psrld256_mask ((__v8si) __A, + (__v4si) __B, + (__v8si) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_srl_epi32 (__mmask8 __U, __m256i __A, __m128i __B) +{ + return (__m256i) __builtin_ia32_psrld256_mask 
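/* getexp extracts floor(log2(|x|)) of each lane, returned as a float.
 * A sketch with hypothetical values (AVX512VL assumed):
 *
 *   __m256 v = _mm256_set1_ps(48.0f);         // 48 = 1.5 * 2^5
 *   __m256 e = _mm256_getexp_ps(v);           // 5.0f in every lane
 */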
((__v8si) __A, + (__v4si) __B, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_srl_epi32 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_psrld128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_srl_epi32 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_psrld128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_srl_epi64 (__m256i __W, __mmask8 __U, __m256i __A, + __m128i __B) +{ + return (__m256i) __builtin_ia32_psrlq256_mask ((__v4di) __A, + (__v2di) __B, + (__v4di) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_srl_epi64 (__mmask8 __U, __m256i __A, __m128i __B) +{ + return (__m256i) __builtin_ia32_psrlq256_mask ((__v4di) __A, + (__v2di) __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_srl_epi64 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_psrlq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_srl_epi64 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_psrlq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_and_epi32 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pandd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_and_epi32 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pandd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_scalef_pd (__m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_scalefpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) -1); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_scalef_pd (__m256d __W, __mmask8 __U, __m256d __A, + __m256d __B) +{ + return (__m256d) __builtin_ia32_scalefpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) __W, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_scalef_pd (__mmask8 __U, __m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_scalefpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_scalef_ps (__m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_scalefps256_mask ((__v8sf) __A, + (__v8sf) __B, + 
(__v8sf) + _mm256_setzero_ps (), + (__mmask8) -1); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_scalef_ps (__m256 __W, __mmask8 __U, __m256 __A, + __m256 __B) +{ + return (__m256) __builtin_ia32_scalefps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __W, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_scalefps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_scalef_pd (__m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_scalefpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) -1); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_scalef_pd (__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B) +{ + return (__m128d) __builtin_ia32_scalefpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_scalef_pd (__mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_scalefpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_scalef_ps (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_scalefps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) -1); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_scalef_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_scalefps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_scalef_ps (__mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_scalefps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fmadd_pd (__m256d __A, __mmask8 __U, __m256d __B, + __m256d __C) +{ + return (__m256d) __builtin_ia32_vfmaddpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) __C, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask3_fmadd_pd (__m256d __A, __m256d __B, __m256d __C, + __mmask8 __U) +{ + return (__m256d) __builtin_ia32_vfmaddpd256_mask3 ((__v4df) __A, + (__v4df) __B, + (__v4df) __C, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_fmadd_pd (__mmask8 __U, __m256d __A, __m256d __B, + __m256d __C) +{ + return (__m256d) __builtin_ia32_vfmaddpd256_maskz ((__v4df) __A, + (__v4df) __B, + (__v4df) __C, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fmadd_pd (__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) +{ + return (__m128d) __builtin_ia32_vfmaddpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) __C, + (__mmask8) __U); +} +extern __inline __m128d 
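/* scalef multiplies each lane by two raised to the floor of the
 * corresponding lane of the second operand, i.e. x * 2^floor(y), which
 * behaves as a vectorized ldexp. Sketch (hypothetical values, AVX512VL
 * assumed):
 *
 *   __m256d x = _mm256_set1_pd(3.0);
 *   __m256d k = _mm256_set1_pd(4.0);
 *   __m256d y = _mm256_scalef_pd(x, k);       // 3.0 * 2^4 = 48.0 per lane
 */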
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fmadd_pd (__m128d __A, __m128d __B, __m128d __C, + __mmask8 __U) +{ + return (__m128d) __builtin_ia32_vfmaddpd128_mask3 ((__v2df) __A, + (__v2df) __B, + (__v2df) __C, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fmadd_pd (__mmask8 __U, __m128d __A, __m128d __B, + __m128d __C) +{ + return (__m128d) __builtin_ia32_vfmaddpd128_maskz ((__v2df) __A, + (__v2df) __B, + (__v2df) __C, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fmadd_ps (__m256 __A, __mmask8 __U, __m256 __B, __m256 __C) +{ + return (__m256) __builtin_ia32_vfmaddps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask3_fmadd_ps (__m256 __A, __m256 __B, __m256 __C, + __mmask8 __U) +{ + return (__m256) __builtin_ia32_vfmaddps256_mask3 ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_fmadd_ps (__mmask8 __U, __m256 __A, __m256 __B, + __m256 __C) +{ + return (__m256) __builtin_ia32_vfmaddps256_maskz ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fmadd_ps (__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_vfmaddps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fmadd_ps (__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) +{ + return (__m128) __builtin_ia32_vfmaddps128_mask3 ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fmadd_ps (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_vfmaddps128_maskz ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fmsub_pd (__m256d __A, __mmask8 __U, __m256d __B, + __m256d __C) +{ + return (__m256d) __builtin_ia32_vfmsubpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) __C, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask3_fmsub_pd (__m256d __A, __m256d __B, __m256d __C, + __mmask8 __U) +{ + return (__m256d) __builtin_ia32_vfmsubpd256_mask3 ((__v4df) __A, + (__v4df) __B, + (__v4df) __C, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_fmsub_pd (__mmask8 __U, __m256d __A, __m256d __B, + __m256d __C) +{ + return (__m256d) __builtin_ia32_vfmsubpd256_maskz ((__v4df) __A, + (__v4df) __B, + (__v4df) __C, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fmsub_pd (__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) +{ + return (__m128d) __builtin_ia32_vfmsubpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) __C, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, 
__artificial__)) +_mm_mask3_fmsub_pd (__m128d __A, __m128d __B, __m128d __C, + __mmask8 __U) +{ + return (__m128d) __builtin_ia32_vfmsubpd128_mask3 ((__v2df) __A, + (__v2df) __B, + (__v2df) __C, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fmsub_pd (__mmask8 __U, __m128d __A, __m128d __B, + __m128d __C) +{ + return (__m128d) __builtin_ia32_vfmsubpd128_maskz ((__v2df) __A, + (__v2df) __B, + (__v2df) __C, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fmsub_ps (__m256 __A, __mmask8 __U, __m256 __B, __m256 __C) +{ + return (__m256) __builtin_ia32_vfmsubps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask3_fmsub_ps (__m256 __A, __m256 __B, __m256 __C, + __mmask8 __U) +{ + return (__m256) __builtin_ia32_vfmsubps256_mask3 ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_fmsub_ps (__mmask8 __U, __m256 __A, __m256 __B, + __m256 __C) +{ + return (__m256) __builtin_ia32_vfmsubps256_maskz ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fmsub_ps (__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_vfmsubps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fmsub_ps (__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) +{ + return (__m128) __builtin_ia32_vfmsubps128_mask3 ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fmsub_ps (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_vfmsubps128_maskz ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fmaddsub_pd (__m256d __A, __mmask8 __U, __m256d __B, + __m256d __C) +{ + return (__m256d) __builtin_ia32_vfmaddsubpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) __C, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask3_fmaddsub_pd (__m256d __A, __m256d __B, __m256d __C, + __mmask8 __U) +{ + return (__m256d) __builtin_ia32_vfmaddsubpd256_mask3 ((__v4df) __A, + (__v4df) __B, + (__v4df) __C, + (__mmask8) + __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_fmaddsub_pd (__mmask8 __U, __m256d __A, __m256d __B, + __m256d __C) +{ + return (__m256d) __builtin_ia32_vfmaddsubpd256_maskz ((__v4df) __A, + (__v4df) __B, + (__v4df) __C, + (__mmask8) + __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fmaddsub_pd (__m128d __A, __mmask8 __U, __m128d __B, + __m128d __C) +{ + return (__m128d) __builtin_ia32_vfmaddsubpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) __C, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm_mask3_fmaddsub_pd (__m128d __A, __m128d __B, __m128d __C, + __mmask8 __U) +{ + return (__m128d) __builtin_ia32_vfmaddsubpd128_mask3 ((__v2df) __A, + (__v2df) __B, + (__v2df) __C, + (__mmask8) + __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fmaddsub_pd (__mmask8 __U, __m128d __A, __m128d __B, + __m128d __C) +{ + return (__m128d) __builtin_ia32_vfmaddsubpd128_maskz ((__v2df) __A, + (__v2df) __B, + (__v2df) __C, + (__mmask8) + __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fmaddsub_ps (__m256 __A, __mmask8 __U, __m256 __B, + __m256 __C) +{ + return (__m256) __builtin_ia32_vfmaddsubps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask3_fmaddsub_ps (__m256 __A, __m256 __B, __m256 __C, + __mmask8 __U) +{ + return (__m256) __builtin_ia32_vfmaddsubps256_mask3 ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_fmaddsub_ps (__mmask8 __U, __m256 __A, __m256 __B, + __m256 __C) +{ + return (__m256) __builtin_ia32_vfmaddsubps256_maskz ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fmaddsub_ps (__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_vfmaddsubps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fmaddsub_ps (__m128 __A, __m128 __B, __m128 __C, + __mmask8 __U) +{ + return (__m128) __builtin_ia32_vfmaddsubps128_mask3 ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fmaddsub_ps (__mmask8 __U, __m128 __A, __m128 __B, + __m128 __C) +{ + return (__m128) __builtin_ia32_vfmaddsubps128_maskz ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fmsubadd_pd (__m256d __A, __mmask8 __U, __m256d __B, + __m256d __C) +{ + return (__m256d) __builtin_ia32_vfmaddsubpd256_mask ((__v4df) __A, + (__v4df) __B, + -(__v4df) __C, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask3_fmsubadd_pd (__m256d __A, __m256d __B, __m256d __C, + __mmask8 __U) +{ + return (__m256d) __builtin_ia32_vfmsubaddpd256_mask3 ((__v4df) __A, + (__v4df) __B, + (__v4df) __C, + (__mmask8) + __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_fmsubadd_pd (__mmask8 __U, __m256d __A, __m256d __B, + __m256d __C) +{ + return (__m256d) __builtin_ia32_vfmaddsubpd256_maskz ((__v4df) __A, + (__v4df) __B, + -(__v4df) __C, + (__mmask8) + __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fmsubadd_pd (__m128d __A, __mmask8 __U, __m128d __B, + __m128d __C) +{ + return (__m128d) __builtin_ia32_vfmaddsubpd128_mask ((__v2df) __A, + (__v2df) __B, + -(__v2df) __C, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, 
__always_inline__, __artificial__)) +_mm_mask3_fmsubadd_pd (__m128d __A, __m128d __B, __m128d __C, + __mmask8 __U) +{ + return (__m128d) __builtin_ia32_vfmsubaddpd128_mask3 ((__v2df) __A, + (__v2df) __B, + (__v2df) __C, + (__mmask8) + __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fmsubadd_pd (__mmask8 __U, __m128d __A, __m128d __B, + __m128d __C) +{ + return (__m128d) __builtin_ia32_vfmaddsubpd128_maskz ((__v2df) __A, + (__v2df) __B, + -(__v2df) __C, + (__mmask8) + __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fmsubadd_ps (__m256 __A, __mmask8 __U, __m256 __B, + __m256 __C) +{ + return (__m256) __builtin_ia32_vfmaddsubps256_mask ((__v8sf) __A, + (__v8sf) __B, + -(__v8sf) __C, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask3_fmsubadd_ps (__m256 __A, __m256 __B, __m256 __C, + __mmask8 __U) +{ + return (__m256) __builtin_ia32_vfmsubaddps256_mask3 ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_fmsubadd_ps (__mmask8 __U, __m256 __A, __m256 __B, + __m256 __C) +{ + return (__m256) __builtin_ia32_vfmaddsubps256_maskz ((__v8sf) __A, + (__v8sf) __B, + -(__v8sf) __C, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fmsubadd_ps (__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_vfmaddsubps128_mask ((__v4sf) __A, + (__v4sf) __B, + -(__v4sf) __C, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fmsubadd_ps (__m128 __A, __m128 __B, __m128 __C, + __mmask8 __U) +{ + return (__m128) __builtin_ia32_vfmsubaddps128_mask3 ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fmsubadd_ps (__mmask8 __U, __m128 __A, __m128 __B, + __m128 __C) +{ + return (__m128) __builtin_ia32_vfmaddsubps128_maskz ((__v4sf) __A, + (__v4sf) __B, + -(__v4sf) __C, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fnmadd_pd (__m256d __A, __mmask8 __U, __m256d __B, + __m256d __C) +{ + return (__m256d) __builtin_ia32_vfnmaddpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) __C, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask3_fnmadd_pd (__m256d __A, __m256d __B, __m256d __C, + __mmask8 __U) +{ + return (__m256d) __builtin_ia32_vfnmaddpd256_mask3 ((__v4df) __A, + (__v4df) __B, + (__v4df) __C, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_fnmadd_pd (__mmask8 __U, __m256d __A, __m256d __B, + __m256d __C) +{ + return (__m256d) __builtin_ia32_vfnmaddpd256_maskz ((__v4df) __A, + (__v4df) __B, + (__v4df) __C, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fnmadd_pd (__m128d __A, __mmask8 __U, __m128d __B, + __m128d __C) +{ + return (__m128d) __builtin_ia32_vfnmaddpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) __C, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ 
((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fnmadd_pd (__m128d __A, __m128d __B, __m128d __C, + __mmask8 __U) +{ + return (__m128d) __builtin_ia32_vfnmaddpd128_mask3 ((__v2df) __A, + (__v2df) __B, + (__v2df) __C, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fnmadd_pd (__mmask8 __U, __m128d __A, __m128d __B, + __m128d __C) +{ + return (__m128d) __builtin_ia32_vfnmaddpd128_maskz ((__v2df) __A, + (__v2df) __B, + (__v2df) __C, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fnmadd_ps (__m256 __A, __mmask8 __U, __m256 __B, + __m256 __C) +{ + return (__m256) __builtin_ia32_vfnmaddps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask3_fnmadd_ps (__m256 __A, __m256 __B, __m256 __C, + __mmask8 __U) +{ + return (__m256) __builtin_ia32_vfnmaddps256_mask3 ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_fnmadd_ps (__mmask8 __U, __m256 __A, __m256 __B, + __m256 __C) +{ + return (__m256) __builtin_ia32_vfnmaddps256_maskz ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fnmadd_ps (__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_vfnmaddps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fnmadd_ps (__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) +{ + return (__m128) __builtin_ia32_vfnmaddps128_mask3 ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fnmadd_ps (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_vfnmaddps128_maskz ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fnmsub_pd (__m256d __A, __mmask8 __U, __m256d __B, + __m256d __C) +{ + return (__m256d) __builtin_ia32_vfnmsubpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) __C, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask3_fnmsub_pd (__m256d __A, __m256d __B, __m256d __C, + __mmask8 __U) +{ + return (__m256d) __builtin_ia32_vfnmsubpd256_mask3 ((__v4df) __A, + (__v4df) __B, + (__v4df) __C, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_fnmsub_pd (__mmask8 __U, __m256d __A, __m256d __B, + __m256d __C) +{ + return (__m256d) __builtin_ia32_vfnmsubpd256_maskz ((__v4df) __A, + (__v4df) __B, + (__v4df) __C, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fnmsub_pd (__m128d __A, __mmask8 __U, __m128d __B, + __m128d __C) +{ + return (__m128d) __builtin_ia32_vfnmsubpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) __C, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, 
__always_inline__, __artificial__)) +_mm_mask3_fnmsub_pd (__m128d __A, __m128d __B, __m128d __C, + __mmask8 __U) +{ + return (__m128d) __builtin_ia32_vfnmsubpd128_mask3 ((__v2df) __A, + (__v2df) __B, + (__v2df) __C, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fnmsub_pd (__mmask8 __U, __m128d __A, __m128d __B, + __m128d __C) +{ + return (__m128d) __builtin_ia32_vfnmsubpd128_maskz ((__v2df) __A, + (__v2df) __B, + (__v2df) __C, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fnmsub_ps (__m256 __A, __mmask8 __U, __m256 __B, + __m256 __C) +{ + return (__m256) __builtin_ia32_vfnmsubps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask3_fnmsub_ps (__m256 __A, __m256 __B, __m256 __C, + __mmask8 __U) +{ + return (__m256) __builtin_ia32_vfnmsubps256_mask3 ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_fnmsub_ps (__mmask8 __U, __m256 __A, __m256 __B, + __m256 __C) +{ + return (__m256) __builtin_ia32_vfnmsubps256_maskz ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fnmsub_ps (__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_vfnmsubps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fnmsub_ps (__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) +{ + return (__m128) __builtin_ia32_vfnmsubps128_mask3 ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fnmsub_ps (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_vfnmsubps128_maskz ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_and_epi32 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pandd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_and_epi32 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pandd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_andnot_epi32 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pandnd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_andnot_epi32 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pandnd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, 
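/* The three masked FMA shapes above differ in which operand survives in
 * the masked-off lanes: _mask keeps __A, _mask3 keeps __C, and _maskz
 * writes zero. Sketch (m, a, b, c hypothetical; AVX512VL assumed):
 *
 *   __mmask8 m = 0x3;
 *   __m256d r1 = _mm256_mask_fmadd_pd(a, m, b, c);   // m ? a*b+c : a
 *   __m256d r2 = _mm256_mask3_fmadd_pd(a, b, c, m);  // m ? a*b+c : c
 *   __m256d r3 = _mm256_maskz_fmadd_pd(m, a, b, c);  // m ? a*b+c : 0
 */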
__artificial__)) +_mm_mask_andnot_epi32 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pandnd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_andnot_epi32 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pandnd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_or_epi32 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pord256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_or_epi32 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pord256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_or_epi32 (__m256i __A, __m256i __B) +{ + return (__m256i) ((__v8su)__A | (__v8su)__B); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_or_epi32 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pord128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_or_epi32 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pord128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_or_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v4su)__A | (__v4su)__B); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_xor_epi32 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pxord256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_xor_epi32 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pxord256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_xor_epi32 (__m256i __A, __m256i __B) +{ + return (__m256i) ((__v8su)__A ^ (__v8su)__B); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_xor_epi32 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pxord128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_xor_epi32 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pxord128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_xor_epi32 (__m128i __A, 
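/* The unmasked _or/_xor forms compile to plain vector operators, while
 * the masked forms go through the EVEX-encoded builtins so the result
 * can merge or zero per lane. Sketch (a, b hypothetical; AVX512VL
 * assumed):
 *
 *   __m256i o = _mm256_or_epi32(a, b);                // a | b
 *   __m256i x = _mm256_maskz_xor_epi32(0x0F, a, b);   // low 4 lanes a^b, rest 0
 */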
__m128i __B) +{ + return (__m128i) ((__v4su)__A ^ (__v4su)__B); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtpd_ps (__m128 __W, __mmask8 __U, __m128d __A) +{ + return (__m128) __builtin_ia32_cvtpd2ps_mask ((__v2df) __A, + (__v4sf) __W, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtpd_ps (__mmask8 __U, __m128d __A) +{ + return (__m128) __builtin_ia32_cvtpd2ps_mask ((__v2df) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtpd_ps (__m128 __W, __mmask8 __U, __m256d __A) +{ + return (__m128) __builtin_ia32_cvtpd2ps256_mask ((__v4df) __A, + (__v4sf) __W, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtpd_ps (__mmask8 __U, __m256d __A) +{ + return (__m128) __builtin_ia32_cvtpd2ps256_mask ((__v4df) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtps_epi32 (__m256i __W, __mmask8 __U, __m256 __A) +{ + return (__m256i) __builtin_ia32_cvtps2dq256_mask ((__v8sf) __A, + (__v8si) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtps_epi32 (__mmask8 __U, __m256 __A) +{ + return (__m256i) __builtin_ia32_cvtps2dq256_mask ((__v8sf) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtps_epi32 (__m128i __W, __mmask8 __U, __m128 __A) +{ + return (__m128i) __builtin_ia32_cvtps2dq128_mask ((__v4sf) __A, + (__v4si) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtps_epi32 (__mmask8 __U, __m128 __A) +{ + return (__m128i) __builtin_ia32_cvtps2dq128_mask ((__v4sf) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtps_epu32 (__m256 __A) +{ + return (__m256i) __builtin_ia32_cvtps2udq256_mask ((__v8sf) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtps_epu32 (__m256i __W, __mmask8 __U, __m256 __A) +{ + return (__m256i) __builtin_ia32_cvtps2udq256_mask ((__v8sf) __A, + (__v8si) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtps_epu32 (__mmask8 __U, __m256 __A) +{ + return (__m256i) __builtin_ia32_cvtps2udq256_mask ((__v8sf) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtps_epu32 (__m128 __A) +{ + return (__m128i) __builtin_ia32_cvtps2udq128_mask ((__v4sf) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) -1); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtps_epu32 (__m128i __W, __mmask8 __U, __m128 __A) +{ + return (__m128i) __builtin_ia32_cvtps2udq128_mask ((__v4sf) __A, + (__v4si) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ 
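/* cvtps_epu32 provides the unsigned float-to-int conversion that plain
 * SSE/AVX lack; rounding follows MXCSR (round-to-nearest-even by
 * default). Sketch (hypothetical values, AVX512VL assumed):
 *
 *   __m128  f = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
 *   __m128i u = _mm_cvtps_epu32(f);           // {1, 2, 3, 4}
 */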
((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtps_epu32 (__mmask8 __U, __m128 __A) +{ + return (__m128i) __builtin_ia32_cvtps2udq128_mask ((__v4sf) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_movedup_pd (__m256d __W, __mmask8 __U, __m256d __A) +{ + return (__m256d) __builtin_ia32_movddup256_mask ((__v4df) __A, + (__v4df) __W, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_movedup_pd (__mmask8 __U, __m256d __A) +{ + return (__m256d) __builtin_ia32_movddup256_mask ((__v4df) __A, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_movedup_pd (__m128d __W, __mmask8 __U, __m128d __A) +{ + return (__m128d) __builtin_ia32_movddup128_mask ((__v2df) __A, + (__v2df) __W, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_movedup_pd (__mmask8 __U, __m128d __A) +{ + return (__m128d) __builtin_ia32_movddup128_mask ((__v2df) __A, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_movehdup_ps (__m256 __W, __mmask8 __U, __m256 __A) +{ + return (__m256) __builtin_ia32_movshdup256_mask ((__v8sf) __A, + (__v8sf) __W, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_movehdup_ps (__mmask8 __U, __m256 __A) +{ + return (__m256) __builtin_ia32_movshdup256_mask ((__v8sf) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_movehdup_ps (__m128 __W, __mmask8 __U, __m128 __A) +{ + return (__m128) __builtin_ia32_movshdup128_mask ((__v4sf) __A, + (__v4sf) __W, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_movehdup_ps (__mmask8 __U, __m128 __A) +{ + return (__m128) __builtin_ia32_movshdup128_mask ((__v4sf) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_moveldup_ps (__m256 __W, __mmask8 __U, __m256 __A) +{ + return (__m256) __builtin_ia32_movsldup256_mask ((__v8sf) __A, + (__v8sf) __W, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_moveldup_ps (__mmask8 __U, __m256 __A) +{ + return (__m256) __builtin_ia32_movsldup256_mask ((__v8sf) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_moveldup_ps (__m128 __W, __mmask8 __U, __m128 __A) +{ + return (__m128) __builtin_ia32_movsldup128_mask ((__v4sf) __A, + (__v4sf) __W, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_moveldup_ps (__mmask8 __U, __m128 __A) +{ + return (__m128) __builtin_ia32_movsldup128_mask ((__v4sf) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_unpackhi_epi32 (__m128i __W, 
__mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_punpckhdq128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_unpackhi_epi32 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_punpckhdq128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_unpackhi_epi32 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_punpckhdq256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_unpackhi_epi32 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_punpckhdq256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_unpackhi_epi64 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_punpckhqdq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_unpackhi_epi64 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_punpckhqdq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_unpackhi_epi64 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_punpckhqdq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_unpackhi_epi64 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_punpckhqdq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_unpacklo_epi32 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_punpckldq128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_unpacklo_epi32 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_punpckldq128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_unpacklo_epi32 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_punpckldq256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_unpacklo_epi32 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_punpckldq256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i 
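/* unpacklo/unpackhi interleave the low/high halves of each 128-bit
 * lane: _mm_unpacklo_epi32(a, b) yields {a0, b0, a1, b1}, and the
 * masked forms then merge or zero that interleaved result. Sketch
 * (a, b hypothetical; AVX512VL assumed):
 *
 *   __m128i r = _mm_maskz_unpacklo_epi32(0x3, a, b);  // {a0, b0, 0, 0}
 */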
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_unpacklo_epi64 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_punpcklqdq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_unpacklo_epi64 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_punpcklqdq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_unpacklo_epi64 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_punpcklqdq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_unpacklo_epi64 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_punpcklqdq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_epu32_mask (__m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __A, + (__v4si) __B, 0, + (__mmask8) -1); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_epi32_mask (__m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_pcmpeqd128_mask ((__v4si) __A, + (__v4si) __B, + (__mmask8) -1); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpeq_epu32_mask (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __A, + (__v4si) __B, 0, __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpeq_epi32_mask (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_pcmpeqd128_mask ((__v4si) __A, + (__v4si) __B, __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpeq_epu32_mask (__m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __A, + (__v8si) __B, 0, + (__mmask8) -1); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpeq_epi32_mask (__m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_pcmpeqd256_mask ((__v8si) __A, + (__v8si) __B, + (__mmask8) -1); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpeq_epu32_mask (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __A, + (__v8si) __B, 0, __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpeq_epi32_mask (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_pcmpeqd256_mask ((__v8si) __A, + (__v8si) __B, __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_epu64_mask (__m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __A, + (__v2di) __B, 0, + (__mmask8) -1); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, 
__always_inline__, __artificial__)) +_mm_cmpeq_epi64_mask (__m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_pcmpeqq128_mask ((__v2di) __A, + (__v2di) __B, + (__mmask8) -1); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpeq_epu64_mask (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __A, + (__v2di) __B, 0, __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpeq_epi64_mask (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_pcmpeqq128_mask ((__v2di) __A, + (__v2di) __B, __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpeq_epu64_mask (__m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __A, + (__v4di) __B, 0, + (__mmask8) -1); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpeq_epi64_mask (__m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_pcmpeqq256_mask ((__v4di) __A, + (__v4di) __B, + (__mmask8) -1); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpeq_epu64_mask (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __A, + (__v4di) __B, 0, __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpeq_epi64_mask (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_pcmpeqq256_mask ((__v4di) __A, + (__v4di) __B, __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_epu32_mask (__m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __A, + (__v4si) __B, 6, + (__mmask8) -1); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_epi32_mask (__m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_pcmpgtd128_mask ((__v4si) __A, + (__v4si) __B, + (__mmask8) -1); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpgt_epu32_mask (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __A, + (__v4si) __B, 6, __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpgt_epi32_mask (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_pcmpgtd128_mask ((__v4si) __A, + (__v4si) __B, __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpgt_epu32_mask (__m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __A, + (__v8si) __B, 6, + (__mmask8) -1); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpgt_epi32_mask (__m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_pcmpgtd256_mask ((__v8si) __A, + (__v8si) __B, + (__mmask8) -1); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpgt_epu32_mask (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __A, + (__v8si) __B, 6, __U); +} +extern __inline __mmask8 
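/* Unlike the SSE/AVX compares, these return a __mmask8 bitmap rather
 * than a vector, and the unsigned variants (ucmp with predicate 0 for
 * eq, 6 for gt, as in the literals above) have no pre-AVX512
 * counterpart. The mask feeds directly into the masked operations
 * defined earlier. Sketch (a, b, inc hypothetical):
 *
 *   __mmask8 gt = _mm256_cmpgt_epu32_mask(a, b);  // bit i: a[i] > b[i]
 *   a = _mm256_mask_add_epi32(a, gt, a, inc);     // bump only those lanes
 */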
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpgt_epi32_mask (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_pcmpgtd256_mask ((__v8si) __A, + (__v8si) __B, __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_epu64_mask (__m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __A, + (__v2di) __B, 6, + (__mmask8) -1); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_epi64_mask (__m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_pcmpgtq128_mask ((__v2di) __A, + (__v2di) __B, + (__mmask8) -1); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpgt_epu64_mask (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __A, + (__v2di) __B, 6, __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpgt_epi64_mask (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_pcmpgtq128_mask ((__v2di) __A, + (__v2di) __B, __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpgt_epu64_mask (__m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __A, + (__v4di) __B, 6, + (__mmask8) -1); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpgt_epi64_mask (__m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_pcmpgtq256_mask ((__v4di) __A, + (__v4di) __B, + (__mmask8) -1); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpgt_epu64_mask (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __A, + (__v4di) __B, 6, __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpgt_epi64_mask (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_pcmpgtq256_mask ((__v4di) __A, + (__v4di) __B, __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_test_epi32_mask (__m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_ptestmd128 ((__v4si) __A, + (__v4si) __B, + (__mmask8) -1); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_test_epi32_mask (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_ptestmd128 ((__v4si) __A, + (__v4si) __B, __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_test_epi32_mask (__m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_ptestmd256 ((__v8si) __A, + (__v8si) __B, + (__mmask8) -1); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_test_epi32_mask (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_ptestmd256 ((__v8si) __A, + (__v8si) __B, __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_test_epi64_mask (__m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_ptestmq128 ((__v2di) __A, + (__v2di) __B, + (__mmask8) -1); +} +extern __inline __mmask8 
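The ptestm/ptestnm intrinsics here AND the two operands lane-wise: test sets mask bit i when lane i of the result is nonzero, testn when it is zero. For instance (illustrative helper):

    #include <immintrin.h>

    /* mask bit i is set when lane i of v has its low bit set */
    static __mmask8 odd_lanes(__m128i v) {
      return _mm_test_epi32_mask(v, _mm_set1_epi32(1));
    }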
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_test_epi64_mask (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_ptestmq128 ((__v2di) __A, + (__v2di) __B, __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_test_epi64_mask (__m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_ptestmq256 ((__v4di) __A, + (__v4di) __B, + (__mmask8) -1); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_test_epi64_mask (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_ptestmq256 ((__v4di) __A, + (__v4di) __B, __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_testn_epi32_mask (__m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_ptestnmd128 ((__v4si) __A, + (__v4si) __B, + (__mmask8) -1); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_testn_epi32_mask (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_ptestnmd128 ((__v4si) __A, + (__v4si) __B, __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_testn_epi32_mask (__m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_ptestnmd256 ((__v8si) __A, + (__v8si) __B, + (__mmask8) -1); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_testn_epi32_mask (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_ptestnmd256 ((__v8si) __A, + (__v8si) __B, __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_testn_epi64_mask (__m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_ptestnmq128 ((__v2di) __A, + (__v2di) __B, + (__mmask8) -1); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_testn_epi64_mask (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_ptestnmq128 ((__v2di) __A, + (__v2di) __B, __U); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_testn_epi64_mask (__m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_ptestnmq256 ((__v4di) __A, + (__v4di) __B, + (__mmask8) -1); +} +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_testn_epi64_mask (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_ptestnmq256 ((__v4di) __A, + (__v4di) __B, __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_compress_pd (__m256d __W, __mmask8 __U, __m256d __A) +{ + return (__m256d) __builtin_ia32_compressdf256_mask ((__v4df) __A, + (__v4df) __W, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_compress_pd (__mmask8 __U, __m256d __A) +{ + return (__m256d) __builtin_ia32_compressdf256_mask ((__v4df) __A, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m256d __A) +{ + __builtin_ia32_compressstoredf256_mask ((__v4df *) __P, + (__v4df) __A, + (__mmask8) __U); +} +extern 
__inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_compress_pd (__m128d __W, __mmask8 __U, __m128d __A) +{ + return (__m128d) __builtin_ia32_compressdf128_mask ((__v2df) __A, + (__v2df) __W, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_compress_pd (__mmask8 __U, __m128d __A) +{ + return (__m128d) __builtin_ia32_compressdf128_mask ((__v2df) __A, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m128d __A) +{ + __builtin_ia32_compressstoredf128_mask ((__v2df *) __P, + (__v2df) __A, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_compress_ps (__m256 __W, __mmask8 __U, __m256 __A) +{ + return (__m256) __builtin_ia32_compresssf256_mask ((__v8sf) __A, + (__v8sf) __W, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_compress_ps (__mmask8 __U, __m256 __A) +{ + return (__m256) __builtin_ia32_compresssf256_mask ((__v8sf) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_compressstoreu_ps (void *__P, __mmask8 __U, __m256 __A) +{ + __builtin_ia32_compressstoresf256_mask ((__v8sf *) __P, + (__v8sf) __A, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_compress_ps (__m128 __W, __mmask8 __U, __m128 __A) +{ + return (__m128) __builtin_ia32_compresssf128_mask ((__v4sf) __A, + (__v4sf) __W, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_compress_ps (__mmask8 __U, __m128 __A) +{ + return (__m128) __builtin_ia32_compresssf128_mask ((__v4sf) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_compressstoreu_ps (void *__P, __mmask8 __U, __m128 __A) +{ + __builtin_ia32_compressstoresf128_mask ((__v4sf *) __P, + (__v4sf) __A, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_compress_epi64 (__m256i __W, __mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_compressdi256_mask ((__v4di) __A, + (__v4di) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_compress_epi64 (__mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_compressdi256_mask ((__v4di) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m256i __A) +{ + __builtin_ia32_compressstoredi256_mask ((__v4di *) __P, + (__v4di) __A, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_compress_epi64 (__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_compressdi128_mask ((__v2di) __A, + (__v2di) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm_maskz_compress_epi64 (__mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_compressdi128_mask ((__v2di) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m128i __A) +{ + __builtin_ia32_compressstoredi128_mask ((__v2di *) __P, + (__v2di) __A, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_compress_epi32 (__m256i __W, __mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_compresssi256_mask ((__v8si) __A, + (__v8si) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_compress_epi32 (__mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_compresssi256_mask ((__v8si) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_compressstoreu_epi32 (void *__P, __mmask8 __U, __m256i __A) +{ + __builtin_ia32_compressstoresi256_mask ((__v8si *) __P, + (__v8si) __A, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_compress_epi32 (__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_compresssi128_mask ((__v4si) __A, + (__v4si) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_compress_epi32 (__mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_compresssi128_mask ((__v4si) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_compressstoreu_epi32 (void *__P, __mmask8 __U, __m128i __A) +{ + __builtin_ia32_compressstoresi128_mask ((__v4si *) __P, + (__v4si) __A, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_expand_pd (__m256d __W, __mmask8 __U, __m256d __A) +{ + return (__m256d) __builtin_ia32_expanddf256_mask ((__v4df) __A, + (__v4df) __W, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_expand_pd (__mmask8 __U, __m256d __A) +{ + return (__m256d) __builtin_ia32_expanddf256_maskz ((__v4df) __A, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_expandloadu_pd (__m256d __W, __mmask8 __U, void const *__P) +{ + return (__m256d) __builtin_ia32_expandloaddf256_mask ((__v4df *) __P, + (__v4df) __W, + (__mmask8) + __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_expandloadu_pd (__mmask8 __U, void const *__P) +{ + return (__m256d) __builtin_ia32_expandloaddf256_maskz ((__v4df *) __P, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) + __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_expand_pd (__m128d __W, __mmask8 __U, __m128d __A) +{ + return (__m128d) __builtin_ia32_expanddf128_mask ((__v2df) __A, + (__v2df) __W, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_expand_pd 
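compress left-packs the mask-selected lanes toward element 0, and compressstoreu writes just those lanes contiguously (and unaligned) to memory, which yields a branch-free stream filter. A sketch (helper name and predicate are illustrative):

    #include <immintrin.h>

    /* store the non-negative lanes of v packed at out[0..]; returns count */
    static int keep_nonneg(int *out, __m256i v) {
      __mmask8 k = _mm256_cmpgt_epi32_mask(v, _mm256_set1_epi32(-1));
      _mm256_mask_compressstoreu_epi32(out, k, v);
      return __builtin_popcount(k);
    }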
(__mmask8 __U, __m128d __A) +{ + return (__m128d) __builtin_ia32_expanddf128_maskz ((__v2df) __A, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_expandloadu_pd (__m128d __W, __mmask8 __U, void const *__P) +{ + return (__m128d) __builtin_ia32_expandloaddf128_mask ((__v2df *) __P, + (__v2df) __W, + (__mmask8) + __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_expandloadu_pd (__mmask8 __U, void const *__P) +{ + return (__m128d) __builtin_ia32_expandloaddf128_maskz ((__v2df *) __P, + (__v2df) + _mm_setzero_pd (), + (__mmask8) + __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_expand_ps (__m256 __W, __mmask8 __U, __m256 __A) +{ + return (__m256) __builtin_ia32_expandsf256_mask ((__v8sf) __A, + (__v8sf) __W, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_expand_ps (__mmask8 __U, __m256 __A) +{ + return (__m256) __builtin_ia32_expandsf256_maskz ((__v8sf) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_expandloadu_ps (__m256 __W, __mmask8 __U, void const *__P) +{ + return (__m256) __builtin_ia32_expandloadsf256_mask ((__v8sf *) __P, + (__v8sf) __W, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_expandloadu_ps (__mmask8 __U, void const *__P) +{ + return (__m256) __builtin_ia32_expandloadsf256_maskz ((__v8sf *) __P, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) + __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_expand_ps (__m128 __W, __mmask8 __U, __m128 __A) +{ + return (__m128) __builtin_ia32_expandsf128_mask ((__v4sf) __A, + (__v4sf) __W, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_expand_ps (__mmask8 __U, __m128 __A) +{ + return (__m128) __builtin_ia32_expandsf128_maskz ((__v4sf) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_expandloadu_ps (__m128 __W, __mmask8 __U, void const *__P) +{ + return (__m128) __builtin_ia32_expandloadsf128_mask ((__v4sf *) __P, + (__v4sf) __W, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_expandloadu_ps (__mmask8 __U, void const *__P) +{ + return (__m128) __builtin_ia32_expandloadsf128_maskz ((__v4sf *) __P, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) + __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_expand_epi64 (__m256i __W, __mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_expanddi256_mask ((__v4di) __A, + (__v4di) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_expand_epi64 (__mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_expanddi256_maskz ((__v4di) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm256_mask_expandloadu_epi64 (__m256i __W, __mmask8 __U, + void const *__P) +{ + return (__m256i) __builtin_ia32_expandloaddi256_mask ((__v4di *) __P, + (__v4di) __W, + (__mmask8) + __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_expandloadu_epi64 (__mmask8 __U, void const *__P) +{ + return (__m256i) __builtin_ia32_expandloaddi256_maskz ((__v4di *) __P, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) + __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_expand_epi64 (__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_expanddi128_mask ((__v2di) __A, + (__v2di) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_expand_epi64 (__mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_expanddi128_maskz ((__v2di) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_expandloadu_epi64 (__m128i __W, __mmask8 __U, void const *__P) +{ + return (__m128i) __builtin_ia32_expandloaddi128_mask ((__v2di *) __P, + (__v2di) __W, + (__mmask8) + __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_expandloadu_epi64 (__mmask8 __U, void const *__P) +{ + return (__m128i) __builtin_ia32_expandloaddi128_maskz ((__v2di *) __P, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) + __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_expand_epi32 (__m256i __W, __mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_expandsi256_mask ((__v8si) __A, + (__v8si) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_expand_epi32 (__mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_expandsi256_maskz ((__v8si) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_expandloadu_epi32 (__m256i __W, __mmask8 __U, + void const *__P) +{ + return (__m256i) __builtin_ia32_expandloadsi256_mask ((__v8si *) __P, + (__v8si) __W, + (__mmask8) + __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_expandloadu_epi32 (__mmask8 __U, void const *__P) +{ + return (__m256i) __builtin_ia32_expandloadsi256_maskz ((__v8si *) __P, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) + __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_expand_epi32 (__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_expandsi128_mask ((__v4si) __A, + (__v4si) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_expand_epi32 (__mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_expandsi128_maskz ((__v4si) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_expandloadu_epi32 (__m128i __W, __mmask8 __U, void const *__P) +{ + return (__m128i) __builtin_ia32_expandloadsi128_mask ((__v4si *) __P, + (__v4si) __W, + (__mmask8) + __U); +} 
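expand is the inverse of compress: consecutive source elements are routed, in order, into the lanes whose mask bit is set, and expandloadu does the same while reading the packed elements straight from possibly unaligned memory. Sketch (illustrative):

    #include <immintrin.h>

    /* with k = 0b0101: src[0] -> lane 0, src[1] -> lane 2, others zero */
    static __m256d load_sparse(const double *src, __mmask8 k) {
      return _mm256_maskz_expandloadu_pd(k, src);
    }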
+extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_expandloadu_epi32 (__mmask8 __U, void const *__P) +{ + return (__m128i) __builtin_ia32_expandloadsi128_maskz ((__v4si *) __P, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) + __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permutex2var_pd (__m256d __A, __m256i __I, __m256d __B) +{ + return (__m256d) __builtin_ia32_vpermt2varpd256_mask ((__v4di) __I + , + (__v4df) __A, + (__v4df) __B, + (__mmask8) -1); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_permutex2var_pd (__m256d __A, __mmask8 __U, __m256i __I, + __m256d __B) +{ + return (__m256d) __builtin_ia32_vpermt2varpd256_mask ((__v4di) __I + , + (__v4df) __A, + (__v4df) __B, + (__mmask8) + __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask2_permutex2var_pd (__m256d __A, __m256i __I, __mmask8 __U, + __m256d __B) +{ + return (__m256d) __builtin_ia32_vpermi2varpd256_mask ((__v4df) __A, + (__v4di) __I + , + (__v4df) __B, + (__mmask8) + __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_permutex2var_pd (__mmask8 __U, __m256d __A, __m256i __I, + __m256d __B) +{ + return (__m256d) __builtin_ia32_vpermt2varpd256_maskz ((__v4di) __I + , + (__v4df) __A, + (__v4df) __B, + (__mmask8) + __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permutex2var_ps (__m256 __A, __m256i __I, __m256 __B) +{ + return (__m256) __builtin_ia32_vpermt2varps256_mask ((__v8si) __I + , + (__v8sf) __A, + (__v8sf) __B, + (__mmask8) -1); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_permutex2var_ps (__m256 __A, __mmask8 __U, __m256i __I, + __m256 __B) +{ + return (__m256) __builtin_ia32_vpermt2varps256_mask ((__v8si) __I + , + (__v8sf) __A, + (__v8sf) __B, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask2_permutex2var_ps (__m256 __A, __m256i __I, __mmask8 __U, + __m256 __B) +{ + return (__m256) __builtin_ia32_vpermi2varps256_mask ((__v8sf) __A, + (__v8si) __I + , + (__v8sf) __B, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_permutex2var_ps (__mmask8 __U, __m256 __A, __m256i __I, + __m256 __B) +{ + return (__m256) __builtin_ia32_vpermt2varps256_maskz ((__v8si) __I + , + (__v8sf) __A, + (__v8sf) __B, + (__mmask8) + __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_permutex2var_epi64 (__m128i __A, __m128i __I, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpermt2varq128_mask ((__v2di) __I + , + (__v2di) __A, + (__v2di) __B, + (__mmask8) -1); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_permutex2var_epi64 (__m128i __A, __mmask8 __U, __m128i __I, + __m128i __B) +{ + return (__m128i) __builtin_ia32_vpermt2varq128_mask ((__v2di) __I + , + (__v2di) __A, + (__v2di) __B, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask2_permutex2var_epi64 (__m128i __A, __m128i __I, __mmask8 __U, + __m128i __B) +{ + return (__m128i) __builtin_ia32_vpermi2varq128_mask 
((__v2di) __A, + (__v2di) __I + , + (__v2di) __B, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_permutex2var_epi64 (__mmask8 __U, __m128i __A, __m128i __I, + __m128i __B) +{ + return (__m128i) __builtin_ia32_vpermt2varq128_maskz ((__v2di) __I + , + (__v2di) __A, + (__v2di) __B, + (__mmask8) + __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_permutex2var_epi32 (__m128i __A, __m128i __I, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpermt2vard128_mask ((__v4si) __I + , + (__v4si) __A, + (__v4si) __B, + (__mmask8) -1); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_permutex2var_epi32 (__m128i __A, __mmask8 __U, __m128i __I, + __m128i __B) +{ + return (__m128i) __builtin_ia32_vpermt2vard128_mask ((__v4si) __I + , + (__v4si) __A, + (__v4si) __B, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask2_permutex2var_epi32 (__m128i __A, __m128i __I, __mmask8 __U, + __m128i __B) +{ + return (__m128i) __builtin_ia32_vpermi2vard128_mask ((__v4si) __A, + (__v4si) __I + , + (__v4si) __B, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_permutex2var_epi32 (__mmask8 __U, __m128i __A, __m128i __I, + __m128i __B) +{ + return (__m128i) __builtin_ia32_vpermt2vard128_maskz ((__v4si) __I + , + (__v4si) __A, + (__v4si) __B, + (__mmask8) + __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permutex2var_epi64 (__m256i __A, __m256i __I, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpermt2varq256_mask ((__v4di) __I + , + (__v4di) __A, + (__v4di) __B, + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_permutex2var_epi64 (__m256i __A, __mmask8 __U, __m256i __I, + __m256i __B) +{ + return (__m256i) __builtin_ia32_vpermt2varq256_mask ((__v4di) __I + , + (__v4di) __A, + (__v4di) __B, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask2_permutex2var_epi64 (__m256i __A, __m256i __I, + __mmask8 __U, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpermi2varq256_mask ((__v4di) __A, + (__v4di) __I + , + (__v4di) __B, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_permutex2var_epi64 (__mmask8 __U, __m256i __A, + __m256i __I, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpermt2varq256_maskz ((__v4di) __I + , + (__v4di) __A, + (__v4di) __B, + (__mmask8) + __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permutex2var_epi32 (__m256i __A, __m256i __I, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpermt2vard256_mask ((__v8si) __I + , + (__v8si) __A, + (__v8si) __B, + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_permutex2var_epi32 (__m256i __A, __mmask8 __U, __m256i __I, + __m256i __B) +{ + return (__m256i) __builtin_ia32_vpermt2vard256_mask ((__v8si) __I + , + (__v8si) __A, + (__v8si) __B, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask2_permutex2var_epi32 
(__m256i __A, __m256i __I, + __mmask8 __U, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpermi2vard256_mask ((__v8si) __A, + (__v8si) __I + , + (__v8si) __B, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_permutex2var_epi32 (__mmask8 __U, __m256i __A, + __m256i __I, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpermt2vard256_maskz ((__v8si) __I + , + (__v8si) __A, + (__v8si) __B, + (__mmask8) + __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_permutex2var_pd (__m128d __A, __m128i __I, __m128d __B) +{ + return (__m128d) __builtin_ia32_vpermt2varpd128_mask ((__v2di) __I + , + (__v2df) __A, + (__v2df) __B, + (__mmask8) -1); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_permutex2var_pd (__m128d __A, __mmask8 __U, __m128i __I, + __m128d __B) +{ + return (__m128d) __builtin_ia32_vpermt2varpd128_mask ((__v2di) __I + , + (__v2df) __A, + (__v2df) __B, + (__mmask8) + __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask2_permutex2var_pd (__m128d __A, __m128i __I, __mmask8 __U, + __m128d __B) +{ + return (__m128d) __builtin_ia32_vpermi2varpd128_mask ((__v2df) __A, + (__v2di) __I + , + (__v2df) __B, + (__mmask8) + __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_permutex2var_pd (__mmask8 __U, __m128d __A, __m128i __I, + __m128d __B) +{ + return (__m128d) __builtin_ia32_vpermt2varpd128_maskz ((__v2di) __I + , + (__v2df) __A, + (__v2df) __B, + (__mmask8) + __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_permutex2var_ps (__m128 __A, __m128i __I, __m128 __B) +{ + return (__m128) __builtin_ia32_vpermt2varps128_mask ((__v4si) __I + , + (__v4sf) __A, + (__v4sf) __B, + (__mmask8) -1); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_permutex2var_ps (__m128 __A, __mmask8 __U, __m128i __I, + __m128 __B) +{ + return (__m128) __builtin_ia32_vpermt2varps128_mask ((__v4si) __I + , + (__v4sf) __A, + (__v4sf) __B, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask2_permutex2var_ps (__m128 __A, __m128i __I, __mmask8 __U, + __m128 __B) +{ + return (__m128) __builtin_ia32_vpermi2varps128_mask ((__v4sf) __A, + (__v4si) __I + , + (__v4sf) __B, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_permutex2var_ps (__mmask8 __U, __m128 __A, __m128i __I, + __m128 __B) +{ + return (__m128) __builtin_ia32_vpermt2varps128_maskz ((__v4si) __I + , + (__v4sf) __A, + (__v4sf) __B, + (__mmask8) + __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srav_epi64 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_psravq128_mask ((__v2di) __X, + (__v2di) __Y, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) -1); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_srav_epi64 (__m128i __W, __mmask8 __U, __m128i __X, + __m128i __Y) +{ + return (__m128i) __builtin_ia32_psravq128_mask ((__v2di) __X, + (__v2di) __Y, + (__v2di) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, 
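permutex2var indexes into the concatenation of its two data vectors: in the four-lane forms, bits 1:0 of each index pick the element and bit 2 picks the table, __A for 0 and __B for 1; the mask and mask2 variants differ only in whether the pass-through lanes come from __A or from the index vector __I. Sketch (illustrative helper):

    #include <immintrin.h>

    /* interleave the low halves of a and b: result = {a0, b0, a1, b1} */
    static __m256d mix_low(__m256d a, __m256d b) {
      const __m256i idx = _mm256_setr_epi64x(0, 4, 1, 5);
      return _mm256_permutex2var_pd(a, idx, b);
    }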
__artificial__)) +_mm_maskz_srav_epi64 (__mmask8 __U, __m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_psravq128_mask ((__v2di) __X, + (__v2di) __Y, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_sllv_epi32 (__m256i __W, __mmask8 __U, __m256i __X, + __m256i __Y) +{ + return (__m256i) __builtin_ia32_psllv8si_mask ((__v8si) __X, + (__v8si) __Y, + (__v8si) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_sllv_epi32 (__mmask8 __U, __m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_psllv8si_mask ((__v8si) __X, + (__v8si) __Y, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sllv_epi32 (__m128i __W, __mmask8 __U, __m128i __X, + __m128i __Y) +{ + return (__m128i) __builtin_ia32_psllv4si_mask ((__v4si) __X, + (__v4si) __Y, + (__v4si) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sllv_epi32 (__mmask8 __U, __m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_psllv4si_mask ((__v4si) __X, + (__v4si) __Y, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_sllv_epi64 (__m256i __W, __mmask8 __U, __m256i __X, + __m256i __Y) +{ + return (__m256i) __builtin_ia32_psllv4di_mask ((__v4di) __X, + (__v4di) __Y, + (__v4di) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_sllv_epi64 (__mmask8 __U, __m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_psllv4di_mask ((__v4di) __X, + (__v4di) __Y, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sllv_epi64 (__m128i __W, __mmask8 __U, __m128i __X, + __m128i __Y) +{ + return (__m128i) __builtin_ia32_psllv2di_mask ((__v2di) __X, + (__v2di) __Y, + (__v2di) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sllv_epi64 (__mmask8 __U, __m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_psllv2di_mask ((__v2di) __X, + (__v2di) __Y, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_srav_epi32 (__m256i __W, __mmask8 __U, __m256i __X, + __m256i __Y) +{ + return (__m256i) __builtin_ia32_psrav8si_mask ((__v8si) __X, + (__v8si) __Y, + (__v8si) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_srav_epi32 (__mmask8 __U, __m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_psrav8si_mask ((__v8si) __X, + (__v8si) __Y, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_srav_epi32 (__m128i __W, __mmask8 __U, __m128i __X, + __m128i __Y) +{ + return (__m128i) __builtin_ia32_psrav4si_mask ((__v4si) __X, + (__v4si) __Y, + (__v4si) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, 
__artificial__)) +_mm_maskz_srav_epi32 (__mmask8 __U, __m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_psrav4si_mask ((__v4si) __X, + (__v4si) __Y, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_srlv_epi32 (__m256i __W, __mmask8 __U, __m256i __X, + __m256i __Y) +{ + return (__m256i) __builtin_ia32_psrlv8si_mask ((__v8si) __X, + (__v8si) __Y, + (__v8si) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_srlv_epi32 (__mmask8 __U, __m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_psrlv8si_mask ((__v8si) __X, + (__v8si) __Y, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_srlv_epi32 (__m128i __W, __mmask8 __U, __m128i __X, + __m128i __Y) +{ + return (__m128i) __builtin_ia32_psrlv4si_mask ((__v4si) __X, + (__v4si) __Y, + (__v4si) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_srlv_epi32 (__mmask8 __U, __m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_psrlv4si_mask ((__v4si) __X, + (__v4si) __Y, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_srlv_epi64 (__m256i __W, __mmask8 __U, __m256i __X, + __m256i __Y) +{ + return (__m256i) __builtin_ia32_psrlv4di_mask ((__v4di) __X, + (__v4di) __Y, + (__v4di) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_srlv_epi64 (__mmask8 __U, __m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_psrlv4di_mask ((__v4di) __X, + (__v4di) __Y, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_srlv_epi64 (__m128i __W, __mmask8 __U, __m128i __X, + __m128i __Y) +{ + return (__m128i) __builtin_ia32_psrlv2di_mask ((__v2di) __X, + (__v2di) __Y, + (__v2di) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_srlv_epi64 (__mmask8 __U, __m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_psrlv2di_mask ((__v2di) __X, + (__v2di) __Y, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_rolv_epi32 (__m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_prolvd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_rolv_epi32 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_prolvd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_rolv_epi32 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_prolvd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, 
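These wrap the AVX2 per-lane variable shifts (sllv/srlv/srav) in the same merge and zero masking scheme; note that srav_epi64 has no AVX2 counterpart at all, since VPSRAVQ, the 64-bit arithmetic right shift, first appeared with AVX-512. Merge-form sketch (illustrative):

    #include <immintrin.h>

    /* arithmetic-shift only the lanes selected by k; others keep w */
    static __m256i shift_some(__m256i w, __mmask8 k, __m256i x, __m256i n) {
      return _mm256_mask_srav_epi32(w, k, x, n);
    }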
__artificial__)) +_mm_rolv_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_prolvd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) -1); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_rolv_epi32 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_prolvd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_rolv_epi32 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_prolvd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_rorv_epi32 (__m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_prorvd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_rorv_epi32 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_prorvd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_rorv_epi32 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_prorvd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rorv_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_prorvd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) -1); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_rorv_epi32 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_prorvd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_rorv_epi32 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_prorvd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_rolv_epi64 (__m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_prolvq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_rolv_epi64 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_prolvq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_rolv_epi64 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_prolvq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rolv_epi64 (__m128i __A, __m128i __B) +{ + 
return (__m128i) __builtin_ia32_prolvq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) -1); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_rolv_epi64 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_prolvq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_rolv_epi64 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_prolvq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_rorv_epi64 (__m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_prorvq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_rorv_epi64 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_prorvq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_rorv_epi64 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_prorvq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rorv_epi64 (__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_prorvq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) -1); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_rorv_epi64 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_prorvq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_rorv_epi64 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_prorvq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_srav_epi64 (__m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_psravq256_mask ((__v4di) __X, + (__v4di) __Y, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_srav_epi64 (__m256i __W, __mmask8 __U, __m256i __X, + __m256i __Y) +{ + return (__m256i) __builtin_ia32_psravq256_mask ((__v4di) __X, + (__v4di) __Y, + (__v4di) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_srav_epi64 (__mmask8 __U, __m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_psravq256_mask ((__v4di) __X, + (__v4di) __Y, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_and_epi64 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) 
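rolv and rorv are per-lane variable rotates (VPROLVD, VPRORVQ and friends), another operation with no SSE/AVX2 equivalent; before AVX-512, code had to synthesize a rotate from two shifts and an OR. Sketch (illustrative):

    #include <immintrin.h>

    /* rotate each 32-bit lane left by 8: 0x11223344 -> 0x22334411 */
    static __m128i rotl8(__m128i v) {
      return _mm_rolv_epi32(v, _mm_set1_epi32(8));
    }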
__builtin_ia32_pandq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) __W, __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_and_epi64 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pandq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_pd (), + __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_and_epi64 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pandq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __W, __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_and_epi64 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pandq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_pd (), + __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_andnot_epi64 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pandnq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) __W, __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_andnot_epi64 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pandnq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_pd (), + __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_andnot_epi64 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pandnq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __W, __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_andnot_epi64 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pandnq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_pd (), + __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_or_epi64 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_porq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_or_epi64 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_porq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_or_epi64 (__m256i __A, __m256i __B) +{ + return (__m256i) ((__v4du)__A | (__v4du)__B); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_or_epi64 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_porq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_or_epi64 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_porq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_or_epi64 (__m128i __A, 
__m128i __B) +{ + return (__m128i) ((__v2du)__A | (__v2du)__B); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_xor_epi64 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pxorq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_xor_epi64 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pxorq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_xor_epi64 (__m256i __A, __m256i __B) +{ + return (__m256i) ((__v4du)__A ^ (__v4du)__B); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_xor_epi64 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pxorq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_xor_epi64 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pxorq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_xor_epi64 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v2du)__A ^ (__v2du)__B); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_max_pd (__m256d __W, __mmask8 __U, __m256d __A, + __m256d __B) +{ + return (__m256d) __builtin_ia32_maxpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) __W, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_max_pd (__mmask8 __U, __m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_maxpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_max_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_maxps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __W, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_max_ps (__mmask8 __U, __m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_maxps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_div_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_divps_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_div_ps (__mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_divps_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_div_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) 
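Two details stand out in the 64-bit bitwise block above: the unmasked _mm_or_epi64/_mm256_xor_epi64 style entry points compile to plain GNU vector operators rather than builtins, and the maskz_and/maskz_andnot variants fetch their zero vector from _mm256_setzero_pd () (or _mm_setzero_pd ()) cast across to __v4di/__v2di, a same-width vector cast that GCC accepts. Usage sketch (illustrative):

    #include <immintrin.h>

    /* AND the 64-bit lanes, zeroing every lane whose bit in k is clear */
    static __m128i and_some(__mmask8 k, __m128i a, __m128i b) {
      return _mm_maskz_and_epi64(k, a, b);
    }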
__builtin_ia32_divpd_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_div_pd (__mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_divpd_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_min_pd (__m256d __W, __mmask8 __U, __m256d __A, + __m256d __B) +{ + return (__m256d) __builtin_ia32_minpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) __W, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_div_pd (__m256d __W, __mmask8 __U, __m256d __A, + __m256d __B) +{ + return (__m256d) __builtin_ia32_divpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) __W, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_min_pd (__mmask8 __U, __m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_minpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_min_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_minps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __W, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_div_pd (__mmask8 __U, __m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_divpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_div_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_divps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __W, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_min_ps (__mmask8 __U, __m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_minps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_div_ps (__mmask8 __U, __m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_divps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_min_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_minps_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_mul_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_mulps_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_min_ps (__mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_minps_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} +extern __inline 
__m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_mul_ps (__mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_mulps_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_max_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_maxps_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_max_ps (__mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_maxps_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_min_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_minpd_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_min_pd (__mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_minpd_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_max_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_maxpd_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_max_pd (__mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_maxpd_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_mul_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_mulpd_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_mul_pd (__mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_mulpd_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_mul_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_mulps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __W, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_mul_ps (__mmask8 __U, __m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_mulps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_mul_pd (__m256d __W, __mmask8 __U, __m256d __A, + __m256d __B) +{ + return (__m256d) __builtin_ia32_mulpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) __W, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_mul_pd (__mmask8 __U, __m256d __A, __m256d __B) +{ + 
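The masked float arithmetic makes it cheap to guard an operation on a per-lane predicate without branching or blending. A sketch that divides only where the divisor is nonzero (illustrative; _mm256_cmp_pd_mask lives elsewhere in the AVX-512VL header set, and _CMP_NEQ_OQ treats a NaN divisor as unordered, so those lanes also keep a):

    #include <immintrin.h>

    /* divide lane-wise, keeping a's value wherever b is zero or NaN */
    static __m256d safe_div(__m256d a, __m256d b) {
      __mmask8 ok = _mm256_cmp_pd_mask(b, _mm256_setzero_pd(), _CMP_NEQ_OQ);
      return _mm256_mask_div_pd(a, ok, a, b);
    }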
return (__m256d) __builtin_ia32_mulpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_max_epi64 (__mmask8 __M, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pmaxsq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_si256 (), + __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_max_epi64 (__m256i __W, __mmask8 __M, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pmaxsq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) __W, __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_min_epi64 (__m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pminsq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_min_epi64 (__m256i __W, __mmask8 __M, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pminsq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) __W, __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_min_epi64 (__mmask8 __M, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pminsq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_si256 (), + __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_max_epu64 (__mmask8 __M, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pmaxuq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_si256 (), + __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_max_epi64 (__m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pmaxsq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_max_epu64 (__m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pmaxuq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_max_epu64 (__m256i __W, __mmask8 __M, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pmaxuq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) __W, __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_min_epu64 (__m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pminuq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_min_epu64 (__m256i __W, __mmask8 __M, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pminuq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) __W, __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_min_epu64 (__mmask8 __M, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pminuq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_si256 (), + __M); +} +extern 
__inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_max_epi32 (__mmask8 __M, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pmaxsd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) + _mm256_setzero_si256 (), + __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_max_epi32 (__m256i __W, __mmask8 __M, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pmaxsd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) __W, __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_min_epi32 (__mmask8 __M, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pminsd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) + _mm256_setzero_si256 (), + __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_min_epi32 (__m256i __W, __mmask8 __M, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pminsd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) __W, __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_max_epu32 (__mmask8 __M, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pmaxud256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) + _mm256_setzero_si256 (), + __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_max_epu32 (__m256i __W, __mmask8 __M, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pmaxud256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) __W, __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_min_epu32 (__mmask8 __M, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pminud256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) + _mm256_setzero_si256 (), + __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_min_epu32 (__m256i __W, __mmask8 __M, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pminud256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) __W, __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_max_epi64 (__mmask8 __M, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pmaxsq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_max_epi64 (__m128i __W, __mmask8 __M, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pmaxsq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __W, __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_epi64 (__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pminsq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) -1); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_min_epi64 (__m128i __W, __mmask8 __M, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pminsq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __W, __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_min_epi64 (__mmask8 __M, __m128i __A, __m128i __B) +{ 
+ return (__m128i) __builtin_ia32_pminsq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_max_epu64 (__mmask8 __M, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pmaxuq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_epi64 (__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pmaxsq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) -1); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_epu64 (__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pmaxuq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) -1); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_max_epu64 (__m128i __W, __mmask8 __M, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pmaxuq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __W, __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_epu64 (__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pminuq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) -1); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_min_epu64 (__m128i __W, __mmask8 __M, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pminuq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __W, __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_min_epu64 (__mmask8 __M, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pminuq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_max_epi32 (__mmask8 __M, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pmaxsd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) + _mm_setzero_si128 (), + __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_max_epi32 (__m128i __W, __mmask8 __M, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pmaxsd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __W, __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_min_epi32 (__mmask8 __M, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pminsd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) + _mm_setzero_si128 (), + __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_min_epi32 (__m128i __W, __mmask8 __M, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pminsd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __W, __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_max_epu32 (__mmask8 __M, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pmaxud128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) + _mm_setzero_si128 (), + __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, 
__always_inline__, __artificial__)) +_mm_mask_max_epu32 (__m128i __W, __mmask8 __M, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pmaxud128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __W, __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_min_epu32 (__mmask8 __M, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pminud128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) + _mm_setzero_si128 (), + __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_min_epu32 (__m128i __W, __mmask8 __M, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pminud128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __W, __M); +} #ifndef __AVX512CD__ #pragma GCC push_options #pragma GCC target("avx512vl,avx512cd") #define __DISABLE_AVX512VLCD__ #endif - -__funline __m128i _mm_broadcastmb_epi64(__mmask8 __A) { - return (__m128i)__builtin_ia32_broadcastmb128(__A); +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_broadcastmb_epi64 (__mmask8 __A) +{ + return (__m128i) __builtin_ia32_broadcastmb128 (__A); } - -__funline __m256i _mm256_broadcastmb_epi64(__mmask8 __A) { - return (__m256i)__builtin_ia32_broadcastmb256(__A); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_broadcastmb_epi64 (__mmask8 __A) +{ + return (__m256i) __builtin_ia32_broadcastmb256 (__A); } - -__funline __m128i _mm_broadcastmw_epi32(__mmask16 __A) { - return (__m128i)__builtin_ia32_broadcastmw128(__A); +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_broadcastmw_epi32 (__mmask16 __A) +{ + return (__m128i) __builtin_ia32_broadcastmw128 (__A); } - -__funline __m256i _mm256_broadcastmw_epi32(__mmask16 __A) { - return (__m256i)__builtin_ia32_broadcastmw256(__A); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_broadcastmw_epi32 (__mmask16 __A) +{ + return (__m256i) __builtin_ia32_broadcastmw256 (__A); } - -__funline __m256i _mm256_lzcnt_epi32(__m256i __A) { - return (__m256i)__builtin_ia32_vplzcntd_256_mask( - (__v8si)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)-1); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_lzcnt_epi32 (__m256i __A) +{ + return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) -1); } - -__funline __m256i _mm256_mask_lzcnt_epi32(__m256i __W, __mmask8 __U, - __m256i __A) { - return (__m256i)__builtin_ia32_vplzcntd_256_mask((__v8si)__A, (__v8si)__W, - (__mmask8)__U); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_lzcnt_epi32 (__m256i __W, __mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A, + (__v8si) __W, + (__mmask8) __U); } - -__funline __m256i _mm256_maskz_lzcnt_epi32(__mmask8 __U, __m256i __A) { - return (__m256i)__builtin_ia32_vplzcntd_256_mask( - (__v8si)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_lzcnt_epi32 (__mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); } - -__funline __m256i _mm256_lzcnt_epi64(__m256i __A) { - return 
(__m256i)__builtin_ia32_vplzcntq_256_mask( - (__v4di)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)-1); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_lzcnt_epi64 (__m256i __A) +{ + return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); } - -__funline __m256i _mm256_mask_lzcnt_epi64(__m256i __W, __mmask8 __U, - __m256i __A) { - return (__m256i)__builtin_ia32_vplzcntq_256_mask((__v4di)__A, (__v4di)__W, - (__mmask8)__U); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_lzcnt_epi64 (__m256i __W, __mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A, + (__v4di) __W, + (__mmask8) __U); } - -__funline __m256i _mm256_maskz_lzcnt_epi64(__mmask8 __U, __m256i __A) { - return (__m256i)__builtin_ia32_vplzcntq_256_mask( - (__v4di)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_lzcnt_epi64 (__mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); } - -__funline __m256i _mm256_conflict_epi64(__m256i __A) { - return (__m256i)__builtin_ia32_vpconflictdi_256_mask( - (__v4di)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)-1); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_conflict_epi64 (__m256i __A) +{ + return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); } - -__funline __m256i _mm256_mask_conflict_epi64(__m256i __W, __mmask8 __U, - __m256i __A) { - return (__m256i)__builtin_ia32_vpconflictdi_256_mask((__v4di)__A, (__v4di)__W, - (__mmask8)__U); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_conflict_epi64 (__m256i __W, __mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A, + (__v4di) __W, + (__mmask8) + __U); } - -__funline __m256i _mm256_maskz_conflict_epi64(__mmask8 __U, __m256i __A) { - return (__m256i)__builtin_ia32_vpconflictdi_256_mask( - (__v4di)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_conflict_epi64 (__mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) + __U); } - -__funline __m256i _mm256_conflict_epi32(__m256i __A) { - return (__m256i)__builtin_ia32_vpconflictsi_256_mask( - (__v8si)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)-1); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_conflict_epi32 (__m256i __A) +{ + return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) -1); } - -__funline __m256i _mm256_mask_conflict_epi32(__m256i __W, __mmask8 __U, - __m256i __A) { - return (__m256i)__builtin_ia32_vpconflictsi_256_mask((__v8si)__A, (__v8si)__W, - (__mmask8)__U); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_conflict_epi32 (__m256i __W, __mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A, + (__v8si) __W, + 
(__mmask8) + __U); } - -__funline __m256i _mm256_maskz_conflict_epi32(__mmask8 __U, __m256i __A) { - return (__m256i)__builtin_ia32_vpconflictsi_256_mask( - (__v8si)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_conflict_epi32 (__mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) + __U); } - -__funline __m128i _mm_lzcnt_epi32(__m128i __A) { - return (__m128i)__builtin_ia32_vplzcntd_128_mask( - (__v4si)__A, (__v4si)_mm_setzero_si128(), (__mmask8)-1); +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_lzcnt_epi32 (__m128i __A) +{ + return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) -1); } - -__funline __m128i _mm_mask_lzcnt_epi32(__m128i __W, __mmask8 __U, __m128i __A) { - return (__m128i)__builtin_ia32_vplzcntd_128_mask((__v4si)__A, (__v4si)__W, - (__mmask8)__U); +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_lzcnt_epi32 (__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A, + (__v4si) __W, + (__mmask8) __U); } - -__funline __m128i _mm_maskz_lzcnt_epi32(__mmask8 __U, __m128i __A) { - return (__m128i)__builtin_ia32_vplzcntd_128_mask( - (__v4si)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U); +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_lzcnt_epi32 (__mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); } - -__funline __m128i _mm_lzcnt_epi64(__m128i __A) { - return (__m128i)__builtin_ia32_vplzcntq_128_mask( - (__v2di)__A, (__v2di)_mm_setzero_si128(), (__mmask8)-1); +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_lzcnt_epi64 (__m128i __A) +{ + return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) -1); } - -__funline __m128i _mm_mask_lzcnt_epi64(__m128i __W, __mmask8 __U, __m128i __A) { - return (__m128i)__builtin_ia32_vplzcntq_128_mask((__v2di)__A, (__v2di)__W, - (__mmask8)__U); +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_lzcnt_epi64 (__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A, + (__v2di) __W, + (__mmask8) __U); } - -__funline __m128i _mm_maskz_lzcnt_epi64(__mmask8 __U, __m128i __A) { - return (__m128i)__builtin_ia32_vplzcntq_128_mask( - (__v2di)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U); +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_lzcnt_epi64 (__mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); } - -__funline __m128i _mm_conflict_epi64(__m128i __A) { - return (__m128i)__builtin_ia32_vpconflictdi_128_mask( - (__v2di)__A, (__v2di)_mm_setzero_si128(), (__mmask8)-1); +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_conflict_epi64 (__m128i __A) +{ + return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) -1); } - 
-__funline __m128i _mm_mask_conflict_epi64(__m128i __W, __mmask8 __U, - __m128i __A) { - return (__m128i)__builtin_ia32_vpconflictdi_128_mask((__v2di)__A, (__v2di)__W, - (__mmask8)__U); +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_conflict_epi64 (__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A, + (__v2di) __W, + (__mmask8) + __U); } - -__funline __m128i _mm_maskz_conflict_epi64(__mmask8 __U, __m128i __A) { - return (__m128i)__builtin_ia32_vpconflictdi_128_mask( - (__v2di)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U); +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_conflict_epi64 (__mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) + __U); } - -__funline __m128i _mm_conflict_epi32(__m128i __A) { - return (__m128i)__builtin_ia32_vpconflictsi_128_mask( - (__v4si)__A, (__v4si)_mm_setzero_si128(), (__mmask8)-1); +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_conflict_epi32 (__m128i __A) +{ + return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) -1); } - -__funline __m128i _mm_mask_conflict_epi32(__m128i __W, __mmask8 __U, - __m128i __A) { - return (__m128i)__builtin_ia32_vpconflictsi_128_mask((__v4si)__A, (__v4si)__W, - (__mmask8)__U); +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_conflict_epi32 (__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A, + (__v4si) __W, + (__mmask8) + __U); } - -__funline __m128i _mm_maskz_conflict_epi32(__mmask8 __U, __m128i __A) { - return (__m128i)__builtin_ia32_vpconflictsi_128_mask( - (__v4si)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U); +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_conflict_epi32 (__mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) + __U); } - #ifdef __DISABLE_AVX512VLCD__ #pragma GCC pop_options #endif - -__funline __m256d _mm256_mask_unpacklo_pd(__m256d __W, __mmask8 __U, __m256d __A, - __m256d __B) { - return (__m256d)__builtin_ia32_unpcklpd256_mask((__v4df)__A, (__v4df)__B, - (__v4df)__W, (__mmask8)__U); -} - -__funline __m256d _mm256_maskz_unpacklo_pd(__mmask8 __U, __m256d __A, - __m256d __B) { - return (__m256d)__builtin_ia32_unpcklpd256_mask( - (__v4df)__A, (__v4df)__B, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); -} - -__funline __m128d _mm_mask_unpacklo_pd(__m128d __W, __mmask8 __U, __m128d __A, - __m128d __B) { - return (__m128d)__builtin_ia32_unpcklpd128_mask((__v2df)__A, (__v2df)__B, - (__v2df)__W, (__mmask8)__U); -} - -__funline __m128d _mm_maskz_unpacklo_pd(__mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_unpcklpd128_mask( - (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U); -} - -__funline __m256 _mm256_mask_unpacklo_ps(__m256 __W, __mmask8 __U, __m256 __A, - __m256 __B) { - return (__m256)__builtin_ia32_unpcklps256_mask((__v8sf)__A, (__v8sf)__B, - (__v8sf)__W, (__mmask8)__U); -} - -__funline __m256d _mm256_mask_unpackhi_pd(__m256d __W, __mmask8 __U, __m256d __A, - __m256d __B) { - return 
(__m256d)__builtin_ia32_unpckhpd256_mask((__v4df)__A, (__v4df)__B, - (__v4df)__W, (__mmask8)__U); -} - -__funline __m256d _mm256_maskz_unpackhi_pd(__mmask8 __U, __m256d __A, - __m256d __B) { - return (__m256d)__builtin_ia32_unpckhpd256_mask( - (__v4df)__A, (__v4df)__B, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); -} - -__funline __m128d _mm_mask_unpackhi_pd(__m128d __W, __mmask8 __U, __m128d __A, - __m128d __B) { - return (__m128d)__builtin_ia32_unpckhpd128_mask((__v2df)__A, (__v2df)__B, - (__v2df)__W, (__mmask8)__U); -} - -__funline __m128d _mm_maskz_unpackhi_pd(__mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_unpckhpd128_mask( - (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U); -} - -__funline __m256 _mm256_mask_unpackhi_ps(__m256 __W, __mmask8 __U, __m256 __A, - __m256 __B) { - return (__m256)__builtin_ia32_unpckhps256_mask((__v8sf)__A, (__v8sf)__B, - (__v8sf)__W, (__mmask8)__U); -} - -__funline __m256 _mm256_maskz_unpackhi_ps(__mmask8 __U, __m256 __A, __m256 __B) { - return (__m256)__builtin_ia32_unpckhps256_mask( - (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); -} - -__funline __m128 _mm_mask_unpackhi_ps(__m128 __W, __mmask8 __U, __m128 __A, - __m128 __B) { - return (__m128)__builtin_ia32_unpckhps128_mask((__v4sf)__A, (__v4sf)__B, - (__v4sf)__W, (__mmask8)__U); -} - -__funline __m128 _mm_maskz_unpackhi_ps(__mmask8 __U, __m128 __A, __m128 __B) { - return (__m128)__builtin_ia32_unpckhps128_mask( - (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); -} - -__funline __m128 _mm_mask_cvtph_ps(__m128 __W, __mmask8 __U, __m128i __A) { - return (__m128)__builtin_ia32_vcvtph2ps_mask((__v8hi)__A, (__v4sf)__W, - (__mmask8)__U); -} - -__funline __m128 _mm_maskz_cvtph_ps(__mmask8 __U, __m128i __A) { - return (__m128)__builtin_ia32_vcvtph2ps_mask( - (__v8hi)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); -} - -__funline __m256 _mm256_maskz_unpacklo_ps(__mmask8 __U, __m256 __A, __m256 __B) { - return (__m256)__builtin_ia32_unpcklps256_mask( - (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); -} - -__funline __m256 _mm256_mask_cvtph_ps(__m256 __W, __mmask8 __U, __m128i __A) { - return (__m256)__builtin_ia32_vcvtph2ps256_mask((__v8hi)__A, (__v8sf)__W, - (__mmask8)__U); -} - -__funline __m256 _mm256_maskz_cvtph_ps(__mmask8 __U, __m128i __A) { - return (__m256)__builtin_ia32_vcvtph2ps256_mask( - (__v8hi)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); -} - -__funline __m128 _mm_mask_unpacklo_ps(__m128 __W, __mmask8 __U, __m128 __A, - __m128 __B) { - return (__m128)__builtin_ia32_unpcklps128_mask((__v4sf)__A, (__v4sf)__B, - (__v4sf)__W, (__mmask8)__U); -} - -__funline __m128 _mm_maskz_unpacklo_ps(__mmask8 __U, __m128 __A, __m128 __B) { - return (__m128)__builtin_ia32_unpcklps128_mask( - (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); -} - -__funline __m256i _mm256_mask_sra_epi32(__m256i __W, __mmask8 __U, __m256i __A, - __m128i __B) { - return (__m256i)__builtin_ia32_psrad256_mask((__v8si)__A, (__v4si)__B, - (__v8si)__W, (__mmask8)__U); -} - -__funline __m256i _mm256_maskz_sra_epi32(__mmask8 __U, __m256i __A, __m128i __B) { - return (__m256i)__builtin_ia32_psrad256_mask( - (__v8si)__A, (__v4si)__B, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); -} - -__funline __m128i _mm_mask_sra_epi32(__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_psrad128_mask((__v4si)__A, (__v4si)__B, - (__v4si)__W, (__mmask8)__U); -} - -__funline __m128i 
_mm_maskz_sra_epi32(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_psrad128_mask( - (__v4si)__A, (__v4si)__B, (__v4si)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m256i _mm256_sra_epi64(__m256i __A, __m128i __B) { - return (__m256i)__builtin_ia32_psraq256_mask( - (__v4di)__A, (__v2di)__B, (__v4di)_mm256_setzero_si256(), (__mmask8)-1); -} - -__funline __m256i _mm256_mask_sra_epi64(__m256i __W, __mmask8 __U, __m256i __A, - __m128i __B) { - return (__m256i)__builtin_ia32_psraq256_mask((__v4di)__A, (__v2di)__B, - (__v4di)__W, (__mmask8)__U); -} - -__funline __m256i _mm256_maskz_sra_epi64(__mmask8 __U, __m256i __A, __m128i __B) { - return (__m256i)__builtin_ia32_psraq256_mask( - (__v4di)__A, (__v2di)__B, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); -} - -__funline __m128i _mm_sra_epi64(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_psraq128_mask( - (__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), (__mmask8)-1); -} - -__funline __m128i _mm_mask_sra_epi64(__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_psraq128_mask((__v2di)__A, (__v2di)__B, - (__v2di)__W, (__mmask8)__U); -} - -__funline __m128i _mm_maskz_sra_epi64(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_psraq128_mask( - (__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m128i _mm_mask_sll_epi32(__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_pslld128_mask((__v4si)__A, (__v4si)__B, - (__v4si)__W, (__mmask8)__U); -} - -__funline __m128i _mm_maskz_sll_epi32(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_pslld128_mask( - (__v4si)__A, (__v4si)__B, (__v4si)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m128i _mm_mask_sll_epi64(__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_psllq128_mask((__v2di)__A, (__v2di)__B, - (__v2di)__W, (__mmask8)__U); -} - -__funline __m128i _mm_maskz_sll_epi64(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_psllq128_mask( - (__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m256i _mm256_mask_sll_epi32(__m256i __W, __mmask8 __U, __m256i __A, - __m128i __B) { - return (__m256i)__builtin_ia32_pslld256_mask((__v8si)__A, (__v4si)__B, - (__v8si)__W, (__mmask8)__U); -} - -__funline __m256i _mm256_maskz_sll_epi32(__mmask8 __U, __m256i __A, __m128i __B) { - return (__m256i)__builtin_ia32_pslld256_mask( - (__v8si)__A, (__v4si)__B, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); -} - -__funline __m256i _mm256_mask_sll_epi64(__m256i __W, __mmask8 __U, __m256i __A, - __m128i __B) { - return (__m256i)__builtin_ia32_psllq256_mask((__v4di)__A, (__v2di)__B, - (__v4di)__W, (__mmask8)__U); -} - -__funline __m256i _mm256_maskz_sll_epi64(__mmask8 __U, __m256i __A, __m128i __B) { - return (__m256i)__builtin_ia32_psllq256_mask( - (__v4di)__A, (__v2di)__B, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); -} - -__funline __m256 _mm256_mask_permutexvar_ps(__m256 __W, __mmask8 __U, __m256i __X, - __m256 __Y) { - return (__m256)__builtin_ia32_permvarsf256_mask((__v8sf)__Y, (__v8si)__X, - (__v8sf)__W, (__mmask8)__U); -} - -__funline __m256 _mm256_maskz_permutexvar_ps(__mmask8 __U, __m256i __X, - __m256 __Y) { - return (__m256)__builtin_ia32_permvarsf256_mask( - (__v8sf)__Y, (__v8si)__X, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); -} - -__funline __m256d _mm256_permutexvar_pd(__m256i __X, __m256d 
__Y) { - return (__m256d)__builtin_ia32_permvardf256_mask( - (__v4df)__Y, (__v4di)__X, (__v4df)_mm256_setzero_pd(), (__mmask8)-1); -} - -__funline __m256d _mm256_mask_permutexvar_pd(__m256d __W, __mmask8 __U, - __m256i __X, __m256d __Y) { - return (__m256d)__builtin_ia32_permvardf256_mask((__v4df)__Y, (__v4di)__X, - (__v4df)__W, (__mmask8)__U); -} - -__funline __m256d _mm256_maskz_permutexvar_pd(__mmask8 __U, __m256i __X, - __m256d __Y) { - return (__m256d)__builtin_ia32_permvardf256_mask( - (__v4df)__Y, (__v4di)__X, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); -} - -__funline __m256d _mm256_mask_permutevar_pd(__m256d __W, __mmask8 __U, - __m256d __A, __m256i __C) { - return (__m256d)__builtin_ia32_vpermilvarpd256_mask( - (__v4df)__A, (__v4di)__C, (__v4df)__W, (__mmask8)__U); -} - -__funline __m256d _mm256_maskz_permutevar_pd(__mmask8 __U, __m256d __A, - __m256i __C) { - return (__m256d)__builtin_ia32_vpermilvarpd256_mask( - (__v4df)__A, (__v4di)__C, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); -} - -__funline __m256 _mm256_mask_permutevar_ps(__m256 __W, __mmask8 __U, __m256 __A, - __m256i __C) { - return (__m256)__builtin_ia32_vpermilvarps256_mask( - (__v8sf)__A, (__v8si)__C, (__v8sf)__W, (__mmask8)__U); -} - -__funline __m256 _mm256_maskz_permutevar_ps(__mmask8 __U, __m256 __A, - __m256i __C) { - return (__m256)__builtin_ia32_vpermilvarps256_mask( - (__v8sf)__A, (__v8si)__C, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); -} - -__funline __m128d _mm_mask_permutevar_pd(__m128d __W, __mmask8 __U, __m128d __A, - __m128i __C) { - return (__m128d)__builtin_ia32_vpermilvarpd_mask((__v2df)__A, (__v2di)__C, - (__v2df)__W, (__mmask8)__U); -} - -__funline __m128d _mm_maskz_permutevar_pd(__mmask8 __U, __m128d __A, - __m128i __C) { - return (__m128d)__builtin_ia32_vpermilvarpd_mask( - (__v2df)__A, (__v2di)__C, (__v2df)_mm_setzero_pd(), (__mmask8)__U); -} - -__funline __m128 _mm_mask_permutevar_ps(__m128 __W, __mmask8 __U, __m128 __A, - __m128i __C) { - return (__m128)__builtin_ia32_vpermilvarps_mask((__v4sf)__A, (__v4si)__C, - (__v4sf)__W, (__mmask8)__U); -} - -__funline __m128 _mm_maskz_permutevar_ps(__mmask8 __U, __m128 __A, __m128i __C) { - return (__m128)__builtin_ia32_vpermilvarps_mask( - (__v4sf)__A, (__v4si)__C, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); -} - -__funline __m256i _mm256_maskz_mullo_epi32(__mmask8 __M, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_pmulld256_mask( - (__v8si)__A, (__v8si)__B, (__v8si)_mm256_setzero_si256(), __M); -} - -__funline __m256i _mm256_maskz_permutexvar_epi64(__mmask8 __M, __m256i __X, - __m256i __Y) { - return (__m256i)__builtin_ia32_permvardi256_mask( - (__v4di)__Y, (__v4di)__X, (__v4di)_mm256_setzero_si256(), __M); -} - -__funline __m256i _mm256_mask_mullo_epi32(__m256i __W, __mmask8 __M, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_pmulld256_mask((__v8si)__A, (__v8si)__B, - (__v8si)__W, __M); -} - -__funline __m128i _mm_maskz_mullo_epi32(__mmask8 __M, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_pmulld128_mask( - (__v4si)__A, (__v4si)__B, (__v4si)_mm_setzero_si128(), __M); -} - -__funline __m128i _mm_mask_mullo_epi32(__m128i __W, __mmask8 __M, __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_pmulld128_mask((__v4si)__A, (__v4si)__B, - (__v4si)__W, __M); -} - -__funline __m256i _mm256_mask_mul_epi32(__m256i __W, __mmask8 __M, __m256i __X, - __m256i __Y) { - return (__m256i)__builtin_ia32_pmuldq256_mask((__v8si)__X, (__v8si)__Y, - (__v4di)__W, __M); -} - -__funline __m256i 
_mm256_maskz_mul_epi32(__mmask8 __M, __m256i __X, __m256i __Y) { - return (__m256i)__builtin_ia32_pmuldq256_mask( - (__v8si)__X, (__v8si)__Y, (__v4di)_mm256_setzero_si256(), __M); -} - -__funline __m128i _mm_mask_mul_epi32(__m128i __W, __mmask8 __M, __m128i __X, - __m128i __Y) { - return (__m128i)__builtin_ia32_pmuldq128_mask((__v4si)__X, (__v4si)__Y, - (__v2di)__W, __M); -} - -__funline __m128i _mm_maskz_mul_epi32(__mmask8 __M, __m128i __X, __m128i __Y) { - return (__m128i)__builtin_ia32_pmuldq128_mask( - (__v4si)__X, (__v4si)__Y, (__v2di)_mm_setzero_si128(), __M); -} - -__funline __m256i _mm256_permutexvar_epi64(__m256i __X, __m256i __Y) { - return (__m256i)__builtin_ia32_permvardi256_mask( - (__v4di)__Y, (__v4di)__X, (__v4di)_mm256_setzero_si256(), (__mmask8)-1); -} - -__funline __m256i _mm256_mask_permutexvar_epi64(__m256i __W, __mmask8 __M, - __m256i __X, __m256i __Y) { - return (__m256i)__builtin_ia32_permvardi256_mask((__v4di)__Y, (__v4di)__X, - (__v4di)__W, __M); -} - -__funline __m256i _mm256_mask_mul_epu32(__m256i __W, __mmask8 __M, __m256i __X, - __m256i __Y) { - return (__m256i)__builtin_ia32_pmuludq256_mask((__v8si)__X, (__v8si)__Y, - (__v4di)__W, __M); -} - -__funline __m256i _mm256_maskz_permutexvar_epi32(__mmask8 __M, __m256i __X, - __m256i __Y) { - return (__m256i)__builtin_ia32_permvarsi256_mask( - (__v8si)__Y, (__v8si)__X, (__v8si)_mm256_setzero_si256(), __M); -} - -__funline __m256i _mm256_maskz_mul_epu32(__mmask8 __M, __m256i __X, __m256i __Y) { - return (__m256i)__builtin_ia32_pmuludq256_mask( - (__v8si)__X, (__v8si)__Y, (__v4di)_mm256_setzero_si256(), __M); -} - -__funline __m128i _mm_mask_mul_epu32(__m128i __W, __mmask8 __M, __m128i __X, - __m128i __Y) { - return (__m128i)__builtin_ia32_pmuludq128_mask((__v4si)__X, (__v4si)__Y, - (__v2di)__W, __M); -} - -__funline __m128i _mm_maskz_mul_epu32(__mmask8 __M, __m128i __X, __m128i __Y) { - return (__m128i)__builtin_ia32_pmuludq128_mask( - (__v4si)__X, (__v4si)__Y, (__v2di)_mm_setzero_si128(), __M); -} - -__funline __m256i _mm256_permutexvar_epi32(__m256i __X, __m256i __Y) { - return (__m256i)__builtin_ia32_permvarsi256_mask( - (__v8si)__Y, (__v8si)__X, (__v8si)_mm256_setzero_si256(), (__mmask8)-1); -} - -__funline __m256i _mm256_mask_permutexvar_epi32(__m256i __W, __mmask8 __M, - __m256i __X, __m256i __Y) { - return (__m256i)__builtin_ia32_permvarsi256_mask((__v8si)__Y, (__v8si)__X, - (__v8si)__W, __M); -} - -__funline __mmask8 _mm256_mask_cmpneq_epu32_mask(__mmask8 __M, __m256i __X, - __m256i __Y) { - return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__X, (__v8si)__Y, 4, - (__mmask8)__M); -} - -__funline __mmask8 _mm256_cmpneq_epu32_mask(__m256i __X, __m256i __Y) { - return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__X, (__v8si)__Y, 4, - (__mmask8)-1); -} - -__funline __mmask8 _mm256_mask_cmplt_epu32_mask(__mmask8 __M, __m256i __X, - __m256i __Y) { - return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__X, (__v8si)__Y, 1, - (__mmask8)__M); -} - -__funline __mmask8 _mm256_cmplt_epu32_mask(__m256i __X, __m256i __Y) { - return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__X, (__v8si)__Y, 1, - (__mmask8)-1); -} - -__funline __mmask8 _mm256_mask_cmpge_epu32_mask(__mmask8 __M, __m256i __X, - __m256i __Y) { - return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__X, (__v8si)__Y, 5, - (__mmask8)__M); -} - -__funline __mmask8 _mm256_cmpge_epu32_mask(__m256i __X, __m256i __Y) { - return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__X, (__v8si)__Y, 5, - (__mmask8)-1); -} - -__funline __mmask8 
_mm256_mask_cmple_epu32_mask(__mmask8 __M, __m256i __X, - __m256i __Y) { - return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__X, (__v8si)__Y, 2, - (__mmask8)__M); -} - -__funline __mmask8 _mm256_cmple_epu32_mask(__m256i __X, __m256i __Y) { - return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__X, (__v8si)__Y, 2, - (__mmask8)-1); -} - -__funline __mmask8 _mm256_mask_cmpneq_epu64_mask(__mmask8 __M, __m256i __X, - __m256i __Y) { - return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__X, (__v4di)__Y, 4, - (__mmask8)__M); -} - -__funline __mmask8 _mm256_cmpneq_epu64_mask(__m256i __X, __m256i __Y) { - return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__X, (__v4di)__Y, 4, - (__mmask8)-1); -} - -__funline __mmask8 _mm256_mask_cmplt_epu64_mask(__mmask8 __M, __m256i __X, - __m256i __Y) { - return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__X, (__v4di)__Y, 1, - (__mmask8)__M); -} - -__funline __mmask8 _mm256_cmplt_epu64_mask(__m256i __X, __m256i __Y) { - return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__X, (__v4di)__Y, 1, - (__mmask8)-1); -} - -__funline __mmask8 _mm256_mask_cmpge_epu64_mask(__mmask8 __M, __m256i __X, - __m256i __Y) { - return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__X, (__v4di)__Y, 5, - (__mmask8)__M); -} - -__funline __mmask8 _mm256_cmpge_epu64_mask(__m256i __X, __m256i __Y) { - return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__X, (__v4di)__Y, 5, - (__mmask8)-1); -} - -__funline __mmask8 _mm256_mask_cmple_epu64_mask(__mmask8 __M, __m256i __X, - __m256i __Y) { - return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__X, (__v4di)__Y, 2, - (__mmask8)__M); -} - -__funline __mmask8 _mm256_cmple_epu64_mask(__m256i __X, __m256i __Y) { - return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__X, (__v4di)__Y, 2, - (__mmask8)-1); -} - -__funline __mmask8 _mm256_mask_cmpneq_epi32_mask(__mmask8 __M, __m256i __X, - __m256i __Y) { - return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__X, (__v8si)__Y, 4, - (__mmask8)__M); -} - -__funline __mmask8 _mm256_cmpneq_epi32_mask(__m256i __X, __m256i __Y) { - return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__X, (__v8si)__Y, 4, - (__mmask8)-1); -} - -__funline __mmask8 _mm256_mask_cmplt_epi32_mask(__mmask8 __M, __m256i __X, - __m256i __Y) { - return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__X, (__v8si)__Y, 1, - (__mmask8)__M); -} - -__funline __mmask8 _mm256_cmplt_epi32_mask(__m256i __X, __m256i __Y) { - return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__X, (__v8si)__Y, 1, - (__mmask8)-1); -} - -__funline __mmask8 _mm256_mask_cmpge_epi32_mask(__mmask8 __M, __m256i __X, - __m256i __Y) { - return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__X, (__v8si)__Y, 5, - (__mmask8)__M); -} - -__funline __mmask8 _mm256_cmpge_epi32_mask(__m256i __X, __m256i __Y) { - return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__X, (__v8si)__Y, 5, - (__mmask8)-1); -} - -__funline __mmask8 _mm256_mask_cmple_epi32_mask(__mmask8 __M, __m256i __X, - __m256i __Y) { - return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__X, (__v8si)__Y, 2, - (__mmask8)__M); -} - -__funline __mmask8 _mm256_cmple_epi32_mask(__m256i __X, __m256i __Y) { - return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__X, (__v8si)__Y, 2, - (__mmask8)-1); -} - -__funline __mmask8 _mm256_mask_cmpneq_epi64_mask(__mmask8 __M, __m256i __X, - __m256i __Y) { - return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__X, (__v4di)__Y, 4, - (__mmask8)__M); -} - -__funline __mmask8 _mm256_cmpneq_epi64_mask(__m256i __X, __m256i __Y) { - return 
(__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__X, (__v4di)__Y, 4, - (__mmask8)-1); -} - -__funline __mmask8 _mm256_mask_cmplt_epi64_mask(__mmask8 __M, __m256i __X, - __m256i __Y) { - return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__X, (__v4di)__Y, 1, - (__mmask8)__M); -} - -__funline __mmask8 _mm256_cmplt_epi64_mask(__m256i __X, __m256i __Y) { - return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__X, (__v4di)__Y, 1, - (__mmask8)-1); -} - -__funline __mmask8 _mm256_mask_cmpge_epi64_mask(__mmask8 __M, __m256i __X, - __m256i __Y) { - return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__X, (__v4di)__Y, 5, - (__mmask8)__M); -} - -__funline __mmask8 _mm256_cmpge_epi64_mask(__m256i __X, __m256i __Y) { - return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__X, (__v4di)__Y, 5, - (__mmask8)-1); -} - -__funline __mmask8 _mm256_mask_cmple_epi64_mask(__mmask8 __M, __m256i __X, - __m256i __Y) { - return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__X, (__v4di)__Y, 2, - (__mmask8)__M); -} - -__funline __mmask8 _mm256_cmple_epi64_mask(__m256i __X, __m256i __Y) { - return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__X, (__v4di)__Y, 2, - (__mmask8)-1); -} - -__funline __mmask8 _mm_mask_cmpneq_epu32_mask(__mmask8 __M, __m128i __X, - __m128i __Y) { - return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__X, (__v4si)__Y, 4, - (__mmask8)__M); -} - -__funline __mmask8 _mm_cmpneq_epu32_mask(__m128i __X, __m128i __Y) { - return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__X, (__v4si)__Y, 4, - (__mmask8)-1); -} - -__funline __mmask8 _mm_mask_cmplt_epu32_mask(__mmask8 __M, __m128i __X, - __m128i __Y) { - return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__X, (__v4si)__Y, 1, - (__mmask8)__M); -} - -__funline __mmask8 _mm_cmplt_epu32_mask(__m128i __X, __m128i __Y) { - return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__X, (__v4si)__Y, 1, - (__mmask8)-1); -} - -__funline __mmask8 _mm_mask_cmpge_epu32_mask(__mmask8 __M, __m128i __X, - __m128i __Y) { - return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__X, (__v4si)__Y, 5, - (__mmask8)__M); -} - -__funline __mmask8 _mm_cmpge_epu32_mask(__m128i __X, __m128i __Y) { - return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__X, (__v4si)__Y, 5, - (__mmask8)-1); -} - -__funline __mmask8 _mm_mask_cmple_epu32_mask(__mmask8 __M, __m128i __X, - __m128i __Y) { - return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__X, (__v4si)__Y, 2, - (__mmask8)__M); -} - -__funline __mmask8 _mm_cmple_epu32_mask(__m128i __X, __m128i __Y) { - return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__X, (__v4si)__Y, 2, - (__mmask8)-1); -} - -__funline __mmask8 _mm_mask_cmpneq_epu64_mask(__mmask8 __M, __m128i __X, - __m128i __Y) { - return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__X, (__v2di)__Y, 4, - (__mmask8)__M); -} - -__funline __mmask8 _mm_cmpneq_epu64_mask(__m128i __X, __m128i __Y) { - return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__X, (__v2di)__Y, 4, - (__mmask8)-1); -} - -__funline __mmask8 _mm_mask_cmplt_epu64_mask(__mmask8 __M, __m128i __X, - __m128i __Y) { - return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__X, (__v2di)__Y, 1, - (__mmask8)__M); -} - -__funline __mmask8 _mm_cmplt_epu64_mask(__m128i __X, __m128i __Y) { - return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__X, (__v2di)__Y, 1, - (__mmask8)-1); -} - -__funline __mmask8 _mm_mask_cmpge_epu64_mask(__mmask8 __M, __m128i __X, - __m128i __Y) { - return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__X, (__v2di)__Y, 5, - (__mmask8)__M); -} - -__funline __mmask8 
_mm_cmpge_epu64_mask(__m128i __X, __m128i __Y) { - return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__X, (__v2di)__Y, 5, - (__mmask8)-1); -} - -__funline __mmask8 _mm_mask_cmple_epu64_mask(__mmask8 __M, __m128i __X, - __m128i __Y) { - return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__X, (__v2di)__Y, 2, - (__mmask8)__M); -} - -__funline __mmask8 _mm_cmple_epu64_mask(__m128i __X, __m128i __Y) { - return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__X, (__v2di)__Y, 2, - (__mmask8)-1); -} - -__funline __mmask8 _mm_mask_cmpneq_epi32_mask(__mmask8 __M, __m128i __X, - __m128i __Y) { - return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__X, (__v4si)__Y, 4, - (__mmask8)__M); -} - -__funline __mmask8 _mm_cmpneq_epi32_mask(__m128i __X, __m128i __Y) { - return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__X, (__v4si)__Y, 4, - (__mmask8)-1); -} - -__funline __mmask8 _mm_mask_cmplt_epi32_mask(__mmask8 __M, __m128i __X, - __m128i __Y) { - return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__X, (__v4si)__Y, 1, - (__mmask8)__M); -} - -__funline __mmask8 _mm_cmplt_epi32_mask(__m128i __X, __m128i __Y) { - return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__X, (__v4si)__Y, 1, - (__mmask8)-1); -} - -__funline __mmask8 _mm_mask_cmpge_epi32_mask(__mmask8 __M, __m128i __X, - __m128i __Y) { - return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__X, (__v4si)__Y, 5, - (__mmask8)__M); -} - -__funline __mmask8 _mm_cmpge_epi32_mask(__m128i __X, __m128i __Y) { - return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__X, (__v4si)__Y, 5, - (__mmask8)-1); -} - -__funline __mmask8 _mm_mask_cmple_epi32_mask(__mmask8 __M, __m128i __X, - __m128i __Y) { - return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__X, (__v4si)__Y, 2, - (__mmask8)__M); -} - -__funline __mmask8 _mm_cmple_epi32_mask(__m128i __X, __m128i __Y) { - return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__X, (__v4si)__Y, 2, - (__mmask8)-1); -} - -__funline __mmask8 _mm_mask_cmpneq_epi64_mask(__mmask8 __M, __m128i __X, - __m128i __Y) { - return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__X, (__v2di)__Y, 4, - (__mmask8)__M); -} - -__funline __mmask8 _mm_cmpneq_epi64_mask(__m128i __X, __m128i __Y) { - return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__X, (__v2di)__Y, 4, - (__mmask8)-1); -} - -__funline __mmask8 _mm_mask_cmplt_epi64_mask(__mmask8 __M, __m128i __X, - __m128i __Y) { - return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__X, (__v2di)__Y, 1, - (__mmask8)__M); -} - -__funline __mmask8 _mm_cmplt_epi64_mask(__m128i __X, __m128i __Y) { - return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__X, (__v2di)__Y, 1, - (__mmask8)-1); -} - -__funline __mmask8 _mm_mask_cmpge_epi64_mask(__mmask8 __M, __m128i __X, - __m128i __Y) { - return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__X, (__v2di)__Y, 5, - (__mmask8)__M); -} - -__funline __mmask8 _mm_cmpge_epi64_mask(__m128i __X, __m128i __Y) { - return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__X, (__v2di)__Y, 5, - (__mmask8)-1); -} - -__funline __mmask8 _mm_mask_cmple_epi64_mask(__mmask8 __M, __m128i __X, - __m128i __Y) { - return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__X, (__v2di)__Y, 2, - (__mmask8)__M); -} - -__funline __mmask8 _mm_cmple_epi64_mask(__m128i __X, __m128i __Y) { - return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__X, (__v2di)__Y, 2, - (__mmask8)-1); -} - +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_unpacklo_pd (__m256d __W, __mmask8 __U, __m256d __A, + __m256d __B) +{ + return (__m256d) 
__builtin_ia32_unpcklpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) __W, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_unpacklo_pd (__mmask8 __U, __m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_unpcklpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_unpacklo_pd (__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B) +{ + return (__m128d) __builtin_ia32_unpcklpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_unpacklo_pd (__mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_unpcklpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_unpacklo_ps (__m256 __W, __mmask8 __U, __m256 __A, + __m256 __B) +{ + return (__m256) __builtin_ia32_unpcklps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __W, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_unpackhi_pd (__m256d __W, __mmask8 __U, __m256d __A, + __m256d __B) +{ + return (__m256d) __builtin_ia32_unpckhpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) __W, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_unpackhi_pd (__mmask8 __U, __m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_unpckhpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_unpackhi_pd (__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B) +{ + return (__m128d) __builtin_ia32_unpckhpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_unpackhi_pd (__mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_unpckhpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_unpackhi_ps (__m256 __W, __mmask8 __U, __m256 __A, + __m256 __B) +{ + return (__m256) __builtin_ia32_unpckhps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __W, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_unpackhi_ps (__mmask8 __U, __m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_unpckhps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_unpackhi_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_unpckhps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_unpackhi_ps (__mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) 
__builtin_ia32_unpckhps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtph_ps (__m128 __W, __mmask8 __U, __m128i __A) +{ + return (__m128) __builtin_ia32_vcvtph2ps_mask ((__v8hi) __A, + (__v4sf) __W, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtph_ps (__mmask8 __U, __m128i __A) +{ + return (__m128) __builtin_ia32_vcvtph2ps_mask ((__v8hi) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_unpacklo_ps (__mmask8 __U, __m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_unpcklps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtph_ps (__m256 __W, __mmask8 __U, __m128i __A) +{ + return (__m256) __builtin_ia32_vcvtph2ps256_mask ((__v8hi) __A, + (__v8sf) __W, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtph_ps (__mmask8 __U, __m128i __A) +{ + return (__m256) __builtin_ia32_vcvtph2ps256_mask ((__v8hi) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_unpacklo_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_unpcklps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_unpacklo_ps (__mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_unpcklps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_sra_epi32 (__m256i __W, __mmask8 __U, __m256i __A, + __m128i __B) +{ + return (__m256i) __builtin_ia32_psrad256_mask ((__v8si) __A, + (__v4si) __B, + (__v8si) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_sra_epi32 (__mmask8 __U, __m256i __A, __m128i __B) +{ + return (__m256i) __builtin_ia32_psrad256_mask ((__v8si) __A, + (__v4si) __B, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sra_epi32 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_psrad128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sra_epi32 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_psrad128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_sra_epi64 (__m256i __A, __m128i __B) +{ + return (__m256i) __builtin_ia32_psraq256_mask ((__v4di) __A, + (__v2di) __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ 
((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_sra_epi64 (__m256i __W, __mmask8 __U, __m256i __A, + __m128i __B) +{ + return (__m256i) __builtin_ia32_psraq256_mask ((__v4di) __A, + (__v2di) __B, + (__v4di) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_sra_epi64 (__mmask8 __U, __m256i __A, __m128i __B) +{ + return (__m256i) __builtin_ia32_psraq256_mask ((__v4di) __A, + (__v2di) __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sra_epi64 (__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_psraq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) -1); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sra_epi64 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_psraq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sra_epi64 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_psraq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sll_epi32 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pslld128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sll_epi32 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pslld128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sll_epi64 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_psllq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sll_epi64 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_psllq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_sll_epi32 (__m256i __W, __mmask8 __U, __m256i __A, + __m128i __B) +{ + return (__m256i) __builtin_ia32_pslld256_mask ((__v8si) __A, + (__v4si) __B, + (__v8si) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_sll_epi32 (__mmask8 __U, __m256i __A, __m128i __B) +{ + return (__m256i) __builtin_ia32_pslld256_mask ((__v8si) __A, + (__v4si) __B, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_sll_epi64 (__m256i __W, __mmask8 __U, __m256i __A, + __m128i __B) +{ + return (__m256i) __builtin_ia32_psllq256_mask ((__v4di) __A, + (__v2di) __B, + (__v4di) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, 
__artificial__)) +_mm256_maskz_sll_epi64 (__mmask8 __U, __m256i __A, __m128i __B) +{ + return (__m256i) __builtin_ia32_psllq256_mask ((__v4di) __A, + (__v2di) __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_permutexvar_ps (__m256 __W, __mmask8 __U, __m256i __X, + __m256 __Y) +{ + return (__m256) __builtin_ia32_permvarsf256_mask ((__v8sf) __Y, + (__v8si) __X, + (__v8sf) __W, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_permutexvar_ps (__mmask8 __U, __m256i __X, __m256 __Y) +{ + return (__m256) __builtin_ia32_permvarsf256_mask ((__v8sf) __Y, + (__v8si) __X, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permutexvar_pd (__m256i __X, __m256d __Y) +{ + return (__m256d) __builtin_ia32_permvardf256_mask ((__v4df) __Y, + (__v4di) __X, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) -1); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_permutexvar_pd (__m256d __W, __mmask8 __U, __m256i __X, + __m256d __Y) +{ + return (__m256d) __builtin_ia32_permvardf256_mask ((__v4df) __Y, + (__v4di) __X, + (__v4df) __W, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_permutexvar_pd (__mmask8 __U, __m256i __X, __m256d __Y) +{ + return (__m256d) __builtin_ia32_permvardf256_mask ((__v4df) __Y, + (__v4di) __X, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_permutevar_pd (__m256d __W, __mmask8 __U, __m256d __A, + __m256i __C) +{ + return (__m256d) __builtin_ia32_vpermilvarpd256_mask ((__v4df) __A, + (__v4di) __C, + (__v4df) __W, + (__mmask8) + __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_permutevar_pd (__mmask8 __U, __m256d __A, __m256i __C) +{ + return (__m256d) __builtin_ia32_vpermilvarpd256_mask ((__v4df) __A, + (__v4di) __C, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) + __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_permutevar_ps (__m256 __W, __mmask8 __U, __m256 __A, + __m256i __C) +{ + return (__m256) __builtin_ia32_vpermilvarps256_mask ((__v8sf) __A, + (__v8si) __C, + (__v8sf) __W, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_permutevar_ps (__mmask8 __U, __m256 __A, __m256i __C) +{ + return (__m256) __builtin_ia32_vpermilvarps256_mask ((__v8sf) __A, + (__v8si) __C, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_permutevar_pd (__m128d __W, __mmask8 __U, __m128d __A, + __m128i __C) +{ + return (__m128d) __builtin_ia32_vpermilvarpd_mask ((__v2df) __A, + (__v2di) __C, + (__v2df) __W, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_permutevar_pd (__mmask8 __U, __m128d __A, __m128i __C) +{ + return (__m128d) __builtin_ia32_vpermilvarpd_mask ((__v2df) __A, + (__v2di) __C, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} +extern 
__inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_permutevar_ps (__m128 __W, __mmask8 __U, __m128 __A, + __m128i __C) +{ + return (__m128) __builtin_ia32_vpermilvarps_mask ((__v4sf) __A, + (__v4si) __C, + (__v4sf) __W, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_permutevar_ps (__mmask8 __U, __m128 __A, __m128i __C) +{ + return (__m128) __builtin_ia32_vpermilvarps_mask ((__v4sf) __A, + (__v4si) __C, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_mullo_epi32 (__mmask8 __M, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pmulld256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) + _mm256_setzero_si256 (), + __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_permutexvar_epi64 (__mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_permvardi256_mask ((__v4di) __Y, + (__v4di) __X, + (__v4di) + _mm256_setzero_si256 (), + __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_mullo_epi32 (__m256i __W, __mmask8 __M, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pmulld256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) __W, __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_mullo_epi32 (__mmask8 __M, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pmulld128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) + _mm_setzero_si128 (), + __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_mullo_epi32 (__m128i __W, __mmask8 __M, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pmulld128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __W, __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_mul_epi32 (__m256i __W, __mmask8 __M, __m256i __X, + __m256i __Y) +{ + return (__m256i) __builtin_ia32_pmuldq256_mask ((__v8si) __X, + (__v8si) __Y, + (__v4di) __W, __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_mul_epi32 (__mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_pmuldq256_mask ((__v8si) __X, + (__v8si) __Y, + (__v4di) + _mm256_setzero_si256 (), + __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_mul_epi32 (__m128i __W, __mmask8 __M, __m128i __X, + __m128i __Y) +{ + return (__m128i) __builtin_ia32_pmuldq128_mask ((__v4si) __X, + (__v4si) __Y, + (__v2di) __W, __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_mul_epi32 (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_pmuldq128_mask ((__v4si) __X, + (__v4si) __Y, + (__v2di) + _mm_setzero_si128 (), + __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permutexvar_epi64 (__m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_permvardi256_mask ((__v4di) __Y, + (__v4di) __X, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
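/*
 * Illustrative aside, not part of the patch: the _mm*_permutevar_*
 * wrappers above emit VPERMILPS/VPERMILPD and shuffle within each
 * 128-bit lane, while the _mm256_permutexvar_* family indexes across
 * the whole vector. A hedged sketch, assuming an AVX-512VL target:
 *
 *   // reverse all eight floats; needs the cross-lane permutexvar form
 *   __m256 reverse8 (__m256 y)
 *   {
 *     const __m256i idx = _mm256_setr_epi32 (7, 6, 5, 4, 3, 2, 1, 0);
 *     return _mm256_permutexvar_ps (idx, y);
 *   }
 */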
+_mm256_mask_permutexvar_epi64 (__m256i __W, __mmask8 __M, __m256i __X, + __m256i __Y) +{ + return (__m256i) __builtin_ia32_permvardi256_mask ((__v4di) __Y, + (__v4di) __X, + (__v4di) __W, + __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_mul_epu32 (__m256i __W, __mmask8 __M, __m256i __X, + __m256i __Y) +{ + return (__m256i) __builtin_ia32_pmuludq256_mask ((__v8si) __X, + (__v8si) __Y, + (__v4di) __W, __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_permutexvar_epi32 (__mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_permvarsi256_mask ((__v8si) __Y, + (__v8si) __X, + (__v8si) + _mm256_setzero_si256 (), + __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_mul_epu32 (__mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_pmuludq256_mask ((__v8si) __X, + (__v8si) __Y, + (__v4di) + _mm256_setzero_si256 (), + __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_mul_epu32 (__m128i __W, __mmask8 __M, __m128i __X, + __m128i __Y) +{ + return (__m128i) __builtin_ia32_pmuludq128_mask ((__v4si) __X, + (__v4si) __Y, + (__v2di) __W, __M); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_mul_epu32 (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_pmuludq128_mask ((__v4si) __X, + (__v4si) __Y, + (__v2di) + _mm_setzero_si128 (), + __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permutexvar_epi32 (__m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_permvarsi256_mask ((__v8si) __Y, + (__v8si) __X, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_permutexvar_epi32 (__m256i __W, __mmask8 __M, __m256i __X, + __m256i __Y) +{ + return (__m256i) __builtin_ia32_permvarsi256_mask ((__v8si) __Y, + (__v8si) __X, + (__v8si) __W, + __M); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpneq_epu32_mask (__mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X, + (__v8si) __Y, 4, + (__mmask8) __M); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpneq_epu32_mask (__m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X, + (__v8si) __Y, 4, + (__mmask8) -1); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmplt_epu32_mask (__mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X, + (__v8si) __Y, 1, + (__mmask8) __M); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmplt_epu32_mask (__m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X, + (__v8si) __Y, 1, + (__mmask8) -1); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpge_epu32_mask (__mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X, + (__v8si) __Y, 5, + (__mmask8) 
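/*
 * Illustrative aside, not part of the patch: _mm256_mask_mul_epu32 above
 * wraps VPMULUDQ, which multiplies only the even 32-bit lanes (0, 2, 4,
 * 6) into full 64-bit products, unlike mullo, which keeps the low 32
 * bits of every lane. Sketch, assuming an AVX-512VL target:
 *
 *   __m256i widening_mul (__m256i x, __m256i y, __m256i fallback)
 *   {
 *     // 64-bit lanes 1 and 3 get products; lanes 0 and 2 keep fallback
 *     return _mm256_mask_mul_epu32 (fallback, 0x0A, x, y);
 *   }
 */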
__M); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpge_epu32_mask (__m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X, + (__v8si) __Y, 5, + (__mmask8) -1); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmple_epu32_mask (__mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X, + (__v8si) __Y, 2, + (__mmask8) __M); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmple_epu32_mask (__m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X, + (__v8si) __Y, 2, + (__mmask8) -1); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpneq_epu64_mask (__mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X, + (__v4di) __Y, 4, + (__mmask8) __M); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpneq_epu64_mask (__m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X, + (__v4di) __Y, 4, + (__mmask8) -1); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmplt_epu64_mask (__mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X, + (__v4di) __Y, 1, + (__mmask8) __M); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmplt_epu64_mask (__m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X, + (__v4di) __Y, 1, + (__mmask8) -1); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpge_epu64_mask (__mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X, + (__v4di) __Y, 5, + (__mmask8) __M); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpge_epu64_mask (__m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X, + (__v4di) __Y, 5, + (__mmask8) -1); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmple_epu64_mask (__mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X, + (__v4di) __Y, 2, + (__mmask8) __M); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmple_epu64_mask (__m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X, + (__v4di) __Y, 2, + (__mmask8) -1); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpneq_epi32_mask (__mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X, + (__v8si) __Y, 4, + (__mmask8) __M); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpneq_epi32_mask (__m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X, + (__v8si) __Y, 4, + (__mmask8) -1); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
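/*
 * Illustrative aside, not part of the patch: the third argument of the
 * __builtin_ia32_[u]cmp{d,q}*_mask builtins is the VPCMP predicate
 * immediate: 0 EQ, 1 LT, 2 LE, 4 NEQ, 5 NLT (>=), 6 NLE (>). The named
 * wrappers here simply pin that immediate, so the two spellings below
 * should compile to the same VPCMPUD instruction:
 *
 *   __mmask8 below_u32 (__m256i x, __m256i y)
 *   {
 *     return _mm256_cmplt_epu32_mask (x, y);
 *     // equivalently: _mm256_cmp_epu32_mask (x, y, _MM_CMPINT_LT)
 *   }
 */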
+_mm256_mask_cmplt_epi32_mask (__mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X, + (__v8si) __Y, 1, + (__mmask8) __M); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmplt_epi32_mask (__m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X, + (__v8si) __Y, 1, + (__mmask8) -1); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpge_epi32_mask (__mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X, + (__v8si) __Y, 5, + (__mmask8) __M); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpge_epi32_mask (__m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X, + (__v8si) __Y, 5, + (__mmask8) -1); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmple_epi32_mask (__mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X, + (__v8si) __Y, 2, + (__mmask8) __M); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmple_epi32_mask (__m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X, + (__v8si) __Y, 2, + (__mmask8) -1); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpneq_epi64_mask (__mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X, + (__v4di) __Y, 4, + (__mmask8) __M); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpneq_epi64_mask (__m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X, + (__v4di) __Y, 4, + (__mmask8) -1); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmplt_epi64_mask (__mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X, + (__v4di) __Y, 1, + (__mmask8) __M); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmplt_epi64_mask (__m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X, + (__v4di) __Y, 1, + (__mmask8) -1); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpge_epi64_mask (__mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X, + (__v4di) __Y, 5, + (__mmask8) __M); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpge_epi64_mask (__m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X, + (__v4di) __Y, 5, + (__mmask8) -1); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmple_epi64_mask (__mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X, + (__v4di) __Y, 2, + (__mmask8) __M); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmple_epi64_mask (__m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_cmpq256_mask 
((__v4di) __X, + (__v4di) __Y, 2, + (__mmask8) -1); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpneq_epu32_mask (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X, + (__v4si) __Y, 4, + (__mmask8) __M); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpneq_epu32_mask (__m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X, + (__v4si) __Y, 4, + (__mmask8) -1); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmplt_epu32_mask (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X, + (__v4si) __Y, 1, + (__mmask8) __M); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmplt_epu32_mask (__m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X, + (__v4si) __Y, 1, + (__mmask8) -1); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpge_epu32_mask (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X, + (__v4si) __Y, 5, + (__mmask8) __M); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpge_epu32_mask (__m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X, + (__v4si) __Y, 5, + (__mmask8) -1); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmple_epu32_mask (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X, + (__v4si) __Y, 2, + (__mmask8) __M); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmple_epu32_mask (__m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X, + (__v4si) __Y, 2, + (__mmask8) -1); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpneq_epu64_mask (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X, + (__v2di) __Y, 4, + (__mmask8) __M); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpneq_epu64_mask (__m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X, + (__v2di) __Y, 4, + (__mmask8) -1); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmplt_epu64_mask (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X, + (__v2di) __Y, 1, + (__mmask8) __M); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmplt_epu64_mask (__m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X, + (__v2di) __Y, 1, + (__mmask8) -1); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpge_epu64_mask (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X, + (__v2di) __Y, 5, + (__mmask8) __M); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, 
__artificial__)) +_mm_cmpge_epu64_mask (__m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X, + (__v2di) __Y, 5, + (__mmask8) -1); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmple_epu64_mask (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X, + (__v2di) __Y, 2, + (__mmask8) __M); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmple_epu64_mask (__m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X, + (__v2di) __Y, 2, + (__mmask8) -1); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpneq_epi32_mask (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X, + (__v4si) __Y, 4, + (__mmask8) __M); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpneq_epi32_mask (__m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X, + (__v4si) __Y, 4, + (__mmask8) -1); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmplt_epi32_mask (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X, + (__v4si) __Y, 1, + (__mmask8) __M); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmplt_epi32_mask (__m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X, + (__v4si) __Y, 1, + (__mmask8) -1); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpge_epi32_mask (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X, + (__v4si) __Y, 5, + (__mmask8) __M); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpge_epi32_mask (__m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X, + (__v4si) __Y, 5, + (__mmask8) -1); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmple_epi32_mask (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X, + (__v4si) __Y, 2, + (__mmask8) __M); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmple_epi32_mask (__m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X, + (__v4si) __Y, 2, + (__mmask8) -1); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpneq_epi64_mask (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X, + (__v2di) __Y, 4, + (__mmask8) __M); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpneq_epi64_mask (__m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X, + (__v2di) __Y, 4, + (__mmask8) -1); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmplt_epi64_mask (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X, + (__v2di) 
__Y, 1, + (__mmask8) __M); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmplt_epi64_mask (__m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X, + (__v2di) __Y, 1, + (__mmask8) -1); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpge_epi64_mask (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X, + (__v2di) __Y, 5, + (__mmask8) __M); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpge_epi64_mask (__m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X, + (__v2di) __Y, 5, + (__mmask8) -1); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmple_epi64_mask (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X, + (__v2di) __Y, 2, + (__mmask8) __M); +} +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmple_epi64_mask (__m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X, + (__v2di) __Y, 2, + (__mmask8) -1); +} #ifdef __OPTIMIZE__ -__funline __m256i _mm256_permutex_epi64(__m256i __X, const int __I) { - return (__m256i)__builtin_ia32_permdi256_mask( - (__v4di)__X, __I, (__v4di)_mm256_setzero_si256(), (__mmask8)-1); -} - -__funline __m256i _mm256_mask_permutex_epi64(__m256i __W, __mmask8 __M, - __m256i __X, const int __I) { - return (__m256i)__builtin_ia32_permdi256_mask((__v4di)__X, __I, (__v4di)__W, - (__mmask8)__M); -} - -__funline __m256i _mm256_maskz_permutex_epi64(__mmask8 __M, __m256i __X, - const int __I) { - return (__m256i)__builtin_ia32_permdi256_mask( - (__v4di)__X, __I, (__v4di)_mm256_setzero_si256(), (__mmask8)__M); -} - -__funline __m256d _mm256_mask_shuffle_pd(__m256d __W, __mmask8 __U, __m256d __A, - __m256d __B, const int __imm) { - return (__m256d)__builtin_ia32_shufpd256_mask((__v4df)__A, (__v4df)__B, __imm, - (__v4df)__W, (__mmask8)__U); -} - -__funline __m256d _mm256_maskz_shuffle_pd(__mmask8 __U, __m256d __A, __m256d __B, - const int __imm) { - return (__m256d)__builtin_ia32_shufpd256_mask((__v4df)__A, (__v4df)__B, __imm, - (__v4df)_mm256_setzero_pd(), - (__mmask8)__U); -} - -__funline __m128d _mm_mask_shuffle_pd(__m128d __W, __mmask8 __U, __m128d __A, - __m128d __B, const int __imm) { - return (__m128d)__builtin_ia32_shufpd128_mask((__v2df)__A, (__v2df)__B, __imm, - (__v2df)__W, (__mmask8)__U); -} - -__funline __m128d _mm_maskz_shuffle_pd(__mmask8 __U, __m128d __A, __m128d __B, - const int __imm) { - return (__m128d)__builtin_ia32_shufpd128_mask( - (__v2df)__A, (__v2df)__B, __imm, (__v2df)_mm_setzero_pd(), (__mmask8)__U); -} - -__funline __m256 _mm256_mask_shuffle_ps(__m256 __W, __mmask8 __U, __m256 __A, - __m256 __B, const int __imm) { - return (__m256)__builtin_ia32_shufps256_mask((__v8sf)__A, (__v8sf)__B, __imm, - (__v8sf)__W, (__mmask8)__U); -} - -__funline __m256 _mm256_maskz_shuffle_ps(__mmask8 __U, __m256 __A, __m256 __B, - const int __imm) { - return (__m256)__builtin_ia32_shufps256_mask((__v8sf)__A, (__v8sf)__B, __imm, - (__v8sf)_mm256_setzero_ps(), - (__mmask8)__U); -} - -__funline __m128 _mm_mask_shuffle_ps(__m128 __W, __mmask8 __U, __m128 __A, - __m128 __B, const int __imm) { - return (__m128)__builtin_ia32_shufps128_mask((__v4sf)__A, (__v4sf)__B, __imm, - 
(__v4sf)__W, (__mmask8)__U); -} - -__funline __m128 _mm_maskz_shuffle_ps(__mmask8 __U, __m128 __A, __m128 __B, - const int __imm) { - return (__m128)__builtin_ia32_shufps128_mask( - (__v4sf)__A, (__v4sf)__B, __imm, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); -} - -__funline __m256i _mm256_inserti32x4(__m256i __A, __m128i __B, const int __imm) { - return (__m256i)__builtin_ia32_inserti32x4_256_mask( - (__v8si)__A, (__v4si)__B, __imm, (__v8si)_mm256_setzero_si256(), - (__mmask8)-1); -} - -__funline __m256i _mm256_mask_inserti32x4(__m256i __W, __mmask8 __U, __m256i __A, - __m128i __B, const int __imm) { - return (__m256i)__builtin_ia32_inserti32x4_256_mask( - (__v8si)__A, (__v4si)__B, __imm, (__v8si)__W, (__mmask8)__U); -} - -__funline __m256i _mm256_maskz_inserti32x4(__mmask8 __U, __m256i __A, __m128i __B, - const int __imm) { - return (__m256i)__builtin_ia32_inserti32x4_256_mask( - (__v8si)__A, (__v4si)__B, __imm, (__v8si)_mm256_setzero_si256(), - (__mmask8)__U); -} - -__funline __m256 _mm256_insertf32x4(__m256 __A, __m128 __B, const int __imm) { - return (__m256)__builtin_ia32_insertf32x4_256_mask( - (__v8sf)__A, (__v4sf)__B, __imm, (__v8sf)_mm256_setzero_ps(), - (__mmask8)-1); -} - -__funline __m256 _mm256_mask_insertf32x4(__m256 __W, __mmask8 __U, __m256 __A, - __m128 __B, const int __imm) { - return (__m256)__builtin_ia32_insertf32x4_256_mask( - (__v8sf)__A, (__v4sf)__B, __imm, (__v8sf)__W, (__mmask8)__U); -} - -__funline __m256 _mm256_maskz_insertf32x4(__mmask8 __U, __m256 __A, __m128 __B, - const int __imm) { - return (__m256)__builtin_ia32_insertf32x4_256_mask( - (__v8sf)__A, (__v4sf)__B, __imm, (__v8sf)_mm256_setzero_ps(), - (__mmask8)__U); -} - -__funline __m128i _mm256_extracti32x4_epi32(__m256i __A, const int __imm) { - return (__m128i)__builtin_ia32_extracti32x4_256_mask( - (__v8si)__A, __imm, (__v4si)_mm_setzero_si128(), (__mmask8)-1); -} - -__funline __m128i _mm256_mask_extracti32x4_epi32(__m128i __W, __mmask8 __U, - __m256i __A, const int __imm) { - return (__m128i)__builtin_ia32_extracti32x4_256_mask( - (__v8si)__A, __imm, (__v4si)__W, (__mmask8)__U); -} - -__funline __m128i _mm256_maskz_extracti32x4_epi32(__mmask8 __U, __m256i __A, - const int __imm) { - return (__m128i)__builtin_ia32_extracti32x4_256_mask( - (__v8si)__A, __imm, (__v4si)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m128 _mm256_extractf32x4_ps(__m256 __A, const int __imm) { - return (__m128)__builtin_ia32_extractf32x4_256_mask( - (__v8sf)__A, __imm, (__v4sf)_mm_setzero_ps(), (__mmask8)-1); -} - -__funline __m128 _mm256_mask_extractf32x4_ps(__m128 __W, __mmask8 __U, __m256 __A, - const int __imm) { - return (__m128)__builtin_ia32_extractf32x4_256_mask( - (__v8sf)__A, __imm, (__v4sf)__W, (__mmask8)__U); -} - -__funline __m128 _mm256_maskz_extractf32x4_ps(__mmask8 __U, __m256 __A, - const int __imm) { - return (__m128)__builtin_ia32_extractf32x4_256_mask( - (__v8sf)__A, __imm, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); -} - -__funline __m256i _mm256_shuffle_i64x2(__m256i __A, __m256i __B, - const int __imm) { - return (__m256i)__builtin_ia32_shuf_i64x2_256_mask( - (__v4di)__A, (__v4di)__B, __imm, (__v4di)_mm256_setzero_si256(), - (__mmask8)-1); -} - -__funline __m256i _mm256_mask_shuffle_i64x2(__m256i __W, __mmask8 __U, - __m256i __A, __m256i __B, - const int __imm) { - return (__m256i)__builtin_ia32_shuf_i64x2_256_mask( - (__v4di)__A, (__v4di)__B, __imm, (__v4di)__W, (__mmask8)__U); -} - -__funline __m256i _mm256_maskz_shuffle_i64x2(__mmask8 __U, __m256i __A, - __m256i __B, const int __imm) { - return 
(__m256i)__builtin_ia32_shuf_i64x2_256_mask( - (__v4di)__A, (__v4di)__B, __imm, (__v4di)_mm256_setzero_si256(), - (__mmask8)__U); -} - -__funline __m256i _mm256_shuffle_i32x4(__m256i __A, __m256i __B, - const int __imm) { - return (__m256i)__builtin_ia32_shuf_i32x4_256_mask( - (__v8si)__A, (__v8si)__B, __imm, (__v8si)_mm256_setzero_si256(), - (__mmask8)-1); -} - -__funline __m256i _mm256_mask_shuffle_i32x4(__m256i __W, __mmask8 __U, - __m256i __A, __m256i __B, - const int __imm) { - return (__m256i)__builtin_ia32_shuf_i32x4_256_mask( - (__v8si)__A, (__v8si)__B, __imm, (__v8si)__W, (__mmask8)__U); -} - -__funline __m256i _mm256_maskz_shuffle_i32x4(__mmask8 __U, __m256i __A, - __m256i __B, const int __imm) { - return (__m256i)__builtin_ia32_shuf_i32x4_256_mask( - (__v8si)__A, (__v8si)__B, __imm, (__v8si)_mm256_setzero_si256(), - (__mmask8)__U); -} - -__funline __m256d _mm256_shuffle_f64x2(__m256d __A, __m256d __B, - const int __imm) { - return (__m256d)__builtin_ia32_shuf_f64x2_256_mask( - (__v4df)__A, (__v4df)__B, __imm, (__v4df)_mm256_setzero_pd(), - (__mmask8)-1); -} - -__funline __m256d _mm256_mask_shuffle_f64x2(__m256d __W, __mmask8 __U, - __m256d __A, __m256d __B, - const int __imm) { - return (__m256d)__builtin_ia32_shuf_f64x2_256_mask( - (__v4df)__A, (__v4df)__B, __imm, (__v4df)__W, (__mmask8)__U); -} - -__funline __m256d _mm256_maskz_shuffle_f64x2(__mmask8 __U, __m256d __A, - __m256d __B, const int __imm) { - return (__m256d)__builtin_ia32_shuf_f64x2_256_mask( - (__v4df)__A, (__v4df)__B, __imm, (__v4df)_mm256_setzero_pd(), - (__mmask8)__U); -} - -__funline __m256 _mm256_shuffle_f32x4(__m256 __A, __m256 __B, const int __imm) { - return (__m256)__builtin_ia32_shuf_f32x4_256_mask( - (__v8sf)__A, (__v8sf)__B, __imm, (__v8sf)_mm256_setzero_ps(), - (__mmask8)-1); -} - -__funline __m256 _mm256_mask_shuffle_f32x4(__m256 __W, __mmask8 __U, __m256 __A, - __m256 __B, const int __imm) { - return (__m256)__builtin_ia32_shuf_f32x4_256_mask( - (__v8sf)__A, (__v8sf)__B, __imm, (__v8sf)__W, (__mmask8)__U); -} - -__funline __m256 _mm256_maskz_shuffle_f32x4(__mmask8 __U, __m256 __A, __m256 __B, - const int __imm) { - return (__m256)__builtin_ia32_shuf_f32x4_256_mask( - (__v8sf)__A, (__v8sf)__B, __imm, (__v8sf)_mm256_setzero_ps(), - (__mmask8)__U); -} - -__funline __m256d _mm256_fixupimm_pd(__m256d __A, __m256d __B, __m256i __C, - const int __imm) { - return (__m256d)__builtin_ia32_fixupimmpd256_mask( - (__v4df)__A, (__v4df)__B, (__v4di)__C, __imm, (__mmask8)-1); -} - -__funline __m256d _mm256_mask_fixupimm_pd(__m256d __A, __mmask8 __U, __m256d __B, - __m256i __C, const int __imm) { - return (__m256d)__builtin_ia32_fixupimmpd256_mask( - (__v4df)__A, (__v4df)__B, (__v4di)__C, __imm, (__mmask8)__U); -} - -__funline __m256d _mm256_maskz_fixupimm_pd(__mmask8 __U, __m256d __A, __m256d __B, - __m256i __C, const int __imm) { - return (__m256d)__builtin_ia32_fixupimmpd256_maskz( - (__v4df)__A, (__v4df)__B, (__v4di)__C, __imm, (__mmask8)__U); -} - -__funline __m256 _mm256_fixupimm_ps(__m256 __A, __m256 __B, __m256i __C, - const int __imm) { - return (__m256)__builtin_ia32_fixupimmps256_mask( - (__v8sf)__A, (__v8sf)__B, (__v8si)__C, __imm, (__mmask8)-1); -} - -__funline __m256 _mm256_mask_fixupimm_ps(__m256 __A, __mmask8 __U, __m256 __B, - __m256i __C, const int __imm) { - return (__m256)__builtin_ia32_fixupimmps256_mask( - (__v8sf)__A, (__v8sf)__B, (__v8si)__C, __imm, (__mmask8)__U); -} - -__funline __m256 _mm256_maskz_fixupimm_ps(__mmask8 __U, __m256 __A, __m256 __B, - __m256i __C, const int __imm) { - return 
(__m256)__builtin_ia32_fixupimmps256_maskz( - (__v8sf)__A, (__v8sf)__B, (__v8si)__C, __imm, (__mmask8)__U); -} - -__funline __m128d _mm_fixupimm_pd(__m128d __A, __m128d __B, __m128i __C, - const int __imm) { - return (__m128d)__builtin_ia32_fixupimmpd128_mask( - (__v2df)__A, (__v2df)__B, (__v2di)__C, __imm, (__mmask8)-1); -} - -__funline __m128d _mm_mask_fixupimm_pd(__m128d __A, __mmask8 __U, __m128d __B, - __m128i __C, const int __imm) { - return (__m128d)__builtin_ia32_fixupimmpd128_mask( - (__v2df)__A, (__v2df)__B, (__v2di)__C, __imm, (__mmask8)__U); -} - -__funline __m128d _mm_maskz_fixupimm_pd(__mmask8 __U, __m128d __A, __m128d __B, - __m128i __C, const int __imm) { - return (__m128d)__builtin_ia32_fixupimmpd128_maskz( - (__v2df)__A, (__v2df)__B, (__v2di)__C, __imm, (__mmask8)__U); -} - -__funline __m128 _mm_fixupimm_ps(__m128 __A, __m128 __B, __m128i __C, - const int __imm) { - return (__m128)__builtin_ia32_fixupimmps128_mask( - (__v4sf)__A, (__v4sf)__B, (__v4si)__C, __imm, (__mmask8)-1); -} - -__funline __m128 _mm_mask_fixupimm_ps(__m128 __A, __mmask8 __U, __m128 __B, - __m128i __C, const int __imm) { - return (__m128)__builtin_ia32_fixupimmps128_mask( - (__v4sf)__A, (__v4sf)__B, (__v4si)__C, __imm, (__mmask8)__U); -} - -__funline __m128 _mm_maskz_fixupimm_ps(__mmask8 __U, __m128 __A, __m128 __B, - __m128i __C, const int __imm) { - return (__m128)__builtin_ia32_fixupimmps128_maskz( - (__v4sf)__A, (__v4sf)__B, (__v4si)__C, __imm, (__mmask8)__U); -} - -__funline __m256i _mm256_mask_srli_epi32(__m256i __W, __mmask8 __U, __m256i __A, - const int __imm) { - return (__m256i)__builtin_ia32_psrldi256_mask((__v8si)__A, __imm, (__v8si)__W, - (__mmask8)__U); -} - -__funline __m256i _mm256_maskz_srli_epi32(__mmask8 __U, __m256i __A, - const int __imm) { - return (__m256i)__builtin_ia32_psrldi256_mask( - (__v8si)__A, __imm, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); -} - -__funline __m128i _mm_mask_srli_epi32(__m128i __W, __mmask8 __U, __m128i __A, - const int __imm) { - return (__m128i)__builtin_ia32_psrldi128_mask((__v4si)__A, __imm, (__v4si)__W, - (__mmask8)__U); -} - -__funline __m128i _mm_maskz_srli_epi32(__mmask8 __U, __m128i __A, - const int __imm) { - return (__m128i)__builtin_ia32_psrldi128_mask( - (__v4si)__A, __imm, (__v4si)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m256i _mm256_mask_srli_epi64(__m256i __W, __mmask8 __U, __m256i __A, - const int __imm) { - return (__m256i)__builtin_ia32_psrlqi256_mask((__v4di)__A, __imm, (__v4di)__W, - (__mmask8)__U); -} - -__funline __m256i _mm256_maskz_srli_epi64(__mmask8 __U, __m256i __A, - const int __imm) { - return (__m256i)__builtin_ia32_psrlqi256_mask( - (__v4di)__A, __imm, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); -} - -__funline __m128i _mm_mask_srli_epi64(__m128i __W, __mmask8 __U, __m128i __A, - const int __imm) { - return (__m128i)__builtin_ia32_psrlqi128_mask((__v2di)__A, __imm, (__v2di)__W, - (__mmask8)__U); -} - -__funline __m128i _mm_maskz_srli_epi64(__mmask8 __U, __m128i __A, - const int __imm) { - return (__m128i)__builtin_ia32_psrlqi128_mask( - (__v2di)__A, __imm, (__v2di)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m256i _mm256_ternarylogic_epi64(__m256i __A, __m256i __B, __m256i __C, - const int __imm) { - return (__m256i)__builtin_ia32_pternlogq256_mask( - (__v4di)__A, (__v4di)__B, (__v4di)__C, __imm, (__mmask8)-1); -} - -__funline __m256i _mm256_mask_ternarylogic_epi64(__m256i __A, __mmask8 __U, - __m256i __B, __m256i __C, - const int __imm) { - return (__m256i)__builtin_ia32_pternlogq256_mask( 
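/*
 * Illustrative aside, not part of the patch: the immediate of the
 * VPTERNLOG wrappers here is an 8-entry truth table over the bit triple
 * (a, b, c) drawn from the three operands; bit (a<<2 | b<<1 | c) of
 * __imm supplies the result bit. For example 0xCA encodes a ? b : c,
 * giving a one-instruction bitwise select. Sketch, assuming an
 * AVX-512VL target:
 *
 *   __m256i bitselect (__m256i mask, __m256i b, __m256i c)
 *   {
 *     // each result bit comes from b where mask is 1, else from c
 *     return _mm256_ternarylogic_epi32 (mask, b, c, 0xCA);
 *   }
 */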
- (__v4di)__A, (__v4di)__B, (__v4di)__C, __imm, (__mmask8)__U); -} - -__funline __m256i _mm256_maskz_ternarylogic_epi64(__mmask8 __U, __m256i __A, - __m256i __B, __m256i __C, - const int __imm) { - return (__m256i)__builtin_ia32_pternlogq256_maskz( - (__v4di)__A, (__v4di)__B, (__v4di)__C, __imm, (__mmask8)__U); -} - -__funline __m256i _mm256_ternarylogic_epi32(__m256i __A, __m256i __B, __m256i __C, - const int __imm) { - return (__m256i)__builtin_ia32_pternlogd256_mask( - (__v8si)__A, (__v8si)__B, (__v8si)__C, __imm, (__mmask8)-1); -} - -__funline __m256i _mm256_mask_ternarylogic_epi32(__m256i __A, __mmask8 __U, - __m256i __B, __m256i __C, - const int __imm) { - return (__m256i)__builtin_ia32_pternlogd256_mask( - (__v8si)__A, (__v8si)__B, (__v8si)__C, __imm, (__mmask8)__U); -} - -__funline __m256i _mm256_maskz_ternarylogic_epi32(__mmask8 __U, __m256i __A, - __m256i __B, __m256i __C, - const int __imm) { - return (__m256i)__builtin_ia32_pternlogd256_maskz( - (__v8si)__A, (__v8si)__B, (__v8si)__C, __imm, (__mmask8)__U); -} - -__funline __m128i _mm_ternarylogic_epi64(__m128i __A, __m128i __B, __m128i __C, - const int __imm) { - return (__m128i)__builtin_ia32_pternlogq128_mask( - (__v2di)__A, (__v2di)__B, (__v2di)__C, __imm, (__mmask8)-1); -} - -__funline __m128i _mm_mask_ternarylogic_epi64(__m128i __A, __mmask8 __U, - __m128i __B, __m128i __C, - const int __imm) { - return (__m128i)__builtin_ia32_pternlogq128_mask( - (__v2di)__A, (__v2di)__B, (__v2di)__C, __imm, (__mmask8)__U); -} - -__funline __m128i _mm_maskz_ternarylogic_epi64(__mmask8 __U, __m128i __A, - __m128i __B, __m128i __C, - const int __imm) { - return (__m128i)__builtin_ia32_pternlogq128_maskz( - (__v2di)__A, (__v2di)__B, (__v2di)__C, __imm, (__mmask8)__U); -} - -__funline __m128i _mm_ternarylogic_epi32(__m128i __A, __m128i __B, __m128i __C, - const int __imm) { - return (__m128i)__builtin_ia32_pternlogd128_mask( - (__v4si)__A, (__v4si)__B, (__v4si)__C, __imm, (__mmask8)-1); -} - -__funline __m128i _mm_mask_ternarylogic_epi32(__m128i __A, __mmask8 __U, - __m128i __B, __m128i __C, - const int __imm) { - return (__m128i)__builtin_ia32_pternlogd128_mask( - (__v4si)__A, (__v4si)__B, (__v4si)__C, __imm, (__mmask8)__U); -} - -__funline __m128i _mm_maskz_ternarylogic_epi32(__mmask8 __U, __m128i __A, - __m128i __B, __m128i __C, - const int __imm) { - return (__m128i)__builtin_ia32_pternlogd128_maskz( - (__v4si)__A, (__v4si)__B, (__v4si)__C, __imm, (__mmask8)__U); -} - -__funline __m256 _mm256_roundscale_ps(__m256 __A, const int __imm) { - return (__m256)__builtin_ia32_rndscaleps_256_mask( - (__v8sf)__A, __imm, (__v8sf)_mm256_setzero_ps(), (__mmask8)-1); -} - -__funline __m256 _mm256_mask_roundscale_ps(__m256 __W, __mmask8 __U, __m256 __A, - const int __imm) { - return (__m256)__builtin_ia32_rndscaleps_256_mask((__v8sf)__A, __imm, - (__v8sf)__W, (__mmask8)__U); -} - -__funline __m256 _mm256_maskz_roundscale_ps(__mmask8 __U, __m256 __A, - const int __imm) { - return (__m256)__builtin_ia32_rndscaleps_256_mask( - (__v8sf)__A, __imm, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); -} - -__funline __m256d _mm256_roundscale_pd(__m256d __A, const int __imm) { - return (__m256d)__builtin_ia32_rndscalepd_256_mask( - (__v4df)__A, __imm, (__v4df)_mm256_setzero_pd(), (__mmask8)-1); -} - -__funline __m256d _mm256_mask_roundscale_pd(__m256d __W, __mmask8 __U, - __m256d __A, const int __imm) { - return (__m256d)__builtin_ia32_rndscalepd_256_mask( - (__v4df)__A, __imm, (__v4df)__W, (__mmask8)__U); -} - -__funline __m256d _mm256_maskz_roundscale_pd(__mmask8 
__U, __m256d __A, - const int __imm) { - return (__m256d)__builtin_ia32_rndscalepd_256_mask( - (__v4df)__A, __imm, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); -} - -__funline __m128 _mm_roundscale_ps(__m128 __A, const int __imm) { - return (__m128)__builtin_ia32_rndscaleps_128_mask( - (__v4sf)__A, __imm, (__v4sf)_mm_setzero_ps(), (__mmask8)-1); -} - -__funline __m128 _mm_mask_roundscale_ps(__m128 __W, __mmask8 __U, __m128 __A, - const int __imm) { - return (__m128)__builtin_ia32_rndscaleps_128_mask((__v4sf)__A, __imm, - (__v4sf)__W, (__mmask8)__U); -} - -__funline __m128 _mm_maskz_roundscale_ps(__mmask8 __U, __m128 __A, - const int __imm) { - return (__m128)__builtin_ia32_rndscaleps_128_mask( - (__v4sf)__A, __imm, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); -} - -__funline __m128d _mm_roundscale_pd(__m128d __A, const int __imm) { - return (__m128d)__builtin_ia32_rndscalepd_128_mask( - (__v2df)__A, __imm, (__v2df)_mm_setzero_pd(), (__mmask8)-1); -} - -__funline __m128d _mm_mask_roundscale_pd(__m128d __W, __mmask8 __U, __m128d __A, - const int __imm) { - return (__m128d)__builtin_ia32_rndscalepd_128_mask( - (__v2df)__A, __imm, (__v2df)__W, (__mmask8)__U); -} - -__funline __m128d _mm_maskz_roundscale_pd(__mmask8 __U, __m128d __A, - const int __imm) { - return (__m128d)__builtin_ia32_rndscalepd_128_mask( - (__v2df)__A, __imm, (__v2df)_mm_setzero_pd(), (__mmask8)__U); -} - -__funline __m256 _mm256_getmant_ps(__m256 __A, _MM_MANTISSA_NORM_ENUM __B, - _MM_MANTISSA_SIGN_ENUM __C) { - return (__m256)__builtin_ia32_getmantps256_mask( - (__v8sf)__A, (__C << 2) | __B, (__v8sf)_mm256_setzero_ps(), (__mmask8)-1); -} - -__funline __m256 _mm256_mask_getmant_ps(__m256 __W, __mmask8 __U, __m256 __A, - _MM_MANTISSA_NORM_ENUM __B, - _MM_MANTISSA_SIGN_ENUM __C) { - return (__m256)__builtin_ia32_getmantps256_mask((__v8sf)__A, (__C << 2) | __B, - (__v8sf)__W, (__mmask8)__U); -} - -__funline __m256 _mm256_maskz_getmant_ps(__mmask8 __U, __m256 __A, - _MM_MANTISSA_NORM_ENUM __B, - _MM_MANTISSA_SIGN_ENUM __C) { - return (__m256)__builtin_ia32_getmantps256_mask((__v8sf)__A, (__C << 2) | __B, - (__v8sf)_mm256_setzero_ps(), - (__mmask8)__U); -} - -__funline __m128 _mm_getmant_ps(__m128 __A, _MM_MANTISSA_NORM_ENUM __B, - _MM_MANTISSA_SIGN_ENUM __C) { - return (__m128)__builtin_ia32_getmantps128_mask( - (__v4sf)__A, (__C << 2) | __B, (__v4sf)_mm_setzero_ps(), (__mmask8)-1); -} - -__funline __m128 _mm_mask_getmant_ps(__m128 __W, __mmask8 __U, __m128 __A, - _MM_MANTISSA_NORM_ENUM __B, - _MM_MANTISSA_SIGN_ENUM __C) { - return (__m128)__builtin_ia32_getmantps128_mask((__v4sf)__A, (__C << 2) | __B, - (__v4sf)__W, (__mmask8)__U); -} - -__funline __m128 _mm_maskz_getmant_ps(__mmask8 __U, __m128 __A, - _MM_MANTISSA_NORM_ENUM __B, - _MM_MANTISSA_SIGN_ENUM __C) { - return (__m128)__builtin_ia32_getmantps128_mask( - (__v4sf)__A, (__C << 2) | __B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); -} - -__funline __m256d _mm256_getmant_pd(__m256d __A, _MM_MANTISSA_NORM_ENUM __B, - _MM_MANTISSA_SIGN_ENUM __C) { - return (__m256d)__builtin_ia32_getmantpd256_mask( - (__v4df)__A, (__C << 2) | __B, (__v4df)_mm256_setzero_pd(), (__mmask8)-1); -} - -__funline __m256d _mm256_mask_getmant_pd(__m256d __W, __mmask8 __U, __m256d __A, - _MM_MANTISSA_NORM_ENUM __B, - _MM_MANTISSA_SIGN_ENUM __C) { - return (__m256d)__builtin_ia32_getmantpd256_mask( - (__v4df)__A, (__C << 2) | __B, (__v4df)__W, (__mmask8)__U); -} - -__funline __m256d _mm256_maskz_getmant_pd(__mmask8 __U, __m256d __A, - _MM_MANTISSA_NORM_ENUM __B, - _MM_MANTISSA_SIGN_ENUM __C) { - return 
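/*
 * Illustrative aside, not part of the patch: the getmant wrappers above
 * pack their two enum arguments as (__C << 2) | __B, i.e. sign control
 * in the upper bits and the normalization interval in the low two.
 * Sketch, assuming an AVX-512VL target and the enum names from the
 * AVX-512F header:
 *
 *   __m256 mantissas (__m256 x)
 *   {
 *     // each |mantissa| normalized into [1, 2), sign forced to zero
 *     return _mm256_getmant_ps (x, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_zero);
 *   }
 */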
(__m256d)__builtin_ia32_getmantpd256_mask( - (__v4df)__A, (__C << 2) | __B, (__v4df)_mm256_setzero_pd(), - (__mmask8)__U); -} - -__funline __m128d _mm_getmant_pd(__m128d __A, _MM_MANTISSA_NORM_ENUM __B, - _MM_MANTISSA_SIGN_ENUM __C) { - return (__m128d)__builtin_ia32_getmantpd128_mask( - (__v2df)__A, (__C << 2) | __B, (__v2df)_mm_setzero_pd(), (__mmask8)-1); -} - -__funline __m128d _mm_mask_getmant_pd(__m128d __W, __mmask8 __U, __m128d __A, - _MM_MANTISSA_NORM_ENUM __B, - _MM_MANTISSA_SIGN_ENUM __C) { - return (__m128d)__builtin_ia32_getmantpd128_mask( - (__v2df)__A, (__C << 2) | __B, (__v2df)__W, (__mmask8)__U); -} - -__funline __m128d _mm_maskz_getmant_pd(__mmask8 __U, __m128d __A, - _MM_MANTISSA_NORM_ENUM __B, - _MM_MANTISSA_SIGN_ENUM __C) { - return (__m128d)__builtin_ia32_getmantpd128_mask( - (__v2df)__A, (__C << 2) | __B, (__v2df)_mm_setzero_pd(), (__mmask8)__U); -} - -__funline __m256 _mm256_mmask_i32gather_ps(__m256 __v1_old, __mmask8 __mask, - __m256i __index, void const *__addr, - int __scale) { - return (__m256)__builtin_ia32_gather3siv8sf((__v8sf)__v1_old, __addr, - (__v8si)__index, __mask, __scale); -} - -__funline __m128 _mm_mmask_i32gather_ps(__m128 __v1_old, __mmask8 __mask, - __m128i __index, void const *__addr, - int __scale) { - return (__m128)__builtin_ia32_gather3siv4sf((__v4sf)__v1_old, __addr, - (__v4si)__index, __mask, __scale); -} - -__funline __m256d _mm256_mmask_i32gather_pd(__m256d __v1_old, __mmask8 __mask, - __m128i __index, void const *__addr, - int __scale) { - return (__m256d)__builtin_ia32_gather3siv4df( - (__v4df)__v1_old, __addr, (__v4si)__index, __mask, __scale); -} - -__funline __m128d _mm_mmask_i32gather_pd(__m128d __v1_old, __mmask8 __mask, - __m128i __index, void const *__addr, - int __scale) { - return (__m128d)__builtin_ia32_gather3siv2df( - (__v2df)__v1_old, __addr, (__v4si)__index, __mask, __scale); -} - -__funline __m128 _mm256_mmask_i64gather_ps(__m128 __v1_old, __mmask8 __mask, - __m256i __index, void const *__addr, - int __scale) { - return (__m128)__builtin_ia32_gather3div8sf((__v4sf)__v1_old, __addr, - (__v4di)__index, __mask, __scale); -} - -__funline __m128 _mm_mmask_i64gather_ps(__m128 __v1_old, __mmask8 __mask, - __m128i __index, void const *__addr, - int __scale) { - return (__m128)__builtin_ia32_gather3div4sf((__v4sf)__v1_old, __addr, - (__v2di)__index, __mask, __scale); -} - -__funline __m256d _mm256_mmask_i64gather_pd(__m256d __v1_old, __mmask8 __mask, - __m256i __index, void const *__addr, - int __scale) { - return (__m256d)__builtin_ia32_gather3div4df( - (__v4df)__v1_old, __addr, (__v4di)__index, __mask, __scale); -} - -__funline __m128d _mm_mmask_i64gather_pd(__m128d __v1_old, __mmask8 __mask, - __m128i __index, void const *__addr, - int __scale) { - return (__m128d)__builtin_ia32_gather3div2df( - (__v2df)__v1_old, __addr, (__v2di)__index, __mask, __scale); -} - -__funline __m256i _mm256_mmask_i32gather_epi32(__m256i __v1_old, __mmask8 __mask, - __m256i __index, - void const *__addr, int __scale) { - return (__m256i)__builtin_ia32_gather3siv8si( - (__v8si)__v1_old, __addr, (__v8si)__index, __mask, __scale); -} - -__funline __m128i _mm_mmask_i32gather_epi32(__m128i __v1_old, __mmask8 __mask, - __m128i __index, void const *__addr, - int __scale) { - return (__m128i)__builtin_ia32_gather3siv4si( - (__v4si)__v1_old, __addr, (__v4si)__index, __mask, __scale); -} - -__funline __m256i _mm256_mmask_i32gather_epi64(__m256i __v1_old, __mmask8 __mask, - __m128i __index, - void const *__addr, int __scale) { - return 
(__m256i)__builtin_ia32_gather3siv4di( - (__v4di)__v1_old, __addr, (__v4si)__index, __mask, __scale); -} - -__funline __m128i _mm_mmask_i32gather_epi64(__m128i __v1_old, __mmask8 __mask, - __m128i __index, void const *__addr, - int __scale) { - return (__m128i)__builtin_ia32_gather3siv2di( - (__v2di)__v1_old, __addr, (__v4si)__index, __mask, __scale); -} - -__funline __m128i _mm256_mmask_i64gather_epi32(__m128i __v1_old, __mmask8 __mask, - __m256i __index, - void const *__addr, int __scale) { - return (__m128i)__builtin_ia32_gather3div8si( - (__v4si)__v1_old, __addr, (__v4di)__index, __mask, __scale); -} - -__funline __m128i _mm_mmask_i64gather_epi32(__m128i __v1_old, __mmask8 __mask, - __m128i __index, void const *__addr, - int __scale) { - return (__m128i)__builtin_ia32_gather3div4si( - (__v4si)__v1_old, __addr, (__v2di)__index, __mask, __scale); -} - -__funline __m256i _mm256_mmask_i64gather_epi64(__m256i __v1_old, __mmask8 __mask, - __m256i __index, - void const *__addr, int __scale) { - return (__m256i)__builtin_ia32_gather3div4di( - (__v4di)__v1_old, __addr, (__v4di)__index, __mask, __scale); -} - -__funline __m128i _mm_mmask_i64gather_epi64(__m128i __v1_old, __mmask8 __mask, - __m128i __index, void const *__addr, - int __scale) { - return (__m128i)__builtin_ia32_gather3div2di( - (__v2di)__v1_old, __addr, (__v2di)__index, __mask, __scale); -} - -__funline void _mm256_i32scatter_ps(void *__addr, __m256i __index, __m256 __v1, - const int __scale) { - __builtin_ia32_scattersiv8sf(__addr, (__mmask8)0xFF, (__v8si)__index, - (__v8sf)__v1, __scale); -} - -__funline void _mm256_mask_i32scatter_ps(void *__addr, __mmask8 __mask, - __m256i __index, __m256 __v1, - const int __scale) { - __builtin_ia32_scattersiv8sf(__addr, __mask, (__v8si)__index, (__v8sf)__v1, - __scale); -} - -__funline void _mm_i32scatter_ps(void *__addr, __m128i __index, __m128 __v1, - const int __scale) { - __builtin_ia32_scattersiv4sf(__addr, (__mmask8)0xFF, (__v4si)__index, - (__v4sf)__v1, __scale); -} - -__funline void _mm_mask_i32scatter_ps(void *__addr, __mmask8 __mask, - __m128i __index, __m128 __v1, - const int __scale) { - __builtin_ia32_scattersiv4sf(__addr, __mask, (__v4si)__index, (__v4sf)__v1, - __scale); -} - -__funline void _mm256_i32scatter_pd(void *__addr, __m128i __index, __m256d __v1, - const int __scale) { - __builtin_ia32_scattersiv4df(__addr, (__mmask8)0xFF, (__v4si)__index, - (__v4df)__v1, __scale); -} - -__funline void _mm256_mask_i32scatter_pd(void *__addr, __mmask8 __mask, - __m128i __index, __m256d __v1, - const int __scale) { - __builtin_ia32_scattersiv4df(__addr, __mask, (__v4si)__index, (__v4df)__v1, - __scale); -} - -__funline void _mm_i32scatter_pd(void *__addr, __m128i __index, __m128d __v1, - const int __scale) { - __builtin_ia32_scattersiv2df(__addr, (__mmask8)0xFF, (__v4si)__index, - (__v2df)__v1, __scale); -} - -__funline void _mm_mask_i32scatter_pd(void *__addr, __mmask8 __mask, - __m128i __index, __m128d __v1, - const int __scale) { - __builtin_ia32_scattersiv2df(__addr, __mask, (__v4si)__index, (__v2df)__v1, - __scale); -} - -__funline void _mm256_i64scatter_ps(void *__addr, __m256i __index, __m128 __v1, - const int __scale) { - __builtin_ia32_scatterdiv8sf(__addr, (__mmask8)0xFF, (__v4di)__index, - (__v4sf)__v1, __scale); -} - -__funline void _mm256_mask_i64scatter_ps(void *__addr, __mmask8 __mask, - __m256i __index, __m128 __v1, - const int __scale) { - __builtin_ia32_scatterdiv8sf(__addr, __mask, (__v4di)__index, (__v4sf)__v1, - __scale); -} - -__funline void _mm_i64scatter_ps(void 
*__addr, __m128i __index, __m128 __v1, - const int __scale) { - __builtin_ia32_scatterdiv4sf(__addr, (__mmask8)0xFF, (__v2di)__index, - (__v4sf)__v1, __scale); -} - -__funline void _mm_mask_i64scatter_ps(void *__addr, __mmask8 __mask, - __m128i __index, __m128 __v1, - const int __scale) { - __builtin_ia32_scatterdiv4sf(__addr, __mask, (__v2di)__index, (__v4sf)__v1, - __scale); -} - -__funline void _mm256_i64scatter_pd(void *__addr, __m256i __index, __m256d __v1, - const int __scale) { - __builtin_ia32_scatterdiv4df(__addr, (__mmask8)0xFF, (__v4di)__index, - (__v4df)__v1, __scale); -} - -__funline void _mm256_mask_i64scatter_pd(void *__addr, __mmask8 __mask, - __m256i __index, __m256d __v1, - const int __scale) { - __builtin_ia32_scatterdiv4df(__addr, __mask, (__v4di)__index, (__v4df)__v1, - __scale); -} - -__funline void _mm_i64scatter_pd(void *__addr, __m128i __index, __m128d __v1, - const int __scale) { - __builtin_ia32_scatterdiv2df(__addr, (__mmask8)0xFF, (__v2di)__index, - (__v2df)__v1, __scale); -} - -__funline void _mm_mask_i64scatter_pd(void *__addr, __mmask8 __mask, - __m128i __index, __m128d __v1, - const int __scale) { - __builtin_ia32_scatterdiv2df(__addr, __mask, (__v2di)__index, (__v2df)__v1, - __scale); -} - -__funline void _mm256_i32scatter_epi32(void *__addr, __m256i __index, - __m256i __v1, const int __scale) { - __builtin_ia32_scattersiv8si(__addr, (__mmask8)0xFF, (__v8si)__index, - (__v8si)__v1, __scale); -} - -__funline void _mm256_mask_i32scatter_epi32(void *__addr, __mmask8 __mask, - __m256i __index, __m256i __v1, - const int __scale) { - __builtin_ia32_scattersiv8si(__addr, __mask, (__v8si)__index, (__v8si)__v1, - __scale); -} - -__funline void _mm_i32scatter_epi32(void *__addr, __m128i __index, __m128i __v1, - const int __scale) { - __builtin_ia32_scattersiv4si(__addr, (__mmask8)0xFF, (__v4si)__index, - (__v4si)__v1, __scale); -} - -__funline void _mm_mask_i32scatter_epi32(void *__addr, __mmask8 __mask, - __m128i __index, __m128i __v1, - const int __scale) { - __builtin_ia32_scattersiv4si(__addr, __mask, (__v4si)__index, (__v4si)__v1, - __scale); -} - -__funline void _mm256_i32scatter_epi64(void *__addr, __m128i __index, - __m256i __v1, const int __scale) { - __builtin_ia32_scattersiv4di(__addr, (__mmask8)0xFF, (__v4si)__index, - (__v4di)__v1, __scale); -} - -__funline void _mm256_mask_i32scatter_epi64(void *__addr, __mmask8 __mask, - __m128i __index, __m256i __v1, - const int __scale) { - __builtin_ia32_scattersiv4di(__addr, __mask, (__v4si)__index, (__v4di)__v1, - __scale); -} - -__funline void _mm_i32scatter_epi64(void *__addr, __m128i __index, __m128i __v1, - const int __scale) { - __builtin_ia32_scattersiv2di(__addr, (__mmask8)0xFF, (__v4si)__index, - (__v2di)__v1, __scale); -} - -__funline void _mm_mask_i32scatter_epi64(void *__addr, __mmask8 __mask, - __m128i __index, __m128i __v1, - const int __scale) { - __builtin_ia32_scattersiv2di(__addr, __mask, (__v4si)__index, (__v2di)__v1, - __scale); -} - -__funline void _mm256_i64scatter_epi32(void *__addr, __m256i __index, - __m128i __v1, const int __scale) { - __builtin_ia32_scatterdiv8si(__addr, (__mmask8)0xFF, (__v4di)__index, - (__v4si)__v1, __scale); -} - -__funline void _mm256_mask_i64scatter_epi32(void *__addr, __mmask8 __mask, - __m256i __index, __m128i __v1, - const int __scale) { - __builtin_ia32_scatterdiv8si(__addr, __mask, (__v4di)__index, (__v4si)__v1, - __scale); -} - -__funline void _mm_i64scatter_epi32(void *__addr, __m128i __index, __m128i __v1, - const int __scale) { - 
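/*
 * Illustrative aside, not part of the patch: the unmasked scatter
 * wrappers pass (__mmask8)0xFF so every element is stored, while the
 * _mask_ forms store only the selected lanes; __scale must be a
 * compile-time 1, 2, 4 or 8. Sketch, assuming an AVX-512VL target:
 *
 *   void scatter_even (int *base, __m256i idx, __m256i vals)
 *   {
 *     // stores lanes 0, 2, 4, 6 of vals to base[idx[i]], 4-byte scale
 *     _mm256_mask_i32scatter_epi32 (base, 0x55, idx, vals, 4);
 *   }
 */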
__builtin_ia32_scatterdiv4si(__addr, (__mmask8)0xFF, (__v2di)__index, - (__v4si)__v1, __scale); -} - -__funline void _mm_mask_i64scatter_epi32(void *__addr, __mmask8 __mask, - __m128i __index, __m128i __v1, - const int __scale) { - __builtin_ia32_scatterdiv4si(__addr, __mask, (__v2di)__index, (__v4si)__v1, - __scale); -} - -__funline void _mm256_i64scatter_epi64(void *__addr, __m256i __index, - __m256i __v1, const int __scale) { - __builtin_ia32_scatterdiv4di(__addr, (__mmask8)0xFF, (__v4di)__index, - (__v4di)__v1, __scale); -} - -__funline void _mm256_mask_i64scatter_epi64(void *__addr, __mmask8 __mask, - __m256i __index, __m256i __v1, - const int __scale) { - __builtin_ia32_scatterdiv4di(__addr, __mask, (__v4di)__index, (__v4di)__v1, - __scale); -} - -__funline void _mm_i64scatter_epi64(void *__addr, __m128i __index, __m128i __v1, - const int __scale) { - __builtin_ia32_scatterdiv2di(__addr, (__mmask8)0xFF, (__v2di)__index, - (__v2di)__v1, __scale); -} - -__funline void _mm_mask_i64scatter_epi64(void *__addr, __mmask8 __mask, - __m128i __index, __m128i __v1, - const int __scale) { - __builtin_ia32_scatterdiv2di(__addr, __mask, (__v2di)__index, (__v2di)__v1, - __scale); -} - -__funline __m256i _mm256_mask_shuffle_epi32(__m256i __W, __mmask8 __U, - __m256i __A, _MM_PERM_ENUM __mask) { - return (__m256i)__builtin_ia32_pshufd256_mask((__v8si)__A, __mask, - (__v8si)__W, (__mmask8)__U); -} - -__funline __m256i _mm256_maskz_shuffle_epi32(__mmask8 __U, __m256i __A, - _MM_PERM_ENUM __mask) { - return (__m256i)__builtin_ia32_pshufd256_mask( - (__v8si)__A, __mask, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); -} - -__funline __m128i _mm_mask_shuffle_epi32(__m128i __W, __mmask8 __U, __m128i __A, - _MM_PERM_ENUM __mask) { - return (__m128i)__builtin_ia32_pshufd128_mask((__v4si)__A, __mask, - (__v4si)__W, (__mmask8)__U); -} - -__funline __m128i _mm_maskz_shuffle_epi32(__mmask8 __U, __m128i __A, - _MM_PERM_ENUM __mask) { - return (__m128i)__builtin_ia32_pshufd128_mask( - (__v4si)__A, __mask, (__v4si)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m256i _mm256_rol_epi32(__m256i __A, const int __B) { - return (__m256i)__builtin_ia32_prold256_mask( - (__v8si)__A, __B, (__v8si)_mm256_setzero_si256(), (__mmask8)-1); -} - -__funline __m256i _mm256_mask_rol_epi32(__m256i __W, __mmask8 __U, __m256i __A, - const int __B) { - return (__m256i)__builtin_ia32_prold256_mask((__v8si)__A, __B, (__v8si)__W, - (__mmask8)__U); -} - -__funline __m256i _mm256_maskz_rol_epi32(__mmask8 __U, __m256i __A, - const int __B) { - return (__m256i)__builtin_ia32_prold256_mask( - (__v8si)__A, __B, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); -} - -__funline __m128i _mm_rol_epi32(__m128i __A, const int __B) { - return (__m128i)__builtin_ia32_prold128_mask( - (__v4si)__A, __B, (__v4si)_mm_setzero_si128(), (__mmask8)-1); -} - -__funline __m128i _mm_mask_rol_epi32(__m128i __W, __mmask8 __U, __m128i __A, - const int __B) { - return (__m128i)__builtin_ia32_prold128_mask((__v4si)__A, __B, (__v4si)__W, - (__mmask8)__U); -} - -__funline __m128i _mm_maskz_rol_epi32(__mmask8 __U, __m128i __A, const int __B) { - return (__m128i)__builtin_ia32_prold128_mask( - (__v4si)__A, __B, (__v4si)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m256i _mm256_ror_epi32(__m256i __A, const int __B) { - return (__m256i)__builtin_ia32_prord256_mask( - (__v8si)__A, __B, (__v8si)_mm256_setzero_si256(), (__mmask8)-1); -} - -__funline __m256i _mm256_mask_ror_epi32(__m256i __W, __mmask8 __U, __m256i __A, - const int __B) { - return 
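/*
 * Illustrative aside, not part of the patch: unlike the sll/sra
 * wrappers earlier, whose shift count comes from a vector register, the
 * rol/ror wrappers here take an immediate, so the rotate count must be
 * a compile-time constant. Sketch, assuming an AVX-512VL target:
 *
 *   __m128i rotl7 (__m128i x)
 *   {
 *     // every 32-bit lane rotated left by 7
 *     return _mm_rol_epi32 (x, 7);
 *   }
 */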
(__m256i)__builtin_ia32_prord256_mask((__v8si)__A, __B, (__v8si)__W, - (__mmask8)__U); -} - -__funline __m256i _mm256_maskz_ror_epi32(__mmask8 __U, __m256i __A, - const int __B) { - return (__m256i)__builtin_ia32_prord256_mask( - (__v8si)__A, __B, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); -} - -__funline __m128i _mm_ror_epi32(__m128i __A, const int __B) { - return (__m128i)__builtin_ia32_prord128_mask( - (__v4si)__A, __B, (__v4si)_mm_setzero_si128(), (__mmask8)-1); -} - -__funline __m128i _mm_mask_ror_epi32(__m128i __W, __mmask8 __U, __m128i __A, - const int __B) { - return (__m128i)__builtin_ia32_prord128_mask((__v4si)__A, __B, (__v4si)__W, - (__mmask8)__U); -} - -__funline __m128i _mm_maskz_ror_epi32(__mmask8 __U, __m128i __A, const int __B) { - return (__m128i)__builtin_ia32_prord128_mask( - (__v4si)__A, __B, (__v4si)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m256i _mm256_rol_epi64(__m256i __A, const int __B) { - return (__m256i)__builtin_ia32_prolq256_mask( - (__v4di)__A, __B, (__v4di)_mm256_setzero_si256(), (__mmask8)-1); -} - -__funline __m256i _mm256_mask_rol_epi64(__m256i __W, __mmask8 __U, __m256i __A, - const int __B) { - return (__m256i)__builtin_ia32_prolq256_mask((__v4di)__A, __B, (__v4di)__W, - (__mmask8)__U); -} - -__funline __m256i _mm256_maskz_rol_epi64(__mmask8 __U, __m256i __A, - const int __B) { - return (__m256i)__builtin_ia32_prolq256_mask( - (__v4di)__A, __B, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); -} - -__funline __m128i _mm_rol_epi64(__m128i __A, const int __B) { - return (__m128i)__builtin_ia32_prolq128_mask( - (__v2di)__A, __B, (__v2di)_mm_setzero_si128(), (__mmask8)-1); -} - -__funline __m128i _mm_mask_rol_epi64(__m128i __W, __mmask8 __U, __m128i __A, - const int __B) { - return (__m128i)__builtin_ia32_prolq128_mask((__v2di)__A, __B, (__v2di)__W, - (__mmask8)__U); -} - -__funline __m128i _mm_maskz_rol_epi64(__mmask8 __U, __m128i __A, const int __B) { - return (__m128i)__builtin_ia32_prolq128_mask( - (__v2di)__A, __B, (__v2di)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m256i _mm256_ror_epi64(__m256i __A, const int __B) { - return (__m256i)__builtin_ia32_prorq256_mask( - (__v4di)__A, __B, (__v4di)_mm256_setzero_si256(), (__mmask8)-1); -} - -__funline __m256i _mm256_mask_ror_epi64(__m256i __W, __mmask8 __U, __m256i __A, - const int __B) { - return (__m256i)__builtin_ia32_prorq256_mask((__v4di)__A, __B, (__v4di)__W, - (__mmask8)__U); -} - -__funline __m256i _mm256_maskz_ror_epi64(__mmask8 __U, __m256i __A, - const int __B) { - return (__m256i)__builtin_ia32_prorq256_mask( - (__v4di)__A, __B, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); -} - -__funline __m128i _mm_ror_epi64(__m128i __A, const int __B) { - return (__m128i)__builtin_ia32_prorq128_mask( - (__v2di)__A, __B, (__v2di)_mm_setzero_si128(), (__mmask8)-1); -} - -__funline __m128i _mm_mask_ror_epi64(__m128i __W, __mmask8 __U, __m128i __A, - const int __B) { - return (__m128i)__builtin_ia32_prorq128_mask((__v2di)__A, __B, (__v2di)__W, - (__mmask8)__U); -} - -__funline __m128i _mm_maskz_ror_epi64(__mmask8 __U, __m128i __A, const int __B) { - return (__m128i)__builtin_ia32_prorq128_mask( - (__v2di)__A, __B, (__v2di)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m128i _mm_alignr_epi32(__m128i __A, __m128i __B, const int __imm) { - return (__m128i)__builtin_ia32_alignd128_mask((__v4si)__A, (__v4si)__B, __imm, - (__v4si)_mm_setzero_si128(), - (__mmask8)-1); -} - -__funline __m128i _mm_mask_alignr_epi32(__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B, const int 
__imm) { - return (__m128i)__builtin_ia32_alignd128_mask((__v4si)__A, (__v4si)__B, __imm, - (__v4si)__W, (__mmask8)__U); -} - -__funline __m128i _mm_maskz_alignr_epi32(__mmask8 __U, __m128i __A, __m128i __B, - const int __imm) { - return (__m128i)__builtin_ia32_alignd128_mask((__v4si)__A, (__v4si)__B, __imm, - (__v4si)_mm_setzero_si128(), - (__mmask8)__U); -} - -__funline __m128i _mm_alignr_epi64(__m128i __A, __m128i __B, const int __imm) { - return (__m128i)__builtin_ia32_alignq128_mask((__v2di)__A, (__v2di)__B, __imm, - (__v2di)_mm_setzero_si128(), - (__mmask8)-1); -} - -__funline __m128i _mm_mask_alignr_epi64(__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B, const int __imm) { - return (__m128i)__builtin_ia32_alignq128_mask((__v2di)__A, (__v2di)__B, __imm, - (__v2di)__W, (__mmask8)__U); -} - -__funline __m128i _mm_maskz_alignr_epi64(__mmask8 __U, __m128i __A, __m128i __B, - const int __imm) { - return (__m128i)__builtin_ia32_alignq128_mask((__v2di)__A, (__v2di)__B, __imm, - (__v2di)_mm_setzero_si128(), - (__mmask8)__U); -} - -__funline __m256i _mm256_alignr_epi32(__m256i __A, __m256i __B, const int __imm) { - return (__m256i)__builtin_ia32_alignd256_mask((__v8si)__A, (__v8si)__B, __imm, - (__v8si)_mm256_setzero_si256(), - (__mmask8)-1); -} - -__funline __m256i _mm256_mask_alignr_epi32(__m256i __W, __mmask8 __U, __m256i __A, - __m256i __B, const int __imm) { - return (__m256i)__builtin_ia32_alignd256_mask((__v8si)__A, (__v8si)__B, __imm, - (__v8si)__W, (__mmask8)__U); -} - -__funline __m256i _mm256_maskz_alignr_epi32(__mmask8 __U, __m256i __A, - __m256i __B, const int __imm) { - return (__m256i)__builtin_ia32_alignd256_mask((__v8si)__A, (__v8si)__B, __imm, - (__v8si)_mm256_setzero_si256(), - (__mmask8)__U); -} - -__funline __m256i _mm256_alignr_epi64(__m256i __A, __m256i __B, const int __imm) { - return (__m256i)__builtin_ia32_alignq256_mask((__v4di)__A, (__v4di)__B, __imm, - (__v4di)_mm256_setzero_si256(), - (__mmask8)-1); -} - -__funline __m256i _mm256_mask_alignr_epi64(__m256i __W, __mmask8 __U, __m256i __A, - __m256i __B, const int __imm) { - return (__m256i)__builtin_ia32_alignq256_mask((__v4di)__A, (__v4di)__B, __imm, - (__v4di)__W, (__mmask8)__U); -} - -__funline __m256i _mm256_maskz_alignr_epi64(__mmask8 __U, __m256i __A, - __m256i __B, const int __imm) { - return (__m256i)__builtin_ia32_alignq256_mask((__v4di)__A, (__v4di)__B, __imm, - (__v4di)_mm256_setzero_si256(), - (__mmask8)__U); -} - -__funline __m128i _mm_mask_cvtps_ph(__m128i __W, __mmask8 __U, __m128 __A, - const int __I) { - return (__m128i)__builtin_ia32_vcvtps2ph_mask((__v4sf)__A, __I, (__v8hi)__W, - (__mmask8)__U); -} - -__funline __m128i _mm_maskz_cvtps_ph(__mmask8 __U, __m128 __A, const int __I) { - return (__m128i)__builtin_ia32_vcvtps2ph_mask( - (__v4sf)__A, __I, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m128i _mm256_mask_cvtps_ph(__m128i __W, __mmask8 __U, __m256 __A, - const int __I) { - return (__m128i)__builtin_ia32_vcvtps2ph256_mask((__v8sf)__A, __I, - (__v8hi)__W, (__mmask8)__U); -} - -__funline __m128i _mm256_maskz_cvtps_ph(__mmask8 __U, __m256 __A, const int __I) { - return (__m128i)__builtin_ia32_vcvtps2ph256_mask( - (__v8sf)__A, __I, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m256i _mm256_mask_srai_epi32(__m256i __W, __mmask8 __U, __m256i __A, - const int __imm) { - return (__m256i)__builtin_ia32_psradi256_mask((__v8si)__A, __imm, (__v8si)__W, - (__mmask8)__U); -} - -__funline __m256i _mm256_maskz_srai_epi32(__mmask8 __U, __m256i __A, - const int __imm) { - 
return (__m256i)__builtin_ia32_psradi256_mask( - (__v8si)__A, __imm, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); -} - -__funline __m128i _mm_mask_srai_epi32(__m128i __W, __mmask8 __U, __m128i __A, - const int __imm) { - return (__m128i)__builtin_ia32_psradi128_mask((__v4si)__A, __imm, (__v4si)__W, - (__mmask8)__U); -} - -__funline __m128i _mm_maskz_srai_epi32(__mmask8 __U, __m128i __A, - const int __imm) { - return (__m128i)__builtin_ia32_psradi128_mask( - (__v4si)__A, __imm, (__v4si)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m256i _mm256_srai_epi64(__m256i __A, const int __imm) { - return (__m256i)__builtin_ia32_psraqi256_mask( - (__v4di)__A, __imm, (__v4di)_mm256_setzero_si256(), (__mmask8)-1); -} - -__funline __m256i _mm256_mask_srai_epi64(__m256i __W, __mmask8 __U, __m256i __A, - const int __imm) { - return (__m256i)__builtin_ia32_psraqi256_mask((__v4di)__A, __imm, (__v4di)__W, - (__mmask8)__U); -} - -__funline __m256i _mm256_maskz_srai_epi64(__mmask8 __U, __m256i __A, - const int __imm) { - return (__m256i)__builtin_ia32_psraqi256_mask( - (__v4di)__A, __imm, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); -} - -__funline __m128i _mm_srai_epi64(__m128i __A, const int __imm) { - return (__m128i)__builtin_ia32_psraqi128_mask( - (__v2di)__A, __imm, (__v2di)_mm_setzero_si128(), (__mmask8)-1); -} - -__funline __m128i _mm_mask_srai_epi64(__m128i __W, __mmask8 __U, __m128i __A, - const int __imm) { - return (__m128i)__builtin_ia32_psraqi128_mask((__v2di)__A, __imm, (__v2di)__W, - (__mmask8)__U); -} - -__funline __m128i _mm_maskz_srai_epi64(__mmask8 __U, __m128i __A, - const int __imm) { - return (__m128i)__builtin_ia32_psraqi128_mask( - (__v2di)__A, __imm, (__v2di)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m128i _mm_mask_slli_epi32(__m128i __W, __mmask8 __U, __m128i __A, - int __B) { - return (__m128i)__builtin_ia32_pslldi128_mask((__v4si)__A, __B, (__v4si)__W, - (__mmask8)__U); -} - -__funline __m128i _mm_maskz_slli_epi32(__mmask8 __U, __m128i __A, int __B) { - return (__m128i)__builtin_ia32_pslldi128_mask( - (__v4si)__A, __B, (__v4si)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m128i _mm_mask_slli_epi64(__m128i __W, __mmask8 __U, __m128i __A, - int __B) { - return (__m128i)__builtin_ia32_psllqi128_mask((__v2di)__A, __B, (__v2di)__W, - (__mmask8)__U); -} - -__funline __m128i _mm_maskz_slli_epi64(__mmask8 __U, __m128i __A, int __B) { - return (__m128i)__builtin_ia32_psllqi128_mask( - (__v2di)__A, __B, (__v2di)_mm_setzero_si128(), (__mmask8)__U); -} - -__funline __m256i _mm256_mask_slli_epi32(__m256i __W, __mmask8 __U, __m256i __A, - int __B) { - return (__m256i)__builtin_ia32_pslldi256_mask((__v8si)__A, __B, (__v8si)__W, - (__mmask8)__U); -} - -__funline __m256i _mm256_maskz_slli_epi32(__mmask8 __U, __m256i __A, int __B) { - return (__m256i)__builtin_ia32_pslldi256_mask( - (__v8si)__A, __B, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); -} - -__funline __m256i _mm256_mask_slli_epi64(__m256i __W, __mmask8 __U, __m256i __A, - int __B) { - return (__m256i)__builtin_ia32_psllqi256_mask((__v4di)__A, __B, (__v4di)__W, - (__mmask8)__U); -} - -__funline __m256i _mm256_maskz_slli_epi64(__mmask8 __U, __m256i __A, int __B) { - return (__m256i)__builtin_ia32_psllqi256_mask( - (__v4di)__A, __B, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); -} - -__funline __m256d _mm256_mask_permutex_pd(__m256d __W, __mmask8 __U, __m256d __X, - const int __imm) { - return (__m256d)__builtin_ia32_permdf256_mask((__v4df)__X, __imm, (__v4df)__W, - (__mmask8)__U); -} - -__funline 
__m256d _mm256_maskz_permutex_pd(__mmask8 __U, __m256d __X, - const int __imm) { - return (__m256d)__builtin_ia32_permdf256_mask( - (__v4df)__X, __imm, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); -} - -__funline __m256d _mm256_mask_permute_pd(__m256d __W, __mmask8 __U, __m256d __X, - const int __C) { - return (__m256d)__builtin_ia32_vpermilpd256_mask((__v4df)__X, __C, - (__v4df)__W, (__mmask8)__U); -} - -__funline __m256d _mm256_maskz_permute_pd(__mmask8 __U, __m256d __X, - const int __C) { - return (__m256d)__builtin_ia32_vpermilpd256_mask( - (__v4df)__X, __C, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); -} - -__funline __m128d _mm_mask_permute_pd(__m128d __W, __mmask8 __U, __m128d __X, - const int __C) { - return (__m128d)__builtin_ia32_vpermilpd_mask((__v2df)__X, __C, (__v2df)__W, - (__mmask8)__U); -} - -__funline __m128d _mm_maskz_permute_pd(__mmask8 __U, __m128d __X, const int __C) { - return (__m128d)__builtin_ia32_vpermilpd_mask( - (__v2df)__X, __C, (__v2df)_mm_setzero_pd(), (__mmask8)__U); -} - -__funline __m256 _mm256_mask_permute_ps(__m256 __W, __mmask8 __U, __m256 __X, - const int __C) { - return (__m256)__builtin_ia32_vpermilps256_mask((__v8sf)__X, __C, (__v8sf)__W, - (__mmask8)__U); -} - -__funline __m256 _mm256_maskz_permute_ps(__mmask8 __U, __m256 __X, - const int __C) { - return (__m256)__builtin_ia32_vpermilps256_mask( - (__v8sf)__X, __C, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); -} - -__funline __m128 _mm_mask_permute_ps(__m128 __W, __mmask8 __U, __m128 __X, - const int __C) { - return (__m128)__builtin_ia32_vpermilps_mask((__v4sf)__X, __C, (__v4sf)__W, - (__mmask8)__U); -} - -__funline __m128 _mm_maskz_permute_ps(__mmask8 __U, __m128 __X, const int __C) { - return (__m128)__builtin_ia32_vpermilps_mask( - (__v4sf)__X, __C, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); -} - -__funline __m256d _mm256_mask_blend_pd(__mmask8 __U, __m256d __A, __m256d __W) { - return (__m256d)__builtin_ia32_blendmpd_256_mask((__v4df)__A, (__v4df)__W, - (__mmask8)__U); -} - -__funline __m256 _mm256_mask_blend_ps(__mmask8 __U, __m256 __A, __m256 __W) { - return (__m256)__builtin_ia32_blendmps_256_mask((__v8sf)__A, (__v8sf)__W, - (__mmask8)__U); -} - -__funline __m256i _mm256_mask_blend_epi64(__mmask8 __U, __m256i __A, - __m256i __W) { - return (__m256i)__builtin_ia32_blendmq_256_mask((__v4di)__A, (__v4di)__W, - (__mmask8)__U); -} - -__funline __m256i _mm256_mask_blend_epi32(__mmask8 __U, __m256i __A, - __m256i __W) { - return (__m256i)__builtin_ia32_blendmd_256_mask((__v8si)__A, (__v8si)__W, - (__mmask8)__U); -} - -__funline __m128d _mm_mask_blend_pd(__mmask8 __U, __m128d __A, __m128d __W) { - return (__m128d)__builtin_ia32_blendmpd_128_mask((__v2df)__A, (__v2df)__W, - (__mmask8)__U); -} - -__funline __m128 _mm_mask_blend_ps(__mmask8 __U, __m128 __A, __m128 __W) { - return (__m128)__builtin_ia32_blendmps_128_mask((__v4sf)__A, (__v4sf)__W, - (__mmask8)__U); -} - -__funline __m128i _mm_mask_blend_epi64(__mmask8 __U, __m128i __A, __m128i __W) { - return (__m128i)__builtin_ia32_blendmq_128_mask((__v2di)__A, (__v2di)__W, - (__mmask8)__U); -} - -__funline __m128i _mm_mask_blend_epi32(__mmask8 __U, __m128i __A, __m128i __W) { - return (__m128i)__builtin_ia32_blendmd_128_mask((__v4si)__A, (__v4si)__W, - (__mmask8)__U); -} - -__funline __mmask8 _mm256_cmp_epi64_mask(__m256i __X, __m256i __Y, - const int __P) { - return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__X, (__v4di)__Y, __P, - (__mmask8)-1); -} - -__funline __mmask8 _mm256_cmp_epi32_mask(__m256i __X, __m256i __Y, - const int __P) { - return 
(__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__X, (__v8si)__Y, __P, - (__mmask8)-1); -} - -__funline __mmask8 _mm256_cmp_epu64_mask(__m256i __X, __m256i __Y, - const int __P) { - return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__X, (__v4di)__Y, __P, - (__mmask8)-1); -} - -__funline __mmask8 _mm256_cmp_epu32_mask(__m256i __X, __m256i __Y, - const int __P) { - return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__X, (__v8si)__Y, __P, - (__mmask8)-1); -} - -__funline __mmask8 _mm256_cmp_pd_mask(__m256d __X, __m256d __Y, const int __P) { - return (__mmask8)__builtin_ia32_cmppd256_mask((__v4df)__X, (__v4df)__Y, __P, - (__mmask8)-1); -} - -__funline __mmask8 _mm256_cmp_ps_mask(__m256 __X, __m256 __Y, const int __P) { - return (__mmask8)__builtin_ia32_cmpps256_mask((__v8sf)__X, (__v8sf)__Y, __P, - (__mmask8)-1); -} - -__funline __mmask8 _mm256_mask_cmp_epi64_mask(__mmask8 __U, __m256i __X, - __m256i __Y, const int __P) { - return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__X, (__v4di)__Y, __P, - (__mmask8)__U); -} - -__funline __mmask8 _mm256_mask_cmp_epi32_mask(__mmask8 __U, __m256i __X, - __m256i __Y, const int __P) { - return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__X, (__v8si)__Y, __P, - (__mmask8)__U); -} - -__funline __mmask8 _mm256_mask_cmp_epu64_mask(__mmask8 __U, __m256i __X, - __m256i __Y, const int __P) { - return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__X, (__v4di)__Y, __P, - (__mmask8)__U); -} - -__funline __mmask8 _mm256_mask_cmp_epu32_mask(__mmask8 __U, __m256i __X, - __m256i __Y, const int __P) { - return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__X, (__v8si)__Y, __P, - (__mmask8)__U); -} - -__funline __mmask8 _mm256_mask_cmp_pd_mask(__mmask8 __U, __m256d __X, __m256d __Y, - const int __P) { - return (__mmask8)__builtin_ia32_cmppd256_mask((__v4df)__X, (__v4df)__Y, __P, - (__mmask8)__U); -} - -__funline __mmask8 _mm256_mask_cmp_ps_mask(__mmask8 __U, __m256 __X, __m256 __Y, - const int __P) { - return (__mmask8)__builtin_ia32_cmpps256_mask((__v8sf)__X, (__v8sf)__Y, __P, - (__mmask8)__U); -} - -__funline __mmask8 _mm_cmp_epi64_mask(__m128i __X, __m128i __Y, const int __P) { - return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__X, (__v2di)__Y, __P, - (__mmask8)-1); -} - -__funline __mmask8 _mm_cmp_epi32_mask(__m128i __X, __m128i __Y, const int __P) { - return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__X, (__v4si)__Y, __P, - (__mmask8)-1); -} - -__funline __mmask8 _mm_cmp_epu64_mask(__m128i __X, __m128i __Y, const int __P) { - return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__X, (__v2di)__Y, __P, - (__mmask8)-1); -} - -__funline __mmask8 _mm_cmp_epu32_mask(__m128i __X, __m128i __Y, const int __P) { - return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__X, (__v4si)__Y, __P, - (__mmask8)-1); -} - -__funline __mmask8 _mm_cmp_pd_mask(__m128d __X, __m128d __Y, const int __P) { - return (__mmask8)__builtin_ia32_cmppd128_mask((__v2df)__X, (__v2df)__Y, __P, - (__mmask8)-1); -} - -__funline __mmask8 _mm_cmp_ps_mask(__m128 __X, __m128 __Y, const int __P) { - return (__mmask8)__builtin_ia32_cmpps128_mask((__v4sf)__X, (__v4sf)__Y, __P, - (__mmask8)-1); -} - -__funline __mmask8 _mm_mask_cmp_epi64_mask(__mmask8 __U, __m128i __X, __m128i __Y, - const int __P) { - return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__X, (__v2di)__Y, __P, - (__mmask8)__U); -} - -__funline __mmask8 _mm_mask_cmp_epi32_mask(__mmask8 __U, __m128i __X, __m128i __Y, - const int __P) { - return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__X, (__v4si)__Y, __P, - 
(__mmask8)__U); -} - -__funline __mmask8 _mm_mask_cmp_epu64_mask(__mmask8 __U, __m128i __X, __m128i __Y, - const int __P) { - return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__X, (__v2di)__Y, __P, - (__mmask8)__U); -} - -__funline __mmask8 _mm_mask_cmp_epu32_mask(__mmask8 __U, __m128i __X, __m128i __Y, - const int __P) { - return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__X, (__v4si)__Y, __P, - (__mmask8)__U); -} - -__funline __mmask8 _mm_mask_cmp_pd_mask(__mmask8 __U, __m128d __X, __m128d __Y, - const int __P) { - return (__mmask8)__builtin_ia32_cmppd128_mask((__v2df)__X, (__v2df)__Y, __P, - (__mmask8)__U); -} - -__funline __mmask8 _mm_mask_cmp_ps_mask(__mmask8 __U, __m128 __X, __m128 __Y, - const int __P) { - return (__mmask8)__builtin_ia32_cmpps128_mask((__v4sf)__X, (__v4sf)__Y, __P, - (__mmask8)__U); -} - -__funline __m256d _mm256_permutex_pd(__m256d __X, const int __M) { - return (__m256d)__builtin_ia32_permdf256_mask( - (__v4df)__X, __M, (__v4df)_mm256_undefined_pd(), (__mmask8)-1); -} - +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permutex_epi64 (__m256i __X, const int __I) +{ + return (__m256i) __builtin_ia32_permdi256_mask ((__v4di) __X, + __I, + (__v4di) + _mm256_setzero_si256(), + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_permutex_epi64 (__m256i __W, __mmask8 __M, + __m256i __X, const int __I) +{ + return (__m256i) __builtin_ia32_permdi256_mask ((__v4di) __X, + __I, + (__v4di) __W, + (__mmask8) __M); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_permutex_epi64 (__mmask8 __M, __m256i __X, const int __I) +{ + return (__m256i) __builtin_ia32_permdi256_mask ((__v4di) __X, + __I, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __M); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_shuffle_pd (__m256d __W, __mmask8 __U, __m256d __A, + __m256d __B, const int __imm) +{ + return (__m256d) __builtin_ia32_shufpd256_mask ((__v4df) __A, + (__v4df) __B, __imm, + (__v4df) __W, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_shuffle_pd (__mmask8 __U, __m256d __A, __m256d __B, + const int __imm) +{ + return (__m256d) __builtin_ia32_shufpd256_mask ((__v4df) __A, + (__v4df) __B, __imm, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_shuffle_pd (__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B, const int __imm) +{ + return (__m128d) __builtin_ia32_shufpd128_mask ((__v2df) __A, + (__v2df) __B, __imm, + (__v2df) __W, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_shuffle_pd (__mmask8 __U, __m128d __A, __m128d __B, + const int __imm) +{ + return (__m128d) __builtin_ia32_shufpd128_mask ((__v2df) __A, + (__v2df) __B, __imm, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_shuffle_ps (__m256 __W, __mmask8 __U, __m256 __A, + __m256 __B, const int __imm) +{ + return (__m256) __builtin_ia32_shufps256_mask ((__v8sf) __A, + (__v8sf) __B, __imm, + (__v8sf) __W, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, 
__always_inline__, __artificial__)) +_mm256_maskz_shuffle_ps (__mmask8 __U, __m256 __A, __m256 __B, + const int __imm) +{ + return (__m256) __builtin_ia32_shufps256_mask ((__v8sf) __A, + (__v8sf) __B, __imm, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_shuffle_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B, + const int __imm) +{ + return (__m128) __builtin_ia32_shufps128_mask ((__v4sf) __A, + (__v4sf) __B, __imm, + (__v4sf) __W, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_shuffle_ps (__mmask8 __U, __m128 __A, __m128 __B, + const int __imm) +{ + return (__m128) __builtin_ia32_shufps128_mask ((__v4sf) __A, + (__v4sf) __B, __imm, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_inserti32x4 (__m256i __A, __m128i __B, const int __imm) +{ + return (__m256i) __builtin_ia32_inserti32x4_256_mask ((__v8si) __A, + (__v4si) __B, + __imm, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_inserti32x4 (__m256i __W, __mmask8 __U, __m256i __A, + __m128i __B, const int __imm) +{ + return (__m256i) __builtin_ia32_inserti32x4_256_mask ((__v8si) __A, + (__v4si) __B, + __imm, + (__v8si) __W, + (__mmask8) + __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_inserti32x4 (__mmask8 __U, __m256i __A, __m128i __B, + const int __imm) +{ + return (__m256i) __builtin_ia32_inserti32x4_256_mask ((__v8si) __A, + (__v4si) __B, + __imm, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) + __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_insertf32x4 (__m256 __A, __m128 __B, const int __imm) +{ + return (__m256) __builtin_ia32_insertf32x4_256_mask ((__v8sf) __A, + (__v4sf) __B, + __imm, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) -1); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_insertf32x4 (__m256 __W, __mmask8 __U, __m256 __A, + __m128 __B, const int __imm) +{ + return (__m256) __builtin_ia32_insertf32x4_256_mask ((__v8sf) __A, + (__v4sf) __B, + __imm, + (__v8sf) __W, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_insertf32x4 (__mmask8 __U, __m256 __A, __m128 __B, + const int __imm) +{ + return (__m256) __builtin_ia32_insertf32x4_256_mask ((__v8sf) __A, + (__v4sf) __B, + __imm, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_extracti32x4_epi32 (__m256i __A, const int __imm) +{ + return (__m128i) __builtin_ia32_extracti32x4_256_mask ((__v8si) __A, + __imm, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) -1); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_extracti32x4_epi32 (__m128i __W, __mmask8 __U, __m256i __A, + const int __imm) +{ + return (__m128i) __builtin_ia32_extracti32x4_256_mask ((__v8si) __A, + __imm, + (__v4si) __W, + (__mmask8) + __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm256_maskz_extracti32x4_epi32 (__mmask8 __U, __m256i __A, + const int __imm) +{ + return (__m128i) __builtin_ia32_extracti32x4_256_mask ((__v8si) __A, + __imm, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) + __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_extractf32x4_ps (__m256 __A, const int __imm) +{ + return (__m128) __builtin_ia32_extractf32x4_256_mask ((__v8sf) __A, + __imm, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) -1); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_extractf32x4_ps (__m128 __W, __mmask8 __U, __m256 __A, + const int __imm) +{ + return (__m128) __builtin_ia32_extractf32x4_256_mask ((__v8sf) __A, + __imm, + (__v4sf) __W, + (__mmask8) + __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_extractf32x4_ps (__mmask8 __U, __m256 __A, + const int __imm) +{ + return (__m128) __builtin_ia32_extractf32x4_256_mask ((__v8sf) __A, + __imm, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) + __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_shuffle_i64x2 (__m256i __A, __m256i __B, const int __imm) +{ + return (__m256i) __builtin_ia32_shuf_i64x2_256_mask ((__v4di) __A, + (__v4di) __B, + __imm, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_shuffle_i64x2 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B, const int __imm) +{ + return (__m256i) __builtin_ia32_shuf_i64x2_256_mask ((__v4di) __A, + (__v4di) __B, + __imm, + (__v4di) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_shuffle_i64x2 (__mmask8 __U, __m256i __A, __m256i __B, + const int __imm) +{ + return (__m256i) __builtin_ia32_shuf_i64x2_256_mask ((__v4di) __A, + (__v4di) __B, + __imm, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_shuffle_i32x4 (__m256i __A, __m256i __B, const int __imm) +{ + return (__m256i) __builtin_ia32_shuf_i32x4_256_mask ((__v8si) __A, + (__v8si) __B, + __imm, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_shuffle_i32x4 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B, const int __imm) +{ + return (__m256i) __builtin_ia32_shuf_i32x4_256_mask ((__v8si) __A, + (__v8si) __B, + __imm, + (__v8si) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_shuffle_i32x4 (__mmask8 __U, __m256i __A, __m256i __B, + const int __imm) +{ + return (__m256i) __builtin_ia32_shuf_i32x4_256_mask ((__v8si) __A, + (__v8si) __B, + __imm, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_shuffle_f64x2 (__m256d __A, __m256d __B, const int __imm) +{ + return (__m256d) __builtin_ia32_shuf_f64x2_256_mask ((__v4df) __A, + (__v4df) __B, + __imm, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) -1); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_shuffle_f64x2 (__m256d __W, __mmask8 __U, __m256d 
__A, + __m256d __B, const int __imm) +{ + return (__m256d) __builtin_ia32_shuf_f64x2_256_mask ((__v4df) __A, + (__v4df) __B, + __imm, + (__v4df) __W, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_shuffle_f64x2 (__mmask8 __U, __m256d __A, __m256d __B, + const int __imm) +{ + return (__m256d) __builtin_ia32_shuf_f64x2_256_mask ((__v4df) __A, + (__v4df) __B, + __imm, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_shuffle_f32x4 (__m256 __A, __m256 __B, const int __imm) +{ + return (__m256) __builtin_ia32_shuf_f32x4_256_mask ((__v8sf) __A, + (__v8sf) __B, + __imm, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) -1); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_shuffle_f32x4 (__m256 __W, __mmask8 __U, __m256 __A, + __m256 __B, const int __imm) +{ + return (__m256) __builtin_ia32_shuf_f32x4_256_mask ((__v8sf) __A, + (__v8sf) __B, + __imm, + (__v8sf) __W, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_shuffle_f32x4 (__mmask8 __U, __m256 __A, __m256 __B, + const int __imm) +{ + return (__m256) __builtin_ia32_shuf_f32x4_256_mask ((__v8sf) __A, + (__v8sf) __B, + __imm, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_fixupimm_pd (__m256d __A, __m256d __B, __m256i __C, + const int __imm) +{ + return (__m256d) __builtin_ia32_fixupimmpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4di) __C, + __imm, + (__mmask8) -1); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fixupimm_pd (__m256d __A, __mmask8 __U, __m256d __B, + __m256i __C, const int __imm) +{ + return (__m256d) __builtin_ia32_fixupimmpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4di) __C, + __imm, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_fixupimm_pd (__mmask8 __U, __m256d __A, __m256d __B, + __m256i __C, const int __imm) +{ + return (__m256d) __builtin_ia32_fixupimmpd256_maskz ((__v4df) __A, + (__v4df) __B, + (__v4di) __C, + __imm, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_fixupimm_ps (__m256 __A, __m256 __B, __m256i __C, + const int __imm) +{ + return (__m256) __builtin_ia32_fixupimmps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8si) __C, + __imm, + (__mmask8) -1); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fixupimm_ps (__m256 __A, __mmask8 __U, __m256 __B, + __m256i __C, const int __imm) +{ + return (__m256) __builtin_ia32_fixupimmps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8si) __C, + __imm, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_fixupimm_ps (__mmask8 __U, __m256 __A, __m256 __B, + __m256i __C, const int __imm) +{ + return (__m256) __builtin_ia32_fixupimmps256_maskz ((__v8sf) __A, + (__v8sf) __B, + (__v8si) __C, + __imm, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fixupimm_pd (__m128d __A, __m128d __B, __m128i __C, + const int __imm) +{ 
+ return (__m128d) __builtin_ia32_fixupimmpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2di) __C, + __imm, + (__mmask8) -1); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fixupimm_pd (__m128d __A, __mmask8 __U, __m128d __B, + __m128i __C, const int __imm) +{ + return (__m128d) __builtin_ia32_fixupimmpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2di) __C, + __imm, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fixupimm_pd (__mmask8 __U, __m128d __A, __m128d __B, + __m128i __C, const int __imm) +{ + return (__m128d) __builtin_ia32_fixupimmpd128_maskz ((__v2df) __A, + (__v2df) __B, + (__v2di) __C, + __imm, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fixupimm_ps (__m128 __A, __m128 __B, __m128i __C, const int __imm) +{ + return (__m128) __builtin_ia32_fixupimmps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4si) __C, + __imm, + (__mmask8) -1); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fixupimm_ps (__m128 __A, __mmask8 __U, __m128 __B, + __m128i __C, const int __imm) +{ + return (__m128) __builtin_ia32_fixupimmps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4si) __C, + __imm, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fixupimm_ps (__mmask8 __U, __m128 __A, __m128 __B, + __m128i __C, const int __imm) +{ + return (__m128) __builtin_ia32_fixupimmps128_maskz ((__v4sf) __A, + (__v4sf) __B, + (__v4si) __C, + __imm, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_srli_epi32 (__m256i __W, __mmask8 __U, __m256i __A, + const int __imm) +{ + return (__m256i) __builtin_ia32_psrldi256_mask ((__v8si) __A, __imm, + (__v8si) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_srli_epi32 (__mmask8 __U, __m256i __A, const int __imm) +{ + return (__m256i) __builtin_ia32_psrldi256_mask ((__v8si) __A, __imm, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_srli_epi32 (__m128i __W, __mmask8 __U, __m128i __A, + const int __imm) +{ + return (__m128i) __builtin_ia32_psrldi128_mask ((__v4si) __A, __imm, + (__v4si) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_srli_epi32 (__mmask8 __U, __m128i __A, const int __imm) +{ + return (__m128i) __builtin_ia32_psrldi128_mask ((__v4si) __A, __imm, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_srli_epi64 (__m256i __W, __mmask8 __U, __m256i __A, + const int __imm) +{ + return (__m256i) __builtin_ia32_psrlqi256_mask ((__v4di) __A, __imm, + (__v4di) __W, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_srli_epi64 (__mmask8 __U, __m256i __A, const int __imm) +{ + return (__m256i) __builtin_ia32_psrlqi256_mask ((__v4di) __A, __imm, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, 
__artificial__)) +_mm_mask_srli_epi64 (__m128i __W, __mmask8 __U, __m128i __A, + const int __imm) +{ + return (__m128i) __builtin_ia32_psrlqi128_mask ((__v2di) __A, __imm, + (__v2di) __W, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_srli_epi64 (__mmask8 __U, __m128i __A, const int __imm) +{ + return (__m128i) __builtin_ia32_psrlqi128_mask ((__v2di) __A, __imm, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_ternarylogic_epi64 (__m256i __A, __m256i __B, __m256i __C, + const int __imm) +{ + return (__m256i) __builtin_ia32_pternlogq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) __C, __imm, + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_ternarylogic_epi64 (__m256i __A, __mmask8 __U, + __m256i __B, __m256i __C, + const int __imm) +{ + return (__m256i) __builtin_ia32_pternlogq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) __C, __imm, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_ternarylogic_epi64 (__mmask8 __U, __m256i __A, + __m256i __B, __m256i __C, + const int __imm) +{ + return (__m256i) __builtin_ia32_pternlogq256_maskz ((__v4di) __A, + (__v4di) __B, + (__v4di) __C, + __imm, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_ternarylogic_epi32 (__m256i __A, __m256i __B, __m256i __C, + const int __imm) +{ + return (__m256i) __builtin_ia32_pternlogd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) __C, __imm, + (__mmask8) -1); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_ternarylogic_epi32 (__m256i __A, __mmask8 __U, + __m256i __B, __m256i __C, + const int __imm) +{ + return (__m256i) __builtin_ia32_pternlogd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) __C, __imm, + (__mmask8) __U); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_ternarylogic_epi32 (__mmask8 __U, __m256i __A, + __m256i __B, __m256i __C, + const int __imm) +{ + return (__m256i) __builtin_ia32_pternlogd256_maskz ((__v8si) __A, + (__v8si) __B, + (__v8si) __C, + __imm, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_ternarylogic_epi64 (__m128i __A, __m128i __B, __m128i __C, + const int __imm) +{ + return (__m128i) __builtin_ia32_pternlogq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __C, __imm, + (__mmask8) -1); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_ternarylogic_epi64 (__m128i __A, __mmask8 __U, + __m128i __B, __m128i __C, const int __imm) +{ + return (__m128i) __builtin_ia32_pternlogq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __C, __imm, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_ternarylogic_epi64 (__mmask8 __U, __m128i __A, + __m128i __B, __m128i __C, const int __imm) +{ + return (__m128i) __builtin_ia32_pternlogq128_maskz ((__v2di) __A, + (__v2di) __B, + (__v2di) __C, + __imm, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_ternarylogic_epi32 (__m128i __A, 
__m128i __B, __m128i __C, + const int __imm) +{ + return (__m128i) __builtin_ia32_pternlogd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __C, __imm, + (__mmask8) -1); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_ternarylogic_epi32 (__m128i __A, __mmask8 __U, + __m128i __B, __m128i __C, const int __imm) +{ + return (__m128i) __builtin_ia32_pternlogd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __C, __imm, + (__mmask8) __U); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_ternarylogic_epi32 (__mmask8 __U, __m128i __A, + __m128i __B, __m128i __C, const int __imm) +{ + return (__m128i) __builtin_ia32_pternlogd128_maskz ((__v4si) __A, + (__v4si) __B, + (__v4si) __C, + __imm, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_roundscale_ps (__m256 __A, const int __imm) +{ + return (__m256) __builtin_ia32_rndscaleps_256_mask ((__v8sf) __A, + __imm, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) -1); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_roundscale_ps (__m256 __W, __mmask8 __U, __m256 __A, + const int __imm) +{ + return (__m256) __builtin_ia32_rndscaleps_256_mask ((__v8sf) __A, + __imm, + (__v8sf) __W, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_roundscale_ps (__mmask8 __U, __m256 __A, const int __imm) +{ + return (__m256) __builtin_ia32_rndscaleps_256_mask ((__v8sf) __A, + __imm, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_roundscale_pd (__m256d __A, const int __imm) +{ + return (__m256d) __builtin_ia32_rndscalepd_256_mask ((__v4df) __A, + __imm, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) -1); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_roundscale_pd (__m256d __W, __mmask8 __U, __m256d __A, + const int __imm) +{ + return (__m256d) __builtin_ia32_rndscalepd_256_mask ((__v4df) __A, + __imm, + (__v4df) __W, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_roundscale_pd (__mmask8 __U, __m256d __A, const int __imm) +{ + return (__m256d) __builtin_ia32_rndscalepd_256_mask ((__v4df) __A, + __imm, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_roundscale_ps (__m128 __A, const int __imm) +{ + return (__m128) __builtin_ia32_rndscaleps_128_mask ((__v4sf) __A, + __imm, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) -1); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_roundscale_ps (__m128 __W, __mmask8 __U, __m128 __A, + const int __imm) +{ + return (__m128) __builtin_ia32_rndscaleps_128_mask ((__v4sf) __A, + __imm, + (__v4sf) __W, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_roundscale_ps (__mmask8 __U, __m128 __A, const int __imm) +{ + return (__m128) __builtin_ia32_rndscaleps_128_mask ((__v4sf) __A, + __imm, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, 
__artificial__)) +_mm_roundscale_pd (__m128d __A, const int __imm) +{ + return (__m128d) __builtin_ia32_rndscalepd_128_mask ((__v2df) __A, + __imm, + (__v2df) + _mm_setzero_pd (), + (__mmask8) -1); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_roundscale_pd (__m128d __W, __mmask8 __U, __m128d __A, + const int __imm) +{ + return (__m128d) __builtin_ia32_rndscalepd_128_mask ((__v2df) __A, + __imm, + (__v2df) __W, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_roundscale_pd (__mmask8 __U, __m128d __A, const int __imm) +{ + return (__m128d) __builtin_ia32_rndscalepd_128_mask ((__v2df) __A, + __imm, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_getmant_ps (__m256 __A, _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m256) __builtin_ia32_getmantps256_mask ((__v8sf) __A, + (__C << 2) | __B, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) -1); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_getmant_ps (__m256 __W, __mmask8 __U, __m256 __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m256) __builtin_ia32_getmantps256_mask ((__v8sf) __A, + (__C << 2) | __B, + (__v8sf) __W, + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_getmant_ps (__mmask8 __U, __m256 __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m256) __builtin_ia32_getmantps256_mask ((__v8sf) __A, + (__C << 2) | __B, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_getmant_ps (__m128 __A, _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m128) __builtin_ia32_getmantps128_mask ((__v4sf) __A, + (__C << 2) | __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) -1); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_getmant_ps (__m128 __W, __mmask8 __U, __m128 __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m128) __builtin_ia32_getmantps128_mask ((__v4sf) __A, + (__C << 2) | __B, + (__v4sf) __W, + (__mmask8) __U); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_getmant_ps (__mmask8 __U, __m128 __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m128) __builtin_ia32_getmantps128_mask ((__v4sf) __A, + (__C << 2) | __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_getmant_pd (__m256d __A, _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m256d) __builtin_ia32_getmantpd256_mask ((__v4df) __A, + (__C << 2) | __B, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) -1); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_getmant_pd (__m256d __W, __mmask8 __U, __m256d __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m256d) __builtin_ia32_getmantpd256_mask ((__v4df) __A, + (__C << 2) | __B, + (__v4df) __W, + (__mmask8) __U); +} +extern __inline __m256d +__attribute__ 
((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_getmant_pd (__mmask8 __U, __m256d __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m256d) __builtin_ia32_getmantpd256_mask ((__v4df) __A, + (__C << 2) | __B, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_getmant_pd (__m128d __A, _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m128d) __builtin_ia32_getmantpd128_mask ((__v2df) __A, + (__C << 2) | __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) -1); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_getmant_pd (__m128d __W, __mmask8 __U, __m128d __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m128d) __builtin_ia32_getmantpd128_mask ((__v2df) __A, + (__C << 2) | __B, + (__v2df) __W, + (__mmask8) __U); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_getmant_pd (__mmask8 __U, __m128d __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m128d) __builtin_ia32_getmantpd128_mask ((__v2df) __A, + (__C << 2) | __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mmask_i32gather_ps (__m256 __v1_old, __mmask8 __mask, + __m256i __index, void const *__addr, + int __scale) +{ + return (__m256) __builtin_ia32_gather3siv8sf ((__v8sf) __v1_old, + __addr, + (__v8si) __index, + __mask, __scale); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mmask_i32gather_ps (__m128 __v1_old, __mmask8 __mask, + __m128i __index, void const *__addr, + int __scale) +{ + return (__m128) __builtin_ia32_gather3siv4sf ((__v4sf) __v1_old, + __addr, + (__v4si) __index, + __mask, __scale); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mmask_i32gather_pd (__m256d __v1_old, __mmask8 __mask, + __m128i __index, void const *__addr, + int __scale) +{ + return (__m256d) __builtin_ia32_gather3siv4df ((__v4df) __v1_old, + __addr, + (__v4si) __index, + __mask, __scale); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mmask_i32gather_pd (__m128d __v1_old, __mmask8 __mask, + __m128i __index, void const *__addr, + int __scale) +{ + return (__m128d) __builtin_ia32_gather3siv2df ((__v2df) __v1_old, + __addr, + (__v4si) __index, + __mask, __scale); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mmask_i64gather_ps (__m128 __v1_old, __mmask8 __mask, + __m256i __index, void const *__addr, + int __scale) +{ + return (__m128) __builtin_ia32_gather3div8sf ((__v4sf) __v1_old, + __addr, + (__v4di) __index, + __mask, __scale); +} +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mmask_i64gather_ps (__m128 __v1_old, __mmask8 __mask, + __m128i __index, void const *__addr, + int __scale) +{ + return (__m128) __builtin_ia32_gather3div4sf ((__v4sf) __v1_old, + __addr, + (__v2di) __index, + __mask, __scale); +} +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mmask_i64gather_pd (__m256d __v1_old, __mmask8 __mask, + __m256i __index, void const *__addr, + int __scale) +{ + return (__m256d) 
__builtin_ia32_gather3div4df ((__v4df) __v1_old, + __addr, + (__v4di) __index, + __mask, __scale); +} +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mmask_i64gather_pd (__m128d __v1_old, __mmask8 __mask, + __m128i __index, void const *__addr, + int __scale) +{ + return (__m128d) __builtin_ia32_gather3div2df ((__v2df) __v1_old, + __addr, + (__v2di) __index, + __mask, __scale); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mmask_i32gather_epi32 (__m256i __v1_old, __mmask8 __mask, + __m256i __index, void const *__addr, + int __scale) +{ + return (__m256i) __builtin_ia32_gather3siv8si ((__v8si) __v1_old, + __addr, + (__v8si) __index, + __mask, __scale); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mmask_i32gather_epi32 (__m128i __v1_old, __mmask8 __mask, + __m128i __index, void const *__addr, + int __scale) +{ + return (__m128i) __builtin_ia32_gather3siv4si ((__v4si) __v1_old, + __addr, + (__v4si) __index, + __mask, __scale); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mmask_i32gather_epi64 (__m256i __v1_old, __mmask8 __mask, + __m128i __index, void const *__addr, + int __scale) +{ + return (__m256i) __builtin_ia32_gather3siv4di ((__v4di) __v1_old, + __addr, + (__v4si) __index, + __mask, __scale); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mmask_i32gather_epi64 (__m128i __v1_old, __mmask8 __mask, + __m128i __index, void const *__addr, + int __scale) +{ + return (__m128i) __builtin_ia32_gather3siv2di ((__v2di) __v1_old, + __addr, + (__v4si) __index, + __mask, __scale); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mmask_i64gather_epi32 (__m128i __v1_old, __mmask8 __mask, + __m256i __index, void const *__addr, + int __scale) +{ + return (__m128i) __builtin_ia32_gather3div8si ((__v4si) __v1_old, + __addr, + (__v4di) __index, + __mask, __scale); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mmask_i64gather_epi32 (__m128i __v1_old, __mmask8 __mask, + __m128i __index, void const *__addr, + int __scale) +{ + return (__m128i) __builtin_ia32_gather3div4si ((__v4si) __v1_old, + __addr, + (__v2di) __index, + __mask, __scale); +} +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mmask_i64gather_epi64 (__m256i __v1_old, __mmask8 __mask, + __m256i __index, void const *__addr, + int __scale) +{ + return (__m256i) __builtin_ia32_gather3div4di ((__v4di) __v1_old, + __addr, + (__v4di) __index, + __mask, __scale); +} +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mmask_i64gather_epi64 (__m128i __v1_old, __mmask8 __mask, + __m128i __index, void const *__addr, + int __scale) +{ + return (__m128i) __builtin_ia32_gather3div2di ((__v2di) __v1_old, + __addr, + (__v2di) __index, + __mask, __scale); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_i32scatter_ps (void *__addr, __m256i __index, + __m256 __v1, const int __scale) +{ + __builtin_ia32_scattersiv8sf (__addr, (__mmask8) 0xFF, + (__v8si) __index, (__v8sf) __v1, + __scale); +} +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_i32scatter_ps (void *__addr, 
__mmask8 __mask,
+    __m256i __index, __m256 __v1, const int __scale)
+{
+  __builtin_ia32_scattersiv8sf (__addr, __mask, (__v8si) __index, (__v8sf) __v1, __scale);
+}
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i32scatter_ps (void *__addr, __m128i __index, __m128 __v1, const int __scale)
+{
+  __builtin_ia32_scattersiv4sf (__addr, (__mmask8) 0xFF, (__v4si) __index, (__v4sf) __v1, __scale);
+}
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i32scatter_ps (void *__addr, __mmask8 __mask, __m128i __index, __m128 __v1, const int __scale)
+{
+  __builtin_ia32_scattersiv4sf (__addr, __mask, (__v4si) __index, (__v4sf) __v1, __scale);
+}
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i32scatter_pd (void *__addr, __m128i __index, __m256d __v1, const int __scale)
+{
+  __builtin_ia32_scattersiv4df (__addr, (__mmask8) 0xFF, (__v4si) __index, (__v4df) __v1, __scale);
+}
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i32scatter_pd (void *__addr, __mmask8 __mask, __m128i __index, __m256d __v1, const int __scale)
+{
+  __builtin_ia32_scattersiv4df (__addr, __mask, (__v4si) __index, (__v4df) __v1, __scale);
+}
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i32scatter_pd (void *__addr, __m128i __index, __m128d __v1, const int __scale)
+{
+  __builtin_ia32_scattersiv2df (__addr, (__mmask8) 0xFF, (__v4si) __index, (__v2df) __v1, __scale);
+}
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i32scatter_pd (void *__addr, __mmask8 __mask, __m128i __index, __m128d __v1, const int __scale)
+{
+  __builtin_ia32_scattersiv2df (__addr, __mask, (__v4si) __index, (__v2df) __v1, __scale);
+}
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i64scatter_ps (void *__addr, __m256i __index, __m128 __v1, const int __scale)
+{
+  __builtin_ia32_scatterdiv8sf (__addr, (__mmask8) 0xFF, (__v4di) __index, (__v4sf) __v1, __scale);
+}
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i64scatter_ps (void *__addr, __mmask8 __mask, __m256i __index, __m128 __v1, const int __scale)
+{
+  __builtin_ia32_scatterdiv8sf (__addr, __mask, (__v4di) __index, (__v4sf) __v1, __scale);
+}
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i64scatter_ps (void *__addr, __m128i __index, __m128 __v1, const int __scale)
+{
+  __builtin_ia32_scatterdiv4sf (__addr, (__mmask8) 0xFF, (__v2di) __index, (__v4sf) __v1, __scale);
+}
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i64scatter_ps (void *__addr, __mmask8 __mask, __m128i __index, __m128 __v1, const int __scale)
+{
+  __builtin_ia32_scatterdiv4sf (__addr, __mask, (__v2di) __index, (__v4sf) __v1, __scale);
+}
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i64scatter_pd (void *__addr, __m256i __index, __m256d __v1, const int __scale)
+{
+  __builtin_ia32_scatterdiv4df (__addr, (__mmask8) 0xFF, (__v4di) __index, (__v4df) __v1, __scale);
+}
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i64scatter_pd (void *__addr, __mmask8 __mask, __m256i __index, __m256d __v1, const int __scale)
+{
+  __builtin_ia32_scatterdiv4df (__addr, __mask, (__v4di) __index, (__v4df) __v1, __scale);
+}
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i64scatter_pd (void *__addr, __m128i __index, __m128d __v1, const int __scale)
+{
+  __builtin_ia32_scatterdiv2df (__addr, (__mmask8) 0xFF, (__v2di) __index, (__v2df) __v1, __scale);
+}
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i64scatter_pd (void *__addr, __mmask8 __mask, __m128i __index, __m128d __v1, const int __scale)
+{
+  __builtin_ia32_scatterdiv2df (__addr, __mask, (__v2di) __index, (__v2df) __v1, __scale);
+}
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i32scatter_epi32 (void *__addr, __m256i __index, __m256i __v1, const int __scale)
+{
+  __builtin_ia32_scattersiv8si (__addr, (__mmask8) 0xFF, (__v8si) __index, (__v8si) __v1, __scale);
+}
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i32scatter_epi32 (void *__addr, __mmask8 __mask, __m256i __index, __m256i __v1, const int __scale)
+{
+  __builtin_ia32_scattersiv8si (__addr, __mask, (__v8si) __index, (__v8si) __v1, __scale);
+}
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i32scatter_epi32 (void *__addr, __m128i __index, __m128i __v1, const int __scale)
+{
+  __builtin_ia32_scattersiv4si (__addr, (__mmask8) 0xFF, (__v4si) __index, (__v4si) __v1, __scale);
+}
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i32scatter_epi32 (void *__addr, __mmask8 __mask, __m128i __index, __m128i __v1, const int __scale)
+{
+  __builtin_ia32_scattersiv4si (__addr, __mask, (__v4si) __index, (__v4si) __v1, __scale);
+}
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i32scatter_epi64 (void *__addr, __m128i __index, __m256i __v1, const int __scale)
+{
+  __builtin_ia32_scattersiv4di (__addr, (__mmask8) 0xFF, (__v4si) __index, (__v4di) __v1, __scale);
+}
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i32scatter_epi64 (void *__addr, __mmask8 __mask, __m128i __index, __m256i __v1, const int __scale)
+{
+  __builtin_ia32_scattersiv4di (__addr, __mask, (__v4si) __index, (__v4di) __v1, __scale);
+}
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i32scatter_epi64 (void *__addr, __m128i __index, __m128i __v1, const int __scale)
+{
+  __builtin_ia32_scattersiv2di (__addr, (__mmask8) 0xFF, (__v4si) __index, (__v2di) __v1, __scale);
+}
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i32scatter_epi64 (void *__addr, __mmask8 __mask, __m128i __index, __m128i __v1, const int __scale)
+{
+  __builtin_ia32_scattersiv2di (__addr, __mask, (__v4si) __index, (__v2di) __v1, __scale);
+}
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i64scatter_epi32 (void *__addr, __m256i __index, __m128i __v1, const int __scale)
+{
+  __builtin_ia32_scatterdiv8si (__addr, (__mmask8) 0xFF, (__v4di) __index, (__v4si) __v1, __scale);
+}
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i64scatter_epi32 (void *__addr, __mmask8 __mask, __m256i __index, __m128i __v1, const int __scale)
+{
+  __builtin_ia32_scatterdiv8si (__addr, __mask, (__v4di) __index, (__v4si) __v1, __scale);
+}
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i64scatter_epi32 (void *__addr, __m128i __index, __m128i __v1, const int __scale)
+{
+  __builtin_ia32_scatterdiv4si (__addr, (__mmask8) 0xFF, (__v2di) __index, (__v4si) __v1, __scale);
+}
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i64scatter_epi32 (void *__addr, __mmask8 __mask, __m128i __index, __m128i __v1, const int __scale)
+{
+  __builtin_ia32_scatterdiv4si (__addr, __mask, (__v2di) __index, (__v4si) __v1, __scale);
+}
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i64scatter_epi64 (void *__addr, __m256i __index, __m256i __v1, const int __scale)
+{
+  __builtin_ia32_scatterdiv4di (__addr, (__mmask8) 0xFF, (__v4di) __index, (__v4di) __v1, __scale);
+}
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i64scatter_epi64 (void *__addr, __mmask8 __mask, __m256i __index, __m256i __v1, const int __scale)
+{
+  __builtin_ia32_scatterdiv4di (__addr, __mask, (__v4di) __index, (__v4di) __v1, __scale);
+}
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i64scatter_epi64 (void *__addr, __m128i __index, __m128i __v1, const int __scale)
+{
+  __builtin_ia32_scatterdiv2di (__addr, (__mmask8) 0xFF, (__v2di) __index, (__v2di) __v1, __scale);
+}
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i64scatter_epi64 (void *__addr, __mmask8 __mask, __m128i __index, __m128i __v1, const int __scale)
+{
+  __builtin_ia32_scatterdiv2di (__addr, __mask, (__v2di) __index, (__v2di) __v1, __scale);
+}
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_shuffle_epi32 (__m256i __W, __mmask8 __U, __m256i __A, _MM_PERM_ENUM __mask)
+{
+  return (__m256i) __builtin_ia32_pshufd256_mask ((__v8si) __A, __mask, (__v8si) __W, (__mmask8) __U);
+}
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_shuffle_epi32 (__mmask8 __U, __m256i __A, _MM_PERM_ENUM __mask)
+{
+  return (__m256i) __builtin_ia32_pshufd256_mask ((__v8si) __A, __mask, (__v8si) _mm256_setzero_si256 (), (__mmask8) __U);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_shuffle_epi32 (__m128i __W, __mmask8 __U, __m128i __A, _MM_PERM_ENUM __mask)
+{
+  return (__m128i) __builtin_ia32_pshufd128_mask ((__v4si) __A, __mask, (__v4si) __W, (__mmask8) __U);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_shuffle_epi32 (__mmask8 __U, __m128i __A, _MM_PERM_ENUM __mask)
+{
+  return (__m128i) __builtin_ia32_pshufd128_mask ((__v4si) __A, __mask, (__v4si) _mm_setzero_si128 (), (__mmask8) __U);
+}
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_rol_epi32 (__m256i __A, const int __B)
+{
+  return (__m256i) __builtin_ia32_prold256_mask ((__v8si) __A, __B, (__v8si) _mm256_setzero_si256 (), (__mmask8) -1);
+}
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_rol_epi32 (__m256i __W, __mmask8 __U, __m256i __A, const int __B)
+{
+  return (__m256i) __builtin_ia32_prold256_mask ((__v8si) __A, __B, (__v8si) __W, (__mmask8) __U);
+}
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_rol_epi32 (__mmask8 __U, __m256i __A, const int __B)
+{
+  return (__m256i) __builtin_ia32_prold256_mask ((__v8si) __A, __B, (__v8si) _mm256_setzero_si256 (), (__mmask8) __U);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rol_epi32 (__m128i __A, const int __B)
+{
+  return (__m128i) __builtin_ia32_prold128_mask ((__v4si) __A, __B, (__v4si) _mm_setzero_si128 (), (__mmask8) -1);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_rol_epi32 (__m128i __W, __mmask8 __U, __m128i __A, const int __B)
+{
+  return (__m128i) __builtin_ia32_prold128_mask ((__v4si) __A, __B, (__v4si) __W, (__mmask8) __U);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_rol_epi32 (__mmask8 __U, __m128i __A, const int __B)
+{
+  return (__m128i) __builtin_ia32_prold128_mask ((__v4si) __A, __B, (__v4si) _mm_setzero_si128 (), (__mmask8) __U);
+}
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_ror_epi32 (__m256i __A, const int __B)
+{
+  return (__m256i) __builtin_ia32_prord256_mask ((__v8si) __A, __B, (__v8si) _mm256_setzero_si256 (), (__mmask8) -1);
+}
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_ror_epi32 (__m256i __W, __mmask8 __U, __m256i __A, const int __B)
+{
+  return (__m256i) __builtin_ia32_prord256_mask ((__v8si) __A, __B, (__v8si) __W, (__mmask8) __U);
+}
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_ror_epi32 (__mmask8 __U, __m256i __A, const int __B)
+{
+  return (__m256i) __builtin_ia32_prord256_mask ((__v8si) __A, __B, (__v8si) _mm256_setzero_si256 (), (__mmask8) __U);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ror_epi32 (__m128i __A, const int __B)
+{
+  return (__m128i) __builtin_ia32_prord128_mask ((__v4si) __A, __B, (__v4si) _mm_setzero_si128 (), (__mmask8) -1);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_ror_epi32 (__m128i __W, __mmask8 __U, __m128i __A, const int __B)
+{
+  return (__m128i) __builtin_ia32_prord128_mask ((__v4si) __A, __B, (__v4si) __W, (__mmask8) __U);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_ror_epi32 (__mmask8 __U, __m128i __A, const int __B)
+{
+  return (__m128i) __builtin_ia32_prord128_mask ((__v4si) __A, __B, (__v4si) _mm_setzero_si128 (), (__mmask8) __U);
+}
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_rol_epi64 (__m256i __A, const int __B)
+{
+  return (__m256i) __builtin_ia32_prolq256_mask ((__v4di) __A, __B, (__v4di) _mm256_setzero_si256 (), (__mmask8) -1);
+}
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_rol_epi64 (__m256i __W, __mmask8 __U, __m256i __A, const int __B)
+{
+  return (__m256i) __builtin_ia32_prolq256_mask ((__v4di) __A, __B, (__v4di) __W, (__mmask8) __U);
+}
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_rol_epi64 (__mmask8 __U, __m256i __A, const int __B)
+{
+  return (__m256i) __builtin_ia32_prolq256_mask ((__v4di) __A, __B, (__v4di) _mm256_setzero_si256 (), (__mmask8) __U);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rol_epi64 (__m128i __A, const int __B)
+{
+  return (__m128i) __builtin_ia32_prolq128_mask ((__v2di) __A, __B, (__v2di) _mm_setzero_si128 (), (__mmask8) -1);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_rol_epi64 (__m128i __W, __mmask8 __U, __m128i __A, const int __B)
+{
+  return (__m128i) __builtin_ia32_prolq128_mask ((__v2di) __A, __B, (__v2di) __W, (__mmask8) __U);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_rol_epi64 (__mmask8 __U, __m128i __A, const int __B)
+{
+  return (__m128i) __builtin_ia32_prolq128_mask ((__v2di) __A, __B, (__v2di) _mm_setzero_si128 (), (__mmask8) __U);
+}
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_ror_epi64 (__m256i __A, const int __B)
+{
+  return (__m256i) __builtin_ia32_prorq256_mask ((__v4di) __A, __B, (__v4di) _mm256_setzero_si256 (), (__mmask8) -1);
+}
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_ror_epi64 (__m256i __W, __mmask8 __U, __m256i __A, const int __B)
+{
+  return (__m256i) __builtin_ia32_prorq256_mask ((__v4di) __A, __B, (__v4di) __W, (__mmask8) __U);
+}
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_ror_epi64 (__mmask8 __U, __m256i __A, const int __B)
+{
+  return (__m256i) __builtin_ia32_prorq256_mask ((__v4di) __A, __B, (__v4di) _mm256_setzero_si256 (), (__mmask8) __U);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ror_epi64 (__m128i __A, const int __B)
+{
+  return (__m128i) __builtin_ia32_prorq128_mask ((__v2di) __A, __B, (__v2di) _mm_setzero_si128 (), (__mmask8) -1);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_ror_epi64 (__m128i __W, __mmask8 __U, __m128i __A, const int __B)
+{
+  return (__m128i) __builtin_ia32_prorq128_mask ((__v2di) __A, __B, (__v2di) __W, (__mmask8) __U);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_ror_epi64 (__mmask8 __U, __m128i __A, const int __B)
+{
+  return (__m128i) __builtin_ia32_prorq128_mask ((__v2di) __A, __B, (__v2di) _mm_setzero_si128 (), (__mmask8) __U);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_alignr_epi32 (__m128i __A, __m128i __B, const int __imm)
+{
+  return (__m128i) __builtin_ia32_alignd128_mask ((__v4si) __A, (__v4si) __B, __imm, (__v4si) _mm_setzero_si128 (), (__mmask8) -1);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_alignr_epi32 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B, const int __imm)
+{
+  return (__m128i) __builtin_ia32_alignd128_mask ((__v4si) __A, (__v4si) __B, __imm, (__v4si) __W, (__mmask8) __U);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_alignr_epi32 (__mmask8 __U, __m128i __A, __m128i __B, const int __imm)
+{
+  return (__m128i) __builtin_ia32_alignd128_mask ((__v4si) __A, (__v4si) __B, __imm, (__v4si) _mm_setzero_si128 (), (__mmask8) __U);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_alignr_epi64 (__m128i __A, __m128i __B, const int __imm)
+{
+  return (__m128i) __builtin_ia32_alignq128_mask ((__v2di) __A, (__v2di) __B, __imm, (__v2di) _mm_setzero_si128 (), (__mmask8) -1);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_alignr_epi64 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B, const int __imm)
+{
+  return (__m128i) __builtin_ia32_alignq128_mask ((__v2di) __A, (__v2di) __B, __imm, (__v2di) __W, (__mmask8) __U);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_alignr_epi64 (__mmask8 __U, __m128i __A, __m128i __B, const int __imm)
+{
+  return (__m128i) __builtin_ia32_alignq128_mask ((__v2di) __A, (__v2di) __B, __imm, (__v2di) _mm_setzero_si128 (), (__mmask8) __U);
+}
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_alignr_epi32 (__m256i __A, __m256i __B, const int __imm)
+{
+  return (__m256i) __builtin_ia32_alignd256_mask ((__v8si) __A, (__v8si) __B, __imm, (__v8si) _mm256_setzero_si256 (), (__mmask8) -1);
+}
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_alignr_epi32 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B, const int __imm)
+{
+  return (__m256i) __builtin_ia32_alignd256_mask ((__v8si) __A, (__v8si) __B, __imm, (__v8si) __W, (__mmask8) __U);
+}
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_alignr_epi32 (__mmask8 __U, __m256i __A, __m256i __B, const int __imm)
+{
+  return (__m256i) __builtin_ia32_alignd256_mask ((__v8si) __A, (__v8si) __B, __imm, (__v8si) _mm256_setzero_si256 (), (__mmask8) __U);
+}
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_alignr_epi64 (__m256i __A, __m256i __B, const int __imm)
+{
+  return (__m256i) __builtin_ia32_alignq256_mask ((__v4di) __A, (__v4di) __B, __imm, (__v4di) _mm256_setzero_si256 (), (__mmask8) -1);
+}
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_alignr_epi64 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B, const int __imm)
+{
+  return (__m256i) __builtin_ia32_alignq256_mask ((__v4di) __A, (__v4di) __B, __imm, (__v4di) __W, (__mmask8) __U);
+}
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_alignr_epi64 (__mmask8 __U, __m256i __A, __m256i __B, const int __imm)
+{
+  return (__m256i) __builtin_ia32_alignq256_mask ((__v4di) __A, (__v4di) __B, __imm, (__v4di) _mm256_setzero_si256 (), (__mmask8) __U);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtps_ph (__m128i __W, __mmask8 __U, __m128 __A, const int __I)
+{
+  return (__m128i) __builtin_ia32_vcvtps2ph_mask ((__v4sf) __A, __I, (__v8hi) __W, (__mmask8) __U);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtps_ph (__mmask8 __U, __m128 __A, const int __I)
+{
+  return (__m128i) __builtin_ia32_vcvtps2ph_mask ((__v4sf) __A, __I, (__v8hi) _mm_setzero_si128 (), (__mmask8) __U);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtps_ph (__m128i __W, __mmask8 __U, __m256 __A, const int __I)
+{
+  return (__m128i) __builtin_ia32_vcvtps2ph256_mask ((__v8sf) __A, __I, (__v8hi) __W, (__mmask8) __U);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtps_ph (__mmask8 __U, __m256 __A, const int __I)
+{
+  return (__m128i) __builtin_ia32_vcvtps2ph256_mask ((__v8sf) __A, __I, (__v8hi) _mm_setzero_si128 (), (__mmask8) __U);
+}
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_srai_epi32 (__m256i __W, __mmask8 __U, __m256i __A, const int __imm)
+{
+  return (__m256i) __builtin_ia32_psradi256_mask ((__v8si) __A, __imm, (__v8si) __W, (__mmask8) __U);
+}
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_srai_epi32 (__mmask8 __U, __m256i __A, const int __imm)
+{
+  return (__m256i) __builtin_ia32_psradi256_mask ((__v8si) __A, __imm, (__v8si) _mm256_setzero_si256 (), (__mmask8) __U);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_srai_epi32 (__m128i __W, __mmask8 __U, __m128i __A, const int __imm)
+{
+  return (__m128i) __builtin_ia32_psradi128_mask ((__v4si) __A, __imm, (__v4si) __W, (__mmask8) __U);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_srai_epi32 (__mmask8 __U, __m128i __A, const int __imm)
+{
+  return (__m128i) __builtin_ia32_psradi128_mask ((__v4si) __A, __imm, (__v4si) _mm_setzero_si128 (), (__mmask8) __U);
+}
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srai_epi64 (__m256i __A, const int __imm)
+{
+  return (__m256i) __builtin_ia32_psraqi256_mask ((__v4di) __A, __imm, (__v4di) _mm256_setzero_si256 (), (__mmask8) -1);
+}
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_srai_epi64 (__m256i __W, __mmask8 __U, __m256i __A, const int __imm)
+{
+  return (__m256i) __builtin_ia32_psraqi256_mask ((__v4di) __A, __imm, (__v4di) __W, (__mmask8) __U);
+}
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_srai_epi64 (__mmask8 __U, __m256i __A, const int __imm)
+{
+  return (__m256i) __builtin_ia32_psraqi256_mask ((__v4di) __A, __imm, (__v4di) _mm256_setzero_si256 (), (__mmask8) __U);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srai_epi64 (__m128i __A, const int __imm)
+{
+  return (__m128i) __builtin_ia32_psraqi128_mask ((__v2di) __A, __imm, (__v2di) _mm_setzero_si128 (), (__mmask8) -1);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_srai_epi64 (__m128i __W, __mmask8 __U, __m128i __A, const int __imm)
+{
+  return (__m128i) __builtin_ia32_psraqi128_mask ((__v2di) __A, __imm, (__v2di) __W, (__mmask8) __U);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_srai_epi64 (__mmask8 __U, __m128i __A, const int __imm)
+{
+  return (__m128i) __builtin_ia32_psraqi128_mask ((__v2di) __A, __imm, (__v2di) _mm_setzero_si128 (), (__mmask8) __U);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_slli_epi32 (__m128i __W, __mmask8 __U, __m128i __A, int __B)
+{
+  return (__m128i) __builtin_ia32_pslldi128_mask ((__v4si) __A, __B, (__v4si) __W, (__mmask8) __U);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_slli_epi32 (__mmask8 __U, __m128i __A, int __B)
+{
+  return (__m128i) __builtin_ia32_pslldi128_mask ((__v4si) __A, __B, (__v4si) _mm_setzero_si128 (), (__mmask8) __U);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_slli_epi64 (__m128i __W, __mmask8 __U, __m128i __A, int __B)
+{
+  return (__m128i) __builtin_ia32_psllqi128_mask ((__v2di) __A, __B, (__v2di) __W, (__mmask8) __U);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_slli_epi64 (__mmask8 __U, __m128i __A, int __B)
+{
+  return (__m128i) __builtin_ia32_psllqi128_mask ((__v2di) __A, __B, (__v2di) _mm_setzero_si128 (), (__mmask8) __U);
+}
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_slli_epi32 (__m256i __W, __mmask8 __U, __m256i __A, int __B)
+{
+  return (__m256i) __builtin_ia32_pslldi256_mask ((__v8si) __A, __B, (__v8si) __W, (__mmask8) __U);
+}
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_slli_epi32 (__mmask8 __U, __m256i __A, int __B)
+{
+  return (__m256i) __builtin_ia32_pslldi256_mask ((__v8si) __A, __B, (__v8si) _mm256_setzero_si256 (), (__mmask8) __U);
+}
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_slli_epi64 (__m256i __W, __mmask8 __U, __m256i __A, int __B)
+{
+  return (__m256i) __builtin_ia32_psllqi256_mask ((__v4di) __A, __B, (__v4di) __W, (__mmask8) __U);
+}
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_slli_epi64 (__mmask8 __U, __m256i __A, int __B)
+{
+  return (__m256i) __builtin_ia32_psllqi256_mask ((__v4di) __A, __B, (__v4di) _mm256_setzero_si256 (), (__mmask8) __U);
+}
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_permutex_pd (__m256d __W, __mmask8 __U, __m256d __X, const int __imm)
+{
+  return (__m256d) __builtin_ia32_permdf256_mask ((__v4df) __X, __imm, (__v4df) __W, (__mmask8) __U);
+}
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_permutex_pd (__mmask8 __U, __m256d __X, const int __imm)
+{
+  return (__m256d) __builtin_ia32_permdf256_mask ((__v4df) __X, __imm, (__v4df) _mm256_setzero_pd (), (__mmask8) __U);
+}
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_permute_pd (__m256d __W, __mmask8 __U, __m256d __X, const int __C)
+{
+  return (__m256d) __builtin_ia32_vpermilpd256_mask ((__v4df) __X, __C, (__v4df) __W, (__mmask8) __U);
+}
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_permute_pd (__mmask8 __U, __m256d __X, const int __C)
+{
+  return (__m256d) __builtin_ia32_vpermilpd256_mask ((__v4df) __X, __C, (__v4df) _mm256_setzero_pd (), (__mmask8) __U);
+}
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_permute_pd (__m128d __W, __mmask8 __U, __m128d __X, const int __C)
+{
+  return (__m128d) __builtin_ia32_vpermilpd_mask ((__v2df) __X, __C, (__v2df) __W, (__mmask8) __U);
+}
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_permute_pd (__mmask8 __U, __m128d __X, const int __C)
+{
+  return (__m128d) __builtin_ia32_vpermilpd_mask ((__v2df) __X, __C, (__v2df) _mm_setzero_pd (), (__mmask8) __U);
+}
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_permute_ps (__m256 __W, __mmask8 __U, __m256 __X, const int __C)
+{
+  return (__m256) __builtin_ia32_vpermilps256_mask ((__v8sf) __X, __C, (__v8sf) __W, (__mmask8) __U);
+}
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_permute_ps (__mmask8 __U, __m256 __X, const int __C)
+{
+  return (__m256) __builtin_ia32_vpermilps256_mask ((__v8sf) __X, __C, (__v8sf) _mm256_setzero_ps (), (__mmask8) __U);
+}
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_permute_ps (__m128 __W, __mmask8 __U, __m128 __X, const int __C)
+{
+  return (__m128) __builtin_ia32_vpermilps_mask ((__v4sf) __X, __C, (__v4sf) __W, (__mmask8) __U);
+}
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_permute_ps (__mmask8 __U, __m128 __X, const int __C)
+{
+  return (__m128) __builtin_ia32_vpermilps_mask ((__v4sf) __X, __C, (__v4sf) _mm_setzero_ps (), (__mmask8) __U);
+}
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_blend_pd (__mmask8 __U, __m256d __A, __m256d __W)
+{
+  return (__m256d) __builtin_ia32_blendmpd_256_mask ((__v4df) __A, (__v4df) __W, (__mmask8) __U);
+}
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_blend_ps (__mmask8 __U, __m256 __A, __m256 __W)
+{
+  return (__m256) __builtin_ia32_blendmps_256_mask ((__v8sf) __A, (__v8sf) __W, (__mmask8) __U);
+}
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_blend_epi64 (__mmask8 __U, __m256i __A, __m256i __W)
+{
+  return (__m256i) __builtin_ia32_blendmq_256_mask ((__v4di) __A, (__v4di) __W, (__mmask8) __U);
+}
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_blend_epi32 (__mmask8 __U, __m256i __A, __m256i __W)
+{
+  return (__m256i) __builtin_ia32_blendmd_256_mask ((__v8si) __A, (__v8si) __W, (__mmask8) __U);
+}
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_blend_pd (__mmask8 __U, __m128d __A, __m128d __W)
+{
+  return (__m128d) __builtin_ia32_blendmpd_128_mask ((__v2df) __A, (__v2df) __W, (__mmask8) __U);
+}
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_blend_ps (__mmask8 __U, __m128 __A, __m128 __W)
+{
+  return (__m128) __builtin_ia32_blendmps_128_mask ((__v4sf) __A, (__v4sf) __W, (__mmask8) __U);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_blend_epi64 (__mmask8 __U, __m128i __A, __m128i __W)
+{
+  return (__m128i) __builtin_ia32_blendmq_128_mask ((__v2di) __A, (__v2di) __W, (__mmask8) __U);
+}
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_blend_epi32 (__mmask8 __U, __m128i __A, __m128i __W)
+{
+  return (__m128i) __builtin_ia32_blendmd_128_mask ((__v4si) __A, (__v4si) __W, (__mmask8) __U);
+}
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmp_epi64_mask (__m256i __X, __m256i __Y, const int __P)
+{
+  return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X, (__v4di) __Y, __P, (__mmask8) -1);
+}
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmp_epi32_mask (__m256i __X, __m256i __Y, const int __P)
+{
+  return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X, (__v8si) __Y, __P, (__mmask8) -1);
+}
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmp_epu64_mask (__m256i __X, __m256i __Y, const int __P)
+{
+  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X, (__v4di) __Y, __P, (__mmask8) -1);
+}
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmp_epu32_mask (__m256i __X, __m256i __Y, const int __P)
+{
+  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X, (__v8si) __Y, __P, (__mmask8) -1);
+}
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmp_pd_mask (__m256d __X, __m256d __Y, const int __P)
+{
+  return (__mmask8) __builtin_ia32_cmppd256_mask ((__v4df) __X, (__v4df) __Y, __P, (__mmask8) -1);
+}
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmp_ps_mask (__m256 __X, __m256 __Y, const int __P)
+{
+  return (__mmask8) __builtin_ia32_cmpps256_mask ((__v8sf) __X, (__v8sf) __Y, __P, (__mmask8) -1);
+}
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmp_epi64_mask (__mmask8 __U, __m256i __X, __m256i __Y, const int __P)
+{
+  return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X, (__v4di) __Y, __P, (__mmask8) __U);
+}
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmp_epi32_mask (__mmask8 __U, __m256i __X, __m256i __Y, const int __P)
+{
+  return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X, (__v8si) __Y, __P, (__mmask8) __U);
+}
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmp_epu64_mask (__mmask8 __U, __m256i __X, __m256i __Y, const int __P)
+{
+  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X, (__v4di) __Y, __P, (__mmask8) __U);
+}
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmp_epu32_mask (__mmask8 __U, __m256i __X, __m256i __Y, const int __P)
+{
+  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X, (__v8si) __Y, __P, (__mmask8) __U);
+}
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmp_pd_mask (__mmask8 __U, __m256d __X, __m256d __Y, const int __P)
+{
+  return (__mmask8) __builtin_ia32_cmppd256_mask ((__v4df) __X, (__v4df) __Y, __P, (__mmask8) __U);
+}
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmp_ps_mask (__mmask8 __U, __m256 __X, __m256 __Y, const int __P)
+{
+  return (__mmask8) __builtin_ia32_cmpps256_mask ((__v8sf) __X, (__v8sf) __Y, __P, (__mmask8) __U);
+}
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_epi64_mask (__m128i __X, __m128i __Y, const int __P)
+{
+  return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X, (__v2di) __Y, __P, (__mmask8) -1);
+}
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_epi32_mask (__m128i __X, __m128i __Y, const int __P)
+{
+  return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X, (__v4si) __Y, __P, (__mmask8) -1);
+}
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_epu64_mask (__m128i __X, __m128i __Y, const int __P)
+{
+  return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X, (__v2di) __Y, __P, (__mmask8) -1);
+}
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_epu32_mask (__m128i __X, __m128i __Y, const int __P)
+{
+  return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X, (__v4si) __Y, __P, (__mmask8) -1);
+}
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_pd_mask (__m128d __X, __m128d __Y, const int __P)
+{
+  return (__mmask8) __builtin_ia32_cmppd128_mask ((__v2df) __X, (__v2df) __Y, __P, (__mmask8) -1);
+}
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_ps_mask (__m128 __X, __m128 __Y, const int __P)
+{
+  return (__mmask8) __builtin_ia32_cmpps128_mask ((__v4sf) __X, (__v4sf) __Y, __P, (__mmask8) -1);
+}
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmp_epi64_mask (__mmask8 __U, __m128i __X, __m128i __Y, const int __P)
+{
+  return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X, (__v2di) __Y, __P, (__mmask8) __U);
+}
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmp_epi32_mask (__mmask8 __U, __m128i __X, __m128i __Y, const int __P)
+{
+  return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X, (__v4si) __Y, __P, (__mmask8) __U);
+}
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmp_epu64_mask (__mmask8 __U, __m128i __X, __m128i __Y, const int __P)
+{
+  return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X, (__v2di) __Y, __P, (__mmask8) __U);
+}
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmp_epu32_mask (__mmask8 __U, __m128i __X, __m128i __Y, const int __P)
+{
+  return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X, (__v4si) __Y, __P, (__mmask8) __U);
+}
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmp_pd_mask (__mmask8 __U, __m128d __X, __m128d __Y, const int __P)
+{
+  return (__mmask8) __builtin_ia32_cmppd128_mask ((__v2df) __X, (__v2df) __Y, __P, (__mmask8) __U);
+}
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmp_ps_mask (__mmask8 __U, __m128 __X, __m128 __Y, const int __P)
+{
+  return (__mmask8) __builtin_ia32_cmpps128_mask ((__v4sf) __X, (__v4sf) __Y, __P, (__mmask8) __U);
+}
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permutex_pd (__m256d __X, const int __M)
+{
+  return (__m256d) __builtin_ia32_permdf256_mask ((__v4df) __X, __M, (__v4df) _mm256_undefined_pd (), (__mmask8) -1);
+}
 #else
-#define _mm256_permutex_pd(X, M) \
-  ((__m256d)__builtin_ia32_permdf256_mask((__v4df)(__m256d)(X), (int)(M), \
-      (__v4df)(__m256d)_mm256_undefined_pd(), (__mmask8)-1))
-
-#define _mm256_permutex_epi64(X, I) \
-  ((__m256i)__builtin_ia32_permdi256_mask((__v4di)(__m256i)(X), (int)(I), \
-      (__v4di)(__m256i)(_mm256_setzero_si256()), (__mmask8)-1))
-
-#define _mm256_maskz_permutex_epi64(M, X, I) \
-  ((__m256i)__builtin_ia32_permdi256_mask((__v4di)(__m256i)(X), (int)(I), \
-      (__v4di)(__m256i)(_mm256_setzero_si256()), (__mmask8)(M)))
-
-#define _mm256_mask_permutex_epi64(W, M, X, I) \
-  ((__m256i)__builtin_ia32_permdi256_mask((__v4di)(__m256i)(X), (int)(I), \
-      (__v4di)(__m256i)(W), (__mmask8)(M)))
-
-#define _mm256_insertf32x4(X, Y, C) \
-  ((__m256)__builtin_ia32_insertf32x4_256_mask((__v8sf)(__m256)(X), \
-      (__v4sf)(__m128)(Y), (int)(C), (__v8sf)(__m256)_mm256_setzero_ps(), (__mmask8)-1))
-
-#define _mm256_mask_insertf32x4(W, U, X, Y, C) \
-  ((__m256)__builtin_ia32_insertf32x4_256_mask((__v8sf)(__m256)(X), \
-      (__v4sf)(__m128)(Y), (int)(C), (__v8sf)(__m256)(W), (__mmask8)(U)))
-
-#define _mm256_maskz_insertf32x4(U, X, Y, C) \
-  ((__m256)__builtin_ia32_insertf32x4_256_mask((__v8sf)(__m256)(X), \
-      (__v4sf)(__m128)(Y), (int)(C), (__v8sf)(__m256)_mm256_setzero_ps(), (__mmask8)(U)))
-
-#define _mm256_inserti32x4(X, Y, C) \
-  ((__m256i)__builtin_ia32_inserti32x4_256_mask((__v8si)(__m256i)(X), \
-      (__v4si)(__m128i)(Y), (int)(C), (__v8si)(__m256i)_mm256_setzero_si256(), (__mmask8)-1))
-
-#define _mm256_mask_inserti32x4(W, U, X, Y, C) \
-  ((__m256i)__builtin_ia32_inserti32x4_256_mask((__v8si)(__m256i)(X), \
-      (__v4si)(__m128i)(Y), (int)(C), (__v8si)(__m256i)(W), (__mmask8)(U)))
-
-#define _mm256_maskz_inserti32x4(U, X, Y, C) \
-  ((__m256i)__builtin_ia32_inserti32x4_256_mask((__v8si)(__m256i)(X), \
-      (__v4si)(__m128i)(Y), (int)(C), (__v8si)(__m256i)_mm256_setzero_si256(), (__mmask8)(U)))
-
-#define _mm256_extractf32x4_ps(X, C) \
-  ((__m128)__builtin_ia32_extractf32x4_256_mask((__v8sf)(__m256)(X), (int)(C), \
-      (__v4sf)(__m128)_mm_setzero_ps(), (__mmask8)-1))
-
-#define _mm256_mask_extractf32x4_ps(W, U, X, C) \
-  ((__m128)__builtin_ia32_extractf32x4_256_mask((__v8sf)(__m256)(X), (int)(C), \
-      (__v4sf)(__m128)(W), (__mmask8)(U)))
-
-#define _mm256_maskz_extractf32x4_ps(U, X, C) \
-  ((__m128)__builtin_ia32_extractf32x4_256_mask((__v8sf)(__m256)(X), (int)(C), \
-      (__v4sf)(__m128)_mm_setzero_ps(), (__mmask8)(U)))
-
-#define _mm256_extracti32x4_epi32(X, C) \
-  ((__m128i)__builtin_ia32_extracti32x4_256_mask((__v8si)(__m256i)(X), (int)(C), \
-      (__v4si)(__m128i)_mm_setzero_si128(), (__mmask8)-1))
-
-#define _mm256_mask_extracti32x4_epi32(W, U, X, C) \
-  ((__m128i)__builtin_ia32_extracti32x4_256_mask((__v8si)(__m256i)(X), (int)(C), \
-      (__v4si)(__m128i)(W), (__mmask8)(U)))
-
-#define _mm256_maskz_extracti32x4_epi32(U, X, C) \
-  ((__m128i)__builtin_ia32_extracti32x4_256_mask((__v8si)(__m256i)(X), (int)(C), \
-      (__v4si)(__m128i)_mm_setzero_si128(), (__mmask8)(U)))
-
-#define _mm256_shuffle_i64x2(X, Y, C) \
-  ((__m256i)__builtin_ia32_shuf_i64x2_256_mask((__v4di)(__m256i)(X), \
-      (__v4di)(__m256i)(Y), (int)(C), (__v4di)(__m256i)_mm256_setzero_si256(), (__mmask8)-1))
-
-#define _mm256_mask_shuffle_i64x2(W, U, X, Y, C) \
-  ((__m256i)__builtin_ia32_shuf_i64x2_256_mask((__v4di)(__m256i)(X), \
-      (__v4di)(__m256i)(Y), (int)(C), (__v4di)(__m256i)(W), (__mmask8)(U)))
-
-#define _mm256_maskz_shuffle_i64x2(U, X, Y, C) \
-  ((__m256i)__builtin_ia32_shuf_i64x2_256_mask((__v4di)(__m256i)(X), \
-      (__v4di)(__m256i)(Y), (int)(C), (__v4di)(__m256i)_mm256_setzero_si256(), (__mmask8)(U)))
-
-#define _mm256_shuffle_i32x4(X, Y, C) \
-  ((__m256i)__builtin_ia32_shuf_i32x4_256_mask((__v8si)(__m256i)(X), \
-      (__v8si)(__m256i)(Y), (int)(C), (__v8si)(__m256i)_mm256_setzero_si256(), (__mmask8)-1))
-
-#define _mm256_mask_shuffle_i32x4(W, U, X, Y, C) \
-  ((__m256i)__builtin_ia32_shuf_i32x4_256_mask((__v8si)(__m256i)(X), \
-      (__v8si)(__m256i)(Y), (int)(C), (__v8si)(__m256i)(W), (__mmask8)(U)))
-
-#define _mm256_maskz_shuffle_i32x4(U, X, Y, C) \
-  ((__m256i)__builtin_ia32_shuf_i32x4_256_mask((__v8si)(__m256i)(X), \
-      (__v8si)(__m256i)(Y), (int)(C), (__v8si)(__m256i)_mm256_setzero_si256(), (__mmask8)(U)))
-
-#define _mm256_shuffle_f64x2(X, Y, C) \
-  ((__m256d)__builtin_ia32_shuf_f64x2_256_mask((__v4df)(__m256d)(X), \
-      (__v4df)(__m256d)(Y), (int)(C), (__v4df)(__m256d)_mm256_setzero_pd(), (__mmask8)-1))
-
-#define _mm256_mask_shuffle_f64x2(W, U, X, Y, C) \
-  ((__m256d)__builtin_ia32_shuf_f64x2_256_mask((__v4df)(__m256d)(X), \
-      (__v4df)(__m256d)(Y), (int)(C), (__v4df)(__m256d)(W), (__mmask8)(U)))
-
-#define _mm256_maskz_shuffle_f64x2(U, X, Y, C) \
-  ((__m256d)__builtin_ia32_shuf_f64x2_256_mask((__v4df)(__m256d)(X), \
-      (__v4df)(__m256d)(Y), (int)(C), (__v4df)(__m256d)_mm256_setzero_pd(), (__mmask8)(U)))
-
-#define _mm256_shuffle_f32x4(X, Y, C) \
-  ((__m256)__builtin_ia32_shuf_f32x4_256_mask((__v8sf)(__m256)(X), \
-      (__v8sf)(__m256)(Y), (int)(C), (__v8sf)(__m256)_mm256_setzero_ps(), (__mmask8)-1))
-
-#define _mm256_mask_shuffle_f32x4(W, U, X, Y, C) \
-  ((__m256)__builtin_ia32_shuf_f32x4_256_mask((__v8sf)(__m256)(X), \
-      (__v8sf)(__m256)(Y), (int)(C), (__v8sf)(__m256)(W), (__mmask8)(U)))
-
-#define _mm256_maskz_shuffle_f32x4(U, X, Y, C) \
-  ((__m256)__builtin_ia32_shuf_f32x4_256_mask((__v8sf)(__m256)(X), \
-      (__v8sf)(__m256)(Y), (int)(C), (__v8sf)(__m256)_mm256_setzero_ps(), (__mmask8)(U)))
-
-#define _mm256_mask_shuffle_pd(W, U, A, B, C) \
-  ((__m256d)__builtin_ia32_shufpd256_mask((__v4df)(__m256d)(A), \
-      (__v4df)(__m256d)(B), (int)(C), (__v4df)(__m256d)(W), (__mmask8)(U)))
-
-#define _mm256_maskz_shuffle_pd(U, A, B, C) \
-  ((__m256d)__builtin_ia32_shufpd256_mask((__v4df)(__m256d)(A), \
-      (__v4df)(__m256d)(B), (int)(C), (__v4df)(__m256d)_mm256_setzero_pd(), (__mmask8)(U)))
-
-#define _mm_mask_shuffle_pd(W, U, A, B, C) \
-  ((__m128d)__builtin_ia32_shufpd128_mask((__v2df)(__m128d)(A), \
-      (__v2df)(__m128d)(B), (int)(C), (__v2df)(__m128d)(W), (__mmask8)(U)))
-
-#define _mm_maskz_shuffle_pd(U, A, B, C) \
-  ((__m128d)__builtin_ia32_shufpd128_mask((__v2df)(__m128d)(A), \
-      (__v2df)(__m128d)(B), (int)(C), (__v2df)(__m128d)_mm_setzero_pd(), (__mmask8)(U)))
-
-#define _mm256_mask_shuffle_ps(W, U, A, B, C) \
-  ((__m256)__builtin_ia32_shufps256_mask((__v8sf)(__m256)(A), \
-      (__v8sf)(__m256)(B), (int)(C), (__v8sf)(__m256)(W), (__mmask8)(U)))
-
-#define _mm256_maskz_shuffle_ps(U, A, B, C) \
-  ((__m256)__builtin_ia32_shufps256_mask((__v8sf)(__m256)(A), \
-      (__v8sf)(__m256)(B), (int)(C), (__v8sf)(__m256)_mm256_setzero_ps(), (__mmask8)(U)))
-
-#define _mm_mask_shuffle_ps(W, U, A, B, C) \
-  ((__m128)__builtin_ia32_shufps128_mask((__v4sf)(__m128)(A), \
-      (__v4sf)(__m128)(B), (int)(C), (__v4sf)(__m128)(W), (__mmask8)(U)))
-
-#define _mm_maskz_shuffle_ps(U, A, B, C) \
-  ((__m128)__builtin_ia32_shufps128_mask((__v4sf)(__m128)(A), \
-      (__v4sf)(__m128)(B), (int)(C), (__v4sf)(__m128)_mm_setzero_ps(), (__mmask8)(U)))
-
-#define _mm256_fixupimm_pd(X, Y, Z, C) \
-  ((__m256d)__builtin_ia32_fixupimmpd256_mask((__v4df)(__m256d)(X), \
-      (__v4df)(__m256d)(Y), (__v4di)(__m256i)(Z), (int)(C), (__mmask8)(-1)))
-
-#define _mm256_mask_fixupimm_pd(X, U, Y, Z, C) \
-  ((__m256d)__builtin_ia32_fixupimmpd256_mask((__v4df)(__m256d)(X), \
-      (__v4df)(__m256d)(Y), (__v4di)(__m256i)(Z), (int)(C), (__mmask8)(U)))
-
-#define _mm256_maskz_fixupimm_pd(U, X, Y, Z, C) \
-  ((__m256d)__builtin_ia32_fixupimmpd256_maskz((__v4df)(__m256d)(X), \
-      (__v4df)(__m256d)(Y), (__v4di)(__m256i)(Z), (int)(C), (__mmask8)(U)))
-
-#define _mm256_fixupimm_ps(X, Y, Z, C) \
-  ((__m256)__builtin_ia32_fixupimmps256_mask((__v8sf)(__m256)(X), \
-      (__v8sf)(__m256)(Y), (__v8si)(__m256i)(Z), (int)(C), (__mmask8)(-1)))
-
-#define _mm256_mask_fixupimm_ps(X, U, Y, Z, C) \
-  ((__m256)__builtin_ia32_fixupimmps256_mask((__v8sf)(__m256)(X), \
-      (__v8sf)(__m256)(Y), (__v8si)(__m256i)(Z), (int)(C), (__mmask8)(U)))
-
-#define _mm256_maskz_fixupimm_ps(U, X, Y, Z, C) \
-  ((__m256)__builtin_ia32_fixupimmps256_maskz((__v8sf)(__m256)(X), \
-      (__v8sf)(__m256)(Y), (__v8si)(__m256i)(Z), (int)(C), (__mmask8)(U)))
-
-#define _mm_fixupimm_pd(X, Y, Z, C) \
-  ((__m128d)__builtin_ia32_fixupimmpd128_mask((__v2df)(__m128d)(X), \
-      (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C), (__mmask8)(-1)))
-
-#define _mm_mask_fixupimm_pd(X, U, Y, Z, C) \
-  ((__m128d)__builtin_ia32_fixupimmpd128_mask((__v2df)(__m128d)(X), \
-      (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C), (__mmask8)(U)))
-
-#define _mm_maskz_fixupimm_pd(U, X, Y, Z, C) \
-  ((__m128d)__builtin_ia32_fixupimmpd128_maskz((__v2df)(__m128d)(X), \
-      (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C), (__mmask8)(U)))
-
-#define _mm_fixupimm_ps(X, Y, Z, C) \
-  ((__m128)__builtin_ia32_fixupimmps128_mask((__v4sf)(__m128)(X), \
-      (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C), (__mmask8)(-1)))
-
-#define _mm_mask_fixupimm_ps(X, U, Y, Z, C) \
-  ((__m128)__builtin_ia32_fixupimmps128_mask((__v4sf)(__m128)(X), \
-      (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C), (__mmask8)(U)))
-
-#define _mm_maskz_fixupimm_ps(U, X, Y, Z, C) \
-  ((__m128)__builtin_ia32_fixupimmps128_maskz((__v4sf)(__m128)(X), \
-      (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C), (__mmask8)(U)))
-
-#define _mm256_mask_srli_epi32(W, U, A, B) \
-  ((__m256i)__builtin_ia32_psrldi256_mask((__v8si)(__m256i)(A), (int)(B), \
-      (__v8si)(__m256i)(W), (__mmask8)(U)))
-
-#define _mm256_maskz_srli_epi32(U, A, B) \
-  ((__m256i)__builtin_ia32_psrldi256_mask((__v8si)(__m256i)(A), (int)(B), \
-      (__v8si)_mm256_setzero_si256(), (__mmask8)(U)))
-
-#define _mm_mask_srli_epi32(W, U, A, B) \
-  ((__m128i)__builtin_ia32_psrldi128_mask((__v4si)(__m128i)(A), (int)(B), \
-      (__v4si)(__m128i)(W), (__mmask8)(U)))
-
-#define _mm_maskz_srli_epi32(U, A, B) \
-  ((__m128i)__builtin_ia32_psrldi128_mask((__v4si)(__m128i)(A), (int)(B), \
-      (__v4si)_mm_setzero_si128(), (__mmask8)(U)))
-
-#define _mm256_mask_srli_epi64(W, U, A, B) \
-  ((__m256i)__builtin_ia32_psrlqi256_mask((__v4di)(__m256i)(A), (int)(B), \
-      (__v4di)(__m256i)(W), (__mmask8)(U)))
-
-#define _mm256_maskz_srli_epi64(U, A, B) \
-  ((__m256i)__builtin_ia32_psrlqi256_mask((__v4di)(__m256i)(A), (int)(B), \
-      (__v4di)_mm256_setzero_si256(), (__mmask8)(U)))
-
-#define _mm_mask_srli_epi64(W, U, A, B) \
-  ((__m128i)__builtin_ia32_psrlqi128_mask((__v2di)(__m128i)(A), (int)(B), \
-      (__v2di)(__m128i)(W), (__mmask8)(U)))
-
-#define _mm_maskz_srli_epi64(U, A, B) \
-  ((__m128i)__builtin_ia32_psrlqi128_mask((__v2di)(__m128i)(A), (int)(B), \
-      (__v2di)_mm_setzero_si128(), (__mmask8)(U)))
-
-#define _mm256_mask_slli_epi32(W, U, X, C) \
-  ((__m256i)__builtin_ia32_pslldi256_mask((__v8si)(__m256i)(X), (int)(C), \
-      (__v8si)(__m256i)(W), (__mmask8)(U)))
-
-#define _mm256_maskz_slli_epi32(U, X, C) \
-  ((__m256i)__builtin_ia32_pslldi256_mask((__v8si)(__m256i)(X), (int)(C), \
-      (__v8si)(__m256i)_mm256_setzero_si256(), (__mmask8)(U)))
-
-#define _mm256_mask_slli_epi64(W, U, X, C) \
-  ((__m256i)__builtin_ia32_psllqi256_mask((__v4di)(__m256i)(X), (int)(C), \
-      (__v4di)(__m256i)(W), (__mmask8)(U)))
-
-#define _mm256_maskz_slli_epi64(U, X, C) \
-  ((__m256i)__builtin_ia32_psllqi256_mask((__v4di)(__m256i)(X), (int)(C), \
-      (__v4di)(__m256i)_mm256_setzero_si256(), (__mmask8)(U)))
-
-#define _mm_mask_slli_epi32(W, U, X, C) \
-  ((__m128i)__builtin_ia32_pslldi128_mask((__v4si)(__m128i)(X), (int)(C), \
-      (__v4si)(__m128i)(W), (__mmask8)(U)))
-
-#define _mm_maskz_slli_epi32(U, X, C) \
-  ((__m128i)__builtin_ia32_pslldi128_mask((__v4si)(__m128i)(X), (int)(C), \
-      (__v4si)(__m128i)_mm_setzero_si128(), (__mmask8)(U)))
-
-#define _mm_mask_slli_epi64(W, U, X, C) \
-  ((__m128i)__builtin_ia32_psllqi128_mask((__v2di)(__m128i)(X), (int)(C), \
-      (__v2di)(__m128i)(W), (__mmask8)(U)))
-
-#define _mm_maskz_slli_epi64(U, X, C) \
-  ((__m128i)__builtin_ia32_psllqi128_mask((__v2di)(__m128i)(X), (int)(C), \
-      (__v2di)(__m128i)_mm_setzero_si128(), (__mmask8)(U)))
-
-#define _mm256_ternarylogic_epi64(A, B, C, I) \
-  ((__m256i)__builtin_ia32_pternlogq256_mask((__v4di)(__m256i)(A), \
-      (__v4di)(__m256i)(B), (__v4di)(__m256i)(C), (int)(I), (__mmask8)-1))
-
-#define _mm256_mask_ternarylogic_epi64(A, U, B, C, I) \
-  ((__m256i)__builtin_ia32_pternlogq256_mask((__v4di)(__m256i)(A), \
-      (__v4di)(__m256i)(B), (__v4di)(__m256i)(C), (int)(I), (__mmask8)(U)))
-
-#define _mm256_maskz_ternarylogic_epi64(U, A, B, C, I) \
-  ((__m256i)__builtin_ia32_pternlogq256_maskz((__v4di)(__m256i)(A), \
-      (__v4di)(__m256i)(B), (__v4di)(__m256i)(C), (int)(I), (__mmask8)(U)))
-
-#define _mm256_ternarylogic_epi32(A, B, C, I) \
-  ((__m256i)__builtin_ia32_pternlogd256_mask((__v8si)(__m256i)(A), \
-      (__v8si)(__m256i)(B), (__v8si)(__m256i)(C), (int)(I), (__mmask8)-1))
-
-#define _mm256_mask_ternarylogic_epi32(A, U, B, C, I) \
-  ((__m256i)__builtin_ia32_pternlogd256_mask((__v8si)(__m256i)(A), \
-      (__v8si)(__m256i)(B), (__v8si)(__m256i)(C), (int)(I), (__mmask8)(U)))
-
-#define _mm256_maskz_ternarylogic_epi32(U, A, B, C, I) \
-  ((__m256i)__builtin_ia32_pternlogd256_maskz((__v8si)(__m256i)(A), \
-      (__v8si)(__m256i)(B), (__v8si)(__m256i)(C), (int)(I), (__mmask8)(U)))
-
-#define _mm_ternarylogic_epi64(A, B, C, I) \
-  ((__m128i)__builtin_ia32_pternlogq128_mask((__v2di)(__m128i)(A), \
-      (__v2di)(__m128i)(B), (__v2di)(__m128i)(C), (int)(I), (__mmask8)-1))
-
-#define _mm_mask_ternarylogic_epi64(A, U, B, C, I) \
-  ((__m128i)__builtin_ia32_pternlogq128_mask((__v2di)(__m128i)(A), \
-      (__v2di)(__m128i)(B), (__v2di)(__m128i)(C), (int)(I), (__mmask8)(U)))
-
-#define _mm_maskz_ternarylogic_epi64(U, A, B, C, I) \
-  ((__m128i)__builtin_ia32_pternlogq128_maskz((__v2di)(__m128i)(A), \
-      (__v2di)(__m128i)(B), (__v2di)(__m128i)(C), (int)(I), (__mmask8)(U)))
-
-#define _mm_ternarylogic_epi32(A, B, C, I) \
-  ((__m128i)__builtin_ia32_pternlogd128_mask((__v4si)(__m128i)(A), \
-      (__v4si)(__m128i)(B), (__v4si)(__m128i)(C), (int)(I), (__mmask8)-1))
-
-#define _mm_mask_ternarylogic_epi32(A, U, B, C, I) \
-  ((__m128i)__builtin_ia32_pternlogd128_mask((__v4si)(__m128i)(A), \
-      (__v4si)(__m128i)(B), (__v4si)(__m128i)(C), (int)(I), (__mmask8)(U)))
-
-#define _mm_maskz_ternarylogic_epi32(U, A, B, C, I) \
-  ((__m128i)__builtin_ia32_pternlogd128_maskz((__v4si)(__m128i)(A), \
-      (__v4si)(__m128i)(B), (__v4si)(__m128i)(C), (int)(I), (__mmask8)(U)))
-
-#define _mm256_roundscale_ps(A, B) \
-  ((__m256)__builtin_ia32_rndscaleps_256_mask((__v8sf)(__m256)(A), (int)(B), \
-      (__v8sf)(__m256)_mm256_setzero_ps(), (__mmask8)-1))
-
-#define _mm256_mask_roundscale_ps(W, U, A, B) \
-  ((__m256)__builtin_ia32_rndscaleps_256_mask((__v8sf)(__m256)(A), (int)(B), \
-      (__v8sf)(__m256)(W), (__mmask8)(U)))
-
-#define _mm256_maskz_roundscale_ps(U, A, B) \
-  ((__m256)__builtin_ia32_rndscaleps_256_mask((__v8sf)(__m256)(A), (int)(B), \
-      (__v8sf)(__m256)_mm256_setzero_ps(), (__mmask8)(U)))
-
-#define _mm256_roundscale_pd(A, B) \
-  ((__m256d)__builtin_ia32_rndscalepd_256_mask((__v4df)(__m256d)(A), (int)(B), \
-      (__v4df)(__m256d)_mm256_setzero_pd(), (__mmask8)-1))
-
-#define _mm256_mask_roundscale_pd(W, U, A, B) \
-  ((__m256d)__builtin_ia32_rndscalepd_256_mask((__v4df)(__m256d)(A), (int)(B), \
-      (__v4df)(__m256d)(W), (__mmask8)(U)))
-
-#define _mm256_maskz_roundscale_pd(U, A, B) \
-  ((__m256d)__builtin_ia32_rndscalepd_256_mask((__v4df)(__m256d)(A), (int)(B), \
-      (__v4df)(__m256d)_mm256_setzero_pd(), (__mmask8)(U)))
-
-#define _mm_roundscale_ps(A, B) \
-  ((__m128)__builtin_ia32_rndscaleps_128_mask((__v4sf)(__m128)(A), (int)(B), \
-      (__v4sf)(__m128)_mm_setzero_ps(), (__mmask8)-1))
-
-#define _mm_mask_roundscale_ps(W, U, A, B) \
-  ((__m128)__builtin_ia32_rndscaleps_128_mask((__v4sf)(__m128)(A), (int)(B), \
-      (__v4sf)(__m128)(W), (__mmask8)(U)))
-
-#define _mm_maskz_roundscale_ps(U, A, B) \
-  ((__m128)__builtin_ia32_rndscaleps_128_mask((__v4sf)(__m128)(A), (int)(B), \
-      (__v4sf)(__m128)_mm_setzero_ps(), (__mmask8)(U)))
-
-#define _mm_roundscale_pd(A, B) \
-  ((__m128d)__builtin_ia32_rndscalepd_128_mask((__v2df)(__m128d)(A), (int)(B), \
-      (__v2df)(__m128d)_mm_setzero_pd(), (__mmask8)-1))
-
-#define _mm_mask_roundscale_pd(W, U, A, B) \
-  ((__m128d)__builtin_ia32_rndscalepd_128_mask((__v2df)(__m128d)(A), (int)(B), \
-      (__v2df)(__m128d)(W), (__mmask8)(U)))
-
-#define _mm_maskz_roundscale_pd(U, A, B) \
-  ((__m128d)__builtin_ia32_rndscalepd_128_mask((__v2df)(__m128d)(A), (int)(B), \
-      (__v2df)(__m128d)_mm_setzero_pd(), (__mmask8)(U)))
-
-#define _mm256_getmant_ps(X, B, C) \
-  ((__m256)__builtin_ia32_getmantps256_mask((__v8sf)(__m256)(X), \
-      (int)(((C) << 2) | (B)), (__v8sf)(__m256)_mm256_setzero_ps(), (__mmask8)-1))
-
-#define _mm256_mask_getmant_ps(W, U, X, B, C) \
-  ((__m256)__builtin_ia32_getmantps256_mask((__v8sf)(__m256)(X), \
-      (int)(((C) << 2) | (B)), (__v8sf)(__m256)(W), (__mmask8)(U)))
-
-#define _mm256_maskz_getmant_ps(U, X, B, C) \
-  ((__m256)__builtin_ia32_getmantps256_mask((__v8sf)(__m256)(X), \
-      (int)(((C) << 2) | (B)), (__v8sf)(__m256)_mm256_setzero_ps(), (__mmask8)(U)))
-
-#define _mm_getmant_ps(X, B, C) \
-  ((__m128)__builtin_ia32_getmantps128_mask((__v4sf)(__m128)(X), \
-      (int)(((C) << 2) | (B)), (__v4sf)(__m128)_mm_setzero_ps(), (__mmask8)-1))
-
-#define _mm_mask_getmant_ps(W, U, X, B, C) \
-  ((__m128)__builtin_ia32_getmantps128_mask((__v4sf)(__m128)(X), \
-      (int)(((C) << 2) | (B)), (__v4sf)(__m128)(W), (__mmask8)(U)))
-
-#define _mm_maskz_getmant_ps(U, X, B, C) \
-  ((__m128)__builtin_ia32_getmantps128_mask((__v4sf)(__m128)(X), \
-      (int)(((C) << 2) | (B)), (__v4sf)(__m128)_mm_setzero_ps(), (__mmask8)(U)))
-
-#define _mm256_getmant_pd(X, B, C) \
-  ((__m256d)__builtin_ia32_getmantpd256_mask((__v4df)(__m256d)(X), \
-      (int)(((C) << 2) | (B)), (__v4df)(__m256d)_mm256_setzero_pd(), (__mmask8)-1))
-
-#define _mm256_mask_getmant_pd(W, U, X, B, C) \
-  ((__m256d)__builtin_ia32_getmantpd256_mask((__v4df)(__m256d)(X), \
-      (int)(((C) << 2) | (B)), (__v4df)(__m256d)(W), (__mmask8)(U)))
-
-#define _mm256_maskz_getmant_pd(U, X, B, C) \
-  ((__m256d)__builtin_ia32_getmantpd256_mask((__v4df)(__m256d)(X), \
-      (int)(((C) << 2) | (B)), (__v4df)(__m256d)_mm256_setzero_pd(), (__mmask8)(U)))
-
-#define _mm_getmant_pd(X, B, C) \
-  ((__m128d)__builtin_ia32_getmantpd128_mask((__v2df)(__m128d)(X), \
-      (int)(((C) << 2) | (B)), (__v2df)(__m128d)_mm_setzero_pd(), (__mmask8)-1))
-
-#define _mm_mask_getmant_pd(W, U, X, B, C) \
-  ((__m128d)__builtin_ia32_getmantpd128_mask((__v2df)(__m128d)(X), \
-      (int)(((C) << 2) | (B)), (__v2df)(__m128d)(W), (__mmask8)(U)))
-
-#define _mm_maskz_getmant_pd(U, X, B, C) \
-  ((__m128d)__builtin_ia32_getmantpd128_mask((__v2df)(__m128d)(X), \
-      (int)(((C) << 2) | (B)), (__v2df)(__m128d)_mm_setzero_pd(), (__mmask8)(U)))
-
-#define _mm256_mmask_i32gather_ps(V1OLD, MASK, INDEX, ADDR, SCALE) \
-  (__m256) __builtin_ia32_gather3siv8sf((__v8sf)(__m256)V1OLD, \
-      (void const *)ADDR, (__v8si)(__m256i)INDEX, (__mmask8)MASK, (int)SCALE)
-
-#define _mm_mmask_i32gather_ps(V1OLD, MASK, INDEX, ADDR, SCALE) \
-  (__m128) __builtin_ia32_gather3siv4sf((__v4sf)(__m128)V1OLD, \
-      (void const *)ADDR, (__v4si)(__m128i)INDEX, (__mmask8)MASK, (int)SCALE)
-
-#define _mm256_mmask_i32gather_pd(V1OLD, MASK, INDEX, ADDR, SCALE) \
-  (__m256d) __builtin_ia32_gather3siv4df((__v4df)(__m256d)V1OLD, \
-      (void const *)ADDR, (__v4si)(__m128i)INDEX, (__mmask8)MASK, (int)SCALE)
-
-#define _mm_mmask_i32gather_pd(V1OLD, MASK, INDEX, ADDR, SCALE) \
-  (__m128d) __builtin_ia32_gather3siv2df((__v2df)(__m128d)V1OLD, \
-      (void const *)ADDR, (__v4si)(__m128i)INDEX, (__mmask8)MASK, (int)SCALE)
-
-#define _mm256_mmask_i64gather_ps(V1OLD, MASK, INDEX, ADDR, SCALE) \
-  (__m128) __builtin_ia32_gather3div8sf((__v4sf)(__m128)V1OLD, \
-      (void const *)ADDR, (__v4di)(__m256i)INDEX, (__mmask8)MASK, (int)SCALE)
-
-#define _mm_mmask_i64gather_ps(V1OLD, MASK, INDEX, ADDR, SCALE) \
-  (__m128) __builtin_ia32_gather3div4sf((__v4sf)(__m128)V1OLD, \
-      (void const *)ADDR, (__v2di)(__m128i)INDEX, (__mmask8)MASK, (int)SCALE)
-
-#define _mm256_mmask_i64gather_pd(V1OLD, MASK, INDEX, ADDR, SCALE) \
-  (__m256d) __builtin_ia32_gather3div4df((__v4df)(__m256d)V1OLD, \
-      (void const *)ADDR, (__v4di)(__m256i)INDEX, (__mmask8)MASK, (int)SCALE)
-
-#define _mm_mmask_i64gather_pd(V1OLD, MASK, INDEX, ADDR, SCALE) \
-  (__m128d) __builtin_ia32_gather3div2df((__v2df)(__m128d)V1OLD, \
-      (void const *)ADDR, (__v2di)(__m128i)INDEX, (__mmask8)MASK, (int)SCALE)
-
-#define _mm256_mmask_i32gather_epi32(V1OLD, MASK, INDEX, ADDR, SCALE) \
-  (__m256i) __builtin_ia32_gather3siv8si((__v8si)(__m256i)V1OLD, \
-      (void const *)ADDR, (__v8si)(__m256i)INDEX, (__mmask8)MASK, (int)SCALE)
-
-#define _mm_mmask_i32gather_epi32(V1OLD, MASK, INDEX, ADDR, SCALE) \
-  (__m128i) __builtin_ia32_gather3siv4si((__v4si)(__m128i)V1OLD, \
-      (void const *)ADDR, (__v4si)(__m128i)INDEX, (__mmask8)MASK, (int)SCALE)
-
-#define _mm256_mmask_i32gather_epi64(V1OLD, MASK, INDEX, ADDR, SCALE) \
-  (__m256i) __builtin_ia32_gather3siv4di((__v4di)(__m256i)V1OLD, \
-      (void const *)ADDR, (__v4si)(__m128i)INDEX, (__mmask8)MASK, (int)SCALE)
-
-#define _mm_mmask_i32gather_epi64(V1OLD, MASK, INDEX, ADDR, SCALE) \
-  (__m128i) __builtin_ia32_gather3siv2di((__v2di)(__m128i)V1OLD, \
-      (void const *)ADDR, (__v4si)(__m128i)INDEX, (__mmask8)MASK, (int)SCALE)
-
-#define _mm256_mmask_i64gather_epi32(V1OLD, MASK, INDEX, ADDR, SCALE) \
-  (__m128i) __builtin_ia32_gather3div8si((__v4si)(__m128i)V1OLD, \
-      (void const *)ADDR, (__v4di)(__m256i)INDEX, (__mmask8)MASK, (int)SCALE)
-
-#define _mm_mmask_i64gather_epi32(V1OLD, MASK, INDEX, ADDR, SCALE) \
-  (__m128i) __builtin_ia32_gather3div4si((__v4si)(__m128i)V1OLD, \
-      (void const *)ADDR, (__v2di)(__m128i)INDEX, (__mmask8)MASK, (int)SCALE)
-
-#define _mm256_mmask_i64gather_epi64(V1OLD, MASK, INDEX, ADDR, SCALE) \
-  (__m256i) __builtin_ia32_gather3div4di((__v4di)(__m256i)V1OLD, \
-      (void const *)ADDR, (__v4di)(__m256i)INDEX, (__mmask8)MASK, (int)SCALE)
-
-#define _mm_mmask_i64gather_epi64(V1OLD, MASK, INDEX, ADDR, SCALE) \
-  (__m128i) __builtin_ia32_gather3div2di((__v2di)(__m128i)V1OLD, \
-      (void const *)ADDR, (__v2di)(__m128i)INDEX, (__mmask8)MASK, (int)SCALE)
-
-#define _mm256_i32scatter_ps(ADDR, INDEX, V1, SCALE) \
-  __builtin_ia32_scattersiv8sf((void *)ADDR, (__mmask8)0xFF, \
-      (__v8si)(__m256i)INDEX, (__v8sf)(__m256)V1, (int)SCALE)
-
-#define _mm256_mask_i32scatter_ps(ADDR, MASK, INDEX, V1, SCALE) \
-  __builtin_ia32_scattersiv8sf((void *)ADDR, (__mmask8)MASK, \
-      (__v8si)(__m256i)INDEX, (__v8sf)(__m256)V1, (int)SCALE)
-
-#define _mm_i32scatter_ps(ADDR, INDEX, V1, SCALE) \
-  __builtin_ia32_scattersiv4sf((void *)ADDR, (__mmask8)0xFF, \
-      (__v4si)(__m128i)INDEX, (__v4sf)(__m128)V1, (int)SCALE)
-
-#define _mm_mask_i32scatter_ps(ADDR, MASK, INDEX, V1, SCALE) \
-  __builtin_ia32_scattersiv4sf((void *)ADDR, (__mmask8)MASK, \
-      (__v4si)(__m128i)INDEX, (__v4sf)(__m128)V1, (int)SCALE)
-
-#define _mm256_i32scatter_pd(ADDR, INDEX, V1, SCALE) \
-  __builtin_ia32_scattersiv4df((void *)ADDR, (__mmask8)0xFF, \
-      (__v4si)(__m128i)INDEX, (__v4df)(__m256d)V1, (int)SCALE)
-
-#define _mm256_mask_i32scatter_pd(ADDR, MASK, INDEX, V1, SCALE) \
-  __builtin_ia32_scattersiv4df((void *)ADDR, (__mmask8)MASK, \
-      (__v4si)(__m128i)INDEX, (__v4df)(__m256d)V1, (int)SCALE)
-
-#define _mm_i32scatter_pd(ADDR, INDEX, V1, SCALE) \
-  __builtin_ia32_scattersiv2df((void *)ADDR, (__mmask8)0xFF, \
-      (__v4si)(__m128i)INDEX, (__v2df)(__m128d)V1, (int)SCALE)
-
-#define _mm_mask_i32scatter_pd(ADDR, MASK, INDEX, V1, SCALE) \
-  __builtin_ia32_scattersiv2df((void *)ADDR, (__mmask8)MASK, \
-      (__v4si)(__m128i)INDEX, (__v2df)(__m128d)V1, (int)SCALE)
-
-#define _mm256_i64scatter_ps(ADDR, INDEX, V1, SCALE) \
-  __builtin_ia32_scatterdiv8sf((void *)ADDR, (__mmask8)0xFF, \
-      (__v4di)(__m256i)INDEX, (__v4sf)(__m128)V1, (int)SCALE)
-
-#define _mm256_mask_i64scatter_ps(ADDR, MASK, INDEX, V1, SCALE) \
-  __builtin_ia32_scatterdiv8sf((void *)ADDR, (__mmask8)MASK, \
-      (__v4di)(__m256i)INDEX, (__v4sf)(__m128)V1, (int)SCALE)
-
-#define _mm_i64scatter_ps(ADDR, INDEX, V1, SCALE) \
-  __builtin_ia32_scatterdiv4sf((void *)ADDR, (__mmask8)0xFF, \
-      (__v2di)(__m128i)INDEX, (__v4sf)(__m128)V1, (int)SCALE)
-
-#define _mm_mask_i64scatter_ps(ADDR, MASK, INDEX, V1, SCALE) \
-  __builtin_ia32_scatterdiv4sf((void *)ADDR, (__mmask8)MASK, \
-      (__v2di)(__m128i)INDEX, (__v4sf)(__m128)V1, (int)SCALE)
-
-#define _mm256_i64scatter_pd(ADDR, INDEX, V1, SCALE) \
-  __builtin_ia32_scatterdiv4df((void *)ADDR, (__mmask8)0xFF, \
-      (__v4di)(__m256i)INDEX, (__v4df)(__m256d)V1, (int)SCALE)
-
-#define _mm256_mask_i64scatter_pd(ADDR, MASK, INDEX, V1, SCALE) \
-  __builtin_ia32_scatterdiv4df((void *)ADDR, (__mmask8)MASK, \
-      (__v4di)(__m256i)INDEX, (__v4df)(__m256d)V1, (int)SCALE)
-
-#define _mm_i64scatter_pd(ADDR, INDEX, V1, SCALE) \
-  __builtin_ia32_scatterdiv2df((void *)ADDR, (__mmask8)0xFF, \
-      (__v2di)(__m128i)INDEX, (__v2df)(__m128d)V1, (int)SCALE)
-
-#define _mm_mask_i64scatter_pd(ADDR, MASK, INDEX, V1, SCALE) \
-  __builtin_ia32_scatterdiv2df((void *)ADDR, (__mmask8)MASK, \
-      (__v2di)(__m128i)INDEX, (__v2df)(__m128d)V1, (int)SCALE)
-
-#define _mm256_i32scatter_epi32(ADDR, INDEX, V1, SCALE) \
-  __builtin_ia32_scattersiv8si((void *)ADDR, (__mmask8)0xFF, \
-      (__v8si)(__m256i)INDEX, (__v8si)(__m256i)V1, (int)SCALE)
-
-#define _mm256_mask_i32scatter_epi32(ADDR, MASK, INDEX, V1, SCALE) \
-  __builtin_ia32_scattersiv8si((void *)ADDR, (__mmask8)MASK, \
-      (__v8si)(__m256i)INDEX, (__v8si)(__m256i)V1, (int)SCALE)
-
-#define _mm_i32scatter_epi32(ADDR, INDEX, V1, SCALE) \
-  __builtin_ia32_scattersiv4si((void *)ADDR, (__mmask8)0xFF, \
-      (__v4si)(__m128i)INDEX, (__v4si)(__m128i)V1, (int)SCALE)
-
-#define _mm_mask_i32scatter_epi32(ADDR, MASK, INDEX, V1, SCALE) \
-  __builtin_ia32_scattersiv4si((void *)ADDR, (__mmask8)MASK, \
-      (__v4si)(__m128i)INDEX, (__v4si)(__m128i)V1, (int)SCALE)
-
-#define _mm256_i32scatter_epi64(ADDR, INDEX, V1, SCALE) \
-  __builtin_ia32_scattersiv4di((void *)ADDR, (__mmask8)0xFF, \
-      (__v4si)(__m128i)INDEX, (__v4di)(__m256i)V1, (int)SCALE)
-
-#define _mm256_mask_i32scatter_epi64(ADDR, MASK, INDEX, V1, SCALE) \
-  __builtin_ia32_scattersiv4di((void *)ADDR, (__mmask8)MASK, \
-      (__v4si)(__m128i)INDEX, (__v4di)(__m256i)V1, (int)SCALE)
-
-#define _mm_i32scatter_epi64(ADDR, INDEX, V1, SCALE) \
-  __builtin_ia32_scattersiv2di((void *)ADDR, (__mmask8)0xFF, \
-      (__v4si)(__m128i)INDEX, (__v2di)(__m128i)V1, (int)SCALE)
-
-#define _mm_mask_i32scatter_epi64(ADDR, MASK, INDEX, V1, SCALE) \
-  __builtin_ia32_scattersiv2di((void *)ADDR, (__mmask8)MASK, \
-      (__v4si)(__m128i)INDEX, (__v2di)(__m128i)V1, (int)SCALE)
-
-#define _mm256_i64scatter_epi32(ADDR, INDEX, V1, SCALE) \
-  __builtin_ia32_scatterdiv8si((void *)ADDR, (__mmask8)0xFF, \
-      (__v4di)(__m256i)INDEX, (__v4si)(__m128i)V1, (int)SCALE)
-
-#define _mm256_mask_i64scatter_epi32(ADDR, MASK, INDEX, V1, SCALE) \
-  __builtin_ia32_scatterdiv8si((void *)ADDR, (__mmask8)MASK, \
-      (__v4di)(__m256i)INDEX, (__v4si)(__m128i)V1, (int)SCALE)
-
-#define _mm_i64scatter_epi32(ADDR, INDEX, V1, SCALE) \
-  __builtin_ia32_scatterdiv4si((void *)ADDR, (__mmask8)0xFF, \
-      (__v2di)(__m128i)INDEX, (__v4si)(__m128i)V1, (int)SCALE)
-
-#define _mm_mask_i64scatter_epi32(ADDR, MASK, INDEX, V1, SCALE) \
-  __builtin_ia32_scatterdiv4si((void *)ADDR, (__mmask8)MASK, \
-      (__v2di)(__m128i)INDEX, (__v4si)(__m128i)V1, (int)SCALE)
-
-#define _mm256_i64scatter_epi64(ADDR, INDEX, V1, SCALE) \
-  __builtin_ia32_scatterdiv4di((void *)ADDR, (__mmask8)0xFF, \
-      (__v4di)(__m256i)INDEX, (__v4di)(__m256i)V1, (int)SCALE)
-
-#define _mm256_mask_i64scatter_epi64(ADDR, MASK, INDEX, V1, SCALE) \
-  __builtin_ia32_scatterdiv4di((void *)ADDR, (__mmask8)MASK, \
-      (__v4di)(__m256i)INDEX, (__v4di)(__m256i)V1, (int)SCALE)
-
-#define _mm_i64scatter_epi64(ADDR, INDEX, V1, SCALE) \
-  __builtin_ia32_scatterdiv2di((void *)ADDR, (__mmask8)0xFF, \
-      (__v2di)(__m128i)INDEX, (__v2di)(__m128i)V1, (int)SCALE)
-
-#define _mm_mask_i64scatter_epi64(ADDR, MASK, INDEX, V1, SCALE) \
-  __builtin_ia32_scatterdiv2di((void *)ADDR, (__mmask8)MASK, \
-      (__v2di)(__m128i)INDEX, (__v2di)(__m128i)V1, (int)SCALE)
-
-#define _mm256_mask_shuffle_epi32(W, U, X, C) \
-  ((__m256i)__builtin_ia32_pshufd256_mask((__v8si)(__m256i)(X), (int)(C), \
-      (__v8si)(__m256i)(W), (__mmask8)(U)))
-
-#define _mm256_maskz_shuffle_epi32(U, X, C) \
-  ((__m256i)__builtin_ia32_pshufd256_mask((__v8si)(__m256i)(X), (int)(C), \
-      (__v8si)(__m256i)_mm256_setzero_si256(), (__mmask8)(U)))
-
-#define _mm_mask_shuffle_epi32(W, U, X, C) \
-  ((__m128i)__builtin_ia32_pshufd128_mask((__v4si)(__m128i)(X), (int)(C), \
-      (__v4si)(__m128i)(W), (__mmask8)(U)))
-
-#define _mm_maskz_shuffle_epi32(U, X, C) \
-  ((__m128i)__builtin_ia32_pshufd128_mask((__v4si)(__m128i)(X), (int)(C), \
-      (__v4si)(__m128i)_mm_setzero_si128(), (__mmask8)(U)))
-
-#define _mm256_rol_epi64(A, B) \
-  ((__m256i)__builtin_ia32_prolq256_mask((__v4di)(__m256i)(A), (int)(B), \
-      (__v4di)(__m256i)_mm256_setzero_si256(), (__mmask8)-1))
-
-#define _mm256_mask_rol_epi64(W, U, A, B) \
-  ((__m256i)__builtin_ia32_prolq256_mask((__v4di)(__m256i)(A), (int)(B), \
-      (__v4di)(__m256i)(W), (__mmask8)(U)))
-
-#define _mm256_maskz_rol_epi64(U, A, B) \
-  ((__m256i)__builtin_ia32_prolq256_mask((__v4di)(__m256i)(A), (int)(B), \
-      (__v4di)(__m256i)_mm256_setzero_si256(), (__mmask8)(U)))
-
-#define _mm_rol_epi64(A, B) \
-  ((__m128i)__builtin_ia32_prolq128_mask((__v2di)(__m128i)(A), (int)(B), \
-      (__v2di)(__m128i)_mm_setzero_si128(), (__mmask8)-1))
-
-#define _mm_mask_rol_epi64(W, U, A, B) \
-  ((__m128i)__builtin_ia32_prolq128_mask((__v2di)(__m128i)(A), (int)(B), \
-      (__v2di)(__m128i)(W), (__mmask8)(U)))
-
-#define _mm_maskz_rol_epi64(U, A, B) \
-  ((__m128i)__builtin_ia32_prolq128_mask((__v2di)(__m128i)(A), (int)(B), \
-      (__v2di)(__m128i)_mm_setzero_si128(), (__mmask8)(U)))
-
-#define _mm256_ror_epi64(A, B) \
-  ((__m256i)__builtin_ia32_prorq256_mask((__v4di)(__m256i)(A), (int)(B), \
-      (__v4di)(__m256i)_mm256_setzero_si256(), (__mmask8)-1))
-
-#define _mm256_mask_ror_epi64(W, U, A, B) \
-  ((__m256i)__builtin_ia32_prorq256_mask((__v4di)(__m256i)(A), (int)(B), \
-      (__v4di)(__m256i)(W), (__mmask8)(U)))
-
-#define _mm256_maskz_ror_epi64(U, A, B) \
-  ((__m256i)__builtin_ia32_prorq256_mask((__v4di)(__m256i)(A), (int)(B), \
-      (__v4di)(__m256i)_mm256_setzero_si256(), (__mmask8)(U)))
-
-#define _mm_ror_epi64(A, B) \
-  ((__m128i)__builtin_ia32_prorq128_mask((__v2di)(__m128i)(A), (int)(B), \
-      (__v2di)(__m128i)_mm_setzero_si128(), (__mmask8)-1))
-
-#define _mm_mask_ror_epi64(W, U, A, B) \
-  ((__m128i)__builtin_ia32_prorq128_mask((__v2di)(__m128i)(A), (int)(B), \
-      (__v2di)(__m128i)(W), (__mmask8)(U)))
-
-#define _mm_maskz_ror_epi64(U, A, B) \
-  ((__m128i)__builtin_ia32_prorq128_mask((__v2di)(__m128i)(A), (int)(B), \
-      (__v2di)(__m128i)_mm_setzero_si128(), (__mmask8)(U)))
-
-#define _mm256_rol_epi32(A, B) \
-  ((__m256i)__builtin_ia32_prold256_mask((__v8si)(__m256i)(A), (int)(B), \
-      (__v8si)(__m256i)_mm256_setzero_si256(), (__mmask8)-1))
-
-#define _mm256_mask_rol_epi32(W, U, A, B) \
-  ((__m256i)__builtin_ia32_prold256_mask((__v8si)(__m256i)(A), (int)(B), \
-      (__v8si)(__m256i)(W), (__mmask8)(U)))
-
-#define _mm256_maskz_rol_epi32(U, A, B) \
-  ((__m256i)__builtin_ia32_prold256_mask((__v8si)(__m256i)(A), (int)(B), \
-      (__v8si)(__m256i)_mm256_setzero_si256(), (__mmask8)(U)))
-
-#define _mm_rol_epi32(A, B) \
-  ((__m128i)__builtin_ia32_prold128_mask((__v4si)(__m128i)(A), (int)(B), \
-      (__v4si)(__m128i)_mm_setzero_si128(), (__mmask8)-1))
-
-#define _mm_mask_rol_epi32(W, U, A, B) \
-  ((__m128i)__builtin_ia32_prold128_mask((__v4si)(__m128i)(A), (int)(B), \
-      (__v4si)(__m128i)(W), (__mmask8)(U)))
-
-#define _mm_maskz_rol_epi32(U, A, B) \
-  ((__m128i)__builtin_ia32_prold128_mask((__v4si)(__m128i)(A), (int)(B), \
-      (__v4si)(__m128i)_mm_setzero_si128(), (__mmask8)(U)))
-
-#define _mm256_ror_epi32(A, B) \
-  ((__m256i)__builtin_ia32_prord256_mask((__v8si)(__m256i)(A), (int)(B), \
-      (__v8si)(__m256i)_mm256_setzero_si256(), (__mmask8)-1))
-
-#define _mm256_mask_ror_epi32(W, U, A, B) \
-  ((__m256i)__builtin_ia32_prord256_mask((__v8si)(__m256i)(A), (int)(B), \
-      (__v8si)(__m256i)(W), (__mmask8)(U)))
-
-#define _mm256_maskz_ror_epi32(U, A, B) \
-  ((__m256i)__builtin_ia32_prord256_mask((__v8si)(__m256i)(A), (int)(B), \
-      (__v8si)(__m256i)_mm256_setzero_si256(), (__mmask8)(U)))
-
-#define _mm_ror_epi32(A, B) \
-  ((__m128i)__builtin_ia32_prord128_mask((__v4si)(__m128i)(A), (int)(B), \
-      (__v4si)(__m128i)_mm_setzero_si128(), (__mmask8)-1))
-
-#define _mm_mask_ror_epi32(W, U, A, B) \
-  ((__m128i)__builtin_ia32_prord128_mask((__v4si)(__m128i)(A), (int)(B), \
-      (__v4si)(__m128i)(W), (__mmask8)(U)))
-
-#define _mm_maskz_ror_epi32(U, A, B) \
-  ((__m128i)__builtin_ia32_prord128_mask((__v4si)(__m128i)(A), (int)(B), \
-      (__v4si)(__m128i)_mm_setzero_si128(), (__mmask8)(U)))
-
-#define _mm256_alignr_epi32(X, Y, C) \
-  ((__m256i)__builtin_ia32_alignd256_mask((__v8si)(__m256i)(X), \
-      (__v8si)(__m256i)(Y), (int)(C), (__v8si)(__m256i)(X), (__mmask8)-1))
-
-#define _mm256_mask_alignr_epi32(W, U, X, Y, C) \
-  ((__m256i)__builtin_ia32_alignd256_mask((__v8si)(__m256i)(X), \
-      (__v8si)(__m256i)(Y), (int)(C), (__v8si)(__m256i)(W), (__mmask8)(U)))
-
-#define _mm256_maskz_alignr_epi32(U, X, Y, C) \
-  ((__m256i)__builtin_ia32_alignd256_mask((__v8si)(__m256i)(X), \
-      (__v8si)(__m256i)(Y), (int)(C), (__v8si)(__m256i)_mm256_setzero_si256(), (__mmask8)(U)))
-
-#define _mm256_alignr_epi64(X, Y, C) \
-  ((__m256i)__builtin_ia32_alignq256_mask((__v4di)(__m256i)(X), \
-      (__v4di)(__m256i)(Y), (int)(C), (__v4di)(__m256i)(X), (__mmask8)-1))
-
-#define _mm256_mask_alignr_epi64(W, U, X, Y, C) \
-  ((__m256i)__builtin_ia32_alignq256_mask((__v4di)(__m256i)(X), \
-      (__v4di)(__m256i)(Y), (int)(C), (__v4di)(__m256i)(W), (__mmask8)(U)))
-
-#define _mm256_maskz_alignr_epi64(U, X, Y, C) \
-  ((__m256i)__builtin_ia32_alignq256_mask((__v4di)(__m256i)(X), \
-      (__v4di)(__m256i)(Y), (int)(C), (__v4di)(__m256i)_mm256_setzero_si256(), (__mmask8)(U)))
-
-#define _mm_alignr_epi32(X, Y, C) \
-  ((__m128i)__builtin_ia32_alignd128_mask((__v4si)(__m128i)(X), \
-      (__v4si)(__m128i)(Y), (int)(C), (__v4si)(__m128i)(X), (__mmask8)-1))
-
-#define _mm_mask_alignr_epi32(W, U, X, Y, C) \
-  ((__m128i)__builtin_ia32_alignd128_mask((__v4si)(__m128i)(X), \
-      (__v4si)(__m128i)(Y), (int)(C), (__v4si)(__m128i)(W), (__mmask8)(U)))
-
-#define _mm_maskz_alignr_epi32(U, X, Y, C) \
-  ((__m128i)__builtin_ia32_alignd128_mask((__v4si)(__m128i)(X), \
-      (__v4si)(__m128i)(Y), (int)(C), (__v4si)(__m128i)_mm_setzero_si128(), (__mmask8)(U)))
-
-#define _mm_alignr_epi64(X, Y, C) \
-  ((__m128i)__builtin_ia32_alignq128_mask((__v2di)(__m128i)(X), \
-      (__v2di)(__m128i)(Y), (int)(C), (__v2di)(__m128i)(X), (__mmask8)-1))
-
-#define _mm_mask_alignr_epi64(W, U, X, Y, C) \
-  ((__m128i)__builtin_ia32_alignq128_mask((__v2di)(__m128i)(X), \
-
(__v2di)(__m128i)(Y), (int)(C), \ - (__v2di)(__m128i)(X), (__mmask8)-1)) - -#define _mm_maskz_alignr_epi64(U, X, Y, C) \ - ((__m128i)__builtin_ia32_alignq128_mask( \ - (__v2di)(__m128i)(X), (__v2di)(__m128i)(Y), (int)(C), \ - (__v2di)(__m128i)_mm_setzero_si128(), (__mmask8)(U))) - -#define _mm_mask_cvtps_ph(W, U, A, I) \ - ((__m128i)__builtin_ia32_vcvtps2ph_mask( \ - (__v4sf)(__m128)A, (int)(I), (__v8hi)(__m128i)(W), (__mmask8)(U))) - -#define _mm_maskz_cvtps_ph(U, A, I) \ - ((__m128i)__builtin_ia32_vcvtps2ph_mask( \ - (__v4sf)(__m128)A, (int)(I), (__v8hi)(__m128i)_mm_setzero_si128(), \ - (__mmask8)(U))) - -#define _mm256_mask_cvtps_ph(W, U, A, I) \ - ((__m128i)__builtin_ia32_vcvtps2ph256_mask( \ - (__v8sf)(__m256)A, (int)(I), (__v8hi)(__m128i)(W), (__mmask8)(U))) - -#define _mm256_maskz_cvtps_ph(U, A, I) \ - ((__m128i)__builtin_ia32_vcvtps2ph256_mask( \ - (__v8sf)(__m256)A, (int)(I), (__v8hi)(__m128i)_mm_setzero_si128(), \ - (__mmask8)(U))) - -#define _mm256_mask_srai_epi32(W, U, A, B) \ - ((__m256i)__builtin_ia32_psradi256_mask( \ - (__v8si)(__m256i)(A), (int)(B), (__v8si)(__m256i)(W), (__mmask8)(U))) - -#define _mm256_maskz_srai_epi32(U, A, B) \ - ((__m256i)__builtin_ia32_psradi256_mask((__v8si)(__m256i)(A), (int)(B), \ - (__v8si)_mm256_setzero_si256(), \ - (__mmask8)(U))) - -#define _mm_mask_srai_epi32(W, U, A, B) \ - ((__m128i)__builtin_ia32_psradi128_mask( \ - (__v4si)(__m128i)(A), (int)(B), (__v4si)(__m128i)(W), (__mmask8)(U))) - -#define _mm_maskz_srai_epi32(U, A, B) \ - ((__m128i)__builtin_ia32_psradi128_mask((__v4si)(__m128i)(A), (int)(B), \ - (__v4si)_mm_setzero_si128(), \ - (__mmask8)(U))) - -#define _mm256_srai_epi64(A, B) \ - ((__m256i)__builtin_ia32_psraqi256_mask((__v4di)(__m256i)(A), (int)(B), \ - (__v4di)_mm256_setzero_si256(), \ - (__mmask8)-1)) - -#define _mm256_mask_srai_epi64(W, U, A, B) \ - ((__m256i)__builtin_ia32_psraqi256_mask( \ - (__v4di)(__m256i)(A), (int)(B), (__v4di)(__m256i)(W), (__mmask8)(U))) - -#define _mm256_maskz_srai_epi64(U, A, B) \ - ((__m256i)__builtin_ia32_psraqi256_mask((__v4di)(__m256i)(A), (int)(B), \ - (__v4di)_mm256_setzero_si256(), \ - (__mmask8)(U))) - -#define _mm_srai_epi64(A, B) \ - ((__m128i)__builtin_ia32_psraqi128_mask((__v2di)(__m128i)(A), (int)(B), \ - (__v2di)_mm_setzero_si128(), \ - (__mmask8)-1)) - -#define _mm_mask_srai_epi64(W, U, A, B) \ - ((__m128i)__builtin_ia32_psraqi128_mask( \ - (__v2di)(__m128i)(A), (int)(B), (__v2di)(__m128i)(W), (__mmask8)(U))) - -#define _mm_maskz_srai_epi64(U, A, B) \ - ((__m128i)__builtin_ia32_psraqi128_mask((__v2di)(__m128i)(A), (int)(B), \ - (__v2di)_mm_setzero_si128(), \ - (__mmask8)(U))) - -#define _mm256_mask_permutex_pd(W, U, A, B) \ - ((__m256d)__builtin_ia32_permdf256_mask( \ - (__v4df)(__m256d)(A), (int)(B), (__v4df)(__m256d)(W), (__mmask8)(U))) - -#define _mm256_maskz_permutex_pd(U, A, B) \ - ((__m256d)__builtin_ia32_permdf256_mask( \ - (__v4df)(__m256d)(A), (int)(B), (__v4df)(__m256d)_mm256_setzero_pd(), \ - (__mmask8)(U))) - -#define _mm256_mask_permute_pd(W, U, X, C) \ - ((__m256d)__builtin_ia32_vpermilpd256_mask( \ - (__v4df)(__m256d)(X), (int)(C), (__v4df)(__m256d)(W), (__mmask8)(U))) - -#define _mm256_maskz_permute_pd(U, X, C) \ - ((__m256d)__builtin_ia32_vpermilpd256_mask( \ - (__v4df)(__m256d)(X), (int)(C), (__v4df)(__m256d)_mm256_setzero_pd(), \ - (__mmask8)(U))) - -#define _mm256_mask_permute_ps(W, U, X, C) \ - ((__m256)__builtin_ia32_vpermilps256_mask( \ - (__v8sf)(__m256)(X), (int)(C), (__v8sf)(__m256)(W), (__mmask8)(U))) - -#define _mm256_maskz_permute_ps(U, X, C) \ - 
((__m256)__builtin_ia32_vpermilps256_mask( \ - (__v8sf)(__m256)(X), (int)(C), (__v8sf)(__m256)_mm256_setzero_ps(), \ - (__mmask8)(U))) - -#define _mm_mask_permute_pd(W, U, X, C) \ - ((__m128d)__builtin_ia32_vpermilpd_mask( \ - (__v2df)(__m128d)(X), (int)(C), (__v2df)(__m128d)(W), (__mmask8)(U))) - -#define _mm_maskz_permute_pd(U, X, C) \ - ((__m128d)__builtin_ia32_vpermilpd_mask((__v2df)(__m128d)(X), (int)(C), \ - (__v2df)(__m128d)_mm_setzero_pd(), \ - (__mmask8)(U))) - -#define _mm_mask_permute_ps(W, U, X, C) \ - ((__m128)__builtin_ia32_vpermilps_mask((__v4sf)(__m128)(X), (int)(C), \ - (__v4sf)(__m128)(W), (__mmask8)(U))) - -#define _mm_maskz_permute_ps(U, X, C) \ - ((__m128)__builtin_ia32_vpermilps_mask((__v4sf)(__m128)(X), (int)(C), \ - (__v4sf)(__m128)_mm_setzero_ps(), \ - (__mmask8)(U))) - -#define _mm256_mask_blend_pd(__U, __A, __W) \ - ((__m256d)__builtin_ia32_blendmpd_256_mask((__v4df)(__A), (__v4df)(__W), \ - (__mmask8)(__U))) - -#define _mm256_mask_blend_ps(__U, __A, __W) \ - ((__m256)__builtin_ia32_blendmps_256_mask((__v8sf)(__A), (__v8sf)(__W), \ - (__mmask8)(__U))) - -#define _mm256_mask_blend_epi64(__U, __A, __W) \ - ((__m256i)__builtin_ia32_blendmq_256_mask((__v4di)(__A), (__v4di)(__W), \ - (__mmask8)(__U))) - -#define _mm256_mask_blend_epi32(__U, __A, __W) \ - ((__m256i)__builtin_ia32_blendmd_256_mask((__v8si)(__A), (__v8si)(__W), \ - (__mmask8)(__U))) - -#define _mm_mask_blend_pd(__U, __A, __W) \ - ((__m128d)__builtin_ia32_blendmpd_128_mask((__v2df)(__A), (__v2df)(__W), \ - (__mmask8)(__U))) - -#define _mm_mask_blend_ps(__U, __A, __W) \ - ((__m128)__builtin_ia32_blendmps_128_mask((__v4sf)(__A), (__v4sf)(__W), \ - (__mmask8)(__U))) - -#define _mm_mask_blend_epi64(__U, __A, __W) \ - ((__m128i)__builtin_ia32_blendmq_128_mask((__v2di)(__A), (__v2di)(__W), \ - (__mmask8)(__U))) - -#define _mm_mask_blend_epi32(__U, __A, __W) \ - ((__m128i)__builtin_ia32_blendmd_128_mask((__v4si)(__A), (__v4si)(__W), \ - (__mmask8)(__U))) - -#define _mm256_cmp_epu32_mask(X, Y, P) \ - ((__mmask8)__builtin_ia32_ucmpd256_mask( \ - (__v8si)(__m256i)(X), (__v8si)(__m256i)(Y), (int)(P), (__mmask8)-1)) - -#define _mm256_cmp_epi64_mask(X, Y, P) \ - ((__mmask8)__builtin_ia32_cmpq256_mask( \ - (__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)(P), (__mmask8)-1)) - -#define _mm256_cmp_epi32_mask(X, Y, P) \ - ((__mmask8)__builtin_ia32_cmpd256_mask( \ - (__v8si)(__m256i)(X), (__v8si)(__m256i)(Y), (int)(P), (__mmask8)-1)) - -#define _mm256_cmp_epu64_mask(X, Y, P) \ - ((__mmask8)__builtin_ia32_ucmpq256_mask( \ - (__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)(P), (__mmask8)-1)) - -#define _mm256_cmp_pd_mask(X, Y, P) \ - ((__mmask8)__builtin_ia32_cmppd256_mask( \ - (__v4df)(__m256d)(X), (__v4df)(__m256d)(Y), (int)(P), (__mmask8)-1)) - -#define _mm256_cmp_ps_mask(X, Y, P) \ - ((__mmask8)__builtin_ia32_cmpps256_mask( \ - (__v8sf)(__m256)(X), (__v8sf)(__m256)(Y), (int)(P), (__mmask8)-1)) - -#define _mm256_mask_cmp_epi64_mask(M, X, Y, P) \ - ((__mmask8)__builtin_ia32_cmpq256_mask( \ - (__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)(P), (__mmask8)(M))) - -#define _mm256_mask_cmp_epi32_mask(M, X, Y, P) \ - ((__mmask8)__builtin_ia32_cmpd256_mask( \ - (__v8si)(__m256i)(X), (__v8si)(__m256i)(Y), (int)(P), (__mmask8)(M))) - -#define _mm256_mask_cmp_epu64_mask(M, X, Y, P) \ - ((__mmask8)__builtin_ia32_ucmpq256_mask( \ - (__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)(P), (__mmask8)(M))) - -#define _mm256_mask_cmp_epu32_mask(M, X, Y, P) \ - ((__mmask8)__builtin_ia32_ucmpd256_mask( \ - (__v8si)(__m256i)(X), 
(__v8si)(__m256i)(Y), (int)(P), (__mmask8)(M))) - -#define _mm256_mask_cmp_pd_mask(M, X, Y, P) \ - ((__mmask8)__builtin_ia32_cmppd256_mask( \ - (__v4df)(__m256d)(X), (__v4df)(__m256d)(Y), (int)(P), (__mmask8)(M))) - -#define _mm256_mask_cmp_ps_mask(M, X, Y, P) \ - ((__mmask8)__builtin_ia32_cmpps256_mask( \ - (__v8sf)(__m256)(X), (__v8sf)(__m256)(Y), (int)(P), (__mmask8)(M))) - -#define _mm_cmp_epi64_mask(X, Y, P) \ - ((__mmask8)__builtin_ia32_cmpq128_mask( \ - (__v2di)(__m128i)(X), (__v2di)(__m128i)(Y), (int)(P), (__mmask8)-1)) - -#define _mm_cmp_epi32_mask(X, Y, P) \ - ((__mmask8)__builtin_ia32_cmpd128_mask( \ - (__v4si)(__m128i)(X), (__v4si)(__m128i)(Y), (int)(P), (__mmask8)-1)) - -#define _mm_cmp_epu64_mask(X, Y, P) \ - ((__mmask8)__builtin_ia32_ucmpq128_mask( \ - (__v2di)(__m128i)(X), (__v2di)(__m128i)(Y), (int)(P), (__mmask8)-1)) - -#define _mm_cmp_epu32_mask(X, Y, P) \ - ((__mmask8)__builtin_ia32_ucmpd128_mask( \ - (__v4si)(__m128i)(X), (__v4si)(__m128i)(Y), (int)(P), (__mmask8)-1)) - -#define _mm_cmp_pd_mask(X, Y, P) \ - ((__mmask8)__builtin_ia32_cmppd128_mask( \ - (__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (int)(P), (__mmask8)-1)) - -#define _mm_cmp_ps_mask(X, Y, P) \ - ((__mmask8)__builtin_ia32_cmpps128_mask( \ - (__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (int)(P), (__mmask8)-1)) - -#define _mm_mask_cmp_epi64_mask(M, X, Y, P) \ - ((__mmask8)__builtin_ia32_cmpq128_mask( \ - (__v2di)(__m128i)(X), (__v2di)(__m128i)(Y), (int)(P), (__mmask8)(M))) - -#define _mm_mask_cmp_epi32_mask(M, X, Y, P) \ - ((__mmask8)__builtin_ia32_cmpd128_mask( \ - (__v4si)(__m128i)(X), (__v4si)(__m128i)(Y), (int)(P), (__mmask8)(M))) - -#define _mm_mask_cmp_epu64_mask(M, X, Y, P) \ - ((__mmask8)__builtin_ia32_ucmpq128_mask( \ - (__v2di)(__m128i)(X), (__v2di)(__m128i)(Y), (int)(P), (__mmask8)(M))) - -#define _mm_mask_cmp_epu32_mask(M, X, Y, P) \ - ((__mmask8)__builtin_ia32_ucmpd128_mask( \ - (__v4si)(__m128i)(X), (__v4si)(__m128i)(Y), (int)(P), (__mmask8)(M))) - -#define _mm_mask_cmp_pd_mask(M, X, Y, P) \ - ((__mmask8)__builtin_ia32_cmppd128_mask( \ - (__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (int)(P), (__mmask8)(M))) - -#define _mm_mask_cmp_ps_mask(M, X, Y, P) \ - ((__mmask8)__builtin_ia32_cmpps128_mask( \ - (__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (int)(P), (__mmask8)(M))) - +#define _mm256_permutex_pd(X, M) ((__m256d) __builtin_ia32_permdf256_mask ((__v4df)(__m256d)(X), (int)(M), (__v4df)(__m256d) _mm256_undefined_pd (), (__mmask8)-1)) +#define _mm256_permutex_epi64(X, I) ((__m256i) __builtin_ia32_permdi256_mask ((__v4di)(__m256i)(X), (int)(I), (__v4di)(__m256i) (_mm256_setzero_si256 ()), (__mmask8) -1)) +#define _mm256_maskz_permutex_epi64(M, X, I) ((__m256i) __builtin_ia32_permdi256_mask ((__v4di)(__m256i)(X), (int)(I), (__v4di)(__m256i) (_mm256_setzero_si256 ()), (__mmask8)(M))) +#define _mm256_mask_permutex_epi64(W, M, X, I) ((__m256i) __builtin_ia32_permdi256_mask ((__v4di)(__m256i)(X), (int)(I), (__v4di)(__m256i)(W), (__mmask8)(M))) +#define _mm256_insertf32x4(X, Y, C) ((__m256) __builtin_ia32_insertf32x4_256_mask ((__v8sf)(__m256) (X), (__v4sf)(__m128) (Y), (int) (C), (__v8sf)(__m256)_mm256_setzero_ps (), (__mmask8)-1)) +#define _mm256_mask_insertf32x4(W, U, X, Y, C) ((__m256) __builtin_ia32_insertf32x4_256_mask ((__v8sf)(__m256) (X), (__v4sf)(__m128) (Y), (int) (C), (__v8sf)(__m256)(W), (__mmask8)(U))) +#define _mm256_maskz_insertf32x4(U, X, Y, C) ((__m256) __builtin_ia32_insertf32x4_256_mask ((__v8sf)(__m256) (X), (__v4sf)(__m128) (Y), (int) (C), (__v8sf)(__m256)_mm256_setzero_ps (), 
(__mmask8)(U))) +#define _mm256_inserti32x4(X, Y, C) ((__m256i) __builtin_ia32_inserti32x4_256_mask ((__v8si)(__m256i) (X), (__v4si)(__m128i) (Y), (int) (C), (__v8si)(__m256i)_mm256_setzero_si256 (), (__mmask8)-1)) +#define _mm256_mask_inserti32x4(W, U, X, Y, C) ((__m256i) __builtin_ia32_inserti32x4_256_mask ((__v8si)(__m256i) (X), (__v4si)(__m128i) (Y), (int) (C), (__v8si)(__m256i)(W), (__mmask8)(U))) +#define _mm256_maskz_inserti32x4(U, X, Y, C) ((__m256i) __builtin_ia32_inserti32x4_256_mask ((__v8si)(__m256i) (X), (__v4si)(__m128i) (Y), (int) (C), (__v8si)(__m256i)_mm256_setzero_si256 (), (__mmask8)(U))) +#define _mm256_extractf32x4_ps(X, C) ((__m128) __builtin_ia32_extractf32x4_256_mask ((__v8sf)(__m256) (X), (int) (C), (__v4sf)(__m128)_mm_setzero_ps (), (__mmask8)-1)) +#define _mm256_mask_extractf32x4_ps(W, U, X, C) ((__m128) __builtin_ia32_extractf32x4_256_mask ((__v8sf)(__m256) (X), (int) (C), (__v4sf)(__m128)(W), (__mmask8)(U))) +#define _mm256_maskz_extractf32x4_ps(U, X, C) ((__m128) __builtin_ia32_extractf32x4_256_mask ((__v8sf)(__m256) (X), (int) (C), (__v4sf)(__m128)_mm_setzero_ps (), (__mmask8)(U))) +#define _mm256_extracti32x4_epi32(X, C) ((__m128i) __builtin_ia32_extracti32x4_256_mask ((__v8si)(__m256i) (X), (int) (C), (__v4si)(__m128i)_mm_setzero_si128 (), (__mmask8)-1)) +#define _mm256_mask_extracti32x4_epi32(W, U, X, C) ((__m128i) __builtin_ia32_extracti32x4_256_mask ((__v8si)(__m256i) (X), (int) (C), (__v4si)(__m128i)(W), (__mmask8)(U))) +#define _mm256_maskz_extracti32x4_epi32(U, X, C) ((__m128i) __builtin_ia32_extracti32x4_256_mask ((__v8si)(__m256i) (X), (int) (C), (__v4si)(__m128i)_mm_setzero_si128 (), (__mmask8)(U))) +#define _mm256_shuffle_i64x2(X, Y, C) ((__m256i) __builtin_ia32_shuf_i64x2_256_mask ((__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)(C), (__v4di)(__m256i)_mm256_setzero_si256 (), (__mmask8)-1)) +#define _mm256_mask_shuffle_i64x2(W, U, X, Y, C) ((__m256i) __builtin_ia32_shuf_i64x2_256_mask ((__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)(C), (__v4di)(__m256i)(W), (__mmask8)(U))) +#define _mm256_maskz_shuffle_i64x2(U, X, Y, C) ((__m256i) __builtin_ia32_shuf_i64x2_256_mask ((__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)(C), (__v4di)(__m256i)_mm256_setzero_si256 (), (__mmask8)(U))) +#define _mm256_shuffle_i32x4(X, Y, C) ((__m256i) __builtin_ia32_shuf_i32x4_256_mask ((__v8si)(__m256i)(X), (__v8si)(__m256i)(Y), (int)(C), (__v8si)(__m256i) _mm256_setzero_si256 (), (__mmask8)-1)) +#define _mm256_mask_shuffle_i32x4(W, U, X, Y, C) ((__m256i) __builtin_ia32_shuf_i32x4_256_mask ((__v8si)(__m256i)(X), (__v8si)(__m256i)(Y), (int)(C), (__v8si)(__m256i)(W), (__mmask8)(U))) +#define _mm256_maskz_shuffle_i32x4(U, X, Y, C) ((__m256i) __builtin_ia32_shuf_i32x4_256_mask ((__v8si)(__m256i)(X), (__v8si)(__m256i)(Y), (int)(C), (__v8si)(__m256i) _mm256_setzero_si256 (), (__mmask8)(U))) +#define _mm256_shuffle_f64x2(X, Y, C) ((__m256d) __builtin_ia32_shuf_f64x2_256_mask ((__v4df)(__m256d)(X), (__v4df)(__m256d)(Y), (int)(C), (__v4df)(__m256d)_mm256_setzero_pd (), (__mmask8)-1)) +#define _mm256_mask_shuffle_f64x2(W, U, X, Y, C) ((__m256d) __builtin_ia32_shuf_f64x2_256_mask ((__v4df)(__m256d)(X), (__v4df)(__m256d)(Y), (int)(C), (__v4df)(__m256d)(W), (__mmask8)(U))) +#define _mm256_maskz_shuffle_f64x2(U, X, Y, C) ((__m256d) __builtin_ia32_shuf_f64x2_256_mask ((__v4df)(__m256d)(X), (__v4df)(__m256d)(Y), (int)(C), (__v4df)(__m256d)_mm256_setzero_pd( ), (__mmask8)(U))) +#define _mm256_shuffle_f32x4(X, Y, C) ((__m256) __builtin_ia32_shuf_f32x4_256_mask ((__v8sf)(__m256)(X), 
(__v8sf)(__m256)(Y), (int)(C), (__v8sf)(__m256)_mm256_setzero_ps (), (__mmask8)-1)) +#define _mm256_mask_shuffle_f32x4(W, U, X, Y, C) ((__m256) __builtin_ia32_shuf_f32x4_256_mask ((__v8sf)(__m256)(X), (__v8sf)(__m256)(Y), (int)(C), (__v8sf)(__m256)(W), (__mmask8)(U))) +#define _mm256_maskz_shuffle_f32x4(U, X, Y, C) ((__m256) __builtin_ia32_shuf_f32x4_256_mask ((__v8sf)(__m256)(X), (__v8sf)(__m256)(Y), (int)(C), (__v8sf)(__m256)_mm256_setzero_ps (), (__mmask8)(U))) +#define _mm256_mask_shuffle_pd(W, U, A, B, C) ((__m256d)__builtin_ia32_shufpd256_mask ((__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), (__v4df)(__m256d)(W), (__mmask8)(U))) +#define _mm256_maskz_shuffle_pd(U, A, B, C) ((__m256d)__builtin_ia32_shufpd256_mask ((__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), (__v4df)(__m256d) _mm256_setzero_pd (), (__mmask8)(U))) +#define _mm_mask_shuffle_pd(W, U, A, B, C) ((__m128d)__builtin_ia32_shufpd128_mask ((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), (__v2df)(__m128d)(W), (__mmask8)(U))) +#define _mm_maskz_shuffle_pd(U, A, B, C) ((__m128d)__builtin_ia32_shufpd128_mask ((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), (__v2df)(__m128d)_mm_setzero_pd (), (__mmask8)(U))) +#define _mm256_mask_shuffle_ps(W, U, A, B, C) ((__m256) __builtin_ia32_shufps256_mask ((__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), (__v8sf)(__m256)(W), (__mmask8)(U))) +#define _mm256_maskz_shuffle_ps(U, A, B, C) ((__m256) __builtin_ia32_shufps256_mask ((__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), (__v8sf)(__m256)_mm256_setzero_ps (), (__mmask8)(U))) +#define _mm_mask_shuffle_ps(W, U, A, B, C) ((__m128) __builtin_ia32_shufps128_mask ((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf)(__m128)(W), (__mmask8)(U))) +#define _mm_maskz_shuffle_ps(U, A, B, C) ((__m128) __builtin_ia32_shufps128_mask ((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf)(__m128)_mm_setzero_ps (), (__mmask8)(U))) +#define _mm256_fixupimm_pd(X, Y, Z, C) ((__m256d)__builtin_ia32_fixupimmpd256_mask ((__v4df)(__m256d)(X), (__v4df)(__m256d)(Y), (__v4di)(__m256i)(Z), (int)(C), (__mmask8)(-1))) +#define _mm256_mask_fixupimm_pd(X, U, Y, Z, C) ((__m256d)__builtin_ia32_fixupimmpd256_mask ((__v4df)(__m256d)(X), (__v4df)(__m256d)(Y), (__v4di)(__m256i)(Z), (int)(C), (__mmask8)(U))) +#define _mm256_maskz_fixupimm_pd(U, X, Y, Z, C) ((__m256d)__builtin_ia32_fixupimmpd256_maskz ((__v4df)(__m256d)(X), (__v4df)(__m256d)(Y), (__v4di)(__m256i)(Z), (int)(C), (__mmask8)(U))) +#define _mm256_fixupimm_ps(X, Y, Z, C) ((__m256)__builtin_ia32_fixupimmps256_mask ((__v8sf)(__m256)(X), (__v8sf)(__m256)(Y), (__v8si)(__m256i)(Z), (int)(C), (__mmask8)(-1))) +#define _mm256_mask_fixupimm_ps(X, U, Y, Z, C) ((__m256)__builtin_ia32_fixupimmps256_mask ((__v8sf)(__m256)(X), (__v8sf)(__m256)(Y), (__v8si)(__m256i)(Z), (int)(C), (__mmask8)(U))) +#define _mm256_maskz_fixupimm_ps(U, X, Y, Z, C) ((__m256)__builtin_ia32_fixupimmps256_maskz ((__v8sf)(__m256)(X), (__v8sf)(__m256)(Y), (__v8si)(__m256i)(Z), (int)(C), (__mmask8)(U))) +#define _mm_fixupimm_pd(X, Y, Z, C) ((__m128d)__builtin_ia32_fixupimmpd128_mask ((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C), (__mmask8)(-1))) +#define _mm_mask_fixupimm_pd(X, U, Y, Z, C) ((__m128d)__builtin_ia32_fixupimmpd128_mask ((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C), (__mmask8)(U))) +#define _mm_maskz_fixupimm_pd(U, X, Y, Z, C) ((__m128d)__builtin_ia32_fixupimmpd128_maskz ((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), 
(int)(C), (__mmask8)(U))) +#define _mm_fixupimm_ps(X, Y, Z, C) ((__m128)__builtin_ia32_fixupimmps128_mask ((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C), (__mmask8)(-1))) +#define _mm_mask_fixupimm_ps(X, U, Y, Z, C) ((__m128)__builtin_ia32_fixupimmps128_mask ((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C), (__mmask8)(U))) +#define _mm_maskz_fixupimm_ps(U, X, Y, Z, C) ((__m128)__builtin_ia32_fixupimmps128_maskz ((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C), (__mmask8)(U))) +#define _mm256_mask_srli_epi32(W, U, A, B) ((__m256i) __builtin_ia32_psrldi256_mask ((__v8si)(__m256i)(A), (int)(B), (__v8si)(__m256i)(W), (__mmask8)(U))) +#define _mm256_maskz_srli_epi32(U, A, B) ((__m256i) __builtin_ia32_psrldi256_mask ((__v8si)(__m256i)(A), (int)(B), (__v8si)_mm256_setzero_si256 (), (__mmask8)(U))) +#define _mm_mask_srli_epi32(W, U, A, B) ((__m128i) __builtin_ia32_psrldi128_mask ((__v4si)(__m128i)(A), (int)(B), (__v4si)(__m128i)(W), (__mmask8)(U))) +#define _mm_maskz_srli_epi32(U, A, B) ((__m128i) __builtin_ia32_psrldi128_mask ((__v4si)(__m128i)(A), (int)(B), (__v4si)_mm_setzero_si128 (), (__mmask8)(U))) +#define _mm256_mask_srli_epi64(W, U, A, B) ((__m256i) __builtin_ia32_psrlqi256_mask ((__v4di)(__m256i)(A), (int)(B), (__v4di)(__m256i)(W), (__mmask8)(U))) +#define _mm256_maskz_srli_epi64(U, A, B) ((__m256i) __builtin_ia32_psrlqi256_mask ((__v4di)(__m256i)(A), (int)(B), (__v4di)_mm256_setzero_si256 (), (__mmask8)(U))) +#define _mm_mask_srli_epi64(W, U, A, B) ((__m128i) __builtin_ia32_psrlqi128_mask ((__v2di)(__m128i)(A), (int)(B), (__v2di)(__m128i)(W), (__mmask8)(U))) +#define _mm_maskz_srli_epi64(U, A, B) ((__m128i) __builtin_ia32_psrlqi128_mask ((__v2di)(__m128i)(A), (int)(B), (__v2di)_mm_setzero_si128 (), (__mmask8)(U))) +#define _mm256_mask_slli_epi32(W, U, X, C) ((__m256i)__builtin_ia32_pslldi256_mask ((__v8si)(__m256i)(X), (int)(C), (__v8si)(__m256i)(W), (__mmask8)(U))) +#define _mm256_maskz_slli_epi32(U, X, C) ((__m256i)__builtin_ia32_pslldi256_mask ((__v8si)(__m256i)(X), (int)(C), (__v8si)(__m256i)_mm256_setzero_si256 (), (__mmask8)(U))) +#define _mm256_mask_slli_epi64(W, U, X, C) ((__m256i)__builtin_ia32_psllqi256_mask ((__v4di)(__m256i)(X), (int)(C), (__v4di)(__m256i)(W), (__mmask8)(U))) +#define _mm256_maskz_slli_epi64(U, X, C) ((__m256i)__builtin_ia32_psllqi256_mask ((__v4di)(__m256i)(X), (int)(C), (__v4di)(__m256i)_mm256_setzero_si256 (), (__mmask8)(U))) +#define _mm_mask_slli_epi32(W, U, X, C) ((__m128i)__builtin_ia32_pslldi128_mask ((__v4si)(__m128i)(X), (int)(C), (__v4si)(__m128i)(W), (__mmask8)(U))) +#define _mm_maskz_slli_epi32(U, X, C) ((__m128i)__builtin_ia32_pslldi128_mask ((__v4si)(__m128i)(X), (int)(C), (__v4si)(__m128i)_mm_setzero_si128 (), (__mmask8)(U))) +#define _mm_mask_slli_epi64(W, U, X, C) ((__m128i)__builtin_ia32_psllqi128_mask ((__v2di)(__m128i)(X), (int)(C), (__v2di)(__m128i)(W), (__mmask8)(U))) +#define _mm_maskz_slli_epi64(U, X, C) ((__m128i)__builtin_ia32_psllqi128_mask ((__v2di)(__m128i)(X), (int)(C), (__v2di)(__m128i)_mm_setzero_si128 (), (__mmask8)(U))) +#define _mm256_ternarylogic_epi64(A, B, C, I) ((__m256i) __builtin_ia32_pternlogq256_mask ((__v4di)(__m256i)(A), (__v4di)(__m256i)(B), (__v4di)(__m256i)(C), (int)(I), (__mmask8)-1)) +#define _mm256_mask_ternarylogic_epi64(A, U, B, C, I) ((__m256i) __builtin_ia32_pternlogq256_mask ((__v4di)(__m256i)(A), (__v4di)(__m256i)(B), (__v4di)(__m256i)(C), (int)(I), (__mmask8)(U))) +#define _mm256_maskz_ternarylogic_epi64(U, A, B, C, I) 
((__m256i) __builtin_ia32_pternlogq256_maskz ((__v4di)(__m256i)(A), (__v4di)(__m256i)(B), (__v4di)(__m256i)(C), (int)(I), (__mmask8)(U))) +#define _mm256_ternarylogic_epi32(A, B, C, I) ((__m256i) __builtin_ia32_pternlogd256_mask ((__v8si)(__m256i)(A), (__v8si)(__m256i)(B), (__v8si)(__m256i)(C), (int)(I), (__mmask8)-1)) +#define _mm256_mask_ternarylogic_epi32(A, U, B, C, I) ((__m256i) __builtin_ia32_pternlogd256_mask ((__v8si)(__m256i)(A), (__v8si)(__m256i)(B), (__v8si)(__m256i)(C), (int)(I), (__mmask8)(U))) +#define _mm256_maskz_ternarylogic_epi32(U, A, B, C, I) ((__m256i) __builtin_ia32_pternlogd256_maskz ((__v8si)(__m256i)(A), (__v8si)(__m256i)(B), (__v8si)(__m256i)(C), (int)(I), (__mmask8)(U))) +#define _mm_ternarylogic_epi64(A, B, C, I) ((__m128i) __builtin_ia32_pternlogq128_mask ((__v2di)(__m128i)(A), (__v2di)(__m128i)(B), (__v2di)(__m128i)(C), (int)(I), (__mmask8)-1)) +#define _mm_mask_ternarylogic_epi64(A, U, B, C, I) ((__m128i) __builtin_ia32_pternlogq128_mask ((__v2di)(__m128i)(A), (__v2di)(__m128i)(B), (__v2di)(__m128i)(C), (int)(I), (__mmask8)(U))) +#define _mm_maskz_ternarylogic_epi64(U, A, B, C, I) ((__m128i) __builtin_ia32_pternlogq128_maskz ((__v2di)(__m128i)(A), (__v2di)(__m128i)(B), (__v2di)(__m128i)(C), (int)(I), (__mmask8)(U))) +#define _mm_ternarylogic_epi32(A, B, C, I) ((__m128i) __builtin_ia32_pternlogd128_mask ((__v4si)(__m128i)(A), (__v4si)(__m128i)(B), (__v4si)(__m128i)(C), (int)(I), (__mmask8)-1)) +#define _mm_mask_ternarylogic_epi32(A, U, B, C, I) ((__m128i) __builtin_ia32_pternlogd128_mask ((__v4si)(__m128i)(A), (__v4si)(__m128i)(B), (__v4si)(__m128i)(C), (int)(I), (__mmask8)(U))) +#define _mm_maskz_ternarylogic_epi32(U, A, B, C, I) ((__m128i) __builtin_ia32_pternlogd128_maskz ((__v4si)(__m128i)(A), (__v4si)(__m128i)(B), (__v4si)(__m128i)(C), (int)(I), (__mmask8)(U))) +#define _mm256_roundscale_ps(A, B) ((__m256) __builtin_ia32_rndscaleps_256_mask ((__v8sf)(__m256)(A), (int)(B), (__v8sf)(__m256)_mm256_setzero_ps (), (__mmask8)-1)) +#define _mm256_mask_roundscale_ps(W, U, A, B) ((__m256) __builtin_ia32_rndscaleps_256_mask ((__v8sf)(__m256)(A), (int)(B), (__v8sf)(__m256)(W), (__mmask8)(U))) +#define _mm256_maskz_roundscale_ps(U, A, B) ((__m256) __builtin_ia32_rndscaleps_256_mask ((__v8sf)(__m256)(A), (int)(B), (__v8sf)(__m256)_mm256_setzero_ps (), (__mmask8)(U))) +#define _mm256_roundscale_pd(A, B) ((__m256d) __builtin_ia32_rndscalepd_256_mask ((__v4df)(__m256d)(A), (int)(B), (__v4df)(__m256d)_mm256_setzero_pd (), (__mmask8)-1)) +#define _mm256_mask_roundscale_pd(W, U, A, B) ((__m256d) __builtin_ia32_rndscalepd_256_mask ((__v4df)(__m256d)(A), (int)(B), (__v4df)(__m256d)(W), (__mmask8)(U))) +#define _mm256_maskz_roundscale_pd(U, A, B) ((__m256d) __builtin_ia32_rndscalepd_256_mask ((__v4df)(__m256d)(A), (int)(B), (__v4df)(__m256d)_mm256_setzero_pd (), (__mmask8)(U))) +#define _mm_roundscale_ps(A, B) ((__m128) __builtin_ia32_rndscaleps_128_mask ((__v4sf)(__m128)(A), (int)(B), (__v4sf)(__m128)_mm_setzero_ps (), (__mmask8)-1)) +#define _mm_mask_roundscale_ps(W, U, A, B) ((__m128) __builtin_ia32_rndscaleps_128_mask ((__v4sf)(__m128)(A), (int)(B), (__v4sf)(__m128)(W), (__mmask8)(U))) +#define _mm_maskz_roundscale_ps(U, A, B) ((__m128) __builtin_ia32_rndscaleps_128_mask ((__v4sf)(__m128)(A), (int)(B), (__v4sf)(__m128)_mm_setzero_ps (), (__mmask8)(U))) +#define _mm_roundscale_pd(A, B) ((__m128d) __builtin_ia32_rndscalepd_128_mask ((__v2df)(__m128d)(A), (int)(B), (__v2df)(__m128d)_mm_setzero_pd (), (__mmask8)-1)) +#define _mm_mask_roundscale_pd(W, U, A, B) ((__m128d) 
__builtin_ia32_rndscalepd_128_mask ((__v2df)(__m128d)(A), (int)(B), (__v2df)(__m128d)(W), (__mmask8)(U))) +#define _mm_maskz_roundscale_pd(U, A, B) ((__m128d) __builtin_ia32_rndscalepd_128_mask ((__v2df)(__m128d)(A), (int)(B), (__v2df)(__m128d)_mm_setzero_pd (), (__mmask8)(U))) +#define _mm256_getmant_ps(X, B, C) ((__m256) __builtin_ia32_getmantps256_mask ((__v8sf)(__m256) (X), (int)(((C)<<2) | (B)), (__v8sf)(__m256)_mm256_setzero_ps (), (__mmask8)-1)) +#define _mm256_mask_getmant_ps(W, U, X, B, C) ((__m256) __builtin_ia32_getmantps256_mask ((__v8sf)(__m256) (X), (int)(((C)<<2) | (B)), (__v8sf)(__m256)(W), (__mmask8)(U))) +#define _mm256_maskz_getmant_ps(U, X, B, C) ((__m256) __builtin_ia32_getmantps256_mask ((__v8sf)(__m256) (X), (int)(((C)<<2) | (B)), (__v8sf)(__m256)_mm256_setzero_ps (), (__mmask8)(U))) +#define _mm_getmant_ps(X, B, C) ((__m128) __builtin_ia32_getmantps128_mask ((__v4sf)(__m128) (X), (int)(((C)<<2) | (B)), (__v4sf)(__m128)_mm_setzero_ps (), (__mmask8)-1)) +#define _mm_mask_getmant_ps(W, U, X, B, C) ((__m128) __builtin_ia32_getmantps128_mask ((__v4sf)(__m128) (X), (int)(((C)<<2) | (B)), (__v4sf)(__m128)(W), (__mmask8)(U))) +#define _mm_maskz_getmant_ps(U, X, B, C) ((__m128) __builtin_ia32_getmantps128_mask ((__v4sf)(__m128) (X), (int)(((C)<<2) | (B)), (__v4sf)(__m128)_mm_setzero_ps (), (__mmask8)(U))) +#define _mm256_getmant_pd(X, B, C) ((__m256d) __builtin_ia32_getmantpd256_mask ((__v4df)(__m256d) (X), (int)(((C)<<2) | (B)), (__v4df)(__m256d)_mm256_setzero_pd (), (__mmask8)-1)) +#define _mm256_mask_getmant_pd(W, U, X, B, C) ((__m256d) __builtin_ia32_getmantpd256_mask ((__v4df)(__m256d) (X), (int)(((C)<<2) | (B)), (__v4df)(__m256d)(W), (__mmask8)(U))) +#define _mm256_maskz_getmant_pd(U, X, B, C) ((__m256d) __builtin_ia32_getmantpd256_mask ((__v4df)(__m256d) (X), (int)(((C)<<2) | (B)), (__v4df)(__m256d)_mm256_setzero_pd (), (__mmask8)(U))) +#define _mm_getmant_pd(X, B, C) ((__m128d) __builtin_ia32_getmantpd128_mask ((__v2df)(__m128d) (X), (int)(((C)<<2) | (B)), (__v2df)(__m128d)_mm_setzero_pd (), (__mmask8)-1)) +#define _mm_mask_getmant_pd(W, U, X, B, C) ((__m128d) __builtin_ia32_getmantpd128_mask ((__v2df)(__m128d) (X), (int)(((C)<<2) | (B)), (__v2df)(__m128d)(W), (__mmask8)(U))) +#define _mm_maskz_getmant_pd(U, X, B, C) ((__m128d) __builtin_ia32_getmantpd128_mask ((__v2df)(__m128d) (X), (int)(((C)<<2) | (B)), (__v2df)(__m128d)_mm_setzero_pd (), (__mmask8)(U))) +#define _mm256_mmask_i32gather_ps(V1OLD, MASK, INDEX, ADDR, SCALE) (__m256) __builtin_ia32_gather3siv8sf ((__v8sf)(__m256) (V1OLD), (void const *) (ADDR), (__v8si)(__m256i) (INDEX), (__mmask8) (MASK), (int) (SCALE)) +#define _mm_mmask_i32gather_ps(V1OLD, MASK, INDEX, ADDR, SCALE) (__m128) __builtin_ia32_gather3siv4sf ((__v4sf)(__m128) (V1OLD), (void const *) (ADDR), (__v4si)(__m128i) (INDEX), (__mmask8) (MASK), (int) (SCALE)) +#define _mm256_mmask_i32gather_pd(V1OLD, MASK, INDEX, ADDR, SCALE) (__m256d) __builtin_ia32_gather3siv4df ((__v4df)(__m256d) (V1OLD), (void const *) (ADDR), (__v4si)(__m128i) (INDEX), (__mmask8) (MASK), (int) (SCALE)) +#define _mm_mmask_i32gather_pd(V1OLD, MASK, INDEX, ADDR, SCALE) (__m128d) __builtin_ia32_gather3siv2df ((__v2df)(__m128d) (V1OLD), (void const *) (ADDR), (__v4si)(__m128i) (INDEX), (__mmask8) (MASK), (int) (SCALE)) +#define _mm256_mmask_i64gather_ps(V1OLD, MASK, INDEX, ADDR, SCALE) (__m128) __builtin_ia32_gather3div8sf ((__v4sf)(__m128) (V1OLD), (void const *) (ADDR), (__v4di)(__m256i) (INDEX), (__mmask8) (MASK), (int) (SCALE)) +#define _mm_mmask_i64gather_ps(V1OLD, MASK, 
INDEX, ADDR, SCALE) (__m128) __builtin_ia32_gather3div4sf ((__v4sf)(__m128) (V1OLD), (void const *) (ADDR), (__v2di)(__m128i) (INDEX), (__mmask8) (MASK), (int) (SCALE)) +#define _mm256_mmask_i64gather_pd(V1OLD, MASK, INDEX, ADDR, SCALE) (__m256d) __builtin_ia32_gather3div4df ((__v4df)(__m256d) (V1OLD), (void const *) (ADDR), (__v4di)(__m256i) (INDEX), (__mmask8) (MASK), (int) (SCALE)) +#define _mm_mmask_i64gather_pd(V1OLD, MASK, INDEX, ADDR, SCALE) (__m128d) __builtin_ia32_gather3div2df ((__v2df)(__m128d) (V1OLD), (void const *) (ADDR), (__v2di)(__m128i) (INDEX), (__mmask8) (MASK), (int) (SCALE)) +#define _mm256_mmask_i32gather_epi32(V1OLD, MASK, INDEX, ADDR, SCALE) (__m256i) __builtin_ia32_gather3siv8si ((__v8si)(__m256i) (V1OLD), (void const *) (ADDR), (__v8si)(__m256i) (INDEX), (__mmask8) (MASK), (int) (SCALE)) +#define _mm_mmask_i32gather_epi32(V1OLD, MASK, INDEX, ADDR, SCALE) (__m128i) __builtin_ia32_gather3siv4si ((__v4si)(__m128i) (V1OLD), (void const *) (ADDR), (__v4si)(__m128i) (INDEX), (__mmask8) (MASK), (int) (SCALE)) +#define _mm256_mmask_i32gather_epi64(V1OLD, MASK, INDEX, ADDR, SCALE) (__m256i) __builtin_ia32_gather3siv4di ((__v4di)(__m256i) (V1OLD), (void const *) (ADDR), (__v4si)(__m128i) (INDEX), (__mmask8) (MASK), (int) (SCALE)) +#define _mm_mmask_i32gather_epi64(V1OLD, MASK, INDEX, ADDR, SCALE) (__m128i) __builtin_ia32_gather3siv2di ((__v2di)(__m128i) (V1OLD), (void const *) (ADDR), (__v4si)(__m128i) (INDEX), (__mmask8) (MASK), (int) (SCALE)) +#define _mm256_mmask_i64gather_epi32(V1OLD, MASK, INDEX, ADDR, SCALE) (__m128i) __builtin_ia32_gather3div8si ((__v4si)(__m128i) (V1OLD), (void const *) (ADDR), (__v4di)(__m256i) (INDEX), (__mmask8) (MASK), (int) (SCALE)) +#define _mm_mmask_i64gather_epi32(V1OLD, MASK, INDEX, ADDR, SCALE) (__m128i) __builtin_ia32_gather3div4si ((__v4si)(__m128i) (V1OLD), (void const *) (ADDR), (__v2di)(__m128i) (INDEX), (__mmask8) (MASK), (int) (SCALE)) +#define _mm256_mmask_i64gather_epi64(V1OLD, MASK, INDEX, ADDR, SCALE) (__m256i) __builtin_ia32_gather3div4di ((__v4di)(__m256i) (V1OLD), (void const *) (ADDR), (__v4di)(__m256i) (INDEX), (__mmask8) (MASK), (int) (SCALE)) +#define _mm_mmask_i64gather_epi64(V1OLD, MASK, INDEX, ADDR, SCALE) (__m128i) __builtin_ia32_gather3div2di ((__v2di)(__m128i) (V1OLD), (void const *) (ADDR), (__v2di)(__m128i) (INDEX), (__mmask8) (MASK), (int) (SCALE)) +#define _mm256_i32scatter_ps(ADDR, INDEX, V1, SCALE) __builtin_ia32_scattersiv8sf ((void *) (ADDR), (__mmask8)0xFF, (__v8si)(__m256i) (INDEX), (__v8sf)(__m256) (V1), (int) (SCALE)) +#define _mm256_mask_i32scatter_ps(ADDR, MASK, INDEX, V1, SCALE) __builtin_ia32_scattersiv8sf ((void *) (ADDR), (__mmask8) (MASK), (__v8si)(__m256i) (INDEX), (__v8sf)(__m256) (V1), (int) (SCALE)) +#define _mm_i32scatter_ps(ADDR, INDEX, V1, SCALE) __builtin_ia32_scattersiv4sf ((void *) (ADDR), (__mmask8)0xFF, (__v4si)(__m128i) (INDEX), (__v4sf)(__m128) (V1), (int) (SCALE)) +#define _mm_mask_i32scatter_ps(ADDR, MASK, INDEX, V1, SCALE) __builtin_ia32_scattersiv4sf ((void *) (ADDR), (__mmask8) (MASK), (__v4si)(__m128i) (INDEX), (__v4sf)(__m128) (V1), (int) (SCALE)) +#define _mm256_i32scatter_pd(ADDR, INDEX, V1, SCALE) __builtin_ia32_scattersiv4df ((void *) (ADDR), (__mmask8)0xFF, (__v4si)(__m128i) (INDEX), (__v4df)(__m256d) (V1), (int) (SCALE)) +#define _mm256_mask_i32scatter_pd(ADDR, MASK, INDEX, V1, SCALE) __builtin_ia32_scattersiv4df ((void *) (ADDR), (__mmask8) (MASK), (__v4si)(__m128i) (INDEX), (__v4df)(__m256d) (V1), (int) (SCALE)) +#define _mm_i32scatter_pd(ADDR, INDEX, V1, SCALE) 
__builtin_ia32_scattersiv2df ((void *) (ADDR), (__mmask8)0xFF, (__v4si)(__m128i) (INDEX), (__v2df)(__m128d) (V1), (int) (SCALE)) +#define _mm_mask_i32scatter_pd(ADDR, MASK, INDEX, V1, SCALE) __builtin_ia32_scattersiv2df ((void *) (ADDR), (__mmask8) (MASK), (__v4si)(__m128i) (INDEX), (__v2df)(__m128d) (V1), (int) (SCALE)) +#define _mm256_i64scatter_ps(ADDR, INDEX, V1, SCALE) __builtin_ia32_scatterdiv8sf ((void *) (ADDR), (__mmask8)0xFF, (__v4di)(__m256i) (INDEX), (__v4sf)(__m128) (V1), (int) (SCALE)) +#define _mm256_mask_i64scatter_ps(ADDR, MASK, INDEX, V1, SCALE) __builtin_ia32_scatterdiv8sf ((void *) (ADDR), (__mmask8) (MASK), (__v4di)(__m256i) (INDEX), (__v4sf)(__m128) (V1), (int) (SCALE)) +#define _mm_i64scatter_ps(ADDR, INDEX, V1, SCALE) __builtin_ia32_scatterdiv4sf ((void *) (ADDR), (__mmask8)0xFF, (__v2di)(__m128i) (INDEX), (__v4sf)(__m128) (V1), (int) (SCALE)) +#define _mm_mask_i64scatter_ps(ADDR, MASK, INDEX, V1, SCALE) __builtin_ia32_scatterdiv4sf ((void *) (ADDR), (__mmask8) (MASK), (__v2di)(__m128i) (INDEX), (__v4sf)(__m128) (V1), (int) (SCALE)) +#define _mm256_i64scatter_pd(ADDR, INDEX, V1, SCALE) __builtin_ia32_scatterdiv4df ((void *) (ADDR), (__mmask8)0xFF, (__v4di)(__m256i) (INDEX), (__v4df)(__m256d) (V1), (int) (SCALE)) +#define _mm256_mask_i64scatter_pd(ADDR, MASK, INDEX, V1, SCALE) __builtin_ia32_scatterdiv4df ((void *) (ADDR), (__mmask8) (MASK), (__v4di)(__m256i) (INDEX), (__v4df)(__m256d) (V1), (int) (SCALE)) +#define _mm_i64scatter_pd(ADDR, INDEX, V1, SCALE) __builtin_ia32_scatterdiv2df ((void *) (ADDR), (__mmask8)0xFF, (__v2di)(__m128i) (INDEX), (__v2df)(__m128d) (V1), (int) (SCALE)) +#define _mm_mask_i64scatter_pd(ADDR, MASK, INDEX, V1, SCALE) __builtin_ia32_scatterdiv2df ((void *) (ADDR), (__mmask8) (MASK), (__v2di)(__m128i) (INDEX), (__v2df)(__m128d) (V1), (int) (SCALE)) +#define _mm256_i32scatter_epi32(ADDR, INDEX, V1, SCALE) __builtin_ia32_scattersiv8si ((void *) (ADDR), (__mmask8)0xFF, (__v8si)(__m256i) (INDEX), (__v8si)(__m256i) (V1), (int) (SCALE)) +#define _mm256_mask_i32scatter_epi32(ADDR, MASK, INDEX, V1, SCALE) __builtin_ia32_scattersiv8si ((void *) (ADDR), (__mmask8) (MASK), (__v8si)(__m256i) (INDEX), (__v8si)(__m256i) (V1), (int) (SCALE)) +#define _mm_i32scatter_epi32(ADDR, INDEX, V1, SCALE) __builtin_ia32_scattersiv4si ((void *) (ADDR), (__mmask8)0xFF, (__v4si)(__m128i) (INDEX), (__v4si)(__m128i) (V1), (int) (SCALE)) +#define _mm_mask_i32scatter_epi32(ADDR, MASK, INDEX, V1, SCALE) __builtin_ia32_scattersiv4si ((void *) (ADDR), (__mmask8) (MASK), (__v4si)(__m128i) (INDEX), (__v4si)(__m128i) (V1), (int) (SCALE)) +#define _mm256_i32scatter_epi64(ADDR, INDEX, V1, SCALE) __builtin_ia32_scattersiv4di ((void *) (ADDR), (__mmask8)0xFF, (__v4si)(__m128i) (INDEX), (__v4di)(__m256i) (V1), (int) (SCALE)) +#define _mm256_mask_i32scatter_epi64(ADDR, MASK, INDEX, V1, SCALE) __builtin_ia32_scattersiv4di ((void *) (ADDR), (__mmask8) (MASK), (__v4si)(__m128i) (INDEX), (__v4di)(__m256i) (V1), (int) (SCALE)) +#define _mm_i32scatter_epi64(ADDR, INDEX, V1, SCALE) __builtin_ia32_scattersiv2di ((void *) (ADDR), (__mmask8)0xFF, (__v4si)(__m128i) (INDEX), (__v2di)(__m128i) (V1), (int) (SCALE)) +#define _mm_mask_i32scatter_epi64(ADDR, MASK, INDEX, V1, SCALE) __builtin_ia32_scattersiv2di ((void *) (ADDR), (__mmask8) (MASK), (__v4si)(__m128i) (INDEX), (__v2di)(__m128i) (V1), (int) (SCALE)) +#define _mm256_i64scatter_epi32(ADDR, INDEX, V1, SCALE) __builtin_ia32_scatterdiv8si ((void *) (ADDR), (__mmask8)0xFF, (__v4di)(__m256i) (INDEX), (__v4si)(__m128i) (V1), (int) (SCALE)) +#define 
_mm256_mask_i64scatter_epi32(ADDR, MASK, INDEX, V1, SCALE) __builtin_ia32_scatterdiv8si ((void *) (ADDR), (__mmask8) (MASK), (__v4di)(__m256i) (INDEX), (__v4si)(__m128i) (V1), (int) (SCALE)) +#define _mm_i64scatter_epi32(ADDR, INDEX, V1, SCALE) __builtin_ia32_scatterdiv4si ((void *) (ADDR), (__mmask8)0xFF, (__v2di)(__m128i) (INDEX), (__v4si)(__m128i) (V1), (int) (SCALE)) +#define _mm_mask_i64scatter_epi32(ADDR, MASK, INDEX, V1, SCALE) __builtin_ia32_scatterdiv4si ((void *) (ADDR), (__mmask8) (MASK), (__v2di)(__m128i) (INDEX), (__v4si)(__m128i) (V1), (int) (SCALE)) +#define _mm256_i64scatter_epi64(ADDR, INDEX, V1, SCALE) __builtin_ia32_scatterdiv4di ((void *) (ADDR), (__mmask8)0xFF, (__v4di)(__m256i) (INDEX), (__v4di)(__m256i) (V1), (int) (SCALE)) +#define _mm256_mask_i64scatter_epi64(ADDR, MASK, INDEX, V1, SCALE) __builtin_ia32_scatterdiv4di ((void *) (ADDR), (__mmask8) (MASK), (__v4di)(__m256i) (INDEX), (__v4di)(__m256i) (V1), (int) (SCALE)) +#define _mm_i64scatter_epi64(ADDR, INDEX, V1, SCALE) __builtin_ia32_scatterdiv2di ((void *) (ADDR), (__mmask8)0xFF, (__v2di)(__m128i) (INDEX), (__v2di)(__m128i) (V1), (int) (SCALE)) +#define _mm_mask_i64scatter_epi64(ADDR, MASK, INDEX, V1, SCALE) __builtin_ia32_scatterdiv2di ((void *) (ADDR), (__mmask8) (MASK), (__v2di)(__m128i) (INDEX), (__v2di)(__m128i) (V1), (int) (SCALE)) +#define _mm256_mask_shuffle_epi32(W, U, X, C) ((__m256i) __builtin_ia32_pshufd256_mask ((__v8si)(__m256i)(X), (int)(C), (__v8si)(__m256i)(W), (__mmask8)(U))) +#define _mm256_maskz_shuffle_epi32(U, X, C) ((__m256i) __builtin_ia32_pshufd256_mask ((__v8si)(__m256i)(X), (int)(C), (__v8si)(__m256i) _mm256_setzero_si256 (), (__mmask8)(U))) +#define _mm_mask_shuffle_epi32(W, U, X, C) ((__m128i) __builtin_ia32_pshufd128_mask ((__v4si)(__m128i)(X), (int)(C), (__v4si)(__m128i)(W), (__mmask8)(U))) +#define _mm_maskz_shuffle_epi32(U, X, C) ((__m128i) __builtin_ia32_pshufd128_mask ((__v4si)(__m128i)(X), (int)(C), (__v4si)(__m128i)_mm_setzero_si128 (), (__mmask8)(U))) +#define _mm256_rol_epi64(A, B) ((__m256i)__builtin_ia32_prolq256_mask ((__v4di)(__m256i)(A), (int)(B), (__v4di)(__m256i)_mm256_setzero_si256 (), (__mmask8)-1)) +#define _mm256_mask_rol_epi64(W, U, A, B) ((__m256i)__builtin_ia32_prolq256_mask ((__v4di)(__m256i)(A), (int)(B), (__v4di)(__m256i)(W), (__mmask8)(U))) +#define _mm256_maskz_rol_epi64(U, A, B) ((__m256i)__builtin_ia32_prolq256_mask ((__v4di)(__m256i)(A), (int)(B), (__v4di)(__m256i)_mm256_setzero_si256 (), (__mmask8)(U))) +#define _mm_rol_epi64(A, B) ((__m128i)__builtin_ia32_prolq128_mask ((__v2di)(__m128i)(A), (int)(B), (__v2di)(__m128i)_mm_setzero_si128 (), (__mmask8)-1)) +#define _mm_mask_rol_epi64(W, U, A, B) ((__m128i)__builtin_ia32_prolq128_mask ((__v2di)(__m128i)(A), (int)(B), (__v2di)(__m128i)(W), (__mmask8)(U))) +#define _mm_maskz_rol_epi64(U, A, B) ((__m128i)__builtin_ia32_prolq128_mask ((__v2di)(__m128i)(A), (int)(B), (__v2di)(__m128i)_mm_setzero_si128 (), (__mmask8)(U))) +#define _mm256_ror_epi64(A, B) ((__m256i)__builtin_ia32_prorq256_mask ((__v4di)(__m256i)(A), (int)(B), (__v4di)(__m256i)_mm256_setzero_si256 (), (__mmask8)-1)) +#define _mm256_mask_ror_epi64(W, U, A, B) ((__m256i)__builtin_ia32_prorq256_mask ((__v4di)(__m256i)(A), (int)(B), (__v4di)(__m256i)(W), (__mmask8)(U))) +#define _mm256_maskz_ror_epi64(U, A, B) ((__m256i)__builtin_ia32_prorq256_mask ((__v4di)(__m256i)(A), (int)(B), (__v4di)(__m256i)_mm256_setzero_si256 (), (__mmask8)(U))) +#define _mm_ror_epi64(A, B) ((__m128i)__builtin_ia32_prorq128_mask ((__v2di)(__m128i)(A), (int)(B), 
(__v2di)(__m128i)_mm_setzero_si128 (), (__mmask8)-1)) +#define _mm_mask_ror_epi64(W, U, A, B) ((__m128i)__builtin_ia32_prorq128_mask ((__v2di)(__m128i)(A), (int)(B), (__v2di)(__m128i)(W), (__mmask8)(U))) +#define _mm_maskz_ror_epi64(U, A, B) ((__m128i)__builtin_ia32_prorq128_mask ((__v2di)(__m128i)(A), (int)(B), (__v2di)(__m128i)_mm_setzero_si128 (), (__mmask8)(U))) +#define _mm256_rol_epi32(A, B) ((__m256i)__builtin_ia32_prold256_mask ((__v8si)(__m256i)(A), (int)(B), (__v8si)(__m256i)_mm256_setzero_si256 (), (__mmask8)-1)) +#define _mm256_mask_rol_epi32(W, U, A, B) ((__m256i)__builtin_ia32_prold256_mask ((__v8si)(__m256i)(A), (int)(B), (__v8si)(__m256i)(W), (__mmask8)(U))) +#define _mm256_maskz_rol_epi32(U, A, B) ((__m256i)__builtin_ia32_prold256_mask ((__v8si)(__m256i)(A), (int)(B), (__v8si)(__m256i)_mm256_setzero_si256 (), (__mmask8)(U))) +#define _mm_rol_epi32(A, B) ((__m128i)__builtin_ia32_prold128_mask ((__v4si)(__m128i)(A), (int)(B), (__v4si)(__m128i)_mm_setzero_si128 (), (__mmask8)-1)) +#define _mm_mask_rol_epi32(W, U, A, B) ((__m128i)__builtin_ia32_prold128_mask ((__v4si)(__m128i)(A), (int)(B), (__v4si)(__m128i)(W), (__mmask8)(U))) +#define _mm_maskz_rol_epi32(U, A, B) ((__m128i)__builtin_ia32_prold128_mask ((__v4si)(__m128i)(A), (int)(B), (__v4si)(__m128i)_mm_setzero_si128 (), (__mmask8)(U))) +#define _mm256_ror_epi32(A, B) ((__m256i)__builtin_ia32_prord256_mask ((__v8si)(__m256i)(A), (int)(B), (__v8si)(__m256i)_mm256_setzero_si256 (), (__mmask8)-1)) +#define _mm256_mask_ror_epi32(W, U, A, B) ((__m256i)__builtin_ia32_prord256_mask ((__v8si)(__m256i)(A), (int)(B), (__v8si)(__m256i)(W), (__mmask8)(U))) +#define _mm256_maskz_ror_epi32(U, A, B) ((__m256i)__builtin_ia32_prord256_mask ((__v8si)(__m256i)(A), (int)(B), (__v8si)(__m256i) _mm256_setzero_si256 (), (__mmask8)(U))) +#define _mm_ror_epi32(A, B) ((__m128i)__builtin_ia32_prord128_mask ((__v4si)(__m128i)(A), (int)(B), (__v4si)(__m128i)_mm_setzero_si128 (), (__mmask8)-1)) +#define _mm_mask_ror_epi32(W, U, A, B) ((__m128i)__builtin_ia32_prord128_mask ((__v4si)(__m128i)(A), (int)(B), (__v4si)(__m128i)(W), (__mmask8)(U))) +#define _mm_maskz_ror_epi32(U, A, B) ((__m128i)__builtin_ia32_prord128_mask ((__v4si)(__m128i)(A), (int)(B), (__v4si)(__m128i)_mm_setzero_si128 (), (__mmask8)(U))) +#define _mm256_alignr_epi32(X, Y, C) ((__m256i)__builtin_ia32_alignd256_mask ((__v8si)(__m256i)(X), (__v8si)(__m256i)(Y), (int)(C), (__v8si)(__m256i)(X), (__mmask8)-1)) +#define _mm256_mask_alignr_epi32(W, U, X, Y, C) ((__m256i)__builtin_ia32_alignd256_mask ((__v8si)(__m256i)(X), (__v8si)(__m256i)(Y), (int)(C), (__v8si)(__m256i)(W), (__mmask8)(U))) +#define _mm256_maskz_alignr_epi32(U, X, Y, C) ((__m256i)__builtin_ia32_alignd256_mask ((__v8si)(__m256i)(X), (__v8si)(__m256i)(Y), (int)(C), (__v8si)(__m256i)_mm256_setzero_si256 (), (__mmask8)(U))) +#define _mm256_alignr_epi64(X, Y, C) ((__m256i)__builtin_ia32_alignq256_mask ((__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)(C), (__v4di)(__m256i)(X), (__mmask8)-1)) +#define _mm256_mask_alignr_epi64(W, U, X, Y, C) ((__m256i)__builtin_ia32_alignq256_mask ((__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)(C), (__v4di)(__m256i)(W), (__mmask8)(U))) +#define _mm256_maskz_alignr_epi64(U, X, Y, C) ((__m256i)__builtin_ia32_alignq256_mask ((__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)(C), (__v4di)(__m256i)_mm256_setzero_si256 (), (__mmask8)(U))) +#define _mm_alignr_epi32(X, Y, C) ((__m128i)__builtin_ia32_alignd128_mask ((__v4si)(__m128i)(X), (__v4si)(__m128i)(Y), (int)(C), (__v4si)(__m128i)(X), (__mmask8)-1)) 
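/* Illustrative sketch (editorial, not part of the patch): the rol/ror
   macros above take an immediate rotate count (VPROLD/VPRORQ family);
   the mask_ forms merge unselected lanes from W, the maskz_ forms zero
   them.  Assumes an AVX-512VL target; `rot_sketch` is hypothetical. */
#include <immintrin.h>
static __m256i rot_sketch(__m256i v, __m256i w, __mmask8 m) {
  __m256i r = _mm256_rol_epi32(v, 8);       /* rotate all 8 lanes left by 8 */
  return _mm256_mask_rol_epi32(w, m, r, 8); /* lanes with m bit 0 come from w */
}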
+#define _mm_mask_alignr_epi32(W, U, X, Y, C) ((__m128i)__builtin_ia32_alignd128_mask ((__v4si)(__m128i)(X), (__v4si)(__m128i)(Y), (int)(C), (__v4si)(__m128i)(W), (__mmask8)(U)))
+#define _mm_maskz_alignr_epi32(U, X, Y, C) ((__m128i)__builtin_ia32_alignd128_mask ((__v4si)(__m128i)(X), (__v4si)(__m128i)(Y), (int)(C), (__v4si)(__m128i)_mm_setzero_si128 (), (__mmask8)(U)))
+#define _mm_alignr_epi64(X, Y, C) ((__m128i)__builtin_ia32_alignq128_mask ((__v2di)(__m128i)(X), (__v2di)(__m128i)(Y), (int)(C), (__v2di)(__m128i)(X), (__mmask8)-1))
+#define _mm_mask_alignr_epi64(W, U, X, Y, C) ((__m128i)__builtin_ia32_alignq128_mask ((__v2di)(__m128i)(X), (__v2di)(__m128i)(Y), (int)(C), (__v2di)(__m128i)(W), (__mmask8)(U)))
+#define _mm_maskz_alignr_epi64(U, X, Y, C) ((__m128i)__builtin_ia32_alignq128_mask ((__v2di)(__m128i)(X), (__v2di)(__m128i)(Y), (int)(C), (__v2di)(__m128i)_mm_setzero_si128 (), (__mmask8)(U)))
+#define _mm_mask_cvtps_ph(W, U, A, I) ((__m128i) __builtin_ia32_vcvtps2ph_mask ((__v4sf)(__m128) (A), (int) (I), (__v8hi)(__m128i) (W), (__mmask8) (U)))
+#define _mm_maskz_cvtps_ph(U, A, I) ((__m128i) __builtin_ia32_vcvtps2ph_mask ((__v4sf)(__m128) (A), (int) (I), (__v8hi)(__m128i) _mm_setzero_si128 (), (__mmask8) (U)))
+#define _mm256_mask_cvtps_ph(W, U, A, I) ((__m128i) __builtin_ia32_vcvtps2ph256_mask ((__v8sf)(__m256) (A), (int) (I), (__v8hi)(__m128i) (W), (__mmask8) (U)))
+#define _mm256_maskz_cvtps_ph(U, A, I) ((__m128i) __builtin_ia32_vcvtps2ph256_mask ((__v8sf)(__m256) (A), (int) (I), (__v8hi)(__m128i) _mm_setzero_si128 (), (__mmask8) (U)))
+#define _mm256_mask_srai_epi32(W, U, A, B) ((__m256i) __builtin_ia32_psradi256_mask ((__v8si)(__m256i)(A), (int)(B), (__v8si)(__m256i)(W), (__mmask8)(U)))
+#define _mm256_maskz_srai_epi32(U, A, B) ((__m256i) __builtin_ia32_psradi256_mask ((__v8si)(__m256i)(A), (int)(B), (__v8si)_mm256_setzero_si256 (), (__mmask8)(U)))
+#define _mm_mask_srai_epi32(W, U, A, B) ((__m128i) __builtin_ia32_psradi128_mask ((__v4si)(__m128i)(A), (int)(B), (__v4si)(__m128i)(W), (__mmask8)(U)))
+#define _mm_maskz_srai_epi32(U, A, B) ((__m128i) __builtin_ia32_psradi128_mask ((__v4si)(__m128i)(A), (int)(B), (__v4si)_mm_setzero_si128 (), (__mmask8)(U)))
+#define _mm256_srai_epi64(A, B) ((__m256i) __builtin_ia32_psraqi256_mask ((__v4di)(__m256i)(A), (int)(B), (__v4di)_mm256_setzero_si256 (), (__mmask8)-1))
+#define _mm256_mask_srai_epi64(W, U, A, B) ((__m256i) __builtin_ia32_psraqi256_mask ((__v4di)(__m256i)(A), (int)(B), (__v4di)(__m256i)(W), (__mmask8)(U)))
+#define _mm256_maskz_srai_epi64(U, A, B) ((__m256i) __builtin_ia32_psraqi256_mask ((__v4di)(__m256i)(A), (int)(B), (__v4di)_mm256_setzero_si256 (), (__mmask8)(U)))
+#define _mm_srai_epi64(A, B) ((__m128i) __builtin_ia32_psraqi128_mask ((__v2di)(__m128i)(A), (int)(B), (__v2di)_mm_setzero_si128 (), (__mmask8)-1))
+#define _mm_mask_srai_epi64(W, U, A, B) ((__m128i) __builtin_ia32_psraqi128_mask ((__v2di)(__m128i)(A), (int)(B), (__v2di)(__m128i)(W), (__mmask8)(U)))
+#define _mm_maskz_srai_epi64(U, A, B) ((__m128i) __builtin_ia32_psraqi128_mask ((__v2di)(__m128i)(A), (int)(B), (__v2di)_mm_setzero_si128 (), (__mmask8)(U)))
+#define _mm256_mask_permutex_pd(W, U, A, B) ((__m256d) __builtin_ia32_permdf256_mask ((__v4df)(__m256d)(A), (int)(B), (__v4df)(__m256d)(W), (__mmask8)(U)))
+#define _mm256_maskz_permutex_pd(U, A, B) ((__m256d) __builtin_ia32_permdf256_mask ((__v4df)(__m256d)(A), (int)(B), (__v4df)(__m256d)_mm256_setzero_pd (), (__mmask8)(U)))
+#define _mm256_mask_permute_pd(W, U, X, C) ((__m256d)
__builtin_ia32_vpermilpd256_mask ((__v4df)(__m256d)(X), (int)(C), (__v4df)(__m256d)(W), (__mmask8)(U))) +#define _mm256_maskz_permute_pd(U, X, C) ((__m256d) __builtin_ia32_vpermilpd256_mask ((__v4df)(__m256d)(X), (int)(C), (__v4df)(__m256d)_mm256_setzero_pd (), (__mmask8)(U))) +#define _mm256_mask_permute_ps(W, U, X, C) ((__m256) __builtin_ia32_vpermilps256_mask ((__v8sf)(__m256)(X), (int)(C), (__v8sf)(__m256)(W), (__mmask8)(U))) +#define _mm256_maskz_permute_ps(U, X, C) ((__m256) __builtin_ia32_vpermilps256_mask ((__v8sf)(__m256)(X), (int)(C), (__v8sf)(__m256)_mm256_setzero_ps (), (__mmask8)(U))) +#define _mm_mask_permute_pd(W, U, X, C) ((__m128d) __builtin_ia32_vpermilpd_mask ((__v2df)(__m128d)(X), (int)(C), (__v2df)(__m128d)(W), (__mmask8)(U))) +#define _mm_maskz_permute_pd(U, X, C) ((__m128d) __builtin_ia32_vpermilpd_mask ((__v2df)(__m128d)(X), (int)(C), (__v2df)(__m128d)_mm_setzero_pd (), (__mmask8)(U))) +#define _mm_mask_permute_ps(W, U, X, C) ((__m128) __builtin_ia32_vpermilps_mask ((__v4sf)(__m128)(X), (int)(C), (__v4sf)(__m128)(W), (__mmask8)(U))) +#define _mm_maskz_permute_ps(U, X, C) ((__m128) __builtin_ia32_vpermilps_mask ((__v4sf)(__m128)(X), (int)(C), (__v4sf)(__m128)_mm_setzero_ps (), (__mmask8)(U))) +#define _mm256_mask_blend_pd(__U, __A, __W) ((__m256d) __builtin_ia32_blendmpd_256_mask ((__v4df) (__A), (__v4df) (__W), (__mmask8) (__U))) +#define _mm256_mask_blend_ps(__U, __A, __W) ((__m256) __builtin_ia32_blendmps_256_mask ((__v8sf) (__A), (__v8sf) (__W), (__mmask8) (__U))) +#define _mm256_mask_blend_epi64(__U, __A, __W) ((__m256i) __builtin_ia32_blendmq_256_mask ((__v4di) (__A), (__v4di) (__W), (__mmask8) (__U))) +#define _mm256_mask_blend_epi32(__U, __A, __W) ((__m256i) __builtin_ia32_blendmd_256_mask ((__v8si) (__A), (__v8si) (__W), (__mmask8) (__U))) +#define _mm_mask_blend_pd(__U, __A, __W) ((__m128d) __builtin_ia32_blendmpd_128_mask ((__v2df) (__A), (__v2df) (__W), (__mmask8) (__U))) +#define _mm_mask_blend_ps(__U, __A, __W) ((__m128) __builtin_ia32_blendmps_128_mask ((__v4sf) (__A), (__v4sf) (__W), (__mmask8) (__U))) +#define _mm_mask_blend_epi64(__U, __A, __W) ((__m128i) __builtin_ia32_blendmq_128_mask ((__v2di) (__A), (__v2di) (__W), (__mmask8) (__U))) +#define _mm_mask_blend_epi32(__U, __A, __W) ((__m128i) __builtin_ia32_blendmd_128_mask ((__v4si) (__A), (__v4si) (__W), (__mmask8) (__U))) +#define _mm256_cmp_epu32_mask(X, Y, P) ((__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si)(__m256i)(X), (__v8si)(__m256i)(Y), (int)(P), (__mmask8)-1)) +#define _mm256_cmp_epi64_mask(X, Y, P) ((__mmask8) __builtin_ia32_cmpq256_mask ((__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)(P), (__mmask8)-1)) +#define _mm256_cmp_epi32_mask(X, Y, P) ((__mmask8) __builtin_ia32_cmpd256_mask ((__v8si)(__m256i)(X), (__v8si)(__m256i)(Y), (int)(P), (__mmask8)-1)) +#define _mm256_cmp_epu64_mask(X, Y, P) ((__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)(P), (__mmask8)-1)) +#define _mm256_cmp_pd_mask(X, Y, P) ((__mmask8) __builtin_ia32_cmppd256_mask ((__v4df)(__m256d)(X), (__v4df)(__m256d)(Y), (int)(P), (__mmask8)-1)) +#define _mm256_cmp_ps_mask(X, Y, P) ((__mmask8) __builtin_ia32_cmpps256_mask ((__v8sf)(__m256)(X), (__v8sf)(__m256)(Y), (int)(P), (__mmask8)-1)) +#define _mm256_mask_cmp_epi64_mask(M, X, Y, P) ((__mmask8) __builtin_ia32_cmpq256_mask ((__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)(P), (__mmask8)(M))) +#define _mm256_mask_cmp_epi32_mask(M, X, Y, P) ((__mmask8) __builtin_ia32_cmpd256_mask ((__v8si)(__m256i)(X), (__v8si)(__m256i)(Y), (int)(P), 
(__mmask8)(M)))
+#define _mm256_mask_cmp_epu64_mask(M, X, Y, P) ((__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)(P), (__mmask8)(M)))
+#define _mm256_mask_cmp_epu32_mask(M, X, Y, P) ((__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si)(__m256i)(X), (__v8si)(__m256i)(Y), (int)(P), (__mmask8)(M)))
+#define _mm256_mask_cmp_pd_mask(M, X, Y, P) ((__mmask8) __builtin_ia32_cmppd256_mask ((__v4df)(__m256d)(X), (__v4df)(__m256d)(Y), (int)(P), (__mmask8)(M)))
+#define _mm256_mask_cmp_ps_mask(M, X, Y, P) ((__mmask8) __builtin_ia32_cmpps256_mask ((__v8sf)(__m256)(X), (__v8sf)(__m256)(Y), (int)(P), (__mmask8)(M)))
+#define _mm_cmp_epi64_mask(X, Y, P) ((__mmask8) __builtin_ia32_cmpq128_mask ((__v2di)(__m128i)(X), (__v2di)(__m128i)(Y), (int)(P), (__mmask8)-1))
+#define _mm_cmp_epi32_mask(X, Y, P) ((__mmask8) __builtin_ia32_cmpd128_mask ((__v4si)(__m128i)(X), (__v4si)(__m128i)(Y), (int)(P), (__mmask8)-1))
+#define _mm_cmp_epu64_mask(X, Y, P) ((__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di)(__m128i)(X), (__v2di)(__m128i)(Y), (int)(P), (__mmask8)-1))
+#define _mm_cmp_epu32_mask(X, Y, P) ((__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si)(__m128i)(X), (__v4si)(__m128i)(Y), (int)(P), (__mmask8)-1))
+#define _mm_cmp_pd_mask(X, Y, P) ((__mmask8) __builtin_ia32_cmppd128_mask ((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (int)(P), (__mmask8)-1))
+#define _mm_cmp_ps_mask(X, Y, P) ((__mmask8) __builtin_ia32_cmpps128_mask ((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (int)(P), (__mmask8)-1))
+#define _mm_mask_cmp_epi64_mask(M, X, Y, P) ((__mmask8) __builtin_ia32_cmpq128_mask ((__v2di)(__m128i)(X), (__v2di)(__m128i)(Y), (int)(P), (__mmask8)(M)))
+#define _mm_mask_cmp_epi32_mask(M, X, Y, P) ((__mmask8) __builtin_ia32_cmpd128_mask ((__v4si)(__m128i)(X), (__v4si)(__m128i)(Y), (int)(P), (__mmask8)(M)))
+#define _mm_mask_cmp_epu64_mask(M, X, Y, P) ((__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di)(__m128i)(X), (__v2di)(__m128i)(Y), (int)(P), (__mmask8)(M)))
+#define _mm_mask_cmp_epu32_mask(M, X, Y, P) ((__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si)(__m128i)(X), (__v4si)(__m128i)(Y), (int)(P), (__mmask8)(M)))
+#define _mm_mask_cmp_pd_mask(M, X, Y, P) ((__mmask8) __builtin_ia32_cmppd128_mask ((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (int)(P), (__mmask8)(M)))
+#define _mm_mask_cmp_ps_mask(M, X, Y, P) ((__mmask8) __builtin_ia32_cmpps128_mask ((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (int)(P), (__mmask8)(M)))
 #endif
-
-#define _mm256_permutexvar_ps(A, B) _mm256_permutevar8x32_ps((B), (A))
-
+#define _mm256_permutexvar_ps(A, B) _mm256_permutevar8x32_ps ((B), (A))
+#define _mm256_mask_cvt_roundps_ph(A, B, C, D) _mm256_mask_cvtps_ph ((A), (B), (C), (D))
+#define _mm256_maskz_cvt_roundps_ph(A, B, C) _mm256_maskz_cvtps_ph ((A), (B), (C))
+#define _mm_mask_cvt_roundps_ph(A, B, C, D) _mm_mask_cvtps_ph ((A), (B), (C), (D))
+#define _mm_maskz_cvt_roundps_ph(A, B, C) _mm_maskz_cvtps_ph ((A), (B), (C))
 #ifdef __DISABLE_AVX512VL__
 #undef __DISABLE_AVX512VL__
 #pragma GCC pop_options
-#endif /* __DISABLE_AVX512VL__ */
-
-#endif /* _AVX512VLINTRIN_H_INCLUDED */
+#endif
+#endif
+#endif
diff --git a/third_party/intel/avx512vnniintrin.internal.h b/third_party/intel/avx512vnniintrin.internal.h
index 3706fda4f..e9e04ea73 100644
--- a/third_party/intel/avx512vnniintrin.internal.h
+++ b/third_party/intel/avx512vnniintrin.internal.h
@@ -1,87 +1,108 @@
+/* clang-format off */
+#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0)
 #ifndef _IMMINTRIN_H_INCLUDED
 #error "Never use <avx512vnniintrin.h> directly; include <immintrin.h>
instead." #endif - #ifndef __AVX512VNNIINTRIN_H_INCLUDED #define __AVX512VNNIINTRIN_H_INCLUDED - #if !defined(__AVX512VNNI__) #pragma GCC push_options #pragma GCC target("avx512vnni") #define __DISABLE_AVX512VNNI__ -#endif /* __AVX512VNNI__ */ - -__funline __m512i _mm512_dpbusd_epi32(__m512i __A, __m512i __B, __m512i __C) { - return (__m512i)__builtin_ia32_vpdpbusd_v16si((__v16si)__A, (__v16si)__B, - (__v16si)__C); +#endif +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_dpbusd_epi32 (__m512i __A, __m512i __B, __m512i __C) +{ + return (__m512i) __builtin_ia32_vpdpbusd_v16si ((__v16si)__A, (__v16si) __B, + (__v16si) __C); } - -__funline __m512i _mm512_mask_dpbusd_epi32(__m512i __A, __mmask16 __B, - __m512i __C, __m512i __D) { - return (__m512i)__builtin_ia32_vpdpbusd_v16si_mask( - (__v16si)__A, (__v16si)__C, (__v16si)__D, (__mmask16)__B); +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_dpbusd_epi32 (__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) +{ + return (__m512i)__builtin_ia32_vpdpbusd_v16si_mask ((__v16si)__A, + (__v16si) __C, (__v16si) __D, (__mmask16)__B); } - -__funline __m512i _mm512_maskz_dpbusd_epi32(__mmask16 __A, __m512i __B, - __m512i __C, __m512i __D) { - return (__m512i)__builtin_ia32_vpdpbusd_v16si_maskz( - (__v16si)__B, (__v16si)__C, (__v16si)__D, (__mmask16)__A); +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_dpbusd_epi32 (__mmask16 __A, __m512i __B, __m512i __C, + __m512i __D) +{ + return (__m512i)__builtin_ia32_vpdpbusd_v16si_maskz ((__v16si)__B, + (__v16si) __C, (__v16si) __D, (__mmask16)__A); } - -__funline __m512i _mm512_dpbusds_epi32(__m512i __A, __m512i __B, __m512i __C) { - return (__m512i)__builtin_ia32_vpdpbusds_v16si((__v16si)__A, (__v16si)__B, - (__v16si)__C); +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_dpbusds_epi32 (__m512i __A, __m512i __B, __m512i __C) +{ + return (__m512i) __builtin_ia32_vpdpbusds_v16si ((__v16si)__A, (__v16si) __B, + (__v16si) __C); } - -__funline __m512i _mm512_mask_dpbusds_epi32(__m512i __A, __mmask16 __B, - __m512i __C, __m512i __D) { - return (__m512i)__builtin_ia32_vpdpbusds_v16si_mask( - (__v16si)__A, (__v16si)__C, (__v16si)__D, (__mmask16)__B); +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_dpbusds_epi32 (__m512i __A, __mmask16 __B, __m512i __C, + __m512i __D) +{ + return (__m512i)__builtin_ia32_vpdpbusds_v16si_mask ((__v16si)__A, + (__v16si) __C, (__v16si) __D, (__mmask16)__B); } - -__funline __m512i _mm512_maskz_dpbusds_epi32(__mmask16 __A, __m512i __B, - __m512i __C, __m512i __D) { - return (__m512i)__builtin_ia32_vpdpbusds_v16si_maskz( - (__v16si)__B, (__v16si)__C, (__v16si)__D, (__mmask16)__A); +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_dpbusds_epi32 (__mmask16 __A, __m512i __B, __m512i __C, + __m512i __D) +{ + return (__m512i)__builtin_ia32_vpdpbusds_v16si_maskz ((__v16si)__B, + (__v16si) __C, (__v16si) __D, (__mmask16)__A); } - -__funline __m512i _mm512_dpwssd_epi32(__m512i __A, __m512i __B, __m512i __C) { - return (__m512i)__builtin_ia32_vpdpwssd_v16si((__v16si)__A, (__v16si)__B, - (__v16si)__C); +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_dpwssd_epi32 (__m512i __A, __m512i __B, __m512i __C) +{ + return (__m512i) 
__builtin_ia32_vpdpwssd_v16si ((__v16si)__A, (__v16si) __B, + (__v16si) __C); } - -__funline __m512i _mm512_mask_dpwssd_epi32(__m512i __A, __mmask16 __B, - __m512i __C, __m512i __D) { - return (__m512i)__builtin_ia32_vpdpwssd_v16si_mask( - (__v16si)__A, (__v16si)__C, (__v16si)__D, (__mmask16)__B); +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_dpwssd_epi32 (__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) +{ + return (__m512i)__builtin_ia32_vpdpwssd_v16si_mask ((__v16si)__A, + (__v16si) __C, (__v16si) __D, (__mmask16)__B); } - -__funline __m512i _mm512_maskz_dpwssd_epi32(__mmask16 __A, __m512i __B, - __m512i __C, __m512i __D) { - return (__m512i)__builtin_ia32_vpdpwssd_v16si_maskz( - (__v16si)__B, (__v16si)__C, (__v16si)__D, (__mmask16)__A); +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_dpwssd_epi32 (__mmask16 __A, __m512i __B, __m512i __C, + __m512i __D) +{ + return (__m512i)__builtin_ia32_vpdpwssd_v16si_maskz ((__v16si)__B, + (__v16si) __C, (__v16si) __D, (__mmask16)__A); } - -__funline __m512i _mm512_dpwssds_epi32(__m512i __A, __m512i __B, __m512i __C) { - return (__m512i)__builtin_ia32_vpdpwssds_v16si((__v16si)__A, (__v16si)__B, - (__v16si)__C); +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_dpwssds_epi32 (__m512i __A, __m512i __B, __m512i __C) +{ + return (__m512i) __builtin_ia32_vpdpwssds_v16si ((__v16si)__A, (__v16si) __B, + (__v16si) __C); } - -__funline __m512i _mm512_mask_dpwssds_epi32(__m512i __A, __mmask16 __B, - __m512i __C, __m512i __D) { - return (__m512i)__builtin_ia32_vpdpwssds_v16si_mask( - (__v16si)__A, (__v16si)__C, (__v16si)__D, (__mmask16)__B); +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_dpwssds_epi32 (__m512i __A, __mmask16 __B, __m512i __C, + __m512i __D) +{ + return (__m512i)__builtin_ia32_vpdpwssds_v16si_mask ((__v16si)__A, + (__v16si) __C, (__v16si) __D, (__mmask16)__B); } - -__funline __m512i _mm512_maskz_dpwssds_epi32(__mmask16 __A, __m512i __B, - __m512i __C, __m512i __D) { - return (__m512i)__builtin_ia32_vpdpwssds_v16si_maskz( - (__v16si)__B, (__v16si)__C, (__v16si)__D, (__mmask16)__A); +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_dpwssds_epi32 (__mmask16 __A, __m512i __B, __m512i __C, + __m512i __D) +{ + return (__m512i)__builtin_ia32_vpdpwssds_v16si_maskz ((__v16si)__B, + (__v16si) __C, (__v16si) __D, (__mmask16)__A); } - #ifdef __DISABLE_AVX512VNNI__ #undef __DISABLE_AVX512VNNI__ #pragma GCC pop_options -#endif /* __DISABLE_AVX512VNNI__ */ - -#endif /* __AVX512VNNIINTRIN_H_INCLUDED */ +#endif +#endif +#endif diff --git a/third_party/intel/avx512vnnivlintrin.internal.h b/third_party/intel/avx512vnnivlintrin.internal.h index 2c2750152..52a339c26 100644 --- a/third_party/intel/avx512vnnivlintrin.internal.h +++ b/third_party/intel/avx512vnnivlintrin.internal.h @@ -1,154 +1,140 @@ +/* clang-format off */ +#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0) #ifndef _IMMINTRIN_H_INCLUDED -#error \ - "Never use <avx512vnnivlintrin.h> directly; include <immintrin.h> instead." +#error "Never use <avx512vnnivlintrin.h> directly; include <immintrin.h> instead."
#endif - #ifndef _AVX512VNNIVLINTRIN_H_INCLUDED #define _AVX512VNNIVLINTRIN_H_INCLUDED - #if !defined(__AVX512VL__) || !defined(__AVX512VNNI__) #pragma GCC push_options #pragma GCC target("avx512vnni,avx512vl") #define __DISABLE_AVX512VNNIVL__ -#endif /* __AVX512VNNIVL__ */ - -__funline __m256i _mm256_dpbusd_epi32(__m256i __A, __m256i __B, __m256i __C) { - return (__m256i)__builtin_ia32_vpdpbusd_v8si((__v8si)__A, (__v8si)__B, - (__v8si)__C); +#endif +#define _mm256_dpbusd_epi32(A, B, C) ((__m256i) __builtin_ia32_vpdpbusd_v8si ((__v8si) (A), (__v8si) (B), (__v8si) (C))) +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_dpbusd_epi32 (__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) +{ + return (__m256i)__builtin_ia32_vpdpbusd_v8si_mask ((__v8si)__A, (__v8si) __C, + (__v8si) __D, (__mmask8)__B); } - -__funline __m256i _mm256_mask_dpbusd_epi32(__m256i __A, __mmask8 __B, __m256i __C, - __m256i __D) { - return (__m256i)__builtin_ia32_vpdpbusd_v8si_mask((__v8si)__A, (__v8si)__C, - (__v8si)__D, (__mmask8)__B); +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_dpbusd_epi32 (__mmask8 __A, __m256i __B, __m256i __C, __m256i __D) +{ + return (__m256i)__builtin_ia32_vpdpbusd_v8si_maskz ((__v8si)__B, + (__v8si) __C, (__v8si) __D, (__mmask8)__A); } - -__funline __m256i _mm256_maskz_dpbusd_epi32(__mmask8 __A, __m256i __B, - __m256i __C, __m256i __D) { - return (__m256i)__builtin_ia32_vpdpbusd_v8si_maskz( - (__v8si)__B, (__v8si)__C, (__v8si)__D, (__mmask8)__A); +#define _mm_dpbusd_epi32(A, B, C) ((__m128i) __builtin_ia32_vpdpbusd_v4si ((__v4si) (A), (__v4si) (B), (__v4si) (C))) +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_dpbusd_epi32 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) +{ + return (__m128i)__builtin_ia32_vpdpbusd_v4si_mask ((__v4si)__A, (__v4si) __C, + (__v4si) __D, (__mmask8)__B); } - -__funline __m128i _mm_dpbusd_epi32(__m128i __A, __m128i __B, __m128i __C) { - return (__m128i)__builtin_ia32_vpdpbusd_v4si((__v4si)__A, (__v4si)__B, - (__v4si)__C); +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_dpbusd_epi32 (__mmask8 __A, __m128i __B, __m128i __C, __m128i __D) +{ + return (__m128i)__builtin_ia32_vpdpbusd_v4si_maskz ((__v4si)__B, + (__v4si) __C, (__v4si) __D, (__mmask8)__A); } - -__funline __m128i _mm_mask_dpbusd_epi32(__m128i __A, __mmask8 __B, __m128i __C, - __m128i __D) { - return (__m128i)__builtin_ia32_vpdpbusd_v4si_mask((__v4si)__A, (__v4si)__C, - (__v4si)__D, (__mmask8)__B); +#define _mm256_dpbusds_epi32(A, B, C) ((__m256i) __builtin_ia32_vpdpbusds_v8si ((__v8si) (A), (__v8si) (B), (__v8si) (C))) +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_dpbusds_epi32 (__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) +{ + return (__m256i)__builtin_ia32_vpdpbusds_v8si_mask ((__v8si)__A, + (__v8si) __C, (__v8si) __D, (__mmask8)__B); } - -__funline __m128i _mm_maskz_dpbusd_epi32(__mmask8 __A, __m128i __B, __m128i __C, - __m128i __D) { - return (__m128i)__builtin_ia32_vpdpbusd_v4si_maskz( - (__v4si)__B, (__v4si)__C, (__v4si)__D, (__mmask8)__A); +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_dpbusds_epi32 (__mmask8 __A, __m256i __B, __m256i __C, + __m256i __D) +{ + return (__m256i)__builtin_ia32_vpdpbusds_v8si_maskz ((__v8si)__B, + (__v8si) __C, 
(__v8si) __D, (__mmask8)__A); } - -__funline __m256i _mm256_dpbusds_epi32(__m256i __A, __m256i __B, __m256i __C) { - return (__m256i)__builtin_ia32_vpdpbusds_v8si((__v8si)__A, (__v8si)__B, - (__v8si)__C); +#define _mm_dpbusds_epi32(A, B, C) ((__m128i) __builtin_ia32_vpdpbusds_v4si ((__v4si) (A), (__v4si) (B), (__v4si) (C))) +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_dpbusds_epi32 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) +{ + return (__m128i)__builtin_ia32_vpdpbusds_v4si_mask ((__v4si)__A, + (__v4si) __C, (__v4si) __D, (__mmask8)__B); } - -__funline __m256i _mm256_mask_dpbusds_epi32(__m256i __A, __mmask8 __B, - __m256i __C, __m256i __D) { - return (__m256i)__builtin_ia32_vpdpbusds_v8si_mask( - (__v8si)__A, (__v8si)__C, (__v8si)__D, (__mmask8)__B); +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_dpbusds_epi32 (__mmask8 __A, __m128i __B, __m128i __C, __m128i __D) +{ + return (__m128i)__builtin_ia32_vpdpbusds_v4si_maskz ((__v4si)__B, + (__v4si) __C, (__v4si) __D, (__mmask8)__A); } - -__funline __m256i _mm256_maskz_dpbusds_epi32(__mmask8 __A, __m256i __B, - __m256i __C, __m256i __D) { - return (__m256i)__builtin_ia32_vpdpbusds_v8si_maskz( - (__v8si)__B, (__v8si)__C, (__v8si)__D, (__mmask8)__A); +#define _mm256_dpwssd_epi32(A, B, C) ((__m256i) __builtin_ia32_vpdpwssd_v8si ((__v8si) (A), (__v8si) (B), (__v8si) (C))) +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_dpwssd_epi32 (__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) +{ + return (__m256i)__builtin_ia32_vpdpwssd_v8si_mask ((__v8si)__A, (__v8si) __C, + (__v8si) __D, (__mmask8)__B); } - -__funline __m128i _mm_dpbusds_epi32(__m128i __A, __m128i __B, __m128i __C) { - return (__m128i)__builtin_ia32_vpdpbusds_v4si((__v4si)__A, (__v4si)__B, - (__v4si)__C); +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_dpwssd_epi32 (__mmask8 __A, __m256i __B, __m256i __C, __m256i __D) +{ + return (__m256i)__builtin_ia32_vpdpwssd_v8si_maskz ((__v8si)__B, + (__v8si) __C, (__v8si) __D, (__mmask8)__A); } - -__funline __m128i _mm_mask_dpbusds_epi32(__m128i __A, __mmask8 __B, __m128i __C, - __m128i __D) { - return (__m128i)__builtin_ia32_vpdpbusds_v4si_mask( - (__v4si)__A, (__v4si)__C, (__v4si)__D, (__mmask8)__B); +#define _mm_dpwssd_epi32(A, B, C) ((__m128i) __builtin_ia32_vpdpwssd_v4si ((__v4si) (A), (__v4si) (B), (__v4si) (C))) +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_dpwssd_epi32 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) +{ + return (__m128i)__builtin_ia32_vpdpwssd_v4si_mask ((__v4si)__A, (__v4si) __C, + (__v4si) __D, (__mmask8)__B); } - -__funline __m128i _mm_maskz_dpbusds_epi32(__mmask8 __A, __m128i __B, __m128i __C, - __m128i __D) { - return (__m128i)__builtin_ia32_vpdpbusds_v4si_maskz( - (__v4si)__B, (__v4si)__C, (__v4si)__D, (__mmask8)__A); +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_dpwssd_epi32 (__mmask8 __A, __m128i __B, __m128i __C, __m128i __D) +{ + return (__m128i)__builtin_ia32_vpdpwssd_v4si_maskz ((__v4si)__B, + (__v4si) __C, (__v4si) __D, (__mmask8)__A); } - -__funline __m256i _mm256_dpwssd_epi32(__m256i __A, __m256i __B, __m256i __C) { - return (__m256i)__builtin_ia32_vpdpwssd_v8si((__v8si)__A, (__v8si)__B, - (__v8si)__C); +#define _mm256_dpwssds_epi32(A, B, C) ((__m256i) 
__builtin_ia32_vpdpwssds_v8si ((__v8si) (A), (__v8si) (B), (__v8si) (C))) +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_dpwssds_epi32 (__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) +{ + return (__m256i)__builtin_ia32_vpdpwssds_v8si_mask ((__v8si)__A, + (__v8si) __C, (__v8si) __D, (__mmask8)__B); } - -__funline __m256i _mm256_mask_dpwssd_epi32(__m256i __A, __mmask8 __B, __m256i __C, - __m256i __D) { - return (__m256i)__builtin_ia32_vpdpwssd_v8si_mask((__v8si)__A, (__v8si)__C, - (__v8si)__D, (__mmask8)__B); +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_dpwssds_epi32 (__mmask8 __A, __m256i __B, __m256i __C, + __m256i __D) +{ + return (__m256i)__builtin_ia32_vpdpwssds_v8si_maskz ((__v8si)__B, + (__v8si) __C, (__v8si) __D, (__mmask8)__A); } - -__funline __m256i _mm256_maskz_dpwssd_epi32(__mmask8 __A, __m256i __B, - __m256i __C, __m256i __D) { - return (__m256i)__builtin_ia32_vpdpwssd_v8si_maskz( - (__v8si)__B, (__v8si)__C, (__v8si)__D, (__mmask8)__A); +#define _mm_dpwssds_epi32(A, B, C) ((__m128i) __builtin_ia32_vpdpwssds_v4si ((__v4si) (A), (__v4si) (B), (__v4si) (C))) +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_dpwssds_epi32 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) +{ + return (__m128i)__builtin_ia32_vpdpwssds_v4si_mask ((__v4si)__A, + (__v4si) __C, (__v4si) __D, (__mmask8)__B); } - -__funline __m128i _mm_dpwssd_epi32(__m128i __A, __m128i __B, __m128i __C) { - return (__m128i)__builtin_ia32_vpdpwssd_v4si((__v4si)__A, (__v4si)__B, - (__v4si)__C); -} - -__funline __m128i _mm_mask_dpwssd_epi32(__m128i __A, __mmask8 __B, __m128i __C, - __m128i __D) { - return (__m128i)__builtin_ia32_vpdpwssd_v4si_mask((__v4si)__A, (__v4si)__C, - (__v4si)__D, (__mmask8)__B); -} - -__funline __m128i _mm_maskz_dpwssd_epi32(__mmask8 __A, __m128i __B, __m128i __C, - __m128i __D) { - return (__m128i)__builtin_ia32_vpdpwssd_v4si_maskz( - (__v4si)__B, (__v4si)__C, (__v4si)__D, (__mmask8)__A); -} - -__funline __m256i _mm256_dpwssds_epi32(__m256i __A, __m256i __B, __m256i __C) { - return (__m256i)__builtin_ia32_vpdpwssds_v8si((__v8si)__A, (__v8si)__B, - (__v8si)__C); -} - -__funline __m256i _mm256_mask_dpwssds_epi32(__m256i __A, __mmask8 __B, - __m256i __C, __m256i __D) { - return (__m256i)__builtin_ia32_vpdpwssds_v8si_mask( - (__v8si)__A, (__v8si)__C, (__v8si)__D, (__mmask8)__B); -} - -__funline __m256i _mm256_maskz_dpwssds_epi32(__mmask8 __A, __m256i __B, - __m256i __C, __m256i __D) { - return (__m256i)__builtin_ia32_vpdpwssds_v8si_maskz( - (__v8si)__B, (__v8si)__C, (__v8si)__D, (__mmask8)__A); -} - -__funline __m128i _mm_dpwssds_epi32(__m128i __A, __m128i __B, __m128i __C) { - return (__m128i)__builtin_ia32_vpdpwssds_v4si((__v4si)__A, (__v4si)__B, - (__v4si)__C); -} - -__funline __m128i _mm_mask_dpwssds_epi32(__m128i __A, __mmask8 __B, __m128i __C, - __m128i __D) { - return (__m128i)__builtin_ia32_vpdpwssds_v4si_mask( - (__v4si)__A, (__v4si)__C, (__v4si)__D, (__mmask8)__B); -} - -__funline __m128i _mm_maskz_dpwssds_epi32(__mmask8 __A, __m128i __B, __m128i __C, - __m128i __D) { - return (__m128i)__builtin_ia32_vpdpwssds_v4si_maskz( - (__v4si)__B, (__v4si)__C, (__v4si)__D, (__mmask8)__A); +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_dpwssds_epi32 (__mmask8 __A, __m128i __B, __m128i __C, __m128i __D) +{ + return (__m128i)__builtin_ia32_vpdpwssds_v4si_maskz 
((__v4si)__B, + (__v4si) __C, (__v4si) __D, (__mmask8)__A); } #ifdef __DISABLE_AVX512VNNIVL__ #undef __DISABLE_AVX512VNNIVL__ #pragma GCC pop_options -#endif /* __DISABLE_AVX512VNNIVL__ */ -#endif /* __DISABLE_AVX512VNNIVL__ */ +#endif +#endif +#endif diff --git a/third_party/intel/avx512vp2intersectintrin.internal.h b/third_party/intel/avx512vp2intersectintrin.internal.h new file mode 100644 index 000000000..bf944f3c2 --- /dev/null +++ b/third_party/intel/avx512vp2intersectintrin.internal.h @@ -0,0 +1,32 @@ +/* clang-format off */ +#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0) +#if !defined _IMMINTRIN_H_INCLUDED +#error "Never use <avx512vp2intersectintrin.h> directly; include <immintrin.h> instead." +#endif +#ifndef _AVX512VP2INTERSECTINTRIN_H_INCLUDED +#define _AVX512VP2INTERSECTINTRIN_H_INCLUDED +#if !defined(__AVX512VP2INTERSECT__) +#pragma GCC push_options +#pragma GCC target("avx512vp2intersect") +#define __DISABLE_AVX512VP2INTERSECT__ +#endif +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_2intersect_epi32 (__m512i __A, __m512i __B, __mmask16 *__U, + __mmask16 *__M) +{ + __builtin_ia32_2intersectd512 (__U, __M, (__v16si) __A, (__v16si) __B); +} +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_2intersect_epi64 (__m512i __A, __m512i __B, __mmask8 *__U, + __mmask8 *__M) +{ + __builtin_ia32_2intersectq512 (__U, __M, (__v8di) __A, (__v8di) __B); +} +#ifdef __DISABLE_AVX512VP2INTERSECT__ +#undef __DISABLE_AVX512VP2INTERSECT__ +#pragma GCC pop_options +#endif +#endif +#endif diff --git a/third_party/intel/avx512vp2intersectvlintrin.internal.h b/third_party/intel/avx512vp2intersectvlintrin.internal.h new file mode 100644 index 000000000..78fabb027 --- /dev/null +++ b/third_party/intel/avx512vp2intersectvlintrin.internal.h @@ -0,0 +1,44 @@ +/* clang-format off */ +#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0) +#if !defined _IMMINTRIN_H_INCLUDED +#error "Never use <avx512vp2intersectvlintrin.h> directly; include <immintrin.h> instead."
+#endif +#ifndef _AVX512VP2INTERSECTVLINTRIN_H_INCLUDED +#define _AVX512VP2INTERSECTVLINTRIN_H_INCLUDED +#if !defined(__AVX512VP2INTERSECT__) || !defined(__AVX512VL__) +#pragma GCC push_options +#pragma GCC target("avx512vp2intersect,avx512vl") +#define __DISABLE_AVX512VP2INTERSECTVL__ +#endif +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_2intersect_epi32 (__m128i __A, __m128i __B, __mmask8 *__U, __mmask8 *__M) +{ + __builtin_ia32_2intersectd128 (__U, __M, (__v4si) __A, (__v4si) __B); +} +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_2intersect_epi32 (__m256i __A, __m256i __B, __mmask8 *__U, + __mmask8 *__M) +{ + __builtin_ia32_2intersectd256 (__U, __M, (__v8si) __A, (__v8si) __B); +} +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_2intersect_epi64 (__m128i __A, __m128i __B, __mmask8 *__U, __mmask8 *__M) +{ + __builtin_ia32_2intersectq128 (__U, __M, (__v2di) __A, (__v2di) __B); +} +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_2intersect_epi64 (__m256i __A, __m256i __B, __mmask8 *__U, + __mmask8 *__M) +{ + __builtin_ia32_2intersectq256 (__U, __M, (__v4di) __A, (__v4di) __B); +} +#ifdef __DISABLE_AVX512VP2INTERSECTVL__ +#undef __DISABLE_AVX512VP2INTERSECTVL__ +#pragma GCC pop_options +#endif +#endif +#endif diff --git a/third_party/intel/avx512vpopcntdqintrin.internal.h b/third_party/intel/avx512vpopcntdqintrin.internal.h index f7629f866..e6c4de3a4 100644 --- a/third_party/intel/avx512vpopcntdqintrin.internal.h +++ b/third_party/intel/avx512vpopcntdqintrin.internal.h @@ -1,50 +1,64 @@ +/* clang-format off */ +#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0) #if !defined _IMMINTRIN_H_INCLUDED -#error \ - "Never use <avx512vpopcntdqintrin.h> directly; include <immintrin.h> instead." +# error "Never use <avx512vpopcntdqintrin.h> directly; include <immintrin.h> instead."
#endif - #ifndef _AVX512VPOPCNTDQINTRIN_H_INCLUDED #define _AVX512VPOPCNTDQINTRIN_H_INCLUDED - #ifndef __AVX512VPOPCNTDQ__ #pragma GCC push_options #pragma GCC target("avx512vpopcntdq") #define __DISABLE_AVX512VPOPCNTDQ__ -#endif /* __AVX512VPOPCNTDQ__ */ - -__funline __m512i _mm512_popcnt_epi32(__m512i __A) { - return (__m512i)__builtin_ia32_vpopcountd_v16si((__v16si)__A); +#endif +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_popcnt_epi32 (__m512i __A) +{ + return (__m512i) __builtin_ia32_vpopcountd_v16si ((__v16si) __A); } - -__funline __m512i _mm512_mask_popcnt_epi32(__m512i __A, __mmask16 __U, - __m512i __B) { - return (__m512i)__builtin_ia32_vpopcountd_v16si_mask( - (__v16si)__A, (__v16si)__B, (__mmask16)__U); +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_popcnt_epi32 (__m512i __W, __mmask16 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_vpopcountd_v16si_mask ((__v16si) __A, + (__v16si) __W, + (__mmask16) __U); } - -__funline __m512i _mm512_maskz_popcnt_epi32(__mmask16 __U, __m512i __A) { - return (__m512i)__builtin_ia32_vpopcountd_v16si_mask( - (__v16si)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_popcnt_epi32 (__mmask16 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_vpopcountd_v16si_mask ((__v16si) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); } - -__funline __m512i _mm512_popcnt_epi64(__m512i __A) { - return (__m512i)__builtin_ia32_vpopcountq_v8di((__v8di)__A); +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_popcnt_epi64 (__m512i __A) +{ + return (__m512i) __builtin_ia32_vpopcountq_v8di ((__v8di) __A); } - -__funline __m512i _mm512_mask_popcnt_epi64(__m512i __A, __mmask8 __U, - __m512i __B) { - return (__m512i)__builtin_ia32_vpopcountq_v8di_mask((__v8di)__A, (__v8di)__B, - (__mmask8)__U); +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_popcnt_epi64 (__m512i __W, __mmask8 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_vpopcountq_v8di_mask ((__v8di) __A, + (__v8di) __W, + (__mmask8) __U); } - -__funline __m512i _mm512_maskz_popcnt_epi64(__mmask8 __U, __m512i __A) { - return (__m512i)__builtin_ia32_vpopcountq_v8di_mask( - (__v8di)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_popcnt_epi64 (__mmask8 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_vpopcountq_v8di_mask ((__v8di) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); } - #ifdef __DISABLE_AVX512VPOPCNTDQ__ #undef __DISABLE_AVX512VPOPCNTDQ__ #pragma GCC pop_options -#endif /* __DISABLE_AVX512VPOPCNTDQ__ */ - -#endif /* _AVX512VPOPCNTDQINTRIN_H_INCLUDED */ +#endif +#endif +#endif diff --git a/third_party/intel/avx512vpopcntdqvlintrin.internal.h b/third_party/intel/avx512vpopcntdqvlintrin.internal.h index e46c2cf55..af1cd345a 100644 --- a/third_party/intel/avx512vpopcntdqvlintrin.internal.h +++ b/third_party/intel/avx512vpopcntdqvlintrin.internal.h @@ -1,78 +1,110 @@ +/* clang-format off */ +#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0) #if !defined _IMMINTRIN_H_INCLUDED -#error \ - "Never use <avx512vpopcntdqvlintrin.h> directly; include <immintrin.h> instead." +# error "Never use <avx512vpopcntdqvlintrin.h> directly; include <immintrin.h> instead."
#endif - #ifndef _AVX512VPOPCNTDQVLINTRIN_H_INCLUDED #define _AVX512VPOPCNTDQVLINTRIN_H_INCLUDED - #if !defined(__AVX512VPOPCNTDQ__) || !defined(__AVX512VL__) #pragma GCC push_options #pragma GCC target("avx512vpopcntdq,avx512vl") #define __DISABLE_AVX512VPOPCNTDQVL__ -#endif /* __AVX512VPOPCNTDQVL__ */ - -__funline __m128i _mm_popcnt_epi32(__m128i __A) { - return (__m128i)__builtin_ia32_vpopcountd_v4si((__v4si)__A); +#endif +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_popcnt_epi32 (__m128i __A) +{ + return (__m128i) __builtin_ia32_vpopcountd_v4si ((__v4si) __A); } - -__funline __m128i _mm_mask_popcnt_epi32(__m128i __A, __mmask16 __U, __m128i __B) { - return (__m128i)__builtin_ia32_vpopcountd_v4si_mask((__v4si)__A, (__v4si)__B, - (__mmask16)__U); +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_popcnt_epi32 (__m128i __W, __mmask16 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_vpopcountd_v4si_mask ((__v4si) __A, + (__v4si) __W, + (__mmask16) __U); } - -__funline __m128i _mm_maskz_popcnt_epi32(__mmask16 __U, __m128i __A) { - return (__m128i)__builtin_ia32_vpopcountd_v4si_mask( - (__v4si)__A, (__v4si)_mm_setzero_si128(), (__mmask16)__U); +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_popcnt_epi32 (__mmask16 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_vpopcountd_v4si_mask ((__v4si) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask16) __U); } - -__funline __m256i _mm256_popcnt_epi32(__m256i __A) { - return (__m256i)__builtin_ia32_vpopcountd_v8si((__v8si)__A); +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_popcnt_epi32 (__m256i __A) +{ + return (__m256i) __builtin_ia32_vpopcountd_v8si ((__v8si) __A); } - -__funline __m256i _mm256_mask_popcnt_epi32(__m256i __A, __mmask16 __U, - __m256i __B) { - return (__m256i)__builtin_ia32_vpopcountd_v8si_mask((__v8si)__A, (__v8si)__B, - (__mmask16)__U); +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_popcnt_epi32 (__m256i __W, __mmask16 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_vpopcountd_v8si_mask ((__v8si) __A, + (__v8si) __W, + (__mmask16) __U); } - -__funline __m256i _mm256_maskz_popcnt_epi32(__mmask16 __U, __m256i __A) { - return (__m256i)__builtin_ia32_vpopcountd_v8si_mask( - (__v8si)__A, (__v8si)_mm256_setzero_si256(), (__mmask16)__U); +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_popcnt_epi32 (__mmask16 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_vpopcountd_v8si_mask ((__v8si) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask16) __U); } - -__funline __m128i _mm_popcnt_epi64(__m128i __A) { - return (__m128i)__builtin_ia32_vpopcountq_v2di((__v2di)__A); +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_popcnt_epi64 (__m128i __A) +{ + return (__m128i) __builtin_ia32_vpopcountq_v2di ((__v2di) __A); } - -__funline __m128i _mm_mask_popcnt_epi64(__m128i __A, __mmask8 __U, __m128i __B) { - return (__m128i)__builtin_ia32_vpopcountq_v2di_mask((__v2di)__A, (__v2di)__B, - (__mmask8)__U); +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_popcnt_epi64 (__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_vpopcountq_v2di_mask ((__v2di) __A, + (__v2di) __W, + 
(__mmask8) __U); } - -__funline __m128i _mm_maskz_popcnt_epi64(__mmask8 __U, __m128i __A) { - return (__m128i)__builtin_ia32_vpopcountq_v2di_mask( - (__v2di)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U); +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_popcnt_epi64 (__mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_vpopcountq_v2di_mask ((__v2di) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); } - -__funline __m256i _mm256_popcnt_epi64(__m256i __A) { - return (__m256i)__builtin_ia32_vpopcountq_v4di((__v4di)__A); +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_popcnt_epi64 (__m256i __A) +{ + return (__m256i) __builtin_ia32_vpopcountq_v4di ((__v4di) __A); } - -__funline __m256i _mm256_mask_popcnt_epi64(__m256i __A, __mmask8 __U, - __m256i __B) { - return (__m256i)__builtin_ia32_vpopcountq_v4di_mask((__v4di)__A, (__v4di)__B, - (__mmask8)__U); +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_popcnt_epi64 (__m256i __W, __mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_vpopcountq_v4di_mask ((__v4di) __A, + (__v4di) __W, + (__mmask8) __U); } - -__funline __m256i _mm256_maskz_popcnt_epi64(__mmask8 __U, __m256i __A) { - return (__m256i)__builtin_ia32_vpopcountq_v4di_mask( - (__v4di)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_popcnt_epi64 (__mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_vpopcountq_v4di_mask ((__v4di) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); } - #ifdef __DISABLE_AVX512VPOPCNTDQVL__ #undef __DISABLE_AVX512VPOPCNTDQVL__ #pragma GCC pop_options -#endif /* __DISABLE_AVX512VPOPCNTDQVL__ */ - -#endif /* _AVX512VPOPCNTDQVLINTRIN_H_INCLUDED */ +#endif +#endif +#endif diff --git a/third_party/intel/avxintrin.internal.h b/third_party/intel/avxintrin.internal.h index c3e7ca305..3676121d8 100644 --- a/third_party/intel/avxintrin.internal.h +++ b/third_party/intel/avxintrin.internal.h @@ -1,1033 +1,1195 @@ +/* clang-format off */ +#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0) #ifndef _IMMINTRIN_H_INCLUDED -#error "Never use <avxintrin.h> directly; include <immintrin.h> instead." +# error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
#endif - #ifndef _AVXINTRIN_H_INCLUDED #define _AVXINTRIN_H_INCLUDED - #ifndef __AVX__ #pragma GCC push_options #pragma GCC target("avx") #define __DISABLE_AVX__ -#endif /* __AVX__ */ - -typedef double __v4df __attribute__((__vector_size__(32))); -typedef float __v8sf __attribute__((__vector_size__(32))); -typedef long long __v4di __attribute__((__vector_size__(32))); -typedef unsigned long long __v4du __attribute__((__vector_size__(32))); -typedef int __v8si __attribute__((__vector_size__(32))); -typedef unsigned int __v8su __attribute__((__vector_size__(32))); -typedef short __v16hi __attribute__((__vector_size__(32))); -typedef unsigned short __v16hu __attribute__((__vector_size__(32))); -typedef char __v32qi __attribute__((__vector_size__(32))); -typedef unsigned char __v32qu __attribute__((__vector_size__(32))); - -typedef float __m256 __attribute__((__vector_size__(32), __may_alias__)); -typedef long long __m256i __attribute__((__vector_size__(32), __may_alias__)); -typedef double __m256d __attribute__((__vector_size__(32), __may_alias__)); - -typedef float __m256_u - __attribute__((__vector_size__(32), __may_alias__, __aligned__(1))); -typedef long long __m256i_u - __attribute__((__vector_size__(32), __may_alias__, __aligned__(1))); -typedef double __m256d_u - __attribute__((__vector_size__(32), __may_alias__, __aligned__(1))); - -#define _CMP_EQ_OQ 0x00 -#define _CMP_LT_OS 0x01 -#define _CMP_LE_OS 0x02 -#define _CMP_UNORD_Q 0x03 -#define _CMP_NEQ_UQ 0x04 -#define _CMP_NLT_US 0x05 -#define _CMP_NLE_US 0x06 -#define _CMP_ORD_Q 0x07 -#define _CMP_EQ_UQ 0x08 -#define _CMP_NGE_US 0x09 -#define _CMP_NGT_US 0x0a +#endif +typedef double __v4df __attribute__ ((__vector_size__ (32))); +typedef float __v8sf __attribute__ ((__vector_size__ (32))); +typedef long long __v4di __attribute__ ((__vector_size__ (32))); +typedef unsigned long long __v4du __attribute__ ((__vector_size__ (32))); +typedef int __v8si __attribute__ ((__vector_size__ (32))); +typedef unsigned int __v8su __attribute__ ((__vector_size__ (32))); +typedef short __v16hi __attribute__ ((__vector_size__ (32))); +typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32))); +typedef char __v32qi __attribute__ ((__vector_size__ (32))); +typedef signed char __v32qs __attribute__ ((__vector_size__ (32))); +typedef unsigned char __v32qu __attribute__ ((__vector_size__ (32))); +typedef float __m256 __attribute__ ((__vector_size__ (32), + __may_alias__)); +typedef long long __m256i __attribute__ ((__vector_size__ (32), + __may_alias__)); +typedef double __m256d __attribute__ ((__vector_size__ (32), + __may_alias__)); +typedef float __m256_u __attribute__ ((__vector_size__ (32), + __may_alias__, + __aligned__ (1))); +typedef long long __m256i_u __attribute__ ((__vector_size__ (32), + __may_alias__, + __aligned__ (1))); +typedef double __m256d_u __attribute__ ((__vector_size__ (32), + __may_alias__, + __aligned__ (1))); +#define _CMP_EQ_OQ 0x00 +#define _CMP_LT_OS 0x01 +#define _CMP_LE_OS 0x02 +#define _CMP_UNORD_Q 0x03 +#define _CMP_NEQ_UQ 0x04 +#define _CMP_NLT_US 0x05 +#define _CMP_NLE_US 0x06 +#define _CMP_ORD_Q 0x07 +#define _CMP_EQ_UQ 0x08 +#define _CMP_NGE_US 0x09 +#define _CMP_NGT_US 0x0a #define _CMP_FALSE_OQ 0x0b -#define _CMP_NEQ_OQ 0x0c -#define _CMP_GE_OS 0x0d -#define _CMP_GT_OS 0x0e -#define _CMP_TRUE_UQ 0x0f -#define _CMP_EQ_OS 0x10 -#define _CMP_LT_OQ 0x11 -#define _CMP_LE_OQ 0x12 -#define _CMP_UNORD_S 0x13 -#define _CMP_NEQ_US 0x14 -#define _CMP_NLT_UQ 0x15 -#define _CMP_NLE_UQ 0x16 -#define _CMP_ORD_S 0x17 
-#define _CMP_EQ_US 0x18 -#define _CMP_NGE_UQ 0x19 -#define _CMP_NGT_UQ 0x1a +#define _CMP_NEQ_OQ 0x0c +#define _CMP_GE_OS 0x0d +#define _CMP_GT_OS 0x0e +#define _CMP_TRUE_UQ 0x0f +#define _CMP_EQ_OS 0x10 +#define _CMP_LT_OQ 0x11 +#define _CMP_LE_OQ 0x12 +#define _CMP_UNORD_S 0x13 +#define _CMP_NEQ_US 0x14 +#define _CMP_NLT_UQ 0x15 +#define _CMP_NLE_UQ 0x16 +#define _CMP_ORD_S 0x17 +#define _CMP_EQ_US 0x18 +#define _CMP_NGE_UQ 0x19 +#define _CMP_NGT_UQ 0x1a #define _CMP_FALSE_OS 0x1b -#define _CMP_NEQ_OS 0x1c -#define _CMP_GE_OQ 0x1d -#define _CMP_GT_OQ 0x1e -#define _CMP_TRUE_US 0x1f - -__funline __m256d _mm256_add_pd(__m256d __A, __m256d __B) { - return (__m256d)((__v4df)__A + (__v4df)__B); +#define _CMP_NEQ_OS 0x1c +#define _CMP_GE_OQ 0x1d +#define _CMP_GT_OQ 0x1e +#define _CMP_TRUE_US 0x1f +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_add_pd (__m256d __A, __m256d __B) +{ + return (__m256d) ((__v4df)__A + (__v4df)__B); } - -__funline __m256 _mm256_add_ps(__m256 __A, __m256 __B) { - return (__m256)((__v8sf)__A + (__v8sf)__B); +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_add_ps (__m256 __A, __m256 __B) +{ + return (__m256) ((__v8sf)__A + (__v8sf)__B); } - -__funline __m256d _mm256_addsub_pd(__m256d __A, __m256d __B) { - return (__m256d)__builtin_ia32_addsubpd256((__v4df)__A, (__v4df)__B); +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_addsub_pd (__m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_addsubpd256 ((__v4df)__A, (__v4df)__B); } - -__funline __m256 _mm256_addsub_ps(__m256 __A, __m256 __B) { - return (__m256)__builtin_ia32_addsubps256((__v8sf)__A, (__v8sf)__B); +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_addsub_ps (__m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_addsubps256 ((__v8sf)__A, (__v8sf)__B); } - -__funline __m256d _mm256_and_pd(__m256d __A, __m256d __B) { - return (__m256d)__builtin_ia32_andpd256((__v4df)__A, (__v4df)__B); +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_and_pd (__m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_andpd256 ((__v4df)__A, (__v4df)__B); } - -__funline __m256 _mm256_and_ps(__m256 __A, __m256 __B) { - return (__m256)__builtin_ia32_andps256((__v8sf)__A, (__v8sf)__B); +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_and_ps (__m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_andps256 ((__v8sf)__A, (__v8sf)__B); } - -__funline __m256d _mm256_andnot_pd(__m256d __A, __m256d __B) { - return (__m256d)__builtin_ia32_andnpd256((__v4df)__A, (__v4df)__B); +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_andnot_pd (__m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_andnpd256 ((__v4df)__A, (__v4df)__B); } - -__funline __m256 _mm256_andnot_ps(__m256 __A, __m256 __B) { - return (__m256)__builtin_ia32_andnps256((__v8sf)__A, (__v8sf)__B); +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_andnot_ps (__m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_andnps256 ((__v8sf)__A, (__v8sf)__B); } - #ifdef __OPTIMIZE__ -__funline __m256d _mm256_blend_pd(__m256d __X, __m256d __Y, const int __M) { - return (__m256d)__builtin_ia32_blendpd256((__v4df)__X, (__v4df)__Y, __M); +extern __inline __m256d 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_blend_pd (__m256d __X, __m256d __Y, const int __M) +{ + return (__m256d) __builtin_ia32_blendpd256 ((__v4df)__X, + (__v4df)__Y, + __M); } - -__funline __m256 _mm256_blend_ps(__m256 __X, __m256 __Y, const int __M) { - return (__m256)__builtin_ia32_blendps256((__v8sf)__X, (__v8sf)__Y, __M); +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_blend_ps (__m256 __X, __m256 __Y, const int __M) +{ + return (__m256) __builtin_ia32_blendps256 ((__v8sf)__X, + (__v8sf)__Y, + __M); } #else -#define _mm256_blend_pd(X, Y, M) \ - ((__m256d)__builtin_ia32_blendpd256((__v4df)(__m256d)(X), \ - (__v4df)(__m256d)(Y), (int)(M))) - -#define _mm256_blend_ps(X, Y, M) \ - ((__m256)__builtin_ia32_blendps256((__v8sf)(__m256)(X), (__v8sf)(__m256)(Y), \ - (int)(M))) +#define _mm256_blend_pd(X, Y, M) ((__m256d) __builtin_ia32_blendpd256 ((__v4df)(__m256d)(X), (__v4df)(__m256d)(Y), (int)(M))) +#define _mm256_blend_ps(X, Y, M) ((__m256) __builtin_ia32_blendps256 ((__v8sf)(__m256)(X), (__v8sf)(__m256)(Y), (int)(M))) #endif - -__funline __m256d _mm256_blendv_pd(__m256d __X, __m256d __Y, __m256d __M) { - return (__m256d)__builtin_ia32_blendvpd256((__v4df)__X, (__v4df)__Y, - (__v4df)__M); +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_blendv_pd (__m256d __X, __m256d __Y, __m256d __M) +{ + return (__m256d) __builtin_ia32_blendvpd256 ((__v4df)__X, + (__v4df)__Y, + (__v4df)__M); } - -__funline __m256 _mm256_blendv_ps(__m256 __X, __m256 __Y, __m256 __M) { - return (__m256)__builtin_ia32_blendvps256((__v8sf)__X, (__v8sf)__Y, - (__v8sf)__M); +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_blendv_ps (__m256 __X, __m256 __Y, __m256 __M) +{ + return (__m256) __builtin_ia32_blendvps256 ((__v8sf)__X, + (__v8sf)__Y, + (__v8sf)__M); } - -__funline __m256d _mm256_div_pd(__m256d __A, __m256d __B) { - return (__m256d)((__v4df)__A / (__v4df)__B); +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_div_pd (__m256d __A, __m256d __B) +{ + return (__m256d) ((__v4df)__A / (__v4df)__B); } - -__funline __m256 _mm256_div_ps(__m256 __A, __m256 __B) { - return (__m256)((__v8sf)__A / (__v8sf)__B); +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_div_ps (__m256 __A, __m256 __B) +{ + return (__m256) ((__v8sf)__A / (__v8sf)__B); } - #ifdef __OPTIMIZE__ -__funline __m256 _mm256_dp_ps(__m256 __X, __m256 __Y, const int __M) { - return (__m256)__builtin_ia32_dpps256((__v8sf)__X, (__v8sf)__Y, __M); +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_dp_ps (__m256 __X, __m256 __Y, const int __M) +{ + return (__m256) __builtin_ia32_dpps256 ((__v8sf)__X, + (__v8sf)__Y, + __M); } #else -#define _mm256_dp_ps(X, Y, M) \ - ((__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(X), (__v8sf)(__m256)(Y), \ - (int)(M))) +#define _mm256_dp_ps(X, Y, M) ((__m256) __builtin_ia32_dpps256 ((__v8sf)(__m256)(X), (__v8sf)(__m256)(Y), (int)(M))) #endif - -__funline __m256d _mm256_hadd_pd(__m256d __X, __m256d __Y) { - return (__m256d)__builtin_ia32_haddpd256((__v4df)__X, (__v4df)__Y); +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_hadd_pd (__m256d __X, __m256d __Y) +{ + return (__m256d) __builtin_ia32_haddpd256 ((__v4df)__X, (__v4df)__Y); } - -__funline __m256 
_mm256_hadd_ps(__m256 __X, __m256 __Y) { - return (__m256)__builtin_ia32_haddps256((__v8sf)__X, (__v8sf)__Y); +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_hadd_ps (__m256 __X, __m256 __Y) +{ + return (__m256) __builtin_ia32_haddps256 ((__v8sf)__X, (__v8sf)__Y); } - -__funline __m256d _mm256_hsub_pd(__m256d __X, __m256d __Y) { - return (__m256d)__builtin_ia32_hsubpd256((__v4df)__X, (__v4df)__Y); +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_hsub_pd (__m256d __X, __m256d __Y) +{ + return (__m256d) __builtin_ia32_hsubpd256 ((__v4df)__X, (__v4df)__Y); } - -__funline __m256 _mm256_hsub_ps(__m256 __X, __m256 __Y) { - return (__m256)__builtin_ia32_hsubps256((__v8sf)__X, (__v8sf)__Y); +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_hsub_ps (__m256 __X, __m256 __Y) +{ + return (__m256) __builtin_ia32_hsubps256 ((__v8sf)__X, (__v8sf)__Y); } - -__funline __m256d _mm256_max_pd(__m256d __A, __m256d __B) { - return (__m256d)__builtin_ia32_maxpd256((__v4df)__A, (__v4df)__B); +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_max_pd (__m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_maxpd256 ((__v4df)__A, (__v4df)__B); } - -__funline __m256 _mm256_max_ps(__m256 __A, __m256 __B) { - return (__m256)__builtin_ia32_maxps256((__v8sf)__A, (__v8sf)__B); +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_max_ps (__m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_maxps256 ((__v8sf)__A, (__v8sf)__B); } - -__funline __m256d _mm256_min_pd(__m256d __A, __m256d __B) { - return (__m256d)__builtin_ia32_minpd256((__v4df)__A, (__v4df)__B); +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_min_pd (__m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_minpd256 ((__v4df)__A, (__v4df)__B); } - -__funline __m256 _mm256_min_ps(__m256 __A, __m256 __B) { - return (__m256)__builtin_ia32_minps256((__v8sf)__A, (__v8sf)__B); +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_min_ps (__m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_minps256 ((__v8sf)__A, (__v8sf)__B); } - -__funline __m256d _mm256_mul_pd(__m256d __A, __m256d __B) { - return (__m256d)((__v4df)__A * (__v4df)__B); +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mul_pd (__m256d __A, __m256d __B) +{ + return (__m256d) ((__v4df)__A * (__v4df)__B); } - -__funline __m256 _mm256_mul_ps(__m256 __A, __m256 __B) { - return (__m256)((__v8sf)__A * (__v8sf)__B); +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mul_ps (__m256 __A, __m256 __B) +{ + return (__m256) ((__v8sf)__A * (__v8sf)__B); } - -__funline __m256d _mm256_or_pd(__m256d __A, __m256d __B) { - return (__m256d)__builtin_ia32_orpd256((__v4df)__A, (__v4df)__B); +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_or_pd (__m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_orpd256 ((__v4df)__A, (__v4df)__B); } - -__funline __m256 _mm256_or_ps(__m256 __A, __m256 __B) { - return (__m256)__builtin_ia32_orps256((__v8sf)__A, (__v8sf)__B); +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_or_ps (__m256 __A, __m256 __B) +{ + return (__m256) 
__builtin_ia32_orps256 ((__v8sf)__A, (__v8sf)__B); } - #ifdef __OPTIMIZE__ -__funline __m256d _mm256_shuffle_pd(__m256d __A, __m256d __B, const int __mask) { - return (__m256d)__builtin_ia32_shufpd256((__v4df)__A, (__v4df)__B, __mask); +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_shuffle_pd (__m256d __A, __m256d __B, const int __mask) +{ + return (__m256d) __builtin_ia32_shufpd256 ((__v4df)__A, (__v4df)__B, + __mask); } - -__funline __m256 _mm256_shuffle_ps(__m256 __A, __m256 __B, const int __mask) { - return (__m256)__builtin_ia32_shufps256((__v8sf)__A, (__v8sf)__B, __mask); +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_shuffle_ps (__m256 __A, __m256 __B, const int __mask) +{ + return (__m256) __builtin_ia32_shufps256 ((__v8sf)__A, (__v8sf)__B, + __mask); } #else -#define _mm256_shuffle_pd(A, B, N) \ - ((__m256d)__builtin_ia32_shufpd256((__v4df)(__m256d)(A), \ - (__v4df)(__m256d)(B), (int)(N))) - -#define _mm256_shuffle_ps(A, B, N) \ - ((__m256)__builtin_ia32_shufps256((__v8sf)(__m256)(A), (__v8sf)(__m256)(B), \ - (int)(N))) +#define _mm256_shuffle_pd(A, B, N) ((__m256d)__builtin_ia32_shufpd256 ((__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(N))) +#define _mm256_shuffle_ps(A, B, N) ((__m256) __builtin_ia32_shufps256 ((__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(N))) #endif - -__funline __m256d _mm256_sub_pd(__m256d __A, __m256d __B) { - return (__m256d)((__v4df)__A - (__v4df)__B); +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_sub_pd (__m256d __A, __m256d __B) +{ + return (__m256d) ((__v4df)__A - (__v4df)__B); } - -__funline __m256 _mm256_sub_ps(__m256 __A, __m256 __B) { - return (__m256)((__v8sf)__A - (__v8sf)__B); +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_sub_ps (__m256 __A, __m256 __B) +{ + return (__m256) ((__v8sf)__A - (__v8sf)__B); } - -__funline __m256d _mm256_xor_pd(__m256d __A, __m256d __B) { - return (__m256d)__builtin_ia32_xorpd256((__v4df)__A, (__v4df)__B); +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_xor_pd (__m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_xorpd256 ((__v4df)__A, (__v4df)__B); } - -__funline __m256 _mm256_xor_ps(__m256 __A, __m256 __B) { - return (__m256)__builtin_ia32_xorps256((__v8sf)__A, (__v8sf)__B); +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_xor_ps (__m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_xorps256 ((__v8sf)__A, (__v8sf)__B); } - #ifdef __OPTIMIZE__ -__funline __m128d _mm_cmp_pd(__m128d __X, __m128d __Y, const int __P) { - return (__m128d)__builtin_ia32_cmppd((__v2df)__X, (__v2df)__Y, __P); +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmp_pd (__m128d __X, __m128d __Y, const int __P) +{ + return (__m128d) __builtin_ia32_cmppd ((__v2df)__X, (__v2df)__Y, __P); } - -__funline __m128 _mm_cmp_ps(__m128 __X, __m128 __Y, const int __P) { - return (__m128)__builtin_ia32_cmpps((__v4sf)__X, (__v4sf)__Y, __P); +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmp_ps (__m128 __X, __m128 __Y, const int __P) +{ + return (__m128) __builtin_ia32_cmpps ((__v4sf)__X, (__v4sf)__Y, __P); } - -__funline __m256d _mm256_cmp_pd(__m256d __X, __m256d __Y, const int __P) { - return 
(__m256d)__builtin_ia32_cmppd256((__v4df)__X, (__v4df)__Y, __P); +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmp_pd (__m256d __X, __m256d __Y, const int __P) +{ + return (__m256d) __builtin_ia32_cmppd256 ((__v4df)__X, (__v4df)__Y, + __P); } - -__funline __m256 _mm256_cmp_ps(__m256 __X, __m256 __Y, const int __P) { - return (__m256)__builtin_ia32_cmpps256((__v8sf)__X, (__v8sf)__Y, __P); +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmp_ps (__m256 __X, __m256 __Y, const int __P) +{ + return (__m256) __builtin_ia32_cmpps256 ((__v8sf)__X, (__v8sf)__Y, + __P); } - -__funline __m128d _mm_cmp_sd(__m128d __X, __m128d __Y, const int __P) { - return (__m128d)__builtin_ia32_cmpsd((__v2df)__X, (__v2df)__Y, __P); +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmp_sd (__m128d __X, __m128d __Y, const int __P) +{ + return (__m128d) __builtin_ia32_cmpsd ((__v2df)__X, (__v2df)__Y, __P); } - -__funline __m128 _mm_cmp_ss(__m128 __X, __m128 __Y, const int __P) { - return (__m128)__builtin_ia32_cmpss((__v4sf)__X, (__v4sf)__Y, __P); +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmp_ss (__m128 __X, __m128 __Y, const int __P) +{ + return (__m128) __builtin_ia32_cmpss ((__v4sf)__X, (__v4sf)__Y, __P); } #else -#define _mm_cmp_pd(X, Y, P) \ - ((__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), \ - (int)(P))) - -#define _mm_cmp_ps(X, Y, P) \ - ((__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), \ - (int)(P))) - -#define _mm256_cmp_pd(X, Y, P) \ - ((__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(X), \ - (__v4df)(__m256d)(Y), (int)(P))) - -#define _mm256_cmp_ps(X, Y, P) \ - ((__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(X), (__v8sf)(__m256)(Y), \ - (int)(P))) - -#define _mm_cmp_sd(X, Y, P) \ - ((__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), \ - (int)(P))) - -#define _mm_cmp_ss(X, Y, P) \ - ((__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), \ - (int)(P))) +#define _mm_cmp_pd(X, Y, P) ((__m128d) __builtin_ia32_cmppd ((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (int)(P))) +#define _mm_cmp_ps(X, Y, P) ((__m128) __builtin_ia32_cmpps ((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (int)(P))) +#define _mm256_cmp_pd(X, Y, P) ((__m256d) __builtin_ia32_cmppd256 ((__v4df)(__m256d)(X), (__v4df)(__m256d)(Y), (int)(P))) +#define _mm256_cmp_ps(X, Y, P) ((__m256) __builtin_ia32_cmpps256 ((__v8sf)(__m256)(X), (__v8sf)(__m256)(Y), (int)(P))) +#define _mm_cmp_sd(X, Y, P) ((__m128d) __builtin_ia32_cmpsd ((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (int)(P))) +#define _mm_cmp_ss(X, Y, P) ((__m128) __builtin_ia32_cmpss ((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (int)(P))) #endif - -__funline __m256d _mm256_cvtepi32_pd(__m128i __A) { - return (__m256d)__builtin_ia32_cvtdq2pd256((__v4si)__A); +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtsi256_si32 (__m256i __A) +{ + __v8si __B = (__v8si) __A; + return __B[0]; } - -__funline __m256 _mm256_cvtepi32_ps(__m256i __A) { - return (__m256)__builtin_ia32_cvtdq2ps256((__v8si)__A); +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepi32_pd (__m128i __A) +{ + return (__m256d)__builtin_ia32_cvtdq2pd256 ((__v4si) __A); } - -__funline __m128 _mm256_cvtpd_ps(__m256d __A) { - return 
(__m128)__builtin_ia32_cvtpd2ps256((__v4df)__A); +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepi32_ps (__m256i __A) +{ + return (__m256)__builtin_ia32_cvtdq2ps256 ((__v8si) __A); } - -__funline __m256i _mm256_cvtps_epi32(__m256 __A) { - return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf)__A); +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtpd_ps (__m256d __A) +{ + return (__m128)__builtin_ia32_cvtpd2ps256 ((__v4df) __A); } - -__funline __m256d _mm256_cvtps_pd(__m128 __A) { - return (__m256d)__builtin_ia32_cvtps2pd256((__v4sf)__A); +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtps_epi32 (__m256 __A) +{ + return (__m256i)__builtin_ia32_cvtps2dq256 ((__v8sf) __A); } - -__funline __m128i _mm256_cvttpd_epi32(__m256d __A) { - return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df)__A); +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtps_pd (__m128 __A) +{ + return (__m256d)__builtin_ia32_cvtps2pd256 ((__v4sf) __A); } - -__funline __m128i _mm256_cvtpd_epi32(__m256d __A) { - return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df)__A); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvttpd_epi32 (__m256d __A) +{ + return (__m128i)__builtin_ia32_cvttpd2dq256 ((__v4df) __A); } - -__funline __m256i _mm256_cvttps_epi32(__m256 __A) { - return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf)__A); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtpd_epi32 (__m256d __A) +{ + return (__m128i)__builtin_ia32_cvtpd2dq256 ((__v4df) __A); } - -__funline double _mm256_cvtsd_f64(__m256d __A) { +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvttps_epi32 (__m256 __A) +{ + return (__m256i)__builtin_ia32_cvttps2dq256 ((__v8sf) __A); +} +extern __inline double +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtsd_f64 (__m256d __A) +{ return __A[0]; } - -__funline float _mm256_cvtss_f32(__m256 __A) { +extern __inline float +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtss_f32 (__m256 __A) +{ return __A[0]; } - #ifdef __OPTIMIZE__ -__funline __m128d _mm256_extractf128_pd(__m256d __X, const int __N) { - return (__m128d)__builtin_ia32_vextractf128_pd256((__v4df)__X, __N); +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_extractf128_pd (__m256d __X, const int __N) +{ + return (__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)__X, __N); } - -__funline __m128 _mm256_extractf128_ps(__m256 __X, const int __N) { - return (__m128)__builtin_ia32_vextractf128_ps256((__v8sf)__X, __N); +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_extractf128_ps (__m256 __X, const int __N) +{ + return (__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)__X, __N); } - -__funline __m128i _mm256_extractf128_si256(__m256i __X, const int __N) { - return (__m128i)__builtin_ia32_vextractf128_si256((__v8si)__X, __N); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_extractf128_si256 (__m256i __X, const int __N) +{ + return (__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)__X, __N); } - -__funline int _mm256_extract_epi32(__m256i __X, int const __N) { - __m128i __Y = 
_mm256_extractf128_si256(__X, __N >> 2); - return _mm_extract_epi32(__Y, __N % 4); +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_extract_epi32 (__m256i __X, int const __N) +{ + __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2); + return _mm_extract_epi32 (__Y, __N % 4); } - -__funline int _mm256_extract_epi16(__m256i __X, int const __N) { - __m128i __Y = _mm256_extractf128_si256(__X, __N >> 3); - return _mm_extract_epi16(__Y, __N % 8); +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_extract_epi16 (__m256i __X, int const __N) +{ + __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3); + return _mm_extract_epi16 (__Y, __N % 8); } - -__funline int _mm256_extract_epi8(__m256i __X, int const __N) { - __m128i __Y = _mm256_extractf128_si256(__X, __N >> 4); - return _mm_extract_epi8(__Y, __N % 16); +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_extract_epi8 (__m256i __X, int const __N) +{ + __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4); + return _mm_extract_epi8 (__Y, __N % 16); } - #ifdef __x86_64__ -__funline long long _mm256_extract_epi64(__m256i __X, const int __N) { - __m128i __Y = _mm256_extractf128_si256(__X, __N >> 1); - return _mm_extract_epi64(__Y, __N % 2); +extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_extract_epi64 (__m256i __X, const int __N) +{ + __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1); + return _mm_extract_epi64 (__Y, __N % 2); } #endif #else -#define _mm256_extractf128_pd(X, N) \ - ((__m128d)__builtin_ia32_vextractf128_pd256((__v4df)(__m256d)(X), (int)(N))) - -#define _mm256_extractf128_ps(X, N) \ - ((__m128)__builtin_ia32_vextractf128_ps256((__v8sf)(__m256)(X), (int)(N))) - -#define _mm256_extractf128_si256(X, N) \ - ((__m128i)__builtin_ia32_vextractf128_si256((__v8si)(__m256i)(X), (int)(N))) - -#define _mm256_extract_epi32(X, N) \ - (__extension__({ \ - __m128i __Y = _mm256_extractf128_si256((X), (N) >> 2); \ - _mm_extract_epi32(__Y, (N) % 4); \ - })) - -#define _mm256_extract_epi16(X, N) \ - (__extension__({ \ - __m128i __Y = _mm256_extractf128_si256((X), (N) >> 3); \ - _mm_extract_epi16(__Y, (N) % 8); \ - })) - -#define _mm256_extract_epi8(X, N) \ - (__extension__({ \ - __m128i __Y = _mm256_extractf128_si256((X), (N) >> 4); \ - _mm_extract_epi8(__Y, (N) % 16); \ - })) - +#define _mm256_extractf128_pd(X, N) ((__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)(__m256d)(X), (int)(N))) +#define _mm256_extractf128_ps(X, N) ((__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)(__m256)(X), (int)(N))) +#define _mm256_extractf128_si256(X, N) ((__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)(__m256i)(X), (int)(N))) +#define _mm256_extract_epi32(X, N) (__extension__ ({ __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2); _mm_extract_epi32 (__Y, (N) % 4); })) +#define _mm256_extract_epi16(X, N) (__extension__ ({ __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3); _mm_extract_epi16 (__Y, (N) % 8); })) +#define _mm256_extract_epi8(X, N) (__extension__ ({ __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4); _mm_extract_epi8 (__Y, (N) % 16); })) #ifdef __x86_64__ -#define _mm256_extract_epi64(X, N) \ - (__extension__({ \ - __m128i __Y = _mm256_extractf128_si256((X), (N) >> 1); \ - _mm_extract_epi64(__Y, (N) % 2); \ - })) +#define _mm256_extract_epi64(X, N) (__extension__ ({ __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1); _mm_extract_epi64 
(__Y, (N) % 2); })) #endif #endif - -__funline void _mm256_zeroall(void) { - __builtin_ia32_vzeroall(); +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_zeroall (void) +{ + __builtin_ia32_vzeroall (); } - -__funline void _mm256_zeroupper(void) { - __builtin_ia32_vzeroupper(); +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_zeroupper (void) +{ + __builtin_ia32_vzeroupper (); } - -__funline __m128d _mm_permutevar_pd(__m128d __A, __m128i __C) { - return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__A, (__v2di)__C); +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_permutevar_pd (__m128d __A, __m128i __C) +{ + return (__m128d) __builtin_ia32_vpermilvarpd ((__v2df)__A, + (__v2di)__C); } - -__funline __m256d _mm256_permutevar_pd(__m256d __A, __m256i __C) { - return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__A, (__v4di)__C); +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permutevar_pd (__m256d __A, __m256i __C) +{ + return (__m256d) __builtin_ia32_vpermilvarpd256 ((__v4df)__A, + (__v4di)__C); } - -__funline __m128 _mm_permutevar_ps(__m128 __A, __m128i __C) { - return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__A, (__v4si)__C); +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_permutevar_ps (__m128 __A, __m128i __C) +{ + return (__m128) __builtin_ia32_vpermilvarps ((__v4sf)__A, + (__v4si)__C); } - -__funline __m256 _mm256_permutevar_ps(__m256 __A, __m256i __C) { - return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__A, (__v8si)__C); +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permutevar_ps (__m256 __A, __m256i __C) +{ + return (__m256) __builtin_ia32_vpermilvarps256 ((__v8sf)__A, + (__v8si)__C); } - #ifdef __OPTIMIZE__ -__funline __m128d _mm_permute_pd(__m128d __X, const int __C) { - return (__m128d)__builtin_ia32_vpermilpd((__v2df)__X, __C); +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_permute_pd (__m128d __X, const int __C) +{ + return (__m128d) __builtin_ia32_vpermilpd ((__v2df)__X, __C); } - -__funline __m256d _mm256_permute_pd(__m256d __X, const int __C) { - return (__m256d)__builtin_ia32_vpermilpd256((__v4df)__X, __C); +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permute_pd (__m256d __X, const int __C) +{ + return (__m256d) __builtin_ia32_vpermilpd256 ((__v4df)__X, __C); } - -__funline __m128 _mm_permute_ps(__m128 __X, const int __C) { - return (__m128)__builtin_ia32_vpermilps((__v4sf)__X, __C); +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_permute_ps (__m128 __X, const int __C) +{ + return (__m128) __builtin_ia32_vpermilps ((__v4sf)__X, __C); } - -__funline __m256 _mm256_permute_ps(__m256 __X, const int __C) { - return (__m256)__builtin_ia32_vpermilps256((__v8sf)__X, __C); +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permute_ps (__m256 __X, const int __C) +{ + return (__m256) __builtin_ia32_vpermilps256 ((__v8sf)__X, __C); } #else -#define _mm_permute_pd(X, C) \ - ((__m128d)__builtin_ia32_vpermilpd((__v2df)(__m128d)(X), (int)(C))) - -#define _mm256_permute_pd(X, C) \ - ((__m256d)__builtin_ia32_vpermilpd256((__v4df)(__m256d)(X), (int)(C))) - -#define _mm_permute_ps(X, C) \ - 
((__m128)__builtin_ia32_vpermilps((__v4sf)(__m128)(X), (int)(C))) - -#define _mm256_permute_ps(X, C) \ - ((__m256)__builtin_ia32_vpermilps256((__v8sf)(__m256)(X), (int)(C))) +#define _mm_permute_pd(X, C) ((__m128d) __builtin_ia32_vpermilpd ((__v2df)(__m128d)(X), (int)(C))) +#define _mm256_permute_pd(X, C) ((__m256d) __builtin_ia32_vpermilpd256 ((__v4df)(__m256d)(X), (int)(C))) +#define _mm_permute_ps(X, C) ((__m128) __builtin_ia32_vpermilps ((__v4sf)(__m128)(X), (int)(C))) +#define _mm256_permute_ps(X, C) ((__m256) __builtin_ia32_vpermilps256 ((__v8sf)(__m256)(X), (int)(C))) #endif - #ifdef __OPTIMIZE__ -__funline __m256d _mm256_permute2f128_pd(__m256d __X, __m256d __Y, - const int __C) { - return (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)__X, (__v4df)__Y, - __C); +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permute2f128_pd (__m256d __X, __m256d __Y, const int __C) +{ + return (__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)__X, + (__v4df)__Y, + __C); } - -__funline __m256 _mm256_permute2f128_ps(__m256 __X, __m256 __Y, const int __C) { - return (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)__X, (__v8sf)__Y, __C); +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permute2f128_ps (__m256 __X, __m256 __Y, const int __C) +{ + return (__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)__X, + (__v8sf)__Y, + __C); } - -__funline __m256i _mm256_permute2f128_si256(__m256i __X, __m256i __Y, - const int __C) { - return (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)__X, (__v8si)__Y, - __C); +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permute2f128_si256 (__m256i __X, __m256i __Y, const int __C) +{ + return (__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)__X, + (__v8si)__Y, + __C); } #else -#define _mm256_permute2f128_pd(X, Y, C) \ - ((__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(X), \ - (__v4df)(__m256d)(Y), (int)(C))) - -#define _mm256_permute2f128_ps(X, Y, C) \ - ((__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(X), \ - (__v8sf)(__m256)(Y), (int)(C))) - -#define _mm256_permute2f128_si256(X, Y, C) \ - ((__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(X), \ - (__v8si)(__m256i)(Y), (int)(C))) +#define _mm256_permute2f128_pd(X, Y, C) ((__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)(__m256d)(X), (__v4df)(__m256d)(Y), (int)(C))) +#define _mm256_permute2f128_ps(X, Y, C) ((__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)(__m256)(X), (__v8sf)(__m256)(Y), (int)(C))) +#define _mm256_permute2f128_si256(X, Y, C) ((__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)(__m256i)(X), (__v8si)(__m256i)(Y), (int)(C))) #endif - -__funline __m128 _mm_broadcast_ss(float const *__X) { - return (__m128)__builtin_ia32_vbroadcastss(__X); +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_broadcast_ss (float const *__X) +{ + return (__m128) __builtin_ia32_vbroadcastss (__X); } - -__funline __m256d _mm256_broadcast_sd(double const *__X) { - return (__m256d)__builtin_ia32_vbroadcastsd256(__X); +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_broadcast_sd (double const *__X) +{ + return (__m256d) __builtin_ia32_vbroadcastsd256 (__X); } - -__funline __m256 _mm256_broadcast_ss(float const *__X) { - return (__m256)__builtin_ia32_vbroadcastss256(__X); +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, 
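[For reference, a hedged sketch of how the vperm2f128 control byte behaves; the helper name is illustrative only. Bits 1:0 select the source half for the low 128 bits of the result, bits 5:4 select the half for the high 128 bits.]

#include <immintrin.h>

__m256 interleave_high_halves(__m256 a, __m256 b) {
  /* 0x31: result.lo = high half of a (1), result.hi = high half of b (3). */
  return _mm256_permute2f128_ps(a, b, 0x31);
}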
__artificial__)) +_mm256_broadcast_ss (float const *__X) +{ + return (__m256) __builtin_ia32_vbroadcastss256 (__X); } - -__funline __m256d _mm256_broadcast_pd(__m128d const *__X) { - return (__m256d)__builtin_ia32_vbroadcastf128_pd256(__X); +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_broadcast_pd (__m128d const *__X) +{ + return (__m256d) __builtin_ia32_vbroadcastf128_pd256 (__X); } - -__funline __m256 _mm256_broadcast_ps(__m128 const *__X) { - return (__m256)__builtin_ia32_vbroadcastf128_ps256(__X); +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_broadcast_ps (__m128 const *__X) +{ + return (__m256) __builtin_ia32_vbroadcastf128_ps256 (__X); } - #ifdef __OPTIMIZE__ -__funline __m256d _mm256_insertf128_pd(__m256d __X, __m128d __Y, const int __O) { - return (__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)__X, (__v2df)__Y, - __O); +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_insertf128_pd (__m256d __X, __m128d __Y, const int __O) +{ + return (__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)__X, + (__v2df)__Y, + __O); } - -__funline __m256 _mm256_insertf128_ps(__m256 __X, __m128 __Y, const int __O) { - return (__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)__X, (__v4sf)__Y, - __O); +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_insertf128_ps (__m256 __X, __m128 __Y, const int __O) +{ + return (__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)__X, + (__v4sf)__Y, + __O); } - -__funline __m256i _mm256_insertf128_si256(__m256i __X, __m128i __Y, - const int __O) { - return (__m256i)__builtin_ia32_vinsertf128_si256((__v8si)__X, (__v4si)__Y, - __O); +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_insertf128_si256 (__m256i __X, __m128i __Y, const int __O) +{ + return (__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)__X, + (__v4si)__Y, + __O); } - -__funline __m256i _mm256_insert_epi32(__m256i __X, int __D, int const __N) { - __m128i __Y = _mm256_extractf128_si256(__X, __N >> 2); - __Y = _mm_insert_epi32(__Y, __D, __N % 4); - return _mm256_insertf128_si256(__X, __Y, __N >> 2); +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_insert_epi32 (__m256i __X, int __D, int const __N) +{ + __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2); + __Y = _mm_insert_epi32 (__Y, __D, __N % 4); + return _mm256_insertf128_si256 (__X, __Y, __N >> 2); } - -__funline __m256i _mm256_insert_epi16(__m256i __X, int __D, int const __N) { - __m128i __Y = _mm256_extractf128_si256(__X, __N >> 3); - __Y = _mm_insert_epi16(__Y, __D, __N % 8); - return _mm256_insertf128_si256(__X, __Y, __N >> 3); +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_insert_epi16 (__m256i __X, int __D, int const __N) +{ + __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3); + __Y = _mm_insert_epi16 (__Y, __D, __N % 8); + return _mm256_insertf128_si256 (__X, __Y, __N >> 3); } - -__funline __m256i _mm256_insert_epi8(__m256i __X, int __D, int const __N) { - __m128i __Y = _mm256_extractf128_si256(__X, __N >> 4); - __Y = _mm_insert_epi8(__Y, __D, __N % 16); - return _mm256_insertf128_si256(__X, __Y, __N >> 4); +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_insert_epi8 (__m256i __X, int __D, int const __N) +{ + __m128i __Y = 
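[The insert helpers mirror the extract helpers: pull out the affected 128-bit lane, patch one element, write the lane back. A hedged sketch of the equivalent open-coded sequence, with a hypothetical helper name:]

#include <immintrin.h>

__m256i set_element_5(__m256i v, int d) {
  __m128i lane = _mm256_extractf128_si256(v, 5 >> 2); /* upper lane */
  lane = _mm_insert_epi32(lane, d, 5 % 4);
  return _mm256_insertf128_si256(v, lane, 5 >> 2);
}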
_mm256_extractf128_si256 (__X, __N >> 4); + __Y = _mm_insert_epi8 (__Y, __D, __N % 16); + return _mm256_insertf128_si256 (__X, __Y, __N >> 4); } - #ifdef __x86_64__ -__funline __m256i _mm256_insert_epi64(__m256i __X, long long __D, int const __N) { - __m128i __Y = _mm256_extractf128_si256(__X, __N >> 1); - __Y = _mm_insert_epi64(__Y, __D, __N % 2); - return _mm256_insertf128_si256(__X, __Y, __N >> 1); +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_insert_epi64 (__m256i __X, long long __D, int const __N) +{ + __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1); + __Y = _mm_insert_epi64 (__Y, __D, __N % 2); + return _mm256_insertf128_si256 (__X, __Y, __N >> 1); } #endif #else -#define _mm256_insertf128_pd(X, Y, O) \ - ((__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)(__m256d)(X), \ - (__v2df)(__m128d)(Y), (int)(O))) - -#define _mm256_insertf128_ps(X, Y, O) \ - ((__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)(__m256)(X), \ - (__v4sf)(__m128)(Y), (int)(O))) - -#define _mm256_insertf128_si256(X, Y, O) \ - ((__m256i)__builtin_ia32_vinsertf128_si256((__v8si)(__m256i)(X), \ - (__v4si)(__m128i)(Y), (int)(O))) - -#define _mm256_insert_epi32(X, D, N) \ - (__extension__({ \ - __m128i __Y = _mm256_extractf128_si256((X), (N) >> 2); \ - __Y = _mm_insert_epi32(__Y, (D), (N) % 4); \ - _mm256_insertf128_si256((X), __Y, (N) >> 2); \ - })) - -#define _mm256_insert_epi16(X, D, N) \ - (__extension__({ \ - __m128i __Y = _mm256_extractf128_si256((X), (N) >> 3); \ - __Y = _mm_insert_epi16(__Y, (D), (N) % 8); \ - _mm256_insertf128_si256((X), __Y, (N) >> 3); \ - })) - -#define _mm256_insert_epi8(X, D, N) \ - (__extension__({ \ - __m128i __Y = _mm256_extractf128_si256((X), (N) >> 4); \ - __Y = _mm_insert_epi8(__Y, (D), (N) % 16); \ - _mm256_insertf128_si256((X), __Y, (N) >> 4); \ - })) - +#define _mm256_insertf128_pd(X, Y, O) ((__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)(__m256d)(X), (__v2df)(__m128d)(Y), (int)(O))) +#define _mm256_insertf128_ps(X, Y, O) ((__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)(__m256)(X), (__v4sf)(__m128)(Y), (int)(O))) +#define _mm256_insertf128_si256(X, Y, O) ((__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)(__m256i)(X), (__v4si)(__m128i)(Y), (int)(O))) +#define _mm256_insert_epi32(X, D, N) (__extension__ ({ __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2); __Y = _mm_insert_epi32 (__Y, (D), (N) % 4); _mm256_insertf128_si256 ((X), __Y, (N) >> 2); })) +#define _mm256_insert_epi16(X, D, N) (__extension__ ({ __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3); __Y = _mm_insert_epi16 (__Y, (D), (N) % 8); _mm256_insertf128_si256 ((X), __Y, (N) >> 3); })) +#define _mm256_insert_epi8(X, D, N) (__extension__ ({ __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4); __Y = _mm_insert_epi8 (__Y, (D), (N) % 16); _mm256_insertf128_si256 ((X), __Y, (N) >> 4); })) #ifdef __x86_64__ -#define _mm256_insert_epi64(X, D, N) \ - (__extension__({ \ - __m128i __Y = _mm256_extractf128_si256((X), (N) >> 1); \ - __Y = _mm_insert_epi64(__Y, (D), (N) % 2); \ - _mm256_insertf128_si256((X), __Y, (N) >> 1); \ - })) +#define _mm256_insert_epi64(X, D, N) (__extension__ ({ __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1); __Y = _mm_insert_epi64 (__Y, (D), (N) % 2); _mm256_insertf128_si256 ((X), __Y, (N) >> 1); })) #endif #endif - -__funline __m256d _mm256_load_pd(double const *__P) { +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_load_pd (double const *__P) +{ return *(__m256d 
*)__P; } - -__funline void _mm256_store_pd(double *__P, __m256d __A) { +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_store_pd (double *__P, __m256d __A) +{ *(__m256d *)__P = __A; } - -__funline __m256 _mm256_load_ps(float const *__P) { +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_load_ps (float const *__P) +{ return *(__m256 *)__P; } - -__funline void _mm256_store_ps(float *__P, __m256 __A) { +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_store_ps (float *__P, __m256 __A) +{ *(__m256 *)__P = __A; } - -__funline __m256d _mm256_loadu_pd(double const *__P) { +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_loadu_pd (double const *__P) +{ return *(__m256d_u *)__P; } - -__funline void _mm256_storeu_pd(double *__P, __m256d __A) { +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_storeu_pd (double *__P, __m256d __A) +{ *(__m256d_u *)__P = __A; } - -__funline __m256 _mm256_loadu_ps(float const *__P) { +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_loadu_ps (float const *__P) +{ return *(__m256_u *)__P; } - -__funline void _mm256_storeu_ps(float *__P, __m256 __A) { +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_storeu_ps (float *__P, __m256 __A) +{ *(__m256_u *)__P = __A; } - -__funline __m256i _mm256_load_si256(__m256i const *__P) { +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_load_si256 (__m256i const *__P) +{ return *__P; } - -__funline void _mm256_store_si256(__m256i *__P, __m256i __A) { +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_store_si256 (__m256i *__P, __m256i __A) +{ *__P = __A; } - -__funline __m256i _mm256_loadu_si256(__m256i_u const *__P) { +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_loadu_si256 (__m256i_u const *__P) +{ return *__P; } - -__funline void _mm256_storeu_si256(__m256i_u *__P, __m256i __A) { +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_storeu_si256 (__m256i_u *__P, __m256i __A) +{ *__P = __A; } - -__funline __m128d _mm_maskload_pd(double const *__P, __m128i __M) { - return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__P, (__v2di)__M); +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskload_pd (double const *__P, __m128i __M) +{ + return (__m128d) __builtin_ia32_maskloadpd ((const __v2df *)__P, + (__v2di)__M); } - -__funline void _mm_maskstore_pd(double *__P, __m128i __M, __m128d __A) { - __builtin_ia32_maskstorepd((__v2df *)__P, (__v2di)__M, (__v2df)__A); +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskstore_pd (double *__P, __m128i __M, __m128d __A) +{ + __builtin_ia32_maskstorepd ((__v2df *)__P, (__v2di)__M, (__v2df)__A); } - -__funline __m256d _mm256_maskload_pd(double const *__P, __m256i __M) { - return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__P, - (__v4di)__M); +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskload_pd (double const *__P, __m256i __M) +{ + return (__m256d) __builtin_ia32_maskloadpd256 ((const __v4df *)__P, + 
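[A usage note assumed from the definitions rather than stated in the patch: the plain load/store forms dereference a __m256 pointer and therefore require 32-byte alignment, while the *_u forms go through unaligned vector types. Minimal sketch, hypothetical helper:]

#include <immintrin.h>

void copy8(const float *src, float *dst_aligned) {
  __m256 v = _mm256_loadu_ps(src); /* any alignment is fine */
  _mm256_store_ps(dst_aligned, v); /* caller must guarantee 32-byte alignment */
}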
(__v4di)__M); } - -__funline void _mm256_maskstore_pd(double *__P, __m256i __M, __m256d __A) { - __builtin_ia32_maskstorepd256((__v4df *)__P, (__v4di)__M, (__v4df)__A); +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskstore_pd (double *__P, __m256i __M, __m256d __A) +{ + __builtin_ia32_maskstorepd256 ((__v4df *)__P, (__v4di)__M, (__v4df)__A); } - -__funline __m128 _mm_maskload_ps(float const *__P, __m128i __M) { - return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__P, (__v4si)__M); +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskload_ps (float const *__P, __m128i __M) +{ + return (__m128) __builtin_ia32_maskloadps ((const __v4sf *)__P, + (__v4si)__M); } - -__funline void _mm_maskstore_ps(float *__P, __m128i __M, __m128 __A) { - __builtin_ia32_maskstoreps((__v4sf *)__P, (__v4si)__M, (__v4sf)__A); +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskstore_ps (float *__P, __m128i __M, __m128 __A) +{ + __builtin_ia32_maskstoreps ((__v4sf *)__P, (__v4si)__M, (__v4sf)__A); } - -__funline __m256 _mm256_maskload_ps(float const *__P, __m256i __M) { - return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__P, (__v8si)__M); +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskload_ps (float const *__P, __m256i __M) +{ + return (__m256) __builtin_ia32_maskloadps256 ((const __v8sf *)__P, + (__v8si)__M); } - -__funline void _mm256_maskstore_ps(float *__P, __m256i __M, __m256 __A) { - __builtin_ia32_maskstoreps256((__v8sf *)__P, (__v8si)__M, (__v8sf)__A); +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskstore_ps (float *__P, __m256i __M, __m256 __A) +{ + __builtin_ia32_maskstoreps256 ((__v8sf *)__P, (__v8si)__M, (__v8sf)__A); } - -__funline __m256 _mm256_movehdup_ps(__m256 __X) { - return (__m256)__builtin_ia32_movshdup256((__v8sf)__X); +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_movehdup_ps (__m256 __X) +{ + return (__m256) __builtin_ia32_movshdup256 ((__v8sf)__X); } - -__funline __m256 _mm256_moveldup_ps(__m256 __X) { - return (__m256)__builtin_ia32_movsldup256((__v8sf)__X); +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_moveldup_ps (__m256 __X) +{ + return (__m256) __builtin_ia32_movsldup256 ((__v8sf)__X); } - -__funline __m256d _mm256_movedup_pd(__m256d __X) { - return (__m256d)__builtin_ia32_movddup256((__v4df)__X); +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_movedup_pd (__m256d __X) +{ + return (__m256d) __builtin_ia32_movddup256 ((__v4df)__X); } - -__funline __m256i _mm256_lddqu_si256(__m256i const *__P) { - return (__m256i)__builtin_ia32_lddqu256((char const *)__P); +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_lddqu_si256 (__m256i const *__P) +{ + return (__m256i) __builtin_ia32_lddqu256 ((char const *)__P); } - -__funline void _mm256_stream_si256(__m256i *__A, __m256i __B) { - __builtin_ia32_movntdq256((__v4di *)__A, (__v4di)__B); +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_stream_si256 (__m256i *__A, __m256i __B) +{ + __builtin_ia32_movntdq256 ((__v4di *)__A, (__v4di)__B); } - -__funline void _mm256_stream_pd(double *__A, __m256d __B) { - __builtin_ia32_movntpd256(__A, 
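[A hedged example of the masked forms above: an element is loaded or stored only when the top bit of the corresponding mask lane is set, which allows tail handling without touching memory past the buffer. Helper name is illustrative.]

#include <immintrin.h>

double tail[3] = {1.0, 2.0, 3.0};

__m256d load_three(void) {
  /* -1 (all bits set) enables a lane; 0 leaves it as 0.0 and unread. */
  __m256i m = _mm256_setr_epi64x(-1, -1, -1, 0);
  return _mm256_maskload_pd(tail, m); /* {1.0, 2.0, 3.0, 0.0} */
}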
(__v4df)__B); +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_stream_pd (double *__A, __m256d __B) +{ + __builtin_ia32_movntpd256 (__A, (__v4df)__B); } - -__funline void _mm256_stream_ps(float *__P, __m256 __A) { - __builtin_ia32_movntps256(__P, (__v8sf)__A); +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_stream_ps (float *__P, __m256 __A) +{ + __builtin_ia32_movntps256 (__P, (__v8sf)__A); } - -__funline __m256 _mm256_rcp_ps(__m256 __A) { - return (__m256)__builtin_ia32_rcpps256((__v8sf)__A); +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_rcp_ps (__m256 __A) +{ + return (__m256) __builtin_ia32_rcpps256 ((__v8sf)__A); } - -__funline __m256 _mm256_rsqrt_ps(__m256 __A) { - return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__A); +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_rsqrt_ps (__m256 __A) +{ + return (__m256) __builtin_ia32_rsqrtps256 ((__v8sf)__A); } - -__funline __m256d _mm256_sqrt_pd(__m256d __A) { - return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__A); +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_sqrt_pd (__m256d __A) +{ + return (__m256d) __builtin_ia32_sqrtpd256 ((__v4df)__A); } - -__funline __m256 _mm256_sqrt_ps(__m256 __A) { - return (__m256)__builtin_ia32_sqrtps256((__v8sf)__A); +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_sqrt_ps (__m256 __A) +{ + return (__m256) __builtin_ia32_sqrtps256 ((__v8sf)__A); } - #ifdef __OPTIMIZE__ -__funline __m256d _mm256_round_pd(__m256d __V, const int __M) { - return (__m256d)__builtin_ia32_roundpd256((__v4df)__V, __M); +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_round_pd (__m256d __V, const int __M) +{ + return (__m256d) __builtin_ia32_roundpd256 ((__v4df)__V, __M); } - -__funline __m256 _mm256_round_ps(__m256 __V, const int __M) { - return (__m256)__builtin_ia32_roundps256((__v8sf)__V, __M); +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_round_ps (__m256 __V, const int __M) +{ + return (__m256) __builtin_ia32_roundps256 ((__v8sf)__V, __M); } #else -#define _mm256_round_pd(V, M) \ - ((__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (int)(M))) - -#define _mm256_round_ps(V, M) \ - ((__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (int)(M))) +#define _mm256_round_pd(V, M) ((__m256d) __builtin_ia32_roundpd256 ((__v4df)(__m256d)(V), (int)(M))) +#define _mm256_round_ps(V, M) ((__m256) __builtin_ia32_roundps256 ((__v8sf)(__m256)(V), (int)(M))) #endif - -#define _mm256_ceil_pd(V) _mm256_round_pd((V), _MM_FROUND_CEIL) -#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR) -#define _mm256_ceil_ps(V) _mm256_round_ps((V), _MM_FROUND_CEIL) -#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR) - -__funline __m256d _mm256_unpackhi_pd(__m256d __A, __m256d __B) { - return (__m256d)__builtin_ia32_unpckhpd256((__v4df)__A, (__v4df)__B); +#define _mm256_ceil_pd(V) _mm256_round_pd ((V), _MM_FROUND_CEIL) +#define _mm256_floor_pd(V) _mm256_round_pd ((V), _MM_FROUND_FLOOR) +#define _mm256_ceil_ps(V) _mm256_round_ps ((V), _MM_FROUND_CEIL) +#define _mm256_floor_ps(V) _mm256_round_ps ((V), _MM_FROUND_FLOOR) +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_unpackhi_pd 
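[The ceil/floor macros above are thin wrappers over the round intrinsics with a fixed rounding-control immediate; a hedged illustration with a hypothetical helper:]

#include <immintrin.h>

__m256d round_demo(__m256d x) {
  __m256d up   = _mm256_ceil_pd(x);  /* _mm256_round_pd(x, _MM_FROUND_CEIL) */
  __m256d down = _mm256_floor_pd(x); /* _mm256_round_pd(x, _MM_FROUND_FLOOR) */
  return _mm256_add_pd(up, down);
}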
(__m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_unpckhpd256 ((__v4df)__A, (__v4df)__B); } - -__funline __m256d _mm256_unpacklo_pd(__m256d __A, __m256d __B) { - return (__m256d)__builtin_ia32_unpcklpd256((__v4df)__A, (__v4df)__B); +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_unpacklo_pd (__m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_unpcklpd256 ((__v4df)__A, (__v4df)__B); } - -__funline __m256 _mm256_unpackhi_ps(__m256 __A, __m256 __B) { - return (__m256)__builtin_ia32_unpckhps256((__v8sf)__A, (__v8sf)__B); +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_unpackhi_ps (__m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_unpckhps256 ((__v8sf)__A, (__v8sf)__B); } - -__funline __m256 _mm256_unpacklo_ps(__m256 __A, __m256 __B) { - return (__m256)__builtin_ia32_unpcklps256((__v8sf)__A, (__v8sf)__B); +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_unpacklo_ps (__m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_unpcklps256 ((__v8sf)__A, (__v8sf)__B); } - -__funline int _mm_testz_pd(__m128d __M, __m128d __V) { - return __builtin_ia32_vtestzpd((__v2df)__M, (__v2df)__V); +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_testz_pd (__m128d __M, __m128d __V) +{ + return __builtin_ia32_vtestzpd ((__v2df)__M, (__v2df)__V); } - -__funline int _mm_testc_pd(__m128d __M, __m128d __V) { - return __builtin_ia32_vtestcpd((__v2df)__M, (__v2df)__V); +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_testc_pd (__m128d __M, __m128d __V) +{ + return __builtin_ia32_vtestcpd ((__v2df)__M, (__v2df)__V); } - -__funline int _mm_testnzc_pd(__m128d __M, __m128d __V) { - return __builtin_ia32_vtestnzcpd((__v2df)__M, (__v2df)__V); +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_testnzc_pd (__m128d __M, __m128d __V) +{ + return __builtin_ia32_vtestnzcpd ((__v2df)__M, (__v2df)__V); } - -__funline int _mm_testz_ps(__m128 __M, __m128 __V) { - return __builtin_ia32_vtestzps((__v4sf)__M, (__v4sf)__V); +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_testz_ps (__m128 __M, __m128 __V) +{ + return __builtin_ia32_vtestzps ((__v4sf)__M, (__v4sf)__V); } - -__funline int _mm_testc_ps(__m128 __M, __m128 __V) { - return __builtin_ia32_vtestcps((__v4sf)__M, (__v4sf)__V); +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_testc_ps (__m128 __M, __m128 __V) +{ + return __builtin_ia32_vtestcps ((__v4sf)__M, (__v4sf)__V); } - -__funline int _mm_testnzc_ps(__m128 __M, __m128 __V) { - return __builtin_ia32_vtestnzcps((__v4sf)__M, (__v4sf)__V); +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_testnzc_ps (__m128 __M, __m128 __V) +{ + return __builtin_ia32_vtestnzcps ((__v4sf)__M, (__v4sf)__V); } - -__funline int _mm256_testz_pd(__m256d __M, __m256d __V) { - return __builtin_ia32_vtestzpd256((__v4df)__M, (__v4df)__V); +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_testz_pd (__m256d __M, __m256d __V) +{ + return __builtin_ia32_vtestzpd256 ((__v4df)__M, (__v4df)__V); } - -__funline int _mm256_testc_pd(__m256d __M, __m256d __V) { - return __builtin_ia32_vtestcpd256((__v4df)__M, (__v4df)__V); +extern __inline int __attribute__((__gnu_inline__, 
__always_inline__, __artificial__)) +_mm256_testc_pd (__m256d __M, __m256d __V) +{ + return __builtin_ia32_vtestcpd256 ((__v4df)__M, (__v4df)__V); } - -__funline int _mm256_testnzc_pd(__m256d __M, __m256d __V) { - return __builtin_ia32_vtestnzcpd256((__v4df)__M, (__v4df)__V); +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_testnzc_pd (__m256d __M, __m256d __V) +{ + return __builtin_ia32_vtestnzcpd256 ((__v4df)__M, (__v4df)__V); } - -__funline int _mm256_testz_ps(__m256 __M, __m256 __V) { - return __builtin_ia32_vtestzps256((__v8sf)__M, (__v8sf)__V); +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_testz_ps (__m256 __M, __m256 __V) +{ + return __builtin_ia32_vtestzps256 ((__v8sf)__M, (__v8sf)__V); } - -__funline int _mm256_testc_ps(__m256 __M, __m256 __V) { - return __builtin_ia32_vtestcps256((__v8sf)__M, (__v8sf)__V); +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_testc_ps (__m256 __M, __m256 __V) +{ + return __builtin_ia32_vtestcps256 ((__v8sf)__M, (__v8sf)__V); } - -__funline int _mm256_testnzc_ps(__m256 __M, __m256 __V) { - return __builtin_ia32_vtestnzcps256((__v8sf)__M, (__v8sf)__V); +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_testnzc_ps (__m256 __M, __m256 __V) +{ + return __builtin_ia32_vtestnzcps256 ((__v8sf)__M, (__v8sf)__V); } - -__funline int _mm256_testz_si256(__m256i __M, __m256i __V) { - return __builtin_ia32_ptestz256((__v4di)__M, (__v4di)__V); +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_testz_si256 (__m256i __M, __m256i __V) +{ + return __builtin_ia32_ptestz256 ((__v4di)__M, (__v4di)__V); } - -__funline int _mm256_testc_si256(__m256i __M, __m256i __V) { - return __builtin_ia32_ptestc256((__v4di)__M, (__v4di)__V); +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_testc_si256 (__m256i __M, __m256i __V) +{ + return __builtin_ia32_ptestc256 ((__v4di)__M, (__v4di)__V); } - -__funline int _mm256_testnzc_si256(__m256i __M, __m256i __V) { - return __builtin_ia32_ptestnzc256((__v4di)__M, (__v4di)__V); +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_testnzc_si256 (__m256i __M, __m256i __V) +{ + return __builtin_ia32_ptestnzc256 ((__v4di)__M, (__v4di)__V); } - -__funline int _mm256_movemask_pd(__m256d __A) { - return __builtin_ia32_movmskpd256((__v4df)__A); +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_movemask_pd (__m256d __A) +{ + return __builtin_ia32_movmskpd256 ((__v4df)__A); } - -__funline int _mm256_movemask_ps(__m256 __A) { - return __builtin_ia32_movmskps256((__v8sf)__A); +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_movemask_ps (__m256 __A) +{ + return __builtin_ia32_movmskps256 ((__v8sf)__A); } - -__funline __m256d _mm256_undefined_pd(void) { +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_undefined_pd (void) +{ __m256d __Y = __Y; return __Y; } - -__funline __m256 _mm256_undefined_ps(void) { +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_undefined_ps (void) +{ __m256 __Y = __Y; return __Y; } - -__funline __m256i _mm256_undefined_si256(void) { +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 
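[A sketch of how the test and movemask families above are typically used (assumed usage, not from the patch): the vtest forms set flags from sign bits for branching, and movemask materializes the sign bits as an integer.]

#include <immintrin.h>

int any_negative(__m256d x) {
  /* Bit i of the result is the sign bit of lane i. */
  return _mm256_movemask_pd(x) != 0;
}

int all_zero(__m256i x) {
  /* testz returns 1 when (x & x) == 0, i.e. every bit is clear. */
  return _mm256_testz_si256(x, x);
}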
+_mm256_undefined_si256 (void) +{ __m256i __Y = __Y; return __Y; } - -__funline __m256d _mm256_setzero_pd(void) { - return __extension__(__m256d){0.0, 0.0, 0.0, 0.0}; +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_setzero_pd (void) +{ + return __extension__ (__m256d){ 0.0, 0.0, 0.0, 0.0 }; } - -__funline __m256 _mm256_setzero_ps(void) { - return __extension__(__m256){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_setzero_ps (void) +{ + return __extension__ (__m256){ 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0 }; } - -__funline __m256i _mm256_setzero_si256(void) { - return __extension__(__m256i)(__v4di){0, 0, 0, 0}; +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_setzero_si256 (void) +{ + return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 }; } - -__funline __m256d _mm256_set_pd(double __A, double __B, double __C, double __D) { - return __extension__(__m256d){__D, __C, __B, __A}; +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_set_pd (double __A, double __B, double __C, double __D) +{ + return __extension__ (__m256d){ __D, __C, __B, __A }; } - -__funline __m256 _mm256_set_ps(float __A, float __B, float __C, float __D, - float __E, float __F, float __G, float __H) { - return __extension__(__m256){__H, __G, __F, __E, __D, __C, __B, __A}; +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_set_ps (float __A, float __B, float __C, float __D, + float __E, float __F, float __G, float __H) +{ + return __extension__ (__m256){ __H, __G, __F, __E, + __D, __C, __B, __A }; } - -__funline __m256i _mm256_set_epi32(int __A, int __B, int __C, int __D, int __E, - int __F, int __G, int __H) { - return __extension__(__m256i)(__v8si){__H, __G, __F, __E, __D, __C, __B, __A}; +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_set_epi32 (int __A, int __B, int __C, int __D, + int __E, int __F, int __G, int __H) +{ + return __extension__ (__m256i)(__v8si){ __H, __G, __F, __E, + __D, __C, __B, __A }; } - -__funline __m256i _mm256_set_epi16(short __q15, short __q14, short __q13, - short __q12, short __q11, short __q10, - short __q09, short __q08, short __q07, - short __q06, short __q05, short __q04, - short __q03, short __q02, short __q01, - short __q00) { - return __extension__(__m256i)(__v16hi){ - __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07, - __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15}; +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_set_epi16 (short __q15, short __q14, short __q13, short __q12, + short __q11, short __q10, short __q09, short __q08, + short __q07, short __q06, short __q05, short __q04, + short __q03, short __q02, short __q01, short __q00) +{ + return __extension__ (__m256i)(__v16hi){ + __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07, + __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15 + }; } - -__funline __m256i _mm256_set_epi8(char __q31, char __q30, char __q29, char __q28, - char __q27, char __q26, char __q25, char __q24, - char __q23, char __q22, char __q21, char __q20, - char __q19, char __q18, char __q17, char __q16, - char __q15, char __q14, char __q13, char __q12, - char __q11, char __q10, char __q09, char __q08, - char __q07, char __q06, char __q05, char __q04, - char 
__q03, char __q02, char __q01, - char __q00) { - return __extension__(__m256i)(__v32qi){ - __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07, - __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15, - __q16, __q17, __q18, __q19, __q20, __q21, __q22, __q23, - __q24, __q25, __q26, __q27, __q28, __q29, __q30, __q31}; +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_set_epi8 (char __q31, char __q30, char __q29, char __q28, + char __q27, char __q26, char __q25, char __q24, + char __q23, char __q22, char __q21, char __q20, + char __q19, char __q18, char __q17, char __q16, + char __q15, char __q14, char __q13, char __q12, + char __q11, char __q10, char __q09, char __q08, + char __q07, char __q06, char __q05, char __q04, + char __q03, char __q02, char __q01, char __q00) +{ + return __extension__ (__m256i)(__v32qi){ + __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07, + __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15, + __q16, __q17, __q18, __q19, __q20, __q21, __q22, __q23, + __q24, __q25, __q26, __q27, __q28, __q29, __q30, __q31 + }; } - -__funline __m256i _mm256_set_epi64x(long long __A, long long __B, long long __C, - long long __D) { - return __extension__(__m256i)(__v4di){__D, __C, __B, __A}; +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_set_epi64x (long long __A, long long __B, long long __C, + long long __D) +{ + return __extension__ (__m256i)(__v4di){ __D, __C, __B, __A }; } - -__funline __m256d _mm256_set1_pd(double __A) { - return __extension__(__m256d){__A, __A, __A, __A}; +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_set1_pd (double __A) +{ + return __extension__ (__m256d){ __A, __A, __A, __A }; } - -__funline __m256 _mm256_set1_ps(float __A) { - return __extension__(__m256){__A, __A, __A, __A, __A, __A, __A, __A}; +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_set1_ps (float __A) +{ + return __extension__ (__m256){ __A, __A, __A, __A, + __A, __A, __A, __A }; } - -__funline __m256i _mm256_set1_epi32(int __A) { - return __extension__(__m256i)(__v8si){__A, __A, __A, __A, __A, __A, __A, __A}; +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_set1_epi32 (int __A) +{ + return __extension__ (__m256i)(__v8si){ __A, __A, __A, __A, + __A, __A, __A, __A }; } - -__funline __m256i _mm256_set1_epi16(short __A) { - return _mm256_set_epi16(__A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, - __A, __A, __A, __A, __A); +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_set1_epi16 (short __A) +{ + return _mm256_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A, + __A, __A, __A, __A, __A, __A, __A, __A); } - -__funline __m256i _mm256_set1_epi8(char __A) { - return _mm256_set_epi8(__A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, - __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, - __A, __A, __A, __A, __A, __A, __A, __A, __A, __A); +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_set1_epi8 (char __A) +{ + return _mm256_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A, + __A, __A, __A, __A, __A, __A, __A, __A, + __A, __A, __A, __A, __A, __A, __A, __A, + __A, __A, __A, __A, __A, __A, __A, __A); } - -__funline __m256i _mm256_set1_epi64x(long long __A) { - return __extension__(__m256i)(__v4di){__A, __A, __A, __A}; +extern 
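[One detail the initializers above encode, as a usage note: _mm256_set_* takes arguments from the highest element down to element 0, which is why the braced initializers list the parameters in reverse; the setr forms take them in memory order. Hypothetical helper:]

#include <immintrin.h>

__m256i order_demo(void) {
  __m256i a = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
  /* Element 0 of a is 0; _mm256_setr_epi32 with the same arguments
     would instead put 7 in element 0. */
  return a;
}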
__inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_set1_epi64x (long long __A) +{ + return __extension__ (__m256i)(__v4di){ __A, __A, __A, __A }; } - -__funline __m256d _mm256_setr_pd(double __A, double __B, double __C, double __D) { - return _mm256_set_pd(__D, __C, __B, __A); +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_setr_pd (double __A, double __B, double __C, double __D) +{ + return _mm256_set_pd (__D, __C, __B, __A); } - -__funline __m256 _mm256_setr_ps(float __A, float __B, float __C, float __D, - float __E, float __F, float __G, float __H) { - return _mm256_set_ps(__H, __G, __F, __E, __D, __C, __B, __A); +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_setr_ps (float __A, float __B, float __C, float __D, + float __E, float __F, float __G, float __H) +{ + return _mm256_set_ps (__H, __G, __F, __E, __D, __C, __B, __A); } - -__funline __m256i _mm256_setr_epi32(int __A, int __B, int __C, int __D, int __E, - int __F, int __G, int __H) { - return _mm256_set_epi32(__H, __G, __F, __E, __D, __C, __B, __A); +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_setr_epi32 (int __A, int __B, int __C, int __D, + int __E, int __F, int __G, int __H) +{ + return _mm256_set_epi32 (__H, __G, __F, __E, __D, __C, __B, __A); } - -__funline __m256i _mm256_setr_epi16(short __q15, short __q14, short __q13, - short __q12, short __q11, short __q10, - short __q09, short __q08, short __q07, - short __q06, short __q05, short __q04, - short __q03, short __q02, short __q01, - short __q00) { - return _mm256_set_epi16(__q00, __q01, __q02, __q03, __q04, __q05, __q06, - __q07, __q08, __q09, __q10, __q11, __q12, __q13, - __q14, __q15); +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_setr_epi16 (short __q15, short __q14, short __q13, short __q12, + short __q11, short __q10, short __q09, short __q08, + short __q07, short __q06, short __q05, short __q04, + short __q03, short __q02, short __q01, short __q00) +{ + return _mm256_set_epi16 (__q00, __q01, __q02, __q03, + __q04, __q05, __q06, __q07, + __q08, __q09, __q10, __q11, + __q12, __q13, __q14, __q15); } - -__funline __m256i _mm256_setr_epi8(char __q31, char __q30, char __q29, char __q28, - char __q27, char __q26, char __q25, char __q24, - char __q23, char __q22, char __q21, char __q20, - char __q19, char __q18, char __q17, char __q16, - char __q15, char __q14, char __q13, char __q12, - char __q11, char __q10, char __q09, char __q08, - char __q07, char __q06, char __q05, char __q04, - char __q03, char __q02, char __q01, - char __q00) { - return _mm256_set_epi8(__q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07, - __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15, - __q16, __q17, __q18, __q19, __q20, __q21, __q22, __q23, - __q24, __q25, __q26, __q27, __q28, __q29, __q30, - __q31); +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_setr_epi8 (char __q31, char __q30, char __q29, char __q28, + char __q27, char __q26, char __q25, char __q24, + char __q23, char __q22, char __q21, char __q20, + char __q19, char __q18, char __q17, char __q16, + char __q15, char __q14, char __q13, char __q12, + char __q11, char __q10, char __q09, char __q08, + char __q07, char __q06, char __q05, char __q04, + char __q03, char __q02, char __q01, char __q00) +{ + return _mm256_set_epi8 (__q00, __q01, 
__q02, __q03, + __q04, __q05, __q06, __q07, + __q08, __q09, __q10, __q11, + __q12, __q13, __q14, __q15, + __q16, __q17, __q18, __q19, + __q20, __q21, __q22, __q23, + __q24, __q25, __q26, __q27, + __q28, __q29, __q30, __q31); } - -__funline __m256i _mm256_setr_epi64x(long long __A, long long __B, long long __C, - long long __D) { - return _mm256_set_epi64x(__D, __C, __B, __A); +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_setr_epi64x (long long __A, long long __B, long long __C, + long long __D) +{ + return _mm256_set_epi64x (__D, __C, __B, __A); } - -__funline __m256 _mm256_castpd_ps(__m256d __A) { - return (__m256)__A; +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castpd_ps (__m256d __A) +{ + return (__m256) __A; } - -__funline __m256i _mm256_castpd_si256(__m256d __A) { - return (__m256i)__A; +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castpd_si256 (__m256d __A) +{ + return (__m256i) __A; } - -__funline __m256d _mm256_castps_pd(__m256 __A) { - return (__m256d)__A; +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castps_pd (__m256 __A) +{ + return (__m256d) __A; } - -__funline __m256i _mm256_castps_si256(__m256 __A) { - return (__m256i)__A; +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castps_si256(__m256 __A) +{ + return (__m256i) __A; } - -__funline __m256 _mm256_castsi256_ps(__m256i __A) { - return (__m256)__A; +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castsi256_ps (__m256i __A) +{ + return (__m256) __A; } - -__funline __m256d _mm256_castsi256_pd(__m256i __A) { - return (__m256d)__A; +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castsi256_pd (__m256i __A) +{ + return (__m256d) __A; } - -__funline __m128d _mm256_castpd256_pd128(__m256d __A) { - return (__m128d)__builtin_ia32_pd_pd256((__v4df)__A); +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castpd256_pd128 (__m256d __A) +{ + return (__m128d) __builtin_ia32_pd_pd256 ((__v4df)__A); } - -__funline __m128 _mm256_castps256_ps128(__m256 __A) { - return (__m128)__builtin_ia32_ps_ps256((__v8sf)__A); +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castps256_ps128 (__m256 __A) +{ + return (__m128) __builtin_ia32_ps_ps256 ((__v8sf)__A); } - -__funline __m128i _mm256_castsi256_si128(__m256i __A) { - return (__m128i)__builtin_ia32_si_si256((__v8si)__A); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castsi256_si128 (__m256i __A) +{ + return (__m128i) __builtin_ia32_si_si256 ((__v8si)__A); } - -__funline __m256d _mm256_castpd128_pd256(__m128d __A) { - return (__m256d)__builtin_ia32_pd256_pd((__v2df)__A); +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castpd128_pd256 (__m128d __A) +{ + return (__m256d) __builtin_ia32_pd256_pd ((__v2df)__A); } - -__funline __m256 _mm256_castps128_ps256(__m128 __A) { - return (__m256)__builtin_ia32_ps256_ps((__v4sf)__A); +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castps128_ps256 (__m128 __A) +{ + return (__m256) __builtin_ia32_ps256_ps ((__v4sf)__A); } - -__funline __m256i 
_mm256_castsi128_si256(__m128i __A) { - return (__m256i)__builtin_ia32_si256_si((__v4si)__A); +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castsi128_si256 (__m128i __A) +{ + return (__m256i) __builtin_ia32_si256_si ((__v4si)__A); } - -__funline __m256 _mm256_set_m128(__m128 __H, __m128 __L) { - return _mm256_insertf128_ps(_mm256_castps128_ps256(__L), __H, 1); +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_zextpd128_pd256 (__m128d __A) +{ + return _mm256_insertf128_pd (_mm256_setzero_pd (), __A, 0); } - -__funline __m256d _mm256_set_m128d(__m128d __H, __m128d __L) { - return _mm256_insertf128_pd(_mm256_castpd128_pd256(__L), __H, 1); +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_zextps128_ps256 (__m128 __A) +{ + return _mm256_insertf128_ps (_mm256_setzero_ps (), __A, 0); } - -__funline __m256i _mm256_set_m128i(__m128i __H, __m128i __L) { - return _mm256_insertf128_si256(_mm256_castsi128_si256(__L), __H, 1); +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_zextsi128_si256 (__m128i __A) +{ + return _mm256_insertf128_si256 (_mm256_setzero_si256 (), __A, 0); } - -__funline __m256 _mm256_setr_m128(__m128 __L, __m128 __H) { - return _mm256_set_m128(__H, __L); +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_set_m128 ( __m128 __H, __m128 __L) +{ + return _mm256_insertf128_ps (_mm256_castps128_ps256 (__L), __H, 1); } - -__funline __m256d _mm256_setr_m128d(__m128d __L, __m128d __H) { - return _mm256_set_m128d(__H, __L); +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_set_m128d (__m128d __H, __m128d __L) +{ + return _mm256_insertf128_pd (_mm256_castpd128_pd256 (__L), __H, 1); } - -__funline __m256i _mm256_setr_m128i(__m128i __L, __m128i __H) { - return _mm256_set_m128i(__H, __L); +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_set_m128i (__m128i __H, __m128i __L) +{ + return _mm256_insertf128_si256 (_mm256_castsi128_si256 (__L), __H, 1); +} +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_setr_m128 (__m128 __L, __m128 __H) +{ + return _mm256_set_m128 (__H, __L); +} +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_setr_m128d (__m128d __L, __m128d __H) +{ + return _mm256_set_m128d (__H, __L); +} +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_setr_m128i (__m128i __L, __m128i __H) +{ + return _mm256_set_m128i (__H, __L); +} +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_loadu2_m128 (float const *__PH, float const *__PL) +{ + return _mm256_insertf128_ps (_mm256_castps128_ps256 (_mm_loadu_ps (__PL)), + _mm_loadu_ps (__PH), 1); +} +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_storeu2_m128 (float *__PH, float *__PL, __m256 __A) +{ + _mm_storeu_ps (__PL, _mm256_castps256_ps128 (__A)); + _mm_storeu_ps (__PH, _mm256_extractf128_ps (__A, 1)); +} +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_loadu2_m128d (double const *__PH, double const *__PL) +{ + return _mm256_insertf128_pd (_mm256_castpd128_pd256 (_mm_loadu_pd (__PL)), + _mm_loadu_pd (__PH), 1); 
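[The newly added _mm256_zext*128 helpers differ from the plain 128-to-256 casts in a way that matters for correctness: a cast leaves the upper 128 bits undefined, while the zext form forces them to zero. A hedged sketch:]

#include <immintrin.h>

__m256d widen(__m128d lo) {
  /* _mm256_castpd128_pd256(lo) would leave the upper lane undefined;
     the zext form guarantees it is zeroed. */
  return _mm256_zextpd128_pd256(lo);
}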
+} +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_storeu2_m128d (double *__PH, double *__PL, __m256d __A) +{ + _mm_storeu_pd (__PL, _mm256_castpd256_pd128 (__A)); + _mm_storeu_pd (__PH, _mm256_extractf128_pd (__A, 1)); +} +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_loadu2_m128i (__m128i_u const *__PH, __m128i_u const *__PL) +{ + return _mm256_insertf128_si256 (_mm256_castsi128_si256 (_mm_loadu_si128 (__PL)), + _mm_loadu_si128 (__PH), 1); +} +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_storeu2_m128i (__m128i_u *__PH, __m128i_u *__PL, __m256i __A) +{ + _mm_storeu_si128 (__PL, _mm256_castsi256_si128 (__A)); + _mm_storeu_si128 (__PH, _mm256_extractf128_si256 (__A, 1)); } - #ifdef __DISABLE_AVX__ #undef __DISABLE_AVX__ #pragma GCC pop_options -#endif /* __DISABLE_AVX__ */ - -#endif /* _AVXINTRIN_H_INCLUDED */ +#endif +#endif +#endif diff --git a/third_party/intel/avxvnniintrin.internal.h b/third_party/intel/avxvnniintrin.internal.h new file mode 100644 index 000000000..ddb392944 --- /dev/null +++ b/third_party/intel/avxvnniintrin.internal.h @@ -0,0 +1,82 @@ +/* clang-format off */ +#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0) +#ifndef _IMMINTRIN_H_INCLUDED +#error "Never use directly; include instead." +#endif +#ifndef _AVXVNNIINTRIN_H_INCLUDED +#define _AVXVNNIINTRIN_H_INCLUDED +#if !defined(__AVXVNNI__) +#pragma GCC push_options +#pragma GCC target("avxvnni") +#define __DISABLE_AVXVNNIVL__ +#endif +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_dpbusd_avx_epi32(__m256i __A, __m256i __B, __m256i __C) +{ + return (__m256i) __builtin_ia32_vpdpbusd_v8si ((__v8si) __A, + (__v8si) __B, + (__v8si) __C); +} +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_dpbusd_avx_epi32(__m128i __A, __m128i __B, __m128i __C) +{ + return (__m128i) __builtin_ia32_vpdpbusd_v4si ((__v4si) __A, + (__v4si) __B, + (__v4si) __C); +} +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_dpbusds_avx_epi32(__m256i __A, __m256i __B, __m256i __C) +{ + return (__m256i) __builtin_ia32_vpdpbusds_v8si ((__v8si) __A, + (__v8si) __B, + (__v8si) __C); +} +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_dpbusds_avx_epi32(__m128i __A,__m128i __B,__m128i __C) +{ + return (__m128i) __builtin_ia32_vpdpbusds_v4si ((__v4si) __A, + (__v4si) __B, + (__v4si) __C); +} +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_dpwssd_avx_epi32(__m256i __A,__m256i __B,__m256i __C) +{ + return (__m256i) __builtin_ia32_vpdpwssd_v8si ((__v8si) __A, + (__v8si) __B, + (__v8si) __C); +} +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_dpwssd_avx_epi32(__m128i __A,__m128i __B,__m128i __C) +{ + return (__m128i) __builtin_ia32_vpdpwssd_v4si ((__v4si) __A, + (__v4si) __B, + (__v4si) __C); +} +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_dpwssds_avx_epi32(__m256i __A,__m256i __B,__m256i __C) +{ + return (__m256i) __builtin_ia32_vpdpwssds_v8si ((__v8si) __A, + (__v8si) __B, + (__v8si) __C); +} +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_dpwssds_avx_epi32(__m128i __A,__m128i __B,__m128i __C) +{ + 
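[For context on the new AVX-VNNI header, a hedged sketch assuming -mavxvnni: vpdpbusd fuses the unsigned-byte by signed-byte multiplies and the horizontal add of each group of four products into one accumulate, the core step of int8 dot-product kernels.]

#include <immintrin.h>

__m256i dot_step(__m256i acc, __m256i u8s, __m256i s8s) {
  /* Each 32-bit lane of acc gains the sum of four u8*s8 products. */
  return _mm256_dpbusd_avx_epi32(acc, u8s, s8s);
}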
return (__m128i) __builtin_ia32_vpdpwssds_v4si ((__v4si) __A, + (__v4si) __B, + (__v4si) __C); +} +#ifdef __DISABLE_AVXVNNIVL__ +#undef __DISABLE_AVXVNNIVL__ +#pragma GCC pop_options +#endif +#endif +#endif diff --git a/third_party/intel/bmi2intrin.internal.h b/third_party/intel/bmi2intrin.internal.h index 15ba16ae7..32bbde82d 100644 --- a/third_party/intel/bmi2intrin.internal.h +++ b/third_party/intel/bmi2intrin.internal.h @@ -1,67 +1,74 @@ -#if !defined _X86INTRIN_H_INCLUDED && !defined _IMMINTRIN_H_INCLUDED -#error "Never use directly; include instead." +/* clang-format off */ +#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0) +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use directly; include instead." #endif - #ifndef _BMI2INTRIN_H_INCLUDED #define _BMI2INTRIN_H_INCLUDED - #ifndef __BMI2__ #pragma GCC push_options #pragma GCC target("bmi2") #define __DISABLE_BMI2__ -#endif /* __BMI2__ */ - -__funline unsigned int _bzhi_u32(unsigned int __X, unsigned int __Y) { - return __builtin_ia32_bzhi_si(__X, __Y); +#endif +extern __inline unsigned int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_bzhi_u32 (unsigned int __X, unsigned int __Y) +{ + return __builtin_ia32_bzhi_si (__X, __Y); } - -__funline unsigned int _pdep_u32(unsigned int __X, unsigned int __Y) { - return __builtin_ia32_pdep_si(__X, __Y); +extern __inline unsigned int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_pdep_u32 (unsigned int __X, unsigned int __Y) +{ + return __builtin_ia32_pdep_si (__X, __Y); } - -__funline unsigned int _pext_u32(unsigned int __X, unsigned int __Y) { - return __builtin_ia32_pext_si(__X, __Y); +extern __inline unsigned int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_pext_u32 (unsigned int __X, unsigned int __Y) +{ + return __builtin_ia32_pext_si (__X, __Y); } - #ifdef __x86_64__ - -__funline unsigned long long _bzhi_u64(unsigned long long __X, - unsigned long long __Y) { - return __builtin_ia32_bzhi_di(__X, __Y); +extern __inline unsigned long long +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_bzhi_u64 (unsigned long long __X, unsigned long long __Y) +{ + return __builtin_ia32_bzhi_di (__X, __Y); } - -__funline unsigned long long _pdep_u64(unsigned long long __X, - unsigned long long __Y) { - return __builtin_ia32_pdep_di(__X, __Y); +extern __inline unsigned long long +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_pdep_u64 (unsigned long long __X, unsigned long long __Y) +{ + return __builtin_ia32_pdep_di (__X, __Y); } - -__funline unsigned long long _pext_u64(unsigned long long __X, - unsigned long long __Y) { - return __builtin_ia32_pext_di(__X, __Y); +extern __inline unsigned long long +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_pext_u64 (unsigned long long __X, unsigned long long __Y) +{ + return __builtin_ia32_pext_di (__X, __Y); } - -__funline unsigned long long _mulx_u64(unsigned long long __X, - unsigned long long __Y, - unsigned long long *__P) { - unsigned __int128 __res = (unsigned __int128)__X * __Y; - *__P = (unsigned long long)(__res >> 64); - return (unsigned long long)__res; +extern __inline unsigned long long +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mulx_u64 (unsigned long long __X, unsigned long long __Y, + unsigned long long *__P) +{ + unsigned __int128 __res = (unsigned __int128) __X * __Y; + *__P = (unsigned long long) (__res >> 64); + return (unsigned long long) __res; } - -#else /* !__x86_64__ 
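[A hedged illustration of the BMI2 bit-deposit/extract pair converted above, assuming -mbmi2: pext gathers the bits selected by a mask down to the low end, and pdep scatters low-order bits back into the mask positions.]

#include <immintrin.h>

unsigned nibble_demo(void) {
  unsigned packed = _pext_u32(0xabcd, 0x0f0f); /* 0xbd: nibbles 0 and 2 */
  return _pdep_u32(packed, 0x0f0f);            /* 0x0b0d: scattered back */
}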
*/ - -__funline unsigned int _mulx_u32(unsigned int __X, unsigned int __Y, - unsigned int *__P) { - unsigned long long __res = (unsigned long long)__X * __Y; - *__P = (unsigned int)(__res >> 32); - return (unsigned int)__res; +#else +extern __inline unsigned int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mulx_u32 (unsigned int __X, unsigned int __Y, unsigned int *__P) +{ + unsigned long long __res = (unsigned long long) __X * __Y; + *__P = (unsigned int) (__res >> 32); + return (unsigned int) __res; } - -#endif /* !__x86_64__ */ - +#endif #ifdef __DISABLE_BMI2__ #undef __DISABLE_BMI2__ #pragma GCC pop_options -#endif /* __DISABLE_BMI2__ */ - -#endif /* _BMI2INTRIN_H_INCLUDED */ +#endif +#endif +#endif diff --git a/third_party/intel/bmiintrin.internal.h b/third_party/intel/bmiintrin.internal.h index 9a4bab63b..7425e0db9 100644 --- a/third_party/intel/bmiintrin.internal.h +++ b/third_party/intel/bmiintrin.internal.h @@ -1,160 +1,135 @@ -#if !defined _X86INTRIN_H_INCLUDED && !defined _IMMINTRIN_H_INCLUDED -#error "Never use directly; include instead." +/* clang-format off */ +#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0) +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use directly; include instead." #endif - #ifndef _BMIINTRIN_H_INCLUDED #define _BMIINTRIN_H_INCLUDED - #ifndef __BMI__ #pragma GCC push_options #pragma GCC target("bmi") #define __DISABLE_BMI__ -#endif /* __BMI__ */ - -extern __inline unsigned short - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __tzcnt_u16(unsigned short __X) { - return __builtin_ia32_tzcnt_u16(__X); +#endif +extern __inline unsigned short __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__tzcnt_u16 (unsigned short __X) +{ + return __builtin_ia32_tzcnt_u16 (__X); } - -extern __inline unsigned int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __andn_u32(unsigned int __X, unsigned int __Y) { +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__andn_u32 (unsigned int __X, unsigned int __Y) +{ return ~__X & __Y; } - -extern __inline unsigned int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __bextr_u32(unsigned int __X, unsigned int __Y) { - return __builtin_ia32_bextr_u32(__X, __Y); +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__bextr_u32 (unsigned int __X, unsigned int __Y) +{ + return __builtin_ia32_bextr_u32 (__X, __Y); } - -extern __inline unsigned int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _bextr_u32(unsigned int __X, unsigned int __Y, unsigned __Z) { - return __builtin_ia32_bextr_u32(__X, ((__Y & 0xff) | ((__Z & 0xff) << 8))); +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_bextr_u32 (unsigned int __X, unsigned int __Y, unsigned __Z) +{ + return __builtin_ia32_bextr_u32 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8))); } - -extern __inline unsigned int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __blsi_u32(unsigned int __X) { +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__blsi_u32 (unsigned int __X) +{ return __X & -__X; } - -extern __inline unsigned int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _blsi_u32(unsigned int __X) { - return __blsi_u32(__X); +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, 
__artificial__)) +_blsi_u32 (unsigned int __X) +{ + return __blsi_u32 (__X); } - -extern __inline unsigned int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __blsmsk_u32(unsigned int __X) { +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__blsmsk_u32 (unsigned int __X) +{ return __X ^ (__X - 1); } - -extern __inline unsigned int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _blsmsk_u32(unsigned int __X) { - return __blsmsk_u32(__X); +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_blsmsk_u32 (unsigned int __X) +{ + return __blsmsk_u32 (__X); } - -extern __inline unsigned int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __blsr_u32(unsigned int __X) { +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__blsr_u32 (unsigned int __X) +{ return __X & (__X - 1); } - -extern __inline unsigned int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _blsr_u32(unsigned int __X) { - return __blsr_u32(__X); +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_blsr_u32 (unsigned int __X) +{ + return __blsr_u32 (__X); } - -extern __inline unsigned int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __tzcnt_u32(unsigned int __X) { - return __builtin_ia32_tzcnt_u32(__X); +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__tzcnt_u32 (unsigned int __X) +{ + return __builtin_ia32_tzcnt_u32 (__X); } - -extern __inline unsigned int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _tzcnt_u32(unsigned int __X) { - return __builtin_ia32_tzcnt_u32(__X); +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_tzcnt_u32 (unsigned int __X) +{ + return __builtin_ia32_tzcnt_u32 (__X); } - #ifdef __x86_64__ -extern __inline unsigned long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __andn_u64(unsigned long long __X, unsigned long long __Y) { +extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__andn_u64 (unsigned long long __X, unsigned long long __Y) +{ return ~__X & __Y; } - -extern __inline unsigned long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __bextr_u64(unsigned long long __X, unsigned long long __Y) { - return __builtin_ia32_bextr_u64(__X, __Y); +extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__bextr_u64 (unsigned long long __X, unsigned long long __Y) +{ + return __builtin_ia32_bextr_u64 (__X, __Y); } - -extern __inline unsigned long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _bextr_u64(unsigned long long __X, unsigned int __Y, unsigned int __Z) { - return __builtin_ia32_bextr_u64(__X, ((__Y & 0xff) | ((__Z & 0xff) << 8))); +extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_bextr_u64 (unsigned long long __X, unsigned int __Y, unsigned int __Z) +{ + return __builtin_ia32_bextr_u64 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8))); } - -extern __inline unsigned long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __blsi_u64(unsigned long long __X) { +extern __inline unsigned long long __attribute__((__gnu_inline__, 
__always_inline__, __artificial__)) +__blsi_u64 (unsigned long long __X) +{ return __X & -__X; } - -extern __inline unsigned long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _blsi_u64(unsigned long long __X) { - return __blsi_u64(__X); +extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_blsi_u64 (unsigned long long __X) +{ + return __blsi_u64 (__X); } - -extern __inline unsigned long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __blsmsk_u64(unsigned long long __X) { +extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__blsmsk_u64 (unsigned long long __X) +{ return __X ^ (__X - 1); } - -extern __inline unsigned long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _blsmsk_u64(unsigned long long __X) { - return __blsmsk_u64(__X); +extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_blsmsk_u64 (unsigned long long __X) +{ + return __blsmsk_u64 (__X); } - -extern __inline unsigned long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __blsr_u64(unsigned long long __X) { +extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__blsr_u64 (unsigned long long __X) +{ return __X & (__X - 1); } - -extern __inline unsigned long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _blsr_u64(unsigned long long __X) { - return __blsr_u64(__X); +extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_blsr_u64 (unsigned long long __X) +{ + return __blsr_u64 (__X); } - -extern __inline unsigned long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __tzcnt_u64(unsigned long long __X) { - return __builtin_ia32_tzcnt_u64(__X); +extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__tzcnt_u64 (unsigned long long __X) +{ + return __builtin_ia32_tzcnt_u64 (__X); } - -extern __inline unsigned long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _tzcnt_u64(unsigned long long __X) { - return __builtin_ia32_tzcnt_u64(__X); +extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_tzcnt_u64 (unsigned long long __X) +{ + return __builtin_ia32_tzcnt_u64 (__X); } - -#endif /* __x86_64__ */ - +#endif #ifdef __DISABLE_BMI__ #undef __DISABLE_BMI__ #pragma GCC pop_options -#endif /* __DISABLE_BMI__ */ - -#endif /* _BMIINTRIN_H_INCLUDED */ +#endif +#endif +#endif diff --git a/third_party/intel/cetintrin.internal.h b/third_party/intel/cetintrin.internal.h index 63617f28e..6121ecad0 100644 --- a/third_party/intel/cetintrin.internal.h +++ b/third_party/intel/cetintrin.internal.h @@ -1,73 +1,95 @@ -#if !defined _IMMINTRIN_H_INCLUDED -#error "Never use directly; include instead." +/* clang-format off */ +#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0) +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use directly; include instead." 
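/* Usage sketch for the shadow-stack intrinsics this hunk imports; the
   name probe_shadow_stack is hypothetical and not part of the patch.
   Callers typically need -mshstk, since the always_inline definitions
   below carry a target("shstk") attribute; on parts where CET shadow
   stacks are inactive, rdssp executes as a NOP, so the zero-initialized
   result conventionally reads back as 0. A standalone build would pull
   these in via <immintrin.h>. */
#include <immintrin.h>

static int probe_shadow_stack(void)
{
  unsigned long long ssp = _get_ssp(); /* current shadow-stack pointer   */
  if (ssp == 0)
    return 0;                          /* CET shadow stacks not enabled  */
  _inc_ssp(1);                         /* advance past one shadow entry  */
  return 1;
}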
#endif - #ifndef _CETINTRIN_H_INCLUDED #define _CETINTRIN_H_INCLUDED - #ifndef __SHSTK__ #pragma GCC push_options -#pragma GCC target("shstk") +#pragma GCC target ("shstk") #define __DISABLE_SHSTK__ -#endif /* __SHSTK__ */ - +#endif #ifdef __x86_64__ -__funline unsigned long long _get_ssp(void) { - return __builtin_ia32_rdsspq(); +extern __inline unsigned long long +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_get_ssp (void) +{ + return __builtin_ia32_rdsspq (); } #else -__funline unsigned int _get_ssp(void) { - return __builtin_ia32_rdsspd(); +extern __inline unsigned int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_get_ssp (void) +{ + return __builtin_ia32_rdsspd (); } #endif - -__funline void _inc_ssp(unsigned int __B) { +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_inc_ssp (unsigned int __B) +{ #ifdef __x86_64__ - __builtin_ia32_incsspq((unsigned long long)__B); + __builtin_ia32_incsspq ((unsigned long long) __B); #else - __builtin_ia32_incsspd(__B); + __builtin_ia32_incsspd (__B); #endif } - -__funline void _saveprevssp(void) { - __builtin_ia32_saveprevssp(); +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_saveprevssp (void) +{ + __builtin_ia32_saveprevssp (); } - -__funline void _rstorssp(void *__B) { - __builtin_ia32_rstorssp(__B); +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_rstorssp (void *__B) +{ + __builtin_ia32_rstorssp (__B); } - -__funline void _wrssd(unsigned int __B, void *__C) { - __builtin_ia32_wrssd(__B, __C); +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_wrssd (unsigned int __B, void *__C) +{ + __builtin_ia32_wrssd (__B, __C); } - #ifdef __x86_64__ -__funline void _wrssq(unsigned long long __B, void *__C) { - __builtin_ia32_wrssq(__B, __C); +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_wrssq (unsigned long long __B, void *__C) +{ + __builtin_ia32_wrssq (__B, __C); } #endif - -__funline void _wrussd(unsigned int __B, void *__C) { - __builtin_ia32_wrussd(__B, __C); +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_wrussd (unsigned int __B, void *__C) +{ + __builtin_ia32_wrussd (__B, __C); } - #ifdef __x86_64__ -__funline void _wrussq(unsigned long long __B, void *__C) { - __builtin_ia32_wrussq(__B, __C); +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_wrussq (unsigned long long __B, void *__C) +{ + __builtin_ia32_wrussq (__B, __C); } #endif - -__funline void _setssbsy(void) { - __builtin_ia32_setssbsy(); +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_setssbsy (void) +{ + __builtin_ia32_setssbsy (); } - -__funline void _clrssbsy(void *__B) { - __builtin_ia32_clrssbsy(__B); +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_clrssbsy (void *__B) +{ + __builtin_ia32_clrssbsy (__B); } - #ifdef __DISABLE_SHSTK__ #undef __DISABLE_SHSTK__ #pragma GCC pop_options -#endif /* __DISABLE_SHSTK__ */ - -#endif /* _CETINTRIN_H_INCLUDED. 
*/ +#endif +#endif +#endif diff --git a/third_party/intel/cldemoteintrin.internal.h b/third_party/intel/cldemoteintrin.internal.h index ee9d1eefb..304c8f226 100644 --- a/third_party/intel/cldemoteintrin.internal.h +++ b/third_party/intel/cldemoteintrin.internal.h @@ -1,21 +1,24 @@ -#if !defined _IMMINTRIN_H_INCLUDED -#error "Never use directly; include instead." +/* clang-format off */ +#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0) +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use directly; include instead." #endif - #ifndef _CLDEMOTE_H_INCLUDED #define _CLDEMOTE_H_INCLUDED - #ifndef __CLDEMOTE__ #pragma GCC push_options #pragma GCC target("cldemote") #define __DISABLE_CLDEMOTE__ -#endif /* __CLDEMOTE__ */ -__funline void _cldemote(void *__A) { - __builtin_ia32_cldemote(__A); +#endif +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_cldemote (void *__A) +{ + __builtin_ia32_cldemote (__A); } #ifdef __DISABLE_CLDEMOTE__ #undef __DISABLE_CLDEMOTE__ #pragma GCC pop_options -#endif /* __DISABLE_CLDEMOTE__ */ - -#endif /* _CLDEMOTE_H_INCLUDED */ +#endif +#endif +#endif diff --git a/third_party/intel/clflushoptintrin.internal.h b/third_party/intel/clflushoptintrin.internal.h index cd974e3b4..4d27e0c3d 100644 --- a/third_party/intel/clflushoptintrin.internal.h +++ b/third_party/intel/clflushoptintrin.internal.h @@ -1,23 +1,24 @@ -#if !defined _IMMINTRIN_H_INCLUDED -#error "Never use directly; include instead." +/* clang-format off */ +#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0) +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use directly; include instead." #endif - #ifndef _CLFLUSHOPTINTRIN_H_INCLUDED #define _CLFLUSHOPTINTRIN_H_INCLUDED - #ifndef __CLFLUSHOPT__ #pragma GCC push_options #pragma GCC target("clflushopt") #define __DISABLE_CLFLUSHOPT__ -#endif /* __CLFLUSHOPT__ */ - -__funline void _mm_clflushopt(void *__A) { - __builtin_ia32_clflushopt(__A); +#endif +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_clflushopt (void *__A) +{ + __builtin_ia32_clflushopt (__A); } - #ifdef __DISABLE_CLFLUSHOPT__ #undef __DISABLE_CLFLUSHOPT__ #pragma GCC pop_options -#endif /* __DISABLE_CLFLUSHOPT__ */ - -#endif /* _CLFLUSHOPTINTRIN_H_INCLUDED */ +#endif +#endif +#endif diff --git a/third_party/intel/clwbintrin.internal.h b/third_party/intel/clwbintrin.internal.h index 8f6f9d7ed..270c60aa9 100644 --- a/third_party/intel/clwbintrin.internal.h +++ b/third_party/intel/clwbintrin.internal.h @@ -1,23 +1,24 @@ -#if !defined _IMMINTRIN_H_INCLUDED -#error "Never use directly; include instead." +/* clang-format off */ +#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0) +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use directly; include instead." 
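/* Usage sketch pairing this hunk's _mm_clwb with the clflushopt and
   cldemote hunks above: dirty a cache line, push it toward memory, then
   fence so later stores are ordered after the writeback. persist_word
   is a hypothetical name; with GCC/Clang the caller typically needs
   -mclwb because the always_inline definition below carries a
   target("clwb") attribute. */
#include <x86intrin.h>

static void persist_word(unsigned long long *p)
{
  *p += 1;       /* dirty the cache line                       */
  _mm_clwb(p);   /* write it back; the line may remain cached  */
  _mm_sfence();  /* order subsequent stores after the flush    */
}
/* _mm_clflushopt(p) is the invalidating variant: it also evicts the
   line, which costs a refill if the data is re-read soon after. */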
#endif - #ifndef _CLWBINTRIN_H_INCLUDED #define _CLWBINTRIN_H_INCLUDED - #ifndef __CLWB__ #pragma GCC push_options #pragma GCC target("clwb") #define __DISABLE_CLWB__ -#endif /* __CLWB__ */ - -__funline void _mm_clwb(void *__A) { - __builtin_ia32_clwb(__A); +#endif +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_clwb (void *__A) +{ + __builtin_ia32_clwb (__A); } - #ifdef __DISABLE_CLWB__ #undef __DISABLE_CLWB__ #pragma GCC pop_options -#endif /* __DISABLE_CLWB__ */ - -#endif /* _CLWBINTRIN_H_INCLUDED */ +#endif +#endif +#endif diff --git a/third_party/intel/clzerointrin.internal.h b/third_party/intel/clzerointrin.internal.h index c9261ed09..8402f80ef 100644 --- a/third_party/intel/clzerointrin.internal.h +++ b/third_party/intel/clzerointrin.internal.h @@ -1,21 +1,20 @@ +/* clang-format off */ +#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0) #ifndef _CLZEROINTRIN_H_INCLUDED #define _CLZEROINTRIN_H_INCLUDED -#ifdef __x86_64__ - #ifndef __CLZERO__ #pragma GCC push_options #pragma GCC target("clzero") #define __DISABLE_CLZERO__ -#endif /* __CLZERO__ */ - -__funline void _mm_clzero(void* __I) { - __builtin_ia32_clzero(__I); +#endif +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_clzero (void * __I) +{ + __builtin_ia32_clzero (__I); } - #ifdef __DISABLE_CLZERO__ #undef __DISABLE_CLZERO__ #pragma GCC pop_options -#endif /* __DISABLE_CLZERO__ */ - -#endif /* __x86_64__ */ -#endif /* _CLZEROINTRIN_H_INCLUDED */ +#endif +#endif +#endif diff --git a/third_party/intel/cpuid.internal.h b/third_party/intel/cpuid.internal.h index 9fca4eac7..1f675485d 100644 --- a/third_party/intel/cpuid.internal.h +++ b/third_party/intel/cpuid.internal.h @@ -1,237 +1,220 @@ -#ifndef COSMOPOLITAN_THIRD_PARTY_INTEL_CPUID_INTERNAL_H_ -#define COSMOPOLITAN_THIRD_PARTY_INTEL_CPUID_INTERNAL_H_ -#ifdef __x86_64__ -#if !(__ASSEMBLER__ + __LINKER__ + 0) - -#define bit_SSE3 (1 << 0) -#define bit_PCLMUL (1 << 1) -#define bit_LZCNT (1 << 5) -#define bit_SSSE3 (1 << 9) -#define bit_FMA (1 << 12) +/* clang-format off */ +#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0) +#ifndef _CPUID_H_INCLUDED +#define _CPUID_H_INCLUDED +#define bit_AVXVNNI (1 << 4) +#define bit_AVX512BF16 (1 << 5) +#define bit_HRESET (1 << 22) +#define bit_SSE3 (1 << 0) +#define bit_PCLMUL (1 << 1) +#define bit_LZCNT (1 << 5) +#define bit_SSSE3 (1 << 9) +#define bit_FMA (1 << 12) #define bit_CMPXCHG16B (1 << 13) -#define bit_SSE4_1 (1 << 19) -#define bit_SSE4_2 (1 << 20) -#define bit_MOVBE (1 << 22) -#define bit_POPCNT (1 << 23) -#define bit_AES (1 << 25) -#define bit_XSAVE (1 << 26) -#define bit_OSXSAVE (1 << 27) -#define bit_AVX (1 << 28) -#define bit_F16C (1 << 29) -#define bit_RDRND (1 << 30) - +#define bit_SSE4_1 (1 << 19) +#define bit_SSE4_2 (1 << 20) +#define bit_MOVBE (1 << 22) +#define bit_POPCNT (1 << 23) +#define bit_AES (1 << 25) +#define bit_XSAVE (1 << 26) +#define bit_OSXSAVE (1 << 27) +#define bit_AVX (1 << 28) +#define bit_F16C (1 << 29) +#define bit_RDRND (1 << 30) #define bit_CMPXCHG8B (1 << 8) -#define bit_CMOV (1 << 15) -#define bit_MMX (1 << 23) -#define bit_FXSAVE (1 << 24) -#define bit_SSE (1 << 25) -#define bit_SSE2 (1 << 26) - +#define bit_CMOV (1 << 15) +#define bit_MMX (1 << 23) +#define bit_FXSAVE (1 << 24) +#define bit_SSE (1 << 25) +#define bit_SSE2 (1 << 26) #define bit_LAHF_LM (1 << 0) -#define bit_ABM (1 << 5) -#define bit_SSE4a (1 << 6) -#define bit_PRFCHW (1 << 8) -#define bit_XOP (1 << 11) -#define bit_LWP (1 << 
15) -#define bit_FMA4 (1 << 16) -#define bit_TBM (1 << 21) -#define bit_MWAITX (1 << 29) - +#define bit_ABM (1 << 5) +#define bit_SSE4a (1 << 6) +#define bit_PRFCHW (1 << 8) +#define bit_XOP (1 << 11) +#define bit_LWP (1 << 15) +#define bit_FMA4 (1 << 16) +#define bit_TBM (1 << 21) +#define bit_MWAITX (1 << 29) #define bit_MMXEXT (1 << 22) -#define bit_LM (1 << 29) +#define bit_LM (1 << 29) #define bit_3DNOWP (1 << 30) -#define bit_3DNOW (1u << 31) - -#define bit_CLZERO (1 << 0) +#define bit_3DNOW (1u << 31) +#define bit_CLZERO (1 << 0) #define bit_WBNOINVD (1 << 9) - -#define bit_FSGSBASE (1 << 0) -#define bit_SGX (1 << 2) -#define bit_BMI (1 << 3) -#define bit_HLE (1 << 4) -#define bit_AVX2 (1 << 5) -#define bit_BMI2 (1 << 8) -#define bit_RTM (1 << 11) -#define bit_MPX (1 << 14) -#define bit_AVX512F (1 << 16) -#define bit_AVX512DQ (1 << 17) -#define bit_RDSEED (1 << 18) -#define bit_ADX (1 << 19) +#define bit_FSGSBASE (1 << 0) +#define bit_SGX (1 << 2) +#define bit_BMI (1 << 3) +#define bit_HLE (1 << 4) +#define bit_AVX2 (1 << 5) +#define bit_BMI2 (1 << 8) +#define bit_RTM (1 << 11) +#define bit_MPX (1 << 14) +#define bit_AVX512F (1 << 16) +#define bit_AVX512DQ (1 << 17) +#define bit_RDSEED (1 << 18) +#define bit_ADX (1 << 19) #define bit_AVX512IFMA (1 << 21) #define bit_CLFLUSHOPT (1 << 23) -#define bit_CLWB (1 << 24) -#define bit_AVX512PF (1 << 26) -#define bit_AVX512ER (1 << 27) -#define bit_AVX512CD (1 << 28) -#define bit_SHA (1 << 29) -#define bit_AVX512BW (1 << 30) -#define bit_AVX512VL (1u << 31) - -#define bit_PREFETCHWT1 (1 << 0) -#define bit_AVX512VBMI (1 << 1) -#define bit_PKU (1 << 3) -#define bit_OSPKE (1 << 4) -#define bit_WAITPKG (1 << 5) -#define bit_AVX512VBMI2 (1 << 6) -#define bit_SHSTK (1 << 7) -#define bit_GFNI (1 << 8) -#define bit_VAES (1 << 9) -#define bit_AVX512VNNI (1 << 11) -#define bit_VPCLMULQDQ (1 << 10) -#define bit_AVX512BITALG (1 << 12) +#define bit_CLWB (1 << 24) +#define bit_AVX512PF (1 << 26) +#define bit_AVX512ER (1 << 27) +#define bit_AVX512CD (1 << 28) +#define bit_SHA (1 << 29) +#define bit_AVX512BW (1 << 30) +#define bit_AVX512VL (1u << 31) +#define bit_PREFETCHWT1 (1 << 0) +#define bit_AVX512VBMI (1 << 1) +#define bit_PKU (1 << 3) +#define bit_OSPKE (1 << 4) +#define bit_WAITPKG (1 << 5) +#define bit_AVX512VBMI2 (1 << 6) +#define bit_SHSTK (1 << 7) +#define bit_GFNI (1 << 8) +#define bit_VAES (1 << 9) +#define bit_AVX512VNNI (1 << 11) +#define bit_VPCLMULQDQ (1 << 10) +#define bit_AVX512BITALG (1 << 12) #define bit_AVX512VPOPCNTDQ (1 << 14) -#define bit_RDPID (1 << 22) -#define bit_MOVDIRI (1 << 27) -#define bit_MOVDIR64B (1 << 28) -#define bit_CLDEMOTE (1 << 25) - +#define bit_RDPID (1 << 22) +#define bit_MOVDIRI (1 << 27) +#define bit_MOVDIR64B (1 << 28) +#define bit_ENQCMD (1 << 29) +#define bit_CLDEMOTE (1 << 25) +#define bit_KL (1 << 23) #define bit_AVX5124VNNIW (1 << 2) #define bit_AVX5124FMAPS (1 << 3) -#define bit_IBT (1 << 20) -#define bit_PCONFIG (1 << 18) - +#define bit_AVX512VP2INTERSECT (1 << 8) +#define bit_IBT (1 << 20) +#define bit_UINTR (1 << 5) +#define bit_PCONFIG (1 << 18) +#define bit_SERIALIZE (1 << 14) +#define bit_TSXLDTRK (1 << 16) +#define bit_AMX_BF16 (1 << 22) +#define bit_AMX_TILE (1 << 24) +#define bit_AMX_INT8 (1 << 25) #define bit_BNDREGS (1 << 3) -#define bit_BNDCSR (1 << 4) - +#define bit_BNDCSR (1 << 4) #define bit_XSAVEOPT (1 << 0) -#define bit_XSAVEC (1 << 1) -#define bit_XSAVES (1 << 3) - +#define bit_XSAVEC (1 << 1) +#define bit_XSAVES (1 << 3) #define bit_PTWRITE (1 << 4) - +#define bit_AESKLE ( 1<<0 ) 
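/* Usage sketch showing how these bit_* masks pair with the
   __get_cpuid_count helper defined further down in this header: CLWB,
   for instance, is reported in EBX of leaf 7, subleaf 0. has_clwb is a
   hypothetical name; a standalone build would #include <cpuid.h>. */
static int has_clwb(void)
{
  unsigned int eax, ebx, ecx, edx;
  if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
    return 0;                   /* leaf 7 not supported      */
  return (ebx & bit_CLWB) != 0; /* CPUID.(7,0):EBX bit 24    */
}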
+#define bit_WIDEKL ( 1<<2 ) #define signature_AMD_ebx 0x68747541 #define signature_AMD_ecx 0x444d4163 #define signature_AMD_edx 0x69746e65 - #define signature_CENTAUR_ebx 0x746e6543 #define signature_CENTAUR_ecx 0x736c7561 #define signature_CENTAUR_edx 0x48727561 - #define signature_CYRIX_ebx 0x69727943 #define signature_CYRIX_ecx 0x64616574 #define signature_CYRIX_edx 0x736e4978 - #define signature_INTEL_ebx 0x756e6547 #define signature_INTEL_ecx 0x6c65746e #define signature_INTEL_edx 0x49656e69 - #define signature_TM1_ebx 0x6e617254 #define signature_TM1_ecx 0x55504361 #define signature_TM1_edx 0x74656d73 - #define signature_TM2_ebx 0x756e6547 #define signature_TM2_ecx 0x3638784d #define signature_TM2_edx 0x54656e69 - #define signature_NSC_ebx 0x646f6547 #define signature_NSC_ecx 0x43534e20 #define signature_NSC_edx 0x79622065 - #define signature_NEXGEN_ebx 0x4778654e #define signature_NEXGEN_ecx 0x6e657669 #define signature_NEXGEN_edx 0x72446e65 - #define signature_RISE_ebx 0x65736952 #define signature_RISE_ecx 0x65736952 #define signature_RISE_edx 0x65736952 - #define signature_SIS_ebx 0x20536953 #define signature_SIS_ecx 0x20536953 #define signature_SIS_edx 0x20536953 - #define signature_UMC_ebx 0x20434d55 #define signature_UMC_ecx 0x20434d55 #define signature_UMC_edx 0x20434d55 - #define signature_VIA_ebx 0x20414956 #define signature_VIA_ecx 0x20414956 #define signature_VIA_edx 0x20414956 - #define signature_VORTEX_ebx 0x74726f56 #define signature_VORTEX_ecx 0x436f5320 #define signature_VORTEX_edx 0x36387865 - #ifndef __x86_64__ - -#define __cpuid(level, a, b, c, d) \ - do { \ - if (__builtin_constant_p(level) && (level) != 1) \ - __asm__("cpuid\n\t" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "0"(level)); \ - else \ - __asm__("cpuid\n\t" \ - : "=a"(a), "=b"(b), "=c"(c), "=d"(d) \ - : "0"(level), "1"(0), "2"(0)); \ - } while (0) +#define __cpuid(level, a, b, c, d) do { if (__builtin_constant_p (level) && (level) != 1) __asm__ __volatile__ ("cpuid\n\t" : "=a" (a), "=b" (b), "=c" (c), "=d" (d) : "0" (level)); else __asm__ __volatile__ ("cpuid\n\t" : "=a" (a), "=b" (b), "=c" (c), "=d" (d) : "0" (level), "1" (0), "2" (0)); } while (0) #else -#define __cpuid(level, a, b, c, d) \ - __asm__("cpuid\n\t" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "0"(level)) +#define __cpuid(level, a, b, c, d) __asm__ __volatile__ ("cpuid\n\t" : "=a" (a), "=b" (b), "=c" (c), "=d" (d) : "0" (level)) #endif - -#define __cpuid_count(level, count, a, b, c, d) \ - __asm__("cpuid\n\t" \ - : "=a"(a), "=b"(b), "=c"(c), "=d"(d) \ - : "0"(level), "2"(count)) - -static __inline unsigned int __get_cpuid_max(unsigned int __ext, - unsigned int *__sig) { +#define __cpuid_count(level, count, a, b, c, d) __asm__ __volatile__ ("cpuid\n\t" : "=a" (a), "=b" (b), "=c" (c), "=d" (d) : "0" (level), "2" (count)) +static __inline unsigned int +__get_cpuid_max (unsigned int __ext, unsigned int *__sig) +{ unsigned int __eax, __ebx, __ecx, __edx; #ifndef __x86_64__ #if __GNUC__ >= 3 - __asm__("pushf{l|d}\n\t" - "pushf{l|d}\n\t" - "pop{l}\t%0\n\t" - "mov{l}\t{%0, %1|%1, %0}\n\t" - "xor{l}\t{%2, %0|%0, %2}\n\t" - "push{l}\t%0\n\t" - "popf{l|d}\n\t" - "pushf{l|d}\n\t" - "pop{l}\t%0\n\t" - "popf{l|d}\n\t" - : "=&r"(__eax), "=&r"(__ebx) - : "i"(0x00200000)); + __asm__ ("pushf{l|d}\n\t" + "pushf{l|d}\n\t" + "pop{l}\t%0\n\t" + "mov{l}\t{%0, %1|%1, %0}\n\t" + "xor{l}\t{%2, %0|%0, %2}\n\t" + "push{l}\t%0\n\t" + "popf{l|d}\n\t" + "pushf{l|d}\n\t" + "pop{l}\t%0\n\t" + "popf{l|d}\n\t" + : "=&r" (__eax), "=&r" (__ebx) + : "i" (0x00200000)); #else - 
__asm__("pushfl\n\t" - "pushfl\n\t" - "popl\t%0\n\t" - "movl\t%0, %1\n\t" - "xorl\t%2, %0\n\t" - "pushl\t%0\n\t" - "popfl\n\t" - "pushfl\n\t" - "popl\t%0\n\t" - "popfl\n\t" - : "=&r"(__eax), "=&r"(__ebx) - : "i"(0x00200000)); + __asm__ ("pushfl\n\t" + "pushfl\n\t" + "popl\t%0\n\t" + "movl\t%0, %1\n\t" + "xorl\t%2, %0\n\t" + "pushl\t%0\n\t" + "popfl\n\t" + "pushfl\n\t" + "popl\t%0\n\t" + "popfl\n\t" + : "=&r" (__eax), "=&r" (__ebx) + : "i" (0x00200000)); #endif - if (!((__eax ^ __ebx) & 0x00200000)) return 0; + if (!((__eax ^ __ebx) & 0x00200000)) + return 0; #endif - __cpuid(__ext, __eax, __ebx, __ecx, __edx); - if (__sig) *__sig = __ebx; + __cpuid (__ext, __eax, __ebx, __ecx, __edx); + if (__sig) + *__sig = __ebx; return __eax; } - -static __inline int __get_cpuid(unsigned int __leaf, unsigned int *__eax, - unsigned int *__ebx, unsigned int *__ecx, - unsigned int *__edx) { +static __inline int +__get_cpuid (unsigned int __leaf, + unsigned int *__eax, unsigned int *__ebx, + unsigned int *__ecx, unsigned int *__edx) +{ unsigned int __ext = __leaf & 0x80000000; - unsigned int __maxlevel = __get_cpuid_max(__ext, 0); - if (__maxlevel == 0 || __maxlevel < __leaf) return 0; - __cpuid(__leaf, *__eax, *__ebx, *__ecx, *__edx); + unsigned int __maxlevel = __get_cpuid_max (__ext, 0); + if (__maxlevel == 0 || __maxlevel < __leaf) + return 0; + __cpuid (__leaf, *__eax, *__ebx, *__ecx, *__edx); return 1; } - -static __inline int __get_cpuid_count(unsigned int __leaf, - unsigned int __subleaf, - unsigned int *__eax, unsigned int *__ebx, - unsigned int *__ecx, - unsigned int *__edx) { +static __inline int +__get_cpuid_count (unsigned int __leaf, unsigned int __subleaf, + unsigned int *__eax, unsigned int *__ebx, + unsigned int *__ecx, unsigned int *__edx) +{ unsigned int __ext = __leaf & 0x80000000; - unsigned int __maxlevel = __get_cpuid_max(__ext, 0); - if (__maxlevel == 0 || __maxlevel < __leaf) return 0; - __cpuid_count(__leaf, __subleaf, *__eax, *__ebx, *__ecx, *__edx); + unsigned int __maxlevel = __get_cpuid_max (__ext, 0); + if (__maxlevel == 0 || __maxlevel < __leaf) + return 0; + __cpuid_count (__leaf, __subleaf, *__eax, *__ebx, *__ecx, *__edx); return 1; } - -#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */ -#endif /* __x86_64__ */ -#endif /* COSMOPOLITAN_THIRD_PARTY_INTEL_CPUID_INTERNAL_H_ */ +static __inline void +__cpuidex (int __cpuid_info[4], int __leaf, int __subleaf) +{ + __cpuid_count (__leaf, __subleaf, __cpuid_info[0], __cpuid_info[1], + __cpuid_info[2], __cpuid_info[3]); +} +#endif +#endif diff --git a/third_party/intel/emmintrin.internal.h b/third_party/intel/emmintrin.internal.h index 712dbfb41..c70096736 100644 --- a/third_party/intel/emmintrin.internal.h +++ b/third_party/intel/emmintrin.internal.h @@ -1,1038 +1,1275 @@ +/* clang-format off */ +#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0) #ifndef _EMMINTRIN_H_INCLUDED #define _EMMINTRIN_H_INCLUDED -#ifdef __x86_64__ #include "third_party/intel/xmmintrin.internal.h" - #ifndef __SSE2__ #pragma GCC push_options #pragma GCC target("sse2") #define __DISABLE_SSE2__ -#endif /* __SSE2__ */ - -typedef double __v2df __attribute__((__vector_size__(16))); -typedef long long __v2di __attribute__((__vector_size__(16))); -typedef unsigned long long __v2du __attribute__((__vector_size__(16))); -typedef int __v4si __attribute__((__vector_size__(16))); -typedef unsigned int __v4su __attribute__((__vector_size__(16))); -typedef short __v8hi __attribute__((__vector_size__(16))); -typedef unsigned short __v8hu 
__attribute__((__vector_size__(16))); -typedef char __v16qi __attribute__((__vector_size__(16))); -typedef signed char __v16qs __attribute__((__vector_size__(16))); -typedef unsigned char __v16qu __attribute__((__vector_size__(16))); - -typedef long long __m128i __attribute__((__vector_size__(16), __may_alias__)); -typedef double __m128d __attribute__((__vector_size__(16), __may_alias__)); - -typedef long long __m128i_u - __attribute__((__vector_size__(16), __may_alias__, __aligned__(1))); -typedef double __m128d_u - __attribute__((__vector_size__(16), __may_alias__, __aligned__(1))); - -#define _MM_SHUFFLE2(fp1, fp0) (((fp1) << 1) | (fp0)) - -__funline __m128d _mm_set_sd(double __F) { - return __extension__(__m128d){__F, 0.0}; +#endif +typedef double __v2df __attribute__ ((__vector_size__ (16))); +typedef long long __v2di __attribute__ ((__vector_size__ (16))); +typedef unsigned long long __v2du __attribute__ ((__vector_size__ (16))); +typedef int __v4si __attribute__ ((__vector_size__ (16))); +typedef unsigned int __v4su __attribute__ ((__vector_size__ (16))); +typedef short __v8hi __attribute__ ((__vector_size__ (16))); +typedef unsigned short __v8hu __attribute__ ((__vector_size__ (16))); +typedef char __v16qi __attribute__ ((__vector_size__ (16))); +typedef signed char __v16qs __attribute__ ((__vector_size__ (16))); +typedef unsigned char __v16qu __attribute__ ((__vector_size__ (16))); +typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__)); +typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__)); +typedef long long __m128i_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1))); +typedef double __m128d_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1))); +#define _MM_SHUFFLE2(fp1,fp0) (((fp1) << 1) | (fp0)) +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_sd (double __F) +{ + return __extension__ (__m128d){ __F, 0.0 }; } - -__funline __m128d _mm_set1_pd(double __F) { - return __extension__(__m128d){__F, __F}; +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set1_pd (double __F) +{ + return __extension__ (__m128d){ __F, __F }; } - -__funline __m128d _mm_set_pd1(double __F) { - return _mm_set1_pd(__F); +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_pd1 (double __F) +{ + return _mm_set1_pd (__F); } - -__funline __m128d _mm_set_pd(double __W, double __X) { - return __extension__(__m128d){__X, __W}; +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_pd (double __W, double __X) +{ + return __extension__ (__m128d){ __X, __W }; } - -__funline __m128d _mm_setr_pd(double __W, double __X) { - return __extension__(__m128d){__W, __X}; +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_setr_pd (double __W, double __X) +{ + return __extension__ (__m128d){ __W, __X }; } - -__funline __m128d _mm_undefined_pd(void) { +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_undefined_pd (void) +{ __m128d __Y = __Y; return __Y; } - -__funline __m128d _mm_setzero_pd(void) { - return __extension__(__m128d){0.0, 0.0}; +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_setzero_pd (void) +{ + return __extension__ (__m128d){ 0.0, 0.0 }; } - -__funline __m128d _mm_move_sd(__m128d __A, __m128d __B) 
{ - return __extension__(__m128d) - __builtin_shuffle((__v2df)__A, (__v2df)__B, (__v2di){2, 1}); +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_move_sd (__m128d __A, __m128d __B) +{ + return __extension__ (__m128d) __builtin_shuffle ((__v2df)__A, (__v2df)__B, (__v2di){2, 1}); } - -__funline __m128d _mm_load_pd(double const *__P) { +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_load_pd (double const *__P) +{ return *(__m128d *)__P; } - -__funline __m128d _mm_loadu_pd(double const *__P) { +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadu_pd (double const *__P) +{ return *(__m128d_u *)__P; } - -__funline __m128d _mm_load1_pd(double const *__P) { - return _mm_set1_pd(*__P); +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_load1_pd (double const *__P) +{ + return _mm_set1_pd (*__P); } - -__funline __m128d _mm_load_sd(double const *__P) { - return _mm_set_sd(*__P); +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_load_sd (double const *__P) +{ + return _mm_set_sd (*__P); } - -__funline __m128d _mm_load_pd1(double const *__P) { - return _mm_load1_pd(__P); +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_load_pd1 (double const *__P) +{ + return _mm_load1_pd (__P); } - -__funline __m128d _mm_loadr_pd(double const *__P) { - __m128d __tmp = _mm_load_pd(__P); - return __builtin_ia32_shufpd(__tmp, __tmp, _MM_SHUFFLE2(0, 1)); +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadr_pd (double const *__P) +{ + __m128d __tmp = _mm_load_pd (__P); + return __builtin_ia32_shufpd (__tmp, __tmp, _MM_SHUFFLE2 (0,1)); } - -__funline void _mm_store_pd(double *__P, __m128d __A) { +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_store_pd (double *__P, __m128d __A) +{ *(__m128d *)__P = __A; } - -__funline void _mm_storeu_pd(double *__P, __m128d __A) { +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storeu_pd (double *__P, __m128d __A) +{ *(__m128d_u *)__P = __A; } - -__funline void _mm_store_sd(double *__P, __m128d __A) { +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_store_sd (double *__P, __m128d __A) +{ *__P = ((__v2df)__A)[0]; } - -__funline double _mm_cvtsd_f64(__m128d __A) { +extern __inline double __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsd_f64 (__m128d __A) +{ return ((__v2df)__A)[0]; } - -__funline void _mm_storel_pd(double *__P, __m128d __A) { - _mm_store_sd(__P, __A); +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storel_pd (double *__P, __m128d __A) +{ + _mm_store_sd (__P, __A); } - -__funline void _mm_storeh_pd(double *__P, __m128d __A) { +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storeh_pd (double *__P, __m128d __A) +{ *__P = ((__v2df)__A)[1]; } - -__funline void _mm_store1_pd(double *__P, __m128d __A) { - _mm_store_pd(__P, __builtin_ia32_shufpd(__A, __A, _MM_SHUFFLE2(0, 0))); +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_store1_pd (double *__P, __m128d __A) +{ + _mm_store_pd (__P, __builtin_ia32_shufpd (__A, __A, _MM_SHUFFLE2 (0,0))); } - -__funline 
void _mm_store_pd1(double *__P, __m128d __A) { - _mm_store1_pd(__P, __A); +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_store_pd1 (double *__P, __m128d __A) +{ + _mm_store1_pd (__P, __A); } - -__funline void _mm_storer_pd(double *__P, __m128d __A) { - _mm_store_pd(__P, __builtin_ia32_shufpd(__A, __A, _MM_SHUFFLE2(0, 1))); +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storer_pd (double *__P, __m128d __A) +{ + _mm_store_pd (__P, __builtin_ia32_shufpd (__A, __A, _MM_SHUFFLE2 (0,1))); } - -__funline int _mm_cvtsi128_si32(__m128i __A) { - return __builtin_ia32_vec_ext_v4si((__v4si)__A, 0); +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi128_si32 (__m128i __A) +{ + return __builtin_ia32_vec_ext_v4si ((__v4si)__A, 0); } - #ifdef __x86_64__ - -__funline long long _mm_cvtsi128_si64(__m128i __A) { +extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi128_si64 (__m128i __A) +{ return ((__v2di)__A)[0]; } - -__funline long long _mm_cvtsi128_si64x(__m128i __A) { +extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi128_si64x (__m128i __A) +{ return ((__v2di)__A)[0]; } #endif - -__funline __m128d _mm_add_pd(__m128d __A, __m128d __B) { - return (__m128d)((__v2df)__A + (__v2df)__B); -} - -__funline __m128d _mm_add_sd(__m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_addsd((__v2df)__A, (__v2df)__B); -} - -__funline __m128d _mm_sub_pd(__m128d __A, __m128d __B) { - return (__m128d)((__v2df)__A - (__v2df)__B); -} - -__funline __m128d _mm_sub_sd(__m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_subsd((__v2df)__A, (__v2df)__B); -} - -__funline __m128d _mm_mul_pd(__m128d __A, __m128d __B) { - return (__m128d)((__v2df)__A * (__v2df)__B); -} - -__funline __m128d _mm_mul_sd(__m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_mulsd((__v2df)__A, (__v2df)__B); -} - -__funline __m128d _mm_div_pd(__m128d __A, __m128d __B) { - return (__m128d)((__v2df)__A / (__v2df)__B); -} - -__funline __m128d _mm_div_sd(__m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_divsd((__v2df)__A, (__v2df)__B); -} - -__funline __m128d _mm_sqrt_pd(__m128d __A) { - return (__m128d)__builtin_ia32_sqrtpd((__v2df)__A); -} - -__funline __m128d _mm_sqrt_sd(__m128d __A, __m128d __B) { - __v2df __tmp = __builtin_ia32_movsd((__v2df)__A, (__v2df)__B); - return (__m128d)__builtin_ia32_sqrtsd((__v2df)__tmp); -} - -__funline __m128d _mm_min_pd(__m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_minpd((__v2df)__A, (__v2df)__B); -} - -__funline __m128d _mm_min_sd(__m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_minsd((__v2df)__A, (__v2df)__B); -} - -__funline __m128d _mm_max_pd(__m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_maxpd((__v2df)__A, (__v2df)__B); -} - -__funline __m128d _mm_max_sd(__m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_maxsd((__v2df)__A, (__v2df)__B); -} - -__funline __m128d _mm_and_pd(__m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_andpd((__v2df)__A, (__v2df)__B); -} - -__funline __m128d _mm_andnot_pd(__m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_andnpd((__v2df)__A, (__v2df)__B); -} - -__funline __m128d _mm_or_pd(__m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_orpd((__v2df)__A, (__v2df)__B); -} - -__funline __m128d _mm_xor_pd(__m128d __A, __m128d __B) { - return 
(__m128d)__builtin_ia32_xorpd((__v2df)__A, (__v2df)__B); -} - -__funline __m128d _mm_cmpeq_pd(__m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__A, (__v2df)__B); -} - -__funline __m128d _mm_cmplt_pd(__m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_cmpltpd((__v2df)__A, (__v2df)__B); -} - -__funline __m128d _mm_cmple_pd(__m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_cmplepd((__v2df)__A, (__v2df)__B); -} - -__funline __m128d _mm_cmpgt_pd(__m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_cmpgtpd((__v2df)__A, (__v2df)__B); -} - -__funline __m128d _mm_cmpge_pd(__m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_cmpgepd((__v2df)__A, (__v2df)__B); -} - -__funline __m128d _mm_cmpneq_pd(__m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__A, (__v2df)__B); -} - -__funline __m128d _mm_cmpnlt_pd(__m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__A, (__v2df)__B); -} - -__funline __m128d _mm_cmpnle_pd(__m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__A, (__v2df)__B); -} - -__funline __m128d _mm_cmpngt_pd(__m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_cmpngtpd((__v2df)__A, (__v2df)__B); -} - -__funline __m128d _mm_cmpnge_pd(__m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_cmpngepd((__v2df)__A, (__v2df)__B); -} - -__funline __m128d _mm_cmpord_pd(__m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_cmpordpd((__v2df)__A, (__v2df)__B); -} - -__funline __m128d _mm_cmpunord_pd(__m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__A, (__v2df)__B); -} - -__funline __m128d _mm_cmpeq_sd(__m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__A, (__v2df)__B); -} - -__funline __m128d _mm_cmplt_sd(__m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_cmpltsd((__v2df)__A, (__v2df)__B); -} - -__funline __m128d _mm_cmple_sd(__m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_cmplesd((__v2df)__A, (__v2df)__B); -} - -__funline __m128d _mm_cmpgt_sd(__m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_movsd( - (__v2df)__A, (__v2df)__builtin_ia32_cmpltsd((__v2df)__B, (__v2df)__A)); -} - -__funline __m128d _mm_cmpge_sd(__m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_movsd( - (__v2df)__A, (__v2df)__builtin_ia32_cmplesd((__v2df)__B, (__v2df)__A)); -} - -__funline __m128d _mm_cmpneq_sd(__m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__A, (__v2df)__B); -} - -__funline __m128d _mm_cmpnlt_sd(__m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__A, (__v2df)__B); -} - -__funline __m128d _mm_cmpnle_sd(__m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__A, (__v2df)__B); -} - -__funline __m128d _mm_cmpngt_sd(__m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_movsd( - (__v2df)__A, (__v2df)__builtin_ia32_cmpnltsd((__v2df)__B, (__v2df)__A)); -} - -__funline __m128d _mm_cmpnge_sd(__m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_movsd( - (__v2df)__A, (__v2df)__builtin_ia32_cmpnlesd((__v2df)__B, (__v2df)__A)); -} - -__funline __m128d _mm_cmpord_sd(__m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_cmpordsd((__v2df)__A, (__v2df)__B); -} - -__funline __m128d _mm_cmpunord_sd(__m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__A, (__v2df)__B); -} - -__funline int _mm_comieq_sd(__m128d __A, __m128d __B) { - return 
__builtin_ia32_comisdeq((__v2df)__A, (__v2df)__B); -} - -__funline int _mm_comilt_sd(__m128d __A, __m128d __B) { - return __builtin_ia32_comisdlt((__v2df)__A, (__v2df)__B); -} - -__funline int _mm_comile_sd(__m128d __A, __m128d __B) { - return __builtin_ia32_comisdle((__v2df)__A, (__v2df)__B); -} - -__funline int _mm_comigt_sd(__m128d __A, __m128d __B) { - return __builtin_ia32_comisdgt((__v2df)__A, (__v2df)__B); -} - -__funline int _mm_comige_sd(__m128d __A, __m128d __B) { - return __builtin_ia32_comisdge((__v2df)__A, (__v2df)__B); -} - -__funline int _mm_comineq_sd(__m128d __A, __m128d __B) { - return __builtin_ia32_comisdneq((__v2df)__A, (__v2df)__B); -} - -__funline int _mm_ucomieq_sd(__m128d __A, __m128d __B) { - return __builtin_ia32_ucomisdeq((__v2df)__A, (__v2df)__B); -} - -__funline int _mm_ucomilt_sd(__m128d __A, __m128d __B) { - return __builtin_ia32_ucomisdlt((__v2df)__A, (__v2df)__B); -} - -__funline int _mm_ucomile_sd(__m128d __A, __m128d __B) { - return __builtin_ia32_ucomisdle((__v2df)__A, (__v2df)__B); -} - -__funline int _mm_ucomigt_sd(__m128d __A, __m128d __B) { - return __builtin_ia32_ucomisdgt((__v2df)__A, (__v2df)__B); -} - -__funline int _mm_ucomige_sd(__m128d __A, __m128d __B) { - return __builtin_ia32_ucomisdge((__v2df)__A, (__v2df)__B); -} - -__funline int _mm_ucomineq_sd(__m128d __A, __m128d __B) { - return __builtin_ia32_ucomisdneq((__v2df)__A, (__v2df)__B); -} - -__funline __m128i _mm_set_epi64x(long long __q1, long long __q0) { - return __extension__(__m128i)(__v2di){__q0, __q1}; -} - -__funline __m128i _mm_set_epi64(__m64 __q1, __m64 __q0) { - return _mm_set_epi64x((long long)__q1, (long long)__q0); -} - -__funline __m128i _mm_set_epi32(int __q3, int __q2, int __q1, int __q0) { - return __extension__(__m128i)(__v4si){__q0, __q1, __q2, __q3}; -} - -__funline __m128i _mm_set_epi16(short __q7, short __q6, short __q5, short __q4, - short __q3, short __q2, short __q1, short __q0) { - return __extension__(__m128i)(__v8hi){__q0, __q1, __q2, __q3, - __q4, __q5, __q6, __q7}; -} - -__funline __m128i _mm_set_epi8(char __q15, char __q14, char __q13, char __q12, - char __q11, char __q10, char __q09, char __q08, - char __q07, char __q06, char __q05, char __q04, - char __q03, char __q02, char __q01, char __q00) { - return __extension__(__m128i)(__v16qi){ - __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07, - __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15}; -} - -__funline __m128i _mm_set1_epi64x(long long __A) { - return _mm_set_epi64x(__A, __A); -} - -__funline __m128i _mm_set1_epi64(__m64 __A) { - return _mm_set_epi64(__A, __A); -} - -__funline __m128i _mm_set1_epi32(int __A) { - return _mm_set_epi32(__A, __A, __A, __A); -} - -__funline __m128i _mm_set1_epi16(short __A) { - return _mm_set_epi16(__A, __A, __A, __A, __A, __A, __A, __A); -} - -__funline __m128i _mm_set1_epi8(char __A) { - return _mm_set_epi8(__A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, - __A, __A, __A, __A, __A); -} - -__funline __m128i _mm_setr_epi64(__m64 __q0, __m64 __q1) { - return _mm_set_epi64(__q1, __q0); -} - -__funline __m128i _mm_setr_epi32(int __q0, int __q1, int __q2, int __q3) { - return _mm_set_epi32(__q3, __q2, __q1, __q0); -} - -__funline __m128i _mm_setr_epi16(short __q0, short __q1, short __q2, short __q3, - short __q4, short __q5, short __q6, short __q7) { - return _mm_set_epi16(__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0); -} - -__funline __m128i _mm_setr_epi8(char __q00, char __q01, char __q02, char __q03, - char __q04, char __q05, char __q06, char __q07, - char 
__q08, char __q09, char __q10, char __q11, - char __q12, char __q13, char __q14, char __q15) { - return _mm_set_epi8(__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08, - __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00); -} - -__funline __m128i _mm_load_si128(__m128i const *__P) { +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_pd (__m128d __A, __m128d __B) +{ + return (__m128d) ((__v2df)__A + (__v2df)__B); +} +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_sd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_addsd ((__v2df)__A, (__v2df)__B); +} +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_pd (__m128d __A, __m128d __B) +{ + return (__m128d) ((__v2df)__A - (__v2df)__B); +} +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_sd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_subsd ((__v2df)__A, (__v2df)__B); +} +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mul_pd (__m128d __A, __m128d __B) +{ + return (__m128d) ((__v2df)__A * (__v2df)__B); +} +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mul_sd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_mulsd ((__v2df)__A, (__v2df)__B); +} +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_div_pd (__m128d __A, __m128d __B) +{ + return (__m128d) ((__v2df)__A / (__v2df)__B); +} +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_div_sd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_divsd ((__v2df)__A, (__v2df)__B); +} +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sqrt_pd (__m128d __A) +{ + return (__m128d)__builtin_ia32_sqrtpd ((__v2df)__A); +} +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sqrt_sd (__m128d __A, __m128d __B) +{ + __v2df __tmp = __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B); + return (__m128d)__builtin_ia32_sqrtsd ((__v2df)__tmp); +} +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_minpd ((__v2df)__A, (__v2df)__B); +} +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_sd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_minsd ((__v2df)__A, (__v2df)__B); +} +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_maxpd ((__v2df)__A, (__v2df)__B); +} +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_sd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_maxsd ((__v2df)__A, (__v2df)__B); +} +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_and_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_andpd ((__v2df)__A, (__v2df)__B); +} +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_andnot_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_andnpd ((__v2df)__A, (__v2df)__B); +} +extern __inline __m128d 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_or_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_orpd ((__v2df)__A, (__v2df)__B); +} +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_xor_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_xorpd ((__v2df)__A, (__v2df)__B); +} +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpeqpd ((__v2df)__A, (__v2df)__B); +} +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmplt_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpltpd ((__v2df)__A, (__v2df)__B); +} +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmple_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmplepd ((__v2df)__A, (__v2df)__B); +} +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpgtpd ((__v2df)__A, (__v2df)__B); +} +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpge_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpgepd ((__v2df)__A, (__v2df)__B); +} +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpneq_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpneqpd ((__v2df)__A, (__v2df)__B); +} +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpnlt_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpnltpd ((__v2df)__A, (__v2df)__B); +} +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpnle_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpnlepd ((__v2df)__A, (__v2df)__B); +} +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpngt_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpngtpd ((__v2df)__A, (__v2df)__B); +} +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpnge_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpngepd ((__v2df)__A, (__v2df)__B); +} +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpord_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpordpd ((__v2df)__A, (__v2df)__B); +} +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpunord_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpunordpd ((__v2df)__A, (__v2df)__B); +} +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_sd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpeqsd ((__v2df)__A, (__v2df)__B); +} +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmplt_sd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpltsd ((__v2df)__A, (__v2df)__B); +} +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmple_sd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmplesd ((__v2df)__A, (__v2df)__B); +} +extern __inline __m128d 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_sd (__m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_movsd ((__v2df) __A, + (__v2df) + __builtin_ia32_cmpltsd ((__v2df) __B, + (__v2df) + __A)); +} +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpge_sd (__m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_movsd ((__v2df) __A, + (__v2df) + __builtin_ia32_cmplesd ((__v2df) __B, + (__v2df) + __A)); +} +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpneq_sd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpneqsd ((__v2df)__A, (__v2df)__B); +} +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpnlt_sd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpnltsd ((__v2df)__A, (__v2df)__B); +} +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpnle_sd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpnlesd ((__v2df)__A, (__v2df)__B); +} +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpngt_sd (__m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_movsd ((__v2df) __A, + (__v2df) + __builtin_ia32_cmpnltsd ((__v2df) __B, + (__v2df) + __A)); +} +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpnge_sd (__m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_movsd ((__v2df) __A, + (__v2df) + __builtin_ia32_cmpnlesd ((__v2df) __B, + (__v2df) + __A)); +} +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpord_sd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpordsd ((__v2df)__A, (__v2df)__B); +} +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpunord_sd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpunordsd ((__v2df)__A, (__v2df)__B); +} +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comieq_sd (__m128d __A, __m128d __B) +{ + return __builtin_ia32_comisdeq ((__v2df)__A, (__v2df)__B); +} +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comilt_sd (__m128d __A, __m128d __B) +{ + return __builtin_ia32_comisdlt ((__v2df)__A, (__v2df)__B); +} +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comile_sd (__m128d __A, __m128d __B) +{ + return __builtin_ia32_comisdle ((__v2df)__A, (__v2df)__B); +} +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comigt_sd (__m128d __A, __m128d __B) +{ + return __builtin_ia32_comisdgt ((__v2df)__A, (__v2df)__B); +} +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comige_sd (__m128d __A, __m128d __B) +{ + return __builtin_ia32_comisdge ((__v2df)__A, (__v2df)__B); +} +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comineq_sd (__m128d __A, __m128d __B) +{ + return __builtin_ia32_comisdneq ((__v2df)__A, (__v2df)__B); +} +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_ucomieq_sd (__m128d __A, __m128d __B) +{ + return __builtin_ia32_ucomisdeq ((__v2df)__A, (__v2df)__B); +} +extern __inline int __attribute__((__gnu_inline__, __always_inline__, 
__artificial__)) +_mm_ucomilt_sd (__m128d __A, __m128d __B) +{ + return __builtin_ia32_ucomisdlt ((__v2df)__A, (__v2df)__B); +} +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_ucomile_sd (__m128d __A, __m128d __B) +{ + return __builtin_ia32_ucomisdle ((__v2df)__A, (__v2df)__B); +} +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_ucomigt_sd (__m128d __A, __m128d __B) +{ + return __builtin_ia32_ucomisdgt ((__v2df)__A, (__v2df)__B); +} +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_ucomige_sd (__m128d __A, __m128d __B) +{ + return __builtin_ia32_ucomisdge ((__v2df)__A, (__v2df)__B); +} +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_ucomineq_sd (__m128d __A, __m128d __B) +{ + return __builtin_ia32_ucomisdneq ((__v2df)__A, (__v2df)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_epi64x (long long __q1, long long __q0) +{ + return __extension__ (__m128i)(__v2di){ __q0, __q1 }; +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_epi64 (__m64 __q1, __m64 __q0) +{ + return _mm_set_epi64x ((long long)__q1, (long long)__q0); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_epi32 (int __q3, int __q2, int __q1, int __q0) +{ + return __extension__ (__m128i)(__v4si){ __q0, __q1, __q2, __q3 }; +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_epi16 (short __q7, short __q6, short __q5, short __q4, + short __q3, short __q2, short __q1, short __q0) +{ + return __extension__ (__m128i)(__v8hi){ + __q0, __q1, __q2, __q3, __q4, __q5, __q6, __q7 }; +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_epi8 (char __q15, char __q14, char __q13, char __q12, + char __q11, char __q10, char __q09, char __q08, + char __q07, char __q06, char __q05, char __q04, + char __q03, char __q02, char __q01, char __q00) +{ + return __extension__ (__m128i)(__v16qi){ + __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07, + __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15 + }; +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set1_epi64x (long long __A) +{ + return _mm_set_epi64x (__A, __A); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set1_epi64 (__m64 __A) +{ + return _mm_set_epi64 (__A, __A); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set1_epi32 (int __A) +{ + return _mm_set_epi32 (__A, __A, __A, __A); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set1_epi16 (short __A) +{ + return _mm_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set1_epi8 (char __A) +{ + return _mm_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A, + __A, __A, __A, __A, __A, __A, __A, __A); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_setr_epi64 (__m64 __q0, __m64 __q1) +{ + return _mm_set_epi64 (__q1, __q0); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_setr_epi32 (int __q0, int __q1, 
int __q2, int __q3) +{ + return _mm_set_epi32 (__q3, __q2, __q1, __q0); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_setr_epi16 (short __q0, short __q1, short __q2, short __q3, + short __q4, short __q5, short __q6, short __q7) +{ + return _mm_set_epi16 (__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_setr_epi8 (char __q00, char __q01, char __q02, char __q03, + char __q04, char __q05, char __q06, char __q07, + char __q08, char __q09, char __q10, char __q11, + char __q12, char __q13, char __q14, char __q15) +{ + return _mm_set_epi8 (__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08, + __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_load_si128 (__m128i const *__P) +{ return *__P; } - -__funline __m128i _mm_loadu_si128(__m128i_u const *__P) { +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadu_si128 (__m128i_u const *__P) +{ return *__P; } - -__funline __m128i _mm_loadl_epi64(__m128i_u const *__P) { - return _mm_set_epi64((__m64)0LL, *(__m64_u *)__P); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadl_epi64 (__m128i_u const *__P) +{ + return _mm_set_epi64 ((__m64)0LL, *(__m64_u *)__P); } - -__funline __m128i _mm_loadu_si64(void const *__P) { - return _mm_loadl_epi64((__m128i_u *)__P); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadu_si64 (void const *__P) +{ + return _mm_loadl_epi64 ((__m128i_u *)__P); } - -__funline void _mm_store_si128(__m128i *__P, __m128i __B) { +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadu_si32 (void const *__P) +{ + /* load the 32 bits into the low element, zeroing the rest (PR99754) */ + return _mm_set_epi32 (0, 0, 0, (*(__m32_u *)__P)[0]); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadu_si16 (void const *__P) +{ + /* load the 16 bits into the low element, zeroing the rest (PR99754) */ + return _mm_set_epi16 (0, 0, 0, 0, 0, 0, 0, (*(__m16_u *)__P)[0]); +} +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_store_si128 (__m128i *__P, __m128i __B) +{ *__P = __B; } - -__funline void _mm_storeu_si128(__m128i_u *__P, __m128i __B) { +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storeu_si128 (__m128i_u *__P, __m128i __B) +{ *__P = __B; } - -__funline void _mm_storel_epi64(__m128i_u *__P, __m128i __B) { - *(__m64_u *)__P = (__m64)((__v2di)__B)[0]; +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storel_epi64 (__m128i_u *__P, __m128i __B) +{ + *(__m64_u *)__P = (__m64) ((__v2di)__B)[0]; } - -__funline void _mm_storeu_si64(void *__P, __m128i __B) { - _mm_storel_epi64((__m128i_u *)__P, __B); +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storeu_si64 (void *__P, __m128i __B) +{ + _mm_storel_epi64 ((__m128i_u *)__P, __B); } - -__funline __m64 _mm_movepi64_pi64(__m128i __B) { - return (__m64)((__v2di)__B)[0]; +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storeu_si32 (void *__P, __m128i __B) +{ + *(__m32_u *)__P = (__m32) ((__v4si)__B)[0]; } - -__funline __m128i _mm_movpi64_epi64(__m64 __A) { - return
_mm_set_epi64((__m64)0LL, __A); +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storeu_si16 (void *__P, __m128i __B) +{ + *(__m16_u *)__P = (__m16) ((__v8hi)__B)[0]; } - -__funline __m128i _mm_move_epi64(__m128i __A) { - return (__m128i)__builtin_ia32_movq128((__v2di)__A); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_movepi64_pi64 (__m128i __B) +{ + return (__m64) ((__v2di)__B)[0]; } - -__funline __m128i _mm_undefined_si128(void) { +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_movpi64_epi64 (__m64 __A) +{ + return _mm_set_epi64 ((__m64)0LL, __A); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_move_epi64 (__m128i __A) +{ + return (__m128i)__builtin_ia32_movq128 ((__v2di) __A); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_undefined_si128 (void) +{ __m128i __Y = __Y; return __Y; } - -__funline __m128i _mm_setzero_si128(void) { - return __extension__(__m128i)(__v4si){0, 0, 0, 0}; +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_setzero_si128 (void) +{ + return __extension__ (__m128i)(__v4si){ 0, 0, 0, 0 }; } - -__funline __m128d _mm_cvtepi32_pd(__m128i __A) { - return (__m128d)__builtin_ia32_cvtdq2pd((__v4si)__A); +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi32_pd (__m128i __A) +{ + return (__m128d)__builtin_ia32_cvtdq2pd ((__v4si) __A); } - -__funline __m128 _mm_cvtepi32_ps(__m128i __A) { - return (__m128)__builtin_ia32_cvtdq2ps((__v4si)__A); +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi32_ps (__m128i __A) +{ + return (__m128)__builtin_ia32_cvtdq2ps ((__v4si) __A); } - -__funline __m128i _mm_cvtpd_epi32(__m128d __A) { - return (__m128i)__builtin_ia32_cvtpd2dq((__v2df)__A); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtpd_epi32 (__m128d __A) +{ + return (__m128i)__builtin_ia32_cvtpd2dq ((__v2df) __A); } - -__funline __m64 _mm_cvtpd_pi32(__m128d __A) { - return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__A); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtpd_pi32 (__m128d __A) +{ + return (__m64)__builtin_ia32_cvtpd2pi ((__v2df) __A); } - -__funline __m128 _mm_cvtpd_ps(__m128d __A) { - return (__m128)__builtin_ia32_cvtpd2ps((__v2df)__A); +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtpd_ps (__m128d __A) +{ + return (__m128)__builtin_ia32_cvtpd2ps ((__v2df) __A); } - -__funline __m128i _mm_cvttpd_epi32(__m128d __A) { - return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__A); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttpd_epi32 (__m128d __A) +{ + return (__m128i)__builtin_ia32_cvttpd2dq ((__v2df) __A); } - -__funline __m64 _mm_cvttpd_pi32(__m128d __A) { - return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__A); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttpd_pi32 (__m128d __A) +{ + return (__m64)__builtin_ia32_cvttpd2pi ((__v2df) __A); } - -__funline __m128d _mm_cvtpi32_pd(__m64 __A) { - return (__m128d)__builtin_ia32_cvtpi2pd((__v2si)__A); +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm_cvtpi32_pd (__m64 __A) +{ + return (__m128d)__builtin_ia32_cvtpi2pd ((__v2si) __A); } - -__funline __m128i _mm_cvtps_epi32(__m128 __A) { - return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__A); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtps_epi32 (__m128 __A) +{ + return (__m128i)__builtin_ia32_cvtps2dq ((__v4sf) __A); } - -__funline __m128i _mm_cvttps_epi32(__m128 __A) { - return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)__A); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttps_epi32 (__m128 __A) +{ + return (__m128i)__builtin_ia32_cvttps2dq ((__v4sf) __A); } - -__funline __m128d _mm_cvtps_pd(__m128 __A) { - return (__m128d)__builtin_ia32_cvtps2pd((__v4sf)__A); +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtps_pd (__m128 __A) +{ + return (__m128d)__builtin_ia32_cvtps2pd ((__v4sf) __A); } - -__funline int _mm_cvtsd_si32(__m128d __A) { - return __builtin_ia32_cvtsd2si((__v2df)__A); +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsd_si32 (__m128d __A) +{ + return __builtin_ia32_cvtsd2si ((__v2df) __A); } - #ifdef __x86_64__ - -__funline long long _mm_cvtsd_si64(__m128d __A) { - return __builtin_ia32_cvtsd2si64((__v2df)__A); +extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsd_si64 (__m128d __A) +{ + return __builtin_ia32_cvtsd2si64 ((__v2df) __A); } - -__funline long long _mm_cvtsd_si64x(__m128d __A) { - return __builtin_ia32_cvtsd2si64((__v2df)__A); +extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsd_si64x (__m128d __A) +{ + return __builtin_ia32_cvtsd2si64 ((__v2df) __A); } #endif - -__funline int _mm_cvttsd_si32(__m128d __A) { - return __builtin_ia32_cvttsd2si((__v2df)__A); +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttsd_si32 (__m128d __A) +{ + return __builtin_ia32_cvttsd2si ((__v2df) __A); } - #ifdef __x86_64__ - -__funline long long _mm_cvttsd_si64(__m128d __A) { - return __builtin_ia32_cvttsd2si64((__v2df)__A); +extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttsd_si64 (__m128d __A) +{ + return __builtin_ia32_cvttsd2si64 ((__v2df) __A); } - -__funline long long _mm_cvttsd_si64x(__m128d __A) { - return __builtin_ia32_cvttsd2si64((__v2df)__A); +extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttsd_si64x (__m128d __A) +{ + return __builtin_ia32_cvttsd2si64 ((__v2df) __A); } #endif - -__funline __m128 _mm_cvtsd_ss(__m128 __A, __m128d __B) { - return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)__A, (__v2df)__B); +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsd_ss (__m128 __A, __m128d __B) +{ + return (__m128)__builtin_ia32_cvtsd2ss ((__v4sf) __A, (__v2df) __B); } - -__funline __m128d _mm_cvtsi32_sd(__m128d __A, int __B) { - return (__m128d)__builtin_ia32_cvtsi2sd((__v2df)__A, __B); +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi32_sd (__m128d __A, int __B) +{ + return (__m128d)__builtin_ia32_cvtsi2sd ((__v2df) __A, __B); } - #ifdef __x86_64__ - -__funline __m128d _mm_cvtsi64_sd(__m128d __A, long long __B) { - return (__m128d)__builtin_ia32_cvtsi642sd((__v2df)__A, __B); +extern __inline __m128d __attribute__((__gnu_inline__, 
__always_inline__, __artificial__)) +_mm_cvtsi64_sd (__m128d __A, long long __B) +{ + return (__m128d)__builtin_ia32_cvtsi642sd ((__v2df) __A, __B); } - -__funline __m128d _mm_cvtsi64x_sd(__m128d __A, long long __B) { - return (__m128d)__builtin_ia32_cvtsi642sd((__v2df)__A, __B); +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi64x_sd (__m128d __A, long long __B) +{ + return (__m128d)__builtin_ia32_cvtsi642sd ((__v2df) __A, __B); } #endif - -__funline __m128d _mm_cvtss_sd(__m128d __A, __m128 __B) { - return (__m128d)__builtin_ia32_cvtss2sd((__v2df)__A, (__v4sf)__B); +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtss_sd (__m128d __A, __m128 __B) +{ + return (__m128d)__builtin_ia32_cvtss2sd ((__v2df) __A, (__v4sf)__B); } - #ifdef __OPTIMIZE__ -__funline __m128d _mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask) { - return (__m128d)__builtin_ia32_shufpd((__v2df)__A, (__v2df)__B, __mask); +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask) +{ + return (__m128d)__builtin_ia32_shufpd ((__v2df)__A, (__v2df)__B, __mask); } #else -#define _mm_shuffle_pd(A, B, N) \ - ((__m128d)__builtin_ia32_shufpd((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \ - (int)(N))) +#define _mm_shuffle_pd(A, B, N) ((__m128d)__builtin_ia32_shufpd ((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(N))) #endif - -__funline __m128d _mm_unpackhi_pd(__m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_unpckhpd((__v2df)__A, (__v2df)__B); +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpackhi_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_unpckhpd ((__v2df)__A, (__v2df)__B); } - -__funline __m128d _mm_unpacklo_pd(__m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_unpcklpd((__v2df)__A, (__v2df)__B); +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpacklo_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_unpcklpd ((__v2df)__A, (__v2df)__B); } - -__funline __m128d _mm_loadh_pd(__m128d __A, double const *__B) { - return (__m128d)__builtin_ia32_loadhpd((__v2df)__A, __B); +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadh_pd (__m128d __A, double const *__B) +{ + return (__m128d)__builtin_ia32_loadhpd ((__v2df)__A, __B); } - -__funline __m128d _mm_loadl_pd(__m128d __A, double const *__B) { - return (__m128d)__builtin_ia32_loadlpd((__v2df)__A, __B); +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadl_pd (__m128d __A, double const *__B) +{ + return (__m128d)__builtin_ia32_loadlpd ((__v2df)__A, __B); } - -__funline int _mm_movemask_pd(__m128d __A) { - return __builtin_ia32_movmskpd((__v2df)__A); +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_movemask_pd (__m128d __A) +{ + return __builtin_ia32_movmskpd ((__v2df)__A); } - -__funline __m128i _mm_packs_epi16(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_packsswb128((__v8hi)__A, (__v8hi)__B); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_packs_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_packsswb128 ((__v8hi)__A, (__v8hi)__B); } - -__funline __m128i _mm_packs_epi32(__m128i __A, __m128i __B) { - return 
(__m128i)__builtin_ia32_packssdw128((__v4si)__A, (__v4si)__B); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_packs_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_packssdw128 ((__v4si)__A, (__v4si)__B); } - -__funline __m128i _mm_packus_epi16(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_packuswb128((__v8hi)__A, (__v8hi)__B); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_packus_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_packuswb128 ((__v8hi)__A, (__v8hi)__B); } - -__funline __m128i _mm_unpackhi_epi8(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_punpckhbw128((__v16qi)__A, (__v16qi)__B); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpackhi_epi8 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_punpckhbw128 ((__v16qi)__A, (__v16qi)__B); } - -__funline __m128i _mm_unpackhi_epi16(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_punpckhwd128((__v8hi)__A, (__v8hi)__B); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpackhi_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_punpckhwd128 ((__v8hi)__A, (__v8hi)__B); } - -__funline __m128i _mm_unpackhi_epi32(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_punpckhdq128((__v4si)__A, (__v4si)__B); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpackhi_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_punpckhdq128 ((__v4si)__A, (__v4si)__B); } - -__funline __m128i _mm_unpackhi_epi64(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_punpckhqdq128((__v2di)__A, (__v2di)__B); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpackhi_epi64 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_punpckhqdq128 ((__v2di)__A, (__v2di)__B); } - -__funline __m128i _mm_unpacklo_epi8(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_punpcklbw128((__v16qi)__A, (__v16qi)__B); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpacklo_epi8 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_punpcklbw128 ((__v16qi)__A, (__v16qi)__B); } - -__funline __m128i _mm_unpacklo_epi16(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_punpcklwd128((__v8hi)__A, (__v8hi)__B); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpacklo_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_punpcklwd128 ((__v8hi)__A, (__v8hi)__B); } - -__funline __m128i _mm_unpacklo_epi32(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_punpckldq128((__v4si)__A, (__v4si)__B); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpacklo_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_punpckldq128 ((__v4si)__A, (__v4si)__B); } - -__funline __m128i _mm_unpacklo_epi64(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_punpcklqdq128((__v2di)__A, (__v2di)__B); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpacklo_epi64 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_punpcklqdq128 ((__v2di)__A, (__v2di)__B); } - -__funline __m128i _mm_add_epi8(__m128i 
__A, __m128i __B) { - return (__m128i)((__v16qu)__A + (__v16qu)__B); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_epi8 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v16qu)__A + (__v16qu)__B); } - -__funline __m128i _mm_add_epi16(__m128i __A, __m128i __B) { - return (__m128i)((__v8hu)__A + (__v8hu)__B); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v8hu)__A + (__v8hu)__B); } - -__funline __m128i _mm_add_epi32(__m128i __A, __m128i __B) { - return (__m128i)((__v4su)__A + (__v4su)__B); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v4su)__A + (__v4su)__B); } - -__funline __m128i _mm_add_epi64(__m128i __A, __m128i __B) { - return (__m128i)((__v2du)__A + (__v2du)__B); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_epi64 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v2du)__A + (__v2du)__B); } - -__funline __m128i _mm_adds_epi8(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_paddsb128((__v16qi)__A, (__v16qi)__B); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_adds_epi8 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_paddsb128 ((__v16qi)__A, (__v16qi)__B); } - -__funline __m128i _mm_adds_epi16(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_paddsw128((__v8hi)__A, (__v8hi)__B); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_adds_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_paddsw128 ((__v8hi)__A, (__v8hi)__B); } - -__funline __m128i _mm_adds_epu8(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_paddusb128((__v16qi)__A, (__v16qi)__B); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_adds_epu8 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_paddusb128 ((__v16qi)__A, (__v16qi)__B); } - -__funline __m128i _mm_adds_epu16(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_paddusw128((__v8hi)__A, (__v8hi)__B); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_adds_epu16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_paddusw128 ((__v8hi)__A, (__v8hi)__B); } - -__funline __m128i _mm_sub_epi8(__m128i __A, __m128i __B) { - return (__m128i)((__v16qu)__A - (__v16qu)__B); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_epi8 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v16qu)__A - (__v16qu)__B); } - -__funline __m128i _mm_sub_epi16(__m128i __A, __m128i __B) { - return (__m128i)((__v8hu)__A - (__v8hu)__B); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v8hu)__A - (__v8hu)__B); } - -__funline __m128i _mm_sub_epi32(__m128i __A, __m128i __B) { - return (__m128i)((__v4su)__A - (__v4su)__B); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v4su)__A - (__v4su)__B); } - -__funline __m128i _mm_sub_epi64(__m128i __A, __m128i __B) { - return (__m128i)((__v2du)__A - (__v2du)__B); +extern 
__inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_epi64 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v2du)__A - (__v2du)__B); } - -__funline __m128i _mm_subs_epi8(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_psubsb128((__v16qi)__A, (__v16qi)__B); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_subs_epi8 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_psubsb128 ((__v16qi)__A, (__v16qi)__B); } - -__funline __m128i _mm_subs_epi16(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_psubsw128((__v8hi)__A, (__v8hi)__B); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_subs_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_psubsw128 ((__v8hi)__A, (__v8hi)__B); } - -__funline __m128i _mm_subs_epu8(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_psubusb128((__v16qi)__A, (__v16qi)__B); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_subs_epu8 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_psubusb128 ((__v16qi)__A, (__v16qi)__B); } - -__funline __m128i _mm_subs_epu16(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_psubusw128((__v8hi)__A, (__v8hi)__B); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_subs_epu16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_psubusw128 ((__v8hi)__A, (__v8hi)__B); } - -__funline __m128i _mm_madd_epi16(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__A, (__v8hi)__B); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_madd_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pmaddwd128 ((__v8hi)__A, (__v8hi)__B); } - -__funline __m128i _mm_mulhi_epi16(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__A, (__v8hi)__B); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mulhi_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pmulhw128 ((__v8hi)__A, (__v8hi)__B); } - -__funline __m128i _mm_mullo_epi16(__m128i __A, __m128i __B) { - return (__m128i)((__v8hu)__A * (__v8hu)__B); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mullo_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v8hu)__A * (__v8hu)__B); } - -__funline __m64 _mm_mul_su32(__m64 __A, __m64 __B) { - return (__m64)__builtin_ia32_pmuludq((__v2si)__A, (__v2si)__B); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mul_su32 (__m64 __A, __m64 __B) +{ + return (__m64)__builtin_ia32_pmuludq ((__v2si)__A, (__v2si)__B); } - -__funline __m128i _mm_mul_epu32(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_pmuludq128((__v4si)__A, (__v4si)__B); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mul_epu32 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pmuludq128 ((__v4si)__A, (__v4si)__B); } - -__funline __m128i _mm_slli_epi16(__m128i __A, int __B) { - return (__m128i)__builtin_ia32_psllwi128((__v8hi)__A, __B); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_slli_epi16 (__m128i __A, int __B) +{ + return (__m128i)__builtin_ia32_psllwi128 ((__v8hi)__A, __B); } - 
-__funline __m128i _mm_slli_epi32(__m128i __A, int __B) { - return (__m128i)__builtin_ia32_pslldi128((__v4si)__A, __B); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_slli_epi32 (__m128i __A, int __B) +{ + return (__m128i)__builtin_ia32_pslldi128 ((__v4si)__A, __B); } - -__funline __m128i _mm_slli_epi64(__m128i __A, int __B) { - return (__m128i)__builtin_ia32_psllqi128((__v2di)__A, __B); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_slli_epi64 (__m128i __A, int __B) +{ + return (__m128i)__builtin_ia32_psllqi128 ((__v2di)__A, __B); } - -__funline __m128i _mm_srai_epi16(__m128i __A, int __B) { - return (__m128i)__builtin_ia32_psrawi128((__v8hi)__A, __B); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srai_epi16 (__m128i __A, int __B) +{ + return (__m128i)__builtin_ia32_psrawi128 ((__v8hi)__A, __B); } - -__funline __m128i _mm_srai_epi32(__m128i __A, int __B) { - return (__m128i)__builtin_ia32_psradi128((__v4si)__A, __B); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srai_epi32 (__m128i __A, int __B) +{ + return (__m128i)__builtin_ia32_psradi128 ((__v4si)__A, __B); } - #ifdef __OPTIMIZE__ -__funline __m128i _mm_bsrli_si128(__m128i __A, const int __N) { - return (__m128i)__builtin_ia32_psrldqi128(__A, __N * 8); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_bsrli_si128 (__m128i __A, const int __N) +{ + return (__m128i)__builtin_ia32_psrldqi128 (__A, __N * 8); } - -__funline __m128i _mm_bslli_si128(__m128i __A, const int __N) { - return (__m128i)__builtin_ia32_pslldqi128(__A, __N * 8); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_bslli_si128 (__m128i __A, const int __N) +{ + return (__m128i)__builtin_ia32_pslldqi128 (__A, __N * 8); } - -__funline __m128i _mm_srli_si128(__m128i __A, const int __N) { - return (__m128i)__builtin_ia32_psrldqi128(__A, __N * 8); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srli_si128 (__m128i __A, const int __N) +{ + return (__m128i)__builtin_ia32_psrldqi128 (__A, __N * 8); } - -__funline __m128i _mm_slli_si128(__m128i __A, const int __N) { - return (__m128i)__builtin_ia32_pslldqi128(__A, __N * 8); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_slli_si128 (__m128i __A, const int __N) +{ + return (__m128i)__builtin_ia32_pslldqi128 (__A, __N * 8); } #else -#define _mm_bsrli_si128(A, N) \ - ((__m128i)__builtin_ia32_psrldqi128((__m128i)(A), (int)(N)*8)) -#define _mm_bslli_si128(A, N) \ - ((__m128i)__builtin_ia32_pslldqi128((__m128i)(A), (int)(N)*8)) -#define _mm_srli_si128(A, N) \ - ((__m128i)__builtin_ia32_psrldqi128((__m128i)(A), (int)(N)*8)) -#define _mm_slli_si128(A, N) \ - ((__m128i)__builtin_ia32_pslldqi128((__m128i)(A), (int)(N)*8)) +#define _mm_bsrli_si128(A, N) ((__m128i)__builtin_ia32_psrldqi128 ((__m128i)(A), (int)(N) * 8)) +#define _mm_bslli_si128(A, N) ((__m128i)__builtin_ia32_pslldqi128 ((__m128i)(A), (int)(N) * 8)) +#define _mm_srli_si128(A, N) ((__m128i)__builtin_ia32_psrldqi128 ((__m128i)(A), (int)(N) * 8)) +#define _mm_slli_si128(A, N) ((__m128i)__builtin_ia32_pslldqi128 ((__m128i)(A), (int)(N) * 8)) #endif - -__funline __m128i _mm_srli_epi16(__m128i __A, int __B) { - return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__A, __B); +extern __inline __m128i 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srli_epi16 (__m128i __A, int __B) +{ + return (__m128i)__builtin_ia32_psrlwi128 ((__v8hi)__A, __B); } - -__funline __m128i _mm_srli_epi32(__m128i __A, int __B) { - return (__m128i)__builtin_ia32_psrldi128((__v4si)__A, __B); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srli_epi32 (__m128i __A, int __B) +{ + return (__m128i)__builtin_ia32_psrldi128 ((__v4si)__A, __B); } - -__funline __m128i _mm_srli_epi64(__m128i __A, int __B) { - return (__m128i)__builtin_ia32_psrlqi128((__v2di)__A, __B); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srli_epi64 (__m128i __A, int __B) +{ + return (__m128i)__builtin_ia32_psrlqi128 ((__v2di)__A, __B); } - -__funline __m128i _mm_sll_epi16(__m128i __A, __m128i __B) { +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sll_epi16 (__m128i __A, __m128i __B) +{ return (__m128i)__builtin_ia32_psllw128((__v8hi)__A, (__v8hi)__B); } - -__funline __m128i _mm_sll_epi32(__m128i __A, __m128i __B) { +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sll_epi32 (__m128i __A, __m128i __B) +{ return (__m128i)__builtin_ia32_pslld128((__v4si)__A, (__v4si)__B); } - -__funline __m128i _mm_sll_epi64(__m128i __A, __m128i __B) { +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sll_epi64 (__m128i __A, __m128i __B) +{ return (__m128i)__builtin_ia32_psllq128((__v2di)__A, (__v2di)__B); } - -__funline __m128i _mm_sra_epi16(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_psraw128((__v8hi)__A, (__v8hi)__B); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sra_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_psraw128 ((__v8hi)__A, (__v8hi)__B); } - -__funline __m128i _mm_sra_epi32(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_psrad128((__v4si)__A, (__v4si)__B); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sra_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_psrad128 ((__v4si)__A, (__v4si)__B); } - -__funline __m128i _mm_srl_epi16(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_psrlw128((__v8hi)__A, (__v8hi)__B); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srl_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_psrlw128 ((__v8hi)__A, (__v8hi)__B); } - -__funline __m128i _mm_srl_epi32(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_psrld128((__v4si)__A, (__v4si)__B); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srl_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_psrld128 ((__v4si)__A, (__v4si)__B); } - -__funline __m128i _mm_srl_epi64(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_psrlq128((__v2di)__A, (__v2di)__B); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srl_epi64 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_psrlq128 ((__v2di)__A, (__v2di)__B); } - -__funline __m128i _mm_and_si128(__m128i __A, __m128i __B) { - return (__m128i)((__v2du)__A & (__v2du)__B); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_and_si128 (__m128i 
__A, __m128i __B) +{ + return (__m128i) ((__v2du)__A & (__v2du)__B); } - -__funline __m128i _mm_andnot_si128(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_pandn128((__v2di)__A, (__v2di)__B); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_andnot_si128 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pandn128 ((__v2di)__A, (__v2di)__B); } - -__funline __m128i _mm_or_si128(__m128i __A, __m128i __B) { - return (__m128i)((__v2du)__A | (__v2du)__B); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_or_si128 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v2du)__A | (__v2du)__B); } - -__funline __m128i _mm_xor_si128(__m128i __A, __m128i __B) { - return (__m128i)((__v2du)__A ^ (__v2du)__B); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_xor_si128 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v2du)__A ^ (__v2du)__B); } - -__funline __m128i _mm_cmpeq_epi8(__m128i __A, __m128i __B) { - return (__m128i)((__v16qs)__A == (__v16qs)__B); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_epi8 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v16qi)__A == (__v16qi)__B); } - -__funline __m128i _mm_cmpeq_epi16(__m128i __A, __m128i __B) { - return (__m128i)((__v8hi)__A == (__v8hi)__B); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v8hi)__A == (__v8hi)__B); } - -__funline __m128i _mm_cmpeq_epi32(__m128i __A, __m128i __B) { - return (__m128i)((__v4si)__A == (__v4si)__B); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v4si)__A == (__v4si)__B); } - -__funline __m128i _mm_cmplt_epi8(__m128i __A, __m128i __B) { - return (__m128i)((__v16qs)__A < (__v16qs)__B); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmplt_epi8 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v16qs)__A < (__v16qs)__B); } - -__funline __m128i _mm_cmplt_epi16(__m128i __A, __m128i __B) { - return (__m128i)((__v8hi)__A < (__v8hi)__B); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmplt_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v8hi)__A < (__v8hi)__B); } - -__funline __m128i _mm_cmplt_epi32(__m128i __A, __m128i __B) { - return (__m128i)((__v4si)__A < (__v4si)__B); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmplt_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v4si)__A < (__v4si)__B); } - -__funline __m128i _mm_cmpgt_epi8(__m128i __A, __m128i __B) { - return (__m128i)((__v16qs)__A > (__v16qs)__B); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_epi8 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v16qs)__A > (__v16qs)__B); } - -__funline __m128i _mm_cmpgt_epi16(__m128i __A, __m128i __B) { - return (__m128i)((__v8hi)__A > (__v8hi)__B); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v8hi)__A > (__v8hi)__B); } - -__funline __m128i _mm_cmpgt_epi32(__m128i __A, __m128i __B) { - return (__m128i)((__v4si)__A > (__v4si)__B); +extern 
__inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v4si)__A > (__v4si)__B); } - #ifdef __OPTIMIZE__ -__funline int _mm_extract_epi16(__m128i const __A, int const __N) { - return (unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)__A, __N); +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_extract_epi16 (__m128i const __A, int const __N) +{ + return (unsigned short) __builtin_ia32_vec_ext_v8hi ((__v8hi)__A, __N); } - -__funline __m128i _mm_insert_epi16(__m128i const __A, int const __D, - int const __N) { - return (__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)__A, __D, __N); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_insert_epi16 (__m128i const __A, int const __D, int const __N) +{ + return (__m128i) __builtin_ia32_vec_set_v8hi ((__v8hi)__A, __D, __N); } #else -#define _mm_extract_epi16(A, N) \ - ((int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(A), \ - (int)(N))) -#define _mm_insert_epi16(A, D, N) \ - ((__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(A), (int)(D), \ - (int)(N))) +#define _mm_extract_epi16(A, N) ((int) (unsigned short) __builtin_ia32_vec_ext_v8hi ((__v8hi)(__m128i)(A), (int)(N))) +#define _mm_insert_epi16(A, D, N) ((__m128i) __builtin_ia32_vec_set_v8hi ((__v8hi)(__m128i)(A), (int)(D), (int)(N))) #endif - -__funline __m128i _mm_max_epi16(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__A, (__v8hi)__B); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pmaxsw128 ((__v8hi)__A, (__v8hi)__B); } - -__funline __m128i _mm_max_epu8(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__A, (__v16qi)__B); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_epu8 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pmaxub128 ((__v16qi)__A, (__v16qi)__B); } - -__funline __m128i _mm_min_epi16(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_pminsw128((__v8hi)__A, (__v8hi)__B); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pminsw128 ((__v8hi)__A, (__v8hi)__B); } - -__funline __m128i _mm_min_epu8(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_pminub128((__v16qi)__A, (__v16qi)__B); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_epu8 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pminub128 ((__v16qi)__A, (__v16qi)__B); } - -__funline int _mm_movemask_epi8(__m128i __A) { - return __builtin_ia32_pmovmskb128((__v16qi)__A); +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_movemask_epi8 (__m128i __A) +{ + return __builtin_ia32_pmovmskb128 ((__v16qi)__A); } - -__funline __m128i _mm_mulhi_epu16(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__A, (__v8hi)__B); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mulhi_epu16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pmulhuw128 ((__v8hi)__A, (__v8hi)__B); } - #ifdef __OPTIMIZE__ -__funline __m128i _mm_shufflehi_epi16(__m128i __A, const int 
__mask) { - return (__m128i)__builtin_ia32_pshufhw((__v8hi)__A, __mask); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shufflehi_epi16 (__m128i __A, const int __mask) +{ + return (__m128i)__builtin_ia32_pshufhw ((__v8hi)__A, __mask); } - -__funline __m128i _mm_shufflelo_epi16(__m128i __A, const int __mask) { - return (__m128i)__builtin_ia32_pshuflw((__v8hi)__A, __mask); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shufflelo_epi16 (__m128i __A, const int __mask) +{ + return (__m128i)__builtin_ia32_pshuflw ((__v8hi)__A, __mask); } - -__funline __m128i _mm_shuffle_epi32(__m128i __A, const int __mask) { - return (__m128i)__builtin_ia32_pshufd((__v4si)__A, __mask); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shuffle_epi32 (__m128i __A, const int __mask) +{ + return (__m128i)__builtin_ia32_pshufd ((__v4si)__A, __mask); } #else -#define _mm_shufflehi_epi16(A, N) \ - ((__m128i)__builtin_ia32_pshufhw((__v8hi)(__m128i)(A), (int)(N))) -#define _mm_shufflelo_epi16(A, N) \ - ((__m128i)__builtin_ia32_pshuflw((__v8hi)(__m128i)(A), (int)(N))) -#define _mm_shuffle_epi32(A, N) \ - ((__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(A), (int)(N))) +#define _mm_shufflehi_epi16(A, N) ((__m128i)__builtin_ia32_pshufhw ((__v8hi)(__m128i)(A), (int)(N))) +#define _mm_shufflelo_epi16(A, N) ((__m128i)__builtin_ia32_pshuflw ((__v8hi)(__m128i)(A), (int)(N))) +#define _mm_shuffle_epi32(A, N) ((__m128i)__builtin_ia32_pshufd ((__v4si)(__m128i)(A), (int)(N))) #endif - -__funline void _mm_maskmoveu_si128(__m128i __A, __m128i __B, char *__C) { - __builtin_ia32_maskmovdqu((__v16qi)__A, (__v16qi)__B, __C); +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskmoveu_si128 (__m128i __A, __m128i __B, char *__C) +{ + __builtin_ia32_maskmovdqu ((__v16qi)__A, (__v16qi)__B, __C); } - -__funline __m128i _mm_avg_epu8(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_pavgb128((__v16qi)__A, (__v16qi)__B); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_avg_epu8 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pavgb128 ((__v16qi)__A, (__v16qi)__B); } - -__funline __m128i _mm_avg_epu16(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_pavgw128((__v8hi)__A, (__v8hi)__B); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_avg_epu16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pavgw128 ((__v8hi)__A, (__v8hi)__B); } - -__funline __m128i _mm_sad_epu8(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_psadbw128((__v16qi)__A, (__v16qi)__B); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sad_epu8 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_psadbw128 ((__v16qi)__A, (__v16qi)__B); } - -__funline void _mm_stream_si32(int *__A, int __B) { - __builtin_ia32_movnti(__A, __B); +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_stream_si32 (int *__A, int __B) +{ + __builtin_ia32_movnti (__A, __B); } - #ifdef __x86_64__ -__funline void _mm_stream_si64(long long int *__A, long long int __B) { - __builtin_ia32_movnti64(__A, __B); +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_stream_si64 (long long int *__A, long long int __B) +{ + 
__builtin_ia32_movnti64 (__A, __B); } #endif - -__funline void _mm_stream_si128(__m128i *__A, __m128i __B) { - __builtin_ia32_movntdq((__v2di *)__A, (__v2di)__B); +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_stream_si128 (__m128i *__A, __m128i __B) +{ + __builtin_ia32_movntdq ((__v2di *)__A, (__v2di)__B); } - -__funline void _mm_stream_pd(double *__A, __m128d __B) { - __builtin_ia32_movntpd(__A, (__v2df)__B); +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_stream_pd (double *__A, __m128d __B) +{ + __builtin_ia32_movntpd (__A, (__v2df)__B); } - -__funline void _mm_clflush(void const *__A) { - __builtin_ia32_clflush(__A); +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_clflush (void const *__A) +{ + __builtin_ia32_clflush (__A); } - -__funline void _mm_lfence(void) { - __builtin_ia32_lfence(); +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_lfence (void) +{ + __builtin_ia32_lfence (); } - -__funline void _mm_mfence(void) { - __builtin_ia32_mfence(); +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mfence (void) +{ + __builtin_ia32_mfence (); } - -__funline __m128i _mm_cvtsi32_si128(int __A) { - return _mm_set_epi32(0, 0, 0, __A); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi32_si128 (int __A) +{ + return _mm_set_epi32 (0, 0, 0, __A); } - #ifdef __x86_64__ - -__funline __m128i _mm_cvtsi64_si128(long long __A) { - return _mm_set_epi64x(0, __A); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi64_si128 (long long __A) +{ + return _mm_set_epi64x (0, __A); } - -__funline __m128i _mm_cvtsi64x_si128(long long __A) { - return _mm_set_epi64x(0, __A); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi64x_si128 (long long __A) +{ + return _mm_set_epi64x (0, __A); } #endif - -__funline __m128 _mm_castpd_ps(__m128d __A) { - return (__m128)__A; +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_castpd_ps(__m128d __A) +{ + return (__m128) __A; } - -__funline __m128i _mm_castpd_si128(__m128d __A) { - return (__m128i)__A; +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_castpd_si128(__m128d __A) +{ + return (__m128i) __A; } - -__funline __m128d _mm_castps_pd(__m128 __A) { - return (__m128d)__A; +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_castps_pd(__m128 __A) +{ + return (__m128d) __A; } - -__funline __m128i _mm_castps_si128(__m128 __A) { - return (__m128i)__A; +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_castps_si128(__m128 __A) +{ + return (__m128i) __A; } - -__funline __m128 _mm_castsi128_ps(__m128i __A) { - return (__m128)__A; +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_castsi128_ps(__m128i __A) +{ + return (__m128) __A; } - -__funline __m128d _mm_castsi128_pd(__m128i __A) { - return (__m128d)__A; +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_castsi128_pd(__m128i __A) +{ + return (__m128d) __A; } - #ifdef __DISABLE_SSE2__ #undef __DISABLE_SSE2__ #pragma GCC pop_options -#endif /* __DISABLE_SSE2__ */ - -#endif /* __x86_64__ 
*/ -#endif /* _EMMINTRIN_H_INCLUDED */ +#endif +#endif +#endif diff --git a/third_party/intel/enqcmdintrin.internal.h b/third_party/intel/enqcmdintrin.internal.h new file mode 100644 index 000000000..179b35553 --- /dev/null +++ b/third_party/intel/enqcmdintrin.internal.h @@ -0,0 +1,30 @@ +/* clang-format off */ +#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0) +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use <enqcmdintrin.h> directly; include <x86gprintrin.h> instead." +#endif +#ifndef _ENQCMDINTRIN_H_INCLUDED +#define _ENQCMDINTRIN_H_INCLUDED +#ifndef __ENQCMD__ +#pragma GCC push_options +#pragma GCC target ("enqcmd") +#define __DISABLE_ENQCMD__ +#endif +extern __inline int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_enqcmd (void * __P, const void * __Q) +{ + return __builtin_ia32_enqcmd (__P, __Q); +} +extern __inline int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_enqcmds (void * __P, const void * __Q) +{ + return __builtin_ia32_enqcmds (__P, __Q); +} +#ifdef __DISABLE_ENQCMD__ +#undef __DISABLE_ENQCMD__ +#pragma GCC pop_options +#endif +#endif +#endif diff --git a/third_party/intel/f16cintrin.internal.h b/third_party/intel/f16cintrin.internal.h index 71f09ec9e..13d390ae4 100644 --- a/third_party/intel/f16cintrin.internal.h +++ b/third_party/intel/f16cintrin.internal.h @@ -1,75 +1,58 @@ +/* clang-format off */ +#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0) #if !defined _X86INTRIN_H_INCLUDED && !defined _IMMINTRIN_H_INCLUDED -#error \ - "Never use <f16cintrin.h> directly; include <x86intrin.h> or <immintrin.h> instead." +# error "Never use <f16cintrin.h> directly; include <x86intrin.h> or <immintrin.h> instead." #endif - #ifndef _F16CINTRIN_H_INCLUDED #define _F16CINTRIN_H_INCLUDED - #ifndef __F16C__ #pragma GCC push_options #pragma GCC target("f16c") #define __DISABLE_F16C__ -#endif /* __F16C__ */ - -__funline float _cvtsh_ss(unsigned short __S) { - __v8hi __H = __extension__(__v8hi){(short)__S, 0, 0, 0, 0, 0, 0, 0}; - __v4sf __A = __builtin_ia32_vcvtph2ps(__H); - return __builtin_ia32_vec_ext_v4sf(__A, 0); +#endif +extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_cvtsh_ss (unsigned short __S) +{ + __v8hi __H = __extension__ (__v8hi){ (short) __S, 0, 0, 0, 0, 0, 0, 0 }; + __v4sf __A = __builtin_ia32_vcvtph2ps (__H); + return __builtin_ia32_vec_ext_v4sf (__A, 0); } - -/** - * Converts four half-precision (16-bit) floating point values to - * single-precision floating point values. - */ -__funline __m128 _mm_cvtph_ps(__m128i __A) { - return (__m128)__builtin_ia32_vcvtph2ps((__v8hi)__A); +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtph_ps (__m128i __A) +{ + return (__m128) __builtin_ia32_vcvtph2ps ((__v8hi) __A); } - -/** - * Converts eight half-precision (16-bit) floating point values to - * single-precision floating point values.
- */ -__funline __m256 _mm256_cvtph_ps(__m128i __A) { - return (__m256)__builtin_ia32_vcvtph2ps256((__v8hi)__A); +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtph_ps (__m128i __A) +{ + return (__m256) __builtin_ia32_vcvtph2ps256 ((__v8hi) __A); } - #ifdef __OPTIMIZE__ -__funline unsigned short _cvtss_sh(float __F, const int __I) { - __v4sf __A = __extension__(__v4sf){__F, 0, 0, 0}; - __v8hi __H = __builtin_ia32_vcvtps2ph(__A, __I); - return (unsigned short)__builtin_ia32_vec_ext_v8hi(__H, 0); +extern __inline unsigned short __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_cvtss_sh (float __F, const int __I) +{ + __v4sf __A = __extension__ (__v4sf){ __F, 0, 0, 0 }; + __v8hi __H = __builtin_ia32_vcvtps2ph (__A, __I); + return (unsigned short) __builtin_ia32_vec_ext_v8hi (__H, 0); } - -__funline __m128i _mm_cvtps_ph(__m128 __A, const int __I) { - return (__m128i)__builtin_ia32_vcvtps2ph((__v4sf)__A, __I); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtps_ph (__m128 __A, const int __I) +{ + return (__m128i) __builtin_ia32_vcvtps2ph ((__v4sf) __A, __I); } - -/** - * Converts eight single-precision floating point values to - * half-precision (16-bit) floating point values. - */ -__funline __m128i _mm256_cvtps_ph(__m256 __A, const int __I) { - return (__m128i)__builtin_ia32_vcvtps2ph256((__v8sf)__A, __I); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtps_ph (__m256 __A, const int __I) +{ + return (__m128i) __builtin_ia32_vcvtps2ph256 ((__v8sf) __A, __I); } #else -#define _cvtss_sh(__F, __I) \ - (__extension__({ \ - __v4sf __A = __extension__(__v4sf){__F, 0, 0, 0}; \ - __v8hi __H = __builtin_ia32_vcvtps2ph(__A, __I); \ - (unsigned short)__builtin_ia32_vec_ext_v8hi(__H, 0); \ - })) - -#define _mm_cvtps_ph(A, I) \ - ((__m128i)__builtin_ia32_vcvtps2ph((__v4sf)(__m128)A, (int)(I))) - -#define _mm256_cvtps_ph(A, I) \ - ((__m128i)__builtin_ia32_vcvtps2ph256((__v8sf)(__m256)A, (int)(I))) -#endif /* __OPTIMIZE */ - +#define _cvtss_sh(__F, __I) (__extension__ ({ __v4sf __A = __extension__ (__v4sf){ __F, 0, 0, 0 }; __v8hi __H = __builtin_ia32_vcvtps2ph (__A, __I); (unsigned short) __builtin_ia32_vec_ext_v8hi (__H, 0); })) +#define _mm_cvtps_ph(A, I) ((__m128i) __builtin_ia32_vcvtps2ph ((__v4sf)(__m128) (A), (int) (I))) +#define _mm256_cvtps_ph(A, I) ((__m128i) __builtin_ia32_vcvtps2ph256 ((__v8sf)(__m256) (A), (int) (I))) #endif #ifdef __DISABLE_F16C__ #undef __DISABLE_F16C__ #pragma GCC pop_options -#endif /* __DISABLE_F16C__ */ - -#endif /* _F16CINTRIN_H_INCLUDED */ +#endif +#endif +#endif diff --git a/third_party/intel/fma4intrin.internal.h b/third_party/intel/fma4intrin.internal.h index d2ed71ab4..f95f38583 100644 --- a/third_party/intel/fma4intrin.internal.h +++ b/third_party/intel/fma4intrin.internal.h @@ -1,184 +1,179 @@ +/* clang-format off */ +#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0) #ifndef _X86INTRIN_H_INCLUDED -#error "Never use <fma4intrin.h> directly; include <x86intrin.h> instead." +# error "Never use <fma4intrin.h> directly; include <x86intrin.h> instead."
 #endif
-
 #ifndef _FMA4INTRIN_H_INCLUDED
 #define _FMA4INTRIN_H_INCLUDED
-
 #include "third_party/intel/ammintrin.internal.h"
-
 #ifndef __FMA4__
 #pragma GCC push_options
 #pragma GCC target("fma4")
 #define __DISABLE_FMA4__
-#endif /* __FMA4__ */
-
-__funline __m128 _mm_macc_ps(__m128 __A, __m128 __B, __m128 __C) {
-  return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
-}
-
-__funline __m128d _mm_macc_pd(__m128d __A, __m128d __B, __m128d __C) {
-  return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B,
-                                          (__v2df)__C);
-}
-
-__funline __m128 _mm_macc_ss(__m128 __A, __m128 __B, __m128 __C) {
-  return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
-}
-
-__funline __m128d _mm_macc_sd(__m128d __A, __m128d __B, __m128d __C) {
-  return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B,
-                                          (__v2df)__C);
-}
-
-__funline __m128 _mm_msub_ps(__m128 __A, __m128 __B, __m128 __C)
-
+#endif
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_macc_ps (__m128 __A, __m128 __B, __m128 __C)
 {
-  return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B,
-                                         -(__v4sf)__C);
+  return (__m128) __builtin_ia32_vfmaddps ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
 }
-
-__funline __m128d _mm_msub_pd(__m128d __A, __m128d __B, __m128d __C) {
-  return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B,
-                                          -(__v2df)__C);
-}
-
-__funline __m128 _mm_msub_ss(__m128 __A, __m128 __B, __m128 __C) {
-  return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B,
-                                         -(__v4sf)__C);
-}
-
-__funline __m128d _mm_msub_sd(__m128d __A, __m128d __B, __m128d __C) {
-  return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B,
-                                          -(__v2df)__C);
-}
-
-__funline __m128 _mm_nmacc_ps(__m128 __A, __m128 __B, __m128 __C) {
-  return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B,
-                                         (__v4sf)__C);
-}
-
-__funline __m128d _mm_nmacc_pd(__m128d __A, __m128d __B, __m128d __C) {
-  return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B,
-                                          (__v2df)__C);
-}
-
-__funline __m128 _mm_nmacc_ss(__m128 __A, __m128 __B, __m128 __C) {
-  return (__m128)__builtin_ia32_vfmaddss(-(__v4sf)__A, (__v4sf)__B,
-                                         (__v4sf)__C);
-}
-
-__funline __m128d _mm_nmacc_sd(__m128d __A, __m128d __B, __m128d __C) {
-  return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B,
-                                          (__v2df)__C);
-}
-
-__funline __m128 _mm_nmsub_ps(__m128 __A, __m128 __B, __m128 __C) {
-  return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B,
-                                         -(__v4sf)__C);
-}
-
-__funline __m128d _mm_nmsub_pd(__m128d __A, __m128d __B, __m128d __C) {
-  return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B,
-                                          -(__v2df)__C);
-}
-
-__funline __m128 _mm_nmsub_ss(__m128 __A, __m128 __B, __m128 __C) {
-  return (__m128)__builtin_ia32_vfmaddss(-(__v4sf)__A, (__v4sf)__B,
-                                         -(__v4sf)__C);
-}
-
-__funline __m128d _mm_nmsub_sd(__m128d __A, __m128d __B, __m128d __C) {
-  return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B,
-                                          -(__v2df)__C);
-}
-
-__funline __m128 _mm_maddsub_ps(__m128 __A, __m128 __B, __m128 __C) {
-  return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B,
-                                            (__v4sf)__C);
-}
-
-__funline __m128d _mm_maddsub_pd(__m128d __A, __m128d __B, __m128d __C) {
-  return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B,
-                                             (__v2df)__C);
-}
-
-__funline __m128 _mm_msubadd_ps(__m128 __A, __m128 __B, __m128 __C) {
-  return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B,
-                                            -(__v4sf)__C);
-}
-
-__funline __m128d _mm_msubadd_pd(__m128d __A, __m128d __B, __m128d __C) {
-  return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B,
-                                             -(__v2df)__C);
-}
-
-/* 256b Floating point multiply/add type instructions.  */
-__funline __m256 _mm256_macc_ps(__m256 __A, __m256 __B, __m256 __C) {
-  return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B,
-                                            (__v8sf)__C);
-}
-
-__funline __m256d _mm256_macc_pd(__m256d __A, __m256d __B, __m256d __C) {
-  return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B,
-                                             (__v4df)__C);
-}
-
-__funline __m256 _mm256_msub_ps(__m256 __A, __m256 __B, __m256 __C)
-
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_macc_pd (__m128d __A, __m128d __B, __m128d __C)
 {
-  return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B,
-                                            -(__v8sf)__C);
+  return (__m128d) __builtin_ia32_vfmaddpd ((__v2df)__A, (__v2df)__B, (__v2df)__C);
 }
-
-__funline __m256d _mm256_msub_pd(__m256d __A, __m256d __B, __m256d __C) {
-  return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B,
-                                             -(__v4df)__C);
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_macc_ss (__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddss ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
 }
-
-__funline __m256 _mm256_nmacc_ps(__m256 __A, __m256 __B, __m256 __C) {
-  return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B,
-                                            (__v8sf)__C);
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_macc_sd (__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd ((__v2df)__A, (__v2df)__B, (__v2df)__C);
 }
-
-__funline __m256d _mm256_nmacc_pd(__m256d __A, __m256d __B, __m256d __C) {
-  return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B,
-                                             (__v4df)__C);
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_msub_ps (__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddps ((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
 }
-
-__funline __m256 _mm256_nmsub_ps(__m256 __A, __m256 __B, __m256 __C) {
-  return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B,
-                                            -(__v8sf)__C);
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_msub_pd (__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddpd ((__v2df)__A, (__v2df)__B, -(__v2df)__C);
 }
-
-__funline __m256d _mm256_nmsub_pd(__m256d __A, __m256d __B, __m256d __C) {
-  return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B,
-                                             -(__v4df)__C);
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_msub_ss (__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddss ((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
 }
-
-__funline __m256 _mm256_maddsub_ps(__m256 __A, __m256 __B, __m256 __C) {
-  return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B,
-                                               (__v8sf)__C);
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_msub_sd (__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd ((__v2df)__A, (__v2df)__B, -(__v2df)__C);
 }
-
-__funline __m256d _mm256_maddsub_pd(__m256d __A, __m256d __B, __m256d __C) {
-  return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B,
-                                                (__v4df)__C);
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_nmacc_ps (__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddps (-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
 }
-
-__funline __m256 _mm256_msubadd_ps(__m256 __A, __m256 __B, __m256 __C) {
-  return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B,
-                                               -(__v8sf)__C);
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_nmacc_pd (__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddpd (-(__v2df)__A, (__v2df)__B, (__v2df)__C);
 }
-
-__funline __m256d _mm256_msubadd_pd(__m256d __A, __m256d __B, __m256d __C) {
-  return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B,
-                                                -(__v4df)__C);
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_nmacc_ss (__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddss (-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_nmacc_sd (__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd (-(__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_nmsub_ps (__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddps (-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
+}
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_nmsub_pd (__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddpd (-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
+}
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_nmsub_ss (__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddss (-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
+}
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_nmsub_sd (__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd (-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
+}
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maddsub_ps (__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddsubps ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maddsub_pd (__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddsubpd ((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_msubadd_ps (__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddsubps ((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
+}
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_msubadd_pd (__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddsubpd ((__v2df)__A, (__v2df)__B, -(__v2df)__C);
+}
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_macc_ps (__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddps256 ((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+}
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_macc_pd (__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256 ((__v4df)__A, (__v4df)__B, (__v4df)__C);
+}
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_msub_ps (__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddps256 ((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
+}
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_msub_pd (__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256 ((__v4df)__A, (__v4df)__B, -(__v4df)__C);
+}
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_nmacc_ps (__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddps256 (-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+}
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_nmacc_pd (__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256 (-(__v4df)__A, (__v4df)__B, (__v4df)__C);
+}
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_nmsub_ps (__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddps256 (-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
+}
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_nmsub_pd (__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256 (-(__v4df)__A, (__v4df)__B, -(__v4df)__C);
+}
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maddsub_ps (__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddsubps256 ((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+}
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maddsub_pd (__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddsubpd256 ((__v4df)__A, (__v4df)__B, (__v4df)__C);
+}
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_msubadd_ps (__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddsubps256 ((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
+}
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_msubadd_pd (__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddsubpd256 ((__v4df)__A, (__v4df)__B, -(__v4df)__C);
 }
-
 #ifdef __DISABLE_FMA4__
 #undef __DISABLE_FMA4__
 #pragma GCC pop_options
-#endif /* __DISABLE_FMA4__ */
-
+#endif
+#endif
 #endif
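The fma4intrin.internal.h hunk above only restyles the AMD FMA4 intrinsics; the semantics (a*b plus or minus c with a single rounding) are unchanged. A minimal usage sketch, illustrative only and not part of the patch (the name fma4_axpy is invented; build with -mfma4 on an FMA4-capable AMD CPU):

  #include <x86intrin.h>
  /* one fused multiply-add per float lane: a[i]*b[i] + c[i] */
  __m128 fma4_axpy(__m128 a, __m128 b, __m128 c) {
    return _mm_macc_ps(a, b, c);  /* VFMADDPS: one rounding step */
  }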
diff --git a/third_party/intel/fmaintrin.internal.h b/third_party/intel/fmaintrin.internal.h
index 2b7daad60..77921c49a 100644
--- a/third_party/intel/fmaintrin.internal.h
+++ b/third_party/intel/fmaintrin.internal.h
@@ -1,177 +1,246 @@
+/* clang-format off */
+#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0)
 #ifndef _IMMINTRIN_H_INCLUDED
-#error "Never use directly; include instead."
+# error "Never use directly; include instead."
 #endif
-
 #ifndef _FMAINTRIN_H_INCLUDED
 #define _FMAINTRIN_H_INCLUDED
-
 #ifndef __FMA__
 #pragma GCC push_options
 #pragma GCC target("fma")
 #define __DISABLE_FMA__
-#endif /* __FMA__ */
-
-__funline __m128d _mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C) {
-  return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B,
-                                          (__v2df)__C);
-}
-
-__funline __m256d _mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C) {
-  return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B,
-                                             (__v4df)__C);
-}
-
-__funline __m128 _mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C) {
-  return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
-}
-
-__funline __m256 _mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C) {
-  return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B,
-                                            (__v8sf)__C);
-}
-
-__funline __m128d _mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C) {
-  return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B,
+#endif
+extern __inline __m128d
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmadd_pd (__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddpd ((__v2df)__A, (__v2df)__B,
                                            (__v2df)__C);
 }
-
-__funline __m128 _mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C) {
-  return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B,
-                                          (__v4sf)__C);
-}
-
-__funline __m128d _mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C) {
-  return (__m128d)__builtin_ia32_vfmsubpd((__v2df)__A, (__v2df)__B,
-                                          (__v2df)__C);
-}
-
-__funline __m256d _mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C) {
-  return (__m256d)__builtin_ia32_vfmsubpd256((__v4df)__A, (__v4df)__B,
-                                             (__v4df)__C);
-}
-
-__funline __m128 _mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C) {
-  return (__m128)__builtin_ia32_vfmsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
-}
-
-__funline __m256 _mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C) {
-  return (__m256)__builtin_ia32_vfmsubps256((__v8sf)__A, (__v8sf)__B,
-                                            (__v8sf)__C);
-}
-
-__funline __m128d _mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C) {
-  return (__m128d)__builtin_ia32_vfmsubsd3((__v2df)__A, (__v2df)__B,
-                                           (__v2df)__C);
-}
-
-__funline __m128 _mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C) {
-  return (__m128)__builtin_ia32_vfmsubss3((__v4sf)__A, (__v4sf)__B,
-                                          (__v4sf)__C);
-}
-
-__funline __m128d _mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C) {
-  return (__m128d)__builtin_ia32_vfnmaddpd((__v2df)__A, (__v2df)__B,
-                                           (__v2df)__C);
-}
-
-__funline __m256d _mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C) {
-  return (__m256d)__builtin_ia32_vfnmaddpd256((__v4df)__A, (__v4df)__B,
+extern __inline __m256d
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmadd_pd (__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmaddpd256 ((__v4df)__A, (__v4df)__B,
                                               (__v4df)__C);
 }
-
-__funline __m128 _mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C) {
-  return (__m128)__builtin_ia32_vfnmaddps((__v4sf)__A, (__v4sf)__B,
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmadd_ps (__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddps ((__v4sf)__A, (__v4sf)__B,
                                           (__v4sf)__C);
 }
-
-__funline __m256 _mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C) {
-  return (__m256)__builtin_ia32_vfnmaddps256((__v8sf)__A, (__v8sf)__B,
+extern __inline __m256
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmadd_ps (__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmaddps256 ((__v8sf)__A, (__v8sf)__B,
                                              (__v8sf)__C);
 }
-
-__funline __m128d _mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C) {
-  return (__m128d)__builtin_ia32_vfnmaddsd3((__v2df)__A, (__v2df)__B,
-                                            (__v2df)__C);
-}
-
-__funline __m128 _mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C) {
-  return (__m128)__builtin_ia32_vfnmaddss3((__v4sf)__A, (__v4sf)__B,
-                                           (__v4sf)__C);
-}
-
-__funline __m128d _mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C) {
-  return (__m128d)__builtin_ia32_vfnmsubpd((__v2df)__A, (__v2df)__B,
-                                           (__v2df)__C);
-}
-
-__funline __m256d _mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C) {
-  return (__m256d)__builtin_ia32_vfnmsubpd256((__v4df)__A, (__v4df)__B,
-                                              (__v4df)__C);
-}
-
-__funline __m128 _mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C) {
-  return (__m128)__builtin_ia32_vfnmsubps((__v4sf)__A, (__v4sf)__B,
-                                          (__v4sf)__C);
-}
-
-__funline __m256 _mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C) {
-  return (__m256)__builtin_ia32_vfnmsubps256((__v8sf)__A, (__v8sf)__B,
-                                             (__v8sf)__C);
-}
-
-__funline __m128d _mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C) {
-  return (__m128d)__builtin_ia32_vfnmsubsd3((__v2df)__A, (__v2df)__B,
-                                            (__v2df)__C);
-}
-
-__funline __m128 _mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C) {
-  return (__m128)__builtin_ia32_vfnmsubss3((__v4sf)__A, (__v4sf)__B,
-                                           (__v4sf)__C);
-}
-
-__funline __m128d _mm_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C) {
-  return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B,
+extern __inline __m128d
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmadd_sd (__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd3 ((__v2df)__A, (__v2df)__B,
                                              (__v2df)__C);
 }
-
-__funline __m256d _mm256_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C) {
-  return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B,
-                                                (__v4df)__C);
-}
-
-__funline __m128 _mm_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C) {
-  return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B,
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmadd_ss (__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddss3 ((__v4sf)__A, (__v4sf)__B,
                                             (__v4sf)__C);
 }
-
-__funline __m256 _mm256_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C) {
-  return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B,
-                                               (__v8sf)__C);
+extern __inline __m128d
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmsub_pd (__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmsubpd ((__v2df)__A, (__v2df)__B,
+                                           (__v2df)__C);
 }
-
-__funline __m128d _mm_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C) {
-  return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B,
-                                             -(__v2df)__C);
+extern __inline __m256d
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmsub_pd (__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmsubpd256 ((__v4df)__A, (__v4df)__B,
+                                              (__v4df)__C);
 }
-
-__funline __m256d _mm256_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C) {
-  return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B,
-                                                -(__v4df)__C);
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmsub_ps (__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmsubps ((__v4sf)__A, (__v4sf)__B,
+                                          (__v4sf)__C);
 }
-
-__funline __m128 _mm_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C) {
-  return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B,
-                                            -(__v4sf)__C);
+extern __inline __m256
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmsub_ps (__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmsubps256 ((__v8sf)__A, (__v8sf)__B,
+                                             (__v8sf)__C);
 }
-
-__funline __m256 _mm256_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C) {
-  return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B,
-                                               -(__v8sf)__C);
+extern __inline __m128d
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmsub_sd (__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmsubsd3 ((__v2df)__A, (__v2df)__B,
+                                            (__v2df)__C);
+}
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmsub_ss (__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmsubss3 ((__v4sf)__A, (__v4sf)__B,
+                                           (__v4sf)__C);
+}
+extern __inline __m128d
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fnmadd_pd (__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfnmaddpd ((__v2df)__A, (__v2df)__B,
+                                            (__v2df)__C);
+}
+extern __inline __m256d
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fnmadd_pd (__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfnmaddpd256 ((__v4df)__A, (__v4df)__B,
+                                               (__v4df)__C);
+}
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fnmadd_ps (__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfnmaddps ((__v4sf)__A, (__v4sf)__B,
+                                           (__v4sf)__C);
+}
+extern __inline __m256
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fnmadd_ps (__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfnmaddps256 ((__v8sf)__A, (__v8sf)__B,
+                                              (__v8sf)__C);
+}
+extern __inline __m128d
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fnmadd_sd (__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfnmaddsd3 ((__v2df)__A, (__v2df)__B,
+                                             (__v2df)__C);
+}
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fnmadd_ss (__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfnmaddss3 ((__v4sf)__A, (__v4sf)__B,
+                                            (__v4sf)__C);
+}
+extern __inline __m128d
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fnmsub_pd (__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfnmsubpd ((__v2df)__A, (__v2df)__B,
+                                            (__v2df)__C);
+}
+extern __inline __m256d
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fnmsub_pd (__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfnmsubpd256 ((__v4df)__A, (__v4df)__B,
+                                               (__v4df)__C);
+}
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fnmsub_ps (__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfnmsubps ((__v4sf)__A, (__v4sf)__B,
+                                           (__v4sf)__C);
+}
+extern __inline __m256
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fnmsub_ps (__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfnmsubps256 ((__v8sf)__A, (__v8sf)__B,
+                                              (__v8sf)__C);
+}
+extern __inline __m128d
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fnmsub_sd (__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfnmsubsd3 ((__v2df)__A, (__v2df)__B,
+                                             (__v2df)__C);
+}
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fnmsub_ss (__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfnmsubss3 ((__v4sf)__A, (__v4sf)__B,
+                                            (__v4sf)__C);
+}
+extern __inline __m128d
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmaddsub_pd (__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddsubpd ((__v2df)__A, (__v2df)__B,
+                                              (__v2df)__C);
+}
+extern __inline __m256d
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmaddsub_pd (__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmaddsubpd256 ((__v4df)__A,
+                                                 (__v4df)__B,
+                                                 (__v4df)__C);
+}
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmaddsub_ps (__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddsubps ((__v4sf)__A, (__v4sf)__B,
+                                             (__v4sf)__C);
+}
+extern __inline __m256
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmaddsub_ps (__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmaddsubps256 ((__v8sf)__A,
+                                                (__v8sf)__B,
+                                                (__v8sf)__C);
+}
+extern __inline __m128d
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmsubadd_pd (__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddsubpd ((__v2df)__A, (__v2df)__B,
+                                              -(__v2df)__C);
+}
+extern __inline __m256d
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmsubadd_pd (__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmaddsubpd256 ((__v4df)__A,
+                                                 (__v4df)__B,
+                                                 -(__v4df)__C);
+}
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmsubadd_ps (__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddsubps ((__v4sf)__A, (__v4sf)__B,
+                                             -(__v4sf)__C);
+}
+extern __inline __m256
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmsubadd_ps (__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmaddsubps256 ((__v8sf)__A,
+                                                (__v8sf)__B,
+                                                -(__v8sf)__C);
 }
-
 #ifdef __DISABLE_FMA__
 #undef __DISABLE_FMA__
 #pragma GCC pop_options
-#endif /* __DISABLE_FMA__ */
-
+#endif
+#endif
 #endif
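Likewise for the FMA3 header: the operations are unchanged, only the spelling of the inline wrappers moves to the upstream GCC style. A quick sanity check, illustrative and not part of the patch (fma_axpy4d is an invented name; needs -mfma):

  #include <immintrin.h>
  /* acc[i] + a[i]*b[i] over four doubles, fused into one rounding */
  __m256d fma_axpy4d(__m256d acc, __m256d a, __m256d b) {
    return _mm256_fmadd_pd(a, b, acc);  /* VFMADD{132,213,231}PD */
  }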
diff --git a/third_party/intel/fxsrintrin.internal.h b/third_party/intel/fxsrintrin.internal.h
index 30d15b154..e81b81d80 100644
--- a/third_party/intel/fxsrintrin.internal.h
+++ b/third_party/intel/fxsrintrin.internal.h
@@ -1,37 +1,44 @@
-#if !defined _IMMINTRIN_H_INCLUDED
-#error "Never use directly; include instead."
+/* clang-format off */
+#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0)
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use directly; include instead."
 #endif
-
 #ifndef _FXSRINTRIN_H_INCLUDED
 #define _FXSRINTRIN_H_INCLUDED
-
 #ifndef __FXSR__
 #pragma GCC push_options
 #pragma GCC target("fxsr")
 #define __DISABLE_FXSR__
-#endif /* __FXSR__ */
-
-__funline void _fxsave(void *__P) {
-  __builtin_ia32_fxsave(__P);
+#endif
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_fxsave (void *__P)
+{
+  __builtin_ia32_fxsave (__P);
 }
-
-__funline void _fxrstor(void *__P) {
-  __builtin_ia32_fxrstor(__P);
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_fxrstor (void *__P)
+{
+  __builtin_ia32_fxrstor (__P);
 }
-
 #ifdef __x86_64__
-__funline void _fxsave64(void *__P) {
-  __builtin_ia32_fxsave64(__P);
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_fxsave64 (void *__P)
+{
+  __builtin_ia32_fxsave64 (__P);
}
-
-__funline void _fxrstor64(void *__P) {
-  __builtin_ia32_fxrstor64(__P);
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_fxrstor64 (void *__P)
+{
+  __builtin_ia32_fxrstor64 (__P);
 }
 #endif
-
 #ifdef __DISABLE_FXSR__
 #undef __DISABLE_FXSR__
 #pragma GCC pop_options
-#endif /* __DISABLE_FXSR__ */
-
-#endif /* _FXSRINTRIN_H_INCLUDED */
+#endif
+#endif
+#endif
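The fxsr wrappers above save and restore x87+SSE state through FXSAVE/FXRSTOR, which require a 512-byte, 16-byte-aligned area whose layout is hardware-defined. An illustrative sketch under those assumptions (the function name is invented; build with -mfxsr):

  #include <x86intrin.h>
  void fpu_checkpoint_demo(void) {
    _Alignas(16) unsigned char area[512];  /* FXSAVE image */
    _fxsave(area);
    /* ... code that may clobber x87/SSE state ... */
    _fxrstor(area);
  }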
diff --git a/third_party/intel/gfniintrin.internal.h b/third_party/intel/gfniintrin.internal.h
index 1e345a0e9..987efe2cb 100644
--- a/third_party/intel/gfniintrin.internal.h
+++ b/third_party/intel/gfniintrin.internal.h
@@ -1,311 +1,310 @@
+/* clang-format off */
+#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0)
 #ifndef _IMMINTRIN_H_INCLUDED
 #error "Never use directly; include instead."
 #endif
-
 #ifndef _GFNIINTRIN_H_INCLUDED
 #define _GFNIINTRIN_H_INCLUDED
-
 #if !defined(__GFNI__) || !defined(__SSE2__)
 #pragma GCC push_options
 #pragma GCC target("gfni,sse2")
 #define __DISABLE_GFNI__
-#endif /* __GFNI__ */
-
-__funline __m128i _mm_gf2p8mul_epi8(__m128i __A, __m128i __B) {
-  return (__m128i)__builtin_ia32_vgf2p8mulb_v16qi((__v16qi)__A, (__v16qi)__B);
+#endif
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_gf2p8mul_epi8 (__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vgf2p8mulb_v16qi((__v16qi) __A,
+                                                   (__v16qi) __B);
 }
-
 #ifdef __OPTIMIZE__
-__funline __m128i _mm_gf2p8affineinv_epi64_epi8(__m128i __A, __m128i __B,
-                                               const int __C) {
-  return (__m128i)__builtin_ia32_vgf2p8affineinvqb_v16qi((__v16qi)__A,
-                                                         (__v16qi)__B, __C);
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_gf2p8affineinv_epi64_epi8 (__m128i __A, __m128i __B, const int __C)
+{
+  return (__m128i) __builtin_ia32_vgf2p8affineinvqb_v16qi ((__v16qi) __A,
+                                                           (__v16qi) __B,
+                                                           __C);
 }
-
-__funline __m128i _mm_gf2p8affine_epi64_epi8(__m128i __A, __m128i __B,
-                                            const int __C) {
-  return (__m128i)__builtin_ia32_vgf2p8affineqb_v16qi((__v16qi)__A,
-                                                      (__v16qi)__B, __C);
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_gf2p8affine_epi64_epi8 (__m128i __A, __m128i __B, const int __C)
+{
+  return (__m128i) __builtin_ia32_vgf2p8affineqb_v16qi ((__v16qi) __A,
+                                                        (__v16qi) __B, __C);
 }
 #else
-#define _mm_gf2p8affineinv_epi64_epi8(A, B, C)                 \
-  ((__m128i)__builtin_ia32_vgf2p8affineinvqb_v16qi(            \
-      (__v16qi)(__m128i)(A), (__v16qi)(__m128i)(B), (int)(C)))
-#define _mm_gf2p8affine_epi64_epi8(A, B, C)                    \
-  ((__m128i)__builtin_ia32_vgf2p8affineqb_v16qi(               \
-      (__v16qi)(__m128i)(A), (__v16qi)(__m128i)(B), (int)(C)))
+#define _mm_gf2p8affineinv_epi64_epi8(A, B, C) ((__m128i) __builtin_ia32_vgf2p8affineinvqb_v16qi((__v16qi)(__m128i)(A), (__v16qi)(__m128i)(B), (int)(C)))
+#define _mm_gf2p8affine_epi64_epi8(A, B, C) ((__m128i) __builtin_ia32_vgf2p8affineqb_v16qi ((__v16qi)(__m128i)(A), (__v16qi)(__m128i)(B), (int)(C)))
 #endif
-
 #ifdef __DISABLE_GFNI__
 #undef __DISABLE_GFNI__
 #pragma GCC pop_options
-#endif /* __DISABLE_GFNI__ */
-
+#endif
 #if !defined(__GFNI__) || !defined(__AVX__)
 #pragma GCC push_options
 #pragma GCC target("gfni,avx")
 #define __DISABLE_GFNIAVX__
-#endif /* __GFNIAVX__ */
-
-__funline __m256i _mm256_gf2p8mul_epi8(__m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_vgf2p8mulb_v32qi((__v32qi)__A, (__v32qi)__B);
+#endif
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_gf2p8mul_epi8 (__m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vgf2p8mulb_v32qi ((__v32qi) __A,
+                                                    (__v32qi) __B);
 }
-
 #ifdef __OPTIMIZE__
-__funline __m256i _mm256_gf2p8affineinv_epi64_epi8(__m256i __A, __m256i __B,
-                                                  const int __C) {
-  return (__m256i)__builtin_ia32_vgf2p8affineinvqb_v32qi((__v32qi)__A,
-                                                         (__v32qi)__B, __C);
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_gf2p8affineinv_epi64_epi8 (__m256i __A, __m256i __B, const int __C)
+{
+  return (__m256i) __builtin_ia32_vgf2p8affineinvqb_v32qi ((__v32qi) __A,
+                                                           (__v32qi) __B,
+                                                           __C);
 }
-
-__funline __m256i _mm256_gf2p8affine_epi64_epi8(__m256i __A, __m256i __B,
-                                               const int __C) {
-  return (__m256i)__builtin_ia32_vgf2p8affineqb_v32qi((__v32qi)__A,
-                                                      (__v32qi)__B, __C);
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_gf2p8affine_epi64_epi8 (__m256i __A, __m256i __B, const int __C)
+{
+  return (__m256i) __builtin_ia32_vgf2p8affineqb_v32qi ((__v32qi) __A,
+                                                        (__v32qi) __B, __C);
 }
 #else
-#define _mm256_gf2p8affineinv_epi64_epi8(A, B, C)              \
-  ((__m256i)__builtin_ia32_vgf2p8affineinvqb_v32qi(            \
-      (__v32qi)(__m256i)(A), (__v32qi)(__m256i)(B), (int)(C)))
-#define _mm256_gf2p8affine_epi64_epi8(A, B, C)                 \
-  ((__m256i)__builtin_ia32_vgf2p8affineqb_v32qi(               \
-      (__v32qi)(__m256i)(A), (__v32qi)(__m256i)(B), (int)(C)))
+#define _mm256_gf2p8affineinv_epi64_epi8(A, B, C) ((__m256i) __builtin_ia32_vgf2p8affineinvqb_v32qi((__v32qi)(__m256i)(A), (__v32qi)(__m256i)(B), (int)(C)))
+#define _mm256_gf2p8affine_epi64_epi8(A, B, C) ((__m256i) __builtin_ia32_vgf2p8affineqb_v32qi ((__v32qi)(__m256i)(A), ( __v32qi)(__m256i)(B), (int)(C)))
 #endif
-
 #ifdef __DISABLE_GFNIAVX__
 #undef __DISABLE_GFNIAVX__
 #pragma GCC pop_options
-#endif /* __GFNIAVX__ */
-
+#endif
 #if !defined(__GFNI__) || !defined(__AVX512VL__)
 #pragma GCC push_options
 #pragma GCC target("gfni,avx512vl")
 #define __DISABLE_GFNIAVX512VL__
-#endif /* __GFNIAVX512VL__ */
-
-__funline __m128i _mm_mask_gf2p8mul_epi8(__m128i __A, __mmask16 __B, __m128i __C,
-                                        __m128i __D) {
-  return (__m128i)__builtin_ia32_vgf2p8mulb_v16qi_mask(
-      (__v16qi)__C, (__v16qi)__D, (__v16qi)__A, __B);
+#endif
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_gf2p8mul_epi8 (__m128i __A, __mmask16 __B, __m128i __C, __m128i __D)
+{
+  return (__m128i) __builtin_ia32_vgf2p8mulb_v16qi_mask ((__v16qi) __C,
+                                                         (__v16qi) __D,
+                                                         (__v16qi)__A, __B);
 }
-
-__funline __m128i _mm_maskz_gf2p8mul_epi8(__mmask16 __A, __m128i __B,
-                                         __m128i __C) {
-  return (__m128i)__builtin_ia32_vgf2p8mulb_v16qi_mask(
-      (__v16qi)__B, (__v16qi)__C, (__v16qi)_mm_setzero_si128(), __A);
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_gf2p8mul_epi8 (__mmask16 __A, __m128i __B, __m128i __C)
+{
+  return (__m128i) __builtin_ia32_vgf2p8mulb_v16qi_mask ((__v16qi) __B,
+                                                         (__v16qi) __C, (__v16qi) _mm_setzero_si128 (), __A);
 }
-
 #ifdef __OPTIMIZE__
-__funline __m128i _mm_mask_gf2p8affineinv_epi64_epi8(__m128i __A, __mmask16 __B,
-                                                    __m128i __C, __m128i __D,
-                                                    const int __E) {
-  return (__m128i)__builtin_ia32_vgf2p8affineinvqb_v16qi_mask(
-      (__v16qi)__C, (__v16qi)__D, __E, (__v16qi)__A, __B);
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_gf2p8affineinv_epi64_epi8 (__m128i __A, __mmask16 __B, __m128i __C,
+                                    __m128i __D, const int __E)
+{
+  return (__m128i) __builtin_ia32_vgf2p8affineinvqb_v16qi_mask ((__v16qi) __C,
+                                                                (__v16qi) __D,
+                                                                __E,
+                                                                (__v16qi)__A,
+                                                                __B);
 }
-
-__funline __m128i _mm_maskz_gf2p8affineinv_epi64_epi8(__mmask16 __A, __m128i __B,
-                                                     __m128i __C,
-                                                     const int __D) {
-  return (__m128i)__builtin_ia32_vgf2p8affineinvqb_v16qi_mask(
-      (__v16qi)__B, (__v16qi)__C, __D, (__v16qi)_mm_setzero_si128(), __A);
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_gf2p8affineinv_epi64_epi8 (__mmask16 __A, __m128i __B, __m128i __C,
+                                     const int __D)
+{
+  return (__m128i) __builtin_ia32_vgf2p8affineinvqb_v16qi_mask ((__v16qi) __B,
+                                                                (__v16qi) __C, __D,
+                                                                (__v16qi) _mm_setzero_si128 (),
+                                                                __A);
 }
-
-__funline __m128i _mm_mask_gf2p8affine_epi64_epi8(__m128i __A, __mmask16 __B,
-                                                 __m128i __C, __m128i __D,
-                                                 const int __E) {
-  return (__m128i)__builtin_ia32_vgf2p8affineqb_v16qi_mask(
-      (__v16qi)__C, (__v16qi)__D, __E, (__v16qi)__A, __B);
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_gf2p8affine_epi64_epi8 (__m128i __A, __mmask16 __B, __m128i __C,
+                                 __m128i __D, const int __E)
+{
+  return (__m128i) __builtin_ia32_vgf2p8affineqb_v16qi_mask ((__v16qi) __C,
+                                                             (__v16qi) __D, __E, (__v16qi)__A, __B);
 }
-
-__funline __m128i _mm_maskz_gf2p8affine_epi64_epi8(__mmask16 __A, __m128i __B,
-                                                  __m128i __C, const int __D) {
-  return (__m128i)__builtin_ia32_vgf2p8affineqb_v16qi_mask(
-      (__v16qi)__B, (__v16qi)__C, __D, (__v16qi)_mm_setzero_si128(), __A);
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_gf2p8affine_epi64_epi8 (__mmask16 __A, __m128i __B, __m128i __C,
+                                  const int __D)
+{
+  return (__m128i) __builtin_ia32_vgf2p8affineqb_v16qi_mask ((__v16qi) __B,
+                                                             (__v16qi) __C, __D, (__v16qi) _mm_setzero_si128 (), __A);
 }
 #else
-#define _mm_mask_gf2p8affineinv_epi64_epi8(A, B, C, D, E)      \
-  ((__m128i)__builtin_ia32_vgf2p8affineinvqb_v16qi_mask(       \
-      (__v16qi)(__m128i)(C), (__v16qi)(__m128i)(D), (int)(E),  \
-      (__v16qi)(__m128i)(A), (__mmask16)(B)))
-#define _mm_maskz_gf2p8affineinv_epi64_epi8(A, B, C, D)        \
-  ((__m128i)__builtin_ia32_vgf2p8affineinvqb_v16qi_mask(       \
-      (__v16qi)(__m128i)(B), (__v16qi)(__m128i)(C), (int)(D),  \
-      (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask16)(A)))
-#define _mm_mask_gf2p8affine_epi64_epi8(A, B, C, D, E)         \
-  ((__m128i)__builtin_ia32_vgf2p8affineqb_v16qi_mask(          \
-      (__v16qi)(__m128i)(C), (__v16qi)(__m128i)(D), (int)(E),  \
-      (__v16qi)(__m128i)(A), (__mmask16)(B)))
-#define _mm_maskz_gf2p8affine_epi64_epi8(A, B, C, D)           \
-  ((__m128i)__builtin_ia32_vgf2p8affineqb_v16qi_mask(          \
-      (__v16qi)(__m128i)(B), (__v16qi)(__m128i)(C), (int)(D),  \
-      (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask16)(A)))
+#define _mm_mask_gf2p8affineinv_epi64_epi8(A, B, C, D, E) ((__m128i) __builtin_ia32_vgf2p8affineinvqb_v16qi_mask( (__v16qi)(__m128i)(C), (__v16qi)(__m128i)(D), (int)(E), (__v16qi)(__m128i)(A), (__mmask16)(B)))
+#define _mm_maskz_gf2p8affineinv_epi64_epi8(A, B, C, D) ((__m128i) __builtin_ia32_vgf2p8affineinvqb_v16qi_mask( (__v16qi)(__m128i)(B), (__v16qi)(__m128i)(C), (int)(D), (__v16qi)(__m128i) _mm_setzero_si128 (), (__mmask16)(A)))
+#define _mm_mask_gf2p8affine_epi64_epi8(A, B, C, D, E) ((__m128i) __builtin_ia32_vgf2p8affineqb_v16qi_mask((__v16qi)(__m128i)(C), (__v16qi)(__m128i)(D), (int)(E), (__v16qi)(__m128i)(A), (__mmask16)(B)))
+#define _mm_maskz_gf2p8affine_epi64_epi8(A, B, C, D) ((__m128i) __builtin_ia32_vgf2p8affineqb_v16qi_mask((__v16qi)(__m128i)(B), (__v16qi)(__m128i)(C), (int)(D), (__v16qi)(__m128i) _mm_setzero_si128 (), (__mmask16)(A)))
 #endif
-
 #ifdef __DISABLE_GFNIAVX512VL__
 #undef __DISABLE_GFNIAVX512VL__
 #pragma GCC pop_options
-#endif /* __GFNIAVX512VL__ */
-
+#endif
 #if !defined(__GFNI__) || !defined(__AVX512VL__) || !defined(__AVX512BW__)
 #pragma GCC push_options
 #pragma GCC target("gfni,avx512vl,avx512bw")
 #define __DISABLE_GFNIAVX512VLBW__
-#endif /* __GFNIAVX512VLBW__ */
-
-__funline __m256i _mm256_mask_gf2p8mul_epi8(__m256i __A, __mmask32 __B,
-                                           __m256i __C, __m256i __D) {
-  return (__m256i)__builtin_ia32_vgf2p8mulb_v32qi_mask(
-      (__v32qi)__C, (__v32qi)__D, (__v32qi)__A, __B);
+#endif
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_gf2p8mul_epi8 (__m256i __A, __mmask32 __B, __m256i __C,
+                           __m256i __D)
+{
+  return (__m256i) __builtin_ia32_vgf2p8mulb_v32qi_mask ((__v32qi) __C,
+                                                         (__v32qi) __D,
+                                                         (__v32qi)__A, __B);
 }
-
-__funline __m256i _mm256_maskz_gf2p8mul_epi8(__mmask32 __A, __m256i __B,
-                                            __m256i __C) {
-  return (__m256i)__builtin_ia32_vgf2p8mulb_v32qi_mask(
-      (__v32qi)__B, (__v32qi)__C, (__v32qi)_mm256_setzero_si256(), __A);
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_gf2p8mul_epi8 (__mmask32 __A, __m256i __B, __m256i __C)
+{
+  return (__m256i) __builtin_ia32_vgf2p8mulb_v32qi_mask ((__v32qi) __B,
+                                                         (__v32qi) __C, (__v32qi) _mm256_setzero_si256 (), __A);
 }
-
 #ifdef __OPTIMIZE__
-__funline __m256i _mm256_mask_gf2p8affineinv_epi64_epi8(__m256i __A,
-                                                       __mmask32 __B,
-                                                       __m256i __C, __m256i __D,
-                                                       const int __E) {
-  return (__m256i)__builtin_ia32_vgf2p8affineinvqb_v32qi_mask(
-      (__v32qi)__C, (__v32qi)__D, __E, (__v32qi)__A, __B);
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_gf2p8affineinv_epi64_epi8 (__m256i __A, __mmask32 __B,
+                                       __m256i __C, __m256i __D, const int __E)
+{
+  return (__m256i) __builtin_ia32_vgf2p8affineinvqb_v32qi_mask ((__v32qi) __C,
+                                                                (__v32qi) __D,
+                                                                __E,
+                                                                (__v32qi)__A,
+                                                                __B);
 }
-
-__funline __m256i _mm256_maskz_gf2p8affineinv_epi64_epi8(__mmask32 __A,
-                                                        __m256i __B, __m256i __C,
-                                                        const int __D) {
-  return (__m256i)__builtin_ia32_vgf2p8affineinvqb_v32qi_mask(
-      (__v32qi)__B, (__v32qi)__C, __D, (__v32qi)_mm256_setzero_si256(), __A);
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_gf2p8affineinv_epi64_epi8 (__mmask32 __A, __m256i __B,
+                                        __m256i __C, const int __D)
+{
+  return (__m256i) __builtin_ia32_vgf2p8affineinvqb_v32qi_mask ((__v32qi) __B,
+                                                                (__v32qi) __C, __D,
+                                                                (__v32qi) _mm256_setzero_si256 (), __A);
 }
-
-__funline __m256i _mm256_mask_gf2p8affine_epi64_epi8(__m256i __A, __mmask32 __B,
-                                                    __m256i __C, __m256i __D,
-                                                    const int __E) {
-  return (__m256i)__builtin_ia32_vgf2p8affineqb_v32qi_mask(
-      (__v32qi)__C, (__v32qi)__D, __E, (__v32qi)__A, __B);
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_gf2p8affine_epi64_epi8 (__m256i __A, __mmask32 __B, __m256i __C,
+                                    __m256i __D, const int __E)
+{
+  return (__m256i) __builtin_ia32_vgf2p8affineqb_v32qi_mask ((__v32qi) __C,
+                                                             (__v32qi) __D,
+                                                             __E,
+                                                             (__v32qi)__A,
+                                                             __B);
 }
-
-__funline __m256i _mm256_maskz_gf2p8affine_epi64_epi8(__mmask32 __A, __m256i __B,
-                                                     __m256i __C,
-                                                     const int __D) {
-  return (__m256i)__builtin_ia32_vgf2p8affineqb_v32qi_mask(
-      (__v32qi)__B, (__v32qi)__C, __D, (__v32qi)_mm256_setzero_si256(), __A);
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_gf2p8affine_epi64_epi8 (__mmask32 __A, __m256i __B,
+                                     __m256i __C, const int __D)
+{
+  return (__m256i) __builtin_ia32_vgf2p8affineqb_v32qi_mask ((__v32qi) __B,
+                                                             (__v32qi) __C, __D, (__v32qi)_mm256_setzero_si256 (), __A);
 }
 #else
-#define _mm256_mask_gf2p8affineinv_epi64_epi8(A, B, C, D, E)   \
-  ((__m256i)__builtin_ia32_vgf2p8affineinvqb_v32qi_mask(       \
-      (__v32qi)(__m256i)(C), (__v32qi)(__m256i)(D), (int)(E),  \
-      (__v32qi)(__m256i)(A), (__mmask32)(B)))
-#define _mm256_maskz_gf2p8affineinv_epi64_epi8(A, B, C, D)     \
-  ((__m256i)__builtin_ia32_vgf2p8affineinvqb_v32qi_mask(       \
-      (__v32qi)(__m256i)(B), (__v32qi)(__m256i)(C), (int)(D),  \
-      (__v32qi)(__m256i)_mm256_setzero_si256(), (__mmask32)(A)))
-#define _mm256_mask_gf2p8affine_epi64_epi8(A, B, C, D, E)      \
-  ((__m256i)__builtin_ia32_vgf2p8affineqb_v32qi_mask(          \
-      (__v32qi)(__m256i)(C), (__v32qi)(__m256i)(D), (int)(E),  \
-      (__v32qi)(__m256i)(A), (__mmask32)(B)))
-#define _mm256_maskz_gf2p8affine_epi64_epi8(A, B, C, D)        \
-  ((__m256i)__builtin_ia32_vgf2p8affineqb_v32qi_mask(          \
-      (__v32qi)(__m256i)(B), (__v32qi)(__m256i)(C), (int)(D),  \
-      (__v32qi)(__m256i)_mm256_setzero_si256(), (__mmask32)(A)))
+#define _mm256_mask_gf2p8affineinv_epi64_epi8(A, B, C, D, E) ((__m256i) __builtin_ia32_vgf2p8affineinvqb_v32qi_mask( (__v32qi)(__m256i)(C), (__v32qi)(__m256i)(D), (int)(E), (__v32qi)(__m256i)(A), (__mmask32)(B)))
+#define _mm256_maskz_gf2p8affineinv_epi64_epi8(A, B, C, D) ((__m256i) __builtin_ia32_vgf2p8affineinvqb_v32qi_mask( (__v32qi)(__m256i)(B), (__v32qi)(__m256i)(C), (int)(D), (__v32qi)(__m256i) _mm256_setzero_si256 (), (__mmask32)(A)))
+#define _mm256_mask_gf2p8affine_epi64_epi8(A, B, C, D, E) ((__m256i) __builtin_ia32_vgf2p8affineqb_v32qi_mask((__v32qi)(__m256i)(C), (__v32qi)(__m256i)(D), (int)(E), (__v32qi)(__m256i)(A), (__mmask32)(B)))
+#define _mm256_maskz_gf2p8affine_epi64_epi8(A, B, C, D) ((__m256i) __builtin_ia32_vgf2p8affineqb_v32qi_mask((__v32qi)(__m256i)(B), (__v32qi)(__m256i)(C), (int)(D), (__v32qi)(__m256i) _mm256_setzero_si256 (), (__mmask32)(A)))
 #endif
-
 #ifdef __DISABLE_GFNIAVX512VLBW__
 #undef __DISABLE_GFNIAVX512VLBW__
 #pragma GCC pop_options
-#endif /* __GFNIAVX512VLBW__ */
-
+#endif
 #if !defined(__GFNI__) || !defined(__AVX512F__) || !defined(__AVX512BW__)
 #pragma GCC push_options
 #pragma GCC target("gfni,avx512f,avx512bw")
 #define __DISABLE_GFNIAVX512FBW__
-#endif /* __GFNIAVX512FBW__ */
-
-__funline __m512i _mm512_mask_gf2p8mul_epi8(__m512i __A, __mmask64 __B,
-                                           __m512i __C, __m512i __D) {
-  return (__m512i)__builtin_ia32_vgf2p8mulb_v64qi_mask(
-      (__v64qi)__C, (__v64qi)__D, (__v64qi)__A, __B);
+#endif
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_gf2p8mul_epi8 (__m512i __A, __mmask64 __B, __m512i __C,
+                           __m512i __D)
+{
+  return (__m512i) __builtin_ia32_vgf2p8mulb_v64qi_mask ((__v64qi) __C,
+                                                         (__v64qi) __D, (__v64qi)__A, __B);
 }
-
-__funline __m512i _mm512_maskz_gf2p8mul_epi8(__mmask64 __A, __m512i __B,
-                                            __m512i __C) {
-  return (__m512i)__builtin_ia32_vgf2p8mulb_v64qi_mask(
-      (__v64qi)__B, (__v64qi)__C, (__v64qi)_mm512_setzero_si512(), __A);
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_gf2p8mul_epi8 (__mmask64 __A, __m512i __B, __m512i __C)
+{
+  return (__m512i) __builtin_ia32_vgf2p8mulb_v64qi_mask ((__v64qi) __B,
+                                                         (__v64qi) __C, (__v64qi) _mm512_setzero_si512 (), __A);
 }
-__funline __m512i _mm512_gf2p8mul_epi8(__m512i __A, __m512i __B) {
-  return (__m512i)__builtin_ia32_vgf2p8mulb_v64qi((__v64qi)__A, (__v64qi)__B);
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_gf2p8mul_epi8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vgf2p8mulb_v64qi ((__v64qi) __A,
+                                                    (__v64qi) __B);
 }
-
 #ifdef __OPTIMIZE__
-__funline __m512i _mm512_mask_gf2p8affineinv_epi64_epi8(__m512i __A,
-                                                       __mmask64 __B,
-                                                       __m512i __C, __m512i __D,
-                                                       const int __E) {
-  return (__m512i)__builtin_ia32_vgf2p8affineinvqb_v64qi_mask(
-      (__v64qi)__C, (__v64qi)__D, __E, (__v64qi)__A, __B);
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_gf2p8affineinv_epi64_epi8 (__m512i __A, __mmask64 __B, __m512i __C,
+                                       __m512i __D, const int __E)
+{
+  return (__m512i) __builtin_ia32_vgf2p8affineinvqb_v64qi_mask ((__v64qi) __C,
+                                                                (__v64qi) __D,
+                                                                __E,
+                                                                (__v64qi)__A,
+                                                                __B);
 }
-
-__funline __m512i _mm512_maskz_gf2p8affineinv_epi64_epi8(__mmask64 __A,
-                                                        __m512i __B, __m512i __C,
-                                                        const int __D) {
-  return (__m512i)__builtin_ia32_vgf2p8affineinvqb_v64qi_mask(
-      (__v64qi)__B, (__v64qi)__C, __D, (__v64qi)_mm512_setzero_si512(), __A);
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_gf2p8affineinv_epi64_epi8 (__mmask64 __A, __m512i __B,
+                                        __m512i __C, const int __D)
+{
+  return (__m512i) __builtin_ia32_vgf2p8affineinvqb_v64qi_mask ((__v64qi) __B,
+                                                                (__v64qi) __C, __D,
+                                                                (__v64qi) _mm512_setzero_si512 (), __A);
 }
-
-__funline __m512i _mm512_gf2p8affineinv_epi64_epi8(__m512i __A, __m512i __B,
-                                                  const int __C) {
-  return (__m512i)__builtin_ia32_vgf2p8affineinvqb_v64qi((__v64qi)__A,
-                                                         (__v64qi)__B, __C);
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_gf2p8affineinv_epi64_epi8 (__m512i __A, __m512i __B, const int __C)
+{
+  return (__m512i) __builtin_ia32_vgf2p8affineinvqb_v64qi ((__v64qi) __A,
+                                                           (__v64qi) __B, __C);
 }
-
-__funline __m512i _mm512_mask_gf2p8affine_epi64_epi8(__m512i __A, __mmask64 __B,
-                                                    __m512i __C, __m512i __D,
-                                                    const int __E) {
-  return (__m512i)__builtin_ia32_vgf2p8affineqb_v64qi_mask(
-      (__v64qi)__C, (__v64qi)__D, __E, (__v64qi)__A, __B);
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_gf2p8affine_epi64_epi8 (__m512i __A, __mmask64 __B, __m512i __C,
+                                    __m512i __D, const int __E)
+{
+  return (__m512i) __builtin_ia32_vgf2p8affineqb_v64qi_mask ((__v64qi) __C,
+                                                             (__v64qi) __D, __E, (__v64qi)__A, __B);
 }
-
-__funline __m512i _mm512_maskz_gf2p8affine_epi64_epi8(__mmask64 __A, __m512i __B,
-                                                     __m512i __C,
-                                                     const int __D) {
-  return (__m512i)__builtin_ia32_vgf2p8affineqb_v64qi_mask(
-      (__v64qi)__B, (__v64qi)__C, __D, (__v64qi)_mm512_setzero_si512(), __A);
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_gf2p8affine_epi64_epi8 (__mmask64 __A, __m512i __B, __m512i __C,
+                                     const int __D)
+{
+  return (__m512i) __builtin_ia32_vgf2p8affineqb_v64qi_mask ((__v64qi) __B,
+                                                             (__v64qi) __C, __D, (__v64qi) _mm512_setzero_si512 (), __A);
 }
-__funline __m512i _mm512_gf2p8affine_epi64_epi8(__m512i __A, __m512i __B,
-                                               const int __C) {
-  return (__m512i)__builtin_ia32_vgf2p8affineqb_v64qi((__v64qi)__A,
-                                                      (__v64qi)__B, __C);
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_gf2p8affine_epi64_epi8 (__m512i __A, __m512i __B, const int __C)
+{
+  return (__m512i) __builtin_ia32_vgf2p8affineqb_v64qi ((__v64qi) __A,
+                                                        (__v64qi) __B, __C);
 }
 #else
-#define _mm512_mask_gf2p8affineinv_epi64_epi8(A, B, C, D, E)   \
-  ((__m512i)__builtin_ia32_vgf2p8affineinvqb_v64qi_mask(       \
-      (__v64qi)(__m512i)(C), (__v64qi)(__m512i)(D), (int)(E),  \
-      (__v64qi)(__m512i)(A), (__mmask64)(B)))
-#define _mm512_maskz_gf2p8affineinv_epi64_epi8(A, B, C, D)     \
-  ((__m512i)__builtin_ia32_vgf2p8affineinvqb_v64qi_mask(       \
-      (__v64qi)(__m512i)(B), (__v64qi)(__m512i)(C), (int)(D),  \
-      (__v64qi)(__m512i)_mm512_setzero_si512(), (__mmask64)(A)))
-#define _mm512_gf2p8affineinv_epi64_epi8(A, B, C)              \
-  ((__m512i)__builtin_ia32_vgf2p8affineinvqb_v64qi(            \
-      (__v64qi)(__m512i)(A), (__v64qi)(__m512i)(B), (int)(C)))
-#define _mm512_mask_gf2p8affine_epi64_epi8(A, B, C, D, E)      \
-  ((__m512i)__builtin_ia32_vgf2p8affineqb_v64qi_mask(          \
-      (__v64qi)(__m512i)(C), (__v64qi)(__m512i)(D), (int)(E),  \
-      (__v64qi)(__m512i)(A), (__mmask64)(B)))
-#define _mm512_maskz_gf2p8affine_epi64_epi8(A, B, C, D)        \
-  ((__m512i)__builtin_ia32_vgf2p8affineqb_v64qi_mask(          \
-      (__v64qi)(__m512i)(B), (__v64qi)(__m512i)(C), (int)(D),  \
-      (__v64qi)(__m512i)_mm512_setzero_si512(), (__mmask64)(A)))
-#define _mm512_gf2p8affine_epi64_epi8(A, B, C)                 \
-  ((__m512i)__builtin_ia32_vgf2p8affineqb_v64qi(               \
-      (__v64qi)(__m512i)(A), (__v64qi)(__m512i)(B), (int)(C)))
+#define _mm512_mask_gf2p8affineinv_epi64_epi8(A, B, C, D, E) ((__m512i) __builtin_ia32_vgf2p8affineinvqb_v64qi_mask( (__v64qi)(__m512i)(C), (__v64qi)(__m512i)(D), (int)(E), (__v64qi)(__m512i)(A), (__mmask64)(B)))
+#define _mm512_maskz_gf2p8affineinv_epi64_epi8(A, B, C, D) ((__m512i) __builtin_ia32_vgf2p8affineinvqb_v64qi_mask( (__v64qi)(__m512i)(B), (__v64qi)(__m512i)(C), (int)(D), (__v64qi)(__m512i) _mm512_setzero_si512 (), (__mmask64)(A)))
+#define _mm512_gf2p8affineinv_epi64_epi8(A, B, C) ((__m512i) __builtin_ia32_vgf2p8affineinvqb_v64qi ( (__v64qi)(__m512i)(A), (__v64qi)(__m512i)(B), (int)(C)))
+#define _mm512_mask_gf2p8affine_epi64_epi8(A, B, C, D, E) ((__m512i) __builtin_ia32_vgf2p8affineqb_v64qi_mask((__v64qi)(__m512i)(C), (__v64qi)(__m512i)(D), (int)(E), (__v64qi)(__m512i)(A), (__mmask64)(B)))
+#define _mm512_maskz_gf2p8affine_epi64_epi8(A, B, C, D) ((__m512i) __builtin_ia32_vgf2p8affineqb_v64qi_mask((__v64qi)(__m512i)(B), (__v64qi)(__m512i)(C), (int)(D), (__v64qi)(__m512i) _mm512_setzero_si512 (), (__mmask64)(A)))
+#define _mm512_gf2p8affine_epi64_epi8(A, B, C) ((__m512i) __builtin_ia32_vgf2p8affineqb_v64qi ((__v64qi)(__m512i)(A), (__v64qi)(__m512i)(B), (int)(C)))
 #endif
-
 #ifdef __DISABLE_GFNIAVX512FBW__
 #undef __DISABLE_GFNIAVX512FBW__
 #pragma GCC pop_options
-#endif /* __GFNIAVX512FBW__ */
-
-#endif /* _GFNIINTRIN_H_INCLUDED */
+#endif
+#endif
+#endif
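The GFNI intrinsics above do byte-wise arithmetic in GF(2^8) modulo the AES polynomial x^8+x^4+x^3+x+1. A hedged sketch of the simplest one, not part of the patch (the name is invented; needs -mgfni -msse2):

  #include <immintrin.h>
  /* r[i] = a[i] * b[i] in GF(2^8), the field behind AES S-box math */
  __m128i gf256_mul16(__m128i a, __m128i b) {
    return _mm_gf2p8mul_epi8(a, b);  /* GF2P8MULB */
  }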
diff --git a/third_party/intel/hresetintrin.internal.h b/third_party/intel/hresetintrin.internal.h
new file mode 100644
index 000000000..9ff5504d4
--- /dev/null
+++ b/third_party/intel/hresetintrin.internal.h
@@ -0,0 +1,24 @@
+/* clang-format off */
+#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0)
+#if !defined _X86GPRINTRIN_H_INCLUDED
+# error "Never use directly; include instead."
+#endif
+#ifndef _HRESETINTRIN_H_INCLUDED
+#define _HRESETINTRIN_H_INCLUDED
+#ifndef __HRESET__
+#pragma GCC push_options
+#pragma GCC target ("hreset")
+#define __DISABLE_HRESET__
+#endif
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_hreset (unsigned int __EAX)
+{
+  __builtin_ia32_hreset (__EAX);
+}
+#ifdef __DISABLE_HRESET__
+#undef __DISABLE_HRESET__
+#pragma GCC pop_options
+#endif
+#endif
+#endif
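_hreset takes an EAX bitmask selecting which processor prediction history to reset; which bits are valid is enumerated by CPUID leaf 0x20. A thin, hedged wrapper sketch (invented name; needs -mhreset on a CPU reporting HRESET):

  #include <x86gprintrin.h>
  void reset_predictor_history(unsigned int mask) {
    _hreset(mask);  /* e.g. bit 0 as reported by CPUID.(EAX=20H):EBX */
  }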
diff --git a/third_party/intel/ia32intrin.internal.h b/third_party/intel/ia32intrin.internal.h
index f3d0193cb..fb167fcf6 100644
--- a/third_party/intel/ia32intrin.internal.h
+++ b/third_party/intel/ia32intrin.internal.h
@@ -1,184 +1,217 @@
-#ifndef _X86INTRIN_H_INCLUDED
-#error "Never use directly; include instead."
+/* clang-format off */
+#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0)
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use directly; include instead."
 #endif
-
-__funline int __bsfd(int __X) {
-  return __builtin_ctz(__X);
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__bsfd (int __X)
+{
+  return __builtin_ctz (__X);
 }
-
-__funline int __bsrd(int __X) {
-  return __builtin_ia32_bsrsi(__X);
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__bsrd (int __X)
+{
+  return __builtin_ia32_bsrsi (__X);
 }
-
-__funline int __bswapd(int __X) {
-  return __builtin_bswap32(__X);
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__bswapd (int __X)
+{
+  return __builtin_bswap32 (__X);
 }
-
 #ifndef __iamcu__
-
 #ifndef __SSE4_2__
 #pragma GCC push_options
 #pragma GCC target("sse4.2")
 #define __DISABLE_SSE4_2__
-#endif /* __SSE4_2__ */
-
-__funline unsigned int __crc32b(unsigned int __C, unsigned char __V) {
-  return __builtin_ia32_crc32qi(__C, __V);
+#endif
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__crc32b (unsigned int __C, unsigned char __V)
+{
+  return __builtin_ia32_crc32qi (__C, __V);
 }
-
-__funline unsigned int __crc32w(unsigned int __C, unsigned short __V) {
-  return __builtin_ia32_crc32hi(__C, __V);
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__crc32w (unsigned int __C, unsigned short __V)
+{
+  return __builtin_ia32_crc32hi (__C, __V);
 }
-
-__funline unsigned int __crc32d(unsigned int __C, unsigned int __V) {
-  return __builtin_ia32_crc32si(__C, __V);
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__crc32d (unsigned int __C, unsigned int __V)
+{
+  return __builtin_ia32_crc32si (__C, __V);
 }
-
 #ifdef __DISABLE_SSE4_2__
 #undef __DISABLE_SSE4_2__
 #pragma GCC pop_options
-#endif /* __DISABLE_SSE4_2__ */
-
-#endif /* __iamcu__ */
-
-__funline int __popcntd(unsigned int __X) {
-  return __builtin_popcount(__X);
+#endif
+#endif
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__popcntd (unsigned int __X)
+{
+  return __builtin_popcount (__X);
 }
-
 #ifndef __iamcu__
-
-__funline unsigned long long __rdpmc(int __S) {
-  return __builtin_ia32_rdpmc(__S);
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__rdpmc (int __S)
+{
+  return __builtin_ia32_rdpmc (__S);
 }
-
-#endif /* __iamcu__ */
-
-__funline unsigned long long __rdtsc(void) {
-  return __builtin_ia32_rdtsc();
-}
-
+#endif
+#define __rdtsc() __builtin_ia32_rdtsc ()
 #ifndef __iamcu__
-
-__funline unsigned long long __rdtscp(unsigned int *__A) {
-  return __builtin_ia32_rdtscp(__A);
+#define __rdtscp(a) __builtin_ia32_rdtscp (a)
+#endif
+extern __inline unsigned char
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__rolb (unsigned char __X, int __C)
+{
+  return __builtin_ia32_rolqi (__X, __C);
 }
-
-#endif /* __iamcu__ */
-
-__funline unsigned char __rolb(unsigned char __X, int __C) {
-  return __builtin_ia32_rolqi(__X, __C);
+extern __inline unsigned short
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__rolw (unsigned short __X, int __C)
+{
+  return __builtin_ia32_rolhi (__X, __C);
 }
-
-__funline unsigned short __rolw(unsigned short __X, int __C) {
-  return __builtin_ia32_rolhi(__X, __C);
-}
-
-__funline unsigned int __rold(unsigned int __X, int __C) {
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__rold (unsigned int __X, int __C)
+{
   __C &= 31;
   return (__X << __C) | (__X >> (-__C & 31));
 }
-
-__funline unsigned char __rorb(unsigned char __X, int __C) {
-  return __builtin_ia32_rorqi(__X, __C);
+extern __inline unsigned char
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__rorb (unsigned char __X, int __C)
+{
+  return __builtin_ia32_rorqi (__X, __C);
 }
-
-__funline unsigned short __rorw(unsigned short __X, int __C) {
-  return __builtin_ia32_rorhi(__X, __C);
+extern __inline unsigned short
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__rorw (unsigned short __X, int __C)
+{
+  return __builtin_ia32_rorhi (__X, __C);
 }
-
-__funline unsigned int __rord(unsigned int __X, int __C) {
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__rord (unsigned int __X, int __C)
+{
   __C &= 31;
   return (__X >> __C) | (__X << (-__C & 31));
 }
-
-__funline void __pause(void) {
-  __builtin_ia32_pause();
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__pause (void)
+{
+  __builtin_ia32_pause ();
 }
-
 #ifdef __x86_64__
-
-__funline int __bsfq(long long __X) {
-  return __builtin_ctzll(__X);
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__bsfq (long long __X)
+{
+  return __builtin_ctzll (__X);
 }
-
-__funline int __bsrq(long long __X) {
-  return __builtin_ia32_bsrdi(__X);
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__bsrq (long long __X)
+{
+  return __builtin_ia32_bsrdi (__X);
 }
-
-__funline long long __bswapq(long long __X) {
-  return __builtin_bswap64(__X);
+extern __inline long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__bswapq (long long __X)
+{
+  return __builtin_bswap64 (__X);
 }
-
 #ifndef __SSE4_2__
 #pragma GCC push_options
 #pragma GCC target("sse4.2")
 #define __DISABLE_SSE4_2__
-#endif /* __SSE4_2__ */
-
-__funline unsigned long long __crc32q(unsigned long long __C,
-                                     unsigned long long __V) {
-  return __builtin_ia32_crc32di(__C, __V);
+#endif
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__crc32q (unsigned long long __C, unsigned long long __V)
+{
+  return __builtin_ia32_crc32di (__C, __V);
 }
-
 #ifdef __DISABLE_SSE4_2__
 #undef __DISABLE_SSE4_2__
 #pragma GCC pop_options
-#endif /* __DISABLE_SSE4_2__ */
-
-__funline long long __popcntq(unsigned long long __X) {
-  return __builtin_popcountll(__X);
+#endif
+extern __inline long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__popcntq (unsigned long long __X)
+{
+  return __builtin_popcountll (__X);
 }
-
-__funline unsigned long long __rolq(unsigned long long __X, int __C) {
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__rolq (unsigned long long __X, int __C)
+{
   __C &= 63;
   return (__X << __C) | (__X >> (-__C & 63));
 }
-
-__funline unsigned long long __rorq(unsigned long long __X, int __C) {
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__rorq (unsigned long long __X, int __C)
+{
   __C &= 63;
   return (__X >> __C) | (__X << (-__C & 63));
 }
-
-__funline unsigned long long __readeflags(void) {
-  return __builtin_ia32_readeflags_u64();
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__readeflags (void)
+{
+  return __builtin_ia32_readeflags_u64 ();
}
-
-__funline void __writeeflags(unsigned long long __X) {
-  __builtin_ia32_writeeflags_u64(__X);
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__writeeflags (unsigned long long __X)
+{
+  __builtin_ia32_writeeflags_u64 (__X);
 }
-
-#define _bswap64(a)  __bswapq(a)
+#define _bswap64(a) __bswapq(a)
 #define _popcnt64(a) __popcntq(a)
 #else
-
-__funline unsigned int __readeflags(void) {
-  return __builtin_ia32_readeflags_u32();
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__readeflags (void)
+{
+  return __builtin_ia32_readeflags_u32 ();
 }
-
-__funline void __writeeflags(unsigned int __X) {
-  __builtin_ia32_writeeflags_u32(__X);
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__writeeflags (unsigned int __X)
+{
+  __builtin_ia32_writeeflags_u32 (__X);
 }
-
 #endif
-
 #ifdef __LP64__
-#define _lrotl(a, b) __rolq((a), (b))
-#define _lrotr(a, b) __rorq((a), (b))
+#define _lrotl(a,b) __rolq((a), (b))
+#define _lrotr(a,b) __rorq((a), (b))
 #else
-#define _lrotl(a, b) __rold((a), (b))
-#define _lrotr(a, b) __rord((a), (b))
+#define _lrotl(a,b) __rold((a), (b))
+#define _lrotr(a,b) __rord((a), (b))
 #endif
-
 #define _bit_scan_forward(a) __bsfd(a)
 #define _bit_scan_reverse(a) __bsrd(a)
-#define _bswap(a)    __bswapd(a)
-#define _popcnt32(a) __popcntd(a)
+#define _bswap(a) __bswapd(a)
+#define _popcnt32(a) __popcntd(a)
 #ifndef __iamcu__
-#define _rdpmc(a)  __rdpmc(a)
+#define _rdpmc(a) __rdpmc(a)
 #define _rdtscp(a) __rdtscp(a)
-#endif /* __iamcu__ */
-#define _rdtsc() __rdtsc()
-#define _rotwl(a, b) __rolw((a), (b))
-#define _rotwr(a, b) __rorw((a), (b))
-#define _rotl(a, b) __rold((a), (b))
-#define _rotr(a, b) __rord((a), (b))
+#endif
+#define _rdtsc() __rdtsc()
+#define _rotwl(a,b) __rolw((a), (b))
+#define _rotwr(a,b) __rorw((a), (b))
+#define _rotl(a,b) __rold((a), (b))
+#define _rotr(a,b) __rord((a), (b))
+#endif
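Worth noting: the __crc32{b,w,d,q} wrappers above accumulate CRC-32C (the Castagnoli polynomial used by iSCSI and ext4), not zlib's CRC-32. A hedged sketch, not part of the patch (the function name and the seed/finalize convention shown are conventional choices, not mandated by the instruction; needs -msse4.2):

  #include <x86gprintrin.h>
  unsigned crc32c_buf(const unsigned char *p, unsigned long n) {
    unsigned c = ~0u;                        /* customary initial value */
    for (unsigned long i = 0; i < n; i++)
      c = __crc32b(c, p[i]);                 /* CRC32 instruction, byte at a time */
    return ~c;                               /* customary final inversion */
  }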
_IMMINTRIN_H_INCLUDED -#ifdef __x86_64__ - -/* clang-format off */ +#include "third_party/intel/x86gprintrin.internal.h" #include "third_party/intel/mmintrin.internal.h" #include "third_party/intel/xmmintrin.internal.h" #include "third_party/intel/emmintrin.internal.h" @@ -10,12 +10,8 @@ #include "third_party/intel/tmmintrin.internal.h" #include "third_party/intel/smmintrin.internal.h" #include "third_party/intel/wmmintrin.internal.h" -#include "third_party/intel/fxsrintrin.internal.h" -#include "third_party/intel/xsaveintrin.internal.h" -#include "third_party/intel/xsaveoptintrin.internal.h" -#include "third_party/intel/xsavesintrin.internal.h" -#include "third_party/intel/xsavecintrin.internal.h" #include "third_party/intel/avxintrin.internal.h" +#include "third_party/intel/avxvnniintrin.internal.h" #include "third_party/intel/avx2intrin.internal.h" #include "third_party/intel/avx512fintrin.internal.h" #include "third_party/intel/avx512erintrin.internal.h" @@ -39,143 +35,21 @@ #include "third_party/intel/avx512vnnivlintrin.internal.h" #include "third_party/intel/avx512vpopcntdqvlintrin.internal.h" #include "third_party/intel/avx512bitalgintrin.internal.h" +#include "third_party/intel/avx512vp2intersectintrin.internal.h" +#include "third_party/intel/avx512vp2intersectvlintrin.internal.h" #include "third_party/intel/shaintrin.internal.h" -#include "third_party/intel/lzcntintrin.internal.h" -#include "third_party/intel/bmiintrin.internal.h" -#include "third_party/intel/bmi2intrin.internal.h" #include "third_party/intel/fmaintrin.internal.h" #include "third_party/intel/f16cintrin.internal.h" #include "third_party/intel/rtmintrin.internal.h" -#include "third_party/intel/xtestintrin.internal.h" -#include "third_party/intel/cetintrin.internal.h" #include "third_party/intel/gfniintrin.internal.h" #include "third_party/intel/vaesintrin.internal.h" #include "third_party/intel/vpclmulqdqintrin.internal.h" -#include "third_party/intel/movdirintrin.internal.h" -#include "third_party/intel/sgxintrin.internal.h" -#include "third_party/intel/pconfigintrin.internal.h" -#include "third_party/intel/waitpkgintrin.internal.h" -#include "third_party/intel/cldemoteintrin.internal.h" -#include "third_party/intel/rdseedintrin.internal.h" +#include "third_party/intel/avx512bf16vlintrin.internal.h" +#include "third_party/intel/avx512bf16intrin.internal.h" +#include "third_party/intel/amxtileintrin.internal.h" +#include "third_party/intel/amxint8intrin.internal.h" +#include "third_party/intel/amxbf16intrin.internal.h" #include "third_party/intel/prfchwintrin.internal.h" -#include "third_party/intel/adxintrin.internal.h" -#include "third_party/intel/clwbintrin.internal.h" -#include "third_party/intel/clflushoptintrin.internal.h" -#include "third_party/intel/wbnoinvdintrin.internal.h" -#include "third_party/intel/pkuintrin.internal.h" -/* clang-format on */ - -__funline void _wbinvd(void) { - __builtin_ia32_wbinvd(); -} - -#ifndef __RDRND__ -#pragma GCC push_options -#pragma GCC target("rdrnd") -#define __DISABLE_RDRND__ -#endif /* __RDRND__ */ -__funline int _rdrand16_step(unsigned short *__P) { - return __builtin_ia32_rdrand16_step(__P); -} - -__funline int _rdrand32_step(unsigned int *__P) { - return __builtin_ia32_rdrand32_step(__P); -} -#ifdef __DISABLE_RDRND__ -#undef __DISABLE_RDRND__ -#pragma GCC pop_options -#endif /* __DISABLE_RDRND__ */ - -#ifndef __RDPID__ -#pragma GCC push_options -#pragma GCC target("rdpid") -#define __DISABLE_RDPID__ -#endif /* __RDPID__ */ -__funline unsigned int _rdpid_u32(void) { - 
return __builtin_ia32_rdpid(); -} -#ifdef __DISABLE_RDPID__ -#undef __DISABLE_RDPID__ -#pragma GCC pop_options -#endif /* __DISABLE_RDPID__ */ - -#ifdef __x86_64__ - -#ifndef __FSGSBASE__ -#pragma GCC push_options -#pragma GCC target("fsgsbase") -#define __DISABLE_FSGSBASE__ -#endif /* __FSGSBASE__ */ -__funline unsigned int _readfsbase_u32(void) { - return __builtin_ia32_rdfsbase32(); -} - -__funline unsigned long long _readfsbase_u64(void) { - return __builtin_ia32_rdfsbase64(); -} - -__funline unsigned int _readgsbase_u32(void) { - return __builtin_ia32_rdgsbase32(); -} - -__funline unsigned long long _readgsbase_u64(void) { - return __builtin_ia32_rdgsbase64(); -} - -__funline void _writefsbase_u32(unsigned int __B) { - __builtin_ia32_wrfsbase32(__B); -} - -__funline void _writefsbase_u64(unsigned long long __B) { - __builtin_ia32_wrfsbase64(__B); -} - -__funline void _writegsbase_u32(unsigned int __B) { - __builtin_ia32_wrgsbase32(__B); -} - -__funline void _writegsbase_u64(unsigned long long __B) { - __builtin_ia32_wrgsbase64(__B); -} -#ifdef __DISABLE_FSGSBASE__ -#undef __DISABLE_FSGSBASE__ -#pragma GCC pop_options -#endif /* __DISABLE_FSGSBASE__ */ - -#ifndef __RDRND__ -#pragma GCC push_options -#pragma GCC target("rdrnd") -#define __DISABLE_RDRND__ -#endif /* __RDRND__ */ -__funline int _rdrand64_step(unsigned long long *__P) { - return __builtin_ia32_rdrand64_step(__P); -} -#ifdef __DISABLE_RDRND__ -#undef __DISABLE_RDRND__ -#pragma GCC pop_options -#endif /* __DISABLE_RDRND__ */ - -#endif /* __x86_64__ */ - -#ifndef __PTWRITE__ -#pragma GCC push_options -#pragma GCC target("ptwrite") -#define __DISABLE_PTWRITE__ +#include "third_party/intel/keylockerintrin.internal.h" +#endif #endif - -#ifdef __x86_64__ -__funline void _ptwrite64(unsigned long long __B) { - __builtin_ia32_ptwrite64(__B); -} -#endif /* __x86_64__ */ - -__funline void _ptwrite32(unsigned __B) { - __builtin_ia32_ptwrite32(__B); -} -#ifdef __DISABLE_PTWRITE__ -#undef __DISABLE_PTWRITE__ -#pragma GCC pop_options -#endif /* __DISABLE_PTWRITE__ */ - -#endif /* __x86_64__ */ -#endif /* _IMMINTRIN_H_INCLUDED */ diff --git a/third_party/intel/keylockerintrin.internal.h b/third_party/intel/keylockerintrin.internal.h new file mode 100644 index 000000000..e071080f2 --- /dev/null +++ b/third_party/intel/keylockerintrin.internal.h @@ -0,0 +1,93 @@ +/* clang-format off */ +#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0) +#if !defined _IMMINTRIN_H_INCLUDED +# error "Never use <keylockerintrin.h> directly; include <immintrin.h> instead."
+#endif +#ifndef _KEYLOCKERINTRIN_H_INCLUDED +#define _KEYLOCKERINTRIN_H_INCLUDED +#ifndef __KL__ +#pragma GCC push_options +#pragma GCC target("kl") +#define __DISABLE_KL__ +#endif +extern __inline +void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadiwkey (unsigned int __I, __m128i __A, __m128i __B, __m128i __C) +{ + __builtin_ia32_loadiwkey ((__v2di) __B, (__v2di) __C, (__v2di) __A, __I); +} +extern __inline +unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_encodekey128_u32 (unsigned int __I, __m128i __A, void * __P) +{ + return __builtin_ia32_encodekey128_u32 (__I, (__v2di)__A, __P); +} +extern __inline +unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_encodekey256_u32 (unsigned int __I, __m128i __A, __m128i __B, void * __P) +{ + return __builtin_ia32_encodekey256_u32 (__I, (__v2di)__A, (__v2di)__B, __P); +} +extern __inline +unsigned char __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_aesdec128kl_u8 (__m128i * __A, __m128i __B, const void * __P) +{ + return __builtin_ia32_aesdec128kl_u8 ((__v2di *) __A, (__v2di) __B, __P); +} +extern __inline +unsigned char __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_aesdec256kl_u8 (__m128i * __A, __m128i __B, const void * __P) +{ + return __builtin_ia32_aesdec256kl_u8 ((__v2di *) __A, (__v2di) __B, __P); +} +extern __inline +unsigned char __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_aesenc128kl_u8 (__m128i * __A, __m128i __B, const void * __P) +{ + return __builtin_ia32_aesenc128kl_u8 ((__v2di *) __A, (__v2di) __B, __P); +} +extern __inline +unsigned char __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_aesenc256kl_u8 (__m128i * __A, __m128i __B, const void * __P) +{ + return __builtin_ia32_aesenc256kl_u8 ((__v2di *) __A, (__v2di) __B, __P); +} +#ifdef __DISABLE_KL__ +#undef __DISABLE_KL__ +#pragma GCC pop_options +#endif +#ifndef __WIDEKL__ +#pragma GCC push_options +#pragma GCC target("widekl") +#define __DISABLE_WIDEKL__ +#endif +extern __inline +unsigned char __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_aesdecwide128kl_u8(__m128i __A[8], const __m128i __B[8], const void * __P) +{ + return __builtin_ia32_aesdecwide128kl_u8 ((__v2di *) __A, (__v2di *) __B, __P); +} +extern __inline +unsigned char __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_aesdecwide256kl_u8(__m128i __A[8], const __m128i __B[8], const void * __P) +{ + return __builtin_ia32_aesdecwide256kl_u8 ((__v2di *) __A, (__v2di *) __B, __P); +} +extern __inline +unsigned char __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_aesencwide128kl_u8(__m128i __A[8], const __m128i __B[8], const void * __P) +{ + return __builtin_ia32_aesencwide128kl_u8 ((__v2di *) __A, (__v2di *) __B, __P); +} +extern __inline +unsigned char __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_aesencwide256kl_u8(__m128i __A[8], const __m128i __B[8], const void * __P) +{ + return __builtin_ia32_aesencwide256kl_u8 ((__v2di *) __A, (__v2di *) __B, __P); +} +#ifdef __DISABLE_WIDEKL__ +#undef __DISABLE_WIDEKL__ +#pragma GCC pop_options +#endif +#endif +#endif diff --git a/third_party/intel/lwpintrin.internal.h b/third_party/intel/lwpintrin.internal.h index af776aebf..3a076903e 100644 --- a/third_party/intel/lwpintrin.internal.h +++ b/third_party/intel/lwpintrin.internal.h @@ -1,73 +1,68 @@ -#ifndef _X86INTRIN_H_INCLUDED -#error 
"Never use directly; include instead." +/* clang-format off */ +#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0) +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use directly; include instead." #endif - #ifndef _LWPINTRIN_H_INCLUDED #define _LWPINTRIN_H_INCLUDED - #ifndef __LWP__ #pragma GCC push_options #pragma GCC target("lwp") #define __DISABLE_LWP__ -#endif /* __LWP__ */ - -__funline void __llwpcb(void *__pcbAddress) { - __builtin_ia32_llwpcb(__pcbAddress); +#endif +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__llwpcb (void *__pcbAddress) +{ + __builtin_ia32_llwpcb (__pcbAddress); } - -__funline void *__slwpcb(void) { - return __builtin_ia32_slwpcb(); +extern __inline void * __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__slwpcb (void) +{ + return __builtin_ia32_slwpcb (); } - #ifdef __OPTIMIZE__ -__funline void __lwpval32(unsigned int __data2, unsigned int __data1, - unsigned int __flags) { - __builtin_ia32_lwpval32(__data2, __data1, __flags); +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__lwpval32 (unsigned int __data2, unsigned int __data1, unsigned int __flags) +{ + __builtin_ia32_lwpval32 (__data2, __data1, __flags); } - #ifdef __x86_64__ -__funline void __lwpval64(unsigned long long __data2, unsigned int __data1, - unsigned int __flags) { - __builtin_ia32_lwpval64(__data2, __data1, __flags); +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__lwpval64 (unsigned long long __data2, unsigned int __data1, + unsigned int __flags) +{ + __builtin_ia32_lwpval64 (__data2, __data1, __flags); } #endif #else -#define __lwpval32(D2, D1, F) \ - (__builtin_ia32_lwpval32((unsigned int)(D2), (unsigned int)(D1), \ - (unsigned int)(F))) +#define __lwpval32(D2, D1, F) (__builtin_ia32_lwpval32 ((unsigned int) (D2), (unsigned int) (D1), (unsigned int) (F))) #ifdef __x86_64__ -#define __lwpval64(D2, D1, F) \ - (__builtin_ia32_lwpval64((unsigned long long)(D2), (unsigned int)(D1), \ - (unsigned int)(F))) +#define __lwpval64(D2, D1, F) (__builtin_ia32_lwpval64 ((unsigned long long) (D2), (unsigned int) (D1), (unsigned int) (F))) #endif #endif - #ifdef __OPTIMIZE__ -__funline unsigned char __lwpins32(unsigned int __data2, unsigned int __data1, - unsigned int __flags) { - return __builtin_ia32_lwpins32(__data2, __data1, __flags); +extern __inline unsigned char __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__lwpins32 (unsigned int __data2, unsigned int __data1, unsigned int __flags) +{ + return __builtin_ia32_lwpins32 (__data2, __data1, __flags); } - #ifdef __x86_64__ -__funline unsigned char __lwpins64(unsigned long long __data2, - unsigned int __data1, unsigned int __flags) { - return __builtin_ia32_lwpins64(__data2, __data1, __flags); +extern __inline unsigned char __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__lwpins64 (unsigned long long __data2, unsigned int __data1, + unsigned int __flags) +{ + return __builtin_ia32_lwpins64 (__data2, __data1, __flags); } #endif #else -#define __lwpins32(D2, D1, F) \ - (__builtin_ia32_lwpins32((unsigned int)(D2), (unsigned int)(D1), \ - (unsigned int)(F))) +#define __lwpins32(D2, D1, F) (__builtin_ia32_lwpins32 ((unsigned int) (D2), (unsigned int) (D1), (unsigned int) (F))) #ifdef __x86_64__ -#define __lwpins64(D2, D1, F) \ - (__builtin_ia32_lwpins64((unsigned long long)(D2), (unsigned int)(D1), \ - (unsigned int)(F))) +#define __lwpins64(D2, D1, F) 
(__builtin_ia32_lwpins64 ((unsigned long long) (D2), (unsigned int) (D1), (unsigned int) (F))) #endif #endif - #ifdef __DISABLE_LWP__ #undef __DISABLE_LWP__ #pragma GCC pop_options -#endif /* __DISABLE_LWP__ */ - -#endif /* _LWPINTRIN_H_INCLUDED */ +#endif +#endif +#endif diff --git a/third_party/intel/lzcntintrin.internal.h b/third_party/intel/lzcntintrin.internal.h index e4a97090f..b72e35777 100644 --- a/third_party/intel/lzcntintrin.internal.h +++ b/third_party/intel/lzcntintrin.internal.h @@ -1,41 +1,45 @@ -#if !defined _X86INTRIN_H_INCLUDED && !defined _IMMINTRIN_H_INCLUDED -#error "Never use <lzcntintrin.h> directly; include <x86intrin.h> instead." +/* clang-format off */ +#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0) +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use <lzcntintrin.h> directly; include <x86gprintrin.h> instead." #endif - #ifndef _LZCNTINTRIN_H_INCLUDED #define _LZCNTINTRIN_H_INCLUDED - #ifndef __LZCNT__ #pragma GCC push_options #pragma GCC target("lzcnt") #define __DISABLE_LZCNT__ -#endif /* __LZCNT__ */ - -__funline unsigned short __lzcnt16(unsigned short __X) { - return __builtin_ia32_lzcnt_u16(__X); +#endif +extern __inline unsigned short __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__lzcnt16 (unsigned short __X) +{ + return __builtin_ia32_lzcnt_u16 (__X); } - -__funline unsigned int __lzcnt32(unsigned int __X) { - return __builtin_ia32_lzcnt_u32(__X); +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__lzcnt32 (unsigned int __X) +{ + return __builtin_ia32_lzcnt_u32 (__X); } - -__funline unsigned int _lzcnt_u32(unsigned int __X) { - return __builtin_ia32_lzcnt_u32(__X); +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_lzcnt_u32 (unsigned int __X) +{ + return __builtin_ia32_lzcnt_u32 (__X); } - #ifdef __x86_64__ -__funline unsigned long long __lzcnt64(unsigned long long __X) { - return __builtin_ia32_lzcnt_u64(__X); +extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__lzcnt64 (unsigned long long __X) +{ + return __builtin_ia32_lzcnt_u64 (__X); } - -__funline unsigned long long _lzcnt_u64(unsigned long long __X) { - return __builtin_ia32_lzcnt_u64(__X); +extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_lzcnt_u64 (unsigned long long __X) +{ + return __builtin_ia32_lzcnt_u64 (__X); } #endif - #ifdef __DISABLE_LZCNT__ #undef __DISABLE_LZCNT__ #pragma GCC pop_options -#endif /* __DISABLE_LZCNT__ */ - -#endif /* _LZCNTINTRIN_H_INCLUDED */ +#endif +#endif +#endif diff --git a/third_party/intel/mm3dnow.internal.h b/third_party/intel/mm3dnow.internal.h index 6d278373e..9e43e1086 100644 --- a/third_party/intel/mm3dnow.internal.h +++ b/third_party/intel/mm3dnow.internal.h @@ -1,9 +1,9 @@ +/* clang-format off */ +#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0) #ifndef _MM3DNOW_H_INCLUDED #define _MM3DNOW_H_INCLUDED -#ifdef __x86_64__ #include "third_party/intel/mmintrin.internal.h" #include "third_party/intel/prfchwintrin.internal.h" - #if defined __x86_64__ && !defined __SSE__ || !defined __3dNOW__ #pragma GCC push_options #ifdef __x86_64__ @@ -12,110 +12,128 @@ #pragma GCC target("3dnow") #endif #define __DISABLE_3dNOW__ -#endif /* __3dNOW__ */ - -__funline void _m_femms(void) { +#endif +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_femms (void) +{ __builtin_ia32_femms(); } - -__funline __m64 _m_pavgusb(__m64 __A, __m64 __B) { - return 
(__m64)__builtin_ia32_pavgusb((__v8qi)__A, (__v8qi)__B); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pavgusb (__m64 __A, __m64 __B) +{ + return (__m64)__builtin_ia32_pavgusb ((__v8qi)__A, (__v8qi)__B); } - -__funline __m64 _m_pf2id(__m64 __A) { - return (__m64)__builtin_ia32_pf2id((__v2sf)__A); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pf2id (__m64 __A) +{ + return (__m64)__builtin_ia32_pf2id ((__v2sf)__A); } - -__funline __m64 _m_pfacc(__m64 __A, __m64 __B) { - return (__m64)__builtin_ia32_pfacc((__v2sf)__A, (__v2sf)__B); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pfacc (__m64 __A, __m64 __B) +{ + return (__m64)__builtin_ia32_pfacc ((__v2sf)__A, (__v2sf)__B); } - -__funline __m64 _m_pfadd(__m64 __A, __m64 __B) { - return (__m64)__builtin_ia32_pfadd((__v2sf)__A, (__v2sf)__B); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pfadd (__m64 __A, __m64 __B) +{ + return (__m64)__builtin_ia32_pfadd ((__v2sf)__A, (__v2sf)__B); } - -__funline __m64 _m_pfcmpeq(__m64 __A, __m64 __B) { - return (__m64)__builtin_ia32_pfcmpeq((__v2sf)__A, (__v2sf)__B); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pfcmpeq (__m64 __A, __m64 __B) +{ + return (__m64)__builtin_ia32_pfcmpeq ((__v2sf)__A, (__v2sf)__B); } - -__funline __m64 _m_pfcmpge(__m64 __A, __m64 __B) { - return (__m64)__builtin_ia32_pfcmpge((__v2sf)__A, (__v2sf)__B); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pfcmpge (__m64 __A, __m64 __B) +{ + return (__m64)__builtin_ia32_pfcmpge ((__v2sf)__A, (__v2sf)__B); } - -__funline __m64 _m_pfcmpgt(__m64 __A, __m64 __B) { - return (__m64)__builtin_ia32_pfcmpgt((__v2sf)__A, (__v2sf)__B); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pfcmpgt (__m64 __A, __m64 __B) +{ + return (__m64)__builtin_ia32_pfcmpgt ((__v2sf)__A, (__v2sf)__B); } - -__funline __m64 _m_pfmax(__m64 __A, __m64 __B) { - return (__m64)__builtin_ia32_pfmax((__v2sf)__A, (__v2sf)__B); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pfmax (__m64 __A, __m64 __B) +{ + return (__m64)__builtin_ia32_pfmax ((__v2sf)__A, (__v2sf)__B); } - -__funline __m64 _m_pfmin(__m64 __A, __m64 __B) { - return (__m64)__builtin_ia32_pfmin((__v2sf)__A, (__v2sf)__B); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pfmin (__m64 __A, __m64 __B) +{ + return (__m64)__builtin_ia32_pfmin ((__v2sf)__A, (__v2sf)__B); } - -__funline __m64 _m_pfmul(__m64 __A, __m64 __B) { - return (__m64)__builtin_ia32_pfmul((__v2sf)__A, (__v2sf)__B); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pfmul (__m64 __A, __m64 __B) +{ + return (__m64)__builtin_ia32_pfmul ((__v2sf)__A, (__v2sf)__B); } - -__funline __m64 _m_pfrcp(__m64 __A) { - return (__m64)__builtin_ia32_pfrcp((__v2sf)__A); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pfrcp (__m64 __A) +{ + return (__m64)__builtin_ia32_pfrcp ((__v2sf)__A); } - -__funline __m64 _m_pfrcpit1(__m64 __A, __m64 __B) { - return (__m64)__builtin_ia32_pfrcpit1((__v2sf)__A, (__v2sf)__B); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pfrcpit1 (__m64 __A, __m64 __B) +{ + return 
(__m64)__builtin_ia32_pfrcpit1 ((__v2sf)__A, (__v2sf)__B); } - -__funline __m64 _m_pfrcpit2(__m64 __A, __m64 __B) { - return (__m64)__builtin_ia32_pfrcpit2((__v2sf)__A, (__v2sf)__B); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pfrcpit2 (__m64 __A, __m64 __B) +{ + return (__m64)__builtin_ia32_pfrcpit2 ((__v2sf)__A, (__v2sf)__B); } - -__funline __m64 _m_pfrsqrt(__m64 __A) { - return (__m64)__builtin_ia32_pfrsqrt((__v2sf)__A); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pfrsqrt (__m64 __A) +{ + return (__m64)__builtin_ia32_pfrsqrt ((__v2sf)__A); } - -__funline __m64 _m_pfrsqit1(__m64 __A, __m64 __B) { - return (__m64)__builtin_ia32_pfrsqit1((__v2sf)__A, (__v2sf)__B); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pfrsqit1 (__m64 __A, __m64 __B) +{ + return (__m64)__builtin_ia32_pfrsqit1 ((__v2sf)__A, (__v2sf)__B); } - -__funline __m64 _m_pfsub(__m64 __A, __m64 __B) { - return (__m64)__builtin_ia32_pfsub((__v2sf)__A, (__v2sf)__B); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pfsub (__m64 __A, __m64 __B) +{ + return (__m64)__builtin_ia32_pfsub ((__v2sf)__A, (__v2sf)__B); } - -__funline __m64 _m_pfsubr(__m64 __A, __m64 __B) { - return (__m64)__builtin_ia32_pfsubr((__v2sf)__A, (__v2sf)__B); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pfsubr (__m64 __A, __m64 __B) +{ + return (__m64)__builtin_ia32_pfsubr ((__v2sf)__A, (__v2sf)__B); } - -__funline __m64 _m_pi2fd(__m64 __A) { - return (__m64)__builtin_ia32_pi2fd((__v2si)__A); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pi2fd (__m64 __A) +{ + return (__m64)__builtin_ia32_pi2fd ((__v2si)__A); } - -__funline __m64 _m_pmulhrw(__m64 __A, __m64 __B) { - return (__m64)__builtin_ia32_pmulhrw((__v4hi)__A, (__v4hi)__B); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pmulhrw (__m64 __A, __m64 __B) +{ + return (__m64)__builtin_ia32_pmulhrw ((__v4hi)__A, (__v4hi)__B); } - -__funline void _m_prefetch(void *__P) { - __builtin_prefetch(__P, 0, 3 /* _MM_HINT_T0 */); +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_prefetch (void *__P) +{ + __builtin_prefetch (__P, 0, 3 ); } - -__funline __m64 _m_from_float(float __A) { - return __extension__(__m64)(__v2sf){__A, 0.0f}; +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_from_float (float __A) +{ + return __extension__ (__m64)(__v2sf){ __A, 0.0f }; } - -__funline float _m_to_float(__m64 __A) { - union { - __v2sf v; - float a[2]; - } __tmp; +extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_to_float (__m64 __A) +{ + union { __v2sf v; float a[2]; } __tmp; __tmp.v = (__v2sf)__A; return __tmp.a[0]; } - #ifdef __DISABLE_3dNOW__ #undef __DISABLE_3dNOW__ #pragma GCC pop_options -#endif /* __DISABLE_3dNOW__ */ - +#endif #if defined __x86_64__ && !defined __SSE__ || !defined __3dNOW_A__ #pragma GCC push_options #ifdef __x86_64__ @@ -124,32 +142,35 @@ __funline float _m_to_float(__m64 __A) { #pragma GCC target("3dnowa") #endif #define __DISABLE_3dNOW_A__ -#endif /* __3dNOW_A__ */ - -__funline __m64 _m_pf2iw(__m64 __A) { - return (__m64)__builtin_ia32_pf2iw((__v2sf)__A); +#endif +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, 
__artificial__)) +_m_pf2iw (__m64 __A) +{ + return (__m64)__builtin_ia32_pf2iw ((__v2sf)__A); } - -__funline __m64 _m_pfnacc(__m64 __A, __m64 __B) { - return (__m64)__builtin_ia32_pfnacc((__v2sf)__A, (__v2sf)__B); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pfnacc (__m64 __A, __m64 __B) +{ + return (__m64)__builtin_ia32_pfnacc ((__v2sf)__A, (__v2sf)__B); } - -__funline __m64 _m_pfpnacc(__m64 __A, __m64 __B) { - return (__m64)__builtin_ia32_pfpnacc((__v2sf)__A, (__v2sf)__B); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pfpnacc (__m64 __A, __m64 __B) +{ + return (__m64)__builtin_ia32_pfpnacc ((__v2sf)__A, (__v2sf)__B); } - -__funline __m64 _m_pi2fw(__m64 __A) { - return (__m64)__builtin_ia32_pi2fw((__v2si)__A); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pi2fw (__m64 __A) +{ + return (__m64)__builtin_ia32_pi2fw ((__v2si)__A); } - -__funline __m64 _m_pswapd(__m64 __A) { - return (__m64)__builtin_ia32_pswapdsf((__v2sf)__A); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pswapd (__m64 __A) +{ + return (__m64)__builtin_ia32_pswapdsf ((__v2sf)__A); } - #ifdef __DISABLE_3dNOW_A__ #undef __DISABLE_3dNOW_A__ #pragma GCC pop_options -#endif /* __DISABLE_3dNOW_A__ */ - -#endif /* __x86_64__ */ -#endif /* _MM3DNOW_H_INCLUDED */ +#endif +#endif +#endif diff --git a/third_party/intel/mm_malloc.internal.h b/third_party/intel/mm_malloc.internal.h index 9c3543077..127c46315 100644 --- a/third_party/intel/mm_malloc.internal.h +++ b/third_party/intel/mm_malloc.internal.h @@ -1,15 +1,14 @@ +/* clang-format off */ +#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0) #ifndef _MM_MALLOC_H_INCLUDED #define _MM_MALLOC_H_INCLUDED -#ifdef __x86_64__ #include "libc/mem/mem.h" - #ifndef __cplusplus extern int _mm_posix_memalign(void **, size_t, size_t) #else extern "C" int _mm_posix_memalign(void **, size_t, size_t) throw() #endif __asm__("posix_memalign"); - static __inline void *_mm_malloc(size_t __size, size_t __alignment) { void *__ptr; if (__alignment == 1) return malloc(__size); @@ -20,10 +19,8 @@ static __inline void *_mm_malloc(size_t __size, size_t __alignment) { else return NULL; } - static __inline void _mm_free(void *__ptr) { free(__ptr); } - -#endif /* __x86_64__ */ -#endif /* _MM_MALLOC_H_INCLUDED */ +#endif +#endif diff --git a/third_party/intel/mmintrin.internal.h b/third_party/intel/mmintrin.internal.h index 58f114641..2ec674bbb 100644 --- a/third_party/intel/mmintrin.internal.h +++ b/third_party/intel/mmintrin.internal.h @@ -1,576 +1,710 @@ +/* clang-format off */ +#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0) #ifndef _MMINTRIN_H_INCLUDED #define _MMINTRIN_H_INCLUDED -#ifdef __x86_64__ - #if defined __x86_64__ && !defined __SSE__ || !defined __MMX__ #pragma GCC push_options -#ifdef __x86_64__ +#ifdef __MMX_WITH_SSE__ +#pragma GCC target("sse2") +#elif defined __x86_64__ #pragma GCC target("sse,mmx") #else #pragma GCC target("mmx") #endif #define __DISABLE_MMX__ -#endif /* __MMX__ */ - -typedef int __m64 __attribute__((__vector_size__(8), __may_alias__)); - -typedef int __m64_u - __attribute__((__vector_size__(8), __may_alias__, __aligned__(1))); - -typedef int __v2si __attribute__((__vector_size__(8))); -typedef short __v4hi __attribute__((__vector_size__(8))); -typedef char __v8qi __attribute__((__vector_size__(8))); -typedef long long __v1di __attribute__((__vector_size__(8))); 
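/* [editor's illustration -- not part of the patch] The _mm_malloc/_mm_free
   wrappers in the mm_malloc.internal.h hunk above simply route through
   posix_memalign (aliased as _mm_posix_memalign), bumping tiny alignments up
   to sizeof(void *); posix_memalign itself wants a power-of-two multiple of
   sizeof(void *), and _mm_malloc returns NULL when it fails. A minimal usage
   sketch under those assumptions; the function names here are made up: */
#ifdef MM_MALLOC_EXAMPLE /* hypothetical guard; define it to compile the sketch */
#include "third_party/intel/mm_malloc.internal.h"
static float *alloc_simd_buffer(size_t n)
{
  /* 16-byte alignment suits __m128 loads such as _mm_load_ps */
  return (float *)_mm_malloc(n * sizeof(float), 16);
}
static void free_simd_buffer(float *p)
{
  _mm_free(p); /* plain free(), since the memory came from the heap */
}
#endif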
-typedef float __v2sf __attribute__((__vector_size__(8))); - -__funline void _mm_empty(void) { - __builtin_ia32_emms(); +#endif +typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__)); +typedef int __m32 __attribute__ ((__vector_size__ (4), __may_alias__)); +typedef short __m16 __attribute__ ((__vector_size__ (2), __may_alias__)); +typedef int __m64_u __attribute__ ((__vector_size__ (8), __may_alias__, __aligned__ (1))); +typedef int __m32_u __attribute__ ((__vector_size__ (4), __may_alias__, __aligned__ (1))); +typedef short __m16_u __attribute__ ((__vector_size__ (2), __may_alias__, __aligned__ (1))); +typedef int __v2si __attribute__ ((__vector_size__ (8))); +typedef short __v4hi __attribute__ ((__vector_size__ (8))); +typedef char __v8qi __attribute__ ((__vector_size__ (8))); +typedef long long __v1di __attribute__ ((__vector_size__ (8))); +typedef float __v2sf __attribute__ ((__vector_size__ (8))); +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_empty (void) +{ + __builtin_ia32_emms (); } - -__funline void _m_empty(void) { - _mm_empty(); +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_empty (void) +{ + _mm_empty (); } - -__funline __m64 _mm_cvtsi32_si64(int __i) { - return (__m64)__builtin_ia32_vec_init_v2si(__i, 0); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi32_si64 (int __i) +{ + return (__m64) __builtin_ia32_vec_init_v2si (__i, 0); } - -__funline __m64 _m_from_int(int __i) { - return _mm_cvtsi32_si64(__i); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_from_int (int __i) +{ + return _mm_cvtsi32_si64 (__i); } - #ifdef __x86_64__ - -__funline __m64 _m_from_int64(long long __i) { - return (__m64)__i; +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_from_int64 (long long __i) +{ + return (__m64) __i; } - -__funline __m64 _mm_cvtsi64_m64(long long __i) { - return (__m64)__i; +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi64_m64 (long long __i) +{ + return (__m64) __i; } - -__funline __m64 _mm_cvtsi64x_si64(long long __i) { - return (__m64)__i; +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi64x_si64 (long long __i) +{ + return (__m64) __i; } - -__funline __m64 _mm_set_pi64x(long long __i) { - return (__m64)__i; +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_pi64x (long long __i) +{ + return (__m64) __i; } #endif - -__funline int _mm_cvtsi64_si32(__m64 __i) { - return __builtin_ia32_vec_ext_v2si((__v2si)__i, 0); +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi64_si32 (__m64 __i) +{ + return __builtin_ia32_vec_ext_v2si ((__v2si)__i, 0); } - -__funline int _m_to_int(__m64 __i) { - return _mm_cvtsi64_si32(__i); +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_to_int (__m64 __i) +{ + return _mm_cvtsi64_si32 (__i); } - #ifdef __x86_64__ - -__funline long long _m_to_int64(__m64 __i) { +extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_to_int64 (__m64 __i) +{ return (long long)__i; } - -__funline long long _mm_cvtm64_si64(__m64 __i) { +extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtm64_si64 (__m64 
__i) +{ return (long long)__i; } - -__funline long long _mm_cvtsi64_si64x(__m64 __i) { +extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi64_si64x (__m64 __i) +{ return (long long)__i; } #endif - -__funline __m64 _mm_packs_pi16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_packsswb((__v4hi)__m1, (__v4hi)__m2); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_packs_pi16 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_packsswb ((__v4hi)__m1, (__v4hi)__m2); } - -__funline __m64 _m_packsswb(__m64 __m1, __m64 __m2) { - return _mm_packs_pi16(__m1, __m2); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_packsswb (__m64 __m1, __m64 __m2) +{ + return _mm_packs_pi16 (__m1, __m2); } - -__funline __m64 _mm_packs_pi32(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_packssdw((__v2si)__m1, (__v2si)__m2); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_packs_pi32 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_packssdw ((__v2si)__m1, (__v2si)__m2); } - -__funline __m64 _m_packssdw(__m64 __m1, __m64 __m2) { - return _mm_packs_pi32(__m1, __m2); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_packssdw (__m64 __m1, __m64 __m2) +{ + return _mm_packs_pi32 (__m1, __m2); } - -__funline __m64 _mm_packs_pu16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_packuswb((__v4hi)__m1, (__v4hi)__m2); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_packs_pu16 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_packuswb ((__v4hi)__m1, (__v4hi)__m2); } - -__funline __m64 _m_packuswb(__m64 __m1, __m64 __m2) { - return _mm_packs_pu16(__m1, __m2); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_packuswb (__m64 __m1, __m64 __m2) +{ + return _mm_packs_pu16 (__m1, __m2); } - -__funline __m64 _mm_unpackhi_pi8(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_punpckhbw((__v8qi)__m1, (__v8qi)__m2); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpackhi_pi8 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_punpckhbw ((__v8qi)__m1, (__v8qi)__m2); } - -__funline __m64 _m_punpckhbw(__m64 __m1, __m64 __m2) { - return _mm_unpackhi_pi8(__m1, __m2); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_punpckhbw (__m64 __m1, __m64 __m2) +{ + return _mm_unpackhi_pi8 (__m1, __m2); } - -__funline __m64 _mm_unpackhi_pi16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_punpckhwd((__v4hi)__m1, (__v4hi)__m2); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpackhi_pi16 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_punpckhwd ((__v4hi)__m1, (__v4hi)__m2); } - -__funline __m64 _m_punpckhwd(__m64 __m1, __m64 __m2) { - return _mm_unpackhi_pi16(__m1, __m2); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_punpckhwd (__m64 __m1, __m64 __m2) +{ + return _mm_unpackhi_pi16 (__m1, __m2); } - -__funline __m64 _mm_unpackhi_pi32(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_punpckhdq((__v2si)__m1, (__v2si)__m2); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpackhi_pi32 (__m64 __m1, __m64 __m2) +{ + 
return (__m64) __builtin_ia32_punpckhdq ((__v2si)__m1, (__v2si)__m2); } - -__funline __m64 _m_punpckhdq(__m64 __m1, __m64 __m2) { - return _mm_unpackhi_pi32(__m1, __m2); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_punpckhdq (__m64 __m1, __m64 __m2) +{ + return _mm_unpackhi_pi32 (__m1, __m2); } - -__funline __m64 _mm_unpacklo_pi8(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_punpcklbw((__v8qi)__m1, (__v8qi)__m2); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpacklo_pi8 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_punpcklbw ((__v8qi)__m1, (__v8qi)__m2); } - -__funline __m64 _m_punpcklbw(__m64 __m1, __m64 __m2) { - return _mm_unpacklo_pi8(__m1, __m2); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_punpcklbw (__m64 __m1, __m64 __m2) +{ + return _mm_unpacklo_pi8 (__m1, __m2); } - -__funline __m64 _mm_unpacklo_pi16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_punpcklwd((__v4hi)__m1, (__v4hi)__m2); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpacklo_pi16 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_punpcklwd ((__v4hi)__m1, (__v4hi)__m2); } - -__funline __m64 _m_punpcklwd(__m64 __m1, __m64 __m2) { - return _mm_unpacklo_pi16(__m1, __m2); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_punpcklwd (__m64 __m1, __m64 __m2) +{ + return _mm_unpacklo_pi16 (__m1, __m2); } - -__funline __m64 _mm_unpacklo_pi32(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_punpckldq((__v2si)__m1, (__v2si)__m2); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpacklo_pi32 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_punpckldq ((__v2si)__m1, (__v2si)__m2); } - -__funline __m64 _m_punpckldq(__m64 __m1, __m64 __m2) { - return _mm_unpacklo_pi32(__m1, __m2); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_punpckldq (__m64 __m1, __m64 __m2) +{ + return _mm_unpacklo_pi32 (__m1, __m2); } - -__funline __m64 _mm_add_pi8(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_paddb((__v8qi)__m1, (__v8qi)__m2); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_pi8 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_paddb ((__v8qi)__m1, (__v8qi)__m2); } - -__funline __m64 _m_paddb(__m64 __m1, __m64 __m2) { - return _mm_add_pi8(__m1, __m2); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_paddb (__m64 __m1, __m64 __m2) +{ + return _mm_add_pi8 (__m1, __m2); } - -__funline __m64 _mm_add_pi16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_paddw((__v4hi)__m1, (__v4hi)__m2); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_pi16 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_paddw ((__v4hi)__m1, (__v4hi)__m2); } - -__funline __m64 _m_paddw(__m64 __m1, __m64 __m2) { - return _mm_add_pi16(__m1, __m2); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_paddw (__m64 __m1, __m64 __m2) +{ + return _mm_add_pi16 (__m1, __m2); } - -__funline __m64 _mm_add_pi32(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_paddd((__v2si)__m1, (__v2si)__m2); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, 
__artificial__)) +_mm_add_pi32 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_paddd ((__v2si)__m1, (__v2si)__m2); } - -__funline __m64 _m_paddd(__m64 __m1, __m64 __m2) { - return _mm_add_pi32(__m1, __m2); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_paddd (__m64 __m1, __m64 __m2) +{ + return _mm_add_pi32 (__m1, __m2); } - #ifndef __SSE2__ #pragma GCC push_options +#ifdef __MMX_WITH_SSE__ +#pragma GCC target("sse2") +#else #pragma GCC target("sse2,mmx") +#endif #define __DISABLE_SSE2__ -#endif /* __SSE2__ */ - -__funline __m64 _mm_add_si64(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_paddq((__v1di)__m1, (__v1di)__m2); +#endif +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_si64 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_paddq ((__v1di)__m1, (__v1di)__m2); } #ifdef __DISABLE_SSE2__ #undef __DISABLE_SSE2__ #pragma GCC pop_options -#endif /* __DISABLE_SSE2__ */ - -__funline __m64 _mm_adds_pi8(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_paddsb((__v8qi)__m1, (__v8qi)__m2); +#endif +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_adds_pi8 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_paddsb ((__v8qi)__m1, (__v8qi)__m2); } - -__funline __m64 _m_paddsb(__m64 __m1, __m64 __m2) { - return _mm_adds_pi8(__m1, __m2); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_paddsb (__m64 __m1, __m64 __m2) +{ + return _mm_adds_pi8 (__m1, __m2); } - -__funline __m64 _mm_adds_pi16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_paddsw((__v4hi)__m1, (__v4hi)__m2); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_adds_pi16 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_paddsw ((__v4hi)__m1, (__v4hi)__m2); } - -__funline __m64 _m_paddsw(__m64 __m1, __m64 __m2) { - return _mm_adds_pi16(__m1, __m2); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_paddsw (__m64 __m1, __m64 __m2) +{ + return _mm_adds_pi16 (__m1, __m2); } - -__funline __m64 _mm_adds_pu8(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_paddusb((__v8qi)__m1, (__v8qi)__m2); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_adds_pu8 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_paddusb ((__v8qi)__m1, (__v8qi)__m2); } - -__funline __m64 _m_paddusb(__m64 __m1, __m64 __m2) { - return _mm_adds_pu8(__m1, __m2); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_paddusb (__m64 __m1, __m64 __m2) +{ + return _mm_adds_pu8 (__m1, __m2); } - -__funline __m64 _mm_adds_pu16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_paddusw((__v4hi)__m1, (__v4hi)__m2); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_adds_pu16 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_paddusw ((__v4hi)__m1, (__v4hi)__m2); } - -__funline __m64 _m_paddusw(__m64 __m1, __m64 __m2) { - return _mm_adds_pu16(__m1, __m2); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_paddusw (__m64 __m1, __m64 __m2) +{ + return _mm_adds_pu16 (__m1, __m2); } - -__funline __m64 _mm_sub_pi8(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_psubb((__v8qi)__m1, (__v8qi)__m2); +extern __inline __m64 __attribute__((__gnu_inline__, 
__always_inline__, __artificial__)) +_mm_sub_pi8 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_psubb ((__v8qi)__m1, (__v8qi)__m2); } - -__funline __m64 _m_psubb(__m64 __m1, __m64 __m2) { - return _mm_sub_pi8(__m1, __m2); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_psubb (__m64 __m1, __m64 __m2) +{ + return _mm_sub_pi8 (__m1, __m2); } - -__funline __m64 _mm_sub_pi16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_psubw((__v4hi)__m1, (__v4hi)__m2); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_pi16 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_psubw ((__v4hi)__m1, (__v4hi)__m2); } - -__funline __m64 _m_psubw(__m64 __m1, __m64 __m2) { - return _mm_sub_pi16(__m1, __m2); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_psubw (__m64 __m1, __m64 __m2) +{ + return _mm_sub_pi16 (__m1, __m2); } - -__funline __m64 _mm_sub_pi32(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_psubd((__v2si)__m1, (__v2si)__m2); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_pi32 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_psubd ((__v2si)__m1, (__v2si)__m2); } - -__funline __m64 _m_psubd(__m64 __m1, __m64 __m2) { - return _mm_sub_pi32(__m1, __m2); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_psubd (__m64 __m1, __m64 __m2) +{ + return _mm_sub_pi32 (__m1, __m2); } - #ifndef __SSE2__ #pragma GCC push_options +#ifdef __MMX_WITH_SSE__ +#pragma GCC target("sse2") +#else #pragma GCC target("sse2,mmx") +#endif #define __DISABLE_SSE2__ -#endif /* __SSE2__ */ - -__funline __m64 _mm_sub_si64(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_psubq((__v1di)__m1, (__v1di)__m2); +#endif +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_si64 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_psubq ((__v1di)__m1, (__v1di)__m2); } #ifdef __DISABLE_SSE2__ #undef __DISABLE_SSE2__ #pragma GCC pop_options -#endif /* __DISABLE_SSE2__ */ - -__funline __m64 _mm_subs_pi8(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_psubsb((__v8qi)__m1, (__v8qi)__m2); -} - -__funline __m64 _m_psubsb(__m64 __m1, __m64 __m2) { - return _mm_subs_pi8(__m1, __m2); -} - -__funline __m64 _mm_subs_pi16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_psubsw((__v4hi)__m1, (__v4hi)__m2); -} - -__funline __m64 _m_psubsw(__m64 __m1, __m64 __m2) { - return _mm_subs_pi16(__m1, __m2); -} - -__funline __m64 _mm_subs_pu8(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_psubusb((__v8qi)__m1, (__v8qi)__m2); -} - -__funline __m64 _m_psubusb(__m64 __m1, __m64 __m2) { - return _mm_subs_pu8(__m1, __m2); -} - -__funline __m64 _mm_subs_pu16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_psubusw((__v4hi)__m1, (__v4hi)__m2); -} - -__funline __m64 _m_psubusw(__m64 __m1, __m64 __m2) { - return _mm_subs_pu16(__m1, __m2); -} - -__funline __m64 _mm_madd_pi16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pmaddwd((__v4hi)__m1, (__v4hi)__m2); -} - -__funline __m64 _m_pmaddwd(__m64 __m1, __m64 __m2) { - return _mm_madd_pi16(__m1, __m2); -} - -__funline __m64 _mm_mulhi_pi16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pmulhw((__v4hi)__m1, (__v4hi)__m2); -} - -__funline __m64 _m_pmulhw(__m64 __m1, __m64 __m2) { - return _mm_mulhi_pi16(__m1, __m2); -} - -__funline __m64 
_mm_mullo_pi16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pmullw((__v4hi)__m1, (__v4hi)__m2); -} - -__funline __m64 _m_pmullw(__m64 __m1, __m64 __m2) { - return _mm_mullo_pi16(__m1, __m2); -} - -__funline __m64 _mm_sll_pi16(__m64 __m, __m64 __count) { - return (__m64)__builtin_ia32_psllw((__v4hi)__m, (__v4hi)__count); -} - -__funline __m64 _m_psllw(__m64 __m, __m64 __count) { - return _mm_sll_pi16(__m, __count); -} - -__funline __m64 _mm_slli_pi16(__m64 __m, int __count) { - return (__m64)__builtin_ia32_psllwi((__v4hi)__m, __count); -} - -__funline __m64 _m_psllwi(__m64 __m, int __count) { - return _mm_slli_pi16(__m, __count); -} - -__funline __m64 _mm_sll_pi32(__m64 __m, __m64 __count) { - return (__m64)__builtin_ia32_pslld((__v2si)__m, (__v2si)__count); -} - -__funline __m64 _m_pslld(__m64 __m, __m64 __count) { - return _mm_sll_pi32(__m, __count); -} - -__funline __m64 _mm_slli_pi32(__m64 __m, int __count) { - return (__m64)__builtin_ia32_pslldi((__v2si)__m, __count); -} - -__funline __m64 _m_pslldi(__m64 __m, int __count) { - return _mm_slli_pi32(__m, __count); -} - -__funline __m64 _mm_sll_si64(__m64 __m, __m64 __count) { - return (__m64)__builtin_ia32_psllq((__v1di)__m, (__v1di)__count); -} - -__funline __m64 _m_psllq(__m64 __m, __m64 __count) { - return _mm_sll_si64(__m, __count); -} - -__funline __m64 _mm_slli_si64(__m64 __m, int __count) { - return (__m64)__builtin_ia32_psllqi((__v1di)__m, __count); -} - -__funline __m64 _m_psllqi(__m64 __m, int __count) { - return _mm_slli_si64(__m, __count); -} - -__funline __m64 _mm_sra_pi16(__m64 __m, __m64 __count) { - return (__m64)__builtin_ia32_psraw((__v4hi)__m, (__v4hi)__count); -} - -__funline __m64 _m_psraw(__m64 __m, __m64 __count) { - return _mm_sra_pi16(__m, __count); -} - -__funline __m64 _mm_srai_pi16(__m64 __m, int __count) { - return (__m64)__builtin_ia32_psrawi((__v4hi)__m, __count); -} - -__funline __m64 _m_psrawi(__m64 __m, int __count) { - return _mm_srai_pi16(__m, __count); -} - -__funline __m64 _mm_sra_pi32(__m64 __m, __m64 __count) { - return (__m64)__builtin_ia32_psrad((__v2si)__m, (__v2si)__count); -} - -__funline __m64 _m_psrad(__m64 __m, __m64 __count) { - return _mm_sra_pi32(__m, __count); -} - -__funline __m64 _mm_srai_pi32(__m64 __m, int __count) { - return (__m64)__builtin_ia32_psradi((__v2si)__m, __count); -} - -__funline __m64 _m_psradi(__m64 __m, int __count) { - return _mm_srai_pi32(__m, __count); -} - -__funline __m64 _mm_srl_pi16(__m64 __m, __m64 __count) { - return (__m64)__builtin_ia32_psrlw((__v4hi)__m, (__v4hi)__count); -} - -__funline __m64 _m_psrlw(__m64 __m, __m64 __count) { - return _mm_srl_pi16(__m, __count); -} - -__funline __m64 _mm_srli_pi16(__m64 __m, int __count) { - return (__m64)__builtin_ia32_psrlwi((__v4hi)__m, __count); -} - -__funline __m64 _m_psrlwi(__m64 __m, int __count) { - return _mm_srli_pi16(__m, __count); -} - -__funline __m64 _mm_srl_pi32(__m64 __m, __m64 __count) { - return (__m64)__builtin_ia32_psrld((__v2si)__m, (__v2si)__count); -} - -__funline __m64 _m_psrld(__m64 __m, __m64 __count) { - return _mm_srl_pi32(__m, __count); -} - -__funline __m64 _mm_srli_pi32(__m64 __m, int __count) { - return (__m64)__builtin_ia32_psrldi((__v2si)__m, __count); -} - -__funline __m64 _m_psrldi(__m64 __m, int __count) { - return _mm_srli_pi32(__m, __count); -} - -__funline __m64 _mm_srl_si64(__m64 __m, __m64 __count) { - return (__m64)__builtin_ia32_psrlq((__v1di)__m, (__v1di)__count); -} - -__funline __m64 _m_psrlq(__m64 __m, __m64 __count) { - return _mm_srl_si64(__m, __count); -} 
- -__funline __m64 _mm_srli_si64(__m64 __m, int __count) { - return (__m64)__builtin_ia32_psrlqi((__v1di)__m, __count); -} - -__funline __m64 _m_psrlqi(__m64 __m, int __count) { - return _mm_srli_si64(__m, __count); -} - -__funline __m64 _mm_and_si64(__m64 __m1, __m64 __m2) { - return __builtin_ia32_pand(__m1, __m2); -} - -__funline __m64 _m_pand(__m64 __m1, __m64 __m2) { - return _mm_and_si64(__m1, __m2); -} - -__funline __m64 _mm_andnot_si64(__m64 __m1, __m64 __m2) { - return __builtin_ia32_pandn(__m1, __m2); -} - -__funline __m64 _m_pandn(__m64 __m1, __m64 __m2) { - return _mm_andnot_si64(__m1, __m2); -} - -__funline __m64 _mm_or_si64(__m64 __m1, __m64 __m2) { - return __builtin_ia32_por(__m1, __m2); -} - -__funline __m64 _m_por(__m64 __m1, __m64 __m2) { - return _mm_or_si64(__m1, __m2); -} - -__funline __m64 _mm_xor_si64(__m64 __m1, __m64 __m2) { - return __builtin_ia32_pxor(__m1, __m2); -} - -__funline __m64 _m_pxor(__m64 __m1, __m64 __m2) { - return _mm_xor_si64(__m1, __m2); -} - -__funline __m64 _mm_cmpeq_pi8(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pcmpeqb((__v8qi)__m1, (__v8qi)__m2); -} - -__funline __m64 _m_pcmpeqb(__m64 __m1, __m64 __m2) { - return _mm_cmpeq_pi8(__m1, __m2); -} - -__funline __m64 _mm_cmpgt_pi8(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pcmpgtb((__v8qi)__m1, (__v8qi)__m2); -} - -__funline __m64 _m_pcmpgtb(__m64 __m1, __m64 __m2) { - return _mm_cmpgt_pi8(__m1, __m2); -} - -__funline __m64 _mm_cmpeq_pi16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pcmpeqw((__v4hi)__m1, (__v4hi)__m2); -} - -__funline __m64 _m_pcmpeqw(__m64 __m1, __m64 __m2) { - return _mm_cmpeq_pi16(__m1, __m2); -} - -__funline __m64 _mm_cmpgt_pi16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pcmpgtw((__v4hi)__m1, (__v4hi)__m2); -} - -__funline __m64 _m_pcmpgtw(__m64 __m1, __m64 __m2) { - return _mm_cmpgt_pi16(__m1, __m2); -} - -__funline __m64 _mm_cmpeq_pi32(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pcmpeqd((__v2si)__m1, (__v2si)__m2); -} - -__funline __m64 _m_pcmpeqd(__m64 __m1, __m64 __m2) { - return _mm_cmpeq_pi32(__m1, __m2); -} - -__funline __m64 _mm_cmpgt_pi32(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pcmpgtd((__v2si)__m1, (__v2si)__m2); -} - -__funline __m64 _m_pcmpgtd(__m64 __m1, __m64 __m2) { - return _mm_cmpgt_pi32(__m1, __m2); -} - -__funline __m64 _mm_setzero_si64(void) { +#endif +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_subs_pi8 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_psubsb ((__v8qi)__m1, (__v8qi)__m2); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_psubsb (__m64 __m1, __m64 __m2) +{ + return _mm_subs_pi8 (__m1, __m2); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_subs_pi16 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_psubsw ((__v4hi)__m1, (__v4hi)__m2); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_psubsw (__m64 __m1, __m64 __m2) +{ + return _mm_subs_pi16 (__m1, __m2); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_subs_pu8 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_psubusb ((__v8qi)__m1, (__v8qi)__m2); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_psubusb (__m64 __m1, __m64 __m2) +{ + return _mm_subs_pu8 (__m1, __m2); +} +extern __inline __m64 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_subs_pu16 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_psubusw ((__v4hi)__m1, (__v4hi)__m2); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_psubusw (__m64 __m1, __m64 __m2) +{ + return _mm_subs_pu16 (__m1, __m2); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_madd_pi16 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_pmaddwd ((__v4hi)__m1, (__v4hi)__m2); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pmaddwd (__m64 __m1, __m64 __m2) +{ + return _mm_madd_pi16 (__m1, __m2); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mulhi_pi16 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_pmulhw ((__v4hi)__m1, (__v4hi)__m2); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pmulhw (__m64 __m1, __m64 __m2) +{ + return _mm_mulhi_pi16 (__m1, __m2); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mullo_pi16 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_pmullw ((__v4hi)__m1, (__v4hi)__m2); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pmullw (__m64 __m1, __m64 __m2) +{ + return _mm_mullo_pi16 (__m1, __m2); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sll_pi16 (__m64 __m, __m64 __count) +{ + return (__m64) __builtin_ia32_psllw ((__v4hi)__m, (__v4hi)__count); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_psllw (__m64 __m, __m64 __count) +{ + return _mm_sll_pi16 (__m, __count); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_slli_pi16 (__m64 __m, int __count) +{ + return (__m64) __builtin_ia32_psllwi ((__v4hi)__m, __count); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_psllwi (__m64 __m, int __count) +{ + return _mm_slli_pi16 (__m, __count); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sll_pi32 (__m64 __m, __m64 __count) +{ + return (__m64) __builtin_ia32_pslld ((__v2si)__m, (__v2si)__count); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pslld (__m64 __m, __m64 __count) +{ + return _mm_sll_pi32 (__m, __count); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_slli_pi32 (__m64 __m, int __count) +{ + return (__m64) __builtin_ia32_pslldi ((__v2si)__m, __count); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pslldi (__m64 __m, int __count) +{ + return _mm_slli_pi32 (__m, __count); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sll_si64 (__m64 __m, __m64 __count) +{ + return (__m64) __builtin_ia32_psllq ((__v1di)__m, (__v1di)__count); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_psllq (__m64 __m, __m64 __count) +{ + return _mm_sll_si64 (__m, __count); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_slli_si64 (__m64 __m, int __count) +{ + return (__m64) __builtin_ia32_psllqi ((__v1di)__m, __count); +} 
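/* [editor's illustration -- not part of the patch] Usage sketch for the MMX
   shift intrinsics converted just above: _mm_slli_pi16 shifts each 16-bit
   lane left by an immediate count, so a shift of 2 multiplies every lane by
   4. Assumes an x86-64 build where immintrin pulls in this header: */
#ifdef MMX_SHIFT_EXAMPLE /* hypothetical guard; define it to compile the sketch */
#include "third_party/intel/mmintrin.internal.h"
static int shifted_low_lanes(void)
{
  __m64 v = _mm_set_pi16(8, 6, 4, 2); /* lanes given highest first; lane 0 is 2 */
  v = _mm_slli_pi16(v, 2);            /* each 16-bit lane *= 4 */
  int lo = _mm_cvtsi64_si32(v);       /* low 32 bits: lanes 0 and 1, now 8 and 16 */
  _mm_empty();                        /* clear MMX state before any x87 math */
  return lo;
}
#endif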
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_psllqi (__m64 __m, int __count) +{ + return _mm_slli_si64 (__m, __count); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sra_pi16 (__m64 __m, __m64 __count) +{ + return (__m64) __builtin_ia32_psraw ((__v4hi)__m, (__v4hi)__count); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_psraw (__m64 __m, __m64 __count) +{ + return _mm_sra_pi16 (__m, __count); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srai_pi16 (__m64 __m, int __count) +{ + return (__m64) __builtin_ia32_psrawi ((__v4hi)__m, __count); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_psrawi (__m64 __m, int __count) +{ + return _mm_srai_pi16 (__m, __count); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sra_pi32 (__m64 __m, __m64 __count) +{ + return (__m64) __builtin_ia32_psrad ((__v2si)__m, (__v2si)__count); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_psrad (__m64 __m, __m64 __count) +{ + return _mm_sra_pi32 (__m, __count); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srai_pi32 (__m64 __m, int __count) +{ + return (__m64) __builtin_ia32_psradi ((__v2si)__m, __count); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_psradi (__m64 __m, int __count) +{ + return _mm_srai_pi32 (__m, __count); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srl_pi16 (__m64 __m, __m64 __count) +{ + return (__m64) __builtin_ia32_psrlw ((__v4hi)__m, (__v4hi)__count); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_psrlw (__m64 __m, __m64 __count) +{ + return _mm_srl_pi16 (__m, __count); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srli_pi16 (__m64 __m, int __count) +{ + return (__m64) __builtin_ia32_psrlwi ((__v4hi)__m, __count); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_psrlwi (__m64 __m, int __count) +{ + return _mm_srli_pi16 (__m, __count); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srl_pi32 (__m64 __m, __m64 __count) +{ + return (__m64) __builtin_ia32_psrld ((__v2si)__m, (__v2si)__count); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_psrld (__m64 __m, __m64 __count) +{ + return _mm_srl_pi32 (__m, __count); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srli_pi32 (__m64 __m, int __count) +{ + return (__m64) __builtin_ia32_psrldi ((__v2si)__m, __count); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_psrldi (__m64 __m, int __count) +{ + return _mm_srli_pi32 (__m, __count); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srl_si64 (__m64 __m, __m64 __count) +{ + return (__m64) __builtin_ia32_psrlq ((__v1di)__m, (__v1di)__count); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_psrlq (__m64 __m, __m64 __count) +{ + return _mm_srl_si64 (__m, __count); +} +extern 
__inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srli_si64 (__m64 __m, int __count) +{ + return (__m64) __builtin_ia32_psrlqi ((__v1di)__m, __count); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_psrlqi (__m64 __m, int __count) +{ + return _mm_srli_si64 (__m, __count); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_and_si64 (__m64 __m1, __m64 __m2) +{ + return __builtin_ia32_pand (__m1, __m2); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pand (__m64 __m1, __m64 __m2) +{ + return _mm_and_si64 (__m1, __m2); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_andnot_si64 (__m64 __m1, __m64 __m2) +{ + return __builtin_ia32_pandn (__m1, __m2); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pandn (__m64 __m1, __m64 __m2) +{ + return _mm_andnot_si64 (__m1, __m2); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_or_si64 (__m64 __m1, __m64 __m2) +{ + return __builtin_ia32_por (__m1, __m2); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_por (__m64 __m1, __m64 __m2) +{ + return _mm_or_si64 (__m1, __m2); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_xor_si64 (__m64 __m1, __m64 __m2) +{ + return __builtin_ia32_pxor (__m1, __m2); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pxor (__m64 __m1, __m64 __m2) +{ + return _mm_xor_si64 (__m1, __m2); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_pi8 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_pcmpeqb ((__v8qi)__m1, (__v8qi)__m2); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pcmpeqb (__m64 __m1, __m64 __m2) +{ + return _mm_cmpeq_pi8 (__m1, __m2); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_pi8 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_pcmpgtb ((__v8qi)__m1, (__v8qi)__m2); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pcmpgtb (__m64 __m1, __m64 __m2) +{ + return _mm_cmpgt_pi8 (__m1, __m2); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_pi16 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_pcmpeqw ((__v4hi)__m1, (__v4hi)__m2); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pcmpeqw (__m64 __m1, __m64 __m2) +{ + return _mm_cmpeq_pi16 (__m1, __m2); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_pi16 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_pcmpgtw ((__v4hi)__m1, (__v4hi)__m2); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pcmpgtw (__m64 __m1, __m64 __m2) +{ + return _mm_cmpgt_pi16 (__m1, __m2); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_pi32 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_pcmpeqd ((__v2si)__m1, (__v2si)__m2); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 
+_m_pcmpeqd (__m64 __m1, __m64 __m2) +{ + return _mm_cmpeq_pi32 (__m1, __m2); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_pi32 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_pcmpgtd ((__v2si)__m1, (__v2si)__m2); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pcmpgtd (__m64 __m1, __m64 __m2) +{ + return _mm_cmpgt_pi32 (__m1, __m2); +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_setzero_si64 (void) +{ return (__m64)0LL; } - -__funline __m64 _mm_set_pi32(int __i1, int __i0) { - return (__m64)__builtin_ia32_vec_init_v2si(__i0, __i1); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_pi32 (int __i1, int __i0) +{ + return (__m64) __builtin_ia32_vec_init_v2si (__i0, __i1); } - -__funline __m64 _mm_set_pi16(short __w3, short __w2, short __w1, short __w0) { - return (__m64)__builtin_ia32_vec_init_v4hi(__w0, __w1, __w2, __w3); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_pi16 (short __w3, short __w2, short __w1, short __w0) +{ + return (__m64) __builtin_ia32_vec_init_v4hi (__w0, __w1, __w2, __w3); } - -__funline __m64 _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, - char __b2, char __b1, char __b0) { - return (__m64)__builtin_ia32_vec_init_v8qi(__b0, __b1, __b2, __b3, __b4, __b5, - __b6, __b7); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_pi8 (char __b7, char __b6, char __b5, char __b4, + char __b3, char __b2, char __b1, char __b0) +{ + return (__m64) __builtin_ia32_vec_init_v8qi (__b0, __b1, __b2, __b3, + __b4, __b5, __b6, __b7); } - -__funline __m64 _mm_setr_pi32(int __i0, int __i1) { - return _mm_set_pi32(__i1, __i0); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_setr_pi32 (int __i0, int __i1) +{ + return _mm_set_pi32 (__i1, __i0); } - -__funline __m64 _mm_setr_pi16(short __w0, short __w1, short __w2, short __w3) { - return _mm_set_pi16(__w3, __w2, __w1, __w0); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_setr_pi16 (short __w0, short __w1, short __w2, short __w3) +{ + return _mm_set_pi16 (__w3, __w2, __w1, __w0); } - -__funline __m64 _mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, - char __b4, char __b5, char __b6, char __b7) { - return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_setr_pi8 (char __b0, char __b1, char __b2, char __b3, + char __b4, char __b5, char __b6, char __b7) +{ + return _mm_set_pi8 (__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0); } - -__funline __m64 _mm_set1_pi32(int __i) { - return _mm_set_pi32(__i, __i); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set1_pi32 (int __i) +{ + return _mm_set_pi32 (__i, __i); } - -__funline __m64 _mm_set1_pi16(short __w) { - return _mm_set_pi16(__w, __w, __w, __w); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set1_pi16 (short __w) +{ + return _mm_set_pi16 (__w, __w, __w, __w); } - -__funline __m64 _mm_set1_pi8(char __b) { - return _mm_set_pi8(__b, __b, __b, __b, __b, __b, __b, __b); +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set1_pi8 
(char __b) +{ + return _mm_set_pi8 (__b, __b, __b, __b, __b, __b, __b, __b); } #ifdef __DISABLE_MMX__ #undef __DISABLE_MMX__ #pragma GCC pop_options -#endif /* __DISABLE_MMX__ */ - -#endif /* __x86_64__ */ -#endif /* _MMINTRIN_H_INCLUDED */ +#endif +#endif +#endif diff --git a/third_party/intel/movdirintrin.internal.h b/third_party/intel/movdirintrin.internal.h index ccf9bc58b..f8cca6ac9 100644 --- a/third_party/intel/movdirintrin.internal.h +++ b/third_party/intel/movdirintrin.internal.h @@ -1,42 +1,47 @@ -#if !defined _IMMINTRIN_H_INCLUDED -#error "Never use directly; include instead." +/* clang-format off */ +#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0) +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use directly; include instead." #endif - #ifndef _MOVDIRINTRIN_H_INCLUDED #define _MOVDIRINTRIN_H_INCLUDED - #ifndef __MOVDIRI__ #pragma GCC push_options -#pragma GCC target("movdiri") +#pragma GCC target ("movdiri") #define __DISABLE_MOVDIRI__ -#endif /* __MOVDIRI__ */ - -__funline void _directstoreu_u32(void *__P, unsigned int __A) { - __builtin_ia32_directstoreu_u32((unsigned int *)__P, __A); +#endif +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_directstoreu_u32 (void * __P, unsigned int __A) +{ + __builtin_ia32_directstoreu_u32 ((unsigned int *)__P, __A); } #ifdef __x86_64__ -__funline void _directstoreu_u64(void *__P, unsigned long long __A) { - __builtin_ia32_directstoreu_u64((unsigned long long *)__P, __A); +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_directstoreu_u64 (void * __P, unsigned long long __A) +{ + __builtin_ia32_directstoreu_u64 ((unsigned long long *)__P, __A); } #endif - #ifdef __DISABLE_MOVDIRI__ #undef __DISABLE_MOVDIRI__ #pragma GCC pop_options -#endif /* __DISABLE_MOVDIRI__ */ - +#endif #ifndef __MOVDIR64B__ #pragma GCC push_options -#pragma GCC target("movdir64b") +#pragma GCC target ("movdir64b") #define __DISABLE_MOVDIR64B__ -#endif /* __MOVDIR64B__ */ - -__funline void _movdir64b(void *__P, const void *__Q) { - __builtin_ia32_movdir64b(__P, __Q); +#endif +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_movdir64b (void * __P, const void * __Q) +{ + __builtin_ia32_movdir64b (__P, __Q); } - #ifdef __DISABLE_MOVDIR64B__ #undef __DISABLE_MOVDIR64B__ #pragma GCC pop_options -#endif /* __DISABLE_MOVDIR64B__ */ -#endif /* _MOVDIRINTRIN_H_INCLUDED. 
*/ +#endif +#endif +#endif diff --git a/third_party/intel/mwaitxintrin.internal.h b/third_party/intel/mwaitxintrin.internal.h index 0db3aa2c4..00de10a18 100644 --- a/third_party/intel/mwaitxintrin.internal.h +++ b/third_party/intel/mwaitxintrin.internal.h @@ -1,25 +1,25 @@ +/* clang-format off */ +#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0) #ifndef _MWAITXINTRIN_H_INCLUDED #define _MWAITXINTRIN_H_INCLUDED -#ifdef __x86_64__ - #ifndef __MWAITX__ #pragma GCC push_options #pragma GCC target("mwaitx") #define __DISABLE_MWAITX__ -#endif /* __MWAITX__ */ - -__funline void _mm_monitorx(void const* __P, unsigned int __E, unsigned int __H) { - __builtin_ia32_monitorx(__P, __E, __H); +#endif +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_monitorx (void const * __P, unsigned int __E, unsigned int __H) +{ + __builtin_ia32_monitorx (__P, __E, __H); } - -__funline void _mm_mwaitx(unsigned int __E, unsigned int __H, unsigned int __C) { - __builtin_ia32_mwaitx(__E, __H, __C); +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mwaitx (unsigned int __E, unsigned int __H, unsigned int __C) +{ + __builtin_ia32_mwaitx (__E, __H, __C); } - #ifdef __DISABLE_MWAITX__ #undef __DISABLE_MWAITX__ #pragma GCC pop_options -#endif /* __DISABLE_MWAITX__ */ - -#endif /* __x86_64__ */ -#endif /* _MWAITXINTRIN_H_INCLUDED */ +#endif +#endif +#endif diff --git a/third_party/intel/nmmintrin.internal.h b/third_party/intel/nmmintrin.internal.h index 9a826c18b..503e4a4c0 100644 --- a/third_party/intel/nmmintrin.internal.h +++ b/third_party/intel/nmmintrin.internal.h @@ -1,6 +1,7 @@ +/* clang-format off */ +#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0) #ifndef _NMMINTRIN_H_INCLUDED #define _NMMINTRIN_H_INCLUDED -#ifdef __x86_64__ #include "third_party/intel/smmintrin.internal.h" -#endif /* __x86_64__ */ -#endif /* _NMMINTRIN_H_INCLUDED */ +#endif +#endif diff --git a/third_party/intel/pconfigintrin.internal.h b/third_party/intel/pconfigintrin.internal.h index c0877e16d..bca106a0e 100644 --- a/third_party/intel/pconfigintrin.internal.h +++ b/third_party/intel/pconfigintrin.internal.h @@ -1,52 +1,41 @@ -#ifndef _IMMINTRIN_H_INCLUDED -#error "Never use directly; include instead." +/* clang-format off */ +#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0) +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use directly; include instead." 
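
/* Usage sketch (illustrative, not part of this patch) for the MONITORX/MWAITX
   pair converted in the mwaitxintrin hunk above. Assumes an AMD CPU that
   advertises the `mwaitx` CPUID bit and a caller-owned flag whose cache line
   the monitor arms; the extension/hint words are left zero, so the wait ends
   on any store to the monitored line or on an interrupt. */
static inline void spin_wait_mwaitx(volatile unsigned *flag) {
  while (!*flag) {
    _mm_monitorx((const void *)flag, 0, 0); /* arm the address monitor */
    if (!*flag)                             /* re-check to avoid a lost wake */
      _mm_mwaitx(0, 0, 0);                  /* sleep until the line is written */
  }
}
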
#endif - #ifndef _PCONFIGINTRIN_H_INCLUDED #define _PCONFIGINTRIN_H_INCLUDED - #ifndef __PCONFIG__ #pragma GCC push_options #pragma GCC target("pconfig") #define __DISABLE_PCONFIG__ -#endif /* __PCONFIG__ */ - -#define __pconfig_b(leaf, b, retval) \ - __asm__ __volatile__("pconfig\n\t" \ - : "=a"(retval) \ - : "a"(leaf), "b"(b) \ - : "c" \ - "c") - -#define __pconfig_generic(leaf, b, c, d, retval) \ - __asm__ __volatile__("pconfig\n\t" \ - : "=a"(retval), "=b"(b), "=c"(c), "=d"(d) \ - : "a"(leaf), "b"(b), "c"(c), "d"(d) \ - : "cc") - -__funline unsigned int _pconfig_u32(const unsigned int __L, size_t __D[]) { - enum __pconfig_type { +#endif +#define __pconfig_b(leaf, b, retval) __asm__ __volatile__ ("pconfig\n\t" : "=a" (retval) : "a" (leaf), "b" (b) : "cc") +#define __pconfig_generic(leaf, b, c, d, retval) __asm__ __volatile__ ("pconfig\n\t" : "=a" (retval), "=b" (b), "=c" (c), "=d" (d) : "a" (leaf), "b" (b), "c" (c), "d" (d) : "cc") +extern __inline unsigned int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_pconfig_u32 (const unsigned int __L, size_t __D[]) +{ + enum __pconfig_type + { __PCONFIG_KEY_PROGRAM = 0x01, }; - unsigned int __R = 0; - - if (!__builtin_constant_p(__L)) - __pconfig_generic(__L, __D[0], __D[1], __D[2], __R); - else - switch (__L) { - case __PCONFIG_KEY_PROGRAM: - __pconfig_b(__L, __D[0], __R); - break; - default: - __pconfig_generic(__L, __D[0], __D[1], __D[2], __R); + if (!__builtin_constant_p (__L)) + __pconfig_generic (__L, __D[0], __D[1], __D[2], __R); + else switch (__L) + { + case __PCONFIG_KEY_PROGRAM: + __pconfig_b (__L, __D[0], __R); + break; + default: + __pconfig_generic (__L, __D[0], __D[1], __D[2], __R); } return __R; } - #ifdef __DISABLE_PCONFIG__ #undef __DISABLE_PCONFIG__ #pragma GCC pop_options -#endif /* __DISABLE_PCONFIG__ */ - -#endif /* _PCONFIGINTRIN_H_INCLUDED */ +#endif +#endif +#endif diff --git a/third_party/intel/pkuintrin.internal.h b/third_party/intel/pkuintrin.internal.h index 789c1335e..17daff8ea 100644 --- a/third_party/intel/pkuintrin.internal.h +++ b/third_party/intel/pkuintrin.internal.h @@ -1,27 +1,30 @@ -#if !defined _IMMINTRIN_H_INCLUDED -#error "Never use directly; include instead." +/* clang-format off */ +#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0) +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use directly; include instead." 
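
/* Usage sketch (illustrative, not part of this patch) for the _pconfig_u32
   intrinsic above. PCONFIG executes only at ring 0, and the key-programming
   structure whose address goes in __D[0] is a placeholder assumption here;
   leaf 0x01 is __PCONFIG_KEY_PROGRAM per the switch in the intrinsic. */
static unsigned program_mktme_key(void *key_program_struct /* hypothetical */) {
  size_t d[3] = { (size_t)key_program_struct, 0, 0 }; /* rbx, rcx, rdx */
  return _pconfig_u32(0x01, d);                       /* 0 means success */
}
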
#endif - #ifndef _PKUINTRIN_H_INCLUDED #define _PKUINTRIN_H_INCLUDED - #ifndef __PKU__ #pragma GCC push_options #pragma GCC target("pku") #define __DISABLE_PKU__ -#endif /* __PKU__ */ - -__funline unsigned int _rdpkru_u32(void) { - return __builtin_ia32_rdpkru(); +#endif +extern __inline unsigned int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_rdpkru_u32 (void) +{ + return __builtin_ia32_rdpkru (); } - -__funline void _wrpkru(unsigned int __key) { - __builtin_ia32_wrpkru(__key); +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_wrpkru (unsigned int __key) +{ + __builtin_ia32_wrpkru (__key); } - #ifdef __DISABLE_PKU__ #undef __DISABLE_PKU__ #pragma GCC pop_options -#endif /* __DISABLE_PKU__ */ - -#endif /* _PKUINTRIN_H_INCLUDED */ +#endif +#endif +#endif diff --git a/third_party/intel/pmmintrin.internal.h b/third_party/intel/pmmintrin.internal.h index 27f9c5e41..4a5b6c00e 100644 --- a/third_party/intel/pmmintrin.internal.h +++ b/third_party/intel/pmmintrin.internal.h @@ -1,78 +1,86 @@ +/* clang-format off */ +#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0) #ifndef _PMMINTRIN_H_INCLUDED #define _PMMINTRIN_H_INCLUDED -#ifdef __x86_64__ #include "third_party/intel/emmintrin.internal.h" - #ifndef __SSE3__ #pragma GCC push_options #pragma GCC target("sse3") #define __DISABLE_SSE3__ -#endif /* __SSE3__ */ - +#endif #define _MM_DENORMALS_ZERO_MASK 0x0040 -#define _MM_DENORMALS_ZERO_ON 0x0040 -#define _MM_DENORMALS_ZERO_OFF 0x0000 - -#define _MM_SET_DENORMALS_ZERO_MODE(mode) \ - _mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (mode)) +#define _MM_DENORMALS_ZERO_ON 0x0040 +#define _MM_DENORMALS_ZERO_OFF 0x0000 +#define _MM_SET_DENORMALS_ZERO_MODE(mode) _mm_setcsr ((_mm_getcsr () & ~_MM_DENORMALS_ZERO_MASK) | (mode)) #define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK) - -__funline __m128 _mm_addsub_ps(__m128 __X, __m128 __Y) { - return (__m128)__builtin_ia32_addsubps((__v4sf)__X, (__v4sf)__Y); +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_addsub_ps (__m128 __X, __m128 __Y) +{ + return (__m128) __builtin_ia32_addsubps ((__v4sf)__X, (__v4sf)__Y); } - -__funline __m128 _mm_hadd_ps(__m128 __X, __m128 __Y) { - return (__m128)__builtin_ia32_haddps((__v4sf)__X, (__v4sf)__Y); +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hadd_ps (__m128 __X, __m128 __Y) +{ + return (__m128) __builtin_ia32_haddps ((__v4sf)__X, (__v4sf)__Y); } - -__funline __m128 _mm_hsub_ps(__m128 __X, __m128 __Y) { - return (__m128)__builtin_ia32_hsubps((__v4sf)__X, (__v4sf)__Y); +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hsub_ps (__m128 __X, __m128 __Y) +{ + return (__m128) __builtin_ia32_hsubps ((__v4sf)__X, (__v4sf)__Y); } - -__funline __m128 _mm_movehdup_ps(__m128 __X) { - return (__m128)__builtin_ia32_movshdup((__v4sf)__X); +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_movehdup_ps (__m128 __X) +{ + return (__m128) __builtin_ia32_movshdup ((__v4sf)__X); } - -__funline __m128 _mm_moveldup_ps(__m128 __X) { - return (__m128)__builtin_ia32_movsldup((__v4sf)__X); +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_moveldup_ps (__m128 __X) +{ + return (__m128) __builtin_ia32_movsldup ((__v4sf)__X); } - -__funline __m128d _mm_addsub_pd(__m128d __X, __m128d __Y) { - return 
(__m128d)__builtin_ia32_addsubpd((__v2df)__X, (__v2df)__Y); +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_addsub_pd (__m128d __X, __m128d __Y) +{ + return (__m128d) __builtin_ia32_addsubpd ((__v2df)__X, (__v2df)__Y); } - -__funline __m128d _mm_hadd_pd(__m128d __X, __m128d __Y) { - return (__m128d)__builtin_ia32_haddpd((__v2df)__X, (__v2df)__Y); +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hadd_pd (__m128d __X, __m128d __Y) +{ + return (__m128d) __builtin_ia32_haddpd ((__v2df)__X, (__v2df)__Y); } - -__funline __m128d _mm_hsub_pd(__m128d __X, __m128d __Y) { - return (__m128d)__builtin_ia32_hsubpd((__v2df)__X, (__v2df)__Y); +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hsub_pd (__m128d __X, __m128d __Y) +{ + return (__m128d) __builtin_ia32_hsubpd ((__v2df)__X, (__v2df)__Y); } - -__funline __m128d _mm_loaddup_pd(double const *__P) { - return _mm_load1_pd(__P); +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loaddup_pd (double const *__P) +{ + return _mm_load1_pd (__P); } - -__funline __m128d _mm_movedup_pd(__m128d __X) { - return _mm_shuffle_pd(__X, __X, _MM_SHUFFLE2(0, 0)); +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_movedup_pd (__m128d __X) +{ + return _mm_shuffle_pd (__X, __X, _MM_SHUFFLE2 (0,0)); } - -__funline __m128i _mm_lddqu_si128(__m128i const *__P) { - return (__m128i)__builtin_ia32_lddqu((char const *)__P); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_lddqu_si128 (__m128i const *__P) +{ + return (__m128i) __builtin_ia32_lddqu ((char const *)__P); } - -__funline void _mm_monitor(void const *__P, unsigned int __E, unsigned int __H) { - __builtin_ia32_monitor(__P, __E, __H); +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_monitor (void const * __P, unsigned int __E, unsigned int __H) +{ + __builtin_ia32_monitor (__P, __E, __H); } - -__funline void _mm_mwait(unsigned int __E, unsigned int __H) { - __builtin_ia32_mwait(__E, __H); +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mwait (unsigned int __E, unsigned int __H) +{ + __builtin_ia32_mwait (__E, __H); } - #ifdef __DISABLE_SSE3__ #undef __DISABLE_SSE3__ #pragma GCC pop_options -#endif /* __DISABLE_SSE3__ */ - -#endif /* __x86_64__ */ -#endif /* _PMMINTRIN_H_INCLUDED */ +#endif +#endif +#endif diff --git a/third_party/intel/popcntintrin.internal.h b/third_party/intel/popcntintrin.internal.h index 8f18eb598..18a9f9da1 100644 --- a/third_party/intel/popcntintrin.internal.h +++ b/third_party/intel/popcntintrin.internal.h @@ -1,27 +1,27 @@ +/* clang-format off */ +#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0) #ifndef _POPCNTINTRIN_H_INCLUDED #define _POPCNTINTRIN_H_INCLUDED -#ifdef __x86_64__ - #ifndef __POPCNT__ #pragma GCC push_options #pragma GCC target("popcnt") #define __DISABLE_POPCNT__ -#endif /* __POPCNT__ */ - -__funline int _mm_popcnt_u32(unsigned int __X) { - return __builtin_popcount(__X); +#endif +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_popcnt_u32 (unsigned int __X) +{ + return __builtin_popcount (__X); } - #ifdef __x86_64__ -__funline long long _mm_popcnt_u64(unsigned long long __X) { - return __builtin_popcountll(__X); +extern __inline long long 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_popcnt_u64 (unsigned long long __X) +{ + return __builtin_popcountll (__X); } #endif - #ifdef __DISABLE_POPCNT__ #undef __DISABLE_POPCNT__ #pragma GCC pop_options -#endif /* __DISABLE_POPCNT__ */ - -#endif /* __x86_64__ */ -#endif /* _POPCNTINTRIN_H_INCLUDED */ +#endif +#endif +#endif diff --git a/third_party/intel/prfchwintrin.internal.h b/third_party/intel/prfchwintrin.internal.h index 66f911314..d98fd07de 100644 --- a/third_party/intel/prfchwintrin.internal.h +++ b/third_party/intel/prfchwintrin.internal.h @@ -1,13 +1,14 @@ +/* clang-format off */ +#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0) #if !defined _IMMINTRIN_H_INCLUDED && !defined _MM3DNOW_H_INCLUDED -#error \ - "Never use directly; include or instead." +# error "Never use directly; include or instead." #endif - #ifndef _PRFCHWINTRIN_H_INCLUDED #define _PRFCHWINTRIN_H_INCLUDED - -__funline void _m_prefetchw(void *__P) { - __builtin_prefetch(__P, 1, 3 /* _MM_HINT_T0 */); +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_prefetchw (void *__P) +{ + __builtin_prefetch (__P, 1, 3 ); } - -#endif /* _PRFCHWINTRIN_H_INCLUDED */ +#endif +#endif diff --git a/third_party/intel/rdseedintrin.internal.h b/third_party/intel/rdseedintrin.internal.h index c5125717e..c8975d18c 100644 --- a/third_party/intel/rdseedintrin.internal.h +++ b/third_party/intel/rdseedintrin.internal.h @@ -1,33 +1,38 @@ -#if !defined _IMMINTRIN_H_INCLUDED -#error "Never use directly; include instead." +/* clang-format off */ +#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0) +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use directly; include instead." #endif - #ifndef _RDSEEDINTRIN_H_INCLUDED #define _RDSEEDINTRIN_H_INCLUDED - #ifndef __RDSEED__ #pragma GCC push_options #pragma GCC target("rdseed") #define __DISABLE_RDSEED__ -#endif /* __RDSEED__ */ - -__funline int _rdseed16_step(unsigned short *__p) { - return __builtin_ia32_rdseed_hi_step(__p); +#endif +extern __inline int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_rdseed16_step (unsigned short *__p) +{ + return __builtin_ia32_rdseed_hi_step (__p); } - -__funline int _rdseed32_step(unsigned int *__p) { - return __builtin_ia32_rdseed_si_step(__p); +extern __inline int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_rdseed32_step (unsigned int *__p) +{ + return __builtin_ia32_rdseed_si_step (__p); } - #ifdef __x86_64__ -__funline int _rdseed64_step(unsigned long long *__p) { - return __builtin_ia32_rdseed_di_step(__p); +extern __inline int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_rdseed64_step (unsigned long long *__p) +{ + return __builtin_ia32_rdseed_di_step (__p); } #endif - #ifdef __DISABLE_RDSEED__ #undef __DISABLE_RDSEED__ #pragma GCC pop_options -#endif /* __DISABLE_RDSEED__ */ - -#endif /* _RDSEEDINTRIN_H_INCLUDED */ +#endif +#endif +#endif diff --git a/third_party/intel/rtmintrin.internal.h b/third_party/intel/rtmintrin.internal.h index 370786179..9d950f401 100644 --- a/third_party/intel/rtmintrin.internal.h +++ b/third_party/intel/rtmintrin.internal.h @@ -1,44 +1,48 @@ -#ifndef _IMMINTRIN_H_INCLUDED -#error "Never use directly; include instead." +/* clang-format off */ +#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0) +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use directly; include instead." 
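
/* Usage sketch (illustrative, not part of this patch) for the RDSEED steps
   above. RDSEED draws from the conditioned entropy source and can transiently
   return no data, so the *_step intrinsics yield 0 until a value arrives; the
   retry bound below is an arbitrary choice. */
static int rdseed64_retry(unsigned long long *out) {
  for (int i = 0; i < 128; ++i)
    if (_rdseed64_step(out))
      return 1; /* *out now holds a fresh seed */
  return 0;     /* entropy exhausted; caller should back off */
}
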
#endif - #ifndef _RTMINTRIN_H_INCLUDED #define _RTMINTRIN_H_INCLUDED - #ifndef __RTM__ #pragma GCC push_options #pragma GCC target("rtm") #define __DISABLE_RTM__ -#endif /* __RTM__ */ - -#define _XBEGIN_STARTED (~0u) +#endif +#define _XBEGIN_STARTED (~0u) #define _XABORT_EXPLICIT (1 << 0) -#define _XABORT_RETRY (1 << 1) +#define _XABORT_RETRY (1 << 1) #define _XABORT_CONFLICT (1 << 2) #define _XABORT_CAPACITY (1 << 3) -#define _XABORT_DEBUG (1 << 4) -#define _XABORT_NESTED (1 << 5) -#define _XABORT_CODE(x) (((x) >> 24) & 0xFF) - -__funline unsigned int _xbegin(void) { - return __builtin_ia32_xbegin(); +#define _XABORT_DEBUG (1 << 4) +#define _XABORT_NESTED (1 << 5) +#define _XABORT_CODE(x) (((x) >> 24) & 0xFF) +extern __inline unsigned int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_xbegin (void) +{ + return __builtin_ia32_xbegin (); } - -__funline void _xend(void) { - __builtin_ia32_xend(); +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_xend (void) +{ + __builtin_ia32_xend (); } - #ifdef __OPTIMIZE__ -__funline void _xabort(const unsigned int __imm) { - __builtin_ia32_xabort(__imm); +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_xabort (const unsigned int __imm) +{ + __builtin_ia32_xabort (__imm); } #else -#define _xabort(N) __builtin_ia32_xabort(N) -#endif /* __OPTIMIZE__ */ - +#define _xabort(N) __builtin_ia32_xabort (N) +#endif #ifdef __DISABLE_RTM__ #undef __DISABLE_RTM__ #pragma GCC pop_options -#endif /* __DISABLE_RTM__ */ - -#endif /* _RTMINTRIN_H_INCLUDED */ +#endif +#endif +#endif diff --git a/third_party/intel/serializeintrin.internal.h b/third_party/intel/serializeintrin.internal.h new file mode 100644 index 000000000..f783a91d7 --- /dev/null +++ b/third_party/intel/serializeintrin.internal.h @@ -0,0 +1,19 @@ +/* clang-format off */ +#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0) +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use directly; include instead." 
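
/* Usage sketch (illustrative, not part of this patch) for the RTM intrinsics
   in the rtmintrin hunk above. Every transaction needs a non-transactional
   fallback, since any region may abort; take_fallback_lock() and
   release_fallback_lock() are hypothetical names for that path. */
static void increment_transactionally(long *counter) {
  unsigned status = _xbegin();
  if (status == _XBEGIN_STARTED) {
    ++*counter;               /* transactional region */
    _xend();                  /* commit */
  } else {
    /* aborted: status carries _XABORT_* bits, e.g. _XABORT_RETRY */
    take_fallback_lock();
    ++*counter;
    release_fallback_lock();
  }
}
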
+#endif +#ifndef _SERIALIZE_H_INCLUDED +#define _SERIALIZE_H_INCLUDED +#ifndef __SERIALIZE__ +#pragma GCC push_options +#pragma GCC target("serialize") +#define __DISABLE_SERIALIZE__ +#endif +#define _serialize() __builtin_ia32_serialize () +#ifdef __DISABLE_SERIALIZE__ +#undef __DISABLE_SERIALIZE__ +#pragma GCC pop_options +#endif +#endif +#endif diff --git a/third_party/intel/sgxintrin.internal.h b/third_party/intel/sgxintrin.internal.h index 3e5955943..e6d713811 100644 --- a/third_party/intel/sgxintrin.internal.h +++ b/third_party/intel/sgxintrin.internal.h @@ -1,87 +1,31 @@ +/* clang-format off */ +#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0) #ifndef _SGXINTRIN_H_INCLUDED #define _SGXINTRIN_H_INCLUDED -#ifdef __x86_64__ - #ifndef __SGX__ #pragma GCC push_options #pragma GCC target("sgx") #define __DISABLE_SGX__ -#endif /* __SGX__ */ - -#define __encls_bc(leaf, b, c, retval) \ - __asm__ __volatile__("encls\n\t" \ - : "=a"(retval) \ - : "a"(leaf), "b"(b), "c"(c) \ - : "cc") - -#define __encls_bcd(leaf, b, c, d, retval) \ - __asm__ __volatile__("encls\n\t" \ - : "=a"(retval) \ - : "a"(leaf), "b"(b), "c"(c), "d"(d) \ - : "cc") - -#define __encls_c(leaf, c, retval) \ - __asm__ __volatile__("encls\n\t" : "=a"(retval) : "a"(leaf), "c"(c) : "cc") - -#define __encls_edbgrd(leaf, b, c, retval) \ - __asm__ __volatile__("encls\n\t" : "=a"(retval), "=b"(b) : "a"(leaf), "c"(c)) - -#define __encls_generic(leaf, b, c, d, retval) \ - __asm__ __volatile__("encls\n\t" \ - : "=a"(retval), "=b"(b), "=c"(c), "=d"(d) \ - : "a"(leaf), "b"(b), "c"(c), "d"(d) \ - : "cc") - -#define __enclu_bc(leaf, b, c, retval) \ - __asm__ __volatile__("enclu\n\t" \ - : "=a"(retval) \ - : "a"(leaf), "b"(b), "c"(c) \ - : "cc") - -#define __enclu_bcd(leaf, b, c, d, retval) \ - __asm__ __volatile__("enclu\n\t" \ - : "=a"(retval) \ - : "a"(leaf), "b"(b), "c"(c), "d"(d) \ - : "cc") - -#define __enclu_eenter(leaf, b, c, retval) \ - __asm__ __volatile__("enclu\n\t" \ - : "=a"(retval), "=c"(c) \ - : "a"(leaf), "b"(b), "c"(c) \ - : "cc") - -#define __enclu_eexit(leaf, b, c, retval) \ - __asm__ __volatile__("enclu\n\t" \ - : "=a"(retval), "=c"(c) \ - : "a"(leaf), "b"(b) \ - : "cc") - -#define __enclu_generic(leaf, b, c, d, retval) \ - __asm__ __volatile__("enclu\n\t" \ - : "=a"(retval), "=b"(b), "=c"(c), "=d"(d) \ - : "a"(leaf), "b"(b), "c"(c), "d"(d) \ - : "cc") - -#define __enclv_bc(leaf, b, c, retval) \ - __asm__ __volatile__("enclv\n\t" \ - : "=a"(retval) \ - : "a"(leaf), "b"(b), "c"(c) \ - : "cc") - -#define __enclv_cd(leaf, c, d, retval) \ - __asm__ __volatile__("enclv\n\t" \ - : "=a"(retval) \ - : "a"(leaf), "c"(c), "d"(d) \ - : "cc") - -#define __enclv_generic(leaf, b, c, d, retval) \ - __asm__ __volatile__("enclv\n\t" \ - : "=a"(retval), "=b"(b), "=c"(b), "=d"(d) \ - : "a"(leaf), "b"(b), "c"(c), "d"(d) \ - : "cc") - -__funline unsigned int _encls_u32(const unsigned int __L, size_t __D[]) { - enum __encls_type { +#endif +#define __encls_bc(leaf, b, c, retval) __asm__ __volatile__ ("encls\n\t" : "=a" (retval) : "a" (leaf), "b" (b), "c" (c) : "cc") +#define __encls_bcd(leaf, b, c, d, retval) __asm__ __volatile__("encls\n\t" : "=a" (retval) : "a" (leaf), "b" (b), "c" (c), "d" (d) : "cc") +#define __encls_c(leaf, c, retval) __asm__ __volatile__("encls\n\t" : "=a" (retval) : "a" (leaf), "c" (c) : "cc") +#define __encls_edbgrd(leaf, b, c, retval) __asm__ __volatile__("encls\n\t" : "=a" (retval), "=b" (b) : "a" (leaf), "c" (c)) +#define __encls_generic(leaf, b, c, d, retval) __asm__ __volatile__("encls\n\t" : "=a" 
(retval), "=b" (b), "=c" (c), "=d" (d) : "a" (leaf), "b" (b), "c" (c), "d" (d) : "cc") +#define __enclu_bc(leaf, b, c, retval) __asm__ __volatile__("enclu\n\t" : "=a" (retval) : "a" (leaf), "b" (b), "c" (c) : "cc") +#define __enclu_bcd(leaf, b, c, d, retval) __asm__ __volatile__("enclu\n\t" : "=a" (retval) : "a" (leaf), "b" (b), "c" (c), "d" (d) : "cc") +#define __enclu_eenter(leaf, b, c, retval) __asm__ __volatile__("enclu\n\t" : "=a" (retval), "=c" (c) : "a" (leaf), "b" (b), "c" (c) : "cc") +#define __enclu_eexit(leaf, b, c, retval) __asm__ __volatile__("enclu\n\t" : "=a" (retval), "=c" (c) : "a" (leaf), "b" (b) : "cc") +#define __enclu_generic(leaf, b, c, d, retval) __asm__ __volatile__("enclu\n\t" : "=a" (retval), "=b" (b), "=c" (c), "=d" (d) : "a" (leaf), "b" (b), "c" (c), "d" (d) : "cc") +#define __enclv_bc(leaf, b, c, retval) __asm__ __volatile__("enclv\n\t" : "=a" (retval) : "a" (leaf), "b" (b), "c" (c) : "cc") +#define __enclv_cd(leaf, c, d, retval) __asm__ __volatile__("enclv\n\t" : "=a" (retval) : "a" (leaf), "c" (c), "d" (d) : "cc") +#define __enclv_generic(leaf, b, c, d, retval) __asm__ __volatile__("enclv\n\t" : "=a" (retval), "=b" (b), "=c" (b), "=d" (d) : "a" (leaf), "b" (b), "c" (c), "d" (d) : "cc") +extern __inline unsigned int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_encls_u32 (const unsigned int __L, size_t __D[]) +{ + enum __encls_type + { __SGX_ECREATE = 0x00, __SGX_EADD = 0x01, __SGX_EINIT = 0x02, @@ -105,46 +49,49 @@ __funline unsigned int _encls_u32(const unsigned int __L, size_t __D[]) { }; enum __encls_type __T = (enum __encls_type)__L; unsigned int __R = 0; - if (!__builtin_constant_p(__T)) - __encls_generic(__L, __D[0], __D[1], __D[2], __R); - else - switch (__T) { - case __SGX_ECREATE: - case __SGX_EADD: - case __SGX_EDBGWR: - case __SGX_EEXTEND: - case __SGX_EPA: - case __SGX_EMODPR: - case __SGX_EMODT: - case __SGX_EAUG: - case __SGX_ERDINFO: - __encls_bc(__L, __D[0], __D[1], __R); - break; - case __SGX_EINIT: - case __SGX_ELDB: - case __SGX_ELDU: - case __SGX_EWB: - case __SGX_ELDBC: - case __SGX_ELDUC: - __encls_bcd(__L, __D[0], __D[1], __D[2], __R); - break; - case __SGX_EREMOVE: - case __SGX_EBLOCK: - case __SGX_ETRACK: - case __SGX_ETRACKC: - __encls_c(__L, __D[1], __R); - break; - case __SGX_EDBGRD: - __encls_edbgrd(__L, __D[0], __D[1], __R); - break; - default: - __encls_generic(__L, __D[0], __D[1], __D[2], __R); + if (!__builtin_constant_p (__T)) + __encls_generic (__L, __D[0], __D[1], __D[2], __R); + else switch (__T) + { + case __SGX_ECREATE: + case __SGX_EADD: + case __SGX_EDBGWR: + case __SGX_EEXTEND: + case __SGX_EPA: + case __SGX_EMODPR: + case __SGX_EMODT: + case __SGX_EAUG: + case __SGX_ERDINFO: + __encls_bc (__L, __D[0], __D[1], __R); + break; + case __SGX_EINIT: + case __SGX_ELDB: + case __SGX_ELDU: + case __SGX_EWB: + case __SGX_ELDBC: + case __SGX_ELDUC: + __encls_bcd (__L, __D[0], __D[1], __D[2], __R); + break; + case __SGX_EREMOVE: + case __SGX_EBLOCK: + case __SGX_ETRACK: + case __SGX_ETRACKC: + __encls_c (__L, __D[1], __R); + break; + case __SGX_EDBGRD: + __encls_edbgrd (__L, __D[0], __D[1], __R); + break; + default: + __encls_generic (__L, __D[0], __D[1], __D[2], __R); } return __R; } - -__funline unsigned int _enclu_u32(const unsigned int __L, size_t __D[]) { - enum __enclu_type { +extern __inline unsigned int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_enclu_u32 (const unsigned int __L, size_t __D[]) +{ + enum __enclu_type + { __SGX_EREPORT = 0x00, __SGX_EGETKEY = 0x01, 
__SGX_EENTER = 0x02, @@ -154,62 +101,63 @@ __funline unsigned int _enclu_u32(const unsigned int __L, size_t __D[]) { __SGX_EMODPE = 0x06, __SGX_EACCEPTCOPY = 0x07 }; - enum __enclu_type __T = (enum __enclu_type)__L; + enum __enclu_type __T = (enum __enclu_type) __L; unsigned int __R = 0; - if (!__builtin_constant_p(__T)) - __enclu_generic(__L, __D[0], __D[1], __D[2], __R); - else - switch (__T) { - case __SGX_EREPORT: - case __SGX_EACCEPTCOPY: - __enclu_bcd(__L, __D[0], __D[1], __D[2], __R); - break; - case __SGX_EGETKEY: - case __SGX_ERESUME: - case __SGX_EACCEPT: - case __SGX_EMODPE: - __enclu_bc(__L, __D[0], __D[1], __R); - break; - case __SGX_EENTER: - __enclu_eenter(__L, __D[0], __D[1], __R); - break; - case __SGX_EEXIT: - __enclu_eexit(__L, __D[0], __D[1], __R); - break; - default: - __enclu_generic(__L, __D[0], __D[1], __D[2], __R); + if (!__builtin_constant_p (__T)) + __enclu_generic (__L, __D[0], __D[1], __D[2], __R); + else switch (__T) + { + case __SGX_EREPORT: + case __SGX_EACCEPTCOPY: + __enclu_bcd (__L, __D[0], __D[1], __D[2], __R); + break; + case __SGX_EGETKEY: + case __SGX_ERESUME: + case __SGX_EACCEPT: + case __SGX_EMODPE: + __enclu_bc (__L, __D[0], __D[1], __R); + break; + case __SGX_EENTER: + __enclu_eenter (__L, __D[0], __D[1], __R); + break; + case __SGX_EEXIT: + __enclu_eexit (__L, __D[0], __D[1], __R); + break; + default: + __enclu_generic (__L, __D[0], __D[1], __D[2], __R); } return __R; } - -__funline unsigned int _enclv_u32(const unsigned int __L, size_t __D[]) { - enum __enclv_type { +extern __inline unsigned int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_enclv_u32 (const unsigned int __L, size_t __D[]) +{ + enum __enclv_type + { __SGX_EDECVIRTCHILD = 0x00, __SGX_EINCVIRTCHILD = 0x01, __SGX_ESETCONTEXT = 0x02 }; unsigned int __R = 0; - if (!__builtin_constant_p(__L)) - __enclv_generic(__L, __D[0], __D[1], __D[2], __R); - else - switch (__L) { - case __SGX_EDECVIRTCHILD: - case __SGX_EINCVIRTCHILD: - __enclv_bc(__L, __D[0], __D[1], __R); - break; - case __SGX_ESETCONTEXT: - __enclv_cd(__L, __D[1], __D[2], __R); - break; - default: - __enclv_generic(__L, __D[0], __D[1], __D[2], __R); + if (!__builtin_constant_p (__L)) + __enclv_generic (__L, __D[0], __D[1], __D[2], __R); + else switch (__L) + { + case __SGX_EDECVIRTCHILD: + case __SGX_EINCVIRTCHILD: + __enclv_bc (__L, __D[0], __D[1], __R); + break; + case __SGX_ESETCONTEXT: + __enclv_cd (__L, __D[1], __D[2], __R); + break; + default: + __enclv_generic (__L, __D[0], __D[1], __D[2], __R); } return __R; } - #ifdef __DISABLE_SGX__ #undef __DISABLE_SGX__ #pragma GCC pop_options -#endif /* __DISABLE_SGX__ */ - -#endif /* __x86_64__ */ -#endif /* _SGXINTRIN_H_INCLUDED */ +#endif +#endif +#endif diff --git a/third_party/intel/shaintrin.internal.h b/third_party/intel/shaintrin.internal.h index 1d3a6c139..914db8cc3 100644 --- a/third_party/intel/shaintrin.internal.h +++ b/third_party/intel/shaintrin.internal.h @@ -1,54 +1,65 @@ +/* clang-format off */ +#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0) #ifndef _IMMINTRIN_H_INCLUDED #error "Never use directly; include instead." 
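
/* Usage sketch (illustrative, not part of this patch) for the SHA-256
   intrinsics declared in the shaintrin hunk below. Assumes state0/state1
   already hold the eight hash words in the ABEF/CDGH layout SHA-NI expects
   and that msg carries W[t..t+3] pre-added with the round constants. */
static void sha256_four_rounds(__m128i *state0, __m128i *state1, __m128i msg) {
  *state1 = _mm_sha256rnds2_epu32(*state1, *state0, msg); /* rounds t, t+1 */
  msg = _mm_shuffle_epi32(msg, 0x0e);                     /* expose W[t+2..t+3] */
  *state0 = _mm_sha256rnds2_epu32(*state0, *state1, msg); /* rounds t+2, t+3 */
}
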
#endif - #ifndef _SHAINTRIN_H_INCLUDED #define _SHAINTRIN_H_INCLUDED - #ifndef __SHA__ #pragma GCC push_options #pragma GCC target("sha") #define __DISABLE_SHA__ -#endif /* __SHA__ */ - -__funline __m128i _mm_sha1msg1_epu32(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_sha1msg1((__v4si)__A, (__v4si)__B); +#endif +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sha1msg1_epu32 (__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_sha1msg1 ((__v4si) __A, (__v4si) __B); } - -__funline __m128i _mm_sha1msg2_epu32(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_sha1msg2((__v4si)__A, (__v4si)__B); +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sha1msg2_epu32 (__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_sha1msg2 ((__v4si) __A, (__v4si) __B); } - -__funline __m128i _mm_sha1nexte_epu32(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_sha1nexte((__v4si)__A, (__v4si)__B); +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sha1nexte_epu32 (__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_sha1nexte ((__v4si) __A, (__v4si) __B); } - #ifdef __OPTIMIZE__ -__funline __m128i _mm_sha1rnds4_epu32(__m128i __A, __m128i __B, const int __I) { - return (__m128i)__builtin_ia32_sha1rnds4((__v4si)__A, (__v4si)__B, __I); +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sha1rnds4_epu32 (__m128i __A, __m128i __B, const int __I) +{ + return (__m128i) __builtin_ia32_sha1rnds4 ((__v4si) __A, (__v4si) __B, __I); } #else -#define _mm_sha1rnds4_epu32(A, B, I) \ - ((__m128i)__builtin_ia32_sha1rnds4((__v4si)(__m128i)A, (__v4si)(__m128i)B, \ - (int)I)) +#define _mm_sha1rnds4_epu32(A, B, I) ((__m128i) __builtin_ia32_sha1rnds4 ((__v4si)(__m128i)(A), (__v4si)(__m128i)(B), (int)(I))) #endif - -__funline __m128i _mm_sha256msg1_epu32(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_sha256msg1((__v4si)__A, (__v4si)__B); +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sha256msg1_epu32 (__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_sha256msg1 ((__v4si) __A, (__v4si) __B); } - -__funline __m128i _mm_sha256msg2_epu32(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_sha256msg2((__v4si)__A, (__v4si)__B); +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sha256msg2_epu32 (__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_sha256msg2 ((__v4si) __A, (__v4si) __B); } - -__funline __m128i _mm_sha256rnds2_epu32(__m128i __A, __m128i __B, __m128i __C) { - return (__m128i)__builtin_ia32_sha256rnds2((__v4si)__A, (__v4si)__B, - (__v4si)__C); +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sha256rnds2_epu32 (__m128i __A, __m128i __B, __m128i __C) +{ + return (__m128i) __builtin_ia32_sha256rnds2 ((__v4si) __A, (__v4si) __B, + (__v4si) __C); } - #ifdef __DISABLE_SHA__ #undef __DISABLE_SHA__ #pragma GCC pop_options -#endif /* __DISABLE_SHA__ */ - -#endif /* _SHAINTRIN_H_INCLUDED */ +#endif +#endif +#endif diff --git a/third_party/intel/smmintrin.internal.h b/third_party/intel/smmintrin.internal.h index 022fdb427..aed50ea41 100644 --- a/third_party/intel/smmintrin.internal.h +++ b/third_party/intel/smmintrin.internal.h @@ -1,577 +1,569 @@ +/* clang-format off */ +#if 
defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0) #ifndef _SMMINTRIN_H_INCLUDED #define _SMMINTRIN_H_INCLUDED -#ifdef __x86_64__ #include "third_party/intel/tmmintrin.internal.h" - #ifndef __SSE4_1__ #pragma GCC push_options #pragma GCC target("sse4.1") #define __DISABLE_SSE4_1__ -#endif /* __SSE4_1__ */ - +#endif #define _MM_FROUND_TO_NEAREST_INT 0x00 -#define _MM_FROUND_TO_NEG_INF 0x01 -#define _MM_FROUND_TO_POS_INF 0x02 -#define _MM_FROUND_TO_ZERO 0x03 -#define _MM_FROUND_CUR_DIRECTION 0x04 - +#define _MM_FROUND_TO_NEG_INF 0x01 +#define _MM_FROUND_TO_POS_INF 0x02 +#define _MM_FROUND_TO_ZERO 0x03 +#define _MM_FROUND_CUR_DIRECTION 0x04 #define _MM_FROUND_RAISE_EXC 0x00 -#define _MM_FROUND_NO_EXC 0x08 - -#define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC) -#define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC) -#define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC) -#define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC) -#define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_NO_EXC 0x08 +#define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC) #define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC) - -__funline int _mm_testz_si128(__m128i __M, __m128i __V) { - return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V); +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_testz_si128 (__m128i __M, __m128i __V) +{ + return __builtin_ia32_ptestz128 ((__v2di)__M, (__v2di)__V); } - -__funline int _mm_testc_si128(__m128i __M, __m128i __V) { - return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V); +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_testc_si128 (__m128i __M, __m128i __V) +{ + return __builtin_ia32_ptestc128 ((__v2di)__M, (__v2di)__V); } - -__funline int _mm_testnzc_si128(__m128i __M, __m128i __V) { - return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V); +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_testnzc_si128 (__m128i __M, __m128i __V) +{ + return __builtin_ia32_ptestnzc128 ((__v2di)__M, (__v2di)__V); } - -#define _mm_test_all_zeros(M, V) _mm_testz_si128((M), (V)) - -#define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_cmpeq_epi32((V), (V))) - -#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V)) - +#define _mm_test_all_zeros(M, V) _mm_testz_si128 ((M), (V)) +#define _mm_test_all_ones(V) _mm_testc_si128 ((V), _mm_cmpeq_epi32 ((V), (V))) +#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128 ((M), (V)) #ifdef __OPTIMIZE__ -__funline __m128d _mm_round_pd(__m128d __V, const int __M) { - return (__m128d)__builtin_ia32_roundpd((__v2df)__V, __M); +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_round_pd (__m128d __V, const int __M) +{ + return (__m128d) __builtin_ia32_roundpd ((__v2df)__V, __M); } - -__funline __m128d _mm_round_sd(__m128d __D, __m128d __V, const int __M) { - return (__m128d)__builtin_ia32_roundsd((__v2df)__D, (__v2df)__V, __M); +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm_round_sd(__m128d __D, __m128d __V, const int __M) +{ + return (__m128d) __builtin_ia32_roundsd ((__v2df)__D, + (__v2df)__V, + __M); } #else -#define _mm_round_pd(V, M) \ - ((__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(V), (int)(M))) - -#define _mm_round_sd(D, V, M) \ - ((__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(D), (__v2df)(__m128d)(V), \ - (int)(M))) +#define _mm_round_pd(V, M) ((__m128d) __builtin_ia32_roundpd ((__v2df)(__m128d)(V), (int)(M))) +#define _mm_round_sd(D, V, M) ((__m128d) __builtin_ia32_roundsd ((__v2df)(__m128d)(D), (__v2df)(__m128d)(V), (int)(M))) #endif - #ifdef __OPTIMIZE__ -__funline __m128 _mm_round_ps(__m128 __V, const int __M) { - return (__m128)__builtin_ia32_roundps((__v4sf)__V, __M); +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_round_ps (__m128 __V, const int __M) +{ + return (__m128) __builtin_ia32_roundps ((__v4sf)__V, __M); } - -__funline __m128 _mm_round_ss(__m128 __D, __m128 __V, const int __M) { - return (__m128)__builtin_ia32_roundss((__v4sf)__D, (__v4sf)__V, __M); +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_round_ss (__m128 __D, __m128 __V, const int __M) +{ + return (__m128) __builtin_ia32_roundss ((__v4sf)__D, + (__v4sf)__V, + __M); } #else -#define _mm_round_ps(V, M) \ - ((__m128)__builtin_ia32_roundps((__v4sf)(__m128)(V), (int)(M))) - -#define _mm_round_ss(D, V, M) \ - ((__m128)__builtin_ia32_roundss((__v4sf)(__m128)(D), (__v4sf)(__m128)(V), \ - (int)(M))) +#define _mm_round_ps(V, M) ((__m128) __builtin_ia32_roundps ((__v4sf)(__m128)(V), (int)(M))) +#define _mm_round_ss(D, V, M) ((__m128) __builtin_ia32_roundss ((__v4sf)(__m128)(D), (__v4sf)(__m128)(V), (int)(M))) #endif - -#define _mm_ceil_pd(V) _mm_round_pd((V), _MM_FROUND_CEIL) -#define _mm_ceil_sd(D, V) _mm_round_sd((D), (V), _MM_FROUND_CEIL) - -#define _mm_floor_pd(V) _mm_round_pd((V), _MM_FROUND_FLOOR) -#define _mm_floor_sd(D, V) _mm_round_sd((D), (V), _MM_FROUND_FLOOR) - -#define _mm_ceil_ps(V) _mm_round_ps((V), _MM_FROUND_CEIL) -#define _mm_ceil_ss(D, V) _mm_round_ss((D), (V), _MM_FROUND_CEIL) - -#define _mm_floor_ps(V) _mm_round_ps((V), _MM_FROUND_FLOOR) -#define _mm_floor_ss(D, V) _mm_round_ss((D), (V), _MM_FROUND_FLOOR) - +#define _mm_ceil_pd(V) _mm_round_pd ((V), _MM_FROUND_CEIL) +#define _mm_ceil_sd(D, V) _mm_round_sd ((D), (V), _MM_FROUND_CEIL) +#define _mm_floor_pd(V) _mm_round_pd((V), _MM_FROUND_FLOOR) +#define _mm_floor_sd(D, V) _mm_round_sd ((D), (V), _MM_FROUND_FLOOR) +#define _mm_ceil_ps(V) _mm_round_ps ((V), _MM_FROUND_CEIL) +#define _mm_ceil_ss(D, V) _mm_round_ss ((D), (V), _MM_FROUND_CEIL) +#define _mm_floor_ps(V) _mm_round_ps ((V), _MM_FROUND_FLOOR) +#define _mm_floor_ss(D, V) _mm_round_ss ((D), (V), _MM_FROUND_FLOOR) #ifdef __OPTIMIZE__ -__funline __m128i _mm_blend_epi16(__m128i __X, __m128i __Y, const int __M) { - return (__m128i)__builtin_ia32_pblendw128((__v8hi)__X, (__v8hi)__Y, __M); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_blend_epi16 (__m128i __X, __m128i __Y, const int __M) +{ + return (__m128i) __builtin_ia32_pblendw128 ((__v8hi)__X, + (__v8hi)__Y, + __M); } #else -#define _mm_blend_epi16(X, Y, M) \ - ((__m128i)__builtin_ia32_pblendw128((__v8hi)(__m128i)(X), \ - (__v8hi)(__m128i)(Y), (int)(M))) +#define _mm_blend_epi16(X, Y, M) ((__m128i) __builtin_ia32_pblendw128 ((__v8hi)(__m128i)(X), (__v8hi)(__m128i)(Y), (int)(M))) #endif - -__funline __m128i _mm_blendv_epi8(__m128i __X, __m128i __Y, __m128i 
__M) { - return (__m128i)__builtin_ia32_pblendvb128((__v16qi)__X, (__v16qi)__Y, - (__v16qi)__M); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_blendv_epi8 (__m128i __X, __m128i __Y, __m128i __M) +{ + return (__m128i) __builtin_ia32_pblendvb128 ((__v16qi)__X, + (__v16qi)__Y, + (__v16qi)__M); } - #ifdef __OPTIMIZE__ -__funline __m128 _mm_blend_ps(__m128 __X, __m128 __Y, const int __M) { - return (__m128)__builtin_ia32_blendps((__v4sf)__X, (__v4sf)__Y, __M); +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_blend_ps (__m128 __X, __m128 __Y, const int __M) +{ + return (__m128) __builtin_ia32_blendps ((__v4sf)__X, + (__v4sf)__Y, + __M); } #else -#define _mm_blend_ps(X, Y, M) \ - ((__m128)__builtin_ia32_blendps((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), \ - (int)(M))) +#define _mm_blend_ps(X, Y, M) ((__m128) __builtin_ia32_blendps ((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (int)(M))) #endif - -__funline __m128 _mm_blendv_ps(__m128 __X, __m128 __Y, __m128 __M) { - return (__m128)__builtin_ia32_blendvps((__v4sf)__X, (__v4sf)__Y, (__v4sf)__M); +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_blendv_ps (__m128 __X, __m128 __Y, __m128 __M) +{ + return (__m128) __builtin_ia32_blendvps ((__v4sf)__X, + (__v4sf)__Y, + (__v4sf)__M); } - #ifdef __OPTIMIZE__ -__funline __m128d _mm_blend_pd(__m128d __X, __m128d __Y, const int __M) { - return (__m128d)__builtin_ia32_blendpd((__v2df)__X, (__v2df)__Y, __M); +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_blend_pd (__m128d __X, __m128d __Y, const int __M) +{ + return (__m128d) __builtin_ia32_blendpd ((__v2df)__X, + (__v2df)__Y, + __M); } #else -#define _mm_blend_pd(X, Y, M) \ - ((__m128d)__builtin_ia32_blendpd((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), \ - (int)(M))) +#define _mm_blend_pd(X, Y, M) ((__m128d) __builtin_ia32_blendpd ((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (int)(M))) #endif - -__funline __m128d _mm_blendv_pd(__m128d __X, __m128d __Y, __m128d __M) { - return (__m128d)__builtin_ia32_blendvpd((__v2df)__X, (__v2df)__Y, - (__v2df)__M); +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_blendv_pd (__m128d __X, __m128d __Y, __m128d __M) +{ + return (__m128d) __builtin_ia32_blendvpd ((__v2df)__X, + (__v2df)__Y, + (__v2df)__M); } - #ifdef __OPTIMIZE__ -__funline __m128 _mm_dp_ps(__m128 __X, __m128 __Y, const int __M) { - return (__m128)__builtin_ia32_dpps((__v4sf)__X, (__v4sf)__Y, __M); +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_dp_ps (__m128 __X, __m128 __Y, const int __M) +{ + return (__m128) __builtin_ia32_dpps ((__v4sf)__X, + (__v4sf)__Y, + __M); } - -__funline __m128d _mm_dp_pd(__m128d __X, __m128d __Y, const int __M) { - return (__m128d)__builtin_ia32_dppd((__v2df)__X, (__v2df)__Y, __M); +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_dp_pd (__m128d __X, __m128d __Y, const int __M) +{ + return (__m128d) __builtin_ia32_dppd ((__v2df)__X, + (__v2df)__Y, + __M); } #else -#define _mm_dp_ps(X, Y, M) \ - ((__m128)__builtin_ia32_dpps((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), \ - (int)(M))) - -#define _mm_dp_pd(X, Y, M) \ - ((__m128d)__builtin_ia32_dppd((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), \ - (int)(M))) +#define _mm_dp_ps(X, Y, M) ((__m128) __builtin_ia32_dpps ((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), 
(int)(M))) +#define _mm_dp_pd(X, Y, M) ((__m128d) __builtin_ia32_dppd ((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (int)(M))) #endif - -__funline __m128i _mm_cmpeq_epi64(__m128i __X, __m128i __Y) { - return (__m128i)((__v2di)__X == (__v2di)__Y); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_epi64 (__m128i __X, __m128i __Y) +{ + return (__m128i) ((__v2di)__X == (__v2di)__Y); } - -__funline __m128i _mm_min_epi8(__m128i __X, __m128i __Y) { - return (__m128i)__builtin_ia32_pminsb128((__v16qi)__X, (__v16qi)__Y); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_epi8 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_pminsb128 ((__v16qi)__X, (__v16qi)__Y); } - -__funline __m128i _mm_max_epi8(__m128i __X, __m128i __Y) { - return (__m128i)__builtin_ia32_pmaxsb128((__v16qi)__X, (__v16qi)__Y); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_epi8 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_pmaxsb128 ((__v16qi)__X, (__v16qi)__Y); } - -__funline __m128i _mm_min_epu16(__m128i __X, __m128i __Y) { - return (__m128i)__builtin_ia32_pminuw128((__v8hi)__X, (__v8hi)__Y); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_epu16 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_pminuw128 ((__v8hi)__X, (__v8hi)__Y); } - -__funline __m128i _mm_max_epu16(__m128i __X, __m128i __Y) { - return (__m128i)__builtin_ia32_pmaxuw128((__v8hi)__X, (__v8hi)__Y); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_epu16 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_pmaxuw128 ((__v8hi)__X, (__v8hi)__Y); } - -__funline __m128i _mm_min_epi32(__m128i __X, __m128i __Y) { - return (__m128i)__builtin_ia32_pminsd128((__v4si)__X, (__v4si)__Y); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_epi32 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_pminsd128 ((__v4si)__X, (__v4si)__Y); } - -__funline __m128i _mm_max_epi32(__m128i __X, __m128i __Y) { - return (__m128i)__builtin_ia32_pmaxsd128((__v4si)__X, (__v4si)__Y); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_epi32 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_pmaxsd128 ((__v4si)__X, (__v4si)__Y); } - -__funline __m128i _mm_min_epu32(__m128i __X, __m128i __Y) { - return (__m128i)__builtin_ia32_pminud128((__v4si)__X, (__v4si)__Y); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_epu32 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_pminud128 ((__v4si)__X, (__v4si)__Y); } - -__funline __m128i _mm_max_epu32(__m128i __X, __m128i __Y) { - return (__m128i)__builtin_ia32_pmaxud128((__v4si)__X, (__v4si)__Y); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_epu32 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_pmaxud128 ((__v4si)__X, (__v4si)__Y); } - -__funline __m128i _mm_mullo_epi32(__m128i __X, __m128i __Y) { - return (__m128i)((__v4su)__X * (__v4su)__Y); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mullo_epi32 (__m128i __X, __m128i __Y) +{ + return (__m128i) ((__v4su)__X * (__v4su)__Y); } - -__funline __m128i _mm_mul_epi32(__m128i __X, __m128i __Y) 
{ - return (__m128i)__builtin_ia32_pmuldq128((__v4si)__X, (__v4si)__Y); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mul_epi32 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_pmuldq128 ((__v4si)__X, (__v4si)__Y); } - #ifdef __OPTIMIZE__ -__funline __m128 _mm_insert_ps(__m128 __D, __m128 __S, const int __N) { - return (__m128)__builtin_ia32_insertps128((__v4sf)__D, (__v4sf)__S, __N); +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_insert_ps (__m128 __D, __m128 __S, const int __N) +{ + return (__m128) __builtin_ia32_insertps128 ((__v4sf)__D, + (__v4sf)__S, + __N); } #else -#define _mm_insert_ps(D, S, N) \ - ((__m128)__builtin_ia32_insertps128((__v4sf)(__m128)(D), \ - (__v4sf)(__m128)(S), (int)(N))) +#define _mm_insert_ps(D, S, N) ((__m128) __builtin_ia32_insertps128 ((__v4sf)(__m128)(D), (__v4sf)(__m128)(S), (int)(N))) #endif - #define _MM_MK_INSERTPS_NDX(S, D, M) (((S) << 6) | ((D) << 4) | (M)) - #ifdef __OPTIMIZE__ -__funline int _mm_extract_ps(__m128 __X, const int __N) { - union { - int i; - float f; - } __tmp; - __tmp.f = __builtin_ia32_vec_ext_v4sf((__v4sf)__X, __N); +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_extract_ps (__m128 __X, const int __N) +{ + union { int i; float f; } __tmp; + __tmp.f = __builtin_ia32_vec_ext_v4sf ((__v4sf)__X, __N); return __tmp.i; } #else -#define _mm_extract_ps(X, N) \ - (__extension__({ \ - union { \ - int i; \ - float f; \ - } __tmp; \ - __tmp.f = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)); \ - __tmp.i; \ - })) +#define _mm_extract_ps(X, N) (__extension__ ({ union { int i; float f; } __tmp; __tmp.f = __builtin_ia32_vec_ext_v4sf ((__v4sf)(__m128)(X), (int)(N)); __tmp.i; })) #endif - -#define _MM_EXTRACT_FLOAT(D, S, N) \ - { (D) = __builtin_ia32_vec_ext_v4sf((__v4sf)(S), (N)); } - -#define _MM_PICK_OUT_PS(X, N) \ - _mm_insert_ps(_mm_setzero_ps(), (X), _MM_MK_INSERTPS_NDX((N), 0, 0x0e)) - +#define _MM_EXTRACT_FLOAT(D, S, N) { (D) = __builtin_ia32_vec_ext_v4sf ((__v4sf)(S), (N)); } +#define _MM_PICK_OUT_PS(X, N) _mm_insert_ps (_mm_setzero_ps (), (X), _MM_MK_INSERTPS_NDX ((N), 0, 0x0e)) #ifdef __OPTIMIZE__ -__funline __m128i _mm_insert_epi8(__m128i __D, int __S, const int __N) { - return (__m128i)__builtin_ia32_vec_set_v16qi((__v16qi)__D, __S, __N); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_insert_epi8 (__m128i __D, int __S, const int __N) +{ + return (__m128i) __builtin_ia32_vec_set_v16qi ((__v16qi)__D, + __S, __N); } - -__funline __m128i _mm_insert_epi32(__m128i __D, int __S, const int __N) { - return (__m128i)__builtin_ia32_vec_set_v4si((__v4si)__D, __S, __N); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_insert_epi32 (__m128i __D, int __S, const int __N) +{ + return (__m128i) __builtin_ia32_vec_set_v4si ((__v4si)__D, + __S, __N); } - #ifdef __x86_64__ -__funline __m128i _mm_insert_epi64(__m128i __D, long long __S, const int __N) { - return (__m128i)__builtin_ia32_vec_set_v2di((__v2di)__D, __S, __N); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_insert_epi64 (__m128i __D, long long __S, const int __N) +{ + return (__m128i) __builtin_ia32_vec_set_v2di ((__v2di)__D, + __S, __N); } #endif #else -#define _mm_insert_epi8(D, S, N) \ - ((__m128i)__builtin_ia32_vec_set_v16qi((__v16qi)(__m128i)(D), (int)(S), \ - (int)(N))) - 
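
/* Usage sketch (illustrative, not part of this patch) for the SSE4.1
   insert/extract intrinsics in this hunk. The lane index must be a
   compile-time constant, which is why the !__OPTIMIZE__ build swaps the
   inline functions for macros. */
static int roundtrip_lane1(void) {
  __m128i v = _mm_setzero_si128();
  v = _mm_insert_epi32(v, 42, 1);  /* write 42 into 32-bit lane 1 */
  return _mm_extract_epi32(v, 1);  /* returns 42 */
}
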
-#define _mm_insert_epi32(D, S, N) \
-  ((__m128i)__builtin_ia32_vec_set_v4si((__v4si)(__m128i)(D), (int)(S), \
-                                        (int)(N)))
-
+#define _mm_insert_epi8(D, S, N) ((__m128i) __builtin_ia32_vec_set_v16qi ((__v16qi)(__m128i)(D), (int)(S), (int)(N)))
+#define _mm_insert_epi32(D, S, N) ((__m128i) __builtin_ia32_vec_set_v4si ((__v4si)(__m128i)(D), (int)(S), (int)(N)))
 #ifdef __x86_64__
-#define _mm_insert_epi64(D, S, N) \
-  ((__m128i)__builtin_ia32_vec_set_v2di((__v2di)(__m128i)(D), (long long)(S), \
-                                        (int)(N)))
+#define _mm_insert_epi64(D, S, N) ((__m128i) __builtin_ia32_vec_set_v2di ((__v2di)(__m128i)(D), (long long)(S), (int)(N)))
 #endif
 #endif
-
 #ifdef __OPTIMIZE__
-__funline int _mm_extract_epi8(__m128i __X, const int __N) {
-  return (unsigned char)__builtin_ia32_vec_ext_v16qi((__v16qi)__X, __N);
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_extract_epi8 (__m128i __X, const int __N)
+{
+  return (unsigned char) __builtin_ia32_vec_ext_v16qi ((__v16qi)__X, __N);
 }
-
-__funline int _mm_extract_epi32(__m128i __X, const int __N) {
-  return __builtin_ia32_vec_ext_v4si((__v4si)__X, __N);
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_extract_epi32 (__m128i __X, const int __N)
+{
+  return __builtin_ia32_vec_ext_v4si ((__v4si)__X, __N);
 }
-
 #ifdef __x86_64__
-__funline long long _mm_extract_epi64(__m128i __X, const int __N) {
-  return __builtin_ia32_vec_ext_v2di((__v2di)__X, __N);
+extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_extract_epi64 (__m128i __X, const int __N)
+{
+  return __builtin_ia32_vec_ext_v2di ((__v2di)__X, __N);
 }
 #endif
 #else
-#define _mm_extract_epi8(X, N) \
-  ((int)(unsigned char)__builtin_ia32_vec_ext_v16qi((__v16qi)(__m128i)(X), \
-                                                    (int)(N)))
-#define _mm_extract_epi32(X, N) \
-  ((int)__builtin_ia32_vec_ext_v4si((__v4si)(__m128i)(X), (int)(N)))
-
+#define _mm_extract_epi8(X, N) ((int) (unsigned char) __builtin_ia32_vec_ext_v16qi ((__v16qi)(__m128i)(X), (int)(N)))
+#define _mm_extract_epi32(X, N) ((int) __builtin_ia32_vec_ext_v4si ((__v4si)(__m128i)(X), (int)(N)))
 #ifdef __x86_64__
-#define _mm_extract_epi64(X, N) \
-  ((long long)__builtin_ia32_vec_ext_v2di((__v2di)(__m128i)(X), (int)(N)))
+#define _mm_extract_epi64(X, N) ((long long) __builtin_ia32_vec_ext_v2di ((__v2di)(__m128i)(X), (int)(N)))
 #endif
 #endif
-
-__funline __m128i _mm_minpos_epu16(__m128i __X) {
-  return (__m128i)__builtin_ia32_phminposuw128((__v8hi)__X);
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_minpos_epu16 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_phminposuw128 ((__v8hi)__X);
 }
-
-__funline __m128i _mm_cvtepi8_epi32(__m128i __X) {
-  return (__m128i)__builtin_ia32_pmovsxbd128((__v16qi)__X);
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi8_epi32 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pmovsxbd128 ((__v16qi)__X);
 }
-
-__funline __m128i _mm_cvtepi16_epi32(__m128i __X) {
-  return (__m128i)__builtin_ia32_pmovsxwd128((__v8hi)__X);
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi16_epi32 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pmovsxwd128 ((__v8hi)__X);
 }
-
-__funline __m128i _mm_cvtepi8_epi64(__m128i __X) {
-  return (__m128i)__builtin_ia32_pmovsxbq128((__v16qi)__X);
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi8_epi64 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pmovsxbq128 ((__v16qi)__X);
 }
-
-__funline __m128i _mm_cvtepi32_epi64(__m128i __X) {
-  return (__m128i)__builtin_ia32_pmovsxdq128((__v4si)__X);
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi32_epi64 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pmovsxdq128 ((__v4si)__X);
 }
-
-__funline __m128i _mm_cvtepi16_epi64(__m128i __X) {
-  return (__m128i)__builtin_ia32_pmovsxwq128((__v8hi)__X);
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi16_epi64 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pmovsxwq128 ((__v8hi)__X);
 }
-
-__funline __m128i _mm_cvtepi8_epi16(__m128i __X) {
-  return (__m128i)__builtin_ia32_pmovsxbw128((__v16qi)__X);
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi8_epi16 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pmovsxbw128 ((__v16qi)__X);
 }
-
-__funline __m128i _mm_cvtepu8_epi32(__m128i __X) {
-  return (__m128i)__builtin_ia32_pmovzxbd128((__v16qi)__X);
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepu8_epi32 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pmovzxbd128 ((__v16qi)__X);
 }
-
-__funline __m128i _mm_cvtepu16_epi32(__m128i __X) {
-  return (__m128i)__builtin_ia32_pmovzxwd128((__v8hi)__X);
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepu16_epi32 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pmovzxwd128 ((__v8hi)__X);
 }
-
-__funline __m128i _mm_cvtepu8_epi64(__m128i __X) {
-  return (__m128i)__builtin_ia32_pmovzxbq128((__v16qi)__X);
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepu8_epi64 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pmovzxbq128 ((__v16qi)__X);
 }
-
-__funline __m128i _mm_cvtepu32_epi64(__m128i __X) {
-  return (__m128i)__builtin_ia32_pmovzxdq128((__v4si)__X);
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepu32_epi64 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pmovzxdq128 ((__v4si)__X);
 }
-
-__funline __m128i _mm_cvtepu16_epi64(__m128i __X) {
-  return (__m128i)__builtin_ia32_pmovzxwq128((__v8hi)__X);
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepu16_epi64 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pmovzxwq128 ((__v8hi)__X);
 }
-
-__funline __m128i _mm_cvtepu8_epi16(__m128i __X) {
-  return (__m128i)__builtin_ia32_pmovzxbw128((__v16qi)__X);
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepu8_epi16 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pmovzxbw128 ((__v16qi)__X);
 }
-
-__funline __m128i _mm_packus_epi32(__m128i __X, __m128i __Y) {
-  return (__m128i)__builtin_ia32_packusdw128((__v4si)__X, (__v4si)__Y);
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_packus_epi32 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_packusdw128 ((__v4si)__X, (__v4si)__Y);
 }
-
 #ifdef __OPTIMIZE__
-__funline __m128i _mm_mpsadbw_epu8(__m128i __X, __m128i __Y, const int __M) {
-  return (__m128i)__builtin_ia32_mpsadbw128((__v16qi)__X, (__v16qi)__Y, __M);
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mpsadbw_epu8 (__m128i __X, __m128i __Y, const int __M)
+{
+  return (__m128i) __builtin_ia32_mpsadbw128 ((__v16qi)__X,
+                                              (__v16qi)__Y, __M);
 }
 #else
-#define _mm_mpsadbw_epu8(X, Y, M) \
-  ((__m128i)__builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \
-                                      (__v16qi)(__m128i)(Y), (int)(M)))
+#define _mm_mpsadbw_epu8(X, Y, M) ((__m128i) __builtin_ia32_mpsadbw128 ((__v16qi)(__m128i)(X), (__v16qi)(__m128i)(Y), (int)(M)))
 #endif
-
-__funline __m128i _mm_stream_load_si128(__m128i *__X) {
-  return (__m128i)__builtin_ia32_movntdqa((__v2di *)__X);
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_stream_load_si128 (__m128i *__X)
+{
+  return (__m128i) __builtin_ia32_movntdqa ((__v2di *) __X);
 }
-
 #ifndef __SSE4_2__
 #pragma GCC push_options
 #pragma GCC target("sse4.2")
 #define __DISABLE_SSE4_2__
 #endif
-
 #define _SIDD_UBYTE_OPS 0x00
 #define _SIDD_UWORD_OPS 0x01
 #define _SIDD_SBYTE_OPS 0x02
 #define _SIDD_SWORD_OPS 0x03
-
-#define _SIDD_CMP_EQUAL_ANY 0x00
-#define _SIDD_CMP_RANGES 0x04
-#define _SIDD_CMP_EQUAL_EACH 0x08
+#define _SIDD_CMP_EQUAL_ANY 0x00
+#define _SIDD_CMP_RANGES 0x04
+#define _SIDD_CMP_EQUAL_EACH 0x08
 #define _SIDD_CMP_EQUAL_ORDERED 0x0c
-
-#define _SIDD_POSITIVE_POLARITY 0x00
-#define _SIDD_NEGATIVE_POLARITY 0x10
+#define _SIDD_POSITIVE_POLARITY 0x00
+#define _SIDD_NEGATIVE_POLARITY 0x10
 #define _SIDD_MASKED_POSITIVE_POLARITY 0x20
 #define _SIDD_MASKED_NEGATIVE_POLARITY 0x30
-
 #define _SIDD_LEAST_SIGNIFICANT 0x00
-#define _SIDD_MOST_SIGNIFICANT 0x40
-
-#define _SIDD_BIT_MASK 0x00
+#define _SIDD_MOST_SIGNIFICANT 0x40
+#define _SIDD_BIT_MASK 0x00
 #define _SIDD_UNIT_MASK 0x40
-
 #ifdef __OPTIMIZE__
-__funline __m128i _mm_cmpistrm(__m128i __X, __m128i __Y, const int __M) {
-  return (__m128i)__builtin_ia32_pcmpistrm128((__v16qi)__X, (__v16qi)__Y, __M);
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpistrm (__m128i __X, __m128i __Y, const int __M)
+{
+  return (__m128i) __builtin_ia32_pcmpistrm128 ((__v16qi)__X,
+                                                (__v16qi)__Y,
+                                                __M);
 }
-
-__funline int _mm_cmpistri(__m128i __X, __m128i __Y, const int __M) {
-  return __builtin_ia32_pcmpistri128((__v16qi)__X, (__v16qi)__Y, __M);
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpistri (__m128i __X, __m128i __Y, const int __M)
+{
+  return __builtin_ia32_pcmpistri128 ((__v16qi)__X,
+                                      (__v16qi)__Y,
+                                      __M);
 }
-
-__funline __m128i _mm_cmpestrm(__m128i __X, int __LX, __m128i __Y, int __LY,
-                               const int __M) {
-  return (__m128i)__builtin_ia32_pcmpestrm128((__v16qi)__X, __LX, (__v16qi)__Y,
-                                              __LY, __M);
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpestrm (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
+{
+  return (__m128i) __builtin_ia32_pcmpestrm128 ((__v16qi)__X, __LX,
+                                                (__v16qi)__Y, __LY,
+                                                __M);
 }
-
-__funline int _mm_cmpestri(__m128i __X, int __LX, __m128i __Y, int __LY,
-                           const int __M) {
-  return __builtin_ia32_pcmpestri128((__v16qi)__X, __LX, (__v16qi)__Y, __LY,
-                                     __M);
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpestri (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
+{
+  return __builtin_ia32_pcmpestri128 ((__v16qi)__X, __LX,
+                                      (__v16qi)__Y, __LY,
+                                      __M);
 }
 #else
-#define _mm_cmpistrm(X, Y, M) \
-  ((__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(X), \
-                                        (__v16qi)(__m128i)(Y), (int)(M)))
-#define _mm_cmpistri(X, Y, M) \
-  ((int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(X), \
-                                    (__v16qi)(__m128i)(Y), (int)(M)))
-
-#define _mm_cmpestrm(X, LX, Y, LY, M) \
-  ((__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(X), (int)(LX), \
-                                        (__v16qi)(__m128i)(Y), (int)(LY), \
-                                        (int)(M)))
-#define _mm_cmpestri(X, LX, Y, LY, M) \
-  ((int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(X), (int)(LX), \
-                                    (__v16qi)(__m128i)(Y), (int)(LY), \
-                                    (int)(M)))
+#define _mm_cmpistrm(X, Y, M) ((__m128i) __builtin_ia32_pcmpistrm128 ((__v16qi)(__m128i)(X), (__v16qi)(__m128i)(Y), (int)(M)))
+#define _mm_cmpistri(X, Y, M) ((int) __builtin_ia32_pcmpistri128 ((__v16qi)(__m128i)(X), (__v16qi)(__m128i)(Y), (int)(M)))
+#define _mm_cmpestrm(X, LX, Y, LY, M) ((__m128i) __builtin_ia32_pcmpestrm128 ((__v16qi)(__m128i)(X), (int)(LX), (__v16qi)(__m128i)(Y), (int)(LY), (int)(M)))
+#define _mm_cmpestri(X, LX, Y, LY, M) ((int) __builtin_ia32_pcmpestri128 ((__v16qi)(__m128i)(X), (int)(LX), (__v16qi)(__m128i)(Y), (int)(LY), (int)(M)))
 #endif
-
 #ifdef __OPTIMIZE__
-__funline int _mm_cmpistra(__m128i __X, __m128i __Y, const int __M) {
-  return __builtin_ia32_pcmpistria128((__v16qi)__X, (__v16qi)__Y, __M);
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpistra (__m128i __X, __m128i __Y, const int __M)
+{
+  return __builtin_ia32_pcmpistria128 ((__v16qi)__X,
+                                       (__v16qi)__Y,
+                                       __M);
 }
-
-__funline int _mm_cmpistrc(__m128i __X, __m128i __Y, const int __M) {
-  return __builtin_ia32_pcmpistric128((__v16qi)__X, (__v16qi)__Y, __M);
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpistrc (__m128i __X, __m128i __Y, const int __M)
+{
+  return __builtin_ia32_pcmpistric128 ((__v16qi)__X,
+                                       (__v16qi)__Y,
+                                       __M);
 }
-
-__funline int _mm_cmpistro(__m128i __X, __m128i __Y, const int __M) {
-  return __builtin_ia32_pcmpistrio128((__v16qi)__X, (__v16qi)__Y, __M);
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpistro (__m128i __X, __m128i __Y, const int __M)
+{
+  return __builtin_ia32_pcmpistrio128 ((__v16qi)__X,
+                                       (__v16qi)__Y,
+                                       __M);
 }
-
-__funline int _mm_cmpistrs(__m128i __X, __m128i __Y, const int __M) {
-  return __builtin_ia32_pcmpistris128((__v16qi)__X, (__v16qi)__Y, __M);
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpistrs (__m128i __X, __m128i __Y, const int __M)
+{
+  return __builtin_ia32_pcmpistris128 ((__v16qi)__X,
+                                       (__v16qi)__Y,
+                                       __M);
 }
-
-__funline int _mm_cmpistrz(__m128i __X, __m128i __Y, const int __M) {
-  return __builtin_ia32_pcmpistriz128((__v16qi)__X, (__v16qi)__Y, __M);
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpistrz (__m128i __X, __m128i __Y, const int __M)
+{
+  return __builtin_ia32_pcmpistriz128 ((__v16qi)__X,
+                                       (__v16qi)__Y,
+                                       __M);
 }
-
-__funline int _mm_cmpestra(__m128i __X, int __LX, __m128i __Y, int __LY,
-                           const int __M) {
-  return __builtin_ia32_pcmpestria128((__v16qi)__X, __LX, (__v16qi)__Y, __LY,
-                                      __M);
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpestra (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
+{
+  return __builtin_ia32_pcmpestria128 ((__v16qi)__X, __LX,
+                                       (__v16qi)__Y, __LY,
+                                       __M);
 }
-
-__funline int _mm_cmpestrc(__m128i __X, int __LX, __m128i __Y, int __LY,
-                           const int __M) {
-  return __builtin_ia32_pcmpestric128((__v16qi)__X, __LX, (__v16qi)__Y, __LY,
-                                      __M);
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpestrc (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
+{
+  return __builtin_ia32_pcmpestric128 ((__v16qi)__X, __LX,
+                                       (__v16qi)__Y, __LY,
+                                       __M);
 }
-
-__funline int _mm_cmpestro(__m128i __X, int __LX, __m128i __Y, int __LY,
-                           const int __M) {
-  return __builtin_ia32_pcmpestrio128((__v16qi)__X, __LX, (__v16qi)__Y, __LY,
-                                      __M);
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpestro (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
+{
+  return __builtin_ia32_pcmpestrio128 ((__v16qi)__X, __LX,
+                                       (__v16qi)__Y, __LY,
+                                       __M);
 }
-
-__funline int _mm_cmpestrs(__m128i __X, int __LX, __m128i __Y, int __LY,
-                           const int __M) {
-  return __builtin_ia32_pcmpestris128((__v16qi)__X, __LX, (__v16qi)__Y, __LY,
-                                      __M);
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpestrs (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
+{
+  return __builtin_ia32_pcmpestris128 ((__v16qi)__X, __LX,
+                                       (__v16qi)__Y, __LY,
+                                       __M);
 }
-
-__funline int _mm_cmpestrz(__m128i __X, int __LX, __m128i __Y, int __LY,
-                           const int __M) {
-  return __builtin_ia32_pcmpestriz128((__v16qi)__X, __LX, (__v16qi)__Y, __LY,
-                                      __M);
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpestrz (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
+{
+  return __builtin_ia32_pcmpestriz128 ((__v16qi)__X, __LX,
+                                       (__v16qi)__Y, __LY,
+                                       __M);
 }
 #else
-#define _mm_cmpistra(X, Y, M) \
-  ((int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(X), \
-                                     (__v16qi)(__m128i)(Y), (int)(M)))
-#define _mm_cmpistrc(X, Y, M) \
-  ((int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(X), \
-                                     (__v16qi)(__m128i)(Y), (int)(M)))
-#define _mm_cmpistro(X, Y, M) \
-  ((int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(X), \
-                                     (__v16qi)(__m128i)(Y), (int)(M)))
-#define _mm_cmpistrs(X, Y, M) \
-  ((int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(X), \
-                                     (__v16qi)(__m128i)(Y), (int)(M)))
-#define _mm_cmpistrz(X, Y, M) \
-  ((int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(X), \
-                                     (__v16qi)(__m128i)(Y), (int)(M)))
-
-#define _mm_cmpestra(X, LX, Y, LY, M) \
-  ((int)__builtin_ia32_pcmpestria128((__v16qi)(__m128i)(X), (int)(LX), \
-                                     (__v16qi)(__m128i)(Y), (int)(LY), \
-                                     (int)(M)))
-#define _mm_cmpestrc(X, LX, Y, LY, M) \
-  ((int)__builtin_ia32_pcmpestric128((__v16qi)(__m128i)(X), (int)(LX), \
-                                     (__v16qi)(__m128i)(Y), (int)(LY), \
-                                     (int)(M)))
-#define _mm_cmpestro(X, LX, Y, LY, M) \
-  ((int)__builtin_ia32_pcmpestrio128((__v16qi)(__m128i)(X), (int)(LX), \
-                                     (__v16qi)(__m128i)(Y), (int)(LY), \
-                                     (int)(M)))
-#define _mm_cmpestrs(X, LX, Y, LY, M) \
-  ((int)__builtin_ia32_pcmpestris128((__v16qi)(__m128i)(X), (int)(LX), \
-                                     (__v16qi)(__m128i)(Y), (int)(LY), \
-                                     (int)(M)))
-#define _mm_cmpestrz(X, LX, Y, LY, M) \
-  ((int)__builtin_ia32_pcmpestriz128((__v16qi)(__m128i)(X), (int)(LX), \
-                                     (__v16qi)(__m128i)(Y), (int)(LY), \
-                                     (int)(M)))
+#define _mm_cmpistra(X, Y, M) ((int) __builtin_ia32_pcmpistria128 ((__v16qi)(__m128i)(X), (__v16qi)(__m128i)(Y), (int)(M)))
+#define _mm_cmpistrc(X, Y, M) ((int) __builtin_ia32_pcmpistric128 ((__v16qi)(__m128i)(X), (__v16qi)(__m128i)(Y), (int)(M)))
+#define _mm_cmpistro(X, Y, M) ((int) __builtin_ia32_pcmpistrio128 ((__v16qi)(__m128i)(X), (__v16qi)(__m128i)(Y), (int)(M)))
+#define _mm_cmpistrs(X, Y, M) ((int) __builtin_ia32_pcmpistris128 ((__v16qi)(__m128i)(X), (__v16qi)(__m128i)(Y), (int)(M)))
+#define _mm_cmpistrz(X, Y, M) ((int) __builtin_ia32_pcmpistriz128 ((__v16qi)(__m128i)(X), (__v16qi)(__m128i)(Y), (int)(M)))
+#define _mm_cmpestra(X, LX, Y, LY, M) ((int) __builtin_ia32_pcmpestria128 ((__v16qi)(__m128i)(X), (int)(LX), (__v16qi)(__m128i)(Y), (int)(LY), (int)(M)))
+#define _mm_cmpestrc(X, LX, Y, LY, M) ((int) __builtin_ia32_pcmpestric128 ((__v16qi)(__m128i)(X), (int)(LX), (__v16qi)(__m128i)(Y), (int)(LY), (int)(M)))
+#define _mm_cmpestro(X, LX, Y, LY, M) ((int) __builtin_ia32_pcmpestrio128 ((__v16qi)(__m128i)(X), (int)(LX), (__v16qi)(__m128i)(Y), (int)(LY), (int)(M)))
+#define _mm_cmpestrs(X, LX, Y, LY, M) ((int) __builtin_ia32_pcmpestris128 ((__v16qi)(__m128i)(X), (int)(LX), (__v16qi)(__m128i)(Y), (int)(LY), (int)(M)))
+#define _mm_cmpestrz(X, LX, Y, LY, M) ((int) __builtin_ia32_pcmpestriz128 ((__v16qi)(__m128i)(X), (int)(LX), (__v16qi)(__m128i)(Y), (int)(LY), (int)(M)))
 #endif
-
-__funline __m128i _mm_cmpgt_epi64(__m128i __X, __m128i __Y) {
-  return (__m128i)((__v2di)__X > (__v2di)__Y);
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_epi64 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) ((__v2di)__X > (__v2di)__Y);
 }
-
 #ifdef __DISABLE_SSE4_2__
 #undef __DISABLE_SSE4_2__
 #pragma GCC pop_options
-#endif /* __DISABLE_SSE4_2__ */
-
+#endif
 #ifdef __DISABLE_SSE4_1__
 #undef __DISABLE_SSE4_1__
 #pragma GCC pop_options
-#endif /* __DISABLE_SSE4_1__ */
-
+#endif
 #include "third_party/intel/popcntintrin.internal.h"
-
 #ifndef __SSE4_1__
 #pragma GCC push_options
 #pragma GCC target("sse4.1")
 #define __DISABLE_SSE4_1__
-#endif /* __SSE4_1__ */
-
+#endif
 #ifndef __SSE4_2__
 #pragma GCC push_options
 #pragma GCC target("sse4.2")
 #define __DISABLE_SSE4_2__
-#endif /* __SSE4_1__ */
-
-/* Accumulate CRC32 (polynomial 0x11EDC6F41) value. */
-__funline unsigned int _mm_crc32_u8(unsigned int __C, unsigned char __V) {
-  return __builtin_ia32_crc32qi(__C, __V);
+#endif
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_crc32_u8 (unsigned int __C, unsigned char __V)
+{
+  return __builtin_ia32_crc32qi (__C, __V);
 }
-
-__funline unsigned int _mm_crc32_u16(unsigned int __C, unsigned short __V) {
-  return __builtin_ia32_crc32hi(__C, __V);
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_crc32_u16 (unsigned int __C, unsigned short __V)
+{
+  return __builtin_ia32_crc32hi (__C, __V);
 }
-
-__funline unsigned int _mm_crc32_u32(unsigned int __C, unsigned int __V) {
-  return __builtin_ia32_crc32si(__C, __V);
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_crc32_u32 (unsigned int __C, unsigned int __V)
+{
+  return __builtin_ia32_crc32si (__C, __V);
 }
-
 #ifdef __x86_64__
-__funline unsigned long long _mm_crc32_u64(unsigned long long __C,
-                                           unsigned long long __V) {
-  return __builtin_ia32_crc32di(__C, __V);
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_crc32_u64 (unsigned long long __C, unsigned long long __V)
+{
+  return __builtin_ia32_crc32di (__C, __V);
 }
 #endif
-
 #ifdef __DISABLE_SSE4_2__
 #undef __DISABLE_SSE4_2__
 #pragma GCC pop_options
-#endif /* __DISABLE_SSE4_2__ */
-
+#endif
 #ifdef __DISABLE_SSE4_1__
 #undef __DISABLE_SSE4_1__
 #pragma GCC pop_options
-#endif /* __DISABLE_SSE4_1__ */
-
-#endif /* __x86_64__ */
-#endif /* _SMMINTRIN_H_INCLUDED */
+#endif
+#endif
+#endif
diff --git a/third_party/intel/tbmintrin.internal.h b/third_party/intel/tbmintrin.internal.h
index d740e2274..9b63f1c68 100644
--- a/third_party/intel/tbmintrin.internal.h
+++ b/third_party/intel/tbmintrin.internal.h
@@ -1,115 +1,128 @@
-#ifndef _X86INTRIN_H_INCLUDED
-#error "Never use directly; include instead."
+/* clang-format off */
+#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0)
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use directly; include instead."
 #endif
-
 #ifndef _TBMINTRIN_H_INCLUDED
 #define _TBMINTRIN_H_INCLUDED
-
 #ifndef __TBM__
 #pragma GCC push_options
 #pragma GCC target("tbm")
 #define __DISABLE_TBM__
-#endif /* __TBM__ */
-
+#endif
 #ifdef __OPTIMIZE__
-__funline unsigned int __bextri_u32(unsigned int __X, const unsigned int __I) {
-  return __builtin_ia32_bextri_u32(__X, __I);
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__bextri_u32 (unsigned int __X, const unsigned int __I)
+{
+  return __builtin_ia32_bextri_u32 (__X, __I);
 }
 #else
-#define __bextri_u32(X, I) \
-  ((unsigned int)__builtin_ia32_bextri_u32((unsigned int)(X), \
-                                           (unsigned int)(I)))
-#endif /*__OPTIMIZE__ */
-
-__funline unsigned int __blcfill_u32(unsigned int __X) {
+#define __bextri_u32(X, I) ((unsigned int)__builtin_ia32_bextri_u32 ((unsigned int)(X), (unsigned int)(I)))
+#endif
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blcfill_u32 (unsigned int __X)
+{
   return __X & (__X + 1);
 }
-
-__funline unsigned int __blci_u32(unsigned int __X) {
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blci_u32 (unsigned int __X)
+{
   return __X | ~(__X + 1);
 }
-
-__funline unsigned int __blcic_u32(unsigned int __X) {
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blcic_u32 (unsigned int __X)
+{
   return ~__X & (__X + 1);
 }
-
-__funline unsigned int __blcmsk_u32(unsigned int __X) {
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blcmsk_u32 (unsigned int __X)
+{
   return __X ^ (__X + 1);
 }
-
-__funline unsigned int __blcs_u32(unsigned int __X) {
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blcs_u32 (unsigned int __X)
+{
   return __X | (__X + 1);
 }
-
-__funline unsigned int __blsfill_u32(unsigned int __X) {
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blsfill_u32 (unsigned int __X)
+{
   return __X | (__X - 1);
 }
-
-__funline unsigned int __blsic_u32(unsigned int __X) {
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blsic_u32 (unsigned int __X)
+{
   return ~__X | (__X - 1);
 }
-
-__funline unsigned int __t1mskc_u32(unsigned int __X) {
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__t1mskc_u32 (unsigned int __X)
+{
   return ~__X | (__X + 1);
 }
-
-__funline unsigned int __tzmsk_u32(unsigned int __X) {
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__tzmsk_u32 (unsigned int __X)
+{
   return ~__X & (__X - 1);
 }
-
 #ifdef __x86_64__
 #ifdef __OPTIMIZE__
-__funline unsigned long long __bextri_u64(unsigned long long __X,
-                                          const unsigned int __I) {
-  return __builtin_ia32_bextri_u64(__X, __I);
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__bextri_u64 (unsigned long long __X, const unsigned int __I)
+{
+  return __builtin_ia32_bextri_u64 (__X, __I);
 }
 #else
-#define __bextri_u64(X, I) \
-  ((unsigned long long)__builtin_ia32_bextri_u64((unsigned long long)(X), \
-                                                 (unsigned long long)(I)))
-#endif /*__OPTIMIZE__ */
-
-__funline unsigned long long __blcfill_u64(unsigned long long __X) {
+#define __bextri_u64(X, I) ((unsigned long long)__builtin_ia32_bextri_u64 ((unsigned long long)(X), (unsigned long long)(I)))
+#endif
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blcfill_u64 (unsigned long long __X)
+{
   return __X & (__X + 1);
 }
-
-__funline unsigned long long __blci_u64(unsigned long long __X) {
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blci_u64 (unsigned long long __X)
+{
   return __X | ~(__X + 1);
 }
-
-__funline unsigned long long __blcic_u64(unsigned long long __X) {
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blcic_u64 (unsigned long long __X)
+{
  return ~__X & (__X + 1);
 }
-
-__funline unsigned long long __blcmsk_u64(unsigned long long __X) {
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blcmsk_u64 (unsigned long long __X)
+{
   return __X ^ (__X + 1);
 }
-
-__funline unsigned long long __blcs_u64(unsigned long long __X) {
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blcs_u64 (unsigned long long __X)
+{
   return __X | (__X + 1);
 }
-
-__funline unsigned long long __blsfill_u64(unsigned long long __X) {
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blsfill_u64 (unsigned long long __X)
+{
   return __X | (__X - 1);
 }
-
-__funline unsigned long long __blsic_u64(unsigned long long __X) {
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blsic_u64 (unsigned long long __X)
+{
   return ~__X | (__X - 1);
 }
-
-__funline unsigned long long __t1mskc_u64(unsigned long long __X) {
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__t1mskc_u64 (unsigned long long __X)
+{
   return ~__X | (__X + 1);
 }
-
-__funline unsigned long long __tzmsk_u64(unsigned long long __X) {
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__tzmsk_u64 (unsigned long long __X)
+{
   return ~__X & (__X - 1);
 }
-
-#endif /* __x86_64__ */
-
+#endif
 #ifdef __DISABLE_TBM__
 #undef __DISABLE_TBM__
 #pragma GCC pop_options
-#endif /* __DISABLE_TBM__ */
-
-#endif /* _TBMINTRIN_H_INCLUDED */
+#endif
+#endif
+#endif
diff --git a/third_party/intel/tmmintrin.internal.h b/third_party/intel/tmmintrin.internal.h
index 7f56e25d5..8bbd38f6d 100644
--- a/third_party/intel/tmmintrin.internal.h
+++ b/third_party/intel/tmmintrin.internal.h
@@ -1,155 +1,183 @@
+/* clang-format off */
+#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0)
 #ifndef _TMMINTRIN_H_INCLUDED
 #define _TMMINTRIN_H_INCLUDED
-#ifdef __x86_64__
 #include "third_party/intel/pmmintrin.internal.h"
-
 #ifndef __SSSE3__
 #pragma GCC push_options
 #pragma GCC target("ssse3")
 #define __DISABLE_SSSE3__
-#endif /* __SSSE3__ */
-
-__funline __m128i _mm_hadd_epi16(__m128i __X, __m128i __Y) {
-  return (__m128i)__builtin_ia32_phaddw128((__v8hi)__X, (__v8hi)__Y);
+#endif
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hadd_epi16 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_phaddw128 ((__v8hi)__X, (__v8hi)__Y);
 }
-
-__funline __m128i _mm_hadd_epi32(__m128i __X, __m128i __Y) {
-  return (__m128i)__builtin_ia32_phaddd128((__v4si)__X, (__v4si)__Y);
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hadd_epi32 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_phaddd128 ((__v4si)__X, (__v4si)__Y);
 }
-
-__funline __m128i _mm_hadds_epi16(__m128i __X, __m128i __Y) {
-  return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__X, (__v8hi)__Y);
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hadds_epi16 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_phaddsw128 ((__v8hi)__X, (__v8hi)__Y);
 }
-
-__funline __m64 _mm_hadd_pi16(__m64 __X, __m64 __Y) {
-  return (__m64)__builtin_ia32_phaddw((__v4hi)__X, (__v4hi)__Y);
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hadd_pi16 (__m64 __X, __m64 __Y)
+{
+  return (__m64) __builtin_ia32_phaddw ((__v4hi)__X, (__v4hi)__Y);
 }
-
-__funline __m64 _mm_hadd_pi32(__m64 __X, __m64 __Y) {
-  return (__m64)__builtin_ia32_phaddd((__v2si)__X, (__v2si)__Y);
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hadd_pi32 (__m64 __X, __m64 __Y)
+{
+  return (__m64) __builtin_ia32_phaddd ((__v2si)__X, (__v2si)__Y);
 }
-
-__funline __m64 _mm_hadds_pi16(__m64 __X, __m64 __Y) {
-  return (__m64)__builtin_ia32_phaddsw((__v4hi)__X, (__v4hi)__Y);
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hadds_pi16 (__m64 __X, __m64 __Y)
+{
+  return (__m64) __builtin_ia32_phaddsw ((__v4hi)__X, (__v4hi)__Y);
 }
-
-__funline __m128i _mm_hsub_epi16(__m128i __X, __m128i __Y) {
-  return (__m128i)__builtin_ia32_phsubw128((__v8hi)__X, (__v8hi)__Y);
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hsub_epi16 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_phsubw128 ((__v8hi)__X, (__v8hi)__Y);
 }
-
-__funline __m128i _mm_hsub_epi32(__m128i __X, __m128i __Y) {
-  return (__m128i)__builtin_ia32_phsubd128((__v4si)__X, (__v4si)__Y);
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hsub_epi32 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_phsubd128 ((__v4si)__X, (__v4si)__Y);
 }
-
-__funline __m128i _mm_hsubs_epi16(__m128i __X, __m128i __Y) {
-  return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__X, (__v8hi)__Y);
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hsubs_epi16 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_phsubsw128 ((__v8hi)__X, (__v8hi)__Y);
 }
-
-__funline __m64 _mm_hsub_pi16(__m64 __X, __m64 __Y) {
-  return (__m64)__builtin_ia32_phsubw((__v4hi)__X, (__v4hi)__Y);
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hsub_pi16 (__m64 __X, __m64 __Y)
+{
+  return (__m64) __builtin_ia32_phsubw ((__v4hi)__X, (__v4hi)__Y);
 }
-
-__funline __m64 _mm_hsub_pi32(__m64 __X, __m64 __Y) {
-  return (__m64)__builtin_ia32_phsubd((__v2si)__X, (__v2si)__Y);
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hsub_pi32 (__m64 __X, __m64 __Y)
+{
+  return (__m64) __builtin_ia32_phsubd ((__v2si)__X, (__v2si)__Y);
 }
-
-__funline __m64 _mm_hsubs_pi16(__m64 __X, __m64 __Y) {
-  return (__m64)__builtin_ia32_phsubsw((__v4hi)__X, (__v4hi)__Y);
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hsubs_pi16 (__m64 __X, __m64 __Y)
+{
+  return (__m64) __builtin_ia32_phsubsw ((__v4hi)__X, (__v4hi)__Y);
 }
-
-__funline __m128i _mm_maddubs_epi16(__m128i __X, __m128i __Y) {
-  return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__X, (__v16qi)__Y);
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maddubs_epi16 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_pmaddubsw128 ((__v16qi)__X, (__v16qi)__Y);
 }
-
-__funline __m64 _mm_maddubs_pi16(__m64 __X, __m64 __Y) {
-  return (__m64)__builtin_ia32_pmaddubsw((__v8qi)__X, (__v8qi)__Y);
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maddubs_pi16 (__m64 __X, __m64 __Y)
+{
+  return (__m64) __builtin_ia32_pmaddubsw ((__v8qi)__X, (__v8qi)__Y);
 }
-
-__funline __m128i _mm_mulhrs_epi16(__m128i __X, __m128i __Y) {
-  return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__X, (__v8hi)__Y);
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mulhrs_epi16 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_pmulhrsw128 ((__v8hi)__X, (__v8hi)__Y);
 }
-
-__funline __m64 _mm_mulhrs_pi16(__m64 __X, __m64 __Y) {
-  return (__m64)__builtin_ia32_pmulhrsw((__v4hi)__X, (__v4hi)__Y);
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mulhrs_pi16 (__m64 __X, __m64 __Y)
+{
+  return (__m64) __builtin_ia32_pmulhrsw ((__v4hi)__X, (__v4hi)__Y);
 }
-
-__funline __m128i _mm_shuffle_epi8(__m128i __X, __m128i __Y) {
-  return (__m128i)__builtin_ia32_pshufb128((__v16qi)__X, (__v16qi)__Y);
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shuffle_epi8 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_pshufb128 ((__v16qi)__X, (__v16qi)__Y);
 }
-
-__funline __m64 _mm_shuffle_pi8(__m64 __X, __m64 __Y) {
-  return (__m64)__builtin_ia32_pshufb((__v8qi)__X, (__v8qi)__Y);
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shuffle_pi8 (__m64 __X, __m64 __Y)
+{
+  return (__m64) __builtin_ia32_pshufb ((__v8qi)__X, (__v8qi)__Y);
 }
-
-__funline __m128i _mm_sign_epi8(__m128i __X, __m128i __Y) {
-  return (__m128i)__builtin_ia32_psignb128((__v16qi)__X, (__v16qi)__Y);
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sign_epi8 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_psignb128 ((__v16qi)__X, (__v16qi)__Y);
 }
-
-__funline __m128i _mm_sign_epi16(__m128i __X, __m128i __Y) {
-  return (__m128i)__builtin_ia32_psignw128((__v8hi)__X, (__v8hi)__Y);
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sign_epi16 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_psignw128 ((__v8hi)__X, (__v8hi)__Y);
 }
-
-__funline __m128i _mm_sign_epi32(__m128i __X, __m128i __Y) {
-  return (__m128i)__builtin_ia32_psignd128((__v4si)__X, (__v4si)__Y);
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sign_epi32 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_psignd128 ((__v4si)__X, (__v4si)__Y);
 }
-
-__funline __m64 _mm_sign_pi8(__m64 __X, __m64 __Y) {
-  return (__m64)__builtin_ia32_psignb((__v8qi)__X, (__v8qi)__Y);
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sign_pi8 (__m64 __X, __m64 __Y)
+{
+  return (__m64) __builtin_ia32_psignb ((__v8qi)__X, (__v8qi)__Y);
 }
-
-__funline __m64 _mm_sign_pi16(__m64 __X, __m64 __Y) {
-  return (__m64)__builtin_ia32_psignw((__v4hi)__X, (__v4hi)__Y);
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sign_pi16 (__m64 __X, __m64 __Y)
+{
+  return (__m64) __builtin_ia32_psignw ((__v4hi)__X, (__v4hi)__Y);
 }
-
-__funline __m64 _mm_sign_pi32(__m64 __X, __m64 __Y) {
-  return (__m64)__builtin_ia32_psignd((__v2si)__X, (__v2si)__Y);
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sign_pi32 (__m64 __X, __m64 __Y)
+{
+  return (__m64) __builtin_ia32_psignd ((__v2si)__X, (__v2si)__Y);
 }
-
 #ifdef __OPTIMIZE__
-__funline __m128i _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) {
-  return (__m128i)__builtin_ia32_palignr128((__v2di)__X, (__v2di)__Y, __N * 8);
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
+{
+  return (__m128i) __builtin_ia32_palignr128 ((__v2di)__X,
+                                              (__v2di)__Y, __N * 8);
 }
-
-__funline __m64 _mm_alignr_pi8(__m64 __X, __m64 __Y, const int __N) {
-  return (__m64)__builtin_ia32_palignr((__v1di)__X, (__v1di)__Y, __N * 8);
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_alignr_pi8(__m64 __X, __m64 __Y, const int __N)
+{
+  return (__m64) __builtin_ia32_palignr ((__v1di)__X,
+                                         (__v1di)__Y, __N * 8);
 }
 #else
-#define _mm_alignr_epi8(X, Y, N) \
-  ((__m128i)__builtin_ia32_palignr128((__v2di)(__m128i)(X), \
-                                      (__v2di)(__m128i)(Y), (int)(N)*8))
-#define _mm_alignr_pi8(X, Y, N) \
-  ((__m64)__builtin_ia32_palignr((__v1di)(__m64)(X), (__v1di)(__m64)(Y), \
-                                 (int)(N)*8))
+#define _mm_alignr_epi8(X, Y, N) ((__m128i) __builtin_ia32_palignr128 ((__v2di)(__m128i)(X), (__v2di)(__m128i)(Y), (int)(N) * 8))
+#define _mm_alignr_pi8(X, Y, N) ((__m64) __builtin_ia32_palignr ((__v1di)(__m64)(X), (__v1di)(__m64)(Y), (int)(N) * 8))
 #endif
-
-__funline __m128i _mm_abs_epi8(__m128i __X) {
-  return (__m128i)__builtin_ia32_pabsb128((__v16qi)__X);
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_abs_epi8 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pabsb128 ((__v16qi)__X);
 }
-
-__funline __m128i _mm_abs_epi16(__m128i __X) {
-  return (__m128i)__builtin_ia32_pabsw128((__v8hi)__X);
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_abs_epi16 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pabsw128 ((__v8hi)__X);
 }
-
-__funline __m128i _mm_abs_epi32(__m128i __X) {
-  return (__m128i)__builtin_ia32_pabsd128((__v4si)__X);
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_abs_epi32 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pabsd128 ((__v4si)__X);
 }
-
-__funline __m64 _mm_abs_pi8(__m64 __X) {
-  return (__m64)__builtin_ia32_pabsb((__v8qi)__X);
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_abs_pi8 (__m64 __X)
+{
+  return (__m64) __builtin_ia32_pabsb ((__v8qi)__X);
 }
-
-__funline __m64 _mm_abs_pi16(__m64 __X) {
-  return (__m64)__builtin_ia32_pabsw((__v4hi)__X);
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_abs_pi16 (__m64 __X)
+{
+  return (__m64) __builtin_ia32_pabsw ((__v4hi)__X);
 }
-
-__funline __m64 _mm_abs_pi32(__m64 __X) {
-  return (__m64)__builtin_ia32_pabsd((__v2si)__X);
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_abs_pi32 (__m64 __X)
+{
+  return (__m64) __builtin_ia32_pabsd ((__v2si)__X);
 }
-
 #ifdef __DISABLE_SSSE3__
 #undef __DISABLE_SSSE3__
 #pragma GCC pop_options
-#endif /* __DISABLE_SSSE3__ */
-
-#endif /* __x86_64__ */
-#endif /* _TMMINTRIN_H_INCLUDED */
+#endif
+#endif
+#endif
diff --git a/third_party/intel/tsxldtrkintrin.internal.h b/third_party/intel/tsxldtrkintrin.internal.h
new file mode 100644
index 000000000..5a1509378
--- /dev/null
+++ b/third_party/intel/tsxldtrkintrin.internal.h
@@ -0,0 +1,30 @@
+/* clang-format off */
+#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0)
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use directly; include instead."
+#endif
+#ifndef _TSXLDTRKINTRIN_H_INCLUDED
+#define _TSXLDTRKINTRIN_H_INCLUDED
+#if !defined(__TSXLDTRK__)
+#pragma GCC push_options
+#pragma GCC target("tsxldtrk")
+#define __DISABLE_TSXLDTRK__
+#endif
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_xsusldtrk (void)
+{
+  __builtin_ia32_xsusldtrk ();
+}
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_xresldtrk (void)
+{
+  __builtin_ia32_xresldtrk ();
+}
+#ifdef __DISABLE_TSXLDTRK__
+#undef __DISABLE_TSXLDTRK__
+#pragma GCC pop_options
+#endif
+#endif
+#endif
diff --git a/third_party/intel/uintrintrin.internal.h b/third_party/intel/uintrintrin.internal.h
new file mode 100644
index 000000000..b5c4fc84c
--- /dev/null
+++ b/third_party/intel/uintrintrin.internal.h
@@ -0,0 +1,50 @@
+/* clang-format off */
+#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0)
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use directly; include instead."
+#endif
+#ifndef _UINTRNTRIN_H_INCLUDED
+#define _UINTRNTRIN_H_INCLUDED
+#ifdef __x86_64__
+#ifndef __UINTR__
+#pragma GCC push_options
+#pragma GCC target ("uintr")
+#define __DISABLE_UINTR__
+#endif
+struct __uintr_frame
+{
+  unsigned long long rip;
+  unsigned long long rflags;
+  unsigned long long rsp;
+};
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_clui (void)
+{
+  __builtin_ia32_clui ();
+}
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_stui (void)
+{
+  __builtin_ia32_stui ();
+}
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_senduipi (unsigned long long __R)
+{
+  __builtin_ia32_senduipi (__R);
+}
+extern __inline unsigned char
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_testui (void)
+{
+  return __builtin_ia32_testui ();
+}
+#ifdef __DISABLE_UINTR__
+#undef __DISABLE_UINTR__
+#pragma GCC pop_options
+#endif
+#endif
+#endif
+#endif
diff --git a/third_party/intel/upgrade.sh b/third_party/intel/upgrade.sh
new file mode 100755
index 000000000..64d870d15
--- /dev/null
+++ b/third_party/intel/upgrade.sh
@@ -0,0 +1,135 @@
+#!/bin/sh
+
+s=/opt/cross11portcosmo/lib/gcc/x86_64-linux-musl/11.2.0/include
+d=third_party/intel
+
+FILES='
+amxbf16intrin
+amxint8intrin
+amxtileintrin
+avx512bf16intrin
+avx512bf16vlintrin
+avx512vp2intersectintrin
+avx512vp2intersectvlintrin
+avxvnniintrin
+enqcmdintrin
+hresetintrin
+keylockerintrin
+serializeintrin
+tsxldtrkintrin
+uintrintrin
+x86gprintrin
+avx5124fmapsintrin
+avx5124vnniwintrin
+avx512bitalgintrin
+avx512bwintrin
+avx512cdintrin
+avx512dqintrin
+avx512erintrin
+avx512fintrin
+avx512ifmaintrin
+avx512ifmavlintrin
+avx512pfintrin
+avx512vbmi2intrin
+avx512vbmi2vlintrin
+avx512vbmiintrin
+avx512vbmivlintrin
+avx512vlbwintrin
+avx512vldqintrin
+avx512vlintrin
+avx512vnniintrin
+avx512vnnivlintrin
+avx512vpopcntdqintrin
+avx512vpopcntdqvlintrin
+adxintrin
+ammintrin
+avx2intrin
+avxintrin
+bmi2intrin
+bmiintrin
+cetintrin
+cldemoteintrin
+clflushoptintrin
+clwbintrin
+clzerointrin
+cpuid
+emmintrin
+f16cintrin
+fma4intrin
+fmaintrin
+fxsrintrin
+gfniintrin
+ia32intrin
+immintrin
+lwpintrin
+lzcntintrin
+mm3dnow
+mm_malloc
+mmintrin
+movdirintrin
+mwaitxintrin
+nmmintrin
+pconfigintrin
+pkuintrin
+pmmintrin
+popcntintrin
+prfchwintrin
+rdseedintrin
+rtmintrin
+sgxintrin
+shaintrin
+smmintrin
+tbmintrin
+tmmintrin
+vaesintrin
+vpclmulqdqintrin
+waitpkgintrin
+wbnoinvdintrin
+wmmintrin
+x86intrin
+xmmintrin
+xopintrin
+xsavecintrin
+xsaveintrin
+xsaveoptintrin
+xsavesintrin
+xtestintrin
+'
+
+strip_c_comments() {
+  # https://stackoverflow.com/a/13062682/1653720
+  [ $# -eq 2 ] && arg="$1" || arg=""
+  eval file="\$$#"
+  sed 's/a/aA/g; s/__/aB/g; s/#/aC/g' "$file" |
+    gcc -P -E $arg - |
+    sed 's/aC/#/g; s/aB/__/g; s/aA/a/g'
+}
+
+rm -f third_party/intel/*.h
+
+for f in $FILES; do
+  echo cp $s/$f.h $d/$f.internal.h
+  cp $s/$f.h $d/$f.internal.h || exit
+done
+
+sed -i \
+  -e 's/# *include/#include/' \
+  -e '/#include .std/d' \
+  -e 's!#include [<"]!#include "third_party/intel/!' \
+  -e 's!\.h[>"]$!.internal.h"!' \
+  third_party/intel/*.h
+
+for f in third_party/intel/*.h; do
+  strip_c_comments $f >$f.tmp || exit
+  mv $f.tmp $f
+done
+
+for f in third_party/intel/*.h; do
+  (
+    printf %s\\n '/* clang-format off */'
+    printf %s\\n '#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0)'
+    cat $f
+    printf %s\\n '#endif'
+  ) >$f.tmp
+  mv $f.tmp $f
+done
diff --git a/third_party/intel/vaesintrin.internal.h b/third_party/intel/vaesintrin.internal.h
index a71e548d7..4d278d07f 100644
--- a/third_party/intel/vaesintrin.internal.h
+++ b/third_party/intel/vaesintrin.internal.h
@@ -1,61 +1,76 @@
+/* clang-format off */
+#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0)
 #ifndef __VAESINTRIN_H_INCLUDED
 #define __VAESINTRIN_H_INCLUDED
-#ifdef __x86_64__
-#include "third_party/intel/x86intrin.internal.h"
-
 #if !defined(__VAES__) || !defined(__AVX__)
 #pragma GCC push_options
 #pragma GCC target("vaes,avx")
 #define __DISABLE_VAES__
-#endif /* __VAES__ */
-
-__funline __m256i _mm256_aesdec_epi128(__m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_vaesdec_v32qi((__v32qi)__A, (__v32qi)__B);
+#endif
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_aesdec_epi128 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_vaesdec_v32qi ((__v32qi) __A, (__v32qi) __B);
 }
-
-__funline __m256i _mm256_aesdeclast_epi128(__m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_vaesdeclast_v32qi((__v32qi)__A, (__v32qi)__B);
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_aesdeclast_epi128 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_vaesdeclast_v32qi ((__v32qi) __A,
+                                                    (__v32qi) __B);
 }
-
-__funline __m256i _mm256_aesenc_epi128(__m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_vaesenc_v32qi((__v32qi)__A, (__v32qi)__B);
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_aesenc_epi128 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_vaesenc_v32qi ((__v32qi) __A, (__v32qi) __B);
 }
-
-__funline __m256i _mm256_aesenclast_epi128(__m256i __A, __m256i __B) {
-  return (__m256i)__builtin_ia32_vaesenclast_v32qi((__v32qi)__A, (__v32qi)__B);
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_aesenclast_epi128 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_vaesenclast_v32qi ((__v32qi) __A,
+                                                    (__v32qi) __B);
 }
-
 #ifdef __DISABLE_VAES__
 #undef __DISABLE_VAES__
 #pragma GCC pop_options
-#endif /* __DISABLE_VAES__ */
-
+#endif
 #if !defined(__VAES__) || !defined(__AVX512F__)
 #pragma GCC push_options
 #pragma GCC target("vaes,avx512f")
 #define __DISABLE_VAESF__
-#endif /* __VAES__ */
-
-__funline __m512i _mm512_aesdec_epi128(__m512i __A, __m512i __B) {
-  return (__m512i)__builtin_ia32_vaesdec_v64qi((__v64qi)__A, (__v64qi)__B);
+#endif
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_aesdec_epi128 (__m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_vaesdec_v64qi ((__v64qi) __A, (__v64qi) __B);
 }
-
-__funline __m512i _mm512_aesdeclast_epi128(__m512i __A, __m512i __B) {
-  return (__m512i)__builtin_ia32_vaesdeclast_v64qi((__v64qi)__A, (__v64qi)__B);
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_aesdeclast_epi128 (__m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_vaesdeclast_v64qi ((__v64qi) __A,
+                                                    (__v64qi) __B);
 }
-
-__funline __m512i _mm512_aesenc_epi128(__m512i __A, __m512i __B) {
-  return (__m512i)__builtin_ia32_vaesenc_v64qi((__v64qi)__A, (__v64qi)__B);
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_aesenc_epi128 (__m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_vaesenc_v64qi ((__v64qi) __A, (__v64qi) __B);
 }
-
-__funline __m512i _mm512_aesenclast_epi128(__m512i __A, __m512i __B) {
-  return (__m512i)__builtin_ia32_vaesenclast_v64qi((__v64qi)__A, (__v64qi)__B);
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_aesenclast_epi128 (__m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_vaesenclast_v64qi ((__v64qi) __A,
+                                                    (__v64qi) __B);
 }
-
 #ifdef __DISABLE_VAESF__
 #undef __DISABLE_VAESF__
 #pragma GCC pop_options
-#endif /* __DISABLE_VAES__ */
-
-#endif /* __x86_64__ */
-#endif /* __VAESINTRIN_H_INCLUDED */
+#endif
+#endif
+#endif
diff --git a/third_party/intel/vpclmulqdqintrin.internal.h b/third_party/intel/vpclmulqdqintrin.internal.h
index 49454f499..bbb18799d 100644
--- a/third_party/intel/vpclmulqdqintrin.internal.h
+++ b/third_party/intel/vpclmulqdqintrin.internal.h
@@ -1,52 +1,49 @@
+/* clang-format off */
+#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0)
 #ifndef _IMMINTRIN_H_INCLUDED
 #error "Never use directly; include instead."
 #endif
-
 #ifndef _VPCLMULQDQINTRIN_H_INCLUDED
 #define _VPCLMULQDQINTRIN_H_INCLUDED
-
 #if !defined(__VPCLMULQDQ__) || !defined(__AVX512F__)
 #pragma GCC push_options
 #pragma GCC target("vpclmulqdq,avx512f")
 #define __DISABLE_VPCLMULQDQF__
-#endif /* __VPCLMULQDQF__ */
-
+#endif
 #ifdef __OPTIMIZE__
-__funline __m512i _mm512_clmulepi64_epi128(__m512i __A, __m512i __B,
-                                           const int __C) {
-  return (__m512i)__builtin_ia32_vpclmulqdq_v8di((__v8di)__A, (__v8di)__B, __C);
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_clmulepi64_epi128 (__m512i __A, __m512i __B, const int __C)
+{
+  return (__m512i) __builtin_ia32_vpclmulqdq_v8di ((__v8di)__A,
+                                                   (__v8di) __B, __C);
 }
 #else
-#define _mm512_clmulepi64_epi128(A, B, C) \
-  ((__m512i)__builtin_ia32_vpclmulqdq_v8di((__v8di)(__m512i)(A), \
-                                           (__v8di)(__m512i)(B), (int)(C)))
+#define _mm512_clmulepi64_epi128(A, B, C) ((__m512i) __builtin_ia32_vpclmulqdq_v8di ((__v8di)(__m512i)(A), (__v8di)(__m512i)(B), (int)(C)))
 #endif
-
 #ifdef __DISABLE_VPCLMULQDQF__
 #undef __DISABLE_VPCLMULQDQF__
 #pragma GCC pop_options
-#endif /* __DISABLE_VPCLMULQDQF__ */
-
+#endif
 #if !defined(__VPCLMULQDQ__) || !defined(__AVX__)
 #pragma GCC push_options
 #pragma GCC target("vpclmulqdq,avx")
 #define __DISABLE_VPCLMULQDQ__
-#endif /* __VPCLMULQDQ__ */
-
+#endif
 #ifdef __OPTIMIZE__
-__funline __m256i _mm256_clmulepi64_epi128(__m256i __A, __m256i __B,
-                                           const int __C) {
-  return (__m256i)__builtin_ia32_vpclmulqdq_v4di((__v4di)__A, (__v4di)__B, __C);
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_clmulepi64_epi128 (__m256i __A, __m256i __B, const int __C)
+{
+  return (__m256i) __builtin_ia32_vpclmulqdq_v4di ((__v4di)__A,
+                                                   (__v4di) __B, __C);
 }
 #else
-#define _mm256_clmulepi64_epi128(A, B, C) \
-  ((__m256i)__builtin_ia32_vpclmulqdq_v4di((__v4di)(__m256i)(A), \
-                                           (__v4di)(__m256i)(B), (int)(C)))
+#define _mm256_clmulepi64_epi128(A, B, C) ((__m256i) __builtin_ia32_vpclmulqdq_v4di ((__v4di)(__m256i)(A), (__v4di)(__m256i)(B), (int)(C)))
 #endif
-
 #ifdef __DISABLE_VPCLMULQDQ__
 #undef __DISABLE_VPCLMULQDQ__
 #pragma GCC pop_options
-#endif /* __DISABLE_VPCLMULQDQ__ */
-
-#endif /* _VPCLMULQDQINTRIN_H_INCLUDED */
+#endif
+#endif
+#endif
diff --git a/third_party/intel/waitpkgintrin.internal.h b/third_party/intel/waitpkgintrin.internal.h
index 3f4f19254..64e8c73fe 100644
--- a/third_party/intel/waitpkgintrin.internal.h
+++ b/third_party/intel/waitpkgintrin.internal.h
@@ -1,31 +1,36 @@
-#if !defined _IMMINTRIN_H_INCLUDED
-#error "Never use directly; include instead."
+/* clang-format off */
+#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0)
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use directly; include instead."
 #endif
-
 #ifndef _WAITPKG_H_INCLUDED
 #define _WAITPKG_H_INCLUDED
-
 #ifndef __WAITPKG__
 #pragma GCC push_options
 #pragma GCC target("waitpkg")
 #define __DISABLE_WAITPKG__
-#endif /* __WAITPKG__ */
-
-__funline void _umonitor(void *__A) {
-  __builtin_ia32_umonitor(__A);
+#endif
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_umonitor (void *__A)
+{
+  __builtin_ia32_umonitor (__A);
 }
-
-__funline unsigned char _umwait(unsigned int __A, unsigned long long __B) {
-  return __builtin_ia32_umwait(__A, __B);
+extern __inline unsigned char
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_umwait (unsigned int __A, unsigned long long __B)
+{
+  return __builtin_ia32_umwait (__A, __B);
 }
-
-__funline unsigned char _tpause(unsigned int __A, unsigned long long __B) {
-  return __builtin_ia32_tpause(__A, __B);
+extern __inline unsigned char
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_tpause (unsigned int __A, unsigned long long __B)
+{
+  return __builtin_ia32_tpause (__A, __B);
 }
-
 #ifdef __DISABLE_WAITPKG__
 #undef __DISABLE_WAITPKG__
 #pragma GCC pop_options
-#endif /* __DISABLE_WAITPKG__ */
-
-#endif /* _WAITPKG_H_INCLUDED. */
+#endif
+#endif
+#endif
diff --git a/third_party/intel/wbnoinvdintrin.internal.h b/third_party/intel/wbnoinvdintrin.internal.h
index 72b06d30a..5d8d9456b 100644
--- a/third_party/intel/wbnoinvdintrin.internal.h
+++ b/third_party/intel/wbnoinvdintrin.internal.h
@@ -1,23 +1,24 @@
-#ifndef _IMMINTRIN_H_INCLUDED
-#error "Never use directly; include instead."
+/* clang-format off */
+#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0)
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use directly; include instead."
 #endif
-
 #ifndef _WBNOINVDINTRIN_H_INCLUDED
 #define _WBNOINVDINTRIN_H_INCLUDED
-
 #ifndef __WBNOINVD__
 #pragma GCC push_options
 #pragma GCC target("wbnoinvd")
 #define __DISABLE_WBNOINVD__
-#endif /* __WBNOINVD__ */
-
-__funline void _wbnoinvd(void) {
-  __builtin_ia32_wbnoinvd();
+#endif
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_wbnoinvd (void)
+{
+  __builtin_ia32_wbnoinvd ();
 }
-
 #ifdef __DISABLE_WBNOINVD__
 #undef __DISABLE_WBNOINVD__
 #pragma GCC pop_options
-#endif /* __DISABLE_WBNOINVD__ */
-
-#endif /* _WBNOINVDINTRIN_H_INCLUDED */
+#endif
+#endif
+#endif
diff --git a/third_party/intel/wmmintrin.internal.h b/third_party/intel/wmmintrin.internal.h
index 2a5819959..60df795ff 100644
--- a/third_party/intel/wmmintrin.internal.h
+++ b/third_party/intel/wmmintrin.internal.h
@@ -1,68 +1,70 @@
+/* clang-format off */
+#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0)
 #ifndef _WMMINTRIN_H_INCLUDED
 #define _WMMINTRIN_H_INCLUDED
-#ifdef __x86_64__
 #include "third_party/intel/emmintrin.internal.h"
-
 #if !defined(__AES__) || !defined(__SSE2__)
 #pragma GCC push_options
 #pragma GCC target("aes,sse2")
 #define __DISABLE_AES__
-#endif /* __AES__ */
-
-__funline __m128i _mm_aesdec_si128(__m128i __X, __m128i __Y) {
-  return (__m128i)__builtin_ia32_aesdec128((__v2di)__X, (__v2di)__Y);
+#endif
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_aesdec_si128 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_aesdec128 ((__v2di)__X, (__v2di)__Y);
 }
-
-__funline __m128i _mm_aesdeclast_si128(__m128i __X, __m128i __Y) {
-  return (__m128i)__builtin_ia32_aesdeclast128((__v2di)__X, (__v2di)__Y);
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_aesdeclast_si128 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_aesdeclast128 ((__v2di)__X,
+                                                 (__v2di)__Y);
 }
-
-__funline __m128i _mm_aesenc_si128(__m128i __X, __m128i __Y) {
-  return (__m128i)__builtin_ia32_aesenc128((__v2di)__X, (__v2di)__Y);
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_aesenc_si128 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_aesenc128 ((__v2di)__X, (__v2di)__Y);
 }
-
-__funline __m128i _mm_aesenclast_si128(__m128i __X, __m128i __Y) {
-  return (__m128i)__builtin_ia32_aesenclast128((__v2di)__X, (__v2di)__Y);
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_aesenclast_si128 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_aesenclast128 ((__v2di)__X, (__v2di)__Y);
 }
-
-__funline __m128i _mm_aesimc_si128(__m128i __X) {
-  return (__m128i)__builtin_ia32_aesimc128((__v2di)__X);
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_aesimc_si128 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_aesimc128 ((__v2di)__X);
 }
-
 #ifdef __OPTIMIZE__
-__funline __m128i _mm_aeskeygenassist_si128(__m128i __X, const int __C) {
-  return (__m128i)__builtin_ia32_aeskeygenassist128((__v2di)__X, __C);
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_aeskeygenassist_si128 (__m128i __X, const int __C)
+{
+  return (__m128i) __builtin_ia32_aeskeygenassist128 ((__v2di)__X, __C);
 }
 #else
-#define _mm_aeskeygenassist_si128(X, C) \
-  ((__m128i)__builtin_ia32_aeskeygenassist128((__v2di)(__m128i)(X), (int)(C)))
+#define _mm_aeskeygenassist_si128(X, C) ((__m128i) __builtin_ia32_aeskeygenassist128 ((__v2di)(__m128i)(X), (int)(C)))
 #endif
-
 #ifdef __DISABLE_AES__
 #undef __DISABLE_AES__
 #pragma GCC pop_options
-#endif /* __DISABLE_AES__ */
-
+#endif
 #if !defined(__PCLMUL__) || !defined(__SSE2__)
 #pragma GCC push_options
 #pragma GCC target("pclmul,sse2")
 #define __DISABLE_PCLMUL__
-#endif /* __PCLMUL__ */
-
+#endif
 #ifdef __OPTIMIZE__
-__funline __m128i _mm_clmulepi64_si128(__m128i __X, __m128i __Y, const int __I) {
-  return (__m128i)__builtin_ia32_pclmulqdq128((__v2di)__X, (__v2di)__Y, __I);
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_clmulepi64_si128 (__m128i __X, __m128i __Y, const int __I)
+{
+  return (__m128i) __builtin_ia32_pclmulqdq128 ((__v2di)__X,
+                                                (__v2di)__Y, __I);
 }
 #else
-#define _mm_clmulepi64_si128(X, Y, I) \
-  ((__m128i)__builtin_ia32_pclmulqdq128((__v2di)(__m128i)(X), \
-                                        (__v2di)(__m128i)(Y), (int)(I)))
+#define _mm_clmulepi64_si128(X, Y, I) ((__m128i) __builtin_ia32_pclmulqdq128 ((__v2di)(__m128i)(X), (__v2di)(__m128i)(Y), (int)(I)))
 #endif
-
 #ifdef __DISABLE_PCLMUL__
 #undef __DISABLE_PCLMUL__
 #pragma GCC pop_options
-#endif /* __DISABLE_PCLMUL__ */
-
-#endif /* __x86_64__ */
-#endif /* _WMMINTRIN_H_INCLUDED */
+#endif
+#endif
+#endif
diff --git a/third_party/intel/x86gprintrin.internal.h b/third_party/intel/x86gprintrin.internal.h
new file mode 100644
index 000000000..400a9a4f2
--- /dev/null
+++ b/third_party/intel/x86gprintrin.internal.h
@@ -0,0 +1,180 @@
+/* clang-format off */
+#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0)
+#ifndef _X86GPRINTRIN_H_INCLUDED
+#define _X86GPRINTRIN_H_INCLUDED
+#include "third_party/intel/ia32intrin.internal.h"
+#ifndef __iamcu__
+#include "third_party/intel/adxintrin.internal.h"
+#include "third_party/intel/bmiintrin.internal.h"
+#include "third_party/intel/bmi2intrin.internal.h"
+#include "third_party/intel/cetintrin.internal.h"
+#include "third_party/intel/cldemoteintrin.internal.h"
+#include "third_party/intel/clflushoptintrin.internal.h"
+#include "third_party/intel/clwbintrin.internal.h"
+#include "third_party/intel/clzerointrin.internal.h"
+#include "third_party/intel/enqcmdintrin.internal.h"
+#include "third_party/intel/fxsrintrin.internal.h"
+#include "third_party/intel/lzcntintrin.internal.h"
+#include "third_party/intel/lwpintrin.internal.h"
+#include "third_party/intel/movdirintrin.internal.h"
+#include "third_party/intel/mwaitxintrin.internal.h"
+#include "third_party/intel/pconfigintrin.internal.h"
+#include "third_party/intel/popcntintrin.internal.h"
+#include "third_party/intel/pkuintrin.internal.h"
+#include "third_party/intel/rdseedintrin.internal.h"
+#include "third_party/intel/rtmintrin.internal.h"
+#include "third_party/intel/serializeintrin.internal.h"
+#include "third_party/intel/sgxintrin.internal.h"
+#include "third_party/intel/tbmintrin.internal.h"
+#include "third_party/intel/tsxldtrkintrin.internal.h"
+#include "third_party/intel/uintrintrin.internal.h"
+#include "third_party/intel/waitpkgintrin.internal.h"
+#include "third_party/intel/wbnoinvdintrin.internal.h"
+#include "third_party/intel/xsaveintrin.internal.h"
+#include "third_party/intel/xsavecintrin.internal.h"
+#include "third_party/intel/xsaveoptintrin.internal.h"
+#include "third_party/intel/xsavesintrin.internal.h"
+#include "third_party/intel/xtestintrin.internal.h"
+#include "third_party/intel/hresetintrin.internal.h"
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_wbinvd (void)
+{
+  __builtin_ia32_wbinvd ();
+}
+#ifndef __RDRND__
+#pragma GCC push_options
+#pragma GCC target("rdrnd")
+#define __DISABLE_RDRND__
+#endif
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_rdrand16_step (unsigned short *__P)
+{
+  return __builtin_ia32_rdrand16_step (__P);
+}
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_rdrand32_step (unsigned int *__P)
+{
+  return __builtin_ia32_rdrand32_step (__P);
+}
+#ifdef __DISABLE_RDRND__
+#undef __DISABLE_RDRND__
+#pragma GCC pop_options
+#endif
+#ifndef __RDPID__
+#pragma GCC push_options
+#pragma GCC target("rdpid")
+#define __DISABLE_RDPID__
+#endif
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_rdpid_u32 (void)
+{
+  return __builtin_ia32_rdpid ();
+}
+#ifdef __DISABLE_RDPID__
+#undef __DISABLE_RDPID__
+#pragma GCC pop_options
+#endif
+#ifdef __x86_64__
+#ifndef __FSGSBASE__
+#pragma GCC push_options
+#pragma GCC target("fsgsbase")
+#define __DISABLE_FSGSBASE__
+#endif
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_readfsbase_u32 (void)
+{
+  return __builtin_ia32_rdfsbase32 ();
+}
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_readfsbase_u64 (void)
+{
+  return __builtin_ia32_rdfsbase64 ();
+}
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_readgsbase_u32 (void)
+{
+  return __builtin_ia32_rdgsbase32 ();
+}
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_readgsbase_u64 (void)
+{
+  return __builtin_ia32_rdgsbase64 ();
+}
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_writefsbase_u32 (unsigned int __B)
+{
+  __builtin_ia32_wrfsbase32 (__B);
+}
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_writefsbase_u64 (unsigned long long __B)
+{
+  __builtin_ia32_wrfsbase64 (__B);
+}
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_writegsbase_u32 (unsigned int __B)
+{
+  __builtin_ia32_wrgsbase32 (__B);
+}
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_writegsbase_u64 (unsigned long long __B)
+{
+  __builtin_ia32_wrgsbase64 (__B);
+}
+#ifdef __DISABLE_FSGSBASE__
+#undef __DISABLE_FSGSBASE__
+#pragma GCC pop_options
+#endif
+#ifndef __RDRND__
+#pragma GCC push_options
+#pragma GCC target("rdrnd")
+#define __DISABLE_RDRND__
+#endif
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_rdrand64_step (unsigned long long *__P)
+{
+  return __builtin_ia32_rdrand64_step (__P);
+}
+#ifdef __DISABLE_RDRND__
+#undef __DISABLE_RDRND__
+#pragma GCC pop_options
+#endif
+#endif
+#ifndef __PTWRITE__
+#pragma GCC push_options
+#pragma GCC target("ptwrite")
+#define __DISABLE_PTWRITE__
+#endif
+#ifdef __x86_64__
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_ptwrite64 (unsigned long long __B)
+{
+  __builtin_ia32_ptwrite64 (__B);
+}
+#endif
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_ptwrite32 (unsigned __B)
+{
+  __builtin_ia32_ptwrite32 (__B);
+}
+#ifdef __DISABLE_PTWRITE__
+#undef __DISABLE_PTWRITE__
+#pragma GCC pop_options
+#endif
+#endif
+#endif
+#endif
diff --git a/third_party/intel/x86intrin.internal.h b/third_party/intel/x86intrin.internal.h
index 1a78b7005..14b142ffb 100644
--- a/third_party/intel/x86intrin.internal.h
+++ b/third_party/intel/x86intrin.internal.h
@@ -1,21 +1,13 @@
+/* clang-format off */
+#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0)
 #ifndef _X86INTRIN_H_INCLUDED
 #define _X86INTRIN_H_INCLUDED
-#ifdef __x86_64__
-#include "third_party/intel/ia32intrin.internal.h"
-
+#include "third_party/intel/x86gprintrin.internal.h"
 #ifndef __iamcu__
-/* clang-format off */
 #include "third_party/intel/immintrin.internal.h"
 #include "third_party/intel/mm3dnow.internal.h"
 #include "third_party/intel/fma4intrin.internal.h"
 #include "third_party/intel/xopintrin.internal.h"
-#include "third_party/intel/lwpintrin.internal.h"
-#include "third_party/intel/tbmintrin.internal.h"
-#include "third_party/intel/popcntintrin.internal.h"
-#include "third_party/intel/mwaitxintrin.internal.h"
-#include "third_party/intel/clzerointrin.internal.h"
-/* clang-format on */
-#endif /* __iamcu__ */
-
-#endif /* __x86_64__ */
-#endif /* _X86INTRIN_H_INCLUDED */
+#endif
+#endif
+#endif
diff --git a/third_party/intel/xmmintrin.internal.h b/third_party/intel/xmmintrin.internal.h
index 909a1e3a2..dd8367730 100644
--- a/third_party/intel/xmmintrin.internal.h
+++ b/third_party/intel/xmmintrin.internal.h
@@ -1,9 +1,9 @@
+/* clang-format off */
+#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0)
 #ifndef _XMMINTRIN_H_INCLUDED
 #define _XMMINTRIN_H_INCLUDED
-#ifdef __x86_64__
 #include "third_party/intel/mm_malloc.internal.h"
 #include "third_party/intel/mmintrin.internal.h"
-
 enum _mm_hint {
   _MM_HINT_ET0 = 7,
   _MM_HINT_ET1 = 6,
@@ -12,384 +12,452 @@ enum _mm_hint {
   _MM_HINT_T2 = 1,
   _MM_HINT_NTA = 0
 };
-
 #ifdef __OPTIMIZE__
-__funline void _mm_prefetch(const void *__P, enum _mm_hint __I) {
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_prefetch(const void *__P, enum _mm_hint __I) {
   __builtin_prefetch(__P, (__I & 0x4) >> 2, __I & 0x3);
 }
 #else
 #define _mm_prefetch(P, I) __builtin_prefetch((P), ((I & 0x4) >> 2), (I & 0x3))
 #endif
-
 #ifndef __SSE__
 #pragma GCC push_options
 #pragma GCC target("sse")
 #define __DISABLE_SSE__
-#endif /* __SSE__ */
-
+#endif
 typedef float __m128 __attribute__((__vector_size__(16), __may_alias__));
-
 typedef float __m128_u
     __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
-
 typedef float __v4sf __attribute__((__vector_size__(16)));
-
 #define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
   (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
-
-#define _MM_EXCEPT_MASK 0x003f
-#define _MM_EXCEPT_INVALID 0x0001
-#define _MM_EXCEPT_DENORM 0x0002
-#define _MM_EXCEPT_DIV_ZERO 0x0004
-#define _MM_EXCEPT_OVERFLOW 0x0008
-#define _MM_EXCEPT_UNDERFLOW 0x0010
-#define _MM_EXCEPT_INEXACT 0x0020
-
-#define _MM_MASK_MASK 0x1f80
-#define _MM_MASK_INVALID 0x0080
-#define _MM_MASK_DENORM 0x0100
-#define _MM_MASK_DIV_ZERO 0x0200
-#define _MM_MASK_OVERFLOW 0x0400
-#define _MM_MASK_UNDERFLOW 0x0800
-#define _MM_MASK_INEXACT 0x1000
-
+#define _MM_EXCEPT_MASK 0x003f
+#define _MM_EXCEPT_INVALID 0x0001
+#define _MM_EXCEPT_DENORM 0x0002
+#define _MM_EXCEPT_DIV_ZERO 0x0004
+#define _MM_EXCEPT_OVERFLOW 0x0008
+#define _MM_EXCEPT_UNDERFLOW 0x0010
+#define _MM_EXCEPT_INEXACT 0x0020
+#define _MM_MASK_MASK 0x1f80
+#define _MM_MASK_INVALID 0x0080
+#define _MM_MASK_DENORM 0x0100
+#define _MM_MASK_DIV_ZERO 0x0200
+#define _MM_MASK_OVERFLOW 0x0400
+#define _MM_MASK_UNDERFLOW 0x0800
+#define _MM_MASK_INEXACT 0x1000
 #define _MM_ROUND_MASK 0x6000
 #define _MM_ROUND_NEAREST 0x0000
 #define _MM_ROUND_DOWN 0x2000
 #define _MM_ROUND_UP 0x4000
 #define _MM_ROUND_TOWARD_ZERO 0x6000
-
-#define _MM_FLUSH_ZERO_MASK 0x8000
-#define _MM_FLUSH_ZERO_ON 0x8000
-#define _MM_FLUSH_ZERO_OFF 0x0000
-
-__funline __m128 _mm_undefined_ps(void) {
+#define _MM_FLUSH_ZERO_MASK 0x8000
+#define _MM_FLUSH_ZERO_ON 0x8000
+#define _MM_FLUSH_ZERO_OFF 0x0000
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_undefined_ps(void) {
   __m128 __Y = __Y;
   return __Y;
 }
-
-__funline __m128 _mm_setzero_ps(void) {
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_setzero_ps(void) {
   return __extension__(__m128){0.0f, 0.0f, 0.0f, 0.0f};
 }
-
-__funline __m128 _mm_add_ss(__m128 __A, __m128 __B) {
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_add_ss(__m128 __A, __m128 __B) {
  return (__m128)__builtin_ia32_addss((__v4sf)__A, (__v4sf)__B);
 }
-
-__funline __m128 _mm_sub_ss(__m128 __A, __m128 __B) {
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_sub_ss(__m128 __A, __m128 __B) {
   return (__m128)__builtin_ia32_subss((__v4sf)__A, (__v4sf)__B);
 }
-
-__funline __m128 _mm_mul_ss(__m128 __A, __m128 __B) {
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mul_ss(__m128 __A, __m128 __B) {
  return (__m128)__builtin_ia32_mulss((__v4sf)__A, (__v4sf)__B);
 }
-
-__funline __m128 _mm_div_ss(__m128 __A, __m128 __B) {
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_div_ss(__m128 __A, __m128 __B) {
  return (__m128)__builtin_ia32_divss((__v4sf)__A, (__v4sf)__B);
 }
-
-__funline __m128 _mm_sqrt_ss(__m128 __A) {
+extern __inline __m128
+
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sqrt_ss(__m128 __A) { return (__m128)__builtin_ia32_sqrtss((__v4sf)__A); } - -__funline __m128 _mm_rcp_ss(__m128 __A) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_rcp_ss(__m128 __A) { return (__m128)__builtin_ia32_rcpss((__v4sf)__A); } - -__funline __m128 _mm_rsqrt_ss(__m128 __A) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_rsqrt_ss(__m128 __A) { return (__m128)__builtin_ia32_rsqrtss((__v4sf)__A); } - -__funline __m128 _mm_min_ss(__m128 __A, __m128 __B) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_min_ss(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_minss((__v4sf)__A, (__v4sf)__B); } - -__funline __m128 _mm_max_ss(__m128 __A, __m128 __B) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_max_ss(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_maxss((__v4sf)__A, (__v4sf)__B); } - -__funline __m128 _mm_add_ps(__m128 __A, __m128 __B) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_add_ps(__m128 __A, __m128 __B) { return (__m128)((__v4sf)__A + (__v4sf)__B); } - -__funline __m128 _mm_sub_ps(__m128 __A, __m128 __B) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sub_ps(__m128 __A, __m128 __B) { return (__m128)((__v4sf)__A - (__v4sf)__B); } - -__funline __m128 _mm_mul_ps(__m128 __A, __m128 __B) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mul_ps(__m128 __A, __m128 __B) { return (__m128)((__v4sf)__A * (__v4sf)__B); } - -__funline __m128 _mm_div_ps(__m128 __A, __m128 __B) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_div_ps(__m128 __A, __m128 __B) { return (__m128)((__v4sf)__A / (__v4sf)__B); } - -__funline __m128 _mm_sqrt_ps(__m128 __A) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sqrt_ps(__m128 __A) { return (__m128)__builtin_ia32_sqrtps((__v4sf)__A); } - -__funline __m128 _mm_rcp_ps(__m128 __A) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_rcp_ps(__m128 __A) { return (__m128)__builtin_ia32_rcpps((__v4sf)__A); } - -__funline __m128 _mm_rsqrt_ps(__m128 __A) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_rsqrt_ps(__m128 __A) { return (__m128)__builtin_ia32_rsqrtps((__v4sf)__A); } - -__funline __m128 _mm_min_ps(__m128 __A, __m128 __B) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_min_ps(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_minps((__v4sf)__A, (__v4sf)__B); } - -__funline __m128 _mm_max_ps(__m128 __A, __m128 __B) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_max_ps(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_maxps((__v4sf)__A, (__v4sf)__B); } - -__funline __m128 _mm_and_ps(__m128 __A, __m128 __B) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_and_ps(__m128 __A, __m128 __B) { return __builtin_ia32_andps(__A, __B); } - -__funline __m128 _mm_andnot_ps(__m128 __A, __m128 __B) { +extern __inline __m128 + __attribute__((__gnu_inline__, 
__always_inline__, __artificial__)) + _mm_andnot_ps(__m128 __A, __m128 __B) { return __builtin_ia32_andnps(__A, __B); } - -__funline __m128 _mm_or_ps(__m128 __A, __m128 __B) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_or_ps(__m128 __A, __m128 __B) { return __builtin_ia32_orps(__A, __B); } - -__funline __m128 _mm_xor_ps(__m128 __A, __m128 __B) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_xor_ps(__m128 __A, __m128 __B) { return __builtin_ia32_xorps(__A, __B); } - -__funline __m128 _mm_cmpeq_ss(__m128 __A, __m128 __B) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpeq_ss(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_cmpeqss((__v4sf)__A, (__v4sf)__B); } - -__funline __m128 _mm_cmplt_ss(__m128 __A, __m128 __B) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmplt_ss(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_cmpltss((__v4sf)__A, (__v4sf)__B); } - -__funline __m128 _mm_cmple_ss(__m128 __A, __m128 __B) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmple_ss(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_cmpless((__v4sf)__A, (__v4sf)__B); } - -__funline __m128 _mm_cmpgt_ss(__m128 __A, __m128 __B) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpgt_ss(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_movss( (__v4sf)__A, (__v4sf)__builtin_ia32_cmpltss((__v4sf)__B, (__v4sf)__A)); } - -__funline __m128 _mm_cmpge_ss(__m128 __A, __m128 __B) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpge_ss(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_movss( (__v4sf)__A, (__v4sf)__builtin_ia32_cmpless((__v4sf)__B, (__v4sf)__A)); } - -__funline __m128 _mm_cmpneq_ss(__m128 __A, __m128 __B) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpneq_ss(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_cmpneqss((__v4sf)__A, (__v4sf)__B); } - -__funline __m128 _mm_cmpnlt_ss(__m128 __A, __m128 __B) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpnlt_ss(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_cmpnltss((__v4sf)__A, (__v4sf)__B); } - -__funline __m128 _mm_cmpnle_ss(__m128 __A, __m128 __B) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpnle_ss(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_cmpnless((__v4sf)__A, (__v4sf)__B); } - -__funline __m128 _mm_cmpngt_ss(__m128 __A, __m128 __B) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpngt_ss(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_movss( (__v4sf)__A, (__v4sf)__builtin_ia32_cmpnltss((__v4sf)__B, (__v4sf)__A)); } - -__funline __m128 _mm_cmpnge_ss(__m128 __A, __m128 __B) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpnge_ss(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_movss( (__v4sf)__A, (__v4sf)__builtin_ia32_cmpnless((__v4sf)__B, (__v4sf)__A)); } - -__funline __m128 _mm_cmpord_ss(__m128 __A, __m128 __B) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpord_ss(__m128 __A, 
__m128 __B) { return (__m128)__builtin_ia32_cmpordss((__v4sf)__A, (__v4sf)__B); } - -__funline __m128 _mm_cmpunord_ss(__m128 __A, __m128 __B) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpunord_ss(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_cmpunordss((__v4sf)__A, (__v4sf)__B); } - -__funline __m128 _mm_cmpeq_ps(__m128 __A, __m128 __B) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpeq_ps(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_cmpeqps((__v4sf)__A, (__v4sf)__B); } - -__funline __m128 _mm_cmplt_ps(__m128 __A, __m128 __B) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmplt_ps(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_cmpltps((__v4sf)__A, (__v4sf)__B); } - -__funline __m128 _mm_cmple_ps(__m128 __A, __m128 __B) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmple_ps(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_cmpleps((__v4sf)__A, (__v4sf)__B); } - -__funline __m128 _mm_cmpgt_ps(__m128 __A, __m128 __B) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpgt_ps(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_cmpgtps((__v4sf)__A, (__v4sf)__B); } - -__funline __m128 _mm_cmpge_ps(__m128 __A, __m128 __B) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpge_ps(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_cmpgeps((__v4sf)__A, (__v4sf)__B); } - -__funline __m128 _mm_cmpneq_ps(__m128 __A, __m128 __B) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpneq_ps(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_cmpneqps((__v4sf)__A, (__v4sf)__B); } - -__funline __m128 _mm_cmpnlt_ps(__m128 __A, __m128 __B) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpnlt_ps(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_cmpnltps((__v4sf)__A, (__v4sf)__B); } - -__funline __m128 _mm_cmpnle_ps(__m128 __A, __m128 __B) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpnle_ps(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_cmpnleps((__v4sf)__A, (__v4sf)__B); } - -__funline __m128 _mm_cmpngt_ps(__m128 __A, __m128 __B) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpngt_ps(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_cmpngtps((__v4sf)__A, (__v4sf)__B); } - -__funline __m128 _mm_cmpnge_ps(__m128 __A, __m128 __B) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpnge_ps(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_cmpngeps((__v4sf)__A, (__v4sf)__B); } - -__funline __m128 _mm_cmpord_ps(__m128 __A, __m128 __B) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpord_ps(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_cmpordps((__v4sf)__A, (__v4sf)__B); } - -__funline __m128 _mm_cmpunord_ps(__m128 __A, __m128 __B) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpunord_ps(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_cmpunordps((__v4sf)__A, (__v4sf)__B); } - -__funline int _mm_comieq_ss(__m128 __A, __m128 __B) { 
+extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comieq_ss(__m128 __A, __m128 __B) { return __builtin_ia32_comieq((__v4sf)__A, (__v4sf)__B); } - -__funline int _mm_comilt_ss(__m128 __A, __m128 __B) { +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comilt_ss(__m128 __A, __m128 __B) { return __builtin_ia32_comilt((__v4sf)__A, (__v4sf)__B); } - -__funline int _mm_comile_ss(__m128 __A, __m128 __B) { +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comile_ss(__m128 __A, __m128 __B) { return __builtin_ia32_comile((__v4sf)__A, (__v4sf)__B); } - -__funline int _mm_comigt_ss(__m128 __A, __m128 __B) { +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comigt_ss(__m128 __A, __m128 __B) { return __builtin_ia32_comigt((__v4sf)__A, (__v4sf)__B); } - -__funline int _mm_comige_ss(__m128 __A, __m128 __B) { +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comige_ss(__m128 __A, __m128 __B) { return __builtin_ia32_comige((__v4sf)__A, (__v4sf)__B); } - -__funline int _mm_comineq_ss(__m128 __A, __m128 __B) { +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comineq_ss(__m128 __A, __m128 __B) { return __builtin_ia32_comineq((__v4sf)__A, (__v4sf)__B); } - -__funline int _mm_ucomieq_ss(__m128 __A, __m128 __B) { +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_ucomieq_ss(__m128 __A, __m128 __B) { return __builtin_ia32_ucomieq((__v4sf)__A, (__v4sf)__B); } - -__funline int _mm_ucomilt_ss(__m128 __A, __m128 __B) { +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_ucomilt_ss(__m128 __A, __m128 __B) { return __builtin_ia32_ucomilt((__v4sf)__A, (__v4sf)__B); } - -__funline int _mm_ucomile_ss(__m128 __A, __m128 __B) { +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_ucomile_ss(__m128 __A, __m128 __B) { return __builtin_ia32_ucomile((__v4sf)__A, (__v4sf)__B); } - -__funline int _mm_ucomigt_ss(__m128 __A, __m128 __B) { +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_ucomigt_ss(__m128 __A, __m128 __B) { return __builtin_ia32_ucomigt((__v4sf)__A, (__v4sf)__B); } - -__funline int _mm_ucomige_ss(__m128 __A, __m128 __B) { +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_ucomige_ss(__m128 __A, __m128 __B) { return __builtin_ia32_ucomige((__v4sf)__A, (__v4sf)__B); } - -__funline int _mm_ucomineq_ss(__m128 __A, __m128 __B) { +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_ucomineq_ss(__m128 __A, __m128 __B) { return __builtin_ia32_ucomineq((__v4sf)__A, (__v4sf)__B); } - -__funline int _mm_cvtss_si32(__m128 __A) { +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtss_si32(__m128 __A) { return __builtin_ia32_cvtss2si((__v4sf)__A); } - -__funline int _mm_cvt_ss2si(__m128 __A) { +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvt_ss2si(__m128 __A) { return _mm_cvtss_si32(__A); } - #ifdef __x86_64__ - -__funline long long _mm_cvtss_si64(__m128 __A) { +extern __inline long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtss_si64(__m128 __A) { return 
__builtin_ia32_cvtss2si64((__v4sf)__A); } - -__funline long long _mm_cvtss_si64x(__m128 __A) { +extern __inline long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtss_si64x(__m128 __A) { return __builtin_ia32_cvtss2si64((__v4sf)__A); } #endif - -__funline __m64 _mm_cvtps_pi32(__m128 __A) { +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtps_pi32(__m128 __A) { return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__A); } - -__funline __m64 _mm_cvt_ps2pi(__m128 __A) { +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvt_ps2pi(__m128 __A) { return _mm_cvtps_pi32(__A); } - -__funline int _mm_cvttss_si32(__m128 __A) { +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvttss_si32(__m128 __A) { return __builtin_ia32_cvttss2si((__v4sf)__A); } - -__funline int _mm_cvtt_ss2si(__m128 __A) { +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtt_ss2si(__m128 __A) { return _mm_cvttss_si32(__A); } - #ifdef __x86_64__ - -__funline long long _mm_cvttss_si64(__m128 __A) { +extern __inline long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvttss_si64(__m128 __A) { return __builtin_ia32_cvttss2si64((__v4sf)__A); } - -__funline long long _mm_cvttss_si64x(__m128 __A) { +extern __inline long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvttss_si64x(__m128 __A) { return __builtin_ia32_cvttss2si64((__v4sf)__A); } #endif - -__funline __m64 _mm_cvttps_pi32(__m128 __A) { +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvttps_pi32(__m128 __A) { return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__A); } - -__funline __m64 _mm_cvtt_ps2pi(__m128 __A) { +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtt_ps2pi(__m128 __A) { return _mm_cvttps_pi32(__A); } - -__funline __m128 _mm_cvtsi32_ss(__m128 __A, int __B) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtsi32_ss(__m128 __A, int __B) { return (__m128)__builtin_ia32_cvtsi2ss((__v4sf)__A, __B); } - -__funline __m128 _mm_cvt_si2ss(__m128 __A, int __B) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvt_si2ss(__m128 __A, int __B) { return _mm_cvtsi32_ss(__A, __B); } - #ifdef __x86_64__ - -__funline __m128 _mm_cvtsi64_ss(__m128 __A, long long __B) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtsi64_ss(__m128 __A, long long __B) { return (__m128)__builtin_ia32_cvtsi642ss((__v4sf)__A, __B); } - -__funline __m128 _mm_cvtsi64x_ss(__m128 __A, long long __B) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtsi64x_ss(__m128 __A, long long __B) { return (__m128)__builtin_ia32_cvtsi642ss((__v4sf)__A, __B); } #endif - -__funline __m128 _mm_cvtpi32_ps(__m128 __A, __m64 __B) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtpi32_ps(__m128 __A, __m64 __B) { return (__m128)__builtin_ia32_cvtpi2ps((__v4sf)__A, (__v2si)__B); } - -__funline __m128 _mm_cvt_pi2ps(__m128 __A, __m64 __B) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvt_pi2ps(__m128 __A, __m64 __B) { return 
_mm_cvtpi32_ps(__A, __B); } - -__funline __m128 _mm_cvtpi16_ps(__m64 __A) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtpi16_ps(__m64 __A) { __v4hi __sign; __v2si __hisi, __losi; __v4sf __zero, __ra, __rb; @@ -401,8 +469,9 @@ __funline __m128 _mm_cvtpi16_ps(__m64 __A) { __rb = __builtin_ia32_cvtpi2ps(__ra, __hisi); return (__m128)__builtin_ia32_movlhps(__ra, __rb); } - -__funline __m128 _mm_cvtpu16_ps(__m64 __A) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtpu16_ps(__m64 __A) { __v2si __hisi, __losi; __v4sf __zero, __ra, __rb; __losi = (__v2si)__builtin_ia32_punpcklwd((__v4hi)__A, (__v4hi)0LL); @@ -412,44 +481,47 @@ __funline __m128 _mm_cvtpu16_ps(__m64 __A) { __rb = __builtin_ia32_cvtpi2ps(__ra, __hisi); return (__m128)__builtin_ia32_movlhps(__ra, __rb); } - -__funline __m128 _mm_cvtpi8_ps(__m64 __A) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtpi8_ps(__m64 __A) { __v8qi __sign; __sign = __builtin_ia32_pcmpgtb((__v8qi)0LL, (__v8qi)__A); __A = (__m64)__builtin_ia32_punpcklbw((__v8qi)__A, __sign); return _mm_cvtpi16_ps(__A); } - -__funline __m128 _mm_cvtpu8_ps(__m64 __A) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtpu8_ps(__m64 __A) { __A = (__m64)__builtin_ia32_punpcklbw((__v8qi)__A, (__v8qi)0LL); return _mm_cvtpu16_ps(__A); } - -__funline __m128 _mm_cvtpi32x2_ps(__m64 __A, __m64 __B) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtpi32x2_ps(__m64 __A, __m64 __B) { __v4sf __zero = (__v4sf)_mm_setzero_ps(); __v4sf __sfa = __builtin_ia32_cvtpi2ps(__zero, (__v2si)__A); __v4sf __sfb = __builtin_ia32_cvtpi2ps(__sfa, (__v2si)__B); return (__m128)__builtin_ia32_movlhps(__sfa, __sfb); } - -__funline __m64 _mm_cvtps_pi16(__m128 __A) { +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtps_pi16(__m128 __A) { __v4sf __hisf = (__v4sf)__A; __v4sf __losf = __builtin_ia32_movhlps(__hisf, __hisf); __v2si __hisi = __builtin_ia32_cvtps2pi(__hisf); __v2si __losi = __builtin_ia32_cvtps2pi(__losf); return (__m64)__builtin_ia32_packssdw(__hisi, __losi); } - -__funline __m64 _mm_cvtps_pi8(__m128 __A) { +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtps_pi8(__m128 __A) { __v4hi __tmp = (__v4hi)_mm_cvtps_pi16(__A); return (__m64)__builtin_ia32_packsswb(__tmp, (__v4hi)0LL); } - #ifdef __OPTIMIZE__ -__funline __m128 _mm_shuffle_ps(__m128 __A, __m128 __B, int const __mask) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_shuffle_ps(__m128 __A, __m128 __B, int const __mask) { return (__m128)__builtin_ia32_shufps((__v4sf)__A, (__v4sf)__B, __mask); } #else @@ -457,304 +529,416 @@ __funline __m128 _mm_shuffle_ps(__m128 __A, __m128 __B, int const __mask) { ((__m128)__builtin_ia32_shufps((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \ (int)(MASK))) #endif - -__funline __m128 _mm_unpackhi_ps(__m128 __A, __m128 __B) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_unpackhi_ps(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_unpckhps((__v4sf)__A, (__v4sf)__B); } - -__funline __m128 _mm_unpacklo_ps(__m128 __A, __m128 __B) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +
_mm_unpacklo_ps(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_unpcklps((__v4sf)__A, (__v4sf)__B); } - -__funline __m128 _mm_loadh_pi(__m128 __A, __m64 const *__P) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_loadh_pi(__m128 __A, __m64 const *__P) { return (__m128)__builtin_ia32_loadhps((__v4sf)__A, (const __v2sf *)__P); } - -__funline void _mm_storeh_pi(__m64 *__P, __m128 __A) { +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_storeh_pi(__m64 *__P, __m128 __A) { __builtin_ia32_storehps((__v2sf *)__P, (__v4sf)__A); } - -__funline __m128 _mm_movehl_ps(__m128 __A, __m128 __B) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_movehl_ps(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_movhlps((__v4sf)__A, (__v4sf)__B); } - -__funline __m128 _mm_movelh_ps(__m128 __A, __m128 __B) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_movelh_ps(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_movlhps((__v4sf)__A, (__v4sf)__B); } - -__funline __m128 _mm_loadl_pi(__m128 __A, __m64 const *__P) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_loadl_pi(__m128 __A, __m64 const *__P) { return (__m128)__builtin_ia32_loadlps((__v4sf)__A, (const __v2sf *)__P); } - -__funline void _mm_storel_pi(__m64 *__P, __m128 __A) { +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_storel_pi(__m64 *__P, __m128 __A) { __builtin_ia32_storelps((__v2sf *)__P, (__v4sf)__A); } - -__funline int _mm_movemask_ps(__m128 __A) { +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_movemask_ps(__m128 __A) { return __builtin_ia32_movmskps((__v4sf)__A); } - -__funline unsigned int _mm_getcsr(void) { +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_getcsr(void) { return __builtin_ia32_stmxcsr(); } - -__funline unsigned int _MM_GET_EXCEPTION_STATE(void) { +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _MM_GET_EXCEPTION_STATE(void) { return _mm_getcsr() & _MM_EXCEPT_MASK; } - -__funline unsigned int _MM_GET_EXCEPTION_MASK(void) { +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _MM_GET_EXCEPTION_MASK(void) { return _mm_getcsr() & _MM_MASK_MASK; } - -__funline unsigned int _MM_GET_ROUNDING_MODE(void) { +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _MM_GET_ROUNDING_MODE(void) { return _mm_getcsr() & _MM_ROUND_MASK; } - -__funline unsigned int _MM_GET_FLUSH_ZERO_MODE(void) { +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _MM_GET_FLUSH_ZERO_MODE(void) { return _mm_getcsr() & _MM_FLUSH_ZERO_MASK; } - -__funline void _mm_setcsr(unsigned int __I) { +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_setcsr(unsigned int __I) { __builtin_ia32_ldmxcsr(__I); } - -__funline void _MM_SET_EXCEPTION_STATE(unsigned int __mask) { +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _MM_SET_EXCEPTION_STATE(unsigned int __mask) { _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | __mask); } - -__funline void _MM_SET_EXCEPTION_MASK(unsigned int __mask) 
{ +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _MM_SET_EXCEPTION_MASK(unsigned int __mask) { _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | __mask); } - -__funline void _MM_SET_ROUNDING_MODE(unsigned int __mode) { +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _MM_SET_ROUNDING_MODE(unsigned int __mode) { _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | __mode); } - -__funline void _MM_SET_FLUSH_ZERO_MODE(unsigned int __mode) { +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _MM_SET_FLUSH_ZERO_MODE(unsigned int __mode) { _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | __mode); } - -__funline __m128 _mm_set_ss(float __F) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_set_ss(float __F) { return __extension__(__m128)(__v4sf){__F, 0.0f, 0.0f, 0.0f}; } - -__funline __m128 _mm_set1_ps(float __F) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_set1_ps(float __F) { return __extension__(__m128)(__v4sf){__F, __F, __F, __F}; } - -__funline __m128 _mm_set_ps1(float __F) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_set_ps1(float __F) { return _mm_set1_ps(__F); } - -__funline __m128 _mm_load_ss(float const *__P) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_load_ss(float const *__P) { return _mm_set_ss(*__P); } - -__funline __m128 _mm_load1_ps(float const *__P) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_load1_ps(float const *__P) { return _mm_set1_ps(*__P); } - -__funline __m128 _mm_load_ps1(float const *__P) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_load_ps1(float const *__P) { return _mm_load1_ps(__P); } - -__funline __m128 _mm_load_ps(float const *__P) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_load_ps(float const *__P) { return *(__m128 *)__P; } - -__funline __m128 _mm_loadu_ps(float const *__P) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_loadu_ps(float const *__P) { return *(__m128_u *)__P; } - -__funline __m128 _mm_loadr_ps(float const *__P) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_loadr_ps(float const *__P) { __v4sf __tmp = *(__v4sf *)__P; return (__m128)__builtin_ia32_shufps(__tmp, __tmp, _MM_SHUFFLE(0, 1, 2, 3)); } - -__funline __m128 _mm_set_ps(const float __Z, const float __Y, const float __X, - const float __W) { +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm_set_ps(const float __Z, const float __Y, const float __X, const float __W) { return __extension__(__m128)(__v4sf){__W, __X, __Y, __Z}; } - -__funline __m128 _mm_setr_ps(float __Z, float __Y, float __X, float __W) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_setr_ps(float __Z, float __Y, float __X, float __W) { return __extension__(__m128)(__v4sf){__Z, __Y, __X, __W}; } - -__funline void _mm_store_ss(float *__P, __m128 __A) { +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_store_ss(float *__P, __m128 __A) { *__P = ((__v4sf)__A)[0]; } - -__funline 
float _mm_cvtss_f32(__m128 __A) { +extern __inline float + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtss_f32(__m128 __A) { return ((__v4sf)__A)[0]; } - -__funline void _mm_store_ps(float *__P, __m128 __A) { +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_store_ps(float *__P, __m128 __A) { *(__m128 *)__P = __A; } - -__funline void _mm_storeu_ps(float *__P, __m128 __A) { +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_storeu_ps(float *__P, __m128 __A) { *(__m128_u *)__P = __A; } - -__funline void _mm_store1_ps(float *__P, __m128 __A) { +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_store1_ps(float *__P, __m128 __A) { __v4sf __va = (__v4sf)__A; __v4sf __tmp = __builtin_ia32_shufps(__va, __va, _MM_SHUFFLE(0, 0, 0, 0)); _mm_storeu_ps(__P, __tmp); } - -__funline void _mm_store_ps1(float *__P, __m128 __A) { +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_store_ps1(float *__P, __m128 __A) { _mm_store1_ps(__P, __A); } - -__funline void _mm_storer_ps(float *__P, __m128 __A) { +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_storer_ps(float *__P, __m128 __A) { __v4sf __va = (__v4sf)__A; __v4sf __tmp = __builtin_ia32_shufps(__va, __va, _MM_SHUFFLE(0, 1, 2, 3)); _mm_store_ps(__P, __tmp); } - -__funline __m128 _mm_move_ss(__m128 __A, __m128 __B) { +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_move_ss(__m128 __A, __m128 __B) { return (__m128)__builtin_shuffle( (__v4sf)__A, (__v4sf)__B, __extension__(__attribute__((__vector_size__(16))) int){4, 1, 2, 3}); } - #ifdef __OPTIMIZE__ -__funline int _mm_extract_pi16(__m64 const __A, int const __N) { - return __builtin_ia32_vec_ext_v4hi((__v4hi)__A, __N); +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_extract_pi16(__m64 const __A, int const __N) { + return (unsigned short)__builtin_ia32_vec_ext_v4hi((__v4hi)__A, __N); } - -__funline int _m_pextrw(__m64 const __A, int const __N) { +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pextrw(__m64 const __A, int const __N) { return _mm_extract_pi16(__A, __N); } #else -#define _mm_extract_pi16(A, N) \ - ((int)__builtin_ia32_vec_ext_v4hi((__v4hi)(__m64)(A), (int)(N))) - +#define _mm_extract_pi16(A, N) \ + ((int)(unsigned short)__builtin_ia32_vec_ext_v4hi((__v4hi)(__m64)(A), \ + (int)(N))) #define _m_pextrw(A, N) _mm_extract_pi16(A, N) #endif - #ifdef __OPTIMIZE__ -__funline __m64 _mm_insert_pi16(__m64 const __A, int const __D, int const __N) { +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_insert_pi16(__m64 const __A, int const __D, int const __N) { return (__m64)__builtin_ia32_vec_set_v4hi((__v4hi)__A, __D, __N); } - -__funline __m64 _m_pinsrw(__m64 const __A, int const __D, int const __N) { +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pinsrw(__m64 const __A, int const __D, int const __N) { return _mm_insert_pi16(__A, __D, __N); } #else #define _mm_insert_pi16(A, D, N) \ ((__m64)__builtin_ia32_vec_set_v4hi((__v4hi)(__m64)(A), (int)(D), (int)(N))) - #define _m_pinsrw(A, D, N) _mm_insert_pi16(A, D, N) #endif - -__funline __m64 _mm_max_pi16(__m64 __A, __m64 __B) { +extern __inline __m64 + 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_max_pi16(__m64 __A, __m64 __B) { return (__m64)__builtin_ia32_pmaxsw((__v4hi)__A, (__v4hi)__B); } - -__funline __m64 _m_pmaxsw(__m64 __A, __m64 __B) { +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pmaxsw(__m64 __A, __m64 __B) { return _mm_max_pi16(__A, __B); } - -__funline __m64 _mm_max_pu8(__m64 __A, __m64 __B) { +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_max_pu8(__m64 __A, __m64 __B) { return (__m64)__builtin_ia32_pmaxub((__v8qi)__A, (__v8qi)__B); } - -__funline __m64 _m_pmaxub(__m64 __A, __m64 __B) { +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pmaxub(__m64 __A, __m64 __B) { return _mm_max_pu8(__A, __B); } - -__funline __m64 _mm_min_pi16(__m64 __A, __m64 __B) { +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_min_pi16(__m64 __A, __m64 __B) { return (__m64)__builtin_ia32_pminsw((__v4hi)__A, (__v4hi)__B); } - -__funline __m64 _m_pminsw(__m64 __A, __m64 __B) { +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pminsw(__m64 __A, __m64 __B) { return _mm_min_pi16(__A, __B); } - -__funline __m64 _mm_min_pu8(__m64 __A, __m64 __B) { +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_min_pu8(__m64 __A, __m64 __B) { return (__m64)__builtin_ia32_pminub((__v8qi)__A, (__v8qi)__B); } - -__funline __m64 _m_pminub(__m64 __A, __m64 __B) { +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pminub(__m64 __A, __m64 __B) { return _mm_min_pu8(__A, __B); } - -__funline int _mm_movemask_pi8(__m64 __A) { +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_movemask_pi8(__m64 __A) { return __builtin_ia32_pmovmskb((__v8qi)__A); } - -__funline int _m_pmovmskb(__m64 __A) { +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pmovmskb(__m64 __A) { return _mm_movemask_pi8(__A); } - -__funline __m64 _mm_mulhi_pu16(__m64 __A, __m64 __B) { +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mulhi_pu16(__m64 __A, __m64 __B) { return (__m64)__builtin_ia32_pmulhuw((__v4hi)__A, (__v4hi)__B); } - -__funline __m64 _m_pmulhuw(__m64 __A, __m64 __B) { +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pmulhuw(__m64 __A, __m64 __B) { return _mm_mulhi_pu16(__A, __B); } - #ifdef __OPTIMIZE__ -__funline __m64 _mm_shuffle_pi16(__m64 __A, int const __N) { +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_shuffle_pi16(__m64 __A, int const __N) { return (__m64)__builtin_ia32_pshufw((__v4hi)__A, __N); } - -__funline __m64 _m_pshufw(__m64 __A, int const __N) { +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pshufw(__m64 __A, int const __N) { return _mm_shuffle_pi16(__A, __N); } #else #define _mm_shuffle_pi16(A, N) \ ((__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(A), (int)(N))) - #define _m_pshufw(A, N) _mm_shuffle_pi16(A, N) #endif - -__funline void _mm_maskmove_si64(__m64 __A, __m64 __N, char *__P) { +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskmove_si64(__m64 __A, __m64 __N, char *__P) { +#ifdef 
__MMX_WITH_SSE__ + typedef long long __v2di __attribute__((__vector_size__(16))); + typedef char __v16qi __attribute__((__vector_size__(16))); + __v2di __A128 = __extension__(__v2di){((__v1di)__A)[0], 0}; + __v2di __N128 = __extension__(__v2di){((__v1di)__N)[0], 0}; + __SIZE_TYPE__ offset = ((__SIZE_TYPE__)__P) & 0xf; + if (offset) { + if (offset > 8) offset = 8; + __P = (char *)(((__SIZE_TYPE__)__P) - offset); + switch (offset) { + case 1: + __A128 = __builtin_ia32_pslldqi128(__A128, 8); + __N128 = __builtin_ia32_pslldqi128(__N128, 8); + break; + case 2: + __A128 = __builtin_ia32_pslldqi128(__A128, 2 * 8); + __N128 = __builtin_ia32_pslldqi128(__N128, 2 * 8); + break; + case 3: + __A128 = __builtin_ia32_pslldqi128(__A128, 3 * 8); + __N128 = __builtin_ia32_pslldqi128(__N128, 3 * 8); + break; + case 4: + __A128 = __builtin_ia32_pslldqi128(__A128, 4 * 8); + __N128 = __builtin_ia32_pslldqi128(__N128, 4 * 8); + break; + case 5: + __A128 = __builtin_ia32_pslldqi128(__A128, 5 * 8); + __N128 = __builtin_ia32_pslldqi128(__N128, 5 * 8); + break; + case 6: + __A128 = __builtin_ia32_pslldqi128(__A128, 6 * 8); + __N128 = __builtin_ia32_pslldqi128(__N128, 6 * 8); + break; + case 7: + __A128 = __builtin_ia32_pslldqi128(__A128, 7 * 8); + __N128 = __builtin_ia32_pslldqi128(__N128, 7 * 8); + break; + case 8: + __A128 = __builtin_ia32_pslldqi128(__A128, 8 * 8); + __N128 = __builtin_ia32_pslldqi128(__N128, 8 * 8); + break; + default: + break; + } + } + __builtin_ia32_maskmovdqu((__v16qi)__A128, (__v16qi)__N128, __P); +#else __builtin_ia32_maskmovq((__v8qi)__A, (__v8qi)__N, __P); +#endif } - -__funline void _m_maskmovq(__m64 __A, __m64 __N, char *__P) { +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_maskmovq(__m64 __A, __m64 __N, char *__P) { _mm_maskmove_si64(__A, __N, __P); } - -__funline __m64 _mm_avg_pu8(__m64 __A, __m64 __B) { +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_avg_pu8(__m64 __A, __m64 __B) { return (__m64)__builtin_ia32_pavgb((__v8qi)__A, (__v8qi)__B); } - -__funline __m64 _m_pavgb(__m64 __A, __m64 __B) { +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pavgb(__m64 __A, __m64 __B) { return _mm_avg_pu8(__A, __B); } - -__funline __m64 _mm_avg_pu16(__m64 __A, __m64 __B) { +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_avg_pu16(__m64 __A, __m64 __B) { return (__m64)__builtin_ia32_pavgw((__v4hi)__A, (__v4hi)__B); } - -__funline __m64 _m_pavgw(__m64 __A, __m64 __B) { +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pavgw(__m64 __A, __m64 __B) { return _mm_avg_pu16(__A, __B); } - -__funline __m64 _mm_sad_pu8(__m64 __A, __m64 __B) { +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sad_pu8(__m64 __A, __m64 __B) { return (__m64)__builtin_ia32_psadbw((__v8qi)__A, (__v8qi)__B); } - -__funline __m64 _m_psadbw(__m64 __A, __m64 __B) { +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_psadbw(__m64 __A, __m64 __B) { return _mm_sad_pu8(__A, __B); } - -__funline void _mm_stream_pi(__m64 *__P, __m64 __A) { +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_stream_pi(__m64 *__P, __m64 __A) { __builtin_ia32_movntq((unsigned long long *)__P, (unsigned long long)__A); } - -__funline void _mm_stream_ps(float *__P, __m128 __A) { +extern 
__inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_stream_ps(float *__P, __m128 __A) { __builtin_ia32_movntps(__P, (__v4sf)__A); } - -__funline void _mm_sfence(void) { +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sfence(void) { __builtin_ia32_sfence(); } - #define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ do { \ __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3); \ @@ -767,17 +951,15 @@ __funline void _mm_sfence(void) { (row2) = __builtin_ia32_movlhps(__t2, __t3); \ (row3) = __builtin_ia32_movhlps(__t3, __t2); \ } while (0) - #include "third_party/intel/emmintrin.internal.h" - #ifdef __DISABLE_SSE__ #undef __DISABLE_SSE__ #pragma GCC pop_options -#endif /* __DISABLE_SSE__ */ - -__funline void _mm_pause(void) { +#endif +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_pause(void) { __builtin_ia32_pause(); } - -#endif /* __x86_64__ */ -#endif /* _XMMINTRIN_H_INCLUDED */ +#endif +#endif diff --git a/third_party/intel/xopintrin.internal.h b/third_party/intel/xopintrin.internal.h index 25feaabda..36f0d8d96 100644 --- a/third_party/intel/xopintrin.internal.h +++ b/third_party/intel/xopintrin.internal.h @@ -1,558 +1,649 @@ +/* clang-format off */ +#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0) #ifndef _X86INTRIN_H_INCLUDED -#error "Never use <xopintrin.h> directly; include <x86intrin.h> instead." +# error "Never use <xopintrin.h> directly; include <x86intrin.h> instead." #endif - #ifndef _XOPMMINTRIN_H_INCLUDED #define _XOPMMINTRIN_H_INCLUDED - #include "third_party/intel/fma4intrin.internal.h" - #ifndef __XOP__ #pragma GCC push_options #pragma GCC target("xop") #define __DISABLE_XOP__ -#endif /* __XOP__ */ - -__funline __m128i _mm_maccs_epi16(__m128i __A, __m128i __B, __m128i __C) { - return (__m128i)__builtin_ia32_vpmacssww((__v8hi)__A, (__v8hi)__B, - (__v8hi)__C); -} - -__funline __m128i _mm_macc_epi16(__m128i __A, __m128i __B, __m128i __C) { - return (__m128i)__builtin_ia32_vpmacsww((__v8hi)__A, (__v8hi)__B, - (__v8hi)__C); -} - -__funline __m128i _mm_maccsd_epi16(__m128i __A, __m128i __B, __m128i __C) { - return (__m128i)__builtin_ia32_vpmacsswd((__v8hi)__A, (__v8hi)__B, - (__v4si)__C); -} - -__funline __m128i _mm_maccd_epi16(__m128i __A, __m128i __B, __m128i __C) { - return (__m128i)__builtin_ia32_vpmacswd((__v8hi)__A, (__v8hi)__B, - (__v4si)__C); -} - -__funline __m128i _mm_maccs_epi32(__m128i __A, __m128i __B, __m128i __C) { - return (__m128i)__builtin_ia32_vpmacssdd((__v4si)__A, (__v4si)__B, - (__v4si)__C); -} - -__funline __m128i _mm_macc_epi32(__m128i __A, __m128i __B, __m128i __C) { - return (__m128i)__builtin_ia32_vpmacsdd((__v4si)__A, (__v4si)__B, - (__v4si)__C); -} - -__funline __m128i _mm_maccslo_epi32(__m128i __A, __m128i __B, __m128i __C) { - return (__m128i)__builtin_ia32_vpmacssdql((__v4si)__A, (__v4si)__B, - (__v2di)__C); -} - -__funline __m128i _mm_macclo_epi32(__m128i __A, __m128i __B, __m128i __C) { - return (__m128i)__builtin_ia32_vpmacsdql((__v4si)__A, (__v4si)__B, - (__v2di)__C); -} - -__funline __m128i _mm_maccshi_epi32(__m128i __A, __m128i __B, __m128i __C) { - return (__m128i)__builtin_ia32_vpmacssdqh((__v4si)__A, (__v4si)__B, - (__v2di)__C); -} - -__funline __m128i _mm_macchi_epi32(__m128i __A, __m128i __B, __m128i __C) { - return (__m128i)__builtin_ia32_vpmacsdqh((__v4si)__A, (__v4si)__B, - (__v2di)__C); -} - -__funline __m128i _mm_maddsd_epi16(__m128i __A, __m128i __B, __m128i __C) { - return (__m128i)__builtin_ia32_vpmadcsswd((__v8hi)__A, (__v8hi)__B, -
(__v4si)__C); -} - -__funline __m128i _mm_maddd_epi16(__m128i __A, __m128i __B, __m128i __C) { - return (__m128i)__builtin_ia32_vpmadcswd((__v8hi)__A, (__v8hi)__B, - (__v4si)__C); -} - -/* Packed Integer Horizontal Add and Subtract */ -__funline __m128i _mm_haddw_epi8(__m128i __A) { - return (__m128i)__builtin_ia32_vphaddbw((__v16qi)__A); -} - -__funline __m128i _mm_haddd_epi8(__m128i __A) { - return (__m128i)__builtin_ia32_vphaddbd((__v16qi)__A); -} - -__funline __m128i _mm_haddq_epi8(__m128i __A) { - return (__m128i)__builtin_ia32_vphaddbq((__v16qi)__A); -} - -__funline __m128i _mm_haddd_epi16(__m128i __A) { - return (__m128i)__builtin_ia32_vphaddwd((__v8hi)__A); -} - -__funline __m128i _mm_haddq_epi16(__m128i __A) { - return (__m128i)__builtin_ia32_vphaddwq((__v8hi)__A); -} - -__funline __m128i _mm_haddq_epi32(__m128i __A) { - return (__m128i)__builtin_ia32_vphadddq((__v4si)__A); -} - -__funline __m128i _mm_haddw_epu8(__m128i __A) { - return (__m128i)__builtin_ia32_vphaddubw((__v16qi)__A); -} - -__funline __m128i _mm_haddd_epu8(__m128i __A) { - return (__m128i)__builtin_ia32_vphaddubd((__v16qi)__A); -} - -__funline __m128i _mm_haddq_epu8(__m128i __A) { - return (__m128i)__builtin_ia32_vphaddubq((__v16qi)__A); -} - -__funline __m128i _mm_haddd_epu16(__m128i __A) { - return (__m128i)__builtin_ia32_vphadduwd((__v8hi)__A); -} - -__funline __m128i _mm_haddq_epu16(__m128i __A) { - return (__m128i)__builtin_ia32_vphadduwq((__v8hi)__A); -} - -__funline __m128i _mm_haddq_epu32(__m128i __A) { - return (__m128i)__builtin_ia32_vphaddudq((__v4si)__A); -} - -__funline __m128i _mm_hsubw_epi8(__m128i __A) { - return (__m128i)__builtin_ia32_vphsubbw((__v16qi)__A); -} - -__funline __m128i _mm_hsubd_epi16(__m128i __A) { - return (__m128i)__builtin_ia32_vphsubwd((__v8hi)__A); -} - -__funline __m128i _mm_hsubq_epi32(__m128i __A) { - return (__m128i)__builtin_ia32_vphsubdq((__v4si)__A); -} - -/* Vector conditional move and permute */ - -__funline __m128i _mm_cmov_si128(__m128i __A, __m128i __B, __m128i __C) { - return (__m128i)__builtin_ia32_vpcmov(__A, __B, __C); -} - -__funline __m128i _mm_perm_epi8(__m128i __A, __m128i __B, __m128i __C) { - return (__m128i)__builtin_ia32_vpperm((__v16qi)__A, (__v16qi)__B, - (__v16qi)__C); -} - -/* Packed Integer Rotates and Shifts - Rotates - Non-Immediate form */ - -__funline __m128i _mm_rot_epi8(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vprotb((__v16qi)__A, (__v16qi)__B); -} - -__funline __m128i _mm_rot_epi16(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vprotw((__v8hi)__A, (__v8hi)__B); -} - -__funline __m128i _mm_rot_epi32(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vprotd((__v4si)__A, (__v4si)__B); -} - -__funline __m128i _mm_rot_epi64(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vprotq((__v2di)__A, (__v2di)__B); -} - -#ifdef __OPTIMIZE__ -__funline __m128i _mm_roti_epi8(__m128i __A, const int __B) { - return (__m128i)__builtin_ia32_vprotbi((__v16qi)__A, __B); -} - -__funline __m128i _mm_roti_epi16(__m128i __A, const int __B) { - return (__m128i)__builtin_ia32_vprotwi((__v8hi)__A, __B); -} - -__funline __m128i _mm_roti_epi32(__m128i __A, const int __B) { - return (__m128i)__builtin_ia32_vprotdi((__v4si)__A, __B); -} - -__funline __m128i _mm_roti_epi64(__m128i __A, const int __B) { - return (__m128i)__builtin_ia32_vprotqi((__v2di)__A, __B); -} -#else -#define _mm_roti_epi8(A, N) \ - ((__m128i)__builtin_ia32_vprotbi((__v16qi)(__m128i)(A), (int)(N))) -#define _mm_roti_epi16(A, N) \ - 
((__m128i)__builtin_ia32_vprotwi((__v8hi)(__m128i)(A), (int)(N))) -#define _mm_roti_epi32(A, N) \ - ((__m128i)__builtin_ia32_vprotdi((__v4si)(__m128i)(A), (int)(N))) -#define _mm_roti_epi64(A, N) \ - ((__m128i)__builtin_ia32_vprotqi((__v2di)(__m128i)(A), (int)(N))) #endif - -__funline __m128i _mm_shl_epi8(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpshlb((__v16qi)__A, (__v16qi)__B); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maccs_epi16(__m128i __A, __m128i __B, __m128i __C) +{ + return (__m128i) __builtin_ia32_vpmacssww ((__v8hi)__A,(__v8hi)__B, (__v8hi)__C); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_macc_epi16(__m128i __A, __m128i __B, __m128i __C) +{ + return (__m128i) __builtin_ia32_vpmacsww ((__v8hi)__A, (__v8hi)__B, (__v8hi)__C); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maccsd_epi16(__m128i __A, __m128i __B, __m128i __C) +{ + return (__m128i) __builtin_ia32_vpmacsswd ((__v8hi)__A, (__v8hi)__B, (__v4si)__C); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maccd_epi16(__m128i __A, __m128i __B, __m128i __C) +{ + return (__m128i) __builtin_ia32_vpmacswd ((__v8hi)__A, (__v8hi)__B, (__v4si)__C); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maccs_epi32(__m128i __A, __m128i __B, __m128i __C) +{ + return (__m128i) __builtin_ia32_vpmacssdd ((__v4si)__A, (__v4si)__B, (__v4si)__C); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_macc_epi32(__m128i __A, __m128i __B, __m128i __C) +{ + return (__m128i) __builtin_ia32_vpmacsdd ((__v4si)__A, (__v4si)__B, (__v4si)__C); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maccslo_epi32(__m128i __A, __m128i __B, __m128i __C) +{ + return (__m128i) __builtin_ia32_vpmacssdql ((__v4si)__A, (__v4si)__B, (__v2di)__C); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_macclo_epi32(__m128i __A, __m128i __B, __m128i __C) +{ + return (__m128i) __builtin_ia32_vpmacsdql ((__v4si)__A, (__v4si)__B, (__v2di)__C); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maccshi_epi32(__m128i __A, __m128i __B, __m128i __C) +{ + return (__m128i) __builtin_ia32_vpmacssdqh ((__v4si)__A, (__v4si)__B, (__v2di)__C); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_macchi_epi32(__m128i __A, __m128i __B, __m128i __C) +{ + return (__m128i) __builtin_ia32_vpmacsdqh ((__v4si)__A, (__v4si)__B, (__v2di)__C); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maddsd_epi16(__m128i __A, __m128i __B, __m128i __C) +{ + return (__m128i) __builtin_ia32_vpmadcsswd ((__v8hi)__A,(__v8hi)__B,(__v4si)__C); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maddd_epi16(__m128i __A, __m128i __B, __m128i __C) +{ + return (__m128i) __builtin_ia32_vpmadcswd ((__v8hi)__A,(__v8hi)__B,(__v4si)__C); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_haddw_epi8(__m128i __A) +{ + return (__m128i) __builtin_ia32_vphaddbw ((__v16qi)__A); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm_haddd_epi8(__m128i __A) +{ + return (__m128i) __builtin_ia32_vphaddbd ((__v16qi)__A); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_haddq_epi8(__m128i __A) +{ + return (__m128i) __builtin_ia32_vphaddbq ((__v16qi)__A); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_haddd_epi16(__m128i __A) +{ + return (__m128i) __builtin_ia32_vphaddwd ((__v8hi)__A); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_haddq_epi16(__m128i __A) +{ + return (__m128i) __builtin_ia32_vphaddwq ((__v8hi)__A); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_haddq_epi32(__m128i __A) +{ + return (__m128i) __builtin_ia32_vphadddq ((__v4si)__A); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_haddw_epu8(__m128i __A) +{ + return (__m128i) __builtin_ia32_vphaddubw ((__v16qi)__A); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_haddd_epu8(__m128i __A) +{ + return (__m128i) __builtin_ia32_vphaddubd ((__v16qi)__A); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_haddq_epu8(__m128i __A) +{ + return (__m128i) __builtin_ia32_vphaddubq ((__v16qi)__A); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_haddd_epu16(__m128i __A) +{ + return (__m128i) __builtin_ia32_vphadduwd ((__v8hi)__A); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_haddq_epu16(__m128i __A) +{ + return (__m128i) __builtin_ia32_vphadduwq ((__v8hi)__A); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_haddq_epu32(__m128i __A) +{ + return (__m128i) __builtin_ia32_vphaddudq ((__v4si)__A); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hsubw_epi8(__m128i __A) +{ + return (__m128i) __builtin_ia32_vphsubbw ((__v16qi)__A); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hsubd_epi16(__m128i __A) +{ + return (__m128i) __builtin_ia32_vphsubwd ((__v8hi)__A); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hsubq_epi32(__m128i __A) +{ + return (__m128i) __builtin_ia32_vphsubdq ((__v4si)__A); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmov_si128(__m128i __A, __m128i __B, __m128i __C) +{ + return (__m128i) __builtin_ia32_vpcmov (__A, __B, __C); +} +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmov_si256(__m256i __A, __m256i __B, __m256i __C) +{ + return (__m256i) __builtin_ia32_vpcmov256 (__A, __B, __C); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_perm_epi8(__m128i __A, __m128i __B, __m128i __C) +{ + return (__m128i) __builtin_ia32_vpperm ((__v16qi)__A, (__v16qi)__B, (__v16qi)__C); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rot_epi8(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vprotb ((__v16qi)__A, (__v16qi)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rot_epi16(__m128i __A, __m128i __B) +{ + return (__m128i) 
__builtin_ia32_vprotw ((__v8hi)__A, (__v8hi)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rot_epi32(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vprotd ((__v4si)__A, (__v4si)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rot_epi64(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vprotq ((__v2di)__A, (__v2di)__B); } - -__funline __m128i _mm_shl_epi16(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpshlw((__v8hi)__A, (__v8hi)__B); -} - -__funline __m128i _mm_shl_epi32(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpshld((__v4si)__A, (__v4si)__B); -} - -__funline __m128i _mm_shl_epi64(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpshlq((__v2di)__A, (__v2di)__B); -} - -__funline __m128i _mm_sha_epi8(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpshab((__v16qi)__A, (__v16qi)__B); -} - -__funline __m128i _mm_sha_epi16(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpshaw((__v8hi)__A, (__v8hi)__B); -} - -__funline __m128i _mm_sha_epi32(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpshad((__v4si)__A, (__v4si)__B); -} - -__funline __m128i _mm_sha_epi64(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpshaq((__v2di)__A, (__v2di)__B); -} - -__funline __m128i _mm_comlt_epu8(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpcomltub((__v16qi)__A, (__v16qi)__B); -} - -__funline __m128i _mm_comle_epu8(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpcomleub((__v16qi)__A, (__v16qi)__B); -} - -__funline __m128i _mm_comgt_epu8(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpcomgtub((__v16qi)__A, (__v16qi)__B); -} - -__funline __m128i _mm_comge_epu8(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpcomgeub((__v16qi)__A, (__v16qi)__B); -} - -__funline __m128i _mm_comeq_epu8(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpcomequb((__v16qi)__A, (__v16qi)__B); -} - -__funline __m128i _mm_comneq_epu8(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpcomnequb((__v16qi)__A, (__v16qi)__B); -} - -__funline __m128i _mm_comfalse_epu8(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpcomfalseub((__v16qi)__A, (__v16qi)__B); -} - -__funline __m128i _mm_comtrue_epu8(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpcomtrueub((__v16qi)__A, (__v16qi)__B); -} - -__funline __m128i _mm_comlt_epu16(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpcomltuw((__v8hi)__A, (__v8hi)__B); -} - -__funline __m128i _mm_comle_epu16(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpcomleuw((__v8hi)__A, (__v8hi)__B); -} - -__funline __m128i _mm_comgt_epu16(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpcomgtuw((__v8hi)__A, (__v8hi)__B); -} - -__funline __m128i _mm_comge_epu16(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpcomgeuw((__v8hi)__A, (__v8hi)__B); -} - -__funline __m128i _mm_comeq_epu16(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpcomequw((__v8hi)__A, (__v8hi)__B); -} - -__funline __m128i _mm_comneq_epu16(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpcomnequw((__v8hi)__A, (__v8hi)__B); -} - -__funline __m128i _mm_comfalse_epu16(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpcomfalseuw((__v8hi)__A, (__v8hi)__B); -} - -__funline __m128i _mm_comtrue_epu16(__m128i __A, 
__m128i __B) { - return (__m128i)__builtin_ia32_vpcomtrueuw((__v8hi)__A, (__v8hi)__B); -} - -__funline __m128i _mm_comlt_epu32(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpcomltud((__v4si)__A, (__v4si)__B); -} - -__funline __m128i _mm_comle_epu32(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpcomleud((__v4si)__A, (__v4si)__B); -} - -__funline __m128i _mm_comgt_epu32(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpcomgtud((__v4si)__A, (__v4si)__B); -} - -__funline __m128i _mm_comge_epu32(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpcomgeud((__v4si)__A, (__v4si)__B); -} - -__funline __m128i _mm_comeq_epu32(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpcomequd((__v4si)__A, (__v4si)__B); -} - -__funline __m128i _mm_comneq_epu32(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpcomnequd((__v4si)__A, (__v4si)__B); -} - -__funline __m128i _mm_comfalse_epu32(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpcomfalseud((__v4si)__A, (__v4si)__B); -} - -__funline __m128i _mm_comtrue_epu32(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpcomtrueud((__v4si)__A, (__v4si)__B); -} - -__funline __m128i _mm_comlt_epu64(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpcomltuq((__v2di)__A, (__v2di)__B); -} - -__funline __m128i _mm_comle_epu64(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpcomleuq((__v2di)__A, (__v2di)__B); -} - -__funline __m128i _mm_comgt_epu64(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpcomgtuq((__v2di)__A, (__v2di)__B); -} - -__funline __m128i _mm_comge_epu64(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpcomgeuq((__v2di)__A, (__v2di)__B); -} - -__funline __m128i _mm_comeq_epu64(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpcomequq((__v2di)__A, (__v2di)__B); -} - -__funline __m128i _mm_comneq_epu64(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpcomnequq((__v2di)__A, (__v2di)__B); -} - -__funline __m128i _mm_comfalse_epu64(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpcomfalseuq((__v2di)__A, (__v2di)__B); -} - -__funline __m128i _mm_comtrue_epu64(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpcomtrueuq((__v2di)__A, (__v2di)__B); -} - -__funline __m128i _mm_comlt_epi8(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpcomltb((__v16qi)__A, (__v16qi)__B); -} - -__funline __m128i _mm_comle_epi8(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpcomleb((__v16qi)__A, (__v16qi)__B); -} - -__funline __m128i _mm_comgt_epi8(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpcomgtb((__v16qi)__A, (__v16qi)__B); -} - -__funline __m128i _mm_comge_epi8(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpcomgeb((__v16qi)__A, (__v16qi)__B); -} - -__funline __m128i _mm_comeq_epi8(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpcomeqb((__v16qi)__A, (__v16qi)__B); -} - -__funline __m128i _mm_comneq_epi8(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpcomneqb((__v16qi)__A, (__v16qi)__B); -} - -__funline __m128i _mm_comfalse_epi8(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpcomfalseb((__v16qi)__A, (__v16qi)__B); -} - -__funline __m128i _mm_comtrue_epi8(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpcomtrueb((__v16qi)__A, (__v16qi)__B); -} - -__funline __m128i _mm_comlt_epi16(__m128i __A, __m128i __B) { - return 
(__m128i)__builtin_ia32_vpcomltw((__v8hi)__A, (__v8hi)__B); -} - -__funline __m128i _mm_comle_epi16(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpcomlew((__v8hi)__A, (__v8hi)__B); -} - -__funline __m128i _mm_comgt_epi16(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpcomgtw((__v8hi)__A, (__v8hi)__B); -} - -__funline __m128i _mm_comge_epi16(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpcomgew((__v8hi)__A, (__v8hi)__B); -} - -__funline __m128i _mm_comeq_epi16(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpcomeqw((__v8hi)__A, (__v8hi)__B); -} - -__funline __m128i _mm_comneq_epi16(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpcomneqw((__v8hi)__A, (__v8hi)__B); -} - -__funline __m128i _mm_comfalse_epi16(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpcomfalsew((__v8hi)__A, (__v8hi)__B); -} - -__funline __m128i _mm_comtrue_epi16(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpcomtruew((__v8hi)__A, (__v8hi)__B); -} - -__funline __m128i _mm_comlt_epi32(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpcomltd((__v4si)__A, (__v4si)__B); -} - -__funline __m128i _mm_comle_epi32(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpcomled((__v4si)__A, (__v4si)__B); -} - -__funline __m128i _mm_comgt_epi32(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpcomgtd((__v4si)__A, (__v4si)__B); -} - -__funline __m128i _mm_comge_epi32(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpcomged((__v4si)__A, (__v4si)__B); -} - -__funline __m128i _mm_comeq_epi32(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpcomeqd((__v4si)__A, (__v4si)__B); -} - -__funline __m128i _mm_comneq_epi32(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpcomneqd((__v4si)__A, (__v4si)__B); -} - -__funline __m128i _mm_comfalse_epi32(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpcomfalsed((__v4si)__A, (__v4si)__B); -} - -__funline __m128i _mm_comtrue_epi32(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpcomtrued((__v4si)__A, (__v4si)__B); -} - -__funline __m128i _mm_comlt_epi64(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpcomltq((__v2di)__A, (__v2di)__B); -} - -__funline __m128i _mm_comle_epi64(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpcomleq((__v2di)__A, (__v2di)__B); -} - -__funline __m128i _mm_comgt_epi64(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpcomgtq((__v2di)__A, (__v2di)__B); -} - -__funline __m128i _mm_comge_epi64(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpcomgeq((__v2di)__A, (__v2di)__B); -} - -__funline __m128i _mm_comeq_epi64(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpcomeqq((__v2di)__A, (__v2di)__B); -} - -__funline __m128i _mm_comneq_epi64(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpcomneqq((__v2di)__A, (__v2di)__B); -} - -__funline __m128i _mm_comfalse_epi64(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpcomfalseq((__v2di)__A, (__v2di)__B); -} - -__funline __m128i _mm_comtrue_epi64(__m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpcomtrueq((__v2di)__A, (__v2di)__B); -} - -__funline __m128 _mm_frcz_ps(__m128 __A) { - return (__m128)__builtin_ia32_vfrczps((__v4sf)__A); -} - -__funline __m128d _mm_frcz_pd(__m128d __A) { - return (__m128d)__builtin_ia32_vfrczpd((__v2df)__A); -} - -__funline __m128 _mm_frcz_ss(__m128 __A, __m128 __B) { - return 
(__m128)__builtin_ia32_movss( - (__v4sf)__A, (__v4sf)__builtin_ia32_vfrczss((__v4sf)__B)); -} - -__funline __m128d _mm_frcz_sd(__m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_movsd( - (__v2df)__A, (__v2df)__builtin_ia32_vfrczsd((__v2df)__B)); -} - -__funline __m256 _mm256_frcz_ps(__m256 __A) { - return (__m256)__builtin_ia32_vfrczps256((__v8sf)__A); -} - -__funline __m256d _mm256_frcz_pd(__m256d __A) { - return (__m256d)__builtin_ia32_vfrczpd256((__v4df)__A); -} - #ifdef __OPTIMIZE__ -__funline __m128d _mm_permute2_pd(__m128d __X, __m128d __Y, __m128i __C, - const int __I) { - return (__m128d)__builtin_ia32_vpermil2pd((__v2df)__X, (__v2df)__Y, - (__v2di)__C, __I); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_roti_epi8(__m128i __A, const int __B) +{ + return (__m128i) __builtin_ia32_vprotbi ((__v16qi)__A, __B); } - -__funline __m256d _mm256_permute2_pd(__m256d __X, __m256d __Y, __m256i __C, - const int __I) { - return (__m256d)__builtin_ia32_vpermil2pd256((__v4df)__X, (__v4df)__Y, - (__v4di)__C, __I); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_roti_epi16(__m128i __A, const int __B) +{ + return (__m128i) __builtin_ia32_vprotwi ((__v8hi)__A, __B); } - -__funline __m128 _mm_permute2_ps(__m128 __X, __m128 __Y, __m128i __C, - const int __I) { - return (__m128)__builtin_ia32_vpermil2ps((__v4sf)__X, (__v4sf)__Y, - (__v4si)__C, __I); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_roti_epi32(__m128i __A, const int __B) +{ + return (__m128i) __builtin_ia32_vprotdi ((__v4si)__A, __B); } - -__funline __m256 _mm256_permute2_ps(__m256 __X, __m256 __Y, __m256i __C, - const int __I) { - return (__m256)__builtin_ia32_vpermil2ps256((__v8sf)__X, (__v8sf)__Y, - (__v8si)__C, __I); +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_roti_epi64(__m128i __A, const int __B) +{ + return (__m128i) __builtin_ia32_vprotqi ((__v2di)__A, __B); } #else -#define _mm_permute2_pd(X, Y, C, I) \ - ((__m128d)__builtin_ia32_vpermil2pd((__v2df)(__m128d)(X), \ - (__v2df)(__m128d)(Y), \ - (__v2di)(__m128d)(C), (int)(I))) - -#define _mm256_permute2_pd(X, Y, C, I) \ - ((__m256d)__builtin_ia32_vpermil2pd256((__v4df)(__m256d)(X), \ - (__v4df)(__m256d)(Y), \ - (__v4di)(__m256d)(C), (int)(I))) - -#define _mm_permute2_ps(X, Y, C, I) \ - ((__m128)__builtin_ia32_vpermil2ps((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), \ - (__v4si)(__m128)(C), (int)(I))) - -#define _mm256_permute2_ps(X, Y, C, I) \ - ((__m256)__builtin_ia32_vpermil2ps256((__v8sf)(__m256)(X), \ - (__v8sf)(__m256)(Y), \ - (__v8si)(__m256)(C), (int)(I))) -#endif /* __OPTIMIZE__ */ - +#define _mm_roti_epi8(A, N) ((__m128i) __builtin_ia32_vprotbi ((__v16qi)(__m128i)(A), (int)(N))) +#define _mm_roti_epi16(A, N) ((__m128i) __builtin_ia32_vprotwi ((__v8hi)(__m128i)(A), (int)(N))) +#define _mm_roti_epi32(A, N) ((__m128i) __builtin_ia32_vprotdi ((__v4si)(__m128i)(A), (int)(N))) +#define _mm_roti_epi64(A, N) ((__m128i) __builtin_ia32_vprotqi ((__v2di)(__m128i)(A), (int)(N))) +#endif +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shl_epi8(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpshlb ((__v16qi)__A, (__v16qi)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shl_epi16(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpshlw ((__v8hi)__A,
(__v8hi)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shl_epi32(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpshld ((__v4si)__A, (__v4si)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shl_epi64(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpshlq ((__v2di)__A, (__v2di)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sha_epi8(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpshab ((__v16qi)__A, (__v16qi)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sha_epi16(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpshaw ((__v8hi)__A, (__v8hi)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sha_epi32(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpshad ((__v4si)__A, (__v4si)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sha_epi64(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpshaq ((__v2di)__A, (__v2di)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comlt_epu8(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomltub ((__v16qi)__A, (__v16qi)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comle_epu8(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomleub ((__v16qi)__A, (__v16qi)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comgt_epu8(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomgtub ((__v16qi)__A, (__v16qi)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comge_epu8(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomgeub ((__v16qi)__A, (__v16qi)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comeq_epu8(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomequb ((__v16qi)__A, (__v16qi)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comneq_epu8(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomnequb ((__v16qi)__A, (__v16qi)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comfalse_epu8(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomfalseub ((__v16qi)__A, (__v16qi)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comtrue_epu8(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomtrueub ((__v16qi)__A, (__v16qi)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comlt_epu16(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomltuw ((__v8hi)__A, (__v8hi)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comle_epu16(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomleuw ((__v8hi)__A, (__v8hi)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comgt_epu16(__m128i __A, __m128i __B) +{ 
+ return (__m128i) __builtin_ia32_vpcomgtuw ((__v8hi)__A, (__v8hi)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comge_epu16(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomgeuw ((__v8hi)__A, (__v8hi)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comeq_epu16(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomequw ((__v8hi)__A, (__v8hi)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comneq_epu16(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomnequw ((__v8hi)__A, (__v8hi)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comfalse_epu16(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomfalseuw ((__v8hi)__A, (__v8hi)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comtrue_epu16(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomtrueuw ((__v8hi)__A, (__v8hi)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comlt_epu32(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomltud ((__v4si)__A, (__v4si)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comle_epu32(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomleud ((__v4si)__A, (__v4si)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comgt_epu32(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomgtud ((__v4si)__A, (__v4si)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comge_epu32(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomgeud ((__v4si)__A, (__v4si)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comeq_epu32(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomequd ((__v4si)__A, (__v4si)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comneq_epu32(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomnequd ((__v4si)__A, (__v4si)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comfalse_epu32(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomfalseud ((__v4si)__A, (__v4si)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comtrue_epu32(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomtrueud ((__v4si)__A, (__v4si)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comlt_epu64(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomltuq ((__v2di)__A, (__v2di)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comle_epu64(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomleuq ((__v2di)__A, (__v2di)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comgt_epu64(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomgtuq ((__v2di)__A, (__v2di)__B); +} +extern __inline __m128i 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comge_epu64(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomgeuq ((__v2di)__A, (__v2di)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comeq_epu64(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomequq ((__v2di)__A, (__v2di)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comneq_epu64(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomnequq ((__v2di)__A, (__v2di)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comfalse_epu64(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomfalseuq ((__v2di)__A, (__v2di)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comtrue_epu64(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomtrueuq ((__v2di)__A, (__v2di)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comlt_epi8(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomltb ((__v16qi)__A, (__v16qi)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comle_epi8(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomleb ((__v16qi)__A, (__v16qi)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comgt_epi8(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomgtb ((__v16qi)__A, (__v16qi)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comge_epi8(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomgeb ((__v16qi)__A, (__v16qi)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comeq_epi8(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomeqb ((__v16qi)__A, (__v16qi)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comneq_epi8(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomneqb ((__v16qi)__A, (__v16qi)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comfalse_epi8(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomfalseb ((__v16qi)__A, (__v16qi)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comtrue_epi8(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomtrueb ((__v16qi)__A, (__v16qi)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comlt_epi16(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomltw ((__v8hi)__A, (__v8hi)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comle_epi16(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomlew ((__v8hi)__A, (__v8hi)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comgt_epi16(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomgtw ((__v8hi)__A, (__v8hi)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comge_epi16(__m128i __A, __m128i __B) +{ + return 
(__m128i) __builtin_ia32_vpcomgew ((__v8hi)__A, (__v8hi)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comeq_epi16(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomeqw ((__v8hi)__A, (__v8hi)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comneq_epi16(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomneqw ((__v8hi)__A, (__v8hi)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comfalse_epi16(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomfalsew ((__v8hi)__A, (__v8hi)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comtrue_epi16(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomtruew ((__v8hi)__A, (__v8hi)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comlt_epi32(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomltd ((__v4si)__A, (__v4si)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comle_epi32(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomled ((__v4si)__A, (__v4si)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comgt_epi32(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomgtd ((__v4si)__A, (__v4si)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comge_epi32(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomged ((__v4si)__A, (__v4si)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comeq_epi32(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomeqd ((__v4si)__A, (__v4si)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comneq_epi32(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomneqd ((__v4si)__A, (__v4si)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comfalse_epi32(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomfalsed ((__v4si)__A, (__v4si)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comtrue_epi32(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomtrued ((__v4si)__A, (__v4si)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comlt_epi64(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomltq ((__v2di)__A, (__v2di)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comle_epi64(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomleq ((__v2di)__A, (__v2di)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comgt_epi64(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomgtq ((__v2di)__A, (__v2di)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comge_epi64(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomgeq ((__v2di)__A, (__v2di)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, 
__always_inline__, __artificial__)) +_mm_comeq_epi64(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomeqq ((__v2di)__A, (__v2di)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comneq_epi64(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomneqq ((__v2di)__A, (__v2di)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comfalse_epi64(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomfalseq ((__v2di)__A, (__v2di)__B); +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comtrue_epi64(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomtrueq ((__v2di)__A, (__v2di)__B); +} +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_frcz_ps (__m128 __A) +{ + return (__m128) __builtin_ia32_vfrczps ((__v4sf)__A); +} +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_frcz_pd (__m128d __A) +{ + return (__m128d) __builtin_ia32_vfrczpd ((__v2df)__A); +} +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_frcz_ss (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_movss ((__v4sf)__A, + (__v4sf) + __builtin_ia32_vfrczss ((__v4sf)__B)); +} +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_frcz_sd (__m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_movsd ((__v2df)__A, + (__v2df) + __builtin_ia32_vfrczsd ((__v2df)__B)); +} +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_frcz_ps (__m256 __A) +{ + return (__m256) __builtin_ia32_vfrczps256 ((__v8sf)__A); +} +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_frcz_pd (__m256d __A) +{ + return (__m256d) __builtin_ia32_vfrczpd256 ((__v4df)__A); +} +#ifdef __OPTIMIZE__ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_permute2_pd (__m128d __X, __m128d __Y, __m128i __C, const int __I) +{ + return (__m128d) __builtin_ia32_vpermil2pd ((__v2df)__X, + (__v2df)__Y, + (__v2di)__C, + __I); +} +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permute2_pd (__m256d __X, __m256d __Y, __m256i __C, const int __I) +{ + return (__m256d) __builtin_ia32_vpermil2pd256 ((__v4df)__X, + (__v4df)__Y, + (__v4di)__C, + __I); +} +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_permute2_ps (__m128 __X, __m128 __Y, __m128i __C, const int __I) +{ + return (__m128) __builtin_ia32_vpermil2ps ((__v4sf)__X, + (__v4sf)__Y, + (__v4si)__C, + __I); +} +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permute2_ps (__m256 __X, __m256 __Y, __m256i __C, const int __I) +{ + return (__m256) __builtin_ia32_vpermil2ps256 ((__v8sf)__X, + (__v8sf)__Y, + (__v8si)__C, + __I); +} +#else +#define _mm_permute2_pd(X, Y, C, I) ((__m128d) __builtin_ia32_vpermil2pd ((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (__v2di)(__m128i)(C), (int)(I))) +#define _mm256_permute2_pd(X, Y, C, I) ((__m256d) __builtin_ia32_vpermil2pd256 ((__v4df)(__m256d)(X), (__v4df)(__m256d)(Y), (__v4di)(__m256i)(C), (int)(I))) +#define _mm_permute2_ps(X, Y, C, I) ((__m128) __builtin_ia32_vpermil2ps ((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), 
(__v4si)(__m128i)(C), (int)(I))) +#define _mm256_permute2_ps(X, Y, C, I) ((__m256) __builtin_ia32_vpermil2ps256 ((__v8sf)(__m256)(X), (__v8sf)(__m256)(Y), (__v8si)(__m256i)(C), (int)(I))) +#endif #ifdef __DISABLE_XOP__ #undef __DISABLE_XOP__ #pragma GCC pop_options -#endif /* __DISABLE_XOP__ */ - -#endif /* _XOPMMINTRIN_H_INCLUDED */ +#endif +#endif +#endif diff --git a/third_party/intel/xsavecintrin.internal.h b/third_party/intel/xsavecintrin.internal.h index 6daebde60..649beb1bf 100644 --- a/third_party/intel/xsavecintrin.internal.h +++ b/third_party/intel/xsavecintrin.internal.h @@ -1,29 +1,32 @@ -#if !defined _IMMINTRIN_H_INCLUDED -#error "Never use <xsavecintrin.h> directly; include <immintrin.h> instead." +/* clang-format off */ +#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0) +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use <xsavecintrin.h> directly; include <x86gprintrin.h> instead." #endif - #ifndef _XSAVECINTRIN_H_INCLUDED #define _XSAVECINTRIN_H_INCLUDED - #ifndef __XSAVEC__ #pragma GCC push_options #pragma GCC target("xsavec") #define __DISABLE_XSAVEC__ -#endif /* __XSAVEC__ */ - -__funline void _xsavec(void *__P, long long __M) { - __builtin_ia32_xsavec(__P, __M); +#endif +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_xsavec (void *__P, long long __M) +{ + __builtin_ia32_xsavec (__P, __M); } - #ifdef __x86_64__ -__funline void _xsavec64(void *__P, long long __M) { - __builtin_ia32_xsavec64(__P, __M); +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_xsavec64 (void *__P, long long __M) +{ + __builtin_ia32_xsavec64 (__P, __M); } #endif - #ifdef __DISABLE_XSAVEC__ #undef __DISABLE_XSAVEC__ #pragma GCC pop_options -#endif /* __DISABLE_XSAVEC__ */ - -#endif /* _XSAVECINTRIN_H_INCLUDED */ +#endif +#endif +#endif diff --git a/third_party/intel/xsaveintrin.internal.h b/third_party/intel/xsaveintrin.internal.h index 76070f620..fa72c42b2 100644 --- a/third_party/intel/xsaveintrin.internal.h +++ b/third_party/intel/xsaveintrin.internal.h @@ -1,45 +1,56 @@ -#if !defined _IMMINTRIN_H_INCLUDED -#error "Never use <xsaveintrin.h> directly; include <immintrin.h> instead." +/* clang-format off */ +#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0) +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use <xsaveintrin.h> directly; include <x86gprintrin.h> instead."
#endif - #ifndef _XSAVEINTRIN_H_INCLUDED #define _XSAVEINTRIN_H_INCLUDED - #ifndef __XSAVE__ #pragma GCC push_options #pragma GCC target("xsave") #define __DISABLE_XSAVE__ -#endif /* __XSAVE__ */ - -__funline void _xsave(void *__P, long long __M) { - __builtin_ia32_xsave(__P, __M); +#endif +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_xsave (void *__P, long long __M) +{ + __builtin_ia32_xsave (__P, __M); } - -__funline void _xrstor(void *__P, long long __M) { - __builtin_ia32_xrstor(__P, __M); +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_xrstor (void *__P, long long __M) +{ + __builtin_ia32_xrstor (__P, __M); } - -__funline void _xsetbv(unsigned int __A, long long __V) { - __builtin_ia32_xsetbv(__A, __V); +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_xsetbv (unsigned int __A, long long __V) +{ + __builtin_ia32_xsetbv (__A, __V); } - -__funline long long _xgetbv(unsigned int __A) { - return __builtin_ia32_xgetbv(__A); +extern __inline long long +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_xgetbv (unsigned int __A) +{ + return __builtin_ia32_xgetbv (__A); } - #ifdef __x86_64__ -__funline void _xsave64(void *__P, long long __M) { - __builtin_ia32_xsave64(__P, __M); +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_xsave64 (void *__P, long long __M) +{ + __builtin_ia32_xsave64 (__P, __M); } - -__funline void _xrstor64(void *__P, long long __M) { - __builtin_ia32_xrstor64(__P, __M); +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_xrstor64 (void *__P, long long __M) +{ + __builtin_ia32_xrstor64 (__P, __M); } #endif - #ifdef __DISABLE_XSAVE__ #undef __DISABLE_XSAVE__ #pragma GCC pop_options -#endif /* __DISABLE_XSAVE__ */ - -#endif /* _XSAVEINTRIN_H_INCLUDED */ +#endif +#endif +#endif diff --git a/third_party/intel/xsaveoptintrin.internal.h b/third_party/intel/xsaveoptintrin.internal.h index 45d39ddb4..a50f9ef3f 100644 --- a/third_party/intel/xsaveoptintrin.internal.h +++ b/third_party/intel/xsaveoptintrin.internal.h @@ -1,29 +1,32 @@ -#if !defined _IMMINTRIN_H_INCLUDED -#error "Never use <xsaveoptintrin.h> directly; include <immintrin.h> instead." +/* clang-format off */ +#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0) +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use <xsaveoptintrin.h> directly; include <x86gprintrin.h> instead."
#endif - #ifndef _XSAVEOPTINTRIN_H_INCLUDED #define _XSAVEOPTINTRIN_H_INCLUDED - #ifndef __XSAVEOPT__ #pragma GCC push_options #pragma GCC target("xsaveopt") #define __DISABLE_XSAVEOPT__ -#endif /* __XSAVEOPT__ */ - -__funline void _xsaveopt(void *__P, long long __M) { - __builtin_ia32_xsaveopt(__P, __M); +#endif +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_xsaveopt (void *__P, long long __M) +{ + __builtin_ia32_xsaveopt (__P, __M); } - #ifdef __x86_64__ -__funline void _xsaveopt64(void *__P, long long __M) { - __builtin_ia32_xsaveopt64(__P, __M); +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_xsaveopt64 (void *__P, long long __M) +{ + __builtin_ia32_xsaveopt64 (__P, __M); } #endif - #ifdef __DISABLE_XSAVEOPT__ #undef __DISABLE_XSAVEOPT__ #pragma GCC pop_options -#endif /* __DISABLE_XSAVEOPT__ */ - -#endif /* _XSAVEOPTINTRIN_H_INCLUDED */ +#endif +#endif +#endif diff --git a/third_party/intel/xsavesintrin.internal.h b/third_party/intel/xsavesintrin.internal.h index 7a1b1e769..4e2631ecb 100644 --- a/third_party/intel/xsavesintrin.internal.h +++ b/third_party/intel/xsavesintrin.internal.h @@ -1,37 +1,44 @@ -#if !defined _IMMINTRIN_H_INCLUDED -#error "Never use <xsavesintrin.h> directly; include <immintrin.h> instead." +/* clang-format off */ +#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0) +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use <xsavesintrin.h> directly; include <x86gprintrin.h> instead." #endif - #ifndef _XSAVESINTRIN_H_INCLUDED #define _XSAVESINTRIN_H_INCLUDED - #ifndef __XSAVES__ #pragma GCC push_options #pragma GCC target("xsaves") #define __DISABLE_XSAVES__ -#endif /* __XSAVES__ */ - -__funline void _xsaves(void *__P, long long __M) { - __builtin_ia32_xsaves(__P, __M); +#endif +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_xsaves (void *__P, long long __M) +{ + __builtin_ia32_xsaves (__P, __M); } - -__funline void _xrstors(void *__P, long long __M) { - __builtin_ia32_xrstors(__P, __M); +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_xrstors (void *__P, long long __M) +{ + __builtin_ia32_xrstors (__P, __M); } - #ifdef __x86_64__ -__funline void _xrstors64(void *__P, long long __M) { - __builtin_ia32_xrstors64(__P, __M); +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_xrstors64 (void *__P, long long __M) +{ + __builtin_ia32_xrstors64 (__P, __M); } - -__funline void _xsaves64(void *__P, long long __M) { - __builtin_ia32_xsaves64(__P, __M); +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_xsaves64 (void *__P, long long __M) +{ + __builtin_ia32_xsaves64 (__P, __M); } #endif - #ifdef __DISABLE_XSAVES__ #undef __DISABLE_XSAVES__ #pragma GCC pop_options -#endif /* __DISABLE_XSAVES__ */ - -#endif /* _XSAVESINTRIN_H_INCLUDED */ +#endif +#endif +#endif diff --git a/third_party/intel/xtestintrin.internal.h b/third_party/intel/xtestintrin.internal.h index bc58a51d4..4101998c7 100644 --- a/third_party/intel/xtestintrin.internal.h +++ b/third_party/intel/xtestintrin.internal.h @@ -1,23 +1,24 @@ -#ifndef _IMMINTRIN_H_INCLUDED -#error "Never use <xtestintrin.h> directly; include <immintrin.h> instead." +/* clang-format off */ +#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0) +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use <xtestintrin.h> directly; include <x86gprintrin.h> instead."
#endif - #ifndef _XTESTINTRIN_H_INCLUDED #define _XTESTINTRIN_H_INCLUDED - #ifndef __RTM__ #pragma GCC push_options #pragma GCC target("rtm") #define __DISABLE_RTM__ -#endif /* __RTM__ */ - -__funline int _xtest(void) { - return __builtin_ia32_xtest(); +#endif +extern __inline int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_xtest (void) +{ + return __builtin_ia32_xtest (); } - #ifdef __DISABLE_RTM__ #undef __DISABLE_RTM__ #pragma GCC pop_options -#endif /* __DISABLE_RTM__ */ - -#endif /* _XTESTINTRIN_H_INCLUDED */ +#endif +#endif +#endif diff --git a/third_party/nsync/compat.S b/third_party/nsync/compat.S index 35f8d07a3..795e39631 100644 --- a/third_party/nsync/compat.S +++ b/third_party/nsync/compat.S @@ -16,8 +16,13 @@ │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │ │ PERFORMANCE OF THIS SOFTWARE. │ ╚─────────────────────────────────────────────────────────────────────────────*/ +#include "libc/calls/struct/timespec.h" #include "libc/macros.internal.h" +#ifdef __aarch64__ +#define jmp b +#endif + nsync_time_now: jmp timespec_real .endfn nsync_time_now,globl diff --git a/third_party/nsync/nsync.mk b/third_party/nsync/nsync.mk index de04e9d85..c36d6e773 100644 --- a/third_party/nsync/nsync.mk +++ b/third_party/nsync/nsync.mk @@ -54,6 +54,10 @@ $(THIRD_PARTY_NSYNC_A_OBJS): private \ -ffunction-sections \ -fdata-sections +# these assembly files are safe to build on aarch64 +o/$(MODE)/third_party/nsync/compat.o: third_party/nsync/compat.S + @$(COMPILE) -AOBJECTIFY.S $(OBJECTIFY.S) $(OUTPUT_OPTION) -c $< + THIRD_PARTY_NSYNC_LIBS = $(foreach x,$(THIRD_PARTY_NSYNC_ARTIFACTS),$($(x))) THIRD_PARTY_NSYNC_SRCS = $(foreach x,$(THIRD_PARTY_NSYNC_ARTIFACTS),$($(x)_SRCS)) THIRD_PARTY_NSYNC_CHECKS = $(foreach x,$(THIRD_PARTY_NSYNC_ARTIFACTS),$($(x)_CHECKS)) diff --git a/tool/scripts/cosmoc++ b/tool/scripts/cosmoc++ index 095155d1f..20f8e5ebf 100755 --- a/tool/scripts/cosmoc++ +++ b/tool/scripts/cosmoc++ @@ -21,7 +21,7 @@ if [ "$1" = "--version" ]; then cat <<'EOF' -x86_64-unknown-cosmo-g++ (GCC) 9.2.0 +x86_64-unknown-cosmo-g++ (GCC) 11.2.0 Copyright (C) 2019 Free Software Foundation, Inc. This is free software; see the source for copying conditions. There is NO warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
@@ -30,7 +30,7 @@ exit 0 fi CXX="/opt/cosmo/o/third_party/gcc/bin/x86_64-linux-musl-g++" -CCFLAGS="-O2 -fdata-sections -ffunction-sections -fno-pie -pg -mnop-mcount -mno-tls-direct-seg-refs" +CCFLAGS="-g -O2 -fdata-sections -ffunction-sections -fno-pie -pg -mnop-mcount -mno-tls-direct-seg-refs -fportcosmo -include /opt/cosmo/build/portcosmo.h" CXXFLAGS="-fno-exceptions -fuse-cxa-atexit -fno-threadsafe-statics" CPPFLAGS="-DNDEBUG -nostdinc -iquote /opt/cosmo -isystem /opt/cosmos/include -isystem /opt/cosmo/libc/isystem -include libc/integral/normalize.inc" LDFLAGS="-static -no-pie -nostdlib -fuse-ld=bfd -Wl,-melf_x86_64 -Wl,--gc-sections -L/opt/cosmos/lib -Wl,-T,/opt/cosmo/o/ape/public/ape.lds /opt/cosmo/o/ape/ape-no-modify-self.o /opt/cosmo/o/libc/crt/crt.o" @@ -86,11 +86,11 @@ for x; do done if [ "$HAS_E" = "1" ]; then - set -- $CPPFLAGS "$@" + set -- $CCFLAGS $CPPFLAGS "$@" elif [ "$HAS_C" = "1" ]; then set -- $CCFLAGS $CXXFLAGS $CPPFLAGS "$@" -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer else - set -- $LDFLAGS $CXXFLAGS $CPPFLAGS "$@" $LDLIBS -Wl,-z,common-page-size=4096 -Wl,-z,max-page-size=4096 + set -- $LDFLAGS $CXXFLAGS $CPPFLAGS "$@" $LDLIBS -Wl,-z,common-page-size=65536 -Wl,-z,max-page-size=65536 fi set -- "$CXX" "$@" diff --git a/tool/scripts/cosmocc b/tool/scripts/cosmocc index 42a7d3f21..dc1809ee7 100755 --- a/tool/scripts/cosmocc +++ b/tool/scripts/cosmocc @@ -24,7 +24,7 @@ COSMOS=/opt/cosmos if [ "$1" = "--version" ]; then cat <<'EOF' -x86_64-unknown-cosmo-gcc (GCC) 9.2.0 +x86_64-unknown-cosmo-gcc (GCC) 11.2.0 Copyright (C) 2019 Free Software Foundation, Inc. This is free software; see the source for copying conditions. There is NO warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. @@ -33,7 +33,7 @@ exit 0 fi CC="/opt/cosmo/o/third_party/gcc/bin/x86_64-linux-musl-gcc" -CFLAGS="-g -O2 -fdata-sections -ffunction-sections -fno-pie -pg -mnop-mcount -mno-tls-direct-seg-refs" +CFLAGS="-g -O2 -fdata-sections -ffunction-sections -fno-pie -pg -mnop-mcount -mno-tls-direct-seg-refs -fportcosmo -include /opt/cosmo/build/portcosmo.h" CPPFLAGS="-DNDEBUG -nostdinc -iquote /opt/cosmo -isystem $COSMOS/include -isystem /opt/cosmo/libc/isystem -include libc/integral/normalize.inc" LDFLAGS="-static -no-pie -nostdlib -fuse-ld=bfd -Wl,-melf_x86_64 -Wl,--gc-sections -Wl,-z,max-page-size=0x1000 -L$COSMOS/lib -Wl,-T,/opt/cosmo/o/ape/public/ape.lds /opt/cosmo/o/ape/ape-no-modify-self.o /opt/cosmo/o/libc/crt/crt.o" LDLIBS="/opt/cosmo/o/cosmopolitan.a" @@ -92,7 +92,7 @@ if [ "$HAS_E" = "1" ]; then elif [ "$HAS_C" = "1" ]; then set -- $CFLAGS $CPPFLAGS "$@" -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer else - set -- $LDFLAGS $CFLAGS $CPPFLAGS "$@" $LDLIBS -Wl,-z,common-page-size=4096 -Wl,-z,max-page-size=4096 + set -- $LDFLAGS $CFLAGS $CPPFLAGS "$@" $LDLIBS -Wl,-z,common-page-size=65536 -Wl,-z,max-page-size=65536 fi set -- "$CC" "$@"
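
The intrinsic headers above all trade Cosmopolitan's old __funline shorthand for GCC's stock spelling: extern __inline plus __attribute__((__gnu_inline__, __always_inline__, __artificial__)). A minimal sketch of why that idiom is safe to ship in a header follows; the function name add_twice is hypothetical and not part of this patch. With gnu_inline semantics, an extern inline definition is consumed only for inlining and never emits an out-of-line symbol, so arbitrarily many translation units can include the header without duplicate-definition link errors, and always_inline guarantees the body is inlined even at -O0.

/* sketch of the gnu_inline header idiom; add_twice is illustrative only */
#include <stdio.h>

extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
add_twice(int x)
{
  return x + x;  /* definition used only for inlining; no symbol emitted */
}

int main(void)
{
  printf("%d\n", add_twice(21));  /* prints 42; the call is inlined */
  return 0;
}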
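For the XOP rewrite in particular, the observable API is unchanged: _mm_roti_epi32 and friends keep their names, and the non-__OPTIMIZE__ macro forms still pass the rotate count as a compile-time constant to __builtin_ia32_vprotdi. A hypothetical caller is sketched below, assuming <x86intrin.h> remains the entry point for the XOP header and using a target attribute since XOP is an AMD-only extension that the translation unit may not enable globally.

/* hypothetical usage sketch; requires an XOP-capable AMD CPU at runtime */
#include <x86intrin.h>

__attribute__((__target__("xop")))
__m128i rotl5_lanes(__m128i v)
{
  /* rotate each 32-bit lane left by the immediate count 5 */
  return _mm_roti_epi32(v, 5);
}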