cosmopolitan/tool/build/lib/sse.c
2022-08-18 17:46:34 -07:00

1575 lines
44 KiB
C

/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
│vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi│
╞══════════════════════════════════════════════════════════════════════════════╡
│ Copyright 2020 Justine Alexandra Roberts Tunney │
│ │
│ Permission to use, copy, modify, and/or distribute this software for │
│ any purpose with or without fee is hereby granted, provided that the │
│ above copyright notice and this permission notice appear in all copies. │
│ │
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
│ PERFORMANCE OF THIS SOFTWARE. │
╚─────────────────────────────────────────────────────────────────────────────*/
#include "libc/macros.internal.h"
#include "libc/str/str.h"
#include "tool/build/lib/endian.h"
#include "tool/build/lib/machine.h"
#include "tool/build/lib/modrm.h"
#include "tool/build/lib/throw.h"
static void MmxPor(uint8_t x[8], const uint8_t y[8]) {
unsigned i;
for (i = 0; i < 8; ++i) {
x[i] |= y[i];
}
}
static void MmxPxor(uint8_t x[8], const uint8_t y[8]) {
unsigned i;
for (i = 0; i < 8; ++i) {
x[i] ^= y[i];
}
}
static void MmxPsubb(uint8_t x[8], const uint8_t y[8]) {
unsigned i;
for (i = 0; i < 8; ++i) {
x[i] -= y[i];
}
}
static void MmxPaddb(uint8_t x[8], const uint8_t y[8]) {
unsigned i;
for (i = 0; i < 8; ++i) {
x[i] += y[i];
}
}
static void MmxPand(uint8_t x[8], const uint8_t y[8]) {
unsigned i;
for (i = 0; i < 8; ++i) {
x[i] &= y[i];
}
}
static void MmxPandn(uint8_t x[8], const uint8_t y[8]) {
unsigned i;
for (i = 0; i < 8; ++i) {
x[i] = ~x[i] & y[i];
}
}
static void MmxPavgb(uint8_t x[8], const uint8_t y[8]) {
unsigned i;
for (i = 0; i < 8; ++i) {
x[i] = (x[i] + y[i] + 1) >> 1;
}
}
static void MmxPabsb(uint8_t x[8], const uint8_t y[8]) {
unsigned i;
for (i = 0; i < 8; ++i) {
x[i] = ABS((int8_t)y[i]);
}
}
static void MmxPminub(uint8_t x[8], const uint8_t y[8]) {
unsigned i;
for (i = 0; i < 8; ++i) {
x[i] = MIN(x[i], y[i]);
}
}
static void MmxPmaxub(uint8_t x[8], const uint8_t y[8]) {
unsigned i;
for (i = 0; i < 8; ++i) {
x[i] = MAX(x[i], y[i]);
}
}
static void MmxPaddusb(uint8_t x[8], const uint8_t y[8]) {
unsigned i;
for (i = 0; i < 8; ++i) {
x[i] = MIN(255, x[i] + y[i]);
}
}
static void MmxPsubusb(uint8_t x[8], const uint8_t y[8]) {
unsigned i;
for (i = 0; i < 8; ++i) {
x[i] = MIN(255, MAX(0, x[i] - y[i]));
}
}
static void MmxPcmpeqb(uint8_t x[8], const uint8_t y[8]) {
unsigned i;
for (i = 0; i < 8; ++i) {
x[i] = -(x[i] == y[i]);
}
}
static void MmxPcmpgtb(uint8_t x[8], const uint8_t y[8]) {
unsigned i;
for (i = 0; i < 8; ++i) {
x[i] = -((int8_t)x[i] > (int8_t)y[i]);
}
}
static void MmxPsubsb(uint8_t x[8], const uint8_t y[8]) {
unsigned i;
for (i = 0; i < 8; ++i) {
x[i] = MAX(-128, MIN(127, (int8_t)x[i] - (int8_t)y[i]));
}
}
static void MmxPaddsb(uint8_t x[8], const uint8_t y[8]) {
unsigned i;
for (i = 0; i < 8; ++i) {
x[i] = MAX(-128, MIN(127, (int8_t)x[i] + (int8_t)y[i]));
}
}
static void MmxPmulhrsw(uint8_t x[8], const uint8_t y[8]) {
unsigned i;
int16_t a, b;
for (i = 0; i < 4; ++i) {
a = Read16(x + i * 2);
b = Read16(y + i * 2);
Write16(x + i * 2, (((a * b) >> 14) + 1) >> 1);
}
}
static void MmxPmaddubsw(uint8_t x[8], const uint8_t y[8]) {
unsigned i;
for (i = 0; i < 4; ++i) {
Write16(x + i * 2,
MAX(-32768, MIN(32767, (x[i * 2 + 0] * (int8_t)y[i * 2 + 0] +
x[i * 2 + 1] * (int8_t)y[i * 2 + 1]))));
}
}
static void MmxPsraw(uint8_t x[8], unsigned k) {
unsigned i;
if (k > 15) k = 15;
for (i = 0; i < 4; ++i) {
Write16(x + i * 2, (int16_t)Read16(x + i * 2) >> k);
}
}
static void MmxPsrad(uint8_t x[8], unsigned k) {
unsigned i;
if (k > 31) k = 31;
for (i = 0; i < 2; ++i) {
Write32(x + i * 4, (int32_t)Read32(x + i * 4) >> k);
}
}
static void MmxPsrlw(uint8_t x[8], unsigned k) {
unsigned i;
if (k < 16) {
for (i = 0; i < 4; ++i) {
Write16(x + i * 2, Read16(x + i * 2) >> k);
}
} else {
memset(x, 0, 8);
}
}
static void MmxPsllw(uint8_t x[8], unsigned k) {
unsigned i;
if (k <= 15) {
for (i = 0; i < 4; ++i) {
Write16(x + i * 2, Read16(x + i * 2) << k);
}
} else {
memset(x, 0, 8);
}
}
static void MmxPsrld(uint8_t x[8], unsigned k) {
unsigned i;
if (k <= 31) {
for (i = 0; i < 2; ++i) {
Write32(x + i * 4, Read32(x + i * 4) >> k);
}
} else {
memset(x, 0, 8);
}
}
static void MmxPslld(uint8_t x[8], unsigned k) {
unsigned i;
if (k <= 31) {
for (i = 0; i < 2; ++i) {
Write32(x + i * 4, Read32(x + i * 4) << k);
}
} else {
memset(x, 0, 8);
}
}
static void MmxPsrlq(uint8_t x[8], unsigned k) {
if (k <= 63) {
Write64(x, Read64(x) >> k);
} else {
memset(x, 0, 8);
}
}
static void MmxPsllq(uint8_t x[8], unsigned k) {
if (k <= 63) {
Write64(x, Read64(x) << k);
} else {
memset(x, 0, 8);
}
}
static void MmxPslldq(uint8_t x[8], unsigned k) {
unsigned i;
uint8_t t[8];
if (k > 8) k = 8;
for (i = 0; i < k; ++i) t[i] = 0;
for (i = 0; i < 8 - k; ++i) t[k + i] = x[i];
memcpy(x, t, 8);
}
static void MmxPsrldq(uint8_t x[8], unsigned k) {
uint8_t t[8];
if (k > 8) k = 8;
memcpy(t, x + k, 8 - k);
memset(t + (8 - k), 0, k);
memcpy(x, t, 8);
}
static void MmxPalignr(uint8_t x[8], const uint8_t y[8], unsigned k) {
uint8_t t[24];
memcpy(t, y, 8);
memcpy(t + 8, x, 8);
memset(t + 16, 0, 8);
memcpy(x, t + MIN(k, 16), 8);
}
static void MmxPsubw(uint8_t x[8], const uint8_t y[8]) {
unsigned i;
for (i = 0; i < 4; ++i) {
Write16(x + i * 2, Read16(x + i * 2) - Read16(y + i * 2));
}
}
static void MmxPaddw(uint8_t x[8], const uint8_t y[8]) {
unsigned i;
for (i = 0; i < 4; ++i) {
Write16(x + i * 2, Read16(x + i * 2) + Read16(y + i * 2));
}
}
static void MmxPsubd(uint8_t x[8], const uint8_t y[8]) {
unsigned i;
for (i = 0; i < 2; ++i) {
Write32(x + i * 4, Read32(x + i * 4) - Read32(y + i * 4));
}
}
static void MmxPaddd(uint8_t x[8], const uint8_t y[8]) {
unsigned i;
for (i = 0; i < 2; ++i) {
Write32(x + i * 4, Read32(x + i * 4) + Read32(y + i * 4));
}
}
static void MmxPaddq(uint8_t x[8], const uint8_t y[8]) {
Write64(x, Read64(x) + Read64(y));
}
static void MmxPsubq(uint8_t x[8], const uint8_t y[8]) {
Write64(x, Read64(x) - Read64(y));
}
static void MmxPaddsw(uint8_t x[8], const uint8_t y[8]) {
unsigned i;
for (i = 0; i < 4; ++i) {
Write16(x + i * 2, MAX(-32768, MIN(32767, ((int16_t)Read16(x + i * 2) +
(int16_t)Read16(y + i * 2)))));
}
}
static void MmxPsubsw(uint8_t x[8], const uint8_t y[8]) {
unsigned i;
for (i = 0; i < 4; ++i) {
Write16(x + i * 2, MAX(-32768, MIN(32767, ((int16_t)Read16(x + i * 2) -
(int16_t)Read16(y + i * 2)))));
}
}
static void MmxPaddusw(uint8_t x[8], const uint8_t y[8]) {
unsigned i;
for (i = 0; i < 4; ++i) {
Write16(x + i * 2, MIN(65535, Read16(x + i * 2) + Read16(y + i * 2)));
}
}
static void MmxPsubusw(uint8_t x[8], const uint8_t y[8]) {
unsigned i;
for (i = 0; i < 4; ++i) {
Write16(x + i * 2,
MIN(65535, MAX(0, Read16(x + i * 2) - Read16(y + i * 2))));
}
}
static void MmxPminsw(uint8_t x[8], const uint8_t y[8]) {
unsigned i;
for (i = 0; i < 4; ++i) {
Write16(x + i * 2,
MIN((int16_t)Read16(x + i * 2), (int16_t)Read16(y + i * 2)));
}
}
static void MmxPmaxsw(uint8_t x[8], const uint8_t y[8]) {
unsigned i;
for (i = 0; i < 4; ++i) {
Write16(x + i * 2,
MAX((int16_t)Read16(x + i * 2), (int16_t)Read16(y + i * 2)));
}
}
static void MmxPackuswb(uint8_t x[8], const uint8_t y[8]) {
unsigned i;
uint8_t t[8];
for (i = 0; i < 4; ++i) {
t[i + 0] = MIN(255, MAX(0, (int16_t)Read16(x + i * 2)));
}
for (i = 0; i < 4; ++i) {
t[i + 4] = MIN(255, MAX(0, (int16_t)Read16(y + i * 2)));
}
memcpy(x, t, 8);
}
static void MmxPacksswb(uint8_t x[8], const uint8_t y[8]) {
unsigned i;
uint8_t t[8];
for (i = 0; i < 4; ++i) {
t[i + 0] = MAX(-128, MIN(127, (int16_t)Read16(x + i * 2)));
}
for (i = 0; i < 4; ++i) {
t[i + 4] = MAX(-128, MIN(127, (int16_t)Read16(y + i * 2)));
}
memcpy(x, t, 8);
}
static void MmxPackssdw(uint8_t x[8], const uint8_t y[8]) {
unsigned i;
uint8_t t[8];
for (i = 0; i < 2; ++i) {
Write16(t + i * 2 + 0, MAX(-32768, MIN(32767, (int32_t)Read32(x + i * 4))));
}
for (i = 0; i < 2; ++i) {
Write16(t + i * 2 + 4, MAX(-32768, MIN(32767, (int32_t)Read32(y + i * 4))));
}
memcpy(x, t, 8);
}
static void MmxPcmpgtw(uint8_t x[8], const uint8_t y[8]) {
unsigned i;
for (i = 0; i < 4; ++i) {
Write16(x + i * 2,
-((int16_t)Read16(x + i * 2) > (int16_t)Read16(y + i * 2)));
}
}
static void MmxPcmpeqw(uint8_t x[8], const uint8_t y[8]) {
unsigned i;
for (i = 0; i < 4; ++i) {
Write16(x + i * 2, -(Read16(x + i * 2) == Read16(y + i * 2)));
}
}
static void MmxPcmpgtd(uint8_t x[8], const uint8_t y[8]) {
unsigned i;
for (i = 0; i < 2; ++i) {
Write32(x + i * 4,
-((int32_t)Read32(x + i * 4) > (int32_t)Read32(y + i * 4)));
}
}
static void MmxPcmpeqd(uint8_t x[8], const uint8_t y[8]) {
unsigned i;
for (i = 0; i < 2; ++i) {
Write32(x + i * 4, -(Read32(x + i * 4) == Read32(y + i * 4)));
}
}
static void MmxPsrawv(uint8_t x[8], const uint8_t y[8]) {
unsigned i;
uint64_t k;
k = Read64(y);
if (k > 15) k = 15;
for (i = 0; i < 4; ++i) {
Write16(x + i * 2, (int16_t)Read16(x + i * 2) >> k);
}
}
static void MmxPsradv(uint8_t x[8], const uint8_t y[8]) {
unsigned i;
uint64_t k;
k = Read64(y);
if (k > 31) k = 31;
for (i = 0; i < 2; ++i) {
Write32(x + i * 4, (int32_t)Read32(x + i * 4) >> k);
}
}
static void MmxPsrlwv(uint8_t x[8], const uint8_t y[8]) {
unsigned i;
uint64_t k;
k = Read64(y);
if (k < 16) {
for (i = 0; i < 4; ++i) {
Write16(x + i * 2, Read16(x + i * 2) >> k);
}
} else {
memset(x, 0, 8);
}
}
static void MmxPsllwv(uint8_t x[8], const uint8_t y[8]) {
unsigned i;
uint64_t k;
k = Read64(y);
if (k < 16) {
for (i = 0; i < 4; ++i) {
Write16(x + i * 2, Read16(x + i * 2) << k);
}
} else {
memset(x, 0, 8);
}
}
static void MmxPsrldv(uint8_t x[8], const uint8_t y[8]) {
unsigned i;
uint64_t k;
k = Read64(y);
if (k < 32) {
for (i = 0; i < 2; ++i) {
Write32(x + i * 4, Read32(x + i * 4) >> k);
}
} else {
memset(x, 0, 8);
}
}
static void MmxPslldv(uint8_t x[8], const uint8_t y[8]) {
unsigned i;
uint64_t k;
k = Read64(y);
if (k < 32) {
for (i = 0; i < 2; ++i) {
Write32(x + i * 4, Read32(x + i * 4) << k);
}
} else {
memset(x, 0, 8);
}
}
static void MmxPsrlqv(uint8_t x[8], const uint8_t y[8]) {
uint64_t k;
k = Read64(y);
if (k < 64) {
Write64(x, Read64(x) >> k);
} else {
memset(x, 0, 8);
}
}
static void MmxPsllqv(uint8_t x[8], const uint8_t y[8]) {
uint64_t k;
k = Read64(y);
if (k < 64) {
Write64(x, Read64(x) << k);
} else {
memset(x, 0, 8);
}
}
static void MmxPavgw(uint8_t x[8], const uint8_t y[8]) {
unsigned i;
for (i = 0; i < 4; ++i) {
Write16(x + i * 2, (Read16(x + i * 2) + Read16(y + i * 2) + 1) >> 1);
}
}
static void MmxPsadbw(uint8_t x[8], const uint8_t y[8]) {
unsigned i, s, t;
for (s = i = 0; i < 4; ++i) s += ABS(x[i] - y[i]);
for (t = 0; i < 8; ++i) t += ABS(x[i] - y[i]);
Write32(x + 0, s);
Write32(x + 4, t);
}
static void MmxPmaddwd(uint8_t x[8], const uint8_t y[8]) {
unsigned i;
for (i = 0; i < 2; ++i) {
Write32(x + i * 4,
((int16_t)Read16(x + i * 4 + 0) * (int16_t)Read16(y + i * 4 + 0) +
(int16_t)Read16(x + i * 4 + 2) * (int16_t)Read16(y + i * 4 + 2)));
}
}
static void MmxPmulhuw(uint8_t x[8], const uint8_t y[8]) {
uint32_t v;
unsigned i;
for (i = 0; i < 4; ++i) {
v = Read16(x + i * 2);
v *= Read16(y + i * 2);
v >>= 16;
Write16(x + i * 2, v);
}
}
static void MmxPmulhw(uint8_t x[8], const uint8_t y[8]) {
unsigned i;
for (i = 0; i < 4; ++i) {
Write16(x + i * 2,
((int16_t)Read16(x + i * 2) * (int16_t)Read16(y + i * 2)) >> 16);
}
}
static void MmxPmuludq(uint8_t x[8], const uint8_t y[8]) {
Write64(x, (uint64_t)Read32(x) * Read32(y));
}
static void MmxPmullw(uint8_t x[8], const uint8_t y[8]) {
unsigned i;
for (i = 0; i < 4; ++i) {
Write16(x + i * 2, (int16_t)Read16(x + i * 2) * (int16_t)Read16(y + i * 2));
}
}
static void MmxPmulld(uint8_t x[8], const uint8_t y[8]) {
unsigned i;
for (i = 0; i < 2; ++i) {
Write32(x + i * 4, Read32(x + i * 4) * Read32(y + i * 4));
}
}
static void MmxPshufb(uint8_t x[8], const uint8_t y[8]) {
unsigned i;
uint8_t t[8];
for (i = 0; i < 8; ++i) {
t[i] = (y[i] & 128) ? 0 : x[y[i] & 7];
}
memcpy(x, t, 8);
}
static void MmxPsignb(uint8_t x[8], const uint8_t y[8]) {
int v;
unsigned i;
for (i = 0; i < 8; ++i) {
v = (int8_t)y[i];
if (!v) {
x[i] = 0;
} else if (v < 0) {
x[i] = -(int8_t)x[i];
}
}
}
static void MmxPsignw(uint8_t x[8], const uint8_t y[8]) {
int v;
unsigned i;
for (i = 0; i < 4; ++i) {
v = (int16_t)Read16(y + i * 2);
if (!v) {
Write16(x + i * 2, 0);
} else if (v < 0) {
Write16(x + i * 2, -(int16_t)Read16(x + i * 2));
}
}
}
static void MmxPsignd(uint8_t x[8], const uint8_t y[8]) {
int32_t v;
unsigned i;
for (i = 0; i < 2; ++i) {
v = Read32(y + i * 4);
if (!v) {
Write32(x + i * 4, 0);
} else if (v < 0) {
Write32(x + i * 4, -Read32(x + i * 4));
}
}
}
static void MmxPabsw(uint8_t x[8], const uint8_t y[8]) {
unsigned i;
for (i = 0; i < 4; ++i) {
Write16(x + i * 2, ABS((int16_t)Read16(y + i * 2)));
}
}
static void MmxPabsd(uint8_t x[8], const uint8_t y[8]) {
int32_t v;
unsigned i;
for (i = 0; i < 2; ++i) {
v = Read32(y + i * 4);
Write32(x + i * 4, v >= 0 ? v : -(uint32_t)v);
}
}
static void MmxPhaddw(uint8_t x[8], const uint8_t y[8]) {
uint8_t t[8];
Write16(t + 0 * 2, Read16(x + 0 * 2) + Read16(x + 1 * 2));
Write16(t + 1 * 2, Read16(x + 2 * 2) + Read16(x + 3 * 2));
Write16(t + 2 * 2, Read16(y + 0 * 2) + Read16(y + 1 * 2));
Write16(t + 3 * 2, Read16(y + 2 * 2) + Read16(y + 3 * 2));
memcpy(x, t, 8);
}
static void MmxPhsubw(uint8_t x[8], const uint8_t y[8]) {
uint8_t t[8];
Write16(t + 0 * 2, Read16(x + 0 * 2) - Read16(x + 1 * 2));
Write16(t + 1 * 2, Read16(x + 2 * 2) - Read16(x + 3 * 2));
Write16(t + 2 * 2, Read16(y + 0 * 2) - Read16(y + 1 * 2));
Write16(t + 3 * 2, Read16(y + 2 * 2) - Read16(y + 3 * 2));
memcpy(x, t, 8);
}
static void MmxPhaddd(uint8_t x[8], const uint8_t y[8]) {
uint8_t t[8];
Write32(t + 0 * 4, Read32(x + 0 * 4) + Read32(x + 1 * 4));
Write32(t + 1 * 4, Read32(y + 0 * 4) + Read32(y + 1 * 4));
memcpy(x, t, 8);
}
static void MmxPhsubd(uint8_t x[8], const uint8_t y[8]) {
uint8_t t[8];
Write32(t + 0 * 4, Read32(x + 0 * 4) - Read32(x + 1 * 4));
Write32(t + 1 * 4, Read32(y + 0 * 4) - Read32(y + 1 * 4));
memcpy(x, t, 8);
}
static void MmxPhaddsw(uint8_t x[8], const uint8_t y[8]) {
uint8_t t[8];
Write16(t + 0 * 2, MAX(-32768, MIN(32767, ((int16_t)Read16(x + 0 * 2) +
(int16_t)Read16(x + 1 * 2)))));
Write16(t + 1 * 2, MAX(-32768, MIN(32767, ((int16_t)Read16(x + 2 * 2) +
(int16_t)Read16(x + 3 * 2)))));
Write16(t + 2 * 2, MAX(-32768, MIN(32767, ((int16_t)Read16(y + 0 * 2) +
(int16_t)Read16(y + 1 * 2)))));
Write16(t + 3 * 2, MAX(-32768, MIN(32767, ((int16_t)Read16(y + 2 * 2) +
(int16_t)Read16(y + 3 * 2)))));
memcpy(x, t, 8);
}
static void MmxPhsubsw(uint8_t x[8], const uint8_t y[8]) {
uint8_t t[8];
Write16(t + 0 * 2, MAX(-32768, MIN(32767, ((int16_t)Read16(x + 0 * 2) -
(int16_t)Read16(x + 1 * 2)))));
Write16(t + 1 * 2, MAX(-32768, MIN(32767, ((int16_t)Read16(x + 2 * 2) -
(int16_t)Read16(x + 3 * 2)))));
Write16(t + 2 * 2, MAX(-32768, MIN(32767, ((int16_t)Read16(y + 0 * 2) -
(int16_t)Read16(x + 1 * 2)))));
Write16(t + 3 * 2, MAX(-32768, MIN(32767, ((int16_t)Read16(y + 2 * 2) -
(int16_t)Read16(y + 3 * 2)))));
memcpy(x, t, 8);
}
static void MmxPunpcklbw(uint8_t x[8], const uint8_t y[8]) {
x[7] = y[3];
x[6] = x[3];
x[5] = y[2];
x[4] = x[2];
x[3] = y[1];
x[2] = x[1];
x[1] = y[0];
x[0] = x[0];
}
static void MmxPunpckhbw(uint8_t x[8], const uint8_t y[8]) {
x[0] = x[4];
x[1] = y[4];
x[2] = x[5];
x[3] = y[5];
x[4] = x[6];
x[5] = y[6];
x[6] = x[7];
x[7] = y[7];
}
static void MmxPunpcklwd(uint8_t x[8], const uint8_t y[8]) {
x[6] = y[2];
x[7] = y[3];
x[4] = x[2];
x[5] = x[3];
x[2] = y[0];
x[3] = y[1];
x[0] = x[0];
x[1] = x[1];
}
static void MmxPunpckldq(uint8_t x[8], const uint8_t y[8]) {
x[4] = y[0];
x[5] = y[1];
x[6] = y[2];
x[7] = y[3];
x[0] = x[0];
x[1] = x[1];
x[2] = x[2];
x[3] = x[3];
}
static void MmxPunpckhwd(uint8_t x[8], const uint8_t y[8]) {
x[0] = x[4];
x[1] = x[5];
x[2] = y[4];
x[3] = y[5];
x[4] = x[6];
x[5] = x[7];
x[6] = y[6];
x[7] = y[7];
}
static void MmxPunpckhdq(uint8_t x[8], const uint8_t y[8]) {
x[0] = x[4];
x[1] = x[5];
x[2] = x[6];
x[3] = x[7];
x[4] = y[4];
x[5] = y[5];
x[6] = y[6];
x[7] = y[7];
}
static void MmxPunpcklqdq(uint8_t x[8], const uint8_t y[8]) {
}
static void MmxPunpckhqdq(uint8_t x[8], const uint8_t y[8]) {
}
static void SsePsubb(uint8_t x[16], const uint8_t y[16]) {
unsigned i;
for (i = 0; i < 16; ++i) {
x[i] -= y[i];
}
}
static void SsePaddb(uint8_t x[16], const uint8_t y[16]) {
unsigned i;
for (i = 0; i < 16; ++i) {
x[i] += y[i];
}
}
static void SsePor(uint8_t x[16], const uint8_t y[16]) {
unsigned i;
for (i = 0; i < 16; ++i) {
x[i] |= y[i];
}
}
static void SsePxor(uint8_t x[16], const uint8_t y[16]) {
unsigned i;
for (i = 0; i < 16; ++i) {
x[i] ^= y[i];
}
}
static void SsePand(uint8_t x[16], const uint8_t y[16]) {
unsigned i;
for (i = 0; i < 16; ++i) {
x[i] &= y[i];
}
}
static void SsePandn(uint8_t x[16], const uint8_t y[16]) {
unsigned i;
for (i = 0; i < 16; ++i) {
x[i] = ~x[i] & y[i];
}
}
static void SsePcmpeqb(uint8_t x[16], const uint8_t y[16]) {
unsigned i;
for (i = 0; i < 16; ++i) {
x[i] = -(x[i] == y[i]);
}
}
static void SsePcmpgtb(uint8_t x[16], const uint8_t y[16]) {
unsigned i;
for (i = 0; i < 16; ++i) {
x[i] = -((int8_t)x[i] > (int8_t)y[i]);
}
}
static void SsePavgb(uint8_t x[16], const uint8_t y[16]) {
unsigned i;
for (i = 0; i < 16; ++i) {
x[i] = (x[i] + y[i] + 1) >> 1;
}
}
static void SsePabsb(uint8_t x[16], const uint8_t y[16]) {
unsigned i;
for (i = 0; i < 16; ++i) {
x[i] = ABS((int8_t)y[i]);
}
}
static void SsePminub(uint8_t x[16], const uint8_t y[16]) {
unsigned i;
for (i = 0; i < 16; ++i) {
x[i] = MIN(x[i], y[i]);
}
}
static void SsePmaxub(uint8_t x[16], const uint8_t y[16]) {
unsigned i;
for (i = 0; i < 16; ++i) {
x[i] = MAX(x[i], y[i]);
}
}
static void SsePslldq(uint8_t x[16], unsigned k) {
unsigned i;
uint8_t t[16];
if (k > 16) k = 16;
for (i = 0; i < k; ++i) t[i] = 0;
for (i = 0; i < 16 - k; ++i) t[k + i] = x[i];
memcpy(x, t, 16);
}
static void SsePsrldq(uint8_t x[16], unsigned k) {
uint8_t t[16];
if (k > 16) k = 16;
memcpy(t, x + k, 16 - k);
memset(t + (16 - k), 0, k);
memcpy(x, t, 16);
}
static void SsePalignr(uint8_t x[16], const uint8_t y[16], unsigned k) {
uint8_t t[48];
memcpy(t, y, 16);
memcpy(t + 16, x, 16);
memset(t + 32, 0, 16);
memcpy(x, t + MIN(k, 32), 16);
}
static void SsePsubw(uint8_t x[16], const uint8_t y[16]) {
unsigned i;
for (i = 0; i < 8; ++i) {
Write16(x + i * 2, Read16(x + i * 2) - Read16(y + i * 2));
}
}
static void SsePaddw(uint8_t x[16], const uint8_t y[16]) {
unsigned i;
for (i = 0; i < 8; ++i) {
Write16(x + i * 2, Read16(x + i * 2) + Read16(y + i * 2));
}
}
static void SsePsubd(uint8_t x[16], const uint8_t y[16]) {
unsigned i;
for (i = 0; i < 4; ++i) {
Write32(x + i * 4, Read32(x + i * 4) - Read32(y + i * 4));
}
}
static void SsePaddd(uint8_t x[16], const uint8_t y[16]) {
unsigned i;
for (i = 0; i < 4; ++i) {
Write32(x + i * 4, Read32(x + i * 4) + Read32(y + i * 4));
}
}
static void SsePaddq(uint8_t x[16], const uint8_t y[16]) {
unsigned i;
for (i = 0; i < 2; ++i) {
Write64(x + i * 8, Read64(x + i * 8) + Read64(y + i * 8));
}
}
static void SsePsubq(uint8_t x[16], const uint8_t y[16]) {
unsigned i;
for (i = 0; i < 2; ++i) {
Write64(x + i * 8, Read64(x + i * 8) - Read64(y + i * 8));
}
}
static void SsePaddusw(uint8_t x[16], const uint8_t y[16]) {
unsigned i;
for (i = 0; i < 8; ++i) {
Write16(x + i * 2, MIN(65535, Read16(x + i * 2) + Read16(y + i * 2)));
}
}
static void SsePackuswb(uint8_t x[16], const uint8_t y[16]) {
unsigned i;
uint8_t t[16];
for (i = 0; i < 8; ++i) {
t[i + 0] = MIN(255, MAX(0, (int16_t)Read16(x + i * 2)));
}
for (i = 0; i < 8; ++i) {
t[i + 8] = MIN(255, MAX(0, (int16_t)Read16(y + i * 2)));
}
memcpy(x, t, 16);
}
static void SsePacksswb(uint8_t x[16], const uint8_t y[16]) {
unsigned i;
uint8_t t[16];
for (i = 0; i < 8; ++i) {
t[i + 0] = MAX(-128, MIN(127, (int16_t)Read16(x + i * 2)));
}
for (i = 0; i < 8; ++i) {
t[i + 8] = MAX(-128, MIN(127, (int16_t)Read16(y + i * 2)));
}
memcpy(x, t, 16);
}
static void SsePackssdw(uint8_t x[16], const uint8_t y[16]) {
unsigned i;
uint8_t t[16];
for (i = 0; i < 4; ++i) {
Write16(t + i * 2 + 0, MAX(-32768, MIN(32767, (int32_t)Read32(x + i * 4))));
}
for (i = 0; i < 4; ++i) {
Write16(t + i * 2 + 8, MAX(-32768, MIN(32767, (int32_t)Read32(y + i * 4))));
}
memcpy(x, t, 16);
}
static void SsePsadbw(uint8_t x[16], const uint8_t y[16]) {
unsigned i, s, t;
for (s = i = 0; i < 8; ++i) s += ABS(x[i] - y[i]);
for (t = 0; i < 16; ++i) t += ABS(x[i] - y[i]);
Write64(x + 0, s);
Write64(x + 8, t);
}
static void SsePmuludq(uint8_t x[16], const uint8_t y[16]) {
unsigned i;
for (i = 0; i < 2; ++i) {
Write64(x + i * 8, (uint64_t)Read32(x + i * 8) * Read32(y + i * 8));
}
}
static void SsePshufb(uint8_t x[16], const uint8_t y[16]) {
unsigned i;
uint8_t t[16];
for (i = 0; i < 16; ++i) {
t[i] = (y[i] & 128) ? 0 : x[y[i] & 15];
}
memcpy(x, t, 16);
}
static void SsePhaddd(uint8_t x[16], const uint8_t y[16]) {
uint8_t t[16];
Write32(t + 0 * 4, Read32(x + 0 * 4) + Read32(x + 1 * 4));
Write32(t + 1 * 4, Read32(x + 2 * 4) + Read32(x + 3 * 4));
Write32(t + 2 * 4, Read32(y + 0 * 4) + Read32(y + 1 * 4));
Write32(t + 3 * 4, Read32(y + 2 * 4) + Read32(y + 3 * 4));
memcpy(x, t, 16);
}
static void SsePhsubd(uint8_t x[16], const uint8_t y[16]) {
uint8_t t[16];
Write32(t + 0 * 4, Read32(x + 0 * 4) - Read32(x + 1 * 4));
Write32(t + 1 * 4, Read32(x + 2 * 4) - Read32(x + 3 * 4));
Write32(t + 2 * 4, Read32(y + 0 * 4) - Read32(y + 1 * 4));
Write32(t + 3 * 4, Read32(y + 2 * 4) - Read32(y + 3 * 4));
memcpy(x, t, 16);
}
static void SsePhaddw(uint8_t x[16], const uint8_t y[16]) {
uint8_t t[16];
Write16(t + 0 * 2, Read16(x + 0 * 2) + Read16(x + 1 * 2));
Write16(t + 1 * 2, Read16(x + 2 * 2) + Read16(x + 3 * 2));
Write16(t + 2 * 2, Read16(x + 4 * 2) + Read16(x + 5 * 2));
Write16(t + 3 * 2, Read16(x + 6 * 2) + Read16(x + 7 * 2));
Write16(t + 4 * 2, Read16(y + 0 * 2) + Read16(y + 1 * 2));
Write16(t + 5 * 2, Read16(y + 2 * 2) + Read16(y + 3 * 2));
Write16(t + 6 * 2, Read16(y + 4 * 2) + Read16(y + 5 * 2));
Write16(t + 7 * 2, Read16(y + 6 * 2) + Read16(y + 7 * 2));
memcpy(x, t, 16);
}
static void SsePhsubw(uint8_t x[16], const uint8_t y[16]) {
uint8_t t[16];
Write16(t + 0 * 2, Read16(x + 0 * 2) - Read16(x + 1 * 2));
Write16(t + 1 * 2, Read16(x + 2 * 2) - Read16(x + 3 * 2));
Write16(t + 2 * 2, Read16(x + 4 * 2) - Read16(x + 5 * 2));
Write16(t + 3 * 2, Read16(x + 6 * 2) - Read16(x + 7 * 2));
Write16(t + 4 * 2, Read16(y + 0 * 2) - Read16(y + 1 * 2));
Write16(t + 5 * 2, Read16(y + 2 * 2) - Read16(y + 3 * 2));
Write16(t + 6 * 2, Read16(y + 4 * 2) - Read16(y + 5 * 2));
Write16(t + 7 * 2, Read16(y + 6 * 2) - Read16(y + 7 * 2));
memcpy(x, t, 16);
}
static void SsePhaddsw(uint8_t x[16], const uint8_t y[16]) {
uint8_t t[16];
Write16(t + 0 * 2, MAX(-32768, MIN(32767, ((int16_t)Read16(x + 0 * 2) +
(int16_t)Read16(x + 1 * 2)))));
Write16(t + 1 * 2, MAX(-32768, MIN(32767, ((int16_t)Read16(x + 2 * 2) +
(int16_t)Read16(x + 3 * 2)))));
Write16(t + 2 * 2, MAX(-32768, MIN(32767, ((int16_t)Read16(x + 4 * 2) +
(int16_t)Read16(x + 5 * 2)))));
Write16(t + 3 * 2, MAX(-32768, MIN(32767, ((int16_t)Read16(x + 6 * 2) +
(int16_t)Read16(x + 7 * 2)))));
Write16(t + 4 * 2, MAX(-32768, MIN(32767, ((int16_t)Read16(y + 0 * 2) +
(int16_t)Read16(y + 1 * 2)))));
Write16(t + 5 * 2, MAX(-32768, MIN(32767, ((int16_t)Read16(y + 2 * 2) +
(int16_t)Read16(y + 3 * 2)))));
Write16(t + 6 * 2, MAX(-32768, MIN(32767, ((int16_t)Read16(y + 4 * 2) +
(int16_t)Read16(y + 5 * 2)))));
Write16(t + 7 * 2, MAX(-32768, MIN(32767, ((int16_t)Read16(y + 6 * 2) +
(int16_t)Read16(y + 7 * 2)))));
memcpy(x, t, 16);
}
static void SsePhsubsw(uint8_t x[16], const uint8_t y[16]) {
uint8_t t[16];
Write16(t + 0 * 2, MAX(-32768, MIN(32767, ((int16_t)Read16(x + 0 * 2) -
(int16_t)Read16(x + 1 * 2)))));
Write16(t + 1 * 2, MAX(-32768, MIN(32767, ((int16_t)Read16(x + 2 * 2) -
(int16_t)Read16(x + 3 * 2)))));
Write16(t + 2 * 2, MAX(-32768, MIN(32767, ((int16_t)Read16(x + 4 * 2) -
(int16_t)Read16(x + 5 * 2)))));
Write16(t + 3 * 2, MAX(-32768, MIN(32767, ((int16_t)Read16(x + 6 * 2) -
(int16_t)Read16(x + 7 * 2)))));
Write16(t + 4 * 2, MAX(-32768, MIN(32767, ((int16_t)Read16(y + 0 * 2) -
(int16_t)Read16(y + 1 * 2)))));
Write16(t + 5 * 2, MAX(-32768, MIN(32767, ((int16_t)Read16(y + 2 * 2) -
(int16_t)Read16(y + 3 * 2)))));
Write16(t + 6 * 2, MAX(-32768, MIN(32767, ((int16_t)Read16(y + 4 * 2) -
(int16_t)Read16(y + 5 * 2)))));
Write16(t + 7 * 2, MAX(-32768, MIN(32767, ((int16_t)Read16(y + 6 * 2) -
(int16_t)Read16(y + 7 * 2)))));
memcpy(x, t, 16);
}
static void SsePunpcklbw(uint8_t x[16], const uint8_t y[16]) {
x[0xf] = y[0x7];
x[0xe] = x[0x7];
x[0xd] = y[0x6];
x[0xc] = x[0x6];
x[0xb] = y[0x5];
x[0xa] = x[0x5];
x[0x9] = y[0x4];
x[0x8] = x[0x4];
x[0x7] = y[0x3];
x[0x6] = x[0x3];
x[0x5] = y[0x2];
x[0x4] = x[0x2];
x[0x3] = y[0x1];
x[0x2] = x[0x1];
x[0x1] = y[0x0];
x[0x0] = x[0x0];
}
static void SsePunpckhbw(uint8_t x[16], const uint8_t y[16]) {
x[0x0] = x[0x8];
x[0x1] = y[0x8];
x[0x2] = x[0x9];
x[0x3] = y[0x9];
x[0x4] = x[0xa];
x[0x5] = y[0xa];
x[0x6] = x[0xb];
x[0x7] = y[0xb];
x[0x8] = x[0xc];
x[0x9] = y[0xc];
x[0xa] = x[0xd];
x[0xb] = y[0xd];
x[0xc] = x[0xe];
x[0xd] = y[0xe];
x[0xe] = x[0xf];
x[0xf] = y[0xf];
}
static void SsePunpcklwd(uint8_t x[16], const uint8_t y[16]) {
x[0xe] = y[0x6];
x[0xf] = y[0x7];
x[0xc] = x[0x6];
x[0xd] = x[0x7];
x[0xa] = y[0x4];
x[0xb] = y[0x5];
x[0x8] = x[0x4];
x[0x9] = x[0x5];
x[0x6] = y[0x2];
x[0x7] = y[0x3];
x[0x4] = x[0x2];
x[0x5] = x[0x3];
x[0x2] = y[0x0];
x[0x3] = y[0x1];
x[0x0] = x[0x0];
x[0x1] = x[0x1];
}
static void SsePunpckldq(uint8_t x[16], const uint8_t y[16]) {
x[0xc] = y[0x4];
x[0xd] = y[0x5];
x[0xe] = y[0x6];
x[0xf] = y[0x7];
x[0x8] = x[0x4];
x[0x9] = x[0x5];
x[0xa] = x[0x6];
x[0xb] = x[0x7];
x[0x4] = y[0x0];
x[0x5] = y[0x1];
x[0x6] = y[0x2];
x[0x7] = y[0x3];
x[0x0] = x[0x0];
x[0x1] = x[0x1];
x[0x2] = x[0x2];
x[0x3] = x[0x3];
}
static void SsePunpckhwd(uint8_t x[16], const uint8_t y[16]) {
x[0x0] = x[0x8];
x[0x1] = x[0x9];
x[0x2] = y[0x8];
x[0x3] = y[0x9];
x[0x4] = x[0xa];
x[0x5] = x[0xb];
x[0x6] = y[0xa];
x[0x7] = y[0xb];
x[0x8] = x[0xc];
x[0x9] = x[0xd];
x[0xa] = y[0xc];
x[0xb] = y[0xd];
x[0xc] = x[0xe];
x[0xd] = x[0xf];
x[0xe] = y[0xe];
x[0xf] = y[0xf];
}
static void SsePunpckhdq(uint8_t x[16], const uint8_t y[16]) {
x[0x0] = x[0x8];
x[0x1] = x[0x9];
x[0x2] = x[0xa];
x[0x3] = x[0xb];
x[0x4] = y[0x8];
x[0x5] = y[0x9];
x[0x6] = y[0xa];
x[0x7] = y[0xb];
x[0x8] = x[0xc];
x[0x9] = x[0xd];
x[0xa] = x[0xe];
x[0xb] = x[0xf];
x[0xc] = y[0xc];
x[0xd] = y[0xd];
x[0xe] = y[0xe];
x[0xf] = y[0xf];
}
static void SsePunpcklqdq(uint8_t x[16], const uint8_t y[16]) {
x[0x8] = y[0x0];
x[0x9] = y[0x1];
x[0xa] = y[0x2];
x[0xb] = y[0x3];
x[0xc] = y[0x4];
x[0xd] = y[0x5];
x[0xe] = y[0x6];
x[0xf] = y[0x7];
x[0x0] = x[0x0];
x[0x1] = x[0x1];
x[0x2] = x[0x2];
x[0x3] = x[0x3];
x[0x4] = x[0x4];
x[0x5] = x[0x5];
x[0x6] = x[0x6];
x[0x7] = x[0x7];
}
static void SsePunpckhqdq(uint8_t x[16], const uint8_t y[16]) {
x[0x0] = x[0x8];
x[0x1] = x[0x9];
x[0x2] = x[0xa];
x[0x3] = x[0xb];
x[0x4] = x[0xc];
x[0x5] = x[0xd];
x[0x6] = x[0xe];
x[0x7] = x[0xf];
x[0x8] = y[0x8];
x[0x9] = y[0x9];
x[0xa] = y[0xa];
x[0xb] = y[0xb];
x[0xc] = y[0xc];
x[0xd] = y[0xd];
x[0xe] = y[0xe];
x[0xf] = y[0xf];
}
static void SsePsrlw(uint8_t x[16], unsigned k) {
MmxPsrlw(x + 0, k);
MmxPsrlw(x + 8, k);
}
static void SsePsraw(uint8_t x[16], unsigned k) {
MmxPsraw(x + 0, k);
MmxPsraw(x + 8, k);
}
static void SsePsllw(uint8_t x[16], unsigned k) {
MmxPsllw(x + 0, k);
MmxPsllw(x + 8, k);
}
static void SsePsrld(uint8_t x[16], unsigned k) {
MmxPsrld(x + 0, k);
MmxPsrld(x + 8, k);
}
static void SsePsrad(uint8_t x[16], unsigned k) {
MmxPsrad(x + 0, k);
MmxPsrad(x + 8, k);
}
static void SsePslld(uint8_t x[16], unsigned k) {
MmxPslld(x + 0, k);
MmxPslld(x + 8, k);
}
static void SsePsrlq(uint8_t x[16], unsigned k) {
MmxPsrlq(x + 0, k);
MmxPsrlq(x + 8, k);
}
static void SsePsllq(uint8_t x[16], unsigned k) {
MmxPsllq(x + 0, k);
MmxPsllq(x + 8, k);
}
static void SsePsubsb(uint8_t x[16], const uint8_t y[16]) {
MmxPsubsb(x + 0, y + 0);
MmxPsubsb(x + 8, y + 8);
}
static void SsePaddsb(uint8_t x[16], const uint8_t y[16]) {
MmxPaddsb(x + 0, y + 0);
MmxPaddsb(x + 8, y + 8);
}
static void SsePsubsw(uint8_t x[16], const uint8_t y[16]) {
MmxPsubsw(x + 0, y + 0);
MmxPsubsw(x + 8, y + 8);
}
static void SsePaddsw(uint8_t x[16], const uint8_t y[16]) {
MmxPaddsw(x + 0, y + 0);
MmxPaddsw(x + 8, y + 8);
}
static void SsePaddusb(uint8_t x[16], const uint8_t y[16]) {
MmxPaddusb(x + 0, y + 0);
MmxPaddusb(x + 8, y + 8);
}
static void SsePsubusb(uint8_t x[16], const uint8_t y[16]) {
MmxPsubusb(x + 0, y + 0);
MmxPsubusb(x + 8, y + 8);
}
static void SsePsubusw(uint8_t x[16], const uint8_t y[16]) {
MmxPsubusw(x + 0, y + 0);
MmxPsubusw(x + 8, y + 8);
}
static void SsePminsw(uint8_t x[16], const uint8_t y[16]) {
MmxPminsw(x + 0, y + 0);
MmxPminsw(x + 8, y + 8);
}
static void SsePmaxsw(uint8_t x[16], const uint8_t y[16]) {
MmxPmaxsw(x + 0, y + 0);
MmxPmaxsw(x + 8, y + 8);
}
static void SsePsignb(uint8_t x[16], const uint8_t y[16]) {
MmxPsignb(x + 0, y + 0);
MmxPsignb(x + 8, y + 8);
}
static void SsePsignw(uint8_t x[16], const uint8_t y[16]) {
MmxPsignw(x + 0, y + 0);
MmxPsignw(x + 8, y + 8);
}
static void SsePsignd(uint8_t x[16], const uint8_t y[16]) {
MmxPsignd(x + 0, y + 0);
MmxPsignd(x + 8, y + 8);
}
static void SsePmulhrsw(uint8_t x[16], const uint8_t y[16]) {
MmxPmulhrsw(x + 0, y + 0);
MmxPmulhrsw(x + 8, y + 8);
}
static void SsePabsw(uint8_t x[16], const uint8_t y[16]) {
MmxPabsw(x + 0, y + 0);
MmxPabsw(x + 8, y + 8);
}
static void SsePabsd(uint8_t x[16], const uint8_t y[16]) {
MmxPabsd(x + 0, y + 0);
MmxPabsd(x + 8, y + 8);
}
static void SsePcmpgtw(uint8_t x[16], const uint8_t y[16]) {
MmxPcmpgtw(x + 0, y + 0);
MmxPcmpgtw(x + 8, y + 8);
}
static void SsePcmpeqw(uint8_t x[16], const uint8_t y[16]) {
MmxPcmpeqw(x + 0, y + 0);
MmxPcmpeqw(x + 8, y + 8);
}
static void SsePcmpgtd(uint8_t x[16], const uint8_t y[16]) {
MmxPcmpgtd(x + 0, y + 0);
MmxPcmpgtd(x + 8, y + 8);
}
static void SsePcmpeqd(uint8_t x[16], const uint8_t y[16]) {
MmxPcmpeqd(x + 0, y + 0);
MmxPcmpeqd(x + 8, y + 8);
}
static void SsePsrawv(uint8_t x[16], const uint8_t y[16]) {
MmxPsrawv(x + 0, y);
MmxPsrawv(x + 8, y);
}
static void SsePsradv(uint8_t x[16], const uint8_t y[16]) {
MmxPsradv(x + 0, y);
MmxPsradv(x + 8, y);
}
static void SsePsrlwv(uint8_t x[16], const uint8_t y[16]) {
MmxPsrlwv(x + 0, y);
MmxPsrlwv(x + 8, y);
}
static void SsePsllwv(uint8_t x[16], const uint8_t y[16]) {
MmxPsllwv(x + 0, y);
MmxPsllwv(x + 8, y);
}
static void SsePsrldv(uint8_t x[16], const uint8_t y[16]) {
MmxPsrldv(x + 0, y);
MmxPsrldv(x + 8, y);
}
static void SsePslldv(uint8_t x[16], const uint8_t y[16]) {
MmxPslldv(x + 0, y);
MmxPslldv(x + 8, y);
}
static void SsePsrlqv(uint8_t x[16], const uint8_t y[16]) {
MmxPsrlqv(x + 0, y);
MmxPsrlqv(x + 8, y);
}
static void SsePsllqv(uint8_t x[16], const uint8_t y[16]) {
MmxPsllqv(x + 0, y);
MmxPsllqv(x + 8, y);
}
static void SsePavgw(uint8_t x[16], const uint8_t y[16]) {
MmxPavgw(x + 0, y + 0);
MmxPavgw(x + 8, y + 8);
}
static void SsePmaddwd(uint8_t x[16], const uint8_t y[16]) {
MmxPmaddwd(x + 0, y + 0);
MmxPmaddwd(x + 8, y + 8);
}
static void SsePmulhuw(uint8_t x[16], const uint8_t y[16]) {
MmxPmulhuw(x + 0, y + 0);
MmxPmulhuw(x + 8, y + 8);
}
static void SsePmulhw(uint8_t x[16], const uint8_t y[16]) {
MmxPmulhw(x + 0, y + 0);
MmxPmulhw(x + 8, y + 8);
}
static void SsePmullw(uint8_t x[16], const uint8_t y[16]) {
MmxPmullw(x + 0, y + 0);
MmxPmullw(x + 8, y + 8);
}
static void SsePmulld(uint8_t x[16], const uint8_t y[16]) {
MmxPmulld(x + 0, y + 0);
MmxPmulld(x + 8, y + 8);
}
static void SsePmaddubsw(uint8_t x[16], const uint8_t y[16]) {
MmxPmaddubsw(x + 0, y + 0);
MmxPmaddubsw(x + 8, y + 8);
}
static void OpPsb(struct Machine *m, uint32_t rde,
void MmxKernel(uint8_t[8], unsigned),
void SseKernel(uint8_t[16], unsigned)) {
if (Osz(rde)) {
SseKernel(XmmRexbRm(m, rde), m->xedd->op.uimm0);
} else {
MmxKernel(XmmRexbRm(m, rde), m->xedd->op.uimm0);
}
}
void Op171(struct Machine *m, uint32_t rde) {
switch (ModrmReg(rde)) {
case 2:
OpPsb(m, rde, MmxPsrlw, SsePsrlw);
break;
case 4:
OpPsb(m, rde, MmxPsraw, SsePsraw);
break;
case 6:
OpPsb(m, rde, MmxPsllw, SsePsllw);
break;
default:
OpUd(m, rde);
}
}
void Op172(struct Machine *m, uint32_t rde) {
switch (ModrmReg(rde)) {
case 2:
OpPsb(m, rde, MmxPsrld, SsePsrld);
break;
case 4:
OpPsb(m, rde, MmxPsrad, SsePsrad);
break;
case 6:
OpPsb(m, rde, MmxPslld, SsePslld);
break;
default:
OpUd(m, rde);
}
}
void Op173(struct Machine *m, uint32_t rde) {
switch (ModrmReg(rde)) {
case 2:
OpPsb(m, rde, MmxPsrlq, SsePsrlq);
break;
case 3:
OpPsb(m, rde, MmxPsrldq, SsePsrldq);
break;
case 6:
OpPsb(m, rde, MmxPsllq, SsePsllq);
break;
case 7:
OpPsb(m, rde, MmxPslldq, SsePslldq);
break;
default:
OpUd(m, rde);
}
}
void OpSsePalignr(struct Machine *m, uint32_t rde) {
if (Osz(rde)) {
SsePalignr(XmmRexrReg(m, rde), GetModrmRegisterXmmPointerRead16(m, rde),
m->xedd->op.uimm0);
} else {
MmxPalignr(XmmRexrReg(m, rde), GetModrmRegisterXmmPointerRead8(m, rde),
m->xedd->op.uimm0);
}
}
static void OpSse(struct Machine *m, uint32_t rde,
void MmxKernel(uint8_t[8], const uint8_t[8]),
void SseKernel(uint8_t[16], const uint8_t[16])) {
if (Osz(rde)) {
SseKernel(XmmRexrReg(m, rde), GetModrmRegisterXmmPointerRead16(m, rde));
} else {
MmxKernel(XmmRexrReg(m, rde), GetModrmRegisterXmmPointerRead8(m, rde));
}
}
/* clang-format off */
void OpSsePunpcklbw(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPunpcklbw, SsePunpcklbw); }
void OpSsePunpcklwd(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPunpcklwd, SsePunpcklwd); }
void OpSsePunpckldq(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPunpckldq, SsePunpckldq); }
void OpSsePacksswb(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPacksswb, SsePacksswb); }
void OpSsePcmpgtb(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPcmpgtb, SsePcmpgtb); }
void OpSsePcmpgtw(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPcmpgtw, SsePcmpgtw); }
void OpSsePcmpgtd(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPcmpgtd, SsePcmpgtd); }
void OpSsePackuswb(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPackuswb, SsePackuswb); }
void OpSsePunpckhbw(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPunpckhbw, SsePunpckhbw); }
void OpSsePunpckhwd(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPunpckhwd, SsePunpckhwd); }
void OpSsePunpckhdq(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPunpckhdq, SsePunpckhdq); }
void OpSsePackssdw(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPackssdw, SsePackssdw); }
void OpSsePunpcklqdq(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPunpcklqdq, SsePunpcklqdq); }
void OpSsePunpckhqdq(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPunpckhqdq, SsePunpckhqdq); }
void OpSsePcmpeqb(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPcmpeqb, SsePcmpeqb); }
void OpSsePcmpeqw(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPcmpeqw, SsePcmpeqw); }
void OpSsePcmpeqd(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPcmpeqd, SsePcmpeqd); }
void OpSsePsrlwv(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPsrlwv, SsePsrlwv); }
void OpSsePsrldv(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPsrldv, SsePsrldv); }
void OpSsePsrlqv(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPsrlqv, SsePsrlqv); }
void OpSsePaddq(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPaddq, SsePaddq); }
void OpSsePmullw(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPmullw, SsePmullw); }
void OpSsePsubusb(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPsubusb, SsePsubusb); }
void OpSsePsubusw(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPsubusw, SsePsubusw); }
void OpSsePminub(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPminub, SsePminub); }
void OpSsePand(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPand, SsePand); }
void OpSsePaddusb(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPaddusb, SsePaddusb); }
void OpSsePaddusw(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPaddusw, SsePaddusw); }
void OpSsePmaxub(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPmaxub, SsePmaxub); }
void OpSsePandn(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPandn, SsePandn); }
void OpSsePavgb(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPavgb, SsePavgb); }
void OpSsePsrawv(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPsrawv, SsePsrawv); }
void OpSsePsradv(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPsradv, SsePsradv); }
void OpSsePavgw(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPavgw, SsePavgw); }
void OpSsePmulhuw(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPmulhuw, SsePmulhuw); }
void OpSsePmulhw(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPmulhw, SsePmulhw); }
void OpSsePsubsb(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPsubsb, SsePsubsb); }
void OpSsePsubsw(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPsubsw, SsePsubsw); }
void OpSsePminsw(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPminsw, SsePminsw); }
void OpSsePor(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPor, SsePor); }
void OpSsePaddsb(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPaddsb, SsePaddsb); }
void OpSsePaddsw(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPaddsw, SsePaddsw); }
void OpSsePmaxsw(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPmaxsw, SsePmaxsw); }
void OpSsePxor(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPxor, SsePxor); }
void OpSsePsllwv(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPsllwv, SsePsllwv); }
void OpSsePslldv(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPslldv, SsePslldv); }
void OpSsePsllqv(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPsllqv, SsePsllqv); }
void OpSsePmuludq(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPmuludq, SsePmuludq); }
void OpSsePmaddwd(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPmaddwd, SsePmaddwd); }
void OpSsePsadbw(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPsadbw, SsePsadbw); }
void OpSsePsubb(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPsubb, SsePsubb); }
void OpSsePsubw(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPsubw, SsePsubw); }
void OpSsePsubd(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPsubd, SsePsubd); }
void OpSsePsubq(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPsubq, SsePsubq); }
void OpSsePaddb(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPaddb, SsePaddb); }
void OpSsePaddw(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPaddw, SsePaddw); }
void OpSsePaddd(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPaddd, SsePaddd); }
void OpSsePshufb(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPshufb, SsePshufb); }
void OpSsePhaddw(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPhaddw, SsePhaddw); }
void OpSsePhaddd(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPhaddd, SsePhaddd); }
void OpSsePhaddsw(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPhaddsw, SsePhaddsw); }
void OpSsePmaddubsw(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPmaddubsw, SsePmaddubsw); }
void OpSsePhsubw(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPhsubw, SsePhsubw); }
void OpSsePhsubd(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPhsubd, SsePhsubd); }
void OpSsePhsubsw(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPhsubsw, SsePhsubsw); }
void OpSsePsignb(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPsignb, SsePsignb); }
void OpSsePsignw(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPsignw, SsePsignw); }
void OpSsePsignd(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPsignd, SsePsignd); }
void OpSsePmulhrsw(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPmulhrsw, SsePmulhrsw); }
void OpSsePabsb(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPabsb, SsePabsb); }
void OpSsePabsw(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPabsw, SsePabsw); }
void OpSsePabsd(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPabsd, SsePabsd); }
void OpSsePmulld(struct Machine *m, uint32_t rde) { OpSse(m, rde, MmxPmulld, SsePmulld); }
/* clang-format on */