cosmopolitan/tool/build/lib/sse.c
Justine Tunney 416fd86676 Make improvements
- Emulator can now test the αcτµαlly pδrταblε εxεcµταblε bootloader

- Whipped up a webserver named redbean. It services 150k requests per
  second on a single core. Bundling assets inside zip enables extremely
  fast serving for two reasons. The first is that zip central directory
  lookups go faster than stat() system calls. The second is that both
  zip and gzip content-encoding use DEFLATE, therefore, compressed
  responses can be served via the sendfile() system call which does an
  in-kernel copy directly from the zip executable structure. Also note
  that red bean zip executables can be deployed easily to all platforms,
  since these native executables work on Linux, Mac, BSD, and Windows.

- Address sanitizer now works very well
2020-09-14 00:02:34 -07:00

755 lines
18 KiB
C

/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
│vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi│
╞══════════════════════════════════════════════════════════════════════════════╡
│ Copyright 2020 Justine Alexandra Roberts Tunney │
│ │
│ This program is free software; you can redistribute it and/or modify │
│ it under the terms of the GNU General Public License as published by │
│ the Free Software Foundation; version 2 of the License. │
│ │
│ This program is distributed in the hope that it will be useful, but │
│ WITHOUT ANY WARRANTY; without even the implied warranty of │
│ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU │
│ General Public License for more details. │
│ │
│ You should have received a copy of the GNU General Public License │
│ along with this program; if not, write to the Free Software │
│ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA │
│ 02110-1301 USA │
╚─────────────────────────────────────────────────────────────────────────────*/
#include "libc/intrin/pabsb.h"
#include "libc/intrin/pabsd.h"
#include "libc/intrin/pabsw.h"
#include "libc/intrin/packssdw.h"
#include "libc/intrin/packsswb.h"
#include "libc/intrin/packuswb.h"
#include "libc/intrin/paddb.h"
#include "libc/intrin/paddd.h"
#include "libc/intrin/paddq.h"
#include "libc/intrin/paddsb.h"
#include "libc/intrin/paddsw.h"
#include "libc/intrin/paddusb.h"
#include "libc/intrin/paddusw.h"
#include "libc/intrin/paddw.h"
#include "libc/intrin/palignr.h"
#include "libc/intrin/pand.h"
#include "libc/intrin/pandn.h"
#include "libc/intrin/pavgb.h"
#include "libc/intrin/pavgw.h"
#include "libc/intrin/pcmpeqb.h"
#include "libc/intrin/pcmpeqd.h"
#include "libc/intrin/pcmpeqw.h"
#include "libc/intrin/pcmpgtb.h"
#include "libc/intrin/pcmpgtd.h"
#include "libc/intrin/pcmpgtw.h"
#include "libc/intrin/phaddd.h"
#include "libc/intrin/phaddsw.h"
#include "libc/intrin/phaddw.h"
#include "libc/intrin/phsubd.h"
#include "libc/intrin/phsubsw.h"
#include "libc/intrin/phsubw.h"
#include "libc/intrin/pmaddubsw.h"
#include "libc/intrin/pmaddwd.h"
#include "libc/intrin/pmaxsw.h"
#include "libc/intrin/pmaxub.h"
#include "libc/intrin/pminsw.h"
#include "libc/intrin/pminub.h"
#include "libc/intrin/pmulhrsw.h"
#include "libc/intrin/pmulhuw.h"
#include "libc/intrin/pmulhw.h"
#include "libc/intrin/pmulld.h"
#include "libc/intrin/pmullw.h"
#include "libc/intrin/pmuludq.h"
#include "libc/intrin/por.h"
#include "libc/intrin/psadbw.h"
#include "libc/intrin/pshufb.h"
#include "libc/intrin/psignb.h"
#include "libc/intrin/psignd.h"
#include "libc/intrin/psignw.h"
#include "libc/intrin/pslld.h"
#include "libc/intrin/pslldq.h"
#include "libc/intrin/psllq.h"
#include "libc/intrin/psllw.h"
#include "libc/intrin/psrad.h"
#include "libc/intrin/psraw.h"
#include "libc/intrin/psrld.h"
#include "libc/intrin/psrldq.h"
#include "libc/intrin/psrlq.h"
#include "libc/intrin/psrlw.h"
#include "libc/intrin/psubb.h"
#include "libc/intrin/psubd.h"
#include "libc/intrin/psubq.h"
#include "libc/intrin/psubsb.h"
#include "libc/intrin/psubsw.h"
#include "libc/intrin/psubusb.h"
#include "libc/intrin/psubusw.h"
#include "libc/intrin/psubw.h"
#include "libc/intrin/punpckhbw.h"
#include "libc/intrin/punpckhdq.h"
#include "libc/intrin/punpckhqdq.h"
#include "libc/intrin/punpckhwd.h"
#include "libc/intrin/punpcklbw.h"
#include "libc/intrin/punpckldq.h"
#include "libc/intrin/punpcklqdq.h"
#include "libc/intrin/punpcklwd.h"
#include "libc/intrin/pxor.h"
#include "libc/macros.h"
#include "libc/str/str.h"
#include "tool/build/lib/case.h"
#include "tool/build/lib/machine.h"
#include "tool/build/lib/memory.h"
#include "tool/build/lib/modrm.h"
#include "tool/build/lib/sse.h"
union MachineVector {
float f32[4];
double f64[2];
int8_t i8[16];
int16_t i16[8];
int32_t i32[4];
int64_t i64[2];
uint8_t u8[16];
uint16_t u16[8];
uint32_t u32[4];
uint64_t u64[2];
};
static void SsePsubb(void *b, const void *a) {
psubb(b, b, a);
}
static void SsePaddb(void *b, const void *a) {
paddb(b, b, a);
}
static void SsePsubw(void *b, const void *a) {
psubw(b, b, a);
}
static void SsePaddw(void *b, const void *a) {
paddw(b, b, a);
}
static void SsePsubd(void *b, const void *a) {
psubd(b, b, a);
}
static void SsePaddd(void *b, const void *a) {
paddd(b, b, a);
}
static void SsePaddq(void *b, const void *a) {
paddq(b, b, a);
}
static void SsePsubq(void *b, const void *a) {
psubq(b, b, a);
}
static void SsePsubsb(void *b, const void *a) {
psubsb(b, b, a);
}
static void SsePsubsw(void *b, const void *a) {
psubsw(b, b, a);
}
static void SsePaddsb(void *b, const void *a) {
paddsb(b, b, a);
}
static void SsePaddsw(void *b, const void *a) {
paddsw(b, b, a);
}
static void SsePaddusb(void *b, const void *a) {
paddusb(b, b, a);
}
static void SsePaddusw(void *b, const void *a) {
paddusw(b, b, a);
}
static void SsePor(void *b, const void *a) {
por(b, b, a);
}
static void SsePxor(void *b, const void *a) {
pxor(b, b, a);
}
static void SsePand(void *b, const void *a) {
pand(b, b, a);
}
static void SsePandn(void *b, const void *a) {
pandn(b, b, a);
}
static void SsePsubusb(void *b, const void *a) {
psubusb(b, b, a);
}
static void SsePsubusw(void *b, const void *a) {
psubusw(b, b, a);
}
static void SsePminub(void *b, const void *a) {
pminub(b, b, a);
}
static void SsePmaxub(void *b, const void *a) {
pmaxub(b, b, a);
}
static void SsePminsw(void *b, const void *a) {
pminsw(b, b, a);
}
static void SsePmaxsw(void *b, const void *a) {
pmaxsw(b, b, a);
}
static void SsePunpcklbw(void *b, const void *a) {
punpcklbw(b, b, a);
}
static void SsePunpckhbw(void *b, const void *a) {
punpckhbw(b, b, a);
}
static void SsePunpcklwd(void *b, const void *a) {
punpcklwd(b, b, a);
}
static void SsePunpckldq(void *b, const void *a) {
punpckldq(b, b, a);
}
static void SsePunpckhwd(void *b, const void *a) {
punpckhwd(b, b, a);
}
static void SsePunpckhdq(void *b, const void *a) {
punpckhdq(b, b, a);
}
static void SsePunpcklqdq(void *b, const void *a) {
punpcklqdq(b, b, a);
}
static void SsePunpckhqdq(void *b, const void *a) {
punpckhqdq(b, b, a);
}
static void SsePacksswb(void *b, const void *a) {
packsswb(b, b, a);
}
static void SsePackuswb(void *b, const void *a) {
packuswb(b, b, a);
}
static void SsePackssdw(void *b, const void *a) {
packssdw(b, b, a);
}
static void SsePcmpgtb(void *b, const void *a) {
pcmpgtb(b, b, a);
}
static void SsePcmpgtw(void *b, const void *a) {
pcmpgtw(b, b, a);
}
static void SsePcmpgtd(void *b, const void *a) {
pcmpgtd(b, b, a);
}
static void SsePcmpeqb(void *b, const void *a) {
pcmpeqb(b, b, a);
}
static void SsePcmpeqw(void *b, const void *a) {
pcmpeqw(b, b, a);
}
static void SsePcmpeqd(void *b, const void *a) {
pcmpeqd(b, b, a);
}
static void SsePsrawv(void *b, const void *a) {
psrawv(b, b, a);
}
static void SsePsrlwv(void *b, const void *a) {
psrlwv(b, b, a);
}
static void SsePsllwv(void *b, const void *a) {
psllwv(b, b, a);
}
static void SsePsradv(void *b, const void *a) {
psradv(b, b, a);
}
static void SsePsrldv(void *b, const void *a) {
psrldv(b, b, a);
}
static void SsePslldv(void *b, const void *a) {
pslldv(b, b, a);
}
static void SsePsrlqv(void *b, const void *a) {
psrlqv(b, b, a);
}
static void SsePsllqv(void *b, const void *a) {
psllqv(b, b, a);
}
static void SsePavgb(void *b, const void *a) {
pavgb(b, b, a);
}
static void SsePavgw(void *b, const void *a) {
pavgw(b, b, a);
}
static void SsePsadbw(void *b, const void *a) {
psadbw(b, b, a);
}
static void SsePmaddwd(void *b, const void *a) {
pmaddwd(b, b, a);
}
static void SsePmulhuw(void *b, const void *a) {
pmulhuw(b, b, a);
}
static void SsePmulhw(void *b, const void *a) {
pmulhw(b, b, a);
}
static void SsePmuludq(void *b, const void *a) {
pmuludq(b, b, a);
}
static void SsePmullw(void *b, const void *a) {
pmullw(b, b, a);
}
static void SsePmulld(void *b, const void *a) {
pmulld(b, b, a);
}
static void SsePshufb(void *b, const void *a) {
pshufb(b, b, a);
}
static void SsePhaddw(void *b, const void *a) {
phaddw(b, b, a);
}
static void SsePhaddd(void *b, const void *a) {
phaddd(b, b, a);
}
static void SsePhaddsw(void *b, const void *a) {
phaddsw(b, b, a);
}
static void SsePmaddubsw(void *b, const void *a) {
pmaddubsw(b, b, a);
}
static void SsePhsubw(void *b, const void *a) {
phsubw(b, b, a);
}
static void SsePhsubd(void *b, const void *a) {
phsubd(b, b, a);
}
static void SsePhsubsw(void *b, const void *a) {
phsubsw(b, b, a);
}
static void SsePsignb(void *b, const void *a) {
psignb(b, b, a);
}
static void SsePsignw(void *b, const void *a) {
psignw(b, b, a);
}
static void SsePsignd(void *b, const void *a) {
psignd(b, b, a);
}
static void SsePmulhrsw(void *b, const void *a) {
pmulhrsw(b, b, a);
}
static void SsePabsb(void *b, const void *a) {
pabsb(b, a);
}
static void SsePabsw(void *b, const void *a) {
pabsw(b, a);
}
static void SsePabsd(void *b, const void *a) {
pabsd(b, a);
}
static void OpSse(struct Machine *m, uint32_t rde,
void kernel(void *, const void *)) {
char x[16], y[16];
if (Osz(rde)) {
kernel(XmmRexrReg(m, rde), GetModrmRegisterXmmPointerRead16(m, rde));
} else {
memcpy(x, XmmRexrReg(m, rde), 8);
memcpy(y, GetModrmRegisterXmmPointerRead16(m, rde), 8);
kernel(x, y);
memcpy(XmmRexrReg(m, rde), x, 8);
}
}
void OpSsePunpcklbw(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePunpcklbw);
}
void OpSsePunpcklwd(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePunpcklwd);
}
void OpSsePunpckldq(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePunpckldq);
}
void OpSsePacksswb(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePacksswb);
}
void OpSsePcmpgtb(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePcmpgtb);
}
void OpSsePcmpgtw(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePcmpgtw);
}
void OpSsePcmpgtd(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePcmpgtd);
}
void OpSsePackuswb(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePackuswb);
}
void OpSsePunpckhbw(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePunpckhbw);
}
void OpSsePunpckhwd(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePunpckhwd);
}
void OpSsePunpckhdq(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePunpckhdq);
}
void OpSsePackssdw(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePackssdw);
}
void OpSsePunpcklqdq(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePunpcklqdq);
}
void OpSsePunpckhqdq(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePunpckhqdq);
}
void OpSsePcmpeqb(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePcmpeqb);
}
void OpSsePcmpeqw(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePcmpeqw);
}
void OpSsePcmpeqd(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePcmpeqd);
}
void OpSsePsrlwv(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePsrlwv);
}
void OpSsePsrldv(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePsrldv);
}
void OpSsePsrlqv(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePsrlqv);
}
void OpSsePaddq(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePaddq);
}
void OpSsePmullw(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePmullw);
}
void OpSsePsubusb(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePsubusb);
}
void OpSsePsubusw(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePsubusw);
}
void OpSsePminub(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePminub);
}
void OpSsePand(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePand);
}
void OpSsePaddusb(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePaddusb);
}
void OpSsePaddusw(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePaddusw);
}
void OpSsePmaxub(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePmaxub);
}
void OpSsePandn(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePandn);
}
void OpSsePavgb(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePavgb);
}
void OpSsePsrawv(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePsrawv);
}
void OpSsePsradv(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePsradv);
}
void OpSsePavgw(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePavgw);
}
void OpSsePmulhuw(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePmulhuw);
}
void OpSsePmulhw(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePmulhw);
}
void OpSsePsubsb(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePsubsb);
}
void OpSsePsubsw(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePsubsw);
}
void OpSsePminsw(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePminsw);
}
void OpSsePor(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePor);
}
void OpSsePaddsb(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePaddsb);
}
void OpSsePaddsw(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePaddsw);
}
void OpSsePmaxsw(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePmaxsw);
}
void OpSsePxor(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePxor);
}
void OpSsePsllwv(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePsllwv);
}
void OpSsePslldv(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePslldv);
}
void OpSsePsllqv(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePsllqv);
}
void OpSsePmuludq(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePmuludq);
}
void OpSsePmaddwd(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePmaddwd);
}
void OpSsePsadbw(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePsadbw);
}
void OpSsePsubb(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePsubb);
}
void OpSsePsubw(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePsubw);
}
void OpSsePsubd(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePsubd);
}
void OpSsePsubq(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePsubq);
}
void OpSsePaddb(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePaddb);
}
void OpSsePaddw(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePaddw);
}
void OpSsePaddd(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePaddd);
}
void OpSsePshufb(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePshufb);
}
void OpSsePhaddw(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePhaddw);
}
void OpSsePhaddd(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePhaddd);
}
void OpSsePhaddsw(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePhaddsw);
}
void OpSsePmaddubsw(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePmaddubsw);
}
void OpSsePhsubw(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePhsubw);
}
void OpSsePhsubd(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePhsubd);
}
void OpSsePhsubsw(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePhsubsw);
}
void OpSsePsignb(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePsignb);
}
void OpSsePsignw(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePsignw);
}
void OpSsePsignd(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePsignd);
}
void OpSsePmulhrsw(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePmulhrsw);
}
void OpSsePabsb(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePabsb);
}
void OpSsePabsw(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePabsw);
}
void OpSsePabsd(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePabsd);
}
void OpSsePmulld(struct Machine *m, uint32_t rde) {
OpSse(m, rde, SsePmulld);
}
void OpSseUdqIb(struct Machine *m, uint32_t rde, enum OpSseUdqIbKernel kernel) {
uint8_t i;
union MachineVector x;
i = m->xedd->op.uimm0;
memcpy(&x, XmmRexbRm(m, rde), 16);
switch (kernel) {
CASE(kOpSseUdqIbPsrlw, (psrlw)(x.u16, x.u16, i));
CASE(kOpSseUdqIbPsraw, (psraw)(x.i16, x.i16, i));
CASE(kOpSseUdqIbPsllw, (psllw)(x.u16, x.u16, i));
CASE(kOpSseUdqIbPsrld, (psrld)(x.u32, x.u32, i));
CASE(kOpSseUdqIbPsrad, (psrad)(x.i32, x.i32, i));
CASE(kOpSseUdqIbPslld, (pslld)(x.u32, x.u32, i));
CASE(kOpSseUdqIbPsrlq, (psrlq)(x.u64, x.u64, i));
CASE(kOpSseUdqIbPsrldq, (psrldq)(x.u8, x.u8, i));
CASE(kOpSseUdqIbPsllq, (psllq)(x.u64, x.u64, i));
CASE(kOpSseUdqIbPslldq, (pslldq)(x.u8, x.u8, i));
default:
unreachable;
}
if (Osz(rde)) {
memcpy(XmmRexbRm(m, rde), &x, 16);
} else {
memcpy(XmmRexbRm(m, rde), &x, 8);
}
}
static void OpSsePalignrMmx(struct Machine *m, uint32_t rde) {
char t[24];
memcpy(t, GetModrmRegisterXmmPointerRead8(m, rde), 8);
memcpy(t + 8, XmmRexrReg(m, rde), 8);
memset(t + 16, 0, 8);
memcpy(XmmRexrReg(m, rde), t + MIN(m->xedd->op.uimm0, 16), 8);
}
void OpSsePalignr(struct Machine *m, uint32_t rde) {
if (Osz(rde)) {
palignr(XmmRexrReg(m, rde), XmmRexrReg(m, rde),
GetModrmRegisterXmmPointerRead8(m, rde), m->xedd->op.uimm0);
} else {
OpSsePalignrMmx(m, rde);
}
}