cosmopolitan/tool/build/lib/ssefloat.c
Justine Tunney 416fd86676 Make improvements
- Emulator can now test the αcτµαlly pδrταblε εxεcµταblε bootloader

- Whipped up a webserver named redbean. It services 150k requests per
  second on a single core. Bundling assets inside zip enables extremely
  fast serving for two reasons. The first is that zip central directory
  lookups go faster than stat() system calls. The second is that both
  zip and gzip content-encoding use DEFLATE, therefore, compressed
  responses can be served via the sendfile() system call which does an
  in-kernel copy directly from the zip executable structure. Also note
  that red bean zip executables can be deployed easily to all platforms,
  since these native executables work on Linux, Mac, BSD, and Windows.

- Address sanitizer now works very well
2020-09-14 00:02:34 -07:00

571 lines
16 KiB
C

/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
│vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi│
╞══════════════════════════════════════════════════════════════════════════════╡
│ Copyright 2020 Justine Alexandra Roberts Tunney │
│ │
│ This program is free software; you can redistribute it and/or modify │
│ it under the terms of the GNU General Public License as published by │
│ the Free Software Foundation; version 2 of the License. │
│ │
│ This program is distributed in the hope that it will be useful, but │
│ WITHOUT ANY WARRANTY; without even the implied warranty of │
│ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU │
│ General Public License for more details. │
│ │
│ You should have received a copy of the GNU General Public License │
│ along with this program; if not, write to the Free Software │
│ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA │
│ 02110-1301 USA │
╚─────────────────────────────────────────────────────────────────────────────*/
#include "libc/dce.h"
#include "libc/intrin/pshufd.h"
#include "libc/intrin/pshufhw.h"
#include "libc/intrin/pshuflw.h"
#include "libc/intrin/pshufw.h"
#include "libc/intrin/shufpd.h"
#include "libc/intrin/shufps.h"
#include "libc/macros.h"
#include "libc/math.h"
#include "libc/str/str.h"
#include "tool/build/lib/endian.h"
#include "tool/build/lib/flags.h"
#include "tool/build/lib/modrm.h"
#include "tool/build/lib/ssefloat.h"
#include "tool/build/lib/throw.h"
#define SSE_BUILTINS \
(!IsModeDbg() && __SSE3__ + 0 && \
(__GNUC__ + 0) * 100 + (__GNUC_MINOR__ + 0) >= 408)
typedef int int_v _Vector_size(16) aligned(16);
typedef long long_v _Vector_size(16) aligned(16);
static float_v Addps(struct Machine *m, float_v x, float_v y) {
return x + y;
}
static double_v Addpd(struct Machine *m, double_v x, double_v y) {
return x + y;
}
static float_v Mulps(struct Machine *m, float_v x, float_v y) {
return x * y;
}
static double_v Mulpd(struct Machine *m, double_v x, double_v y) {
return x * y;
}
static float_v Subps(struct Machine *m, float_v x, float_v y) {
return x - y;
}
static double_v Subpd(struct Machine *m, double_v x, double_v y) {
return x - y;
}
static float_v Divps(struct Machine *m, float_v x, float_v y) {
return x / y;
}
static double_v Divpd(struct Machine *m, double_v x, double_v y) {
return x / y;
}
static float_v Andps(struct Machine *m, float_v x, float_v y) {
return (float_v)((int_v)x & (int_v)y);
}
static double_v Andpd(struct Machine *m, double_v x, double_v y) {
return (double_v)((long_v)x & (long_v)y);
}
static float_v Andnps(struct Machine *m, float_v x, float_v y) {
return (float_v)(~(int_v)x & (int_v)y);
}
static double_v Andnpd(struct Machine *m, double_v x, double_v y) {
return (double_v)(~(long_v)x & (long_v)y);
}
static float_v Orps(struct Machine *m, float_v x, float_v y) {
return (float_v)((int_v)x | (int_v)y);
}
static double_v Orpd(struct Machine *m, double_v x, double_v y) {
return (double_v)((long_v)x | (long_v)y);
}
static float_v Xorps(struct Machine *m, float_v x, float_v y) {
return (float_v)((int_v)x ^ (int_v)y);
}
static double_v Xorpd(struct Machine *m, double_v x, double_v y) {
return (double_v)((long_v)x ^ (long_v)y);
}
static float_v Minps(struct Machine *m, float_v x, float_v y) {
#if SSE_BUILTINS
return __builtin_ia32_minps(x, y);
#else
unsigned i;
for (i = 0; i < 4; ++i) {
x[i] = MIN(x[i], y[i]);
}
return x;
#endif
}
static double_v Minpd(struct Machine *m, double_v x, double_v y) {
#if SSE_BUILTINS
return __builtin_ia32_minpd(x, y);
#else
unsigned i;
for (i = 0; i < 2; ++i) {
x[i] = MIN(x[i], y[i]);
}
return x;
#endif
}
static float_v Maxps(struct Machine *m, float_v x, float_v y) {
#if SSE_BUILTINS
return __builtin_ia32_maxps(x, y);
#else
unsigned i;
for (i = 0; i < 4; ++i) {
x[i] = MAX(x[i], y[i]);
}
return x;
#endif
}
static double_v Maxpd(struct Machine *m, double_v x, double_v y) {
#if SSE_BUILTINS
return __builtin_ia32_maxpd(x, y);
#else
unsigned i;
for (i = 0; i < 2; ++i) {
x[i] = MAX(x[i], y[i]);
}
return x;
#endif
}
static double_v Haddpd(struct Machine *m, double_v x, double_v y) {
#if SSE_BUILTINS
return __builtin_ia32_haddpd(x, y);
#else
return (double_v){x[0] + x[1], y[0] + y[1]};
#endif
}
static float_v Haddps(struct Machine *m, float_v x, float_v y) {
#if SSE_BUILTINS
return __builtin_ia32_haddps(x, y);
#else
return (float_v){x[0] + x[1], x[2] + x[3], y[0] + y[1], y[2] + y[3]};
#endif
}
static double_v Hsubpd(struct Machine *m, double_v x, double_v y) {
#if SSE_BUILTINS
return __builtin_ia32_hsubpd(x, y);
#else
return (double_v){x[0] - x[1], y[0] - y[1]};
#endif
}
static float_v Hsubps(struct Machine *m, float_v x, float_v y) {
#if SSE_BUILTINS
return __builtin_ia32_hsubps(x, y);
#else
return (float_v){x[0] - x[1], x[2] - x[3], y[0] - y[1], y[2] - y[3]};
#endif
}
static double_v Addsubpd(struct Machine *m, double_v x, double_v y) {
#if SSE_BUILTINS
return __builtin_ia32_addsubpd(x, y);
#else
return (double_v){x[0] - y[0], x[1] + y[1]};
#endif
}
static float_v Addsubps(struct Machine *m, float_v x, float_v y) {
#if SSE_BUILTINS
return __builtin_ia32_addsubps(x, y);
#else
return (float_v){x[0] - y[0], x[1] + y[1], x[2] - y[2], x[3] + y[3]};
#endif
}
void OpUnpcklpsd(struct Machine *m, uint32_t rde) {
uint8_t *a, *b;
a = XmmRexrReg(m, rde);
b = GetModrmRegisterXmmPointerRead8(m, rde);
if (Osz(rde)) {
memcpy(a + 8, b, 8);
} else {
memcpy(a + 4 * 3, b + 4, 4);
memcpy(a + 4 * 2, a + 4, 4);
memcpy(a + 4 * 1, b + 0, 4);
}
}
void OpUnpckhpsd(struct Machine *m, uint32_t rde) {
uint8_t *a, *b;
a = XmmRexrReg(m, rde);
b = GetModrmRegisterXmmPointerRead16(m, rde);
if (Osz(rde)) {
memcpy(a + 0, b + 8, 8);
memcpy(a + 8, b + 8, 8);
} else {
memcpy(a + 4 * 0, a + 4 * 2, 4);
memcpy(a + 4 * 1, b + 4 * 2, 4);
memcpy(a + 4 * 2, a + 4 * 3, 4);
memcpy(a + 4 * 3, b + 4 * 3, 4);
}
}
void OpPextrwGdqpUdqIb(struct Machine *m, uint32_t rde) {
uint8_t i;
i = m->xedd->op.uimm0;
i &= Osz(rde) ? 7 : 3;
Write16(RegRexrReg(m, rde), Read16(XmmRexbRm(m, rde) + i * 2));
}
void OpPinsrwVdqEwIb(struct Machine *m, uint32_t rde) {
uint8_t i;
i = m->xedd->op.uimm0;
i &= Osz(rde) ? 7 : 3;
Write16(XmmRexrReg(m, rde) + i * 2,
Read16(GetModrmRegisterWordPointerRead2(m, rde)));
}
void OpShuffle(struct Machine *m, uint32_t rde) {
int16_t q16[4];
int16_t x16[8];
int32_t x32[4];
switch (Rep(rde) | Osz(rde)) {
case 0:
memcpy(q16, GetModrmRegisterXmmPointerRead8(m, rde), 8);
(pshufw)(q16, q16, m->xedd->op.uimm0);
memcpy(XmmRexrReg(m, rde), q16, 8);
break;
case 1:
memcpy(x32, GetModrmRegisterXmmPointerRead16(m, rde), 16);
(pshufd)(x32, x32, m->xedd->op.uimm0);
memcpy(XmmRexrReg(m, rde), x32, 16);
break;
case 2:
memcpy(x16, GetModrmRegisterXmmPointerRead16(m, rde), 16);
(pshuflw)(x16, x16, m->xedd->op.uimm0);
memcpy(XmmRexrReg(m, rde), x16, 16);
break;
case 3:
memcpy(x16, GetModrmRegisterXmmPointerRead16(m, rde), 16);
(pshufhw)(x16, x16, m->xedd->op.uimm0);
memcpy(XmmRexrReg(m, rde), x16, 16);
break;
default:
unreachable;
}
}
static void Shufps(struct Machine *m, uint32_t rde) {
shufps((void *)XmmRexrReg(m, rde), (void *)XmmRexrReg(m, rde),
(void *)GetModrmRegisterXmmPointerRead16(m, rde), m->xedd->op.uimm0);
}
static void Shufpd(struct Machine *m, uint32_t rde) {
shufpd((void *)XmmRexrReg(m, rde), (void *)XmmRexrReg(m, rde),
(void *)GetModrmRegisterXmmPointerRead16(m, rde), m->xedd->op.uimm0);
}
void OpShufpsd(struct Machine *m, uint32_t rde) {
if (Osz(rde)) {
Shufpd(m, rde);
} else {
Shufps(m, rde);
}
}
void OpSqrtpsd(struct Machine *m, uint32_t rde) {
long i;
float_v xf;
double_v xd;
switch (Rep(rde) | Osz(rde)) {
case 0:
memcpy(&xf, GetModrmRegisterXmmPointerRead16(m, rde), 16);
for (i = 0; i < 4; ++i) xf[i] = sqrtf(xf[i]);
memcpy(XmmRexrReg(m, rde), &xf, 16);
break;
case 1:
memcpy(&xd, GetModrmRegisterXmmPointerRead16(m, rde), 16);
for (i = 0; i < 2; ++i) xd[i] = sqrt(xd[i]);
memcpy(XmmRexrReg(m, rde), &xd, 16);
break;
case 2:
memcpy(&xd, GetModrmRegisterXmmPointerRead8(m, rde), 8);
xd[0] = sqrt(xd[0]);
memcpy(XmmRexrReg(m, rde), &xd, 8);
break;
case 3:
memcpy(&xf, GetModrmRegisterXmmPointerRead4(m, rde), 4);
xf[0] = sqrtf(xf[0]);
memcpy(XmmRexrReg(m, rde), &xf, 4);
break;
default:
unreachable;
}
}
void OpRsqrtps(struct Machine *m, uint32_t rde) {
float_v x;
unsigned i;
if (Rep(rde) != 3) {
memcpy(&x, GetModrmRegisterXmmPointerRead16(m, rde), 16);
for (i = 0; i < 4; ++i) x[i] = 1.f / sqrtf(x[i]);
memcpy(XmmRexrReg(m, rde), &x, 16);
} else {
memcpy(&x, GetModrmRegisterXmmPointerRead4(m, rde), 4);
x[0] = 1.f / sqrtf(x[0]);
memcpy(XmmRexrReg(m, rde), &x, 4);
}
}
void OpRcpps(struct Machine *m, uint32_t rde) {
float_v x;
unsigned i;
if (Rep(rde) != 3) {
memcpy(&x, GetModrmRegisterXmmPointerRead16(m, rde), 16);
for (i = 0; i < 4; ++i) x[i] = 1.f / x[i];
memcpy(XmmRexrReg(m, rde), &x, 16);
} else {
memcpy(&x, GetModrmRegisterXmmPointerRead4(m, rde), 4);
x[0] = 1.f / x[0];
memcpy(XmmRexrReg(m, rde), &x, 4);
}
}
static void VpsdWpsd(struct Machine *m, uint32_t rde,
float_v opf(struct Machine *, float_v, float_v),
double_v opd(struct Machine *, double_v, double_v),
bool isfloat, bool isdouble) {
float_v xf, yf;
double_v xd, yd;
if (isfloat) {
memcpy(&xf, XmmRexrReg(m, rde), 16);
memcpy(&yf, GetModrmRegisterXmmPointerRead16(m, rde), 16);
xf = opf(m, xf, yf);
memcpy(XmmRexrReg(m, rde), &xf, 16);
} else if (isdouble) {
memcpy(&xd, XmmRexrReg(m, rde), 16);
memcpy(&yd, GetModrmRegisterXmmPointerRead16(m, rde), 16);
xd = opd(m, xd, yd);
memcpy(XmmRexrReg(m, rde), &xd, 16);
} else {
OpUd(m, rde);
}
}
static void VpsdWpsd66(struct Machine *m, uint32_t rde,
float_v opf(struct Machine *, float_v, float_v),
double_v opd(struct Machine *, double_v, double_v)) {
VpsdWpsd(m, rde, opf, opd, !Osz(rde), Osz(rde));
}
static void VpsdWpsd66f2(struct Machine *m, uint32_t rde,
float_v opf(struct Machine *, float_v, float_v),
double_v opd(struct Machine *, double_v, double_v)) {
VpsdWpsd(m, rde, opf, opd, Rep(rde) == 2, Osz(rde));
}
static void VspsdWspsd(struct Machine *m, uint32_t rde,
float_v opf(struct Machine *, float_v, float_v),
double_v opd(struct Machine *, double_v, double_v)) {
float_v xf, yf;
double_v xd, yd;
switch (Rep(rde) | Osz(rde)) {
case 0:
memcpy(&yf, GetModrmRegisterXmmPointerRead16(m, rde), 16);
memcpy(&xf, XmmRexrReg(m, rde), 16);
xf = opf(m, xf, yf);
memcpy(XmmRexrReg(m, rde), &xf, 16);
break;
case 1:
memcpy(&yd, GetModrmRegisterXmmPointerRead16(m, rde), 16);
memcpy(&xd, XmmRexrReg(m, rde), 16);
xd = opd(m, xd, yd);
memcpy(XmmRexrReg(m, rde), &xd, 16);
break;
case 2:
memcpy(&yd, GetModrmRegisterXmmPointerRead8(m, rde), 8);
memcpy(&xd, XmmRexrReg(m, rde), 8);
xd = opd(m, xd, yd);
memcpy(XmmRexrReg(m, rde), &xd, 8);
break;
case 3:
memcpy(&yf, GetModrmRegisterXmmPointerRead4(m, rde), 4);
memcpy(&xf, XmmRexrReg(m, rde), 4);
xf = opf(m, xf, yf);
memcpy(XmmRexrReg(m, rde), &xf, 4);
break;
default:
unreachable;
}
}
void OpComissVsWs(struct Machine *m, uint32_t rde) {
float xf, yf;
double xd, yd;
uint8_t zf, cf, pf, ie;
if (!Osz(rde)) {
memcpy(&xf, XmmRexrReg(m, rde), 4);
memcpy(&yf, GetModrmRegisterXmmPointerRead4(m, rde), 4);
if (!isnan(xf) && !isnan(yf)) {
zf = xf == yf;
cf = xf < yf;
pf = false;
ie = false;
} else {
zf = cf = pf = ie = true;
}
} else {
memcpy(&xd, XmmRexrReg(m, rde), 8);
memcpy(&yd, GetModrmRegisterXmmPointerRead8(m, rde), 8);
if (!isnan(xd) && !isnan(yd)) {
zf = xd == yd;
cf = xd < yd;
pf = false;
ie = false;
} else {
zf = cf = pf = ie = true;
}
}
m->flags = SetFlag(m->flags, FLAGS_ZF, zf);
m->flags = SetFlag(m->flags, FLAGS_PF, pf);
m->flags = SetFlag(m->flags, FLAGS_CF, cf);
m->flags = SetFlag(m->flags, FLAGS_SF, false);
m->flags = SetFlag(m->flags, FLAGS_OF, false);
if ((m->xedd->op.opcode & 1) && (m->sse.ie = ie) && !m->sse.im) {
HaltMachine(m, kMachineSimdException);
}
}
static float_v Cmpps(struct Machine *m, float_v x, float_v y) {
long i;
switch (m->xedd->op.uimm0) {
case 0:
return x == y;
case 1:
return x < y;
case 2:
return x <= y;
case 3:
for (i = 0; i < 4; ++i) {
x[i] = isnan(x[i]) || isnan(y[i]);
}
return x;
case 4:
return x != y;
case 5:
return x >= y;
case 6:
return x > y;
case 7:
for (i = 0; i < 4; ++i) {
x[i] = !(isnan(x[i]) || isnan(y[i]));
}
return x;
default:
OpUd(m, 0);
}
}
static double_v Cmppd(struct Machine *m, double_v x, double_v y) {
long i;
switch (m->xedd->op.uimm0) {
case 0:
return x == y;
case 1:
return x < y;
case 2:
return x <= y;
case 3:
for (i = 0; i < 2; ++i) {
x[i] = isnan(x[i]) || isnan(y[i]);
}
return x;
case 4:
return x != y;
case 5:
return x >= y;
case 6:
return x > y;
case 7:
for (i = 0; i < 2; ++i) {
x[i] = !(isnan(x[i]) || isnan(y[i]));
}
return x;
default:
OpUd(m, 0);
}
}
void OpAddpsd(struct Machine *m, uint32_t rde) {
VspsdWspsd(m, rde, Addps, Addpd);
}
void OpMulpsd(struct Machine *m, uint32_t rde) {
VspsdWspsd(m, rde, Mulps, Mulpd);
}
void OpSubpsd(struct Machine *m, uint32_t rde) {
VspsdWspsd(m, rde, Subps, Subpd);
}
void OpDivpsd(struct Machine *m, uint32_t rde) {
VspsdWspsd(m, rde, Divps, Divpd);
}
void OpMinpsd(struct Machine *m, uint32_t rde) {
VspsdWspsd(m, rde, Minps, Minpd);
}
void OpMaxpsd(struct Machine *m, uint32_t rde) {
VspsdWspsd(m, rde, Maxps, Maxpd);
}
void OpCmppsd(struct Machine *m, uint32_t rde) {
VspsdWspsd(m, rde, Cmpps, Cmppd);
}
void OpAndpsd(struct Machine *m, uint32_t rde) {
VpsdWpsd66(m, rde, Andps, Andpd);
}
void OpAndnpsd(struct Machine *m, uint32_t rde) {
VpsdWpsd66(m, rde, Andnps, Andnpd);
}
void OpOrpsd(struct Machine *m, uint32_t rde) {
VpsdWpsd66(m, rde, Orps, Orpd);
}
void OpXorpsd(struct Machine *m, uint32_t rde) {
VpsdWpsd66(m, rde, Xorps, Xorpd);
}
void OpHaddpsd(struct Machine *m, uint32_t rde) {
VpsdWpsd66f2(m, rde, Haddps, Haddpd);
}
void OpHsubpsd(struct Machine *m, uint32_t rde) {
VpsdWpsd66f2(m, rde, Hsubps, Hsubpd);
}
void OpAddsubpsd(struct Machine *m, uint32_t rde) {
VpsdWpsd66f2(m, rde, Addsubps, Addsubpd);
}