diff --git a/dsp/scale/BUILD.mk b/dsp/scale/BUILD.mk index bd4f6df7e..80397cd97 100644 --- a/dsp/scale/BUILD.mk +++ b/dsp/scale/BUILD.mk @@ -45,6 +45,12 @@ $(DSP_SCALE_A).pkg: \ $(DSP_SCALE_A_OBJS) \ $(foreach x,$(DSP_SCALE_A_DIRECTDEPS),$($(x)_A).pkg) +ifeq ($(ARCH),x86_64) +o/$(MODE)/dsp/scale/cdecimate2xuint8x8.o: private \ + CFLAGS += \ + -mssse3 +endif + o/$(MODE)/dsp/scale/cdecimate2xuint8x8.o \ o/$(MODE)/dsp/scale/gyarados.o \ o/$(MODE)/dsp/scale/magikarp.o \ diff --git a/libc/intrin/pandn.c b/libc/intrin/pandn.c deleted file mode 100644 index 10d91c52b..000000000 --- a/libc/intrin/pandn.c +++ /dev/null @@ -1,34 +0,0 @@ -/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│ -│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi │ -╞══════════════════════════════════════════════════════════════════════════════╡ -│ Copyright 2020 Justine Alexandra Roberts Tunney │ -│ │ -│ Permission to use, copy, modify, and/or distribute this software for │ -│ any purpose with or without fee is hereby granted, provided that the │ -│ above copyright notice and this permission notice appear in all copies. │ -│ │ -│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │ -│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │ -│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │ -│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │ -│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │ -│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │ -│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │ -│ PERFORMANCE OF THIS SOFTWARE. │ -╚─────────────────────────────────────────────────────────────────────────────*/ -#include "libc/intrin/pandn.h" - -/** - * Nands 128-bit integers. 
- * - * @param 𝑎 [w/o] receives result - * @param 𝑏 [r/o] supplies first input vector - * @param 𝑐 [r/o] supplies second input vector - * @mayalias - */ -void(pandn)(uint64_t a[2], const uint64_t b[2], const uint64_t c[2]) { - unsigned i; - for (i = 0; i < 2; ++i) { - a[i] = ~b[i] & c[i]; - } -} diff --git a/libc/intrin/pandn.h b/libc/intrin/pandn.h deleted file mode 100644 index bb4687614..000000000 --- a/libc/intrin/pandn.h +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef COSMOPOLITAN_LIBC_INTRIN_PANDN_H_ -#define COSMOPOLITAN_LIBC_INTRIN_PANDN_H_ -#include "libc/intrin/macros.h" -COSMOPOLITAN_C_START_ - -void pandn(uint64_t[2], const uint64_t[2], const uint64_t[2]); - -#define pandn(A, B, C) \ - INTRIN_SSEVEX_X_X_X_(pandn, SSE2, "pandn", INTRIN_NONCOMMUTATIVE, A, B, C) - -COSMOPOLITAN_C_END_ -#endif /* COSMOPOLITAN_LIBC_INTRIN_PANDN_H_ */ diff --git a/libc/intrin/pcmpgtb.c b/libc/intrin/pcmpgtb.c deleted file mode 100644 index f1c895d72..000000000 --- a/libc/intrin/pcmpgtb.c +++ /dev/null @@ -1,38 +0,0 @@ -/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│ -│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi │ -╞══════════════════════════════════════════════════════════════════════════════╡ -│ Copyright 2020 Justine Alexandra Roberts Tunney │ -│ │ -│ Permission to use, copy, modify, and/or distribute this software for │ -│ any purpose with or without fee is hereby granted, provided that the │ -│ above copyright notice and this permission notice appear in all copies. │ -│ │ -│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │ -│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │ -│ WARRANTIES OF MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE │ -│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │ -│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │ -│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │ -│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │ -│ PERFORMANCE OF THIS SOFTWARE. │ -╚─────────────────────────────────────────────────────────────────────────────*/ -#include "libc/intrin/pcmpgtb.h" -#include "libc/str/str.h" - -/** - * Compares signed 8-bit integers w/ greater than predicate. - * - * Note that operands can be xor'd with 0x80 for unsigned compares. - * - * @param 𝑎 [w/o] receives result - * @param 𝑏 [r/o] supplies first input vector - * @param 𝑐 [r/o] supplies second input vector - * @mayalias - */ -void(pcmpgtb)(int8_t a[16], const int8_t b[16], const int8_t c[16]) { - unsigned i; - int8_t r[16]; - for (i = 0; i < 16; ++i) - r[i] = -(b[i] > c[i]); - __builtin_memcpy(a, r, 16); -} diff --git a/libc/intrin/pcmpgtb.h b/libc/intrin/pcmpgtb.h deleted file mode 100644 index 043cedf4f..000000000 --- a/libc/intrin/pcmpgtb.h +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef COSMOPOLITAN_LIBC_INTRIN_PCMPGTB_H_ -#define COSMOPOLITAN_LIBC_INTRIN_PCMPGTB_H_ -#include "libc/intrin/macros.h" -COSMOPOLITAN_C_START_ - -void pcmpgtb(int8_t[16], const int8_t[16], const int8_t[16]); - -#define pcmpgtb(A, B, C) \ - INTRIN_SSEVEX_X_X_X_(pcmpgtb, SSE2, "pcmpgtb", INTRIN_NONCOMMUTATIVE, A, B, C) - -COSMOPOLITAN_C_END_ -#endif /* COSMOPOLITAN_LIBC_INTRIN_PCMPGTB_H_ */ diff --git a/libc/intrin/pcmpgtw.c b/libc/intrin/pcmpgtw.c deleted file mode 100644 index 7bf94ef49..000000000 --- a/libc/intrin/pcmpgtw.c +++ /dev/null @@ -1,36 +0,0 @@ -/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│ -│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi │ -╞══════════════════════════════════════════════════════════════════════════════╡ -│ Copyright 2020 Justine Alexandra Roberts Tunney │ -│ │ -│ Permission to 
use, copy, modify, and/or distribute this software for │ -│ any purpose with or without fee is hereby granted, provided that the │ -│ above copyright notice and this permission notice appear in all copies. │ -│ │ -│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │ -│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │ -│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │ -│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │ -│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │ -│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │ -│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │ -│ PERFORMANCE OF THIS SOFTWARE. │ -╚─────────────────────────────────────────────────────────────────────────────*/ -#include "libc/intrin/pcmpgtw.h" -#include "libc/str/str.h" - -/** - * Compares signed 16-bit integers w/ greater than predicate. - * - * @param 𝑎 [w/o] receives result - * @param 𝑏 [r/o] supplies first input vector - * @param 𝑐 [r/o] supplies second input vector - * @mayalias - */ -void(pcmpgtw)(int16_t a[8], const int16_t b[8], const int16_t c[8]) { - unsigned i; - int16_t r[8]; - for (i = 0; i < 8; ++i) - r[i] = -(b[i] > c[i]); - __builtin_memcpy(a, r, 16); -} diff --git a/libc/intrin/pcmpgtw.h b/libc/intrin/pcmpgtw.h deleted file mode 100644 index bb9707d19..000000000 --- a/libc/intrin/pcmpgtw.h +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef COSMOPOLITAN_LIBC_INTRIN_PCMPGTW_H_ -#define COSMOPOLITAN_LIBC_INTRIN_PCMPGTW_H_ -#include "libc/intrin/macros.h" -COSMOPOLITAN_C_START_ - -void pcmpgtw(int16_t[8], const int16_t[8], const int16_t[8]); - -#define pcmpgtw(A, B, C) \ - INTRIN_SSEVEX_X_X_X_(pcmpgtw, SSE2, "pcmpgtw", INTRIN_NONCOMMUTATIVE, A, B, C) - -COSMOPOLITAN_C_END_ -#endif /* COSMOPOLITAN_LIBC_INTRIN_PCMPGTW_H_ */ diff --git a/libc/intrin/pmovmskb.c b/libc/intrin/pmovmskb.c deleted file mode 100644 index 0ff024d1d..000000000 --- a/libc/intrin/pmovmskb.c 
+++ /dev/null @@ -1,34 +0,0 @@ -/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│ -│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi │ -╞══════════════════════════════════════════════════════════════════════════════╡ -│ Copyright 2020 Justine Alexandra Roberts Tunney │ -│ │ -│ Permission to use, copy, modify, and/or distribute this software for │ -│ any purpose with or without fee is hereby granted, provided that the │ -│ above copyright notice and this permission notice appear in all copies. │ -│ │ -│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │ -│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │ -│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │ -│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │ -│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │ -│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │ -│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │ -│ PERFORMANCE OF THIS SOFTWARE. │ -╚─────────────────────────────────────────────────────────────────────────────*/ -#include "libc/intrin/pmovmskb.h" - -/** - * Turns result of byte comparison into bitmask. - * - * @param 𝑝 is byte vector to crunch - * @see pcmpeqb(), bsf(), etc. 
- */ -uint32_t(pmovmskb)(const uint8_t p[16]) { - uint32_t i, m; - for (m = i = 0; i < 16; ++i) { - if (p[i] & 0x80) - m |= 1 << i; - } - return m; -} diff --git a/libc/intrin/pmovmskb.h b/libc/intrin/pmovmskb.h deleted file mode 100644 index e17e1fb16..000000000 --- a/libc/intrin/pmovmskb.h +++ /dev/null @@ -1,27 +0,0 @@ -#ifndef COSMOPOLITAN_LIBC_INTRIN_PMOVMSKB_H_ -#define COSMOPOLITAN_LIBC_INTRIN_PMOVMSKB_H_ -#include "libc/intrin/macros.h" -COSMOPOLITAN_C_START_ - -uint32_t pmovmskb(const uint8_t[16]); - -#if defined(__x86_64__) && defined(__GNUC__) -#define pmovmskb(A) \ - ({ \ - uint32_t Mask; \ - if (!IsModeDbg() && X86_HAVE(SSE2)) { \ - const __intrin_xmm_t *Xmm = (const __intrin_xmm_t *)(A); \ - if (!X86_NEED(AVX)) { \ - asm("pmovmskb\t%1,%0" : "=r"(Mask) : "x"(*Xmm)); \ - } else { \ - asm("vpmovmskb\t%1,%0" : "=r"(Mask) : "x"(*Xmm)); \ - } \ - } else { \ - Mask = pmovmskb(A); \ - } \ - Mask; \ - }) -#endif - -COSMOPOLITAN_C_END_ -#endif /* COSMOPOLITAN_LIBC_INTRIN_PMOVMSKB_H_ */ diff --git a/libc/intrin/psraw.h b/libc/intrin/psraw.h index 4814b073c..083bb7445 100644 --- a/libc/intrin/psraw.h +++ b/libc/intrin/psraw.h @@ -4,11 +4,8 @@ COSMOPOLITAN_C_START_ void psraw(int16_t[8], const int16_t[8], unsigned char) libcesque; -void psrawv(int16_t[8], const int16_t[8], const uint64_t[2]) libcesque; #define psraw(A, B, I) INTRIN_SSEVEX_X_I_(psraw, SSE2, "psraw", A, B, I) -#define psrawv(A, B, C) \ - INTRIN_SSEVEX_X_X_X_(psrawv, SSE2, "psraw", INTRIN_NONCOMMUTATIVE, A, B, C) COSMOPOLITAN_C_END_ #endif /* COSMOPOLITAN_LIBC_INTRIN_PSRAW_H_ */ diff --git a/libc/intrin/psrawv.c b/libc/intrin/psrawv.c deleted file mode 100644 index 5409db233..000000000 --- a/libc/intrin/psrawv.c +++ /dev/null @@ -1,34 +0,0 @@ -/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│ -│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi │ -╞══════════════════════════════════════════════════════════════════════════════╡ -│ Copyright 2020 Justine Alexandra Roberts 
Tunney │ -│ │ -│ Permission to use, copy, modify, and/or distribute this software for │ -│ any purpose with or without fee is hereby granted, provided that the │ -│ above copyright notice and this permission notice appear in all copies. │ -│ │ -│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │ -│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │ -│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │ -│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │ -│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │ -│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │ -│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │ -│ PERFORMANCE OF THIS SOFTWARE. │ -╚─────────────────────────────────────────────────────────────────────────────*/ -#include "libc/intrin/psraw.h" - -/** - * Divides shorts by two power. - * - * @note arithmetic shift right will sign extend negatives - * @mayalias - */ -void(psrawv)(int16_t a[8], const int16_t b[8], const uint64_t c[2]) { - unsigned i; - unsigned char k; - k = c[0] > 15 ? 15 : c[0]; - for (i = 0; i < 8; ++i) { - a[i] = b[i] >> k; - } -} diff --git a/libc/intrin/punpckhbw.c b/libc/intrin/punpckhbw.c deleted file mode 100644 index 151530c77..000000000 --- a/libc/intrin/punpckhbw.c +++ /dev/null @@ -1,46 +0,0 @@ -/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│ -│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi │ -╞══════════════════════════════════════════════════════════════════════════════╡ -│ Copyright 2020 Justine Alexandra Roberts Tunney │ -│ │ -│ Permission to use, copy, modify, and/or distribute this software for │ -│ any purpose with or without fee is hereby granted, provided that the │ -│ above copyright notice and this permission notice appear in all copies. 
│ -│ │ -│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │ -│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │ -│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │ -│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │ -│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │ -│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │ -│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │ -│ PERFORMANCE OF THIS SOFTWARE. │ -╚─────────────────────────────────────────────────────────────────────────────*/ -#include "libc/intrin/punpckhbw.h" - -/** - * Interleaves high bytes. - * - * @param 𝑎 [w/o] receives reduced 𝑏 and 𝑐 interleaved - * @param 𝑏 [r/o] supplies eight words - * @param 𝑐 [r/o] supplies eight words - * @mayalias - */ -void(punpckhbw)(uint8_t a[16], const uint8_t b[16], const uint8_t c[16]) { - a[0x0] = b[0x8]; - a[0x1] = c[0x8]; - a[0x2] = b[0x9]; - a[0x3] = c[0x9]; - a[0x4] = b[0xa]; - a[0x5] = c[0xa]; - a[0x6] = b[0xb]; - a[0x7] = c[0xb]; - a[0x8] = b[0xc]; - a[0x9] = c[0xc]; - a[0xa] = b[0xd]; - a[0xb] = c[0xd]; - a[0xc] = b[0xe]; - a[0xd] = c[0xe]; - a[0xe] = b[0xf]; - a[0xf] = c[0xf]; -} diff --git a/libc/intrin/punpckhbw.h b/libc/intrin/punpckhbw.h deleted file mode 100644 index 306cb1597..000000000 --- a/libc/intrin/punpckhbw.h +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef COSMOPOLITAN_LIBC_INTRIN_PUNPCKHBW_H_ -#define COSMOPOLITAN_LIBC_INTRIN_PUNPCKHBW_H_ -#include "libc/intrin/macros.h" -COSMOPOLITAN_C_START_ - -void punpckhbw(uint8_t[16], const uint8_t[16], const uint8_t[16]); - -#define punpckhbw(A, B, C) \ - INTRIN_SSEVEX_X_X_X_(punpckhbw, SSE2, "punpckhbw", INTRIN_NONCOMMUTATIVE, A, \ - B, C) - -COSMOPOLITAN_C_END_ -#endif /* COSMOPOLITAN_LIBC_INTRIN_PUNPCKHBW_H_ */ diff --git a/libc/intrin/punpckhwd.c b/libc/intrin/punpckhwd.c deleted file mode 100644 index 5aad8b10b..000000000 --- a/libc/intrin/punpckhwd.c +++ /dev/null @@ -1,49 +0,0 @@ 
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│ -│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi │ -╞══════════════════════════════════════════════════════════════════════════════╡ -│ Copyright 2020 Justine Alexandra Roberts Tunney │ -│ │ -│ Permission to use, copy, modify, and/or distribute this software for │ -│ any purpose with or without fee is hereby granted, provided that the │ -│ above copyright notice and this permission notice appear in all copies. │ -│ │ -│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │ -│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │ -│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │ -│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │ -│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │ -│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │ -│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │ -│ PERFORMANCE OF THIS SOFTWARE. │ -╚─────────────────────────────────────────────────────────────────────────────*/ -#include "libc/intrin/punpckhwd.h" -#include "libc/str/str.h" - -/** - * Interleaves high words. 
- * - * 0 1 2 3 4 5 6 7 - * B aa bb cc dd EE FF GG HH - * C ii jj kk ll MM NN OO PP - * └┤ └┤ └┤ └┤ - * ┌────────┘ │ │ │ - * │ ┌─────┘ │ │ - * │ │ ┌──┘ │ - * ┌───┤ ┌───┤ ┌───┤ ┌───┤ - * → A EE MM FF NN GG OO HH PP - * - * @param 𝑎 [w/o] receives reduced 𝑏 and 𝑐 interleaved - * @param 𝑏 [r/o] supplies eight words - * @param 𝑐 [r/o] supplies eight words - * @mayalias - */ -void(punpckhwd)(uint16_t a[8], const uint16_t b[8], const uint16_t c[8]) { - a[0] = b[4]; - a[1] = c[4]; - a[2] = b[5]; - a[3] = c[5]; - a[4] = b[6]; - a[5] = c[6]; - a[6] = b[7]; - a[7] = c[7]; -} diff --git a/libc/intrin/punpckhwd.h b/libc/intrin/punpckhwd.h deleted file mode 100644 index 548e6ee92..000000000 --- a/libc/intrin/punpckhwd.h +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef COSMOPOLITAN_LIBC_INTRIN_PUNPCKHWD_H_ -#define COSMOPOLITAN_LIBC_INTRIN_PUNPCKHWD_H_ -#include "libc/intrin/macros.h" -COSMOPOLITAN_C_START_ - -void punpckhwd(uint16_t[8], const uint16_t[8], const uint16_t[8]); - -#define punpckhwd(A, B, C) \ - INTRIN_SSEVEX_X_X_X_(punpckhwd, SSE2, "punpckhwd", INTRIN_NONCOMMUTATIVE, A, \ - B, C) - -COSMOPOLITAN_C_END_ -#endif /* COSMOPOLITAN_LIBC_INTRIN_PUNPCKHWD_H_ */ diff --git a/libc/intrin/punpcklbw.c b/libc/intrin/punpcklbw.c deleted file mode 100644 index 559d8a553..000000000 --- a/libc/intrin/punpcklbw.c +++ /dev/null @@ -1,56 +0,0 @@ -/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│ -│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi │ -╞══════════════════════════════════════════════════════════════════════════════╡ -│ Copyright 2020 Justine Alexandra Roberts Tunney │ -│ │ -│ Permission to use, copy, modify, and/or distribute this software for │ -│ any purpose with or without fee is hereby granted, provided that the │ -│ above copyright notice and this permission notice appear in all copies. 
│ -│ │ -│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │ -│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │ -│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │ -│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │ -│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │ -│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │ -│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │ -│ PERFORMANCE OF THIS SOFTWARE. │ -╚─────────────────────────────────────────────────────────────────────────────*/ -#include "libc/intrin/punpcklbw.h" - -/** - * Interleaves low bytes. - * - * 0 1 2 3 4 5 6 7 8 9 A B C D E F - * B A B C D E F G H i j k l m n o p - * C Q R S T U V W X y z α σ π μ τ ε - * │ │ │ │ │ │ │ │ - * │ │ │ └─────┐ - * │ │ └───┐ │ etc... - * │ └─┐ │ │ - * ├─┐ ├─┐ ├─┐ ├─┐ - * → A A Q B R C S D T E U F V G W H X - * - * @param 𝑎 [w/o] receives reduced 𝑏 and 𝑐 interleaved - * @param 𝑏 [r/o] supplies eight words - * @param 𝑐 [r/o] supplies eight words - * @mayalias - */ -void(punpcklbw)(uint8_t a[16], const uint8_t b[16], const uint8_t c[16]) { - a[0xf] = c[7]; - a[0xe] = b[7]; - a[0xd] = c[6]; - a[0xc] = b[6]; - a[0xb] = c[5]; - a[0xa] = b[5]; - a[0x9] = c[4]; - a[0x8] = b[4]; - a[0x7] = c[3]; - a[0x6] = b[3]; - a[0x5] = c[2]; - a[0x4] = b[2]; - a[0x3] = c[1]; - a[0x2] = b[1]; - a[0x1] = c[0]; - a[0x0] = b[0]; -} diff --git a/libc/intrin/punpcklbw.h b/libc/intrin/punpcklbw.h deleted file mode 100644 index 40c9cef89..000000000 --- a/libc/intrin/punpcklbw.h +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef COSMOPOLITAN_LIBC_INTRIN_PUNPCKLBW_H_ -#define COSMOPOLITAN_LIBC_INTRIN_PUNPCKLBW_H_ -#include "libc/intrin/macros.h" -COSMOPOLITAN_C_START_ - -void punpcklbw(uint8_t[16], const uint8_t[16], const uint8_t[16]); - -#define punpcklbw(A, B, C) \ - INTRIN_SSEVEX_X_X_X_(punpcklbw, SSE2, "punpcklbw", INTRIN_NONCOMMUTATIVE, A, \ - B, C) - -COSMOPOLITAN_C_END_ -#endif /* 
COSMOPOLITAN_LIBC_INTRIN_PUNPCKLBW_H_ */ diff --git a/libc/intrin/punpcklwd.c b/libc/intrin/punpcklwd.c deleted file mode 100644 index 11936c456..000000000 --- a/libc/intrin/punpcklwd.c +++ /dev/null @@ -1,48 +0,0 @@ -/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│ -│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi │ -╞══════════════════════════════════════════════════════════════════════════════╡ -│ Copyright 2020 Justine Alexandra Roberts Tunney │ -│ │ -│ Permission to use, copy, modify, and/or distribute this software for │ -│ any purpose with or without fee is hereby granted, provided that the │ -│ above copyright notice and this permission notice appear in all copies. │ -│ │ -│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │ -│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │ -│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │ -│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │ -│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │ -│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │ -│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │ -│ PERFORMANCE OF THIS SOFTWARE. │ -╚─────────────────────────────────────────────────────────────────────────────*/ -#include "libc/intrin/punpcklwd.h" - -/** - * Interleaves low words. 
- * - * 0 1 2 3 4 5 6 7 - * B AA BB CC DD ee ff gg hh - * C II JJ KK LL mm nn oo pp - * ├┘ ├┘ ├┘ ├┘ - * │ │ │ └────────┐ - * │ │ └─────┐ │ - * │ └──┐ │ │ - * ├───┐ ├───┐ ├───┐ ├───┐ - * → A AA II BB JJ CC KK DD LL - * - * @param 𝑎 [w/o] receives reduced 𝑏 and 𝑐 interleaved - * @param 𝑏 [r/o] supplies eight words - * @param 𝑐 [r/o] supplies eight words - * @mayalias - */ -void(punpcklwd)(uint16_t a[8], const uint16_t b[8], const uint16_t c[8]) { - a[7] = c[3]; - a[6] = b[3]; - a[5] = c[2]; - a[4] = b[2]; - a[3] = c[1]; - a[2] = b[1]; - a[1] = c[0]; - a[0] = b[0]; -} diff --git a/libc/intrin/punpcklwd.h b/libc/intrin/punpcklwd.h deleted file mode 100644 index e286ba9c2..000000000 --- a/libc/intrin/punpcklwd.h +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef COSMOPOLITAN_LIBC_INTRIN_PUNPCKLWD_H_ -#define COSMOPOLITAN_LIBC_INTRIN_PUNPCKLWD_H_ -#include "libc/intrin/macros.h" -COSMOPOLITAN_C_START_ - -void punpcklwd(uint16_t[8], const uint16_t[8], const uint16_t[8]); - -#define punpcklwd(A, B, C) \ - INTRIN_SSEVEX_X_X_X_(punpcklwd, SSE2, "punpcklwd", INTRIN_NONCOMMUTATIVE, A, \ - B, C) - -COSMOPOLITAN_C_END_ -#endif /* COSMOPOLITAN_LIBC_INTRIN_PUNPCKLWD_H_ */ diff --git a/libc/str/strnwidth.c b/libc/str/strnwidth.c index 761994e80..b67436e57 100644 --- a/libc/str/strnwidth.c +++ b/libc/str/strnwidth.c @@ -17,8 +17,6 @@ │ PERFORMANCE OF THIS SOFTWARE. 
│ ╚─────────────────────────────────────────────────────────────────────────────*/ #include "libc/intrin/bsf.h" -#include "libc/intrin/pcmpgtb.h" -#include "libc/intrin/pmovmskb.h" #include "libc/macros.h" #include "libc/str/str.h" #include "libc/str/thompike.h" diff --git a/libc/str/tprecode16to8.c b/libc/str/tprecode16to8.c index 9bea83682..d23eb0b5d 100644 --- a/libc/str/tprecode16to8.c +++ b/libc/str/tprecode16to8.c @@ -18,35 +18,55 @@ ╚─────────────────────────────────────────────────────────────────────────────*/ #include "libc/dce.h" #include "libc/fmt/conv.h" -#include "libc/intrin/packsswb.h" -#include "libc/intrin/pandn.h" -#include "libc/intrin/pcmpgtw.h" -#include "libc/intrin/pmovmskb.h" #include "libc/str/str.h" #include "libc/str/utf16.h" +#include "third_party/aarch64/arm_neon.internal.h" +#include "third_party/intel/emmintrin.internal.h" -static const int16_t kDel16[8] = {127, 127, 127, 127, 127, 127, 127, 127}; +#if !IsModeDbg() +#if defined(__x86_64__) -/* 10x speedup for ascii */ static axdx_t tprecode16to8_sse2(char *dst, size_t dstsize, const char16_t *src, axdx_t r) { - int16_t v1[8], v2[8], v3[8], vz[8]; - memset(vz, 0, 16); + __m128i v1, v2, v3, vz; + vz = _mm_setzero_si128(); while (r.ax + 8 < dstsize) { - memcpy(v1, src + r.dx, 16); - pcmpgtw(v2, v1, vz); - pcmpgtw(v3, v1, kDel16); - pandn((void *)v2, (void *)v3, (void *)v2); - if (pmovmskb((void *)v2) != 0xFFFF) + v1 = _mm_loadu_si128((__m128i *)(src + r.dx)); + v2 = _mm_cmpgt_epi16(v1, vz); + v3 = _mm_cmpgt_epi16(v1, _mm_set1_epi16(0x7F)); + v2 = _mm_andnot_si128(v3, v2); + if (_mm_movemask_epi8(v2) != 0xFFFF) break; - packsswb((void *)v1, v1, v1); - memcpy(dst + r.ax, v1, 8); + v1 = _mm_packs_epi16(v1, v1); + _mm_storel_epi64((__m128i *)(dst + r.ax), v1); r.ax += 8; r.dx += 8; } return r; } +#elif defined(__aarch64__) + +static axdx_t tprecode16to8_neon(char *dst, size_t dstsize, const char16_t *src, + axdx_t r) { + uint16x8_t v1, v2, v3; + while (r.ax + 8 < dstsize) { + v1 = 
vld1q_u16((const uint16_t *)(src + r.dx)); + v2 = vcgtq_u16(v1, vdupq_n_u16(0)); + v3 = vcgtq_u16(v1, vdupq_n_u16(0x7F)); + v2 = vbicq_u16(v2, v3); + if (vminvq_u16(v2) != 0xFFFF) /* all 8 lanes in 1..0x7F? */ + break; + vst1_u8((uint8_t *)(dst + r.ax), vqmovn_u16(v1)); + r.ax += 8; + r.dx += 8; + } + return r; +} + +#endif +#endif + /** * Transcodes UTF-16 to UTF-8. * @@ -66,10 +86,14 @@ axdx_t tprecode16to8(char *dst, size_t dstsize, const char16_t *src) { r.ax = 0; r.dx = 0; for (;;) { -#if defined(__x86_64__) && !IsModeDbg() && !IsTiny() - if (!((uintptr_t)(src + r.dx) & 15)) { +#if !IsModeDbg() +#if defined(__x86_64__) + if (!((uintptr_t)(src + r.dx) & 15)) r = tprecode16to8_sse2(dst, dstsize, src, r); - } +#elif defined(__aarch64__) + if (!((uintptr_t)(src + r.dx) & 15)) + r = tprecode16to8_neon(dst, dstsize, src, r); +#endif #endif if (!(x = src[r.dx++])) break; diff --git a/libc/str/tprecode8to16.c b/libc/str/tprecode8to16.c index d823f3163..2924184f8 100644 --- a/libc/str/tprecode8to16.c +++ b/libc/str/tprecode8to16.c @@ -16,34 +16,61 @@ │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │ │ PERFORMANCE OF THIS SOFTWARE. 
│ ╚─────────────────────────────────────────────────────────────────────────────*/ -#include "libc/intrin/pcmpgtb.h" -#include "libc/intrin/pmovmskb.h" -#include "libc/intrin/punpckhbw.h" -#include "libc/intrin/punpcklbw.h" +#include +#include +#include +#include "libc/dce.h" #include "libc/str/str.h" #include "libc/str/thompike.h" #include "libc/str/utf16.h" +#include "third_party/aarch64/arm_neon.internal.h" +#include "third_party/intel/emmintrin.internal.h" + +#if !IsModeDbg() +#if defined(__x86_64__) -// 34x speedup for ascii static inline axdx_t tprecode8to16_sse2(char16_t *dst, size_t dstsize, const char *src, axdx_t r) { - uint8_t v1[16], v2[16], vz[16]; - memset(vz, 0, 16); + __m128i v1, v2, vz; + vz = _mm_setzero_si128(); while (r.ax + 16 < dstsize) { - memcpy(v1, src + r.dx, 16); - pcmpgtb((int8_t *)v2, (int8_t *)v1, (int8_t *)vz); - if (pmovmskb(v2) != 0xFFFF) + v1 = _mm_loadu_si128((__m128i *)(src + r.dx)); + v2 = _mm_cmpgt_epi8(v1, vz); + if (_mm_movemask_epi8(v2) != 0xFFFF) break; - punpcklbw(v2, v1, vz); - punpckhbw(v1, v1, vz); - memcpy(dst + r.ax + 0, v2, 16); - memcpy(dst + r.ax + 8, v1, 16); + __m128i lo = _mm_unpacklo_epi8(v1, vz); + __m128i hi = _mm_unpackhi_epi8(v1, vz); + _mm_storeu_si128((__m128i *)(dst + r.ax), lo); + _mm_storeu_si128((__m128i *)(dst + r.ax + 8), hi); r.ax += 16; r.dx += 16; } return r; } +#elif defined(__aarch64__) + +static inline axdx_t tprecode8to16_neon(char16_t *dst, size_t dstsize, + const char *src, axdx_t r) { + uint8x16_t v1; + while (r.ax + 16 < dstsize) { + v1 = vld1q_u8((const uint8_t *)(src + r.dx)); + uint8x16_t cmp = vcgtq_s8(vreinterpretq_s8_u8(v1), vdupq_n_s8(0)); + if (vminvq_u8(cmp) != 0xFF) /* all 16 bytes in 1..0x7F? */ + break; + uint16x8_t lo = vmovl_u8(vget_low_u8(v1)); + uint16x8_t hi = vmovl_u8(vget_high_u8(v1)); + vst1q_u16((uint16_t *)(dst + r.ax), lo); + vst1q_u16((uint16_t *)(dst + r.ax + 8), hi); + r.ax += 16; + r.dx += 16; + } + return r; +} + +#endif +#endif + /** * Transcodes UTF-8 to UTF-16. 
* @@ -64,10 +91,14 @@ axdx_t tprecode8to16(char16_t *dst, size_t dstsize, const char *src) { r.ax = 0; r.dx = 0; for (;;) { -#if defined(__x86_64__) && !IsModeDbg() - if (!((uintptr_t)(src + r.dx) & 15)) { +#if !IsModeDbg() +#if defined(__x86_64__) + if (!((uintptr_t)(src + r.dx) & 15)) r = tprecode8to16_sse2(dst, dstsize, src, r); - } +#elif defined(__aarch64__) + if (!((uintptr_t)(src + r.dx) & 15)) + r = tprecode8to16_neon(dst, dstsize, src, r); +#endif #endif x = src[r.dx++] & 0377; if (x >= 0300) { diff --git a/libc/x/utf16to8.c b/libc/x/utf16to8.c index 219c2e2a9..dfcd4dea3 100644 --- a/libc/x/utf16to8.c +++ b/libc/x/utf16to8.c @@ -17,21 +17,13 @@ │ PERFORMANCE OF THIS SOFTWARE. │ ╚─────────────────────────────────────────────────────────────────────────────*/ #include "libc/intrin/bsr.h" -#include "libc/intrin/packsswb.h" -#include "libc/intrin/pandn.h" -#include "libc/intrin/pcmpgtb.h" -#include "libc/intrin/pcmpgtw.h" -#include "libc/intrin/pmovmskb.h" -#include "libc/intrin/punpckhbw.h" -#include "libc/intrin/punpcklbw.h" #include "libc/mem/mem.h" #include "libc/serialize.h" #include "libc/str/str.h" #include "libc/str/thompike.h" #include "libc/str/utf16.h" #include "libc/x/x.h" - -static const int16_t kDel16[8] = {127, 127, 127, 127, 127, 127, 127, 127}; +#include "third_party/intel/emmintrin.internal.h" /** * Transcodes UTF-16 to UTF-8. @@ -45,28 +37,27 @@ char *utf16to8(const char16_t *p, size_t n, size_t *z) { char *r, *q; wint_t x, y; const char16_t *e; - int16_t v1[8], v2[8], v3[8], vz[8]; if (z) *z = 0; if (n == -1) n = p ? 
strlen16(p) : 0; if ((q = r = malloc(n * 4 + 8 + 1))) { for (e = p + n; p < e;) { - if (p + 8 < e) { /* 17x ascii */ - bzero(vz, 16); +#if defined(__x86_64__) + if (p + 8 < e) { do { - memcpy(v1, p, 16); - pcmpgtw(v2, v1, vz); - pcmpgtw(v3, v1, kDel16); - pandn((void *)v2, (void *)v3, (void *)v2); - if (pmovmskb((void *)v2) != 0xFFFF) + __m128i v1 = _mm_loadu_si128((__m128i *)p); + __m128i v2 = _mm_cmpgt_epi16(v1, _mm_setzero_si128()); + __m128i v3 = _mm_cmpgt_epi16(v1, _mm_set1_epi16(127)); + v2 = _mm_andnot_si128(v3, v2); + if (_mm_movemask_epi8(v2) != 0xFFFF) break; - packsswb((void *)v1, v1, v1); - memcpy(q, v1, 8); + _mm_storel_epi64((__m128i *)q, _mm_packs_epi16(v1, v1)); p += 8; q += 8; } while (p + 8 < e); } +#endif x = *p++ & 0xffff; if (!IsUcs2(x)) { if (p < e) { diff --git a/libc/x/utf8to32.c b/libc/x/utf8to32.c index f1a8568cc..15170e1a2 100644 --- a/libc/x/utf8to32.c +++ b/libc/x/utf8to32.c @@ -16,18 +16,12 @@ │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │ │ PERFORMANCE OF THIS SOFTWARE. │ ╚─────────────────────────────────────────────────────────────────────────────*/ -#include "libc/intrin/likely.h" -#include "libc/intrin/pcmpgtb.h" -#include "libc/intrin/pmovmskb.h" -#include "libc/intrin/punpckhbw.h" -#include "libc/intrin/punpckhwd.h" -#include "libc/intrin/punpcklbw.h" -#include "libc/intrin/punpcklwd.h" #include "libc/mem/mem.h" #include "libc/str/str.h" #include "libc/str/thompike.h" #include "libc/str/utf16.h" #include "libc/x/x.h" +#include "third_party/intel/emmintrin.internal.h" /** * Transcodes UTF-8 to UTF-32. @@ -41,35 +35,35 @@ wchar_t *utf8to32(const char *p, size_t n, size_t *z) { unsigned m, j; wint_t x, a, b; wchar_t *r, *q; - uint8_t v1[16], v2[16], v3[16], v4[16], vz[16]; if (z) *z = 0; if (n == -1) n = p ? 
strlen(p) : 0; if ((q = r = malloc(n * sizeof(wchar_t) + sizeof(wchar_t)))) { for (i = 0; i < n;) { +#ifdef __x86_64__ if (!((uintptr_t)(p + i) & 15) && i + 16 < n) { - /* 10x speedup for ascii */ - bzero(vz, 16); do { - memcpy(v1, p + i, 16); - pcmpgtb((int8_t *)v2, (int8_t *)v1, (int8_t *)vz); - if (pmovmskb(v2) != 0xFFFF) + __m128i v1, v2, v3, v4; + v1 = _mm_loadu_si128((__m128i *)(p + i)); + v2 = _mm_cmpgt_epi8(v1, _mm_setzero_si128()); + if (_mm_movemask_epi8(v2) != 0xFFFF) break; - punpcklbw(v3, v1, vz); - punpckhbw(v1, v1, vz); - punpcklwd((void *)v4, (void *)v3, (void *)vz); - punpckhwd((void *)v3, (void *)v3, (void *)vz); - punpcklwd((void *)v2, (void *)v1, (void *)vz); - punpckhwd((void *)v1, (void *)v1, (void *)vz); - memcpy(q + 0, v4, 16); - memcpy(q + 4, v3, 16); - memcpy(q + 8, v2, 16); - memcpy(q + 12, v1, 16); + v3 = _mm_unpacklo_epi8(v1, _mm_setzero_si128()); + v1 = _mm_unpackhi_epi8(v1, _mm_setzero_si128()); + v4 = _mm_unpacklo_epi16(v3, _mm_setzero_si128()); + v3 = _mm_unpackhi_epi16(v3, _mm_setzero_si128()); + v2 = _mm_unpacklo_epi16(v1, _mm_setzero_si128()); + v1 = _mm_unpackhi_epi16(v1, _mm_setzero_si128()); + _mm_storeu_si128((__m128i *)(q + 0), v4); + _mm_storeu_si128((__m128i *)(q + 4), v3); + _mm_storeu_si128((__m128i *)(q + 8), v2); + _mm_storeu_si128((__m128i *)(q + 12), v1); i += 16; q += 16; } while (i + 16 < n); } +#endif x = p[i++] & 0xff; if (x >= 0300) { a = ThomPikeByte(x); diff --git a/net/http/decodelatin1.c b/net/http/decodelatin1.c index 4799d8a9d..ce9aed209 100644 --- a/net/http/decodelatin1.c +++ b/net/http/decodelatin1.c @@ -16,8 +16,6 @@ │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │ │ PERFORMANCE OF THIS SOFTWARE. 
│ ╚─────────────────────────────────────────────────────────────────────────────*/ -#include "libc/intrin/pcmpgtb.h" -#include "libc/intrin/pmovmskb.h" #include "libc/mem/mem.h" #include "libc/str/str.h" #include "net/http/escape.h" @@ -34,23 +32,12 @@ char *DecodeLatin1(const char *p, size_t n, size_t *z) { int c; size_t i; char *r, *q; - int8_t v1[16], v2[16], vz[16]; if (z) *z = 0; if (n == -1) n = p ? strlen(p) : 0; if ((q = r = malloc(n * 2 + 1))) { for (i = 0; i < n;) { - bzero(vz, 16); /* 3x speedup for ASCII */ - while (i + 16 < n) { - memcpy(v1, p + i, 16); - pcmpgtb(v2, v1, vz); - if (pmovmskb((void *)v2) != 0xFFFF) - break; - memcpy(q, v1, 16); - q += 16; - i += 16; - } c = p[i++] & 0xff; if (c < 0200) { *q++ = c; diff --git a/net/http/encodelatin1.c b/net/http/encodelatin1.c index 4d6798ec7..9d3bc0ed8 100644 --- a/net/http/encodelatin1.c +++ b/net/http/encodelatin1.c @@ -17,8 +17,6 @@ │ PERFORMANCE OF THIS SOFTWARE. │ ╚─────────────────────────────────────────────────────────────────────────────*/ #include "libc/errno.h" -#include "libc/intrin/pcmpgtb.h" -#include "libc/intrin/pmovmskb.h" #include "libc/mem/mem.h" #include "libc/stdio/stdio.h" #include "libc/str/str.h" diff --git a/net/http/underlong.c b/net/http/underlong.c index a48e7f48c..1d0906582 100644 --- a/net/http/underlong.c +++ b/net/http/underlong.c @@ -16,8 +16,6 @@ │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │ │ PERFORMANCE OF THIS SOFTWARE. │ ╚─────────────────────────────────────────────────────────────────────────────*/ -#include "libc/intrin/pcmpgtb.h" -#include "libc/intrin/pmovmskb.h" #include "libc/mem/mem.h" #include "libc/str/str.h" #include "libc/str/thompike.h" @@ -40,23 +38,12 @@ char *Underlong(const char *p, size_t n, size_t *z) { char *r, *q; size_t i, j, m; wint_t x, a, b; - int8_t v1[16], v2[16], vz[16]; if (z) *z = 0; if (n == -1) n = p ? 
strlen(p) : 0; if ((q = r = malloc(n * 2 + 1))) { for (i = 0; i < n;) { - bzero(vz, 16); /* 50x speedup for ASCII */ - while (i + 16 < n) { - memcpy(v1, p + i, 16); - pcmpgtb(v2, v1, vz); - if (pmovmskb((void *)v2) != 0xFFFF) - break; - memcpy(q, v1, 16); - q += 16; - i += 16; - } x = p[i++] & 0xff; if (x >= 0300) { a = ThomPikeByte(x);