Make AARCH64 harder, better, faster, stronger

- Perform some housekeeping on scalar math function code
- Import ARM's Optimized Routines for SIMD string processing
- Upgrade to latest Chromium zlib and enable more SIMD optimizations
This commit is contained in:
Justine Tunney 2023-05-15 01:51:29 -07:00
parent 550b52abf6
commit cc1732bc42
No known key found for this signature in database
GPG key ID: BE714B4575D6E328
143 changed files with 15661 additions and 1329 deletions

View file

@@ -73,6 +73,13 @@ IMAGE_BASE_VIRTUAL ?= 0x400000
IGNORE := $(shell $(ECHO) -2 ♥cosmo)
IGNORE := $(shell $(MKDIR) o/tmp)
ifeq ($(MODE), dbg)
# be generous about resources in debug mode
# let commands use 64 seconds cpu time max
# let commands use 300 seconds wall time max
QUOTA ?= -C64 -L300
endif
ifneq ($(findstring aarch64,$(MODE)),)
ARCH = aarch64
VM = o/third_party/qemu/qemu-aarch64

View file

@@ -12,8 +12,8 @@
#include "libc/errno.h"
#include "libc/fmt/conv.h"
#include "libc/log/check.h"
#include "libc/mem/mem.h"
#include "libc/mem/gc.internal.h"
#include "libc/mem/mem.h"
#include "libc/runtime/runtime.h"
#include "libc/stdio/stdio.h"
#include "libc/str/str.h"
@@ -48,26 +48,62 @@ FLAGS\n\
// clang-format off
// make -j8 o//examples && dd if=/dev/urandom count=100 | tee a | o//examples/compress.com | o//examples/decompress.com >b && sha1sum a b
/*
#!/bin/bash
# data file is o/dbg/third_party/python/python.com
# level 0 147517 compress 495 MB/s decompress 1.4 GB/s
# level 1 80274 compress 29.2 MB/s decompress 303 MB/s
# level 2 79384 compress 33.8 MB/s decompress 212 MB/s
# level 3 78875 compress 28.9 MB/s decompress 224 MB/s
# level 4 78010 compress 27.1 MB/s decompress 319 MB/s <-- sweet spot?
# level 5 77107 compress 19.5 MB/s decompress 273 MB/s
# level 6 75081 compress 10.0 MB/s decompress 99.3 MB/s
# level 7 75022 compress 7.5 MB/s decompress 287 MB/s
# level 8 75016 compress 5.4 MB/s decompress 109 MB/s
# level 9 75016 compress 5.4 MB/s decompress 344 MB/s
# level 1 348739 compress 22.8 MB/s decompress 444 MB/s
# level 2 347549 compress 37.8 MB/s decompress 457 MB/s
# level 3 346902 compress 33.3 MB/s decompress 463 MB/s
# level 4 345671 compress 29.3 MB/s decompress 467 MB/s
# level 5 344392 compress 22.4 MB/s decompress 506 MB/s
# level 6 342105 compress 10.9 MB/s decompress 516 MB/s
# level 7 342046 compress 7.9 MB/s decompress 515 MB/s
# level 8 342009 compress 5.8 MB/s decompress 518 MB/s
# level 9 342001 compress 5.7 MB/s decompress 524 MB/s
# level F 1 362426 compress 48.2 MB/s decompress 488 MB/s
# level F 2 360875 compress 42.7 MB/s decompress 484 MB/s
# level F 3 359992 compress 37.1 MB/s decompress 499 MB/s
# level F 4 358460 compress 32.9 MB/s decompress 503 MB/s
# level F 5 356431 compress 24.0 MB/s decompress 547 MB/s
# level F 6 352274 compress 11.6 MB/s decompress 558 MB/s
# level F 7 352155 compress 8.7 MB/s decompress 554 MB/s
# level F 8 352065 compress 6.3 MB/s decompress 554 MB/s
# level F 9 352051 compress 6.2 MB/s decompress 556 MB/s
# level L 1 348739 compress 41.1 MB/s decompress 446 MB/s
# level L 2 347549 compress 37.4 MB/s decompress 443 MB/s
# level L 3 346902 compress 32.3 MB/s decompress 462 MB/s
# level L 4 351932 compress 28.8 MB/s decompress 511 MB/s
# level L 5 351384 compress 23.6 MB/s decompress 520 MB/s
# level L 6 351328 compress 12.1 MB/s decompress 522 MB/s
# level L 7 351230 compress 7.3 MB/s decompress 518 MB/s
# level L 8 351192 compress 5.7 MB/s decompress 522 MB/s
# level L 9 351182 compress 6.5 MB/s decompress 519 MB/s
# level R 1 388209 compress 83.1 MB/s decompress 371 MB/s
# level R 2 388209 compress 82.3 MB/s decompress 362 MB/s
# level R 3 388209 compress 81.8 MB/s decompress 361 MB/s
# level R 4 388209 compress 81.7 MB/s decompress 364 MB/s
# level R 5 388209 compress 81.7 MB/s decompress 363 MB/s
# level R 6 388209 compress 80.1 MB/s decompress 359 MB/s
# level R 7 388209 compress 80.3 MB/s decompress 354 MB/s
# level R 8 388209 compress 80.3 MB/s decompress 363 MB/s
# level R 9 388209 compress 81.3 MB/s decompress 364 MB/s
# level H 1 390207 compress 87.6 MB/s decompress 371 MB/s
# level H 2 390207 compress 87.5 MB/s decompress 372 MB/s
# level H 3 390207 compress 85.5 MB/s decompress 364 MB/s
# level H 4 390207 compress 87.3 MB/s decompress 375 MB/s
# level H 5 390207 compress 89.0 MB/s decompress 373 MB/s
# level H 6 390207 compress 87.3 MB/s decompress 372 MB/s
# level H 7 390207 compress 87.0 MB/s decompress 368 MB/s
# level H 8 390207 compress 86.2 MB/s decompress 367 MB/s
# level H 9 390207 compress 86.9 MB/s decompress 369 MB/s
m=
make -j8 MODE=$m o/$m/examples || exit
for strategy in ' ' F L R H; do
for level in $(seq 1 9); do
for strategy in F L R H; do
o/$m/examples/compress.com -$strategy$level <o/dbg/third_party/python/python.com | dd count=10000 2>/tmp/info >/tmp/comp
o/$m/examples/compress.com -$level$strategy <o/dbg/third_party/python/python.com | dd count=10000 2>/tmp/info >/tmp/comp
compspeed=$(grep -Po '[.\d]+ \w+/s' /tmp/info)
o/$m/examples/decompress.com </tmp/comp | dd count=10000 2>/tmp/info >/dev/null
decompspeed=$(grep -Po '[.\d]+ \w+/s' /tmp/info)
size=$(o/$m/examples/compress.com -$strategy$level <o/$m/examples/compress.com | wc -c)
size=$(o/$m/examples/compress.com -$level$strategy <o/$m/examples/compress.com | wc -c)
echo "level $strategy $level $size compress $compspeed decompress $decompspeed"
done
done

View file

@@ -10,43 +10,14 @@
#include "libc/assert.h"
#include "libc/calls/calls.h"
#include "libc/errno.h"
#include "libc/mem/mem.h"
#include "libc/mem/gc.internal.h"
#include "libc/mem/mem.h"
#include "libc/stdio/stdio.h"
#include "libc/str/str.h"
#include "third_party/zlib/zlib.h"
#define CHUNK 32768
// clang-format off
// make -j8 o//examples && dd if=/dev/urandom count=100 | tee a | o//examples/compress.com | o//examples/decompress.com >b && sha1sum a b
/*
# data file is o/dbg/third_party/python/python.com
# level 0 147517 compress 495 MB/s decompress 1.4 GB/s
# level 1 80274 compress 29.2 MB/s decompress 303 MB/s
# level 2 79384 compress 33.8 MB/s decompress 212 MB/s
# level 3 78875 compress 28.9 MB/s decompress 224 MB/s
# level 4 78010 compress 27.1 MB/s decompress 319 MB/s <-- sweet spot?
# level 5 77107 compress 19.5 MB/s decompress 273 MB/s
# level 6 75081 compress 10.0 MB/s decompress 99.3 MB/s
# level 7 75022 compress 7.5 MB/s decompress 287 MB/s
# level 8 75016 compress 5.4 MB/s decompress 109 MB/s
# level 9 75016 compress 5.4 MB/s decompress 344 MB/s
m=
make -j8 MODE=$m o/$m/examples || exit
for level in $(seq 0 9); do
for strategy in F L R H; do
o/$m/examples/compress.com -$strategy$level <o/dbg/third_party/python/python.com | dd count=10000 2>/tmp/info >/tmp/comp
compspeed=$(grep -Po '[.\d]+ \w+/s' /tmp/info)
o/$m/examples/decompress.com </tmp/comp | dd count=10000 2>/tmp/info >/dev/null
decompspeed=$(grep -Po '[.\d]+ \w+/s' /tmp/info)
size=$(o/$m/examples/compress.com -$strategy$level <o/$m/examples/compress.com | wc -c)
echo "level $strategy $level $size compress $compspeed decompress $decompspeed"
done
done
*/
// clang-format on
int decompressor(int infd, int outfd) {
int rc;
unsigned have;

View file

@@ -0,0 +1,88 @@
#ifndef COSMOPOLITAN_LIBC_INTRIN_AARCH64_ASMDEFS_H_
#define COSMOPOLITAN_LIBC_INTRIN_AARCH64_ASMDEFS_H_
#ifdef __ASSEMBLER__
// clang-format off
/* Branch Target Identification support. */
#define BTI_C hint 34
#define BTI_J hint 36
/* Return address signing support (pac-ret). */
#define PACIASP hint 25; .cfi_window_save
#define AUTIASP hint 29; .cfi_window_save
/* GNU_PROPERTY_AARCH64_* macros from elf.h. */
#define FEATURE_1_AND 0xc0000000
#define FEATURE_1_BTI 1
#define FEATURE_1_PAC 2
/* Add a NT_GNU_PROPERTY_TYPE_0 note. */
#define GNU_PROPERTY(type, value) \
.section .note.gnu.property, "a"; \
.p2align 3; \
.word 4; \
.word 16; \
.word 5; \
.asciz "GNU"; \
.word type; \
.word 4; \
.word value; \
.word 0; \
.text
/* If set then the GNU Property Note section will be added to
mark objects to support BTI and PAC-RET. */
#ifndef WANT_GNU_PROPERTY
#define WANT_GNU_PROPERTY 1
#endif
#if WANT_GNU_PROPERTY
/* Add property note with supported features to all asm files. */
GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC)
#endif
#define ENTRY_ALIGN(name, alignment) \
.global name; \
.type name,%function; \
.align alignment; \
name: \
.cfi_startproc; \
BTI_C;
#define ENTRY(name) ENTRY_ALIGN(name, 6)
#define ENTRY_ALIAS(name) \
.global name; \
.type name,%function; \
name:
#define END(name) \
.cfi_endproc; \
.size name, .-name;
#define L(l) .L ## l
#ifdef __ILP32__
/* Sanitize padding bits of pointer arguments as per aapcs64 */
#define PTR_ARG(n) mov w##n, w##n
#else
#define PTR_ARG(n)
#endif
#ifdef __ILP32__
/* Sanitize padding bits of size arguments as per aapcs64 */
#define SIZE_ARG(n) mov w##n, w##n
#else
#define SIZE_ARG(n)
#endif
/* Compiler supports SVE instructions */
#ifndef HAVE_SVE
# if __aarch64__ && (__GNUC__ >= 8 || __clang_major__ >= 5)
# define HAVE_SVE 1
# else
# define HAVE_SVE 0
# endif
#endif
#endif /* __ASSEMBLER__ */
#endif /* COSMOPOLITAN_LIBC_INTRIN_AARCH64_ASMDEFS_H_ */

View file

@@ -0,0 +1,172 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Optimized Routines
Copyright (c) 1999-2022, Arm Limited.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "libc/intrin/aarch64/asmdefs.h"
#define __memchr_aarch64 memchr
.ident "\n\
Optimized Routines (MIT License)\n\
Copyright 2022 ARM Limited\n"
.include "libc/disclaimer.inc"
/* Assumptions:
*
* ARMv8-a, AArch64
* Neon Available.
*/
/* Arguments and results. */
#define srcin x0
#define chrin w1
#define cntin x2
#define result x0
#define src x3
#define tmp x4
#define wtmp2 w5
#define synd x6
#define soff x9
#define cntrem x10
#define vrepchr v0
#define vdata1 v1
#define vdata2 v2
#define vhas_chr1 v3
#define vhas_chr2 v4
#define vrepmask v5
#define vend v6
/*
* Core algorithm:
*
* For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
* per byte. For each tuple, bit 0 is set if the relevant byte matched the
requested character and bit 1 is unused (this is faster than using a 32-bit
syndrome). Since the bits in the syndrome reflect exactly the order in which
things occur in the original string, counting trailing zeros identifies
exactly which byte matched.
*/
ENTRY (__memchr_aarch64)
PTR_ARG (0)
SIZE_ARG (2)
/* Do not dereference srcin if no bytes to compare. */
cbz cntin, L(zero_length)
/*
* Magic constant 0x40100401 allows us to identify which lane matches
* the requested byte.
*/
mov wtmp2, #0x0401
movk wtmp2, #0x4010, lsl #16
dup vrepchr.16b, chrin
/* Work with aligned 32-byte chunks */
bic src, srcin, #31
dup vrepmask.4s, wtmp2
ands soff, srcin, #31
and cntrem, cntin, #31
b.eq L(loop)
/*
* Input string is not 32-byte aligned. We calculate the syndrome
* value for the aligned 32 bytes block containing the first bytes
* and mask the irrelevant part.
*/
ld1 {vdata1.16b, vdata2.16b}, [src], #32
sub tmp, soff, #32
adds cntin, cntin, tmp
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */
addp vend.16b, vend.16b, vend.16b /* 128->64 */
mov synd, vend.d[0]
/* Clear the soff*2 lower bits */
lsl tmp, soff, #1
lsr synd, synd, tmp
lsl synd, synd, tmp
/* The first block can also be the last */
b.ls L(masklast)
/* Have we found something already? */
cbnz synd, L(tail)
L(loop):
ld1 {vdata1.16b, vdata2.16b}, [src], #32
subs cntin, cntin, #32
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
/* If we're out of data we finish regardless of the result */
b.ls L(end)
/* Use a fast check for the termination condition */
orr vend.16b, vhas_chr1.16b, vhas_chr2.16b
addp vend.2d, vend.2d, vend.2d
mov synd, vend.d[0]
/* We're not out of data, loop if we haven't found the character */
cbz synd, L(loop)
L(end):
/* Termination condition found, let's calculate the syndrome value */
and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */
addp vend.16b, vend.16b, vend.16b /* 128->64 */
mov synd, vend.d[0]
/* Only do the clear for the last possible block */
b.hs L(tail)
L(masklast):
/* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */
add tmp, cntrem, soff
and tmp, tmp, #31
sub tmp, tmp, #32
neg tmp, tmp, lsl #1
lsl synd, synd, tmp
lsr synd, synd, tmp
L(tail):
/* Count the trailing zeros using bit reversing */
rbit synd, synd
/* Compensate the last post-increment */
sub src, src, #32
/* Check that we have found a character */
cmp synd, #0
/* And count the leading zeros */
clz synd, synd
/* Compute the potential result */
add result, src, synd, lsr #1
/* Select result or NULL */
csel result, xzr, result, eq
ret
L(zero_length):
mov result, #0
ret
END (__memchr_aarch64)
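
For reference, here is a scalar C sketch of the two-bit syndrome idea described
in the comment above. It is illustrative only (the names and the chunked
interface are assumptions, not part of this codebase): one 32-byte chunk is
reduced to a 64-bit word with two bits per byte, and counting trailing zeros
locates the first match.

#include <stddef.h>
#include <stdint.h>

/* Illustrative scalar model of the 2-bits-per-byte syndrome used above. */
static const char *chunk_memchr32(const char *p, unsigned char c) {
  uint64_t synd = 0;
  for (int i = 0; i < 32; ++i) {
    if ((unsigned char)p[i] == c)
      synd |= 1ull << (2 * i);        /* bit 0 of the byte's 2-bit lane */
  }
  if (!synd) return NULL;             /* no match in this chunk */
  int tz = __builtin_ctzll(synd);     /* trailing zeros -> first match */
  return p + tz / 2;                  /* two syndrome bits per byte */
}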

View file

@@ -0,0 +1,218 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Optimized Routines
Copyright (c) 1999-2022, Arm Limited.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "libc/intrin/aarch64/asmdefs.h"
#define __memcmp_aarch64 memcmp
.ident "\n\
Optimized Routines (MIT License)\n\
Copyright 2022 ARM Limited\n"
.include "libc/disclaimer.inc"
/* Assumptions:
*
* ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
*/
#define src1 x0
#define src2 x1
#define limit x2
#define result w0
#define data1 x3
#define data1w w3
#define data2 x4
#define data2w w4
#define data3 x5
#define data3w w5
#define data4 x6
#define data4w w6
#define tmp x6
#define src1end x7
#define src2end x8
ENTRY (__memcmp_aarch64)
PTR_ARG (0)
PTR_ARG (1)
SIZE_ARG (2)
cmp limit, 16
b.lo L(less16)
ldp data1, data3, [src1]
ldp data2, data4, [src2]
ccmp data1, data2, 0, ne
ccmp data3, data4, 0, eq
b.ne L(return2)
add src1end, src1, limit
add src2end, src2, limit
cmp limit, 32
b.ls L(last_bytes)
cmp limit, 160
b.hs L(loop_align)
sub limit, limit, 32
.p2align 4
L(loop32):
ldp data1, data3, [src1, 16]
ldp data2, data4, [src2, 16]
cmp data1, data2
ccmp data3, data4, 0, eq
b.ne L(return2)
cmp limit, 16
b.ls L(last_bytes)
ldp data1, data3, [src1, 32]
ldp data2, data4, [src2, 32]
cmp data1, data2
ccmp data3, data4, 0, eq
b.ne L(return2)
add src1, src1, 32
add src2, src2, 32
L(last64):
subs limit, limit, 32
b.hi L(loop32)
/* Compare last 1-16 bytes using unaligned access. */
L(last_bytes):
ldp data1, data3, [src1end, -16]
ldp data2, data4, [src2end, -16]
L(return2):
cmp data1, data2
csel data1, data1, data3, ne
csel data2, data2, data4, ne
/* Compare data bytes and set return value to 0, -1 or 1. */
L(return):
#ifndef __AARCH64EB__
rev data1, data1
rev data2, data2
#endif
cmp data1, data2
cset result, ne
cneg result, result, lo
ret
.p2align 4
L(less16):
add src1end, src1, limit
add src2end, src2, limit
tbz limit, 3, L(less8)
ldr data1, [src1]
ldr data2, [src2]
ldr data3, [src1end, -8]
ldr data4, [src2end, -8]
b L(return2)
.p2align 4
L(less8):
tbz limit, 2, L(less4)
ldr data1w, [src1]
ldr data2w, [src2]
ldr data3w, [src1end, -4]
ldr data4w, [src2end, -4]
b L(return2)
L(less4):
tbz limit, 1, L(less2)
ldrh data1w, [src1]
ldrh data2w, [src2]
cmp data1w, data2w
b.ne L(return)
L(less2):
mov result, 0
tbz limit, 0, L(return_zero)
ldrb data1w, [src1end, -1]
ldrb data2w, [src2end, -1]
sub result, data1w, data2w
L(return_zero):
ret
L(loop_align):
ldp data1, data3, [src1, 16]
ldp data2, data4, [src2, 16]
cmp data1, data2
ccmp data3, data4, 0, eq
b.ne L(return2)
/* Align src2 and adjust src1, src2 and limit. */
and tmp, src2, 15
sub tmp, tmp, 16
sub src2, src2, tmp
add limit, limit, tmp
sub src1, src1, tmp
sub limit, limit, 64 + 16
.p2align 4
L(loop64):
ldr q0, [src1, 16]
ldr q1, [src2, 16]
subs limit, limit, 64
ldr q2, [src1, 32]
ldr q3, [src2, 32]
eor v0.16b, v0.16b, v1.16b
eor v1.16b, v2.16b, v3.16b
ldr q2, [src1, 48]
ldr q3, [src2, 48]
umaxp v0.16b, v0.16b, v1.16b
ldr q4, [src1, 64]!
ldr q5, [src2, 64]!
eor v1.16b, v2.16b, v3.16b
eor v2.16b, v4.16b, v5.16b
umaxp v1.16b, v1.16b, v2.16b
umaxp v0.16b, v0.16b, v1.16b
umaxp v0.16b, v0.16b, v0.16b
fmov tmp, d0
ccmp tmp, 0, 0, hi
b.eq L(loop64)
/* If equal, process last 1-64 bytes using scalar loop. */
add limit, limit, 64 + 16
cbz tmp, L(last64)
/* Determine the 8-byte aligned offset of the first difference. */
#ifdef __AARCH64EB__
rev16 tmp, tmp
#endif
rev tmp, tmp
clz tmp, tmp
bic tmp, tmp, 7
sub tmp, tmp, 48
ldr data1, [src1, tmp]
ldr data2, [src2, tmp]
#ifndef __AARCH64EB__
rev data1, data1
rev data2, data2
#endif
mov result, 1
cmp data1, data2
cneg result, result, lo
ret
END (__memcmp_aarch64)
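
The sign of the result at L(return) comes from byte-reversing the first
differing 8-byte words on little-endian so that an ordinary unsigned integer
compare matches memory (lexicographic) order. A hedged C equivalent of that
step, with illustrative names:

#include <stdint.h>

/* How one differing 8-byte word yields memcmp's sign on little-endian:
   byte-swap both words (like the rev instructions above), then an
   unsigned compare gives the memory-order result. */
static int word_diff_sign(uint64_t a, uint64_t b) {
  a = __builtin_bswap64(a);
  b = __builtin_bswap64(b);
  if (a == b) return 0;
  return a < b ? -1 : 1;
}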

View file

@@ -0,0 +1,233 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Optimized Routines
Copyright (c) 1999-2022, Arm Limited.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "libc/intrin/aarch64/asmdefs.h"
#define __memcpy_aarch64_simd memcpy
#define __memmove_aarch64_simd memmove
.ident "\n\
Optimized Routines (MIT License)\n\
Copyright 2022 ARM Limited\n"
.include "libc/disclaimer.inc"
/* Assumptions:
*
* ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
*
*/
#define dstin x0
#define src x1
#define count x2
#define dst x3
#define srcend x4
#define dstend x5
#define A_l x6
#define A_lw w6
#define A_h x7
#define B_l x8
#define B_lw w8
#define B_h x9
#define C_lw w10
#define tmp1 x14
#define A_q q0
#define B_q q1
#define C_q q2
#define D_q q3
#define E_q q4
#define F_q q5
#define G_q q6
#define H_q q7
/* This implementation handles overlaps and supports both memcpy and memmove
from a single entry point. It uses unaligned accesses and branchless
sequences to keep the code small and simple and to improve performance.
Copies are split into 3 main cases: small copies of up to 32 bytes, medium
copies of up to 128 bytes, and large copies. The overhead of the overlap
check is negligible since it is only required for large copies.
Large copies use a software pipelined loop processing 64 bytes per iteration.
The source pointer is 16-byte aligned to minimize unaligned accesses.
The loop tail is handled by always copying 64 bytes from the end.
*/
ENTRY_ALIAS (__memmove_aarch64_simd)
ENTRY (__memcpy_aarch64_simd)
PTR_ARG (0)
PTR_ARG (1)
SIZE_ARG (2)
add srcend, src, count
add dstend, dstin, count
cmp count, 128
b.hi L(copy_long)
cmp count, 32
b.hi L(copy32_128)
/* Small copies: 0..32 bytes. */
cmp count, 16
b.lo L(copy16)
ldr A_q, [src]
ldr B_q, [srcend, -16]
str A_q, [dstin]
str B_q, [dstend, -16]
ret
/* Copy 8-15 bytes. */
L(copy16):
tbz count, 3, L(copy8)
ldr A_l, [src]
ldr A_h, [srcend, -8]
str A_l, [dstin]
str A_h, [dstend, -8]
ret
.p2align 3
/* Copy 4-7 bytes. */
L(copy8):
tbz count, 2, L(copy4)
ldr A_lw, [src]
ldr B_lw, [srcend, -4]
str A_lw, [dstin]
str B_lw, [dstend, -4]
ret
/* Copy 0..3 bytes using a branchless sequence. */
L(copy4):
cbz count, L(copy0)
lsr tmp1, count, 1
ldrb A_lw, [src]
ldrb C_lw, [srcend, -1]
ldrb B_lw, [src, tmp1]
strb A_lw, [dstin]
strb B_lw, [dstin, tmp1]
strb C_lw, [dstend, -1]
L(copy0):
ret
.p2align 4
/* Medium copies: 33..128 bytes. */
L(copy32_128):
ldp A_q, B_q, [src]
ldp C_q, D_q, [srcend, -32]
cmp count, 64
b.hi L(copy128)
stp A_q, B_q, [dstin]
stp C_q, D_q, [dstend, -32]
ret
.p2align 4
/* Copy 65..128 bytes. */
L(copy128):
ldp E_q, F_q, [src, 32]
cmp count, 96
b.ls L(copy96)
ldp G_q, H_q, [srcend, -64]
stp G_q, H_q, [dstend, -64]
L(copy96):
stp A_q, B_q, [dstin]
stp E_q, F_q, [dstin, 32]
stp C_q, D_q, [dstend, -32]
ret
/* Copy more than 128 bytes. */
L(copy_long):
/* Use backwards copy if there is an overlap. */
sub tmp1, dstin, src
cmp tmp1, count
b.lo L(copy_long_backwards)
/* Copy 16 bytes and then align src to 16-byte alignment. */
ldr D_q, [src]
and tmp1, src, 15
bic src, src, 15
sub dst, dstin, tmp1
add count, count, tmp1 /* Count is now 16 too large. */
ldp A_q, B_q, [src, 16]
str D_q, [dstin]
ldp C_q, D_q, [src, 48]
subs count, count, 128 + 16 /* Test and readjust count. */
b.ls L(copy64_from_end)
L(loop64):
stp A_q, B_q, [dst, 16]
ldp A_q, B_q, [src, 80]
stp C_q, D_q, [dst, 48]
ldp C_q, D_q, [src, 112]
add src, src, 64
add dst, dst, 64
subs count, count, 64
b.hi L(loop64)
/* Write the last iteration and copy 64 bytes from the end. */
L(copy64_from_end):
ldp E_q, F_q, [srcend, -64]
stp A_q, B_q, [dst, 16]
ldp A_q, B_q, [srcend, -32]
stp C_q, D_q, [dst, 48]
stp E_q, F_q, [dstend, -64]
stp A_q, B_q, [dstend, -32]
ret
/* Large backwards copy for overlapping copies.
Copy 16 bytes and then align srcend to 16-byte alignment. */
L(copy_long_backwards):
cbz tmp1, L(copy0)
ldr D_q, [srcend, -16]
and tmp1, srcend, 15
bic srcend, srcend, 15
sub count, count, tmp1
ldp A_q, B_q, [srcend, -32]
str D_q, [dstend, -16]
ldp C_q, D_q, [srcend, -64]
sub dstend, dstend, tmp1
subs count, count, 128
b.ls L(copy64_from_start)
L(loop64_backwards):
str B_q, [dstend, -16]
str A_q, [dstend, -32]
ldp A_q, B_q, [srcend, -96]
str D_q, [dstend, -48]
str C_q, [dstend, -64]!
ldp C_q, D_q, [srcend, -128]
sub srcend, srcend, 64
subs count, count, 64
b.hi L(loop64_backwards)
/* Write the last iteration and copy 64 bytes from the start. */
L(copy64_from_start):
ldp E_q, F_q, [src, 32]
stp A_q, B_q, [dstend, -32]
ldp A_q, B_q, [src]
stp C_q, D_q, [dstend, -64]
stp E_q, F_q, [dstin, 32]
stp A_q, B_q, [dstin]
ret
END (__memcpy_aarch64_simd)
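
The loop tail trick mentioned in the header comment, always copying 64 bytes
from the end, can be modeled in C as below. This is a simplified sketch under
assumed conditions (non-overlapping forward copy, n >= 64), not the routine
itself:

#include <stddef.h>
#include <string.h>

/* Copy 64-byte blocks until at most 64 bytes remain, then unconditionally
   copy the last 64 bytes from the end.  The final store may overlap the
   block written just before it, so no 1..63-byte tail code is needed.
   Assumes non-overlapping buffers and n >= 64. */
static void copy_large_forward(unsigned char *d, const unsigned char *s,
                               size_t n) {
  size_t i = 0;
  while (n - i > 64) {
    memcpy(d + i, s + i, 64);
    i += 64;
  }
  memcpy(d + n - 64, s + n - 64, 64);
}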

View file

@@ -0,0 +1,138 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Optimized Routines
Copyright (c) 1999-2022, Arm Limited.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "libc/intrin/aarch64/asmdefs.h"
#define __memrchr_aarch64 memrchr
.ident "\n\
Optimized Routines (MIT License)\n\
Copyright 2022 ARM Limited\n"
.include "libc/disclaimer.inc"
/* Assumptions:
*
* ARMv8-a, AArch64, Advanced SIMD.
* MTE compatible.
*/
#define srcin x0
#define chrin w1
#define cntin x2
#define result x0
#define src x3
#define cntrem x4
#define synd x5
#define shift x6
#define tmp x7
#define end x8
#define endm1 x9
#define vrepchr v0
#define qdata q1
#define vdata v1
#define vhas_chr v2
#define vend v3
#define dend d3
/*
Core algorithm:
For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
per byte. We take 4 bits of every comparison byte using the shift-right-and-narrow-by-4
instruction. Since the bits in the nibble mask reflect the order in
which things occur in the original string, counting leading zeros identifies
exactly which byte matched. */
ENTRY (__memrchr_aarch64)
PTR_ARG (0)
add end, srcin, cntin
sub endm1, end, 1
bic src, endm1, 15
cbz cntin, L(nomatch)
ld1 {vdata.16b}, [src]
dup vrepchr.16b, chrin
cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
neg shift, end, lsl 2
shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
fmov synd, dend
lsl synd, synd, shift
cbz synd, L(start_loop)
clz synd, synd
sub result, endm1, synd, lsr 2
cmp cntin, synd, lsr 2
csel result, result, xzr, hi
ret
nop
L(start_loop):
subs cntrem, src, srcin
b.ls L(nomatch)
/* Make sure that it won't overread by a 16-byte chunk */
sub cntrem, cntrem, 1
tbz cntrem, 4, L(loop32_2)
add src, src, 16
.p2align 5
L(loop32):
ldr qdata, [src, -32]!
cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
fmov synd, dend
cbnz synd, L(end)
L(loop32_2):
ldr qdata, [src, -16]
subs cntrem, cntrem, 32
cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
b.lo L(end_2)
umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
fmov synd, dend
cbz synd, L(loop32)
L(end_2):
sub src, src, 16
L(end):
shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
fmov synd, dend
add tmp, src, 15
#ifdef __AARCH64EB__
rbit synd, synd
#endif
clz synd, synd
sub tmp, tmp, synd, lsr 2
cmp tmp, srcin
csel result, tmp, xzr, hs
ret
L(nomatch):
mov result, 0
ret
END (__memrchr_aarch64)
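
A scalar C sketch of the nibble-mask scheme described above (illustrative
names, not this codebase): each byte of a 16-byte chunk contributes a 4-bit
nibble, and counting leading zeros locates the last match, since memrchr scans
from the end.

#include <stddef.h>
#include <stdint.h>

/* Illustrative scalar model of the 4-bits-per-byte nibble mask above. */
static const char *chunk_memrchr16(const char *p, unsigned char c) {
  uint64_t mask = 0;
  for (int i = 0; i < 16; ++i) {
    if ((unsigned char)p[i] == c)
      mask |= (uint64_t)0xf << (4 * i);  /* set the byte's whole nibble */
  }
  if (!mask) return NULL;                /* no match in this chunk */
  int lz = __builtin_clzll(mask);        /* leading zeros -> last match */
  return p + (63 - lz) / 4;              /* four mask bits per byte */
}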

View file

@@ -0,0 +1,143 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Optimized Routines
Copyright (c) 1999-2022, Arm Limited.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "libc/intrin/aarch64/asmdefs.h"
#define __memset_aarch64 memset
.ident "\n\
Optimized Routines (MIT License)\n\
Copyright 2022 ARM Limited\n"
.include "libc/disclaimer.inc"
/* Assumptions:
*
* ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
*
*/
#define dstin x0
#define val x1
#define valw w1
#define count x2
#define dst x3
#define dstend x4
#define zva_val x5
ENTRY (__memset_aarch64)
PTR_ARG (0)
SIZE_ARG (2)
dup v0.16B, valw
add dstend, dstin, count
cmp count, 96
b.hi L(set_long)
cmp count, 16
b.hs L(set_medium)
mov val, v0.D[0]
/* Set 0..15 bytes. */
tbz count, 3, 1f
str val, [dstin]
str val, [dstend, -8]
ret
.p2align 4
1: tbz count, 2, 2f
str valw, [dstin]
str valw, [dstend, -4]
ret
2: cbz count, 3f
strb valw, [dstin]
tbz count, 1, 3f
strh valw, [dstend, -2]
3: ret
/* Set 17..96 bytes. */
L(set_medium):
str q0, [dstin]
tbnz count, 6, L(set96)
str q0, [dstend, -16]
tbz count, 5, 1f
str q0, [dstin, 16]
str q0, [dstend, -32]
1: ret
.p2align 4
/* Set 64..96 bytes. Write 64 bytes from the start and
32 bytes from the end. */
L(set96):
str q0, [dstin, 16]
stp q0, q0, [dstin, 32]
stp q0, q0, [dstend, -32]
ret
.p2align 4
L(set_long):
and valw, valw, 255
bic dst, dstin, 15
str q0, [dstin]
cmp count, 160
ccmp valw, 0, 0, hs
b.ne L(no_zva)
#ifndef SKIP_ZVA_CHECK
mrs zva_val, dczid_el0
and zva_val, zva_val, 31
cmp zva_val, 4 /* ZVA size is 64 bytes. */
b.ne L(no_zva)
#endif
str q0, [dst, 16]
stp q0, q0, [dst, 32]
bic dst, dst, 63
sub count, dstend, dst /* Count is now 64 too large. */
sub count, count, 128 /* Adjust count and bias for loop. */
.p2align 4
L(zva_loop):
add dst, dst, 64
dc zva, dst
subs count, count, 64
b.hi L(zva_loop)
stp q0, q0, [dstend, -64]
stp q0, q0, [dstend, -32]
ret
L(no_zva):
sub count, dstend, dst /* Count is 16 too large. */
sub dst, dst, 16 /* Dst is biased by -32. */
sub count, count, 64 + 16 /* Adjust count and bias for loop. */
L(no_zva_loop):
stp q0, q0, [dst, 32]
stp q0, q0, [dst, 64]!
subs count, count, 64
b.hi L(no_zva_loop)
stp q0, q0, [dstend, -64]
stp q0, q0, [dstend, -32]
ret
END (__memset_aarch64)
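
The small and medium paths above avoid per-size branching by writing a
fixed-size block at the start and another at the end, letting the two stores
overlap. A hedged C model of that idea for the 16..32-byte case (names are
illustrative):

#include <stddef.h>
#include <string.h>

/* Overlapping-store model of "str q0, [dstin]; str q0, [dstend, -16]":
   valid for 16 <= n <= 32, where the two 16-byte stores cover the whole
   range and simply overlap in the middle. */
static void set16_32(unsigned char *d, int c, size_t n) {
  unsigned char block[16];
  memset(block, c, sizeof(block));
  memcpy(d, block, 16);
  memcpy(d + n - 16, block, 16);
}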

View file

@@ -0,0 +1,175 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Optimized Routines
Copyright (c) 1999-2022, Arm Limited.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "libc/intrin/aarch64/asmdefs.h"
#define __stpcpy_aarch64 stpcpy
.ident "\n\
Optimized Routines (MIT License)\n\
Copyright 2022 ARM Limited\n"
.include "libc/disclaimer.inc"
/* Assumptions:
*
* ARMv8-a, AArch64, Advanced SIMD.
* MTE compatible.
*/
#define dstin x0
#define srcin x1
#define result x0
#define src x2
#define dst x3
#define len x4
#define synd x4
#define tmp x5
#define shift x5
#define data1 x6
#define dataw1 w6
#define data2 x7
#define dataw2 w7
#define dataq q0
#define vdata v0
#define vhas_nul v1
#define vend v2
#define dend d2
#define dataq2 q1
/*
Core algorithm:
For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
per byte. We take 4 bits of every comparison byte using the shift-right-and-narrow-by-4
instruction. Since the bits in the nibble mask reflect the order in
which things occur in the original string, counting leading zeros identifies
exactly which byte matched. */
ENTRY (__stpcpy_aarch64)
PTR_ARG (0)
PTR_ARG (1)
bic src, srcin, 15
ld1 {vdata.16b}, [src]
cmeq vhas_nul.16b, vdata.16b, 0
lsl shift, srcin, 2
shrn vend.8b, vhas_nul.8h, 4
fmov synd, dend
lsr synd, synd, shift
cbnz synd, L(tail)
ldr dataq, [src, 16]!
cmeq vhas_nul.16b, vdata.16b, 0
shrn vend.8b, vhas_nul.8h, 4
fmov synd, dend
cbz synd, L(start_loop)
#ifndef __AARCH64EB__
rbit synd, synd
#endif
sub tmp, src, srcin
clz len, synd
add len, tmp, len, lsr 2
tbz len, 4, L(less16)
sub tmp, len, 15
ldr dataq, [srcin]
ldr dataq2, [srcin, tmp]
str dataq, [dstin]
str dataq2, [dstin, tmp]
add result, dstin, len
ret
L(tail):
rbit synd, synd
clz len, synd
lsr len, len, 2
L(less16):
tbz len, 3, L(less8)
sub tmp, len, 7
ldr data1, [srcin]
ldr data2, [srcin, tmp]
str data1, [dstin]
str data2, [dstin, tmp]
add result, dstin, len
ret
.p2align 4
L(less8):
subs tmp, len, 3
b.lo L(less4)
ldr dataw1, [srcin]
ldr dataw2, [srcin, tmp]
str dataw1, [dstin]
str dataw2, [dstin, tmp]
add result, dstin, len
ret
L(less4):
cbz len, L(zerobyte)
ldrh dataw1, [srcin]
strh dataw1, [dstin]
L(zerobyte):
strb wzr, [dstin, len]
add result, dstin, len
ret
.p2align 4
L(start_loop):
sub tmp, srcin, dstin
ldr dataq2, [srcin]
sub dst, src, tmp
str dataq2, [dstin]
L(loop):
str dataq, [dst], 32
ldr dataq, [src, 16]
cmeq vhas_nul.16b, vdata.16b, 0
umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
fmov synd, dend
cbnz synd, L(loopend)
str dataq, [dst, -16]
ldr dataq, [src, 32]!
cmeq vhas_nul.16b, vdata.16b, 0
umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
fmov synd, dend
cbz synd, L(loop)
add dst, dst, 16
L(loopend):
shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
fmov synd, dend
sub dst, dst, 31
#ifndef __AARCH64EB__
rbit synd, synd
#endif
clz len, synd
lsr len, len, 2
add dst, dst, len
ldr dataq, [dst, tmp]
str dataq, [dst]
add result, dst, 15
ret
END (__stpcpy_aarch64)
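
Once the length of a short string is known, the code above (and the strcpy
variant later in this commit) copies it with two possibly overlapping 16-byte
blocks instead of a byte loop. A C sketch of that step under the assumption
16 <= len <= 31, with illustrative names:

#include <stddef.h>
#include <string.h>

/* Two overlapping 16-byte copies cover bytes 0..len (the second block
   ends exactly at the NUL terminator).  stpcpy returns &dst[len]. */
static char *copy_short(char *dst, const char *src, size_t len) {
  char head[16], tail[16];
  memcpy(head, src, 16);
  memcpy(tail, src + len - 15, 16);
  memcpy(dst, head, 16);
  memcpy(dst + len - 15, tail, 16);
  return dst + len;
}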

View file

@@ -0,0 +1,152 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Optimized Routines
Copyright (c) 1999-2022, Arm Limited.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "libc/intrin/aarch64/asmdefs.h"
#define __strchr_aarch64 strchr
.ident "\n\
Optimized Routines (MIT License)\n\
Copyright 2022 ARM Limited\n"
.include "libc/disclaimer.inc"
/* Assumptions:
*
* ARMv8-a, AArch64
* Neon Available.
*/
/* Arguments and results. */
#define srcin x0
#define chrin w1
#define result x0
#define src x2
#define tmp1 x3
#define wtmp2 w4
#define tmp3 x5
#define vrepchr v0
#define vdata1 v1
#define vdata2 v2
#define vhas_nul1 v3
#define vhas_nul2 v4
#define vhas_chr1 v5
#define vhas_chr2 v6
#define vrepmask_0 v7
#define vrepmask_c v16
#define vend1 v17
#define vend2 v18
/* Core algorithm.
For each 32-byte hunk we calculate a 64-bit syndrome value, with
two bits per byte (LSB is always in bits 0 and 1, for both big
and little-endian systems). For each tuple, bit 0 is set iff
the relevant byte matched the requested character; bit 1 is set
iff the relevant byte matched the NUL end of string (we trigger
off bit0 for the special case of looking for NUL). Since the bits
in the syndrome reflect exactly the order in which things occur
in the original string a count_trailing_zeros() operation will
identify exactly which byte is causing the termination, and why. */
/* Locals and temporaries. */
ENTRY (__strchr_aarch64)
PTR_ARG (0)
/* Magic constant 0xc0300c03 to allow us to identify which lane
matches the requested byte. Even bits are set if the character
matches, odd bits if either the char is NUL or matches. */
mov wtmp2, 0x0c03
movk wtmp2, 0xc030, lsl 16
dup vrepchr.16b, chrin
bic src, srcin, #31 /* Work with aligned 32-byte hunks. */
dup vrepmask_c.4s, wtmp2
ands tmp1, srcin, #31
add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */
b.eq L(loop)
/* Input string is not 32-byte aligned. Rather than forcing
the padding bytes to a safe value, we calculate the syndrome
for all the bytes, but then mask off those bits of the
syndrome that are related to the padding. */
ld1 {vdata1.16b, vdata2.16b}, [src], #32
neg tmp1, tmp1
cmeq vhas_nul1.16b, vdata1.16b, #0
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
cmeq vhas_nul2.16b, vdata2.16b, #0
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
bif vhas_nul1.16b, vhas_chr1.16b, vrepmask_0.16b
bif vhas_nul2.16b, vhas_chr2.16b, vrepmask_0.16b
and vend1.16b, vhas_nul1.16b, vrepmask_c.16b
and vend2.16b, vhas_nul2.16b, vrepmask_c.16b
lsl tmp1, tmp1, #1
addp vend1.16b, vend1.16b, vend2.16b // 256->128
mov tmp3, #~0
addp vend1.16b, vend1.16b, vend2.16b // 128->64
lsr tmp1, tmp3, tmp1
mov tmp3, vend1.d[0]
bic tmp1, tmp3, tmp1 // Mask padding bits.
cbnz tmp1, L(tail)
.p2align 4
L(loop):
ld1 {vdata1.16b, vdata2.16b}, [src], #32
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b
cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b
orr vend1.16b, vhas_nul1.16b, vhas_nul2.16b
umaxp vend1.16b, vend1.16b, vend1.16b
mov tmp1, vend1.d[0]
cbz tmp1, L(loop)
/* Termination condition found. Now need to establish exactly why
we terminated. */
bif vhas_nul1.16b, vhas_chr1.16b, vrepmask_0.16b
bif vhas_nul2.16b, vhas_chr2.16b, vrepmask_0.16b
and vend1.16b, vhas_nul1.16b, vrepmask_c.16b
and vend2.16b, vhas_nul2.16b, vrepmask_c.16b
addp vend1.16b, vend1.16b, vend2.16b // 256->128
addp vend1.16b, vend1.16b, vend2.16b // 128->64
mov tmp1, vend1.d[0]
L(tail):
/* Count the trailing zeros, by bit reversing... */
rbit tmp1, tmp1
/* Re-bias source. */
sub src, src, #32
clz tmp1, tmp1 /* And counting the leading zeros. */
/* Tmp1 is even if the target character was found first. Otherwise
we've found the end of string and we weren't looking for NUL. */
tst tmp1, #1
add result, src, tmp1, lsr #1
csel result, result, xzr, eq
ret
END (__strchr_aarch64)
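
A scalar C sketch of the two-bit syndrome used above, showing how the parity
of the trailing-zero count distinguishes "character found" from "hit the end
of the string" (names and the chunked interface are illustrative):

#include <stddef.h>
#include <stdint.h>

/* Returns 1 and sets *out when this 32-byte chunk is decisive (character
   found, or NUL reached first); returns 0 if the caller should move on to
   the next chunk. */
static int chunk_strchr32(const char *p, unsigned char c, const char **out) {
  uint64_t synd = 0;
  for (int i = 0; i < 32; ++i) {
    unsigned char b = (unsigned char)p[i];
    if (b == c) synd |= 1ull << (2 * i);   /* bit 0: character match */
    if (b == 0) synd |= 2ull << (2 * i);   /* bit 1: end of string   */
  }
  if (!synd) return 0;
  int tz = __builtin_ctzll(synd);          /* first event in the chunk */
  *out = (tz & 1) ? NULL : p + tz / 2;     /* odd bit -> NUL came first */
  return 1;
}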

View file

@@ -0,0 +1,140 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Optimized Routines
Copyright (c) 1999-2022, Arm Limited.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "libc/intrin/aarch64/asmdefs.h"
#define __strchrnul_aarch64 strchrnul
.ident "\n\
Optimized Routines (MIT License)\n\
Copyright 2022 ARM Limited\n"
.include "libc/disclaimer.inc"
/* Assumptions:
*
* ARMv8-a, AArch64
* Neon Available.
*/
/* Arguments and results. */
#define srcin x0
#define chrin w1
#define result x0
#define src x2
#define tmp1 x3
#define wtmp2 w4
#define tmp3 x5
#define vrepchr v0
#define vdata1 v1
#define vdata2 v2
#define vhas_nul1 v3
#define vhas_nul2 v4
#define vhas_chr1 v5
#define vhas_chr2 v6
#define vrepmask v7
#define vend1 v16
/* Core algorithm.
For each 32-byte hunk we calculate a 64-bit syndrome value, with
two bits per byte (LSB is always in bits 0 and 1, for both big
and little-endian systems). For each tuple, bit 0 is set iff
the relevant byte matched the requested character or nul. Since the
bits in the syndrome reflect exactly the order in which things occur
in the original string a count_trailing_zeros() operation will
identify exactly which byte is causing the termination. */
/* Locals and temporaries. */
ENTRY (__strchrnul_aarch64)
PTR_ARG (0)
/* Magic constant 0x40100401 to allow us to identify which lane
matches the termination condition. */
mov wtmp2, #0x0401
movk wtmp2, #0x4010, lsl #16
dup vrepchr.16b, chrin
bic src, srcin, #31 /* Work with aligned 32-byte hunks. */
dup vrepmask.4s, wtmp2
ands tmp1, srcin, #31
b.eq L(loop)
/* Input string is not 32-byte aligned. Rather than forcing
the padding bytes to a safe value, we calculate the syndrome
for all the bytes, but then mask off those bits of the
syndrome that are related to the padding. */
ld1 {vdata1.16b, vdata2.16b}, [src], #32
neg tmp1, tmp1
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b
cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b
and vhas_chr1.16b, vhas_nul1.16b, vrepmask.16b
and vhas_chr2.16b, vhas_nul2.16b, vrepmask.16b
lsl tmp1, tmp1, #1
addp vend1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
mov tmp3, #~0
addp vend1.16b, vend1.16b, vend1.16b // 128->64
lsr tmp1, tmp3, tmp1
mov tmp3, vend1.d[0]
bic tmp1, tmp3, tmp1 // Mask padding bits.
cbnz tmp1, L(tail)
.p2align 4
L(loop):
ld1 {vdata1.16b, vdata2.16b}, [src], #32
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b
cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b
orr vend1.16b, vhas_nul1.16b, vhas_nul2.16b
umaxp vend1.16b, vend1.16b, vend1.16b
mov tmp1, vend1.d[0]
cbz tmp1, L(loop)
/* Termination condition found. Now need to establish exactly why
we terminated. */
and vhas_chr1.16b, vhas_nul1.16b, vrepmask.16b
and vhas_chr2.16b, vhas_nul2.16b, vrepmask.16b
addp vend1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
addp vend1.16b, vend1.16b, vend1.16b // 128->64
mov tmp1, vend1.d[0]
L(tail):
/* Count the trailing zeros, by bit reversing... */
rbit tmp1, tmp1
/* Re-bias source. */
sub src, src, #32
clz tmp1, tmp1 /* ... and counting the leading zeros. */
/* tmp1 is twice the offset into the fragment. */
add result, src, tmp1, lsr #1
ret
END (__strchrnul_aarch64)
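
For comparison with the SIMD routine, the reference semantics of strchrnul (a
GNU extension) fit in a few lines of C: on a miss it returns a pointer to the
terminating NUL rather than NULL.

/* Naive scalar reference for strchrnul's semantics. */
static char *strchrnul_ref(const char *s, int c) {
  while (*s && (unsigned char)*s != (unsigned char)c) s++;
  return (char *)s;
}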

View file

@@ -0,0 +1,214 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Optimized Routines
Copyright (c) 1999-2022, Arm Limited.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "libc/intrin/aarch64/asmdefs.h"
#define __strcmp_aarch64 strcmp
.ident "\n\
Optimized Routines (MIT License)\n\
Copyright 2022 ARM Limited\n"
.include "libc/disclaimer.inc"
/* Assumptions:
*
* ARMv8-a, AArch64.
* MTE compatible.
*/
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define src1 x0
#define src2 x1
#define result x0
#define data1 x2
#define data1w w2
#define data2 x3
#define data2w w3
#define has_nul x4
#define diff x5
#define off1 x5
#define syndrome x6
#define tmp x6
#define data3 x7
#define zeroones x8
#define shift x9
#define off2 x10
/* On big-endian early bytes are at MSB and on little-endian LSB.
LS_FW means shifting towards early bytes. */
#ifdef __AARCH64EB__
# define LS_FW lsl
#else
# define LS_FW lsr
#endif
/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
(=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
can be done in parallel across the entire word.
Since carry propagation makes 0x1 bytes before a NUL byte appear
NUL too in big-endian, byte-reverse the data before the NUL check. */
ENTRY (__strcmp_aarch64)
PTR_ARG (0)
PTR_ARG (1)
sub off2, src2, src1
mov zeroones, REP8_01
and tmp, src1, 7
tst off2, 7
b.ne L(misaligned8)
cbnz tmp, L(mutual_align)
.p2align 4
L(loop_aligned):
ldr data2, [src1, off2]
ldr data1, [src1], 8
L(start_realigned):
#ifdef __AARCH64EB__
rev tmp, data1
sub has_nul, tmp, zeroones
orr tmp, tmp, REP8_7f
#else
sub has_nul, data1, zeroones
orr tmp, data1, REP8_7f
#endif
bics has_nul, has_nul, tmp /* Non-zero if NUL terminator. */
ccmp data1, data2, 0, eq
b.eq L(loop_aligned)
#ifdef __AARCH64EB__
rev has_nul, has_nul
#endif
eor diff, data1, data2
orr syndrome, diff, has_nul
L(end):
#ifndef __AARCH64EB__
rev syndrome, syndrome
rev data1, data1
rev data2, data2
#endif
clz shift, syndrome
/* The most-significant-non-zero bit of the syndrome marks either the
first bit that is different, or the top bit of the first zero byte.
Shifting left now will bring the critical information into the
top bits. */
lsl data1, data1, shift
lsl data2, data2, shift
/* But we need to zero-extend (char is unsigned) the value and then
perform a signed 32-bit subtraction. */
lsr data1, data1, 56
sub result, data1, data2, lsr 56
ret
.p2align 4
L(mutual_align):
/* Sources are mutually aligned, but are not currently at an
alignment boundary. Round down the addresses and then mask off
the bytes that precede the start point. */
bic src1, src1, 7
ldr data2, [src1, off2]
ldr data1, [src1], 8
neg shift, src2, lsl 3 /* Bits to alignment -64. */
mov tmp, -1
LS_FW tmp, tmp, shift
orr data1, data1, tmp
orr data2, data2, tmp
b L(start_realigned)
L(misaligned8):
/* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always
checking to make sure that we don't access beyond the end of SRC2. */
cbz tmp, L(src1_aligned)
L(do_misaligned):
ldrb data1w, [src1], 1
ldrb data2w, [src2], 1
cmp data1w, 0
ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */
b.ne L(done)
tst src1, 7
b.ne L(do_misaligned)
L(src1_aligned):
neg shift, src2, lsl 3
bic src2, src2, 7
ldr data3, [src2], 8
#ifdef __AARCH64EB__
rev data3, data3
#endif
lsr tmp, zeroones, shift
orr data3, data3, tmp
sub has_nul, data3, zeroones
orr tmp, data3, REP8_7f
bics has_nul, has_nul, tmp
b.ne L(tail)
sub off1, src2, src1
.p2align 4
L(loop_unaligned):
ldr data3, [src1, off1]
ldr data2, [src1, off2]
#ifdef __AARCH64EB__
rev data3, data3
#endif
sub has_nul, data3, zeroones
orr tmp, data3, REP8_7f
ldr data1, [src1], 8
bics has_nul, has_nul, tmp
ccmp data1, data2, 0, eq
b.eq L(loop_unaligned)
lsl tmp, has_nul, shift
#ifdef __AARCH64EB__
rev tmp, tmp
#endif
eor diff, data1, data2
orr syndrome, diff, tmp
cbnz syndrome, L(end)
L(tail):
ldr data1, [src1]
neg shift, shift
lsr data2, data3, shift
lsr has_nul, has_nul, shift
#ifdef __AARCH64EB__
rev data2, data2
rev has_nul, has_nul
#endif
eor diff, data1, data2
orr syndrome, diff, has_nul
b L(end)
L(done):
sub result, data1, data2
ret
END (__strcmp_aarch64)
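
The NUL-detection identity quoted in the header comment can be checked with a
small C helper (a sketch; the macro names mirror the ones above):

#include <stdint.h>

#define REP8_01 0x0101010101010101ull
#define REP8_7f 0x7f7f7f7f7f7f7f7full

/* Non-zero iff some byte of x is zero.  As the header comment notes,
   borrow propagation can also flag 0x01 bytes next to a real NUL, which
   is why the code byte-reverses the data on big-endian before locating
   the terminator. */
static uint64_t has_zero_byte(uint64_t x) {
  return (x - REP8_01) & ~(x | REP8_7f);
}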

View file

@@ -0,0 +1,170 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Optimized Routines
Copyright (c) 1999-2022, Arm Limited.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "libc/intrin/aarch64/asmdefs.h"
#define __strcpy_aarch64 strcpy
.ident "\n\
Optimized Routines (MIT License)\n\
Copyright 2022 ARM Limited\n"
.include "libc/disclaimer.inc"
/* Assumptions:
*
* ARMv8-a, AArch64, Advanced SIMD.
* MTE compatible.
*/
#define dstin x0
#define srcin x1
#define result x0
#define src x2
#define dst x3
#define len x4
#define synd x4
#define tmp x5
#define shift x5
#define data1 x6
#define dataw1 w6
#define data2 x7
#define dataw2 w7
#define dataq q0
#define vdata v0
#define vhas_nul v1
#define vend v2
#define dend d2
#define dataq2 q1
/*
Core algorithm:
For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
per byte. We take 4 bits of every comparison byte using the shift-right-and-narrow-by-4
instruction. Since the bits in the nibble mask reflect the order in
which things occur in the original string, counting leading zeros identifies
exactly which byte matched. */
ENTRY (__strcpy_aarch64)
PTR_ARG (0)
PTR_ARG (1)
bic src, srcin, 15
ld1 {vdata.16b}, [src]
cmeq vhas_nul.16b, vdata.16b, 0
lsl shift, srcin, 2
shrn vend.8b, vhas_nul.8h, 4
fmov synd, dend
lsr synd, synd, shift
cbnz synd, L(tail)
ldr dataq, [src, 16]!
cmeq vhas_nul.16b, vdata.16b, 0
shrn vend.8b, vhas_nul.8h, 4
fmov synd, dend
cbz synd, L(start_loop)
#ifndef __AARCH64EB__
rbit synd, synd
#endif
sub tmp, src, srcin
clz len, synd
add len, tmp, len, lsr 2
tbz len, 4, L(less16)
sub tmp, len, 15
ldr dataq, [srcin]
ldr dataq2, [srcin, tmp]
str dataq, [dstin]
str dataq2, [dstin, tmp]
ret
L(tail):
rbit synd, synd
clz len, synd
lsr len, len, 2
L(less16):
tbz len, 3, L(less8)
sub tmp, len, 7
ldr data1, [srcin]
ldr data2, [srcin, tmp]
str data1, [dstin]
str data2, [dstin, tmp]
ret
.p2align 4
L(less8):
subs tmp, len, 3
b.lo L(less4)
ldr dataw1, [srcin]
ldr dataw2, [srcin, tmp]
str dataw1, [dstin]
str dataw2, [dstin, tmp]
ret
L(less4):
cbz len, L(zerobyte)
ldrh dataw1, [srcin]
strh dataw1, [dstin]
L(zerobyte):
strb wzr, [dstin, len]
ret
.p2align 4
L(start_loop):
sub tmp, srcin, dstin
ldr dataq2, [srcin]
sub dst, src, tmp
str dataq2, [dstin]
L(loop):
str dataq, [dst], 32
ldr dataq, [src, 16]
cmeq vhas_nul.16b, vdata.16b, 0
umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
fmov synd, dend
cbnz synd, L(loopend)
str dataq, [dst, -16]
ldr dataq, [src, 32]!
cmeq vhas_nul.16b, vdata.16b, 0
umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
fmov synd, dend
cbz synd, L(loop)
add dst, dst, 16
L(loopend):
shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
fmov synd, dend
sub dst, dst, 31
#ifndef __AARCH64EB__
rbit synd, synd
#endif
clz len, synd
lsr len, len, 2
add dst, dst, len
ldr dataq, [dst, tmp]
str dataq, [dst]
ret
END (__strcpy_aarch64)

View file

@@ -0,0 +1,220 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Optimized Routines
Copyright (c) 1999-2022, Arm Limited.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "libc/intrin/aarch64/asmdefs.h"
#define __strlen_aarch64 strlen
.ident "\n\
Optimized Routines (MIT License)\n\
Copyright 2022 ARM Limited\n"
.include "libc/disclaimer.inc"
/* Assumptions:
*
* ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
* Not MTE compatible.
*/
#define srcin x0
#define len x0
#define src x1
#define data1 x2
#define data2 x3
#define has_nul1 x4
#define has_nul2 x5
#define tmp1 x4
#define tmp2 x5
#define tmp3 x6
#define tmp4 x7
#define zeroones x8
#define maskv v0
#define maskd d0
#define dataq1 q1
#define dataq2 q2
#define datav1 v1
#define datav2 v2
#define tmp x2
#define tmpw w2
#define synd x3
#define syndw w3
#define shift x4
/* For the first 32 bytes, NUL detection works on the principle that
(X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero if a
byte is zero, and can be done in parallel across the entire word. */
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
/* To test the page crossing code path more thoroughly, compile with
-DTEST_PAGE_CROSS - this will force all calls through the slower
entry path. This option is not intended for production use. */
#ifdef TEST_PAGE_CROSS
# define MIN_PAGE_SIZE 32
#else
# define MIN_PAGE_SIZE 4096
#endif
/* Core algorithm:
Since strings are short on average, we check the first 32 bytes of the
string for a NUL character without aligning the string. In order to use
unaligned loads safely we must do a page cross check first.
If there is a NUL byte we calculate the length from the 2 8-byte words
using conditional select to reduce branch mispredictions (it is unlikely
strlen will be repeatedly called on strings with the same length).
If the string is longer than 32 bytes, align src so we don't need further
page cross checks, and process 32 bytes per iteration using a fast SIMD
loop.
If the page cross check fails, we read 32 bytes from an aligned address,
and ignore any characters before the string. If it contains a NUL
character, return the length; if not, continue in the main loop. */
ENTRY (__strlen_aarch64)
PTR_ARG (0)
and tmp1, srcin, MIN_PAGE_SIZE - 1
cmp tmp1, MIN_PAGE_SIZE - 32
b.hi L(page_cross)
/* Look for a NUL byte in the first 16 bytes. */
ldp data1, data2, [srcin]
mov zeroones, REP8_01
#ifdef __AARCH64EB__
/* For big-endian, carry propagation (if the final byte in the
string is 0x01) means we cannot use has_nul1/2 directly.
Since we expect strings to be small and early-exit,
byte-swap the data now so has_nul1/2 will be correct. */
rev data1, data1
rev data2, data2
#endif
sub tmp1, data1, zeroones
orr tmp2, data1, REP8_7f
sub tmp3, data2, zeroones
orr tmp4, data2, REP8_7f
bics has_nul1, tmp1, tmp2
bic has_nul2, tmp3, tmp4
ccmp has_nul2, 0, 0, eq
b.eq L(bytes16_31)
/* Find the exact offset of the first NUL byte in the first 16 bytes
from the string start. Enter with C = has_nul1 == 0. */
csel has_nul1, has_nul1, has_nul2, cc
mov len, 8
rev has_nul1, has_nul1
csel len, xzr, len, cc
clz tmp1, has_nul1
add len, len, tmp1, lsr 3
ret
/* Look for a NUL byte at offset 16..31 in the string. */
L(bytes16_31):
ldp data1, data2, [srcin, 16]
#ifdef __AARCH64EB__
rev data1, data1
rev data2, data2
#endif
sub tmp1, data1, zeroones
orr tmp2, data1, REP8_7f
sub tmp3, data2, zeroones
orr tmp4, data2, REP8_7f
bics has_nul1, tmp1, tmp2
bic has_nul2, tmp3, tmp4
ccmp has_nul2, 0, 0, eq
b.eq L(loop_entry)
/* Find the exact offset of the first NUL byte at offset 16..31 from
the string start. Enter with C = has_nul1 == 0. */
csel has_nul1, has_nul1, has_nul2, cc
mov len, 24
rev has_nul1, has_nul1
mov tmp3, 16
clz tmp1, has_nul1
csel len, tmp3, len, cc
add len, len, tmp1, lsr 3
ret
nop
L(loop_entry):
bic src, srcin, 31
.p2align 5
L(loop):
ldp dataq1, dataq2, [src, 32]!
uminp maskv.16b, datav1.16b, datav2.16b
uminp maskv.16b, maskv.16b, maskv.16b
cmeq maskv.8b, maskv.8b, 0
fmov synd, maskd
cbz synd, L(loop)
/* Low 32 bits of synd are non-zero if a NUL was found in datav1. */
cmeq maskv.16b, datav1.16b, 0
sub len, src, srcin
cbnz syndw, 1f
cmeq maskv.16b, datav2.16b, 0
add len, len, 16
1:
/* Generate a bitmask and compute correct byte offset. */
shrn maskv.8b, maskv.8h, 4
fmov synd, maskd
#ifndef __AARCH64EB__
rbit synd, synd
#endif
clz tmp, synd
add len, len, tmp, lsr 2
ret
L(page_cross):
bic src, srcin, 31
mov tmpw, 0x0c03
movk tmpw, 0xc030, lsl 16
ld1 {datav1.16b, datav2.16b}, [src]
dup maskv.4s, tmpw
cmeq datav1.16b, datav1.16b, 0
cmeq datav2.16b, datav2.16b, 0
and datav1.16b, datav1.16b, maskv.16b
and datav2.16b, datav2.16b, maskv.16b
addp maskv.16b, datav1.16b, datav2.16b
addp maskv.16b, maskv.16b, maskv.16b
fmov synd, maskd
lsl shift, srcin, 1
lsr synd, synd, shift
cbz synd, L(loop)
rbit synd, synd
clz len, synd
lsr len, len, 1
ret
END (__strlen_aarch64)
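
The word-at-a-time NUL detection described in the comments above can be illustrated with a short C sketch (not part of this commit; it assumes a little-endian machine and that the unaligned 8-byte load is safe, which is why the assembly performs a page-cross check first):

  #include <stdint.h>
  #include <string.h>

  /* Returns the index of the first NUL in the 8 bytes at p, or -1 if none. */
  static int first_nul_index(const char *p) {
    uint64_t x, zeroones = 0x0101010101010101ull;
    memcpy(&x, p, 8);                        /* one unaligned 8-byte load */
    uint64_t has_nul = (x - zeroones) & ~x & 0x8080808080808080ull;
    if (!has_nul) return -1;                 /* no NUL in this word */
    return __builtin_ctzll(has_nul) >> 3;    /* byte index of the first NUL */
  }

On big-endian targets the loaded word is byte-reversed first (the rev instructions above), so the same trailing-zero count still walks the string in memory order.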

View file

@ -0,0 +1,334 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Optimized Routines
Copyright (c) 1999-2022, Arm Limited.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "libc/intrin/aarch64/asmdefs.h"
#define __strncmp_aarch64 strncmp
.ident "\n\
Optimized Routines (MIT License)\n\
Copyright 2022 ARM Limited\n"
.include "libc/disclaimer.inc"
/* Assumptions:
*
* ARMv8-a, AArch64.
* MTE compatible.
*/
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
/* Parameters and result. */
#define src1 x0
#define src2 x1
#define limit x2
#define result x0
/* Internal variables. */
#define data1 x3
#define data1w w3
#define data2 x4
#define data2w w4
#define has_nul x5
#define diff x6
#define syndrome x7
#define tmp1 x8
#define tmp2 x9
#define tmp3 x10
#define zeroones x11
#define pos x12
#define mask x13
#define endloop x14
#define count mask
#define offset pos
#define neg_offset x15
/* Define endian dependent shift operations.
On big-endian early bytes are at MSB and on little-endian LSB.
LS_FW means shifting towards early bytes.
LS_BK means shifting towards later bytes.
*/
#ifdef __AARCH64EB__
#define LS_FW lsl
#define LS_BK lsr
#else
#define LS_FW lsr
#define LS_BK lsl
#endif
ENTRY (__strncmp_aarch64)
PTR_ARG (0)
PTR_ARG (1)
SIZE_ARG (2)
cbz limit, L(ret0)
eor tmp1, src1, src2
mov zeroones, #REP8_01
tst tmp1, #7
and count, src1, #7
b.ne L(misaligned8)
cbnz count, L(mutual_align)
/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
(=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
can be done in parallel across the entire word. */
.p2align 4
L(loop_aligned):
ldr data1, [src1], #8
ldr data2, [src2], #8
L(start_realigned):
subs limit, limit, #8
sub tmp1, data1, zeroones
orr tmp2, data1, #REP8_7f
eor diff, data1, data2 /* Non-zero if differences found. */
csinv endloop, diff, xzr, hi /* Last Dword or differences. */
bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
ccmp endloop, #0, #0, eq
b.eq L(loop_aligned)
/* End of main loop */
L(full_check):
#ifndef __AARCH64EB__
orr syndrome, diff, has_nul
add limit, limit, 8 /* Rewind limit to before last subs. */
L(syndrome_check):
/* Limit was reached. Check if the NUL byte or the difference
is before the limit. */
rev syndrome, syndrome
rev data1, data1
clz pos, syndrome
rev data2, data2
lsl data1, data1, pos
cmp limit, pos, lsr #3
lsl data2, data2, pos
/* But we need to zero-extend (char is unsigned) the value and then
perform a signed 32-bit subtraction. */
lsr data1, data1, #56
sub result, data1, data2, lsr #56
csel result, result, xzr, hi
ret
#else
/* Not reached the limit, must have found the end or a diff. */
tbz limit, #63, L(not_limit)
add tmp1, limit, 8
cbz limit, L(not_limit)
lsl limit, tmp1, #3 /* Bits -> bytes. */
mov mask, #~0
lsr mask, mask, limit
bic data1, data1, mask
bic data2, data2, mask
/* Make sure that the NUL byte is marked in the syndrome. */
orr has_nul, has_nul, mask
L(not_limit):
/* For big-endian we cannot use the trick with the syndrome value
as carry-propagation can corrupt the upper bits if the trailing
bytes in the string contain 0x01. */
/* However, if there is no NUL byte in the dword, we can generate
the result directly. We can't just subtract the bytes as the
MSB might be significant. */
cbnz has_nul, 1f
cmp data1, data2
cset result, ne
cneg result, result, lo
ret
1:
/* Re-compute the NUL-byte detection, using a byte-reversed value. */
rev tmp3, data1
sub tmp1, tmp3, zeroones
orr tmp2, tmp3, #REP8_7f
bic has_nul, tmp1, tmp2
rev has_nul, has_nul
orr syndrome, diff, has_nul
clz pos, syndrome
/* The most-significant-non-zero bit of the syndrome marks either the
first bit that is different, or the top bit of the first zero byte.
Shifting left now will bring the critical information into the
top bits. */
L(end_quick):
lsl data1, data1, pos
lsl data2, data2, pos
/* But we need to zero-extend (char is unsigned) the value and then
perform a signed 32-bit subtraction. */
lsr data1, data1, #56
sub result, data1, data2, lsr #56
ret
#endif
L(mutual_align):
/* Sources are mutually aligned, but are not currently at an
alignment boundary. Round down the addresses and then mask off
the bytes that precede the start point.
We also need to adjust the limit calculations, but without
overflowing if the limit is near ULONG_MAX. */
bic src1, src1, #7
bic src2, src2, #7
ldr data1, [src1], #8
neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */
ldr data2, [src2], #8
mov tmp2, #~0
LS_FW tmp2, tmp2, tmp3 /* Shift (count & 63). */
/* Adjust the limit and ensure it doesn't overflow. */
adds limit, limit, count
csinv limit, limit, xzr, lo
orr data1, data1, tmp2
orr data2, data2, tmp2
b L(start_realigned)
.p2align 4
/* Don't bother with dwords for up to 16 bytes. */
L(misaligned8):
cmp limit, #16
b.hs L(try_misaligned_words)
L(byte_loop):
/* Perhaps we can do better than this. */
ldrb data1w, [src1], #1
ldrb data2w, [src2], #1
subs limit, limit, #1
ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */
ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
b.eq L(byte_loop)
L(done):
sub result, data1, data2
ret
/* Align the SRC1 to a dword by doing a bytewise compare and then do
the dword loop. */
L(try_misaligned_words):
cbz count, L(src1_aligned)
neg count, count
and count, count, #7
sub limit, limit, count
L(page_end_loop):
ldrb data1w, [src1], #1
ldrb data2w, [src2], #1
cmp data1w, #1
ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
b.ne L(done)
subs count, count, #1
b.hi L(page_end_loop)
/* The following diagram explains the comparison of misaligned strings.
The bytes are shown in natural order. For little-endian, it is
reversed in the registers. The "x" bytes are before the string.
The "|" separates data that is loaded at one time.
src1 | a a a a a a a a | b b b c c c c c | . . .
src2 | x x x x x a a a a a a a a b b b | c c c c c . . .
After shifting in each step, the data looks like this:
STEP_A STEP_B STEP_C
data1 a a a a a a a a b b b c c c c c b b b c c c c c
data2 a a a a a a a a b b b 0 0 0 0 0 0 0 0 c c c c c
The bytes with "0" are eliminated from the syndrome via mask.
Align SRC2 down to 16 bytes. This way we can read 16 bytes at a
time from SRC2. The comparison happens in 3 steps. After each step
the loop can exit, or read from SRC1 or SRC2. */
L(src1_aligned):
/* Calculate the offset from 8-byte alignment to the string start, in bits.
No need to mask the offset since shifts ignore the upper bits. */
lsl offset, src2, #3
bic src2, src2, #0xf
mov mask, -1
neg neg_offset, offset
ldr data1, [src1], #8
ldp tmp1, tmp2, [src2], #16
LS_BK mask, mask, neg_offset
and neg_offset, neg_offset, #63 /* Need actual value for cmp later. */
/* Skip the first compare if data in tmp1 is irrelevant. */
tbnz offset, 6, L(misaligned_mid_loop)
L(loop_misaligned):
/* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/
LS_FW data2, tmp1, offset
LS_BK tmp1, tmp2, neg_offset
subs limit, limit, #8
orr data2, data2, tmp1 /* 8 bytes from SRC2 combined from two regs.*/
sub has_nul, data1, zeroones
eor diff, data1, data2 /* Non-zero if differences found. */
orr tmp3, data1, #REP8_7f
csinv endloop, diff, xzr, hi /* If limit, set to all ones. */
bic has_nul, has_nul, tmp3 /* Non-zero if NUL byte found in SRC1. */
orr tmp3, endloop, has_nul
cbnz tmp3, L(full_check)
ldr data1, [src1], #8
L(misaligned_mid_loop):
/* STEP_B: Compare first part of data1 to second part of tmp2. */
LS_FW data2, tmp2, offset
#ifdef __AARCH64EB__
/* For big-endian we do a byte reverse to avoid carry-propagation
problem described above. This way we can reuse the has_nul in the
next step and also use syndrome value trick at the end. */
rev tmp3, data1
#define data1_fixed tmp3
#else
#define data1_fixed data1
#endif
sub has_nul, data1_fixed, zeroones
orr tmp3, data1_fixed, #REP8_7f
eor diff, data2, data1 /* Non-zero if differences found. */
bic has_nul, has_nul, tmp3 /* Non-zero if NUL terminator. */
#ifdef __AARCH64EB__
rev has_nul, has_nul
#endif
cmp limit, neg_offset, lsr #3
orr syndrome, diff, has_nul
bic syndrome, syndrome, mask /* Ignore later bytes. */
csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
cbnz tmp3, L(syndrome_check)
/* STEP_C: Compare second part of data1 to first part of tmp1. */
ldp tmp1, tmp2, [src2], #16
cmp limit, #8
LS_BK data2, tmp1, neg_offset
eor diff, data2, data1 /* Non-zero if differences found. */
orr syndrome, diff, has_nul
and syndrome, syndrome, mask /* Ignore earlier bytes. */
csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
cbnz tmp3, L(syndrome_check)
ldr data1, [src1], #8
sub limit, limit, #8
b L(loop_misaligned)
#ifdef __AARCH64EB__
L(syndrome_check):
clz pos, syndrome
cmp pos, limit, lsl #3
b.lo L(end_quick)
#endif
L(ret0):
mov result, #0
ret
END(__strncmp_aarch64)
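
A hedged C sketch of the little-endian syndrome logic reached via L(full_check) above (illustrative only, not part of this commit): given two 8-byte words loaded from each string, it finds the first byte that differs or is NUL and produces the strncmp-style result, ignoring the limit handling for brevity.

  #include <stdint.h>

  /* data1/data2 hold 8 bytes from each string, little-endian byte order. */
  static int compare_words_le(uint64_t data1, uint64_t data2) {
    uint64_t zeroones = 0x0101010101010101ull;
    uint64_t has_nul = (data1 - zeroones) & ~data1 & 0x8080808080808080ull;
    uint64_t diff = data1 ^ data2;              /* non-zero if any byte differs */
    uint64_t syndrome = diff | has_nul;
    if (!syndrome) return 0;                    /* equal so far and no NUL yet */
    int shift = __builtin_ctzll(syndrome) & ~7; /* bit offset of first event byte */
    unsigned char a = data1 >> shift;           /* zero-extend: char is unsigned */
    unsigned char b = data2 >> shift;
    return a - b;
  }

The assembly uses rev followed by clz to reach the same byte that the count-trailing-zeros above identifies.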

View file

@ -0,0 +1,128 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Optimized Routines
Copyright (c) 1999-2022, Arm Limited.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "libc/intrin/aarch64/asmdefs.h"
#define __strnlen_aarch64 strnlen
.ident "\n\
Optimized Routines (MIT License)\n\
Copyright 2022 ARM Limited\n"
.include "libc/disclaimer.inc"
/* Assumptions:
*
* ARMv8-a, AArch64, Advanced SIMD.
* MTE compatible.
*/
#define srcin x0
#define cntin x1
#define result x0
#define src x2
#define synd x3
#define shift x4
#define tmp x4
#define cntrem x5
#define qdata q0
#define vdata v0
#define vhas_chr v1
#define vend v2
#define dend d2
/*
Core algorithm:
Process the string in 16-byte aligned chunks. Compute a 64-bit mask with
four bits per byte using the shrn instruction. A count-trailing-zeros operation
then identifies the first zero byte. */
ENTRY (__strnlen_aarch64)
PTR_ARG (0)
SIZE_ARG (1)
bic src, srcin, 15
cbz cntin, L(nomatch)
ld1 {vdata.16b}, [src]
cmeq vhas_chr.16b, vdata.16b, 0
lsl shift, srcin, 2
shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
fmov synd, dend
lsr synd, synd, shift
cbz synd, L(start_loop)
L(finish):
rbit synd, synd
clz synd, synd
lsr result, synd, 2
cmp cntin, result
csel result, cntin, result, ls
ret
L(nomatch):
mov result, cntin
ret
L(start_loop):
sub tmp, src, srcin
add tmp, tmp, 17
subs cntrem, cntin, tmp
b.lo L(nomatch)
/* Make sure that it won't overread by a 16-byte chunk */
tbz cntrem, 4, L(loop32_2)
sub src, src, 16
.p2align 5
L(loop32):
ldr qdata, [src, 32]!
cmeq vhas_chr.16b, vdata.16b, 0
umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
fmov synd, dend
cbnz synd, L(end)
L(loop32_2):
ldr qdata, [src, 16]
subs cntrem, cntrem, 32
cmeq vhas_chr.16b, vdata.16b, 0
b.lo L(end_2)
umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
fmov synd, dend
cbz synd, L(loop32)
L(end_2):
add src, src, 16
L(end):
shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
sub result, src, srcin
fmov synd, dend
#ifndef __AARCH64EB__
rbit synd, synd
#endif
clz synd, synd
add result, result, synd, lsr 2
cmp cntin, result
csel result, cntin, result, ls
ret
END (__strnlen_aarch64)
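
The "four bits per byte" mask built with shrn, as described in the core-algorithm comment above, can be sketched with NEON intrinsics. This is illustrative and hedged: it assumes a little-endian target and the <arm_neon.h> intrinsics; the helper name is not part of this commit.

  #include <arm_neon.h>
  #include <stdint.h>

  /* Returns the index of the first NUL among the 16 bytes at p, or -1 if none. */
  static int first_nul_in_16(const uint8_t *p) {
    uint8x16_t data = vld1q_u8(p);
    uint8x16_t eq0  = vceqq_u8(data, vdupq_n_u8(0));   /* 0xff for each zero byte */
    /* Narrowing shift: 128-bit compare result -> 64-bit mask, 4 bits per byte. */
    uint8x8_t nibbles = vshrn_n_u16(vreinterpretq_u16_u8(eq0), 4);
    uint64_t synd = vget_lane_u64(vreinterpret_u64_u8(nibbles), 0);
    if (!synd) return -1;
    return __builtin_ctzll(synd) >> 2;                 /* 4 mask bits per input byte */
  }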

View file

@ -0,0 +1,175 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Optimized Routines
Copyright (c) 1999-2022, Arm Limited.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "libc/intrin/aarch64/asmdefs.h"
#define __strrchr_aarch64 strrchr
.ident "\n\
Optimized Routines (MIT License)\n\
Copyright 2022 ARM Limited\n"
.include "libc/disclaimer.inc"
/* Assumptions:
*
* ARMv8-a, AArch64
* Neon Available.
*/
/* Arguments and results. */
#define srcin x0
#define chrin w1
#define result x0
#define src x2
#define tmp1 x3
#define wtmp2 w4
#define tmp3 x5
#define src_match x6
#define src_offset x7
#define const_m1 x8
#define tmp4 x9
#define nul_match x10
#define chr_match x11
#define vrepchr v0
#define vdata1 v1
#define vdata2 v2
#define vhas_nul1 v3
#define vhas_nul2 v4
#define vhas_chr1 v5
#define vhas_chr2 v6
#define vrepmask_0 v7
#define vrepmask_c v16
#define vend1 v17
#define vend2 v18
/* Core algorithm.
For each 32-byte hunk we calculate a 64-bit syndrome value, with
two bits per byte (LSB is always in bits 0 and 1, for both big
and little-endian systems). For each tuple, bit 0 is set iff
the relevant byte matched the requested character; bit 1 is set
iff the relevant byte matched the NUL end of string (we trigger
off bit 0 for the special case of looking for NUL). Since the bits
in the syndrome reflect exactly the order in which things occur
in the original string, a count_trailing_zeros() operation will
identify exactly which byte is causing the termination, and why. */
ENTRY (__strrchr_aarch64)
PTR_ARG (0)
/* Magic constant 0x40100401 to allow us to identify which lane
matches the requested byte. Magic constant 0x80200802 used
similarly for NUL termination. */
mov wtmp2, #0x0401
movk wtmp2, #0x4010, lsl #16
dup vrepchr.16b, chrin
bic src, srcin, #31 /* Work with aligned 32-byte hunks. */
dup vrepmask_c.4s, wtmp2
mov src_offset, #0
ands tmp1, srcin, #31
add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */
b.eq L(aligned)
/* Input string is not 32-byte aligned. Rather than forcing
the padding bytes to a safe value, we calculate the syndrome
for all the bytes, but then mask off those bits of the
syndrome that are related to the padding. */
ld1 {vdata1.16b, vdata2.16b}, [src], #32
neg tmp1, tmp1
cmeq vhas_nul1.16b, vdata1.16b, #0
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
cmeq vhas_nul2.16b, vdata2.16b, #0
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b // 256->128
addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
addp vend1.16b, vhas_nul1.16b, vhas_chr1.16b // 128->64
mov nul_match, vend1.d[0]
lsl tmp1, tmp1, #1
mov const_m1, #~0
lsr tmp3, const_m1, tmp1
mov chr_match, vend1.d[1]
bic nul_match, nul_match, tmp3 // Mask padding bits.
bic chr_match, chr_match, tmp3 // Mask padding bits.
cbnz nul_match, L(tail)
.p2align 4
L(loop):
cmp chr_match, #0
csel src_match, src, src_match, ne
csel src_offset, chr_match, src_offset, ne
L(aligned):
ld1 {vdata1.16b, vdata2.16b}, [src], #32
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
uminp vend1.16b, vdata1.16b, vdata2.16b
and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
cmeq vend1.16b, vend1.16b, 0
addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
addp vend1.16b, vend1.16b, vhas_chr1.16b // 128->64
mov nul_match, vend1.d[0]
mov chr_match, vend1.d[1]
cbz nul_match, L(loop)
cmeq vhas_nul1.16b, vdata1.16b, #0
cmeq vhas_nul2.16b, vdata2.16b, #0
and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b
addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b
mov nul_match, vhas_nul1.d[0]
L(tail):
/* Work out exactly where the string ends. */
sub tmp4, nul_match, #1
eor tmp4, tmp4, nul_match
ands chr_match, chr_match, tmp4
/* And pick the values corresponding to the last match. */
csel src_match, src, src_match, ne
csel src_offset, chr_match, src_offset, ne
/* Count down from the top of the syndrome to find the last match. */
clz tmp3, src_offset
/* Src_match points beyond the word containing the match, so we can
simply subtract half the bit-offset into the syndrome. Because
we are counting down, we need to go back one more character. */
add tmp3, tmp3, #2
sub result, src_match, tmp3, lsr #1
/* But if the syndrome shows no match was found, then return NULL. */
cmp src_offset, #0
csel result, result, xzr, ne
ret
END (__strrchr_aarch64)
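
Stripped of the SIMD syndrome machinery, the strategy above is the familiar forward scan that remembers the most recent match and resolves the answer at the terminator. A minimal scalar sketch (illustrative only, not part of this commit):

  /* ISO semantics: a NUL search argument matches the terminator itself. */
  static char *strrchr_sketch(const char *s, int c) {
    const char *last = 0;
    for (;; ++s) {
      if (*s == (char)c) last = s;
      if (!*s) return (char *)last;
    }
  }

The vector code does the same thing 32 bytes at a time: chr_match and src_offset remember the last hunk containing the character, and once the NUL hunk is found the tail code masks off matches that lie beyond the terminator before counting down from the top of the syndrome.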

View file

@ -6,6 +6,7 @@ PKGS += LIBC_INTRIN
LIBC_INTRIN_ARTIFACTS += LIBC_INTRIN_A
LIBC_INTRIN = $(LIBC_INTRIN_A_DEPS) $(LIBC_INTRIN_A)
LIBC_INTRIN_A = o/$(MODE)/libc/intrin/intrin.a
LIBC_INTRIN_A_FILES := $(wildcard libc/intrin/*)
LIBC_INTRIN_A_HDRS = $(filter %.h,$(LIBC_INTRIN_A_FILES))
LIBC_INTRIN_A_INCS = $(filter %.inc,$(LIBC_INTRIN_A_FILES))
LIBC_INTRIN_A_SRCS_S = $(filter %.S,$(LIBC_INTRIN_A_FILES))
@ -13,8 +14,9 @@ LIBC_INTRIN_A_SRCS_C = $(filter %.c,$(LIBC_INTRIN_A_FILES))
LIBC_INTRIN_A_SRCS = $(LIBC_INTRIN_A_SRCS_S) $(LIBC_INTRIN_A_SRCS_C)
LIBC_INTRIN_A_CHECKS = $(LIBC_INTRIN_A).pkg
LIBC_INTRIN_A_FILES := \
$(wildcard libc/intrin/*)
ifeq ($(ARCH), aarch64)
LIBC_INTRIN_A_SRCS_S += $(wildcard libc/intrin/aarch64/*.S)
endif
LIBC_INTRIN_A_OBJS = \
$(LIBC_INTRIN_A_SRCS_S:%.S=o/$(MODE)/%.o) \
@ -203,6 +205,8 @@ o/$(MODE)/libc/intrin/memmove.o: private \
-fpie
# these assembly files are safe to build on aarch64
o/$(MODE)/libc/intrin/aarch64/%.o: libc/intrin/aarch64/%.S
@$(COMPILE) -AOBJECTIFY.S $(OBJECTIFY.S) $(OUTPUT_OPTION) -c $<
o/$(MODE)/libc/intrin/fenv.o: libc/intrin/fenv.S
@$(COMPILE) -AOBJECTIFY.S $(OBJECTIFY.S) $(OUTPUT_OPTION) -c $<
o/$(MODE)/libc/intrin/futex.o: libc/intrin/futex.S

View file

@ -20,6 +20,7 @@
#include "libc/intrin/asan.internal.h"
#include "libc/nexgen32e/x86feature.h"
#include "libc/str/str.h"
#ifndef __aarch64__
typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(1)));
@ -83,3 +84,5 @@ void *memchr(const void *s, int c, size_t n) {
return memchr_pure(s, c, n);
#endif
}
#endif /* __aarch64__ */

View file

@ -20,6 +20,7 @@
#include "libc/intrin/likely.h"
#include "libc/nexgen32e/x86feature.h"
#include "libc/str/str.h"
#ifndef __aarch64__
#define PMOVMSKB(x) __builtin_ia32_pmovmskb128(x)
@ -129,7 +130,9 @@ microarchitecture("avx") static int memcmp_avx(const unsigned char *p,
* memcmp n=32768 29 ps/byte 32,851 mb/s
* memcmp n=131072 33 ps/byte 28,983 mb/s
*
* @return unsigned char subtraction at stop index
* @return an integer that's (1) equal to zero if `a` is equal to `b`,
* (2) less than zero if `a` is less than `b`, or (3) greater than
* zero if `a` is greater than `b`
* @asyncsignalsafe
*/
int memcmp(const void *a, const void *b, size_t n) {
@ -200,3 +203,5 @@ int memcmp(const void *a, const void *b, size_t n) {
}
return 0;
}
#endif /* __aarch64__ */
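
For example, under the contract documented above (a minimal sketch, not from this commit):

  #include <assert.h>
  #include <string.h>

  int main(void) {
    assert(memcmp("abc", "abc", 3) == 0);
    assert(memcmp("abc", "abd", 3) < 0);   /* 'c' < 'd' at index 2 */
    assert(memcmp("abd", "abc", 3) > 0);
    return 0;
  }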

View file

@ -22,6 +22,7 @@
#include "libc/nexgen32e/nexgen32e.h"
#include "libc/nexgen32e/x86feature.h"
#include "libc/str/str.h"
#ifndef __aarch64__
typedef long long xmm_t __attribute__((__vector_size__(16), __aligned__(1)));
typedef long long xmm_a __attribute__((__vector_size__(16), __aligned__(16)));
@ -343,3 +344,5 @@ void *memmove(void *dst, const void *src, size_t n) {
asm("memcpy = memmove\n\t"
".globl\tmemcpy");
#endif /* __aarch64__ */

View file

@ -20,6 +20,7 @@
#include "libc/intrin/asan.internal.h"
#include "libc/nexgen32e/x86feature.h"
#include "libc/str/str.h"
#ifndef __aarch64__
typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(1)));
@ -81,3 +82,5 @@ void *memrchr(const void *s, int c, size_t n) {
return memrchr_pure(s, c, n);
#endif
}
#endif /* __aarch64__ */

View file

@ -22,6 +22,7 @@
#include "libc/nexgen32e/nexgen32e.h"
#include "libc/nexgen32e/x86feature.h"
#include "libc/str/str.h"
#ifndef __aarch64__
typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(1)));
typedef long long xmm_a __attribute__((__vector_size__(16), __aligned__(16)));
@ -168,3 +169,5 @@ void *memset(void *p, int c, size_t n) {
return memset_sse(b, c, n);
}
}
#endif /* __aarch64__ */

View file

@ -17,6 +17,9 @@
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/str/str.h"
#ifndef __aarch64__
// TODO(jart): ASAN support here is important.
typedef char xmm_u __attribute__((__vector_size__(16), __aligned__(1)));
typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(16)));
@ -63,3 +66,5 @@ char *stpcpy(char *d, const char *s) {
++i;
}
}
#endif /* __aarch64__ */

View file

@ -21,6 +21,7 @@
#include "libc/intrin/asan.internal.h"
#include "libc/nexgen32e/x86feature.h"
#include "libc/str/str.h"
#ifndef __aarch64__
static inline const char *strchr_pure(const char *s, int c) {
for (;; ++s) {
@ -115,3 +116,5 @@ char *strchr(const char *s, int c) {
return r;
#endif
}
#endif /* __aarch64__ */

View file

@ -21,6 +21,7 @@
#include "libc/intrin/asan.internal.h"
#include "libc/nexgen32e/x86feature.h"
#include "libc/str/str.h"
#ifndef __aarch64__
static inline const char *strchrnul_pure(const char *s, int c) {
for (;; ++s) {
@ -113,3 +114,5 @@ char *strchrnul(const char *s, int c) {
return r;
#endif
}
#endif /* __aarch64__ */

View file

@ -17,6 +17,9 @@
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/str/str.h"
#ifndef __aarch64__
// TODO(jart): ASAN support here is important.
typedef char xmm_u __attribute__((__vector_size__(16), __aligned__(1)));
typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(16)));
@ -63,3 +66,5 @@ char *strcpy(char *d, const char *s) {
++i;
}
}
#endif /* __aarch64__ */

View file

@ -19,6 +19,7 @@
#include "libc/dce.h"
#include "libc/intrin/asan.internal.h"
#include "libc/str/str.h"
#ifndef __aarch64__
/**
* Returns length of NUL-terminated string.
@ -61,3 +62,5 @@ noasan size_t strlen(const char *s) {
return n;
#endif
}
#endif /* __aarch64__ */

View file

@ -17,6 +17,7 @@
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/str/str.h"
#ifndef __aarch64__
/**
* Compares NUL-terminated strings w/ limit.
@ -32,3 +33,5 @@ int strncmp(const char *a, const char *b, size_t n) {
while (i < n && a[i] == b[i] && b[i]) ++i;
return (a[i] & 0xff) - (b[i] & 0xff);
}
#endif /* __aarch64__ */

View file

@ -21,6 +21,7 @@
#include "libc/intrin/asan.internal.h"
#include "libc/intrin/bits.h"
#include "libc/str/str.h"
#ifndef __aarch64__
static noasan size_t strnlen_x64(const char *s, size_t n, size_t i) {
uint64_t w;
@ -56,3 +57,5 @@ noasan size_t strnlen(const char *s, size_t n) {
if (IsAsan()) __asan_verify(s, i);
return i;
}
#endif /* __aarch64__ */

View file

@ -17,6 +17,7 @@
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/str/str.h"
#ifndef __aarch64__
/**
* Searches for last instance of character in string.
@ -29,3 +30,5 @@
char *strrchr(const char *s, int c) {
return memrchr(s, c, strlen(s));
}
#endif /* __aarch64__ */

View file

@ -36,10 +36,16 @@ STATIC_YOINK("strerror_wr");
/**
* Handles failure of CHECK_xx() macros.
*/
relegated void __check_fail(const char *suffix, const char *opstr,
uint64_t want, const char *wantstr, uint64_t got,
const char *gotstr, const char *file, int line,
const char *fmt, ...) {
relegated void __check_fail(const char *suffix, //
const char *opstr, //
uint64_t want, //
const char *wantstr, //
uint64_t got, //
const char *gotstr, //
const char *file, //
int line, //
const char *fmt, //
...) {
int e;
char *p;
size_t i;

View file

@ -33,21 +33,69 @@
*
* @see libc/log/thunks/__check_fail_ndebug.S
*/
relegated wontreturn void __check_fail_ndebug(uint64_t want, uint64_t got,
const char *file, int line,
const char *opchar,
const char *fmt, ...) {
va_list va;
static relegated wontreturn void __check_fail_ndebug(uint64_t want, //
uint64_t got, //
const char *file, //
int line, //
const char *opchar, //
const char *fmt, //
va_list va) {
__restore_tty();
kprintf("%rerror:%s:%d: check failed: %'ld %s %'ld% m", file, line, want,
opchar, got);
if (*fmt) {
if (fmt && *fmt) {
kprintf(": ");
va_start(va, fmt);
kvprintf(fmt, va);
va_end(va);
}
kprintf("\n");
if (_weaken(__die)) _weaken(__die)();
_Exitr(68);
}
void __check_fail_eq(uint64_t want, uint64_t got, const char *file, int line,
const char *opchar, const char *fmt, ...) {
va_list va;
va_start(va, fmt);
__check_fail_ndebug(want, got, file, line, opchar, fmt, va);
va_end(va);
}
void __check_fail_ne(uint64_t want, uint64_t got, const char *file, int line,
const char *opchar, const char *fmt, ...) {
va_list va;
va_start(va, fmt);
__check_fail_ndebug(want, got, file, line, opchar, fmt, va);
va_end(va);
}
void __check_fail_le(uint64_t want, uint64_t got, const char *file, int line,
const char *opchar, const char *fmt, ...) {
va_list va;
va_start(va, fmt);
__check_fail_ndebug(want, got, file, line, opchar, fmt, va);
va_end(va);
}
void __check_fail_lt(uint64_t want, uint64_t got, const char *file, int line,
const char *opchar, const char *fmt, ...) {
va_list va;
va_start(va, fmt);
__check_fail_ndebug(want, got, file, line, opchar, fmt, va);
va_end(va);
}
void __check_fail_ge(uint64_t want, uint64_t got, const char *file, int line,
const char *opchar, const char *fmt, ...) {
va_list va;
va_start(va, fmt);
__check_fail_ndebug(want, got, file, line, opchar, fmt, va);
va_end(va);
}
void __check_fail_gt(uint64_t want, uint64_t got, const char *file, int line,
const char *opchar, const char *fmt, ...) {
va_list va;
va_start(va, fmt);
__check_fail_ndebug(want, got, file, line, opchar, fmt, va);
va_end(va);
}

View file

@ -6,9 +6,7 @@ PKGS += LIBC_LOG
LIBC_LOG_ARTIFACTS += LIBC_LOG_A
LIBC_LOG = $(LIBC_LOG_A_DEPS) $(LIBC_LOG_A)
LIBC_LOG_A = o/$(MODE)/libc/log/log.a
LIBC_LOG_A_FILES := \
$(wildcard libc/log/thunks/*) \
$(wildcard libc/log/*)
LIBC_LOG_A_FILES := $(wildcard libc/log/*)
LIBC_LOG_A_HDRS = $(filter %.h,$(LIBC_LOG_A_FILES))
LIBC_LOG_A_SRCS_C = $(filter %.c,$(LIBC_LOG_A_FILES))
LIBC_LOG_A_SRCS_S = $(filter %.S,$(LIBC_LOG_A_FILES))

View file

@ -1,30 +0,0 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Copyright 2020 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/macros.internal.h"
.text.unlikely
// Code-size saving thunk for CHECK_EQ() in NDEBUG mode.
__check_fail_eq:
lea .Lop(%rip),%r8
jmp __check_fail_ndebug
.endfn __check_fail_eq,globl
.rodata.str1.1
.Lop: .asciz "=="
.previous

View file

@ -1,30 +0,0 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Copyright 2020 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/macros.internal.h"
.text.unlikely
// Code-size saving thunk for CHECK_GE() in NDEBUG mode.
__check_fail_ge:
lea .Lop(%rip),%r8
jmp __check_fail_ndebug
.endfn __check_fail_ge,globl
.rodata.str1.1
.Lop: .asciz ">="
.previous

View file

@ -1,30 +0,0 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Copyright 2020 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/macros.internal.h"
.text.unlikely
// Code-size saving thunk for CHECK_GT() in NDEBUG mode.
__check_fail_gt:
lea .Lop(%rip),%r8
jmp __check_fail_ndebug
.endfn __check_fail_gt,globl
.rodata.str1.1
.Lop: .asciz ">"
.previous

View file

@ -1,30 +0,0 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Copyright 2020 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/macros.internal.h"
.text.unlikely
// Code-size saving thunk for CHECK_LE() in NDEBUG mode.
__check_fail_le:
lea .Lop(%rip),%r8
jmp __check_fail_ndebug
.endfn __check_fail_le,globl
.rodata.str1.1
.Lop: .asciz "<="
.previous

View file

@ -1,30 +0,0 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Copyright 2020 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/macros.internal.h"
.text.unlikely
// Code-size saving thunk for CHECK_LT() in NDEBUG mode.
__check_fail_lt:
lea .Lop(%rip),%r8
jmp __check_fail_ndebug
.endfn __check_fail_lt,globl
.rodata.str1.1
.Lop: .asciz "<"
.previous

View file

@ -1,30 +0,0 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Copyright 2020 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/macros.internal.h"
.text.unlikely
// Code-size saving thunk for CHECK_NE() in NDEBUG mode.
__check_fail_ne:
lea .Lop(%rip),%r8
jmp __check_fail_ndebug
.endfn __check_fail_ne,globl
.rodata.str1.1
.Lop: .asciz "!="
.previous

View file

@ -43,8 +43,22 @@
Ticks; \
})
#else
#define __startbench() rdtsc()
#define __endbench() rdtsc()
#define __startbench() \
({ \
uint64_t _ts; \
asm volatile("isb" ::: "memory"); \
_ts = rdtsc(); \
asm volatile("isb" ::: "memory"); \
_ts; \
})
#define __endbench() \
({ \
uint64_t _ts; \
asm volatile("isb" ::: "memory"); \
_ts = rdtsc(); \
asm volatile("isb" ::: "memory"); \
_ts; \
})
#endif
#define __startbench_m() mfence_lfence_rdtsc_lfence()

View file

@ -1,262 +0,0 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Copyright 2020 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/macros.internal.h"
// Computes Phil Katz CRC-32 w/ carryless multiply isa.
//
// This is support code that's abstracted by crc32_z().
//
// @param edi is initial value
// @param rsi points to buffer
// @param rdx is bytes in buffer that's >=64 and %16==0
// @return eax is crc32
// @note needs Westmere (c.2010) or Bulldozer (c.2011)
// @see “Fast CRC Computation for Generic Polynomials Using
// PCLMULQDQ Instruction V. Gopal, E. Ozturk, et al.,
// 2009, intel.ly/2ySEwL0
crc32_pclmul:
.leafprologue
.profilable
movdqu (%rsi),%xmm7
movd %edi,%xmm1
movdqu 16(%rsi),%xmm9
movdqu 32(%rsi),%xmm4
movdqu 48(%rsi),%xmm0
lea -64(%rdx),%rdi
lea 64(%rsi),%rcx
pxor %xmm7,%xmm1
movdqa .Lk1k2(%rip),%xmm8
cmp $63,%rdi
jbe 2f
lea -128(%rdx),%rdi
mov %rdi,%rdx
shr $6,%rdx
lea 2(%rdx),%rax
sal $6,%rax
add %rax,%rsi
mov %rcx,%rax
3: add $64,%rax
movdqa %xmm1,%xmm7
movdqa %xmm4,%xmm5
movdqa %xmm0,%xmm3
movdqa %xmm9,%xmm6
movdqa %xmm9,%xmm2
movdqu -48(%rax),%xmm9
pclmullqlqdq %xmm8,%xmm7
pclmullqlqdq %xmm8,%xmm6
pclmullqlqdq %xmm8,%xmm5
pclmulhqhqdq %xmm8,%xmm1
pclmulhqhqdq %xmm8,%xmm2
pclmulhqhqdq %xmm8,%xmm4
pxor %xmm7,%xmm1
movdqu -64(%rax),%xmm7
pxor %xmm6,%xmm2
pxor %xmm5,%xmm4
movdqu -32(%rax),%xmm6
movdqu -16(%rax),%xmm5
pclmullqlqdq %xmm8,%xmm3
pclmulhqhqdq %xmm8,%xmm0
pxor %xmm7,%xmm1
pxor %xmm3,%xmm0
pxor %xmm2,%xmm9
pxor %xmm6,%xmm4
pxor %xmm5,%xmm0
cmp %rsi,%rax
jne 3b
lea 1(%rdx),%rax
sal $6,%rdx
sal $6,%rax
sub %rdx,%rdi
add %rax,%rcx
2: movdqa .Lk3k4(%rip),%xmm3
movdqa %xmm1,%xmm2
movdqa %xmm1,%xmm5
pclmulhqhqdq %xmm3,%xmm2
pclmullqlqdq %xmm3,%xmm5
pxor %xmm9,%xmm2
pxor %xmm5,%xmm2
movdqa %xmm2,%xmm5
pclmulhqhqdq %xmm3,%xmm2
movdqa %xmm2,%xmm1
pclmullqlqdq %xmm3,%xmm5
pxor %xmm4,%xmm1
pxor %xmm5,%xmm1
movdqa %xmm1,%xmm2
pclmulhqhqdq %xmm3,%xmm1
pclmullqlqdq %xmm3,%xmm2
pxor %xmm1,%xmm0
pxor %xmm2,%xmm0
cmp $15,%rdi
jbe 4f
sub $16,%rdi
mov %rcx,%rax
and $-16,%rdi
lea 16(%rcx,%rdi),%rdx
5: movdqa %xmm0,%xmm1
movdqu (%rax),%xmm6
pclmulhqhqdq %xmm3,%xmm0
add $16,%rax
pclmullqlqdq %xmm3,%xmm1
pxor %xmm1,%xmm0
pxor %xmm6,%xmm0
cmp %rax,%rdx
jne 5b
4: movdqa %xmm0,%xmm1
movdqa .Lboop(%rip),%xmm2
psrldq $8,%xmm0
pclmullqhqdq %xmm3,%xmm1
movdqa .Lpoly(%rip),%xmm3
pxor %xmm1,%xmm0
movdqa %xmm0,%xmm1
pand %xmm2,%xmm0
pclmullqlqdq .Lk5k0(%rip),%xmm0
psrldq $4,%xmm1
pxor %xmm0,%xmm1
movdqa %xmm1,%xmm0
pand %xmm2,%xmm0
pclmullqhqdq %xmm3,%xmm0
pand %xmm2,%xmm0
pclmullqlqdq %xmm3,%xmm0
pxor %xmm1,%xmm0
movq %xmm0,%rax
shr $32,%rax
.leafepilogue
.endfn crc32_pclmul,globl,hidden
// Definitions of the bit-reflected domain constants k1,k2,k3, etc.
// and the CRC32+Barrett polynomials given at the end of the paper.
.rodata.cst16
.Lk1k2: .quad 0x0000000154442bd4
.quad 0x00000001c6e41596
.endobj .Lk1k2
.Lk3k4: .quad 0x00000001751997d0
.quad 0x00000000ccaa009e
.endobj .Lk3k4
.Lk5k0: .quad 0x0000000163cd6124
.quad 0x0000000000000000
.endobj .Lk5k0
.Lboop: .quad 0x00000000ffffffff
.quad 0x00000000ffffffff
.endobj .Lboop
.Lpoly: .quad 0x00000001db710641
.quad 0x00000001f7011641
.endobj .Lpoly
.previous
/* crc32() w/ pclmul for #c per n where c 0.293ns
N x1 x8 x64 mBps
------------------------------------------------------------
1 4437.000 42.375 38.141 85
1 45.000 39.375 38.234 85
2 31.500 25.312 23.102 141
3 25.667 19.792 17.911 181
4 21.250 16.219 15.035 216
7 18.429 12.946 11.712 277
8 16.125 12.578 10.998 296
15 12.867 9.925 9.161 355
16 12.438 9.836 9.114 357
31 11.194 8.528 8.149 399
32 10.781 8.418 8.098 401
63 9.063 7.780 7.647 425
64 3.109 1.604 1.414 2299
127 2.260 1.824 1.729 1880
128 1.305 0.860 0.806 4033
255 1.290 1.001 0.948 3428
256 0.574 0.491 0.476 6822
511 0.773 0.571 0.546 5956
512 0.354 0.320 0.306 10613
1023 0.425 0.365 0.347 9375
1024 0.237 0.229 0.231 14097
2047 0.278 0.251 0.246 13236
2048 0.187 0.187 0.188 17306
4095 0.229 0.200 0.194 16761
4096 0.162 0.170 0.167 19438
8191 0.182 0.173 0.178 18266
8192 0.162 0.155 0.158 20560
16383 0.156 0.162 0.154 21136
16384 0.156 0.156 0.148 22005
32767 0.163 0.149 0.149 21768
32768 0.150 0.146 0.145 22491
65535 0.158 0.141 0.141 23102
65536 0.149 0.140 0.138 23478
131071 0.150 0.145 0.141 23011
131072 0.148 0.141 0.148 21892
262143 0.151 0.148 0.147 22136
262144 0.149 0.146 0.146 22298
524287 0.150 0.149 0.149 21832
524288 0.148 0.148 0.147 22043
1048575 0.148 0.158 0.163 19913
1048576 0.156 0.179 0.153 21186
2097151 0.153 0.149 0.148 21979
2097152 0.147 0.148 0.147 22040
4194303 0.148 0.148 0.151 21482
4194304 0.148 0.148 0.147 22061
8388607 0.185 0.183 0.185 17536
8388608 0.193 0.180 0.183 17769
crc32() w/ 10+ year old cpus for #c per n where c 0.293ns
N x1 x8 x64 mBps
------------------------------------------------------------
1 4447.000 43.625 37.641 86
1 41.000 37.125 37.609 86
2 31.500 26.562 22.477 145
3 25.000 20.125 17.422 187
4 21.250 16.594 15.230 213
7 16.714 13.089 11.717 277
8 16.875 12.609 11.174 291
15 12.733 9.958 9.339 348
16 12.438 9.852 9.208 353
31 10.935 8.617 8.164 398
32 10.906 8.496 8.155 399
63 9.095 7.819 7.692 423
64 9.172 7.807 7.692 423
127 8.165 7.531 7.438 437
128 8.133 7.503 7.437 437
255 7.714 7.329 7.293 446
256 7.723 7.348 7.293 446
511 7.434 7.253 7.223 450
512 7.412 7.237 7.218 450
1023 7.274 7.214 7.201 451
1024 7.292 7.203 7.189 452
2047 7.232 7.185 7.178 453
2048 7.239 7.189 7.186 452
4095 7.189 7.175 7.172 453
4096 7.192 7.173 7.172 453
8191 7.187 7.173 7.172 453
8192 7.183 7.174 7.181 453
16383 7.175 7.170 7.169 453
16384 7.176 7.169 7.169 453
32767 7.169 7.182 7.170 453
32768 7.173 7.172 7.172 453
65535 7.170 7.170 7.171 453
65536 7.172 7.171 7.204 451
131071 7.170 7.354 7.260 448
131072 7.172 7.172 7.182 453
262143 7.037 7.178 7.182 453
262144 7.169 7.343 7.205 451
524287 7.438 7.170 7.206 451
524288 7.169 7.164 7.209 451
1048575 6.995 7.119 7.158 454
1048576 7.168 7.110 7.157 454
2097151 7.057 7.058 7.065 460
2097152 6.977 7.047 7.089 458
4194303 7.017 7.504 7.030 462
4194304 7.025 7.059 7.030 462
8388607 7.082 6.980 6.997 464
8388608 7.051 6.985 6.999 464 */

View file

@ -8,12 +8,6 @@ extern const uint32_t kCrc32cTab[256];
void crc32init(uint32_t[hasatleast 256], uint32_t);
uint32_t crc32a(uint32_t, const void *, size_t);
uint32_t crc32c(uint32_t, const void *, size_t);
uint32_t crc32_z(uint32_t, const void *, size_t);
uint32_t crc32c_pure(uint32_t, const void *, size_t)
strlenesque _Hide;
uint32_t crc32c_sse42(uint32_t, const void *, size_t)
strlenesque _Hide;
uint32_t crc32_pclmul(uint32_t, const void *, size_t) _Hide;
COSMOPOLITAN_C_END_
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */

View file

@ -1,63 +0,0 @@
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi
Copyright 2020 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/dce.h"
#include "libc/intrin/asan.internal.h"
#include "libc/macros.internal.h"
#include "libc/nexgen32e/crc32.h"
#include "libc/nexgen32e/x86feature.h"
#include "libc/str/str.h"
/**
* Computes Phil Katz CRC-32 used by zip/zlib/gzip/etc.
*
* x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1
* 0b100000100110000010001110110110111
* _bitreverse32(0x104c11db7)
*
* This implementation takes 32 picoseconds per byte or 30 gibibyte/s.
*
* @param h is initial value
*/
uint32_t crc32_z(uint32_t h, const void *data, size_t size) {
size_t n;
static bool once;
const unsigned char *p, *e;
static uint32_t kCrc32Tab[256];
if (!once) {
crc32init(kCrc32Tab, 0xedb88320);
once = 0;
}
if (size == -1) {
size = data ? strlen(data) : 0;
}
p = data;
e = p + size;
h ^= 0xffffffff;
if (X86_HAVE(PCLMUL)) {
while (((intptr_t)p & 15) && p < e)
h = h >> 8 ^ kCrc32Tab[(h & 0xff) ^ *p++];
if ((n = ROUNDDOWN(e - p, 16)) >= 64) {
if (IsAsan()) __asan_verify(p, n);
h = crc32_pclmul(h, p, n); /* 51x faster */
p += n;
}
}
while (p < e) h = h >> 8 ^ kCrc32Tab[(h & 0xff) ^ *p++];
return h ^ 0xffffffff;
}

61
libc/sysv/consts/hwap.h Normal file
View file

@ -0,0 +1,61 @@
#ifndef COSMOPOLITAN_LIBC_SYSV_CONSTS_HWAP_H_
#define COSMOPOLITAN_LIBC_SYSV_CONSTS_HWAP_H_
#ifdef __aarch64__
// Feature bits for getauxval(AT_HWCAP) on AARCH64 GNU/SystemD.
#define HWCAP_FP (1 << 0)
#define HWCAP_ASIMD (1 << 1)
#define HWCAP_EVTSTRM (1 << 2)
#define HWCAP_AES (1 << 3)
#define HWCAP_PMULL (1 << 4)
#define HWCAP_SHA1 (1 << 5)
#define HWCAP_SHA2 (1 << 6)
#define HWCAP_CRC32 (1 << 7)
#define HWCAP_ATOMICS (1 << 8)
#define HWCAP_FPHP (1 << 9)
#define HWCAP_ASIMDHP (1 << 10)
#define HWCAP_CPUID (1 << 11)
#define HWCAP_ASIMDRDM (1 << 12)
#define HWCAP_JSCVT (1 << 13)
#define HWCAP_FCMA (1 << 14)
#define HWCAP_LRCPC (1 << 15)
#define HWCAP_DCPOP (1 << 16)
#define HWCAP_SHA3 (1 << 17)
#define HWCAP_SM3 (1 << 18)
#define HWCAP_SM4 (1 << 19)
#define HWCAP_ASIMDDP (1 << 20)
#define HWCAP_SHA512 (1 << 21)
#define HWCAP_SVE (1 << 22)
#define HWCAP_ASIMDFHM (1 << 23)
#define HWCAP_DIT (1 << 24)
#define HWCAP_USCAT (1 << 25)
#define HWCAP_ILRCPC (1 << 26)
#define HWCAP_FLAGM (1 << 27)
#define HWCAP_SSBS (1 << 28)
#define HWCAP_SB (1 << 29)
#define HWCAP_PACA (1 << 30)
#define HWCAP_PACG (1UL << 31)
#define HWCAP2_DCPODP (1 << 0)
#define HWCAP2_SVE2 (1 << 1)
#define HWCAP2_SVEAES (1 << 2)
#define HWCAP2_SVEPMULL (1 << 3)
#define HWCAP2_SVEBITPERM (1 << 4)
#define HWCAP2_SVESHA3 (1 << 5)
#define HWCAP2_SVESM4 (1 << 6)
#define HWCAP2_FLAGM2 (1 << 7)
#define HWCAP2_FRINT (1 << 8)
#define HWCAP2_SVEI8MM (1 << 9)
#define HWCAP2_SVEF32MM (1 << 10)
#define HWCAP2_SVEF64MM (1 << 11)
#define HWCAP2_SVEBF16 (1 << 12)
#define HWCAP2_I8MM (1 << 13)
#define HWCAP2_BF16 (1 << 14)
#define HWCAP2_DGH (1 << 15)
#define HWCAP2_RNG (1 << 16)
#define HWCAP2_BTI (1 << 17)
#define HWCAP2_MTE (1 << 18)
#endif /* __aarch64__ */
#endif /* COSMOPOLITAN_LIBC_SYSV_CONSTS_HWAP_H_ */
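
A hedged usage sketch for these constants (assumes a getauxval() that understands AT_HWCAP, e.g. the glibc-style <sys/auxv.h> interface; not part of this commit):

  #include <stdio.h>
  #include <sys/auxv.h>                 /* getauxval, AT_HWCAP (assumption) */
  #include "libc/sysv/consts/hwap.h"

  int main(void) {
  #ifdef __aarch64__
    unsigned long hw = getauxval(AT_HWCAP);
    if (hw & HWCAP_ASIMD) puts("Advanced SIMD available");
    if (hw & HWCAP_PMULL) puts("carry-less multiply (PMULL) available");
    if (hw & HWCAP_CRC32) puts("CRC32 instructions available");
  #endif
    return 0;
  }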

View file

@ -5,6 +5,13 @@
FreeBSD lib/msun/src/e_acoshl.c
Converted to ldbl by David Schultz <das@FreeBSD.ORG> and Bruce D. Evans.
Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
Developed at SunPro, a Sun Microsystems, Inc. business.
Permission to use, copy, modify, and distribute this
software is freely granted, provided that this notice
is preserved.
Copyright (c) 1992-2023 The FreeBSD Project.
Redistribution and use in source and binary forms, with or without
@ -28,12 +35,6 @@
OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
SUCH DAMAGE.
Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
Developed at SunPro, a Sun Microsystems, Inc. business.
Permission to use, copy, modify, and distribute this
software is freely granted, provided that this notice
is preserved.
*/
#include "libc/math.h"
#include "libc/tinymath/freebsd.internal.h"

View file

@ -27,6 +27,7 @@
*/
#include "libc/math.h"
#include "libc/tinymath/feval.internal.h"
#include "libc/tinymath/freebsd.internal.h"
asm(".ident\t\"\\n\\n\
Musl libc (MIT License)\\n\

View file

@ -5,6 +5,13 @@
FreeBSD lib/msun/src/s_asinhl.c
Converted to ldbl by David Schultz <das@FreeBSD.ORG> and Bruce D. Evans.
Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
Developed at SunPro, a Sun Microsystems, Inc. business.
Permission to use, copy, modify, and distribute this
software is freely granted, provided that this notice
is preserved.
Copyright (c) 1992-2023 The FreeBSD Project.
Redistribution and use in source and binary forms, with or without
@ -28,12 +35,6 @@
OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
SUCH DAMAGE.
Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
Developed at SunPro, a Sun Microsystems, Inc. business.
Permission to use, copy, modify, and distribute this
software is freely granted, provided that this notice
is preserved.
*/
#include "libc/math.h"
#include "libc/tinymath/freebsd.internal.h"

View file

@ -4,6 +4,13 @@
FreeBSD lib/msun/src/e_atan2.c
Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
Developed at SunPro, a Sun Microsystems, Inc. business.
Permission to use, copy, modify, and distribute this
software is freely granted, provided that this notice
is preserved.
Copyright (c) 1992-2023 The FreeBSD Project.
Redistribution and use in source and binary forms, with or without
@ -27,12 +34,6 @@
OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
SUCH DAMAGE.
Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
Developed at SunPro, a Sun Microsystems, Inc. business.
Permission to use, copy, modify, and distribute this
software is freely granted, provided that this notice
is preserved.
*/
#include "libc/math.h"
#include "libc/tinymath/freebsd.internal.h"

View file

@ -79,7 +79,7 @@ long double atan2l(long double y, long double x)
long double z;
int m, ex, ey;
if (isnan(x) || isnan(y))
if (isunordered(x, y))
return x+y;
if (x == 1)
return atanl(y);

View file

@ -5,6 +5,13 @@
FreeBSD lib/msun/src/s_tanhf.c
Converted to long double by Bruce D. Evans.
Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
Developed at SunPro, a Sun Microsystems, Inc. business.
Permission to use, copy, modify, and distribute this
software is freely granted, provided that this notice
is preserved.
Copyright (c) 1992-2023 The FreeBSD Project.
Redistribution and use in source and binary forms, with or without
@ -28,12 +35,6 @@
OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
SUCH DAMAGE.
Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
Developed at SunPro, a Sun Microsystems, Inc. business.
Permission to use, copy, modify, and distribute this
software is freely granted, provided that this notice
is preserved.
*/
#include "libc/math.h"
#include "libc/tinymath/freebsd.internal.h"

View file

@ -36,7 +36,11 @@ Copyright 2005-2014 Rich Felker, et. al.\"");
asm(".include \"libc/disclaimer.inc\"");
// clang-format off
long double cosl(long double x) {
/**
* Returns cosine of 𝑥.
*/
long double cosl(long double x)
{
#if LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024
return cos(x);
#elif (LDBL_MANT_DIG == 64 || LDBL_MANT_DIG == 113) && LDBL_MAX_EXP == 16384

View file

@ -4,6 +4,13 @@
FreeBSD lib/msun/src/s_expm1f.c
Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
Developed at SunPro, a Sun Microsystems, Inc. business.
Permission to use, copy, modify, and distribute this
software is freely granted, provided that this notice
is preserved.
Copyright (c) 1992-2023 The FreeBSD Project.
Redistribution and use in source and binary forms, with or without
@ -27,12 +34,6 @@
OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
SUCH DAMAGE.
Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
Developed at SunPro, a Sun Microsystems, Inc. business.
Permission to use, copy, modify, and distribute this
software is freely granted, provided that this notice
is preserved.
*/
#include "libc/math.h"
#include "libc/tinymath/freebsd.internal.h"

View file

@ -31,7 +31,7 @@ asm(".ident\t\"\\n\\n\
Musl libc (MIT License)\\n\
Copyright 2005-2014 Rich Felker, et. al.\"");
asm(".include \"libc/disclaimer.inc\"");
/* clang-format off */
// clang-format off
#define asdouble(i) ((union{uint64_t _i; double _f;}){i})._f
#define INSERT_WORDS(d,hi,lo) \

View file

@ -31,7 +31,7 @@ asm(".ident\t\"\\n\\n\
Musl libc (MIT License)\\n\
Copyright 2005-2014 Rich Felker, et. al.\"");
asm(".include \"libc/disclaimer.inc\"");
/* clang-format off */
// clang-format off
#define asfloat(i) ((union{uint32_t _i; float _f;}){i})._f
#define SET_FLOAT_WORD(d,w) \

View file

@ -22,6 +22,6 @@
* Returns positive difference.
*/
double fdim(double x, double y) {
if (isnan(x) || isnan(y)) return NAN;
if (isunordered(x, y)) return NAN;
return x > y ? x - y : 0;
}

View file

@ -22,6 +22,6 @@
* Returns positive difference.
*/
float fdimf(float x, float y) {
if (isnan(x) || isnan(y)) return NAN;
if (isunordered(x, y)) return NAN;
return x > y ? x - y : 0;
}

View file

@ -25,7 +25,7 @@ long double fdiml(long double x, long double y) {
#if LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024
return fdim(x, y);
#else
if (isnan(x) || isnan(y)) return NAN;
if (isunordered(x, y)) return NAN;
return x > y ? x - y : 0;
#endif
}

View file

@ -906,67 +906,6 @@ irintl(long double x)
__x + __y; \
})
/*
* ieee style elementary functions
*
* We rename functions here to improve other sources' diffability
* against fdlibm.
*/
#define __ieee754_sqrt sqrt
#define __ieee754_acos acos
#define __ieee754_acosh acosh
#define __ieee754_log log
#define __ieee754_log2 log2
#define __ieee754_atanh atanh
#define __ieee754_asin asin
#define __ieee754_atan2 atan2
#define __ieee754_exp exp
#define __ieee754_cosh cosh
#define __ieee754_fmod fmod
#define __ieee754_pow pow
#define __ieee754_lgamma lgamma
#define __ieee754_gamma gamma
#define __ieee754_lgamma_r lgamma_r
#define __ieee754_gamma_r gamma_r
#define __ieee754_log10 log10
#define __ieee754_sinh sinh
#define __ieee754_hypot hypot
#define __ieee754_j0 j0
#define __ieee754_j1 j1
#define __ieee754_y0 y0
#define __ieee754_y1 y1
#define __ieee754_jn jn
#define __ieee754_yn yn
#define __ieee754_remainder remainder
#define __ieee754_scalb scalb
#define __ieee754_sqrtf sqrtf
#define __ieee754_acosf acosf
#define __ieee754_acoshf acoshf
#define __ieee754_logf logf
#define __ieee754_atanhf atanhf
#define __ieee754_asinf asinf
#define __ieee754_atan2f atan2f
#define __ieee754_expf expf
#define __ieee754_coshf coshf
#define __ieee754_fmodf fmodf
#define __ieee754_powf powf
#define __ieee754_lgammaf lgammaf
#define __ieee754_gammaf gammaf
#define __ieee754_lgammaf_r lgammaf_r
#define __ieee754_gammaf_r gammaf_r
#define __ieee754_log10f log10f
#define __ieee754_log2f log2f
#define __ieee754_sinhf sinhf
#define __ieee754_hypotf hypotf
#define __ieee754_j0f j0f
#define __ieee754_j1f j1f
#define __ieee754_y0f y0f
#define __ieee754_y1f y1f
#define __ieee754_jnf jnf
#define __ieee754_ynf ynf
#define __ieee754_remainderf remainderf
#define __ieee754_scalbf scalbf
/* fdlibm kernel function */
int __kernel_rem_pio2(double*,double*,int,int,int);

View file

@ -16,7 +16,7 @@
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/tinymath/tinymath.h"
#include "libc/math.h"
/**
* Rounds to nearest integer.

View file

@ -83,7 +83,8 @@ static dontinline long lrint_slow(double x) {
/**
* Rounds to nearest integer.
*/
long lrint(double x) {
long lrint(double x)
{
#ifdef __x86_64__
long res;
asm("cvtsd2si\t%1,%0" : "=r"(res) : "x"(x));

View file

@ -31,7 +31,7 @@ asm(".ident\t\"\\n\\n\
Musl libc (MIT License)\\n\
Copyright 2005-2014 Rich Felker, et. al.\"");
asm(".include \"libc/disclaimer.inc\"");
/* clang-format off */
// clang-format off
double modf(double x, double *iptr)
{

View file

@ -31,7 +31,7 @@ asm(".ident\t\"\\n\\n\
Musl libc (MIT License)\\n\
Copyright 2005-2014 Rich Felker, et. al.\"");
asm(".include \"libc/disclaimer.inc\"");
/* clang-format off */
// clang-format off
float modff(float x, float *iptr)
{

View file

@ -32,7 +32,7 @@ asm(".ident\t\"\\n\\n\
Musl libc (MIT License)\\n\
Copyright 2005-2014 Rich Felker, et. al.\"");
asm(".include \"libc/disclaimer.inc\"");
/* clang-format off */
// clang-format off
double nextafter(double x, double y)
{
@ -40,7 +40,7 @@ double nextafter(double x, double y)
uint64_t ax, ay;
int e;
if (isnan(x) || isnan(y))
if (isunordered(x, y))
return x + y;
if (ux.i == uy.i)
return y;

View file

@ -32,14 +32,14 @@ asm(".ident\t\"\\n\\n\
Musl libc (MIT License)\\n\
Copyright 2005-2014 Rich Felker, et. al.\"");
asm(".include \"libc/disclaimer.inc\"");
/* clang-format off */
// clang-format off
float nextafterf(float x, float y)
{
union {float f; uint32_t i;} ux={x}, uy={y};
uint32_t ax, ay, e;
if (isnan(x) || isnan(y))
if (isunordered(x, y))
return x + y;
if (ux.i == uy.i)
return y;

View file

@ -36,13 +36,14 @@ Copyright 2005-2014 Rich Felker, et. al.\"");
asm(".include \"libc/disclaimer.inc\"");
// clang-format off
long double nextafterl(long double x, long double y) {
long double nextafterl(long double x, long double y)
{
#if LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024
return nextafter(x, y);
#elif LDBL_MANT_DIG == 64 && LDBL_MAX_EXP == 16384
union ldshape ux, uy;
if (isnan(x) || isnan(y))
if (isunordered(x, y))
return x + y;
if (x == y)
return y;
@ -75,7 +76,7 @@ long double nextafterl(long double x, long double y) {
#elif LDBL_MANT_DIG == 113 && LDBL_MAX_EXP == 16384
union ldshape ux, uy;
if (isnan(x) || isnan(y))
if (isunordered(x, y))
return x + y;
if (x == y)
return y;

View file

@ -32,14 +32,14 @@ asm(".ident\t\"\\n\\n\
Musl libc (MIT License)\\n\
Copyright 2005-2014 Rich Felker, et. al.\"");
asm(".include \"libc/disclaimer.inc\"");
/* clang-format off */
// clang-format off
double nexttoward(double x, long double y)
{
union {double f; uint64_t i;} ux = {x};
int e;
if (isnan(x) || isnan(y))
if (isunordered(x, y))
return x + y;
if (x == y)
return y;

View file

@ -32,14 +32,14 @@ asm(".ident\t\"\\n\\n\
Musl libc (MIT License)\\n\
Copyright 2005-2014 Rich Felker, et. al.\"");
asm(".include \"libc/disclaimer.inc\"");
/* clang-format off */
// clang-format off
float nexttowardf(float x, long double y)
{
union {float f; uint32_t i;} ux = {x};
uint32_t e;
if (isnan(x) || isnan(y))
if (isunordered(x, y))
return x + y;
if (x == y)
return y;

View file

@ -31,7 +31,7 @@ asm(".ident\t\"\\n\\n\
Musl libc (MIT License)\\n\
Copyright 2005-2014 Rich Felker, et. al.\"");
asm(".include \"libc/disclaimer.inc\"");
/* clang-format off */
// clang-format off
long double nexttowardl(long double x, long double y)
{

View file

@ -34,7 +34,7 @@ asm(".ident\t\"\\n\\n\
OpenBSD libm (ISC License)\\n\
Copyright (c) 2008 Stephen L. Moshier <steve@moshier.net>\"");
asm(".include \"libc/disclaimer.inc\"");
/* clang-format off */
// clang-format off
/* origin: OpenBSD /usr/src/lib/libm/src/polevll.c */
/*

View file

@ -32,10 +32,10 @@
#include "libc/tinymath/pow_data.internal.h"
asm(".ident\t\"\\n\\n\
Double-precision math functions (MIT License)\\n\
Copyright 2018 ARM Limited\"");
Optimized Routines (MIT License)\\n\
Copyright 2022 ARM Limited\"");
asm(".include \"libc/disclaimer.inc\"");
/* clang-format off */
// clang-format off
/*
* Double-precision x^y function.

View file

@ -121,9 +121,9 @@ double pochisq(
e = (even ? 0.0 : LOG_SQRT_PI);
c = log(a);
while (z <= x) {
e = log(z) + e;
s += ex(c * z - a - e);
z += 1.0;
e = log(z) + e;
s += ex(c * z - a - e);
z += 1.0;
}
return (s);
} else {

View file

@ -35,8 +35,8 @@ asm(".ident\t\"\\n\\n\
Musl libc (MIT License)\\n\
Copyright 2005-2014 Rich Felker, et. al.\"");
asm(".include \"libc/disclaimer.inc\"");
// clang-format off
/* clang-format off */
/* origin: FreeBSD /usr/src/lib/msun/src/k_rem_pio2.c */
/*
* ====================================================

View file

@ -58,7 +58,7 @@ asm(".include \"libc/disclaimer.inc\"");
*/
double scalb(double x, double fn)
{
if (isnan(x) || isnan(fn))
if (isunordered(x, fn))
return x*fn;
if (!isfinite(fn)) {
if (fn > 0.0)

View file

@ -38,7 +38,8 @@ asm(".include \"libc/disclaimer.inc\"");
float scalbf(float x, float fn)
{
if (isnan(x) || isnan(fn)) return x*fn;
if (isunordered(x, fn))
return x*fn;
if (!isfinite(fn)) {
if (fn > 0.0f)
return x*fn;

View file

@ -3,7 +3,7 @@
#include "libc/tinymath/internal.h"
#if !(__ASSEMBLER__ + __LINKER__ + 0)
COSMOPOLITAN_C_START_
/* clang-format off */
// clang-format off
/*
* Header for sinf, cosf and sincosf.

View file

@ -35,7 +35,7 @@ asm(".ident\t\"\\n\\n\
Musl libc (MIT License)\\n\
Copyright 2005-2014 Rich Felker, et. al.\"");
asm(".include \"libc/disclaimer.inc\"");
/* clang-format off */
// clang-format off
/* origin: FreeBSD /usr/src/lib/msun/src/k_sinf.c */
/*

View file

@ -37,7 +37,7 @@ asm(".ident\t\"\\n\\n\
Musl libc (MIT License)\\n\
Copyright 2005-2014 Rich Felker, et. al.\"");
asm(".include \"libc/disclaimer.inc\"");
/* clang-format off */
// clang-format off
/* origin: FreeBSD /usr/src/lib/msun/src/s_sinf.c */
/*

View file

@ -27,12 +27,13 @@
*/
#include "libc/math.h"
#include "libc/tinymath/expo.internal.h"
#include "libc/tinymath/freebsd.internal.h"
asm(".ident\t\"\\n\\n\
Musl libc (MIT License)\\n\
Copyright 2005-2014 Rich Felker, et. al.\"");
asm(".include \"libc/disclaimer.inc\"");
/* clang-format off */
// clang-format off
/**
* Returns hyperbolic sine of 𝑥.

View file

@ -32,7 +32,7 @@ asm(".ident\t\"\\n\\n\
Musl libc (MIT License)\\n\
Copyright 2005-2014 Rich Felker, et. al.\"");
asm(".include \"libc/disclaimer.inc\"");
/* clang-format off */
// clang-format off
/**
* Returns hyperbolic sine of 𝑥.

View file

@ -5,6 +5,13 @@
FreeBSD lib/msun/src/e_sinhl.c
Converted to long double by Bruce D. Evans
Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
Developed at SunPro, a Sun Microsystems, Inc. business.
Permission to use, copy, modify, and distribute this
software is freely granted, provided that this notice
is preserved.
Copyright (c) 1992-2023 The FreeBSD Project.
Redistribution and use in source and binary forms, with or without
@ -28,12 +35,6 @@
OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
SUCH DAMAGE.
Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
Developed at SunPro, a Sun Microsystems, Inc. business.
Permission to use, copy, modify, and distribute this
software is freely granted, provided that this notice
is preserved.
*/
#include "libc/intrin/likely.h"
#include "libc/math.h"

View file

@ -36,7 +36,11 @@ Copyright 2005-2014 Rich Felker, et. al.\"");
asm(".include \"libc/disclaimer.inc\"");
// clang-format off
long double sinl(long double x) {
/**
* Returns sine of 𝑥.
*/
long double sinl(long double x)
{
#if LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024
return sin(x);
#elif (LDBL_MANT_DIG == 64 || LDBL_MANT_DIG == 113) && LDBL_MAX_EXP == 16384

View file

@ -36,7 +36,7 @@ asm(".ident\t\"\\n\\n\
Musl libc (MIT License)\\n\
Copyright 2005-2014 Rich Felker, et. al.\"");
asm(".include \"libc/disclaimer.inc\"");
/* clang-format off */
// clang-format off
/* origin: FreeBSD /usr/src/lib/msun/src/s_tan.c */
/*

View file

@ -5,6 +5,13 @@
FreeBSD lib/msun/src/s_tanhf.c
Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com.
Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
Developed at SunPro, a Sun Microsystems, Inc. business.
Permission to use, copy, modify, and distribute this
software is freely granted, provided that this notice
is preserved.
Copyright (c) 1992-2023 The FreeBSD Project.
Redistribution and use in source and binary forms, with or without
@ -28,12 +35,6 @@
OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
SUCH DAMAGE.
Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
Developed at SunPro, a Sun Microsystems, Inc. business.
Permission to use, copy, modify, and distribute this
software is freely granted, provided that this notice
is preserved.
*/
#include "libc/math.h"
#include "libc/tinymath/freebsd.internal.h"

View file

@ -5,6 +5,13 @@
FreeBSD lib/msun/src/s_tanhl.c
Converted to long double by Bruce D. Evans
Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
Developed at SunPro, a Sun Microsystems, Inc. business.
Permission to use, copy, modify, and distribute this
software is freely granted, provided that this notice
is preserved.
Copyright (c) 1992-2023 The FreeBSD Project.
Redistribution and use in source and binary forms, with or without
@ -28,12 +35,6 @@
OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
SUCH DAMAGE.
Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
Developed at SunPro, a Sun Microsystems, Inc. business.
Permission to use, copy, modify, and distribute this
software is freely granted, provided that this notice
is preserved.
*/
#include "libc/intrin/likely.h"
#include "libc/math.h"

View file

@ -31,7 +31,6 @@
#include "libc/intrin/directmap.internal.h"
#include "libc/intrin/extend.internal.h"
#include "libc/intrin/weaken.h"
#include "libc/nexgen32e/crc32.h"
#include "libc/runtime/internal.h"
#include "libc/runtime/memtrack.internal.h"
#include "libc/sysv/consts/f.h"
@ -159,11 +158,6 @@ static int __zipos_load(struct Zipos *zipos, size_t cf, unsigned flags,
h->pos = 0;
h->cfile = cf;
h->size = size;
if (!IsTiny() && h->mem &&
crc32_z(0, h->mem, h->size) != ZIP_LFILE_CRC32(zipos->map + lf)) {
h->mem = 0;
eio();
}
if (h->mem) {
minfd = 3;
__fds_lock();

View file

@ -61,6 +61,17 @@ TEST(memcmp, hug) {
}
}
static int coerce(int result) {
#ifdef __aarch64__
// arm's strcmp assembly is nuts and unpredictable, but it's legal
if (result < 0) return -1;
if (result > 0) return +1;
return 0;
#else
return result;
#endif
}
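Since memcmp and strcmp are only specified to return a value whose sign conveys the ordering, the ARM routines are free to return any negative or positive magnitude, so the test folds both sides to -1/0/+1 before comparing. The golden() reference used below is not shown in this hunk; it is assumed to be a plain byte-wise comparator along these lines:
#include <stddef.h>
// hypothetical byte-wise reference comparator; any sign-correct
// return value is legal, this one returns the byte difference
static int golden_ref(const void *p, const void *q, size_t n) {
  const unsigned char *a = p, *b = q;
  for (size_t i = 0; i < n; ++i) {
    if (a[i] != b[i]) return (int)a[i] - (int)b[i];
  }
  return 0;
}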
TEST(memcmp, fuzz) {
int i, o, n, g;
char a[256], b[256];
@ -79,8 +90,18 @@ TEST(memcmp, fuzz) {
}
o = rand() & 31;
n = rand() % (sizeof(a) - o);
g = golden(a + o, b + o, n);
ASSERT_EQ(g, memcmp(a + o, b + o, n), "n=%d o=%d", n, o);
g = coerce(golden(a + o, b + o, n));
#if 0
if (memcmp(a + o, b + o, n) != g) {
kprintf("const size_t g = %d;\n", g);
kprintf("const size_t n = %d;\n", n);
kprintf("const char a[] = unbingstr(%#.*hhhs); /* %p */\n", n, a + o,
a + o);
kprintf("const char b[] = unbingstr(%#.*hhhs); /* %p */\n", n, b + o,
b + o);
}
#endif
ASSERT_EQ(g, coerce(memcmp(a + o, b + o, n)), "n=%d o=%d", n, o);
ASSERT_EQ(!!g, !!bcmp(a + o, b + o, n), "n=%d o=%d", n, o);
ASSERT_EQ(!!g, !!timingsafe_bcmp(a + o, b + o, n), "n=%d o=%d", n, o);
ASSERT_EQ(MAX(-1, MIN(1, g)), timingsafe_memcmp(a + o, b + o, n),

View file

@ -190,9 +190,11 @@ BENCH(strchr, bench2) {
char *strlen_(const char *) asm("strlen");
char *rawmemchr_(const char *, int) asm("rawmemchr");
EZBENCH2("strchr z", donothing, strchr_(kHyperion, 'z'));
EZBENCH2("rawmemchr z", donothing, rawmemchr_(kHyperion, 'z'));
EZBENCH2("memchr z", donothing, memchr_(kHyperion, 'z', kHyperionSize));
EZBENCH2("strchr Z", donothing, strchr_(kHyperion, 'Z'));
EZBENCH2("memchr z", donothing, memchr_(kHyperion, 'z', kHyperionSize));
EZBENCH2("memchr Z", donothing, memchr_(kHyperion, 'Z', kHyperionSize));
EZBENCH2("rawmemchr z", donothing, rawmemchr_(kHyperion, 'z'));
EZBENCH2("rawmemchr Z", donothing, rawmemchr_(kHyperion, 'z'));
EZBENCH2("rawmemchr \\0", donothing, rawmemchr_(kHyperion, 0));
EZBENCH2("strlen", donothing, strlen_(kHyperion));
EZBENCH2("memchr Z", donothing, memchr_(kHyperion, 'Z', kHyperionSize));

View file

@ -1,49 +0,0 @@
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi
Copyright 2020 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/macros.internal.h"
#include "libc/nexgen32e/crc32.h"
#include "libc/nexgen32e/x86feature.h"
#include "libc/str/str.h"
#include "libc/testlib/ezbench.h"
#include "libc/testlib/hyperion.h"
#include "libc/testlib/testlib.h"
#include "third_party/zlib/zlib.h"
TEST(crc32, testBigText) {
size_t size;
void *hyperion;
size = kHyperionSize;
hyperion = kHyperion;
EXPECT_EQ(0xe9ded8e6, crc32(0, hyperion, size));
EXPECT_EQ(0xe9ded8e6, crc32_z(0, hyperion, size));
if (X86_HAVE(PCLMUL)) {
size = ROUNDDOWN(size, 64);
EXPECT_EQ(0xc7adc04f, crc32(0, hyperion, size));
EXPECT_EQ(0xc7adc04f, crc32_z(0, hyperion, size));
EXPECT_EQ(0xc7adc04f,
0xffffffffu ^ crc32_pclmul(0 ^ 0xffffffffu, hyperion, size));
}
}
#define TESTSTR "libc/calls/typedef/sighandler_t.h"
BENCH(crc32c, bench) {
EZBENCH2("crc32c", donothing,
EXPROPRIATE(crc32c(0, VEIL("r", TESTSTR), sizeof(TESTSTR) - 1)));
}

View file

@ -16,17 +16,18 @@
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/intrin/bits.h"
#include "libc/dce.h"
#include "libc/intrin/bits.h"
#include "libc/mem/gc.internal.h"
#include "libc/mem/mem.h"
#include "libc/nexgen32e/crc32.h"
#include "libc/nexgen32e/x86feature.h"
#include "libc/mem/gc.internal.h"
#include "libc/stdio/stdio.h"
#include "libc/str/str.h"
#include "libc/testlib/ezbench.h"
#include "libc/testlib/hyperion.h"
#include "libc/testlib/testlib.h"
#include "third_party/zlib/zlib.h"
#define FANATICS "Fanatics"

View file

@ -17,18 +17,19 @@
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/assert.h"
#include "libc/intrin/bits.h"
#include "libc/dce.h"
#include "libc/intrin/asan.internal.h"
#include "libc/intrin/bits.h"
#include "libc/mem/gc.internal.h"
#include "libc/mem/mem.h"
#include "libc/nexgen32e/crc32.h"
#include "libc/nexgen32e/x86feature.h"
#include "libc/mem/gc.internal.h"
#include "libc/stdio/stdio.h"
#include "libc/str/str.h"
#include "libc/testlib/ezbench.h"
#include "libc/testlib/hyperion.h"
#include "libc/testlib/testlib.h"
#include "third_party/zlib/zlib.h"
#define FANATICS "Fanatics"

View file

@ -15,15 +15,16 @@
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "libc/str/highwayhash64.h"
#include "libc/inttypes.h"
#include "libc/nexgen32e/crc32.h"
#include "libc/stdio/rand.h"
#include "libc/stdio/stdio.h"
#include "libc/str/highwayhash64.h"
#include "libc/str/str.h"
#include "libc/testlib/ezbench.h"
#include "libc/testlib/hyperion.h"
#include "libc/testlib/testlib.h"
#include "third_party/zlib/zlib.h"
#define kMaxSize 64

View file

@ -472,8 +472,6 @@ TEST(wcscmp, testTwosComplementBane) {
TEST(wcsncmp, testTwosComplementBane) {
wchar_t *B1 = malloc(4);
wchar_t *B2 = malloc(4);
B1[1] = L'\0';
B2[1] = L'\0';
EXPECT_EQ(wcsncmp(memcpy(B1, "\x00\x00\x00\x80", 4),
memcpy(B2, "\x00\x00\x00\x80", 4), 1),
0);

View file

@ -18,6 +18,7 @@
*/
#include "libc/math.h"
#include "libc/mem/gc.h"
#include "libc/testlib/ezbench.h"
#include "libc/testlib/testlib.h"
#include "libc/x/x.h"
#include "libc/x/xasprintf.h"
@ -51,3 +52,9 @@ TEST(asinhl, test) {
EXPECT_STREQ("NAN", _gc(xdtoal(_asinhl(NAN))));
EXPECT_STREQ("INFINITY", _gc(xdtoal(_asinhl(INFINITY))));
}
BENCH(asinh, bench) {
EZBENCH2("asinh", donothing, _asinh(.7)); // ~26ns
EZBENCH2("asinhf", donothing, _asinhf(.7)); // ~17ns
EZBENCH2("asinhl", donothing, _asinhl(.7)); // ~48ns
}

View file

@ -18,6 +18,7 @@
*/
#include "libc/math.h"
#include "libc/mem/gc.internal.h"
#include "libc/testlib/ezbench.h"
#include "libc/testlib/testlib.h"
#include "libc/x/x.h"
@ -60,3 +61,9 @@ TEST(sinhf, test) {
EXPECT_STREQ("INFINITY", gc(xdtoaf(_sinhf(INFINITY))));
EXPECT_STREQ("-INFINITY", gc(xdtoaf(_sinhf(-INFINITY))));
}
BENCH(sinh, bench) {
EZBENCH2("sinh", donothing, _sinh(.7)); // ~24ns
EZBENCH2("sinhf", donothing, _sinhf(.7)); // ~19ns
EZBENCH2("sinhl", donothing, _sinhl(.7)); // ~15ns
}

Some files were not shown because too many files have changed in this diff.