mirror of
https://github.com/jart/cosmopolitan.git
synced 2025-01-31 03:27:39 +00:00
Make AARCH64 harder, better, faster, stronger
- Perform some housekeeping on scalar math function code
- Import ARM's Optimized Routines for SIMD string processing
- Upgrade to latest Chromium zlib and enable more SIMD optimizations
This commit is contained in:
parent
550b52abf6
commit
cc1732bc42
143 changed files with 15661 additions and 1329 deletions
|
@ -73,6 +73,13 @@ IMAGE_BASE_VIRTUAL ?= 0x400000
|
|||
IGNORE := $(shell $(ECHO) -2 ♥cosmo)
|
||||
IGNORE := $(shell $(MKDIR) o/tmp)
|
||||
|
||||
ifeq ($(MODE), dbg)
|
||||
# be generous about resources in debug mode
|
||||
# let commands use 64 seconds cpu time max
|
||||
# let commands use 300 seconds wall time max
|
||||
QUOTA ?= -C64 -L300
|
||||
endif
|
||||
|
||||
ifneq ($(findstring aarch64,$(MODE)),)
|
||||
ARCH = aarch64
|
||||
VM = o/third_party/qemu/qemu-aarch64
|
||||
|
|
|
@ -12,8 +12,8 @@
|
|||
#include "libc/errno.h"
|
||||
#include "libc/fmt/conv.h"
|
||||
#include "libc/log/check.h"
|
||||
#include "libc/mem/mem.h"
|
||||
#include "libc/mem/gc.internal.h"
|
||||
#include "libc/mem/mem.h"
|
||||
#include "libc/runtime/runtime.h"
|
||||
#include "libc/stdio/stdio.h"
|
||||
#include "libc/str/str.h"
|
||||
|
@ -48,26 +48,62 @@ FLAGS\n\
|
|||
// clang-format off
|
||||
// make -j8 o//examples && dd if=/dev/urandom count=100 | tee a | o//examples/compress.com | o//examples/decompress.com >b && sha1sum a b
|
||||
/*
|
||||
#!/bin/bash
|
||||
# data file is o/dbg/third_party/python/python.com
|
||||
# level 0 147517 compress 495 MB/s decompress 1.4 GB/s
|
||||
# level 1 80274 compress 29.2 MB/s decompress 303 MB/s
|
||||
# level 2 79384 compress 33.8 MB/s decompress 212 MB/s
|
||||
# level 3 78875 compress 28.9 MB/s decompress 224 MB/s
|
||||
# level 4 78010 compress 27.1 MB/s decompress 319 MB/s <-- sweet spot?
|
||||
# level 5 77107 compress 19.5 MB/s decompress 273 MB/s
|
||||
# level 6 75081 compress 10.0 MB/s decompress 99.3 MB/s
|
||||
# level 7 75022 compress 7.5 MB/s decompress 287 MB/s
|
||||
# level 8 75016 compress 5.4 MB/s decompress 109 MB/s
|
||||
# level 9 75016 compress 5.4 MB/s decompress 344 MB/s
|
||||
# level 1 348739 compress 22.8 MB/s decompress 444 MB/s
|
||||
# level 2 347549 compress 37.8 MB/s decompress 457 MB/s
|
||||
# level 3 346902 compress 33.3 MB/s decompress 463 MB/s
|
||||
# level 4 345671 compress 29.3 MB/s decompress 467 MB/s
|
||||
# level 5 344392 compress 22.4 MB/s decompress 506 MB/s
|
||||
# level 6 342105 compress 10.9 MB/s decompress 516 MB/s
|
||||
# level 7 342046 compress 7.9 MB/s decompress 515 MB/s
|
||||
# level 8 342009 compress 5.8 MB/s decompress 518 MB/s
|
||||
# level 9 342001 compress 5.7 MB/s decompress 524 MB/s
|
||||
# level F 1 362426 compress 48.2 MB/s decompress 488 MB/s
|
||||
# level F 2 360875 compress 42.7 MB/s decompress 484 MB/s
|
||||
# level F 3 359992 compress 37.1 MB/s decompress 499 MB/s
|
||||
# level F 4 358460 compress 32.9 MB/s decompress 503 MB/s
|
||||
# level F 5 356431 compress 24.0 MB/s decompress 547 MB/s
|
||||
# level F 6 352274 compress 11.6 MB/s decompress 558 MB/s
|
||||
# level F 7 352155 compress 8.7 MB/s decompress 554 MB/s
|
||||
# level F 8 352065 compress 6.3 MB/s decompress 554 MB/s
|
||||
# level F 9 352051 compress 6.2 MB/s decompress 556 MB/s
|
||||
# level L 1 348739 compress 41.1 MB/s decompress 446 MB/s
|
||||
# level L 2 347549 compress 37.4 MB/s decompress 443 MB/s
|
||||
# level L 3 346902 compress 32.3 MB/s decompress 462 MB/s
|
||||
# level L 4 351932 compress 28.8 MB/s decompress 511 MB/s
|
||||
# level L 5 351384 compress 23.6 MB/s decompress 520 MB/s
|
||||
# level L 6 351328 compress 12.1 MB/s decompress 522 MB/s
|
||||
# level L 7 351230 compress 7.3 MB/s decompress 518 MB/s
|
||||
# level L 8 351192 compress 5.7 MB/s decompress 522 MB/s
|
||||
# level L 9 351182 compress 6.5 MB/s decompress 519 MB/s
|
||||
# level R 1 388209 compress 83.1 MB/s decompress 371 MB/s
|
||||
# level R 2 388209 compress 82.3 MB/s decompress 362 MB/s
|
||||
# level R 3 388209 compress 81.8 MB/s decompress 361 MB/s
|
||||
# level R 4 388209 compress 81.7 MB/s decompress 364 MB/s
|
||||
# level R 5 388209 compress 81.7 MB/s decompress 363 MB/s
|
||||
# level R 6 388209 compress 80.1 MB/s decompress 359 MB/s
|
||||
# level R 7 388209 compress 80.3 MB/s decompress 354 MB/s
|
||||
# level R 8 388209 compress 80.3 MB/s decompress 363 MB/s
|
||||
# level R 9 388209 compress 81.3 MB/s decompress 364 MB/s
|
||||
# level H 1 390207 compress 87.6 MB/s decompress 371 MB/s
|
||||
# level H 2 390207 compress 87.5 MB/s decompress 372 MB/s
|
||||
# level H 3 390207 compress 85.5 MB/s decompress 364 MB/s
|
||||
# level H 4 390207 compress 87.3 MB/s decompress 375 MB/s
|
||||
# level H 5 390207 compress 89.0 MB/s decompress 373 MB/s
|
||||
# level H 6 390207 compress 87.3 MB/s decompress 372 MB/s
|
||||
# level H 7 390207 compress 87.0 MB/s decompress 368 MB/s
|
||||
# level H 8 390207 compress 86.2 MB/s decompress 367 MB/s
|
||||
# level H 9 390207 compress 86.9 MB/s decompress 369 MB/s
|
||||
m=
|
||||
make -j8 MODE=$m o/$m/examples || exit
|
||||
for strategy in ' ' F L R H; do
|
||||
for level in $(seq 1 9); do
|
||||
for strategy in F L R H; do
|
||||
o/$m/examples/compress.com -$strategy$level <o/dbg/third_party/python/python.com | dd count=10000 2>/tmp/info >/tmp/comp
|
||||
o/$m/examples/compress.com -$level$strategy <o/dbg/third_party/python/python.com | dd count=10000 2>/tmp/info >/tmp/comp
|
||||
compspeed=$(grep -Po '[.\d]+ \w+/s' /tmp/info)
|
||||
o/$m/examples/decompress.com </tmp/comp | dd count=10000 2>/tmp/info >/dev/null
|
||||
decompspeed=$(grep -Po '[.\d]+ \w+/s' /tmp/info)
|
||||
size=$(o/$m/examples/compress.com -$strategy$level <o/$m/examples/compress.com | wc -c)
|
||||
size=$(o/$m/examples/compress.com -$level$strategy <o/$m/examples/compress.com | wc -c)
|
||||
echo "level $strategy $level $size compress $compspeed decompress $decompspeed"
|
||||
done
|
||||
done
|
||||
|
|
|
@ -10,43 +10,14 @@
|
|||
#include "libc/assert.h"
|
||||
#include "libc/calls/calls.h"
|
||||
#include "libc/errno.h"
|
||||
#include "libc/mem/mem.h"
|
||||
#include "libc/mem/gc.internal.h"
|
||||
#include "libc/mem/mem.h"
|
||||
#include "libc/stdio/stdio.h"
|
||||
#include "libc/str/str.h"
|
||||
#include "third_party/zlib/zlib.h"
|
||||
|
||||
#define CHUNK 32768
|
||||
|
||||
// clang-format off
|
||||
// make -j8 o//examples && dd if=/dev/urandom count=100 | tee a | o//examples/compress.com | o//examples/decompress.com >b && sha1sum a b
|
||||
/*
|
||||
# data file is o/dbg/third_party/python/python.com
|
||||
# level 0 147517 compress 495 MB/s decompress 1.4 GB/s
|
||||
# level 1 80274 compress 29.2 MB/s decompress 303 MB/s
|
||||
# level 2 79384 compress 33.8 MB/s decompress 212 MB/s
|
||||
# level 3 78875 compress 28.9 MB/s decompress 224 MB/s
|
||||
# level 4 78010 compress 27.1 MB/s decompress 319 MB/s <-- sweet spot?
|
||||
# level 5 77107 compress 19.5 MB/s decompress 273 MB/s
|
||||
# level 6 75081 compress 10.0 MB/s decompress 99.3 MB/s
|
||||
# level 7 75022 compress 7.5 MB/s decompress 287 MB/s
|
||||
# level 8 75016 compress 5.4 MB/s decompress 109 MB/s
|
||||
# level 9 75016 compress 5.4 MB/s decompress 344 MB/s
|
||||
m=
|
||||
make -j8 MODE=$m o/$m/examples || exit
|
||||
for level in $(seq 0 9); do
|
||||
for strategy in F L R H; do
|
||||
o/$m/examples/compress.com -$strategy$level <o/dbg/third_party/python/python.com | dd count=10000 2>/tmp/info >/tmp/comp
|
||||
compspeed=$(grep -Po '[.\d]+ \w+/s' /tmp/info)
|
||||
o/$m/examples/decompress.com </tmp/comp | dd count=10000 2>/tmp/info >/dev/null
|
||||
decompspeed=$(grep -Po '[.\d]+ \w+/s' /tmp/info)
|
||||
size=$(o/$m/examples/compress.com -$strategy$level <o/$m/examples/compress.com | wc -c)
|
||||
echo "level $strategy $level $size compress $compspeed decompress $decompspeed"
|
||||
done
|
||||
done
|
||||
*/
|
||||
// clang-format on
|
||||
|
||||
int decompressor(int infd, int outfd) {
|
||||
int rc;
|
||||
unsigned have;
|
||||
|
|
88
libc/intrin/aarch64/asmdefs.h
Normal file
88
libc/intrin/aarch64/asmdefs.h
Normal file
|
@ -0,0 +1,88 @@
|
|||
#ifndef COSMOPOLITAN_LIBC_INTRIN_AARCH64_ASMDEFS_H_
|
||||
#define COSMOPOLITAN_LIBC_INTRIN_AARCH64_ASMDEFS_H_
|
||||
#ifdef __ASSEMBLER__
|
||||
// clang-format off
|
||||
|
||||
/* Branch Target Identification support. */
|
||||
#define BTI_C hint 34
|
||||
#define BTI_J hint 36
|
||||
/* Return address signing support (pac-ret). */
|
||||
#define PACIASP hint 25; .cfi_window_save
|
||||
#define AUTIASP hint 29; .cfi_window_save
|
||||
|
||||
/* GNU_PROPERTY_AARCH64_* macros from elf.h. */
|
||||
#define FEATURE_1_AND 0xc0000000
|
||||
#define FEATURE_1_BTI 1
|
||||
#define FEATURE_1_PAC 2
|
||||
|
||||
/* Add a NT_GNU_PROPERTY_TYPE_0 note. */
|
||||
#define GNU_PROPERTY(type, value) \
|
||||
.section .note.gnu.property, "a"; \
|
||||
.p2align 3; \
|
||||
.word 4; \
|
||||
.word 16; \
|
||||
.word 5; \
|
||||
.asciz "GNU"; \
|
||||
.word type; \
|
||||
.word 4; \
|
||||
.word value; \
|
||||
.word 0; \
|
||||
.text
|
||||
|
||||
/* If set then the GNU Property Note section will be added to
|
||||
mark objects to support BTI and PAC-RET. */
|
||||
#ifndef WANT_GNU_PROPERTY
|
||||
#define WANT_GNU_PROPERTY 1
|
||||
#endif
|
||||
|
||||
#if WANT_GNU_PROPERTY
|
||||
/* Add property note with supported features to all asm files. */
|
||||
GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC)
|
||||
#endif
|
||||
|
||||
#define ENTRY_ALIGN(name, alignment) \
|
||||
.global name; \
|
||||
.type name,%function; \
|
||||
.align alignment; \
|
||||
name: \
|
||||
.cfi_startproc; \
|
||||
BTI_C;
|
||||
|
||||
#define ENTRY(name) ENTRY_ALIGN(name, 6)
|
||||
|
||||
#define ENTRY_ALIAS(name) \
|
||||
.global name; \
|
||||
.type name,%function; \
|
||||
name:
|
||||
|
||||
#define END(name) \
|
||||
.cfi_endproc; \
|
||||
.size name, .-name;
|
||||
|
||||
#define L(l) .L ## l
|
||||
|
||||
#ifdef __ILP32__
|
||||
/* Sanitize padding bits of pointer arguments as per aapcs64 */
|
||||
#define PTR_ARG(n) mov w##n, w##n
|
||||
#else
|
||||
#define PTR_ARG(n)
|
||||
#endif
|
||||
|
||||
#ifdef __ILP32__
|
||||
/* Sanitize padding bits of size arguments as per aapcs64 */
|
||||
#define SIZE_ARG(n) mov w##n, w##n
|
||||
#else
|
||||
#define SIZE_ARG(n)
|
||||
#endif
|
||||
|
||||
/* Compiler supports SVE instructions */
|
||||
#ifndef HAVE_SVE
|
||||
# if __aarch64__ && (__GNUC__ >= 8 || __clang_major__ >= 5)
|
||||
# define HAVE_SVE 1
|
||||
# else
|
||||
# define HAVE_SVE 0
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#endif /* __ASSEMBLER__ */
|
||||
#endif /* COSMOPOLITAN_LIBC_INTRIN_AARCH64_ASMDEFS_H_ */
|
172
libc/intrin/aarch64/memchr.S
Normal file
172
libc/intrin/aarch64/memchr.S
Normal file
|
@ -0,0 +1,172 @@
|
|||
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
|
||||
│vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi│
|
||||
╚──────────────────────────────────────────────────────────────────────────────╝
|
||||
│ │
|
||||
│ Optimized Routines │
|
||||
│ Copyright (c) 1999-2022, Arm Limited. │
|
||||
│ │
|
||||
│ Permission is hereby granted, free of charge, to any person obtaining │
|
||||
│ a copy of this software and associated documentation files (the │
|
||||
│ "Software"), to deal in the Software without restriction, including │
|
||||
│ without limitation the rights to use, copy, modify, merge, publish, │
|
||||
│ distribute, sublicense, and/or sell copies of the Software, and to │
|
||||
│ permit persons to whom the Software is furnished to do so, subject to │
|
||||
│ the following conditions: │
|
||||
│ │
|
||||
│ The above copyright notice and this permission notice shall be │
|
||||
│ included in all copies or substantial portions of the Software. │
|
||||
│ │
|
||||
│ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, │
|
||||
│ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF │
|
||||
│ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. │
|
||||
│ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY │
|
||||
│ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, │
|
||||
│ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE │
|
||||
│ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. │
|
||||
│ │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/intrin/aarch64/asmdefs.h"
|
||||
|
||||
#define __memchr_aarch64 memchr
|
||||
|
||||
.ident "\n\
|
||||
Optimized Routines (MIT License)\n\
|
||||
Copyright 2022 ARM Limited\n"
|
||||
.include "libc/disclaimer.inc"
|
||||
|
||||
/* Assumptions:
|
||||
*
|
||||
* ARMv8-a, AArch64
|
||||
* Neon Available.
|
||||
*/
|
||||
|
||||
/* Arguments and results. */
|
||||
#define srcin x0
|
||||
#define chrin w1
|
||||
#define cntin x2
|
||||
|
||||
#define result x0
|
||||
|
||||
#define src x3
|
||||
#define tmp x4
|
||||
#define wtmp2 w5
|
||||
#define synd x6
|
||||
#define soff x9
|
||||
#define cntrem x10
|
||||
|
||||
#define vrepchr v0
|
||||
#define vdata1 v1
|
||||
#define vdata2 v2
|
||||
#define vhas_chr1 v3
|
||||
#define vhas_chr2 v4
|
||||
#define vrepmask v5
|
||||
#define vend v6
|
||||
|
||||
/*
|
||||
* Core algorithm:
|
||||
*
|
||||
* For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
|
||||
* per byte. For each tuple, bit 0 is set if the relevant byte matched the
|
||||
* requested character and bit 1 is not used (faster than using a 32bit
|
||||
* syndrome). Since the bits in the syndrome reflect exactly the order in which
|
||||
* things occur in the original string, counting trailing zeros allows us to
|
||||
* identify exactly which byte has matched.
|
||||
*/
|
||||
|
||||
ENTRY (__memchr_aarch64)
|
||||
PTR_ARG (0)
|
||||
SIZE_ARG (2)
|
||||
/* Do not dereference srcin if no bytes to compare. */
|
||||
cbz cntin, L(zero_length)
|
||||
/*
|
||||
* Magic constant 0x40100401 allows us to identify which lane matches
|
||||
* the requested byte.
|
||||
*/
|
||||
mov wtmp2, #0x0401
|
||||
movk wtmp2, #0x4010, lsl #16
|
||||
dup vrepchr.16b, chrin
|
||||
/* Work with aligned 32-byte chunks */
|
||||
bic src, srcin, #31
|
||||
dup vrepmask.4s, wtmp2
|
||||
ands soff, srcin, #31
|
||||
and cntrem, cntin, #31
|
||||
b.eq L(loop)
|
||||
|
||||
/*
|
||||
* Input string is not 32-byte aligned. We calculate the syndrome
|
||||
* value for the aligned 32 bytes block containing the first bytes
|
||||
* and mask the irrelevant part.
|
||||
*/
|
||||
|
||||
ld1 {vdata1.16b, vdata2.16b}, [src], #32
|
||||
sub tmp, soff, #32
|
||||
adds cntin, cntin, tmp
|
||||
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
|
||||
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
|
||||
and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
|
||||
and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
|
||||
addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */
|
||||
addp vend.16b, vend.16b, vend.16b /* 128->64 */
|
||||
mov synd, vend.d[0]
|
||||
/* Clear the soff*2 lower bits */
|
||||
lsl tmp, soff, #1
|
||||
lsr synd, synd, tmp
|
||||
lsl synd, synd, tmp
|
||||
/* The first block can also be the last */
|
||||
b.ls L(masklast)
|
||||
/* Have we found something already? */
|
||||
cbnz synd, L(tail)
|
||||
|
||||
L(loop):
|
||||
ld1 {vdata1.16b, vdata2.16b}, [src], #32
|
||||
subs cntin, cntin, #32
|
||||
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
|
||||
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
|
||||
/* If we're out of data we finish regardless of the result */
|
||||
b.ls L(end)
|
||||
/* Use a fast check for the termination condition */
|
||||
orr vend.16b, vhas_chr1.16b, vhas_chr2.16b
|
||||
addp vend.2d, vend.2d, vend.2d
|
||||
mov synd, vend.d[0]
|
||||
/* We're not out of data, loop if we haven't found the character */
|
||||
cbz synd, L(loop)
|
||||
|
||||
L(end):
|
||||
/* Termination condition found, let's calculate the syndrome value */
|
||||
and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
|
||||
and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
|
||||
addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */
|
||||
addp vend.16b, vend.16b, vend.16b /* 128->64 */
|
||||
mov synd, vend.d[0]
|
||||
/* Only do the clear for the last possible block */
|
||||
b.hs L(tail)
|
||||
|
||||
L(masklast):
|
||||
/* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */
|
||||
add tmp, cntrem, soff
|
||||
and tmp, tmp, #31
|
||||
sub tmp, tmp, #32
|
||||
neg tmp, tmp, lsl #1
|
||||
lsl synd, synd, tmp
|
||||
lsr synd, synd, tmp
|
||||
|
||||
L(tail):
|
||||
/* Count the trailing zeros using bit reversing */
|
||||
rbit synd, synd
|
||||
/* Compensate the last post-increment */
|
||||
sub src, src, #32
|
||||
/* Check that we have found a character */
|
||||
cmp synd, #0
|
||||
/* And count the leading zeros */
|
||||
clz synd, synd
|
||||
/* Compute the potential result */
|
||||
add result, src, synd, lsr #1
|
||||
/* Select result or NULL */
|
||||
csel result, xzr, result, eq
|
||||
ret
|
||||
|
||||
L(zero_length):
|
||||
mov result, #0
|
||||
ret
|
||||
|
||||
END (__memchr_aarch64)
|
218
libc/intrin/aarch64/memcmp.S
Normal file
218
libc/intrin/aarch64/memcmp.S
Normal file
|
@ -0,0 +1,218 @@
|
|||
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
|
||||
│vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi│
|
||||
╚──────────────────────────────────────────────────────────────────────────────╝
|
||||
│ │
|
||||
│ Optimized Routines │
|
||||
│ Copyright (c) 1999-2022, Arm Limited. │
|
||||
│ │
|
||||
│ Permission is hereby granted, free of charge, to any person obtaining │
|
||||
│ a copy of this software and associated documentation files (the │
|
||||
│ "Software"), to deal in the Software without restriction, including │
|
||||
│ without limitation the rights to use, copy, modify, merge, publish, │
|
||||
│ distribute, sublicense, and/or sell copies of the Software, and to │
|
||||
│ permit persons to whom the Software is furnished to do so, subject to │
|
||||
│ the following conditions: │
|
||||
│ │
|
||||
│ The above copyright notice and this permission notice shall be │
|
||||
│ included in all copies or substantial portions of the Software. │
|
||||
│ │
|
||||
│ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, │
|
||||
│ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF │
|
||||
│ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. │
|
||||
│ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY │
|
||||
│ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, │
|
||||
│ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE │
|
||||
│ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. │
|
||||
│ │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/intrin/aarch64/asmdefs.h"
|
||||
|
||||
#define __memcmp_aarch64 memcmp
|
||||
|
||||
.ident "\n\
|
||||
Optimized Routines (MIT License)\n\
|
||||
Copyright 2022 ARM Limited\n"
|
||||
.include "libc/disclaimer.inc"
|
||||
|
||||
/* Assumptions:
|
||||
*
|
||||
* ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
|
||||
*/
|
||||
|
||||
#define src1 x0
|
||||
#define src2 x1
|
||||
#define limit x2
|
||||
#define result w0
|
||||
|
||||
#define data1 x3
|
||||
#define data1w w3
|
||||
#define data2 x4
|
||||
#define data2w w4
|
||||
#define data3 x5
|
||||
#define data3w w5
|
||||
#define data4 x6
|
||||
#define data4w w6
|
||||
#define tmp x6
|
||||
#define src1end x7
|
||||
#define src2end x8
|
||||
|
||||
|
||||
ENTRY (__memcmp_aarch64)
|
||||
PTR_ARG (0)
|
||||
PTR_ARG (1)
|
||||
SIZE_ARG (2)
|
||||
|
||||
cmp limit, 16
|
||||
b.lo L(less16)
|
||||
ldp data1, data3, [src1]
|
||||
ldp data2, data4, [src2]
|
||||
ccmp data1, data2, 0, ne
|
||||
ccmp data3, data4, 0, eq
|
||||
b.ne L(return2)
|
||||
|
||||
add src1end, src1, limit
|
||||
add src2end, src2, limit
|
||||
cmp limit, 32
|
||||
b.ls L(last_bytes)
|
||||
cmp limit, 160
|
||||
b.hs L(loop_align)
|
||||
sub limit, limit, 32
|
||||
|
||||
.p2align 4
|
||||
L(loop32):
|
||||
ldp data1, data3, [src1, 16]
|
||||
ldp data2, data4, [src2, 16]
|
||||
cmp data1, data2
|
||||
ccmp data3, data4, 0, eq
|
||||
b.ne L(return2)
|
||||
cmp limit, 16
|
||||
b.ls L(last_bytes)
|
||||
|
||||
ldp data1, data3, [src1, 32]
|
||||
ldp data2, data4, [src2, 32]
|
||||
cmp data1, data2
|
||||
ccmp data3, data4, 0, eq
|
||||
b.ne L(return2)
|
||||
add src1, src1, 32
|
||||
add src2, src2, 32
|
||||
L(last64):
|
||||
subs limit, limit, 32
|
||||
b.hi L(loop32)
|
||||
|
||||
/* Compare last 1-16 bytes using unaligned access. */
|
||||
L(last_bytes):
|
||||
ldp data1, data3, [src1end, -16]
|
||||
ldp data2, data4, [src2end, -16]
|
||||
L(return2):
|
||||
cmp data1, data2
|
||||
csel data1, data1, data3, ne
|
||||
csel data2, data2, data4, ne
|
||||
|
||||
/* Compare data bytes and set return value to 0, -1 or 1. */
|
||||
L(return):
|
||||
#ifndef __AARCH64EB__
|
||||
rev data1, data1
|
||||
rev data2, data2
|
||||
#endif
|
||||
cmp data1, data2
|
||||
cset result, ne
|
||||
cneg result, result, lo
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
L(less16):
|
||||
add src1end, src1, limit
|
||||
add src2end, src2, limit
|
||||
tbz limit, 3, L(less8)
|
||||
ldr data1, [src1]
|
||||
ldr data2, [src2]
|
||||
ldr data3, [src1end, -8]
|
||||
ldr data4, [src2end, -8]
|
||||
b L(return2)
|
||||
|
||||
.p2align 4
|
||||
L(less8):
|
||||
tbz limit, 2, L(less4)
|
||||
ldr data1w, [src1]
|
||||
ldr data2w, [src2]
|
||||
ldr data3w, [src1end, -4]
|
||||
ldr data4w, [src2end, -4]
|
||||
b L(return2)
|
||||
|
||||
L(less4):
|
||||
tbz limit, 1, L(less2)
|
||||
ldrh data1w, [src1]
|
||||
ldrh data2w, [src2]
|
||||
cmp data1w, data2w
|
||||
b.ne L(return)
|
||||
L(less2):
|
||||
mov result, 0
|
||||
tbz limit, 0, L(return_zero)
|
||||
ldrb data1w, [src1end, -1]
|
||||
ldrb data2w, [src2end, -1]
|
||||
sub result, data1w, data2w
|
||||
L(return_zero):
|
||||
ret
|
||||
|
||||
L(loop_align):
|
||||
ldp data1, data3, [src1, 16]
|
||||
ldp data2, data4, [src2, 16]
|
||||
cmp data1, data2
|
||||
ccmp data3, data4, 0, eq
|
||||
b.ne L(return2)
|
||||
|
||||
/* Align src2 and adjust src1, src2 and limit. */
|
||||
and tmp, src2, 15
|
||||
sub tmp, tmp, 16
|
||||
sub src2, src2, tmp
|
||||
add limit, limit, tmp
|
||||
sub src1, src1, tmp
|
||||
sub limit, limit, 64 + 16
|
||||
|
||||
.p2align 4
|
||||
L(loop64):
|
||||
ldr q0, [src1, 16]
|
||||
ldr q1, [src2, 16]
|
||||
subs limit, limit, 64
|
||||
ldr q2, [src1, 32]
|
||||
ldr q3, [src2, 32]
|
||||
eor v0.16b, v0.16b, v1.16b
|
||||
eor v1.16b, v2.16b, v3.16b
|
||||
ldr q2, [src1, 48]
|
||||
ldr q3, [src2, 48]
|
||||
umaxp v0.16b, v0.16b, v1.16b
|
||||
ldr q4, [src1, 64]!
|
||||
ldr q5, [src2, 64]!
|
||||
eor v1.16b, v2.16b, v3.16b
|
||||
eor v2.16b, v4.16b, v5.16b
|
||||
umaxp v1.16b, v1.16b, v2.16b
|
||||
umaxp v0.16b, v0.16b, v1.16b
|
||||
umaxp v0.16b, v0.16b, v0.16b
|
||||
fmov tmp, d0
|
||||
ccmp tmp, 0, 0, hi
|
||||
b.eq L(loop64)
|
||||
|
||||
/* If equal, process last 1-64 bytes using scalar loop. */
|
||||
add limit, limit, 64 + 16
|
||||
cbz tmp, L(last64)
|
||||
|
||||
/* Determine the 8-byte aligned offset of the first difference. */
|
||||
#ifdef __AARCH64EB__
|
||||
rev16 tmp, tmp
|
||||
#endif
|
||||
rev tmp, tmp
|
||||
clz tmp, tmp
|
||||
bic tmp, tmp, 7
|
||||
sub tmp, tmp, 48
|
||||
ldr data1, [src1, tmp]
|
||||
ldr data2, [src2, tmp]
|
||||
#ifndef __AARCH64EB__
|
||||
rev data1, data1
|
||||
rev data2, data2
|
||||
#endif
|
||||
mov result, 1
|
||||
cmp data1, data2
|
||||
cneg result, result, lo
|
||||
ret
|
||||
|
||||
END (__memcmp_aarch64)
|
233
libc/intrin/aarch64/memcpy.S
Normal file
233
libc/intrin/aarch64/memcpy.S
Normal file
|
@ -0,0 +1,233 @@
|
|||
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
|
||||
│vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi│
|
||||
╚──────────────────────────────────────────────────────────────────────────────╝
|
||||
│ │
|
||||
│ Optimized Routines │
|
||||
│ Copyright (c) 1999-2022, Arm Limited. │
|
||||
│ │
|
||||
│ Permission is hereby granted, free of charge, to any person obtaining │
|
||||
│ a copy of this software and associated documentation files (the │
|
||||
│ "Software"), to deal in the Software without restriction, including │
|
||||
│ without limitation the rights to use, copy, modify, merge, publish, │
|
||||
│ distribute, sublicense, and/or sell copies of the Software, and to │
|
||||
│ permit persons to whom the Software is furnished to do so, subject to │
|
||||
│ the following conditions: │
|
||||
│ │
|
||||
│ The above copyright notice and this permission notice shall be │
|
||||
│ included in all copies or substantial portions of the Software. │
|
||||
│ │
|
||||
│ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, │
|
||||
│ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF │
|
||||
│ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. │
|
||||
│ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY │
|
||||
│ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, │
|
||||
│ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE │
|
||||
│ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. │
|
||||
│ │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/intrin/aarch64/asmdefs.h"
|
||||
|
||||
#define __memcpy_aarch64_simd memcpy
|
||||
#define __memmove_aarch64_simd memmove
|
||||
|
||||
.ident "\n\
|
||||
Optimized Routines (MIT License)\n\
|
||||
Copyright 2022 ARM Limited\n"
|
||||
.include "libc/disclaimer.inc"
|
||||
|
||||
/* Assumptions:
|
||||
*
|
||||
* ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
|
||||
*
|
||||
*/
|
||||
|
||||
#define dstin x0
|
||||
#define src x1
|
||||
#define count x2
|
||||
#define dst x3
|
||||
#define srcend x4
|
||||
#define dstend x5
|
||||
#define A_l x6
|
||||
#define A_lw w6
|
||||
#define A_h x7
|
||||
#define B_l x8
|
||||
#define B_lw w8
|
||||
#define B_h x9
|
||||
#define C_lw w10
|
||||
#define tmp1 x14
|
||||
|
||||
#define A_q q0
|
||||
#define B_q q1
|
||||
#define C_q q2
|
||||
#define D_q q3
|
||||
#define E_q q4
|
||||
#define F_q q5
|
||||
#define G_q q6
|
||||
#define H_q q7
|
||||
|
||||
/* This implementation handles overlaps and supports both memcpy and memmove
|
||||
from a single entry point. It uses unaligned accesses and branchless
|
||||
sequences to keep the code small, simple and improve performance.
|
||||
|
||||
Copies are split into 3 main cases: small copies of up to 32 bytes, medium
|
||||
copies of up to 128 bytes, and large copies. The overhead of the overlap
|
||||
check is negligible since it is only required for large copies.
|
||||
|
||||
Large copies use a software pipelined loop processing 64 bytes per iteration.
|
||||
The source pointer is 16-byte aligned to minimize unaligned accesses.
|
||||
The loop tail is handled by always copying 64 bytes from the end.
|
||||
*/
|
||||
|
||||
ENTRY_ALIAS (__memmove_aarch64_simd)
|
||||
ENTRY (__memcpy_aarch64_simd)
|
||||
PTR_ARG (0)
|
||||
PTR_ARG (1)
|
||||
SIZE_ARG (2)
|
||||
add srcend, src, count
|
||||
add dstend, dstin, count
|
||||
cmp count, 128
|
||||
b.hi L(copy_long)
|
||||
cmp count, 32
|
||||
b.hi L(copy32_128)
|
||||
|
||||
/* Small copies: 0..32 bytes. */
|
||||
cmp count, 16
|
||||
b.lo L(copy16)
|
||||
ldr A_q, [src]
|
||||
ldr B_q, [srcend, -16]
|
||||
str A_q, [dstin]
|
||||
str B_q, [dstend, -16]
|
||||
ret
|
||||
|
||||
/* Copy 8-15 bytes. */
|
||||
L(copy16):
|
||||
tbz count, 3, L(copy8)
|
||||
ldr A_l, [src]
|
||||
ldr A_h, [srcend, -8]
|
||||
str A_l, [dstin]
|
||||
str A_h, [dstend, -8]
|
||||
ret
|
||||
|
||||
.p2align 3
|
||||
/* Copy 4-7 bytes. */
|
||||
L(copy8):
|
||||
tbz count, 2, L(copy4)
|
||||
ldr A_lw, [src]
|
||||
ldr B_lw, [srcend, -4]
|
||||
str A_lw, [dstin]
|
||||
str B_lw, [dstend, -4]
|
||||
ret
|
||||
|
||||
/* Copy 0..3 bytes using a branchless sequence. */
|
||||
L(copy4):
|
||||
cbz count, L(copy0)
|
||||
lsr tmp1, count, 1
|
||||
ldrb A_lw, [src]
|
||||
ldrb C_lw, [srcend, -1]
|
||||
ldrb B_lw, [src, tmp1]
|
||||
strb A_lw, [dstin]
|
||||
strb B_lw, [dstin, tmp1]
|
||||
strb C_lw, [dstend, -1]
|
||||
L(copy0):
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
/* Medium copies: 33..128 bytes. */
|
||||
L(copy32_128):
|
||||
ldp A_q, B_q, [src]
|
||||
ldp C_q, D_q, [srcend, -32]
|
||||
cmp count, 64
|
||||
b.hi L(copy128)
|
||||
stp A_q, B_q, [dstin]
|
||||
stp C_q, D_q, [dstend, -32]
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
/* Copy 65..128 bytes. */
|
||||
L(copy128):
|
||||
ldp E_q, F_q, [src, 32]
|
||||
cmp count, 96
|
||||
b.ls L(copy96)
|
||||
ldp G_q, H_q, [srcend, -64]
|
||||
stp G_q, H_q, [dstend, -64]
|
||||
L(copy96):
|
||||
stp A_q, B_q, [dstin]
|
||||
stp E_q, F_q, [dstin, 32]
|
||||
stp C_q, D_q, [dstend, -32]
|
||||
ret
|
||||
|
||||
/* Copy more than 128 bytes. */
|
||||
L(copy_long):
|
||||
/* Use backwards copy if there is an overlap. */
|
||||
sub tmp1, dstin, src
|
||||
cmp tmp1, count
|
||||
b.lo L(copy_long_backwards)
|
||||
|
||||
/* Copy 16 bytes and then align src to 16-byte alignment. */
|
||||
ldr D_q, [src]
|
||||
and tmp1, src, 15
|
||||
bic src, src, 15
|
||||
sub dst, dstin, tmp1
|
||||
add count, count, tmp1 /* Count is now 16 too large. */
|
||||
ldp A_q, B_q, [src, 16]
|
||||
str D_q, [dstin]
|
||||
ldp C_q, D_q, [src, 48]
|
||||
subs count, count, 128 + 16 /* Test and readjust count. */
|
||||
b.ls L(copy64_from_end)
|
||||
L(loop64):
|
||||
stp A_q, B_q, [dst, 16]
|
||||
ldp A_q, B_q, [src, 80]
|
||||
stp C_q, D_q, [dst, 48]
|
||||
ldp C_q, D_q, [src, 112]
|
||||
add src, src, 64
|
||||
add dst, dst, 64
|
||||
subs count, count, 64
|
||||
b.hi L(loop64)
|
||||
|
||||
/* Write the last iteration and copy 64 bytes from the end. */
|
||||
L(copy64_from_end):
|
||||
ldp E_q, F_q, [srcend, -64]
|
||||
stp A_q, B_q, [dst, 16]
|
||||
ldp A_q, B_q, [srcend, -32]
|
||||
stp C_q, D_q, [dst, 48]
|
||||
stp E_q, F_q, [dstend, -64]
|
||||
stp A_q, B_q, [dstend, -32]
|
||||
ret
|
||||
|
||||
/* Large backwards copy for overlapping copies.
|
||||
Copy 16 bytes and then align srcend to 16-byte alignment. */
|
||||
L(copy_long_backwards):
|
||||
cbz tmp1, L(copy0)
|
||||
ldr D_q, [srcend, -16]
|
||||
and tmp1, srcend, 15
|
||||
bic srcend, srcend, 15
|
||||
sub count, count, tmp1
|
||||
ldp A_q, B_q, [srcend, -32]
|
||||
str D_q, [dstend, -16]
|
||||
ldp C_q, D_q, [srcend, -64]
|
||||
sub dstend, dstend, tmp1
|
||||
subs count, count, 128
|
||||
b.ls L(copy64_from_start)
|
||||
|
||||
L(loop64_backwards):
|
||||
str B_q, [dstend, -16]
|
||||
str A_q, [dstend, -32]
|
||||
ldp A_q, B_q, [srcend, -96]
|
||||
str D_q, [dstend, -48]
|
||||
str C_q, [dstend, -64]!
|
||||
ldp C_q, D_q, [srcend, -128]
|
||||
sub srcend, srcend, 64
|
||||
subs count, count, 64
|
||||
b.hi L(loop64_backwards)
|
||||
|
||||
/* Write the last iteration and copy 64 bytes from the start. */
|
||||
L(copy64_from_start):
|
||||
ldp E_q, F_q, [src, 32]
|
||||
stp A_q, B_q, [dstend, -32]
|
||||
ldp A_q, B_q, [src]
|
||||
stp C_q, D_q, [dstend, -64]
|
||||
stp E_q, F_q, [dstin, 32]
|
||||
stp A_q, B_q, [dstin]
|
||||
ret
|
||||
|
||||
END (__memcpy_aarch64_simd)
|
138
libc/intrin/aarch64/memrchr.S
Normal file
138
libc/intrin/aarch64/memrchr.S
Normal file
|
@ -0,0 +1,138 @@
|
|||
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
|
||||
│vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi│
|
||||
╚──────────────────────────────────────────────────────────────────────────────╝
|
||||
│ │
|
||||
│ Optimized Routines │
|
||||
│ Copyright (c) 1999-2022, Arm Limited. │
|
||||
│ │
|
||||
│ Permission is hereby granted, free of charge, to any person obtaining │
|
||||
│ a copy of this software and associated documentation files (the │
|
||||
│ "Software"), to deal in the Software without restriction, including │
|
||||
│ without limitation the rights to use, copy, modify, merge, publish, │
|
||||
│ distribute, sublicense, and/or sell copies of the Software, and to │
|
||||
│ permit persons to whom the Software is furnished to do so, subject to │
|
||||
│ the following conditions: │
|
||||
│ │
|
||||
│ The above copyright notice and this permission notice shall be │
|
||||
│ included in all copies or substantial portions of the Software. │
|
||||
│ │
|
||||
│ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, │
|
||||
│ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF │
|
||||
│ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. │
|
||||
│ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY │
|
||||
│ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, │
|
||||
│ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE │
|
||||
│ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. │
|
||||
│ │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/intrin/aarch64/asmdefs.h"
|
||||
|
||||
#define __memrchr_aarch64 memrchr
|
||||
|
||||
.ident "\n\
|
||||
Optimized Routines (MIT License)\n\
|
||||
Copyright 2022 ARM Limited\n"
|
||||
.include "libc/disclaimer.inc"
|
||||
|
||||
/* Assumptions:
|
||||
*
|
||||
* ARMv8-a, AArch64, Advanced SIMD.
|
||||
* MTE compatible.
|
||||
*/
|
||||
|
||||
#define srcin x0
|
||||
#define chrin w1
|
||||
#define cntin x2
|
||||
#define result x0
|
||||
|
||||
#define src x3
|
||||
#define cntrem x4
|
||||
#define synd x5
|
||||
#define shift x6
|
||||
#define tmp x7
|
||||
#define end x8
|
||||
#define endm1 x9
|
||||
|
||||
#define vrepchr v0
|
||||
#define qdata q1
|
||||
#define vdata v1
|
||||
#define vhas_chr v2
|
||||
#define vend v3
|
||||
#define dend d3
|
||||
|
||||
/*
|
||||
Core algorithm:
|
||||
For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
|
||||
per byte. We take 4 bits of every comparison byte with shift right and narrow
|
||||
by 4 instruction. Since the bits in the nibble mask reflect the order in
|
||||
which things occur in the original string, counting leading zeros identifies
|
||||
exactly which byte matched. */
|
||||
|
||||
/* memrchr: scan the cntin bytes starting at srcin backwards, 16 bytes at a
   time, for the byte held in chrin.  On return x0 holds the address of the
   last matching byte, or 0 when cntin is 0 or nothing matched.  */
ENTRY (__memrchr_aarch64)
|
||||
PTR_ARG (0)
|
||||
add end, srcin, cntin /* end = one past the last byte of the buffer */
|
||||
sub endm1, end, 1 /* endm1 = address of the last byte */
|
||||
bic src, endm1, 15 /* 16-byte aligned chunk that holds the last byte */
|
||||
cbz cntin, L(nomatch) /* empty buffer: return 0 */
|
||||
ld1 {vdata.16b}, [src]
|
||||
dup vrepchr.16b, chrin /* broadcast the search byte to all 16 lanes */
|
||||
cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
|
||||
neg shift, end, lsl 2 /* shift amount: 4 mask bits per byte */
|
||||
shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
|
||||
fmov synd, dend
|
||||
lsl synd, synd, shift /* push out the nibbles of bytes at or past end */
|
||||
cbz synd, L(start_loop) /* no match in the final chunk */
|
||||
|
||||
clz synd, synd /* leading zeros = 4 * (distance of the match below endm1) */
|
||||
sub result, endm1, synd, lsr 2
|
||||
cmp cntin, synd, lsr 2
|
||||
csel result, result, xzr, hi /* keep the match only if cntin > distance, i.e. it is not before srcin */
|
||||
ret
|
||||
|
||||
nop
|
||||
L(start_loop):
|
||||
subs cntrem, src, srcin /* bytes that remain below the chunk just checked */
|
||||
b.ls L(nomatch) /* that chunk already covered srcin */
|
||||
|
||||
/* Make sure that it won't overread by a 16-byte chunk */
|
||||
sub cntrem, cntrem, 1
|
||||
tbz cntrem, 4, L(loop32_2)
|
||||
add src, src, 16
|
||||
|
||||
.p2align 5
|
||||
L(loop32):
|
||||
ldr qdata, [src, -32]! /* step src back 32 bytes and load that chunk */
|
||||
cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
|
||||
umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
|
||||
fmov synd, dend
|
||||
cbnz synd, L(end)
|
||||
|
||||
L(loop32_2):
|
||||
ldr qdata, [src, -16] /* chunk just below src; src itself is not updated */
|
||||
subs cntrem, cntrem, 32
|
||||
cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
|
||||
b.lo L(end_2) /* remaining count exhausted: finish with this chunk */
|
||||
umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
|
||||
fmov synd, dend
|
||||
cbz synd, L(loop32)
|
||||
L(end_2):
|
||||
sub src, src, 16 /* make src point at the chunk loaded in loop32_2 */
|
||||
L(end):
|
||||
shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
|
||||
fmov synd, dend
|
||||
|
||||
add tmp, src, 15 /* address of the last byte of the chunk at src */
|
||||
#ifdef __AARCH64EB__
|
||||
rbit synd, synd
|
||||
#endif
|
||||
clz synd, synd
|
||||
sub tmp, tmp, synd, lsr 2 /* 4 mask bits per byte */
|
||||
cmp tmp, srcin
|
||||
csel result, tmp, xzr, hs /* an address below srcin is reported as 0 */
|
||||
ret
|
||||
|
||||
L(nomatch):
|
||||
mov result, 0
|
||||
ret
|
||||
|
||||
END (__memrchr_aarch64)
|
143
libc/intrin/aarch64/memset.S
Normal file
143
libc/intrin/aarch64/memset.S
Normal file
|
@ -0,0 +1,143 @@
|
|||
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
|
||||
│vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi│
|
||||
╚──────────────────────────────────────────────────────────────────────────────╝
|
||||
│ │
|
||||
│ Optimized Routines │
|
||||
│ Copyright (c) 1999-2022, Arm Limited. │
|
||||
│ │
|
||||
│ Permission is hereby granted, free of charge, to any person obtaining │
|
||||
│ a copy of this software and associated documentation files (the │
|
||||
│ "Software"), to deal in the Software without restriction, including │
|
||||
│ without limitation the rights to use, copy, modify, merge, publish, │
|
||||
│ distribute, sublicense, and/or sell copies of the Software, and to │
|
||||
│ permit persons to whom the Software is furnished to do so, subject to │
|
||||
│ the following conditions: │
|
||||
│ │
|
||||
│ The above copyright notice and this permission notice shall be │
|
||||
│ included in all copies or substantial portions of the Software. │
|
||||
│ │
|
||||
│ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, │
|
||||
│ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF │
|
||||
│ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. │
|
||||
│ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY │
|
||||
│ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, │
|
||||
│ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE │
|
||||
│ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. │
|
||||
│ │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/intrin/aarch64/asmdefs.h"
|
||||
|
||||
#define __memset_aarch64 memset
|
||||
|
||||
.ident "\n\
|
||||
Optimized Routines (MIT License)\n\
|
||||
Copyright 2022 ARM Limited\n"
|
||||
.include "libc/disclaimer.inc"
|
||||
|
||||
/* Assumptions:
|
||||
*
|
||||
* ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
|
||||
*
|
||||
*/
|
||||
|
||||
#define dstin x0
|
||||
#define val x1
|
||||
#define valw w1
|
||||
#define count x2
|
||||
#define dst x3
|
||||
#define dstend x4
|
||||
#define zva_val x5
|
||||
|
||||
/* memset: store the low byte of valw into the count bytes starting at dstin.
   dstin (x0) is never written by this routine, so it still holds the
   destination pointer on return.  Small sizes use pairs of possibly
   overlapping stores anchored at the start and at the end of the buffer.  */
ENTRY (__memset_aarch64)
|
||||
PTR_ARG (0)
|
||||
SIZE_ARG (2)
|
||||
|
||||
dup v0.16B, valw /* replicate the fill byte into all 16 lanes of v0/q0 */
|
||||
add dstend, dstin, count /* dstend = one past the last byte to set */
|
||||
|
||||
cmp count, 96
|
||||
b.hi L(set_long) /* more than 96 bytes */
|
||||
cmp count, 16
|
||||
b.hs L(set_medium) /* 16..96 bytes */
|
||||
mov val, v0.D[0] /* val = fill byte repeated in all 8 bytes of x1 */
|
||||
|
||||
/* Set 0..15 bytes. */
|
||||
tbz count, 3, 1f /* bit 3 clear: fewer than 8 bytes */
|
||||
str val, [dstin]
|
||||
str val, [dstend, -8] /* 8..15 bytes: two 8-byte stores that may overlap */
|
||||
ret
|
||||
.p2align 4
|
||||
1: tbz count, 2, 2f
|
||||
str valw, [dstin]
|
||||
str valw, [dstend, -4] /* 4..7 bytes: two 4-byte stores that may overlap */
|
||||
ret
|
||||
2: cbz count, 3f
|
||||
strb valw, [dstin]
|
||||
tbz count, 1, 3f /* count == 1: the single byte store was enough */
|
||||
strh valw, [dstend, -2] /* count is 2 or 3: cover the last two bytes */
|
||||
3: ret
|
||||
|
||||
/* Set 16..96 bytes. */
|
||||
L(set_medium):
|
||||
str q0, [dstin]
|
||||
tbnz count, 6, L(set96) /* bit 6 set: count is 64..96 */
|
||||
str q0, [dstend, -16]
|
||||
tbz count, 5, 1f /* bit 5 clear: count < 32, first and last 16 bytes suffice */
|
||||
str q0, [dstin, 16]
|
||||
str q0, [dstend, -32]
|
||||
1: ret
|
||||
|
||||
.p2align 4
|
||||
/* Set 64..96 bytes. Write 64 bytes from the start and
|
||||
32 bytes from the end. */
|
||||
L(set96):
|
||||
str q0, [dstin, 16]
|
||||
stp q0, q0, [dstin, 32]
|
||||
stp q0, q0, [dstend, -32]
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
L(set_long):
|
||||
and valw, valw, 255 /* keep only the fill byte for the zero test below */
|
||||
bic dst, dstin, 15 /* dst = dstin rounded down to 16 bytes */
|
||||
str q0, [dstin] /* first 16 bytes, written at the unaligned start */
|
||||
cmp count, 160
|
||||
ccmp valw, 0, 0, hs /* NE unless count >= 160 and the fill byte is 0 */
|
||||
b.ne L(no_zva)
|
||||
|
||||
#ifndef SKIP_ZVA_CHECK
|
||||
mrs zva_val, dczid_el0
|
||||
and zva_val, zva_val, 31 /* keep the low 5 bits of DCZID_EL0 */
|
||||
cmp zva_val, 4 /* ZVA size is 64 bytes. */
|
||||
b.ne L(no_zva)
|
||||
#endif
|
||||
str q0, [dst, 16]
|
||||
stp q0, q0, [dst, 32]
|
||||
bic dst, dst, 63 /* round dst down to a 64-byte boundary for dc zva */
|
||||
sub count, dstend, dst /* Count is now 64 too large. */
|
||||
sub count, count, 128 /* Adjust count and bias for loop. */
|
||||
|
||||
.p2align 4
|
||||
L(zva_loop):
|
||||
add dst, dst, 64
|
||||
dc zva, dst
|
||||
subs count, count, 64
|
||||
b.hi L(zva_loop)
|
||||
stp q0, q0, [dstend, -64] /* last 64 bytes are written with ordinary stores */
|
||||
stp q0, q0, [dstend, -32]
|
||||
ret
|
||||
|
||||
L(no_zva):
|
||||
sub count, dstend, dst /* Count is 16 too large. */
|
||||
sub dst, dst, 16 /* Dst is biased by -32. */
|
||||
sub count, count, 64 + 16 /* Adjust count and bias for loop. */
|
||||
L(no_zva_loop):
|
||||
stp q0, q0, [dst, 32]
|
||||
stp q0, q0, [dst, 64]! /* pre-index: dst advances by 64 each iteration */
|
||||
subs count, count, 64
|
||||
b.hi L(no_zva_loop)
|
||||
stp q0, q0, [dstend, -64] /* last 64 bytes, anchored at the end of the buffer */
|
||||
stp q0, q0, [dstend, -32]
|
||||
ret
|
||||
|
||||
END (__memset_aarch64)
|
175
libc/intrin/aarch64/stpcpy.S
Normal file
175
libc/intrin/aarch64/stpcpy.S
Normal file
|
@ -0,0 +1,175 @@
|
|||
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
|
||||
│vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi│
|
||||
╚──────────────────────────────────────────────────────────────────────────────╝
|
||||
│ │
|
||||
│ Optimized Routines │
|
||||
│ Copyright (c) 1999-2022, Arm Limited. │
|
||||
│ │
|
||||
│ Permission is hereby granted, free of charge, to any person obtaining │
|
||||
│ a copy of this software and associated documentation files (the │
|
||||
│ "Software"), to deal in the Software without restriction, including │
|
||||
│ without limitation the rights to use, copy, modify, merge, publish, │
|
||||
│ distribute, sublicense, and/or sell copies of the Software, and to │
|
||||
│ permit persons to whom the Software is furnished to do so, subject to │
|
||||
│ the following conditions: │
|
||||
│ │
|
||||
│ The above copyright notice and this permission notice shall be │
|
||||
│ included in all copies or substantial portions of the Software. │
|
||||
│ │
|
||||
│ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, │
|
||||
│ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF │
|
||||
│ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. │
|
||||
│ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY │
|
||||
│ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, │
|
||||
│ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE │
|
||||
│ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. │
|
||||
│ │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/intrin/aarch64/asmdefs.h"
|
||||
|
||||
#define __stpcpy_aarch64 stpcpy
|
||||
|
||||
.ident "\n\
|
||||
Optimized Routines (MIT License)\n\
|
||||
Copyright 2022 ARM Limited\n"
|
||||
.include "libc/disclaimer.inc"
|
||||
|
||||
/* Assumptions:
|
||||
*
|
||||
* ARMv8-a, AArch64, Advanced SIMD.
|
||||
* MTE compatible.
|
||||
*/
|
||||
|
||||
#define dstin x0
|
||||
#define srcin x1
|
||||
#define result x0
|
||||
|
||||
#define src x2
|
||||
#define dst x3
|
||||
#define len x4
|
||||
#define synd x4
|
||||
#define tmp x5
|
||||
#define shift x5
|
||||
#define data1 x6
|
||||
#define dataw1 w6
|
||||
#define data2 x7
|
||||
#define dataw2 w7
|
||||
|
||||
#define dataq q0
|
||||
#define vdata v0
|
||||
#define vhas_nul v1
|
||||
#define vend v2
|
||||
#define dend d2
|
||||
#define dataq2 q1
|
||||
|
||||
/*
|
||||
Core algorithm:
|
||||
For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
|
||||
per byte. We take 4 bits of every comparison byte with shift right and narrow
|
||||
by 4 instruction. Since the bits in the nibble mask reflect the order in
|
||||
which things occur in the original string, counting trailing zeros (rbit + clz) identifies
|
||||
exactly which byte matched. */
|
||||
|
||||
/* stpcpy: copy the NUL-terminated string at srcin to dstin, including the
   terminator.  On return x0 holds the address of the NUL byte written to the
   destination (dstin + string length).  Short strings are copied with two
   possibly overlapping accesses: one at the start and one ending at the NUL.  */
ENTRY (__stpcpy_aarch64)
|
||||
PTR_ARG (0)
|
||||
PTR_ARG (1)
|
||||
bic src, srcin, 15 /* aligned 16-byte chunk containing the first byte */
|
||||
ld1 {vdata.16b}, [src]
|
||||
cmeq vhas_nul.16b, vdata.16b, 0
|
||||
lsl shift, srcin, 2 /* 4 mask bits per byte */
|
||||
shrn vend.8b, vhas_nul.8h, 4
|
||||
fmov synd, dend
|
||||
lsr synd, synd, shift /* discard the nibbles of bytes that precede srcin */
|
||||
cbnz synd, L(tail) /* NUL found inside the first chunk */
|
||||
|
||||
ldr dataq, [src, 16]! /* advance src to the next aligned chunk and load it */
|
||||
cmeq vhas_nul.16b, vdata.16b, 0
|
||||
shrn vend.8b, vhas_nul.8h, 4
|
||||
fmov synd, dend
|
||||
cbz synd, L(start_loop) /* no NUL in the first two chunks: use the loop */
|
||||
|
||||
#ifndef __AARCH64EB__
|
||||
rbit synd, synd
|
||||
#endif
|
||||
sub tmp, src, srcin /* bytes between srcin and the second chunk */
|
||||
clz len, synd
|
||||
add len, tmp, len, lsr 2 /* len = offset of the NUL from srcin */
|
||||
tbz len, 4, L(less16) /* bit 4 clear: len < 16 */
|
||||
sub tmp, len, 15 /* offset of the 16 bytes that end at the NUL */
|
||||
ldr dataq, [srcin]
|
||||
ldr dataq2, [srcin, tmp]
|
||||
str dataq, [dstin]
|
||||
str dataq2, [dstin, tmp]
|
||||
add result, dstin, len /* return the address of the copied NUL */
|
||||
ret
|
||||
|
||||
L(tail):
|
||||
rbit synd, synd
|
||||
clz len, synd
|
||||
lsr len, len, 2 /* len = offset of the NUL from srcin */
|
||||
L(less16):
|
||||
tbz len, 3, L(less8) /* bit 3 clear: len < 8 */
|
||||
sub tmp, len, 7 /* offset of the 8 bytes that end at the NUL */
|
||||
ldr data1, [srcin]
|
||||
ldr data2, [srcin, tmp]
|
||||
str data1, [dstin]
|
||||
str data2, [dstin, tmp]
|
||||
add result, dstin, len
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
L(less8):
|
||||
subs tmp, len, 3 /* offset of the 4 bytes that end at the NUL */
|
||||
b.lo L(less4) /* len < 3 */
|
||||
ldr dataw1, [srcin]
|
||||
ldr dataw2, [srcin, tmp]
|
||||
str dataw1, [dstin]
|
||||
str dataw2, [dstin, tmp]
|
||||
add result, dstin, len
|
||||
ret
|
||||
|
||||
L(less4):
|
||||
cbz len, L(zerobyte) /* empty string: only the terminator is stored */
|
||||
ldrh dataw1, [srcin] /* len is 1 or 2: copy the first two bytes */
|
||||
strh dataw1, [dstin]
|
||||
L(zerobyte):
|
||||
strb wzr, [dstin, len] /* write the terminating NUL */
|
||||
add result, dstin, len
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
L(start_loop):
|
||||
sub tmp, srcin, dstin /* tmp = source minus destination distance */
|
||||
ldr dataq2, [srcin]
|
||||
sub dst, src, tmp /* dst = destination address that corresponds to src */
|
||||
str dataq2, [dstin] /* copy the first 16 bytes from the unaligned start */
|
||||
L(loop):
|
||||
str dataq, [dst], 32 /* store the chunk already checked; dst += 32 */
|
||||
ldr dataq, [src, 16]
|
||||
cmeq vhas_nul.16b, vdata.16b, 0
|
||||
umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
|
||||
fmov synd, dend
|
||||
cbnz synd, L(loopend)
|
||||
str dataq, [dst, -16]
|
||||
ldr dataq, [src, 32]! /* src += 32 */
|
||||
cmeq vhas_nul.16b, vdata.16b, 0
|
||||
umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
|
||||
fmov synd, dend
|
||||
cbz synd, L(loop)
|
||||
add dst, dst, 16 /* same dst bias as the exit taken from the first half */
|
||||
L(loopend):
|
||||
shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
|
||||
fmov synd, dend
|
||||
sub dst, dst, 31
|
||||
#ifndef __AARCH64EB__
|
||||
rbit synd, synd
|
||||
#endif
|
||||
clz len, synd
|
||||
lsr len, len, 2 /* index of the NUL within the last chunk */
|
||||
add dst, dst, len /* dst + 15 is now the destination of the NUL */
|
||||
ldr dataq, [dst, tmp] /* dst + tmp is the matching source address */
|
||||
str dataq, [dst] /* copy the 16 bytes that end at the NUL */
|
||||
add result, dst, 15 /* return the address of the copied NUL */
|
||||
ret
|
||||
|
||||
END (__stpcpy_aarch64)
|
152
libc/intrin/aarch64/strchr.S
Normal file
152
libc/intrin/aarch64/strchr.S
Normal file
|
@ -0,0 +1,152 @@
|
|||
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
|
||||
│vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi│
|
||||
╚──────────────────────────────────────────────────────────────────────────────╝
|
||||
│ │
|
||||
│ Optimized Routines │
|
||||
│ Copyright (c) 1999-2022, Arm Limited. │
|
||||
│ │
|
||||
│ Permission is hereby granted, free of charge, to any person obtaining │
|
||||
│ a copy of this software and associated documentation files (the │
|
||||
│ "Software"), to deal in the Software without restriction, including │
|
||||
│ without limitation the rights to use, copy, modify, merge, publish, │
|
||||
│ distribute, sublicense, and/or sell copies of the Software, and to │
|
||||
│ permit persons to whom the Software is furnished to do so, subject to │
|
||||
│ the following conditions: │
|
||||
│ │
|
||||
│ The above copyright notice and this permission notice shall be │
|
||||
│ included in all copies or substantial portions of the Software. │
|
||||
│ │
|
||||
│ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, │
|
||||
│ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF │
|
||||
│ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. │
|
||||
│ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY │
|
||||
│ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, │
|
||||
│ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE │
|
||||
│ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. │
|
||||
│ │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/intrin/aarch64/asmdefs.h"
|
||||
|
||||
#define __strchr_aarch64 strchr
|
||||
|
||||
.ident "\n\
|
||||
Optimized Routines (MIT License)\n\
|
||||
Copyright 2022 ARM Limited\n"
|
||||
.include "libc/disclaimer.inc"
|
||||
|
||||
/* Assumptions:
|
||||
*
|
||||
* ARMv8-a, AArch64
|
||||
* Neon Available.
|
||||
*/
|
||||
|
||||
/* Arguments and results. */
|
||||
#define srcin x0
|
||||
#define chrin w1
|
||||
|
||||
#define result x0
|
||||
|
||||
#define src x2
|
||||
#define tmp1 x3
|
||||
#define wtmp2 w4
|
||||
#define tmp3 x5
|
||||
|
||||
#define vrepchr v0
|
||||
#define vdata1 v1
|
||||
#define vdata2 v2
|
||||
#define vhas_nul1 v3
|
||||
#define vhas_nul2 v4
|
||||
#define vhas_chr1 v5
|
||||
#define vhas_chr2 v6
|
||||
#define vrepmask_0 v7
|
||||
#define vrepmask_c v16
|
||||
#define vend1 v17
|
||||
#define vend2 v18
|
||||
|
||||
/* Core algorithm.
|
||||
|
||||
For each 32-byte hunk we calculate a 64-bit syndrome value, with
|
||||
two bits per byte (LSB is always in bits 0 and 1, for both big
|
||||
and little-endian systems). For each tuple, bit 0 is set iff
|
||||
the relevant byte matched the requested character; bit 1 is set
|
||||
iff the relevant byte matched the NUL end of string (we trigger
|
||||
off bit0 for the special case of looking for NUL). Since the bits
|
||||
in the syndrome reflect exactly the order in which things occur
|
||||
in the original string a count_trailing_zeros() operation will
|
||||
identify exactly which byte is causing the termination, and why. */
|
||||
|
||||
/* Locals and temporaries. */
|
||||
|
||||
/* strchr: scan the NUL-terminated string at srcin in aligned 32-byte hunks
   for the byte held in chrin.  On return x0 holds the address of the first
   matching byte, or 0 if the terminating NUL is reached first and chrin is
   not itself NUL.  */
ENTRY (__strchr_aarch64)
|
||||
PTR_ARG (0)
|
||||
/* Magic constant 0xc0300c03 to allow us to identify which lane
|
||||
matches the requested byte. Even bits are set if the character
|
||||
matches, odd bits if either the char is NUL or matches. */
|
||||
mov wtmp2, 0x0c03
|
||||
movk wtmp2, 0xc030, lsl 16
|
||||
dup vrepchr.16b, chrin /* broadcast the search byte to all 16 lanes */
|
||||
bic src, srcin, #31 /* Work with aligned 32-byte hunks. */
|
||||
dup vrepmask_c.4s, wtmp2
|
||||
ands tmp1, srcin, #31 /* tmp1 = offset of srcin within its hunk */
|
||||
add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */
|
||||
b.eq L(loop) /* already 32-byte aligned */
|
||||
|
||||
/* Input string is not 32-byte aligned. Rather than forcing
|
||||
the padding bytes to a safe value, we calculate the syndrome
|
||||
for all the bytes, but then mask off those bits of the
|
||||
syndrome that are related to the padding. */
|
||||
ld1 {vdata1.16b, vdata2.16b}, [src], #32
|
||||
neg tmp1, tmp1
|
||||
cmeq vhas_nul1.16b, vdata1.16b, #0
|
||||
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
|
||||
cmeq vhas_nul2.16b, vdata2.16b, #0
|
||||
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
|
||||
bif vhas_nul1.16b, vhas_chr1.16b, vrepmask_0.16b
|
||||
bif vhas_nul2.16b, vhas_chr2.16b, vrepmask_0.16b
|
||||
and vend1.16b, vhas_nul1.16b, vrepmask_c.16b
|
||||
and vend2.16b, vhas_nul2.16b, vrepmask_c.16b
|
||||
lsl tmp1, tmp1, #1 /* two syndrome bits per byte */
|
||||
addp vend1.16b, vend1.16b, vend2.16b // 256->128
|
||||
mov tmp3, #~0
|
||||
addp vend1.16b, vend1.16b, vend2.16b // 128->64 (only d[0] is read below)
|
||||
lsr tmp1, tmp3, tmp1 /* tmp1 = mask covering the bits of bytes before srcin */
|
||||
|
||||
mov tmp3, vend1.d[0]
|
||||
bic tmp1, tmp3, tmp1 // Mask padding bits.
|
||||
cbnz tmp1, L(tail)
|
||||
|
||||
.p2align 4
|
||||
L(loop):
|
||||
ld1 {vdata1.16b, vdata2.16b}, [src], #32
|
||||
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
|
||||
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
|
||||
cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b /* lane set when the byte matches or is NUL */
|
||||
cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b
|
||||
orr vend1.16b, vhas_nul1.16b, vhas_nul2.16b
|
||||
umaxp vend1.16b, vend1.16b, vend1.16b
|
||||
mov tmp1, vend1.d[0]
|
||||
cbz tmp1, L(loop) /* neither a match nor a NUL in this hunk */
|
||||
|
||||
/* Termination condition found. Now need to establish exactly why
|
||||
we terminated. */
|
||||
bif vhas_nul1.16b, vhas_chr1.16b, vrepmask_0.16b
|
||||
bif vhas_nul2.16b, vhas_chr2.16b, vrepmask_0.16b
|
||||
and vend1.16b, vhas_nul1.16b, vrepmask_c.16b
|
||||
and vend2.16b, vhas_nul2.16b, vrepmask_c.16b
|
||||
addp vend1.16b, vend1.16b, vend2.16b // 256->128
|
||||
addp vend1.16b, vend1.16b, vend2.16b // 128->64 (only d[0] is read below)
|
||||
mov tmp1, vend1.d[0]
|
||||
L(tail):
|
||||
/* Count the trailing zeros, by bit reversing... */
|
||||
rbit tmp1, tmp1
|
||||
/* Re-bias source. */
|
||||
sub src, src, #32
|
||||
clz tmp1, tmp1 /* And counting the leading zeros. */
|
||||
/* Tmp1 is even if the target character was found first. Otherwise
|
||||
we've found the end of string and we weren't looking for NUL. */
|
||||
tst tmp1, #1
|
||||
add result, src, tmp1, lsr #1 /* two syndrome bits per byte */
|
||||
csel result, result, xzr, eq /* odd bit index: NUL came first, return 0 */
|
||||
ret
|
||||
|
||||
END (__strchr_aarch64)
|
140
libc/intrin/aarch64/strchrnul.S
Normal file
140
libc/intrin/aarch64/strchrnul.S
Normal file
|
@ -0,0 +1,140 @@
|
|||
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
|
||||
│vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi│
|
||||
╚──────────────────────────────────────────────────────────────────────────────╝
|
||||
│ │
|
||||
│ Optimized Routines │
|
||||
│ Copyright (c) 1999-2022, Arm Limited. │
|
||||
│ │
|
||||
│ Permission is hereby granted, free of charge, to any person obtaining │
|
||||
│ a copy of this software and associated documentation files (the │
|
||||
│ "Software"), to deal in the Software without restriction, including │
|
||||
│ without limitation the rights to use, copy, modify, merge, publish, │
|
||||
│ distribute, sublicense, and/or sell copies of the Software, and to │
|
||||
│ permit persons to whom the Software is furnished to do so, subject to │
|
||||
│ the following conditions: │
|
||||
│ │
|
||||
│ The above copyright notice and this permission notice shall be │
|
||||
│ included in all copies or substantial portions of the Software. │
|
||||
│ │
|
||||
│ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, │
|
||||
│ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF │
|
||||
│ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. │
|
||||
│ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY │
|
||||
│ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, │
|
||||
│ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE │
|
||||
│ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. │
|
||||
│ │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/intrin/aarch64/asmdefs.h"
|
||||
|
||||
#define __strchrnul_aarch64 strchrnul
|
||||
|
||||
.ident "\n\
|
||||
Optimized Routines (MIT License)\n\
|
||||
Copyright 2022 ARM Limited\n"
|
||||
.include "libc/disclaimer.inc"
|
||||
|
||||
/* Assumptions:
|
||||
*
|
||||
* ARMv8-a, AArch64
|
||||
* Neon Available.
|
||||
*/
|
||||
|
||||
/* Arguments and results. */
|
||||
#define srcin x0
|
||||
#define chrin w1
|
||||
|
||||
#define result x0
|
||||
|
||||
#define src x2
|
||||
#define tmp1 x3
|
||||
#define wtmp2 w4
|
||||
#define tmp3 x5
|
||||
|
||||
#define vrepchr v0
|
||||
#define vdata1 v1
|
||||
#define vdata2 v2
|
||||
#define vhas_nul1 v3
|
||||
#define vhas_nul2 v4
|
||||
#define vhas_chr1 v5
|
||||
#define vhas_chr2 v6
|
||||
#define vrepmask v7
|
||||
#define vend1 v16
|
||||
|
||||
/* Core algorithm.
|
||||
|
||||
For each 32-byte hunk we calculate a 64-bit syndrome value, with
|
||||
two bits per byte (LSB is always in bits 0 and 1, for both big
|
||||
and little-endian systems). For each tuple, bit 0 is set iff
|
||||
the relevant byte matched the requested character or nul. Since the
|
||||
bits in the syndrome reflect exactly the order in which things occur
|
||||
in the original string a count_trailing_zeros() operation will
|
||||
identify exactly which byte is causing the termination. */
|
||||
|
||||
/* Locals and temporaries. */
|
||||
|
||||
/* strchrnul: scan the NUL-terminated string at srcin in aligned 32-byte
   hunks for the byte held in chrin.  On return x0 holds the address of the
   first byte that either matches chrin or is NUL; unlike strchr there is no
   zero return.  */
ENTRY (__strchrnul_aarch64)
|
||||
PTR_ARG (0)
|
||||
/* Magic constant 0x40100401 to allow us to identify which lane
|
||||
matches the termination condition. */
|
||||
mov wtmp2, #0x0401
|
||||
movk wtmp2, #0x4010, lsl #16
|
||||
dup vrepchr.16b, chrin /* broadcast the search byte to all 16 lanes */
|
||||
bic src, srcin, #31 /* Work with aligned 32-byte hunks. */
|
||||
dup vrepmask.4s, wtmp2
|
||||
ands tmp1, srcin, #31 /* tmp1 = offset of srcin within its hunk */
|
||||
b.eq L(loop) /* already 32-byte aligned */
|
||||
|
||||
/* Input string is not 32-byte aligned. Rather than forcing
|
||||
the padding bytes to a safe value, we calculate the syndrome
|
||||
for all the bytes, but then mask off those bits of the
|
||||
syndrome that are related to the padding. */
|
||||
ld1 {vdata1.16b, vdata2.16b}, [src], #32
|
||||
neg tmp1, tmp1
|
||||
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
|
||||
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
|
||||
cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b /* lane set when the byte matches or is NUL */
|
||||
cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b
|
||||
and vhas_chr1.16b, vhas_nul1.16b, vrepmask.16b
|
||||
and vhas_chr2.16b, vhas_nul2.16b, vrepmask.16b
|
||||
lsl tmp1, tmp1, #1 /* two syndrome bits per byte */
|
||||
addp vend1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
|
||||
mov tmp3, #~0
|
||||
addp vend1.16b, vend1.16b, vend1.16b // 128->64
|
||||
lsr tmp1, tmp3, tmp1 /* tmp1 = mask covering the bits of bytes before srcin */
|
||||
|
||||
mov tmp3, vend1.d[0]
|
||||
bic tmp1, tmp3, tmp1 // Mask padding bits.
|
||||
cbnz tmp1, L(tail)
|
||||
|
||||
.p2align 4
|
||||
L(loop):
|
||||
ld1 {vdata1.16b, vdata2.16b}, [src], #32
|
||||
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
|
||||
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
|
||||
cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b /* lane set when the byte matches or is NUL */
|
||||
cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b
|
||||
orr vend1.16b, vhas_nul1.16b, vhas_nul2.16b
|
||||
umaxp vend1.16b, vend1.16b, vend1.16b
|
||||
mov tmp1, vend1.d[0]
|
||||
cbz tmp1, L(loop) /* neither a match nor a NUL in this hunk */
|
||||
|
||||
/* Termination condition found. Now need to establish exactly why
|
||||
we terminated. */
|
||||
and vhas_chr1.16b, vhas_nul1.16b, vrepmask.16b
|
||||
and vhas_chr2.16b, vhas_nul2.16b, vrepmask.16b
|
||||
addp vend1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
|
||||
addp vend1.16b, vend1.16b, vend1.16b // 128->64
|
||||
|
||||
mov tmp1, vend1.d[0]
|
||||
L(tail):
|
||||
/* Count the trailing zeros, by bit reversing... */
|
||||
rbit tmp1, tmp1
|
||||
/* Re-bias source. */
|
||||
sub src, src, #32
|
||||
clz tmp1, tmp1 /* ... and counting the leading zeros. */
|
||||
/* tmp1 is twice the offset into the fragment. */
|
||||
add result, src, tmp1, lsr #1
|
||||
ret
|
||||
|
||||
END (__strchrnul_aarch64)
|
214
libc/intrin/aarch64/strcmp.S
Normal file
214
libc/intrin/aarch64/strcmp.S
Normal file
|
@ -0,0 +1,214 @@
|
|||
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
|
||||
│vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi│
|
||||
╚──────────────────────────────────────────────────────────────────────────────╝
|
||||
│ │
|
||||
│ Optimized Routines │
|
||||
│ Copyright (c) 1999-2022, Arm Limited. │
|
||||
│ │
|
||||
│ Permission is hereby granted, free of charge, to any person obtaining │
|
||||
│ a copy of this software and associated documentation files (the │
|
||||
│ "Software"), to deal in the Software without restriction, including │
|
||||
│ without limitation the rights to use, copy, modify, merge, publish, │
|
||||
│ distribute, sublicense, and/or sell copies of the Software, and to │
|
||||
│ permit persons to whom the Software is furnished to do so, subject to │
|
||||
│ the following conditions: │
|
||||
│ │
|
||||
│ The above copyright notice and this permission notice shall be │
|
||||
│ included in all copies or substantial portions of the Software. │
|
||||
│ │
|
||||
│ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, │
|
||||
│ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF │
|
||||
│ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. │
|
||||
│ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY │
|
||||
│ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, │
|
||||
│ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE │
|
||||
│ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. │
|
||||
│ │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/intrin/aarch64/asmdefs.h"
|
||||
|
||||
#define __strcmp_aarch64 strcmp
|
||||
|
||||
.ident "\n\
|
||||
Optimized Routines (MIT License)\n\
|
||||
Copyright 2022 ARM Limited\n"
|
||||
.include "libc/disclaimer.inc"
|
||||
|
||||
/* Assumptions:
|
||||
*
|
||||
* ARMv8-a, AArch64.
|
||||
* MTE compatible.
|
||||
*/
|
||||
|
||||
#define REP8_01 0x0101010101010101
|
||||
#define REP8_7f 0x7f7f7f7f7f7f7f7f
|
||||
|
||||
#define src1 x0
|
||||
#define src2 x1
|
||||
#define result x0
|
||||
|
||||
#define data1 x2
|
||||
#define data1w w2
|
||||
#define data2 x3
|
||||
#define data2w w3
|
||||
#define has_nul x4
|
||||
#define diff x5
|
||||
#define off1 x5
|
||||
#define syndrome x6
|
||||
#define tmp x6
|
||||
#define data3 x7
|
||||
#define zeroones x8
|
||||
#define shift x9
|
||||
#define off2 x10
|
||||
|
||||
/* On big-endian early bytes are at MSB and on little-endian LSB.
|
||||
LS_FW means shifting towards early bytes. */
|
||||
#ifdef __AARCH64EB__
|
||||
# define LS_FW lsl
|
||||
#else
|
||||
# define LS_FW lsr
|
||||
#endif
|
||||
|
||||
/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
|
||||
(=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
|
||||
can be done in parallel across the entire word.
|
||||
Since carry propagation makes 0x1 bytes before a NUL byte appear
|
||||
NUL too in big-endian, byte-reverse the data before the NUL check. */
|
||||
|
||||
|
||||
/* strcmp: compare the NUL-terminated strings at src1 and src2.  On return x0
   holds the difference between the first pair of differing bytes (each
   zero-extended), or 0 when both strings end together.  Three paths are
   used: both pointers 8-byte aligned, both misaligned by the same amount
   (mutual_align), and differently aligned (misaligned8).  */
ENTRY (__strcmp_aarch64)
|
||||
PTR_ARG (0)
|
||||
PTR_ARG (1)
|
||||
sub off2, src2, src1 /* off2 = src2 - src1, so [src1, off2] addresses src2 */
|
||||
mov zeroones, REP8_01
|
||||
and tmp, src1, 7 /* tmp = misalignment of src1 */
|
||||
tst off2, 7
|
||||
b.ne L(misaligned8) /* the two pointers differ in alignment modulo 8 */
|
||||
cbnz tmp, L(mutual_align) /* same alignment, but not on an 8-byte boundary */
|
||||
|
||||
.p2align 4
|
||||
|
||||
L(loop_aligned):
|
||||
ldr data2, [src1, off2]
|
||||
ldr data1, [src1], 8
|
||||
L(start_realigned):
|
||||
#ifdef __AARCH64EB__
|
||||
rev tmp, data1
|
||||
sub has_nul, tmp, zeroones
|
||||
orr tmp, tmp, REP8_7f
|
||||
#else
|
||||
sub has_nul, data1, zeroones
|
||||
orr tmp, data1, REP8_7f
|
||||
#endif
|
||||
bics has_nul, has_nul, tmp /* Non-zero if NUL terminator. */
|
||||
ccmp data1, data2, 0, eq /* no NUL: compare the words; otherwise force NE */
|
||||
b.eq L(loop_aligned)
|
||||
#ifdef __AARCH64EB__
|
||||
rev has_nul, has_nul
|
||||
#endif
|
||||
eor diff, data1, data2
|
||||
orr syndrome, diff, has_nul /* bits that differ, plus the NUL markers */
|
||||
L(end):
|
||||
#ifndef __AARCH64EB__
|
||||
rev syndrome, syndrome
|
||||
rev data1, data1
|
||||
rev data2, data2
|
||||
#endif
|
||||
clz shift, syndrome
|
||||
/* The most-significant-non-zero bit of the syndrome marks either the
|
||||
first bit that is different, or the top bit of the first zero byte.
|
||||
Shifting left now will bring the critical information into the
|
||||
top bits. */
|
||||
lsl data1, data1, shift
|
||||
lsl data2, data2, shift
|
||||
/* But we need to zero-extend (char is unsigned) the value and then
|
||||
perform a signed 32-bit subtraction. */
|
||||
lsr data1, data1, 56
|
||||
sub result, data1, data2, lsr 56
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
|
||||
L(mutual_align):
|
||||
/* Sources are mutually aligned, but are not currently at an
|
||||
alignment boundary. Round down the addresses and then mask off
|
||||
the bytes that precede the start point. */
|
||||
bic src1, src1, 7
|
||||
ldr data2, [src1, off2]
|
||||
ldr data1, [src1], 8
|
||||
neg shift, src2, lsl 3 /* Bits to alignment -64. */
|
||||
mov tmp, -1
|
||||
LS_FW tmp, tmp, shift
|
||||
orr data1, data1, tmp /* force the bytes before the start to 0xff in both words */
|
||||
orr data2, data2, tmp
|
||||
b L(start_realigned)
|
||||
|
||||
L(misaligned8):
|
||||
/* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always
|
||||
checking to make sure that we don't access beyond the end of SRC2. */
|
||||
cbz tmp, L(src1_aligned)
|
||||
L(do_misaligned):
|
||||
ldrb data1w, [src1], 1
|
||||
ldrb data2w, [src2], 1
|
||||
cmp data1w, 0
|
||||
ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */
|
||||
b.ne L(done) /* src1 byte is NUL, or the two bytes differ */
|
||||
tst src1, 7
|
||||
b.ne L(do_misaligned) /* continue bytewise until src1 is 8-byte aligned */
|
||||
|
||||
L(src1_aligned):
|
||||
neg shift, src2, lsl 3
|
||||
bic src2, src2, 7
|
||||
ldr data3, [src2], 8 /* aligned word containing the current src2 position */
|
||||
#ifdef __AARCH64EB__
|
||||
rev data3, data3
|
||||
#endif
|
||||
lsr tmp, zeroones, shift
|
||||
orr data3, data3, tmp /* set a bit in each byte before src2 so it cannot test as NUL */
|
||||
sub has_nul, data3, zeroones
|
||||
orr tmp, data3, REP8_7f
|
||||
bics has_nul, has_nul, tmp
|
||||
b.ne L(tail) /* src2 has a NUL before its next 8-byte boundary */
|
||||
|
||||
sub off1, src2, src1 /* off1 shares x5 with diff; it is only live inside the loop */
|
||||
|
||||
.p2align 4
|
||||
|
||||
L(loop_unaligned):
|
||||
ldr data3, [src1, off1] /* next aligned word of src2, used for the NUL check */
|
||||
ldr data2, [src1, off2] /* unaligned src2 word matching data1 */
|
||||
#ifdef __AARCH64EB__
|
||||
rev data3, data3
|
||||
#endif
|
||||
sub has_nul, data3, zeroones
|
||||
orr tmp, data3, REP8_7f
|
||||
ldr data1, [src1], 8
|
||||
bics has_nul, has_nul, tmp
|
||||
ccmp data1, data2, 0, eq /* no NUL: compare the words; otherwise force NE */
|
||||
b.eq L(loop_unaligned)
|
||||
|
||||
lsl tmp, has_nul, shift
|
||||
#ifdef __AARCH64EB__
|
||||
rev tmp, tmp
|
||||
#endif
|
||||
eor diff, data1, data2
|
||||
orr syndrome, diff, tmp
|
||||
cbnz syndrome, L(end)
|
||||
L(tail):
|
||||
ldr data1, [src1]
|
||||
neg shift, shift
|
||||
lsr data2, data3, shift
|
||||
lsr has_nul, has_nul, shift
|
||||
#ifdef __AARCH64EB__
|
||||
rev data2, data2
|
||||
rev has_nul, has_nul
|
||||
#endif
|
||||
eor diff, data1, data2
|
||||
orr syndrome, diff, has_nul
|
||||
b L(end)
|
||||
|
||||
L(done):
|
||||
sub result, data1, data2 /* difference of the two bytes loaded by ldrb */
|
||||
ret
|
||||
|
||||
END (__strcmp_aarch64)
|
170
libc/intrin/aarch64/strcpy.S
Normal file
170
libc/intrin/aarch64/strcpy.S
Normal file
|
@ -0,0 +1,170 @@
|
|||
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
|
||||
│vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi│
|
||||
╚──────────────────────────────────────────────────────────────────────────────╝
|
||||
│ │
|
||||
│ Optimized Routines │
|
||||
│ Copyright (c) 1999-2022, Arm Limited. │
|
||||
│ │
|
||||
│ Permission is hereby granted, free of charge, to any person obtaining │
|
||||
│ a copy of this software and associated documentation files (the │
|
||||
│ "Software"), to deal in the Software without restriction, including │
|
||||
│ without limitation the rights to use, copy, modify, merge, publish, │
|
||||
│ distribute, sublicense, and/or sell copies of the Software, and to │
|
||||
│ permit persons to whom the Software is furnished to do so, subject to │
|
||||
│ the following conditions: │
|
||||
│ │
|
||||
│ The above copyright notice and this permission notice shall be │
|
||||
│ included in all copies or substantial portions of the Software. │
|
||||
│ │
|
||||
│ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, │
|
||||
│ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF │
|
||||
│ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. │
|
||||
│ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY │
|
||||
│ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, │
|
||||
│ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE │
|
||||
│ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. │
|
||||
│ │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/intrin/aarch64/asmdefs.h"
|
||||
|
||||
#define __strcpy_aarch64 strcpy
|
||||
|
||||
.ident "\n\
|
||||
Optimized Routines (MIT License)\n\
|
||||
Copyright 2022 ARM Limited\n"
|
||||
.include "libc/disclaimer.inc"
|
||||
|
||||
/* Assumptions:
|
||||
*
|
||||
* ARMv8-a, AArch64, Advanced SIMD.
|
||||
* MTE compatible.
|
||||
*/
|
||||
|
||||
#define dstin x0
|
||||
#define srcin x1
|
||||
#define result x0
|
||||
|
||||
#define src x2
|
||||
#define dst x3
|
||||
#define len x4
|
||||
#define synd x4
|
||||
#define tmp x5
|
||||
#define shift x5
|
||||
#define data1 x6
|
||||
#define dataw1 w6
|
||||
#define data2 x7
|
||||
#define dataw2 w7
|
||||
|
||||
#define dataq q0
|
||||
#define vdata v0
|
||||
#define vhas_nul v1
|
||||
#define vend v2
|
||||
#define dend d2
|
||||
#define dataq2 q1
|
||||
|
||||
/*
|
||||
Core algorithm:
|
||||
For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
|
||||
per byte. We take 4 bits of every comparison byte with shift right and narrow
|
||||
by 4 instruction. Since the bits in the nibble mask reflect the order in
|
||||
which things occur in the original string, counting leading zeros identifies
|
||||
exactly which byte matched. */
|
||||
|
||||
/* char *strcpy (dstin = x0, srcin = x1)
   Copies the NUL-terminated string at srcin, terminator included, to dstin.
   x0 is never written in this routine, so it still holds dstin on return.  */
ENTRY (__strcpy_aarch64)
|
||||
PTR_ARG (0)
|
||||
PTR_ARG (1)
|
||||
/* Test the 16-byte aligned chunk that contains srcin for a NUL byte.  */
bic src, srcin, 15
|
||||
ld1 {vdata.16b}, [src]
|
||||
cmeq vhas_nul.16b, vdata.16b, 0
|
||||
/* shift = 4 * srcin.  The register-controlled lsr below only uses the low
   six bits, i.e. 4 * (srcin & 15), which drops the nibbles belonging to
   bytes that precede the start of the string.  */
lsl shift, srcin, 2
|
||||
shrn vend.8b, vhas_nul.8h, 4
|
||||
fmov synd, dend
|
||||
lsr synd, synd, shift
|
||||
cbnz synd, L(tail)
|
||||
|
||||
/* No NUL in the first chunk: step src to the next aligned 16 bytes and
   test those.  */
ldr dataq, [src, 16]!
|
||||
cmeq vhas_nul.16b, vdata.16b, 0
|
||||
shrn vend.8b, vhas_nul.8h, 4
|
||||
fmov synd, dend
|
||||
cbz synd, L(start_loop)
|
||||
|
||||
#ifndef __AARCH64EB__
|
||||
rbit synd, synd
|
||||
#endif
|
||||
/* len = (src - srcin) + byte index of the NUL inside the second chunk
   (four syndrome bits per byte, hence the lsr 2).  */
sub tmp, src, srcin
|
||||
clz len, synd
|
||||
add len, tmp, len, lsr 2
|
||||
/* Bit 4 clear means len < 16 here (len cannot exceed 31).  */
tbz len, 4, L(less16)
|
||||
/* 16 <= len <= 31: copy len + 1 bytes as two possibly overlapping 16-byte
   moves, [0, 16) and [len - 15, len + 1).  */
sub tmp, len, 15
|
||||
ldr dataq, [srcin]
|
||||
ldr dataq2, [srcin, tmp]
|
||||
str dataq, [dstin]
|
||||
str dataq2, [dstin, tmp]
|
||||
ret
|
||||
|
||||
/* NUL found in the first chunk: synd was already shifted so that the bit
   position maps to the offset from srcin.  */
L(tail):
|
||||
rbit synd, synd
|
||||
clz len, synd
|
||||
lsr len, len, 2
|
||||
L(less16):
|
||||
/* len is below 16 here.  Bit 3 set means 8 <= len <= 15: two overlapping
   8-byte moves, [0, 8) and [len - 7, len + 1).  */
tbz len, 3, L(less8)
|
||||
sub tmp, len, 7
|
||||
ldr data1, [srcin]
|
||||
ldr data2, [srcin, tmp]
|
||||
str data1, [dstin]
|
||||
str data2, [dstin, tmp]
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
L(less8):
|
||||
/* 3 <= len <= 7: two overlapping 4-byte moves, [0, 4) and
   [len - 3, len + 1).  Smaller lengths branch to L(less4).  */
subs tmp, len, 3
|
||||
b.lo L(less4)
|
||||
ldr dataw1, [srcin]
|
||||
ldr dataw2, [srcin, tmp]
|
||||
str dataw1, [dstin]
|
||||
str dataw2, [dstin, tmp]
|
||||
ret
|
||||
|
||||
L(less4):
|
||||
/* len is 0, 1 or 2: copy two bytes when len is non-zero, then store the
   terminating NUL at dstin[len].  */
cbz len, L(zerobyte)
|
||||
ldrh dataw1, [srcin]
|
||||
strh dataw1, [dstin]
|
||||
L(zerobyte):
|
||||
strb wzr, [dstin, len]
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
L(start_loop):
|
||||
/* tmp = srcin - dstin stays live until L(loopend).  dst = src - tmp is the
   destination address matching the aligned source pointer.  The first 16
   bytes are copied straight from srcin.  */
sub tmp, srcin, dstin
|
||||
ldr dataq2, [srcin]
|
||||
sub dst, src, tmp
|
||||
str dataq2, [dstin]
|
||||
/* Each iteration stores the previously loaded (NUL-free) chunk and tests
   the next one, handling two 16-byte chunks per trip.  */
L(loop):
|
||||
str dataq, [dst], 32
|
||||
ldr dataq, [src, 16]
|
||||
cmeq vhas_nul.16b, vdata.16b, 0
|
||||
umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
|
||||
fmov synd, dend
|
||||
cbnz synd, L(loopend)
|
||||
str dataq, [dst, -16]
|
||||
ldr dataq, [src, 32]!
|
||||
cmeq vhas_nul.16b, vdata.16b, 0
|
||||
umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
|
||||
fmov synd, dend
|
||||
cbz synd, L(loop)
|
||||
/* Exit from the second half: compensate so that L(loopend) sees the same
   dst bias as the early exit above.  */
add dst, dst, 16
|
||||
L(loopend):
|
||||
shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
|
||||
fmov synd, dend
|
||||
/* After adding len, dst points 15 bytes before the destination of the NUL,
   so the final 16-byte move ends exactly on the terminator.  */
sub dst, dst, 31
|
||||
#ifndef __AARCH64EB__
|
||||
rbit synd, synd
|
||||
#endif
|
||||
clz len, synd
|
||||
lsr len, len, 2
|
||||
add dst, dst, len
|
||||
/* dst + tmp is the source address that corresponds to dst.  */
ldr dataq, [dst, tmp]
|
||||
str dataq, [dst]
|
||||
ret
|
||||
|
||||
END (__strcpy_aarch64)
|
220
libc/intrin/aarch64/strlen.S
Normal file
220
libc/intrin/aarch64/strlen.S
Normal file
|
@ -0,0 +1,220 @@
|
|||
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
|
||||
│vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi│
|
||||
╚──────────────────────────────────────────────────────────────────────────────╝
|
||||
│ │
|
||||
│ Optimized Routines │
|
||||
│ Copyright (c) 1999-2022, Arm Limited. │
|
||||
│ │
|
||||
│ Permission is hereby granted, free of charge, to any person obtaining │
|
||||
│ a copy of this software and associated documentation files (the │
|
||||
│ "Software"), to deal in the Software without restriction, including │
|
||||
│ without limitation the rights to use, copy, modify, merge, publish, │
|
||||
│ distribute, sublicense, and/or sell copies of the Software, and to │
|
||||
│ permit persons to whom the Software is furnished to do so, subject to │
|
||||
│ the following conditions: │
|
||||
│ │
|
||||
│ The above copyright notice and this permission notice shall be │
|
||||
│ included in all copies or substantial portions of the Software. │
|
||||
│ │
|
||||
│ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, │
|
||||
│ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF │
|
||||
│ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. │
|
||||
│ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY │
|
||||
│ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, │
|
||||
│ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE │
|
||||
│ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. │
|
||||
│ │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/intrin/aarch64/asmdefs.h"
|
||||
|
||||
#define __strlen_aarch64 strlen
|
||||
|
||||
.ident "\n\
|
||||
Optimized Routines (MIT License)\n\
|
||||
Copyright 2022 ARM Limited\n"
|
||||
.include "libc/disclaimer.inc"
|
||||
|
||||
/* Assumptions:
|
||||
*
|
||||
* ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
|
||||
* Not MTE compatible.
|
||||
*/
|
||||
|
||||
#define srcin x0
|
||||
#define len x0
|
||||
|
||||
#define src x1
|
||||
#define data1 x2
|
||||
#define data2 x3
|
||||
#define has_nul1 x4
|
||||
#define has_nul2 x5
|
||||
#define tmp1 x4
|
||||
#define tmp2 x5
|
||||
#define tmp3 x6
|
||||
#define tmp4 x7
|
||||
#define zeroones x8
|
||||
|
||||
#define maskv v0
|
||||
#define maskd d0
|
||||
#define dataq1 q1
|
||||
#define dataq2 q2
|
||||
#define datav1 v1
|
||||
#define datav2 v2
|
||||
#define tmp x2
|
||||
#define tmpw w2
|
||||
#define synd x3
|
||||
#define syndw w3
|
||||
#define shift x4
|
||||
|
||||
/* For the first 32 bytes, NUL detection works on the principle that
|
||||
(X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero if a
|
||||
byte is zero, and can be done in parallel across the entire word. */
|
||||
|
||||
#define REP8_01 0x0101010101010101
|
||||
#define REP8_7f 0x7f7f7f7f7f7f7f7f
|
||||
|
||||
/* To test the page crossing code path more thoroughly, compile with
|
||||
-DTEST_PAGE_CROSS - this will force all calls through the slower
|
||||
entry path. This option is not intended for production use. */
|
||||
|
||||
#ifdef TEST_PAGE_CROSS
|
||||
# define MIN_PAGE_SIZE 32
|
||||
#else
|
||||
# define MIN_PAGE_SIZE 4096
|
||||
#endif
|
||||
|
||||
/* Core algorithm:
|
||||
|
||||
Since strings are short on average, we check the first 32 bytes of the
|
||||
string for a NUL character without aligning the string. In order to use
|
||||
unaligned loads safely we must do a page cross check first.
|
||||
|
||||
If there is a NUL byte we calculate the length from the 2 8-byte words
|
||||
using conditional select to reduce branch mispredictions (it is unlikely
|
||||
strlen will be repeatedly called on strings with the same length).
|
||||
|
||||
If the string is longer than 32 bytes, align src so we don't need further
|
||||
page cross checks, and process 32 bytes per iteration using a fast SIMD
|
||||
loop.
|
||||
|
||||
If the page cross check fails, we read 32 bytes from an aligned address,
|
||||
and ignore any characters before the string. If it contains a NUL
|
||||
character, return the length, if not, continue in the main loop. */
|
||||
|
||||
/* size_t strlen (srcin = x0)
   Returns in x0 the number of bytes that precede the first NUL byte at
   srcin.  len and srcin share x0.  */
ENTRY (__strlen_aarch64)
|
||||
PTR_ARG (0)
|
||||
/* Take the slow path when srcin lies in the last 31 bytes of a
   MIN_PAGE_SIZE-aligned page, i.e. when an unaligned 32-byte read starting
   at srcin would run into the following page.  */
and tmp1, srcin, MIN_PAGE_SIZE - 1
|
||||
cmp tmp1, MIN_PAGE_SIZE - 32
|
||||
b.hi L(page_cross)
|
||||
|
||||
/* Look for a NUL byte in the first 16 bytes. */
|
||||
ldp data1, data2, [srcin]
|
||||
mov zeroones, REP8_01
|
||||
|
||||
#ifdef __AARCH64EB__
|
||||
/* For big-endian, carry propagation (if the final byte in the
|
||||
string is 0x01) means we cannot use has_nul1/2 directly.
|
||||
Since we expect strings to be small and early-exit,
|
||||
byte-swap the data now so has_nul1/2 will be correct. */
|
||||
rev data1, data1
|
||||
rev data2, data2
|
||||
#endif
|
||||
sub tmp1, data1, zeroones
|
||||
orr tmp2, data1, REP8_7f
|
||||
sub tmp3, data2, zeroones
|
||||
orr tmp4, data2, REP8_7f
|
||||
/* bics sets Z iff has_nul1 == 0.  ccmp compares has_nul2 with 0 only in
   that case and otherwise forces NZCV = 0, so b.eq is taken iff neither
   word contains a NUL.  */
bics has_nul1, tmp1, tmp2
|
||||
bic has_nul2, tmp3, tmp4
|
||||
ccmp has_nul2, 0, 0, eq
|
||||
b.eq L(bytes16_31)
|
||||
|
||||
/* Find the exact offset of the first NUL byte in the first 16 bytes
|
||||
from the string start. Enter with C = has_nul1 == 0. */
|
||||
csel has_nul1, has_nul1, has_nul2, cc
|
||||
/* Base offset: 0 when the NUL is in the first word, 8 otherwise.  */
mov len, 8
|
||||
rev has_nul1, has_nul1
|
||||
csel len, xzr, len, cc
|
||||
clz tmp1, has_nul1
|
||||
add len, len, tmp1, lsr 3
|
||||
ret
|
||||
|
||||
/* Look for a NUL byte at offset 16..31 in the string. */
|
||||
L(bytes16_31):
|
||||
ldp data1, data2, [srcin, 16]
|
||||
#ifdef __AARCH64EB__
|
||||
rev data1, data1
|
||||
rev data2, data2
|
||||
#endif
|
||||
sub tmp1, data1, zeroones
|
||||
orr tmp2, data1, REP8_7f
|
||||
sub tmp3, data2, zeroones
|
||||
orr tmp4, data2, REP8_7f
|
||||
bics has_nul1, tmp1, tmp2
|
||||
bic has_nul2, tmp3, tmp4
|
||||
ccmp has_nul2, 0, 0, eq
|
||||
b.eq L(loop_entry)
|
||||
|
||||
/* Find the exact offset of the first NUL byte at offset 16..31 from
|
||||
the string start. Enter with C = has_nul1 == 0. */
|
||||
csel has_nul1, has_nul1, has_nul2, cc
|
||||
/* Base offset: 16 when the NUL is in the first word of this pair, 24
   otherwise.  */
mov len, 24
|
||||
rev has_nul1, has_nul1
|
||||
mov tmp3, 16
|
||||
clz tmp1, has_nul1
|
||||
csel len, tmp3, len, cc
|
||||
add len, len, tmp1, lsr 3
|
||||
ret
|
||||
|
||||
nop
|
||||
L(loop_entry):
|
||||
/* Align src down to 32 bytes.  The loop pre-increments by 32, so its first
   load starts at the next 32-byte boundary, which is at most 32 bytes past
   srcin; any overlap only re-reads bytes already checked above.  */
bic src, srcin, 31
|
||||
|
||||
.p2align 5
|
||||
L(loop):
|
||||
ldp dataq1, dataq2, [src, 32]!
|
||||
/* Two pairwise-minimum steps fold the 32 loaded bytes into the low 8 bytes
   of maskv; a zero byte there means some input byte was zero.  */
uminp maskv.16b, datav1.16b, datav2.16b
|
||||
uminp maskv.16b, maskv.16b, maskv.16b
|
||||
cmeq maskv.8b, maskv.8b, 0
|
||||
fmov synd, maskd
|
||||
cbz synd, L(loop)
|
||||
|
||||
/* Low 32 bits of synd are non-zero if a NUL was found in datav1. */
|
||||
cmeq maskv.16b, datav1.16b, 0
|
||||
sub len, src, srcin
|
||||
/* NUL in datav1: keep its mask and len = src - srcin.  Otherwise rebuild
   the mask from datav2 and account for the 16 bytes skipped.  */
cbnz syndw, 1f
|
||||
cmeq maskv.16b, datav2.16b, 0
|
||||
add len, len, 16
|
||||
1:
|
||||
/* Generate a bitmask and compute correct byte offset. */
|
||||
shrn maskv.8b, maskv.8h, 4
|
||||
fmov synd, maskd
|
||||
#ifndef __AARCH64EB__
|
||||
rbit synd, synd
|
||||
#endif
|
||||
clz tmp, synd
|
||||
add len, len, tmp, lsr 2
|
||||
ret
|
||||
|
||||
L(page_cross):
|
||||
/* Load the 32-byte aligned block containing srcin and build a 64-bit
   syndrome with two bits per byte: every cmeq result byte is masked to its
   own bit pair (0x03, 0x0c, 0x30, 0xc0, repeating) and two addp steps fold
   the 32 bytes down to 8.  */
bic src, srcin, 31
|
||||
mov tmpw, 0x0c03
|
||||
movk tmpw, 0xc030, lsl 16
|
||||
ld1 {datav1.16b, datav2.16b}, [src]
|
||||
dup maskv.4s, tmpw
|
||||
cmeq datav1.16b, datav1.16b, 0
|
||||
cmeq datav2.16b, datav2.16b, 0
|
||||
and datav1.16b, datav1.16b, maskv.16b
|
||||
and datav2.16b, datav2.16b, maskv.16b
|
||||
addp maskv.16b, datav1.16b, datav2.16b
|
||||
addp maskv.16b, maskv.16b, maskv.16b
|
||||
fmov synd, maskd
|
||||
/* Drop the bit pairs of bytes that precede srcin.  The register shift only
   uses the low six bits of shift, i.e. 2 * (srcin & 31).  */
lsl shift, srcin, 1
|
||||
lsr synd, synd, shift
|
||||
/* No NUL in the remainder of this block: continue in the main loop, which
   pre-increments the already aligned src.  */
cbz synd, L(loop)
|
||||
|
||||
rbit synd, synd
|
||||
clz len, synd
|
||||
lsr len, len, 1
|
||||
ret
|
||||
|
||||
END (__strlen_aarch64)
|
334
libc/intrin/aarch64/strncmp.S
Normal file
334
libc/intrin/aarch64/strncmp.S
Normal file
|
@ -0,0 +1,334 @@
|
|||
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
|
||||
│vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi│
|
||||
╚──────────────────────────────────────────────────────────────────────────────╝
|
||||
│ │
|
||||
│ Optimized Routines │
|
||||
│ Copyright (c) 1999-2022, Arm Limited. │
|
||||
│ │
|
||||
│ Permission is hereby granted, free of charge, to any person obtaining │
|
||||
│ a copy of this software and associated documentation files (the │
|
||||
│ "Software"), to deal in the Software without restriction, including │
|
||||
│ without limitation the rights to use, copy, modify, merge, publish, │
|
||||
│ distribute, sublicense, and/or sell copies of the Software, and to │
|
||||
│ permit persons to whom the Software is furnished to do so, subject to │
|
||||
│ the following conditions: │
|
||||
│ │
|
||||
│ The above copyright notice and this permission notice shall be │
|
||||
│ included in all copies or substantial portions of the Software. │
|
||||
│ │
|
||||
│ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, │
|
||||
│ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF │
|
||||
│ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. │
|
||||
│ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY │
|
||||
│ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, │
|
||||
│ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE │
|
||||
│ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. │
|
||||
│ │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/intrin/aarch64/asmdefs.h"
|
||||
|
||||
#define __strncmp_aarch64 strncmp
|
||||
|
||||
.ident "\n\
|
||||
Optimized Routines (MIT License)\n\
|
||||
Copyright 2022 ARM Limited\n"
|
||||
.include "libc/disclaimer.inc"
|
||||
|
||||
/* Assumptions:
|
||||
*
|
||||
* ARMv8-a, AArch64.
|
||||
* MTE compatible.
|
||||
*/
|
||||
|
||||
#define REP8_01 0x0101010101010101
|
||||
#define REP8_7f 0x7f7f7f7f7f7f7f7f
|
||||
|
||||
/* Parameters and result. */
|
||||
#define src1 x0
|
||||
#define src2 x1
|
||||
#define limit x2
|
||||
#define result x0
|
||||
|
||||
/* Internal variables. */
|
||||
#define data1 x3
|
||||
#define data1w w3
|
||||
#define data2 x4
|
||||
#define data2w w4
|
||||
#define has_nul x5
|
||||
#define diff x6
|
||||
#define syndrome x7
|
||||
#define tmp1 x8
|
||||
#define tmp2 x9
|
||||
#define tmp3 x10
|
||||
#define zeroones x11
|
||||
#define pos x12
|
||||
#define mask x13
|
||||
#define endloop x14
|
||||
#define count mask
|
||||
#define offset pos
|
||||
#define neg_offset x15
|
||||
|
||||
/* Define endian dependent shift operations.
|
||||
On big-endian early bytes are at MSB and on little-endian LSB.
|
||||
LS_FW means shifting towards early bytes.
|
||||
LS_BK means shifting towards later bytes.
|
||||
*/
|
||||
#ifdef __AARCH64EB__
|
||||
#define LS_FW lsl
|
||||
#define LS_BK lsr
|
||||
#else
|
||||
#define LS_FW lsr
|
||||
#define LS_BK lsl
|
||||
#endif
|
||||
|
||||
/* int strncmp (src1 = x0, src2 = x1, limit = x2)
   Compares at most limit bytes of two NUL-terminated strings.  x0 is 0 when
   no difference is found within the limit; otherwise it is non-zero with the
   sign given by the first pair of differing bytes, compared as unsigned
   chars.  */
ENTRY (__strncmp_aarch64)
|
||||
PTR_ARG (0)
|
||||
PTR_ARG (1)
|
||||
SIZE_ARG (2)
|
||||
/* A zero limit compares equal.  */
cbz limit, L(ret0)
|
||||
/* If src1 and src2 differ in their low three bits they can never be 8-byte
   aligned at the same time: take the misaligned path.  count = src1 & 7 is
   the distance of src1 from the previous 8-byte boundary.  */
eor tmp1, src1, src2
|
||||
mov zeroones, #REP8_01
|
||||
tst tmp1, #7
|
||||
and count, src1, #7
|
||||
b.ne L(misaligned8)
|
||||
cbnz count, L(mutual_align)
|
||||
|
||||
/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
|
||||
(=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
|
||||
can be done in parallel across the entire word. */
|
||||
.p2align 4
|
||||
L(loop_aligned):
|
||||
ldr data1, [src1], #8
|
||||
ldr data2, [src2], #8
|
||||
L(start_realigned):
|
||||
subs limit, limit, #8
|
||||
sub tmp1, data1, zeroones
|
||||
orr tmp2, data1, #REP8_7f
|
||||
eor diff, data1, data2 /* Non-zero if differences found. */
|
||||
csinv endloop, diff, xzr, hi /* Last Dword or differences. */
|
||||
bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
|
||||
/* Keep looping only while has_nul == 0 and endloop == 0.  */
ccmp endloop, #0, #0, eq
|
||||
b.eq L(loop_aligned)
|
||||
/* End of main loop */
|
||||
|
||||
L(full_check):
|
||||
#ifndef __AARCH64EB__
|
||||
orr syndrome, diff, has_nul
|
||||
add limit, limit, 8 /* Rewind limit to before last subs. */
|
||||
L(syndrome_check):
|
||||
/* Limit was reached. Check if the NUL byte or the difference
|
||||
is before the limit. */
|
||||
rev syndrome, syndrome
|
||||
rev data1, data1
|
||||
clz pos, syndrome
|
||||
rev data2, data2
|
||||
lsl data1, data1, pos
|
||||
cmp limit, pos, lsr #3
|
||||
lsl data2, data2, pos
|
||||
/* But we need to zero-extend (char is unsigned) the value and then
|
||||
perform a signed 32-bit subtraction. */
|
||||
lsr data1, data1, #56
|
||||
sub result, data1, data2, lsr #56
|
||||
csel result, result, xzr, hi
|
||||
ret
|
||||
#else
|
||||
/* Not reached the limit, must have found the end or a diff. */
|
||||
tbz limit, #63, L(not_limit)
|
||||
add tmp1, limit, 8
|
||||
cbz limit, L(not_limit)
|
||||
|
||||
lsl limit, tmp1, #3 /* Bytes -> bits. */
|
||||
mov mask, #~0
|
||||
lsr mask, mask, limit
|
||||
bic data1, data1, mask
|
||||
bic data2, data2, mask
|
||||
|
||||
/* Make sure that the NUL byte is marked in the syndrome. */
|
||||
orr has_nul, has_nul, mask
|
||||
|
||||
L(not_limit):
|
||||
/* For big-endian we cannot use the trick with the syndrome value
|
||||
as carry-propagation can corrupt the upper bits if the trailing
|
||||
bytes in the string contain 0x01. */
|
||||
/* However, if there is no NUL byte in the dword, we can generate
|
||||
the result directly. We can't just subtract the bytes as the
|
||||
MSB might be significant. */
|
||||
cbnz has_nul, 1f
|
||||
cmp data1, data2
|
||||
cset result, ne
|
||||
cneg result, result, lo
|
||||
ret
|
||||
1:
|
||||
/* Re-compute the NUL-byte detection, using a byte-reversed value. */
|
||||
rev tmp3, data1
|
||||
sub tmp1, tmp3, zeroones
|
||||
orr tmp2, tmp3, #REP8_7f
|
||||
bic has_nul, tmp1, tmp2
|
||||
rev has_nul, has_nul
|
||||
orr syndrome, diff, has_nul
|
||||
clz pos, syndrome
|
||||
/* The most-significant-non-zero bit of the syndrome marks either the
|
||||
first bit that is different, or the top bit of the first zero byte.
|
||||
Shifting left now will bring the critical information into the
|
||||
top bits. */
|
||||
L(end_quick):
|
||||
lsl data1, data1, pos
|
||||
lsl data2, data2, pos
|
||||
/* But we need to zero-extend (char is unsigned) the value and then
|
||||
perform a signed 32-bit subtraction. */
|
||||
lsr data1, data1, #56
|
||||
sub result, data1, data2, lsr #56
|
||||
ret
|
||||
#endif
|
||||
|
||||
L(mutual_align):
|
||||
/* Sources are mutually aligned, but are not currently at an
|
||||
alignment boundary. Round down the addresses and then mask off
|
||||
the bytes that precede the start point.
|
||||
We also need to adjust the limit calculations, but without
|
||||
overflowing if the limit is near ULONG_MAX. */
|
||||
bic src1, src1, #7
|
||||
bic src2, src2, #7
|
||||
ldr data1, [src1], #8
|
||||
neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */
|
||||
ldr data2, [src2], #8
|
||||
mov tmp2, #~0
|
||||
LS_FW tmp2, tmp2, tmp3 /* Shift (count & 63). */
|
||||
/* Adjust the limit and ensure it doesn't overflow. */
|
||||
adds limit, limit, count
|
||||
csinv limit, limit, xzr, lo
|
||||
orr data1, data1, tmp2
|
||||
orr data2, data2, tmp2
|
||||
b L(start_realigned)
|
||||
|
||||
.p2align 4
|
||||
/* Don't bother with dwords for up to 16 bytes. */
|
||||
L(misaligned8):
|
||||
cmp limit, #16
|
||||
b.hs L(try_misaligned_words)
|
||||
|
||||
L(byte_loop):
|
||||
/* Perhaps we can do better than this. */
|
||||
ldrb data1w, [src1], #1
|
||||
ldrb data2w, [src2], #1
|
||||
/* Continue only while bytes remain (hi), data1w is not NUL (cs after the
   compare with 1) and data1w == data2w.  */
subs limit, limit, #1
|
||||
ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */
|
||||
ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
|
||||
b.eq L(byte_loop)
|
||||
L(done):
|
||||
sub result, data1, data2
|
||||
ret
|
||||
/* Align the SRC1 to a dword by doing a bytewise compare and then do
|
||||
the dword loop. */
|
||||
L(try_misaligned_words):
|
||||
cbz count, L(src1_aligned)
|
||||
|
||||
/* count = (-count) & 7: the number of bytes needed to bring src1 up to
   8-byte alignment.  They are taken off limit up front (limit >= 16 on
   this path).  */
neg count, count
|
||||
and count, count, #7
|
||||
sub limit, limit, count
|
||||
|
||||
L(page_end_loop):
|
||||
ldrb data1w, [src1], #1
|
||||
ldrb data2w, [src2], #1
|
||||
/* Leave through L(done) on a NUL in src1 or on a mismatch.  */
cmp data1w, #1
|
||||
ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
|
||||
b.ne L(done)
|
||||
subs count, count, #1
|
||||
b.hi L(page_end_loop)
|
||||
|
||||
/* The following diagram explains the comparison of misaligned strings.
|
||||
The bytes are shown in natural order. For little-endian, it is
|
||||
reversed in the registers. The "x" bytes are before the string.
|
||||
The "|" separates data that is loaded at one time.
|
||||
src1 | a a a a a a a a | b b b c c c c c | . . .
|
||||
src2 | x x x x x a a a a a a a a b b b | c c c c c . . .
|
||||
|
||||
After shifting in each step, the data looks like this:
|
||||
STEP_A STEP_B STEP_C
|
||||
data1 a a a a a a a a b b b c c c c c b b b c c c c c
|
||||
data2 a a a a a a a a b b b 0 0 0 0 0 0 0 0 c c c c c
|
||||
|
||||
The bytes with "0" are eliminated from the syndrome via mask.
|
||||
|
||||
Align SRC2 down to 16 bytes. This way we can read 16 bytes at a
|
||||
time from SRC2. The comparison happens in 3 steps. After each step
|
||||
the loop can exit, or read from SRC1 or SRC2. */
|
||||
L(src1_aligned):
|
||||
/* Calculate offset from 8 byte alignment to string start in bits. No
|
||||
need to mask offset since shifts are ignoring upper bits. */
|
||||
lsl offset, src2, #3
|
||||
bic src2, src2, #0xf
|
||||
mov mask, -1
|
||||
neg neg_offset, offset
|
||||
ldr data1, [src1], #8
|
||||
ldp tmp1, tmp2, [src2], #16
|
||||
LS_BK mask, mask, neg_offset
|
||||
and neg_offset, neg_offset, #63 /* Need actual value for cmp later. */
|
||||
/* Skip the first compare if data in tmp1 is irrelevant. */
|
||||
tbnz offset, 6, L(misaligned_mid_loop)
|
||||
|
||||
L(loop_misaligned):
|
||||
/* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/
|
||||
LS_FW data2, tmp1, offset
|
||||
LS_BK tmp1, tmp2, neg_offset
|
||||
subs limit, limit, #8
|
||||
orr data2, data2, tmp1 /* 8 bytes from SRC2 combined from two regs.*/
|
||||
sub has_nul, data1, zeroones
|
||||
eor diff, data1, data2 /* Non-zero if differences found. */
|
||||
orr tmp3, data1, #REP8_7f
|
||||
csinv endloop, diff, xzr, hi /* If limit, set to all ones. */
|
||||
bic has_nul, has_nul, tmp3 /* Non-zero if NUL byte found in SRC1. */
|
||||
orr tmp3, endloop, has_nul
|
||||
cbnz tmp3, L(full_check)
|
||||
|
||||
ldr data1, [src1], #8
|
||||
L(misaligned_mid_loop):
|
||||
/* STEP_B: Compare first part of data1 to second part of tmp2. */
|
||||
LS_FW data2, tmp2, offset
|
||||
#ifdef __AARCH64EB__
|
||||
/* For big-endian we do a byte reverse to avoid carry-propagation
|
||||
problem described above. This way we can reuse the has_nul in the
|
||||
next step and also use syndrome value trick at the end. */
|
||||
rev tmp3, data1
|
||||
#define data1_fixed tmp3
|
||||
#else
|
||||
#define data1_fixed data1
|
||||
#endif
|
||||
sub has_nul, data1_fixed, zeroones
|
||||
orr tmp3, data1_fixed, #REP8_7f
|
||||
eor diff, data2, data1 /* Non-zero if differences found. */
|
||||
bic has_nul, has_nul, tmp3 /* Non-zero if NUL terminator. */
|
||||
#ifdef __AARCH64EB__
|
||||
rev has_nul, has_nul
|
||||
#endif
|
||||
cmp limit, neg_offset, lsr #3
|
||||
orr syndrome, diff, has_nul
|
||||
bic syndrome, syndrome, mask /* Ignore later bytes. */
|
||||
csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
|
||||
cbnz tmp3, L(syndrome_check)
|
||||
|
||||
/* STEP_C: Compare second part of data1 to first part of tmp1. */
|
||||
ldp tmp1, tmp2, [src2], #16
|
||||
cmp limit, #8
|
||||
LS_BK data2, tmp1, neg_offset
|
||||
eor diff, data2, data1 /* Non-zero if differences found. */
|
||||
orr syndrome, diff, has_nul
|
||||
and syndrome, syndrome, mask /* Ignore earlier bytes. */
|
||||
csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
|
||||
cbnz tmp3, L(syndrome_check)
|
||||
|
||||
ldr data1, [src1], #8
|
||||
sub limit, limit, #8
|
||||
b L(loop_misaligned)
|
||||
|
||||
#ifdef __AARCH64EB__
|
||||
L(syndrome_check):
|
||||
clz pos, syndrome
|
||||
cmp pos, limit, lsl #3
|
||||
b.lo L(end_quick)
|
||||
#endif
|
||||
|
||||
L(ret0):
|
||||
mov result, #0
|
||||
ret
|
||||
END(__strncmp_aarch64)
|
128
libc/intrin/aarch64/strnlen.S
Normal file
128
libc/intrin/aarch64/strnlen.S
Normal file
|
@ -0,0 +1,128 @@
|
|||
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
|
||||
│vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi│
|
||||
╚──────────────────────────────────────────────────────────────────────────────╝
|
||||
│ │
|
||||
│ Optimized Routines │
|
||||
│ Copyright (c) 1999-2022, Arm Limited. │
|
||||
│ │
|
||||
│ Permission is hereby granted, free of charge, to any person obtaining │
|
||||
│ a copy of this software and associated documentation files (the │
|
||||
│ "Software"), to deal in the Software without restriction, including │
|
||||
│ without limitation the rights to use, copy, modify, merge, publish, │
|
||||
│ distribute, sublicense, and/or sell copies of the Software, and to │
|
||||
│ permit persons to whom the Software is furnished to do so, subject to │
|
||||
│ the following conditions: │
|
||||
│ │
|
||||
│ The above copyright notice and this permission notice shall be │
|
||||
│ included in all copies or substantial portions of the Software. │
|
||||
│ │
|
||||
│ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, │
|
||||
│ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF │
|
||||
│ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. │
|
||||
│ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY │
|
||||
│ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, │
|
||||
│ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE │
|
||||
│ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. │
|
||||
│ │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/intrin/aarch64/asmdefs.h"
|
||||
|
||||
#define __strnlen_aarch64 strnlen
|
||||
|
||||
.ident "\n\
|
||||
Optimized Routines (MIT License)\n\
|
||||
Copyright 2022 ARM Limited\n"
|
||||
.include "libc/disclaimer.inc"
|
||||
|
||||
/* Assumptions:
|
||||
*
|
||||
* ARMv8-a, AArch64, Advanced SIMD.
|
||||
* MTE compatible.
|
||||
*/
|
||||
|
||||
#define srcin x0
|
||||
#define cntin x1
|
||||
#define result x0
|
||||
|
||||
#define src x2
|
||||
#define synd x3
|
||||
#define shift x4
|
||||
#define tmp x4
|
||||
#define cntrem x5
|
||||
|
||||
#define qdata q0
|
||||
#define vdata v0
|
||||
#define vhas_chr v1
|
||||
#define vend v2
|
||||
#define dend d2
|
||||
|
||||
/*
|
||||
Core algorithm:
|
||||
Process the string in 16-byte aligned chunks. Compute a 64-bit mask with
|
||||
four bits per byte using the shrn instruction. Counting trailing zeros then
|
||||
identifies the first zero byte. */
|
||||
|
||||
/* size_t strnlen (srcin = x0, cntin = x1)
   Returns in x0 the offset of the first NUL byte at srcin, or cntin when no
   NUL is found before that many bytes.  result and srcin share x0.  */
ENTRY (__strnlen_aarch64)
|
||||
PTR_ARG (0)
|
||||
SIZE_ARG (1)
|
||||
bic src, srcin, 15
|
||||
/* A zero count returns cntin (0) without reading memory.  */
cbz cntin, L(nomatch)
|
||||
/* Test the 16-byte aligned chunk that contains srcin.  */
ld1 {vdata.16b}, [src]
|
||||
cmeq vhas_chr.16b, vdata.16b, 0
|
||||
/* shift = 4 * srcin.  The register-controlled lsr below only uses the low
   six bits, i.e. 4 * (srcin & 15), dropping the nibbles of bytes that
   precede the start of the string.  */
lsl shift, srcin, 2
|
||||
shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
|
||||
fmov synd, dend
|
||||
lsr synd, synd, shift
|
||||
cbz synd, L(start_loop)
|
||||
L(finish):
|
||||
/* Offset of the first NUL relative to srcin (four syndrome bits per byte),
   clamped to cntin.  */
rbit synd, synd
|
||||
clz synd, synd
|
||||
lsr result, synd, 2
|
||||
cmp cntin, result
|
||||
csel result, cntin, result, ls
|
||||
ret
|
||||
|
||||
L(nomatch):
|
||||
mov result, cntin
|
||||
ret
|
||||
|
||||
L(start_loop):
|
||||
/* tmp = 1 + number of string bytes covered by the first chunk.  If cntin
   is smaller than that, the limit falls inside that NUL-free chunk and the
   answer is cntin.  Otherwise cntrem = cntin - tmp.  */
sub tmp, src, srcin
|
||||
add tmp, tmp, 17
|
||||
subs cntrem, cntin, tmp
|
||||
b.lo L(nomatch)
|
||||
|
||||
/* Make sure that it won't overread by a 16-byte chunk */
|
||||
/* Bit 4 of cntrem selects the loop entry point.  Both entries test the
   chunk right after the first one: L(loop32) after src is moved back by
   16, L(loop32_2) directly.  */
tbz cntrem, 4, L(loop32_2)
|
||||
sub src, src, 16
|
||||
.p2align 5
|
||||
L(loop32):
|
||||
ldr qdata, [src, 32]!
|
||||
cmeq vhas_chr.16b, vdata.16b, 0
|
||||
umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
|
||||
fmov synd, dend
|
||||
cbnz synd, L(end)
|
||||
L(loop32_2):
|
||||
ldr qdata, [src, 16]
|
||||
/* The count is only checked once per 32 bytes; running out of count
   leaves through L(end_2) with vhas_chr already computed.  */
subs cntrem, cntrem, 32
|
||||
cmeq vhas_chr.16b, vdata.16b, 0
|
||||
b.lo L(end_2)
|
||||
umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
|
||||
fmov synd, dend
|
||||
cbz synd, L(loop32)
|
||||
L(end_2):
|
||||
/* Make src point at the chunk that was just tested.  */
add src, src, 16
|
||||
L(end):
|
||||
shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
|
||||
sub result, src, srcin
|
||||
fmov synd, dend
|
||||
#ifndef __AARCH64EB__
|
||||
rbit synd, synd
|
||||
#endif
|
||||
/* If the chunk holds no NUL, synd is 0 and clz yields 64, so 16 is added;
   presumably the clamp below then selects cntin -- TODO confirm.  */
clz synd, synd
|
||||
add result, result, synd, lsr 2
|
||||
cmp cntin, result
|
||||
csel result, cntin, result, ls
|
||||
ret
|
||||
|
||||
END (__strnlen_aarch64)
|
175
libc/intrin/aarch64/strrchr.S
Normal file
175
libc/intrin/aarch64/strrchr.S
Normal file
|
@ -0,0 +1,175 @@
|
|||
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
|
||||
│vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi│
|
||||
╚──────────────────────────────────────────────────────────────────────────────╝
|
||||
│ │
|
||||
│ Optimized Routines │
|
||||
│ Copyright (c) 1999-2022, Arm Limited. │
|
||||
│ │
|
||||
│ Permission is hereby granted, free of charge, to any person obtaining │
|
||||
│ a copy of this software and associated documentation files (the │
|
||||
│ "Software"), to deal in the Software without restriction, including │
|
||||
│ without limitation the rights to use, copy, modify, merge, publish, │
|
||||
│ distribute, sublicense, and/or sell copies of the Software, and to │
|
||||
│ permit persons to whom the Software is furnished to do so, subject to │
|
||||
│ the following conditions: │
|
||||
│ │
|
||||
│ The above copyright notice and this permission notice shall be │
|
||||
│ included in all copies or substantial portions of the Software. │
|
||||
│ │
|
||||
│ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, │
|
||||
│ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF │
|
||||
│ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. │
|
||||
│ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY │
|
||||
│ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, │
|
||||
│ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE │
|
||||
│ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. │
|
||||
│ │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/intrin/aarch64/asmdefs.h"
|
||||
|
||||
#define __strrchr_aarch64 strrchr
|
||||
|
||||
.ident "\n\
|
||||
Optimized Routines (MIT License)\n\
|
||||
Copyright 2022 ARM Limited\n"
|
||||
.include "libc/disclaimer.inc"
|
||||
|
||||
/* Assumptions:
|
||||
*
|
||||
* ARMv8-a, AArch64
|
||||
* Neon Available.
|
||||
*/
|
||||
|
||||
/* Arguments and results. */
|
||||
#define srcin x0
|
||||
#define chrin w1
|
||||
|
||||
#define result x0
|
||||
|
||||
#define src x2
|
||||
#define tmp1 x3
|
||||
#define wtmp2 w4
|
||||
#define tmp3 x5
|
||||
#define src_match x6
|
||||
#define src_offset x7
|
||||
#define const_m1 x8
|
||||
#define tmp4 x9
|
||||
#define nul_match x10
|
||||
#define chr_match x11
|
||||
|
||||
#define vrepchr v0
|
||||
#define vdata1 v1
|
||||
#define vdata2 v2
|
||||
#define vhas_nul1 v3
|
||||
#define vhas_nul2 v4
|
||||
#define vhas_chr1 v5
|
||||
#define vhas_chr2 v6
|
||||
#define vrepmask_0 v7
|
||||
#define vrepmask_c v16
|
||||
#define vend1 v17
|
||||
#define vend2 v18
|
||||
|
||||
/* Core algorithm.
|
||||
|
||||
For each 32-byte hunk we calculate a 64-bit syndrome value, with
|
||||
two bits per byte (LSB is always in bits 0 and 1, for both big
|
||||
and little-endian systems). For each tuple, bit 0 is set iff
|
||||
the relevant byte matched the requested character; bit 1 is set
|
||||
iff the relevant byte matched the NUL end of string (we trigger
|
||||
off bit0 for the special case of looking for NUL). Since the bits
|
||||
in the syndrome reflect exactly the order in which things occur
|
||||
in the original string a count_trailing_zeros() operation will
|
||||
identify exactly which byte is causing the termination, and why. */
|
||||
|
||||
/* strrchr: scans the string in 32-byte hunks, remembering the most recent
   hunk that contained chrin, and returns a pointer to the last occurrence
   or NULL when src_offset stays zero. */
ENTRY (__strrchr_aarch64)
|
||||
PTR_ARG (0)
|
||||
/* Magic constant 0x40100401 to allow us to identify which lane
|
||||
matches the requested byte. Magic constant 0x80200802 used
|
||||
similarly for NUL termination. */
|
||||
mov wtmp2, #0x0401
|
||||
movk wtmp2, #0x4010, lsl #16
|
||||
/* Broadcast the search byte to all 16 lanes. */
dup vrepchr.16b, chrin
|
||||
bic src, srcin, #31 /* Work with aligned 32-byte hunks. */
|
||||
dup vrepmask_c.4s, wtmp2
|
||||
/* src_offset == 0 means no match has been recorded yet. */
mov src_offset, #0
|
||||
/* tmp1 = misalignment of srcin within its 32-byte hunk; zero takes the
   aligned path. */
ands tmp1, srcin, #31
|
||||
add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */
|
||||
b.eq L(aligned)
|
||||
|
||||
/* Input string is not 32-byte aligned. Rather than forcing
|
||||
the padding bytes to a safe value, we calculate the syndrome
|
||||
for all the bytes, but then mask off those bits of the
|
||||
syndrome that are related to the padding. */
|
||||
ld1 {vdata1.16b, vdata2.16b}, [src], #32
|
||||
/* tmp1 is negated and doubled so that the register shift below, which uses
   the amount modulo 64, leaves tmp3 with the low 2 * misalignment bits set:
   two syndrome bits per padding byte. */
neg tmp1, tmp1
|
||||
cmeq vhas_nul1.16b, vdata1.16b, #0
|
||||
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
|
||||
cmeq vhas_nul2.16b, vdata2.16b, #0
|
||||
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
|
||||
and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
|
||||
and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
|
||||
and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
|
||||
and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
|
||||
addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b // 256->128
|
||||
addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
|
||||
addp vend1.16b, vhas_nul1.16b, vhas_chr1.16b // 128->64
|
||||
mov nul_match, vend1.d[0]
|
||||
lsl tmp1, tmp1, #1
|
||||
mov const_m1, #~0
|
||||
lsr tmp3, const_m1, tmp1
|
||||
mov chr_match, vend1.d[1]
|
||||
|
||||
bic nul_match, nul_match, tmp3 // Mask padding bits.
|
||||
bic chr_match, chr_match, tmp3 // Mask padding bits.
|
||||
cbnz nul_match, L(tail)
|
||||
|
||||
.p2align 4
|
||||
/* If the hunk just examined contained a match, record it as the latest one:
   src (already advanced past the hunk) and its match syndrome. */
L(loop):
|
||||
cmp chr_match, #0
|
||||
csel src_match, src, src_match, ne
|
||||
csel src_offset, chr_match, src_offset, ne
|
||||
L(aligned):
|
||||
ld1 {vdata1.16b, vdata2.16b}, [src], #32
|
||||
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
|
||||
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
|
||||
/* The pairwise minimum is zero wherever either byte of a pair is NUL. Inside
   the loop nul_match is therefore only a "some NUL present" indicator; the
   exact NUL syndrome is computed after the loop exits. */
uminp vend1.16b, vdata1.16b, vdata2.16b
|
||||
and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
|
||||
and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
|
||||
cmeq vend1.16b, vend1.16b, 0
|
||||
addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
|
||||
addp vend1.16b, vend1.16b, vhas_chr1.16b // 128->64
|
||||
mov nul_match, vend1.d[0]
|
||||
mov chr_match, vend1.d[1]
|
||||
cbz nul_match, L(loop)
|
||||
|
||||
/* A NUL was seen: build the precise two-bits-per-byte NUL syndrome for this
   hunk. */
cmeq vhas_nul1.16b, vdata1.16b, #0
|
||||
cmeq vhas_nul2.16b, vdata2.16b, #0
|
||||
and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
|
||||
and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
|
||||
addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b
|
||||
addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b
|
||||
mov nul_match, vhas_nul1.d[0]
|
||||
|
||||
L(tail):
|
||||
/* Work out exactly where the string ends. */
|
||||
/* x ^ (x - 1) sets every bit up to and including the lowest set bit of x, so
   tmp4 keeps only the matches at or before the first NUL. */
sub tmp4, nul_match, #1
|
||||
eor tmp4, tmp4, nul_match
|
||||
ands chr_match, chr_match, tmp4
|
||||
/* And pick the values corresponding to the last match. */
|
||||
csel src_match, src, src_match, ne
|
||||
csel src_offset, chr_match, src_offset, ne
|
||||
|
||||
/* Count down from the top of the syndrome to find the last match. */
|
||||
clz tmp3, src_offset
|
||||
/* Src_match points beyond the word containing the match, so we can
|
||||
simply subtract half the bit-offset into the syndrome. Because
|
||||
we are counting down, we need to go back one more character. */
|
||||
add tmp3, tmp3, #2
|
||||
sub result, src_match, tmp3, lsr #1
|
||||
/* But if the syndrome shows no match was found, then return NULL. */
|
||||
cmp src_offset, #0
|
||||
csel result, result, xzr, ne
|
||||
|
||||
ret
|
||||
|
||||
END (__strrchr_aarch64)
|
|
@ -6,6 +6,7 @@ PKGS += LIBC_INTRIN
|
|||
LIBC_INTRIN_ARTIFACTS += LIBC_INTRIN_A
|
||||
LIBC_INTRIN = $(LIBC_INTRIN_A_DEPS) $(LIBC_INTRIN_A)
|
||||
LIBC_INTRIN_A = o/$(MODE)/libc/intrin/intrin.a
|
||||
LIBC_INTRIN_A_FILES := $(wildcard libc/intrin/*)
|
||||
LIBC_INTRIN_A_HDRS = $(filter %.h,$(LIBC_INTRIN_A_FILES))
|
||||
LIBC_INTRIN_A_INCS = $(filter %.inc,$(LIBC_INTRIN_A_FILES))
|
||||
LIBC_INTRIN_A_SRCS_S = $(filter %.S,$(LIBC_INTRIN_A_FILES))
|
||||
|
@ -13,8 +14,9 @@ LIBC_INTRIN_A_SRCS_C = $(filter %.c,$(LIBC_INTRIN_A_FILES))
|
|||
LIBC_INTRIN_A_SRCS = $(LIBC_INTRIN_A_SRCS_S) $(LIBC_INTRIN_A_SRCS_C)
|
||||
LIBC_INTRIN_A_CHECKS = $(LIBC_INTRIN_A).pkg
|
||||
|
||||
LIBC_INTRIN_A_FILES := \
|
||||
$(wildcard libc/intrin/*)
|
||||
ifeq ($(ARCH), aarch64)
|
||||
LIBC_INTRIN_A_SRCS_S += $(wildcard libc/intrin/aarch64/*.S)
|
||||
endif
|
||||
|
||||
LIBC_INTRIN_A_OBJS = \
|
||||
$(LIBC_INTRIN_A_SRCS_S:%.S=o/$(MODE)/%.o) \
|
||||
|
@ -203,6 +205,8 @@ o/$(MODE)/libc/intrin/memmove.o: private \
|
|||
-fpie
|
||||
|
||||
# these assembly files are safe to build on aarch64
|
||||
# Pattern rule: one object per libc/intrin/aarch64/*.S source, built with the
# same OBJECTIFY.S recipe as the explicit fenv.o rule that follows.
o/$(MODE)/libc/intrin/aarch64/%.o: libc/intrin/aarch64/%.S
|
||||
@$(COMPILE) -AOBJECTIFY.S $(OBJECTIFY.S) $(OUTPUT_OPTION) -c $<
|
||||
o/$(MODE)/libc/intrin/fenv.o: libc/intrin/fenv.S
|
||||
@$(COMPILE) -AOBJECTIFY.S $(OBJECTIFY.S) $(OUTPUT_OPTION) -c $<
|
||||
o/$(MODE)/libc/intrin/futex.o: libc/intrin/futex.S
|
||||
|
|
|
@ -20,6 +20,7 @@
|
|||
#include "libc/intrin/asan.internal.h"
|
||||
#include "libc/nexgen32e/x86feature.h"
|
||||
#include "libc/str/str.h"
|
||||
#ifndef __aarch64__
|
||||
|
||||
typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(1)));
|
||||
|
||||
|
@ -83,3 +84,5 @@ void *memchr(const void *s, int c, size_t n) {
|
|||
return memchr_pure(s, c, n);
|
||||
#endif
|
||||
}
|
||||
|
||||
#endif /* __aarch64__ */
|
|
@ -20,6 +20,7 @@
|
|||
#include "libc/intrin/likely.h"
|
||||
#include "libc/nexgen32e/x86feature.h"
|
||||
#include "libc/str/str.h"
|
||||
#ifndef __aarch64__
|
||||
|
||||
#define PMOVMSKB(x) __builtin_ia32_pmovmskb128(x)
|
||||
|
||||
|
@ -129,7 +130,9 @@ microarchitecture("avx") static int memcmp_avx(const unsigned char *p,
|
|||
* memcmp n=32768 29 ps/byte 32,851 mb/s
|
||||
* memcmp n=131072 33 ps/byte 28,983 mb/s
|
||||
*
|
||||
* @return unsigned char subtraction at stop index
|
||||
* @return an integer that's (1) equal to zero if `a` is equal to `b`,
|
||||
* (2) less than zero if `a` is less than `b`, or (3) greater than
|
||||
* zero if `a` is greater than `b`
|
||||
* @asyncsignalsafe
|
||||
*/
|
||||
int memcmp(const void *a, const void *b, size_t n) {
|
||||
|
@ -200,3 +203,5 @@ int memcmp(const void *a, const void *b, size_t n) {
|
|||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif /* __aarch64__ */
|
||||
|
|
|
@ -22,6 +22,7 @@
|
|||
#include "libc/nexgen32e/nexgen32e.h"
|
||||
#include "libc/nexgen32e/x86feature.h"
|
||||
#include "libc/str/str.h"
|
||||
#ifndef __aarch64__
|
||||
|
||||
typedef long long xmm_t __attribute__((__vector_size__(16), __aligned__(1)));
|
||||
typedef long long xmm_a __attribute__((__vector_size__(16), __aligned__(16)));
|
||||
|
@ -343,3 +344,5 @@ void *memmove(void *dst, const void *src, size_t n) {
|
|||
|
||||
asm("memcpy = memmove\n\t"
|
||||
".globl\tmemcpy");
|
||||
|
||||
#endif /* __aarch64__ */
|
||||
|
|
|
@ -20,6 +20,7 @@
|
|||
#include "libc/intrin/asan.internal.h"
|
||||
#include "libc/nexgen32e/x86feature.h"
|
||||
#include "libc/str/str.h"
|
||||
#ifndef __aarch64__
|
||||
|
||||
typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(1)));
|
||||
|
||||
|
@ -81,3 +82,5 @@ void *memrchr(const void *s, int c, size_t n) {
|
|||
return memrchr_pure(s, c, n);
|
||||
#endif
|
||||
}
|
||||
|
||||
#endif /* __aarch64__ */
|
|
@ -22,6 +22,7 @@
|
|||
#include "libc/nexgen32e/nexgen32e.h"
|
||||
#include "libc/nexgen32e/x86feature.h"
|
||||
#include "libc/str/str.h"
|
||||
#ifndef __aarch64__
|
||||
|
||||
typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(1)));
|
||||
typedef long long xmm_a __attribute__((__vector_size__(16), __aligned__(16)));
|
||||
|
@ -168,3 +169,5 @@ void *memset(void *p, int c, size_t n) {
|
|||
return memset_sse(b, c, n);
|
||||
}
|
||||
}
|
||||
|
||||
#endif /* __aarch64__ */
|
||||
|
|
|
@ -17,6 +17,9 @@
|
|||
│ PERFORMANCE OF THIS SOFTWARE. │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/str/str.h"
|
||||
#ifndef __aarch64__
|
||||
|
||||
// TODO(jart): ASAN support here is important.
|
||||
|
||||
typedef char xmm_u __attribute__((__vector_size__(16), __aligned__(1)));
|
||||
typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(16)));
|
||||
|
@ -63,3 +66,5 @@ char *stpcpy(char *d, const char *s) {
|
|||
++i;
|
||||
}
|
||||
}
|
||||
|
||||
#endif /* __aarch64__ */
|
||||
|
|
|
@ -21,6 +21,7 @@
|
|||
#include "libc/intrin/asan.internal.h"
|
||||
#include "libc/nexgen32e/x86feature.h"
|
||||
#include "libc/str/str.h"
|
||||
#ifndef __aarch64__
|
||||
|
||||
static inline const char *strchr_pure(const char *s, int c) {
|
||||
for (;; ++s) {
|
||||
|
@ -115,3 +116,5 @@ char *strchr(const char *s, int c) {
|
|||
return r;
|
||||
#endif
|
||||
}
|
||||
|
||||
#endif /* __aarch64__ */
|
|
@ -21,6 +21,7 @@
|
|||
#include "libc/intrin/asan.internal.h"
|
||||
#include "libc/nexgen32e/x86feature.h"
|
||||
#include "libc/str/str.h"
|
||||
#ifndef __aarch64__
|
||||
|
||||
static inline const char *strchrnul_pure(const char *s, int c) {
|
||||
for (;; ++s) {
|
||||
|
@ -113,3 +114,5 @@ char *strchrnul(const char *s, int c) {
|
|||
return r;
|
||||
#endif
|
||||
}
|
||||
|
||||
#endif /* __aarch64__ */
|
|
@ -17,6 +17,9 @@
|
|||
│ PERFORMANCE OF THIS SOFTWARE. │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/str/str.h"
|
||||
#ifndef __aarch64__
|
||||
|
||||
// TODO(jart): ASAN support here is important.
|
||||
|
||||
typedef char xmm_u __attribute__((__vector_size__(16), __aligned__(1)));
|
||||
typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(16)));
|
||||
|
@ -63,3 +66,5 @@ char *strcpy(char *d, const char *s) {
|
|||
++i;
|
||||
}
|
||||
}
|
||||
|
||||
#endif /* __aarch64__ */
|
||||
|
|
|
@ -19,6 +19,7 @@
|
|||
#include "libc/dce.h"
|
||||
#include "libc/intrin/asan.internal.h"
|
||||
#include "libc/str/str.h"
|
||||
#ifndef __aarch64__
|
||||
|
||||
/**
|
||||
* Returns length of NUL-terminated string.
|
||||
|
@ -61,3 +62,5 @@ noasan size_t strlen(const char *s) {
|
|||
return n;
|
||||
#endif
|
||||
}
|
||||
|
||||
#endif /* __aarch64__ */
|
||||
|
|
|
@ -17,6 +17,7 @@
|
|||
│ PERFORMANCE OF THIS SOFTWARE. │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/str/str.h"
|
||||
#ifndef __aarch64__
|
||||
|
||||
/**
|
||||
* Compares NUL-terminated strings w/ limit.
|
||||
|
@ -32,3 +33,5 @@ int strncmp(const char *a, const char *b, size_t n) {
|
|||
while (i < n && a[i] == b[i] && b[i]) ++i;
|
||||
return (a[i] & 0xff) - (b[i] & 0xff);
|
||||
}
|
||||
|
||||
#endif /* __aarch64__ */
|
|
@ -21,6 +21,7 @@
|
|||
#include "libc/intrin/asan.internal.h"
|
||||
#include "libc/intrin/bits.h"
|
||||
#include "libc/str/str.h"
|
||||
#ifndef __aarch64__
|
||||
|
||||
static noasan size_t strnlen_x64(const char *s, size_t n, size_t i) {
|
||||
uint64_t w;
|
||||
|
@ -56,3 +57,5 @@ noasan size_t strnlen(const char *s, size_t n) {
|
|||
if (IsAsan()) __asan_verify(s, i);
|
||||
return i;
|
||||
}
|
||||
|
||||
#endif /* __aarch64__ */
|
|
@ -17,6 +17,7 @@
|
|||
│ PERFORMANCE OF THIS SOFTWARE. │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/str/str.h"
|
||||
#ifndef __aarch64__
|
||||
|
||||
/**
|
||||
* Searches for last instance of character in string.
|
||||
|
@ -29,3 +30,5 @@
|
|||
char *strrchr(const char *s, int c) {
  // The terminating NUL counts as part of the string (C11 7.24.5.5), so it
  // must lie inside the searched range; without the +1, strrchr(s, 0) would
  // return NULL instead of a pointer to the terminator.
  return memrchr(s, c, strlen(s) + 1);
}
|
||||
|
||||
#endif /* __aarch64__ */
|
|
@ -36,10 +36,16 @@ STATIC_YOINK("strerror_wr");
|
|||
/**
|
||||
* Handles failure of CHECK_xx() macros.
|
||||
*/
|
||||
relegated void __check_fail(const char *suffix, const char *opstr,
|
||||
uint64_t want, const char *wantstr, uint64_t got,
|
||||
const char *gotstr, const char *file, int line,
|
||||
const char *fmt, ...) {
|
||||
relegated void __check_fail(const char *suffix, //
|
||||
const char *opstr, //
|
||||
uint64_t want, //
|
||||
const char *wantstr, //
|
||||
uint64_t got, //
|
||||
const char *gotstr, //
|
||||
const char *file, //
|
||||
int line, //
|
||||
const char *fmt, //
|
||||
...) {
|
||||
int e;
|
||||
char *p;
|
||||
size_t i;
|
||||
|
|
|
@ -33,21 +33,69 @@
|
|||
*
|
||||
* @see libc/log/thunks/__check_fail_ndebug.S
|
||||
*/
|
||||
relegated wontreturn void __check_fail_ndebug(uint64_t want, uint64_t got,
|
||||
const char *file, int line,
|
||||
const char *opchar,
|
||||
const char *fmt, ...) {
|
||||
va_list va;
|
||||
static relegated wontreturn void __check_fail_ndebug(uint64_t want, //
|
||||
uint64_t got, //
|
||||
const char *file, //
|
||||
int line, //
|
||||
const char *opchar, //
|
||||
const char *fmt, //
|
||||
va_list va) {
|
||||
__restore_tty();
|
||||
kprintf("%rerror:%s:%d: check failed: %'ld %s %'ld% m", file, line, want,
|
||||
opchar, got);
|
||||
if (*fmt) {
|
||||
if (fmt && *fmt) {
|
||||
kprintf(": ");
|
||||
va_start(va, fmt);
|
||||
kvprintf(fmt, va);
|
||||
va_end(va);
|
||||
}
|
||||
kprintf("\n");
|
||||
if (_weaken(__die)) _weaken(__die)();
|
||||
_Exitr(68);
|
||||
}
|
||||
|
||||
/**
 * Failure entry point for CHECK_EQ(): packages the variadic message
 * arguments into a va_list and hands everything to __check_fail_ndebug(),
 * which is declared wontreturn.
 */
void __check_fail_eq(uint64_t want,       //
                     uint64_t got,        //
                     const char *file,    //
                     int line,            //
                     const char *opchar,  //
                     const char *fmt,     //
                     ...) {
  va_list ap;
  va_start(ap, fmt);
  __check_fail_ndebug(want, got, file, line, opchar, fmt, ap);
  va_end(ap);
}
|
||||
|
||||
/**
 * Failure entry point for CHECK_NE(): packages the variadic message
 * arguments into a va_list and hands everything to __check_fail_ndebug(),
 * which is declared wontreturn.
 */
void __check_fail_ne(uint64_t want,       //
                     uint64_t got,        //
                     const char *file,    //
                     int line,            //
                     const char *opchar,  //
                     const char *fmt,     //
                     ...) {
  va_list ap;
  va_start(ap, fmt);
  __check_fail_ndebug(want, got, file, line, opchar, fmt, ap);
  va_end(ap);
}
|
||||
|
||||
/**
 * Failure entry point for CHECK_LE(): packages the variadic message
 * arguments into a va_list and hands everything to __check_fail_ndebug(),
 * which is declared wontreturn.
 */
void __check_fail_le(uint64_t want,       //
                     uint64_t got,        //
                     const char *file,    //
                     int line,            //
                     const char *opchar,  //
                     const char *fmt,     //
                     ...) {
  va_list ap;
  va_start(ap, fmt);
  __check_fail_ndebug(want, got, file, line, opchar, fmt, ap);
  va_end(ap);
}
|
||||
|
||||
/**
 * Failure entry point for CHECK_LT(): packages the variadic message
 * arguments into a va_list and hands everything to __check_fail_ndebug(),
 * which is declared wontreturn.
 */
void __check_fail_lt(uint64_t want,       //
                     uint64_t got,        //
                     const char *file,    //
                     int line,            //
                     const char *opchar,  //
                     const char *fmt,     //
                     ...) {
  va_list ap;
  va_start(ap, fmt);
  __check_fail_ndebug(want, got, file, line, opchar, fmt, ap);
  va_end(ap);
}
|
||||
|
||||
/**
 * Failure entry point for CHECK_GE(): packages the variadic message
 * arguments into a va_list and hands everything to __check_fail_ndebug(),
 * which is declared wontreturn.
 */
void __check_fail_ge(uint64_t want,       //
                     uint64_t got,        //
                     const char *file,    //
                     int line,            //
                     const char *opchar,  //
                     const char *fmt,     //
                     ...) {
  va_list ap;
  va_start(ap, fmt);
  __check_fail_ndebug(want, got, file, line, opchar, fmt, ap);
  va_end(ap);
}
|
||||
|
||||
/**
 * Failure entry point for CHECK_GT(): packages the variadic message
 * arguments into a va_list and hands everything to __check_fail_ndebug(),
 * which is declared wontreturn.
 */
void __check_fail_gt(uint64_t want,       //
                     uint64_t got,        //
                     const char *file,    //
                     int line,            //
                     const char *opchar,  //
                     const char *fmt,     //
                     ...) {
  va_list ap;
  va_start(ap, fmt);
  __check_fail_ndebug(want, got, file, line, opchar, fmt, ap);
  va_end(ap);
}
|
||||
|
|
|
@ -6,9 +6,7 @@ PKGS += LIBC_LOG
|
|||
LIBC_LOG_ARTIFACTS += LIBC_LOG_A
|
||||
LIBC_LOG = $(LIBC_LOG_A_DEPS) $(LIBC_LOG_A)
|
||||
LIBC_LOG_A = o/$(MODE)/libc/log/log.a
|
||||
LIBC_LOG_A_FILES := \
|
||||
$(wildcard libc/log/thunks/*) \
|
||||
$(wildcard libc/log/*)
|
||||
LIBC_LOG_A_FILES := $(wildcard libc/log/*)
|
||||
LIBC_LOG_A_HDRS = $(filter %.h,$(LIBC_LOG_A_FILES))
|
||||
LIBC_LOG_A_SRCS_C = $(filter %.c,$(LIBC_LOG_A_FILES))
|
||||
LIBC_LOG_A_SRCS_S = $(filter %.S,$(LIBC_LOG_A_FILES))
|
||||
|
|
|
@ -1,30 +0,0 @@
|
|||
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
|
||||
│vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi│
|
||||
╞══════════════════════════════════════════════════════════════════════════════╡
|
||||
│ Copyright 2020 Justine Alexandra Roberts Tunney │
|
||||
│ │
|
||||
│ Permission to use, copy, modify, and/or distribute this software for │
|
||||
│ any purpose with or without fee is hereby granted, provided that the │
|
||||
│ above copyright notice and this permission notice appear in all copies. │
|
||||
│ │
|
||||
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │
|
||||
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │
|
||||
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │
|
||||
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │
|
||||
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │
|
||||
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │
|
||||
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
|
||||
│ PERFORMANCE OF THIS SOFTWARE. │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/macros.internal.h"
|
||||
.text.unlikely
|
||||
|
||||
// Code-size saving thunk for CHECK_EQ() in NDEBUG mode.
|
||||
__check_fail_eq:
|
||||
lea .Lop(%rip),%r8
|
||||
jmp __check_fail_ndebug
|
||||
.endfn __check_fail_eq,globl
|
||||
|
||||
.rodata.str1.1
|
||||
.Lop: .asciz "=="
|
||||
.previous
|
|
@ -1,30 +0,0 @@
|
|||
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
|
||||
│vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi│
|
||||
╞══════════════════════════════════════════════════════════════════════════════╡
|
||||
│ Copyright 2020 Justine Alexandra Roberts Tunney │
|
||||
│ │
|
||||
│ Permission to use, copy, modify, and/or distribute this software for │
|
||||
│ any purpose with or without fee is hereby granted, provided that the │
|
||||
│ above copyright notice and this permission notice appear in all copies. │
|
||||
│ │
|
||||
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │
|
||||
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │
|
||||
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │
|
||||
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │
|
||||
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │
|
||||
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │
|
||||
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
|
||||
│ PERFORMANCE OF THIS SOFTWARE. │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/macros.internal.h"
|
||||
.text.unlikely
|
||||
|
||||
// Code-size saving thunk for CHECK_GE() in NDEBUG mode.
|
||||
__check_fail_ge:
|
||||
lea .Lop(%rip),%r8
|
||||
jmp __check_fail_ndebug
|
||||
.endfn __check_fail_ge,globl
|
||||
|
||||
.rodata.str1.1
|
||||
.Lop: .asciz ">="
|
||||
.previous
|
|
@ -1,30 +0,0 @@
|
|||
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
|
||||
│vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi│
|
||||
╞══════════════════════════════════════════════════════════════════════════════╡
|
||||
│ Copyright 2020 Justine Alexandra Roberts Tunney │
|
||||
│ │
|
||||
│ Permission to use, copy, modify, and/or distribute this software for │
|
||||
│ any purpose with or without fee is hereby granted, provided that the │
|
||||
│ above copyright notice and this permission notice appear in all copies. │
|
||||
│ │
|
||||
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │
|
||||
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │
|
||||
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │
|
||||
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │
|
||||
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │
|
||||
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │
|
||||
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
|
||||
│ PERFORMANCE OF THIS SOFTWARE. │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/macros.internal.h"
|
||||
.text.unlikely
|
||||
|
||||
// Code-size saving thunk for CHECK_GT() in NDEBUG mode.
|
||||
__check_fail_gt:
|
||||
lea .Lop(%rip),%r8
|
||||
jmp __check_fail_ndebug
|
||||
.endfn __check_fail_gt,globl
|
||||
|
||||
.rodata.str1.1
|
||||
.Lop: .asciz ">"
|
||||
.previous
|
|
@ -1,30 +0,0 @@
|
|||
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
|
||||
│vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi│
|
||||
╞══════════════════════════════════════════════════════════════════════════════╡
|
||||
│ Copyright 2020 Justine Alexandra Roberts Tunney │
|
||||
│ │
|
||||
│ Permission to use, copy, modify, and/or distribute this software for │
|
||||
│ any purpose with or without fee is hereby granted, provided that the │
|
||||
│ above copyright notice and this permission notice appear in all copies. │
|
||||
│ │
|
||||
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │
|
||||
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │
|
||||
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │
|
||||
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │
|
||||
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │
|
||||
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │
|
||||
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
|
||||
│ PERFORMANCE OF THIS SOFTWARE. │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/macros.internal.h"
|
||||
.text.unlikely
|
||||
|
||||
// Code-size saving thunk for CHECK_LE() in NDEBUG mode.
|
||||
__check_fail_le:
|
||||
lea .Lop(%rip),%r8
|
||||
jmp __check_fail_ndebug
|
||||
.endfn __check_fail_le,globl
|
||||
|
||||
.rodata.str1.1
|
||||
.Lop: .asciz "<="
|
||||
.previous
|
|
@ -1,30 +0,0 @@
|
|||
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
|
||||
│vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi│
|
||||
╞══════════════════════════════════════════════════════════════════════════════╡
|
||||
│ Copyright 2020 Justine Alexandra Roberts Tunney │
|
||||
│ │
|
||||
│ Permission to use, copy, modify, and/or distribute this software for │
|
||||
│ any purpose with or without fee is hereby granted, provided that the │
|
||||
│ above copyright notice and this permission notice appear in all copies. │
|
||||
│ │
|
||||
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │
|
||||
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │
|
||||
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │
|
||||
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │
|
||||
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │
|
||||
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │
|
||||
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
|
||||
│ PERFORMANCE OF THIS SOFTWARE. │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/macros.internal.h"
|
||||
.text.unlikely
|
||||
|
||||
// Code-size saving thunk for CHECK_LT() in NDEBUG mode.
|
||||
__check_fail_lt:
|
||||
lea .Lop(%rip),%r8
|
||||
jmp __check_fail_ndebug
|
||||
.endfn __check_fail_lt,globl
|
||||
|
||||
.rodata.str1.1
|
||||
.Lop: .asciz "<"
|
||||
.previous
|
|
@ -1,30 +0,0 @@
|
|||
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
|
||||
│vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi│
|
||||
╞══════════════════════════════════════════════════════════════════════════════╡
|
||||
│ Copyright 2020 Justine Alexandra Roberts Tunney │
|
||||
│ │
|
||||
│ Permission to use, copy, modify, and/or distribute this software for │
|
||||
│ any purpose with or without fee is hereby granted, provided that the │
|
||||
│ above copyright notice and this permission notice appear in all copies. │
|
||||
│ │
|
||||
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │
|
||||
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │
|
||||
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │
|
||||
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │
|
||||
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │
|
||||
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │
|
||||
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
|
||||
│ PERFORMANCE OF THIS SOFTWARE. │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/macros.internal.h"
|
||||
.text.unlikely
|
||||
|
||||
// Code-size saving thunk for CHECK_NE() in NDEBUG mode.
|
||||
__check_fail_ne:
|
||||
lea .Lop(%rip),%r8
|
||||
jmp __check_fail_ndebug
|
||||
.endfn __check_fail_ne,globl
|
||||
|
||||
.rodata.str1.1
|
||||
.Lop: .asciz "!="
|
||||
.previous
|
|
@ -43,8 +43,22 @@
|
|||
Ticks; \
|
||||
})
|
||||
#else
|
||||
#define __startbench() rdtsc()
|
||||
#define __endbench() rdtsc()
|
||||
/* Reads the timestamp counter via rdtsc() bracketed by `isb` instructions and
   evaluates to the value read. The "memory" clobbers also keep the compiler
   from moving memory accesses across the read.
   NOTE(review): this sits in the #else arm of a conditional whose test is
   outside this excerpt; presumably the non-x86 (AArch64) path, since `isb`
   is an ARM barrier. */
#define __startbench() \
|
||||
({ \
|
||||
uint64_t _ts; \
|
||||
asm volatile("isb" ::: "memory"); \
|
||||
_ts = rdtsc(); \
|
||||
asm volatile("isb" ::: "memory"); \
|
||||
_ts; \
|
||||
})
|
||||
/* Closing counterpart of the benchmark timer: reads the timestamp counter via
   rdtsc() bracketed by `isb` instructions and evaluates to the value read.
   The "memory" clobbers also keep the compiler from moving memory accesses
   across the read.
   NOTE(review): this sits in the #else arm of a conditional whose test is
   outside this excerpt; presumably the non-x86 (AArch64) path. */
#define __endbench() \
|
||||
({ \
|
||||
uint64_t _ts; \
|
||||
asm volatile("isb" ::: "memory"); \
|
||||
_ts = rdtsc(); \
|
||||
asm volatile("isb" ::: "memory"); \
|
||||
_ts; \
|
||||
})
|
||||
#endif
|
||||
|
||||
#define __startbench_m() mfence_lfence_rdtsc_lfence()
|
||||
|
|
|
@ -1,262 +0,0 @@
|
|||
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
|
||||
│vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi│
|
||||
╞══════════════════════════════════════════════════════════════════════════════╡
|
||||
│ Copyright 2020 Justine Alexandra Roberts Tunney │
|
||||
│ │
|
||||
│ Permission to use, copy, modify, and/or distribute this software for │
|
||||
│ any purpose with or without fee is hereby granted, provided that the │
|
||||
│ above copyright notice and this permission notice appear in all copies. │
|
||||
│ │
|
||||
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │
|
||||
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │
|
||||
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │
|
||||
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │
|
||||
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │
|
||||
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │
|
||||
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
|
||||
│ PERFORMANCE OF THIS SOFTWARE. │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/macros.internal.h"
|
||||
|
||||
// Computes Phil Katz CRC-32 w/ carryless multiply isa.
|
||||
//
|
||||
// This is support code that's abstracted by crc32_z().
|
||||
//
|
||||
// @param edi is initial value
|
||||
// @param rsi points to buffer
|
||||
// @param rdx is bytes in buffer that's >=64 and %16==0
|
||||
// @return eax is crc32
|
||||
// @note needs Westmere (c.2010) or Bulldozer (c.2011)
|
||||
// @see “Fast CRC Computation for Generic Polynomials Using
|
||||
// PCLMULQDQ Instruction” V. Gopal, E. Ozturk, et al.,
|
||||
// 2009, intel.ly/2ySEwL0
|
||||
crc32_pclmul:
|
||||
.leafprologue
|
||||
.profilable
|
||||
// Load the first 64 bytes of the buffer into four 16-byte
// accumulators (xmm1, xmm9, xmm4, xmm0). The initial value
// from edi is xor'd into the first 16 bytes.
movdqu (%rsi),%xmm7
|
||||
movd %edi,%xmm1
|
||||
movdqu 16(%rsi),%xmm9
|
||||
movdqu 32(%rsi),%xmm4
|
||||
movdqu 48(%rsi),%xmm0
|
||||
// rdi = bytes left after the first 64, rcx = next unread byte
lea -64(%rdx),%rdi
|
||||
lea 64(%rsi),%rcx
|
||||
pxor %xmm7,%xmm1
|
||||
movdqa .Lk1k2(%rip),%xmm8
|
||||
// skip the 64-byte loop when fewer than 64 bytes remain
cmp $63,%rdi
|
||||
jbe 2f
|
||||
// rdx = number of extra loop iterations beyond the first,
// rsi = value of rax at which the loop below stops
lea -128(%rdx),%rdi
|
||||
mov %rdi,%rdx
|
||||
shr $6,%rdx
|
||||
lea 2(%rdx),%rax
|
||||
sal $6,%rax
|
||||
add %rax,%rsi
|
||||
mov %rcx,%rax
|
||||
// Main loop: each pass multiplies both halves of all four
// accumulators by the .Lk1k2 constants and xors in the next
// 64 bytes of input, read at -64..-16 relative to rax.
3: add $64,%rax
|
||||
movdqa %xmm1,%xmm7
|
||||
movdqa %xmm4,%xmm5
|
||||
movdqa %xmm0,%xmm3
|
||||
movdqa %xmm9,%xmm6
|
||||
movdqa %xmm9,%xmm2
|
||||
movdqu -48(%rax),%xmm9
|
||||
pclmullqlqdq %xmm8,%xmm7
|
||||
pclmullqlqdq %xmm8,%xmm6
|
||||
pclmullqlqdq %xmm8,%xmm5
|
||||
pclmulhqhqdq %xmm8,%xmm1
|
||||
pclmulhqhqdq %xmm8,%xmm2
|
||||
pclmulhqhqdq %xmm8,%xmm4
|
||||
pxor %xmm7,%xmm1
|
||||
movdqu -64(%rax),%xmm7
|
||||
pxor %xmm6,%xmm2
|
||||
pxor %xmm5,%xmm4
|
||||
movdqu -32(%rax),%xmm6
|
||||
movdqu -16(%rax),%xmm5
|
||||
pclmullqlqdq %xmm8,%xmm3
|
||||
pclmulhqhqdq %xmm8,%xmm0
|
||||
pxor %xmm7,%xmm1
|
||||
pxor %xmm3,%xmm0
|
||||
pxor %xmm2,%xmm9
|
||||
pxor %xmm6,%xmm4
|
||||
pxor %xmm5,%xmm0
|
||||
cmp %rsi,%rax
|
||||
jne 3b
|
||||
// account for what the loop consumed: rdi becomes the bytes
// still unread and rcx advances to the next unread byte
lea 1(%rdx),%rax
|
||||
sal $6,%rdx
|
||||
sal $6,%rax
|
||||
sub %rdx,%rdi
|
||||
add %rax,%rcx
|
||||
// Combine the four accumulators into xmm0 using the .Lk3k4
// constants: xmm1 is folded into xmm9, that result into xmm4,
// and that result into xmm0.
2: movdqa .Lk3k4(%rip),%xmm3
|
||||
movdqa %xmm1,%xmm2
|
||||
movdqa %xmm1,%xmm5
|
||||
pclmulhqhqdq %xmm3,%xmm2
|
||||
pclmullqlqdq %xmm3,%xmm5
|
||||
pxor %xmm9,%xmm2
|
||||
pxor %xmm5,%xmm2
|
||||
movdqa %xmm2,%xmm5
|
||||
pclmulhqhqdq %xmm3,%xmm2
|
||||
movdqa %xmm2,%xmm1
|
||||
pclmullqlqdq %xmm3,%xmm5
|
||||
pxor %xmm4,%xmm1
|
||||
pxor %xmm5,%xmm1
|
||||
movdqa %xmm1,%xmm2
|
||||
pclmulhqhqdq %xmm3,%xmm1
|
||||
pclmullqlqdq %xmm3,%xmm2
|
||||
pxor %xmm1,%xmm0
|
||||
pxor %xmm2,%xmm0
|
||||
// skip the 16-byte loop when fewer than 16 bytes remain
cmp $15,%rdi
|
||||
jbe 4f
|
||||
// rdx = end pointer for the remaining whole 16-byte chunks
sub $16,%rdi
|
||||
mov %rcx,%rax
|
||||
and $-16,%rdi
|
||||
lea 16(%rcx,%rdi),%rdx
|
||||
// fold xmm0 with .Lk3k4 and xor in 16 more bytes per pass
5: movdqa %xmm0,%xmm1
|
||||
movdqu (%rax),%xmm6
|
||||
pclmulhqhqdq %xmm3,%xmm0
|
||||
add $16,%rax
|
||||
pclmullqlqdq %xmm3,%xmm1
|
||||
pxor %xmm1,%xmm0
|
||||
pxor %xmm6,%xmm0
|
||||
cmp %rax,%rdx
|
||||
jne 5b
|
||||
// Final reduction of the 128-bit accumulator in xmm0 down to
// the 32-bit result, using .Lk3k4, .Lk5k0 and .Lpoly, with
// .Lboop as a mask keeping the low 32 bits of each quadword.
4: movdqa %xmm0,%xmm1
|
||||
movdqa .Lboop(%rip),%xmm2
|
||||
psrldq $8,%xmm0
|
||||
pclmullqhqdq %xmm3,%xmm1
|
||||
movdqa .Lpoly(%rip),%xmm3
|
||||
pxor %xmm1,%xmm0
|
||||
movdqa %xmm0,%xmm1
|
||||
pand %xmm2,%xmm0
|
||||
pclmullqlqdq .Lk5k0(%rip),%xmm0
|
||||
psrldq $4,%xmm1
|
||||
pxor %xmm0,%xmm1
|
||||
movdqa %xmm1,%xmm0
|
||||
pand %xmm2,%xmm0
|
||||
pclmullqhqdq %xmm3,%xmm0
|
||||
pand %xmm2,%xmm0
|
||||
pclmullqlqdq %xmm3,%xmm0
|
||||
pxor %xmm1,%xmm0
|
||||
// the return value is bits 32..63 of xmm0
movq %xmm0,%rax
|
||||
shr $32,%rax
|
||||
.leafepilogue
|
||||
.endfn crc32_pclmul,globl,hidden
|
||||
|
||||
// Definitions of the bit-reflected domain constants k1,k2,k3, etc.
|
||||
// and the CRC32+Barrett polynomials given at the end of the paper.
|
||||
.rodata.cst16
|
||||
.Lk1k2: .quad 0x0000000154442bd4
|
||||
.quad 0x00000001c6e41596
|
||||
.endobj .Lk1k2
|
||||
.Lk3k4: .quad 0x00000001751997d0
|
||||
.quad 0x00000000ccaa009e
|
||||
.endobj .Lk3k4
|
||||
.Lk5k0: .quad 0x0000000163cd6124
|
||||
.quad 0x0000000000000000
|
||||
.endobj .Lk5k0
|
||||
.Lboop: .quad 0x00000000ffffffff
|
||||
.quad 0x00000000ffffffff
|
||||
.endobj .Lboop
|
||||
.Lpoly: .quad 0x00000001db710641
|
||||
.quad 0x00000001f7011641
|
||||
.endobj .Lpoly
|
||||
.previous
|
||||
|
||||
/* crc32() w/ pclmul for #c per n where c ≈ 0.293ns
|
||||
N x1 x8 x64 mBps
|
||||
------------------------------------------------------------
|
||||
1 4437.000 42.375 38.141 85
|
||||
1 45.000 39.375 38.234 85
|
||||
2 31.500 25.312 23.102 141
|
||||
3 25.667 19.792 17.911 181
|
||||
4 21.250 16.219 15.035 216
|
||||
7 18.429 12.946 11.712 277
|
||||
8 16.125 12.578 10.998 296
|
||||
15 12.867 9.925 9.161 355
|
||||
16 12.438 9.836 9.114 357
|
||||
31 11.194 8.528 8.149 399
|
||||
32 10.781 8.418 8.098 401
|
||||
63 9.063 7.780 7.647 425
|
||||
64 3.109 1.604 1.414 2299
|
||||
127 2.260 1.824 1.729 1880
|
||||
128 1.305 0.860 0.806 4033
|
||||
255 1.290 1.001 0.948 3428
|
||||
256 0.574 0.491 0.476 6822
|
||||
511 0.773 0.571 0.546 5956
|
||||
512 0.354 0.320 0.306 10613
|
||||
1023 0.425 0.365 0.347 9375
|
||||
1024 0.237 0.229 0.231 14097
|
||||
2047 0.278 0.251 0.246 13236
|
||||
2048 0.187 0.187 0.188 17306
|
||||
4095 0.229 0.200 0.194 16761
|
||||
4096 0.162 0.170 0.167 19438
|
||||
8191 0.182 0.173 0.178 18266
|
||||
8192 0.162 0.155 0.158 20560
|
||||
16383 0.156 0.162 0.154 21136
|
||||
16384 0.156 0.156 0.148 22005
|
||||
32767 0.163 0.149 0.149 21768
|
||||
32768 0.150 0.146 0.145 22491
|
||||
65535 0.158 0.141 0.141 23102
|
||||
65536 0.149 0.140 0.138 23478
|
||||
131071 0.150 0.145 0.141 23011
|
||||
131072 0.148 0.141 0.148 21892
|
||||
262143 0.151 0.148 0.147 22136
|
||||
262144 0.149 0.146 0.146 22298
|
||||
524287 0.150 0.149 0.149 21832
|
||||
524288 0.148 0.148 0.147 22043
|
||||
1048575 0.148 0.158 0.163 19913
|
||||
1048576 0.156 0.179 0.153 21186
|
||||
2097151 0.153 0.149 0.148 21979
|
||||
2097152 0.147 0.148 0.147 22040
|
||||
4194303 0.148 0.148 0.151 21482
|
||||
4194304 0.148 0.148 0.147 22061
|
||||
8388607 0.185 0.183 0.185 17536
|
||||
8388608 0.193 0.180 0.183 17769
|
||||
|
||||
crc32() w/ 10+ year old cpus for #c per n where c ≈ 0.293ns
|
||||
N x1 x8 x64 mBps
|
||||
------------------------------------------------------------
|
||||
1 4447.000 43.625 37.641 86
|
||||
1 41.000 37.125 37.609 86
|
||||
2 31.500 26.562 22.477 145
|
||||
3 25.000 20.125 17.422 187
|
||||
4 21.250 16.594 15.230 213
|
||||
7 16.714 13.089 11.717 277
|
||||
8 16.875 12.609 11.174 291
|
||||
15 12.733 9.958 9.339 348
|
||||
16 12.438 9.852 9.208 353
|
||||
31 10.935 8.617 8.164 398
|
||||
32 10.906 8.496 8.155 399
|
||||
63 9.095 7.819 7.692 423
|
||||
64 9.172 7.807 7.692 423
|
||||
127 8.165 7.531 7.438 437
|
||||
128 8.133 7.503 7.437 437
|
||||
255 7.714 7.329 7.293 446
|
||||
256 7.723 7.348 7.293 446
|
||||
511 7.434 7.253 7.223 450
|
||||
512 7.412 7.237 7.218 450
|
||||
1023 7.274 7.214 7.201 451
|
||||
1024 7.292 7.203 7.189 452
|
||||
2047 7.232 7.185 7.178 453
|
||||
2048 7.239 7.189 7.186 452
|
||||
4095 7.189 7.175 7.172 453
|
||||
4096 7.192 7.173 7.172 453
|
||||
8191 7.187 7.173 7.172 453
|
||||
8192 7.183 7.174 7.181 453
|
||||
16383 7.175 7.170 7.169 453
|
||||
16384 7.176 7.169 7.169 453
|
||||
32767 7.169 7.182 7.170 453
|
||||
32768 7.173 7.172 7.172 453
|
||||
65535 7.170 7.170 7.171 453
|
||||
65536 7.172 7.171 7.204 451
|
||||
131071 7.170 7.354 7.260 448
|
||||
131072 7.172 7.172 7.182 453
|
||||
262143 7.037 7.178 7.182 453
|
||||
262144 7.169 7.343 7.205 451
|
||||
524287 7.438 7.170 7.206 451
|
||||
524288 7.169 7.164 7.209 451
|
||||
1048575 6.995 7.119 7.158 454
|
||||
1048576 7.168 7.110 7.157 454
|
||||
2097151 7.057 7.058 7.065 460
|
||||
2097152 6.977 7.047 7.089 458
|
||||
4194303 7.017 7.504 7.030 462
|
||||
4194304 7.025 7.059 7.030 462
|
||||
8388607 7.082 6.980 6.997 464
|
||||
8388608 7.051 6.985 6.999 464 */
|
|
@ -8,12 +8,6 @@ extern const uint32_t kCrc32cTab[256];
|
|||
void crc32init(uint32_t[hasatleast 256], uint32_t);
|
||||
uint32_t crc32a(uint32_t, const void *, size_t);
|
||||
uint32_t crc32c(uint32_t, const void *, size_t);
|
||||
uint32_t crc32_z(uint32_t, const void *, size_t);
|
||||
uint32_t crc32c_pure(uint32_t, const void *, size_t)
|
||||
strlenesque _Hide;
|
||||
uint32_t crc32c_sse42(uint32_t, const void *, size_t)
|
||||
strlenesque _Hide;
|
||||
uint32_t crc32_pclmul(uint32_t, const void *, size_t) _Hide;
|
||||
|
||||
COSMOPOLITAN_C_END_
|
||||
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
|
||||
|
|
|
@ -1,63 +0,0 @@
|
|||
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
|
||||
│vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi│
|
||||
╞══════════════════════════════════════════════════════════════════════════════╡
|
||||
│ Copyright 2020 Justine Alexandra Roberts Tunney │
|
||||
│ │
|
||||
│ Permission to use, copy, modify, and/or distribute this software for │
|
||||
│ any purpose with or without fee is hereby granted, provided that the │
|
||||
│ above copyright notice and this permission notice appear in all copies. │
|
||||
│ │
|
||||
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │
|
||||
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │
|
||||
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │
|
||||
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │
|
||||
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │
|
||||
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │
|
||||
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
|
||||
│ PERFORMANCE OF THIS SOFTWARE. │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/dce.h"
|
||||
#include "libc/intrin/asan.internal.h"
|
||||
#include "libc/macros.internal.h"
|
||||
#include "libc/nexgen32e/crc32.h"
|
||||
#include "libc/nexgen32e/x86feature.h"
|
||||
#include "libc/str/str.h"
|
||||
|
||||
/**
|
||||
* Computes Phil Katz CRC-32 used by zip/zlib/gzip/etc.
|
||||
*
|
||||
* x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1
|
||||
* 0b100000100110000010001110110110111
|
||||
* _bitreverse32(0x104c11db7)
|
||||
*
|
||||
* This implementation takes 32 picoseconds per byte or 30 gibibyte/s.
|
||||
*
|
||||
* @param h is initial value
|
||||
*/
|
||||
uint32_t crc32_z(uint32_t h, const void *data, size_t size) {
|
||||
size_t n;
|
||||
static bool once;
|
||||
const unsigned char *p, *e;
|
||||
static uint32_t kCrc32Tab[256];
|
||||
if (!once) {
|
||||
crc32init(kCrc32Tab, 0xedb88320);
|
||||
once = 0;
|
||||
}
|
||||
if (size == -1) {
|
||||
size = data ? strlen(data) : 0;
|
||||
}
|
||||
p = data;
|
||||
e = p + size;
|
||||
h ^= 0xffffffff;
|
||||
if (X86_HAVE(PCLMUL)) {
|
||||
while (((intptr_t)p & 15) && p < e)
|
||||
h = h >> 8 ^ kCrc32Tab[(h & 0xff) ^ *p++];
|
||||
if ((n = ROUNDDOWN(e - p, 16)) >= 64) {
|
||||
if (IsAsan()) __asan_verify(p, n);
|
||||
h = crc32_pclmul(h, p, n); /* 51x faster */
|
||||
p += n;
|
||||
}
|
||||
}
|
||||
while (p < e) h = h >> 8 ^ kCrc32Tab[(h & 0xff) ^ *p++];
|
||||
return h ^ 0xffffffff;
|
||||
}
|
61
libc/sysv/consts/hwap.h
Normal file
61
libc/sysv/consts/hwap.h
Normal file
|
@ -0,0 +1,61 @@
|
|||
#ifndef COSMOPOLITAN_LIBC_SYSV_CONSTS_HWAP_H_
|
||||
#define COSMOPOLITAN_LIBC_SYSV_CONSTS_HWAP_H_
|
||||
#ifdef __aarch64__
|
||||
|
||||
// Feature bits for getauxval(AT_HWCAP) on AARCH64 GNU/SystemD.
|
||||
|
||||
#define HWCAP_FP (1 << 0)
|
||||
#define HWCAP_ASIMD (1 << 1)
|
||||
#define HWCAP_EVTSTRM (1 << 2)
|
||||
#define HWCAP_AES (1 << 3)
|
||||
#define HWCAP_PMULL (1 << 4)
|
||||
#define HWCAP_SHA1 (1 << 5)
|
||||
#define HWCAP_SHA2 (1 << 6)
|
||||
#define HWCAP_CRC32 (1 << 7)
|
||||
#define HWCAP_ATOMICS (1 << 8)
|
||||
#define HWCAP_FPHP (1 << 9)
|
||||
#define HWCAP_ASIMDHP (1 << 10)
|
||||
#define HWCAP_CPUID (1 << 11)
|
||||
#define HWCAP_ASIMDRDM (1 << 12)
|
||||
#define HWCAP_JSCVT (1 << 13)
|
||||
#define HWCAP_FCMA (1 << 14)
|
||||
#define HWCAP_LRCPC (1 << 15)
|
||||
#define HWCAP_DCPOP (1 << 16)
|
||||
#define HWCAP_SHA3 (1 << 17)
|
||||
#define HWCAP_SM3 (1 << 18)
|
||||
#define HWCAP_SM4 (1 << 19)
|
||||
#define HWCAP_ASIMDDP (1 << 20)
|
||||
#define HWCAP_SHA512 (1 << 21)
|
||||
#define HWCAP_SVE (1 << 22)
|
||||
#define HWCAP_ASIMDFHM (1 << 23)
|
||||
#define HWCAP_DIT (1 << 24)
|
||||
#define HWCAP_USCAT (1 << 25)
|
||||
#define HWCAP_ILRCPC (1 << 26)
|
||||
#define HWCAP_FLAGM (1 << 27)
|
||||
#define HWCAP_SSBS (1 << 28)
|
||||
#define HWCAP_SB (1 << 29)
|
||||
#define HWCAP_PACA (1 << 30)
|
||||
#define HWCAP_PACG (1UL << 31)
|
||||
|
||||
#define HWCAP2_DCPODP (1 << 0)
|
||||
#define HWCAP2_SVE2 (1 << 1)
|
||||
#define HWCAP2_SVEAES (1 << 2)
|
||||
#define HWCAP2_SVEPMULL (1 << 3)
|
||||
#define HWCAP2_SVEBITPERM (1 << 4)
|
||||
#define HWCAP2_SVESHA3 (1 << 5)
|
||||
#define HWCAP2_SVESM4 (1 << 6)
|
||||
#define HWCAP2_FLAGM2 (1 << 7)
|
||||
#define HWCAP2_FRINT (1 << 8)
|
||||
#define HWCAP2_SVEI8MM (1 << 9)
|
||||
#define HWCAP2_SVEF32MM (1 << 10)
|
||||
#define HWCAP2_SVEF64MM (1 << 11)
|
||||
#define HWCAP2_SVEBF16 (1 << 12)
|
||||
#define HWCAP2_I8MM (1 << 13)
|
||||
#define HWCAP2_BF16 (1 << 14)
|
||||
#define HWCAP2_DGH (1 << 15)
|
||||
#define HWCAP2_RNG (1 << 16)
|
||||
#define HWCAP2_BTI (1 << 17)
|
||||
#define HWCAP2_MTE (1 << 18)
|
||||
|
||||
#endif /* __aarch64__ */
|
||||
#endif /* COSMOPOLITAN_LIBC_SYSV_CONSTS_HWAP_H_ */
|
|
@ -5,6 +5,13 @@
|
|||
│ FreeBSD lib/msun/src/e_acoshl.c │
|
||||
│ Converted to ldbl by David Schultz <das@FreeBSD.ORG> and Bruce D. Evans. │
|
||||
│ │
|
||||
│ Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. │
|
||||
│ │
|
||||
│ Developed at SunPro, a Sun Microsystems, Inc. business. │
|
||||
│ Permission to use, copy, modify, and distribute this │
|
||||
│ software is freely granted, provided that this notice │
|
||||
│ is preserved. │
|
||||
│ │
|
||||
│ Copyright (c) 1992-2023 The FreeBSD Project. │
|
||||
│ │
|
||||
│ Redistribution and use in source and binary forms, with or without │
|
||||
|
@ -28,12 +35,6 @@
|
|||
│ OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF │
|
||||
│ SUCH DAMAGE. │
|
||||
│ │
|
||||
│ Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. │
|
||||
│ Developed at SunPro, a Sun Microsystems, Inc. business. │
|
||||
│ Permission to use, copy, modify, and distribute this │
|
||||
│ software is freely granted, provided that this notice │
|
||||
│ is preserved. │
|
||||
│ │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/math.h"
|
||||
#include "libc/tinymath/freebsd.internal.h"
|
||||
|
|
|
@ -27,6 +27,7 @@
|
|||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/math.h"
|
||||
#include "libc/tinymath/feval.internal.h"
|
||||
#include "libc/tinymath/freebsd.internal.h"
|
||||
|
||||
asm(".ident\t\"\\n\\n\
|
||||
Musl libc (MIT License)\\n\
|
||||
|
|
|
@ -5,6 +5,13 @@
|
|||
│ FreeBSD lib/msun/src/s_asinhl.c │
|
||||
│ Converted to ldbl by David Schultz <das@FreeBSD.ORG> and Bruce D. Evans. │
|
||||
│ │
|
||||
│ Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. │
|
||||
│ │
|
||||
│ Developed at SunPro, a Sun Microsystems, Inc. business. │
|
||||
│ Permission to use, copy, modify, and distribute this │
|
||||
│ software is freely granted, provided that this notice │
|
||||
│ is preserved. │
|
||||
│ │
|
||||
│ Copyright (c) 1992-2023 The FreeBSD Project. │
|
||||
│ │
|
||||
│ Redistribution and use in source and binary forms, with or without │
|
||||
|
@ -28,12 +35,6 @@
|
|||
│ OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF │
|
||||
│ SUCH DAMAGE. │
|
||||
│ │
|
||||
│ Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. │
|
||||
│ Developed at SunPro, a Sun Microsystems, Inc. business. │
|
||||
│ Permission to use, copy, modify, and distribute this │
|
||||
│ software is freely granted, provided that this notice │
|
||||
│ is preserved. │
|
||||
│ │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/math.h"
|
||||
#include "libc/tinymath/freebsd.internal.h"
|
||||
|
|
|
@ -4,6 +4,13 @@
|
|||
│ │
|
||||
│ FreeBSD lib/msun/src/e_atan2.c │
|
||||
│ │
|
||||
│ Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. │
|
||||
│ │
|
||||
│ Developed at SunPro, a Sun Microsystems, Inc. business. │
|
||||
│ Permission to use, copy, modify, and distribute this │
|
||||
│ software is freely granted, provided that this notice │
|
||||
│ is preserved. │
|
||||
│ │
|
||||
│ Copyright (c) 1992-2023 The FreeBSD Project. │
|
||||
│ │
|
||||
│ Redistribution and use in source and binary forms, with or without │
|
||||
|
@ -27,12 +34,6 @@
|
|||
│ OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF │
|
||||
│ SUCH DAMAGE. │
|
||||
│ │
|
||||
│ Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. │
|
||||
│ Developed at SunPro, a Sun Microsystems, Inc. business. │
|
||||
│ Permission to use, copy, modify, and distribute this │
|
||||
│ software is freely granted, provided that this notice │
|
||||
│ is preserved. │
|
||||
│ │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/math.h"
|
||||
#include "libc/tinymath/freebsd.internal.h"
|
||||
|
|
|
@ -79,7 +79,7 @@ long double atan2l(long double y, long double x)
|
|||
long double z;
|
||||
int m, ex, ey;
|
||||
|
||||
if (isnan(x) || isnan(y))
|
||||
if (isunordered(x, y))
|
||||
return x+y;
|
||||
if (x == 1)
|
||||
return atanl(y);
|
||||
|
|
|
@ -5,6 +5,13 @@
|
|||
│ FreeBSD lib/msun/src/s_tanhf.c │
|
||||
│ Converted to long double by Bruce D. Evans. │
|
||||
│ │
|
||||
│ Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. │
|
||||
│ │
|
||||
│ Developed at SunPro, a Sun Microsystems, Inc. business. │
|
||||
│ Permission to use, copy, modify, and distribute this │
|
||||
│ software is freely granted, provided that this notice │
|
||||
│ is preserved. │
|
||||
│ │
|
||||
│ Copyright (c) 1992-2023 The FreeBSD Project. │
|
||||
│ │
|
||||
│ Redistribution and use in source and binary forms, with or without │
|
||||
|
@ -28,12 +35,6 @@
|
|||
│ OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF │
|
||||
│ SUCH DAMAGE. │
|
||||
│ │
|
||||
│ Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. │
|
||||
│ Developed at SunPro, a Sun Microsystems, Inc. business. │
|
||||
│ Permission to use, copy, modify, and distribute this │
|
||||
│ software is freely granted, provided that this notice │
|
||||
│ is preserved. │
|
||||
│ │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/math.h"
|
||||
#include "libc/tinymath/freebsd.internal.h"
|
||||
|
|
|
@ -36,7 +36,11 @@ Copyright 2005-2014 Rich Felker, et. al.\"");
|
|||
asm(".include \"libc/disclaimer.inc\"");
|
||||
// clang-format off
|
||||
|
||||
long double cosl(long double x) {
|
||||
/**
|
||||
* Returns cosine of 𝑥.
|
||||
*/
|
||||
long double cosl(long double x)
|
||||
{
|
||||
#if LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024
|
||||
return cos(x);
|
||||
#elif (LDBL_MANT_DIG == 64 || LDBL_MANT_DIG == 113) && LDBL_MAX_EXP == 16384
|
||||
|
|
|
@ -4,6 +4,13 @@
|
|||
│ │
|
||||
│ FreeBSD lib/msun/src/s_expm1f.c │
|
||||
│ │
|
||||
│ Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. │
|
||||
│ │
|
||||
│ Developed at SunPro, a Sun Microsystems, Inc. business. │
|
||||
│ Permission to use, copy, modify, and distribute this │
|
||||
│ software is freely granted, provided that this notice │
|
||||
│ is preserved. │
|
||||
│ │
|
||||
│ Copyright (c) 1992-2023 The FreeBSD Project. │
|
||||
│ │
|
||||
│ Redistribution and use in source and binary forms, with or without │
|
||||
|
@ -27,12 +34,6 @@
|
|||
│ OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF │
|
||||
│ SUCH DAMAGE. │
|
||||
│ │
|
||||
│ Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. │
|
||||
│ Developed at SunPro, a Sun Microsystems, Inc. business. │
|
||||
│ Permission to use, copy, modify, and distribute this │
|
||||
│ software is freely granted, provided that this notice │
|
||||
│ is preserved. │
|
||||
│ │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/math.h"
|
||||
#include "libc/tinymath/freebsd.internal.h"
|
||||
|
|
|
@ -31,7 +31,7 @@ asm(".ident\t\"\\n\\n\
|
|||
Musl libc (MIT License)\\n\
|
||||
Copyright 2005-2014 Rich Felker, et. al.\"");
|
||||
asm(".include \"libc/disclaimer.inc\"");
|
||||
/* clang-format off */
|
||||
// clang-format off
|
||||
|
||||
#define asdouble(i) ((union{uint64_t _i; double _f;}){i})._f
|
||||
#define INSERT_WORDS(d,hi,lo) \
|
||||
|
|
|
@ -31,7 +31,7 @@ asm(".ident\t\"\\n\\n\
|
|||
Musl libc (MIT License)\\n\
|
||||
Copyright 2005-2014 Rich Felker, et. al.\"");
|
||||
asm(".include \"libc/disclaimer.inc\"");
|
||||
/* clang-format off */
|
||||
// clang-format off
|
||||
|
||||
#define asfloat(i) ((union{uint32_t _i; float _f;}){i})._f
|
||||
#define SET_FLOAT_WORD(d,w) \
|
||||
|
|
|
@ -22,6 +22,6 @@
|
|||
* Returns positive difference.
|
||||
*/
|
||||
double fdim(double x, double y) {
|
||||
if (isnan(x) || isnan(y)) return NAN;
|
||||
if (isunordered(x, y)) return NAN;
|
||||
return x > y ? x - y : 0;
|
||||
}
|
||||
|
|
|
@ -22,6 +22,6 @@
|
|||
* Returns positive difference.
|
||||
*/
|
||||
float fdimf(float x, float y) {
|
||||
if (isnan(x) || isnan(y)) return NAN;
|
||||
if (isunordered(x, y)) return NAN;
|
||||
return x > y ? x - y : 0;
|
||||
}
|
||||
|
|
|
@ -25,7 +25,7 @@ long double fdiml(long double x, long double y) {
|
|||
#if LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024
|
||||
return fdim(x, y);
|
||||
#else
|
||||
if (isnan(x) || isnan(y)) return NAN;
|
||||
if (isunordered(x, y)) return NAN;
|
||||
return x > y ? x - y : 0;
|
||||
#endif
|
||||
}
|
||||
|
|
|
@ -906,67 +906,6 @@ irintl(long double x)
|
|||
__x + __y; \
|
||||
})
|
||||
|
||||
/*
|
||||
* ieee style elementary functions
|
||||
*
|
||||
* We rename functions here to improve other sources' diffability
|
||||
* against fdlibm.
|
||||
*/
|
||||
#define __ieee754_sqrt sqrt
|
||||
#define __ieee754_acos acos
|
||||
#define __ieee754_acosh acosh
|
||||
#define __ieee754_log log
|
||||
#define __ieee754_log2 log2
|
||||
#define __ieee754_atanh atanh
|
||||
#define __ieee754_asin asin
|
||||
#define __ieee754_atan2 atan2
|
||||
#define __ieee754_exp exp
|
||||
#define __ieee754_cosh cosh
|
||||
#define __ieee754_fmod fmod
|
||||
#define __ieee754_pow pow
|
||||
#define __ieee754_lgamma lgamma
|
||||
#define __ieee754_gamma gamma
|
||||
#define __ieee754_lgamma_r lgamma_r
|
||||
#define __ieee754_gamma_r gamma_r
|
||||
#define __ieee754_log10 log10
|
||||
#define __ieee754_sinh sinh
|
||||
#define __ieee754_hypot hypot
|
||||
#define __ieee754_j0 j0
|
||||
#define __ieee754_j1 j1
|
||||
#define __ieee754_y0 y0
|
||||
#define __ieee754_y1 y1
|
||||
#define __ieee754_jn jn
|
||||
#define __ieee754_yn yn
|
||||
#define __ieee754_remainder remainder
|
||||
#define __ieee754_scalb scalb
|
||||
#define __ieee754_sqrtf sqrtf
|
||||
#define __ieee754_acosf acosf
|
||||
#define __ieee754_acoshf acoshf
|
||||
#define __ieee754_logf logf
|
||||
#define __ieee754_atanhf atanhf
|
||||
#define __ieee754_asinf asinf
|
||||
#define __ieee754_atan2f atan2f
|
||||
#define __ieee754_expf expf
|
||||
#define __ieee754_coshf coshf
|
||||
#define __ieee754_fmodf fmodf
|
||||
#define __ieee754_powf powf
|
||||
#define __ieee754_lgammaf lgammaf
|
||||
#define __ieee754_gammaf gammaf
|
||||
#define __ieee754_lgammaf_r lgammaf_r
|
||||
#define __ieee754_gammaf_r gammaf_r
|
||||
#define __ieee754_log10f log10f
|
||||
#define __ieee754_log2f log2f
|
||||
#define __ieee754_sinhf sinhf
|
||||
#define __ieee754_hypotf hypotf
|
||||
#define __ieee754_j0f j0f
|
||||
#define __ieee754_j1f j1f
|
||||
#define __ieee754_y0f y0f
|
||||
#define __ieee754_y1f y1f
|
||||
#define __ieee754_jnf jnf
|
||||
#define __ieee754_ynf ynf
|
||||
#define __ieee754_remainderf remainderf
|
||||
#define __ieee754_scalbf scalbf
|
||||
|
||||
/* fdlibm kernel function */
|
||||
int __kernel_rem_pio2(double*,double*,int,int,int);
|
||||
|
||||
|
|
|
@ -16,7 +16,7 @@
|
|||
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
|
||||
│ PERFORMANCE OF THIS SOFTWARE. │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/tinymath/tinymath.h"
|
||||
#include "libc/math.h"
|
||||
|
||||
/**
|
||||
* Rounds to nearest integer.
|
||||
|
|
|
@ -83,7 +83,8 @@ static dontinline long lrint_slow(double x) {
|
|||
/**
|
||||
* Rounds to nearest integer.
|
||||
*/
|
||||
long lrint(double x) {
|
||||
long lrint(double x)
|
||||
{
|
||||
#ifdef __x86_64__
|
||||
long res;
|
||||
asm("cvtsd2si\t%1,%0" : "=r"(res) : "x"(x));
|
||||
|
|
|
@ -31,7 +31,7 @@ asm(".ident\t\"\\n\\n\
|
|||
Musl libc (MIT License)\\n\
|
||||
Copyright 2005-2014 Rich Felker, et. al.\"");
|
||||
asm(".include \"libc/disclaimer.inc\"");
|
||||
/* clang-format off */
|
||||
// clang-format off
|
||||
|
||||
double modf(double x, double *iptr)
|
||||
{
|
||||
|
|
|
@ -31,7 +31,7 @@ asm(".ident\t\"\\n\\n\
|
|||
Musl libc (MIT License)\\n\
|
||||
Copyright 2005-2014 Rich Felker, et. al.\"");
|
||||
asm(".include \"libc/disclaimer.inc\"");
|
||||
/* clang-format off */
|
||||
// clang-format off
|
||||
|
||||
float modff(float x, float *iptr)
|
||||
{
|
||||
|
|
|
@ -32,7 +32,7 @@ asm(".ident\t\"\\n\\n\
|
|||
Musl libc (MIT License)\\n\
|
||||
Copyright 2005-2014 Rich Felker, et. al.\"");
|
||||
asm(".include \"libc/disclaimer.inc\"");
|
||||
/* clang-format off */
|
||||
// clang-format off
|
||||
|
||||
double nextafter(double x, double y)
|
||||
{
|
||||
|
@ -40,7 +40,7 @@ double nextafter(double x, double y)
|
|||
uint64_t ax, ay;
|
||||
int e;
|
||||
|
||||
if (isnan(x) || isnan(y))
|
||||
if (isunordered(x, y))
|
||||
return x + y;
|
||||
if (ux.i == uy.i)
|
||||
return y;
|
||||
|
|
|
@ -32,14 +32,14 @@ asm(".ident\t\"\\n\\n\
|
|||
Musl libc (MIT License)\\n\
|
||||
Copyright 2005-2014 Rich Felker, et. al.\"");
|
||||
asm(".include \"libc/disclaimer.inc\"");
|
||||
/* clang-format off */
|
||||
// clang-format off
|
||||
|
||||
float nextafterf(float x, float y)
|
||||
{
|
||||
union {float f; uint32_t i;} ux={x}, uy={y};
|
||||
uint32_t ax, ay, e;
|
||||
|
||||
if (isnan(x) || isnan(y))
|
||||
if (isunordered(x, y))
|
||||
return x + y;
|
||||
if (ux.i == uy.i)
|
||||
return y;
|
||||
|
|
|
@ -36,13 +36,14 @@ Copyright 2005-2014 Rich Felker, et. al.\"");
|
|||
asm(".include \"libc/disclaimer.inc\"");
|
||||
// clang-format off
|
||||
|
||||
long double nextafterl(long double x, long double y) {
|
||||
long double nextafterl(long double x, long double y)
|
||||
{
|
||||
#if LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024
|
||||
return nextafter(x, y);
|
||||
#elif LDBL_MANT_DIG == 64 && LDBL_MAX_EXP == 16384
|
||||
union ldshape ux, uy;
|
||||
|
||||
if (isnan(x) || isnan(y))
|
||||
if (isunordered(x, y))
|
||||
return x + y;
|
||||
if (x == y)
|
||||
return y;
|
||||
|
@ -75,7 +76,7 @@ long double nextafterl(long double x, long double y) {
|
|||
#elif LDBL_MANT_DIG == 113 && LDBL_MAX_EXP == 16384
|
||||
union ldshape ux, uy;
|
||||
|
||||
if (isnan(x) || isnan(y))
|
||||
if (isunordered(x, y))
|
||||
return x + y;
|
||||
if (x == y)
|
||||
return y;
|
||||
|
|
|
@ -32,14 +32,14 @@ asm(".ident\t\"\\n\\n\
|
|||
Musl libc (MIT License)\\n\
|
||||
Copyright 2005-2014 Rich Felker, et. al.\"");
|
||||
asm(".include \"libc/disclaimer.inc\"");
|
||||
/* clang-format off */
|
||||
// clang-format off
|
||||
|
||||
double nexttoward(double x, long double y)
|
||||
{
|
||||
union {double f; uint64_t i;} ux = {x};
|
||||
int e;
|
||||
|
||||
if (isnan(x) || isnan(y))
|
||||
if (isunordered(x, y))
|
||||
return x + y;
|
||||
if (x == y)
|
||||
return y;
|
||||
|
|
|
@ -32,14 +32,14 @@ asm(".ident\t\"\\n\\n\
|
|||
Musl libc (MIT License)\\n\
|
||||
Copyright 2005-2014 Rich Felker, et. al.\"");
|
||||
asm(".include \"libc/disclaimer.inc\"");
|
||||
/* clang-format off */
|
||||
// clang-format off
|
||||
|
||||
float nexttowardf(float x, long double y)
|
||||
{
|
||||
union {float f; uint32_t i;} ux = {x};
|
||||
uint32_t e;
|
||||
|
||||
if (isnan(x) || isnan(y))
|
||||
if (isunordered(x, y))
|
||||
return x + y;
|
||||
if (x == y)
|
||||
return y;
|
||||
|
|
|
@ -31,7 +31,7 @@ asm(".ident\t\"\\n\\n\
|
|||
Musl libc (MIT License)\\n\
|
||||
Copyright 2005-2014 Rich Felker, et. al.\"");
|
||||
asm(".include \"libc/disclaimer.inc\"");
|
||||
/* clang-format off */
|
||||
// clang-format off
|
||||
|
||||
long double nexttowardl(long double x, long double y)
|
||||
{
|
||||
|
|
|
@ -34,7 +34,7 @@ asm(".ident\t\"\\n\\n\
|
|||
OpenBSD libm (ISC License)\\n\
|
||||
Copyright (c) 2008 Stephen L. Moshier <steve@moshier.net>\"");
|
||||
asm(".include \"libc/disclaimer.inc\"");
|
||||
/* clang-format off */
|
||||
// clang-format off
|
||||
|
||||
/* origin: OpenBSD /usr/src/lib/libm/src/polevll.c */
|
||||
/*
|
||||
|
|
|
@ -32,10 +32,10 @@
|
|||
#include "libc/tinymath/pow_data.internal.h"
|
||||
|
||||
asm(".ident\t\"\\n\\n\
|
||||
Double-precision math functions (MIT License)\\n\
|
||||
Copyright 2018 ARM Limited\"");
|
||||
Optimized Routines (MIT License)\\n\
|
||||
Copyright 2022 ARM Limited\"");
|
||||
asm(".include \"libc/disclaimer.inc\"");
|
||||
/* clang-format off */
|
||||
// clang-format off
|
||||
|
||||
/*
|
||||
* Double-precision x^y function.
|
||||
|
|
|
@ -121,9 +121,9 @@ double pochisq(
|
|||
e = (even ? 0.0 : LOG_SQRT_PI);
|
||||
c = log(a);
|
||||
while (z <= x) {
|
||||
e = log(z) + e;
|
||||
s += ex(c * z - a - e);
|
||||
z += 1.0;
|
||||
e = log(z) + e;
|
||||
s += ex(c * z - a - e);
|
||||
z += 1.0;
|
||||
}
|
||||
return (s);
|
||||
} else {
|
||||
|
|
|
@ -35,8 +35,8 @@ asm(".ident\t\"\\n\\n\
|
|||
Musl libc (MIT License)\\n\
|
||||
Copyright 2005-2014 Rich Felker, et. al.\"");
|
||||
asm(".include \"libc/disclaimer.inc\"");
|
||||
// clang-format off
|
||||
|
||||
/* clang-format off */
|
||||
/* origin: FreeBSD /usr/src/lib/msun/src/k_rem_pio2.c */
|
||||
/*
|
||||
* ====================================================
|
||||
|
|
|
@ -58,7 +58,7 @@ asm(".include \"libc/disclaimer.inc\"");
|
|||
*/
|
||||
double scalb(double x, double fn)
|
||||
{
|
||||
if (isnan(x) || isnan(fn))
|
||||
if (isunordered(x, fn))
|
||||
return x*fn;
|
||||
if (!isfinite(fn)) {
|
||||
if (fn > 0.0)
|
||||
|
|
|
@ -38,7 +38,8 @@ asm(".include \"libc/disclaimer.inc\"");
|
|||
|
||||
float scalbf(float x, float fn)
|
||||
{
|
||||
if (isnan(x) || isnan(fn)) return x*fn;
|
||||
if (isunordered(x, fn))
|
||||
return x*fn;
|
||||
if (!isfinite(fn)) {
|
||||
if (fn > 0.0f)
|
||||
return x*fn;
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
#include "libc/tinymath/internal.h"
|
||||
#if !(__ASSEMBLER__ + __LINKER__ + 0)
|
||||
COSMOPOLITAN_C_START_
|
||||
/* clang-format off */
|
||||
// clang-format off
|
||||
|
||||
/*
|
||||
* Header for sinf, cosf and sincosf.
|
||||
|
|
|
@ -35,7 +35,7 @@ asm(".ident\t\"\\n\\n\
|
|||
Musl libc (MIT License)\\n\
|
||||
Copyright 2005-2014 Rich Felker, et. al.\"");
|
||||
asm(".include \"libc/disclaimer.inc\"");
|
||||
/* clang-format off */
|
||||
// clang-format off
|
||||
|
||||
/* origin: FreeBSD /usr/src/lib/msun/src/k_sinf.c */
|
||||
/*
|
||||
|
|
|
@ -37,7 +37,7 @@ asm(".ident\t\"\\n\\n\
|
|||
Musl libc (MIT License)\\n\
|
||||
Copyright 2005-2014 Rich Felker, et. al.\"");
|
||||
asm(".include \"libc/disclaimer.inc\"");
|
||||
/* clang-format off */
|
||||
// clang-format off
|
||||
|
||||
/* origin: FreeBSD /usr/src/lib/msun/src/s_sinf.c */
|
||||
/*
|
||||
|
|
|
@ -27,12 +27,13 @@
|
|||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/math.h"
|
||||
#include "libc/tinymath/expo.internal.h"
|
||||
#include "libc/tinymath/freebsd.internal.h"
|
||||
|
||||
asm(".ident\t\"\\n\\n\
|
||||
Musl libc (MIT License)\\n\
|
||||
Copyright 2005-2014 Rich Felker, et. al.\"");
|
||||
asm(".include \"libc/disclaimer.inc\"");
|
||||
/* clang-format off */
|
||||
// clang-format off
|
||||
|
||||
/**
|
||||
* Returns hyperbolic sine of 𝑥.
|
||||
|
|
|
@ -32,7 +32,7 @@ asm(".ident\t\"\\n\\n\
|
|||
Musl libc (MIT License)\\n\
|
||||
Copyright 2005-2014 Rich Felker, et. al.\"");
|
||||
asm(".include \"libc/disclaimer.inc\"");
|
||||
/* clang-format off */
|
||||
// clang-format off
|
||||
|
||||
/**
|
||||
* Returns hyperbolic sine of 𝑥.
|
||||
|
|
|
@ -5,6 +5,13 @@
|
|||
│ FreeBSD lib/msun/src/e_sinhl.c │
|
||||
│ Converted to long double by Bruce D. Evans │
|
||||
│ │
|
||||
│ Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. │
|
||||
│ │
|
||||
│ Developed at SunPro, a Sun Microsystems, Inc. business. │
|
||||
│ Permission to use, copy, modify, and distribute this │
|
||||
│ software is freely granted, provided that this notice │
|
||||
│ is preserved. │
|
||||
│ │
|
||||
│ Copyright (c) 1992-2023 The FreeBSD Project. │
|
||||
│ │
|
||||
│ Redistribution and use in source and binary forms, with or without │
|
||||
|
@ -28,12 +35,6 @@
|
|||
│ OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF │
|
||||
│ SUCH DAMAGE. │
|
||||
│ │
|
||||
│ Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. │
|
||||
│ Developed at SunPro, a Sun Microsystems, Inc. business. │
|
||||
│ Permission to use, copy, modify, and distribute this │
|
||||
│ software is freely granted, provided that this notice │
|
||||
│ is preserved. │
|
||||
│ │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/intrin/likely.h"
|
||||
#include "libc/math.h"
|
||||
|
|
|
@ -36,7 +36,11 @@ Copyright 2005-2014 Rich Felker, et. al.\"");
|
|||
asm(".include \"libc/disclaimer.inc\"");
|
||||
// clang-format off
|
||||
|
||||
long double sinl(long double x) {
|
||||
/**
|
||||
* Returns sine of 𝑥.
|
||||
*/
|
||||
long double sinl(long double x)
|
||||
{
|
||||
#if LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024
|
||||
return sin(x);
|
||||
#elif (LDBL_MANT_DIG == 64 || LDBL_MANT_DIG == 113) && LDBL_MAX_EXP == 16384
|
||||
|
|
|
@ -36,7 +36,7 @@ asm(".ident\t\"\\n\\n\
|
|||
Musl libc (MIT License)\\n\
|
||||
Copyright 2005-2014 Rich Felker, et. al.\"");
|
||||
asm(".include \"libc/disclaimer.inc\"");
|
||||
/* clang-format off */
|
||||
// clang-format off
|
||||
|
||||
/* origin: FreeBSD /usr/src/lib/msun/src/s_tan.c */
|
||||
/*
|
||||
|
|
|
@ -5,6 +5,13 @@
|
|||
│ FreeBSD lib/msun/src/s_tanhf.c │
|
||||
│ Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com. │
|
||||
│ │
|
||||
│ Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. │
|
||||
│ │
|
||||
│ Developed at SunPro, a Sun Microsystems, Inc. business. │
|
||||
│ Permission to use, copy, modify, and distribute this │
|
||||
│ software is freely granted, provided that this notice │
|
||||
│ is preserved. │
|
||||
│ │
|
||||
│ Copyright (c) 1992-2023 The FreeBSD Project. │
|
||||
│ │
|
||||
│ Redistribution and use in source and binary forms, with or without │
|
||||
|
@ -28,12 +35,6 @@
|
|||
│ OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF │
|
||||
│ SUCH DAMAGE. │
|
||||
│ │
|
||||
│ Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. │
|
||||
│ Developed at SunPro, a Sun Microsystems, Inc. business. │
|
||||
│ Permission to use, copy, modify, and distribute this │
|
||||
│ software is freely granted, provided that this notice │
|
||||
│ is preserved. │
|
||||
│ │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/math.h"
|
||||
#include "libc/tinymath/freebsd.internal.h"
|
||||
|
|
|
@ -5,6 +5,13 @@
|
|||
│ FreeBSD lib/msun/src/s_tanhl.c │
|
||||
│ Converted to long double by Bruce D. Evans │
|
||||
│ │
|
||||
│ Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. │
|
||||
│ │
|
||||
│ Developed at SunPro, a Sun Microsystems, Inc. business. │
|
||||
│ Permission to use, copy, modify, and distribute this │
|
||||
│ software is freely granted, provided that this notice │
|
||||
│ is preserved. │
|
||||
│ │
|
||||
│ Copyright (c) 1992-2023 The FreeBSD Project. │
|
||||
│ │
|
||||
│ Redistribution and use in source and binary forms, with or without │
|
||||
|
@ -28,12 +35,6 @@
|
|||
│ OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF │
|
||||
│ SUCH DAMAGE. │
|
||||
│ │
|
||||
│ Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. │
|
||||
│ Developed at SunPro, a Sun Microsystems, Inc. business. │
|
||||
│ Permission to use, copy, modify, and distribute this │
|
||||
│ software is freely granted, provided that this notice │
|
||||
│ is preserved. │
|
||||
│ │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/intrin/likely.h"
|
||||
#include "libc/math.h"
|
||||
|
|
|
@ -31,7 +31,6 @@
|
|||
#include "libc/intrin/directmap.internal.h"
|
||||
#include "libc/intrin/extend.internal.h"
|
||||
#include "libc/intrin/weaken.h"
|
||||
#include "libc/nexgen32e/crc32.h"
|
||||
#include "libc/runtime/internal.h"
|
||||
#include "libc/runtime/memtrack.internal.h"
|
||||
#include "libc/sysv/consts/f.h"
|
||||
|
@ -159,11 +158,6 @@ static int __zipos_load(struct Zipos *zipos, size_t cf, unsigned flags,
|
|||
h->pos = 0;
|
||||
h->cfile = cf;
|
||||
h->size = size;
|
||||
if (!IsTiny() && h->mem &&
|
||||
crc32_z(0, h->mem, h->size) != ZIP_LFILE_CRC32(zipos->map + lf)) {
|
||||
h->mem = 0;
|
||||
eio();
|
||||
}
|
||||
if (h->mem) {
|
||||
minfd = 3;
|
||||
__fds_lock();
|
||||
|
|
|
@ -61,6 +61,17 @@ TEST(memcmp, hug) {
|
|||
}
|
||||
}
|
||||
|
||||
static int coerce(int result) {
|
||||
#ifdef __aarch64__
|
||||
// arm's strcmp assembly is nuts and unpredictable, but it's legal
|
||||
if (result < 0) return -1;
|
||||
if (result > 0) return +1;
|
||||
return 0;
|
||||
#else
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
TEST(memcmp, fuzz) {
|
||||
int i, o, n, g;
|
||||
char a[256], b[256];
|
||||
|
@ -79,8 +90,18 @@ TEST(memcmp, fuzz) {
|
|||
}
|
||||
o = rand() & 31;
|
||||
n = rand() % (sizeof(a) - o);
|
||||
g = golden(a + o, b + o, n);
|
||||
ASSERT_EQ(g, memcmp(a + o, b + o, n), "n=%d o=%d", n, o);
|
||||
g = coerce(golden(a + o, b + o, n));
|
||||
#if 0
|
||||
if (memcmp(a + o, b + o, n) != g) {
|
||||
kprintf("const size_t g = %d;\n", g);
|
||||
kprintf("const size_t n = %d;\n", n);
|
||||
kprintf("const char a[] = unbingstr(%#.*hhhs); /* %p */\n", n, a + o,
|
||||
a + o);
|
||||
kprintf("const char b[] = unbingstr(%#.*hhhs); /* %p */\n", n, b + o,
|
||||
b + o);
|
||||
}
|
||||
#endif
|
||||
ASSERT_EQ(g, coerce(memcmp(a + o, b + o, n)), "n=%d o=%d", n, o);
|
||||
ASSERT_EQ(!!g, !!bcmp(a + o, b + o, n), "n=%d o=%d", n, o);
|
||||
ASSERT_EQ(!!g, !!timingsafe_bcmp(a + o, b + o, n), "n=%d o=%d", n, o);
|
||||
ASSERT_EQ(MAX(-1, MIN(1, g)), timingsafe_memcmp(a + o, b + o, n),
|
||||
|
|
|
@ -190,9 +190,11 @@ BENCH(strchr, bench2) {
|
|||
char *strlen_(const char *) asm("strlen");
|
||||
char *rawmemchr_(const char *, int) asm("rawmemchr");
|
||||
EZBENCH2("strchr z", donothing, strchr_(kHyperion, 'z'));
|
||||
EZBENCH2("rawmemchr z", donothing, rawmemchr_(kHyperion, 'z'));
|
||||
EZBENCH2("memchr z", donothing, memchr_(kHyperion, 'z', kHyperionSize));
|
||||
EZBENCH2("strchr Z", donothing, strchr_(kHyperion, 'Z'));
|
||||
EZBENCH2("memchr z", donothing, memchr_(kHyperion, 'z', kHyperionSize));
|
||||
EZBENCH2("memchr Z", donothing, memchr_(kHyperion, 'Z', kHyperionSize));
|
||||
EZBENCH2("rawmemchr z", donothing, rawmemchr_(kHyperion, 'z'));
|
||||
EZBENCH2("rawmemchr Z", donothing, rawmemchr_(kHyperion, 'z'));
|
||||
EZBENCH2("rawmemchr \\0", donothing, rawmemchr_(kHyperion, 0));
|
||||
EZBENCH2("strlen", donothing, strlen_(kHyperion));
|
||||
EZBENCH2("memchr Z", donothing, memchr_(kHyperion, 'Z', kHyperionSize));
|
|
@ -1,49 +0,0 @@
|
|||
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
|
||||
│vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi│
|
||||
╞══════════════════════════════════════════════════════════════════════════════╡
|
||||
│ Copyright 2020 Justine Alexandra Roberts Tunney │
|
||||
│ │
|
||||
│ Permission to use, copy, modify, and/or distribute this software for │
|
||||
│ any purpose with or without fee is hereby granted, provided that the │
|
||||
│ above copyright notice and this permission notice appear in all copies. │
|
||||
│ │
|
||||
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │
|
||||
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │
|
||||
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │
|
||||
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │
|
||||
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │
|
||||
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │
|
||||
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
|
||||
│ PERFORMANCE OF THIS SOFTWARE. │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/macros.internal.h"
|
||||
#include "libc/nexgen32e/crc32.h"
|
||||
#include "libc/nexgen32e/x86feature.h"
|
||||
#include "libc/str/str.h"
|
||||
#include "libc/testlib/ezbench.h"
|
||||
#include "libc/testlib/hyperion.h"
|
||||
#include "libc/testlib/testlib.h"
|
||||
#include "third_party/zlib/zlib.h"
|
||||
|
||||
TEST(crc32, testBigText) {
|
||||
size_t size;
|
||||
void *hyperion;
|
||||
size = kHyperionSize;
|
||||
hyperion = kHyperion;
|
||||
EXPECT_EQ(0xe9ded8e6, crc32(0, hyperion, size));
|
||||
EXPECT_EQ(0xe9ded8e6, crc32_z(0, hyperion, size));
|
||||
if (X86_HAVE(PCLMUL)) {
|
||||
size = ROUNDDOWN(size, 64);
|
||||
EXPECT_EQ(0xc7adc04f, crc32(0, hyperion, size));
|
||||
EXPECT_EQ(0xc7adc04f, crc32_z(0, hyperion, size));
|
||||
EXPECT_EQ(0xc7adc04f,
|
||||
0xffffffffu ^ crc32_pclmul(0 ^ 0xffffffffu, hyperion, size));
|
||||
}
|
||||
}
|
||||
|
||||
#define TESTSTR "libc/calls/typedef/sighandler_t.h"
|
||||
|
||||
BENCH(crc32c, bench) {
|
||||
EZBENCH2("crc32c", donothing,
|
||||
EXPROPRIATE(crc32c(0, VEIL("r", TESTSTR), sizeof(TESTSTR) - 1)));
|
||||
}
|
|
@ -16,17 +16,18 @@
|
|||
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
|
||||
│ PERFORMANCE OF THIS SOFTWARE. │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/intrin/bits.h"
|
||||
#include "libc/dce.h"
|
||||
#include "libc/intrin/bits.h"
|
||||
#include "libc/mem/gc.internal.h"
|
||||
#include "libc/mem/mem.h"
|
||||
#include "libc/nexgen32e/crc32.h"
|
||||
#include "libc/nexgen32e/x86feature.h"
|
||||
#include "libc/mem/gc.internal.h"
|
||||
#include "libc/stdio/stdio.h"
|
||||
#include "libc/str/str.h"
|
||||
#include "libc/testlib/ezbench.h"
|
||||
#include "libc/testlib/hyperion.h"
|
||||
#include "libc/testlib/testlib.h"
|
||||
#include "third_party/zlib/zlib.h"
|
||||
|
||||
#define FANATICS "Fanatics"
|
||||
|
||||
|
|
|
@ -17,18 +17,19 @@
|
|||
│ PERFORMANCE OF THIS SOFTWARE. │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/assert.h"
|
||||
#include "libc/intrin/bits.h"
|
||||
#include "libc/dce.h"
|
||||
#include "libc/intrin/asan.internal.h"
|
||||
#include "libc/intrin/bits.h"
|
||||
#include "libc/mem/gc.internal.h"
|
||||
#include "libc/mem/mem.h"
|
||||
#include "libc/nexgen32e/crc32.h"
|
||||
#include "libc/nexgen32e/x86feature.h"
|
||||
#include "libc/mem/gc.internal.h"
|
||||
#include "libc/stdio/stdio.h"
|
||||
#include "libc/str/str.h"
|
||||
#include "libc/testlib/ezbench.h"
|
||||
#include "libc/testlib/hyperion.h"
|
||||
#include "libc/testlib/testlib.h"
|
||||
#include "third_party/zlib/zlib.h"
|
||||
|
||||
#define FANATICS "Fanatics"
|
||||
|
||||
|
|
|
@ -15,15 +15,16 @@
|
|||
│ See the License for the specific language governing permissions and │
|
||||
│ limitations under the License. │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/str/highwayhash64.h"
|
||||
#include "libc/inttypes.h"
|
||||
#include "libc/nexgen32e/crc32.h"
|
||||
#include "libc/stdio/rand.h"
|
||||
#include "libc/stdio/stdio.h"
|
||||
#include "libc/str/highwayhash64.h"
|
||||
#include "libc/str/str.h"
|
||||
#include "libc/testlib/ezbench.h"
|
||||
#include "libc/testlib/hyperion.h"
|
||||
#include "libc/testlib/testlib.h"
|
||||
#include "third_party/zlib/zlib.h"
|
||||
|
||||
#define kMaxSize 64
|
||||
|
||||
|
|
|
@ -472,8 +472,6 @@ TEST(wcscmp, testTwosComplementBane) {
|
|||
TEST(wcsncmp, testTwosComplementBane) {
|
||||
wchar_t *B1 = malloc(4);
|
||||
wchar_t *B2 = malloc(4);
|
||||
B1[1] = L'\0';
|
||||
B2[1] = L'\0';
|
||||
EXPECT_EQ(wcsncmp(memcpy(B1, "\x00\x00\x00\x80", 4),
|
||||
memcpy(B2, "\x00\x00\x00\x80", 4), 1),
|
||||
0);
|
||||
|
|
|
@ -18,6 +18,7 @@
|
|||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/math.h"
|
||||
#include "libc/mem/gc.h"
|
||||
#include "libc/testlib/ezbench.h"
|
||||
#include "libc/testlib/testlib.h"
|
||||
#include "libc/x/x.h"
|
||||
#include "libc/x/xasprintf.h"
|
||||
|
@ -51,3 +52,9 @@ TEST(asinhl, test) {
|
|||
EXPECT_STREQ("NAN", _gc(xdtoal(_asinhl(NAN))));
|
||||
EXPECT_STREQ("INFINITY", _gc(xdtoal(_asinhl(INFINITY))));
|
||||
}
|
||||
|
||||
BENCH(asinh, bench) {
|
||||
EZBENCH2("asinh", donothing, _asinh(.7)); // ~26ns
|
||||
EZBENCH2("asinhf", donothing, _asinhf(.7)); // ~17ns
|
||||
EZBENCH2("asinhl", donothing, _asinhl(.7)); // ~48ns
|
||||
}
|
||||
|
|
|
@ -18,6 +18,7 @@
|
|||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/math.h"
|
||||
#include "libc/mem/gc.internal.h"
|
||||
#include "libc/testlib/ezbench.h"
|
||||
#include "libc/testlib/testlib.h"
|
||||
#include "libc/x/x.h"
|
||||
|
||||
|
@ -60,3 +61,9 @@ TEST(sinhf, test) {
|
|||
EXPECT_STREQ("INFINITY", gc(xdtoaf(_sinhf(INFINITY))));
|
||||
EXPECT_STREQ("-INFINITY", gc(xdtoaf(_sinhf(-INFINITY))));
|
||||
}
|
||||
|
||||
BENCH(sinh, bench) {
|
||||
EZBENCH2("sinh", donothing, _sinh(.7)); // ~24ns
|
||||
EZBENCH2("sinhf", donothing, _sinhf(.7)); // ~19ns
|
||||
EZBENCH2("sinhl", donothing, _sinhl(.7)); // ~15ns
|
||||
}
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show more
Loading…
Reference in a new issue