mirror of
https://github.com/jart/cosmopolitan.git
synced 2025-07-08 04:08:32 +00:00
Make AARCH64 harder, better, faster, stronger
- Perform some housekeeping on scalar math function code - Import ARM's Optimized Routines for SIMD string processing - Upgrade to latest Chromium zlib and enable more SIMD optimizations
This commit is contained in:
parent
550b52abf6
commit
cc1732bc42
143 changed files with 15661 additions and 1329 deletions
88
libc/intrin/aarch64/asmdefs.h
Normal file
88
libc/intrin/aarch64/asmdefs.h
Normal file
|
@ -0,0 +1,88 @@
|
|||
#ifndef COSMOPOLITAN_LIBC_INTRIN_AARCH64_ASMDEFS_H_
|
||||
#define COSMOPOLITAN_LIBC_INTRIN_AARCH64_ASMDEFS_H_
|
||||
#ifdef __ASSEMBLER__
|
||||
// clang-format off
|
||||
|
||||
/* Branch Target Identitication support. */
|
||||
#define BTI_C hint 34
|
||||
#define BTI_J hint 36
|
||||
/* Return address signing support (pac-ret). */
|
||||
#define PACIASP hint 25; .cfi_window_save
|
||||
#define AUTIASP hint 29; .cfi_window_save
|
||||
|
||||
/* GNU_PROPERTY_AARCH64_* macros from elf.h. */
|
||||
#define FEATURE_1_AND 0xc0000000
|
||||
#define FEATURE_1_BTI 1
|
||||
#define FEATURE_1_PAC 2
|
||||
|
||||
/* Add a NT_GNU_PROPERTY_TYPE_0 note. */
|
||||
#define GNU_PROPERTY(type, value) \
|
||||
.section .note.gnu.property, "a"; \
|
||||
.p2align 3; \
|
||||
.word 4; \
|
||||
.word 16; \
|
||||
.word 5; \
|
||||
.asciz "GNU"; \
|
||||
.word type; \
|
||||
.word 4; \
|
||||
.word value; \
|
||||
.word 0; \
|
||||
.text
|
||||
|
||||
/* If set then the GNU Property Note section will be added to
|
||||
mark objects to support BTI and PAC-RET. */
|
||||
#ifndef WANT_GNU_PROPERTY
|
||||
#define WANT_GNU_PROPERTY 1
|
||||
#endif
|
||||
|
||||
#if WANT_GNU_PROPERTY
|
||||
/* Add property note with supported features to all asm files. */
|
||||
GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC)
|
||||
#endif
|
||||
|
||||
#define ENTRY_ALIGN(name, alignment) \
|
||||
.global name; \
|
||||
.type name,%function; \
|
||||
.align alignment; \
|
||||
name: \
|
||||
.cfi_startproc; \
|
||||
BTI_C;
|
||||
|
||||
#define ENTRY(name) ENTRY_ALIGN(name, 6)
|
||||
|
||||
#define ENTRY_ALIAS(name) \
|
||||
.global name; \
|
||||
.type name,%function; \
|
||||
name:
|
||||
|
||||
#define END(name) \
|
||||
.cfi_endproc; \
|
||||
.size name, .-name;
|
||||
|
||||
#define L(l) .L ## l
|
||||
|
||||
#ifdef __ILP32__
|
||||
/* Sanitize padding bits of pointer arguments as per aapcs64 */
|
||||
#define PTR_ARG(n) mov w##n, w##n
|
||||
#else
|
||||
#define PTR_ARG(n)
|
||||
#endif
|
||||
|
||||
#ifdef __ILP32__
|
||||
/* Sanitize padding bits of size arguments as per aapcs64 */
|
||||
#define SIZE_ARG(n) mov w##n, w##n
|
||||
#else
|
||||
#define SIZE_ARG(n)
|
||||
#endif
|
||||
|
||||
/* Compiler supports SVE instructions */
|
||||
#ifndef HAVE_SVE
|
||||
# if __aarch64__ && (__GNUC__ >= 8 || __clang_major__ >= 5)
|
||||
# define HAVE_SVE 1
|
||||
# else
|
||||
# define HAVE_SVE 0
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#endif /* __ASSEMBLER__ */
|
||||
#endif /* COSMOPOLITAN_LIBC_INTRIN_AARCH64_ASMDEFS_H_ */
|
172
libc/intrin/aarch64/memchr.S
Normal file
172
libc/intrin/aarch64/memchr.S
Normal file
|
@ -0,0 +1,172 @@
|
|||
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
|
||||
│vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi│
|
||||
╚──────────────────────────────────────────────────────────────────────────────╝
|
||||
│ │
|
||||
│ Optimized Routines │
|
||||
│ Copyright (c) 1999-2022, Arm Limited. │
|
||||
│ │
|
||||
│ Permission is hereby granted, free of charge, to any person obtaining │
|
||||
│ a copy of this software and associated documentation files (the │
|
||||
│ "Software"), to deal in the Software without restriction, including │
|
||||
│ without limitation the rights to use, copy, modify, merge, publish, │
|
||||
│ distribute, sublicense, and/or sell copies of the Software, and to │
|
||||
│ permit persons to whom the Software is furnished to do so, subject to │
|
||||
│ the following conditions: │
|
||||
│ │
|
||||
│ The above copyright notice and this permission notice shall be │
|
||||
│ included in all copies or substantial portions of the Software. │
|
||||
│ │
|
||||
│ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, │
|
||||
│ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF │
|
||||
│ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. │
|
||||
│ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY │
|
||||
│ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, │
|
||||
│ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE │
|
||||
│ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. │
|
||||
│ │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/intrin/aarch64/asmdefs.h"
|
||||
|
||||
#define __memchr_aarch64 memchr
|
||||
|
||||
.ident "\n\
|
||||
Optimized Routines (MIT License)\n\
|
||||
Copyright 2022 ARM Limited\n"
|
||||
.include "libc/disclaimer.inc"
|
||||
|
||||
/* Assumptions:
|
||||
*
|
||||
* ARMv8-a, AArch64
|
||||
* Neon Available.
|
||||
*/
|
||||
|
||||
/* Arguments and results. */
|
||||
#define srcin x0
|
||||
#define chrin w1
|
||||
#define cntin x2
|
||||
|
||||
#define result x0
|
||||
|
||||
#define src x3
|
||||
#define tmp x4
|
||||
#define wtmp2 w5
|
||||
#define synd x6
|
||||
#define soff x9
|
||||
#define cntrem x10
|
||||
|
||||
#define vrepchr v0
|
||||
#define vdata1 v1
|
||||
#define vdata2 v2
|
||||
#define vhas_chr1 v3
|
||||
#define vhas_chr2 v4
|
||||
#define vrepmask v5
|
||||
#define vend v6
|
||||
|
||||
/*
|
||||
* Core algorithm:
|
||||
*
|
||||
* For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
|
||||
* per byte. For each tuple, bit 0 is set if the relevant byte matched the
|
||||
* requested character and bit 1 is not used (faster than using a 32bit
|
||||
* syndrome). Since the bits in the syndrome reflect exactly the order in which
|
||||
* things occur in the original string, counting trailing zeros allows to
|
||||
* identify exactly which byte has matched.
|
||||
*/
|
||||
|
||||
ENTRY (__memchr_aarch64)
|
||||
PTR_ARG (0)
|
||||
SIZE_ARG (2)
|
||||
/* Do not dereference srcin if no bytes to compare. */
|
||||
cbz cntin, L(zero_length)
|
||||
/*
|
||||
* Magic constant 0x40100401 allows us to identify which lane matches
|
||||
* the requested byte.
|
||||
*/
|
||||
mov wtmp2, #0x0401
|
||||
movk wtmp2, #0x4010, lsl #16
|
||||
dup vrepchr.16b, chrin
|
||||
/* Work with aligned 32-byte chunks */
|
||||
bic src, srcin, #31
|
||||
dup vrepmask.4s, wtmp2
|
||||
ands soff, srcin, #31
|
||||
and cntrem, cntin, #31
|
||||
b.eq L(loop)
|
||||
|
||||
/*
|
||||
* Input string is not 32-byte aligned. We calculate the syndrome
|
||||
* value for the aligned 32 bytes block containing the first bytes
|
||||
* and mask the irrelevant part.
|
||||
*/
|
||||
|
||||
ld1 {vdata1.16b, vdata2.16b}, [src], #32
|
||||
sub tmp, soff, #32
|
||||
adds cntin, cntin, tmp
|
||||
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
|
||||
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
|
||||
and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
|
||||
and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
|
||||
addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */
|
||||
addp vend.16b, vend.16b, vend.16b /* 128->64 */
|
||||
mov synd, vend.d[0]
|
||||
/* Clear the soff*2 lower bits */
|
||||
lsl tmp, soff, #1
|
||||
lsr synd, synd, tmp
|
||||
lsl synd, synd, tmp
|
||||
/* The first block can also be the last */
|
||||
b.ls L(masklast)
|
||||
/* Have we found something already? */
|
||||
cbnz synd, L(tail)
|
||||
|
||||
L(loop):
|
||||
ld1 {vdata1.16b, vdata2.16b}, [src], #32
|
||||
subs cntin, cntin, #32
|
||||
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
|
||||
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
|
||||
/* If we're out of data we finish regardless of the result */
|
||||
b.ls L(end)
|
||||
/* Use a fast check for the termination condition */
|
||||
orr vend.16b, vhas_chr1.16b, vhas_chr2.16b
|
||||
addp vend.2d, vend.2d, vend.2d
|
||||
mov synd, vend.d[0]
|
||||
/* We're not out of data, loop if we haven't found the character */
|
||||
cbz synd, L(loop)
|
||||
|
||||
L(end):
|
||||
/* Termination condition found, let's calculate the syndrome value */
|
||||
and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
|
||||
and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
|
||||
addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */
|
||||
addp vend.16b, vend.16b, vend.16b /* 128->64 */
|
||||
mov synd, vend.d[0]
|
||||
/* Only do the clear for the last possible block */
|
||||
b.hs L(tail)
|
||||
|
||||
L(masklast):
|
||||
/* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */
|
||||
add tmp, cntrem, soff
|
||||
and tmp, tmp, #31
|
||||
sub tmp, tmp, #32
|
||||
neg tmp, tmp, lsl #1
|
||||
lsl synd, synd, tmp
|
||||
lsr synd, synd, tmp
|
||||
|
||||
L(tail):
|
||||
/* Count the trailing zeros using bit reversing */
|
||||
rbit synd, synd
|
||||
/* Compensate the last post-increment */
|
||||
sub src, src, #32
|
||||
/* Check that we have found a character */
|
||||
cmp synd, #0
|
||||
/* And count the leading zeros */
|
||||
clz synd, synd
|
||||
/* Compute the potential result */
|
||||
add result, src, synd, lsr #1
|
||||
/* Select result or NULL */
|
||||
csel result, xzr, result, eq
|
||||
ret
|
||||
|
||||
L(zero_length):
|
||||
mov result, #0
|
||||
ret
|
||||
|
||||
END (__memchr_aarch64)
|
218
libc/intrin/aarch64/memcmp.S
Normal file
218
libc/intrin/aarch64/memcmp.S
Normal file
|
@ -0,0 +1,218 @@
|
|||
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
|
||||
│vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi│
|
||||
╚──────────────────────────────────────────────────────────────────────────────╝
|
||||
│ │
|
||||
│ Optimized Routines │
|
||||
│ Copyright (c) 1999-2022, Arm Limited. │
|
||||
│ │
|
||||
│ Permission is hereby granted, free of charge, to any person obtaining │
|
||||
│ a copy of this software and associated documentation files (the │
|
||||
│ "Software"), to deal in the Software without restriction, including │
|
||||
│ without limitation the rights to use, copy, modify, merge, publish, │
|
||||
│ distribute, sublicense, and/or sell copies of the Software, and to │
|
||||
│ permit persons to whom the Software is furnished to do so, subject to │
|
||||
│ the following conditions: │
|
||||
│ │
|
||||
│ The above copyright notice and this permission notice shall be │
|
||||
│ included in all copies or substantial portions of the Software. │
|
||||
│ │
|
||||
│ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, │
|
||||
│ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF │
|
||||
│ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. │
|
||||
│ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY │
|
||||
│ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, │
|
||||
│ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE │
|
||||
│ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. │
|
||||
│ │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/intrin/aarch64/asmdefs.h"
|
||||
|
||||
#define __memcmp_aarch64 memcmp
|
||||
|
||||
.ident "\n\
|
||||
Optimized Routines (MIT License)\n\
|
||||
Copyright 2022 ARM Limited\n"
|
||||
.include "libc/disclaimer.inc"
|
||||
|
||||
/* Assumptions:
|
||||
*
|
||||
* ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
|
||||
*/
|
||||
|
||||
#define src1 x0
|
||||
#define src2 x1
|
||||
#define limit x2
|
||||
#define result w0
|
||||
|
||||
#define data1 x3
|
||||
#define data1w w3
|
||||
#define data2 x4
|
||||
#define data2w w4
|
||||
#define data3 x5
|
||||
#define data3w w5
|
||||
#define data4 x6
|
||||
#define data4w w6
|
||||
#define tmp x6
|
||||
#define src1end x7
|
||||
#define src2end x8
|
||||
|
||||
|
||||
ENTRY (__memcmp_aarch64)
|
||||
PTR_ARG (0)
|
||||
PTR_ARG (1)
|
||||
SIZE_ARG (2)
|
||||
|
||||
cmp limit, 16
|
||||
b.lo L(less16)
|
||||
ldp data1, data3, [src1]
|
||||
ldp data2, data4, [src2]
|
||||
ccmp data1, data2, 0, ne
|
||||
ccmp data3, data4, 0, eq
|
||||
b.ne L(return2)
|
||||
|
||||
add src1end, src1, limit
|
||||
add src2end, src2, limit
|
||||
cmp limit, 32
|
||||
b.ls L(last_bytes)
|
||||
cmp limit, 160
|
||||
b.hs L(loop_align)
|
||||
sub limit, limit, 32
|
||||
|
||||
.p2align 4
|
||||
L(loop32):
|
||||
ldp data1, data3, [src1, 16]
|
||||
ldp data2, data4, [src2, 16]
|
||||
cmp data1, data2
|
||||
ccmp data3, data4, 0, eq
|
||||
b.ne L(return2)
|
||||
cmp limit, 16
|
||||
b.ls L(last_bytes)
|
||||
|
||||
ldp data1, data3, [src1, 32]
|
||||
ldp data2, data4, [src2, 32]
|
||||
cmp data1, data2
|
||||
ccmp data3, data4, 0, eq
|
||||
b.ne L(return2)
|
||||
add src1, src1, 32
|
||||
add src2, src2, 32
|
||||
L(last64):
|
||||
subs limit, limit, 32
|
||||
b.hi L(loop32)
|
||||
|
||||
/* Compare last 1-16 bytes using unaligned access. */
|
||||
L(last_bytes):
|
||||
ldp data1, data3, [src1end, -16]
|
||||
ldp data2, data4, [src2end, -16]
|
||||
L(return2):
|
||||
cmp data1, data2
|
||||
csel data1, data1, data3, ne
|
||||
csel data2, data2, data4, ne
|
||||
|
||||
/* Compare data bytes and set return value to 0, -1 or 1. */
|
||||
L(return):
|
||||
#ifndef __AARCH64EB__
|
||||
rev data1, data1
|
||||
rev data2, data2
|
||||
#endif
|
||||
cmp data1, data2
|
||||
cset result, ne
|
||||
cneg result, result, lo
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
L(less16):
|
||||
add src1end, src1, limit
|
||||
add src2end, src2, limit
|
||||
tbz limit, 3, L(less8)
|
||||
ldr data1, [src1]
|
||||
ldr data2, [src2]
|
||||
ldr data3, [src1end, -8]
|
||||
ldr data4, [src2end, -8]
|
||||
b L(return2)
|
||||
|
||||
.p2align 4
|
||||
L(less8):
|
||||
tbz limit, 2, L(less4)
|
||||
ldr data1w, [src1]
|
||||
ldr data2w, [src2]
|
||||
ldr data3w, [src1end, -4]
|
||||
ldr data4w, [src2end, -4]
|
||||
b L(return2)
|
||||
|
||||
L(less4):
|
||||
tbz limit, 1, L(less2)
|
||||
ldrh data1w, [src1]
|
||||
ldrh data2w, [src2]
|
||||
cmp data1w, data2w
|
||||
b.ne L(return)
|
||||
L(less2):
|
||||
mov result, 0
|
||||
tbz limit, 0, L(return_zero)
|
||||
ldrb data1w, [src1end, -1]
|
||||
ldrb data2w, [src2end, -1]
|
||||
sub result, data1w, data2w
|
||||
L(return_zero):
|
||||
ret
|
||||
|
||||
L(loop_align):
|
||||
ldp data1, data3, [src1, 16]
|
||||
ldp data2, data4, [src2, 16]
|
||||
cmp data1, data2
|
||||
ccmp data3, data4, 0, eq
|
||||
b.ne L(return2)
|
||||
|
||||
/* Align src2 and adjust src1, src2 and limit. */
|
||||
and tmp, src2, 15
|
||||
sub tmp, tmp, 16
|
||||
sub src2, src2, tmp
|
||||
add limit, limit, tmp
|
||||
sub src1, src1, tmp
|
||||
sub limit, limit, 64 + 16
|
||||
|
||||
.p2align 4
|
||||
L(loop64):
|
||||
ldr q0, [src1, 16]
|
||||
ldr q1, [src2, 16]
|
||||
subs limit, limit, 64
|
||||
ldr q2, [src1, 32]
|
||||
ldr q3, [src2, 32]
|
||||
eor v0.16b, v0.16b, v1.16b
|
||||
eor v1.16b, v2.16b, v3.16b
|
||||
ldr q2, [src1, 48]
|
||||
ldr q3, [src2, 48]
|
||||
umaxp v0.16b, v0.16b, v1.16b
|
||||
ldr q4, [src1, 64]!
|
||||
ldr q5, [src2, 64]!
|
||||
eor v1.16b, v2.16b, v3.16b
|
||||
eor v2.16b, v4.16b, v5.16b
|
||||
umaxp v1.16b, v1.16b, v2.16b
|
||||
umaxp v0.16b, v0.16b, v1.16b
|
||||
umaxp v0.16b, v0.16b, v0.16b
|
||||
fmov tmp, d0
|
||||
ccmp tmp, 0, 0, hi
|
||||
b.eq L(loop64)
|
||||
|
||||
/* If equal, process last 1-64 bytes using scalar loop. */
|
||||
add limit, limit, 64 + 16
|
||||
cbz tmp, L(last64)
|
||||
|
||||
/* Determine the 8-byte aligned offset of the first difference. */
|
||||
#ifdef __AARCH64EB__
|
||||
rev16 tmp, tmp
|
||||
#endif
|
||||
rev tmp, tmp
|
||||
clz tmp, tmp
|
||||
bic tmp, tmp, 7
|
||||
sub tmp, tmp, 48
|
||||
ldr data1, [src1, tmp]
|
||||
ldr data2, [src2, tmp]
|
||||
#ifndef __AARCH64EB__
|
||||
rev data1, data1
|
||||
rev data2, data2
|
||||
#endif
|
||||
mov result, 1
|
||||
cmp data1, data2
|
||||
cneg result, result, lo
|
||||
ret
|
||||
|
||||
END (__memcmp_aarch64)
|
233
libc/intrin/aarch64/memcpy.S
Normal file
233
libc/intrin/aarch64/memcpy.S
Normal file
|
@ -0,0 +1,233 @@
|
|||
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
|
||||
│vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi│
|
||||
╚──────────────────────────────────────────────────────────────────────────────╝
|
||||
│ │
|
||||
│ Optimized Routines │
|
||||
│ Copyright (c) 1999-2022, Arm Limited. │
|
||||
│ │
|
||||
│ Permission is hereby granted, free of charge, to any person obtaining │
|
||||
│ a copy of this software and associated documentation files (the │
|
||||
│ "Software"), to deal in the Software without restriction, including │
|
||||
│ without limitation the rights to use, copy, modify, merge, publish, │
|
||||
│ distribute, sublicense, and/or sell copies of the Software, and to │
|
||||
│ permit persons to whom the Software is furnished to do so, subject to │
|
||||
│ the following conditions: │
|
||||
│ │
|
||||
│ The above copyright notice and this permission notice shall be │
|
||||
│ included in all copies or substantial portions of the Software. │
|
||||
│ │
|
||||
│ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, │
|
||||
│ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF │
|
||||
│ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. │
|
||||
│ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY │
|
||||
│ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, │
|
||||
│ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE │
|
||||
│ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. │
|
||||
│ │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/intrin/aarch64/asmdefs.h"
|
||||
|
||||
#define __memcpy_aarch64_simd memcpy
|
||||
#define __memmove_aarch64_simd memmove
|
||||
|
||||
.ident "\n\
|
||||
Optimized Routines (MIT License)\n\
|
||||
Copyright 2022 ARM Limited\n"
|
||||
.include "libc/disclaimer.inc"
|
||||
|
||||
/* Assumptions:
|
||||
*
|
||||
* ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
|
||||
*
|
||||
*/
|
||||
|
||||
#define dstin x0
|
||||
#define src x1
|
||||
#define count x2
|
||||
#define dst x3
|
||||
#define srcend x4
|
||||
#define dstend x5
|
||||
#define A_l x6
|
||||
#define A_lw w6
|
||||
#define A_h x7
|
||||
#define B_l x8
|
||||
#define B_lw w8
|
||||
#define B_h x9
|
||||
#define C_lw w10
|
||||
#define tmp1 x14
|
||||
|
||||
#define A_q q0
|
||||
#define B_q q1
|
||||
#define C_q q2
|
||||
#define D_q q3
|
||||
#define E_q q4
|
||||
#define F_q q5
|
||||
#define G_q q6
|
||||
#define H_q q7
|
||||
|
||||
/* This implementation handles overlaps and supports both memcpy and memmove
|
||||
from a single entry point. It uses unaligned accesses and branchless
|
||||
sequences to keep the code small, simple and improve performance.
|
||||
|
||||
Copies are split into 3 main cases: small copies of up to 32 bytes, medium
|
||||
copies of up to 128 bytes, and large copies. The overhead of the overlap
|
||||
check is negligible since it is only required for large copies.
|
||||
|
||||
Large copies use a software pipelined loop processing 64 bytes per iteration.
|
||||
The source pointer is 16-byte aligned to minimize unaligned accesses.
|
||||
The loop tail is handled by always copying 64 bytes from the end.
|
||||
*/
|
||||
|
||||
ENTRY_ALIAS (__memmove_aarch64_simd)
|
||||
ENTRY (__memcpy_aarch64_simd)
|
||||
PTR_ARG (0)
|
||||
PTR_ARG (1)
|
||||
SIZE_ARG (2)
|
||||
add srcend, src, count
|
||||
add dstend, dstin, count
|
||||
cmp count, 128
|
||||
b.hi L(copy_long)
|
||||
cmp count, 32
|
||||
b.hi L(copy32_128)
|
||||
|
||||
/* Small copies: 0..32 bytes. */
|
||||
cmp count, 16
|
||||
b.lo L(copy16)
|
||||
ldr A_q, [src]
|
||||
ldr B_q, [srcend, -16]
|
||||
str A_q, [dstin]
|
||||
str B_q, [dstend, -16]
|
||||
ret
|
||||
|
||||
/* Copy 8-15 bytes. */
|
||||
L(copy16):
|
||||
tbz count, 3, L(copy8)
|
||||
ldr A_l, [src]
|
||||
ldr A_h, [srcend, -8]
|
||||
str A_l, [dstin]
|
||||
str A_h, [dstend, -8]
|
||||
ret
|
||||
|
||||
.p2align 3
|
||||
/* Copy 4-7 bytes. */
|
||||
L(copy8):
|
||||
tbz count, 2, L(copy4)
|
||||
ldr A_lw, [src]
|
||||
ldr B_lw, [srcend, -4]
|
||||
str A_lw, [dstin]
|
||||
str B_lw, [dstend, -4]
|
||||
ret
|
||||
|
||||
/* Copy 0..3 bytes using a branchless sequence. */
|
||||
L(copy4):
|
||||
cbz count, L(copy0)
|
||||
lsr tmp1, count, 1
|
||||
ldrb A_lw, [src]
|
||||
ldrb C_lw, [srcend, -1]
|
||||
ldrb B_lw, [src, tmp1]
|
||||
strb A_lw, [dstin]
|
||||
strb B_lw, [dstin, tmp1]
|
||||
strb C_lw, [dstend, -1]
|
||||
L(copy0):
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
/* Medium copies: 33..128 bytes. */
|
||||
L(copy32_128):
|
||||
ldp A_q, B_q, [src]
|
||||
ldp C_q, D_q, [srcend, -32]
|
||||
cmp count, 64
|
||||
b.hi L(copy128)
|
||||
stp A_q, B_q, [dstin]
|
||||
stp C_q, D_q, [dstend, -32]
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
/* Copy 65..128 bytes. */
|
||||
L(copy128):
|
||||
ldp E_q, F_q, [src, 32]
|
||||
cmp count, 96
|
||||
b.ls L(copy96)
|
||||
ldp G_q, H_q, [srcend, -64]
|
||||
stp G_q, H_q, [dstend, -64]
|
||||
L(copy96):
|
||||
stp A_q, B_q, [dstin]
|
||||
stp E_q, F_q, [dstin, 32]
|
||||
stp C_q, D_q, [dstend, -32]
|
||||
ret
|
||||
|
||||
/* Copy more than 128 bytes. */
|
||||
L(copy_long):
|
||||
/* Use backwards copy if there is an overlap. */
|
||||
sub tmp1, dstin, src
|
||||
cmp tmp1, count
|
||||
b.lo L(copy_long_backwards)
|
||||
|
||||
/* Copy 16 bytes and then align src to 16-byte alignment. */
|
||||
ldr D_q, [src]
|
||||
and tmp1, src, 15
|
||||
bic src, src, 15
|
||||
sub dst, dstin, tmp1
|
||||
add count, count, tmp1 /* Count is now 16 too large. */
|
||||
ldp A_q, B_q, [src, 16]
|
||||
str D_q, [dstin]
|
||||
ldp C_q, D_q, [src, 48]
|
||||
subs count, count, 128 + 16 /* Test and readjust count. */
|
||||
b.ls L(copy64_from_end)
|
||||
L(loop64):
|
||||
stp A_q, B_q, [dst, 16]
|
||||
ldp A_q, B_q, [src, 80]
|
||||
stp C_q, D_q, [dst, 48]
|
||||
ldp C_q, D_q, [src, 112]
|
||||
add src, src, 64
|
||||
add dst, dst, 64
|
||||
subs count, count, 64
|
||||
b.hi L(loop64)
|
||||
|
||||
/* Write the last iteration and copy 64 bytes from the end. */
|
||||
L(copy64_from_end):
|
||||
ldp E_q, F_q, [srcend, -64]
|
||||
stp A_q, B_q, [dst, 16]
|
||||
ldp A_q, B_q, [srcend, -32]
|
||||
stp C_q, D_q, [dst, 48]
|
||||
stp E_q, F_q, [dstend, -64]
|
||||
stp A_q, B_q, [dstend, -32]
|
||||
ret
|
||||
|
||||
/* Large backwards copy for overlapping copies.
|
||||
Copy 16 bytes and then align srcend to 16-byte alignment. */
|
||||
L(copy_long_backwards):
|
||||
cbz tmp1, L(copy0)
|
||||
ldr D_q, [srcend, -16]
|
||||
and tmp1, srcend, 15
|
||||
bic srcend, srcend, 15
|
||||
sub count, count, tmp1
|
||||
ldp A_q, B_q, [srcend, -32]
|
||||
str D_q, [dstend, -16]
|
||||
ldp C_q, D_q, [srcend, -64]
|
||||
sub dstend, dstend, tmp1
|
||||
subs count, count, 128
|
||||
b.ls L(copy64_from_start)
|
||||
|
||||
L(loop64_backwards):
|
||||
str B_q, [dstend, -16]
|
||||
str A_q, [dstend, -32]
|
||||
ldp A_q, B_q, [srcend, -96]
|
||||
str D_q, [dstend, -48]
|
||||
str C_q, [dstend, -64]!
|
||||
ldp C_q, D_q, [srcend, -128]
|
||||
sub srcend, srcend, 64
|
||||
subs count, count, 64
|
||||
b.hi L(loop64_backwards)
|
||||
|
||||
/* Write the last iteration and copy 64 bytes from the start. */
|
||||
L(copy64_from_start):
|
||||
ldp E_q, F_q, [src, 32]
|
||||
stp A_q, B_q, [dstend, -32]
|
||||
ldp A_q, B_q, [src]
|
||||
stp C_q, D_q, [dstend, -64]
|
||||
stp E_q, F_q, [dstin, 32]
|
||||
stp A_q, B_q, [dstin]
|
||||
ret
|
||||
|
||||
END (__memcpy_aarch64_simd)
|
138
libc/intrin/aarch64/memrchr.S
Normal file
138
libc/intrin/aarch64/memrchr.S
Normal file
|
@ -0,0 +1,138 @@
|
|||
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
|
||||
│vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi│
|
||||
╚──────────────────────────────────────────────────────────────────────────────╝
|
||||
│ │
|
||||
│ Optimized Routines │
|
||||
│ Copyright (c) 1999-2022, Arm Limited. │
|
||||
│ │
|
||||
│ Permission is hereby granted, free of charge, to any person obtaining │
|
||||
│ a copy of this software and associated documentation files (the │
|
||||
│ "Software"), to deal in the Software without restriction, including │
|
||||
│ without limitation the rights to use, copy, modify, merge, publish, │
|
||||
│ distribute, sublicense, and/or sell copies of the Software, and to │
|
||||
│ permit persons to whom the Software is furnished to do so, subject to │
|
||||
│ the following conditions: │
|
||||
│ │
|
||||
│ The above copyright notice and this permission notice shall be │
|
||||
│ included in all copies or substantial portions of the Software. │
|
||||
│ │
|
||||
│ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, │
|
||||
│ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF │
|
||||
│ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. │
|
||||
│ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY │
|
||||
│ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, │
|
||||
│ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE │
|
||||
│ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. │
|
||||
│ │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/intrin/aarch64/asmdefs.h"
|
||||
|
||||
#define __memrchr_aarch64 memrchr
|
||||
|
||||
.ident "\n\
|
||||
Optimized Routines (MIT License)\n\
|
||||
Copyright 2022 ARM Limited\n"
|
||||
.include "libc/disclaimer.inc"
|
||||
|
||||
/* Assumptions:
|
||||
*
|
||||
* ARMv8-a, AArch64, Advanced SIMD.
|
||||
* MTE compatible.
|
||||
*/
|
||||
|
||||
#define srcin x0
|
||||
#define chrin w1
|
||||
#define cntin x2
|
||||
#define result x0
|
||||
|
||||
#define src x3
|
||||
#define cntrem x4
|
||||
#define synd x5
|
||||
#define shift x6
|
||||
#define tmp x7
|
||||
#define end x8
|
||||
#define endm1 x9
|
||||
|
||||
#define vrepchr v0
|
||||
#define qdata q1
|
||||
#define vdata v1
|
||||
#define vhas_chr v2
|
||||
#define vend v3
|
||||
#define dend d3
|
||||
|
||||
/*
|
||||
Core algorithm:
|
||||
For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
|
||||
per byte. We take 4 bits of every comparison byte with shift right and narrow
|
||||
by 4 instruction. Since the bits in the nibble mask reflect the order in
|
||||
which things occur in the original string, counting leading zeros identifies
|
||||
exactly which byte matched. */
|
||||
|
||||
ENTRY (__memrchr_aarch64)
|
||||
PTR_ARG (0)
|
||||
add end, srcin, cntin
|
||||
sub endm1, end, 1
|
||||
bic src, endm1, 15
|
||||
cbz cntin, L(nomatch)
|
||||
ld1 {vdata.16b}, [src]
|
||||
dup vrepchr.16b, chrin
|
||||
cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
|
||||
neg shift, end, lsl 2
|
||||
shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
|
||||
fmov synd, dend
|
||||
lsl synd, synd, shift
|
||||
cbz synd, L(start_loop)
|
||||
|
||||
clz synd, synd
|
||||
sub result, endm1, synd, lsr 2
|
||||
cmp cntin, synd, lsr 2
|
||||
csel result, result, xzr, hi
|
||||
ret
|
||||
|
||||
nop
|
||||
L(start_loop):
|
||||
subs cntrem, src, srcin
|
||||
b.ls L(nomatch)
|
||||
|
||||
/* Make sure that it won't overread by a 16-byte chunk */
|
||||
sub cntrem, cntrem, 1
|
||||
tbz cntrem, 4, L(loop32_2)
|
||||
add src, src, 16
|
||||
|
||||
.p2align 5
|
||||
L(loop32):
|
||||
ldr qdata, [src, -32]!
|
||||
cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
|
||||
umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
|
||||
fmov synd, dend
|
||||
cbnz synd, L(end)
|
||||
|
||||
L(loop32_2):
|
||||
ldr qdata, [src, -16]
|
||||
subs cntrem, cntrem, 32
|
||||
cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
|
||||
b.lo L(end_2)
|
||||
umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
|
||||
fmov synd, dend
|
||||
cbz synd, L(loop32)
|
||||
L(end_2):
|
||||
sub src, src, 16
|
||||
L(end):
|
||||
shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
|
||||
fmov synd, dend
|
||||
|
||||
add tmp, src, 15
|
||||
#ifdef __AARCH64EB__
|
||||
rbit synd, synd
|
||||
#endif
|
||||
clz synd, synd
|
||||
sub tmp, tmp, synd, lsr 2
|
||||
cmp tmp, srcin
|
||||
csel result, tmp, xzr, hs
|
||||
ret
|
||||
|
||||
L(nomatch):
|
||||
mov result, 0
|
||||
ret
|
||||
|
||||
END (__memrchr_aarch64)
|
143
libc/intrin/aarch64/memset.S
Normal file
143
libc/intrin/aarch64/memset.S
Normal file
|
@ -0,0 +1,143 @@
|
|||
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
|
||||
│vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi│
|
||||
╚──────────────────────────────────────────────────────────────────────────────╝
|
||||
│ │
|
||||
│ Optimized Routines │
|
||||
│ Copyright (c) 1999-2022, Arm Limited. │
|
||||
│ │
|
||||
│ Permission is hereby granted, free of charge, to any person obtaining │
|
||||
│ a copy of this software and associated documentation files (the │
|
||||
│ "Software"), to deal in the Software without restriction, including │
|
||||
│ without limitation the rights to use, copy, modify, merge, publish, │
|
||||
│ distribute, sublicense, and/or sell copies of the Software, and to │
|
||||
│ permit persons to whom the Software is furnished to do so, subject to │
|
||||
│ the following conditions: │
|
||||
│ │
|
||||
│ The above copyright notice and this permission notice shall be │
|
||||
│ included in all copies or substantial portions of the Software. │
|
||||
│ │
|
||||
│ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, │
|
||||
│ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF │
|
||||
│ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. │
|
||||
│ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY │
|
||||
│ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, │
|
||||
│ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE │
|
||||
│ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. │
|
||||
│ │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/intrin/aarch64/asmdefs.h"
|
||||
|
||||
#define __memset_aarch64 memset
|
||||
|
||||
.ident "\n\
|
||||
Optimized Routines (MIT License)\n\
|
||||
Copyright 2022 ARM Limited\n"
|
||||
.include "libc/disclaimer.inc"
|
||||
|
||||
/* Assumptions:
|
||||
*
|
||||
* ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
|
||||
*
|
||||
*/
|
||||
|
||||
#define dstin x0
|
||||
#define val x1
|
||||
#define valw w1
|
||||
#define count x2
|
||||
#define dst x3
|
||||
#define dstend x4
|
||||
#define zva_val x5
|
||||
|
||||
ENTRY (__memset_aarch64)
|
||||
PTR_ARG (0)
|
||||
SIZE_ARG (2)
|
||||
|
||||
dup v0.16B, valw
|
||||
add dstend, dstin, count
|
||||
|
||||
cmp count, 96
|
||||
b.hi L(set_long)
|
||||
cmp count, 16
|
||||
b.hs L(set_medium)
|
||||
mov val, v0.D[0]
|
||||
|
||||
/* Set 0..15 bytes. */
|
||||
tbz count, 3, 1f
|
||||
str val, [dstin]
|
||||
str val, [dstend, -8]
|
||||
ret
|
||||
.p2align 4
|
||||
1: tbz count, 2, 2f
|
||||
str valw, [dstin]
|
||||
str valw, [dstend, -4]
|
||||
ret
|
||||
2: cbz count, 3f
|
||||
strb valw, [dstin]
|
||||
tbz count, 1, 3f
|
||||
strh valw, [dstend, -2]
|
||||
3: ret
|
||||
|
||||
/* Set 17..96 bytes. */
|
||||
L(set_medium):
|
||||
str q0, [dstin]
|
||||
tbnz count, 6, L(set96)
|
||||
str q0, [dstend, -16]
|
||||
tbz count, 5, 1f
|
||||
str q0, [dstin, 16]
|
||||
str q0, [dstend, -32]
|
||||
1: ret
|
||||
|
||||
.p2align 4
|
||||
/* Set 64..96 bytes. Write 64 bytes from the start and
|
||||
32 bytes from the end. */
|
||||
L(set96):
|
||||
str q0, [dstin, 16]
|
||||
stp q0, q0, [dstin, 32]
|
||||
stp q0, q0, [dstend, -32]
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
L(set_long):
|
||||
and valw, valw, 255
|
||||
bic dst, dstin, 15
|
||||
str q0, [dstin]
|
||||
cmp count, 160
|
||||
ccmp valw, 0, 0, hs
|
||||
b.ne L(no_zva)
|
||||
|
||||
#ifndef SKIP_ZVA_CHECK
|
||||
mrs zva_val, dczid_el0
|
||||
and zva_val, zva_val, 31
|
||||
cmp zva_val, 4 /* ZVA size is 64 bytes. */
|
||||
b.ne L(no_zva)
|
||||
#endif
|
||||
str q0, [dst, 16]
|
||||
stp q0, q0, [dst, 32]
|
||||
bic dst, dst, 63
|
||||
sub count, dstend, dst /* Count is now 64 too large. */
|
||||
sub count, count, 128 /* Adjust count and bias for loop. */
|
||||
|
||||
.p2align 4
|
||||
L(zva_loop):
|
||||
add dst, dst, 64
|
||||
dc zva, dst
|
||||
subs count, count, 64
|
||||
b.hi L(zva_loop)
|
||||
stp q0, q0, [dstend, -64]
|
||||
stp q0, q0, [dstend, -32]
|
||||
ret
|
||||
|
||||
L(no_zva):
|
||||
sub count, dstend, dst /* Count is 16 too large. */
|
||||
sub dst, dst, 16 /* Dst is biased by -32. */
|
||||
sub count, count, 64 + 16 /* Adjust count and bias for loop. */
|
||||
L(no_zva_loop):
|
||||
stp q0, q0, [dst, 32]
|
||||
stp q0, q0, [dst, 64]!
|
||||
subs count, count, 64
|
||||
b.hi L(no_zva_loop)
|
||||
stp q0, q0, [dstend, -64]
|
||||
stp q0, q0, [dstend, -32]
|
||||
ret
|
||||
|
||||
END (__memset_aarch64)
|
175
libc/intrin/aarch64/stpcpy.S
Normal file
175
libc/intrin/aarch64/stpcpy.S
Normal file
|
@ -0,0 +1,175 @@
|
|||
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
|
||||
│vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi│
|
||||
╚──────────────────────────────────────────────────────────────────────────────╝
|
||||
│ │
|
||||
│ Optimized Routines │
|
||||
│ Copyright (c) 1999-2022, Arm Limited. │
|
||||
│ │
|
||||
│ Permission is hereby granted, free of charge, to any person obtaining │
|
||||
│ a copy of this software and associated documentation files (the │
|
||||
│ "Software"), to deal in the Software without restriction, including │
|
||||
│ without limitation the rights to use, copy, modify, merge, publish, │
|
||||
│ distribute, sublicense, and/or sell copies of the Software, and to │
|
||||
│ permit persons to whom the Software is furnished to do so, subject to │
|
||||
│ the following conditions: │
|
||||
│ │
|
||||
│ The above copyright notice and this permission notice shall be │
|
||||
│ included in all copies or substantial portions of the Software. │
|
||||
│ │
|
||||
│ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, │
|
||||
│ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF │
|
||||
│ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. │
|
||||
│ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY │
|
||||
│ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, │
|
||||
│ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE │
|
||||
│ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. │
|
||||
│ │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/intrin/aarch64/asmdefs.h"
|
||||
|
||||
#define __stpcpy_aarch64 stpcpy
|
||||
|
||||
.ident "\n\
|
||||
Optimized Routines (MIT License)\n\
|
||||
Copyright 2022 ARM Limited\n"
|
||||
.include "libc/disclaimer.inc"
|
||||
|
||||
/* Assumptions:
|
||||
*
|
||||
* ARMv8-a, AArch64, Advanced SIMD.
|
||||
* MTE compatible.
|
||||
*/
|
||||
|
||||
#define dstin x0
|
||||
#define srcin x1
|
||||
#define result x0
|
||||
|
||||
#define src x2
|
||||
#define dst x3
|
||||
#define len x4
|
||||
#define synd x4
|
||||
#define tmp x5
|
||||
#define shift x5
|
||||
#define data1 x6
|
||||
#define dataw1 w6
|
||||
#define data2 x7
|
||||
#define dataw2 w7
|
||||
|
||||
#define dataq q0
|
||||
#define vdata v0
|
||||
#define vhas_nul v1
|
||||
#define vend v2
|
||||
#define dend d2
|
||||
#define dataq2 q1
|
||||
|
||||
/*
|
||||
Core algorithm:
|
||||
For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
|
||||
per byte. We take 4 bits of every comparison byte with shift right and narrow
|
||||
by 4 instruction. Since the bits in the nibble mask reflect the order in
|
||||
which things occur in the original string, counting leading zeros identifies
|
||||
exactly which byte matched. */
|
||||
|
||||
ENTRY (__stpcpy_aarch64)
|
||||
PTR_ARG (0)
|
||||
PTR_ARG (1)
|
||||
bic src, srcin, 15
|
||||
ld1 {vdata.16b}, [src]
|
||||
cmeq vhas_nul.16b, vdata.16b, 0
|
||||
lsl shift, srcin, 2
|
||||
shrn vend.8b, vhas_nul.8h, 4
|
||||
fmov synd, dend
|
||||
lsr synd, synd, shift
|
||||
cbnz synd, L(tail)
|
||||
|
||||
ldr dataq, [src, 16]!
|
||||
cmeq vhas_nul.16b, vdata.16b, 0
|
||||
shrn vend.8b, vhas_nul.8h, 4
|
||||
fmov synd, dend
|
||||
cbz synd, L(start_loop)
|
||||
|
||||
#ifndef __AARCH64EB__
|
||||
rbit synd, synd
|
||||
#endif
|
||||
sub tmp, src, srcin
|
||||
clz len, synd
|
||||
add len, tmp, len, lsr 2
|
||||
tbz len, 4, L(less16)
|
||||
sub tmp, len, 15
|
||||
ldr dataq, [srcin]
|
||||
ldr dataq2, [srcin, tmp]
|
||||
str dataq, [dstin]
|
||||
str dataq2, [dstin, tmp]
|
||||
add result, dstin, len
|
||||
ret
|
||||
|
||||
L(tail):
|
||||
rbit synd, synd
|
||||
clz len, synd
|
||||
lsr len, len, 2
|
||||
L(less16):
|
||||
tbz len, 3, L(less8)
|
||||
sub tmp, len, 7
|
||||
ldr data1, [srcin]
|
||||
ldr data2, [srcin, tmp]
|
||||
str data1, [dstin]
|
||||
str data2, [dstin, tmp]
|
||||
add result, dstin, len
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
L(less8):
|
||||
subs tmp, len, 3
|
||||
b.lo L(less4)
|
||||
ldr dataw1, [srcin]
|
||||
ldr dataw2, [srcin, tmp]
|
||||
str dataw1, [dstin]
|
||||
str dataw2, [dstin, tmp]
|
||||
add result, dstin, len
|
||||
ret
|
||||
|
||||
L(less4):
|
||||
cbz len, L(zerobyte)
|
||||
ldrh dataw1, [srcin]
|
||||
strh dataw1, [dstin]
|
||||
L(zerobyte):
|
||||
strb wzr, [dstin, len]
|
||||
add result, dstin, len
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
L(start_loop):
|
||||
sub tmp, srcin, dstin
|
||||
ldr dataq2, [srcin]
|
||||
sub dst, src, tmp
|
||||
str dataq2, [dstin]
|
||||
L(loop):
|
||||
str dataq, [dst], 32
|
||||
ldr dataq, [src, 16]
|
||||
cmeq vhas_nul.16b, vdata.16b, 0
|
||||
umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
|
||||
fmov synd, dend
|
||||
cbnz synd, L(loopend)
|
||||
str dataq, [dst, -16]
|
||||
ldr dataq, [src, 32]!
|
||||
cmeq vhas_nul.16b, vdata.16b, 0
|
||||
umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
|
||||
fmov synd, dend
|
||||
cbz synd, L(loop)
|
||||
add dst, dst, 16
|
||||
L(loopend):
|
||||
shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
|
||||
fmov synd, dend
|
||||
sub dst, dst, 31
|
||||
#ifndef __AARCH64EB__
|
||||
rbit synd, synd
|
||||
#endif
|
||||
clz len, synd
|
||||
lsr len, len, 2
|
||||
add dst, dst, len
|
||||
ldr dataq, [dst, tmp]
|
||||
str dataq, [dst]
|
||||
add result, dst, 15
|
||||
ret
|
||||
|
||||
END (__stpcpy_aarch64)
|
152
libc/intrin/aarch64/strchr.S
Normal file
152
libc/intrin/aarch64/strchr.S
Normal file
|
@ -0,0 +1,152 @@
|
|||
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
|
||||
│vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi│
|
||||
╚──────────────────────────────────────────────────────────────────────────────╝
|
||||
│ │
|
||||
│ Optimized Routines │
|
||||
│ Copyright (c) 1999-2022, Arm Limited. │
|
||||
│ │
|
||||
│ Permission is hereby granted, free of charge, to any person obtaining │
|
||||
│ a copy of this software and associated documentation files (the │
|
||||
│ "Software"), to deal in the Software without restriction, including │
|
||||
│ without limitation the rights to use, copy, modify, merge, publish, │
|
||||
│ distribute, sublicense, and/or sell copies of the Software, and to │
|
||||
│ permit persons to whom the Software is furnished to do so, subject to │
|
||||
│ the following conditions: │
|
||||
│ │
|
||||
│ The above copyright notice and this permission notice shall be │
|
||||
│ included in all copies or substantial portions of the Software. │
|
||||
│ │
|
||||
│ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, │
|
||||
│ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF │
|
||||
│ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. │
|
||||
│ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY │
|
||||
│ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, │
|
||||
│ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE │
|
||||
│ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. │
|
||||
│ │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/intrin/aarch64/asmdefs.h"
|
||||
|
||||
#define __strchr_aarch64 strchr
|
||||
|
||||
.ident "\n\
|
||||
Optimized Routines (MIT License)\n\
|
||||
Copyright 2022 ARM Limited\n"
|
||||
.include "libc/disclaimer.inc"
|
||||
|
||||
/* Assumptions:
|
||||
*
|
||||
* ARMv8-a, AArch64
|
||||
* Neon Available.
|
||||
*/
|
||||
|
||||
/* Arguments and results. */
|
||||
#define srcin x0
|
||||
#define chrin w1
|
||||
|
||||
#define result x0
|
||||
|
||||
#define src x2
|
||||
#define tmp1 x3
|
||||
#define wtmp2 w4
|
||||
#define tmp3 x5
|
||||
|
||||
#define vrepchr v0
|
||||
#define vdata1 v1
|
||||
#define vdata2 v2
|
||||
#define vhas_nul1 v3
|
||||
#define vhas_nul2 v4
|
||||
#define vhas_chr1 v5
|
||||
#define vhas_chr2 v6
|
||||
#define vrepmask_0 v7
|
||||
#define vrepmask_c v16
|
||||
#define vend1 v17
|
||||
#define vend2 v18
|
||||
|
||||
/* Core algorithm.
|
||||
|
||||
For each 32-byte hunk we calculate a 64-bit syndrome value, with
|
||||
two bits per byte (LSB is always in bits 0 and 1, for both big
|
||||
and little-endian systems). For each tuple, bit 0 is set iff
|
||||
the relevant byte matched the requested character; bit 1 is set
|
||||
iff the relevant byte matched the NUL end of string (we trigger
|
||||
off bit0 for the special case of looking for NUL). Since the bits
|
||||
in the syndrome reflect exactly the order in which things occur
|
||||
in the original string a count_trailing_zeros() operation will
|
||||
identify exactly which byte is causing the termination, and why. */
|
||||
|
||||
/* Locals and temporaries. */
|
||||
|
||||
ENTRY (__strchr_aarch64)
|
||||
PTR_ARG (0)
|
||||
/* Magic constant 0xc0300c03 to allow us to identify which lane
|
||||
matches the requested byte. Even bits are set if the character
|
||||
matches, odd bits if either the char is NUL or matches. */
|
||||
mov wtmp2, 0x0c03
|
||||
movk wtmp2, 0xc030, lsl 16
|
||||
dup vrepchr.16b, chrin
|
||||
bic src, srcin, #31 /* Work with aligned 32-byte hunks. */
|
||||
dup vrepmask_c.4s, wtmp2
|
||||
ands tmp1, srcin, #31
|
||||
add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */
|
||||
b.eq L(loop)
|
||||
|
||||
/* Input string is not 32-byte aligned. Rather than forcing
|
||||
the padding bytes to a safe value, we calculate the syndrome
|
||||
for all the bytes, but then mask off those bits of the
|
||||
syndrome that are related to the padding. */
|
||||
ld1 {vdata1.16b, vdata2.16b}, [src], #32
|
||||
neg tmp1, tmp1
|
||||
cmeq vhas_nul1.16b, vdata1.16b, #0
|
||||
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
|
||||
cmeq vhas_nul2.16b, vdata2.16b, #0
|
||||
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
|
||||
bif vhas_nul1.16b, vhas_chr1.16b, vrepmask_0.16b
|
||||
bif vhas_nul2.16b, vhas_chr2.16b, vrepmask_0.16b
|
||||
and vend1.16b, vhas_nul1.16b, vrepmask_c.16b
|
||||
and vend2.16b, vhas_nul2.16b, vrepmask_c.16b
|
||||
lsl tmp1, tmp1, #1
|
||||
addp vend1.16b, vend1.16b, vend2.16b // 256->128
|
||||
mov tmp3, #~0
|
||||
addp vend1.16b, vend1.16b, vend2.16b // 128->64
|
||||
lsr tmp1, tmp3, tmp1
|
||||
|
||||
mov tmp3, vend1.d[0]
|
||||
bic tmp1, tmp3, tmp1 // Mask padding bits.
|
||||
cbnz tmp1, L(tail)
|
||||
|
||||
.p2align 4
|
||||
L(loop):
|
||||
ld1 {vdata1.16b, vdata2.16b}, [src], #32
|
||||
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
|
||||
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
|
||||
cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b
|
||||
cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b
|
||||
orr vend1.16b, vhas_nul1.16b, vhas_nul2.16b
|
||||
umaxp vend1.16b, vend1.16b, vend1.16b
|
||||
mov tmp1, vend1.d[0]
|
||||
cbz tmp1, L(loop)
|
||||
|
||||
/* Termination condition found. Now need to establish exactly why
|
||||
we terminated. */
|
||||
bif vhas_nul1.16b, vhas_chr1.16b, vrepmask_0.16b
|
||||
bif vhas_nul2.16b, vhas_chr2.16b, vrepmask_0.16b
|
||||
and vend1.16b, vhas_nul1.16b, vrepmask_c.16b
|
||||
and vend2.16b, vhas_nul2.16b, vrepmask_c.16b
|
||||
addp vend1.16b, vend1.16b, vend2.16b // 256->128
|
||||
addp vend1.16b, vend1.16b, vend2.16b // 128->64
|
||||
mov tmp1, vend1.d[0]
|
||||
L(tail):
|
||||
/* Count the trailing zeros, by bit reversing... */
|
||||
rbit tmp1, tmp1
|
||||
/* Re-bias source. */
|
||||
sub src, src, #32
|
||||
clz tmp1, tmp1 /* And counting the leading zeros. */
|
||||
/* Tmp1 is even if the target charager was found first. Otherwise
|
||||
we've found the end of string and we weren't looking for NUL. */
|
||||
tst tmp1, #1
|
||||
add result, src, tmp1, lsr #1
|
||||
csel result, result, xzr, eq
|
||||
ret
|
||||
|
||||
END (__strchr_aarch64)
|
140
libc/intrin/aarch64/strchrnul.S
Normal file
140
libc/intrin/aarch64/strchrnul.S
Normal file
|
@ -0,0 +1,140 @@
|
|||
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
|
||||
│vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi│
|
||||
╚──────────────────────────────────────────────────────────────────────────────╝
|
||||
│ │
|
||||
│ Optimized Routines │
|
||||
│ Copyright (c) 1999-2022, Arm Limited. │
|
||||
│ │
|
||||
│ Permission is hereby granted, free of charge, to any person obtaining │
|
||||
│ a copy of this software and associated documentation files (the │
|
||||
│ "Software"), to deal in the Software without restriction, including │
|
||||
│ without limitation the rights to use, copy, modify, merge, publish, │
|
||||
│ distribute, sublicense, and/or sell copies of the Software, and to │
|
||||
│ permit persons to whom the Software is furnished to do so, subject to │
|
||||
│ the following conditions: │
|
||||
│ │
|
||||
│ The above copyright notice and this permission notice shall be │
|
||||
│ included in all copies or substantial portions of the Software. │
|
||||
│ │
|
||||
│ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, │
|
||||
│ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF │
|
||||
│ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. │
|
||||
│ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY │
|
||||
│ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, │
|
||||
│ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE │
|
||||
│ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. │
|
||||
│ │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/intrin/aarch64/asmdefs.h"
|
||||
|
||||
#define __strchrnul_aarch64 strchrnul
|
||||
|
||||
.ident "\n\
|
||||
Optimized Routines (MIT License)\n\
|
||||
Copyright 2022 ARM Limited\n"
|
||||
.include "libc/disclaimer.inc"
|
||||
|
||||
/* Assumptions:
|
||||
*
|
||||
* ARMv8-a, AArch64
|
||||
* Neon Available.
|
||||
*/
|
||||
|
||||
/* Arguments and results. */
|
||||
#define srcin x0
|
||||
#define chrin w1
|
||||
|
||||
#define result x0
|
||||
|
||||
#define src x2
|
||||
#define tmp1 x3
|
||||
#define wtmp2 w4
|
||||
#define tmp3 x5
|
||||
|
||||
#define vrepchr v0
|
||||
#define vdata1 v1
|
||||
#define vdata2 v2
|
||||
#define vhas_nul1 v3
|
||||
#define vhas_nul2 v4
|
||||
#define vhas_chr1 v5
|
||||
#define vhas_chr2 v6
|
||||
#define vrepmask v7
|
||||
#define vend1 v16
|
||||
|
||||
/* Core algorithm.
|
||||
|
||||
For each 32-byte hunk we calculate a 64-bit syndrome value, with
|
||||
two bits per byte (LSB is always in bits 0 and 1, for both big
|
||||
and little-endian systems). For each tuple, bit 0 is set iff
|
||||
the relevant byte matched the requested character or nul. Since the
|
||||
bits in the syndrome reflect exactly the order in which things occur
|
||||
in the original string a count_trailing_zeros() operation will
|
||||
identify exactly which byte is causing the termination. */
|
||||
|
||||
/* Locals and temporaries. */
|
||||
|
||||
ENTRY (__strchrnul_aarch64)
|
||||
PTR_ARG (0)
|
||||
/* Magic constant 0x40100401 to allow us to identify which lane
|
||||
matches the termination condition. */
|
||||
mov wtmp2, #0x0401
|
||||
movk wtmp2, #0x4010, lsl #16
|
||||
dup vrepchr.16b, chrin
|
||||
bic src, srcin, #31 /* Work with aligned 32-byte hunks. */
|
||||
dup vrepmask.4s, wtmp2
|
||||
ands tmp1, srcin, #31
|
||||
b.eq L(loop)
|
||||
|
||||
/* Input string is not 32-byte aligned. Rather than forcing
|
||||
the padding bytes to a safe value, we calculate the syndrome
|
||||
for all the bytes, but then mask off those bits of the
|
||||
syndrome that are related to the padding. */
|
||||
ld1 {vdata1.16b, vdata2.16b}, [src], #32
|
||||
neg tmp1, tmp1
|
||||
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
|
||||
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
|
||||
cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b
|
||||
cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b
|
||||
and vhas_chr1.16b, vhas_nul1.16b, vrepmask.16b
|
||||
and vhas_chr2.16b, vhas_nul2.16b, vrepmask.16b
|
||||
lsl tmp1, tmp1, #1
|
||||
addp vend1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
|
||||
mov tmp3, #~0
|
||||
addp vend1.16b, vend1.16b, vend1.16b // 128->64
|
||||
lsr tmp1, tmp3, tmp1
|
||||
|
||||
mov tmp3, vend1.d[0]
|
||||
bic tmp1, tmp3, tmp1 // Mask padding bits.
|
||||
cbnz tmp1, L(tail)
|
||||
|
||||
.p2align 4
|
||||
L(loop):
|
||||
ld1 {vdata1.16b, vdata2.16b}, [src], #32
|
||||
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
|
||||
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
|
||||
cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b
|
||||
cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b
|
||||
orr vend1.16b, vhas_nul1.16b, vhas_nul2.16b
|
||||
umaxp vend1.16b, vend1.16b, vend1.16b
|
||||
mov tmp1, vend1.d[0]
|
||||
cbz tmp1, L(loop)
|
||||
|
||||
/* Termination condition found. Now need to establish exactly why
|
||||
we terminated. */
|
||||
and vhas_chr1.16b, vhas_nul1.16b, vrepmask.16b
|
||||
and vhas_chr2.16b, vhas_nul2.16b, vrepmask.16b
|
||||
addp vend1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
|
||||
addp vend1.16b, vend1.16b, vend1.16b // 128->64
|
||||
|
||||
mov tmp1, vend1.d[0]
|
||||
L(tail):
|
||||
/* Count the trailing zeros, by bit reversing... */
|
||||
rbit tmp1, tmp1
|
||||
/* Re-bias source. */
|
||||
sub src, src, #32
|
||||
clz tmp1, tmp1 /* ... and counting the leading zeros. */
|
||||
/* tmp1 is twice the offset into the fragment. */
|
||||
add result, src, tmp1, lsr #1
|
||||
ret
|
||||
|
||||
END (__strchrnul_aarch64)
|
214
libc/intrin/aarch64/strcmp.S
Normal file
214
libc/intrin/aarch64/strcmp.S
Normal file
|
@ -0,0 +1,214 @@
|
|||
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
|
||||
│vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi│
|
||||
╚──────────────────────────────────────────────────────────────────────────────╝
|
||||
│ │
|
||||
│ Optimized Routines │
|
||||
│ Copyright (c) 1999-2022, Arm Limited. │
|
||||
│ │
|
||||
│ Permission is hereby granted, free of charge, to any person obtaining │
|
||||
│ a copy of this software and associated documentation files (the │
|
||||
│ "Software"), to deal in the Software without restriction, including │
|
||||
│ without limitation the rights to use, copy, modify, merge, publish, │
|
||||
│ distribute, sublicense, and/or sell copies of the Software, and to │
|
||||
│ permit persons to whom the Software is furnished to do so, subject to │
|
||||
│ the following conditions: │
|
||||
│ │
|
||||
│ The above copyright notice and this permission notice shall be │
|
||||
│ included in all copies or substantial portions of the Software. │
|
||||
│ │
|
||||
│ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, │
|
||||
│ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF │
|
||||
│ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. │
|
||||
│ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY │
|
||||
│ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, │
|
||||
│ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE │
|
||||
│ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. │
|
||||
│ │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/intrin/aarch64/asmdefs.h"
|
||||
|
||||
#define __strcmp_aarch64 strcmp
|
||||
|
||||
.ident "\n\
|
||||
Optimized Routines (MIT License)\n\
|
||||
Copyright 2022 ARM Limited\n"
|
||||
.include "libc/disclaimer.inc"
|
||||
|
||||
/* Assumptions:
|
||||
*
|
||||
* ARMv8-a, AArch64.
|
||||
* MTE compatible.
|
||||
*/
|
||||
|
||||
#define REP8_01 0x0101010101010101
|
||||
#define REP8_7f 0x7f7f7f7f7f7f7f7f
|
||||
|
||||
#define src1 x0
|
||||
#define src2 x1
|
||||
#define result x0
|
||||
|
||||
#define data1 x2
|
||||
#define data1w w2
|
||||
#define data2 x3
|
||||
#define data2w w3
|
||||
#define has_nul x4
|
||||
#define diff x5
|
||||
#define off1 x5
|
||||
#define syndrome x6
|
||||
#define tmp x6
|
||||
#define data3 x7
|
||||
#define zeroones x8
|
||||
#define shift x9
|
||||
#define off2 x10
|
||||
|
||||
/* On big-endian early bytes are at MSB and on little-endian LSB.
|
||||
LS_FW means shifting towards early bytes. */
|
||||
#ifdef __AARCH64EB__
|
||||
# define LS_FW lsl
|
||||
#else
|
||||
# define LS_FW lsr
|
||||
#endif
|
||||
|
||||
/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
|
||||
(=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
|
||||
can be done in parallel across the entire word.
|
||||
Since carry propagation makes 0x1 bytes before a NUL byte appear
|
||||
NUL too in big-endian, byte-reverse the data before the NUL check. */

ENTRY (__strcmp_aarch64)
	PTR_ARG (0)
	PTR_ARG (1)
	sub	off2, src2, src1
	mov	zeroones, REP8_01
	and	tmp, src1, 7
	tst	off2, 7
	b.ne	L(misaligned8)
	cbnz	tmp, L(mutual_align)

	.p2align 4

L(loop_aligned):
	ldr	data2, [src1, off2]
	ldr	data1, [src1], 8
L(start_realigned):
#ifdef __AARCH64EB__
	rev	tmp, data1
	sub	has_nul, tmp, zeroones
	orr	tmp, tmp, REP8_7f
#else
	sub	has_nul, data1, zeroones
	orr	tmp, data1, REP8_7f
#endif
	bics	has_nul, has_nul, tmp	/* Non-zero if NUL terminator.  */
	ccmp	data1, data2, 0, eq
	b.eq	L(loop_aligned)
#ifdef __AARCH64EB__
	rev	has_nul, has_nul
#endif
	eor	diff, data1, data2
	orr	syndrome, diff, has_nul
L(end):
#ifndef __AARCH64EB__
	rev	syndrome, syndrome
	rev	data1, data1
	rev	data2, data2
#endif
	clz	shift, syndrome
	/* The most-significant-non-zero bit of the syndrome marks either the
	   first bit that is different, or the top bit of the first zero byte.
	   Shifting left now will bring the critical information into the
	   top bits.  */
	lsl	data1, data1, shift
	lsl	data2, data2, shift
	/* But we need to zero-extend (char is unsigned) the value and then
	   perform a signed 32-bit subtraction.  */
	lsr	data1, data1, 56
	sub	result, data1, data2, lsr 56
	ret

	.p2align 4

L(mutual_align):
	/* Sources are mutually aligned, but are not currently at an
	   alignment boundary.  Round down the addresses and then mask off
	   the bytes that precede the start point.  */
	bic	src1, src1, 7
	ldr	data2, [src1, off2]
	ldr	data1, [src1], 8
	neg	shift, src2, lsl 3	/* Bits to alignment -64.  */
	mov	tmp, -1
	LS_FW	tmp, tmp, shift
	orr	data1, data1, tmp
	orr	data2, data2, tmp
	b	L(start_realigned)

L(misaligned8):
	/* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always
	   checking to make sure that we don't access beyond the end of SRC2.  */
	cbz	tmp, L(src1_aligned)
L(do_misaligned):
	ldrb	data1w, [src1], 1
	ldrb	data2w, [src2], 1
	cmp	data1w, 0
	ccmp	data1w, data2w, 0, ne	/* NZCV = 0b0000.  */
	b.ne	L(done)
	tst	src1, 7
	b.ne	L(do_misaligned)

L(src1_aligned):
	neg	shift, src2, lsl 3
	bic	src2, src2, 7
	ldr	data3, [src2], 8
#ifdef __AARCH64EB__
	rev	data3, data3
#endif
	lsr	tmp, zeroones, shift
	orr	data3, data3, tmp
	sub	has_nul, data3, zeroones
	orr	tmp, data3, REP8_7f
	bics	has_nul, has_nul, tmp
	b.ne	L(tail)

	sub	off1, src2, src1

	.p2align 4

L(loop_unaligned):
	ldr	data3, [src1, off1]
	ldr	data2, [src1, off2]
#ifdef __AARCH64EB__
	rev	data3, data3
#endif
	sub	has_nul, data3, zeroones
	orr	tmp, data3, REP8_7f
	ldr	data1, [src1], 8
	bics	has_nul, has_nul, tmp
	ccmp	data1, data2, 0, eq
	b.eq	L(loop_unaligned)

	lsl	tmp, has_nul, shift
#ifdef __AARCH64EB__
	rev	tmp, tmp
#endif
	eor	diff, data1, data2
	orr	syndrome, diff, tmp
	cbnz	syndrome, L(end)
L(tail):
	ldr	data1, [src1]
	neg	shift, shift
	lsr	data2, data3, shift
	lsr	has_nul, has_nul, shift
#ifdef __AARCH64EB__
	rev	data2, data2
	rev	has_nul, has_nul
#endif
	eor	diff, data1, data2
	orr	syndrome, diff, has_nul
	b	L(end)

L(done):
	sub	result, data1, data2
	ret

END (__strcmp_aarch64)

170
libc/intrin/aarch64/strcpy.S
Normal file
170
libc/intrin/aarch64/strcpy.S
Normal file
|
@ -0,0 +1,170 @@
|
|||
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
|
||||
│vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi│
|
||||
╚──────────────────────────────────────────────────────────────────────────────╝
|
||||
│ │
|
||||
│ Optimized Routines │
|
||||
│ Copyright (c) 1999-2022, Arm Limited. │
|
||||
│ │
|
||||
│ Permission is hereby granted, free of charge, to any person obtaining │
|
||||
│ a copy of this software and associated documentation files (the │
|
||||
│ "Software"), to deal in the Software without restriction, including │
|
||||
│ without limitation the rights to use, copy, modify, merge, publish, │
|
||||
│ distribute, sublicense, and/or sell copies of the Software, and to │
|
||||
│ permit persons to whom the Software is furnished to do so, subject to │
|
||||
│ the following conditions: │
|
||||
│ │
|
||||
│ The above copyright notice and this permission notice shall be │
|
||||
│ included in all copies or substantial portions of the Software. │
|
||||
│ │
|
||||
│ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, │
|
||||
│ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF │
|
||||
│ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. │
|
||||
│ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY │
|
||||
│ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, │
|
||||
│ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE │
|
||||
│ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. │
|
||||
│ │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/intrin/aarch64/asmdefs.h"
|
||||
|
||||
#define __strcpy_aarch64 strcpy
|
||||
|
||||
.ident "\n\
|
||||
Optimized Routines (MIT License)\n\
|
||||
Copyright 2022 ARM Limited\n"
|
||||
.include "libc/disclaimer.inc"
|
||||
|
||||
/* Assumptions:
|
||||
*
|
||||
* ARMv8-a, AArch64, Advanced SIMD.
|
||||
* MTE compatible.
|
||||
*/
|
||||
|
||||
#define dstin x0
|
||||
#define srcin x1
|
||||
#define result x0
|
||||
|
||||
#define src x2
|
||||
#define dst x3
|
||||
#define len x4
|
||||
#define synd x4
|
||||
#define tmp x5
|
||||
#define shift x5
|
||||
#define data1 x6
|
||||
#define dataw1 w6
|
||||
#define data2 x7
|
||||
#define dataw2 w7
|
||||
|
||||
#define dataq q0
|
||||
#define vdata v0
|
||||
#define vhas_nul v1
|
||||
#define vend v2
|
||||
#define dend d2
|
||||
#define dataq2 q1
|
||||
|
||||
/*
|
||||
Core algorithm:
|
||||
For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
|
||||
per byte. We take 4 bits of every comparison byte with shift right and narrow
|
||||
by 4 instruction. Since the bits in the nibble mask reflect the order in
|
||||
which things occur in the original string, counting leading zeros identifies
|
||||
exactly which byte matched. */
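A scalar model of that nibble-mask idea is sketched below. It is an illustration only, under the assumption that eq[i] holds 0xff where byte i of the chunk matched and 0x00 otherwise; it reproduces the effect of the shrn-by-4 narrowing, not its exact bit extraction.

#include <stdint.h>

/* Hypothetical helper: index of the first matching byte in a 16-byte
   chunk, or -1. Four mask bits per byte keep the bytes in order, so a
   count of trailing zeros divided by four recovers the byte position.  */
static inline int first_match_index(const unsigned char eq[16]) {
  uint64_t mask = 0;
  for (int i = 0; i < 16; ++i)
    mask |= (uint64_t)(eq[i] & 0x0f) << (4 * i);
  return mask ? (int)(__builtin_ctzll(mask) / 4) : -1;
}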
|
||||
|
||||
ENTRY (__strcpy_aarch64)
|
||||
PTR_ARG (0)
|
||||
PTR_ARG (1)
|
||||
bic src, srcin, 15
|
||||
ld1 {vdata.16b}, [src]
|
||||
cmeq vhas_nul.16b, vdata.16b, 0
|
||||
lsl shift, srcin, 2
|
||||
shrn vend.8b, vhas_nul.8h, 4
|
||||
fmov synd, dend
|
||||
lsr synd, synd, shift
|
||||
cbnz synd, L(tail)
|
||||
|
||||
ldr dataq, [src, 16]!
|
||||
cmeq vhas_nul.16b, vdata.16b, 0
|
||||
shrn vend.8b, vhas_nul.8h, 4
|
||||
fmov synd, dend
|
||||
cbz synd, L(start_loop)
|
||||
|
||||
#ifndef __AARCH64EB__
|
||||
rbit synd, synd
|
||||
#endif
|
||||
sub tmp, src, srcin
|
||||
clz len, synd
|
||||
add len, tmp, len, lsr 2
|
||||
tbz len, 4, L(less16)
|
||||
sub tmp, len, 15
|
||||
ldr dataq, [srcin]
|
||||
ldr dataq2, [srcin, tmp]
|
||||
str dataq, [dstin]
|
||||
str dataq2, [dstin, tmp]
|
||||
ret
|
||||
|
||||
L(tail):
|
||||
rbit synd, synd
|
||||
clz len, synd
|
||||
lsr len, len, 2
|
||||
L(less16):
|
||||
tbz len, 3, L(less8)
|
||||
sub tmp, len, 7
|
||||
ldr data1, [srcin]
|
||||
ldr data2, [srcin, tmp]
|
||||
str data1, [dstin]
|
||||
str data2, [dstin, tmp]
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
L(less8):
|
||||
subs tmp, len, 3
|
||||
b.lo L(less4)
|
||||
ldr dataw1, [srcin]
|
||||
ldr dataw2, [srcin, tmp]
|
||||
str dataw1, [dstin]
|
||||
str dataw2, [dstin, tmp]
|
||||
ret
|
||||
|
||||
L(less4):
|
||||
cbz len, L(zerobyte)
|
||||
ldrh dataw1, [srcin]
|
||||
strh dataw1, [dstin]
|
||||
L(zerobyte):
|
||||
strb wzr, [dstin, len]
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
L(start_loop):
|
||||
sub tmp, srcin, dstin
|
||||
ldr dataq2, [srcin]
|
||||
sub dst, src, tmp
|
||||
str dataq2, [dstin]
|
||||
L(loop):
|
||||
str dataq, [dst], 32
|
||||
ldr dataq, [src, 16]
|
||||
cmeq vhas_nul.16b, vdata.16b, 0
|
||||
umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
|
||||
fmov synd, dend
|
||||
cbnz synd, L(loopend)
|
||||
str dataq, [dst, -16]
|
||||
ldr dataq, [src, 32]!
|
||||
cmeq vhas_nul.16b, vdata.16b, 0
|
||||
umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
|
||||
fmov synd, dend
|
||||
cbz synd, L(loop)
|
||||
add dst, dst, 16
|
||||
L(loopend):
|
||||
shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
|
||||
fmov synd, dend
|
||||
sub dst, dst, 31
|
||||
#ifndef __AARCH64EB__
|
||||
rbit synd, synd
|
||||
#endif
|
||||
clz len, synd
|
||||
lsr len, len, 2
|
||||
add dst, dst, len
|
||||
ldr dataq, [dst, tmp]
|
||||
str dataq, [dst]
|
||||
ret
|
||||
|
||||
END (__strcpy_aarch64)
|
220
libc/intrin/aarch64/strlen.S
Normal file
220
libc/intrin/aarch64/strlen.S
Normal file
|
@ -0,0 +1,220 @@
|
|||
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
|
||||
│vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi│
|
||||
╚──────────────────────────────────────────────────────────────────────────────╝
|
||||
│ │
|
||||
│ Optimized Routines │
|
||||
│ Copyright (c) 1999-2022, Arm Limited. │
|
||||
│ │
|
||||
│ Permission is hereby granted, free of charge, to any person obtaining │
|
||||
│ a copy of this software and associated documentation files (the │
|
||||
│ "Software"), to deal in the Software without restriction, including │
|
||||
│ without limitation the rights to use, copy, modify, merge, publish, │
|
||||
│ distribute, sublicense, and/or sell copies of the Software, and to │
|
||||
│ permit persons to whom the Software is furnished to do so, subject to │
|
||||
│ the following conditions: │
|
||||
│ │
|
||||
│ The above copyright notice and this permission notice shall be │
|
||||
│ included in all copies or substantial portions of the Software. │
|
||||
│ │
|
||||
│ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, │
|
||||
│ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF │
|
||||
│ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. │
|
||||
│ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY │
|
||||
│ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, │
|
||||
│ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE │
|
||||
│ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. │
|
||||
│ │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/intrin/aarch64/asmdefs.h"
|
||||
|
||||
#define __strlen_aarch64 strlen
|
||||
|
||||
.ident "\n\
|
||||
Optimized Routines (MIT License)\n\
|
||||
Copyright 2022 ARM Limited\n"
|
||||
.include "libc/disclaimer.inc"
|
||||
|
||||
/* Assumptions:
|
||||
*
|
||||
* ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
|
||||
* Not MTE compatible.
|
||||
*/
|
||||
|
||||
#define srcin x0
|
||||
#define len x0
|
||||
|
||||
#define src x1
|
||||
#define data1 x2
|
||||
#define data2 x3
|
||||
#define has_nul1 x4
|
||||
#define has_nul2 x5
|
||||
#define tmp1 x4
|
||||
#define tmp2 x5
|
||||
#define tmp3 x6
|
||||
#define tmp4 x7
|
||||
#define zeroones x8
|
||||
|
||||
#define maskv v0
|
||||
#define maskd d0
|
||||
#define dataq1 q1
|
||||
#define dataq2 q2
|
||||
#define datav1 v1
|
||||
#define datav2 v2
|
||||
#define tmp x2
|
||||
#define tmpw w2
|
||||
#define synd x3
|
||||
#define syndw w3
|
||||
#define shift x4
|
||||
|
||||
/* For the first 32 bytes, NUL detection works on the principle that
|
||||
(X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero if a
|
||||
byte is zero, and can be done in parallel across the entire word. */
|
||||
|
||||
#define REP8_01 0x0101010101010101
|
||||
#define REP8_7f 0x7f7f7f7f7f7f7f7f
|
||||
|
||||
/* To test the page crossing code path more thoroughly, compile with
|
||||
-DTEST_PAGE_CROSS - this will force all calls through the slower
|
||||
entry path. This option is not intended for production use. */
|
||||
|
||||
#ifdef TEST_PAGE_CROSS
|
||||
# define MIN_PAGE_SIZE 32
|
||||
#else
|
||||
# define MIN_PAGE_SIZE 4096
|
||||
#endif
|
||||
|
||||
/* Core algorithm:
|
||||
|
||||
Since strings are short on average, we check the first 32 bytes of the
|
||||
string for a NUL character without aligning the string. In order to use
|
||||
unaligned loads safely we must do a page cross check first.
|
||||
|
||||
If there is a NUL byte we calculate the length from the 2 8-byte words
|
||||
using conditional select to reduce branch mispredictions (it is unlikely
|
||||
strlen will be repeatedly called on strings with the same length).
|
||||
|
||||
If the string is longer than 32 bytes, align src so we don't need further
|
||||
page cross checks, and process 32 bytes per iteration using a fast SIMD
|
||||
loop.
|
||||
|
||||
If the page cross check fails, we read 32 bytes from an aligned address,
|
||||
and ignore any characters before the string. If it contains a NUL
|
||||
character, return the length, if not, continue in the main loop. */
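The page-cross test that guards the first unaligned 32-byte read boils down to a single comparison; a hedged C sketch of that check (names invented for illustration) follows.

#include <stdint.h>

/* True when a 32-byte load at s could spill into the next page, i.e.
   when the offset inside a min_page_size page exceeds page size - 32.  */
static inline int may_cross_page(const char *s, unsigned long min_page_size) {
  return ((uintptr_t)s & (min_page_size - 1)) > min_page_size - 32;
}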
|
||||
|
||||
ENTRY (__strlen_aarch64)
|
||||
PTR_ARG (0)
|
||||
and tmp1, srcin, MIN_PAGE_SIZE - 1
|
||||
cmp tmp1, MIN_PAGE_SIZE - 32
|
||||
b.hi L(page_cross)
|
||||
|
||||
/* Look for a NUL byte in the first 16 bytes. */
|
||||
ldp data1, data2, [srcin]
|
||||
mov zeroones, REP8_01
|
||||
|
||||
#ifdef __AARCH64EB__
|
||||
/* For big-endian, carry propagation (if the final byte in the
|
||||
string is 0x01) means we cannot use has_nul1/2 directly.
|
||||
Since we expect strings to be small and early-exit,
|
||||
byte-swap the data now so has_null1/2 will be correct. */
|
||||
rev data1, data1
|
||||
rev data2, data2
|
||||
#endif
|
||||
sub tmp1, data1, zeroones
|
||||
orr tmp2, data1, REP8_7f
|
||||
sub tmp3, data2, zeroones
|
||||
orr tmp4, data2, REP8_7f
|
||||
bics has_nul1, tmp1, tmp2
|
||||
bic has_nul2, tmp3, tmp4
|
||||
ccmp has_nul2, 0, 0, eq
|
||||
b.eq L(bytes16_31)
|
||||
|
||||
/* Find the exact offset of the first NUL byte in the first 16 bytes
|
||||
from the string start. Enter with C = has_nul1 == 0. */
|
||||
csel has_nul1, has_nul1, has_nul2, cc
|
||||
mov len, 8
|
||||
rev has_nul1, has_nul1
|
||||
csel len, xzr, len, cc
|
||||
clz tmp1, has_nul1
|
||||
add len, len, tmp1, lsr 3
|
||||
ret
|
||||
|
||||
/* Look for a NUL byte at offset 16..31 in the string. */
|
||||
L(bytes16_31):
|
||||
ldp data1, data2, [srcin, 16]
|
||||
#ifdef __AARCH64EB__
|
||||
rev data1, data1
|
||||
rev data2, data2
|
||||
#endif
|
||||
sub tmp1, data1, zeroones
|
||||
orr tmp2, data1, REP8_7f
|
||||
sub tmp3, data2, zeroones
|
||||
orr tmp4, data2, REP8_7f
|
||||
bics has_nul1, tmp1, tmp2
|
||||
bic has_nul2, tmp3, tmp4
|
||||
ccmp has_nul2, 0, 0, eq
|
||||
b.eq L(loop_entry)
|
||||
|
||||
/* Find the exact offset of the first NUL byte at offset 16..31 from
|
||||
the string start. Enter with C = has_nul1 == 0. */
|
||||
csel has_nul1, has_nul1, has_nul2, cc
|
||||
mov len, 24
|
||||
rev has_nul1, has_nul1
|
||||
mov tmp3, 16
|
||||
clz tmp1, has_nul1
|
||||
csel len, tmp3, len, cc
|
||||
add len, len, tmp1, lsr 3
|
||||
ret
|
||||
|
||||
nop
|
||||
L(loop_entry):
|
||||
bic src, srcin, 31
|
||||
|
||||
.p2align 5
|
||||
L(loop):
|
||||
ldp dataq1, dataq2, [src, 32]!
|
||||
uminp maskv.16b, datav1.16b, datav2.16b
|
||||
uminp maskv.16b, maskv.16b, maskv.16b
|
||||
cmeq maskv.8b, maskv.8b, 0
|
||||
fmov synd, maskd
|
||||
cbz synd, L(loop)
|
||||
|
||||
/* Low 32 bits of synd are non-zero if a NUL was found in datav1. */
|
||||
cmeq maskv.16b, datav1.16b, 0
|
||||
sub len, src, srcin
|
||||
cbnz syndw, 1f
|
||||
cmeq maskv.16b, datav2.16b, 0
|
||||
add len, len, 16
|
||||
1:
|
||||
/* Generate a bitmask and compute correct byte offset. */
|
||||
shrn maskv.8b, maskv.8h, 4
|
||||
fmov synd, maskd
|
||||
#ifndef __AARCH64EB__
|
||||
rbit synd, synd
|
||||
#endif
|
||||
clz tmp, synd
|
||||
add len, len, tmp, lsr 2
|
||||
ret
|
||||
|
||||
L(page_cross):
|
||||
bic src, srcin, 31
|
||||
mov tmpw, 0x0c03
|
||||
movk tmpw, 0xc030, lsl 16
|
||||
ld1 {datav1.16b, datav2.16b}, [src]
|
||||
dup maskv.4s, tmpw
|
||||
cmeq datav1.16b, datav1.16b, 0
|
||||
cmeq datav2.16b, datav2.16b, 0
|
||||
and datav1.16b, datav1.16b, maskv.16b
|
||||
and datav2.16b, datav2.16b, maskv.16b
|
||||
addp maskv.16b, datav1.16b, datav2.16b
|
||||
addp maskv.16b, maskv.16b, maskv.16b
|
||||
fmov synd, maskd
|
||||
lsl shift, srcin, 1
|
||||
lsr synd, synd, shift
|
||||
cbz synd, L(loop)
|
||||
|
||||
rbit synd, synd
|
||||
clz len, synd
|
||||
lsr len, len, 1
|
||||
ret
|
||||
|
||||
END (__strlen_aarch64)
|
334
libc/intrin/aarch64/strncmp.S
Normal file
334
libc/intrin/aarch64/strncmp.S
Normal file
|
@ -0,0 +1,334 @@
|
|||
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
|
||||
│vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi│
|
||||
╚──────────────────────────────────────────────────────────────────────────────╝
|
||||
│ │
|
||||
│ Optimized Routines │
|
||||
│ Copyright (c) 1999-2022, Arm Limited. │
|
||||
│ │
|
||||
│ Permission is hereby granted, free of charge, to any person obtaining │
|
||||
│ a copy of this software and associated documentation files (the │
|
||||
│ "Software"), to deal in the Software without restriction, including │
|
||||
│ without limitation the rights to use, copy, modify, merge, publish, │
|
||||
│ distribute, sublicense, and/or sell copies of the Software, and to │
|
||||
│ permit persons to whom the Software is furnished to do so, subject to │
|
||||
│ the following conditions: │
|
||||
│ │
|
||||
│ The above copyright notice and this permission notice shall be │
|
||||
│ included in all copies or substantial portions of the Software. │
|
||||
│ │
|
||||
│ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, │
|
||||
│ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF │
|
||||
│ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. │
|
||||
│ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY │
|
||||
│ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, │
|
||||
│ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE │
|
||||
│ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. │
|
||||
│ │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/intrin/aarch64/asmdefs.h"
|
||||
|
||||
#define __strncmp_aarch64 strncmp
|
||||
|
||||
.ident "\n\
|
||||
Optimized Routines (MIT License)\n\
|
||||
Copyright 2022 ARM Limited\n"
|
||||
.include "libc/disclaimer.inc"
|
||||
|
||||
/* Assumptions:
|
||||
*
|
||||
* ARMv8-a, AArch64.
|
||||
* MTE compatible.
|
||||
*/
|
||||
|
||||
#define REP8_01 0x0101010101010101
|
||||
#define REP8_7f 0x7f7f7f7f7f7f7f7f
|
||||
|
||||
/* Parameters and result. */
|
||||
#define src1 x0
|
||||
#define src2 x1
|
||||
#define limit x2
|
||||
#define result x0
|
||||
|
||||
/* Internal variables. */
|
||||
#define data1 x3
|
||||
#define data1w w3
|
||||
#define data2 x4
|
||||
#define data2w w4
|
||||
#define has_nul x5
|
||||
#define diff x6
|
||||
#define syndrome x7
|
||||
#define tmp1 x8
|
||||
#define tmp2 x9
|
||||
#define tmp3 x10
|
||||
#define zeroones x11
|
||||
#define pos x12
|
||||
#define mask x13
|
||||
#define endloop x14
|
||||
#define count mask
|
||||
#define offset pos
|
||||
#define neg_offset x15
|
||||
|
||||
/* Define endian dependent shift operations.
|
||||
On big-endian early bytes are at MSB and on little-endian LSB.
|
||||
LS_FW means shifting towards early bytes.
|
||||
LS_BK means shifting towards later bytes.
|
||||
*/
|
||||
#ifdef __AARCH64EB__
|
||||
#define LS_FW lsl
|
||||
#define LS_BK lsr
|
||||
#else
|
||||
#define LS_FW lsr
|
||||
#define LS_BK lsl
|
||||
#endif
|
||||
|
||||
ENTRY (__strncmp_aarch64)
|
||||
PTR_ARG (0)
|
||||
PTR_ARG (1)
|
||||
SIZE_ARG (2)
|
||||
cbz limit, L(ret0)
|
||||
eor tmp1, src1, src2
|
||||
mov zeroones, #REP8_01
|
||||
tst tmp1, #7
|
||||
and count, src1, #7
|
||||
b.ne L(misaligned8)
|
||||
cbnz count, L(mutual_align)
|
||||
|
||||
/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
|
||||
(=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
|
||||
can be done in parallel across the entire word. */
|
||||
.p2align 4
|
||||
L(loop_aligned):
|
||||
ldr data1, [src1], #8
|
||||
ldr data2, [src2], #8
|
||||
L(start_realigned):
|
||||
subs limit, limit, #8
|
||||
sub tmp1, data1, zeroones
|
||||
orr tmp2, data1, #REP8_7f
|
||||
eor diff, data1, data2 /* Non-zero if differences found. */
|
||||
csinv endloop, diff, xzr, hi /* Last Dword or differences. */
|
||||
bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
|
||||
ccmp endloop, #0, #0, eq
|
||||
b.eq L(loop_aligned)
|
||||
/* End of main loop */
|
||||
|
||||
L(full_check):
|
||||
#ifndef __AARCH64EB__
|
||||
orr syndrome, diff, has_nul
|
||||
add limit, limit, 8 /* Rewind limit to before last subs. */
|
||||
L(syndrome_check):
|
||||
/* Limit was reached. Check if the NUL byte or the difference
|
||||
is before the limit. */
|
||||
rev syndrome, syndrome
|
||||
rev data1, data1
|
||||
clz pos, syndrome
|
||||
rev data2, data2
|
||||
lsl data1, data1, pos
|
||||
cmp limit, pos, lsr #3
|
||||
lsl data2, data2, pos
|
||||
/* But we need to zero-extend (char is unsigned) the value and then
|
||||
perform a signed 32-bit subtraction. */
|
||||
lsr data1, data1, #56
|
||||
sub result, data1, data2, lsr #56
|
||||
csel result, result, xzr, hi
|
||||
ret
|
||||
#else
|
||||
/* Not reached the limit, must have found the end or a diff. */
|
||||
tbz limit, #63, L(not_limit)
|
||||
add tmp1, limit, 8
|
||||
cbz limit, L(not_limit)
|
||||
|
||||
lsl limit, tmp1, #3 /* Bits -> bytes. */
|
||||
mov mask, #~0
|
||||
lsr mask, mask, limit
|
||||
bic data1, data1, mask
|
||||
bic data2, data2, mask
|
||||
|
||||
/* Make sure that the NUL byte is marked in the syndrome. */
|
||||
orr has_nul, has_nul, mask
|
||||
|
||||
L(not_limit):
|
||||
/* For big-endian we cannot use the trick with the syndrome value
|
||||
as carry-propagation can corrupt the upper bits if the trailing
|
||||
bytes in the string contain 0x01. */
|
||||
/* However, if there is no NUL byte in the dword, we can generate
|
||||
the result directly. We can't just subtract the bytes as the
|
||||
MSB might be significant. */
|
||||
cbnz has_nul, 1f
|
||||
cmp data1, data2
|
||||
cset result, ne
|
||||
cneg result, result, lo
|
||||
ret
|
||||
1:
|
||||
/* Re-compute the NUL-byte detection, using a byte-reversed value. */
|
||||
rev tmp3, data1
|
||||
sub tmp1, tmp3, zeroones
|
||||
orr tmp2, tmp3, #REP8_7f
|
||||
bic has_nul, tmp1, tmp2
|
||||
rev has_nul, has_nul
|
||||
orr syndrome, diff, has_nul
|
||||
clz pos, syndrome
|
||||
/* The most-significant-non-zero bit of the syndrome marks either the
|
||||
first bit that is different, or the top bit of the first zero byte.
|
||||
Shifting left now will bring the critical information into the
|
||||
top bits. */
|
||||
L(end_quick):
|
||||
lsl data1, data1, pos
|
||||
lsl data2, data2, pos
|
||||
/* But we need to zero-extend (char is unsigned) the value and then
|
||||
perform a signed 32-bit subtraction. */
|
||||
lsr data1, data1, #56
|
||||
sub result, data1, data2, lsr #56
|
||||
ret
|
||||
#endif
|
||||
|
||||
L(mutual_align):
|
||||
/* Sources are mutually aligned, but are not currently at an
|
||||
alignment boundary. Round down the addresses and then mask off
|
||||
the bytes that precede the start point.
|
||||
We also need to adjust the limit calculations, but without
|
||||
overflowing if the limit is near ULONG_MAX. */
|
||||
bic src1, src1, #7
|
||||
bic src2, src2, #7
|
||||
ldr data1, [src1], #8
|
||||
neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */
|
||||
ldr data2, [src2], #8
|
||||
mov tmp2, #~0
|
||||
LS_FW tmp2, tmp2, tmp3 /* Shift (count & 63). */
|
||||
/* Adjust the limit and ensure it doesn't overflow. */
|
||||
adds limit, limit, count
|
||||
csinv limit, limit, xzr, lo
|
||||
orr data1, data1, tmp2
|
||||
orr data2, data2, tmp2
|
||||
b L(start_realigned)
|
||||
|
||||
.p2align 4
|
||||
/* Don't bother with dwords for up to 16 bytes. */
|
||||
L(misaligned8):
|
||||
cmp limit, #16
|
||||
b.hs L(try_misaligned_words)
|
||||
|
||||
L(byte_loop):
|
||||
/* Perhaps we can do better than this. */
|
||||
ldrb data1w, [src1], #1
|
||||
ldrb data2w, [src2], #1
|
||||
subs limit, limit, #1
|
||||
ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */
|
||||
ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
|
||||
b.eq L(byte_loop)
|
||||
L(done):
|
||||
sub result, data1, data2
|
||||
ret
|
||||
/* Align the SRC1 to a dword by doing a bytewise compare and then do
|
||||
the dword loop. */
|
||||
L(try_misaligned_words):
|
||||
cbz count, L(src1_aligned)
|
||||
|
||||
neg count, count
|
||||
and count, count, #7
|
||||
sub limit, limit, count
|
||||
|
||||
L(page_end_loop):
|
||||
ldrb data1w, [src1], #1
|
||||
ldrb data2w, [src2], #1
|
||||
cmp data1w, #1
|
||||
ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
|
||||
b.ne L(done)
|
||||
subs count, count, #1
|
||||
b.hi L(page_end_loop)
|
||||
|
||||
/* The following diagram explains the comparison of misaligned strings.
|
||||
The bytes are shown in natural order. For little-endian, it is
|
||||
reversed in the registers. The "x" bytes are before the string.
|
||||
The "|" separates data that is loaded at one time.
|
||||
src1 | a a a a a a a a | b b b c c c c c | . . .
|
||||
src2 | x x x x x a a a a a a a a b b b | c c c c c . . .
|
||||
|
||||
After shifting in each step, the data looks like this:
|
||||
STEP_A STEP_B STEP_C
|
||||
data1 a a a a a a a a b b b c c c c c b b b c c c c c
|
||||
data2 a a a a a a a a b b b 0 0 0 0 0 0 0 0 c c c c c
|
||||
|
||||
The bytes with "0" are eliminated from the syndrome via mask.
|
||||
|
||||
Align SRC2 down to 16 bytes. This way we can read 16 bytes at a
|
||||
time from SRC2. The comparison happens in 3 steps. After each step
|
||||
the loop can exit, or read from SRC1 or SRC2. */
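On little-endian, the shift-and-merge of two aligned SRC2 words that the diagram describes reduces to the sketch below (an illustration only; bitoff is eight times the byte misalignment and is assumed to be non-zero here, whereas the assembly relies on shift amounts being taken modulo 64).

#include <stdint.h>

/* Combine two aligned 8-byte loads lo and hi into the 8 source bytes
   that line up with data1, little-endian convention.  */
static inline uint64_t merge_unaligned(uint64_t lo, uint64_t hi, unsigned bitoff) {
  return (lo >> bitoff) | (hi << (64 - bitoff));  /* bitoff in 8..56 */
}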
|
||||
L(src1_aligned):
|
||||
/* Calculate offset from 8 byte alignment to string start in bits. No
|
||||
need to mask offset since shifts are ignoring upper bits. */
|
||||
lsl offset, src2, #3
|
||||
bic src2, src2, #0xf
|
||||
mov mask, -1
|
||||
neg neg_offset, offset
|
||||
ldr data1, [src1], #8
|
||||
ldp tmp1, tmp2, [src2], #16
|
||||
LS_BK mask, mask, neg_offset
|
||||
and neg_offset, neg_offset, #63 /* Need actual value for cmp later. */
|
||||
/* Skip the first compare if data in tmp1 is irrelevant. */
|
||||
tbnz offset, 6, L(misaligned_mid_loop)
|
||||
|
||||
L(loop_misaligned):
|
||||
/* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/
|
||||
LS_FW data2, tmp1, offset
|
||||
LS_BK tmp1, tmp2, neg_offset
|
||||
subs limit, limit, #8
|
||||
orr data2, data2, tmp1 /* 8 bytes from SRC2 combined from two regs.*/
|
||||
sub has_nul, data1, zeroones
|
||||
eor diff, data1, data2 /* Non-zero if differences found. */
|
||||
orr tmp3, data1, #REP8_7f
|
||||
csinv endloop, diff, xzr, hi /* If limit, set to all ones. */
|
||||
bic has_nul, has_nul, tmp3 /* Non-zero if NUL byte found in SRC1. */
|
||||
orr tmp3, endloop, has_nul
|
||||
cbnz tmp3, L(full_check)
|
||||
|
||||
ldr data1, [src1], #8
|
||||
L(misaligned_mid_loop):
|
||||
/* STEP_B: Compare first part of data1 to second part of tmp2. */
|
||||
LS_FW data2, tmp2, offset
|
||||
#ifdef __AARCH64EB__
|
||||
/* For big-endian we do a byte reverse to avoid carry-propagation
|
||||
problem described above. This way we can reuse the has_nul in the
|
||||
next step and also use syndrome value trick at the end. */
|
||||
rev tmp3, data1
|
||||
#define data1_fixed tmp3
|
||||
#else
|
||||
#define data1_fixed data1
|
||||
#endif
|
||||
sub has_nul, data1_fixed, zeroones
|
||||
orr tmp3, data1_fixed, #REP8_7f
|
||||
eor diff, data2, data1 /* Non-zero if differences found. */
|
||||
bic has_nul, has_nul, tmp3 /* Non-zero if NUL terminator. */
|
||||
#ifdef __AARCH64EB__
|
||||
rev has_nul, has_nul
|
||||
#endif
|
||||
cmp limit, neg_offset, lsr #3
|
||||
orr syndrome, diff, has_nul
|
||||
bic syndrome, syndrome, mask /* Ignore later bytes. */
|
||||
csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
|
||||
cbnz tmp3, L(syndrome_check)
|
||||
|
||||
/* STEP_C: Compare second part of data1 to first part of tmp1. */
|
||||
ldp tmp1, tmp2, [src2], #16
|
||||
cmp limit, #8
|
||||
LS_BK data2, tmp1, neg_offset
|
||||
eor diff, data2, data1 /* Non-zero if differences found. */
|
||||
orr syndrome, diff, has_nul
|
||||
and syndrome, syndrome, mask /* Ignore earlier bytes. */
|
||||
csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
|
||||
cbnz tmp3, L(syndrome_check)
|
||||
|
||||
ldr data1, [src1], #8
|
||||
sub limit, limit, #8
|
||||
b L(loop_misaligned)
|
||||
|
||||
#ifdef __AARCH64EB__
|
||||
L(syndrome_check):
|
||||
clz pos, syndrome
|
||||
cmp pos, limit, lsl #3
|
||||
b.lo L(end_quick)
|
||||
#endif
|
||||
|
||||
L(ret0):
|
||||
mov result, #0
|
||||
ret
|
||||
END(__strncmp_aarch64)
|
128
libc/intrin/aarch64/strnlen.S
Normal file
128
libc/intrin/aarch64/strnlen.S
Normal file
|
@ -0,0 +1,128 @@
|
|||
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
|
||||
│vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi│
|
||||
╚──────────────────────────────────────────────────────────────────────────────╝
|
||||
│ │
|
||||
│ Optimized Routines │
|
||||
│ Copyright (c) 1999-2022, Arm Limited. │
|
||||
│ │
|
||||
│ Permission is hereby granted, free of charge, to any person obtaining │
|
||||
│ a copy of this software and associated documentation files (the │
|
||||
│ "Software"), to deal in the Software without restriction, including │
|
||||
│ without limitation the rights to use, copy, modify, merge, publish, │
|
||||
│ distribute, sublicense, and/or sell copies of the Software, and to │
|
||||
│ permit persons to whom the Software is furnished to do so, subject to │
|
||||
│ the following conditions: │
|
||||
│ │
|
||||
│ The above copyright notice and this permission notice shall be │
|
||||
│ included in all copies or substantial portions of the Software. │
|
||||
│ │
|
||||
│ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, │
|
||||
│ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF │
|
||||
│ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. │
|
||||
│ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY │
|
||||
│ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, │
|
||||
│ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE │
|
||||
│ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. │
|
||||
│ │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/intrin/aarch64/asmdefs.h"
|
||||
|
||||
#define __strnlen_aarch64 strnlen
|
||||
|
||||
.ident "\n\
|
||||
Optimized Routines (MIT License)\n\
|
||||
Copyright 2022 ARM Limited\n"
|
||||
.include "libc/disclaimer.inc"
|
||||
|
||||
/* Assumptions:
|
||||
*
|
||||
* ARMv8-a, AArch64, Advanced SIMD.
|
||||
* MTE compatible.
|
||||
*/
|
||||
|
||||
#define srcin x0
|
||||
#define cntin x1
|
||||
#define result x0
|
||||
|
||||
#define src x2
|
||||
#define synd x3
|
||||
#define shift x4
|
||||
#define tmp x4
|
||||
#define cntrem x5
|
||||
|
||||
#define qdata q0
|
||||
#define vdata v0
|
||||
#define vhas_chr v1
|
||||
#define vend v2
|
||||
#define dend d2
|
||||
|
||||
/*
|
||||
Core algorithm:
|
||||
Process the string in 16-byte aligned chunks. Compute a 64-bit mask with
|
||||
four bits per byte using the shrn instruction. A count trailing zeros then
|
||||
identifies the first zero byte. */
|
||||
|
||||
ENTRY (__strnlen_aarch64)
|
||||
PTR_ARG (0)
|
||||
SIZE_ARG (1)
|
||||
bic src, srcin, 15
|
||||
cbz cntin, L(nomatch)
|
||||
ld1 {vdata.16b}, [src]
|
||||
cmeq vhas_chr.16b, vdata.16b, 0
|
||||
lsl shift, srcin, 2
|
||||
shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
|
||||
fmov synd, dend
|
||||
lsr synd, synd, shift
|
||||
cbz synd, L(start_loop)
|
||||
L(finish):
|
||||
rbit synd, synd
|
||||
clz synd, synd
|
||||
lsr result, synd, 2
|
||||
cmp cntin, result
|
||||
csel result, cntin, result, ls
|
||||
ret
|
||||
|
||||
L(nomatch):
|
||||
mov result, cntin
|
||||
ret
|
||||
|
||||
L(start_loop):
|
||||
sub tmp, src, srcin
|
||||
add tmp, tmp, 17
|
||||
subs cntrem, cntin, tmp
|
||||
b.lo L(nomatch)
|
||||
|
||||
/* Make sure that it won't overread by a 16-byte chunk */
|
||||
tbz cntrem, 4, L(loop32_2)
|
||||
sub src, src, 16
|
||||
.p2align 5
|
||||
L(loop32):
|
||||
ldr qdata, [src, 32]!
|
||||
cmeq vhas_chr.16b, vdata.16b, 0
|
||||
umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
|
||||
fmov synd, dend
|
||||
cbnz synd, L(end)
|
||||
L(loop32_2):
|
||||
ldr qdata, [src, 16]
|
||||
subs cntrem, cntrem, 32
|
||||
cmeq vhas_chr.16b, vdata.16b, 0
|
||||
b.lo L(end_2)
|
||||
umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
|
||||
fmov synd, dend
|
||||
cbz synd, L(loop32)
|
||||
L(end_2):
|
||||
add src, src, 16
|
||||
L(end):
|
||||
shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
|
||||
sub result, src, srcin
|
||||
fmov synd, dend
|
||||
#ifndef __AARCH64EB__
|
||||
rbit synd, synd
|
||||
#endif
|
||||
clz synd, synd
|
||||
add result, result, synd, lsr 2
|
||||
cmp cntin, result
|
||||
csel result, cntin, result, ls
|
||||
ret
|
||||
|
||||
END (__strnlen_aarch64)
|
175
libc/intrin/aarch64/strrchr.S
Normal file
175
libc/intrin/aarch64/strrchr.S
Normal file
|
@ -0,0 +1,175 @@
|
|||
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
|
||||
│vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi│
|
||||
╚──────────────────────────────────────────────────────────────────────────────╝
|
||||
│ │
|
||||
│ Optimized Routines │
|
||||
│ Copyright (c) 1999-2022, Arm Limited. │
|
||||
│ │
|
||||
│ Permission is hereby granted, free of charge, to any person obtaining │
|
||||
│ a copy of this software and associated documentation files (the │
|
||||
│ "Software"), to deal in the Software without restriction, including │
|
||||
│ without limitation the rights to use, copy, modify, merge, publish, │
|
||||
│ distribute, sublicense, and/or sell copies of the Software, and to │
|
||||
│ permit persons to whom the Software is furnished to do so, subject to │
|
||||
│ the following conditions: │
|
||||
│ │
|
||||
│ The above copyright notice and this permission notice shall be │
|
||||
│ included in all copies or substantial portions of the Software. │
|
||||
│ │
|
||||
│ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, │
|
||||
│ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF │
|
||||
│ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. │
|
||||
│ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY │
|
||||
│ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, │
|
||||
│ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE │
|
||||
│ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. │
|
||||
│ │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/intrin/aarch64/asmdefs.h"
|
||||
|
||||
#define __strrchr_aarch64 strrchr
|
||||
|
||||
.ident "\n\
|
||||
Optimized Routines (MIT License)\n\
|
||||
Copyright 2022 ARM Limited\n"
|
||||
.include "libc/disclaimer.inc"
|
||||
|
||||
/* Assumptions:
|
||||
*
|
||||
* ARMv8-a, AArch64
|
||||
* Neon Available.
|
||||
*/
|
||||
|
||||
/* Arguments and results. */
|
||||
#define srcin x0
|
||||
#define chrin w1
|
||||
|
||||
#define result x0
|
||||
|
||||
#define src x2
|
||||
#define tmp1 x3
|
||||
#define wtmp2 w4
|
||||
#define tmp3 x5
|
||||
#define src_match x6
|
||||
#define src_offset x7
|
||||
#define const_m1 x8
|
||||
#define tmp4 x9
|
||||
#define nul_match x10
|
||||
#define chr_match x11
|
||||
|
||||
#define vrepchr v0
|
||||
#define vdata1 v1
|
||||
#define vdata2 v2
|
||||
#define vhas_nul1 v3
|
||||
#define vhas_nul2 v4
|
||||
#define vhas_chr1 v5
|
||||
#define vhas_chr2 v6
|
||||
#define vrepmask_0 v7
|
||||
#define vrepmask_c v16
|
||||
#define vend1 v17
|
||||
#define vend2 v18
|
||||
|
||||
/* Core algorithm.
|
||||
|
||||
For each 32-byte hunk we calculate a 64-bit syndrome value, with
|
||||
two bits per byte (LSB is always in bits 0 and 1, for both big
|
||||
and little-endian systems). For each tuple, bit 0 is set iff
|
||||
the relevant byte matched the requested character; bit 1 is set
|
||||
iff the relevant byte matched the NUL end of string (we trigger
|
||||
off bit0 for the special case of looking for NUL). Since the bits
|
||||
in the syndrome reflect exactly the order in which things occur
|
||||
in the original string a count_trailing_zeros() operation will
|
||||
identify exactly which byte is causing the termination, and why. */
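The two-bit-per-byte syndrome can be modelled in scalar C as below; this is only an illustration of the encoding, not how the vector code actually builds it.

#include <stdint.h>

/* Bit 0 of each 2-bit pair: byte matched c; bit 1: byte was NUL.
   ctz(syndrome) / 2 is then the index of the first interesting byte.  */
static inline uint64_t syndrome32(const unsigned char *hunk, unsigned char c) {
  uint64_t s = 0;
  for (int i = 0; i < 32; ++i) {
    if (hunk[i] == c) s |= 1ull << (2 * i);
    if (hunk[i] == 0) s |= 2ull << (2 * i);
  }
  return s;
}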
|
||||
|
||||
ENTRY (__strrchr_aarch64)
|
||||
PTR_ARG (0)
|
||||
/* Magic constant 0x40100401 to allow us to identify which lane
|
||||
matches the requested byte. Magic constant 0x80200802 used
|
||||
similarly for NUL termination. */
|
||||
mov wtmp2, #0x0401
|
||||
movk wtmp2, #0x4010, lsl #16
|
||||
dup vrepchr.16b, chrin
|
||||
bic src, srcin, #31 /* Work with aligned 32-byte hunks. */
|
||||
dup vrepmask_c.4s, wtmp2
|
||||
mov src_offset, #0
|
||||
ands tmp1, srcin, #31
|
||||
add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */
|
||||
b.eq L(aligned)
|
||||
|
||||
/* Input string is not 32-byte aligned. Rather than forcing
|
||||
the padding bytes to a safe value, we calculate the syndrome
|
||||
for all the bytes, but then mask off those bits of the
|
||||
syndrome that are related to the padding. */
|
||||
ld1 {vdata1.16b, vdata2.16b}, [src], #32
|
||||
neg tmp1, tmp1
|
||||
cmeq vhas_nul1.16b, vdata1.16b, #0
|
||||
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
|
||||
cmeq vhas_nul2.16b, vdata2.16b, #0
|
||||
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
|
||||
and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
|
||||
and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
|
||||
and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
|
||||
and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
|
||||
addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b // 256->128
|
||||
addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
|
||||
addp vend1.16b, vhas_nul1.16b, vhas_chr1.16b // 128->64
|
||||
mov nul_match, vend1.d[0]
|
||||
lsl tmp1, tmp1, #1
|
||||
mov const_m1, #~0
|
||||
lsr tmp3, const_m1, tmp1
|
||||
mov chr_match, vend1.d[1]
|
||||
|
||||
bic nul_match, nul_match, tmp3 // Mask padding bits.
|
||||
bic chr_match, chr_match, tmp3 // Mask padding bits.
|
||||
cbnz nul_match, L(tail)
|
||||
|
||||
.p2align 4
|
||||
L(loop):
|
||||
cmp chr_match, #0
|
||||
csel src_match, src, src_match, ne
|
||||
csel src_offset, chr_match, src_offset, ne
|
||||
L(aligned):
|
||||
ld1 {vdata1.16b, vdata2.16b}, [src], #32
|
||||
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
|
||||
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
|
||||
uminp vend1.16b, vdata1.16b, vdata2.16b
|
||||
and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
|
||||
and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
|
||||
cmeq vend1.16b, vend1.16b, 0
|
||||
addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
|
||||
addp vend1.16b, vend1.16b, vhas_chr1.16b // 128->64
|
||||
mov nul_match, vend1.d[0]
|
||||
mov chr_match, vend1.d[1]
|
||||
cbz nul_match, L(loop)
|
||||
|
||||
cmeq vhas_nul1.16b, vdata1.16b, #0
|
||||
cmeq vhas_nul2.16b, vdata2.16b, #0
|
||||
and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
|
||||
and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
|
||||
addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b
|
||||
addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b
|
||||
mov nul_match, vhas_nul1.d[0]
|
||||
|
||||
L(tail):
|
||||
/* Work out exactly where the string ends. */
|
||||
sub tmp4, nul_match, #1
|
||||
eor tmp4, tmp4, nul_match
|
||||
ands chr_match, chr_match, tmp4
|
||||
/* And pick the values corresponding to the last match. */
|
||||
csel src_match, src, src_match, ne
|
||||
csel src_offset, chr_match, src_offset, ne
|
||||
|
||||
/* Count down from the top of the syndrome to find the last match. */
|
||||
clz tmp3, src_offset
|
||||
/* Src_match points beyond the word containing the match, so we can
|
||||
simply subtract half the bit-offset into the syndrome. Because
|
||||
we are counting down, we need to go back one more character. */
|
||||
add tmp3, tmp3, #2
|
||||
sub result, src_match, tmp3, lsr #1
|
||||
/* But if the syndrome shows no match was found, then return NULL. */
|
||||
cmp src_offset, #0
|
||||
csel result, result, xzr, ne
|
||||
|
||||
ret
|
||||
|
||||
END (__strrchr_aarch64)
|
43
libc/intrin/getauxval.c
Normal file
43
libc/intrin/getauxval.c
Normal file
|
@ -0,0 +1,43 @@
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
│vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi│
╞══════════════════════════════════════════════════════════════════════════════╡
│ Copyright 2020 Justine Alexandra Roberts Tunney │
│ │
│ Permission to use, copy, modify, and/or distribute this software for │
│ any purpose with or without fee is hereby granted, provided that the │
│ above copyright notice and this permission notice appear in all copies. │
│ │
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
│ PERFORMANCE OF THIS SOFTWARE. │
╚─────────────────────────────────────────────────────────────────────────────*/
#include "libc/runtime/runtime.h"
#include "libc/sysv/errfuns.h"

/**
 * Returns auxiliary value, or zero if kernel didn't provide it.
 *
 * This function is typically regarded as a libc implementation detail;
 * thus, the source code is the documentation.
 *
 * @return auxiliary value or 0 if `at` not found
 * @see libc/sysv/consts.sh
 * @see System Five Application Binary Interface § 3.4.3
 * @error ENOENT when value not found
 * @asyncsignalsafe
 */
unsigned long getauxval(unsigned long at) {
  unsigned long res, *ap;
  for (ap = __auxv; ap[0]; ap += 2) {
    if (at == ap[0]) {
      return ap[1];
    }
  }
  enoent();
  return 0;
}
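A minimal usage sketch (hypothetical caller; AT_PAGESZ is assumed to be provided by the generated sysv constants noted above):

unsigned long pagesz = getauxval(AT_PAGESZ);  /* 0 plus ENOENT when the entry is absent */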

@ -6,6 +6,7 @@ PKGS += LIBC_INTRIN
LIBC_INTRIN_ARTIFACTS += LIBC_INTRIN_A
LIBC_INTRIN = $(LIBC_INTRIN_A_DEPS) $(LIBC_INTRIN_A)
LIBC_INTRIN_A = o/$(MODE)/libc/intrin/intrin.a
LIBC_INTRIN_A_FILES := $(wildcard libc/intrin/*)
LIBC_INTRIN_A_HDRS = $(filter %.h,$(LIBC_INTRIN_A_FILES))
LIBC_INTRIN_A_INCS = $(filter %.inc,$(LIBC_INTRIN_A_FILES))
LIBC_INTRIN_A_SRCS_S = $(filter %.S,$(LIBC_INTRIN_A_FILES))

@ -13,8 +14,9 @@ LIBC_INTRIN_A_SRCS_C = $(filter %.c,$(LIBC_INTRIN_A_FILES))
LIBC_INTRIN_A_SRCS = $(LIBC_INTRIN_A_SRCS_S) $(LIBC_INTRIN_A_SRCS_C)
LIBC_INTRIN_A_CHECKS = $(LIBC_INTRIN_A).pkg

LIBC_INTRIN_A_FILES := \
	$(wildcard libc/intrin/*)
ifeq ($(ARCH), aarch64)
LIBC_INTRIN_A_SRCS_S += $(wildcard libc/intrin/aarch64/*.S)
endif

LIBC_INTRIN_A_OBJS = \
	$(LIBC_INTRIN_A_SRCS_S:%.S=o/$(MODE)/%.o) \

@ -203,6 +205,8 @@ o/$(MODE)/libc/intrin/memmove.o: private \
	-fpie

# these assembly files are safe to build on aarch64
o/$(MODE)/libc/intrin/aarch64/%.o: libc/intrin/aarch64/%.S
	@$(COMPILE) -AOBJECTIFY.S $(OBJECTIFY.S) $(OUTPUT_OPTION) -c $<
o/$(MODE)/libc/intrin/fenv.o: libc/intrin/fenv.S
	@$(COMPILE) -AOBJECTIFY.S $(OBJECTIFY.S) $(OUTPUT_OPTION) -c $<
o/$(MODE)/libc/intrin/futex.o: libc/intrin/futex.S

88
libc/intrin/memchr.c
Normal file
88
libc/intrin/memchr.c
Normal file
|
@ -0,0 +1,88 @@
|
|||
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
|
||||
│vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi│
|
||||
╞══════════════════════════════════════════════════════════════════════════════╡
|
||||
│ Copyright 2021 Justine Alexandra Roberts Tunney │
|
||||
│ │
|
||||
│ Permission to use, copy, modify, and/or distribute this software for │
|
||||
│ any purpose with or without fee is hereby granted, provided that the │
|
||||
│ above copyright notice and this permission notice appear in all copies. │
|
||||
│ │
|
||||
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │
|
||||
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │
|
||||
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │
|
||||
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │
|
||||
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │
|
||||
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │
|
||||
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
|
||||
│ PERFORMANCE OF THIS SOFTWARE. │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/dce.h"
|
||||
#include "libc/intrin/asan.internal.h"
|
||||
#include "libc/nexgen32e/x86feature.h"
|
||||
#include "libc/str/str.h"
|
||||
#ifndef __aarch64__
|
||||
|
||||
typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(1)));
|
||||
|
||||
static inline const unsigned char *memchr_pure(const unsigned char *s,
|
||||
unsigned char c, size_t n) {
|
||||
size_t i;
|
||||
for (i = 0; i < n; ++i) {
|
||||
if (s[i] == c) {
|
||||
return s + i;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
#ifdef __x86_64__
|
||||
noasan static inline const unsigned char *memchr_sse(const unsigned char *s,
|
||||
unsigned char c,
|
||||
size_t n) {
|
||||
size_t i;
|
||||
unsigned k;
|
||||
unsigned m;
|
||||
xmm_t v, *p;
|
||||
xmm_t t = {c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c};
|
||||
for (; n >= 16; n -= 16, s += 16) {
|
||||
v = *(const xmm_t *)s;
|
||||
m = __builtin_ia32_pmovmskb128(v == t);
|
||||
if (m) {
|
||||
m = __builtin_ctzll(m);
|
||||
return s + m;
|
||||
}
|
||||
}
|
||||
for (i = 0; i < n; ++i) {
|
||||
if (s[i] == c) {
|
||||
return s + i;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
/**
 * Returns pointer to first instance of character.
 *
 * @param s is memory to search
 * @param c is search byte which is masked with 255
 * @param n is byte length of s
 * @return pointer to first instance of c or NULL if not found
 * @asyncsignalsafe
 */
|
||||
void *memchr(const void *s, int c, size_t n) {
|
||||
#ifdef __x86_64__
|
||||
const void *r;
|
||||
if (!IsTiny() && X86_HAVE(SSE)) {
|
||||
if (IsAsan()) __asan_verify(s, n);
|
||||
r = memchr_sse(s, c, n);
|
||||
} else {
|
||||
r = memchr_pure(s, c, n);
|
||||
}
|
||||
return (void *)r;
|
||||
#else
|
||||
return memchr_pure(s, c, n);
|
||||
#endif
|
||||
}
|
||||
|
||||
#endif /* __aarch64__ */
|
|
@ -20,6 +20,7 @@
|
|||
#include "libc/intrin/likely.h"
|
||||
#include "libc/nexgen32e/x86feature.h"
|
||||
#include "libc/str/str.h"
|
||||
#ifndef __aarch64__
|
||||
|
||||
#define PMOVMSKB(x) __builtin_ia32_pmovmskb128(x)
|
||||
|
||||
|
@ -129,7 +130,9 @@ microarchitecture("avx") static int memcmp_avx(const unsigned char *p,
|
|||
* memcmp n=32768 29 ps/byte 32,851 mb/s
|
||||
* memcmp n=131072 33 ps/byte 28,983 mb/s
|
||||
*
|
||||
* @return unsigned char subtraction at stop index
|
||||
* @return an integer that's (1) equal to zero if `a` is equal to `b`,
|
||||
* (2) less than zero if `a` is less than `b`, or (3) greater than
|
||||
* zero if `a` is greater than `b`
|
||||
* @asyncsignalsafe
|
||||
*/
|
||||
int memcmp(const void *a, const void *b, size_t n) {
|
||||
|
@ -200,3 +203,5 @@ int memcmp(const void *a, const void *b, size_t n) {
|
|||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif /* __aarch64__ */
|
||||
|
|
|
@ -22,6 +22,7 @@
|
|||
#include "libc/nexgen32e/nexgen32e.h"
|
||||
#include "libc/nexgen32e/x86feature.h"
|
||||
#include "libc/str/str.h"
|
||||
#ifndef __aarch64__
|
||||
|
||||
typedef long long xmm_t __attribute__((__vector_size__(16), __aligned__(1)));
|
||||
typedef long long xmm_a __attribute__((__vector_size__(16), __aligned__(16)));
|
||||
|
@ -343,3 +344,5 @@ void *memmove(void *dst, const void *src, size_t n) {
|
|||
|
||||
asm("memcpy = memmove\n\t"
|
||||
".globl\tmemcpy");
|
||||
|
||||
#endif /* __aarch64__ */
|
||||
|
|
86
libc/intrin/memrchr.c
Normal file
86
libc/intrin/memrchr.c
Normal file
|
@ -0,0 +1,86 @@
|
|||
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
|
||||
│vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi│
|
||||
╞══════════════════════════════════════════════════════════════════════════════╡
|
||||
│ Copyright 2021 Justine Alexandra Roberts Tunney │
|
||||
│ │
|
||||
│ Permission to use, copy, modify, and/or distribute this software for │
|
||||
│ any purpose with or without fee is hereby granted, provided that the │
|
||||
│ above copyright notice and this permission notice appear in all copies. │
|
||||
│ │
|
||||
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │
|
||||
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │
|
||||
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │
|
||||
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │
|
||||
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │
|
||||
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │
|
||||
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
|
||||
│ PERFORMANCE OF THIS SOFTWARE. │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/dce.h"
|
||||
#include "libc/intrin/asan.internal.h"
|
||||
#include "libc/nexgen32e/x86feature.h"
|
||||
#include "libc/str/str.h"
|
||||
#ifndef __aarch64__
|
||||
|
||||
typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(1)));
|
||||
|
||||
static inline const unsigned char *memrchr_pure(const unsigned char *s,
|
||||
unsigned char c, size_t n) {
|
||||
size_t i;
|
||||
for (i = n; i--;) {
|
||||
if (s[i] == c) {
|
||||
return s + i;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
#ifdef __x86_64__
|
||||
noasan static inline const unsigned char *memrchr_sse(const unsigned char *s,
|
||||
unsigned char c,
|
||||
size_t n) {
|
||||
size_t i;
|
||||
unsigned k, m;
|
||||
xmm_t v, t = {c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c};
|
||||
for (i = n; i >= 16;) {
|
||||
v = *(const xmm_t *)(s + (i -= 16));
|
||||
m = __builtin_ia32_pmovmskb128(v == t);
|
||||
if (m) {
|
||||
m = __builtin_clzl(m) ^ (sizeof(long) * CHAR_BIT - 1);
|
||||
return s + i + m;
|
||||
}
|
||||
}
|
||||
while (i--) {
|
||||
if (s[i] == c) {
|
||||
return s + i;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
/**
 * Returns pointer to last instance of character.
 *
 * @param s is memory to search
 * @param c is search byte which is masked with 255
 * @param n is byte length of s
 * @return pointer to last instance of c or NULL if not found
 * @asyncsignalsafe
 */
|
||||
void *memrchr(const void *s, int c, size_t n) {
|
||||
#ifdef __x86_64__
|
||||
const void *r;
|
||||
if (!IsTiny() && X86_HAVE(SSE)) {
|
||||
if (IsAsan()) __asan_verify(s, n);
|
||||
r = memrchr_sse(s, c, n);
|
||||
} else {
|
||||
r = memrchr_pure(s, c, n);
|
||||
}
|
||||
return (void *)r;
|
||||
#else
|
||||
return memrchr_pure(s, c, n);
|
||||
#endif
|
||||
}
|
||||
|
||||
#endif /* __aarch64__ */
|
|
@ -22,6 +22,7 @@
|
|||
#include "libc/nexgen32e/nexgen32e.h"
|
||||
#include "libc/nexgen32e/x86feature.h"
|
||||
#include "libc/str/str.h"
|
||||
#ifndef __aarch64__
|
||||
|
||||
typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(1)));
|
||||
typedef long long xmm_a __attribute__((__vector_size__(16), __aligned__(16)));
|
||||
|
@ -168,3 +169,5 @@ void *memset(void *p, int c, size_t n) {
|
|||
return memset_sse(b, c, n);
|
||||
}
|
||||
}
|
||||
|
||||
#endif /* __aarch64__ */
|
||||
|
|
|
@ -17,6 +17,9 @@
|
|||
│ PERFORMANCE OF THIS SOFTWARE. │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/str/str.h"
|
||||
#ifndef __aarch64__
|
||||
|
||||
// TODO(jart): ASAN support here is important.
|
||||
|
||||
typedef char xmm_u __attribute__((__vector_size__(16), __aligned__(1)));
|
||||
typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(16)));
|
||||
|
@ -63,3 +66,5 @@ char *stpcpy(char *d, const char *s) {
|
|||
++i;
|
||||
}
|
||||
}
|
||||
|
||||
#endif /* __aarch64__ */
|
||||
|
|
120
libc/intrin/strchr.c
Normal file
120
libc/intrin/strchr.c
Normal file
|
@ -0,0 +1,120 @@
|
|||
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
|
||||
│vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi│
|
||||
╞══════════════════════════════════════════════════════════════════════════════╡
|
||||
│ Copyright 2021 Justine Alexandra Roberts Tunney │
|
||||
│ │
|
||||
│ Permission to use, copy, modify, and/or distribute this software for │
|
||||
│ any purpose with or without fee is hereby granted, provided that the │
|
||||
│ above copyright notice and this permission notice appear in all copies. │
|
||||
│ │
|
||||
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │
|
||||
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │
|
||||
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │
|
||||
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │
|
||||
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │
|
||||
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │
|
||||
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
|
||||
│ PERFORMANCE OF THIS SOFTWARE. │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/assert.h"
|
||||
#include "libc/dce.h"
|
||||
#include "libc/intrin/asan.internal.h"
|
||||
#include "libc/nexgen32e/x86feature.h"
|
||||
#include "libc/str/str.h"
|
||||
#ifndef __aarch64__
|
||||
|
||||
static inline const char *strchr_pure(const char *s, int c) {
|
||||
for (;; ++s) {
|
||||
if ((*s & 255) == (c & 255)) return s;
|
||||
if (!*s) return 0;
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef __x86_64__
|
||||
typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(16)));
|
||||
noasan static inline const char *strchr_sse(const char *s, unsigned char c) {
|
||||
unsigned k;
|
||||
unsigned m;
|
||||
xmm_t v, *p;
|
||||
xmm_t z = {0};
|
||||
xmm_t n = {c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c};
|
||||
k = (uintptr_t)s & 15;
|
||||
p = (const xmm_t *)((uintptr_t)s & -16);
|
||||
v = *p;
|
||||
m = __builtin_ia32_pmovmskb128((v == z) | (v == n));
|
||||
m >>= k;
|
||||
m <<= k;
|
||||
while (!m) {
|
||||
v = *++p;
|
||||
m = __builtin_ia32_pmovmskb128((v == z) | (v == n));
|
||||
}
|
||||
m = __builtin_ctzl(m);
|
||||
s = (const char *)p + m;
|
||||
if (c && !*s) s = 0;
|
||||
return s;
|
||||
}
|
||||
#endif
|
||||
|
||||
static noasan inline const char *strchr_x64(const char *p, uint64_t c) {
|
||||
unsigned a, b;
|
||||
uint64_t w, x, y;
|
||||
for (c *= 0x0101010101010101;; p += 8) {
|
||||
w = (uint64_t)(255 & p[7]) << 070 | (uint64_t)(255 & p[6]) << 060 |
|
||||
(uint64_t)(255 & p[5]) << 050 | (uint64_t)(255 & p[4]) << 040 |
|
||||
(uint64_t)(255 & p[3]) << 030 | (uint64_t)(255 & p[2]) << 020 |
|
||||
(uint64_t)(255 & p[1]) << 010 | (uint64_t)(255 & p[0]) << 000;
|
||||
if ((x = ~(w ^ c) & ((w ^ c) - 0x0101010101010101) & 0x8080808080808080) |
|
||||
(y = ~w & (w - 0x0101010101010101) & 0x8080808080808080)) {
|
||||
if (x) {
|
||||
a = __builtin_ctzll(x);
|
||||
if (y) {
|
||||
b = __builtin_ctzll(y);
|
||||
if (a <= b) {
|
||||
return p + (a >> 3);
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
} else {
|
||||
return p + (a >> 3);
|
||||
}
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns pointer to first instance of character.
|
||||
*
|
||||
* @param s is a NUL-terminated string
|
||||
* @param c is masked with 255 as byte to search for
|
||||
* @return pointer to first instance of c or NULL if not found
|
||||
* noting that if c is NUL we return pointer to terminator
|
||||
* @asyncsignalsafe
|
||||
* @vforksafe
|
||||
*/
|
||||
char *strchr(const char *s, int c) {
|
||||
#ifdef __x86_64__
|
||||
const char *r;
|
||||
if (X86_HAVE(SSE)) {
|
||||
if (IsAsan()) __asan_verify(s, 1);
|
||||
r = strchr_sse(s, c);
|
||||
} else {
|
||||
r = strchr_pure(s, c);
|
||||
}
|
||||
_unassert(!r || *r || !(c & 255));
|
||||
return (char *)r;
|
||||
#else
|
||||
char *r;
|
||||
for (c &= 255; (uintptr_t)s & 7; ++s) {
|
||||
if ((*s & 255) == c) return s;
|
||||
if (!*s) return NULL;
|
||||
}
|
||||
r = strchr_x64(s, c);
|
||||
_unassert(!r || *r || !c);
|
||||
return r;
|
||||
#endif
|
||||
}
|
||||
|
||||
#endif /* __aarch64__ */
|
118
libc/intrin/strchrnul.c
Normal file
118
libc/intrin/strchrnul.c
Normal file
|
@ -0,0 +1,118 @@
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
│vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi│
╞══════════════════════════════════════════════════════════════════════════════╡
│ Copyright 2021 Justine Alexandra Roberts Tunney │
│ │
│ Permission to use, copy, modify, and/or distribute this software for │
│ any purpose with or without fee is hereby granted, provided that the │
│ above copyright notice and this permission notice appear in all copies. │
│ │
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
│ PERFORMANCE OF THIS SOFTWARE. │
╚─────────────────────────────────────────────────────────────────────────────*/
#include "libc/assert.h"
#include "libc/dce.h"
#include "libc/intrin/asan.internal.h"
#include "libc/nexgen32e/x86feature.h"
#include "libc/str/str.h"
#ifndef __aarch64__

static inline const char *strchrnul_pure(const char *s, int c) {
  for (;; ++s) {
    if ((*s & 255) == (c & 255)) return s;
    if (!*s) return s;
  }
}

#ifdef __x86_64__
typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(16)));
noasan static inline const char *strchrnul_sse(const char *s,
                                               unsigned char c) {
  unsigned k;
  unsigned m;
  xmm_t v;
  const xmm_t *p;
  xmm_t z = {0};
  xmm_t n = {c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c};
  k = (uintptr_t)s & 15;
  p = (const xmm_t *)((uintptr_t)s & -16);
  v = *p;
  m = __builtin_ia32_pmovmskb128((v == z) | (v == n));
  m >>= k;
  m <<= k;
  while (!m) {
    v = *++p;
    m = __builtin_ia32_pmovmskb128((v == z) | (v == n));
  }
  return (const char *)p + __builtin_ctzl(m);
}
#endif

noasan static const char *strchrnul_x64(const char *p, uint64_t c) {
  unsigned a, b;
  uint64_t w, x, y;
  for (c *= 0x0101010101010101;; p += 8) {
    w = (uint64_t)(255 & p[7]) << 070 | (uint64_t)(255 & p[6]) << 060 |
        (uint64_t)(255 & p[5]) << 050 | (uint64_t)(255 & p[4]) << 040 |
        (uint64_t)(255 & p[3]) << 030 | (uint64_t)(255 & p[2]) << 020 |
        (uint64_t)(255 & p[1]) << 010 | (uint64_t)(255 & p[0]) << 000;
    if ((x = ~(w ^ c) & ((w ^ c) - 0x0101010101010101) & 0x8080808080808080) |
        (y = ~w & (w - 0x0101010101010101) & 0x8080808080808080)) {
      if (x) {
        a = __builtin_ctzll(x);
        if (y) {
          b = __builtin_ctzll(y);
          if (a <= b) {
            return p + (a >> 3);
          } else {
            return p + (b >> 3);
          }
        } else {
          return p + (a >> 3);
        }
      } else {
        b = __builtin_ctzll(y);
        return p + (b >> 3);
      }
    }
  }
}

/**
 * Returns pointer to first instance of character.
 *
 * If c is not found then a pointer to the NUL byte is returned.
 *
 * @param s is a NUL-terminated string
 * @param c is masked with 255 as byte to search for
 * @return pointer to first instance of c, or pointer to
 *     NUL terminator if c is not found
 */
char *strchrnul(const char *s, int c) {
#ifdef __x86_64__
  const char *r;
  if (X86_HAVE(SSE)) {
    if (IsAsan()) __asan_verify(s, 1);
    r = strchrnul_sse(s, c);
  } else {
    r = strchrnul_pure(s, c);
  }
  _unassert((*r & 255) == (c & 255) || !*r);
  return (char *)r;
#else
  char *r;
  for (c &= 255; (uintptr_t)s & 7; ++s) {
    if ((*s & 0xff) == c) return (char *)s;
    if (!*s) return (char *)s;
  }
  r = (char *)strchrnul_x64(s, c);
  _unassert((*r & 255) == c || !*r);
  return r;
#endif
}

#endif /* __aarch64__ */
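For reference, a small usage sketch of strchrnul: because it never returns NULL, it can drive a delimiter-splitting loop without a separate strlen or strchr pass. This assumes the usual GNU-style prototype declared in libc/str/str.h (on glibc it needs _GNU_SOURCE); the example values are hypothetical.

// Sketch only: split a colon-separated list using strchrnul.
#define _GNU_SOURCE  // exposes strchrnul on glibc; harmless elsewhere
#include <stdio.h>
#include <string.h>

int main(void) {
  const char *path = "/bin:/usr/bin:/usr/local/bin";
  const char *p = path;
  for (;;) {
    const char *e = strchrnul(p, ':');     // delimiter or the NUL terminator
    printf("%.*s\n", (int)(e - p), p);     // print one field
    if (!*e) break;                        // reached end of string
    p = e + 1;
  }
  return 0;
}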
@@ -17,6 +17,9 @@
│ PERFORMANCE OF THIS SOFTWARE. │
╚─────────────────────────────────────────────────────────────────────────────*/
#include "libc/str/str.h"
#ifndef __aarch64__

// TODO(jart): ASAN support here is important.

typedef char xmm_u __attribute__((__vector_size__(16), __aligned__(1)));
typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(16)));

@@ -63,3 +66,5 @@ char *strcpy(char *d, const char *s) {
    ++i;
  }
}

#endif /* __aarch64__ */
@@ -19,6 +19,7 @@
#include "libc/dce.h"
#include "libc/intrin/asan.internal.h"
#include "libc/str/str.h"
#ifndef __aarch64__

/**
 * Returns length of NUL-terminated string.

@@ -61,3 +62,5 @@ noasan size_t strlen(const char *s) {
  return n;
#endif
}

#endif /* __aarch64__ */
37
libc/intrin/strncmp.c
Normal file
@@ -0,0 +1,37 @@
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
│vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi│
╞══════════════════════════════════════════════════════════════════════════════╡
│ Copyright 2020 Justine Alexandra Roberts Tunney │
│ │
│ Permission to use, copy, modify, and/or distribute this software for │
│ any purpose with or without fee is hereby granted, provided that the │
│ above copyright notice and this permission notice appear in all copies. │
│ │
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
│ PERFORMANCE OF THIS SOFTWARE. │
╚─────────────────────────────────────────────────────────────────────────────*/
#include "libc/str/str.h"
#ifndef __aarch64__

/**
 * Compares NUL-terminated strings w/ limit.
 *
 * @param a is first non-null NUL-terminated string pointer
 * @param b is second non-null NUL-terminated string pointer
 * @return is <0, 0, or >0 based on uint8_t comparison
 * @asyncsignalsafe
 */
int strncmp(const char *a, const char *b, size_t n) {
  size_t i = 0;
  if (!n-- || a == b) return 0;
  while (i < n && a[i] == b[i] && b[i]) ++i;
  return (a[i] & 0xff) - (b[i] & 0xff);
}

#endif /* __aarch64__ */
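A short usage sketch of strncmp as defined above: comparison stops at the first difference, at a NUL, or after n bytes, whichever comes first, and the result is a plain uint8_t difference. The example values are illustrative only.

// Sketch only: typical prefix test and bounded comparison with strncmp.
#include <stdio.h>
#include <string.h>

int main(void) {
  const char *s = "aarch64-linux-gnu";
  printf("%d\n", strncmp(s, "aarch64", 7) == 0);  // 1: first 7 bytes match
  printf("%d\n", strncmp("abc", "abd", 2));       // 0: difference lies past n
  printf("%d\n", strncmp("abc", "abd", 3) < 0);   // 1: 'c' < 'd' as uint8_t
  return 0;
}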
61
libc/intrin/strnlen.c
Normal file
@@ -0,0 +1,61 @@
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
│vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi│
╞══════════════════════════════════════════════════════════════════════════════╡
│ Copyright 2021 Justine Alexandra Roberts Tunney │
│ │
│ Permission to use, copy, modify, and/or distribute this software for │
│ any purpose with or without fee is hereby granted, provided that the │
│ above copyright notice and this permission notice appear in all copies. │
│ │
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
│ PERFORMANCE OF THIS SOFTWARE. │
╚─────────────────────────────────────────────────────────────────────────────*/
#include "libc/assert.h"
#include "libc/dce.h"
#include "libc/intrin/asan.internal.h"
#include "libc/intrin/bits.h"
#include "libc/str/str.h"
#ifndef __aarch64__

static noasan size_t strnlen_x64(const char *s, size_t n, size_t i) {
  uint64_t w;
  for (; i + 8 < n; i += 8) {
    w = *(uint64_t *)(s + i);
    if ((w = ~w & (w - 0x0101010101010101) & 0x8080808080808080)) {
      i += (unsigned)__builtin_ctzll(w) >> 3;
      break;
    }
  }
  return i;
}

/**
 * Returns length of NUL-terminated string w/ limit.
 *
 * @param s is string
 * @param n is max length
 * @return byte length
 * @asyncsignalsafe
 */
noasan size_t strnlen(const char *s, size_t n) {
  size_t i;
  if (IsAsan() && n) __asan_verify(s, 1);
  for (i = 0; (uintptr_t)(s + i) & 7; ++i) {
    if (i == n || !s[i]) return i;
  }
  i = strnlen_x64(s, n, i);
  for (;; ++i) {
    if (i == n || !s[i]) break;
  }
  _unassert(i == n || (i < n && !s[i]));
  if (IsAsan()) __asan_verify(s, i);
  return i;
}

#endif /* __aarch64__ */
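A brief usage sketch of strnlen: bounding the scan is the usual way to measure a fixed-size field that may lack a NUL terminator. The example buffer is hypothetical.

// Sketch only: strnlen never reads past the given limit.
#include <stdio.h>
#include <string.h>

int main(void) {
  char field[8] = {'u', 't', 's', 'n', 'a', 'm', 'e', '!'};  // no NUL byte
  printf("%zu\n", strnlen(field, sizeof(field)));            // 8, not a crash
  printf("%zu\n", strnlen("hi", 16));                        // 2
  return 0;
}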
34
libc/intrin/strrchr.c
Normal file
@@ -0,0 +1,34 @@
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
│vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi│
╞══════════════════════════════════════════════════════════════════════════════╡
│ Copyright 2020 Justine Alexandra Roberts Tunney │
│ │
│ Permission to use, copy, modify, and/or distribute this software for │
│ any purpose with or without fee is hereby granted, provided that the │
│ above copyright notice and this permission notice appear in all copies. │
│ │
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
│ PERFORMANCE OF THIS SOFTWARE. │
╚─────────────────────────────────────────────────────────────────────────────*/
#include "libc/str/str.h"
#ifndef __aarch64__

/**
 * Searches for last instance of character in string.
 *
 * @param s is NUL-terminated string to search
 * @param c is treated as unsigned char
 * @return address of last c in s, or NULL if not found
 * @asyncsignalsafe
 */
char *strrchr(const char *s, int c) {
  return memrchr(s, c, strlen(s));
}

#endif /* __aarch64__ */
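A usage sketch of strrchr in its most common role, trimming a path down to its final component; the example path is illustrative only.

// Sketch only: basename-style use of strrchr.
#include <stdio.h>
#include <string.h>

int main(void) {
  const char *path = "libc/intrin/strrchr.c";
  const char *slash = strrchr(path, '/');    // last '/' or NULL if absent
  printf("%s\n", slash ? slash + 1 : path);  // prints "strrchr.c"
  return 0;
}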