mirror of
https://github.com/jart/cosmopolitan.git
synced 2025-07-19 09:00:31 +00:00
Make AARCH64 harder, better, faster, stronger
- Perform some housekeeping on scalar math function code - Import ARM's Optimized Routines for SIMD string processing - Upgrade to latest Chromium zlib and enable more SIMD optimizations
This commit is contained in:
parent
550b52abf6
commit
cc1732bc42
143 changed files with 15661 additions and 1329 deletions
143
libc/intrin/aarch64/memset.S
Normal file
143
libc/intrin/aarch64/memset.S
Normal file
|
@ -0,0 +1,143 @@
|
|||
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
|
||||
│vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi│
|
||||
╚──────────────────────────────────────────────────────────────────────────────╝
|
||||
│ │
|
||||
│ Optimized Routines │
|
||||
│ Copyright (c) 1999-2022, Arm Limited. │
|
||||
│ │
|
||||
│ Permission is hereby granted, free of charge, to any person obtaining │
|
||||
│ a copy of this software and associated documentation files (the │
|
||||
│ "Software"), to deal in the Software without restriction, including │
|
||||
│ without limitation the rights to use, copy, modify, merge, publish, │
|
||||
│ distribute, sublicense, and/or sell copies of the Software, and to │
|
||||
│ permit persons to whom the Software is furnished to do so, subject to │
|
||||
│ the following conditions: │
|
||||
│ │
|
||||
│ The above copyright notice and this permission notice shall be │
|
||||
│ included in all copies or substantial portions of the Software. │
|
||||
│ │
|
||||
│ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, │
|
||||
│ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF │
|
||||
│ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. │
|
||||
│ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY │
|
||||
│ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, │
|
||||
│ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE │
|
||||
│ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. │
|
||||
│ │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/intrin/aarch64/asmdefs.h"
|
||||
|
||||
#define __memset_aarch64 memset
|
||||
|
||||
.ident "\n\
|
||||
Optimized Routines (MIT License)\n\
|
||||
Copyright 2022 ARM Limited\n"
|
||||
.include "libc/disclaimer.inc"
|
||||
|
||||
/* Assumptions:
|
||||
*
|
||||
* ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
|
||||
*
|
||||
*/
|
||||
|
||||
#define dstin x0
|
||||
#define val x1
|
||||
#define valw w1
|
||||
#define count x2
|
||||
#define dst x3
|
||||
#define dstend x4
|
||||
#define zva_val x5
|
||||
|
||||
ENTRY (__memset_aarch64)
|
||||
PTR_ARG (0)
|
||||
SIZE_ARG (2)
|
||||
|
||||
dup v0.16B, valw
|
||||
add dstend, dstin, count
|
||||
|
||||
cmp count, 96
|
||||
b.hi L(set_long)
|
||||
cmp count, 16
|
||||
b.hs L(set_medium)
|
||||
mov val, v0.D[0]
|
||||
|
||||
/* Set 0..15 bytes. */
|
||||
tbz count, 3, 1f
|
||||
str val, [dstin]
|
||||
str val, [dstend, -8]
|
||||
ret
|
||||
.p2align 4
|
||||
1: tbz count, 2, 2f
|
||||
str valw, [dstin]
|
||||
str valw, [dstend, -4]
|
||||
ret
|
||||
2: cbz count, 3f
|
||||
strb valw, [dstin]
|
||||
tbz count, 1, 3f
|
||||
strh valw, [dstend, -2]
|
||||
3: ret
|
||||
|
||||
/* Set 17..96 bytes. */
|
||||
L(set_medium):
|
||||
str q0, [dstin]
|
||||
tbnz count, 6, L(set96)
|
||||
str q0, [dstend, -16]
|
||||
tbz count, 5, 1f
|
||||
str q0, [dstin, 16]
|
||||
str q0, [dstend, -32]
|
||||
1: ret
|
||||
|
||||
.p2align 4
|
||||
/* Set 64..96 bytes. Write 64 bytes from the start and
|
||||
32 bytes from the end. */
|
||||
L(set96):
|
||||
str q0, [dstin, 16]
|
||||
stp q0, q0, [dstin, 32]
|
||||
stp q0, q0, [dstend, -32]
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
L(set_long):
|
||||
and valw, valw, 255
|
||||
bic dst, dstin, 15
|
||||
str q0, [dstin]
|
||||
cmp count, 160
|
||||
ccmp valw, 0, 0, hs
|
||||
b.ne L(no_zva)
|
||||
|
||||
#ifndef SKIP_ZVA_CHECK
|
||||
mrs zva_val, dczid_el0
|
||||
and zva_val, zva_val, 31
|
||||
cmp zva_val, 4 /* ZVA size is 64 bytes. */
|
||||
b.ne L(no_zva)
|
||||
#endif
|
||||
str q0, [dst, 16]
|
||||
stp q0, q0, [dst, 32]
|
||||
bic dst, dst, 63
|
||||
sub count, dstend, dst /* Count is now 64 too large. */
|
||||
sub count, count, 128 /* Adjust count and bias for loop. */
|
||||
|
||||
.p2align 4
|
||||
L(zva_loop):
|
||||
add dst, dst, 64
|
||||
dc zva, dst
|
||||
subs count, count, 64
|
||||
b.hi L(zva_loop)
|
||||
stp q0, q0, [dstend, -64]
|
||||
stp q0, q0, [dstend, -32]
|
||||
ret
|
||||
|
||||
L(no_zva):
|
||||
sub count, dstend, dst /* Count is 16 too large. */
|
||||
sub dst, dst, 16 /* Dst is biased by -32. */
|
||||
sub count, count, 64 + 16 /* Adjust count and bias for loop. */
|
||||
L(no_zva_loop):
|
||||
stp q0, q0, [dst, 32]
|
||||
stp q0, q0, [dst, 64]!
|
||||
subs count, count, 64
|
||||
b.hi L(no_zva_loop)
|
||||
stp q0, q0, [dstend, -64]
|
||||
stp q0, q0, [dstend, -32]
|
||||
ret
|
||||
|
||||
END (__memset_aarch64)
|
Loading…
Add table
Add a link
Reference in a new issue