mirror of
https://github.com/jart/cosmopolitan.git
synced 2025-07-28 21:40:30 +00:00
Make quality improvements
- Write some more unit tests
- memcpy() on ARM is now faster
- Address the Musl complex math FIXME comments
- Some libm funcs like pow() now support setting errno
- Import the latest and greatest math functions from ARM
- Use more accurate atan2f() and log1pf() implementations
- atoi() and atol() will no longer saturate or clobber errno
This commit is contained in:
parent
af8f2bd19f
commit
592f6ebc20
122 changed files with 6305 additions and 3859 deletions
|
@ -3,7 +3,7 @@
|
|||
╚──────────────────────────────────────────────────────────────────────────────╝
|
||||
│ │
|
||||
│ Optimized Routines │
|
||||
│ Copyright (c) 1999-2022, Arm Limited. │
|
||||
│ Copyright (c) 2018-2024, Arm Limited. │
|
||||
│ │
|
||||
│ Permission is hereby granted, free of charge, to any person obtaining │
|
||||
│ a copy of this software and associated documentation files (the │
|
||||
|
@ -80,11 +80,12 @@ ENTRY (__memcpy_aarch64_simd)
|
|||
PTR_ARG (1)
|
||||
SIZE_ARG (2)
|
||||
add srcend, src, count
|
||||
add dstend, dstin, count
|
||||
cmp count, 128
|
||||
b.hi L(copy_long)
|
||||
add dstend, dstin, count
|
||||
cmp count, 32
|
||||
b.hi L(copy32_128)
|
||||
nop
|
||||
|
||||
/* Small copies: 0..32 bytes. */
|
||||
cmp count, 16
|
||||
|
@ -95,6 +96,18 @@ ENTRY (__memcpy_aarch64_simd)
|
|||
str B_q, [dstend, -16]
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
/* Medium copies: 33..128 bytes. */
|
||||
L(copy32_128):
|
||||
ldp A_q, B_q, [src]
|
||||
ldp C_q, D_q, [srcend, -32]
|
||||
cmp count, 64
|
||||
b.hi L(copy128)
|
||||
stp A_q, B_q, [dstin]
|
||||
stp C_q, D_q, [dstend, -32]
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
/* Copy 8-15 bytes. */
|
||||
L(copy16):
|
||||
tbz count, 3, L(copy8)
|
||||
|
@ -104,7 +117,6 @@ L(copy16):
|
|||
str A_h, [dstend, -8]
|
||||
ret
|
||||
|
||||
.p2align 3
|
||||
/* Copy 4-7 bytes. */
|
||||
L(copy8):
|
||||
tbz count, 2, L(copy4)
|
||||
|
@ -114,6 +126,19 @@ L(copy8):
|
|||
str B_lw, [dstend, -4]
|
||||
ret
|
||||
|
||||
/* Copy 65..128 bytes. */
|
||||
L(copy128):
|
||||
ldp E_q, F_q, [src, 32]
|
||||
cmp count, 96
|
||||
b.ls L(copy96)
|
||||
ldp G_q, H_q, [srcend, -64]
|
||||
stp G_q, H_q, [dstend, -64]
|
||||
L(copy96):
|
||||
stp A_q, B_q, [dstin]
|
||||
stp E_q, F_q, [dstin, 32]
|
||||
stp C_q, D_q, [dstend, -32]
|
||||
ret
|
||||
|
||||
/* Copy 0..3 bytes using a branchless sequence. */
|
||||
L(copy4):
|
||||
cbz count, L(copy0)
|
||||
|
@ -127,33 +152,11 @@ L(copy4):
|
|||
L(copy0):
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
/* Medium copies: 33..128 bytes. */
|
||||
L(copy32_128):
|
||||
ldp A_q, B_q, [src]
|
||||
ldp C_q, D_q, [srcend, -32]
|
||||
cmp count, 64
|
||||
b.hi L(copy128)
|
||||
stp A_q, B_q, [dstin]
|
||||
stp C_q, D_q, [dstend, -32]
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
/* Copy 65..128 bytes. */
|
||||
L(copy128):
|
||||
ldp E_q, F_q, [src, 32]
|
||||
cmp count, 96
|
||||
b.ls L(copy96)
|
||||
ldp G_q, H_q, [srcend, -64]
|
||||
stp G_q, H_q, [dstend, -64]
|
||||
L(copy96):
|
||||
stp A_q, B_q, [dstin]
|
||||
stp E_q, F_q, [dstin, 32]
|
||||
stp C_q, D_q, [dstend, -32]
|
||||
ret
|
||||
|
||||
.p2align 3
|
||||
/* Copy more than 128 bytes. */
|
||||
L(copy_long):
|
||||
add dstend, dstin, count
|
||||
|
||||
/* Use backwards copy if there is an overlap. */
|
||||
sub tmp1, dstin, src
|
||||
cmp tmp1, count
|
||||
|
@ -190,6 +193,9 @@ L(copy64_from_end):
|
|||
stp A_q, B_q, [dstend, -32]
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
nop
|
||||
|
||||
/* Large backwards copy for overlapping copies.
|
||||
Copy 16 bytes and then align srcend to 16-byte alignment. */
|
||||
L(copy_long_backwards):
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue