Make more progress on aarch64

This commit is contained in:
Justine Tunney 2023-05-03 00:00:09 -07:00
parent 135080fd3e
commit aef9a69a60
No known key found for this signature in database
GPG key ID: BE714B4575D6E328
42 changed files with 563 additions and 387 deletions

View file

@ -1,69 +0,0 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Copyright 2020 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/macros.internal.h"
// Generates lookup table for computing CRC-32 byte-by-byte.
//
// void crc32init(uint32_t table[256], uint32_t polynomial) {
// uint32_t d, i, r;
// for (d = 0; d < 256; ++d) {
// r = d;
// for (i = 0; i < 8; ++i) {
// r = r >> 1 ^ (r & 1 ? polynomial : 0);
// }
// table[d] = r;
// }
// }
//
// @param rdi is pointer to uint32_t[256] array
// @param esi 32-bit binary polynomial config
// @note imposes ~300ns one-time cost
crc32init:
// Frame setup; the `leave` at the end also releases the 16 bytes of
// scratch stack space carved out by the 0: loop below.
push %rbp
mov %rsp,%rbp
.profilable
// rdx = address one past the last of the 256 four-byte entries.
lea 256*4(%rdi),%rdx
movd %esi,%xmm0
pshufd $0,%xmm0,%xmm0 # (uint32_t[]){esi,esi,esi,esi} %xmm0
// NOTE(review): pushpop is a project macro; presumably it loads the
// constant 4 into rax. The loop below relies on eax starting at 4.
pushpop 4,%rax
movd %eax,%xmm2 # (int[]){4,4,4,4} %xmm2
pshufd $0,%xmm2,%xmm2
// Store 3, 2, 1, 0 as dwords at descending addresses, so that the 16
// bytes at rsp read 0,1,2,3 in ascending order. The mov does not
// touch flags, so jnz still tests the result of dec.
0: sub $4,%rsp # (int[]){0,1,2,3} %xmm1
dec %eax
mov %eax,(%rsp)
jnz 0b
movdqu (%rsp),%xmm1
// Outer loop: xmm1 holds four consecutive table indices d..d+3.
1: mov $8,%ecx
movdqa %xmm1,%xmm3
// Inner loop, one bit per pass: r = r >> 1 ^ (r & 1 ? polynomial : 0)
// evaluated on all four lanes of xmm3.
2: movdqa %xmm3,%xmm4
psrld $1,%xmm4
// Shifting left by 31 and then arithmetically right by 31 smears
// bit 0 across each lane, giving an all-ones or all-zeros mask.
pslld $31,%xmm3
psrad $31,%xmm3
pand %xmm0,%xmm3
pxor %xmm4,%xmm3
movdqa %xmm3,%xmm4
// NOTE(review): .loop is a project macro; presumably it decrements
// ecx and branches back to 2 until the 8 passes are done.
.loop 2b
// Store four finished entries, then advance the output pointer and
// bump every lane index by 4. The unsigned jb keeps looping while
// rdi is still below the end address in rdx.
movdqu %xmm3,(%rdi)
add $16,%rdi
paddd %xmm2,%xmm1
cmp %rdx,%rdi
jb 1b
leave
ret
.endfn crc32init,globl

View file

@ -1,7 +1,7 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 sw=8 fenc=utf-8 :vi
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi
Copyright 2020 Justine Alexandra Roberts Tunney
Copyright 2023 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
@ -16,29 +16,15 @@
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/macros.internal.h"
#include "libc/nexgen32e/nexgen32e.h"
.initbss 300,_init_kToUpper
// ASCII lowercase → uppercase translation tables.
//
// char kToUpper[256];
//
// The 256 bytes are emitted as zeros here; presumably they are filled
// in at startup by the _init_kToUpper section below (TODO confirm how
// .initbss pairs the two).
//
// @see kToLower
kToUpper:
.rept 256
.byte 0
.endr
.endobj kToUpper,globl,hidden
.previous
.init.start 300,_init_kToUpper
// Remember the current rdi across the call.
push %rdi
// NOTE(review): imapxlatab is defined elsewhere; presumably it writes
// the identity mapping 0..255 at rdi — confirm.
call imapxlatab
// rsi = the rdi value saved above; the old rsi is parked on the stack
// and restored by the pop at the end.
xchg %rsi,(%rsp)
xor %ecx,%ecx
// For ecx = 1..26, subtract 0x20 from the byte at offset 'a'-1+rcx,
// i.e. the 26 entries at offsets 'a' through 'z'.
0: inc %ecx
subb $0x20,'a'-1(%rsi,%rcx)
cmp $'z'-'a'+1,%ecx
jne 0b
pop %rsi
.init.end 300,_init_kToUpper
/**
 * Fills a 256-entry lookup table for byte-at-a-time CRC-32.
 *
 * Each entry is its own index pushed through eight right-shift steps,
 * where the polynomial is XORed in whenever the bit shifted out is 1.
 *
 * @param table receives the 256 computed entries
 * @param polynomial is the 32-bit polynomial, applied in right-shift
 *     form as shown by the loop below
 */
void crc32init(uint32_t table[256], uint32_t polynomial) {
  uint32_t byte;
  for (byte = 0; byte != 256; ++byte) {
    uint32_t crc = byte;
    int remaining = 8;
    while (remaining--) {
      /* all ones when the low bit is set, otherwise zero */
      uint32_t mask = (uint32_t)0 - (crc & 1u);
      crc = (crc >> 1) ^ (polynomial & mask);
    }
    table[byte] = crc;
  }
}

View file

@ -1,48 +0,0 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 sw=8 fenc=utf-8 :vi
Copyright 2020 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/macros.internal.h"
.initbss 300,_init_kToLower
// ASCII uppercase → lowercase translation tables.
//
// char kToLower[256];
//
// The 256 bytes are emitted as zeros here; presumably they are filled
// in at startup by the _init_kToLower section below (TODO confirm how
// .initbss pairs the two).
//
// @see kToUpper
kToLower:
.rept 256
.byte 0
.endr
.endobj kToLower,globl,hidden
.previous
.init.start 300,_init_kToLower
// Remember the current rdi across the call.
push %rdi
// NOTE(review): imapxlatab is defined elsewhere; presumably it writes
// the identity mapping 0..255 at rdi — confirm.
call imapxlatab
// rsi = the rdi value saved above; the old rsi is parked on the stack
// and restored by the pop at the end.
xchg %rsi,(%rsp)
xor %ecx,%ecx
// For ecx = 1..26, add 0x20 to the byte at offset 'A'-1+rcx, i.e. the
// 26 entries at offsets 'A' through 'Z'.
0: inc %ecx
addb $0x20,'A'-1(%rsi,%rcx)
cmp $'Z'-'A'+1,%ecx
jne 0b
pop %rsi
.init.end 300,_init_kToLower
// Export gperf_downcase as a global object symbol with the same value
// as kToLower.
.type gperf_downcase,@object
.globl gperf_downcase
gperf_downcase = kToLower

41
libc/nexgen32e/ktolower.c Normal file
View file

@ -0,0 +1,41 @@
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi
Copyright 2023 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/str/tab.internal.h"
/**
 * Byte-indexed ASCII lowercase translation table.
 *
 * Entries 'A'..'Z' (65..90) hold 'a'..'z'. Every other index holds its
 * own value, including the bytes 128..255.
 */
const uint8_t kToLower[256] = {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27,
28, 29, 30, 31, ' ', '!', '\"', '#', '$', '%', '&', '\'', '(', ')',
'*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7',
'8', '9', ':', ';', '<', '=', '>', '?', '@', 'a', 'b', 'c', 'd', 'e',
'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's',
't', 'u', 'v', 'w', 'x', 'y', 'z', '[', '\\', ']', '^', '_', '`', 'a',
'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}',
'~', 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153,
154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167,
168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181,
182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195,
196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237,
238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251,
252, 253, 254, 255,
};

41
libc/nexgen32e/ktoupper.c Normal file
View file

@ -0,0 +1,41 @@
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi
Copyright 2023 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/str/tab.internal.h"
/**
 * Byte-indexed ASCII uppercase translation table.
 *
 * Entries 'a'..'z' (97..122) hold 'A'..'Z'. Every other index holds its
 * own value, including the bytes 128..255.
 */
const uint8_t kToUpper[256] = {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27,
28, 29, 30, 31, ' ', '!', '\"', '#', '$', '%', '&', '\'', '(', ')',
'*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7',
'8', '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E',
'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S',
'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '_', '`', 'A',
'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '{', '|', '}',
'~', 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153,
154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167,
168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181,
182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195,
196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237,
238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251,
252, 253, 254, 255,
};

View file

@ -1,158 +0,0 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 sw=8 fenc=utf-8 :vi
Copyright 2020 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/nexgen32e/x86feature.h"
#include "libc/macros.internal.h"
// Returns pointer to first instance of character.
//
// @param rdi is a non-null NUL-terminated char16_t string pointer
// @param esi is the search word
// @return rax points to character, or to NUL word if not found
// @note this won't return NULL if search character is NUL
strchrnul16:
.leafprologue
.profilable
// r9 = all ones. strsak16 ANDs its result with r9 when it stops on
// the second search value, so the pointer is kept rather than zeroed.
or $-1,%r9
// Continue at the next `0:` label, which sits in strchr16 just after
// that function's own r9 setup.
jmp 0f
.endfn strchrnul16,globl
// Returns pointer to first instance of character.
//
// @param rdi is a non-null NUL-terminated char16_t string pointer
// @param esi is the search word
// @return rax points to first result, or NULL if not found
// @note this won't return NULL if search character is NUL
// @asyncsignalsafe
strchr16:
.leafprologue
.profilable
// r9 = 0, so stopping on the second search value (NUL) yields NULL.
xor %r9,%r9
// Shared tail; strchrnul16 jumps here with r9 = -1 instead.
// edx = first search value (the caller's search word).
0: mov %esi,%edx
// r11d = 0: the second search value is the NUL terminator.
xor %r11d,%r11d
// rsi = -1: the largest unsigned count, i.e. effectively no limit.
or $-1,%rsi
// r8 = 0: nothing is subtracted, so the result stays a pointer.
xor %r8,%r8
jmp strsak16
.endfn strchr16,globl
// Returns pointer to first instance of character in range.
//
// @param rdi is a non-null pointer to memory
// @param esi is the search word
// @return rax points to word if found, or else undefined behavior
rawmemchr16:
// rdx = -1, the largest unsigned length, so the scan that follows is
// effectively unbounded.
or $-1,%rdx
// fallthrough
.endfn rawmemchr16,globl
// Returns pointer to first instance of character in range.
//
// @param rdi is a non-null pointer to memory
// @param esi is the search word
// @param rdx is length of memory in shorts
// @return rax points to word if found or NULL
// @asyncsignalsafe
memchr16:
.leafprologue
.profilable
// Swap into strsak16's layout: rsi = length, dx = search word.
xchg %rsi,%rdx
// Second search value is the same word, so only the first comparison
// in strsak16 can ever be the one that matches.
mov %edx,%r11d
// r8 = 0: nothing is subtracted, so the result stays a pointer.
xor %r8,%r8
// r10 = 0: the result is masked to NULL when the length runs out.
xor %r10,%r10
jmp strsak16
.endfn memchr16,globl
// Returns length of char16_t string w/ security blankets.
//
// This is like strnlen() except it'll return 0 if (1) RDI is NULL
// or (2) a NUL-terminator wasn't found in the first RSI shorts.
//
// @param rdi is a nullable NUL-terminated char16_t string pointer
// @param rsi is the maximum number of shorts to consider
// @return rax is the number of shorts, excluding the NUL
strnlen16_s:
.leafprologue
.profilable
// eax = 0 is the value returned when rdi is NULL.
xor %eax,%eax
// r10 = 0 (the 32-bit write clears the upper half as well).
xor %r10d,%r10d
test %rdi,%rdi
// NOTE(review): `0f` binds to the next `0:` label after this point.
// In the text visible here that is the epilogue label inside
// strsak16, which would return with rax still 0 — confirm that no
// intervening `0:` entry point is missing.
jnz 0f
.leafepilogue
.endfn strnlen16_s,globl
// Swiss Army Knife of string char16_t scanning.
// Sixteen fast functions in one.
//
// @param rdi is non-null string memory
// @param rsi is max number of shorts to consider
// @param dx is search character #1
// @param r11w is search character #2
// @param r8 is subtracted from result (for length vs. pointer)
// @param r9 masks result if r11w is found (for NUL vs. NULL)
// @param r10 masks result on shorts exhausted (for length v. NULL)
// @return rax end pointer after r8/r9/r10 modifications
strsak16:
// Start one short before the string so the loop can pre-increment.
lea -2(%rdi),%rax
1: add $2,%rax
// Consume one short of budget; a borrow means none were left.
sub $1,%rsi
jb .Lend
// Only try the vector path when rax is 32-byte aligned, which is
// what the vmovdqa load below requires.
test $31,%al
jz .Lfast
// Scalar step: compare the current short against both search values.
.Lword: mov (%rax),%cx
cmp %cx,%dx
je .Ldone
cmp %cx,%r11w
je .Lnul
jmp 1b
// Search character #1 found: the result is not masked.
.Ldone: sub %r8,%rax
jmp .Lret
// Budget exhausted: use r10 as the mask in place of r9.
.Lend: mov %r10,%r9
// Search character #2 found (or fallthrough from .Lend): apply mask.
.Lnul: sub %r8,%rax
and %r9,%rax
// When r8 is nonzero the result is halved, turning a byte difference
// into a count of shorts.
.Lret: test %r8,%r8
jz 0f
shr %rax
0: .leafepilogue
// Fewer than 16 shorts of budget left: hand back to the scalar loop.
// NOTE(review): the fast loop subtracts 16 from rsi but this adds 32
// back — confirm the units are intended. This exit also skips the
// vzeroupper that the match path executes.
.Lslow: add $32,%rsi
jmp .Lword
.Lfast:
// NOTE(review): presumably X86_NEED/X86_HAVE select between a
// compile-time AVX2 guarantee and a runtime kCpuids check; without
// AVX2 the scalar path is used.
#if !X86_NEED(AVX2)
testb X86_HAVE(AVX2)+kCpuids(%rip)
jz .Lword
#endif
// Broadcast both 16-bit search values across ymm0 and ymm1.
movzwl %dx,%ecx
movd %ecx,%xmm0
movzwl %r11w,%ecx
movd %ecx,%xmm1
vpbroadcastw %xmm0,%ymm0
vpbroadcastw %xmm1,%ymm1
// Pre-decrement so the loop below can pre-increment by 32 bytes.
sub $32,%rax
1: add $32,%rax
sub $16,%rsi
jb .Lslow
vmovdqa (%rax),%ymm2
// A lane is all ones wherever it equals either search value.
vpcmpeqw %ymm0,%ymm2,%ymm3
vpcmpeqw %ymm1,%ymm2,%ymm2
vpor %ymm3,%ymm2,%ymm2
// One mask bit per byte; bsf sets ZF when the mask is zero (no match
// in this 32-byte chunk), otherwise ecx = byte offset of first match.
vpmovmskb %ymm2,%ecx
bsf %ecx,%ecx
je 1b
vzeroupper
// Point rax at the matching short and let the scalar code decide
// which of the two search values it was.
add %rcx,%rax
jmp .Lword
.endfn strsak16

View file

@ -19,34 +19,6 @@
#include "libc/nexgen32e/x86feature.h"
#include "libc/macros.internal.h"
// Returns pointer to first instance of character.
//
// @param rdi is a non-null NUL-terminated wchar_t string pointer
// @param esi is the search word
// @return rax points to character, or to NUL word if not found
// @note this won't return NULL if search character is NUL
wcschrnul:
.leafprologue
.profilable
// r9 = all ones. NOTE(review): presumably wcssak uses r9 as the mask
// applied when the NUL is hit, keeping the pointer — wcssak's body is
// not visible here, confirm.
or $-1,%r9
// Continue at the next `0:` label, which sits in wcschr just after
// that function's own r9 setup.
jmp 0f
// Returns pointer to first instance of character.
//
// @param rdi is a non-null NUL-terminated wchar_t string pointer
// @param esi is the search word
// @return rax points to first result, or NULL if not found
// @note this won't return NULL if search character is NUL
// @asyncsignalsafe
wcschr: .leafprologue
.profilable
// r9 = 0; wcschrnul enters at the 0: label below with r9 = -1 instead.
xor %r9,%r9
// edx = the caller's search word.
0: mov %esi,%edx
// r11d = 0.
xor %r11d,%r11d
// NOTE(review): pushpop is a project macro; presumably it loads -1
// into rsi, i.e. an effectively unlimited count — confirm.
pushpop -1,%rsi
// r8 = 0.
xor %r8,%r8
// Tail-jump to wcssak with the registers prepared above.
jmp wcssak
// Returns length of wchar_t string w/ security blankets.
//
// This is like wcsnlen() except it'll return 0 if (1) RDI is NULL
@ -168,5 +140,3 @@ wcssak: lea -4(%rdi),%rax
.endfn wmemchr,globl
.endfn rawwmemchr,globl
.endfn wcsnlen,globl
.endfn wcschr,globl
.endfn wcschrnul,globl