mirror of
				https://github.com/jart/cosmopolitan.git
				synced 2025-10-26 11:10:58 +00:00 
			
		
		
		
	Make AARCH64 harder, better, faster, stronger
- Perform some housekeeping on scalar math function code - Import ARM's Optimized Routines for SIMD string processing - Upgrade to latest Chromium zlib and enable more SIMD optimizations
This commit is contained in:
		
							parent
							
								
									550b52abf6
								
							
						
					
					
						commit
						cc1732bc42
					
				
					 143 changed files with 15661 additions and 1329 deletions
				
			
		|  | @ -73,6 +73,13 @@ IMAGE_BASE_VIRTUAL ?= 0x400000 | |||
| IGNORE := $(shell $(ECHO) -2 ♥cosmo) | ||||
| IGNORE := $(shell $(MKDIR) o/tmp) | ||||
| 
 | ||||
| ifeq ($(MODE), dbg) | ||||
| # be generous about resources in debug mode
 | ||||
| # let commands use  64 seconds  cpu time max
 | ||||
| # let commands use 300 seconds wall time max
 | ||||
| QUOTA ?= -C64 -L300 | ||||
| endif | ||||
| 
 | ||||
| ifneq ($(findstring aarch64,$(MODE)),) | ||||
| ARCH = aarch64 | ||||
| VM = o/third_party/qemu/qemu-aarch64 | ||||
|  |  | |||
|  | @ -12,8 +12,8 @@ | |||
| #include "libc/errno.h" | ||||
| #include "libc/fmt/conv.h" | ||||
| #include "libc/log/check.h" | ||||
| #include "libc/mem/mem.h" | ||||
| #include "libc/mem/gc.internal.h" | ||||
| #include "libc/mem/mem.h" | ||||
| #include "libc/runtime/runtime.h" | ||||
| #include "libc/stdio/stdio.h" | ||||
| #include "libc/str/str.h" | ||||
|  | @ -48,26 +48,62 @@ FLAGS\n\ | |||
| // clang-format off
 | ||||
| // make -j8 o//examples && dd if=/dev/urandom count=100 | tee a | o//examples/compress.com | o//examples/decompress.com >b && sha1sum a b
 | ||||
| /*
 | ||||
| #!/bin/bash | ||||
| # data file is o/dbg/third_party/python/python.com | ||||
| # level 0 147517 compress 495 MB/s decompress 1.4 GB/s | ||||
| # level 1 80274 compress 29.2 MB/s decompress 303 MB/s | ||||
| # level 2 79384 compress 33.8 MB/s decompress 212 MB/s | ||||
| # level 3 78875 compress 28.9 MB/s decompress 224 MB/s | ||||
| # level 4 78010 compress 27.1 MB/s decompress 319 MB/s <-- sweet spot? | ||||
| # level 5 77107 compress 19.5 MB/s decompress 273 MB/s | ||||
| # level 6 75081 compress 10.0 MB/s decompress 99.3 MB/s | ||||
| # level 7 75022 compress 7.5 MB/s decompress 287 MB/s | ||||
| # level 8 75016 compress 5.4 MB/s decompress 109 MB/s | ||||
| # level 9 75016 compress 5.4 MB/s decompress 344 MB/s | ||||
| # level   1 348739 compress 22.8 MB/s decompress 444 MB/s | ||||
| # level   2 347549 compress 37.8 MB/s decompress 457 MB/s | ||||
| # level   3 346902 compress 33.3 MB/s decompress 463 MB/s | ||||
| # level   4 345671 compress 29.3 MB/s decompress 467 MB/s | ||||
| # level   5 344392 compress 22.4 MB/s decompress 506 MB/s | ||||
| # level   6 342105 compress 10.9 MB/s decompress 516 MB/s | ||||
| # level   7 342046 compress  7.9 MB/s decompress 515 MB/s | ||||
| # level   8 342009 compress  5.8 MB/s decompress 518 MB/s | ||||
| # level   9 342001 compress  5.7 MB/s decompress 524 MB/s | ||||
| # level F 1 362426 compress 48.2 MB/s decompress 488 MB/s | ||||
| # level F 2 360875 compress 42.7 MB/s decompress 484 MB/s | ||||
| # level F 3 359992 compress 37.1 MB/s decompress 499 MB/s | ||||
| # level F 4 358460 compress 32.9 MB/s decompress 503 MB/s | ||||
| # level F 5 356431 compress 24.0 MB/s decompress 547 MB/s | ||||
| # level F 6 352274 compress 11.6 MB/s decompress 558 MB/s | ||||
| # level F 7 352155 compress  8.7 MB/s decompress 554 MB/s | ||||
| # level F 8 352065 compress  6.3 MB/s decompress 554 MB/s | ||||
| # level F 9 352051 compress  6.2 MB/s decompress 556 MB/s | ||||
| # level L 1 348739 compress 41.1 MB/s decompress 446 MB/s | ||||
| # level L 2 347549 compress 37.4 MB/s decompress 443 MB/s | ||||
| # level L 3 346902 compress 32.3 MB/s decompress 462 MB/s | ||||
| # level L 4 351932 compress 28.8 MB/s decompress 511 MB/s | ||||
| # level L 5 351384 compress 23.6 MB/s decompress 520 MB/s | ||||
| # level L 6 351328 compress 12.1 MB/s decompress 522 MB/s | ||||
| # level L 7 351230 compress  7.3 MB/s decompress 518 MB/s | ||||
| # level L 8 351192 compress  5.7 MB/s decompress 522 MB/s | ||||
| # level L 9 351182 compress  6.5 MB/s decompress 519 MB/s | ||||
| # level R 1 388209 compress 83.1 MB/s decompress 371 MB/s | ||||
| # level R 2 388209 compress 82.3 MB/s decompress 362 MB/s | ||||
| # level R 3 388209 compress 81.8 MB/s decompress 361 MB/s | ||||
| # level R 4 388209 compress 81.7 MB/s decompress 364 MB/s | ||||
| # level R 5 388209 compress 81.7 MB/s decompress 363 MB/s | ||||
| # level R 6 388209 compress 80.1 MB/s decompress 359 MB/s | ||||
| # level R 7 388209 compress 80.3 MB/s decompress 354 MB/s | ||||
| # level R 8 388209 compress 80.3 MB/s decompress 363 MB/s | ||||
| # level R 9 388209 compress 81.3 MB/s decompress 364 MB/s | ||||
| # level H 1 390207 compress 87.6 MB/s decompress 371 MB/s | ||||
| # level H 2 390207 compress 87.5 MB/s decompress 372 MB/s | ||||
| # level H 3 390207 compress 85.5 MB/s decompress 364 MB/s | ||||
| # level H 4 390207 compress 87.3 MB/s decompress 375 MB/s | ||||
| # level H 5 390207 compress 89.0 MB/s decompress 373 MB/s | ||||
| # level H 6 390207 compress 87.3 MB/s decompress 372 MB/s | ||||
| # level H 7 390207 compress 87.0 MB/s decompress 368 MB/s | ||||
| # level H 8 390207 compress 86.2 MB/s decompress 367 MB/s | ||||
| # level H 9 390207 compress 86.9 MB/s decompress 369 MB/s | ||||
| m= | ||||
| make -j8 MODE=$m o/$m/examples || exit | ||||
| for strategy in ' ' F L R H; do | ||||
| for level in $(seq 1 9); do | ||||
| for strategy in F L R H; do | ||||
|   o/$m/examples/compress.com -$strategy$level <o/dbg/third_party/python/python.com | dd count=10000 2>/tmp/info >/tmp/comp | ||||
|   o/$m/examples/compress.com -$level$strategy <o/dbg/third_party/python/python.com | dd count=10000 2>/tmp/info >/tmp/comp | ||||
|   compspeed=$(grep -Po '[.\d]+ \w+/s' /tmp/info) | ||||
|   o/$m/examples/decompress.com </tmp/comp | dd count=10000 2>/tmp/info >/dev/null | ||||
|   decompspeed=$(grep -Po '[.\d]+ \w+/s' /tmp/info) | ||||
|   size=$(o/$m/examples/compress.com -$strategy$level <o/$m/examples/compress.com | wc -c) | ||||
|   size=$(o/$m/examples/compress.com -$level$strategy <o/$m/examples/compress.com | wc -c) | ||||
|   echo "level $strategy $level $size compress $compspeed decompress $decompspeed" | ||||
| done | ||||
| done | ||||
|  |  | |||
|  | @ -10,43 +10,14 @@ | |||
| #include "libc/assert.h" | ||||
| #include "libc/calls/calls.h" | ||||
| #include "libc/errno.h" | ||||
| #include "libc/mem/mem.h" | ||||
| #include "libc/mem/gc.internal.h" | ||||
| #include "libc/mem/mem.h" | ||||
| #include "libc/stdio/stdio.h" | ||||
| #include "libc/str/str.h" | ||||
| #include "third_party/zlib/zlib.h" | ||||
| 
 | ||||
| #define CHUNK 32768 | ||||
| 
 | ||||
| // clang-format off
 | ||||
| // make -j8 o//examples && dd if=/dev/urandom count=100 | tee a | o//examples/compress.com | o//examples/decompress.com >b && sha1sum a b
 | ||||
| /*
 | ||||
| # data file is o/dbg/third_party/python/python.com | ||||
| # level 0 147517 compress 495 MB/s decompress 1.4 GB/s | ||||
| # level 1 80274 compress 29.2 MB/s decompress 303 MB/s | ||||
| # level 2 79384 compress 33.8 MB/s decompress 212 MB/s | ||||
| # level 3 78875 compress 28.9 MB/s decompress 224 MB/s | ||||
| # level 4 78010 compress 27.1 MB/s decompress 319 MB/s <-- sweet spot? | ||||
| # level 5 77107 compress 19.5 MB/s decompress 273 MB/s | ||||
| # level 6 75081 compress 10.0 MB/s decompress 99.3 MB/s | ||||
| # level 7 75022 compress 7.5 MB/s decompress 287 MB/s | ||||
| # level 8 75016 compress 5.4 MB/s decompress 109 MB/s | ||||
| # level 9 75016 compress 5.4 MB/s decompress 344 MB/s | ||||
| m= | ||||
| make -j8 MODE=$m o/$m/examples || exit | ||||
| for level in $(seq 0 9); do | ||||
| for strategy in F L R H; do | ||||
|   o/$m/examples/compress.com -$strategy$level <o/dbg/third_party/python/python.com | dd count=10000 2>/tmp/info >/tmp/comp | ||||
|   compspeed=$(grep -Po '[.\d]+ \w+/s' /tmp/info) | ||||
|   o/$m/examples/decompress.com </tmp/comp | dd count=10000 2>/tmp/info >/dev/null | ||||
|   decompspeed=$(grep -Po '[.\d]+ \w+/s' /tmp/info) | ||||
|   size=$(o/$m/examples/compress.com -$strategy$level <o/$m/examples/compress.com | wc -c) | ||||
|   echo "level $strategy $level $size compress $compspeed decompress $decompspeed" | ||||
| done | ||||
| done | ||||
| */ | ||||
| // clang-format on
 | ||||
| 
 | ||||
| int decompressor(int infd, int outfd) { | ||||
|   int rc; | ||||
|   unsigned have; | ||||
|  |  | |||
							
								
								
									
										88
									
								
								libc/intrin/aarch64/asmdefs.h
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										88
									
								
								libc/intrin/aarch64/asmdefs.h
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,88 @@ | |||
| #ifndef COSMOPOLITAN_LIBC_INTRIN_AARCH64_ASMDEFS_H_ | ||||
| #define COSMOPOLITAN_LIBC_INTRIN_AARCH64_ASMDEFS_H_ | ||||
| #ifdef __ASSEMBLER__ | ||||
| // clang-format off
 | ||||
| 
 | ||||
| /* Branch Target Identification support.  */ | ||||
| #define BTI_C		hint	34 | ||||
| #define BTI_J		hint	36 | ||||
| /* Return address signing support (pac-ret).  */ | ||||
| #define PACIASP		hint	25; .cfi_window_save | ||||
| #define AUTIASP		hint	29; .cfi_window_save | ||||
| 
 | ||||
| /* GNU_PROPERTY_AARCH64_* macros from elf.h.  */ | ||||
| #define FEATURE_1_AND 0xc0000000 | ||||
| #define FEATURE_1_BTI 1 | ||||
| #define FEATURE_1_PAC 2 | ||||
| 
 | ||||
| /* Add a NT_GNU_PROPERTY_TYPE_0 note.  */ | ||||
| #define GNU_PROPERTY(type, value)	\ | ||||
|   .section .note.gnu.property, "a";	\ | ||||
|   .p2align 3;				\ | ||||
|   .word 4;				\ | ||||
|   .word 16;				\ | ||||
|   .word 5;				\ | ||||
|   .asciz "GNU";				\ | ||||
|   .word type;				\ | ||||
|   .word 4;				\ | ||||
|   .word value;				\ | ||||
|   .word 0;				\ | ||||
|   .text | ||||
| 
 | ||||
| /* If set then the GNU Property Note section will be added to
 | ||||
|    mark objects to support BTI and PAC-RET.  */ | ||||
| #ifndef WANT_GNU_PROPERTY | ||||
| #define WANT_GNU_PROPERTY 1 | ||||
| #endif | ||||
| 
 | ||||
| #if WANT_GNU_PROPERTY | ||||
| /* Add property note with supported features to all asm files.  */ | ||||
| GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC) | ||||
| #endif | ||||
| 
 | ||||
| #define ENTRY_ALIGN(name, alignment)	\ | ||||
|   .global name;		\ | ||||
|   .type name,%function;	\ | ||||
|   .align alignment;		\ | ||||
|   name:			\ | ||||
|   .cfi_startproc;	\ | ||||
|   BTI_C; | ||||
| 
 | ||||
| #define ENTRY(name)	ENTRY_ALIGN(name, 6) | ||||
| 
 | ||||
| #define ENTRY_ALIAS(name)	\ | ||||
|   .global name;		\ | ||||
|   .type name,%function;	\ | ||||
|   name: | ||||
| 
 | ||||
| #define END(name)	\ | ||||
|   .cfi_endproc;		\ | ||||
|   .size name, .-name; | ||||
| 
 | ||||
| #define L(l) .L ## l | ||||
| 
 | ||||
| #ifdef __ILP32__ | ||||
|   /* Sanitize padding bits of pointer arguments as per aapcs64 */ | ||||
| #define PTR_ARG(n)  mov w##n, w##n | ||||
| #else | ||||
| #define PTR_ARG(n) | ||||
| #endif | ||||
| 
 | ||||
| #ifdef __ILP32__ | ||||
|   /* Sanitize padding bits of size arguments as per aapcs64 */ | ||||
| #define SIZE_ARG(n)  mov w##n, w##n | ||||
| #else | ||||
| #define SIZE_ARG(n) | ||||
| #endif | ||||
| 
 | ||||
| /* Compiler supports SVE instructions  */ | ||||
| #ifndef HAVE_SVE | ||||
| # if __aarch64__ && (__GNUC__ >= 8 || __clang_major__ >= 5) | ||||
| #   define HAVE_SVE 1 | ||||
| # else | ||||
| #   define HAVE_SVE 0 | ||||
| # endif | ||||
| #endif | ||||
| 
 | ||||
| #endif /* __ASSEMBLER__ */ | ||||
| #endif /* COSMOPOLITAN_LIBC_INTRIN_AARCH64_ASMDEFS_H_ */ | ||||
							
								
								
									
										172
									
								
								libc/intrin/aarch64/memchr.S
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										172
									
								
								libc/intrin/aarch64/memchr.S
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,172 @@ | |||
| /*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8     -*-│
 | ||||
| │vi: set et ft=asm ts=8 tw=8 fenc=utf-8                                     :vi│ | ||||
| ╚──────────────────────────────────────────────────────────────────────────────╝ | ||||
| │                                                                              │ | ||||
| │  Optimized Routines                                                          │ | ||||
| │  Copyright (c) 1999-2022, Arm Limited.                                       │ | ||||
| │                                                                              │ | ||||
| │  Permission is hereby granted, free of charge, to any person obtaining       │ | ||||
| │  a copy of this software and associated documentation files (the             │ | ||||
| │  "Software"), to deal in the Software without restriction, including         │ | ||||
| │  without limitation the rights to use, copy, modify, merge, publish,         │ | ||||
| │  distribute, sublicense, and/or sell copies of the Software, and to          │ | ||||
| │  permit persons to whom the Software is furnished to do so, subject to       │ | ||||
| │  the following conditions:                                                   │ | ||||
| │                                                                              │ | ||||
| │  The above copyright notice and this permission notice shall be              │ | ||||
| │  included in all copies or substantial portions of the Software.             │ | ||||
| │                                                                              │ | ||||
| │  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │ | ||||
| │  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │ | ||||
| │  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │ | ||||
| │  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │ | ||||
| │  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │ | ||||
| │  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │ | ||||
| │  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │ | ||||
| │                                                                              │ | ||||
| ╚─────────────────────────────────────────────────────────────────────────────*/ | ||||
| #include "libc/intrin/aarch64/asmdefs.h" | ||||
| 
 | ||||
| #define __memchr_aarch64 memchr | ||||
| 
 | ||||
| .ident "\n\ | ||||
| Optimized Routines (MIT License)\n\ | ||||
| Copyright 2022 ARM Limited\n" | ||||
| .include "libc/disclaimer.inc" | ||||
| 
 | ||||
| /* Assumptions: | ||||
|  * | ||||
|  * ARMv8-a, AArch64 | ||||
|  * Neon Available. | ||||
|  */ | ||||
| 
 | ||||
| /* Arguments and results.  */ | ||||
| #define srcin		x0 | ||||
| #define chrin		w1 | ||||
| #define cntin		x2 | ||||
| 
 | ||||
| #define result		x0 | ||||
| 
 | ||||
| #define src		x3 | ||||
| #define	tmp		x4 | ||||
| #define wtmp2		w5 | ||||
| #define synd		x6 | ||||
| #define soff		x9 | ||||
| #define cntrem		x10 | ||||
| 
 | ||||
| #define vrepchr		v0 | ||||
| #define vdata1		v1 | ||||
| #define vdata2		v2 | ||||
| #define vhas_chr1	v3 | ||||
| #define vhas_chr2	v4 | ||||
| #define vrepmask	v5 | ||||
| #define vend		v6 | ||||
| 
 | ||||
| /* | ||||
|  * Core algorithm: | ||||
|  * | ||||
|  * For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits | ||||
|  * per byte. For each tuple, bit 0 is set if the relevant byte matched the | ||||
|  * requested character and bit 1 is not used (faster than using a 32-bit | ||||
|  * syndrome). Since the bits in the syndrome reflect exactly the order in which | ||||
|  * things occur in the original string, counting trailing zeros allows us to | ||||
|  * identify exactly which byte has matched. | ||||
|  */ | ||||
| 
 | ||||
| ENTRY (__memchr_aarch64) | ||||
| 	PTR_ARG (0) | ||||
| 	SIZE_ARG (2) | ||||
| 	/* Do not dereference srcin if no bytes to compare.  */ | ||||
| 	cbz	cntin, L(zero_length) | ||||
| 	/* | ||||
| 	 * Magic constant 0x40100401 allows us to identify which lane matches | ||||
| 	 * the requested byte. | ||||
| 	 */ | ||||
| 	mov	wtmp2, #0x0401 | ||||
| 	movk	wtmp2, #0x4010, lsl #16 | ||||
| 	dup	vrepchr.16b, chrin | ||||
| 	/* Work with aligned 32-byte chunks */ | ||||
| 	bic	src, srcin, #31 | ||||
| 	dup	vrepmask.4s, wtmp2 | ||||
| 	ands	soff, srcin, #31 | ||||
| 	and	cntrem, cntin, #31 | ||||
| 	b.eq	L(loop) | ||||
| 
 | ||||
| 	/* | ||||
| 	 * Input string is not 32-byte aligned. We calculate the syndrome | ||||
| 	 * value for the aligned 32 bytes block containing the first bytes | ||||
| 	 * and mask the irrelevant part. | ||||
| 	 */ | ||||
| 
 | ||||
| 	ld1	{vdata1.16b, vdata2.16b}, [src], #32 | ||||
| 	sub	tmp, soff, #32 | ||||
| 	adds	cntin, cntin, tmp | ||||
| 	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b | ||||
| 	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b | ||||
| 	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b | ||||
| 	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b | ||||
| 	addp	vend.16b, vhas_chr1.16b, vhas_chr2.16b		/* 256->128 */ | ||||
| 	addp	vend.16b, vend.16b, vend.16b			/* 128->64 */ | ||||
| 	mov	synd, vend.d[0] | ||||
| 	/* Clear the soff*2 lower bits */ | ||||
| 	lsl	tmp, soff, #1 | ||||
| 	lsr	synd, synd, tmp | ||||
| 	lsl	synd, synd, tmp | ||||
| 	/* The first block can also be the last */ | ||||
| 	b.ls	L(masklast) | ||||
| 	/* Have we found something already? */ | ||||
| 	cbnz	synd, L(tail) | ||||
| 
 | ||||
| L(loop): | ||||
| 	ld1	{vdata1.16b, vdata2.16b}, [src], #32 | ||||
| 	subs	cntin, cntin, #32 | ||||
| 	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b | ||||
| 	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b | ||||
| 	/* If we're out of data we finish regardless of the result */ | ||||
| 	b.ls	L(end) | ||||
| 	/* Use a fast check for the termination condition */ | ||||
| 	orr	vend.16b, vhas_chr1.16b, vhas_chr2.16b | ||||
| 	addp	vend.2d, vend.2d, vend.2d | ||||
| 	mov	synd, vend.d[0] | ||||
| 	/* We're not out of data, loop if we haven't found the character */ | ||||
| 	cbz	synd, L(loop) | ||||
| 
 | ||||
| L(end): | ||||
| 	/* Termination condition found, let's calculate the syndrome value */ | ||||
| 	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b | ||||
| 	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b | ||||
| 	addp	vend.16b, vhas_chr1.16b, vhas_chr2.16b		/* 256->128 */ | ||||
| 	addp	vend.16b, vend.16b, vend.16b			/* 128->64 */ | ||||
| 	mov	synd, vend.d[0] | ||||
| 	/* Only do the clear for the last possible block */ | ||||
| 	b.hs	L(tail) | ||||
| 
 | ||||
| L(masklast): | ||||
| 	/* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */ | ||||
| 	add	tmp, cntrem, soff | ||||
| 	and	tmp, tmp, #31 | ||||
| 	sub	tmp, tmp, #32 | ||||
| 	neg	tmp, tmp, lsl #1 | ||||
| 	lsl	synd, synd, tmp | ||||
| 	lsr	synd, synd, tmp | ||||
| 
 | ||||
| L(tail): | ||||
| 	/* Count the trailing zeros using bit reversing */ | ||||
| 	rbit	synd, synd | ||||
| 	/* Compensate the last post-increment */ | ||||
| 	sub	src, src, #32 | ||||
| 	/* Check that we have found a character */ | ||||
| 	cmp	synd, #0 | ||||
| 	/* And count the leading zeros */ | ||||
| 	clz	synd, synd | ||||
| 	/* Compute the potential result */ | ||||
| 	add	result, src, synd, lsr #1 | ||||
| 	/* Select result or NULL */ | ||||
| 	csel	result, xzr, result, eq | ||||
| 	ret | ||||
| 
 | ||||
| L(zero_length): | ||||
| 	mov	result, #0 | ||||
| 	ret | ||||
| 
 | ||||
| END (__memchr_aarch64) | ||||
							
								
								
									
										218
									
								
								libc/intrin/aarch64/memcmp.S
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										218
									
								
								libc/intrin/aarch64/memcmp.S
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,218 @@ | |||
| /*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8     -*-│
 | ||||
| │vi: set et ft=asm ts=8 tw=8 fenc=utf-8                                     :vi│ | ||||
| ╚──────────────────────────────────────────────────────────────────────────────╝ | ||||
| │                                                                              │ | ||||
| │  Optimized Routines                                                          │ | ||||
| │  Copyright (c) 1999-2022, Arm Limited.                                       │ | ||||
| │                                                                              │ | ||||
| │  Permission is hereby granted, free of charge, to any person obtaining       │ | ||||
| │  a copy of this software and associated documentation files (the             │ | ||||
| │  "Software"), to deal in the Software without restriction, including         │ | ||||
| │  without limitation the rights to use, copy, modify, merge, publish,         │ | ||||
| │  distribute, sublicense, and/or sell copies of the Software, and to          │ | ||||
| │  permit persons to whom the Software is furnished to do so, subject to       │ | ||||
| │  the following conditions:                                                   │ | ||||
| │                                                                              │ | ||||
| │  The above copyright notice and this permission notice shall be              │ | ||||
| │  included in all copies or substantial portions of the Software.             │ | ||||
| │                                                                              │ | ||||
| │  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │ | ||||
| │  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │ | ||||
| │  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │ | ||||
| │  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │ | ||||
| │  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │ | ||||
| │  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │ | ||||
| │  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │ | ||||
| │                                                                              │ | ||||
| ╚─────────────────────────────────────────────────────────────────────────────*/ | ||||
| #include "libc/intrin/aarch64/asmdefs.h" | ||||
| 
 | ||||
| #define __memcmp_aarch64 memcmp | ||||
| 
 | ||||
| .ident "\n\ | ||||
| Optimized Routines (MIT License)\n\ | ||||
| Copyright 2022 ARM Limited\n" | ||||
| .include "libc/disclaimer.inc" | ||||
| 
 | ||||
| /* Assumptions: | ||||
|  * | ||||
|  * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses. | ||||
|  */ | ||||
| 
 | ||||
| #define src1	x0 | ||||
| #define src2	x1 | ||||
| #define limit	x2 | ||||
| #define result	w0 | ||||
| 
 | ||||
| #define data1	x3 | ||||
| #define data1w	w3 | ||||
| #define data2	x4 | ||||
| #define data2w	w4 | ||||
| #define data3	x5 | ||||
| #define data3w	w5 | ||||
| #define data4	x6 | ||||
| #define data4w	w6 | ||||
| #define tmp	x6 | ||||
| #define src1end	x7 | ||||
| #define src2end	x8 | ||||
| 
 | ||||
| 
 | ||||
| ENTRY (__memcmp_aarch64) | ||||
| 	PTR_ARG (0) | ||||
| 	PTR_ARG (1) | ||||
| 	SIZE_ARG (2) | ||||
| 
 | ||||
| 	cmp	limit, 16 | ||||
| 	b.lo	L(less16) | ||||
| 	ldp	data1, data3, [src1] | ||||
| 	ldp	data2, data4, [src2] | ||||
| 	ccmp	data1, data2, 0, ne | ||||
| 	ccmp	data3, data4, 0, eq | ||||
| 	b.ne	L(return2) | ||||
| 
 | ||||
| 	add	src1end, src1, limit | ||||
| 	add	src2end, src2, limit | ||||
| 	cmp	limit, 32 | ||||
| 	b.ls	L(last_bytes) | ||||
| 	cmp	limit, 160 | ||||
| 	b.hs	L(loop_align) | ||||
| 	sub	limit, limit, 32 | ||||
| 
 | ||||
| 	.p2align 4
 | ||||
| L(loop32): | ||||
| 	ldp	data1, data3, [src1, 16] | ||||
| 	ldp	data2, data4, [src2, 16] | ||||
| 	cmp	data1, data2 | ||||
| 	ccmp	data3, data4, 0, eq | ||||
| 	b.ne	L(return2) | ||||
| 	cmp	limit, 16 | ||||
| 	b.ls	L(last_bytes) | ||||
| 
 | ||||
| 	ldp	data1, data3, [src1, 32] | ||||
| 	ldp	data2, data4, [src2, 32] | ||||
| 	cmp	data1, data2 | ||||
| 	ccmp	data3, data4, 0, eq | ||||
| 	b.ne	L(return2) | ||||
| 	add	src1, src1, 32 | ||||
| 	add	src2, src2, 32 | ||||
| L(last64): | ||||
| 	subs	limit, limit, 32 | ||||
| 	b.hi	L(loop32) | ||||
| 
 | ||||
| 	/* Compare last 1-16 bytes using unaligned access.  */ | ||||
| L(last_bytes): | ||||
| 	ldp	data1, data3, [src1end, -16] | ||||
| 	ldp	data2, data4, [src2end, -16] | ||||
| L(return2): | ||||
| 	cmp	data1, data2 | ||||
| 	csel	data1, data1, data3, ne | ||||
| 	csel	data2, data2, data4, ne | ||||
| 
 | ||||
| 	/* Compare data bytes and set return value to 0, -1 or 1.  */ | ||||
| L(return): | ||||
| #ifndef __AARCH64EB__ | ||||
| 	rev	data1, data1 | ||||
| 	rev	data2, data2 | ||||
| #endif | ||||
| 	cmp	data1, data2 | ||||
| 	cset	result, ne | ||||
| 	cneg	result, result, lo | ||||
| 	ret | ||||
| 
 | ||||
| 	.p2align 4
 | ||||
| L(less16): | ||||
| 	add	src1end, src1, limit | ||||
| 	add	src2end, src2, limit | ||||
| 	tbz	limit, 3, L(less8) | ||||
| 	ldr	data1, [src1] | ||||
| 	ldr	data2, [src2] | ||||
| 	ldr	data3, [src1end, -8] | ||||
| 	ldr	data4, [src2end, -8] | ||||
| 	b	L(return2) | ||||
| 
 | ||||
| 	.p2align 4
 | ||||
| L(less8): | ||||
| 	tbz	limit, 2, L(less4) | ||||
| 	ldr	data1w, [src1] | ||||
| 	ldr	data2w, [src2] | ||||
| 	ldr	data3w, [src1end, -4] | ||||
| 	ldr	data4w, [src2end, -4] | ||||
| 	b	L(return2) | ||||
| 
 | ||||
| L(less4): | ||||
| 	tbz	limit, 1, L(less2) | ||||
| 	ldrh	data1w, [src1] | ||||
| 	ldrh	data2w, [src2] | ||||
| 	cmp	data1w, data2w | ||||
| 	b.ne	L(return) | ||||
| L(less2): | ||||
| 	mov	result, 0 | ||||
| 	tbz	limit, 0, L(return_zero) | ||||
| 	ldrb	data1w, [src1end, -1] | ||||
| 	ldrb	data2w, [src2end, -1] | ||||
| 	sub	result, data1w, data2w | ||||
| L(return_zero): | ||||
| 	ret | ||||
| 
 | ||||
| L(loop_align): | ||||
| 	ldp	data1, data3, [src1, 16] | ||||
| 	ldp	data2, data4, [src2, 16] | ||||
| 	cmp	data1, data2 | ||||
| 	ccmp	data3, data4, 0, eq | ||||
| 	b.ne	L(return2) | ||||
| 
 | ||||
| 	/* Align src2 and adjust src1, src2 and limit.  */ | ||||
| 	and	tmp, src2, 15 | ||||
| 	sub	tmp, tmp, 16 | ||||
| 	sub	src2, src2, tmp | ||||
| 	add	limit, limit, tmp | ||||
| 	sub	src1, src1, tmp | ||||
| 	sub	limit, limit, 64 + 16 | ||||
| 
 | ||||
| 	.p2align 4
 | ||||
| L(loop64): | ||||
| 	ldr	q0, [src1, 16] | ||||
| 	ldr	q1, [src2, 16] | ||||
| 	subs	limit, limit, 64 | ||||
| 	ldr	q2, [src1, 32] | ||||
| 	ldr	q3, [src2, 32] | ||||
| 	eor	v0.16b, v0.16b, v1.16b | ||||
| 	eor	v1.16b, v2.16b, v3.16b | ||||
| 	ldr	q2, [src1, 48] | ||||
| 	ldr	q3, [src2, 48] | ||||
| 	umaxp	v0.16b, v0.16b, v1.16b | ||||
| 	ldr	q4, [src1, 64]! | ||||
| 	ldr	q5, [src2, 64]! | ||||
| 	eor	v1.16b, v2.16b, v3.16b | ||||
| 	eor	v2.16b, v4.16b, v5.16b | ||||
| 	umaxp	v1.16b, v1.16b, v2.16b | ||||
| 	umaxp	v0.16b, v0.16b, v1.16b | ||||
| 	umaxp	v0.16b, v0.16b, v0.16b | ||||
| 	fmov	tmp, d0 | ||||
| 	ccmp	tmp, 0, 0, hi | ||||
| 	b.eq	L(loop64) | ||||
| 
 | ||||
| 	/* If equal, process last 1-64 bytes using scalar loop.  */ | ||||
| 	add	limit, limit, 64 + 16 | ||||
| 	cbz	tmp, L(last64) | ||||
| 
 | ||||
| 	/* Determine the 8-byte aligned offset of the first difference.  */ | ||||
| #ifdef __AARCH64EB__ | ||||
| 	rev16	tmp, tmp | ||||
| #endif | ||||
| 	rev	tmp, tmp | ||||
| 	clz	tmp, tmp | ||||
| 	bic	tmp, tmp, 7 | ||||
| 	sub	tmp, tmp, 48 | ||||
| 	ldr	data1, [src1, tmp] | ||||
| 	ldr	data2, [src2, tmp] | ||||
| #ifndef __AARCH64EB__ | ||||
| 	rev	data1, data1 | ||||
| 	rev	data2, data2 | ||||
| #endif | ||||
| 	mov	result, 1 | ||||
| 	cmp	data1, data2 | ||||
| 	cneg	result, result, lo | ||||
| 	ret | ||||
| 
 | ||||
| END (__memcmp_aarch64) | ||||
							
								
								
									
										233
									
								
								libc/intrin/aarch64/memcpy.S
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										233
									
								
								libc/intrin/aarch64/memcpy.S
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,233 @@ | |||
| /*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8     -*-│
 | ||||
| │vi: set et ft=asm ts=8 tw=8 fenc=utf-8                                     :vi│ | ||||
| ╚──────────────────────────────────────────────────────────────────────────────╝ | ||||
| │                                                                              │ | ||||
| │  Optimized Routines                                                          │ | ||||
| │  Copyright (c) 1999-2022, Arm Limited.                                       │ | ||||
| │                                                                              │ | ||||
| │  Permission is hereby granted, free of charge, to any person obtaining       │ | ||||
| │  a copy of this software and associated documentation files (the             │ | ||||
| │  "Software"), to deal in the Software without restriction, including         │ | ||||
| │  without limitation the rights to use, copy, modify, merge, publish,         │ | ||||
| │  distribute, sublicense, and/or sell copies of the Software, and to          │ | ||||
| │  permit persons to whom the Software is furnished to do so, subject to       │ | ||||
| │  the following conditions:                                                   │ | ||||
| │                                                                              │ | ||||
| │  The above copyright notice and this permission notice shall be              │ | ||||
| │  included in all copies or substantial portions of the Software.             │ | ||||
| │                                                                              │ | ||||
| │  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │ | ||||
| │  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │ | ||||
| │  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │ | ||||
| │  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │ | ||||
| │  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │ | ||||
| │  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │ | ||||
| │  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │ | ||||
| │                                                                              │ | ||||
| ╚─────────────────────────────────────────────────────────────────────────────*/ | ||||
| #include "libc/intrin/aarch64/asmdefs.h" | ||||
| 
 | ||||
| /* Rename Arm's upstream entry points to the public libc names, so the | ||||
|    ENTRY_ALIAS/ENTRY lines below are spelled in terms of `memmove` and | ||||
|    `memcpy`.  */ | ||||
| #define __memcpy_aarch64_simd memcpy | ||||
| #define __memmove_aarch64_simd memmove | ||||
| 
 | ||||
| .ident "\n\ | ||||
| Optimized Routines (MIT License)\n\ | ||||
| Copyright 2022 ARM Limited\n" | ||||
| .include "libc/disclaimer.inc" | ||||
| 
 | ||||
| /* Assumptions: | ||||
|  * | ||||
|  * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses. | ||||
|  * | ||||
|  */ | ||||
| 
 | ||||
| /* Register aliases.  x0..x2 hold the three arguments (destination, | ||||
|    source, byte count).  No instruction visible below writes dstin, so | ||||
|    x0 still holds the destination pointer at every `ret`.  A_lw/B_lw | ||||
|    are the 32-bit views of A_l/B_l.  */ | ||||
| #define dstin	x0 | ||||
| #define src	x1 | ||||
| #define count	x2 | ||||
| #define dst	x3 | ||||
| #define srcend	x4 | ||||
| #define dstend	x5 | ||||
| #define A_l	x6 | ||||
| #define A_lw	w6 | ||||
| #define A_h	x7 | ||||
| #define B_l	x8 | ||||
| #define B_lw	w8 | ||||
| #define B_h	x9 | ||||
| #define C_lw	w10 | ||||
| #define tmp1	x14 | ||||
| 
 | ||||
| /* 128-bit SIMD registers used as 16-byte copy buffers.  */ | ||||
| #define A_q	q0 | ||||
| #define B_q	q1 | ||||
| #define C_q	q2 | ||||
| #define D_q	q3 | ||||
| #define E_q	q4 | ||||
| #define F_q	q5 | ||||
| #define G_q	q6 | ||||
| #define H_q	q7 | ||||
| 
 | ||||
| /* This implementation handles overlaps and supports both memcpy and memmove | ||||
|    from a single entry point.  It uses unaligned accesses and branchless | ||||
|    sequences to keep the code small, simple and improve performance. | ||||
| 
 | ||||
|    Copies are split into 3 main cases: small copies of up to 32 bytes, medium | ||||
|    copies of up to 128 bytes, and large copies.  The overhead of the overlap | ||||
|    check is negligible since it is only required for large copies. | ||||
| 
 | ||||
|    Large copies use a software pipelined loop processing 64 bytes per iteration. | ||||
|    The source pointer is 16-byte aligned to minimize unaligned accesses. | ||||
|    The loop tail is handled by always copying 64 bytes from the end. | ||||
| */ | ||||
| 
 | ||||
| ENTRY_ALIAS (__memmove_aarch64_simd) | ||||
| ENTRY (__memcpy_aarch64_simd) | ||||
| 	PTR_ARG (0) | ||||
| 	PTR_ARG (1) | ||||
| 	SIZE_ARG (2) | ||||
| 	/* srcend/dstend point one past the last byte of each buffer.  */ | ||||
| 	add	srcend, src, count | ||||
| 	add	dstend, dstin, count | ||||
| 	/* Dispatch on size.  b.hi is an unsigned "greater than".  */ | ||||
| 	cmp	count, 128 | ||||
| 	b.hi	L(copy_long) | ||||
| 	cmp	count, 32 | ||||
| 	b.hi	L(copy32_128) | ||||
| 
 | ||||
| 	/* Small copies: 0..32 bytes.  */ | ||||
| 	cmp	count, 16 | ||||
| 	b.lo	L(copy16) | ||||
| 	/* 16..32 bytes: one 16-byte access anchored at the start and one | ||||
| 	   anchored at the end (they overlap when count < 32).  Both loads | ||||
| 	   are issued before either store.  */ | ||||
| 	ldr	A_q, [src] | ||||
| 	ldr	B_q, [srcend, -16] | ||||
| 	str	A_q, [dstin] | ||||
| 	str	B_q, [dstend, -16] | ||||
| 	ret | ||||
| 
 | ||||
| 	/* Copy 8-15 bytes.  */ | ||||
| L(copy16): | ||||
| 	/* count < 16 here, so bit 3 set means 8..15.  Same start/end | ||||
| 	   anchored scheme as above with 8-byte accesses.  */ | ||||
| 	tbz	count, 3, L(copy8) | ||||
| 	ldr	A_l, [src] | ||||
| 	ldr	A_h, [srcend, -8] | ||||
| 	str	A_l, [dstin] | ||||
| 	str	A_h, [dstend, -8] | ||||
| 	ret | ||||
| 
 | ||||
| 	.p2align 3
 | ||||
| 	/* Copy 4-7 bytes.  */ | ||||
| L(copy8): | ||||
| 	/* count < 8 here, so bit 2 set means 4..7.  */ | ||||
| 	tbz	count, 2, L(copy4) | ||||
| 	ldr	A_lw, [src] | ||||
| 	ldr	B_lw, [srcend, -4] | ||||
| 	str	A_lw, [dstin] | ||||
| 	str	B_lw, [dstend, -4] | ||||
| 	ret | ||||
| 
 | ||||
| 	/* Copy 0..3 bytes using a branchless sequence.  */ | ||||
| L(copy4): | ||||
| 	cbz	count, L(copy0) | ||||
| 	/* count is 1..3.  Copy byte 0, byte count/2 and byte count-1; | ||||
| 	   for count 1 and 2 some of these are the same byte, which is | ||||
| 	   harmless because all loads precede all stores.  */ | ||||
| 	lsr	tmp1, count, 1 | ||||
| 	ldrb	A_lw, [src] | ||||
| 	ldrb	C_lw, [srcend, -1] | ||||
| 	ldrb	B_lw, [src, tmp1] | ||||
| 	strb	A_lw, [dstin] | ||||
| 	strb	B_lw, [dstin, tmp1] | ||||
| 	strb	C_lw, [dstend, -1] | ||||
| L(copy0): | ||||
| 	ret | ||||
| 
 | ||||
| 	.p2align 4
 | ||||
| 	/* Medium copies: 33..128 bytes.  */ | ||||
| L(copy32_128): | ||||
| 	/* Load the first 32 and the last 32 bytes up front; for 33..64 | ||||
| 	   bytes these two (possibly overlapping) pairs cover everything.  */ | ||||
| 	ldp	A_q, B_q, [src] | ||||
| 	ldp	C_q, D_q, [srcend, -32] | ||||
| 	cmp	count, 64 | ||||
| 	b.hi	L(copy128) | ||||
| 	stp	A_q, B_q, [dstin] | ||||
| 	stp	C_q, D_q, [dstend, -32] | ||||
| 	ret | ||||
| 
 | ||||
| 	.p2align 4
 | ||||
| 	/* Copy 65..128 bytes.  */ | ||||
| L(copy128): | ||||
| 	/* E/F = source bytes 32..63.  Above 96 bytes, G/H additionally | ||||
| 	   carry the 32 bytes that end 32 bytes before srcend.  Every load | ||||
| 	   on this path is issued before the first store.  */ | ||||
| 	ldp	E_q, F_q, [src, 32] | ||||
| 	cmp	count, 96 | ||||
| 	b.ls	L(copy96) | ||||
| 	ldp	G_q, H_q, [srcend, -64] | ||||
| 	stp	G_q, H_q, [dstend, -64] | ||||
| L(copy96): | ||||
| 	stp	A_q, B_q, [dstin] | ||||
| 	stp	E_q, F_q, [dstin, 32] | ||||
| 	stp	C_q, D_q, [dstend, -32] | ||||
| 	ret | ||||
| 
 | ||||
| 	/* Copy more than 128 bytes.  */ | ||||
| L(copy_long): | ||||
| 	/* Use backwards copy if there is an overlap.  */ | ||||
| 	/* tmp1 = dstin - src, compared as unsigned: it is below count only | ||||
| 	   when dstin lies in [src, src + count).  If dstin is below src | ||||
| 	   the subtraction wraps to a large value and the forward path is | ||||
| 	   taken.  */ | ||||
| 	sub	tmp1, dstin, src | ||||
| 	cmp	tmp1, count | ||||
| 	b.lo	L(copy_long_backwards) | ||||
| 
 | ||||
| 	/* Copy 16 bytes and then align src to 16-byte alignment.  */ | ||||
| 	/* tmp1 = src misalignment.  src is rounded down and dst is moved | ||||
| 	   back by the same amount, so [dst, k] is the destination of | ||||
| 	   [src, k] for the offsets used below.  */ | ||||
| 	ldr	D_q, [src] | ||||
| 	and	tmp1, src, 15 | ||||
| 	bic	src, src, 15 | ||||
| 	sub	dst, dstin, tmp1 | ||||
| 	add	count, count, tmp1	/* Count is now 16 too large.  */ | ||||
| 	ldp	A_q, B_q, [src, 16] | ||||
| 	str	D_q, [dstin] | ||||
| 	ldp	C_q, D_q, [src, 48] | ||||
| 	subs	count, count, 128 + 16	/* Test and readjust count.  */ | ||||
| 	b.ls	L(copy64_from_end) | ||||
| 	/* Each iteration stores the 64 bytes loaded by the previous one | ||||
| 	   while loading the next 64.  */ | ||||
| L(loop64): | ||||
| 	stp	A_q, B_q, [dst, 16] | ||||
| 	ldp	A_q, B_q, [src, 80] | ||||
| 	stp	C_q, D_q, [dst, 48] | ||||
| 	ldp	C_q, D_q, [src, 112] | ||||
| 	add	src, src, 64 | ||||
| 	add	dst, dst, 64 | ||||
| 	subs	count, count, 64 | ||||
| 	b.hi	L(loop64) | ||||
| 
 | ||||
| 	/* Write the last iteration and copy 64 bytes from the end.  */ | ||||
| L(copy64_from_end): | ||||
| 	ldp	E_q, F_q, [srcend, -64] | ||||
| 	stp	A_q, B_q, [dst, 16] | ||||
| 	ldp	A_q, B_q, [srcend, -32] | ||||
| 	stp	C_q, D_q, [dst, 48] | ||||
| 	stp	E_q, F_q, [dstend, -64] | ||||
| 	stp	A_q, B_q, [dstend, -32] | ||||
| 	ret | ||||
| 
 | ||||
| 	/* Large backwards copy for overlapping copies. | ||||
| 	   Copy 16 bytes and then align srcend to 16-byte alignment.  */ | ||||
| L(copy_long_backwards): | ||||
| 	/* tmp1 is still dstin - src; zero means source and destination | ||||
| 	   are the same address, so there is nothing to do.  */ | ||||
| 	cbz	tmp1, L(copy0) | ||||
| 	/* tmp1 = srcend misalignment.  srcend is rounded down, and count | ||||
| 	   and dstend are reduced by the same amount so both end pointers | ||||
| 	   stay in step.  */ | ||||
| 	ldr	D_q, [srcend, -16] | ||||
| 	and	tmp1, srcend, 15 | ||||
| 	bic	srcend, srcend, 15 | ||||
| 	sub	count, count, tmp1 | ||||
| 	ldp	A_q, B_q, [srcend, -32] | ||||
| 	str	D_q, [dstend, -16] | ||||
| 	ldp	C_q, D_q, [srcend, -64] | ||||
| 	sub	dstend, dstend, tmp1 | ||||
| 	subs	count, count, 128 | ||||
| 	b.ls	L(copy64_from_start) | ||||
| 
 | ||||
| 	/* Mirror image of loop64, walking down from the end.  The store | ||||
| 	   through `[dstend, -64]!` is pre-indexed with writeback, so it | ||||
| 	   also moves dstend down by 64 for the next iteration.  */ | ||||
| L(loop64_backwards): | ||||
| 	str	B_q, [dstend, -16] | ||||
| 	str	A_q, [dstend, -32] | ||||
| 	ldp	A_q, B_q, [srcend, -96] | ||||
| 	str	D_q, [dstend, -48] | ||||
| 	str	C_q, [dstend, -64]! | ||||
| 	ldp	C_q, D_q, [srcend, -128] | ||||
| 	sub	srcend, srcend, 64 | ||||
| 	subs	count, count, 64 | ||||
| 	b.hi	L(loop64_backwards) | ||||
| 
 | ||||
| 	/* Write the last iteration and copy 64 bytes from the start.  */ | ||||
| L(copy64_from_start): | ||||
| 	ldp	E_q, F_q, [src, 32] | ||||
| 	stp	A_q, B_q, [dstend, -32] | ||||
| 	ldp	A_q, B_q, [src] | ||||
| 	stp	C_q, D_q, [dstend, -64] | ||||
| 	stp	E_q, F_q, [dstin, 32] | ||||
| 	stp	A_q, B_q, [dstin] | ||||
| 	ret | ||||
| 
 | ||||
| END (__memcpy_aarch64_simd) | ||||
							
								
								
									
										138
									
								
								libc/intrin/aarch64/memrchr.S
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										138
									
								
								libc/intrin/aarch64/memrchr.S
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,138 @@ | |||
| /*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8     -*-│
 | ||||
| │vi: set et ft=asm ts=8 tw=8 fenc=utf-8                                     :vi│ | ||||
| ╚──────────────────────────────────────────────────────────────────────────────╝ | ||||
| │                                                                              │ | ||||
| │  Optimized Routines                                                          │ | ||||
| │  Copyright (c) 1999-2022, Arm Limited.                                       │ | ||||
| │                                                                              │ | ||||
| │  Permission is hereby granted, free of charge, to any person obtaining       │ | ||||
| │  a copy of this software and associated documentation files (the             │ | ||||
| │  "Software"), to deal in the Software without restriction, including         │ | ||||
| │  without limitation the rights to use, copy, modify, merge, publish,         │ | ||||
| │  distribute, sublicense, and/or sell copies of the Software, and to          │ | ||||
| │  permit persons to whom the Software is furnished to do so, subject to       │ | ||||
| │  the following conditions:                                                   │ | ||||
| │                                                                              │ | ||||
| │  The above copyright notice and this permission notice shall be              │ | ||||
| │  included in all copies or substantial portions of the Software.             │ | ||||
| │                                                                              │ | ||||
| │  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │ | ||||
| │  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │ | ||||
| │  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │ | ||||
| │  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │ | ||||
| │  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │ | ||||
| │  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │ | ||||
| │  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │ | ||||
| │                                                                              │ | ||||
| ╚─────────────────────────────────────────────────────────────────────────────*/ | ||||
| #include "libc/intrin/aarch64/asmdefs.h" | ||||
| 
 | ||||
| /* Rename Arm's upstream entry point to the public libc name `memrchr`.  */ | ||||
| #define __memrchr_aarch64 memrchr | ||||
| 
 | ||||
| .ident "\n\ | ||||
| Optimized Routines (MIT License)\n\ | ||||
| Copyright 2022 ARM Limited\n" | ||||
| .include "libc/disclaimer.inc" | ||||
| 
 | ||||
| /* Assumptions: | ||||
|  * | ||||
|  * ARMv8-a, AArch64, Advanced SIMD. | ||||
|  * MTE compatible. | ||||
|  */ | ||||
| 
 | ||||
| /* Arguments: x0 = buffer, w1 = byte to find, x2 = length.  `result` | ||||
|    is the same register as `srcin`: x0 is reused for the return value.  */ | ||||
| #define srcin		x0 | ||||
| #define chrin		w1 | ||||
| #define cntin		x2 | ||||
| #define result		x0 | ||||
| 
 | ||||
| /* Scratch registers.  */ | ||||
| #define src		x3 | ||||
| #define cntrem		x4 | ||||
| #define synd		x5 | ||||
| #define shift		x6 | ||||
| #define	tmp		x7 | ||||
| #define end		x8 | ||||
| #define endm1		x9 | ||||
| 
 | ||||
| /* qdata/vdata are two views of the same SIMD register (q1/v1), and | ||||
|    vend/dend likewise (v3/d3).  */ | ||||
| #define vrepchr		v0 | ||||
| #define qdata		q1 | ||||
| #define vdata		v1 | ||||
| #define vhas_chr	v2 | ||||
| #define vend		v3 | ||||
| #define dend		d3 | ||||
| 
 | ||||
| /* | ||||
|    Core algorithm: | ||||
|    For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits | ||||
|    per byte. We take 4 bits of every comparison byte with shift right and narrow | ||||
|    by 4 instruction. Since the bits in the nibble mask reflect the order in | ||||
|    which things occur in the original string, counting leading zeros identifies | ||||
|    exactly which byte matched.  */ | ||||
| 
 | ||||
| ENTRY (__memrchr_aarch64) | ||||
| 	PTR_ARG (0) | ||||
| 	/* end = one past the buffer, endm1 = its last byte, src = the | ||||
| 	   16-byte aligned chunk containing that last byte.  Nothing is | ||||
| 	   loaded until the zero-length case has been branched away.  */ | ||||
| 	add	end, srcin, cntin | ||||
| 	sub	endm1, end, 1 | ||||
| 	bic	src, endm1, 15 | ||||
| 	cbz	cntin, L(nomatch) | ||||
| 	ld1	{vdata.16b}, [src] | ||||
| 	dup	vrepchr.16b, chrin | ||||
| 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b | ||||
| 	/* shift = -(end * 4).  A register shift only uses the amount | ||||
| 	   modulo 64, so the lsl below pushes out the nibbles of chunk | ||||
| 	   bytes located at or after `end` and leaves the nibble for | ||||
| 	   endm1 in the top four bits.  */ | ||||
| 	neg	shift, end, lsl 2 | ||||
| 	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */ | ||||
| 	fmov	synd, dend | ||||
| 	lsl	synd, synd, shift | ||||
| 	cbz	synd, L(start_loop) | ||||
| 
 | ||||
| 	/* Match in the last chunk: clz / 4 is its distance back from | ||||
| 	   endm1.  The chunk may start before srcin, so the hit is only | ||||
| 	   returned when that distance is below cntin; otherwise 0.  */ | ||||
| 	clz	synd, synd | ||||
| 	sub	result, endm1, synd, lsr 2 | ||||
| 	cmp	cntin, synd, lsr 2 | ||||
| 	csel	result, result, xzr, hi | ||||
| 	ret | ||||
| 
 | ||||
| 	nop | ||||
| L(start_loop): | ||||
| 	/* cntrem = number of buffer bytes below the chunk just checked; | ||||
| 	   if src is not above srcin there is nothing left to scan.  */ | ||||
| 	subs	cntrem, src, srcin | ||||
| 	b.ls	L(nomatch) | ||||
| 
 | ||||
| 	/* Make sure that it won't overread by a 16-byte chunk */ | ||||
| 	/* Bit 4 of (cntrem - 1) picks the entry point into the two-chunk | ||||
| 	   loop.  Either way the first load is the chunk at src - 16: the | ||||
| 	   `add` cancels half of loop32's pre-decrement of 32.  */ | ||||
| 	sub	cntrem, cntrem, 1 | ||||
| 	tbz	cntrem, 4, L(loop32_2) | ||||
| 	add	src, src, 16 | ||||
| 
 | ||||
| 	.p2align 5
 | ||||
| L(loop32): | ||||
| 	/* Pre-indexed load: src is moved down by 32 and then used.  */ | ||||
| 	ldr	qdata, [src, -32]! | ||||
| 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b | ||||
| 	/* Pairwise max folds the 16 compare bytes into the low 64 bits, | ||||
| 	   which are non-zero iff any byte matched.  */ | ||||
| 	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */ | ||||
| 	fmov	synd, dend | ||||
| 	cbnz	synd, L(end) | ||||
| 
 | ||||
| L(loop32_2): | ||||
| 	/* Second chunk of the iteration, read at src - 16 without | ||||
| 	   writeback; L(end_2) applies the -16 if the loop exits here.  */ | ||||
| 	ldr	qdata, [src, -16] | ||||
| 	subs	cntrem, cntrem, 32 | ||||
| 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b | ||||
| 	/* Counter went below zero: this was the final chunk, so leave | ||||
| 	   the loop whether or not it contains a match.  */ | ||||
| 	b.lo	L(end_2) | ||||
| 	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */ | ||||
| 	fmov	synd, dend | ||||
| 	cbz	synd, L(loop32) | ||||
| L(end_2): | ||||
| 	sub	src, src, 16 | ||||
| L(end): | ||||
| 	/* src now addresses the chunk held in vhas_chr.  Build the | ||||
| 	   nibble mask for it.  */ | ||||
| 	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */ | ||||
| 	fmov	synd, dend | ||||
| 
 | ||||
| 	/* tmp = last byte of the chunk; clz / 4 is the distance back | ||||
| 	   from it to the highest match.  */ | ||||
| 	add	tmp, src, 15 | ||||
| #ifdef __AARCH64EB__ | ||||
| 	rbit	synd, synd | ||||
| #endif | ||||
| 	clz	synd, synd | ||||
| 	sub	tmp, tmp, synd, lsr 2 | ||||
| 	/* Return 0 instead of tmp when tmp lies below srcin.  */ | ||||
| 	cmp	tmp, srcin | ||||
| 	csel	result, tmp, xzr, hs | ||||
| 	ret | ||||
| 
 | ||||
| L(nomatch): | ||||
| 	mov	result, 0 | ||||
| 	ret | ||||
| 
 | ||||
| END (__memrchr_aarch64) | ||||
							
								
								
									
										143
									
								
								libc/intrin/aarch64/memset.S
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										143
									
								
								libc/intrin/aarch64/memset.S
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,143 @@ | |||
| /*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8     -*-│
 | ||||
| │vi: set et ft=asm ts=8 tw=8 fenc=utf-8                                     :vi│ | ||||
| ╚──────────────────────────────────────────────────────────────────────────────╝ | ||||
| │                                                                              │ | ||||
| │  Optimized Routines                                                          │ | ||||
| │  Copyright (c) 1999-2022, Arm Limited.                                       │ | ||||
| │                                                                              │ | ||||
| │  Permission is hereby granted, free of charge, to any person obtaining       │ | ||||
| │  a copy of this software and associated documentation files (the             │ | ||||
| │  "Software"), to deal in the Software without restriction, including         │ | ||||
| │  without limitation the rights to use, copy, modify, merge, publish,         │ | ||||
| │  distribute, sublicense, and/or sell copies of the Software, and to          │ | ||||
| │  permit persons to whom the Software is furnished to do so, subject to       │ | ||||
| │  the following conditions:                                                   │ | ||||
| │                                                                              │ | ||||
| │  The above copyright notice and this permission notice shall be              │ | ||||
| │  included in all copies or substantial portions of the Software.             │ | ||||
| │                                                                              │ | ||||
| │  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │ | ||||
| │  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │ | ||||
| │  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │ | ||||
| │  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │ | ||||
| │  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │ | ||||
| │  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │ | ||||
| │  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │ | ||||
| │                                                                              │ | ||||
| ╚─────────────────────────────────────────────────────────────────────────────*/ | ||||
| #include "libc/intrin/aarch64/asmdefs.h" | ||||
| 
 | ||||
| /* Rename Arm's upstream entry point to the public libc name `memset`.  */ | ||||
| #define __memset_aarch64 memset | ||||
| 
 | ||||
| .ident "\n\ | ||||
| Optimized Routines (MIT License)\n\ | ||||
| Copyright 2022 ARM Limited\n" | ||||
| .include "libc/disclaimer.inc" | ||||
| 
 | ||||
| /* Assumptions: | ||||
|  * | ||||
|  * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses. | ||||
|  * | ||||
|  */ | ||||
| 
 | ||||
| /* Arguments: x0 = destination, w1 = fill value, x2 = byte count. | ||||
|    No instruction visible below writes dstin, so x0 still holds the | ||||
|    destination pointer at every `ret`.  val/valw are the 64-bit and | ||||
|    32-bit views of the same register.  */ | ||||
| #define dstin	x0 | ||||
| #define val	x1 | ||||
| #define valw	w1 | ||||
| #define count	x2 | ||||
| #define dst	x3 | ||||
| #define dstend	x4 | ||||
| #define zva_val	x5 | ||||
| 
 | ||||
| ENTRY (__memset_aarch64) | ||||
| 	PTR_ARG (0) | ||||
| 	SIZE_ARG (2) | ||||
| 
 | ||||
| 	/* Broadcast the low byte of valw to all 16 lanes of v0 (q0 is | ||||
| 	   the 16-byte store source below); dstend = one past the end.  */ | ||||
| 	dup	v0.16B, valw | ||||
| 	add	dstend, dstin, count | ||||
| 
 | ||||
| 	cmp	count, 96 | ||||
| 	b.hi	L(set_long) | ||||
| 	cmp	count, 16 | ||||
| 	b.hs	L(set_medium) | ||||
| 	/* val = the fill byte repeated 8 times, for the scalar stores.  */ | ||||
| 	mov	val, v0.D[0] | ||||
| 
 | ||||
| 	/* Set 0..15 bytes.  */ | ||||
| 	/* Bit 3 set: 8..15 bytes, written as two 8-byte stores anchored | ||||
| 	   at the start and at the end (overlapping as needed).  */ | ||||
| 	tbz	count, 3, 1f | ||||
| 	str	val, [dstin] | ||||
| 	str	val, [dstend, -8] | ||||
| 	ret | ||||
| 	.p2align 4
 | ||||
| 	/* Bit 2 set: 4..7 bytes, same scheme with 4-byte stores.  */ | ||||
| 1:	tbz	count, 2, 2f | ||||
| 	str	valw, [dstin] | ||||
| 	str	valw, [dstend, -4] | ||||
| 	ret | ||||
| 	/* 0..3 bytes: nothing for 0; otherwise one byte at the start, | ||||
| 	   plus a halfword ending at dstend when bit 1 is set (2 or 3).  */ | ||||
| 2:	cbz	count, 3f | ||||
| 	strb	valw, [dstin] | ||||
| 	tbz	count, 1, 3f | ||||
| 	strh	valw, [dstend, -2] | ||||
| 3:	ret | ||||
| 
 | ||||
| 	/* Set 16..96 bytes.  */ | ||||
| L(set_medium): | ||||
| 	str	q0, [dstin] | ||||
| 	/* Bit 6 set means 64..96 bytes.  */ | ||||
| 	tbnz	count, 6, L(set96) | ||||
| 	/* 16..63 bytes: first and last 16; with bit 5 set (32..63) also | ||||
| 	   the second 16 and the 16 bytes ending at dstend - 16.  */ | ||||
| 	str	q0, [dstend, -16] | ||||
| 	tbz	count, 5, 1f | ||||
| 	str	q0, [dstin, 16] | ||||
| 	str	q0, [dstend, -32] | ||||
| 1:	ret | ||||
| 
 | ||||
| 	.p2align 4
 | ||||
| 	/* Set 64..96 bytes.  Write 64 bytes from the start and | ||||
| 	   32 bytes from the end.  */ | ||||
| L(set96): | ||||
| 	str	q0, [dstin, 16] | ||||
| 	stp	q0, q0, [dstin, 32] | ||||
| 	stp	q0, q0, [dstend, -32] | ||||
| 	ret | ||||
| 
 | ||||
| 	.p2align 4
 | ||||
| L(set_long): | ||||
| 	/* More than 96 bytes.  dst = dstin rounded down to 16 bytes; the | ||||
| 	   unaligned store to [dstin] covers the leading bytes.  */ | ||||
| 	and	valw, valw, 255 | ||||
| 	bic	dst, dstin, 15 | ||||
| 	str	q0, [dstin] | ||||
| 	/* If count >= 160, compare the fill byte with 0; otherwise ccmp | ||||
| 	   forces the flags to 0000, which reads as "not equal".  So the | ||||
| 	   DC ZVA path is only considered for zero fills of >= 160 bytes.  */ | ||||
| 	cmp	count, 160 | ||||
| 	ccmp	valw, 0, 0, hs | ||||
| 	b.ne	L(no_zva) | ||||
| 
 | ||||
| #ifndef SKIP_ZVA_CHECK | ||||
| 	/* Keep the low five bits of DCZID_EL0 (per the Arm ARM: block | ||||
| 	   size field plus the "DC ZVA prohibited" bit) and require them | ||||
| 	   to equal exactly 4.  */ | ||||
| 	mrs	zva_val, dczid_el0 | ||||
| 	and	zva_val, zva_val, 31 | ||||
| 	cmp	zva_val, 4		/* ZVA size is 64 bytes.  */ | ||||
| 	b.ne	L(no_zva) | ||||
| #endif | ||||
| 	/* Fill up to the next 64-byte boundary with ordinary stores, | ||||
| 	   then round dst down to 64 bytes for the DC ZVA loop.  */ | ||||
| 	str	q0, [dst, 16] | ||||
| 	stp	q0, q0, [dst, 32] | ||||
| 	bic	dst, dst, 63 | ||||
| 	sub	count, dstend, dst	/* Count is now 64 too large.  */ | ||||
| 	sub	count, count, 128	/* Adjust count and bias for loop.  */ | ||||
| 
 | ||||
| 	.p2align 4
 | ||||
| L(zva_loop): | ||||
| 	add	dst, dst, 64 | ||||
| 	dc	zva, dst | ||||
| 	subs	count, count, 64 | ||||
| 	b.hi	L(zva_loop) | ||||
| 	/* Tail: always store the final 64 bytes relative to dstend.  */ | ||||
| 	stp	q0, q0, [dstend, -64] | ||||
| 	stp	q0, q0, [dstend, -32] | ||||
| 	ret | ||||
| 
 | ||||
| L(no_zva): | ||||
| 	sub	count, dstend, dst	/* Count is 16 too large.  */ | ||||
| 	sub	dst, dst, 16		/* Dst is biased by -32.  */ | ||||
| 	sub	count, count, 64 + 16	/* Adjust count and bias for loop.  */ | ||||
| 	/* 64 bytes per iteration.  The second stp is pre-indexed with | ||||
| 	   writeback, so it also advances dst by 64.  */ | ||||
| L(no_zva_loop): | ||||
| 	stp	q0, q0, [dst, 32] | ||||
| 	stp	q0, q0, [dst, 64]! | ||||
| 	subs	count, count, 64 | ||||
| 	b.hi	L(no_zva_loop) | ||||
| 	/* Tail: always store the final 64 bytes relative to dstend.  */ | ||||
| 	stp	q0, q0, [dstend, -64] | ||||
| 	stp	q0, q0, [dstend, -32] | ||||
| 	ret | ||||
| 
 | ||||
| END (__memset_aarch64) | ||||
							
								
								
									
										175
									
								
								libc/intrin/aarch64/stpcpy.S
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										175
									
								
								libc/intrin/aarch64/stpcpy.S
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,175 @@ | |||
| /*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8     -*-│
 | ||||
| │vi: set et ft=asm ts=8 tw=8 fenc=utf-8                                     :vi│ | ||||
| ╚──────────────────────────────────────────────────────────────────────────────╝ | ||||
| │                                                                              │ | ||||
| │  Optimized Routines                                                          │ | ||||
| │  Copyright (c) 1999-2022, Arm Limited.                                       │ | ||||
| │                                                                              │ | ||||
| │  Permission is hereby granted, free of charge, to any person obtaining       │ | ||||
| │  a copy of this software and associated documentation files (the             │ | ||||
| │  "Software"), to deal in the Software without restriction, including         │ | ||||
| │  without limitation the rights to use, copy, modify, merge, publish,         │ | ||||
| │  distribute, sublicense, and/or sell copies of the Software, and to          │ | ||||
| │  permit persons to whom the Software is furnished to do so, subject to       │ | ||||
| │  the following conditions:                                                   │ | ||||
| │                                                                              │ | ||||
| │  The above copyright notice and this permission notice shall be              │ | ||||
| │  included in all copies or substantial portions of the Software.             │ | ||||
| │                                                                              │ | ||||
| │  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │ | ||||
| │  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │ | ||||
| │  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │ | ||||
| │  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │ | ||||
| │  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │ | ||||
| │  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │ | ||||
| │  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │ | ||||
| │                                                                              │ | ||||
| ╚─────────────────────────────────────────────────────────────────────────────*/ | ||||
| #include "libc/intrin/aarch64/asmdefs.h" | ||||
| 
 | ||||
| /* Rename Arm's upstream entry point to the public libc name `stpcpy`.  */ | ||||
| #define __stpcpy_aarch64 stpcpy | ||||
| 
 | ||||
| .ident "\n\ | ||||
| Optimized Routines (MIT License)\n\ | ||||
| Copyright 2022 ARM Limited\n" | ||||
| .include "libc/disclaimer.inc" | ||||
| 
 | ||||
| /* Assumptions: | ||||
|  * | ||||
|  * ARMv8-a, AArch64, Advanced SIMD. | ||||
|  * MTE compatible. | ||||
|  */ | ||||
| 
 | ||||
| /* Arguments: x0 = destination, x1 = source.  `result` is the same | ||||
|    register as `dstin`, so x0 is only overwritten by the final `add` | ||||
|    on each return path.  */ | ||||
| #define dstin		x0 | ||||
| #define srcin		x1 | ||||
| #define result		x0 | ||||
| 
 | ||||
| /* Scratch registers.  Note the deliberate sharing: len and synd are | ||||
|    both x4, tmp and shift are both x5.  */ | ||||
| #define src		x2 | ||||
| #define dst		x3 | ||||
| #define len		x4 | ||||
| #define synd		x4 | ||||
| #define	tmp		x5 | ||||
| #define shift		x5 | ||||
| #define data1		x6 | ||||
| #define dataw1		w6 | ||||
| #define data2		x7 | ||||
| #define dataw2		w7 | ||||
| 
 | ||||
| /* dataq/vdata are views of register 0 and vend/dend of register 2. | ||||
|    dataq2 (q1) shares its register with vhas_nul (v1).  */ | ||||
| #define dataq		q0 | ||||
| #define vdata		v0 | ||||
| #define vhas_nul	v1 | ||||
| #define vend		v2 | ||||
| #define dend		d2 | ||||
| #define dataq2		q1 | ||||
| 
 | ||||
| /* | ||||
|    Core algorithm: | ||||
|    For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits | ||||
|    per byte. We take 4 bits of every comparison byte with shift right and narrow | ||||
|    by 4 instruction. Since the bits in the nibble mask reflect the order in | ||||
|    which things occur in the original string, counting leading zeros identifies | ||||
|    exactly which byte matched.  */ | ||||
| 
 | ||||
| ENTRY (__stpcpy_aarch64) | ||||
| 	PTR_ARG (0) | ||||
| 	PTR_ARG (1) | ||||
| 	/* Check the 16-byte aligned chunk that contains srcin.  A | ||||
| 	   register shift uses the amount modulo 64, so shifting right | ||||
| 	   by srcin * 4 drops the nibbles of the bytes before srcin.  */ | ||||
| 	bic	src, srcin, 15 | ||||
| 	ld1	{vdata.16b}, [src] | ||||
| 	cmeq	vhas_nul.16b, vdata.16b, 0 | ||||
| 	lsl	shift, srcin, 2 | ||||
| 	shrn	vend.8b, vhas_nul.8h, 4 | ||||
| 	fmov	synd, dend | ||||
| 	lsr	synd, synd, shift | ||||
| 	cbnz	synd, L(tail) | ||||
| 
 | ||||
| 	/* No NUL so far: check the next aligned chunk (pre-indexed load, | ||||
| 	   src advances by 16).  */ | ||||
| 	ldr	dataq, [src, 16]! | ||||
| 	cmeq	vhas_nul.16b, vdata.16b, 0 | ||||
| 	shrn	vend.8b, vhas_nul.8h, 4 | ||||
| 	fmov	synd, dend | ||||
| 	cbz	synd, L(start_loop) | ||||
| 
 | ||||
| 	/* NUL is in the second chunk.  len = (src - srcin) + index of | ||||
| 	   the NUL within the chunk, i.e. the string length.  */ | ||||
| #ifndef __AARCH64EB__ | ||||
| 	rbit	synd, synd | ||||
| #endif | ||||
| 	sub	tmp, src, srcin | ||||
| 	clz	len, synd | ||||
| 	add	len, tmp, len, lsr 2 | ||||
| 	tbz	len, 4, L(less16) | ||||
| 	/* Bit 4 set: copy the first 16 bytes plus the 16 bytes whose | ||||
| 	   last byte is the NUL (offset len - 15); the two may overlap. | ||||
| 	   Both loads are issued before either store.  */ | ||||
| 	sub	tmp, len, 15 | ||||
| 	ldr	dataq, [srcin] | ||||
| 	ldr	dataq2, [srcin, tmp] | ||||
| 	str	dataq, [dstin] | ||||
| 	str	dataq2, [dstin, tmp] | ||||
| 	/* Return the address of the NUL written to the destination.  */ | ||||
| 	add	result, dstin, len | ||||
| 	ret | ||||
| 
 | ||||
| L(tail): | ||||
| 	/* NUL is in the first chunk: rbit + clz counts trailing zero | ||||
| 	   bits, and dividing by 4 gives the string length.  */ | ||||
| 	rbit	synd, synd | ||||
| 	clz	len, synd | ||||
| 	lsr	len, len, 2 | ||||
| L(less16): | ||||
| 	/* Bit 3 set: two 8-byte copies, the second one ending on the | ||||
| 	   NUL (offset len - 7).  */ | ||||
| 	tbz	len, 3, L(less8) | ||||
| 	sub	tmp, len, 7 | ||||
| 	ldr	data1, [srcin] | ||||
| 	ldr	data2, [srcin, tmp] | ||||
| 	str	data1, [dstin] | ||||
| 	str	data2, [dstin, tmp] | ||||
| 	add	result, dstin, len | ||||
| 	ret | ||||
| 
 | ||||
| 	.p2align 4
 | ||||
| L(less8): | ||||
| 	/* len >= 3: two 4-byte copies, the second one ending on the | ||||
| 	   NUL (offset len - 3).  Smaller lengths go to L(less4).  */ | ||||
| 	subs	tmp, len, 3 | ||||
| 	b.lo	L(less4) | ||||
| 	ldr	dataw1, [srcin] | ||||
| 	ldr	dataw2, [srcin, tmp] | ||||
| 	str	dataw1, [dstin] | ||||
| 	str	dataw2, [dstin, tmp] | ||||
| 	add	result, dstin, len | ||||
| 	ret | ||||
| 
 | ||||
| L(less4): | ||||
| 	/* len is 0..2.  For 1 or 2 copy a halfword from the start, then | ||||
| 	   in every case store the terminating NUL at offset len.  */ | ||||
| 	cbz	len, L(zerobyte) | ||||
| 	ldrh	dataw1, [srcin] | ||||
| 	strh	dataw1, [dstin] | ||||
| L(zerobyte): | ||||
| 	strb	wzr, [dstin, len] | ||||
| 	add	result, dstin, len | ||||
| 	ret | ||||
| 
 | ||||
| 	.p2align 4
 | ||||
| L(start_loop): | ||||
| 	/* tmp = srcin - dstin, so dst = src - tmp is the destination | ||||
| 	   address that corresponds to the aligned src.  The first 16 | ||||
| 	   source bytes are copied with one unaligned load/store.  */ | ||||
| 	sub	tmp, srcin, dstin | ||||
| 	ldr	dataq2, [srcin] | ||||
| 	sub	dst, src, tmp | ||||
| 	str	dataq2, [dstin] | ||||
| 	/* Two chunks per iteration: store the chunk already known to be | ||||
| 	   NUL-free, then load and test the next one.  */ | ||||
| L(loop): | ||||
| 	/* Post-indexed store: dst advances by 32 after the store.  */ | ||||
| 	str	dataq, [dst], 32 | ||||
| 	ldr	dataq, [src, 16] | ||||
| 	cmeq	vhas_nul.16b, vdata.16b, 0 | ||||
| 	/* Pairwise max folds the compare result into the low 64 bits, | ||||
| 	   non-zero iff the chunk contains a NUL.  */ | ||||
| 	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b | ||||
| 	fmov	synd, dend | ||||
| 	cbnz	synd, L(loopend) | ||||
| 	str	dataq, [dst, -16] | ||||
| 	ldr	dataq, [src, 32]! | ||||
| 	cmeq	vhas_nul.16b, vdata.16b, 0 | ||||
| 	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b | ||||
| 	fmov	synd, dend | ||||
| 	cbz	synd, L(loop) | ||||
| 	/* Exit from the second half: bias dst so that L(loopend) can use | ||||
| 	   the same arithmetic for both exits.  */ | ||||
| 	add	dst, dst, 16 | ||||
| L(loopend): | ||||
| 	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */ | ||||
| 	fmov	synd, dend | ||||
| 	sub	dst, dst, 31 | ||||
| #ifndef __AARCH64EB__ | ||||
| 	rbit	synd, synd | ||||
| #endif | ||||
| 	clz	len, synd | ||||
| 	lsr	len, len, 2 | ||||
| 	/* After the add, dst + 15 is the destination address of the NUL. | ||||
| 	   Copy the 16 bytes ending on it ([dst, tmp] is the matching | ||||
| 	   source address) and return the address of the NUL.  */ | ||||
| 	add	dst, dst, len | ||||
| 	ldr	dataq, [dst, tmp] | ||||
| 	str	dataq, [dst] | ||||
| 	add	result, dst, 15 | ||||
| 	ret | ||||
| 
 | ||||
| END (__stpcpy_aarch64) | ||||
							
								
								
									
										152
									
								
								libc/intrin/aarch64/strchr.S
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										152
									
								
								libc/intrin/aarch64/strchr.S
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,152 @@ | |||
| /*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8     -*-│
 | ||||
| │vi: set et ft=asm ts=8 tw=8 fenc=utf-8                                     :vi│ | ||||
| ╚──────────────────────────────────────────────────────────────────────────────╝ | ||||
| │                                                                              │ | ||||
| │  Optimized Routines                                                          │ | ||||
| │  Copyright (c) 1999-2022, Arm Limited.                                       │ | ||||
| │                                                                              │ | ||||
| │  Permission is hereby granted, free of charge, to any person obtaining       │ | ||||
| │  a copy of this software and associated documentation files (the             │ | ||||
| │  "Software"), to deal in the Software without restriction, including         │ | ||||
| │  without limitation the rights to use, copy, modify, merge, publish,         │ | ||||
| │  distribute, sublicense, and/or sell copies of the Software, and to          │ | ||||
| │  permit persons to whom the Software is furnished to do so, subject to       │ | ||||
| │  the following conditions:                                                   │ | ||||
| │                                                                              │ | ||||
| │  The above copyright notice and this permission notice shall be              │ | ||||
| │  included in all copies or substantial portions of the Software.             │ | ||||
| │                                                                              │ | ||||
| │  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │ | ||||
| │  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │ | ||||
| │  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │ | ||||
| │  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │ | ||||
| │  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │ | ||||
| │  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │ | ||||
| │  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │ | ||||
| │                                                                              │ | ||||
| ╚─────────────────────────────────────────────────────────────────────────────*/ | ||||
| #include "libc/intrin/aarch64/asmdefs.h" | ||||
| 
 | ||||
| #define __strchr_aarch64 strchr | ||||
| 
 | ||||
| .ident "\n\ | ||||
| Optimized Routines (MIT License)\n\ | ||||
| Copyright 2022 ARM Limited\n" | ||||
| .include "libc/disclaimer.inc" | ||||
| 
 | ||||
| /* Assumptions: | ||||
|  * | ||||
|  * ARMv8-a, AArch64 | ||||
|  * Neon Available. | ||||
|  */ | ||||
| 
 | ||||
| /* Arguments and results.  */ | ||||
| #define srcin		x0 | ||||
| #define chrin		w1 | ||||
| 
 | ||||
| #define result		x0 | ||||
| 
 | ||||
| #define src		x2 | ||||
| #define	tmp1		x3 | ||||
| #define wtmp2		w4 | ||||
| #define tmp3		x5 | ||||
| 
 | ||||
| #define vrepchr		v0 | ||||
| #define vdata1		v1 | ||||
| #define vdata2		v2 | ||||
| #define vhas_nul1	v3 | ||||
| #define vhas_nul2	v4 | ||||
| #define vhas_chr1	v5 | ||||
| #define vhas_chr2	v6 | ||||
| #define vrepmask_0	v7 | ||||
| #define vrepmask_c	v16 | ||||
| #define vend1		v17 | ||||
| #define vend2		v18 | ||||
| 
 | ||||
| /* Core algorithm. | ||||
| 
 | ||||
|    For each 32-byte hunk we calculate a 64-bit syndrome value, with | ||||
|    two bits per byte (LSB is always in bits 0 and 1, for both big | ||||
|    and little-endian systems).  For each tuple, bit 0 is set iff | ||||
|    the relevant byte matched the requested character; bit 1 is set
 | ||||
|    iff the relevant byte matched the NUL end of string (we trigger | ||||
|    off bit0 for the special case of looking for NUL).  Since the bits | ||||
|    in the syndrome reflect exactly the order in which things occur | ||||
|    in the original string a count_trailing_zeros() operation will | ||||
|    identify exactly which byte is causing the termination, and why.  */ | ||||
| 
 | ||||
| /* Locals and temporaries.  */ | ||||
| 
 | ||||
| ENTRY (__strchr_aarch64) | ||||
| 	PTR_ARG (0) | ||||
| 	/* Magic constant 0xc0300c03 to allow us to identify which lane | ||||
| 	   matches the requested byte.  Even bits are set if the character | ||||
| 	   matches, odd bits if either the char is NUL or matches.  */ | ||||
| 	mov	wtmp2, 0x0c03 | ||||
| 	movk	wtmp2, 0xc030, lsl 16 | ||||
| 	dup	vrepchr.16b, chrin | ||||
| 	bic	src, srcin, #31		/* Work with aligned 32-byte hunks.  */ | ||||
| 	dup	vrepmask_c.4s, wtmp2 | ||||
| 	ands	tmp1, srcin, #31 | ||||
| 	add	vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */ | ||||
| 	b.eq	L(loop) | ||||
| 
 | ||||
| 	/* Input string is not 32-byte aligned.  Rather than forcing | ||||
| 	   the padding bytes to a safe value, we calculate the syndrome | ||||
| 	   for all the bytes, but then mask off those bits of the | ||||
| 	   syndrome that are related to the padding.  */ | ||||
| 	ld1	{vdata1.16b, vdata2.16b}, [src], #32 | ||||
| 	neg	tmp1, tmp1 | ||||
| 	cmeq	vhas_nul1.16b, vdata1.16b, #0 | ||||
| 	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b | ||||
| 	cmeq	vhas_nul2.16b, vdata2.16b, #0 | ||||
| 	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b | ||||
| 	bif	vhas_nul1.16b, vhas_chr1.16b, vrepmask_0.16b | ||||
| 	bif	vhas_nul2.16b, vhas_chr2.16b, vrepmask_0.16b | ||||
| 	and	vend1.16b, vhas_nul1.16b, vrepmask_c.16b | ||||
| 	and	vend2.16b, vhas_nul2.16b, vrepmask_c.16b | ||||
| 	lsl	tmp1, tmp1, #1 | ||||
| 	addp	vend1.16b, vend1.16b, vend2.16b		// 256->128 | ||||
| 	mov	tmp3, #~0 | ||||
| 	addp	vend1.16b, vend1.16b, vend2.16b		// 128->64 | ||||
| 	lsr	tmp1, tmp3, tmp1 | ||||
| 
 | ||||
| 	mov	tmp3, vend1.d[0] | ||||
| 	bic	tmp1, tmp3, tmp1	// Mask padding bits. | ||||
| 	cbnz	tmp1, L(tail) | ||||
| 
 | ||||
| 	.p2align 4
 | ||||
| L(loop): | ||||
| 	ld1	{vdata1.16b, vdata2.16b}, [src], #32 | ||||
| 	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b | ||||
| 	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b | ||||
| 	cmhs	vhas_nul1.16b, vhas_chr1.16b, vdata1.16b | ||||
| 	cmhs	vhas_nul2.16b, vhas_chr2.16b, vdata2.16b | ||||
| 	orr	vend1.16b, vhas_nul1.16b, vhas_nul2.16b | ||||
| 	umaxp	vend1.16b, vend1.16b, vend1.16b | ||||
| 	mov	tmp1, vend1.d[0] | ||||
| 	cbz	tmp1, L(loop) | ||||
| 
 | ||||
| 	/* Termination condition found.  Now need to establish exactly why | ||||
| 	   we terminated.  */ | ||||
| 	bif	vhas_nul1.16b, vhas_chr1.16b, vrepmask_0.16b | ||||
| 	bif	vhas_nul2.16b, vhas_chr2.16b, vrepmask_0.16b | ||||
| 	and	vend1.16b, vhas_nul1.16b, vrepmask_c.16b | ||||
| 	and	vend2.16b, vhas_nul2.16b, vrepmask_c.16b | ||||
| 	addp	vend1.16b, vend1.16b, vend2.16b		// 256->128 | ||||
| 	addp	vend1.16b, vend1.16b, vend2.16b		// 128->64 | ||||
| 	mov	tmp1, vend1.d[0] | ||||
| L(tail): | ||||
| 	/* Count the trailing zeros, by bit reversing...  */ | ||||
| 	rbit	tmp1, tmp1 | ||||
| 	/* Re-bias source.  */ | ||||
| 	sub	src, src, #32 | ||||
| 	clz	tmp1, tmp1	/* And counting the leading zeros.  */ | ||||
| 	/* Tmp1 is even if the target character was found first.  Otherwise | ||||
| 	   we've found the end of string and we weren't looking for NUL.  */ | ||||
| 	tst	tmp1, #1 | ||||
| 	add	result, src, tmp1, lsr #1 | ||||
| 	csel	result, result, xzr, eq | ||||
| 	ret | ||||
| 
 | ||||
| END (__strchr_aarch64) | ||||
							
								
								
									
										140
									
								
								libc/intrin/aarch64/strchrnul.S
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										140
									
								
								libc/intrin/aarch64/strchrnul.S
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,140 @@ | |||
| /*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8     -*-│
 | ||||
| │vi: set et ft=asm ts=8 tw=8 fenc=utf-8                                     :vi│ | ||||
| ╚──────────────────────────────────────────────────────────────────────────────╝ | ||||
| │                                                                              │ | ||||
| │  Optimized Routines                                                          │ | ||||
| │  Copyright (c) 1999-2022, Arm Limited.                                       │ | ||||
| │                                                                              │ | ||||
| │  Permission is hereby granted, free of charge, to any person obtaining       │ | ||||
| │  a copy of this software and associated documentation files (the             │ | ||||
| │  "Software"), to deal in the Software without restriction, including         │ | ||||
| │  without limitation the rights to use, copy, modify, merge, publish,         │ | ||||
| │  distribute, sublicense, and/or sell copies of the Software, and to          │ | ||||
| │  permit persons to whom the Software is furnished to do so, subject to       │ | ||||
| │  the following conditions:                                                   │ | ||||
| │                                                                              │ | ||||
| │  The above copyright notice and this permission notice shall be              │ | ||||
| │  included in all copies or substantial portions of the Software.             │ | ||||
| │                                                                              │ | ||||
| │  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │ | ||||
| │  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │ | ||||
| │  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │ | ||||
| │  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │ | ||||
| │  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │ | ||||
| │  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │ | ||||
| │  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │ | ||||
| │                                                                              │ | ||||
| ╚─────────────────────────────────────────────────────────────────────────────*/ | ||||
| #include "libc/intrin/aarch64/asmdefs.h" | ||||
| 
 | ||||
| #define __strchrnul_aarch64 strchrnul	/* ENTRY/END below use this name, so the routine is assembled as strchrnul.  */ | ||||
| 
 | ||||
| .ident "\n\ | ||||
| Optimized Routines (MIT License)\n\ | ||||
| Copyright 2022 ARM Limited\n" | ||||
| .include "libc/disclaimer.inc" | ||||
| 
 | ||||
| /* Assumptions: | ||||
|  * | ||||
|  * ARMv8-a, AArch64 | ||||
|  * Neon Available. | ||||
|  */ | ||||
| 
 | ||||
| /* Arguments and results.  */ | ||||
| #define srcin		x0	/* String pointer argument.  */ | ||||
| #define chrin		w1	/* Character argument; only its low byte is replicated.  */ | ||||
| 
 | ||||
| #define result		x0	/* Returned pointer; shares x0 with srcin.  */ | ||||
| 
 | ||||
| #define src		x2	/* Read cursor: srcin rounded down to 32 bytes.  */ | ||||
| #define	tmp1		x3	/* Scratch; holds the syndrome on the way to L(tail).  */ | ||||
| #define wtmp2		w4	/* Holds the 0x40100401 mask constant.  */ | ||||
| #define tmp3		x5	/* Scratch used when masking the first hunk.  */ | ||||
| 
 | ||||
| #define vrepchr		v0	/* chrin replicated into all 16 bytes.  */ | ||||
| #define vdata1		v1	/* First 16 bytes of the current hunk.  */ | ||||
| #define vdata2		v2	/* Second 16 bytes of the current hunk.  */ | ||||
| #define vhas_nul1	v3	/* Per byte: matched chrin or is NUL (first half).  */ | ||||
| #define vhas_nul2	v4	/* Per byte: matched chrin or is NUL (second half).  */ | ||||
| #define vhas_chr1	v5	/* Per byte: equals chrin; reused for masked syndrome bits.  */ | ||||
| #define vhas_chr2	v6	/* Same as vhas_chr1, for the second half.  */ | ||||
| #define vrepmask	v7	/* 0x40100401 replicated into every 32-bit lane.  */ | ||||
| #define vend1		v16	/* Reduction result; its low 64 bits are the syndrome.  */ | ||||
| 
 | ||||
| /* Core algorithm. | ||||
| 
 | ||||
|    For each 32-byte hunk we calculate a 64-bit syndrome value, with | ||||
|    two bits per byte (LSB is always in bits 0 and 1, for both big | ||||
|    and little-endian systems).  For each tuple, bit 0 is set iff | ||||
|    the relevant byte matched the requested character or nul.  Since the | ||||
|    bits in the syndrome reflect exactly the order in which things occur | ||||
|    in the original string a count_trailing_zeros() operation will | ||||
|    identify exactly which byte is causing the termination.  */ | ||||
| 
 | ||||
| /* Locals and temporaries.  */ | ||||
| 
 | ||||
| ENTRY (__strchrnul_aarch64) | ||||
| 	PTR_ARG (0) | ||||
| 	/* Returns in result a pointer to the first byte at or after srcin | ||||
| 	   that equals the low byte of chrin or is NUL, whichever comes first. | ||||
| 	   Magic constant 0x40100401 to allow us to identify which lane | ||||
| 	   matches the termination condition.  Its bytes are 0x01, 0x04, | ||||
| 	   0x10 and 0x40, so the four bytes of a group each keep a different | ||||
| 	   even-numbered bit and can be added together without colliding.  */ | ||||
| 	mov	wtmp2, #0x0401 | ||||
| 	movk	wtmp2, #0x4010, lsl #16 | ||||
| 	dup	vrepchr.16b, chrin | ||||
| 	bic	src, srcin, #31		/* Work with aligned 32-byte hunks.  */ | ||||
| 	dup	vrepmask.4s, wtmp2 | ||||
| 	ands	tmp1, srcin, #31	/* tmp1 = number of padding bytes before srcin.  */ | ||||
| 	b.eq	L(loop)	/* Already aligned: no padding to mask off.  */ | ||||
| 
 | ||||
| 	/* Input string is not 32-byte aligned.  Rather than forcing | ||||
| 	   the padding bytes to a safe value, we calculate the syndrome | ||||
| 	   for all the bytes, but then mask off those bits of the | ||||
| 	   syndrome that are related to the padding.  The mask is built in | ||||
| 	   tmp1 as ~0 shifted right by (64 - 2 * padding), i.e. two low | ||||
| 	   bits set per padding byte (the register shift count is taken | ||||
| 	   modulo 64, so the negated value works directly).  */ | ||||
| 	ld1	{vdata1.16b, vdata2.16b}, [src], #32 | ||||
| 	neg	tmp1, tmp1 | ||||
| 	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b | ||||
| 	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b | ||||
| 	cmhs	vhas_nul1.16b, vhas_chr1.16b, vdata1.16b	/* has_chr >= data (unsigned): true on a match (0xff) or when data is 0.  */ | ||||
| 	cmhs	vhas_nul2.16b, vhas_chr2.16b, vdata2.16b | ||||
| 	and	vhas_chr1.16b, vhas_nul1.16b, vrepmask.16b	/* Keep one distinct bit per byte.  */ | ||||
| 	and	vhas_chr2.16b, vhas_nul2.16b, vrepmask.16b | ||||
| 	lsl	tmp1, tmp1, #1 | ||||
| 	addp	vend1.16b, vhas_chr1.16b, vhas_chr2.16b	// 256->128 | ||||
| 	mov	tmp3, #~0 | ||||
| 	addp	vend1.16b, vend1.16b, vend1.16b		// 128->64 | ||||
| 	lsr	tmp1, tmp3, tmp1 | ||||
| 
 | ||||
| 	mov	tmp3, vend1.d[0] | ||||
| 	bic	tmp1, tmp3, tmp1	// Mask padding bits. | ||||
| 	cbnz	tmp1, L(tail) | ||||
| 
 | ||||
| 	.p2align 4
 | ||||
| L(loop): | ||||
| 	ld1	{vdata1.16b, vdata2.16b}, [src], #32 | ||||
| 	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b | ||||
| 	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b | ||||
| 	cmhs	vhas_nul1.16b, vhas_chr1.16b, vdata1.16b	/* Match or NUL, as in the first hunk.  */ | ||||
| 	cmhs	vhas_nul2.16b, vhas_chr2.16b, vdata2.16b | ||||
| 	orr	vend1.16b, vhas_nul1.16b, vhas_nul2.16b	/* Cheap any-byte-set test; exact position is computed after the loop.  */ | ||||
| 	umaxp	vend1.16b, vend1.16b, vend1.16b | ||||
| 	mov	tmp1, vend1.d[0] | ||||
| 	cbz	tmp1, L(loop) | ||||
| 
 | ||||
| 	/* Termination condition found.  Now need to establish exactly why | ||||
| 	   we terminated.  This rebuilds the two-bits-per-byte syndrome from | ||||
| 	   the vhas_nul masks of the hunk that ended the loop.  */ | ||||
| 	and	vhas_chr1.16b, vhas_nul1.16b, vrepmask.16b | ||||
| 	and	vhas_chr2.16b, vhas_nul2.16b, vrepmask.16b | ||||
| 	addp	vend1.16b, vhas_chr1.16b, vhas_chr2.16b		// 256->128 | ||||
| 	addp	vend1.16b, vend1.16b, vend1.16b		// 128->64 | ||||
| 
 | ||||
| 	mov	tmp1, vend1.d[0] | ||||
| L(tail): | ||||
| 	/* Count the trailing zeros, by bit reversing...  */ | ||||
| 	rbit	tmp1, tmp1 | ||||
| 	/* Re-bias source.  The ld1 above post-incremented src by 32, so | ||||
| 	   step back to the start of the hunk that was just examined.  */ | ||||
| 	sub	src, src, #32 | ||||
| 	clz	tmp1, tmp1	/* ... and counting the leading zeros.  */ | ||||
| 	/* tmp1 is twice the offset into the fragment.  */ | ||||
| 	add	result, src, tmp1, lsr #1 | ||||
| 	ret | ||||
| 
 | ||||
| END (__strchrnul_aarch64) | ||||
							
								
								
									
										214
									
								
								libc/intrin/aarch64/strcmp.S
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										214
									
								
								libc/intrin/aarch64/strcmp.S
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,214 @@ | |||
| /*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8     -*-│
 | ||||
| │vi: set et ft=asm ts=8 tw=8 fenc=utf-8                                     :vi│ | ||||
| ╚──────────────────────────────────────────────────────────────────────────────╝ | ||||
| │                                                                              │ | ||||
| │  Optimized Routines                                                          │ | ||||
| │  Copyright (c) 1999-2022, Arm Limited.                                       │ | ||||
| │                                                                              │ | ||||
| │  Permission is hereby granted, free of charge, to any person obtaining       │ | ||||
| │  a copy of this software and associated documentation files (the             │ | ||||
| │  "Software"), to deal in the Software without restriction, including         │ | ||||
| │  without limitation the rights to use, copy, modify, merge, publish,         │ | ||||
| │  distribute, sublicense, and/or sell copies of the Software, and to          │ | ||||
| │  permit persons to whom the Software is furnished to do so, subject to       │ | ||||
| │  the following conditions:                                                   │ | ||||
| │                                                                              │ | ||||
| │  The above copyright notice and this permission notice shall be              │ | ||||
| │  included in all copies or substantial portions of the Software.             │ | ||||
| │                                                                              │ | ||||
| │  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │ | ||||
| │  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │ | ||||
| │  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │ | ||||
| │  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │ | ||||
| │  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │ | ||||
| │  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │ | ||||
| │  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │ | ||||
| │                                                                              │ | ||||
| ╚─────────────────────────────────────────────────────────────────────────────*/ | ||||
| #include "libc/intrin/aarch64/asmdefs.h" | ||||
| 
 | ||||
| #define __strcmp_aarch64 strcmp	/* ENTRY/END below use this name, so the routine is assembled as strcmp.  */ | ||||
| 
 | ||||
| .ident "\n\ | ||||
| Optimized Routines (MIT License)\n\ | ||||
| Copyright 2022 ARM Limited\n" | ||||
| .include "libc/disclaimer.inc" | ||||
| 
 | ||||
| /* Assumptions: | ||||
|  * | ||||
|  * ARMv8-a, AArch64. | ||||
|  * MTE compatible. | ||||
|  */ | ||||
| 
 | ||||
| #define REP8_01 0x0101010101010101	/* 0x01 in every byte.  */ | ||||
| #define REP8_7f 0x7f7f7f7f7f7f7f7f	/* 0x7f in every byte.  */ | ||||
| 
 | ||||
| #define src1		x0	/* First string; advanced as the compare proceeds.  */ | ||||
| #define src2		x1	/* Second string.  */ | ||||
| #define result		x0	/* Return value; shares x0 with src1.  */ | ||||
| 
 | ||||
| #define data1		x2	/* Current bytes read from the first string.  */ | ||||
| #define data1w		w2	/* 32-bit view of data1, used by the byte loop.  */ | ||||
| #define data2		x3	/* Current bytes read from the second string.  */ | ||||
| #define data2w		w3	/* 32-bit view of data2, used by the byte loop.  */ | ||||
| #define has_nul		x4	/* Non-zero when the examined word contains a NUL byte.  */ | ||||
| #define diff		x5	/* data1 XOR data2.  */ | ||||
| #define off1		x5	/* Shares x5 with diff: aligned src2 cursor minus src1.  */ | ||||
| #define syndrome	x6	/* diff combined with the NUL flags.  */ | ||||
| #define tmp		x6	/* Scratch; shares x6 with syndrome.  */ | ||||
| #define data3		x7	/* 8-byte-aligned word read from src2 on the unaligned path.  */ | ||||
| #define zeroones	x8	/* Holds REP8_01.  */ | ||||
| #define shift		x9	/* Shift count from src2 misalignment, or the clz result in L(end).  */ | ||||
| #define off2		x10	/* src2 - src1 as computed at entry.  */ | ||||
| 
 | ||||
| /* On big-endian early bytes are at MSB and on little-endian LSB. | ||||
|    LS_FW means shifting towards early bytes.  */ | ||||
| #ifdef __AARCH64EB__ | ||||
| # define LS_FW lsl | ||||
| #else | ||||
| # define LS_FW lsr | ||||
| #endif | ||||
| 
 | ||||
| /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 | ||||
|    (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and | ||||
|    can be done in parallel across the entire word. | ||||
|    Since carry propagation makes 0x1 bytes before a NUL byte appear | ||||
|    NUL too in big-endian, byte-reverse the data before the NUL check.  */ | ||||
| 
 | ||||
| 
 | ||||
| ENTRY (__strcmp_aarch64)	/* Compares the strings at src1 and src2; result is the difference of the first differing bytes, 0 if none differ before the NUL.  */ | ||||
| 	PTR_ARG (0) | ||||
| 	PTR_ARG (1) | ||||
| 	sub	off2, src2, src1	/* Lets src2 data be addressed as [src1, off2].  */ | ||||
| 	mov	zeroones, REP8_01 | ||||
| 	and	tmp, src1, 7 | ||||
| 	tst	off2, 7	/* Non-zero low bits: the two pointers can never be 8-byte aligned together.  */ | ||||
| 	b.ne	L(misaligned8) | ||||
| 	cbnz	tmp, L(mutual_align) | ||||
| 
 | ||||
| 	.p2align 4
 | ||||
| 
 | ||||
| L(loop_aligned): | ||||
| 	ldr	data2, [src1, off2] | ||||
| 	ldr	data1, [src1], 8 | ||||
| L(start_realigned): | ||||
| #ifdef __AARCH64EB__ | ||||
| 	rev	tmp, data1 | ||||
| 	sub	has_nul, tmp, zeroones | ||||
| 	orr	tmp, tmp, REP8_7f | ||||
| #else | ||||
| 	sub	has_nul, data1, zeroones | ||||
| 	orr	tmp, data1, REP8_7f | ||||
| #endif | ||||
| 	bics	has_nul, has_nul, tmp	/* Non-zero if NUL terminator.  */ | ||||
| 	ccmp	data1, data2, 0, eq	/* No NUL: compare the words; otherwise force NZCV = 0 (ne).  */ | ||||
| 	b.eq	L(loop_aligned) | ||||
| #ifdef __AARCH64EB__ | ||||
| 	rev	has_nul, has_nul | ||||
| #endif | ||||
| 	eor	diff, data1, data2 | ||||
| 	orr	syndrome, diff, has_nul | ||||
| L(end): | ||||
| #ifndef __AARCH64EB__ | ||||
| 	rev	syndrome, syndrome	/* Byte-reverse so the earliest byte becomes most significant.  */ | ||||
| 	rev	data1, data1 | ||||
| 	rev	data2, data2 | ||||
| #endif | ||||
| 	clz	shift, syndrome | ||||
| 	/* The most-significant-non-zero bit of the syndrome marks either the | ||||
| 	   first bit that is different, or the top bit of the first zero byte. | ||||
| 	   Shifting left now will bring the critical information into the | ||||
| 	   top bits.  */ | ||||
| 	lsl	data1, data1, shift | ||||
| 	lsl	data2, data2, shift | ||||
| 	/* But we need to zero-extend (char is unsigned) the value and then | ||||
| 	   perform a signed 32-bit subtraction.  */ | ||||
| 	lsr	data1, data1, 56 | ||||
| 	sub	result, data1, data2, lsr 56 | ||||
| 	ret | ||||
| 
 | ||||
| 	.p2align 4
 | ||||
| 
 | ||||
| L(mutual_align): | ||||
| 	/* Sources are mutually aligned, but are not currently at an | ||||
| 	   alignment boundary.  Round down the addresses and then mask off | ||||
| 	   the bytes that precede the start point.  The mask sets those | ||||
| 	   leading bytes to 0xff in both words, so they compare equal and | ||||
| 	   are never seen as NUL.  */ | ||||
| 	bic	src1, src1, 7 | ||||
| 	ldr	data2, [src1, off2] | ||||
| 	ldr	data1, [src1], 8 | ||||
| 	neg	shift, src2, lsl 3	/* Bits to alignment -64.  */ | ||||
| 	mov	tmp, -1 | ||||
| 	LS_FW	tmp, tmp, shift | ||||
| 	orr	data1, data1, tmp | ||||
| 	orr	data2, data2, tmp | ||||
| 	b	L(start_realigned) | ||||
| 
 | ||||
| L(misaligned8): | ||||
| 	/* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always | ||||
| 	   checking to make sure that we don't access beyond the end of SRC2.  */ | ||||
| 	cbz	tmp, L(src1_aligned) | ||||
| L(do_misaligned): | ||||
| 	ldrb	data1w, [src1], 1 | ||||
| 	ldrb	data2w, [src2], 1 | ||||
| 	cmp	data1w, 0 | ||||
| 	ccmp	data1w, data2w, 0, ne	/* NZCV = 0b0000.  */ | ||||
| 	b.ne	L(done)	/* Taken on a NUL in src1 or on a byte mismatch.  */ | ||||
| 	tst	src1, 7 | ||||
| 	b.ne	L(do_misaligned) | ||||
| 
 | ||||
| L(src1_aligned): | ||||
| 	neg	shift, src2, lsl 3 | ||||
| 	bic	src2, src2, 7 | ||||
| 	ldr	data3, [src2], 8	/* Aligned word containing the current src2 position.  */ | ||||
| #ifdef __AARCH64EB__ | ||||
| 	rev	data3, data3 | ||||
| #endif | ||||
| 	lsr	tmp, zeroones, shift	/* 0x01 in each byte that precedes the src2 position.  */ | ||||
| 	orr	data3, data3, tmp	/* Make those bytes non-zero so they are not taken for NUL.  */ | ||||
| 	sub	has_nul, data3, zeroones | ||||
| 	orr	tmp, data3, REP8_7f | ||||
| 	bics	has_nul, has_nul, tmp | ||||
| 	b.ne	L(tail) | ||||
| 
 | ||||
| 	sub	off1, src2, src1	/* src2 now points at its next aligned word.  */ | ||||
| 
 | ||||
| 	.p2align 4
 | ||||
| 
 | ||||
| L(loop_unaligned): | ||||
| 	ldr	data3, [src1, off1]	/* Aligned src2 word, checked for NUL.  */ | ||||
| 	ldr	data2, [src1, off2]	/* Unaligned src2 bytes that pair with data1.  */ | ||||
| #ifdef __AARCH64EB__ | ||||
| 	rev	data3, data3 | ||||
| #endif | ||||
| 	sub	has_nul, data3, zeroones | ||||
| 	orr	tmp, data3, REP8_7f | ||||
| 	ldr	data1, [src1], 8 | ||||
| 	bics	has_nul, has_nul, tmp | ||||
| 	ccmp	data1, data2, 0, eq	/* Same idiom as the aligned loop: compare only when no NUL was seen.  */ | ||||
| 	b.eq	L(loop_unaligned) | ||||
| 
 | ||||
| 	lsl	tmp, has_nul, shift	/* Keep only the NUL flags for data3 bytes that overlap data2.  */ | ||||
| #ifdef __AARCH64EB__ | ||||
| 	rev	tmp, tmp | ||||
| #endif | ||||
| 	eor	diff, data1, data2 | ||||
| 	orr	syndrome, diff, tmp | ||||
| 	cbnz	syndrome, L(end) | ||||
| L(tail): | ||||
| 	ldr	data1, [src1] | ||||
| 	neg	shift, shift | ||||
| 	lsr	data2, data3, shift	/* Rebuild the src2 bytes from the aligned word already loaded.  */ | ||||
| 	lsr	has_nul, has_nul, shift | ||||
| #ifdef __AARCH64EB__ | ||||
| 	rev     data2, data2 | ||||
| 	rev	has_nul, has_nul | ||||
| #endif | ||||
| 	eor	diff, data1, data2 | ||||
| 	orr	syndrome, diff, has_nul | ||||
| 	b	L(end) | ||||
| 
 | ||||
| L(done): | ||||
| 	sub	result, data1, data2	/* Difference of the two bytes loaded by the byte loop.  */ | ||||
| 	ret | ||||
| 
 | ||||
| END (__strcmp_aarch64) | ||||
							
								
								
									
										170
									
								
								libc/intrin/aarch64/strcpy.S
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										170
									
								
								libc/intrin/aarch64/strcpy.S
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,170 @@ | |||
| /*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8     -*-│
 | ||||
| │vi: set et ft=asm ts=8 tw=8 fenc=utf-8                                     :vi│ | ||||
| ╚──────────────────────────────────────────────────────────────────────────────╝ | ||||
| │                                                                              │ | ||||
| │  Optimized Routines                                                          │ | ||||
| │  Copyright (c) 1999-2022, Arm Limited.                                       │ | ||||
| │                                                                              │ | ||||
| │  Permission is hereby granted, free of charge, to any person obtaining       │ | ||||
| │  a copy of this software and associated documentation files (the             │ | ||||
| │  "Software"), to deal in the Software without restriction, including         │ | ||||
| │  without limitation the rights to use, copy, modify, merge, publish,         │ | ||||
| │  distribute, sublicense, and/or sell copies of the Software, and to          │ | ||||
| │  permit persons to whom the Software is furnished to do so, subject to       │ | ||||
| │  the following conditions:                                                   │ | ||||
| │                                                                              │ | ||||
| │  The above copyright notice and this permission notice shall be              │ | ||||
| │  included in all copies or substantial portions of the Software.             │ | ||||
| │                                                                              │ | ||||
| │  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │ | ||||
| │  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │ | ||||
| │  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │ | ||||
| │  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │ | ||||
| │  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │ | ||||
| │  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │ | ||||
| │  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │ | ||||
| │                                                                              │ | ||||
| ╚─────────────────────────────────────────────────────────────────────────────*/ | ||||
| #include "libc/intrin/aarch64/asmdefs.h" | ||||
| 
 | ||||
| #define __strcpy_aarch64 strcpy	/* ENTRY/END below use this name, so the routine is assembled as strcpy.  */ | ||||
| 
 | ||||
| .ident "\n\ | ||||
| Optimized Routines (MIT License)\n\ | ||||
| Copyright 2022 ARM Limited\n" | ||||
| .include "libc/disclaimer.inc" | ||||
| 
 | ||||
| /* Assumptions: | ||||
|  * | ||||
|  * ARMv8-a, AArch64, Advanced SIMD. | ||||
|  * MTE compatible. | ||||
|  */ | ||||
| 
 | ||||
| #define dstin		x0	/* Destination pointer argument.  */ | ||||
| #define srcin		x1	/* Source pointer argument.  */ | ||||
| #define result		x0	/* Shares x0 with dstin, which the routine never modifies.  */ | ||||
| 
 | ||||
| #define src		x2	/* 16-byte-aligned read cursor.  */ | ||||
| #define dst		x3	/* Write cursor used by the main loop.  */ | ||||
| #define len		x4	/* Offset of the NUL byte.  */ | ||||
| #define synd		x4	/* NUL syndrome; shares x4 with len.  */ | ||||
| #define	tmp		x5	/* Scratch.  */ | ||||
| #define shift		x5	/* Shares x5 with tmp.  */ | ||||
| #define data1		x6	/* Scalar copy temporaries for the short-string paths.  */ | ||||
| #define dataw1		w6 | ||||
| #define data2		x7 | ||||
| #define dataw2		w7 | ||||
| 
 | ||||
| #define dataq		q0	/* q view of vector register 0 (same register as vdata).  */ | ||||
| #define vdata		v0 | ||||
| #define vhas_nul	v1	/* Per-byte NUL mask; same register as dataq2.  */ | ||||
| #define vend		v2	/* Reduced mask.  */ | ||||
| #define dend		d2	/* Low 64 bits of vend.  */ | ||||
| #define dataq2		q1	/* Second copy temporary; same register as vhas_nul.  */ | ||||
| 
 | ||||
| /* | ||||
|    Core algorithm: | ||||
|    For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits | ||||
|    per byte. We take 4 bits of every comparison byte with a shift-right-and- | ||||
|    narrow-by-4 (shrn) instruction. Since the bits in the nibble mask reflect the | ||||
|    order in which things occur in the original string, counting leading zeros | ||||
|    identifies exactly which byte matched. The code below does this with clz, | ||||
|    preceded by rbit wherever the first byte sits in the low bits of the mask.  */ | ||||
| 
 | ||||
| ENTRY (__strcpy_aarch64)	/* Copies the string at srcin, including its NUL, to dstin; x0 (dstin/result) is left unchanged.  */ | ||||
| 	PTR_ARG (0) | ||||
| 	PTR_ARG (1) | ||||
| 	bic	src, srcin, 15 | ||||
| 	ld1	{vdata.16b}, [src]	/* Aligned 16-byte chunk containing srcin.  */ | ||||
| 	cmeq	vhas_nul.16b, vdata.16b, 0 | ||||
| 	lsl	shift, srcin, 2	/* Four mask bits per byte; lsr below uses this modulo 64.  */ | ||||
| 	shrn	vend.8b, vhas_nul.8h, 4 | ||||
| 	fmov	synd, dend | ||||
| 	lsr	synd, synd, shift	/* Drop mask bits for bytes before srcin.  */ | ||||
| 	cbnz	synd, L(tail) | ||||
| 
 | ||||
| 	ldr	dataq, [src, 16]!	/* Next aligned chunk.  */ | ||||
| 	cmeq	vhas_nul.16b, vdata.16b, 0 | ||||
| 	shrn	vend.8b, vhas_nul.8h, 4 | ||||
| 	fmov	synd, dend | ||||
| 	cbz	synd, L(start_loop) | ||||
| 
 | ||||
| #ifndef __AARCH64EB__ | ||||
| 	rbit	synd, synd | ||||
| #endif | ||||
| 	sub	tmp, src, srcin | ||||
| 	clz	len, synd | ||||
| 	add	len, tmp, len, lsr 2	/* len = offset of the NUL from srcin.  */ | ||||
| 	tbz	len, 4, L(less16) | ||||
| 	sub	tmp, len, 15 | ||||
| 	ldr	dataq, [srcin]	/* Two 16-byte copies that may overlap; the second one ends on the NUL.  */ | ||||
| 	ldr	dataq2, [srcin, tmp] | ||||
| 	str	dataq, [dstin] | ||||
| 	str	dataq2, [dstin, tmp] | ||||
| 	ret | ||||
| 
 | ||||
| L(tail): | ||||
| 	rbit	synd, synd | ||||
| 	clz	len, synd | ||||
| 	lsr	len, len, 2	/* len = offset of the NUL from srcin.  */ | ||||
| L(less16): | ||||
| 	tbz	len, 3, L(less8) | ||||
| 	sub	tmp, len, 7 | ||||
| 	ldr	data1, [srcin]	/* Two 8-byte copies that may overlap; the second one ends on the NUL.  */ | ||||
| 	ldr	data2, [srcin, tmp] | ||||
| 	str	data1, [dstin] | ||||
| 	str	data2, [dstin, tmp] | ||||
| 	ret | ||||
| 
 | ||||
| 	.p2align 4
 | ||||
| L(less8): | ||||
| 	subs	tmp, len, 3 | ||||
| 	b.lo	L(less4)	/* len is below 3.  */ | ||||
| 	ldr	dataw1, [srcin]	/* Two 4-byte copies that may overlap; the second one ends on the NUL.  */ | ||||
| 	ldr	dataw2, [srcin, tmp] | ||||
| 	str	dataw1, [dstin] | ||||
| 	str	dataw2, [dstin, tmp] | ||||
| 	ret | ||||
| 
 | ||||
| L(less4): | ||||
| 	cbz	len, L(zerobyte) | ||||
| 	ldrh	dataw1, [srcin]	/* len is 1 or 2 here: copy the first two bytes.  */ | ||||
| 	strh	dataw1, [dstin] | ||||
| L(zerobyte): | ||||
| 	strb	wzr, [dstin, len]	/* Write the terminating NUL.  */ | ||||
| 	ret | ||||
| 
 | ||||
| 	.p2align 4
 | ||||
| L(start_loop): | ||||
| 	sub	tmp, srcin, dstin | ||||
| 	ldr	dataq2, [srcin]	/* Copy the first 16 bytes from the unaligned start.  */ | ||||
| 	sub	dst, src, tmp	/* Destination address that corresponds to src.  */ | ||||
| 	str	dataq2, [dstin] | ||||
| L(loop): | ||||
| 	str	dataq, [dst], 32	/* Store the chunk already known to hold no NUL.  */ | ||||
| 	ldr	dataq, [src, 16] | ||||
| 	cmeq	vhas_nul.16b, vdata.16b, 0 | ||||
| 	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b | ||||
| 	fmov	synd, dend | ||||
| 	cbnz	synd, L(loopend) | ||||
| 	str	dataq, [dst, -16] | ||||
| 	ldr	dataq, [src, 32]! | ||||
| 	cmeq	vhas_nul.16b, vdata.16b, 0 | ||||
| 	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b | ||||
| 	fmov	synd, dend | ||||
| 	cbz	synd, L(loop) | ||||
| 	add	dst, dst, 16	/* Make dst - 16 address the chunk holding the NUL, as on the other exit.  */ | ||||
| L(loopend): | ||||
| 	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */ | ||||
| 	fmov	synd, dend | ||||
| 	sub	dst, dst, 31 | ||||
| #ifndef __AARCH64EB__ | ||||
| 	rbit	synd, synd | ||||
| #endif | ||||
| 	clz	len, synd | ||||
| 	lsr	len, len, 2 | ||||
| 	add	dst, dst, len	/* dst now addresses the 16 bytes that end on the NUL.  */ | ||||
| 	ldr	dataq, [dst, tmp]	/* tmp is still srcin - dstin, so this reads the matching source bytes.  */ | ||||
| 	str	dataq, [dst] | ||||
| 	ret | ||||
| 
 | ||||
| END (__strcpy_aarch64) | ||||
							
								
								
									
										220
									
								
								libc/intrin/aarch64/strlen.S
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										220
									
								
								libc/intrin/aarch64/strlen.S
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,220 @@ | |||
| /*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8     -*-│
 | ||||
| │vi: set et ft=asm ts=8 tw=8 fenc=utf-8                                     :vi│ | ||||
| ╚──────────────────────────────────────────────────────────────────────────────╝ | ||||
| │                                                                              │ | ||||
| │  Optimized Routines                                                          │ | ||||
| │  Copyright (c) 1999-2022, Arm Limited.                                       │ | ||||
| │                                                                              │ | ||||
| │  Permission is hereby granted, free of charge, to any person obtaining       │ | ||||
| │  a copy of this software and associated documentation files (the             │ | ||||
| │  "Software"), to deal in the Software without restriction, including         │ | ||||
| │  without limitation the rights to use, copy, modify, merge, publish,         │ | ||||
| │  distribute, sublicense, and/or sell copies of the Software, and to          │ | ||||
| │  permit persons to whom the Software is furnished to do so, subject to       │ | ||||
| │  the following conditions:                                                   │ | ||||
| │                                                                              │ | ||||
| │  The above copyright notice and this permission notice shall be              │ | ||||
| │  included in all copies or substantial portions of the Software.             │ | ||||
| │                                                                              │ | ||||
| │  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │ | ||||
| │  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │ | ||||
| │  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │ | ||||
| │  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │ | ||||
| │  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │ | ||||
| │  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │ | ||||
| │  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │ | ||||
| │                                                                              │ | ||||
| ╚─────────────────────────────────────────────────────────────────────────────*/ | ||||
| #include "libc/intrin/aarch64/asmdefs.h" | ||||
| 
 | ||||
| #define __strlen_aarch64 strlen	/* ENTRY/END below use this name, so the routine is assembled as strlen.  */ | ||||
| 
 | ||||
| .ident "\n\ | ||||
| Optimized Routines (MIT License)\n\ | ||||
| Copyright 2022 ARM Limited\n" | ||||
| .include "libc/disclaimer.inc" | ||||
| 
 | ||||
| /* Assumptions: | ||||
|  * | ||||
|  * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses. | ||||
|  * Not MTE compatible. | ||||
|  */ | ||||
| 
 | ||||
| #define srcin	x0	/* String pointer argument.  */ | ||||
| #define len	x0	/* Returned length; shares x0 with srcin.  */ | ||||
| 
 | ||||
| #define src	x1	/* 32-byte-aligned read cursor.  */ | ||||
| #define data1	x2	/* First 8-byte word of a 16-byte pair.  */ | ||||
| #define data2	x3	/* Second 8-byte word of a 16-byte pair.  */ | ||||
| #define has_nul1 x4	/* NUL flags for data1.  */ | ||||
| #define has_nul2 x5	/* NUL flags for data2.  */ | ||||
| #define tmp1	x4	/* Shares x4 with has_nul1.  */ | ||||
| #define tmp2	x5	/* Shares x5 with has_nul2.  */ | ||||
| #define tmp3	x6 | ||||
| #define tmp4	x7 | ||||
| #define zeroones x8	/* Holds REP8_01.  */ | ||||
| 
 | ||||
| #define maskv	v0	/* Vector mask and reduction register.  */ | ||||
| #define maskd	d0	/* Low 64 bits of maskv.  */ | ||||
| #define dataq1	q1	/* q view of datav1.  */ | ||||
| #define dataq2	q2	/* q view of datav2.  */ | ||||
| #define datav1	v1 | ||||
| #define datav2	v2 | ||||
| #define tmp	x2	/* Shares x2 with data1.  */ | ||||
| #define tmpw	w2 | ||||
| #define synd	x3	/* Shares x3 with data2.  */ | ||||
| #define syndw	w3 | ||||
| #define shift	x4	/* Shares x4 with tmp1 and has_nul1.  */ | ||||
| 
 | ||||
| /* For the first 32 bytes, NUL detection works on the principle that | ||||
|    (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero if a | ||||
|    byte is zero, and can be done in parallel across the entire word.  */ | ||||
| 
 | ||||
| #define REP8_01 0x0101010101010101 | ||||
| #define REP8_7f 0x7f7f7f7f7f7f7f7f | ||||
| 
 | ||||
| /* To test the page crossing code path more thoroughly, compile with | ||||
|    -DTEST_PAGE_CROSS - this will force all calls through the slower | ||||
|    entry path.  This option is not intended for production use.  */ | ||||
| 
 | ||||
| #ifdef TEST_PAGE_CROSS | ||||
| # define MIN_PAGE_SIZE 32 | ||||
| #else | ||||
| # define MIN_PAGE_SIZE 4096 | ||||
| #endif | ||||
| 
 | ||||
| /* Core algorithm: | ||||
| 
 | ||||
|    Since strings are short on average, we check the first 32 bytes of the | ||||
|    string for a NUL character without aligning the string.  In order to use | ||||
|    unaligned loads safely we must do a page cross check first. | ||||
| 
 | ||||
|    If there is a NUL byte we calculate the length from the 2 8-byte words | ||||
|    using conditional select to reduce branch mispredictions (it is unlikely | ||||
|    strlen will be repeatedly called on strings with the same length). | ||||
| 
 | ||||
|    If the string is longer than 32 bytes, align src so we don't need further | ||||
|    page cross checks, and process 32 bytes per iteration using a fast SIMD | ||||
|    loop. | ||||
| 
 | ||||
|    If the page cross check fails, we read 32 bytes from an aligned address, | ||||
|    and ignore any characters before the string.  If it contains a NUL | ||||
|    character, return the length, if not, continue in the main loop.  */ | ||||
| 
 | ||||
| ENTRY (__strlen_aarch64)	/* Returns in len the number of bytes before the first NUL byte at srcin.  */ | ||||
| 	PTR_ARG (0) | ||||
| 	and	tmp1, srcin, MIN_PAGE_SIZE - 1 | ||||
| 	cmp	tmp1, MIN_PAGE_SIZE - 32 | ||||
| 	b.hi	L(page_cross)	/* A 32-byte read from srcin would run past the MIN_PAGE_SIZE boundary.  */ | ||||
| 
 | ||||
| 	/* Look for a NUL byte in the first 16 bytes.  */ | ||||
| 	ldp	data1, data2, [srcin] | ||||
| 	mov	zeroones, REP8_01 | ||||
| 
 | ||||
| #ifdef __AARCH64EB__ | ||||
| 	/* For big-endian, carry propagation (if the final byte in the | ||||
| 	   string is 0x01) means we cannot use has_nul1/2 directly. | ||||
| 	   Since we expect strings to be small and early-exit, | ||||
| 	   byte-swap the data now so has_nul1/2 will be correct.  */ | ||||
| 	rev	data1, data1 | ||||
| 	rev	data2, data2 | ||||
| #endif | ||||
| 	sub	tmp1, data1, zeroones | ||||
| 	orr	tmp2, data1, REP8_7f | ||||
| 	sub	tmp3, data2, zeroones | ||||
| 	orr	tmp4, data2, REP8_7f | ||||
| 	bics	has_nul1, tmp1, tmp2	/* Z is set when data1 holds no NUL.  */ | ||||
| 	bic	has_nul2, tmp3, tmp4 | ||||
| 	ccmp	has_nul2, 0, 0, eq	/* data1 clean: test has_nul2; otherwise force NZCV = 0.  */ | ||||
| 	b.eq	L(bytes16_31) | ||||
| 
 | ||||
| 	/* Find the exact offset of the first NUL byte in the first 16 bytes | ||||
| 	   from the string start.  Enter with C = has_nul1 == 0.  */ | ||||
| 	csel	has_nul1, has_nul1, has_nul2, cc | ||||
| 	mov	len, 8 | ||||
| 	rev	has_nul1, has_nul1	/* So that clz finds the earliest flagged byte.  */ | ||||
| 	csel	len, xzr, len, cc	/* Base offset: 0 if the NUL is in data1, else 8.  */ | ||||
| 	clz	tmp1, has_nul1 | ||||
| 	add	len, len, tmp1, lsr 3	/* Eight bits per byte.  */ | ||||
| 	ret | ||||
| 
 | ||||
| 	/* Look for a NUL byte at offset 16..31 in the string.  */ | ||||
| L(bytes16_31): | ||||
| 	ldp	data1, data2, [srcin, 16] | ||||
| #ifdef __AARCH64EB__ | ||||
| 	rev	data1, data1 | ||||
| 	rev	data2, data2 | ||||
| #endif | ||||
| 	sub	tmp1, data1, zeroones | ||||
| 	orr	tmp2, data1, REP8_7f | ||||
| 	sub	tmp3, data2, zeroones | ||||
| 	orr	tmp4, data2, REP8_7f | ||||
| 	bics	has_nul1, tmp1, tmp2 | ||||
| 	bic	has_nul2, tmp3, tmp4 | ||||
| 	ccmp	has_nul2, 0, 0, eq | ||||
| 	b.eq	L(loop_entry) | ||||
| 
 | ||||
| 	/* Find the exact offset of the first NUL byte at offset 16..31 from | ||||
| 	   the string start.  Enter with C = has_nul1 == 0.  */ | ||||
| 	csel	has_nul1, has_nul1, has_nul2, cc | ||||
| 	mov	len, 24 | ||||
| 	rev	has_nul1, has_nul1 | ||||
| 	mov	tmp3, 16 | ||||
| 	clz	tmp1, has_nul1 | ||||
| 	csel	len, tmp3, len, cc	/* Base offset: 16 if the NUL is in data1, else 24.  */ | ||||
| 	add	len, len, tmp1, lsr 3 | ||||
| 	ret | ||||
| 
 | ||||
| 	nop | ||||
| L(loop_entry): | ||||
| 	bic	src, srcin, 31	/* The loop pre-increments by 32, so its first read starts above srcin.  */ | ||||
| 
 | ||||
| 	.p2align 5
 | ||||
| L(loop): | ||||
| 	ldp	dataq1, dataq2, [src, 32]! | ||||
| 	uminp	maskv.16b, datav1.16b, datav2.16b	/* Pairwise minimum: a zero byte anywhere leaves a zero.  */ | ||||
| 	uminp	maskv.16b, maskv.16b, maskv.16b | ||||
| 	cmeq	maskv.8b, maskv.8b, 0 | ||||
| 	fmov	synd, maskd | ||||
| 	cbz	synd, L(loop) | ||||
| 
 | ||||
| 	/* Low 32 bits of synd are non-zero if a NUL was found in datav1.  */ | ||||
| 	cmeq	maskv.16b, datav1.16b, 0 | ||||
| 	sub	len, src, srcin | ||||
| 	cbnz	syndw, 1f | ||||
| 	cmeq	maskv.16b, datav2.16b, 0	/* NUL is in the second 16 bytes instead.  */ | ||||
| 	add	len, len, 16 | ||||
| 1: | ||||
| 	/* Generate a bitmask and compute correct byte offset.  */ | ||||
| 	shrn	maskv.8b, maskv.8h, 4	/* Four mask bits per byte.  */ | ||||
| 	fmov	synd, maskd | ||||
| #ifndef __AARCH64EB__ | ||||
| 	rbit	synd, synd | ||||
| #endif | ||||
| 	clz	tmp, synd | ||||
| 	add	len, len, tmp, lsr 2 | ||||
| 	ret | ||||
| 
 | ||||
| L(page_cross): | ||||
| 	bic	src, srcin, 31 | ||||
| 	mov	tmpw, 0x0c03	/* Builds 0xc0300c03: two distinct bits for each byte of a group of four.  */ | ||||
| 	movk	tmpw, 0xc030, lsl 16 | ||||
| 	ld1	{datav1.16b, datav2.16b}, [src]	/* Aligned 32-byte read containing srcin.  */ | ||||
| 	dup	maskv.4s, tmpw | ||||
| 	cmeq	datav1.16b, datav1.16b, 0 | ||||
| 	cmeq	datav2.16b, datav2.16b, 0 | ||||
| 	and	datav1.16b, datav1.16b, maskv.16b | ||||
| 	and	datav2.16b, datav2.16b, maskv.16b | ||||
| 	addp	maskv.16b, datav1.16b, datav2.16b | ||||
| 	addp	maskv.16b, maskv.16b, maskv.16b	/* Low 64 bits now hold two bits per source byte.  */ | ||||
| 	fmov	synd, maskd | ||||
| 	lsl	shift, srcin, 1	/* Two bits per byte; lsr below uses this modulo 64.  */ | ||||
| 	lsr	synd, synd, shift	/* Drop bits for bytes before srcin.  */ | ||||
| 	cbz	synd, L(loop) | ||||
| 
 | ||||
| 	rbit	synd, synd | ||||
| 	clz	len, synd | ||||
| 	lsr	len, len, 1 | ||||
| 	ret | ||||
| 
 | ||||
| END (__strlen_aarch64) | ||||
							
								
								
									
										334
									
								
								libc/intrin/aarch64/strncmp.S
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										334
									
								
								libc/intrin/aarch64/strncmp.S
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,334 @@ | |||
| /*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8     -*-│
 | ||||
| │vi: set et ft=asm ts=8 tw=8 fenc=utf-8                                     :vi│ | ||||
| ╚──────────────────────────────────────────────────────────────────────────────╝ | ||||
| │                                                                              │ | ||||
| │  Optimized Routines                                                          │ | ||||
| │  Copyright (c) 1999-2022, Arm Limited.                                       │ | ||||
| │                                                                              │ | ||||
| │  Permission is hereby granted, free of charge, to any person obtaining       │ | ||||
| │  a copy of this software and associated documentation files (the             │ | ||||
| │  "Software"), to deal in the Software without restriction, including         │ | ||||
| │  without limitation the rights to use, copy, modify, merge, publish,         │ | ||||
| │  distribute, sublicense, and/or sell copies of the Software, and to          │ | ||||
| │  permit persons to whom the Software is furnished to do so, subject to       │ | ||||
| │  the following conditions:                                                   │ | ||||
| │                                                                              │ | ||||
| │  The above copyright notice and this permission notice shall be              │ | ||||
| │  included in all copies or substantial portions of the Software.             │ | ||||
| │                                                                              │ | ||||
| │  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │ | ||||
| │  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │ | ||||
| │  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │ | ||||
| │  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │ | ||||
| │  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │ | ||||
| │  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │ | ||||
| │  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │ | ||||
| │                                                                              │ | ||||
| ╚─────────────────────────────────────────────────────────────────────────────*/ | ||||
| #include "libc/intrin/aarch64/asmdefs.h" | ||||
| 
 | ||||
| #define __strncmp_aarch64 strncmp | ||||
| 
 | ||||
| .ident "\n\ | ||||
| Optimized Routines (MIT License)\n\ | ||||
| Copyright 2022 ARM Limited\n" | ||||
| .include "libc/disclaimer.inc" | ||||
| 
 | ||||
| /* Assumptions: | ||||
|  * | ||||
|  * ARMv8-a, AArch64. | ||||
|  * MTE compatible. | ||||
|  */ | ||||
| 
 | ||||
| #define REP8_01 0x0101010101010101 | ||||
| #define REP8_7f 0x7f7f7f7f7f7f7f7f | ||||
| 
 | ||||
| /* Parameters and result.  */ | ||||
| #define src1		x0 | ||||
| #define src2		x1 | ||||
| #define limit		x2 | ||||
| #define result		x0 | ||||
| 
 | ||||
| /* Internal variables.  */ | ||||
| #define data1		x3 | ||||
| #define data1w		w3 | ||||
| #define data2		x4 | ||||
| #define data2w		w4 | ||||
| #define has_nul		x5 | ||||
| #define diff		x6 | ||||
| #define syndrome	x7 | ||||
| #define tmp1		x8 | ||||
| #define tmp2		x9 | ||||
| #define tmp3		x10 | ||||
| #define zeroones	x11 | ||||
| #define pos		x12 | ||||
| #define mask		x13 | ||||
| #define endloop		x14 | ||||
| #define count		mask | ||||
| #define offset		pos | ||||
| #define neg_offset	x15 | ||||
| 
 | ||||
| /* Define endian dependent shift operations. | ||||
|    On big-endian early bytes are at MSB and on little-endian LSB. | ||||
|    LS_FW means shifting towards early bytes. | ||||
|    LS_BK means shifting towards later bytes. | ||||
|    */ | ||||
| #ifdef __AARCH64EB__ | ||||
| #define LS_FW lsl | ||||
| #define LS_BK lsr | ||||
| #else | ||||
| #define LS_FW lsr | ||||
| #define LS_BK lsl | ||||
| #endif | ||||
| 
 | ||||
| ENTRY (__strncmp_aarch64)	/* strncmp (src1, src2, limit); result is returned in x0.  */ | ||||
| 	PTR_ARG (0) | ||||
| 	PTR_ARG (1) | ||||
| 	SIZE_ARG (2) | ||||
| 	cbz	limit, L(ret0)		/* limit == 0: return 0.  */ | ||||
| 	eor	tmp1, src1, src2 | ||||
| 	mov	zeroones, #REP8_01 | ||||
| 	tst	tmp1, #7 | ||||
| 	and	count, src1, #7 | ||||
| 	b.ne	L(misaligned8)		/* Pointers differ in their low 3 bits.  */ | ||||
| 	cbnz	count, L(mutual_align)	/* Same low bits, but not 8-byte aligned.  */ | ||||
| 
 | ||||
| 	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80 | ||||
| 	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and | ||||
| 	   can be done in parallel across the entire word.  */ | ||||
| 	.p2align 4
 | ||||
| L(loop_aligned): | ||||
| 	ldr	data1, [src1], #8 | ||||
| 	ldr	data2, [src2], #8 | ||||
| L(start_realigned): | ||||
| 	subs	limit, limit, #8 | ||||
| 	sub	tmp1, data1, zeroones | ||||
| 	orr	tmp2, data1, #REP8_7f | ||||
| 	eor	diff, data1, data2	/* Non-zero if differences found.  */ | ||||
| 	csinv	endloop, diff, xzr, hi	/* Last Dword or differences.  */ | ||||
| 	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */ | ||||
| 	ccmp	endloop, #0, #0, eq | ||||
| 	b.eq	L(loop_aligned) | ||||
| 	/* End of main loop */ | ||||
| 
 | ||||
| L(full_check): | ||||
| #ifndef __AARCH64EB__ | ||||
| 	orr	syndrome, diff, has_nul | ||||
| 	add	limit, limit, 8	/* Rewind limit to before last subs. */ | ||||
| L(syndrome_check): | ||||
| 	/* Limit was reached. Check if the NUL byte or the difference | ||||
| 	   is before the limit. */ | ||||
| 	rev	syndrome, syndrome | ||||
| 	rev	data1, data1 | ||||
| 	clz	pos, syndrome | ||||
| 	rev	data2, data2 | ||||
| 	lsl	data1, data1, pos | ||||
| 	cmp	limit, pos, lsr #3 | ||||
| 	lsl	data2, data2, pos | ||||
| 	/* But we need to zero-extend (char is unsigned) the value and then | ||||
| 	   perform a signed 32-bit subtraction.  */ | ||||
| 	lsr	data1, data1, #56 | ||||
| 	sub	result, data1, data2, lsr #56 | ||||
| 	csel result, result, xzr, hi	/* 0 unless limit > byte index (pos / 8).  */ | ||||
| 	ret | ||||
| #else | ||||
| 	/* Not reached the limit, must have found the end or a diff.  */ | ||||
| 	tbz	limit, #63, L(not_limit) | ||||
| 	add	tmp1, limit, 8 | ||||
| 	cbz	limit, L(not_limit) | ||||
| 
 | ||||
| 	lsl	limit, tmp1, #3	/* Bytes -> bits.  */ | ||||
| 	mov	mask, #~0 | ||||
| 	lsr	mask, mask, limit | ||||
| 	bic	data1, data1, mask | ||||
| 	bic	data2, data2, mask | ||||
| 
 | ||||
| 	/* Make sure that the NUL byte is marked in the syndrome.  */ | ||||
| 	orr	has_nul, has_nul, mask | ||||
| 
 | ||||
| L(not_limit): | ||||
| 	/* For big-endian we cannot use the trick with the syndrome value | ||||
| 	   as carry-propagation can corrupt the upper bits if the trailing | ||||
| 	   bytes in the string contain 0x01.  */ | ||||
| 	/* However, if there is no NUL byte in the dword, we can generate | ||||
| 	   the result directly.  We can't just subtract the bytes as the | ||||
| 	   MSB might be significant.  */ | ||||
| 	cbnz	has_nul, 1f | ||||
| 	cmp	data1, data2 | ||||
| 	cset	result, ne | ||||
| 	cneg	result, result, lo | ||||
| 	ret | ||||
| 1: | ||||
| 	/* Re-compute the NUL-byte detection, using a byte-reversed value.  */ | ||||
| 	rev	tmp3, data1 | ||||
| 	sub	tmp1, tmp3, zeroones | ||||
| 	orr	tmp2, tmp3, #REP8_7f | ||||
| 	bic	has_nul, tmp1, tmp2 | ||||
| 	rev	has_nul, has_nul | ||||
| 	orr	syndrome, diff, has_nul | ||||
| 	clz	pos, syndrome | ||||
| 	/* The most-significant-non-zero bit of the syndrome marks either the | ||||
| 	   first bit that is different, or the top bit of the first zero byte. | ||||
| 	   Shifting left now will bring the critical information into the | ||||
| 	   top bits.  */ | ||||
| L(end_quick): | ||||
| 	lsl	data1, data1, pos | ||||
| 	lsl	data2, data2, pos | ||||
| 	/* But we need to zero-extend (char is unsigned) the value and then | ||||
| 	   perform a signed 32-bit subtraction.  */ | ||||
| 	lsr	data1, data1, #56 | ||||
| 	sub	result, data1, data2, lsr #56 | ||||
| 	ret | ||||
| #endif | ||||
| 
 | ||||
| L(mutual_align): | ||||
| 	/* Sources are mutually aligned, but are not currently at an | ||||
| 	   alignment boundary.  Round down the addresses and then mask off | ||||
| 	   the bytes that precede the start point. | ||||
| 	   We also need to adjust the limit calculations, but without | ||||
| 	   overflowing if the limit is near ULONG_MAX.  */ | ||||
| 	bic	src1, src1, #7 | ||||
| 	bic	src2, src2, #7 | ||||
| 	ldr	data1, [src1], #8 | ||||
| 	neg	tmp3, count, lsl #3	/* 64 - bits(bytes beyond align). */ | ||||
| 	ldr	data2, [src2], #8 | ||||
| 	mov	tmp2, #~0 | ||||
| 	LS_FW	tmp2, tmp2, tmp3	/* Shift amount is taken mod 64.  */ | ||||
| 	/* Adjust the limit and ensure it doesn't overflow.  */ | ||||
| 	adds	limit, limit, count | ||||
| 	csinv	limit, limit, xzr, lo | ||||
| 	orr	data1, data1, tmp2 | ||||
| 	orr	data2, data2, tmp2 | ||||
| 	b	L(start_realigned) | ||||
| 
 | ||||
| 	.p2align 4
 | ||||
| 	/* Don't bother with dwords for up to 16 bytes.  */ | ||||
| L(misaligned8): | ||||
| 	cmp	limit, #16 | ||||
| 	b.hs	L(try_misaligned_words) | ||||
| 
 | ||||
| L(byte_loop): | ||||
| 	/* Perhaps we can do better than this.  */ | ||||
| 	ldrb	data1w, [src1], #1 | ||||
| 	ldrb	data2w, [src2], #1 | ||||
| 	subs	limit, limit, #1 | ||||
| 	ccmp	data1w, #1, #0, hi	/* NZCV = 0b0000.  */ | ||||
| 	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */ | ||||
| 	b.eq	L(byte_loop) | ||||
| L(done): | ||||
| 	sub	result, data1, data2	/* Difference of the last bytes loaded by ldrb.  */ | ||||
| 	ret | ||||
| 	/* Align the SRC1 to a dword by doing a bytewise compare and then do | ||||
| 	   the dword loop.  */ | ||||
| L(try_misaligned_words): | ||||
| 	cbz	count, L(src1_aligned) | ||||
| 
 | ||||
| 	neg	count, count | ||||
| 	and	count, count, #7	/* count = 8 - (src1 & 7) bytes to compare singly.  */ | ||||
| 	sub	limit, limit, count | ||||
| 
 | ||||
| L(page_end_loop): | ||||
| 	ldrb	data1w, [src1], #1 | ||||
| 	ldrb	data2w, [src2], #1 | ||||
| 	cmp	data1w, #1 | ||||
| 	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */ | ||||
| 	b.ne	L(done) | ||||
| 	subs	count, count, #1 | ||||
| 	b.hi	L(page_end_loop) | ||||
| 
 | ||||
| 	/* The following diagram explains the comparison of misaligned strings. | ||||
| 	   The bytes are shown in natural order. For little-endian, it is | ||||
| 	   reversed in the registers. The "x" bytes are before the string. | ||||
| 	   The "|" separates data that is loaded at one time. | ||||
| 	   src1     | a a a a a a a a | b b b c c c c c | . . . | ||||
| 	   src2     | x x x x x a a a   a a a a a b b b | c c c c c . . . | ||||
| 
 | ||||
| 	   After shifting in each step, the data looks like this: | ||||
| 	                STEP_A              STEP_B              STEP_C | ||||
| 	   data1    a a a a a a a a     b b b c c c c c     b b b c c c c c | ||||
| 	   data2    a a a a a a a a     b b b 0 0 0 0 0     0 0 0 c c c c c | ||||
| 
 | ||||
| 	   The bytes with "0" are eliminated from the syndrome via mask. | ||||
| 
 | ||||
| 	   Align SRC2 down to 16 bytes. This way we can read 16 bytes at a | ||||
| 	   time from SRC2. The comparison happens in 3 steps. After each step | ||||
| 	   the loop can exit, or read from SRC1 or SRC2. */ | ||||
| L(src1_aligned): | ||||
| 	/* Calculate offset from 8 byte alignment to string start in bits. No | ||||
| 	   need to mask offset since shifts are ignoring upper bits. */ | ||||
| 	lsl	offset, src2, #3 | ||||
| 	bic	src2, src2, #0xf | ||||
| 	mov	mask, -1 | ||||
| 	neg	neg_offset, offset | ||||
| 	ldr	data1, [src1], #8 | ||||
| 	ldp	tmp1, tmp2, [src2], #16 | ||||
| 	LS_BK	mask, mask, neg_offset | ||||
| 	and	neg_offset, neg_offset, #63	/* Need actual value for cmp later. */ | ||||
| 	/* Skip the first compare if data in tmp1 is irrelevant. */ | ||||
| 	tbnz	offset, 6, L(misaligned_mid_loop) | ||||
| 
 | ||||
| L(loop_misaligned): | ||||
| 	/* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/ | ||||
| 	LS_FW	data2, tmp1, offset | ||||
| 	LS_BK	tmp1, tmp2, neg_offset | ||||
| 	subs	limit, limit, #8 | ||||
| 	orr	data2, data2, tmp1	/* 8 bytes from SRC2 combined from two regs.*/ | ||||
| 	sub	has_nul, data1, zeroones | ||||
| 	eor	diff, data1, data2	/* Non-zero if differences found.  */ | ||||
| 	orr	tmp3, data1, #REP8_7f | ||||
| 	csinv	endloop, diff, xzr, hi	/* If limit, set to all ones. */ | ||||
| 	bic	has_nul, has_nul, tmp3	/* Non-zero if NUL byte found in SRC1. */ | ||||
| 	orr	tmp3, endloop, has_nul | ||||
| 	cbnz	tmp3, L(full_check) | ||||
| 
 | ||||
| 	ldr	data1, [src1], #8 | ||||
| L(misaligned_mid_loop): | ||||
| 	/* STEP_B: Compare first part of data1 to second part of tmp2. */ | ||||
| 	LS_FW	data2, tmp2, offset | ||||
| #ifdef __AARCH64EB__ | ||||
| 	/* For big-endian we do a byte reverse to avoid carry-propagation | ||||
| 	problem described above. This way we can reuse the has_nul in the | ||||
| 	next step and also use syndrome value trick at the end. */ | ||||
| 	rev	tmp3, data1 | ||||
| 	#define data1_fixed tmp3 | ||||
| #else | ||||
| 	#define data1_fixed data1 | ||||
| #endif | ||||
| 	sub	has_nul, data1_fixed, zeroones | ||||
| 	orr	tmp3, data1_fixed, #REP8_7f | ||||
| 	eor	diff, data2, data1	/* Non-zero if differences found.  */ | ||||
| 	bic	has_nul, has_nul, tmp3	/* Non-zero if NUL terminator.  */ | ||||
| #ifdef __AARCH64EB__ | ||||
| 	rev	has_nul, has_nul | ||||
| #endif | ||||
| 	cmp	limit, neg_offset, lsr #3 | ||||
| 	orr	syndrome, diff, has_nul | ||||
| 	bic	syndrome, syndrome, mask	/* Ignore later bytes. */ | ||||
| 	csinv	tmp3, syndrome, xzr, hi	/* If limit, set to all ones. */ | ||||
| 	cbnz	tmp3, L(syndrome_check) | ||||
| 
 | ||||
| 	/* STEP_C: Compare second part of data1 to first part of tmp1. */ | ||||
| 	ldp	tmp1, tmp2, [src2], #16 | ||||
| 	cmp	limit, #8 | ||||
| 	LS_BK	data2, tmp1, neg_offset | ||||
| 	eor	diff, data2, data1	/* Non-zero if differences found.  */ | ||||
| 	orr	syndrome, diff, has_nul | ||||
| 	and	syndrome, syndrome, mask	/* Ignore earlier bytes. */ | ||||
| 	csinv	tmp3, syndrome, xzr, hi	/* If limit, set to all ones. */ | ||||
| 	cbnz	tmp3, L(syndrome_check) | ||||
| 
 | ||||
| 	ldr	data1, [src1], #8 | ||||
| 	sub	limit, limit, #8 | ||||
| 	b	L(loop_misaligned) | ||||
| 
 | ||||
| #ifdef	__AARCH64EB__ | ||||
| L(syndrome_check): | ||||
| 	clz	pos, syndrome | ||||
| 	cmp	pos, limit, lsl #3 | ||||
| 	b.lo	L(end_quick) | ||||
| #endif | ||||
| 
 | ||||
| L(ret0): | ||||
| 	mov	result, #0 | ||||
| 	ret | ||||
| END(__strncmp_aarch64) | ||||
							
								
								
									
										128
									
								
								libc/intrin/aarch64/strnlen.S
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										128
									
								
								libc/intrin/aarch64/strnlen.S
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,128 @@ | |||
| /*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8     -*-│
 | ||||
| │vi: set et ft=asm ts=8 tw=8 fenc=utf-8                                     :vi│ | ||||
| ╚──────────────────────────────────────────────────────────────────────────────╝ | ||||
| │                                                                              │ | ||||
| │  Optimized Routines                                                          │ | ||||
| │  Copyright (c) 1999-2022, Arm Limited.                                       │ | ||||
| │                                                                              │ | ||||
| │  Permission is hereby granted, free of charge, to any person obtaining       │ | ||||
| │  a copy of this software and associated documentation files (the             │ | ||||
| │  "Software"), to deal in the Software without restriction, including         │ | ||||
| │  without limitation the rights to use, copy, modify, merge, publish,         │ | ||||
| │  distribute, sublicense, and/or sell copies of the Software, and to          │ | ||||
| │  permit persons to whom the Software is furnished to do so, subject to       │ | ||||
| │  the following conditions:                                                   │ | ||||
| │                                                                              │ | ||||
| │  The above copyright notice and this permission notice shall be              │ | ||||
| │  included in all copies or substantial portions of the Software.             │ | ||||
| │                                                                              │ | ||||
| │  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │ | ||||
| │  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │ | ||||
| │  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │ | ||||
| │  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │ | ||||
| │  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │ | ||||
| │  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │ | ||||
| │  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │ | ||||
| │                                                                              │ | ||||
| ╚─────────────────────────────────────────────────────────────────────────────*/ | ||||
| #include "libc/intrin/aarch64/asmdefs.h" | ||||
| 
 | ||||
| #define __strnlen_aarch64 strnlen | ||||
| 
 | ||||
| .ident "\n\ | ||||
| Optimized Routines (MIT License)\n\ | ||||
| Copyright 2022 ARM Limited\n" | ||||
| .include "libc/disclaimer.inc" | ||||
| 
 | ||||
| /* Assumptions: | ||||
|  * | ||||
|  * ARMv8-a, AArch64, Advanced SIMD. | ||||
|  * MTE compatible. | ||||
|  */ | ||||
| 
 | ||||
| #define srcin		x0 | ||||
| #define cntin		x1 | ||||
| #define result		x0 | ||||
| 
 | ||||
| #define src		x2 | ||||
| #define synd		x3 | ||||
| #define	shift		x4 | ||||
| #define tmp		x4 | ||||
| #define cntrem		x5 | ||||
| 
 | ||||
| #define qdata		q0 | ||||
| #define vdata		v0 | ||||
| #define vhas_chr	v1 | ||||
| #define vend		v2 | ||||
| #define dend		d2 | ||||
| 
 | ||||
| /* | ||||
|    Core algorithm: | ||||
|    Process the string in 16-byte aligned chunks. Compute a 64-bit mask with | ||||
|    four bits per byte using the shrn instruction. A count trailing zeros then | ||||
|    identifies the first zero byte.  */ | ||||
| 
 | ||||
| ENTRY (__strnlen_aarch64)	/* strnlen (srcin, cntin): offset of the first NUL, capped at cntin.  */ | ||||
| 	PTR_ARG (0) | ||||
| 	SIZE_ARG (1) | ||||
| 	bic	src, srcin, 15			/* Round srcin down to a 16-byte boundary.  */ | ||||
| 	cbz	cntin, L(nomatch)		/* cntin == 0: return it unchanged.  */ | ||||
| 	ld1	{vdata.16b}, [src] | ||||
| 	cmeq	vhas_chr.16b, vdata.16b, 0	/* All-ones in each lane that holds NUL.  */ | ||||
| 	lsl	shift, srcin, 2			/* 4 syndrome bits per byte.  */ | ||||
| 	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */ | ||||
| 	fmov	synd, dend | ||||
| 	lsr	synd, synd, shift		/* Shift out the bits of the bytes below srcin.  */ | ||||
| 	cbz	synd, L(start_loop) | ||||
| L(finish): | ||||
| 	rbit	synd, synd | ||||
| 	clz	synd, synd			/* rbit + clz = count trailing zeros.  */ | ||||
| 	lsr	result, synd, 2			/* Bit index / 4 = byte offset.  */ | ||||
| 	cmp	cntin, result | ||||
| 	csel	result, cntin, result, ls	/* result = min (cntin, offset).  */ | ||||
| 	ret | ||||
| 
 | ||||
| L(nomatch): | ||||
| 	mov	result, cntin | ||||
| 	ret | ||||
| 
 | ||||
| L(start_loop): | ||||
| 	sub	tmp, src, srcin			/* tmp = -(srcin & 15).  */ | ||||
| 	add	tmp, tmp, 17			/* Bytes examined so far, plus one.  */ | ||||
| 	subs	cntrem, cntin, tmp | ||||
| 	b.lo	L(nomatch)			/* cntin does not reach past the examined bytes.  */ | ||||
| 
 | ||||
| 	/* Make sure that it won't overread by a 16-byte chunk */ | ||||
| 	tbz	cntrem, 4, L(loop32_2) | ||||
| 	sub	src, src, 16 | ||||
| 	.p2align 5
 | ||||
| L(loop32): | ||||
| 	ldr	qdata, [src, 32]!		/* Pre-indexed: src advances by 32 first.  */ | ||||
| 	cmeq	vhas_chr.16b, vdata.16b, 0 | ||||
| 	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */ | ||||
| 	fmov	synd, dend | ||||
| 	cbnz	synd, L(end) | ||||
| L(loop32_2): | ||||
| 	ldr	qdata, [src, 16] | ||||
| 	subs	cntrem, cntrem, 32 | ||||
| 	cmeq	vhas_chr.16b, vdata.16b, 0 | ||||
| 	b.lo	L(end_2)			/* Remaining count went below zero: stop here.  */ | ||||
| 	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */ | ||||
| 	fmov	synd, dend | ||||
| 	cbz	synd, L(loop32) | ||||
| L(end_2): | ||||
| 	add	src, src, 16			/* The last chunk was loaded from src + 16.  */ | ||||
| L(end): | ||||
| 	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */ | ||||
| 	sub	result, src, srcin | ||||
| 	fmov	synd, dend | ||||
| #ifndef __AARCH64EB__ | ||||
| 	rbit	synd, synd | ||||
| #endif | ||||
| 	clz	synd, synd | ||||
| 	add	result, result, synd, lsr 2	/* Chunk offset + byte offset inside the chunk.  */ | ||||
| 	cmp	cntin, result | ||||
| 	csel	result, cntin, result, ls	/* Clamp the result to cntin.  */ | ||||
| 	ret | ||||
| 
 | ||||
| END (__strnlen_aarch64) | ||||
							
								
								
									
										175
									
								
								libc/intrin/aarch64/strrchr.S
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										175
									
								
								libc/intrin/aarch64/strrchr.S
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,175 @@ | |||
| /*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8     -*-│
 | ||||
| │vi: set et ft=asm ts=8 tw=8 fenc=utf-8                                     :vi│ | ||||
| ╚──────────────────────────────────────────────────────────────────────────────╝ | ||||
| │                                                                              │ | ||||
| │  Optimized Routines                                                          │ | ||||
| │  Copyright (c) 1999-2022, Arm Limited.                                       │ | ||||
| │                                                                              │ | ||||
| │  Permission is hereby granted, free of charge, to any person obtaining       │ | ||||
| │  a copy of this software and associated documentation files (the             │ | ||||
| │  "Software"), to deal in the Software without restriction, including         │ | ||||
| │  without limitation the rights to use, copy, modify, merge, publish,         │ | ||||
| │  distribute, sublicense, and/or sell copies of the Software, and to          │ | ||||
| │  permit persons to whom the Software is furnished to do so, subject to       │ | ||||
| │  the following conditions:                                                   │ | ||||
| │                                                                              │ | ||||
| │  The above copyright notice and this permission notice shall be              │ | ||||
| │  included in all copies or substantial portions of the Software.             │ | ||||
| │                                                                              │ | ||||
| │  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │ | ||||
| │  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │ | ||||
| │  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │ | ||||
| │  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │ | ||||
| │  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │ | ||||
| │  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │ | ||||
| │  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │ | ||||
| │                                                                              │ | ||||
| ╚─────────────────────────────────────────────────────────────────────────────*/ | ||||
| #include "libc/intrin/aarch64/asmdefs.h" | ||||
| 
 | ||||
| #define __strrchr_aarch64 strrchr | ||||
| 
 | ||||
| .ident "\n\ | ||||
| Optimized Routines (MIT License)\n\ | ||||
| Copyright 2022 ARM Limited\n" | ||||
| .include "libc/disclaimer.inc" | ||||
| 
 | ||||
| /* Assumptions: | ||||
|  * | ||||
|  * ARMv8-a, AArch64 | ||||
|  * Neon Available. | ||||
|  */ | ||||
| 
 | ||||
| /* Arguments and results.  */ | ||||
| #define srcin		x0 | ||||
| #define chrin		w1 | ||||
| 
 | ||||
| #define result		x0 | ||||
| 
 | ||||
| #define src		x2 | ||||
| #define	tmp1		x3 | ||||
| #define wtmp2		w4 | ||||
| #define tmp3		x5 | ||||
| #define src_match	x6 | ||||
| #define src_offset	x7 | ||||
| #define const_m1	x8 | ||||
| #define tmp4		x9 | ||||
| #define nul_match	x10 | ||||
| #define chr_match	x11 | ||||
| 
 | ||||
| #define vrepchr		v0 | ||||
| #define vdata1		v1 | ||||
| #define vdata2		v2 | ||||
| #define vhas_nul1	v3 | ||||
| #define vhas_nul2	v4 | ||||
| #define vhas_chr1	v5 | ||||
| #define vhas_chr2	v6 | ||||
| #define vrepmask_0	v7 | ||||
| #define vrepmask_c	v16 | ||||
| #define vend1		v17 | ||||
| #define vend2		v18 | ||||
| 
 | ||||
| /* Core algorithm. | ||||
| 
 | ||||
|    For each 32-byte hunk we calculate a 64-bit syndrome value, with | ||||
|    two bits per byte (LSB is always in bits 0 and 1, for both big | ||||
|    and little-endian systems).  For each tuple, bit 0 is set iff | ||||
|    the relevant byte matched the requested character; bit 1 is set
 | ||||
|    iff the relevant byte matched the NUL end of string (we trigger | ||||
|    off bit0 for the special case of looking for NUL).  Since the bits | ||||
|    in the syndrome reflect exactly the order in which things occur | ||||
|    in the original string a count_trailing_zeros() operation will | ||||
|    identify exactly which byte is causing the termination, and why.  */ | ||||
| 
 | ||||
| ENTRY (__strrchr_aarch64)	/* strrchr (srcin, chrin): last match before the NUL, else 0.  */ | ||||
| 	PTR_ARG (0) | ||||
| 	/* Magic constant 0x40100401 to allow us to identify which lane | ||||
| 	   matches the requested byte.  Magic constant 0x80200802 used | ||||
| 	   similarly for NUL termination.  */ | ||||
| 	mov	wtmp2, #0x0401 | ||||
| 	movk	wtmp2, #0x4010, lsl #16 | ||||
| 	dup	vrepchr.16b, chrin | ||||
| 	bic	src, srcin, #31		/* Work with aligned 32-byte hunks.  */ | ||||
| 	dup	vrepmask_c.4s, wtmp2 | ||||
| 	mov	src_offset, #0		/* Zero means no match recorded yet.  */ | ||||
| 	ands	tmp1, srcin, #31	/* tmp1 = offset of srcin inside its hunk.  */ | ||||
| 	add	vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */ | ||||
| 	b.eq	L(aligned) | ||||
| 
 | ||||
| 	/* Input string is not 32-byte aligned.  Rather than forcing | ||||
| 	   the padding bytes to a safe value, we calculate the syndrome | ||||
| 	   for all the bytes, but then mask off those bits of the | ||||
| 	   syndrome that are related to the padding.  */ | ||||
| 	ld1	{vdata1.16b, vdata2.16b}, [src], #32 | ||||
| 	neg	tmp1, tmp1 | ||||
| 	cmeq	vhas_nul1.16b, vdata1.16b, #0 | ||||
| 	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b | ||||
| 	cmeq	vhas_nul2.16b, vdata2.16b, #0 | ||||
| 	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b | ||||
| 	and	vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b | ||||
| 	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b | ||||
| 	and	vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b | ||||
| 	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b | ||||
| 	addp	vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b	// 256->128 | ||||
| 	addp	vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b	// 256->128 | ||||
| 	addp	vend1.16b, vhas_nul1.16b, vhas_chr1.16b		// 128->64 | ||||
| 	mov	nul_match, vend1.d[0] | ||||
| 	lsl	tmp1, tmp1, #1		/* Two syndrome bits per byte.  */ | ||||
| 	mov	const_m1, #~0 | ||||
| 	lsr	tmp3, const_m1, tmp1	/* Low 2 * (srcin & 31) bits set.  */ | ||||
| 	mov	chr_match, vend1.d[1] | ||||
| 
 | ||||
| 	bic	nul_match, nul_match, tmp3	// Mask padding bits. | ||||
| 	bic	chr_match, chr_match, tmp3	// Mask padding bits. | ||||
| 	cbnz	nul_match, L(tail) | ||||
| 
 | ||||
| 	.p2align 4
 | ||||
| L(loop): | ||||
| 	cmp	chr_match, #0 | ||||
| 	csel	src_match, src, src_match, ne	/* src already points past the hunk.  */ | ||||
| 	csel	src_offset, chr_match, src_offset, ne | ||||
| L(aligned): | ||||
| 	ld1	{vdata1.16b, vdata2.16b}, [src], #32 | ||||
| 	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b | ||||
| 	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b | ||||
| 	uminp	vend1.16b, vdata1.16b, vdata2.16b	/* A pair's minimum is 0 iff it holds a NUL.  */ | ||||
| 	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b | ||||
| 	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b | ||||
| 	cmeq	vend1.16b, vend1.16b, 0 | ||||
| 	addp	vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b	// 256->128 | ||||
| 	addp	vend1.16b, vend1.16b, vhas_chr1.16b		// 128->64 | ||||
| 	mov	nul_match, vend1.d[0] | ||||
| 	mov	chr_match, vend1.d[1] | ||||
| 	cbz	nul_match, L(loop) | ||||
| 
 | ||||
| 	cmeq	vhas_nul1.16b, vdata1.16b, #0	/* NUL seen: rebuild nul_match with vrepmask_0.  */ | ||||
| 	cmeq	vhas_nul2.16b, vdata2.16b, #0 | ||||
| 	and	vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b | ||||
| 	and	vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b | ||||
| 	addp	vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b | ||||
| 	addp	vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b | ||||
| 	mov	nul_match, vhas_nul1.d[0] | ||||
| 
 | ||||
| L(tail): | ||||
| 	/* Work out exactly where the string ends.  */ | ||||
| 	sub	tmp4, nul_match, #1 | ||||
| 	eor	tmp4, tmp4, nul_match	/* Bits up to and including the lowest set bit.  */ | ||||
| 	ands	chr_match, chr_match, tmp4 | ||||
| 	/* And pick the values corresponding to the last match.  */ | ||||
| 	csel	src_match, src, src_match, ne | ||||
| 	csel	src_offset, chr_match, src_offset, ne | ||||
| 
 | ||||
| 	/* Count down from the top of the syndrome to find the last match.  */ | ||||
| 	clz	tmp3, src_offset | ||||
| 	/* Src_match points beyond the word containing the match, so we can | ||||
| 	   simply subtract half the bit-offset into the syndrome.  Because | ||||
| 	   we are counting down, we need to go back one more character.  */ | ||||
| 	add	tmp3, tmp3, #2 | ||||
| 	sub	result, src_match, tmp3, lsr #1 | ||||
| 	/* But if the syndrome shows no match was found, then return NULL.  */ | ||||
| 	cmp	src_offset, #0 | ||||
| 	csel	result, result, xzr, ne | ||||
| 
 | ||||
| 	ret | ||||
| 
 | ||||
| END (__strrchr_aarch64) | ||||
|  | @ -6,6 +6,7 @@ PKGS += LIBC_INTRIN | |||
| LIBC_INTRIN_ARTIFACTS += LIBC_INTRIN_A | ||||
| LIBC_INTRIN = $(LIBC_INTRIN_A_DEPS) $(LIBC_INTRIN_A) | ||||
| LIBC_INTRIN_A = o/$(MODE)/libc/intrin/intrin.a | ||||
| LIBC_INTRIN_A_FILES := $(wildcard libc/intrin/*) | ||||
| LIBC_INTRIN_A_HDRS = $(filter %.h,$(LIBC_INTRIN_A_FILES)) | ||||
| LIBC_INTRIN_A_INCS = $(filter %.inc,$(LIBC_INTRIN_A_FILES)) | ||||
| LIBC_INTRIN_A_SRCS_S = $(filter %.S,$(LIBC_INTRIN_A_FILES)) | ||||
|  | @ -13,8 +14,9 @@ LIBC_INTRIN_A_SRCS_C = $(filter %.c,$(LIBC_INTRIN_A_FILES)) | |||
| LIBC_INTRIN_A_SRCS = $(LIBC_INTRIN_A_SRCS_S) $(LIBC_INTRIN_A_SRCS_C) | ||||
| LIBC_INTRIN_A_CHECKS = $(LIBC_INTRIN_A).pkg | ||||
| 
 | ||||
| LIBC_INTRIN_A_FILES :=					\
 | ||||
| 	$(wildcard libc/intrin/*) | ||||
| ifeq ($(ARCH), aarch64) | ||||
| LIBC_INTRIN_A_SRCS_S += $(wildcard libc/intrin/aarch64/*.S) | ||||
| endif | ||||
| 
 | ||||
| LIBC_INTRIN_A_OBJS =					\
 | ||||
| 	$(LIBC_INTRIN_A_SRCS_S:%.S=o/$(MODE)/%.o)	\
 | ||||
|  | @ -203,6 +205,8 @@ o/$(MODE)/libc/intrin/memmove.o: private		\ | |||
| 			-fpie | ||||
| 
 | ||||
| # these assembly files are safe to build on aarch64
 | ||||
| o/$(MODE)/libc/intrin/aarch64/%.o: libc/intrin/aarch64/%.S | ||||
| 	@$(COMPILE) -AOBJECTIFY.S $(OBJECTIFY.S) $(OUTPUT_OPTION) -c $< | ||||
| o/$(MODE)/libc/intrin/fenv.o: libc/intrin/fenv.S | ||||
| 	@$(COMPILE) -AOBJECTIFY.S $(OBJECTIFY.S) $(OUTPUT_OPTION) -c $< | ||||
| o/$(MODE)/libc/intrin/futex.o: libc/intrin/futex.S | ||||
|  |  | |||
|  | @ -20,6 +20,7 @@ | |||
| #include "libc/intrin/asan.internal.h" | ||||
| #include "libc/nexgen32e/x86feature.h" | ||||
| #include "libc/str/str.h" | ||||
| #ifndef __aarch64__ | ||||
| 
 | ||||
| typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(1))); | ||||
| 
 | ||||
|  | @ -83,3 +84,5 @@ void *memchr(const void *s, int c, size_t n) { | |||
|   return memchr_pure(s, c, n); | ||||
| #endif | ||||
| } | ||||
| 
 | ||||
| #endif /* __aarch64__ */ | ||||
|  | @ -20,6 +20,7 @@ | |||
| #include "libc/intrin/likely.h" | ||||
| #include "libc/nexgen32e/x86feature.h" | ||||
| #include "libc/str/str.h" | ||||
| #ifndef __aarch64__ | ||||
| 
 | ||||
| #define PMOVMSKB(x) __builtin_ia32_pmovmskb128(x) | ||||
| 
 | ||||
|  | @ -129,7 +130,9 @@ microarchitecture("avx") static int memcmp_avx(const unsigned char *p, | |||
|  *     memcmp n=32768                      29 ps/byte         32,851 mb/s | ||||
|  *     memcmp n=131072                     33 ps/byte         28,983 mb/s | ||||
|  * | ||||
|  * @return unsigned char subtraction at stop index | ||||
|  * @return an integer that's (1) equal to zero if `a` is equal to `b`, | ||||
|  *     (2) less than zero if `a` is less than `b`, or (3) greater than | ||||
|  *     zero if `a` is greater than `b` | ||||
|  * @asyncsignalsafe | ||||
|  */ | ||||
| int memcmp(const void *a, const void *b, size_t n) { | ||||
|  | @ -200,3 +203,5 @@ int memcmp(const void *a, const void *b, size_t n) { | |||
|   } | ||||
|   return 0; | ||||
| } | ||||
| 
 | ||||
| #endif /* __aarch64__ */ | ||||
|  |  | |||
|  | @ -22,6 +22,7 @@ | |||
| #include "libc/nexgen32e/nexgen32e.h" | ||||
| #include "libc/nexgen32e/x86feature.h" | ||||
| #include "libc/str/str.h" | ||||
| #ifndef __aarch64__ | ||||
| 
 | ||||
| typedef long long xmm_t __attribute__((__vector_size__(16), __aligned__(1))); | ||||
| typedef long long xmm_a __attribute__((__vector_size__(16), __aligned__(16))); | ||||
|  | @ -343,3 +344,5 @@ void *memmove(void *dst, const void *src, size_t n) { | |||
| 
 | ||||
| asm("memcpy = memmove\n\t" | ||||
|     ".globl\tmemcpy"); | ||||
| 
 | ||||
| #endif /* __aarch64__ */ | ||||
|  |  | |||
|  | @ -20,6 +20,7 @@ | |||
| #include "libc/intrin/asan.internal.h" | ||||
| #include "libc/nexgen32e/x86feature.h" | ||||
| #include "libc/str/str.h" | ||||
| #ifndef __aarch64__ | ||||
| 
 | ||||
| typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(1))); | ||||
| 
 | ||||
|  | @ -81,3 +82,5 @@ void *memrchr(const void *s, int c, size_t n) { | |||
|   return memrchr_pure(s, c, n); | ||||
| #endif | ||||
| } | ||||
| 
 | ||||
| #endif /* __aarch64__ */ | ||||
|  | @ -22,6 +22,7 @@ | |||
| #include "libc/nexgen32e/nexgen32e.h" | ||||
| #include "libc/nexgen32e/x86feature.h" | ||||
| #include "libc/str/str.h" | ||||
| #ifndef __aarch64__ | ||||
| 
 | ||||
| typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(1))); | ||||
| typedef long long xmm_a __attribute__((__vector_size__(16), __aligned__(16))); | ||||
|  | @ -168,3 +169,5 @@ void *memset(void *p, int c, size_t n) { | |||
|     return memset_sse(b, c, n); | ||||
|   } | ||||
| } | ||||
| 
 | ||||
| #endif /* __aarch64__ */ | ||||
|  |  | |||
|  | @ -17,6 +17,9 @@ | |||
| │ PERFORMANCE OF THIS SOFTWARE.                                                │ | ||||
| ╚─────────────────────────────────────────────────────────────────────────────*/ | ||||
| #include "libc/str/str.h" | ||||
| #ifndef __aarch64__ | ||||
| 
 | ||||
| // TODO(jart): ASAN support here is important.
 | ||||
| 
 | ||||
| typedef char xmm_u __attribute__((__vector_size__(16), __aligned__(1))); | ||||
| typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(16))); | ||||
|  | @ -63,3 +66,5 @@ char *stpcpy(char *d, const char *s) { | |||
|     ++i; | ||||
|   } | ||||
| } | ||||
| 
 | ||||
| #endif /* __aarch64__ */ | ||||
|  |  | |||
|  | @ -21,6 +21,7 @@ | |||
| #include "libc/intrin/asan.internal.h" | ||||
| #include "libc/nexgen32e/x86feature.h" | ||||
| #include "libc/str/str.h" | ||||
| #ifndef __aarch64__ | ||||
| 
 | ||||
| static inline const char *strchr_pure(const char *s, int c) { | ||||
|   for (;; ++s) { | ||||
|  | @ -115,3 +116,5 @@ char *strchr(const char *s, int c) { | |||
|   return r; | ||||
| #endif | ||||
| } | ||||
| 
 | ||||
| #endif /* __aarch64__ */ | ||||
|  | @ -21,6 +21,7 @@ | |||
| #include "libc/intrin/asan.internal.h" | ||||
| #include "libc/nexgen32e/x86feature.h" | ||||
| #include "libc/str/str.h" | ||||
| #ifndef __aarch64__ | ||||
| 
 | ||||
| static inline const char *strchrnul_pure(const char *s, int c) { | ||||
|   for (;; ++s) { | ||||
|  | @ -113,3 +114,5 @@ char *strchrnul(const char *s, int c) { | |||
|   return r; | ||||
| #endif | ||||
| } | ||||
| 
 | ||||
| #endif /* __aarch64__ */ | ||||
|  | @ -17,6 +17,9 @@ | |||
| │ PERFORMANCE OF THIS SOFTWARE.                                                │ | ||||
| ╚─────────────────────────────────────────────────────────────────────────────*/ | ||||
| #include "libc/str/str.h" | ||||
| #ifndef __aarch64__ | ||||
| 
 | ||||
| // TODO(jart): ASAN support here is important.
 | ||||
| 
 | ||||
| typedef char xmm_u __attribute__((__vector_size__(16), __aligned__(1))); | ||||
| typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(16))); | ||||
|  | @ -63,3 +66,5 @@ char *strcpy(char *d, const char *s) { | |||
|     ++i; | ||||
|   } | ||||
| } | ||||
| 
 | ||||
| #endif /* __aarch64__ */ | ||||
|  |  | |||
|  | @ -19,6 +19,7 @@ | |||
| #include "libc/dce.h" | ||||
| #include "libc/intrin/asan.internal.h" | ||||
| #include "libc/str/str.h" | ||||
| #ifndef __aarch64__ | ||||
| 
 | ||||
| /**
 | ||||
|  * Returns length of NUL-terminated string. | ||||
|  | @ -61,3 +62,5 @@ noasan size_t strlen(const char *s) { | |||
|   return n; | ||||
| #endif | ||||
| } | ||||
| 
 | ||||
| #endif /* __aarch64__ */ | ||||
|  |  | |||
|  | @ -17,6 +17,7 @@ | |||
| │ PERFORMANCE OF THIS SOFTWARE.                                                │ | ||||
| ╚─────────────────────────────────────────────────────────────────────────────*/ | ||||
| #include "libc/str/str.h" | ||||
| #ifndef __aarch64__ | ||||
| 
 | ||||
| /**
 | ||||
|  * Compares NUL-terminated strings w/ limit. | ||||
|  | @ -32,3 +33,5 @@ int strncmp(const char *a, const char *b, size_t n) { | |||
|   while (i < n && a[i] == b[i] && b[i]) ++i; | ||||
|   return (a[i] & 0xff) - (b[i] & 0xff); | ||||
| } | ||||
| 
 | ||||
| #endif /* __aarch64__ */ | ||||
|  | @ -21,6 +21,7 @@ | |||
| #include "libc/intrin/asan.internal.h" | ||||
| #include "libc/intrin/bits.h" | ||||
| #include "libc/str/str.h" | ||||
| #ifndef __aarch64__ | ||||
| 
 | ||||
| static noasan size_t strnlen_x64(const char *s, size_t n, size_t i) { | ||||
|   uint64_t w; | ||||
|  | @ -56,3 +57,5 @@ noasan size_t strnlen(const char *s, size_t n) { | |||
|   if (IsAsan()) __asan_verify(s, i); | ||||
|   return i; | ||||
| } | ||||
| 
 | ||||
| #endif /* __aarch64__ */ | ||||
|  | @ -17,6 +17,7 @@ | |||
| │ PERFORMANCE OF THIS SOFTWARE.                                                │ | ||||
| ╚─────────────────────────────────────────────────────────────────────────────*/ | ||||
| #include "libc/str/str.h" | ||||
| #ifndef __aarch64__ | ||||
| 
 | ||||
| /**
 | ||||
|  * Searches for last instance of character in string. | ||||
|  | @ -29,3 +30,5 @@ | |||
char *strrchr(const char *s, int c) {
  // ISO C treats the terminating NUL as part of the string, so the search
  // window is strlen(s) + 1 bytes; this makes strrchr(s, 0) return a
  // pointer to the terminator instead of NULL.
  return memrchr(s, c, strlen(s) + 1);
}
| 
 | ||||
| #endif /* __aarch64__ */ | ||||
|  | @ -36,10 +36,16 @@ STATIC_YOINK("strerror_wr"); | |||
| /**
 | ||||
|  * Handles failure of CHECK_xx() macros. | ||||
|  */ | ||||
| relegated void __check_fail(const char *suffix, const char *opstr, | ||||
|                             uint64_t want, const char *wantstr, uint64_t got, | ||||
|                             const char *gotstr, const char *file, int line, | ||||
|                             const char *fmt, ...) { | ||||
| relegated void __check_fail(const char *suffix,   //
 | ||||
|                             const char *opstr,    //
 | ||||
|                             uint64_t want,        //
 | ||||
|                             const char *wantstr,  //
 | ||||
|                             uint64_t got,         //
 | ||||
|                             const char *gotstr,   //
 | ||||
|                             const char *file,     //
 | ||||
|                             int line,             //
 | ||||
|                             const char *fmt,      //
 | ||||
|                             ...) { | ||||
|   int e; | ||||
|   char *p; | ||||
|   size_t i; | ||||
|  |  | |||
|  | @ -33,21 +33,69 @@ | |||
|  * | ||||
|  * @see libc/log/thunks/__check_fail_ndebug.S | ||||
|  */ | ||||
| relegated wontreturn void __check_fail_ndebug(uint64_t want, uint64_t got, | ||||
|                                               const char *file, int line, | ||||
|                                               const char *opchar, | ||||
|                                               const char *fmt, ...) { | ||||
|   va_list va; | ||||
| static relegated wontreturn void __check_fail_ndebug(uint64_t want,       //
 | ||||
|                                                      uint64_t got,        //
 | ||||
|                                                      const char *file,    //
 | ||||
|                                                      int line,            //
 | ||||
|                                                      const char *opchar,  //
 | ||||
|                                                      const char *fmt,     //
 | ||||
|                                                      va_list va) { | ||||
|   __restore_tty(); | ||||
|   kprintf("%rerror:%s:%d: check failed: %'ld %s %'ld% m", file, line, want, | ||||
|           opchar, got); | ||||
|   if (*fmt) { | ||||
|   if (fmt && *fmt) { | ||||
|     kprintf(": "); | ||||
|     va_start(va, fmt); | ||||
|     kvprintf(fmt, va); | ||||
|     va_end(va); | ||||
|   } | ||||
|   kprintf("\n"); | ||||
|   if (_weaken(__die)) _weaken(__die)(); | ||||
|   _Exitr(68); | ||||
| } | ||||
| 
 | ||||
// Reports a failed CHECK_EQ() in NDEBUG mode and terminates.
//
// The assembly thunk this function supersedes loaded "==" into the
// `opchar` argument register itself before jumping to
// __check_fail_ndebug(), so callers written against it may pass a null
// `opchar`; in that case the operator text is filled in here.
//
// @param want,got are the compared operands
// @param file,line locate the failing check
// @param opchar is operator text for the report, or NULL for "=="
// @param fmt is optional printf-style message followed by its arguments
void __check_fail_eq(uint64_t want, uint64_t got, const char *file, int line,
                     const char *opchar, const char *fmt, ...) {
  va_list va;
  va_start(va, fmt);
  __check_fail_ndebug(want, got, file, line, opchar ? opchar : "==", fmt, va);
  va_end(va);  // not reached: __check_fail_ndebug() is wontreturn
}
| 
 | ||||
// Reports a failed CHECK_NE() in NDEBUG mode and terminates.
//
// The assembly thunk this function supersedes loaded "!=" into the
// `opchar` argument register itself before jumping to
// __check_fail_ndebug(), so callers written against it may pass a null
// `opchar`; in that case the operator text is filled in here.
//
// @param want,got are the compared operands
// @param file,line locate the failing check
// @param opchar is operator text for the report, or NULL for "!="
// @param fmt is optional printf-style message followed by its arguments
void __check_fail_ne(uint64_t want, uint64_t got, const char *file, int line,
                     const char *opchar, const char *fmt, ...) {
  va_list va;
  va_start(va, fmt);
  __check_fail_ndebug(want, got, file, line, opchar ? opchar : "!=", fmt, va);
  va_end(va);  // not reached: __check_fail_ndebug() is wontreturn
}
| 
 | ||||
// Reports a failed CHECK_LE() in NDEBUG mode and terminates.
//
// The assembly thunk this function supersedes loaded "<=" into the
// `opchar` argument register itself before jumping to
// __check_fail_ndebug(), so callers written against it may pass a null
// `opchar`; in that case the operator text is filled in here.
//
// @param want,got are the compared operands
// @param file,line locate the failing check
// @param opchar is operator text for the report, or NULL for "<="
// @param fmt is optional printf-style message followed by its arguments
void __check_fail_le(uint64_t want, uint64_t got, const char *file, int line,
                     const char *opchar, const char *fmt, ...) {
  va_list va;
  va_start(va, fmt);
  __check_fail_ndebug(want, got, file, line, opchar ? opchar : "<=", fmt, va);
  va_end(va);  // not reached: __check_fail_ndebug() is wontreturn
}
| 
 | ||||
// Reports a failed CHECK_LT() in NDEBUG mode and terminates.
//
// The assembly thunk this function supersedes loaded "<" into the
// `opchar` argument register itself before jumping to
// __check_fail_ndebug(), so callers written against it may pass a null
// `opchar`; in that case the operator text is filled in here.
//
// @param want,got are the compared operands
// @param file,line locate the failing check
// @param opchar is operator text for the report, or NULL for "<"
// @param fmt is optional printf-style message followed by its arguments
void __check_fail_lt(uint64_t want, uint64_t got, const char *file, int line,
                     const char *opchar, const char *fmt, ...) {
  va_list va;
  va_start(va, fmt);
  __check_fail_ndebug(want, got, file, line, opchar ? opchar : "<", fmt, va);
  va_end(va);  // not reached: __check_fail_ndebug() is wontreturn
}
| 
 | ||||
// Reports a failed CHECK_GE() in NDEBUG mode and terminates.
//
// The assembly thunk this function supersedes loaded ">=" into the
// `opchar` argument register itself before jumping to
// __check_fail_ndebug(), so callers written against it may pass a null
// `opchar`; in that case the operator text is filled in here.
//
// @param want,got are the compared operands
// @param file,line locate the failing check
// @param opchar is operator text for the report, or NULL for ">="
// @param fmt is optional printf-style message followed by its arguments
void __check_fail_ge(uint64_t want, uint64_t got, const char *file, int line,
                     const char *opchar, const char *fmt, ...) {
  va_list va;
  va_start(va, fmt);
  __check_fail_ndebug(want, got, file, line, opchar ? opchar : ">=", fmt, va);
  va_end(va);  // not reached: __check_fail_ndebug() is wontreturn
}
| 
 | ||||
// Reports a failed CHECK_GT() in NDEBUG mode and terminates.
//
// The assembly thunk this function supersedes loaded ">" into the
// `opchar` argument register itself before jumping to
// __check_fail_ndebug(), so callers written against it may pass a null
// `opchar`; in that case the operator text is filled in here.
//
// @param want,got are the compared operands
// @param file,line locate the failing check
// @param opchar is operator text for the report, or NULL for ">"
// @param fmt is optional printf-style message followed by its arguments
void __check_fail_gt(uint64_t want, uint64_t got, const char *file, int line,
                     const char *opchar, const char *fmt, ...) {
  va_list va;
  va_start(va, fmt);
  __check_fail_ndebug(want, got, file, line, opchar ? opchar : ">", fmt, va);
  va_end(va);  // not reached: __check_fail_ndebug() is wontreturn
}
|  |  | |||
|  | @ -6,9 +6,7 @@ PKGS += LIBC_LOG | |||
| LIBC_LOG_ARTIFACTS += LIBC_LOG_A | ||||
| LIBC_LOG = $(LIBC_LOG_A_DEPS) $(LIBC_LOG_A) | ||||
| LIBC_LOG_A = o/$(MODE)/libc/log/log.a | ||||
| LIBC_LOG_A_FILES :=					\
 | ||||
| 	$(wildcard libc/log/thunks/*)			\
 | ||||
| 	$(wildcard libc/log/*) | ||||
| LIBC_LOG_A_FILES := $(wildcard libc/log/*) | ||||
| LIBC_LOG_A_HDRS = $(filter %.h,$(LIBC_LOG_A_FILES)) | ||||
| LIBC_LOG_A_SRCS_C = $(filter %.c,$(LIBC_LOG_A_FILES)) | ||||
| LIBC_LOG_A_SRCS_S = $(filter %.S,$(LIBC_LOG_A_FILES)) | ||||
|  |  | |||
|  | @ -1,30 +0,0 @@ | |||
| /*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8     -*-│
 | ||||
| │vi: set et ft=asm ts=8 tw=8 fenc=utf-8                                     :vi│ | ||||
| ╞══════════════════════════════════════════════════════════════════════════════╡ | ||||
| │ Copyright 2020 Justine Alexandra Roberts Tunney                              │ | ||||
| │                                                                              │ | ||||
| │ Permission to use, copy, modify, and/or distribute this software for         │ | ||||
| │ any purpose with or without fee is hereby granted, provided that the         │ | ||||
| │ above copyright notice and this permission notice appear in all copies.      │ | ||||
| │                                                                              │ | ||||
| │ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │ | ||||
| │ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │ | ||||
| │ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │ | ||||
| │ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │ | ||||
| │ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │ | ||||
| │ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │ | ||||
| │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │ | ||||
| │ PERFORMANCE OF THIS SOFTWARE.                                                │ | ||||
| ╚─────────────────────────────────────────────────────────────────────────────*/ | ||||
| #include "libc/macros.internal.h" | ||||
| .text.unlikely | ||||
| 
 | ||||
| //	Code-size saving thunk for CHECK_EQ() in NDEBUG mode. | ||||
| // | ||||
| //	Points %r8 at the "==" string stored at .Lop, then jumps (not | ||||
| //	calls) to __check_fail_ndebug, leaving every other register as | ||||
| //	the caller set it. | ||||
| // | ||||
| //	NOTE(review): %r8 is the fifth integer argument register in the | ||||
| //	System V AMD64 convention, which presumably lines up with the | ||||
| //	`opchar` parameter of __check_fail_ndebug; confirm. | ||||
| __check_fail_eq: | ||||
| 	lea	.Lop(%rip),%r8 | ||||
| 	jmp	__check_fail_ndebug | ||||
| 	.endfn	__check_fail_eq,globl | ||||
| 
 | ||||
| 	.rodata.str1.1 | ||||
| .Lop:	.asciz	"==" | ||||
| 	.previous | ||||
|  | @ -1,30 +0,0 @@ | |||
| /*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8     -*-│
 | ||||
| │vi: set et ft=asm ts=8 tw=8 fenc=utf-8                                     :vi│ | ||||
| ╞══════════════════════════════════════════════════════════════════════════════╡ | ||||
| │ Copyright 2020 Justine Alexandra Roberts Tunney                              │ | ||||
| │                                                                              │ | ||||
| │ Permission to use, copy, modify, and/or distribute this software for         │ | ||||
| │ any purpose with or without fee is hereby granted, provided that the         │ | ||||
| │ above copyright notice and this permission notice appear in all copies.      │ | ||||
| │                                                                              │ | ||||
| │ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │ | ||||
| │ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │ | ||||
| │ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │ | ||||
| │ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │ | ||||
| │ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │ | ||||
| │ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │ | ||||
| │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │ | ||||
| │ PERFORMANCE OF THIS SOFTWARE.                                                │ | ||||
| ╚─────────────────────────────────────────────────────────────────────────────*/ | ||||
| #include "libc/macros.internal.h" | ||||
| .text.unlikely | ||||
| 
 | ||||
| //	Code-size saving thunk for CHECK_GE() in NDEBUG mode. | ||||
| // | ||||
| //	Points %r8 at the ">=" string stored at .Lop, then jumps (not | ||||
| //	calls) to __check_fail_ndebug, leaving every other register as | ||||
| //	the caller set it. | ||||
| // | ||||
| //	NOTE(review): %r8 is the fifth integer argument register in the | ||||
| //	System V AMD64 convention, which presumably lines up with the | ||||
| //	`opchar` parameter of __check_fail_ndebug; confirm. | ||||
| __check_fail_ge: | ||||
| 	lea	.Lop(%rip),%r8 | ||||
| 	jmp	__check_fail_ndebug | ||||
| 	.endfn	__check_fail_ge,globl | ||||
| 
 | ||||
| 	.rodata.str1.1 | ||||
| .Lop:	.asciz	">=" | ||||
| 	.previous | ||||
|  | @ -1,30 +0,0 @@ | |||
| /*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8     -*-│
 | ||||
| │vi: set et ft=asm ts=8 tw=8 fenc=utf-8                                     :vi│ | ||||
| ╞══════════════════════════════════════════════════════════════════════════════╡ | ||||
| │ Copyright 2020 Justine Alexandra Roberts Tunney                              │ | ||||
| │                                                                              │ | ||||
| │ Permission to use, copy, modify, and/or distribute this software for         │ | ||||
| │ any purpose with or without fee is hereby granted, provided that the         │ | ||||
| │ above copyright notice and this permission notice appear in all copies.      │ | ||||
| │                                                                              │ | ||||
| │ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │ | ||||
| │ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │ | ||||
| │ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │ | ||||
| │ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │ | ||||
| │ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │ | ||||
| │ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │ | ||||
| │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │ | ||||
| │ PERFORMANCE OF THIS SOFTWARE.                                                │ | ||||
| ╚─────────────────────────────────────────────────────────────────────────────*/ | ||||
| #include "libc/macros.internal.h" | ||||
| .text.unlikely | ||||
| 
 | ||||
| //	Code-size saving thunk for CHECK_GT() in NDEBUG mode. | ||||
| // | ||||
| //	Points %r8 at the ">" string stored at .Lop, then jumps (not | ||||
| //	calls) to __check_fail_ndebug, leaving every other register as | ||||
| //	the caller set it. | ||||
| // | ||||
| //	NOTE(review): %r8 is the fifth integer argument register in the | ||||
| //	System V AMD64 convention, which presumably lines up with the | ||||
| //	`opchar` parameter of __check_fail_ndebug; confirm. | ||||
| __check_fail_gt: | ||||
| 	lea	.Lop(%rip),%r8 | ||||
| 	jmp	__check_fail_ndebug | ||||
| 	.endfn	__check_fail_gt,globl | ||||
| 
 | ||||
| 	.rodata.str1.1 | ||||
| .Lop:	.asciz	">" | ||||
| 	.previous | ||||
|  | @ -1,30 +0,0 @@ | |||
| /*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8     -*-│
 | ||||
| │vi: set et ft=asm ts=8 tw=8 fenc=utf-8                                     :vi│ | ||||
| ╞══════════════════════════════════════════════════════════════════════════════╡ | ||||
| │ Copyright 2020 Justine Alexandra Roberts Tunney                              │ | ||||
| │                                                                              │ | ||||
| │ Permission to use, copy, modify, and/or distribute this software for         │ | ||||
| │ any purpose with or without fee is hereby granted, provided that the         │ | ||||
| │ above copyright notice and this permission notice appear in all copies.      │ | ||||
| │                                                                              │ | ||||
| │ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │ | ||||
| │ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │ | ||||
| │ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │ | ||||
| │ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │ | ||||
| │ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │ | ||||
| │ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │ | ||||
| │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │ | ||||
| │ PERFORMANCE OF THIS SOFTWARE.                                                │ | ||||
| ╚─────────────────────────────────────────────────────────────────────────────*/ | ||||
| #include "libc/macros.internal.h" | ||||
| .text.unlikely | ||||
| 
 | ||||
| //	Code-size saving thunk for CHECK_LE() in NDEBUG mode. | ||||
| // | ||||
| //	Points %r8 at the "<=" string stored at .Lop, then jumps (not | ||||
| //	calls) to __check_fail_ndebug, leaving every other register as | ||||
| //	the caller set it. | ||||
| // | ||||
| //	NOTE(review): %r8 is the fifth integer argument register in the | ||||
| //	System V AMD64 convention, which presumably lines up with the | ||||
| //	`opchar` parameter of __check_fail_ndebug; confirm. | ||||
| __check_fail_le: | ||||
| 	lea	.Lop(%rip),%r8 | ||||
| 	jmp	__check_fail_ndebug | ||||
| 	.endfn	__check_fail_le,globl | ||||
| 
 | ||||
| 	.rodata.str1.1 | ||||
| .Lop:	.asciz	"<=" | ||||
| 	.previous | ||||
|  | @ -1,30 +0,0 @@ | |||
| /*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8     -*-│
 | ||||
| │vi: set et ft=asm ts=8 tw=8 fenc=utf-8                                     :vi│ | ||||
| ╞══════════════════════════════════════════════════════════════════════════════╡ | ||||
| │ Copyright 2020 Justine Alexandra Roberts Tunney                              │ | ||||
| │                                                                              │ | ||||
| │ Permission to use, copy, modify, and/or distribute this software for         │ | ||||
| │ any purpose with or without fee is hereby granted, provided that the         │ | ||||
| │ above copyright notice and this permission notice appear in all copies.      │ | ||||
| │                                                                              │ | ||||
| │ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │ | ||||
| │ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │ | ||||
| │ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │ | ||||
| │ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │ | ||||
| │ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │ | ||||
| │ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │ | ||||
| │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │ | ||||
| │ PERFORMANCE OF THIS SOFTWARE.                                                │ | ||||
| ╚─────────────────────────────────────────────────────────────────────────────*/ | ||||
| #include "libc/macros.internal.h" | ||||
| .text.unlikely | ||||
| 
 | ||||
| //	Code-size saving thunk for CHECK_LT() in NDEBUG mode. | ||||
| // | ||||
| //	Points %r8 at the "<" string stored at .Lop, then jumps (not | ||||
| //	calls) to __check_fail_ndebug, leaving every other register as | ||||
| //	the caller set it. | ||||
| // | ||||
| //	NOTE(review): %r8 is the fifth integer argument register in the | ||||
| //	System V AMD64 convention, which presumably lines up with the | ||||
| //	`opchar` parameter of __check_fail_ndebug; confirm. | ||||
| __check_fail_lt: | ||||
| 	lea	.Lop(%rip),%r8 | ||||
| 	jmp	__check_fail_ndebug | ||||
| 	.endfn	__check_fail_lt,globl | ||||
| 
 | ||||
| 	.rodata.str1.1 | ||||
| .Lop:	.asciz	"<" | ||||
| 	.previous | ||||
|  | @ -1,30 +0,0 @@ | |||
| /*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8     -*-│
 | ||||
| │vi: set et ft=asm ts=8 tw=8 fenc=utf-8                                     :vi│ | ||||
| ╞══════════════════════════════════════════════════════════════════════════════╡ | ||||
| │ Copyright 2020 Justine Alexandra Roberts Tunney                              │ | ||||
| │                                                                              │ | ||||
| │ Permission to use, copy, modify, and/or distribute this software for         │ | ||||
| │ any purpose with or without fee is hereby granted, provided that the         │ | ||||
| │ above copyright notice and this permission notice appear in all copies.      │ | ||||
| │                                                                              │ | ||||
| │ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │ | ||||
| │ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │ | ||||
| │ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │ | ||||
| │ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │ | ||||
| │ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │ | ||||
| │ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │ | ||||
| │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │ | ||||
| │ PERFORMANCE OF THIS SOFTWARE.                                                │ | ||||
| ╚─────────────────────────────────────────────────────────────────────────────*/ | ||||
| #include "libc/macros.internal.h" | ||||
| .text.unlikely | ||||
| 
 | ||||
| //	Code-size saving thunk for CHECK_NE() in NDEBUG mode. | ||||
| // | ||||
| //	Points %r8 at the "!=" string stored at .Lop, then jumps (not | ||||
| //	calls) to __check_fail_ndebug, leaving every other register as | ||||
| //	the caller set it. | ||||
| // | ||||
| //	NOTE(review): %r8 is the fifth integer argument register in the | ||||
| //	System V AMD64 convention, which presumably lines up with the | ||||
| //	`opchar` parameter of __check_fail_ndebug; confirm. | ||||
| __check_fail_ne: | ||||
| 	lea	.Lop(%rip),%r8 | ||||
| 	jmp	__check_fail_ndebug | ||||
| 	.endfn	__check_fail_ne,globl | ||||
| 
 | ||||
| 	.rodata.str1.1 | ||||
| .Lop:	.asciz	"!=" | ||||
| 	.previous | ||||
|  | @ -43,8 +43,22 @@ | |||
|     Ticks;                                               \ | ||||
|   }) | ||||
| #else | ||||
// Tick-counter sampling for the start and end of a benchmark run.
//
// Each macro evaluates to the value of rdtsc(), read between two `isb`
// instructions. The "memory" clobber keeps the compiler from moving
// memory accesses across either barrier, so the measured region is not
// reordered around the counter read.
#define __startbench()                \
  ({                                  \
    uint64_t _ts;                     \
    asm volatile("isb" ::: "memory"); \
    _ts = rdtsc();                    \
    asm volatile("isb" ::: "memory"); \
    _ts;                              \
  })
#define __endbench()                  \
  ({                                  \
    uint64_t _ts;                     \
    asm volatile("isb" ::: "memory"); \
    _ts = rdtsc();                    \
    asm volatile("isb" ::: "memory"); \
    _ts;                              \
  })
| #endif | ||||
| 
 | ||||
| #define __startbench_m() mfence_lfence_rdtsc_lfence() | ||||
|  |  | |||
|  | @ -1,262 +0,0 @@ | |||
| /*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8     -*-│
 | ||||
| │vi: set et ft=asm ts=8 tw=8 fenc=utf-8                                     :vi│ | ||||
| ╞══════════════════════════════════════════════════════════════════════════════╡ | ||||
| │ Copyright 2020 Justine Alexandra Roberts Tunney                              │ | ||||
| │                                                                              │ | ||||
| │ Permission to use, copy, modify, and/or distribute this software for         │ | ||||
| │ any purpose with or without fee is hereby granted, provided that the         │ | ||||
| │ above copyright notice and this permission notice appear in all copies.      │ | ||||
| │                                                                              │ | ||||
| │ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │ | ||||
| │ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │ | ||||
| │ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │ | ||||
| │ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │ | ||||
| │ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │ | ||||
| │ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │ | ||||
| │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │ | ||||
| │ PERFORMANCE OF THIS SOFTWARE.                                                │ | ||||
| ╚─────────────────────────────────────────────────────────────────────────────*/ | ||||
| #include "libc/macros.internal.h" | ||||
| 
 | ||||
| //	Computes Phil Katz CRC-32 w/ carryless multiply isa. | ||||
| // | ||||
| //	This is support code that's abstracted by crc32_z(). | ||||
| // | ||||
| //	The routine keeps four 128-bit accumulators (xmm1, xmm9, xmm4, | ||||
| //	xmm0), folds 64 bytes per iteration while at least 64 bytes | ||||
| //	remain, merges the accumulators into xmm0, folds any remaining | ||||
| //	16-byte blocks, and finally reduces xmm0 to the 32-bit result. | ||||
| // | ||||
| //	@param	edi is initial value
 | ||||
| //	@param	rsi points to buffer
 | ||||
| //	@param	rdx is bytes in buffer that's >=64 and %16==0
 | ||||
| //	@return	eax is crc32
 | ||||
| //	@note	needs Westmere (c.2010) or Bulldozer (c.2011)
 | ||||
| //	@see	“Fast CRC Computation for Generic Polynomials Using
 | ||||
| //		 PCLMULQDQ Instruction” V. Gopal, E. Ozturk, et al., | ||||
| //		 2009, intel.ly/2ySEwL0 | ||||
| crc32_pclmul: | ||||
| 	.leafprologue | ||||
| 	.profilable | ||||
| //	Load the first 64 bytes and xor the initial value into the | ||||
| //	first 16-byte block; rdi = bytes left, rcx = next input byte. | ||||
| 	movdqu	(%rsi),%xmm7 | ||||
| 	movd	%edi,%xmm1 | ||||
| 	movdqu	16(%rsi),%xmm9 | ||||
| 	movdqu	32(%rsi),%xmm4 | ||||
| 	movdqu	48(%rsi),%xmm0 | ||||
| 	lea	-64(%rdx),%rdi | ||||
| 	lea	64(%rsi),%rcx | ||||
| 	pxor	%xmm7,%xmm1 | ||||
| 	movdqa	.Lk1k2(%rip),%xmm8 | ||||
| //	Skip the 64-byte loop when fewer than 64 bytes remain. | ||||
| 	cmp	$63,%rdi | ||||
| 	jbe	2f | ||||
| //	rdx = (n-128)/64, so the loop below runs rdx+1 times; rsi is | ||||
| //	set to the value rax will hold once the loop is finished. | ||||
| 	lea	-128(%rdx),%rdi | ||||
| 	mov	%rdi,%rdx | ||||
| 	shr	$6,%rdx | ||||
| 	lea	2(%rdx),%rax | ||||
| 	sal	$6,%rax | ||||
| 	add	%rax,%rsi | ||||
| 	mov	%rcx,%rax | ||||
| //	Main loop: multiply both halves of each accumulator by the | ||||
| //	.Lk1k2 constants and xor in the next 64 bytes of input. | ||||
| 3:	add	$64,%rax | ||||
| 	movdqa	%xmm1,%xmm7 | ||||
| 	movdqa	%xmm4,%xmm5 | ||||
| 	movdqa	%xmm0,%xmm3 | ||||
| 	movdqa	%xmm9,%xmm6 | ||||
| 	movdqa	%xmm9,%xmm2 | ||||
| 	movdqu	-48(%rax),%xmm9 | ||||
| 	pclmullqlqdq %xmm8,%xmm7 | ||||
| 	pclmullqlqdq %xmm8,%xmm6 | ||||
| 	pclmullqlqdq %xmm8,%xmm5 | ||||
| 	pclmulhqhqdq %xmm8,%xmm1 | ||||
| 	pclmulhqhqdq %xmm8,%xmm2 | ||||
| 	pclmulhqhqdq %xmm8,%xmm4 | ||||
| 	pxor	%xmm7,%xmm1 | ||||
| 	movdqu	-64(%rax),%xmm7 | ||||
| 	pxor	%xmm6,%xmm2 | ||||
| 	pxor	%xmm5,%xmm4 | ||||
| 	movdqu	-32(%rax),%xmm6 | ||||
| 	movdqu	-16(%rax),%xmm5 | ||||
| 	pclmullqlqdq %xmm8,%xmm3 | ||||
| 	pclmulhqhqdq %xmm8,%xmm0 | ||||
| 	pxor	%xmm7,%xmm1 | ||||
| 	pxor	%xmm3,%xmm0 | ||||
| 	pxor	%xmm2,%xmm9 | ||||
| 	pxor	%xmm6,%xmm4 | ||||
| 	pxor	%xmm5,%xmm0 | ||||
| 	cmp	%rsi,%rax | ||||
| 	jne	3b | ||||
| //	rdi = bytes left over after the loop (n-128 modulo 64) and | ||||
| //	rcx = address where those bytes start. | ||||
| 	lea	1(%rdx),%rax | ||||
| 	sal	$6,%rdx | ||||
| 	sal	$6,%rax | ||||
| 	sub	%rdx,%rdi | ||||
| 	add	%rax,%rcx | ||||
| //	Merge the four accumulators into xmm0 with the .Lk3k4 | ||||
| //	constants: xmm1 into xmm9, that into xmm4, that into xmm0. | ||||
| 2:	movdqa	.Lk3k4(%rip),%xmm3 | ||||
| 	movdqa	%xmm1,%xmm2 | ||||
| 	movdqa	%xmm1,%xmm5 | ||||
| 	pclmulhqhqdq %xmm3,%xmm2 | ||||
| 	pclmullqlqdq %xmm3,%xmm5 | ||||
| 	pxor	%xmm9,%xmm2 | ||||
| 	pxor	%xmm5,%xmm2 | ||||
| 	movdqa	%xmm2,%xmm5 | ||||
| 	pclmulhqhqdq %xmm3,%xmm2 | ||||
| 	movdqa	%xmm2,%xmm1 | ||||
| 	pclmullqlqdq %xmm3,%xmm5 | ||||
| 	pxor	%xmm4,%xmm1 | ||||
| 	pxor	%xmm5,%xmm1 | ||||
| 	movdqa	%xmm1,%xmm2 | ||||
| 	pclmulhqhqdq %xmm3,%xmm1 | ||||
| 	pclmullqlqdq %xmm3,%xmm2 | ||||
| 	pxor	%xmm1,%xmm0 | ||||
| 	pxor	%xmm2,%xmm0 | ||||
| //	Fold any remaining whole 16-byte blocks into xmm0, one block | ||||
| //	per iteration, still using the .Lk3k4 constants. | ||||
| 	cmp	$15,%rdi | ||||
| 	jbe	4f | ||||
| 	sub	$16,%rdi | ||||
| 	mov	%rcx,%rax | ||||
| 	and	$-16,%rdi | ||||
| 	lea	16(%rcx,%rdi),%rdx | ||||
| 5:	movdqa	%xmm0,%xmm1 | ||||
| 	movdqu	(%rax),%xmm6 | ||||
| 	pclmulhqhqdq %xmm3,%xmm0 | ||||
| 	add	$16,%rax | ||||
| 	pclmullqlqdq %xmm3,%xmm1 | ||||
| 	pxor	%xmm1,%xmm0 | ||||
| 	pxor	%xmm6,%xmm0 | ||||
| 	cmp	%rax,%rdx | ||||
| 	jne	5b | ||||
| //	Final reduction of xmm0 using .Lk5k0 and .Lpoly, with .Lboop | ||||
| //	masking the low 32 bits of each quadword. NOTE(review): this | ||||
| //	presumably is the Barrett reduction from the cited paper; | ||||
| //	confirm. The result is bits 32..63 of xmm0's low quadword. | ||||
| 4:	movdqa	%xmm0,%xmm1 | ||||
| 	movdqa	.Lboop(%rip),%xmm2 | ||||
| 	psrldq	$8,%xmm0 | ||||
| 	pclmullqhqdq %xmm3,%xmm1 | ||||
| 	movdqa	.Lpoly(%rip),%xmm3 | ||||
| 	pxor	%xmm1,%xmm0 | ||||
| 	movdqa	%xmm0,%xmm1 | ||||
| 	pand	%xmm2,%xmm0 | ||||
| 	pclmullqlqdq .Lk5k0(%rip),%xmm0 | ||||
| 	psrldq	$4,%xmm1 | ||||
| 	pxor	%xmm0,%xmm1 | ||||
| 	movdqa	%xmm1,%xmm0 | ||||
| 	pand	%xmm2,%xmm0 | ||||
| 	pclmullqhqdq %xmm3,%xmm0 | ||||
| 	pand	%xmm2,%xmm0 | ||||
| 	pclmullqlqdq %xmm3,%xmm0 | ||||
| 	pxor	%xmm1,%xmm0 | ||||
| 	movq	%xmm0,%rax | ||||
| 	shr	$32,%rax | ||||
| 	.leafepilogue | ||||
| 	.endfn	crc32_pclmul,globl,hidden | ||||
| 
 | ||||
| //	Definitions of the bit-reflected domain constants k1,k2,k3, etc. | ||||
| //	and the CRC32+Barrett polynomials given at the end of the paper. | ||||
| 	.rodata.cst16 | ||||
| .Lk1k2:	.quad	0x0000000154442bd4
 | ||||
| 	.quad	0x00000001c6e41596
 | ||||
| 	.endobj	.Lk1k2 | ||||
| .Lk3k4:	.quad	0x00000001751997d0
 | ||||
| 	.quad	0x00000000ccaa009e
 | ||||
| 	.endobj	.Lk3k4 | ||||
| .Lk5k0:	.quad	0x0000000163cd6124
 | ||||
| 	.quad	0x0000000000000000
 | ||||
| 	.endobj	.Lk5k0 | ||||
| .Lboop:	.quad	0x00000000ffffffff
 | ||||
| 	.quad	0x00000000ffffffff
 | ||||
| 	.endobj	.Lboop | ||||
| .Lpoly:	.quad	0x00000001db710641
 | ||||
| 	.quad	0x00000001f7011641
 | ||||
| 	.endobj	.Lpoly | ||||
| 	.previous | ||||
| 
 | ||||
| /*	crc32() w/ pclmul for #c per n where c ≈ 0.293ns | ||||
| 	N                     x1            x8           x64	mBps | ||||
| 	------------------------------------------------------------ | ||||
| 	1               4437.000        42.375        38.141      85
 | ||||
| 	1                 45.000        39.375        38.234      85
 | ||||
| 	2                 31.500        25.312        23.102     141
 | ||||
| 	3                 25.667        19.792        17.911     181
 | ||||
| 	4                 21.250        16.219        15.035     216
 | ||||
| 	7                 18.429        12.946        11.712     277
 | ||||
| 	8                 16.125        12.578        10.998     296
 | ||||
| 	15                12.867         9.925         9.161     355
 | ||||
| 	16                12.438         9.836         9.114     357
 | ||||
| 	31                11.194         8.528         8.149     399
 | ||||
| 	32                10.781         8.418         8.098     401
 | ||||
| 	63                 9.063         7.780         7.647     425
 | ||||
| 	64                 3.109         1.604         1.414    2299
 | ||||
| 	127                2.260         1.824         1.729    1880
 | ||||
| 	128                1.305         0.860         0.806    4033
 | ||||
| 	255                1.290         1.001         0.948    3428
 | ||||
| 	256                0.574         0.491         0.476    6822
 | ||||
| 	511                0.773         0.571         0.546    5956
 | ||||
| 	512                0.354         0.320         0.306   10613
 | ||||
| 	1023               0.425         0.365         0.347    9375
 | ||||
| 	1024               0.237         0.229         0.231   14097
 | ||||
| 	2047               0.278         0.251         0.246   13236
 | ||||
| 	2048               0.187         0.187         0.188   17306
 | ||||
| 	4095               0.229         0.200         0.194   16761
 | ||||
| 	4096               0.162         0.170         0.167   19438
 | ||||
| 	8191               0.182         0.173         0.178   18266
 | ||||
| 	8192               0.162         0.155         0.158   20560
 | ||||
| 	16383              0.156         0.162         0.154   21136
 | ||||
| 	16384              0.156         0.156         0.148   22005
 | ||||
| 	32767              0.163         0.149         0.149   21768
 | ||||
| 	32768              0.150         0.146         0.145   22491
 | ||||
| 	65535              0.158         0.141         0.141   23102
 | ||||
| 	65536              0.149         0.140         0.138   23478
 | ||||
| 	131071             0.150         0.145         0.141   23011
 | ||||
| 	131072             0.148         0.141         0.148   21892
 | ||||
| 	262143             0.151         0.148         0.147   22136
 | ||||
| 	262144             0.149         0.146         0.146   22298
 | ||||
| 	524287             0.150         0.149         0.149   21832
 | ||||
| 	524288             0.148         0.148         0.147   22043
 | ||||
| 	1048575            0.148         0.158         0.163   19913
 | ||||
| 	1048576            0.156         0.179         0.153   21186
 | ||||
| 	2097151            0.153         0.149         0.148   21979
 | ||||
| 	2097152            0.147         0.148         0.147   22040
 | ||||
| 	4194303            0.148         0.148         0.151   21482
 | ||||
| 	4194304            0.148         0.148         0.147   22061
 | ||||
| 	8388607            0.185         0.183         0.185   17536
 | ||||
| 	8388608            0.193         0.180         0.183   17769
 | ||||
| 
 | ||||
| 	crc32() w/ 10+ year old cpus for #c per n where c ≈ 0.293ns | ||||
| 	N                     x1            x8           x64	mBps | ||||
| 	------------------------------------------------------------ | ||||
| 	1               4447.000        43.625        37.641      86
 | ||||
| 	1                 41.000        37.125        37.609      86
 | ||||
| 	2                 31.500        26.562        22.477     145
 | ||||
| 	3                 25.000        20.125        17.422     187
 | ||||
| 	4                 21.250        16.594        15.230     213
 | ||||
| 	7                 16.714        13.089        11.717     277
 | ||||
| 	8                 16.875        12.609        11.174     291
 | ||||
| 	15                12.733         9.958         9.339     348
 | ||||
| 	16                12.438         9.852         9.208     353
 | ||||
| 	31                10.935         8.617         8.164     398
 | ||||
| 	32                10.906         8.496         8.155     399
 | ||||
| 	63                 9.095         7.819         7.692     423
 | ||||
| 	64                 9.172         7.807         7.692     423
 | ||||
| 	127                8.165         7.531         7.438     437
 | ||||
| 	128                8.133         7.503         7.437     437
 | ||||
| 	255                7.714         7.329         7.293     446
 | ||||
| 	256                7.723         7.348         7.293     446
 | ||||
| 	511                7.434         7.253         7.223     450
 | ||||
| 	512                7.412         7.237         7.218     450
 | ||||
| 	1023               7.274         7.214         7.201     451
 | ||||
| 	1024               7.292         7.203         7.189     452
 | ||||
| 	2047               7.232         7.185         7.178     453
 | ||||
| 	2048               7.239         7.189         7.186     452
 | ||||
| 	4095               7.189         7.175         7.172     453
 | ||||
| 	4096               7.192         7.173         7.172     453
 | ||||
| 	8191               7.187         7.173         7.172     453
 | ||||
| 	8192               7.183         7.174         7.181     453
 | ||||
| 	16383              7.175         7.170         7.169     453
 | ||||
| 	16384              7.176         7.169         7.169     453
 | ||||
| 	32767              7.169         7.182         7.170     453
 | ||||
| 	32768              7.173         7.172         7.172     453
 | ||||
| 	65535              7.170         7.170         7.171     453
 | ||||
| 	65536              7.172         7.171         7.204     451
 | ||||
| 	131071             7.170         7.354         7.260     448
 | ||||
| 	131072             7.172         7.172         7.182     453
 | ||||
| 	262143             7.037         7.178         7.182     453
 | ||||
| 	262144             7.169         7.343         7.205     451
 | ||||
| 	524287             7.438         7.170         7.206     451
 | ||||
| 	524288             7.169         7.164         7.209     451
 | ||||
| 	1048575            6.995         7.119         7.158     454
 | ||||
| 	1048576            7.168         7.110         7.157     454
 | ||||
| 	2097151            7.057         7.058         7.065     460
 | ||||
| 	2097152            6.977         7.047         7.089     458
 | ||||
| 	4194303            7.017         7.504         7.030     462
 | ||||
| 	4194304            7.025         7.059         7.030     462
 | ||||
| 	8388607            7.082         6.980         6.997     464
 | ||||
| 	8388608            7.051         6.985         6.999     464 */ | ||||
|  | @ -8,12 +8,6 @@ extern const uint32_t kCrc32cTab[256]; | |||
| void crc32init(uint32_t[hasatleast 256], uint32_t); | ||||
| uint32_t crc32a(uint32_t, const void *, size_t); | ||||
| uint32_t crc32c(uint32_t, const void *, size_t); | ||||
| uint32_t crc32_z(uint32_t, const void *, size_t); | ||||
| uint32_t crc32c_pure(uint32_t, const void *, size_t) | ||||
| strlenesque _Hide; | ||||
| uint32_t crc32c_sse42(uint32_t, const void *, size_t) | ||||
| strlenesque _Hide; | ||||
| uint32_t crc32_pclmul(uint32_t, const void *, size_t) _Hide; | ||||
| 
 | ||||
| COSMOPOLITAN_C_END_ | ||||
| #endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */ | ||||
|  |  | |||
|  | @ -1,63 +0,0 @@ | |||
| /*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
 | ||||
| │vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8                                :vi│ | ||||
| ╞══════════════════════════════════════════════════════════════════════════════╡ | ||||
| │ Copyright 2020 Justine Alexandra Roberts Tunney                              │ | ||||
| │                                                                              │ | ||||
| │ Permission to use, copy, modify, and/or distribute this software for         │ | ||||
| │ any purpose with or without fee is hereby granted, provided that the         │ | ||||
| │ above copyright notice and this permission notice appear in all copies.      │ | ||||
| │                                                                              │ | ||||
| │ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │ | ||||
| │ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │ | ||||
| │ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │ | ||||
| │ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │ | ||||
| │ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │ | ||||
| │ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │ | ||||
| │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │ | ||||
| │ PERFORMANCE OF THIS SOFTWARE.                                                │ | ||||
| ╚─────────────────────────────────────────────────────────────────────────────*/ | ||||
| #include "libc/dce.h" | ||||
| #include "libc/intrin/asan.internal.h" | ||||
| #include "libc/macros.internal.h" | ||||
| #include "libc/nexgen32e/crc32.h" | ||||
| #include "libc/nexgen32e/x86feature.h" | ||||
| #include "libc/str/str.h" | ||||
| 
 | ||||
| /**
 | ||||
|  * Computes Phil Katz CRC-32 used by zip/zlib/gzip/etc. | ||||
|  * | ||||
|  *     x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1 | ||||
|  *     0b100000100110000010001110110110111 | ||||
|  *     _bitreverse32(0x104c11db7) | ||||
|  * | ||||
|  * This implementation takes 32 picoseconds per byte or 30 gibibyte/s. | ||||
|  * | ||||
|  * @param h is initial value | ||||
|  */ | ||||
| uint32_t crc32_z(uint32_t h, const void *data, size_t size) { | ||||
|   size_t n; | ||||
|   static bool once; | ||||
|   const unsigned char *p, *e; | ||||
|   static uint32_t kCrc32Tab[256]; | ||||
|   if (!once) { | ||||
|     crc32init(kCrc32Tab, 0xedb88320); | ||||
|     once = 0; | ||||
|   } | ||||
|   if (size == -1) { | ||||
|     size = data ? strlen(data) : 0; | ||||
|   } | ||||
|   p = data; | ||||
|   e = p + size; | ||||
|   h ^= 0xffffffff; | ||||
|   if (X86_HAVE(PCLMUL)) { | ||||
|     while (((intptr_t)p & 15) && p < e) | ||||
|       h = h >> 8 ^ kCrc32Tab[(h & 0xff) ^ *p++]; | ||||
|     if ((n = ROUNDDOWN(e - p, 16)) >= 64) { | ||||
|       if (IsAsan()) __asan_verify(p, n); | ||||
|       h = crc32_pclmul(h, p, n); /* 51x faster */ | ||||
|       p += n; | ||||
|     } | ||||
|   } | ||||
|   while (p < e) h = h >> 8 ^ kCrc32Tab[(h & 0xff) ^ *p++]; | ||||
|   return h ^ 0xffffffff; | ||||
| } | ||||
							
								
								
									
										61
									
								
								libc/sysv/consts/hwap.h
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										61
									
								
								libc/sysv/consts/hwap.h
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,61 @@ | |||
| #ifndef COSMOPOLITAN_LIBC_SYSV_CONSTS_HWAP_H_ | ||||
| #define COSMOPOLITAN_LIBC_SYSV_CONSTS_HWAP_H_ | ||||
| #ifdef __aarch64__ | ||||
| 
 | ||||
| // Feature bits for getauxval(AT_HWCAP) and getauxval(AT_HWCAP2) on AARCH64 GNU/SystemD.
 | ||||
| 
 | ||||
| #define HWCAP_FP       (1 << 0) | ||||
| #define HWCAP_ASIMD    (1 << 1) | ||||
| #define HWCAP_EVTSTRM  (1 << 2) | ||||
| #define HWCAP_AES      (1 << 3) | ||||
| #define HWCAP_PMULL    (1 << 4) | ||||
| #define HWCAP_SHA1     (1 << 5) | ||||
| #define HWCAP_SHA2     (1 << 6) | ||||
| #define HWCAP_CRC32    (1 << 7) | ||||
| #define HWCAP_ATOMICS  (1 << 8) | ||||
| #define HWCAP_FPHP     (1 << 9) | ||||
| #define HWCAP_ASIMDHP  (1 << 10) | ||||
| #define HWCAP_CPUID    (1 << 11) | ||||
| #define HWCAP_ASIMDRDM (1 << 12) | ||||
| #define HWCAP_JSCVT    (1 << 13) | ||||
| #define HWCAP_FCMA     (1 << 14) | ||||
| #define HWCAP_LRCPC    (1 << 15) | ||||
| #define HWCAP_DCPOP    (1 << 16) | ||||
| #define HWCAP_SHA3     (1 << 17) | ||||
| #define HWCAP_SM3      (1 << 18) | ||||
| #define HWCAP_SM4      (1 << 19) | ||||
| #define HWCAP_ASIMDDP  (1 << 20) | ||||
| #define HWCAP_SHA512   (1 << 21) | ||||
| #define HWCAP_SVE      (1 << 22) | ||||
| #define HWCAP_ASIMDFHM (1 << 23) | ||||
| #define HWCAP_DIT      (1 << 24) | ||||
| #define HWCAP_USCAT    (1 << 25) | ||||
| #define HWCAP_ILRCPC   (1 << 26) | ||||
| #define HWCAP_FLAGM    (1 << 27) | ||||
| #define HWCAP_SSBS     (1 << 28) | ||||
| #define HWCAP_SB       (1 << 29) | ||||
| #define HWCAP_PACA     (1 << 30) | ||||
| #define HWCAP_PACG     (1UL << 31) | ||||
| 
 | ||||
| #define HWCAP2_DCPODP     (1 << 0) | ||||
| #define HWCAP2_SVE2       (1 << 1) | ||||
| #define HWCAP2_SVEAES     (1 << 2) | ||||
| #define HWCAP2_SVEPMULL   (1 << 3) | ||||
| #define HWCAP2_SVEBITPERM (1 << 4) | ||||
| #define HWCAP2_SVESHA3    (1 << 5) | ||||
| #define HWCAP2_SVESM4     (1 << 6) | ||||
| #define HWCAP2_FLAGM2     (1 << 7) | ||||
| #define HWCAP2_FRINT      (1 << 8) | ||||
| #define HWCAP2_SVEI8MM    (1 << 9) | ||||
| #define HWCAP2_SVEF32MM   (1 << 10) | ||||
| #define HWCAP2_SVEF64MM   (1 << 11) | ||||
| #define HWCAP2_SVEBF16    (1 << 12) | ||||
| #define HWCAP2_I8MM       (1 << 13) | ||||
| #define HWCAP2_BF16       (1 << 14) | ||||
| #define HWCAP2_DGH        (1 << 15) | ||||
| #define HWCAP2_RNG        (1 << 16) | ||||
| #define HWCAP2_BTI        (1 << 17) | ||||
| #define HWCAP2_MTE        (1 << 18) | ||||
| 
 | ||||
| #endif /* __aarch64__ */ | ||||
| #endif /* COSMOPOLITAN_LIBC_SYSV_CONSTS_HWAP_H_ */ | ||||
|  | @ -5,6 +5,13 @@ | |||
| │ FreeBSD lib/msun/src/e_acoshl.c                                              │ | ||||
| │ Converted to ldbl by David Schultz <das@FreeBSD.ORG> and Bruce D. Evans.     │ | ||||
| │                                                                              │ | ||||
| │ Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.            │ | ||||
| │                                                                              │ | ||||
| │ Developed at SunPro, a Sun Microsystems, Inc. business.                      │ | ||||
| │ Permission to use, copy, modify, and distribute this                         │ | ||||
| │ software is freely granted, provided that this notice                        │ | ||||
| │ is preserved.                                                                │ | ||||
| │                                                                              │ | ||||
| │ Copyright (c) 1992-2023 The FreeBSD Project.                                 │ | ||||
| │                                                                              │ | ||||
| │ Redistribution and use in source and binary forms, with or without           │ | ||||
|  | @ -28,12 +35,6 @@ | |||
| │ OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF       │ | ||||
| │ SUCH DAMAGE.                                                                 │ | ||||
| │                                                                              │ | ||||
| │ Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.            │ | ||||
| │ Developed at SunPro, a Sun Microsystems, Inc. business.                      │ | ||||
| │ Permission to use, copy, modify, and distribute this                         │ | ||||
| │ software is freely granted, provided that this notice                        │ | ||||
| │ is preserved.                                                                │ | ||||
| │                                                                              │ | ||||
| ╚─────────────────────────────────────────────────────────────────────────────*/ | ||||
| #include "libc/math.h" | ||||
| #include "libc/tinymath/freebsd.internal.h" | ||||
|  |  | |||
|  | @ -27,6 +27,7 @@ | |||
| ╚─────────────────────────────────────────────────────────────────────────────*/ | ||||
| #include "libc/math.h" | ||||
| #include "libc/tinymath/feval.internal.h" | ||||
| #include "libc/tinymath/freebsd.internal.h" | ||||
| 
 | ||||
| asm(".ident\t\"\\n\\n\
 | ||||
| Musl libc (MIT License)\\n\ | ||||
|  |  | |||
|  | @ -5,6 +5,13 @@ | |||
| │ FreeBSD lib/msun/src/s_asinhl.c                                              │ | ||||
| │ Converted to ldbl by David Schultz <das@FreeBSD.ORG> and Bruce D. Evans.     │ | ||||
| │                                                                              │ | ||||
| │ Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.            │ | ||||
| │                                                                              │ | ||||
| │ Developed at SunPro, a Sun Microsystems, Inc. business.                      │ | ||||
| │ Permission to use, copy, modify, and distribute this                         │ | ||||
| │ software is freely granted, provided that this notice                        │ | ||||
| │ is preserved.                                                                │ | ||||
| │                                                                              │ | ||||
| │ Copyright (c) 1992-2023 The FreeBSD Project.                                 │ | ||||
| │                                                                              │ | ||||
| │ Redistribution and use in source and binary forms, with or without           │ | ||||
|  | @ -28,12 +35,6 @@ | |||
| │ OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF       │ | ||||
| │ SUCH DAMAGE.                                                                 │ | ||||
| │                                                                              │ | ||||
| │ Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.            │ | ||||
| │ Developed at SunPro, a Sun Microsystems, Inc. business.                      │ | ||||
| │ Permission to use, copy, modify, and distribute this                         │ | ||||
| │ software is freely granted, provided that this notice                        │ | ||||
| │ is preserved.                                                                │ | ||||
| │                                                                              │ | ||||
| ╚─────────────────────────────────────────────────────────────────────────────*/ | ||||
| #include "libc/math.h" | ||||
| #include "libc/tinymath/freebsd.internal.h" | ||||
|  |  | |||
|  | @ -4,6 +4,13 @@ | |||
| │                                                                              │ | ||||
| │ FreeBSD lib/msun/src/e_atan2.c                                               │ | ||||
| │                                                                              │ | ||||
| │ Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.            │ | ||||
| │                                                                              │ | ||||
| │ Developed at SunPro, a Sun Microsystems, Inc. business.                      │ | ||||
| │ Permission to use, copy, modify, and distribute this                         │ | ||||
| │ software is freely granted, provided that this notice                        │ | ||||
| │ is preserved.                                                                │ | ||||
| │                                                                              │ | ||||
| │ Copyright (c) 1992-2023 The FreeBSD Project.                                 │ | ||||
| │                                                                              │ | ||||
| │ Redistribution and use in source and binary forms, with or without           │ | ||||
|  | @ -27,12 +34,6 @@ | |||
| │ OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF       │ | ||||
| │ SUCH DAMAGE.                                                                 │ | ||||
| │                                                                              │ | ||||
| │ Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.            │ | ||||
| │ Developed at SunPro, a Sun Microsystems, Inc. business.                      │ | ||||
| │ Permission to use, copy, modify, and distribute this                         │ | ||||
| │ software is freely granted, provided that this notice                        │ | ||||
| │ is preserved.                                                                │ | ||||
| │                                                                              │ | ||||
| ╚─────────────────────────────────────────────────────────────────────────────*/ | ||||
| #include "libc/math.h" | ||||
| #include "libc/tinymath/freebsd.internal.h" | ||||
|  |  | |||
|  | @ -79,7 +79,7 @@ long double atan2l(long double y, long double x) | |||
| 	long double z; | ||||
| 	int m, ex, ey; | ||||
| 
 | ||||
| 	if (isnan(x) || isnan(y)) | ||||
| 	if (isunordered(x, y)) | ||||
| 		return x+y; | ||||
| 	if (x == 1) | ||||
| 		return atanl(y); | ||||
|  |  | |||
|  | @ -5,6 +5,13 @@ | |||
| │ FreeBSD lib/msun/src/s_tanhf.c                                               │ | ||||
| │ Converted to long double by Bruce D. Evans.                                  │ | ||||
| │                                                                              │ | ||||
| │ Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.            │ | ||||
| │                                                                              │ | ||||
| │ Developed at SunPro, a Sun Microsystems, Inc. business.                      │ | ||||
| │ Permission to use, copy, modify, and distribute this                         │ | ||||
| │ software is freely granted, provided that this notice                        │ | ||||
| │ is preserved.                                                                │ | ||||
| │                                                                              │ | ||||
| │ Copyright (c) 1992-2023 The FreeBSD Project.                                 │ | ||||
| │                                                                              │ | ||||
| │ Redistribution and use in source and binary forms, with or without           │ | ||||
|  | @ -28,12 +35,6 @@ | |||
| │ OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF       │ | ||||
| │ SUCH DAMAGE.                                                                 │ | ||||
| │                                                                              │ | ||||
| │ Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.            │ | ||||
| │ Developed at SunPro, a Sun Microsystems, Inc. business.                      │ | ||||
| │ Permission to use, copy, modify, and distribute this                         │ | ||||
| │ software is freely granted, provided that this notice                        │ | ||||
| │ is preserved.                                                                │ | ||||
| │                                                                              │ | ||||
| ╚─────────────────────────────────────────────────────────────────────────────*/ | ||||
| #include "libc/math.h" | ||||
| #include "libc/tinymath/freebsd.internal.h" | ||||
|  |  | |||
|  | @ -36,7 +36,11 @@ Copyright 2005-2014 Rich Felker, et. al.\""); | |||
| asm(".include \"libc/disclaimer.inc\""); | ||||
| // clang-format off
 | ||||
| 
 | ||||
| long double cosl(long double x) { | ||||
| /**
 | ||||
|  * Returns cosine of 𝑥. | ||||
|  */ | ||||
| long double cosl(long double x) | ||||
| { | ||||
| #if LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024 | ||||
| 	return cos(x); | ||||
| #elif (LDBL_MANT_DIG == 64 || LDBL_MANT_DIG == 113) && LDBL_MAX_EXP == 16384 | ||||
|  |  | |||
|  | @ -4,6 +4,13 @@ | |||
| │                                                                              │ | ||||
| │ FreeBSD lib/msun/src/s_expm1f.c                                              │ | ||||
| │                                                                              │ | ||||
| │ Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.            │ | ||||
| │                                                                              │ | ||||
| │ Developed at SunPro, a Sun Microsystems, Inc. business.                      │ | ||||
| │ Permission to use, copy, modify, and distribute this                         │ | ||||
| │ software is freely granted, provided that this notice                        │ | ||||
| │ is preserved.                                                                │ | ||||
| │                                                                              │ | ||||
| │ Copyright (c) 1992-2023 The FreeBSD Project.                                 │ | ||||
| │                                                                              │ | ||||
| │ Redistribution and use in source and binary forms, with or without           │ | ||||
|  | @ -27,12 +34,6 @@ | |||
| │ OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF       │ | ||||
| │ SUCH DAMAGE.                                                                 │ | ||||
| │                                                                              │ | ||||
| │ Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.            │ | ||||
| │ Developed at SunPro, a Sun Microsystems, Inc. business.                      │ | ||||
| │ Permission to use, copy, modify, and distribute this                         │ | ||||
| │ software is freely granted, provided that this notice                        │ | ||||
| │ is preserved.                                                                │ | ||||
| │                                                                              │ | ||||
| ╚─────────────────────────────────────────────────────────────────────────────*/ | ||||
| #include "libc/math.h" | ||||
| #include "libc/tinymath/freebsd.internal.h" | ||||
|  |  | |||
|  | @ -31,7 +31,7 @@ asm(".ident\t\"\\n\\n\ | |||
| Musl libc (MIT License)\\n\ | ||||
| Copyright 2005-2014 Rich Felker, et. al.\""); | ||||
| asm(".include \"libc/disclaimer.inc\""); | ||||
| /* clang-format off */ | ||||
| // clang-format off
 | ||||
| 
 | ||||
| #define asdouble(i) ((union{uint64_t _i; double _f;}){i})._f | ||||
| #define INSERT_WORDS(d,hi,lo)                     \ | ||||
|  |  | |||
|  | @ -31,7 +31,7 @@ asm(".ident\t\"\\n\\n\ | |||
| Musl libc (MIT License)\\n\ | ||||
| Copyright 2005-2014 Rich Felker, et. al.\""); | ||||
| asm(".include \"libc/disclaimer.inc\""); | ||||
| /* clang-format off */ | ||||
| // clang-format off
 | ||||
| 
 | ||||
| #define asfloat(i) ((union{uint32_t _i; float _f;}){i})._f | ||||
| #define SET_FLOAT_WORD(d,w)                       \ | ||||
|  |  | |||
|  | @ -22,6 +22,6 @@ | |||
|  * Returns positive difference. | ||||
|  */ | ||||
double fdim(double x, double y) {
  // A NaN in either operand makes the pair unordered, so this single
  // test replaces the separate isnan(x) || isnan(y) check that used to
  // sit in front of it and made it unreachable.
  if (isunordered(x, y)) return NAN;
  // x - y when x exceeds y, otherwise zero
  return x > y ? x - y : 0;
}
|  |  | |||
|  | @ -22,6 +22,6 @@ | |||
|  * Returns positive difference. | ||||
|  */ | ||||
float fdimf(float x, float y) {
  // A NaN in either operand makes the pair unordered, so this single
  // test replaces the separate isnan(x) || isnan(y) check that used to
  // sit in front of it and made it unreachable.
  if (isunordered(x, y)) return NAN;
  // x - y when x exceeds y, otherwise zero
  return x > y ? x - y : 0;
}
|  |  | |||
|  | @ -25,7 +25,7 @@ long double fdiml(long double x, long double y) { | |||
| #if LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024 | ||||
|   return fdim(x, y); | ||||
| #else | ||||
|   if (isnan(x) || isnan(y)) return NAN; | ||||
|   if (isunordered(x, y)) return NAN; | ||||
|   return x > y ? x - y : 0; | ||||
| #endif | ||||
| } | ||||
|  |  | |||
|  | @ -906,67 +906,6 @@ irintl(long double x) | |||
| 	__x + __y;			\ | ||||
| }) | ||||
| 
 | ||||
| /*
 | ||||
|  * ieee style elementary functions | ||||
|  * | ||||
|  * We rename functions here to improve other sources' diffability | ||||
|  * against fdlibm. | ||||
|  */ | ||||
| #define	__ieee754_sqrt	sqrt | ||||
| #define	__ieee754_acos	acos | ||||
| #define	__ieee754_acosh	acosh | ||||
| #define	__ieee754_log	log | ||||
| #define	__ieee754_log2	log2 | ||||
| #define	__ieee754_atanh	atanh | ||||
| #define	__ieee754_asin	asin | ||||
| #define	__ieee754_atan2	atan2 | ||||
| #define	__ieee754_exp	exp | ||||
| #define	__ieee754_cosh	cosh | ||||
| #define	__ieee754_fmod	fmod | ||||
| #define	__ieee754_pow	pow | ||||
| #define	__ieee754_lgamma lgamma | ||||
| #define	__ieee754_gamma	gamma | ||||
| #define	__ieee754_lgamma_r lgamma_r | ||||
| #define	__ieee754_gamma_r gamma_r | ||||
| #define	__ieee754_log10	log10 | ||||
| #define	__ieee754_sinh	sinh | ||||
| #define	__ieee754_hypot	hypot | ||||
| #define	__ieee754_j0	j0 | ||||
| #define	__ieee754_j1	j1 | ||||
| #define	__ieee754_y0	y0 | ||||
| #define	__ieee754_y1	y1 | ||||
| #define	__ieee754_jn	jn | ||||
| #define	__ieee754_yn	yn | ||||
| #define	__ieee754_remainder remainder | ||||
| #define	__ieee754_scalb	scalb | ||||
| #define	__ieee754_sqrtf	sqrtf | ||||
| #define	__ieee754_acosf	acosf | ||||
| #define	__ieee754_acoshf acoshf | ||||
| #define	__ieee754_logf	logf | ||||
| #define	__ieee754_atanhf atanhf | ||||
| #define	__ieee754_asinf	asinf | ||||
| #define	__ieee754_atan2f atan2f | ||||
| #define	__ieee754_expf	expf | ||||
| #define	__ieee754_coshf	coshf | ||||
| #define	__ieee754_fmodf	fmodf | ||||
| #define	__ieee754_powf	powf | ||||
| #define	__ieee754_lgammaf lgammaf | ||||
| #define	__ieee754_gammaf gammaf | ||||
| #define	__ieee754_lgammaf_r lgammaf_r | ||||
| #define	__ieee754_gammaf_r gammaf_r | ||||
| #define	__ieee754_log10f log10f | ||||
| #define	__ieee754_log2f log2f | ||||
| #define	__ieee754_sinhf	sinhf | ||||
| #define	__ieee754_hypotf hypotf | ||||
| #define	__ieee754_j0f	j0f | ||||
| #define	__ieee754_j1f	j1f | ||||
| #define	__ieee754_y0f	y0f | ||||
| #define	__ieee754_y1f	y1f | ||||
| #define	__ieee754_jnf	jnf | ||||
| #define	__ieee754_ynf	ynf | ||||
| #define	__ieee754_remainderf remainderf | ||||
| #define	__ieee754_scalbf scalbf | ||||
| 
 | ||||
| /* fdlibm kernel function */ | ||||
| int	__kernel_rem_pio2(double*,double*,int,int,int); | ||||
| 
 | ||||
|  |  | |||
|  | @ -16,7 +16,7 @@ | |||
| │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │ | ||||
| │ PERFORMANCE OF THIS SOFTWARE.                                                │ | ||||
| ╚─────────────────────────────────────────────────────────────────────────────*/ | ||||
| #include "libc/tinymath/tinymath.h" | ||||
| #include "libc/math.h" | ||||
| 
 | ||||
| /**
 | ||||
|  * Rounds to nearest integer. | ||||
|  |  | |||
|  | @ -83,7 +83,8 @@ static dontinline long lrint_slow(double x) { | |||
| /**
 | ||||
|  * Rounds to nearest integer. | ||||
|  */ | ||||
| long lrint(double x) { | ||||
| long lrint(double x) | ||||
| { | ||||
| #ifdef __x86_64__ | ||||
| 	long res; | ||||
| 	asm("cvtsd2si\t%1,%0" : "=r"(res) : "x"(x)); | ||||
|  |  | |||
|  | @ -31,7 +31,7 @@ asm(".ident\t\"\\n\\n\ | |||
| Musl libc (MIT License)\\n\ | ||||
| Copyright 2005-2014 Rich Felker, et. al.\""); | ||||
| asm(".include \"libc/disclaimer.inc\""); | ||||
| /* clang-format off */ | ||||
| // clang-format off
 | ||||
| 
 | ||||
| double modf(double x, double *iptr) | ||||
| { | ||||
|  |  | |||
|  | @ -31,7 +31,7 @@ asm(".ident\t\"\\n\\n\ | |||
| Musl libc (MIT License)\\n\ | ||||
| Copyright 2005-2014 Rich Felker, et. al.\""); | ||||
| asm(".include \"libc/disclaimer.inc\""); | ||||
| /* clang-format off */ | ||||
| // clang-format off
 | ||||
| 
 | ||||
| float modff(float x, float *iptr) | ||||
| { | ||||
|  |  | |||
|  | @ -32,7 +32,7 @@ asm(".ident\t\"\\n\\n\ | |||
| Musl libc (MIT License)\\n\ | ||||
| Copyright 2005-2014 Rich Felker, et. al.\""); | ||||
| asm(".include \"libc/disclaimer.inc\""); | ||||
| /* clang-format off */ | ||||
| // clang-format off
 | ||||
| 
 | ||||
| double nextafter(double x, double y) | ||||
| { | ||||
|  | @ -40,7 +40,7 @@ double nextafter(double x, double y) | |||
| 	uint64_t ax, ay; | ||||
| 	int e; | ||||
| 
 | ||||
| 	if (isnan(x) || isnan(y)) | ||||
| 	if (isunordered(x, y)) | ||||
| 		return x + y; | ||||
| 	if (ux.i == uy.i) | ||||
| 		return y; | ||||
|  |  | |||
|  | @ -32,14 +32,14 @@ asm(".ident\t\"\\n\\n\ | |||
| Musl libc (MIT License)\\n\ | ||||
| Copyright 2005-2014 Rich Felker, et. al.\""); | ||||
| asm(".include \"libc/disclaimer.inc\""); | ||||
| /* clang-format off */ | ||||
| // clang-format off
 | ||||
| 
 | ||||
| float nextafterf(float x, float y) | ||||
| { | ||||
| 	union {float f; uint32_t i;} ux={x}, uy={y}; | ||||
| 	uint32_t ax, ay, e; | ||||
| 
 | ||||
| 	if (isnan(x) || isnan(y)) | ||||
| 	if (isunordered(x, y)) | ||||
| 		return x + y; | ||||
| 	if (ux.i == uy.i) | ||||
| 		return y; | ||||
|  |  | |||
|  | @ -36,13 +36,14 @@ Copyright 2005-2014 Rich Felker, et. al.\""); | |||
| asm(".include \"libc/disclaimer.inc\""); | ||||
| // clang-format off
 | ||||
| 
 | ||||
| long double nextafterl(long double x, long double y) { | ||||
| long double nextafterl(long double x, long double y) | ||||
| { | ||||
| #if LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024 | ||||
| 	return nextafter(x, y); | ||||
| #elif LDBL_MANT_DIG == 64 && LDBL_MAX_EXP == 16384 | ||||
| 	union ldshape ux, uy; | ||||
| 
 | ||||
| 	if (isnan(x) || isnan(y)) | ||||
| 	if (isunordered(x, y)) | ||||
| 		return x + y; | ||||
| 	if (x == y) | ||||
| 		return y; | ||||
|  | @ -75,7 +76,7 @@ long double nextafterl(long double x, long double y) { | |||
| #elif LDBL_MANT_DIG == 113 && LDBL_MAX_EXP == 16384 | ||||
| 	union ldshape ux, uy; | ||||
| 
 | ||||
| 	if (isnan(x) || isnan(y)) | ||||
| 	if (isunordered(x, y)) | ||||
| 		return x + y; | ||||
| 	if (x == y) | ||||
| 		return y; | ||||
|  |  | |||
|  | @ -32,14 +32,14 @@ asm(".ident\t\"\\n\\n\ | |||
| Musl libc (MIT License)\\n\ | ||||
| Copyright 2005-2014 Rich Felker, et. al.\""); | ||||
| asm(".include \"libc/disclaimer.inc\""); | ||||
| /* clang-format off */ | ||||
| // clang-format off
 | ||||
| 
 | ||||
| double nexttoward(double x, long double y) | ||||
| { | ||||
| 	union {double f; uint64_t i;} ux = {x}; | ||||
| 	int e; | ||||
| 
 | ||||
| 	if (isnan(x) || isnan(y)) | ||||
| 	if (isunordered(x, y)) | ||||
| 		return x + y; | ||||
| 	if (x == y) | ||||
| 		return y; | ||||
|  |  | |||
|  | @ -32,14 +32,14 @@ asm(".ident\t\"\\n\\n\ | |||
| Musl libc (MIT License)\\n\ | ||||
| Copyright 2005-2014 Rich Felker, et. al.\""); | ||||
| asm(".include \"libc/disclaimer.inc\""); | ||||
| /* clang-format off */ | ||||
| // clang-format off
 | ||||
| 
 | ||||
| float nexttowardf(float x, long double y) | ||||
| { | ||||
| 	union {float f; uint32_t i;} ux = {x}; | ||||
| 	uint32_t e; | ||||
| 
 | ||||
| 	if (isnan(x) || isnan(y)) | ||||
| 	if (isunordered(x, y)) | ||||
| 		return x + y; | ||||
| 	if (x == y) | ||||
| 		return y; | ||||
|  |  | |||
|  | @ -31,7 +31,7 @@ asm(".ident\t\"\\n\\n\ | |||
| Musl libc (MIT License)\\n\ | ||||
| Copyright 2005-2014 Rich Felker, et. al.\""); | ||||
| asm(".include \"libc/disclaimer.inc\""); | ||||
| /* clang-format off */ | ||||
| // clang-format off
 | ||||
| 
 | ||||
| long double nexttowardl(long double x, long double y) | ||||
| { | ||||
|  |  | |||
|  | @ -34,7 +34,7 @@ asm(".ident\t\"\\n\\n\ | |||
| OpenBSD libm (ISC License)\\n\ | ||||
| Copyright (c) 2008 Stephen L. Moshier <steve@moshier.net>\""); | ||||
| asm(".include \"libc/disclaimer.inc\""); | ||||
| /* clang-format off */ | ||||
| // clang-format off
 | ||||
| 
 | ||||
| /* origin: OpenBSD /usr/src/lib/libm/src/polevll.c */ | ||||
| /*
 | ||||
|  |  | |||
|  | @ -32,10 +32,10 @@ | |||
| #include "libc/tinymath/pow_data.internal.h" | ||||
| 
 | ||||
| asm(".ident\t\"\\n\\n\
 | ||||
| Double-precision math functions (MIT License)\\n\ | ||||
| Copyright 2018 ARM Limited\""); | ||||
| Optimized Routines (MIT License)\\n\ | ||||
| Copyright 2022 ARM Limited\""); | ||||
| asm(".include \"libc/disclaimer.inc\""); | ||||
| /* clang-format off */ | ||||
| // clang-format off
 | ||||
| 
 | ||||
| /*
 | ||||
|  * Double-precision x^y function. | ||||
|  |  | |||
|  | @ -121,9 +121,9 @@ double pochisq( | |||
|     	    e = (even ? 0.0 : LOG_SQRT_PI); | ||||
|     	    c = log(a); | ||||
|     	    while (z <= x) { | ||||
| 		e = log(z) + e; | ||||
| 		s += ex(c * z - a - e); | ||||
| 		z += 1.0; | ||||
| 				e = log(z) + e; | ||||
| 				s += ex(c * z - a - e); | ||||
| 				z += 1.0; | ||||
|     	    } | ||||
|     	    return (s); | ||||
|     	} else { | ||||
|  |  | |||
|  | @ -35,8 +35,8 @@ asm(".ident\t\"\\n\\n\ | |||
| Musl libc (MIT License)\\n\ | ||||
| Copyright 2005-2014 Rich Felker, et. al.\""); | ||||
| asm(".include \"libc/disclaimer.inc\""); | ||||
| // clang-format off
 | ||||
| 
 | ||||
| /* clang-format off */ | ||||
| /* origin: FreeBSD /usr/src/lib/msun/src/k_rem_pio2.c */ | ||||
| /*
 | ||||
|  * ==================================================== | ||||
|  |  | |||
|  | @ -58,7 +58,7 @@ asm(".include \"libc/disclaimer.inc\""); | |||
|  */ | ||||
| double scalb(double x, double fn) | ||||
| { | ||||
| 	if (isnan(x) || isnan(fn)) | ||||
| 	if (isunordered(x, fn)) | ||||
| 		return x*fn; | ||||
| 	if (!isfinite(fn)) { | ||||
| 		if (fn > 0.0) | ||||
|  |  | |||
|  | @ -38,7 +38,8 @@ asm(".include \"libc/disclaimer.inc\""); | |||
| 
 | ||||
| float scalbf(float x, float fn) | ||||
| { | ||||
| 	if (isnan(x) || isnan(fn)) return x*fn; | ||||
| 	if (isunordered(x, fn)) | ||||
| 		return x*fn; | ||||
| 	if (!isfinite(fn)) { | ||||
| 		if (fn > 0.0f) | ||||
| 			return x*fn; | ||||
|  |  | |||
|  | @ -3,7 +3,7 @@ | |||
| #include "libc/tinymath/internal.h" | ||||
| #if !(__ASSEMBLER__ + __LINKER__ + 0) | ||||
| COSMOPOLITAN_C_START_ | ||||
| /* clang-format off */ | ||||
| // clang-format off
 | ||||
| 
 | ||||
| /*
 | ||||
|  * Header for sinf, cosf and sincosf. | ||||
|  |  | |||
|  | @ -35,7 +35,7 @@ asm(".ident\t\"\\n\\n\ | |||
| Musl libc (MIT License)\\n\ | ||||
| Copyright 2005-2014 Rich Felker, et. al.\""); | ||||
| asm(".include \"libc/disclaimer.inc\""); | ||||
| /* clang-format off */ | ||||
| // clang-format off
 | ||||
| 
 | ||||
| /* origin: FreeBSD /usr/src/lib/msun/src/k_sinf.c */ | ||||
| /*
 | ||||
|  |  | |||
|  | @ -37,7 +37,7 @@ asm(".ident\t\"\\n\\n\ | |||
| Musl libc (MIT License)\\n\ | ||||
| Copyright 2005-2014 Rich Felker, et. al.\""); | ||||
| asm(".include \"libc/disclaimer.inc\""); | ||||
| /* clang-format off */ | ||||
| // clang-format off
 | ||||
| 
 | ||||
| /* origin: FreeBSD /usr/src/lib/msun/src/s_sinf.c */ | ||||
| /*
 | ||||
|  |  | |||
|  | @ -27,12 +27,13 @@ | |||
| ╚─────────────────────────────────────────────────────────────────────────────*/ | ||||
| #include "libc/math.h" | ||||
| #include "libc/tinymath/expo.internal.h" | ||||
| #include "libc/tinymath/freebsd.internal.h" | ||||
| 
 | ||||
| asm(".ident\t\"\\n\\n\
 | ||||
| Musl libc (MIT License)\\n\ | ||||
| Copyright 2005-2014 Rich Felker, et. al.\""); | ||||
| asm(".include \"libc/disclaimer.inc\""); | ||||
| /* clang-format off */ | ||||
| // clang-format off
 | ||||
| 
 | ||||
| /**
 | ||||
|  * Returns hyperbolic sine of 𝑥. | ||||
|  |  | |||
|  | @ -32,7 +32,7 @@ asm(".ident\t\"\\n\\n\ | |||
| Musl libc (MIT License)\\n\ | ||||
| Copyright 2005-2014 Rich Felker, et. al.\""); | ||||
| asm(".include \"libc/disclaimer.inc\""); | ||||
| /* clang-format off */ | ||||
| // clang-format off
 | ||||
| 
 | ||||
| /**
 | ||||
|  * Returns hyperbolic sine of 𝑥. | ||||
|  |  | |||
|  | @ -5,6 +5,13 @@ | |||
| │ FreeBSD lib/msun/src/e_sinhl.c                                               │ | ||||
| │ Converted to long double by Bruce D. Evans                                   │ | ||||
| │                                                                              │ | ||||
| │ Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.            │ | ||||
| │                                                                              │ | ||||
| │ Developed at SunPro, a Sun Microsystems, Inc. business.                      │ | ||||
| │ Permission to use, copy, modify, and distribute this                         │ | ||||
| │ software is freely granted, provided that this notice                        │ | ||||
| │ is preserved.                                                                │ | ||||
| │                                                                              │ | ||||
| │ Copyright (c) 1992-2023 The FreeBSD Project.                                 │ | ||||
| │                                                                              │ | ||||
| │ Redistribution and use in source and binary forms, with or without           │ | ||||
|  | @ -28,12 +35,6 @@ | |||
| │ OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF       │ | ||||
| │ SUCH DAMAGE.                                                                 │ | ||||
| │                                                                              │ | ||||
| │ Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.            │ | ||||
| │ Developed at SunPro, a Sun Microsystems, Inc. business.                      │ | ||||
| │ Permission to use, copy, modify, and distribute this                         │ | ||||
| │ software is freely granted, provided that this notice                        │ | ||||
| │ is preserved.                                                                │ | ||||
| │                                                                              │ | ||||
| ╚─────────────────────────────────────────────────────────────────────────────*/ | ||||
| #include "libc/intrin/likely.h" | ||||
| #include "libc/math.h" | ||||
|  |  | |||
|  | @ -36,7 +36,11 @@ Copyright 2005-2014 Rich Felker, et. al.\""); | |||
| asm(".include \"libc/disclaimer.inc\""); | ||||
| // clang-format off
 | ||||
| 
 | ||||
| long double sinl(long double x) { | ||||
| /**
 | ||||
|  * Returns sine of 𝑥. | ||||
|  */ | ||||
| long double sinl(long double x) | ||||
| { | ||||
| #if LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024 | ||||
| 	return sin(x); | ||||
| #elif (LDBL_MANT_DIG == 64 || LDBL_MANT_DIG == 113) && LDBL_MAX_EXP == 16384 | ||||
|  |  | |||
|  | @ -36,7 +36,7 @@ asm(".ident\t\"\\n\\n\ | |||
| Musl libc (MIT License)\\n\ | ||||
| Copyright 2005-2014 Rich Felker, et. al.\""); | ||||
| asm(".include \"libc/disclaimer.inc\""); | ||||
| /* clang-format off */ | ||||
| // clang-format off
 | ||||
| 
 | ||||
| /* origin: FreeBSD /usr/src/lib/msun/src/s_tan.c */ | ||||
| /*
 | ||||
|  |  | |||
|  | @ -5,6 +5,13 @@ | |||
| │ FreeBSD lib/msun/src/s_tanhf.c                                               │ | ||||
| │ Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com.     │ | ||||
| │                                                                              │ | ||||
| │ Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.            │ | ||||
| │                                                                              │ | ||||
| │ Developed at SunPro, a Sun Microsystems, Inc. business.                      │ | ||||
| │ Permission to use, copy, modify, and distribute this                         │ | ||||
| │ software is freely granted, provided that this notice                        │ | ||||
| │ is preserved.                                                                │ | ||||
| │                                                                              │ | ||||
| │ Copyright (c) 1992-2023 The FreeBSD Project.                                 │ | ||||
| │                                                                              │ | ||||
| │ Redistribution and use in source and binary forms, with or without           │ | ||||
|  | @ -28,12 +35,6 @@ | |||
| │ OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF       │ | ||||
| │ SUCH DAMAGE.                                                                 │ | ||||
| │                                                                              │ | ||||
| │ Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.            │ | ||||
| │ Developed at SunPro, a Sun Microsystems, Inc. business.                      │ | ||||
| │ Permission to use, copy, modify, and distribute this                         │ | ||||
| │ software is freely granted, provided that this notice                        │ | ||||
| │ is preserved.                                                                │ | ||||
| │                                                                              │ | ||||
| ╚─────────────────────────────────────────────────────────────────────────────*/ | ||||
| #include "libc/math.h" | ||||
| #include "libc/tinymath/freebsd.internal.h" | ||||
|  |  | |||
|  | @ -5,6 +5,13 @@ | |||
| │ FreeBSD lib/msun/src/s_tanhl.c                                               │ | ||||
| │ Converted to long double by Bruce D. Evans                                   │ | ||||
| │                                                                              │ | ||||
| │ Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.            │ | ||||
| │                                                                              │ | ||||
| │ Developed at SunPro, a Sun Microsystems, Inc. business.                      │ | ||||
| │ Permission to use, copy, modify, and distribute this                         │ | ||||
| │ software is freely granted, provided that this notice                        │ | ||||
| │ is preserved.                                                                │ | ||||
| │                                                                              │ | ||||
| │ Copyright (c) 1992-2023 The FreeBSD Project.                                 │ | ||||
| │                                                                              │ | ||||
| │ Redistribution and use in source and binary forms, with or without           │ | ||||
|  | @ -28,12 +35,6 @@ | |||
| │ OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF       │ | ||||
| │ SUCH DAMAGE.                                                                 │ | ||||
| │                                                                              │ | ||||
| │ Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.            │ | ||||
| │ Developed at SunPro, a Sun Microsystems, Inc. business.                      │ | ||||
| │ Permission to use, copy, modify, and distribute this                         │ | ||||
| │ software is freely granted, provided that this notice                        │ | ||||
| │ is preserved.                                                                │ | ||||
| │                                                                              │ | ||||
| ╚─────────────────────────────────────────────────────────────────────────────*/ | ||||
| #include "libc/intrin/likely.h" | ||||
| #include "libc/math.h" | ||||
|  |  | |||
|  | @ -31,7 +31,6 @@ | |||
| #include "libc/intrin/directmap.internal.h" | ||||
| #include "libc/intrin/extend.internal.h" | ||||
| #include "libc/intrin/weaken.h" | ||||
| #include "libc/nexgen32e/crc32.h" | ||||
| #include "libc/runtime/internal.h" | ||||
| #include "libc/runtime/memtrack.internal.h" | ||||
| #include "libc/sysv/consts/f.h" | ||||
|  | @ -159,11 +158,6 @@ static int __zipos_load(struct Zipos *zipos, size_t cf, unsigned flags, | |||
|   h->pos = 0; | ||||
|   h->cfile = cf; | ||||
|   h->size = size; | ||||
|   if (!IsTiny() && h->mem && | ||||
|       crc32_z(0, h->mem, h->size) != ZIP_LFILE_CRC32(zipos->map + lf)) { | ||||
|     h->mem = 0; | ||||
|     eio(); | ||||
|   } | ||||
|   if (h->mem) { | ||||
|     minfd = 3; | ||||
|     __fds_lock(); | ||||
|  |  | |||
|  | @ -61,6 +61,17 @@ TEST(memcmp, hug) { | |||
|   } | ||||
| } | ||||
| 
 | ||||
/**
 * Normalizes a three-way comparison result before the fuzz assertions
 * compare it.
 *
 * On AArch64 the value is collapsed to -1, 0, or +1. The original note
 * here says ARM's assembly comparison routine (it names strcmp; this
 * helper is applied to memcmp results) returns unpredictable but legal
 * values, so only the sign is compared on that architecture. On every
 * other architecture the value is passed through untouched.
 *
 * @param result raw return value of a memcmp-style comparison
 * @return sign of result on AArch64, otherwise result itself
 */
static int coerce(int result) {
#ifdef __aarch64__
  /* branch-free signum: yields +1, 0, or -1 */
  return (result > 0) - (result < 0);
#else
  return result;
#endif
}
| 
 | ||||
| TEST(memcmp, fuzz) { | ||||
|   int i, o, n, g; | ||||
|   char a[256], b[256]; | ||||
|  | @ -79,8 +90,18 @@ TEST(memcmp, fuzz) { | |||
|     } | ||||
|     o = rand() & 31; | ||||
|     n = rand() % (sizeof(a) - o); | ||||
|     g = golden(a + o, b + o, n); | ||||
|     ASSERT_EQ(g, memcmp(a + o, b + o, n), "n=%d o=%d", n, o); | ||||
|     g = coerce(golden(a + o, b + o, n)); | ||||
| #if 0 | ||||
|     if (memcmp(a + o, b + o, n) != g) { | ||||
|       kprintf("const size_t g = %d;\n", g); | ||||
|       kprintf("const size_t n = %d;\n", n); | ||||
|       kprintf("const char a[] = unbingstr(%#.*hhhs); /* %p */\n", n, a + o, | ||||
|               a + o); | ||||
|       kprintf("const char b[] = unbingstr(%#.*hhhs); /* %p */\n", n, b + o, | ||||
|               b + o); | ||||
|     } | ||||
| #endif | ||||
|     ASSERT_EQ(g, coerce(memcmp(a + o, b + o, n)), "n=%d o=%d", n, o); | ||||
|     ASSERT_EQ(!!g, !!bcmp(a + o, b + o, n), "n=%d o=%d", n, o); | ||||
|     ASSERT_EQ(!!g, !!timingsafe_bcmp(a + o, b + o, n), "n=%d o=%d", n, o); | ||||
|     ASSERT_EQ(MAX(-1, MIN(1, g)), timingsafe_memcmp(a + o, b + o, n), | ||||
|  |  | |||
|  | @ -190,9 +190,11 @@ BENCH(strchr, bench2) { | |||
|   char *strlen_(const char *) asm("strlen"); | ||||
|   char *rawmemchr_(const char *, int) asm("rawmemchr"); | ||||
|   EZBENCH2("strchr z", donothing, strchr_(kHyperion, 'z')); | ||||
|   EZBENCH2("rawmemchr z", donothing, rawmemchr_(kHyperion, 'z')); | ||||
|   EZBENCH2("memchr z", donothing, memchr_(kHyperion, 'z', kHyperionSize)); | ||||
|   EZBENCH2("strchr Z", donothing, strchr_(kHyperion, 'Z')); | ||||
|   EZBENCH2("memchr z", donothing, memchr_(kHyperion, 'z', kHyperionSize)); | ||||
|   EZBENCH2("memchr Z", donothing, memchr_(kHyperion, 'Z', kHyperionSize)); | ||||
|   EZBENCH2("rawmemchr z", donothing, rawmemchr_(kHyperion, 'z')); | ||||
|   EZBENCH2("rawmemchr Z", donothing, rawmemchr_(kHyperion, 'z')); | ||||
|   EZBENCH2("rawmemchr \\0", donothing, rawmemchr_(kHyperion, 0)); | ||||
|   EZBENCH2("strlen", donothing, strlen_(kHyperion)); | ||||
|   EZBENCH2("memchr Z", donothing, memchr_(kHyperion, 'Z', kHyperionSize)); | ||||
|  | @ -1,49 +0,0 @@ | |||
| /*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
 | ||||
| │vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8                                :vi│ | ||||
| ╞══════════════════════════════════════════════════════════════════════════════╡ | ||||
| │ Copyright 2020 Justine Alexandra Roberts Tunney                              │ | ||||
| │                                                                              │ | ||||
| │ Permission to use, copy, modify, and/or distribute this software for         │ | ||||
| │ any purpose with or without fee is hereby granted, provided that the         │ | ||||
| │ above copyright notice and this permission notice appear in all copies.      │ | ||||
| │                                                                              │ | ||||
| │ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │ | ||||
| │ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │ | ||||
| │ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │ | ||||
| │ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │ | ||||
| │ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │ | ||||
| │ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │ | ||||
| │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │ | ||||
| │ PERFORMANCE OF THIS SOFTWARE.                                                │ | ||||
| ╚─────────────────────────────────────────────────────────────────────────────*/ | ||||
| #include "libc/macros.internal.h" | ||||
| #include "libc/nexgen32e/crc32.h" | ||||
| #include "libc/nexgen32e/x86feature.h" | ||||
| #include "libc/str/str.h" | ||||
| #include "libc/testlib/ezbench.h" | ||||
| #include "libc/testlib/hyperion.h" | ||||
| #include "libc/testlib/testlib.h" | ||||
| #include "third_party/zlib/zlib.h" | ||||
| 
 | ||||
| TEST(crc32, testBigText) { | ||||
|   size_t size; | ||||
|   void *hyperion; | ||||
|   size = kHyperionSize; | ||||
|   hyperion = kHyperion; | ||||
|   EXPECT_EQ(0xe9ded8e6, crc32(0, hyperion, size)); | ||||
|   EXPECT_EQ(0xe9ded8e6, crc32_z(0, hyperion, size)); | ||||
|   if (X86_HAVE(PCLMUL)) { | ||||
|     size = ROUNDDOWN(size, 64); | ||||
|     EXPECT_EQ(0xc7adc04f, crc32(0, hyperion, size)); | ||||
|     EXPECT_EQ(0xc7adc04f, crc32_z(0, hyperion, size)); | ||||
|     EXPECT_EQ(0xc7adc04f, | ||||
|               0xffffffffu ^ crc32_pclmul(0 ^ 0xffffffffu, hyperion, size)); | ||||
|   } | ||||
| } | ||||
| 
 | ||||
| #define TESTSTR "libc/calls/typedef/sighandler_t.h" | ||||
| 
 | ||||
| BENCH(crc32c, bench) { | ||||
|   EZBENCH2("crc32c", donothing, | ||||
|            EXPROPRIATE(crc32c(0, VEIL("r", TESTSTR), sizeof(TESTSTR) - 1))); | ||||
| } | ||||
|  | @ -16,17 +16,18 @@ | |||
| │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │ | ||||
| │ PERFORMANCE OF THIS SOFTWARE.                                                │ | ||||
| ╚─────────────────────────────────────────────────────────────────────────────*/ | ||||
| #include "libc/intrin/bits.h" | ||||
| #include "libc/dce.h" | ||||
| #include "libc/intrin/bits.h" | ||||
| #include "libc/mem/gc.internal.h" | ||||
| #include "libc/mem/mem.h" | ||||
| #include "libc/nexgen32e/crc32.h" | ||||
| #include "libc/nexgen32e/x86feature.h" | ||||
| #include "libc/mem/gc.internal.h" | ||||
| #include "libc/stdio/stdio.h" | ||||
| #include "libc/str/str.h" | ||||
| #include "libc/testlib/ezbench.h" | ||||
| #include "libc/testlib/hyperion.h" | ||||
| #include "libc/testlib/testlib.h" | ||||
| #include "third_party/zlib/zlib.h" | ||||
| 
 | ||||
| #define FANATICS "Fanatics" | ||||
| 
 | ||||
|  |  | |||
|  | @ -17,18 +17,19 @@ | |||
| │ PERFORMANCE OF THIS SOFTWARE.                                                │ | ||||
| ╚─────────────────────────────────────────────────────────────────────────────*/ | ||||
| #include "libc/assert.h" | ||||
| #include "libc/intrin/bits.h" | ||||
| #include "libc/dce.h" | ||||
| #include "libc/intrin/asan.internal.h" | ||||
| #include "libc/intrin/bits.h" | ||||
| #include "libc/mem/gc.internal.h" | ||||
| #include "libc/mem/mem.h" | ||||
| #include "libc/nexgen32e/crc32.h" | ||||
| #include "libc/nexgen32e/x86feature.h" | ||||
| #include "libc/mem/gc.internal.h" | ||||
| #include "libc/stdio/stdio.h" | ||||
| #include "libc/str/str.h" | ||||
| #include "libc/testlib/ezbench.h" | ||||
| #include "libc/testlib/hyperion.h" | ||||
| #include "libc/testlib/testlib.h" | ||||
| #include "third_party/zlib/zlib.h" | ||||
| 
 | ||||
| #define FANATICS "Fanatics" | ||||
| 
 | ||||
|  |  | |||
|  | @ -15,15 +15,16 @@ | |||
| │ See the License for the specific language governing permissions and          │ | ||||
| │ limitations under the License.                                               │ | ||||
| ╚─────────────────────────────────────────────────────────────────────────────*/ | ||||
| #include "libc/str/highwayhash64.h" | ||||
| #include "libc/inttypes.h" | ||||
| #include "libc/nexgen32e/crc32.h" | ||||
| #include "libc/stdio/rand.h" | ||||
| #include "libc/stdio/stdio.h" | ||||
| #include "libc/str/highwayhash64.h" | ||||
| #include "libc/str/str.h" | ||||
| #include "libc/testlib/ezbench.h" | ||||
| #include "libc/testlib/hyperion.h" | ||||
| #include "libc/testlib/testlib.h" | ||||
| #include "third_party/zlib/zlib.h" | ||||
| 
 | ||||
| #define kMaxSize 64 | ||||
| 
 | ||||
|  |  | |||
|  | @ -472,8 +472,6 @@ TEST(wcscmp, testTwosComplementBane) { | |||
| TEST(wcsncmp, testTwosComplementBane) { | ||||
|   wchar_t *B1 = malloc(4); | ||||
|   wchar_t *B2 = malloc(4); | ||||
|   B1[1] = L'\0'; | ||||
|   B2[1] = L'\0'; | ||||
|   EXPECT_EQ(wcsncmp(memcpy(B1, "\x00\x00\x00\x80", 4), | ||||
|                     memcpy(B2, "\x00\x00\x00\x80", 4), 1), | ||||
|             0); | ||||
|  |  | |||
|  | @ -18,6 +18,7 @@ | |||
| ╚─────────────────────────────────────────────────────────────────────────────*/ | ||||
| #include "libc/math.h" | ||||
| #include "libc/mem/gc.h" | ||||
| #include "libc/testlib/ezbench.h" | ||||
| #include "libc/testlib/testlib.h" | ||||
| #include "libc/x/x.h" | ||||
| #include "libc/x/xasprintf.h" | ||||
|  | @ -51,3 +52,9 @@ TEST(asinhl, test) { | |||
|   EXPECT_STREQ("NAN", _gc(xdtoal(_asinhl(NAN)))); | ||||
|   EXPECT_STREQ("INFINITY", _gc(xdtoal(_asinhl(INFINITY)))); | ||||
| } | ||||
| 
 | ||||
// Benchmark entry for the asinh family: invokes EZBENCH2 once each for
// _asinh, _asinhf and _asinhl, always with the constant argument .7 and
// `donothing` as the middle macro argument (presumably the per-run setup
// step -- TODO confirm against libc/testlib/ezbench.h). The trailing "~Nns"
// notes look like the author's recorded timings; host and units are not
// stated here.
| BENCH(asinh, bench) { | ||||
|   EZBENCH2("asinh", donothing, _asinh(.7));    // ~26ns
 | ||||
|   EZBENCH2("asinhf", donothing, _asinhf(.7));  // ~17ns
 | ||||
|   EZBENCH2("asinhl", donothing, _asinhl(.7));  // ~48ns
 | ||||
| } | ||||
|  |  | |||
|  | @ -18,6 +18,7 @@ | |||
| ╚─────────────────────────────────────────────────────────────────────────────*/ | ||||
| #include "libc/math.h" | ||||
| #include "libc/mem/gc.internal.h" | ||||
| #include "libc/testlib/ezbench.h" | ||||
| #include "libc/testlib/testlib.h" | ||||
| #include "libc/x/x.h" | ||||
| 
 | ||||
|  | @ -60,3 +61,9 @@ TEST(sinhf, test) { | |||
|   EXPECT_STREQ("INFINITY", gc(xdtoaf(_sinhf(INFINITY)))); | ||||
|   EXPECT_STREQ("-INFINITY", gc(xdtoaf(_sinhf(-INFINITY)))); | ||||
| } | ||||
| 
 | ||||
// Benchmark entry for the sinh family: invokes EZBENCH2 once each for
// _sinh, _sinhf and _sinhl, always with the constant argument .7 and
// `donothing` as the middle macro argument (presumably the per-run setup
// step -- TODO confirm against libc/testlib/ezbench.h). The trailing "~Nns"
// notes look like the author's recorded timings; host and units are not
// stated here.
| BENCH(sinh, bench) { | ||||
|   EZBENCH2("sinh", donothing, _sinh(.7));    // ~24ns
 | ||||
|   EZBENCH2("sinhf", donothing, _sinhf(.7));  // ~19ns
 | ||||
|   EZBENCH2("sinhl", donothing, _sinhl(.7));  // ~15ns
 | ||||
| } | ||||
|  |  | |||
Some files were not shown because too many files have changed in this diff Show more
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue