/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8     -*-│
│ vi: set noet ft=asm ts=8 sw=8 fenc=utf-8                                 :vi │
╚──────────────────────────────────────────────────────────────────────────────╝
│                                                                              │
│  Optimized Routines                                                          │
│  Copyright (c) 2018-2024, Arm Limited.                                       │
│                                                                              │
│  Permission is hereby granted, free of charge, to any person obtaining       │
│  a copy of this software and associated documentation files (the             │
│  "Software"), to deal in the Software without restriction, including         │
│  without limitation the rights to use, copy, modify, merge, publish,         │
│  distribute, sublicense, and/or sell copies of the Software, and to          │
│  permit persons to whom the Software is furnished to do so, subject to       │
│  the following conditions:                                                   │
│                                                                              │
│  The above copyright notice and this permission notice shall be              │
│  included in all copies or substantial portions of the Software.             │
│                                                                              │
│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │
│  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │
│  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │
│  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │
│  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
│                                                                              │
╚─────────────────────────────────────────────────────────────────────────────*/
/* NOTE(review): ENTRY/END, L(), PTR_ARG and SIZE_ARG are not defined in this
   file; they are expected to come from asmdefs.h. */
#include "libc/intrin/aarch64/asmdefs.h"
/* NOTE(review): .yoink is not defined in this file; presumably it keeps the
   named license-notice symbol in the final link. Confirm against its
   definition. */
.yoink arm_optimized_routines_notice

/* The routine is assembled under the public name strnlen. */
#define __strnlen_aarch64 strnlen

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD.
 * MTE compatible.
 */

/* Register roles.

   srcin     x0      incoming string pointer (argument 0)
   cntin     x1      incoming byte limit (argument 1)
   result    x0      return value; shares x0 with srcin, so it is only
                     written once srcin is no longer read
   src       x2      address of the 16-byte aligned chunk being examined
   synd      x3      syndrome: bit mask of zero-byte matches in a chunk
   shift     x4      bit shift that discards mask bits for bytes before srcin
   tmp       x4      scratch; shares x4 with shift, which is dead by then
   cntrem    x5      bytes still allowed by the limit, kept biased by -1
   qdata     q0      128-bit view of the data register used by ldr
   vdata     v0      vector view of the same register
   vhas_chr  v1      per-byte compare result (all-ones where the byte is 0)
   vend      v2      compare result narrowed to 64 bits
   dend      d2      low 64 bits of vend, moved to synd with fmov  */
#define srcin		x0
#define cntin		x1
#define result		x0

#define src		x2
#define synd		x3
#define shift		x4
#define tmp		x4
#define cntrem		x5

#define qdata		q0
#define vdata		v0
#define vhas_chr	v1
#define vend		v2
#define dend		d2

/*
   Core algorithm:
   Process the string in 16-byte aligned chunks. Compute a 64-bit mask with
   four bits per byte using the shrn instruction. A count-trailing-zeros
   operation then identifies the first zero byte.  */

/* strnlen: length of the string at srcin, capped at cntin.

   In:    x0 (srcin)  pointer to the string
          x1 (cntin)  maximum number of bytes to examine
   Out:   x0 (result) min(index of the first zero byte, cntin)
   Uses:  x2-x5, v0-v2 and the condition flags; no stack.

   Every load is a whole 16-byte chunk at a 16-byte aligned address. The
   first load may therefore start before srcin, and the last one may extend
   past srcin + cntin within that same aligned chunk. No chunk lying
   entirely beyond the limit is loaded.  */
ENTRY (__strnlen_aarch64)
	PTR_ARG (0)
	SIZE_ARG (1)
	bic	src, srcin, 15			/* src = srcin rounded down to 16 */
	cbz	cntin, L(nomatch)		/* limit 0: return 0, no memory access */
	ld1	{vdata.16b}, [src]		/* aligned chunk that contains srcin */
	cmeq	vhas_chr.16b, vdata.16b, 0	/* all-ones in each lane holding 0 */
	/* Four mask bits per byte, so the bit offset of srcin inside the mask
	   is (srcin & 15) * 4. Only the low six bits of the register shift
	   amount are used by lsr, so srcin needs no explicit masking.  */
	lsl	shift, srcin, 2
	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
	fmov	synd, dend
	lsr	synd, synd, shift		/* drop bytes that precede srcin */
	cbz	synd, L(start_loop)		/* no zero byte in the first chunk */
	/* A zero byte was found in the first chunk.
	   NOTE(review): unlike L(end), the rbit here is not guarded by
	   __AARCH64EB__. Presumably this is because ld1 with a .16b arrangement
	   places bytes in memory order on either endianness, while ldr of a
	   q register does not. Confirm against the Arm ARM.  */
L(finish):
	rbit	synd, synd
	clz	synd, synd			/* rbit + clz = count trailing zeros */
	lsr	result, synd, 2			/* 4 mask bits per byte -> byte index */
	cmp	cntin, result
	csel	result, cntin, result, ls	/* result = min(cntin, result) */
	ret

	/* The limit was reached before any zero byte: return cntin.  */
L(nomatch):
	mov	result, cntin
	ret

L(start_loop):
	sub	tmp, src, srcin			/* tmp = -(srcin & 15) */
	add	tmp, tmp, 17			/* tmp = bytes already checked + 1 */
	subs	cntrem, cntin, tmp		/* cntrem = bytes left to check - 1 */
	b.lo	L(nomatch)			/* cntin <= bytes already checked */

	/* Make sure that it won't overread by a 16-byte chunk */
	/* The loop below loads two chunks per iteration but only tests the
	   remaining count after the second load. Bit 4 of cntrem says whether
	   an even or an odd number of chunks is still allowed, and selects the
	   entry point so that the counted load is always the one that may be
	   the last permitted chunk. When entering at L(loop32), src is moved
	   back by 16 so that its pre-indexed [src, 32]! load addresses the
	   chunk immediately after the first one.  */
	tbz	cntrem, 4, L(loop32_2)
	sub	src, src, 16
	.p2align 5
L(loop32):
	ldr	qdata, [src, 32]!		/* pre-index: src += 32, then load */
	cmeq	vhas_chr.16b, vdata.16b, 0
	/* Pairwise maximum: the low 64 bits are non-zero iff any lane matched.
	   This is only a yes/no test; the position is recomputed at L(end).  */
	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
	fmov	synd, dend
	cbnz	synd, L(end)			/* match; src already addresses this chunk */
L(loop32_2):
	ldr	qdata, [src, 16]		/* next chunk; src is not updated here */
	subs	cntrem, cntrem, 32		/* flags are consumed by b.lo below */
	cmeq	vhas_chr.16b, vdata.16b, 0	/* vector op, leaves the flags alone */
	b.lo	L(end_2)			/* this is the last permitted chunk */
	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
	fmov	synd, dend
	cbz	synd, L(loop32)
L(end_2):
	add	src, src, 16			/* src = address of the chunk just loaded */
	/* vhas_chr holds the compare result for the chunk at src. That chunk
	   either contains a zero byte or is the last one the limit permits.  */
L(end):
	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
	sub	result, src, srcin		/* bytes before this chunk; srcin is dead now */
	fmov	synd, dend
	/* Little-endian builds reverse the mask so that clz counts trailing
	   zeros. Big-endian builds skip the rbit and count leading zeros of
	   the mask as loaded.  */
#ifndef __AARCH64EB__
	rbit	synd, synd
#endif
	clz	synd, synd			/* 64 when the chunk has no zero byte */
	add	result, result, synd, lsr 2	/* + byte index within the chunk (16 if none) */
	cmp	cntin, result
	csel	result, cntin, result, ls	/* result = min(cntin, result) */
	ret

END (__strnlen_aarch64)