/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8     -*-│
│vi: set et ft=asm ts=8 sw=8 fenc=utf-8                                     :vi│
╞══════════════════════════════════════════════════════════════════════════════╡
│ Copyright 2020 Justine Alexandra Roberts Tunney                              │
│                                                                              │
│ Permission to use, copy, modify, and/or distribute this software for         │
│ any purpose with or without fee is hereby granted, provided that the         │
│ above copyright notice and this permission notice appear in all copies.      │
│                                                                              │
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
│ PERFORMANCE OF THIS SOFTWARE.                                                │
╚─────────────────────────────────────────────────────────────────────────────*/
#include "libc/nexgen32e/x86feature.h"
#include "libc/nexgen32e/macros.h"
#include "libc/macros.h"
.source	__FILE__

/	Returns pointer to first instance of character, or to the
/	terminating NUL byte when the character is absent.
/
/	This entry point loads R9 with all ones, so the result is left
/	unmasked when the scanner stops on the terminator, and then
/	joins the shared argument setup at local label 0 inside strchr.
/
/	@param	rdi is a non-null NUL-terminated string pointer
/	@param	esi is the search byte
/	@return	rax points to character, or to NUL byte if not found
/	@note	this won't return NULL if search character is NUL
strchrnul:
	.leafprologue
	.profilable
/	r9 = -1: keep the pointer when the NUL terminator is reached
	or	$-1,%r9
/	enter strchr past its prologue and past its clearing of r9
	jmp	0f
	.endfn	strchrnul,globl

/	Returns pointer to first instance of character, the BSD way.
/
/	This is an alias entry point: it executes a single NOP and then
/	falls through into strchr, which must directly follow it.
/
/	@param	rdi is a non-null NUL-terminated string pointer
/	@param	esi is the search byte
/	@return	rax points to first result, or NULL if not found
/	@note	this won't return NULL if search character is NUL
/	@note	the NOP presumably exists so this symbol has a body
/		and an address of its own (TODO confirm)
index:	nop
/	𝑠𝑙𝑖𝑑𝑒
	.endfn	index,globl

/	Finds the first occurrence of a byte in a NUL-terminated string.
/
/	Prepares a strsak call that looks for the search byte (DL) and
/	for the terminator (DH = 0), with an all-ones byte budget and
/	nothing subtracted from the result. R9 is cleared first so that
/	stopping on the terminator yields NULL. strchrnul enters at
/	local label 0 with its own R9 and shares everything after it.
/
/	@param	rdi is a non-null NUL-terminated string pointer
/	@param	esi is the search byte
/	@return	rax points to first result, or NULL if not found
/	@note	this won't return NULL if search character is NUL
/	@asyncsignalsafe
strchr:	.leafprologue
	.profilable
/	r9 = 0: stopping on the terminator gives NULL
	xor	%r9d,%r9d
/	r8 = 0: the result stays a pointer
0:	xor	%r8d,%r8d
/	dl = search byte, dh = 0 (must read sil before rsi is replaced)
	movzbl	%sil,%edx
/	rsi = -1: no practical length limit
	or	$-1,%rsi
	jmp	strsak
	.endfn	strchr,globl

/	Returns pointer to first instance of character, with no limit.
/
/	Loads RDX with all ones to serve as the byte count and then
/	falls through into memchr, which must directly follow it.
/
/	@param	rdi is a non-null pointer to memory
/	@param	esi is the search byte
/	@return	rax points to byte if found, or else undefined behavior
rawmemchr:
/	rdx = -1: byte count handed to memchr below
	or	$-1,%rdx
/	𝑠𝑙𝑖𝑑𝑒
	.endfn	rawmemchr,globl

/	Finds the first occurrence of a byte within a memory region.
/
/	Prepares a strsak call in which both search characters are the
/	requested byte, nothing is subtracted from the result, and
/	running out of bytes yields NULL. rawmemchr falls through into
/	this entry point with RDX preloaded.
/
/	@param	rdi is a non-null pointer to memory
/	@param	esi is the search byte
/	@param	rdx is length of memory in bytes
/	@return	rax points to byte if found or NULL
/	@asyncsignalsafe
memchr:	.leafprologue
	.profilable
/	r8 = 0: the result stays a pointer
	xor	%r8d,%r8d
/	r10 = 0: exhausting the byte count gives NULL
	xor	%r10d,%r10d
/	rsi = byte count, dl = search byte
	xchg	%rsi,%rdx
/	dh = dl: there is no second character to stop on
	mov	%dl,%dh
	jmp	strsak
	.endfn	memchr,globl

/	Returns length of NUL-terminated string w/ security blankets.
/
/	This is like strnlen() except it'll return 0 if (1) RDI is NULL
/	or (2) a NUL-terminator wasn't found in the first RSI bytes.
/
/	For a non-null pointer this entry point continues at local
/	label 0 inside strnlen, which must directly follow it.
/
/	@param	rdi is a nullable NUL-terminated string pointer
/	@param	rsi is the maximum number of bytes to consider
/	@return	rax is the number of bytes, excluding the NUL
strnlen_s:
	.leafprologue
	.profilable
/	rax = 0 is the answer for a null pointer
	xor	%eax,%eax
/	r10 = 0: the scanner yields zero if rsi bytes hold no NUL
	xor	%r10d,%r10d
	test	%rdi,%rdi
/	non-null: join strnlen past its prologue and its r10 setup
	jnz	0f
	.leafepilogue
	.endfn	strnlen_s,globl

/	Returns length of NUL-terminated memory, with limit.
/
/	Sets up the scanner arguments and falls through into strsak,
/	which must directly follow it. strnlen_s enters at local
/	label 0 with its own R10.
/
/	@param	rdi is non-null memory
/	@param	rsi is the maximum number of bytes to consider
/	@return	rax is the number of bytes, excluding the NUL, or
/		rsi if no NUL is found within the limit
/	@asyncsignalsafe
strnlen:.leafprologue
	.profilable
/	r10 = -1: when the limit is hit the count is kept as is
	or	$-1,%r10
/	dl = dh = 0: the only byte searched for is NUL
0:	xor	%edx,%edx
/	r8 = start pointer: the result becomes a length
	mov	%rdi,%r8
/	𝑠𝑙𝑖𝑑𝑒
	.endfn	strnlen,globl

/	Swiss army knife of string character scanning.
/	Fourteen fast functions in one.
/
/	Scans forward from RDI until a byte equal to DL or DH is seen,
/	or until RSI bytes have been examined. Bytes are checked one at
/	a time until the cursor is 32-byte aligned. From then on, for as
/	long as more than 32 bytes remain, whole aligned 32-byte blocks
/	are tested with avx2 when available, or with two 16-byte sse2
/	loads otherwise. The byte loop then pinpoints the match inside
/	the block, or finishes the tail.
/
/	@param	rdi is non-null string memory
/	@param	rsi is max number of bytes to consider
/	@param	dl is search character #1
/	@param	dh is search character #2
/	@param	r8 is subtracted from result (for length vs. pointer)
/	@param	r9 masks result if DH is found (for NUL vs. NULL)
/	@param	r10 masks result on bytes exhausted (for length v. NULL)
/	@return	rax end pointer after r8/r9/r10 modifications
/	@note	DL is tested before DH, so r9 is unused when DL equals DH
/	@note	clobbers rcx, rsi, r9, r11, xmm0-xmm4 and flags
/	@note	upper ymm halves are zeroed on every exit from the
/		avx2 loop, not only when a match is found
strsak:	lea	-1(%rdi),%rax
/	byte loop: advance the cursor and charge one byte to the budget
1:	add	$1,%rax
	sub	$1,%rsi
	jb	.Lend
/	cursor sits on a 32-byte boundary: try the vector loop
	test	$31,%al
	jz	.Lfast
.Lbyte:	mov	(%rax),%cl
	cmp	%cl,%dl
	je	.Ldone
	cmp	%cl,%dh
	je	.Lnul
	jmp	1b
/	DL matched: the result is never masked
.Ldone:	sub	%r8,%rax
	jmp	.Lret
/	budget exhausted: mask with r10 rather than r9
.Lend:	mov	%r10,%r9
/	DH matched, or fell through from the exhausted case
.Lnul:	sub	%r8,%rax
	and	%r9,%rax
.Lret:	.leafepilogue
/	avx2 loop ran out of budget: zero the upper ymm halves before
/	going back to scalar code, as the match exit already does.
/	only the avx2 loop may branch here, since this is a vex opcode
.Lvex:	vzeroupper
/	undo the speculative 32-byte charge and finish bytewise
.Lslow:	add	$32,%rsi
	jmp	.Lbyte
/	broadcast sources: xmm0 = DL, xmm1 = DH
.Lfast:	movzbl	%dl,%ecx
	movd	%ecx,%xmm0
	movzbl	%dh,%ecx
	movd	%ecx,%xmm1
/	bias the cursor, since both loops start by adding 32
	sub	$32,%rax
#if !X86_NEED(AVX2)
	testb	X86_HAVE(AVX2)+kCpuids(%rip)
	jz	.Lsse2
#endif
	vpbroadcastb %xmm0,%ymm0
	vpbroadcastb %xmm1,%ymm1
1:	add	$32,%rax
	sub	$32,%rsi
	jb	.Lvex
	vmovdqa	(%rax),%ymm2
	vpcmpeqb %ymm0,%ymm2,%ymm3
	vpcmpeqb %ymm1,%ymm2,%ymm2
	vpor	%ymm3,%ymm2,%ymm2
	vpmovmskb %ymm2,%ecx
/	zf is set when the mask is empty, meaning no match in this block
	bsf	%ecx,%ecx
	je	1b
	vzeroupper
/	rcx = offset of first match; the byte loop classifies it
2:	add	%rcx,%rax
	jmp	.Lbyte
#if !X86_NEED(AVX2)
.Lsse2:	pbroadcastb %xmm0
	pbroadcastb %xmm1
1:	add	$32,%rax
	sub	$32,%rsi
/	must not pass through the vzeroupper exit on this path
	jb	.Lslow
	movdqa	(%rax),%xmm2
	movdqa	16(%rax),%xmm3
/	high 16 bytes: match mask goes into bits 16-31 of ecx
	movdqa	%xmm3,%xmm4
	pcmpeqb	%xmm0,%xmm3
	pcmpeqb %xmm1,%xmm4
	por	%xmm4,%xmm3
	pmovmskb %xmm3,%ecx
	shl	$16,%ecx
/	low 16 bytes: match mask goes into bits 0-15
	movdqa	%xmm2,%xmm4
	pcmpeqb	%xmm0,%xmm2
	pcmpeqb %xmm1,%xmm4
	por	%xmm4,%xmm2
	pmovmskb %xmm2,%r11d
	or	%r11d,%ecx
	bsf	%ecx,%ecx
	je	1b
	jmp	2b
#endif
	.endfn	strsak,globl,hidden

/*	benchmarked on intel core i7-6700 @ 3.40GHz (skylake)
	includes function call overhead (unless marked otherwise)

	your strlen, &c (strsak+avx2) for #c per n where c ≈ 0.293ns
	N                     x1            x8           x64	mBps
	------------------------------------------------------------
	1                 47.000        36.375        35.141      99
	1                 35.000        34.625        36.234      96
	2                 31.500        18.812        18.992     184
	3                 19.667        13.042        13.182     265
	4                 30.750        10.281        10.285     339
	7                 15.857         8.946         7.551     462
	8                 12.125         9.203         7.119     490
	15                10.467         5.475         4.601     758
	16                 6.812         5.523         4.798     727
	31                 5.387         4.327         3.517     992
	32                 4.719         1.645         1.532    2278
	63                 5.000         2.403         2.034    1715
	64                 2.047         0.779         0.788    4427
	127                2.134         1.194         1.027    3399
	128                1.742         0.444         0.419    8327
	255                0.945         0.594         0.554    6295
	256                0.574         0.271         0.264   13226
	511                0.785         0.362         0.307   11384
	512                0.326         0.178         0.151   23134
	1023               0.288         0.242         0.185   18862
	1024               0.208         0.114         0.107   32565
	2047               0.235         0.127         0.123   28430
	2048               0.127         0.090         0.084   41413
	4095               0.119         0.106         0.099   35116
	4096               0.100         0.081         0.079   44372
	8191               0.092         0.082         0.081   43176
	8192               0.081         0.072         0.071   49419
	16383              0.076         0.072         0.071   48847
	16384              0.071         0.068         0.067   52381
	32767              0.072         0.069         0.068   51154
	32768              0.068         0.066         0.065   53409

	your tinystrlen()
	N                     x1            x8           x64	mBps
	------------------------------------------------------------
	1                 53.000        33.625        33.672      97
	1                 33.000        32.125        32.234     101
	2                 24.500        19.438        17.711     184
	3                 23.667        12.875        11.911     273
	4                 13.750         9.281         9.238     352
	7                 11.000         6.125         5.801     560
	8                  7.625         5.609         5.232     621
	15                11.800         3.825         3.364     966
	16                 4.562         3.648         3.173    1024 « optimal
	31                 3.710         2.851         2.298    1414
	32                 3.031         2.254         2.159    1506 « dropoff
	63                 2.683         1.827         1.691    1922
	64                 2.078         1.932         1.689    1924
	127                1.630         1.647         1.622    2004
	128                1.727         1.671         1.652    1968
	255                1.392         1.450         1.435    2265
	256                1.473         1.427         1.437    2262
	511                1.325         1.353         1.337    2431
	512                1.408         1.343         1.337    2431
	1023               1.289         1.281         1.287    2525
	1024               1.269         1.295         1.297    2506
	2047               1.269         1.274         1.269    2561
	2048               1.280         1.263         1.281    2538
	4095               1.262         1.270         1.266    2568
	4096               1.270         1.264         1.265    2570
	8191               1.253         1.254         1.254    2592
	8192               1.219         1.224         1.225    2653
	16383              1.225         1.222         1.220    2663
	16384              1.226         1.221         1.222    2659
	32767              1.227         1.224         1.223    2658
	32768              1.220         1.221         1.222    2659

	glibc strlen for #c per n where c ≈ 0.273ns
	N                     x1            x8           x64	mBps
	------------------------------------------------------------
	1               3497.000        53.125        42.641      82
	1                 69.000        44.875        42.547      82
	2                 45.500        24.188        21.852     160
	3                 23.000        15.625        14.557     240
	4                 22.250        11.406        10.637     328
	7                 10.143         6.768         6.230     560
	8                 11.125         5.797         5.486     636
	15                 5.800         3.142         2.859    1220
	16                 7.062         3.070         2.737    1275
	31                 2.806         1.585         1.407    2481
	32                 3.156         1.574         1.349    2587
	63                 2.016         0.895         0.691    5049
	64                 1.328         0.744         0.670    5207
	127                1.441         0.521         0.407    8577
	128                0.648         0.454         0.405    8619
	255                0.553         0.286         0.214   16277
	256                0.387         0.235         0.218   15984
	511                0.456         0.151         0.129   27077
	512                0.182         0.134         0.129   27117
	1023               0.171         0.106         0.082   42795
	1024               0.112         0.088         0.082   42741
	2047               0.099         0.069         0.059   59537
	2048               0.072         0.060         0.058   59925
	4095               0.065         0.053         0.047   74122
	4096               0.061         0.048         0.047   74478
	8191               0.048         0.045         0.044   79117
	8192               0.051         0.045         0.044   79181
	16383              0.042         0.040         0.061   57018
	16384              0.069         0.063         0.061   57245
	32767              0.081         0.073         0.068   51426
	32768              0.084         0.072         0.068   51285

	GCC strlen (-Os REPNZ SCASB) for #c per n where c ≈ 0.293ns
	N                     x1            x8           x64	mBps
	------------------------------------------------------------
	1                103.000        84.125        88.766      37
	1                 81.000        85.125        87.328      37
	2                 43.500        44.562        45.508      71
	3                 33.000        30.208        30.995     105
	4                 24.750        23.156        23.113     141
	7                 17.000        13.054        15.355     212
	8                 13.375        14.047        13.982     232
	15                 9.533         9.258        55.111      59
	16                 6.312         6.352         6.364     511
	31                 4.032         4.141         4.141     785
	32                 3.969         4.059         4.048     803
	63                 2.937         2.970         2.995    1086
	64                 2.922         2.939         2.956    1100
	127                2.386         2.408         2.403    1353
	128                2.383         2.403         2.401    1354
	255                2.129         2.118         2.124    1530
	256                2.137         2.133         2.130    1526
	511                1.982         1.986         3.351     970
	512                1.982         1.990         1.986    1637
	1023               1.915         1.916         2.587    1257
	1024               1.868         1.867         1.866    1742
	2047               1.835         1.833         1.832    1775
	2048               1.830         1.831         1.832    1775
	4095               1.814         1.814         1.815    1791
	4096               1.810         1.815         1.815    1791
	8191               1.805         1.807         1.806    1800
	8192               1.805         1.806         1.806    1800
	16383              1.803         1.756         1.756    1851
	16384              1.758         1.756         1.756    1851
	32767              1.756         1.754         1.754    1853
	32768              1.756         1.754         1.754    1853

	Intel Optimz. Manual (SSE4.2) for #c per n where c ≈ 0.273ns
	N                     x1            x8           x64	mBps
	------------------------------------------------------------
	1                 37.000        43.125        34.078     102
	1                 33.000        33.875        34.016     103
	2                 39.500        17.188        17.555     199
	3                 18.333        12.208        12.036     290
	4                 30.250         9.344         9.137     382
	7                 14.429         5.732         5.766     605
	8                  7.875         6.797         5.354     652
	15                10.733         5.825         3.516     993
	16                 3.812         2.383         2.325    1501
	31                 4.097         2.609         2.079    1678
	32                 3.031         1.395         1.349    2587
	63                 2.937         1.558         1.079    3235
	64                 2.016         0.893         0.690    5056
	127                1.929         0.721         0.607    5745
	128                0.617         0.483         0.428    8147
	255                1.275         0.404         0.411    8486
	256                0.480         0.319         0.299   11681
	511                0.479         0.307         0.288   12127
	512                0.322         0.244         0.232   15013
	1023               0.324         0.224         0.225   15512
	1024               0.245         0.240         0.223   15651
	2047               0.222         0.213         0.206   16938
	2048               0.204         0.194         0.192   18140
	4095               0.204         0.188         0.185   18888
	4096               0.183         0.179         0.179   19446
	8191               0.179         0.176         0.174   20000
	8192               0.174         0.172         0.171   20383
	16383              0.171         0.170         0.169   20604
	16384              0.169         0.169         0.168   20808
	32767              0.213         0.225         0.267   13064
	32768              0.231         0.215         0.220   15852

	musl libc strlen for #c per n where c ≈ 0.273ns
	N                     x1            x8           x64	mBps
	------------------------------------------------------------
	1                 65.000        36.125        37.984      92
	1                 39.000        37.625        37.422      93
	2                 41.500        21.938        20.695     169
	3                 22.333        17.625        15.859     220
	4                 21.250        13.656        12.105     288
	7                 22.143         9.018         7.609     459
	8                 31.125         7.234         7.346     475
	15                11.267         5.025         4.709     741
	16                 9.438         4.039         3.849     907
	31                 4.871         3.133         2.488    1402
	32                 5.219         2.246         2.039    1712
	63                 4.302         1.462         1.407    2479
	64                 2.109         1.428         1.155    3023
	127                1.551         1.078         0.879    3971
	128                1.742         0.903         0.760    4591
	255                0.922         0.558         0.605    5764
	256                0.934         0.575         0.537    6495
	511                0.550         0.493         0.455    7674
	512                0.646         0.490         0.426    8183
	1023               0.550         0.439         0.425    8203
	1024               0.472         0.421         0.408    8549
	2047               0.507         0.334         0.373    9360
	2048               0.403         0.426         0.409    8540
	4095               0.391         0.240         0.236   14799
	4096               0.238         0.222         0.221   15766
	8191               0.225         0.223         0.221   15779
	8192               0.225         0.214         0.215   16250
	16383              0.212         0.212         0.210   16595
	16384              0.209         0.210         0.211   16535
	32767              0.214         0.208         0.205   17001
	32768              0.207         0.207         0.291   12002

	newlib strlen for #c per n where c ≈ 0.273ns
	N                     x1            x8           x64	mBps
	------------------------------------------------------------
	1                 33.000        34.625        34.141     102
	1                 33.000        34.125        33.984     103
	2                 58.500        18.562        17.508     199
	3                 16.333        12.792        12.016     290
	4                 19.250         9.219         9.215     379
	7                 17.571         6.089         5.685     614
	8                 16.625         5.078         5.432     642
	15                 8.467         4.042         3.207    1088
	16                 3.938         2.773         2.733    1277
	31                 3.645         1.673         1.598    2183
	32                 3.281         1.527         1.493    2338
	63                 2.619         1.042         0.895    3901
	64                 1.422         0.928         0.813    4294
	127                0.984         0.718         0.561    6222
	128                1.195         0.591         0.532    6558
	255                0.600         0.404         0.397    8785
	256                0.621         0.429         0.376    9280
	511                0.346         0.311         0.306   11421
	512                0.420         0.308         0.296   11776
	1023               0.284         0.285         0.285   12237
	1024               0.321         0.282         0.280   12456
	2047               0.253         0.252         0.252   13864
	2048               0.260         0.249         0.249   14012
	4095               0.236         0.236         0.236   14811
	4096               0.239         0.235         0.234   14906
	8191               0.233         0.228         0.227   15371
	8192               0.230         0.227         0.227   15397
	16383              0.223         0.224         0.223   15638
	16384              0.223         0.224         0.223   15663
	32767              0.224         0.387         0.225   15527
	32768              0.223         0.222         0.222   15724

	Agner Fog's strlen (SSE2) for #c per n where c ≈ 0.273ns
	N                     x1            x8           x64	mBps
	------------------------------------------------------------
	1                 59.000        38.375        38.453      91
	1                 37.000        38.625        38.234      91
	2                 18.500        19.062        19.273     181
	3                 13.000        12.792        12.859     271
	4                  9.250         9.594         9.660     361
	7                  5.286         5.554         5.502     634
	8                  4.625         4.703         4.791     728
	15                 2.600         2.858         2.622    1331
	16                 2.438         2.414         2.421    1442
	31                 2.161         1.399         1.290    2706
	32                 1.219         1.262         1.250    2793
	63                 1.508         0.875         0.693    5038
	64                 0.641         0.654         0.655    5328
	127                1.205         0.406         0.379    9200
	128                0.367         0.372         0.369    9463
	255                0.467         0.310         0.235   14835
	256                0.230         0.232         0.232   15034
	511                0.272         0.181         0.159   21918
	512                0.174         0.161         0.158   22148
	1023               0.175         0.134         0.120   29043
	1024               0.140         0.122         0.120   29005
	2047               0.128         0.114         0.112   31205
	2048               0.130         0.113         0.112   31242
	4095               0.105         0.098         0.097   35984
	4096               0.105         0.098         0.097   35973
	8191               0.093         0.090         0.090   38953
	8192               0.094         0.090         0.090   38986
	16383              0.088         0.086         0.086   40648
	16384              0.088         0.086         0.086   40652
	32767              0.088         0.086         0.085   40956
	32768              0.087         0.085         0.085   41114 */