Improve performance of bitscanning intrinsics

This change helps spectre plan execution more intelligently by working
around false output dependencies, which impact ops like popcnt, bsr, and bsf.
This commit is contained in:
Justine Tunney 2021-06-15 06:24:46 -07:00
parent 29cb53881e
commit 87d7010495
13 changed files with 100 additions and 224 deletions

View file

@ -7,15 +7,15 @@ COSMOPOLITAN_C_START_
unsigned long popcnt(unsigned long) pureconst;
#if defined(__GNUC__) && !defined(__STRICT_ANSI__)
#define popcnt(X) \
(__builtin_constant_p(X) ? __builtin_popcountll(X) : ({ \
unsigned long Res, Pop = (X); \
if (X86_HAVE(POPCNT)) { \
asm("popcnt\t%1,%0" : "=r"(Res) : "r"(Pop) : "cc"); \
} else { \
Res = (popcnt)(Pop); \
} \
Res; \
/*
 * Counts the bits set in X.
 *
 * A compile-time constant argument is folded by __builtin_popcountll().
 * Otherwise X is copied into the local PoP and, when X86_HAVE(POPCNT) is
 * true, the POPCNT instruction overwrites PoP in place: the "+r"
 * constraint makes a single register both the source and the destination,
 * so the instruction has no separate output register to depend on.
 * When POPCNT is unavailable the out-of-line popcnt() function is called;
 * the parentheses in (popcnt) stop this macro from expanding recursively.
 */
#define popcnt(X) \
(__builtin_constant_p(X) ? __builtin_popcountll(X) : ({ \
unsigned long PoP = (X); \
if (X86_HAVE(POPCNT)) { \
asm("popcnt\t%0,%0" : "+r"(PoP) : /* no inputs */ : "cc"); \
} else { \
PoP = (popcnt)(PoP); \
} \
PoP; \
}))
#endif /* GNUC && !ANSI */

View file

@ -22,10 +22,20 @@ int bsfl(long);
int bsfll(long long);
int bsfmax(uintmax_t);
#ifdef __GNUC__
#define bsf(u) __builtin_ctz(u)
#define bsfl(u) __builtin_ctzl(u)
#define bsfll(u) __builtin_ctzll(u)
/*
 * Bit scan forward: yields the index of the lowest set bit in u.
 * The result is unspecified when u is zero.
 *
 * On x86-64 with GNU extensions enabled, the BSF instruction is issued
 * through inline asm. The "0" matching constraint loads the argument into
 * the same register that receives the result, so the instruction scans a
 * register in place instead of writing to an unrelated output register.
 * bsfll() forwards to bsfl(); that assumes long and long long are both
 * 64 bits wide, which is why this path is limited to x86-64.
 *
 * Every other GNU-compatible configuration (a different architecture, or
 * strict ANSI mode where asm and statement expressions are unavailable)
 * uses the compiler builtins rather than emitting x86 assembly.
 */
#if defined(__GNUC__) && defined(__x86_64__) && !defined(__STRICT_ANSI__)
#define bsf(u)                                                  \
  ({                                                            \
    unsigned BiTs;                                              \
    asm("bsf\t%0,%0" : "=r"(BiTs) : "0"((unsigned)(u)) : "cc"); \
    BiTs;                                                       \
  })
#define bsfl(u)                                                      \
  ({                                                                 \
    unsigned long BiTs;                                              \
    asm("bsf\t%0,%0" : "=r"(BiTs) : "0"((unsigned long)(u)) : "cc"); \
    (unsigned)BiTs;                                                  \
  })
#define bsfll(u) bsfl(u)
#elif defined(__GNUC__)
#define bsf(u)   __builtin_ctz(u)
#define bsfl(u)  __builtin_ctzl(u)
#define bsfll(u) __builtin_ctzll(u)
#endif
COSMOPOLITAN_C_END_

View file

@ -22,10 +22,20 @@ int bsrl(long);
int bsrll(long long);
int bsrmax(uintmax_t);
#ifdef __GNUC__
#define bsr(u) ((sizeof(int) * 8 - 1) ^ __builtin_clz(u))
#define bsrl(u) ((sizeof(long) * 8 - 1) ^ __builtin_clzl(u))
#define bsrll(u) ((sizeof(long long) * 8 - 1) ^ __builtin_clzll(u))
/*
 * Bit scan reverse: yields the index of the highest set bit in u.
 * The result is unspecified when u is zero.
 *
 * On x86-64 with GNU extensions enabled, the BSR instruction is issued
 * through inline asm. The "0" matching constraint loads the argument into
 * the same register that receives the result, so the instruction scans a
 * register in place instead of writing to an unrelated output register.
 * bsrll() forwards to bsrl(); that assumes long and long long are both
 * 64 bits wide on this target.
 *
 * Every other GNU-compatible configuration (a different architecture, or
 * strict ANSI mode where asm and statement expressions are unavailable)
 * derives the index from the count-leading-zeros builtins: for a nonzero
 * value, (width - 1) ^ clz equals the index of the highest set bit.
 */
#if defined(__GNUC__) && defined(__x86_64__) && !defined(__STRICT_ANSI__)
#define bsr(u)                                                  \
  ({                                                            \
    unsigned BiTs;                                              \
    asm("bsr\t%0,%0" : "=r"(BiTs) : "0"((unsigned)(u)) : "cc"); \
    BiTs;                                                       \
  })
#define bsrl(u)                                                      \
  ({                                                                 \
    unsigned long BiTs;                                              \
    asm("bsr\t%0,%0" : "=r"(BiTs) : "0"((unsigned long)(u)) : "cc"); \
    (unsigned)BiTs;                                                  \
  })
#define bsrll(u) bsrl(u)
#elif defined(__GNUC__)
#define bsr(u)   ((int)(sizeof(int) * 8 - 1) ^ __builtin_clz(u))
#define bsrl(u)  ((int)(sizeof(long) * 8 - 1) ^ __builtin_clzl(u))
#define bsrll(u) ((int)(sizeof(long long) * 8 - 1) ^ __builtin_clzll(u))
#endif
COSMOPOLITAN_C_END_

View file

@ -1,45 +0,0 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Copyright 2020 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/macros.internal.h"
// Counts leading zero bits in 𝑥.
//
// uint32 𝑥 bsf(𝑥) tzcnt(𝑥) ffs(𝑥) bsr(𝑥) lzcnt(𝑥)
// 0x00000000 wut 32 0 wut 32
// 0x00000001 0 0 1 0 31
// 0x80000001 0 0 1 31 0
// 0x80000000 31 31 32 31 0
// 0x00000010 4 4 5 4 27
// 0x08000010 4 4 5 27 4
// 0x08000000 27 27 28 27 4
// 0xffffffff 0 0 1 31 0
//
// @param edi is 32-bit unsigned 𝑥 value
// @return eax number in range [0,32) or 32 if 𝑥 is 0
// @see also treasure trove of nearly identical functions
lzcnt: .leafprologue
.profilable
// eax = 31, the highest bit index of a 32-bit word
mov $31,%eax
// edx = -1, the stand-in bit index used when 𝑥 is zero
mov $-1,%edx
// ecx = index of the highest set bit; ZF is set when 𝑥 is zero
bsr %edi,%ecx
// for zero input, replace the undefined bsr result with -1
cmovz %edx,%ecx
// eax = 31 - index, which is 31 - (-1) = 32 for zero input
sub %ecx,%eax
.leafepilogue
.endfn lzcnt,globl
.source __FILE__

View file

@ -1,25 +0,0 @@
#ifndef COSMOPOLITAN_LIBC_NEXGEN32E_LZCNT_H_
#define COSMOPOLITAN_LIBC_NEXGEN32E_LZCNT_H_
#include "libc/nexgen32e/x86feature.h"
#if !(__ASSEMBLER__ + __LINKER__ + 0)
COSMOPOLITAN_C_START_
/* Out-of-line implementations, used when the LZCNT instruction is absent. */
unsigned lzcnt(unsigned) libcesque pureconst;
unsigned long lzcntl(unsigned long) libcesque pureconst;
/*
 * Counts leading zero bits in X; the result has the same type as X.
 *
 * When X86_HAVE(ABM) is true the LZCNT instruction is used, taking X from
 * either a register or memory. Otherwise the out-of-line lzcnt() function
 * is called; the parentheses in (lzcnt) stop this macro from expanding
 * recursively.
 *
 * NOTE(review): the fallback always calls the `unsigned` lzcnt(), so a
 * wider X presumably gets truncated on that path; confirm that callers
 * with unsigned long values use lzcntl() instead.
 */
#define lzcnt(X) \
({ \
typeof(X) Res; \
if (X86_HAVE(ABM)) { \
/* amd piledriver+ (a.k.a. bdver1) c. 2011 */ \
/* intel haswell+ c. 2013 */ \
asm("lzcnt\t%1,%0" : "=r,r"(Res) : "r,m"(X) : "cc"); \
} else { \
Res = (lzcnt)(X); \
} \
Res; \
})
COSMOPOLITAN_C_END_
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
#endif /* COSMOPOLITAN_LIBC_NEXGEN32E_LZCNT_H_ */

View file

@ -1,45 +0,0 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Copyright 2020 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/macros.internal.h"
// Counts leading zero bits in 64-bit 𝑥.
//
// The table shows how the related 32-bit operations behave; this
// routine is the 64-bit analogue, i.e. 63 - bsr(𝑥), or 64 for zero.
//
// uint32 𝑥 bsf(𝑥) tzcnt(𝑥) ffs(𝑥) bsr(𝑥) lzcnt(𝑥)
// 0x00000000 wut 32 0 wut 32
// 0x00000001 0 0 1 0 31
// 0x80000001 0 0 1 31 0
// 0x80000000 31 31 32 31 0
// 0x00000010 4 4 5 4 27
// 0x08000010 4 4 5 27 4
// 0x08000000 27 27 28 27 4
// 0xffffffff 0 0 1 31 0
//
// @param rdi is 64-bit unsigned 𝑥 value
// @return rax number in range [0,64) or 64 if 𝑥 is 0
// @see also treasure trove of nearly identical functions
lzcntl:	.leafprologue
	.profilable
//	eax = 63, the highest bit index of a 64-bit word
	mov	$63,%eax
//	edx = -1, the stand-in bit index used when 𝑥 is zero
	mov	$-1,%edx
//	rcx = index of the highest set bit; ZF is set when 𝑥 is zero
	bsr	%rdi,%rcx
//	for zero input, replace the undefined bsr result with -1
	cmovz	%edx,%ecx
//	eax = 63 - index, which is 63 - (-1) = 64 for zero input
	sub	%ecx,%eax
//	matches the sibling routines, which all close with this epilogue
	.leafepilogue
	.endfn	lzcntl,globl
	.alias	lzcntl,lzcntll
	.source	__FILE__

View file

@ -1,25 +0,0 @@
#ifndef COSMOPOLITAN_LIBC_NEXGEN32E_TZCNT_H_
#define COSMOPOLITAN_LIBC_NEXGEN32E_TZCNT_H_
#include "libc/nexgen32e/x86feature.h"
#if !(__ASSEMBLER__ + __LINKER__ + 0)
COSMOPOLITAN_C_START_
/* Out-of-line implementations, used when the TZCNT instruction is absent. */
unsigned tzcnt(unsigned) libcesque pureconst;
unsigned long tzcntl(unsigned long) libcesque pureconst;
/*
 * Counts trailing zero bits in X; the result has the same type as X.
 *
 * When X86_HAVE(BMI) is true the TZCNT instruction is used, taking X from
 * either a register or memory. Otherwise the out-of-line tzcnt() function
 * is called; the parentheses in (tzcnt) stop this macro from expanding
 * recursively.
 *
 * NOTE(review): the fallback always calls the `unsigned` tzcnt(), so a
 * wider X presumably gets truncated on that path; confirm that callers
 * with unsigned long values use tzcntl() instead.
 */
#define tzcnt(X) \
({ \
typeof(X) Res; \
if (X86_HAVE(BMI)) { \
/* amd piledriver+ (a.k.a. bdver1) c. 2011 */ \
/* intel haswell+ c. 2013 */ \
asm("tzcnt\t%1,%0" : "=r,r"(Res) : "r,m"(X) : "cc"); \
} else { \
Res = (tzcnt)(X); \
} \
Res; \
})
COSMOPOLITAN_C_END_
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
#endif /* COSMOPOLITAN_LIBC_NEXGEN32E_TZCNT_H_ */

View file

@ -1,44 +0,0 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Copyright 2020 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/macros.internal.h"
// Finds lowest set bit in 64-bit 𝑥, i.e. counts its trailing zero bits.
//
// The table shows how the related 32-bit operations behave.
//
// uint32 𝑥 bsf(𝑥) tzcnt(𝑥) ffs(𝑥) bsr(𝑥) lzcnt(𝑥)
// 0x00000000 wut 32 0 wut 32
// 0x00000001 0 0 1 0 31
// 0x80000001 0 0 1 31 0
// 0x80000000 31 31 32 31 0
// 0x00000010 4 4 5 4 27
// 0x08000010 4 4 5 27 4
// 0x08000000 27 27 28 27 4
// 0xffffffff 0 0 1 31 0
//
// @param rdi is 64-bit unsigned 𝑥 value
// @return rax number in range [0,64) or 64 if 𝑥 is 0
// @see also treasure trove of nearly identical functions
tzcntl: .leafprologue
.profilable
// esi = 64, the result reported for zero input
mov $64,%esi
// rax = index of the lowest set bit; ZF is set when 𝑥 is zero
bsf %rdi,%rax
// for zero input, replace the undefined bsf result with 64
cmovz %esi,%eax
.leafepilogue
.endfn tzcntl,globl
.alias tzcntl,tzcntll
.source __FILE__

View file

@ -18,26 +18,16 @@
*/
#include "libc/macros.internal.h"
.source __FILE__
.rodata
// Finds lowest set bit in 𝑥, i.e. counts its trailing zero bits.
//
// uint32 𝑥 bsf(𝑥) tzcnt(𝑥) ffs(𝑥) bsr(𝑥) lzcnt(𝑥)
// 0x00000000 wut 32 0 wut 32
// 0x00000001 0 0 1 0 31
// 0x80000001 0 0 1 31 0
// 0x80000000 31 31 32 31 0
// 0x00000010 4 4 5 4 27
// 0x08000010 4 4 5 27 4
// 0x08000000 27 27 28 27 4
// 0xffffffff 0 0 1 31 0
//
// @param edi is 32-bit unsigned 𝑥 value
// @return eax number in range [0,32) or 32 if 𝑥 is 0
// @see also treasure trove of nearly identical functions
tzcnt: .leafprologue
.profilable
// esi = 32, the result reported for zero input
mov $32,%esi
// eax = index of the lowest set bit; ZF is set when 𝑥 is zero
bsf %edi,%eax
// for zero input, replace the undefined bsf result with 32
cmovz %esi,%eax
.leafepilogue
.endfn tzcnt,globl
// Nontrivial NUL-terminated string test vector.
//
// kBlocktronics holds the raw bytes of libc/testlib/blocktronics.txt
// (between local labels 0 and 1) followed by a single NUL byte.
.align 1
kBlocktronics:
0: .incbin "libc/testlib/blocktronics.txt"
1: .byte 0
.endobj kBlocktronics,globl
// kBlocktronicsSize is a quadword holding the distance from label 0 to
// label 1, i.e. the length of the included file without the trailing NUL.
.align 8
kBlocktronicsSize:
.quad 1b-0b
.endobj kBlocktronicsSize,globl

View file

@ -0,0 +1,12 @@
#ifndef COSMOPOLITAN_LIBC_TESTLIB_BLOCKTRONICS_H_
#define COSMOPOLITAN_LIBC_TESTLIB_BLOCKTRONICS_H_
#if !(__ASSEMBLER__ + __LINKER__ + 0)
COSMOPOLITAN_C_START_

/* Blocktronics test-vector text and its size object. */
extern char kBlocktronics[];
extern size_t kBlocktronicsSize;

/* NOTE(review): presumably a compressed copy of the same text; confirm
   where this symbol is defined. */
extern uint8_t kBlocktronicsZip[];

COSMOPOLITAN_C_END_
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
#endif /* COSMOPOLITAN_LIBC_TESTLIB_BLOCKTRONICS_H_ */

View file

@ -0,0 +1,32 @@
░▄██▒▄█ ▐██ ░░░ ▀▀████▒▀█▄
▐███▓██░ ██▌ ▀████▄■█▄
▐█▓███▀█░██▀ ░ ░▀█████▓▄
▐█▓██▀▄█▒██▀ ▄▄░ ▄▄▄ ░░░ ░▀████▒▄
▐████▀▄█■█▀ ▀▀ ░█████░
▐█▓█▀████▀ ░ ▐▓███▒
█░███▀▀ ░░░ ▄█ ░░░ █████
▐█▓█░▀▀ ░░▄█▄▄▄▄▄ ▀▄ ▌▄▄▄░▄▄▄▄▄ ▐████░
▐███▌ ▄▀█████████▄ ▌▐▄████████▄ ▐▓███░
▐███░░░▀▄█▀▄▄████▄▀░ ▐████████▒ ▀ ░███░
░████░ ▓▀ ▄███████▀▌ ▀▄■████▀▀█▀ ██▀█
▓███░ ░▄▀▀░░░ ▀ ░░▌ ▄▀▀▄░░▀░▄▀▄ ▐██▀▄
░███░ ▄▓▓▄▄░▀▀█▀█ ▌░░ ▀█▀█▀▀ ▐██▀
█▀▄▐██ ▀░░ ▄▀ ▐ █ ▀ ▄▄▄░ ░▀▄█▄▀█
▌▄ █▓ ▒ ░ █▄█▄▀▄▄▄███▄▀▄ ░░ ░ ▀ █▌
█▌▄░▌ ░░░▄▀█▀███████▄▀▄▀▄▀▀▄▄▄ █▀█░▐
██▄ ░░░▄█▄▀██▄█■██████▄█▄█▄■▀█░ ▐░▐
▀██░ ░▄██████████████████▄█▄█ ░█ ░ ▄▀
▀▓█▄▓░░ ▒█▀█████████████████████▒ ██▀
▀███ ▓▒ ██████████████▀▀▀▀█▄▀ ░▄█▒
▀███ ▀█▄▀▄█████▀▀ ▓▓▓▄░ ▐ ░▄██
▀██ ▄███████▄████████▀░░ ░▄██
▄██▀▀▄ █▄▀▄██▒▒███████████▀▀▀▄░ ░███░
▄██▀▄▄░░▀▐▄████▄ █████▀▄░░█▀▄▀░░ ▄██░
█████▄▄▄███▀░█▌██▄▀▀█████▄▄░░░▄▄███▀██▄ ▄▀▀▀▄▄
▀██████▀■▄█▄▄ ░▀███████████████▓▓░░▄██▀▄████▄▄▀▄
█▀█ █ █▀█ █▀█ █▄▀ ▐▀█▀▌█▀█ █▀█ █▄ █ ▀█▀ █▀█ █▀▀
█▀▄ █ █ █ █ █ ▀▄ █ █▀▄ █ █ █ ▀█ █ █ ▀▀█
█▄█ █▄▌█▄█ █▄█ █ █ █ █ █ █▄█ █ █ ▄█▄ █▄█ █▄█
THERE WILL BE BLOCKS march 01 2017

View file

@ -13,11 +13,13 @@ LIBC_TESTLIB_A = o/$(MODE)/libc/testlib/testlib.a
LIBC_TESTLIB_A_CHECKS = $(LIBC_TESTLIB_A).pkg
LIBC_TESTLIB_A_ASSETS = \
libc/testlib/hyperion.txt
libc/testlib/hyperion.txt \
libc/testlib/blocktronics.txt
LIBC_TESTLIB_A_HDRS = \
libc/testlib/bench.h \
libc/testlib/ezbench.h \
libc/testlib/blocktronics.h \
libc/testlib/hyperion.h \
libc/testlib/testlib.h
@ -25,6 +27,7 @@ LIBC_TESTLIB_A_SRCS_S = \
libc/testlib/bench.S \
libc/testlib/combo.S \
libc/testlib/fixture.S \
libc/testlib/blocktronics.S \
libc/testlib/hyperion.S \
libc/testlib/testcase.S \
libc/testlib/thrashcodecache.S \

View file

@ -16,6 +16,7 @@
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/testlib/blocktronics.h"
#include "libc/testlib/ezbench.h"
#include "libc/testlib/hyperion.h"
#include "libc/testlib/testlib.h"
@ -48,4 +49,6 @@ BENCH(HasControlCodes, bench) {
EZBENCH2("HasControlCodes small", donothing, HasControlCodes("hello", -1, 0));
EZBENCH2("HasControlCodes big", donothing,
HasControlCodes(kHyperion, kHyperionSize, kControlC1));
EZBENCH2("HasControlCodes unicode", donothing,
HasControlCodes(kBlocktronics, kBlocktronicsSize, kControlC1));
}