mirror of
https://github.com/jart/cosmopolitan.git
synced 2025-01-31 19:43:32 +00:00
161 lines
6.8 KiB
C
161 lines
6.8 KiB
C
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
|
||
│vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi│
|
||
╞══════════════════════════════════════════════════════════════════════════════╡
|
||
│ Copyright 2021 Justine Alexandra Roberts Tunney │
|
||
│ │
|
||
│ Permission to use, copy, modify, and/or distribute this software for │
|
||
│ any purpose with or without fee is hereby granted, provided that the │
|
||
│ above copyright notice and this permission notice appear in all copies. │
|
||
│ │
|
||
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │
|
||
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │
|
||
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │
|
||
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │
|
||
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │
|
||
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │
|
||
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
|
||
│ PERFORMANCE OF THIS SOFTWARE. │
|
||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||
#include "libc/intrin/likely.h"
|
||
#include "libc/dce.h"
|
||
#include "libc/intrin/asan.internal.h"
|
||
#include "libc/nexgen32e/x86feature.h"
|
||
#include "libc/str/str.h"
|
||
|
||
typedef uint64_t xmm_t __attribute__((__vector_size__(16), __aligned__(1)));
|
||
|
||
noasan static dontinline antiquity unsigned timingsafe_bcmp_sse(const char *p,
|
||
const char *q,
|
||
size_t n) {
|
||
uint64_t w;
|
||
xmm_t a = {0};
|
||
while (n > 16 + 16) {
|
||
a |= *(const xmm_t *)p ^ *(const xmm_t *)q;
|
||
p += 16;
|
||
q += 16;
|
||
n -= 16;
|
||
}
|
||
a |= *(const xmm_t *)p ^ *(const xmm_t *)q;
|
||
a |= *(const xmm_t *)(p + n - 16) ^ *(const xmm_t *)(q + n - 16);
|
||
w = a[0] | a[1];
|
||
return w | w >> 32;
|
||
}
|
||
|
||
noasan static microarchitecture("avx") int timingsafe_bcmp_avx(const char *p,
|
||
const char *q,
|
||
size_t n) {
|
||
uint64_t w;
|
||
xmm_t a = {0};
|
||
if (n > 32) {
|
||
if (n >= 16 + 64) {
|
||
xmm_t b = {0};
|
||
xmm_t c = {0};
|
||
xmm_t d = {0};
|
||
do {
|
||
a |= ((const xmm_t *)p)[0] ^ ((const xmm_t *)q)[0];
|
||
b |= ((const xmm_t *)p)[1] ^ ((const xmm_t *)q)[1];
|
||
c |= ((const xmm_t *)p)[2] ^ ((const xmm_t *)q)[2];
|
||
d |= ((const xmm_t *)p)[3] ^ ((const xmm_t *)q)[3];
|
||
p += 64;
|
||
q += 64;
|
||
n -= 64;
|
||
} while (n >= 16 + 64);
|
||
a = a | b | c | d;
|
||
}
|
||
while (n > 32) {
|
||
a |= *(const xmm_t *)p ^ *(const xmm_t *)q;
|
||
p += 16;
|
||
q += 16;
|
||
n -= 16;
|
||
}
|
||
}
|
||
a |= *(const xmm_t *)p ^ *(const xmm_t *)q;
|
||
a |= *(const xmm_t *)(p + n - 16) ^ *(const xmm_t *)(q + n - 16);
|
||
w = a[0] | a[1];
|
||
return w | w >> 32;
|
||
}
|
||
|
||
/**
|
||
* Tests inequality of first 𝑛 bytes of 𝑝 and 𝑞.
|
||
*
|
||
* The following expression:
|
||
*
|
||
* !!timingsafe_bcmp(p, q, n)
|
||
*
|
||
* Is functionally equivalent to:
|
||
*
|
||
* !!memcmp(p, q, n)
|
||
*
|
||
* This function is faster than memcmp() and bcmp() when byte sequences
|
||
* are assumed to always be the same; that makes it best for assertions
|
||
* or hash table lookups, assuming 𝑛 is variable (since no gcc builtin)
|
||
*
|
||
* timingsafe_bcmp n=0 992 picoseconds
|
||
* timingsafe_bcmp n=1 1 ns/byte 738 mb/s
|
||
* timingsafe_bcmp n=2 826 ps/byte 1,181 mb/s
|
||
* timingsafe_bcmp n=3 661 ps/byte 1,476 mb/s
|
||
* timingsafe_bcmp n=4 330 ps/byte 2,952 mb/s
|
||
* timingsafe_bcmp n=5 264 ps/byte 3,690 mb/s
|
||
* timingsafe_bcmp n=6 220 ps/byte 4,428 mb/s
|
||
* timingsafe_bcmp n=7 189 ps/byte 5,166 mb/s
|
||
* timingsafe_bcmp n=8 124 ps/byte 7,873 mb/s
|
||
* timingsafe_bcmp n=9 147 ps/byte 6,643 mb/s
|
||
* timingsafe_bcmp n=15 88 ps/byte 11,072 mb/s
|
||
* timingsafe_bcmp n=16 62 ps/byte 15,746 mb/s
|
||
* timingsafe_bcmp n=17 136 ps/byte 7,170 mb/s
|
||
* timingsafe_bcmp n=31 74 ps/byte 13,075 mb/s
|
||
* timingsafe_bcmp n=32 72 ps/byte 13,497 mb/s
|
||
* timingsafe_bcmp n=33 80 ps/byte 12,179 mb/s
|
||
* timingsafe_bcmp n=80 57 ps/byte 16,871 mb/s
|
||
* timingsafe_bcmp n=128 49 ps/byte 19,890 mb/s
|
||
* timingsafe_bcmp n=256 31 ps/byte 31,493 mb/s
|
||
* timingsafe_bcmp n=16384 14 ps/byte 67,941 mb/s
|
||
* timingsafe_bcmp n=32768 29 ps/byte 33,121 mb/s
|
||
* timingsafe_bcmp n=131072 29 ps/byte 32,949 mb/s
|
||
*
|
||
* Running time is independent of the byte sequences compared, making
|
||
* this safe to use for comparing secret values such as cryptographic
|
||
* MACs. In contrast, memcmp() may short-circuit after finding the first
|
||
* differing byte.
|
||
*
|
||
* @return nonzero if unequal, otherwise zero
|
||
* @see timingsafe_memcmp()
|
||
* @asyncsignalsafe
|
||
*/
|
||
int timingsafe_bcmp(const void *a, const void *b, size_t n) {
|
||
const char *p = a, *q = b;
|
||
uint32_t u, u0, u1, u2, u3;
|
||
uint64_t w, w0, w1, w2, w3;
|
||
if (!IsTiny()) {
|
||
if (n >= 8) {
|
||
if (n <= 16) {
|
||
__builtin_memcpy(&w0, p, 8);
|
||
__builtin_memcpy(&w1, q, 8);
|
||
__builtin_memcpy(&w2, p + n - 8, 8);
|
||
__builtin_memcpy(&w3, q + n - 8, 8);
|
||
w = (w0 ^ w1) | (w2 ^ w3);
|
||
return w | w >> 32;
|
||
} else {
|
||
if (IsAsan()) {
|
||
__asan_verify(a, n);
|
||
__asan_verify(b, n);
|
||
}
|
||
if (X86_HAVE(AVX)) {
|
||
return timingsafe_bcmp_avx(p, q, n);
|
||
} else {
|
||
return timingsafe_bcmp_sse(p, q, n);
|
||
}
|
||
}
|
||
} else if (n >= 4) {
|
||
__builtin_memcpy(&u0, p, 4);
|
||
__builtin_memcpy(&u1, q, 4);
|
||
__builtin_memcpy(&u2, p + n - 4, 4);
|
||
__builtin_memcpy(&u3, q + n - 4, 4);
|
||
return (u0 ^ u1) | (u2 ^ u3);
|
||
}
|
||
}
|
||
for (u = 0; n--;) {
|
||
u |= p[n] ^ q[n];
|
||
}
|
||
return u;
|
||
}
|