Random number generator updates for Linux 6.11-rc1.

-----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCAAdFiEEq5lC5tSkz8NBJiCnSfxwEqXeA64FAmaarzgACgkQSfxwEqXe
 A66ZWBAAlhXx8bve0uKlDRK8fffWHgruho/fOY4lZJ137AKwA9JCtmOyqdfL4Dmk
 VxFe7pEQJlQhcA/6kH54uO7SBXwfKlKZJth6SYnaCRMUIbFifHjjIQ0QqldjEKi0
 rP90Hu4FVsbwQC7u9i9lQj9n2P36zb6pn83BzpZQ/2PtoVCSCrdSJUe0Rxa3H3GN
 0+nNkDSXQt5otCByLaeE3x7KJgXLWL9+G2eFSFLTZ8rSVfMx1CdOIAG37WlLGdWm
 BaFYPDKMyBTVvVJBNgAe9YSqtrsZ5nlmLz+Z9wAe/hTL7RlL03kWUu34/Udcpull
 zzMDH0WMntiGK3eFQ2gOYSWqypvAjwHgn3BzqNmjUb69+89mZsdU1slcvnxWsUwU
 D3vphrscaqarF629tfsXti3jc5PoXwUTjROZVcCyeFPBhyAZgzK8xUvPpJO+RT+K
 EuUABob9cpA6FCpW/QeolDmMDhXlNT8QgsZu1juokZac2xP3Ly3REyEvT7HLbU2W
 ZJjbEqm1ppp3RmGELUOJbyhwsLrnbt+OMDO7iEWoG8aSFK4diBK/ZM6WvLMkr8Oi
 7ioXGIsYkCy3c47wpZKTrAapOPJp5keqNAiHSEbXw8mozp6429QAEZxNOcczgHKC
 Ea2JzRkctqutcIT+Slw/uUe//i1iSsIHXbE81fp5udcQTJcUByo=
 =P8aI
 -----END PGP SIGNATURE-----

Merge tag 'random-6.11-rc1-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/crng/random

Pull random number generator updates from Jason Donenfeld:
 "This adds getrandom() support to the vDSO.

  First, it adds a new kind of mapping to mmap(2), MAP_DROPPABLE, which
  lets the kernel zero out pages anytime under memory pressure, which
  enables allocating memory that never gets swapped to disk but also
  doesn't count as being mlocked.

  Then, the vDSO implementation of getrandom() is introduced in a
  generic manner and hooked into random.c.

  Next, this is implemented on x86. (Also, though it's not ready for
  this pull, somebody has begun an arm64 implementation already)

  Finally, two vDSO selftests are added.

  There are also two housekeeping cleanup commits"

* tag 'random-6.11-rc1-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/crng/random:
  MAINTAINERS: add random.h headers to RNG subsection
  random: note that RNDGETPOOL was removed in 2.6.9-rc2
  selftests/vDSO: add tests for vgetrandom
  x86: vdso: Wire up getrandom() vDSO implementation
  random: introduce generic vDSO getrandom() implementation
  mm: add MAP_DROPPABLE for designating always lazily freeable mappings
This commit is contained in:
Linus Torvalds 2024-07-24 10:29:50 -07:00
commit 7a3fad30fd
37 changed files with 1121 additions and 18 deletions

View file

@ -19057,7 +19057,13 @@ S: Maintained
T: git https://git.kernel.org/pub/scm/linux/kernel/git/crng/random.git
F: Documentation/devicetree/bindings/rng/microsoft,vmgenid.yaml
F: drivers/char/random.c
F: include/linux/random.h
F: include/uapi/linux/random.h
F: drivers/virt/vmgenid.c
F: include/vdso/getrandom.h
F: lib/vdso/getrandom.c
F: arch/x86/entry/vdso/vgetrandom*
F: arch/x86/include/asm/vdso/getrandom*
RAPIDIO SUBSYSTEM
M: Matt Porter <mporter@kernel.crashing.org>

View file

@ -287,6 +287,7 @@ config X86
select HAVE_UNSTABLE_SCHED_CLOCK
select HAVE_USER_RETURN_NOTIFIER
select HAVE_GENERIC_VDSO
select VDSO_GETRANDOM if X86_64
select HOTPLUG_PARALLEL if SMP && X86_64
select HOTPLUG_SMT if SMP
select HOTPLUG_SPLIT_STARTUP if SMP && X86_32

View file

@ -7,7 +7,7 @@
include $(srctree)/lib/vdso/Makefile
# Files to link into the vDSO:
vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o
vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o vgetrandom.o vgetrandom-chacha.o
vobjs32-y := vdso32/note.o vdso32/system_call.o vdso32/sigreturn.o
vobjs32-y += vdso32/vclock_gettime.o vdso32/vgetcpu.o
vobjs-$(CONFIG_X86_SGX) += vsgx.o
@ -73,6 +73,7 @@ CFLAGS_REMOVE_vdso32/vclock_gettime.o = -pg
CFLAGS_REMOVE_vgetcpu.o = -pg
CFLAGS_REMOVE_vdso32/vgetcpu.o = -pg
CFLAGS_REMOVE_vsgx.o = -pg
CFLAGS_REMOVE_vgetrandom.o = -pg
#
# X32 processes use x32 vDSO to access 64bit kernel data.

View file

@ -30,6 +30,8 @@ VERSION {
#ifdef CONFIG_X86_SGX
__vdso_sgx_enter_enclave;
#endif
getrandom;
__vdso_getrandom;
local: *;
};
}

View file

@ -0,0 +1,178 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2022-2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
*/
#include <linux/linkage.h>
#include <asm/frame.h>
.section .rodata, "a"
.align 16
CONSTANTS: .octa 0x6b20657479622d323320646e61707865
.text
/*
* Very basic SSE2 implementation of ChaCha20. Produces a given positive number
* of blocks of output with a nonce of 0, taking an input key and 8-byte
* counter. Importantly does not spill to the stack. Its arguments are:
*
* rdi: output bytes
* rsi: 32-byte key input
* rdx: 8-byte counter input/output
* rcx: number of 64-byte blocks to write to output
*/
SYM_FUNC_START(__arch_chacha20_blocks_nostack)
.set output, %rdi
.set key, %rsi
.set counter, %rdx
.set nblocks, %rcx
.set i, %al
/* xmm registers are *not* callee-save. */
.set temp, %xmm0
.set state0, %xmm1
.set state1, %xmm2
.set state2, %xmm3
.set state3, %xmm4
.set copy0, %xmm5
.set copy1, %xmm6
.set copy2, %xmm7
.set copy3, %xmm8
.set one, %xmm9
/* copy0 = "expand 32-byte k" */
movaps CONSTANTS(%rip),copy0
/* copy1,copy2 = key */
movups 0x00(key),copy1
movups 0x10(key),copy2
/* copy3 = counter || zero nonce */
movq 0x00(counter),copy3
/* one = 1 || 0 */
movq $1,%rax
movq %rax,one
.Lblock:
/* state0,state1,state2,state3 = copy0,copy1,copy2,copy3 */
movdqa copy0,state0
movdqa copy1,state1
movdqa copy2,state2
movdqa copy3,state3
movb $10,i
.Lpermute:
/* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
paddd state1,state0
pxor state0,state3
movdqa state3,temp
pslld $16,temp
psrld $16,state3
por temp,state3
/* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
paddd state3,state2
pxor state2,state1
movdqa state1,temp
pslld $12,temp
psrld $20,state1
por temp,state1
/* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
paddd state1,state0
pxor state0,state3
movdqa state3,temp
pslld $8,temp
psrld $24,state3
por temp,state3
/* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
paddd state3,state2
pxor state2,state1
movdqa state1,temp
pslld $7,temp
psrld $25,state1
por temp,state1
/* state1[0,1,2,3] = state1[1,2,3,0] */
pshufd $0x39,state1,state1
/* state2[0,1,2,3] = state2[2,3,0,1] */
pshufd $0x4e,state2,state2
/* state3[0,1,2,3] = state3[3,0,1,2] */
pshufd $0x93,state3,state3
/* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
paddd state1,state0
pxor state0,state3
movdqa state3,temp
pslld $16,temp
psrld $16,state3
por temp,state3
/* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
paddd state3,state2
pxor state2,state1
movdqa state1,temp
pslld $12,temp
psrld $20,state1
por temp,state1
/* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
paddd state1,state0
pxor state0,state3
movdqa state3,temp
pslld $8,temp
psrld $24,state3
por temp,state3
/* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
paddd state3,state2
pxor state2,state1
movdqa state1,temp
pslld $7,temp
psrld $25,state1
por temp,state1
/* state1[0,1,2,3] = state1[3,0,1,2] */
pshufd $0x93,state1,state1
/* state2[0,1,2,3] = state2[2,3,0,1] */
pshufd $0x4e,state2,state2
/* state3[0,1,2,3] = state3[1,2,3,0] */
pshufd $0x39,state3,state3
decb i
jnz .Lpermute
/* output0 = state0 + copy0 */
paddd copy0,state0
movups state0,0x00(output)
/* output1 = state1 + copy1 */
paddd copy1,state1
movups state1,0x10(output)
/* output2 = state2 + copy2 */
paddd copy2,state2
movups state2,0x20(output)
/* output3 = state3 + copy3 */
paddd copy3,state3
movups state3,0x30(output)
/* ++copy3.counter */
paddq one,copy3
/* output += 64, --nblocks */
addq $64,output
decq nblocks
jnz .Lblock
/* counter = copy3.counter */
movq copy3,0x00(counter)
/* Zero out the potentially sensitive regs, in case nothing uses these again. */
pxor state0,state0
pxor state1,state1
pxor state2,state2
pxor state3,state3
pxor copy1,copy1
pxor copy2,copy2
pxor temp,temp
ret
SYM_FUNC_END(__arch_chacha20_blocks_nostack)

View file

@ -0,0 +1,17 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2022-2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
*/
#include <linux/types.h>
#include "../../../../lib/vdso/getrandom.c"
ssize_t __vdso_getrandom(void *buffer, size_t len, unsigned int flags, void *opaque_state, size_t opaque_len);
ssize_t __vdso_getrandom(void *buffer, size_t len, unsigned int flags, void *opaque_state, size_t opaque_len)
{
return __cvdso_getrandom(buffer, len, flags, opaque_state, opaque_len);
}
ssize_t getrandom(void *, size_t, unsigned int, void *, size_t)
__attribute__((weak, alias("__vdso_getrandom")));

View file

@ -0,0 +1,55 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 2022-2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
*/
#ifndef __ASM_VDSO_GETRANDOM_H
#define __ASM_VDSO_GETRANDOM_H
#ifndef __ASSEMBLY__
#include <asm/unistd.h>
#include <asm/vvar.h>
/**
* getrandom_syscall - Invoke the getrandom() syscall.
* @buffer: Destination buffer to fill with random bytes.
* @len: Size of @buffer in bytes.
* @flags: Zero or more GRND_* flags.
* Returns: The number of random bytes written to @buffer, or a negative value indicating an error.
*/
static __always_inline ssize_t getrandom_syscall(void *buffer, size_t len, unsigned int flags)
{
long ret;
asm ("syscall" : "=a" (ret) :
"0" (__NR_getrandom), "D" (buffer), "S" (len), "d" (flags) :
"rcx", "r11", "memory");
return ret;
}
#define __vdso_rng_data (VVAR(_vdso_rng_data))
static __always_inline const struct vdso_rng_data *__arch_get_vdso_rng_data(void)
{
if (IS_ENABLED(CONFIG_TIME_NS) && __vdso_data->clock_mode == VDSO_CLOCKMODE_TIMENS)
return (void *)&__vdso_rng_data + ((void *)&__timens_vdso_data - (void *)&__vdso_data);
return &__vdso_rng_data;
}
/**
* __arch_chacha20_blocks_nostack - Generate ChaCha20 stream without using the stack.
* @dst_bytes: Destination buffer to hold @nblocks * 64 bytes of output.
* @key: 32-byte input key.
* @counter: 8-byte counter, read on input and updated on return.
* @nblocks: Number of blocks to generate.
*
* Generates a given positive number of blocks of ChaCha20 output with nonce=0, and does not write
* to any stack or memory outside of the parameters passed to it, in order to mitigate stack data
* leaking into forked child processes.
*/
extern void __arch_chacha20_blocks_nostack(u8 *dst_bytes, const u32 *key, u32 *counter, size_t nblocks);
#endif /* !__ASSEMBLY__ */
#endif /* __ASM_VDSO_GETRANDOM_H */

View file

@ -10,6 +10,8 @@
#include <asm/vvar.h>
DEFINE_VVAR(struct vdso_data, _vdso_data);
DEFINE_VVAR_SINGLE(struct vdso_rng_data, _vdso_rng_data);
/*
* Update the vDSO data page to keep in sync with kernel timekeeping.
*/

View file

@ -26,6 +26,8 @@
*/
#define DECLARE_VVAR(offset, type, name) \
EMIT_VVAR(name, offset)
#define DECLARE_VVAR_SINGLE(offset, type, name) \
EMIT_VVAR(name, offset)
#else
@ -37,6 +39,10 @@ extern char __vvar_page;
extern type timens_ ## name[CS_BASES] \
__attribute__((visibility("hidden"))); \
#define DECLARE_VVAR_SINGLE(offset, type, name) \
extern type vvar_ ## name \
__attribute__((visibility("hidden"))); \
#define VVAR(name) (vvar_ ## name)
#define TIMENS(name) (timens_ ## name)
@ -44,12 +50,22 @@ extern char __vvar_page;
type name[CS_BASES] \
__attribute__((section(".vvar_" #name), aligned(16))) __visible
#define DEFINE_VVAR_SINGLE(type, name) \
type name \
__attribute__((section(".vvar_" #name), aligned(16))) __visible
#endif
/* DECLARE_VVAR(offset, type, name) */
DECLARE_VVAR(128, struct vdso_data, _vdso_data)
#if !defined(_SINGLE_DATA)
#define _SINGLE_DATA
DECLARE_VVAR_SINGLE(640, struct vdso_rng_data, _vdso_rng_data)
#endif
#undef DECLARE_VVAR
#undef DECLARE_VVAR_SINGLE
#endif

View file

@ -1,6 +1,6 @@
// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
/*
* Copyright (C) 2017-2022 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
* Copyright (C) 2017-2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
* Copyright Matt Mackall <mpm@selenic.com>, 2003, 2004, 2005
* Copyright Theodore Ts'o, 1994, 1995, 1996, 1997, 1998, 1999. All rights reserved.
*
@ -56,6 +56,10 @@
#include <linux/sched/isolation.h>
#include <crypto/chacha.h>
#include <crypto/blake2s.h>
#ifdef CONFIG_VDSO_GETRANDOM
#include <vdso/getrandom.h>
#include <vdso/datapage.h>
#endif
#include <asm/archrandom.h>
#include <asm/processor.h>
#include <asm/irq.h>
@ -271,6 +275,15 @@ static void crng_reseed(struct work_struct *work)
if (next_gen == ULONG_MAX)
++next_gen;
WRITE_ONCE(base_crng.generation, next_gen);
#ifdef CONFIG_VDSO_GETRANDOM
/* base_crng.generation's invalid value is ULONG_MAX, while
* _vdso_rng_data.generation's invalid value is 0, so add one to the
* former to arrive at the latter. Use smp_store_release so that this
* is ordered with the write above to base_crng.generation. Pairs with
* the smp_rmb() before the syscall in the vDSO code.
*/
smp_store_release(&_vdso_rng_data.generation, next_gen + 1);
#endif
if (!static_branch_likely(&crng_is_ready))
crng_init = CRNG_READY;
spin_unlock_irqrestore(&base_crng.lock, flags);
@ -721,6 +734,9 @@ static void __cold _credit_init_bits(size_t bits)
if (static_key_initialized && system_unbound_wq)
queue_work(system_unbound_wq, &set_ready);
atomic_notifier_call_chain(&random_ready_notifier, 0, NULL);
#ifdef CONFIG_VDSO_GETRANDOM
WRITE_ONCE(_vdso_rng_data.is_ready, true);
#endif
wake_up_interruptible(&crng_init_wait);
kill_fasync(&fasync, SIGIO, POLL_IN);
pr_notice("crng init done\n");

View file

@ -988,6 +988,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
[ilog2(VM_SHADOW_STACK)] = "ss",
#endif
#ifdef CONFIG_64BIT
[ilog2(VM_DROPPABLE)] = "dp",
[ilog2(VM_SEALED)] = "sl",
#endif
};

View file

@ -406,6 +406,13 @@ extern unsigned int kobjsize(const void *objp);
#define VM_ALLOW_ANY_UNCACHED VM_NONE
#endif
#ifdef CONFIG_64BIT
#define VM_DROPPABLE_BIT 40
#define VM_DROPPABLE BIT(VM_DROPPABLE_BIT)
#else
#define VM_DROPPABLE VM_NONE
#endif
#ifdef CONFIG_64BIT
/* VM is sealed, in vm_flags */
#define VM_SEALED _BITUL(63)

View file

@ -218,6 +218,9 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma,
{
vm_flags &= __VM_UFFD_FLAGS;
if (vm_flags & VM_DROPPABLE)
return false;
if ((vm_flags & VM_UFFD_MINOR) &&
(!is_vm_hugetlb_page(vma) && !vma_is_shmem(vma)))
return false;

View file

@ -165,6 +165,12 @@ IF_HAVE_PG_ARCH_X(arch_3)
# define IF_HAVE_UFFD_MINOR(flag, name)
#endif
#ifdef CONFIG_64BIT
# define IF_HAVE_VM_DROPPABLE(flag, name) {flag, name},
#else
# define IF_HAVE_VM_DROPPABLE(flag, name)
#endif
#define __def_vmaflag_names \
{VM_READ, "read" }, \
{VM_WRITE, "write" }, \
@ -197,6 +203,7 @@ IF_HAVE_VM_SOFTDIRTY(VM_SOFTDIRTY, "softdirty" ) \
{VM_MIXEDMAP, "mixedmap" }, \
{VM_HUGEPAGE, "hugepage" }, \
{VM_NOHUGEPAGE, "nohugepage" }, \
IF_HAVE_VM_DROPPABLE(VM_DROPPABLE, "droppable" ) \
{VM_MERGEABLE, "mergeable" } \
#define show_vma_flags(flags) \

View file

@ -17,6 +17,7 @@
#define MAP_SHARED 0x01 /* Share changes */
#define MAP_PRIVATE 0x02 /* Changes are private */
#define MAP_SHARED_VALIDATE 0x03 /* share + validate extension flags */
#define MAP_DROPPABLE 0x08 /* Zero memory under memory pressure. */
/*
* Huge page size encoding when MAP_HUGETLB is specified, and a huge page

View file

@ -20,7 +20,7 @@
/* Add to (or subtract from) the entropy count. (Superuser only.) */
#define RNDADDTOENTCNT _IOW( 'R', 0x01, int )
/* Get the contents of the entropy pool. (Superuser only.) */
/* Get the contents of the entropy pool. (Superuser only.) (Removed in 2.6.9-rc2.) */
#define RNDGETPOOL _IOR( 'R', 0x02, int [2] )
/*
@ -55,4 +55,19 @@ struct rand_pool_info {
#define GRND_RANDOM 0x0002
#define GRND_INSECURE 0x0004
/**
* struct vgetrandom_opaque_params - arguments for allocating memory for vgetrandom
*
* @size_per_opaque_state: Size of each state that is to be passed to vgetrandom().
* @mmap_prot: Value of the prot argument in mmap(2).
* @mmap_flags: Value of the flags argument in mmap(2).
* @reserved: Reserved for future use.
*/
struct vgetrandom_opaque_params {
__u32 size_of_opaque_state;
__u32 mmap_prot;
__u32 mmap_flags;
__u32 reserved[13];
};
#endif /* _UAPI_LINUX_RANDOM_H */

View file

@ -117,6 +117,16 @@ struct vdso_data {
struct arch_vdso_data arch_data;
};
/**
* struct vdso_rng_data - vdso RNG state information
* @generation: counter representing the number of RNG reseeds
* @is_ready: boolean signaling whether the RNG is initialized
*/
struct vdso_rng_data {
u64 generation;
u8 is_ready;
};
/*
* We use the hidden visibility to prevent the compiler from generating a GOT
* relocation. Not only is going through a GOT useless (the entry couldn't and
@ -128,6 +138,7 @@ struct vdso_data {
*/
extern struct vdso_data _vdso_data[CS_BASES] __attribute__((visibility("hidden")));
extern struct vdso_data _timens_data[CS_BASES] __attribute__((visibility("hidden")));
extern struct vdso_rng_data _vdso_rng_data __attribute__((visibility("hidden")));
/**
* union vdso_data_store - Generic vDSO data page

46
include/vdso/getrandom.h Normal file
View file

@ -0,0 +1,46 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 2022-2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
*/
#ifndef _VDSO_GETRANDOM_H
#define _VDSO_GETRANDOM_H
#include <linux/types.h>
#define CHACHA_KEY_SIZE 32
#define CHACHA_BLOCK_SIZE 64
/**
* struct vgetrandom_state - State used by vDSO getrandom().
*
* @batch: One and a half ChaCha20 blocks of buffered RNG output.
*
* @key: Key to be used for generating next batch.
*
* @batch_key: Union of the prior two members, which is exactly two full
* ChaCha20 blocks in size, so that @batch and @key can be filled
* together.
*
* @generation: Snapshot of @rng_info->generation in the vDSO data page at
* the time @key was generated.
*
* @pos: Offset into @batch of the next available random byte.
*
* @in_use: Reentrancy guard for reusing a state within the same thread
* due to signal handlers.
*/
struct vgetrandom_state {
union {
struct {
u8 batch[CHACHA_BLOCK_SIZE * 3 / 2];
u32 key[CHACHA_KEY_SIZE / sizeof(u32)];
};
u8 batch_key[CHACHA_BLOCK_SIZE * 2];
};
u64 generation;
u8 pos;
bool in_use;
};
#endif /* _VDSO_GETRANDOM_H */

View file

@ -38,3 +38,8 @@ config GENERIC_VDSO_OVERFLOW_PROTECT
in the hotpath.
endif
config VDSO_GETRANDOM
bool
help
Selected by architectures that support vDSO getrandom().

251
lib/vdso/getrandom.c Normal file
View file

@ -0,0 +1,251 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2022-2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
*/
#include <linux/cache.h>
#include <linux/kernel.h>
#include <linux/time64.h>
#include <vdso/datapage.h>
#include <vdso/getrandom.h>
#include <asm/vdso/getrandom.h>
#include <asm/vdso/vsyscall.h>
#include <asm/unaligned.h>
#include <uapi/linux/mman.h>
#define MEMCPY_AND_ZERO_SRC(type, dst, src, len) do { \
while (len >= sizeof(type)) { \
__put_unaligned_t(type, __get_unaligned_t(type, src), dst); \
__put_unaligned_t(type, 0, src); \
dst += sizeof(type); \
src += sizeof(type); \
len -= sizeof(type); \
} \
} while (0)
static void memcpy_and_zero_src(void *dst, void *src, size_t len)
{
if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) {
if (IS_ENABLED(CONFIG_64BIT))
MEMCPY_AND_ZERO_SRC(u64, dst, src, len);
MEMCPY_AND_ZERO_SRC(u32, dst, src, len);
MEMCPY_AND_ZERO_SRC(u16, dst, src, len);
}
MEMCPY_AND_ZERO_SRC(u8, dst, src, len);
}
/**
* __cvdso_getrandom_data - Generic vDSO implementation of getrandom() syscall.
* @rng_info: Describes state of kernel RNG, memory shared with kernel.
* @buffer: Destination buffer to fill with random bytes.
* @len: Size of @buffer in bytes.
* @flags: Zero or more GRND_* flags.
* @opaque_state: Pointer to an opaque state area.
* @opaque_len: Length of opaque state area.
*
* This implements a "fast key erasure" RNG using ChaCha20, in the same way that the kernel's
* getrandom() syscall does. It periodically reseeds its key from the kernel's RNG, at the same
* schedule that the kernel's RNG is reseeded. If the kernel's RNG is not ready, then this always
* calls into the syscall.
*
* If @buffer, @len, and @flags are 0, and @opaque_len is ~0UL, then @opaque_state is populated
* with a struct vgetrandom_opaque_params and the function returns 0; if it does not return 0,
* this function should not be used.
*
* @opaque_state *must* be allocated by calling mmap(2) using the mmap_prot and mmap_flags fields
* from the struct vgetrandom_opaque_params, and states must not straddle pages. Unless external
* locking is used, one state must be allocated per thread, as it is not safe to call this function
* concurrently with the same @opaque_state. However, it is safe to call this using the same
* @opaque_state that is shared between main code and signal handling code, within the same thread.
*
* Returns: The number of random bytes written to @buffer, or a negative value indicating an error.
*/
static __always_inline ssize_t
__cvdso_getrandom_data(const struct vdso_rng_data *rng_info, void *buffer, size_t len,
unsigned int flags, void *opaque_state, size_t opaque_len)
{
ssize_t ret = min_t(size_t, INT_MAX & PAGE_MASK /* = MAX_RW_COUNT */, len);
struct vgetrandom_state *state = opaque_state;
size_t batch_len, nblocks, orig_len = len;
bool in_use, have_retried = false;
unsigned long current_generation;
void *orig_buffer = buffer;
u32 counter[2] = { 0 };
if (unlikely(opaque_len == ~0UL && !buffer && !len && !flags)) {
*(struct vgetrandom_opaque_params *)opaque_state = (struct vgetrandom_opaque_params) {
.size_of_opaque_state = sizeof(*state),
.mmap_prot = PROT_READ | PROT_WRITE,
.mmap_flags = MAP_DROPPABLE | MAP_ANONYMOUS
};
return 0;
}
/* The state must not straddle a page, since pages can be zeroed at any time. */
if (unlikely(((unsigned long)opaque_state & ~PAGE_MASK) + sizeof(*state) > PAGE_SIZE))
return -EFAULT;
/* If the caller passes the wrong size, which might happen due to CRIU, fallback. */
if (unlikely(opaque_len != sizeof(*state)))
goto fallback_syscall;
/*
* If the kernel's RNG is not yet ready, then it's not possible to provide random bytes from
* userspace, because A) the various @flags require this to block, or not, depending on
* various factors unavailable to userspace, and B) the kernel's behavior before the RNG is
* ready is to reseed from the entropy pool at every invocation.
*/
if (unlikely(!READ_ONCE(rng_info->is_ready)))
goto fallback_syscall;
/*
* This condition is checked after @rng_info->is_ready, because before the kernel's RNG is
* initialized, the @flags parameter may require this to block or return an error, even when
* len is zero.
*/
if (unlikely(!len))
return 0;
/*
* @state->in_use is basic reentrancy protection against this running in a signal handler
* with the same @opaque_state, but obviously not atomic wrt multiple CPUs or more than one
* level of reentrancy. If a signal interrupts this after reading @state->in_use, but before
* writing @state->in_use, there is still no race, because the signal handler will run to
* its completion before returning execution.
*/
in_use = READ_ONCE(state->in_use);
if (unlikely(in_use))
/* The syscall simply fills the buffer and does not touch @state, so fallback. */
goto fallback_syscall;
WRITE_ONCE(state->in_use, true);
retry_generation:
/*
* @rng_info->generation must always be read here, as it serializes @state->key with the
* kernel's RNG reseeding schedule.
*/
current_generation = READ_ONCE(rng_info->generation);
/*
* If @state->generation doesn't match the kernel RNG's generation, then it means the
* kernel's RNG has reseeded, and so @state->key is reseeded as well.
*/
if (unlikely(state->generation != current_generation)) {
/*
* Write the generation before filling the key, in case of fork. If there is a fork
* just after this line, the parent and child will get different random bytes from
* the syscall, which is good. However, were this line to occur after the getrandom
* syscall, then both child and parent could have the same bytes and the same
* generation counter, so the fork would not be detected. Therefore, write
* @state->generation before the call to the getrandom syscall.
*/
WRITE_ONCE(state->generation, current_generation);
/*
* Prevent the syscall from being reordered wrt current_generation. Pairs with the
* smp_store_release(&_vdso_rng_data.generation) in random.c.
*/
smp_rmb();
/* Reseed @state->key using fresh bytes from the kernel. */
if (getrandom_syscall(state->key, sizeof(state->key), 0) != sizeof(state->key)) {
/*
* If the syscall failed to refresh the key, then @state->key is now
* invalid, so invalidate the generation so that it is not used again, and
* fallback to using the syscall entirely.
*/
WRITE_ONCE(state->generation, 0);
/*
* Set @state->in_use to false only after the last write to @state in the
* line above.
*/
WRITE_ONCE(state->in_use, false);
goto fallback_syscall;
}
/*
* Set @state->pos to beyond the end of the batch, so that the batch is refilled
* using the new key.
*/
state->pos = sizeof(state->batch);
}
/* Set len to the total amount of bytes that this function is allowed to read, ret. */
len = ret;
more_batch:
/*
* First use bytes out of @state->batch, which may have been filled by the last call to this
* function.
*/
batch_len = min_t(size_t, sizeof(state->batch) - state->pos, len);
if (batch_len) {
/* Zeroing at the same time as memcpying helps preserve forward secrecy. */
memcpy_and_zero_src(buffer, state->batch + state->pos, batch_len);
state->pos += batch_len;
buffer += batch_len;
len -= batch_len;
}
if (!len) {
/* Prevent the loop from being reordered wrt ->generation. */
barrier();
/*
* Since @rng_info->generation will never be 0, re-read @state->generation, rather
* than using the local current_generation variable, to learn whether a fork
* occurred or if @state was zeroed due to memory pressure. Primarily, though, this
* indicates whether the kernel's RNG has reseeded, in which case generate a new key
* and start over.
*/
if (unlikely(READ_ONCE(state->generation) != READ_ONCE(rng_info->generation))) {
/*
* Prevent this from looping forever in case of low memory or racing with a
* user force-reseeding the kernel's RNG using the ioctl.
*/
if (have_retried) {
WRITE_ONCE(state->in_use, false);
goto fallback_syscall;
}
have_retried = true;
buffer = orig_buffer;
goto retry_generation;
}
/*
* Set @state->in_use to false only when there will be no more reads or writes of
* @state.
*/
WRITE_ONCE(state->in_use, false);
return ret;
}
/* Generate blocks of RNG output directly into @buffer while there's enough room left. */
nblocks = len / CHACHA_BLOCK_SIZE;
if (nblocks) {
__arch_chacha20_blocks_nostack(buffer, state->key, counter, nblocks);
buffer += nblocks * CHACHA_BLOCK_SIZE;
len -= nblocks * CHACHA_BLOCK_SIZE;
}
BUILD_BUG_ON(sizeof(state->batch_key) % CHACHA_BLOCK_SIZE != 0);
/* Refill the batch and overwrite the key, in order to preserve forward secrecy. */
__arch_chacha20_blocks_nostack(state->batch_key, state->key, counter,
sizeof(state->batch_key) / CHACHA_BLOCK_SIZE);
/* Since the batch was just refilled, set the position back to 0 to indicate a full batch. */
state->pos = 0;
goto more_batch;
fallback_syscall:
return getrandom_syscall(orig_buffer, orig_len, flags);
}
static __always_inline ssize_t
__cvdso_getrandom(void *buffer, size_t len, unsigned int flags, void *opaque_state, size_t opaque_len)
{
return __cvdso_getrandom_data(__arch_get_vdso_rng_data(), buffer, len, flags, opaque_state, opaque_len);
}

View file

@ -713,7 +713,7 @@ static bool vma_ksm_compatible(struct vm_area_struct *vma)
{
if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE | VM_PFNMAP |
VM_IO | VM_DONTEXPAND | VM_HUGETLB |
VM_MIXEDMAP))
VM_MIXEDMAP| VM_DROPPABLE))
return false; /* just ignore the advice */
if (vma_is_dax(vma))

View file

@ -1068,13 +1068,16 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
new_flags |= VM_WIPEONFORK;
break;
case MADV_KEEPONFORK:
if (vma->vm_flags & VM_DROPPABLE)
return -EINVAL;
new_flags &= ~VM_WIPEONFORK;
break;
case MADV_DONTDUMP:
new_flags |= VM_DONTDUMP;
break;
case MADV_DODUMP:
if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL)
if ((!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) ||
(vma->vm_flags & VM_DROPPABLE))
return -EINVAL;
new_flags &= ~VM_DONTDUMP;
break;

View file

@ -5801,6 +5801,7 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
/* If the fault handler drops the mmap_lock, vma may be freed */
struct mm_struct *mm = vma->vm_mm;
vm_fault_t ret;
bool is_droppable;
__set_current_state(TASK_RUNNING);
@ -5815,6 +5816,8 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
goto out;
}
is_droppable = !!(vma->vm_flags & VM_DROPPABLE);
/*
* Enable the memcg OOM handling for faults triggered in user
* space. Kernel faults are handled more gracefully.
@ -5829,8 +5832,18 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
else
ret = __handle_mm_fault(vma, address, flags);
/*
* Warning: It is no longer safe to dereference vma-> after this point,
* because mmap_lock might have been dropped by __handle_mm_fault(), so
* vma might be destroyed from underneath us.
*/
lru_gen_exit_fault();
/* If the mapping is droppable, then errors due to OOM aren't fatal. */
if (is_droppable)
ret &= ~VM_FAULT_OOM;
if (flags & FAULT_FLAG_USER) {
mem_cgroup_exit_user_fault();
/*

View file

@ -2305,6 +2305,9 @@ struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct
pgoff_t ilx;
struct folio *folio;
if (vma->vm_flags & VM_DROPPABLE)
gfp |= __GFP_NOWARN;
pol = get_vma_policy(vma, addr, order, &ilx);
folio = folio_alloc_mpol_noprof(gfp, order, pol, ilx, numa_node_id());
mpol_cond_put(pol);

View file

@ -474,7 +474,7 @@ static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
if (newflags == oldflags || (oldflags & VM_SPECIAL) ||
is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) ||
vma_is_dax(vma) || vma_is_secretmem(vma))
vma_is_dax(vma) || vma_is_secretmem(vma) || (oldflags & VM_DROPPABLE))
/* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */
goto out;

View file

@ -1410,6 +1410,36 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
pgoff = 0;
vm_flags |= VM_SHARED | VM_MAYSHARE;
break;
case MAP_DROPPABLE:
if (VM_DROPPABLE == VM_NONE)
return -ENOTSUPP;
/*
* A locked or stack area makes no sense to be droppable.
*
* Also, since droppable pages can just go away at any time
* it makes no sense to copy them on fork or dump them.
*
* And don't attempt to combine with hugetlb for now.
*/
if (flags & (MAP_LOCKED | MAP_HUGETLB))
return -EINVAL;
if (vm_flags & (VM_GROWSDOWN | VM_GROWSUP))
return -EINVAL;
vm_flags |= VM_DROPPABLE;
/*
* If the pages can be dropped, then it doesn't make
* sense to reserve them.
*/
vm_flags |= VM_NORESERVE;
/*
* Likewise, they're volatile enough that they
* shouldn't survive forks or coredumps.
*/
vm_flags |= VM_WIPEONFORK | VM_DONTDUMP;
fallthrough;
case MAP_PRIVATE:
/*
* Set pgoff according to addr for anon_vma.

View file

@ -1412,7 +1412,11 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
VM_BUG_ON_VMA(address < vma->vm_start ||
address + (nr << PAGE_SHIFT) > vma->vm_end, vma);
if (!folio_test_swapbacked(folio))
/*
* VM_DROPPABLE mappings don't swap; instead they're just dropped when
* under memory pressure.
*/
if (!folio_test_swapbacked(folio) && !(vma->vm_flags & VM_DROPPABLE))
__folio_set_swapbacked(folio);
__folio_set_anon(folio, vma, address, exclusive);
@ -1848,7 +1852,13 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
* plus the rmap(s) (dropped by discard:).
*/
if (ref_count == 1 + map_count &&
!folio_test_dirty(folio)) {
(!folio_test_dirty(folio) ||
/*
* Unlike MADV_FREE mappings, VM_DROPPABLE
* ones can be dropped even if they've
* been dirtied.
*/
(vma->vm_flags & VM_DROPPABLE))) {
dec_mm_counter(mm, MM_ANONPAGES);
goto discard;
}
@ -1858,7 +1868,12 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
* discarded. Remap the page to page table.
*/
set_pte_at(mm, address, pvmw.pte, pteval);
folio_set_swapbacked(folio);
/*
* Unlike MADV_FREE mappings, VM_DROPPABLE ones
* never get swap backed on failure to drop.
*/
if (!(vma->vm_flags & VM_DROPPABLE))
folio_set_swapbacked(folio);
goto walk_abort;
}

View file

@ -4301,15 +4301,6 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c
return true;
}
/* dirty lazyfree */
if (type == LRU_GEN_FILE && folio_test_anon(folio) && folio_test_dirty(folio)) {
success = lru_gen_del_folio(lruvec, folio, true);
VM_WARN_ON_ONCE_FOLIO(!success, folio);
folio_set_swapbacked(folio);
lruvec_add_folio_tail(lruvec, folio);
return true;
}
/* promoted */
if (gen != lru_gen_from_seq(lrugen->min_seq[type])) {
list_move(&folio->lru, &lrugen->folios[gen][type][zone]);

View file

View file

@ -17,6 +17,7 @@
#define MAP_SHARED 0x01 /* Share changes */
#define MAP_PRIVATE 0x02 /* Changes are private */
#define MAP_SHARED_VALIDATE 0x03 /* share + validate extension flags */
#define MAP_DROPPABLE 0x08 /* Zero memory under memory pressure. */
/*
* Huge page size encoding when MAP_HUGETLB is specified, and a huge page

View file

@ -50,3 +50,4 @@ hugetlb_fault_after_madv
hugetlb_madv_vs_map
mseal_test
seal_elf
droppable

View file

@ -76,6 +76,7 @@ TEST_GEN_FILES += mdwe_test
TEST_GEN_FILES += hugetlb_fault_after_madv
TEST_GEN_FILES += hugetlb_madv_vs_map
TEST_GEN_FILES += hugetlb_dio
TEST_GEN_FILES += droppable
ifneq ($(ARCH),arm64)
TEST_GEN_FILES += soft-dirty

View file

@ -0,0 +1,53 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
*/
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <signal.h>
#include <sys/mman.h>
#include <linux/mman.h>
#include "../kselftest.h"
int main(int argc, char *argv[])
{
size_t alloc_size = 134217728;
size_t page_size = getpagesize();
void *alloc;
pid_t child;
ksft_print_header();
ksft_set_plan(1);
alloc = mmap(0, alloc_size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_DROPPABLE, -1, 0);
assert(alloc != MAP_FAILED);
memset(alloc, 'A', alloc_size);
for (size_t i = 0; i < alloc_size; i += page_size)
assert(*(uint8_t *)(alloc + i));
child = fork();
assert(child >= 0);
if (!child) {
for (;;)
*(char *)malloc(page_size) = 'B';
}
for (bool done = false; !done;) {
for (size_t i = 0; i < alloc_size; i += page_size) {
if (!*(uint8_t *)(alloc + i)) {
done = true;
break;
}
}
}
kill(child, SIGTERM);
ksft_test_result_pass("MAP_DROPPABLE: PASS\n");
exit(KSFT_PASS);
}

View file

@ -6,3 +6,5 @@ vdso_test_correctness
vdso_test_gettimeofday
vdso_test_getcpu
vdso_standalone_test_x86
vdso_test_getrandom
vdso_test_chacha

View file

@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
uname_M := $(shell uname -m 2>/dev/null || echo not)
ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/x86/ -e s/x86_64/x86/)
SODIUM := $(shell pkg-config --libs libsodium 2>/dev/null)
TEST_GEN_PROGS := vdso_test_gettimeofday
TEST_GEN_PROGS += vdso_test_getcpu
@ -10,6 +11,12 @@ ifeq ($(ARCH),$(filter $(ARCH),x86 x86_64))
TEST_GEN_PROGS += vdso_standalone_test_x86
endif
TEST_GEN_PROGS += vdso_test_correctness
ifeq ($(uname_M),x86_64)
TEST_GEN_PROGS += vdso_test_getrandom
ifneq ($(SODIUM),)
TEST_GEN_PROGS += vdso_test_chacha
endif
endif
CFLAGS := -std=gnu99
@ -28,3 +35,14 @@ $(OUTPUT)/vdso_standalone_test_x86: CFLAGS +=-nostdlib -fno-asynchronous-unwind-
$(OUTPUT)/vdso_test_correctness: vdso_test_correctness.c
$(OUTPUT)/vdso_test_correctness: LDFLAGS += -ldl
$(OUTPUT)/vdso_test_getrandom: parse_vdso.c
$(OUTPUT)/vdso_test_getrandom: CFLAGS += -isystem $(top_srcdir)/tools/include \
-isystem $(top_srcdir)/include/uapi
$(OUTPUT)/vdso_test_chacha: $(top_srcdir)/arch/$(ARCH)/entry/vdso/vgetrandom-chacha.S
$(OUTPUT)/vdso_test_chacha: CFLAGS += -idirafter $(top_srcdir)/tools/include \
-isystem $(top_srcdir)/arch/$(ARCH)/include \
-isystem $(top_srcdir)/include \
-D__ASSEMBLY__ -DBULID_VDSO -DCONFIG_FUNCTION_ALIGNMENT=0 \
-Wa,--noexecstack $(SODIUM)

View file

@ -0,0 +1,43 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2022-2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
*/
#include <sodium/crypto_stream_chacha20.h>
#include <sys/random.h>
#include <string.h>
#include <stdint.h>
#include "../kselftest.h"
extern void __arch_chacha20_blocks_nostack(uint8_t *dst_bytes, const uint8_t *key, uint32_t *counter, size_t nblocks);
int main(int argc, char *argv[])
{
enum { TRIALS = 1000, BLOCKS = 128, BLOCK_SIZE = 64 };
static const uint8_t nonce[8] = { 0 };
uint32_t counter[2];
uint8_t key[32];
uint8_t output1[BLOCK_SIZE * BLOCKS], output2[BLOCK_SIZE * BLOCKS];
ksft_print_header();
ksft_set_plan(1);
for (unsigned int trial = 0; trial < TRIALS; ++trial) {
if (getrandom(key, sizeof(key), 0) != sizeof(key)) {
printf("getrandom() failed!\n");
return KSFT_SKIP;
}
crypto_stream_chacha20(output1, sizeof(output1), nonce, key);
for (unsigned int split = 0; split < BLOCKS; ++split) {
memset(output2, 'X', sizeof(output2));
memset(counter, 0, sizeof(counter));
if (split)
__arch_chacha20_blocks_nostack(output2, key, counter, split);
__arch_chacha20_blocks_nostack(output2 + split * BLOCK_SIZE, key, counter, BLOCKS - split);
if (memcmp(output1, output2, sizeof(output1)))
return KSFT_FAIL;
}
}
ksft_test_result_pass("chacha: PASS\n");
return KSFT_PASS;
}

View file

@ -0,0 +1,288 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2022-2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
*/
#include <assert.h>
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <signal.h>
#include <sys/auxv.h>
#include <sys/mman.h>
#include <sys/random.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <linux/random.h>
#include "../kselftest.h"
#include "parse_vdso.h"
#ifndef timespecsub
#define timespecsub(tsp, usp, vsp) \
do { \
(vsp)->tv_sec = (tsp)->tv_sec - (usp)->tv_sec; \
(vsp)->tv_nsec = (tsp)->tv_nsec - (usp)->tv_nsec; \
if ((vsp)->tv_nsec < 0) { \
(vsp)->tv_sec--; \
(vsp)->tv_nsec += 1000000000L; \
} \
} while (0)
#endif
static struct {
pthread_mutex_t lock;
void **states;
size_t len, cap;
} grnd_allocator = {
.lock = PTHREAD_MUTEX_INITIALIZER
};
static struct {
ssize_t(*fn)(void *, size_t, unsigned long, void *, size_t);
pthread_key_t key;
pthread_once_t initialized;
struct vgetrandom_opaque_params params;
} grnd_ctx = {
.initialized = PTHREAD_ONCE_INIT
};
static void *vgetrandom_get_state(void)
{
void *state = NULL;
pthread_mutex_lock(&grnd_allocator.lock);
if (!grnd_allocator.len) {
size_t page_size = getpagesize();
size_t new_cap;
size_t alloc_size, num = sysconf(_SC_NPROCESSORS_ONLN); /* Just a decent heuristic. */
void *new_block, *new_states;
alloc_size = (num * grnd_ctx.params.size_of_opaque_state + page_size - 1) & (~(page_size - 1));
num = (page_size / grnd_ctx.params.size_of_opaque_state) * (alloc_size / page_size);
new_block = mmap(0, alloc_size, grnd_ctx.params.mmap_prot, grnd_ctx.params.mmap_flags, -1, 0);
if (new_block == MAP_FAILED)
goto out;
new_cap = grnd_allocator.cap + num;
new_states = reallocarray(grnd_allocator.states, new_cap, sizeof(*grnd_allocator.states));
if (!new_states)
goto unmap;
grnd_allocator.cap = new_cap;
grnd_allocator.states = new_states;
for (size_t i = 0; i < num; ++i) {
if (((uintptr_t)new_block & (page_size - 1)) + grnd_ctx.params.size_of_opaque_state > page_size)
new_block = (void *)(((uintptr_t)new_block + page_size - 1) & (~(page_size - 1)));
grnd_allocator.states[i] = new_block;
new_block += grnd_ctx.params.size_of_opaque_state;
}
grnd_allocator.len = num;
goto success;
unmap:
munmap(new_block, alloc_size);
goto out;
}
success:
state = grnd_allocator.states[--grnd_allocator.len];
out:
pthread_mutex_unlock(&grnd_allocator.lock);
return state;
}
static void vgetrandom_put_state(void *state)
{
if (!state)
return;
pthread_mutex_lock(&grnd_allocator.lock);
grnd_allocator.states[grnd_allocator.len++] = state;
pthread_mutex_unlock(&grnd_allocator.lock);
}
static void vgetrandom_init(void)
{
if (pthread_key_create(&grnd_ctx.key, vgetrandom_put_state) != 0)
return;
unsigned long sysinfo_ehdr = getauxval(AT_SYSINFO_EHDR);
if (!sysinfo_ehdr) {
printf("AT_SYSINFO_EHDR is not present!\n");
exit(KSFT_SKIP);
}
vdso_init_from_sysinfo_ehdr(sysinfo_ehdr);
grnd_ctx.fn = (__typeof__(grnd_ctx.fn))vdso_sym("LINUX_2.6", "__vdso_getrandom");
if (!grnd_ctx.fn) {
printf("__vdso_getrandom is missing!\n");
exit(KSFT_FAIL);
}
if (grnd_ctx.fn(NULL, 0, 0, &grnd_ctx.params, ~0UL) != 0) {
printf("failed to fetch vgetrandom params!\n");
exit(KSFT_FAIL);
}
}
static ssize_t vgetrandom(void *buf, size_t len, unsigned long flags)
{
void *state;
pthread_once(&grnd_ctx.initialized, vgetrandom_init);
state = pthread_getspecific(grnd_ctx.key);
if (!state) {
state = vgetrandom_get_state();
if (pthread_setspecific(grnd_ctx.key, state) != 0) {
vgetrandom_put_state(state);
state = NULL;
}
if (!state) {
printf("vgetrandom_get_state failed!\n");
exit(KSFT_FAIL);
}
}
return grnd_ctx.fn(buf, len, flags, state, grnd_ctx.params.size_of_opaque_state);
}
enum { TRIALS = 25000000, THREADS = 256 };
static void *test_vdso_getrandom(void *)
{
for (size_t i = 0; i < TRIALS; ++i) {
unsigned int val;
ssize_t ret = vgetrandom(&val, sizeof(val), 0);
assert(ret == sizeof(val));
}
return NULL;
}
static void *test_libc_getrandom(void *)
{
for (size_t i = 0; i < TRIALS; ++i) {
unsigned int val;
ssize_t ret = getrandom(&val, sizeof(val), 0);
assert(ret == sizeof(val));
}
return NULL;
}
static void *test_syscall_getrandom(void *)
{
for (size_t i = 0; i < TRIALS; ++i) {
unsigned int val;
ssize_t ret = syscall(__NR_getrandom, &val, sizeof(val), 0);
assert(ret == sizeof(val));
}
return NULL;
}
static void bench_single(void)
{
struct timespec start, end, diff;
clock_gettime(CLOCK_MONOTONIC, &start);
test_vdso_getrandom(NULL);
clock_gettime(CLOCK_MONOTONIC, &end);
timespecsub(&end, &start, &diff);
printf(" vdso: %u times in %lu.%09lu seconds\n", TRIALS, diff.tv_sec, diff.tv_nsec);
clock_gettime(CLOCK_MONOTONIC, &start);
test_libc_getrandom(NULL);
clock_gettime(CLOCK_MONOTONIC, &end);
timespecsub(&end, &start, &diff);
printf(" libc: %u times in %lu.%09lu seconds\n", TRIALS, diff.tv_sec, diff.tv_nsec);
clock_gettime(CLOCK_MONOTONIC, &start);
test_syscall_getrandom(NULL);
clock_gettime(CLOCK_MONOTONIC, &end);
timespecsub(&end, &start, &diff);
printf("syscall: %u times in %lu.%09lu seconds\n", TRIALS, diff.tv_sec, diff.tv_nsec);
}
static void bench_multi(void)
{
struct timespec start, end, diff;
pthread_t threads[THREADS];
clock_gettime(CLOCK_MONOTONIC, &start);
for (size_t i = 0; i < THREADS; ++i)
assert(pthread_create(&threads[i], NULL, test_vdso_getrandom, NULL) == 0);
for (size_t i = 0; i < THREADS; ++i)
pthread_join(threads[i], NULL);
clock_gettime(CLOCK_MONOTONIC, &end);
timespecsub(&end, &start, &diff);
printf(" vdso: %u x %u times in %lu.%09lu seconds\n", TRIALS, THREADS, diff.tv_sec, diff.tv_nsec);
clock_gettime(CLOCK_MONOTONIC, &start);
for (size_t i = 0; i < THREADS; ++i)
assert(pthread_create(&threads[i], NULL, test_libc_getrandom, NULL) == 0);
for (size_t i = 0; i < THREADS; ++i)
pthread_join(threads[i], NULL);
clock_gettime(CLOCK_MONOTONIC, &end);
timespecsub(&end, &start, &diff);
printf(" libc: %u x %u times in %lu.%09lu seconds\n", TRIALS, THREADS, diff.tv_sec, diff.tv_nsec);
clock_gettime(CLOCK_MONOTONIC, &start);
for (size_t i = 0; i < THREADS; ++i)
assert(pthread_create(&threads[i], NULL, test_syscall_getrandom, NULL) == 0);
for (size_t i = 0; i < THREADS; ++i)
pthread_join(threads[i], NULL);
clock_gettime(CLOCK_MONOTONIC, &end);
timespecsub(&end, &start, &diff);
printf(" syscall: %u x %u times in %lu.%09lu seconds\n", TRIALS, THREADS, diff.tv_sec, diff.tv_nsec);
}
static void fill(void)
{
uint8_t weird_size[323929];
for (;;)
vgetrandom(weird_size, sizeof(weird_size), 0);
}
static void kselftest(void)
{
uint8_t weird_size[1263];
ksft_print_header();
ksft_set_plan(1);
for (size_t i = 0; i < 1000; ++i) {
ssize_t ret = vgetrandom(weird_size, sizeof(weird_size), 0);
if (ret != sizeof(weird_size))
exit(KSFT_FAIL);
}
ksft_test_result_pass("getrandom: PASS\n");
exit(KSFT_PASS);
}
static void usage(const char *argv0)
{
fprintf(stderr, "Usage: %s [bench-single|bench-multi|fill]\n", argv0);
}
int main(int argc, char *argv[])
{
if (argc == 1) {
kselftest();
return 0;
}
if (argc != 2) {
usage(argv[0]);
return 1;
}
if (!strcmp(argv[1], "bench-single"))
bench_single();
else if (!strcmp(argv[1], "bench-multi"))
bench_multi();
else if (!strcmp(argv[1], "fill"))
fill();
else {
usage(argv[0]);
return 1;
}
return 0;
}