diff --git a/MAINTAINERS b/MAINTAINERS index 07f753f039d2..644fc7b545e0 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -19057,7 +19057,13 @@ S: Maintained T: git https://git.kernel.org/pub/scm/linux/kernel/git/crng/random.git F: Documentation/devicetree/bindings/rng/microsoft,vmgenid.yaml F: drivers/char/random.c +F: include/linux/random.h +F: include/uapi/linux/random.h F: drivers/virt/vmgenid.c +F: include/vdso/getrandom.h +F: lib/vdso/getrandom.c +F: arch/x86/entry/vdso/vgetrandom* +F: arch/x86/include/asm/vdso/getrandom* RAPIDIO SUBSYSTEM M: Matt Porter diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cbe5fac4b9dd..007bab9f2a0e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -287,6 +287,7 @@ config X86 select HAVE_UNSTABLE_SCHED_CLOCK select HAVE_USER_RETURN_NOTIFIER select HAVE_GENERIC_VDSO + select VDSO_GETRANDOM if X86_64 select HOTPLUG_PARALLEL if SMP && X86_64 select HOTPLUG_SMT if SMP select HOTPLUG_SPLIT_STARTUP if SMP && X86_32 diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index 215a1b202a91..c9216ac4fb1e 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -7,7 +7,7 @@ include $(srctree)/lib/vdso/Makefile # Files to link into the vDSO: -vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o +vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o vgetrandom.o vgetrandom-chacha.o vobjs32-y := vdso32/note.o vdso32/system_call.o vdso32/sigreturn.o vobjs32-y += vdso32/vclock_gettime.o vdso32/vgetcpu.o vobjs-$(CONFIG_X86_SGX) += vsgx.o @@ -73,6 +73,7 @@ CFLAGS_REMOVE_vdso32/vclock_gettime.o = -pg CFLAGS_REMOVE_vgetcpu.o = -pg CFLAGS_REMOVE_vdso32/vgetcpu.o = -pg CFLAGS_REMOVE_vsgx.o = -pg +CFLAGS_REMOVE_vgetrandom.o = -pg # # X32 processes use x32 vDSO to access 64bit kernel data. diff --git a/arch/x86/entry/vdso/vdso.lds.S b/arch/x86/entry/vdso/vdso.lds.S index e8c60ae7a7c8..0bab5f4af6d1 100644 --- a/arch/x86/entry/vdso/vdso.lds.S +++ b/arch/x86/entry/vdso/vdso.lds.S @@ -30,6 +30,8 @@ VERSION { #ifdef CONFIG_X86_SGX __vdso_sgx_enter_enclave; #endif + getrandom; + __vdso_getrandom; local: *; }; } diff --git a/arch/x86/entry/vdso/vgetrandom-chacha.S b/arch/x86/entry/vdso/vgetrandom-chacha.S new file mode 100644 index 000000000000..bcba5639b8ee --- /dev/null +++ b/arch/x86/entry/vdso/vgetrandom-chacha.S @@ -0,0 +1,178 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2022-2024 Jason A. Donenfeld . All Rights Reserved. + */ + +#include +#include + +.section .rodata, "a" +.align 16 +CONSTANTS: .octa 0x6b20657479622d323320646e61707865 +.text + +/* + * Very basic SSE2 implementation of ChaCha20. Produces a given positive number + * of blocks of output with a nonce of 0, taking an input key and 8-byte + * counter. Importantly does not spill to the stack. Its arguments are: + * + * rdi: output bytes + * rsi: 32-byte key input + * rdx: 8-byte counter input/output + * rcx: number of 64-byte blocks to write to output + */ +SYM_FUNC_START(__arch_chacha20_blocks_nostack) + +.set output, %rdi +.set key, %rsi +.set counter, %rdx +.set nblocks, %rcx +.set i, %al +/* xmm registers are *not* callee-save. 
*/ +.set temp, %xmm0 +.set state0, %xmm1 +.set state1, %xmm2 +.set state2, %xmm3 +.set state3, %xmm4 +.set copy0, %xmm5 +.set copy1, %xmm6 +.set copy2, %xmm7 +.set copy3, %xmm8 +.set one, %xmm9 + + /* copy0 = "expand 32-byte k" */ + movaps CONSTANTS(%rip),copy0 + /* copy1,copy2 = key */ + movups 0x00(key),copy1 + movups 0x10(key),copy2 + /* copy3 = counter || zero nonce */ + movq 0x00(counter),copy3 + /* one = 1 || 0 */ + movq $1,%rax + movq %rax,one + +.Lblock: + /* state0,state1,state2,state3 = copy0,copy1,copy2,copy3 */ + movdqa copy0,state0 + movdqa copy1,state1 + movdqa copy2,state2 + movdqa copy3,state3 + + movb $10,i +.Lpermute: + /* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */ + paddd state1,state0 + pxor state0,state3 + movdqa state3,temp + pslld $16,temp + psrld $16,state3 + por temp,state3 + + /* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */ + paddd state3,state2 + pxor state2,state1 + movdqa state1,temp + pslld $12,temp + psrld $20,state1 + por temp,state1 + + /* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */ + paddd state1,state0 + pxor state0,state3 + movdqa state3,temp + pslld $8,temp + psrld $24,state3 + por temp,state3 + + /* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */ + paddd state3,state2 + pxor state2,state1 + movdqa state1,temp + pslld $7,temp + psrld $25,state1 + por temp,state1 + + /* state1[0,1,2,3] = state1[1,2,3,0] */ + pshufd $0x39,state1,state1 + /* state2[0,1,2,3] = state2[2,3,0,1] */ + pshufd $0x4e,state2,state2 + /* state3[0,1,2,3] = state3[3,0,1,2] */ + pshufd $0x93,state3,state3 + + /* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */ + paddd state1,state0 + pxor state0,state3 + movdqa state3,temp + pslld $16,temp + psrld $16,state3 + por temp,state3 + + /* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */ + paddd state3,state2 + pxor state2,state1 + movdqa state1,temp + pslld $12,temp + psrld $20,state1 + por temp,state1 + + /* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */ + paddd state1,state0 + pxor state0,state3 + movdqa state3,temp + pslld $8,temp + psrld $24,state3 + por temp,state3 + + /* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */ + paddd state3,state2 + pxor state2,state1 + movdqa state1,temp + pslld $7,temp + psrld $25,state1 + por temp,state1 + + /* state1[0,1,2,3] = state1[3,0,1,2] */ + pshufd $0x93,state1,state1 + /* state2[0,1,2,3] = state2[2,3,0,1] */ + pshufd $0x4e,state2,state2 + /* state3[0,1,2,3] = state3[1,2,3,0] */ + pshufd $0x39,state3,state3 + + decb i + jnz .Lpermute + + /* output0 = state0 + copy0 */ + paddd copy0,state0 + movups state0,0x00(output) + /* output1 = state1 + copy1 */ + paddd copy1,state1 + movups state1,0x10(output) + /* output2 = state2 + copy2 */ + paddd copy2,state2 + movups state2,0x20(output) + /* output3 = state3 + copy3 */ + paddd copy3,state3 + movups state3,0x30(output) + + /* ++copy3.counter */ + paddq one,copy3 + + /* output += 64, --nblocks */ + addq $64,output + decq nblocks + jnz .Lblock + + /* counter = copy3.counter */ + movq copy3,0x00(counter) + + /* Zero out the potentially sensitive regs, in case nothing uses these again. 
*/ + pxor state0,state0 + pxor state1,state1 + pxor state2,state2 + pxor state3,state3 + pxor copy1,copy1 + pxor copy2,copy2 + pxor temp,temp + + ret +SYM_FUNC_END(__arch_chacha20_blocks_nostack) diff --git a/arch/x86/entry/vdso/vgetrandom.c b/arch/x86/entry/vdso/vgetrandom.c new file mode 100644 index 000000000000..52d3c7faae2e --- /dev/null +++ b/arch/x86/entry/vdso/vgetrandom.c @@ -0,0 +1,17 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2022-2024 Jason A. Donenfeld . All Rights Reserved. + */ +#include + +#include "../../../../lib/vdso/getrandom.c" + +ssize_t __vdso_getrandom(void *buffer, size_t len, unsigned int flags, void *opaque_state, size_t opaque_len); + +ssize_t __vdso_getrandom(void *buffer, size_t len, unsigned int flags, void *opaque_state, size_t opaque_len) +{ + return __cvdso_getrandom(buffer, len, flags, opaque_state, opaque_len); +} + +ssize_t getrandom(void *, size_t, unsigned int, void *, size_t) + __attribute__((weak, alias("__vdso_getrandom"))); diff --git a/arch/x86/include/asm/vdso/getrandom.h b/arch/x86/include/asm/vdso/getrandom.h new file mode 100644 index 000000000000..b96e674cafde --- /dev/null +++ b/arch/x86/include/asm/vdso/getrandom.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2022-2024 Jason A. Donenfeld . All Rights Reserved. + */ +#ifndef __ASM_VDSO_GETRANDOM_H +#define __ASM_VDSO_GETRANDOM_H + +#ifndef __ASSEMBLY__ + +#include +#include + +/** + * getrandom_syscall - Invoke the getrandom() syscall. + * @buffer: Destination buffer to fill with random bytes. + * @len: Size of @buffer in bytes. + * @flags: Zero or more GRND_* flags. + * Returns: The number of random bytes written to @buffer, or a negative value indicating an error. + */ +static __always_inline ssize_t getrandom_syscall(void *buffer, size_t len, unsigned int flags) +{ + long ret; + + asm ("syscall" : "=a" (ret) : + "0" (__NR_getrandom), "D" (buffer), "S" (len), "d" (flags) : + "rcx", "r11", "memory"); + + return ret; +} + +#define __vdso_rng_data (VVAR(_vdso_rng_data)) + +static __always_inline const struct vdso_rng_data *__arch_get_vdso_rng_data(void) +{ + if (IS_ENABLED(CONFIG_TIME_NS) && __vdso_data->clock_mode == VDSO_CLOCKMODE_TIMENS) + return (void *)&__vdso_rng_data + ((void *)&__timens_vdso_data - (void *)&__vdso_data); + return &__vdso_rng_data; +} + +/** + * __arch_chacha20_blocks_nostack - Generate ChaCha20 stream without using the stack. + * @dst_bytes: Destination buffer to hold @nblocks * 64 bytes of output. + * @key: 32-byte input key. + * @counter: 8-byte counter, read on input and updated on return. + * @nblocks: Number of blocks to generate. + * + * Generates a given positive number of blocks of ChaCha20 output with nonce=0, and does not write + * to any stack or memory outside of the parameters passed to it, in order to mitigate stack data + * leaking into forked child processes. + */ +extern void __arch_chacha20_blocks_nostack(u8 *dst_bytes, const u32 *key, u32 *counter, size_t nblocks); + +#endif /* !__ASSEMBLY__ */ + +#endif /* __ASM_VDSO_GETRANDOM_H */ diff --git a/arch/x86/include/asm/vdso/vsyscall.h b/arch/x86/include/asm/vdso/vsyscall.h index 93226281b450..972415a8be31 100644 --- a/arch/x86/include/asm/vdso/vsyscall.h +++ b/arch/x86/include/asm/vdso/vsyscall.h @@ -10,6 +10,8 @@ #include DEFINE_VVAR(struct vdso_data, _vdso_data); +DEFINE_VVAR_SINGLE(struct vdso_rng_data, _vdso_rng_data); + /* * Update the vDSO data page to keep in sync with kernel timekeeping. 
*/ diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h index 183e98e49ab9..9d9af37f7cab 100644 --- a/arch/x86/include/asm/vvar.h +++ b/arch/x86/include/asm/vvar.h @@ -26,6 +26,8 @@ */ #define DECLARE_VVAR(offset, type, name) \ EMIT_VVAR(name, offset) +#define DECLARE_VVAR_SINGLE(offset, type, name) \ + EMIT_VVAR(name, offset) #else @@ -37,6 +39,10 @@ extern char __vvar_page; extern type timens_ ## name[CS_BASES] \ __attribute__((visibility("hidden"))); \ +#define DECLARE_VVAR_SINGLE(offset, type, name) \ + extern type vvar_ ## name \ + __attribute__((visibility("hidden"))); \ + #define VVAR(name) (vvar_ ## name) #define TIMENS(name) (timens_ ## name) @@ -44,12 +50,22 @@ extern char __vvar_page; type name[CS_BASES] \ __attribute__((section(".vvar_" #name), aligned(16))) __visible +#define DEFINE_VVAR_SINGLE(type, name) \ + type name \ + __attribute__((section(".vvar_" #name), aligned(16))) __visible + #endif /* DECLARE_VVAR(offset, type, name) */ DECLARE_VVAR(128, struct vdso_data, _vdso_data) +#if !defined(_SINGLE_DATA) +#define _SINGLE_DATA +DECLARE_VVAR_SINGLE(640, struct vdso_rng_data, _vdso_rng_data) +#endif + #undef DECLARE_VVAR +#undef DECLARE_VVAR_SINGLE #endif diff --git a/drivers/char/random.c b/drivers/char/random.c index 2597cb43f438..b02a12436750 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) /* - * Copyright (C) 2017-2022 Jason A. Donenfeld . All Rights Reserved. + * Copyright (C) 2017-2024 Jason A. Donenfeld . All Rights Reserved. * Copyright Matt Mackall , 2003, 2004, 2005 * Copyright Theodore Ts'o, 1994, 1995, 1996, 1997, 1998, 1999. All rights reserved. * @@ -56,6 +56,10 @@ #include #include #include +#ifdef CONFIG_VDSO_GETRANDOM +#include +#include +#endif #include #include #include @@ -271,6 +275,15 @@ static void crng_reseed(struct work_struct *work) if (next_gen == ULONG_MAX) ++next_gen; WRITE_ONCE(base_crng.generation, next_gen); +#ifdef CONFIG_VDSO_GETRANDOM + /* base_crng.generation's invalid value is ULONG_MAX, while + * _vdso_rng_data.generation's invalid value is 0, so add one to the + * former to arrive at the latter. Use smp_store_release so that this + * is ordered with the write above to base_crng.generation. Pairs with + * the smp_rmb() before the syscall in the vDSO code. 
+ */ + smp_store_release(&_vdso_rng_data.generation, next_gen + 1); +#endif if (!static_branch_likely(&crng_is_ready)) crng_init = CRNG_READY; spin_unlock_irqrestore(&base_crng.lock, flags); @@ -721,6 +734,9 @@ static void __cold _credit_init_bits(size_t bits) if (static_key_initialized && system_unbound_wq) queue_work(system_unbound_wq, &set_ready); atomic_notifier_call_chain(&random_ready_notifier, 0, NULL); +#ifdef CONFIG_VDSO_GETRANDOM + WRITE_ONCE(_vdso_rng_data.is_ready, true); +#endif wake_up_interruptible(&crng_init_wait); kill_fasync(&fasync, SIGIO, POLL_IN); pr_notice("crng init done\n"); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 775a2e8d600c..5f171ad7b436 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -988,6 +988,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) [ilog2(VM_SHADOW_STACK)] = "ss", #endif #ifdef CONFIG_64BIT + [ilog2(VM_DROPPABLE)] = "dp", [ilog2(VM_SEALED)] = "sl", #endif }; diff --git a/include/linux/mm.h b/include/linux/mm.h index 7d044e737dba..aa4fccb2a693 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -406,6 +406,13 @@ extern unsigned int kobjsize(const void *objp); #define VM_ALLOW_ANY_UNCACHED VM_NONE #endif +#ifdef CONFIG_64BIT +#define VM_DROPPABLE_BIT 40 +#define VM_DROPPABLE BIT(VM_DROPPABLE_BIT) +#else +#define VM_DROPPABLE VM_NONE +#endif + #ifdef CONFIG_64BIT /* VM is sealed, in vm_flags */ #define VM_SEALED _BITUL(63) diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 05d59f74fc88..a12bcf042551 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -218,6 +218,9 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma, { vm_flags &= __VM_UFFD_FLAGS; + if (vm_flags & VM_DROPPABLE) + return false; + if ((vm_flags & VM_UFFD_MINOR) && (!is_vm_hugetlb_page(vma) && !vma_is_shmem(vma))) return false; diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index e46d6e82765e..b63d211bd141 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -165,6 +165,12 @@ IF_HAVE_PG_ARCH_X(arch_3) # define IF_HAVE_UFFD_MINOR(flag, name) #endif +#ifdef CONFIG_64BIT +# define IF_HAVE_VM_DROPPABLE(flag, name) {flag, name}, +#else +# define IF_HAVE_VM_DROPPABLE(flag, name) +#endif + #define __def_vmaflag_names \ {VM_READ, "read" }, \ {VM_WRITE, "write" }, \ @@ -197,6 +203,7 @@ IF_HAVE_VM_SOFTDIRTY(VM_SOFTDIRTY, "softdirty" ) \ {VM_MIXEDMAP, "mixedmap" }, \ {VM_HUGEPAGE, "hugepage" }, \ {VM_NOHUGEPAGE, "nohugepage" }, \ +IF_HAVE_VM_DROPPABLE(VM_DROPPABLE, "droppable" ) \ {VM_MERGEABLE, "mergeable" } \ #define show_vma_flags(flags) \ diff --git a/include/uapi/linux/mman.h b/include/uapi/linux/mman.h index a246e11988d5..e89d00528f2f 100644 --- a/include/uapi/linux/mman.h +++ b/include/uapi/linux/mman.h @@ -17,6 +17,7 @@ #define MAP_SHARED 0x01 /* Share changes */ #define MAP_PRIVATE 0x02 /* Changes are private */ #define MAP_SHARED_VALIDATE 0x03 /* share + validate extension flags */ +#define MAP_DROPPABLE 0x08 /* Zero memory under memory pressure. */ /* * Huge page size encoding when MAP_HUGETLB is specified, and a huge page diff --git a/include/uapi/linux/random.h b/include/uapi/linux/random.h index e744c23582eb..1dd047ec98a1 100644 --- a/include/uapi/linux/random.h +++ b/include/uapi/linux/random.h @@ -20,7 +20,7 @@ /* Add to (or subtract from) the entropy count. (Superuser only.) */ #define RNDADDTOENTCNT _IOW( 'R', 0x01, int ) -/* Get the contents of the entropy pool. 
(Superuser only.) */ +/* Get the contents of the entropy pool. (Superuser only.) (Removed in 2.6.9-rc2.) */ #define RNDGETPOOL _IOR( 'R', 0x02, int [2] ) /* @@ -55,4 +55,19 @@ struct rand_pool_info { #define GRND_RANDOM 0x0002 #define GRND_INSECURE 0x0004 +/** + * struct vgetrandom_opaque_params - arguments for allocating memory for vgetrandom + * + * @size_of_opaque_state: Size of each state that is to be passed to vgetrandom(). + * @mmap_prot: Value of the prot argument in mmap(2). + * @mmap_flags: Value of the flags argument in mmap(2). + * @reserved: Reserved for future use. + */ +struct vgetrandom_opaque_params { + __u32 size_of_opaque_state; + __u32 mmap_prot; + __u32 mmap_flags; + __u32 reserved[13]; +}; + #endif /* _UAPI_LINUX_RANDOM_H */ diff --git a/include/vdso/datapage.h b/include/vdso/datapage.h index 7647e0946f50..b85f24cac3f5 100644 --- a/include/vdso/datapage.h +++ b/include/vdso/datapage.h @@ -117,6 +117,16 @@ struct vdso_data { struct arch_vdso_data arch_data; }; +/** + * struct vdso_rng_data - vdso RNG state information + * @generation: counter representing the number of RNG reseeds + * @is_ready: boolean signaling whether the RNG is initialized + */ +struct vdso_rng_data { + u64 generation; + u8 is_ready; +}; + /* * We use the hidden visibility to prevent the compiler from generating a GOT * relocation. Not only is going through a GOT useless (the entry couldn't and @@ -128,6 +138,7 @@ struct vdso_data { */ extern struct vdso_data _vdso_data[CS_BASES] __attribute__((visibility("hidden"))); extern struct vdso_data _timens_data[CS_BASES] __attribute__((visibility("hidden"))); +extern struct vdso_rng_data _vdso_rng_data __attribute__((visibility("hidden"))); /** * union vdso_data_store - Generic vDSO data page diff --git a/include/vdso/getrandom.h b/include/vdso/getrandom.h new file mode 100644 index 000000000000..a8b7c14b0ae0 --- /dev/null +++ b/include/vdso/getrandom.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2022-2024 Jason A. Donenfeld . All Rights Reserved. + */ + +#ifndef _VDSO_GETRANDOM_H +#define _VDSO_GETRANDOM_H + +#include + +#define CHACHA_KEY_SIZE 32 +#define CHACHA_BLOCK_SIZE 64 + +/** + * struct vgetrandom_state - State used by vDSO getrandom(). + * + * @batch: One and a half ChaCha20 blocks of buffered RNG output. + * + * @key: Key to be used for generating next batch. + * + * @batch_key: Union of the prior two members, which is exactly two full + * ChaCha20 blocks in size, so that @batch and @key can be filled + * together. + * + * @generation: Snapshot of @rng_info->generation in the vDSO data page at + * the time @key was generated. + * + * @pos: Offset into @batch of the next available random byte. + * + * @in_use: Reentrancy guard for reusing a state within the same thread + * due to signal handlers. + */ +struct vgetrandom_state { + union { + struct { + u8 batch[CHACHA_BLOCK_SIZE * 3 / 2]; + u32 key[CHACHA_KEY_SIZE / sizeof(u32)]; + }; + u8 batch_key[CHACHA_BLOCK_SIZE * 2]; + }; + u64 generation; + u8 pos; + bool in_use; +}; + +#endif /* _VDSO_GETRANDOM_H */ diff --git a/lib/vdso/Kconfig b/lib/vdso/Kconfig index c46c2300517c..82fe827af542 100644 --- a/lib/vdso/Kconfig +++ b/lib/vdso/Kconfig @@ -38,3 +38,8 @@ config GENERIC_VDSO_OVERFLOW_PROTECT in the hotpath. endif + +config VDSO_GETRANDOM + bool + help + Selected by architectures that support vDSO getrandom().
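For illustration only, not part of the patch: userspace consumes struct vgetrandom_opaque_params through the two-step handshake documented in __cvdso_getrandom_data() below. The minimal sketch that follows assumes __vdso_getrandom has already been resolved from the vDSO (for instance with the parse_vdso.c helper used by vdso_test_getrandom.c) and stored in the vgetrandom function pointer; vgetrandom_alloc_state() is a hypothetical helper name, not something this patch adds.

/* Sketch only: resolve __vdso_getrandom from the vDSO before calling this. */
#include <stddef.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <linux/random.h>	/* struct vgetrandom_opaque_params, added by this patch */

static ssize_t (*vgetrandom)(void *buf, size_t len, unsigned int flags,
			     void *opaque_state, size_t opaque_len);

static void *vgetrandom_alloc_state(struct vgetrandom_opaque_params *params)
{
	void *state;

	/* Probe call: buffer, len and flags are all 0 and opaque_len is ~0UL,
	 * so the vDSO fills in *params and returns 0 instead of generating bytes. */
	if (vgetrandom(NULL, 0, 0, params, ~0UL) != 0)
		return NULL;	/* vDSO getrandom() not usable; fall back to the syscall. */

	/* Allocate one state per thread, using exactly the prot and flags the
	 * kernel asked for (MAP_DROPPABLE | MAP_ANONYMOUS in this patch). */
	state = mmap(NULL, params->size_of_opaque_state, params->mmap_prot,
		     params->mmap_flags, -1, 0);
	return state == MAP_FAILED ? NULL : state;
}

Random bytes are then drawn with vgetrandom(buf, len, 0, state, params->size_of_opaque_state), which returns the number of bytes written or a negative error, and which transparently falls back to the getrandom() syscall whenever the state or the kernel RNG cannot be used.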
diff --git a/lib/vdso/getrandom.c b/lib/vdso/getrandom.c new file mode 100644 index 000000000000..b230f0b10832 --- /dev/null +++ b/lib/vdso/getrandom.c @@ -0,0 +1,251 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2022-2024 Jason A. Donenfeld . All Rights Reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MEMCPY_AND_ZERO_SRC(type, dst, src, len) do { \ + while (len >= sizeof(type)) { \ + __put_unaligned_t(type, __get_unaligned_t(type, src), dst); \ + __put_unaligned_t(type, 0, src); \ + dst += sizeof(type); \ + src += sizeof(type); \ + len -= sizeof(type); \ + } \ +} while (0) + +static void memcpy_and_zero_src(void *dst, void *src, size_t len) +{ + if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) { + if (IS_ENABLED(CONFIG_64BIT)) + MEMCPY_AND_ZERO_SRC(u64, dst, src, len); + MEMCPY_AND_ZERO_SRC(u32, dst, src, len); + MEMCPY_AND_ZERO_SRC(u16, dst, src, len); + } + MEMCPY_AND_ZERO_SRC(u8, dst, src, len); +} + +/** + * __cvdso_getrandom_data - Generic vDSO implementation of getrandom() syscall. + * @rng_info: Describes state of kernel RNG, memory shared with kernel. + * @buffer: Destination buffer to fill with random bytes. + * @len: Size of @buffer in bytes. + * @flags: Zero or more GRND_* flags. + * @opaque_state: Pointer to an opaque state area. + * @opaque_len: Length of opaque state area. + * + * This implements a "fast key erasure" RNG using ChaCha20, in the same way that the kernel's + * getrandom() syscall does. It periodically reseeds its key from the kernel's RNG, at the same + * schedule that the kernel's RNG is reseeded. If the kernel's RNG is not ready, then this always + * calls into the syscall. + * + * If @buffer, @len, and @flags are 0, and @opaque_len is ~0UL, then @opaque_state is populated + * with a struct vgetrandom_opaque_params and the function returns 0; if it does not return 0, + * this function should not be used. + * + * @opaque_state *must* be allocated by calling mmap(2) using the mmap_prot and mmap_flags fields + * from the struct vgetrandom_opaque_params, and states must not straddle pages. Unless external + * locking is used, one state must be allocated per thread, as it is not safe to call this function + * concurrently with the same @opaque_state. However, it is safe to call this using the same + * @opaque_state that is shared between main code and signal handling code, within the same thread. + * + * Returns: The number of random bytes written to @buffer, or a negative value indicating an error. + */ +static __always_inline ssize_t +__cvdso_getrandom_data(const struct vdso_rng_data *rng_info, void *buffer, size_t len, + unsigned int flags, void *opaque_state, size_t opaque_len) +{ + ssize_t ret = min_t(size_t, INT_MAX & PAGE_MASK /* = MAX_RW_COUNT */, len); + struct vgetrandom_state *state = opaque_state; + size_t batch_len, nblocks, orig_len = len; + bool in_use, have_retried = false; + unsigned long current_generation; + void *orig_buffer = buffer; + u32 counter[2] = { 0 }; + + if (unlikely(opaque_len == ~0UL && !buffer && !len && !flags)) { + *(struct vgetrandom_opaque_params *)opaque_state = (struct vgetrandom_opaque_params) { + .size_of_opaque_state = sizeof(*state), + .mmap_prot = PROT_READ | PROT_WRITE, + .mmap_flags = MAP_DROPPABLE | MAP_ANONYMOUS + }; + return 0; + } + + /* The state must not straddle a page, since pages can be zeroed at any time. 
*/ + if (unlikely(((unsigned long)opaque_state & ~PAGE_MASK) + sizeof(*state) > PAGE_SIZE)) + return -EFAULT; + + /* If the caller passes the wrong size, which might happen due to CRIU, fallback. */ + if (unlikely(opaque_len != sizeof(*state))) + goto fallback_syscall; + + /* + * If the kernel's RNG is not yet ready, then it's not possible to provide random bytes from + * userspace, because A) the various @flags require this to block, or not, depending on + * various factors unavailable to userspace, and B) the kernel's behavior before the RNG is + * ready is to reseed from the entropy pool at every invocation. + */ + if (unlikely(!READ_ONCE(rng_info->is_ready))) + goto fallback_syscall; + + /* + * This condition is checked after @rng_info->is_ready, because before the kernel's RNG is + * initialized, the @flags parameter may require this to block or return an error, even when + * len is zero. + */ + if (unlikely(!len)) + return 0; + + /* + * @state->in_use is basic reentrancy protection against this running in a signal handler + * with the same @opaque_state, but obviously not atomic wrt multiple CPUs or more than one + * level of reentrancy. If a signal interrupts this after reading @state->in_use, but before + * writing @state->in_use, there is still no race, because the signal handler will run to + * its completion before returning execution. + */ + in_use = READ_ONCE(state->in_use); + if (unlikely(in_use)) + /* The syscall simply fills the buffer and does not touch @state, so fallback. */ + goto fallback_syscall; + WRITE_ONCE(state->in_use, true); + +retry_generation: + /* + * @rng_info->generation must always be read here, as it serializes @state->key with the + * kernel's RNG reseeding schedule. + */ + current_generation = READ_ONCE(rng_info->generation); + + /* + * If @state->generation doesn't match the kernel RNG's generation, then it means the + * kernel's RNG has reseeded, and so @state->key is reseeded as well. + */ + if (unlikely(state->generation != current_generation)) { + /* + * Write the generation before filling the key, in case of fork. If there is a fork + * just after this line, the parent and child will get different random bytes from + * the syscall, which is good. However, were this line to occur after the getrandom + * syscall, then both child and parent could have the same bytes and the same + * generation counter, so the fork would not be detected. Therefore, write + * @state->generation before the call to the getrandom syscall. + */ + WRITE_ONCE(state->generation, current_generation); + + /* + * Prevent the syscall from being reordered wrt current_generation. Pairs with the + * smp_store_release(&_vdso_rng_data.generation) in random.c. + */ + smp_rmb(); + + /* Reseed @state->key using fresh bytes from the kernel. */ + if (getrandom_syscall(state->key, sizeof(state->key), 0) != sizeof(state->key)) { + /* + * If the syscall failed to refresh the key, then @state->key is now + * invalid, so invalidate the generation so that it is not used again, and + * fallback to using the syscall entirely. + */ + WRITE_ONCE(state->generation, 0); + + /* + * Set @state->in_use to false only after the last write to @state in the + * line above. + */ + WRITE_ONCE(state->in_use, false); + + goto fallback_syscall; + } + + /* + * Set @state->pos to beyond the end of the batch, so that the batch is refilled + * using the new key. + */ + state->pos = sizeof(state->batch); + } + + /* Set len to the total amount of bytes that this function is allowed to read, ret. 
*/ + len = ret; +more_batch: + /* + * First use bytes out of @state->batch, which may have been filled by the last call to this + * function. + */ + batch_len = min_t(size_t, sizeof(state->batch) - state->pos, len); + if (batch_len) { + /* Zeroing at the same time as memcpying helps preserve forward secrecy. */ + memcpy_and_zero_src(buffer, state->batch + state->pos, batch_len); + state->pos += batch_len; + buffer += batch_len; + len -= batch_len; + } + + if (!len) { + /* Prevent the loop from being reordered wrt ->generation. */ + barrier(); + + /* + * Since @rng_info->generation will never be 0, re-read @state->generation, rather + * than using the local current_generation variable, to learn whether a fork + * occurred or if @state was zeroed due to memory pressure. Primarily, though, this + * indicates whether the kernel's RNG has reseeded, in which case generate a new key + * and start over. + */ + if (unlikely(READ_ONCE(state->generation) != READ_ONCE(rng_info->generation))) { + /* + * Prevent this from looping forever in case of low memory or racing with a + * user force-reseeding the kernel's RNG using the ioctl. + */ + if (have_retried) { + WRITE_ONCE(state->in_use, false); + goto fallback_syscall; + } + + have_retried = true; + buffer = orig_buffer; + goto retry_generation; + } + + /* + * Set @state->in_use to false only when there will be no more reads or writes of + * @state. + */ + WRITE_ONCE(state->in_use, false); + return ret; + } + + /* Generate blocks of RNG output directly into @buffer while there's enough room left. */ + nblocks = len / CHACHA_BLOCK_SIZE; + if (nblocks) { + __arch_chacha20_blocks_nostack(buffer, state->key, counter, nblocks); + buffer += nblocks * CHACHA_BLOCK_SIZE; + len -= nblocks * CHACHA_BLOCK_SIZE; + } + + BUILD_BUG_ON(sizeof(state->batch_key) % CHACHA_BLOCK_SIZE != 0); + + /* Refill the batch and overwrite the key, in order to preserve forward secrecy. */ + __arch_chacha20_blocks_nostack(state->batch_key, state->key, counter, + sizeof(state->batch_key) / CHACHA_BLOCK_SIZE); + + /* Since the batch was just refilled, set the position back to 0 to indicate a full batch. 
*/ + state->pos = 0; + goto more_batch; + +fallback_syscall: + return getrandom_syscall(orig_buffer, orig_len, flags); +} + +static __always_inline ssize_t +__cvdso_getrandom(void *buffer, size_t len, unsigned int flags, void *opaque_state, size_t opaque_len) +{ + return __cvdso_getrandom_data(__arch_get_vdso_rng_data(), buffer, len, flags, opaque_state, opaque_len); +} diff --git a/mm/ksm.c b/mm/ksm.c index df6bae3a5a2c..14d9e53b1ec2 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -713,7 +713,7 @@ static bool vma_ksm_compatible(struct vm_area_struct *vma) { if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE | VM_PFNMAP | VM_IO | VM_DONTEXPAND | VM_HUGETLB | - VM_MIXEDMAP)) + VM_MIXEDMAP| VM_DROPPABLE)) return false; /* just ignore the advice */ if (vma_is_dax(vma)) diff --git a/mm/madvise.c b/mm/madvise.c index 96c026fe0c99..89089d84f8df 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1068,13 +1068,16 @@ static int madvise_vma_behavior(struct vm_area_struct *vma, new_flags |= VM_WIPEONFORK; break; case MADV_KEEPONFORK: + if (vma->vm_flags & VM_DROPPABLE) + return -EINVAL; new_flags &= ~VM_WIPEONFORK; break; case MADV_DONTDUMP: new_flags |= VM_DONTDUMP; break; case MADV_DODUMP: - if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) + if ((!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) || + (vma->vm_flags & VM_DROPPABLE)) return -EINVAL; new_flags &= ~VM_DONTDUMP; break; diff --git a/mm/memory.c b/mm/memory.c index 4bcd79619574..1ff7b6f51ec1 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5801,6 +5801,7 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, /* If the fault handler drops the mmap_lock, vma may be freed */ struct mm_struct *mm = vma->vm_mm; vm_fault_t ret; + bool is_droppable; __set_current_state(TASK_RUNNING); @@ -5815,6 +5816,8 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, goto out; } + is_droppable = !!(vma->vm_flags & VM_DROPPABLE); + /* * Enable the memcg OOM handling for faults triggered in user * space. Kernel faults are handled more gracefully. @@ -5829,8 +5832,18 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, else ret = __handle_mm_fault(vma, address, flags); + /* + * Warning: It is no longer safe to dereference vma-> after this point, + * because mmap_lock might have been dropped by __handle_mm_fault(), so + * vma might be destroyed from underneath us. + */ + lru_gen_exit_fault(); + /* If the mapping is droppable, then errors due to OOM aren't fatal. 
*/ + if (is_droppable) + ret &= ~VM_FAULT_OOM; + if (flags & FAULT_FLAG_USER) { mem_cgroup_exit_user_fault(); /* diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 327a19b0883d..b858e22b259d 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2305,6 +2305,9 @@ struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct pgoff_t ilx; struct folio *folio; + if (vma->vm_flags & VM_DROPPABLE) + gfp |= __GFP_NOWARN; + pol = get_vma_policy(vma, addr, order, &ilx); folio = folio_alloc_mpol_noprof(gfp, order, pol, ilx, numa_node_id()); mpol_cond_put(pol); diff --git a/mm/mlock.c b/mm/mlock.c index 52d6e401ad67..e3e3dc2b2956 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -474,7 +474,7 @@ static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, if (newflags == oldflags || (oldflags & VM_SPECIAL) || is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) || - vma_is_dax(vma) || vma_is_secretmem(vma)) + vma_is_dax(vma) || vma_is_secretmem(vma) || (oldflags & VM_DROPPABLE)) /* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */ goto out; diff --git a/mm/mmap.c b/mm/mmap.c index e42d89f98071..d0dfc85b209b 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1410,6 +1410,36 @@ unsigned long do_mmap(struct file *file, unsigned long addr, pgoff = 0; vm_flags |= VM_SHARED | VM_MAYSHARE; break; + case MAP_DROPPABLE: + if (VM_DROPPABLE == VM_NONE) + return -ENOTSUPP; + /* + * A locked or stack area makes no sense to be droppable. + * + * Also, since droppable pages can just go away at any time + * it makes no sense to copy them on fork or dump them. + * + * And don't attempt to combine with hugetlb for now. + */ + if (flags & (MAP_LOCKED | MAP_HUGETLB)) + return -EINVAL; + if (vm_flags & (VM_GROWSDOWN | VM_GROWSUP)) + return -EINVAL; + + vm_flags |= VM_DROPPABLE; + + /* + * If the pages can be dropped, then it doesn't make + * sense to reserve them. + */ + vm_flags |= VM_NORESERVE; + + /* + * Likewise, they're volatile enough that they + * shouldn't survive forks or coredumps. + */ + vm_flags |= VM_WIPEONFORK | VM_DONTDUMP; + fallthrough; case MAP_PRIVATE: /* * Set pgoff according to addr for anon_vma. diff --git a/mm/rmap.c b/mm/rmap.c index 8616308610b9..2490e727e2dc 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1412,7 +1412,11 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma, VM_BUG_ON_VMA(address < vma->vm_start || address + (nr << PAGE_SHIFT) > vma->vm_end, vma); - if (!folio_test_swapbacked(folio)) + /* + * VM_DROPPABLE mappings don't swap; instead they're just dropped when + * under memory pressure. + */ + if (!folio_test_swapbacked(folio) && !(vma->vm_flags & VM_DROPPABLE)) __folio_set_swapbacked(folio); __folio_set_anon(folio, vma, address, exclusive); @@ -1848,7 +1852,13 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, * plus the rmap(s) (dropped by discard:). */ if (ref_count == 1 + map_count && - !folio_test_dirty(folio)) { + (!folio_test_dirty(folio) || + /* + * Unlike MADV_FREE mappings, VM_DROPPABLE + * ones can be dropped even if they've + * been dirtied. + */ + (vma->vm_flags & VM_DROPPABLE))) { dec_mm_counter(mm, MM_ANONPAGES); goto discard; } @@ -1858,7 +1868,12 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, * discarded. Remap the page to page table. */ set_pte_at(mm, address, pvmw.pte, pteval); - folio_set_swapbacked(folio); + /* + * Unlike MADV_FREE mappings, VM_DROPPABLE ones + * never get swap backed on failure to drop. 
+ */ + if (!(vma->vm_flags & VM_DROPPABLE)) + folio_set_swapbacked(folio); goto walk_abort; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 525d3ffa8451..cfa839284b92 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4301,15 +4301,6 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c return true; } - /* dirty lazyfree */ - if (type == LRU_GEN_FILE && folio_test_anon(folio) && folio_test_dirty(folio)) { - success = lru_gen_del_folio(lruvec, folio, true); - VM_WARN_ON_ONCE_FOLIO(!success, folio); - folio_set_swapbacked(folio); - lruvec_add_folio_tail(lruvec, folio); - return true; - } - /* promoted */ if (gen != lru_gen_from_seq(lrugen->min_seq[type])) { list_move(&folio->lru, &lrugen->folios[gen][type][zone]); diff --git a/tools/include/asm/rwonce.h b/tools/include/asm/rwonce.h new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tools/include/uapi/linux/mman.h b/tools/include/uapi/linux/mman.h index a246e11988d5..e89d00528f2f 100644 --- a/tools/include/uapi/linux/mman.h +++ b/tools/include/uapi/linux/mman.h @@ -17,6 +17,7 @@ #define MAP_SHARED 0x01 /* Share changes */ #define MAP_PRIVATE 0x02 /* Changes are private */ #define MAP_SHARED_VALIDATE 0x03 /* share + validate extension flags */ +#define MAP_DROPPABLE 0x08 /* Zero memory under memory pressure. */ /* * Huge page size encoding when MAP_HUGETLB is specified, and a huge page diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore index 064e7b125643..da030b43e43b 100644 --- a/tools/testing/selftests/mm/.gitignore +++ b/tools/testing/selftests/mm/.gitignore @@ -50,3 +50,4 @@ hugetlb_fault_after_madv hugetlb_madv_vs_map mseal_test seal_elf +droppable diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index e1aa09ddaa3d..901e0d07765b 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -76,6 +76,7 @@ TEST_GEN_FILES += mdwe_test TEST_GEN_FILES += hugetlb_fault_after_madv TEST_GEN_FILES += hugetlb_madv_vs_map TEST_GEN_FILES += hugetlb_dio +TEST_GEN_FILES += droppable ifneq ($(ARCH),arm64) TEST_GEN_FILES += soft-dirty diff --git a/tools/testing/selftests/mm/droppable.c b/tools/testing/selftests/mm/droppable.c new file mode 100644 index 000000000000..f3d9ecf96890 --- /dev/null +++ b/tools/testing/selftests/mm/droppable.c @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2024 Jason A. Donenfeld . All Rights Reserved. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../kselftest.h" + +int main(int argc, char *argv[]) +{ + size_t alloc_size = 134217728; + size_t page_size = getpagesize(); + void *alloc; + pid_t child; + + ksft_print_header(); + ksft_set_plan(1); + + alloc = mmap(0, alloc_size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_DROPPABLE, -1, 0); + assert(alloc != MAP_FAILED); + memset(alloc, 'A', alloc_size); + for (size_t i = 0; i < alloc_size; i += page_size) + assert(*(uint8_t *)(alloc + i)); + + child = fork(); + assert(child >= 0); + if (!child) { + for (;;) + *(char *)malloc(page_size) = 'B'; + } + + for (bool done = false; !done;) { + for (size_t i = 0; i < alloc_size; i += page_size) { + if (!*(uint8_t *)(alloc + i)) { + done = true; + break; + } + } + } + kill(child, SIGTERM); + + ksft_test_result_pass("MAP_DROPPABLE: PASS\n"); + exit(KSFT_PASS); +} diff --git a/tools/testing/selftests/vDSO/.gitignore b/tools/testing/selftests/vDSO/.gitignore index a8dc51af5a9c..30d5c8f0e5c7 100644 --- a/tools/testing/selftests/vDSO/.gitignore +++ b/tools/testing/selftests/vDSO/.gitignore @@ -6,3 +6,5 @@ vdso_test_correctness vdso_test_gettimeofday vdso_test_getcpu vdso_standalone_test_x86 +vdso_test_getrandom +vdso_test_chacha diff --git a/tools/testing/selftests/vDSO/Makefile b/tools/testing/selftests/vDSO/Makefile index 98d8ba2afa00..3de8e7e052ae 100644 --- a/tools/testing/selftests/vDSO/Makefile +++ b/tools/testing/selftests/vDSO/Makefile @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 uname_M := $(shell uname -m 2>/dev/null || echo not) ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/x86/ -e s/x86_64/x86/) +SODIUM := $(shell pkg-config --libs libsodium 2>/dev/null) TEST_GEN_PROGS := vdso_test_gettimeofday TEST_GEN_PROGS += vdso_test_getcpu @@ -10,6 +11,12 @@ ifeq ($(ARCH),$(filter $(ARCH),x86 x86_64)) TEST_GEN_PROGS += vdso_standalone_test_x86 endif TEST_GEN_PROGS += vdso_test_correctness +ifeq ($(uname_M),x86_64) +TEST_GEN_PROGS += vdso_test_getrandom +ifneq ($(SODIUM),) +TEST_GEN_PROGS += vdso_test_chacha +endif +endif CFLAGS := -std=gnu99 @@ -28,3 +35,14 @@ $(OUTPUT)/vdso_standalone_test_x86: CFLAGS +=-nostdlib -fno-asynchronous-unwind- $(OUTPUT)/vdso_test_correctness: vdso_test_correctness.c $(OUTPUT)/vdso_test_correctness: LDFLAGS += -ldl + +$(OUTPUT)/vdso_test_getrandom: parse_vdso.c +$(OUTPUT)/vdso_test_getrandom: CFLAGS += -isystem $(top_srcdir)/tools/include \ + -isystem $(top_srcdir)/include/uapi + +$(OUTPUT)/vdso_test_chacha: $(top_srcdir)/arch/$(ARCH)/entry/vdso/vgetrandom-chacha.S +$(OUTPUT)/vdso_test_chacha: CFLAGS += -idirafter $(top_srcdir)/tools/include \ + -isystem $(top_srcdir)/arch/$(ARCH)/include \ + -isystem $(top_srcdir)/include \ + -D__ASSEMBLY__ -DBULID_VDSO -DCONFIG_FUNCTION_ALIGNMENT=0 \ + -Wa,--noexecstack $(SODIUM) diff --git a/tools/testing/selftests/vDSO/vdso_test_chacha.c b/tools/testing/selftests/vDSO/vdso_test_chacha.c new file mode 100644 index 000000000000..e38f44e5f803 --- /dev/null +++ b/tools/testing/selftests/vDSO/vdso_test_chacha.c @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2022-2024 Jason A. Donenfeld . All Rights Reserved. 
+ */ + +#include +#include +#include +#include +#include "../kselftest.h" + +extern void __arch_chacha20_blocks_nostack(uint8_t *dst_bytes, const uint8_t *key, uint32_t *counter, size_t nblocks); + +int main(int argc, char *argv[]) +{ + enum { TRIALS = 1000, BLOCKS = 128, BLOCK_SIZE = 64 }; + static const uint8_t nonce[8] = { 0 }; + uint32_t counter[2]; + uint8_t key[32]; + uint8_t output1[BLOCK_SIZE * BLOCKS], output2[BLOCK_SIZE * BLOCKS]; + + ksft_print_header(); + ksft_set_plan(1); + + for (unsigned int trial = 0; trial < TRIALS; ++trial) { + if (getrandom(key, sizeof(key), 0) != sizeof(key)) { + printf("getrandom() failed!\n"); + return KSFT_SKIP; + } + crypto_stream_chacha20(output1, sizeof(output1), nonce, key); + for (unsigned int split = 0; split < BLOCKS; ++split) { + memset(output2, 'X', sizeof(output2)); + memset(counter, 0, sizeof(counter)); + if (split) + __arch_chacha20_blocks_nostack(output2, key, counter, split); + __arch_chacha20_blocks_nostack(output2 + split * BLOCK_SIZE, key, counter, BLOCKS - split); + if (memcmp(output1, output2, sizeof(output1))) + return KSFT_FAIL; + } + } + ksft_test_result_pass("chacha: PASS\n"); + return KSFT_PASS; +} diff --git a/tools/testing/selftests/vDSO/vdso_test_getrandom.c b/tools/testing/selftests/vDSO/vdso_test_getrandom.c new file mode 100644 index 000000000000..05122425a873 --- /dev/null +++ b/tools/testing/selftests/vDSO/vdso_test_getrandom.c @@ -0,0 +1,288 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2022-2024 Jason A. Donenfeld . All Rights Reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../kselftest.h" +#include "parse_vdso.h" + +#ifndef timespecsub +#define timespecsub(tsp, usp, vsp) \ + do { \ + (vsp)->tv_sec = (tsp)->tv_sec - (usp)->tv_sec; \ + (vsp)->tv_nsec = (tsp)->tv_nsec - (usp)->tv_nsec; \ + if ((vsp)->tv_nsec < 0) { \ + (vsp)->tv_sec--; \ + (vsp)->tv_nsec += 1000000000L; \ + } \ + } while (0) +#endif + +static struct { + pthread_mutex_t lock; + void **states; + size_t len, cap; +} grnd_allocator = { + .lock = PTHREAD_MUTEX_INITIALIZER +}; + +static struct { + ssize_t(*fn)(void *, size_t, unsigned long, void *, size_t); + pthread_key_t key; + pthread_once_t initialized; + struct vgetrandom_opaque_params params; +} grnd_ctx = { + .initialized = PTHREAD_ONCE_INIT +}; + +static void *vgetrandom_get_state(void) +{ + void *state = NULL; + + pthread_mutex_lock(&grnd_allocator.lock); + if (!grnd_allocator.len) { + size_t page_size = getpagesize(); + size_t new_cap; + size_t alloc_size, num = sysconf(_SC_NPROCESSORS_ONLN); /* Just a decent heuristic. 
*/ + void *new_block, *new_states; + + alloc_size = (num * grnd_ctx.params.size_of_opaque_state + page_size - 1) & (~(page_size - 1)); + num = (page_size / grnd_ctx.params.size_of_opaque_state) * (alloc_size / page_size); + new_block = mmap(0, alloc_size, grnd_ctx.params.mmap_prot, grnd_ctx.params.mmap_flags, -1, 0); + if (new_block == MAP_FAILED) + goto out; + + new_cap = grnd_allocator.cap + num; + new_states = reallocarray(grnd_allocator.states, new_cap, sizeof(*grnd_allocator.states)); + if (!new_states) + goto unmap; + grnd_allocator.cap = new_cap; + grnd_allocator.states = new_states; + + for (size_t i = 0; i < num; ++i) { + if (((uintptr_t)new_block & (page_size - 1)) + grnd_ctx.params.size_of_opaque_state > page_size) + new_block = (void *)(((uintptr_t)new_block + page_size - 1) & (~(page_size - 1))); + grnd_allocator.states[i] = new_block; + new_block += grnd_ctx.params.size_of_opaque_state; + } + grnd_allocator.len = num; + goto success; + + unmap: + munmap(new_block, alloc_size); + goto out; + } +success: + state = grnd_allocator.states[--grnd_allocator.len]; + +out: + pthread_mutex_unlock(&grnd_allocator.lock); + return state; +} + +static void vgetrandom_put_state(void *state) +{ + if (!state) + return; + pthread_mutex_lock(&grnd_allocator.lock); + grnd_allocator.states[grnd_allocator.len++] = state; + pthread_mutex_unlock(&grnd_allocator.lock); +} + +static void vgetrandom_init(void) +{ + if (pthread_key_create(&grnd_ctx.key, vgetrandom_put_state) != 0) + return; + unsigned long sysinfo_ehdr = getauxval(AT_SYSINFO_EHDR); + if (!sysinfo_ehdr) { + printf("AT_SYSINFO_EHDR is not present!\n"); + exit(KSFT_SKIP); + } + vdso_init_from_sysinfo_ehdr(sysinfo_ehdr); + grnd_ctx.fn = (__typeof__(grnd_ctx.fn))vdso_sym("LINUX_2.6", "__vdso_getrandom"); + if (!grnd_ctx.fn) { + printf("__vdso_getrandom is missing!\n"); + exit(KSFT_FAIL); + } + if (grnd_ctx.fn(NULL, 0, 0, &grnd_ctx.params, ~0UL) != 0) { + printf("failed to fetch vgetrandom params!\n"); + exit(KSFT_FAIL); + } +} + +static ssize_t vgetrandom(void *buf, size_t len, unsigned long flags) +{ + void *state; + + pthread_once(&grnd_ctx.initialized, vgetrandom_init); + state = pthread_getspecific(grnd_ctx.key); + if (!state) { + state = vgetrandom_get_state(); + if (pthread_setspecific(grnd_ctx.key, state) != 0) { + vgetrandom_put_state(state); + state = NULL; + } + if (!state) { + printf("vgetrandom_get_state failed!\n"); + exit(KSFT_FAIL); + } + } + return grnd_ctx.fn(buf, len, flags, state, grnd_ctx.params.size_of_opaque_state); +} + +enum { TRIALS = 25000000, THREADS = 256 }; + +static void *test_vdso_getrandom(void *) +{ + for (size_t i = 0; i < TRIALS; ++i) { + unsigned int val; + ssize_t ret = vgetrandom(&val, sizeof(val), 0); + assert(ret == sizeof(val)); + } + return NULL; +} + +static void *test_libc_getrandom(void *) +{ + for (size_t i = 0; i < TRIALS; ++i) { + unsigned int val; + ssize_t ret = getrandom(&val, sizeof(val), 0); + assert(ret == sizeof(val)); + } + return NULL; +} + +static void *test_syscall_getrandom(void *) +{ + for (size_t i = 0; i < TRIALS; ++i) { + unsigned int val; + ssize_t ret = syscall(__NR_getrandom, &val, sizeof(val), 0); + assert(ret == sizeof(val)); + } + return NULL; +} + +static void bench_single(void) +{ + struct timespec start, end, diff; + + clock_gettime(CLOCK_MONOTONIC, &start); + test_vdso_getrandom(NULL); + clock_gettime(CLOCK_MONOTONIC, &end); + timespecsub(&end, &start, &diff); + printf(" vdso: %u times in %lu.%09lu seconds\n", TRIALS, diff.tv_sec, diff.tv_nsec); + + 
clock_gettime(CLOCK_MONOTONIC, &start); + test_libc_getrandom(NULL); + clock_gettime(CLOCK_MONOTONIC, &end); + timespecsub(&end, &start, &diff); + printf(" libc: %u times in %lu.%09lu seconds\n", TRIALS, diff.tv_sec, diff.tv_nsec); + + clock_gettime(CLOCK_MONOTONIC, &start); + test_syscall_getrandom(NULL); + clock_gettime(CLOCK_MONOTONIC, &end); + timespecsub(&end, &start, &diff); + printf("syscall: %u times in %lu.%09lu seconds\n", TRIALS, diff.tv_sec, diff.tv_nsec); +} + +static void bench_multi(void) +{ + struct timespec start, end, diff; + pthread_t threads[THREADS]; + + clock_gettime(CLOCK_MONOTONIC, &start); + for (size_t i = 0; i < THREADS; ++i) + assert(pthread_create(&threads[i], NULL, test_vdso_getrandom, NULL) == 0); + for (size_t i = 0; i < THREADS; ++i) + pthread_join(threads[i], NULL); + clock_gettime(CLOCK_MONOTONIC, &end); + timespecsub(&end, &start, &diff); + printf(" vdso: %u x %u times in %lu.%09lu seconds\n", TRIALS, THREADS, diff.tv_sec, diff.tv_nsec); + + clock_gettime(CLOCK_MONOTONIC, &start); + for (size_t i = 0; i < THREADS; ++i) + assert(pthread_create(&threads[i], NULL, test_libc_getrandom, NULL) == 0); + for (size_t i = 0; i < THREADS; ++i) + pthread_join(threads[i], NULL); + clock_gettime(CLOCK_MONOTONIC, &end); + timespecsub(&end, &start, &diff); + printf(" libc: %u x %u times in %lu.%09lu seconds\n", TRIALS, THREADS, diff.tv_sec, diff.tv_nsec); + + clock_gettime(CLOCK_MONOTONIC, &start); + for (size_t i = 0; i < THREADS; ++i) + assert(pthread_create(&threads[i], NULL, test_syscall_getrandom, NULL) == 0); + for (size_t i = 0; i < THREADS; ++i) + pthread_join(threads[i], NULL); + clock_gettime(CLOCK_MONOTONIC, &end); + timespecsub(&end, &start, &diff); + printf(" syscall: %u x %u times in %lu.%09lu seconds\n", TRIALS, THREADS, diff.tv_sec, diff.tv_nsec); +} + +static void fill(void) +{ + uint8_t weird_size[323929]; + for (;;) + vgetrandom(weird_size, sizeof(weird_size), 0); +} + +static void kselftest(void) +{ + uint8_t weird_size[1263]; + + ksft_print_header(); + ksft_set_plan(1); + + for (size_t i = 0; i < 1000; ++i) { + ssize_t ret = vgetrandom(weird_size, sizeof(weird_size), 0); + if (ret != sizeof(weird_size)) + exit(KSFT_FAIL); + } + + ksft_test_result_pass("getrandom: PASS\n"); + exit(KSFT_PASS); +} + +static void usage(const char *argv0) +{ + fprintf(stderr, "Usage: %s [bench-single|bench-multi|fill]\n", argv0); +} + +int main(int argc, char *argv[]) +{ + if (argc == 1) { + kselftest(); + return 0; + } + + if (argc != 2) { + usage(argv[0]); + return 1; + } + if (!strcmp(argv[1], "bench-single")) + bench_single(); + else if (!strcmp(argv[1], "bench-multi")) + bench_multi(); + else if (!strcmp(argv[1], "fill")) + fill(); + else { + usage(argv[0]); + return 1; + } + return 0; +}
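For illustration only, not part of the patch: MAP_DROPPABLE, added in include/uapi/linux/mman.h and wired up in do_mmap() above, is meant for cache-like allocations whose contents can be regenerated on demand, such as the vgetrandom() opaque states. A minimal sketch of that usage, modeled on the droppable.c selftest, follows; the recompute-on-zero pattern is an assumption about how a caller would cope with reclaimed pages, not something the patch mandates.

/* Sketch only: anonymous droppable memory used as a best-effort cache. */
#include <assert.h>
#include <stddef.h>
#include <string.h>
#include <sys/mman.h>
#include <linux/mman.h>		/* MAP_DROPPABLE, added by this patch */

int main(void)
{
	size_t len = 1 << 20;
	unsigned char *cache;

	/* Under memory pressure the kernel may free any of these pages, and
	 * later reads fault in zeroes. The mapping is implicitly VM_NORESERVE,
	 * VM_WIPEONFORK and VM_DONTDUMP, and combining MAP_DROPPABLE with
	 * MAP_LOCKED or MAP_HUGETLB is rejected with -EINVAL. */
	cache = mmap(NULL, len, PROT_READ | PROT_WRITE,
		     MAP_DROPPABLE | MAP_ANONYMOUS, -1, 0);
	assert(cache != MAP_FAILED);

	memset(cache, 'A', len);

	/* A zero byte signals that the page was dropped, so the cached
	 * contents must be regenerated before use. */
	if (cache[0] == 0)
		memset(cache, 'A', len);

	return 0;
}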