linux-stable/include/linux/percpu.h
GONG, Ruiqi 3c61529405 Randomized slab caches for kmalloc()
When exploiting memory vulnerabilities, "heap spraying" is a common
technique targeting those related to dynamic memory allocation (i.e. the
"heap"), and it plays an important role in a successful exploitation.
Basically, it is to overwrite the memory area of vulnerable object by
triggering allocation in other subsystems or modules and therefore
getting a reference to the targeted memory location. It's usable on
various types of vulnerablity including use after free (UAF), heap out-
of-bound write and etc.

There are (at least) two reasons why the heap can be sprayed: 1) generic
slab caches are shared among different subsystems and modules, and
2) dedicated slab caches could be merged with the generic ones.
Currently these two factors cannot be prevented at a low cost: the first
one is a widely used memory allocation mechanism, and shutting down slab
merging completely via `slub_nomerge` would be overkill.

To efficiently prevent heap spraying, we propose the following approach:
to create multiple copies of generic slab caches that will never be
merged, and random one of them will be used at allocation. The random
selection is based on the address of code that calls `kmalloc()`, which
means it is static at runtime (rather than dynamically determined at
each time of allocation, which could be bypassed by repeatedly spraying
in brute force). In other words, the randomness of cache selection will
be with respect to the code address rather than time, i.e. allocations
in different code paths would most likely pick different caches,
although kmalloc() at each place would use the same cache copy whenever
it is executed. In this way, the vulnerable object and memory allocated
in other subsystems and modules will (most probably) be on different
slab caches, which prevents the object from being sprayed.

Meanwhile, the static random selection is further enhanced with a
per-boot random seed, which prevents the attacker from finding a usable
kmalloc that happens to pick the same cache with the vulnerable
subsystem/module by analyzing the open source code. In other words, with
the per-boot seed, the random selection is static during each time the
system starts and runs, but not across different system startups.

The overhead of performance has been tested on a 40-core x86 server by
comparing the results of `perf bench all` between the kernels with and
without this patch based on the latest linux-next kernel, which shows
minor difference. A subset of benchmarks are listed below:

                sched/  sched/  syscall/       mem/       mem/
             messaging    pipe     basic     memcpy     memset
                 (sec)   (sec)     (sec)   (GB/sec)   (GB/sec)

control1         0.019   5.459     0.733  15.258789  51.398026
control2         0.019   5.439     0.730  16.009221  48.828125
control3         0.019   5.282     0.735  16.009221  48.828125
control_avg      0.019   5.393     0.733  15.759077  49.684759

experiment1      0.019   5.374     0.741  15.500992  46.502976
experiment2      0.019   5.440     0.746  16.276042  51.398026
experiment3      0.019   5.242     0.752  15.258789  51.398026
experiment_avg   0.019   5.352     0.746  15.678608  49.766343

The overhead of memory usage was measured by executing `free` after boot
on a QEMU VM with 1GB total memory, and as expected, it's positively
correlated with # of cache copies:

           control  4 copies  8 copies  16 copies

total       969.8M    968.2M    968.2M     968.2M
used         20.0M     21.9M     24.1M      26.7M
free        936.9M    933.6M    931.4M     928.6M
available   932.2M    928.8M    926.6M     923.9M

Co-developed-by: Xiu Jianfeng <xiujianfeng@huawei.com>
Signed-off-by: Xiu Jianfeng <xiujianfeng@huawei.com>
Signed-off-by: GONG, Ruiqi <gongruiqi@huaweicloud.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Acked-by: Dennis Zhou <dennis@kernel.org> # percpu
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
2023-07-18 10:07:47 +02:00

149 lines
4.7 KiB
C

/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_PERCPU_H
#define __LINUX_PERCPU_H
#include <linux/mmdebug.h>
#include <linux/preempt.h>
#include <linux/smp.h>
#include <linux/cpumask.h>
#include <linux/pfn.h>
#include <linux/init.h>
#include <linux/cleanup.h>
#include <asm/percpu.h>
/* enough to cover all DEFINE_PER_CPUs in modules */
#ifdef CONFIG_MODULES
#define PERCPU_MODULE_RESERVE (8 << 10)
#else
#define PERCPU_MODULE_RESERVE 0
#endif
/* minimum unit size, also is the maximum supported allocation size */
#define PCPU_MIN_UNIT_SIZE PFN_ALIGN(32 << 10)
/* minimum allocation size and shift in bytes */
#define PCPU_MIN_ALLOC_SHIFT 2
#define PCPU_MIN_ALLOC_SIZE (1 << PCPU_MIN_ALLOC_SHIFT)
/*
* The PCPU_BITMAP_BLOCK_SIZE must be the same size as PAGE_SIZE as the
* updating of hints is used to manage the nr_empty_pop_pages in both
* the chunk and globally.
*/
#define PCPU_BITMAP_BLOCK_SIZE PAGE_SIZE
#define PCPU_BITMAP_BLOCK_BITS (PCPU_BITMAP_BLOCK_SIZE >> \
PCPU_MIN_ALLOC_SHIFT)
#ifdef CONFIG_RANDOM_KMALLOC_CACHES
#define PERCPU_DYNAMIC_SIZE_SHIFT 12
#else
#define PERCPU_DYNAMIC_SIZE_SHIFT 10
#endif
/*
* Percpu allocator can serve percpu allocations before slab is
* initialized which allows slab to depend on the percpu allocator.
* The following parameter decide how much resource to preallocate
* for this. Keep PERCPU_DYNAMIC_RESERVE equal to or larger than
* PERCPU_DYNAMIC_EARLY_SIZE.
*/
#define PERCPU_DYNAMIC_EARLY_SIZE (20 << PERCPU_DYNAMIC_SIZE_SHIFT)
/*
* PERCPU_DYNAMIC_RESERVE indicates the amount of free area to piggy
* back on the first chunk for dynamic percpu allocation if arch is
* manually allocating and mapping it for faster access (as a part of
* large page mapping for example).
*
* The following values give between one and two pages of free space
* after typical minimal boot (2-way SMP, single disk and NIC) with
* both defconfig and a distro config on x86_64 and 32. More
* intelligent way to determine this would be nice.
*/
#if BITS_PER_LONG > 32
#define PERCPU_DYNAMIC_RESERVE (28 << PERCPU_DYNAMIC_SIZE_SHIFT)
#else
#define PERCPU_DYNAMIC_RESERVE (20 << PERCPU_DYNAMIC_SIZE_SHIFT)
#endif
extern void *pcpu_base_addr;
extern const unsigned long *pcpu_unit_offsets;
struct pcpu_group_info {
int nr_units; /* aligned # of units */
unsigned long base_offset; /* base address offset */
unsigned int *cpu_map; /* unit->cpu map, empty
* entries contain NR_CPUS */
};
struct pcpu_alloc_info {
size_t static_size;
size_t reserved_size;
size_t dyn_size;
size_t unit_size;
size_t atom_size;
size_t alloc_size;
size_t __ai_size; /* internal, don't use */
int nr_groups; /* 0 if grouping unnecessary */
struct pcpu_group_info groups[];
};
enum pcpu_fc {
PCPU_FC_AUTO,
PCPU_FC_EMBED,
PCPU_FC_PAGE,
PCPU_FC_NR,
};
extern const char * const pcpu_fc_names[PCPU_FC_NR];
extern enum pcpu_fc pcpu_chosen_fc;
typedef int (pcpu_fc_cpu_to_node_fn_t)(int cpu);
typedef int (pcpu_fc_cpu_distance_fn_t)(unsigned int from, unsigned int to);
extern struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
int nr_units);
extern void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai);
extern void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
void *base_addr);
extern int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
size_t atom_size,
pcpu_fc_cpu_distance_fn_t cpu_distance_fn,
pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn);
#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
void __init pcpu_populate_pte(unsigned long addr);
extern int __init pcpu_page_first_chunk(size_t reserved_size,
pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn);
#endif
extern void __percpu *__alloc_reserved_percpu(size_t size, size_t align) __alloc_size(1);
extern bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr);
extern bool is_kernel_percpu_address(unsigned long addr);
#if !defined(CONFIG_SMP) || !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
extern void __init setup_per_cpu_areas(void);
#endif
extern void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp) __alloc_size(1);
extern void __percpu *__alloc_percpu(size_t size, size_t align) __alloc_size(1);
extern void free_percpu(void __percpu *__pdata);
DEFINE_FREE(free_percpu, void __percpu *, free_percpu(_T))
extern phys_addr_t per_cpu_ptr_to_phys(void *addr);
#define alloc_percpu_gfp(type, gfp) \
(typeof(type) __percpu *)__alloc_percpu_gfp(sizeof(type), \
__alignof__(type), gfp)
#define alloc_percpu(type) \
(typeof(type) __percpu *)__alloc_percpu(sizeof(type), \
__alignof__(type))
extern unsigned long pcpu_nr_pages(void);
#endif /* __LINUX_PERCPU_H */