Merge branches 'slab/for-5.19/stackdepot' and 'slab/for-5.19/refactor' into slab/for-linus

This commit is contained in:
Vlastimil Babka 2022-05-23 11:14:32 +02:00
commit e001897da6
9 changed files with 238 additions and 91 deletions

View file

@ -384,5 +384,69 @@ c) Execute ``slabinfo-gnuplot.sh`` in '-t' mode, passing all of the
40,60`` range will plot only samples collected between 40th and
60th seconds).
DebugFS files for SLUB
======================
For more information about the current state of SLUB caches with the user
tracking debug option enabled, debugfs files are available, typically under
/sys/kernel/debug/slab/<cache>/ (created only for caches with user tracking
enabled). There are 2 types of these files with the following debug
information:
1. alloc_traces::
Prints information about unique allocation traces of the currently
allocated objects. The output is sorted by frequency of each trace.
Information in the output:
Number of objects, allocating function, minimal/average/maximal jiffies since alloc,
pid range of the allocating processes, cpu mask of allocating cpus, and stack trace.
Example::
1085 populate_error_injection_list+0x97/0x110 age=166678/166680/166682 pid=1 cpus=1::
__slab_alloc+0x6d/0x90
kmem_cache_alloc_trace+0x2eb/0x300
populate_error_injection_list+0x97/0x110
init_error_injection+0x1b/0x71
do_one_initcall+0x5f/0x2d0
kernel_init_freeable+0x26f/0x2d7
kernel_init+0xe/0x118
ret_from_fork+0x22/0x30
2. free_traces::
Prints information about unique freeing traces of the currently allocated
objects. The freeing traces thus come from the previous life-cycle of the
objects and are reported as not available for objects allocated for the first
time. The output is sorted by frequency of each trace.
Information in the output:
Number of objects, freeing function, minimal/average/maximal jiffies since free,
pid range of the freeing processes, cpu mask of freeing cpus, and stack trace.
Example::
1980 <not-available> age=4294912290 pid=0 cpus=0
51 acpi_ut_update_ref_count+0x6a6/0x782 age=236886/237027/237772 pid=1 cpus=1
kfree+0x2db/0x420
acpi_ut_update_ref_count+0x6a6/0x782
acpi_ut_update_object_reference+0x1ad/0x234
acpi_ut_remove_reference+0x7d/0x84
acpi_rs_get_prt_method_data+0x97/0xd6
acpi_get_irq_routing_table+0x82/0xc4
acpi_pci_irq_find_prt_entry+0x8e/0x2e0
acpi_pci_irq_lookup+0x3a/0x1e0
acpi_pci_irq_enable+0x77/0x240
pcibios_enable_device+0x39/0x40
do_pci_enable_device.part.0+0x5d/0xe0
pci_enable_device_flags+0xfc/0x120
pci_enable_device+0x13/0x20
virtio_pci_probe+0x9e/0x170
local_pci_probe+0x48/0x80
pci_device_probe+0x105/0x1c0
Christoph Lameter, May 30, 2007
Sergey Senozhatsky, October 23, 2015

View file

@ -105,7 +105,6 @@ struct kmem_cache {
struct kmem_cache_order_objects oo;
/* Allocation and freeing of slabs */
struct kmem_cache_order_objects max;
struct kmem_cache_order_objects min;
gfp_t allocflags; /* gfp flags to use on each alloc */
int refcount; /* Refcount for slab cache destroy */

View file

@ -20,18 +20,36 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries,
gfp_t gfp_flags, bool can_alloc);
/*
* Every user of stack depot has to call this during its own init when it's
* decided that it will be calling stack_depot_save() later.
* Every user of stack depot has to call stack_depot_init() during its own init
* when it's decided that it will be calling stack_depot_save() later. This is
* recommended for e.g. modules initialized later in the boot process, when
* slab_is_available() is true.
*
* The alternative is to select STACKDEPOT_ALWAYS_INIT to have stack depot
* enabled as part of mm_init(), for subsystems where it's known at compile time
* that stack depot will be used.
*
* Another alternative is to call stack_depot_want_early_init(), when the
* decision to use stack depot is taken e.g. when evaluating kernel boot
* parameters, which precedes the enablement point in mm_init().
*
* stack_depot_init() and stack_depot_want_early_init() can be called regardless
* of CONFIG_STACKDEPOT and are no-op when disabled. The actual save/fetch/print
* functions should only be called from code that makes sure CONFIG_STACKDEPOT
* is enabled.
*/
#ifdef CONFIG_STACKDEPOT
int stack_depot_init(void);
#ifdef CONFIG_STACKDEPOT_ALWAYS_INIT
static inline int stack_depot_early_init(void) { return stack_depot_init(); }
void __init stack_depot_want_early_init(void);
/* This is supposed to be called only from mm_init() */
int __init stack_depot_early_init(void);
#else
static inline int stack_depot_init(void) { return 0; }
static inline void stack_depot_want_early_init(void) { }
static inline int stack_depot_early_init(void) { return 0; }
#endif

View file

@ -1875,6 +1875,7 @@ config SLUB_DEBUG
default y
bool "Enable SLUB debugging support" if EXPERT
depends on SLUB && SYSFS
select STACKDEPOT if STACKTRACE_SUPPORT
help
SLUB has extensive debug support features. Disabling these can
result in significant savings in code size. This also disables

View file

@ -709,6 +709,7 @@ config DEBUG_SLAB
config SLUB_DEBUG_ON
bool "SLUB debugging on by default"
depends on SLUB && SLUB_DEBUG
select STACKDEPOT_ALWAYS_INIT if STACKTRACE_SUPPORT
default n
help
Boot with debugging on by default. SLUB boots by default with

View file

@ -66,6 +66,9 @@ struct stack_record {
unsigned long entries[]; /* Variable-sized array of entries. */
};
/*
 * Set when early (memblock-based) allocation of the hash table has been
 * requested; starts out true when CONFIG_STACKDEPOT_ALWAYS_INIT is enabled and
 * is otherwise set by stack_depot_want_early_init().
 */
static bool __stack_depot_want_early_init __initdata = IS_ENABLED(CONFIG_STACKDEPOT_ALWAYS_INIT);
/*
 * Set once stack_depot_early_init() has run; used to warn about a repeated
 * early init call or about early init requests that arrive too late.
 */
static bool __stack_depot_early_init_passed __initdata;
static void *stack_slabs[STACK_ALLOC_MAX_SLABS];
static int depot_index;
@ -162,38 +165,58 @@ static int __init is_stack_depot_disabled(char *str)
}
early_param("stack_depot_disable", is_stack_depot_disabled);
/*
* __ref because of memblock_alloc(), which will not be actually called after
* the __init code is gone, because at that point slab_is_available() is true
*/
__ref int stack_depot_init(void)
/*
 * Request that stack_depot_early_init() allocate the stack depot hash table.
 *
 * Sets __stack_depot_want_early_init, which stack_depot_early_init() checks
 * before allocating. A warning is emitted if stack_depot_early_init() has
 * already run; the flag is still set in that case.
 */
void __init stack_depot_want_early_init(void)
{
/* Too late to request early init now */
WARN_ON(__stack_depot_early_init_passed);
__stack_depot_want_early_init = true;
}
/*
 * Early-boot allocation of the stack depot hash table.
 *
 * Marks the early init point as passed and, if early init was requested and
 * stack depot is not disabled, allocates the hash table with memblock_alloc().
 *
 * Returns 0 when the table was allocated or when there was nothing to do
 * (including a warned-about repeated call), and -ENOMEM when the allocation
 * failed, in which case stack depot is also marked as disabled.
 */
int __init stack_depot_early_init(void)
{
	const size_t table_bytes = (STACK_HASH_SIZE * sizeof(struct stack_record *));

	/* Only one call is expected, from mm_init(); warn and bail on a repeat */
	if (WARN_ON(__stack_depot_early_init_passed))
		return 0;

	__stack_depot_early_init_passed = true;

	/* Nothing to allocate when the depot is disabled or nobody asked for it */
	if (stack_depot_disable || !__stack_depot_want_early_init)
		return 0;

	pr_info("Stack Depot early init allocating hash table with memblock_alloc, %zu bytes\n",
		table_bytes);
	stack_table = memblock_alloc(table_bytes, SMP_CACHE_BYTES);
	if (stack_table)
		return 0;

	/* Allocation failed: disable the depot */
	pr_err("Stack Depot hash table allocation failed, disabling\n");
	stack_depot_disable = true;
	return -ENOMEM;
}
int stack_depot_init(void)
{
static DEFINE_MUTEX(stack_depot_init_mutex);
int ret = 0;
mutex_lock(&stack_depot_init_mutex);
if (!stack_depot_disable && !stack_table) {
size_t size = (STACK_HASH_SIZE * sizeof(struct stack_record *));
int i;
if (slab_is_available()) {
pr_info("Stack Depot allocating hash table with kvmalloc\n");
stack_table = kvmalloc(size, GFP_KERNEL);
} else {
pr_info("Stack Depot allocating hash table with memblock_alloc\n");
stack_table = memblock_alloc(size, SMP_CACHE_BYTES);
}
if (stack_table) {
for (i = 0; i < STACK_HASH_SIZE; i++)
stack_table[i] = NULL;
} else {
pr_info("Stack Depot allocating hash table with kvcalloc\n");
stack_table = kvcalloc(STACK_HASH_SIZE, sizeof(struct stack_record *), GFP_KERNEL);
if (!stack_table) {
pr_err("Stack Depot hash table allocation failed, disabling\n");
stack_depot_disable = true;
mutex_unlock(&stack_depot_init_mutex);
return -ENOMEM;
ret = -ENOMEM;
}
}
mutex_unlock(&stack_depot_init_mutex);
return 0;
return ret;
}
EXPORT_SYMBOL_GPL(stack_depot_init);

View file

@ -45,7 +45,12 @@ static void init_early_allocated_pages(void);
static int __init early_page_owner_param(char *buf)
{
return kstrtobool(buf, &page_owner_enabled);
int ret = kstrtobool(buf, &page_owner_enabled);
if (page_owner_enabled)
stack_depot_want_early_init();
return ret;
}
early_param("page_owner", early_page_owner_param);
@ -83,8 +88,6 @@ static __init void init_page_owner(void)
if (!page_owner_enabled)
return;
stack_depot_init();
register_dummy_stack();
register_failure_stack();
register_early_stack();

View file

@ -24,6 +24,7 @@
#include <asm/tlbflush.h>
#include <asm/page.h>
#include <linux/memcontrol.h>
#include <linux/stackdepot.h>
#define CREATE_TRACE_POINTS
#include <trace/events/kmem.h>
@ -314,9 +315,13 @@ kmem_cache_create_usercopy(const char *name,
* If no slub_debug was enabled globally, the static key is not yet
* enabled by setup_slub_debug(). Enable it if the cache is being
* created with any of the debugging flags passed explicitly.
* It's also possible that this is the first cache created with
* SLAB_STORE_USER and we should init stack_depot for it.
*/
if (flags & SLAB_DEBUG_FLAGS)
static_branch_enable(&slub_debug_enabled);
if (flags & SLAB_STORE_USER)
stack_depot_init();
#endif
mutex_lock(&slab_mutex);
@ -849,6 +854,8 @@ new_kmalloc_cache(int idx, enum kmalloc_cache_type type, slab_flags_t flags)
return;
}
flags |= SLAB_ACCOUNT;
} else if (IS_ENABLED(CONFIG_ZONE_DMA) && (type == KMALLOC_DMA)) {
flags |= SLAB_CACHE_DMA;
}
kmalloc_caches[type][idx] = create_kmalloc_cache(
@ -877,7 +884,7 @@ void __init create_kmalloc_caches(slab_flags_t flags)
/*
* Including KMALLOC_CGROUP if CONFIG_MEMCG_KMEM defined
*/
for (type = KMALLOC_NORMAL; type <= KMALLOC_RECLAIM; type++) {
for (type = KMALLOC_NORMAL; type < NR_KMALLOC_TYPES; type++) {
for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) {
if (!kmalloc_caches[type][i])
new_kmalloc_cache(i, type, flags);
@ -898,20 +905,6 @@ void __init create_kmalloc_caches(slab_flags_t flags)
/* Kmalloc array is now usable */
slab_state = UP;
#ifdef CONFIG_ZONE_DMA
for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) {
struct kmem_cache *s = kmalloc_caches[KMALLOC_NORMAL][i];
if (s) {
kmalloc_caches[KMALLOC_DMA][i] = create_kmalloc_cache(
kmalloc_info[i].name[KMALLOC_DMA],
kmalloc_info[i].size,
SLAB_CACHE_DMA | flags, 0,
kmalloc_info[i].size);
}
}
#endif
}
#endif /* !CONFIG_SLOB */

137
mm/slub.c
View file

@ -26,6 +26,7 @@
#include <linux/cpuset.h>
#include <linux/mempolicy.h>
#include <linux/ctype.h>
#include <linux/stackdepot.h>
#include <linux/debugobjects.h>
#include <linux/kallsyms.h>
#include <linux/kfence.h>
@ -37,6 +38,7 @@
#include <linux/memcontrol.h>
#include <linux/random.h>
#include <kunit/test.h>
#include <linux/sort.h>
#include <linux/debugfs.h>
#include <trace/events/kmem.h>
@ -264,8 +266,8 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
#define TRACK_ADDRS_COUNT 16
struct track {
unsigned long addr; /* Called from address */
#ifdef CONFIG_STACKTRACE
unsigned long addrs[TRACK_ADDRS_COUNT]; /* Called from address */
#ifdef CONFIG_STACKDEPOT
depot_stack_handle_t handle;
#endif
int cpu; /* Was running on cpu */
int pid; /* Pid context */
@ -724,57 +726,51 @@ static struct track *get_track(struct kmem_cache *s, void *object,
return kasan_reset_tag(p + alloc);
}
static void set_track(struct kmem_cache *s, void *object,
static void noinline set_track(struct kmem_cache *s, void *object,
enum track_item alloc, unsigned long addr)
{
struct track *p = get_track(s, object, alloc);
if (addr) {
#ifdef CONFIG_STACKTRACE
unsigned int nr_entries;
#ifdef CONFIG_STACKDEPOT
unsigned long entries[TRACK_ADDRS_COUNT];
unsigned int nr_entries;
metadata_access_enable();
nr_entries = stack_trace_save(kasan_reset_tag(p->addrs),
TRACK_ADDRS_COUNT, 3);
metadata_access_disable();
if (nr_entries < TRACK_ADDRS_COUNT)
p->addrs[nr_entries] = 0;
nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 3);
p->handle = stack_depot_save(entries, nr_entries, GFP_NOWAIT);
#endif
p->addr = addr;
p->cpu = smp_processor_id();
p->pid = current->pid;
p->when = jiffies;
} else {
memset(p, 0, sizeof(struct track));
}
p->addr = addr;
p->cpu = smp_processor_id();
p->pid = current->pid;
p->when = jiffies;
}
static void init_tracking(struct kmem_cache *s, void *object)
{
struct track *p;
if (!(s->flags & SLAB_STORE_USER))
return;
set_track(s, object, TRACK_FREE, 0UL);
set_track(s, object, TRACK_ALLOC, 0UL);
p = get_track(s, object, TRACK_ALLOC);
memset(p, 0, 2*sizeof(struct track));
}
static void print_track(const char *s, struct track *t, unsigned long pr_time)
{
depot_stack_handle_t handle __maybe_unused;
if (!t->addr)
return;
pr_err("%s in %pS age=%lu cpu=%u pid=%d\n",
s, (void *)t->addr, pr_time - t->when, t->cpu, t->pid);
#ifdef CONFIG_STACKTRACE
{
int i;
for (i = 0; i < TRACK_ADDRS_COUNT; i++)
if (t->addrs[i])
pr_err("\t%pS\n", (void *)t->addrs[i]);
else
break;
}
#ifdef CONFIG_STACKDEPOT
handle = READ_ONCE(t->handle);
if (handle)
stack_depot_print(handle);
else
pr_err("object allocation/free stack trace missing\n");
#endif
}
@ -1532,6 +1528,8 @@ static int __init setup_slub_debug(char *str)
global_slub_debug_changed = true;
} else {
slab_list_specified = true;
if (flags & SLAB_STORE_USER)
stack_depot_want_early_init();
}
}
@ -1549,6 +1547,8 @@ static int __init setup_slub_debug(char *str)
}
out:
slub_debug = global_flags;
if (slub_debug & SLAB_STORE_USER)
stack_depot_want_early_init();
if (slub_debug != 0 || slub_debug_string)
static_branch_enable(&slub_debug_enabled);
else
@ -4162,8 +4162,6 @@ static int calculate_sizes(struct kmem_cache *s)
*/
s->oo = oo_make(order, size);
s->min = oo_make(get_order(size), size);
if (oo_objects(s->oo) > oo_objects(s->max))
s->max = s->oo;
return !!oo_objects(s->oo);
}
@ -4341,18 +4339,26 @@ void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab)
objp = fixup_red_left(s, objp);
trackp = get_track(s, objp, TRACK_ALLOC);
kpp->kp_ret = (void *)trackp->addr;
#ifdef CONFIG_STACKTRACE
for (i = 0; i < KS_ADDRS_COUNT && i < TRACK_ADDRS_COUNT; i++) {
kpp->kp_stack[i] = (void *)trackp->addrs[i];
if (!kpp->kp_stack[i])
break;
}
#ifdef CONFIG_STACKDEPOT
{
depot_stack_handle_t handle;
unsigned long *entries;
unsigned int nr_entries;
trackp = get_track(s, objp, TRACK_FREE);
for (i = 0; i < KS_ADDRS_COUNT && i < TRACK_ADDRS_COUNT; i++) {
kpp->kp_free_stack[i] = (void *)trackp->addrs[i];
if (!kpp->kp_free_stack[i])
break;
handle = READ_ONCE(trackp->handle);
if (handle) {
nr_entries = stack_depot_fetch(handle, &entries);
for (i = 0; i < KS_ADDRS_COUNT && i < nr_entries; i++)
kpp->kp_stack[i] = (void *)entries[i];
}
trackp = get_track(s, objp, TRACK_FREE);
handle = READ_ONCE(trackp->handle);
if (handle) {
nr_entries = stack_depot_fetch(handle, &entries);
for (i = 0; i < KS_ADDRS_COUNT && i < nr_entries; i++)
kpp->kp_free_stack[i] = (void *)entries[i];
}
}
#endif
#endif
@ -5054,6 +5060,7 @@ EXPORT_SYMBOL(validate_slab_cache);
*/
struct location {
depot_stack_handle_t handle;
unsigned long count;
unsigned long addr;
long long sum_time;
@ -5106,9 +5113,13 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
{
long start, end, pos;
struct location *l;
unsigned long caddr;
unsigned long caddr, chandle;
unsigned long age = jiffies - track->when;
depot_stack_handle_t handle = 0;
#ifdef CONFIG_STACKDEPOT
handle = READ_ONCE(track->handle);
#endif
start = -1;
end = t->count;
@ -5123,7 +5134,8 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
break;
caddr = t->loc[pos].addr;
if (track->addr == caddr) {
chandle = t->loc[pos].handle;
if ((track->addr == caddr) && (handle == chandle)) {
l = &t->loc[pos];
l->count++;
@ -5148,6 +5160,8 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
if (track->addr < caddr)
end = pos;
else if (track->addr == caddr && handle < chandle)
end = pos;
else
start = pos;
}
@ -5170,6 +5184,7 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
l->max_time = age;
l->min_pid = track->pid;
l->max_pid = track->pid;
l->handle = handle;
cpumask_clear(to_cpumask(l->cpus));
cpumask_set_cpu(track->cpu, to_cpumask(l->cpus));
nodes_clear(l->nodes);
@ -6079,6 +6094,21 @@ static int slab_debugfs_show(struct seq_file *seq, void *v)
seq_printf(seq, " nodes=%*pbl",
nodemask_pr_args(&l->nodes));
#ifdef CONFIG_STACKDEPOT
{
depot_stack_handle_t handle;
unsigned long *entries;
unsigned int nr_entries, j;
handle = READ_ONCE(l->handle);
if (handle) {
nr_entries = stack_depot_fetch(handle, &entries);
seq_puts(seq, "\n");
for (j = 0; j < nr_entries; j++)
seq_printf(seq, " %pS\n", (void *)entries[j]);
}
}
#endif
seq_puts(seq, "\n");
}
@ -6103,6 +6133,17 @@ static void *slab_debugfs_next(struct seq_file *seq, void *v, loff_t *ppos)
return NULL;
}
static int cmp_loc_by_count(const void *a, const void *b, const void *data)
{
struct location *loc1 = (struct location *)a;
struct location *loc2 = (struct location *)b;
if (loc1->count > loc2->count)
return -1;
else
return 1;
}
static void *slab_debugfs_start(struct seq_file *seq, loff_t *ppos)
{
struct loc_track *t = seq->private;
@ -6164,6 +6205,10 @@ static int slab_debug_trace_open(struct inode *inode, struct file *filep)
spin_unlock_irqrestore(&n->list_lock, flags);
}
/* Sort locations by count */
sort_r(t->loc, t->count, sizeof(struct location),
cmp_loc_by_count, NULL, NULL);
bitmap_free(obj_map);
return 0;
}