mm,page_owner: fix refcount imbalance

Current code does not contemplate scenarios where an allocation and a free
operation on the same pages do not cover the same number of pages at once.
Take alloc_pages_exact() as an example: we allocate a page of high enough
order to satisfy the size request, but we free the remainder right away.

In the above example, we will increment the stack_record refcount only
once, but we will decrease it as many times as the number of unused pages
we have to free.  This will lead to a warning because of the refcount
imbalance.

Fix this by recording the number of base pages in the refcount field.

Link: https://lkml.kernel.org/r/20240404070702.2744-3-osalvador@suse.de
Reported-by: syzbot+41bbfdb8d41003d12c0f@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/linux-mm/00000000000090e8ff0613eda0e5@google.com
Fixes: 217b2119b9 ("mm,page_owner: implement the tracking of the stacks count")
Signed-off-by: Oscar Salvador <osalvador@suse.de>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Tested-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Marco Elver <elver@google.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
This commit is contained in:
Oscar Salvador 2024-04-04 09:07:00 +02:00 committed by Andrew Morton
parent ea4b5b33bf
commit f5c12105c1
2 changed files with 58 additions and 49 deletions

View File

@ -24,10 +24,10 @@ fragmentation statistics can be obtained through gfp flag information of
each page. It is already implemented and activated if page owner is each page. It is already implemented and activated if page owner is
enabled. Other usages are more than welcome. enabled. Other usages are more than welcome.
It can also be used to show all the stacks and their outstanding It can also be used to show all the stacks and their current number of
allocations, which gives us a quick overview of where the memory is going allocated base pages, which gives us a quick overview of where the memory
without the need to screen through all the pages and match the allocation is going without the need to screen through all the pages and match the
and free operation. allocation and free operation.
page owner is disabled by default. So, if you'd like to use it, you need page owner is disabled by default. So, if you'd like to use it, you need
to add "page_owner=on" to your boot cmdline. If the kernel is built to add "page_owner=on" to your boot cmdline. If the kernel is built
@ -75,42 +75,45 @@ Usage
cat /sys/kernel/debug/page_owner_stacks/show_stacks > stacks.txt cat /sys/kernel/debug/page_owner_stacks/show_stacks > stacks.txt
cat stacks.txt cat stacks.txt
prep_new_page+0xa9/0x120 post_alloc_hook+0x177/0x1a0
get_page_from_freelist+0x7e6/0x2140 get_page_from_freelist+0xd01/0xd80
__alloc_pages+0x18a/0x370 __alloc_pages+0x39e/0x7e0
new_slab+0xc8/0x580 allocate_slab+0xbc/0x3f0
___slab_alloc+0x1f2/0xaf0 ___slab_alloc+0x528/0x8a0
__slab_alloc.isra.86+0x22/0x40 kmem_cache_alloc+0x224/0x3b0
kmem_cache_alloc+0x31b/0x350 sk_prot_alloc+0x58/0x1a0
__khugepaged_enter+0x39/0x100 sk_alloc+0x32/0x4f0
dup_mmap+0x1c7/0x5ce inet_create+0x427/0xb50
copy_process+0x1afe/0x1c90 __sock_create+0x2e4/0x650
kernel_clone+0x9a/0x3c0 inet_ctl_sock_create+0x30/0x180
__do_sys_clone+0x66/0x90 igmp_net_init+0xc1/0x130
do_syscall_64+0x7f/0x160 ops_init+0x167/0x410
entry_SYSCALL_64_after_hwframe+0x6c/0x74 setup_net+0x304/0xa60
stack_count: 234 copy_net_ns+0x29b/0x4a0
create_new_namespaces+0x4a1/0x820
nr_base_pages: 16
... ...
... ...
echo 7000 > /sys/kernel/debug/page_owner_stacks/count_threshold echo 7000 > /sys/kernel/debug/page_owner_stacks/count_threshold
cat /sys/kernel/debug/page_owner_stacks/show_stacks> stacks_7000.txt cat /sys/kernel/debug/page_owner_stacks/show_stacks> stacks_7000.txt
cat stacks_7000.txt cat stacks_7000.txt
prep_new_page+0xa9/0x120 post_alloc_hook+0x177/0x1a0
get_page_from_freelist+0x7e6/0x2140 get_page_from_freelist+0xd01/0xd80
__alloc_pages+0x18a/0x370 __alloc_pages+0x39e/0x7e0
alloc_pages_mpol+0xdf/0x1e0 alloc_pages_mpol+0x22e/0x490
folio_alloc+0x14/0x50 folio_alloc+0xd5/0x110
filemap_alloc_folio+0xb0/0x100 filemap_alloc_folio+0x78/0x230
page_cache_ra_unbounded+0x97/0x180 page_cache_ra_order+0x287/0x6f0
filemap_fault+0x4b4/0x1200 filemap_get_pages+0x517/0x1160
__do_fault+0x2d/0x110 filemap_read+0x304/0x9f0
do_pte_missing+0x4b0/0xa30 xfs_file_buffered_read+0xe6/0x1d0 [xfs]
__handle_mm_fault+0x7fa/0xb70 xfs_file_read_iter+0x1f0/0x380 [xfs]
handle_mm_fault+0x125/0x300 __kernel_read+0x3b9/0x730
do_user_addr_fault+0x3c9/0x840 kernel_read_file+0x309/0x4d0
exc_page_fault+0x68/0x150 __do_sys_finit_module+0x381/0x730
asm_exc_page_fault+0x22/0x30 do_syscall_64+0x8d/0x150
stack_count: 8248 entry_SYSCALL_64_after_hwframe+0x62/0x6a
nr_base_pages: 20824
... ...
cat /sys/kernel/debug/page_owner > page_owner_full.txt cat /sys/kernel/debug/page_owner > page_owner_full.txt

View File

@ -196,7 +196,8 @@ static void add_stack_record_to_list(struct stack_record *stack_record,
spin_unlock_irqrestore(&stack_list_lock, flags); spin_unlock_irqrestore(&stack_list_lock, flags);
} }
static void inc_stack_record_count(depot_stack_handle_t handle, gfp_t gfp_mask) static void inc_stack_record_count(depot_stack_handle_t handle, gfp_t gfp_mask,
int nr_base_pages)
{ {
struct stack_record *stack_record = __stack_depot_get_stack_record(handle); struct stack_record *stack_record = __stack_depot_get_stack_record(handle);
@ -217,15 +218,20 @@ static void inc_stack_record_count(depot_stack_handle_t handle, gfp_t gfp_mask)
/* Add the new stack_record to our list */ /* Add the new stack_record to our list */
add_stack_record_to_list(stack_record, gfp_mask); add_stack_record_to_list(stack_record, gfp_mask);
} }
refcount_inc(&stack_record->count); refcount_add(nr_base_pages, &stack_record->count);
} }
static void dec_stack_record_count(depot_stack_handle_t handle) static void dec_stack_record_count(depot_stack_handle_t handle,
int nr_base_pages)
{ {
struct stack_record *stack_record = __stack_depot_get_stack_record(handle); struct stack_record *stack_record = __stack_depot_get_stack_record(handle);
if (stack_record) if (!stack_record)
refcount_dec(&stack_record->count); return;
if (refcount_sub_and_test(nr_base_pages, &stack_record->count))
pr_warn("%s: refcount went to 0 for %u handle\n", __func__,
handle);
} }
static inline void __update_page_owner_handle(struct page_ext *page_ext, static inline void __update_page_owner_handle(struct page_ext *page_ext,
@ -306,7 +312,7 @@ void __reset_page_owner(struct page *page, unsigned short order)
* the machinery is not ready yet, we cannot decrement * the machinery is not ready yet, we cannot decrement
* their refcount either. * their refcount either.
*/ */
dec_stack_record_count(alloc_handle); dec_stack_record_count(alloc_handle, 1 << order);
} }
noinline void __set_page_owner(struct page *page, unsigned short order, noinline void __set_page_owner(struct page *page, unsigned short order,
@ -325,7 +331,7 @@ noinline void __set_page_owner(struct page *page, unsigned short order,
current->pid, current->tgid, ts_nsec, current->pid, current->tgid, ts_nsec,
current->comm); current->comm);
page_ext_put(page_ext); page_ext_put(page_ext);
inc_stack_record_count(handle, gfp_mask); inc_stack_record_count(handle, gfp_mask, 1 << order);
} }
void __set_page_owner_migrate_reason(struct page *page, int reason) void __set_page_owner_migrate_reason(struct page *page, int reason)
@ -872,11 +878,11 @@ static void *stack_next(struct seq_file *m, void *v, loff_t *ppos)
return stack; return stack;
} }
static unsigned long page_owner_stack_threshold; static unsigned long page_owner_pages_threshold;
static int stack_print(struct seq_file *m, void *v) static int stack_print(struct seq_file *m, void *v)
{ {
int i, stack_count; int i, nr_base_pages;
struct stack *stack = v; struct stack *stack = v;
unsigned long *entries; unsigned long *entries;
unsigned long nr_entries; unsigned long nr_entries;
@ -887,14 +893,14 @@ static int stack_print(struct seq_file *m, void *v)
nr_entries = stack_record->size; nr_entries = stack_record->size;
entries = stack_record->entries; entries = stack_record->entries;
stack_count = refcount_read(&stack_record->count) - 1; nr_base_pages = refcount_read(&stack_record->count) - 1;
if (stack_count < 1 || stack_count < page_owner_stack_threshold) if (nr_base_pages < 1 || nr_base_pages < page_owner_pages_threshold)
return 0; return 0;
for (i = 0; i < nr_entries; i++) for (i = 0; i < nr_entries; i++)
seq_printf(m, " %pS\n", (void *)entries[i]); seq_printf(m, " %pS\n", (void *)entries[i]);
seq_printf(m, "stack_count: %d\n\n", stack_count); seq_printf(m, "nr_base_pages: %d\n\n", nr_base_pages);
return 0; return 0;
} }
@ -924,13 +930,13 @@ static const struct file_operations page_owner_stack_operations = {
static int page_owner_threshold_get(void *data, u64 *val) static int page_owner_threshold_get(void *data, u64 *val)
{ {
*val = READ_ONCE(page_owner_stack_threshold); *val = READ_ONCE(page_owner_pages_threshold);
return 0; return 0;
} }
static int page_owner_threshold_set(void *data, u64 val) static int page_owner_threshold_set(void *data, u64 val)
{ {
WRITE_ONCE(page_owner_stack_threshold, val); WRITE_ONCE(page_owner_pages_threshold, val);
return 0; return 0;
} }