linux-stable/mm/debug.c
Hugh Dickins cb67f4282b mm,thp,rmap: simplify compound page mapcount handling
Compound page (folio) mapcount calculations have been different for anon
and file (or shmem) THPs, and involved the obscure PageDoubleMap flag. 
And each huge mapping and unmapping of a file (or shmem) THP involved
atomically incrementing and decrementing the mapcount of every subpage of
that huge page, dirtying many struct page cachelines.

Add subpages_mapcount field to the struct folio and first tail page, so
that the total of subpage mapcounts is available in one place near the
head: then page_mapcount() and total_mapcount() and page_mapped(), and
their folio equivalents, are so quick that anon and file and hugetlb don't
need to be optimized differently.  Delete the unloved PageDoubleMap.

page_add and page_remove rmap functions must now maintain the
subpages_mapcount as well as the subpage _mapcount, when dealing with pte
mappings of huge pages; and correct maintenance of NR_ANON_MAPPED and
NR_FILE_MAPPED statistics still needs reading through the subpages, using
nr_subpages_unmapped() - but only when first or last pmd mapping finds
subpages_mapcount raised (double-map case, not the common case).

But are those counts (used to decide when to split an anon THP, and in
vmscan's pagecache_reclaimable heuristic) correctly maintained?  Not
quite: since page_remove_rmap() (and also split_huge_pmd()) is often
called without page lock, there can be races when a subpage pte mapcount
0<->1 while compound pmd mapcount 0<->1 is scanning - races which the
previous implementation had prevented.  The statistics might become
inaccurate, and even drift down until they underflow through 0.  That is
not good enough, but is better dealt with in a followup patch.

Update a few comments on first and second tail page overlaid fields. 
hugepage_add_new_anon_rmap() has to "increment" compound_mapcount, but
subpages_mapcount and compound_pincount are already correctly at 0, so
delete its reinitialization of compound_pincount.

A simple 100 X munmap(mmap(2GB, MAP_SHARED|MAP_POPULATE, tmpfs), 2GB) took
18 seconds on small pages, and used to take 1 second on huge pages, but
now takes 119 milliseconds on huge pages.  Mapping by pmds a second time
used to take 860ms and now takes 92ms; mapping by pmds after mapping by
ptes (when the scan is needed) used to take 870ms and now takes 495ms. 
But there might be some benchmarks which would show a slowdown, because
tail struct pages now fall out of cache until final freeing checks them.

Link: https://lkml.kernel.org/r/47ad693-717-79c8-e1ba-46c3a6602e48@google.com
Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: James Houghton <jthoughton@google.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Mina Almasry <almasrymina@google.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Naoya Horiguchi <naoya.horiguchi@linux.dev>
Cc: Peter Xu <peterx@redhat.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Zach O'Keefe <zokeefe@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2022-11-30 15:58:46 -08:00

263 lines
6.7 KiB
C

// SPDX-License-Identifier: GPL-2.0
/*
* mm/debug.c
*
* mm/ specific debug routines.
*
*/
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/trace_events.h>
#include <linux/memcontrol.h>
#include <trace/events/mmflags.h>
#include <linux/migrate.h>
#include <linux/page_owner.h>
#include <linux/ctype.h>
#include "internal.h"
#include <trace/events/migrate.h>
/*
* Define EM() and EMe() so that MIGRATE_REASON from trace/events/migrate.h can
* be used to populate migrate_reason_names[].
*/
#undef EM
#undef EMe
#define EM(a, b) b,
#define EMe(a, b) b
const char *migrate_reason_names[MR_TYPES] = {
MIGRATE_REASON
};
const struct trace_print_flags pageflag_names[] = {
__def_pageflag_names,
{0, NULL}
};
const struct trace_print_flags gfpflag_names[] = {
__def_gfpflag_names,
{0, NULL}
};
const struct trace_print_flags vmaflag_names[] = {
__def_vmaflag_names,
{0, NULL}
};
static void __dump_page(struct page *page)
{
struct folio *folio = page_folio(page);
struct page *head = &folio->page;
struct address_space *mapping;
bool compound = PageCompound(page);
/*
* Accessing the pageblock without the zone lock. It could change to
* "isolate" again in the meantime, but since we are just dumping the
* state for debugging, it should be fine to accept a bit of
* inaccuracy here due to racing.
*/
bool page_cma = is_migrate_cma_page(page);
int mapcount;
char *type = "";
if (page < head || (page >= head + MAX_ORDER_NR_PAGES)) {
/*
* Corrupt page, so we cannot call page_mapping. Instead, do a
* safe subset of the steps that page_mapping() does. Caution:
* this will be misleading for tail pages, PageSwapCache pages,
* and potentially other situations. (See the page_mapping()
* implementation for what's missing here.)
*/
unsigned long tmp = (unsigned long)page->mapping;
if (tmp & PAGE_MAPPING_ANON)
mapping = NULL;
else
mapping = (void *)(tmp & ~PAGE_MAPPING_FLAGS);
head = page;
folio = (struct folio *)page;
compound = false;
} else {
mapping = page_mapping(page);
}
/*
* Avoid VM_BUG_ON() in page_mapcount().
* page->_mapcount space in struct page is used by sl[aou]b pages to
* encode own info.
*/
mapcount = PageSlab(head) ? 0 : page_mapcount(page);
pr_warn("page:%p refcount:%d mapcount:%d mapping:%p index:%#lx pfn:%#lx\n",
page, page_ref_count(head), mapcount, mapping,
page_to_pgoff(page), page_to_pfn(page));
if (compound) {
pr_warn("head:%p order:%u compound_mapcount:%d subpages_mapcount:%d compound_pincount:%d\n",
head, compound_order(head),
head_compound_mapcount(head),
head_subpages_mapcount(head),
head_compound_pincount(head));
}
#ifdef CONFIG_MEMCG
if (head->memcg_data)
pr_warn("memcg:%lx\n", head->memcg_data);
#endif
if (PageKsm(page))
type = "ksm ";
else if (PageAnon(page))
type = "anon ";
else if (mapping)
dump_mapping(mapping);
BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1);
pr_warn("%sflags: %pGp%s\n", type, &head->flags,
page_cma ? " CMA" : "");
print_hex_dump(KERN_WARNING, "raw: ", DUMP_PREFIX_NONE, 32,
sizeof(unsigned long), page,
sizeof(struct page), false);
if (head != page)
print_hex_dump(KERN_WARNING, "head: ", DUMP_PREFIX_NONE, 32,
sizeof(unsigned long), head,
sizeof(struct page), false);
}
void dump_page(struct page *page, const char *reason)
{
if (PagePoisoned(page))
pr_warn("page:%p is uninitialized and poisoned", page);
else
__dump_page(page);
if (reason)
pr_warn("page dumped because: %s\n", reason);
dump_page_owner(page);
}
EXPORT_SYMBOL(dump_page);
#ifdef CONFIG_DEBUG_VM
void dump_vma(const struct vm_area_struct *vma)
{
pr_emerg("vma %px start %px end %px mm %px\n"
"prot %lx anon_vma %px vm_ops %px\n"
"pgoff %lx file %px private_data %px\n"
"flags: %#lx(%pGv)\n",
vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_mm,
(unsigned long)pgprot_val(vma->vm_page_prot),
vma->anon_vma, vma->vm_ops, vma->vm_pgoff,
vma->vm_file, vma->vm_private_data,
vma->vm_flags, &vma->vm_flags);
}
EXPORT_SYMBOL(dump_vma);
void dump_mm(const struct mm_struct *mm)
{
pr_emerg("mm %px task_size %lu\n"
#ifdef CONFIG_MMU
"get_unmapped_area %px\n"
#endif
"mmap_base %lu mmap_legacy_base %lu\n"
"pgd %px mm_users %d mm_count %d pgtables_bytes %lu map_count %d\n"
"hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n"
"pinned_vm %llx data_vm %lx exec_vm %lx stack_vm %lx\n"
"start_code %lx end_code %lx start_data %lx end_data %lx\n"
"start_brk %lx brk %lx start_stack %lx\n"
"arg_start %lx arg_end %lx env_start %lx env_end %lx\n"
"binfmt %px flags %lx\n"
#ifdef CONFIG_AIO
"ioctx_table %px\n"
#endif
#ifdef CONFIG_MEMCG
"owner %px "
#endif
"exe_file %px\n"
#ifdef CONFIG_MMU_NOTIFIER
"notifier_subscriptions %px\n"
#endif
#ifdef CONFIG_NUMA_BALANCING
"numa_next_scan %lu numa_scan_offset %lu numa_scan_seq %d\n"
#endif
"tlb_flush_pending %d\n"
"def_flags: %#lx(%pGv)\n",
mm, mm->task_size,
#ifdef CONFIG_MMU
mm->get_unmapped_area,
#endif
mm->mmap_base, mm->mmap_legacy_base,
mm->pgd, atomic_read(&mm->mm_users),
atomic_read(&mm->mm_count),
mm_pgtables_bytes(mm),
mm->map_count,
mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm,
(u64)atomic64_read(&mm->pinned_vm),
mm->data_vm, mm->exec_vm, mm->stack_vm,
mm->start_code, mm->end_code, mm->start_data, mm->end_data,
mm->start_brk, mm->brk, mm->start_stack,
mm->arg_start, mm->arg_end, mm->env_start, mm->env_end,
mm->binfmt, mm->flags,
#ifdef CONFIG_AIO
mm->ioctx_table,
#endif
#ifdef CONFIG_MEMCG
mm->owner,
#endif
mm->exe_file,
#ifdef CONFIG_MMU_NOTIFIER
mm->notifier_subscriptions,
#endif
#ifdef CONFIG_NUMA_BALANCING
mm->numa_next_scan, mm->numa_scan_offset, mm->numa_scan_seq,
#endif
atomic_read(&mm->tlb_flush_pending),
mm->def_flags, &mm->def_flags
);
}
static bool page_init_poisoning __read_mostly = true;
static int __init setup_vm_debug(char *str)
{
bool __page_init_poisoning = true;
/*
* Calling vm_debug with no arguments is equivalent to requesting
* to enable all debugging options we can control.
*/
if (*str++ != '=' || !*str)
goto out;
__page_init_poisoning = false;
if (*str == '-')
goto out;
while (*str) {
switch (tolower(*str)) {
case'p':
__page_init_poisoning = true;
break;
default:
pr_err("vm_debug option '%c' unknown. skipped\n",
*str);
}
str++;
}
out:
if (page_init_poisoning && !__page_init_poisoning)
pr_warn("Page struct poisoning disabled by kernel command line option 'vm_debug'\n");
page_init_poisoning = __page_init_poisoning;
return 1;
}
__setup("vm_debug", setup_vm_debug);
void page_init_poison(struct page *page, size_t size)
{
if (page_init_poisoning)
memset(page, PAGE_POISON_PATTERN, size);
}
#endif /* CONFIG_DEBUG_VM */