linux-stable/mm/page_idle.c
Minchan Kim 6d4675e601 mm: don't be stuck to rmap lock on reclaim path
The rmap locks(i_mmap_rwsem and anon_vma->root->rwsem) could be contended
under memory pressure if processes keep working on their vmas(e.g., fork,
mmap, munmap).  It makes reclaim path stuck.  In our real workload traces,
we see kswapd is waiting the lock for 300ms+(worst case, a sec) and it
makes other processes entering direct reclaim, which were also stuck on
the lock.

This patch makes lru aging path try_lock mode like shink_page_list so the
reclaim context will keep working with next lru pages without being stuck.
if it found the rmap lock contended, it rotates the page back to head of
lru in both active/inactive lrus to make them consistent behavior, which
is basic starting point rather than adding more heristic.

Since this patch introduces a new "contended" field as out-param along
with try_lock in-param in rmap_walk_control, it's not immutable any longer
if the try_lock is set so remove const keywords on rmap related functions.
Since rmap walking is already expensive operation, I doubt the const
would help sizable benefit( And we didn't have it until 5.17).

In a heavy app workload in Android, trace shows following statistics.  It
almost removes rmap lock contention from reclaim path.

Martin Liu reported:

Before:

   max_dur(ms)  min_dur(ms)  max-min(dur)ms  avg_dur(ms)  sum_dur(ms)  count blocked_function
         1632            0            1631   151.542173        31672    209  page_lock_anon_vma_read
          601            0             601   145.544681        28817    198  rmap_walk_file

After:

   max_dur(ms)  min_dur(ms)  max-min(dur)ms  avg_dur(ms)  sum_dur(ms)  count blocked_function
          NaN          NaN              NaN          NaN          NaN    0.0             NaN
            0            0                0     0.127645            1     12  rmap_walk_file

[minchan@kernel.org: add comment, per Matthew]
  Link: https://lkml.kernel.org/r/YnNqeB5tUf6LZ57b@google.com
Link: https://lkml.kernel.org/r/20220510215423.164547-1-minchan@kernel.org
Signed-off-by: Minchan Kim <minchan@kernel.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: John Dias <joaodias@google.com>
Cc: Tim Murray <timmurray@google.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Martin Liu <liumartin@google.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2022-05-19 14:08:54 -07:00

220 lines
5.2 KiB
C

// SPDX-License-Identifier: GPL-2.0
#include <linux/init.h>
#include <linux/memblock.h>
#include <linux/fs.h>
#include <linux/sysfs.h>
#include <linux/kobject.h>
#include <linux/memory_hotplug.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/mmu_notifier.h>
#include <linux/page_ext.h>
#include <linux/page_idle.h>
#include "internal.h"
#define BITMAP_CHUNK_SIZE sizeof(u64)
#define BITMAP_CHUNK_BITS (BITMAP_CHUNK_SIZE * BITS_PER_BYTE)
/*
* Idle page tracking only considers user memory pages, for other types of
* pages the idle flag is always unset and an attempt to set it is silently
* ignored.
*
* We treat a page as a user memory page if it is on an LRU list, because it is
* always safe to pass such a page to rmap_walk(), which is essential for idle
* page tracking. With such an indicator of user pages we can skip isolated
* pages, but since there are not usually many of them, it will hardly affect
* the overall result.
*
* This function tries to get a user memory page by pfn as described above.
*/
static struct page *page_idle_get_page(unsigned long pfn)
{
struct page *page = pfn_to_online_page(pfn);
if (!page || !PageLRU(page) ||
!get_page_unless_zero(page))
return NULL;
if (unlikely(!PageLRU(page))) {
put_page(page);
page = NULL;
}
return page;
}
static bool page_idle_clear_pte_refs_one(struct folio *folio,
struct vm_area_struct *vma,
unsigned long addr, void *arg)
{
DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0);
bool referenced = false;
while (page_vma_mapped_walk(&pvmw)) {
addr = pvmw.address;
if (pvmw.pte) {
/*
* For PTE-mapped THP, one sub page is referenced,
* the whole THP is referenced.
*/
if (ptep_clear_young_notify(vma, addr, pvmw.pte))
referenced = true;
} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
if (pmdp_clear_young_notify(vma, addr, pvmw.pmd))
referenced = true;
} else {
/* unexpected pmd-mapped page? */
WARN_ON_ONCE(1);
}
}
if (referenced) {
folio_clear_idle(folio);
/*
* We cleared the referenced bit in a mapping to this page. To
* avoid interference with page reclaim, mark it young so that
* folio_referenced() will return > 0.
*/
folio_set_young(folio);
}
return true;
}
static void page_idle_clear_pte_refs(struct page *page)
{
struct folio *folio = page_folio(page);
/*
* Since rwc.try_lock is unused, rwc is effectively immutable, so we
* can make it static to save some cycles and stack.
*/
static struct rmap_walk_control rwc = {
.rmap_one = page_idle_clear_pte_refs_one,
.anon_lock = folio_lock_anon_vma_read,
};
bool need_lock;
if (!folio_mapped(folio) || !folio_raw_mapping(folio))
return;
need_lock = !folio_test_anon(folio) || folio_test_ksm(folio);
if (need_lock && !folio_trylock(folio))
return;
rmap_walk(folio, &rwc);
if (need_lock)
folio_unlock(folio);
}
static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj,
struct bin_attribute *attr, char *buf,
loff_t pos, size_t count)
{
u64 *out = (u64 *)buf;
struct page *page;
unsigned long pfn, end_pfn;
int bit;
if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE)
return -EINVAL;
pfn = pos * BITS_PER_BYTE;
if (pfn >= max_pfn)
return 0;
end_pfn = pfn + count * BITS_PER_BYTE;
if (end_pfn > max_pfn)
end_pfn = max_pfn;
for (; pfn < end_pfn; pfn++) {
bit = pfn % BITMAP_CHUNK_BITS;
if (!bit)
*out = 0ULL;
page = page_idle_get_page(pfn);
if (page) {
if (page_is_idle(page)) {
/*
* The page might have been referenced via a
* pte, in which case it is not idle. Clear
* refs and recheck.
*/
page_idle_clear_pte_refs(page);
if (page_is_idle(page))
*out |= 1ULL << bit;
}
put_page(page);
}
if (bit == BITMAP_CHUNK_BITS - 1)
out++;
cond_resched();
}
return (char *)out - buf;
}
static ssize_t page_idle_bitmap_write(struct file *file, struct kobject *kobj,
struct bin_attribute *attr, char *buf,
loff_t pos, size_t count)
{
const u64 *in = (u64 *)buf;
struct page *page;
unsigned long pfn, end_pfn;
int bit;
if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE)
return -EINVAL;
pfn = pos * BITS_PER_BYTE;
if (pfn >= max_pfn)
return -ENXIO;
end_pfn = pfn + count * BITS_PER_BYTE;
if (end_pfn > max_pfn)
end_pfn = max_pfn;
for (; pfn < end_pfn; pfn++) {
bit = pfn % BITMAP_CHUNK_BITS;
if ((*in >> bit) & 1) {
page = page_idle_get_page(pfn);
if (page) {
page_idle_clear_pte_refs(page);
set_page_idle(page);
put_page(page);
}
}
if (bit == BITMAP_CHUNK_BITS - 1)
in++;
cond_resched();
}
return (char *)in - buf;
}
static struct bin_attribute page_idle_bitmap_attr =
__BIN_ATTR(bitmap, 0600,
page_idle_bitmap_read, page_idle_bitmap_write, 0);
static struct bin_attribute *page_idle_bin_attrs[] = {
&page_idle_bitmap_attr,
NULL,
};
static const struct attribute_group page_idle_attr_group = {
.bin_attrs = page_idle_bin_attrs,
.name = "page_idle",
};
static int __init page_idle_init(void)
{
int err;
err = sysfs_create_group(mm_kobj, &page_idle_attr_group);
if (err) {
pr_err("page_idle: register sysfs failed\n");
return err;
}
return 0;
}
subsys_initcall(page_idle_init);