mm: vmscan: make rotations a secondary factor in balancing anon vs file

We noticed a 2% webserver throughput regression after upgrading from 5.6. 
This could be tracked down to a shift in the anon/file reclaim balance
(confirmed with swappiness) that resulted in worse reclaim efficiency and
thus more kswapd activity for the same outcome.

The change that exposed the problem is aae466b005 ("mm/swap: implement
workingset detection for anonymous LRU").  By qualifying swapins based on
their refault distance, it lowered the cost of anon reclaim in this
workload, in turn causing (much) more anon scanning than before.  Scanning
the anon list is more expensive due to the higher ratio of mmapped pages
that may rotate during reclaim, and so the result was an increase in %sys
time.

Right now, rotations aren't considered a cost when balancing scan pressure
between LRUs.  We can end up with very few file refaults putting all the
scan pressure on hot anon pages that are rotated en masse, don't get
reclaimed, and never push back on the file LRU again.  We still only
reclaim file cache in that case, but we burn a lot CPU rotating anon
pages.  It's "fair" from an LRU age POV, but doesn't reflect the real cost
it imposes on the system.

Consider rotations as a secondary factor in balancing the LRUs.  This
doesn't attempt to make a precise comparison between IO cost and CPU cost,
it just says: if reloads are about comparable between the lists, or
rotations are overwhelmingly different, adjust for CPU work.

This fixed the regression on our webservers.  It has since been deployed
to the entire Meta fleet and hasn't caused any problems.

Link: https://lkml.kernel.org/r/20221013193113.726425-1-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Rik van Riel <riel@surriel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
This commit is contained in:
Johannes Weiner 2022-10-13 15:31:13 -04:00 committed by Andrew Morton
parent 57a196a584
commit 0538a82c39
4 changed files with 24 additions and 9 deletions

View File

@ -384,8 +384,9 @@ extern unsigned long totalreserve_pages;
/* linux/mm/swap.c */
void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages);
void lru_note_cost_folio(struct folio *);
void lru_note_cost(struct lruvec *lruvec, bool file,
unsigned int nr_io, unsigned int nr_rotated);
void lru_note_cost_refault(struct folio *);
void folio_add_lru(struct folio *);
void folio_add_lru_vma(struct folio *, struct vm_area_struct *);
void lru_cache_add(struct page *);

View File

@ -295,8 +295,20 @@ void folio_rotate_reclaimable(struct folio *folio)
}
}
void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages)
void lru_note_cost(struct lruvec *lruvec, bool file,
unsigned int nr_io, unsigned int nr_rotated)
{
unsigned long cost;
/*
* Reflect the relative cost of incurring IO and spending CPU
* time on rotations. This doesn't attempt to make a precise
* comparison, it just says: if reloads are about comparable
* between the LRU lists, or rotations are overwhelmingly
* different between them, adjust scan balance for CPU work.
*/
cost = nr_io * SWAP_CLUSTER_MAX + nr_rotated;
do {
unsigned long lrusize;
@ -310,9 +322,9 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages)
spin_lock_irq(&lruvec->lru_lock);
/* Record cost event */
if (file)
lruvec->file_cost += nr_pages;
lruvec->file_cost += cost;
else
lruvec->anon_cost += nr_pages;
lruvec->anon_cost += cost;
/*
* Decay previous events
@ -335,10 +347,10 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages)
} while ((lruvec = parent_lruvec(lruvec)));
}
void lru_note_cost_folio(struct folio *folio)
void lru_note_cost_refault(struct folio *folio)
{
lru_note_cost(folio_lruvec(folio), folio_is_file_lru(folio),
folio_nr_pages(folio));
folio_nr_pages(folio), 0);
}
static void folio_activate_fn(struct lruvec *lruvec, struct folio *folio)

View File

@ -2499,7 +2499,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
__count_vm_events(PGSTEAL_ANON + file, nr_reclaimed);
spin_unlock_irq(&lruvec->lru_lock);
lru_note_cost(lruvec, file, stat.nr_pageout);
lru_note_cost(lruvec, file, stat.nr_pageout, nr_scanned - nr_reclaimed);
mem_cgroup_uncharge_list(&folio_list);
free_unref_page_list(&folio_list);
@ -2639,6 +2639,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
spin_unlock_irq(&lruvec->lru_lock);
if (nr_rotated)
lru_note_cost(lruvec, file, 0, nr_rotated);
mem_cgroup_uncharge_list(&l_active);
free_unref_page_list(&l_active);
trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate,

View File

@ -493,7 +493,7 @@ void workingset_refault(struct folio *folio, void *shadow)
if (workingset) {
folio_set_workingset(folio);
/* XXX: Move to lru_cache_add() when it supports new vs putback */
lru_note_cost_folio(folio);
lru_note_cost_refault(folio);
mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file, nr);
}
out: