linux-stable/include/linux/list_lru.h
Nhat Pham 0a97c01cd2 list_lru: allow explicit memcg and NUMA node selection
Patch series "workload-specific and memory pressure-driven zswap
writeback", v8.

There are currently several issues with zswap writeback:

1. There is only a single global LRU for zswap, making it impossible to
   perform workload-specific shrinking - a memcg under memory pressure
   cannot determine which pages in the pool it owns, and often ends up
   writing pages from other memcgs. This issue has been previously
   observed in practice and mitigated by simply disabling
   memcg-initiated shrinking:

   https://lore.kernel.org/all/20230530232435.3097106-1-nphamcs@gmail.com/T/#u

   But this solution leaves a lot to be desired, as we still do not
   have an avenue for a memcg to free up its own memory locked up in
   the zswap pool.

2. We only shrink the zswap pool when the user-defined limit is hit.
   This means that if we set the limit too high, cold data that are
   unlikely to be used again will reside in the pool, wasting precious
   memory. It is hard to predict how much zswap space will be needed
   ahead of time, as this depends on the workload (specifically, on
   factors such as memory access patterns and compressibility of the
   memory pages).

This patch series solves these issues by separating the global zswap LRU
into per-memcg and per-NUMA LRUs, and performing workload-specific (i.e.
memcg- and NUMA-aware) zswap writeback under memory pressure.  The new
shrinker does not have any parameter that must be tuned by the user, and
can be opted in or out on a per-memcg basis.

As a proof of concept, we ran the following synthetic benchmark: build the
Linux kernel in a memory-limited cgroup, and allocate some cold data in
tmpfs to see if the shrinker could write them out and improve the overall
performance.  Depending on the amount of cold data generated, we observed
a 14% to 35% reduction in kernel CPU time used in the kernel builds.


This patch (of 6):

The list_lru interface is based on the assumption that the list node and
the data it represents belong to the same object, allocated on the correct
NUMA node and memcg.  While this assumption is valid for existing slab
object LRUs, such as those of dentries and inodes, it is undocumented and
rather inflexible for certain potential list_lru users (such as the
upcoming zswap shrinker and the THP shrinker).  It has caused us a lot of
issues during our development.

This patch changes the list_lru interface so that the caller must
explicitly specify the NUMA node and memcg when adding and removing
objects.  The old list_lru_add() and list_lru_del() are renamed to
list_lru_add_obj() and list_lru_del_obj(), respectively.
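
For illustration only (nothing below is taken from the patch itself: lru,
entry, entry_to_nid() and mem_cgroup_from_entry() are hypothetical names
for a caller that tracks its objects' placement), switching from the
implicit variant to the explicit one looks roughly like this:

	/* old behaviour: node/memcg derived from the item's allocation */
	list_lru_add_obj(lru, &entry->lru_node);

	/* new interface: the caller names the sublist explicitly */
	list_lru_add(lru, &entry->lru_node, entry_to_nid(entry),
		     mem_cgroup_from_entry(entry));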

It also extends the list_lru API with a new function, list_lru_putback,
which undoes a previous list_lru_isolate call.  Unlike list_lru_add, it
does not increment the LRU node count (as list_lru_isolate does not
decrement the node count).  list_lru_putback also allows for explicit
memcg and NUMA node selection.
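
As a sketch (again not part of the patch; my_walk_arg, my_reclaim_cb and
my_reclaim_one are made-up names), a walk callback can now isolate an item,
drop the LRU lock to operate on it, and return the item to the sublist it
came from when the operation fails:

	struct my_walk_arg {
		struct list_lru *lru;
		int nid;
		struct mem_cgroup *memcg;
	};

	static enum lru_status my_reclaim_cb(struct list_head *item,
					     struct list_lru_one *l,
					     spinlock_t *lock, void *cb_arg)
	{
		struct my_walk_arg *arg = cb_arg;

		/* removes the item; the node count is left untouched */
		list_lru_isolate(l, item);
		spin_unlock(lock);

		if (my_reclaim_one(item)) {
			/* item was freed; let the walker drop the node count */
			spin_lock(lock);
			return LRU_REMOVED_RETRY;
		}

		/* failed: put the item back without bumping the node count */
		list_lru_putback(arg->lru, item, arg->nid, arg->memcg);
		spin_lock(lock);
		return LRU_RETRY;
	}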

Link: https://lkml.kernel.org/r/20231130194023.4102148-1-nphamcs@gmail.com
Link: https://lkml.kernel.org/r/20231130194023.4102148-2-nphamcs@gmail.com
Signed-off-by: Nhat Pham <nphamcs@gmail.com>
Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Tested-by: Bagas Sanjaya <bagasdotme@gmail.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Dan Streetman <ddstreet@ieee.org>
Cc: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Seth Jennings <sjenning@redhat.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Vitaly Wool <vitaly.wool@konsulko.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-12-12 10:57:01 -08:00


/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (c) 2013 Red Hat, Inc. and Parallels Inc. All rights reserved.
 * Authors: David Chinner and Glauber Costa
 *
 * Generic LRU infrastructure
 */
#ifndef _LRU_LIST_H
#define _LRU_LIST_H

#include <linux/list.h>
#include <linux/nodemask.h>
#include <linux/shrinker.h>
#include <linux/xarray.h>

struct mem_cgroup;

/* list_lru_walk_cb has to always return one of those */
enum lru_status {
	LRU_REMOVED,		/* item removed from list */
	LRU_REMOVED_RETRY,	/* item removed, but lock has been
				   dropped and reacquired */
	LRU_ROTATE,		/* item referenced, give another pass */
	LRU_SKIP,		/* item cannot be locked, skip */
	LRU_RETRY,		/* item not freeable. May drop the lock
				   internally, but has to return locked. */
};

struct list_lru_one {
	struct list_head	list;
	/* may become negative during memcg reparenting */
	long			nr_items;
};

struct list_lru_memcg {
	struct rcu_head		rcu;
	/* array of per cgroup per node lists, indexed by node id */
	struct list_lru_one	node[];
};

struct list_lru_node {
	/* protects all lists on the node, including per cgroup */
	spinlock_t		lock;
	/* global list, used for the root cgroup in cgroup aware lrus */
	struct list_lru_one	lru;
	long			nr_items;
} ____cacheline_aligned_in_smp;

struct list_lru {
	struct list_lru_node	*node;
#ifdef CONFIG_MEMCG_KMEM
	struct list_head	list;
	int			shrinker_id;
	bool			memcg_aware;
	struct xarray		xa;
#endif
};

void list_lru_destroy(struct list_lru *lru);
int __list_lru_init(struct list_lru *lru, bool memcg_aware,
		    struct lock_class_key *key, struct shrinker *shrinker);

#define list_lru_init(lru)			\
	__list_lru_init((lru), false, NULL, NULL)
#define list_lru_init_key(lru, key)		\
	__list_lru_init((lru), false, (key), NULL)
#define list_lru_init_memcg(lru, shrinker)	\
	__list_lru_init((lru), true, NULL, shrinker)

int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru,
			 gfp_t gfp);
void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *parent);

/**
 * list_lru_add: add an element to the lru list's tail
 * @lru: the lru pointer
 * @item: the item to be added.
 * @nid: the node id of the sublist to add the item to.
 * @memcg: the cgroup of the sublist to add the item to.
 *
 * If the element is already part of a list, this function returns without
 * doing anything. Therefore the caller does not need to keep state about
 * whether or not the element already belongs in the list and is allowed to
 * lazily update it. Note however that this is valid for *a* list, not *this*
 * list. If the caller organizes itself in a way that elements can be in more
 * than one type of list, it is up to the caller to fully remove the item from
 * the previous list (with list_lru_del() for instance) before moving it
 * to @lru.
 *
 * Return: true if the list was updated, false otherwise
 */
bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid,
		  struct mem_cgroup *memcg);

/**
 * list_lru_add_obj: add an element to the lru list's tail
 * @lru: the lru pointer
 * @item: the item to be added.
 *
 * This function is similar to list_lru_add(), but the NUMA node and the
 * memcg of the sublist are determined by the @item list_head. This
 * assumption is valid for slab object LRUs such as those of dentries and
 * inodes.
 *
 * Return: true if the list was updated, false otherwise
 */
bool list_lru_add_obj(struct list_lru *lru, struct list_head *item);

/**
 * list_lru_del: delete an element from the lru list
 * @lru: the lru pointer
 * @item: the item to be deleted.
 * @nid: the node id of the sublist to delete the item from.
 * @memcg: the cgroup of the sublist to delete the item from.
 *
 * This function works analogously to list_lru_add() in terms of list
 * manipulation. The comments about an element already belonging to
 * a list are also valid for list_lru_del().
 *
 * Return: true if the list was updated, false otherwise
 */
bool list_lru_del(struct list_lru *lru, struct list_head *item, int nid,
		  struct mem_cgroup *memcg);

/**
 * list_lru_del_obj: delete an element from the lru list
 * @lru: the lru pointer
 * @item: the item to be deleted.
 *
 * This function is similar to list_lru_del(), but the NUMA node and the
 * memcg of the sublist are determined by the @item list_head. This
 * assumption is valid for slab object LRUs such as those of dentries and
 * inodes.
 *
 * Return: true if the list was updated, false otherwise.
 */
bool list_lru_del_obj(struct list_lru *lru, struct list_head *item);

/**
 * list_lru_count_one: return the number of objects currently held by @lru
 * @lru: the lru pointer.
 * @nid: the node id to count from.
 * @memcg: the cgroup to count from.
 *
 * There is no guarantee that the list is not updated while the count is being
 * computed. Callers that want such a guarantee need to provide an outer lock.
 *
 * Return: 0 for empty lists, otherwise the number of objects
 * currently held by @lru.
 */
unsigned long list_lru_count_one(struct list_lru *lru,
				 int nid, struct mem_cgroup *memcg);
unsigned long list_lru_count_node(struct list_lru *lru, int nid);

static inline unsigned long list_lru_shrink_count(struct list_lru *lru,
						  struct shrink_control *sc)
{
	return list_lru_count_one(lru, sc->nid, sc->memcg);
}
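
/*
 * Example (illustrative only; my_lru is a made-up lru): the count side of a
 * memcg- and NUMA-aware shrinker can be built directly on top of
 * list_lru_shrink_count():
 *
 *	static unsigned long my_count(struct shrinker *shrink,
 *				      struct shrink_control *sc)
 *	{
 *		return list_lru_shrink_count(&my_lru, sc);
 *	}
 */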

static inline unsigned long list_lru_count(struct list_lru *lru)
{
	long count = 0;
	int nid;

	for_each_node_state(nid, N_NORMAL_MEMORY)
		count += list_lru_count_node(lru, nid);

	return count;
}

void list_lru_isolate(struct list_lru_one *list, struct list_head *item);
void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item,
			   struct list_head *head);

/**
 * list_lru_putback: undo list_lru_isolate
 * @lru: the lru pointer.
 * @item: the item to put back.
 * @nid: the node id of the sublist to put the item back to.
 * @memcg: the cgroup of the sublist to put the item back to.
 *
 * Put back an isolated item into its original LRU. Note that unlike
 * list_lru_add, this does not increment the node LRU count (as
 * list_lru_isolate does not originally decrement this count).
 *
 * Since we might have dropped the LRU lock in between, recompute list_lru_one
 * from the node's id and memcg.
 */
void list_lru_putback(struct list_lru *lru, struct list_head *item, int nid,
		      struct mem_cgroup *memcg);

typedef enum lru_status (*list_lru_walk_cb)(struct list_head *item,
		struct list_lru_one *list, spinlock_t *lock, void *cb_arg);

/**
 * list_lru_walk_one: walk a @lru, isolating and disposing freeable items.
 * @lru: the lru pointer.
 * @nid: the node id to scan from.
 * @memcg: the cgroup to scan from.
 * @isolate: callback function that is responsible for deciding what to do with
 *  the item currently being scanned
 * @cb_arg: opaque type that will be passed to @isolate
 * @nr_to_walk: how many items to scan.
 *
 * This function will scan all elements in a particular @lru, calling the
 * @isolate callback for each of those items, along with the current list
 * spinlock and a caller-provided opaque. The @isolate callback can choose to
 * drop the lock internally, but *must* return with the lock held. The callback
 * will return an enum lru_status telling the @lru infrastructure what to
 * do with the object being scanned.
 *
 * Please note that @nr_to_walk does not mean how many objects will be freed,
 * just how many objects will be scanned.
 *
 * Return: the number of objects effectively removed from the LRU.
 */
unsigned long list_lru_walk_one(struct list_lru *lru,
				int nid, struct mem_cgroup *memcg,
				list_lru_walk_cb isolate, void *cb_arg,
				unsigned long *nr_to_walk);
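
/*
 * Example (an illustrative sketch; my_isolate and my_object_is_busy() are
 * made-up names): a typical @isolate callback moves freeable items onto a
 * private dispose list passed in through @cb_arg and lets the caller free
 * them once the walk is done:
 *
 *	static enum lru_status my_isolate(struct list_head *item,
 *					  struct list_lru_one *l,
 *					  spinlock_t *lock, void *cb_arg)
 *	{
 *		struct list_head *dispose = cb_arg;
 *
 *		if (my_object_is_busy(item))
 *			return LRU_SKIP;
 *
 *		list_lru_isolate_move(l, item, dispose);
 *		return LRU_REMOVED;
 *	}
 */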

/**
 * list_lru_walk_one_irq: walk a @lru, isolating and disposing freeable items.
 * @lru: the lru pointer.
 * @nid: the node id to scan from.
 * @memcg: the cgroup to scan from.
 * @isolate: callback function that is responsible for deciding what to do with
 *  the item currently being scanned
 * @cb_arg: opaque type that will be passed to @isolate
 * @nr_to_walk: how many items to scan.
 *
 * Same as list_lru_walk_one() except that the spinlock is acquired with
 * spin_lock_irq().
 */
unsigned long list_lru_walk_one_irq(struct list_lru *lru,
				    int nid, struct mem_cgroup *memcg,
				    list_lru_walk_cb isolate, void *cb_arg,
				    unsigned long *nr_to_walk);
unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
				 list_lru_walk_cb isolate, void *cb_arg,
				 unsigned long *nr_to_walk);

static inline unsigned long
list_lru_shrink_walk(struct list_lru *lru, struct shrink_control *sc,
		     list_lru_walk_cb isolate, void *cb_arg)
{
	return list_lru_walk_one(lru, sc->nid, sc->memcg, isolate, cb_arg,
				 &sc->nr_to_scan);
}

static inline unsigned long
list_lru_shrink_walk_irq(struct list_lru *lru, struct shrink_control *sc,
			 list_lru_walk_cb isolate, void *cb_arg)
{
	return list_lru_walk_one_irq(lru, sc->nid, sc->memcg, isolate, cb_arg,
				     &sc->nr_to_scan);
}
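
/*
 * Example (illustrative, pairing with the my_count()/my_isolate() sketches
 * above; my_lru and my_dispose_list() are made up): the scan side of such a
 * shrinker just forwards the shrink_control and frees whatever was isolated:
 *
 *	static unsigned long my_scan(struct shrinker *shrink,
 *				     struct shrink_control *sc)
 *	{
 *		LIST_HEAD(dispose);
 *		unsigned long freed;
 *
 *		freed = list_lru_shrink_walk(&my_lru, sc, my_isolate, &dispose);
 *		my_dispose_list(&dispose);
 *		return freed;
 *	}
 */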

static inline unsigned long
list_lru_walk(struct list_lru *lru, list_lru_walk_cb isolate,
	      void *cb_arg, unsigned long nr_to_walk)
{
	long isolated = 0;
	int nid;

	for_each_node_state(nid, N_NORMAL_MEMORY) {
		isolated += list_lru_walk_node(lru, nid, isolate,
					       cb_arg, &nr_to_walk);
		if (nr_to_walk <= 0)
			break;
	}
	return isolated;
}
#endif	/* _LRU_LIST_H */