mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
synced 2024-10-30 08:02:30 +00:00
c22d70a162
Asynchronously try to release dying cgwbs by switching attached inodes to the nearest living ancestor wb. It helps to get rid of per-cgroup writeback structures themselves and of pinned memory and block cgroups, which are significantly larger structures (mostly due to large per-cpu statistics data). This prevents memory waste and helps to avoid different scalability problems caused by large piles of dying cgroups. Reuse the existing mechanism of inode switching used for foreign inode detection. To speed things up batch up to 115 inode switching in a single operation (the maximum number is selected so that the resulting struct inode_switch_wbs_context can fit into 1024 bytes). Because every switching consists of two steps divided by an RCU grace period, it would be too slow without batching. Please note that the whole batch counts as a single operation (when increasing/decreasing isw_nr_in_flight). This allows to keep umounting working (flush the switching queue), however prevents cleanups from consuming the whole switching quota and effectively blocking the frn switching. A cgwb cleanup operation can fail due to different reasons (e.g. not enough memory, the cgwb has an in-flight/pending io, an attached inode in a wrong state, etc). In this case the next scheduled cleanup will make a new attempt. An attempt is made each time a new cgwb is offlined (in other words a memcg and/or a blkcg is deleted by a user). In the future an additional attempt scheduled by a timer can be implemented. [guro@fb.com: replace open-coded "115" with arithmetic] Link: https://lkml.kernel.org/r/YMEcSBcq/VXMiPPO@carbon.dhcp.thefacebook.com [guro@fb.com: add smp_mb() to inode_prepare_wbs_switch()] Link: https://lkml.kernel.org/r/YMFa+guFw7OFjf3X@carbon.dhcp.thefacebook.com [willy@infradead.org: fix documentation] Link: https://lkml.kernel.org/r/20210615200242.1716568-2-willy@infradead.org Link: https://lkml.kernel.org/r/20210608230225.2078447-9-guro@fb.com Signed-off-by: Roman Gushchin <guro@fb.com> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org> Acked-by: Tejun Heo <tj@kernel.org> Acked-by: Dennis Zhou <dennis@kernel.org> Reviewed-by: Jan Kara <jack@suse.cz> Cc: Alexander Viro <viro@zeniv.linux.org.uk> Cc: Dave Chinner <dchinner@redhat.com> Cc: Jan Kara <jack@suse.com> Cc: Jens Axboe <axboe@kernel.dk> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
306 lines
8.4 KiB
C
306 lines
8.4 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
#ifndef __LINUX_BACKING_DEV_DEFS_H
|
|
#define __LINUX_BACKING_DEV_DEFS_H
|
|
|
|
#include <linux/list.h>
|
|
#include <linux/radix-tree.h>
|
|
#include <linux/rbtree.h>
|
|
#include <linux/spinlock.h>
|
|
#include <linux/percpu_counter.h>
|
|
#include <linux/percpu-refcount.h>
|
|
#include <linux/flex_proportions.h>
|
|
#include <linux/timer.h>
|
|
#include <linux/workqueue.h>
|
|
#include <linux/kref.h>
|
|
#include <linux/refcount.h>
|
|
|
|
struct page;
|
|
struct device;
|
|
struct dentry;
|
|
|
|
/*
|
|
* Bits in bdi_writeback.state
|
|
*/
|
|
enum wb_state {
|
|
WB_registered, /* bdi_register() was done */
|
|
WB_writeback_running, /* Writeback is in progress */
|
|
WB_has_dirty_io, /* Dirty inodes on ->b_{dirty|io|more_io} */
|
|
WB_start_all, /* nr_pages == 0 (all) work pending */
|
|
};
|
|
|
|
enum wb_congested_state {
|
|
WB_async_congested, /* The async (write) queue is getting full */
|
|
WB_sync_congested, /* The sync queue is getting full */
|
|
};
|
|
|
|
enum wb_stat_item {
|
|
WB_RECLAIMABLE,
|
|
WB_WRITEBACK,
|
|
WB_DIRTIED,
|
|
WB_WRITTEN,
|
|
NR_WB_STAT_ITEMS
|
|
};
|
|
|
|
#define WB_STAT_BATCH (8*(1+ilog2(nr_cpu_ids)))
|
|
|
|
/*
|
|
* why some writeback work was initiated
|
|
*/
|
|
enum wb_reason {
|
|
WB_REASON_BACKGROUND,
|
|
WB_REASON_VMSCAN,
|
|
WB_REASON_SYNC,
|
|
WB_REASON_PERIODIC,
|
|
WB_REASON_LAPTOP_TIMER,
|
|
WB_REASON_FS_FREE_SPACE,
|
|
/*
|
|
* There is no bdi forker thread any more and works are done
|
|
* by emergency worker, however, this is TPs userland visible
|
|
* and we'll be exposing exactly the same information,
|
|
* so it has a mismatch name.
|
|
*/
|
|
WB_REASON_FORKER_THREAD,
|
|
WB_REASON_FOREIGN_FLUSH,
|
|
|
|
WB_REASON_MAX,
|
|
};
|
|
|
|
struct wb_completion {
|
|
atomic_t cnt;
|
|
wait_queue_head_t *waitq;
|
|
};
|
|
|
|
#define __WB_COMPLETION_INIT(_waitq) \
|
|
(struct wb_completion){ .cnt = ATOMIC_INIT(1), .waitq = (_waitq) }
|
|
|
|
/*
|
|
* If one wants to wait for one or more wb_writeback_works, each work's
|
|
* ->done should be set to a wb_completion defined using the following
|
|
* macro. Once all work items are issued with wb_queue_work(), the caller
|
|
* can wait for the completion of all using wb_wait_for_completion(). Work
|
|
* items which are waited upon aren't freed automatically on completion.
|
|
*/
|
|
#define WB_COMPLETION_INIT(bdi) __WB_COMPLETION_INIT(&(bdi)->wb_waitq)
|
|
|
|
#define DEFINE_WB_COMPLETION(cmpl, bdi) \
|
|
struct wb_completion cmpl = WB_COMPLETION_INIT(bdi)
|
|
|
|
/*
|
|
* Each wb (bdi_writeback) can perform writeback operations, is measured
|
|
* and throttled, independently. Without cgroup writeback, each bdi
|
|
* (bdi_writeback) is served by its embedded bdi->wb.
|
|
*
|
|
* On the default hierarchy, blkcg implicitly enables memcg. This allows
|
|
* using memcg's page ownership for attributing writeback IOs, and every
|
|
* memcg - blkcg combination can be served by its own wb by assigning a
|
|
* dedicated wb to each memcg, which enables isolation across different
|
|
* cgroups and propagation of IO back pressure down from the IO layer upto
|
|
* the tasks which are generating the dirty pages to be written back.
|
|
*
|
|
* A cgroup wb is indexed on its bdi by the ID of the associated memcg,
|
|
* refcounted with the number of inodes attached to it, and pins the memcg
|
|
* and the corresponding blkcg. As the corresponding blkcg for a memcg may
|
|
* change as blkcg is disabled and enabled higher up in the hierarchy, a wb
|
|
* is tested for blkcg after lookup and removed from index on mismatch so
|
|
* that a new wb for the combination can be created.
|
|
*/
|
|
struct bdi_writeback {
|
|
struct backing_dev_info *bdi; /* our parent bdi */
|
|
|
|
unsigned long state; /* Always use atomic bitops on this */
|
|
unsigned long last_old_flush; /* last old data flush */
|
|
|
|
struct list_head b_dirty; /* dirty inodes */
|
|
struct list_head b_io; /* parked for writeback */
|
|
struct list_head b_more_io; /* parked for more writeback */
|
|
struct list_head b_dirty_time; /* time stamps are dirty */
|
|
spinlock_t list_lock; /* protects the b_* lists */
|
|
|
|
struct percpu_counter stat[NR_WB_STAT_ITEMS];
|
|
|
|
unsigned long congested; /* WB_[a]sync_congested flags */
|
|
|
|
unsigned long bw_time_stamp; /* last time write bw is updated */
|
|
unsigned long dirtied_stamp;
|
|
unsigned long written_stamp; /* pages written at bw_time_stamp */
|
|
unsigned long write_bandwidth; /* the estimated write bandwidth */
|
|
unsigned long avg_write_bandwidth; /* further smoothed write bw, > 0 */
|
|
|
|
/*
|
|
* The base dirty throttle rate, re-calculated on every 200ms.
|
|
* All the bdi tasks' dirty rate will be curbed under it.
|
|
* @dirty_ratelimit tracks the estimated @balanced_dirty_ratelimit
|
|
* in small steps and is much more smooth/stable than the latter.
|
|
*/
|
|
unsigned long dirty_ratelimit;
|
|
unsigned long balanced_dirty_ratelimit;
|
|
|
|
struct fprop_local_percpu completions;
|
|
int dirty_exceeded;
|
|
enum wb_reason start_all_reason;
|
|
|
|
spinlock_t work_lock; /* protects work_list & dwork scheduling */
|
|
struct list_head work_list;
|
|
struct delayed_work dwork; /* work item used for writeback */
|
|
|
|
unsigned long dirty_sleep; /* last wait */
|
|
|
|
struct list_head bdi_node; /* anchored at bdi->wb_list */
|
|
|
|
#ifdef CONFIG_CGROUP_WRITEBACK
|
|
struct percpu_ref refcnt; /* used only for !root wb's */
|
|
struct fprop_local_percpu memcg_completions;
|
|
struct cgroup_subsys_state *memcg_css; /* the associated memcg */
|
|
struct cgroup_subsys_state *blkcg_css; /* and blkcg */
|
|
struct list_head memcg_node; /* anchored at memcg->cgwb_list */
|
|
struct list_head blkcg_node; /* anchored at blkcg->cgwb_list */
|
|
struct list_head b_attached; /* attached inodes, protected by list_lock */
|
|
struct list_head offline_node; /* anchored at offline_cgwbs */
|
|
|
|
union {
|
|
struct work_struct release_work;
|
|
struct rcu_head rcu;
|
|
};
|
|
#endif
|
|
};
|
|
|
|
struct backing_dev_info {
|
|
u64 id;
|
|
struct rb_node rb_node; /* keyed by ->id */
|
|
struct list_head bdi_list;
|
|
unsigned long ra_pages; /* max readahead in PAGE_SIZE units */
|
|
unsigned long io_pages; /* max allowed IO size */
|
|
|
|
struct kref refcnt; /* Reference counter for the structure */
|
|
unsigned int capabilities; /* Device capabilities */
|
|
unsigned int min_ratio;
|
|
unsigned int max_ratio, max_prop_frac;
|
|
|
|
/*
|
|
* Sum of avg_write_bw of wbs with dirty inodes. > 0 if there are
|
|
* any dirty wbs, which is depended upon by bdi_has_dirty().
|
|
*/
|
|
atomic_long_t tot_write_bandwidth;
|
|
|
|
struct bdi_writeback wb; /* the root writeback info for this bdi */
|
|
struct list_head wb_list; /* list of all wbs */
|
|
#ifdef CONFIG_CGROUP_WRITEBACK
|
|
struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */
|
|
struct mutex cgwb_release_mutex; /* protect shutdown of wb structs */
|
|
struct rw_semaphore wb_switch_rwsem; /* no cgwb switch while syncing */
|
|
#endif
|
|
wait_queue_head_t wb_waitq;
|
|
|
|
struct device *dev;
|
|
char dev_name[64];
|
|
struct device *owner;
|
|
|
|
struct timer_list laptop_mode_wb_timer;
|
|
|
|
#ifdef CONFIG_DEBUG_FS
|
|
struct dentry *debug_dir;
|
|
#endif
|
|
};
|
|
|
|
enum {
|
|
BLK_RW_ASYNC = 0,
|
|
BLK_RW_SYNC = 1,
|
|
};
|
|
|
|
void clear_bdi_congested(struct backing_dev_info *bdi, int sync);
|
|
void set_bdi_congested(struct backing_dev_info *bdi, int sync);
|
|
|
|
struct wb_lock_cookie {
|
|
bool locked;
|
|
unsigned long flags;
|
|
};
|
|
|
|
#ifdef CONFIG_CGROUP_WRITEBACK
|
|
|
|
/**
|
|
* wb_tryget - try to increment a wb's refcount
|
|
* @wb: bdi_writeback to get
|
|
*/
|
|
static inline bool wb_tryget(struct bdi_writeback *wb)
|
|
{
|
|
if (wb != &wb->bdi->wb)
|
|
return percpu_ref_tryget(&wb->refcnt);
|
|
return true;
|
|
}
|
|
|
|
/**
|
|
* wb_get - increment a wb's refcount
|
|
* @wb: bdi_writeback to get
|
|
*/
|
|
static inline void wb_get(struct bdi_writeback *wb)
|
|
{
|
|
if (wb != &wb->bdi->wb)
|
|
percpu_ref_get(&wb->refcnt);
|
|
}
|
|
|
|
/**
|
|
* wb_put - decrement a wb's refcount
|
|
* @wb: bdi_writeback to put
|
|
* @nr: number of references to put
|
|
*/
|
|
static inline void wb_put_many(struct bdi_writeback *wb, unsigned long nr)
|
|
{
|
|
if (WARN_ON_ONCE(!wb->bdi)) {
|
|
/*
|
|
* A driver bug might cause a file to be removed before bdi was
|
|
* initialized.
|
|
*/
|
|
return;
|
|
}
|
|
|
|
if (wb != &wb->bdi->wb)
|
|
percpu_ref_put_many(&wb->refcnt, nr);
|
|
}
|
|
|
|
/**
|
|
* wb_put - decrement a wb's refcount
|
|
* @wb: bdi_writeback to put
|
|
*/
|
|
static inline void wb_put(struct bdi_writeback *wb)
|
|
{
|
|
wb_put_many(wb, 1);
|
|
}
|
|
|
|
/**
|
|
* wb_dying - is a wb dying?
|
|
* @wb: bdi_writeback of interest
|
|
*
|
|
* Returns whether @wb is unlinked and being drained.
|
|
*/
|
|
static inline bool wb_dying(struct bdi_writeback *wb)
|
|
{
|
|
return percpu_ref_is_dying(&wb->refcnt);
|
|
}
|
|
|
|
#else /* CONFIG_CGROUP_WRITEBACK */
|
|
|
|
static inline bool wb_tryget(struct bdi_writeback *wb)
|
|
{
|
|
return true;
|
|
}
|
|
|
|
static inline void wb_get(struct bdi_writeback *wb)
|
|
{
|
|
}
|
|
|
|
static inline void wb_put(struct bdi_writeback *wb)
|
|
{
|
|
}
|
|
|
|
static inline void wb_put_many(struct bdi_writeback *wb, unsigned long nr)
|
|
{
|
|
}
|
|
|
|
static inline bool wb_dying(struct bdi_writeback *wb)
|
|
{
|
|
return false;
|
|
}
|
|
|
|
#endif /* CONFIG_CGROUP_WRITEBACK */
|
|
|
|
#endif /* __LINUX_BACKING_DEV_DEFS_H */
|