From 5cf1cacb49aee39c3e02ae87068fc3c6430659b0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 21 Apr 2016 19:06:48 -0400 Subject: [PATCH 1/2] cgroup, cpuset: replace cpuset_post_attach_flush() with cgroup_subsys->post_attach callback Since e93ad19d0564 ("cpuset: make mm migration asynchronous"), cpuset kicks off asynchronous NUMA node migration if necessary during task migration and flushes it from cpuset_post_attach_flush() which is called at the end of __cgroup_procs_write(). This is to avoid performing migration with cgroup_threadgroup_rwsem write-locked which can lead to deadlock through dependency on kworker creation. memcg has a similar issue with charge moving, so let's convert it to an official callback rather than the current one-off cpuset specific function. This patch adds cgroup_subsys->post_attach callback and makes cpuset register cpuset_post_attach_flush() as its ->post_attach. The conversion is mostly one-to-one except that the new callback is called under cgroup_mutex. This is to guarantee that no other migration operations are started before ->post_attach callbacks are finished. cgroup_mutex is one of the outermost mutex in the system and has never been and shouldn't be a problem. We can add specialized synchronization around __cgroup_procs_write() but I don't think there's any noticeable benefit. Signed-off-by: Tejun Heo Cc: Li Zefan Cc: Johannes Weiner Cc: Michal Hocko Cc: # 4.4+ prerequisite for the next patch --- include/linux/cgroup-defs.h | 1 + include/linux/cpuset.h | 6 ------ kernel/cgroup.c | 7 +++++-- kernel/cpuset.c | 4 ++-- 4 files changed, 8 insertions(+), 10 deletions(-) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 3e39ae5bc799..5b17de62c962 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -444,6 +444,7 @@ struct cgroup_subsys { int (*can_attach)(struct cgroup_taskset *tset); void (*cancel_attach)(struct cgroup_taskset *tset); void (*attach)(struct cgroup_taskset *tset); + void (*post_attach)(void); int (*can_fork)(struct task_struct *task); void (*cancel_fork)(struct task_struct *task); void (*fork)(struct task_struct *task); diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index fea160ee5803..85a868ccb493 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -137,8 +137,6 @@ static inline void set_mems_allowed(nodemask_t nodemask) task_unlock(current); } -extern void cpuset_post_attach_flush(void); - #else /* !CONFIG_CPUSETS */ static inline bool cpusets_enabled(void) { return false; } @@ -245,10 +243,6 @@ static inline bool read_mems_allowed_retry(unsigned int seq) return false; } -static inline void cpuset_post_attach_flush(void) -{ -} - #endif /* !CONFIG_CPUSETS */ #endif /* _LINUX_CPUSET_H */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 671dc05c0b0f..909a7d31ffd3 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2825,9 +2825,10 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off, bool threadgroup) { struct task_struct *tsk; + struct cgroup_subsys *ss; struct cgroup *cgrp; pid_t pid; - int ret; + int ssid, ret; if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) return -EINVAL; @@ -2875,8 +2876,10 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, rcu_read_unlock(); out_unlock_threadgroup: percpu_up_write(&cgroup_threadgroup_rwsem); + for_each_subsys(ss, ssid) + if (ss->post_attach) + ss->post_attach(); cgroup_kn_unlock(of->kn); - cpuset_post_attach_flush(); return ret ?: nbytes; } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 00ab5c2b7c5b..1902956baba1 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -58,7 +58,6 @@ #include #include #include -#include #include #include @@ -1016,7 +1015,7 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, } } -void cpuset_post_attach_flush(void) +static void cpuset_post_attach(void) { flush_workqueue(cpuset_migrate_mm_wq); } @@ -2087,6 +2086,7 @@ struct cgroup_subsys cpuset_cgrp_subsys = { .can_attach = cpuset_can_attach, .cancel_attach = cpuset_cancel_attach, .attach = cpuset_attach, + .post_attach = cpuset_post_attach, .bind = cpuset_bind, .legacy_cftypes = files, .early_init = true, From 264a0ae164bc0e9144bebcd25ff030d067b1a878 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 21 Apr 2016 19:09:02 -0400 Subject: [PATCH 2/2] memcg: relocate charge moving from ->attach to ->post_attach Hello, So, this ended up a lot simpler than I originally expected. I tested it lightly and it seems to work fine. Petr, can you please test these two patches w/o the lru drain drop patch and see whether the problem is gone? Thanks. ------ 8< ------ If charge moving is used, memcg performs relabeling of the affected pages from its ->attach callback which is called under both cgroup_threadgroup_rwsem and thus can't create new kthreads. This is fragile as various operations may depend on workqueues making forward progress which relies on the ability to create new kthreads. There's no reason to perform charge moving from ->attach which is deep in the task migration path. Move it to ->post_attach which is called after the actual migration is finished and cgroup_threadgroup_rwsem is dropped. * move_charge_struct->mm is added and ->can_attach is now responsible for pinning and recording the target mm. mem_cgroup_clear_mc() is updated accordingly. This also simplifies mem_cgroup_move_task(). * mem_cgroup_move_task() is now called from ->post_attach instead of ->attach. Signed-off-by: Tejun Heo Cc: Johannes Weiner Acked-by: Michal Hocko Debugged-and-tested-by: Petr Mladek Reported-by: Cyril Hrubis Reported-by: Johannes Weiner Fixes: 1ed1328792ff ("sched, cgroup: replace signal_struct->group_rwsem with a global percpu_rwsem") Cc: # 4.4+ --- mm/memcontrol.c | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 36db05fa8acb..fe787f5c41bd 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -207,6 +207,7 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); /* "mc" and its members are protected by cgroup_mutex */ static struct move_charge_struct { spinlock_t lock; /* for from, to */ + struct mm_struct *mm; struct mem_cgroup *from; struct mem_cgroup *to; unsigned long flags; @@ -4667,6 +4668,8 @@ static void __mem_cgroup_clear_mc(void) static void mem_cgroup_clear_mc(void) { + struct mm_struct *mm = mc.mm; + /* * we must clear moving_task before waking up waiters at the end of * task migration. @@ -4676,7 +4679,10 @@ static void mem_cgroup_clear_mc(void) spin_lock(&mc.lock); mc.from = NULL; mc.to = NULL; + mc.mm = NULL; spin_unlock(&mc.lock); + + mmput(mm); } static int mem_cgroup_can_attach(struct cgroup_taskset *tset) @@ -4733,6 +4739,7 @@ static int mem_cgroup_can_attach(struct cgroup_taskset *tset) VM_BUG_ON(mc.moved_swap); spin_lock(&mc.lock); + mc.mm = mm; mc.from = from; mc.to = memcg; mc.flags = move_flags; @@ -4742,8 +4749,9 @@ static int mem_cgroup_can_attach(struct cgroup_taskset *tset) ret = mem_cgroup_precharge_mc(mm); if (ret) mem_cgroup_clear_mc(); + } else { + mmput(mm); } - mmput(mm); return ret; } @@ -4852,11 +4860,11 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, return ret; } -static void mem_cgroup_move_charge(struct mm_struct *mm) +static void mem_cgroup_move_charge(void) { struct mm_walk mem_cgroup_move_charge_walk = { .pmd_entry = mem_cgroup_move_charge_pte_range, - .mm = mm, + .mm = mc.mm, }; lru_add_drain_all(); @@ -4868,7 +4876,7 @@ static void mem_cgroup_move_charge(struct mm_struct *mm) atomic_inc(&mc.from->moving_account); synchronize_rcu(); retry: - if (unlikely(!down_read_trylock(&mm->mmap_sem))) { + if (unlikely(!down_read_trylock(&mc.mm->mmap_sem))) { /* * Someone who are holding the mmap_sem might be waiting in * waitq. So we cancel all extra charges, wake up all waiters, @@ -4885,23 +4893,16 @@ static void mem_cgroup_move_charge(struct mm_struct *mm) * additional charge, the page walk just aborts. */ walk_page_range(0, ~0UL, &mem_cgroup_move_charge_walk); - up_read(&mm->mmap_sem); + up_read(&mc.mm->mmap_sem); atomic_dec(&mc.from->moving_account); } -static void mem_cgroup_move_task(struct cgroup_taskset *tset) +static void mem_cgroup_move_task(void) { - struct cgroup_subsys_state *css; - struct task_struct *p = cgroup_taskset_first(tset, &css); - struct mm_struct *mm = get_task_mm(p); - - if (mm) { - if (mc.to) - mem_cgroup_move_charge(mm); - mmput(mm); - } - if (mc.to) + if (mc.to) { + mem_cgroup_move_charge(); mem_cgroup_clear_mc(); + } } #else /* !CONFIG_MMU */ static int mem_cgroup_can_attach(struct cgroup_taskset *tset) @@ -4911,7 +4912,7 @@ static int mem_cgroup_can_attach(struct cgroup_taskset *tset) static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset) { } -static void mem_cgroup_move_task(struct cgroup_taskset *tset) +static void mem_cgroup_move_task(void) { } #endif @@ -5195,7 +5196,7 @@ struct cgroup_subsys memory_cgrp_subsys = { .css_reset = mem_cgroup_css_reset, .can_attach = mem_cgroup_can_attach, .cancel_attach = mem_cgroup_cancel_attach, - .attach = mem_cgroup_move_task, + .post_attach = mem_cgroup_move_task, .bind = mem_cgroup_bind, .dfl_cftypes = memory_files, .legacy_cftypes = mem_cgroup_legacy_files,