linux-stable/include/linux/cpuset.h
Feng Tang 8ca1b5a498 mm/page_alloc: detect allocation forbidden by cpuset and bail out early
There was a report that starting an Ubuntu container in docker while using
cpuset to bind it to movable nodes (a node that only has a movable zone,
like a node for hotplug or a Persistent Memory node in normal usage) fails
due to a memory allocation failure; the OOM killer is then invoked and many
other innocent processes get killed.

It can be reproduced with command:

    $ docker run -it --rm --cpuset-mems 4 ubuntu:latest bash -c "grep Mems_allowed /proc/self/status"

(where node 4 is a movable node)

  runc:[2:INIT] invoked oom-killer: gfp_mask=0x500cc2(GFP_HIGHUSER|__GFP_ACCOUNT), order=0, oom_score_adj=0
  CPU: 8 PID: 8291 Comm: runc:[2:INIT] Tainted: G        W I E     5.8.2-0.g71b519a-default #1 openSUSE Tumbleweed (unreleased)
  Hardware name: Dell Inc. PowerEdge R640/0PHYDR, BIOS 2.6.4 04/09/2020
  Call Trace:
   dump_stack+0x6b/0x88
   dump_header+0x4a/0x1e2
   oom_kill_process.cold+0xb/0x10
   out_of_memory.part.0+0xaf/0x230
   out_of_memory+0x3d/0x80
   __alloc_pages_slowpath.constprop.0+0x954/0xa20
   __alloc_pages_nodemask+0x2d3/0x300
   pipe_write+0x322/0x590
   new_sync_write+0x196/0x1b0
   vfs_write+0x1c3/0x1f0
   ksys_write+0xa7/0xe0
   do_syscall_64+0x52/0xd0
   entry_SYSCALL_64_after_hwframe+0x44/0xa9

  Mem-Info:
  active_anon:392832 inactive_anon:182 isolated_anon:0
   active_file:68130 inactive_file:151527 isolated_file:0
   unevictable:2701 dirty:0 writeback:7
   slab_reclaimable:51418 slab_unreclaimable:116300
   mapped:45825 shmem:735 pagetables:2540 bounce:0
   free:159849484 free_pcp:73 free_cma:0
  Node 4 active_anon:1448kB inactive_anon:0kB active_file:0kB inactive_file:0kB unevictable:0kB isolated(anon):0kB isolated(file):0kB mapped:0kB dirty:0kB writeback:0kB shmem:0kB shmem_thp: 0kB shmem_pmdmapped: 0kB anon_thp: 0kB writeback_tmp:0kB all_unreclaimable? no
  Node 4 Movable free:130021408kB min:9140kB low:139160kB high:269180kB reserved_highatomic:0KB active_anon:1448kB inactive_anon:0kB active_file:0kB inactive_file:0kB unevictable:0kB writepending:0kB present:130023424kB managed:130023424kB mlocked:0kB kernel_stack:0kB pagetables:0kB bounce:0kB free_pcp:292kB local_pcp:84kB free_cma:0kB
  lowmem_reserve[]: 0 0 0 0 0
  Node 4 Movable: 1*4kB (M) 0*8kB 0*16kB 1*32kB (M) 0*64kB 0*128kB 1*256kB (M) 1*512kB (M) 1*1024kB (M) 0*2048kB 31743*4096kB (M) = 130021156kB

  oom-kill:constraint=CONSTRAINT_CPUSET,nodemask=(null),cpuset=docker-9976a269caec812c134fa317f27487ee36e1129beba7278a463dd53e5fb9997b.scope,mems_allowed=4,global_oom,task_memcg=/system.slice/containerd.service,task=containerd,pid=4100,uid=0
  Out of memory: Killed process 4100 (containerd) total-vm:4077036kB, anon-rss:51184kB, file-rss:26016kB, shmem-rss:0kB, UID:0 pgtables:676kB oom_score_adj:0
  oom_reaper: reaped process 8248 (docker), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
  oom_reaper: reaped process 2054 (node_exporter), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
  oom_reaper: reaped process 1452 (systemd-journal), now anon-rss:0kB, file-rss:8564kB, shmem-rss:4kB
  oom_reaper: reaped process 2146 (munin-node), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
  oom_reaper: reaped process 8291 (runc:[2:INIT]), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB

The reason is that in this case the target cpuset nodes only have a
movable zone, while the creation of an OS in docker sometimes needs to
allocate memory from non-movable zones (dma/dma32/normal), e.g. with
GFP_HIGHUSER. The cpuset limit forbids such an allocation, so the OOM
killer is invoked even though both normal nodes and movable nodes have
plenty of free memory.

The OOM killer cannot help to resolve the situation as there is no
usable memory for the request in the cpuset scope.  The only reasonable
measure to take is to fail the allocation right away and have the caller
deal with it.

So add a check for cases like this in the slowpath of allocation, and
bail out early returning NULL for the allocation.

As page allocation is one of the hottest paths in the kernel, this check
would hurt all users with a sane cpuset configuration.  So add a static
branch check and detect the abnormal config during cpuset memory binding
setup, so that the extra check cost in page allocation is not paid by
everyone.

[thanks to Michal Hocko and David Rientjes for suggesting not handling
 it inside OOM code, adding cpuset check, refining comments]

Link: https://lkml.kernel.org/r/1632481657-68112-1-git-send-email-feng.tang@intel.com
Signed-off-by: Feng Tang <feng.tang@intel.com>
Suggested-by: Michal Hocko <mhocko@suse.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Zefan Li <lizefan.x@bytedance.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-11-06 13:30:38 -07:00

304 lines
7.9 KiB
C

/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_CPUSET_H
#define _LINUX_CPUSET_H
/*
* cpuset interface
*
* Copyright (C) 2003 BULL SA
* Copyright (C) 2004-2006 Silicon Graphics, Inc.
*
*/
#include <linux/sched.h>
#include <linux/sched/topology.h>
#include <linux/sched/task.h>
#include <linux/cpumask.h>
#include <linux/nodemask.h>
#include <linux/mm.h>
#include <linux/mmu_context.h>
#include <linux/jump_label.h>
#ifdef CONFIG_CPUSETS
/*
* Static branch rewrites can happen in an arbitrary order for a given
* key. In code paths where we need to loop with read_mems_allowed_begin() and
* read_mems_allowed_retry() to get a consistent view of mems_allowed, we need
* to ensure that begin() always gets rewritten before retry() in the
* disabled -> enabled transition. If not, then if local irqs are disabled
* around the loop, we can deadlock since retry() would always be
* comparing the latest value of the mems_allowed seqcount against 0 as
* begin() still would see cpusets_enabled() as false. The enabled -> disabled
* transition should happen in reverse order for the same reasons (want to stop
* looking at real value of mems_allowed.sequence in retry() first).
*/
/*
 * Tested by read_mems_allowed_begin(). Incremented first in cpuset_inc()
 * and decremented last in cpuset_dec().
 */
extern struct static_key_false cpusets_pre_enable_key;
/* Tested by cpusets_enabled() and read_mems_allowed_retry(). */
extern struct static_key_false cpusets_enabled_key;
/*
 * Tested by cpusets_insane_config(). Not touched by cpuset_inc() or
 * cpuset_dec().
 */
extern struct static_key_false cpusets_insane_config_key;
/*
 * cpusets_enabled - test the cpusets_enabled_key static branch.
 *
 * Return: true when the key is currently enabled, false otherwise.
 */
static inline bool cpusets_enabled(void)
{
	if (static_branch_unlikely(&cpusets_enabled_key))
		return true;

	return false;
}
/*
 * cpuset_inc - increment both cpuset static keys.
 *
 * cpusets_pre_enable_key is bumped before cpusets_enabled_key so that the
 * check in read_mems_allowed_begin() is switched on before the check in
 * read_mems_allowed_retry(). Do not reorder these two statements.
 *
 * NOTE(review): the _cpuslocked variants presumably require the caller to
 * already hold the CPU hotplug lock -- confirm against the callers.
 */
static inline void cpuset_inc(void)
{
static_branch_inc_cpuslocked(&cpusets_pre_enable_key);
static_branch_inc_cpuslocked(&cpusets_enabled_key);
}
/*
 * cpuset_dec - decrement both cpuset static keys.
 *
 * The order is the reverse of cpuset_inc(): cpusets_enabled_key is dropped
 * first so that read_mems_allowed_retry() stops looking at the sequence
 * counter before read_mems_allowed_begin() does. Do not reorder these two
 * statements.
 *
 * NOTE(review): the _cpuslocked variants presumably require the caller to
 * already hold the CPU hotplug lock -- confirm against the callers.
 */
static inline void cpuset_dec(void)
{
static_branch_dec_cpuslocked(&cpusets_enabled_key);
static_branch_dec_cpuslocked(&cpusets_pre_enable_key);
}
/*
 * cpusets_insane_config - test the cpusets_insane_config_key static branch.
 *
 * The key is switched on once a cpuset configuration is regarded as
 * unsupportable in general, for example a movable-only node that cannot
 * serve any non-movable allocation (see update_nodemask). The page
 * allocator has to perform extra checks for such setups; this helper
 * guards those checks so that sane configurations pay no overhead.
 *
 * Return: true when the key is currently enabled, false otherwise.
 */
static inline bool cpusets_insane_config(void)
{
	if (static_branch_unlikely(&cpusets_insane_config_key))
		return true;

	return false;
}
/*
 * Prototypes only: none of these functions is defined in this header.
 * The !CONFIG_CPUSETS half of this file supplies inline replacements for
 * each of them up to and including cpuset_nodemask_valid_mems_allowed().
 */
extern int cpuset_init(void);
extern void cpuset_init_smp(void);
extern void cpuset_force_rebuild(void);
extern void cpuset_update_active_cpus(void);
extern void cpuset_wait_for_hotplug(void);
extern void cpuset_read_lock(void);
extern void cpuset_read_unlock(void);
extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
extern bool cpuset_cpus_allowed_fallback(struct task_struct *p);
extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
/* Expands to the mems_allowed field of the calling task (current). */
#define cpuset_current_mems_allowed (current->mems_allowed)
void cpuset_init_current_mems_allowed(void);
int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask);
/*
 * Out-of-line node check. cpuset_node_allowed() and cpuset_zone_allowed()
 * reach it only when cpusets_enabled() is true; __cpuset_zone_allowed()
 * calls it unconditionally.
 */
extern bool __cpuset_node_allowed(int node, gfp_t gfp_mask);
/*
 * cpuset_node_allowed - check a node against the cpuset restrictions.
 * @node:     node id to check
 * @gfp_mask: allocation flags, passed through to __cpuset_node_allowed()
 *
 * While cpusets_enabled() is false every node is allowed and the
 * out-of-line check is skipped.
 *
 * Return: true if cpusets are disabled, otherwise the result of
 * __cpuset_node_allowed().
 */
static inline bool cpuset_node_allowed(int node, gfp_t gfp_mask)
{
	if (!cpusets_enabled())
		return true;

	return __cpuset_node_allowed(node, gfp_mask);
}
/*
 * __cpuset_zone_allowed - unconditional cpuset check for a zone.
 * @z:        zone to check
 * @gfp_mask: allocation flags, passed through to __cpuset_node_allowed()
 *
 * Maps the zone to its node id with zone_to_nid() and checks that node.
 * Unlike cpuset_zone_allowed(), this does not test cpusets_enabled().
 *
 * Return: the result of __cpuset_node_allowed() for the zone's node.
 */
static inline bool __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
{
	int nid = zone_to_nid(z);

	return __cpuset_node_allowed(nid, gfp_mask);
}
/*
 * cpuset_zone_allowed - check a zone against the cpuset restrictions.
 * @z:        zone to check
 * @gfp_mask: allocation flags, passed through to __cpuset_zone_allowed()
 *
 * While cpusets_enabled() is false every zone is allowed.
 *
 * Return: true if cpusets are disabled, otherwise the result of
 * __cpuset_zone_allowed().
 */
static inline bool cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
{
	if (!cpusets_enabled())
		return true;

	return __cpuset_zone_allowed(z, gfp_mask);
}
/* Prototype only; defined outside this header. */
extern int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
const struct task_struct *tsk2);
/*
 * cpuset_memory_pressure_bump - conditionally record memory pressure.
 *
 * Calls __cpuset_memory_pressure_bump() only when the global
 * cpuset_memory_pressure_enabled is non-zero. Both names are declared
 * just below; that is fine because the macro is expanded at its use sites.
 */
#define cpuset_memory_pressure_bump() \
do { \
if (cpuset_memory_pressure_enabled) \
__cpuset_memory_pressure_bump(); \
} while (0)
/* Flag tested by cpuset_memory_pressure_bump(). */
extern int cpuset_memory_pressure_enabled;
/* Out-of-line worker invoked by cpuset_memory_pressure_bump(). */
extern void __cpuset_memory_pressure_bump(void);
/* Prototypes only; defined outside this header. */
extern void cpuset_task_status_allowed(struct seq_file *m,
struct task_struct *task);
extern int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *tsk);
extern int cpuset_mem_spread_node(void);
extern int cpuset_slab_spread_node(void);
/*
 * cpuset_do_page_mem_spread - query page spreading for the calling task.
 *
 * Return: whatever task_spread_page() reports for current.
 */
static inline int cpuset_do_page_mem_spread(void)
{
	int spread = task_spread_page(current);

	return spread;
}
/*
 * cpuset_do_slab_mem_spread - query slab spreading for the calling task.
 *
 * Return: whatever task_spread_slab() reports for current.
 */
static inline int cpuset_do_slab_mem_spread(void)
{
	int spread = task_spread_slab(current);

	return spread;
}
/*
 * Prototypes only; defined outside this header. Each has an inline
 * replacement in the !CONFIG_CPUSETS half of this file.
 */
extern bool current_cpuset_is_being_rebound(void);
extern void rebuild_sched_domains(void);
extern void cpuset_print_current_mems_allowed(void);
/*
 * read_mems_allowed_begin - open a read section on current's mems_allowed.
 *
 * Needed whenever a decision depends on mems_allowed, e.g. during page
 * allocation. mems_allowed may be updated in parallel, and depending on
 * the new value an operation can fail and potentially take the process
 * down with it. Looping with read_mems_allowed_begin() and
 * read_mems_allowed_retry() avoids such artificial failures.
 *
 * This side tests cpusets_pre_enable_key, not cpusets_enabled_key.
 *
 * Return: 0 while cpusets_pre_enable_key is disabled, otherwise the value
 * of read_seqcount_begin() on current->mems_allowed_seq.
 */
static inline unsigned int read_mems_allowed_begin(void)
{
	if (static_branch_unlikely(&cpusets_pre_enable_key))
		return read_seqcount_begin(&current->mems_allowed_seq);

	return 0;
}
/*
 * read_mems_allowed_retry - close a read section on current's mems_allowed.
 * @seq: value previously returned by read_mems_allowed_begin()
 *
 * A true result means the work done since read_mems_allowed_begin() may
 * have failed artificially because mems_allowed was updated concurrently.
 * Whether to repeat the operation is left to the caller.
 *
 * This side tests cpusets_enabled_key, not cpusets_pre_enable_key.
 *
 * Return: false while cpusets_enabled_key is disabled, otherwise the
 * result of read_seqcount_retry() on current->mems_allowed_seq.
 */
static inline bool read_mems_allowed_retry(unsigned int seq)
{
	if (static_branch_unlikely(&cpusets_enabled_key))
		return read_seqcount_retry(&current->mems_allowed_seq, seq);

	return false;
}
/*
 * set_mems_allowed - replace current->mems_allowed with @nodemask.
 * @nodemask: new node mask, passed by value
 *
 * The store is bracketed by write_seqcount_begin()/write_seqcount_end() on
 * current->mems_allowed_seq, the same counter that
 * read_mems_allowed_begin() and read_mems_allowed_retry() sample. The
 * whole update runs under task_lock(current) with local interrupts
 * disabled.
 */
static inline void set_mems_allowed(nodemask_t nodemask)
{
unsigned long flags;
/* Acquire order: task lock first, then disable local interrupts. */
task_lock(current);
local_irq_save(flags);
/* Write section on the sequence counter around the actual store. */
write_seqcount_begin(&current->mems_allowed_seq);
current->mems_allowed = nodemask;
write_seqcount_end(&current->mems_allowed_seq);
/* Release in the reverse order of acquisition. */
local_irq_restore(flags);
task_unlock(current);
}
#else /* !CONFIG_CPUSETS */
/* Cpusets are compiled out, so they are never enabled. */
static inline bool cpusets_enabled(void)
{
	return false;
}

/* Cpusets are compiled out, so no insane configuration can exist. */
static inline bool cpusets_insane_config(void)
{
	return false;
}

/* Nothing to initialise; always returns 0. */
static inline int cpuset_init(void)
{
	return 0;
}

/* No-op when cpusets are compiled out. */
static inline void cpuset_init_smp(void)
{
}

/* No-op when cpusets are compiled out. */
static inline void cpuset_force_rebuild(void)
{
}
/*
 * Without cpusets there is no cpuset state to refresh; this only calls
 * partition_sched_domains(1, NULL, NULL).
 *
 * NOTE(review): presumably this resets scheduling to a single default
 * domain -- confirm against partition_sched_domains().
 */
static inline void cpuset_update_active_cpus(void)
{
	partition_sched_domains(1, NULL, NULL);
}
/* No-op when cpusets are compiled out. */
static inline void cpuset_wait_for_hotplug(void)
{
}

/* No-op when cpusets are compiled out: there is no lock to take. */
static inline void cpuset_read_lock(void)
{
}

/* No-op counterpart of cpuset_read_lock(). */
static inline void cpuset_read_unlock(void)
{
}
/*
 * cpuset_cpus_allowed - report the CPUs a task may use (no-cpuset build).
 * @p:    task to query
 * @mask: destination mask, overwritten
 *
 * Copies task_cpu_possible_mask(p) into @mask.
 */
static inline void cpuset_cpus_allowed(struct task_struct *p,
				       struct cpumask *mask)
{
	cpumask_copy(mask, task_cpu_possible_mask(p));
}
/*
 * cpuset_cpus_allowed_fallback - no-cpuset stub.
 * @p: task (unused)
 *
 * Return: always false.
 */
static inline bool cpuset_cpus_allowed_fallback(struct task_struct *p)
{
	return false;
}
/*
 * cpuset_mems_allowed - no-cpuset stub.
 * @p: task (unused)
 *
 * Return: node_possible_map, regardless of the task.
 */
static inline nodemask_t cpuset_mems_allowed(struct task_struct *p)
{
	return node_possible_map;
}

/* Without cpusets this is the N_MEMORY entry of node_states[]. */
#define cpuset_current_mems_allowed	(node_states[N_MEMORY])

/* No-op when cpusets are compiled out. */
static inline void cpuset_init_current_mems_allowed(void)
{
}

/*
 * cpuset_nodemask_valid_mems_allowed - no-cpuset stub.
 * @nodemask: mask to validate (unused)
 *
 * Return: always 1.
 */
static inline int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
{
	return 1;
}
/*
 * No-cpuset stubs for the node/zone permission checks: with cpusets
 * compiled out every node and every zone is allowed, so each of these
 * ignores its arguments and returns true.
 */
static inline bool cpuset_node_allowed(int node, gfp_t gfp_mask)
{
	return true;
}

static inline bool __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
{
	return true;
}

static inline bool cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
{
	return true;
}
/*
 * cpuset_mems_allowed_intersects - no-cpuset stub.
 * @tsk1: first task (unused)
 * @tsk2: second task (unused)
 *
 * Return: always 1.
 */
static inline int
cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
			       const struct task_struct *tsk2)
{
	return 1;
}

/* No-op when cpusets are compiled out. */
static inline void cpuset_memory_pressure_bump(void)
{
}

/*
 * cpuset_task_status_allowed - no-cpuset stub; emits nothing.
 * @m:    seq_file (unused)
 * @task: task (unused)
 */
static inline void cpuset_task_status_allowed(struct seq_file *m,
					      struct task_struct *task)
{
}
/* No-cpuset stub; always returns 0. */
static inline int cpuset_mem_spread_node(void)
{
	return 0;
}

/* No-cpuset stub; always returns 0. */
static inline int cpuset_slab_spread_node(void)
{
	return 0;
}

/* No-cpuset stub; always returns 0. */
static inline int cpuset_do_page_mem_spread(void)
{
	return 0;
}

/* No-cpuset stub; always returns 0. */
static inline int cpuset_do_slab_mem_spread(void)
{
	return 0;
}

/* No-cpuset stub; always returns false. */
static inline bool current_cpuset_is_being_rebound(void)
{
	return false;
}
/*
 * rebuild_sched_domains - no-cpuset variant.
 *
 * Only calls partition_sched_domains(1, NULL, NULL), the same call that
 * the no-cpuset cpuset_update_active_cpus() makes.
 */
static inline void rebuild_sched_domains(void)
{
	partition_sched_domains(1, NULL, NULL);
}
/* No-cpuset stub; prints nothing. */
static inline void cpuset_print_current_mems_allowed(void)
{
}

/*
 * set_mems_allowed - no-cpuset stub.
 * @nodemask: ignored
 */
static inline void set_mems_allowed(nodemask_t nodemask)
{
}
/*
 * read_mems_allowed_begin - no-cpuset stub.
 *
 * Return: always 0.
 */
static inline unsigned int read_mems_allowed_begin(void)
{
	return 0;
}

/*
 * read_mems_allowed_retry - no-cpuset stub.
 * @seq: ignored
 *
 * Return: always false, so callers never retry.
 */
static inline bool read_mems_allowed_retry(unsigned int seq)
{
	return false;
}
#endif /* !CONFIG_CPUSETS */
#endif /* _LINUX_CPUSET_H */