linux-stable/include/linux/cpuset.h
KOSAKI Motohiro 31f1de46b9 mempolicy: silently restrict nodemask to allowed nodes
Kosaki Motohito noted that "numactl --interleave=all ..." failed in the
presence of memoryless nodes.  This patch attempts to fix that problem.

Some background:

numactl --interleave=all calls set_mempolicy(2) with a fully populated
[out to MAXNUMNODES] nodemask.  set_mempolicy() [in do_set_mempolicy()]
calls contextualize_policy() which requires that the nodemask be a
subset of the current task's mems_allowed; else EINVAL will be returned.

A task's mems_allowed will always be a subset of node_states[N_HIGH_MEMORY]
i.e., nodes with memory.  So, a fully populated nodemask will be
declared invalid if it includes memoryless nodes.

  NOTE:  the same thing will occur when running in a cpuset
         with restricted mem_allowed--for the same reason:
         node mask contains dis-allowed nodes.

mbind(2), on the other hand, just masks off any nodes in the nodemask
that are not included in the caller's mems_allowed.

In each case [mbind() and set_mempolicy()], mpol_check_policy() will
complain [again, resulting in EINVAL] if the nodemask contains any
memoryless nodes.  This is somewhat redundant as mpol_new() will remove
memoryless nodes for interleave policy, as will bind_zonelist()--called
by mpol_new() for BIND policy.

Proposed fix:

1) modify contextualize_policy logic to:
   a) remember whether the incoming node mask is empty.
   b) if not, restrict the nodemask to allowed nodes, as is
      currently done in-line for mbind().  This guarantees
      that the resulting mask includes only nodes with memory.

      NOTE:  this is a [benign, IMO] change in behavior for
             set_mempolicy().  Dis-allowed nodes will be
             silently ignored, rather than returning an error.

   c) fold this code into mpol_check_policy(), replace 2 calls to
      contextualize_policy() to call mpol_check_policy() directly
      and remove contextualize_policy().

2) In existing mpol_check_policy() logic, after "contextualization":
   a) MPOL_DEFAULT:  require that in coming mask "was_empty"
   b) MPOL_{BIND|INTERLEAVE}:  require that contextualized nodemask
      contains at least one node.
   c) add a case for MPOL_PREFERRED:  if in coming was not empty
      and resulting mask IS empty, user specified invalid nodes.
      Return EINVAL.
   c) remove the now redundant check for memoryless nodes

3) remove the now redundant masking of policy nodes for interleave
   policy from mpol_new().

4) Now that mpol_check_policy() contextualizes the nodemask, remove
   the in-line nodes_and() from sys_mbind().  I believe that this
   restores mbind() to the behavior before the memoryless-nodes
   patch series.  E.g., we'll no longer treat an invalid nodemask
   with MPOL_PREFERRED as local allocation.

[ Patch history:

  v1 -> v2:
   - Communicate whether or not incoming node mask was empty to
     mpol_check_policy() for better error checking.
   - As suggested by David Rientjes, remove the now unused
     cpuset_nodes_subset_current_mems_allowed() from cpuset.h

  v2 -> v3:
   - As suggested by Kosaki Motohito, fold the "contextualization"
     of policy nodemask into mpol_check_policy().  Looks a little
     cleaner. ]

Signed-off-by:  Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by:  KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Tested-by:      KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Acked-by:       David Rientjes <rientjes@google.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-02-11 20:48:29 -08:00

160 lines
3.9 KiB
C

#ifndef _LINUX_CPUSET_H
#define _LINUX_CPUSET_H
/*
* cpuset interface
*
* Copyright (C) 2003 BULL SA
* Copyright (C) 2004-2006 Silicon Graphics, Inc.
*
*/
#include <linux/sched.h>
#include <linux/cpumask.h>
#include <linux/nodemask.h>
#include <linux/cgroup.h>
#ifdef CONFIG_CPUSETS
extern int number_of_cpusets; /* How many cpusets are defined in system? */
extern int cpuset_init_early(void);
extern int cpuset_init(void);
extern void cpuset_init_smp(void);
extern cpumask_t cpuset_cpus_allowed(struct task_struct *p);
extern cpumask_t cpuset_cpus_allowed_locked(struct task_struct *p);
extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
#define cpuset_current_mems_allowed (current->mems_allowed)
void cpuset_init_current_mems_allowed(void);
void cpuset_update_task_memory_state(void);
int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl);
extern int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask);
extern int __cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask);
static int inline cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
{
return number_of_cpusets <= 1 ||
__cpuset_zone_allowed_softwall(z, gfp_mask);
}
static int inline cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask)
{
return number_of_cpusets <= 1 ||
__cpuset_zone_allowed_hardwall(z, gfp_mask);
}
extern int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
const struct task_struct *tsk2);
#define cpuset_memory_pressure_bump() \
do { \
if (cpuset_memory_pressure_enabled) \
__cpuset_memory_pressure_bump(); \
} while (0)
extern int cpuset_memory_pressure_enabled;
extern void __cpuset_memory_pressure_bump(void);
extern const struct file_operations proc_cpuset_operations;
struct seq_file;
extern void cpuset_task_status_allowed(struct seq_file *m,
struct task_struct *task);
extern void cpuset_lock(void);
extern void cpuset_unlock(void);
extern int cpuset_mem_spread_node(void);
static inline int cpuset_do_page_mem_spread(void)
{
return current->flags & PF_SPREAD_PAGE;
}
static inline int cpuset_do_slab_mem_spread(void)
{
return current->flags & PF_SPREAD_SLAB;
}
extern void cpuset_track_online_nodes(void);
extern int current_cpuset_is_being_rebound(void);
#else /* !CONFIG_CPUSETS */
static inline int cpuset_init_early(void) { return 0; }
static inline int cpuset_init(void) { return 0; }
static inline void cpuset_init_smp(void) {}
static inline cpumask_t cpuset_cpus_allowed(struct task_struct *p)
{
return cpu_possible_map;
}
static inline cpumask_t cpuset_cpus_allowed_locked(struct task_struct *p)
{
return cpu_possible_map;
}
static inline nodemask_t cpuset_mems_allowed(struct task_struct *p)
{
return node_possible_map;
}
#define cpuset_current_mems_allowed (node_states[N_HIGH_MEMORY])
static inline void cpuset_init_current_mems_allowed(void) {}
static inline void cpuset_update_task_memory_state(void) {}
static inline int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
{
return 1;
}
static inline int cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
{
return 1;
}
static inline int cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask)
{
return 1;
}
static inline int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
const struct task_struct *tsk2)
{
return 1;
}
static inline void cpuset_memory_pressure_bump(void) {}
static inline void cpuset_task_status_allowed(struct seq_file *m,
struct task_struct *task)
{
}
static inline void cpuset_lock(void) {}
static inline void cpuset_unlock(void) {}
static inline int cpuset_mem_spread_node(void)
{
return 0;
}
static inline int cpuset_do_page_mem_spread(void)
{
return 0;
}
static inline int cpuset_do_slab_mem_spread(void)
{
return 0;
}
static inline void cpuset_track_online_nodes(void) {}
static inline int current_cpuset_is_being_rebound(void)
{
return 0;
}
#endif /* !CONFIG_CPUSETS */
#endif /* _LINUX_CPUSET_H */