diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 44518c023949..4bca2a3d9174 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -127,17 +127,39 @@ the high water marks for each per cpu page list. zone_reclaim_mode: -This is set during bootup to 1 if it is determined that pages from -remote zones will cause a significant performance reduction. The +Zone_reclaim_mode allows someone to set more or less aggressive approaches to +reclaim memory when a zone runs out of memory. If it is set to zero then no +zone reclaim occurs. Allocations will be satisfied from other zones / nodes +in the system. + +This is a value ORed together of + +1 = Zone reclaim on +2 = Zone reclaim writes dirty pages out +4 = Zone reclaim swaps pages + +zone_reclaim_mode is set during bootup to 1 if it is determined that pages +from remote zones will cause a measurable performance reduction. The page allocator will then reclaim easily reusable pages (those page -cache pages that are currently not used) before going off node. +cache pages that are currently not used) before allocating off node pages. -The user can override this setting. It may be beneficial to switch -off zone reclaim if the system is used for a file server and all -of memory should be used for caching files from disk. +It may be beneficial to switch off zone reclaim if the system is +used for a file server and all of memory should be used for caching files +from disk. In that case the caching effect is more important than +data locality. + +Allowing zone reclaim to write out pages stops processes that are +writing large amounts of data from dirtying pages on other nodes. Zone +reclaim will write out dirty pages if a zone fills up and so effectively +throttle the process. 
This may decrease the performance of a single process +since it cannot use all of system memory to buffer the outgoing writes +anymore but it preserves the memory on other nodes so that the performance +of other processes running on other nodes will not be affected. + +Allowing regular swap effectively restricts allocations to the local +node unless explicitly overridden by memory policies or cpuset +configurations. -It may be beneficial to switch this on if one wants to do zone -reclaim regardless of the numa distances in the system. ================================================================ zone_reclaim_interval: diff --git a/mm/vmscan.c b/mm/vmscan.c index 8760a4abfa1f..9e2ef3624d77 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1592,6 +1592,11 @@ module_init(kswapd_init) */ int zone_reclaim_mode __read_mostly; +#define RECLAIM_OFF 0 +#define RECLAIM_ZONE (1<<0) /* Run shrink_cache on the zone */ +#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ +#define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */ + /* * Mininum time between zone reclaim scans */ @@ -1630,8 +1635,8 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) if (!cpus_empty(mask) && node_id != numa_node_id()) return 0; - sc.may_writepage = 0; - sc.may_swap = 0; + sc.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE); + sc.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP); sc.nr_scanned = 0; sc.nr_reclaimed = 0; sc.priority = ZONE_RECLAIM_PRIORITY + 1;