linux-stable/include/linux/mman.h
Feng Tang 56f3547bfa mm: adjust vm_committed_as_batch according to vm overcommit policy
When checking a performance change for will-it-scale scalability mmap test
[1], we found very high lock contention for spinlock of percpu counter
'vm_committed_as':

    94.14%     0.35%  [kernel.kallsyms]         [k] _raw_spin_lock_irqsave
    48.21% _raw_spin_lock_irqsave;percpu_counter_add_batch;__vm_enough_memory;mmap_region;do_mmap;
    45.91% _raw_spin_lock_irqsave;percpu_counter_add_batch;__do_munmap;

Actually this heavy lock contention is not always necessary.  The
'vm_committed_as' needs to be very precise when the strict
OVERCOMMIT_NEVER policy is set, which requires a rather small batch number
for the percpu counter.

So keep 'batch' number unchanged for strict OVERCOMMIT_NEVER policy, and
lift it to 64X for OVERCOMMIT_ALWAYS and OVERCOMMIT_GUESS policies.  Also
add a sysctl handler to adjust it when the policy is reconfigured.

Benchmark with the same testcase in [1] shows 53% improvement on a 8C/16T
desktop, and 2097%(20X) on a 4S/72C/144T server.  We tested with test
platforms in 0day (server, desktop and laptop), and 80%+ platforms shows
improvements with that test.  And whether it shows improvements depends on
if the test mmap size is bigger than the batch number computed.

And if the lift is 16X, 1/3 of the platforms will show improvements,
though it should help the mmap/unmap usage generally, as Michal Hocko
mentioned:

: I believe that there are non-synthetic worklaods which would benefit from
: a larger batch.  E.g.  large in memory databases which do large mmaps
: during startups from multiple threads.

[1] https://lore.kernel.org/lkml/20200305062138.GI5972@shao2-debian/

Signed-off-by: Feng Tang <feng.tang@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Qian Cai <cai@lca.pw>
Cc: Kees Cook <keescook@chromium.org>
Cc: Andi Kleen <andi.kleen@intel.com>
Cc: Tim Chen <tim.c.chen@intel.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: kernel test robot <rong.a.chen@intel.com>
Cc: "K. Y. Srinivasan" <kys@microsoft.com>
Cc: Tejun Heo <tj@kernel.org>
Link: http://lkml.kernel.org/r/1589611660-89854-4-git-send-email-feng.tang@intel.com
Link: http://lkml.kernel.org/r/1592725000-73486-4-git-send-email-feng.tang@intel.com
Link: http://lkml.kernel.org/r/1594389708-60781-5-git-send-email-feng.tang@intel.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-08-07 11:33:26 -07:00

142 lines
3.4 KiB
C

/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MMAN_H
#define _LINUX_MMAN_H
#include <linux/mm.h>
#include <linux/percpu_counter.h>
#include <linux/atomic.h>
#include <uapi/linux/mman.h>
/*
* Arrange for legacy / undefined architecture specific flags to be
* ignored by mmap handling code.
*/
#ifndef MAP_32BIT
#define MAP_32BIT 0
#endif
#ifndef MAP_HUGE_2MB
#define MAP_HUGE_2MB 0
#endif
#ifndef MAP_HUGE_1GB
#define MAP_HUGE_1GB 0
#endif
#ifndef MAP_UNINITIALIZED
#define MAP_UNINITIALIZED 0
#endif
#ifndef MAP_SYNC
#define MAP_SYNC 0
#endif
/*
* The historical set of flags that all mmap implementations implicitly
* support when a ->mmap_validate() op is not provided in file_operations.
*/
#define LEGACY_MAP_MASK (MAP_SHARED \
| MAP_PRIVATE \
| MAP_FIXED \
| MAP_ANONYMOUS \
| MAP_DENYWRITE \
| MAP_EXECUTABLE \
| MAP_UNINITIALIZED \
| MAP_GROWSDOWN \
| MAP_LOCKED \
| MAP_NORESERVE \
| MAP_POPULATE \
| MAP_NONBLOCK \
| MAP_STACK \
| MAP_HUGETLB \
| MAP_32BIT \
| MAP_HUGE_2MB \
| MAP_HUGE_1GB)
extern int sysctl_overcommit_memory;
extern int sysctl_overcommit_ratio;
extern unsigned long sysctl_overcommit_kbytes;
extern struct percpu_counter vm_committed_as;
#ifdef CONFIG_SMP
extern s32 vm_committed_as_batch;
extern void mm_compute_batch(int overcommit_policy);
#else
#define vm_committed_as_batch 0
static inline void mm_compute_batch(int overcommit_policy)
{
}
#endif
unsigned long vm_memory_committed(void);
static inline void vm_acct_memory(long pages)
{
percpu_counter_add_batch(&vm_committed_as, pages, vm_committed_as_batch);
}
static inline void vm_unacct_memory(long pages)
{
vm_acct_memory(-pages);
}
/*
* Allow architectures to handle additional protection bits
*/
#ifndef arch_calc_vm_prot_bits
#define arch_calc_vm_prot_bits(prot, pkey) 0
#endif
#ifndef arch_vm_get_page_prot
#define arch_vm_get_page_prot(vm_flags) __pgprot(0)
#endif
#ifndef arch_validate_prot
/*
* This is called from mprotect(). PROT_GROWSDOWN and PROT_GROWSUP have
* already been masked out.
*
* Returns true if the prot flags are valid
*/
static inline bool arch_validate_prot(unsigned long prot, unsigned long addr)
{
return (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC | PROT_SEM)) == 0;
}
#define arch_validate_prot arch_validate_prot
#endif
/*
* Optimisation macro. It is equivalent to:
* (x & bit1) ? bit2 : 0
* but this version is faster.
* ("bit1" and "bit2" must be single bits)
*/
#define _calc_vm_trans(x, bit1, bit2) \
((!(bit1) || !(bit2)) ? 0 : \
((bit1) <= (bit2) ? ((x) & (bit1)) * ((bit2) / (bit1)) \
: ((x) & (bit1)) / ((bit1) / (bit2))))
/*
* Combine the mmap "prot" argument into "vm_flags" used internally.
*/
static inline unsigned long
calc_vm_prot_bits(unsigned long prot, unsigned long pkey)
{
return _calc_vm_trans(prot, PROT_READ, VM_READ ) |
_calc_vm_trans(prot, PROT_WRITE, VM_WRITE) |
_calc_vm_trans(prot, PROT_EXEC, VM_EXEC) |
arch_calc_vm_prot_bits(prot, pkey);
}
/*
* Combine the mmap "flags" argument into "vm_flags" used internally.
*/
static inline unsigned long
calc_vm_flag_bits(unsigned long flags)
{
return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) |
_calc_vm_trans(flags, MAP_DENYWRITE, VM_DENYWRITE ) |
_calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ) |
_calc_vm_trans(flags, MAP_SYNC, VM_SYNC );
}
unsigned long vm_commit_limit(void);
#endif /* _LINUX_MMAN_H */