2006-09-27 08:50:01 +00:00
|
|
|
#ifndef _LINUX_MM_TYPES_H
|
|
|
|
#define _LINUX_MM_TYPES_H
|
|
|
|
|
2017-02-03 23:12:19 +00:00
|
|
|
#include <linux/mm_types_task.h>
|
|
|
|
|
2007-10-17 06:30:12 +00:00
|
|
|
#include <linux/auxvec.h>
|
2006-09-27 08:50:01 +00:00
|
|
|
#include <linux/list.h>
|
|
|
|
#include <linux/spinlock.h>
|
2007-10-16 08:24:43 +00:00
|
|
|
#include <linux/rbtree.h>
|
|
|
|
#include <linux/rwsem.h>
|
|
|
|
#include <linux/completion.h>
|
mmu-notifiers: core
With KVM/GFP/XPMEM there isn't just the primary CPU MMU pointing to pages.
There are secondary MMUs (with secondary sptes and secondary tlbs) too.
sptes in the kvm case are shadow pagetables, but when I say spte in
mmu-notifier context, I mean "secondary pte". In GRU case there's no
actual secondary pte and there's only a secondary tlb because the GRU
secondary MMU has no knowledge about sptes and every secondary tlb miss
event in the MMU always generates a page fault that has to be resolved by
the CPU (this is not the case of KVM where the a secondary tlb miss will
walk sptes in hardware and it will refill the secondary tlb transparently
to software if the corresponding spte is present). The same way
zap_page_range has to invalidate the pte before freeing the page, the spte
(and secondary tlb) must also be invalidated before any page is freed and
reused.
Currently we take a page_count pin on every page mapped by sptes, but that
means the pages can't be swapped whenever they're mapped by any spte
because they're part of the guest working set. Furthermore a spte unmap
event can immediately lead to a page to be freed when the pin is released
(so requiring the same complex and relatively slow tlb_gather smp safe
logic we have in zap_page_range and that can be avoided completely if the
spte unmap event doesn't require an unpin of the page previously mapped in
the secondary MMU).
The mmu notifiers allow kvm/GRU/XPMEM to attach to the tsk->mm and know
when the VM is swapping or freeing or doing anything on the primary MMU so
that the secondary MMU code can drop sptes before the pages are freed,
avoiding all page pinning and allowing 100% reliable swapping of guest
physical address space. Furthermore it avoids the code that teardown the
mappings of the secondary MMU, to implement a logic like tlb_gather in
zap_page_range that would require many IPI to flush other cpu tlbs, for
each fixed number of spte unmapped.
To make an example: if what happens on the primary MMU is a protection
downgrade (from writeable to wrprotect) the secondary MMU mappings will be
invalidated, and the next secondary-mmu-page-fault will call
get_user_pages and trigger a do_wp_page through get_user_pages if it
called get_user_pages with write=1, and it'll re-establishing an updated
spte or secondary-tlb-mapping on the copied page. Or it will setup a
readonly spte or readonly tlb mapping if it's a guest-read, if it calls
get_user_pages with write=0. This is just an example.
This allows to map any page pointed by any pte (and in turn visible in the
primary CPU MMU), into a secondary MMU (be it a pure tlb like GRU, or an
full MMU with both sptes and secondary-tlb like the shadow-pagetable layer
with kvm), or a remote DMA in software like XPMEM (hence needing of
schedule in XPMEM code to send the invalidate to the remote node, while no
need to schedule in kvm/gru as it's an immediate event like invalidating
primary-mmu pte).
At least for KVM without this patch it's impossible to swap guests
reliably. And having this feature and removing the page pin allows
several other optimizations that simplify life considerably.
Dependencies:
1) mm_take_all_locks() to register the mmu notifier when the whole VM
isn't doing anything with "mm". This allows mmu notifier users to keep
track if the VM is in the middle of the invalidate_range_begin/end
critical section with an atomic counter incraese in range_begin and
decreased in range_end. No secondary MMU page fault is allowed to map
any spte or secondary tlb reference, while the VM is in the middle of
range_begin/end as any page returned by get_user_pages in that critical
section could later immediately be freed without any further
->invalidate_page notification (invalidate_range_begin/end works on
ranges and ->invalidate_page isn't called immediately before freeing
the page). To stop all page freeing and pagetable overwrites the
mmap_sem must be taken in write mode and all other anon_vma/i_mmap
locks must be taken too.
2) It'd be a waste to add branches in the VM if nobody could possibly
run KVM/GRU/XPMEM on the kernel, so mmu notifiers will only enabled if
CONFIG_KVM=m/y. In the current kernel kvm won't yet take advantage of
mmu notifiers, but this already allows to compile a KVM external module
against a kernel with mmu notifiers enabled and from the next pull from
kvm.git we'll start using them. And GRU/XPMEM will also be able to
continue the development by enabling KVM=m in their config, until they
submit all GRU/XPMEM GPLv2 code to the mainline kernel. Then they can
also enable MMU_NOTIFIERS in the same way KVM does it (even if KVM=n).
This guarantees nobody selects MMU_NOTIFIER=y if KVM and GRU and XPMEM
are all =n.
The mmu_notifier_register call can fail because mm_take_all_locks may be
interrupted by a signal and return -EINTR. Because mmu_notifier_reigster
is used when a driver startup, a failure can be gracefully handled. Here
an example of the change applied to kvm to register the mmu notifiers.
Usually when a driver startups other allocations are required anyway and
-ENOMEM failure paths exists already.
struct kvm *kvm_arch_create_vm(void)
{
struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
+ int err;
if (!kvm)
return ERR_PTR(-ENOMEM);
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+ kvm->arch.mmu_notifier.ops = &kvm_mmu_notifier_ops;
+ err = mmu_notifier_register(&kvm->arch.mmu_notifier, current->mm);
+ if (err) {
+ kfree(kvm);
+ return ERR_PTR(err);
+ }
+
return kvm;
}
mmu_notifier_unregister returns void and it's reliable.
The patch also adds a few needed but missing includes that would prevent
kernel to compile after these changes on non-x86 archs (x86 didn't need
them by luck).
[akpm@linux-foundation.org: coding-style fixes]
[akpm@linux-foundation.org: fix mm/filemap_xip.c build]
[akpm@linux-foundation.org: fix mm/mmu_notifier.c build]
Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Jack Steiner <steiner@sgi.com>
Cc: Robin Holt <holt@sgi.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Kanoj Sarcar <kanojsarcar@yahoo.com>
Cc: Roland Dreier <rdreier@cisco.com>
Cc: Steve Wise <swise@opengridcomputing.com>
Cc: Avi Kivity <avi@qumranet.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Anthony Liguori <aliguori@us.ibm.com>
Cc: Chris Wright <chrisw@redhat.com>
Cc: Marcelo Tosatti <marcelo@kvack.org>
Cc: Eric Dumazet <dada1@cosmosbay.com>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
Cc: Izik Eidus <izike@qumranet.com>
Cc: Anthony Liguori <aliguori@us.ibm.com>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-07-28 22:46:29 +00:00
|
|
|
#include <linux/cpumask.h>
|
2012-03-30 18:26:31 +00:00
|
|
|
#include <linux/uprobes.h>
|
2013-02-23 00:34:30 +00:00
|
|
|
#include <linux/page-flags-layout.h>
|
2016-05-20 23:57:21 +00:00
|
|
|
#include <linux/workqueue.h>
|
2017-02-03 23:12:19 +00:00
|
|
|
|
2007-10-16 08:24:43 +00:00
|
|
|
#include <asm/mmu.h>
|
2006-09-27 08:50:01 +00:00
|
|
|
|
2007-10-17 06:30:12 +00:00
|
|
|
#ifndef AT_VECTOR_SIZE_ARCH
|
|
|
|
#define AT_VECTOR_SIZE_ARCH 0
|
|
|
|
#endif
|
|
|
|
#define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1))
|
|
|
|
|
2006-09-27 08:50:01 +00:00
|
|
|
struct address_space;
|
2014-12-10 23:44:52 +00:00
|
|
|
struct mem_cgroup;
|
2006-09-27 08:50:01 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Each physical page in the system has a struct page associated with
|
|
|
|
* it to keep track of whatever it is we are using the page for at the
|
|
|
|
* moment. Note that we have no way to track which tasks are using
|
|
|
|
* a page, though if it is a pagecache page, rmap structures can tell us
|
|
|
|
* who is mapping it.
|
2011-06-01 17:25:48 +00:00
|
|
|
*
|
|
|
|
* The objects in struct page are organized in double word blocks in
|
|
|
|
* order to allows us to use atomic double word operations on portions
|
|
|
|
* of struct page. That is currently only used by slub but the arrangement
|
|
|
|
* allows the use of atomic double word operations on the flags/mapping
|
|
|
|
* and lru list pointers also.
|
2006-09-27 08:50:01 +00:00
|
|
|
*/
|
|
|
|
struct page {
|
2011-06-01 17:25:48 +00:00
|
|
|
/* First double word block */
|
2006-09-27 08:50:01 +00:00
|
|
|
unsigned long flags; /* Atomic flags, some possibly
|
|
|
|
* updated asynchronously */
|
2013-10-24 01:07:49 +00:00
|
|
|
union {
|
|
|
|
struct address_space *mapping; /* If low bit clear, points to
|
|
|
|
* inode address_space, or NULL.
|
|
|
|
* If page mapped as anonymous
|
|
|
|
* memory, low bit is set, and
|
|
|
|
* it points to anon_vma object:
|
|
|
|
* see PAGE_MAPPING_ANON below.
|
|
|
|
*/
|
|
|
|
void *s_mem; /* slab first object */
|
2016-01-16 00:53:42 +00:00
|
|
|
atomic_t compound_mapcount; /* first tail page */
|
2016-01-16 00:54:17 +00:00
|
|
|
/* page_deferred_list().next -- second tail page */
|
2013-10-24 01:07:49 +00:00
|
|
|
};
|
|
|
|
|
2011-06-01 17:25:48 +00:00
|
|
|
/* Second double word */
|
2016-07-26 22:24:16 +00:00
|
|
|
union {
|
|
|
|
pgoff_t index; /* Our offset within mapping. */
|
|
|
|
void *freelist; /* sl[aou]b first free object */
|
|
|
|
/* page_deferred_list().prev -- second tail page */
|
|
|
|
};
|
2011-07-14 17:48:14 +00:00
|
|
|
|
2016-07-26 22:24:16 +00:00
|
|
|
union {
|
2012-06-20 19:52:56 +00:00
|
|
|
#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
|
|
|
|
defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
|
2016-07-26 22:24:16 +00:00
|
|
|
/* Used for cmpxchg_double in slub */
|
|
|
|
unsigned long counters;
|
2012-06-20 19:52:56 +00:00
|
|
|
#else
|
2016-07-26 22:24:16 +00:00
|
|
|
/*
|
|
|
|
* Keep _refcount separate from slub cmpxchg_double data.
|
|
|
|
* As the rest of the double word is protected by slab_lock
|
|
|
|
* but _refcount is not.
|
|
|
|
*/
|
|
|
|
unsigned counters;
|
2012-06-20 19:52:56 +00:00
|
|
|
#endif
|
2016-07-26 22:24:16 +00:00
|
|
|
struct {
|
2011-07-14 17:48:14 +00:00
|
|
|
|
2016-07-26 22:24:16 +00:00
|
|
|
union {
|
2016-05-20 00:10:49 +00:00
|
|
|
/*
|
2016-07-26 22:24:16 +00:00
|
|
|
* Count of ptes mapped in mms, to show when
|
|
|
|
* page is mapped & limit reverse map searches.
|
2016-07-26 22:24:18 +00:00
|
|
|
*
|
|
|
|
* Extra information about page type may be
|
|
|
|
* stored here for pages that are never mapped,
|
|
|
|
* in which case the value MUST BE <= -2.
|
|
|
|
* See page-flags.h for more details.
|
2016-05-20 00:10:49 +00:00
|
|
|
*/
|
2016-07-26 22:24:16 +00:00
|
|
|
atomic_t _mapcount;
|
|
|
|
|
|
|
|
unsigned int active; /* SLAB */
|
|
|
|
struct { /* SLUB */
|
|
|
|
unsigned inuse:16;
|
|
|
|
unsigned objects:15;
|
|
|
|
unsigned frozen:1;
|
|
|
|
};
|
|
|
|
int units; /* SLOB */
|
2011-06-01 17:25:48 +00:00
|
|
|
};
|
2016-07-26 22:24:16 +00:00
|
|
|
/*
|
|
|
|
* Usage count, *USE WRAPPER FUNCTION* when manual
|
|
|
|
* accounting. See page_ref.h
|
|
|
|
*/
|
|
|
|
atomic_t _refcount;
|
2008-04-14 16:11:30 +00:00
|
|
|
};
|
2007-05-06 21:49:36 +00:00
|
|
|
};
|
2011-06-01 17:25:48 +00:00
|
|
|
|
2015-11-07 00:29:54 +00:00
|
|
|
/*
|
|
|
|
* Third double word block
|
|
|
|
*
|
|
|
|
* WARNING: bit 0 of the first word encode PageTail(). That means
|
|
|
|
* the rest users of the storage space MUST NOT use the bit to
|
|
|
|
* avoid collision and false-positive PageTail().
|
|
|
|
*/
|
2011-08-09 21:12:27 +00:00
|
|
|
union {
|
|
|
|
struct list_head lru; /* Pageout list, eg. active_list
|
2016-07-28 22:45:28 +00:00
|
|
|
* protected by zone_lru_lock !
|
2014-04-08 20:44:27 +00:00
|
|
|
* Can be used as a generic list
|
|
|
|
* by the page owner.
|
2011-06-01 17:25:48 +00:00
|
|
|
*/
|
2016-01-16 00:56:49 +00:00
|
|
|
struct dev_pagemap *pgmap; /* ZONE_DEVICE pages are never on an
|
|
|
|
* lru or handled by a slab
|
|
|
|
* allocator, this points to the
|
|
|
|
* hosting device page map.
|
|
|
|
*/
|
2011-08-09 21:12:27 +00:00
|
|
|
struct { /* slub per cpu partial pages */
|
|
|
|
struct page *next; /* Next partial slab */
|
|
|
|
#ifdef CONFIG_64BIT
|
|
|
|
int pages; /* Nr of partial slabs left */
|
|
|
|
int pobjects; /* Approximate # of objects */
|
|
|
|
#else
|
|
|
|
short int pages;
|
|
|
|
short int pobjects;
|
|
|
|
#endif
|
|
|
|
};
|
2012-06-13 15:24:52 +00:00
|
|
|
|
2013-10-24 01:07:42 +00:00
|
|
|
struct rcu_head rcu_head; /* Used by SLAB
|
|
|
|
* when destroying via RCU
|
|
|
|
*/
|
2015-11-07 00:29:54 +00:00
|
|
|
/* Tail pages of compound page */
|
2015-02-11 23:24:46 +00:00
|
|
|
struct {
|
2015-11-07 00:29:54 +00:00
|
|
|
unsigned long compound_head; /* If bit zero is set */
|
|
|
|
|
|
|
|
/* First tail page only */
|
2015-11-07 00:30:00 +00:00
|
|
|
#ifdef CONFIG_64BIT
|
|
|
|
/*
|
|
|
|
* On 64 bit system we have enough space in struct page
|
|
|
|
* to encode compound_dtor and compound_order with
|
|
|
|
* unsigned int. It can help compiler generate better or
|
|
|
|
* smaller code on some archtectures.
|
|
|
|
*/
|
|
|
|
unsigned int compound_dtor;
|
|
|
|
unsigned int compound_order;
|
|
|
|
#else
|
2015-11-07 00:29:50 +00:00
|
|
|
unsigned short int compound_dtor;
|
|
|
|
unsigned short int compound_order;
|
2015-11-07 00:30:00 +00:00
|
|
|
#endif
|
2015-02-11 23:24:46 +00:00
|
|
|
};
|
|
|
|
|
2013-11-21 22:32:11 +00:00
|
|
|
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && USE_SPLIT_PMD_PTLOCKS
|
2015-11-07 00:29:54 +00:00
|
|
|
struct {
|
|
|
|
unsigned long __pad; /* do not overlay pmd_huge_pte
|
|
|
|
* with compound_head to avoid
|
|
|
|
* possible bit 0 collision.
|
|
|
|
*/
|
|
|
|
pgtable_t pmd_huge_pte; /* protected by page->ptl */
|
|
|
|
};
|
2013-11-21 22:32:11 +00:00
|
|
|
#endif
|
2011-08-09 21:12:27 +00:00
|
|
|
};
|
2011-06-01 17:25:48 +00:00
|
|
|
|
|
|
|
/* Remainder is not double word aligned */
|
2006-09-27 08:50:01 +00:00
|
|
|
union {
|
|
|
|
unsigned long private; /* Mapping-private opaque data:
|
|
|
|
* usually used for buffer_heads
|
|
|
|
* if PagePrivate set; used for
|
|
|
|
* swp_entry_t if PageSwapCache;
|
|
|
|
* indicates order in the buddy
|
|
|
|
* system if PG_buddy is set.
|
|
|
|
*/
|
2013-11-14 22:30:45 +00:00
|
|
|
#if USE_SPLIT_PTE_PTLOCKS
|
2013-12-20 11:35:58 +00:00
|
|
|
#if ALLOC_SPLIT_PTLOCKS
|
2013-11-14 22:31:52 +00:00
|
|
|
spinlock_t *ptl;
|
|
|
|
#else
|
|
|
|
spinlock_t ptl;
|
|
|
|
#endif
|
2006-09-27 08:50:01 +00:00
|
|
|
#endif
|
slub: Commonize slab_cache field in struct page
Right now, slab and slub have fields in struct page to derive which
cache a page belongs to, but they do it slightly differently.
slab uses a field called slab_cache, that lives in the third double
word. slub, uses a field called "slab", living outside of the
doublewords area.
Ideally, we could use the same field for this. Since slub heavily makes
use of the doubleword region, there isn't really much room to move
slub's slab_cache field around. Since slab does not have such strict
placement restrictions, we can move it outside the doubleword area.
The naming used by slab, "slab_cache", is less confusing, and it is
preferred over slub's generic "slab".
Signed-off-by: Glauber Costa <glommer@parallels.com>
Acked-by: Christoph Lameter <cl@linux.com>
CC: David Rientjes <rientjes@google.com>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
2012-10-22 14:05:36 +00:00
|
|
|
struct kmem_cache *slab_cache; /* SL[AU]B: Pointer to slab */
|
2007-05-06 21:49:36 +00:00
|
|
|
};
|
2011-06-01 17:25:48 +00:00
|
|
|
|
2014-12-10 23:44:52 +00:00
|
|
|
#ifdef CONFIG_MEMCG
|
|
|
|
struct mem_cgroup *mem_cgroup;
|
|
|
|
#endif
|
|
|
|
|
2006-09-27 08:50:01 +00:00
|
|
|
/*
|
|
|
|
* On machines where all RAM is mapped into kernel address space,
|
|
|
|
* we can simply calculate the virtual address. On machines with
|
|
|
|
* highmem some memory is mapped into kernel virtual memory
|
|
|
|
* dynamically, so we need a place to store that address.
|
|
|
|
* Note that this field could be 16 bits on x86 ... ;)
|
|
|
|
*
|
|
|
|
* Architectures with slow multiplication can define
|
|
|
|
* WANT_PAGE_VIRTUAL in asm/page.h
|
|
|
|
*/
|
|
|
|
#if defined(WANT_PAGE_VIRTUAL)
|
|
|
|
void *virtual; /* Kernel virtual address (NULL if
|
|
|
|
not kmapped, ie. highmem) */
|
|
|
|
#endif /* WANT_PAGE_VIRTUAL */
|
2008-04-03 22:51:41 +00:00
|
|
|
|
|
|
|
#ifdef CONFIG_KMEMCHECK
|
|
|
|
/*
|
|
|
|
* kmemcheck wants to track the status of each byte in a page; this
|
|
|
|
* is a pointer to such a status block. NULL if not tracked.
|
|
|
|
*/
|
|
|
|
void *shadow;
|
|
|
|
#endif
|
2012-11-12 09:06:20 +00:00
|
|
|
|
2013-10-07 10:29:20 +00:00
|
|
|
#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
|
|
|
|
int _last_cpupid;
|
2012-11-12 09:06:20 +00:00
|
|
|
#endif
|
2011-06-01 17:25:48 +00:00
|
|
|
}
|
|
|
|
/*
|
2012-01-13 01:17:27 +00:00
|
|
|
* The struct page can be forced to be double word aligned so that atomic ops
|
|
|
|
* on double words work. The SLUB allocator can make use of such a feature.
|
2011-06-01 17:25:48 +00:00
|
|
|
*/
|
2012-01-13 01:17:27 +00:00
|
|
|
#ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE
|
|
|
|
__aligned(2 * sizeof(unsigned long))
|
2011-06-01 17:25:48 +00:00
|
|
|
#endif
|
|
|
|
;
|
2006-09-27 08:50:01 +00:00
|
|
|
|
2015-05-07 04:11:57 +00:00
|
|
|
#define PAGE_FRAG_CACHE_MAX_SIZE __ALIGN_MASK(32768, ~PAGE_MASK)
|
|
|
|
#define PAGE_FRAG_CACHE_MAX_ORDER get_order(PAGE_FRAG_CACHE_MAX_SIZE)
|
|
|
|
|
|
|
|
struct page_frag_cache {
|
|
|
|
void * va;
|
|
|
|
#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
|
|
|
|
__u16 offset;
|
|
|
|
__u16 size;
|
|
|
|
#else
|
|
|
|
__u32 offset;
|
|
|
|
#endif
|
|
|
|
/* we maintain a pagecount bias, so that we dont dirty cache line
|
2016-05-20 00:10:49 +00:00
|
|
|
* containing page->_refcount every time we allocate a fragment.
|
2015-05-07 04:11:57 +00:00
|
|
|
*/
|
|
|
|
unsigned int pagecnt_bias;
|
|
|
|
bool pfmemalloc;
|
|
|
|
};
|
|
|
|
|
2015-09-08 22:02:15 +00:00
|
|
|
typedef unsigned long vm_flags_t;
|
2011-05-26 10:16:19 +00:00
|
|
|
|
2009-01-08 12:04:47 +00:00
|
|
|
/*
|
|
|
|
* A region containing a mapping of a non-memory backed file under NOMMU
|
|
|
|
* conditions. These are held in a global tree and are pinned by the VMAs that
|
|
|
|
* map parts of them.
|
|
|
|
*/
|
|
|
|
struct vm_region {
|
|
|
|
struct rb_node vm_rb; /* link in global region tree */
|
2011-05-26 10:16:19 +00:00
|
|
|
vm_flags_t vm_flags; /* VMA vm_flags */
|
2009-01-08 12:04:47 +00:00
|
|
|
unsigned long vm_start; /* start address of region */
|
|
|
|
unsigned long vm_end; /* region initialised to here */
|
2009-01-08 12:04:47 +00:00
|
|
|
unsigned long vm_top; /* region allocated to here */
|
2009-01-08 12:04:47 +00:00
|
|
|
unsigned long vm_pgoff; /* the offset in vm_file corresponding to vm_start */
|
|
|
|
struct file *vm_file; /* the backing file or NULL */
|
|
|
|
|
2010-01-16 01:01:33 +00:00
|
|
|
int vm_usage; /* region usage count (access under nommu_region_sem) */
|
NOMMU: Avoiding duplicate icache flushes of shared maps
When working with FDPIC, there are many shared mappings of read-only
code regions between applications (the C library, applet packages like
busybox, etc.), but the current do_mmap_pgoff() function will issue an
icache flush whenever a VMA is added to an MM instead of only doing it
when the map is initially created.
The flush can instead be done when a region is first mmapped PROT_EXEC.
Note that we may not rely on the first mapping of a region being
executable - it's possible for it to be PROT_READ only, so we have to
remember whether we've flushed the region or not, and then flush the
entire region when a bit of it is made executable.
However, this also affects the brk area. That will no longer be
executable. We can mprotect() it to PROT_EXEC on MPU-mode kernels, but
for NOMMU mode kernels, when it increases the brk allocation, making
sys_brk() flush the extra from the icache should suffice. The brk area
probably isn't used by NOMMU programs since the brk area can only use up
the leavings from the stack allocation, where the stack allocation is
larger than requested.
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Mike Frysinger <vapier@gentoo.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-01-06 17:23:23 +00:00
|
|
|
bool vm_icache_flushed : 1; /* true if the icache has been flushed for
|
|
|
|
* this region */
|
2009-01-08 12:04:47 +00:00
|
|
|
};
|
|
|
|
|
2015-09-04 22:46:14 +00:00
|
|
|
#ifdef CONFIG_USERFAULTFD
|
|
|
|
#define NULL_VM_UFFD_CTX ((struct vm_userfaultfd_ctx) { NULL, })
|
|
|
|
struct vm_userfaultfd_ctx {
|
|
|
|
struct userfaultfd_ctx *ctx;
|
|
|
|
};
|
|
|
|
#else /* CONFIG_USERFAULTFD */
|
|
|
|
#define NULL_VM_UFFD_CTX ((struct vm_userfaultfd_ctx) {})
|
|
|
|
struct vm_userfaultfd_ctx {};
|
|
|
|
#endif /* CONFIG_USERFAULTFD */
|
|
|
|
|
2007-10-16 08:24:43 +00:00
|
|
|
/*
|
|
|
|
* This struct defines a memory VMM memory area. There is one of these
|
|
|
|
* per VM-area/task. A VM area is any part of the process virtual memory
|
|
|
|
* space that has a special rule for the page-fault handlers (ie a shared
|
|
|
|
* library, the executable area etc).
|
|
|
|
*/
|
|
|
|
struct vm_area_struct {
|
2012-12-12 00:01:44 +00:00
|
|
|
/* The first cache line has the info for VMA tree walking. */
|
|
|
|
|
2007-10-16 08:24:43 +00:00
|
|
|
unsigned long vm_start; /* Our start address within vm_mm. */
|
|
|
|
unsigned long vm_end; /* The first byte after our end address
|
|
|
|
within vm_mm. */
|
|
|
|
|
|
|
|
/* linked list of VM areas per task, sorted by address */
|
2010-08-20 23:24:55 +00:00
|
|
|
struct vm_area_struct *vm_next, *vm_prev;
|
2007-10-16 08:24:43 +00:00
|
|
|
|
|
|
|
struct rb_node vm_rb;
|
|
|
|
|
2012-12-12 00:01:38 +00:00
|
|
|
/*
|
|
|
|
* Largest free memory gap in bytes to the left of this VMA.
|
|
|
|
* Either between this VMA and vma->vm_prev, or between one of the
|
|
|
|
* VMAs below us in the VMA rbtree and its ->vm_prev. This helps
|
|
|
|
* get_unmapped_area find a free area of the right size.
|
|
|
|
*/
|
|
|
|
unsigned long rb_subtree_gap;
|
|
|
|
|
2012-12-12 00:01:44 +00:00
|
|
|
/* Second cache line starts here. */
|
|
|
|
|
|
|
|
struct mm_struct *vm_mm; /* The address space we belong to. */
|
|
|
|
pgprot_t vm_page_prot; /* Access permissions of this VMA. */
|
|
|
|
unsigned long vm_flags; /* Flags, see mm.h. */
|
|
|
|
|
2007-10-16 08:24:43 +00:00
|
|
|
/*
|
|
|
|
* For areas with an address space and backing store,
|
2015-02-10 22:09:59 +00:00
|
|
|
* linkage into the address_space->i_mmap interval tree.
|
2007-10-16 08:24:43 +00:00
|
|
|
*/
|
2015-02-10 22:10:02 +00:00
|
|
|
struct {
|
|
|
|
struct rb_node rb;
|
|
|
|
unsigned long rb_subtree_last;
|
2007-10-16 08:24:43 +00:00
|
|
|
} shared;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma
|
|
|
|
* list, after a COW of one of the file pages. A MAP_SHARED vma
|
|
|
|
* can only be in the i_mmap tree. An anonymous MAP_PRIVATE, stack
|
|
|
|
* or brk vma (with NULL file) can only be in an anon_vma list.
|
|
|
|
*/
|
mm: change anon_vma linking to fix multi-process server scalability issue
The old anon_vma code can lead to scalability issues with heavily forking
workloads. Specifically, each anon_vma will be shared between the parent
process and all its child processes.
In a workload with 1000 child processes and a VMA with 1000 anonymous
pages per process that get COWed, this leads to a system with a million
anonymous pages in the same anon_vma, each of which is mapped in just one
of the 1000 processes. However, the current rmap code needs to walk them
all, leading to O(N) scanning complexity for each page.
This can result in systems where one CPU is walking the page tables of
1000 processes in page_referenced_one, while all other CPUs are stuck on
the anon_vma lock. This leads to catastrophic failure for a benchmark
like AIM7, where the total number of processes can reach in the tens of
thousands. Real workloads are still a factor 10 less process intensive
than AIM7, but they are catching up.
This patch changes the way anon_vmas and VMAs are linked, which allows us
to associate multiple anon_vmas with a VMA. At fork time, each child
process gets its own anon_vmas, in which its COWed pages will be
instantiated. The parents' anon_vma is also linked to the VMA, because
non-COWed pages could be present in any of the children.
This reduces rmap scanning complexity to O(1) for the pages of the 1000
child processes, with O(N) complexity for at most 1/N pages in the system.
This reduces the average scanning cost in heavily forking workloads from
O(N) to 2.
The only real complexity in this patch stems from the fact that linking a
VMA to anon_vmas now involves memory allocations. This means vma_adjust
can fail, if it needs to attach a VMA to anon_vma structures. This in
turn means error handling needs to be added to the calling functions.
A second source of complexity is that, because there can be multiple
anon_vmas, the anon_vma linking in vma_adjust can no longer be done under
"the" anon_vma lock. To prevent the rmap code from walking up an
incomplete VMA, this patch introduces the VM_LOCK_RMAP VMA flag. This bit
flag uses the same slot as the NOMMU VM_MAPPED_COPY, with an ifdef in mm.h
to make sure it is impossible to compile a kernel that needs both symbolic
values for the same bitflag.
Some test results:
Without the anon_vma changes, when AIM7 hits around 9.7k users (on a test
box with 16GB RAM and not quite enough IO), the system ends up running
>99% in system time, with every CPU on the same anon_vma lock in the
pageout code.
With these changes, AIM7 hits the cross-over point around 29.7k users.
This happens with ~99% IO wait time, there never seems to be any spike in
system time. The anon_vma lock contention appears to be resolved.
[akpm@linux-foundation.org: cleanups]
Signed-off-by: Rik van Riel <riel@redhat.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Larry Woodman <lwoodman@redhat.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-03-05 21:42:07 +00:00
|
|
|
struct list_head anon_vma_chain; /* Serialized by mmap_sem &
|
|
|
|
* page_table_lock */
|
2007-10-16 08:24:43 +00:00
|
|
|
struct anon_vma *anon_vma; /* Serialized by page_table_lock */
|
|
|
|
|
|
|
|
/* Function pointers to deal with this struct. */
|
2009-09-27 18:29:37 +00:00
|
|
|
const struct vm_operations_struct *vm_ops;
|
2007-10-16 08:24:43 +00:00
|
|
|
|
|
|
|
/* Information about our backing store: */
|
|
|
|
unsigned long vm_pgoff; /* Offset (within vm_file) in PAGE_SIZE
|
2016-04-01 12:29:48 +00:00
|
|
|
units */
|
2007-10-16 08:24:43 +00:00
|
|
|
struct file * vm_file; /* File we map to (can be NULL). */
|
|
|
|
void * vm_private_data; /* was vm_pte (shared mem) */
|
|
|
|
|
|
|
|
#ifndef CONFIG_MMU
|
2009-01-08 12:04:47 +00:00
|
|
|
struct vm_region *vm_region; /* NOMMU mapping region */
|
2007-10-16 08:24:43 +00:00
|
|
|
#endif
|
|
|
|
#ifdef CONFIG_NUMA
|
|
|
|
struct mempolicy *vm_policy; /* NUMA policy for the VMA */
|
|
|
|
#endif
|
2015-09-04 22:46:14 +00:00
|
|
|
struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
|
2007-10-16 08:24:43 +00:00
|
|
|
};
|
|
|
|
|
2008-07-25 08:47:44 +00:00
|
|
|
struct core_thread {
|
|
|
|
struct task_struct *task;
|
|
|
|
struct core_thread *next;
|
|
|
|
};
|
|
|
|
|
2008-07-25 08:47:41 +00:00
|
|
|
struct core_state {
|
2008-07-25 08:47:42 +00:00
|
|
|
atomic_t nr_threads;
|
2008-07-25 08:47:44 +00:00
|
|
|
struct core_thread dumper;
|
2008-07-25 08:47:41 +00:00
|
|
|
struct completion startup;
|
|
|
|
};
|
|
|
|
|
aio: convert the ioctx list to table lookup v3
On Wed, Jun 12, 2013 at 11:14:40AM -0700, Kent Overstreet wrote:
> On Mon, Apr 15, 2013 at 02:40:55PM +0300, Octavian Purdila wrote:
> > When using a large number of threads performing AIO operations the
> > IOCTX list may get a significant number of entries which will cause
> > significant overhead. For example, when running this fio script:
> >
> > rw=randrw; size=256k ;directory=/mnt/fio; ioengine=libaio; iodepth=1
> > blocksize=1024; numjobs=512; thread; loops=100
> >
> > on an EXT2 filesystem mounted on top of a ramdisk we can observe up to
> > 30% CPU time spent by lookup_ioctx:
> >
> > 32.51% [guest.kernel] [g] lookup_ioctx
> > 9.19% [guest.kernel] [g] __lock_acquire.isra.28
> > 4.40% [guest.kernel] [g] lock_release
> > 4.19% [guest.kernel] [g] sched_clock_local
> > 3.86% [guest.kernel] [g] local_clock
> > 3.68% [guest.kernel] [g] native_sched_clock
> > 3.08% [guest.kernel] [g] sched_clock_cpu
> > 2.64% [guest.kernel] [g] lock_release_holdtime.part.11
> > 2.60% [guest.kernel] [g] memcpy
> > 2.33% [guest.kernel] [g] lock_acquired
> > 2.25% [guest.kernel] [g] lock_acquire
> > 1.84% [guest.kernel] [g] do_io_submit
> >
> > This patchs converts the ioctx list to a radix tree. For a performance
> > comparison the above FIO script was run on a 2 sockets 8 core
> > machine. This are the results (average and %rsd of 10 runs) for the
> > original list based implementation and for the radix tree based
> > implementation:
> >
> > cores 1 2 4 8 16 32
> > list 109376 ms 69119 ms 35682 ms 22671 ms 19724 ms 16408 ms
> > %rsd 0.69% 1.15% 1.17% 1.21% 1.71% 1.43%
> > radix 73651 ms 41748 ms 23028 ms 16766 ms 15232 ms 13787 ms
> > %rsd 1.19% 0.98% 0.69% 1.13% 0.72% 0.75%
> > % of radix
> > relative 66.12% 65.59% 66.63% 72.31% 77.26% 83.66%
> > to list
> >
> > To consider the impact of the patch on the typical case of having
> > only one ctx per process the following FIO script was run:
> >
> > rw=randrw; size=100m ;directory=/mnt/fio; ioengine=libaio; iodepth=1
> > blocksize=1024; numjobs=1; thread; loops=100
> >
> > on the same system and the results are the following:
> >
> > list 58892 ms
> > %rsd 0.91%
> > radix 59404 ms
> > %rsd 0.81%
> > % of radix
> > relative 100.87%
> > to list
>
> So, I was just doing some benchmarking/profiling to get ready to send
> out the aio patches I've got for 3.11 - and it looks like your patch is
> causing a ~1.5% throughput regression in my testing :/
... <snip>
I've got an alternate approach for fixing this wart in lookup_ioctx()...
Instead of using an rbtree, just use the reserved id in the ring buffer
header to index an array pointing the ioctx. It's not finished yet, and
it needs to be tidied up, but is most of the way there.
-ben
--
"Thought is the essence of where you are now."
--
kmo> And, a rework of Ben's code, but this was entirely his idea
kmo> -Kent
bcrl> And fix the code to use the right mm_struct in kill_ioctx(), actually
free memory.
Signed-off-by: Benjamin LaHaise <bcrl@kvack.org>
2013-07-30 16:54:40 +00:00
|
|
|
struct kioctx_table;
|
2007-10-16 08:24:43 +00:00
|
|
|
struct mm_struct {
|
mm: per-thread vma caching
This patch is a continuation of efforts trying to optimize find_vma(),
avoiding potentially expensive rbtree walks to locate a vma upon faults.
The original approach (https://lkml.org/lkml/2013/11/1/410), where the
largest vma was also cached, ended up being too specific and random,
thus further comparison with other approaches were needed. There are
two things to consider when dealing with this, the cache hit rate and
the latency of find_vma(). Improving the hit-rate does not necessarily
translate in finding the vma any faster, as the overhead of any fancy
caching schemes can be too high to consider.
We currently cache the last used vma for the whole address space, which
provides a nice optimization, reducing the total cycles in find_vma() by
up to 250%, for workloads with good locality. On the other hand, this
simple scheme is pretty much useless for workloads with poor locality.
Analyzing ebizzy runs shows that, no matter how many threads are
running, the mmap_cache hit rate is less than 2%, and in many situations
below 1%.
The proposed approach is to replace this scheme with a small per-thread
cache, maximizing hit rates at a very low maintenance cost.
Invalidations are performed by simply bumping up a 32-bit sequence
number. The only expensive operation is in the rare case of a seq
number overflow, where all caches that share the same address space are
flushed. Upon a miss, the proposed replacement policy is based on the
page number that contains the virtual address in question. Concretely,
the following results are seen on an 80 core, 8 socket x86-64 box:
1) System bootup: Most programs are single threaded, so the per-thread
scheme does improve ~50% hit rate by just adding a few more slots to
the cache.
+----------------+----------+------------------+
| caching scheme | hit-rate | cycles (billion) |
+----------------+----------+------------------+
| baseline | 50.61% | 19.90 |
| patched | 73.45% | 13.58 |
+----------------+----------+------------------+
2) Kernel build: This one is already pretty good with the current
approach as we're dealing with good locality.
+----------------+----------+------------------+
| caching scheme | hit-rate | cycles (billion) |
+----------------+----------+------------------+
| baseline | 75.28% | 11.03 |
| patched | 88.09% | 9.31 |
+----------------+----------+------------------+
3) Oracle 11g Data Mining (4k pages): Similar to the kernel build workload.
+----------------+----------+------------------+
| caching scheme | hit-rate | cycles (billion) |
+----------------+----------+------------------+
| baseline | 70.66% | 17.14 |
| patched | 91.15% | 12.57 |
+----------------+----------+------------------+
4) Ebizzy: There's a fair amount of variation from run to run, but this
approach always shows nearly perfect hit rates, while baseline is just
about non-existent. The amounts of cycles can fluctuate between
anywhere from ~60 to ~116 for the baseline scheme, but this approach
reduces it considerably. For instance, with 80 threads:
+----------------+----------+------------------+
| caching scheme | hit-rate | cycles (billion) |
+----------------+----------+------------------+
| baseline | 1.06% | 91.54 |
| patched | 99.97% | 14.18 |
+----------------+----------+------------------+
[akpm@linux-foundation.org: fix nommu build, per Davidlohr]
[akpm@linux-foundation.org: document vmacache_valid() logic]
[akpm@linux-foundation.org: attempt to untangle header files]
[akpm@linux-foundation.org: add vmacache_find() BUG_ON]
[hughd@google.com: add vmacache_valid_mm() (from Oleg)]
[akpm@linux-foundation.org: coding-style fixes]
[akpm@linux-foundation.org: adjust and enhance comments]
Signed-off-by: Davidlohr Bueso <davidlohr@hp.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Reviewed-by: Michel Lespinasse <walken@google.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Tested-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-04-07 22:37:25 +00:00
|
|
|
struct vm_area_struct *mmap; /* list of VMAs */
|
2007-10-16 08:24:43 +00:00
|
|
|
struct rb_root mm_rb;
|
mm: per-thread vma caching
This patch is a continuation of efforts trying to optimize find_vma(),
avoiding potentially expensive rbtree walks to locate a vma upon faults.
The original approach (https://lkml.org/lkml/2013/11/1/410), where the
largest vma was also cached, ended up being too specific and random,
thus further comparison with other approaches were needed. There are
two things to consider when dealing with this, the cache hit rate and
the latency of find_vma(). Improving the hit-rate does not necessarily
translate in finding the vma any faster, as the overhead of any fancy
caching schemes can be too high to consider.
We currently cache the last used vma for the whole address space, which
provides a nice optimization, reducing the total cycles in find_vma() by
up to 250%, for workloads with good locality. On the other hand, this
simple scheme is pretty much useless for workloads with poor locality.
Analyzing ebizzy runs shows that, no matter how many threads are
running, the mmap_cache hit rate is less than 2%, and in many situations
below 1%.
The proposed approach is to replace this scheme with a small per-thread
cache, maximizing hit rates at a very low maintenance cost.
Invalidations are performed by simply bumping up a 32-bit sequence
number. The only expensive operation is in the rare case of a seq
number overflow, where all caches that share the same address space are
flushed. Upon a miss, the proposed replacement policy is based on the
page number that contains the virtual address in question. Concretely,
the following results are seen on an 80 core, 8 socket x86-64 box:
1) System bootup: Most programs are single threaded, so the per-thread
scheme does improve ~50% hit rate by just adding a few more slots to
the cache.
+----------------+----------+------------------+
| caching scheme | hit-rate | cycles (billion) |
+----------------+----------+------------------+
| baseline | 50.61% | 19.90 |
| patched | 73.45% | 13.58 |
+----------------+----------+------------------+
2) Kernel build: This one is already pretty good with the current
approach as we're dealing with good locality.
+----------------+----------+------------------+
| caching scheme | hit-rate | cycles (billion) |
+----------------+----------+------------------+
| baseline | 75.28% | 11.03 |
| patched | 88.09% | 9.31 |
+----------------+----------+------------------+
3) Oracle 11g Data Mining (4k pages): Similar to the kernel build workload.
+----------------+----------+------------------+
| caching scheme | hit-rate | cycles (billion) |
+----------------+----------+------------------+
| baseline | 70.66% | 17.14 |
| patched | 91.15% | 12.57 |
+----------------+----------+------------------+
4) Ebizzy: There's a fair amount of variation from run to run, but this
approach always shows nearly perfect hit rates, while baseline is just
about non-existent. The amounts of cycles can fluctuate between
anywhere from ~60 to ~116 for the baseline scheme, but this approach
reduces it considerably. For instance, with 80 threads:
+----------------+----------+------------------+
| caching scheme | hit-rate | cycles (billion) |
+----------------+----------+------------------+
| baseline | 1.06% | 91.54 |
| patched | 99.97% | 14.18 |
+----------------+----------+------------------+
[akpm@linux-foundation.org: fix nommu build, per Davidlohr]
[akpm@linux-foundation.org: document vmacache_valid() logic]
[akpm@linux-foundation.org: attempt to untangle header files]
[akpm@linux-foundation.org: add vmacache_find() BUG_ON]
[hughd@google.com: add vmacache_valid_mm() (from Oleg)]
[akpm@linux-foundation.org: coding-style fixes]
[akpm@linux-foundation.org: adjust and enhance comments]
Signed-off-by: Davidlohr Bueso <davidlohr@hp.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Reviewed-by: Michel Lespinasse <walken@google.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Tested-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-04-07 22:37:25 +00:00
|
|
|
u32 vmacache_seqnum; /* per-thread vmacache */
|
2010-01-16 01:01:35 +00:00
|
|
|
#ifdef CONFIG_MMU
|
2007-10-16 08:24:43 +00:00
|
|
|
unsigned long (*get_unmapped_area) (struct file *filp,
|
|
|
|
unsigned long addr, unsigned long len,
|
|
|
|
unsigned long pgoff, unsigned long flags);
|
2010-01-16 01:01:35 +00:00
|
|
|
#endif
|
2007-10-16 08:24:43 +00:00
|
|
|
unsigned long mmap_base; /* base of mmap area */
|
2013-08-21 17:55:59 +00:00
|
|
|
unsigned long mmap_legacy_base; /* base of mmap area in bottom-up allocations */
|
2017-03-06 14:17:19 +00:00
|
|
|
#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES
|
|
|
|
	/* Base addresses for compatible mmap() */
|
|
|
|
unsigned long mmap_compat_base;
|
|
|
|
unsigned long mmap_compat_legacy_base;
|
|
|
|
#endif
|
2007-10-16 08:24:43 +00:00
|
|
|
unsigned long task_size; /* size of task vm space */
|
2012-12-12 00:01:38 +00:00
|
|
|
unsigned long highest_vm_end; /* highest vma end address */
|
2007-10-16 08:24:43 +00:00
|
|
|
pgd_t * pgd;
|
2017-02-27 22:30:16 +00:00
|
|
|
|
|
|
|
/**
|
|
|
|
* @mm_users: The number of users including userspace.
|
|
|
|
*
|
|
|
|
* Use mmget()/mmget_not_zero()/mmput() to modify. When this drops
|
|
|
|
* to 0 (i.e. when the task exits and there are no other temporary
|
|
|
|
* reference holders), we also release a reference on @mm_count
|
|
|
|
* (which may then free the &struct mm_struct if @mm_count also
|
|
|
|
* drops to 0).
|
|
|
|
*/
|
|
|
|
atomic_t mm_users;
|
|
|
|
|
|
|
|
/**
|
|
|
|
* @mm_count: The number of references to &struct mm_struct
|
|
|
|
* (@mm_users count as 1).
|
|
|
|
*
|
|
|
|
* Use mmgrab()/mmdrop() to modify. When this drops to 0, the
|
|
|
|
* &struct mm_struct is freed.
|
|
|
|
*/
|
|
|
|
atomic_t mm_count;
|
|
|
|
|
mm: account pmd page tables to the process
Dave noticed that unprivileged process can allocate significant amount of
memory -- >500 MiB on x86_64 -- and stay unnoticed by oom-killer and
memory cgroup. The trick is to allocate a lot of PMD page tables. Linux
kernel doesn't account PMD tables to the process, only PTE.
The use-cases below use few tricks to allocate a lot of PMD page tables
while keeping VmRSS and VmPTE low. oom_score for the process will be 0.
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/prctl.h>
#define PUD_SIZE (1UL << 30)
#define PMD_SIZE (1UL << 21)
#define NR_PUD 130000
int main(void)
{
char *addr = NULL;
unsigned long i;
prctl(PR_SET_THP_DISABLE);
for (i = 0; i < NR_PUD ; i++) {
addr = mmap(addr + PUD_SIZE, PUD_SIZE, PROT_WRITE|PROT_READ,
MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
if (addr == MAP_FAILED) {
perror("mmap");
break;
}
*addr = 'x';
munmap(addr, PMD_SIZE);
mmap(addr, PMD_SIZE, PROT_WRITE|PROT_READ,
MAP_ANONYMOUS|MAP_PRIVATE|MAP_FIXED, -1, 0);
if (addr == MAP_FAILED)
perror("re-mmap"), exit(1);
}
printf("PID %d consumed %lu KiB in PMD page tables\n",
getpid(), i * 4096 >> 10);
return pause();
}
The patch addresses the issue by accounting PMD tables to the process the
same way we account PTE.
The main place where PMD tables are accounted is __pmd_alloc() and
free_pmd_range(). But there're few corner cases:
- HugeTLB can share PMD page tables. The patch handles by accounting
the table to all processes who share it.
- x86 PAE pre-allocates few PMD tables on fork.
- Architectures with FIRST_USER_ADDRESS > 0. We need to adjust sanity
check on exit(2).
Accounting only happens on configuration where PMD page table's level is
present (PMD is not folded). As with nr_ptes we use per-mm counter. The
counter value is used to calculate baseline for badness score by
oom-killer.
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Reported-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Hugh Dickins <hughd@google.com>
Reviewed-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: David Rientjes <rientjes@google.com>
Tested-by: Sedat Dilek <sedat.dilek@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-11 23:26:50 +00:00
|
|
|
atomic_long_t nr_ptes; /* PTE page table pages */
|
2015-04-14 22:46:21 +00:00
|
|
|
#if CONFIG_PGTABLE_LEVELS > 2
|
mm: account pmd page tables to the process
Dave noticed that unprivileged process can allocate significant amount of
memory -- >500 MiB on x86_64 -- and stay unnoticed by oom-killer and
memory cgroup. The trick is to allocate a lot of PMD page tables. Linux
kernel doesn't account PMD tables to the process, only PTE.
The use-cases below use few tricks to allocate a lot of PMD page tables
while keeping VmRSS and VmPTE low. oom_score for the process will be 0.
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/prctl.h>
#define PUD_SIZE (1UL << 30)
#define PMD_SIZE (1UL << 21)
#define NR_PUD 130000
int main(void)
{
char *addr = NULL;
unsigned long i;
prctl(PR_SET_THP_DISABLE);
for (i = 0; i < NR_PUD ; i++) {
addr = mmap(addr + PUD_SIZE, PUD_SIZE, PROT_WRITE|PROT_READ,
MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
if (addr == MAP_FAILED) {
perror("mmap");
break;
}
*addr = 'x';
munmap(addr, PMD_SIZE);
mmap(addr, PMD_SIZE, PROT_WRITE|PROT_READ,
MAP_ANONYMOUS|MAP_PRIVATE|MAP_FIXED, -1, 0);
if (addr == MAP_FAILED)
perror("re-mmap"), exit(1);
}
printf("PID %d consumed %lu KiB in PMD page tables\n",
getpid(), i * 4096 >> 10);
return pause();
}
The patch addresses the issue by accounting PMD tables to the process the
same way we account PTE.
The main place where PMD tables are accounted is __pmd_alloc() and
free_pmd_range(). But there're few corner cases:
- HugeTLB can share PMD page tables. The patch handles by accounting
the table to all processes who share it.
- x86 PAE pre-allocates few PMD tables on fork.
- Architectures with FIRST_USER_ADDRESS > 0. We need to adjust sanity
check on exit(2).
Accounting only happens on configuration where PMD page table's level is
present (PMD is not folded). As with nr_ptes we use per-mm counter. The
counter value is used to calculate baseline for badness score by
oom-killer.
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Reported-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Hugh Dickins <hughd@google.com>
Reviewed-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: David Rientjes <rientjes@google.com>
Tested-by: Sedat Dilek <sedat.dilek@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-11 23:26:50 +00:00
|
|
|
atomic_long_t nr_pmds; /* PMD page table pages */
|
2015-04-14 22:46:21 +00:00
|
|
|
#endif
|
2007-10-16 08:24:43 +00:00
|
|
|
int map_count; /* number of VMAs */
|
2011-03-22 23:32:50 +00:00
|
|
|
|
2007-10-16 08:24:43 +00:00
|
|
|
spinlock_t page_table_lock; /* Protects page tables and some counters */
|
2011-03-22 23:32:50 +00:00
|
|
|
struct rw_semaphore mmap_sem;
|
2007-10-16 08:24:43 +00:00
|
|
|
|
|
|
|
struct list_head mmlist; /* List of maybe swapped mm's. These are globally strung
|
|
|
|
* together off init_mm.mmlist, and are protected
|
|
|
|
* by mmlist_lock
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
|
|
unsigned long hiwater_rss; /* High-watermark of RSS usage */
|
|
|
|
unsigned long hiwater_vm; /* High-water virtual memory usage */
|
|
|
|
|
2011-11-01 00:07:34 +00:00
|
|
|
unsigned long total_vm; /* Total pages mapped */
|
|
|
|
unsigned long locked_vm; /* Pages that have PG_mlocked set */
|
|
|
|
unsigned long pinned_vm; /* Refcount permanently increased */
|
2016-02-03 00:57:46 +00:00
|
|
|
unsigned long data_vm; /* VM_WRITE & ~VM_SHARED & ~VM_STACK */
|
|
|
|
unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE & ~VM_STACK */
|
|
|
|
unsigned long stack_vm; /* VM_STACK */
|
2011-11-01 00:07:34 +00:00
|
|
|
unsigned long def_flags;
|
2007-10-16 08:24:43 +00:00
|
|
|
unsigned long start_code, end_code, start_data, end_data;
|
|
|
|
unsigned long start_brk, brk, start_stack;
|
|
|
|
unsigned long arg_start, arg_end, env_start, env_end;
|
|
|
|
|
|
|
|
unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */
|
|
|
|
|
2010-03-05 21:41:39 +00:00
|
|
|
/*
|
|
|
|
* Special counters, in some configurations protected by the
|
|
|
|
* page_table_lock, in other configurations by being atomic.
|
|
|
|
*/
|
|
|
|
struct mm_rss_stat rss_stat;
|
|
|
|
|
2009-09-23 22:57:41 +00:00
|
|
|
struct linux_binfmt *binfmt;
|
|
|
|
|
2011-05-29 18:32:28 +00:00
|
|
|
cpumask_var_t cpu_vm_mask_var;
|
|
|
|
|
2007-10-16 08:24:43 +00:00
|
|
|
/* Architecture-specific MM context */
|
|
|
|
mm_context_t context;
|
|
|
|
|
|
|
|
unsigned long flags; /* Must use atomic bitops to access the bits */
|
|
|
|
|
2008-07-25 08:47:46 +00:00
|
|
|
struct core_state *core_state; /* coredumping support */
|
2009-09-23 22:57:32 +00:00
|
|
|
#ifdef CONFIG_AIO
|
aio: convert the ioctx list to table lookup v3
On Wed, Jun 12, 2013 at 11:14:40AM -0700, Kent Overstreet wrote:
> On Mon, Apr 15, 2013 at 02:40:55PM +0300, Octavian Purdila wrote:
> > When using a large number of threads performing AIO operations the
> > IOCTX list may get a significant number of entries which will cause
> > significant overhead. For example, when running this fio script:
> >
> > rw=randrw; size=256k ;directory=/mnt/fio; ioengine=libaio; iodepth=1
> > blocksize=1024; numjobs=512; thread; loops=100
> >
> > on an EXT2 filesystem mounted on top of a ramdisk we can observe up to
> > 30% CPU time spent by lookup_ioctx:
> >
> > 32.51% [guest.kernel] [g] lookup_ioctx
> > 9.19% [guest.kernel] [g] __lock_acquire.isra.28
> > 4.40% [guest.kernel] [g] lock_release
> > 4.19% [guest.kernel] [g] sched_clock_local
> > 3.86% [guest.kernel] [g] local_clock
> > 3.68% [guest.kernel] [g] native_sched_clock
> > 3.08% [guest.kernel] [g] sched_clock_cpu
> > 2.64% [guest.kernel] [g] lock_release_holdtime.part.11
> > 2.60% [guest.kernel] [g] memcpy
> > 2.33% [guest.kernel] [g] lock_acquired
> > 2.25% [guest.kernel] [g] lock_acquire
> > 1.84% [guest.kernel] [g] do_io_submit
> >
> > This patch converts the ioctx list to a radix tree. For a performance
> > comparison the above FIO script was run on a 2 sockets 8 core
> > machine. These are the results (average and %rsd of 10 runs) for the
> > original list based implementation and for the radix tree based
> > implementation:
> >
> > cores 1 2 4 8 16 32
> > list 109376 ms 69119 ms 35682 ms 22671 ms 19724 ms 16408 ms
> > %rsd 0.69% 1.15% 1.17% 1.21% 1.71% 1.43%
> > radix 73651 ms 41748 ms 23028 ms 16766 ms 15232 ms 13787 ms
> > %rsd 1.19% 0.98% 0.69% 1.13% 0.72% 0.75%
> > % of radix
> > relative 66.12% 65.59% 66.63% 72.31% 77.26% 83.66%
> > to list
> >
> > To consider the impact of the patch on the typical case of having
> > only one ctx per process the following FIO script was run:
> >
> > rw=randrw; size=100m ;directory=/mnt/fio; ioengine=libaio; iodepth=1
> > blocksize=1024; numjobs=1; thread; loops=100
> >
> > on the same system and the results are the following:
> >
> > list 58892 ms
> > %rsd 0.91%
> > radix 59404 ms
> > %rsd 0.81%
> > % of radix
> > relative 100.87%
> > to list
>
> So, I was just doing some benchmarking/profiling to get ready to send
> out the aio patches I've got for 3.11 - and it looks like your patch is
> causing a ~1.5% throughput regression in my testing :/
... <snip>
I've got an alternate approach for fixing this wart in lookup_ioctx()...
Instead of using an rbtree, just use the reserved id in the ring buffer
header to index an array pointing to the ioctx. It's not finished yet, and
it needs to be tidied up, but is most of the way there.
-ben
--
"Thought is the essence of where you are now."
--
kmo> And, a rework of Ben's code, but this was entirely his idea
kmo> -Kent
bcrl> And fix the code to use the right mm_struct in kill_ioctx(), actually
free memory.
Signed-off-by: Benjamin LaHaise <bcrl@kvack.org>
2013-07-30 16:54:40 +00:00
|
|
|
spinlock_t ioctx_lock;
|
|
|
|
struct kioctx_table __rcu *ioctx_table;
|
2009-09-23 22:57:32 +00:00
|
|
|
#endif
|
2014-06-04 23:07:34 +00:00
|
|
|
#ifdef CONFIG_MEMCG
|
2008-05-12 21:02:31 +00:00
|
|
|
/*
|
|
|
|
* "owner" points to a task that is regarded as the canonical
|
|
|
|
* user/owner of this mm. All of the following must be true in
|
|
|
|
* order for it to be changed:
|
|
|
|
*
|
|
|
|
* current == mm->owner
|
|
|
|
* current->mm != mm
|
|
|
|
* new_owner->mm == mm
|
|
|
|
* new_owner->alloc_lock is held
|
|
|
|
*/
|
2010-02-24 19:01:56 +00:00
|
|
|
struct task_struct __rcu *owner;
|
2008-02-07 08:13:51 +00:00
|
|
|
#endif
|
2016-10-14 02:23:16 +00:00
|
|
|
struct user_namespace *user_ns;
|
2008-04-29 08:01:36 +00:00
|
|
|
|
|
|
|
/* store ref to file /proc/<pid>/exe symlink points to */
|
2015-04-16 19:47:56 +00:00
|
|
|
struct file __rcu *exe_file;
|
mmu-notifiers: core
With KVM/GRU/XPMEM there isn't just the primary CPU MMU pointing to pages.
There are secondary MMUs (with secondary sptes and secondary tlbs) too.
sptes in the kvm case are shadow pagetables, but when I say spte in
mmu-notifier context, I mean "secondary pte". In GRU case there's no
actual secondary pte and there's only a secondary tlb because the GRU
secondary MMU has no knowledge about sptes and every secondary tlb miss
event in the MMU always generates a page fault that has to be resolved by
the CPU (this is not the case of KVM where a secondary tlb miss will
walk sptes in hardware and it will refill the secondary tlb transparently
to software if the corresponding spte is present). The same way
zap_page_range has to invalidate the pte before freeing the page, the spte
(and secondary tlb) must also be invalidated before any page is freed and
reused.
Currently we take a page_count pin on every page mapped by sptes, but that
means the pages can't be swapped whenever they're mapped by any spte
because they're part of the guest working set. Furthermore a spte unmap
event can immediately lead to a page to be freed when the pin is released
(so requiring the same complex and relatively slow tlb_gather smp safe
logic we have in zap_page_range and that can be avoided completely if the
spte unmap event doesn't require an unpin of the page previously mapped in
the secondary MMU).
The mmu notifiers allow kvm/GRU/XPMEM to attach to the tsk->mm and know
when the VM is swapping or freeing or doing anything on the primary MMU so
that the secondary MMU code can drop sptes before the pages are freed,
avoiding all page pinning and allowing 100% reliable swapping of guest
physical address space. Furthermore it avoids the code that tears down the
mappings of the secondary MMU, to implement a logic like tlb_gather in
zap_page_range that would require many IPI to flush other cpu tlbs, for
each fixed number of spte unmapped.
To make an example: if what happens on the primary MMU is a protection
downgrade (from writeable to wrprotect) the secondary MMU mappings will be
invalidated, and the next secondary-mmu-page-fault will call
get_user_pages and trigger a do_wp_page through get_user_pages if it
called get_user_pages with write=1, and it'll re-establish an updated
spte or secondary-tlb-mapping on the copied page. Or it will setup a
readonly spte or readonly tlb mapping if it's a guest-read, if it calls
get_user_pages with write=0. This is just an example.
This allows to map any page pointed by any pte (and in turn visible in the
primary CPU MMU), into a secondary MMU (be it a pure tlb like GRU, or an
full MMU with both sptes and secondary-tlb like the shadow-pagetable layer
with kvm), or a remote DMA in software like XPMEM (hence needing of
schedule in XPMEM code to send the invalidate to the remote node, while no
need to schedule in kvm/gru as it's an immediate event like invalidating
primary-mmu pte).
At least for KVM without this patch it's impossible to swap guests
reliably. And having this feature and removing the page pin allows
several other optimizations that simplify life considerably.
Dependencies:
1) mm_take_all_locks() to register the mmu notifier when the whole VM
isn't doing anything with "mm". This allows mmu notifier users to keep
track if the VM is in the middle of the invalidate_range_begin/end
critical section with an atomic counter increase in range_begin and
decreased in range_end. No secondary MMU page fault is allowed to map
any spte or secondary tlb reference, while the VM is in the middle of
range_begin/end as any page returned by get_user_pages in that critical
section could later immediately be freed without any further
->invalidate_page notification (invalidate_range_begin/end works on
ranges and ->invalidate_page isn't called immediately before freeing
the page). To stop all page freeing and pagetable overwrites the
mmap_sem must be taken in write mode and all other anon_vma/i_mmap
locks must be taken too.
2) It'd be a waste to add branches in the VM if nobody could possibly
run KVM/GRU/XPMEM on the kernel, so mmu notifiers will only enabled if
CONFIG_KVM=m/y. In the current kernel kvm won't yet take advantage of
mmu notifiers, but this already allows to compile a KVM external module
against a kernel with mmu notifiers enabled and from the next pull from
kvm.git we'll start using them. And GRU/XPMEM will also be able to
continue the development by enabling KVM=m in their config, until they
submit all GRU/XPMEM GPLv2 code to the mainline kernel. Then they can
also enable MMU_NOTIFIERS in the same way KVM does it (even if KVM=n).
This guarantees nobody selects MMU_NOTIFIER=y if KVM and GRU and XPMEM
are all =n.
The mmu_notifier_register call can fail because mm_take_all_locks may be
interrupted by a signal and return -EINTR. Because mmu_notifier_register
is used when a driver startup, a failure can be gracefully handled. Here
an example of the change applied to kvm to register the mmu notifiers.
Usually when a driver startups other allocations are required anyway and
-ENOMEM failure paths exists already.
struct kvm *kvm_arch_create_vm(void)
{
struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
+ int err;
if (!kvm)
return ERR_PTR(-ENOMEM);
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+ kvm->arch.mmu_notifier.ops = &kvm_mmu_notifier_ops;
+ err = mmu_notifier_register(&kvm->arch.mmu_notifier, current->mm);
+ if (err) {
+ kfree(kvm);
+ return ERR_PTR(err);
+ }
+
return kvm;
}
mmu_notifier_unregister returns void and it's reliable.
The patch also adds a few needed but missing includes that would prevent
kernel to compile after these changes on non-x86 archs (x86 didn't need
them by luck).
[akpm@linux-foundation.org: coding-style fixes]
[akpm@linux-foundation.org: fix mm/filemap_xip.c build]
[akpm@linux-foundation.org: fix mm/mmu_notifier.c build]
Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Jack Steiner <steiner@sgi.com>
Cc: Robin Holt <holt@sgi.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Kanoj Sarcar <kanojsarcar@yahoo.com>
Cc: Roland Dreier <rdreier@cisco.com>
Cc: Steve Wise <swise@opengridcomputing.com>
Cc: Avi Kivity <avi@qumranet.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Anthony Liguori <aliguori@us.ibm.com>
Cc: Chris Wright <chrisw@redhat.com>
Cc: Marcelo Tosatti <marcelo@kvack.org>
Cc: Eric Dumazet <dada1@cosmosbay.com>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
Cc: Izik Eidus <izike@qumranet.com>
Cc: Anthony Liguori <aliguori@us.ibm.com>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-07-28 22:46:29 +00:00
|
|
|
#ifdef CONFIG_MMU_NOTIFIER
|
|
|
|
struct mmu_notifier_mm *mmu_notifier_mm;
|
2011-01-13 23:46:45 +00:00
|
|
|
#endif
|
2013-11-14 22:31:07 +00:00
|
|
|
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
|
2011-01-13 23:46:45 +00:00
|
|
|
pgtable_t pmd_huge_pte; /* protected by page_table_lock */
|
mmu-notifiers: core
With KVM/GRU/XPMEM there isn't just the primary CPU MMU pointing to pages.
There are secondary MMUs (with secondary sptes and secondary tlbs) too.
sptes in the kvm case are shadow pagetables, but when I say spte in
mmu-notifier context, I mean "secondary pte". In GRU case there's no
actual secondary pte and there's only a secondary tlb because the GRU
secondary MMU has no knowledge about sptes and every secondary tlb miss
event in the MMU always generates a page fault that has to be resolved by
the CPU (this is not the case of KVM where a secondary tlb miss will
walk sptes in hardware and it will refill the secondary tlb transparently
to software if the corresponding spte is present). The same way
zap_page_range has to invalidate the pte before freeing the page, the spte
(and secondary tlb) must also be invalidated before any page is freed and
reused.
Currently we take a page_count pin on every page mapped by sptes, but that
means the pages can't be swapped whenever they're mapped by any spte
because they're part of the guest working set. Furthermore a spte unmap
event can immediately lead to a page to be freed when the pin is released
(so requiring the same complex and relatively slow tlb_gather smp safe
logic we have in zap_page_range and that can be avoided completely if the
spte unmap event doesn't require an unpin of the page previously mapped in
the secondary MMU).
The mmu notifiers allow kvm/GRU/XPMEM to attach to the tsk->mm and know
when the VM is swapping or freeing or doing anything on the primary MMU so
that the secondary MMU code can drop sptes before the pages are freed,
avoiding all page pinning and allowing 100% reliable swapping of guest
physical address space. Furthermore it avoids the code that tears down the
mappings of the secondary MMU, to implement a logic like tlb_gather in
zap_page_range that would require many IPI to flush other cpu tlbs, for
each fixed number of spte unmapped.
To make an example: if what happens on the primary MMU is a protection
downgrade (from writeable to wrprotect) the secondary MMU mappings will be
invalidated, and the next secondary-mmu-page-fault will call
get_user_pages and trigger a do_wp_page through get_user_pages if it
called get_user_pages with write=1, and it'll re-establish an updated
spte or secondary-tlb-mapping on the copied page. Or it will setup a
readonly spte or readonly tlb mapping if it's a guest-read, if it calls
get_user_pages with write=0. This is just an example.
This allows to map any page pointed by any pte (and in turn visible in the
primary CPU MMU), into a secondary MMU (be it a pure tlb like GRU, or an
full MMU with both sptes and secondary-tlb like the shadow-pagetable layer
with kvm), or a remote DMA in software like XPMEM (hence needing of
schedule in XPMEM code to send the invalidate to the remote node, while no
need to schedule in kvm/gru as it's an immediate event like invalidating
primary-mmu pte).
At least for KVM without this patch it's impossible to swap guests
reliably. And having this feature and removing the page pin allows
several other optimizations that simplify life considerably.
Dependencies:
1) mm_take_all_locks() to register the mmu notifier when the whole VM
isn't doing anything with "mm". This allows mmu notifier users to keep
track if the VM is in the middle of the invalidate_range_begin/end
critical section with an atomic counter increase in range_begin and
decreased in range_end. No secondary MMU page fault is allowed to map
any spte or secondary tlb reference, while the VM is in the middle of
range_begin/end as any page returned by get_user_pages in that critical
section could later immediately be freed without any further
->invalidate_page notification (invalidate_range_begin/end works on
ranges and ->invalidate_page isn't called immediately before freeing
the page). To stop all page freeing and pagetable overwrites the
mmap_sem must be taken in write mode and all other anon_vma/i_mmap
locks must be taken too.
2) It'd be a waste to add branches in the VM if nobody could possibly
run KVM/GRU/XPMEM on the kernel, so mmu notifiers will only enabled if
CONFIG_KVM=m/y. In the current kernel kvm won't yet take advantage of
mmu notifiers, but this already allows to compile a KVM external module
against a kernel with mmu notifiers enabled and from the next pull from
kvm.git we'll start using them. And GRU/XPMEM will also be able to
continue the development by enabling KVM=m in their config, until they
submit all GRU/XPMEM GPLv2 code to the mainline kernel. Then they can
also enable MMU_NOTIFIERS in the same way KVM does it (even if KVM=n).
This guarantees nobody selects MMU_NOTIFIER=y if KVM and GRU and XPMEM
are all =n.
The mmu_notifier_register call can fail because mm_take_all_locks may be
interrupted by a signal and return -EINTR. Because mmu_notifier_register
is used when a driver startup, a failure can be gracefully handled. Here
an example of the change applied to kvm to register the mmu notifiers.
Usually when a driver startups other allocations are required anyway and
-ENOMEM failure paths exists already.
struct kvm *kvm_arch_create_vm(void)
{
struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
+ int err;
if (!kvm)
return ERR_PTR(-ENOMEM);
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+ kvm->arch.mmu_notifier.ops = &kvm_mmu_notifier_ops;
+ err = mmu_notifier_register(&kvm->arch.mmu_notifier, current->mm);
+ if (err) {
+ kfree(kvm);
+ return ERR_PTR(err);
+ }
+
return kvm;
}
mmu_notifier_unregister returns void and it's reliable.
The patch also adds a few needed but missing includes that would prevent
the kernel from compiling after these changes on non-x86 archs (x86 didn't need
them by luck).
[akpm@linux-foundation.org: coding-style fixes]
[akpm@linux-foundation.org: fix mm/filemap_xip.c build]
[akpm@linux-foundation.org: fix mm/mmu_notifier.c build]
Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Jack Steiner <steiner@sgi.com>
Cc: Robin Holt <holt@sgi.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Kanoj Sarcar <kanojsarcar@yahoo.com>
Cc: Roland Dreier <rdreier@cisco.com>
Cc: Steve Wise <swise@opengridcomputing.com>
Cc: Avi Kivity <avi@qumranet.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Anthony Liguori <aliguori@us.ibm.com>
Cc: Chris Wright <chrisw@redhat.com>
Cc: Marcelo Tosatti <marcelo@kvack.org>
Cc: Eric Dumazet <dada1@cosmosbay.com>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
Cc: Izik Eidus <izike@qumranet.com>
Cc: Anthony Liguori <aliguori@us.ibm.com>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-07-28 22:46:29 +00:00
|
|
|
#endif
|
2011-05-29 18:32:28 +00:00
|
|
|
#ifdef CONFIG_CPUMASK_OFFSTACK
|
|
|
|
struct cpumask cpumask_allocation;
|
2012-10-25 12:16:43 +00:00
|
|
|
#endif
|
|
|
|
#ifdef CONFIG_NUMA_BALANCING
|
|
|
|
/*
|
2013-02-23 00:34:25 +00:00
|
|
|
* numa_next_scan is the next time that the PTEs will be marked
|
|
|
|
* pte_numa. NUMA hinting faults will gather statistics and migrate
|
|
|
|
* pages to new nodes if necessary.
|
2012-10-25 12:16:43 +00:00
|
|
|
*/
|
|
|
|
unsigned long numa_next_scan;
|
|
|
|
|
mm: sched: numa: Implement constant, per task Working Set Sampling (WSS) rate
Previously, to probe the working set of a task, we'd use
a very simple and crude method: mark all of its address
space PROT_NONE.
That method has various (obvious) disadvantages:
- it samples the working set at dissimilar rates,
giving some tasks a sampling quality advantage
over others.
- creates performance problems for tasks with very
large working sets
- over-samples processes with large address spaces but
which only very rarely execute
Improve that method by keeping a rotating offset into the
address space that marks the current position of the scan,
and advance it by a constant rate (in a CPU cycles execution
proportional manner). If the offset reaches the last mapped
address of the mm then it starts over at the first
address.
The per-task nature of the working set sampling functionality in this tree
allows such constant rate, per task, execution-weight proportional sampling
of the working set, with an adaptive sampling interval/frequency that
goes from once per 100ms up to just once per 8 seconds. The current
sampling volume is 256 MB per interval.
As tasks mature and converge their working set, so does the
sampling rate slow down to just a trickle, 256 MB per 8
seconds of CPU time executed.
This, beyond being adaptive, also rate-limits rarely
executing systems and does not over-sample on overloaded
systems.
[ In AutoNUMA speak, this patch deals with the effective sampling
rate of the 'hinting page fault'. AutoNUMA's scanning is
currently rate-limited, but it is also fundamentally
single-threaded, executing in the knuma_scand kernel thread,
so the limit in AutoNUMA is global and does not scale up with
the number of CPUs, nor does it scan tasks in an execution
proportional manner.
So the idea of rate-limiting the scanning was first implemented
in the AutoNUMA tree via a global rate limit. This patch goes
beyond that by implementing an execution rate proportional
working set sampling rate that is not implemented via a single
global scanning daemon. ]
[ Dan Carpenter pointed out a possible NULL pointer dereference in the
first version of this patch. ]
Based-on-idea-by: Andrea Arcangeli <aarcange@redhat.com>
Bug-Found-By: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
[ Wrote changelog and fixed bug. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Rik van Riel <riel@redhat.com>
2012-10-25 12:16:45 +00:00
|
|
|
/* Restart point for scanning and setting pte_numa */
|
|
|
|
unsigned long numa_scan_offset;
|
|
|
|
|
2012-10-25 12:16:43 +00:00
|
|
|
/* numa_scan_seq prevents two threads setting pte_numa */
|
|
|
|
int numa_scan_seq;
|
mm: fix TLB flush race between migration, and change_protection_range
There are a few subtle races, between change_protection_range (used by
mprotect and change_prot_numa) on one side, and NUMA page migration and
compaction on the other side.
The basic race is that there is a time window between when the PTE gets
made non-present (PROT_NONE or NUMA), and the TLB is flushed.
During that time, a CPU may continue writing to the page.
This is fine most of the time, however compaction or the NUMA migration
code may come in, and migrate the page away.
When that happens, the CPU may continue writing, through the cached
translation, to what is no longer the current memory location of the
process.
This only affects x86, which has a somewhat optimistic pte_accessible.
All other architectures appear to be safe, and will either always flush,
or flush whenever there is a valid mapping, even with no permissions
(SPARC).
The basic race looks like this:
CPU A CPU B CPU C
load TLB entry
make entry PTE/PMD_NUMA
fault on entry
read/write old page
start migrating page
change PTE/PMD to new page
read/write old page [*]
flush TLB
reload TLB from new entry
read/write new page
lose data
[*] the old page may belong to a new user at this point!
The obvious fix is to flush remote TLB entries, by making sure that
pte_accessible is aware of the fact that PROT_NONE and PROT_NUMA memory may
still be accessible if there is a TLB flush pending for the mm.
This should fix both NUMA migration and compaction.
[mgorman@suse.de: fix build]
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: Alex Thorlton <athorlton@sgi.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-12-19 01:08:44 +00:00
|
|
|
#endif
|
|
|
|
#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
|
|
|
|
/*
|
|
|
|
* An operation with batched TLB flushing is going on. Anything that
|
|
|
|
* can move process memory needs to flush the TLB when moving a
|
|
|
|
* PROT_NONE or PROT_NUMA mapped page.
|
|
|
|
*/
|
|
|
|
bool tlb_flush_pending;
|
2011-05-29 18:32:28 +00:00
|
|
|
#endif
|
2012-03-30 18:26:31 +00:00
|
|
|
struct uprobes_state uprobes_state;
|
2015-11-06 02:47:14 +00:00
|
|
|
#ifdef CONFIG_HUGETLB_PAGE
|
|
|
|
atomic_long_t hugetlb_usage;
|
|
|
|
#endif
|
2016-05-20 23:57:21 +00:00
|
|
|
struct work_struct async_put_work;
|
2007-10-16 08:24:43 +00:00
|
|
|
};
|
|
|
|
|
2017-02-02 11:27:56 +00:00
|
|
|
extern struct mm_struct init_mm;
|
|
|
|
|
2011-05-29 18:32:28 +00:00
|
|
|
static inline void mm_init_cpumask(struct mm_struct *mm)
|
|
|
|
{
|
|
|
|
#ifdef CONFIG_CPUMASK_OFFSTACK
|
|
|
|
mm->cpu_vm_mask_var = &mm->cpumask_allocation;
|
|
|
|
#endif
|
fork/exec: cleanup mm initialization
mm initialization on fork/exec is spread all over the place, which makes
the code look inconsistent.
We have mm_init(), which is supposed to init/nullify mm's internals, but
it doesn't init all the fields it should:
- on fork ->mmap,mm_rb,vmacache_seqnum,map_count,mm_cpumask,locked_vm
are zeroed in dup_mmap();
- on fork ->pmd_huge_pte is zeroed in dup_mm(), immediately before
calling mm_init();
- ->cpu_vm_mask_var ptr is initialized by mm_init_cpumask(), which is
called before mm_init() on both fork and exec;
- ->context is initialized by init_new_context(), which is called after
mm_init() on both fork and exec;
Let's consolidate all the initializations in mm_init() to make the code
look cleaner.
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Christoph Lameter <cl@linux.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-08-08 21:21:56 +00:00
|
|
|
cpumask_clear(mm->cpu_vm_mask_var);
|
2011-05-29 18:32:28 +00:00
|
|
|
}
|
|
|
|
|
2009-03-12 20:35:44 +00:00
|
|
|
/* Future-safe accessor for struct mm_struct's cpu_vm_mask. */
|
2011-05-25 00:12:15 +00:00
|
|
|
static inline cpumask_t *mm_cpumask(struct mm_struct *mm)
|
|
|
|
{
|
|
|
|
return mm->cpu_vm_mask_var;
|
|
|
|
}
|
2009-03-12 20:35:44 +00:00
|
|
|
|
mm: fix TLB flush race between migration, and change_protection_range
There are a few subtle races, between change_protection_range (used by
mprotect and change_prot_numa) on one side, and NUMA page migration and
compaction on the other side.
The basic race is that there is a time window between when the PTE gets
made non-present (PROT_NONE or NUMA), and the TLB is flushed.
During that time, a CPU may continue writing to the page.
This is fine most of the time, however compaction or the NUMA migration
code may come in, and migrate the page away.
When that happens, the CPU may continue writing, through the cached
translation, to what is no longer the current memory location of the
process.
This only affects x86, which has a somewhat optimistic pte_accessible.
All other architectures appear to be safe, and will either always flush,
or flush whenever there is a valid mapping, even with no permissions
(SPARC).
The basic race looks like this:
CPU A CPU B CPU C
load TLB entry
make entry PTE/PMD_NUMA
fault on entry
read/write old page
start migrating page
change PTE/PMD to new page
read/write old page [*]
flush TLB
reload TLB from new entry
read/write new page
lose data
[*] the old page may belong to a new user at this point!
The obvious fix is to flush remote TLB entries, by making sure that
pte_accessible is aware of the fact that PROT_NONE and PROT_NUMA memory may
still be accessible if there is a TLB flush pending for the mm.
This should fix both NUMA migration and compaction.
[mgorman@suse.de: fix build]
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: Alex Thorlton <athorlton@sgi.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-12-19 01:08:44 +00:00
|
|
|
#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
|
|
|
|
/*
|
|
|
|
* Memory barriers to keep this state in sync are graciously provided by
|
|
|
|
* the page table locks, outside of which no page table modifications happen.
|
|
|
|
* The barriers below prevent the compiler from re-ordering the instructions
|
|
|
|
* around the memory barriers that are already present in the code.
|
|
|
|
*/
|
|
|
|
static inline bool mm_tlb_flush_pending(struct mm_struct *mm)
|
|
|
|
{
|
|
|
|
barrier();
|
|
|
|
return mm->tlb_flush_pending;
|
|
|
|
}
|
|
|
|
static inline void set_tlb_flush_pending(struct mm_struct *mm)
|
|
|
|
{
|
|
|
|
mm->tlb_flush_pending = true;
|
2013-12-19 01:08:45 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Guarantee that the tlb_flush_pending store does not leak into the
|
|
|
|
* critical section updating the page tables
|
|
|
|
*/
|
|
|
|
smp_mb__before_spinlock();
|
mm: fix TLB flush race between migration, and change_protection_range
There are a few subtle races, between change_protection_range (used by
mprotect and change_prot_numa) on one side, and NUMA page migration and
compaction on the other side.
The basic race is that there is a time window between when the PTE gets
made non-present (PROT_NONE or NUMA), and the TLB is flushed.
During that time, a CPU may continue writing to the page.
This is fine most of the time, however compaction or the NUMA migration
code may come in, and migrate the page away.
When that happens, the CPU may continue writing, through the cached
translation, to what is no longer the current memory location of the
process.
This only affects x86, which has a somewhat optimistic pte_accessible.
All other architectures appear to be safe, and will either always flush,
or flush whenever there is a valid mapping, even with no permissions
(SPARC).
The basic race looks like this:
CPU A CPU B CPU C
load TLB entry
make entry PTE/PMD_NUMA
fault on entry
read/write old page
start migrating page
change PTE/PMD to new page
read/write old page [*]
flush TLB
reload TLB from new entry
read/write new page
lose data
[*] the old page may belong to a new user at this point!
The obvious fix is to flush remote TLB entries, by making sure that
pte_accessible is aware of the fact that PROT_NONE and PROT_NUMA memory may
still be accessible if there is a TLB flush pending for the mm.
This should fix both NUMA migration and compaction.
[mgorman@suse.de: fix build]
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: Alex Thorlton <athorlton@sgi.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-12-19 01:08:44 +00:00
|
|
|
}
|
|
|
|
/* Clearing is done after a TLB flush, which also provides a barrier. */
|
|
|
|
static inline void clear_tlb_flush_pending(struct mm_struct *mm)
|
|
|
|
{
|
|
|
|
barrier();
|
|
|
|
mm->tlb_flush_pending = false;
|
|
|
|
}
|
|
|
|
#else
|
|
|
|
static inline bool mm_tlb_flush_pending(struct mm_struct *mm)
|
|
|
|
{
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
static inline void set_tlb_flush_pending(struct mm_struct *mm)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
static inline void clear_tlb_flush_pending(struct mm_struct *mm)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2015-12-30 04:12:19 +00:00
|
|
|
struct vm_fault;
|
|
|
|
|
|
|
|
struct vm_special_mapping {
|
|
|
|
const char *name; /* The name, e.g. "[vdso]". */
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If .fault is not provided, this points to a
|
|
|
|
* NULL-terminated array of pages that back the special mapping.
|
|
|
|
*
|
|
|
|
* This must not be NULL unless .fault is provided.
|
|
|
|
*/
|
2014-05-19 22:58:33 +00:00
|
|
|
struct page **pages;
|
2015-12-30 04:12:19 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If non-NULL, then this is called to resolve page faults
|
|
|
|
* on the special mapping. If used, .pages is not checked.
|
|
|
|
*/
|
|
|
|
int (*fault)(const struct vm_special_mapping *sm,
|
|
|
|
struct vm_area_struct *vma,
|
|
|
|
struct vm_fault *vmf);
|
2016-06-28 11:35:38 +00:00
|
|
|
|
|
|
|
int (*mremap)(const struct vm_special_mapping *sm,
|
|
|
|
struct vm_area_struct *new_vma);
|
2014-05-19 22:58:33 +00:00
|
|
|
};
|
|
|
|
|
2014-07-31 15:40:59 +00:00
|
|
|
enum tlb_flush_reason {
|
|
|
|
TLB_FLUSH_ON_TASK_SWITCH,
|
|
|
|
TLB_REMOTE_SHOOTDOWN,
|
|
|
|
TLB_LOCAL_SHOOTDOWN,
|
|
|
|
TLB_LOCAL_MM_SHOOTDOWN,
|
2015-09-04 22:47:29 +00:00
|
|
|
TLB_REMOTE_SEND_IPI,
|
2014-07-31 15:40:59 +00:00
|
|
|
NR_TLB_FLUSH_REASONS,
|
|
|
|
};
|
|
|
|
|
2014-12-13 00:55:35 +00:00
|
|
|
/*
|
|
|
|
* A swap entry has to fit into a "unsigned long", as the entry is hidden
|
|
|
|
* in the "index" field of the swapper address space.
|
|
|
|
*/
|
|
|
|
typedef struct {
|
|
|
|
unsigned long val;
|
|
|
|
} swp_entry_t;
|
|
|
|
|
2006-09-27 08:50:01 +00:00
|
|
|
#endif /* _LINUX_MM_TYPES_H */
|