2019-05-19 12:08:55 +00:00
|
|
|
// SPDX-License-Identifier: GPL-2.0-only
|
2008-03-03 17:12:54 +00:00
|
|
|
#include <linux/init.h>
|
|
|
|
|
|
|
|
#include <linux/mm.h>
|
|
|
|
#include <linux/spinlock.h>
|
|
|
|
#include <linux/smp.h>
|
|
|
|
#include <linux/interrupt.h>
|
2016-07-14 00:18:55 +00:00
|
|
|
#include <linux/export.h>
|
x86: Spread tlb flush vector between nodes
Currently flush tlb vector allocation is based on below equation:
sender = smp_processor_id() % 8
This isn't optimal, CPUs from different node can have the same vector, this
causes a lot of lock contention. Instead, we can assign the same vectors to
CPUs from the same node, while different node has different vectors. This has
below advantages:
a. if there is lock contention, the lock contention is between CPUs from one
node. This should be much cheaper than the contention between nodes.
b. completely avoid lock contention between nodes. This especially benefits
kswapd, which is the biggest user of tlb flush, since kswapd sets its affinity
to specific node.
In my test, this could reduce > 20% CPU overhead in extreme case.The test
machine has 4 nodes and each node has 16 CPUs. I then bind each node's kswapd
to the first CPU of the node. I run a workload with 4 sequential mmap file
read thread. The files are empty sparse file. This workload will trigger a
lot of page reclaim and tlbflush. The kswapd bind is to easy trigger the
extreme tlb flush lock contention because otherwise kswapd keeps migrating
between CPUs of a node and I can't get stable result. Sure in real workload,
we can't always see so big tlb flush lock contention, but it's possible.
[ hpa: folded in fix from Eric Dumazet to use this_cpu_read() ]
Signed-off-by: Shaohua Li <shaohua.li@intel.com>
LKML-Reference: <1287544023.4571.8.camel@sli10-conroe.sh.intel.com>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
2010-10-20 03:07:03 +00:00
|
|
|
#include <linux/cpu.h>
|
2018-01-29 22:04:47 +00:00
|
|
|
#include <linux/debugfs.h>
|
2008-03-03 17:12:54 +00:00
|
|
|
|
|
|
|
#include <asm/tlbflush.h>
|
|
|
|
#include <asm/mmu_context.h>
|
2018-01-29 22:04:47 +00:00
|
|
|
#include <asm/nospec-branch.h>
|
2009-11-13 11:54:40 +00:00
|
|
|
#include <asm/cache.h>
|
2009-01-21 08:26:06 +00:00
|
|
|
#include <asm/apic.h>
|
2009-01-21 08:26:06 +00:00
|
|
|
#include <asm/uv/uv.h>
|
2008-03-25 16:28:56 +00:00
|
|
|
|
2018-12-03 17:03:49 +00:00
|
|
|
#include "mm_internal.h"
|
|
|
|
|
2020-04-21 09:20:32 +00:00
|
|
|
#ifdef CONFIG_PARAVIRT
|
|
|
|
# define STATIC_NOPV
|
|
|
|
#else
|
|
|
|
# define STATIC_NOPV static
|
|
|
|
# define __flush_tlb_local native_flush_tlb_local
|
2020-04-21 09:20:33 +00:00
|
|
|
# define __flush_tlb_global native_flush_tlb_global
|
2020-04-21 09:20:34 +00:00
|
|
|
# define __flush_tlb_one_user(addr) native_flush_tlb_one_user(addr)
|
2020-04-21 09:20:36 +00:00
|
|
|
# define __flush_tlb_others(msk, info) native_flush_tlb_others(msk, info)
|
2020-04-21 09:20:32 +00:00
|
|
|
#endif
|
|
|
|
|
2008-03-03 17:12:54 +00:00
|
|
|
/*
|
2017-05-28 17:00:14 +00:00
|
|
|
* TLB flushing, formerly SMP-only
|
2008-03-03 17:12:54 +00:00
|
|
|
* c/o Linus Torvalds.
|
|
|
|
*
|
|
|
|
* These mean you can really definitely utterly forget about
|
|
|
|
* writing to user space from interrupts. (Its not allowed anyway).
|
|
|
|
*
|
|
|
|
* Optimizations Manfred Spraul <manfred@colorfullife.com>
|
|
|
|
*
|
|
|
|
* More scalable flush, from Andi Kleen
|
|
|
|
*
|
2012-06-28 01:02:23 +00:00
|
|
|
* Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
|
2008-03-03 17:12:54 +00:00
|
|
|
*/
|
|
|
|
|
2018-11-25 18:33:49 +00:00
|
|
|
/*
|
|
|
|
* Use bit 0 to mangle the TIF_SPEC_IB state into the mm pointer which is
|
|
|
|
* stored in cpu_tlb_state.last_user_mm_ibpb.
|
|
|
|
*/
|
|
|
|
#define LAST_USER_MM_IBPB 0x1UL
|
|
|
|
|
2020-04-21 09:20:41 +00:00
|
|
|
/*
|
|
|
|
* The x86 feature is called PCID (Process Context IDentifier). It is similar
|
|
|
|
* to what is traditionally called ASID on the RISC processors.
|
|
|
|
*
|
|
|
|
* We don't use the traditional ASID implementation, where each process/mm gets
|
|
|
|
* its own ASID and flush/restart when we run out of ASID space.
|
|
|
|
*
|
|
|
|
* Instead we have a small per-cpu array of ASIDs and cache the last few mm's
|
|
|
|
* that came by on this CPU, allowing cheaper switch_mm between processes on
|
|
|
|
* this CPU.
|
|
|
|
*
|
|
|
|
* We end up with different spaces for different things. To avoid confusion we
|
|
|
|
* use different names for each of them:
|
|
|
|
*
|
|
|
|
* ASID - [0, TLB_NR_DYN_ASIDS-1]
|
|
|
|
* the canonical identifier for an mm
|
|
|
|
*
|
|
|
|
* kPCID - [1, TLB_NR_DYN_ASIDS]
|
|
|
|
* the value we write into the PCID part of CR3; corresponds to the
|
|
|
|
* ASID+1, because PCID 0 is special.
|
|
|
|
*
|
|
|
|
* uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS]
|
|
|
|
* for KPTI each mm has two address spaces and thus needs two
|
|
|
|
* PCID values, but we can still do with a single ASID denomination
|
|
|
|
* for each mm. Corresponds to kPCID + 2048.
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* There are 12 bits of space for ASIDS in CR3 */
|
|
|
|
#define CR3_HW_ASID_BITS 12
|
|
|
|
|
|
|
|
/*
|
|
|
|
* When enabled, PAGE_TABLE_ISOLATION consumes a single bit for
|
|
|
|
* user/kernel switches
|
|
|
|
*/
|
|
|
|
#ifdef CONFIG_PAGE_TABLE_ISOLATION
|
|
|
|
# define PTI_CONSUMED_PCID_BITS 1
|
|
|
|
#else
|
|
|
|
# define PTI_CONSUMED_PCID_BITS 0
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#define CR3_AVAIL_PCID_BITS (X86_CR3_PCID_BITS - PTI_CONSUMED_PCID_BITS)
|
|
|
|
|
|
|
|
/*
|
|
|
|
* ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid. -1 below to account
|
|
|
|
* for them being zero-based. Another -1 is because PCID 0 is reserved for
|
|
|
|
* use by non-PCID-aware users.
|
|
|
|
*/
|
|
|
|
#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2)
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Given @asid, compute kPCID
|
|
|
|
*/
|
|
|
|
static inline u16 kern_pcid(u16 asid)
|
|
|
|
{
|
|
|
|
VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
|
|
|
|
|
|
|
|
#ifdef CONFIG_PAGE_TABLE_ISOLATION
|
|
|
|
/*
|
|
|
|
* Make sure that the dynamic ASID space does not confict with the
|
|
|
|
* bit we are using to switch between user and kernel ASIDs.
|
|
|
|
*/
|
|
|
|
BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_PCID_USER_BIT));
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The ASID being passed in here should have respected the
|
|
|
|
* MAX_ASID_AVAILABLE and thus never have the switch bit set.
|
|
|
|
*/
|
|
|
|
VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_PCID_USER_BIT));
|
|
|
|
#endif
|
|
|
|
/*
|
|
|
|
* The dynamically-assigned ASIDs that get passed in are small
|
|
|
|
* (<TLB_NR_DYN_ASIDS). They never have the high switch bit set,
|
|
|
|
* so do not bother to clear it.
|
|
|
|
*
|
|
|
|
* If PCID is on, ASID-aware code paths put the ASID+1 into the
|
|
|
|
* PCID bits. This serves two purposes. It prevents a nasty
|
|
|
|
* situation in which PCID-unaware code saves CR3, loads some other
|
|
|
|
* value (with PCID == 0), and then restores CR3, thus corrupting
|
|
|
|
* the TLB for ASID 0 if the saved ASID was nonzero. It also means
|
|
|
|
* that any bugs involving loading a PCID-enabled CR3 with
|
|
|
|
* CR4.PCIDE off will trigger deterministically.
|
|
|
|
*/
|
|
|
|
return asid + 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Given @asid, compute uPCID
|
|
|
|
*/
|
|
|
|
static inline u16 user_pcid(u16 asid)
|
|
|
|
{
|
|
|
|
u16 ret = kern_pcid(asid);
|
|
|
|
#ifdef CONFIG_PAGE_TABLE_ISOLATION
|
|
|
|
ret |= 1 << X86_CR3_PTI_PCID_USER_BIT;
|
|
|
|
#endif
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline unsigned long build_cr3(pgd_t *pgd, u16 asid)
|
|
|
|
{
|
|
|
|
if (static_cpu_has(X86_FEATURE_PCID)) {
|
|
|
|
return __sme_pa(pgd) | kern_pcid(asid);
|
|
|
|
} else {
|
|
|
|
VM_WARN_ON_ONCE(asid != 0);
|
|
|
|
return __sme_pa(pgd);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
|
|
|
|
{
|
|
|
|
VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
|
|
|
|
/*
|
|
|
|
* Use boot_cpu_has() instead of this_cpu_has() as this function
|
|
|
|
* might be called during early boot. This should work even after
|
|
|
|
* boot because all CPU's the have same capabilities:
|
|
|
|
*/
|
|
|
|
VM_WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_PCID));
|
|
|
|
return __sme_pa(pgd) | kern_pcid(asid) | CR3_NOFLUSH;
|
|
|
|
}
|
|
|
|
|
2017-12-04 14:07:57 +00:00
|
|
|
/*
|
|
|
|
* We get here when we do something requiring a TLB invalidation
|
|
|
|
* but could not go invalidate all of the contexts. We do the
|
|
|
|
* necessary invalidation by clearing out the 'ctx_id' which
|
|
|
|
* forces a TLB flush when the context is loaded.
|
|
|
|
*/
|
2018-07-21 07:55:32 +00:00
|
|
|
static void clear_asid_other(void)
|
2017-12-04 14:07:57 +00:00
|
|
|
{
|
|
|
|
u16 asid;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This is only expected to be set if we have disabled
|
|
|
|
* kernel _PAGE_GLOBAL pages.
|
|
|
|
*/
|
|
|
|
if (!static_cpu_has(X86_FEATURE_PTI)) {
|
|
|
|
WARN_ON_ONCE(1);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
|
|
|
|
/* Do not need to flush the current asid */
|
|
|
|
if (asid == this_cpu_read(cpu_tlbstate.loaded_mm_asid))
|
|
|
|
continue;
|
|
|
|
/*
|
|
|
|
* Make sure the next time we go to switch to
|
|
|
|
* this asid, we do a flush:
|
|
|
|
*/
|
|
|
|
this_cpu_write(cpu_tlbstate.ctxs[asid].ctx_id, 0);
|
|
|
|
}
|
|
|
|
this_cpu_write(cpu_tlbstate.invalidate_other, false);
|
|
|
|
}
|
|
|
|
|
2017-06-29 15:53:15 +00:00
|
|
|
atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);
|
|
|
|
|
x86/mm: Flush more aggressively in lazy TLB mode
Since commit:
94b1b03b519b ("x86/mm: Rework lazy TLB mode and TLB freshness tracking")
x86's lazy TLB mode has been all the way lazy: when running a kernel thread
(including the idle thread), the kernel keeps using the last user mm's
page tables without attempting to maintain user TLB coherence at all.
From a pure semantic perspective, this is fine -- kernel threads won't
attempt to access user pages, so having stale TLB entries doesn't matter.
Unfortunately, I forgot about a subtlety. By skipping TLB flushes,
we also allow any paging-structure caches that may exist on the CPU
to become incoherent. This means that we can have a
paging-structure cache entry that references a freed page table, and
the CPU is within its rights to do a speculative page walk starting
at the freed page table.
I can imagine this causing two different problems:
- A speculative page walk starting from a bogus page table could read
IO addresses. I haven't seen any reports of this causing problems.
- A speculative page walk that involves a bogus page table can install
garbage in the TLB. Such garbage would always be at a user VA, but
some AMD CPUs have logic that triggers a machine check when it notices
these bogus entries. I've seen a couple reports of this.
Boris further explains the failure mode:
> It is actually more of an optimization which assumes that paging-structure
> entries are in WB DRAM:
>
> "TlbCacheDis: cacheable memory disable. Read-write. 0=Enables
> performance optimization that assumes PML4, PDP, PDE, and PTE entries
> are in cacheable WB-DRAM; memory type checks may be bypassed, and
> addresses outside of WB-DRAM may result in undefined behavior or NB
> protocol errors. 1=Disables performance optimization and allows PML4,
> PDP, PDE and PTE entries to be in any memory type. Operating systems
> that maintain page tables in memory types other than WB- DRAM must set
> TlbCacheDis to insure proper operation."
>
> The MCE generated is an NB protocol error to signal that
>
> "Link: A specific coherent-only packet from a CPU was issued to an
> IO link. This may be caused by software which addresses page table
> structures in a memory type other than cacheable WB-DRAM without
> properly configuring MSRC001_0015[TlbCacheDis]. This may occur, for
> example, when page table structure addresses are above top of memory. In
> such cases, the NB will generate an MCE if it sees a mismatch between
> the memory operation generated by the core and the link type."
>
> I'm assuming coherent-only packets don't go out on IO links, thus the
> error.
To fix this, reinstate TLB coherence in lazy mode. With this patch
applied, we do it in one of two ways:
- If we have PCID, we simply switch back to init_mm's page tables
when we enter a kernel thread -- this seems to be quite cheap
except for the cost of serializing the CPU.
- If we don't have PCID, then we set a flag and switch to init_mm
the first time we would otherwise need to flush the TLB.
The /sys/kernel/debug/x86/tlb_use_lazy_mode debug switch can be changed
to override the default mode for benchmarking.
In theory, we could optimize this better by only flushing the TLB in
lazy CPUs when a page table is freed. Doing that would require
auditing the mm code to make sure that all page table freeing goes
through tlb_remove_page() as well as reworking some data structures
to implement the improved flush logic.
Reported-by: Markus Trippelsdorf <markus@trippelsdorf.de>
Reported-by: Adam Borowski <kilobyte@angband.pl>
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Eric Biggers <ebiggers@google.com>
Cc: Johannes Hirte <johannes.hirte@datenkhaos.de>
Cc: Kees Cook <keescook@chromium.org>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Roman Kagan <rkagan@virtuozzo.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: 94b1b03b519b ("x86/mm: Rework lazy TLB mode and TLB freshness tracking")
Link: http://lkml.kernel.org/r/20171009170231.fkpraqokz6e4zeco@pd.tnic
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-10-09 16:50:49 +00:00
|
|
|
|
x86/mm: Implement PCID based optimization: try to preserve old TLB entries using PCID
PCID is a "process context ID" -- it's what other architectures call
an address space ID. Every non-global TLB entry is tagged with a
PCID, only TLB entries that match the currently selected PCID are
used, and we can switch PGDs without flushing the TLB. x86's
PCID is 12 bits.
This is an unorthodox approach to using PCID. x86's PCID is far too
short to uniquely identify a process, and we can't even really
uniquely identify a running process because there are monster
systems with over 4096 CPUs. To make matters worse, past attempts
to use all 12 PCID bits have resulted in slowdowns instead of
speedups.
This patch uses PCID differently. We use a PCID to identify a
recently-used mm on a per-cpu basis. An mm has no fixed PCID
binding at all; instead, we give it a fresh PCID each time it's
loaded except in cases where we want to preserve the TLB, in which
case we reuse a recent value.
Here are some benchmark results, done on a Skylake laptop at 2.3 GHz
(turbo off, intel_pstate requesting max performance) under KVM with
the guest using idle=poll (to avoid artifacts when bouncing between
CPUs). I haven't done any real statistics here -- I just ran them
in a loop and picked the fastest results that didn't look like
outliers. Unpatched means commit a4eb8b993554, so all the
bookkeeping overhead is gone.
ping-pong between two mms on the same CPU using eventfd:
patched: 1.22µs
patched, nopcid: 1.33µs
unpatched: 1.34µs
Same ping-pong, but now touch 512 pages (all zero-page to minimize
cache misses) each iteration. dTLB misses are measured by
dtlb_load_misses.miss_causes_a_walk:
patched: 1.8µs 11M dTLB misses
patched, nopcid: 6.2µs, 207M dTLB misses
unpatched: 6.1µs, 190M dTLB misses
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/9ee75f17a81770feed616358e6860d98a2a5b1e7.1500957502.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-07-25 04:41:38 +00:00
|
|
|
static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
|
|
|
|
u16 *new_asid, bool *need_flush)
|
|
|
|
{
|
|
|
|
u16 asid;
|
|
|
|
|
|
|
|
if (!static_cpu_has(X86_FEATURE_PCID)) {
|
|
|
|
*new_asid = 0;
|
|
|
|
*need_flush = true;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2017-12-04 14:07:57 +00:00
|
|
|
if (this_cpu_read(cpu_tlbstate.invalidate_other))
|
|
|
|
clear_asid_other();
|
|
|
|
|
x86/mm: Implement PCID based optimization: try to preserve old TLB entries using PCID
PCID is a "process context ID" -- it's what other architectures call
an address space ID. Every non-global TLB entry is tagged with a
PCID, only TLB entries that match the currently selected PCID are
used, and we can switch PGDs without flushing the TLB. x86's
PCID is 12 bits.
This is an unorthodox approach to using PCID. x86's PCID is far too
short to uniquely identify a process, and we can't even really
uniquely identify a running process because there are monster
systems with over 4096 CPUs. To make matters worse, past attempts
to use all 12 PCID bits have resulted in slowdowns instead of
speedups.
This patch uses PCID differently. We use a PCID to identify a
recently-used mm on a per-cpu basis. An mm has no fixed PCID
binding at all; instead, we give it a fresh PCID each time it's
loaded except in cases where we want to preserve the TLB, in which
case we reuse a recent value.
Here are some benchmark results, done on a Skylake laptop at 2.3 GHz
(turbo off, intel_pstate requesting max performance) under KVM with
the guest using idle=poll (to avoid artifacts when bouncing between
CPUs). I haven't done any real statistics here -- I just ran them
in a loop and picked the fastest results that didn't look like
outliers. Unpatched means commit a4eb8b993554, so all the
bookkeeping overhead is gone.
ping-pong between two mms on the same CPU using eventfd:
patched: 1.22µs
patched, nopcid: 1.33µs
unpatched: 1.34µs
Same ping-pong, but now touch 512 pages (all zero-page to minimize
cache misses) each iteration. dTLB misses are measured by
dtlb_load_misses.miss_causes_a_walk:
patched: 1.8µs 11M dTLB misses
patched, nopcid: 6.2µs, 207M dTLB misses
unpatched: 6.1µs, 190M dTLB misses
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/9ee75f17a81770feed616358e6860d98a2a5b1e7.1500957502.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-07-25 04:41:38 +00:00
|
|
|
for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
|
|
|
|
if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) !=
|
|
|
|
next->context.ctx_id)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
*new_asid = asid;
|
|
|
|
*need_flush = (this_cpu_read(cpu_tlbstate.ctxs[asid].tlb_gen) <
|
|
|
|
next_tlb_gen);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We don't currently own an ASID slot on this CPU.
|
|
|
|
* Allocate a slot.
|
|
|
|
*/
|
|
|
|
*new_asid = this_cpu_add_return(cpu_tlbstate.next_asid, 1) - 1;
|
|
|
|
if (*new_asid >= TLB_NR_DYN_ASIDS) {
|
|
|
|
*new_asid = 0;
|
|
|
|
this_cpu_write(cpu_tlbstate.next_asid, 1);
|
|
|
|
}
|
|
|
|
*need_flush = true;
|
|
|
|
}
|
|
|
|
|
2020-04-21 09:20:34 +00:00
|
|
|
/*
|
|
|
|
* Given an ASID, flush the corresponding user ASID. We can delay this
|
|
|
|
* until the next time we switch to it.
|
|
|
|
*
|
|
|
|
* See SWITCH_TO_USER_CR3.
|
|
|
|
*/
|
|
|
|
static inline void invalidate_user_asid(u16 asid)
|
|
|
|
{
|
|
|
|
/* There is no user ASID if address space separation is off */
|
|
|
|
if (!IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
|
|
|
|
return;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We only have a single ASID if PCID is off and the CR3
|
|
|
|
* write will have flushed it.
|
|
|
|
*/
|
|
|
|
if (!cpu_feature_enabled(X86_FEATURE_PCID))
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (!static_cpu_has(X86_FEATURE_PTI))
|
|
|
|
return;
|
|
|
|
|
|
|
|
__set_bit(kern_pcid(asid),
|
|
|
|
(unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask));
|
|
|
|
}
|
|
|
|
|
2017-12-04 14:07:58 +00:00
|
|
|
static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush)
|
|
|
|
{
|
|
|
|
unsigned long new_mm_cr3;
|
|
|
|
|
|
|
|
if (need_flush) {
|
2017-12-04 14:07:59 +00:00
|
|
|
invalidate_user_asid(new_asid);
|
2017-12-04 14:07:58 +00:00
|
|
|
new_mm_cr3 = build_cr3(pgdir, new_asid);
|
|
|
|
} else {
|
|
|
|
new_mm_cr3 = build_cr3_noflush(pgdir, new_asid);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Caution: many callers of this function expect
|
|
|
|
* that load_cr3() is serializing and orders TLB
|
|
|
|
* fills with respect to the mm_cpumask writes.
|
|
|
|
*/
|
|
|
|
write_cr3(new_mm_cr3);
|
|
|
|
}
|
|
|
|
|
2008-03-03 17:12:54 +00:00
|
|
|
void leave_mm(int cpu)
|
|
|
|
{
|
x86/mm: Rework lazy TLB to track the actual loaded mm
Lazy TLB state is currently managed in a rather baroque manner.
AFAICT, there are three possible states:
- Non-lazy. This means that we're running a user thread or a
kernel thread that has called use_mm(). current->mm ==
current->active_mm == cpu_tlbstate.active_mm and
cpu_tlbstate.state == TLBSTATE_OK.
- Lazy with user mm. We're running a kernel thread without an mm
and we're borrowing an mm_struct. We have current->mm == NULL,
current->active_mm == cpu_tlbstate.active_mm, cpu_tlbstate.state
!= TLBSTATE_OK (i.e. TLBSTATE_LAZY or 0). The current cpu is set
in mm_cpumask(current->active_mm). CR3 points to
current->active_mm->pgd. The TLB is up to date.
- Lazy with init_mm. This happens when we call leave_mm(). We
have current->mm == NULL, current->active_mm ==
cpu_tlbstate.active_mm, but that mm is only relelvant insofar as
the scheduler is tracking it for refcounting. cpu_tlbstate.state
!= TLBSTATE_OK. The current cpu is clear in
mm_cpumask(current->active_mm). CR3 points to swapper_pg_dir,
i.e. init_mm->pgd.
This patch simplifies the situation. Other than perf, x86 stops
caring about current->active_mm at all. We have
cpu_tlbstate.loaded_mm pointing to the mm that CR3 references. The
TLB is always up to date for that mm. leave_mm() just switches us
to init_mm. There are no longer any special cases for mm_cpumask,
and switch_mm() switches mms without worrying about laziness.
After this patch, cpu_tlbstate.state serves only to tell the TLB
flush code whether it may switch to init_mm instead of doing a
normal flush.
This makes fairly extensive changes to xen_exit_mmap(), which used
to look a bit like black magic.
Perf is unchanged. With or without this change, perf may behave a bit
erratically if it tries to read user memory in kernel thread context.
We should build on this patch to teach perf to never look at user
memory when cpu_tlbstate.loaded_mm != current->mm.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Nadav Amit <namit@vmware.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-05-28 17:00:15 +00:00
|
|
|
struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* It's plausible that we're in lazy TLB mode while our mm is init_mm.
|
|
|
|
* If so, our callers still expect us to flush the TLB, but there
|
|
|
|
* aren't any user TLB entries in init_mm to worry about.
|
|
|
|
*
|
|
|
|
* This needs to happen before any other sanity checks due to
|
|
|
|
* intel_idle's shenanigans.
|
|
|
|
*/
|
|
|
|
if (loaded_mm == &init_mm)
|
|
|
|
return;
|
|
|
|
|
x86/mm: Rework lazy TLB mode and TLB freshness tracking
x86's lazy TLB mode used to be fairly weak -- it would switch to
init_mm the first time it tried to flush a lazy TLB. This meant an
unnecessary CR3 write and, if the flush was remote, an unnecessary
IPI.
Rewrite it entirely. When we enter lazy mode, we simply remove the
CPU from mm_cpumask. This means that we need a way to figure out
whether we've missed a flush when we switch back out of lazy mode.
I use the tlb_gen machinery to track whether a context is up to
date.
Note to reviewers: this patch, my itself, looks a bit odd. I'm
using an array of length 1 containing (ctx_id, tlb_gen) rather than
just storing tlb_gen, and making it at array isn't necessary yet.
I'm doing this because the next few patches add PCID support, and,
with PCID, we need ctx_id, and the array will end up with a length
greater than 1. Making it an array now means that there will be
less churn and therefore less stress on your eyeballs.
NB: This is dubious but, AFAICT, still correct on Xen and UV.
xen_exit_mmap() uses mm_cpumask() for nefarious purposes and this
patch changes the way that mm_cpumask() works. This should be okay,
since Xen *also* iterates all online CPUs to find all the CPUs it
needs to twiddle.
The UV tlbflush code is rather dated and should be changed.
Here are some benchmark results, done on a Skylake laptop at 2.3 GHz
(turbo off, intel_pstate requesting max performance) under KVM with
the guest using idle=poll (to avoid artifacts when bouncing between
CPUs). I haven't done any real statistics here -- I just ran them
in a loop and picked the fastest results that didn't look like
outliers. Unpatched means commit a4eb8b993554, so all the
bookkeeping overhead is gone.
MADV_DONTNEED; touch the page; switch CPUs using sched_setaffinity. In
an unpatched kernel, MADV_DONTNEED will send an IPI to the previous CPU.
This is intended to be a nearly worst-case test.
patched: 13.4µs
unpatched: 21.6µs
Vitaly's pthread_mmap microbenchmark with 8 threads (on four cores),
nrounds = 100, 256M data
patched: 1.1 seconds or so
unpatched: 1.9 seconds or so
The sleepup on Vitaly's test appearss to be because it spends a lot
of time blocked on mmap_sem, and this patch avoids sending IPIs to
blocked CPUs.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andrew Banman <abanman@sgi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dimitri Sivanich <sivanich@sgi.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Mike Travis <travis@sgi.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/ddf2c92962339f4ba39d8fc41b853936ec0b44f1.1498751203.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-06-29 15:53:17 +00:00
|
|
|
/* Warn if we're not lazy. */
|
x86/mm: Flush more aggressively in lazy TLB mode
Since commit:
94b1b03b519b ("x86/mm: Rework lazy TLB mode and TLB freshness tracking")
x86's lazy TLB mode has been all the way lazy: when running a kernel thread
(including the idle thread), the kernel keeps using the last user mm's
page tables without attempting to maintain user TLB coherence at all.
From a pure semantic perspective, this is fine -- kernel threads won't
attempt to access user pages, so having stale TLB entries doesn't matter.
Unfortunately, I forgot about a subtlety. By skipping TLB flushes,
we also allow any paging-structure caches that may exist on the CPU
to become incoherent. This means that we can have a
paging-structure cache entry that references a freed page table, and
the CPU is within its rights to do a speculative page walk starting
at the freed page table.
I can imagine this causing two different problems:
- A speculative page walk starting from a bogus page table could read
IO addresses. I haven't seen any reports of this causing problems.
- A speculative page walk that involves a bogus page table can install
garbage in the TLB. Such garbage would always be at a user VA, but
some AMD CPUs have logic that triggers a machine check when it notices
these bogus entries. I've seen a couple reports of this.
Boris further explains the failure mode:
> It is actually more of an optimization which assumes that paging-structure
> entries are in WB DRAM:
>
> "TlbCacheDis: cacheable memory disable. Read-write. 0=Enables
> performance optimization that assumes PML4, PDP, PDE, and PTE entries
> are in cacheable WB-DRAM; memory type checks may be bypassed, and
> addresses outside of WB-DRAM may result in undefined behavior or NB
> protocol errors. 1=Disables performance optimization and allows PML4,
> PDP, PDE and PTE entries to be in any memory type. Operating systems
> that maintain page tables in memory types other than WB- DRAM must set
> TlbCacheDis to insure proper operation."
>
> The MCE generated is an NB protocol error to signal that
>
> "Link: A specific coherent-only packet from a CPU was issued to an
> IO link. This may be caused by software which addresses page table
> structures in a memory type other than cacheable WB-DRAM without
> properly configuring MSRC001_0015[TlbCacheDis]. This may occur, for
> example, when page table structure addresses are above top of memory. In
> such cases, the NB will generate an MCE if it sees a mismatch between
> the memory operation generated by the core and the link type."
>
> I'm assuming coherent-only packets don't go out on IO links, thus the
> error.
To fix this, reinstate TLB coherence in lazy mode. With this patch
applied, we do it in one of two ways:
- If we have PCID, we simply switch back to init_mm's page tables
when we enter a kernel thread -- this seems to be quite cheap
except for the cost of serializing the CPU.
- If we don't have PCID, then we set a flag and switch to init_mm
the first time we would otherwise need to flush the TLB.
The /sys/kernel/debug/x86/tlb_use_lazy_mode debug switch can be changed
to override the default mode for benchmarking.
In theory, we could optimize this better by only flushing the TLB in
lazy CPUs when a page table is freed. Doing that would require
auditing the mm code to make sure that all page table freeing goes
through tlb_remove_page() as well as reworking some data structures
to implement the improved flush logic.
Reported-by: Markus Trippelsdorf <markus@trippelsdorf.de>
Reported-by: Adam Borowski <kilobyte@angband.pl>
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Eric Biggers <ebiggers@google.com>
Cc: Johannes Hirte <johannes.hirte@datenkhaos.de>
Cc: Kees Cook <keescook@chromium.org>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Roman Kagan <rkagan@virtuozzo.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: 94b1b03b519b ("x86/mm: Rework lazy TLB mode and TLB freshness tracking")
Link: http://lkml.kernel.org/r/20171009170231.fkpraqokz6e4zeco@pd.tnic
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-10-09 16:50:49 +00:00
|
|
|
WARN_ON(!this_cpu_read(cpu_tlbstate.is_lazy));
|
x86/mm: Rework lazy TLB to track the actual loaded mm
Lazy TLB state is currently managed in a rather baroque manner.
AFAICT, there are three possible states:
- Non-lazy. This means that we're running a user thread or a
kernel thread that has called use_mm(). current->mm ==
current->active_mm == cpu_tlbstate.active_mm and
cpu_tlbstate.state == TLBSTATE_OK.
- Lazy with user mm. We're running a kernel thread without an mm
and we're borrowing an mm_struct. We have current->mm == NULL,
current->active_mm == cpu_tlbstate.active_mm, cpu_tlbstate.state
!= TLBSTATE_OK (i.e. TLBSTATE_LAZY or 0). The current cpu is set
in mm_cpumask(current->active_mm). CR3 points to
current->active_mm->pgd. The TLB is up to date.
- Lazy with init_mm. This happens when we call leave_mm(). We
have current->mm == NULL, current->active_mm ==
cpu_tlbstate.active_mm, but that mm is only relelvant insofar as
the scheduler is tracking it for refcounting. cpu_tlbstate.state
!= TLBSTATE_OK. The current cpu is clear in
mm_cpumask(current->active_mm). CR3 points to swapper_pg_dir,
i.e. init_mm->pgd.
This patch simplifies the situation. Other than perf, x86 stops
caring about current->active_mm at all. We have
cpu_tlbstate.loaded_mm pointing to the mm that CR3 references. The
TLB is always up to date for that mm. leave_mm() just switches us
to init_mm. There are no longer any special cases for mm_cpumask,
and switch_mm() switches mms without worrying about laziness.
After this patch, cpu_tlbstate.state serves only to tell the TLB
flush code whether it may switch to init_mm instead of doing a
normal flush.
This makes fairly extensive changes to xen_exit_mmap(), which used
to look a bit like black magic.
Perf is unchanged. With or without this change, perf may behave a bit
erratically if it tries to read user memory in kernel thread context.
We should build on this patch to teach perf to never look at user
memory when cpu_tlbstate.loaded_mm != current->mm.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Nadav Amit <namit@vmware.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-05-28 17:00:15 +00:00
|
|
|
|
|
|
|
switch_mm(NULL, &init_mm, NULL);
|
2008-03-03 17:12:54 +00:00
|
|
|
}
|
2017-11-04 11:16:12 +00:00
|
|
|
EXPORT_SYMBOL_GPL(leave_mm);
|
2008-03-03 17:12:54 +00:00
|
|
|
|
2016-04-26 16:39:08 +00:00
|
|
|
void switch_mm(struct mm_struct *prev, struct mm_struct *next,
|
|
|
|
struct task_struct *tsk)
|
2016-04-26 16:39:09 +00:00
|
|
|
{
|
|
|
|
unsigned long flags;
|
|
|
|
|
|
|
|
local_irq_save(flags);
|
|
|
|
switch_mm_irqs_off(prev, next, tsk);
|
|
|
|
local_irq_restore(flags);
|
|
|
|
}
|
|
|
|
|
2018-01-25 21:12:14 +00:00
|
|
|
static void sync_current_stack_to_mm(struct mm_struct *mm)
|
|
|
|
{
|
|
|
|
unsigned long sp = current_stack_pointer;
|
|
|
|
pgd_t *pgd = pgd_offset(mm, sp);
|
|
|
|
|
2018-05-18 10:35:24 +00:00
|
|
|
if (pgtable_l5_enabled()) {
|
2018-01-25 21:12:14 +00:00
|
|
|
if (unlikely(pgd_none(*pgd))) {
|
|
|
|
pgd_t *pgd_ref = pgd_offset_k(sp);
|
|
|
|
|
|
|
|
set_pgd(pgd, *pgd_ref);
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* "pgd" is faked. The top level entries are "p4d"s, so sync
|
|
|
|
* the p4d. This compiles to approximately the same code as
|
|
|
|
* the 5-level case.
|
|
|
|
*/
|
|
|
|
p4d_t *p4d = p4d_offset(pgd, sp);
|
|
|
|
|
|
|
|
if (unlikely(p4d_none(*p4d))) {
|
|
|
|
pgd_t *pgd_ref = pgd_offset_k(sp);
|
|
|
|
p4d_t *p4d_ref = p4d_offset(pgd_ref, sp);
|
|
|
|
|
|
|
|
set_p4d(p4d, *p4d_ref);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-11-25 18:33:49 +00:00
|
|
|
static inline unsigned long mm_mangle_tif_spec_ib(struct task_struct *next)
|
|
|
|
{
|
|
|
|
unsigned long next_tif = task_thread_info(next)->flags;
|
|
|
|
unsigned long ibpb = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_IBPB;
|
|
|
|
|
|
|
|
return (unsigned long)next->mm | ibpb;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void cond_ibpb(struct task_struct *next)
|
2018-09-25 12:38:18 +00:00
|
|
|
{
|
2018-11-25 18:33:49 +00:00
|
|
|
if (!next || !next->mm)
|
|
|
|
return;
|
|
|
|
|
2018-09-25 12:38:18 +00:00
|
|
|
/*
|
2018-11-25 18:33:49 +00:00
|
|
|
* Both, the conditional and the always IBPB mode use the mm
|
|
|
|
* pointer to avoid the IBPB when switching between tasks of the
|
|
|
|
* same process. Using the mm pointer instead of mm->context.ctx_id
|
|
|
|
* opens a hypothetical hole vs. mm_struct reuse, which is more or
|
|
|
|
* less impossible to control by an attacker. Aside of that it
|
|
|
|
* would only affect the first schedule so the theoretically
|
|
|
|
* exposed data is not really interesting.
|
2018-09-25 12:38:18 +00:00
|
|
|
*/
|
2018-11-25 18:33:49 +00:00
|
|
|
if (static_branch_likely(&switch_mm_cond_ibpb)) {
|
|
|
|
unsigned long prev_mm, next_mm;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This is a bit more complex than the always mode because
|
|
|
|
* it has to handle two cases:
|
|
|
|
*
|
|
|
|
* 1) Switch from a user space task (potential attacker)
|
|
|
|
* which has TIF_SPEC_IB set to a user space task
|
|
|
|
* (potential victim) which has TIF_SPEC_IB not set.
|
|
|
|
*
|
|
|
|
* 2) Switch from a user space task (potential attacker)
|
|
|
|
* which has TIF_SPEC_IB not set to a user space task
|
|
|
|
* (potential victim) which has TIF_SPEC_IB set.
|
|
|
|
*
|
|
|
|
* This could be done by unconditionally issuing IBPB when
|
|
|
|
* a task which has TIF_SPEC_IB set is either scheduled in
|
|
|
|
* or out. Though that results in two flushes when:
|
|
|
|
*
|
|
|
|
* - the same user space task is scheduled out and later
|
|
|
|
* scheduled in again and only a kernel thread ran in
|
|
|
|
* between.
|
|
|
|
*
|
|
|
|
* - a user space task belonging to the same process is
|
|
|
|
* scheduled in after a kernel thread ran in between
|
|
|
|
*
|
|
|
|
* - a user space task belonging to the same process is
|
|
|
|
* scheduled in immediately.
|
|
|
|
*
|
|
|
|
* Optimize this with reasonably small overhead for the
|
|
|
|
* above cases. Mangle the TIF_SPEC_IB bit into the mm
|
|
|
|
* pointer of the incoming task which is stored in
|
|
|
|
* cpu_tlbstate.last_user_mm_ibpb for comparison.
|
|
|
|
*/
|
|
|
|
next_mm = mm_mangle_tif_spec_ib(next);
|
|
|
|
prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_ibpb);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Issue IBPB only if the mm's are different and one or
|
|
|
|
* both have the IBPB bit set.
|
|
|
|
*/
|
|
|
|
if (next_mm != prev_mm &&
|
|
|
|
(next_mm | prev_mm) & LAST_USER_MM_IBPB)
|
|
|
|
indirect_branch_prediction_barrier();
|
|
|
|
|
|
|
|
this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, next_mm);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (static_branch_unlikely(&switch_mm_always_ibpb)) {
|
|
|
|
/*
|
|
|
|
* Only flush when switching to a user space task with a
|
|
|
|
* different context than the user space task which ran
|
|
|
|
* last on this CPU.
|
|
|
|
*/
|
|
|
|
if (this_cpu_read(cpu_tlbstate.last_user_mm) != next->mm) {
|
|
|
|
indirect_branch_prediction_barrier();
|
|
|
|
this_cpu_write(cpu_tlbstate.last_user_mm, next->mm);
|
|
|
|
}
|
|
|
|
}
|
2018-09-25 12:38:18 +00:00
|
|
|
}
|
|
|
|
|
2020-04-21 09:20:30 +00:00
|
|
|
#ifdef CONFIG_PERF_EVENTS
|
|
|
|
static inline void cr4_update_pce_mm(struct mm_struct *mm)
|
|
|
|
{
|
|
|
|
if (static_branch_unlikely(&rdpmc_always_available_key) ||
|
|
|
|
(!static_branch_unlikely(&rdpmc_never_available_key) &&
|
|
|
|
atomic_read(&mm->context.perf_rdpmc_allowed)))
|
|
|
|
cr4_set_bits_irqsoff(X86_CR4_PCE);
|
|
|
|
else
|
|
|
|
cr4_clear_bits_irqsoff(X86_CR4_PCE);
|
|
|
|
}
|
|
|
|
|
|
|
|
void cr4_update_pce(void *ignored)
|
|
|
|
{
|
|
|
|
cr4_update_pce_mm(this_cpu_read(cpu_tlbstate.loaded_mm));
|
|
|
|
}
|
|
|
|
|
|
|
|
#else
|
|
|
|
static inline void cr4_update_pce_mm(struct mm_struct *mm) { }
|
|
|
|
#endif
|
|
|
|
|
2016-04-26 16:39:09 +00:00
|
|
|
void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
|
|
|
|
struct task_struct *tsk)
|
2016-04-26 16:39:08 +00:00
|
|
|
{
|
x86/mm: Rework lazy TLB to track the actual loaded mm
Lazy TLB state is currently managed in a rather baroque manner.
AFAICT, there are three possible states:
- Non-lazy. This means that we're running a user thread or a
kernel thread that has called use_mm(). current->mm ==
current->active_mm == cpu_tlbstate.active_mm and
cpu_tlbstate.state == TLBSTATE_OK.
- Lazy with user mm. We're running a kernel thread without an mm
and we're borrowing an mm_struct. We have current->mm == NULL,
current->active_mm == cpu_tlbstate.active_mm, cpu_tlbstate.state
!= TLBSTATE_OK (i.e. TLBSTATE_LAZY or 0). The current cpu is set
in mm_cpumask(current->active_mm). CR3 points to
current->active_mm->pgd. The TLB is up to date.
- Lazy with init_mm. This happens when we call leave_mm(). We
have current->mm == NULL, current->active_mm ==
cpu_tlbstate.active_mm, but that mm is only relelvant insofar as
the scheduler is tracking it for refcounting. cpu_tlbstate.state
!= TLBSTATE_OK. The current cpu is clear in
mm_cpumask(current->active_mm). CR3 points to swapper_pg_dir,
i.e. init_mm->pgd.
This patch simplifies the situation. Other than perf, x86 stops
caring about current->active_mm at all. We have
cpu_tlbstate.loaded_mm pointing to the mm that CR3 references. The
TLB is always up to date for that mm. leave_mm() just switches us
to init_mm. There are no longer any special cases for mm_cpumask,
and switch_mm() switches mms without worrying about laziness.
After this patch, cpu_tlbstate.state serves only to tell the TLB
flush code whether it may switch to init_mm instead of doing a
normal flush.
This makes fairly extensive changes to xen_exit_mmap(), which used
to look a bit like black magic.
Perf is unchanged. With or without this change, perf may behave a bit
erratically if it tries to read user memory in kernel thread context.
We should build on this patch to teach perf to never look at user
memory when cpu_tlbstate.loaded_mm != current->mm.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Nadav Amit <namit@vmware.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-05-28 17:00:15 +00:00
|
|
|
struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
|
x86/mm: Implement PCID based optimization: try to preserve old TLB entries using PCID
PCID is a "process context ID" -- it's what other architectures call
an address space ID. Every non-global TLB entry is tagged with a
PCID, only TLB entries that match the currently selected PCID are
used, and we can switch PGDs without flushing the TLB. x86's
PCID is 12 bits.
This is an unorthodox approach to using PCID. x86's PCID is far too
short to uniquely identify a process, and we can't even really
uniquely identify a running process because there are monster
systems with over 4096 CPUs. To make matters worse, past attempts
to use all 12 PCID bits have resulted in slowdowns instead of
speedups.
This patch uses PCID differently. We use a PCID to identify a
recently-used mm on a per-cpu basis. An mm has no fixed PCID
binding at all; instead, we give it a fresh PCID each time it's
loaded except in cases where we want to preserve the TLB, in which
case we reuse a recent value.
Here are some benchmark results, done on a Skylake laptop at 2.3 GHz
(turbo off, intel_pstate requesting max performance) under KVM with
the guest using idle=poll (to avoid artifacts when bouncing between
CPUs). I haven't done any real statistics here -- I just ran them
in a loop and picked the fastest results that didn't look like
outliers. Unpatched means commit a4eb8b993554, so all the
bookkeeping overhead is gone.
ping-pong between two mms on the same CPU using eventfd:
patched: 1.22µs
patched, nopcid: 1.33µs
unpatched: 1.34µs
Same ping-pong, but now touch 512 pages (all zero-page to minimize
cache misses) each iteration. dTLB misses are measured by
dtlb_load_misses.miss_causes_a_walk:
patched: 1.8µs 11M dTLB misses
patched, nopcid: 6.2µs, 207M dTLB misses
unpatched: 6.1µs, 190M dTLB misses
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/9ee75f17a81770feed616358e6860d98a2a5b1e7.1500957502.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-07-25 04:41:38 +00:00
|
|
|
u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
|
2018-09-26 03:58:44 +00:00
|
|
|
bool was_lazy = this_cpu_read(cpu_tlbstate.is_lazy);
|
x86/mm: Rework lazy TLB mode and TLB freshness tracking
x86's lazy TLB mode used to be fairly weak -- it would switch to
init_mm the first time it tried to flush a lazy TLB. This meant an
unnecessary CR3 write and, if the flush was remote, an unnecessary
IPI.
Rewrite it entirely. When we enter lazy mode, we simply remove the
CPU from mm_cpumask. This means that we need a way to figure out
whether we've missed a flush when we switch back out of lazy mode.
I use the tlb_gen machinery to track whether a context is up to
date.
Note to reviewers: this patch, my itself, looks a bit odd. I'm
using an array of length 1 containing (ctx_id, tlb_gen) rather than
just storing tlb_gen, and making it at array isn't necessary yet.
I'm doing this because the next few patches add PCID support, and,
with PCID, we need ctx_id, and the array will end up with a length
greater than 1. Making it an array now means that there will be
less churn and therefore less stress on your eyeballs.
NB: This is dubious but, AFAICT, still correct on Xen and UV.
xen_exit_mmap() uses mm_cpumask() for nefarious purposes and this
patch changes the way that mm_cpumask() works. This should be okay,
since Xen *also* iterates all online CPUs to find all the CPUs it
needs to twiddle.
The UV tlbflush code is rather dated and should be changed.
Here are some benchmark results, done on a Skylake laptop at 2.3 GHz
(turbo off, intel_pstate requesting max performance) under KVM with
the guest using idle=poll (to avoid artifacts when bouncing between
CPUs). I haven't done any real statistics here -- I just ran them
in a loop and picked the fastest results that didn't look like
outliers. Unpatched means commit a4eb8b993554, so all the
bookkeeping overhead is gone.
MADV_DONTNEED; touch the page; switch CPUs using sched_setaffinity. In
an unpatched kernel, MADV_DONTNEED will send an IPI to the previous CPU.
This is intended to be a nearly worst-case test.
patched: 13.4µs
unpatched: 21.6µs
Vitaly's pthread_mmap microbenchmark with 8 threads (on four cores),
nrounds = 100, 256M data
patched: 1.1 seconds or so
unpatched: 1.9 seconds or so
The sleepup on Vitaly's test appearss to be because it spends a lot
of time blocked on mmap_sem, and this patch avoids sending IPIs to
blocked CPUs.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andrew Banman <abanman@sgi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dimitri Sivanich <sivanich@sgi.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Mike Travis <travis@sgi.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/ddf2c92962339f4ba39d8fc41b853936ec0b44f1.1498751203.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-06-29 15:53:17 +00:00
|
|
|
unsigned cpu = smp_processor_id();
|
|
|
|
u64 next_tlb_gen;
|
2018-09-26 03:58:39 +00:00
|
|
|
bool need_flush;
|
|
|
|
u16 new_asid;
|
2016-04-26 16:39:08 +00:00
|
|
|
|
x86/mm: Rework lazy TLB to track the actual loaded mm
Lazy TLB state is currently managed in a rather baroque manner.
AFAICT, there are three possible states:
- Non-lazy. This means that we're running a user thread or a
kernel thread that has called use_mm(). current->mm ==
current->active_mm == cpu_tlbstate.active_mm and
cpu_tlbstate.state == TLBSTATE_OK.
- Lazy with user mm. We're running a kernel thread without an mm
and we're borrowing an mm_struct. We have current->mm == NULL,
current->active_mm == cpu_tlbstate.active_mm, cpu_tlbstate.state
!= TLBSTATE_OK (i.e. TLBSTATE_LAZY or 0). The current cpu is set
in mm_cpumask(current->active_mm). CR3 points to
current->active_mm->pgd. The TLB is up to date.
- Lazy with init_mm. This happens when we call leave_mm(). We
have current->mm == NULL, current->active_mm ==
cpu_tlbstate.active_mm, but that mm is only relelvant insofar as
the scheduler is tracking it for refcounting. cpu_tlbstate.state
!= TLBSTATE_OK. The current cpu is clear in
mm_cpumask(current->active_mm). CR3 points to swapper_pg_dir,
i.e. init_mm->pgd.
This patch simplifies the situation. Other than perf, x86 stops
caring about current->active_mm at all. We have
cpu_tlbstate.loaded_mm pointing to the mm that CR3 references. The
TLB is always up to date for that mm. leave_mm() just switches us
to init_mm. There are no longer any special cases for mm_cpumask,
and switch_mm() switches mms without worrying about laziness.
After this patch, cpu_tlbstate.state serves only to tell the TLB
flush code whether it may switch to init_mm instead of doing a
normal flush.
This makes fairly extensive changes to xen_exit_mmap(), which used
to look a bit like black magic.
Perf is unchanged. With or without this change, perf may behave a bit
erratically if it tries to read user memory in kernel thread context.
We should build on this patch to teach perf to never look at user
memory when cpu_tlbstate.loaded_mm != current->mm.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Nadav Amit <namit@vmware.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-05-28 17:00:15 +00:00
|
|
|
/*
|
x86/mm: Rework lazy TLB mode and TLB freshness tracking
x86's lazy TLB mode used to be fairly weak -- it would switch to
init_mm the first time it tried to flush a lazy TLB. This meant an
unnecessary CR3 write and, if the flush was remote, an unnecessary
IPI.
Rewrite it entirely. When we enter lazy mode, we simply remove the
CPU from mm_cpumask. This means that we need a way to figure out
whether we've missed a flush when we switch back out of lazy mode.
I use the tlb_gen machinery to track whether a context is up to
date.
Note to reviewers: this patch, my itself, looks a bit odd. I'm
using an array of length 1 containing (ctx_id, tlb_gen) rather than
just storing tlb_gen, and making it at array isn't necessary yet.
I'm doing this because the next few patches add PCID support, and,
with PCID, we need ctx_id, and the array will end up with a length
greater than 1. Making it an array now means that there will be
less churn and therefore less stress on your eyeballs.
NB: This is dubious but, AFAICT, still correct on Xen and UV.
xen_exit_mmap() uses mm_cpumask() for nefarious purposes and this
patch changes the way that mm_cpumask() works. This should be okay,
since Xen *also* iterates all online CPUs to find all the CPUs it
needs to twiddle.
The UV tlbflush code is rather dated and should be changed.
Here are some benchmark results, done on a Skylake laptop at 2.3 GHz
(turbo off, intel_pstate requesting max performance) under KVM with
the guest using idle=poll (to avoid artifacts when bouncing between
CPUs). I haven't done any real statistics here -- I just ran them
in a loop and picked the fastest results that didn't look like
outliers. Unpatched means commit a4eb8b993554, so all the
bookkeeping overhead is gone.
MADV_DONTNEED; touch the page; switch CPUs using sched_setaffinity. In
an unpatched kernel, MADV_DONTNEED will send an IPI to the previous CPU.
This is intended to be a nearly worst-case test.
patched: 13.4µs
unpatched: 21.6µs
Vitaly's pthread_mmap microbenchmark with 8 threads (on four cores),
nrounds = 100, 256M data
patched: 1.1 seconds or so
unpatched: 1.9 seconds or so
The sleepup on Vitaly's test appearss to be because it spends a lot
of time blocked on mmap_sem, and this patch avoids sending IPIs to
blocked CPUs.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andrew Banman <abanman@sgi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dimitri Sivanich <sivanich@sgi.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Mike Travis <travis@sgi.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/ddf2c92962339f4ba39d8fc41b853936ec0b44f1.1498751203.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-06-29 15:53:17 +00:00
|
|
|
* NB: The scheduler will call us with prev == next when switching
|
|
|
|
* from lazy TLB mode to normal mode if active_mm isn't changing.
|
|
|
|
* When this happens, we don't assume that CR3 (and hence
|
|
|
|
* cpu_tlbstate.loaded_mm) matches next.
|
x86/mm: Rework lazy TLB to track the actual loaded mm
Lazy TLB state is currently managed in a rather baroque manner.
AFAICT, there are three possible states:
- Non-lazy. This means that we're running a user thread or a
kernel thread that has called use_mm(). current->mm ==
current->active_mm == cpu_tlbstate.active_mm and
cpu_tlbstate.state == TLBSTATE_OK.
- Lazy with user mm. We're running a kernel thread without an mm
and we're borrowing an mm_struct. We have current->mm == NULL,
current->active_mm == cpu_tlbstate.active_mm, cpu_tlbstate.state
!= TLBSTATE_OK (i.e. TLBSTATE_LAZY or 0). The current cpu is set
in mm_cpumask(current->active_mm). CR3 points to
current->active_mm->pgd. The TLB is up to date.
- Lazy with init_mm. This happens when we call leave_mm(). We
have current->mm == NULL, current->active_mm ==
cpu_tlbstate.active_mm, but that mm is only relelvant insofar as
the scheduler is tracking it for refcounting. cpu_tlbstate.state
!= TLBSTATE_OK. The current cpu is clear in
mm_cpumask(current->active_mm). CR3 points to swapper_pg_dir,
i.e. init_mm->pgd.
This patch simplifies the situation. Other than perf, x86 stops
caring about current->active_mm at all. We have
cpu_tlbstate.loaded_mm pointing to the mm that CR3 references. The
TLB is always up to date for that mm. leave_mm() just switches us
to init_mm. There are no longer any special cases for mm_cpumask,
and switch_mm() switches mms without worrying about laziness.
After this patch, cpu_tlbstate.state serves only to tell the TLB
flush code whether it may switch to init_mm instead of doing a
normal flush.
This makes fairly extensive changes to xen_exit_mmap(), which used
to look a bit like black magic.
Perf is unchanged. With or without this change, perf may behave a bit
erratically if it tries to read user memory in kernel thread context.
We should build on this patch to teach perf to never look at user
memory when cpu_tlbstate.loaded_mm != current->mm.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Nadav Amit <namit@vmware.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-05-28 17:00:15 +00:00
|
|
|
*
|
|
|
|
* NB: leave_mm() calls us with prev == NULL and tsk == NULL.
|
|
|
|
*/
|
2016-08-11 09:35:23 +00:00
|
|
|
|
x86/mm: Rework lazy TLB mode and TLB freshness tracking
x86's lazy TLB mode used to be fairly weak -- it would switch to
init_mm the first time it tried to flush a lazy TLB. This meant an
unnecessary CR3 write and, if the flush was remote, an unnecessary
IPI.
Rewrite it entirely. When we enter lazy mode, we simply remove the
CPU from mm_cpumask. This means that we need a way to figure out
whether we've missed a flush when we switch back out of lazy mode.
I use the tlb_gen machinery to track whether a context is up to
date.
Note to reviewers: this patch, my itself, looks a bit odd. I'm
using an array of length 1 containing (ctx_id, tlb_gen) rather than
just storing tlb_gen, and making it at array isn't necessary yet.
I'm doing this because the next few patches add PCID support, and,
with PCID, we need ctx_id, and the array will end up with a length
greater than 1. Making it an array now means that there will be
less churn and therefore less stress on your eyeballs.
NB: This is dubious but, AFAICT, still correct on Xen and UV.
xen_exit_mmap() uses mm_cpumask() for nefarious purposes and this
patch changes the way that mm_cpumask() works. This should be okay,
since Xen *also* iterates all online CPUs to find all the CPUs it
needs to twiddle.
The UV tlbflush code is rather dated and should be changed.
Here are some benchmark results, done on a Skylake laptop at 2.3 GHz
(turbo off, intel_pstate requesting max performance) under KVM with
the guest using idle=poll (to avoid artifacts when bouncing between
CPUs). I haven't done any real statistics here -- I just ran them
in a loop and picked the fastest results that didn't look like
outliers. Unpatched means commit a4eb8b993554, so all the
bookkeeping overhead is gone.
MADV_DONTNEED; touch the page; switch CPUs using sched_setaffinity. In
an unpatched kernel, MADV_DONTNEED will send an IPI to the previous CPU.
This is intended to be a nearly worst-case test.
patched: 13.4µs
unpatched: 21.6µs
Vitaly's pthread_mmap microbenchmark with 8 threads (on four cores),
nrounds = 100, 256M data
patched: 1.1 seconds or so
unpatched: 1.9 seconds or so
The sleepup on Vitaly's test appearss to be because it spends a lot
of time blocked on mmap_sem, and this patch avoids sending IPIs to
blocked CPUs.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andrew Banman <abanman@sgi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dimitri Sivanich <sivanich@sgi.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Mike Travis <travis@sgi.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/ddf2c92962339f4ba39d8fc41b853936ec0b44f1.1498751203.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-06-29 15:53:17 +00:00
|
|
|
/* We don't want flush_tlb_func_* to run concurrently with us. */
|
|
|
|
if (IS_ENABLED(CONFIG_PROVE_LOCKING))
|
|
|
|
WARN_ON_ONCE(!irqs_disabled());
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Verify that CR3 is what we think it is. This will catch
|
|
|
|
* hypothetical buggy code that directly switches to swapper_pg_dir
|
x86/mm: Implement PCID based optimization: try to preserve old TLB entries using PCID
PCID is a "process context ID" -- it's what other architectures call
an address space ID. Every non-global TLB entry is tagged with a
PCID, only TLB entries that match the currently selected PCID are
used, and we can switch PGDs without flushing the TLB. x86's
PCID is 12 bits.
This is an unorthodox approach to using PCID. x86's PCID is far too
short to uniquely identify a process, and we can't even really
uniquely identify a running process because there are monster
systems with over 4096 CPUs. To make matters worse, past attempts
to use all 12 PCID bits have resulted in slowdowns instead of
speedups.
This patch uses PCID differently. We use a PCID to identify a
recently-used mm on a per-cpu basis. An mm has no fixed PCID
binding at all; instead, we give it a fresh PCID each time it's
loaded except in cases where we want to preserve the TLB, in which
case we reuse a recent value.
Here are some benchmark results, done on a Skylake laptop at 2.3 GHz
(turbo off, intel_pstate requesting max performance) under KVM with
the guest using idle=poll (to avoid artifacts when bouncing between
CPUs). I haven't done any real statistics here -- I just ran them
in a loop and picked the fastest results that didn't look like
outliers. Unpatched means commit a4eb8b993554, so all the
bookkeeping overhead is gone.
ping-pong between two mms on the same CPU using eventfd:
patched: 1.22µs
patched, nopcid: 1.33µs
unpatched: 1.34µs
Same ping-pong, but now touch 512 pages (all zero-page to minimize
cache misses) each iteration. dTLB misses are measured by
dtlb_load_misses.miss_causes_a_walk:
patched: 1.8µs 11M dTLB misses
patched, nopcid: 6.2µs, 207M dTLB misses
unpatched: 6.1µs, 190M dTLB misses
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/9ee75f17a81770feed616358e6860d98a2a5b1e7.1500957502.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-07-25 04:41:38 +00:00
|
|
|
* without going through leave_mm() / switch_mm_irqs_off() or that
|
|
|
|
* does something like write_cr3(read_cr3_pa()).
|
2017-09-08 05:06:57 +00:00
|
|
|
*
|
|
|
|
* Only do this check if CONFIG_DEBUG_VM=y because __read_cr3()
|
|
|
|
* isn't free.
|
x86/mm: Rework lazy TLB mode and TLB freshness tracking
x86's lazy TLB mode used to be fairly weak -- it would switch to
init_mm the first time it tried to flush a lazy TLB. This meant an
unnecessary CR3 write and, if the flush was remote, an unnecessary
IPI.
Rewrite it entirely. When we enter lazy mode, we simply remove the
CPU from mm_cpumask. This means that we need a way to figure out
whether we've missed a flush when we switch back out of lazy mode.
I use the tlb_gen machinery to track whether a context is up to
date.
Note to reviewers: this patch, my itself, looks a bit odd. I'm
using an array of length 1 containing (ctx_id, tlb_gen) rather than
just storing tlb_gen, and making it at array isn't necessary yet.
I'm doing this because the next few patches add PCID support, and,
with PCID, we need ctx_id, and the array will end up with a length
greater than 1. Making it an array now means that there will be
less churn and therefore less stress on your eyeballs.
NB: This is dubious but, AFAICT, still correct on Xen and UV.
xen_exit_mmap() uses mm_cpumask() for nefarious purposes and this
patch changes the way that mm_cpumask() works. This should be okay,
since Xen *also* iterates all online CPUs to find all the CPUs it
needs to twiddle.
The UV tlbflush code is rather dated and should be changed.
Here are some benchmark results, done on a Skylake laptop at 2.3 GHz
(turbo off, intel_pstate requesting max performance) under KVM with
the guest using idle=poll (to avoid artifacts when bouncing between
CPUs). I haven't done any real statistics here -- I just ran them
in a loop and picked the fastest results that didn't look like
outliers. Unpatched means commit a4eb8b993554, so all the
bookkeeping overhead is gone.
MADV_DONTNEED; touch the page; switch CPUs using sched_setaffinity. In
an unpatched kernel, MADV_DONTNEED will send an IPI to the previous CPU.
This is intended to be a nearly worst-case test.
patched: 13.4µs
unpatched: 21.6µs
Vitaly's pthread_mmap microbenchmark with 8 threads (on four cores),
nrounds = 100, 256M data
patched: 1.1 seconds or so
unpatched: 1.9 seconds or so
The sleepup on Vitaly's test appearss to be because it spends a lot
of time blocked on mmap_sem, and this patch avoids sending IPIs to
blocked CPUs.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andrew Banman <abanman@sgi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dimitri Sivanich <sivanich@sgi.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Mike Travis <travis@sgi.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/ddf2c92962339f4ba39d8fc41b853936ec0b44f1.1498751203.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-06-29 15:53:17 +00:00
|
|
|
*/
|
2017-09-08 05:06:57 +00:00
|
|
|
#ifdef CONFIG_DEBUG_VM
|
2017-12-04 14:07:54 +00:00
|
|
|
if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid))) {
|
2017-09-08 05:06:57 +00:00
|
|
|
/*
|
|
|
|
* If we were to BUG here, we'd be very likely to kill
|
|
|
|
* the system so hard that we don't see the call trace.
|
|
|
|
* Try to recover instead by ignoring the error and doing
|
|
|
|
* a global flush to minimize the chance of corruption.
|
|
|
|
*
|
|
|
|
* (This is far from being a fully correct recovery.
|
|
|
|
* Architecturally, the CPU could prefetch something
|
|
|
|
* back into an incorrect ASID slot and leave it there
|
|
|
|
* to cause trouble down the road. It's better than
|
|
|
|
* nothing, though.)
|
|
|
|
*/
|
|
|
|
__flush_tlb_all();
|
|
|
|
}
|
|
|
|
#endif
|
x86/mm: Flush more aggressively in lazy TLB mode
Since commit:
94b1b03b519b ("x86/mm: Rework lazy TLB mode and TLB freshness tracking")
x86's lazy TLB mode has been all the way lazy: when running a kernel thread
(including the idle thread), the kernel keeps using the last user mm's
page tables without attempting to maintain user TLB coherence at all.
From a pure semantic perspective, this is fine -- kernel threads won't
attempt to access user pages, so having stale TLB entries doesn't matter.
Unfortunately, I forgot about a subtlety. By skipping TLB flushes,
we also allow any paging-structure caches that may exist on the CPU
to become incoherent. This means that we can have a
paging-structure cache entry that references a freed page table, and
the CPU is within its rights to do a speculative page walk starting
at the freed page table.
I can imagine this causing two different problems:
- A speculative page walk starting from a bogus page table could read
IO addresses. I haven't seen any reports of this causing problems.
- A speculative page walk that involves a bogus page table can install
garbage in the TLB. Such garbage would always be at a user VA, but
some AMD CPUs have logic that triggers a machine check when it notices
these bogus entries. I've seen a couple reports of this.
Boris further explains the failure mode:
> It is actually more of an optimization which assumes that paging-structure
> entries are in WB DRAM:
>
> "TlbCacheDis: cacheable memory disable. Read-write. 0=Enables
> performance optimization that assumes PML4, PDP, PDE, and PTE entries
> are in cacheable WB-DRAM; memory type checks may be bypassed, and
> addresses outside of WB-DRAM may result in undefined behavior or NB
> protocol errors. 1=Disables performance optimization and allows PML4,
> PDP, PDE and PTE entries to be in any memory type. Operating systems
> that maintain page tables in memory types other than WB- DRAM must set
> TlbCacheDis to insure proper operation."
>
> The MCE generated is an NB protocol error to signal that
>
> "Link: A specific coherent-only packet from a CPU was issued to an
> IO link. This may be caused by software which addresses page table
> structures in a memory type other than cacheable WB-DRAM without
> properly configuring MSRC001_0015[TlbCacheDis]. This may occur, for
> example, when page table structure addresses are above top of memory. In
> such cases, the NB will generate an MCE if it sees a mismatch between
> the memory operation generated by the core and the link type."
>
> I'm assuming coherent-only packets don't go out on IO links, thus the
> error.
To fix this, reinstate TLB coherence in lazy mode. With this patch
applied, we do it in one of two ways:
- If we have PCID, we simply switch back to init_mm's page tables
when we enter a kernel thread -- this seems to be quite cheap
except for the cost of serializing the CPU.
- If we don't have PCID, then we set a flag and switch to init_mm
the first time we would otherwise need to flush the TLB.
The /sys/kernel/debug/x86/tlb_use_lazy_mode debug switch can be changed
to override the default mode for benchmarking.
In theory, we could optimize this better by only flushing the TLB in
lazy CPUs when a page table is freed. Doing that would require
auditing the mm code to make sure that all page table freeing goes
through tlb_remove_page() as well as reworking some data structures
to implement the improved flush logic.
Reported-by: Markus Trippelsdorf <markus@trippelsdorf.de>
Reported-by: Adam Borowski <kilobyte@angband.pl>
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Eric Biggers <ebiggers@google.com>
Cc: Johannes Hirte <johannes.hirte@datenkhaos.de>
Cc: Kees Cook <keescook@chromium.org>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Roman Kagan <rkagan@virtuozzo.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: 94b1b03b519b ("x86/mm: Rework lazy TLB mode and TLB freshness tracking")
Link: http://lkml.kernel.org/r/20171009170231.fkpraqokz6e4zeco@pd.tnic
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-10-09 16:50:49 +00:00
|
|
|
this_cpu_write(cpu_tlbstate.is_lazy, false);
|
2016-08-11 09:35:23 +00:00
|
|
|
|
2018-01-29 20:20:12 +00:00
|
|
|
/*
|
membarrier/x86: Provide core serializing command
There are two places where core serialization is needed by membarrier:
1) When returning from the membarrier IPI,
2) After scheduler updates curr to a thread with a different mm, before
going back to user-space, since the curr->mm is used by membarrier to
check whether it needs to send an IPI to that CPU.
x86-32 uses IRET as return from interrupt, and both IRET and SYSEXIT to go
back to user-space. The IRET instruction is core serializing, but not
SYSEXIT.
x86-64 uses IRET as return from interrupt, which takes care of the IPI.
However, it can return to user-space through either SYSRETL (compat
code), SYSRETQ, or IRET. Given that SYSRET{L,Q} is not core serializing,
we rely instead on write_cr3() performed by switch_mm() to provide core
serialization after changing the current mm, and deal with the special
case of kthread -> uthread (temporarily keeping current mm into
active_mm) by adding a sync_core() in that specific case.
Use the new sync_core_before_usermode() to guarantee this.
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrea Parri <parri.andrea@gmail.com>
Cc: Andrew Hunter <ahh@google.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Avi Kivity <avi@scylladb.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Dave Watson <davejwatson@fb.com>
Cc: David Sehr <sehr@google.com>
Cc: Greg Hackmann <ghackmann@google.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Maged Michael <maged.michael@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Will Deacon <will.deacon@arm.com>
Cc: linux-api@vger.kernel.org
Cc: linux-arch@vger.kernel.org
Link: http://lkml.kernel.org/r/20180129202020.8515-10-mathieu.desnoyers@efficios.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2018-01-29 20:20:18 +00:00
|
|
|
* The membarrier system call requires a full memory barrier and
|
|
|
|
* core serialization before returning to user-space, after
|
|
|
|
* storing to rq->curr. Writing to CR3 provides that full
|
|
|
|
* memory barrier and core serializing instruction.
|
2018-01-29 20:20:12 +00:00
|
|
|
*/
|
x86/mm: Rework lazy TLB to track the actual loaded mm
Lazy TLB state is currently managed in a rather baroque manner.
AFAICT, there are three possible states:
- Non-lazy. This means that we're running a user thread or a
kernel thread that has called use_mm(). current->mm ==
current->active_mm == cpu_tlbstate.active_mm and
cpu_tlbstate.state == TLBSTATE_OK.
- Lazy with user mm. We're running a kernel thread without an mm
and we're borrowing an mm_struct. We have current->mm == NULL,
current->active_mm == cpu_tlbstate.active_mm, cpu_tlbstate.state
!= TLBSTATE_OK (i.e. TLBSTATE_LAZY or 0). The current cpu is set
in mm_cpumask(current->active_mm). CR3 points to
current->active_mm->pgd. The TLB is up to date.
- Lazy with init_mm. This happens when we call leave_mm(). We
have current->mm == NULL, current->active_mm ==
cpu_tlbstate.active_mm, but that mm is only relelvant insofar as
the scheduler is tracking it for refcounting. cpu_tlbstate.state
!= TLBSTATE_OK. The current cpu is clear in
mm_cpumask(current->active_mm). CR3 points to swapper_pg_dir,
i.e. init_mm->pgd.
This patch simplifies the situation. Other than perf, x86 stops
caring about current->active_mm at all. We have
cpu_tlbstate.loaded_mm pointing to the mm that CR3 references. The
TLB is always up to date for that mm. leave_mm() just switches us
to init_mm. There are no longer any special cases for mm_cpumask,
and switch_mm() switches mms without worrying about laziness.
After this patch, cpu_tlbstate.state serves only to tell the TLB
flush code whether it may switch to init_mm instead of doing a
normal flush.
This makes fairly extensive changes to xen_exit_mmap(), which used
to look a bit like black magic.
Perf is unchanged. With or without this change, perf may behave a bit
erratically if it tries to read user memory in kernel thread context.
We should build on this patch to teach perf to never look at user
memory when cpu_tlbstate.loaded_mm != current->mm.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Nadav Amit <namit@vmware.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-05-28 17:00:15 +00:00
|
|
|
if (real_prev == next) {
|
2017-10-14 16:59:49 +00:00
|
|
|
VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
|
|
|
|
next->context.ctx_id);
|
x86/mm: Rework lazy TLB mode and TLB freshness tracking
x86's lazy TLB mode used to be fairly weak -- it would switch to
init_mm the first time it tried to flush a lazy TLB. This meant an
unnecessary CR3 write and, if the flush was remote, an unnecessary
IPI.
Rewrite it entirely. When we enter lazy mode, we simply remove the
CPU from mm_cpumask. This means that we need a way to figure out
whether we've missed a flush when we switch back out of lazy mode.
I use the tlb_gen machinery to track whether a context is up to
date.
Note to reviewers: this patch, my itself, looks a bit odd. I'm
using an array of length 1 containing (ctx_id, tlb_gen) rather than
just storing tlb_gen, and making it at array isn't necessary yet.
I'm doing this because the next few patches add PCID support, and,
with PCID, we need ctx_id, and the array will end up with a length
greater than 1. Making it an array now means that there will be
less churn and therefore less stress on your eyeballs.
NB: This is dubious but, AFAICT, still correct on Xen and UV.
xen_exit_mmap() uses mm_cpumask() for nefarious purposes and this
patch changes the way that mm_cpumask() works. This should be okay,
since Xen *also* iterates all online CPUs to find all the CPUs it
needs to twiddle.
The UV tlbflush code is rather dated and should be changed.
Here are some benchmark results, done on a Skylake laptop at 2.3 GHz
(turbo off, intel_pstate requesting max performance) under KVM with
the guest using idle=poll (to avoid artifacts when bouncing between
CPUs). I haven't done any real statistics here -- I just ran them
in a loop and picked the fastest results that didn't look like
outliers. Unpatched means commit a4eb8b993554, so all the
bookkeeping overhead is gone.
MADV_DONTNEED; touch the page; switch CPUs using sched_setaffinity. In
an unpatched kernel, MADV_DONTNEED will send an IPI to the previous CPU.
This is intended to be a nearly worst-case test.
patched: 13.4µs
unpatched: 21.6µs
Vitaly's pthread_mmap microbenchmark with 8 threads (on four cores),
nrounds = 100, 256M data
patched: 1.1 seconds or so
unpatched: 1.9 seconds or so
The sleepup on Vitaly's test appearss to be because it spends a lot
of time blocked on mmap_sem, and this patch avoids sending IPIs to
blocked CPUs.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andrew Banman <abanman@sgi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dimitri Sivanich <sivanich@sgi.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Mike Travis <travis@sgi.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/ddf2c92962339f4ba39d8fc41b853936ec0b44f1.1498751203.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-06-29 15:53:17 +00:00
|
|
|
|
2016-04-26 16:39:08 +00:00
|
|
|
/*
|
2018-09-26 03:58:44 +00:00
|
|
|
* Even in lazy TLB mode, the CPU should stay set in the
|
|
|
|
* mm_cpumask. The TLB shootdown code can figure out from
|
|
|
|
* from cpu_tlbstate.is_lazy whether or not to send an IPI.
|
2016-04-26 16:39:08 +00:00
|
|
|
*/
|
x86/mm: Flush more aggressively in lazy TLB mode
Since commit:
94b1b03b519b ("x86/mm: Rework lazy TLB mode and TLB freshness tracking")
x86's lazy TLB mode has been all the way lazy: when running a kernel thread
(including the idle thread), the kernel keeps using the last user mm's
page tables without attempting to maintain user TLB coherence at all.
From a pure semantic perspective, this is fine -- kernel threads won't
attempt to access user pages, so having stale TLB entries doesn't matter.
Unfortunately, I forgot about a subtlety. By skipping TLB flushes,
we also allow any paging-structure caches that may exist on the CPU
to become incoherent. This means that we can have a
paging-structure cache entry that references a freed page table, and
the CPU is within its rights to do a speculative page walk starting
at the freed page table.
I can imagine this causing two different problems:
- A speculative page walk starting from a bogus page table could read
IO addresses. I haven't seen any reports of this causing problems.
- A speculative page walk that involves a bogus page table can install
garbage in the TLB. Such garbage would always be at a user VA, but
some AMD CPUs have logic that triggers a machine check when it notices
these bogus entries. I've seen a couple reports of this.
Boris further explains the failure mode:
> It is actually more of an optimization which assumes that paging-structure
> entries are in WB DRAM:
>
> "TlbCacheDis: cacheable memory disable. Read-write. 0=Enables
> performance optimization that assumes PML4, PDP, PDE, and PTE entries
> are in cacheable WB-DRAM; memory type checks may be bypassed, and
> addresses outside of WB-DRAM may result in undefined behavior or NB
> protocol errors. 1=Disables performance optimization and allows PML4,
> PDP, PDE and PTE entries to be in any memory type. Operating systems
> that maintain page tables in memory types other than WB- DRAM must set
> TlbCacheDis to insure proper operation."
>
> The MCE generated is an NB protocol error to signal that
>
> "Link: A specific coherent-only packet from a CPU was issued to an
> IO link. This may be caused by software which addresses page table
> structures in a memory type other than cacheable WB-DRAM without
> properly configuring MSRC001_0015[TlbCacheDis]. This may occur, for
> example, when page table structure addresses are above top of memory. In
> such cases, the NB will generate an MCE if it sees a mismatch between
> the memory operation generated by the core and the link type."
>
> I'm assuming coherent-only packets don't go out on IO links, thus the
> error.
To fix this, reinstate TLB coherence in lazy mode. With this patch
applied, we do it in one of two ways:
- If we have PCID, we simply switch back to init_mm's page tables
when we enter a kernel thread -- this seems to be quite cheap
except for the cost of serializing the CPU.
- If we don't have PCID, then we set a flag and switch to init_mm
the first time we would otherwise need to flush the TLB.
The /sys/kernel/debug/x86/tlb_use_lazy_mode debug switch can be changed
to override the default mode for benchmarking.
In theory, we could optimize this better by only flushing the TLB in
lazy CPUs when a page table is freed. Doing that would require
auditing the mm code to make sure that all page table freeing goes
through tlb_remove_page() as well as reworking some data structures
to implement the improved flush logic.
Reported-by: Markus Trippelsdorf <markus@trippelsdorf.de>
Reported-by: Adam Borowski <kilobyte@angband.pl>
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Eric Biggers <ebiggers@google.com>
Cc: Johannes Hirte <johannes.hirte@datenkhaos.de>
Cc: Kees Cook <keescook@chromium.org>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Roman Kagan <rkagan@virtuozzo.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: 94b1b03b519b ("x86/mm: Rework lazy TLB mode and TLB freshness tracking")
Link: http://lkml.kernel.org/r/20171009170231.fkpraqokz6e4zeco@pd.tnic
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-10-09 16:50:49 +00:00
|
|
|
if (WARN_ON_ONCE(real_prev != &init_mm &&
|
|
|
|
!cpumask_test_cpu(cpu, mm_cpumask(next))))
|
|
|
|
cpumask_set_cpu(cpu, mm_cpumask(next));
|
|
|
|
|
2018-09-26 03:58:44 +00:00
|
|
|
/*
|
|
|
|
* If the CPU is not in lazy TLB mode, we are just switching
|
|
|
|
* from one thread in a process to another thread in the same
|
|
|
|
* process. No TLB flush required.
|
|
|
|
*/
|
|
|
|
if (!was_lazy)
|
|
|
|
return;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Read the tlb_gen to check whether a flush is needed.
|
|
|
|
* If the TLB is up to date, just use it.
|
|
|
|
* The barrier synchronizes with the tlb_gen increment in
|
|
|
|
* the TLB shootdown code.
|
|
|
|
*/
|
|
|
|
smp_mb();
|
|
|
|
next_tlb_gen = atomic64_read(&next->context.tlb_gen);
|
|
|
|
if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) ==
|
|
|
|
next_tlb_gen)
|
|
|
|
return;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* TLB contents went out of date while we were in lazy
|
|
|
|
* mode. Fall through to the TLB switching code below.
|
|
|
|
*/
|
|
|
|
new_asid = prev_asid;
|
|
|
|
need_flush = true;
|
x86/mm: Rework lazy TLB mode and TLB freshness tracking
x86's lazy TLB mode used to be fairly weak -- it would switch to
init_mm the first time it tried to flush a lazy TLB. This meant an
unnecessary CR3 write and, if the flush was remote, an unnecessary
IPI.
Rewrite it entirely. When we enter lazy mode, we simply remove the
CPU from mm_cpumask. This means that we need a way to figure out
whether we've missed a flush when we switch back out of lazy mode.
I use the tlb_gen machinery to track whether a context is up to
date.
Note to reviewers: this patch, my itself, looks a bit odd. I'm
using an array of length 1 containing (ctx_id, tlb_gen) rather than
just storing tlb_gen, and making it at array isn't necessary yet.
I'm doing this because the next few patches add PCID support, and,
with PCID, we need ctx_id, and the array will end up with a length
greater than 1. Making it an array now means that there will be
less churn and therefore less stress on your eyeballs.
NB: This is dubious but, AFAICT, still correct on Xen and UV.
xen_exit_mmap() uses mm_cpumask() for nefarious purposes and this
patch changes the way that mm_cpumask() works. This should be okay,
since Xen *also* iterates all online CPUs to find all the CPUs it
needs to twiddle.
The UV tlbflush code is rather dated and should be changed.
Here are some benchmark results, done on a Skylake laptop at 2.3 GHz
(turbo off, intel_pstate requesting max performance) under KVM with
the guest using idle=poll (to avoid artifacts when bouncing between
CPUs). I haven't done any real statistics here -- I just ran them
in a loop and picked the fastest results that didn't look like
outliers. Unpatched means commit a4eb8b993554, so all the
bookkeeping overhead is gone.
MADV_DONTNEED; touch the page; switch CPUs using sched_setaffinity. In
an unpatched kernel, MADV_DONTNEED will send an IPI to the previous CPU.
This is intended to be a nearly worst-case test.
patched: 13.4µs
unpatched: 21.6µs
Vitaly's pthread_mmap microbenchmark with 8 threads (on four cores),
nrounds = 100, 256M data
patched: 1.1 seconds or so
unpatched: 1.9 seconds or so
The sleepup on Vitaly's test appearss to be because it spends a lot
of time blocked on mmap_sem, and this patch avoids sending IPIs to
blocked CPUs.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andrew Banman <abanman@sgi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dimitri Sivanich <sivanich@sgi.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Mike Travis <travis@sgi.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/ddf2c92962339f4ba39d8fc41b853936ec0b44f1.1498751203.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-06-29 15:53:17 +00:00
|
|
|
} else {
|
2018-01-29 22:04:47 +00:00
|
|
|
/*
|
|
|
|
* Avoid user/user BTB poisoning by flushing the branch
|
|
|
|
* predictor when switching between processes. This stops
|
|
|
|
* one process from doing Spectre-v2 attacks on another.
|
|
|
|
*/
|
2018-11-25 18:33:49 +00:00
|
|
|
cond_ibpb(tsk);
|
x86/mm: Rework lazy TLB mode and TLB freshness tracking
x86's lazy TLB mode used to be fairly weak -- it would switch to
init_mm the first time it tried to flush a lazy TLB. This meant an
unnecessary CR3 write and, if the flush was remote, an unnecessary
IPI.
Rewrite it entirely. When we enter lazy mode, we simply remove the
CPU from mm_cpumask. This means that we need a way to figure out
whether we've missed a flush when we switch back out of lazy mode.
I use the tlb_gen machinery to track whether a context is up to
date.
Note to reviewers: this patch, my itself, looks a bit odd. I'm
using an array of length 1 containing (ctx_id, tlb_gen) rather than
just storing tlb_gen, and making it at array isn't necessary yet.
I'm doing this because the next few patches add PCID support, and,
with PCID, we need ctx_id, and the array will end up with a length
greater than 1. Making it an array now means that there will be
less churn and therefore less stress on your eyeballs.
NB: This is dubious but, AFAICT, still correct on Xen and UV.
xen_exit_mmap() uses mm_cpumask() for nefarious purposes and this
patch changes the way that mm_cpumask() works. This should be okay,
since Xen *also* iterates all online CPUs to find all the CPUs it
needs to twiddle.
The UV tlbflush code is rather dated and should be changed.
Here are some benchmark results, done on a Skylake laptop at 2.3 GHz
(turbo off, intel_pstate requesting max performance) under KVM with
the guest using idle=poll (to avoid artifacts when bouncing between
CPUs). I haven't done any real statistics here -- I just ran them
in a loop and picked the fastest results that didn't look like
outliers. Unpatched means commit a4eb8b993554, so all the
bookkeeping overhead is gone.
MADV_DONTNEED; touch the page; switch CPUs using sched_setaffinity. In
an unpatched kernel, MADV_DONTNEED will send an IPI to the previous CPU.
This is intended to be a nearly worst-case test.
patched: 13.4µs
unpatched: 21.6µs
Vitaly's pthread_mmap microbenchmark with 8 threads (on four cores),
nrounds = 100, 256M data
patched: 1.1 seconds or so
unpatched: 1.9 seconds or so
The sleepup on Vitaly's test appearss to be because it spends a lot
of time blocked on mmap_sem, and this patch avoids sending IPIs to
blocked CPUs.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andrew Banman <abanman@sgi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dimitri Sivanich <sivanich@sgi.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Mike Travis <travis@sgi.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/ddf2c92962339f4ba39d8fc41b853936ec0b44f1.1498751203.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-06-29 15:53:17 +00:00
|
|
|
|
|
|
|
if (IS_ENABLED(CONFIG_VMAP_STACK)) {
|
|
|
|
/*
|
|
|
|
* If our current stack is in vmalloc space and isn't
|
|
|
|
* mapped in the new pgd, we'll double-fault. Forcibly
|
|
|
|
* map it.
|
|
|
|
*/
|
2018-01-25 21:12:14 +00:00
|
|
|
sync_current_stack_to_mm(next);
|
x86/mm: Rework lazy TLB mode and TLB freshness tracking
x86's lazy TLB mode used to be fairly weak -- it would switch to
init_mm the first time it tried to flush a lazy TLB. This meant an
unnecessary CR3 write and, if the flush was remote, an unnecessary
IPI.
Rewrite it entirely. When we enter lazy mode, we simply remove the
CPU from mm_cpumask. This means that we need a way to figure out
whether we've missed a flush when we switch back out of lazy mode.
I use the tlb_gen machinery to track whether a context is up to
date.
Note to reviewers: this patch, my itself, looks a bit odd. I'm
using an array of length 1 containing (ctx_id, tlb_gen) rather than
just storing tlb_gen, and making it at array isn't necessary yet.
I'm doing this because the next few patches add PCID support, and,
with PCID, we need ctx_id, and the array will end up with a length
greater than 1. Making it an array now means that there will be
less churn and therefore less stress on your eyeballs.
NB: This is dubious but, AFAICT, still correct on Xen and UV.
xen_exit_mmap() uses mm_cpumask() for nefarious purposes and this
patch changes the way that mm_cpumask() works. This should be okay,
since Xen *also* iterates all online CPUs to find all the CPUs it
needs to twiddle.
The UV tlbflush code is rather dated and should be changed.
Here are some benchmark results, done on a Skylake laptop at 2.3 GHz
(turbo off, intel_pstate requesting max performance) under KVM with
the guest using idle=poll (to avoid artifacts when bouncing between
CPUs). I haven't done any real statistics here -- I just ran them
in a loop and picked the fastest results that didn't look like
outliers. Unpatched means commit a4eb8b993554, so all the
bookkeeping overhead is gone.
MADV_DONTNEED; touch the page; switch CPUs using sched_setaffinity. In
an unpatched kernel, MADV_DONTNEED will send an IPI to the previous CPU.
This is intended to be a nearly worst-case test.
patched: 13.4µs
unpatched: 21.6µs
Vitaly's pthread_mmap microbenchmark with 8 threads (on four cores),
nrounds = 100, 256M data
patched: 1.1 seconds or so
unpatched: 1.9 seconds or so
The sleepup on Vitaly's test appearss to be because it spends a lot
of time blocked on mmap_sem, and this patch avoids sending IPIs to
blocked CPUs.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andrew Banman <abanman@sgi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dimitri Sivanich <sivanich@sgi.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Mike Travis <travis@sgi.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/ddf2c92962339f4ba39d8fc41b853936ec0b44f1.1498751203.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-06-29 15:53:17 +00:00
|
|
|
}
|
2016-04-26 16:39:08 +00:00
|
|
|
|
2018-07-16 19:03:37 +00:00
|
|
|
/*
|
|
|
|
* Stop remote flushes for the previous mm.
|
|
|
|
* Skip kernel threads; we never send init_mm TLB flushing IPIs,
|
|
|
|
* but the bitmap manipulation can cause cache line contention.
|
|
|
|
*/
|
|
|
|
if (real_prev != &init_mm) {
|
|
|
|
VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu,
|
|
|
|
mm_cpumask(real_prev)));
|
|
|
|
cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
|
|
|
|
}
|
x86/mm: Rework lazy TLB to track the actual loaded mm
Lazy TLB state is currently managed in a rather baroque manner.
AFAICT, there are three possible states:
- Non-lazy. This means that we're running a user thread or a
kernel thread that has called use_mm(). current->mm ==
current->active_mm == cpu_tlbstate.active_mm and
cpu_tlbstate.state == TLBSTATE_OK.
- Lazy with user mm. We're running a kernel thread without an mm
and we're borrowing an mm_struct. We have current->mm == NULL,
current->active_mm == cpu_tlbstate.active_mm, cpu_tlbstate.state
!= TLBSTATE_OK (i.e. TLBSTATE_LAZY or 0). The current cpu is set
in mm_cpumask(current->active_mm). CR3 points to
current->active_mm->pgd. The TLB is up to date.
- Lazy with init_mm. This happens when we call leave_mm(). We
have current->mm == NULL, current->active_mm ==
cpu_tlbstate.active_mm, but that mm is only relelvant insofar as
the scheduler is tracking it for refcounting. cpu_tlbstate.state
!= TLBSTATE_OK. The current cpu is clear in
mm_cpumask(current->active_mm). CR3 points to swapper_pg_dir,
i.e. init_mm->pgd.
This patch simplifies the situation. Other than perf, x86 stops
caring about current->active_mm at all. We have
cpu_tlbstate.loaded_mm pointing to the mm that CR3 references. The
TLB is always up to date for that mm. leave_mm() just switches us
to init_mm. There are no longer any special cases for mm_cpumask,
and switch_mm() switches mms without worrying about laziness.
After this patch, cpu_tlbstate.state serves only to tell the TLB
flush code whether it may switch to init_mm instead of doing a
normal flush.
This makes fairly extensive changes to xen_exit_mmap(), which used
to look a bit like black magic.
Perf is unchanged. With or without this change, perf may behave a bit
erratically if it tries to read user memory in kernel thread context.
We should build on this patch to teach perf to never look at user
memory when cpu_tlbstate.loaded_mm != current->mm.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Nadav Amit <namit@vmware.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-05-28 17:00:15 +00:00
|
|
|
|
x86/mm: Rework lazy TLB mode and TLB freshness tracking
x86's lazy TLB mode used to be fairly weak -- it would switch to
init_mm the first time it tried to flush a lazy TLB. This meant an
unnecessary CR3 write and, if the flush was remote, an unnecessary
IPI.
Rewrite it entirely. When we enter lazy mode, we simply remove the
CPU from mm_cpumask. This means that we need a way to figure out
whether we've missed a flush when we switch back out of lazy mode.
I use the tlb_gen machinery to track whether a context is up to
date.
Note to reviewers: this patch, my itself, looks a bit odd. I'm
using an array of length 1 containing (ctx_id, tlb_gen) rather than
just storing tlb_gen, and making it at array isn't necessary yet.
I'm doing this because the next few patches add PCID support, and,
with PCID, we need ctx_id, and the array will end up with a length
greater than 1. Making it an array now means that there will be
less churn and therefore less stress on your eyeballs.
NB: This is dubious but, AFAICT, still correct on Xen and UV.
xen_exit_mmap() uses mm_cpumask() for nefarious purposes and this
patch changes the way that mm_cpumask() works. This should be okay,
since Xen *also* iterates all online CPUs to find all the CPUs it
needs to twiddle.
The UV tlbflush code is rather dated and should be changed.
Here are some benchmark results, done on a Skylake laptop at 2.3 GHz
(turbo off, intel_pstate requesting max performance) under KVM with
the guest using idle=poll (to avoid artifacts when bouncing between
CPUs). I haven't done any real statistics here -- I just ran them
in a loop and picked the fastest results that didn't look like
outliers. Unpatched means commit a4eb8b993554, so all the
bookkeeping overhead is gone.
MADV_DONTNEED; touch the page; switch CPUs using sched_setaffinity. In
an unpatched kernel, MADV_DONTNEED will send an IPI to the previous CPU.
This is intended to be a nearly worst-case test.
patched: 13.4µs
unpatched: 21.6µs
Vitaly's pthread_mmap microbenchmark with 8 threads (on four cores),
nrounds = 100, 256M data
patched: 1.1 seconds or so
unpatched: 1.9 seconds or so
The sleepup on Vitaly's test appearss to be because it spends a lot
of time blocked on mmap_sem, and this patch avoids sending IPIs to
blocked CPUs.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andrew Banman <abanman@sgi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dimitri Sivanich <sivanich@sgi.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Mike Travis <travis@sgi.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/ddf2c92962339f4ba39d8fc41b853936ec0b44f1.1498751203.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-06-29 15:53:17 +00:00
|
|
|
/*
|
|
|
|
* Start remote flushes and then read tlb_gen.
|
|
|
|
*/
|
2018-07-16 19:03:37 +00:00
|
|
|
if (next != &init_mm)
|
|
|
|
cpumask_set_cpu(cpu, mm_cpumask(next));
|
x86/mm: Rework lazy TLB mode and TLB freshness tracking
x86's lazy TLB mode used to be fairly weak -- it would switch to
init_mm the first time it tried to flush a lazy TLB. This meant an
unnecessary CR3 write and, if the flush was remote, an unnecessary
IPI.
Rewrite it entirely. When we enter lazy mode, we simply remove the
CPU from mm_cpumask. This means that we need a way to figure out
whether we've missed a flush when we switch back out of lazy mode.
I use the tlb_gen machinery to track whether a context is up to
date.
Note to reviewers: this patch, my itself, looks a bit odd. I'm
using an array of length 1 containing (ctx_id, tlb_gen) rather than
just storing tlb_gen, and making it at array isn't necessary yet.
I'm doing this because the next few patches add PCID support, and,
with PCID, we need ctx_id, and the array will end up with a length
greater than 1. Making it an array now means that there will be
less churn and therefore less stress on your eyeballs.
NB: This is dubious but, AFAICT, still correct on Xen and UV.
xen_exit_mmap() uses mm_cpumask() for nefarious purposes and this
patch changes the way that mm_cpumask() works. This should be okay,
since Xen *also* iterates all online CPUs to find all the CPUs it
needs to twiddle.
The UV tlbflush code is rather dated and should be changed.
Here are some benchmark results, done on a Skylake laptop at 2.3 GHz
(turbo off, intel_pstate requesting max performance) under KVM with
the guest using idle=poll (to avoid artifacts when bouncing between
CPUs). I haven't done any real statistics here -- I just ran them
in a loop and picked the fastest results that didn't look like
outliers. Unpatched means commit a4eb8b993554, so all the
bookkeeping overhead is gone.
MADV_DONTNEED; touch the page; switch CPUs using sched_setaffinity. In
an unpatched kernel, MADV_DONTNEED will send an IPI to the previous CPU.
This is intended to be a nearly worst-case test.
patched: 13.4µs
unpatched: 21.6µs
Vitaly's pthread_mmap microbenchmark with 8 threads (on four cores),
nrounds = 100, 256M data
patched: 1.1 seconds or so
unpatched: 1.9 seconds or so
The sleepup on Vitaly's test appearss to be because it spends a lot
of time blocked on mmap_sem, and this patch avoids sending IPIs to
blocked CPUs.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andrew Banman <abanman@sgi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dimitri Sivanich <sivanich@sgi.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Mike Travis <travis@sgi.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/ddf2c92962339f4ba39d8fc41b853936ec0b44f1.1498751203.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-06-29 15:53:17 +00:00
|
|
|
next_tlb_gen = atomic64_read(&next->context.tlb_gen);
|
x86/mm: Rework lazy TLB to track the actual loaded mm
Lazy TLB state is currently managed in a rather baroque manner.
AFAICT, there are three possible states:
- Non-lazy. This means that we're running a user thread or a
kernel thread that has called use_mm(). current->mm ==
current->active_mm == cpu_tlbstate.active_mm and
cpu_tlbstate.state == TLBSTATE_OK.
- Lazy with user mm. We're running a kernel thread without an mm
and we're borrowing an mm_struct. We have current->mm == NULL,
current->active_mm == cpu_tlbstate.active_mm, cpu_tlbstate.state
!= TLBSTATE_OK (i.e. TLBSTATE_LAZY or 0). The current cpu is set
in mm_cpumask(current->active_mm). CR3 points to
current->active_mm->pgd. The TLB is up to date.
- Lazy with init_mm. This happens when we call leave_mm(). We
have current->mm == NULL, current->active_mm ==
cpu_tlbstate.active_mm, but that mm is only relelvant insofar as
the scheduler is tracking it for refcounting. cpu_tlbstate.state
!= TLBSTATE_OK. The current cpu is clear in
mm_cpumask(current->active_mm). CR3 points to swapper_pg_dir,
i.e. init_mm->pgd.
This patch simplifies the situation. Other than perf, x86 stops
caring about current->active_mm at all. We have
cpu_tlbstate.loaded_mm pointing to the mm that CR3 references. The
TLB is always up to date for that mm. leave_mm() just switches us
to init_mm. There are no longer any special cases for mm_cpumask,
and switch_mm() switches mms without worrying about laziness.
After this patch, cpu_tlbstate.state serves only to tell the TLB
flush code whether it may switch to init_mm instead of doing a
normal flush.
This makes fairly extensive changes to xen_exit_mmap(), which used
to look a bit like black magic.
Perf is unchanged. With or without this change, perf may behave a bit
erratically if it tries to read user memory in kernel thread context.
We should build on this patch to teach perf to never look at user
memory when cpu_tlbstate.loaded_mm != current->mm.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Nadav Amit <namit@vmware.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-05-28 17:00:15 +00:00
|
|
|
|
x86/mm: Implement PCID based optimization: try to preserve old TLB entries using PCID
PCID is a "process context ID" -- it's what other architectures call
an address space ID. Every non-global TLB entry is tagged with a
PCID, only TLB entries that match the currently selected PCID are
used, and we can switch PGDs without flushing the TLB. x86's
PCID is 12 bits.
This is an unorthodox approach to using PCID. x86's PCID is far too
short to uniquely identify a process, and we can't even really
uniquely identify a running process because there are monster
systems with over 4096 CPUs. To make matters worse, past attempts
to use all 12 PCID bits have resulted in slowdowns instead of
speedups.
This patch uses PCID differently. We use a PCID to identify a
recently-used mm on a per-cpu basis. An mm has no fixed PCID
binding at all; instead, we give it a fresh PCID each time it's
loaded except in cases where we want to preserve the TLB, in which
case we reuse a recent value.
Here are some benchmark results, done on a Skylake laptop at 2.3 GHz
(turbo off, intel_pstate requesting max performance) under KVM with
the guest using idle=poll (to avoid artifacts when bouncing between
CPUs). I haven't done any real statistics here -- I just ran them
in a loop and picked the fastest results that didn't look like
outliers. Unpatched means commit a4eb8b993554, so all the
bookkeeping overhead is gone.
ping-pong between two mms on the same CPU using eventfd:
patched: 1.22µs
patched, nopcid: 1.33µs
unpatched: 1.34µs
Same ping-pong, but now touch 512 pages (all zero-page to minimize
cache misses) each iteration. dTLB misses are measured by
dtlb_load_misses.miss_causes_a_walk:
patched: 1.8µs 11M dTLB misses
patched, nopcid: 6.2µs, 207M dTLB misses
unpatched: 6.1µs, 190M dTLB misses
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/9ee75f17a81770feed616358e6860d98a2a5b1e7.1500957502.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-07-25 04:41:38 +00:00
|
|
|
choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
|
x86/mm: Rework lazy TLB to track the actual loaded mm
Lazy TLB state is currently managed in a rather baroque manner.
AFAICT, there are three possible states:
- Non-lazy. This means that we're running a user thread or a
kernel thread that has called use_mm(). current->mm ==
current->active_mm == cpu_tlbstate.active_mm and
cpu_tlbstate.state == TLBSTATE_OK.
- Lazy with user mm. We're running a kernel thread without an mm
and we're borrowing an mm_struct. We have current->mm == NULL,
current->active_mm == cpu_tlbstate.active_mm, cpu_tlbstate.state
!= TLBSTATE_OK (i.e. TLBSTATE_LAZY or 0). The current cpu is set
in mm_cpumask(current->active_mm). CR3 points to
current->active_mm->pgd. The TLB is up to date.
- Lazy with init_mm. This happens when we call leave_mm(). We
have current->mm == NULL, current->active_mm ==
cpu_tlbstate.active_mm, but that mm is only relelvant insofar as
the scheduler is tracking it for refcounting. cpu_tlbstate.state
!= TLBSTATE_OK. The current cpu is clear in
mm_cpumask(current->active_mm). CR3 points to swapper_pg_dir,
i.e. init_mm->pgd.
This patch simplifies the situation. Other than perf, x86 stops
caring about current->active_mm at all. We have
cpu_tlbstate.loaded_mm pointing to the mm that CR3 references. The
TLB is always up to date for that mm. leave_mm() just switches us
to init_mm. There are no longer any special cases for mm_cpumask,
and switch_mm() switches mms without worrying about laziness.
After this patch, cpu_tlbstate.state serves only to tell the TLB
flush code whether it may switch to init_mm instead of doing a
normal flush.
This makes fairly extensive changes to xen_exit_mmap(), which used
to look a bit like black magic.
Perf is unchanged. With or without this change, perf may behave a bit
erratically if it tries to read user memory in kernel thread context.
We should build on this patch to teach perf to never look at user
memory when cpu_tlbstate.loaded_mm != current->mm.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Nadav Amit <namit@vmware.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-05-28 17:00:15 +00:00
|
|
|
|
2018-08-29 15:47:18 +00:00
|
|
|
/* Let nmi_uaccess_okay() know that we're changing CR3. */
|
|
|
|
this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
|
|
|
|
barrier();
|
2018-09-26 03:58:39 +00:00
|
|
|
}
|
2018-08-29 15:47:18 +00:00
|
|
|
|
2018-09-26 03:58:39 +00:00
|
|
|
if (need_flush) {
|
|
|
|
this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
|
|
|
|
this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
|
|
|
|
load_new_mm_cr3(next->pgd, new_asid, true);
|
x86/mm: Implement PCID based optimization: try to preserve old TLB entries using PCID
PCID is a "process context ID" -- it's what other architectures call
an address space ID. Every non-global TLB entry is tagged with a
PCID, only TLB entries that match the currently selected PCID are
used, and we can switch PGDs without flushing the TLB. x86's
PCID is 12 bits.
This is an unorthodox approach to using PCID. x86's PCID is far too
short to uniquely identify a process, and we can't even really
uniquely identify a running process because there are monster
systems with over 4096 CPUs. To make matters worse, past attempts
to use all 12 PCID bits have resulted in slowdowns instead of
speedups.
This patch uses PCID differently. We use a PCID to identify a
recently-used mm on a per-cpu basis. An mm has no fixed PCID
binding at all; instead, we give it a fresh PCID each time it's
loaded except in cases where we want to preserve the TLB, in which
case we reuse a recent value.
Here are some benchmark results, done on a Skylake laptop at 2.3 GHz
(turbo off, intel_pstate requesting max performance) under KVM with
the guest using idle=poll (to avoid artifacts when bouncing between
CPUs). I haven't done any real statistics here -- I just ran them
in a loop and picked the fastest results that didn't look like
outliers. Unpatched means commit a4eb8b993554, so all the
bookkeeping overhead is gone.
ping-pong between two mms on the same CPU using eventfd:
patched: 1.22µs
patched, nopcid: 1.33µs
unpatched: 1.34µs
Same ping-pong, but now touch 512 pages (all zero-page to minimize
cache misses) each iteration. dTLB misses are measured by
dtlb_load_misses.miss_causes_a_walk:
patched: 1.8µs 11M dTLB misses
patched, nopcid: 6.2µs, 207M dTLB misses
unpatched: 6.1µs, 190M dTLB misses
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/9ee75f17a81770feed616358e6860d98a2a5b1e7.1500957502.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-07-25 04:41:38 +00:00
|
|
|
|
2018-01-29 22:04:47 +00:00
|
|
|
/*
|
2018-09-26 03:58:39 +00:00
|
|
|
* NB: This gets called via leave_mm() in the idle path
|
|
|
|
* where RCU functions differently. Tracing normally
|
|
|
|
* uses RCU, so we need to use the _rcuidle variant.
|
|
|
|
*
|
|
|
|
* (There is no good reason for this. The idle code should
|
|
|
|
* be rearranged to call this before rcu_idle_enter().)
|
2018-01-29 22:04:47 +00:00
|
|
|
*/
|
2018-09-26 03:58:39 +00:00
|
|
|
trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
|
|
|
|
} else {
|
|
|
|
/* The new ASID is already up to date. */
|
|
|
|
load_new_mm_cr3(next->pgd, new_asid, false);
|
2018-08-29 15:47:18 +00:00
|
|
|
|
2018-09-26 03:58:39 +00:00
|
|
|
/* See above wrt _rcuidle. */
|
|
|
|
trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
|
x86/mm: Rework lazy TLB mode and TLB freshness tracking
x86's lazy TLB mode used to be fairly weak -- it would switch to
init_mm the first time it tried to flush a lazy TLB. This meant an
unnecessary CR3 write and, if the flush was remote, an unnecessary
IPI.
Rewrite it entirely. When we enter lazy mode, we simply remove the
CPU from mm_cpumask. This means that we need a way to figure out
whether we've missed a flush when we switch back out of lazy mode.
I use the tlb_gen machinery to track whether a context is up to
date.
Note to reviewers: this patch, my itself, looks a bit odd. I'm
using an array of length 1 containing (ctx_id, tlb_gen) rather than
just storing tlb_gen, and making it at array isn't necessary yet.
I'm doing this because the next few patches add PCID support, and,
with PCID, we need ctx_id, and the array will end up with a length
greater than 1. Making it an array now means that there will be
less churn and therefore less stress on your eyeballs.
NB: This is dubious but, AFAICT, still correct on Xen and UV.
xen_exit_mmap() uses mm_cpumask() for nefarious purposes and this
patch changes the way that mm_cpumask() works. This should be okay,
since Xen *also* iterates all online CPUs to find all the CPUs it
needs to twiddle.
The UV tlbflush code is rather dated and should be changed.
Here are some benchmark results, done on a Skylake laptop at 2.3 GHz
(turbo off, intel_pstate requesting max performance) under KVM with
the guest using idle=poll (to avoid artifacts when bouncing between
CPUs). I haven't done any real statistics here -- I just ran them
in a loop and picked the fastest results that didn't look like
outliers. Unpatched means commit a4eb8b993554, so all the
bookkeeping overhead is gone.
MADV_DONTNEED; touch the page; switch CPUs using sched_setaffinity. In
an unpatched kernel, MADV_DONTNEED will send an IPI to the previous CPU.
This is intended to be a nearly worst-case test.
patched: 13.4µs
unpatched: 21.6µs
Vitaly's pthread_mmap microbenchmark with 8 threads (on four cores),
nrounds = 100, 256M data
patched: 1.1 seconds or so
unpatched: 1.9 seconds or so
The sleepup on Vitaly's test appearss to be because it spends a lot
of time blocked on mmap_sem, and this patch avoids sending IPIs to
blocked CPUs.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andrew Banman <abanman@sgi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dimitri Sivanich <sivanich@sgi.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Mike Travis <travis@sgi.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/ddf2c92962339f4ba39d8fc41b853936ec0b44f1.1498751203.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-06-29 15:53:17 +00:00
|
|
|
}
|
x86/mm: Rework lazy TLB to track the actual loaded mm
Lazy TLB state is currently managed in a rather baroque manner.
AFAICT, there are three possible states:
- Non-lazy. This means that we're running a user thread or a
kernel thread that has called use_mm(). current->mm ==
current->active_mm == cpu_tlbstate.active_mm and
cpu_tlbstate.state == TLBSTATE_OK.
- Lazy with user mm. We're running a kernel thread without an mm
and we're borrowing an mm_struct. We have current->mm == NULL,
current->active_mm == cpu_tlbstate.active_mm, cpu_tlbstate.state
!= TLBSTATE_OK (i.e. TLBSTATE_LAZY or 0). The current cpu is set
in mm_cpumask(current->active_mm). CR3 points to
current->active_mm->pgd. The TLB is up to date.
- Lazy with init_mm. This happens when we call leave_mm(). We
have current->mm == NULL, current->active_mm ==
cpu_tlbstate.active_mm, but that mm is only relelvant insofar as
the scheduler is tracking it for refcounting. cpu_tlbstate.state
!= TLBSTATE_OK. The current cpu is clear in
mm_cpumask(current->active_mm). CR3 points to swapper_pg_dir,
i.e. init_mm->pgd.
This patch simplifies the situation. Other than perf, x86 stops
caring about current->active_mm at all. We have
cpu_tlbstate.loaded_mm pointing to the mm that CR3 references. The
TLB is always up to date for that mm. leave_mm() just switches us
to init_mm. There are no longer any special cases for mm_cpumask,
and switch_mm() switches mms without worrying about laziness.
After this patch, cpu_tlbstate.state serves only to tell the TLB
flush code whether it may switch to init_mm instead of doing a
normal flush.
This makes fairly extensive changes to xen_exit_mmap(), which used
to look a bit like black magic.
Perf is unchanged. With or without this change, perf may behave a bit
erratically if it tries to read user memory in kernel thread context.
We should build on this patch to teach perf to never look at user
memory when cpu_tlbstate.loaded_mm != current->mm.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Nadav Amit <namit@vmware.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-05-28 17:00:15 +00:00
|
|
|
|
2018-09-26 03:58:39 +00:00
|
|
|
/* Make sure we write CR3 before loaded_mm. */
|
|
|
|
barrier();
|
|
|
|
|
|
|
|
this_cpu_write(cpu_tlbstate.loaded_mm, next);
|
|
|
|
this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
|
|
|
|
|
2018-09-26 03:58:44 +00:00
|
|
|
if (next != real_prev) {
|
2020-04-21 09:20:30 +00:00
|
|
|
cr4_update_pce_mm(next);
|
2018-09-26 03:58:44 +00:00
|
|
|
switch_ldt(real_prev, next);
|
|
|
|
}
|
2016-04-26 16:39:08 +00:00
|
|
|
}
|
|
|
|
|
x86/mm: Flush more aggressively in lazy TLB mode
Since commit:
94b1b03b519b ("x86/mm: Rework lazy TLB mode and TLB freshness tracking")
x86's lazy TLB mode has been all the way lazy: when running a kernel thread
(including the idle thread), the kernel keeps using the last user mm's
page tables without attempting to maintain user TLB coherence at all.
From a pure semantic perspective, this is fine -- kernel threads won't
attempt to access user pages, so having stale TLB entries doesn't matter.
Unfortunately, I forgot about a subtlety. By skipping TLB flushes,
we also allow any paging-structure caches that may exist on the CPU
to become incoherent. This means that we can have a
paging-structure cache entry that references a freed page table, and
the CPU is within its rights to do a speculative page walk starting
at the freed page table.
I can imagine this causing two different problems:
- A speculative page walk starting from a bogus page table could read
IO addresses. I haven't seen any reports of this causing problems.
- A speculative page walk that involves a bogus page table can install
garbage in the TLB. Such garbage would always be at a user VA, but
some AMD CPUs have logic that triggers a machine check when it notices
these bogus entries. I've seen a couple reports of this.
Boris further explains the failure mode:
> It is actually more of an optimization which assumes that paging-structure
> entries are in WB DRAM:
>
> "TlbCacheDis: cacheable memory disable. Read-write. 0=Enables
> performance optimization that assumes PML4, PDP, PDE, and PTE entries
> are in cacheable WB-DRAM; memory type checks may be bypassed, and
> addresses outside of WB-DRAM may result in undefined behavior or NB
> protocol errors. 1=Disables performance optimization and allows PML4,
> PDP, PDE and PTE entries to be in any memory type. Operating systems
> that maintain page tables in memory types other than WB- DRAM must set
> TlbCacheDis to insure proper operation."
>
> The MCE generated is an NB protocol error to signal that
>
> "Link: A specific coherent-only packet from a CPU was issued to an
> IO link. This may be caused by software which addresses page table
> structures in a memory type other than cacheable WB-DRAM without
> properly configuring MSRC001_0015[TlbCacheDis]. This may occur, for
> example, when page table structure addresses are above top of memory. In
> such cases, the NB will generate an MCE if it sees a mismatch between
> the memory operation generated by the core and the link type."
>
> I'm assuming coherent-only packets don't go out on IO links, thus the
> error.
To fix this, reinstate TLB coherence in lazy mode. With this patch
applied, we do it in one of two ways:
- If we have PCID, we simply switch back to init_mm's page tables
when we enter a kernel thread -- this seems to be quite cheap
except for the cost of serializing the CPU.
- If we don't have PCID, then we set a flag and switch to init_mm
the first time we would otherwise need to flush the TLB.
The /sys/kernel/debug/x86/tlb_use_lazy_mode debug switch can be changed
to override the default mode for benchmarking.
In theory, we could optimize this better by only flushing the TLB in
lazy CPUs when a page table is freed. Doing that would require
auditing the mm code to make sure that all page table freeing goes
through tlb_remove_page() as well as reworking some data structures
to implement the improved flush logic.
Reported-by: Markus Trippelsdorf <markus@trippelsdorf.de>
Reported-by: Adam Borowski <kilobyte@angband.pl>
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Eric Biggers <ebiggers@google.com>
Cc: Johannes Hirte <johannes.hirte@datenkhaos.de>
Cc: Kees Cook <keescook@chromium.org>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Roman Kagan <rkagan@virtuozzo.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: 94b1b03b519b ("x86/mm: Rework lazy TLB mode and TLB freshness tracking")
Link: http://lkml.kernel.org/r/20171009170231.fkpraqokz6e4zeco@pd.tnic
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-10-09 16:50:49 +00:00
|
|
|
/*
|
2017-10-14 16:59:50 +00:00
|
|
|
* Please ignore the name of this function. It should be called
|
|
|
|
* switch_to_kernel_thread().
|
|
|
|
*
|
x86/mm: Flush more aggressively in lazy TLB mode
Since commit:
94b1b03b519b ("x86/mm: Rework lazy TLB mode and TLB freshness tracking")
x86's lazy TLB mode has been all the way lazy: when running a kernel thread
(including the idle thread), the kernel keeps using the last user mm's
page tables without attempting to maintain user TLB coherence at all.
From a pure semantic perspective, this is fine -- kernel threads won't
attempt to access user pages, so having stale TLB entries doesn't matter.
Unfortunately, I forgot about a subtlety. By skipping TLB flushes,
we also allow any paging-structure caches that may exist on the CPU
to become incoherent. This means that we can have a
paging-structure cache entry that references a freed page table, and
the CPU is within its rights to do a speculative page walk starting
at the freed page table.
I can imagine this causing two different problems:
- A speculative page walk starting from a bogus page table could read
IO addresses. I haven't seen any reports of this causing problems.
- A speculative page walk that involves a bogus page table can install
garbage in the TLB. Such garbage would always be at a user VA, but
some AMD CPUs have logic that triggers a machine check when it notices
these bogus entries. I've seen a couple reports of this.
Boris further explains the failure mode:
> It is actually more of an optimization which assumes that paging-structure
> entries are in WB DRAM:
>
> "TlbCacheDis: cacheable memory disable. Read-write. 0=Enables
> performance optimization that assumes PML4, PDP, PDE, and PTE entries
> are in cacheable WB-DRAM; memory type checks may be bypassed, and
> addresses outside of WB-DRAM may result in undefined behavior or NB
> protocol errors. 1=Disables performance optimization and allows PML4,
> PDP, PDE and PTE entries to be in any memory type. Operating systems
> that maintain page tables in memory types other than WB- DRAM must set
> TlbCacheDis to insure proper operation."
>
> The MCE generated is an NB protocol error to signal that
>
> "Link: A specific coherent-only packet from a CPU was issued to an
> IO link. This may be caused by software which addresses page table
> structures in a memory type other than cacheable WB-DRAM without
> properly configuring MSRC001_0015[TlbCacheDis]. This may occur, for
> example, when page table structure addresses are above top of memory. In
> such cases, the NB will generate an MCE if it sees a mismatch between
> the memory operation generated by the core and the link type."
>
> I'm assuming coherent-only packets don't go out on IO links, thus the
> error.
To fix this, reinstate TLB coherence in lazy mode. With this patch
applied, we do it in one of two ways:
- If we have PCID, we simply switch back to init_mm's page tables
when we enter a kernel thread -- this seems to be quite cheap
except for the cost of serializing the CPU.
- If we don't have PCID, then we set a flag and switch to init_mm
the first time we would otherwise need to flush the TLB.
The /sys/kernel/debug/x86/tlb_use_lazy_mode debug switch can be changed
to override the default mode for benchmarking.
In theory, we could optimize this better by only flushing the TLB in
lazy CPUs when a page table is freed. Doing that would require
auditing the mm code to make sure that all page table freeing goes
through tlb_remove_page() as well as reworking some data structures
to implement the improved flush logic.
Reported-by: Markus Trippelsdorf <markus@trippelsdorf.de>
Reported-by: Adam Borowski <kilobyte@angband.pl>
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Eric Biggers <ebiggers@google.com>
Cc: Johannes Hirte <johannes.hirte@datenkhaos.de>
Cc: Kees Cook <keescook@chromium.org>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Roman Kagan <rkagan@virtuozzo.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: 94b1b03b519b ("x86/mm: Rework lazy TLB mode and TLB freshness tracking")
Link: http://lkml.kernel.org/r/20171009170231.fkpraqokz6e4zeco@pd.tnic
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-10-09 16:50:49 +00:00
|
|
|
* enter_lazy_tlb() is a hint from the scheduler that we are entering a
|
|
|
|
* kernel thread or other context without an mm. Acceptable implementations
|
|
|
|
* include doing nothing whatsoever, switching to init_mm, or various clever
|
|
|
|
* lazy tricks to try to minimize TLB flushes.
|
|
|
|
*
|
|
|
|
* The scheduler reserves the right to call enter_lazy_tlb() several times
|
|
|
|
* in a row. It will notify us that we're going back to a real mm by
|
|
|
|
* calling switch_mm_irqs_off().
|
|
|
|
*/
|
|
|
|
void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
|
|
|
|
{
|
|
|
|
if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
|
|
|
|
return;
|
|
|
|
|
2018-09-26 03:58:38 +00:00
|
|
|
this_cpu_write(cpu_tlbstate.is_lazy, true);
|
x86/mm: Flush more aggressively in lazy TLB mode
Since commit:
94b1b03b519b ("x86/mm: Rework lazy TLB mode and TLB freshness tracking")
x86's lazy TLB mode has been all the way lazy: when running a kernel thread
(including the idle thread), the kernel keeps using the last user mm's
page tables without attempting to maintain user TLB coherence at all.
From a pure semantic perspective, this is fine -- kernel threads won't
attempt to access user pages, so having stale TLB entries doesn't matter.
Unfortunately, I forgot about a subtlety. By skipping TLB flushes,
we also allow any paging-structure caches that may exist on the CPU
to become incoherent. This means that we can have a
paging-structure cache entry that references a freed page table, and
the CPU is within its rights to do a speculative page walk starting
at the freed page table.
I can imagine this causing two different problems:
- A speculative page walk starting from a bogus page table could read
IO addresses. I haven't seen any reports of this causing problems.
- A speculative page walk that involves a bogus page table can install
garbage in the TLB. Such garbage would always be at a user VA, but
some AMD CPUs have logic that triggers a machine check when it notices
these bogus entries. I've seen a couple reports of this.
Boris further explains the failure mode:
> It is actually more of an optimization which assumes that paging-structure
> entries are in WB DRAM:
>
> "TlbCacheDis: cacheable memory disable. Read-write. 0=Enables
> performance optimization that assumes PML4, PDP, PDE, and PTE entries
> are in cacheable WB-DRAM; memory type checks may be bypassed, and
> addresses outside of WB-DRAM may result in undefined behavior or NB
> protocol errors. 1=Disables performance optimization and allows PML4,
> PDP, PDE and PTE entries to be in any memory type. Operating systems
> that maintain page tables in memory types other than WB- DRAM must set
> TlbCacheDis to insure proper operation."
>
> The MCE generated is an NB protocol error to signal that
>
> "Link: A specific coherent-only packet from a CPU was issued to an
> IO link. This may be caused by software which addresses page table
> structures in a memory type other than cacheable WB-DRAM without
> properly configuring MSRC001_0015[TlbCacheDis]. This may occur, for
> example, when page table structure addresses are above top of memory. In
> such cases, the NB will generate an MCE if it sees a mismatch between
> the memory operation generated by the core and the link type."
>
> I'm assuming coherent-only packets don't go out on IO links, thus the
> error.
To fix this, reinstate TLB coherence in lazy mode. With this patch
applied, we do it in one of two ways:
- If we have PCID, we simply switch back to init_mm's page tables
when we enter a kernel thread -- this seems to be quite cheap
except for the cost of serializing the CPU.
- If we don't have PCID, then we set a flag and switch to init_mm
the first time we would otherwise need to flush the TLB.
The /sys/kernel/debug/x86/tlb_use_lazy_mode debug switch can be changed
to override the default mode for benchmarking.
In theory, we could optimize this better by only flushing the TLB in
lazy CPUs when a page table is freed. Doing that would require
auditing the mm code to make sure that all page table freeing goes
through tlb_remove_page() as well as reworking some data structures
to implement the improved flush logic.
Reported-by: Markus Trippelsdorf <markus@trippelsdorf.de>
Reported-by: Adam Borowski <kilobyte@angband.pl>
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Eric Biggers <ebiggers@google.com>
Cc: Johannes Hirte <johannes.hirte@datenkhaos.de>
Cc: Kees Cook <keescook@chromium.org>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Roman Kagan <rkagan@virtuozzo.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: 94b1b03b519b ("x86/mm: Rework lazy TLB mode and TLB freshness tracking")
Link: http://lkml.kernel.org/r/20171009170231.fkpraqokz6e4zeco@pd.tnic
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-10-09 16:50:49 +00:00
|
|
|
}
|
|
|
|
|
2017-09-07 02:54:53 +00:00
|
|
|
/*
|
|
|
|
* Call this when reinitializing a CPU. It fixes the following potential
|
|
|
|
* problems:
|
|
|
|
*
|
|
|
|
* - The ASID changed from what cpu_tlbstate thinks it is (most likely
|
|
|
|
* because the CPU was taken down and came back up with CR3's PCID
|
|
|
|
* bits clear. CPU hotplug can do this.
|
|
|
|
*
|
|
|
|
* - The TLB contains junk in slots corresponding to inactive ASIDs.
|
|
|
|
*
|
|
|
|
* - The CPU went so far out to lunch that it may have missed a TLB
|
|
|
|
* flush.
|
|
|
|
*/
|
|
|
|
void initialize_tlbstate_and_flush(void)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
struct mm_struct *mm = this_cpu_read(cpu_tlbstate.loaded_mm);
|
|
|
|
u64 tlb_gen = atomic64_read(&init_mm.context.tlb_gen);
|
|
|
|
unsigned long cr3 = __read_cr3();
|
|
|
|
|
|
|
|
/* Assert that CR3 already references the right mm. */
|
|
|
|
WARN_ON((cr3 & CR3_ADDR_MASK) != __pa(mm->pgd));
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Assert that CR4.PCIDE is set if needed. (CR4.PCIDE initialization
|
|
|
|
* doesn't work like other CR4 bits because it can only be set from
|
|
|
|
* long mode.)
|
|
|
|
*/
|
2017-09-10 15:52:58 +00:00
|
|
|
WARN_ON(boot_cpu_has(X86_FEATURE_PCID) &&
|
2017-09-07 02:54:53 +00:00
|
|
|
!(cr4_read_shadow() & X86_CR4_PCIDE));
|
|
|
|
|
|
|
|
/* Force ASID 0 and force a TLB flush. */
|
2017-12-04 14:07:54 +00:00
|
|
|
write_cr3(build_cr3(mm->pgd, 0));
|
2017-09-07 02:54:53 +00:00
|
|
|
|
|
|
|
/* Reinitialize tlbstate. */
|
2018-11-25 18:33:49 +00:00
|
|
|
this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, LAST_USER_MM_IBPB);
|
2017-09-07 02:54:53 +00:00
|
|
|
this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0);
|
|
|
|
this_cpu_write(cpu_tlbstate.next_asid, 1);
|
|
|
|
this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id);
|
|
|
|
this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, tlb_gen);
|
|
|
|
|
|
|
|
for (i = 1; i < TLB_NR_DYN_ASIDS; i++)
|
|
|
|
this_cpu_write(cpu_tlbstate.ctxs[i].ctx_id, 0);
|
|
|
|
}
|
|
|
|
|
2017-06-29 15:53:16 +00:00
|
|
|
/*
|
|
|
|
* flush_tlb_func_common()'s memory ordering requirement is that any
|
|
|
|
* TLB fills that happen after we flush the TLB are ordered after we
|
|
|
|
* read active_mm's tlb_gen. We don't need any explicit barriers
|
|
|
|
* because all x86 flush operations are serializing and the
|
|
|
|
* atomic64_read operation won't be reordered by the compiler.
|
|
|
|
*/
|
2017-05-28 17:00:12 +00:00
|
|
|
static void flush_tlb_func_common(const struct flush_tlb_info *f,
|
|
|
|
bool local, enum tlb_flush_reason reason)
|
2008-03-03 17:12:54 +00:00
|
|
|
{
|
2017-06-29 15:53:16 +00:00
|
|
|
/*
|
|
|
|
* We have three different tlb_gen values in here. They are:
|
|
|
|
*
|
|
|
|
* - mm_tlb_gen: the latest generation.
|
|
|
|
* - local_tlb_gen: the generation that this CPU has already caught
|
|
|
|
* up to.
|
|
|
|
* - f->new_tlb_gen: the generation that the requester of the flush
|
|
|
|
* wants us to catch up to.
|
|
|
|
*/
|
|
|
|
struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
|
x86/mm: Implement PCID based optimization: try to preserve old TLB entries using PCID
PCID is a "process context ID" -- it's what other architectures call
an address space ID. Every non-global TLB entry is tagged with a
PCID, only TLB entries that match the currently selected PCID are
used, and we can switch PGDs without flushing the TLB. x86's
PCID is 12 bits.
This is an unorthodox approach to using PCID. x86's PCID is far too
short to uniquely identify a process, and we can't even really
uniquely identify a running process because there are monster
systems with over 4096 CPUs. To make matters worse, past attempts
to use all 12 PCID bits have resulted in slowdowns instead of
speedups.
This patch uses PCID differently. We use a PCID to identify a
recently-used mm on a per-cpu basis. An mm has no fixed PCID
binding at all; instead, we give it a fresh PCID each time it's
loaded except in cases where we want to preserve the TLB, in which
case we reuse a recent value.
Here are some benchmark results, done on a Skylake laptop at 2.3 GHz
(turbo off, intel_pstate requesting max performance) under KVM with
the guest using idle=poll (to avoid artifacts when bouncing between
CPUs). I haven't done any real statistics here -- I just ran them
in a loop and picked the fastest results that didn't look like
outliers. Unpatched means commit a4eb8b993554, so all the
bookkeeping overhead is gone.
ping-pong between two mms on the same CPU using eventfd:
patched: 1.22µs
patched, nopcid: 1.33µs
unpatched: 1.34µs
Same ping-pong, but now touch 512 pages (all zero-page to minimize
cache misses) each iteration. dTLB misses are measured by
dtlb_load_misses.miss_causes_a_walk:
patched: 1.8µs 11M dTLB misses
patched, nopcid: 6.2µs, 207M dTLB misses
unpatched: 6.1µs, 190M dTLB misses
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/9ee75f17a81770feed616358e6860d98a2a5b1e7.1500957502.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-07-25 04:41:38 +00:00
|
|
|
u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
|
2017-06-29 15:53:16 +00:00
|
|
|
u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen);
|
x86/mm: Implement PCID based optimization: try to preserve old TLB entries using PCID
PCID is a "process context ID" -- it's what other architectures call
an address space ID. Every non-global TLB entry is tagged with a
PCID, only TLB entries that match the currently selected PCID are
used, and we can switch PGDs without flushing the TLB. x86's
PCID is 12 bits.
This is an unorthodox approach to using PCID. x86's PCID is far too
short to uniquely identify a process, and we can't even really
uniquely identify a running process because there are monster
systems with over 4096 CPUs. To make matters worse, past attempts
to use all 12 PCID bits have resulted in slowdowns instead of
speedups.
This patch uses PCID differently. We use a PCID to identify a
recently-used mm on a per-cpu basis. An mm has no fixed PCID
binding at all; instead, we give it a fresh PCID each time it's
loaded except in cases where we want to preserve the TLB, in which
case we reuse a recent value.
Here are some benchmark results, done on a Skylake laptop at 2.3 GHz
(turbo off, intel_pstate requesting max performance) under KVM with
the guest using idle=poll (to avoid artifacts when bouncing between
CPUs). I haven't done any real statistics here -- I just ran them
in a loop and picked the fastest results that didn't look like
outliers. Unpatched means commit a4eb8b993554, so all the
bookkeeping overhead is gone.
ping-pong between two mms on the same CPU using eventfd:
patched: 1.22µs
patched, nopcid: 1.33µs
unpatched: 1.34µs
Same ping-pong, but now touch 512 pages (all zero-page to minimize
cache misses) each iteration. dTLB misses are measured by
dtlb_load_misses.miss_causes_a_walk:
patched: 1.8µs 11M dTLB misses
patched, nopcid: 6.2µs, 207M dTLB misses
unpatched: 6.1µs, 190M dTLB misses
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/9ee75f17a81770feed616358e6860d98a2a5b1e7.1500957502.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-07-25 04:41:38 +00:00
|
|
|
u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
|
2017-06-29 15:53:16 +00:00
|
|
|
|
2017-06-29 15:53:13 +00:00
|
|
|
/* This code cannot presently handle being reentered. */
|
|
|
|
VM_WARN_ON(!irqs_disabled());
|
|
|
|
|
x86/mm: Flush more aggressively in lazy TLB mode
Since commit:
94b1b03b519b ("x86/mm: Rework lazy TLB mode and TLB freshness tracking")
x86's lazy TLB mode has been all the way lazy: when running a kernel thread
(including the idle thread), the kernel keeps using the last user mm's
page tables without attempting to maintain user TLB coherence at all.
From a pure semantic perspective, this is fine -- kernel threads won't
attempt to access user pages, so having stale TLB entries doesn't matter.
Unfortunately, I forgot about a subtlety. By skipping TLB flushes,
we also allow any paging-structure caches that may exist on the CPU
to become incoherent. This means that we can have a
paging-structure cache entry that references a freed page table, and
the CPU is within its rights to do a speculative page walk starting
at the freed page table.
I can imagine this causing two different problems:
- A speculative page walk starting from a bogus page table could read
IO addresses. I haven't seen any reports of this causing problems.
- A speculative page walk that involves a bogus page table can install
garbage in the TLB. Such garbage would always be at a user VA, but
some AMD CPUs have logic that triggers a machine check when it notices
these bogus entries. I've seen a couple reports of this.
Boris further explains the failure mode:
> It is actually more of an optimization which assumes that paging-structure
> entries are in WB DRAM:
>
> "TlbCacheDis: cacheable memory disable. Read-write. 0=Enables
> performance optimization that assumes PML4, PDP, PDE, and PTE entries
> are in cacheable WB-DRAM; memory type checks may be bypassed, and
> addresses outside of WB-DRAM may result in undefined behavior or NB
> protocol errors. 1=Disables performance optimization and allows PML4,
> PDP, PDE and PTE entries to be in any memory type. Operating systems
> that maintain page tables in memory types other than WB- DRAM must set
> TlbCacheDis to insure proper operation."
>
> The MCE generated is an NB protocol error to signal that
>
> "Link: A specific coherent-only packet from a CPU was issued to an
> IO link. This may be caused by software which addresses page table
> structures in a memory type other than cacheable WB-DRAM without
> properly configuring MSRC001_0015[TlbCacheDis]. This may occur, for
> example, when page table structure addresses are above top of memory. In
> such cases, the NB will generate an MCE if it sees a mismatch between
> the memory operation generated by the core and the link type."
>
> I'm assuming coherent-only packets don't go out on IO links, thus the
> error.
To fix this, reinstate TLB coherence in lazy mode. With this patch
applied, we do it in one of two ways:
- If we have PCID, we simply switch back to init_mm's page tables
when we enter a kernel thread -- this seems to be quite cheap
except for the cost of serializing the CPU.
- If we don't have PCID, then we set a flag and switch to init_mm
the first time we would otherwise need to flush the TLB.
The /sys/kernel/debug/x86/tlb_use_lazy_mode debug switch can be changed
to override the default mode for benchmarking.
In theory, we could optimize this better by only flushing the TLB in
lazy CPUs when a page table is freed. Doing that would require
auditing the mm code to make sure that all page table freeing goes
through tlb_remove_page() as well as reworking some data structures
to implement the improved flush logic.
Reported-by: Markus Trippelsdorf <markus@trippelsdorf.de>
Reported-by: Adam Borowski <kilobyte@angband.pl>
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Eric Biggers <ebiggers@google.com>
Cc: Johannes Hirte <johannes.hirte@datenkhaos.de>
Cc: Kees Cook <keescook@chromium.org>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Roman Kagan <rkagan@virtuozzo.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: 94b1b03b519b ("x86/mm: Rework lazy TLB mode and TLB freshness tracking")
Link: http://lkml.kernel.org/r/20171009170231.fkpraqokz6e4zeco@pd.tnic
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-10-09 16:50:49 +00:00
|
|
|
if (unlikely(loaded_mm == &init_mm))
|
|
|
|
return;
|
|
|
|
|
x86/mm: Implement PCID based optimization: try to preserve old TLB entries using PCID
PCID is a "process context ID" -- it's what other architectures call
an address space ID. Every non-global TLB entry is tagged with a
PCID, only TLB entries that match the currently selected PCID are
used, and we can switch PGDs without flushing the TLB. x86's
PCID is 12 bits.
This is an unorthodox approach to using PCID. x86's PCID is far too
short to uniquely identify a process, and we can't even really
uniquely identify a running process because there are monster
systems with over 4096 CPUs. To make matters worse, past attempts
to use all 12 PCID bits have resulted in slowdowns instead of
speedups.
This patch uses PCID differently. We use a PCID to identify a
recently-used mm on a per-cpu basis. An mm has no fixed PCID
binding at all; instead, we give it a fresh PCID each time it's
loaded except in cases where we want to preserve the TLB, in which
case we reuse a recent value.
Here are some benchmark results, done on a Skylake laptop at 2.3 GHz
(turbo off, intel_pstate requesting max performance) under KVM with
the guest using idle=poll (to avoid artifacts when bouncing between
CPUs). I haven't done any real statistics here -- I just ran them
in a loop and picked the fastest results that didn't look like
outliers. Unpatched means commit a4eb8b993554, so all the
bookkeeping overhead is gone.
ping-pong between two mms on the same CPU using eventfd:
patched: 1.22µs
patched, nopcid: 1.33µs
unpatched: 1.34µs
Same ping-pong, but now touch 512 pages (all zero-page to minimize
cache misses) each iteration. dTLB misses are measured by
dtlb_load_misses.miss_causes_a_walk:
patched: 1.8µs 11M dTLB misses
patched, nopcid: 6.2µs, 207M dTLB misses
unpatched: 6.1µs, 190M dTLB misses
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/9ee75f17a81770feed616358e6860d98a2a5b1e7.1500957502.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-07-25 04:41:38 +00:00
|
|
|
VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=
|
2017-06-29 15:53:16 +00:00
|
|
|
loaded_mm->context.ctx_id);
|
|
|
|
|
x86/mm: Flush more aggressively in lazy TLB mode
Since commit:
94b1b03b519b ("x86/mm: Rework lazy TLB mode and TLB freshness tracking")
x86's lazy TLB mode has been all the way lazy: when running a kernel thread
(including the idle thread), the kernel keeps using the last user mm's
page tables without attempting to maintain user TLB coherence at all.
From a pure semantic perspective, this is fine -- kernel threads won't
attempt to access user pages, so having stale TLB entries doesn't matter.
Unfortunately, I forgot about a subtlety. By skipping TLB flushes,
we also allow any paging-structure caches that may exist on the CPU
to become incoherent. This means that we can have a
paging-structure cache entry that references a freed page table, and
the CPU is within its rights to do a speculative page walk starting
at the freed page table.
I can imagine this causing two different problems:
- A speculative page walk starting from a bogus page table could read
IO addresses. I haven't seen any reports of this causing problems.
- A speculative page walk that involves a bogus page table can install
garbage in the TLB. Such garbage would always be at a user VA, but
some AMD CPUs have logic that triggers a machine check when it notices
these bogus entries. I've seen a couple reports of this.
Boris further explains the failure mode:
> It is actually more of an optimization which assumes that paging-structure
> entries are in WB DRAM:
>
> "TlbCacheDis: cacheable memory disable. Read-write. 0=Enables
> performance optimization that assumes PML4, PDP, PDE, and PTE entries
> are in cacheable WB-DRAM; memory type checks may be bypassed, and
> addresses outside of WB-DRAM may result in undefined behavior or NB
> protocol errors. 1=Disables performance optimization and allows PML4,
> PDP, PDE and PTE entries to be in any memory type. Operating systems
> that maintain page tables in memory types other than WB- DRAM must set
> TlbCacheDis to insure proper operation."
>
> The MCE generated is an NB protocol error to signal that
>
> "Link: A specific coherent-only packet from a CPU was issued to an
> IO link. This may be caused by software which addresses page table
> structures in a memory type other than cacheable WB-DRAM without
> properly configuring MSRC001_0015[TlbCacheDis]. This may occur, for
> example, when page table structure addresses are above top of memory. In
> such cases, the NB will generate an MCE if it sees a mismatch between
> the memory operation generated by the core and the link type."
>
> I'm assuming coherent-only packets don't go out on IO links, thus the
> error.
To fix this, reinstate TLB coherence in lazy mode. With this patch
applied, we do it in one of two ways:
- If we have PCID, we simply switch back to init_mm's page tables
when we enter a kernel thread -- this seems to be quite cheap
except for the cost of serializing the CPU.
- If we don't have PCID, then we set a flag and switch to init_mm
the first time we would otherwise need to flush the TLB.
The /sys/kernel/debug/x86/tlb_use_lazy_mode debug switch can be changed
to override the default mode for benchmarking.
In theory, we could optimize this better by only flushing the TLB in
lazy CPUs when a page table is freed. Doing that would require
auditing the mm code to make sure that all page table freeing goes
through tlb_remove_page() as well as reworking some data structures
to implement the improved flush logic.
Reported-by: Markus Trippelsdorf <markus@trippelsdorf.de>
Reported-by: Adam Borowski <kilobyte@angband.pl>
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Eric Biggers <ebiggers@google.com>
Cc: Johannes Hirte <johannes.hirte@datenkhaos.de>
Cc: Kees Cook <keescook@chromium.org>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Roman Kagan <rkagan@virtuozzo.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: 94b1b03b519b ("x86/mm: Rework lazy TLB mode and TLB freshness tracking")
Link: http://lkml.kernel.org/r/20171009170231.fkpraqokz6e4zeco@pd.tnic
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-10-09 16:50:49 +00:00
|
|
|
if (this_cpu_read(cpu_tlbstate.is_lazy)) {
|
2017-06-29 15:53:16 +00:00
|
|
|
/*
|
x86/mm: Flush more aggressively in lazy TLB mode
Since commit:
94b1b03b519b ("x86/mm: Rework lazy TLB mode and TLB freshness tracking")
x86's lazy TLB mode has been all the way lazy: when running a kernel thread
(including the idle thread), the kernel keeps using the last user mm's
page tables without attempting to maintain user TLB coherence at all.
From a pure semantic perspective, this is fine -- kernel threads won't
attempt to access user pages, so having stale TLB entries doesn't matter.
Unfortunately, I forgot about a subtlety. By skipping TLB flushes,
we also allow any paging-structure caches that may exist on the CPU
to become incoherent. This means that we can have a
paging-structure cache entry that references a freed page table, and
the CPU is within its rights to do a speculative page walk starting
at the freed page table.
I can imagine this causing two different problems:
- A speculative page walk starting from a bogus page table could read
IO addresses. I haven't seen any reports of this causing problems.
- A speculative page walk that involves a bogus page table can install
garbage in the TLB. Such garbage would always be at a user VA, but
some AMD CPUs have logic that triggers a machine check when it notices
these bogus entries. I've seen a couple reports of this.
Boris further explains the failure mode:
> It is actually more of an optimization which assumes that paging-structure
> entries are in WB DRAM:
>
> "TlbCacheDis: cacheable memory disable. Read-write. 0=Enables
> performance optimization that assumes PML4, PDP, PDE, and PTE entries
> are in cacheable WB-DRAM; memory type checks may be bypassed, and
> addresses outside of WB-DRAM may result in undefined behavior or NB
> protocol errors. 1=Disables performance optimization and allows PML4,
> PDP, PDE and PTE entries to be in any memory type. Operating systems
> that maintain page tables in memory types other than WB- DRAM must set
> TlbCacheDis to insure proper operation."
>
> The MCE generated is an NB protocol error to signal that
>
> "Link: A specific coherent-only packet from a CPU was issued to an
> IO link. This may be caused by software which addresses page table
> structures in a memory type other than cacheable WB-DRAM without
> properly configuring MSRC001_0015[TlbCacheDis]. This may occur, for
> example, when page table structure addresses are above top of memory. In
> such cases, the NB will generate an MCE if it sees a mismatch between
> the memory operation generated by the core and the link type."
>
> I'm assuming coherent-only packets don't go out on IO links, thus the
> error.
To fix this, reinstate TLB coherence in lazy mode. With this patch
applied, we do it in one of two ways:
- If we have PCID, we simply switch back to init_mm's page tables
when we enter a kernel thread -- this seems to be quite cheap
except for the cost of serializing the CPU.
- If we don't have PCID, then we set a flag and switch to init_mm
the first time we would otherwise need to flush the TLB.
The /sys/kernel/debug/x86/tlb_use_lazy_mode debug switch can be changed
to override the default mode for benchmarking.
In theory, we could optimize this better by only flushing the TLB in
lazy CPUs when a page table is freed. Doing that would require
auditing the mm code to make sure that all page table freeing goes
through tlb_remove_page() as well as reworking some data structures
to implement the improved flush logic.
Reported-by: Markus Trippelsdorf <markus@trippelsdorf.de>
Reported-by: Adam Borowski <kilobyte@angband.pl>
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Eric Biggers <ebiggers@google.com>
Cc: Johannes Hirte <johannes.hirte@datenkhaos.de>
Cc: Kees Cook <keescook@chromium.org>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Roman Kagan <rkagan@virtuozzo.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: 94b1b03b519b ("x86/mm: Rework lazy TLB mode and TLB freshness tracking")
Link: http://lkml.kernel.org/r/20171009170231.fkpraqokz6e4zeco@pd.tnic
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-10-09 16:50:49 +00:00
|
|
|
* We're in lazy mode. We need to at least flush our
|
|
|
|
* paging-structure cache to avoid speculatively reading
|
|
|
|
* garbage into our TLB. Since switching to init_mm is barely
|
|
|
|
* slower than a minimal flush, just switch to init_mm.
|
2018-09-26 03:58:44 +00:00
|
|
|
*
|
|
|
|
* This should be rare, with native_flush_tlb_others skipping
|
|
|
|
* IPIs to lazy TLB mode CPUs.
|
2017-06-29 15:53:16 +00:00
|
|
|
*/
|
x86/mm: Flush more aggressively in lazy TLB mode
Since commit:
94b1b03b519b ("x86/mm: Rework lazy TLB mode and TLB freshness tracking")
x86's lazy TLB mode has been all the way lazy: when running a kernel thread
(including the idle thread), the kernel keeps using the last user mm's
page tables without attempting to maintain user TLB coherence at all.
From a pure semantic perspective, this is fine -- kernel threads won't
attempt to access user pages, so having stale TLB entries doesn't matter.
Unfortunately, I forgot about a subtlety. By skipping TLB flushes,
we also allow any paging-structure caches that may exist on the CPU
to become incoherent. This means that we can have a
paging-structure cache entry that references a freed page table, and
the CPU is within its rights to do a speculative page walk starting
at the freed page table.
I can imagine this causing two different problems:
- A speculative page walk starting from a bogus page table could read
IO addresses. I haven't seen any reports of this causing problems.
- A speculative page walk that involves a bogus page table can install
garbage in the TLB. Such garbage would always be at a user VA, but
some AMD CPUs have logic that triggers a machine check when it notices
these bogus entries. I've seen a couple reports of this.
Boris further explains the failure mode:
> It is actually more of an optimization which assumes that paging-structure
> entries are in WB DRAM:
>
> "TlbCacheDis: cacheable memory disable. Read-write. 0=Enables
> performance optimization that assumes PML4, PDP, PDE, and PTE entries
> are in cacheable WB-DRAM; memory type checks may be bypassed, and
> addresses outside of WB-DRAM may result in undefined behavior or NB
> protocol errors. 1=Disables performance optimization and allows PML4,
> PDP, PDE and PTE entries to be in any memory type. Operating systems
> that maintain page tables in memory types other than WB- DRAM must set
> TlbCacheDis to insure proper operation."
>
> The MCE generated is an NB protocol error to signal that
>
> "Link: A specific coherent-only packet from a CPU was issued to an
> IO link. This may be caused by software which addresses page table
> structures in a memory type other than cacheable WB-DRAM without
> properly configuring MSRC001_0015[TlbCacheDis]. This may occur, for
> example, when page table structure addresses are above top of memory. In
> such cases, the NB will generate an MCE if it sees a mismatch between
> the memory operation generated by the core and the link type."
>
> I'm assuming coherent-only packets don't go out on IO links, thus the
> error.
To fix this, reinstate TLB coherence in lazy mode. With this patch
applied, we do it in one of two ways:
- If we have PCID, we simply switch back to init_mm's page tables
when we enter a kernel thread -- this seems to be quite cheap
except for the cost of serializing the CPU.
- If we don't have PCID, then we set a flag and switch to init_mm
the first time we would otherwise need to flush the TLB.
The /sys/kernel/debug/x86/tlb_use_lazy_mode debug switch can be changed
to override the default mode for benchmarking.
In theory, we could optimize this better by only flushing the TLB in
lazy CPUs when a page table is freed. Doing that would require
auditing the mm code to make sure that all page table freeing goes
through tlb_remove_page() as well as reworking some data structures
to implement the improved flush logic.
Reported-by: Markus Trippelsdorf <markus@trippelsdorf.de>
Reported-by: Adam Borowski <kilobyte@angband.pl>
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Eric Biggers <ebiggers@google.com>
Cc: Johannes Hirte <johannes.hirte@datenkhaos.de>
Cc: Kees Cook <keescook@chromium.org>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Roman Kagan <rkagan@virtuozzo.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: 94b1b03b519b ("x86/mm: Rework lazy TLB mode and TLB freshness tracking")
Link: http://lkml.kernel.org/r/20171009170231.fkpraqokz6e4zeco@pd.tnic
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-10-09 16:50:49 +00:00
|
|
|
switch_mm_irqs_off(NULL, &init_mm, NULL);
|
2017-05-22 22:30:02 +00:00
|
|
|
return;
|
|
|
|
}
|
2008-03-03 17:12:54 +00:00
|
|
|
|
2017-06-29 15:53:16 +00:00
|
|
|
if (unlikely(local_tlb_gen == mm_tlb_gen)) {
|
|
|
|
/*
|
|
|
|
* There's nothing to do: we're already up to date. This can
|
|
|
|
* happen if two concurrent flushes happen -- the first flush to
|
|
|
|
* be handled can catch us all the way up, leaving no work for
|
|
|
|
* the second flush.
|
|
|
|
*/
|
x86/mm: Rework lazy TLB mode and TLB freshness tracking
x86's lazy TLB mode used to be fairly weak -- it would switch to
init_mm the first time it tried to flush a lazy TLB. This meant an
unnecessary CR3 write and, if the flush was remote, an unnecessary
IPI.
Rewrite it entirely. When we enter lazy mode, we simply remove the
CPU from mm_cpumask. This means that we need a way to figure out
whether we've missed a flush when we switch back out of lazy mode.
I use the tlb_gen machinery to track whether a context is up to
date.
Note to reviewers: this patch, my itself, looks a bit odd. I'm
using an array of length 1 containing (ctx_id, tlb_gen) rather than
just storing tlb_gen, and making it at array isn't necessary yet.
I'm doing this because the next few patches add PCID support, and,
with PCID, we need ctx_id, and the array will end up with a length
greater than 1. Making it an array now means that there will be
less churn and therefore less stress on your eyeballs.
NB: This is dubious but, AFAICT, still correct on Xen and UV.
xen_exit_mmap() uses mm_cpumask() for nefarious purposes and this
patch changes the way that mm_cpumask() works. This should be okay,
since Xen *also* iterates all online CPUs to find all the CPUs it
needs to twiddle.
The UV tlbflush code is rather dated and should be changed.
Here are some benchmark results, done on a Skylake laptop at 2.3 GHz
(turbo off, intel_pstate requesting max performance) under KVM with
the guest using idle=poll (to avoid artifacts when bouncing between
CPUs). I haven't done any real statistics here -- I just ran them
in a loop and picked the fastest results that didn't look like
outliers. Unpatched means commit a4eb8b993554, so all the
bookkeeping overhead is gone.
MADV_DONTNEED; touch the page; switch CPUs using sched_setaffinity. In
an unpatched kernel, MADV_DONTNEED will send an IPI to the previous CPU.
This is intended to be a nearly worst-case test.
patched: 13.4µs
unpatched: 21.6µs
Vitaly's pthread_mmap microbenchmark with 8 threads (on four cores),
nrounds = 100, 256M data
patched: 1.1 seconds or so
unpatched: 1.9 seconds or so
The sleepup on Vitaly's test appearss to be because it spends a lot
of time blocked on mmap_sem, and this patch avoids sending IPIs to
blocked CPUs.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andrew Banman <abanman@sgi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dimitri Sivanich <sivanich@sgi.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Mike Travis <travis@sgi.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/ddf2c92962339f4ba39d8fc41b853936ec0b44f1.1498751203.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-06-29 15:53:17 +00:00
|
|
|
trace_tlb_flush(reason, 0);
|
2017-06-29 15:53:16 +00:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
WARN_ON_ONCE(local_tlb_gen > mm_tlb_gen);
|
|
|
|
WARN_ON_ONCE(f->new_tlb_gen > mm_tlb_gen);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we get to this point, we know that our TLB is out of date.
|
|
|
|
* This does not strictly imply that we need to flush (it's
|
|
|
|
* possible that f->new_tlb_gen <= local_tlb_gen), but we're
|
|
|
|
* going to need to flush in the very near future, so we might
|
|
|
|
* as well get it over with.
|
|
|
|
*
|
|
|
|
* The only question is whether to do a full or partial flush.
|
|
|
|
*
|
|
|
|
* We do a partial flush if requested and two extra conditions
|
|
|
|
* are met:
|
|
|
|
*
|
|
|
|
* 1. f->new_tlb_gen == local_tlb_gen + 1. We have an invariant that
|
|
|
|
* we've always done all needed flushes to catch up to
|
|
|
|
* local_tlb_gen. If, for example, local_tlb_gen == 2 and
|
|
|
|
* f->new_tlb_gen == 3, then we know that the flush needed to bring
|
|
|
|
* us up to date for tlb_gen 3 is the partial flush we're
|
|
|
|
* processing.
|
|
|
|
*
|
|
|
|
* As an example of why this check is needed, suppose that there
|
|
|
|
* are two concurrent flushes. The first is a full flush that
|
|
|
|
* changes context.tlb_gen from 1 to 2. The second is a partial
|
|
|
|
* flush that changes context.tlb_gen from 2 to 3. If they get
|
|
|
|
* processed on this CPU in reverse order, we'll see
|
|
|
|
* local_tlb_gen == 1, mm_tlb_gen == 3, and end != TLB_FLUSH_ALL.
|
2018-01-31 16:03:10 +00:00
|
|
|
* If we were to use __flush_tlb_one_user() and set local_tlb_gen to
|
2017-06-29 15:53:16 +00:00
|
|
|
* 3, we'd be break the invariant: we'd update local_tlb_gen above
|
|
|
|
* 1 without the full flush that's needed for tlb_gen 2.
|
|
|
|
*
|
|
|
|
* 2. f->new_tlb_gen == mm_tlb_gen. This is purely an optimiation.
|
|
|
|
* Partial TLB flushes are not all that much cheaper than full TLB
|
|
|
|
* flushes, so it seems unlikely that it would be a performance win
|
|
|
|
* to do a partial flush if that won't bring our TLB fully up to
|
|
|
|
* date. By doing a full flush instead, we can increase
|
|
|
|
* local_tlb_gen all the way to mm_tlb_gen and we can probably
|
|
|
|
* avoid another flush in the very near future.
|
|
|
|
*/
|
|
|
|
if (f->end != TLB_FLUSH_ALL &&
|
|
|
|
f->new_tlb_gen == local_tlb_gen + 1 &&
|
|
|
|
f->new_tlb_gen == mm_tlb_gen) {
|
|
|
|
/* Partial flush */
|
2018-08-26 10:56:48 +00:00
|
|
|
unsigned long nr_invalidate = (f->end - f->start) >> f->stride_shift;
|
|
|
|
unsigned long addr = f->start;
|
2017-06-29 15:53:16 +00:00
|
|
|
|
2017-05-28 17:00:10 +00:00
|
|
|
while (addr < f->end) {
|
2020-04-21 09:20:34 +00:00
|
|
|
flush_tlb_one_user(addr);
|
2018-08-26 10:56:48 +00:00
|
|
|
addr += 1UL << f->stride_shift;
|
2017-05-22 22:30:02 +00:00
|
|
|
}
|
2017-05-28 17:00:12 +00:00
|
|
|
if (local)
|
2018-08-26 10:56:48 +00:00
|
|
|
count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_invalidate);
|
|
|
|
trace_tlb_flush(reason, nr_invalidate);
|
2017-06-29 15:53:16 +00:00
|
|
|
} else {
|
|
|
|
/* Full flush. */
|
2020-04-21 09:20:32 +00:00
|
|
|
flush_tlb_local();
|
2017-06-29 15:53:16 +00:00
|
|
|
if (local)
|
|
|
|
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
|
|
|
|
trace_tlb_flush(reason, TLB_FLUSH_ALL);
|
2017-05-22 22:30:02 +00:00
|
|
|
}
|
2017-06-29 15:53:16 +00:00
|
|
|
|
|
|
|
/* Both paths above update our state to mm_tlb_gen. */
|
x86/mm: Implement PCID based optimization: try to preserve old TLB entries using PCID
PCID is a "process context ID" -- it's what other architectures call
an address space ID. Every non-global TLB entry is tagged with a
PCID, only TLB entries that match the currently selected PCID are
used, and we can switch PGDs without flushing the TLB. x86's
PCID is 12 bits.
This is an unorthodox approach to using PCID. x86's PCID is far too
short to uniquely identify a process, and we can't even really
uniquely identify a running process because there are monster
systems with over 4096 CPUs. To make matters worse, past attempts
to use all 12 PCID bits have resulted in slowdowns instead of
speedups.
This patch uses PCID differently. We use a PCID to identify a
recently-used mm on a per-cpu basis. An mm has no fixed PCID
binding at all; instead, we give it a fresh PCID each time it's
loaded except in cases where we want to preserve the TLB, in which
case we reuse a recent value.
Here are some benchmark results, done on a Skylake laptop at 2.3 GHz
(turbo off, intel_pstate requesting max performance) under KVM with
the guest using idle=poll (to avoid artifacts when bouncing between
CPUs). I haven't done any real statistics here -- I just ran them
in a loop and picked the fastest results that didn't look like
outliers. Unpatched means commit a4eb8b993554, so all the
bookkeeping overhead is gone.
ping-pong between two mms on the same CPU using eventfd:
patched: 1.22µs
patched, nopcid: 1.33µs
unpatched: 1.34µs
Same ping-pong, but now touch 512 pages (all zero-page to minimize
cache misses) each iteration. dTLB misses are measured by
dtlb_load_misses.miss_causes_a_walk:
patched: 1.8µs 11M dTLB misses
patched, nopcid: 6.2µs, 207M dTLB misses
unpatched: 6.1µs, 190M dTLB misses
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/9ee75f17a81770feed616358e6860d98a2a5b1e7.1500957502.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-07-25 04:41:38 +00:00
|
|
|
this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen);
|
2008-03-03 17:12:54 +00:00
|
|
|
}
|
|
|
|
|
2019-04-25 23:01:43 +00:00
|
|
|
static void flush_tlb_func_local(const void *info, enum tlb_flush_reason reason)
|
2017-05-28 17:00:12 +00:00
|
|
|
{
|
|
|
|
const struct flush_tlb_info *f = info;
|
|
|
|
|
|
|
|
flush_tlb_func_common(f, true, reason);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void flush_tlb_func_remote(void *info)
|
|
|
|
{
|
|
|
|
const struct flush_tlb_info *f = info;
|
|
|
|
|
|
|
|
inc_irq_stat(irq_tlb_count);
|
|
|
|
|
x86/mm: Rework lazy TLB to track the actual loaded mm
Lazy TLB state is currently managed in a rather baroque manner.
AFAICT, there are three possible states:
- Non-lazy. This means that we're running a user thread or a
kernel thread that has called use_mm(). current->mm ==
current->active_mm == cpu_tlbstate.active_mm and
cpu_tlbstate.state == TLBSTATE_OK.
- Lazy with user mm. We're running a kernel thread without an mm
and we're borrowing an mm_struct. We have current->mm == NULL,
current->active_mm == cpu_tlbstate.active_mm, cpu_tlbstate.state
!= TLBSTATE_OK (i.e. TLBSTATE_LAZY or 0). The current cpu is set
in mm_cpumask(current->active_mm). CR3 points to
current->active_mm->pgd. The TLB is up to date.
- Lazy with init_mm. This happens when we call leave_mm(). We
have current->mm == NULL, current->active_mm ==
cpu_tlbstate.active_mm, but that mm is only relelvant insofar as
the scheduler is tracking it for refcounting. cpu_tlbstate.state
!= TLBSTATE_OK. The current cpu is clear in
mm_cpumask(current->active_mm). CR3 points to swapper_pg_dir,
i.e. init_mm->pgd.
This patch simplifies the situation. Other than perf, x86 stops
caring about current->active_mm at all. We have
cpu_tlbstate.loaded_mm pointing to the mm that CR3 references. The
TLB is always up to date for that mm. leave_mm() just switches us
to init_mm. There are no longer any special cases for mm_cpumask,
and switch_mm() switches mms without worrying about laziness.
After this patch, cpu_tlbstate.state serves only to tell the TLB
flush code whether it may switch to init_mm instead of doing a
normal flush.
This makes fairly extensive changes to xen_exit_mmap(), which used
to look a bit like black magic.
Perf is unchanged. With or without this change, perf may behave a bit
erratically if it tries to read user memory in kernel thread context.
We should build on this patch to teach perf to never look at user
memory when cpu_tlbstate.loaded_mm != current->mm.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Nadav Amit <namit@vmware.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-05-28 17:00:15 +00:00
|
|
|
if (f->mm && f->mm != this_cpu_read(cpu_tlbstate.loaded_mm))
|
2017-05-28 17:00:12 +00:00
|
|
|
return;
|
|
|
|
|
|
|
|
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
|
|
|
|
flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN);
|
|
|
|
}
|
|
|
|
|
2018-09-26 03:58:44 +00:00
|
|
|
static bool tlb_is_not_lazy(int cpu, void *data)
|
|
|
|
{
|
|
|
|
return !per_cpu(cpu_tlbstate.is_lazy, cpu);
|
|
|
|
}
|
|
|
|
|
2020-04-21 09:20:36 +00:00
|
|
|
STATIC_NOPV void native_flush_tlb_others(const struct cpumask *cpumask,
|
|
|
|
const struct flush_tlb_info *info)
|
2009-01-11 05:58:09 +00:00
|
|
|
{
|
2014-01-21 22:33:16 +00:00
|
|
|
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
|
2017-05-28 17:00:10 +00:00
|
|
|
if (info->end == TLB_FLUSH_ALL)
|
2016-04-01 21:31:23 +00:00
|
|
|
trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
|
|
|
|
else
|
|
|
|
trace_tlb_flush(TLB_REMOTE_SEND_IPI,
|
2017-05-28 17:00:10 +00:00
|
|
|
(info->end - info->start) >> PAGE_SHIFT);
|
2016-04-01 21:31:23 +00:00
|
|
|
|
2009-01-11 05:58:09 +00:00
|
|
|
if (is_uv_system()) {
|
x86/mm: Rework lazy TLB mode and TLB freshness tracking
x86's lazy TLB mode used to be fairly weak -- it would switch to
init_mm the first time it tried to flush a lazy TLB. This meant an
unnecessary CR3 write and, if the flush was remote, an unnecessary
IPI.
Rewrite it entirely. When we enter lazy mode, we simply remove the
CPU from mm_cpumask. This means that we need a way to figure out
whether we've missed a flush when we switch back out of lazy mode.
I use the tlb_gen machinery to track whether a context is up to
date.
Note to reviewers: this patch, my itself, looks a bit odd. I'm
using an array of length 1 containing (ctx_id, tlb_gen) rather than
just storing tlb_gen, and making it at array isn't necessary yet.
I'm doing this because the next few patches add PCID support, and,
with PCID, we need ctx_id, and the array will end up with a length
greater than 1. Making it an array now means that there will be
less churn and therefore less stress on your eyeballs.
NB: This is dubious but, AFAICT, still correct on Xen and UV.
xen_exit_mmap() uses mm_cpumask() for nefarious purposes and this
patch changes the way that mm_cpumask() works. This should be okay,
since Xen *also* iterates all online CPUs to find all the CPUs it
needs to twiddle.
The UV tlbflush code is rather dated and should be changed.
Here are some benchmark results, done on a Skylake laptop at 2.3 GHz
(turbo off, intel_pstate requesting max performance) under KVM with
the guest using idle=poll (to avoid artifacts when bouncing between
CPUs). I haven't done any real statistics here -- I just ran them
in a loop and picked the fastest results that didn't look like
outliers. Unpatched means commit a4eb8b993554, so all the
bookkeeping overhead is gone.
MADV_DONTNEED; touch the page; switch CPUs using sched_setaffinity. In
an unpatched kernel, MADV_DONTNEED will send an IPI to the previous CPU.
This is intended to be a nearly worst-case test.
patched: 13.4µs
unpatched: 21.6µs
Vitaly's pthread_mmap microbenchmark with 8 threads (on four cores),
nrounds = 100, 256M data
patched: 1.1 seconds or so
unpatched: 1.9 seconds or so
The sleepup on Vitaly's test appearss to be because it spends a lot
of time blocked on mmap_sem, and this patch avoids sending IPIs to
blocked CPUs.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andrew Banman <abanman@sgi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dimitri Sivanich <sivanich@sgi.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Mike Travis <travis@sgi.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/ddf2c92962339f4ba39d8fc41b853936ec0b44f1.1498751203.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-06-29 15:53:17 +00:00
|
|
|
/*
|
|
|
|
* This whole special case is confused. UV has a "Broadcast
|
|
|
|
* Assist Unit", which seems to be a fancy way to send IPIs.
|
|
|
|
* Back when x86 used an explicit TLB flush IPI, UV was
|
|
|
|
* optimized to use its own mechanism. These days, x86 uses
|
|
|
|
* smp_call_function_many(), but UV still uses a manual IPI,
|
|
|
|
* and that IPI's action is out of date -- it does a manual
|
|
|
|
* flush instead of calling flush_tlb_func_remote(). This
|
|
|
|
* means that the percpu tlb_gen variables won't be updated
|
|
|
|
* and we'll do pointless flushes on future context switches.
|
|
|
|
*
|
|
|
|
* Rather than hooking native_flush_tlb_others() here, I think
|
|
|
|
* that UV should be updated so that smp_call_function_many(),
|
|
|
|
* etc, are optimal on UV.
|
|
|
|
*/
|
2017-05-28 17:00:10 +00:00
|
|
|
cpumask = uv_flush_tlb_others(cpumask, info);
|
2009-01-21 08:26:06 +00:00
|
|
|
if (cpumask)
|
2017-05-28 17:00:12 +00:00
|
|
|
smp_call_function_many(cpumask, flush_tlb_func_remote,
|
2017-05-28 17:00:10 +00:00
|
|
|
(void *)info, 1);
|
2009-01-11 05:58:10 +00:00
|
|
|
return;
|
2009-01-11 05:58:09 +00:00
|
|
|
}
|
2018-09-26 03:58:44 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If no page tables were freed, we can skip sending IPIs to
|
|
|
|
* CPUs in lazy TLB mode. They will flush the CPU themselves
|
|
|
|
* at the next context switch.
|
|
|
|
*
|
|
|
|
* However, if page tables are getting freed, we need to send the
|
|
|
|
* IPI everywhere, to prevent CPUs in lazy TLB mode from tripping
|
|
|
|
* up on the new contents of what used to be page tables, while
|
|
|
|
* doing a speculative memory access.
|
|
|
|
*/
|
|
|
|
if (info->freed_tables)
|
|
|
|
smp_call_function_many(cpumask, flush_tlb_func_remote,
|
2018-07-16 19:03:34 +00:00
|
|
|
(void *)info, 1);
|
2018-09-26 03:58:44 +00:00
|
|
|
else
|
|
|
|
on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func_remote,
|
2020-01-17 09:01:37 +00:00
|
|
|
(void *)info, 1, cpumask);
|
2008-03-03 17:12:54 +00:00
|
|
|
}
|
|
|
|
|
2020-04-21 09:20:36 +00:00
|
|
|
void flush_tlb_others(const struct cpumask *cpumask,
|
|
|
|
const struct flush_tlb_info *info)
|
|
|
|
{
|
|
|
|
__flush_tlb_others(cpumask, info);
|
|
|
|
}
|
|
|
|
|
x86/mm: Set TLB flush tunable to sane value (33)
This has been run through Intel's LKP tests across a wide range
of modern sytems and workloads and it wasn't shown to make a
measurable performance difference positive or negative.
Now that we have some shiny new tracepoints, we can actually
figure out what the heck is going on.
During a kernel compile, 60% of the flush_tlb_mm_range() calls
are for a single page. It breaks down like this:
size percent percent<=
V V V
GLOBAL: 2.20% 2.20% avg cycles: 2283
1: 56.92% 59.12% avg cycles: 1276
2: 13.78% 72.90% avg cycles: 1505
3: 8.26% 81.16% avg cycles: 1880
4: 7.41% 88.58% avg cycles: 2447
5: 1.73% 90.31% avg cycles: 2358
6: 1.32% 91.63% avg cycles: 2563
7: 1.14% 92.77% avg cycles: 2862
8: 0.62% 93.39% avg cycles: 3542
9: 0.08% 93.47% avg cycles: 3289
10: 0.43% 93.90% avg cycles: 3570
11: 0.20% 94.10% avg cycles: 3767
12: 0.08% 94.18% avg cycles: 3996
13: 0.03% 94.20% avg cycles: 4077
14: 0.02% 94.23% avg cycles: 4836
15: 0.04% 94.26% avg cycles: 5699
16: 0.06% 94.32% avg cycles: 5041
17: 0.57% 94.89% avg cycles: 5473
18: 0.02% 94.91% avg cycles: 5396
19: 0.03% 94.95% avg cycles: 5296
20: 0.02% 94.96% avg cycles: 6749
21: 0.18% 95.14% avg cycles: 6225
22: 0.01% 95.15% avg cycles: 6393
23: 0.01% 95.16% avg cycles: 6861
24: 0.12% 95.28% avg cycles: 6912
25: 0.05% 95.32% avg cycles: 7190
26: 0.01% 95.33% avg cycles: 7793
27: 0.01% 95.34% avg cycles: 7833
28: 0.01% 95.35% avg cycles: 8253
29: 0.08% 95.42% avg cycles: 8024
30: 0.03% 95.45% avg cycles: 9670
31: 0.01% 95.46% avg cycles: 8949
32: 0.01% 95.46% avg cycles: 9350
33: 3.11% 98.57% avg cycles: 8534
34: 0.02% 98.60% avg cycles: 10977
35: 0.02% 98.62% avg cycles: 11400
We get in to dimishing returns pretty quickly. On pre-IvyBridge
CPUs, we used to set the limit at 8 pages, and it was set at 128
on IvyBrige. That 128 number looks pretty silly considering that
less than 0.5% of the flushes are that large.
The previous code tried to size this number based on the size of
the TLB. Good idea, but it's error-prone, needs maintenance
(which it didn't get up to now), and probably would not matter in
practice much.
Settting it to 33 means that we cover the mallopt
M_TRIM_THRESHOLD, which is the most universally common size to do
flushes.
That's the short version. Here's the long one for why I chose 33:
1. These numbers have a constant bias in the timestamps from the
tracing. Probably counts for a couple hundred cycles in each of
these tests, but it should be fairly _even_ across all of them.
The smallest delta between the tracepoints I have ever seen is
335 cycles. This is one reason the cycles/page cost goes down in
general as the flushes get larger. The true cost is nearer to
100 cycles.
2. A full flush is more expensive than a single invlpg, but not
by much (single percentages).
3. A dtlb miss is 17.1ns (~45 cycles) and a itlb miss is 13.0ns
(~34 cycles). At those rates, refilling the 512-entry dTLB takes
22,000 cycles.
4. 22,000 cycles is approximately the equivalent of doing 85
invlpg operations. But, the odds are that the TLB can
actually be filled up faster than that because TLB misses that
are close in time also tend to leverage the same caches.
6. ~98% of flushes are <=33 pages. There are a lot of flushes of
33 pages, probably because libc's M_TRIM_THRESHOLD is set to
128k (32 pages)
7. I've found no consistent data to support changing the IvyBridge
vs. SandyBridge tunable by a factor of 16
I used the performance counters on this hardware (IvyBridge i5-3320M)
to figure out the tlb miss costs:
ocperf.py stat -e dtlb_load_misses.walk_duration,dtlb_load_misses.walk_completed,dtlb_store_misses.walk_duration,dtlb_store_misses.walk_completed,itlb_misses.walk_duration,itlb_misses.walk_completed,itlb.itlb_flush
7,720,030,970 dtlb_load_misses_walk_duration [57.13%]
169,856,353 dtlb_load_misses_walk_completed [57.15%]
708,832,859 dtlb_store_misses_walk_duration [57.17%]
19,346,823 dtlb_store_misses_walk_completed [57.17%]
2,779,687,402 itlb_misses_walk_duration [57.15%]
82,241,148 itlb_misses_walk_completed [57.13%]
770,717 itlb_itlb_flush [57.11%]
Show that a dtlb miss is 17.1ns (~45 cycles) and a itlb miss is 13.0ns
(~34 cycles). At those rates, refilling the 512-entry dTLB takes
22,000 cycles. On a SandyBridge system with more cores and larger
caches, those are dtlb=13.4ns and itlb=9.5ns.
cat perf.stat.txt | perl -pe 's/,//g'
| awk '/itlb_misses_walk_duration/ { icyc+=$1 }
/itlb_misses_walk_completed/ { imiss+=$1 }
/dtlb_.*_walk_duration/ { dcyc+=$1 }
/dtlb_.*.*completed/ { dmiss+=$1 }
END {print "itlb cyc/miss: ", icyc/imiss, " dtlb cyc/miss: ", dcyc/dmiss, " ----- ", icyc,imiss, dcyc,dmiss }
On Westmere CPUs, the counters to use are: itlb_flush,itlb_misses.walk_cycles,itlb_misses.any,dtlb_misses.walk_cycles,dtlb_misses.any
The assumptions that this code went in under:
https://lkml.org/lkml/2012/6/12/119 say that a flush and a refill are
about 100ns. Being generous, that is over by a factor of 6 on the
refill side, although it is fairly close on the cost of an invlpg.
An increase of a single invlpg operation seems to lengthen the flush
range operation by about 200 cycles. Here is one example of the data
collected for flushing 10 and 11 pages (full data are below):
10: 0.43% 93.90% avg cycles: 3570 cycles/page: 357 samples: 4714
11: 0.20% 94.10% avg cycles: 3767 cycles/page: 342 samples: 2145
How to generate this table:
echo 10000 > /sys/kernel/debug/tracing/buffer_size_kb
echo x86-tsc > /sys/kernel/debug/tracing/trace_clock
echo 'reason != 0' > /sys/kernel/debug/tracing/events/tlb/tlb_flush/filter
echo 1 > /sys/kernel/debug/tracing/events/tlb/tlb_flush/enable
Pipe the trace output in to this script:
http://sr71.net/~dave/intel/201402-tlb/trace-time-diff-process.pl.txt
Note that these data were gathered with the invlpg threshold set to
150 pages. Only data points with >=50 of samples were printed:
Flush % of %<=
in flush this
pages es size
------------------------------------------------------------------------------
-1: 2.20% 2.20% avg cycles: 2283 cycles/page: xxxx samples: 23960
1: 56.92% 59.12% avg cycles: 1276 cycles/page: 1276 samples: 620895
2: 13.78% 72.90% avg cycles: 1505 cycles/page: 752 samples: 150335
3: 8.26% 81.16% avg cycles: 1880 cycles/page: 626 samples: 90131
4: 7.41% 88.58% avg cycles: 2447 cycles/page: 611 samples: 80877
5: 1.73% 90.31% avg cycles: 2358 cycles/page: 471 samples: 18885
6: 1.32% 91.63% avg cycles: 2563 cycles/page: 427 samples: 14397
7: 1.14% 92.77% avg cycles: 2862 cycles/page: 408 samples: 12441
8: 0.62% 93.39% avg cycles: 3542 cycles/page: 442 samples: 6721
9: 0.08% 93.47% avg cycles: 3289 cycles/page: 365 samples: 917
10: 0.43% 93.90% avg cycles: 3570 cycles/page: 357 samples: 4714
11: 0.20% 94.10% avg cycles: 3767 cycles/page: 342 samples: 2145
12: 0.08% 94.18% avg cycles: 3996 cycles/page: 333 samples: 864
13: 0.03% 94.20% avg cycles: 4077 cycles/page: 313 samples: 289
14: 0.02% 94.23% avg cycles: 4836 cycles/page: 345 samples: 236
15: 0.04% 94.26% avg cycles: 5699 cycles/page: 379 samples: 390
16: 0.06% 94.32% avg cycles: 5041 cycles/page: 315 samples: 643
17: 0.57% 94.89% avg cycles: 5473 cycles/page: 321 samples: 6229
18: 0.02% 94.91% avg cycles: 5396 cycles/page: 299 samples: 224
19: 0.03% 94.95% avg cycles: 5296 cycles/page: 278 samples: 367
20: 0.02% 94.96% avg cycles: 6749 cycles/page: 337 samples: 185
21: 0.18% 95.14% avg cycles: 6225 cycles/page: 296 samples: 1964
22: 0.01% 95.15% avg cycles: 6393 cycles/page: 290 samples: 83
23: 0.01% 95.16% avg cycles: 6861 cycles/page: 298 samples: 61
24: 0.12% 95.28% avg cycles: 6912 cycles/page: 288 samples: 1307
25: 0.05% 95.32% avg cycles: 7190 cycles/page: 287 samples: 533
26: 0.01% 95.33% avg cycles: 7793 cycles/page: 299 samples: 94
27: 0.01% 95.34% avg cycles: 7833 cycles/page: 290 samples: 66
28: 0.01% 95.35% avg cycles: 8253 cycles/page: 294 samples: 73
29: 0.08% 95.42% avg cycles: 8024 cycles/page: 276 samples: 846
30: 0.03% 95.45% avg cycles: 9670 cycles/page: 322 samples: 296
31: 0.01% 95.46% avg cycles: 8949 cycles/page: 288 samples: 79
32: 0.01% 95.46% avg cycles: 9350 cycles/page: 292 samples: 60
33: 3.11% 98.57% avg cycles: 8534 cycles/page: 258 samples: 33936
34: 0.02% 98.60% avg cycles: 10977 cycles/page: 322 samples: 268
35: 0.02% 98.62% avg cycles: 11400 cycles/page: 325 samples: 177
36: 0.01% 98.63% avg cycles: 11504 cycles/page: 319 samples: 161
37: 0.02% 98.65% avg cycles: 11596 cycles/page: 313 samples: 182
38: 0.02% 98.66% avg cycles: 11850 cycles/page: 311 samples: 195
39: 0.01% 98.68% avg cycles: 12158 cycles/page: 311 samples: 128
40: 0.01% 98.68% avg cycles: 11626 cycles/page: 290 samples: 78
41: 0.04% 98.73% avg cycles: 11435 cycles/page: 278 samples: 477
42: 0.01% 98.73% avg cycles: 12571 cycles/page: 299 samples: 74
43: 0.01% 98.74% avg cycles: 12562 cycles/page: 292 samples: 78
44: 0.01% 98.75% avg cycles: 12991 cycles/page: 295 samples: 108
45: 0.01% 98.76% avg cycles: 13169 cycles/page: 292 samples: 78
46: 0.02% 98.78% avg cycles: 12891 cycles/page: 280 samples: 261
47: 0.01% 98.79% avg cycles: 13099 cycles/page: 278 samples: 67
48: 0.01% 98.80% avg cycles: 13851 cycles/page: 288 samples: 77
49: 0.01% 98.80% avg cycles: 13749 cycles/page: 280 samples: 66
50: 0.01% 98.81% avg cycles: 13949 cycles/page: 278 samples: 73
52: 0.00% 98.82% avg cycles: 14243 cycles/page: 273 samples: 52
54: 0.01% 98.83% avg cycles: 15312 cycles/page: 283 samples: 87
55: 0.01% 98.84% avg cycles: 15197 cycles/page: 276 samples: 109
56: 0.02% 98.86% avg cycles: 15234 cycles/page: 272 samples: 208
57: 0.00% 98.86% avg cycles: 14888 cycles/page: 261 samples: 53
58: 0.01% 98.87% avg cycles: 15037 cycles/page: 259 samples: 59
59: 0.01% 98.87% avg cycles: 15752 cycles/page: 266 samples: 63
62: 0.00% 98.89% avg cycles: 16222 cycles/page: 261 samples: 54
64: 0.02% 98.91% avg cycles: 17179 cycles/page: 268 samples: 248
65: 0.12% 99.03% avg cycles: 18762 cycles/page: 288 samples: 1324
85: 0.00% 99.10% avg cycles: 21649 cycles/page: 254 samples: 50
127: 0.01% 99.18% avg cycles: 32397 cycles/page: 255 samples: 75
128: 0.13% 99.31% avg cycles: 31711 cycles/page: 247 samples: 1466
129: 0.18% 99.49% avg cycles: 33017 cycles/page: 255 samples: 1927
181: 0.33% 99.84% avg cycles: 2489 cycles/page: 13 samples: 3547
256: 0.05% 99.91% avg cycles: 2305 cycles/page: 9 samples: 550
512: 0.03% 99.95% avg cycles: 2133 cycles/page: 4 samples: 304
1512: 0.01% 99.99% avg cycles: 3038 cycles/page: 2 samples: 65
Here are the tlb counters during a 10-second slice of a kernel compile
for a SandyBridge system. It's better than IvyBridge, but probably
due to the larger caches since this was one of the 'X' extreme parts.
10,873,007,282 dtlb_load_misses_walk_duration
250,711,333 dtlb_load_misses_walk_completed
1,212,395,865 dtlb_store_misses_walk_duration
31,615,772 dtlb_store_misses_walk_completed
5,091,010,274 itlb_misses_walk_duration
163,193,511 itlb_misses_walk_completed
1,321,980 itlb_itlb_flush
10.008045158 seconds time elapsed
# cat perf.stat.1392743721.txt | perl -pe 's/,//g' | awk '/itlb_misses_walk_duration/ { icyc+=$1 } /itlb_misses_walk_completed/ { imiss+=$1 } /dtlb_.*_walk_duration/ { dcyc+=$1 } /dtlb_.*.*completed/ { dmiss+=$1 } END {print "itlb cyc/miss: ", icyc/imiss/3.3, " dtlb cyc/miss: ", dcyc/dmiss/3.3, " ----- ", icyc,imiss, dcyc,dmiss }'
itlb ns/miss: 9.45338 dtlb ns/miss: 12.9716
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Link: http://lkml.kernel.org/r/20140731154103.10C1115E@viggo.jf.intel.com
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
2014-07-31 15:41:03 +00:00
|
|
|
/*
|
2019-06-07 18:54:32 +00:00
|
|
|
* See Documentation/x86/tlb.rst for details. We choose 33
|
x86/mm: Set TLB flush tunable to sane value (33)
This has been run through Intel's LKP tests across a wide range
of modern sytems and workloads and it wasn't shown to make a
measurable performance difference positive or negative.
Now that we have some shiny new tracepoints, we can actually
figure out what the heck is going on.
During a kernel compile, 60% of the flush_tlb_mm_range() calls
are for a single page. It breaks down like this:
size percent percent<=
V V V
GLOBAL: 2.20% 2.20% avg cycles: 2283
1: 56.92% 59.12% avg cycles: 1276
2: 13.78% 72.90% avg cycles: 1505
3: 8.26% 81.16% avg cycles: 1880
4: 7.41% 88.58% avg cycles: 2447
5: 1.73% 90.31% avg cycles: 2358
6: 1.32% 91.63% avg cycles: 2563
7: 1.14% 92.77% avg cycles: 2862
8: 0.62% 93.39% avg cycles: 3542
9: 0.08% 93.47% avg cycles: 3289
10: 0.43% 93.90% avg cycles: 3570
11: 0.20% 94.10% avg cycles: 3767
12: 0.08% 94.18% avg cycles: 3996
13: 0.03% 94.20% avg cycles: 4077
14: 0.02% 94.23% avg cycles: 4836
15: 0.04% 94.26% avg cycles: 5699
16: 0.06% 94.32% avg cycles: 5041
17: 0.57% 94.89% avg cycles: 5473
18: 0.02% 94.91% avg cycles: 5396
19: 0.03% 94.95% avg cycles: 5296
20: 0.02% 94.96% avg cycles: 6749
21: 0.18% 95.14% avg cycles: 6225
22: 0.01% 95.15% avg cycles: 6393
23: 0.01% 95.16% avg cycles: 6861
24: 0.12% 95.28% avg cycles: 6912
25: 0.05% 95.32% avg cycles: 7190
26: 0.01% 95.33% avg cycles: 7793
27: 0.01% 95.34% avg cycles: 7833
28: 0.01% 95.35% avg cycles: 8253
29: 0.08% 95.42% avg cycles: 8024
30: 0.03% 95.45% avg cycles: 9670
31: 0.01% 95.46% avg cycles: 8949
32: 0.01% 95.46% avg cycles: 9350
33: 3.11% 98.57% avg cycles: 8534
34: 0.02% 98.60% avg cycles: 10977
35: 0.02% 98.62% avg cycles: 11400
We get in to dimishing returns pretty quickly. On pre-IvyBridge
CPUs, we used to set the limit at 8 pages, and it was set at 128
on IvyBrige. That 128 number looks pretty silly considering that
less than 0.5% of the flushes are that large.
The previous code tried to size this number based on the size of
the TLB. Good idea, but it's error-prone, needs maintenance
(which it didn't get up to now), and probably would not matter in
practice much.
Settting it to 33 means that we cover the mallopt
M_TRIM_THRESHOLD, which is the most universally common size to do
flushes.
That's the short version. Here's the long one for why I chose 33:
1. These numbers have a constant bias in the timestamps from the
tracing. Probably counts for a couple hundred cycles in each of
these tests, but it should be fairly _even_ across all of them.
The smallest delta between the tracepoints I have ever seen is
335 cycles. This is one reason the cycles/page cost goes down in
general as the flushes get larger. The true cost is nearer to
100 cycles.
2. A full flush is more expensive than a single invlpg, but not
by much (single percentages).
3. A dtlb miss is 17.1ns (~45 cycles) and a itlb miss is 13.0ns
(~34 cycles). At those rates, refilling the 512-entry dTLB takes
22,000 cycles.
4. 22,000 cycles is approximately the equivalent of doing 85
invlpg operations. But, the odds are that the TLB can
actually be filled up faster than that because TLB misses that
are close in time also tend to leverage the same caches.
6. ~98% of flushes are <=33 pages. There are a lot of flushes of
33 pages, probably because libc's M_TRIM_THRESHOLD is set to
128k (32 pages)
7. I've found no consistent data to support changing the IvyBridge
vs. SandyBridge tunable by a factor of 16
I used the performance counters on this hardware (IvyBridge i5-3320M)
to figure out the tlb miss costs:
ocperf.py stat -e dtlb_load_misses.walk_duration,dtlb_load_misses.walk_completed,dtlb_store_misses.walk_duration,dtlb_store_misses.walk_completed,itlb_misses.walk_duration,itlb_misses.walk_completed,itlb.itlb_flush
7,720,030,970 dtlb_load_misses_walk_duration [57.13%]
169,856,353 dtlb_load_misses_walk_completed [57.15%]
708,832,859 dtlb_store_misses_walk_duration [57.17%]
19,346,823 dtlb_store_misses_walk_completed [57.17%]
2,779,687,402 itlb_misses_walk_duration [57.15%]
82,241,148 itlb_misses_walk_completed [57.13%]
770,717 itlb_itlb_flush [57.11%]
Show that a dtlb miss is 17.1ns (~45 cycles) and a itlb miss is 13.0ns
(~34 cycles). At those rates, refilling the 512-entry dTLB takes
22,000 cycles. On a SandyBridge system with more cores and larger
caches, those are dtlb=13.4ns and itlb=9.5ns.
cat perf.stat.txt | perl -pe 's/,//g'
| awk '/itlb_misses_walk_duration/ { icyc+=$1 }
/itlb_misses_walk_completed/ { imiss+=$1 }
/dtlb_.*_walk_duration/ { dcyc+=$1 }
/dtlb_.*.*completed/ { dmiss+=$1 }
END {print "itlb cyc/miss: ", icyc/imiss, " dtlb cyc/miss: ", dcyc/dmiss, " ----- ", icyc,imiss, dcyc,dmiss }
On Westmere CPUs, the counters to use are: itlb_flush,itlb_misses.walk_cycles,itlb_misses.any,dtlb_misses.walk_cycles,dtlb_misses.any
The assumptions that this code went in under:
https://lkml.org/lkml/2012/6/12/119 say that a flush and a refill are
about 100ns. Being generous, that is over by a factor of 6 on the
refill side, although it is fairly close on the cost of an invlpg.
An increase of a single invlpg operation seems to lengthen the flush
range operation by about 200 cycles. Here is one example of the data
collected for flushing 10 and 11 pages (full data are below):
10: 0.43% 93.90% avg cycles: 3570 cycles/page: 357 samples: 4714
11: 0.20% 94.10% avg cycles: 3767 cycles/page: 342 samples: 2145
How to generate this table:
echo 10000 > /sys/kernel/debug/tracing/buffer_size_kb
echo x86-tsc > /sys/kernel/debug/tracing/trace_clock
echo 'reason != 0' > /sys/kernel/debug/tracing/events/tlb/tlb_flush/filter
echo 1 > /sys/kernel/debug/tracing/events/tlb/tlb_flush/enable
Pipe the trace output in to this script:
http://sr71.net/~dave/intel/201402-tlb/trace-time-diff-process.pl.txt
Note that these data were gathered with the invlpg threshold set to
150 pages. Only data points with >=50 of samples were printed:
Flush % of %<=
in flush this
pages es size
------------------------------------------------------------------------------
-1: 2.20% 2.20% avg cycles: 2283 cycles/page: xxxx samples: 23960
1: 56.92% 59.12% avg cycles: 1276 cycles/page: 1276 samples: 620895
2: 13.78% 72.90% avg cycles: 1505 cycles/page: 752 samples: 150335
3: 8.26% 81.16% avg cycles: 1880 cycles/page: 626 samples: 90131
4: 7.41% 88.58% avg cycles: 2447 cycles/page: 611 samples: 80877
5: 1.73% 90.31% avg cycles: 2358 cycles/page: 471 samples: 18885
6: 1.32% 91.63% avg cycles: 2563 cycles/page: 427 samples: 14397
7: 1.14% 92.77% avg cycles: 2862 cycles/page: 408 samples: 12441
8: 0.62% 93.39% avg cycles: 3542 cycles/page: 442 samples: 6721
9: 0.08% 93.47% avg cycles: 3289 cycles/page: 365 samples: 917
10: 0.43% 93.90% avg cycles: 3570 cycles/page: 357 samples: 4714
11: 0.20% 94.10% avg cycles: 3767 cycles/page: 342 samples: 2145
12: 0.08% 94.18% avg cycles: 3996 cycles/page: 333 samples: 864
13: 0.03% 94.20% avg cycles: 4077 cycles/page: 313 samples: 289
14: 0.02% 94.23% avg cycles: 4836 cycles/page: 345 samples: 236
15: 0.04% 94.26% avg cycles: 5699 cycles/page: 379 samples: 390
16: 0.06% 94.32% avg cycles: 5041 cycles/page: 315 samples: 643
17: 0.57% 94.89% avg cycles: 5473 cycles/page: 321 samples: 6229
18: 0.02% 94.91% avg cycles: 5396 cycles/page: 299 samples: 224
19: 0.03% 94.95% avg cycles: 5296 cycles/page: 278 samples: 367
20: 0.02% 94.96% avg cycles: 6749 cycles/page: 337 samples: 185
21: 0.18% 95.14% avg cycles: 6225 cycles/page: 296 samples: 1964
22: 0.01% 95.15% avg cycles: 6393 cycles/page: 290 samples: 83
23: 0.01% 95.16% avg cycles: 6861 cycles/page: 298 samples: 61
24: 0.12% 95.28% avg cycles: 6912 cycles/page: 288 samples: 1307
25: 0.05% 95.32% avg cycles: 7190 cycles/page: 287 samples: 533
26: 0.01% 95.33% avg cycles: 7793 cycles/page: 299 samples: 94
27: 0.01% 95.34% avg cycles: 7833 cycles/page: 290 samples: 66
28: 0.01% 95.35% avg cycles: 8253 cycles/page: 294 samples: 73
29: 0.08% 95.42% avg cycles: 8024 cycles/page: 276 samples: 846
30: 0.03% 95.45% avg cycles: 9670 cycles/page: 322 samples: 296
31: 0.01% 95.46% avg cycles: 8949 cycles/page: 288 samples: 79
32: 0.01% 95.46% avg cycles: 9350 cycles/page: 292 samples: 60
33: 3.11% 98.57% avg cycles: 8534 cycles/page: 258 samples: 33936
34: 0.02% 98.60% avg cycles: 10977 cycles/page: 322 samples: 268
35: 0.02% 98.62% avg cycles: 11400 cycles/page: 325 samples: 177
36: 0.01% 98.63% avg cycles: 11504 cycles/page: 319 samples: 161
37: 0.02% 98.65% avg cycles: 11596 cycles/page: 313 samples: 182
38: 0.02% 98.66% avg cycles: 11850 cycles/page: 311 samples: 195
39: 0.01% 98.68% avg cycles: 12158 cycles/page: 311 samples: 128
40: 0.01% 98.68% avg cycles: 11626 cycles/page: 290 samples: 78
41: 0.04% 98.73% avg cycles: 11435 cycles/page: 278 samples: 477
42: 0.01% 98.73% avg cycles: 12571 cycles/page: 299 samples: 74
43: 0.01% 98.74% avg cycles: 12562 cycles/page: 292 samples: 78
44: 0.01% 98.75% avg cycles: 12991 cycles/page: 295 samples: 108
45: 0.01% 98.76% avg cycles: 13169 cycles/page: 292 samples: 78
46: 0.02% 98.78% avg cycles: 12891 cycles/page: 280 samples: 261
47: 0.01% 98.79% avg cycles: 13099 cycles/page: 278 samples: 67
48: 0.01% 98.80% avg cycles: 13851 cycles/page: 288 samples: 77
49: 0.01% 98.80% avg cycles: 13749 cycles/page: 280 samples: 66
50: 0.01% 98.81% avg cycles: 13949 cycles/page: 278 samples: 73
52: 0.00% 98.82% avg cycles: 14243 cycles/page: 273 samples: 52
54: 0.01% 98.83% avg cycles: 15312 cycles/page: 283 samples: 87
55: 0.01% 98.84% avg cycles: 15197 cycles/page: 276 samples: 109
56: 0.02% 98.86% avg cycles: 15234 cycles/page: 272 samples: 208
57: 0.00% 98.86% avg cycles: 14888 cycles/page: 261 samples: 53
58: 0.01% 98.87% avg cycles: 15037 cycles/page: 259 samples: 59
59: 0.01% 98.87% avg cycles: 15752 cycles/page: 266 samples: 63
62: 0.00% 98.89% avg cycles: 16222 cycles/page: 261 samples: 54
64: 0.02% 98.91% avg cycles: 17179 cycles/page: 268 samples: 248
65: 0.12% 99.03% avg cycles: 18762 cycles/page: 288 samples: 1324
85: 0.00% 99.10% avg cycles: 21649 cycles/page: 254 samples: 50
127: 0.01% 99.18% avg cycles: 32397 cycles/page: 255 samples: 75
128: 0.13% 99.31% avg cycles: 31711 cycles/page: 247 samples: 1466
129: 0.18% 99.49% avg cycles: 33017 cycles/page: 255 samples: 1927
181: 0.33% 99.84% avg cycles: 2489 cycles/page: 13 samples: 3547
256: 0.05% 99.91% avg cycles: 2305 cycles/page: 9 samples: 550
512: 0.03% 99.95% avg cycles: 2133 cycles/page: 4 samples: 304
1512: 0.01% 99.99% avg cycles: 3038 cycles/page: 2 samples: 65
Here are the tlb counters during a 10-second slice of a kernel compile
for a SandyBridge system. It's better than IvyBridge, but probably
due to the larger caches since this was one of the 'X' extreme parts.
10,873,007,282 dtlb_load_misses_walk_duration
250,711,333 dtlb_load_misses_walk_completed
1,212,395,865 dtlb_store_misses_walk_duration
31,615,772 dtlb_store_misses_walk_completed
5,091,010,274 itlb_misses_walk_duration
163,193,511 itlb_misses_walk_completed
1,321,980 itlb_itlb_flush
10.008045158 seconds time elapsed
# cat perf.stat.1392743721.txt | perl -pe 's/,//g' | awk '/itlb_misses_walk_duration/ { icyc+=$1 } /itlb_misses_walk_completed/ { imiss+=$1 } /dtlb_.*_walk_duration/ { dcyc+=$1 } /dtlb_.*.*completed/ { dmiss+=$1 } END {print "itlb cyc/miss: ", icyc/imiss/3.3, " dtlb cyc/miss: ", dcyc/dmiss/3.3, " ----- ", icyc,imiss, dcyc,dmiss }'
itlb ns/miss: 9.45338 dtlb ns/miss: 12.9716
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Link: http://lkml.kernel.org/r/20140731154103.10C1115E@viggo.jf.intel.com
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
2014-07-31 15:41:03 +00:00
|
|
|
* because it is large enough to cover the vast majority (at
|
|
|
|
* least 95%) of allocations, and is small enough that we are
|
|
|
|
* confident it will not cause too much overhead. Each single
|
|
|
|
* flush is about 100 ns, so this caps the maximum overhead at
|
|
|
|
* _about_ 3,000 ns.
|
|
|
|
*
|
|
|
|
* This is in units of pages.
|
|
|
|
*/
|
2018-12-03 17:03:49 +00:00
|
|
|
unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
|
x86/mm: Rip out complicated, out-of-date, buggy TLB flushing
I think the flush_tlb_mm_range() code that tries to tune the
flush sizes based on the CPU needs to get ripped out for
several reasons:
1. It is obviously buggy. It uses mm->total_vm to judge the
task's footprint in the TLB. It should certainly be using
some measure of RSS, *NOT* ->total_vm since only resident
memory can populate the TLB.
2. Haswell, and several other CPUs are missing from the
intel_tlb_flushall_shift_set() function. Thus, it has been
demonstrated to bitrot quickly in practice.
3. It is plain wrong in my vm:
[ 0.037444] Last level iTLB entries: 4KB 0, 2MB 0, 4MB 0
[ 0.037444] Last level dTLB entries: 4KB 0, 2MB 0, 4MB 0
[ 0.037444] tlb_flushall_shift: 6
Which leads to it to never use invlpg.
4. The assumptions about TLB refill costs are wrong:
http://lkml.kernel.org/r/1337782555-8088-3-git-send-email-alex.shi@intel.com
(more on this in later patches)
5. I can not reproduce the original data: https://lkml.org/lkml/2012/5/17/59
I believe the sample times were too short. Running the
benchmark in a loop yields times that vary quite a bit.
Note that this leaves us with a static ceiling of 1 page. This
is a conservative, dumb setting, and will be revised in a later
patch.
This also removes the code which attempts to predict whether we
are flushing data or instructions. We expect instruction flushes
to be relatively rare and not worth tuning for explicitly.
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Link: http://lkml.kernel.org/r/20140731154055.ABC88E89@viggo.jf.intel.com
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
2014-07-31 15:40:55 +00:00
|
|
|
|
2019-04-25 23:01:43 +00:00
|
|
|
static DEFINE_PER_CPU_SHARED_ALIGNED(struct flush_tlb_info, flush_tlb_info);
|
|
|
|
|
|
|
|
#ifdef CONFIG_DEBUG_VM
|
|
|
|
static DEFINE_PER_CPU(unsigned int, flush_tlb_info_idx);
|
|
|
|
#endif
|
|
|
|
|
|
|
|
static inline struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
|
|
|
|
unsigned long start, unsigned long end,
|
|
|
|
unsigned int stride_shift, bool freed_tables,
|
|
|
|
u64 new_tlb_gen)
|
|
|
|
{
|
|
|
|
struct flush_tlb_info *info = this_cpu_ptr(&flush_tlb_info);
|
|
|
|
|
|
|
|
#ifdef CONFIG_DEBUG_VM
|
|
|
|
/*
|
|
|
|
* Ensure that the following code is non-reentrant and flush_tlb_info
|
|
|
|
* is not overwritten. This means no TLB flushing is initiated by
|
|
|
|
* interrupt handlers and machine-check exception handlers.
|
|
|
|
*/
|
|
|
|
BUG_ON(this_cpu_inc_return(flush_tlb_info_idx) != 1);
|
|
|
|
#endif
|
|
|
|
|
|
|
|
info->start = start;
|
|
|
|
info->end = end;
|
|
|
|
info->mm = mm;
|
|
|
|
info->stride_shift = stride_shift;
|
|
|
|
info->freed_tables = freed_tables;
|
|
|
|
info->new_tlb_gen = new_tlb_gen;
|
|
|
|
|
|
|
|
return info;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void put_flush_tlb_info(void)
|
|
|
|
{
|
|
|
|
#ifdef CONFIG_DEBUG_VM
|
|
|
|
/* Complete reentrency prevention checks */
|
|
|
|
barrier();
|
|
|
|
this_cpu_dec(flush_tlb_info_idx);
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
x86/tlb: enable tlb flush range support for x86
Not every tlb_flush execution moment is really need to evacuate all
TLB entries, like in munmap, just few 'invlpg' is better for whole
process performance, since it leaves most of TLB entries for later
accessing.
This patch also rewrite flush_tlb_range for 2 purposes:
1, split it out to get flush_blt_mm_range function.
2, clean up to reduce line breaking, thanks for Borislav's input.
My micro benchmark 'mummap' http://lkml.org/lkml/2012/5/17/59
show that the random memory access on other CPU has 0~50% speed up
on a 2P * 4cores * HT NHM EP while do 'munmap'.
Thanks Yongjie's testing on this patch:
-------------
I used Linux 3.4-RC6 w/ and w/o his patches as Xen dom0 and guest
kernel.
After running two benchmarks in Xen HVM guest, I found his patches
brought about 1%~3% performance gain in 'kernel build' and 'netperf'
testing, though the performance gain was not very stable in 'kernel
build' testing.
Some detailed testing results are below.
Testing Environment:
Hardware: Romley-EP platform
Xen version: latest upstream
Linux kernel: 3.4-RC6
Guest vCPU number: 8
NIC: Intel 82599 (10GB bandwidth)
In 'kernel build' testing in guest:
Command line | performance gain
make -j 4 | 3.81%
make -j 8 | 0.37%
make -j 16 | -0.52%
In 'netperf' testing, we tested TCP_STREAM with default socket size
16384 byte as large packet and 64 byte as small packet.
I used several clients to add networking pressure, then 'netperf' server
automatically generated several threads to response them.
I also used large-size packet and small-size packet in the testing.
Packet size | Thread number | performance gain
16384 bytes | 4 | 0.02%
16384 bytes | 8 | 2.21%
16384 bytes | 16 | 2.04%
64 bytes | 4 | 1.07%
64 bytes | 8 | 3.31%
64 bytes | 16 | 0.71%
Signed-off-by: Alex Shi <alex.shi@intel.com>
Link: http://lkml.kernel.org/r/1340845344-27557-8-git-send-email-alex.shi@intel.com
Tested-by: Ren, Yongjie <yongjie.ren@intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
2012-06-28 01:02:22 +00:00
|
|
|
void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
|
2018-09-26 03:58:42 +00:00
|
|
|
unsigned long end, unsigned int stride_shift,
|
|
|
|
bool freed_tables)
|
x86/tlb: enable tlb flush range support for x86
Not every tlb_flush execution moment is really need to evacuate all
TLB entries, like in munmap, just few 'invlpg' is better for whole
process performance, since it leaves most of TLB entries for later
accessing.
This patch also rewrite flush_tlb_range for 2 purposes:
1, split it out to get flush_blt_mm_range function.
2, clean up to reduce line breaking, thanks for Borislav's input.
My micro benchmark 'mummap' http://lkml.org/lkml/2012/5/17/59
show that the random memory access on other CPU has 0~50% speed up
on a 2P * 4cores * HT NHM EP while do 'munmap'.
Thanks Yongjie's testing on this patch:
-------------
I used Linux 3.4-RC6 w/ and w/o his patches as Xen dom0 and guest
kernel.
After running two benchmarks in Xen HVM guest, I found his patches
brought about 1%~3% performance gain in 'kernel build' and 'netperf'
testing, though the performance gain was not very stable in 'kernel
build' testing.
Some detailed testing results are below.
Testing Environment:
Hardware: Romley-EP platform
Xen version: latest upstream
Linux kernel: 3.4-RC6
Guest vCPU number: 8
NIC: Intel 82599 (10GB bandwidth)
In 'kernel build' testing in guest:
Command line | performance gain
make -j 4 | 3.81%
make -j 8 | 0.37%
make -j 16 | -0.52%
In 'netperf' testing, we tested TCP_STREAM with default socket size
16384 byte as large packet and 64 byte as small packet.
I used several clients to add networking pressure, then 'netperf' server
automatically generated several threads to response them.
I also used large-size packet and small-size packet in the testing.
Packet size | Thread number | performance gain
16384 bytes | 4 | 0.02%
16384 bytes | 8 | 2.21%
16384 bytes | 16 | 2.04%
64 bytes | 4 | 1.07%
64 bytes | 8 | 3.31%
64 bytes | 16 | 0.71%
Signed-off-by: Alex Shi <alex.shi@intel.com>
Link: http://lkml.kernel.org/r/1340845344-27557-8-git-send-email-alex.shi@intel.com
Tested-by: Ren, Yongjie <yongjie.ren@intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
2012-06-28 01:02:22 +00:00
|
|
|
{
|
2019-04-25 23:01:43 +00:00
|
|
|
struct flush_tlb_info *info;
|
|
|
|
u64 new_tlb_gen;
|
2017-05-28 17:00:12 +00:00
|
|
|
int cpu;
|
2017-04-22 07:01:21 +00:00
|
|
|
|
2017-05-28 17:00:12 +00:00
|
|
|
cpu = get_cpu();
|
2016-01-06 20:21:01 +00:00
|
|
|
|
2017-05-28 17:00:12 +00:00
|
|
|
/* Should we flush just the requested range? */
|
2019-04-25 23:01:43 +00:00
|
|
|
if ((end == TLB_FLUSH_ALL) ||
|
|
|
|
((end - start) >> stride_shift) > tlb_single_page_flush_ceiling) {
|
|
|
|
start = 0;
|
|
|
|
end = TLB_FLUSH_ALL;
|
2014-07-31 15:40:54 +00:00
|
|
|
}
|
2017-05-28 17:00:12 +00:00
|
|
|
|
2019-04-25 23:01:43 +00:00
|
|
|
/* This is also a barrier that synchronizes with switch_mm(). */
|
|
|
|
new_tlb_gen = inc_mm_tlb_gen(mm);
|
|
|
|
|
|
|
|
info = get_flush_tlb_info(mm, start, end, stride_shift, freed_tables,
|
|
|
|
new_tlb_gen);
|
|
|
|
|
2017-06-29 15:53:13 +00:00
|
|
|
if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
|
2019-04-25 23:01:43 +00:00
|
|
|
lockdep_assert_irqs_enabled();
|
2017-06-29 15:53:13 +00:00
|
|
|
local_irq_disable();
|
2019-04-25 23:01:43 +00:00
|
|
|
flush_tlb_func_local(info, TLB_LOCAL_MM_SHOOTDOWN);
|
2017-06-29 15:53:13 +00:00
|
|
|
local_irq_enable();
|
|
|
|
}
|
|
|
|
|
2017-05-28 17:00:12 +00:00
|
|
|
if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
|
2019-04-25 23:01:43 +00:00
|
|
|
flush_tlb_others(mm_cpumask(mm), info);
|
x86/mm: Rework lazy TLB mode and TLB freshness tracking
x86's lazy TLB mode used to be fairly weak -- it would switch to
init_mm the first time it tried to flush a lazy TLB. This meant an
unnecessary CR3 write and, if the flush was remote, an unnecessary
IPI.
Rewrite it entirely. When we enter lazy mode, we simply remove the
CPU from mm_cpumask. This means that we need a way to figure out
whether we've missed a flush when we switch back out of lazy mode.
I use the tlb_gen machinery to track whether a context is up to
date.
Note to reviewers: this patch, my itself, looks a bit odd. I'm
using an array of length 1 containing (ctx_id, tlb_gen) rather than
just storing tlb_gen, and making it at array isn't necessary yet.
I'm doing this because the next few patches add PCID support, and,
with PCID, we need ctx_id, and the array will end up with a length
greater than 1. Making it an array now means that there will be
less churn and therefore less stress on your eyeballs.
NB: This is dubious but, AFAICT, still correct on Xen and UV.
xen_exit_mmap() uses mm_cpumask() for nefarious purposes and this
patch changes the way that mm_cpumask() works. This should be okay,
since Xen *also* iterates all online CPUs to find all the CPUs it
needs to twiddle.
The UV tlbflush code is rather dated and should be changed.
Here are some benchmark results, done on a Skylake laptop at 2.3 GHz
(turbo off, intel_pstate requesting max performance) under KVM with
the guest using idle=poll (to avoid artifacts when bouncing between
CPUs). I haven't done any real statistics here -- I just ran them
in a loop and picked the fastest results that didn't look like
outliers. Unpatched means commit a4eb8b993554, so all the
bookkeeping overhead is gone.
MADV_DONTNEED; touch the page; switch CPUs using sched_setaffinity. In
an unpatched kernel, MADV_DONTNEED will send an IPI to the previous CPU.
This is intended to be a nearly worst-case test.
patched: 13.4µs
unpatched: 21.6µs
Vitaly's pthread_mmap microbenchmark with 8 threads (on four cores),
nrounds = 100, 256M data
patched: 1.1 seconds or so
unpatched: 1.9 seconds or so
The sleepup on Vitaly's test appearss to be because it spends a lot
of time blocked on mmap_sem, and this patch avoids sending IPIs to
blocked CPUs.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andrew Banman <abanman@sgi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dimitri Sivanich <sivanich@sgi.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Mike Travis <travis@sgi.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/ddf2c92962339f4ba39d8fc41b853936ec0b44f1.1498751203.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-06-29 15:53:17 +00:00
|
|
|
|
2019-04-25 23:01:43 +00:00
|
|
|
put_flush_tlb_info();
|
2017-05-28 17:00:12 +00:00
|
|
|
put_cpu();
|
2008-03-03 17:12:54 +00:00
|
|
|
}
|
|
|
|
|
2017-05-28 17:00:10 +00:00
|
|
|
|
2008-03-03 17:12:54 +00:00
|
|
|
static void do_flush_tlb_all(void *info)
|
|
|
|
{
|
2014-01-21 22:33:16 +00:00
|
|
|
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
|
2008-03-03 17:12:54 +00:00
|
|
|
__flush_tlb_all();
|
|
|
|
}
|
|
|
|
|
|
|
|
void flush_tlb_all(void)
|
|
|
|
{
|
2014-01-21 22:33:16 +00:00
|
|
|
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
|
2008-05-09 07:39:44 +00:00
|
|
|
on_each_cpu(do_flush_tlb_all, NULL, 1);
|
2008-03-03 17:12:54 +00:00
|
|
|
}
|
2012-06-28 01:02:20 +00:00
|
|
|
|
2012-06-28 01:02:24 +00:00
|
|
|
static void do_kernel_range_flush(void *info)
|
|
|
|
{
|
|
|
|
struct flush_tlb_info *f = info;
|
|
|
|
unsigned long addr;
|
|
|
|
|
|
|
|
/* flush range by one by one 'invlpg' */
|
2017-05-28 17:00:10 +00:00
|
|
|
for (addr = f->start; addr < f->end; addr += PAGE_SIZE)
|
2020-04-21 09:20:35 +00:00
|
|
|
flush_tlb_one_kernel(addr);
|
2012-06-28 01:02:24 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void flush_tlb_kernel_range(unsigned long start, unsigned long end)
|
|
|
|
{
|
|
|
|
/* Balance as user space task's flush, a bit conservative */
|
x86/mm: Rip out complicated, out-of-date, buggy TLB flushing
I think the flush_tlb_mm_range() code that tries to tune the
flush sizes based on the CPU needs to get ripped out for
several reasons:
1. It is obviously buggy. It uses mm->total_vm to judge the
task's footprint in the TLB. It should certainly be using
some measure of RSS, *NOT* ->total_vm since only resident
memory can populate the TLB.
2. Haswell, and several other CPUs are missing from the
intel_tlb_flushall_shift_set() function. Thus, it has been
demonstrated to bitrot quickly in practice.
3. It is plain wrong in my vm:
[ 0.037444] Last level iTLB entries: 4KB 0, 2MB 0, 4MB 0
[ 0.037444] Last level dTLB entries: 4KB 0, 2MB 0, 4MB 0
[ 0.037444] tlb_flushall_shift: 6
Which leads to it to never use invlpg.
4. The assumptions about TLB refill costs are wrong:
http://lkml.kernel.org/r/1337782555-8088-3-git-send-email-alex.shi@intel.com
(more on this in later patches)
5. I can not reproduce the original data: https://lkml.org/lkml/2012/5/17/59
I believe the sample times were too short. Running the
benchmark in a loop yields times that vary quite a bit.
Note that this leaves us with a static ceiling of 1 page. This
is a conservative, dumb setting, and will be revised in a later
patch.
This also removes the code which attempts to predict whether we
are flushing data or instructions. We expect instruction flushes
to be relatively rare and not worth tuning for explicitly.
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Link: http://lkml.kernel.org/r/20140731154055.ABC88E89@viggo.jf.intel.com
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
2014-07-31 15:40:55 +00:00
|
|
|
if (end == TLB_FLUSH_ALL ||
|
2017-05-28 17:00:16 +00:00
|
|
|
(end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) {
|
2012-06-28 01:02:24 +00:00
|
|
|
on_each_cpu(do_flush_tlb_all, NULL, 1);
|
x86/mm: Rip out complicated, out-of-date, buggy TLB flushing
I think the flush_tlb_mm_range() code that tries to tune the
flush sizes based on the CPU needs to get ripped out for
several reasons:
1. It is obviously buggy. It uses mm->total_vm to judge the
task's footprint in the TLB. It should certainly be using
some measure of RSS, *NOT* ->total_vm since only resident
memory can populate the TLB.
2. Haswell, and several other CPUs are missing from the
intel_tlb_flushall_shift_set() function. Thus, it has been
demonstrated to bitrot quickly in practice.
3. It is plain wrong in my vm:
[ 0.037444] Last level iTLB entries: 4KB 0, 2MB 0, 4MB 0
[ 0.037444] Last level dTLB entries: 4KB 0, 2MB 0, 4MB 0
[ 0.037444] tlb_flushall_shift: 6
Which leads to it to never use invlpg.
4. The assumptions about TLB refill costs are wrong:
http://lkml.kernel.org/r/1337782555-8088-3-git-send-email-alex.shi@intel.com
(more on this in later patches)
5. I can not reproduce the original data: https://lkml.org/lkml/2012/5/17/59
I believe the sample times were too short. Running the
benchmark in a loop yields times that vary quite a bit.
Note that this leaves us with a static ceiling of 1 page. This
is a conservative, dumb setting, and will be revised in a later
patch.
This also removes the code which attempts to predict whether we
are flushing data or instructions. We expect instruction flushes
to be relatively rare and not worth tuning for explicitly.
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Link: http://lkml.kernel.org/r/20140731154055.ABC88E89@viggo.jf.intel.com
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
2014-07-31 15:40:55 +00:00
|
|
|
} else {
|
2019-04-25 23:01:43 +00:00
|
|
|
struct flush_tlb_info *info;
|
|
|
|
|
|
|
|
preempt_disable();
|
|
|
|
info = get_flush_tlb_info(NULL, start, end, 0, false, 0);
|
|
|
|
|
|
|
|
on_each_cpu(do_kernel_range_flush, info, 1);
|
|
|
|
|
|
|
|
put_flush_tlb_info();
|
|
|
|
preempt_enable();
|
2012-06-28 01:02:24 +00:00
|
|
|
}
|
|
|
|
}
|
2014-07-31 15:41:01 +00:00
|
|
|
|
2020-04-21 09:20:28 +00:00
|
|
|
/*
|
|
|
|
* This can be used from process context to figure out what the value of
|
|
|
|
* CR3 is without needing to do a (slow) __read_cr3().
|
|
|
|
*
|
|
|
|
* It's intended to be used for code like KVM that sneakily changes CR3
|
|
|
|
* and needs to restore it. It needs to be used very carefully.
|
|
|
|
*/
|
|
|
|
unsigned long __get_current_cr3_fast(void)
|
|
|
|
{
|
|
|
|
unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd,
|
|
|
|
this_cpu_read(cpu_tlbstate.loaded_mm_asid));
|
|
|
|
|
|
|
|
/* For now, be very restrictive about when this can be called. */
|
|
|
|
VM_WARN_ON(in_nmi() || preemptible());
|
|
|
|
|
|
|
|
VM_BUG_ON(cr3 != __read_cr3());
|
|
|
|
return cr3;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(__get_current_cr3_fast);
|
|
|
|
|
2020-04-21 09:20:35 +00:00
|
|
|
/*
|
|
|
|
* Flush one page in the kernel mapping
|
|
|
|
*/
|
|
|
|
void flush_tlb_one_kernel(unsigned long addr)
|
|
|
|
{
|
|
|
|
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If PTI is off, then __flush_tlb_one_user() is just INVLPG or its
|
|
|
|
* paravirt equivalent. Even with PCID, this is sufficient: we only
|
|
|
|
* use PCID if we also use global PTEs for the kernel mapping, and
|
|
|
|
* INVLPG flushes global translations across all address spaces.
|
|
|
|
*
|
|
|
|
* If PTI is on, then the kernel is mapped with non-global PTEs, and
|
|
|
|
* __flush_tlb_one_user() will flush the given address for the current
|
|
|
|
* kernel address space and for its usermode counterpart, but it does
|
|
|
|
* not flush it for other address spaces.
|
|
|
|
*/
|
|
|
|
flush_tlb_one_user(addr);
|
|
|
|
|
|
|
|
if (!static_cpu_has(X86_FEATURE_PTI))
|
|
|
|
return;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* See above. We need to propagate the flush to all other address
|
|
|
|
* spaces. In principle, we only need to propagate it to kernelmode
|
|
|
|
* address spaces, but the extra bookkeeping we would need is not
|
|
|
|
* worth it.
|
|
|
|
*/
|
|
|
|
this_cpu_write(cpu_tlbstate.invalidate_other, true);
|
|
|
|
}
|
|
|
|
|
2020-04-21 09:20:34 +00:00
|
|
|
/*
|
|
|
|
* Flush one page in the user mapping
|
|
|
|
*/
|
|
|
|
STATIC_NOPV void native_flush_tlb_one_user(unsigned long addr)
|
|
|
|
{
|
|
|
|
u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
|
|
|
|
|
|
|
|
asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
|
|
|
|
|
|
|
|
if (!static_cpu_has(X86_FEATURE_PTI))
|
|
|
|
return;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Some platforms #GP if we call invpcid(type=1/2) before CR4.PCIDE=1.
|
|
|
|
* Just use invalidate_user_asid() in case we are called early.
|
|
|
|
*/
|
|
|
|
if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE))
|
|
|
|
invalidate_user_asid(loaded_mm_asid);
|
|
|
|
else
|
|
|
|
invpcid_flush_one(user_pcid(loaded_mm_asid), addr);
|
|
|
|
}
|
|
|
|
|
|
|
|
void flush_tlb_one_user(unsigned long addr)
|
|
|
|
{
|
|
|
|
__flush_tlb_one_user(addr);
|
|
|
|
}
|
|
|
|
|
2020-04-21 09:20:33 +00:00
|
|
|
/*
|
|
|
|
* Flush everything
|
|
|
|
*/
|
|
|
|
STATIC_NOPV void native_flush_tlb_global(void)
|
|
|
|
{
|
|
|
|
unsigned long cr4, flags;
|
|
|
|
|
|
|
|
if (static_cpu_has(X86_FEATURE_INVPCID)) {
|
|
|
|
/*
|
|
|
|
* Using INVPCID is considerably faster than a pair of writes
|
|
|
|
* to CR4 sandwiched inside an IRQ flag save/restore.
|
|
|
|
*
|
|
|
|
* Note, this works with CR4.PCIDE=0 or 1.
|
|
|
|
*/
|
|
|
|
invpcid_flush_all();
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Read-modify-write to CR4 - protect it from preemption and
|
|
|
|
* from interrupts. (Use the raw variant because this code can
|
|
|
|
* be called from deep inside debugging code.)
|
|
|
|
*/
|
|
|
|
raw_local_irq_save(flags);
|
|
|
|
|
|
|
|
cr4 = this_cpu_read(cpu_tlbstate.cr4);
|
|
|
|
/* toggle PGE */
|
|
|
|
native_write_cr4(cr4 ^ X86_CR4_PGE);
|
|
|
|
/* write old PGE again and flush TLBs */
|
|
|
|
native_write_cr4(cr4);
|
|
|
|
|
|
|
|
raw_local_irq_restore(flags);
|
|
|
|
}
|
|
|
|
|
2020-04-21 09:20:32 +00:00
|
|
|
/*
|
|
|
|
* Flush the entire current user mapping
|
|
|
|
*/
|
|
|
|
STATIC_NOPV void native_flush_tlb_local(void)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* Preemption or interrupts must be disabled to protect the access
|
|
|
|
* to the per CPU variable and to prevent being preempted between
|
|
|
|
* read_cr3() and write_cr3().
|
|
|
|
*/
|
|
|
|
WARN_ON_ONCE(preemptible());
|
|
|
|
|
|
|
|
invalidate_user_asid(this_cpu_read(cpu_tlbstate.loaded_mm_asid));
|
|
|
|
|
|
|
|
/* If current->mm == NULL then the read_cr3() "borrows" an mm */
|
|
|
|
native_write_cr3(__native_read_cr3());
|
|
|
|
}
|
|
|
|
|
|
|
|
void flush_tlb_local(void)
|
|
|
|
{
|
|
|
|
__flush_tlb_local();
|
|
|
|
}
|
2020-04-21 09:20:37 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Flush everything
|
|
|
|
*/
|
|
|
|
void __flush_tlb_all(void)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* This is to catch users with enabled preemption and the PGE feature
|
|
|
|
* and don't trigger the warning in __native_flush_tlb().
|
|
|
|
*/
|
|
|
|
VM_WARN_ON_ONCE(preemptible());
|
|
|
|
|
|
|
|
if (boot_cpu_has(X86_FEATURE_PGE)) {
|
|
|
|
__flush_tlb_global();
|
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* !PGE -> !PCID (setup_pcid()), thus every flush is total.
|
|
|
|
*/
|
|
|
|
flush_tlb_local();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(__flush_tlb_all);
|
2020-04-21 09:20:32 +00:00
|
|
|
|
2019-04-25 23:01:43 +00:00
|
|
|
/*
|
|
|
|
* arch_tlbbatch_flush() performs a full TLB flush regardless of the active mm.
|
|
|
|
* This means that the 'struct flush_tlb_info' that describes which mappings to
|
|
|
|
* flush is actually fixed. We therefore set a single fixed struct and use it in
|
|
|
|
* arch_tlbbatch_flush().
|
|
|
|
*/
|
|
|
|
static const struct flush_tlb_info full_flush_tlb_info = {
|
|
|
|
.mm = NULL,
|
|
|
|
.start = 0,
|
|
|
|
.end = TLB_FLUSH_ALL,
|
|
|
|
};
|
|
|
|
|
2017-05-22 22:30:03 +00:00
|
|
|
void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
|
|
|
|
{
|
|
|
|
int cpu = get_cpu();
|
|
|
|
|
2017-06-29 15:53:13 +00:00
|
|
|
if (cpumask_test_cpu(cpu, &batch->cpumask)) {
|
2019-04-25 23:01:43 +00:00
|
|
|
lockdep_assert_irqs_enabled();
|
2017-06-29 15:53:13 +00:00
|
|
|
local_irq_disable();
|
2019-04-25 23:01:43 +00:00
|
|
|
flush_tlb_func_local(&full_flush_tlb_info, TLB_LOCAL_SHOOTDOWN);
|
2017-06-29 15:53:13 +00:00
|
|
|
local_irq_enable();
|
|
|
|
}
|
|
|
|
|
2017-05-22 22:30:03 +00:00
|
|
|
if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids)
|
2019-04-25 23:01:43 +00:00
|
|
|
flush_tlb_others(&batch->cpumask, &full_flush_tlb_info);
|
x86/mm: Rework lazy TLB mode and TLB freshness tracking
x86's lazy TLB mode used to be fairly weak -- it would switch to
init_mm the first time it tried to flush a lazy TLB. This meant an
unnecessary CR3 write and, if the flush was remote, an unnecessary
IPI.
Rewrite it entirely. When we enter lazy mode, we simply remove the
CPU from mm_cpumask. This means that we need a way to figure out
whether we've missed a flush when we switch back out of lazy mode.
I use the tlb_gen machinery to track whether a context is up to
date.
Note to reviewers: this patch, my itself, looks a bit odd. I'm
using an array of length 1 containing (ctx_id, tlb_gen) rather than
just storing tlb_gen, and making it at array isn't necessary yet.
I'm doing this because the next few patches add PCID support, and,
with PCID, we need ctx_id, and the array will end up with a length
greater than 1. Making it an array now means that there will be
less churn and therefore less stress on your eyeballs.
NB: This is dubious but, AFAICT, still correct on Xen and UV.
xen_exit_mmap() uses mm_cpumask() for nefarious purposes and this
patch changes the way that mm_cpumask() works. This should be okay,
since Xen *also* iterates all online CPUs to find all the CPUs it
needs to twiddle.
The UV tlbflush code is rather dated and should be changed.
Here are some benchmark results, done on a Skylake laptop at 2.3 GHz
(turbo off, intel_pstate requesting max performance) under KVM with
the guest using idle=poll (to avoid artifacts when bouncing between
CPUs). I haven't done any real statistics here -- I just ran them
in a loop and picked the fastest results that didn't look like
outliers. Unpatched means commit a4eb8b993554, so all the
bookkeeping overhead is gone.
MADV_DONTNEED; touch the page; switch CPUs using sched_setaffinity. In
an unpatched kernel, MADV_DONTNEED will send an IPI to the previous CPU.
This is intended to be a nearly worst-case test.
patched: 13.4µs
unpatched: 21.6µs
Vitaly's pthread_mmap microbenchmark with 8 threads (on four cores),
nrounds = 100, 256M data
patched: 1.1 seconds or so
unpatched: 1.9 seconds or so
The sleepup on Vitaly's test appearss to be because it spends a lot
of time blocked on mmap_sem, and this patch avoids sending IPIs to
blocked CPUs.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andrew Banman <abanman@sgi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dimitri Sivanich <sivanich@sgi.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Mike Travis <travis@sgi.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/ddf2c92962339f4ba39d8fc41b853936ec0b44f1.1498751203.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-06-29 15:53:17 +00:00
|
|
|
|
2017-05-22 22:30:03 +00:00
|
|
|
cpumask_clear(&batch->cpumask);
|
|
|
|
|
|
|
|
put_cpu();
|
|
|
|
}
|
|
|
|
|
2020-04-21 09:20:40 +00:00
|
|
|
/*
|
|
|
|
* Blindly accessing user memory from NMI context can be dangerous
|
|
|
|
* if we're in the middle of switching the current user task or
|
|
|
|
* switching the loaded mm. It can also be dangerous if we
|
|
|
|
* interrupted some kernel code that was temporarily using a
|
|
|
|
* different mm.
|
|
|
|
*/
|
|
|
|
bool nmi_uaccess_okay(void)
|
|
|
|
{
|
|
|
|
struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
|
|
|
|
struct mm_struct *current_mm = current->mm;
|
|
|
|
|
|
|
|
VM_WARN_ON_ONCE(!loaded_mm);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The condition we want to check is
|
|
|
|
* current_mm->pgd == __va(read_cr3_pa()). This may be slow, though,
|
|
|
|
* if we're running in a VM with shadow paging, and nmi_uaccess_okay()
|
|
|
|
* is supposed to be reasonably fast.
|
|
|
|
*
|
|
|
|
* Instead, we check the almost equivalent but somewhat conservative
|
|
|
|
* condition below, and we rely on the fact that switch_mm_irqs_off()
|
|
|
|
* sets loaded_mm to LOADED_MM_SWITCHING before writing to CR3.
|
|
|
|
*/
|
|
|
|
if (loaded_mm != current_mm)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
VM_WARN_ON_ONCE(current_mm->pgd != __va(read_cr3_pa()));
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2014-07-31 15:41:01 +00:00
|
|
|
static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
|
|
|
|
size_t count, loff_t *ppos)
|
|
|
|
{
|
|
|
|
char buf[32];
|
|
|
|
unsigned int len;
|
|
|
|
|
|
|
|
len = sprintf(buf, "%ld\n", tlb_single_page_flush_ceiling);
|
|
|
|
return simple_read_from_buffer(user_buf, count, ppos, buf, len);
|
|
|
|
}
|
|
|
|
|
|
|
|
static ssize_t tlbflush_write_file(struct file *file,
|
|
|
|
const char __user *user_buf, size_t count, loff_t *ppos)
|
|
|
|
{
|
|
|
|
char buf[32];
|
|
|
|
ssize_t len;
|
|
|
|
int ceiling;
|
|
|
|
|
|
|
|
len = min(count, sizeof(buf) - 1);
|
|
|
|
if (copy_from_user(buf, user_buf, len))
|
|
|
|
return -EFAULT;
|
|
|
|
|
|
|
|
buf[len] = '\0';
|
|
|
|
if (kstrtoint(buf, 0, &ceiling))
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
if (ceiling < 0)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
tlb_single_page_flush_ceiling = ceiling;
|
|
|
|
return count;
|
|
|
|
}
|
|
|
|
|
|
|
|
static const struct file_operations fops_tlbflush = {
|
|
|
|
.read = tlbflush_read_file,
|
|
|
|
.write = tlbflush_write_file,
|
|
|
|
.llseek = default_llseek,
|
|
|
|
};
|
|
|
|
|
|
|
|
static int __init create_tlb_single_page_flush_ceiling(void)
|
|
|
|
{
|
|
|
|
debugfs_create_file("tlb_single_page_flush_ceiling", S_IRUSR | S_IWUSR,
|
|
|
|
arch_debugfs_dir, NULL, &fops_tlbflush);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
late_initcall(create_tlb_single_page_flush_ceiling);
|