linux-stable/arch/x86/kvm/mmu/tdp_iter.h
Sean Christopherson bb95dfb9e2 KVM: x86/mmu: Defer TLB flush to caller when freeing TDP MMU shadow pages
Defer TLB flushes to the caller when freeing TDP MMU shadow pages instead
of immediately flushing.  Because the shadow pages are freed in an RCU
callback, so long as at least one CPU holds RCU, all CPUs are protected.
For vCPUs running in the guest, i.e. consuming TLB entries, KVM only
needs to ensure the caller services the pending TLB flush before dropping
its RCU protections.  I.e. use the caller's RCU as a proxy for all vCPUs
running in the guest.

Deferring the flushes allows batching flushes, e.g. when installing a
1GB hugepage and zapping a pile of SPs.  And when zapping an entire root,
deferring flushes allows skipping the flush entirely (because flushes are
not needed in that case).

Avoiding flushes when zapping an entire root is especially important as
synchronizing with other CPUs via IPI after zapping every shadow page can
cause significant performance issues for large VMs.  The issue is
exacerbated by KVM zapping entire top-level entries without dropping
RCU protection, which can lead to RCU stalls even when zapping roots
backing relatively "small" amounts of guest memory, e.g. 2TB.  Removing
the IPI bottleneck largely mitigates the RCU issues, though it's likely
still a problem for 5-level paging.  A future patch will further address
the problem by zapping roots in multiple passes to avoid holding RCU for
an extended duration.

Reviewed-by: Ben Gardon <bgardon@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Message-Id: <20220226001546.360188-20-seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2022-03-08 09:31:57 -05:00

88 lines
2.7 KiB
C

// SPDX-License-Identifier: GPL-2.0
#ifndef __KVM_X86_MMU_TDP_ITER_H
#define __KVM_X86_MMU_TDP_ITER_H
#include <linux/kvm_host.h>
#include "mmu.h"
/*
* TDP MMU SPTEs are RCU protected to allow paging structures (non-leaf SPTEs)
* to be zapped while holding mmu_lock for read, and to allow TLB flushes to be
* batched without having to collect the list of zapped SPs. Flows that can
* remove SPs must service pending TLB flushes prior to dropping RCU protection.
*/
/*
 * Load the SPTE that @sptep points at.  The RCU-annotated pointer is first
 * resolved with rcu_dereference(), and the 64-bit value is then fetched with
 * READ_ONCE().
 */
static inline u64 kvm_tdp_mmu_read_spte(tdp_ptep_t sptep)
{
	u64 *raw_sptep = rcu_dereference(sptep);

	return READ_ONCE(*raw_sptep);
}
/*
 * Store @val into the SPTE that @sptep points at.  The RCU-annotated pointer
 * is first resolved with rcu_dereference(), and the 64-bit value is then
 * written with WRITE_ONCE().
 */
static inline void kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 val)
{
	u64 *raw_sptep = rcu_dereference(sptep);

	WRITE_ONCE(*raw_sptep, val);
}
/**
 * struct tdp_iter - state of a pre-order walk over a TDP paging structure
 * @next_last_level_gfn: GFN whose mapping the iterator is traversing the
 *                       paging structure towards.
 * @yielded_gfn: value of @next_last_level_gfn when the thread last yielded.
 *               Yielding only when @next_last_level_gfn != @yielded_gfn
 *               helps ensure forward progress.
 * @pt_path: pointers to the page tables traversed to reach the current SPTE.
 * @sptep: pointer to the current SPTE.
 * @gfn: lowest GFN mapped by the current SPTE.
 * @root_level: level of the root page that was given to the iterator.
 * @min_level: lowest level the iterator should traverse to.
 * @level: the iterator's current level within the paging structure.
 * @as_id: address space ID, i.e. SMM vs. regular.
 * @old_spte: snapshot of the value at @sptep.
 * @valid: whether the iterator state is valid; false once the iterator has
 *         walked off the end of the paging structure.
 * @yielded: true if KVM dropped mmu_lock and yielded in the middle of a walk,
 *           in which case tdp_iter_next() needs to restart the walk at the
 *           root level instead of advancing to the next entry.
 */
struct tdp_iter {
	gfn_t next_last_level_gfn;
	gfn_t yielded_gfn;
	tdp_ptep_t pt_path[PT64_ROOT_MAX_LEVEL];
	tdp_ptep_t sptep;
	gfn_t gfn;
	int root_level;
	int min_level;
	int level;
	int as_id;
	u64 old_spte;
	bool valid;
	bool yielded;
};
/*
 * Iterates over every SPTE mapping the GFN range [start, end) in a
 * preorder traversal.
 *
 * @iter:      a struct tdp_iter lvalue (not a pointer); its address is handed
 *             to tdp_iter_start() and tdp_iter_next().
 * @root:      root page passed through to tdp_iter_start().
 * @min_level: lowest level to walk down to, passed to tdp_iter_start().
 * @start:     first GFN of the range, passed to tdp_iter_start().
 * @end:       exclusive upper bound; the loop stops once iter.gfn >= end or
 *             the iterator becomes invalid.
 *
 * Every use of @iter and @end is parenthesized so that arguments such as
 * "*iterp" or "cond ? a : b" expand with the intended precedence.  Note that
 * @iter and @end are evaluated on every loop iteration, so they should not
 * have side effects.
 */
#define for_each_tdp_pte_min_level(iter, root, min_level, start, end) \
	for (tdp_iter_start(&(iter), root, min_level, start); \
	     (iter).valid && (iter).gfn < (end); \
	     tdp_iter_next(&(iter)))

/* Same as for_each_tdp_pte_min_level(), walking down to PG_LEVEL_4K. */
#define for_each_tdp_pte(iter, root, start, end) \
	for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end)
/*
 * NOTE(review): defined outside this header.  Judging by the name and
 * signature, this presumably returns the child page table referenced by @pte
 * at @level -- confirm against the definition.
 */
tdp_ptep_t spte_to_child_pt(u64 pte, int level);
/*
 * Begin a walk with @iter.  for_each_tdp_pte_min_level() calls this as its
 * loop initializer, passing the root page, the lowest level to traverse to,
 * and the first GFN of the range as @next_last_level_gfn.
 */
void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root,
int min_level, gfn_t next_last_level_gfn);
/*
 * Step @iter; used as the loop increment of for_each_tdp_pte_min_level().
 * Per the description of tdp_iter.yielded, this needs to restart the walk at
 * the root level, instead of advancing to the next entry, if the walk yielded.
 */
void tdp_iter_next(struct tdp_iter *iter);
/*
 * NOTE(review): defined outside this header.  Presumably restarts the walk
 * tracked by @iter -- confirm the exact restart point against the definition.
 */
void tdp_iter_restart(struct tdp_iter *iter);
#endif /* __KVM_X86_MMU_TDP_ITER_H */