Merge branch 'kvm-redo-enable-virt' into HEAD

Register KVM's cpuhp and syscore callbacks when enabling virtualization in
hardware, as the sole purpose of said callbacks is to disable and re-enable
virtualization as needed.

The primary motivation for this series is to simplify dealing with enabling
virtualization for Intel's TDX, which needs to enable virtualization
when kvm-intel.ko is loaded, i.e. long before the first VM is created.

That said, this is a nice cleanup on its own.  By registering the callbacks
on-demand, the callbacks themselves don't need to check kvm_usage_count,
because their very existence implies a non-zero count.

Patch 1 (re)adds a dedicated lock for kvm_usage_count.  This avoids a
lock ordering issue between cpus_read_lock() and kvm_lock.  The lock
ordering issue still exists in very rare cases, and will be fixed for
good by switching vm_list to an (S)RCU-protected list.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
This commit is contained in:
Paolo Bonzini 2024-09-12 11:13:05 -04:00
commit c09dd2bb57
18 changed files with 251 additions and 198 deletions

View file

@ -2648,6 +2648,23 @@
Default is Y (on).
kvm.enable_virt_at_load=[KVM,ARM64,LOONGARCH,MIPS,RISCV,X86]
If enabled, KVM will enable virtualization in hardware
when KVM is loaded, and disable virtualization when KVM
is unloaded (if KVM is built as a module).
If disabled, KVM will dynamically enable and disable
virtualization on-demand when creating and destroying
VMs, i.e. on the 0=>1 and 1=>0 transitions of the
number of VMs.
Enabling virtualization at module load avoids potential
latency for creation of the 0=>1 VM, as KVM serializes
virtualization enabling across all online CPUs. The
"cost" of enabling virtualization when KVM is loaded
is that doing so may interfere with using out-of-tree
hypervisors that want to "own" virtualization hardware.
kvm.enable_vmware_backdoor=[KVM] Support VMware backdoor PV interface.
Default is false (don't support).

View file

@ -11,6 +11,8 @@ The acquisition orders for mutexes are as follows:
- cpus_read_lock() is taken outside kvm_lock
- kvm_usage_lock is taken outside cpus_read_lock()
- kvm->lock is taken outside vcpu->mutex
- kvm->lock is taken outside kvm->slots_lock and kvm->irq_lock
@ -24,6 +26,12 @@ The acquisition orders for mutexes are as follows:
are taken on the waiting side when modifying memslots, so MMU notifiers
must not take either kvm->slots_lock or kvm->slots_arch_lock.
cpus_read_lock() vs kvm_lock:
- Taking cpus_read_lock() outside of kvm_lock is problematic, despite that
being the official ordering, as it is quite easy to unknowingly trigger
cpus_read_lock() while holding kvm_lock. Use caution when walking vm_list,
e.g. avoid complex operations when possible.
For SRCU:
- ``synchronize_srcu(&kvm->srcu)`` is called inside critical sections
@ -227,10 +235,16 @@ time it will be set using the Dirty tracking mechanism described above.
:Type: mutex
:Arch: any
:Protects: - vm_list
- kvm_usage_count
``kvm_usage_lock``
^^^^^^^^^^^^^^^^^^
:Type: mutex
:Arch: any
:Protects: - kvm_usage_count
- hardware virtualization enable/disable
:Comment: KVM also disables CPU hotplug via cpus_read_lock() during
enable/disable.
:Comment: Exists to allow taking cpus_read_lock() while kvm_usage_count is
protected, which simplifies the virtualization enabling logic.
``kvm->mn_invalidate_lock``
^^^^^^^^^^^^^^^^^^^^^^^^^^^
@ -290,11 +304,12 @@ time it will be set using the Dirty tracking mechanism described above.
wakeup.
``vendor_module_lock``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
^^^^^^^^^^^^^^^^^^^^^^
:Type: mutex
:Arch: x86
:Protects: loading a vendor module (kvm_amd or kvm_intel)
:Comment: Exists because using kvm_lock leads to deadlock. cpu_hotplug_lock is
taken outside of kvm_lock, e.g. in KVM's CPU online/offline callbacks, and
many operations need to take cpu_hotplug_lock when loading a vendor module,
e.g. updating static calls.
:Comment: Exists because using kvm_lock leads to deadlock. kvm_lock is taken
in notifiers, e.g. __kvmclock_cpufreq_notifier(), that may be invoked while
cpu_hotplug_lock is held, e.g. from cpufreq_boost_trigger_state(), and many
operations need to take cpu_hotplug_lock when loading a vendor module, e.g.
updating static calls.

View file

@ -2164,7 +2164,7 @@ static void cpu_hyp_uninit(void *discard)
}
}
int kvm_arch_hardware_enable(void)
int kvm_arch_enable_virtualization_cpu(void)
{
/*
* Most calls to this function are made with migration
@ -2184,7 +2184,7 @@ int kvm_arch_hardware_enable(void)
return 0;
}
void kvm_arch_hardware_disable(void)
void kvm_arch_disable_virtualization_cpu(void)
{
kvm_timer_cpu_down();
kvm_vgic_cpu_down();
@ -2380,7 +2380,7 @@ static int __init do_pkvm_init(u32 hyp_va_bits)
/*
* The stub hypercalls are now disabled, so set our local flag to
* prevent a later re-init attempt in kvm_arch_hardware_enable().
* prevent a later re-init attempt in kvm_arch_enable_virtualization_cpu().
*/
__this_cpu_write(kvm_hyp_initialized, 1);
preempt_enable();

View file

@ -261,7 +261,7 @@ long kvm_arch_dev_ioctl(struct file *filp,
return -ENOIOCTLCMD;
}
int kvm_arch_hardware_enable(void)
int kvm_arch_enable_virtualization_cpu(void)
{
unsigned long env, gcfg = 0;
@ -300,7 +300,7 @@ int kvm_arch_hardware_enable(void)
return 0;
}
void kvm_arch_hardware_disable(void)
void kvm_arch_disable_virtualization_cpu(void)
{
write_csr_gcfg(0);
write_csr_gstat(0);

View file

@ -728,8 +728,8 @@ struct kvm_mips_callbacks {
int (*handle_fpe)(struct kvm_vcpu *vcpu);
int (*handle_msa_disabled)(struct kvm_vcpu *vcpu);
int (*handle_guest_exit)(struct kvm_vcpu *vcpu);
int (*hardware_enable)(void);
void (*hardware_disable)(void);
int (*enable_virtualization_cpu)(void);
void (*disable_virtualization_cpu)(void);
int (*check_extension)(struct kvm *kvm, long ext);
int (*vcpu_init)(struct kvm_vcpu *vcpu);
void (*vcpu_uninit)(struct kvm_vcpu *vcpu);

View file

@ -125,14 +125,14 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
return 1;
}
int kvm_arch_hardware_enable(void)
int kvm_arch_enable_virtualization_cpu(void)
{
return kvm_mips_callbacks->hardware_enable();
return kvm_mips_callbacks->enable_virtualization_cpu();
}
void kvm_arch_hardware_disable(void)
void kvm_arch_disable_virtualization_cpu(void)
{
kvm_mips_callbacks->hardware_disable();
kvm_mips_callbacks->disable_virtualization_cpu();
}
int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)

View file

@ -2869,7 +2869,7 @@ static unsigned int kvm_vz_resize_guest_vtlb(unsigned int size)
return ret + 1;
}
static int kvm_vz_hardware_enable(void)
static int kvm_vz_enable_virtualization_cpu(void)
{
unsigned int mmu_size, guest_mmu_size, ftlb_size;
u64 guest_cvmctl, cvmvmconfig;
@ -2983,7 +2983,7 @@ static int kvm_vz_hardware_enable(void)
return 0;
}
static void kvm_vz_hardware_disable(void)
static void kvm_vz_disable_virtualization_cpu(void)
{
u64 cvmvmconfig;
unsigned int mmu_size;
@ -3280,8 +3280,8 @@ static struct kvm_mips_callbacks kvm_vz_callbacks = {
.handle_msa_disabled = kvm_trap_vz_handle_msa_disabled,
.handle_guest_exit = kvm_trap_vz_handle_guest_exit,
.hardware_enable = kvm_vz_hardware_enable,
.hardware_disable = kvm_vz_hardware_disable,
.enable_virtualization_cpu = kvm_vz_enable_virtualization_cpu,
.disable_virtualization_cpu = kvm_vz_disable_virtualization_cpu,
.check_extension = kvm_vz_check_extension,
.vcpu_init = kvm_vz_vcpu_init,
.vcpu_uninit = kvm_vz_vcpu_uninit,

View file

@ -20,7 +20,7 @@ long kvm_arch_dev_ioctl(struct file *filp,
return -EINVAL;
}
int kvm_arch_hardware_enable(void)
int kvm_arch_enable_virtualization_cpu(void)
{
csr_write(CSR_HEDELEG, KVM_HEDELEG_DEFAULT);
csr_write(CSR_HIDELEG, KVM_HIDELEG_DEFAULT);
@ -35,7 +35,7 @@ int kvm_arch_hardware_enable(void)
return 0;
}
void kvm_arch_hardware_disable(void)
void kvm_arch_disable_virtualization_cpu(void)
{
kvm_riscv_aia_disable();

View file

@ -14,8 +14,8 @@ BUILD_BUG_ON(1)
* be __static_call_return0.
*/
KVM_X86_OP(check_processor_compatibility)
KVM_X86_OP(hardware_enable)
KVM_X86_OP(hardware_disable)
KVM_X86_OP(enable_virtualization_cpu)
KVM_X86_OP(disable_virtualization_cpu)
KVM_X86_OP(hardware_unsetup)
KVM_X86_OP(has_emulated_msr)
KVM_X86_OP(vcpu_after_set_cpuid)

View file

@ -36,6 +36,7 @@
#include <asm/kvm_page_track.h>
#include <asm/kvm_vcpu_regs.h>
#include <asm/hyperv-tlfs.h>
#include <asm/reboot.h>
#define __KVM_HAVE_ARCH_VCPU_DEBUGFS
@ -1629,8 +1630,10 @@ struct kvm_x86_ops {
int (*check_processor_compatibility)(void);
int (*hardware_enable)(void);
void (*hardware_disable)(void);
int (*enable_virtualization_cpu)(void);
void (*disable_virtualization_cpu)(void);
cpu_emergency_virt_cb *emergency_disable_virtualization_cpu;
void (*hardware_unsetup)(void);
bool (*has_emulated_msr)(struct kvm *kvm, u32 index);
void (*vcpu_after_set_cpuid)(struct kvm_vcpu *vcpu);

View file

@ -25,8 +25,8 @@ void __noreturn machine_real_restart(unsigned int type);
#define MRR_BIOS 0
#define MRR_APM 1
#if IS_ENABLED(CONFIG_KVM_INTEL) || IS_ENABLED(CONFIG_KVM_AMD)
typedef void (cpu_emergency_virt_cb)(void);
#if IS_ENABLED(CONFIG_KVM_INTEL) || IS_ENABLED(CONFIG_KVM_AMD)
void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback);
void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback);
void cpu_emergency_disable_virtualization(void);

View file

@ -592,14 +592,14 @@ static inline void kvm_cpu_svm_disable(void)
}
}
static void svm_emergency_disable(void)
static void svm_emergency_disable_virtualization_cpu(void)
{
kvm_rebooting = true;
kvm_cpu_svm_disable();
}
static void svm_hardware_disable(void)
static void svm_disable_virtualization_cpu(void)
{
/* Make sure we clean up behind us */
if (tsc_scaling)
@ -610,7 +610,7 @@ static void svm_hardware_disable(void)
amd_pmu_disable_virt();
}
static int svm_hardware_enable(void)
static int svm_enable_virtualization_cpu(void)
{
struct svm_cpu_data *sd;
@ -1533,7 +1533,7 @@ static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
* TSC_AUX is always virtualized for SEV-ES guests when the feature is
* available. The user return MSR support is not required in this case
* because TSC_AUX is restored on #VMEXIT from the host save area
* (which has been initialized in svm_hardware_enable()).
* (which has been initialized in svm_enable_virtualization_cpu()).
*/
if (likely(tsc_aux_uret_slot >= 0) &&
(!boot_cpu_has(X86_FEATURE_V_TSC_AUX) || !sev_es_guest(vcpu->kvm)))
@ -3144,7 +3144,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
* feature is available. The user return MSR support is not
* required in this case because TSC_AUX is restored on #VMEXIT
* from the host save area (which has been initialized in
* svm_hardware_enable()).
* svm_enable_virtualization_cpu()).
*/
if (boot_cpu_has(X86_FEATURE_V_TSC_AUX) && sev_es_guest(vcpu->kvm))
break;
@ -4992,8 +4992,9 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.check_processor_compatibility = svm_check_processor_compat,
.hardware_unsetup = svm_hardware_unsetup,
.hardware_enable = svm_hardware_enable,
.hardware_disable = svm_hardware_disable,
.enable_virtualization_cpu = svm_enable_virtualization_cpu,
.disable_virtualization_cpu = svm_disable_virtualization_cpu,
.emergency_disable_virtualization_cpu = svm_emergency_disable_virtualization_cpu,
.has_emulated_msr = svm_has_emulated_msr,
.vcpu_create = svm_vcpu_create,
@ -5425,8 +5426,6 @@ static struct kvm_x86_init_ops svm_init_ops __initdata = {
static void __svm_exit(void)
{
kvm_x86_vendor_exit();
cpu_emergency_unregister_virt_callback(svm_emergency_disable);
}
static int __init svm_init(void)
@ -5442,8 +5441,6 @@ static int __init svm_init(void)
if (r)
return r;
cpu_emergency_register_virt_callback(svm_emergency_disable);
/*
* Common KVM initialization _must_ come last, after this, /dev/kvm is
* exposed to userspace!

View file

@ -23,8 +23,10 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
.hardware_unsetup = vmx_hardware_unsetup,
.hardware_enable = vmx_hardware_enable,
.hardware_disable = vmx_hardware_disable,
.enable_virtualization_cpu = vmx_enable_virtualization_cpu,
.disable_virtualization_cpu = vmx_disable_virtualization_cpu,
.emergency_disable_virtualization_cpu = vmx_emergency_disable_virtualization_cpu,
.has_emulated_msr = vmx_has_emulated_msr,
.vm_size = sizeof(struct kvm_vmx),

View file

@ -755,7 +755,7 @@ static int kvm_cpu_vmxoff(void)
return -EIO;
}
static void vmx_emergency_disable(void)
void vmx_emergency_disable_virtualization_cpu(void)
{
int cpu = raw_smp_processor_id();
struct loaded_vmcs *v;
@ -2844,7 +2844,7 @@ static int kvm_cpu_vmxon(u64 vmxon_pointer)
return -EFAULT;
}
int vmx_hardware_enable(void)
int vmx_enable_virtualization_cpu(void)
{
int cpu = raw_smp_processor_id();
u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
@ -2881,7 +2881,7 @@ static void vmclear_local_loaded_vmcss(void)
__loaded_vmcs_clear(v);
}
void vmx_hardware_disable(void)
void vmx_disable_virtualization_cpu(void)
{
vmclear_local_loaded_vmcss();
@ -8584,8 +8584,6 @@ static void __vmx_exit(void)
{
allow_smaller_maxphyaddr = false;
cpu_emergency_unregister_virt_callback(vmx_emergency_disable);
vmx_cleanup_l1d_flush();
}
@ -8632,8 +8630,6 @@ static int __init vmx_init(void)
pi_init_cpu(cpu);
}
cpu_emergency_register_virt_callback(vmx_emergency_disable);
vmx_check_vmcs12_offsets();
/*

View file

@ -13,8 +13,9 @@ extern struct kvm_x86_init_ops vt_init_ops __initdata;
void vmx_hardware_unsetup(void);
int vmx_check_processor_compat(void);
int vmx_hardware_enable(void);
void vmx_hardware_disable(void);
int vmx_enable_virtualization_cpu(void);
void vmx_disable_virtualization_cpu(void);
void vmx_emergency_disable_virtualization_cpu(void);
int vmx_vm_init(struct kvm *kvm);
void vmx_vm_destroy(struct kvm *kvm);
int vmx_vcpu_precreate(struct kvm *kvm);

View file

@ -355,7 +355,7 @@ static void kvm_on_user_return(struct user_return_notifier *urn)
/*
* Disabling irqs at this point since the following code could be
* interrupted and executed through kvm_arch_hardware_disable()
* interrupted and executed through kvm_arch_disable_virtualization_cpu()
*/
local_irq_save(flags);
if (msrs->registered) {
@ -9753,7 +9753,7 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
guard(mutex)(&vendor_module_lock);
if (kvm_x86_ops.hardware_enable) {
if (kvm_x86_ops.enable_virtualization_cpu) {
pr_err("already loaded vendor module '%s'\n", kvm_x86_ops.name);
return -EEXIST;
}
@ -9880,7 +9880,7 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
return 0;
out_unwind_ops:
kvm_x86_ops.hardware_enable = NULL;
kvm_x86_ops.enable_virtualization_cpu = NULL;
kvm_x86_call(hardware_unsetup)();
out_mmu_exit:
kvm_mmu_vendor_module_exit();
@ -9921,7 +9921,7 @@ void kvm_x86_vendor_exit(void)
WARN_ON(static_branch_unlikely(&kvm_xen_enabled.key));
#endif
mutex_lock(&vendor_module_lock);
kvm_x86_ops.hardware_enable = NULL;
kvm_x86_ops.enable_virtualization_cpu = NULL;
mutex_unlock(&vendor_module_lock);
}
EXPORT_SYMBOL_GPL(kvm_x86_vendor_exit);
@ -12516,7 +12516,17 @@ void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
}
EXPORT_SYMBOL_GPL(kvm_vcpu_deliver_sipi_vector);
int kvm_arch_hardware_enable(void)
/*
 * x86 implementation of the kvm_arch_enable_virtualization() hook: hands the
 * emergency_disable_virtualization_cpu callback stored in kvm_x86_ops to
 * cpu_emergency_register_virt_callback().
 */
void kvm_arch_enable_virtualization(void)
{
cpu_emergency_register_virt_callback(kvm_x86_ops.emergency_disable_virtualization_cpu);
}
/*
 * x86 implementation of the kvm_arch_disable_virtualization() hook: passes
 * the emergency_disable_virtualization_cpu callback stored in kvm_x86_ops to
 * cpu_emergency_unregister_virt_callback().
 */
void kvm_arch_disable_virtualization(void)
{
cpu_emergency_unregister_virt_callback(kvm_x86_ops.emergency_disable_virtualization_cpu);
}
int kvm_arch_enable_virtualization_cpu(void)
{
struct kvm *kvm;
struct kvm_vcpu *vcpu;
@ -12532,7 +12542,7 @@ int kvm_arch_hardware_enable(void)
if (ret)
return ret;
ret = kvm_x86_call(hardware_enable)();
ret = kvm_x86_call(enable_virtualization_cpu)();
if (ret != 0)
return ret;
@ -12612,9 +12622,9 @@ int kvm_arch_hardware_enable(void)
return 0;
}
void kvm_arch_hardware_disable(void)
void kvm_arch_disable_virtualization_cpu(void)
{
kvm_x86_call(hardware_disable)();
kvm_x86_call(disable_virtualization_cpu)();
drop_user_return_notifiers();
}

View file

@ -1529,8 +1529,22 @@ static inline void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu) {}
#endif
#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
int kvm_arch_hardware_enable(void);
void kvm_arch_hardware_disable(void);
/*
* kvm_arch_{enable,disable}_virtualization() are called on one CPU, under
* kvm_usage_lock, immediately after/before 0=>1 and 1=>0 transitions of
* kvm_usage_count, i.e. at the beginning of the generic hardware enabling
* sequence, and at the end of the generic hardware disabling sequence.
*/
void kvm_arch_enable_virtualization(void);
void kvm_arch_disable_virtualization(void);
/*
* kvm_arch_{enable,disable}_virtualization_cpu() are called on "every" CPU to
* do the actual twiddling of hardware bits. The hooks are called on all
* online CPUs when KVM enables/disables virtualization, and on a single CPU
* when that CPU is onlined/offlined (including for Resume/Suspend).
*/
int kvm_arch_enable_virtualization_cpu(void);
void kvm_arch_disable_virtualization_cpu(void);
#endif
int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu);
bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu);

View file

@ -136,8 +136,8 @@ static int kvm_no_compat_open(struct inode *inode, struct file *file)
#define KVM_COMPAT(c) .compat_ioctl = kvm_no_compat_ioctl, \
.open = kvm_no_compat_open
#endif
static int hardware_enable_all(void);
static void hardware_disable_all(void);
static int kvm_enable_virtualization(void);
static void kvm_disable_virtualization(void);
static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
@ -1220,7 +1220,7 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
if (r)
goto out_err_no_arch_destroy_vm;
r = hardware_enable_all();
r = kvm_enable_virtualization();
if (r)
goto out_err_no_disable;
@ -1263,7 +1263,7 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
#endif
out_err_no_mmu_notifier:
hardware_disable_all();
kvm_disable_virtualization();
out_err_no_disable:
kvm_arch_destroy_vm(kvm);
out_err_no_arch_destroy_vm:
@ -1360,7 +1360,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
#endif
kvm_arch_free_vm(kvm);
preempt_notifier_dec();
hardware_disable_all();
kvm_disable_virtualization();
mmdrop(mm);
}
@ -5571,137 +5571,67 @@ static struct miscdevice kvm_dev = {
};
#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
static bool enable_virt_at_load = true;
module_param(enable_virt_at_load, bool, 0444);
__visible bool kvm_rebooting;
EXPORT_SYMBOL_GPL(kvm_rebooting);
static DEFINE_PER_CPU(bool, hardware_enabled);
static DEFINE_PER_CPU(bool, virtualization_enabled);
static DEFINE_MUTEX(kvm_usage_lock);
static int kvm_usage_count;
static int __hardware_enable_nolock(void)
__weak void kvm_arch_enable_virtualization(void)
{
if (__this_cpu_read(hardware_enabled))
}
/*
 * Default no-op implementation.  Declared __weak so that an architecture can
 * provide its own kvm_arch_disable_virtualization() in its place.
 */
__weak void kvm_arch_disable_virtualization(void)
{
}
static int kvm_enable_virtualization_cpu(void)
{
if (__this_cpu_read(virtualization_enabled))
return 0;
if (kvm_arch_hardware_enable()) {
if (kvm_arch_enable_virtualization_cpu()) {
pr_info("kvm: enabling virtualization on CPU%d failed\n",
raw_smp_processor_id());
return -EIO;
}
__this_cpu_write(hardware_enabled, true);
__this_cpu_write(virtualization_enabled, true);
return 0;
}
static void hardware_enable_nolock(void *failed)
{
if (__hardware_enable_nolock())
atomic_inc(failed);
}
static int kvm_online_cpu(unsigned int cpu)
{
int ret = 0;
/*
* Abort the CPU online process if hardware virtualization cannot
* be enabled. Otherwise running VMs would encounter unrecoverable
* errors when scheduled to this CPU.
*/
mutex_lock(&kvm_lock);
if (kvm_usage_count)
ret = __hardware_enable_nolock();
mutex_unlock(&kvm_lock);
return ret;
return kvm_enable_virtualization_cpu();
}
static void hardware_disable_nolock(void *junk)
static void kvm_disable_virtualization_cpu(void *ign)
{
/*
* Note, hardware_disable_all_nolock() tells all online CPUs to disable
* hardware, not just CPUs that successfully enabled hardware!
*/
if (!__this_cpu_read(hardware_enabled))
if (!__this_cpu_read(virtualization_enabled))
return;
kvm_arch_hardware_disable();
kvm_arch_disable_virtualization_cpu();
__this_cpu_write(hardware_enabled, false);
__this_cpu_write(virtualization_enabled, false);
}
static int kvm_offline_cpu(unsigned int cpu)
{
mutex_lock(&kvm_lock);
if (kvm_usage_count)
hardware_disable_nolock(NULL);
mutex_unlock(&kvm_lock);
kvm_disable_virtualization_cpu(NULL);
return 0;
}
static void hardware_disable_all_nolock(void)
{
BUG_ON(!kvm_usage_count);
kvm_usage_count--;
if (!kvm_usage_count)
on_each_cpu(hardware_disable_nolock, NULL, 1);
}
static void hardware_disable_all(void)
{
cpus_read_lock();
mutex_lock(&kvm_lock);
hardware_disable_all_nolock();
mutex_unlock(&kvm_lock);
cpus_read_unlock();
}
static int hardware_enable_all(void)
{
atomic_t failed = ATOMIC_INIT(0);
int r;
/*
* Do not enable hardware virtualization if the system is going down.
* If userspace initiated a forced reboot, e.g. reboot -f, then it's
* possible for an in-flight KVM_CREATE_VM to trigger hardware enabling
* after kvm_reboot() is called. Note, this relies on system_state
* being set _before_ kvm_reboot(), which is why KVM uses a syscore ops
* hook instead of registering a dedicated reboot notifier (the latter
* runs before system_state is updated).
*/
if (system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF ||
system_state == SYSTEM_RESTART)
return -EBUSY;
/*
* When onlining a CPU, cpu_online_mask is set before kvm_online_cpu()
* is called, and so on_each_cpu() between them includes the CPU that
* is being onlined. As a result, hardware_enable_nolock() may get
* invoked before kvm_online_cpu(), which also enables hardware if the
* usage count is non-zero. Disable CPU hotplug to avoid attempting to
* enable hardware multiple times.
*/
cpus_read_lock();
mutex_lock(&kvm_lock);
r = 0;
kvm_usage_count++;
if (kvm_usage_count == 1) {
on_each_cpu(hardware_enable_nolock, &failed, 1);
if (atomic_read(&failed)) {
hardware_disable_all_nolock();
r = -EBUSY;
}
}
mutex_unlock(&kvm_lock);
cpus_read_unlock();
return r;
}
static void kvm_shutdown(void)
{
/*
@ -5717,34 +5647,32 @@ static void kvm_shutdown(void)
*/
pr_info("kvm: exiting hardware virtualization\n");
kvm_rebooting = true;
on_each_cpu(hardware_disable_nolock, NULL, 1);
on_each_cpu(kvm_disable_virtualization_cpu, NULL, 1);
}
static int kvm_suspend(void)
{
/*
* Secondary CPUs and CPU hotplug are disabled across the suspend/resume
* callbacks, i.e. no need to acquire kvm_lock to ensure the usage count
* is stable. Assert that kvm_lock is not held to ensure the system
* isn't suspended while KVM is enabling hardware. Hardware enabling
* can be preempted, but the task cannot be frozen until it has dropped
* all locks (userspace tasks are frozen via a fake signal).
* callbacks, i.e. no need to acquire kvm_usage_lock to ensure the usage
* count is stable. Assert that kvm_usage_lock is not held to ensure
* the system isn't suspended while KVM is enabling hardware. Hardware
* enabling can be preempted, but the task cannot be frozen until it has
* dropped all locks (userspace tasks are frozen via a fake signal).
*/
lockdep_assert_not_held(&kvm_lock);
lockdep_assert_not_held(&kvm_usage_lock);
lockdep_assert_irqs_disabled();
if (kvm_usage_count)
hardware_disable_nolock(NULL);
kvm_disable_virtualization_cpu(NULL);
return 0;
}
static void kvm_resume(void)
{
lockdep_assert_not_held(&kvm_lock);
lockdep_assert_not_held(&kvm_usage_lock);
lockdep_assert_irqs_disabled();
if (kvm_usage_count)
WARN_ON_ONCE(__hardware_enable_nolock());
WARN_ON_ONCE(kvm_enable_virtualization_cpu());
}
static struct syscore_ops kvm_syscore_ops = {
@ -5752,13 +5680,95 @@ static struct syscore_ops kvm_syscore_ops = {
.resume = kvm_resume,
.shutdown = kvm_shutdown,
};
/*
 * Take a reference on hardware virtualization.  All work is done with
 * kvm_usage_lock held.  Only the caller that moves kvm_usage_count from 0 to 1
 * runs the arch hook and registers KVM's cpuhp state and syscore ops; every
 * other caller merely bumps the count.
 *
 * Returns 0 on success, the error from cpuhp_setup_state() if registering the
 * hotplug callbacks fails, or -EBUSY if the system is going down.  On failure
 * the usage count is restored to its previous value.
 */
static int kvm_enable_virtualization(void)
{
	int err;

	guard(mutex)(&kvm_usage_lock);

	/* Anything other than the 0=>1 transition has nothing more to do. */
	kvm_usage_count++;
	if (kvm_usage_count != 1)
		return 0;

	kvm_arch_enable_virtualization();

	err = cpuhp_setup_state(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online",
				kvm_online_cpu, kvm_offline_cpu);
	if (err)
		goto undo_arch;

	register_syscore_ops(&kvm_syscore_ops);

	/*
	 * If the system is going down, tear everything back down and fail.
	 * A forced reboot from userspace (e.g. reboot -f) can race with an
	 * in-flight enabling operation such that virtualization would be
	 * enabled after syscore_shutdown() has run, i.e. with kvm_shutdown()
	 * never being invoked.  The check relies on system_state being
	 * updated _before_ kvm_shutdown() is called, so that either
	 * kvm_shutdown() runs or this CPU sees the pending shutdown; that
	 * ordering is also why a syscore ops hook is used rather than a
	 * dedicated reboot notifier, which would run before system_state is
	 * updated.
	 */
	if (system_state != SYSTEM_HALT && system_state != SYSTEM_POWER_OFF &&
	    system_state != SYSTEM_RESTART)
		return 0;

	err = -EBUSY;
	unregister_syscore_ops(&kvm_syscore_ops);
	cpuhp_remove_state(CPUHP_AP_KVM_ONLINE);
undo_arch:
	kvm_arch_disable_virtualization();
	kvm_usage_count--;
	return err;
}
/*
 * Drop a reference on hardware virtualization.  With kvm_usage_lock held, the
 * caller that takes kvm_usage_count to zero unregisters the syscore ops,
 * removes the cpuhp state, and then runs the arch disable hook; all other
 * callers only decrement the count.
 */
static void kvm_disable_virtualization(void)
{
	guard(mutex)(&kvm_usage_lock);

	kvm_usage_count--;
	if (kvm_usage_count == 0) {
		unregister_syscore_ops(&kvm_syscore_ops);
		cpuhp_remove_state(CPUHP_AP_KVM_ONLINE);
		kvm_arch_disable_virtualization();
	}
}
/*
 * Enable virtualization up front when the enable_virt_at_load module
 * parameter is set; otherwise do nothing and report success.
 */
static int kvm_init_virtualization(void)
{
	return enable_virt_at_load ? kvm_enable_virtualization() : 0;
}
/*
 * Counterpart of kvm_init_virtualization(): release the reference that was
 * taken there, which only happened if enable_virt_at_load is set.
 */
static void kvm_uninit_virtualization(void)
{
	if (!enable_virt_at_load)
		return;

	kvm_disable_virtualization();
}
#else /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */
static int hardware_enable_all(void)
/*
 * Stub for builds without CONFIG_KVM_GENERIC_HARDWARE_ENABLING: does nothing
 * and always reports success.
 */
static int kvm_enable_virtualization(void)
{
return 0;
}
static void hardware_disable_all(void)
/*
 * Stub for builds without CONFIG_KVM_GENERIC_HARDWARE_ENABLING: does nothing
 * and always reports success.
 */
static int kvm_init_virtualization(void)
{
return 0;
}
/* Stub for builds without CONFIG_KVM_GENERIC_HARDWARE_ENABLING: no-op. */
static void kvm_disable_virtualization(void)
{
}
/* Stub for builds without CONFIG_KVM_GENERIC_HARDWARE_ENABLING: no-op. */
static void kvm_uninit_virtualization(void)
{
}
@ -6460,15 +6470,6 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
int r;
int cpu;
#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online",
kvm_online_cpu, kvm_offline_cpu);
if (r)
return r;
register_syscore_ops(&kvm_syscore_ops);
#endif
/* A kmem cache lets us meet the alignment requirements of fx_save. */
if (!vcpu_align)
vcpu_align = __alignof__(struct kvm_vcpu);
@ -6479,10 +6480,8 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
offsetofend(struct kvm_vcpu, stats_id)
- offsetof(struct kvm_vcpu, arch),
NULL);
if (!kvm_vcpu_cache) {
r = -ENOMEM;
goto err_vcpu_cache;
}
if (!kvm_vcpu_cache)
return -ENOMEM;
for_each_possible_cpu(cpu) {
if (!alloc_cpumask_var_node(&per_cpu(cpu_kick_mask, cpu),
@ -6516,6 +6515,10 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
kvm_gmem_init(module);
r = kvm_init_virtualization();
if (r)
goto err_virt;
/*
* Registration _must_ be the very last thing done, as this exposes
* /dev/kvm to userspace, i.e. all infrastructure must be setup!
@ -6529,6 +6532,8 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
return 0;
err_register:
kvm_uninit_virtualization();
err_virt:
kvm_vfio_ops_exit();
err_vfio:
kvm_async_pf_deinit();
@ -6539,11 +6544,6 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
for_each_possible_cpu(cpu)
free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
kmem_cache_destroy(kvm_vcpu_cache);
err_vcpu_cache:
#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
unregister_syscore_ops(&kvm_syscore_ops);
cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE);
#endif
return r;
}
EXPORT_SYMBOL_GPL(kvm_init);
@ -6559,16 +6559,14 @@ void kvm_exit(void)
*/
misc_deregister(&kvm_dev);
kvm_uninit_virtualization();
debugfs_remove_recursive(kvm_debugfs_dir);
for_each_possible_cpu(cpu)
free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
kmem_cache_destroy(kvm_vcpu_cache);
kvm_vfio_ops_exit();
kvm_async_pf_deinit();
#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
unregister_syscore_ops(&kvm_syscore_ops);
cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE);
#endif
kvm_irqfd_exit();
}
EXPORT_SYMBOL_GPL(kvm_exit);