KVM generic changes for 6.11

- Enable halt poll shrinking by default, as Intel found it to be a clear win.
 
  - Setup empty IRQ routing when creating a VM to avoid having to synchronize
    SRCU when creating a split IRQCHIP on x86.
 
  - Rework the sched_in/out() paths to replace kvm_arch_sched_in() with a flag
    that arch code can use for hooking both sched_in() and sched_out().
 
  - Take the vCPU @id as an "unsigned long" instead of "u32" to avoid
    truncating a bogus value from userspace, e.g. to help userspace detect bugs.
 
  - Mark a vCPU as preempted if and only if it's scheduled out while in the
    KVM_RUN loop, e.g. to avoid marking it preempted and thus writing guest
    memory when retrieving guest state during live migration blackout.
 
  - A few minor cleanups
 -----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCgAdFiEEKTobbabEP7vbhhN9OlYIJqCjN/0FAmaRuOYACgkQOlYIJqCj
 N/1UnQ/8CI5Qfr+/0gzYgtWmtEMczGG+rMNpzD3XVqPjJjXcMcBiQnplnzUVLhha
 vlPdYVK7vgmEt003XGzV55mik46LHL+DX/v4hI3HEdblfyCeNLW3fKEWVRB44qJe
 o+YUQwSK42SORUp9oXuQINxhA//U9EnI7CQxlJ8w8wenv5IJKfIGr01DefmfGPAV
 PKm9t6WLcNqvhZMEyy/zmzM3KVPCJL0NcwI97x6sHxFpQYIDtL0E/VexA4AFqMoT
 QK7cSDC/2US41Zvem/r/GzM/ucdF6vb9suzZYBohwhxtVhwJe2CDeYQZvtNKJ1U7
 GOHPaKL6nBWdZCm/yyWbbX2nstY1lHqxhN3JD0X8wqU5rNcwm2b8Vfyav0Ehc7H+
 jVbDTshOx4YJmIgajoKjgM050rdBK59TdfVL+l+AAV5q/TlHocalYtvkEBdGmIDg
 2td9UHSime6sp20vQfczUEz4bgrQsh4l2Fa/qU2jFwLievnBw0AvEaMximkSGMJe
 b8XfjmdTjlOesWAejANKtQolfrq14+1wYw0zZZ8PA+uNVpKdoovmcqSOcaDC9bT8
 GO/NFUvoG+lkcvJcIlo1SSl81SmGLosijwxWfGvFAqsgpR3/3l3dYp0QtztoCNJO
 d3+HnjgYn5o5FwufuTD3eUOXH4AFjG108DH0o25XrIkb2Kymy0o=
 =BalU
 -----END PGP SIGNATURE-----

Merge tag 'kvm-x86-generic-6.11' of https://github.com/kvm-x86/linux into HEAD

KVM generic changes for 6.11

 - Enable halt poll shrinking by default, as Intel found it to be a clear win.

 - Setup empty IRQ routing when creating a VM to avoid having to synchronize
   SRCU when creating a split IRQCHIP on x86.

 - Rework the sched_in/out() paths to replace kvm_arch_sched_in() with a flag
   that arch code can use for hooking both sched_in() and sched_out().

 - Take the vCPU @id as an "unsigned long" instead of "u32" to avoid
   truncating a bogus value from userspace, e.g. to help userspace detect bugs.

 - Mark a vCPU as preempted if and only if it's scheduled out while in the
   KVM_RUN loop, e.g. to avoid marking it preempted and thus writing guest
   memory when retrieving guest state during live migration blackout.

 - A few minor cleanups
This commit is contained in:
Paolo Bonzini 2024-07-16 09:51:36 -04:00
commit 86014c1e20
31 changed files with 196 additions and 146 deletions

View file

@ -7969,10 +7969,10 @@ perform a bulk copy of tags to/from the guest.
7.29 KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM
-------------------------------------
Architectures: x86 SEV enabled
Type: vm
Parameters: args[0] is the fd of the source vm
Returns: 0 on success
:Architectures: x86 SEV enabled
:Type: vm
:Parameters: args[0] is the fd of the source vm
:Returns: 0 on success
This capability enables userspace to migrate the encryption context from the VM
indicated by the fd to the VM this is called on.

View file

@ -79,11 +79,11 @@ adjustment of the polling interval.
Module Parameters
=================
The kvm module has 3 tuneable module parameters to adjust the global max
polling interval as well as the rate at which the polling interval is grown and
shrunk. These variables are defined in include/linux/kvm_host.h and as module
parameters in virt/kvm/kvm_main.c, or arch/powerpc/kvm/book3s_hv.c in the
powerpc kvm-hv case.
The kvm module has 4 tunable module parameters to adjust the global max polling
interval, the initial value (to grow from 0), and the rate at which the polling
interval is grown and shrunk. These variables are defined in
include/linux/kvm_host.h and as module parameters in virt/kvm/kvm_main.c, or
arch/powerpc/kvm/book3s_hv.c in the powerpc kvm-hv case.
+-----------------------+---------------------------+-------------------------+
|Module Parameter | Description | Default Value |
@ -105,7 +105,7 @@ powerpc kvm-hv case.
| | grow_halt_poll_ns() | |
| | function. | |
+-----------------------+---------------------------+-------------------------+
|halt_poll_ns_shrink | The value by which the | 0 |
|halt_poll_ns_shrink | The value by which the | 2 |
| | halt polling interval is | |
| | divided in the | |
| | shrink_halt_poll_ns() | |

View file

@ -1289,7 +1289,6 @@ static inline bool kvm_system_needs_idmapped_vectors(void)
}
static inline void kvm_arch_sync_events(struct kvm *kvm) {}
static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
void kvm_arm_init_debug(void);
void kvm_arm_vcpu_init_debug(struct kvm_vcpu *vcpu);

View file

@ -1138,7 +1138,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
vcpu_load(vcpu);
if (run->immediate_exit) {
if (!vcpu->wants_to_run) {
ret = -EINTR;
goto out;
}

View file

@ -274,7 +274,6 @@ static inline bool kvm_is_ifetch_fault(struct kvm_vcpu_arch *arch)
static inline void kvm_arch_hardware_unsetup(void) {}
static inline void kvm_arch_sync_events(struct kvm *kvm) {}
static inline void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) {}
static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}
static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {}

View file

@ -1416,7 +1416,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
kvm_complete_iocsr_read(vcpu, run);
}
if (run->immediate_exit)
if (!vcpu->wants_to_run)
return r;
/* Clear exit_reason */

View file

@ -890,7 +890,6 @@ static inline void kvm_arch_sync_events(struct kvm *kvm) {}
static inline void kvm_arch_free_memslot(struct kvm *kvm,
struct kvm_memory_slot *slot) {}
static inline void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) {}
static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}

View file

@ -436,7 +436,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
vcpu->mmio_needed = 0;
}
if (vcpu->run->immediate_exit)
if (!vcpu->wants_to_run)
goto out;
lose_fpu(1);

View file

@ -897,7 +897,6 @@ struct kvm_vcpu_arch {
static inline void kvm_arch_sync_events(struct kvm *kvm) {}
static inline void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) {}
static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {}
static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}

View file

@ -1852,7 +1852,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
kvm_sigset_activate(vcpu);
if (run->immediate_exit)
if (!vcpu->wants_to_run)
r = -EINTR;
else
r = kvmppc_vcpu_run(vcpu);

View file

@ -286,7 +286,6 @@ struct kvm_vcpu_arch {
};
static inline void kvm_arch_sync_events(struct kvm *kvm) {}
static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
#define KVM_RISCV_GSTAGE_TLB_MIN_ORDER 12

View file

@ -763,7 +763,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
return ret;
}
if (run->immediate_exit) {
if (!vcpu->wants_to_run) {
kvm_vcpu_srcu_read_unlock(vcpu);
return -EINTR;
}

View file

@ -1045,7 +1045,6 @@ extern int kvm_s390_gisc_register(struct kvm *kvm, u32 gisc);
extern int kvm_s390_gisc_unregister(struct kvm *kvm, u32 gisc);
static inline void kvm_arch_sync_events(struct kvm *kvm) {}
static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
static inline void kvm_arch_free_memslot(struct kvm *kvm,
struct kvm_memory_slot *slot) {}
static inline void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) {}

View file

@ -2997,14 +2997,9 @@ int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
break;
}
case KVM_CREATE_IRQCHIP: {
struct kvm_irq_routing_entry routing;
r = -EINVAL;
if (kvm->arch.use_irqchip) {
/* Set up dummy routing. */
memset(&routing, 0, sizeof(routing));
r = kvm_set_irq_routing(kvm, &routing, 0, 0);
}
if (kvm->arch.use_irqchip)
r = 0;
break;
}
case KVM_SET_DEVICE_ATTR: {
@ -5032,7 +5027,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
if (vcpu->kvm->arch.pv.dumping)
return -EINVAL;
if (kvm_run->immediate_exit)
if (!vcpu->wants_to_run)
return -EINTR;
if (kvm_run->kvm_valid_regs & ~KVM_SYNC_S390_VALID_FIELDS ||

View file

@ -103,7 +103,6 @@ KVM_X86_OP(write_tsc_multiplier)
KVM_X86_OP(get_exit_info)
KVM_X86_OP(check_intercept)
KVM_X86_OP(handle_exit_irqoff)
KVM_X86_OP(sched_in)
KVM_X86_OP_OPTIONAL(update_cpu_dirty_logging)
KVM_X86_OP_OPTIONAL(vcpu_blocking)
KVM_X86_OP_OPTIONAL(vcpu_unblocking)

View file

@ -1750,8 +1750,6 @@ struct kvm_x86_ops {
struct x86_exception *exception);
void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu);
void (*sched_in)(struct kvm_vcpu *vcpu, int cpu);
/*
* Size of the CPU's dirty log buffer, i.e. VMX's PML buffer. A zero
* value indicates CPU dirty logging is unsupported or disabled.

View file

@ -106,7 +106,6 @@ void __kvm_migrate_timers(struct kvm_vcpu *vcpu);
int apic_has_pending_timer(struct kvm_vcpu *vcpu);
int kvm_setup_default_irq_routing(struct kvm *kvm);
int kvm_setup_empty_irq_routing(struct kvm *kvm);
int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
struct kvm_lapic_irq *irq,
struct dest_map *dest_map);

View file

@ -395,13 +395,6 @@ int kvm_setup_default_irq_routing(struct kvm *kvm)
ARRAY_SIZE(default_routing), 0);
}
static const struct kvm_irq_routing_entry empty_routing[] = {};
int kvm_setup_empty_irq_routing(struct kvm *kvm)
{
return kvm_set_irq_routing(kvm, empty_routing, 0, 0);
}
void kvm_arch_post_irq_routing_update(struct kvm *kvm)
{
if (!irqchip_split(kvm))

View file

@ -521,9 +521,9 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
}
/*
* Unused perf_events are only released if the corresponding MSRs
* weren't accessed during the last vCPU time slice. kvm_arch_sched_in
* triggers KVM_REQ_PMU if cleanup is needed.
* Release unused perf_events if the corresponding guest MSRs weren't
* accessed during the last vCPU time slice (need_cleanup is set when
* the vCPU is scheduled back in).
*/
if (unlikely(pmu->need_cleanup))
kvm_pmu_cleanup(vcpu);

View file

@ -1554,6 +1554,9 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
struct vcpu_svm *svm = to_svm(vcpu);
struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm))
shrink_ple_window(vcpu);
if (sd->current_vmcb != svm->vmcb) {
sd->current_vmcb = svm->vmcb;
@ -4607,12 +4610,6 @@ static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu)
vcpu->arch.at_instruction_boundary = true;
}
static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
{
if (!kvm_pause_in_guest(vcpu->kvm))
shrink_ple_window(vcpu);
}
static void svm_setup_mce(struct kvm_vcpu *vcpu)
{
/* [63:9] are reserved. */
@ -5075,8 +5072,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.check_intercept = svm_check_intercept,
.handle_exit_irqoff = svm_handle_exit_irqoff,
.sched_in = svm_sched_in,
.nested_ops = &svm_nested_ops,
.deliver_interrupt = svm_deliver_interrupt,

View file

@ -122,8 +122,6 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
.check_intercept = vmx_check_intercept,
.handle_exit_irqoff = vmx_handle_exit_irqoff,
.sched_in = vmx_sched_in,
.cpu_dirty_log_size = PML_ENTITY_NUM,
.update_cpu_dirty_logging = vmx_update_cpu_dirty_logging,

View file

@ -1411,6 +1411,38 @@ static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
}
#endif
static void grow_ple_window(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned int old = vmx->ple_window;
vmx->ple_window = __grow_ple_window(old, ple_window,
ple_window_grow,
ple_window_max);
if (vmx->ple_window != old) {
vmx->ple_window_dirty = true;
trace_kvm_ple_window_update(vcpu->vcpu_id,
vmx->ple_window, old);
}
}
static void shrink_ple_window(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned int old = vmx->ple_window;
vmx->ple_window = __shrink_ple_window(old, ple_window,
ple_window_shrink,
ple_window);
if (vmx->ple_window != old) {
vmx->ple_window_dirty = true;
trace_kvm_ple_window_update(vcpu->vcpu_id,
vmx->ple_window, old);
}
}
void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
struct loaded_vmcs *buddy)
{
@ -1486,6 +1518,9 @@ void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm))
shrink_ple_window(vcpu);
vmx_vcpu_load_vmcs(vcpu, cpu, NULL);
vmx_vcpu_pi_load(vcpu, cpu);
@ -5897,38 +5932,6 @@ int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu)
return 1;
}
static void grow_ple_window(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned int old = vmx->ple_window;
vmx->ple_window = __grow_ple_window(old, ple_window,
ple_window_grow,
ple_window_max);
if (vmx->ple_window != old) {
vmx->ple_window_dirty = true;
trace_kvm_ple_window_update(vcpu->vcpu_id,
vmx->ple_window, old);
}
}
static void shrink_ple_window(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned int old = vmx->ple_window;
vmx->ple_window = __shrink_ple_window(old, ple_window,
ple_window_shrink,
ple_window);
if (vmx->ple_window != old) {
vmx->ple_window_dirty = true;
trace_kvm_ple_window_update(vcpu->vcpu_id,
vmx->ple_window, old);
}
}
/*
* Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE
* exiting, so only get here on cpu with PAUSE-Loop-Exiting.
@ -6677,9 +6680,10 @@ static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu)
bool flush_l1d;
/*
* Clear the per-vcpu flush bit, it gets set again
* either from vcpu_run() or from one of the unsafe
* VMEXIT handlers.
* Clear the per-vcpu flush bit, it gets set again if the vCPU
* is reloaded, i.e. if the vCPU is scheduled out or if KVM
* exits to userspace, or if KVM reaches one of the unsafe
* VMEXIT handlers, e.g. if KVM calls into the emulator.
*/
flush_l1d = vcpu->arch.l1tf_flush_l1d;
vcpu->arch.l1tf_flush_l1d = false;
@ -8179,12 +8183,6 @@ void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
}
#endif
void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
{
if (!kvm_pause_in_guest(vcpu->kvm))
shrink_ple_window(vcpu);
}
void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);

View file

@ -112,7 +112,6 @@ u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu);
void vmx_write_tsc_offset(struct kvm_vcpu *vcpu);
void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu);
void vmx_request_immediate_exit(struct kvm_vcpu *vcpu);
void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu);
void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu);
#ifdef CONFIG_X86_64
int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,

View file

@ -4998,6 +4998,15 @@ static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
vcpu->arch.l1tf_flush_l1d = true;
if (vcpu->scheduled_out && pmu->version && pmu->event_count) {
pmu->need_cleanup = true;
kvm_make_request(KVM_REQ_PMU, vcpu);
}
/* Address WBINVD may be executed by guest */
if (need_emulate_wbinvd(vcpu)) {
if (static_call(kvm_x86_has_wbinvd_exit)())
@ -6546,9 +6555,6 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
goto split_irqchip_unlock;
if (kvm->created_vcpus)
goto split_irqchip_unlock;
r = kvm_setup_empty_irq_routing(kvm);
if (r)
goto split_irqchip_unlock;
/* Pairs with irqchip_in_kernel. */
smp_wmb();
kvm->arch.irqchip_mode = KVM_IRQCHIP_SPLIT;
@ -6695,7 +6701,9 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
break;
mutex_lock(&kvm->lock);
if (kvm->arch.max_vcpu_ids == cap->args[0]) {
if (kvm->arch.bsp_vcpu_id > cap->args[0]) {
;
} else if (kvm->arch.max_vcpu_ids == cap->args[0]) {
r = 0;
} else if (!kvm->arch.max_vcpu_ids) {
kvm->arch.max_vcpu_ids = cap->args[0];
@ -7216,6 +7224,9 @@ int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
mutex_lock(&kvm->lock);
if (kvm->created_vcpus)
r = -EBUSY;
else if (arg > KVM_MAX_VCPU_IDS ||
(kvm->arch.max_vcpu_ids && arg > kvm->arch.max_vcpu_ids))
r = -EINVAL;
else
kvm->arch.bsp_vcpu_id = arg;
mutex_unlock(&kvm->lock);
@ -11248,7 +11259,6 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
int r;
vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
vcpu->arch.l1tf_flush_l1d = true;
for (;;) {
/*
@ -11398,7 +11408,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
kvm_vcpu_srcu_read_lock(vcpu);
if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
if (kvm_run->immediate_exit) {
if (!vcpu->wants_to_run) {
r = -EINTR;
goto out;
}
@ -11476,7 +11486,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
WARN_ON_ONCE(vcpu->mmio_needed);
}
if (kvm_run->immediate_exit) {
if (!vcpu->wants_to_run) {
r = -EINTR;
goto out;
}
@ -12569,18 +12579,6 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0;
}
void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
vcpu->arch.l1tf_flush_l1d = true;
if (pmu->version && unlikely(pmu->event_count)) {
pmu->need_cleanup = true;
kvm_make_request(KVM_REQ_PMU, vcpu);
}
static_call(kvm_x86_sched_in)(vcpu, cpu);
}
void kvm_arch_free_vm(struct kvm *kvm)
{
#if IS_ENABLED(CONFIG_HYPERV)

View file

@ -378,8 +378,10 @@ struct kvm_vcpu {
bool dy_eligible;
} spin_loop;
#endif
bool wants_to_run;
bool preempted;
bool ready;
bool scheduled_out;
struct kvm_vcpu_arch arch;
struct kvm_vcpu_stat stat;
char stats_id[KVM_STATS_NAME_SIZE];
@ -1494,8 +1496,6 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
struct kvm_guest_debug *dbg);
int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu);
void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu);
void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu);
int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id);
@ -1955,8 +1955,6 @@ struct _kvm_stats_desc {
HALT_POLL_HIST_COUNT), \
STATS_DESC_IBOOLEAN(VCPU_GENERIC, blocking)
extern struct dentry *kvm_debugfs_dir;
ssize_t kvm_stats_read(char *id, const struct kvm_stats_header *header,
const struct _kvm_stats_desc *desc,
void *stats, size_t size_stats,
@ -2096,6 +2094,7 @@ int kvm_set_irq_routing(struct kvm *kvm,
const struct kvm_irq_routing_entry *entries,
unsigned nr,
unsigned flags);
int kvm_init_irq_routing(struct kvm *kvm);
int kvm_set_routing_entry(struct kvm *kvm,
struct kvm_kernel_irq_routing_entry *e,
const struct kvm_irq_routing_entry *ue);
@ -2105,6 +2104,11 @@ void kvm_free_irq_routing(struct kvm *kvm);
static inline void kvm_free_irq_routing(struct kvm *kvm) {}
static inline int kvm_init_irq_routing(struct kvm *kvm)
{
return 0;
}
#endif
int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi);

View file

@ -192,11 +192,24 @@ struct kvm_xen_exit {
/* Flags that describe what fields in emulation_failure hold valid data. */
#define KVM_INTERNAL_ERROR_EMULATION_FLAG_INSTRUCTION_BYTES (1ULL << 0)
/*
* struct kvm_run can be modified by userspace at any time, so KVM must be
* careful to avoid TOCTOU bugs. In order to protect KVM, HINT_UNSAFE_IN_KVM()
* renames fields in struct kvm_run from <symbol> to <symbol>__unsafe when
* compiled into the kernel, ensuring that any use within KVM is obvious and
* gets extra scrutiny.
*/
#ifdef __KERNEL__
#define HINT_UNSAFE_IN_KVM(_symbol) _symbol##__unsafe
#else
#define HINT_UNSAFE_IN_KVM(_symbol) _symbol
#endif
/* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */
struct kvm_run {
/* in */
__u8 request_interrupt_window;
__u8 immediate_exit;
__u8 HINT_UNSAFE_IN_KVM(immediate_exit);
__u8 padding1[6];
/* out */

View file

@ -26,19 +26,37 @@ int main(int argc, char *argv[])
TEST_ASSERT(ret < 0,
"Setting KVM_CAP_MAX_VCPU_ID beyond KVM cap should fail");
/* Test BOOT_CPU_ID interaction (MAX_VCPU_ID cannot be lower) */
if (kvm_has_cap(KVM_CAP_SET_BOOT_CPU_ID)) {
vm_ioctl(vm, KVM_SET_BOOT_CPU_ID, (void *)MAX_VCPU_ID);
/* Try setting KVM_CAP_MAX_VCPU_ID below BOOT_CPU_ID */
ret = __vm_enable_cap(vm, KVM_CAP_MAX_VCPU_ID, MAX_VCPU_ID - 1);
TEST_ASSERT(ret < 0,
"Setting KVM_CAP_MAX_VCPU_ID below BOOT_CPU_ID should fail");
}
/* Set KVM_CAP_MAX_VCPU_ID */
vm_enable_cap(vm, KVM_CAP_MAX_VCPU_ID, MAX_VCPU_ID);
/* Try to set KVM_CAP_MAX_VCPU_ID again */
ret = __vm_enable_cap(vm, KVM_CAP_MAX_VCPU_ID, MAX_VCPU_ID + 1);
TEST_ASSERT(ret < 0,
"Setting KVM_CAP_MAX_VCPU_ID multiple times should fail");
/* Create vCPU with id beyond KVM_CAP_MAX_VCPU_ID cap*/
/* Create vCPU with id beyond KVM_CAP_MAX_VCPU_ID cap */
ret = __vm_ioctl(vm, KVM_CREATE_VCPU, (void *)MAX_VCPU_ID);
TEST_ASSERT(ret < 0, "Creating vCPU with ID > MAX_VCPU_ID should fail");
/* Create vCPU with bits 63:32 != 0, but an otherwise valid id */
ret = __vm_ioctl(vm, KVM_CREATE_VCPU, (void *)(1L << 32));
TEST_ASSERT(ret < 0, "Creating vCPU with ID[63:32] != 0 should fail");
/* Create vCPU with id within bounds */
ret = __vm_ioctl(vm, KVM_CREATE_VCPU, (void *)0);
TEST_ASSERT(ret >= 0, "Creating vCPU with ID 0 should succeed");
close(ret);
kvm_vm_free(vm);
return 0;
}

View file

@ -33,6 +33,20 @@ static void guest_not_bsp_vcpu(void *arg)
GUEST_DONE();
}
static void test_set_invalid_bsp(struct kvm_vm *vm)
{
unsigned long max_vcpu_id = vm_check_cap(vm, KVM_CAP_MAX_VCPU_ID);
int r;
if (max_vcpu_id) {
r = __vm_ioctl(vm, KVM_SET_BOOT_CPU_ID, (void *)(max_vcpu_id + 1));
TEST_ASSERT(r == -1 && errno == EINVAL, "BSP with ID > MAX should fail");
}
r = __vm_ioctl(vm, KVM_SET_BOOT_CPU_ID, (void *)(1L << 32));
TEST_ASSERT(r == -1 && errno == EINVAL, "BSP with ID[63:32]!=0 should fail");
}
static void test_set_bsp_busy(struct kvm_vcpu *vcpu, const char *msg)
{
int r = __vm_ioctl(vcpu->vm, KVM_SET_BOOT_CPU_ID,
@ -80,6 +94,8 @@ static struct kvm_vm *create_vm(uint32_t nr_vcpus, uint32_t bsp_vcpu_id,
vm = vm_create(nr_vcpus);
test_set_invalid_bsp(vm);
vm_ioctl(vm, KVM_SET_BOOT_CPU_ID, (void *)(unsigned long)bsp_vcpu_id);
for (i = 0; i < nr_vcpus; i++)

View file

@ -80,7 +80,6 @@ static void async_pf_execute(struct work_struct *work)
spin_lock(&vcpu->async_pf.lock);
first = list_empty(&vcpu->async_pf.done);
list_add_tail(&apf->link, &vcpu->async_pf.done);
apf->vcpu = NULL;
spin_unlock(&vcpu->async_pf.lock);
/*
@ -120,8 +119,6 @@ static void kvm_flush_and_free_async_pf_work(struct kvm_async_pf *work)
void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu)
{
spin_lock(&vcpu->async_pf.lock);
/* cancel outstanding work queue item */
while (!list_empty(&vcpu->async_pf.queue)) {
struct kvm_async_pf *work =
@ -129,23 +126,15 @@ void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu)
typeof(*work), queue);
list_del(&work->queue);
/*
* We know it's present in vcpu->async_pf.done, do
* nothing here.
*/
if (!work->vcpu)
continue;
spin_unlock(&vcpu->async_pf.lock);
#ifdef CONFIG_KVM_ASYNC_PF_SYNC
flush_work(&work->work);
#else
if (cancel_work_sync(&work->work))
kmem_cache_free(async_pf_cache, work);
#endif
spin_lock(&vcpu->async_pf.lock);
}
spin_lock(&vcpu->async_pf.lock);
while (!list_empty(&vcpu->async_pf.done)) {
struct kvm_async_pf *work =
list_first_entry(&vcpu->async_pf.done,

View file

@ -237,3 +237,27 @@ int kvm_set_irq_routing(struct kvm *kvm,
return r;
}
/*
* Allocate empty IRQ routing by default so that additional setup isn't needed
* when userspace-driven IRQ routing is activated, and so that kvm->irq_routing
* is guaranteed to be non-NULL.
*/
int kvm_init_irq_routing(struct kvm *kvm)
{
struct kvm_irq_routing_table *new;
int chip_size;
new = kzalloc(struct_size(new, map, 1), GFP_KERNEL_ACCOUNT);
if (!new)
return -ENOMEM;
new->nr_rt_entries = 1;
chip_size = sizeof(int) * KVM_NR_IRQCHIPS * KVM_IRQCHIP_NUM_PINS;
memset(new->chip, -1, chip_size);
RCU_INIT_POINTER(kvm->irq_routing, new);
return 0;
}

View file

@ -1,9 +1,6 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Kernel-based Virtual Machine driver for Linux
*
* This module enables machines with Intel VT-x extensions to run virtual
* machines without emulation or binary translation.
* Kernel-based Virtual Machine (KVM) Hypervisor
*
* Copyright (C) 2006 Qumranet, Inc.
* Copyright 2010 Red Hat, Inc. and/or its affiliates.
@ -74,6 +71,7 @@
#define ITOA_MAX_LEN 12
MODULE_AUTHOR("Qumranet");
MODULE_DESCRIPTION("Kernel-based Virtual Machine (KVM) Hypervisor");
MODULE_LICENSE("GPL");
/* Architectures should define their poll value according to the halt latency */
@ -91,8 +89,8 @@ unsigned int halt_poll_ns_grow_start = 10000; /* 10us */
module_param(halt_poll_ns_grow_start, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_grow_start);
/* Default resets per-vcpu halt_poll_ns . */
unsigned int halt_poll_ns_shrink;
/* Default halves per-vcpu halt_poll_ns. */
unsigned int halt_poll_ns_shrink = 2;
module_param(halt_poll_ns_shrink, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_shrink);
@ -110,8 +108,7 @@ static struct kmem_cache *kvm_vcpu_cache;
static __read_mostly struct preempt_ops kvm_preempt_ops;
static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_running_vcpu);
struct dentry *kvm_debugfs_dir;
EXPORT_SYMBOL_GPL(kvm_debugfs_dir);
static struct dentry *kvm_debugfs_dir;
static const struct file_operations stat_fops_per_vm;
@ -1145,8 +1142,7 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
{
struct kvm *kvm = kvm_arch_alloc_vm();
struct kvm_memslots *slots;
int r = -ENOMEM;
int i, j;
int r, i, j;
if (!kvm)
return ERR_PTR(-ENOMEM);
@ -1183,12 +1179,18 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
snprintf(kvm->stats_id, sizeof(kvm->stats_id), "kvm-%d",
task_pid_nr(current));
r = -ENOMEM;
if (init_srcu_struct(&kvm->srcu))
goto out_err_no_srcu;
if (init_srcu_struct(&kvm->irq_srcu))
goto out_err_no_irq_srcu;
r = kvm_init_irq_routing(kvm);
if (r)
goto out_err_no_irq_routing;
refcount_set(&kvm->users_count, 1);
for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
for (j = 0; j < 2; j++) {
slots = &kvm->__memslots[i][j];
@ -1206,6 +1208,7 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
rcu_assign_pointer(kvm->memslots[i], &kvm->__memslots[i][0]);
}
r = -ENOMEM;
for (i = 0; i < KVM_NR_BUSES; i++) {
rcu_assign_pointer(kvm->buses[i],
kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT));
@ -1267,6 +1270,8 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count));
for (i = 0; i < KVM_NR_BUSES; i++)
kfree(kvm_get_bus(kvm, i));
kvm_free_irq_routing(kvm);
out_err_no_irq_routing:
cleanup_srcu_struct(&kvm->irq_srcu);
out_err_no_irq_srcu:
cleanup_srcu_struct(&kvm->srcu);
@ -4202,12 +4207,21 @@ static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
/*
* Creates some virtual cpus. Good luck creating more than one.
*/
static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsigned long id)
{
int r;
struct kvm_vcpu *vcpu;
struct page *page;
/*
* KVM tracks vCPU IDs as 'int', be kind to userspace and reject
* too-large values instead of silently truncating.
*
* Ensure KVM_MAX_VCPU_IDS isn't pushed above INT_MAX without first
* changing the storage type (at the very least, IDs should be tracked
* as unsigned ints).
*/
BUILD_BUG_ON(KVM_MAX_VCPU_IDS > INT_MAX);
if (id >= KVM_MAX_VCPU_IDS)
return -EINVAL;
@ -4467,7 +4481,10 @@ static long kvm_vcpu_ioctl(struct file *filp,
synchronize_rcu();
put_pid(oldpid);
}
vcpu->wants_to_run = !READ_ONCE(vcpu->run->immediate_exit__unsafe);
r = kvm_arch_vcpu_ioctl_run(vcpu);
vcpu->wants_to_run = false;
trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
break;
}
@ -6347,8 +6364,9 @@ static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
WRITE_ONCE(vcpu->ready, false);
__this_cpu_write(kvm_running_vcpu, vcpu);
kvm_arch_sched_in(vcpu, cpu);
kvm_arch_vcpu_load(vcpu, cpu);
WRITE_ONCE(vcpu->scheduled_out, false);
}
static void kvm_sched_out(struct preempt_notifier *pn,
@ -6356,7 +6374,9 @@ static void kvm_sched_out(struct preempt_notifier *pn,
{
struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
if (current->on_rq) {
WRITE_ONCE(vcpu->scheduled_out, true);
if (current->on_rq && vcpu->wants_to_run) {
WRITE_ONCE(vcpu->preempted, true);
WRITE_ONCE(vcpu->ready, true);
}