Merge tag 'kvm-x86-pmu-6.8' of https://github.com/kvm-x86/linux into HEAD

KVM x86 PMU changes for 6.8:

 - Fix a variety of bugs where KVM fails to stop/reset counters and other state
   prior to refreshing the vPMU model.

 - Fix a double-overflow PMU bug by tracking emulated counter events using a
   dedicated field instead of snapshotting the "previous" counter.  If a
   hardware PMC overflow is recognized in the same VM-Exit in which KVM
   manually bumps an event count, KVM would pend PMIs for both the
   hardware-triggered overflow and the KVM-triggered overflow.
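
The double-overflow fix is easiest to see in the reworked pmc_pause_counter() in the diff below: counts observed by hardware (via perf) and counts emulated by KVM are folded together, and KVM synthesizes a PMI only when the emulated events themselves wrap the counter. What follows is a minimal user-space sketch of that accounting, not kernel code; the toy_* names, the 48-bit counter width, and the sample values are illustrative assumptions.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PMC_BITMASK ((1ull << 48) - 1)   /* assumed 48-bit counter width */

struct toy_pmc {
        uint64_t counter;           /* value accumulated from "hardware" (perf) */
        uint64_t emulated_counter;  /* events injected by instruction emulation */
};

/*
 * Mirrors the shape of the reworked pmc_pause_counter(): fold the hardware
 * delta and any pending emulated events into the counter, and report whether
 * the emulated events (and only those) pushed the counter past the wrap
 * point.  A wrap that happened purely in hardware is assumed to raise its
 * own PMI via perf.
 */
static bool toy_pause_counter(struct toy_pmc *pmc, uint64_t hw_delta)
{
        uint64_t counter = pmc->counter + hw_delta;
        uint64_t prev_counter = counter & PMC_BITMASK;

        counter += pmc->emulated_counter;
        pmc->counter = counter & PMC_BITMASK;
        pmc->emulated_counter = 0;

        return pmc->counter < prev_counter;
}

int main(void)
{
        /* Counter programmed to overflow after one more event. */
        struct toy_pmc pmc = { .counter = PMC_BITMASK };

        /* Hardware counts one event, and KVM emulates one more in the same VM-Exit. */
        pmc.emulated_counter = 1;
        bool emulate_pmi = toy_pause_counter(&pmc, 1);

        printf("counter=%llu emulate_pmi=%d\n",
               (unsigned long long)pmc.counter, emulate_pmi);
        return 0;
}

Here the hardware event causes the wrap (and hardware/perf is responsible for that PMI), while the emulated event only advances the counter from 0 to 1, so the sketch pends no second PMI for the scenario described above.
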
Paolo Bonzini 2024-01-08 08:10:08 -05:00
commit 01edb1cfbd
7 changed files with 137 additions and 109 deletions

arch/x86/include/asm/kvm-x86-pmu-ops.h

@@ -22,7 +22,7 @@ KVM_X86_PMU_OP(get_msr)
KVM_X86_PMU_OP(set_msr)
KVM_X86_PMU_OP(refresh)
KVM_X86_PMU_OP(init)
KVM_X86_PMU_OP(reset)
KVM_X86_PMU_OP_OPTIONAL(reset)
KVM_X86_PMU_OP_OPTIONAL(deliver_pmi)
KVM_X86_PMU_OP_OPTIONAL(cleanup)

arch/x86/include/asm/kvm_host.h

@@ -500,8 +500,23 @@ struct kvm_pmc {
u8 idx;
bool is_paused;
bool intr;
/*
* Base value of the PMC counter, relative to the *consumed* count in
* the associated perf_event. This value includes counter updates from
* the perf_event and emulated_count since the last time the counter
* was reprogrammed, but it is *not* the current value as seen by the
* guest or userspace.
*
* The count is relative to the associated perf_event so that KVM
* doesn't need to reprogram the perf_event every time the guest writes
* to the counter.
*/
u64 counter;
u64 prev_counter;
/*
* PMC events triggered by KVM emulation that haven't been fully
* processed, i.e. haven't undergone overflow detection.
*/
u64 emulated_counter;
u64 eventsel;
struct perf_event *perf_event;
struct kvm_vcpu *vcpu;

arch/x86/kvm/pmu.c

@@ -127,9 +127,9 @@ static void kvm_perf_overflow(struct perf_event *perf_event,
struct kvm_pmc *pmc = perf_event->overflow_handler_context;
/*
* Ignore overflow events for counters that are scheduled to be
* reprogrammed, e.g. if a PMI for the previous event races with KVM's
* handling of a related guest WRMSR.
* Ignore asynchronous overflow events for counters that are scheduled
* to be reprogrammed, e.g. if a PMI for the previous event races with
* KVM's handling of a related guest WRMSR.
*/
if (test_and_set_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi))
return;
@@ -161,6 +161,15 @@ static u64 pmc_get_pebs_precise_level(struct kvm_pmc *pmc)
return 1;
}
static u64 get_sample_period(struct kvm_pmc *pmc, u64 counter_value)
{
u64 sample_period = (-counter_value) & pmc_bitmask(pmc);
if (!sample_period)
sample_period = pmc_bitmask(pmc) + 1;
return sample_period;
}
static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config,
bool exclude_user, bool exclude_kernel,
bool intr)
@@ -215,17 +224,30 @@ static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config,
return 0;
}
static void pmc_pause_counter(struct kvm_pmc *pmc)
static bool pmc_pause_counter(struct kvm_pmc *pmc)
{
u64 counter = pmc->counter;
if (!pmc->perf_event || pmc->is_paused)
return;
u64 prev_counter;
/* update counter, reset event value to avoid redundant accumulation */
counter += perf_event_pause(pmc->perf_event, true);
if (pmc->perf_event && !pmc->is_paused)
counter += perf_event_pause(pmc->perf_event, true);
/*
* Snapshot the previous counter *after* accumulating state from perf.
* If overflow already happened, hardware (via perf) is responsible for
* generating a PMI. KVM just needs to detect overflow on emulated
* counter events that haven't yet been processed.
*/
prev_counter = counter & pmc_bitmask(pmc);
counter += pmc->emulated_counter;
pmc->counter = counter & pmc_bitmask(pmc);
pmc->emulated_counter = 0;
pmc->is_paused = true;
return pmc->counter < prev_counter;
}
static bool pmc_resume_counter(struct kvm_pmc *pmc)
@@ -250,6 +272,51 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc)
return true;
}
static void pmc_release_perf_event(struct kvm_pmc *pmc)
{
if (pmc->perf_event) {
perf_event_release_kernel(pmc->perf_event);
pmc->perf_event = NULL;
pmc->current_config = 0;
pmc_to_pmu(pmc)->event_count--;
}
}
static void pmc_stop_counter(struct kvm_pmc *pmc)
{
if (pmc->perf_event) {
pmc->counter = pmc_read_counter(pmc);
pmc_release_perf_event(pmc);
}
}
static void pmc_update_sample_period(struct kvm_pmc *pmc)
{
if (!pmc->perf_event || pmc->is_paused ||
!is_sampling_event(pmc->perf_event))
return;
perf_event_period(pmc->perf_event,
get_sample_period(pmc, pmc->counter));
}
void pmc_write_counter(struct kvm_pmc *pmc, u64 val)
{
/*
* Drop any unconsumed accumulated counts, the WRMSR is a write, not a
* read-modify-write. Adjust the counter value so that its value is
* relative to the current count, as reading the current count from
* perf is faster than pausing and reprogramming the event in order to
* reset it to '0'. Note, this very sneakily offsets the accumulated
* emulated count too, by using pmc_read_counter()!
*/
pmc->emulated_counter = 0;
pmc->counter += val - pmc_read_counter(pmc);
pmc->counter &= pmc_bitmask(pmc);
pmc_update_sample_period(pmc);
}
EXPORT_SYMBOL_GPL(pmc_write_counter);
static int filter_cmp(const void *pa, const void *pb, u64 mask)
{
u64 a = *(u64 *)pa & mask;
@@ -383,14 +450,15 @@ static void reprogram_counter(struct kvm_pmc *pmc)
struct kvm_pmu *pmu = pmc_to_pmu(pmc);
u64 eventsel = pmc->eventsel;
u64 new_config = eventsel;
bool emulate_overflow;
u8 fixed_ctr_ctrl;
pmc_pause_counter(pmc);
emulate_overflow = pmc_pause_counter(pmc);
if (!pmc_event_is_allowed(pmc))
goto reprogram_complete;
if (pmc->counter < pmc->prev_counter)
if (emulate_overflow)
__kvm_perf_overflow(pmc, false);
if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
@@ -430,7 +498,6 @@ static void reprogram_counter(struct kvm_pmc *pmc)
reprogram_complete:
clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi);
pmc->prev_counter = 0;
}
void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
@@ -639,32 +706,60 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 0;
}
/* refresh PMU settings. This function generally is called when underlying
* settings are changed (such as changes of PMU CPUID by guest VMs), which
* should rarely happen.
static void kvm_pmu_reset(struct kvm_vcpu *vcpu)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
struct kvm_pmc *pmc;
int i;
pmu->need_cleanup = false;
bitmap_zero(pmu->reprogram_pmi, X86_PMC_IDX_MAX);
for_each_set_bit(i, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX) {
pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, i);
if (!pmc)
continue;
pmc_stop_counter(pmc);
pmc->counter = 0;
pmc->emulated_counter = 0;
if (pmc_is_gp(pmc))
pmc->eventsel = 0;
}
pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = 0;
static_call_cond(kvm_x86_pmu_reset)(vcpu);
}
/*
* Refresh the PMU configuration for the vCPU, e.g. if userspace changes CPUID
* and/or PERF_CAPABILITIES.
*/
void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
{
if (KVM_BUG_ON(kvm_vcpu_has_run(vcpu), vcpu->kvm))
return;
/*
* Stop/release all existing counters/events before realizing the new
* vPMU model.
*/
kvm_pmu_reset(vcpu);
bitmap_zero(vcpu_to_pmu(vcpu)->all_valid_pmc_idx, X86_PMC_IDX_MAX);
static_call(kvm_x86_pmu_refresh)(vcpu);
}
void kvm_pmu_reset(struct kvm_vcpu *vcpu)
{
static_call(kvm_x86_pmu_reset)(vcpu);
}
void kvm_pmu_init(struct kvm_vcpu *vcpu)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
memset(pmu, 0, sizeof(*pmu));
static_call(kvm_x86_pmu_init)(vcpu);
pmu->event_count = 0;
pmu->need_cleanup = false;
kvm_pmu_refresh(vcpu);
}
@@ -700,8 +795,7 @@ void kvm_pmu_destroy(struct kvm_vcpu *vcpu)
static void kvm_pmu_incr_counter(struct kvm_pmc *pmc)
{
pmc->prev_counter = pmc->counter;
pmc->counter = (pmc->counter + 1) & pmc_bitmask(pmc);
pmc->emulated_counter++;
kvm_pmu_request_counter_reprogram(pmc);
}

arch/x86/kvm/pmu.h

@@ -66,7 +66,8 @@ static inline u64 pmc_read_counter(struct kvm_pmc *pmc)
{
u64 counter, enabled, running;
counter = pmc->counter;
counter = pmc->counter + pmc->emulated_counter;
if (pmc->perf_event && !pmc->is_paused)
counter += perf_event_read_value(pmc->perf_event,
&enabled, &running);
@@ -74,29 +75,7 @@ static inline u64 pmc_read_counter(struct kvm_pmc *pmc)
return counter & pmc_bitmask(pmc);
}
static inline void pmc_write_counter(struct kvm_pmc *pmc, u64 val)
{
pmc->counter += val - pmc_read_counter(pmc);
pmc->counter &= pmc_bitmask(pmc);
}
static inline void pmc_release_perf_event(struct kvm_pmc *pmc)
{
if (pmc->perf_event) {
perf_event_release_kernel(pmc->perf_event);
pmc->perf_event = NULL;
pmc->current_config = 0;
pmc_to_pmu(pmc)->event_count--;
}
}
static inline void pmc_stop_counter(struct kvm_pmc *pmc)
{
if (pmc->perf_event) {
pmc->counter = pmc_read_counter(pmc);
pmc_release_perf_event(pmc);
}
}
void pmc_write_counter(struct kvm_pmc *pmc, u64 val);
static inline bool pmc_is_gp(struct kvm_pmc *pmc)
{
@@ -146,25 +125,6 @@ static inline struct kvm_pmc *get_fixed_pmc(struct kvm_pmu *pmu, u32 msr)
return NULL;
}
static inline u64 get_sample_period(struct kvm_pmc *pmc, u64 counter_value)
{
u64 sample_period = (-counter_value) & pmc_bitmask(pmc);
if (!sample_period)
sample_period = pmc_bitmask(pmc) + 1;
return sample_period;
}
static inline void pmc_update_sample_period(struct kvm_pmc *pmc)
{
if (!pmc->perf_event || pmc->is_paused ||
!is_sampling_event(pmc->perf_event))
return;
perf_event_period(pmc->perf_event,
get_sample_period(pmc, pmc->counter));
}
static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc)
{
struct kvm_pmu *pmu = pmc_to_pmu(pmc);
@@ -261,7 +221,6 @@ bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr);
int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
void kvm_pmu_refresh(struct kvm_vcpu *vcpu);
void kvm_pmu_reset(struct kvm_vcpu *vcpu);
void kvm_pmu_init(struct kvm_vcpu *vcpu);
void kvm_pmu_cleanup(struct kvm_vcpu *vcpu);
void kvm_pmu_destroy(struct kvm_vcpu *vcpu);

arch/x86/kvm/svm/pmu.c

@@ -161,7 +161,6 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER);
if (pmc) {
pmc_write_counter(pmc, data);
pmc_update_sample_period(pmc);
return 0;
}
/* MSR_EVNTSELn */
@@ -233,21 +232,6 @@ static void amd_pmu_init(struct kvm_vcpu *vcpu)
}
}
static void amd_pmu_reset(struct kvm_vcpu *vcpu)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
int i;
for (i = 0; i < KVM_AMD_PMC_MAX_GENERIC; i++) {
struct kvm_pmc *pmc = &pmu->gp_counters[i];
pmc_stop_counter(pmc);
pmc->counter = pmc->prev_counter = pmc->eventsel = 0;
}
pmu->global_ctrl = pmu->global_status = 0;
}
struct kvm_pmu_ops amd_pmu_ops __initdata = {
.hw_event_available = amd_hw_event_available,
.pmc_idx_to_pmc = amd_pmc_idx_to_pmc,
@@ -259,7 +243,6 @@ struct kvm_pmu_ops amd_pmu_ops __initdata = {
.set_msr = amd_pmu_set_msr,
.refresh = amd_pmu_refresh,
.init = amd_pmu_init,
.reset = amd_pmu_reset,
.EVENTSEL_EVENT = AMD64_EVENTSEL_EVENT,
.MAX_NR_GP_COUNTERS = KVM_AMD_PMC_MAX_GENERIC,
.MIN_NR_GP_COUNTERS = AMD64_NUM_COUNTERS,

arch/x86/kvm/vmx/pmu_intel.c

@@ -437,11 +437,9 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
!(msr & MSR_PMC_FULL_WIDTH_BIT))
data = (s64)(s32)data;
pmc_write_counter(pmc, data);
pmc_update_sample_period(pmc);
break;
} else if ((pmc = get_fixed_pmc(pmu, msr))) {
pmc_write_counter(pmc, data);
pmc_update_sample_period(pmc);
break;
} else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) {
reserved_bits = pmu->reserved_bits;
@@ -632,26 +630,6 @@ static void intel_pmu_init(struct kvm_vcpu *vcpu)
static void intel_pmu_reset(struct kvm_vcpu *vcpu)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
struct kvm_pmc *pmc = NULL;
int i;
for (i = 0; i < KVM_INTEL_PMC_MAX_GENERIC; i++) {
pmc = &pmu->gp_counters[i];
pmc_stop_counter(pmc);
pmc->counter = pmc->prev_counter = pmc->eventsel = 0;
}
for (i = 0; i < KVM_PMC_MAX_FIXED; i++) {
pmc = &pmu->fixed_counters[i];
pmc_stop_counter(pmc);
pmc->counter = pmc->prev_counter = 0;
}
pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = 0;
intel_pmu_release_guest_lbr_event(vcpu);
}

arch/x86/kvm/x86.c

@@ -12252,7 +12252,6 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
}
if (!init_event) {
kvm_pmu_reset(vcpu);
vcpu->arch.smbase = 0x30000;
vcpu->arch.msr_misc_features_enables = 0;