From bfe4daf850f45d92dcd3da477f0b0456620294c3 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 22 Mar 2022 15:15:05 -0700 Subject: [PATCH 01/32] perf/core: Add perf_clear_branch_entry_bitfields() helper Make it simpler to reset all the info fields on the perf_branch_entry by adding a helper inline function. The goal is to centralize the initialization to avoid missing a field in case more are added. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220322221517.2510440-2-eranian@google.com --- arch/x86/events/intel/lbr.c | 36 +++++++++++++++++------------------- include/linux/perf_event.h | 16 ++++++++++++++++ 2 files changed, 33 insertions(+), 19 deletions(-) diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index fe1742c4ca49..13179f31fe10 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -769,6 +769,7 @@ void intel_pmu_lbr_disable_all(void) void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) { unsigned long mask = x86_pmu.lbr_nr - 1; + struct perf_branch_entry *br = cpuc->lbr_entries; u64 tos = intel_pmu_lbr_tos(); int i; @@ -784,15 +785,11 @@ void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr); - cpuc->lbr_entries[i].from = msr_lastbranch.from; - cpuc->lbr_entries[i].to = msr_lastbranch.to; - cpuc->lbr_entries[i].mispred = 0; - cpuc->lbr_entries[i].predicted = 0; - cpuc->lbr_entries[i].in_tx = 0; - cpuc->lbr_entries[i].abort = 0; - cpuc->lbr_entries[i].cycles = 0; - cpuc->lbr_entries[i].type = 0; - cpuc->lbr_entries[i].reserved = 0; + perf_clear_branch_entry_bitfields(br); + + br->from = msr_lastbranch.from; + br->to = msr_lastbranch.to; + br++; } cpuc->lbr_stack.nr = i; cpuc->lbr_stack.hw_idx = tos; @@ -807,6 +804,7 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) { bool need_info = false, call_stack = false; unsigned long mask = x86_pmu.lbr_nr - 1; + struct perf_branch_entry *br 
= cpuc->lbr_entries; u64 tos = intel_pmu_lbr_tos(); int i; int out = 0; @@ -878,15 +876,14 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) if (abort && x86_pmu.lbr_double_abort && out > 0) out--; - cpuc->lbr_entries[out].from = from; - cpuc->lbr_entries[out].to = to; - cpuc->lbr_entries[out].mispred = mis; - cpuc->lbr_entries[out].predicted = pred; - cpuc->lbr_entries[out].in_tx = in_tx; - cpuc->lbr_entries[out].abort = abort; - cpuc->lbr_entries[out].cycles = cycles; - cpuc->lbr_entries[out].type = 0; - cpuc->lbr_entries[out].reserved = 0; + perf_clear_branch_entry_bitfields(br+out); + br[out].from = from; + br[out].to = to; + br[out].mispred = mis; + br[out].predicted = pred; + br[out].in_tx = in_tx; + br[out].abort = abort; + br[out].cycles = cycles; out++; } cpuc->lbr_stack.nr = out; @@ -951,6 +948,8 @@ static void intel_pmu_store_lbr(struct cpu_hw_events *cpuc, to = rdlbr_to(i, lbr); info = rdlbr_info(i, lbr); + perf_clear_branch_entry_bitfields(e); + e->from = from; e->to = to; e->mispred = get_lbr_mispred(info); @@ -959,7 +958,6 @@ static void intel_pmu_store_lbr(struct cpu_hw_events *cpuc, e->abort = !!(info & LBR_INFO_ABORT); e->cycles = get_lbr_cycles(info); e->type = get_lbr_br_type(info); - e->reserved = 0; } cpuc->lbr_stack.nr = i; diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index af97dd427501..a411080d5169 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1063,6 +1063,22 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, data->txn = 0; } +/* + * Clear all bitfields in the perf_branch_entry. + * The to and from fields are not cleared because they are + * systematically modified by caller. 
+ */ +static inline void perf_clear_branch_entry_bitfields(struct perf_branch_entry *br) +{ + br->mispred = 0; + br->predicted = 0; + br->in_tx = 0; + br->abort = 0; + br->cycles = 0; + br->type = 0; + br->reserved = 0; +} + extern void perf_output_sample(struct perf_output_handle *handle, struct perf_event_header *header, struct perf_sample_data *data, From a77d41ac3a0f41c80120ec5b8b08ab284fec950a Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 22 Mar 2022 15:15:06 -0700 Subject: [PATCH 02/32] x86/cpufeatures: Add AMD Fam19h Branch Sampling feature Add a cpu feature for AMD Fam19h Branch Sampling feature as bit 31 of EBX on CPUID leaf function 0x80000008. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220322221517.2510440-3-eranian@google.com --- arch/x86/include/asm/cpufeatures.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 73e643ae94b6..0d62afd525e3 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -315,6 +315,7 @@ #define X86_FEATURE_VIRT_SSBD (13*32+25) /* Virtualized Speculative Store Bypass Disable */ #define X86_FEATURE_AMD_SSB_NO (13*32+26) /* "" Speculative Store Bypass is fixed in hardware. */ #define X86_FEATURE_CPPC (13*32+27) /* Collaborative Processor Performance Control */ +#define X86_FEATURE_BRS (13*32+31) /* Branch Sampling available */ /* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */ #define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ From ada543459cab7f653dcacdaba4011a8bb19c627c Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 22 Mar 2022 15:15:07 -0700 Subject: [PATCH 03/32] perf/x86/amd: Add AMD Fam19h Branch Sampling support Add support for the AMD Fam19h 16-deep branch sampling feature as described in the AMD PPR Fam19h Model 01h Revision B1. This is a model specific extension. 
It is not an architected AMD feature. The Branch Sampling (BRS) operates with a 16-deep saturating buffer in MSR registers. There is no branch type filtering. All control flow changes are captured. BRS relies on specific programming of the core PMU of Fam19h. In particular, the following requirements must be met: - the sampling period must be greater than 16 (BRS depth) - the sampling period must use fixed mode, not frequency mode BRS interacts with the NMI interrupt as well. Because enabling BRS is expensive, it is only activated after P event occurrences, where P is the desired sampling period. At P occurrences of the event, the counter overflows, the CPU catches the interrupt, activates BRS for 16 branches until it saturates, and then delivers the NMI to the kernel. Between the overflow and the time BRS activates, more branches may be executed, skewing the period. All along, the sampling event keeps counting. The skid may be attenuated by reducing the sampling period by 16 (subsequent patch). BRS is integrated into perf_events seamlessly via the same PERF_RECORD_BRANCH_STACK sample format. BRS generates perf_branch_entry records in the sampling buffer. No prediction information is supported. The branches are stored in reverse order of execution. The most recent branch is the first entry in each record. No modification to the perf tool is necessary. BRS can be used with any sampling event. However, it is recommended to use the RETIRED_BRANCH_INSTRUCTIONS event because it matches what the BRS captures. $ perf record -b -c 1000037 -e cpu/event=0xc2,name=ret_br_instructions/ test $ perf report -D 56531696056126 0x193c000 [0x1a8]: PERF_RECORD_SAMPLE(IP, 0x2): 18122/18230: 0x401d24 period: 1000037 addr: 0 ... branch stack: nr:16 ..... 0: 0000000000401d24 -> 0000000000401d5a 0 cycles 0 ..... 1: 0000000000401d5c -> 0000000000401d24 0 cycles 0 ..... 2: 0000000000401d22 -> 0000000000401d5c 0 cycles 0 ..... 3: 0000000000401d5e -> 0000000000401d22 0 cycles 0 .....
4: 0000000000401d20 -> 0000000000401d5e 0 cycles 0 ..... 5: 0000000000401d3e -> 0000000000401d20 0 cycles 0 ..... 6: 0000000000401d42 -> 0000000000401d3e 0 cycles 0 ..... 7: 0000000000401d3c -> 0000000000401d42 0 cycles 0 ..... 8: 0000000000401d44 -> 0000000000401d3c 0 cycles 0 ..... 9: 0000000000401d3a -> 0000000000401d44 0 cycles 0 ..... 10: 0000000000401d46 -> 0000000000401d3a 0 cycles 0 ..... 11: 0000000000401d38 -> 0000000000401d46 0 cycles 0 ..... 12: 0000000000401d48 -> 0000000000401d38 0 cycles 0 ..... 13: 0000000000401d36 -> 0000000000401d48 0 cycles 0 ..... 14: 0000000000401d4a -> 0000000000401d36 0 cycles 0 ..... 15: 0000000000401d34 -> 0000000000401d4a 0 cycles 0 ... thread: test:18230 ...... dso: test Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220322221517.2510440-4-eranian@google.com --- arch/x86/events/amd/Makefile | 2 +- arch/x86/events/amd/brs.c | 317 +++++++++++++++++++++++++++++++ arch/x86/events/amd/core.c | 233 ++++++++++++++++++++++- arch/x86/events/core.c | 10 +- arch/x86/events/perf_event.h | 99 ++++++++-- arch/x86/include/asm/msr-index.h | 4 + 6 files changed, 644 insertions(+), 21 deletions(-) create mode 100644 arch/x86/events/amd/brs.c diff --git a/arch/x86/events/amd/Makefile b/arch/x86/events/amd/Makefile index 6cbe38d5fd9d..cf323ffab5cd 100644 --- a/arch/x86/events/amd/Makefile +++ b/arch/x86/events/amd/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -obj-$(CONFIG_CPU_SUP_AMD) += core.o +obj-$(CONFIG_CPU_SUP_AMD) += core.o brs.o obj-$(CONFIG_PERF_EVENTS_AMD_POWER) += power.o obj-$(CONFIG_X86_LOCAL_APIC) += ibs.o obj-$(CONFIG_PERF_EVENTS_AMD_UNCORE) += amd-uncore.o diff --git a/arch/x86/events/amd/brs.c b/arch/x86/events/amd/brs.c new file mode 100644 index 000000000000..3c13c484c637 --- /dev/null +++ b/arch/x86/events/amd/brs.c @@ -0,0 +1,317 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Implement support for AMD Fam19h Branch Sampling feature + * Based on 
specifications published in AMD PPR Fam19 Model 01 + * + * Copyright 2021 Google LLC + * Contributed by Stephane Eranian + */ +#include +#include +#include + +#include "../perf_event.h" + +#define BRS_POISON 0xFFFFFFFFFFFFFFFEULL /* mark limit of valid entries */ + +/* Debug Extension Configuration register layout */ +union amd_debug_extn_cfg { + __u64 val; + struct { + __u64 rsvd0:2, /* reserved */ + brsmen:1, /* branch sample enable */ + rsvd4_3:2,/* reserved - must be 0x3 */ + vb:1, /* valid branches recorded */ + rsvd2:10, /* reserved */ + msroff:4, /* index of next entry to write */ + rsvd3:4, /* reserved */ + pmc:3, /* #PMC holding the sampling event */ + rsvd4:37; /* reserved */ + }; +}; + +static inline unsigned int brs_from(int idx) +{ + return MSR_AMD_SAMP_BR_FROM + 2 * idx; +} + +static inline unsigned int brs_to(int idx) +{ + return MSR_AMD_SAMP_BR_FROM + 2 * idx + 1; +} + +static inline void set_debug_extn_cfg(u64 val) +{ + /* bits[4:3] must always be set to 11b */ + wrmsrl(MSR_AMD_DBG_EXTN_CFG, val | 3ULL << 3); +} + +static inline u64 get_debug_extn_cfg(void) +{ + u64 val; + + rdmsrl(MSR_AMD_DBG_EXTN_CFG, val); + return val; +} + +static bool __init amd_brs_detect(void) +{ + if (!boot_cpu_has(X86_FEATURE_BRS)) + return false; + + switch (boot_cpu_data.x86) { + case 0x19: /* AMD Fam19h (Zen3) */ + x86_pmu.lbr_nr = 16; + + /* No hardware filtering supported */ + x86_pmu.lbr_sel_map = NULL; + x86_pmu.lbr_sel_mask = 0; + break; + default: + return false; + } + + return true; +} + +/* + * Current BRS implementation does not support branch type or privilege level + * filtering. Therefore, this function simply enforces these limitations. No need for + * a br_sel_map. Software filtering is not supported because it would not correlate well + * with a sampling period. 
+ */ +int amd_brs_setup_filter(struct perf_event *event) +{ + u64 type = event->attr.branch_sample_type; + + /* No BRS support */ + if (!x86_pmu.lbr_nr) + return -EOPNOTSUPP; + + /* Can only capture all branches, i.e., no filtering */ + if ((type & ~PERF_SAMPLE_BRANCH_PLM_ALL) != PERF_SAMPLE_BRANCH_ANY) + return -EINVAL; + + /* can only capture at all priv levels due to the way BRS works */ + if ((type & PERF_SAMPLE_BRANCH_PLM_ALL) != PERF_SAMPLE_BRANCH_PLM_ALL) + return -EINVAL; + + return 0; +} + +/* tos = top of stack, i.e., last valid entry written */ +static inline int amd_brs_get_tos(union amd_debug_extn_cfg *cfg) +{ + /* + * msroff: index of next entry to write so top-of-stack is one off + * if BRS is full then msroff is set back to 0. + */ + return (cfg->msroff ? cfg->msroff : x86_pmu.lbr_nr) - 1; +} + +/* + * make sure we have a sane BRS offset to begin with + * especially with kexec + */ +void amd_brs_reset(void) +{ + /* + * Reset config + */ + set_debug_extn_cfg(0); + + /* + * Mark first entry as poisoned + */ + wrmsrl(brs_to(0), BRS_POISON); +} + +int __init amd_brs_init(void) +{ + if (!amd_brs_detect()) + return -EOPNOTSUPP; + + pr_cont("%d-deep BRS, ", x86_pmu.lbr_nr); + + return 0; +} + +void amd_brs_enable(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + union amd_debug_extn_cfg cfg; + + /* Activate only on first user */ + if (++cpuc->brs_active > 1) + return; + + cfg.val = 0; /* reset all fields */ + cfg.brsmen = 1; /* enable branch sampling */ + + /* Set enable bit */ + set_debug_extn_cfg(cfg.val); +} + +void amd_brs_enable_all(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + if (cpuc->lbr_users) + amd_brs_enable(); +} + +void amd_brs_disable(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + union amd_debug_extn_cfg cfg; + + /* Check if active (could be disabled via x86_pmu_disable_all()) */ + if (!cpuc->brs_active) + return; + + /* Only disable for last user */ + if 
(--cpuc->brs_active) + return; + + /* + * Clear the brsmen bit but preserve the others as they contain + * useful state such as vb and msroff + */ + cfg.val = get_debug_extn_cfg(); + + /* + * When coming in on interrupt and BRS is full, then hw will have + * already stopped BRS, no need to issue wrmsr again + */ + if (cfg.brsmen) { + cfg.brsmen = 0; + set_debug_extn_cfg(cfg.val); + } +} + +void amd_brs_disable_all(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + if (cpuc->lbr_users) + amd_brs_disable(); +} + +/* + * Caller must ensure amd_brs_inuse() is true before calling + * return: + */ +void amd_brs_drain(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct perf_event *event = cpuc->events[0]; + struct perf_branch_entry *br = cpuc->lbr_entries; + union amd_debug_extn_cfg cfg; + u32 i, nr = 0, num, tos, start; + u32 shift = 64 - boot_cpu_data.x86_virt_bits; + + /* + * BRS event forced on PMC0, + * so check if there is an event. + * It is possible to have lbr_users > 0 but the event + * not yet scheduled due to long latency PMU irq + */ + if (!event) + goto empty; + + cfg.val = get_debug_extn_cfg(); + + /* Sanity check [0-x86_pmu.lbr_nr] */ + if (WARN_ON_ONCE(cfg.msroff >= x86_pmu.lbr_nr)) + goto empty; + + /* No valid branch */ + if (cfg.vb == 0) + goto empty; + + /* + * msr.off points to next entry to be written + * tos = most recent entry index = msr.off - 1 + * BRS register buffer saturates, so we know we have + * start < tos and that we have to read from start to tos + */ + start = 0; + tos = amd_brs_get_tos(&cfg); + + num = tos - start + 1; + + /* + * BRS is only one pass (saturation) from MSROFF to depth-1 + * MSROFF wraps to zero when buffer is full + */ + for (i = 0; i < num; i++) { + u32 brs_idx = tos - i; + u64 from, to; + + rdmsrl(brs_to(brs_idx), to); + + /* Entry does not belong to us (as marked by kernel) */ + if (to == BRS_POISON) + break; + + rdmsrl(brs_from(brs_idx), from); + + /* + * Sign-extend 
SAMP_BR_TO to 64 bits, bits 61-63 are reserved. + * Necessary to generate proper virtual addresses suitable for + * symbolization + */ + to = (u64)(((s64)to << shift) >> shift); + + perf_clear_branch_entry_bitfields(br+nr); + + br[nr].from = from; + br[nr].to = to; + + nr++; + } +empty: + /* Record number of sampled branches */ + cpuc->lbr_stack.nr = nr; +} + +/* + * Poison most recent entry to prevent reuse by next task + * required because BRS entry are not tagged by PID + */ +static void amd_brs_poison_buffer(void) +{ + union amd_debug_extn_cfg cfg; + unsigned int idx; + + /* Get current state */ + cfg.val = get_debug_extn_cfg(); + + /* idx is most recently written entry */ + idx = amd_brs_get_tos(&cfg); + + /* Poison target of entry */ + wrmsrl(brs_to(idx), BRS_POISON); +} + +/* + * On context switch in, we need to make sure no samples from previous user + * are left in the BRS. + * + * On ctxswin, sched_in = true, called after the PMU has started + * On ctxswout, sched_in = false, called before the PMU is stopped + */ +void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + /* no active users */ + if (!cpuc->lbr_users) + return; + + /* + * On context switch in, we need to ensure we do not use entries + * from previous BRS user on that CPU, so we poison the buffer as + * a faster way compared to resetting all entries. 
+ */ + if (sched_in) + amd_brs_poison_buffer(); +} diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 9687a8aef01c..c7ac70d8ed9a 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -325,8 +325,16 @@ static inline bool amd_is_pair_event_code(struct hw_perf_event *hwc) } } +#define AMD_FAM19H_BRS_EVENT 0xc4 /* RETIRED_TAKEN_BRANCH_INSTRUCTIONS */ +static inline int amd_is_brs_event(struct perf_event *e) +{ + return (e->hw.config & AMD64_RAW_EVENT_MASK) == AMD_FAM19H_BRS_EVENT; +} + static int amd_core_hw_config(struct perf_event *event) { + int ret = 0; + if (event->attr.exclude_host && event->attr.exclude_guest) /* * When HO == GO == 1 the hardware treats that as GO == HO == 0 @@ -343,7 +351,66 @@ static int amd_core_hw_config(struct perf_event *event) if ((x86_pmu.flags & PMU_FL_PAIR) && amd_is_pair_event_code(&event->hw)) event->hw.flags |= PERF_X86_EVENT_PAIR; - return 0; + /* + * if branch stack is requested + */ + if (has_branch_stack(event)) { + /* + * Due to interrupt holding, BRS is not recommended in + * counting mode. + */ + if (!is_sampling_event(event)) + return -EINVAL; + + /* + * Due to the way BRS operates by holding the interrupt until + * lbr_nr entries have been captured, it does not make sense + * to allow sampling on BRS with an event that does not match + * what BRS is capturing, i.e., retired taken branches. + * Otherwise the correlation with the event's period is even + * more loose: + * + * With retired taken branch: + * Effective P = P + 16 + X + * With any other event: + * Effective P = P + Y + X + * + * Where X is the number of taken branches due to interrupt + * skid. Skid is large. + * + * Where Y is the number of occurrences of the event while BRS is + * capturing the lbr_nr entries. + * + * By using retired taken branches, we limit the impact on the + * Y variable. We know it cannot be more than the depth of + * BRS.
+ */ + if (!amd_is_brs_event(event)) + return -EINVAL; + + /* + * BRS implementation does not work with frequency mode + * reprogramming of the period. + */ + if (event->attr.freq) + return -EINVAL; + /* + * The kernel subtracts BRS depth from period, so it must + * be big enough. + */ + if (event->attr.sample_period <= x86_pmu.lbr_nr) + return -EINVAL; + + /* + * Check if we can allow PERF_SAMPLE_BRANCH_STACK + */ + ret = amd_brs_setup_filter(event); + + /* only set in case of success */ + if (!ret) + event->hw.flags |= PERF_X86_EVENT_AMD_BRS; + } + return ret; } static inline int amd_is_nb_event(struct hw_perf_event *hwc) @@ -366,7 +433,7 @@ static int amd_pmu_hw_config(struct perf_event *event) if (event->attr.precise_ip && get_ibs_caps()) return -ENOENT; - if (has_branch_stack(event)) + if (has_branch_stack(event) && !x86_pmu.lbr_nr) return -EOPNOTSUPP; ret = x86_pmu_hw_config(event); @@ -555,6 +622,8 @@ static void amd_pmu_cpu_starting(int cpu) cpuc->amd_nb->nb_id = nb_id; cpuc->amd_nb->refcnt++; + + amd_brs_reset(); } static void amd_pmu_cpu_dead(int cpu) @@ -610,6 +679,8 @@ static void amd_pmu_disable_all(void) struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int idx; + amd_brs_disable_all(); + x86_pmu_disable_all(); /* @@ -634,6 +705,30 @@ static void amd_pmu_disable_all(void) } } +static void amd_pmu_enable_event(struct perf_event *event) +{ + x86_pmu_enable_event(event); +} + +static void amd_pmu_enable_all(int added) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct hw_perf_event *hwc; + int idx; + + amd_brs_enable_all(); + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + hwc = &cpuc->events[idx]->hw; + + /* only activate events which are marked as active */ + if (!test_bit(idx, cpuc->active_mask)) + continue; + + amd_pmu_enable_event(cpuc->events[idx]); + } +} + static void amd_pmu_disable_event(struct perf_event *event) { x86_pmu_disable_event(event); @@ -651,6 +746,18 @@ static void amd_pmu_disable_event(struct 
perf_event *event) amd_pmu_wait_on_overflow(event->hw.idx); } +static void amd_pmu_add_event(struct perf_event *event) +{ + if (needs_branch_stack(event)) + amd_pmu_brs_add(event); +} + +static void amd_pmu_del_event(struct perf_event *event) +{ + if (needs_branch_stack(event)) + amd_pmu_brs_del(event); +} + /* * Because of NMI latency, if multiple PMC counters are active or other sources * of NMIs are received, the perf NMI handler can handle one or more overflowed @@ -671,11 +778,31 @@ static void amd_pmu_disable_event(struct perf_event *event) */ static int amd_pmu_handle_irq(struct pt_regs *regs) { + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int handled; + int pmu_enabled; + + /* + * Save the PMU state. + * It needs to be restored when leaving the handler. + */ + pmu_enabled = cpuc->enabled; + cpuc->enabled = 0; + + /* stop everything (includes BRS) */ + amd_pmu_disable_all(); + + /* Drain BRS if in use (could be inactive) */ + if (cpuc->lbr_users) + amd_brs_drain(); /* Process any counter overflows */ handled = x86_pmu_handle_irq(regs); + cpuc->enabled = pmu_enabled; + if (pmu_enabled) + amd_pmu_enable_all(0); + /* * If a counter was handled, record a timestamp such that un-handled * NMIs will be claimed if arriving within that window. @@ -897,6 +1024,51 @@ static void amd_put_event_constraints_f17h(struct cpu_hw_events *cpuc, --cpuc->n_pair; } +/* + * Because of the way BRS operates with inactive and active phases, and + * the link to one counter, it is not possible to have two events using BRS + * scheduled at the same time. There would be an issue with enforcing the + * period of each one and given that the BRS saturates, it would not be possible + * to guarantee correlated content for all events. Therefore, in situations + * where multiple events want to use BRS, the kernel enforces mutual exclusion. + * Exclusion is enforced by choosing only one counter for events using BRS.
+ * The event scheduling logic will then automatically multiplex the + * events and ensure that at most one event is actively using BRS. + * + * The BRS counter could be any counter, but there is no constraint on Fam19h, + * therefore all counters are equal and thus we pick the first one: PMC0 + */ +static struct event_constraint amd_fam19h_brs_cntr0_constraint = + EVENT_CONSTRAINT(0, 0x1, AMD64_RAW_EVENT_MASK); + +static struct event_constraint amd_fam19h_brs_pair_cntr0_constraint = + __EVENT_CONSTRAINT(0, 0x1, AMD64_RAW_EVENT_MASK, 1, 0, PERF_X86_EVENT_PAIR); + +static struct event_constraint * +amd_get_event_constraints_f19h(struct cpu_hw_events *cpuc, int idx, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + bool has_brs = has_amd_brs(hwc); + + /* + * In case BRS is used with an event requiring a counter pair, + * the kernel allows it but only on counter 0 & 1 to enforce + * multiplexing requiring to protect BRS in case of multiple + * BRS users + */ + if (amd_is_pair_event_code(hwc)) { + return has_brs ? 
&amd_fam19h_brs_pair_cntr0_constraint + : &pair_constraint; + } + + if (has_brs) + return &amd_fam19h_brs_cntr0_constraint; + + return &unconstrained; +} + + static ssize_t amd_event_sysfs_show(char *page, u64 config) { u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT) | @@ -905,12 +1077,19 @@ static ssize_t amd_event_sysfs_show(char *page, u64 config) return x86_event_sysfs_show(page, config, event); } +static void amd_pmu_sched_task(struct perf_event_context *ctx, + bool sched_in) +{ + if (sched_in && x86_pmu.lbr_nr) + amd_pmu_brs_sched_task(ctx, sched_in); +} + static __initconst const struct x86_pmu amd_pmu = { .name = "AMD", .handle_irq = amd_pmu_handle_irq, .disable_all = amd_pmu_disable_all, - .enable_all = x86_pmu_enable_all, - .enable = x86_pmu_enable_event, + .enable_all = amd_pmu_enable_all, + .enable = amd_pmu_enable_event, .disable = amd_pmu_disable_event, .hw_config = amd_pmu_hw_config, .schedule_events = x86_schedule_events, @@ -920,6 +1099,8 @@ static __initconst const struct x86_pmu amd_pmu = { .event_map = amd_pmu_event_map, .max_events = ARRAY_SIZE(amd_perfmon_event_map), .num_counters = AMD64_NUM_COUNTERS, + .add = amd_pmu_add_event, + .del = amd_pmu_del_event, .cntval_bits = 48, .cntval_mask = (1ULL << 48) - 1, .apic = 1, @@ -938,6 +1119,37 @@ static __initconst const struct x86_pmu amd_pmu = { .amd_nb_constraints = 1, }; +static ssize_t branches_show(struct device *cdev, + struct device_attribute *attr, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu.lbr_nr); +} + +static DEVICE_ATTR_RO(branches); + +static struct attribute *amd_pmu_brs_attrs[] = { + &dev_attr_branches.attr, + NULL, +}; + +static umode_t +amd_brs_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + return x86_pmu.lbr_nr ? 
attr->mode : 0; +} + +static struct attribute_group group_caps_amd_brs = { + .name = "caps", + .attrs = amd_pmu_brs_attrs, + .is_visible = amd_brs_is_visible, +}; + +static const struct attribute_group *amd_attr_update[] = { + &group_caps_amd_brs, + NULL, +}; + static int __init amd_core_pmu_init(void) { u64 even_ctr_mask = 0ULL; @@ -989,6 +1201,19 @@ static int __init amd_core_pmu_init(void) x86_pmu.flags |= PMU_FL_PAIR; } + /* + * BRS requires special event constraints and flushing on ctxsw. + */ + if (boot_cpu_data.x86 >= 0x19 && !amd_brs_init()) { + x86_pmu.get_event_constraints = amd_get_event_constraints_f19h; + x86_pmu.sched_task = amd_pmu_sched_task; + /* + * put_event_constraints callback same as Fam17h, set above + */ + } + + x86_pmu.attr_update = amd_attr_update; + pr_cont("core perfctr, "); return 0; } diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index eef816fc216d..7ada9172b074 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1338,6 +1338,10 @@ static void x86_pmu_enable(struct pmu *pmu) if (hwc->state & PERF_HES_ARCH) continue; + /* + * if cpuc->enabled = 0, then no wrmsr as + * per x86_pmu_enable_event() + */ x86_pmu_start(event, PERF_EF_RELOAD); } cpuc->n_added = 0; @@ -1704,11 +1708,15 @@ int x86_pmu_handle_irq(struct pt_regs *regs) * event overflow */ handled++; - perf_sample_data_init(&data, 0, event->hw.last_period); if (!x86_perf_event_set_period(event)) continue; + perf_sample_data_init(&data, 0, event->hw.last_period); + + if (has_branch_stack(event)) + data.br_stack = &cpuc->lbr_stack; + if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); } diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 150261d929b9..6f1265163c9f 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -67,22 +67,23 @@ static inline bool constraint_match(struct event_constraint *c, u64 ecode) /* * struct hw_perf_event.flags flags */ -#define PERF_X86_EVENT_PEBS_LDLAT 
0x0001 /* ld+ldlat data address sampling */ -#define PERF_X86_EVENT_PEBS_ST 0x0002 /* st data address sampling */ -#define PERF_X86_EVENT_PEBS_ST_HSW 0x0004 /* haswell style datala, store */ -#define PERF_X86_EVENT_PEBS_LD_HSW 0x0008 /* haswell style datala, load */ -#define PERF_X86_EVENT_PEBS_NA_HSW 0x0010 /* haswell style datala, unknown */ -#define PERF_X86_EVENT_EXCL 0x0020 /* HT exclusivity on counter */ -#define PERF_X86_EVENT_DYNAMIC 0x0040 /* dynamic alloc'd constraint */ +#define PERF_X86_EVENT_PEBS_LDLAT 0x00001 /* ld+ldlat data address sampling */ +#define PERF_X86_EVENT_PEBS_ST 0x00002 /* st data address sampling */ +#define PERF_X86_EVENT_PEBS_ST_HSW 0x00004 /* haswell style datala, store */ +#define PERF_X86_EVENT_PEBS_LD_HSW 0x00008 /* haswell style datala, load */ +#define PERF_X86_EVENT_PEBS_NA_HSW 0x00010 /* haswell style datala, unknown */ +#define PERF_X86_EVENT_EXCL 0x00020 /* HT exclusivity on counter */ +#define PERF_X86_EVENT_DYNAMIC 0x00040 /* dynamic alloc'd constraint */ -#define PERF_X86_EVENT_EXCL_ACCT 0x0100 /* accounted EXCL event */ -#define PERF_X86_EVENT_AUTO_RELOAD 0x0200 /* use PEBS auto-reload */ -#define PERF_X86_EVENT_LARGE_PEBS 0x0400 /* use large PEBS */ -#define PERF_X86_EVENT_PEBS_VIA_PT 0x0800 /* use PT buffer for PEBS */ -#define PERF_X86_EVENT_PAIR 0x1000 /* Large Increment per Cycle */ -#define PERF_X86_EVENT_LBR_SELECT 0x2000 /* Save/Restore MSR_LBR_SELECT */ -#define PERF_X86_EVENT_TOPDOWN 0x4000 /* Count Topdown slots/metrics events */ -#define PERF_X86_EVENT_PEBS_STLAT 0x8000 /* st+stlat data address sampling */ +#define PERF_X86_EVENT_EXCL_ACCT 0x00100 /* accounted EXCL event */ +#define PERF_X86_EVENT_AUTO_RELOAD 0x00200 /* use PEBS auto-reload */ +#define PERF_X86_EVENT_LARGE_PEBS 0x00400 /* use large PEBS */ +#define PERF_X86_EVENT_PEBS_VIA_PT 0x00800 /* use PT buffer for PEBS */ +#define PERF_X86_EVENT_PAIR 0x01000 /* Large Increment per Cycle */ +#define PERF_X86_EVENT_LBR_SELECT 0x02000 /* Save/Restore 
MSR_LBR_SELECT */ +#define PERF_X86_EVENT_TOPDOWN 0x04000 /* Count Topdown slots/metrics events */ +#define PERF_X86_EVENT_PEBS_STLAT 0x08000 /* st+stlat data address sampling */ +#define PERF_X86_EVENT_AMD_BRS 0x10000 /* AMD Branch Sampling */ static inline bool is_topdown_count(struct perf_event *event) { @@ -325,6 +326,8 @@ struct cpu_hw_events { * AMD specific bits */ struct amd_nb *amd_nb; + int brs_active; /* BRS is enabled */ + /* Inverted mask of bits to clear in the perf_ctr ctrl registers */ u64 perf_ctr_virt_mask; int n_pair; /* Large increment events */ @@ -1105,6 +1108,11 @@ int x86_pmu_hw_config(struct perf_event *event); void x86_pmu_disable_all(void); +static inline bool has_amd_brs(struct hw_perf_event *hwc) +{ + return hwc->flags & PERF_X86_EVENT_AMD_BRS; +} + static inline bool is_counter_pair(struct hw_perf_event *hwc) { return hwc->flags & PERF_X86_EVENT_PAIR; @@ -1210,6 +1218,50 @@ static inline bool fixed_counter_disabled(int i, struct pmu *pmu) #ifdef CONFIG_CPU_SUP_AMD int amd_pmu_init(void); +int amd_brs_init(void); +void amd_brs_disable(void); +void amd_brs_enable(void); +void amd_brs_enable_all(void); +void amd_brs_disable_all(void); +void amd_brs_drain(void); +void amd_brs_disable_all(void); +int amd_brs_setup_filter(struct perf_event *event); +void amd_brs_reset(void); + +static inline void amd_pmu_brs_add(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + perf_sched_cb_inc(event->ctx->pmu); + cpuc->lbr_users++; + /* + * No need to reset BRS because it is reset + * on brs_enable() and it is saturating + */ +} + +static inline void amd_pmu_brs_del(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + cpuc->lbr_users--; + WARN_ON_ONCE(cpuc->lbr_users < 0); + + perf_sched_cb_dec(event->ctx->pmu); +} + +void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in); + +/* + * check if BRS is activated on the CPU + * active defined as it has 
non-zero users and DBG_EXT_CFG.BRSEN=1 + */ +static inline bool amd_brs_active(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + return cpuc->brs_active; +} #else /* CONFIG_CPU_SUP_AMD */ @@ -1218,6 +1270,23 @@ static inline int amd_pmu_init(void) return 0; } +static inline int amd_brs_init(void) +{ + return -EOPNOTSUPP; +} + +static inline void amd_brs_drain(void) +{ +} + +static inline void amd_brs_enable_all(void) +{ +} + +static inline void amd_brs_disable_all(void) +{ +} + #endif /* CONFIG_CPU_SUP_AMD */ static inline int is_pebs_pt(struct perf_event *event) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 0eb90d21049e..8179ea351bd8 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -688,6 +688,10 @@ #define MSR_IA32_PERF_CTL 0x00000199 #define INTEL_PERF_CTL_MASK 0xffff +/* AMD Branch Sampling configuration */ +#define MSR_AMD_DBG_EXTN_CFG 0xc000010f +#define MSR_AMD_SAMP_BR_FROM 0xc0010300 + #define MSR_IA32_MPERF 0x000000e7 #define MSR_IA32_APERF 0x000000e8 From 44175993efbae04e8b2d7f7795ff512c3a726db0 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 22 Mar 2022 15:15:08 -0700 Subject: [PATCH 04/32] perf/x86/amd: Add branch-brs helper event for Fam19h BRS Add a pseudo event called branch-brs to help use the AMD Fam19h Branch Sampling feature (BRS). BRS samples taken branches, so it is best used when sampling on a retired taken branch event (0xc4), which is what BRS captures. Instead of trying to remember the event code or actual event name, users can simply do: $ perf record -b -e cpu/branch-brs/ -c 1000037 .....
Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220322221517.2510440-5-eranian@google.com --- arch/x86/events/amd/core.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index c7ac70d8ed9a..f7bce8364fe4 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -1145,8 +1145,23 @@ static struct attribute_group group_caps_amd_brs = { .is_visible = amd_brs_is_visible, }; +EVENT_ATTR_STR(branch-brs, amd_branch_brs, + "event=" __stringify(AMD_FAM19H_BRS_EVENT)"\n"); + +static struct attribute *amd_brs_events_attrs[] = { + EVENT_PTR(amd_branch_brs), + NULL, +}; + +static struct attribute_group group_events_amd_brs = { + .name = "events", + .attrs = amd_brs_events_attrs, + .is_visible = amd_brs_is_visible, +}; + static const struct attribute_group *amd_attr_update[] = { &group_caps_amd_brs, + &group_events_amd_brs, NULL, }; From 8910075d61a37e5b0d82e6c83ed9a0a31fe9ea08 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 22 Mar 2022 15:15:09 -0700 Subject: [PATCH 05/32] perf/x86/amd: Enable branch sampling priv level filtering The AMD Branch Sampling feature does not provide hardware filtering by privilege level. The associated PMU counter does but not the branch sampling by itself. Given how BRS operates there is a possibility that BRS captures kernel level branches even though the event is programmed to count only at the user level. Implement a workaround in software by removing the branches which belong to the wrong privilege level. The privilege level is evaluated on the target of the branch and not the source so as to be compatible with other architectures. As a consequence of this patch, the number of entries in the PERF_RECORD_BRANCH_STACK buffer may be less than the maximum (16). It could even be zero. 
Another consequence is that consecutive entries in the branch stack may not reflect actual code path and may have discontinuities, in case kernel branches were suppressed. But this is no different than what happens on other architectures. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220322221517.2510440-6-eranian@google.com --- arch/x86/events/amd/brs.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/arch/x86/events/amd/brs.c b/arch/x86/events/amd/brs.c index 3c13c484c637..40461c3ce714 100644 --- a/arch/x86/events/amd/brs.c +++ b/arch/x86/events/amd/brs.c @@ -92,10 +92,6 @@ int amd_brs_setup_filter(struct perf_event *event) if ((type & ~PERF_SAMPLE_BRANCH_PLM_ALL) != PERF_SAMPLE_BRANCH_ANY) return -EINVAL; - /* can only capture at all priv levels due to the way BRS works */ - if ((type & PERF_SAMPLE_BRANCH_PLM_ALL) != PERF_SAMPLE_BRANCH_PLM_ALL) - return -EINVAL; - return 0; } @@ -195,6 +191,21 @@ void amd_brs_disable_all(void) amd_brs_disable(); } +static bool amd_brs_match_plm(struct perf_event *event, u64 to) +{ + int type = event->attr.branch_sample_type; + int plm_k = PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_HV; + int plm_u = PERF_SAMPLE_BRANCH_USER; + + if (!(type & plm_k) && kernel_ip(to)) + return 0; + + if (!(type & plm_u) && !kernel_ip(to)) + return 0; + + return 1; +} + /* * Caller must ensure amd_brs_inuse() is true before calling * return: @@ -252,8 +263,6 @@ void amd_brs_drain(void) if (to == BRS_POISON) break; - rdmsrl(brs_from(brs_idx), from); - /* * Sign-extend SAMP_BR_TO to 64 bits, bits 61-63 are reserved. 
* Necessary to generate proper virtual addresses suitable for @@ -261,6 +270,11 @@ void amd_brs_drain(void) */ to = (u64)(((s64)to << shift) >> shift); + if (!amd_brs_match_plm(event, to)) + continue; + + rdmsrl(brs_from(brs_idx), from); + perf_clear_branch_entry_bitfields(br+nr); br[nr].from = from; From ba2fe7500845a30fc845a72081999cf632051862 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 22 Mar 2022 15:15:10 -0700 Subject: [PATCH 06/32] perf/x86/amd: Add AMD branch sampling period adjustment Add code to adjust the sampling event period when used with the Branch Sampling feature (BRS). Given the depth of the BRS (16), the period is reduced by that depth such that in the best case scenario, BRS saturates at the desired sampling period. In practice, though, the processor may execute more branches. Given a desired period P and a depth D, the kernel programs the actual period at P - D. After P occurrences of the sampling event, the counter overflows. It then may take X branches (skid) before the NMI is caught and held by the hardware and BRS activates. Then, after D branches, BRS saturates and the NMI is delivered. With no skid, the effective period would be (P - D) + D = P. In practice, however, it will likely be (P - D) + X + D. There is no way to eliminate X or predict X. 
Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220322221517.2510440-7-eranian@google.com --- arch/x86/events/core.c | 7 +++++++ arch/x86/events/perf_event.h | 12 ++++++++++++ 2 files changed, 19 insertions(+) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 7ada9172b074..54f992e65252 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1374,6 +1374,13 @@ int x86_perf_event_set_period(struct perf_event *event) x86_pmu.set_topdown_event_period) return x86_pmu.set_topdown_event_period(event); + /* + * decrease period by the depth of the BRS feature to get + * the last N taken branches and approximate the desired period + */ + if (has_branch_stack(event)) + period = amd_brs_adjust_period(period); + /* * If we are way outside a reasonable range then just skip forward: */ diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 6f1265163c9f..d91ff2c6cefe 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -1263,6 +1263,14 @@ static inline bool amd_brs_active(void) return cpuc->brs_active; } +static inline s64 amd_brs_adjust_period(s64 period) +{ + if (period > x86_pmu.lbr_nr) + return period - x86_pmu.lbr_nr; + + return period; +} + #else /* CONFIG_CPU_SUP_AMD */ static inline int amd_pmu_init(void) @@ -1287,6 +1295,10 @@ static inline void amd_brs_disable_all(void) { } +static inline s64 amd_brs_adjust_period(s64 period) +{ + return period; +} #endif /* CONFIG_CPU_SUP_AMD */ static inline int is_pebs_pt(struct perf_event *event) From cc37e520a236069c0de0e7ea455082fa11c73b12 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 22 Mar 2022 15:15:11 -0700 Subject: [PATCH 07/32] perf/x86/amd: Make Zen3 branch sampling opt-in Add a kernel config option CONFIG_PERF_EVENTS_AMD_BRS to make the support for AMD Zen3 Branch Sampling (BRS) an opt-in compile time option. 
Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220322221517.2510440-8-eranian@google.com --- arch/x86/events/Kconfig | 8 ++++++ arch/x86/events/amd/Makefile | 3 ++- arch/x86/events/perf_event.h | 51 ++++++++++++++++++++++++++++-------- 3 files changed, 50 insertions(+), 12 deletions(-) diff --git a/arch/x86/events/Kconfig b/arch/x86/events/Kconfig index d6cdfe631674..09c56965750a 100644 --- a/arch/x86/events/Kconfig +++ b/arch/x86/events/Kconfig @@ -44,4 +44,12 @@ config PERF_EVENTS_AMD_UNCORE To compile this driver as a module, choose M here: the module will be called 'amd-uncore'. + +config PERF_EVENTS_AMD_BRS + depends on PERF_EVENTS && CPU_SUP_AMD + bool "AMD Zen3 Branch Sampling support" + help + Enable AMD Zen3 branch sampling support (BRS) which samples up to + 16 consecutive taken branches in registers. + endmenu diff --git a/arch/x86/events/amd/Makefile b/arch/x86/events/amd/Makefile index cf323ffab5cd..b9f5d4610256 100644 --- a/arch/x86/events/amd/Makefile +++ b/arch/x86/events/amd/Makefile @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 -obj-$(CONFIG_CPU_SUP_AMD) += core.o brs.o +obj-$(CONFIG_CPU_SUP_AMD) += core.o +obj-$(CONFIG_PERF_EVENTS_AMD_BRS) += brs.o obj-$(CONFIG_PERF_EVENTS_AMD_POWER) += power.o obj-$(CONFIG_X86_LOCAL_APIC) += ibs.o obj-$(CONFIG_PERF_EVENTS_AMD_UNCORE) += amd-uncore.o diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index d91ff2c6cefe..ef27aee04b13 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -1218,6 +1218,8 @@ static inline bool fixed_counter_disabled(int i, struct pmu *pmu) #ifdef CONFIG_CPU_SUP_AMD int amd_pmu_init(void); + +#ifdef CONFIG_PERF_EVENTS_AMD_BRS int amd_brs_init(void); void amd_brs_disable(void); void amd_brs_enable(void); @@ -1252,17 +1254,6 @@ static inline void amd_pmu_brs_del(struct perf_event *event) void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in); -/* - * check if 
BRS is activated on the CPU - * active defined as it has non-zero users and DBG_EXT_CFG.BRSEN=1 - */ -static inline bool amd_brs_active(void) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - - return cpuc->brs_active; -} - static inline s64 amd_brs_adjust_period(s64 period) { if (period > x86_pmu.lbr_nr) @@ -1270,6 +1261,44 @@ static inline s64 amd_brs_adjust_period(s64 period) return period; } +#else +static inline int amd_brs_init(void) +{ + return 0; +} +static inline void amd_brs_disable(void) {} +static inline void amd_brs_enable(void) {} +static inline void amd_brs_drain(void) {} +static inline void amd_brs_lopwr_init(void) {} +static inline void amd_brs_disable_all(void) {} +static inline int amd_brs_setup_filter(struct perf_event *event) +{ + return 0; +} +static inline void amd_brs_reset(void) {} + +static inline void amd_pmu_brs_add(struct perf_event *event) +{ +} + +static inline void amd_pmu_brs_del(struct perf_event *event) +{ +} + +static inline void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in) +{ +} + +static inline s64 amd_brs_adjust_period(s64 period) +{ + return period; +} + +static inline void amd_brs_enable_all(void) +{ +} + +#endif #else /* CONFIG_CPU_SUP_AMD */ From 2a606a18cd672a16343d146a126721b34cc6adbd Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 22 Mar 2022 15:15:12 -0700 Subject: [PATCH 08/32] ACPI: Add perf low power callback Add an optional callback needed by some PMU features, e.g., AMD BRS, to give a chance to the perf_events code to change its state before a CPU goes to low power and after it comes back. The callback is void when the PERF_NEEDS_LOPWR_CB flag is not set. This flag must be set in arch specific perf_event.h header whenever needed. When not set, there is no impact on the ACPI code. 
Signed-off-by: Stephane Eranian [peterz: build fix] Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220322221517.2510440-9-eranian@google.com --- drivers/acpi/acpi_pad.c | 7 +++++++ drivers/acpi/processor_idle.c | 5 +++++ include/linux/perf_event.h | 6 ++++++ 3 files changed, 18 insertions(+) diff --git a/drivers/acpi/acpi_pad.c b/drivers/acpi/acpi_pad.c index f45979aa2d64..ec0e22a1e25d 100644 --- a/drivers/acpi/acpi_pad.c +++ b/drivers/acpi/acpi_pad.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -164,6 +165,9 @@ static int power_saving_thread(void *data) tsc_marked_unstable = 1; } local_irq_disable(); + + perf_lopwr_cb(true); + tick_broadcast_enable(); tick_broadcast_enter(); stop_critical_timings(); @@ -172,6 +176,9 @@ static int power_saving_thread(void *data) start_critical_timings(); tick_broadcast_exit(); + + perf_lopwr_cb(false); + local_irq_enable(); if (time_before(expire_time, jiffies)) { diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index 32b20efff5f8..05dc0e148f02 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -21,6 +21,7 @@ #include #include #include +#include #include /* @@ -549,6 +550,8 @@ static void wait_for_freeze(void) */ static void __cpuidle acpi_idle_do_entry(struct acpi_processor_cx *cx) { + perf_lopwr_cb(true); + if (cx->entry_method == ACPI_CSTATE_FFH) { /* Call into architectural FFH based C-state */ acpi_processor_ffh_cstate_enter(cx); @@ -559,6 +562,8 @@ static void __cpuidle acpi_idle_do_entry(struct acpi_processor_cx *cx) inb(cx->address); wait_for_freeze(); } + + perf_lopwr_cb(false); } /** diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index a411080d5169..da759560eec5 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1676,4 +1676,10 @@ typedef int (perf_snapshot_branch_stack_t)(struct perf_branch_entry *entries, unsigned int cnt); 
DECLARE_STATIC_CALL(perf_snapshot_branch_stack, perf_snapshot_branch_stack_t); +#ifndef PERF_NEEDS_LOPWR_CB +static inline void perf_lopwr_cb(bool mode) +{ +} +#endif + #endif /* _LINUX_PERF_EVENT_H */ From d5616bac7adadbf42a3b63b8717e75eb82a2cc2c Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 22 Mar 2022 15:15:13 -0700 Subject: [PATCH 09/32] perf/x86/amd: Add idle hooks for branch sampling On AMD Fam19h Zen3, the branch sampling (BRS) feature must be disabled before entering low power and re-enabled (if was active) when returning from low power. Otherwise, the NMI interrupt may be held up for too long and cause problems. Stopping BRS will cause the NMI to be delivered if it was held up. Define a perf_amd_brs_lopwr_cb() callback to stop/restart BRS. The callback is protected by a jump label which is enabled only when AMD BRS is detected. In all other cases, the callback is never called. Signed-off-by: Stephane Eranian [peterz: static_call() and build fixes] Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220322221517.2510440-10-eranian@google.com --- arch/x86/events/amd/brs.c | 33 +++++++++++++++++++++++++++++++ arch/x86/events/amd/core.c | 4 ++++ arch/x86/events/perf_event.h | 1 + arch/x86/include/asm/perf_event.h | 23 +++++++++++++++++++++ 4 files changed, 61 insertions(+) diff --git a/arch/x86/events/amd/brs.c b/arch/x86/events/amd/brs.c index 40461c3ce714..895c82165d85 100644 --- a/arch/x86/events/amd/brs.c +++ b/arch/x86/events/amd/brs.c @@ -7,6 +7,7 @@ * Contributed by Stephane Eranian */ #include +#include #include #include @@ -329,3 +330,35 @@ void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in) if (sched_in) amd_brs_poison_buffer(); } + +/* + * called from ACPI processor_idle.c or acpi_pad.c + * with interrupts disabled + */ +void perf_amd_brs_lopwr_cb(bool lopwr_in) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + union amd_debug_extn_cfg cfg; + + /* + * on mwait in, we may end up 
in non C0 state. + * we must disable branch sampling to avoid holding the NMI + * for too long. We disable it in hardware but we + * keep the state in cpuc, so we can re-enable. + * + * The hardware will deliver the NMI if needed when brsmen cleared + */ + if (cpuc->brs_active) { + cfg.val = get_debug_extn_cfg(); + cfg.brsmen = !lopwr_in; + set_debug_extn_cfg(cfg.val); + } +} + +DEFINE_STATIC_CALL_NULL(perf_lopwr_cb, perf_amd_brs_lopwr_cb); +EXPORT_STATIC_CALL_TRAMP_GPL(perf_lopwr_cb); + +void __init amd_brs_lopwr_init(void) +{ + static_call_update(perf_lopwr_cb, perf_amd_brs_lopwr_cb); +} diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index f7bce8364fe4..8e1e818f8195 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only #include +#include #include #include #include @@ -1225,6 +1226,9 @@ static int __init amd_core_pmu_init(void) /* * put_event_constraints callback same as Fam17h, set above */ + + /* branch sampling must be stopped when entering low power */ + amd_brs_lopwr_init(); } x86_pmu.attr_update = amd_attr_update; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index ef27aee04b13..3b0324584da3 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -1226,6 +1226,7 @@ void amd_brs_enable(void); void amd_brs_enable_all(void); void amd_brs_disable_all(void); void amd_brs_drain(void); +void amd_brs_lopwr_init(void); void amd_brs_disable_all(void); int amd_brs_setup_filter(struct perf_event *event); void amd_brs_reset(void); diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 58d9e4b1fa0a..8199fc5a37ea 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -2,6 +2,8 @@ #ifndef _ASM_X86_PERF_EVENT_H #define _ASM_X86_PERF_EVENT_H +#include + /* * Performance event hw details: */ @@ -513,6 +515,27 @@ static inline void intel_pt_handle_vmx(int on) #if 
defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) extern void amd_pmu_enable_virt(void); extern void amd_pmu_disable_virt(void); + +#if defined(CONFIG_PERF_EVENTS_AMD_BRS) + +#define PERF_NEEDS_LOPWR_CB 1 + +/* + * architectural low power callback impacts + * drivers/acpi/processor_idle.c + * drivers/acpi/acpi_pad.c + */ +extern void perf_amd_brs_lopwr_cb(bool lopwr_in); + +DECLARE_STATIC_CALL(perf_lopwr_cb, perf_amd_brs_lopwr_cb); + +static inline void perf_lopwr_cb(bool lopwr_in) +{ + static_call_mod(perf_lopwr_cb)(lopwr_in); +} + +#endif /* PERF_NEEDS_LOPWR_CB */ + #else static inline void amd_pmu_enable_virt(void) { } static inline void amd_pmu_disable_virt(void) { } From 7bebfe9dd802b80abff5a43e00ab68d98893a22c Mon Sep 17 00:00:00 2001 From: Yang Jihong Date: Thu, 24 Mar 2022 11:19:57 +0800 Subject: [PATCH 10/32] perf/x86: Unify format of events sysfs show Sysfs show formats of files in /sys/devices/cpu/events/ are not unified, some end with "\n", and some do not. Modify sysfs show format of events defined by EVENT_ATTR_STR to end with "\n". 
Before: $ ls /sys/devices/cpu/events/* | xargs -i sh -c 'echo -n "{}: "; cat -A {}; echo' branch-instructions: event=0xc4$ branch-misses: event=0xc5$ bus-cycles: event=0x3c,umask=0x01$ cache-misses: event=0x2e,umask=0x41$ cache-references: event=0x2e,umask=0x4f$ cpu-cycles: event=0x3c$ instructions: event=0xc0$ ref-cycles: event=0x00,umask=0x03$ slots: event=0x00,umask=0x4 topdown-bad-spec: event=0x00,umask=0x81 topdown-be-bound: event=0x00,umask=0x83 topdown-fe-bound: event=0x00,umask=0x82 topdown-retiring: event=0x00,umask=0x80 After: $ ls /sys/devices/cpu/events/* | xargs -i sh -c 'echo -n "{}: "; cat -A {}; echo' /sys/devices/cpu/events/branch-instructions: event=0xc4$ /sys/devices/cpu/events/branch-misses: event=0xc5$ /sys/devices/cpu/events/bus-cycles: event=0x3c,umask=0x01$ /sys/devices/cpu/events/cache-misses: event=0x2e,umask=0x41$ /sys/devices/cpu/events/cache-references: event=0x2e,umask=0x4f$ /sys/devices/cpu/events/cpu-cycles: event=0x3c$ /sys/devices/cpu/events/instructions: event=0xc0$ /sys/devices/cpu/events/ref-cycles: event=0x00,umask=0x03$ /sys/devices/cpu/events/slots: event=0x00,umask=0x4$ Signed-off-by: Yang Jihong Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220324031957.135595-1-yangjihong1@huawei.com --- arch/x86/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 54f992e65252..b08052b05db6 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1852,7 +1852,7 @@ ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, cha /* string trumps id */ if (pmu_attr->event_str) - return sprintf(page, "%s", pmu_attr->event_str); + return sprintf(page, "%s\n", pmu_attr->event_str); return x86_pmu.events_sysfs_show(page, config); } From 78ed93d72ded679e3caf0758357209887bda885f Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 4 Apr 2022 13:12:04 +0200 Subject: [PATCH 11/32] signal: Deliver SIGTRAP on perf 
event asynchronously if blocked With SIGTRAP on perf events, we have encountered termination of processes due to user space attempting to block delivery of SIGTRAP. Consider this case: ... sigset_t s; sigemptyset(&s); sigaddset(&s, SIGTRAP | <and others>); sigprocmask(SIG_BLOCK, &s, ...); ... When the perf event triggers, while SIGTRAP is blocked, force_sig_perf() will force the signal, but revert back to the default handler, thus terminating the task. This makes sense for error conditions, but not so much for explicitly requested monitoring. However, the expectation is still that signals generated by perf events are synchronous, which will no longer be the case if the signal is blocked and delivered later. To give user space the ability to clearly distinguish synchronous from asynchronous signals, introduce siginfo_t::si_perf_flags and TRAP_PERF_FLAG_ASYNC (opted for flags in case more binary information is required in future). The resolution to the problem is then to (a) no longer force the signal (avoiding the terminations), but (b) tell user space via si_perf_flags if the signal was synchronous or not, so that such signals can be handled differently (e.g. let user space decide to ignore or consider the data imprecise). The alternative of making the kernel ignore SIGTRAP on perf events if the signal is blocked may work for some usecases, but likely causes issues in others that then have to revert back to interception of sigprocmask() (which we want to avoid). [ A concrete example: when using breakpoint perf events to track data-flow, in a region of code where signals are blocked, data-flow can no longer be tracked accurately. When a relevant asynchronous signal is received after unblocking the signal, the data-flow tracking logic needs to know its state is imprecise. 
] Fixes: 97ba62b27867 ("perf: Add support for SIGTRAP on perf events") Reported-by: Dmitry Vyukov Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Acked-by: Geert Uytterhoeven Tested-by: Dmitry Vyukov Link: https://lore.kernel.org/r/20220404111204.935357-1-elver@google.com --- arch/arm/kernel/signal.c | 1 + arch/arm64/kernel/signal.c | 1 + arch/arm64/kernel/signal32.c | 1 + arch/m68k/kernel/signal.c | 1 + arch/sparc/kernel/signal32.c | 1 + arch/sparc/kernel/signal_64.c | 1 + arch/x86/kernel/signal_compat.c | 2 ++ include/linux/compat.h | 1 + include/linux/sched/signal.h | 2 +- include/uapi/asm-generic/siginfo.h | 7 +++++++ kernel/events/core.c | 4 ++-- kernel/signal.c | 18 ++++++++++++++++-- 12 files changed, 35 insertions(+), 5 deletions(-) diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index 459abc5d1819..ea128e32e8ca 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -708,6 +708,7 @@ static_assert(offsetof(siginfo_t, si_upper) == 0x18); static_assert(offsetof(siginfo_t, si_pkey) == 0x14); static_assert(offsetof(siginfo_t, si_perf_data) == 0x10); static_assert(offsetof(siginfo_t, si_perf_type) == 0x14); +static_assert(offsetof(siginfo_t, si_perf_flags) == 0x18); static_assert(offsetof(siginfo_t, si_band) == 0x0c); static_assert(offsetof(siginfo_t, si_fd) == 0x10); static_assert(offsetof(siginfo_t, si_call_addr) == 0x0c); diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index 4a4122ef6f39..41b5d9d3672a 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -1011,6 +1011,7 @@ static_assert(offsetof(siginfo_t, si_upper) == 0x28); static_assert(offsetof(siginfo_t, si_pkey) == 0x20); static_assert(offsetof(siginfo_t, si_perf_data) == 0x18); static_assert(offsetof(siginfo_t, si_perf_type) == 0x20); +static_assert(offsetof(siginfo_t, si_perf_flags) == 0x24); static_assert(offsetof(siginfo_t, si_band) == 0x10); static_assert(offsetof(siginfo_t, si_fd) == 0x18); 
static_assert(offsetof(siginfo_t, si_call_addr) == 0x10); diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c index d984282b979f..4700f8522d27 100644 --- a/arch/arm64/kernel/signal32.c +++ b/arch/arm64/kernel/signal32.c @@ -487,6 +487,7 @@ static_assert(offsetof(compat_siginfo_t, si_upper) == 0x18); static_assert(offsetof(compat_siginfo_t, si_pkey) == 0x14); static_assert(offsetof(compat_siginfo_t, si_perf_data) == 0x10); static_assert(offsetof(compat_siginfo_t, si_perf_type) == 0x14); +static_assert(offsetof(compat_siginfo_t, si_perf_flags) == 0x18); static_assert(offsetof(compat_siginfo_t, si_band) == 0x0c); static_assert(offsetof(compat_siginfo_t, si_fd) == 0x10); static_assert(offsetof(compat_siginfo_t, si_call_addr) == 0x0c); diff --git a/arch/m68k/kernel/signal.c b/arch/m68k/kernel/signal.c index 49533f65958a..b9f6908a31bc 100644 --- a/arch/m68k/kernel/signal.c +++ b/arch/m68k/kernel/signal.c @@ -625,6 +625,7 @@ static inline void siginfo_build_tests(void) /* _sigfault._perf */ BUILD_BUG_ON(offsetof(siginfo_t, si_perf_data) != 0x10); BUILD_BUG_ON(offsetof(siginfo_t, si_perf_type) != 0x14); + BUILD_BUG_ON(offsetof(siginfo_t, si_perf_flags) != 0x18); /* _sigpoll */ BUILD_BUG_ON(offsetof(siginfo_t, si_band) != 0x0c); diff --git a/arch/sparc/kernel/signal32.c b/arch/sparc/kernel/signal32.c index f9fe502b81c6..dad38960d1a8 100644 --- a/arch/sparc/kernel/signal32.c +++ b/arch/sparc/kernel/signal32.c @@ -779,5 +779,6 @@ static_assert(offsetof(compat_siginfo_t, si_upper) == 0x18); static_assert(offsetof(compat_siginfo_t, si_pkey) == 0x14); static_assert(offsetof(compat_siginfo_t, si_perf_data) == 0x10); static_assert(offsetof(compat_siginfo_t, si_perf_type) == 0x14); +static_assert(offsetof(compat_siginfo_t, si_perf_flags) == 0x18); static_assert(offsetof(compat_siginfo_t, si_band) == 0x0c); static_assert(offsetof(compat_siginfo_t, si_fd) == 0x10); diff --git a/arch/sparc/kernel/signal_64.c b/arch/sparc/kernel/signal_64.c index 
8b9fc76cd3e0..570e43e6fda5 100644 --- a/arch/sparc/kernel/signal_64.c +++ b/arch/sparc/kernel/signal_64.c @@ -590,5 +590,6 @@ static_assert(offsetof(siginfo_t, si_upper) == 0x28); static_assert(offsetof(siginfo_t, si_pkey) == 0x20); static_assert(offsetof(siginfo_t, si_perf_data) == 0x18); static_assert(offsetof(siginfo_t, si_perf_type) == 0x20); +static_assert(offsetof(siginfo_t, si_perf_flags) == 0x24); static_assert(offsetof(siginfo_t, si_band) == 0x10); static_assert(offsetof(siginfo_t, si_fd) == 0x14); diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c index b52407c56000..879ef8c72f5c 100644 --- a/arch/x86/kernel/signal_compat.c +++ b/arch/x86/kernel/signal_compat.c @@ -149,8 +149,10 @@ static inline void signal_compat_build_tests(void) BUILD_BUG_ON(offsetof(siginfo_t, si_perf_data) != 0x18); BUILD_BUG_ON(offsetof(siginfo_t, si_perf_type) != 0x20); + BUILD_BUG_ON(offsetof(siginfo_t, si_perf_flags) != 0x24); BUILD_BUG_ON(offsetof(compat_siginfo_t, si_perf_data) != 0x10); BUILD_BUG_ON(offsetof(compat_siginfo_t, si_perf_type) != 0x14); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_perf_flags) != 0x18); CHECK_CSI_OFFSET(_sigpoll); CHECK_CSI_SIZE (_sigpoll, 2*sizeof(int)); diff --git a/include/linux/compat.h b/include/linux/compat.h index 1c758b0e0359..01fddf72a81f 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -235,6 +235,7 @@ typedef struct compat_siginfo { struct { compat_ulong_t _data; u32 _type; + u32 _flags; } _perf; }; } _sigfault; diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 3c8b34876744..bab7cc56b13a 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -320,7 +320,7 @@ int send_sig_mceerr(int code, void __user *, short, struct task_struct *); int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper); int force_sig_pkuerr(void __user *addr, u32 pkey); -int force_sig_perf(void __user *addr, u32 type, u64 sig_data); +int 
send_sig_perf(void __user *addr, u32 type, u64 sig_data); int force_sig_ptrace_errno_trap(int errno, void __user *addr); int force_sig_fault_trapno(int sig, int code, void __user *addr, int trapno); diff --git a/include/uapi/asm-generic/siginfo.h b/include/uapi/asm-generic/siginfo.h index 3ba180f550d7..ffbe4cec9f32 100644 --- a/include/uapi/asm-generic/siginfo.h +++ b/include/uapi/asm-generic/siginfo.h @@ -99,6 +99,7 @@ union __sifields { struct { unsigned long _data; __u32 _type; + __u32 _flags; } _perf; }; } _sigfault; @@ -164,6 +165,7 @@ typedef struct siginfo { #define si_pkey _sifields._sigfault._addr_pkey._pkey #define si_perf_data _sifields._sigfault._perf._data #define si_perf_type _sifields._sigfault._perf._type +#define si_perf_flags _sifields._sigfault._perf._flags #define si_band _sifields._sigpoll._band #define si_fd _sifields._sigpoll._fd #define si_call_addr _sifields._sigsys._call_addr @@ -270,6 +272,11 @@ typedef struct siginfo { * that are of the form: ((PTRACE_EVENT_XXX << 8) | SIGTRAP) */ +/* + * Flags for si_perf_flags if SIGTRAP si_code is TRAP_PERF. 
+ */ +#define TRAP_PERF_FLAG_ASYNC (1u << 0) + /* * SIGCHLD si_codes */ diff --git a/kernel/events/core.c b/kernel/events/core.c index cfde994ce61c..6eafb1b0ad4a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6533,8 +6533,8 @@ static void perf_sigtrap(struct perf_event *event) if (current->flags & PF_EXITING) return; - force_sig_perf((void __user *)event->pending_addr, - event->attr.type, event->attr.sig_data); + send_sig_perf((void __user *)event->pending_addr, + event->attr.type, event->attr.sig_data); } static void perf_pending_event_disable(struct perf_event *event) diff --git a/kernel/signal.c b/kernel/signal.c index 30cd1ca43bcd..e43bc2a692f5 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1805,7 +1805,7 @@ int force_sig_pkuerr(void __user *addr, u32 pkey) } #endif -int force_sig_perf(void __user *addr, u32 type, u64 sig_data) +int send_sig_perf(void __user *addr, u32 type, u64 sig_data) { struct kernel_siginfo info; @@ -1817,7 +1817,18 @@ int force_sig_perf(void __user *addr, u32 type, u64 sig_data) info.si_perf_data = sig_data; info.si_perf_type = type; - return force_sig_info(&info); + /* + * Signals generated by perf events should not terminate the whole + * process if SIGTRAP is blocked, however, delivering the signal + * asynchronously is better than not delivering at all. But tell user + * space if the signal was asynchronous, so it can clearly be + * distinguished from normal synchronous ones. + */ + info.si_perf_flags = sigismember(&current->blocked, info.si_signo) ? 
+ TRAP_PERF_FLAG_ASYNC : + 0; + + return send_sig_info(info.si_signo, &info, current); } /** @@ -3432,6 +3443,7 @@ void copy_siginfo_to_external32(struct compat_siginfo *to, to->si_addr = ptr_to_compat(from->si_addr); to->si_perf_data = from->si_perf_data; to->si_perf_type = from->si_perf_type; + to->si_perf_flags = from->si_perf_flags; break; case SIL_CHLD: to->si_pid = from->si_pid; @@ -3509,6 +3521,7 @@ static int post_copy_siginfo_from_user32(kernel_siginfo_t *to, to->si_addr = compat_ptr(from->si_addr); to->si_perf_data = from->si_perf_data; to->si_perf_type = from->si_perf_type; + to->si_perf_flags = from->si_perf_flags; break; case SIL_CHLD: to->si_pid = from->si_pid; @@ -4722,6 +4735,7 @@ static inline void siginfo_buildtime_checks(void) CHECK_OFFSET(si_pkey); CHECK_OFFSET(si_perf_data); CHECK_OFFSET(si_perf_type); + CHECK_OFFSET(si_perf_flags); /* sigpoll */ CHECK_OFFSET(si_band); From d6d0c7f681fda1d07e005c8f653e578b77a0eb40 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 21 Apr 2022 11:16:53 +0530 Subject: [PATCH 12/32] x86/cpufeatures: Add PerfMonV2 feature bit CPUID leaf 0x80000022 i.e. ExtPerfMonAndDbg advertises some new performance monitoring features for AMD processors. Bit 0 of EAX indicates support for Performance Monitoring Version 2 (PerfMonV2) features. If found to be set during PMU initialization, the EBX bits of the same CPUID function can be used to determine the number of available PMCs for different PMU types. Additionally, Core PMCs can be managed using new global control and status registers. For better utilization of feature words, PerfMonV2 is added as a scattered feature bit. 
Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/c70e497e22f18e7f05b025bb64ca21cc12b17792.1650515382.git.sandipan.das@amd.com --- arch/x86/include/asm/cpufeatures.h | 2 +- arch/x86/kernel/cpu/scattered.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 0d62afd525e3..b50e0872ad1e 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -211,7 +211,7 @@ #define X86_FEATURE_SSBD ( 7*32+17) /* Speculative Store Bypass Disable */ #define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */ #define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* "" Fill RSB on context switches */ -/* FREE! ( 7*32+20) */ +#define X86_FEATURE_PERFMON_V2 ( 7*32+20) /* AMD Performance Monitoring Version 2 */ #define X86_FEATURE_USE_IBPB ( 7*32+21) /* "" Indirect Branch Prediction Barrier enabled */ #define X86_FEATURE_USE_IBRS_FW ( 7*32+22) /* "" Use IBRS during runtime firmware calls */ #define X86_FEATURE_SPEC_STORE_BYPASS_DISABLE ( 7*32+23) /* "" Disable Speculative Store Bypass. */ diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 4143b1e4c5c6..dbaa8326d6f2 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -43,6 +43,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, { X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 }, + { X86_FEATURE_PERFMON_V2, CPUID_EAX, 0, 0x80000022, 0 }, { 0, 0, 0, 0, 0 } }; From 089be16d5992dd0bc6df15ef12042fd1023ded9a Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 21 Apr 2022 11:16:54 +0530 Subject: [PATCH 13/32] x86/msr: Add PerfCntrGlobal* registers Add MSR definitions that will be used to enable the new AMD Performance Monitoring Version 2 (PerfMonV2) features. 
These include: * Performance Counter Global Control (PerfCntrGlobalCtl) * Performance Counter Global Status (PerfCntrGlobalStatus) * Performance Counter Global Status Clear (PerfCntrGlobalStatusClr) The new Performance Counter Global Control and Status MSRs provide an interface for enabling or disabling multiple counters at the same time and for testing overflow without probing the individual registers for each PMC. The availability of these registers is indicated through the PerfMonV2 feature bit of CPUID leaf 0x80000022 EAX. Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/cdc0d8f75bd519848731b5c64d924f5a0619a573.1650515382.git.sandipan.das@amd.com --- arch/x86/include/asm/msr-index.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 8179ea351bd8..58a44dceef9a 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -524,6 +524,11 @@ #define AMD_CPPC_DES_PERF(x) (((x) & 0xff) << 16) #define AMD_CPPC_ENERGY_PERF_PREF(x) (((x) & 0xff) << 24) +/* AMD Performance Counter Global Status and Control MSRs */ +#define MSR_AMD64_PERF_CNTR_GLOBAL_STATUS 0xc0000300 +#define MSR_AMD64_PERF_CNTR_GLOBAL_CTL 0xc0000301 +#define MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR 0xc0000302 + /* Fam 17h MSRs */ #define MSR_F17H_IRPERF 0xc00000e9 From 21d59e3e2c403c83ba196a5857d517054124168e Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 21 Apr 2022 11:16:55 +0530 Subject: [PATCH 14/32] perf/x86/amd/core: Detect PerfMonV2 support AMD Performance Monitoring Version 2 (PerfMonV2) introduces some new Core PMU features such as detection of the number of available PMCs and managing PMCs using global registers namely, PerfCntrGlobalCtl and PerfCntrGlobalStatus. Clearing PerfCntrGlobalCtl and PerfCntrGlobalStatus ensures that all PMCs are inactive and have no pending overflows when CPUs are onlined or offlined. 
The PMU version (x86_pmu.version) now indicates PerfMonV2 support and will be used to bypass the new features on unsupported processors. Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/dc8672ecbddff394e088ca8abf94b089b8ecc2e7.1650515382.git.sandipan.das@amd.com --- arch/x86/events/amd/core.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 8e1e818f8195..b70dfa028ba5 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -19,6 +19,9 @@ static unsigned long perf_nmi_window; #define AMD_MERGE_EVENT ((0xFULL << 32) | 0xFFULL) #define AMD_MERGE_EVENT_ENABLE (AMD_MERGE_EVENT | ARCH_PERFMON_EVENTSEL_ENABLE) +/* PMC Enable and Overflow bits for PerfCntrGlobal* registers */ +static u64 amd_pmu_global_cntr_mask __read_mostly; + static __initconst const u64 amd_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] @@ -578,6 +581,18 @@ static struct amd_nb *amd_alloc_nb(int cpu) return nb; } +static void amd_pmu_cpu_reset(int cpu) +{ + if (x86_pmu.version < 2) + return; + + /* Clear enable bits i.e. PerfCntrGlobalCtl.PerfCntrEn */ + wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, 0); + + /* Clear overflow bits i.e. 
PerfCntrGlobalStatus.PerfCntrOvfl */ + wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, amd_pmu_global_cntr_mask); +} + static int amd_pmu_cpu_prepare(int cpu) { struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); @@ -625,6 +640,7 @@ static void amd_pmu_cpu_starting(int cpu) cpuc->amd_nb->refcnt++; amd_brs_reset(); + amd_pmu_cpu_reset(cpu); } static void amd_pmu_cpu_dead(int cpu) @@ -644,6 +660,8 @@ static void amd_pmu_cpu_dead(int cpu) cpuhw->amd_nb = NULL; } + + amd_pmu_cpu_reset(cpu); } /* @@ -1185,6 +1203,15 @@ static int __init amd_core_pmu_init(void) x86_pmu.eventsel = MSR_F15H_PERF_CTL; x86_pmu.perfctr = MSR_F15H_PERF_CTR; x86_pmu.num_counters = AMD64_NUM_COUNTERS_CORE; + + /* Check for Performance Monitoring v2 support */ + if (boot_cpu_has(X86_FEATURE_PERFMON_V2)) { + /* Update PMU version for later usage */ + x86_pmu.version = 2; + + amd_pmu_global_cntr_mask = (1ULL << x86_pmu.num_counters) - 1; + } + /* * AMD Core perfctr has separate MSRs for the NB events, see * the amd/uncore.c driver. From 56e026a7ca3f92b8e44359e1f705febd1833f701 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 21 Apr 2022 11:16:56 +0530 Subject: [PATCH 15/32] perf/x86/amd/core: Detect available counters If AMD Performance Monitoring Version 2 (PerfMonV2) is supported, use CPUID leaf 0x80000022 EBX to detect the number of Core PMCs. This offers more flexibility if the counts change in later processor families.
Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/68a6d9688df189267db26530378870edd34f7b06.1650515382.git.sandipan.das@amd.com --- arch/x86/events/amd/core.c | 6 ++++++ arch/x86/include/asm/perf_event.h | 17 +++++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index b70dfa028ba5..52fd7941a724 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -1186,6 +1186,7 @@ static const struct attribute_group *amd_attr_update[] = { static int __init amd_core_pmu_init(void) { + union cpuid_0x80000022_ebx ebx; u64 even_ctr_mask = 0ULL; int i; @@ -1206,9 +1207,14 @@ static int __init amd_core_pmu_init(void) /* Check for Performance Monitoring v2 support */ if (boot_cpu_has(X86_FEATURE_PERFMON_V2)) { + ebx.full = cpuid_ebx(EXT_PERFMON_DEBUG_FEATURES); + /* Update PMU version for later usage */ x86_pmu.version = 2; + /* Find the number of available Core PMCs */ + x86_pmu.num_counters = ebx.split.num_core_pmc; + amd_pmu_global_cntr_mask = (1ULL << x86_pmu.num_counters) - 1; } diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 8199fc5a37ea..c6cc07f46556 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -186,6 +186,18 @@ union cpuid28_ecx { unsigned int full; }; +/* + * AMD "Extended Performance Monitoring and Debug" CPUID + * detection/enumeration details: + */ +union cpuid_0x80000022_ebx { + struct { + /* Number of Core Performance Counters */ + unsigned int num_core_pmc:4; + } split; + unsigned int full; +}; + struct x86_pmu_capability { int version; int num_counters_gp; @@ -367,6 +379,11 @@ struct pebs_xmm { u64 xmm[16*2]; /* two entries for each register */ }; +/* + * AMD Extended Performance Monitoring and Debug cpuid feature detection + */ +#define EXT_PERFMON_DEBUG_FEATURES 0x80000022 + /* * IBS cpuid feature detection */ From 
9622e67e3980c01872490de0925e5c6c23247c94 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 21 Apr 2022 11:16:57 +0530 Subject: [PATCH 16/32] perf/x86/amd/core: Add PerfMonV2 counter control If AMD Performance Monitoring Version 2 (PerfMonV2) is supported, use a new scheme to manage the Core PMCs using the new global control and status registers. This will be bypassed on unsupported hardware (x86_pmu.version < 2). Currently, all PMCs have dedicated control (PERF_CTL) and counter (PERF_CTR) registers. For a given PMC, the enable (En) bit of its PERF_CTL register is used to start or stop counting. The Performance Counter Global Control (PerfCntrGlobalCtl) register has enable (PerfCntrEn) bits for each PMC. For a PMC to start counting, both PERF_CTL and PerfCntrGlobalCtl enable bits must be set. If either of those are cleared, the PMC stops counting. In x86_pmu_{en,dis}able_all(), the PERF_CTL registers of all active PMCs are written to in a loop. Ideally, PMCs counting the same event that were started and stopped at the same time should record the same counts. Due to delays in between writes to the PERF_CTL registers across loop iterations, the PMCs cannot be enabled or disabled at the same instant and hence, record slightly different counts. This is fixed by enabling or disabling all active PMCs at the same time with a single write to the PerfCntrGlobalCtl register. 
Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/dfe8e934074aaabc6ba748dfaccd0a77c974bb82.1650515382.git.sandipan.das@amd.com --- arch/x86/events/amd/core.c | 50 ++++++++++++++++++++++++++++++++++---- 1 file changed, 45 insertions(+), 5 deletions(-) diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 52fd7941a724..a339c3e0be33 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -664,6 +664,11 @@ static void amd_pmu_cpu_dead(int cpu) amd_pmu_cpu_reset(cpu); } +static inline void amd_pmu_set_global_ctl(u64 ctl) +{ + wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, ctl); +} + /* * When a PMC counter overflows, an NMI is used to process the event and * reset the counter. NMI latency can result in the counter being updated @@ -693,15 +698,11 @@ static void amd_pmu_wait_on_overflow(int idx) } } -static void amd_pmu_disable_all(void) +static void amd_pmu_check_overflow(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int idx; - amd_brs_disable_all(); - - x86_pmu_disable_all(); - /* * This shouldn't be called from NMI context, but add a safeguard here * to return, since if we're in NMI context we can't wait for an NMI @@ -748,6 +749,26 @@ static void amd_pmu_enable_all(int added) } } +static void amd_pmu_v2_enable_event(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + /* + * Testing cpu_hw_events.enabled should be skipped in this case unlike + * in x86_pmu_enable_event(). + * + * Since cpu_hw_events.enabled is set only after returning from + * x86_pmu_start(), the PMCs must be programmed and kept ready. + * Counting starts only after x86_pmu_enable_all() is called. 
+ */ + __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); +} + +static void amd_pmu_v2_enable_all(int added) +{ + amd_pmu_set_global_ctl(amd_pmu_global_cntr_mask); +} + static void amd_pmu_disable_event(struct perf_event *event) { x86_pmu_disable_event(event); @@ -765,6 +786,20 @@ static void amd_pmu_disable_event(struct perf_event *event) amd_pmu_wait_on_overflow(event->hw.idx); } +static void amd_pmu_disable_all(void) +{ + amd_brs_disable_all(); + x86_pmu_disable_all(); + amd_pmu_check_overflow(); +} + +static void amd_pmu_v2_disable_all(void) +{ + /* Disable all PMCs */ + amd_pmu_set_global_ctl(0); + amd_pmu_check_overflow(); +} + static void amd_pmu_add_event(struct perf_event *event) { if (needs_branch_stack(event)) @@ -1216,6 +1251,11 @@ static int __init amd_core_pmu_init(void) x86_pmu.num_counters = ebx.split.num_core_pmc; amd_pmu_global_cntr_mask = (1ULL << x86_pmu.num_counters) - 1; + + /* Update PMC handling functions */ + x86_pmu.enable_all = amd_pmu_v2_enable_all; + x86_pmu.disable_all = amd_pmu_v2_disable_all; + x86_pmu.enable = amd_pmu_v2_enable_event; } /* From 7685665c390dc68c2d9a74e8445f41494cc8f6cf Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 21 Apr 2022 11:16:58 +0530 Subject: [PATCH 17/32] perf/x86/amd/core: Add PerfMonV2 overflow handling If AMD Performance Monitoring Version 2 (PerfMonV2) is supported, use a new scheme to process Core PMC overflows in the NMI handler using the new global control and status registers. This will be bypassed on unsupported hardware (x86_pmu.version < 2). In x86_pmu_handle_irq(), overflows are detected by testing the contents of the PERF_CTR register for each active PMC in a loop. The new scheme instead inspects the overflow bits of the global status register. The Performance Counter Global Status (PerfCntrGlobalStatus) register has overflow (PerfCntrOvfl) bits for each PMC. This is, however, a read-only MSR. 
To acknowledge that overflows have been processed, the NMI handler must clear the bits by writing to the PerfCntrGlobalStatusClr register. In x86_pmu_handle_irq(), PMCs counting the same event that are started and stopped at the same time record slightly different counts due to delays in between reads from the PERF_CTR registers. This is fixed by stopping and starting the PMCs at the same time with a single write to the Performance Counter Global Control (PerfCntrGlobalCtl) upon entering and before exiting the NMI handler. Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/f20b7e4da0b0a83bdbe05857f354146623bc63ab.1650515382.git.sandipan.das@amd.com --- arch/x86/events/amd/core.c | 148 +++++++++++++++++++++++++++++++++---- 1 file changed, 135 insertions(+), 13 deletions(-) diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index a339c3e0be33..262e39a85031 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include "../perf_event.h" @@ -669,6 +670,45 @@ static inline void amd_pmu_set_global_ctl(u64 ctl) wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, ctl); } +static inline u64 amd_pmu_get_global_status(void) +{ + u64 status; + + /* PerfCntrGlobalStatus is read-only */ + rdmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, status); + + return status & amd_pmu_global_cntr_mask; +} + +static inline void amd_pmu_ack_global_status(u64 status) +{ + /* + * PerfCntrGlobalStatus is read-only but an overflow acknowledgment + * mechanism exists; writing 1 to a bit in PerfCntrGlobalStatusClr + * clears the same bit in PerfCntrGlobalStatus + */ + + /* Only allow modifications to PerfCntrGlobalStatus.PerfCntrOvfl */ + status &= amd_pmu_global_cntr_mask; + wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, status); +} + +static bool amd_pmu_test_overflow_topbit(int idx) +{ + u64 counter; + + rdmsrl(x86_pmu_event_addr(idx), counter); + + return !(counter
& BIT_ULL(x86_pmu.cntval_bits - 1)); +} + +static bool amd_pmu_test_overflow_status(int idx) +{ + return amd_pmu_get_global_status() & BIT_ULL(idx); +} + +DEFINE_STATIC_CALL(amd_pmu_test_overflow, amd_pmu_test_overflow_topbit); + /* * When a PMC counter overflows, an NMI is used to process the event and * reset the counter. NMI latency can result in the counter being updated @@ -681,7 +721,6 @@ static inline void amd_pmu_set_global_ctl(u64 ctl) static void amd_pmu_wait_on_overflow(int idx) { unsigned int i; - u64 counter; /* * Wait for the counter to be reset if it has overflowed. This loop @@ -689,8 +728,7 @@ static void amd_pmu_wait_on_overflow(int idx) * forever... */ for (i = 0; i < OVERFLOW_WAIT_COUNT; i++) { - rdmsrl(x86_pmu_event_addr(idx), counter); - if (counter & (1ULL << (x86_pmu.cntval_bits - 1))) + if (!static_call(amd_pmu_test_overflow)(idx)) break; /* Might be in IRQ context, so can't sleep */ @@ -830,6 +868,24 @@ static void amd_pmu_del_event(struct perf_event *event) * handled a counter. When an un-handled NMI is received, it will be claimed * only if arriving within that window. */ +static inline int amd_pmu_adjust_nmi_window(int handled) +{ + /* + * If a counter was handled, record a timestamp such that un-handled + * NMIs will be claimed if arriving within that window. + */ + if (handled) { + this_cpu_write(perf_nmi_tstamp, jiffies + perf_nmi_window); + + return handled; + } + + if (time_after(jiffies, this_cpu_read(perf_nmi_tstamp))) + return NMI_DONE; + + return NMI_HANDLED; +} + static int amd_pmu_handle_irq(struct pt_regs *regs) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -857,20 +913,84 @@ static int amd_pmu_handle_irq(struct pt_regs *regs) if (pmu_enabled) amd_pmu_enable_all(0); - /* - * If a counter was handled, record a timestamp such that un-handled - * NMIs will be claimed if arriving within that window. 
- */ - if (handled) { - this_cpu_write(perf_nmi_tstamp, jiffies + perf_nmi_window); + return amd_pmu_adjust_nmi_window(handled); +} - return handled; +static int amd_pmu_v2_handle_irq(struct pt_regs *regs) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct perf_sample_data data; + struct hw_perf_event *hwc; + struct perf_event *event; + int handled = 0, idx; + u64 status, mask; + bool pmu_enabled; + + /* + * Save the PMU state as it needs to be restored when leaving the + * handler + */ + pmu_enabled = cpuc->enabled; + cpuc->enabled = 0; + + /* Stop counting */ + amd_pmu_v2_disable_all(); + + status = amd_pmu_get_global_status(); + + /* Check if any overflows are pending */ + if (!status) + goto done; + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + if (!test_bit(idx, cpuc->active_mask)) + continue; + + event = cpuc->events[idx]; + hwc = &event->hw; + x86_perf_event_update(event); + mask = BIT_ULL(idx); + + if (!(status & mask)) + continue; + + /* Event overflow */ + handled++; + perf_sample_data_init(&data, 0, hwc->last_period); + + if (!x86_perf_event_set_period(event)) + continue; + + if (perf_event_overflow(event, &data, regs)) + x86_pmu_stop(event, 0); + + status &= ~mask; } - if (time_after(jiffies, this_cpu_read(perf_nmi_tstamp))) - return NMI_DONE; + /* + * It should never be the case that some overflows are not handled as + * the corresponding PMCs are expected to be inactive according to the + * active_mask + */ + WARN_ON(status > 0); - return NMI_HANDLED; + /* Clear overflow bits */ + amd_pmu_ack_global_status(~status); + + /* + * Unmasking the LVTPC is not required as the Mask (M) bit of the LVT + * PMI entry is not set by the local APIC when a PMC overflow occurs + */ + inc_irq_stat(apic_perf_irqs); + +done: + cpuc->enabled = pmu_enabled; + + /* Resume counting only if PMU is active */ + if (pmu_enabled) + amd_pmu_v2_enable_all(0); + + return amd_pmu_adjust_nmi_window(handled); } static struct event_constraint * @@ -1256,6 
+1376,8 @@ static int __init amd_core_pmu_init(void) x86_pmu.enable_all = amd_pmu_v2_enable_all; x86_pmu.disable_all = amd_pmu_v2_disable_all; x86_pmu.enable = amd_pmu_v2_enable_event; + x86_pmu.handle_irq = amd_pmu_v2_handle_irq; + static_call_update(amd_pmu_test_overflow, amd_pmu_test_overflow_status); } /* From 3d47083b9ff46863e8374ad3bb5edb5e464c75f8 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Fri, 29 Apr 2022 10:44:41 +0530 Subject: [PATCH 18/32] perf/amd/ibs: Use interrupt regs ip for stack unwinding IbsOpRip is recorded when IBS interrupt is triggered. But there is a skid from the time IBS interrupt gets triggered to the time the interrupt is presented to the core. Meanwhile processor would have moved ahead and thus IbsOpRip will be inconsistent with rsp and rbp recorded as part of the interrupt regs. This causes issues while unwinding stack using the ORC unwinder as it needs consistent rip, rsp and rbp. Fix this by using rip from interrupt regs instead of IbsOpRip for stack unwinding. Fixes: ee9f8fce99640 ("x86/unwind: Add the ORC unwinder") Reported-by: Dmitry Monakhov Suggested-by: Peter Zijlstra Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220429051441.14251-1-ravi.bangoria@amd.com --- arch/x86/events/amd/ibs.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 9739019d4b67..11e8b493e015 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -304,6 +304,16 @@ static int perf_ibs_init(struct perf_event *event) hwc->config_base = perf_ibs->msr; hwc->config = config; + /* + * rip recorded by IbsOpRip will not be consistent with rsp and rbp + * recorded as part of interrupt regs. Thus we need to use rip from + * interrupt regs while unwinding call stack. Setting _EARLY flag + * makes sure we unwind call-stack before perf sample rip is set to + * IbsOpRip. 
+ */ + if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) + event->attr.sample_type |= __PERF_SAMPLE_CALLCHAIN_EARLY; + return 0; } @@ -687,6 +697,14 @@ fail: data.raw = &raw; } + /* + * rip recorded by IbsOpRip will not be consistent with rsp and rbp + * recorded as part of interrupt regs. Thus we need to use rip from + * interrupt regs while unwinding call stack. + */ + if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) + data.callchain = perf_callchain(event, iregs); + throttle = perf_event_overflow(event, &data, &regs); out: if (throttle) { From c2a960f7c5741cc4f03b4e587afeb89ad53c32c5 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 4 May 2022 12:44:09 -0700 Subject: [PATCH 19/32] perf/x86: Add new Alder Lake and Raptor Lake support From PMU's perspective, there is no difference for the new Alder Lake N and Raptor Lake P. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220504194413.1003071-1-kan.liang@linux.intel.com --- arch/x86/events/intel/core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index fc7f458eb3de..955ae91c56dc 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -6216,7 +6216,9 @@ __init int intel_pmu_init(void) case INTEL_FAM6_ALDERLAKE: case INTEL_FAM6_ALDERLAKE_L: + case INTEL_FAM6_ALDERLAKE_N: case INTEL_FAM6_RAPTORLAKE: + case INTEL_FAM6_RAPTORLAKE_P: /* * Alder Lake has 2 types of CPU, core and atom. * From d773a73366bd54d0c75c533269fe2f0765ce42ee Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 4 May 2022 12:44:10 -0700 Subject: [PATCH 20/32] perf/x86/msr: Add new Alder Lake and Raptor Lake support The new Alder Lake N and Raptor Lake P also support PPERF and SMI_COUNT MSRs.
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220504194413.1003071-2-kan.liang@linux.intel.com --- arch/x86/events/msr.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c index 6d759f88315c..ac542f98c070 100644 --- a/arch/x86/events/msr.c +++ b/arch/x86/events/msr.c @@ -103,7 +103,9 @@ static bool test_intel(int idx, void *data) case INTEL_FAM6_ROCKETLAKE: case INTEL_FAM6_ALDERLAKE: case INTEL_FAM6_ALDERLAKE_L: + case INTEL_FAM6_ALDERLAKE_N: case INTEL_FAM6_RAPTORLAKE: + case INTEL_FAM6_RAPTORLAKE_P: if (idx == PERF_MSR_SMI || idx == PERF_MSR_PPERF) return true; break; From cd971104ac7e41ff66082b9b584d319bb0688a1a Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 4 May 2022 12:44:11 -0700 Subject: [PATCH 21/32] perf/x86/cstate: Add new Alder Lake and Raptor Lake support From the perspective of Intel cstate residency counters, there is nothing changed for the new Alder Lake N and Raptor Lake P. 
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220504194413.1003071-3-kan.liang@linux.intel.com --- arch/x86/events/intel/cstate.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index 48e5db21142c..8ec23f47fee9 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -682,7 +682,9 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(ROCKETLAKE, &icl_cstates), X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, &adl_cstates), X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &adl_cstates), + X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_N, &adl_cstates), X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, &adl_cstates), + X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, &adl_cstates), { }, }; MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match); From e5ae168e8394dc3c6dce580690c87ff2cf16cdbb Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 4 May 2022 12:44:12 -0700 Subject: [PATCH 22/32] perf/x86/uncore: Clean up uncore_pci_ids[] The initialization code to assign PCI IDs for different platforms is similar. Add the new macros to reduce the redundant code. 
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220504194413.1003071-4-kan.liang@linux.intel.com --- arch/x86/events/intel/uncore_snb.c | 402 ++++++----------------------- 1 file changed, 86 insertions(+), 316 deletions(-) diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index 4262351f52b6..b30890b91137 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -84,6 +84,13 @@ #define PCI_DEVICE_ID_INTEL_RPL_3_IMC 0xA706 #define PCI_DEVICE_ID_INTEL_RPL_4_IMC 0xA709 + +#define IMC_UNCORE_DEV(a) \ +{ \ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_##a##_IMC), \ + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), \ +} + /* SNB event control */ #define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff #define SNB_UNC_CTL_UMASK_MASK 0x0000ff00 @@ -849,242 +856,80 @@ static struct intel_uncore_type *snb_pci_uncores[] = { }; static const struct pci_device_id snb_uncore_pci_ids[] = { - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SNB_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, + IMC_UNCORE_DEV(SNB), { /* end: all zeroes */ }, }; static const struct pci_device_id ivb_uncore_pci_ids[] = { - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_E3_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, + IMC_UNCORE_DEV(IVB), + IMC_UNCORE_DEV(IVB_E3), { /* end: all zeroes */ }, }; static const struct pci_device_id hsw_uncore_pci_ids[] = { - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_U_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, + 
IMC_UNCORE_DEV(HSW), + IMC_UNCORE_DEV(HSW_U), { /* end: all zeroes */ }, }; static const struct pci_device_id bdw_uncore_pci_ids[] = { - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_BDW_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, + IMC_UNCORE_DEV(BDW), { /* end: all zeroes */ }, }; static const struct pci_device_id skl_uncore_pci_ids[] = { - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_Y_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_U_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_HD_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_HQ_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_SD_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_SQ_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_E3_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_Y_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_U_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_UQ_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_SD_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), 
- }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_SQ_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_HQ_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_WQ_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_2U_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_4U_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_4H_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_6H_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_2S_D_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_4S_D_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_6S_D_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_8S_D_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_4S_W_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_6S_W_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 
PCI_DEVICE_ID_INTEL_CFL_8S_W_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_4S_S_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_6S_S_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_8S_S_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_AML_YD_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_AML_YQ_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_WHL_UQ_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_WHL_4_UQ_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_WHL_UD_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_H1_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_H2_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_H3_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_U1_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_U2_IMC), - .driver_data = 
UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_U3_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_S1_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_S2_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_S3_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_S4_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_S5_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, + IMC_UNCORE_DEV(SKL_Y), + IMC_UNCORE_DEV(SKL_U), + IMC_UNCORE_DEV(SKL_HD), + IMC_UNCORE_DEV(SKL_HQ), + IMC_UNCORE_DEV(SKL_SD), + IMC_UNCORE_DEV(SKL_SQ), + IMC_UNCORE_DEV(SKL_E3), + IMC_UNCORE_DEV(KBL_Y), + IMC_UNCORE_DEV(KBL_U), + IMC_UNCORE_DEV(KBL_UQ), + IMC_UNCORE_DEV(KBL_SD), + IMC_UNCORE_DEV(KBL_SQ), + IMC_UNCORE_DEV(KBL_HQ), + IMC_UNCORE_DEV(KBL_WQ), + IMC_UNCORE_DEV(CFL_2U), + IMC_UNCORE_DEV(CFL_4U), + IMC_UNCORE_DEV(CFL_4H), + IMC_UNCORE_DEV(CFL_6H), + IMC_UNCORE_DEV(CFL_2S_D), + IMC_UNCORE_DEV(CFL_4S_D), + IMC_UNCORE_DEV(CFL_6S_D), + IMC_UNCORE_DEV(CFL_8S_D), + IMC_UNCORE_DEV(CFL_4S_W), + IMC_UNCORE_DEV(CFL_6S_W), + IMC_UNCORE_DEV(CFL_8S_W), + IMC_UNCORE_DEV(CFL_4S_S), + IMC_UNCORE_DEV(CFL_6S_S), + IMC_UNCORE_DEV(CFL_8S_S), + IMC_UNCORE_DEV(AML_YD), + IMC_UNCORE_DEV(AML_YQ), + IMC_UNCORE_DEV(WHL_UQ), + IMC_UNCORE_DEV(WHL_4_UQ), + IMC_UNCORE_DEV(WHL_UD), + IMC_UNCORE_DEV(CML_H1), + IMC_UNCORE_DEV(CML_H2), + IMC_UNCORE_DEV(CML_H3), + IMC_UNCORE_DEV(CML_U1), + IMC_UNCORE_DEV(CML_U2), + IMC_UNCORE_DEV(CML_U3), + IMC_UNCORE_DEV(CML_S1), + 
IMC_UNCORE_DEV(CML_S2), + IMC_UNCORE_DEV(CML_S3), + IMC_UNCORE_DEV(CML_S4), + IMC_UNCORE_DEV(CML_S5), { /* end: all zeroes */ }, }; static const struct pci_device_id icl_uncore_pci_ids[] = { - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICL_U_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICL_U2_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_RKL_1_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_RKL_2_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, + IMC_UNCORE_DEV(ICL_U), + IMC_UNCORE_DEV(ICL_U2), + IMC_UNCORE_DEV(RKL_1), + IMC_UNCORE_DEV(RKL_2), { /* end: all zeroes */ }, }; @@ -1326,106 +1171,31 @@ void nhm_uncore_cpu_init(void) /* Tiger Lake MMIO uncore support */ static const struct pci_device_id tgl_uncore_pci_ids[] = { - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_TGL_U1_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_TGL_U2_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_TGL_U3_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_TGL_U4_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_TGL_H_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_1_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 
PCI_DEVICE_ID_INTEL_ADL_2_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_3_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_4_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_5_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_6_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_7_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_8_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_9_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_10_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_11_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_12_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_13_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_14_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_15_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - 
}, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_16_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_RPL_1_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_RPL_2_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_RPL_3_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_RPL_4_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, + IMC_UNCORE_DEV(TGL_U1), + IMC_UNCORE_DEV(TGL_U2), + IMC_UNCORE_DEV(TGL_U3), + IMC_UNCORE_DEV(TGL_U4), + IMC_UNCORE_DEV(TGL_H), + IMC_UNCORE_DEV(ADL_1), + IMC_UNCORE_DEV(ADL_2), + IMC_UNCORE_DEV(ADL_3), + IMC_UNCORE_DEV(ADL_4), + IMC_UNCORE_DEV(ADL_5), + IMC_UNCORE_DEV(ADL_6), + IMC_UNCORE_DEV(ADL_7), + IMC_UNCORE_DEV(ADL_8), + IMC_UNCORE_DEV(ADL_9), + IMC_UNCORE_DEV(ADL_10), + IMC_UNCORE_DEV(ADL_11), + IMC_UNCORE_DEV(ADL_12), + IMC_UNCORE_DEV(ADL_13), + IMC_UNCORE_DEV(ADL_14), + IMC_UNCORE_DEV(ADL_15), + IMC_UNCORE_DEV(ADL_16), + IMC_UNCORE_DEV(RPL_1), + IMC_UNCORE_DEV(RPL_2), + IMC_UNCORE_DEV(RPL_3), + IMC_UNCORE_DEV(RPL_4), { /* end: all zeroes */ } }; From f758bc5a91233bb5b5b6994a8e72ba4eba0e9ab2 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 4 May 2022 12:44:13 -0700 Subject: [PATCH 23/32] perf/x86/uncore: Add new Alder Lake and Raptor Lake support From the perspective of the uncore PMU, there is nothing changed for the new Alder Lake N and Raptor Lake P. Add new PCIIDs of IMC. 
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220504194413.1003071-5-kan.liang@linux.intel.com --- arch/x86/events/intel/uncore.c | 2 ++ arch/x86/events/intel/uncore_snb.c | 52 ++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 7695dcae280e..db6c31bca809 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1828,7 +1828,9 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(ROCKETLAKE, &rkl_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, &adl_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &adl_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_N, &adl_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, &adl_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, &adl_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &spr_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, &snr_uncore_init), {}, diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index b30890b91137..ce440011cc4e 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -79,10 +79,36 @@ #define PCI_DEVICE_ID_INTEL_ADL_14_IMC 0x4650 #define PCI_DEVICE_ID_INTEL_ADL_15_IMC 0x4668 #define PCI_DEVICE_ID_INTEL_ADL_16_IMC 0x4670 +#define PCI_DEVICE_ID_INTEL_ADL_17_IMC 0x4614 +#define PCI_DEVICE_ID_INTEL_ADL_18_IMC 0x4617 +#define PCI_DEVICE_ID_INTEL_ADL_19_IMC 0x4618 +#define PCI_DEVICE_ID_INTEL_ADL_20_IMC 0x461B +#define PCI_DEVICE_ID_INTEL_ADL_21_IMC 0x461C #define PCI_DEVICE_ID_INTEL_RPL_1_IMC 0xA700 #define PCI_DEVICE_ID_INTEL_RPL_2_IMC 0xA702 #define PCI_DEVICE_ID_INTEL_RPL_3_IMC 0xA706 #define PCI_DEVICE_ID_INTEL_RPL_4_IMC 0xA709 +#define PCI_DEVICE_ID_INTEL_RPL_5_IMC 0xA701 +#define PCI_DEVICE_ID_INTEL_RPL_6_IMC 0xA703 +#define PCI_DEVICE_ID_INTEL_RPL_7_IMC 0xA704 +#define 
PCI_DEVICE_ID_INTEL_RPL_8_IMC 0xA705 +#define PCI_DEVICE_ID_INTEL_RPL_9_IMC 0xA706 +#define PCI_DEVICE_ID_INTEL_RPL_10_IMC 0xA707 +#define PCI_DEVICE_ID_INTEL_RPL_11_IMC 0xA708 +#define PCI_DEVICE_ID_INTEL_RPL_12_IMC 0xA709 +#define PCI_DEVICE_ID_INTEL_RPL_13_IMC 0xA70a +#define PCI_DEVICE_ID_INTEL_RPL_14_IMC 0xA70b +#define PCI_DEVICE_ID_INTEL_RPL_15_IMC 0xA715 +#define PCI_DEVICE_ID_INTEL_RPL_16_IMC 0xA716 +#define PCI_DEVICE_ID_INTEL_RPL_17_IMC 0xA717 +#define PCI_DEVICE_ID_INTEL_RPL_18_IMC 0xA718 +#define PCI_DEVICE_ID_INTEL_RPL_19_IMC 0xA719 +#define PCI_DEVICE_ID_INTEL_RPL_20_IMC 0xA71A +#define PCI_DEVICE_ID_INTEL_RPL_21_IMC 0xA71B +#define PCI_DEVICE_ID_INTEL_RPL_22_IMC 0xA71C +#define PCI_DEVICE_ID_INTEL_RPL_23_IMC 0xA728 +#define PCI_DEVICE_ID_INTEL_RPL_24_IMC 0xA729 +#define PCI_DEVICE_ID_INTEL_RPL_25_IMC 0xA72A #define IMC_UNCORE_DEV(a) \ @@ -1192,10 +1218,36 @@ static const struct pci_device_id tgl_uncore_pci_ids[] = { IMC_UNCORE_DEV(ADL_14), IMC_UNCORE_DEV(ADL_15), IMC_UNCORE_DEV(ADL_16), + IMC_UNCORE_DEV(ADL_17), + IMC_UNCORE_DEV(ADL_18), + IMC_UNCORE_DEV(ADL_19), + IMC_UNCORE_DEV(ADL_20), + IMC_UNCORE_DEV(ADL_21), IMC_UNCORE_DEV(RPL_1), IMC_UNCORE_DEV(RPL_2), IMC_UNCORE_DEV(RPL_3), IMC_UNCORE_DEV(RPL_4), + IMC_UNCORE_DEV(RPL_5), + IMC_UNCORE_DEV(RPL_6), + IMC_UNCORE_DEV(RPL_7), + IMC_UNCORE_DEV(RPL_8), + IMC_UNCORE_DEV(RPL_9), + IMC_UNCORE_DEV(RPL_10), + IMC_UNCORE_DEV(RPL_11), + IMC_UNCORE_DEV(RPL_12), + IMC_UNCORE_DEV(RPL_13), + IMC_UNCORE_DEV(RPL_14), + IMC_UNCORE_DEV(RPL_15), + IMC_UNCORE_DEV(RPL_16), + IMC_UNCORE_DEV(RPL_17), + IMC_UNCORE_DEV(RPL_18), + IMC_UNCORE_DEV(RPL_19), + IMC_UNCORE_DEV(RPL_20), + IMC_UNCORE_DEV(RPL_21), + IMC_UNCORE_DEV(RPL_22), + IMC_UNCORE_DEV(RPL_23), + IMC_UNCORE_DEV(RPL_24), + IMC_UNCORE_DEV(RPL_25), { /* end: all zeroes */ } }; From 39b2ca75eec8a33e2ffdb8aa0c4840ec3e3b472c Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Mon, 9 May 2022 10:19:07 +0530 Subject: [PATCH 24/32] perf/amd/ibs: Cascade pmu init 
functions' return value IBS pmu initialization code ignores return value provided by callee functions. Fix it. Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220509044914.1473-2-ravi.bangoria@amd.com --- arch/x86/events/amd/ibs.c | 37 +++++++++++++++++++++++++++++-------- 1 file changed, 29 insertions(+), 8 deletions(-) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 11e8b493e015..2704ec1e42a3 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -777,9 +777,10 @@ static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name) return ret; } -static __init void perf_event_ibs_init(void) +static __init int perf_event_ibs_init(void) { struct attribute **attr = ibs_op_format_attrs; + int ret; /* * Some chips fail to reset the fetch count when it is written; instead @@ -791,7 +792,9 @@ static __init void perf_event_ibs_init(void) if (boot_cpu_data.x86 == 0x19 && boot_cpu_data.x86_model < 0x10) perf_ibs_fetch.fetch_ignore_if_zero_rip = 1; - perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch"); + ret = perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch"); + if (ret) + return ret; if (ibs_caps & IBS_CAPS_OPCNT) { perf_ibs_op.config_mask |= IBS_OP_CNT_CTL; @@ -804,15 +807,35 @@ static __init void perf_event_ibs_init(void) perf_ibs_op.cnt_mask |= IBS_OP_MAX_CNT_EXT_MASK; } - perf_ibs_pmu_init(&perf_ibs_op, "ibs_op"); + ret = perf_ibs_pmu_init(&perf_ibs_op, "ibs_op"); + if (ret) + goto err_op; + + ret = register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs"); + if (ret) + goto err_nmi; - register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs"); pr_info("perf: AMD IBS detected (0x%08x)\n", ibs_caps); + return 0; + +err_nmi: + perf_pmu_unregister(&perf_ibs_op.pmu); + free_percpu(perf_ibs_op.pcpu); + perf_ibs_op.pcpu = NULL; +err_op: + perf_pmu_unregister(&perf_ibs_fetch.pmu); + free_percpu(perf_ibs_fetch.pcpu); + perf_ibs_fetch.pcpu = NULL; + + return 
ret; } #else /* defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) */ -static __init void perf_event_ibs_init(void) { } +static __init int perf_event_ibs_init(void) +{ + return 0; +} #endif @@ -1082,9 +1105,7 @@ static __init int amd_ibs_init(void) x86_pmu_amd_ibs_starting_cpu, x86_pmu_amd_ibs_dying_cpu); - perf_event_ibs_init(); - - return 0; + return perf_event_ibs_init(); } /* Since we need the pci subsystem to init ibs we can't do this earlier: */ From 2a7a7e658682bfd7501dc6b4c9d365aa6c79788a Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Mon, 9 May 2022 10:19:08 +0530 Subject: [PATCH 25/32] perf/amd/ibs: Use ->is_visible callback for dynamic attributes Currently, some attributes are added at build time whereas others are added at boot time depending on IBS pmu capabilities. Instead, we can just add all attribute groups at build time but hide individual groups at boot time using the more appropriate ->is_visible() callback. Also, struct perf_ibs has a bunch of fields for pmu attributes which just pass on the pointer and do not do anything else. Remove them.
Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220509044914.1473-3-ravi.bangoria@amd.com --- arch/x86/events/amd/ibs.c | 78 +++++++++++++++++++++++++++------------ 1 file changed, 54 insertions(+), 24 deletions(-) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 2704ec1e42a3..ece4f6a7d24b 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -94,10 +94,6 @@ struct perf_ibs { unsigned int fetch_ignore_if_zero_rip : 1; struct cpu_perf_ibs __percpu *pcpu; - struct attribute **format_attrs; - struct attribute_group format_group; - const struct attribute_group *attr_groups[2]; - u64 (*get_count)(u64 config); }; @@ -528,16 +524,61 @@ static void perf_ibs_del(struct perf_event *event, int flags) static void perf_ibs_read(struct perf_event *event) { } +/* + * We need to initialize with empty group if all attributes in the + * group are dynamic. + */ +static struct attribute *attrs_empty[] = { + NULL, +}; + +static struct attribute_group empty_format_group = { + .name = "format", + .attrs = attrs_empty, +}; + +static const struct attribute_group *empty_attr_groups[] = { + &empty_format_group, + NULL, +}; + PMU_FORMAT_ATTR(rand_en, "config:57"); PMU_FORMAT_ATTR(cnt_ctl, "config:19"); -static struct attribute *ibs_fetch_format_attrs[] = { +static struct attribute *rand_en_attrs[] = { &format_attr_rand_en.attr, NULL, }; -static struct attribute *ibs_op_format_attrs[] = { - NULL, /* &format_attr_cnt_ctl.attr if IBS_CAPS_OPCNT */ +static struct attribute_group group_rand_en = { + .name = "format", + .attrs = rand_en_attrs, +}; + +static const struct attribute_group *fetch_attr_groups[] = { + &group_rand_en, + NULL, +}; + +static umode_t +cnt_ctl_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + return ibs_caps & IBS_CAPS_OPCNT ? 
attr->mode : 0; +} + +static struct attribute *cnt_ctl_attrs[] = { + &format_attr_cnt_ctl.attr, + NULL, +}; + +static struct attribute_group group_cnt_ctl = { + .name = "format", + .attrs = cnt_ctl_attrs, + .is_visible = cnt_ctl_is_visible, +}; + +static const struct attribute_group *op_attr_update[] = { + &group_cnt_ctl, NULL, }; @@ -561,7 +602,6 @@ static struct perf_ibs perf_ibs_fetch = { .max_period = IBS_FETCH_MAX_CNT << 4, .offset_mask = { MSR_AMD64_IBSFETCH_REG_MASK }, .offset_max = MSR_AMD64_IBSFETCH_REG_COUNT, - .format_attrs = ibs_fetch_format_attrs, .get_count = get_ibs_fetch_count, }; @@ -587,7 +627,6 @@ static struct perf_ibs perf_ibs_op = { .max_period = IBS_OP_MAX_CNT << 4, .offset_mask = { MSR_AMD64_IBSOP_REG_MASK }, .offset_max = MSR_AMD64_IBSOP_REG_COUNT, - .format_attrs = ibs_op_format_attrs, .get_count = get_ibs_op_count, }; @@ -757,17 +796,6 @@ static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name) perf_ibs->pcpu = pcpu; - /* register attributes */ - if (perf_ibs->format_attrs[0]) { - memset(&perf_ibs->format_group, 0, sizeof(perf_ibs->format_group)); - perf_ibs->format_group.name = "format"; - perf_ibs->format_group.attrs = perf_ibs->format_attrs; - - memset(&perf_ibs->attr_groups, 0, sizeof(perf_ibs->attr_groups)); - perf_ibs->attr_groups[0] = &perf_ibs->format_group; - perf_ibs->pmu.attr_groups = perf_ibs->attr_groups; - } - ret = perf_pmu_register(&perf_ibs->pmu, name, -1); if (ret) { perf_ibs->pcpu = NULL; @@ -779,7 +807,6 @@ static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name) static __init int perf_event_ibs_init(void) { - struct attribute **attr = ibs_op_format_attrs; int ret; /* @@ -792,14 +819,14 @@ static __init int perf_event_ibs_init(void) if (boot_cpu_data.x86 == 0x19 && boot_cpu_data.x86_model < 0x10) perf_ibs_fetch.fetch_ignore_if_zero_rip = 1; + perf_ibs_fetch.pmu.attr_groups = fetch_attr_groups; + ret = perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch"); if (ret) return ret; - if 
(ibs_caps & IBS_CAPS_OPCNT) { + if (ibs_caps & IBS_CAPS_OPCNT) perf_ibs_op.config_mask |= IBS_OP_CNT_CTL; - *attr++ = &format_attr_cnt_ctl.attr; - } if (ibs_caps & IBS_CAPS_OPCNTEXT) { perf_ibs_op.max_period |= IBS_OP_MAX_CNT_EXT_MASK; @@ -807,6 +834,9 @@ static __init int perf_event_ibs_init(void) perf_ibs_op.cnt_mask |= IBS_OP_MAX_CNT_EXT_MASK; } + perf_ibs_op.pmu.attr_groups = empty_attr_groups; + perf_ibs_op.pmu.attr_update = op_attr_update; + ret = perf_ibs_pmu_init(&perf_ibs_op, "ibs_op"); if (ret) goto err_op; From ba5d35b442c65f32d38ef61f732218274c6dcf4c Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Mon, 9 May 2022 10:19:09 +0530 Subject: [PATCH 26/32] perf/amd/ibs: Add support for L3 miss filtering IBS L3 miss filtering works by tagging an instruction on IBS counter overflow and generating an NMI if the tagged instruction causes an L3 miss. Samples without an L3 miss are discarded and the counter is reset with a random value (between 1-15 for fetch pmu and 1-127 for op pmu). This helps in reducing sampling overhead when the user is interested only in such samples. One of the use cases for such filtered samples is to feed data to a page-migration daemon in tiered memory systems. Add support for L3 miss filtering in the IBS driver via a new pmu attribute "l3missonly".
Example usage: # perf record -a -e ibs_op/l3missonly=1/ --raw-samples sleep 5 Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220509044914.1473-4-ravi.bangoria@amd.com --- arch/x86/events/amd/ibs.c | 67 +++++++++++++++++++++++++++---- arch/x86/include/asm/perf_event.h | 3 ++ 2 files changed, 63 insertions(+), 7 deletions(-) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index ece4f6a7d24b..2dc8b7ec030a 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -544,22 +544,46 @@ static const struct attribute_group *empty_attr_groups[] = { PMU_FORMAT_ATTR(rand_en, "config:57"); PMU_FORMAT_ATTR(cnt_ctl, "config:19"); +PMU_EVENT_ATTR_STRING(l3missonly, fetch_l3missonly, "config:59"); +PMU_EVENT_ATTR_STRING(l3missonly, op_l3missonly, "config:16"); + +static umode_t +zen4_ibs_extensions_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + return ibs_caps & IBS_CAPS_ZEN4 ? attr->mode : 0; +} static struct attribute *rand_en_attrs[] = { &format_attr_rand_en.attr, NULL, }; +static struct attribute *fetch_l3missonly_attrs[] = { + &fetch_l3missonly.attr.attr, + NULL, +}; + static struct attribute_group group_rand_en = { .name = "format", .attrs = rand_en_attrs, }; +static struct attribute_group group_fetch_l3missonly = { + .name = "format", + .attrs = fetch_l3missonly_attrs, + .is_visible = zen4_ibs_extensions_is_visible, +}; + static const struct attribute_group *fetch_attr_groups[] = { &group_rand_en, NULL, }; +static const struct attribute_group *fetch_attr_update[] = { + &group_fetch_l3missonly, + NULL, +}; + static umode_t cnt_ctl_is_visible(struct kobject *kobj, struct attribute *attr, int i) { @@ -571,14 +595,26 @@ static struct attribute *cnt_ctl_attrs[] = { NULL, }; +static struct attribute *op_l3missonly_attrs[] = { + &op_l3missonly.attr.attr, + NULL, +}; + static struct attribute_group group_cnt_ctl = { .name = "format", .attrs = cnt_ctl_attrs, .is_visible = 
cnt_ctl_is_visible, }; +static struct attribute_group group_op_l3missonly = { + .name = "format", + .attrs = op_l3missonly_attrs, + .is_visible = zen4_ibs_extensions_is_visible, +}; + static const struct attribute_group *op_attr_update[] = { &group_cnt_ctl, + &group_op_l3missonly, NULL, }; @@ -805,10 +841,8 @@ static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name) return ret; } -static __init int perf_event_ibs_init(void) +static __init int perf_ibs_fetch_init(void) { - int ret; - /* * Some chips fail to reset the fetch count when it is written; instead * they need a 0-1 transition of IbsFetchEn. @@ -819,12 +853,17 @@ static __init int perf_event_ibs_init(void) if (boot_cpu_data.x86 == 0x19 && boot_cpu_data.x86_model < 0x10) perf_ibs_fetch.fetch_ignore_if_zero_rip = 1; + if (ibs_caps & IBS_CAPS_ZEN4) + perf_ibs_fetch.config_mask |= IBS_FETCH_L3MISSONLY; + perf_ibs_fetch.pmu.attr_groups = fetch_attr_groups; + perf_ibs_fetch.pmu.attr_update = fetch_attr_update; - ret = perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch"); - if (ret) - return ret; + return perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch"); +} +static __init int perf_ibs_op_init(void) +{ if (ibs_caps & IBS_CAPS_OPCNT) perf_ibs_op.config_mask |= IBS_OP_CNT_CTL; @@ -834,10 +873,24 @@ static __init int perf_event_ibs_init(void) perf_ibs_op.cnt_mask |= IBS_OP_MAX_CNT_EXT_MASK; } + if (ibs_caps & IBS_CAPS_ZEN4) + perf_ibs_op.config_mask |= IBS_OP_L3MISSONLY; + perf_ibs_op.pmu.attr_groups = empty_attr_groups; perf_ibs_op.pmu.attr_update = op_attr_update; - ret = perf_ibs_pmu_init(&perf_ibs_op, "ibs_op"); + return perf_ibs_pmu_init(&perf_ibs_op, "ibs_op"); +} + +static __init int perf_event_ibs_init(void) +{ + int ret; + + ret = perf_ibs_fetch_init(); + if (ret) + return ret; + + ret = perf_ibs_op_init(); if (ret) goto err_op; diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 7aa1d420c779..409725e86f42 100644 --- a/arch/x86/include/asm/perf_event.h +++ 
b/arch/x86/include/asm/perf_event.h @@ -410,6 +410,7 @@ struct pebs_xmm { #define IBS_CAPS_OPBRNFUSE (1U<<8) #define IBS_CAPS_FETCHCTLEXTD (1U<<9) #define IBS_CAPS_OPDATA4 (1U<<10) +#define IBS_CAPS_ZEN4 (1U<<11) #define IBS_CAPS_DEFAULT (IBS_CAPS_AVAIL \ | IBS_CAPS_FETCHSAM \ @@ -423,6 +424,7 @@ struct pebs_xmm { #define IBSCTL_LVT_OFFSET_MASK 0x0F /* IBS fetch bits/masks */ +#define IBS_FETCH_L3MISSONLY (1ULL<<59) #define IBS_FETCH_RAND_EN (1ULL<<57) #define IBS_FETCH_VAL (1ULL<<49) #define IBS_FETCH_ENABLE (1ULL<<48) @@ -439,6 +441,7 @@ struct pebs_xmm { #define IBS_OP_CNT_CTL (1ULL<<19) #define IBS_OP_VAL (1ULL<<18) #define IBS_OP_ENABLE (1ULL<<17) +#define IBS_OP_L3MISSONLY (1ULL<<16) #define IBS_OP_MAX_CNT 0x0000FFFFULL #define IBS_OP_MAX_CNT_EXT 0x007FFFFFULL /* not a register bit mask */ #define IBS_OP_MAX_CNT_EXT_MASK (0x7FULL<<20) /* separate upper 7 bits */ From 838de1d843fc9b6161e0e1c6308a8c027d08606d Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Mon, 9 May 2022 10:19:10 +0530 Subject: [PATCH 27/32] perf/amd/ibs: Advertise zen4_ibs_extensions as pmu capability attribute PMU driver can advertise certain feature via capability attribute('caps' sysfs directory) which can be consumed by userspace tools like perf. Add zen4_ibs_extensions capability attribute for IBS pmus. This attribute will be enabled when CPUID_Fn8000001B_EAX[11] is set. 
With patch on Zen4: $ ls /sys/bus/event_source/devices/ibs_op/caps zen4_ibs_extensions Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220509044914.1473-5-ravi.bangoria@amd.com --- arch/x86/events/amd/ibs.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 2dc8b7ec030a..c251bc44c088 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -537,8 +537,14 @@ static struct attribute_group empty_format_group = { .attrs = attrs_empty, }; +static struct attribute_group empty_caps_group = { + .name = "caps", + .attrs = attrs_empty, +}; + static const struct attribute_group *empty_attr_groups[] = { &empty_format_group, + &empty_caps_group, NULL, }; @@ -546,6 +552,7 @@ PMU_FORMAT_ATTR(rand_en, "config:57"); PMU_FORMAT_ATTR(cnt_ctl, "config:19"); PMU_EVENT_ATTR_STRING(l3missonly, fetch_l3missonly, "config:59"); PMU_EVENT_ATTR_STRING(l3missonly, op_l3missonly, "config:16"); +PMU_EVENT_ATTR_STRING(zen4_ibs_extensions, zen4_ibs_extensions, "1"); static umode_t zen4_ibs_extensions_is_visible(struct kobject *kobj, struct attribute *attr, int i) @@ -563,6 +570,11 @@ static struct attribute *fetch_l3missonly_attrs[] = { NULL, }; +static struct attribute *zen4_ibs_extensions_attrs[] = { + &zen4_ibs_extensions.attr.attr, + NULL, +}; + static struct attribute_group group_rand_en = { .name = "format", .attrs = rand_en_attrs, @@ -574,13 +586,21 @@ static struct attribute_group group_fetch_l3missonly = { .is_visible = zen4_ibs_extensions_is_visible, }; +static struct attribute_group group_zen4_ibs_extensions = { + .name = "caps", + .attrs = zen4_ibs_extensions_attrs, + .is_visible = zen4_ibs_extensions_is_visible, +}; + static const struct attribute_group *fetch_attr_groups[] = { &group_rand_en, + &empty_caps_group, NULL, }; static const struct attribute_group *fetch_attr_update[] = { &group_fetch_l3missonly, + 
&group_zen4_ibs_extensions, NULL, }; @@ -615,6 +635,7 @@ static struct attribute_group group_op_l3missonly = { static const struct attribute_group *op_attr_update[] = { &group_cnt_ctl, &group_op_l3missonly, + &group_zen4_ibs_extensions, NULL, }; From 9cb23f598c641c1dcbe18defd219cdc439bc94a8 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Mon, 9 May 2022 10:19:14 +0530 Subject: [PATCH 28/32] perf/ibs: Fix comment s/IBS Op Data 2/IBS Op Data 1/ for MSR 0xc0011035. Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220509044914.1473-9-ravi.bangoria@amd.com --- arch/x86/include/asm/amd-ibs.h | 2 +- tools/arch/x86/include/asm/amd-ibs.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/amd-ibs.h b/arch/x86/include/asm/amd-ibs.h index 46e1df45efc0..aabdbb5ab920 100644 --- a/arch/x86/include/asm/amd-ibs.h +++ b/arch/x86/include/asm/amd-ibs.h @@ -49,7 +49,7 @@ union ibs_op_ctl { }; }; -/* MSR 0xc0011035: IBS Op Data 2 */ +/* MSR 0xc0011035: IBS Op Data 1 */ union ibs_op_data { __u64 val; struct { diff --git a/tools/arch/x86/include/asm/amd-ibs.h b/tools/arch/x86/include/asm/amd-ibs.h index 174e7d83fcbd..765e9e752d03 100644 --- a/tools/arch/x86/include/asm/amd-ibs.h +++ b/tools/arch/x86/include/asm/amd-ibs.h @@ -49,7 +49,7 @@ union ibs_op_ctl { }; }; -/* MSR 0xc0011035: IBS Op Data 2 */ +/* MSR 0xc0011035: IBS Op Data 1 */ union ibs_op_data { __u64 val; struct { From bc469ddf67154a4840267132e87ce0d8b72d4952 Mon Sep 17 00:00:00 2001 From: Zucheng Zheng Date: Thu, 21 Apr 2022 19:10:31 +0800 Subject: [PATCH 29/32] perf/x86/amd: Remove unused variable 'hwc' 'hwc' is never used in amd_pmu_enable_all(), so remove it. 
Signed-off-by: Zucheng Zheng Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220421111031.174698-1-zhengzucheng@huawei.com --- arch/x86/events/amd/core.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 262e39a85031..d81eac2284ea 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -771,14 +771,11 @@ static void amd_pmu_enable_event(struct perf_event *event) static void amd_pmu_enable_all(int added) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct hw_perf_event *hwc; int idx; amd_brs_enable_all(); for (idx = 0; idx < x86_pmu.num_counters; idx++) { - hwc = &cpuc->events[idx]->hw; - /* only activate events which are marked as active */ if (!test_bit(idx, cpuc->active_mask)) continue; From 3c27b0c6ea48bc61492a138c410e262735d660ab Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 10 May 2022 21:22:04 +0200 Subject: [PATCH 30/32] perf/x86/amd: Fix AMD BRS period adjustment There are two problems with the current amd_brs_adjust_period() code: - it isn't in fact AMD specific and will always adjust the period; - it adjusts the period, while it should only adjust the event count, resulting in reporting a short period. Fix this by using x86_pmu.limit_period; this makes it specific to the AMD BRS case and ensures only the event count is adjusted while the reported period is unmodified.
Fixes: ba2fe7500845 ("perf/x86/amd: Add AMD branch sampling period adjustment") Signed-off-by: Peter Zijlstra (Intel) --- arch/x86/events/amd/core.c | 13 +++++++++++++ arch/x86/events/core.c | 7 ------- arch/x86/events/perf_event.h | 18 ------------------ 3 files changed, 13 insertions(+), 25 deletions(-) diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index d81eac2284ea..3eee59c64daa 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -1255,6 +1255,18 @@ static void amd_pmu_sched_task(struct perf_event_context *ctx, amd_pmu_brs_sched_task(ctx, sched_in); } +static u64 amd_pmu_limit_period(struct perf_event *event, u64 left) +{ + /* + * Decrease period by the depth of the BRS feature to get the last N + * taken branches and approximate the desired period + */ + if (has_branch_stack(event) && left > x86_pmu.lbr_nr) + left -= x86_pmu.lbr_nr; + + return left; +} + static __initconst const struct x86_pmu amd_pmu = { .name = "AMD", .handle_irq = amd_pmu_handle_irq, @@ -1415,6 +1427,7 @@ static int __init amd_core_pmu_init(void) if (boot_cpu_data.x86 >= 0x19 && !amd_brs_init()) { x86_pmu.get_event_constraints = amd_get_event_constraints_f19h; x86_pmu.sched_task = amd_pmu_sched_task; + x86_pmu.limit_period = amd_pmu_limit_period; /* * put_event_constraints callback same as Fam17h, set above */ diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index b08052b05db6..30788894124f 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1374,13 +1374,6 @@ int x86_perf_event_set_period(struct perf_event *event) x86_pmu.set_topdown_event_period) return x86_pmu.set_topdown_event_period(event); - /* - * decrease period by the depth of the BRS feature to get - * the last N taken branches and approximate the desired period - */ - if (has_branch_stack(event)) - period = amd_brs_adjust_period(period); - /* * If we are way outside a reasonable range then just skip forward: */ diff --git a/arch/x86/events/perf_event.h 
b/arch/x86/events/perf_event.h index 3b0324584da3..21a5482bcf84 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -1254,14 +1254,6 @@ static inline void amd_pmu_brs_del(struct perf_event *event) } void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in); - -static inline s64 amd_brs_adjust_period(s64 period) -{ - if (period > x86_pmu.lbr_nr) - return period - x86_pmu.lbr_nr; - - return period; -} #else static inline int amd_brs_init(void) { @@ -1290,11 +1282,6 @@ static inline void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool s { } -static inline s64 amd_brs_adjust_period(s64 period) -{ - return period; -} - static inline void amd_brs_enable_all(void) { } @@ -1324,11 +1311,6 @@ static inline void amd_brs_enable_all(void) static inline void amd_brs_disable_all(void) { } - -static inline s64 amd_brs_adjust_period(s64 period) -{ - return period; -} #endif /* CONFIG_CPU_SUP_AMD */ static inline int is_pebs_pt(struct perf_event *event) From 841b51e4a3590866d17fa2663c64688c25b891b1 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 16 May 2022 17:48:38 +0200 Subject: [PATCH 31/32] perf/x86/amd: Run AMD BRS code only on supported hw This fires on a Fam16h machine here: unchecked MSR access error: WRMSR to 0xc000010f (tried to write 0x0000000000000018) \ at rIP: 0xffffffff81007db1 (amd_brs_reset+0x11/0x50) Call Trace: amd_pmu_cpu_starting ? x86_pmu_dead_cpu x86_pmu_starting_cpu cpuhp_invoke_callback ? x86_pmu_starting_cpu ? x86_pmu_dead_cpu cpuhp_issue_call ? x86_pmu_starting_cpu __cpuhp_setup_state_cpuslocked ? x86_pmu_dead_cpu ? x86_pmu_starting_cpu __cpuhp_setup_state ? map_vsyscall init_hw_perf_events ? map_vsyscall do_one_initcall ? _raw_spin_unlock_irqrestore ? try_to_wake_up kernel_init_freeable ? rest_init kernel_init ret_from_fork because that CPU hotplug callback gets executed on any AMD CPU - not only on the BRS-enabled ones. Check the BRS feature bit properly. 
Signed-off-by: Borislav Petkov Signed-off-by: Peter Zijlstra (Intel) Acked-by: Stephane Eranian Link: https://lkml.kernel.org/r/20220516154838.7044-1-bp@alien8.de --- arch/x86/events/amd/brs.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/amd/brs.c b/arch/x86/events/amd/brs.c index 895c82165d85..bee8765a1e9b 100644 --- a/arch/x86/events/amd/brs.c +++ b/arch/x86/events/amd/brs.c @@ -57,7 +57,7 @@ static inline u64 get_debug_extn_cfg(void) static bool __init amd_brs_detect(void) { - if (!boot_cpu_has(X86_FEATURE_BRS)) + if (!cpu_feature_enabled(X86_FEATURE_BRS)) return false; switch (boot_cpu_data.x86) { @@ -112,6 +112,9 @@ static inline int amd_brs_get_tos(union amd_debug_extn_cfg *cfg) */ void amd_brs_reset(void) { + if (!cpu_feature_enabled(X86_FEATURE_BRS)) + return; + /* * Reset config */ From bae19fdd7e9e759580ac4693d2df3bc23ab415d7 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Wed, 18 May 2022 14:13:27 +0530 Subject: [PATCH 32/32] perf/x86/amd/core: Fix reloading events for SVM Commit 1018faa6cf23 ("perf/x86/kvm: Fix Host-Only/Guest-Only counting with SVM disabled") addresses an issue in which the Host-Only bit in the counter control registers needs to be masked off when SVM is not enabled. The events need to be reloaded whenever SVM is enabled or disabled for a CPU and this requires the PERF_CTL registers to be reprogrammed using {enable,disable}_all(). However, PerfMonV2 variants of these functions do not reprogram the PERF_CTL registers. Hence, the legacy enable_all() function should also be called.
Fixes: 9622e67e3980 ("perf/x86/amd/core: Add PerfMonV2 counter control") Reported-by: Like Xu Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220518084327.464005-1-sandipan.das@amd.com --- arch/x86/events/amd/core.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 3eee59c64daa..9ac3718410ce 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -1472,6 +1472,24 @@ __init int amd_pmu_init(void) return 0; } +static inline void amd_pmu_reload_virt(void) +{ + if (x86_pmu.version >= 2) { + /* + * Clear global enable bits, reprogram the PERF_CTL + * registers with updated perf_ctr_virt_mask and then + * set global enable bits once again + */ + amd_pmu_v2_disable_all(); + amd_pmu_enable_all(0); + amd_pmu_v2_enable_all(0); + return; + } + + amd_pmu_disable_all(); + amd_pmu_enable_all(0); +} + void amd_pmu_enable_virt(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -1479,8 +1497,7 @@ void amd_pmu_enable_virt(void) cpuc->perf_ctr_virt_mask = 0; /* Reload all events */ - amd_pmu_disable_all(); - x86_pmu_enable_all(0); + amd_pmu_reload_virt(); } EXPORT_SYMBOL_GPL(amd_pmu_enable_virt); @@ -1497,7 +1514,6 @@ void amd_pmu_disable_virt(void) cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY; /* Reload all events */ - amd_pmu_disable_all(); - x86_pmu_enable_all(0); + amd_pmu_reload_virt(); } EXPORT_SYMBOL_GPL(amd_pmu_disable_virt);