From a9f9772114c8b07ae75bcb3654bd017461248095 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 25 Sep 2018 17:58:35 +0200 Subject: [PATCH 1/5] perf/core: Fix perf_pmu_unregister() locking When we unregister a PMU, we fail to serialize the @pmu_idr properly. Fix that by doing the entire thing under pmu_lock. Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Fixes: 2e80a82a49c4 ("perf: Dynamic pmu types") Signed-off-by: Ingo Molnar --- kernel/events/core.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index dcb093e7b377..dfb1d951789e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9431,9 +9431,7 @@ static void free_pmu_context(struct pmu *pmu) if (pmu->task_ctx_nr > perf_invalid_context) return; - mutex_lock(&pmus_lock); free_percpu(pmu->pmu_cpu_context); - mutex_unlock(&pmus_lock); } /* @@ -9689,12 +9687,8 @@ EXPORT_SYMBOL_GPL(perf_pmu_register); void perf_pmu_unregister(struct pmu *pmu) { - int remove_device; - mutex_lock(&pmus_lock); - remove_device = pmu_bus_running; list_del_rcu(&pmu->entry); - mutex_unlock(&pmus_lock); /* * We dereference the pmu list under both SRCU and regular RCU, so @@ -9706,13 +9700,14 @@ void perf_pmu_unregister(struct pmu *pmu) free_percpu(pmu->pmu_disable_count); if (pmu->type >= PERF_TYPE_MAX) idr_remove(&pmu_idr, pmu->type); - if (remove_device) { + if (pmu_bus_running) { if (pmu->nr_addr_filters) device_remove_file(pmu->dev, &dev_attr_nr_addr_filters); device_del(pmu->dev); put_device(pmu->dev); } free_pmu_context(pmu); + mutex_unlock(&pmus_lock); } EXPORT_SYMBOL_GPL(perf_pmu_unregister); From 6265adb9726098b7f4f7ca70bc51992b25fdd9d6 Mon Sep 17 00:00:00 2001 From: Masayoshi Mizuma Date: Mon, 10 Sep 2018 10:47:50 -0400 Subject: [PATCH 2/5] perf/x86/intel/uncore: Use boot_cpu_data.phys_proc_id instead of hardcorded physical package ID 0 Physical package id 0 doesn't always exist, we should use boot_cpu_data.phys_proc_id here. Signed-off-by: Masayoshi Mizuma Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: H. Peter Anvin Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masayoshi Mizuma Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/20180910144750.6782-1-msys.mizuma@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/uncore_snbep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 51d7c117e3c7..53b981dcdb42 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -3061,7 +3061,7 @@ static struct event_constraint bdx_uncore_pcu_constraints[] = { void bdx_uncore_cpu_init(void) { - int pkg = topology_phys_to_logical_pkg(0); + int pkg = topology_phys_to_logical_pkg(boot_cpu_data.phys_proc_id); if (bdx_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) bdx_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; From cd6fb677ce7e460c25bdd66f689734102ec7d642 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 23 Sep 2018 18:13:43 +0200 Subject: [PATCH 3/5] perf/ring_buffer: Prevent concurent ring buffer access Some of the scheduling tracepoints allow the perf_tp_event code to write to ring buffer under different cpu than the code is running on. This results in corrupted ring buffer data demonstrated in following perf commands: # perf record -e 'sched:sched_switch,sched:sched_wakeup' perf bench sched messaging # Running 'sched/messaging' benchmark: # 20 sender and receiver processes per group # 10 groups == 400 processes run Total time: 0.383 [sec] [ perf record: Woken up 8 times to write data ] 0x42b890 [0]: failed to process type: -1765585640 [ perf record: Captured and wrote 4.825 MB perf.data (29669 samples) ] # perf report --stdio 0x42b890 [0]: failed to process type: -1765585640 The reason for the corruption are some of the scheduling tracepoints, that have __perf_task dfined and thus allow to store data to another cpu ring buffer: sched_waking sched_wakeup sched_wakeup_new sched_stat_wait sched_stat_sleep sched_stat_iowait sched_stat_blocked The perf_tp_event function first store samples for current cpu related events defined for tracepoint: hlist_for_each_entry_rcu(event, head, hlist_entry) perf_swevent_event(event, count, &data, regs); And then iterates events of the 'task' and store the sample for any task's event that passes tracepoint checks: ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]); list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { if (event->attr.type != PERF_TYPE_TRACEPOINT) continue; if (event->attr.config != entry->type) continue; perf_swevent_event(event, count, &data, regs); } Above code can race with same code running on another cpu, ending up with 2 cpus trying to store under the same ring buffer, which is specifically not allowed. This patch prevents the problem, by allowing only events with the same current cpu to receive the event. NOTE: this requires the use of (per-task-)per-cpu buffers for this feature to work; perf-record does this. Signed-off-by: Jiri Olsa [peterz: small edits to Changelog] Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Andrew Vagin Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Fixes: e6dab5ffab59 ("perf/trace: Add ability to set a target task for events") Link: http://lkml.kernel.org/r/20180923161343.GB15054@krava Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/events/core.c b/kernel/events/core.c index dfb1d951789e..5a97f34bc14c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8314,6 +8314,8 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, goto unlock; list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + if (event->cpu != smp_processor_id()) + continue; if (event->attr.type != PERF_TYPE_TRACEPOINT) continue; if (event->attr.config != entry->type) From 9d92cfeaf5215158d26d2991be7f7ff865cb98f3 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 21 Sep 2018 07:07:06 -0700 Subject: [PATCH 4/5] perf/x86/intel/uncore: Fix PCI BDF address of M3UPI on SKX The counters on M3UPI Link 0 and Link 3 don't count properly, and writing 0 to these counters may causes system crash on some machines. The PCI BDF addresses of the M3UPI in the current code are incorrect. The correct addresses should be: D18:F1 0x204D D18:F2 0x204E D18:F5 0x204D Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Fixes: cd34cd97b7b4 ("perf/x86/intel/uncore: Add Skylake server uncore support") Link: http://lkml.kernel.org/r/1537538826-55489-1-git-send-email-kan.liang@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/uncore_snbep.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 53b981dcdb42..c07bee31abe8 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -3931,16 +3931,16 @@ static const struct pci_device_id skx_uncore_pci_ids[] = { .driver_data = UNCORE_PCI_DEV_FULL_DATA(21, 5, SKX_PCI_UNCORE_M2PCIE, 3), }, { /* M3UPI0 Link 0 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204C), - .driver_data = UNCORE_PCI_DEV_FULL_DATA(18, 0, SKX_PCI_UNCORE_M3UPI, 0), + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204D), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(18, 1, SKX_PCI_UNCORE_M3UPI, 0), }, { /* M3UPI0 Link 1 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204D), - .driver_data = UNCORE_PCI_DEV_FULL_DATA(18, 1, SKX_PCI_UNCORE_M3UPI, 1), + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204E), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(18, 2, SKX_PCI_UNCORE_M3UPI, 1), }, { /* M3UPI1 Link 2 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204C), - .driver_data = UNCORE_PCI_DEV_FULL_DATA(18, 4, SKX_PCI_UNCORE_M3UPI, 2), + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204D), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(18, 5, SKX_PCI_UNCORE_M3UPI, 2), }, { /* end: all zeroes */ } }; From d7cbbe49a9304520181fb8c9272d1327deec8453 Mon Sep 17 00:00:00 2001 From: "Natarajan, Janakarajan" Date: Thu, 27 Sep 2018 15:51:55 +0000 Subject: [PATCH 5/5] perf/x86/amd/uncore: Set ThreadMask and SliceMask for L3 Cache perf events In Family 17h, some L3 Cache Performance events require the ThreadMask and SliceMask to be set. For other events, these fields do not affect the count either way. Set ThreadMask and SliceMask to 0xFF and 0xF respectively. Signed-off-by: Janakarajan Natarajan Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: H . Peter Anvin Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Suravee Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/Message-ID: Signed-off-by: Ingo Molnar --- arch/x86/events/amd/uncore.c | 10 ++++++++++ arch/x86/include/asm/perf_event.h | 8 ++++++++ 2 files changed, 18 insertions(+) diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c index 981ba5e8241b..8671de126eac 100644 --- a/arch/x86/events/amd/uncore.c +++ b/arch/x86/events/amd/uncore.c @@ -36,6 +36,7 @@ static int num_counters_llc; static int num_counters_nb; +static bool l3_mask; static HLIST_HEAD(uncore_unused_list); @@ -209,6 +210,13 @@ static int amd_uncore_event_init(struct perf_event *event) hwc->config = event->attr.config & AMD64_RAW_EVENT_MASK_NB; hwc->idx = -1; + /* + * SliceMask and ThreadMask need to be set for certain L3 events in + * Family 17h. For other events, the two fields do not affect the count. + */ + if (l3_mask) + hwc->config |= (AMD64_L3_SLICE_MASK | AMD64_L3_THREAD_MASK); + if (event->cpu < 0) return -EINVAL; @@ -525,6 +533,7 @@ static int __init amd_uncore_init(void) amd_llc_pmu.name = "amd_l3"; format_attr_event_df.show = &event_show_df; format_attr_event_l3.show = &event_show_l3; + l3_mask = true; } else { num_counters_nb = NUM_COUNTERS_NB; num_counters_llc = NUM_COUNTERS_L2; @@ -532,6 +541,7 @@ static int __init amd_uncore_init(void) amd_llc_pmu.name = "amd_l2"; format_attr_event_df = format_attr_event; format_attr_event_l3 = format_attr_event; + l3_mask = false; } amd_nb_pmu.attr_groups = amd_uncore_attr_groups_df; diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 12f54082f4c8..78241b736f2a 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -46,6 +46,14 @@ #define INTEL_ARCH_EVENT_MASK \ (ARCH_PERFMON_EVENTSEL_UMASK | ARCH_PERFMON_EVENTSEL_EVENT) +#define AMD64_L3_SLICE_SHIFT 48 +#define AMD64_L3_SLICE_MASK \ + ((0xFULL) << AMD64_L3_SLICE_SHIFT) + +#define AMD64_L3_THREAD_SHIFT 56 +#define AMD64_L3_THREAD_MASK \ + ((0xFFULL) << AMD64_L3_THREAD_SHIFT) + #define X86_RAW_EVENT_MASK \ (ARCH_PERFMON_EVENTSEL_EVENT | \ ARCH_PERFMON_EVENTSEL_UMASK | \