diff --git a/tools/perf/Documentation/Makefile b/tools/perf/Documentation/Makefile index adc5a7e44b98..31824d5269cc 100644 --- a/tools/perf/Documentation/Makefile +++ b/tools/perf/Documentation/Makefile @@ -295,7 +295,10 @@ $(OUTPUT)%.1 $(OUTPUT)%.5 $(OUTPUT)%.7 : $(OUTPUT)%.xml $(OUTPUT)%.xml : %.txt $(QUIET_ASCIIDOC)$(RM) $@+ $@ && \ $(ASCIIDOC) -b docbook -d manpage \ - $(ASCIIDOC_EXTRA) -aperf_version=$(PERF_VERSION) -o $@+ $< && \ + $(ASCIIDOC_EXTRA) -aperf_version=$(PERF_VERSION) \ + -aperf_date=$(shell git log -1 --pretty="format:%cd" \ + --date=short $<) \ + -o $@+ $< && \ mv $@+ $@ XSLT = docbook.xsl diff --git a/tools/perf/Documentation/intel-pt.txt b/tools/perf/Documentation/intel-pt.txt index 2cf2d9e9d0da..fd9241a1b987 100644 --- a/tools/perf/Documentation/intel-pt.txt +++ b/tools/perf/Documentation/intel-pt.txt @@ -1,991 +1 @@ -Intel Processor Trace -===================== - -Overview -======== - -Intel Processor Trace (Intel PT) is an extension of Intel Architecture that -collects information about software execution such as control flow, execution -modes and timings and formats it into highly compressed binary packets. -Technical details are documented in the Intel 64 and IA-32 Architectures -Software Developer Manuals, Chapter 36 Intel Processor Trace. - -Intel PT is first supported in Intel Core M and 5th generation Intel Core -processors that are based on the Intel micro-architecture code name Broadwell. - -Trace data is collected by 'perf record' and stored within the perf.data file. -See below for options to 'perf record'. - -Trace data must be 'decoded' which involves walking the object code and matching -the trace data packets. For example a TNT packet only tells whether a -conditional branch was taken or not taken, so to make use of that packet the -decoder must know precisely which instruction was being executed. - -Decoding is done on-the-fly. The decoder outputs samples in the same format as -samples output by perf hardware events, for example as though the "instructions" -or "branches" events had been recorded. Presently 3 tools support this: -'perf script', 'perf report' and 'perf inject'. See below for more information -on using those tools. - -The main distinguishing feature of Intel PT is that the decoder can determine -the exact flow of software execution. Intel PT can be used to understand why -and how did software get to a certain point, or behave a certain way. The -software does not have to be recompiled, so Intel PT works with debug or release -builds, however the executed images are needed - which makes use in JIT-compiled -environments, or with self-modified code, a challenge. Also symbols need to be -provided to make sense of addresses. - -A limitation of Intel PT is that it produces huge amounts of trace data -(hundreds of megabytes per second per core) which takes a long time to decode, -for example two or three orders of magnitude longer than it took to collect. -Another limitation is the performance impact of tracing, something that will -vary depending on the use-case and architecture. - - -Quickstart -========== - -It is important to start small. That is because it is easy to capture vastly -more data than can possibly be processed. - -The simplest thing to do with Intel PT is userspace profiling of small programs. -Data is captured with 'perf record' e.g. to trace 'ls' userspace-only: - - perf record -e intel_pt//u ls - -And profiled with 'perf report' e.g. - - perf report - -To also trace kernel space presents a problem, namely kernel self-modifying -code. A fairly good kernel image is available in /proc/kcore but to get an -accurate image a copy of /proc/kcore needs to be made under the same conditions -as the data capture. A script perf-with-kcore can do that, but beware that the -script makes use of 'sudo' to copy /proc/kcore. If you have perf installed -locally from the source tree you can do: - - ~/libexec/perf-core/perf-with-kcore record pt_ls -e intel_pt// -- ls - -which will create a directory named 'pt_ls' and put the perf.data file and -copies of /proc/kcore, /proc/kallsyms and /proc/modules into it. Then to use -'perf report' becomes: - - ~/libexec/perf-core/perf-with-kcore report pt_ls - -Because samples are synthesized after-the-fact, the sampling period can be -selected for reporting. e.g. sample every microsecond - - ~/libexec/perf-core/perf-with-kcore report pt_ls --itrace=i1usge - -See the sections below for more information about the --itrace option. - -Beware the smaller the period, the more samples that are produced, and the -longer it takes to process them. - -Also note that the coarseness of Intel PT timing information will start to -distort the statistical value of the sampling as the sampling period becomes -smaller. - -To represent software control flow, "branches" samples are produced. By default -a branch sample is synthesized for every single branch. To get an idea what -data is available you can use the 'perf script' tool with all itrace sampling -options, which will list all the samples. - - perf record -e intel_pt//u ls - perf script --itrace=ibxwpe - -An interesting field that is not printed by default is 'flags' which can be -displayed as follows: - - perf script --itrace=ibxwpe -F+flags - -The flags are "bcrosyiABEx" which stand for branch, call, return, conditional, -system, asynchronous, interrupt, transaction abort, trace begin, trace end, and -in transaction, respectively. - -Another interesting field that is not printed by default is 'ipc' which can be -displayed as follows: - - perf script --itrace=be -F+ipc - -There are two ways that instructions-per-cycle (IPC) can be calculated depending -on the recording. - -If the 'cyc' config term (see config terms section below) was used, then IPC is -calculated using the cycle count from CYC packets, otherwise MTC packets are -used - refer to the 'mtc' config term. When MTC is used, however, the values -are less accurate because the timing is less accurate. - -Because Intel PT does not update the cycle count on every branch or instruction, -the values will often be zero. When there are values, they will be the number -of instructions and number of cycles since the last update, and thus represent -the average IPC since the last IPC for that event type. Note IPC for "branches" -events is calculated separately from IPC for "instructions" events. - -Also note that the IPC instruction count may or may not include the current -instruction. If the cycle count is associated with an asynchronous branch -(e.g. page fault or interrupt), then the instruction count does not include the -current instruction, otherwise it does. That is consistent with whether or not -that instruction has retired when the cycle count is updated. - -Another note, in the case of "branches" events, non-taken branches are not -presently sampled, so IPC values for them do not appear e.g. a CYC packet with a -TNT packet that starts with a non-taken branch. To see every possible IPC -value, "instructions" events can be used e.g. --itrace=i0ns - -While it is possible to create scripts to analyze the data, an alternative -approach is available to export the data to a sqlite or postgresql database. -Refer to script export-to-sqlite.py or export-to-postgresql.py for more details, -and to script exported-sql-viewer.py for an example of using the database. - -There is also script intel-pt-events.py which provides an example of how to -unpack the raw data for power events and PTWRITE. - -As mentioned above, it is easy to capture too much data. One way to limit the -data captured is to use 'snapshot' mode which is explained further below. -Refer to 'new snapshot option' and 'Intel PT modes of operation' further below. - -Another problem that will be experienced is decoder errors. They can be caused -by inability to access the executed image, self-modified or JIT-ed code, or the -inability to match side-band information (such as context switches and mmaps) -which results in the decoder not knowing what code was executed. - -There is also the problem of perf not being able to copy the data fast enough, -resulting in data lost because the buffer was full. See 'Buffer handling' below -for more details. - - -perf record -=========== - -new event ---------- - -The Intel PT kernel driver creates a new PMU for Intel PT. PMU events are -selected by providing the PMU name followed by the "config" separated by slashes. -An enhancement has been made to allow default "config" e.g. the option - - -e intel_pt// - -will use a default config value. Currently that is the same as - - -e intel_pt/tsc,noretcomp=0/ - -which is the same as - - -e intel_pt/tsc=1,noretcomp=0/ - -Note there are now new config terms - see section 'config terms' further below. - -The config terms are listed in /sys/devices/intel_pt/format. They are bit -fields within the config member of the struct perf_event_attr which is -passed to the kernel by the perf_event_open system call. They correspond to bit -fields in the IA32_RTIT_CTL MSR. Here is a list of them and their definitions: - - $ grep -H . /sys/bus/event_source/devices/intel_pt/format/* - /sys/bus/event_source/devices/intel_pt/format/cyc:config:1 - /sys/bus/event_source/devices/intel_pt/format/cyc_thresh:config:19-22 - /sys/bus/event_source/devices/intel_pt/format/mtc:config:9 - /sys/bus/event_source/devices/intel_pt/format/mtc_period:config:14-17 - /sys/bus/event_source/devices/intel_pt/format/noretcomp:config:11 - /sys/bus/event_source/devices/intel_pt/format/psb_period:config:24-27 - /sys/bus/event_source/devices/intel_pt/format/tsc:config:10 - -Note that the default config must be overridden for each term i.e. - - -e intel_pt/noretcomp=0/ - -is the same as: - - -e intel_pt/tsc=1,noretcomp=0/ - -So, to disable TSC packets use: - - -e intel_pt/tsc=0/ - -It is also possible to specify the config value explicitly: - - -e intel_pt/config=0x400/ - -Note that, as with all events, the event is suffixed with event modifiers: - - u userspace - k kernel - h hypervisor - G guest - H host - p precise ip - -'h', 'G' and 'H' are for virtualization which is not supported by Intel PT. -'p' is also not relevant to Intel PT. So only options 'u' and 'k' are -meaningful for Intel PT. - -perf_event_attr is displayed if the -vv option is used e.g. - - ------------------------------------------------------------ - perf_event_attr: - type 6 - size 112 - config 0x400 - { sample_period, sample_freq } 1 - sample_type IP|TID|TIME|CPU|IDENTIFIER - read_format ID - disabled 1 - inherit 1 - exclude_kernel 1 - exclude_hv 1 - enable_on_exec 1 - sample_id_all 1 - ------------------------------------------------------------ - sys_perf_event_open: pid 31104 cpu 0 group_fd -1 flags 0x8 - sys_perf_event_open: pid 31104 cpu 1 group_fd -1 flags 0x8 - sys_perf_event_open: pid 31104 cpu 2 group_fd -1 flags 0x8 - sys_perf_event_open: pid 31104 cpu 3 group_fd -1 flags 0x8 - ------------------------------------------------------------ - - -config terms ------------- - -The June 2015 version of Intel 64 and IA-32 Architectures Software Developer -Manuals, Chapter 36 Intel Processor Trace, defined new Intel PT features. -Some of the features are reflect in new config terms. All the config terms are -described below. - -tsc Always supported. Produces TSC timestamp packets to provide - timing information. In some cases it is possible to decode - without timing information, for example a per-thread context - that does not overlap executable memory maps. - - The default config selects tsc (i.e. tsc=1). - -noretcomp Always supported. Disables "return compression" so a TIP packet - is produced when a function returns. Causes more packets to be - produced but might make decoding more reliable. - - The default config does not select noretcomp (i.e. noretcomp=0). - -psb_period Allows the frequency of PSB packets to be specified. - - The PSB packet is a synchronization packet that provides a - starting point for decoding or recovery from errors. - - Support for psb_period is indicated by: - - /sys/bus/event_source/devices/intel_pt/caps/psb_cyc - - which contains "1" if the feature is supported and "0" - otherwise. - - Valid values are given by: - - /sys/bus/event_source/devices/intel_pt/caps/psb_periods - - which contains a hexadecimal value, the bits of which represent - valid values e.g. bit 2 set means value 2 is valid. - - The psb_period value is converted to the approximate number of - trace bytes between PSB packets as: - - 2 ^ (value + 11) - - e.g. value 3 means 16KiB bytes between PSBs - - If an invalid value is entered, the error message - will give a list of valid values e.g. - - $ perf record -e intel_pt/psb_period=15/u uname - Invalid psb_period for intel_pt. Valid values are: 0-5 - - If MTC packets are selected, the default config selects a value - of 3 (i.e. psb_period=3) or the nearest lower value that is - supported (0 is always supported). Otherwise the default is 0. - - If decoding is expected to be reliable and the buffer is large - then a large PSB period can be used. - - Because a TSC packet is produced with PSB, the PSB period can - also affect the granularity to timing information in the absence - of MTC or CYC. - -mtc Produces MTC timing packets. - - MTC packets provide finer grain timestamp information than TSC - packets. MTC packets record time using the hardware crystal - clock (CTC) which is related to TSC packets using a TMA packet. - - Support for this feature is indicated by: - - /sys/bus/event_source/devices/intel_pt/caps/mtc - - which contains "1" if the feature is supported and - "0" otherwise. - - The frequency of MTC packets can also be specified - see - mtc_period below. - -mtc_period Specifies how frequently MTC packets are produced - see mtc - above for how to determine if MTC packets are supported. - - Valid values are given by: - - /sys/bus/event_source/devices/intel_pt/caps/mtc_periods - - which contains a hexadecimal value, the bits of which represent - valid values e.g. bit 2 set means value 2 is valid. - - The mtc_period value is converted to the MTC frequency as: - - CTC-frequency / (2 ^ value) - - e.g. value 3 means one eighth of CTC-frequency - - Where CTC is the hardware crystal clock, the frequency of which - can be related to TSC via values provided in cpuid leaf 0x15. - - If an invalid value is entered, the error message - will give a list of valid values e.g. - - $ perf record -e intel_pt/mtc_period=15/u uname - Invalid mtc_period for intel_pt. Valid values are: 0,3,6,9 - - The default value is 3 or the nearest lower value - that is supported (0 is always supported). - -cyc Produces CYC timing packets. - - CYC packets provide even finer grain timestamp information than - MTC and TSC packets. A CYC packet contains the number of CPU - cycles since the last CYC packet. Unlike MTC and TSC packets, - CYC packets are only sent when another packet is also sent. - - Support for this feature is indicated by: - - /sys/bus/event_source/devices/intel_pt/caps/psb_cyc - - which contains "1" if the feature is supported and - "0" otherwise. - - The number of CYC packets produced can be reduced by specifying - a threshold - see cyc_thresh below. - -cyc_thresh Specifies how frequently CYC packets are produced - see cyc - above for how to determine if CYC packets are supported. - - Valid cyc_thresh values are given by: - - /sys/bus/event_source/devices/intel_pt/caps/cycle_thresholds - - which contains a hexadecimal value, the bits of which represent - valid values e.g. bit 2 set means value 2 is valid. - - The cyc_thresh value represents the minimum number of CPU cycles - that must have passed before a CYC packet can be sent. The - number of CPU cycles is: - - 2 ^ (value - 1) - - e.g. value 4 means 8 CPU cycles must pass before a CYC packet - can be sent. Note a CYC packet is still only sent when another - packet is sent, not at, e.g. every 8 CPU cycles. - - If an invalid value is entered, the error message - will give a list of valid values e.g. - - $ perf record -e intel_pt/cyc,cyc_thresh=15/u uname - Invalid cyc_thresh for intel_pt. Valid values are: 0-12 - - CYC packets are not requested by default. - -pt Specifies pass-through which enables the 'branch' config term. - - The default config selects 'pt' if it is available, so a user will - never need to specify this term. - -branch Enable branch tracing. Branch tracing is enabled by default so to - disable branch tracing use 'branch=0'. - - The default config selects 'branch' if it is available. - -ptw Enable PTWRITE packets which are produced when a ptwrite instruction - is executed. - - Support for this feature is indicated by: - - /sys/bus/event_source/devices/intel_pt/caps/ptwrite - - which contains "1" if the feature is supported and - "0" otherwise. - -fup_on_ptw Enable a FUP packet to follow the PTWRITE packet. The FUP packet - provides the address of the ptwrite instruction. In the absence of - fup_on_ptw, the decoder will use the address of the previous branch - if branch tracing is enabled, otherwise the address will be zero. - Note that fup_on_ptw will work even when branch tracing is disabled. - -pwr_evt Enable power events. The power events provide information about - changes to the CPU C-state. - - Support for this feature is indicated by: - - /sys/bus/event_source/devices/intel_pt/caps/power_event_trace - - which contains "1" if the feature is supported and - "0" otherwise. - - -AUX area sampling option ------------------------- - -To select Intel PT "sampling" the AUX area sampling option can be used: - - --aux-sample - -Optionally it can be followed by the sample size in bytes e.g. - - --aux-sample=8192 - -In addition, the Intel PT event to sample must be defined e.g. - - -e intel_pt//u - -Samples on other events will be created containing Intel PT data e.g. the -following will create Intel PT samples on the branch-misses event, note the -events must be grouped using {}: - - perf record --aux-sample -e '{intel_pt//u,branch-misses:u}' - -An alternative to '--aux-sample' is to add the config term 'aux-sample-size' to -events. In this case, the grouping is implied e.g. - - perf record -e intel_pt//u -e branch-misses/aux-sample-size=8192/u - -is the same as: - - perf record -e '{intel_pt//u,branch-misses/aux-sample-size=8192/u}' - -but allows for also using an address filter e.g.: - - perf record -e intel_pt//u --filter 'filter * @/bin/ls' -e branch-misses/aux-sample-size=8192/u -- ls - -It is important to select a sample size that is big enough to contain at least -one PSB packet. If not a warning will be displayed: - - Intel PT sample size (%zu) may be too small for PSB period (%zu) - -The calculation used for that is: if sample_size <= psb_period + 256 display the -warning. When sampling is used, psb_period defaults to 0 (2KiB). - -The default sample size is 4KiB. - -The sample size is passed in aux_sample_size in struct perf_event_attr. The -sample size is limited by the maximum event size which is 64KiB. It is -difficult to know how big the event might be without the trace sample attached, -but the tool validates that the sample size is not greater than 60KiB. - - -new snapshot option -------------------- - -The difference between full trace and snapshot from the kernel's perspective is -that in full trace we don't overwrite trace data that the user hasn't collected -yet (and indicated that by advancing aux_tail), whereas in snapshot mode we let -the trace run and overwrite older data in the buffer so that whenever something -interesting happens, we can stop it and grab a snapshot of what was going on -around that interesting moment. - -To select snapshot mode a new option has been added: - - -S - -Optionally it can be followed by the snapshot size e.g. - - -S0x100000 - -The default snapshot size is the auxtrace mmap size. If neither auxtrace mmap size -nor snapshot size is specified, then the default is 4MiB for privileged users -(or if /proc/sys/kernel/perf_event_paranoid < 0), 128KiB for unprivileged users. -If an unprivileged user does not specify mmap pages, the mmap pages will be -reduced as described in the 'new auxtrace mmap size option' section below. - -The snapshot size is displayed if the option -vv is used e.g. - - Intel PT snapshot size: %zu - - -new auxtrace mmap size option ---------------------------- - -Intel PT buffer size is specified by an addition to the -m option e.g. - - -m,16 - -selects a buffer size of 16 pages i.e. 64KiB. - -Note that the existing functionality of -m is unchanged. The auxtrace mmap size -is specified by the optional addition of a comma and the value. - -The default auxtrace mmap size for Intel PT is 4MiB/page_size for privileged users -(or if /proc/sys/kernel/perf_event_paranoid < 0), 128KiB for unprivileged users. -If an unprivileged user does not specify mmap pages, the mmap pages will be -reduced from the default 512KiB/page_size to 256KiB/page_size, otherwise the -user is likely to get an error as they exceed their mlock limit (Max locked -memory as shown in /proc/self/limits). Note that perf does not count the first -512KiB (actually /proc/sys/kernel/perf_event_mlock_kb minus 1 page) per cpu -against the mlock limit so an unprivileged user is allowed 512KiB per cpu plus -their mlock limit (which defaults to 64KiB but is not multiplied by the number -of cpus). - -In full-trace mode, powers of two are allowed for buffer size, with a minimum -size of 2 pages. In snapshot mode or sampling mode, it is the same but the -minimum size is 1 page. - -The mmap size and auxtrace mmap size are displayed if the -vv option is used e.g. - - mmap length 528384 - auxtrace mmap length 4198400 - - -Intel PT modes of operation ---------------------------- - -Intel PT can be used in 2 modes: - full-trace mode - sample mode - snapshot mode - -Full-trace mode traces continuously e.g. - - perf record -e intel_pt//u uname - -Sample mode attaches a Intel PT sample to other events e.g. - - perf record --aux-sample -e intel_pt//u -e branch-misses:u - -Snapshot mode captures the available data when a signal is sent e.g. - - perf record -v -e intel_pt//u -S ./loopy 1000000000 & - [1] 11435 - kill -USR2 11435 - Recording AUX area tracing snapshot - -Note that the signal sent is SIGUSR2. -Note that "Recording AUX area tracing snapshot" is displayed because the -v -option is used. - -The 2 modes cannot be used together. - - -Buffer handling ---------------- - -There may be buffer limitations (i.e. single ToPa entry) which means that actual -buffer sizes are limited to powers of 2 up to 4MiB (MAX_ORDER). In order to -provide other sizes, and in particular an arbitrarily large size, multiple -buffers are logically concatenated. However an interrupt must be used to switch -between buffers. That has two potential problems: - a) the interrupt may not be handled in time so that the current buffer - becomes full and some trace data is lost. - b) the interrupts may slow the system and affect the performance - results. - -If trace data is lost, the driver sets 'truncated' in the PERF_RECORD_AUX event -which the tools report as an error. - -In full-trace mode, the driver waits for data to be copied out before allowing -the (logical) buffer to wrap-around. If data is not copied out quickly enough, -again 'truncated' is set in the PERF_RECORD_AUX event. If the driver has to -wait, the intel_pt event gets disabled. Because it is difficult to know when -that happens, perf tools always re-enable the intel_pt event after copying out -data. - - -Intel PT and build ids ----------------------- - -By default "perf record" post-processes the event stream to find all build ids -for executables for all addresses sampled. Deliberately, Intel PT is not -decoded for that purpose (it would take too long). Instead the build ids for -all executables encountered (due to mmap, comm or task events) are included -in the perf.data file. - -To see buildids included in the perf.data file use the command: - - perf buildid-list - -If the perf.data file contains Intel PT data, that is the same as: - - perf buildid-list --with-hits - - -Snapshot mode and event disabling ---------------------------------- - -In order to make a snapshot, the intel_pt event is disabled using an IOCTL, -namely PERF_EVENT_IOC_DISABLE. However doing that can also disable the -collection of side-band information. In order to prevent that, a dummy -software event has been introduced that permits tracking events (like mmaps) to -continue to be recorded while intel_pt is disabled. That is important to ensure -there is complete side-band information to allow the decoding of subsequent -snapshots. - -A test has been created for that. To find the test: - - perf test list - ... - 23: Test using a dummy software event to keep tracking - -To run the test: - - perf test 23 - 23: Test using a dummy software event to keep tracking : Ok - - -perf record modes (nothing new here) ------------------------------------- - -perf record essentially operates in one of three modes: - per thread - per cpu - workload only - -"per thread" mode is selected by -t or by --per-thread (with -p or -u or just a -workload). -"per cpu" is selected by -C or -a. -"workload only" mode is selected by not using the other options but providing a -command to run (i.e. the workload). - -In per-thread mode an exact list of threads is traced. There is no inheritance. -Each thread has its own event buffer. - -In per-cpu mode all processes (or processes from the selected cgroup i.e. -G -option, or processes selected with -p or -u) are traced. Each cpu has its own -buffer. Inheritance is allowed. - -In workload-only mode, the workload is traced but with per-cpu buffers. -Inheritance is allowed. Note that you can now trace a workload in per-thread -mode by using the --per-thread option. - - -Privileged vs non-privileged users ----------------------------------- - -Unless /proc/sys/kernel/perf_event_paranoid is set to -1, unprivileged users -have memory limits imposed upon them. That affects what buffer sizes they can -have as outlined above. - -The v4.2 kernel introduced support for a context switch metadata event, -PERF_RECORD_SWITCH, which allows unprivileged users to see when their processes -are scheduled out and in, just not by whom, which is left for the -PERF_RECORD_SWITCH_CPU_WIDE, that is only accessible in system wide context, -which in turn requires CAP_SYS_ADMIN. - -Please see the 45ac1403f564 ("perf: Add PERF_RECORD_SWITCH to indicate context -switches") commit, that introduces these metadata events for further info. - -When working with kernels < v4.2, the following considerations must be taken, -as the sched:sched_switch tracepoints will be used to receive such information: - -Unless /proc/sys/kernel/perf_event_paranoid is set to -1, unprivileged users are -not permitted to use tracepoints which means there is insufficient side-band -information to decode Intel PT in per-cpu mode, and potentially workload-only -mode too if the workload creates new processes. - -Note also, that to use tracepoints, read-access to debugfs is required. So if -debugfs is not mounted or the user does not have read-access, it will again not -be possible to decode Intel PT in per-cpu mode. - - -sched_switch tracepoint ------------------------ - -The sched_switch tracepoint is used to provide side-band data for Intel PT -decoding in kernels where the PERF_RECORD_SWITCH metadata event isn't -available. - -The sched_switch events are automatically added. e.g. the second event shown -below: - - $ perf record -vv -e intel_pt//u uname - ------------------------------------------------------------ - perf_event_attr: - type 6 - size 112 - config 0x400 - { sample_period, sample_freq } 1 - sample_type IP|TID|TIME|CPU|IDENTIFIER - read_format ID - disabled 1 - inherit 1 - exclude_kernel 1 - exclude_hv 1 - enable_on_exec 1 - sample_id_all 1 - ------------------------------------------------------------ - sys_perf_event_open: pid 31104 cpu 0 group_fd -1 flags 0x8 - sys_perf_event_open: pid 31104 cpu 1 group_fd -1 flags 0x8 - sys_perf_event_open: pid 31104 cpu 2 group_fd -1 flags 0x8 - sys_perf_event_open: pid 31104 cpu 3 group_fd -1 flags 0x8 - ------------------------------------------------------------ - perf_event_attr: - type 2 - size 112 - config 0x108 - { sample_period, sample_freq } 1 - sample_type IP|TID|TIME|CPU|PERIOD|RAW|IDENTIFIER - read_format ID - inherit 1 - sample_id_all 1 - exclude_guest 1 - ------------------------------------------------------------ - sys_perf_event_open: pid -1 cpu 0 group_fd -1 flags 0x8 - sys_perf_event_open: pid -1 cpu 1 group_fd -1 flags 0x8 - sys_perf_event_open: pid -1 cpu 2 group_fd -1 flags 0x8 - sys_perf_event_open: pid -1 cpu 3 group_fd -1 flags 0x8 - ------------------------------------------------------------ - perf_event_attr: - type 1 - size 112 - config 0x9 - { sample_period, sample_freq } 1 - sample_type IP|TID|TIME|IDENTIFIER - read_format ID - disabled 1 - inherit 1 - exclude_kernel 1 - exclude_hv 1 - mmap 1 - comm 1 - enable_on_exec 1 - task 1 - sample_id_all 1 - mmap2 1 - comm_exec 1 - ------------------------------------------------------------ - sys_perf_event_open: pid 31104 cpu 0 group_fd -1 flags 0x8 - sys_perf_event_open: pid 31104 cpu 1 group_fd -1 flags 0x8 - sys_perf_event_open: pid 31104 cpu 2 group_fd -1 flags 0x8 - sys_perf_event_open: pid 31104 cpu 3 group_fd -1 flags 0x8 - mmap size 528384B - AUX area mmap length 4194304 - perf event ring buffer mmapped per cpu - Synthesizing auxtrace information - Linux - [ perf record: Woken up 1 times to write data ] - [ perf record: Captured and wrote 0.042 MB perf.data ] - -Note, the sched_switch event is only added if the user is permitted to use it -and only in per-cpu mode. - -Note also, the sched_switch event is only added if TSC packets are requested. -That is because, in the absence of timing information, the sched_switch events -cannot be matched against the Intel PT trace. - - -perf script -=========== - -By default, perf script will decode trace data found in the perf.data file. -This can be further controlled by new option --itrace. - - -New --itrace option -------------------- - -Having no option is the same as - - --itrace - -which, in turn, is the same as - - --itrace=cepwx - -The letters are: - - i synthesize "instructions" events - b synthesize "branches" events - x synthesize "transactions" events - w synthesize "ptwrite" events - p synthesize "power" events - c synthesize branches events (calls only) - r synthesize branches events (returns only) - e synthesize tracing error events - d create a debug log - g synthesize a call chain (use with i or x) - l synthesize last branch entries (use with i or x) - s skip initial number of events - -"Instructions" events look like they were recorded by "perf record -e -instructions". - -"Branches" events look like they were recorded by "perf record -e branches". "c" -and "r" can be combined to get calls and returns. - -"Transactions" events correspond to the start or end of transactions. The -'flags' field can be used in perf script to determine whether the event is a -tranasaction start, commit or abort. - -Note that "instructions", "branches" and "transactions" events depend on code -flow packets which can be disabled by using the config term "branch=0". Refer -to the config terms section above. - -"ptwrite" events record the payload of the ptwrite instruction and whether -"fup_on_ptw" was used. "ptwrite" events depend on PTWRITE packets which are -recorded only if the "ptw" config term was used. Refer to the config terms -section above. perf script "synth" field displays "ptwrite" information like -this: "ip: 0 payload: 0x123456789abcdef0" where "ip" is 1 if "fup_on_ptw" was -used. - -"Power" events correspond to power event packets and CBR (core-to-bus ratio) -packets. While CBR packets are always recorded when tracing is enabled, power -event packets are recorded only if the "pwr_evt" config term was used. Refer to -the config terms section above. The power events record information about -C-state changes, whereas CBR is indicative of CPU frequency. perf script -"event,synth" fields display information like this: - cbr: cbr: 22 freq: 2189 MHz (200%) - mwait: hints: 0x60 extensions: 0x1 - pwre: hw: 0 cstate: 2 sub-cstate: 0 - exstop: ip: 1 - pwrx: deepest cstate: 2 last cstate: 2 wake reason: 0x4 -Where: - "cbr" includes the frequency and the percentage of maximum non-turbo - "mwait" shows mwait hints and extensions - "pwre" shows C-state transitions (to a C-state deeper than C0) and - whether initiated by hardware - "exstop" indicates execution stopped and whether the IP was recorded - exactly, - "pwrx" indicates return to C0 -For more details refer to the Intel 64 and IA-32 Architectures Software -Developer Manuals. - -Error events show where the decoder lost the trace. Error events -are quite important. Users must know if what they are seeing is a complete -picture or not. - -The "d" option will cause the creation of a file "intel_pt.log" containing all -decoded packets and instructions. Note that this option slows down the decoder -and that the resulting file may be very large. - -In addition, the period of the "instructions" event can be specified. e.g. - - --itrace=i10us - -sets the period to 10us i.e. one instruction sample is synthesized for each 10 -microseconds of trace. Alternatives to "us" are "ms" (milliseconds), -"ns" (nanoseconds), "t" (TSC ticks) or "i" (instructions). - -"ms", "us" and "ns" are converted to TSC ticks. - -The timing information included with Intel PT does not give the time of every -instruction. Consequently, for the purpose of sampling, the decoder estimates -the time since the last timing packet based on 1 tick per instruction. The time -on the sample is *not* adjusted and reflects the last known value of TSC. - -For Intel PT, the default period is 100us. - -Setting it to a zero period means "as often as possible". - -In the case of Intel PT that is the same as a period of 1 and a unit of -'instructions' (i.e. --itrace=i1i). - -Also the call chain size (default 16, max. 1024) for instructions or -transactions events can be specified. e.g. - - --itrace=ig32 - --itrace=xg32 - -Also the number of last branch entries (default 64, max. 1024) for instructions or -transactions events can be specified. e.g. - - --itrace=il10 - --itrace=xl10 - -Note that last branch entries are cleared for each sample, so there is no overlap -from one sample to the next. - -To disable trace decoding entirely, use the option --no-itrace. - -It is also possible to skip events generated (instructions, branches, transactions) -at the beginning. This is useful to ignore initialization code. - - --itrace=i0nss1000000 - -skips the first million instructions. - -dump option ------------ - -perf script has an option (-D) to "dump" the events i.e. display the binary -data. - -When -D is used, Intel PT packets are displayed. The packet decoder does not -pay attention to PSB packets, but just decodes the bytes - so the packets seen -by the actual decoder may not be identical in places where the data is corrupt. -One example of that would be when the buffer-switching interrupt has been too -slow, and the buffer has been filled completely. In that case, the last packet -in the buffer might be truncated and immediately followed by a PSB as the trace -continues in the next buffer. - -To disable the display of Intel PT packets, combine the -D option with ---no-itrace. - - -perf report -=========== - -By default, perf report will decode trace data found in the perf.data file. -This can be further controlled by new option --itrace exactly the same as -perf script, with the exception that the default is --itrace=igxe. - - -perf inject -=========== - -perf inject also accepts the --itrace option in which case tracing data is -removed and replaced with the synthesized events. e.g. - - perf inject --itrace -i perf.data -o perf.data.new - -Below is an example of using Intel PT with autofdo. It requires autofdo -(https://github.com/google/autofdo) and gcc version 5. The bubble -sort example is from the AutoFDO tutorial (https://gcc.gnu.org/wiki/AutoFDO/Tutorial) -amended to take the number of elements as a parameter. - - $ gcc-5 -O3 sort.c -o sort_optimized - $ ./sort_optimized 30000 - Bubble sorting array of 30000 elements - 2254 ms - - $ cat ~/.perfconfig - [intel-pt] - mispred-all = on - - $ perf record -e intel_pt//u ./sort 3000 - Bubble sorting array of 3000 elements - 58 ms - [ perf record: Woken up 2 times to write data ] - [ perf record: Captured and wrote 3.939 MB perf.data ] - $ perf inject -i perf.data -o inj --itrace=i100usle --strip - $ ./create_gcov --binary=./sort --profile=inj --gcov=sort.gcov -gcov_version=1 - $ gcc-5 -O3 -fauto-profile=sort.gcov sort.c -o sort_autofdo - $ ./sort_autofdo 30000 - Bubble sorting array of 30000 elements - 2155 ms - -Note there is currently no advantage to using Intel PT instead of LBR, but -that may change in the future if greater use is made of the data. - - -PEBS via Intel PT -================= - -Some hardware has the feature to redirect PEBS records to the Intel PT trace. -Recording is selected by using the aux-output config term e.g. - - perf record -c 10000 -e '{intel_pt/branch=0/,cycles/aux-output/ppp}' uname - -Note that currently, software only supports redirecting at most one PEBS event. - -To display PEBS events from the Intel PT trace, use the itrace 'o' option e.g. - - perf script --itrace=oe +Documentation for support for Intel Processor Trace within perf tools' has moved to file perf-intel-pt.txt diff --git a/tools/perf/Documentation/perf-inject.txt b/tools/perf/Documentation/perf-inject.txt index a64d6588470e..70969ea73e01 100644 --- a/tools/perf/Documentation/perf-inject.txt +++ b/tools/perf/Documentation/perf-inject.txt @@ -66,4 +66,5 @@ include::itrace.txt[] SEE ALSO -------- -linkperf:perf-record[1], linkperf:perf-report[1], linkperf:perf-archive[1] +linkperf:perf-record[1], linkperf:perf-report[1], linkperf:perf-archive[1], +linkperf:perf-intel-pt[1] diff --git a/tools/perf/Documentation/perf-intel-pt.txt b/tools/perf/Documentation/perf-intel-pt.txt new file mode 100644 index 000000000000..456fdcbf26ac --- /dev/null +++ b/tools/perf/Documentation/perf-intel-pt.txt @@ -0,0 +1,1007 @@ +perf-intel-pt(1) +================ + +NAME +---- +perf-intel-pt - Support for Intel Processor Trace within perf tools + +SYNOPSIS +-------- +[verse] +'perf record' -e intel_pt// + +DESCRIPTION +----------- + +Intel Processor Trace (Intel PT) is an extension of Intel Architecture that +collects information about software execution such as control flow, execution +modes and timings and formats it into highly compressed binary packets. +Technical details are documented in the Intel 64 and IA-32 Architectures +Software Developer Manuals, Chapter 36 Intel Processor Trace. + +Intel PT is first supported in Intel Core M and 5th generation Intel Core +processors that are based on the Intel micro-architecture code name Broadwell. + +Trace data is collected by 'perf record' and stored within the perf.data file. +See below for options to 'perf record'. + +Trace data must be 'decoded' which involves walking the object code and matching +the trace data packets. For example a TNT packet only tells whether a +conditional branch was taken or not taken, so to make use of that packet the +decoder must know precisely which instruction was being executed. + +Decoding is done on-the-fly. The decoder outputs samples in the same format as +samples output by perf hardware events, for example as though the "instructions" +or "branches" events had been recorded. Presently 3 tools support this: +'perf script', 'perf report' and 'perf inject'. See below for more information +on using those tools. + +The main distinguishing feature of Intel PT is that the decoder can determine +the exact flow of software execution. Intel PT can be used to understand why +and how did software get to a certain point, or behave a certain way. The +software does not have to be recompiled, so Intel PT works with debug or release +builds, however the executed images are needed - which makes use in JIT-compiled +environments, or with self-modified code, a challenge. Also symbols need to be +provided to make sense of addresses. + +A limitation of Intel PT is that it produces huge amounts of trace data +(hundreds of megabytes per second per core) which takes a long time to decode, +for example two or three orders of magnitude longer than it took to collect. +Another limitation is the performance impact of tracing, something that will +vary depending on the use-case and architecture. + + +Quickstart +---------- + +It is important to start small. That is because it is easy to capture vastly +more data than can possibly be processed. + +The simplest thing to do with Intel PT is userspace profiling of small programs. +Data is captured with 'perf record' e.g. to trace 'ls' userspace-only: + + perf record -e intel_pt//u ls + +And profiled with 'perf report' e.g. + + perf report + +To also trace kernel space presents a problem, namely kernel self-modifying +code. A fairly good kernel image is available in /proc/kcore but to get an +accurate image a copy of /proc/kcore needs to be made under the same conditions +as the data capture. A script perf-with-kcore can do that, but beware that the +script makes use of 'sudo' to copy /proc/kcore. If you have perf installed +locally from the source tree you can do: + + ~/libexec/perf-core/perf-with-kcore record pt_ls -e intel_pt// -- ls + +which will create a directory named 'pt_ls' and put the perf.data file and +copies of /proc/kcore, /proc/kallsyms and /proc/modules into it. Then to use +'perf report' becomes: + + ~/libexec/perf-core/perf-with-kcore report pt_ls + +Because samples are synthesized after-the-fact, the sampling period can be +selected for reporting. e.g. sample every microsecond + + ~/libexec/perf-core/perf-with-kcore report pt_ls --itrace=i1usge + +See the sections below for more information about the --itrace option. + +Beware the smaller the period, the more samples that are produced, and the +longer it takes to process them. + +Also note that the coarseness of Intel PT timing information will start to +distort the statistical value of the sampling as the sampling period becomes +smaller. + +To represent software control flow, "branches" samples are produced. By default +a branch sample is synthesized for every single branch. To get an idea what +data is available you can use the 'perf script' tool with all itrace sampling +options, which will list all the samples. + + perf record -e intel_pt//u ls + perf script --itrace=ibxwpe + +An interesting field that is not printed by default is 'flags' which can be +displayed as follows: + + perf script --itrace=ibxwpe -F+flags + +The flags are "bcrosyiABEx" which stand for branch, call, return, conditional, +system, asynchronous, interrupt, transaction abort, trace begin, trace end, and +in transaction, respectively. + +Another interesting field that is not printed by default is 'ipc' which can be +displayed as follows: + + perf script --itrace=be -F+ipc + +There are two ways that instructions-per-cycle (IPC) can be calculated depending +on the recording. + +If the 'cyc' config term (see config terms section below) was used, then IPC is +calculated using the cycle count from CYC packets, otherwise MTC packets are +used - refer to the 'mtc' config term. When MTC is used, however, the values +are less accurate because the timing is less accurate. + +Because Intel PT does not update the cycle count on every branch or instruction, +the values will often be zero. When there are values, they will be the number +of instructions and number of cycles since the last update, and thus represent +the average IPC since the last IPC for that event type. Note IPC for "branches" +events is calculated separately from IPC for "instructions" events. + +Also note that the IPC instruction count may or may not include the current +instruction. If the cycle count is associated with an asynchronous branch +(e.g. page fault or interrupt), then the instruction count does not include the +current instruction, otherwise it does. That is consistent with whether or not +that instruction has retired when the cycle count is updated. + +Another note, in the case of "branches" events, non-taken branches are not +presently sampled, so IPC values for them do not appear e.g. a CYC packet with a +TNT packet that starts with a non-taken branch. To see every possible IPC +value, "instructions" events can be used e.g. --itrace=i0ns + +While it is possible to create scripts to analyze the data, an alternative +approach is available to export the data to a sqlite or postgresql database. +Refer to script export-to-sqlite.py or export-to-postgresql.py for more details, +and to script exported-sql-viewer.py for an example of using the database. + +There is also script intel-pt-events.py which provides an example of how to +unpack the raw data for power events and PTWRITE. + +As mentioned above, it is easy to capture too much data. One way to limit the +data captured is to use 'snapshot' mode which is explained further below. +Refer to 'new snapshot option' and 'Intel PT modes of operation' further below. + +Another problem that will be experienced is decoder errors. They can be caused +by inability to access the executed image, self-modified or JIT-ed code, or the +inability to match side-band information (such as context switches and mmaps) +which results in the decoder not knowing what code was executed. + +There is also the problem of perf not being able to copy the data fast enough, +resulting in data lost because the buffer was full. See 'Buffer handling' below +for more details. + + +perf record +----------- + +new event +~~~~~~~~~ + +The Intel PT kernel driver creates a new PMU for Intel PT. PMU events are +selected by providing the PMU name followed by the "config" separated by slashes. +An enhancement has been made to allow default "config" e.g. the option + + -e intel_pt// + +will use a default config value. Currently that is the same as + + -e intel_pt/tsc,noretcomp=0/ + +which is the same as + + -e intel_pt/tsc=1,noretcomp=0/ + +Note there are now new config terms - see section 'config terms' further below. + +The config terms are listed in /sys/devices/intel_pt/format. They are bit +fields within the config member of the struct perf_event_attr which is +passed to the kernel by the perf_event_open system call. They correspond to bit +fields in the IA32_RTIT_CTL MSR. Here is a list of them and their definitions: + + $ grep -H . /sys/bus/event_source/devices/intel_pt/format/* + /sys/bus/event_source/devices/intel_pt/format/cyc:config:1 + /sys/bus/event_source/devices/intel_pt/format/cyc_thresh:config:19-22 + /sys/bus/event_source/devices/intel_pt/format/mtc:config:9 + /sys/bus/event_source/devices/intel_pt/format/mtc_period:config:14-17 + /sys/bus/event_source/devices/intel_pt/format/noretcomp:config:11 + /sys/bus/event_source/devices/intel_pt/format/psb_period:config:24-27 + /sys/bus/event_source/devices/intel_pt/format/tsc:config:10 + +Note that the default config must be overridden for each term i.e. + + -e intel_pt/noretcomp=0/ + +is the same as: + + -e intel_pt/tsc=1,noretcomp=0/ + +So, to disable TSC packets use: + + -e intel_pt/tsc=0/ + +It is also possible to specify the config value explicitly: + + -e intel_pt/config=0x400/ + +Note that, as with all events, the event is suffixed with event modifiers: + + u userspace + k kernel + h hypervisor + G guest + H host + p precise ip + +'h', 'G' and 'H' are for virtualization which is not supported by Intel PT. +'p' is also not relevant to Intel PT. So only options 'u' and 'k' are +meaningful for Intel PT. + +perf_event_attr is displayed if the -vv option is used e.g. + + ------------------------------------------------------------ + perf_event_attr: + type 6 + size 112 + config 0x400 + { sample_period, sample_freq } 1 + sample_type IP|TID|TIME|CPU|IDENTIFIER + read_format ID + disabled 1 + inherit 1 + exclude_kernel 1 + exclude_hv 1 + enable_on_exec 1 + sample_id_all 1 + ------------------------------------------------------------ + sys_perf_event_open: pid 31104 cpu 0 group_fd -1 flags 0x8 + sys_perf_event_open: pid 31104 cpu 1 group_fd -1 flags 0x8 + sys_perf_event_open: pid 31104 cpu 2 group_fd -1 flags 0x8 + sys_perf_event_open: pid 31104 cpu 3 group_fd -1 flags 0x8 + ------------------------------------------------------------ + + +config terms +~~~~~~~~~~~~ + +The June 2015 version of Intel 64 and IA-32 Architectures Software Developer +Manuals, Chapter 36 Intel Processor Trace, defined new Intel PT features. +Some of the features are reflect in new config terms. All the config terms are +described below. + +tsc Always supported. Produces TSC timestamp packets to provide + timing information. In some cases it is possible to decode + without timing information, for example a per-thread context + that does not overlap executable memory maps. + + The default config selects tsc (i.e. tsc=1). + +noretcomp Always supported. Disables "return compression" so a TIP packet + is produced when a function returns. Causes more packets to be + produced but might make decoding more reliable. + + The default config does not select noretcomp (i.e. noretcomp=0). + +psb_period Allows the frequency of PSB packets to be specified. + + The PSB packet is a synchronization packet that provides a + starting point for decoding or recovery from errors. + + Support for psb_period is indicated by: + + /sys/bus/event_source/devices/intel_pt/caps/psb_cyc + + which contains "1" if the feature is supported and "0" + otherwise. + + Valid values are given by: + + /sys/bus/event_source/devices/intel_pt/caps/psb_periods + + which contains a hexadecimal value, the bits of which represent + valid values e.g. bit 2 set means value 2 is valid. + + The psb_period value is converted to the approximate number of + trace bytes between PSB packets as: + + 2 ^ (value + 11) + + e.g. value 3 means 16KiB bytes between PSBs + + If an invalid value is entered, the error message + will give a list of valid values e.g. + + $ perf record -e intel_pt/psb_period=15/u uname + Invalid psb_period for intel_pt. Valid values are: 0-5 + + If MTC packets are selected, the default config selects a value + of 3 (i.e. psb_period=3) or the nearest lower value that is + supported (0 is always supported). Otherwise the default is 0. + + If decoding is expected to be reliable and the buffer is large + then a large PSB period can be used. + + Because a TSC packet is produced with PSB, the PSB period can + also affect the granularity to timing information in the absence + of MTC or CYC. + +mtc Produces MTC timing packets. + + MTC packets provide finer grain timestamp information than TSC + packets. MTC packets record time using the hardware crystal + clock (CTC) which is related to TSC packets using a TMA packet. + + Support for this feature is indicated by: + + /sys/bus/event_source/devices/intel_pt/caps/mtc + + which contains "1" if the feature is supported and + "0" otherwise. + + The frequency of MTC packets can also be specified - see + mtc_period below. + +mtc_period Specifies how frequently MTC packets are produced - see mtc + above for how to determine if MTC packets are supported. + + Valid values are given by: + + /sys/bus/event_source/devices/intel_pt/caps/mtc_periods + + which contains a hexadecimal value, the bits of which represent + valid values e.g. bit 2 set means value 2 is valid. + + The mtc_period value is converted to the MTC frequency as: + + CTC-frequency / (2 ^ value) + + e.g. value 3 means one eighth of CTC-frequency + + Where CTC is the hardware crystal clock, the frequency of which + can be related to TSC via values provided in cpuid leaf 0x15. + + If an invalid value is entered, the error message + will give a list of valid values e.g. + + $ perf record -e intel_pt/mtc_period=15/u uname + Invalid mtc_period for intel_pt. Valid values are: 0,3,6,9 + + The default value is 3 or the nearest lower value + that is supported (0 is always supported). + +cyc Produces CYC timing packets. + + CYC packets provide even finer grain timestamp information than + MTC and TSC packets. A CYC packet contains the number of CPU + cycles since the last CYC packet. Unlike MTC and TSC packets, + CYC packets are only sent when another packet is also sent. + + Support for this feature is indicated by: + + /sys/bus/event_source/devices/intel_pt/caps/psb_cyc + + which contains "1" if the feature is supported and + "0" otherwise. + + The number of CYC packets produced can be reduced by specifying + a threshold - see cyc_thresh below. + +cyc_thresh Specifies how frequently CYC packets are produced - see cyc + above for how to determine if CYC packets are supported. + + Valid cyc_thresh values are given by: + + /sys/bus/event_source/devices/intel_pt/caps/cycle_thresholds + + which contains a hexadecimal value, the bits of which represent + valid values e.g. bit 2 set means value 2 is valid. + + The cyc_thresh value represents the minimum number of CPU cycles + that must have passed before a CYC packet can be sent. The + number of CPU cycles is: + + 2 ^ (value - 1) + + e.g. value 4 means 8 CPU cycles must pass before a CYC packet + can be sent. Note a CYC packet is still only sent when another + packet is sent, not at, e.g. every 8 CPU cycles. + + If an invalid value is entered, the error message + will give a list of valid values e.g. + + $ perf record -e intel_pt/cyc,cyc_thresh=15/u uname + Invalid cyc_thresh for intel_pt. Valid values are: 0-12 + + CYC packets are not requested by default. + +pt Specifies pass-through which enables the 'branch' config term. + + The default config selects 'pt' if it is available, so a user will + never need to specify this term. + +branch Enable branch tracing. Branch tracing is enabled by default so to + disable branch tracing use 'branch=0'. + + The default config selects 'branch' if it is available. + +ptw Enable PTWRITE packets which are produced when a ptwrite instruction + is executed. + + Support for this feature is indicated by: + + /sys/bus/event_source/devices/intel_pt/caps/ptwrite + + which contains "1" if the feature is supported and + "0" otherwise. + +fup_on_ptw Enable a FUP packet to follow the PTWRITE packet. The FUP packet + provides the address of the ptwrite instruction. In the absence of + fup_on_ptw, the decoder will use the address of the previous branch + if branch tracing is enabled, otherwise the address will be zero. + Note that fup_on_ptw will work even when branch tracing is disabled. + +pwr_evt Enable power events. The power events provide information about + changes to the CPU C-state. + + Support for this feature is indicated by: + + /sys/bus/event_source/devices/intel_pt/caps/power_event_trace + + which contains "1" if the feature is supported and + "0" otherwise. + + +AUX area sampling option +~~~~~~~~~~~~~~~~~~~~~~~~ + +To select Intel PT "sampling" the AUX area sampling option can be used: + + --aux-sample + +Optionally it can be followed by the sample size in bytes e.g. + + --aux-sample=8192 + +In addition, the Intel PT event to sample must be defined e.g. + + -e intel_pt//u + +Samples on other events will be created containing Intel PT data e.g. the +following will create Intel PT samples on the branch-misses event, note the +events must be grouped using {}: + + perf record --aux-sample -e '{intel_pt//u,branch-misses:u}' + +An alternative to '--aux-sample' is to add the config term 'aux-sample-size' to +events. In this case, the grouping is implied e.g. + + perf record -e intel_pt//u -e branch-misses/aux-sample-size=8192/u + +is the same as: + + perf record -e '{intel_pt//u,branch-misses/aux-sample-size=8192/u}' + +but allows for also using an address filter e.g.: + + perf record -e intel_pt//u --filter 'filter * @/bin/ls' -e branch-misses/aux-sample-size=8192/u -- ls + +It is important to select a sample size that is big enough to contain at least +one PSB packet. If not a warning will be displayed: + + Intel PT sample size (%zu) may be too small for PSB period (%zu) + +The calculation used for that is: if sample_size <= psb_period + 256 display the +warning. When sampling is used, psb_period defaults to 0 (2KiB). + +The default sample size is 4KiB. + +The sample size is passed in aux_sample_size in struct perf_event_attr. The +sample size is limited by the maximum event size which is 64KiB. It is +difficult to know how big the event might be without the trace sample attached, +but the tool validates that the sample size is not greater than 60KiB. + + +new snapshot option +~~~~~~~~~~~~~~~~~~~ + +The difference between full trace and snapshot from the kernel's perspective is +that in full trace we don't overwrite trace data that the user hasn't collected +yet (and indicated that by advancing aux_tail), whereas in snapshot mode we let +the trace run and overwrite older data in the buffer so that whenever something +interesting happens, we can stop it and grab a snapshot of what was going on +around that interesting moment. + +To select snapshot mode a new option has been added: + + -S + +Optionally it can be followed by the snapshot size e.g. + + -S0x100000 + +The default snapshot size is the auxtrace mmap size. If neither auxtrace mmap size +nor snapshot size is specified, then the default is 4MiB for privileged users +(or if /proc/sys/kernel/perf_event_paranoid < 0), 128KiB for unprivileged users. +If an unprivileged user does not specify mmap pages, the mmap pages will be +reduced as described in the 'new auxtrace mmap size option' section below. + +The snapshot size is displayed if the option -vv is used e.g. + + Intel PT snapshot size: %zu + + +new auxtrace mmap size option +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Intel PT buffer size is specified by an addition to the -m option e.g. + + -m,16 + +selects a buffer size of 16 pages i.e. 64KiB. + +Note that the existing functionality of -m is unchanged. The auxtrace mmap size +is specified by the optional addition of a comma and the value. + +The default auxtrace mmap size for Intel PT is 4MiB/page_size for privileged users +(or if /proc/sys/kernel/perf_event_paranoid < 0), 128KiB for unprivileged users. +If an unprivileged user does not specify mmap pages, the mmap pages will be +reduced from the default 512KiB/page_size to 256KiB/page_size, otherwise the +user is likely to get an error as they exceed their mlock limit (Max locked +memory as shown in /proc/self/limits). Note that perf does not count the first +512KiB (actually /proc/sys/kernel/perf_event_mlock_kb minus 1 page) per cpu +against the mlock limit so an unprivileged user is allowed 512KiB per cpu plus +their mlock limit (which defaults to 64KiB but is not multiplied by the number +of cpus). + +In full-trace mode, powers of two are allowed for buffer size, with a minimum +size of 2 pages. In snapshot mode or sampling mode, it is the same but the +minimum size is 1 page. + +The mmap size and auxtrace mmap size are displayed if the -vv option is used e.g. + + mmap length 528384 + auxtrace mmap length 4198400 + + +Intel PT modes of operation +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Intel PT can be used in 2 modes: + full-trace mode + sample mode + snapshot mode + +Full-trace mode traces continuously e.g. + + perf record -e intel_pt//u uname + +Sample mode attaches a Intel PT sample to other events e.g. + + perf record --aux-sample -e intel_pt//u -e branch-misses:u + +Snapshot mode captures the available data when a signal is sent e.g. + + perf record -v -e intel_pt//u -S ./loopy 1000000000 & + [1] 11435 + kill -USR2 11435 + Recording AUX area tracing snapshot + +Note that the signal sent is SIGUSR2. +Note that "Recording AUX area tracing snapshot" is displayed because the -v +option is used. + +The 2 modes cannot be used together. + + +Buffer handling +~~~~~~~~~~~~~~~ + +There may be buffer limitations (i.e. single ToPa entry) which means that actual +buffer sizes are limited to powers of 2 up to 4MiB (MAX_ORDER). In order to +provide other sizes, and in particular an arbitrarily large size, multiple +buffers are logically concatenated. However an interrupt must be used to switch +between buffers. That has two potential problems: + a) the interrupt may not be handled in time so that the current buffer + becomes full and some trace data is lost. + b) the interrupts may slow the system and affect the performance + results. + +If trace data is lost, the driver sets 'truncated' in the PERF_RECORD_AUX event +which the tools report as an error. + +In full-trace mode, the driver waits for data to be copied out before allowing +the (logical) buffer to wrap-around. If data is not copied out quickly enough, +again 'truncated' is set in the PERF_RECORD_AUX event. If the driver has to +wait, the intel_pt event gets disabled. Because it is difficult to know when +that happens, perf tools always re-enable the intel_pt event after copying out +data. + + +Intel PT and build ids +~~~~~~~~~~~~~~~~~~~~~~ + +By default "perf record" post-processes the event stream to find all build ids +for executables for all addresses sampled. Deliberately, Intel PT is not +decoded for that purpose (it would take too long). Instead the build ids for +all executables encountered (due to mmap, comm or task events) are included +in the perf.data file. + +To see buildids included in the perf.data file use the command: + + perf buildid-list + +If the perf.data file contains Intel PT data, that is the same as: + + perf buildid-list --with-hits + + +Snapshot mode and event disabling +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In order to make a snapshot, the intel_pt event is disabled using an IOCTL, +namely PERF_EVENT_IOC_DISABLE. However doing that can also disable the +collection of side-band information. In order to prevent that, a dummy +software event has been introduced that permits tracking events (like mmaps) to +continue to be recorded while intel_pt is disabled. That is important to ensure +there is complete side-band information to allow the decoding of subsequent +snapshots. + +A test has been created for that. To find the test: + + perf test list + ... + 23: Test using a dummy software event to keep tracking + +To run the test: + + perf test 23 + 23: Test using a dummy software event to keep tracking : Ok + + +perf record modes (nothing new here) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +perf record essentially operates in one of three modes: + per thread + per cpu + workload only + +"per thread" mode is selected by -t or by --per-thread (with -p or -u or just a +workload). +"per cpu" is selected by -C or -a. +"workload only" mode is selected by not using the other options but providing a +command to run (i.e. the workload). + +In per-thread mode an exact list of threads is traced. There is no inheritance. +Each thread has its own event buffer. + +In per-cpu mode all processes (or processes from the selected cgroup i.e. -G +option, or processes selected with -p or -u) are traced. Each cpu has its own +buffer. Inheritance is allowed. + +In workload-only mode, the workload is traced but with per-cpu buffers. +Inheritance is allowed. Note that you can now trace a workload in per-thread +mode by using the --per-thread option. + + +Privileged vs non-privileged users +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Unless /proc/sys/kernel/perf_event_paranoid is set to -1, unprivileged users +have memory limits imposed upon them. That affects what buffer sizes they can +have as outlined above. + +The v4.2 kernel introduced support for a context switch metadata event, +PERF_RECORD_SWITCH, which allows unprivileged users to see when their processes +are scheduled out and in, just not by whom, which is left for the +PERF_RECORD_SWITCH_CPU_WIDE, that is only accessible in system wide context, +which in turn requires CAP_SYS_ADMIN. + +Please see the 45ac1403f564 ("perf: Add PERF_RECORD_SWITCH to indicate context +switches") commit, that introduces these metadata events for further info. + +When working with kernels < v4.2, the following considerations must be taken, +as the sched:sched_switch tracepoints will be used to receive such information: + +Unless /proc/sys/kernel/perf_event_paranoid is set to -1, unprivileged users are +not permitted to use tracepoints which means there is insufficient side-band +information to decode Intel PT in per-cpu mode, and potentially workload-only +mode too if the workload creates new processes. + +Note also, that to use tracepoints, read-access to debugfs is required. So if +debugfs is not mounted or the user does not have read-access, it will again not +be possible to decode Intel PT in per-cpu mode. + + +sched_switch tracepoint +~~~~~~~~~~~~~~~~~~~~~~~ + +The sched_switch tracepoint is used to provide side-band data for Intel PT +decoding in kernels where the PERF_RECORD_SWITCH metadata event isn't +available. + +The sched_switch events are automatically added. e.g. the second event shown +below: + + $ perf record -vv -e intel_pt//u uname + ------------------------------------------------------------ + perf_event_attr: + type 6 + size 112 + config 0x400 + { sample_period, sample_freq } 1 + sample_type IP|TID|TIME|CPU|IDENTIFIER + read_format ID + disabled 1 + inherit 1 + exclude_kernel 1 + exclude_hv 1 + enable_on_exec 1 + sample_id_all 1 + ------------------------------------------------------------ + sys_perf_event_open: pid 31104 cpu 0 group_fd -1 flags 0x8 + sys_perf_event_open: pid 31104 cpu 1 group_fd -1 flags 0x8 + sys_perf_event_open: pid 31104 cpu 2 group_fd -1 flags 0x8 + sys_perf_event_open: pid 31104 cpu 3 group_fd -1 flags 0x8 + ------------------------------------------------------------ + perf_event_attr: + type 2 + size 112 + config 0x108 + { sample_period, sample_freq } 1 + sample_type IP|TID|TIME|CPU|PERIOD|RAW|IDENTIFIER + read_format ID + inherit 1 + sample_id_all 1 + exclude_guest 1 + ------------------------------------------------------------ + sys_perf_event_open: pid -1 cpu 0 group_fd -1 flags 0x8 + sys_perf_event_open: pid -1 cpu 1 group_fd -1 flags 0x8 + sys_perf_event_open: pid -1 cpu 2 group_fd -1 flags 0x8 + sys_perf_event_open: pid -1 cpu 3 group_fd -1 flags 0x8 + ------------------------------------------------------------ + perf_event_attr: + type 1 + size 112 + config 0x9 + { sample_period, sample_freq } 1 + sample_type IP|TID|TIME|IDENTIFIER + read_format ID + disabled 1 + inherit 1 + exclude_kernel 1 + exclude_hv 1 + mmap 1 + comm 1 + enable_on_exec 1 + task 1 + sample_id_all 1 + mmap2 1 + comm_exec 1 + ------------------------------------------------------------ + sys_perf_event_open: pid 31104 cpu 0 group_fd -1 flags 0x8 + sys_perf_event_open: pid 31104 cpu 1 group_fd -1 flags 0x8 + sys_perf_event_open: pid 31104 cpu 2 group_fd -1 flags 0x8 + sys_perf_event_open: pid 31104 cpu 3 group_fd -1 flags 0x8 + mmap size 528384B + AUX area mmap length 4194304 + perf event ring buffer mmapped per cpu + Synthesizing auxtrace information + Linux + [ perf record: Woken up 1 times to write data ] + [ perf record: Captured and wrote 0.042 MB perf.data ] + +Note, the sched_switch event is only added if the user is permitted to use it +and only in per-cpu mode. + +Note also, the sched_switch event is only added if TSC packets are requested. +That is because, in the absence of timing information, the sched_switch events +cannot be matched against the Intel PT trace. + + +perf script +----------- + +By default, perf script will decode trace data found in the perf.data file. +This can be further controlled by new option --itrace. + + +New --itrace option +~~~~~~~~~~~~~~~~~~~ + +Having no option is the same as + + --itrace + +which, in turn, is the same as + + --itrace=cepwx + +The letters are: + + i synthesize "instructions" events + b synthesize "branches" events + x synthesize "transactions" events + w synthesize "ptwrite" events + p synthesize "power" events + c synthesize branches events (calls only) + r synthesize branches events (returns only) + e synthesize tracing error events + d create a debug log + g synthesize a call chain (use with i or x) + l synthesize last branch entries (use with i or x) + s skip initial number of events + +"Instructions" events look like they were recorded by "perf record -e +instructions". + +"Branches" events look like they were recorded by "perf record -e branches". "c" +and "r" can be combined to get calls and returns. + +"Transactions" events correspond to the start or end of transactions. The +'flags' field can be used in perf script to determine whether the event is a +tranasaction start, commit or abort. + +Note that "instructions", "branches" and "transactions" events depend on code +flow packets which can be disabled by using the config term "branch=0". Refer +to the config terms section above. + +"ptwrite" events record the payload of the ptwrite instruction and whether +"fup_on_ptw" was used. "ptwrite" events depend on PTWRITE packets which are +recorded only if the "ptw" config term was used. Refer to the config terms +section above. perf script "synth" field displays "ptwrite" information like +this: "ip: 0 payload: 0x123456789abcdef0" where "ip" is 1 if "fup_on_ptw" was +used. + +"Power" events correspond to power event packets and CBR (core-to-bus ratio) +packets. While CBR packets are always recorded when tracing is enabled, power +event packets are recorded only if the "pwr_evt" config term was used. Refer to +the config terms section above. The power events record information about +C-state changes, whereas CBR is indicative of CPU frequency. perf script +"event,synth" fields display information like this: + cbr: cbr: 22 freq: 2189 MHz (200%) + mwait: hints: 0x60 extensions: 0x1 + pwre: hw: 0 cstate: 2 sub-cstate: 0 + exstop: ip: 1 + pwrx: deepest cstate: 2 last cstate: 2 wake reason: 0x4 +Where: + "cbr" includes the frequency and the percentage of maximum non-turbo + "mwait" shows mwait hints and extensions + "pwre" shows C-state transitions (to a C-state deeper than C0) and + whether initiated by hardware + "exstop" indicates execution stopped and whether the IP was recorded + exactly, + "pwrx" indicates return to C0 +For more details refer to the Intel 64 and IA-32 Architectures Software +Developer Manuals. + +Error events show where the decoder lost the trace. Error events +are quite important. Users must know if what they are seeing is a complete +picture or not. + +The "d" option will cause the creation of a file "intel_pt.log" containing all +decoded packets and instructions. Note that this option slows down the decoder +and that the resulting file may be very large. + +In addition, the period of the "instructions" event can be specified. e.g. + + --itrace=i10us + +sets the period to 10us i.e. one instruction sample is synthesized for each 10 +microseconds of trace. Alternatives to "us" are "ms" (milliseconds), +"ns" (nanoseconds), "t" (TSC ticks) or "i" (instructions). + +"ms", "us" and "ns" are converted to TSC ticks. + +The timing information included with Intel PT does not give the time of every +instruction. Consequently, for the purpose of sampling, the decoder estimates +the time since the last timing packet based on 1 tick per instruction. The time +on the sample is *not* adjusted and reflects the last known value of TSC. + +For Intel PT, the default period is 100us. + +Setting it to a zero period means "as often as possible". + +In the case of Intel PT that is the same as a period of 1 and a unit of +'instructions' (i.e. --itrace=i1i). + +Also the call chain size (default 16, max. 1024) for instructions or +transactions events can be specified. e.g. + + --itrace=ig32 + --itrace=xg32 + +Also the number of last branch entries (default 64, max. 1024) for instructions or +transactions events can be specified. e.g. + + --itrace=il10 + --itrace=xl10 + +Note that last branch entries are cleared for each sample, so there is no overlap +from one sample to the next. + +To disable trace decoding entirely, use the option --no-itrace. + +It is also possible to skip events generated (instructions, branches, transactions) +at the beginning. This is useful to ignore initialization code. + + --itrace=i0nss1000000 + +skips the first million instructions. + +dump option +~~~~~~~~~~~ + +perf script has an option (-D) to "dump" the events i.e. display the binary +data. + +When -D is used, Intel PT packets are displayed. The packet decoder does not +pay attention to PSB packets, but just decodes the bytes - so the packets seen +by the actual decoder may not be identical in places where the data is corrupt. +One example of that would be when the buffer-switching interrupt has been too +slow, and the buffer has been filled completely. In that case, the last packet +in the buffer might be truncated and immediately followed by a PSB as the trace +continues in the next buffer. + +To disable the display of Intel PT packets, combine the -D option with +--no-itrace. + + +perf report +----------- + +By default, perf report will decode trace data found in the perf.data file. +This can be further controlled by new option --itrace exactly the same as +perf script, with the exception that the default is --itrace=igxe. + + +perf inject +----------- + +perf inject also accepts the --itrace option in which case tracing data is +removed and replaced with the synthesized events. e.g. + + perf inject --itrace -i perf.data -o perf.data.new + +Below is an example of using Intel PT with autofdo. It requires autofdo +(https://github.com/google/autofdo) and gcc version 5. The bubble +sort example is from the AutoFDO tutorial (https://gcc.gnu.org/wiki/AutoFDO/Tutorial) +amended to take the number of elements as a parameter. + + $ gcc-5 -O3 sort.c -o sort_optimized + $ ./sort_optimized 30000 + Bubble sorting array of 30000 elements + 2254 ms + + $ cat ~/.perfconfig + [intel-pt] + mispred-all = on + + $ perf record -e intel_pt//u ./sort 3000 + Bubble sorting array of 3000 elements + 58 ms + [ perf record: Woken up 2 times to write data ] + [ perf record: Captured and wrote 3.939 MB perf.data ] + $ perf inject -i perf.data -o inj --itrace=i100usle --strip + $ ./create_gcov --binary=./sort --profile=inj --gcov=sort.gcov -gcov_version=1 + $ gcc-5 -O3 -fauto-profile=sort.gcov sort.c -o sort_autofdo + $ ./sort_autofdo 30000 + Bubble sorting array of 30000 elements + 2155 ms + +Note there is currently no advantage to using Intel PT instead of LBR, but +that may change in the future if greater use is made of the data. + + +PEBS via Intel PT +----------------- + +Some hardware has the feature to redirect PEBS records to the Intel PT trace. +Recording is selected by using the aux-output config term e.g. + + perf record -c 10000 -e '{intel_pt/branch=0/,cycles/aux-output/ppp}' uname + +Note that currently, software only supports redirecting at most one PEBS event. + +To display PEBS events from the Intel PT trace, use the itrace 'o' option e.g. + + perf script --itrace=oe + + +SEE ALSO +-------- + +linkperf:perf-record[1], linkperf:perf-script[1], linkperf:perf-report[1], +linkperf:perf-inject[1] diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt index b23a4012a606..7f4db7592467 100644 --- a/tools/perf/Documentation/perf-record.txt +++ b/tools/perf/Documentation/perf-record.txt @@ -589,4 +589,4 @@ appended unit character - B/K/M/G SEE ALSO -------- -linkperf:perf-stat[1], linkperf:perf-list[1] +linkperf:perf-stat[1], linkperf:perf-list[1], linkperf:perf-intel-pt[1] diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt index db61f16ffa56..bd0a029d4c08 100644 --- a/tools/perf/Documentation/perf-report.txt +++ b/tools/perf/Documentation/perf-report.txt @@ -546,4 +546,5 @@ include::callchain-overhead-calculation.txt[] SEE ALSO -------- -linkperf:perf-stat[1], linkperf:perf-annotate[1], linkperf:perf-record[1] +linkperf:perf-stat[1], linkperf:perf-annotate[1], linkperf:perf-record[1], +linkperf:perf-intel-pt[1] diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt index 2599b057e47b..db6a36aac47e 100644 --- a/tools/perf/Documentation/perf-script.txt +++ b/tools/perf/Documentation/perf-script.txt @@ -429,4 +429,4 @@ include::itrace.txt[] SEE ALSO -------- linkperf:perf-record[1], linkperf:perf-script-perl[1], -linkperf:perf-script-python[1] +linkperf:perf-script-python[1], linkperf:perf-intel-pt[1] diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index d7c905f7520f..5f4045df76f4 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -186,24 +186,23 @@ static int hist_iter__branch_callback(struct hist_entry_iter *iter, { struct hist_entry *he = iter->he; struct report *rep = arg; - struct branch_info *bi; + struct branch_info *bi = he->branch_info; struct perf_sample *sample = iter->sample; struct evsel *evsel = iter->evsel; int err; + branch_type_count(&rep->brtype_stat, &bi->flags, + bi->from.addr, bi->to.addr); + if (!ui__has_annotation() && !rep->symbol_ipc) return 0; - bi = he->branch_info; err = addr_map_symbol__inc_samples(&bi->from, sample, evsel); if (err) goto out; err = addr_map_symbol__inc_samples(&bi->to, sample, evsel); - branch_type_count(&rep->brtype_stat, &bi->flags, - bi->from.addr, bi->to.addr); - out: return err; } diff --git a/tools/perf/pmu-events/arch/s390/cf_z15/crypto6.json b/tools/perf/pmu-events/arch/s390/cf_z15/crypto6.json index 5e36bc2468d0..c998e4f1d1d2 100644 --- a/tools/perf/pmu-events/arch/s390/cf_z15/crypto6.json +++ b/tools/perf/pmu-events/arch/s390/cf_z15/crypto6.json @@ -4,27 +4,27 @@ "EventCode": "80", "EventName": "ECC_FUNCTION_COUNT", "BriefDescription": "ECC Function Count", - "PublicDescription": "Long ECC function Count" + "PublicDescription": "This counter counts the total number of the elliptic-curve cryptography (ECC) functions issued by the CPU." }, { "Unit": "CPU-M-CF", "EventCode": "81", "EventName": "ECC_CYCLES_COUNT", "BriefDescription": "ECC Cycles Count", - "PublicDescription": "Long ECC Function cycles count" + "PublicDescription": "This counter counts the total number of CPU cycles when the ECC coprocessor is busy performing the elliptic-curve cryptography (ECC) functions issued by the CPU." }, { "Unit": "CPU-M-CF", "EventCode": "82", "EventName": "ECC_BLOCKED_FUNCTION_COUNT", "BriefDescription": "Ecc Blocked Function Count", - "PublicDescription": "Long ECC blocked function count" + "PublicDescription": "This counter counts the total number of the elliptic-curve cryptography (ECC) functions that are issued by the CPU and are blocked because the ECC coprocessor is busy performing a function issued by another CPU." }, { "Unit": "CPU-M-CF", "EventCode": "83", "EventName": "ECC_BLOCKED_CYCLES_COUNT", "BriefDescription": "ECC Blocked Cycles Count", - "PublicDescription": "Long ECC blocked cycles count" + "PublicDescription": "This counter counts the total number of CPU cycles blocked for the elliptic-curve cryptography (ECC) functions issued by the CPU because the ECC coprocessor is busy performing a function issued by another CPU." }, ] diff --git a/tools/perf/pmu-events/arch/s390/cf_z15/extended.json b/tools/perf/pmu-events/arch/s390/cf_z15/extended.json index 89e070727e1b..2df2e231e9ee 100644 --- a/tools/perf/pmu-events/arch/s390/cf_z15/extended.json +++ b/tools/perf/pmu-events/arch/s390/cf_z15/extended.json @@ -25,7 +25,7 @@ "EventCode": "131", "EventName": "DTLB2_HPAGE_WRITES", "BriefDescription": "DTLB2 One-Megabyte Page Writes", - "PublicDescription": "A translation entry was written into the Combined Region and Segment Table Entry array in the Level-2 TLB for a one-megabyte page or a Last Host Translation was done" + "PublicDescription": "A translation entry was written into the Combined Region and Segment Table Entry array in the Level-2 TLB for a one-megabyte page" }, { "Unit": "CPU-M-CF", @@ -356,6 +356,34 @@ "BriefDescription": "Aborted transactions in constrained TX mode using special completion logic", "PublicDescription": "A transaction abort has occurred in a constrained transactional-execution mode and the CPU is using special logic to allow the transaction to complete" }, + { + "Unit": "CPU-M-CF", + "EventCode": "247", + "EventName": "DFLT_ACCESS", + "BriefDescription": "Cycles CPU spent obtaining access to Deflate unit", + "PublicDescription": "Cycles CPU spent obtaining access to Deflate unit" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "252", + "EventName": "DFLT_CYCLES", + "BriefDescription": "Cycles CPU is using Deflate unit", + "PublicDescription": "Cycles CPU is using Deflate unit" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "264", + "EventName": "DFLT_CC", + "BriefDescription": "Increments by one for every DEFLATE CONVERSION CALL instruction executed", + "PublicDescription": "Increments by one for every DEFLATE CONVERSION CALL instruction executed" + }, + { + "Unit": "CPU-M-CF", + "EventCode": "265", + "EventName": "DFLT_CCERROR", + "BriefDescription": "Increments by one for every DEFLATE CONVERSION CALL instruction executed that ended in Condition Codes 0, 1 or 2", + "PublicDescription": "Increments by one for every DEFLATE CONVERSION CALL instruction executed that ended in Condition Codes 0, 1 or 2" + }, { "Unit": "CPU-M-CF", "EventCode": "448", diff --git a/tools/perf/pmu-events/arch/x86/cascadelakex/clx-metrics.json b/tools/perf/pmu-events/arch/x86/cascadelakex/clx-metrics.json index f94653229dd4..a728c6e5119b 100644 --- a/tools/perf/pmu-events/arch/x86/cascadelakex/clx-metrics.json +++ b/tools/perf/pmu-events/arch/x86/cascadelakex/clx-metrics.json @@ -215,7 +215,8 @@ "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", "MetricExpr": "( ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING + EPT.WALK_PENDING ) / ( 2 * cycles )", "MetricGroup": "TLB", - "MetricName": "Page_Walks_Utilization" + "MetricName": "Page_Walks_Utilization", + "MetricConstraint": "NO_NMI_WATCHDOG" }, { "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", diff --git a/tools/perf/pmu-events/arch/x86/skylake/skl-metrics.json b/tools/perf/pmu-events/arch/x86/skylake/skl-metrics.json index e7feb60f9fa9..f97e8316ad2f 100644 --- a/tools/perf/pmu-events/arch/x86/skylake/skl-metrics.json +++ b/tools/perf/pmu-events/arch/x86/skylake/skl-metrics.json @@ -215,7 +215,8 @@ "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", "MetricExpr": "( ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING + EPT.WALK_PENDING ) / ( 2 * cycles )", "MetricGroup": "TLB", - "MetricName": "Page_Walks_Utilization" + "MetricName": "Page_Walks_Utilization", + "MetricConstraint": "NO_NMI_WATCHDOG" }, { "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", diff --git a/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json b/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json index 21d7a0c2c2e8..35f5db1786f7 100644 --- a/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json +++ b/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json @@ -215,7 +215,8 @@ "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", "MetricExpr": "( ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING + EPT.WALK_PENDING ) / ( 2 * cycles )", "MetricGroup": "TLB", - "MetricName": "Page_Walks_Utilization" + "MetricName": "Page_Walks_Utilization", + "MetricConstraint": "NO_NMI_WATCHDOG" }, { "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses", diff --git a/tools/perf/pmu-events/jevents.c b/tools/perf/pmu-events/jevents.c index 27b4da80f751..3c4236a5bad8 100644 --- a/tools/perf/pmu-events/jevents.c +++ b/tools/perf/pmu-events/jevents.c @@ -323,7 +323,7 @@ static int print_events_table_entry(void *data, char *name, char *event, char *pmu, char *unit, char *perpkg, char *metric_expr, char *metric_name, char *metric_group, - char *deprecated) + char *deprecated, char *metric_constraint) { struct perf_entry_data *pd = data; FILE *outfp = pd->outfp; @@ -357,6 +357,8 @@ static int print_events_table_entry(void *data, char *name, char *event, fprintf(outfp, "\t.metric_group = \"%s\",\n", metric_group); if (deprecated) fprintf(outfp, "\t.deprecated = \"%s\",\n", deprecated); + if (metric_constraint) + fprintf(outfp, "\t.metric_constraint = \"%s\",\n", metric_constraint); fprintf(outfp, "},\n"); return 0; @@ -375,6 +377,7 @@ struct event_struct { char *metric_name; char *metric_group; char *deprecated; + char *metric_constraint; }; #define ADD_EVENT_FIELD(field) do { if (field) { \ @@ -422,7 +425,7 @@ static int save_arch_std_events(void *data, char *name, char *event, char *desc, char *long_desc, char *pmu, char *unit, char *perpkg, char *metric_expr, char *metric_name, char *metric_group, - char *deprecated) + char *deprecated, char *metric_constraint) { struct event_struct *es; @@ -486,7 +489,7 @@ try_fixup(const char *fn, char *arch_std, char **event, char **desc, char **name, char **long_desc, char **pmu, char **filter, char **perpkg, char **unit, char **metric_expr, char **metric_name, char **metric_group, unsigned long long eventcode, - char **deprecated) + char **deprecated, char **metric_constraint) { /* try to find matching event from arch standard values */ struct event_struct *es; @@ -515,7 +518,7 @@ int json_events(const char *fn, char *pmu, char *unit, char *perpkg, char *metric_expr, char *metric_name, char *metric_group, - char *deprecated), + char *deprecated, char *metric_constraint), void *data) { int err; @@ -545,6 +548,7 @@ int json_events(const char *fn, char *metric_name = NULL; char *metric_group = NULL; char *deprecated = NULL; + char *metric_constraint = NULL; char *arch_std = NULL; unsigned long long eventcode = 0; struct msrmap *msr = NULL; @@ -629,6 +633,8 @@ int json_events(const char *fn, addfield(map, &metric_name, "", "", val); } else if (json_streq(map, field, "MetricGroup")) { addfield(map, &metric_group, "", "", val); + } else if (json_streq(map, field, "MetricConstraint")) { + addfield(map, &metric_constraint, "", "", val); } else if (json_streq(map, field, "MetricExpr")) { addfield(map, &metric_expr, "", "", val); for (s = metric_expr; *s; s++) @@ -670,13 +676,13 @@ int json_events(const char *fn, &long_desc, &pmu, &filter, &perpkg, &unit, &metric_expr, &metric_name, &metric_group, eventcode, - &deprecated); + &deprecated, &metric_constraint); if (err) goto free_strings; } err = func(data, name, real_event(name, event), desc, long_desc, pmu, unit, perpkg, metric_expr, metric_name, - metric_group, deprecated); + metric_group, deprecated, metric_constraint); free_strings: free(event); free(desc); @@ -691,6 +697,7 @@ int json_events(const char *fn, free(metric_expr); free(metric_name); free(metric_group); + free(metric_constraint); free(arch_std); if (err) diff --git a/tools/perf/pmu-events/jevents.h b/tools/perf/pmu-events/jevents.h index 5cda49a42143..2afc8304529e 100644 --- a/tools/perf/pmu-events/jevents.h +++ b/tools/perf/pmu-events/jevents.h @@ -8,7 +8,7 @@ int json_events(const char *fn, char *pmu, char *unit, char *perpkg, char *metric_expr, char *metric_name, char *metric_group, - char *deprecated), + char *deprecated, char *metric_constraint), void *data); char *get_cpu_str(void); diff --git a/tools/perf/pmu-events/pmu-events.h b/tools/perf/pmu-events/pmu-events.h index caeb577d36c9..53e76d5d5b37 100644 --- a/tools/perf/pmu-events/pmu-events.h +++ b/tools/perf/pmu-events/pmu-events.h @@ -18,6 +18,7 @@ struct pmu_event { const char *metric_name; const char *metric_group; const char *deprecated; + const char *metric_constraint; }; /* diff --git a/tools/perf/scripts/perl/check-perf-trace.pl b/tools/perf/scripts/perl/check-perf-trace.pl index 4e7076c20616..d307ce8fd6ed 100644 --- a/tools/perf/scripts/perl/check-perf-trace.pl +++ b/tools/perf/scripts/perl/check-perf-trace.pl @@ -28,7 +28,7 @@ sub trace_end sub irq::softirq_entry { my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm, + $common_pid, $common_comm, $common_callchain, $vec) = @_; print_header($event_name, $common_cpu, $common_secs, $common_nsecs, @@ -43,7 +43,7 @@ sub irq::softirq_entry sub kmem::kmalloc { my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm, + $common_pid, $common_comm, $common_callchain, $call_site, $ptr, $bytes_req, $bytes_alloc, $gfp_flags) = @_; @@ -92,7 +92,7 @@ sub print_unhandled sub trace_unhandled { my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm) = @_; + $common_pid, $common_comm, $common_callchain) = @_; $unhandled{$event_name}++; } diff --git a/tools/perf/scripts/perl/failed-syscalls.pl b/tools/perf/scripts/perl/failed-syscalls.pl index 55e7ae4c5c88..05954a8f363a 100644 --- a/tools/perf/scripts/perl/failed-syscalls.pl +++ b/tools/perf/scripts/perl/failed-syscalls.pl @@ -18,7 +18,7 @@ my %failed_syscalls; sub raw_syscalls::sys_exit { my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm, + $common_pid, $common_comm, $common_callchain, $id, $ret) = @_; if ($ret < 0) { diff --git a/tools/perf/scripts/perl/rw-by-file.pl b/tools/perf/scripts/perl/rw-by-file.pl index 168fa5e94b44..92a750b8552b 100644 --- a/tools/perf/scripts/perl/rw-by-file.pl +++ b/tools/perf/scripts/perl/rw-by-file.pl @@ -28,7 +28,7 @@ my %writes; sub syscalls::sys_enter_read { my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm, $nr, $fd, $buf, $count) = @_; + $common_pid, $common_comm, $common_callchain, $nr, $fd, $buf, $count) = @_; if ($common_comm eq $for_comm) { $reads{$fd}{bytes_requested} += $count; @@ -39,7 +39,7 @@ sub syscalls::sys_enter_read sub syscalls::sys_enter_write { my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm, $nr, $fd, $buf, $count) = @_; + $common_pid, $common_comm, $common_callchain, $nr, $fd, $buf, $count) = @_; if ($common_comm eq $for_comm) { $writes{$fd}{bytes_written} += $count; @@ -98,7 +98,7 @@ sub print_unhandled sub trace_unhandled { my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm) = @_; + $common_pid, $common_comm, $common_callchain) = @_; $unhandled{$event_name}++; } diff --git a/tools/perf/scripts/perl/rw-by-pid.pl b/tools/perf/scripts/perl/rw-by-pid.pl index 495698250b2f..d789fe39caab 100644 --- a/tools/perf/scripts/perl/rw-by-pid.pl +++ b/tools/perf/scripts/perl/rw-by-pid.pl @@ -24,7 +24,7 @@ my %writes; sub syscalls::sys_exit_read { my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm, + $common_pid, $common_comm, $common_callchain, $nr, $ret) = @_; if ($ret > 0) { @@ -40,7 +40,7 @@ sub syscalls::sys_exit_read sub syscalls::sys_enter_read { my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm, + $common_pid, $common_comm, $common_callchain, $nr, $fd, $buf, $count) = @_; $reads{$common_pid}{bytes_requested} += $count; @@ -51,7 +51,7 @@ sub syscalls::sys_enter_read sub syscalls::sys_exit_write { my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm, + $common_pid, $common_comm, $common_callchain, $nr, $ret) = @_; if ($ret <= 0) { @@ -62,7 +62,7 @@ sub syscalls::sys_exit_write sub syscalls::sys_enter_write { my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm, + $common_pid, $common_comm, $common_callchain, $nr, $fd, $buf, $count) = @_; $writes{$common_pid}{bytes_written} += $count; @@ -178,7 +178,7 @@ sub print_unhandled sub trace_unhandled { my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm) = @_; + $common_pid, $common_comm, $common_callchain) = @_; $unhandled{$event_name}++; } diff --git a/tools/perf/scripts/perl/rwtop.pl b/tools/perf/scripts/perl/rwtop.pl index 6473442568a2..eba4df67af6b 100644 --- a/tools/perf/scripts/perl/rwtop.pl +++ b/tools/perf/scripts/perl/rwtop.pl @@ -35,7 +35,7 @@ if (!$interval) { sub syscalls::sys_exit_read { my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm, + $common_pid, $common_comm, $common_callchain, $nr, $ret) = @_; print_check(); @@ -53,7 +53,7 @@ sub syscalls::sys_exit_read sub syscalls::sys_enter_read { my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm, + $common_pid, $common_comm, $common_callchain, $nr, $fd, $buf, $count) = @_; print_check(); @@ -66,7 +66,7 @@ sub syscalls::sys_enter_read sub syscalls::sys_exit_write { my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm, + $common_pid, $common_comm, $common_callchain, $nr, $ret) = @_; print_check(); @@ -79,7 +79,7 @@ sub syscalls::sys_exit_write sub syscalls::sys_enter_write { my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm, + $common_pid, $common_comm, $common_callchain, $nr, $fd, $buf, $count) = @_; print_check(); @@ -197,7 +197,7 @@ sub print_unhandled sub trace_unhandled { my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm) = @_; + $common_pid, $common_comm, $common_callchain) = @_; $unhandled{$event_name}++; } diff --git a/tools/perf/scripts/perl/wakeup-latency.pl b/tools/perf/scripts/perl/wakeup-latency.pl index efcfec5e347a..53444ff4ec7f 100644 --- a/tools/perf/scripts/perl/wakeup-latency.pl +++ b/tools/perf/scripts/perl/wakeup-latency.pl @@ -28,7 +28,7 @@ my $total_wakeups = 0; sub sched::sched_switch { my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm, + $common_pid, $common_comm, $common_callchain, $prev_comm, $prev_pid, $prev_prio, $prev_state, $next_comm, $next_pid, $next_prio) = @_; @@ -51,7 +51,7 @@ sub sched::sched_switch sub sched::sched_wakeup { my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm, + $common_pid, $common_comm, $common_callchain, $comm, $pid, $prio, $success, $target_cpu) = @_; $last_wakeup{$target_cpu}{ts} = nsecs($common_secs, $common_nsecs); @@ -101,7 +101,7 @@ sub print_unhandled sub trace_unhandled { my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm) = @_; + $common_pid, $common_comm, $common_callchain) = @_; $unhandled{$event_name}++; } diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c index 5f05db75cdd8..54d9516c9839 100644 --- a/tools/perf/tests/builtin-test.c +++ b/tools/perf/tests/builtin-test.c @@ -543,8 +543,11 @@ static int run_shell_tests(int argc, const char *argv[], int i, int width) return -1; dir = opendir(st.dir); - if (!dir) + if (!dir) { + pr_err("failed to open shell test directory: %s\n", + st.dir); return -1; + } for_each_shell_test(dir, st.dir, ent) { int curr = i++; diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c index b3b3fe3ea345..62d2f9b9ce1b 100644 --- a/tools/perf/util/cs-etm.c +++ b/tools/perf/util/cs-etm.c @@ -363,6 +363,23 @@ struct cs_etm_packet_queue return NULL; } +static void cs_etm__packet_swap(struct cs_etm_auxtrace *etm, + struct cs_etm_traceid_queue *tidq) +{ + struct cs_etm_packet *tmp; + + if (etm->sample_branches || etm->synth_opts.last_branch || + etm->sample_instructions) { + /* + * Swap PACKET with PREV_PACKET: PACKET becomes PREV_PACKET for + * the next incoming packet. + */ + tmp = tidq->packet; + tidq->packet = tidq->prev_packet; + tidq->prev_packet = tmp; + } +} + static void cs_etm__packet_dump(const char *pkt_string) { const char *color = PERF_COLOR_BLUE; @@ -945,7 +962,7 @@ static inline u64 cs_etm__instr_addr(struct cs_etm_queue *etmq, if (packet->isa == CS_ETM_ISA_T32) { u64 addr = packet->start_addr; - while (offset > 0) { + while (offset) { addr += cs_etm__t32_instr_size(etmq, trace_chan_id, addr); offset--; @@ -1134,10 +1151,8 @@ static int cs_etm__synth_instruction_sample(struct cs_etm_queue *etmq, cs_etm__copy_insn(etmq, tidq->trace_chan_id, tidq->packet, &sample); - if (etm->synth_opts.last_branch) { - cs_etm__copy_last_branch_rb(etmq, tidq); + if (etm->synth_opts.last_branch) sample.branch_stack = tidq->last_branch; - } if (etm->synth_opts.inject) { ret = cs_etm__inject_event(event, &sample, @@ -1153,9 +1168,6 @@ static int cs_etm__synth_instruction_sample(struct cs_etm_queue *etmq, "CS ETM Trace: failed to deliver instruction event, error %d\n", ret); - if (etm->synth_opts.last_branch) - cs_etm__reset_last_branch_rb(tidq); - return ret; } @@ -1342,12 +1354,14 @@ static int cs_etm__sample(struct cs_etm_queue *etmq, struct cs_etm_traceid_queue *tidq) { struct cs_etm_auxtrace *etm = etmq->etm; - struct cs_etm_packet *tmp; int ret; u8 trace_chan_id = tidq->trace_chan_id; - u64 instrs_executed = tidq->packet->instr_count; + u64 instrs_prev; - tidq->period_instructions += instrs_executed; + /* Get instructions remainder from previous packet */ + instrs_prev = tidq->period_instructions; + + tidq->period_instructions += tidq->packet->instr_count; /* * Record a branch when the last instruction in @@ -1365,26 +1379,80 @@ static int cs_etm__sample(struct cs_etm_queue *etmq, * TODO: allow period to be defined in cycles and clock time */ - /* Get number of instructions executed after the sample point */ - u64 instrs_over = tidq->period_instructions - - etm->instructions_sample_period; + /* + * Below diagram demonstrates the instruction samples + * generation flows: + * + * Instrs Instrs Instrs Instrs + * Sample(n) Sample(n+1) Sample(n+2) Sample(n+3) + * | | | | + * V V V V + * -------------------------------------------------- + * ^ ^ + * | | + * Period Period + * instructions(Pi) instructions(Pi') + * + * | | + * \---------------- -----------------/ + * V + * tidq->packet->instr_count + * + * Instrs Sample(n...) are the synthesised samples occurring + * every etm->instructions_sample_period instructions - as + * defined on the perf command line. Sample(n) is being the + * last sample before the current etm packet, n+1 to n+3 + * samples are generated from the current etm packet. + * + * tidq->packet->instr_count represents the number of + * instructions in the current etm packet. + * + * Period instructions (Pi) contains the the number of + * instructions executed after the sample point(n) from the + * previous etm packet. This will always be less than + * etm->instructions_sample_period. + * + * When generate new samples, it combines with two parts + * instructions, one is the tail of the old packet and another + * is the head of the new coming packet, to generate + * sample(n+1); sample(n+2) and sample(n+3) consume the + * instructions with sample period. After sample(n+3), the rest + * instructions will be used by later packet and it is assigned + * to tidq->period_instructions for next round calculation. + */ /* - * Calculate the address of the sampled instruction (-1 as - * sample is reported as though instruction has just been - * executed, but PC has not advanced to next instruction) + * Get the initial offset into the current packet instructions; + * entry conditions ensure that instrs_prev is less than + * etm->instructions_sample_period. */ - u64 offset = (instrs_executed - instrs_over - 1); - u64 addr = cs_etm__instr_addr(etmq, trace_chan_id, - tidq->packet, offset); + u64 offset = etm->instructions_sample_period - instrs_prev; + u64 addr; - ret = cs_etm__synth_instruction_sample( - etmq, tidq, addr, etm->instructions_sample_period); - if (ret) - return ret; + /* Prepare last branches for instruction sample */ + if (etm->synth_opts.last_branch) + cs_etm__copy_last_branch_rb(etmq, tidq); - /* Carry remaining instructions into next sample period */ - tidq->period_instructions = instrs_over; + while (tidq->period_instructions >= + etm->instructions_sample_period) { + /* + * Calculate the address of the sampled instruction (-1 + * as sample is reported as though instruction has just + * been executed, but PC has not advanced to next + * instruction) + */ + addr = cs_etm__instr_addr(etmq, trace_chan_id, + tidq->packet, offset - 1); + ret = cs_etm__synth_instruction_sample( + etmq, tidq, addr, + etm->instructions_sample_period); + if (ret) + return ret; + + offset += etm->instructions_sample_period; + tidq->period_instructions -= + etm->instructions_sample_period; + } } if (etm->sample_branches) { @@ -1406,15 +1474,7 @@ static int cs_etm__sample(struct cs_etm_queue *etmq, } } - if (etm->sample_branches || etm->synth_opts.last_branch) { - /* - * Swap PACKET with PREV_PACKET: PACKET becomes PREV_PACKET for - * the next incoming packet. - */ - tmp = tidq->packet; - tidq->packet = tidq->prev_packet; - tidq->prev_packet = tmp; - } + cs_etm__packet_swap(etm, tidq); return 0; } @@ -1443,7 +1503,6 @@ static int cs_etm__flush(struct cs_etm_queue *etmq, { int err = 0; struct cs_etm_auxtrace *etm = etmq->etm; - struct cs_etm_packet *tmp; /* Handle start tracing packet */ if (tidq->prev_packet->sample_type == CS_ETM_EMPTY) @@ -1451,6 +1510,11 @@ static int cs_etm__flush(struct cs_etm_queue *etmq, if (etmq->etm->synth_opts.last_branch && tidq->prev_packet->sample_type == CS_ETM_RANGE) { + u64 addr; + + /* Prepare last branches for instruction sample */ + cs_etm__copy_last_branch_rb(etmq, tidq); + /* * Generate a last branch event for the branches left in the * circular buffer at the end of the trace. @@ -1458,7 +1522,7 @@ static int cs_etm__flush(struct cs_etm_queue *etmq, * Use the address of the end of the last reported execution * range */ - u64 addr = cs_etm__last_executed_instr(tidq->prev_packet); + addr = cs_etm__last_executed_instr(tidq->prev_packet); err = cs_etm__synth_instruction_sample( etmq, tidq, addr, @@ -1478,15 +1542,11 @@ static int cs_etm__flush(struct cs_etm_queue *etmq, } swap_packet: - if (etm->sample_branches || etm->synth_opts.last_branch) { - /* - * Swap PACKET with PREV_PACKET: PACKET becomes PREV_PACKET for - * the next incoming packet. - */ - tmp = tidq->packet; - tidq->packet = tidq->prev_packet; - tidq->prev_packet = tmp; - } + cs_etm__packet_swap(etm, tidq); + + /* Reset last branches after flush the trace */ + if (etm->synth_opts.last_branch) + cs_etm__reset_last_branch_rb(tidq); return err; } @@ -1507,11 +1567,16 @@ static int cs_etm__end_block(struct cs_etm_queue *etmq, */ if (etmq->etm->synth_opts.last_branch && tidq->prev_packet->sample_type == CS_ETM_RANGE) { + u64 addr; + + /* Prepare last branches for instruction sample */ + cs_etm__copy_last_branch_rb(etmq, tidq); + /* * Use the address of the end of the last reported execution * range. */ - u64 addr = cs_etm__last_executed_instr(tidq->prev_packet); + addr = cs_etm__last_executed_instr(tidq->prev_packet); err = cs_etm__synth_instruction_sample( etmq, tidq, addr, diff --git a/tools/perf/util/expr.l b/tools/perf/util/expr.l index 1928f2a3dddc..eaad29243c23 100644 --- a/tools/perf/util/expr.l +++ b/tools/perf/util/expr.l @@ -79,10 +79,10 @@ symbol {spec}*{sym}*{spec}*{sym}* { int start_token; - start_token = parse_events_get_extra(yyscanner); + start_token = expr_get_extra(yyscanner); if (start_token) { - parse_events_set_extra(NULL, yyscanner); + expr_set_extra(NULL, yyscanner); return start_token; } } diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c index b342f744b1fc..53d96611e6a6 100644 --- a/tools/perf/util/map.c +++ b/tools/perf/util/map.c @@ -44,8 +44,8 @@ static inline int is_no_dso_memory(const char *filename) static inline int is_android_lib(const char *filename) { - return !strncmp(filename, "/data/app-lib", 13) || - !strncmp(filename, "/system/lib", 11); + return strstarts(filename, "/data/app-lib/") || + strstarts(filename, "/system/lib/"); } static inline bool replace_android_lib(const char *filename, char *newfilename) @@ -65,7 +65,7 @@ static inline bool replace_android_lib(const char *filename, char *newfilename) app_abi_length = strlen(app_abi); - if (!strncmp(filename, "/data/app-lib", 13)) { + if (strstarts(filename, "/data/app-lib/")) { char *apk_path; if (!app_abi_length) @@ -89,7 +89,7 @@ static inline bool replace_android_lib(const char *filename, char *newfilename) return true; } - if (!strncmp(filename, "/system/lib/", 12)) { + if (strstarts(filename, "/system/lib/")) { char *ndk, *app; const char *arch; size_t ndk_length; diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c index 02aee946b6c1..c3a8c701609a 100644 --- a/tools/perf/util/metricgroup.c +++ b/tools/perf/util/metricgroup.c @@ -22,6 +22,8 @@ #include #include #include +#include +#include "util.h" struct metric_event *metricgroup__lookup(struct rblist *metric_events, struct evsel *evsel, @@ -399,13 +401,85 @@ void metricgroup__print(bool metrics, bool metricgroups, char *filter, strlist__delete(metriclist); } +static void metricgroup__add_metric_weak_group(struct strbuf *events, + const char **ids, + int idnum) +{ + bool no_group = false; + int i; + + for (i = 0; i < idnum; i++) { + pr_debug("found event %s\n", ids[i]); + /* + * Duration time maps to a software event and can make + * groups not count. Always use it outside a + * group. + */ + if (!strcmp(ids[i], "duration_time")) { + if (i > 0) + strbuf_addf(events, "}:W,"); + strbuf_addf(events, "duration_time"); + no_group = true; + continue; + } + strbuf_addf(events, "%s%s", + i == 0 || no_group ? "{" : ",", + ids[i]); + no_group = false; + } + if (!no_group) + strbuf_addf(events, "}:W"); +} + +static void metricgroup__add_metric_non_group(struct strbuf *events, + const char **ids, + int idnum) +{ + int i; + + for (i = 0; i < idnum; i++) + strbuf_addf(events, ",%s", ids[i]); +} + +static void metricgroup___watchdog_constraint_hint(const char *name, bool foot) +{ + static bool violate_nmi_constraint; + + if (!foot) { + pr_warning("Splitting metric group %s into standalone metrics.\n", name); + violate_nmi_constraint = true; + return; + } + + if (!violate_nmi_constraint) + return; + + pr_warning("Try disabling the NMI watchdog to comply NO_NMI_WATCHDOG metric constraint:\n" + " echo 0 > /proc/sys/kernel/nmi_watchdog\n" + " perf stat ...\n" + " echo 1 > /proc/sys/kernel/nmi_watchdog\n"); +} + +static bool metricgroup__has_constraint(struct pmu_event *pe) +{ + if (!pe->metric_constraint) + return false; + + if (!strcmp(pe->metric_constraint, "NO_NMI_WATCHDOG") && + sysctl__nmi_watchdog_enabled()) { + metricgroup___watchdog_constraint_hint(pe->metric_name, false); + return true; + } + + return false; +} + static int metricgroup__add_metric(const char *metric, struct strbuf *events, struct list_head *group_list) { struct pmu_events_map *map = perf_pmu__find_map(NULL); struct pmu_event *pe; - int ret = -EINVAL; - int i, j; + int i, ret = -EINVAL; if (!map) return 0; @@ -422,7 +496,6 @@ static int metricgroup__add_metric(const char *metric, struct strbuf *events, const char **ids; int idnum; struct egroup *eg; - bool no_group = false; pr_debug("metric expr %s for %s\n", pe->metric_expr, pe->metric_name); @@ -431,27 +504,11 @@ static int metricgroup__add_metric(const char *metric, struct strbuf *events, continue; if (events->len > 0) strbuf_addf(events, ","); - for (j = 0; j < idnum; j++) { - pr_debug("found event %s\n", ids[j]); - /* - * Duration time maps to a software event and can make - * groups not count. Always use it outside a - * group. - */ - if (!strcmp(ids[j], "duration_time")) { - if (j > 0) - strbuf_addf(events, "}:W,"); - strbuf_addf(events, "duration_time"); - no_group = true; - continue; - } - strbuf_addf(events, "%s%s", - j == 0 || no_group ? "{" : ",", - ids[j]); - no_group = false; - } - if (!no_group) - strbuf_addf(events, "}:W"); + + if (metricgroup__has_constraint(pe)) + metricgroup__add_metric_non_group(events, ids, idnum); + else + metricgroup__add_metric_weak_group(events, ids, idnum); eg = malloc(sizeof(struct egroup)); if (!eg) { @@ -493,6 +550,10 @@ static int metricgroup__add_metric_list(const char *list, struct strbuf *events, } } free(nlist); + + if (!ret) + metricgroup___watchdog_constraint_hint(NULL, true); + return ret; } diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c index 3b664fa673a6..ab7108d22428 100644 --- a/tools/perf/util/mmap.c +++ b/tools/perf/util/mmap.c @@ -98,20 +98,29 @@ static int perf_mmap__aio_bind(struct mmap *map, int idx, int cpu, int affinity) { void *data; size_t mmap_len; - unsigned long node_mask; + unsigned long *node_mask; + unsigned long node_index; + int err = 0; if (affinity != PERF_AFFINITY_SYS && cpu__max_node() > 1) { data = map->aio.data[idx]; mmap_len = mmap__mmap_len(map); - node_mask = 1UL << cpu__get_node(cpu); - if (mbind(data, mmap_len, MPOL_BIND, &node_mask, 1, 0)) { - pr_err("Failed to bind [%p-%p] AIO buffer to node %d: error %m\n", - data, data + mmap_len, cpu__get_node(cpu)); + node_index = cpu__get_node(cpu); + node_mask = bitmap_alloc(node_index + 1); + if (!node_mask) { + pr_err("Failed to allocate node mask for mbind: error %m\n"); return -1; } + set_bit(node_index, node_mask); + if (mbind(data, mmap_len, MPOL_BIND, node_mask, node_index + 1 + 1, 0)) { + pr_err("Failed to bind [%p-%p] AIO buffer to node %lu: error %m\n", + data, data + mmap_len, node_index); + err = -1; + } + bitmap_free(node_mask); } - return 0; + return err; } #else /* !HAVE_LIBNUMA_SUPPORT */ static int perf_mmap__aio_alloc(struct mmap *map, int idx) diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c index d89cb0da90f8..76c6052b12e2 100644 --- a/tools/perf/util/stat-display.c +++ b/tools/perf/util/stat-display.c @@ -16,6 +16,7 @@ #include #include "cgroup.h" #include +#include "util.h" #define CNTR_NOT_SUPPORTED "" #define CNTR_NOT_COUNTED "" @@ -1097,7 +1098,6 @@ static void print_footer(struct perf_stat_config *config) { double avg = avg_stats(config->walltime_nsecs_stats) / NSEC_PER_SEC; FILE *output = config->output; - int n; if (!config->null_run) fprintf(output, "\n"); @@ -1131,9 +1131,7 @@ static void print_footer(struct perf_stat_config *config) } fprintf(output, "\n\n"); - if (config->print_free_counters_hint && - sysctl__read_int("kernel/nmi_watchdog", &n) >= 0 && - n > 0) + if (config->print_free_counters_hint && sysctl__nmi_watchdog_enabled()) fprintf(output, "Some events weren't counted. Try disabling the NMI watchdog:\n" " echo 0 > /proc/sys/kernel/nmi_watchdog\n" diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c index dd3e6f43fb86..3f28af39f9c6 100644 --- a/tools/perf/util/synthetic-events.c +++ b/tools/perf/util/synthetic-events.c @@ -345,6 +345,7 @@ int perf_event__synthesize_mmap_events(struct perf_tool *tool, continue; event->mmap2.ino = (u64)ino; + event->mmap2.ino_generation = 0; /* * Just like the kernel, see __perf_event_mmap in kernel/perf_event.c diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c index 969ae560dad9..d707c9624dd9 100644 --- a/tools/perf/util/util.c +++ b/tools/perf/util/util.c @@ -55,6 +55,24 @@ int sysctl__max_stack(void) return sysctl_perf_event_max_stack; } +bool sysctl__nmi_watchdog_enabled(void) +{ + static bool cached; + static bool nmi_watchdog; + int value; + + if (cached) + return nmi_watchdog; + + if (sysctl__read_int("kernel/nmi_watchdog", &value) < 0) + return false; + + nmi_watchdog = (value > 0) ? true : false; + cached = true; + + return nmi_watchdog; +} + bool test_attr__enabled; bool perf_host = true; diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h index 9969b8b46f7c..f486fdd3a538 100644 --- a/tools/perf/util/util.h +++ b/tools/perf/util/util.h @@ -29,6 +29,8 @@ size_t hex_width(u64 v); int sysctl__max_stack(void); +bool sysctl__nmi_watchdog_enabled(void); + int fetch_kernel_version(unsigned int *puint, char *str, size_t str_sz); #define KVER_VERSION(x) (((x) >> 16) & 0xff)