From ca575ad2093d7ca94238146c3c9267c61d2523bc Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 14 Jul 2016 11:23:25 -0300 Subject: [PATCH 01/24] tools lib traceevent: Add correct header for ipv6 definitions We need to include netinet/in.h to get the in6_addr struct definition, needed to build it on the Android NDK: In file included from event-parse.c:36:0: /home/acme/android/android-ndk-r12/platforms/android-24/arch-arm/usr/include/netinet/ip6.h:82:18: error: field 'ip6_src' has incomplete type struct in6_addr ip6_src; /* source address */ And it is the canonical way of getting IPv6 definitions, as described, for instance, in Linux's 'man ipv6' Doing that uncovers another problem: this source file uses PRIu64 but doesn't include it, depending on it being included by chance via the now replaced header (netinet/ip6.h), fix it. Cc: Adrian Hunter Cc: Chris Phlipot Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Steven Rostedt Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-tilr31n3yaba1whsd47qlwa3@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/traceevent/event-parse.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/lib/traceevent/event-parse.c b/tools/lib/traceevent/event-parse.c index 3a7bd175f73c..664c90c8e22b 100644 --- a/tools/lib/traceevent/event-parse.c +++ b/tools/lib/traceevent/event-parse.c @@ -23,6 +23,7 @@ * Frederic Weisbecker gave his permission to relicense the code to * the Lesser General Public License. */ +#include #include #include #include @@ -33,7 +34,7 @@ #include #include -#include +#include #include "event-parse.h" #include "event-utils.h" From 09dd39d2d2682729174e40161df67f45c151f307 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 14 Jul 2016 12:02:04 -0300 Subject: [PATCH 02/24] perf tools: Do not provide dup sched_getcpu() prototype on Android The Bionic libc has this definition, so don't duplicate it. Cc: Adrian Hunter Cc: Chris Phlipot Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-rmd19832zkt07e4crdzyen9z@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/util.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h index 6178cab82374..843cbba8f9d3 100644 --- a/tools/perf/util/util.h +++ b/tools/perf/util/util.h @@ -360,7 +360,7 @@ void print_binary(unsigned char *data, size_t len, size_t bytes_per_line, print_binary_t printer, void *extra); -#ifndef __GLIBC__ +#if !defined(__GLIBC__) && !defined(__ANDROID__) extern int sched_getcpu(void); #endif From 8c98abff43616a2a9c208189f86d32cefafbe708 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 14 Jul 2016 12:03:04 -0300 Subject: [PATCH 03/24] tools: Make "__always_inline" just "inline" on Android As the gcc there is producing tons of: "warning: always_inline function might not be inlinable" At least on android-ndk-r12/platforms/android-24/arch-arm, so, for the time being, use this big hammer. Cc: Adrian Hunter Cc: Chris Phlipot Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-97l3eg3fnk5shmo4rsyyvj2t@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/linux/compiler.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tools/include/linux/compiler.h b/tools/include/linux/compiler.h index fa7208a32d76..e33fc1df3935 100644 --- a/tools/include/linux/compiler.h +++ b/tools/include/linux/compiler.h @@ -9,6 +9,17 @@ # define __always_inline inline __attribute__((always_inline)) #endif +#ifdef __ANDROID__ +/* + * FIXME: Big hammer to get rid of tons of: + * "warning: always_inline function might not be inlinable" + * + * At least on android-ndk-r12/platforms/android-24/arch-arm + */ +#undef __always_inline +#define __always_inline inline +#endif + #define __user #ifndef __attribute_const__ From 8811e8ea14c1056b09c794c67836b8a84584ddef Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 15 Jul 2016 10:05:19 -0300 Subject: [PATCH 04/24] perf tools: Just pr_debug() about not being able to read cacheline_size So far the cacheline_size is only useful for the "dcacheline" --sort order, i.e. if that is not used, which is the norm, then the user shouldn't care that he is running this, say, on an Android system where sysconf(_SC_LEVEL1_DCACHE_LINESIZE) and the /sys/devices/system/cpu/cpu0/cache/index0/coherency_line_size sysfs file isn't available. An upcoming patch will emit an warning only for "--sort ...,dcacheline,...". Cc: Adrian Hunter Cc: Chris Phlipot Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-580cnkvftunyvt9n7unsholi@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/perf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/perf.c b/tools/perf/perf.c index 4b2ff021434c..64c06961bfe4 100644 --- a/tools/perf/perf.c +++ b/tools/perf/perf.c @@ -503,7 +503,7 @@ void pthread__unblock_sigwinch(void) static void cache_line_size(int *cacheline_sizep) { if (sysfs__read_int("devices/system/cpu/cpu0/cache/index0/coherency_line_size", cacheline_sizep)) - perror("cannot determine cache line size"); + pr_debug("cannot determine cache line size"); } #endif From 0d203166de37ad50ea826f97570b3a2beea87c9d Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 15 Jul 2016 10:08:43 -0300 Subject: [PATCH 05/24] perf tools: Bail out at "--sort dcacheline" and cacheline_size not known There are cases where further work would be needed to overcome the fact that neither sysconf(_SC_LEVEL1_DCACHE_LINESIZE) nor /sys/devices/system/cpu/cpu0/cache/index0/coherency_line_size are available in some systems (Android, for instance), so bail out when such a situation takes place. Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-ho8d8g8mh0o2dri7ckcccafi@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/sort.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index 5854b4660a49..947d21f38398 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -2381,6 +2381,9 @@ static int sort_dimension__add(struct perf_hpp_list *list, const char *tok, if (sort__mode != SORT_MODE__MEMORY) return -EINVAL; + if (sd->entry == &sort_mem_dcacheline && cacheline_size == 0) + return -EINVAL; + if (sd->entry == &sort_mem_daddr_sym) list->sym = 1; @@ -2424,7 +2427,10 @@ static int setup_sort_list(struct perf_hpp_list *list, char *str, if (*tok) { ret = sort_dimension__add(list, tok, evlist, level); if (ret == -EINVAL) { - error("Invalid --sort key: `%s'", tok); + if (!cacheline_size && !strncasecmp(tok, "dcacheline", strlen(tok))) + error("The \"dcacheline\" --sort key needs to know the cacheline size and it couldn't be determined on this system"); + else + error("Invalid --sort key: `%s'", tok); break; } else if (ret == -ESRCH) { error("Unknown --sort key: `%s'", tok); From db49120a32b347fb93aea3319305932657dd118c Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 15 Jul 2016 09:29:57 +0200 Subject: [PATCH 06/24] tools lib api fs: Use base 0 in filename__read_ull By using 0 for base, the strtoull() detects the base automatically (see 'man strtoull'). ATM we have just one user of this function, the cpu__get_max_freq function reading the "cpuinfo_max_freq" sysfs file. It should not get affected by this change. Committer note: This change seems motivated by this discussion: "[PATCH] [RFC V1]s390/perf: fix 'start' address of module's map" http://lkml.kernel.org/r/20160711120155.GA29929@krava I.e. this patches paves the way for filename__read_ull() to be used in a S/390 related fix. Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Songshan Gong Link: http://lkml.kernel.org/r/1468567797-27564-4-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/api/fs/fs.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tools/lib/api/fs/fs.c b/tools/lib/api/fs/fs.c index 08556cf2c70d..ba7094b945ff 100644 --- a/tools/lib/api/fs/fs.c +++ b/tools/lib/api/fs/fs.c @@ -283,6 +283,11 @@ int filename__read_int(const char *filename, int *value) return err; } +/* + * Parses @value out of @filename with strtoull. + * By using 0 for base, the strtoull detects the + * base automatically (see man strtoull). + */ int filename__read_ull(const char *filename, unsigned long long *value) { char line[64]; @@ -292,7 +297,7 @@ int filename__read_ull(const char *filename, unsigned long long *value) return -1; if (read(fd, line, sizeof(line)) > 0) { - *value = strtoull(line, NULL, 10); + *value = strtoull(line, NULL, 0); if (*value != ULLONG_MAX) err = 0; } From 32a951b4fd6bbe60ef5d65930b1712321e241b27 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 14 Jul 2016 08:34:33 +0000 Subject: [PATCH 07/24] perf evlist: Drop redundant evsel->overwrite indicator evsel->overwrite indicator means an event should be put into overwritable ring buffer. In current implementation, it equals to evsel->attr.write_backward. To reduce compliexity, remove evsel->overwrite, use evsel->attr.write_backward instead. In addition, in __perf_evsel__open(), if kernel doesn't support write_backward and user explicitly set it in evsel, don't fallback like other missing feature, since it is meaningless to fall back to a forward ring buffer in this case: we are unable to stably read from an forward overwritable ring buffer. Cc: He Kuang Cc: Jiri Olsa Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Nilay Vaish Cc: Wang Nan Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1468485287-33422-2-git-send-email-wangnan0@huawei.com Signed-off-by: Wang Nan Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/backward-ring-buffer.c | 1 + tools/perf/util/evlist.c | 4 ++-- tools/perf/util/evsel.c | 12 +++++------- tools/perf/util/evsel.h | 1 - 4 files changed, 8 insertions(+), 10 deletions(-) diff --git a/tools/perf/tests/backward-ring-buffer.c b/tools/perf/tests/backward-ring-buffer.c index f20ea4c0d0cb..5cee3873f2b5 100644 --- a/tools/perf/tests/backward-ring-buffer.c +++ b/tools/perf/tests/backward-ring-buffer.c @@ -101,6 +101,7 @@ int test__backward_ring_buffer(int subtest __maybe_unused) return TEST_FAIL; } + evlist->backward = true; err = perf_evlist__create_maps(evlist, &opts.target); if (err < 0) { pr_debug("Not enough memory to create thread/cpu maps\n"); diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 862e69c2690d..6803f5ccd15e 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -1003,7 +1003,7 @@ static bool perf_evlist__should_poll(struct perf_evlist *evlist __maybe_unused, struct perf_evsel *evsel) { - if (evsel->overwrite) + if (evsel->attr.write_backward) return false; return true; } @@ -1018,7 +1018,7 @@ static int perf_evlist__mmap_per_evsel(struct perf_evlist *evlist, int idx, evlist__for_each_entry(evlist, evsel) { int fd; - if (evsel->overwrite != (evlist->overwrite && evlist->backward)) + if (!!evsel->attr.write_backward != (evlist->overwrite && evlist->backward)) continue; if (evsel->system_wide && thread) diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index ba0f59fa3d5d..9ac2f92ce88d 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -1377,6 +1377,9 @@ static int __perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus, int pid = -1, err; enum { NO_CHANGE, SET_TO_MAX, INCREASED_MAX } set_rlimit = NO_CHANGE; + if (perf_missing_features.write_backward && evsel->attr.write_backward) + return -EINVAL; + if (evsel->system_wide) nthreads = 1; else @@ -1407,11 +1410,6 @@ fallback_missing_features: if (perf_missing_features.lbr_flags) evsel->attr.branch_sample_type &= ~(PERF_SAMPLE_BRANCH_NO_FLAGS | PERF_SAMPLE_BRANCH_NO_CYCLES); - if (perf_missing_features.write_backward) { - if (evsel->overwrite) - return -EINVAL; - evsel->attr.write_backward = false; - } retry_sample_id: if (perf_missing_features.sample_id_all) evsel->attr.sample_id_all = 0; @@ -1513,7 +1511,7 @@ try_fallback: */ if (!perf_missing_features.write_backward && evsel->attr.write_backward) { perf_missing_features.write_backward = true; - goto fallback_missing_features; + goto out_close; } else if (!perf_missing_features.clockid_wrong && evsel->attr.use_clockid) { perf_missing_features.clockid_wrong = true; goto fallback_missing_features; @@ -2422,7 +2420,7 @@ int perf_evsel__open_strerror(struct perf_evsel *evsel, struct target *target, "We found oprofile daemon running, please stop it and try again."); break; case EINVAL: - if (evsel->overwrite && perf_missing_features.write_backward) + if (evsel->attr.write_backward && perf_missing_features.write_backward) return scnprintf(msg, size, "Reading from overwrite event is not supported by this kernel."); if (perf_missing_features.clockid) return scnprintf(msg, size, "clockid feature not supported."); diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index d73391e8740e..e60cbfc2cd35 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -114,7 +114,6 @@ struct perf_evsel { bool tracking; bool per_pkg; bool precise_max; - bool overwrite; /* parse modifier helper */ int exclude_GH; int nr_members; From e81fcd43723d32e9c9dbb8e8d66f147b5b84256b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 Jul 2016 12:38:18 -0300 Subject: [PATCH 08/24] tools: Simplify BITS_PER_LONG define Do it using (__CHAR_BIT__ * __SIZEOF_LONG__), simpler, works everywhere, reduces the complexity by ditching CONFIG_64BIT, that was being synthesized from yet another set of defines, which proved fragile, breaking the build on linux-next for no obvious reasons. Committer Note: Except on: gcc version 4.1.2 20080704 (Red Hat 4.1.2-55) Fallback to __WORDSIZE in that case... Reported-by: Stephen Rothwell Signed-off-by: Peter Zijlstra Tested-by: Arnaldo Carvalho de Melo Cc: Andy Lutomirski Cc: H. Peter Anvin Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160715072243.GP30154@twins.programming.kicks-ass.net Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/asm-generic/bitsperlong.h | 23 ++--------------------- 1 file changed, 2 insertions(+), 21 deletions(-) diff --git a/tools/include/asm-generic/bitsperlong.h b/tools/include/asm-generic/bitsperlong.h index cfd661c6fc17..45eca517efb3 100644 --- a/tools/include/asm-generic/bitsperlong.h +++ b/tools/include/asm-generic/bitsperlong.h @@ -3,31 +3,12 @@ #include -/* - * In the kernel, where this file comes from, we can rely on CONFIG_64BIT, - * here we have to make amends with what the various compilers provides us - * to figure out if we're on a 64-bit machine... - */ #ifdef __SIZEOF_LONG__ -# if __SIZEOF_LONG__ == 8 -# define CONFIG_64BIT -# endif +#define BITS_PER_LONG (__CHAR_BIT__ * __SIZEOF_LONG__) #else -# ifdef __WORDSIZE -# if __WORDSIZE == 64 -# define CONFIG_64BIT -# endif -# else -# error Failed to determine BITS_PER_LONG value -# endif +#define BITS_PER_LONG __WORDSIZE #endif -#ifdef CONFIG_64BIT -#define BITS_PER_LONG 64 -#else -#define BITS_PER_LONG 32 -#endif /* CONFIG_64BIT */ - #if BITS_PER_LONG != __BITS_PER_LONG #error Inconsistent word size. Check asm/bitsperlong.h #endif From 2b4383470675c85add72725cc08840d36b409276 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Thu, 14 Jul 2016 08:34:34 +0000 Subject: [PATCH 09/24] tools lib fd array: Allow associating a pointer cookie with each entry Add a 'ptr' field to fdarray->priv array. This feature will be used by following commits, which introduce muiltiple 'struct perf_mmap' arrays for different types of mapping. Because of this, during fdarray__filter(), a simple 'idx' is not enough. Add a pointer cookie that allows to directly associate a 'struct perf_mmap' pointer to an fdarray entry. Signed-off-by: Wang Nan Acked-by: Jiri Olsa Cc: Adrian Hunter Cc: David Ahern Cc: He Kuang Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1468485287-33422-3-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/api/fd/array.h | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/lib/api/fd/array.h b/tools/lib/api/fd/array.h index e87fd800fa8d..71287dddc05f 100644 --- a/tools/lib/api/fd/array.h +++ b/tools/lib/api/fd/array.h @@ -22,6 +22,7 @@ struct fdarray { struct pollfd *entries; union { int idx; + void *ptr; } *priv; }; From 8db6d6b19e486eef3db41bbd74a1f4c2b82d7706 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Thu, 14 Jul 2016 08:34:35 +0000 Subject: [PATCH 10/24] perf evlist: Update mmap related APIs and helpers Currently, the evlist mmap related helpers and APIs accept evlist and idx, and dereference 'struct perf_mmap' by evlist->mmap[idx]. This is unnecessary, and force each evlist contains only one mmap array. Following commits are going to introduce multiple mmap arrays to a evlist. This patch refators these APIs and helpers, introduces functions accept perf_mmap pointer directly. New helpers and APIs are decoupled with perf_evlist, and become perf_mmap functions (so they have perf_mmap prefix). Old functions are reimplemented with new functions. Some of them will be removed in following commits. Signed-off-by: Wang Nan Acked-by: Jiri Olsa Cc: He Kuang Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Nilay Vaish Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1468485287-33422-4-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evlist.c | 143 +++++++++++++++++++++++++++------------ tools/perf/util/evlist.h | 12 ++++ 2 files changed, 110 insertions(+), 45 deletions(-) diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 6803f5ccd15e..a4137e02eab8 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -29,6 +29,7 @@ static void perf_evlist__mmap_put(struct perf_evlist *evlist, int idx); static void __perf_evlist__munmap(struct perf_evlist *evlist, int idx); +static void perf_mmap__munmap(struct perf_mmap *map); #define FD(e, x, y) (*(int *)xyarray__entry(e->fd, x, y)) #define SID(e, x, y) xyarray__entry(e->sample_id, x, y) @@ -781,9 +782,8 @@ broken_event: return event; } -union perf_event *perf_evlist__mmap_read_forward(struct perf_evlist *evlist, int idx) +union perf_event *perf_mmap__read_forward(struct perf_mmap *md, bool check_messup) { - struct perf_mmap *md = &evlist->mmap[idx]; u64 head; u64 old = md->prev; @@ -795,13 +795,12 @@ union perf_event *perf_evlist__mmap_read_forward(struct perf_evlist *evlist, int head = perf_mmap__read_head(md); - return perf_mmap__read(md, evlist->overwrite, old, head, &md->prev); + return perf_mmap__read(md, check_messup, old, head, &md->prev); } union perf_event * -perf_evlist__mmap_read_backward(struct perf_evlist *evlist, int idx) +perf_mmap__read_backward(struct perf_mmap *md) { - struct perf_mmap *md = &evlist->mmap[idx]; u64 head, end; u64 start = md->prev; @@ -836,6 +835,31 @@ perf_evlist__mmap_read_backward(struct perf_evlist *evlist, int idx) return perf_mmap__read(md, false, start, end, &md->prev); } +union perf_event *perf_evlist__mmap_read_forward(struct perf_evlist *evlist, int idx) +{ + struct perf_mmap *md = &evlist->mmap[idx]; + + /* + * Check messup is required for forward overwritable ring buffer: + * memory pointed by md->prev can be overwritten in this case. + * No need for read-write ring buffer: kernel stop outputting when + * it hit md->prev (perf_mmap__consume()). + */ + return perf_mmap__read_forward(md, evlist->overwrite); +} + +union perf_event *perf_evlist__mmap_read_backward(struct perf_evlist *evlist, int idx) +{ + struct perf_mmap *md = &evlist->mmap[idx]; + + /* + * No need to check messup for backward ring buffer: + * We can always read arbitrary long data from a backward + * ring buffer unless we forget to pause it before reading. + */ + return perf_mmap__read_backward(md); +} + union perf_event *perf_evlist__mmap_read(struct perf_evlist *evlist, int idx) { if (!evlist->backward) @@ -843,9 +867,8 @@ union perf_event *perf_evlist__mmap_read(struct perf_evlist *evlist, int idx) return perf_evlist__mmap_read_backward(evlist, idx); } -void perf_evlist__mmap_read_catchup(struct perf_evlist *evlist, int idx) +void perf_mmap__read_catchup(struct perf_mmap *md) { - struct perf_mmap *md = &evlist->mmap[idx]; u64 head; if (!atomic_read(&md->refcnt)) @@ -855,38 +878,54 @@ void perf_evlist__mmap_read_catchup(struct perf_evlist *evlist, int idx) md->prev = head; } +void perf_evlist__mmap_read_catchup(struct perf_evlist *evlist, int idx) +{ + perf_mmap__read_catchup(&evlist->mmap[idx]); +} + static bool perf_mmap__empty(struct perf_mmap *md) { return perf_mmap__read_head(md) == md->prev && !md->auxtrace_mmap.base; } +static void perf_mmap__get(struct perf_mmap *map) +{ + atomic_inc(&map->refcnt); +} + +static void perf_mmap__put(struct perf_mmap *md) +{ + BUG_ON(md->base && atomic_read(&md->refcnt) == 0); + + if (atomic_dec_and_test(&md->refcnt)) + perf_mmap__munmap(md); +} + static void perf_evlist__mmap_get(struct perf_evlist *evlist, int idx) { - atomic_inc(&evlist->mmap[idx].refcnt); + perf_mmap__get(&evlist->mmap[idx]); } static void perf_evlist__mmap_put(struct perf_evlist *evlist, int idx) { - struct perf_mmap *md = &evlist->mmap[idx]; - - BUG_ON(md->base && atomic_read(&md->refcnt) == 0); - - if (atomic_dec_and_test(&md->refcnt)) - __perf_evlist__munmap(evlist, idx); + perf_mmap__put(&evlist->mmap[idx]); } -void perf_evlist__mmap_consume(struct perf_evlist *evlist, int idx) +void perf_mmap__consume(struct perf_mmap *md, bool overwrite) { - struct perf_mmap *md = &evlist->mmap[idx]; - - if (!evlist->overwrite) { + if (!overwrite) { u64 old = md->prev; perf_mmap__write_tail(md, old); } if (atomic_read(&md->refcnt) == 1 && perf_mmap__empty(md)) - perf_evlist__mmap_put(evlist, idx); + perf_mmap__put(md); +} + +void perf_evlist__mmap_consume(struct perf_evlist *evlist, int idx) +{ + perf_mmap__consume(&evlist->mmap[idx], evlist->overwrite); } int __weak auxtrace_mmap__mmap(struct auxtrace_mmap *mm __maybe_unused, @@ -917,15 +956,20 @@ void __weak auxtrace_mmap_params__set_idx( { } +static void perf_mmap__munmap(struct perf_mmap *map) +{ + if (map->base != NULL) { + munmap(map->base, perf_mmap__mmap_len(map)); + map->base = NULL; + map->fd = -1; + atomic_set(&map->refcnt, 0); + } + auxtrace_mmap__munmap(&map->auxtrace_mmap); +} + static void __perf_evlist__munmap(struct perf_evlist *evlist, int idx) { - if (evlist->mmap[idx].base != NULL) { - munmap(evlist->mmap[idx].base, evlist->mmap_len); - evlist->mmap[idx].base = NULL; - evlist->mmap[idx].fd = -1; - atomic_set(&evlist->mmap[idx].refcnt, 0); - } - auxtrace_mmap__munmap(&evlist->mmap[idx].auxtrace_mmap); + perf_mmap__munmap(&evlist->mmap[idx]); } void perf_evlist__munmap(struct perf_evlist *evlist) @@ -941,20 +985,21 @@ void perf_evlist__munmap(struct perf_evlist *evlist) zfree(&evlist->mmap); } -static int perf_evlist__alloc_mmap(struct perf_evlist *evlist) +static struct perf_mmap *perf_evlist__alloc_mmap(struct perf_evlist *evlist) { int i; + struct perf_mmap *map; evlist->nr_mmaps = cpu_map__nr(evlist->cpus); if (cpu_map__empty(evlist->cpus)) evlist->nr_mmaps = thread_map__nr(evlist->threads); - evlist->mmap = zalloc(evlist->nr_mmaps * sizeof(struct perf_mmap)); - if (!evlist->mmap) - return -ENOMEM; + map = zalloc(evlist->nr_mmaps * sizeof(struct perf_mmap)); + if (!map) + return NULL; for (i = 0; i < evlist->nr_mmaps; i++) - evlist->mmap[i].fd = -1; - return 0; + map[i].fd = -1; + return map; } struct mmap_params { @@ -963,8 +1008,8 @@ struct mmap_params { struct auxtrace_mmap_params auxtrace_mp; }; -static int __perf_evlist__mmap(struct perf_evlist *evlist, int idx, - struct mmap_params *mp, int fd) +static int perf_mmap__mmap(struct perf_mmap *map, + struct mmap_params *mp, int fd) { /* * The last one will be done at perf_evlist__mmap_consume(), so that we @@ -979,26 +1024,32 @@ static int __perf_evlist__mmap(struct perf_evlist *evlist, int idx, * evlist layer can't just drop it when filtering events in * perf_evlist__filter_pollfd(). */ - atomic_set(&evlist->mmap[idx].refcnt, 2); - evlist->mmap[idx].prev = 0; - evlist->mmap[idx].mask = mp->mask; - evlist->mmap[idx].base = mmap(NULL, evlist->mmap_len, mp->prot, - MAP_SHARED, fd, 0); - if (evlist->mmap[idx].base == MAP_FAILED) { + atomic_set(&map->refcnt, 2); + map->prev = 0; + map->mask = mp->mask; + map->base = mmap(NULL, perf_mmap__mmap_len(map), mp->prot, + MAP_SHARED, fd, 0); + if (map->base == MAP_FAILED) { pr_debug2("failed to mmap perf event ring buffer, error %d\n", errno); - evlist->mmap[idx].base = NULL; + map->base = NULL; return -1; } - evlist->mmap[idx].fd = fd; + map->fd = fd; - if (auxtrace_mmap__mmap(&evlist->mmap[idx].auxtrace_mmap, - &mp->auxtrace_mp, evlist->mmap[idx].base, fd)) + if (auxtrace_mmap__mmap(&map->auxtrace_mmap, + &mp->auxtrace_mp, map->base, fd)) return -1; return 0; } +static int __perf_evlist__mmap(struct perf_evlist *evlist, int idx, + struct mmap_params *mp, int fd) +{ + return perf_mmap__mmap(&evlist->mmap[idx], mp, fd); +} + static bool perf_evlist__should_poll(struct perf_evlist *evlist __maybe_unused, struct perf_evsel *evsel) @@ -1248,7 +1299,9 @@ int perf_evlist__mmap_ex(struct perf_evlist *evlist, unsigned int pages, .prot = PROT_READ | (overwrite ? 0 : PROT_WRITE), }; - if (evlist->mmap == NULL && perf_evlist__alloc_mmap(evlist) < 0) + if (!evlist->mmap) + evlist->mmap = perf_evlist__alloc_mmap(evlist); + if (!evlist->mmap) return -ENOMEM; if (evlist->pollfd.entries == NULL && perf_evlist__alloc_pollfd(evlist) < 0) diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index afd087761a47..9e680c62c980 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -35,6 +35,12 @@ struct perf_mmap { char event_copy[PERF_SAMPLE_MAX_SIZE] __attribute__((aligned(8))); }; +static inline size_t +perf_mmap__mmap_len(struct perf_mmap *map) +{ + return map->mask + 1 + page_size; +} + struct perf_evlist { struct list_head entries; struct hlist_head heads[PERF_EVLIST__HLIST_SIZE]; @@ -129,6 +135,12 @@ struct perf_evsel *perf_evlist__id2evsel_strict(struct perf_evlist *evlist, struct perf_sample_id *perf_evlist__id2sid(struct perf_evlist *evlist, u64 id); +union perf_event *perf_mmap__read_forward(struct perf_mmap *map, bool check_messup); +union perf_event *perf_mmap__read_backward(struct perf_mmap *map); + +void perf_mmap__read_catchup(struct perf_mmap *md); +void perf_mmap__consume(struct perf_mmap *md, bool overwrite); + union perf_event *perf_evlist__mmap_read(struct perf_evlist *evlist, int idx); union perf_event *perf_evlist__mmap_read_forward(struct perf_evlist *evlist, From a4ea0ec4f24a721bea5447a27ad5fbcb89275bae Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Thu, 14 Jul 2016 08:34:36 +0000 Subject: [PATCH 11/24] perf record: Decouple record__mmap_read() and evlist. Perf evlist will have multiple mmap arrays. Update record__mmap_read(): it should read from 'struct perf_mmap' directly. Also, make record__mmap_read() ready to read from backward ring buffer. Signed-off-by: Wang Nan Acked-by: Jiri Olsa Cc: He Kuang Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Nilay Vaish Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1468485287-33422-5-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-record.c | 39 ++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index d9f5cc3a3667..d15517e849a3 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -119,11 +119,10 @@ backward_rb_find_range(void *buf, int mask, u64 head, u64 *start, u64 *end) } static int -rb_find_range(struct perf_evlist *evlist, - void *data, int mask, u64 head, u64 old, - u64 *start, u64 *end) +rb_find_range(void *data, int mask, u64 head, u64 old, + u64 *start, u64 *end, bool backward) { - if (!evlist->backward) { + if (!backward) { *start = old; *end = head; return 0; @@ -132,9 +131,10 @@ rb_find_range(struct perf_evlist *evlist, return backward_rb_find_range(data, mask, head, start, end); } -static int record__mmap_read(struct record *rec, struct perf_evlist *evlist, int idx) +static int +record__mmap_read(struct record *rec, struct perf_mmap *md, + bool overwrite, bool backward) { - struct perf_mmap *md = &evlist->mmap[idx]; u64 head = perf_mmap__read_head(md); u64 old = md->prev; u64 end = head, start = old; @@ -143,8 +143,8 @@ static int record__mmap_read(struct record *rec, struct perf_evlist *evlist, int void *buf; int rc = 0; - if (rb_find_range(evlist, data, md->mask, head, - old, &start, &end)) + if (rb_find_range(data, md->mask, head, + old, &start, &end, backward)) return -1; if (start == end) @@ -157,7 +157,7 @@ static int record__mmap_read(struct record *rec, struct perf_evlist *evlist, int WARN_ONCE(1, "failed to keep up with mmap data. (warn only once)\n"); md->prev = head; - perf_evlist__mmap_consume(evlist, idx); + perf_mmap__consume(md, overwrite || backward); return 0; } @@ -182,7 +182,7 @@ static int record__mmap_read(struct record *rec, struct perf_evlist *evlist, int } md->prev = head; - perf_evlist__mmap_consume(evlist, idx); + perf_mmap__consume(md, overwrite || backward); out: return rc; } @@ -498,20 +498,27 @@ static struct perf_event_header finished_round_event = { .type = PERF_RECORD_FINISHED_ROUND, }; -static int record__mmap_read_evlist(struct record *rec, struct perf_evlist *evlist) +static int record__mmap_read_evlist(struct record *rec, struct perf_evlist *evlist, + bool backward) { u64 bytes_written = rec->bytes_written; int i; int rc = 0; + struct perf_mmap *maps; if (!evlist) return 0; - for (i = 0; i < evlist->nr_mmaps; i++) { - struct auxtrace_mmap *mm = &evlist->mmap[i].auxtrace_mmap; + maps = evlist->mmap; + if (!maps) + return 0; - if (evlist->mmap[i].base) { - if (record__mmap_read(rec, evlist, i) != 0) { + for (i = 0; i < evlist->nr_mmaps; i++) { + struct auxtrace_mmap *mm = &maps[i].auxtrace_mmap; + + if (maps[i].base) { + if (record__mmap_read(rec, &maps[i], + evlist->overwrite, backward) != 0) { rc = -1; goto out; } @@ -539,7 +546,7 @@ static int record__mmap_read_all(struct record *rec) { int err; - err = record__mmap_read_evlist(rec, rec->evlist); + err = record__mmap_read_evlist(rec, rec->evlist, false); if (err) return err; From 4876075b3205af992bf1012f6d6fbc03593d55b9 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Thu, 14 Jul 2016 08:34:37 +0000 Subject: [PATCH 12/24] perf evlist: Record mmap cookie into fdarray private field Insetad of saving a index into fdarray entries private field, save the corresponding 'struct perf_mmap' pointer, and release them directly using perf_mmap__put(). Following commits introduce multiple mmap arrays to evlist. Without this patch, perf_evlist__munmap_filtered() is unable to retrive correct 'struct perf_mmap' pointer. Signed-off-by: Wang Nan Acked-by: Jiri Olsa Cc: He Kuang Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Nilay Vaish Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1468485287-33422-6-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evlist.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index a4137e02eab8..1462085a8618 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -30,6 +30,7 @@ static void perf_evlist__mmap_put(struct perf_evlist *evlist, int idx); static void __perf_evlist__munmap(struct perf_evlist *evlist, int idx); static void perf_mmap__munmap(struct perf_mmap *map); +static void perf_mmap__put(struct perf_mmap *map); #define FD(e, x, y) (*(int *)xyarray__entry(e->fd, x, y)) #define SID(e, x, y) xyarray__entry(e->sample_id, x, y) @@ -466,7 +467,8 @@ int perf_evlist__alloc_pollfd(struct perf_evlist *evlist) return 0; } -static int __perf_evlist__add_pollfd(struct perf_evlist *evlist, int fd, int idx, short revent) +static int __perf_evlist__add_pollfd(struct perf_evlist *evlist, int fd, + struct perf_mmap *map, short revent) { int pos = fdarray__add(&evlist->pollfd, fd, revent | POLLERR | POLLHUP); /* @@ -474,7 +476,7 @@ static int __perf_evlist__add_pollfd(struct perf_evlist *evlist, int fd, int idx * close the associated evlist->mmap[] entry. */ if (pos >= 0) { - evlist->pollfd.priv[pos].idx = idx; + evlist->pollfd.priv[pos].ptr = map; fcntl(fd, F_SETFL, O_NONBLOCK); } @@ -484,15 +486,16 @@ static int __perf_evlist__add_pollfd(struct perf_evlist *evlist, int fd, int idx int perf_evlist__add_pollfd(struct perf_evlist *evlist, int fd) { - return __perf_evlist__add_pollfd(evlist, fd, -1, POLLIN); + return __perf_evlist__add_pollfd(evlist, fd, NULL, POLLIN); } static void perf_evlist__munmap_filtered(struct fdarray *fda, int fd, void *arg __maybe_unused) { - struct perf_evlist *evlist = container_of(fda, struct perf_evlist, pollfd); + struct perf_mmap *map = fda->priv[fd].ptr; - perf_evlist__mmap_put(evlist, fda->priv[fd].idx); + if (map) + perf_mmap__put(map); } int perf_evlist__filter_pollfd(struct perf_evlist *evlist, short revents_and_mask) @@ -1098,7 +1101,7 @@ static int perf_evlist__mmap_per_evsel(struct perf_evlist *evlist, int idx, * Therefore don't add it for polling. */ if (!evsel->system_wide && - __perf_evlist__add_pollfd(evlist, fd, idx, revent) < 0) { + __perf_evlist__add_pollfd(evlist, fd, &evlist->mmap[idx], revent) < 0) { perf_evlist__mmap_put(evlist, idx); return -1; } From a1f72618346a4e7ce9cff6aec1a62737d5d08763 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Thu, 14 Jul 2016 08:34:38 +0000 Subject: [PATCH 13/24] perf evlist: Extract common code in mmap failure processing In perf_evlist__mmap_per_cpu() and perf_evlist__mmap_per_thread(), in case of mmap failure, successfully created maps should be cleared. Current code uses two loops calling __perf_evlist__munmap() for each function. This patch extracts common code to perf_evlist__munmap_nofree() and use previous introduced decoupled API perf_mmap__munmap(). Now __perf_evlist__munmap() can be removed because of no user. Signed-off-by: Wang Nan Acked-by: Jiri Olsa Cc: He Kuang Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Nilay Vaish Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1468485287-33422-7-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evlist.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 1462085a8618..54ae0a0bc22c 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -28,7 +28,6 @@ #include static void perf_evlist__mmap_put(struct perf_evlist *evlist, int idx); -static void __perf_evlist__munmap(struct perf_evlist *evlist, int idx); static void perf_mmap__munmap(struct perf_mmap *map); static void perf_mmap__put(struct perf_mmap *map); @@ -970,12 +969,7 @@ static void perf_mmap__munmap(struct perf_mmap *map) auxtrace_mmap__munmap(&map->auxtrace_mmap); } -static void __perf_evlist__munmap(struct perf_evlist *evlist, int idx) -{ - perf_mmap__munmap(&evlist->mmap[idx]); -} - -void perf_evlist__munmap(struct perf_evlist *evlist) +static void perf_evlist__munmap_nofree(struct perf_evlist *evlist) { int i; @@ -983,8 +977,12 @@ void perf_evlist__munmap(struct perf_evlist *evlist) return; for (i = 0; i < evlist->nr_mmaps; i++) - __perf_evlist__munmap(evlist, i); + perf_mmap__munmap(&evlist->mmap[i]); +} +void perf_evlist__munmap(struct perf_evlist *evlist) +{ + perf_evlist__munmap_nofree(evlist); zfree(&evlist->mmap); } @@ -1142,8 +1140,7 @@ static int perf_evlist__mmap_per_cpu(struct perf_evlist *evlist, return 0; out_unmap: - for (cpu = 0; cpu < nr_cpus; cpu++) - __perf_evlist__munmap(evlist, cpu); + perf_evlist__munmap_nofree(evlist); return -1; } @@ -1168,8 +1165,7 @@ static int perf_evlist__mmap_per_thread(struct perf_evlist *evlist, return 0; out_unmap: - for (thread = 0; thread < nr_threads; thread++) - __perf_evlist__munmap(evlist, thread); + perf_evlist__munmap_nofree(evlist); return -1; } From b2cb615d8aaba520fe351ff456f6c7730828b3fe Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Thu, 14 Jul 2016 08:34:39 +0000 Subject: [PATCH 14/24] perf evlist: Introduce backward_mmap array for evlist Add backward_mmap to evlist, free it together with normal mmap. Improve perf_evlist__pick_pc(), search backward_mmap if evlist->mmap is not available. This patch doesn't alloc this array. It will be allocated conditionally in the following commits. Signed-off-by: Wang Nan Acked-by: Jiri Olsa Cc: He Kuang Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Nilay Vaish Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1468485287-33422-8-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-record.c | 10 +++++++--- tools/perf/util/evlist.c | 12 ++++++++---- tools/perf/util/evlist.h | 1 + 3 files changed, 16 insertions(+), 7 deletions(-) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index d15517e849a3..dbcb22304398 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -509,7 +509,7 @@ static int record__mmap_read_evlist(struct record *rec, struct perf_evlist *evli if (!evlist) return 0; - maps = evlist->mmap; + maps = backward ? evlist->backward_mmap : evlist->mmap; if (!maps) return 0; @@ -696,8 +696,12 @@ perf_event__synth_time_conv(const struct perf_event_mmap_page *pc __maybe_unused static const struct perf_event_mmap_page * perf_evlist__pick_pc(struct perf_evlist *evlist) { - if (evlist && evlist->mmap && evlist->mmap[0].base) - return evlist->mmap[0].base; + if (evlist) { + if (evlist->mmap && evlist->mmap[0].base) + return evlist->mmap[0].base; + if (evlist->backward_mmap && evlist->backward_mmap[0].base) + return evlist->backward_mmap[0].base; + } return NULL; } diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 54ae0a0bc22c..24927e111d17 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -123,6 +123,7 @@ static void perf_evlist__purge(struct perf_evlist *evlist) void perf_evlist__exit(struct perf_evlist *evlist) { zfree(&evlist->mmap); + zfree(&evlist->backward_mmap); fdarray__exit(&evlist->pollfd); } @@ -973,17 +974,20 @@ static void perf_evlist__munmap_nofree(struct perf_evlist *evlist) { int i; - if (evlist->mmap == NULL) - return; + if (evlist->mmap) + for (i = 0; i < evlist->nr_mmaps; i++) + perf_mmap__munmap(&evlist->mmap[i]); - for (i = 0; i < evlist->nr_mmaps; i++) - perf_mmap__munmap(&evlist->mmap[i]); + if (evlist->backward_mmap) + for (i = 0; i < evlist->nr_mmaps; i++) + perf_mmap__munmap(&evlist->backward_mmap[i]); } void perf_evlist__munmap(struct perf_evlist *evlist) { perf_evlist__munmap_nofree(evlist); zfree(&evlist->mmap); + zfree(&evlist->backward_mmap); } static struct perf_mmap *perf_evlist__alloc_mmap(struct perf_evlist *evlist) diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index 9e680c62c980..07a1ad040c92 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -61,6 +61,7 @@ struct perf_evlist { } workload; struct fdarray pollfd; struct perf_mmap *mmap; + struct perf_mmap *backward_mmap; struct thread_map *threads; struct cpu_map *cpus; struct perf_evsel *selected; From 078c33862e042b3778dce3bcc8eaef84ab40715c Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Thu, 14 Jul 2016 08:34:40 +0000 Subject: [PATCH 15/24] perf evlist: Map backward events to backward_mmap In perf_evlist__mmap_per_evsel(), select backward_mmap for backward events. Utilize new perf_mmap APIs. Dynamically alloc backward_mmap. Remove useless functions. Signed-off-by: Wang Nan Acked-by: Jiri Olsa Cc: He Kuang Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Nilay Vaish Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1468485287-33422-9-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/backward-ring-buffer.c | 4 +- tools/perf/util/evlist.c | 54 ++++++++++++------------- 2 files changed, 29 insertions(+), 29 deletions(-) diff --git a/tools/perf/tests/backward-ring-buffer.c b/tools/perf/tests/backward-ring-buffer.c index 5cee3873f2b5..b2c634815f6b 100644 --- a/tools/perf/tests/backward-ring-buffer.c +++ b/tools/perf/tests/backward-ring-buffer.c @@ -31,8 +31,8 @@ static int count_samples(struct perf_evlist *evlist, int *sample_count, for (i = 0; i < evlist->nr_mmaps; i++) { union perf_event *event; - perf_evlist__mmap_read_catchup(evlist, i); - while ((event = perf_evlist__mmap_read_backward(evlist, i)) != NULL) { + perf_mmap__read_catchup(&evlist->backward_mmap[i]); + while ((event = perf_mmap__read_backward(&evlist->backward_mmap[i])) != NULL) { const u32 type = event->header.type; switch (type) { diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 24927e111d17..7570f903200e 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -27,7 +27,6 @@ #include #include -static void perf_evlist__mmap_put(struct perf_evlist *evlist, int idx); static void perf_mmap__munmap(struct perf_mmap *map); static void perf_mmap__put(struct perf_mmap *map); @@ -692,8 +691,11 @@ static int perf_evlist__set_paused(struct perf_evlist *evlist, bool value) { int i; + if (!evlist->backward_mmap) + return 0; + for (i = 0; i < evlist->nr_mmaps; i++) { - int fd = evlist->mmap[i].fd; + int fd = evlist->backward_mmap[i].fd; int err; if (fd < 0) @@ -904,16 +906,6 @@ static void perf_mmap__put(struct perf_mmap *md) perf_mmap__munmap(md); } -static void perf_evlist__mmap_get(struct perf_evlist *evlist, int idx) -{ - perf_mmap__get(&evlist->mmap[idx]); -} - -static void perf_evlist__mmap_put(struct perf_evlist *evlist, int idx) -{ - perf_mmap__put(&evlist->mmap[idx]); -} - void perf_mmap__consume(struct perf_mmap *md, bool overwrite) { if (!overwrite) { @@ -1049,12 +1041,6 @@ static int perf_mmap__mmap(struct perf_mmap *map, return 0; } -static int __perf_evlist__mmap(struct perf_evlist *evlist, int idx, - struct mmap_params *mp, int fd) -{ - return perf_mmap__mmap(&evlist->mmap[idx], mp, fd); -} - static bool perf_evlist__should_poll(struct perf_evlist *evlist __maybe_unused, struct perf_evsel *evsel) @@ -1066,16 +1052,27 @@ perf_evlist__should_poll(struct perf_evlist *evlist __maybe_unused, static int perf_evlist__mmap_per_evsel(struct perf_evlist *evlist, int idx, struct mmap_params *mp, int cpu, - int thread, int *output) + int thread, int *_output, int *_output_backward) { struct perf_evsel *evsel; int revent; evlist__for_each_entry(evlist, evsel) { + struct perf_mmap *maps = evlist->mmap; + int *output = _output; int fd; - if (!!evsel->attr.write_backward != (evlist->overwrite && evlist->backward)) - continue; + if (evsel->attr.write_backward) { + output = _output_backward; + maps = evlist->backward_mmap; + + if (!maps) { + maps = perf_evlist__alloc_mmap(evlist); + if (!maps) + return -1; + evlist->backward_mmap = maps; + } + } if (evsel->system_wide && thread) continue; @@ -1084,13 +1081,14 @@ static int perf_evlist__mmap_per_evsel(struct perf_evlist *evlist, int idx, if (*output == -1) { *output = fd; - if (__perf_evlist__mmap(evlist, idx, mp, *output) < 0) + + if (perf_mmap__mmap(&maps[idx], mp, *output) < 0) return -1; } else { if (ioctl(fd, PERF_EVENT_IOC_SET_OUTPUT, *output) != 0) return -1; - perf_evlist__mmap_get(evlist, idx); + perf_mmap__get(&maps[idx]); } revent = perf_evlist__should_poll(evlist, evsel) ? POLLIN : 0; @@ -1103,8 +1101,8 @@ static int perf_evlist__mmap_per_evsel(struct perf_evlist *evlist, int idx, * Therefore don't add it for polling. */ if (!evsel->system_wide && - __perf_evlist__add_pollfd(evlist, fd, &evlist->mmap[idx], revent) < 0) { - perf_evlist__mmap_put(evlist, idx); + __perf_evlist__add_pollfd(evlist, fd, &maps[idx], revent) < 0) { + perf_mmap__put(&maps[idx]); return -1; } @@ -1130,13 +1128,14 @@ static int perf_evlist__mmap_per_cpu(struct perf_evlist *evlist, pr_debug2("perf event ring buffer mmapped per cpu\n"); for (cpu = 0; cpu < nr_cpus; cpu++) { int output = -1; + int output_backward = -1; auxtrace_mmap_params__set_idx(&mp->auxtrace_mp, evlist, cpu, true); for (thread = 0; thread < nr_threads; thread++) { if (perf_evlist__mmap_per_evsel(evlist, cpu, mp, cpu, - thread, &output)) + thread, &output, &output_backward)) goto out_unmap; } } @@ -1157,12 +1156,13 @@ static int perf_evlist__mmap_per_thread(struct perf_evlist *evlist, pr_debug2("perf event ring buffer mmapped per thread\n"); for (thread = 0; thread < nr_threads; thread++) { int output = -1; + int output_backward = -1; auxtrace_mmap_params__set_idx(&mp->auxtrace_mp, evlist, thread, false); if (perf_evlist__mmap_per_evsel(evlist, thread, mp, 0, thread, - &output)) + &output, &output_backward)) goto out_unmap; } From a0c6f451f90204847ce5f91c3268d83a76bde1b6 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Thu, 14 Jul 2016 08:34:41 +0000 Subject: [PATCH 16/24] perf evlist: Drop evlist->backward Now there's no real user of evlist->backward. Drop it. We are going to use evlist->backward_mmap as a container for backward ring buffer. Signed-off-by: Wang Nan Acked-by: Jiri Olsa Cc: He Kuang Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Nilay Vaish Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1468485287-33422-10-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/backward-ring-buffer.c | 1 - tools/perf/util/evlist.c | 5 +---- tools/perf/util/evlist.h | 1 - 3 files changed, 1 insertion(+), 6 deletions(-) diff --git a/tools/perf/tests/backward-ring-buffer.c b/tools/perf/tests/backward-ring-buffer.c index b2c634815f6b..db9cd3048655 100644 --- a/tools/perf/tests/backward-ring-buffer.c +++ b/tools/perf/tests/backward-ring-buffer.c @@ -101,7 +101,6 @@ int test__backward_ring_buffer(int subtest __maybe_unused) return TEST_FAIL; } - evlist->backward = true; err = perf_evlist__create_maps(evlist, &opts.target); if (err < 0) { pr_debug("Not enough memory to create thread/cpu maps\n"); diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 7570f903200e..5beb44faa71d 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -44,7 +44,6 @@ void perf_evlist__init(struct perf_evlist *evlist, struct cpu_map *cpus, perf_evlist__set_maps(evlist, cpus, threads); fdarray__init(&evlist->pollfd, 64); evlist->workload.pid = -1; - evlist->backward = false; } struct perf_evlist *perf_evlist__new(void) @@ -867,9 +866,7 @@ union perf_event *perf_evlist__mmap_read_backward(struct perf_evlist *evlist, in union perf_event *perf_evlist__mmap_read(struct perf_evlist *evlist, int idx) { - if (!evlist->backward) - return perf_evlist__mmap_read_forward(evlist, idx); - return perf_evlist__mmap_read_backward(evlist, idx); + return perf_evlist__mmap_read_forward(evlist, idx); } void perf_mmap__read_catchup(struct perf_mmap *md) diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index 07a1ad040c92..6a3d9bdda4bb 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -50,7 +50,6 @@ struct perf_evlist { bool overwrite; bool enabled; bool has_user_cpus; - bool backward; size_t mmap_len; int id_pos; int is_pos; From 54cc54decd79b2c92f4db06001b4b245f38181b7 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Thu, 14 Jul 2016 08:34:42 +0000 Subject: [PATCH 17/24] perf evlist: Setup backward mmap state machine Introduce a bkw_mmap_state state machine to evlist: .________________(forbid)_____________. | V NOTREADY --(0)--> RUNNING --(1)--> DATA_PENDING --(2)--> EMPTY ^ ^ | ^ | | |__(forbid)____/ |___(forbid)___/| | | \_________________(3)_______________/ NOTREADY : Backward ring buffers are not ready RUNNING : Backward ring buffers are recording DATA_PENDING : We are required to collect data from backward ring buffers EMPTY : We have collected data from backward ring buffers. (0): Setup backward ring buffer (1): Pause ring buffers for reading (2): Read from ring buffers (3): Resume ring buffers for recording We can't avoid this complexity. Since we deliberately drop records from overwritable ring buffer, there's no way for us to check remaining from ring buffer itself (by checking head and old pointers). Therefore, we need DATA_PENDING and EMPTY state to help us recording what we have done to the ring buffer. In record__mmap_read_evlist(), drive this state machine from DATA_PENDING to EMPTY. In perf_evlist__mmap_per_evsel(), drive this state machine from NOTREADY to RUNNING when creating backward mmap. Signed-off-by: Wang Nan Acked-by: Jiri Olsa Cc: He Kuang Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Nilay Vaish Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1468485287-33422-11-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-record.c | 5 +++ tools/perf/util/evlist.c | 62 +++++++++++++++++++++++++++++++++++++ tools/perf/util/evlist.h | 31 +++++++++++++++++++ 3 files changed, 98 insertions(+) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index dbcb22304398..d4f15e76c98a 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -513,6 +513,9 @@ static int record__mmap_read_evlist(struct record *rec, struct perf_evlist *evli if (!maps) return 0; + if (backward && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING) + return 0; + for (i = 0; i < evlist->nr_mmaps; i++) { struct auxtrace_mmap *mm = &maps[i].auxtrace_mmap; @@ -538,6 +541,8 @@ static int record__mmap_read_evlist(struct record *rec, struct perf_evlist *evli if (bytes_written != rec->bytes_written) rc = record__write(rec, &finished_round_event, sizeof(finished_round_event)); + if (backward) + perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY); out: return rc; } diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 5beb44faa71d..93ab66415f51 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -15,6 +15,7 @@ #include "evlist.h" #include "evsel.h" #include "debug.h" +#include "asm/bug.h" #include #include "parse-events.h" @@ -44,6 +45,7 @@ void perf_evlist__init(struct perf_evlist *evlist, struct cpu_map *cpus, perf_evlist__set_maps(evlist, cpus, threads); fdarray__init(&evlist->pollfd, 64); evlist->workload.pid = -1; + evlist->bkw_mmap_state = BKW_MMAP_NOTREADY; } struct perf_evlist *perf_evlist__new(void) @@ -1068,6 +1070,8 @@ static int perf_evlist__mmap_per_evsel(struct perf_evlist *evlist, int idx, if (!maps) return -1; evlist->backward_mmap = maps; + if (evlist->bkw_mmap_state == BKW_MMAP_NOTREADY) + perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_RUNNING); } } @@ -1972,3 +1976,61 @@ perf_evlist__find_evsel_by_str(struct perf_evlist *evlist, return NULL; } + +void perf_evlist__toggle_bkw_mmap(struct perf_evlist *evlist, + enum bkw_mmap_state state) +{ + enum bkw_mmap_state old_state = evlist->bkw_mmap_state; + enum action { + NONE, + PAUSE, + RESUME, + } action = NONE; + + if (!evlist->backward_mmap) + return; + + switch (old_state) { + case BKW_MMAP_NOTREADY: { + if (state != BKW_MMAP_RUNNING) + goto state_err;; + break; + } + case BKW_MMAP_RUNNING: { + if (state != BKW_MMAP_DATA_PENDING) + goto state_err; + action = PAUSE; + break; + } + case BKW_MMAP_DATA_PENDING: { + if (state != BKW_MMAP_EMPTY) + goto state_err; + break; + } + case BKW_MMAP_EMPTY: { + if (state != BKW_MMAP_RUNNING) + goto state_err; + action = RESUME; + break; + } + default: + WARN_ONCE(1, "Shouldn't get there\n"); + } + + evlist->bkw_mmap_state = state; + + switch (action) { + case PAUSE: + perf_evlist__pause(evlist); + break; + case RESUME: + perf_evlist__resume(evlist); + break; + case NONE: + default: + break; + } + +state_err: + return; +} diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index 6a3d9bdda4bb..20faaab1941c 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -41,6 +41,34 @@ perf_mmap__mmap_len(struct perf_mmap *map) return map->mask + 1 + page_size; } +/* + * State machine of bkw_mmap_state: + * + * .________________(forbid)_____________. + * | V + * NOTREADY --(0)--> RUNNING --(1)--> DATA_PENDING --(2)--> EMPTY + * ^ ^ | ^ | + * | |__(forbid)____/ |___(forbid)___/| + * | | + * \_________________(3)_______________/ + * + * NOTREADY : Backward ring buffers are not ready + * RUNNING : Backward ring buffers are recording + * DATA_PENDING : We are required to collect data from backward ring buffers + * EMPTY : We have collected data from backward ring buffers. + * + * (0): Setup backward ring buffer + * (1): Pause ring buffers for reading + * (2): Read from ring buffers + * (3): Resume ring buffers for recording + */ +enum bkw_mmap_state { + BKW_MMAP_NOTREADY, + BKW_MMAP_RUNNING, + BKW_MMAP_DATA_PENDING, + BKW_MMAP_EMPTY, +}; + struct perf_evlist { struct list_head entries; struct hlist_head heads[PERF_EVLIST__HLIST_SIZE]; @@ -54,6 +82,7 @@ struct perf_evlist { int id_pos; int is_pos; u64 combined_sample_type; + enum bkw_mmap_state bkw_mmap_state; struct { int cork_fd; pid_t pid; @@ -135,6 +164,8 @@ struct perf_evsel *perf_evlist__id2evsel_strict(struct perf_evlist *evlist, struct perf_sample_id *perf_evlist__id2sid(struct perf_evlist *evlist, u64 id); +void perf_evlist__toggle_bkw_mmap(struct perf_evlist *evlist, enum bkw_mmap_state state); + union perf_event *perf_mmap__read_forward(struct perf_mmap *map, bool check_messup); union perf_event *perf_mmap__read_backward(struct perf_mmap *map); From 057374645bd42e8bcf22aa4529f99cf7c920a1c6 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Thu, 14 Jul 2016 08:34:43 +0000 Subject: [PATCH 18/24] perf record: Read from overwritable ring buffer Drive the evlist->bkw_mmap_state state machine during draining and when SIGUSR2 is received. Read the backward ring buffer in record__mmap_read_all. Signed-off-by: Wang Nan Acked-by: Jiri Olsa Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Nilay Vaish Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1468485287-33422-12-git-send-email-wangnan0@huawei.com Signed-off-by: He Kuang Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-record.c | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index d4f15e76c98a..b87070b1f492 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -555,7 +555,7 @@ static int record__mmap_read_all(struct record *rec) if (err) return err; - return err; + return record__mmap_read_evlist(rec, rec->evlist, true); } static void record__init_features(struct record *rec) @@ -953,6 +953,17 @@ static int __cmd_record(struct record *rec, int argc, const char **argv) for (;;) { unsigned long long hits = rec->samples; + /* + * rec->evlist->bkw_mmap_state is possible to be + * BKW_MMAP_EMPTY here: when done == true and + * hits != rec->samples in previous round. + * + * perf_evlist__toggle_bkw_mmap ensure we never + * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING. + */ + if (trigger_is_hit(&switch_output_trigger) || done || draining) + perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING); + if (record__mmap_read_all(rec) < 0) { trigger_error(&auxtrace_snapshot_trigger); trigger_error(&switch_output_trigger); @@ -972,8 +983,26 @@ static int __cmd_record(struct record *rec, int argc, const char **argv) } if (trigger_is_hit(&switch_output_trigger)) { + /* + * If switch_output_trigger is hit, the data in + * overwritable ring buffer should have been collected, + * so bkw_mmap_state should be set to BKW_MMAP_EMPTY. + * + * If SIGUSR2 raise after or during record__mmap_read_all(), + * record__mmap_read_all() didn't collect data from + * overwritable ring buffer. Read again. + */ + if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING) + continue; trigger_ready(&switch_output_trigger); + /* + * Reenable events in overwrite ring buffer after + * record__mmap_read_all(): we should have collected + * data from it. + */ + perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING); + if (!quiet) fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n", waking); From f6cdff8329e04b08cbc195194223e9dbadeeaa1e Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Thu, 14 Jul 2016 08:34:44 +0000 Subject: [PATCH 19/24] perf evlist: Make {pause,resume} internal helpers There's no user of these two function outside evlist.c. Remove them from public namespace. Signed-off-by: Wang Nan Acked-by: Jiri Olsa Cc: He Kuang Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Nilay Vaish Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1468485287-33422-13-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evlist.c | 4 ++-- tools/perf/util/evlist.h | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 93ab66415f51..2a40b8e1def7 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -708,12 +708,12 @@ static int perf_evlist__set_paused(struct perf_evlist *evlist, bool value) return 0; } -int perf_evlist__pause(struct perf_evlist *evlist) +static int perf_evlist__pause(struct perf_evlist *evlist) { return perf_evlist__set_paused(evlist, true); } -int perf_evlist__resume(struct perf_evlist *evlist) +static int perf_evlist__resume(struct perf_evlist *evlist) { return perf_evlist__set_paused(evlist, false); } diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index 20faaab1941c..4fd034f22d2f 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -182,8 +182,6 @@ void perf_evlist__mmap_read_catchup(struct perf_evlist *evlist, int idx); void perf_evlist__mmap_consume(struct perf_evlist *evlist, int idx); -int perf_evlist__pause(struct perf_evlist *evlist); -int perf_evlist__resume(struct perf_evlist *evlist); int perf_evlist__open(struct perf_evlist *evlist); void perf_evlist__close(struct perf_evlist *evlist); From 626a6b784e91bc61ca9fe0f9dd5bb60cb91ccb6b Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Thu, 14 Jul 2016 08:34:45 +0000 Subject: [PATCH 20/24] perf tools: Enable overwrite settings This patch allows following config terms and option: Globally setting events to overwrite; # perf record --overwrite ... Set specific events to be overwrite or no-overwrite. # perf record --event cycles/overwrite/ ... # perf record --event cycles/no-overwrite/ ... Add missing config terms and update the config term array size because the longest string length has changed. For overwritable events, it automatically selects attr.write_backward since perf requires it to be backward for reading. Test result: # perf record --overwrite -e syscalls:*enter_nanosleep* usleep 1 [ perf record: Woken up 2 times to write data ] [ perf record: Captured and wrote 0.011 MB perf.data (1 samples) ] # perf evlist -v syscalls:sys_enter_nanosleep: type: 2, size: 112, config: 0x134, { sample_period, sample_freq }: 1, sample_type: IP|TID|TIME|CPU|PERIOD|RAW, disabled: 1, inherit: 1, mmap: 1, comm: 1, enable_on_exec: 1, task: 1, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1, write_backward: 1 # Tip: use 'perf evlist --trace-fields' to show fields for tracepoint events Signed-off-by: Wang Nan Tested-by: Arnaldo Carvalho de Melo Acked-by: Jiri Olsa Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Nilay Vaish Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1468485287-33422-14-git-send-email-wangnan0@huawei.com Signed-off-by: He Kuang Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-record.txt | 14 ++++++++++++++ tools/perf/builtin-record.c | 1 + tools/perf/perf.h | 1 + tools/perf/tests/backward-ring-buffer.c | 10 +++++----- tools/perf/util/evsel.c | 4 ++++ tools/perf/util/evsel.h | 2 ++ tools/perf/util/parse-events.c | 20 ++++++++++++++++++-- tools/perf/util/parse-events.h | 2 ++ tools/perf/util/parse-events.l | 2 ++ 9 files changed, 49 insertions(+), 7 deletions(-) diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt index 5b46b1d1a37c..384c630436f8 100644 --- a/tools/perf/Documentation/perf-record.txt +++ b/tools/perf/Documentation/perf-record.txt @@ -367,6 +367,20 @@ options. 'perf record --dry-run -e' can act as a BPF script compiler if llvm.dump-obj in config file is set to true. +--overwrite:: +Makes all events use an overwritable ring buffer. An overwritable ring +buffer works like a flight recorder: when it gets full, the kernel will +overwrite the oldest records, that thus will never make it to the +perf.data file. + +When '--overwrite' and '--switch-output' are used perf records and drops +events until it receives a signal, meaning that something unusual was +detected that warrants taking a snapshot of the most current events, +those fitting in the ring buffer at that moment. + +'overwrite' attribute can also be set or canceled for an event using +config terms. For example: 'cycles/overwrite/' and 'instructions/no-overwrite/'. + SEE ALSO -------- linkperf:perf-stat[1], linkperf:perf-list[1] diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index b87070b1f492..39c7486f0607 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -1399,6 +1399,7 @@ struct option __record_options[] = { OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit, &record.opts.no_inherit_set, "child tasks do not inherit counters"), + OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"), OPT_UINTEGER('F', "freq", &record.opts.user_freq, "profile at this frequency"), OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]", "number of mmap data pages and AUX area tracing mmap pages", diff --git a/tools/perf/perf.h b/tools/perf/perf.h index cd8f1b150f9e..608b42bdb1b6 100644 --- a/tools/perf/perf.h +++ b/tools/perf/perf.h @@ -59,6 +59,7 @@ struct record_opts { bool record_switch_events; bool all_kernel; bool all_user; + bool overwrite; unsigned int freq; unsigned int mmap_pages; unsigned int auxtrace_mmap_pages; diff --git a/tools/perf/tests/backward-ring-buffer.c b/tools/perf/tests/backward-ring-buffer.c index db9cd3048655..615780cbfe1d 100644 --- a/tools/perf/tests/backward-ring-buffer.c +++ b/tools/perf/tests/backward-ring-buffer.c @@ -108,7 +108,11 @@ int test__backward_ring_buffer(int subtest __maybe_unused) } bzero(&parse_error, sizeof(parse_error)); - err = parse_events(evlist, "syscalls:sys_enter_prctl", &parse_error); + /* + * Set backward bit, ring buffer should be writing from end. Record + * it in aux evlist + */ + err = parse_events(evlist, "syscalls:sys_enter_prctl/overwrite/", &parse_error); if (err) { pr_debug("Failed to parse tracepoint event, try use root\n"); ret = TEST_SKIP; @@ -117,10 +121,6 @@ int test__backward_ring_buffer(int subtest __maybe_unused) perf_evlist__config(evlist, &opts, NULL); - /* Set backward bit, ring buffer should be writing from end */ - evlist__for_each_entry(evlist, evsel) - evsel->attr.write_backward = 1; - err = perf_evlist__open(evlist); if (err < 0) { pr_debug("perf_evlist__open: %s\n", diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 9ac2f92ce88d..8c54df61fe64 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -695,6 +695,9 @@ static void apply_config_terms(struct perf_evsel *evsel, */ attr->inherit = term->val.inherit ? 1 : 0; break; + case PERF_EVSEL__CONFIG_TERM_OVERWRITE: + attr->write_backward = term->val.overwrite ? 1 : 0; + break; default: break; } @@ -776,6 +779,7 @@ void perf_evsel__config(struct perf_evsel *evsel, struct record_opts *opts, attr->sample_id_all = perf_missing_features.sample_id_all ? 0 : 1; attr->inherit = !opts->no_inherit; + attr->write_backward = opts->overwrite ? 1 : 0; perf_evsel__set_sample_bit(evsel, IP); perf_evsel__set_sample_bit(evsel, TID); diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index e60cbfc2cd35..8a4a6c9f1480 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -45,6 +45,7 @@ enum { PERF_EVSEL__CONFIG_TERM_STACK_USER, PERF_EVSEL__CONFIG_TERM_INHERIT, PERF_EVSEL__CONFIG_TERM_MAX_STACK, + PERF_EVSEL__CONFIG_TERM_OVERWRITE, PERF_EVSEL__CONFIG_TERM_MAX, }; @@ -59,6 +60,7 @@ struct perf_evsel_config_term { u64 stack_user; int max_stack; bool inherit; + bool overwrite; } val; }; diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 375af0e02831..6c913c3914fb 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -902,6 +902,8 @@ static const char *config_term_names[__PARSE_EVENTS__TERM_TYPE_NR] = { [PARSE_EVENTS__TERM_TYPE_NOINHERIT] = "no-inherit", [PARSE_EVENTS__TERM_TYPE_INHERIT] = "inherit", [PARSE_EVENTS__TERM_TYPE_MAX_STACK] = "max-stack", + [PARSE_EVENTS__TERM_TYPE_OVERWRITE] = "overwrite", + [PARSE_EVENTS__TERM_TYPE_NOOVERWRITE] = "no-overwrite", }; static bool config_term_shrinked; @@ -994,6 +996,12 @@ do { \ case PARSE_EVENTS__TERM_TYPE_NOINHERIT: CHECK_TYPE_VAL(NUM); break; + case PARSE_EVENTS__TERM_TYPE_OVERWRITE: + CHECK_TYPE_VAL(NUM); + break; + case PARSE_EVENTS__TERM_TYPE_NOOVERWRITE: + CHECK_TYPE_VAL(NUM); + break; case PARSE_EVENTS__TERM_TYPE_NAME: CHECK_TYPE_VAL(STR); break; @@ -1046,6 +1054,8 @@ static int config_term_tracepoint(struct perf_event_attr *attr, case PARSE_EVENTS__TERM_TYPE_INHERIT: case PARSE_EVENTS__TERM_TYPE_NOINHERIT: case PARSE_EVENTS__TERM_TYPE_MAX_STACK: + case PARSE_EVENTS__TERM_TYPE_OVERWRITE: + case PARSE_EVENTS__TERM_TYPE_NOOVERWRITE: return config_term_common(attr, term, err); default: if (err) { @@ -1118,6 +1128,12 @@ do { \ case PARSE_EVENTS__TERM_TYPE_MAX_STACK: ADD_CONFIG_TERM(MAX_STACK, max_stack, term->val.num); break; + case PARSE_EVENTS__TERM_TYPE_OVERWRITE: + ADD_CONFIG_TERM(OVERWRITE, overwrite, term->val.num ? 1 : 0); + break; + case PARSE_EVENTS__TERM_TYPE_NOOVERWRITE: + ADD_CONFIG_TERM(OVERWRITE, overwrite, term->val.num ? 0 : 1); + break; default: break; } @@ -2412,9 +2428,9 @@ static void config_terms_list(char *buf, size_t buf_sz) char *parse_events_formats_error_string(char *additional_terms) { char *str; - /* "branch_type" is the longest name */ + /* "no-overwrite" is the longest name */ char static_terms[__PARSE_EVENTS__TERM_TYPE_NR * - (sizeof("branch_type") - 1)]; + (sizeof("no-overwrite") - 1)]; config_terms_list(static_terms, sizeof(static_terms)); /* valid terms */ diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index b4aa7eb2df73..d1edbf8cc66a 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -69,6 +69,8 @@ enum { PARSE_EVENTS__TERM_TYPE_NOINHERIT, PARSE_EVENTS__TERM_TYPE_INHERIT, PARSE_EVENTS__TERM_TYPE_MAX_STACK, + PARSE_EVENTS__TERM_TYPE_NOOVERWRITE, + PARSE_EVENTS__TERM_TYPE_OVERWRITE, __PARSE_EVENTS__TERM_TYPE_NR, }; diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l index 3c15b33b2e84..7a2519435da0 100644 --- a/tools/perf/util/parse-events.l +++ b/tools/perf/util/parse-events.l @@ -202,6 +202,8 @@ stack-size { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_STACKSIZE); } max-stack { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_MAX_STACK); } inherit { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_INHERIT); } no-inherit { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_NOINHERIT); } +overwrite { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_OVERWRITE); } +no-overwrite { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_NOOVERWRITE); } , { return ','; } "/" { BEGIN(INITIAL); return '/'; } {name_minus} { return str(yyscanner, PE_NAME); } From f06149c0db430d3694d601df126b0944cc0156a6 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Thu, 14 Jul 2016 08:34:46 +0000 Subject: [PATCH 21/24] perf session: Don't warn about out of order event if write_backward is used If write_backward attribute is set, records are written into kernel ring buffer from end to beginning, but read from beginning to end. To avoid 'XX out of order events recorded' warning message (timestamps of records is in reverse order when using write_backward), suppress the warning message if write_backward is selected by at lease one event. Result: Before this patch: # perf record -m 1 -e raw_syscalls:sys_exit/overwrite/ \ -e raw_syscalls:sys_enter \ dd if=/dev/zero of=/dev/null count=300 300+0 records in 300+0 records out 153600 bytes (154 kB) copied, 0.000601617 s, 255 MB/s [ perf record: Woken up 5 times to write data ] Warning: 40 out of order events recorded. [ perf record: Captured and wrote 0.096 MB perf.data (696 samples) ] After this patch: # perf record -m 1 -e raw_syscalls:sys_exit/overwrite/ \ -e raw_syscalls:sys_enter \ dd if=/dev/zero of=/dev/null count=300 300+0 records in 300+0 records out 153600 bytes (154 kB) copied, 0.000644873 s, 238 MB/s [ perf record: Woken up 5 times to write data ] [ perf record: Captured and wrote 0.096 MB perf.data (696 samples) ] Signed-off-by: Wang Nan Acked-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Nilay Vaish Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1468485287-33422-15-git-send-email-wangnan0@huawei.com Signed-off-by: He Kuang Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/session.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 078d49626494..5d61242a6e64 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -1499,10 +1499,27 @@ int perf_session__register_idle_thread(struct perf_session *session) return err; } +static void +perf_session__warn_order(const struct perf_session *session) +{ + const struct ordered_events *oe = &session->ordered_events; + struct perf_evsel *evsel; + bool should_warn = true; + + evlist__for_each_entry(session->evlist, evsel) { + if (evsel->attr.write_backward) + should_warn = false; + } + + if (!should_warn) + return; + if (oe->nr_unordered_events != 0) + ui__warning("%u out of order events recorded.\n", oe->nr_unordered_events); +} + static void perf_session__warn_about_errors(const struct perf_session *session) { const struct events_stats *stats = &session->evlist->stats; - const struct ordered_events *oe = &session->ordered_events; if (session->tool->lost == perf_event__process_lost && stats->nr_events[PERF_RECORD_LOST] != 0) { @@ -1559,8 +1576,7 @@ static void perf_session__warn_about_errors(const struct perf_session *session) stats->nr_unprocessable_samples); } - if (oe->nr_unordered_events != 0) - ui__warning("%u out of order events recorded.\n", oe->nr_unordered_events); + perf_session__warn_order(session); events_stats__auxtrace_error_warn(stats); From 4ea648aec01982d5a57816a95c4665d6081e78f9 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Thu, 14 Jul 2016 08:34:47 +0000 Subject: [PATCH 22/24] perf record: Add --tail-synthesize option When working with overwritable ring buffer there's a inconvenience problem: if perf dumps data after a long period after it starts, non-sample events may lost, which makes following 'perf report' unable to identify proc name and mmap layout. For example: # perf record -m 4 -e raw_syscalls:* -g --overwrite --switch-output \ dd if=/dev/zero of=/dev/null send SIGUSR2 after dd runs long enough. The resuling perf.data lost correct comm and mmap events: # perf script -i perf.data.2016061522374354 perf 24478 [004] 2581325.601789: raw_syscalls:sys_exit: NR 0 = 512 ^^^^ Should be 'dd' 27b2e8 syscall_slow_exit_work+0xfe2000e3 (/lib/modules/4.6.0-rc3+/build/vmlinux) 203cc7 do_syscall_64+0xfe200117 (/lib/modules/4.6.0-rc3+/build/vmlinux) b18d83 return_from_SYSCALL_64+0xfe200000 (/lib/modules/4.6.0-rc3+/build/vmlinux) 7f47c417edf0 [unknown] ([unknown]) ^^^^^^^^^^^^ Fail to unwind This patch provides a '--tail-synthesize' option, allows perf to collect system status when finalizing output file. In resuling output file, the non-sample events reflect system status when dumping data. After this patch: # perf record -m 4 -e raw_syscalls:* -g --overwrite --switch-output --tail-synthesize \ dd if=/dev/zero of=/dev/null # perf script -i perf.data.2016061600544998 dd 27364 [004] 2583244.994464: raw_syscalls:sys_enter: NR 1 (1, ... ^^ Correct comm 203a18 syscall_trace_enter_phase2+0xfe2001a8 ([kernel.kallsyms]) 203aa5 syscall_trace_enter+0xfe200055 ([kernel.kallsyms]) 203caa do_syscall_64+0xfe2000fa ([kernel.kallsyms]) b18d83 return_from_SYSCALL_64+0xfe200000 ([kernel.kallsyms]) d8e50 __GI___libc_write+0xffff01d9639f4010 (/tmp/oxygen_root-w00229757/lib64/libc-2.18.so) ^^^^^ Correct unwind This option doesn't aim to solve this problem completely. If a process terminates before SIGUSR2, we still lost its COMM and MMAP events. For example, we can't unwind correctly from the final perf.data we get from the previous example, because when perf collects the final output file (when we press C-c), 'dd' has been terminated so its '/proc//mmap' becomes empty. However, this is a cheaper choice. To completely solve this problem we need to continously output non-sample events. To satisify the requirement of daemonization, we need to merge them periodically. It is possible but requires much more code and cycles. Automatically select --tail-synthesize when --overwrite is provided. Signed-off-by: Wang Nan Cc: He Kuang Cc: Jiri Olsa Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Nilay Vaish Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1468485287-33422-16-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-record.txt | 8 ++++++ tools/perf/builtin-record.c | 31 +++++++++++++++++++----- tools/perf/perf.h | 1 + 3 files changed, 34 insertions(+), 6 deletions(-) diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt index 384c630436f8..69966abf65d1 100644 --- a/tools/perf/Documentation/perf-record.txt +++ b/tools/perf/Documentation/perf-record.txt @@ -367,6 +367,12 @@ options. 'perf record --dry-run -e' can act as a BPF script compiler if llvm.dump-obj in config file is set to true. +--tail-synthesize:: +Instead of collecting non-sample events (for example, fork, comm, mmap) at +the beginning of record, collect them during finalizing an output file. +The collected non-sample events reflects the status of the system when +record is finished. + --overwrite:: Makes all events use an overwritable ring buffer. An overwritable ring buffer works like a flight recorder: when it gets full, the kernel will @@ -381,6 +387,8 @@ those fitting in the ring buffer at that moment. 'overwrite' attribute can also be set or canceled for an event using config terms. For example: 'cycles/overwrite/' and 'instructions/no-overwrite/'. +Implies --tail-synthesize. + SEE ALSO -------- linkperf:perf-stat[1], linkperf:perf-list[1] diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 39c7486f0607..8f2c16d9275f 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -604,13 +604,16 @@ record__finish_output(struct record *rec) return; } -static int record__synthesize_workload(struct record *rec) +static int record__synthesize_workload(struct record *rec, bool tail) { struct { struct thread_map map; struct thread_map_data map_data; } thread_map; + if (rec->opts.tail_synthesize != tail) + return 0; + thread_map.map.nr = 1; thread_map.map.map[0].pid = rec->evlist->workload.pid; thread_map.map.map[0].comm = NULL; @@ -621,7 +624,7 @@ static int record__synthesize_workload(struct record *rec) rec->opts.proc_map_timeout); } -static int record__synthesize(struct record *rec); +static int record__synthesize(struct record *rec, bool tail); static int record__switch_output(struct record *rec, bool at_exit) @@ -632,6 +635,10 @@ record__switch_output(struct record *rec, bool at_exit) /* Same Size: "2015122520103046"*/ char timestamp[] = "InvalidTimestamp"; + record__synthesize(rec, true); + if (target__none(&rec->opts.target)) + record__synthesize_workload(rec, true); + rec->samples = 0; record__finish_output(rec); err = fetch_current_timestamp(timestamp, sizeof(timestamp)); @@ -654,7 +661,7 @@ record__switch_output(struct record *rec, bool at_exit) /* Output tracking events */ if (!at_exit) { - record__synthesize(rec); + record__synthesize(rec, false); /* * In 'perf record --switch-output' without -a, @@ -666,7 +673,7 @@ record__switch_output(struct record *rec, bool at_exit) * perf_event__synthesize_thread_map() for those events. */ if (target__none(&rec->opts.target)) - record__synthesize_workload(rec); + record__synthesize_workload(rec, false); } return fd; } @@ -720,7 +727,7 @@ static const struct perf_event_mmap_page *record__pick_pc(struct record *rec) return NULL; } -static int record__synthesize(struct record *rec) +static int record__synthesize(struct record *rec, bool tail) { struct perf_session *session = rec->session; struct machine *machine = &session->machines.host; @@ -730,6 +737,9 @@ static int record__synthesize(struct record *rec) int fd = perf_data_file__fd(file); int err = 0; + if (rec->opts.tail_synthesize != tail) + return 0; + if (file->is_pipe) { err = perf_event__synthesize_attrs(tool, session, process_synthesized_event); @@ -893,7 +903,7 @@ static int __cmd_record(struct record *rec, int argc, const char **argv) machine = &session->machines.host; - err = record__synthesize(rec); + err = record__synthesize(rec, false); if (err < 0) goto out_child; @@ -1057,6 +1067,9 @@ static int __cmd_record(struct record *rec, int argc, const char **argv) if (!quiet) fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking); + if (target__none(&rec->opts.target)) + record__synthesize_workload(rec, true); + out_child: if (forks) { int exit_status; @@ -1075,6 +1088,7 @@ out_child: } else status = err; + record__synthesize(rec, true); /* this will be recalculated during process_buildids() */ rec->samples = 0; @@ -1399,6 +1413,8 @@ struct option __record_options[] = { OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit, &record.opts.no_inherit_set, "child tasks do not inherit counters"), + OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize, + "synthesize non-sample events at the end of output"), OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"), OPT_UINTEGER('F', "freq", &record.opts.user_freq, "profile at this frequency"), OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]", @@ -1610,6 +1626,9 @@ int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused) } } + if (record.opts.overwrite) + record.opts.tail_synthesize = true; + if (rec->evlist->nr_entries == 0 && perf_evlist__add_default(rec->evlist) < 0) { pr_err("Not enough memory for event selector list\n"); diff --git a/tools/perf/perf.h b/tools/perf/perf.h index 608b42bdb1b6..a7e0f1497244 100644 --- a/tools/perf/perf.h +++ b/tools/perf/perf.h @@ -59,6 +59,7 @@ struct record_opts { bool record_switch_events; bool all_kernel; bool all_user; + bool tail_synthesize; bool overwrite; unsigned int freq; unsigned int mmap_pages; From 4a0982f941d6f2550e8c39bc8bb0984b50dddf36 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 15 Jul 2016 17:23:46 -0300 Subject: [PATCH 23/24] objtool: Add -I$(srctree)/tools/arch/$(ARCH)/include/uapi So that it can find asm/bitsperlong.h to get the __BITS_PER_LONG definition. Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Josh Poimboeuf Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-pr3pvskh65pey4po7t122z4j@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/objtool/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/objtool/Makefile b/tools/objtool/Makefile index 9a3110cac604..1f75b0a046cc 100644 --- a/tools/objtool/Makefile +++ b/tools/objtool/Makefile @@ -26,7 +26,7 @@ OBJTOOL_IN := $(OBJTOOL)-in.o all: $(OBJTOOL) -INCLUDES := -I$(srctree)/tools/include +INCLUDES := -I$(srctree)/tools/include -I$(srctree)/tools/arch/$(ARCH)/include/uapi CFLAGS += -Wall -Werror $(EXTRA_WARNINGS) -fomit-frame-pointer -O2 -g $(INCLUDES) LDFLAGS += -lelf $(LIBSUBCMD) From b49364f36cfdb6d540ac961102d7ffaf84279bb6 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 15 Jul 2016 17:32:52 -0300 Subject: [PATCH 24/24] objtool: Initialize variable to silence old compiler gcc version 4.1.2 20080704 (Red Hat 4.1.2-55) barfs with: CC /tmp/build/objtool/builtin-check.o cc1: warnings being treated as errors builtin-check.c: In function 'cmd_check': builtin-check.c:667: warning: 'prev_rela' may be used uninitialized in this function mv: cannot stat `/tmp/build/objtool/.builtin-check.o.tmp': No such file or directory make[1]: *** [/tmp/build/objtool/builtin-check.o] Error 1 Init it to NULL to silence it. Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Josh Poimboeuf Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-qolo31rl2ojlwj1lj9dhemyz@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/objtool/builtin-check.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/objtool/builtin-check.c b/tools/objtool/builtin-check.c index 92d84b277032..4ed30f45c6da 100644 --- a/tools/objtool/builtin-check.c +++ b/tools/objtool/builtin-check.c @@ -664,7 +664,7 @@ static int add_func_switch_tables(struct objtool_file *file, struct symbol *func) { struct instruction *insn, *prev_jump; - struct rela *text_rela, *rodata_rela, *prev_rela; + struct rela *text_rela, *rodata_rela, *prev_rela = NULL; int ret; prev_jump = NULL;