diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 11134238417d..9a904f63f8c1 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -297,6 +297,10 @@ static inline struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i) static inline void bpf_prog_put(struct bpf_prog *prog) { } +static inline struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog) +{ + return ERR_PTR(-EOPNOTSUPP); +} #endif /* CONFIG_BPF_SYSCALL */ /* verifier prototypes for helper functions called from eBPF programs */ diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 2b6b43cc0dd5..ccb73a58113d 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -679,6 +679,10 @@ struct perf_event { u64 (*clock)(void); perf_overflow_handler_t overflow_handler; void *overflow_handler_context; +#ifdef CONFIG_BPF_SYSCALL + perf_overflow_handler_t orig_overflow_handler; + struct bpf_prog *prog; +#endif #ifdef CONFIG_EVENT_TRACING struct trace_event_call *tp_event; @@ -788,6 +792,11 @@ struct perf_output_handle { int page; }; +struct bpf_perf_event_data_kern { + struct pt_regs *regs; + struct perf_sample_data *data; +}; + #ifdef CONFIG_CGROUP_PERF /* diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index 185f8ea2702f..d0352a971ebd 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -71,6 +71,7 @@ header-y += binfmts.h header-y += blkpg.h header-y += blktrace_api.h header-y += bpf_common.h +header-y += bpf_perf_event.h header-y += bpf.h header-y += bpqether.h header-y += bsg.h diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e4c5a1baa993..f896dfac4ac0 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -95,6 +95,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_SCHED_ACT, BPF_PROG_TYPE_TRACEPOINT, BPF_PROG_TYPE_XDP, + BPF_PROG_TYPE_PERF_EVENT, }; #define BPF_PSEUDO_MAP_FD 1 diff --git a/include/uapi/linux/bpf_perf_event.h b/include/uapi/linux/bpf_perf_event.h new file mode 100644 index 000000000000..067427259820 --- /dev/null +++ b/include/uapi/linux/bpf_perf_event.h @@ -0,0 +1,18 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#ifndef _UAPI__LINUX_BPF_PERF_EVENT_H__ +#define _UAPI__LINUX_BPF_PERF_EVENT_H__ + +#include +#include + +struct bpf_perf_event_data { + struct pt_regs regs; + __u64 sample_period; +}; + +#endif /* _UAPI__LINUX_BPF_PERF_EVENT_H__ */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index abb61f3f6900..48c2705db22c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2333,7 +2333,8 @@ static int do_check(struct verifier_env *env) if (err) return err; - if (BPF_SIZE(insn->code) != BPF_W) { + if (BPF_SIZE(insn->code) != BPF_W && + BPF_SIZE(insn->code) != BPF_DW) { insn_idx++; continue; } @@ -2510,6 +2511,20 @@ static int do_check(struct verifier_env *env) return 0; } +static int check_map_prog_compatibility(struct bpf_map *map, + struct bpf_prog *prog) + +{ + if (prog->type == BPF_PROG_TYPE_PERF_EVENT && + (map->map_type == BPF_MAP_TYPE_HASH || + map->map_type == BPF_MAP_TYPE_PERCPU_HASH) && + (map->map_flags & BPF_F_NO_PREALLOC)) { + verbose("perf_event programs can only use preallocated hash map\n"); + return -EINVAL; + } + return 0; +} + /* look for pseudo eBPF instructions that access map FDs and * replace them with actual map pointers */ @@ -2517,7 +2532,7 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env) { struct bpf_insn *insn = env->prog->insnsi; int insn_cnt = env->prog->len; - int i, j; + int i, j, err; for (i = 0; i < insn_cnt; i++, insn++) { if (BPF_CLASS(insn->code) == BPF_LDX && @@ -2561,6 +2576,12 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env) return PTR_ERR(map); } + err = check_map_prog_compatibility(map, env->prog); + if (err) { + fdput(f); + return err; + } + /* store map pointer inside BPF_LD_IMM64 instruction */ insn[0].imm = (u32) (unsigned long) map; insn[1].imm = ((u64) (unsigned long) map) >> 32; @@ -2642,9 +2663,11 @@ static int convert_ctx_accesses(struct verifier_env *env) for (i = 0; i < insn_cnt; i++, insn++) { u32 insn_delta, cnt; - if (insn->code == (BPF_LDX | BPF_MEM | BPF_W)) + if (insn->code == (BPF_LDX | BPF_MEM | BPF_W) || + insn->code == (BPF_LDX | BPF_MEM | BPF_DW)) type = BPF_READ; - else if (insn->code == (BPF_STX | BPF_MEM | BPF_W)) + else if (insn->code == (BPF_STX | BPF_MEM | BPF_W) || + insn->code == (BPF_STX | BPF_MEM | BPF_DW)) type = BPF_WRITE; else continue; diff --git a/kernel/events/core.c b/kernel/events/core.c index 3cfabdf7b942..85bf4c37911f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7022,7 +7022,7 @@ static int __perf_event_overflow(struct perf_event *event, irq_work_queue(&event->pending); } - event->overflow_handler(event, data, regs); + READ_ONCE(event->overflow_handler)(event, data, regs); if (*perf_event_fasync(event) && event->pending_kill) { event->pending_wakeup = 1; @@ -7637,11 +7637,83 @@ static void perf_event_free_filter(struct perf_event *event) ftrace_profile_free_filter(event); } +#ifdef CONFIG_BPF_SYSCALL +static void bpf_overflow_handler(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct bpf_perf_event_data_kern ctx = { + .data = data, + .regs = regs, + }; + int ret = 0; + + preempt_disable(); + if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) + goto out; + rcu_read_lock(); + ret = BPF_PROG_RUN(event->prog, (void *)&ctx); + rcu_read_unlock(); +out: + __this_cpu_dec(bpf_prog_active); + preempt_enable(); + if (!ret) + return; + + event->orig_overflow_handler(event, data, regs); +} + +static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd) +{ + struct bpf_prog *prog; + + if (event->overflow_handler_context) + /* hw breakpoint or kernel counter */ + return -EINVAL; + + if (event->prog) + return -EEXIST; + + prog = bpf_prog_get_type(prog_fd, BPF_PROG_TYPE_PERF_EVENT); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + event->prog = prog; + event->orig_overflow_handler = READ_ONCE(event->overflow_handler); + WRITE_ONCE(event->overflow_handler, bpf_overflow_handler); + return 0; +} + +static void perf_event_free_bpf_handler(struct perf_event *event) +{ + struct bpf_prog *prog = event->prog; + + if (!prog) + return; + + WRITE_ONCE(event->overflow_handler, event->orig_overflow_handler); + event->prog = NULL; + bpf_prog_put(prog); +} +#else +static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd) +{ + return -EOPNOTSUPP; +} +static void perf_event_free_bpf_handler(struct perf_event *event) +{ +} +#endif + static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) { bool is_kprobe, is_tracepoint; struct bpf_prog *prog; + if (event->attr.type == PERF_TYPE_HARDWARE || + event->attr.type == PERF_TYPE_SOFTWARE) + return perf_event_set_bpf_handler(event, prog_fd); + if (event->attr.type != PERF_TYPE_TRACEPOINT) return -EINVAL; @@ -7682,6 +7754,8 @@ static void perf_event_free_bpf_prog(struct perf_event *event) { struct bpf_prog *prog; + perf_event_free_bpf_handler(event); + if (!event->tp_event) return; @@ -8998,6 +9072,19 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (!overflow_handler && parent_event) { overflow_handler = parent_event->overflow_handler; context = parent_event->overflow_handler_context; +#ifdef CONFIG_BPF_SYSCALL + if (overflow_handler == bpf_overflow_handler) { + struct bpf_prog *prog = bpf_prog_inc(parent_event->prog); + + if (IS_ERR(prog)) { + err = PTR_ERR(prog); + goto err_ns; + } + event->prog = prog; + event->orig_overflow_handler = + parent_event->orig_overflow_handler; + } +#endif } if (overflow_handler) { diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index ad35213b8405..d3869b03d9fe 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1,4 +1,5 @@ /* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com + * Copyright (c) 2016 Facebook * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -8,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -552,10 +554,69 @@ static struct bpf_prog_type_list tracepoint_tl = { .type = BPF_PROG_TYPE_TRACEPOINT, }; +static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, + enum bpf_reg_type *reg_type) +{ + if (off < 0 || off >= sizeof(struct bpf_perf_event_data)) + return false; + if (type != BPF_READ) + return false; + if (off % size != 0) + return false; + if (off == offsetof(struct bpf_perf_event_data, sample_period)) { + if (size != sizeof(u64)) + return false; + } else { + if (size != sizeof(long)) + return false; + } + return true; +} + +static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, int dst_reg, + int src_reg, int ctx_off, + struct bpf_insn *insn_buf, + struct bpf_prog *prog) +{ + struct bpf_insn *insn = insn_buf; + + switch (ctx_off) { + case offsetof(struct bpf_perf_event_data, sample_period): + BUILD_BUG_ON(FIELD_SIZEOF(struct perf_sample_data, period) != sizeof(u64)); + *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct bpf_perf_event_data_kern, data)), + dst_reg, src_reg, + offsetof(struct bpf_perf_event_data_kern, data)); + *insn++ = BPF_LDX_MEM(BPF_DW, dst_reg, dst_reg, + offsetof(struct perf_sample_data, period)); + break; + default: + *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct bpf_perf_event_data_kern, regs)), + dst_reg, src_reg, + offsetof(struct bpf_perf_event_data_kern, regs)); + *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(sizeof(long)), + dst_reg, dst_reg, ctx_off); + break; + } + + return insn - insn_buf; +} + +static const struct bpf_verifier_ops perf_event_prog_ops = { + .get_func_proto = tp_prog_func_proto, + .is_valid_access = pe_prog_is_valid_access, + .convert_ctx_access = pe_prog_convert_ctx_access, +}; + +static struct bpf_prog_type_list perf_event_tl = { + .ops = &perf_event_prog_ops, + .type = BPF_PROG_TYPE_PERF_EVENT, +}; + static int __init register_kprobe_prog_ops(void) { bpf_register_prog_type(&kprobe_tl); bpf_register_prog_type(&tracepoint_tl); + bpf_register_prog_type(&perf_event_tl); return 0; } late_initcall(register_kprobe_prog_ops); diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index db3cb061bfcd..12b7304d55dc 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -25,6 +25,8 @@ hostprogs-y += test_cgrp2_array_pin hostprogs-y += xdp1 hostprogs-y += xdp2 hostprogs-y += test_current_task_under_cgroup +hostprogs-y += trace_event +hostprogs-y += sampleip test_verifier-objs := test_verifier.o libbpf.o test_maps-objs := test_maps.o libbpf.o @@ -52,6 +54,8 @@ xdp1-objs := bpf_load.o libbpf.o xdp1_user.o xdp2-objs := bpf_load.o libbpf.o xdp1_user.o test_current_task_under_cgroup-objs := bpf_load.o libbpf.o \ test_current_task_under_cgroup_user.o +trace_event-objs := bpf_load.o libbpf.o trace_event_user.o +sampleip-objs := bpf_load.o libbpf.o sampleip_user.o # Tell kbuild to always build the programs always := $(hostprogs-y) @@ -79,6 +83,8 @@ always += test_cgrp2_tc_kern.o always += xdp1_kern.o always += xdp2_kern.o always += test_current_task_under_cgroup_kern.o +always += trace_event_kern.o +always += sampleip_kern.o HOSTCFLAGS += -I$(objtree)/usr/include @@ -103,6 +109,8 @@ HOSTLOADLIBES_test_overhead += -lelf -lrt HOSTLOADLIBES_xdp1 += -lelf HOSTLOADLIBES_xdp2 += -lelf HOSTLOADLIBES_test_current_task_under_cgroup += -lelf +HOSTLOADLIBES_trace_event += -lelf +HOSTLOADLIBES_sampleip += -lelf # Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline: # make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h index bbdf62a1e45e..90f44bd2045e 100644 --- a/samples/bpf/bpf_helpers.h +++ b/samples/bpf/bpf_helpers.h @@ -55,6 +55,8 @@ static int (*bpf_skb_get_tunnel_opt)(void *ctx, void *md, int size) = (void *) BPF_FUNC_skb_get_tunnel_opt; static int (*bpf_skb_set_tunnel_opt)(void *ctx, void *md, int size) = (void *) BPF_FUNC_skb_set_tunnel_opt; +static unsigned long long (*bpf_get_prandom_u32)(void) = + (void *) BPF_FUNC_get_prandom_u32; /* llvm builtin functions that eBPF C program may use to * emit BPF_LD_ABS and BPF_LD_IND instructions diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c index 0cfda2320320..97913e109b14 100644 --- a/samples/bpf/bpf_load.c +++ b/samples/bpf/bpf_load.c @@ -51,6 +51,7 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) bool is_kretprobe = strncmp(event, "kretprobe/", 10) == 0; bool is_tracepoint = strncmp(event, "tracepoint/", 11) == 0; bool is_xdp = strncmp(event, "xdp", 3) == 0; + bool is_perf_event = strncmp(event, "perf_event", 10) == 0; enum bpf_prog_type prog_type; char buf[256]; int fd, efd, err, id; @@ -69,6 +70,8 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) prog_type = BPF_PROG_TYPE_TRACEPOINT; } else if (is_xdp) { prog_type = BPF_PROG_TYPE_XDP; + } else if (is_perf_event) { + prog_type = BPF_PROG_TYPE_PERF_EVENT; } else { printf("Unknown event '%s'\n", event); return -1; @@ -82,7 +85,7 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) prog_fd[prog_cnt++] = fd; - if (is_xdp) + if (is_xdp || is_perf_event) return 0; if (is_socket) { @@ -326,6 +329,7 @@ int load_bpf_file(char *path) memcmp(shname_prog, "kretprobe/", 10) == 0 || memcmp(shname_prog, "tracepoint/", 11) == 0 || memcmp(shname_prog, "xdp", 3) == 0 || + memcmp(shname_prog, "perf_event", 10) == 0 || memcmp(shname_prog, "socket", 6) == 0) load_and_attach(shname_prog, insns, data_prog->d_size); } @@ -344,6 +348,7 @@ int load_bpf_file(char *path) memcmp(shname, "kretprobe/", 10) == 0 || memcmp(shname, "tracepoint/", 11) == 0 || memcmp(shname, "xdp", 3) == 0 || + memcmp(shname, "perf_event", 10) == 0 || memcmp(shname, "socket", 6) == 0) load_and_attach(shname, data->d_buf, data->d_size); } diff --git a/samples/bpf/sampleip_kern.c b/samples/bpf/sampleip_kern.c new file mode 100644 index 000000000000..774a681f374a --- /dev/null +++ b/samples/bpf/sampleip_kern.c @@ -0,0 +1,38 @@ +/* Copyright 2016 Netflix, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include "bpf_helpers.h" + +#define MAX_IPS 8192 + +struct bpf_map_def SEC("maps") ip_map = { + .type = BPF_MAP_TYPE_HASH, + .key_size = sizeof(u64), + .value_size = sizeof(u32), + .max_entries = MAX_IPS, +}; + +SEC("perf_event") +int do_sample(struct bpf_perf_event_data *ctx) +{ + u64 ip; + u32 *value, init_val = 1; + + ip = ctx->regs.ip; + value = bpf_map_lookup_elem(&ip_map, &ip); + if (value) + *value += 1; + else + /* E2BIG not tested for this example only */ + bpf_map_update_elem(&ip_map, &ip, &init_val, BPF_NOEXIST); + + return 0; +} +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/sampleip_user.c b/samples/bpf/sampleip_user.c new file mode 100644 index 000000000000..260a6bdd6413 --- /dev/null +++ b/samples/bpf/sampleip_user.c @@ -0,0 +1,196 @@ +/* + * sampleip: sample instruction pointer and frequency count in a BPF map. + * + * Copyright 2016 Netflix, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "libbpf.h" +#include "bpf_load.h" + +#define DEFAULT_FREQ 99 +#define DEFAULT_SECS 5 +#define MAX_IPS 8192 +#define PAGE_OFFSET 0xffff880000000000 + +static int nr_cpus; + +static void usage(void) +{ + printf("USAGE: sampleip [-F freq] [duration]\n"); + printf(" -F freq # sample frequency (Hertz), default 99\n"); + printf(" duration # sampling duration (seconds), default 5\n"); +} + +static int sampling_start(int *pmu_fd, int freq) +{ + int i; + + struct perf_event_attr pe_sample_attr = { + .type = PERF_TYPE_SOFTWARE, + .freq = 1, + .sample_period = freq, + .config = PERF_COUNT_SW_CPU_CLOCK, + .inherit = 1, + }; + + for (i = 0; i < nr_cpus; i++) { + pmu_fd[i] = perf_event_open(&pe_sample_attr, -1 /* pid */, i, + -1 /* group_fd */, 0 /* flags */); + if (pmu_fd[i] < 0) { + fprintf(stderr, "ERROR: Initializing perf sampling\n"); + return 1; + } + assert(ioctl(pmu_fd[i], PERF_EVENT_IOC_SET_BPF, + prog_fd[0]) == 0); + assert(ioctl(pmu_fd[i], PERF_EVENT_IOC_ENABLE, 0) == 0); + } + + return 0; +} + +static void sampling_end(int *pmu_fd) +{ + int i; + + for (i = 0; i < nr_cpus; i++) + close(pmu_fd[i]); +} + +struct ipcount { + __u64 ip; + __u32 count; +}; + +/* used for sorting */ +struct ipcount counts[MAX_IPS]; + +static int count_cmp(const void *p1, const void *p2) +{ + return ((struct ipcount *)p1)->count - ((struct ipcount *)p2)->count; +} + +static void print_ip_map(int fd) +{ + struct ksym *sym; + __u64 key, next_key; + __u32 value; + int i, max; + + printf("%-19s %-32s %s\n", "ADDR", "KSYM", "COUNT"); + + /* fetch IPs and counts */ + key = 0, i = 0; + while (bpf_get_next_key(fd, &key, &next_key) == 0) { + bpf_lookup_elem(fd, &next_key, &value); + counts[i].ip = next_key; + counts[i++].count = value; + key = next_key; + } + max = i; + + /* sort and print */ + qsort(counts, max, sizeof(struct ipcount), count_cmp); + for (i = 0; i < max; i++) { + if (counts[i].ip > PAGE_OFFSET) { + sym = ksym_search(counts[i].ip); + printf("0x%-17llx %-32s %u\n", counts[i].ip, sym->name, + counts[i].count); + } else { + printf("0x%-17llx %-32s %u\n", counts[i].ip, "(user)", + counts[i].count); + } + } + + if (max == MAX_IPS) { + printf("WARNING: IP hash was full (max %d entries); ", max); + printf("may have dropped samples\n"); + } +} + +static void int_exit(int sig) +{ + printf("\n"); + print_ip_map(map_fd[0]); + exit(0); +} + +int main(int argc, char **argv) +{ + char filename[256]; + int *pmu_fd, opt, freq = DEFAULT_FREQ, secs = DEFAULT_SECS; + + /* process arguments */ + while ((opt = getopt(argc, argv, "F:h")) != -1) { + switch (opt) { + case 'F': + freq = atoi(optarg); + break; + case 'h': + default: + usage(); + return 0; + } + } + if (argc - optind == 1) + secs = atoi(argv[optind]); + if (freq == 0 || secs == 0) { + usage(); + return 1; + } + + /* initialize kernel symbol translation */ + if (load_kallsyms()) { + fprintf(stderr, "ERROR: loading /proc/kallsyms\n"); + return 2; + } + + /* create perf FDs for each CPU */ + nr_cpus = sysconf(_SC_NPROCESSORS_CONF); + pmu_fd = malloc(nr_cpus * sizeof(int)); + if (pmu_fd == NULL) { + fprintf(stderr, "ERROR: malloc of pmu_fd\n"); + return 1; + } + + /* load BPF program */ + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + if (load_bpf_file(filename)) { + fprintf(stderr, "ERROR: loading BPF program (errno %d):\n", + errno); + if (strcmp(bpf_log_buf, "") == 0) + fprintf(stderr, "Try: ulimit -l unlimited\n"); + else + fprintf(stderr, "%s", bpf_log_buf); + return 1; + } + signal(SIGINT, int_exit); + + /* do sampling */ + printf("Sampling at %d Hertz for %d seconds. Ctrl-C also ends.\n", + freq, secs); + if (sampling_start(pmu_fd, freq) != 0) + return 1; + sleep(secs); + sampling_end(pmu_fd); + free(pmu_fd); + + /* output sample counts */ + print_ip_map(map_fd[0]); + + return 0; +} diff --git a/samples/bpf/trace_event_kern.c b/samples/bpf/trace_event_kern.c new file mode 100644 index 000000000000..71a8ed32823e --- /dev/null +++ b/samples/bpf/trace_event_kern.c @@ -0,0 +1,65 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include +#include "bpf_helpers.h" + +struct key_t { + char comm[TASK_COMM_LEN]; + u32 kernstack; + u32 userstack; +}; + +struct bpf_map_def SEC("maps") counts = { + .type = BPF_MAP_TYPE_HASH, + .key_size = sizeof(struct key_t), + .value_size = sizeof(u64), + .max_entries = 10000, +}; + +struct bpf_map_def SEC("maps") stackmap = { + .type = BPF_MAP_TYPE_STACK_TRACE, + .key_size = sizeof(u32), + .value_size = PERF_MAX_STACK_DEPTH * sizeof(u64), + .max_entries = 10000, +}; + +#define KERN_STACKID_FLAGS (0 | BPF_F_FAST_STACK_CMP) +#define USER_STACKID_FLAGS (0 | BPF_F_FAST_STACK_CMP | BPF_F_USER_STACK) + +SEC("perf_event") +int bpf_prog1(struct bpf_perf_event_data *ctx) +{ + char fmt[] = "CPU-%d period %lld ip %llx"; + u32 cpu = bpf_get_smp_processor_id(); + struct key_t key; + u64 *val, one = 1; + + if (ctx->sample_period < 10000) + /* ignore warmup */ + return 0; + bpf_get_current_comm(&key.comm, sizeof(key.comm)); + key.kernstack = bpf_get_stackid(ctx, &stackmap, KERN_STACKID_FLAGS); + key.userstack = bpf_get_stackid(ctx, &stackmap, USER_STACKID_FLAGS); + if ((int)key.kernstack < 0 && (int)key.userstack < 0) { + bpf_trace_printk(fmt, sizeof(fmt), cpu, ctx->sample_period, + ctx->regs.ip); + return 0; + } + + val = bpf_map_lookup_elem(&counts, &key); + if (val) + (*val)++; + else + bpf_map_update_elem(&counts, &key, &one, BPF_NOEXIST); + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/trace_event_user.c b/samples/bpf/trace_event_user.c new file mode 100644 index 000000000000..9a130d31ecf2 --- /dev/null +++ b/samples/bpf/trace_event_user.c @@ -0,0 +1,213 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "libbpf.h" +#include "bpf_load.h" + +#define SAMPLE_FREQ 50 + +static bool sys_read_seen, sys_write_seen; + +static void print_ksym(__u64 addr) +{ + struct ksym *sym; + + if (!addr) + return; + sym = ksym_search(addr); + printf("%s;", sym->name); + if (!strcmp(sym->name, "sys_read")) + sys_read_seen = true; + else if (!strcmp(sym->name, "sys_write")) + sys_write_seen = true; +} + +static void print_addr(__u64 addr) +{ + if (!addr) + return; + printf("%llx;", addr); +} + +#define TASK_COMM_LEN 16 + +struct key_t { + char comm[TASK_COMM_LEN]; + __u32 kernstack; + __u32 userstack; +}; + +static void print_stack(struct key_t *key, __u64 count) +{ + __u64 ip[PERF_MAX_STACK_DEPTH] = {}; + static bool warned; + int i; + + printf("%3lld %s;", count, key->comm); + if (bpf_lookup_elem(map_fd[1], &key->kernstack, ip) != 0) { + printf("---;"); + } else { + for (i = PERF_MAX_STACK_DEPTH - 1; i >= 0; i--) + print_ksym(ip[i]); + } + printf("-;"); + if (bpf_lookup_elem(map_fd[1], &key->userstack, ip) != 0) { + printf("---;"); + } else { + for (i = PERF_MAX_STACK_DEPTH - 1; i >= 0; i--) + print_addr(ip[i]); + } + printf("\n"); + + if (key->kernstack == -EEXIST && !warned) { + printf("stackmap collisions seen. Consider increasing size\n"); + warned = true; + } else if ((int)key->kernstack < 0 && (int)key->userstack < 0) { + printf("err stackid %d %d\n", key->kernstack, key->userstack); + } +} + +static void int_exit(int sig) +{ + kill(0, SIGKILL); + exit(0); +} + +static void print_stacks(void) +{ + struct key_t key = {}, next_key; + __u64 value; + __u32 stackid = 0, next_id; + int fd = map_fd[0], stack_map = map_fd[1]; + + sys_read_seen = sys_write_seen = false; + while (bpf_get_next_key(fd, &key, &next_key) == 0) { + bpf_lookup_elem(fd, &next_key, &value); + print_stack(&next_key, value); + bpf_delete_elem(fd, &next_key); + key = next_key; + } + + if (!sys_read_seen || !sys_write_seen) { + printf("BUG kernel stack doesn't contain sys_read() and sys_write()\n"); + int_exit(0); + } + + /* clear stack map */ + while (bpf_get_next_key(stack_map, &stackid, &next_id) == 0) { + bpf_delete_elem(stack_map, &next_id); + stackid = next_id; + } +} + +static void test_perf_event_all_cpu(struct perf_event_attr *attr) +{ + int nr_cpus = sysconf(_SC_NPROCESSORS_CONF); + int *pmu_fd = malloc(nr_cpus * sizeof(int)); + int i; + + /* open perf_event on all cpus */ + for (i = 0; i < nr_cpus; i++) { + pmu_fd[i] = perf_event_open(attr, -1, i, -1, 0); + if (pmu_fd[i] < 0) { + printf("perf_event_open failed\n"); + goto all_cpu_err; + } + assert(ioctl(pmu_fd[i], PERF_EVENT_IOC_SET_BPF, prog_fd[0]) == 0); + assert(ioctl(pmu_fd[i], PERF_EVENT_IOC_ENABLE, 0) == 0); + } + system("dd if=/dev/zero of=/dev/null count=5000k"); + print_stacks(); +all_cpu_err: + for (i--; i >= 0; i--) + close(pmu_fd[i]); + free(pmu_fd); +} + +static void test_perf_event_task(struct perf_event_attr *attr) +{ + int pmu_fd; + + /* open task bound event */ + pmu_fd = perf_event_open(attr, 0, -1, -1, 0); + if (pmu_fd < 0) { + printf("perf_event_open failed\n"); + return; + } + assert(ioctl(pmu_fd, PERF_EVENT_IOC_SET_BPF, prog_fd[0]) == 0); + assert(ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0) == 0); + system("dd if=/dev/zero of=/dev/null count=5000k"); + print_stacks(); + close(pmu_fd); +} + +static void test_bpf_perf_event(void) +{ + struct perf_event_attr attr_type_hw = { + .sample_freq = SAMPLE_FREQ, + .freq = 1, + .type = PERF_TYPE_HARDWARE, + .config = PERF_COUNT_HW_CPU_CYCLES, + .inherit = 1, + }; + struct perf_event_attr attr_type_sw = { + .sample_freq = SAMPLE_FREQ, + .freq = 1, + .type = PERF_TYPE_SOFTWARE, + .config = PERF_COUNT_SW_CPU_CLOCK, + .inherit = 1, + }; + + test_perf_event_all_cpu(&attr_type_hw); + test_perf_event_task(&attr_type_hw); + test_perf_event_all_cpu(&attr_type_sw); + test_perf_event_task(&attr_type_sw); +} + + +int main(int argc, char **argv) +{ + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + char filename[256]; + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + setrlimit(RLIMIT_MEMLOCK, &r); + + signal(SIGINT, int_exit); + + if (load_kallsyms()) { + printf("failed to process /proc/kallsyms\n"); + return 1; + } + + if (load_bpf_file(filename)) { + printf("%s", bpf_log_buf); + return 2; + } + + if (fork() == 0) { + read_trace_pipe(); + return 0; + } + test_bpf_perf_event(); + + int_exit(0); + return 0; +}