From 0cd3cbed3caf6eae3bc0fa4afa4f26a9babfe55a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 3 May 2018 18:37:08 -0700 Subject: [PATCH 01/10] bpf: offload: allow offloaded programs to use perf event arrays BPF_MAP_TYPE_PERF_EVENT_ARRAY is special as far as offload goes. The map only holds glue to perf ring, not actual data. Allow non-offloaded perf event arrays to be used in offloaded programs. Offload driver can extract the events from HW and put them in the map for user space to retrieve. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Reviewed-by: Jiong Wang Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 5 +++++ kernel/bpf/offload.c | 6 ++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 0e00a13ff01b..321969da67b7 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -110,6 +110,11 @@ static inline struct bpf_offloaded_map *map_to_offmap(struct bpf_map *map) return container_of(map, struct bpf_offloaded_map, map); } +static inline bool bpf_map_offload_neutral(const struct bpf_map *map) +{ + return map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY; +} + static inline bool bpf_map_support_seq_show(const struct bpf_map *map) { return map->ops->map_seq_show_elem && map->ops->map_check_btf; diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index c9401075b58c..ac747d5cf7c6 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2017 Netronome Systems, Inc. + * Copyright (C) 2017-2018 Netronome Systems, Inc. * * This software is licensed under the GNU General License Version 2, * June 1991 as shown in the file COPYING in the top-level directory of this @@ -474,8 +474,10 @@ bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map) struct bpf_prog_offload *offload; bool ret; - if (!bpf_prog_is_dev_bound(prog->aux) || !bpf_map_is_dev_bound(map)) + if (!bpf_prog_is_dev_bound(prog->aux)) return false; + if (!bpf_map_is_dev_bound(map)) + return bpf_map_offload_neutral(map); down_read(&bpf_devs_lock); offload = prog->aux->offload; From 630a4d3874a06aa9f9481cbfc688981aad7a834c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 3 May 2018 18:37:09 -0700 Subject: [PATCH 02/10] nfp: bpf: record offload neutral maps in the driver For asynchronous events originating from the device, like perf event output, we need to be able to make sure that objects being referred to by the FW message are valid on the host. FW events can get queued and reordered. Even if we had a FW message "barrier" we should still protect ourselves from bogus FW output. Add a reverse-mapping hash table and record in it all raw map pointers FW may refer to. Only record neutral maps, i.e. perf event arrays. These are currently the only objects FW can refer to. Use RCU protection on the read side, update side is under RTNL. Since program vs map destruction order is slightly painful for offload simply take an extra reference on all the recorded maps to make sure they don't disappear. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- drivers/net/ethernet/netronome/nfp/bpf/main.c | 25 +++- drivers/net/ethernet/netronome/nfp/bpf/main.h | 20 ++- .../net/ethernet/netronome/nfp/bpf/offload.c | 125 +++++++++++++++++- drivers/net/ethernet/netronome/nfp/nfp_app.c | 2 +- kernel/bpf/syscall.c | 2 + 5 files changed, 168 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.c b/drivers/net/ethernet/netronome/nfp/bpf/main.c index 1dc424685f4e..f65df341c557 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/main.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2017 Netronome Systems, Inc. + * Copyright (C) 2017-2018 Netronome Systems, Inc. * * This software is dual licensed under the GNU General License Version 2, * June 1991 as shown in the file COPYING in the top-level directory of this @@ -43,6 +43,14 @@ #include "fw.h" #include "main.h" +const struct rhashtable_params nfp_bpf_maps_neutral_params = { + .nelem_hint = 4, + .key_len = FIELD_SIZEOF(struct nfp_bpf_neutral_map, ptr), + .key_offset = offsetof(struct nfp_bpf_neutral_map, ptr), + .head_offset = offsetof(struct nfp_bpf_neutral_map, l), + .automatic_shrinking = true, +}; + static bool nfp_net_ebpf_capable(struct nfp_net *nn) { #ifdef __LITTLE_ENDIAN @@ -401,17 +409,28 @@ static int nfp_bpf_init(struct nfp_app *app) init_waitqueue_head(&bpf->cmsg_wq); INIT_LIST_HEAD(&bpf->map_list); - err = nfp_bpf_parse_capabilities(app); + err = rhashtable_init(&bpf->maps_neutral, &nfp_bpf_maps_neutral_params); if (err) goto err_free_bpf; + err = nfp_bpf_parse_capabilities(app); + if (err) + goto err_free_neutral_maps; + return 0; +err_free_neutral_maps: + rhashtable_destroy(&bpf->maps_neutral); err_free_bpf: kfree(bpf); return err; } +static void nfp_check_rhashtable_empty(void *ptr, void *arg) +{ + WARN_ON_ONCE(1); +} + static void nfp_bpf_clean(struct nfp_app *app) { struct nfp_app_bpf *bpf = app->priv; @@ -419,6 +438,8 @@ static void nfp_bpf_clean(struct nfp_app *app) WARN_ON(!skb_queue_empty(&bpf->cmsg_replies)); WARN_ON(!list_empty(&bpf->map_list)); WARN_ON(bpf->maps_in_use || bpf->map_elems_in_use); + rhashtable_free_and_destroy(&bpf->maps_neutral, + nfp_check_rhashtable_empty, NULL); kfree(bpf); } diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h index 68b5d326483d..5db6284a44ba 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/main.h +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2016-2017 Netronome Systems, Inc. + * Copyright (C) 2016-2018 Netronome Systems, Inc. * * This software is dual licensed under the GNU General License Version 2, * June 1991 as shown in the file COPYING in the top-level directory of this @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -114,6 +115,8 @@ enum pkt_vec { * @maps_in_use: number of currently offloaded maps * @map_elems_in_use: number of elements allocated to offloaded maps * + * @maps_neutral: hash table of offload-neutral maps (on pointer) + * * @adjust_head: adjust head capability * @adjust_head.flags: extra flags for adjust head * @adjust_head.off_min: minimal packet offset within buffer required @@ -150,6 +153,8 @@ struct nfp_app_bpf { unsigned int maps_in_use; unsigned int map_elems_in_use; + struct rhashtable maps_neutral; + struct nfp_bpf_cap_adjust_head { u32 flags; int off_min; @@ -199,6 +204,14 @@ struct nfp_bpf_map { enum nfp_bpf_map_use use_map[]; }; +struct nfp_bpf_neutral_map { + struct rhash_head l; + struct bpf_map *ptr; + u32 count; +}; + +extern const struct rhashtable_params nfp_bpf_maps_neutral_params; + struct nfp_prog; struct nfp_insn_meta; typedef int (*instr_cb_t)(struct nfp_prog *, struct nfp_insn_meta *); @@ -367,6 +380,8 @@ static inline bool is_mbpf_xadd(const struct nfp_insn_meta *meta) * @error: error code if something went wrong * @stack_depth: max stack depth from the verifier * @adjust_head_location: if program has single adjust head call - the insn no. + * @map_records_cnt: the number of map pointers recorded for this prog + * @map_records: the map record pointers from bpf->maps_neutral * @insns: list of BPF instruction wrappers (struct nfp_insn_meta) */ struct nfp_prog { @@ -390,6 +405,9 @@ struct nfp_prog { unsigned int stack_depth; unsigned int adjust_head_location; + unsigned int map_records_cnt; + struct nfp_bpf_neutral_map **map_records; + struct list_head insns; }; diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c index 42d98792bd25..e3ead906cc60 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2016-2017 Netronome Systems, Inc. + * Copyright (C) 2016-2018 Netronome Systems, Inc. * * This software is dual licensed under the GNU General License Version 2, * June 1991 as shown in the file COPYING in the top-level directory of this @@ -56,6 +56,126 @@ #include "../nfp_net_ctrl.h" #include "../nfp_net.h" +static int +nfp_map_ptr_record(struct nfp_app_bpf *bpf, struct nfp_prog *nfp_prog, + struct bpf_map *map) +{ + struct nfp_bpf_neutral_map *record; + int err; + + /* Map record paths are entered via ndo, update side is protected. */ + ASSERT_RTNL(); + + /* Reuse path - other offloaded program is already tracking this map. */ + record = rhashtable_lookup_fast(&bpf->maps_neutral, &map, + nfp_bpf_maps_neutral_params); + if (record) { + nfp_prog->map_records[nfp_prog->map_records_cnt++] = record; + record->count++; + return 0; + } + + /* Grab a single ref to the map for our record. The prog destroy ndo + * happens after free_used_maps(). + */ + map = bpf_map_inc(map, false); + if (IS_ERR(map)) + return PTR_ERR(map); + + record = kmalloc(sizeof(*record), GFP_KERNEL); + if (!record) { + err = -ENOMEM; + goto err_map_put; + } + + record->ptr = map; + record->count = 1; + + err = rhashtable_insert_fast(&bpf->maps_neutral, &record->l, + nfp_bpf_maps_neutral_params); + if (err) + goto err_free_rec; + + nfp_prog->map_records[nfp_prog->map_records_cnt++] = record; + + return 0; + +err_free_rec: + kfree(record); +err_map_put: + bpf_map_put(map); + return err; +} + +static void +nfp_map_ptrs_forget(struct nfp_app_bpf *bpf, struct nfp_prog *nfp_prog) +{ + bool freed = false; + int i; + + ASSERT_RTNL(); + + for (i = 0; i < nfp_prog->map_records_cnt; i++) { + if (--nfp_prog->map_records[i]->count) { + nfp_prog->map_records[i] = NULL; + continue; + } + + WARN_ON(rhashtable_remove_fast(&bpf->maps_neutral, + &nfp_prog->map_records[i]->l, + nfp_bpf_maps_neutral_params)); + freed = true; + } + + if (freed) { + synchronize_rcu(); + + for (i = 0; i < nfp_prog->map_records_cnt; i++) + if (nfp_prog->map_records[i]) { + bpf_map_put(nfp_prog->map_records[i]->ptr); + kfree(nfp_prog->map_records[i]); + } + } + + kfree(nfp_prog->map_records); + nfp_prog->map_records = NULL; + nfp_prog->map_records_cnt = 0; +} + +static int +nfp_map_ptrs_record(struct nfp_app_bpf *bpf, struct nfp_prog *nfp_prog, + struct bpf_prog *prog) +{ + int i, cnt, err; + + /* Quickly count the maps we will have to remember */ + cnt = 0; + for (i = 0; i < prog->aux->used_map_cnt; i++) + if (bpf_map_offload_neutral(prog->aux->used_maps[i])) + cnt++; + if (!cnt) + return 0; + + nfp_prog->map_records = kmalloc_array(cnt, + sizeof(nfp_prog->map_records[0]), + GFP_KERNEL); + if (!nfp_prog->map_records) + return -ENOMEM; + + for (i = 0; i < prog->aux->used_map_cnt; i++) + if (bpf_map_offload_neutral(prog->aux->used_maps[i])) { + err = nfp_map_ptr_record(bpf, nfp_prog, + prog->aux->used_maps[i]); + if (err) { + nfp_map_ptrs_forget(bpf, nfp_prog); + return err; + } + } + WARN_ON(cnt != nfp_prog->map_records_cnt); + + return 0; +} + static int nfp_prog_prepare(struct nfp_prog *nfp_prog, const struct bpf_insn *prog, unsigned int cnt) @@ -151,7 +271,7 @@ static int nfp_bpf_translate(struct nfp_net *nn, struct bpf_prog *prog) prog->aux->offload->jited_len = nfp_prog->prog_len * sizeof(u64); prog->aux->offload->jited_image = nfp_prog->prog; - return 0; + return nfp_map_ptrs_record(nfp_prog->bpf, nfp_prog, prog); } static int nfp_bpf_destroy(struct nfp_net *nn, struct bpf_prog *prog) @@ -159,6 +279,7 @@ static int nfp_bpf_destroy(struct nfp_net *nn, struct bpf_prog *prog) struct nfp_prog *nfp_prog = prog->aux->offload->dev_priv; kvfree(nfp_prog->prog); + nfp_map_ptrs_forget(nfp_prog->bpf, nfp_prog); nfp_prog_free(nfp_prog); return 0; diff --git a/drivers/net/ethernet/netronome/nfp/nfp_app.c b/drivers/net/ethernet/netronome/nfp/nfp_app.c index 6aedef0ad433..0e0253c7e17b 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_app.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_app.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2017 Netronome Systems, Inc. + * Copyright (C) 2017-2018 Netronome Systems, Inc. * * This software is dual licensed under the GNU General License Version 2, * June 1991 as shown in the file COPYING in the top-level directory of this diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 263e13ede029..9b87198deea2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -282,6 +282,7 @@ void bpf_map_put(struct bpf_map *map) { __bpf_map_put(map, true); } +EXPORT_SYMBOL_GPL(bpf_map_put); void bpf_map_put_with_uref(struct bpf_map *map) { @@ -543,6 +544,7 @@ struct bpf_map *bpf_map_inc(struct bpf_map *map, bool uref) atomic_inc(&map->usercnt); return map; } +EXPORT_SYMBOL_GPL(bpf_map_inc); struct bpf_map *bpf_map_get_with_uref(u32 ufd) { From 6cb5fb3891db9d3f6feb0f7669946550c9abc420 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 3 May 2018 18:37:10 -0700 Subject: [PATCH 03/10] bpf: export bpf_event_output() bpf_event_output() is useful for offloads to add events to BPF event rings, export it. Note that export is placed near the stub since tracing is optional and kernel/bpf/core.c is always going to be built. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Reviewed-by: Jiong Wang Signed-off-by: Daniel Borkmann --- kernel/bpf/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 1127552c8033..d0d7d9462368 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1719,6 +1719,7 @@ bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, { return -ENOTSUPP; } +EXPORT_SYMBOL_GPL(bpf_event_output); /* Always built-in helper functions. */ const struct bpf_func_proto bpf_tail_call_proto = { From f4e3ec0d573e238f383b3da365127002579a07d6 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 3 May 2018 18:37:11 -0700 Subject: [PATCH 04/10] bpf: replace map pointer loads before calling into offloads Offloads may find host map pointers more useful than map fds. Map pointers can be used to identify the map, while fds are only valid within the context of loading process. Jump to skip_full_check on error in case verifier log overflow has to be handled (replace_map_fd_with_map_ptr() prints to the log, driver prep may do that too in the future). Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Reviewed-by: Jiong Wang Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 37e0affa515e..ccac552ac861 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5741,16 +5741,16 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) env->strict_alignment = true; - if (bpf_prog_is_dev_bound(env->prog->aux)) { - ret = bpf_prog_offload_verifier_prep(env); - if (ret) - goto err_unlock; - } - ret = replace_map_fd_with_map_ptr(env); if (ret < 0) goto skip_full_check; + if (bpf_prog_is_dev_bound(env->prog->aux)) { + ret = bpf_prog_offload_verifier_prep(env); + if (ret) + goto skip_full_check; + } + env->explored_states = kcalloc(env->prog->len, sizeof(struct bpf_verifier_state_list *), GFP_USER); From 9816dd35ececc095f3e3be29d30d3adc755908d9 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 3 May 2018 18:37:12 -0700 Subject: [PATCH 05/10] nfp: bpf: perf event output helpers support Add support for the perf_event_output family of helpers. The implementation on the NFP will not match the host code exactly. The state of the host map and rings is unknown to the device, hence device can't return errors when rings are not installed. The device simply packs the data into a firmware notification message and sends it over to the host, returning success to the program. There is no notion of a host CPU on the device when packets are being processed. Device will only offload programs which set BPF_F_CURRENT_CPU. Still, if map index doesn't match CPU no error will be returned (see above). Dropped/lost firmware notification messages will not cause "lost events" event on the perf ring, they are only visible via device error counters. Firmware notification messages may also get reordered in respect to the packets which caused their generation. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- drivers/net/ethernet/netronome/nfp/bpf/cmsg.c | 16 ++++- drivers/net/ethernet/netronome/nfp/bpf/fw.h | 20 +++++- drivers/net/ethernet/netronome/nfp/bpf/jit.c | 32 ++++++++- drivers/net/ethernet/netronome/nfp/bpf/main.c | 3 + drivers/net/ethernet/netronome/nfp/bpf/main.h | 4 ++ .../net/ethernet/netronome/nfp/bpf/offload.c | 47 +++++++++++++ .../net/ethernet/netronome/nfp/bpf/verifier.c | 69 ++++++++++++++++++- 7 files changed, 187 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c b/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c index 7e298148ca26..cb87fccb9f6a 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2017 Netronome Systems, Inc. + * Copyright (C) 2017-2018 Netronome Systems, Inc. * * This software is dual licensed under the GNU General License Version 2, * June 1991 as shown in the file COPYING in the top-level directory of this @@ -102,6 +102,15 @@ nfp_bpf_cmsg_map_req_alloc(struct nfp_app_bpf *bpf, unsigned int n) return nfp_bpf_cmsg_alloc(bpf, size); } +static u8 nfp_bpf_cmsg_get_type(struct sk_buff *skb) +{ + struct cmsg_hdr *hdr; + + hdr = (struct cmsg_hdr *)skb->data; + + return hdr->type; +} + static unsigned int nfp_bpf_cmsg_get_tag(struct sk_buff *skb) { struct cmsg_hdr *hdr; @@ -431,6 +440,11 @@ void nfp_bpf_ctrl_msg_rx(struct nfp_app *app, struct sk_buff *skb) goto err_free; } + if (nfp_bpf_cmsg_get_type(skb) == CMSG_TYPE_BPF_EVENT) { + nfp_bpf_event_output(bpf, skb); + return; + } + nfp_ctrl_lock(bpf->app->ctrl); tag = nfp_bpf_cmsg_get_tag(skb); diff --git a/drivers/net/ethernet/netronome/nfp/bpf/fw.h b/drivers/net/ethernet/netronome/nfp/bpf/fw.h index 39639ac28b01..3dbc21653ce5 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/fw.h +++ b/drivers/net/ethernet/netronome/nfp/bpf/fw.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2017 Netronome Systems, Inc. + * Copyright (C) 2017-2018 Netronome Systems, Inc. * * This software is dual licensed under the GNU General License Version 2, * June 1991 as shown in the file COPYING in the top-level directory of this @@ -37,6 +37,14 @@ #include #include +/* Kernel's enum bpf_reg_type is not uABI so people may change it breaking + * our FW ABI. In that case we will do translation in the driver. + */ +#define NFP_BPF_SCALAR_VALUE 1 +#define NFP_BPF_MAP_VALUE 4 +#define NFP_BPF_STACK 6 +#define NFP_BPF_PACKET_DATA 8 + enum bpf_cap_tlv_type { NFP_BPF_CAP_TYPE_FUNC = 1, NFP_BPF_CAP_TYPE_ADJUST_HEAD = 2, @@ -81,6 +89,7 @@ enum nfp_bpf_cmsg_type { CMSG_TYPE_MAP_DELETE = 5, CMSG_TYPE_MAP_GETNEXT = 6, CMSG_TYPE_MAP_GETFIRST = 7, + CMSG_TYPE_BPF_EVENT = 8, __CMSG_TYPE_MAP_MAX, }; @@ -155,4 +164,13 @@ struct cmsg_reply_map_op { __be32 resv; struct cmsg_key_value_pair elem[0]; }; + +struct cmsg_bpf_event { + struct cmsg_hdr hdr; + __be32 cpu_id; + __be64 map_ptr; + __be32 data_size; + __be32 pkt_size; + u8 data[0]; +}; #endif diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c index 65f0791cae0c..2127cf1548dd 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2016-2017 Netronome Systems, Inc. + * Copyright (C) 2016-2018 Netronome Systems, Inc. * * This software is dual licensed under the GNU General License Version 2, * June 1991 as shown in the file COPYING in the top-level directory of this @@ -1456,6 +1456,31 @@ nfp_get_prandom_u32(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) return 0; } +static int +nfp_perf_event_output(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) +{ + swreg ptr_type; + u32 ret_tgt; + + ptr_type = ur_load_imm_any(nfp_prog, meta->arg1.type, imm_a(nfp_prog)); + + ret_tgt = nfp_prog_current_offset(nfp_prog) + 3; + + emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO + meta->func_id, + 2, RELO_BR_HELPER); + + /* Load ptr type into A1 */ + wrp_mov(nfp_prog, reg_a(1), ptr_type); + + /* Load the return address into B0 */ + wrp_immed_relo(nfp_prog, reg_b(0), ret_tgt, RELO_IMMED_REL); + + if (!nfp_prog_confirm_current_offset(nfp_prog, ret_tgt)) + return -EINVAL; + + return 0; +} + /* --- Callbacks --- */ static int mov_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) { @@ -2411,6 +2436,8 @@ static int call(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) return map_call_stack_common(nfp_prog, meta); case BPF_FUNC_get_prandom_u32: return nfp_get_prandom_u32(nfp_prog, meta); + case BPF_FUNC_perf_event_output: + return nfp_perf_event_output(nfp_prog, meta); default: WARN_ONCE(1, "verifier allowed unsupported function\n"); return -EOPNOTSUPP; @@ -3353,6 +3380,9 @@ void *nfp_bpf_relo_for_vnic(struct nfp_prog *nfp_prog, struct nfp_bpf_vnic *bv) case BPF_FUNC_map_delete_elem: val = nfp_prog->bpf->helpers.map_delete; break; + case BPF_FUNC_perf_event_output: + val = nfp_prog->bpf->helpers.perf_event_output; + break; default: pr_err("relocation of unknown helper %d\n", val); diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.c b/drivers/net/ethernet/netronome/nfp/bpf/main.c index f65df341c557..d72f9e7f42da 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/main.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.c @@ -298,6 +298,9 @@ nfp_bpf_parse_cap_func(struct nfp_app_bpf *bpf, void __iomem *value, u32 length) case BPF_FUNC_map_delete_elem: bpf->helpers.map_delete = readl(&cap->func_addr); break; + case BPF_FUNC_perf_event_output: + bpf->helpers.perf_event_output = readl(&cap->func_addr); + break; } return 0; diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h index 5db6284a44ba..82682378d57f 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/main.h +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h @@ -136,6 +136,7 @@ enum pkt_vec { * @helpers.map_lookup: map lookup helper address * @helpers.map_update: map update helper address * @helpers.map_delete: map delete helper address + * @helpers.perf_event_output: output perf event to a ring buffer * * @pseudo_random: FW initialized the pseudo-random machinery (CSRs) */ @@ -176,6 +177,7 @@ struct nfp_app_bpf { u32 map_lookup; u32 map_update; u32 map_delete; + u32 perf_event_output; } helpers; bool pseudo_random; @@ -458,5 +460,7 @@ int nfp_bpf_ctrl_lookup_entry(struct bpf_offloaded_map *offmap, int nfp_bpf_ctrl_getnext_entry(struct bpf_offloaded_map *offmap, void *key, void *next_key); +int nfp_bpf_event_output(struct nfp_app_bpf *bpf, struct sk_buff *skb); + void nfp_bpf_ctrl_msg_rx(struct nfp_app *app, struct sk_buff *skb); #endif diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c index e3ead906cc60..4db0ac1e42a8 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c @@ -441,6 +441,53 @@ int nfp_ndo_bpf(struct nfp_app *app, struct nfp_net *nn, struct netdev_bpf *bpf) } } +static unsigned long +nfp_bpf_perf_event_copy(void *dst, const void *src, + unsigned long off, unsigned long len) +{ + memcpy(dst, src + off, len); + return 0; +} + +int nfp_bpf_event_output(struct nfp_app_bpf *bpf, struct sk_buff *skb) +{ + struct cmsg_bpf_event *cbe = (void *)skb->data; + u32 pkt_size, data_size; + struct bpf_map *map; + + if (skb->len < sizeof(struct cmsg_bpf_event)) + goto err_drop; + + pkt_size = be32_to_cpu(cbe->pkt_size); + data_size = be32_to_cpu(cbe->data_size); + map = (void *)(unsigned long)be64_to_cpu(cbe->map_ptr); + + if (skb->len < sizeof(struct cmsg_bpf_event) + pkt_size + data_size) + goto err_drop; + if (cbe->hdr.ver != CMSG_MAP_ABI_VERSION) + goto err_drop; + + rcu_read_lock(); + if (!rhashtable_lookup_fast(&bpf->maps_neutral, &map, + nfp_bpf_maps_neutral_params)) { + rcu_read_unlock(); + pr_warn("perf event: dest map pointer %px not recognized, dropping event\n", + map); + goto err_drop; + } + + bpf_event_output(map, be32_to_cpu(cbe->cpu_id), + &cbe->data[round_up(pkt_size, 4)], data_size, + cbe->data, pkt_size, nfp_bpf_perf_event_copy); + rcu_read_unlock(); + + dev_consume_skb_any(skb); + return 0; +err_drop: + dev_kfree_skb_any(skb); + return -EINVAL; +} + static int nfp_net_bpf_load(struct nfp_net *nn, struct bpf_prog *prog, struct netlink_ext_ack *extack) diff --git a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c index 06ad53ce4ad9..c76b622743ae 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2016-2017 Netronome Systems, Inc. + * Copyright (C) 2016-2018 Netronome Systems, Inc. * * This software is dual licensed under the GNU General License Version 2, * June 1991 as shown in the file COPYING in the top-level directory of this @@ -36,6 +36,8 @@ #include #include +#include "../nfp_app.h" +#include "../nfp_main.h" #include "fw.h" #include "main.h" @@ -216,6 +218,71 @@ nfp_bpf_check_call(struct nfp_prog *nfp_prog, struct bpf_verifier_env *env, pr_vlog(env, "bpf_get_prandom_u32(): FW doesn't support random number generation\n"); return -EOPNOTSUPP; + case BPF_FUNC_perf_event_output: + BUILD_BUG_ON(NFP_BPF_SCALAR_VALUE != SCALAR_VALUE || + NFP_BPF_MAP_VALUE != PTR_TO_MAP_VALUE || + NFP_BPF_STACK != PTR_TO_STACK || + NFP_BPF_PACKET_DATA != PTR_TO_PACKET); + + if (!bpf->helpers.perf_event_output) { + pr_vlog(env, "event_output: not supported by FW\n"); + return -EOPNOTSUPP; + } + + /* Force current CPU to make sure we can report the event + * wherever we get the control message from FW. + */ + if (reg3->var_off.mask & BPF_F_INDEX_MASK || + (reg3->var_off.value & BPF_F_INDEX_MASK) != + BPF_F_CURRENT_CPU) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg3->var_off); + pr_vlog(env, "event_output: must use BPF_F_CURRENT_CPU, var_off: %s\n", + tn_buf); + return -EOPNOTSUPP; + } + + /* Save space in meta, we don't care about arguments other + * than 4th meta, shove it into arg1. + */ + reg1 = cur_regs(env) + BPF_REG_4; + + if (reg1->type != SCALAR_VALUE /* NULL ptr */ && + reg1->type != PTR_TO_STACK && + reg1->type != PTR_TO_MAP_VALUE && + reg1->type != PTR_TO_PACKET) { + pr_vlog(env, "event_output: unsupported ptr type: %d\n", + reg1->type); + return -EOPNOTSUPP; + } + + if (reg1->type == PTR_TO_STACK && + !nfp_bpf_stack_arg_ok("event_output", env, reg1, NULL)) + return -EOPNOTSUPP; + + /* Warn user that on offload NFP may return success even if map + * is not going to accept the event, since the event output is + * fully async and device won't know the state of the map. + * There is also FW limitation on the event length. + * + * Lost events will not show up on the perf ring, driver + * won't see them at all. Events may also get reordered. + */ + dev_warn_once(&nfp_prog->bpf->app->pf->pdev->dev, + "bpf: note: return codes and behavior of bpf_event_output() helper differs for offloaded programs!\n"); + pr_vlog(env, "warning: return codes and behavior of event_output helper differ for offload!\n"); + + if (!meta->func_id) + break; + + if (reg1->type != meta->arg1.type) { + pr_vlog(env, "event_output: ptr type changed: %d %d\n", + meta->arg1.type, reg1->type); + return -EINVAL; + } + break; + default: pr_vlog(env, "unsupported function id: %d\n", func_id); return -EOPNOTSUPP; From b4264c96b5cbc00c4c07deb9fbab928d43dffcf9 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 3 May 2018 18:37:13 -0700 Subject: [PATCH 06/10] nfp: bpf: rewrite map pointers with NFP TIDs Kernel will now replace map fds with actual pointer before calling the offload prepare. We can identify those pointers and replace them with NFP table IDs instead of loading the table ID in code generated for CALL instruction. This allows us to support having the same CALL being used with different maps. Since we don't want to change the FW ABI we still need to move the TID from R1 to portion of R0 before the jump. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Reviewed-by: Jiong Wang Signed-off-by: Daniel Borkmann --- drivers/net/ethernet/netronome/nfp/bpf/jit.c | 44 ++++++++++++++----- .../net/ethernet/netronome/nfp/bpf/verifier.c | 9 ---- 2 files changed, 32 insertions(+), 21 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c index 2127cf1548dd..326a2085d650 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c @@ -1395,15 +1395,9 @@ static int adjust_head(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) static int map_call_stack_common(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) { - struct bpf_offloaded_map *offmap; - struct nfp_bpf_map *nfp_map; bool load_lm_ptr; u32 ret_tgt; s64 lm_off; - swreg tid; - - offmap = (struct bpf_offloaded_map *)meta->arg1.map_ptr; - nfp_map = offmap->dev_priv; /* We only have to reload LM0 if the key is not at start of stack */ lm_off = nfp_prog->stack_depth; @@ -1416,17 +1410,12 @@ map_call_stack_common(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) if (meta->func_id == BPF_FUNC_map_update_elem) emit_csr_wr(nfp_prog, reg_b(3 * 2), NFP_CSR_ACT_LM_ADDR2); - /* Load map ID into a register, it should actually fit as an immediate - * but in case it doesn't deal with it here, not in the delay slots. - */ - tid = ur_load_imm_any(nfp_prog, nfp_map->tid, imm_a(nfp_prog)); - emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO + meta->func_id, 2, RELO_BR_HELPER); ret_tgt = nfp_prog_current_offset(nfp_prog) + 2; /* Load map ID into A0 */ - wrp_mov(nfp_prog, reg_a(0), tid); + wrp_mov(nfp_prog, reg_a(0), reg_a(2)); /* Load the return address into B0 */ wrp_immed_relo(nfp_prog, reg_b(0), ret_tgt, RELO_IMMED_REL); @@ -3254,6 +3243,33 @@ static int nfp_bpf_optimize(struct nfp_prog *nfp_prog) return 0; } +static int nfp_bpf_replace_map_ptrs(struct nfp_prog *nfp_prog) +{ + struct nfp_insn_meta *meta1, *meta2; + struct nfp_bpf_map *nfp_map; + struct bpf_map *map; + + nfp_for_each_insn_walk2(nfp_prog, meta1, meta2) { + if (meta1->skip || meta2->skip) + continue; + + if (meta1->insn.code != (BPF_LD | BPF_IMM | BPF_DW) || + meta1->insn.src_reg != BPF_PSEUDO_MAP_FD) + continue; + + map = (void *)(unsigned long)((u32)meta1->insn.imm | + (u64)meta2->insn.imm << 32); + if (bpf_map_offload_neutral(map)) + continue; + nfp_map = map_to_offmap(map)->dev_priv; + + meta1->insn.imm = nfp_map->tid; + meta2->insn.imm = 0; + } + + return 0; +} + static int nfp_bpf_ustore_calc(u64 *prog, unsigned int len) { __le64 *ustore = (__force __le64 *)prog; @@ -3290,6 +3306,10 @@ int nfp_bpf_jit(struct nfp_prog *nfp_prog) { int ret; + ret = nfp_bpf_replace_map_ptrs(nfp_prog); + if (ret) + return ret; + ret = nfp_bpf_optimize(nfp_prog); if (ret) return ret; diff --git a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c index c76b622743ae..e163f3cfa47d 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c @@ -151,15 +151,6 @@ nfp_bpf_map_call_ok(const char *fname, struct bpf_verifier_env *env, return false; } - /* Rest of the checks is only if we re-parse the same insn */ - if (!meta->func_id) - return true; - - if (meta->arg1.map_ptr != reg1->map_ptr) { - pr_vlog(env, "%s: called for different map\n", fname); - return false; - } - return true; } From c642ea265445873901041fa0b892a45ea7c6ff33 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 3 May 2018 18:37:14 -0700 Subject: [PATCH 07/10] tools: bpftool: fold hex keyword in command help Instead of spelling [hex] BYTES everywhere use DATA as keyword for generalized value. This will help us keep the messages concise when longer command are added in the future. It will also be useful once BTF support comes. We will only have to change the definition of DATA. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- .../bpf/bpftool/Documentation/bpftool-map.rst | 19 ++++++++++--------- tools/bpf/bpftool/map.c | 13 +++++++------ 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst index 5f512b14bff9..c3eef8c972cd 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-map.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst @@ -23,16 +23,17 @@ MAP COMMANDS | **bpftool** **map { show | list }** [*MAP*] | **bpftool** **map dump** *MAP* -| **bpftool** **map update** *MAP* **key** [**hex**] *BYTES* **value** [**hex**] *VALUE* [*UPDATE_FLAGS*] -| **bpftool** **map lookup** *MAP* **key** [**hex**] *BYTES* -| **bpftool** **map getnext** *MAP* [**key** [**hex**] *BYTES*] -| **bpftool** **map delete** *MAP* **key** [**hex**] *BYTES* +| **bpftool** **map update** *MAP* **key** *DATA* **value** *VALUE* [*UPDATE_FLAGS*] +| **bpftool** **map lookup** *MAP* **key** *DATA* +| **bpftool** **map getnext** *MAP* [**key** *DATA*] +| **bpftool** **map delete** *MAP* **key** *DATA* | **bpftool** **map pin** *MAP* *FILE* | **bpftool** **map help** | | *MAP* := { **id** *MAP_ID* | **pinned** *FILE* } +| *DATA* := { [**hex**] *BYTES* } | *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* } -| *VALUE* := { *BYTES* | *MAP* | *PROG* } +| *VALUE* := { *DATA* | *MAP* | *PROG* } | *UPDATE_FLAGS* := { **any** | **exist** | **noexist** } DESCRIPTION @@ -48,7 +49,7 @@ DESCRIPTION **bpftool map dump** *MAP* Dump all entries in a given *MAP*. - **bpftool map update** *MAP* **key** [**hex**] *BYTES* **value** [**hex**] *VALUE* [*UPDATE_FLAGS*] + **bpftool map update** *MAP* **key** *DATA* **value** *VALUE* [*UPDATE_FLAGS*] Update map entry for a given *KEY*. *UPDATE_FLAGS* can be one of: **any** update existing entry @@ -61,13 +62,13 @@ DESCRIPTION the bytes are parsed as decimal values, unless a "0x" prefix (for hexadecimal) or a "0" prefix (for octal) is provided. - **bpftool map lookup** *MAP* **key** [**hex**] *BYTES* + **bpftool map lookup** *MAP* **key** *DATA* Lookup **key** in the map. - **bpftool map getnext** *MAP* [**key** [**hex**] *BYTES*] + **bpftool map getnext** *MAP* [**key** *DATA*] Get next key. If *key* is not specified, get first key. - **bpftool map delete** *MAP* **key** [**hex**] *BYTES* + **bpftool map delete** *MAP* **key** *DATA* Remove entry from the map. **bpftool map pin** *MAP* *FILE* diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c index a6cdb640a0d7..7da77e4166ec 100644 --- a/tools/bpf/bpftool/map.c +++ b/tools/bpf/bpftool/map.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2017 Netronome Systems, Inc. + * Copyright (C) 2017-2018 Netronome Systems, Inc. * * This software is dual licensed under the GNU General License Version 2, * June 1991 as shown in the file COPYING in the top-level directory of this @@ -874,16 +874,17 @@ static int do_help(int argc, char **argv) fprintf(stderr, "Usage: %s %s { show | list } [MAP]\n" " %s %s dump MAP\n" - " %s %s update MAP key [hex] BYTES value [hex] VALUE [UPDATE_FLAGS]\n" - " %s %s lookup MAP key [hex] BYTES\n" - " %s %s getnext MAP [key [hex] BYTES]\n" - " %s %s delete MAP key [hex] BYTES\n" + " %s %s update MAP key DATA value VALUE [UPDATE_FLAGS]\n" + " %s %s lookup MAP key DATA\n" + " %s %s getnext MAP [key DATA]\n" + " %s %s delete MAP key DATA\n" " %s %s pin MAP FILE\n" " %s %s help\n" "\n" " MAP := { id MAP_ID | pinned FILE }\n" + " DATA := { [hex] BYTES }\n" " " HELP_SPEC_PROGRAM "\n" - " VALUE := { BYTES | MAP | PROG }\n" + " VALUE := { DATA | MAP | PROG }\n" " UPDATE_FLAGS := { any | exist | noexist }\n" " " HELP_SPEC_OPTIONS "\n" "", From e64d52569f6e847495091db40ab58d2d379748ef Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 3 May 2018 18:37:15 -0700 Subject: [PATCH 08/10] tools: bpftool: move get_possible_cpus() to common code Move the get_possible_cpus() function to shared code. No functional changes. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Reviewed-by: Jiong Wang Signed-off-by: Daniel Borkmann --- tools/bpf/bpftool/common.c | 58 +++++++++++++++++++++++++++++++++++++- tools/bpf/bpftool/main.h | 3 +- tools/bpf/bpftool/map.c | 56 ------------------------------------ 3 files changed, 59 insertions(+), 58 deletions(-) diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c index 465995281dcd..9c620770c6ed 100644 --- a/tools/bpf/bpftool/common.c +++ b/tools/bpf/bpftool/common.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2017 Netronome Systems, Inc. + * Copyright (C) 2017-2018 Netronome Systems, Inc. * * This software is dual licensed under the GNU General License Version 2, * June 1991 as shown in the file COPYING in the top-level directory of this @@ -33,6 +33,7 @@ /* Author: Jakub Kicinski */ +#include #include #include #include @@ -420,6 +421,61 @@ void delete_pinned_obj_table(struct pinned_obj_table *tab) } } +unsigned int get_possible_cpus(void) +{ + static unsigned int result; + char buf[128]; + long int n; + char *ptr; + int fd; + + if (result) + return result; + + fd = open("/sys/devices/system/cpu/possible", O_RDONLY); + if (fd < 0) { + p_err("can't open sysfs possible cpus"); + exit(-1); + } + + n = read(fd, buf, sizeof(buf)); + if (n < 2) { + p_err("can't read sysfs possible cpus"); + exit(-1); + } + close(fd); + + if (n == sizeof(buf)) { + p_err("read sysfs possible cpus overflow"); + exit(-1); + } + + ptr = buf; + n = 0; + while (*ptr && *ptr != '\n') { + unsigned int a, b; + + if (sscanf(ptr, "%u-%u", &a, &b) == 2) { + n += b - a + 1; + + ptr = strchr(ptr, '-') + 1; + } else if (sscanf(ptr, "%u", &a) == 1) { + n++; + } else { + assert(0); + } + + while (isdigit(*ptr)) + ptr++; + if (*ptr == ',') + ptr++; + } + + result = n; + + return result; +} + static char * ifindex_to_name_ns(__u32 ifindex, __u32 ns_dev, __u32 ns_ino, char *buf) { diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h index b8e9584d6246..cbf8985da362 100644 --- a/tools/bpf/bpftool/main.h +++ b/tools/bpf/bpftool/main.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2017 Netronome Systems, Inc. + * Copyright (C) 2017-2018 Netronome Systems, Inc. * * This software is dual licensed under the GNU General License Version 2, * June 1991 as shown in the file COPYING in the top-level directory of this @@ -125,6 +125,7 @@ void disasm_print_insn(unsigned char *image, ssize_t len, int opcodes, const char *arch); void print_hex_data_json(uint8_t *data, size_t len); +unsigned int get_possible_cpus(void); const char *ifindex_to_bfd_name_ns(__u32 ifindex, __u64 ns_dev, __u64 ns_ino); #endif diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c index 7da77e4166ec..5efefde5f578 100644 --- a/tools/bpf/bpftool/map.c +++ b/tools/bpf/bpftool/map.c @@ -34,7 +34,6 @@ /* Author: Jakub Kicinski */ #include -#include #include #include #include @@ -69,61 +68,6 @@ static const char * const map_type_name[] = { [BPF_MAP_TYPE_CPUMAP] = "cpumap", }; -static unsigned int get_possible_cpus(void) -{ - static unsigned int result; - char buf[128]; - long int n; - char *ptr; - int fd; - - if (result) - return result; - - fd = open("/sys/devices/system/cpu/possible", O_RDONLY); - if (fd < 0) { - p_err("can't open sysfs possible cpus"); - exit(-1); - } - - n = read(fd, buf, sizeof(buf)); - if (n < 2) { - p_err("can't read sysfs possible cpus"); - exit(-1); - } - close(fd); - - if (n == sizeof(buf)) { - p_err("read sysfs possible cpus overflow"); - exit(-1); - } - - ptr = buf; - n = 0; - while (*ptr && *ptr != '\n') { - unsigned int a, b; - - if (sscanf(ptr, "%u-%u", &a, &b) == 2) { - n += b - a + 1; - - ptr = strchr(ptr, '-') + 1; - } else if (sscanf(ptr, "%u", &a) == 1) { - n++; - } else { - assert(0); - } - - while (isdigit(*ptr)) - ptr++; - if (*ptr == ',') - ptr++; - } - - result = n; - - return result; -} - static bool map_is_per_cpu(__u32 type) { return type == BPF_MAP_TYPE_PERCPU_HASH || From f412eed9dfdeeb6becd7de2ffe8b5d0a8b3f81ca Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 3 May 2018 18:37:16 -0700 Subject: [PATCH 09/10] tools: bpftool: add simple perf event output reader Users of BPF sooner or later discover perf_event_output() helpers and BPF_MAP_TYPE_PERF_EVENT_ARRAY. Dumping this array type is not possible, however, we can add simple reading of perf events. Create a new event_pipe subcommand for maps, this sub command will only work with BPF_MAP_TYPE_PERF_EVENT_ARRAY maps. Parts of the code from samples/bpf/trace_output_user.c. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- .../bpf/bpftool/Documentation/bpftool-map.rst | 29 +- tools/bpf/bpftool/Documentation/bpftool.rst | 2 +- tools/bpf/bpftool/Makefile | 7 +- tools/bpf/bpftool/bash-completion/bpftool | 36 +- tools/bpf/bpftool/common.c | 19 + tools/bpf/bpftool/main.h | 4 + tools/bpf/bpftool/map.c | 19 +- tools/bpf/bpftool/map_perf_ring.c | 347 ++++++++++++++++++ 8 files changed, 444 insertions(+), 19 deletions(-) create mode 100644 tools/bpf/bpftool/map_perf_ring.c diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst index c3eef8c972cd..a6258bc8ec4f 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-map.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst @@ -22,12 +22,13 @@ MAP COMMANDS ============= | **bpftool** **map { show | list }** [*MAP*] -| **bpftool** **map dump** *MAP* -| **bpftool** **map update** *MAP* **key** *DATA* **value** *VALUE* [*UPDATE_FLAGS*] -| **bpftool** **map lookup** *MAP* **key** *DATA* -| **bpftool** **map getnext** *MAP* [**key** *DATA*] -| **bpftool** **map delete** *MAP* **key** *DATA* -| **bpftool** **map pin** *MAP* *FILE* +| **bpftool** **map dump** *MAP* +| **bpftool** **map update** *MAP* **key** *DATA* **value** *VALUE* [*UPDATE_FLAGS*] +| **bpftool** **map lookup** *MAP* **key** *DATA* +| **bpftool** **map getnext** *MAP* [**key** *DATA*] +| **bpftool** **map delete** *MAP* **key** *DATA* +| **bpftool** **map pin** *MAP* *FILE* +| **bpftool** **map event_pipe** *MAP* [**cpu** *N* **index** *M*] | **bpftool** **map help** | | *MAP* := { **id** *MAP_ID* | **pinned** *FILE* } @@ -76,6 +77,22 @@ DESCRIPTION Note: *FILE* must be located in *bpffs* mount. + **bpftool** **map event_pipe** *MAP* [**cpu** *N* **index** *M*] + Read events from a BPF_MAP_TYPE_PERF_EVENT_ARRAY map. + + Install perf rings into a perf event array map and dump + output of any bpf_perf_event_output() call in the kernel. + By default read the number of CPUs on the system and + install perf ring for each CPU in the corresponding index + in the array. + + If **cpu** and **index** are specified, install perf ring + for given **cpu** at **index** in the array (single ring). + + Note that installing a perf ring into an array will silently + replace any existing ring. Any other application will stop + receiving events if it installed its rings earlier. + **bpftool map help** Print short help message. diff --git a/tools/bpf/bpftool/Documentation/bpftool.rst b/tools/bpf/bpftool/Documentation/bpftool.rst index 20689a321ffe..564cb0d9692b 100644 --- a/tools/bpf/bpftool/Documentation/bpftool.rst +++ b/tools/bpf/bpftool/Documentation/bpftool.rst @@ -23,7 +23,7 @@ SYNOPSIS *MAP-COMMANDS* := { **show** | **list** | **dump** | **update** | **lookup** | **getnext** | **delete** - | **pin** | **help** } + | **pin** | **event_pipe** | **help** } *PROG-COMMANDS* := { **show** | **list** | **dump jited** | **dump xlated** | **pin** | **load** | **help** } diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile index 4e69782c4a79..892dbf095bff 100644 --- a/tools/bpf/bpftool/Makefile +++ b/tools/bpf/bpftool/Makefile @@ -39,7 +39,12 @@ CC = gcc CFLAGS += -O2 CFLAGS += -W -Wall -Wextra -Wno-unused-parameter -Wshadow -Wno-missing-field-initializers -CFLAGS += -DPACKAGE='"bpftool"' -D__EXPORTED_HEADERS__ -I$(srctree)/tools/include/uapi -I$(srctree)/tools/include -I$(srctree)/tools/lib/bpf -I$(srctree)/kernel/bpf/ +CFLAGS += -DPACKAGE='"bpftool"' -D__EXPORTED_HEADERS__ \ + -I$(srctree)/kernel/bpf/ \ + -I$(srctree)/tools/include \ + -I$(srctree)/tools/include/uapi \ + -I$(srctree)/tools/lib/bpf \ + -I$(srctree)/tools/perf CFLAGS += -DBPFTOOL_VERSION='"$(BPFTOOL_VERSION)"' LIBS = -lelf -lbfd -lopcodes $(LIBBPF) diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index 852d84a98acd..b301c9b315f1 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -1,6 +1,6 @@ # bpftool(8) bash completion -*- shell-script -*- # -# Copyright (C) 2017 Netronome Systems, Inc. +# Copyright (C) 2017-2018 Netronome Systems, Inc. # # This software is dual licensed under the GNU General License # Version 2, June 1991 as shown in the file COPYING in the top-level @@ -79,6 +79,14 @@ _bpftool_get_map_ids() command sed -n 's/.*"id": \(.*\),$/\1/p' )" -- "$cur" ) ) } +_bpftool_get_perf_map_ids() +{ + COMPREPLY+=( $( compgen -W "$( bpftool -jp map 2>&1 | \ + command grep -C2 perf_event_array | \ + command sed -n 's/.*"id": \(.*\),$/\1/p' )" -- "$cur" ) ) +} + + _bpftool_get_prog_ids() { COMPREPLY+=( $( compgen -W "$( bpftool -jp prog 2>&1 | \ @@ -359,10 +367,34 @@ _bpftool() fi return 0 ;; + event_pipe) + case $prev in + $command) + COMPREPLY=( $( compgen -W "$MAP_TYPE" -- "$cur" ) ) + return 0 + ;; + id) + _bpftool_get_perf_map_ids + return 0 + ;; + cpu) + return 0 + ;; + index) + return 0 + ;; + *) + _bpftool_once_attr 'cpu' + _bpftool_once_attr 'index' + return 0 + ;; + esac + ;; *) [[ $prev == $object ]] && \ COMPREPLY=( $( compgen -W 'delete dump getnext help \ - lookup pin show list update' -- "$cur" ) ) + lookup pin event_pipe show list update' -- \ + "$cur" ) ) ;; esac ;; diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c index 9c620770c6ed..32f9e397a6c0 100644 --- a/tools/bpf/bpftool/common.c +++ b/tools/bpf/bpftool/common.c @@ -331,6 +331,16 @@ char *get_fdinfo(int fd, const char *key) return NULL; } +void print_data_json(uint8_t *data, size_t len) +{ + unsigned int i; + + jsonw_start_array(json_wtr); + for (i = 0; i < len; i++) + jsonw_printf(json_wtr, "%d", data[i]); + jsonw_end_array(json_wtr); +} + void print_hex_data_json(uint8_t *data, size_t len) { unsigned int i; @@ -421,6 +431,15 @@ void delete_pinned_obj_table(struct pinned_obj_table *tab) } } +unsigned int get_page_size(void) +{ + static int result; + + if (!result) + result = getpagesize(); + return result; +} + unsigned int get_possible_cpus(void) { static unsigned int result; diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h index cbf8985da362..6173cd997e7a 100644 --- a/tools/bpf/bpftool/main.h +++ b/tools/bpf/bpftool/main.h @@ -117,14 +117,18 @@ int do_pin_fd(int fd, const char *name); int do_prog(int argc, char **arg); int do_map(int argc, char **arg); +int do_event_pipe(int argc, char **argv); int do_cgroup(int argc, char **arg); int prog_parse_fd(int *argc, char ***argv); +int map_parse_fd_and_info(int *argc, char ***argv, void *info, __u32 *info_len); void disasm_print_insn(unsigned char *image, ssize_t len, int opcodes, const char *arch); +void print_data_json(uint8_t *data, size_t len); void print_hex_data_json(uint8_t *data, size_t len); +unsigned int get_page_size(void); unsigned int get_possible_cpus(void); const char *ifindex_to_bfd_name_ns(__u32 ifindex, __u64 ns_dev, __u64 ns_ino); diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c index 5efefde5f578..af6766e956ba 100644 --- a/tools/bpf/bpftool/map.c +++ b/tools/bpf/bpftool/map.c @@ -130,8 +130,7 @@ static int map_parse_fd(int *argc, char ***argv) return -1; } -static int -map_parse_fd_and_info(int *argc, char ***argv, void *info, __u32 *info_len) +int map_parse_fd_and_info(int *argc, char ***argv, void *info, __u32 *info_len) { int err; int fd; @@ -817,12 +816,13 @@ static int do_help(int argc, char **argv) fprintf(stderr, "Usage: %s %s { show | list } [MAP]\n" - " %s %s dump MAP\n" - " %s %s update MAP key DATA value VALUE [UPDATE_FLAGS]\n" - " %s %s lookup MAP key DATA\n" - " %s %s getnext MAP [key DATA]\n" - " %s %s delete MAP key DATA\n" - " %s %s pin MAP FILE\n" + " %s %s dump MAP\n" + " %s %s update MAP key DATA value VALUE [UPDATE_FLAGS]\n" + " %s %s lookup MAP key DATA\n" + " %s %s getnext MAP [key DATA]\n" + " %s %s delete MAP key DATA\n" + " %s %s pin MAP FILE\n" + " %s %s event_pipe MAP [cpu N index M]\n" " %s %s help\n" "\n" " MAP := { id MAP_ID | pinned FILE }\n" @@ -834,7 +834,7 @@ static int do_help(int argc, char **argv) "", bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], - bin_name, argv[-2], bin_name, argv[-2]); + bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2]); return 0; } @@ -849,6 +849,7 @@ static const struct cmd cmds[] = { { "getnext", do_getnext }, { "delete", do_delete }, { "pin", do_pin }, + { "event_pipe", do_event_pipe }, { 0 } }; diff --git a/tools/bpf/bpftool/map_perf_ring.c b/tools/bpf/bpftool/map_perf_ring.c new file mode 100644 index 000000000000..c5a2ced8552d --- /dev/null +++ b/tools/bpf/bpftool/map_perf_ring.c @@ -0,0 +1,347 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (C) 2018 Netronome Systems, Inc. */ +/* This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "main.h" + +#define MMAP_PAGE_CNT 16 + +static bool stop; + +struct event_ring_info { + int fd; + int key; + unsigned int cpu; + void *mem; +}; + +struct perf_event_sample { + struct perf_event_header header; + __u32 size; + unsigned char data[]; +}; + +static void int_exit(int signo) +{ + fprintf(stderr, "Stopping...\n"); + stop = true; +} + +static void +print_bpf_output(struct event_ring_info *ring, struct perf_event_sample *e) +{ + struct { + struct perf_event_header header; + __u64 id; + __u64 lost; + } *lost = (void *)e; + struct timespec ts; + + if (clock_gettime(CLOCK_MONOTONIC, &ts)) { + perror("Can't read clock for timestamp"); + return; + } + + if (json_output) { + jsonw_start_object(json_wtr); + jsonw_name(json_wtr, "timestamp"); + jsonw_uint(json_wtr, ts.tv_sec * 1000000000ull + ts.tv_nsec); + jsonw_name(json_wtr, "type"); + jsonw_uint(json_wtr, e->header.type); + jsonw_name(json_wtr, "cpu"); + jsonw_uint(json_wtr, ring->cpu); + jsonw_name(json_wtr, "index"); + jsonw_uint(json_wtr, ring->key); + if (e->header.type == PERF_RECORD_SAMPLE) { + jsonw_name(json_wtr, "data"); + print_data_json(e->data, e->size); + } else if (e->header.type == PERF_RECORD_LOST) { + jsonw_name(json_wtr, "lost"); + jsonw_start_object(json_wtr); + jsonw_name(json_wtr, "id"); + jsonw_uint(json_wtr, lost->id); + jsonw_name(json_wtr, "count"); + jsonw_uint(json_wtr, lost->lost); + jsonw_end_object(json_wtr); + } + jsonw_end_object(json_wtr); + } else { + if (e->header.type == PERF_RECORD_SAMPLE) { + printf("== @%ld.%ld CPU: %d index: %d =====\n", + (long)ts.tv_sec, ts.tv_nsec, + ring->cpu, ring->key); + fprint_hex(stdout, e->data, e->size, " "); + printf("\n"); + } else if (e->header.type == PERF_RECORD_LOST) { + printf("lost %lld events\n", lost->lost); + } else { + printf("unknown event type=%d size=%d\n", + e->header.type, e->header.size); + } + } +} + +static void +perf_event_read(struct event_ring_info *ring, void **buf, size_t *buf_len) +{ + volatile struct perf_event_mmap_page *header = ring->mem; + __u64 buffer_size = MMAP_PAGE_CNT * get_page_size(); + __u64 data_tail = header->data_tail; + __u64 data_head = header->data_head; + void *base, *begin, *end; + + asm volatile("" ::: "memory"); /* in real code it should be smp_rmb() */ + if (data_head == data_tail) + return; + + base = ((char *)header) + get_page_size(); + + begin = base + data_tail % buffer_size; + end = base + data_head % buffer_size; + + while (begin != end) { + struct perf_event_sample *e; + + e = begin; + if (begin + e->header.size > base + buffer_size) { + long len = base + buffer_size - begin; + + if (*buf_len < e->header.size) { + free(*buf); + *buf = malloc(e->header.size); + if (!*buf) { + fprintf(stderr, + "can't allocate memory"); + stop = true; + return; + } + *buf_len = e->header.size; + } + + memcpy(*buf, begin, len); + memcpy(*buf + len, base, e->header.size - len); + e = (void *)*buf; + begin = base + e->header.size - len; + } else if (begin + e->header.size == base + buffer_size) { + begin = base; + } else { + begin += e->header.size; + } + + print_bpf_output(ring, e); + } + + __sync_synchronize(); /* smp_mb() */ + header->data_tail = data_head; +} + +static int perf_mmap_size(void) +{ + return get_page_size() * (MMAP_PAGE_CNT + 1); +} + +static void *perf_event_mmap(int fd) +{ + int mmap_size = perf_mmap_size(); + void *base; + + base = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (base == MAP_FAILED) { + p_err("event mmap failed: %s\n", strerror(errno)); + return NULL; + } + + return base; +} + +static void perf_event_unmap(void *mem) +{ + if (munmap(mem, perf_mmap_size())) + fprintf(stderr, "Can't unmap ring memory!\n"); +} + +static int bpf_perf_event_open(int map_fd, int key, int cpu) +{ + struct perf_event_attr attr = { + .sample_type = PERF_SAMPLE_RAW, + .type = PERF_TYPE_SOFTWARE, + .config = PERF_COUNT_SW_BPF_OUTPUT, + }; + int pmu_fd; + + pmu_fd = sys_perf_event_open(&attr, -1, cpu, -1, 0); + if (pmu_fd < 0) { + p_err("failed to open perf event %d for CPU %d", key, cpu); + return -1; + } + + if (bpf_map_update_elem(map_fd, &key, &pmu_fd, BPF_ANY)) { + p_err("failed to update map for event %d for CPU %d", key, cpu); + goto err_close; + } + if (ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0)) { + p_err("failed to enable event %d for CPU %d", key, cpu); + goto err_close; + } + + return pmu_fd; + +err_close: + close(pmu_fd); + return -1; +} + +int do_event_pipe(int argc, char **argv) +{ + int i, nfds, map_fd, index = -1, cpu = -1; + struct bpf_map_info map_info = {}; + struct event_ring_info *rings; + size_t tmp_buf_sz = 0; + void *tmp_buf = NULL; + struct pollfd *pfds; + __u32 map_info_len; + bool do_all = true; + + map_info_len = sizeof(map_info); + map_fd = map_parse_fd_and_info(&argc, &argv, &map_info, &map_info_len); + if (map_fd < 0) + return -1; + + if (map_info.type != BPF_MAP_TYPE_PERF_EVENT_ARRAY) { + p_err("map is not a perf event array"); + goto err_close_map; + } + + while (argc) { + if (argc < 2) + BAD_ARG(); + + if (is_prefix(*argv, "cpu")) { + char *endptr; + + NEXT_ARG(); + cpu = strtoul(*argv, &endptr, 0); + if (*endptr) { + p_err("can't parse %s as CPU ID", **argv); + goto err_close_map; + } + + NEXT_ARG(); + } else if (is_prefix(*argv, "index")) { + char *endptr; + + NEXT_ARG(); + index = strtoul(*argv, &endptr, 0); + if (*endptr) { + p_err("can't parse %s as index", **argv); + goto err_close_map; + } + + NEXT_ARG(); + } else { + BAD_ARG(); + } + + do_all = false; + } + + if (!do_all) { + if (index == -1 || cpu == -1) { + p_err("cpu and index must be specified together"); + goto err_close_map; + } + + nfds = 1; + } else { + nfds = min(get_possible_cpus(), map_info.max_entries); + cpu = 0; + index = 0; + } + + rings = calloc(nfds, sizeof(rings[0])); + if (!rings) + goto err_close_map; + + pfds = calloc(nfds, sizeof(pfds[0])); + if (!pfds) + goto err_free_rings; + + for (i = 0; i < nfds; i++) { + rings[i].cpu = cpu + i; + rings[i].key = index + i; + + rings[i].fd = bpf_perf_event_open(map_fd, rings[i].key, + rings[i].cpu); + if (rings[i].fd < 0) + goto err_close_fds_prev; + + rings[i].mem = perf_event_mmap(rings[i].fd); + if (!rings[i].mem) + goto err_close_fds_current; + + pfds[i].fd = rings[i].fd; + pfds[i].events = POLLIN; + } + + signal(SIGINT, int_exit); + signal(SIGHUP, int_exit); + signal(SIGTERM, int_exit); + + if (json_output) + jsonw_start_array(json_wtr); + + while (!stop) { + poll(pfds, nfds, 200); + for (i = 0; i < nfds; i++) + perf_event_read(&rings[i], &tmp_buf, &tmp_buf_sz); + } + free(tmp_buf); + + if (json_output) + jsonw_end_array(json_wtr); + + for (i = 0; i < nfds; i++) { + perf_event_unmap(rings[i].mem); + close(rings[i].fd); + } + free(pfds); + free(rings); + close(map_fd); + + return 0; + +err_close_fds_prev: + while (i--) { + perf_event_unmap(rings[i].mem); +err_close_fds_current: + close(rings[i].fd); + } + free(pfds); +err_free_rings: + free(rings); +err_close_map: + close(map_fd); + return -1; +} From ab7f5bf0928be2f148d000a6eaa6c0a36e74750e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 3 May 2018 18:37:17 -0700 Subject: [PATCH 10/10] bpf: fix references to free_bpf_prog_info() in comments Comments in the verifier refer to free_bpf_prog_info() which seems to have never existed in tree. Replace it with free_used_maps(). Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ccac552ac861..d5e1a6c4165d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5055,7 +5055,7 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) /* hold the map. If the program is rejected by verifier, * the map will be released by release_maps() or it * will be used by the valid program until it's unloaded - * and all maps are released in free_bpf_prog_info() + * and all maps are released in free_used_maps() */ map = bpf_map_inc(map, false); if (IS_ERR(map)) { @@ -5821,7 +5821,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) err_release_maps: if (!env->prog->aux->used_maps) /* if we didn't copy map pointers into bpf_prog_info, release - * them now. Otherwise free_bpf_prog_info() will release them. + * them now. Otherwise free_used_maps() will release them. */ release_maps(env); *prog = env->prog;