From b9fc8b4a591811546fec2dbef7e9f809362100c9 Mon Sep 17 00:00:00 2001 From: Grant Seltzer Date: Mon, 22 Feb 2021 19:58:46 +0000 Subject: [PATCH 01/90] bpf: Add kernel/modules BTF presence checks to bpftool feature command This adds both the CONFIG_DEBUG_INFO_BTF and CONFIG_DEBUG_INFO_BTF_MODULES kernel compile options to the output of the bpftool feature command. This is relevant for developers who want to account for data structure definition differences between kernels. Signed-off-by: Grant Seltzer Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20210222195846.155483-1-grantseltzer@gmail.com --- tools/bpf/bpftool/feature.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c index 359960a8f1de..40a88df275f9 100644 --- a/tools/bpf/bpftool/feature.c +++ b/tools/bpf/bpftool/feature.c @@ -336,6 +336,10 @@ static void probe_kernel_image_config(const char *define_prefix) { "CONFIG_BPF_JIT", }, /* Avoid compiling eBPF interpreter (use JIT only) */ { "CONFIG_BPF_JIT_ALWAYS_ON", }, + /* Kernel BTF debug information available */ + { "CONFIG_DEBUG_INFO_BTF", }, + /* Kernel module BTF debug information available */ + { "CONFIG_DEBUG_INFO_BTF_MODULES", }, /* cgroups */ { "CONFIG_CGROUPS", }, From 2463e073497385ef63c220571013a2b89e9b95cc Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Thu, 18 Feb 2021 20:49:41 +0000 Subject: [PATCH 02/90] netdevice: Add missing IFF_PHONY_HEADROOM self-definition This is harmless for now, but can be fatal for future refactors. Fixes: 871b642adebe3 ("netdev: introduce ndo_set_rx_headroom") Signed-off-by: Alexander Lobakin Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210218204908.5455-2-alobakin@pm.me --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ddf4cfc12615..3b6f82c2c271 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1577,6 +1577,7 @@ enum netdev_priv_flags { #define IFF_L3MDEV_SLAVE IFF_L3MDEV_SLAVE #define IFF_TEAM IFF_TEAM #define IFF_RXFH_CONFIGURED IFF_RXFH_CONFIGURED +#define IFF_PHONY_HEADROOM IFF_PHONY_HEADROOM #define IFF_MACSEC IFF_MACSEC #define IFF_NO_RX_HANDLER IFF_NO_RX_HANDLER #define IFF_FAILOVER IFF_FAILOVER From c2ff53d8049f30098153cd2d1299a44d7b124c57 Mon Sep 17 00:00:00 2001 From: Xuan Zhuo Date: Thu, 18 Feb 2021 20:50:02 +0000 Subject: [PATCH 03/90] net: Add priv_flags to allow tx skb without linear In some cases, we want to construct the skb directly based on existing memory, without copying data. In this case, the pages will be placed directly in the skb, and the linear space of the skb is empty. But unfortunately, many network cards do not support this operation. 
For example Mellanox Technologies MT27710 Family [ConnectX-4 Lx] will get the following error message: mlx5_core 0000:3b:00.1 eth1: Error cqe on cqn 0x817, ci 0x8, qn 0x1dbb, opcode 0xd, syndrome 0x1, vendor syndrome 0x68 00000000: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00000020: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00000030: 00 00 00 00 60 10 68 01 0a 00 1d bb 00 0f 9f d2 WQE DUMP: WQ size 1024 WQ cur size 0, WQE index 0xf, len: 64 00000000: 00 00 0f 0a 00 1d bb 03 00 00 00 08 00 00 00 00 00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00000020: 00 00 00 2b 00 08 00 00 00 00 00 05 9e e3 08 00 00000030: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 mlx5_core 0000:3b:00.1 eth1: ERR CQE on SQ: 0x1dbb So a priv_flag is added here to indicate whether the network card supports this feature. Suggested-by: Alexander Lobakin Signed-off-by: Xuan Zhuo Signed-off-by: Alexander Lobakin Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210218204908.5455-3-alobakin@pm.me --- include/linux/netdevice.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3b6f82c2c271..6cef47b76cc6 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1518,6 +1518,8 @@ struct net_device_ops { * @IFF_FAILOVER_SLAVE: device is lower dev of a failover master device * @IFF_L3MDEV_RX_HANDLER: only invoke the rx handler of L3 master device * @IFF_LIVE_RENAME_OK: rename is allowed while device is up and running + * @IFF_TX_SKB_NO_LINEAR: device/driver is capable of xmitting frames with + * skb_headlen(skb) == 0 (data starts from frag0) */ enum netdev_priv_flags { IFF_802_1Q_VLAN = 1<<0, @@ -1551,6 +1553,7 @@ enum netdev_priv_flags { IFF_FAILOVER_SLAVE = 1<<28, IFF_L3MDEV_RX_HANDLER = 1<<29, IFF_LIVE_RENAME_OK = 1<<30, + IFF_TX_SKB_NO_LINEAR = 1<<31, }; #define IFF_802_1Q_VLAN IFF_802_1Q_VLAN @@ -1584,6 +1587,7 @@ enum netdev_priv_flags { #define IFF_FAILOVER_SLAVE IFF_FAILOVER_SLAVE #define IFF_L3MDEV_RX_HANDLER IFF_L3MDEV_RX_HANDLER #define IFF_LIVE_RENAME_OK IFF_LIVE_RENAME_OK +#define IFF_TX_SKB_NO_LINEAR IFF_TX_SKB_NO_LINEAR /** * struct net_device - The DEVICE structure. From ab5bd583b9289666e918f9e5f672d33ccdfd49b2 Mon Sep 17 00:00:00 2001 From: Xuan Zhuo Date: Thu, 18 Feb 2021 20:50:17 +0000 Subject: [PATCH 04/90] virtio-net: Support IFF_TX_SKB_NO_LINEAR flag Virtio net supports the case where the skb linear space is empty, so add priv_flags. Signed-off-by: Xuan Zhuo Signed-off-by: Alexander Lobakin Signed-off-by: Daniel Borkmann Acked-by: Michael S. Tsirkin Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210218204908.5455-4-alobakin@pm.me --- drivers/net/virtio_net.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index ba8e63792549..f2ff6c3906c1 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -2972,7 +2972,8 @@ static int virtnet_probe(struct virtio_device *vdev) return -ENOMEM; /* Set up network device as normal. 
*/ - dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE; + dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE | + IFF_TX_SKB_NO_LINEAR; dev->netdev_ops = &virtnet_netdev; dev->features = NETIF_F_HIGHDMA; From 3914d88f7608e6c2e80e344474fa289370c32451 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Thu, 18 Feb 2021 20:50:31 +0000 Subject: [PATCH 05/90] xsk: Respect device's headroom and tailroom on generic xmit path xsk_generic_xmit() allocates a new skb and then queues it for xmitting. The size of new skb's headroom is desc->len, so it comes to the driver/device with no reserved headroom and/or tailroom. Lots of drivers need some headroom (and sometimes tailroom) to prepend (and/or append) some headers or data, e.g. CPU tags, device-specific headers/descriptors (LSO, TLS etc.), and in case of no available space skb_cow_head() will reallocate the skb. Reallocations are unwanted on the fast path, especially when it comes to XDP, so generic XSK xmit should reserve the spaces declared in dev->needed_headroom and dev->needed_tailroom to avoid them. Note on max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom)): Usually, output functions reserve LL_RESERVED_SPACE(dev), which consists of dev->hard_header_len + dev->needed_headroom, aligned by 16. However, on XSK xmit the hard header is already in the chunk, so hard_header_len is not needed. But it'd still be better to align data up to the cacheline, while reserving no less than the driver requests for headroom. NET_SKB_PAD here is to double-insure there will be no reallocations even when the driver advertises no needed_headroom but in fact needs it (not so rare a case). Fixes: 35fcde7f8deb ("xsk: support for Tx") Signed-off-by: Alexander Lobakin Signed-off-by: Daniel Borkmann Acked-by: Magnus Karlsson Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210218204908.5455-5-alobakin@pm.me --- net/xdp/xsk.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 4faabd1ecfd1..143979ea4165 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -454,12 +454,16 @@ static int xsk_generic_xmit(struct sock *sk) struct sk_buff *skb; unsigned long flags; int err = 0; + u32 hr, tr; mutex_lock(&xs->mutex); if (xs->queue_id >= xs->dev->real_num_tx_queues) goto out; + hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom)); + tr = xs->dev->needed_tailroom; + while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) { char *buffer; u64 addr; @@ -471,11 +475,13 @@ static int xsk_generic_xmit(struct sock *sk) } len = desc.len; - skb = sock_alloc_send_skb(sk, len, 1, &err); + skb = sock_alloc_send_skb(sk, hr + len + tr, 1, &err); if (unlikely(!skb)) goto out; + skb_reserve(skb, hr); skb_put(skb, len); + addr = desc.addr; buffer = xsk_buff_raw_get_data(xs->pool, addr); err = skb_store_bits(skb, 0, buffer, len); From 9c8f21e6f8856a96634e542a58ef3abf27486801 Mon Sep 17 00:00:00 2001 From: Xuan Zhuo Date: Thu, 18 Feb 2021 20:50:45 +0000 Subject: [PATCH 06/90] xsk: Build skb by page (aka generic zerocopy xmit) This patch is used to construct the skb based on pages to save memory copy overhead. This function is implemented based on IFF_TX_SKB_NO_LINEAR. Only when the network card's priv_flags support IFF_TX_SKB_NO_LINEAR will pages be used to directly construct the skb. If this feature is not supported, it is still necessary to copy data to construct the skb. ---------------- Performance Testing ------------ The test environment is Aliyun ECS server. 
Test cmd: ``` xdpsock -i eth0 -t -S -s ``` Test result data: size 64 512 1024 1500 copy 1916747 1775988 1600203 1440054 page 1974058 1953655 1945463 1904478 percent 3.0% 10.0% 21.58% 32.3% Signed-off-by: Xuan Zhuo Signed-off-by: Alexander Lobakin Signed-off-by: Daniel Borkmann Reviewed-by: Dust Li Acked-by: Magnus Karlsson Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210218204908.5455-6-alobakin@pm.me --- net/xdp/xsk.c | 120 ++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 96 insertions(+), 24 deletions(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 143979ea4165..a71ed664da0a 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -445,6 +445,97 @@ static void xsk_destruct_skb(struct sk_buff *skb) sock_wfree(skb); } +static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs, + struct xdp_desc *desc) +{ + struct xsk_buff_pool *pool = xs->pool; + u32 hr, len, ts, offset, copy, copied; + struct sk_buff *skb; + struct page *page; + void *buffer; + int err, i; + u64 addr; + + hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom)); + + skb = sock_alloc_send_skb(&xs->sk, hr, 1, &err); + if (unlikely(!skb)) + return ERR_PTR(err); + + skb_reserve(skb, hr); + + addr = desc->addr; + len = desc->len; + ts = pool->unaligned ? len : pool->chunk_size; + + buffer = xsk_buff_raw_get_data(pool, addr); + offset = offset_in_page(buffer); + addr = buffer - pool->addrs; + + for (copied = 0, i = 0; copied < len; i++) { + page = pool->umem->pgs[addr >> PAGE_SHIFT]; + get_page(page); + + copy = min_t(u32, PAGE_SIZE - offset, len - copied); + skb_fill_page_desc(skb, i, page, offset, copy); + + copied += copy; + addr += copy; + offset = 0; + } + + skb->len += len; + skb->data_len += len; + skb->truesize += ts; + + refcount_add(ts, &xs->sk.sk_wmem_alloc); + + return skb; +} + +static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, + struct xdp_desc *desc) +{ + struct net_device *dev = xs->dev; + struct sk_buff *skb; + + if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) { + skb = xsk_build_skb_zerocopy(xs, desc); + if (IS_ERR(skb)) + return skb; + } else { + u32 hr, tr, len; + void *buffer; + int err; + + hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom)); + tr = dev->needed_tailroom; + len = desc->len; + + skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err); + if (unlikely(!skb)) + return ERR_PTR(err); + + skb_reserve(skb, hr); + skb_put(skb, len); + + buffer = xsk_buff_raw_get_data(xs->pool, desc->addr); + err = skb_store_bits(skb, 0, buffer, len); + if (unlikely(err)) { + kfree_skb(skb); + return ERR_PTR(err); + } + } + + skb->dev = dev; + skb->priority = xs->sk.sk_priority; + skb->mark = xs->sk.sk_mark; + skb_shinfo(skb)->destructor_arg = (void *)(long)desc->addr; + skb->destructor = xsk_destruct_skb; + + return skb; +} + static int xsk_generic_xmit(struct sock *sk) { struct xdp_sock *xs = xdp_sk(sk); @@ -454,56 +545,37 @@ static int xsk_generic_xmit(struct sock *sk) struct sk_buff *skb; unsigned long flags; int err = 0; - u32 hr, tr; mutex_lock(&xs->mutex); if (xs->queue_id >= xs->dev->real_num_tx_queues) goto out; - hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom)); - tr = xs->dev->needed_tailroom; - while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) { - char *buffer; - u64 addr; - u32 len; - if (max_batch-- == 0) { err = -EAGAIN; goto out; } - len = desc.len; - skb = sock_alloc_send_skb(sk, hr + len + tr, 1, &err); - if (unlikely(!skb)) + skb = xsk_build_skb(xs, &desc); + if (IS_ERR(skb)) { + err = PTR_ERR(skb); goto out; + } - 
skb_reserve(skb, hr); - skb_put(skb, len); - - addr = desc.addr; - buffer = xsk_buff_raw_get_data(xs->pool, addr); - err = skb_store_bits(skb, 0, buffer, len); /* This is the backpressure mechanism for the Tx path. * Reserve space in the completion queue and only proceed * if there is space in it. This avoids having to implement * any buffering in the Tx path. */ spin_lock_irqsave(&xs->pool->cq_lock, flags); - if (unlikely(err) || xskq_prod_reserve(xs->pool->cq)) { + if (xskq_prod_reserve(xs->pool->cq)) { spin_unlock_irqrestore(&xs->pool->cq_lock, flags); kfree_skb(skb); goto out; } spin_unlock_irqrestore(&xs->pool->cq_lock, flags); - skb->dev = xs->dev; - skb->priority = sk->sk_priority; - skb->mark = sk->sk_mark; - skb_shinfo(skb)->destructor_arg = (void *)(long)desc.addr; - skb->destructor = xsk_destruct_skb; - err = __dev_direct_xmit(skb, xs->queue_id); if (err == NETDEV_TX_BUSY) { /* Tell user-space to retry the send */ From a10787e6d58c24b51e91c19c6d16c5da89fcaa4b Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 25 Feb 2021 15:43:14 -0800 Subject: [PATCH 07/90] bpf: Enable task local storage for tracing programs To access per-task data, BPF programs usually create a hash table with pid as the key. This is not ideal because: 1. The user needs to estimate the proper size of the hash table, which may be inaccurate; 2. Big hash tables are slow; 3. To clean up the data properly during task terminations, the user needs to write extra logic. Task local storage overcomes these issues and offers a better option for such per-task data. Task local storage is only available to BPF_LSM programs. Now enable it for tracing programs. Unlike LSM programs, tracing programs can be called in IRQ contexts. Helpers that access task local storage are updated to use raw_spin_lock_irqsave() instead of raw_spin_lock_bh(). Tracing programs can attach to functions on the task free path, e.g. exit_creds(). To avoid allocating task local storage after bpf_task_storage_free(), bpf_task_storage_get() is updated to not allocate new storage when the task is not refcounted (task->usage == 0). 
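For illustration only (this sketch is not part of the patch; the map and program names are made up), a tracing program can now keep per-task state in a task storage map instead of a pid-keyed hash map, along the lines of the selftests added later in this series:

```
/* Hedged sketch: a tp_btf program counting syscalls per task.
 * Assumes vmlinux.h and the libbpf helper headers are available.
 */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char _license[] SEC("license") = "GPL";

struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, __u64);
} per_task_cnt SEC(".maps");

SEC("tp_btf/sys_enter")
int BPF_PROG(count_sys_enter, struct pt_regs *regs, long id)
{
	struct task_struct *task = bpf_get_current_task_btf();
	__u64 *cnt;

	/* Storage is allocated on first access for a refcounted task. */
	cnt = bpf_task_storage_get(&per_task_cnt, task, 0,
				   BPF_LOCAL_STORAGE_GET_F_CREATE);
	if (cnt)
		__sync_fetch_and_add(cnt, 1);
	return 0;
}
```

Compared to a pid-keyed hash map, no sizing or explicit cleanup logic is needed; the storage is released via bpf_task_storage_free() when the task is freed.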
Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Acked-by: KP Singh Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20210225234319.336131-2-songliubraving@fb.com --- include/linux/bpf.h | 7 ++++++ include/linux/bpf_lsm.h | 22 ----------------- include/linux/bpf_types.h | 2 +- include/linux/sched.h | 5 ++++ kernel/bpf/Makefile | 3 +-- kernel/bpf/bpf_local_storage.c | 28 +++++++++++++--------- kernel/bpf/bpf_lsm.c | 4 ---- kernel/bpf/bpf_task_storage.c | 43 +++++++++------------------------- kernel/fork.c | 5 ++++ kernel/trace/bpf_trace.c | 4 ++++ 10 files changed, 51 insertions(+), 72 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index cccaef1088ea..e2cfc4809219 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1499,6 +1499,7 @@ struct bpf_prog *bpf_prog_by_id(u32 id); struct bpf_link *bpf_link_by_id(u32 id); const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id); +void bpf_task_storage_free(struct task_struct *task); #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { @@ -1684,6 +1685,10 @@ bpf_base_func_proto(enum bpf_func_id func_id) { return NULL; } + +static inline void bpf_task_storage_free(struct task_struct *task) +{ +} #endif /* CONFIG_BPF_SYSCALL */ void __bpf_free_used_btfs(struct bpf_prog_aux *aux, @@ -1886,6 +1891,8 @@ extern const struct bpf_func_proto bpf_this_cpu_ptr_proto; extern const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto; extern const struct bpf_func_proto bpf_sock_from_file_proto; extern const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto; +extern const struct bpf_func_proto bpf_task_storage_get_proto; +extern const struct bpf_func_proto bpf_task_storage_delete_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h index 0d1c33ace398..479c101546ad 100644 --- a/include/linux/bpf_lsm.h +++ b/include/linux/bpf_lsm.h @@ -38,21 +38,9 @@ static inline struct bpf_storage_blob *bpf_inode( return inode->i_security + bpf_lsm_blob_sizes.lbs_inode; } -static inline struct bpf_storage_blob *bpf_task( - const struct task_struct *task) -{ - if (unlikely(!task->security)) - return NULL; - - return task->security + bpf_lsm_blob_sizes.lbs_task; -} - extern const struct bpf_func_proto bpf_inode_storage_get_proto; extern const struct bpf_func_proto bpf_inode_storage_delete_proto; -extern const struct bpf_func_proto bpf_task_storage_get_proto; -extern const struct bpf_func_proto bpf_task_storage_delete_proto; void bpf_inode_storage_free(struct inode *inode); -void bpf_task_storage_free(struct task_struct *task); #else /* !CONFIG_BPF_LSM */ @@ -73,20 +61,10 @@ static inline struct bpf_storage_blob *bpf_inode( return NULL; } -static inline struct bpf_storage_blob *bpf_task( - const struct task_struct *task) -{ - return NULL; -} - static inline void bpf_inode_storage_free(struct inode *inode) { } -static inline void bpf_task_storage_free(struct task_struct *task) -{ -} - #endif /* CONFIG_BPF_LSM */ #endif /* _LINUX_BPF_LSM_H */ diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 99f7fd657d87..b9edee336d80 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -109,8 +109,8 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops) #endif #ifdef CONFIG_BPF_LSM BPF_MAP_TYPE(BPF_MAP_TYPE_INODE_STORAGE, inode_storage_map_ops) -BPF_MAP_TYPE(BPF_MAP_TYPE_TASK_STORAGE, task_storage_map_ops) #endif 
+BPF_MAP_TYPE(BPF_MAP_TYPE_TASK_STORAGE, task_storage_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops) #if defined(CONFIG_XDP_SOCKETS) BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops) diff --git a/include/linux/sched.h b/include/linux/sched.h index 4d568288abf9..e5fbf8e6952a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -42,6 +42,7 @@ struct audit_context; struct backing_dev_info; struct bio_list; struct blk_plug; +struct bpf_local_storage; struct capture_control; struct cfs_rq; struct fs_struct; @@ -1348,6 +1349,10 @@ struct task_struct { /* Used by LSM modules for access restriction: */ void *security; #endif +#ifdef CONFIG_BPF_SYSCALL + /* Used by BPF task local storage */ + struct bpf_local_storage __rcu *bpf_storage; +#endif #ifdef CONFIG_GCC_PLUGIN_STACKLEAK unsigned long lowest_stack; diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index d1249340fd6b..7f33098ca63f 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -9,8 +9,8 @@ CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy) obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o +obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o -obj-${CONFIG_BPF_LSM} += bpf_task_storage.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o obj-$(CONFIG_BPF_JIT) += trampoline.o obj-$(CONFIG_BPF_SYSCALL) += btf.o @@ -18,7 +18,6 @@ obj-$(CONFIG_BPF_JIT) += dispatcher.o ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o obj-$(CONFIG_BPF_SYSCALL) += cpumap.o -obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o obj-$(CONFIG_BPF_SYSCALL) += offload.o obj-$(CONFIG_BPF_SYSCALL) += net_namespace.o endif diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index dd5aedee99e7..9bd47ad2b26f 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -140,17 +140,18 @@ static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem) { struct bpf_local_storage *local_storage; bool free_local_storage = false; + unsigned long flags; if (unlikely(!selem_linked_to_storage(selem))) /* selem has already been unlinked from sk */ return; local_storage = rcu_dereference(selem->local_storage); - raw_spin_lock_bh(&local_storage->lock); + raw_spin_lock_irqsave(&local_storage->lock, flags); if (likely(selem_linked_to_storage(selem))) free_local_storage = bpf_selem_unlink_storage_nolock( local_storage, selem, true); - raw_spin_unlock_bh(&local_storage->lock); + raw_spin_unlock_irqrestore(&local_storage->lock, flags); if (free_local_storage) kfree_rcu(local_storage, rcu); @@ -167,6 +168,7 @@ void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem) { struct bpf_local_storage_map *smap; struct bpf_local_storage_map_bucket *b; + unsigned long flags; if (unlikely(!selem_linked_to_map(selem))) /* selem has already be unlinked from smap */ @@ -174,21 +176,22 @@ void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem) smap = rcu_dereference(SDATA(selem)->smap); b = select_bucket(smap, selem); - raw_spin_lock_bh(&b->lock); + raw_spin_lock_irqsave(&b->lock, flags); if (likely(selem_linked_to_map(selem))) hlist_del_init_rcu(&selem->map_node); - raw_spin_unlock_bh(&b->lock); + raw_spin_unlock_irqrestore(&b->lock, flags); } void 
bpf_selem_link_map(struct bpf_local_storage_map *smap, struct bpf_local_storage_elem *selem) { struct bpf_local_storage_map_bucket *b = select_bucket(smap, selem); + unsigned long flags; - raw_spin_lock_bh(&b->lock); + raw_spin_lock_irqsave(&b->lock, flags); RCU_INIT_POINTER(SDATA(selem)->smap, smap); hlist_add_head_rcu(&selem->map_node, &b->list); - raw_spin_unlock_bh(&b->lock); + raw_spin_unlock_irqrestore(&b->lock, flags); } void bpf_selem_unlink(struct bpf_local_storage_elem *selem) @@ -224,16 +227,18 @@ bpf_local_storage_lookup(struct bpf_local_storage *local_storage, sdata = SDATA(selem); if (cacheit_lockit) { + unsigned long flags; + /* spinlock is needed to avoid racing with the * parallel delete. Otherwise, publishing an already * deleted sdata to the cache will become a use-after-free * problem in the next bpf_local_storage_lookup(). */ - raw_spin_lock_bh(&local_storage->lock); + raw_spin_lock_irqsave(&local_storage->lock, flags); if (selem_linked_to_storage(selem)) rcu_assign_pointer(local_storage->cache[smap->cache_idx], sdata); - raw_spin_unlock_bh(&local_storage->lock); + raw_spin_unlock_irqrestore(&local_storage->lock, flags); } return sdata; @@ -327,6 +332,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, struct bpf_local_storage_data *old_sdata = NULL; struct bpf_local_storage_elem *selem; struct bpf_local_storage *local_storage; + unsigned long flags; int err; /* BPF_EXIST and BPF_NOEXIST cannot be both set */ @@ -374,7 +380,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, } } - raw_spin_lock_bh(&local_storage->lock); + raw_spin_lock_irqsave(&local_storage->lock, flags); /* Recheck local_storage->list under local_storage->lock */ if (unlikely(hlist_empty(&local_storage->list))) { @@ -428,11 +434,11 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, } unlock: - raw_spin_unlock_bh(&local_storage->lock); + raw_spin_unlock_irqrestore(&local_storage->lock, flags); return SDATA(selem); unlock_err: - raw_spin_unlock_bh(&local_storage->lock); + raw_spin_unlock_irqrestore(&local_storage->lock, flags); return ERR_PTR(err); } diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 1622a44d1617..9829f381b51c 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -115,10 +115,6 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_spin_lock_proto; case BPF_FUNC_spin_unlock: return &bpf_spin_unlock_proto; - case BPF_FUNC_task_storage_get: - return &bpf_task_storage_get_proto; - case BPF_FUNC_task_storage_delete: - return &bpf_task_storage_delete_proto; case BPF_FUNC_bprm_opts_set: return &bpf_bprm_opts_set_proto; case BPF_FUNC_ima_inode_hash: diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index e0da0258b732..baf3566e2323 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include @@ -24,12 +23,8 @@ DEFINE_BPF_STORAGE_CACHE(task_cache); static struct bpf_local_storage __rcu **task_storage_ptr(void *owner) { struct task_struct *task = owner; - struct bpf_storage_blob *bsb; - bsb = bpf_task(task); - if (!bsb) - return NULL; - return &bsb->storage; + return &task->bpf_storage; } static struct bpf_local_storage_data * @@ -38,13 +33,8 @@ task_storage_lookup(struct task_struct *task, struct bpf_map *map, { struct bpf_local_storage *task_storage; struct bpf_local_storage_map *smap; - struct bpf_storage_blob *bsb; - bsb = bpf_task(task); - if 
(!bsb) - return NULL; - - task_storage = rcu_dereference(bsb->storage); + task_storage = rcu_dereference(task->bpf_storage); if (!task_storage) return NULL; @@ -57,16 +47,12 @@ void bpf_task_storage_free(struct task_struct *task) struct bpf_local_storage_elem *selem; struct bpf_local_storage *local_storage; bool free_task_storage = false; - struct bpf_storage_blob *bsb; struct hlist_node *n; - - bsb = bpf_task(task); - if (!bsb) - return; + unsigned long flags; rcu_read_lock(); - local_storage = rcu_dereference(bsb->storage); + local_storage = rcu_dereference(task->bpf_storage); if (!local_storage) { rcu_read_unlock(); return; @@ -81,7 +67,7 @@ void bpf_task_storage_free(struct task_struct *task) * when unlinking elem from the local_storage->list and * the map's bucket->list. */ - raw_spin_lock_bh(&local_storage->lock); + raw_spin_lock_irqsave(&local_storage->lock, flags); hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) { /* Always unlink from map before unlinking from * local_storage. @@ -90,7 +76,7 @@ void bpf_task_storage_free(struct task_struct *task) free_task_storage = bpf_selem_unlink_storage_nolock( local_storage, selem, false); } - raw_spin_unlock_bh(&local_storage->lock); + raw_spin_unlock_irqrestore(&local_storage->lock, flags); rcu_read_unlock(); /* free_task_storage should always be true as long as @@ -150,7 +136,7 @@ static int bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key, */ WARN_ON_ONCE(!rcu_read_lock_held()); task = pid_task(pid, PIDTYPE_PID); - if (!task || !task_storage_ptr(task)) { + if (!task) { err = -ENOENT; goto out; } @@ -213,23 +199,16 @@ BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *, if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE)) return (unsigned long)NULL; - /* explicitly check that the task_storage_ptr is not - * NULL as task_storage_lookup returns NULL in this case and - * bpf_local_storage_update expects the owner to have a - * valid storage pointer. - */ - if (!task || !task_storage_ptr(task)) + if (!task) return (unsigned long)NULL; sdata = task_storage_lookup(task, map, true); if (sdata) return (unsigned long)sdata->data; - /* This helper must only be called from places where the lifetime of the task - * is guaranteed. Either by being refcounted or by being protected - * by an RCU read-side critical section. - */ - if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) { + /* only allocate new storage, when the task is refcounted */ + if (refcount_read(&task->usage) && + (flags & BPF_LOCAL_STORAGE_GET_F_CREATE)) { sdata = bpf_local_storage_update( task, (struct bpf_local_storage_map *)map, value, BPF_NOEXIST); diff --git a/kernel/fork.c b/kernel/fork.c index d66cd1014211..181604db2d65 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -96,6 +96,7 @@ #include #include #include +#include #include #include @@ -734,6 +735,7 @@ void __put_task_struct(struct task_struct *tsk) cgroup_free(tsk); task_numa_free(tsk, true); security_task_free(tsk); + bpf_task_storage_free(tsk); exit_creds(tsk); delayacct_tsk_free(tsk); put_signal_struct(tsk->signal); @@ -2062,6 +2064,9 @@ static __latent_entropy struct task_struct *copy_process( p->sequential_io = 0; p->sequential_io_avg = 0; #endif +#ifdef CONFIG_BPF_SYSCALL + RCU_INIT_POINTER(p->bpf_storage, NULL); +#endif /* Perform scheduler related setup. Assign this task to a CPU. 
*/ retval = sched_fork(clone_flags, p); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index b0c45d923f0f..e9701744d8e4 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1367,6 +1367,10 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_per_cpu_ptr_proto; case BPF_FUNC_this_cpu_ptr: return &bpf_this_cpu_ptr_proto; + case BPF_FUNC_task_storage_get: + return &bpf_task_storage_get_proto; + case BPF_FUNC_task_storage_delete: + return &bpf_task_storage_delete_proto; default: return NULL; } From bc235cdb423a2daed6f337676006a66557429cd1 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 25 Feb 2021 15:43:15 -0800 Subject: [PATCH 08/90] bpf: Prevent deadlock from recursive bpf_task_storage_[get|delete] BPF helpers bpf_task_storage_[get|delete] could hold two locks: bpf_local_storage_map_bucket->lock and bpf_local_storage->lock. Calling these helpers from fentry/fexit programs on functions in bpf_*_storage.c may cause deadlock on either locks. Prevent such deadlock with a per cpu counter, bpf_task_storage_busy. We need this counter to be global, because the two locks here belong to two different objects: bpf_local_storage_map and bpf_local_storage. If we pick one of them as the owner of the counter, it is still possible to trigger deadlock on the other lock. For example, if bpf_local_storage_map owns the counters, it cannot prevent deadlock on bpf_local_storage->lock when two maps are used. Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20210225234319.336131-3-songliubraving@fb.com --- include/linux/bpf_local_storage.h | 3 +- kernel/bpf/bpf_inode_storage.c | 2 +- kernel/bpf/bpf_local_storage.c | 11 +++++- kernel/bpf/bpf_task_storage.c | 59 ++++++++++++++++++++++++++----- net/core/bpf_sk_storage.c | 2 +- 5 files changed, 65 insertions(+), 12 deletions(-) diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index b2c9463f36a1..b902c580c48d 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -126,7 +126,8 @@ bpf_local_storage_lookup(struct bpf_local_storage *local_storage, struct bpf_local_storage_map *smap, bool cacheit_lockit); -void bpf_local_storage_map_free(struct bpf_local_storage_map *smap); +void bpf_local_storage_map_free(struct bpf_local_storage_map *smap, + int __percpu *busy_counter); int bpf_local_storage_map_check_btf(const struct bpf_map *map, const struct btf *btf, diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index 6639640523c0..da753721457c 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -237,7 +237,7 @@ static void inode_storage_map_free(struct bpf_map *map) smap = (struct bpf_local_storage_map *)map; bpf_local_storage_cache_idx_free(&inode_cache, smap->cache_idx); - bpf_local_storage_map_free(smap); + bpf_local_storage_map_free(smap, NULL); } static int inode_storage_map_btf_id; diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 9bd47ad2b26f..b305270b7a4b 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -474,7 +474,8 @@ void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache, spin_unlock(&cache->idx_lock); } -void bpf_local_storage_map_free(struct bpf_local_storage_map *smap) +void bpf_local_storage_map_free(struct bpf_local_storage_map *smap, + int __percpu *busy_counter) { struct bpf_local_storage_elem *selem; 
struct bpf_local_storage_map_bucket *b; @@ -503,7 +504,15 @@ void bpf_local_storage_map_free(struct bpf_local_storage_map *smap) while ((selem = hlist_entry_safe( rcu_dereference_raw(hlist_first_rcu(&b->list)), struct bpf_local_storage_elem, map_node))) { + if (busy_counter) { + migrate_disable(); + __this_cpu_inc(*busy_counter); + } bpf_selem_unlink(selem); + if (busy_counter) { + __this_cpu_dec(*busy_counter); + migrate_enable(); + } cond_resched_rcu(); } rcu_read_unlock(); diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index baf3566e2323..fd3c74ef608e 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -20,6 +20,31 @@ DEFINE_BPF_STORAGE_CACHE(task_cache); +DEFINE_PER_CPU(int, bpf_task_storage_busy); + +static void bpf_task_storage_lock(void) +{ + migrate_disable(); + __this_cpu_inc(bpf_task_storage_busy); +} + +static void bpf_task_storage_unlock(void) +{ + __this_cpu_dec(bpf_task_storage_busy); + migrate_enable(); +} + +static bool bpf_task_storage_trylock(void) +{ + migrate_disable(); + if (unlikely(__this_cpu_inc_return(bpf_task_storage_busy) != 1)) { + __this_cpu_dec(bpf_task_storage_busy); + migrate_enable(); + return false; + } + return true; +} + static struct bpf_local_storage __rcu **task_storage_ptr(void *owner) { struct task_struct *task = owner; @@ -67,6 +92,7 @@ void bpf_task_storage_free(struct task_struct *task) * when unlinking elem from the local_storage->list and * the map's bucket->list. */ + bpf_task_storage_lock(); raw_spin_lock_irqsave(&local_storage->lock, flags); hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) { /* Always unlink from map before unlinking from @@ -77,6 +103,7 @@ void bpf_task_storage_free(struct task_struct *task) local_storage, selem, false); } raw_spin_unlock_irqrestore(&local_storage->lock, flags); + bpf_task_storage_unlock(); rcu_read_unlock(); /* free_task_storage should always be true as long as @@ -109,7 +136,9 @@ static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key) goto out; } + bpf_task_storage_lock(); sdata = task_storage_lookup(task, map, true); + bpf_task_storage_unlock(); put_pid(pid); return sdata ? sdata->data : NULL; out: @@ -141,8 +170,10 @@ static int bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key, goto out; } + bpf_task_storage_lock(); sdata = bpf_local_storage_update( task, (struct bpf_local_storage_map *)map, value, map_flags); + bpf_task_storage_unlock(); err = PTR_ERR_OR_ZERO(sdata); out: @@ -185,7 +216,9 @@ static int bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key) goto out; } + bpf_task_storage_lock(); err = task_storage_delete(task, map); + bpf_task_storage_unlock(); out: put_pid(pid); return err; @@ -202,34 +235,44 @@ BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *, if (!task) return (unsigned long)NULL; + if (!bpf_task_storage_trylock()) + return (unsigned long)NULL; + sdata = task_storage_lookup(task, map, true); if (sdata) - return (unsigned long)sdata->data; + goto unlock; /* only allocate new storage, when the task is refcounted */ if (refcount_read(&task->usage) && - (flags & BPF_LOCAL_STORAGE_GET_F_CREATE)) { + (flags & BPF_LOCAL_STORAGE_GET_F_CREATE)) sdata = bpf_local_storage_update( task, (struct bpf_local_storage_map *)map, value, BPF_NOEXIST); - return IS_ERR(sdata) ? (unsigned long)NULL : - (unsigned long)sdata->data; - } - return (unsigned long)NULL; +unlock: + bpf_task_storage_unlock(); + return IS_ERR_OR_NULL(sdata) ? 
(unsigned long)NULL : + (unsigned long)sdata->data; } BPF_CALL_2(bpf_task_storage_delete, struct bpf_map *, map, struct task_struct *, task) { + int ret; + if (!task) return -EINVAL; + if (!bpf_task_storage_trylock()) + return -EBUSY; + /* This helper must only be called from places where the lifetime of the task * is guaranteed. Either by being refcounted or by being protected * by an RCU read-side critical section. */ - return task_storage_delete(task, map); + ret = task_storage_delete(task, map); + bpf_task_storage_unlock(); + return ret; } static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key) @@ -255,7 +298,7 @@ static void task_storage_map_free(struct bpf_map *map) smap = (struct bpf_local_storage_map *)map; bpf_local_storage_cache_idx_free(&task_cache, smap->cache_idx); - bpf_local_storage_map_free(smap); + bpf_local_storage_map_free(smap, &bpf_task_storage_busy); } static int task_storage_map_btf_id; diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 4edd033e899c..cc3712ad8716 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -89,7 +89,7 @@ static void bpf_sk_storage_map_free(struct bpf_map *map) smap = (struct bpf_local_storage_map *)map; bpf_local_storage_cache_idx_free(&sk_cache, smap->cache_idx); - bpf_local_storage_map_free(smap); + bpf_local_storage_map_free(smap, NULL); } static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr) From 1f87dcf116adedd6b9f47258d4fdf4fa9ce74002 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 25 Feb 2021 15:43:16 -0800 Subject: [PATCH 09/90] selftests/bpf: Add non-BPF_LSM test for task local storage Task local storage is enabled for tracing programs. Add two tests for task local storage without CONFIG_BPF_LSM. The first test stores a value in sys_enter and read it back in sys_exit. The second test checks whether the kernel allows allocating task local storage in exit_creds() (which it should not). 
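As a usage note (assuming a built BPF selftest tree; the exact paths may vary), the new subtests can be run with the selftest runner:

```
cd tools/testing/selftests/bpf
make
sudo ./test_progs -t task_local_storage
```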
Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210225234319.336131-4-songliubraving@fb.com --- .../bpf/prog_tests/task_local_storage.c | 69 +++++++++++++++++++ .../selftests/bpf/progs/task_local_storage.c | 64 +++++++++++++++++ .../bpf/progs/task_local_storage_exit_creds.c | 32 +++++++++ 3 files changed, 165 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/task_local_storage.c create mode 100644 tools/testing/selftests/bpf/progs/task_local_storage.c create mode 100644 tools/testing/selftests/bpf/progs/task_local_storage_exit_creds.c diff --git a/tools/testing/selftests/bpf/prog_tests/task_local_storage.c b/tools/testing/selftests/bpf/prog_tests/task_local_storage.c new file mode 100644 index 000000000000..dbb7525cdd56 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/task_local_storage.c @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#define _GNU_SOURCE /* See feature_test_macros(7) */ +#include +#include /* For SYS_xxx definitions */ +#include +#include +#include "task_local_storage.skel.h" +#include "task_local_storage_exit_creds.skel.h" + +static void test_sys_enter_exit(void) +{ + struct task_local_storage *skel; + int err; + + skel = task_local_storage__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open_and_load")) + return; + + skel->bss->target_pid = syscall(SYS_gettid); + + err = task_local_storage__attach(skel); + if (!ASSERT_OK(err, "skel_attach")) + goto out; + + syscall(SYS_gettid); + syscall(SYS_gettid); + + /* 3x syscalls: 1x attach and 2x gettid */ + ASSERT_EQ(skel->bss->enter_cnt, 3, "enter_cnt"); + ASSERT_EQ(skel->bss->exit_cnt, 3, "exit_cnt"); + ASSERT_EQ(skel->bss->mismatch_cnt, 0, "mismatch_cnt"); +out: + task_local_storage__destroy(skel); +} + +static void test_exit_creds(void) +{ + struct task_local_storage_exit_creds *skel; + int err; + + skel = task_local_storage_exit_creds__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open_and_load")) + return; + + err = task_local_storage_exit_creds__attach(skel); + if (!ASSERT_OK(err, "skel_attach")) + goto out; + + /* trigger at least one exit_creds() */ + if (CHECK_FAIL(system("ls > /dev/null"))) + goto out; + + /* sync rcu to make sure exit_creds() is called for "ls" */ + kern_sync_rcu(); + ASSERT_EQ(skel->bss->valid_ptr_count, 0, "valid_ptr_count"); + ASSERT_NEQ(skel->bss->null_ptr_count, 0, "null_ptr_count"); +out: + task_local_storage_exit_creds__destroy(skel); +} + +void test_task_local_storage(void) +{ + if (test__start_subtest("sys_enter_exit")) + test_sys_enter_exit(); + if (test__start_subtest("exit_creds")) + test_exit_creds(); +} diff --git a/tools/testing/selftests/bpf/progs/task_local_storage.c b/tools/testing/selftests/bpf/progs/task_local_storage.c new file mode 100644 index 000000000000..80a0a20db88d --- /dev/null +++ b/tools/testing/selftests/bpf/progs/task_local_storage.c @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include "vmlinux.h" +#include +#include + +char _license[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, long); +} enter_id SEC(".maps"); + +#define MAGIC_VALUE 0xabcd1234 + +pid_t target_pid = 0; +int mismatch_cnt = 0; +int enter_cnt = 0; +int exit_cnt = 0; + +SEC("tp_btf/sys_enter") +int BPF_PROG(on_enter, struct pt_regs *regs, long id) +{ + struct task_struct *task; + long *ptr; + + task = 
bpf_get_current_task_btf(); + if (task->pid != target_pid) + return 0; + + ptr = bpf_task_storage_get(&enter_id, task, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (!ptr) + return 0; + + __sync_fetch_and_add(&enter_cnt, 1); + *ptr = MAGIC_VALUE + enter_cnt; + + return 0; +} + +SEC("tp_btf/sys_exit") +int BPF_PROG(on_exit, struct pt_regs *regs, long id) +{ + struct task_struct *task; + long *ptr; + + task = bpf_get_current_task_btf(); + if (task->pid != target_pid) + return 0; + + ptr = bpf_task_storage_get(&enter_id, task, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (!ptr) + return 0; + + __sync_fetch_and_add(&exit_cnt, 1); + if (*ptr != MAGIC_VALUE + exit_cnt) + __sync_fetch_and_add(&mismatch_cnt, 1); + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/task_local_storage_exit_creds.c b/tools/testing/selftests/bpf/progs/task_local_storage_exit_creds.c new file mode 100644 index 000000000000..81758c0aef99 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/task_local_storage_exit_creds.c @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include "vmlinux.h" +#include +#include + +char _license[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, __u64); +} task_storage SEC(".maps"); + +int valid_ptr_count = 0; +int null_ptr_count = 0; + +SEC("fentry/exit_creds") +int BPF_PROG(trace_exit_creds, struct task_struct *task) +{ + __u64 *ptr; + + ptr = bpf_task_storage_get(&task_storage, task, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (ptr) + __sync_fetch_and_add(&valid_ptr_count, 1); + else + __sync_fetch_and_add(&null_ptr_count, 1); + return 0; +} From c540957a4d1d13934e93e53ce3056ac4108ffc29 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 25 Feb 2021 15:43:17 -0800 Subject: [PATCH 10/90] selftests/bpf: Test deadlock from recursive bpf_task_storage_[get|delete] Add a test with recursive bpf_task_storage_[get|delete] from fentry programs on bpf_local_storage_lookup and bpf_local_storage_update. Without proper deadlock prevent mechanism, this test would cause deadlock. 
Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210225234319.336131-5-songliubraving@fb.com --- .../bpf/prog_tests/task_local_storage.c | 23 ++++++ .../selftests/bpf/progs/task_ls_recursion.c | 70 +++++++++++++++++++ 2 files changed, 93 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/task_ls_recursion.c diff --git a/tools/testing/selftests/bpf/prog_tests/task_local_storage.c b/tools/testing/selftests/bpf/prog_tests/task_local_storage.c index dbb7525cdd56..035c263aab1b 100644 --- a/tools/testing/selftests/bpf/prog_tests/task_local_storage.c +++ b/tools/testing/selftests/bpf/prog_tests/task_local_storage.c @@ -8,6 +8,7 @@ #include #include "task_local_storage.skel.h" #include "task_local_storage_exit_creds.skel.h" +#include "task_ls_recursion.skel.h" static void test_sys_enter_exit(void) { @@ -60,10 +61,32 @@ out: task_local_storage_exit_creds__destroy(skel); } +static void test_recursion(void) +{ + struct task_ls_recursion *skel; + int err; + + skel = task_ls_recursion__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open_and_load")) + return; + + err = task_ls_recursion__attach(skel); + if (!ASSERT_OK(err, "skel_attach")) + goto out; + + /* trigger sys_enter, make sure it does not cause deadlock */ + syscall(SYS_gettid); + +out: + task_ls_recursion__destroy(skel); +} + void test_task_local_storage(void) { if (test__start_subtest("sys_enter_exit")) test_sys_enter_exit(); if (test__start_subtest("exit_creds")) test_exit_creds(); + if (test__start_subtest("recursion")) + test_recursion(); } diff --git a/tools/testing/selftests/bpf/progs/task_ls_recursion.c b/tools/testing/selftests/bpf/progs/task_ls_recursion.c new file mode 100644 index 000000000000..564583dca7c8 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/task_ls_recursion.c @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include "vmlinux.h" +#include +#include + +char _license[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, long); +} map_a SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, long); +} map_b SEC(".maps"); + +SEC("fentry/bpf_local_storage_lookup") +int BPF_PROG(on_lookup) +{ + struct task_struct *task = bpf_get_current_task_btf(); + + bpf_task_storage_delete(&map_a, task); + bpf_task_storage_delete(&map_b, task); + return 0; +} + +SEC("fentry/bpf_local_storage_update") +int BPF_PROG(on_update) +{ + struct task_struct *task = bpf_get_current_task_btf(); + long *ptr; + + ptr = bpf_task_storage_get(&map_a, task, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (ptr) + *ptr += 1; + + ptr = bpf_task_storage_get(&map_b, task, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (ptr) + *ptr += 1; + + return 0; +} + +SEC("tp_btf/sys_enter") +int BPF_PROG(on_enter, struct pt_regs *regs, long id) +{ + struct task_struct *task; + long *ptr; + + task = bpf_get_current_task_btf(); + ptr = bpf_task_storage_get(&map_a, task, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (ptr) + *ptr = 200; + + ptr = bpf_task_storage_get(&map_b, task, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (ptr) + *ptr = 100; + return 0; +} From 4b0d2d4156cfb9f27d8e52a33d3522a78002fca1 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 25 Feb 2021 15:43:18 -0800 Subject: [PATCH 11/90] bpf: runqslower: Prefer using local vmlimux to generate vmlinux.h Update the Makefile 
to prefer using $(O)/vmlinux, $(KBUILD_OUTPUT)/vmlinux (for selftests) or ../../../vmlinux. These two files should have latest definitions for vmlinux.h. Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210225234319.336131-6-songliubraving@fb.com --- tools/bpf/runqslower/Makefile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/bpf/runqslower/Makefile b/tools/bpf/runqslower/Makefile index 9d9fb6209be1..c96ba90c6f01 100644 --- a/tools/bpf/runqslower/Makefile +++ b/tools/bpf/runqslower/Makefile @@ -16,7 +16,10 @@ CFLAGS := -g -Wall # Try to detect best kernel BTF source KERNEL_REL := $(shell uname -r) -VMLINUX_BTF_PATHS := /sys/kernel/btf/vmlinux /boot/vmlinux-$(KERNEL_REL) +VMLINUX_BTF_PATHS := $(if $(O),$(O)/vmlinux) \ + $(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux) \ + ../../../vmlinux /sys/kernel/btf/vmlinux \ + /boot/vmlinux-$(KERNEL_REL) VMLINUX_BTF_PATH := $(or $(VMLINUX_BTF),$(firstword \ $(wildcard $(VMLINUX_BTF_PATHS)))) From ced47e30ab8b3ed986e28411f63e041b51c1fdf8 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 25 Feb 2021 15:43:19 -0800 Subject: [PATCH 12/90] bpf: runqslower: Use task local storage Replace hashtab with task local storage in runqslower. This improves the performance of these BPF programs. The following table summarizes average runtime of these programs, in nanoseconds: task-local hash-prealloc hash-no-prealloc handle__sched_wakeup 125 340 3124 handle__sched_wakeup_new 2812 1510 2998 handle__sched_switch 151 208 991 Note that, task local storage gives better performance than hashtab for handle__sched_wakeup and handle__sched_switch. On the other hand, for handle__sched_wakeup_new, task local storage is slower than hashtab with prealloc. This is because handle__sched_wakeup_new accesses the data for the first time, so it has to allocate the data for task local storage. Once the initial allocation is done, subsequent accesses, as those in handle__sched_wakeup, are much faster with task local storage. If we disable hashtab prealloc, task local storage is much faster for all 3 functions. 
Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210225234319.336131-7-songliubraving@fb.com --- tools/bpf/runqslower/runqslower.bpf.c | 33 +++++++++++++++++---------- 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/tools/bpf/runqslower/runqslower.bpf.c b/tools/bpf/runqslower/runqslower.bpf.c index 1f18a409f044..645530ca7e98 100644 --- a/tools/bpf/runqslower/runqslower.bpf.c +++ b/tools/bpf/runqslower/runqslower.bpf.c @@ -11,9 +11,9 @@ const volatile __u64 min_us = 0; const volatile pid_t targ_pid = 0; struct { - __uint(type, BPF_MAP_TYPE_HASH); - __uint(max_entries, 10240); - __type(key, u32); + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); __type(value, u64); } start SEC(".maps"); @@ -25,15 +25,20 @@ struct { /* record enqueue timestamp */ __always_inline -static int trace_enqueue(u32 tgid, u32 pid) +static int trace_enqueue(struct task_struct *t) { - u64 ts; + u32 pid = t->pid; + u64 *ptr; if (!pid || (targ_pid && targ_pid != pid)) return 0; - ts = bpf_ktime_get_ns(); - bpf_map_update_elem(&start, &pid, &ts, 0); + ptr = bpf_task_storage_get(&start, t, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (!ptr) + return 0; + + *ptr = bpf_ktime_get_ns(); return 0; } @@ -43,7 +48,7 @@ int handle__sched_wakeup(u64 *ctx) /* TP_PROTO(struct task_struct *p) */ struct task_struct *p = (void *)ctx[0]; - return trace_enqueue(p->tgid, p->pid); + return trace_enqueue(p); } SEC("tp_btf/sched_wakeup_new") @@ -52,7 +57,7 @@ int handle__sched_wakeup_new(u64 *ctx) /* TP_PROTO(struct task_struct *p) */ struct task_struct *p = (void *)ctx[0]; - return trace_enqueue(p->tgid, p->pid); + return trace_enqueue(p); } SEC("tp_btf/sched_switch") @@ -70,12 +75,16 @@ int handle__sched_switch(u64 *ctx) /* ivcsw: treat like an enqueue event and store timestamp */ if (prev->state == TASK_RUNNING) - trace_enqueue(prev->tgid, prev->pid); + trace_enqueue(prev); pid = next->pid; + /* For pid mismatch, save a bpf_task_storage_get */ + if (!pid || (targ_pid && targ_pid != pid)) + return 0; + /* fetch timestamp and calculate delta */ - tsp = bpf_map_lookup_elem(&start, &pid); + tsp = bpf_task_storage_get(&start, next, 0, 0); if (!tsp) return 0; /* missed enqueue */ @@ -91,7 +100,7 @@ int handle__sched_switch(u64 *ctx) bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &event, sizeof(event)); - bpf_map_delete_elem(&start, &pid); + bpf_task_storage_delete(&start, next); return 0; } From 523a4cf491b3c9e2d546040d57250f1a0ca84f03 Mon Sep 17 00:00:00 2001 From: Dmitrii Banshchikov Date: Fri, 26 Feb 2021 00:26:29 +0400 Subject: [PATCH 13/90] bpf: Use MAX_BPF_FUNC_REG_ARGS macro Instead of using integer literal here and there use macro name for better context. Signed-off-by: Dmitrii Banshchikov Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20210225202629.585485-1-me@ubique.spb.ru --- include/linux/bpf.h | 5 +++++ kernel/bpf/btf.c | 25 ++++++++++++++----------- kernel/bpf/verifier.c | 2 +- 3 files changed, 20 insertions(+), 12 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e2cfc4809219..ae2c35641619 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -506,6 +506,11 @@ enum bpf_cgroup_storage_type { */ #define MAX_BPF_FUNC_ARGS 12 +/* The maximum number of arguments passed through registers + * a single function may have. 
+ */ +#define MAX_BPF_FUNC_REG_ARGS 5 + struct btf_func_model { u8 ret_size; u8 nr_args; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 2efeb5f4b343..16e8148a28e2 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -4594,8 +4594,10 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, } arg = off / 8; args = (const struct btf_param *)(t + 1); - /* if (t == NULL) Fall back to default BPF prog with 5 u64 arguments */ - nr_args = t ? btf_type_vlen(t) : 5; + /* if (t == NULL) Fall back to default BPF prog with + * MAX_BPF_FUNC_REG_ARGS u64 arguments. + */ + nr_args = t ? btf_type_vlen(t) : MAX_BPF_FUNC_REG_ARGS; if (prog->aux->attach_btf_trace) { /* skip first 'void *__data' argument in btf_trace_##name typedef */ args++; @@ -4651,7 +4653,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, } } else { if (!t) - /* Default prog with 5 args */ + /* Default prog with MAX_BPF_FUNC_REG_ARGS args */ return true; t = btf_type_by_id(btf, args[arg].type); } @@ -5102,12 +5104,12 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, if (!func) { /* BTF function prototype doesn't match the verifier types. - * Fall back to 5 u64 args. + * Fall back to MAX_BPF_FUNC_REG_ARGS u64 args. */ - for (i = 0; i < 5; i++) + for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) m->arg_size[i] = 8; m->ret_size = 8; - m->nr_args = 5; + m->nr_args = MAX_BPF_FUNC_REG_ARGS; return 0; } args = (const struct btf_param *)(func + 1); @@ -5330,8 +5332,9 @@ int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, } args = (const struct btf_param *)(t + 1); nargs = btf_type_vlen(t); - if (nargs > 5) { - bpf_log(log, "Function %s has %d > 5 args\n", tname, nargs); + if (nargs > MAX_BPF_FUNC_REG_ARGS) { + bpf_log(log, "Function %s has %d > %d args\n", tname, nargs, + MAX_BPF_FUNC_REG_ARGS); goto out; } @@ -5460,9 +5463,9 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, } args = (const struct btf_param *)(t + 1); nargs = btf_type_vlen(t); - if (nargs > 5) { - bpf_log(log, "Global function %s() with %d > 5 args. Buggy compiler.\n", - tname, nargs); + if (nargs > MAX_BPF_FUNC_REG_ARGS) { + bpf_log(log, "Global function %s() with %d > %d args. Buggy compiler.\n", + tname, nargs, MAX_BPF_FUNC_REG_ARGS); return -EINVAL; } /* check that function returns int */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1dda9d81f12c..9f7e35590fc6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5544,7 +5544,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn meta.func_id = func_id; /* check args */ - for (i = 0; i < 5; i++) { + for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) { err = check_func_arg(env, i, &meta, fn); if (err) return err; From e6ac593372aadacc14e02b198e4a1acfef1db595 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Wed, 17 Feb 2021 10:45:09 +0000 Subject: [PATCH 14/90] bpf: Rename fixup_bpf_calls and add some comments This function has become overloaded, it actually does lots of diverse things in a single pass. Rename it to avoid confusion, and add some concise commentary. 
Signed-off-by: Brendan Jackman Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210217104509.2423183-1-jackmanb@google.com --- kernel/bpf/verifier.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9f7e35590fc6..9336bac39027 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5877,7 +5877,7 @@ static int update_alu_sanitation_state(struct bpf_insn_aux_data *aux, aux->alu_limit != alu_limit)) return -EACCES; - /* Corresponding fixup done in fixup_bpf_calls(). */ + /* Corresponding fixup done in do_misc_fixups(). */ aux->alu_state = alu_state; aux->alu_limit = alu_limit; return 0; @@ -11535,12 +11535,10 @@ static int fixup_call_args(struct bpf_verifier_env *env) return err; } -/* fixup insn->imm field of bpf_call instructions - * and inline eligible helpers as explicit sequence of BPF instructions - * - * this function is called after eBPF program passed verification +/* Do various post-verification rewrites in a single program pass. + * These rewrites simplify JIT and interpreter implementations. */ -static int fixup_bpf_calls(struct bpf_verifier_env *env) +static int do_misc_fixups(struct bpf_verifier_env *env) { struct bpf_prog *prog = env->prog; bool expect_blinding = bpf_jit_blinding_enabled(prog); @@ -11555,6 +11553,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) int i, ret, cnt, delta = 0; for (i = 0; i < insn_cnt; i++, insn++) { + /* Make divide-by-zero exceptions impossible. */ if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) || insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) || insn->code == (BPF_ALU | BPF_MOD | BPF_X) || @@ -11595,6 +11594,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) continue; } + /* Implement LD_ABS and LD_IND with a rewrite, if supported by the program type. */ if (BPF_CLASS(insn->code) == BPF_LD && (BPF_MODE(insn->code) == BPF_ABS || BPF_MODE(insn->code) == BPF_IND)) { @@ -11614,6 +11614,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) continue; } + /* Rewrite pointer arithmetic to mitigate speculation attacks. */ if (insn->code == (BPF_ALU64 | BPF_ADD | BPF_X) || insn->code == (BPF_ALU64 | BPF_SUB | BPF_X)) { const u8 code_add = BPF_ALU64 | BPF_ADD | BPF_X; @@ -11835,6 +11836,7 @@ patch_map_ops_generic: goto patch_call_imm; } + /* Implement bpf_jiffies64 inline. */ if (prog->jit_requested && BITS_PER_LONG == 64 && insn->imm == BPF_FUNC_jiffies64) { struct bpf_insn ld_jiffies_addr[2] = { @@ -12645,7 +12647,7 @@ skip_full_check: ret = convert_ctx_accesses(env); if (ret == 0) - ret = fixup_bpf_calls(env); + ret = do_misc_fixups(env); /* do 32-bit optimization after insn patching has done so those patched * insns could be handled correctly. From ecde60614d5ed60fde1c80b38b71582a3ea2e662 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Tue, 23 Feb 2021 16:23:01 +0000 Subject: [PATCH 15/90] selftest/bpf: Make xsk tests less verbose Make the xsk tests less verbose by only printing the essentials. Currently, it is hard to see if the tests passed or not due to all the printouts. Move the extra printouts to a verbose option, if further debugging is needed when a problem arises. 
To run the xsk tests with verbose output: ./test_xsk.sh -v Signed-off-by: Magnus Karlsson Signed-off-by: Ciara Loftus Signed-off-by: Alexei Starovoitov Acked-by: Maciej Fijalkowski Link: https://lore.kernel.org/bpf/20210223162304.7450-2-ciara.loftus@intel.com --- tools/testing/selftests/bpf/test_xsk.sh | 27 ++++++++++++++----- tools/testing/selftests/bpf/xdpxceiver.c | 31 +++++++++++++--------- tools/testing/selftests/bpf/xdpxceiver.h | 13 +++++---- tools/testing/selftests/bpf/xsk_prereqs.sh | 9 +++---- 4 files changed, 50 insertions(+), 30 deletions(-) diff --git a/tools/testing/selftests/bpf/test_xsk.sh b/tools/testing/selftests/bpf/test_xsk.sh index 88a7483eaae4..f4cedf4c2718 100755 --- a/tools/testing/selftests/bpf/test_xsk.sh +++ b/tools/testing/selftests/bpf/test_xsk.sh @@ -71,13 +71,17 @@ # # Run (full output without color-coding): # sudo ./test_xsk.sh +# +# Run with verbose output: +# sudo ./test_xsk.sh -v . xsk_prereqs.sh -while getopts c flag +while getopts "cv" flag do case "${flag}" in c) colorconsole=1;; + v) verbose=1;; esac done @@ -95,13 +99,17 @@ NS1=af_xdp${VETH1_POSTFIX} MTU=1500 setup_vethPairs() { - echo "setting up ${VETH0}: namespace: ${NS0}" + if [[ $verbose -eq 1 ]]; then + echo "setting up ${VETH0}: namespace: ${NS0}" + fi ip netns add ${NS1} ip link add ${VETH0} type veth peer name ${VETH1} if [ -f /proc/net/if_inet6 ]; then echo 1 > /proc/sys/net/ipv6/conf/${VETH0}/disable_ipv6 fi - echo "setting up ${VETH1}: namespace: ${NS1}" + if [[ $verbose -eq 1 ]]; then + echo "setting up ${VETH1}: namespace: ${NS1}" + fi ip link set ${VETH1} netns ${NS1} ip netns exec ${NS1} ip link set ${VETH1} mtu ${MTU} ip link set ${VETH0} mtu ${MTU} @@ -125,7 +133,10 @@ echo "${VETH0}:${VETH1},${NS1}" > ${SPECFILE} validate_veth_spec_file -echo "Spec file created: ${SPECFILE}" +if [[ $verbose -eq 1 ]]; then + echo "Spec file created: ${SPECFILE}" + VERBOSE_ARG="-v" +fi test_status $retval "${TEST_NAME}" @@ -136,12 +147,16 @@ statusList=() ### TEST 1 TEST_NAME="XSK KSELFTEST FRAMEWORK" -echo "Switching interfaces [${VETH0}, ${VETH1}] to XDP Generic mode" +if [[ $verbose -eq 1 ]]; then + echo "Switching interfaces [${VETH0}, ${VETH1}] to XDP Generic mode" +fi vethXDPgeneric ${VETH0} ${VETH1} ${NS1} retval=$? 
if [ $retval -eq 0 ]; then - echo "Switching interfaces [${VETH0}, ${VETH1}] to XDP Native mode" + if [[ $verbose -eq 1 ]]; then + echo "Switching interfaces [${VETH0}, ${VETH1}] to XDP Native mode" + fi vethXDPnative ${VETH0} ${VETH1} ${NS1} fi diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c index f4a96d5ff524..8af746c9a6b6 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.c +++ b/tools/testing/selftests/bpf/xdpxceiver.c @@ -341,6 +341,7 @@ static struct option long_options[] = { {"tear-down", no_argument, 0, 'T'}, {"bidi", optional_argument, 0, 'B'}, {"debug", optional_argument, 0, 'D'}, + {"verbose", no_argument, 0, 'v'}, {"tx-pkt-count", optional_argument, 0, 'C'}, {0, 0, 0, 0} }; @@ -359,6 +360,7 @@ static void usage(const char *prog) " -T, --tear-down Tear down sockets by repeatedly recreating them\n" " -B, --bidi Bi-directional sockets test\n" " -D, --debug Debug mode - dump packets L2 - L5\n" + " -v, --verbose Verbose output\n" " -C, --tx-pkt-count=n Number of packets to send\n"; ksft_print_msg(str, prog); } @@ -392,7 +394,7 @@ static void *nsswitchthread(void *args) ksft_test_result_fail("ERROR: [%s] interface \"%s\" does not exist\n", __func__, ifdict[targs->idx]->ifname); } else { - ksft_print_msg("Interface found: %s\n", ifdict[targs->idx]->ifname); + print_verbose("Interface found: %s\n", ifdict[targs->idx]->ifname); targs->retptr = true; } } @@ -422,7 +424,7 @@ static int validate_interfaces(void) pthread_join(ns_thread, NULL); if (targs->retptr) - ksft_print_msg("NS switched: %s\n", ifdict[i]->nsname); + print_verbose("NS switched: %s\n", ifdict[i]->nsname); free(targs); } else { @@ -432,7 +434,7 @@ static int validate_interfaces(void) ("ERROR: interface \"%s\" does not exist\n", ifdict[i]->ifname); ret = false; } else { - ksft_print_msg("Interface found: %s\n", ifdict[i]->ifname); + print_verbose("Interface found: %s\n", ifdict[i]->ifname); } } } @@ -446,7 +448,7 @@ static void parse_command_line(int argc, char **argv) opterr = 0; for (;;) { - c = getopt_long(argc, argv, "i:q:pSNcTBDC:", long_options, &option_index); + c = getopt_long(argc, argv, "i:q:pSNcTBDC:v", long_options, &option_index); if (c == -1) break; @@ -497,6 +499,9 @@ static void parse_command_line(int argc, char **argv) case 'C': opt_pkt_count = atoi(optarg); break; + case 'v': + opt_verbose = 1; + break; default: usage(basename(argv[0])); ksft_exit_xfail(); @@ -714,7 +719,7 @@ static void worker_pkt_dump(void) int payload = *((uint32_t *)(pkt_buf[iter]->payload + PKT_HDR_SIZE)); if (payload == EOT) { - ksft_print_msg("End-of-transmission frame received\n"); + print_verbose("End-of-transmission frame received\n"); fprintf(stdout, "---------------------------------------\n"); break; } @@ -746,7 +751,7 @@ static void worker_pkt_validate(void) } if (payloadseqnum == EOT) { - ksft_print_msg("End-of-transmission frame received: PASS\n"); + print_verbose("End-of-transmission frame received: PASS\n"); sigvar = 1; break; } @@ -836,7 +841,7 @@ static void *worker_testapp_validate(void *arg) usleep(USLEEP_MAX); } - ksft_print_msg("Interface [%s] vector [Tx]\n", ifobject->ifname); + print_verbose("Interface [%s] vector [Tx]\n", ifobject->ifname); for (int i = 0; i < num_frames; i++) { /*send EOT frame */ if (i == (num_frames - 1)) @@ -850,7 +855,7 @@ static void *worker_testapp_validate(void *arg) gen_eth_frame(ifobject->umem, i * XSK_UMEM__DEFAULT_FRAME_SIZE); } - ksft_print_msg("Sending %d packets on interface %s\n", + print_verbose("Sending %d packets on interface 
%s\n", (opt_pkt_count - 1), ifobject->ifname); tx_only_all(ifobject); } else if (ifobject->fv.vector == rx) { @@ -860,7 +865,7 @@ static void *worker_testapp_validate(void *arg) if (!bidi_pass) thread_common_ops(ifobject, bufs, &sync_mutex_tx, &spinning_rx); - ksft_print_msg("Interface [%s] vector [Rx]\n", ifobject->ifname); + print_verbose("Interface [%s] vector [Rx]\n", ifobject->ifname); xsk_populate_fill_ring(ifobject->umem); TAILQ_INIT(&head); @@ -890,11 +895,11 @@ static void *worker_testapp_validate(void *arg) break; } - ksft_print_msg("Received %d packets on interface %s\n", + print_verbose("Received %d packets on interface %s\n", pkt_counter, ifobject->ifname); if (opt_teardown) - ksft_print_msg("Destroying socket\n"); + print_verbose("Destroying socket\n"); } if (!opt_bidi || bidi_pass) { @@ -914,7 +919,7 @@ static void testapp_validate(void) if (opt_bidi && bidi_pass) { pthread_init_mutex(); if (!switching_notify) { - ksft_print_msg("Switching Tx/Rx vectors\n"); + print_verbose("Switching Tx/Rx vectors\n"); switching_notify++; } } @@ -974,7 +979,7 @@ static void testapp_sockets(void) pkt_counter = 0; prev_pkt = -1; sigvar = 0; - ksft_print_msg("Creating socket\n"); + print_verbose("Creating socket\n"); testapp_validate(); opt_bidi ? bidi_pass++ : bidi_pass; } diff --git a/tools/testing/selftests/bpf/xdpxceiver.h b/tools/testing/selftests/bpf/xdpxceiver.h index 0e9f9b7e61c2..f66f399dfb2d 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.h +++ b/tools/testing/selftests/bpf/xdpxceiver.h @@ -42,6 +42,8 @@ #define POLL_TMOUT 1000 #define NEED_WAKEUP true +#define print_verbose(x...) do { if (opt_verbose) ksft_print_msg(x); } while (0) + typedef __u32 u32; typedef __u16 u16; typedef __u8 u8; @@ -51,11 +53,11 @@ enum TESTS { ORDER_CONTENT_VALIDATE_XDP_DRV = 1, }; -u8 uut; -u8 debug_pkt_dump; -u32 num_frames; -u8 switching_notify; -u8 bidi_pass; +static u8 uut; +static u8 debug_pkt_dump; +static u32 num_frames; +static u8 switching_notify; +static u8 bidi_pass; static u32 opt_xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; static int opt_queue; @@ -64,6 +66,7 @@ static int opt_poll; static int opt_teardown; static int opt_bidi; static u32 opt_xdp_bind_flags = XDP_USE_NEED_WAKEUP; +static u8 opt_verbose; static u8 pkt_data[XSK_UMEM__DEFAULT_FRAME_SIZE]; static u32 pkt_counter; static u32 prev_pkt = -1; diff --git a/tools/testing/selftests/bpf/xsk_prereqs.sh b/tools/testing/selftests/bpf/xsk_prereqs.sh index 9d54c4645127..ef8c5b31f4b6 100755 --- a/tools/testing/selftests/bpf/xsk_prereqs.sh +++ b/tools/testing/selftests/bpf/xsk_prereqs.sh @@ -82,24 +82,21 @@ clear_configs() { if [ $(ip netns show | grep $3 &>/dev/null; echo $?;) == 0 ]; then [ $(ip netns exec $3 ip link show $2 &>/dev/null; echo $?;) == 0 ] && - { echo "removing link $1:$2"; ip netns exec $3 ip link del $2; } - echo "removing ns $3" + { ip netns exec $3 ip link del $2; } ip netns del $3 fi #Once we delete a veth pair node, the entire veth pair is removed, #this is just to be cautious just incase the NS does not exist then #veth node inside NS won't get removed so we explicitly remove it [ $(ip link show $1 &>/dev/null; echo $?;) == 0 ] && - { echo "removing link $1"; ip link del $1; } + { ip link del $1; } if [ -f ${SPECFILE} ]; then - echo "removing spec file:" ${SPECFILE} rm -f ${SPECFILE} fi } cleanup_exit() { - echo "cleaning up..." 
clear_configs $1 $2 $3 } @@ -131,5 +128,5 @@ execxdpxceiver() copy[$index]=${!current} done - ./${XSKOBJ} -i ${VETH0} -i ${VETH1},${NS1} ${copy[*]} -C ${NUMPKTS} + ./${XSKOBJ} -i ${VETH0} -i ${VETH1},${NS1} ${copy[*]} -C ${NUMPKTS} ${VERBOSE_ARG} } From d2b0dfd5d1f94fe74ed580b5a1d5fdb5bf11f2fb Mon Sep 17 00:00:00 2001 From: Ciara Loftus Date: Tue, 23 Feb 2021 16:23:02 +0000 Subject: [PATCH 16/90] selftests/bpf: Expose and rename debug argument Launching xdpxceiver with -D enables what was formerly know as 'debug' mode. Rename this mode to 'dump-pkts' as it better describes the behavior enabled by the option. New usage: ./xdpxceiver .. -D or ./xdpxceiver .. --dump-pkts Also make it possible to pass this flag to the app via the test_xsk.sh shell script like so: ./test_xsk.sh -D Signed-off-by: Ciara Loftus Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210223162304.7450-3-ciara.loftus@intel.com --- tools/testing/selftests/bpf/test_xsk.sh | 10 +++++++++- tools/testing/selftests/bpf/xdpxceiver.c | 6 +++--- tools/testing/selftests/bpf/xsk_prereqs.sh | 3 ++- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/bpf/test_xsk.sh b/tools/testing/selftests/bpf/test_xsk.sh index f4cedf4c2718..dbb129a36606 100755 --- a/tools/testing/selftests/bpf/test_xsk.sh +++ b/tools/testing/selftests/bpf/test_xsk.sh @@ -74,14 +74,18 @@ # # Run with verbose output: # sudo ./test_xsk.sh -v +# +# Run and dump packet contents: +# sudo ./test_xsk.sh -D . xsk_prereqs.sh -while getopts "cv" flag +while getopts "cvD" flag do case "${flag}" in c) colorconsole=1;; v) verbose=1;; + D) dump_pkts=1;; esac done @@ -138,6 +142,10 @@ if [[ $verbose -eq 1 ]]; then VERBOSE_ARG="-v" fi +if [[ $dump_pkts -eq 1 ]]; then + DUMP_PKTS_ARG="-D" +fi + test_status $retval "${TEST_NAME}" ## START TESTS diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c index 8af746c9a6b6..506423201197 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.c +++ b/tools/testing/selftests/bpf/xdpxceiver.c @@ -58,7 +58,7 @@ * - Rx thread verifies if all 10k packets were received and delivered in-order, * and have the right content * - * Enable/disable debug mode: + * Enable/disable packet dump mode: * -------------------------- * To enable L2 - L4 headers and payload dump of each packet on STDOUT, add * parameter -D to params array in test_xsk.sh, i.e. 
params=("-S" "-D") @@ -340,7 +340,7 @@ static struct option long_options[] = { {"copy", no_argument, 0, 'c'}, {"tear-down", no_argument, 0, 'T'}, {"bidi", optional_argument, 0, 'B'}, - {"debug", optional_argument, 0, 'D'}, + {"dump-pkts", optional_argument, 0, 'D'}, {"verbose", no_argument, 0, 'v'}, {"tx-pkt-count", optional_argument, 0, 'C'}, {0, 0, 0, 0} @@ -359,7 +359,7 @@ static void usage(const char *prog) " -c, --copy Force copy mode\n" " -T, --tear-down Tear down sockets by repeatedly recreating them\n" " -B, --bidi Bi-directional sockets test\n" - " -D, --debug Debug mode - dump packets L2 - L5\n" + " -D, --dump-pkts Dump packets L2 - L5\n" " -v, --verbose Verbose output\n" " -C, --tx-pkt-count=n Number of packets to send\n"; ksft_print_msg(str, prog); diff --git a/tools/testing/selftests/bpf/xsk_prereqs.sh b/tools/testing/selftests/bpf/xsk_prereqs.sh index ef8c5b31f4b6..da93575d757a 100755 --- a/tools/testing/selftests/bpf/xsk_prereqs.sh +++ b/tools/testing/selftests/bpf/xsk_prereqs.sh @@ -128,5 +128,6 @@ execxdpxceiver() copy[$index]=${!current} done - ./${XSKOBJ} -i ${VETH0} -i ${VETH1},${NS1} ${copy[*]} -C ${NUMPKTS} ${VERBOSE_ARG} + ./${XSKOBJ} -i ${VETH0} -i ${VETH1},${NS1} ${copy[*]} -C ${NUMPKTS} ${VERBOSE_ARG} \ + ${DUMP_PKTS_ARG} } From d3e3bf5b4c673e79c5a11608f53d4e0059a7ec79 Mon Sep 17 00:00:00 2001 From: Ciara Loftus Date: Tue, 23 Feb 2021 16:23:03 +0000 Subject: [PATCH 17/90] selftests/bpf: Restructure xsk selftests Prior to this commit individual xsk tests were launched from the shell script 'test_xsk.sh'. When adding a new test type, two new test configurations had to be added to this file - one for each of the supported XDP 'modes' (skb or drv). Should zero copy support be added to the xsk selftest framework in the future, three new test configurations would need to be added for each new test type. Each new test type also typically requires new CLI arguments for the xdpxceiver program. This commit aims to reduce the overhead of adding new tests, by launching the test configurations from within the xdpxceiver program itself, using simple loops. Every test is run every time the C program is executed. Many of the CLI arguments can be removed as a result. Signed-off-by: Ciara Loftus Signed-off-by: Alexei Starovoitov Reviewed-by: Maciej Fijalkowski Link: https://lore.kernel.org/bpf/20210223162304.7450-4-ciara.loftus@intel.com --- tools/testing/selftests/bpf/test_xsk.sh | 112 +---------- tools/testing/selftests/bpf/xdpxceiver.c | 218 +++++++++++++-------- tools/testing/selftests/bpf/xdpxceiver.h | 33 ++-- tools/testing/selftests/bpf/xsk_prereqs.sh | 24 +-- 4 files changed, 159 insertions(+), 228 deletions(-) diff --git a/tools/testing/selftests/bpf/test_xsk.sh b/tools/testing/selftests/bpf/test_xsk.sh index dbb129a36606..56d4474e2c83 100755 --- a/tools/testing/selftests/bpf/test_xsk.sh +++ b/tools/testing/selftests/bpf/test_xsk.sh @@ -152,117 +152,9 @@ test_status $retval "${TEST_NAME}" statusList=() -### TEST 1 -TEST_NAME="XSK KSELFTEST FRAMEWORK" +TEST_NAME="XSK KSELFTESTS" -if [[ $verbose -eq 1 ]]; then - echo "Switching interfaces [${VETH0}, ${VETH1}] to XDP Generic mode" -fi -vethXDPgeneric ${VETH0} ${VETH1} ${NS1} - -retval=$? -if [ $retval -eq 0 ]; then - if [[ $verbose -eq 1 ]]; then - echo "Switching interfaces [${VETH0}, ${VETH1}] to XDP Native mode" - fi - vethXDPnative ${VETH0} ${VETH1} ${NS1} -fi - -retval=$? 
-test_status $retval "${TEST_NAME}" -statusList+=($retval) - -### TEST 2 -TEST_NAME="SKB NOPOLL" - -vethXDPgeneric ${VETH0} ${VETH1} ${NS1} - -params=("-S") -execxdpxceiver params - -retval=$? -test_status $retval "${TEST_NAME}" -statusList+=($retval) - -### TEST 3 -TEST_NAME="SKB POLL" - -vethXDPgeneric ${VETH0} ${VETH1} ${NS1} - -params=("-S" "-p") -execxdpxceiver params - -retval=$? -test_status $retval "${TEST_NAME}" -statusList+=($retval) - -### TEST 4 -TEST_NAME="DRV NOPOLL" - -vethXDPnative ${VETH0} ${VETH1} ${NS1} - -params=("-N") -execxdpxceiver params - -retval=$? -test_status $retval "${TEST_NAME}" -statusList+=($retval) - -### TEST 5 -TEST_NAME="DRV POLL" - -vethXDPnative ${VETH0} ${VETH1} ${NS1} - -params=("-N" "-p") -execxdpxceiver params - -retval=$? -test_status $retval "${TEST_NAME}" -statusList+=($retval) - -### TEST 6 -TEST_NAME="SKB SOCKET TEARDOWN" - -vethXDPgeneric ${VETH0} ${VETH1} ${NS1} - -params=("-S" "-T") -execxdpxceiver params - -retval=$? -test_status $retval "${TEST_NAME}" -statusList+=($retval) - -### TEST 7 -TEST_NAME="DRV SOCKET TEARDOWN" - -vethXDPnative ${VETH0} ${VETH1} ${NS1} - -params=("-N" "-T") -execxdpxceiver params - -retval=$? -test_status $retval "${TEST_NAME}" -statusList+=($retval) - -### TEST 8 -TEST_NAME="SKB BIDIRECTIONAL SOCKETS" - -vethXDPgeneric ${VETH0} ${VETH1} ${NS1} - -params=("-S" "-B") -execxdpxceiver params - -retval=$? -test_status $retval "${TEST_NAME}" -statusList+=($retval) - -### TEST 9 -TEST_NAME="DRV BIDIRECTIONAL SOCKETS" - -vethXDPnative ${VETH0} ${VETH1} ${NS1} - -params=("-N" "-B") -execxdpxceiver params +execxdpxceiver retval=$? test_status $retval "${TEST_NAME}" diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c index 506423201197..e7913444518d 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.c +++ b/tools/testing/selftests/bpf/xdpxceiver.c @@ -18,12 +18,7 @@ * These selftests test AF_XDP SKB and Native/DRV modes using veth * Virtual Ethernet interfaces. * - * The following tests are run: - * - * 1. AF_XDP SKB mode - * Generic mode XDP is driver independent, used when the driver does - * not have support for XDP. Works on any netdevice using sockets and - * generic XDP path. XDP hook from netif_receive_skb(). + * For each mode, the following tests are run: * a. nopoll - soft-irq processing * b. poll - using poll() syscall * c. Socket Teardown @@ -34,17 +29,6 @@ * completion rings on each socket, tx/rx in both directions. Only nopoll * mode is used * - * 2. AF_XDP DRV/Native mode - * Works on any netdevice with XDP_REDIRECT support, driver dependent. Processes - * packets before SKB allocation. Provides better performance than SKB. Driver - * hook available just after DMA of buffer descriptor. - * a. nopoll - * b. poll - * c. Socket Teardown - * d. 
Bi-directional sockets - * - Only copy mode is supported because veth does not currently support - * zero-copy mode - * * Total tests: 8 * * Flow: @@ -98,17 +82,23 @@ typedef __u16 __sum16; static void __exit_with_error(int error, const char *file, const char *func, int line) { - ksft_test_result_fail - ("[%s:%s:%i]: ERROR: %d/\"%s\"\n", file, func, line, error, strerror(error)); - ksft_exit_xfail(); + if (configured_mode == TEST_MODE_UNCONFIGURED) { + ksft_exit_fail_msg + ("[%s:%s:%i]: ERROR: %d/\"%s\"\n", file, func, line, error, strerror(error)); + } else { + ksft_test_result_fail + ("[%s:%s:%i]: ERROR: %d/\"%s\"\n", file, func, line, error, strerror(error)); + ksft_exit_xfail(); + } } #define exit_with_error(error) __exit_with_error(error, __FILE__, __func__, __LINE__) #define print_ksft_result(void)\ - (ksft_test_result_pass("PASS: %s %s %s%s\n", uut ? "DRV" : "SKB", opt_poll ? "POLL" :\ - "NOPOLL", opt_teardown ? "Socket Teardown" : "",\ - opt_bidi ? "Bi-directional Sockets" : "")) + (ksft_test_result_pass("PASS: %s %s %s%s\n", configured_mode ? "DRV" : "SKB",\ + test_type == TEST_TYPE_POLL ? "POLL" : "NOPOLL",\ + test_type == TEST_TYPE_TEARDOWN ? "Socket Teardown" : "",\ + test_type == TEST_TYPE_BIDI ? "Bi-directional Sockets" : "")) static void pthread_init_mutex(void) { @@ -311,10 +301,10 @@ static int xsk_configure_socket(struct ifobject *ifobject) cfg.rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS; cfg.tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS; cfg.libbpf_flags = 0; - cfg.xdp_flags = opt_xdp_flags; - cfg.bind_flags = opt_xdp_bind_flags; + cfg.xdp_flags = xdp_flags; + cfg.bind_flags = xdp_bind_flags; - if (!opt_bidi) { + if (test_type != TEST_TYPE_BIDI) { rxr = (ifobject->fv.vector == rx) ? &ifobject->xsk->rx : NULL; txr = (ifobject->fv.vector == tx) ? 
&ifobject->xsk->tx : NULL; } else { @@ -334,12 +324,6 @@ static int xsk_configure_socket(struct ifobject *ifobject) static struct option long_options[] = { {"interface", required_argument, 0, 'i'}, {"queue", optional_argument, 0, 'q'}, - {"poll", no_argument, 0, 'p'}, - {"xdp-skb", no_argument, 0, 'S'}, - {"xdp-native", no_argument, 0, 'N'}, - {"copy", no_argument, 0, 'c'}, - {"tear-down", no_argument, 0, 'T'}, - {"bidi", optional_argument, 0, 'B'}, {"dump-pkts", optional_argument, 0, 'D'}, {"verbose", no_argument, 0, 'v'}, {"tx-pkt-count", optional_argument, 0, 'C'}, @@ -353,12 +337,6 @@ static void usage(const char *prog) " Options:\n" " -i, --interface Use interface\n" " -q, --queue=n Use queue n (default 0)\n" - " -p, --poll Use poll syscall\n" - " -S, --xdp-skb=n Use XDP SKB mode\n" - " -N, --xdp-native=n Enforce XDP DRV (native) mode\n" - " -c, --copy Force copy mode\n" - " -T, --tear-down Tear down sockets by repeatedly recreating them\n" - " -B, --bidi Bi-directional sockets test\n" " -D, --dump-pkts Dump packets L2 - L5\n" " -v, --verbose Verbose output\n" " -C, --tx-pkt-count=n Number of packets to send\n"; @@ -448,7 +426,7 @@ static void parse_command_line(int argc, char **argv) opterr = 0; for (;;) { - c = getopt_long(argc, argv, "i:q:pSNcTBDC:v", long_options, &option_index); + c = getopt_long(argc, argv, "i:q:DC:v", long_options, &option_index); if (c == -1) break; @@ -471,28 +449,6 @@ static void parse_command_line(int argc, char **argv) case 'q': opt_queue = atoi(optarg); break; - case 'p': - opt_poll = 1; - break; - case 'S': - opt_xdp_flags |= XDP_FLAGS_SKB_MODE; - opt_xdp_bind_flags |= XDP_COPY; - uut = ORDER_CONTENT_VALIDATE_XDP_SKB; - break; - case 'N': - opt_xdp_flags |= XDP_FLAGS_DRV_MODE; - opt_xdp_bind_flags |= XDP_COPY; - uut = ORDER_CONTENT_VALIDATE_XDP_DRV; - break; - case 'c': - opt_xdp_bind_flags |= XDP_COPY; - break; - case 'T': - opt_teardown = 1; - break; - case 'B': - opt_bidi = 1; - break; case 'D': debug_pkt_dump = 1; break; @@ -508,6 +464,11 @@ static void parse_command_line(int argc, char **argv) } } + if (!opt_pkt_count) { + print_verbose("No tx-pkt-count specified, using default %u\n", DEFAULT_PKT_CNT); + opt_pkt_count = DEFAULT_PKT_CNT; + } + if (!validate_interfaces()) { usage(basename(argv[0])); ksft_exit_xfail(); @@ -659,7 +620,7 @@ static void tx_only_all(struct ifobject *ifobject) while ((opt_pkt_count && pkt_cnt < opt_pkt_count) || !opt_pkt_count) { int batch_size = get_batch_size(pkt_cnt); - if (opt_poll) { + if (test_type == TEST_TYPE_POLL) { ret = poll(fds, 1, POLL_TMOUT); if (ret <= 0) continue; @@ -883,7 +844,7 @@ static void *worker_testapp_validate(void *arg) pthread_mutex_unlock(&sync_mutex); while (1) { - if (opt_poll) { + if (test_type == TEST_TYPE_POLL) { ret = poll(fds, 1, POLL_TMOUT); if (ret <= 0) continue; @@ -898,11 +859,11 @@ static void *worker_testapp_validate(void *arg) print_verbose("Received %d packets on interface %s\n", pkt_counter, ifobject->ifname); - if (opt_teardown) + if (test_type == TEST_TYPE_TEARDOWN) print_verbose("Destroying socket\n"); } - if (!opt_bidi || bidi_pass) { + if ((test_type != TEST_TYPE_BIDI) || bidi_pass) { xsk_socket__delete(ifobject->xsk->xsk); (void)xsk_umem__delete(ifobject->umem->umem); } @@ -912,11 +873,12 @@ static void *worker_testapp_validate(void *arg) static void testapp_validate(void) { struct timespec max_wait = { 0, 0 }; + bool bidi = test_type == TEST_TYPE_BIDI; pthread_attr_init(&attr); pthread_attr_setstacksize(&attr, THREAD_STACK); - if (opt_bidi && bidi_pass) { + if ((test_type 
== TEST_TYPE_BIDI) && bidi_pass) { pthread_init_mutex(); if (!switching_notify) { print_verbose("Switching Tx/Rx vectors\n"); @@ -927,10 +889,10 @@ static void testapp_validate(void) pthread_mutex_lock(&sync_mutex); /*Spawn RX thread */ - if (!opt_bidi || !bidi_pass) { + if (!bidi || !bidi_pass) { if (pthread_create(&t0, &attr, worker_testapp_validate, ifdict[1])) exit_with_error(errno); - } else if (opt_bidi && bidi_pass) { + } else if (bidi && bidi_pass) { /*switch Tx/Rx vectors */ ifdict[0]->fv.vector = rx; if (pthread_create(&t0, &attr, worker_testapp_validate, ifdict[0])) @@ -947,10 +909,10 @@ static void testapp_validate(void) pthread_mutex_unlock(&sync_mutex); /*Spawn TX thread */ - if (!opt_bidi || !bidi_pass) { + if (!bidi || !bidi_pass) { if (pthread_create(&t1, &attr, worker_testapp_validate, ifdict[0])) exit_with_error(errno); - } else if (opt_bidi && bidi_pass) { + } else if (bidi && bidi_pass) { /*switch Tx/Rx vectors */ ifdict[1]->fv.vector = tx; if (pthread_create(&t1, &attr, worker_testapp_validate, ifdict[1])) @@ -969,19 +931,20 @@ static void testapp_validate(void) free(pkt_buf); } - if (!opt_teardown && !opt_bidi) + if (!(test_type == TEST_TYPE_TEARDOWN) && !bidi) print_ksft_result(); } static void testapp_sockets(void) { - for (int i = 0; i < (opt_teardown ? MAX_TEARDOWN_ITER : MAX_BIDI_ITER); i++) { + for (int i = 0; i < ((test_type == TEST_TYPE_TEARDOWN) ? MAX_TEARDOWN_ITER : MAX_BIDI_ITER); + i++) { pkt_counter = 0; prev_pkt = -1; sigvar = 0; print_verbose("Creating socket\n"); testapp_validate(); - opt_bidi ? bidi_pass++ : bidi_pass; + test_type == TEST_TYPE_BIDI ? bidi_pass++ : bidi_pass; } print_ksft_result(); @@ -1008,6 +971,98 @@ static void init_iface_config(struct ifaceconfigobj *ifaceconfig) ifdict[1]->src_port = ifaceconfig->dst_port; } +static void *nsdisablemodethread(void *args) +{ + struct targs *targs = args; + + targs->retptr = false; + + if (switch_namespace(targs->idx)) { + targs->retptr = bpf_set_link_xdp_fd(ifdict[targs->idx]->ifindex, -1, targs->flags); + } else { + targs->retptr = errno; + print_verbose("Failed to switch namespace to %s\n", ifdict[targs->idx]->nsname); + } + + pthread_exit(NULL); +} + +static void disable_xdp_mode(int mode) +{ + int err = 0; + __u32 flags = XDP_FLAGS_UPDATE_IF_NOEXIST | mode; + char *mode_str = mode & XDP_FLAGS_SKB_MODE ? "skb" : "drv"; + + for (int i = 0; i < MAX_INTERFACES; i++) { + if (strcmp(ifdict[i]->nsname, "")) { + struct targs *targs; + + targs = malloc(sizeof(*targs)); + memset(targs, 0, sizeof(*targs)); + if (!targs) + exit_with_error(errno); + + targs->idx = i; + targs->flags = flags; + if (pthread_create(&ns_thread, NULL, nsdisablemodethread, targs)) + exit_with_error(errno); + + pthread_join(ns_thread, NULL); + err = targs->retptr; + free(targs); + } else { + err = bpf_set_link_xdp_fd(ifdict[i]->ifindex, -1, flags); + } + + if (err) { + print_verbose("Failed to disable %s mode on interface %s\n", + mode_str, ifdict[i]->ifname); + exit_with_error(err); + } + + print_verbose("Disabled %s mode for interface: %s\n", mode_str, ifdict[i]->ifname); + configured_mode = mode & XDP_FLAGS_SKB_MODE ? 
TEST_MODE_DRV : TEST_MODE_SKB; + } +} + +static void run_pkt_test(int mode, int type) +{ + test_type = type; + + /* reset defaults after potential previous test */ + xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; + pkt_counter = 0; + switching_notify = 0; + bidi_pass = 0; + prev_pkt = -1; + ifdict[0]->fv.vector = tx; + ifdict[1]->fv.vector = rx; + + switch (mode) { + case (TEST_MODE_SKB): + if (configured_mode == TEST_MODE_DRV) + disable_xdp_mode(XDP_FLAGS_DRV_MODE); + xdp_flags |= XDP_FLAGS_SKB_MODE; + break; + case (TEST_MODE_DRV): + if (configured_mode == TEST_MODE_SKB) + disable_xdp_mode(XDP_FLAGS_SKB_MODE); + xdp_flags |= XDP_FLAGS_DRV_MODE; + break; + default: + break; + } + + pthread_init_mutex(); + + if ((test_type != TEST_TYPE_TEARDOWN) && (test_type != TEST_TYPE_BIDI)) + testapp_validate(); + else + testapp_sockets(); + + pthread_destroy_mutex(); +} + int main(int argc, char **argv) { struct rlimit _rlim = { RLIM_INFINITY, RLIM_INFINITY }; @@ -1021,6 +1076,7 @@ int main(int argc, char **argv) const char *IP2 = "192.168.100.161"; u16 UDP_DST_PORT = 2020; u16 UDP_SRC_PORT = 2121; + int i, j; ifaceconfig = malloc(sizeof(struct ifaceconfigobj)); memcpy(ifaceconfig->dst_mac, MAC1, ETH_ALEN); @@ -1046,24 +1102,18 @@ int main(int argc, char **argv) init_iface_config(ifaceconfig); - pthread_init_mutex(); + disable_xdp_mode(XDP_FLAGS_DRV_MODE); - ksft_set_plan(1); + ksft_set_plan(TEST_MODE_MAX * TEST_TYPE_MAX); - if (!opt_teardown && !opt_bidi) { - testapp_validate(); - } else if (opt_teardown && opt_bidi) { - ksft_test_result_fail("ERROR: parameters -T and -B cannot be used together\n"); - ksft_exit_xfail(); - } else { - testapp_sockets(); + for (i = 0; i < TEST_MODE_MAX; i++) { + for (j = 0; j < TEST_TYPE_MAX; j++) + run_pkt_test(i, j); } for (int i = 0; i < MAX_INTERFACES; i++) free(ifdict[i]); - pthread_destroy_mutex(); - ksft_exit_pass(); return 0; diff --git a/tools/testing/selftests/bpf/xdpxceiver.h b/tools/testing/selftests/bpf/xdpxceiver.h index f66f399dfb2d..e05703f661f8 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.h +++ b/tools/testing/selftests/bpf/xdpxceiver.h @@ -41,6 +41,7 @@ #define BATCH_SIZE 64 #define POLL_TMOUT 1000 #define NEED_WAKEUP true +#define DEFAULT_PKT_CNT 10000 #define print_verbose(x...) 
do { if (opt_verbose) ksft_print_msg(x); } while (0) @@ -48,28 +49,37 @@ typedef __u32 u32; typedef __u16 u16; typedef __u8 u8; -enum TESTS { - ORDER_CONTENT_VALIDATE_XDP_SKB = 0, - ORDER_CONTENT_VALIDATE_XDP_DRV = 1, +enum TEST_MODES { + TEST_MODE_UNCONFIGURED = -1, + TEST_MODE_SKB, + TEST_MODE_DRV, + TEST_MODE_MAX }; -static u8 uut; +enum TEST_TYPES { + TEST_TYPE_NOPOLL, + TEST_TYPE_POLL, + TEST_TYPE_TEARDOWN, + TEST_TYPE_BIDI, + TEST_TYPE_MAX +}; + +static int configured_mode = TEST_MODE_UNCONFIGURED; static u8 debug_pkt_dump; static u32 num_frames; static u8 switching_notify; static u8 bidi_pass; +static int test_type; -static u32 opt_xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; static int opt_queue; static int opt_pkt_count; -static int opt_poll; -static int opt_teardown; -static int opt_bidi; -static u32 opt_xdp_bind_flags = XDP_USE_NEED_WAKEUP; static u8 opt_verbose; + +static u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; +static u32 xdp_bind_flags = XDP_USE_NEED_WAKEUP | XDP_COPY; static u8 pkt_data[XSK_UMEM__DEFAULT_FRAME_SIZE]; static u32 pkt_counter; -static u32 prev_pkt = -1; +static long prev_pkt = -1; static int sigvar; struct xsk_umem_info { @@ -140,8 +150,9 @@ pthread_t t0, t1, ns_thread; pthread_attr_t attr; struct targs { - bool retptr; + u8 retptr; int idx; + u32 flags; }; TAILQ_HEAD(head_s, pkt) head = TAILQ_HEAD_INITIALIZER(head); diff --git a/tools/testing/selftests/bpf/xsk_prereqs.sh b/tools/testing/selftests/bpf/xsk_prereqs.sh index da93575d757a..dac1c5f78752 100755 --- a/tools/testing/selftests/bpf/xsk_prereqs.sh +++ b/tools/testing/selftests/bpf/xsk_prereqs.sh @@ -105,29 +105,7 @@ validate_ip_utility() [ ! $(type -P ip) ] && { echo "'ip' not found. Skipping tests."; test_exit $ksft_skip 1; } } -vethXDPgeneric() -{ - ip link set dev $1 xdpdrv off - ip netns exec $3 ip link set dev $2 xdpdrv off -} - -vethXDPnative() -{ - ip link set dev $1 xdpgeneric off - ip netns exec $3 ip link set dev $2 xdpgeneric off -} - execxdpxceiver() { - local -a 'paramkeys=("${!'"$1"'[@]}")' copy - paramkeysstr=${paramkeys[*]} - - for index in $paramkeysstr; - do - current=$1"[$index]" - copy[$index]=${!current} - done - - ./${XSKOBJ} -i ${VETH0} -i ${VETH1},${NS1} ${copy[*]} -C ${NUMPKTS} ${VERBOSE_ARG} \ - ${DUMP_PKTS_ARG} + ./${XSKOBJ} -i ${VETH0} -i ${VETH1},${NS1} -C ${NUMPKTS} ${VERBOSE_ARG} ${DUMP_PKTS_ARG} } From b267e5a458a719f3f5eaaaebe87c5f4a13584832 Mon Sep 17 00:00:00 2001 From: Ciara Loftus Date: Tue, 23 Feb 2021 16:23:04 +0000 Subject: [PATCH 18/90] selftests/bpf: Introduce xsk statistics tests This commit introduces a range of tests to the xsk testsuite for validating xsk statistics. A new test type called 'stats' is added. Within it there are four sub-tests. Each test configures a scenario which should trigger the given error statistic. The test passes if the statistic is successfully incremented. The four statistics for which tests have been created are: 1. rx dropped Increase the UMEM frame headroom to a value which results in insufficient space in the rx buffer for both the packet and the headroom. 2. tx invalid Set the 'len' field of tx descriptors to an invalid value (umem frame size + 1). 3. rx ring full Reduce the size of the RX ring to a fraction of the fill ring size. 4. fill queue empty Do not populate the fill queue and then try to receive pkts. 
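All four sub-tests read the same per-socket counters through the XDP_STATISTICS socket option and compare the relevant field against the expected count. A minimal sketch of that read follows; read_xsk_stats() is an invented name for this example, and fd is assumed to be the descriptor of an already-bound AF_XDP socket (e.g. from xsk_socket__fd()):

  #include <sys/socket.h>          /* getsockopt(), SOL_XDP */
  #include <linux/if_xdp.h>        /* XDP_STATISTICS, struct xdp_statistics */

  static int read_xsk_stats(int fd, struct xdp_statistics *stats)
  {
          socklen_t optlen = sizeof(*stats);

          if (getsockopt(fd, SOL_XDP, XDP_STATISTICS, stats, &optlen))
                  return -1;

          /* The sub-tests then check stats->rx_dropped, stats->tx_invalid_descs,
           * stats->rx_ring_full or stats->rx_fill_ring_empty_descs respectively.
           */
          return 0;
  }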
Signed-off-by: Ciara Loftus Signed-off-by: Alexei Starovoitov Reviewed-by: Maciej Fijalkowski Link: https://lore.kernel.org/bpf/20210223162304.7450-5-ciara.loftus@intel.com --- tools/testing/selftests/bpf/xdpxceiver.c | 137 ++++++++++++++++++++--- tools/testing/selftests/bpf/xdpxceiver.h | 13 +++ 2 files changed, 136 insertions(+), 14 deletions(-) diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c index e7913444518d..8b0f7fdd9003 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.c +++ b/tools/testing/selftests/bpf/xdpxceiver.c @@ -28,8 +28,21 @@ * Configure sockets as bi-directional tx/rx sockets, sets up fill and * completion rings on each socket, tx/rx in both directions. Only nopoll * mode is used + * e. Statistics + * Trigger some error conditions and ensure that the appropriate statistics + * are incremented. Within this test, the following statistics are tested: + * i. rx dropped + * Increase the UMEM frame headroom to a value which results in + * insufficient space in the rx buffer for both the packet and the headroom. + * ii. tx invalid + * Set the 'len' field of tx descriptors to an invalid value (umem frame + * size + 1). + * iii. rx ring full + * Reduce the size of the RX ring to a fraction of the fill ring size. + * iv. fill queue empty + * Do not populate the fill queue and then try to receive pkts. * - * Total tests: 8 + * Total tests: 10 * * Flow: * ----- @@ -95,10 +108,11 @@ static void __exit_with_error(int error, const char *file, const char *func, int #define exit_with_error(error) __exit_with_error(error, __FILE__, __func__, __LINE__) #define print_ksft_result(void)\ - (ksft_test_result_pass("PASS: %s %s %s%s\n", configured_mode ? "DRV" : "SKB",\ + (ksft_test_result_pass("PASS: %s %s %s%s%s\n", configured_mode ? "DRV" : "SKB",\ test_type == TEST_TYPE_POLL ? "POLL" : "NOPOLL",\ test_type == TEST_TYPE_TEARDOWN ? "Socket Teardown" : "",\ - test_type == TEST_TYPE_BIDI ? "Bi-directional Sockets" : "")) + test_type == TEST_TYPE_BIDI ? "Bi-directional Sockets" : "",\ + test_type == TEST_TYPE_STATS ? "Stats" : "")) static void pthread_init_mutex(void) { @@ -260,13 +274,20 @@ static void gen_eth_frame(struct xsk_umem_info *umem, u64 addr) static void xsk_configure_umem(struct ifobject *data, void *buffer, u64 size) { int ret; + struct xsk_umem_config cfg = { + .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS, + .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS, + .frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE, + .frame_headroom = frame_headroom, + .flags = XSK_UMEM__DEFAULT_FLAGS + }; data->umem = calloc(1, sizeof(struct xsk_umem_info)); if (!data->umem) exit_with_error(errno); ret = xsk_umem__create(&data->umem->umem, buffer, size, - &data->umem->fq, &data->umem->cq, NULL); + &data->umem->fq, &data->umem->cq, &cfg); if (ret) exit_with_error(ret); @@ -298,7 +319,7 @@ static int xsk_configure_socket(struct ifobject *ifobject) exit_with_error(errno); ifobject->xsk->umem = ifobject->umem; - cfg.rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS; + cfg.rx_size = rxqsize; cfg.tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS; cfg.libbpf_flags = 0; cfg.xdp_flags = xdp_flags; @@ -565,6 +586,8 @@ static void tx_only(struct xsk_socket_info *xsk, u32 *frameptr, int batch_size) { u32 idx; unsigned int i; + bool tx_invalid_test = stat_test_type == STAT_TEST_TX_INVALID; + u32 len = tx_invalid_test ? 
XSK_UMEM__DEFAULT_FRAME_SIZE + 1 : PKT_SIZE; while (xsk_ring_prod__reserve(&xsk->tx, batch_size, &idx) < batch_size) complete_tx_only(xsk, batch_size); @@ -573,11 +596,16 @@ static void tx_only(struct xsk_socket_info *xsk, u32 *frameptr, int batch_size) struct xdp_desc *tx_desc = xsk_ring_prod__tx_desc(&xsk->tx, idx + i); tx_desc->addr = (*frameptr + i) << XSK_UMEM__DEFAULT_FRAME_SHIFT; - tx_desc->len = PKT_SIZE; + tx_desc->len = len; } xsk_ring_prod__submit(&xsk->tx, batch_size); - xsk->outstanding_tx += batch_size; + if (!tx_invalid_test) { + xsk->outstanding_tx += batch_size; + } else { + if (!NEED_WAKEUP || xsk_ring_prod__needs_wakeup(&xsk->tx)) + kick_tx(xsk); + } *frameptr += batch_size; *frameptr %= num_frames; complete_tx_only(xsk, batch_size); @@ -689,6 +717,48 @@ static void worker_pkt_dump(void) } } +static void worker_stats_validate(struct ifobject *ifobject) +{ + struct xdp_statistics stats; + socklen_t optlen; + int err; + struct xsk_socket *xsk = stat_test_type == STAT_TEST_TX_INVALID ? + ifdict[!ifobject->ifdict_index]->xsk->xsk : + ifobject->xsk->xsk; + int fd = xsk_socket__fd(xsk); + unsigned long xsk_stat = 0, expected_stat = opt_pkt_count; + + sigvar = 0; + + optlen = sizeof(stats); + err = getsockopt(fd, SOL_XDP, XDP_STATISTICS, &stats, &optlen); + if (err) + return; + + if (optlen == sizeof(struct xdp_statistics)) { + switch (stat_test_type) { + case STAT_TEST_RX_DROPPED: + xsk_stat = stats.rx_dropped; + break; + case STAT_TEST_TX_INVALID: + xsk_stat = stats.tx_invalid_descs; + break; + case STAT_TEST_RX_FULL: + xsk_stat = stats.rx_ring_full; + expected_stat -= RX_FULL_RXQSIZE; + break; + case STAT_TEST_RX_FILL_EMPTY: + xsk_stat = stats.rx_fill_ring_empty_descs; + break; + default: + break; + } + + if (xsk_stat == expected_stat) + sigvar = 1; + } +} + static void worker_pkt_validate(void) { u32 payloadseqnum = -2; @@ -827,7 +897,8 @@ static void *worker_testapp_validate(void *arg) thread_common_ops(ifobject, bufs, &sync_mutex_tx, &spinning_rx); print_verbose("Interface [%s] vector [Rx]\n", ifobject->ifname); - xsk_populate_fill_ring(ifobject->umem); + if (stat_test_type != STAT_TEST_RX_FILL_EMPTY) + xsk_populate_fill_ring(ifobject->umem); TAILQ_INIT(&head); if (debug_pkt_dump) { @@ -849,15 +920,21 @@ static void *worker_testapp_validate(void *arg) if (ret <= 0) continue; } - rx_pkt(ifobject->xsk, fds); - worker_pkt_validate(); + + if (test_type != TEST_TYPE_STATS) { + rx_pkt(ifobject->xsk, fds); + worker_pkt_validate(); + } else { + worker_stats_validate(ifobject); + } if (sigvar) break; } - print_verbose("Received %d packets on interface %s\n", - pkt_counter, ifobject->ifname); + if (test_type != TEST_TYPE_STATS) + print_verbose("Received %d packets on interface %s\n", + pkt_counter, ifobject->ifname); if (test_type == TEST_TYPE_TEARDOWN) print_verbose("Destroying socket\n"); @@ -931,7 +1008,7 @@ static void testapp_validate(void) free(pkt_buf); } - if (!(test_type == TEST_TYPE_TEARDOWN) && !bidi) + if (!(test_type == TEST_TYPE_TEARDOWN) && !bidi && !(test_type == TEST_TYPE_STATS)) print_ksft_result(); } @@ -950,6 +1027,32 @@ static void testapp_sockets(void) print_ksft_result(); } +static void testapp_stats(void) +{ + for (int i = 0; i < STAT_TEST_TYPE_MAX; i++) { + stat_test_type = i; + + /* reset defaults */ + rxqsize = XSK_RING_CONS__DEFAULT_NUM_DESCS; + frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM; + + switch (stat_test_type) { + case STAT_TEST_RX_DROPPED: + frame_headroom = XSK_UMEM__DEFAULT_FRAME_SIZE - + XDP_PACKET_HEADROOM - 1; + break; + case 
STAT_TEST_RX_FULL: + rxqsize = RX_FULL_RXQSIZE; + break; + default: + break; + } + testapp_validate(); + } + + print_ksft_result(); +} + static void init_iface_config(struct ifaceconfigobj *ifaceconfig) { /*Init interface0 */ @@ -1037,6 +1140,10 @@ static void run_pkt_test(int mode, int type) prev_pkt = -1; ifdict[0]->fv.vector = tx; ifdict[1]->fv.vector = rx; + sigvar = 0; + stat_test_type = -1; + rxqsize = XSK_RING_CONS__DEFAULT_NUM_DESCS; + frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM; switch (mode) { case (TEST_MODE_SKB): @@ -1055,7 +1162,9 @@ static void run_pkt_test(int mode, int type) pthread_init_mutex(); - if ((test_type != TEST_TYPE_TEARDOWN) && (test_type != TEST_TYPE_BIDI)) + if (test_type == TEST_TYPE_STATS) + testapp_stats(); + else if ((test_type != TEST_TYPE_TEARDOWN) && (test_type != TEST_TYPE_BIDI)) testapp_validate(); else testapp_sockets(); diff --git a/tools/testing/selftests/bpf/xdpxceiver.h b/tools/testing/selftests/bpf/xdpxceiver.h index e05703f661f8..30314ef305c2 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.h +++ b/tools/testing/selftests/bpf/xdpxceiver.h @@ -42,6 +42,7 @@ #define POLL_TMOUT 1000 #define NEED_WAKEUP true #define DEFAULT_PKT_CNT 10000 +#define RX_FULL_RXQSIZE 32 #define print_verbose(x...) do { if (opt_verbose) ksft_print_msg(x); } while (0) @@ -61,9 +62,18 @@ enum TEST_TYPES { TEST_TYPE_POLL, TEST_TYPE_TEARDOWN, TEST_TYPE_BIDI, + TEST_TYPE_STATS, TEST_TYPE_MAX }; +enum STAT_TEST_TYPES { + STAT_TEST_RX_DROPPED, + STAT_TEST_TX_INVALID, + STAT_TEST_RX_FULL, + STAT_TEST_RX_FILL_EMPTY, + STAT_TEST_TYPE_MAX +}; + static int configured_mode = TEST_MODE_UNCONFIGURED; static u8 debug_pkt_dump; static u32 num_frames; @@ -81,6 +91,9 @@ static u8 pkt_data[XSK_UMEM__DEFAULT_FRAME_SIZE]; static u32 pkt_counter; static long prev_pkt = -1; static int sigvar; +static int stat_test_type; +static u32 rxqsize; +static u32 frame_headroom; struct xsk_umem_info { struct xsk_ring_prod fq; From a83586a7ddba25065ec37323c05deb9019ce4fa9 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 23 Feb 2021 21:14:57 +0800 Subject: [PATCH 19/90] bpf: Remove blank line in bpf helper description comment Commit 34b2021cc616 ("bpf: Add BPF-helper for MTU checking") added an extra blank line in bpf helper description. This will make bpf_helpers_doc.py stop building bpf_helper_defs.h immediately after bpf_check_mtu(), which will affect future added functions. Fixes: 34b2021cc616 ("bpf: Add BPF-helper for MTU checking") Signed-off-by: Hangbin Liu Signed-off-by: Daniel Borkmann Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/bpf/20210223131457.1378978-1-liuhangbin@gmail.com Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 1 - tools/include/uapi/linux/bpf.h | 1 - 2 files changed, 2 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4c24daa43bac..79c893310492 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3850,7 +3850,6 @@ union bpf_attr { * * long bpf_check_mtu(void *ctx, u32 ifindex, u32 *mtu_len, s32 len_diff, u64 flags) * Description - * Check ctx packet size against exceeding MTU of net device (based * on *ifindex*). This helper will likely be used in combination * with helpers that adjust/change the packet size. 
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 4c24daa43bac..79c893310492 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3850,7 +3850,6 @@ union bpf_attr { * * long bpf_check_mtu(void *ctx, u32 ifindex, u32 *mtu_len, s32 len_diff, u64 flags) * Description - * Check ctx packet size against exceeding MTU of net device (based * on *ifindex*). This helper will likely be used in combination * with helpers that adjust/change the packet size. From 887596095ec2a9ea39ffcf98f27bf2e77c5eb512 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 23 Feb 2021 10:49:26 -0800 Subject: [PATCH 20/90] bpf: Clean up sockmap related Kconfigs As suggested by John, clean up sockmap related Kconfigs: Reduce the scope of CONFIG_BPF_STREAM_PARSER down to TCP stream parser, to reflect its name. Make the rest sockmap code simply depend on CONFIG_BPF_SYSCALL and CONFIG_INET, the latter is still needed at this point because of TCP/UDP proto update. And leave CONFIG_NET_SOCK_MSG untouched, as it is used by non-sockmap cases. Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Reviewed-by: Lorenz Bauer Acked-by: John Fastabend Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210223184934.6054-2-xiyou.wangcong@gmail.com --- include/linux/bpf.h | 26 ++++--- include/linux/bpf_types.h | 6 +- include/linux/skmsg.h | 18 +++++ include/net/tcp.h | 16 +++-- include/net/udp.h | 4 +- init/Kconfig | 1 + net/Kconfig | 6 +- net/core/Makefile | 6 +- net/core/skmsg.c | 145 +++++++++++++++++++++----------------- net/core/sock_map.c | 2 + net/ipv4/Makefile | 2 +- net/ipv4/tcp_bpf.c | 4 +- 12 files changed, 133 insertions(+), 103 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ae2c35641619..2be47ada5f2d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1778,7 +1778,7 @@ static inline void bpf_map_offload_map_free(struct bpf_map *map) } #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */ -#if defined(CONFIG_BPF_STREAM_PARSER) +#if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, struct bpf_prog *old, u32 which); int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog); @@ -1786,7 +1786,18 @@ int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype); int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags); void sock_map_unhash(struct sock *sk); void sock_map_close(struct sock *sk, long timeout); + +void bpf_sk_reuseport_detach(struct sock *sk); +int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key, + void *value); +int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags); #else +static inline void bpf_sk_reuseport_detach(struct sock *sk) +{ +} + +#ifdef CONFIG_BPF_SYSCALL static inline int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, struct bpf_prog *old, u32 which) @@ -1811,20 +1822,7 @@ static inline int sock_map_update_elem_sys(struct bpf_map *map, void *key, void { return -EOPNOTSUPP; } -#endif /* CONFIG_BPF_STREAM_PARSER */ -#if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) -void bpf_sk_reuseport_detach(struct sock *sk); -int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key, - void *value); -int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, - void *value, u64 map_flags); -#else -static inline void bpf_sk_reuseport_detach(struct sock *sk) -{ -} - -#ifdef 
CONFIG_BPF_SYSCALL static inline int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key, void *value) { diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index b9edee336d80..f883f01a5061 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -103,10 +103,6 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP_HASH, dev_map_hash_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_SK_STORAGE, sk_storage_map_ops) -#if defined(CONFIG_BPF_STREAM_PARSER) -BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops) -BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops) -#endif #ifdef CONFIG_BPF_LSM BPF_MAP_TYPE(BPF_MAP_TYPE_INODE_STORAGE, inode_storage_map_ops) #endif @@ -116,6 +112,8 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops) #endif #ifdef CONFIG_INET +BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, reuseport_array_ops) #endif #endif diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 8edbbf5f2f93..db7a08be4725 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -71,7 +71,9 @@ struct sk_psock_link { }; struct sk_psock_parser { +#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) struct strparser strp; +#endif bool enabled; void (*saved_data_ready)(struct sock *sk); }; @@ -305,9 +307,25 @@ static inline void sk_psock_report_error(struct sk_psock *psock, int err) struct sk_psock *sk_psock_init(struct sock *sk, int node); +#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock); void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock); void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock); +#else +static inline int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock) +{ + return -EOPNOTSUPP; +} + +static inline void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) +{ +} + +static inline void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock) +{ +} +#endif + void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock); void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock); diff --git a/include/net/tcp.h b/include/net/tcp.h index 963cd86d12dd..c00e125dcfb9 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2222,25 +2222,27 @@ void tcp_update_ulp(struct sock *sk, struct proto *p, __MODULE_INFO(alias, alias_userspace, name); \ __MODULE_INFO(alias, alias_tcp_ulp, "tcp-ulp-" name) +#ifdef CONFIG_NET_SOCK_MSG struct sk_msg; struct sk_psock; -#ifdef CONFIG_BPF_STREAM_PARSER +#ifdef CONFIG_BPF_SYSCALL struct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock); void tcp_bpf_clone(const struct sock *sk, struct sock *newsk); -#else -static inline void tcp_bpf_clone(const struct sock *sk, struct sock *newsk) -{ -} -#endif /* CONFIG_BPF_STREAM_PARSER */ +#endif /* CONFIG_BPF_SYSCALL */ -#ifdef CONFIG_NET_SOCK_MSG int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, u32 bytes, int flags); int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, int len, int flags); #endif /* CONFIG_NET_SOCK_MSG */ +#if !defined(CONFIG_BPF_SYSCALL) || !defined(CONFIG_NET_SOCK_MSG) +static inline void tcp_bpf_clone(const struct sock *sk, struct sock *newsk) +{ +} +#endif + #ifdef CONFIG_CGROUP_BPF static inline void bpf_skops_init_skb(struct bpf_sock_ops_kern *skops, struct sk_buff 
*skb, diff --git a/include/net/udp.h b/include/net/udp.h index a132a02b2f2c..d4d064c59232 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -515,9 +515,9 @@ static inline struct sk_buff *udp_rcv_segment(struct sock *sk, return segs; } -#ifdef CONFIG_BPF_STREAM_PARSER +#ifdef CONFIG_BPF_SYSCALL struct sk_psock; struct proto *udp_bpf_get_proto(struct sock *sk, struct sk_psock *psock); -#endif /* BPF_STREAM_PARSER */ +#endif #endif /* _UDP_H */ diff --git a/init/Kconfig b/init/Kconfig index 096e1af5c586..66cef5eac275 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1702,6 +1702,7 @@ config BPF_SYSCALL select BPF select IRQ_WORK select TASKS_TRACE_RCU + select NET_SOCK_MSG if INET default n help Enable the bpf() system call that allows to manipulate eBPF diff --git a/net/Kconfig b/net/Kconfig index 8cea808ad9e8..0ead7ec0d2bd 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -317,13 +317,9 @@ config BPF_STREAM_PARSER select STREAM_PARSER select NET_SOCK_MSG help - Enabling this allows a stream parser to be used with + Enabling this allows a TCP stream parser to be used with BPF_MAP_TYPE_SOCKMAP. - BPF_MAP_TYPE_SOCKMAP provides a map type to use with network sockets. - It can be used to enforce socket policy, implement socket redirects, - etc. - config NET_FLOW_LIMIT bool depends on RPS diff --git a/net/core/Makefile b/net/core/Makefile index 3e2c378e5f31..0c2233c826fd 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -16,7 +16,6 @@ obj-y += dev.o dev_addr_lists.o dst.o netevent.o \ obj-y += net-sysfs.o obj-$(CONFIG_PAGE_POOL) += page_pool.o obj-$(CONFIG_PROC_FS) += net-procfs.o -obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o obj-$(CONFIG_NET_PKTGEN) += pktgen.o obj-$(CONFIG_NETPOLL) += netpoll.o obj-$(CONFIG_FIB_RULES) += fib_rules.o @@ -28,10 +27,13 @@ obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o obj-$(CONFIG_LWTUNNEL) += lwtunnel.o obj-$(CONFIG_LWTUNNEL_BPF) += lwt_bpf.o -obj-$(CONFIG_BPF_STREAM_PARSER) += sock_map.o obj-$(CONFIG_DST_CACHE) += dst_cache.o obj-$(CONFIG_HWBM) += hwbm.o obj-$(CONFIG_NET_DEVLINK) += devlink.o obj-$(CONFIG_GRO_CELLS) += gro_cells.o obj-$(CONFIG_FAILOVER) += failover.o +ifeq ($(CONFIG_INET),y) +obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o +obj-$(CONFIG_BPF_SYSCALL) += sock_map.o +endif obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 1261512d6807..e017744111e1 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -645,15 +645,15 @@ static void sk_psock_link_destroy(struct sk_psock *psock) } } +static void sk_psock_done_strp(struct sk_psock *psock); + static void sk_psock_destroy_deferred(struct work_struct *gc) { struct sk_psock *psock = container_of(gc, struct sk_psock, gc); /* No sk_callback_lock since already detached. 
*/ - /* Parser has been stopped */ - if (psock->progs.skb_parser) - strp_done(&psock->parser.strp); + sk_psock_done_strp(psock); cancel_work_sync(&psock->work); @@ -750,14 +750,6 @@ static int sk_psock_bpf_run(struct sk_psock *psock, struct bpf_prog *prog, return bpf_prog_run_pin_on_cpu(prog, skb); } -static struct sk_psock *sk_psock_from_strp(struct strparser *strp) -{ - struct sk_psock_parser *parser; - - parser = container_of(strp, struct sk_psock_parser, strp); - return container_of(parser, struct sk_psock, parser); -} - static void sk_psock_skb_redirect(struct sk_buff *skb) { struct sk_psock *psock_other; @@ -866,6 +858,24 @@ out_free: } } +static void sk_psock_write_space(struct sock *sk) +{ + struct sk_psock *psock; + void (*write_space)(struct sock *sk) = NULL; + + rcu_read_lock(); + psock = sk_psock(sk); + if (likely(psock)) { + if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) + schedule_work(&psock->work); + write_space = psock->saved_write_space; + } + rcu_read_unlock(); + if (write_space) + write_space(sk); +} + +#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb) { struct sk_psock *psock; @@ -897,6 +907,14 @@ static int sk_psock_strp_read_done(struct strparser *strp, int err) return err; } +static struct sk_psock *sk_psock_from_strp(struct strparser *strp) +{ + struct sk_psock_parser *parser; + + parser = container_of(strp, struct sk_psock_parser, strp); + return container_of(parser, struct sk_psock, parser); +} + static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb) { struct sk_psock *psock = sk_psock_from_strp(strp); @@ -933,6 +951,56 @@ static void sk_psock_strp_data_ready(struct sock *sk) rcu_read_unlock(); } +int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock) +{ + static const struct strp_callbacks cb = { + .rcv_msg = sk_psock_strp_read, + .read_sock_done = sk_psock_strp_read_done, + .parse_msg = sk_psock_strp_parse, + }; + + psock->parser.enabled = false; + return strp_init(&psock->parser.strp, sk, &cb); +} + +void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) +{ + struct sk_psock_parser *parser = &psock->parser; + + if (parser->enabled) + return; + + parser->saved_data_ready = sk->sk_data_ready; + sk->sk_data_ready = sk_psock_strp_data_ready; + sk->sk_write_space = sk_psock_write_space; + parser->enabled = true; +} + +void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock) +{ + struct sk_psock_parser *parser = &psock->parser; + + if (!parser->enabled) + return; + + sk->sk_data_ready = parser->saved_data_ready; + parser->saved_data_ready = NULL; + strp_stop(&parser->strp); + parser->enabled = false; +} + +static void sk_psock_done_strp(struct sk_psock *psock) +{ + /* Parser has been stopped */ + if (psock->progs.skb_parser) + strp_done(&psock->parser.strp); +} +#else +static void sk_psock_done_strp(struct sk_psock *psock) +{ +} +#endif /* CONFIG_BPF_STREAM_PARSER */ + static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb, unsigned int offset, size_t orig_len) { @@ -984,35 +1052,6 @@ static void sk_psock_verdict_data_ready(struct sock *sk) sock->ops->read_sock(sk, &desc, sk_psock_verdict_recv); } -static void sk_psock_write_space(struct sock *sk) -{ - struct sk_psock *psock; - void (*write_space)(struct sock *sk) = NULL; - - rcu_read_lock(); - psock = sk_psock(sk); - if (likely(psock)) { - if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) - schedule_work(&psock->work); - write_space = psock->saved_write_space; - } - 
rcu_read_unlock(); - if (write_space) - write_space(sk); -} - -int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock) -{ - static const struct strp_callbacks cb = { - .rcv_msg = sk_psock_strp_read, - .read_sock_done = sk_psock_strp_read_done, - .parse_msg = sk_psock_strp_parse, - }; - - psock->parser.enabled = false; - return strp_init(&psock->parser.strp, sk, &cb); -} - void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock) { struct sk_psock_parser *parser = &psock->parser; @@ -1026,32 +1065,6 @@ void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock) parser->enabled = true; } -void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) -{ - struct sk_psock_parser *parser = &psock->parser; - - if (parser->enabled) - return; - - parser->saved_data_ready = sk->sk_data_ready; - sk->sk_data_ready = sk_psock_strp_data_ready; - sk->sk_write_space = sk_psock_write_space; - parser->enabled = true; -} - -void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock) -{ - struct sk_psock_parser *parser = &psock->parser; - - if (!parser->enabled) - return; - - sk->sk_data_ready = parser->saved_data_ready; - parser->saved_data_ready = NULL; - strp_stop(&parser->strp); - parser->enabled = false; -} - void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock) { struct sk_psock_parser *parser = &psock->parser; diff --git a/net/core/sock_map.c b/net/core/sock_map.c index d758fb83c884..ee3334dd3a38 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -1461,9 +1461,11 @@ int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, case BPF_SK_MSG_VERDICT: pprog = &progs->msg_parser; break; +#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) case BPF_SK_SKB_STREAM_PARSER: pprog = &progs->skb_parser; break; +#endif case BPF_SK_SKB_STREAM_VERDICT: pprog = &progs->skb_verdict; break; diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 5b77a46885b9..bbdd9c44f14e 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -62,7 +62,7 @@ obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o obj-$(CONFIG_NET_SOCK_MSG) += tcp_bpf.o -obj-$(CONFIG_BPF_STREAM_PARSER) += udp_bpf.o +obj-$(CONFIG_BPF_SYSCALL) += udp_bpf.o obj-$(CONFIG_NETLABEL) += cipso_ipv4.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index bc7d2a586e18..17c322b875fd 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -229,7 +229,7 @@ int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, } EXPORT_SYMBOL_GPL(tcp_bpf_sendmsg_redir); -#ifdef CONFIG_BPF_STREAM_PARSER +#ifdef CONFIG_BPF_SYSCALL static bool tcp_bpf_stream_read(const struct sock *sk) { struct sk_psock *psock; @@ -629,4 +629,4 @@ void tcp_bpf_clone(const struct sock *sk, struct sock *newsk) if (prot == &tcp_bpf_prots[family][TCP_BPF_BASE]) newsk->sk_prot = sk->sk_prot_creator; } -#endif /* CONFIG_BPF_STREAM_PARSER */ +#endif /* CONFIG_BPF_SYSCALL */ From 5a685cd94b21a88efa6be77169eddef525368034 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 23 Feb 2021 10:49:27 -0800 Subject: [PATCH 21/90] skmsg: Get rid of struct sk_psock_parser struct sk_psock_parser is embedded in sk_psock, it is unnecessary as skb verdict also uses ->saved_data_ready. We can simply fold these fields into sk_psock, and get rid of ->enabled. 
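The observation behind the cleanup is that psock->saved_data_ready can double as the state: it is non-NULL exactly while the psock has taken over sk->sk_data_ready, so a separate ->enabled flag adds nothing. A stripped-down sketch of the start/stop pairing the patch converges on (the structures and the replacement callback are simplified stand-ins; the real functions are sk_psock_start_strp()/sk_psock_stop_strp() and their verdict counterparts in the diff below):

  struct sock_min  { void (*sk_data_ready)(struct sock_min *sk); };
  struct psock_min { void (*saved_data_ready)(struct sock_min *sk); };

  static void psock_data_ready(struct sock_min *sk) { /* parse/verdict work */ }

  static void psock_start(struct sock_min *sk, struct psock_min *psock)
  {
          if (psock->saved_data_ready)            /* already installed */
                  return;
          psock->saved_data_ready = sk->sk_data_ready;
          sk->sk_data_ready = psock_data_ready;
  }

  static void psock_stop(struct sock_min *sk, struct psock_min *psock)
  {
          if (!psock->saved_data_ready)           /* never installed */
                  return;
          sk->sk_data_ready = psock->saved_data_ready;
          psock->saved_data_ready = NULL;         /* non-NULL <=> installed */
  }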
Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210223184934.6054-3-xiyou.wangcong@gmail.com --- include/linux/skmsg.h | 19 ++++++---------- net/core/skmsg.c | 53 +++++++++++++------------------------------ net/core/sock_map.c | 8 +++---- 3 files changed, 27 insertions(+), 53 deletions(-) diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index db7a08be4725..22e26f82de33 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -70,14 +70,6 @@ struct sk_psock_link { void *link_raw; }; -struct sk_psock_parser { -#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) - struct strparser strp; -#endif - bool enabled; - void (*saved_data_ready)(struct sock *sk); -}; - struct sk_psock_work_state { struct sk_buff *skb; u32 len; @@ -92,7 +84,9 @@ struct sk_psock { u32 eval; struct sk_msg *cork; struct sk_psock_progs progs; - struct sk_psock_parser parser; +#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) + struct strparser strp; +#endif struct sk_buff_head ingress_skb; struct list_head ingress_msg; unsigned long state; @@ -102,6 +96,7 @@ struct sk_psock { void (*saved_unhash)(struct sock *sk); void (*saved_close)(struct sock *sk, long timeout); void (*saved_write_space)(struct sock *sk); + void (*saved_data_ready)(struct sock *sk); struct proto *sk_proto; struct sk_psock_work_state work_state; struct work_struct work; @@ -418,8 +413,8 @@ static inline void sk_psock_put(struct sock *sk, struct sk_psock *psock) static inline void sk_psock_data_ready(struct sock *sk, struct sk_psock *psock) { - if (psock->parser.enabled) - psock->parser.saved_data_ready(sk); + if (psock->saved_data_ready) + psock->saved_data_ready(sk); else sk->sk_data_ready(sk); } @@ -458,6 +453,6 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock) { if (!psock) return false; - return psock->parser.enabled; + return !!psock->saved_data_ready; } #endif /* _LINUX_SKMSG_H */ diff --git a/net/core/skmsg.c b/net/core/skmsg.c index e017744111e1..d00c9a4b47e7 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -907,17 +907,9 @@ static int sk_psock_strp_read_done(struct strparser *strp, int err) return err; } -static struct sk_psock *sk_psock_from_strp(struct strparser *strp) -{ - struct sk_psock_parser *parser; - - parser = container_of(strp, struct sk_psock_parser, strp); - return container_of(parser, struct sk_psock, parser); -} - static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb) { - struct sk_psock *psock = sk_psock_from_strp(strp); + struct sk_psock *psock = container_of(strp, struct sk_psock, strp); struct bpf_prog *prog; int ret = skb->len; @@ -941,10 +933,10 @@ static void sk_psock_strp_data_ready(struct sock *sk) psock = sk_psock(sk); if (likely(psock)) { if (tls_sw_has_ctx_rx(sk)) { - psock->parser.saved_data_ready(sk); + psock->saved_data_ready(sk); } else { write_lock_bh(&sk->sk_callback_lock); - strp_data_ready(&psock->parser.strp); + strp_data_ready(&psock->strp); write_unlock_bh(&sk->sk_callback_lock); } } @@ -959,41 +951,34 @@ int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock) .parse_msg = sk_psock_strp_parse, }; - psock->parser.enabled = false; - return strp_init(&psock->parser.strp, sk, &cb); + return strp_init(&psock->strp, sk, &cb); } void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) { - struct sk_psock_parser *parser = &psock->parser; - - if (parser->enabled) + if (psock->saved_data_ready) return; - parser->saved_data_ready = sk->sk_data_ready; + 
psock->saved_data_ready = sk->sk_data_ready; sk->sk_data_ready = sk_psock_strp_data_ready; sk->sk_write_space = sk_psock_write_space; - parser->enabled = true; } void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock) { - struct sk_psock_parser *parser = &psock->parser; - - if (!parser->enabled) + if (!psock->saved_data_ready) return; - sk->sk_data_ready = parser->saved_data_ready; - parser->saved_data_ready = NULL; - strp_stop(&parser->strp); - parser->enabled = false; + sk->sk_data_ready = psock->saved_data_ready; + psock->saved_data_ready = NULL; + strp_stop(&psock->strp); } static void sk_psock_done_strp(struct sk_psock *psock) { /* Parser has been stopped */ if (psock->progs.skb_parser) - strp_done(&psock->parser.strp); + strp_done(&psock->strp); } #else static void sk_psock_done_strp(struct sk_psock *psock) @@ -1054,25 +1039,19 @@ static void sk_psock_verdict_data_ready(struct sock *sk) void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock) { - struct sk_psock_parser *parser = &psock->parser; - - if (parser->enabled) + if (psock->saved_data_ready) return; - parser->saved_data_ready = sk->sk_data_ready; + psock->saved_data_ready = sk->sk_data_ready; sk->sk_data_ready = sk_psock_verdict_data_ready; sk->sk_write_space = sk_psock_write_space; - parser->enabled = true; } void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock) { - struct sk_psock_parser *parser = &psock->parser; - - if (!parser->enabled) + if (!psock->saved_data_ready) return; - sk->sk_data_ready = parser->saved_data_ready; - parser->saved_data_ready = NULL; - parser->enabled = false; + sk->sk_data_ready = psock->saved_data_ready; + psock->saved_data_ready = NULL; } diff --git a/net/core/sock_map.c b/net/core/sock_map.c index ee3334dd3a38..1a28a5c2c61e 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -148,9 +148,9 @@ static void sock_map_del_link(struct sock *sk, struct bpf_map *map = link->map; struct bpf_stab *stab = container_of(map, struct bpf_stab, map); - if (psock->parser.enabled && stab->progs.skb_parser) + if (psock->saved_data_ready && stab->progs.skb_parser) strp_stop = true; - if (psock->parser.enabled && stab->progs.skb_verdict) + if (psock->saved_data_ready && stab->progs.skb_verdict) verdict_stop = true; list_del(&link->list); sk_psock_free_link(link); @@ -283,14 +283,14 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, goto out_drop; write_lock_bh(&sk->sk_callback_lock); - if (skb_parser && skb_verdict && !psock->parser.enabled) { + if (skb_parser && skb_verdict && !psock->saved_data_ready) { ret = sk_psock_init_strp(sk, psock); if (ret) goto out_unlock_drop; psock_set_prog(&psock->progs.skb_verdict, skb_verdict); psock_set_prog(&psock->progs.skb_parser, skb_parser); sk_psock_start_strp(sk, psock); - } else if (!skb_parser && skb_verdict && !psock->parser.enabled) { + } else if (!skb_parser && skb_verdict && !psock->saved_data_ready) { psock_set_prog(&psock->progs.skb_verdict, skb_verdict); sk_psock_start_verdict(sk,psock); } From 16137b09a66f2b75090f1e56a9ba0e27ef845ebc Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 23 Feb 2021 10:49:28 -0800 Subject: [PATCH 22/90] bpf: Compute data_end dynamically with JIT code Currently, we compute ->data_end with a compile-time constant offset of skb. But as Jakub pointed out, we can actually compute it in eBPF JIT code at run-time, so that we can competely get rid of ->data_end. This is similar to skb_shinfo(skb) computation in bpf_convert_shinfo_access(). 
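Concretely, for SK_SKB programs a load of ctx->data_end is now expanded by the verifier into instructions that compute the value on the fly. The C equivalent of the emitted sequence is roughly (illustrative sketch, not literal kernel code):

    /* data_end = skb->data + skb_headlen(skb) */
    void *data_end = skb->data + (skb->len - skb->data_len);

so the per-skb ->data_end field in tcp_skb_cb, and the bpf_compute_data_end_sk_skb() call sites that kept it up to date, can be dropped.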
Suggested-by: Jakub Sitnicki Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210223184934.6054-4-xiyou.wangcong@gmail.com --- include/net/tcp.h | 6 ------ net/core/filter.c | 48 +++++++++++++++++++++++++++-------------------- net/core/skmsg.c | 1 - 3 files changed, 28 insertions(+), 27 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index c00e125dcfb9..947ef5da6867 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -886,18 +886,12 @@ struct tcp_skb_cb { struct { __u32 flags; struct sock *sk_redir; - void *data_end; } bpf; }; }; #define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0])) -static inline void bpf_compute_data_end_sk_skb(struct sk_buff *skb) -{ - TCP_SKB_CB(skb)->bpf.data_end = skb->data + skb_headlen(skb); -} - static inline bool tcp_skb_bpf_ingress(const struct sk_buff *skb) { return TCP_SKB_CB(skb)->bpf.flags & BPF_F_INGRESS; diff --git a/net/core/filter.c b/net/core/filter.c index adfdad234674..13bcf248ee7b 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1863,10 +1863,7 @@ static const struct bpf_func_proto bpf_sk_fullsock_proto = { static inline int sk_skb_try_make_writable(struct sk_buff *skb, unsigned int write_len) { - int err = __bpf_try_make_writable(skb, write_len); - - bpf_compute_data_end_sk_skb(skb); - return err; + return __bpf_try_make_writable(skb, write_len); } BPF_CALL_2(sk_skb_pull_data, struct sk_buff *, skb, u32, len) @@ -3577,7 +3574,6 @@ BPF_CALL_4(sk_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, return -ENOMEM; __skb_pull(skb, len_diff_abs); } - bpf_compute_data_end_sk_skb(skb); if (tls_sw_has_ctx_rx(skb->sk)) { struct strp_msg *rxm = strp_msg(skb); @@ -3742,10 +3738,7 @@ static const struct bpf_func_proto bpf_skb_change_tail_proto = { BPF_CALL_3(sk_skb_change_tail, struct sk_buff *, skb, u32, new_len, u64, flags) { - int ret = __bpf_skb_change_tail(skb, new_len, flags); - - bpf_compute_data_end_sk_skb(skb); - return ret; + return __bpf_skb_change_tail(skb, new_len, flags); } static const struct bpf_func_proto sk_skb_change_tail_proto = { @@ -3808,10 +3801,7 @@ static const struct bpf_func_proto bpf_skb_change_head_proto = { BPF_CALL_3(sk_skb_change_head, struct sk_buff *, skb, u32, head_room, u64, flags) { - int ret = __bpf_skb_change_head(skb, head_room, flags); - - bpf_compute_data_end_sk_skb(skb); - return ret; + return __bpf_skb_change_head(skb, head_room, flags); } static const struct bpf_func_proto sk_skb_change_head_proto = { @@ -9655,22 +9645,40 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, return insn - insn_buf; } +/* data_end = skb->data + skb_headlen() */ +static struct bpf_insn *bpf_convert_data_end_access(const struct bpf_insn *si, + struct bpf_insn *insn) +{ + /* si->dst_reg = skb->data */ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, data)); + /* AX = skb->len */ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, len), + BPF_REG_AX, si->src_reg, + offsetof(struct sk_buff, len)); + /* si->dst_reg = skb->data + skb->len */ + *insn++ = BPF_ALU64_REG(BPF_ADD, si->dst_reg, BPF_REG_AX); + /* AX = skb->data_len */ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data_len), + BPF_REG_AX, si->src_reg, + offsetof(struct sk_buff, data_len)); + /* si->dst_reg = skb->data + skb->len - skb->data_len */ + *insn++ = BPF_ALU64_REG(BPF_SUB, si->dst_reg, BPF_REG_AX); + + return insn; +} + static u32 
sk_skb_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; - int off; switch (si->off) { case offsetof(struct __sk_buff, data_end): - off = si->off; - off -= offsetof(struct __sk_buff, data_end); - off += offsetof(struct sk_buff, cb); - off += offsetof(struct tcp_skb_cb, bpf.data_end); - *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, - si->src_reg, off); + insn = bpf_convert_data_end_access(si, insn); break; default: return bpf_convert_ctx_access(type, si, insn_buf, prog, diff --git a/net/core/skmsg.c b/net/core/skmsg.c index d00c9a4b47e7..8822001ab3dc 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -746,7 +746,6 @@ EXPORT_SYMBOL_GPL(sk_psock_msg_verdict); static int sk_psock_bpf_run(struct sk_psock *psock, struct bpf_prog *prog, struct sk_buff *skb) { - bpf_compute_data_end_sk_skb(skb); return bpf_prog_run_pin_on_cpu(prog, skb); } From e3526bb92a2084cdaec6cb2855bcec98b280426c Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 23 Feb 2021 10:49:29 -0800 Subject: [PATCH 23/90] skmsg: Move sk_redir from TCP_SKB_CB to skb Currently TCP_SKB_CB() is hard-coded in skmsg code, it certainly does not work for any other non-TCP protocols. We can move them to skb ext, but it introduces a memory allocation on fast path. Fortunately, we only need to a word-size to store all the information, because the flags actually only contains 1 bit so can be just packed into the lowest bit of the "pointer", which is stored as unsigned long. Inside struct sk_buff, '_skb_refdst' can be reused because skb dst is no longer needed after ->sk_data_ready() so we can just drop it. Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210223184934.6054-5-xiyou.wangcong@gmail.com --- include/linux/skbuff.h | 3 +++ include/linux/skmsg.h | 38 ++++++++++++++++++++++++++++++++++++++ include/net/tcp.h | 19 ------------------- net/core/skmsg.c | 31 +++++++++++++++++++------------ net/core/sock_map.c | 8 ++------ 5 files changed, 62 insertions(+), 37 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 6d0a33d1c0db..bd84f799c952 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -755,6 +755,9 @@ struct sk_buff { void (*destructor)(struct sk_buff *skb); }; struct list_head tcp_tsorted_anchor; +#ifdef CONFIG_NET_SOCK_MSG + unsigned long _sk_redir; +#endif }; #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 22e26f82de33..e0de45527bb6 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -455,4 +455,42 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock) return false; return !!psock->saved_data_ready; } + +#if IS_ENABLED(CONFIG_NET_SOCK_MSG) + +/* We only have one bit so far. 
*/ +#define BPF_F_PTR_MASK ~(BPF_F_INGRESS) + +static inline bool skb_bpf_ingress(const struct sk_buff *skb) +{ + unsigned long sk_redir = skb->_sk_redir; + + return sk_redir & BPF_F_INGRESS; +} + +static inline void skb_bpf_set_ingress(struct sk_buff *skb) +{ + skb->_sk_redir |= BPF_F_INGRESS; +} + +static inline void skb_bpf_set_redir(struct sk_buff *skb, struct sock *sk_redir, + bool ingress) +{ + skb->_sk_redir = (unsigned long)sk_redir; + if (ingress) + skb->_sk_redir |= BPF_F_INGRESS; +} + +static inline struct sock *skb_bpf_redirect_fetch(const struct sk_buff *skb) +{ + unsigned long sk_redir = skb->_sk_redir; + + return (struct sock *)(sk_redir & BPF_F_PTR_MASK); +} + +static inline void skb_bpf_redirect_clear(struct sk_buff *skb) +{ + skb->_sk_redir = 0; +} +#endif /* CONFIG_NET_SOCK_MSG */ #endif /* _LINUX_SKMSG_H */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 947ef5da6867..075de26f449d 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -883,30 +883,11 @@ struct tcp_skb_cb { struct inet6_skb_parm h6; #endif } header; /* For incoming skbs */ - struct { - __u32 flags; - struct sock *sk_redir; - } bpf; }; }; #define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0])) -static inline bool tcp_skb_bpf_ingress(const struct sk_buff *skb) -{ - return TCP_SKB_CB(skb)->bpf.flags & BPF_F_INGRESS; -} - -static inline struct sock *tcp_skb_bpf_redirect_fetch(struct sk_buff *skb) -{ - return TCP_SKB_CB(skb)->bpf.sk_redir; -} - -static inline void tcp_skb_bpf_redirect_clear(struct sk_buff *skb) -{ - TCP_SKB_CB(skb)->bpf.sk_redir = NULL; -} - extern const struct inet_connection_sock_af_ops ipv4_specific; #if IS_ENABLED(CONFIG_IPV6) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 8822001ab3dc..409258367bea 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -525,7 +525,8 @@ static void sk_psock_backlog(struct work_struct *work) len = skb->len; off = 0; start: - ingress = tcp_skb_bpf_ingress(skb); + ingress = skb_bpf_ingress(skb); + skb_bpf_redirect_clear(skb); do { ret = -EIO; if (likely(psock->sk->sk_socket)) @@ -631,7 +632,12 @@ void __sk_psock_purge_ingress_msg(struct sk_psock *psock) static void sk_psock_zap_ingress(struct sk_psock *psock) { - __skb_queue_purge(&psock->ingress_skb); + struct sk_buff *skb; + + while ((skb = __skb_dequeue(&psock->ingress_skb)) != NULL) { + skb_bpf_redirect_clear(skb); + kfree_skb(skb); + } __sk_psock_purge_ingress_msg(psock); } @@ -754,7 +760,7 @@ static void sk_psock_skb_redirect(struct sk_buff *skb) struct sk_psock *psock_other; struct sock *sk_other; - sk_other = tcp_skb_bpf_redirect_fetch(skb); + sk_other = skb_bpf_redirect_fetch(skb); /* This error is a buggy BPF program, it returned a redirect * return code, but then didn't set a redirect interface. */ @@ -804,9 +810,10 @@ int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb) * TLS context. 
*/ skb->sk = psock->sk; - tcp_skb_bpf_redirect_clear(skb); + skb_dst_drop(skb); + skb_bpf_redirect_clear(skb); ret = sk_psock_bpf_run(psock, prog, skb); - ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); + ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); skb->sk = NULL; } sk_psock_tls_verdict_apply(skb, psock->sk, ret); @@ -818,7 +825,6 @@ EXPORT_SYMBOL_GPL(sk_psock_tls_strp_read); static void sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb, int verdict) { - struct tcp_skb_cb *tcp; struct sock *sk_other; int err = -EIO; @@ -830,8 +836,7 @@ static void sk_psock_verdict_apply(struct sk_psock *psock, goto out_free; } - tcp = TCP_SKB_CB(skb); - tcp->bpf.flags |= BPF_F_INGRESS; + skb_bpf_set_ingress(skb); /* If the queue is empty then we can submit directly * into the msg queue. If its not empty we have to @@ -892,9 +897,10 @@ static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb) skb_set_owner_r(skb, sk); prog = READ_ONCE(psock->progs.skb_verdict); if (likely(prog)) { - tcp_skb_bpf_redirect_clear(skb); + skb_dst_drop(skb); + skb_bpf_redirect_clear(skb); ret = sk_psock_bpf_run(psock, prog, skb); - ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); + ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); } sk_psock_verdict_apply(psock, skb, ret); out: @@ -1011,9 +1017,10 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb, skb_set_owner_r(skb, sk); prog = READ_ONCE(psock->progs.skb_verdict); if (likely(prog)) { - tcp_skb_bpf_redirect_clear(skb); + skb_dst_drop(skb); + skb_bpf_redirect_clear(skb); ret = sk_psock_bpf_run(psock, prog, skb); - ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); + ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); } sk_psock_verdict_apply(psock, skb, ret); out: diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 1a28a5c2c61e..dbfcd7006338 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -657,7 +657,6 @@ const struct bpf_func_proto bpf_sock_map_update_proto = { BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, struct bpf_map *, map, u32, key, u64, flags) { - struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); struct sock *sk; if (unlikely(flags & ~(BPF_F_INGRESS))) @@ -667,8 +666,7 @@ BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return SK_DROP; - tcb->bpf.flags = flags; - tcb->bpf.sk_redir = sk; + skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS); return SK_PASS; } @@ -1250,7 +1248,6 @@ const struct bpf_func_proto bpf_sock_hash_update_proto = { BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb, struct bpf_map *, map, void *, key, u64, flags) { - struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); struct sock *sk; if (unlikely(flags & ~(BPF_F_INGRESS))) @@ -1260,8 +1257,7 @@ BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb, if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return SK_DROP; - tcb->bpf.flags = flags; - tcb->bpf.sk_redir = sk; + skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS); return SK_PASS; } From ae8b8332fbb512f53bf50ff6a7586dd0f90ed18a Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 23 Feb 2021 10:49:30 -0800 Subject: [PATCH 24/90] sock_map: Rename skb_parser and skb_verdict These two eBPF programs are tied to BPF_SK_SKB_STREAM_PARSER and BPF_SK_SKB_STREAM_VERDICT, rename them to reflect the fact they are only used for TCP. And save the name 'skb_verdict' for general use later. 
Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Reviewed-by: Lorenz Bauer Acked-by: John Fastabend Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210223184934.6054-6-xiyou.wangcong@gmail.com --- include/linux/skmsg.h | 8 +-- net/core/skmsg.c | 14 ++--- net/core/sock_map.c | 60 +++++++++---------- .../selftests/bpf/prog_tests/sockmap_listen.c | 8 +-- .../selftests/bpf/progs/test_sockmap_listen.c | 4 +- 5 files changed, 47 insertions(+), 47 deletions(-) diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index e0de45527bb6..d9f6ec4a9cf2 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -56,8 +56,8 @@ struct sk_msg { struct sk_psock_progs { struct bpf_prog *msg_parser; - struct bpf_prog *skb_parser; - struct bpf_prog *skb_verdict; + struct bpf_prog *stream_parser; + struct bpf_prog *stream_verdict; }; enum sk_psock_state_bits { @@ -443,8 +443,8 @@ static inline int psock_replace_prog(struct bpf_prog **pprog, static inline void psock_progs_drop(struct sk_psock_progs *progs) { psock_set_prog(&progs->msg_parser, NULL); - psock_set_prog(&progs->skb_parser, NULL); - psock_set_prog(&progs->skb_verdict, NULL); + psock_set_prog(&progs->stream_parser, NULL); + psock_set_prog(&progs->stream_verdict, NULL); } int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb); diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 409258367bea..35f9caa3b125 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -691,9 +691,9 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock) write_lock_bh(&sk->sk_callback_lock); sk_psock_restore_proto(sk, psock); rcu_assign_sk_user_data(sk, NULL); - if (psock->progs.skb_parser) + if (psock->progs.stream_parser) sk_psock_stop_strp(sk, psock); - else if (psock->progs.skb_verdict) + else if (psock->progs.stream_verdict) sk_psock_stop_verdict(sk, psock); write_unlock_bh(&sk->sk_callback_lock); sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); @@ -803,7 +803,7 @@ int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb) int ret = __SK_PASS; rcu_read_lock(); - prog = READ_ONCE(psock->progs.skb_verdict); + prog = READ_ONCE(psock->progs.stream_verdict); if (likely(prog)) { /* We skip full set_owner_r here because if we do a SK_PASS * or SK_DROP we can skip skb memory accounting and use the @@ -895,7 +895,7 @@ static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb) goto out; } skb_set_owner_r(skb, sk); - prog = READ_ONCE(psock->progs.skb_verdict); + prog = READ_ONCE(psock->progs.stream_verdict); if (likely(prog)) { skb_dst_drop(skb); skb_bpf_redirect_clear(skb); @@ -919,7 +919,7 @@ static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb) int ret = skb->len; rcu_read_lock(); - prog = READ_ONCE(psock->progs.skb_parser); + prog = READ_ONCE(psock->progs.stream_parser); if (likely(prog)) { skb->sk = psock->sk; ret = sk_psock_bpf_run(psock, prog, skb); @@ -982,7 +982,7 @@ void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock) static void sk_psock_done_strp(struct sk_psock *psock) { /* Parser has been stopped */ - if (psock->progs.skb_parser) + if (psock->progs.stream_parser) strp_done(&psock->strp); } #else @@ -1015,7 +1015,7 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb, goto out; } skb_set_owner_r(skb, sk); - prog = READ_ONCE(psock->progs.skb_verdict); + prog = READ_ONCE(psock->progs.stream_verdict); if (likely(prog)) { skb_dst_drop(skb); skb_bpf_redirect_clear(skb); diff --git a/net/core/sock_map.c b/net/core/sock_map.c 
index dbfcd7006338..69785070f02d 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -148,9 +148,9 @@ static void sock_map_del_link(struct sock *sk, struct bpf_map *map = link->map; struct bpf_stab *stab = container_of(map, struct bpf_stab, map); - if (psock->saved_data_ready && stab->progs.skb_parser) + if (psock->saved_data_ready && stab->progs.stream_parser) strp_stop = true; - if (psock->saved_data_ready && stab->progs.skb_verdict) + if (psock->saved_data_ready && stab->progs.stream_verdict) verdict_stop = true; list_del(&link->list); sk_psock_free_link(link); @@ -224,23 +224,23 @@ out: static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, struct sock *sk) { - struct bpf_prog *msg_parser, *skb_parser, *skb_verdict; + struct bpf_prog *msg_parser, *stream_parser, *stream_verdict; struct sk_psock *psock; int ret; - skb_verdict = READ_ONCE(progs->skb_verdict); - if (skb_verdict) { - skb_verdict = bpf_prog_inc_not_zero(skb_verdict); - if (IS_ERR(skb_verdict)) - return PTR_ERR(skb_verdict); + stream_verdict = READ_ONCE(progs->stream_verdict); + if (stream_verdict) { + stream_verdict = bpf_prog_inc_not_zero(stream_verdict); + if (IS_ERR(stream_verdict)) + return PTR_ERR(stream_verdict); } - skb_parser = READ_ONCE(progs->skb_parser); - if (skb_parser) { - skb_parser = bpf_prog_inc_not_zero(skb_parser); - if (IS_ERR(skb_parser)) { - ret = PTR_ERR(skb_parser); - goto out_put_skb_verdict; + stream_parser = READ_ONCE(progs->stream_parser); + if (stream_parser) { + stream_parser = bpf_prog_inc_not_zero(stream_parser); + if (IS_ERR(stream_parser)) { + ret = PTR_ERR(stream_parser); + goto out_put_stream_verdict; } } @@ -249,7 +249,7 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, msg_parser = bpf_prog_inc_not_zero(msg_parser); if (IS_ERR(msg_parser)) { ret = PTR_ERR(msg_parser); - goto out_put_skb_parser; + goto out_put_stream_parser; } } @@ -261,8 +261,8 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, if (psock) { if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) || - (skb_parser && READ_ONCE(psock->progs.skb_parser)) || - (skb_verdict && READ_ONCE(psock->progs.skb_verdict))) { + (stream_parser && READ_ONCE(psock->progs.stream_parser)) || + (stream_verdict && READ_ONCE(psock->progs.stream_verdict))) { sk_psock_put(sk, psock); ret = -EBUSY; goto out_progs; @@ -283,15 +283,15 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, goto out_drop; write_lock_bh(&sk->sk_callback_lock); - if (skb_parser && skb_verdict && !psock->saved_data_ready) { + if (stream_parser && stream_verdict && !psock->saved_data_ready) { ret = sk_psock_init_strp(sk, psock); if (ret) goto out_unlock_drop; - psock_set_prog(&psock->progs.skb_verdict, skb_verdict); - psock_set_prog(&psock->progs.skb_parser, skb_parser); + psock_set_prog(&psock->progs.stream_verdict, stream_verdict); + psock_set_prog(&psock->progs.stream_parser, stream_parser); sk_psock_start_strp(sk, psock); - } else if (!skb_parser && skb_verdict && !psock->saved_data_ready) { - psock_set_prog(&psock->progs.skb_verdict, skb_verdict); + } else if (!stream_parser && stream_verdict && !psock->saved_data_ready) { + psock_set_prog(&psock->progs.stream_verdict, stream_verdict); sk_psock_start_verdict(sk,psock); } write_unlock_bh(&sk->sk_callback_lock); @@ -303,12 +303,12 @@ out_drop: out_progs: if (msg_parser) bpf_prog_put(msg_parser); -out_put_skb_parser: - if (skb_parser) - bpf_prog_put(skb_parser); -out_put_skb_verdict: - if (skb_verdict) - 
bpf_prog_put(skb_verdict); +out_put_stream_parser: + if (stream_parser) + bpf_prog_put(stream_parser); +out_put_stream_verdict: + if (stream_verdict) + bpf_prog_put(stream_verdict); return ret; } @@ -1459,11 +1459,11 @@ int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, break; #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) case BPF_SK_SKB_STREAM_PARSER: - pprog = &progs->skb_parser; + pprog = &progs->stream_parser; break; #endif case BPF_SK_SKB_STREAM_VERDICT: - pprog = &progs->skb_verdict; + pprog = &progs->stream_verdict; break; default: return -EOPNOTSUPP; diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c index d7d65a700799..c26e6bf05e49 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c @@ -1014,8 +1014,8 @@ static void test_skb_redir_to_connected(struct test_sockmap_listen *skel, struct bpf_map *inner_map, int family, int sotype) { - int verdict = bpf_program__fd(skel->progs.prog_skb_verdict); - int parser = bpf_program__fd(skel->progs.prog_skb_parser); + int verdict = bpf_program__fd(skel->progs.prog_stream_verdict); + int parser = bpf_program__fd(skel->progs.prog_stream_parser); int verdict_map = bpf_map__fd(skel->maps.verdict_map); int sock_map = bpf_map__fd(inner_map); int err; @@ -1125,8 +1125,8 @@ static void test_skb_redir_to_listening(struct test_sockmap_listen *skel, struct bpf_map *inner_map, int family, int sotype) { - int verdict = bpf_program__fd(skel->progs.prog_skb_verdict); - int parser = bpf_program__fd(skel->progs.prog_skb_parser); + int verdict = bpf_program__fd(skel->progs.prog_stream_verdict); + int parser = bpf_program__fd(skel->progs.prog_stream_parser); int verdict_map = bpf_map__fd(skel->maps.verdict_map); int sock_map = bpf_map__fd(inner_map); int err; diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_listen.c b/tools/testing/selftests/bpf/progs/test_sockmap_listen.c index a3a366c57ce1..fa221141e9c1 100644 --- a/tools/testing/selftests/bpf/progs/test_sockmap_listen.c +++ b/tools/testing/selftests/bpf/progs/test_sockmap_listen.c @@ -31,13 +31,13 @@ struct { static volatile bool test_sockmap; /* toggled by user-space */ SEC("sk_skb/stream_parser") -int prog_skb_parser(struct __sk_buff *skb) +int prog_stream_parser(struct __sk_buff *skb) { return skb->len; } SEC("sk_skb/stream_verdict") -int prog_skb_verdict(struct __sk_buff *skb) +int prog_stream_verdict(struct __sk_buff *skb) { unsigned int *count; __u32 zero = 0; From 4675e234b9e15159894b90ead9340e1dc202b670 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 23 Feb 2021 10:49:31 -0800 Subject: [PATCH 25/90] sock_map: Make sock_map_prog_update() static It is only used within sock_map.c so can become static. 
Suggested-by: Jakub Sitnicki Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210223184934.6054-7-xiyou.wangcong@gmail.com --- include/linux/bpf.h | 9 --------- net/core/sock_map.c | 7 +++++-- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2be47ada5f2d..e1e4d2f60527 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1779,8 +1779,6 @@ static inline void bpf_map_offload_map_free(struct bpf_map *map) #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */ #if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) -int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, - struct bpf_prog *old, u32 which); int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog); int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype); int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags); @@ -1798,13 +1796,6 @@ static inline void bpf_sk_reuseport_detach(struct sock *sk) } #ifdef CONFIG_BPF_SYSCALL -static inline int sock_map_prog_update(struct bpf_map *map, - struct bpf_prog *prog, - struct bpf_prog *old, u32 which) -{ - return -EOPNOTSUPP; -} - static inline int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog) { diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 69785070f02d..dd53a7771d7e 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -24,6 +24,9 @@ struct bpf_stab { #define SOCK_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) +static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, + struct bpf_prog *old, u32 which); + static struct bpf_map *sock_map_alloc(union bpf_attr *attr) { struct bpf_stab *stab; @@ -1444,8 +1447,8 @@ static struct sk_psock_progs *sock_map_progs(struct bpf_map *map) return NULL; } -int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, - struct bpf_prog *old, u32 which) +static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, + struct bpf_prog *old, u32 which) { struct sk_psock_progs *progs = sock_map_progs(map); struct bpf_prog **pprog; From cd81cefb1abc52bd164f4d9760cd22eadc0e4468 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 23 Feb 2021 10:49:32 -0800 Subject: [PATCH 26/90] skmsg: Make __sk_psock_purge_ingress_msg() static It is only used within skmsg.c so can become static. 
Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210223184934.6054-8-xiyou.wangcong@gmail.com --- include/linux/skmsg.h | 2 -- net/core/skmsg.c | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index d9f6ec4a9cf2..676d48e08159 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -340,8 +340,6 @@ static inline void sk_psock_free_link(struct sk_psock_link *link) struct sk_psock_link *sk_psock_link_pop(struct sk_psock *psock); -void __sk_psock_purge_ingress_msg(struct sk_psock *psock); - static inline void sk_psock_cork_free(struct sk_psock *psock) { if (psock->cork) { diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 35f9caa3b125..46e29d2c0c48 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -619,7 +619,7 @@ struct sk_psock_link *sk_psock_link_pop(struct sk_psock *psock) return link; } -void __sk_psock_purge_ingress_msg(struct sk_psock *psock) +static void __sk_psock_purge_ingress_msg(struct sk_psock *psock) { struct sk_msg *msg, *tmp; From 533342322276b06b4db260c413ce907238851e9b Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 23 Feb 2021 10:49:33 -0800 Subject: [PATCH 27/90] skmsg: Get rid of sk_psock_bpf_run() It is now nearly identical to bpf_prog_run_pin_on_cpu() and it has an unused parameter 'psock', so we can just get rid of it and call bpf_prog_run_pin_on_cpu() directly. Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210223184934.6054-9-xiyou.wangcong@gmail.com --- net/core/skmsg.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 46e29d2c0c48..07f54015238a 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -749,12 +749,6 @@ out: } EXPORT_SYMBOL_GPL(sk_psock_msg_verdict); -static int sk_psock_bpf_run(struct sk_psock *psock, struct bpf_prog *prog, - struct sk_buff *skb) -{ - return bpf_prog_run_pin_on_cpu(prog, skb); -} - static void sk_psock_skb_redirect(struct sk_buff *skb) { struct sk_psock *psock_other; @@ -812,7 +806,7 @@ int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb) skb->sk = psock->sk; skb_dst_drop(skb); skb_bpf_redirect_clear(skb); - ret = sk_psock_bpf_run(psock, prog, skb); + ret = bpf_prog_run_pin_on_cpu(prog, skb); ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); skb->sk = NULL; } @@ -899,7 +893,7 @@ static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb) if (likely(prog)) { skb_dst_drop(skb); skb_bpf_redirect_clear(skb); - ret = sk_psock_bpf_run(psock, prog, skb); + ret = bpf_prog_run_pin_on_cpu(prog, skb); ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); } sk_psock_verdict_apply(psock, skb, ret); @@ -922,7 +916,7 @@ static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb) prog = READ_ONCE(psock->progs.stream_parser); if (likely(prog)) { skb->sk = psock->sk; - ret = sk_psock_bpf_run(psock, prog, skb); + ret = bpf_prog_run_pin_on_cpu(prog, skb); skb->sk = NULL; } rcu_read_unlock(); @@ -1019,7 +1013,7 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb, if (likely(prog)) { skb_dst_drop(skb); skb_bpf_redirect_clear(skb); - ret = sk_psock_bpf_run(psock, prog, skb); + ret = bpf_prog_run_pin_on_cpu(prog, skb); ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); } sk_psock_verdict_apply(psock, skb, ret); From ff9614b81be65d648ec4615b593c6e4b2dac6375 
Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 23 Feb 2021 10:49:34 -0800 Subject: [PATCH 28/90] skmsg: Remove unused sk_psock_stop() declaration It is not defined or used anywhere. Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210223184934.6054-10-xiyou.wangcong@gmail.com --- include/linux/skmsg.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 676d48e08159..6c09d94be2e9 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -400,7 +400,6 @@ static inline struct sk_psock *sk_psock_get(struct sock *sk) return psock; } -void sk_psock_stop(struct sock *sk, struct sk_psock *psock); void sk_psock_drop(struct sock *sk, struct sk_psock *psock); static inline void sk_psock_put(struct sock *sk, struct sk_psock *psock) From 2854436612c4d2606c9246ce2976ab6634276337 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Thu, 25 Feb 2021 16:19:47 +0000 Subject: [PATCH 29/90] selftests/bpf: Propagate error code of the command to vmtest.sh When vmtest.sh ran a command in a VM, it did not record or propagate the error code of the command. This made the script less "script-able". The script now saves the error code of the said command in a file in the VM, copies the file back to the host and (when available) uses this error code instead of its own. Signed-off-by: KP Singh Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210225161947.1778590-1-kpsingh@kernel.org --- tools/testing/selftests/bpf/vmtest.sh | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/bpf/vmtest.sh b/tools/testing/selftests/bpf/vmtest.sh index 26ae8d0b6ce3..22554894db99 100755 --- a/tools/testing/selftests/bpf/vmtest.sh +++ b/tools/testing/selftests/bpf/vmtest.sh @@ -17,6 +17,9 @@ KCONFIG_URL="https://raw.githubusercontent.com/libbpf/libbpf/master/travis-ci/vm KCONFIG_API_URL="https://api.github.com/repos/libbpf/libbpf/contents/travis-ci/vmtest/configs/latest.config" INDEX_URL="https://raw.githubusercontent.com/libbpf/libbpf/master/travis-ci/vmtest/configs/INDEX" NUM_COMPILE_JOBS="$(nproc)" +LOG_FILE_BASE="$(date +"bpf_selftests.%Y-%m-%d_%H-%M-%S")" +LOG_FILE="${LOG_FILE_BASE}.log" +EXIT_STATUS_FILE="${LOG_FILE_BASE}.exit_status" usage() { @@ -146,7 +149,6 @@ update_init_script() local init_script_dir="${OUTPUT_DIR}/${MOUNT_DIR}/etc/rcS.d" local init_script="${init_script_dir}/S50-startup" local command="$1" - local log_file="$2" mount_image @@ -163,11 +165,16 @@ EOF sudo bash -c "cat >${init_script}" < "/root/${EXIT_STATUS_FILE}" + { cd /root/bpf echo ${command} stdbuf -oL -eL ${command} -} 2>&1 | tee /root/${log_file} + echo "\$?" 
> "/root/${EXIT_STATUS_FILE}" +} 2>&1 | tee "/root/${LOG_FILE}" poweroff -f EOF @@ -221,10 +228,12 @@ EOF copy_logs() { local mount_dir="${OUTPUT_DIR}/${MOUNT_DIR}" - local log_file="${mount_dir}/root/$1" + local log_file="${mount_dir}/root/${LOG_FILE}" + local exit_status_file="${mount_dir}/root/${EXIT_STATUS_FILE}" mount_image sudo cp ${log_file} "${OUTPUT_DIR}" + sudo cp ${exit_status_file} "${OUTPUT_DIR}" sudo rm -f ${log_file} unmount_image } @@ -263,7 +272,6 @@ main() { local script_dir="$(cd -P -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P)" local kernel_checkout=$(realpath "${script_dir}"/../../../../) - local log_file="$(date +"bpf_selftests.%Y-%m-%d_%H-%M-%S.log")" # By default the script searches for the kernel in the checkout directory but # it also obeys environment variables O= and KBUILD_OUTPUT= local kernel_bzimage="${kernel_checkout}/${X86_BZIMAGE}" @@ -347,19 +355,23 @@ main() fi update_selftests "${kernel_checkout}" "${make_command}" - update_init_script "${command}" "${log_file}" + update_init_script "${command}" run_vm "${kernel_bzimage}" - copy_logs "${log_file}" - echo "Logs saved in ${OUTPUT_DIR}/${log_file}" + copy_logs + echo "Logs saved in ${OUTPUT_DIR}/${LOG_FILE}" } catch() { local exit_code=$1 + local exit_status_file="${OUTPUT_DIR}/${EXIT_STATUS_FILE}" # This is just a cleanup and the directory may # have already been unmounted. So, don't let this # clobber the error code we intend to return. unmount_image || true + if [[ -f "${exit_status_file}" ]]; then + exit_code="$(cat ${exit_status_file})" + fi exit ${exit_code} } From 86fd166575c38c17ecd3a6b8fb9c64fa19871486 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Wed, 24 Feb 2021 12:14:45 +0100 Subject: [PATCH 30/90] selftests/bpf: Copy extras in out-of-srctree builds Building selftests in a separate directory like this: make O="$BUILD" -C tools/testing/selftests/bpf and then running: cd "$BUILD" && ./test_progs -t btf causes all the non-flavored btf_dump_test_case_*.c tests to fail, because these files are not copied to where test_progs expects to find them. Fix by not skipping EXT-COPY when the original $(OUTPUT) is not empty (lib.mk sets it to $(shell pwd) in that case) and using rsync instead of cp: cp fails because e.g. urandom_read is being copied into itself, and rsync simply skips such cases. rsync is already used by kselftests and therefore is not a new dependency. Signed-off-by: Ilya Leoshkevich Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210224111445.102342-1-iii@linux.ibm.com --- tools/testing/selftests/bpf/Makefile | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 044bfdcf5b74..a81af15e4ded 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -382,11 +382,12 @@ $(TRUNNER_EXTRA_OBJS): $(TRUNNER_OUTPUT)/%.o: \ $$(call msg,EXT-OBJ,$(TRUNNER_BINARY),$$@) $(Q)$$(CC) $$(CFLAGS) -c $$< $$(LDLIBS) -o $$@ -# only copy extra resources if in flavored build +# non-flavored in-srctree builds receive special treatment, in particular, we +# do not need to copy extra resources (see e.g. 
test_btf_dump_case()) $(TRUNNER_BINARY)-extras: $(TRUNNER_EXTRA_FILES) | $(TRUNNER_OUTPUT) -ifneq ($2,) +ifneq ($2:$(OUTPUT),:$(shell pwd)) $$(call msg,EXT-COPY,$(TRUNNER_BINARY),$(TRUNNER_EXTRA_FILES)) - $(Q)cp -a $$^ $(TRUNNER_OUTPUT)/ + $(Q)rsync -aq $$^ $(TRUNNER_OUTPUT)/ endif $(OUTPUT)/$(TRUNNER_BINARY): $(TRUNNER_TEST_OBJS) \ From efdb22de7dcd3b24b8154b3c3ba496f62afea00c Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 26 Feb 2021 12:49:20 -0800 Subject: [PATCH 31/90] bpf: Factor out visit_func_call_insn() in check_cfg() During verifier check_cfg(), all instructions are visited to ensure verifier can handle program control flows. This patch factored out function visit_func_call_insn() so it can be reused in later patch to visit callback function calls. There is no functionality change for this patch. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210226204920.3884136-1-yhs@fb.com --- kernel/bpf/verifier.c | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9336bac39027..b14fab7a7a16 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8592,6 +8592,27 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env, return DONE_EXPLORING; } +static int visit_func_call_insn(int t, int insn_cnt, + struct bpf_insn *insns, + struct bpf_verifier_env *env, + bool visit_callee) +{ + int ret; + + ret = push_insn(t, t + 1, FALLTHROUGH, env, false); + if (ret) + return ret; + + if (t + 1 < insn_cnt) + init_explored_state(env, t + 1); + if (visit_callee) { + init_explored_state(env, t); + ret = push_insn(t, t + insns[t].imm + 1, BRANCH, + env, false); + } + return ret; +} + /* Visits the instruction at index t and returns one of the following: * < 0 - an error occurred * DONE_EXPLORING - the instruction was fully explored @@ -8612,18 +8633,8 @@ static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env) return DONE_EXPLORING; case BPF_CALL: - ret = push_insn(t, t + 1, FALLTHROUGH, env, false); - if (ret) - return ret; - - if (t + 1 < insn_cnt) - init_explored_state(env, t + 1); - if (insns[t].src_reg == BPF_PSEUDO_CALL) { - init_explored_state(env, t); - ret = push_insn(t, t + insns[t].imm + 1, BRANCH, - env, false); - } - return ret; + return visit_func_call_insn(t, insn_cnt, insns, env, + insns[t].src_reg == BPF_PSEUDO_CALL); case BPF_JA: if (BPF_SRC(insns[t].code) != BPF_K) From bc2591d63fc91bd5a9aaff145148a224d892bdb8 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 26 Feb 2021 12:49:22 -0800 Subject: [PATCH 32/90] bpf: Factor out verbose_invalid_scalar() Factor out the function verbose_invalid_scalar() to verbose print if a scalar is not in a tnum range. There is no functionality change and the function will be used by later patch which introduced bpf_for_each_map_elem(). 
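As a hypothetical example of the intended reuse, a later callback-return check could then simply do:

    if (!tnum_in(range, reg->var_off)) {
            verbose_invalid_scalar(env, reg, &range, "callback return", "R0");
            return -EINVAL;
    }

instead of duplicating the tnum printing logic.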
Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210226204922.3884375-1-yhs@fb.com --- kernel/bpf/verifier.c | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b14fab7a7a16..7194980c3ca4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -390,6 +390,24 @@ __printf(3, 4) static void verbose_linfo(struct bpf_verifier_env *env, env->prev_linfo = linfo; } +static void verbose_invalid_scalar(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, + struct tnum *range, const char *ctx, + const char *reg_name) +{ + char tn_buf[48]; + + verbose(env, "At %s the register %s ", ctx, reg_name); + if (!tnum_is_unknown(reg->var_off)) { + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, "has value %s", tn_buf); + } else { + verbose(env, "has unknown scalar value"); + } + tnum_strn(tn_buf, sizeof(tn_buf), *range); + verbose(env, " should have been in %s\n", tn_buf); +} + static bool type_is_pkt_pointer(enum bpf_reg_type type) { return type == PTR_TO_PACKET || @@ -8455,17 +8473,7 @@ static int check_return_code(struct bpf_verifier_env *env) } if (!tnum_in(range, reg->var_off)) { - char tn_buf[48]; - - verbose(env, "At program exit the register R0 "); - if (!tnum_is_unknown(reg->var_off)) { - tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "has value %s", tn_buf); - } else { - verbose(env, "has unknown scalar value"); - } - tnum_strn(tn_buf, sizeof(tn_buf), range); - verbose(env, " should have been in %s\n", tn_buf); + verbose_invalid_scalar(env, reg, &range, "program exit", "R0"); return -EINVAL; } From 1435137573f9c75455903e8cd01f84d6e092ea16 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 26 Feb 2021 12:49:23 -0800 Subject: [PATCH 33/90] bpf: Refactor check_func_call() to allow callback function Later proposed bpf_for_each_map_elem() helper has callback function as one of its arguments. This patch refactored check_func_call() to permit callback function which sets callee state. Different callback functions may have different callee states. There is no functionality change for this patch. 
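With the callee-state setup factored out, a plain BPF-to-BPF call keeps today's behaviour by passing set_callee_state() (which just copies r1-r5), while a helper that takes a callback can supply its own set_callee_state_fn. Roughly (sketch only; the map-iteration variant lands in a later patch of this series):

    /* ordinary pseudo call: callee sees the caller's r1-r5 */
    err = __check_func_call(env, insn, insn_idx, subprog,
                            set_callee_state);

    /* a map-iteration helper would instead pass a callback that marks
     * r1 as the map pointer, r2/r3 as key/value and r4 as callback_ctx
     */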
Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210226204923.3884627-1-yhs@fb.com --- kernel/bpf/verifier.c | 60 +++++++++++++++++++++++++++++++------------ 1 file changed, 43 insertions(+), 17 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7194980c3ca4..97e772f44cd7 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5249,13 +5249,19 @@ static void clear_caller_saved_regs(struct bpf_verifier_env *env, } } -static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, - int *insn_idx) +typedef int (*set_callee_state_fn)(struct bpf_verifier_env *env, + struct bpf_func_state *caller, + struct bpf_func_state *callee, + int insn_idx); + +static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, + int *insn_idx, int subprog, + set_callee_state_fn set_callee_state_cb) { struct bpf_verifier_state *state = env->cur_state; struct bpf_func_info_aux *func_info_aux; struct bpf_func_state *caller, *callee; - int i, err, subprog, target_insn; + int err; bool is_global = false; if (state->curframe + 1 >= MAX_CALL_FRAMES) { @@ -5264,14 +5270,6 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return -E2BIG; } - target_insn = *insn_idx + insn->imm; - subprog = find_subprog(env, target_insn + 1); - if (subprog < 0) { - verbose(env, "verifier bug. No program starts at insn %d\n", - target_insn + 1); - return -EFAULT; - } - caller = state->frame[state->curframe]; if (state->frame[state->curframe + 1]) { verbose(env, "verifier bug. Frame %d already allocated\n", @@ -5326,11 +5324,9 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (err) return err; - /* copy r1 - r5 args that callee can access. The copy includes parent - * pointers, which connects us up to the liveness chain - */ - for (i = BPF_REG_1; i <= BPF_REG_5; i++) - callee->regs[i] = caller->regs[i]; + err = set_callee_state_cb(env, caller, callee, *insn_idx); + if (err) + return err; clear_caller_saved_regs(env, caller->regs); @@ -5338,7 +5334,7 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, state->curframe++; /* and go analyze first insn of the callee */ - *insn_idx = target_insn; + *insn_idx = env->subprog_info[subprog].start - 1; if (env->log.level & BPF_LOG_LEVEL) { verbose(env, "caller:\n"); @@ -5349,6 +5345,36 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return 0; } +static int set_callee_state(struct bpf_verifier_env *env, + struct bpf_func_state *caller, + struct bpf_func_state *callee, int insn_idx) +{ + int i; + + /* copy r1 - r5 args that callee can access. The copy includes parent + * pointers, which connects us up to the liveness chain + */ + for (i = BPF_REG_1; i <= BPF_REG_5; i++) + callee->regs[i] = caller->regs[i]; + return 0; +} + +static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, + int *insn_idx) +{ + int subprog, target_insn; + + target_insn = *insn_idx + insn->imm + 1; + subprog = find_subprog(env, target_insn); + if (subprog < 0) { + verbose(env, "verifier bug. 
No program starts at insn %d\n", + target_insn); + return -EFAULT; + } + + return __check_func_call(env, insn, insn_idx, subprog, set_callee_state); +} + static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) { struct bpf_verifier_state *state = env->cur_state; From 282a0f46d6cda7cf843cd77c9b53b4d1d9e31302 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 26 Feb 2021 12:49:24 -0800 Subject: [PATCH 34/90] bpf: Change return value of verifier function add_subprog() Currently, verifier function add_subprog() returns 0 for success and negative value for failure. Change the return value to be the subprog number for success. This functionality will be used in the next patch to save a call to find_subprog(). Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210226204924.3884848-1-yhs@fb.com --- kernel/bpf/verifier.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 97e772f44cd7..dbdca49ac6cc 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1530,7 +1530,7 @@ static int add_subprog(struct bpf_verifier_env *env, int off) } ret = find_subprog(env, off); if (ret >= 0) - return 0; + return ret; if (env->subprog_cnt >= BPF_MAX_SUBPROGS) { verbose(env, "too many subprograms\n"); return -E2BIG; @@ -1538,7 +1538,7 @@ static int add_subprog(struct bpf_verifier_env *env, int off) env->subprog_info[env->subprog_cnt++].start = off; sort(env->subprog_info, env->subprog_cnt, sizeof(env->subprog_info[0]), cmp_subprogs, NULL); - return 0; + return env->subprog_cnt - 1; } static int check_subprogs(struct bpf_verifier_env *env) From 69c087ba6225b574afb6e505b72cb75242a3d844 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 26 Feb 2021 12:49:25 -0800 Subject: [PATCH 35/90] bpf: Add bpf_for_each_map_elem() helper The bpf_for_each_map_elem() helper is introduced which iterates all map elements with a callback function. The helper signature looks like long bpf_for_each_map_elem(map, callback_fn, callback_ctx, flags) and for each map element, the callback_fn will be called. For example, like hashmap, the callback signature may look like long callback_fn(map, key, val, callback_ctx) There are two known use cases for this. One is from upstream ([1]) where a for_each_map_elem helper may help implement a timeout mechanism in a more generic way. Another is from our internal discussion for a firewall use case where a map contains all the rules. The packet data can be compared to all these rules to decide allow or deny the packet. For array maps, users can already use a bounded loop to traverse elements. Using this helper can avoid using bounded loop. For other type of maps (e.g., hash maps) where bounded loop is hard or impossible to use, this helper provides a convenient way to operate on all elements. For callback_fn, besides map and map element, a callback_ctx, allocated on caller stack, is also passed to the callback function. This callback_ctx argument can provide additional input and allow to write to caller stack for output. If the callback_fn returns 0, the helper will iterate through next element if available. If the callback_fn returns 1, the helper will stop iterating and returns to the bpf program. Other return values are not used for now. Currently, this helper is only available with jit. It is possible to make it work with interpreter with so effort but I leave it as the future work. 
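For illustration, a hypothetical BPF-side user of the new helper could look like the following (section name, map and identifiers are made up for this sketch and are not part of the series):

    #include <linux/bpf.h>
    #include <bpf/bpf_helpers.h>

    struct {
            __uint(type, BPF_MAP_TYPE_HASH);
            __uint(max_entries, 64);
            __type(key, __u32);
            __type(value, __u64);
    } counters SEC(".maps");

    struct cb_ctx {
            __u64 total;    /* lives on the calling program's stack */
    };

    /* called once per element; return 0 to continue, 1 to stop early */
    static long sum_elem(struct bpf_map *map, const void *key,
                         void *value, void *ctx)
    {
            struct cb_ctx *c = ctx;

            c->total += *(__u64 *)value;
            return 0;
    }

    SEC("tracepoint/syscalls/sys_enter_getpid")
    int sum_counters(void *tp_ctx)
    {
            struct cb_ctx c = { .total = 0 };
            long n;

            /* flags must be 0 for now */
            n = bpf_for_each_map_elem(&counters, sum_elem, &c, 0);
            bpf_printk("visited %ld elems, total %llu", n, c.total);
            return 0;
    }

    char _license[] SEC("license") = "GPL";
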
[1]: https://lore.kernel.org/bpf/20210122205415.113822-1-xiyou.wangcong@gmail.com/ Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210226204925.3884923-1-yhs@fb.com --- include/linux/bpf.h | 13 +++ include/linux/bpf_verifier.h | 3 + include/uapi/linux/bpf.h | 38 ++++++ kernel/bpf/bpf_iter.c | 16 +++ kernel/bpf/helpers.c | 2 + kernel/bpf/verifier.c | 208 ++++++++++++++++++++++++++++++--- kernel/trace/bpf_trace.c | 2 + tools/include/uapi/linux/bpf.h | 38 ++++++ 8 files changed, 307 insertions(+), 13 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e1e4d2f60527..aeb1b93a4d75 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -39,6 +39,7 @@ struct bpf_local_storage; struct bpf_local_storage_map; struct kobject; struct mem_cgroup; +struct bpf_func_state; extern struct idr btf_idr; extern spinlock_t btf_idr_lock; @@ -129,6 +130,13 @@ struct bpf_map_ops { bool (*map_meta_equal)(const struct bpf_map *meta0, const struct bpf_map *meta1); + + int (*map_set_for_each_callback_args)(struct bpf_verifier_env *env, + struct bpf_func_state *caller, + struct bpf_func_state *callee); + int (*map_for_each_callback)(struct bpf_map *map, void *callback_fn, + void *callback_ctx, u64 flags); + /* BTF name and id of struct allocated by map_alloc */ const char * const map_btf_name; int *map_btf_id; @@ -295,6 +303,8 @@ enum bpf_arg_type { ARG_CONST_ALLOC_SIZE_OR_ZERO, /* number of allocated bytes requested */ ARG_PTR_TO_BTF_ID_SOCK_COMMON, /* pointer to in-kernel sock_common or bpf-mirrored bpf_sock */ ARG_PTR_TO_PERCPU_BTF_ID, /* pointer to in-kernel percpu type */ + ARG_PTR_TO_FUNC, /* pointer to a bpf program function */ + ARG_PTR_TO_STACK_OR_NULL, /* pointer to stack or NULL */ __BPF_ARG_TYPE_MAX, }; @@ -411,6 +421,8 @@ enum bpf_reg_type { PTR_TO_RDWR_BUF, /* reg points to a read/write buffer */ PTR_TO_RDWR_BUF_OR_NULL, /* reg points to a read/write buffer or NULL */ PTR_TO_PERCPU_BTF_ID, /* reg points to a percpu kernel variable */ + PTR_TO_FUNC, /* reg points to a bpf program function */ + PTR_TO_MAP_KEY, /* reg points to a map element key */ }; /* The information passed from prog-specific *_is_valid_access @@ -1887,6 +1899,7 @@ extern const struct bpf_func_proto bpf_sock_from_file_proto; extern const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto; extern const struct bpf_func_proto bpf_task_storage_get_proto; extern const struct bpf_func_proto bpf_task_storage_delete_proto; +extern const struct bpf_func_proto bpf_for_each_map_elem_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 971b33aca13d..51c2ffa3d901 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -68,6 +68,8 @@ struct bpf_reg_state { unsigned long raw1; unsigned long raw2; } raw; + + u32 subprogno; /* for PTR_TO_FUNC */ }; /* For PTR_TO_PACKET, used to find other pointers with the same variable * offset, so they can share range knowledge. @@ -204,6 +206,7 @@ struct bpf_func_state { int acquired_refs; struct bpf_reference_state *refs; int allocated_stack; + bool in_callback_fn; struct bpf_stack_state *stack; }; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 79c893310492..b89af20cfa19 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -393,6 +393,15 @@ enum bpf_link_type { * is struct/union. 
*/ #define BPF_PSEUDO_BTF_ID 3 +/* insn[0].src_reg: BPF_PSEUDO_FUNC + * insn[0].imm: insn offset to the func + * insn[1].imm: 0 + * insn[0].off: 0 + * insn[1].off: 0 + * ldimm64 rewrite: address of the function + * verifier type: PTR_TO_FUNC. + */ +#define BPF_PSEUDO_FUNC 4 /* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative * offset to another bpf function @@ -3909,6 +3918,34 @@ union bpf_attr { * * **BPF_MTU_CHK_RET_FRAG_NEEDED** * * **BPF_MTU_CHK_RET_SEGS_TOOBIG** * + * long bpf_for_each_map_elem(struct bpf_map *map, void *callback_fn, void *callback_ctx, u64 flags) + * Description + * For each element in **map**, call **callback_fn** function with + * **map**, **callback_ctx** and other map-specific parameters. + * The **callback_fn** should be a static function and + * the **callback_ctx** should be a pointer to the stack. + * The **flags** is used to control certain aspects of the helper. + * Currently, the **flags** must be 0. + * + * The following are a list of supported map types and their + * respective expected callback signatures: + * + * BPF_MAP_TYPE_HASH, BPF_MAP_TYPE_PERCPU_HASH, + * BPF_MAP_TYPE_LRU_HASH, BPF_MAP_TYPE_LRU_PERCPU_HASH, + * BPF_MAP_TYPE_ARRAY, BPF_MAP_TYPE_PERCPU_ARRAY + * + * long (\*callback_fn)(struct bpf_map \*map, const void \*key, void \*value, void \*ctx); + * + * For per_cpu maps, the map_value is the value on the cpu where the + * bpf_prog is running. + * + * If **callback_fn** return 0, the helper will continue to the next + * element. If return value is 1, the helper will skip the rest of + * elements and return. Other return values are not used now. + * + * Return + * The number of traversed map elements for success, **-EINVAL** for + * invalid **flags**. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4075,6 +4112,7 @@ union bpf_attr { FN(ima_inode_hash), \ FN(sock_from_file), \ FN(check_mtu), \ + FN(for_each_map_elem), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index a0d9eade9c80..931870f9cf56 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -675,3 +675,19 @@ int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx) */ return ret == 0 ? 
0 : -EAGAIN; } + +BPF_CALL_4(bpf_for_each_map_elem, struct bpf_map *, map, void *, callback_fn, + void *, callback_ctx, u64, flags) +{ + return map->ops->map_for_each_callback(map, callback_fn, callback_ctx, flags); +} + +const struct bpf_func_proto bpf_for_each_map_elem_proto = { + .func = bpf_for_each_map_elem, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_FUNC, + .arg3_type = ARG_PTR_TO_STACK_OR_NULL, + .arg4_type = ARG_ANYTHING, +}; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 308427fe03a3..074800226327 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -708,6 +708,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_ringbuf_discard_proto; case BPF_FUNC_ringbuf_query: return &bpf_ringbuf_query_proto; + case BPF_FUNC_for_each_map_elem: + return &bpf_for_each_map_elem_proto; default: break; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index dbdca49ac6cc..53afe9461b03 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -234,6 +234,12 @@ static bool bpf_pseudo_call(const struct bpf_insn *insn) insn->src_reg == BPF_PSEUDO_CALL; } +static bool bpf_pseudo_func(const struct bpf_insn *insn) +{ + return insn->code == (BPF_LD | BPF_IMM | BPF_DW) && + insn->src_reg == BPF_PSEUDO_FUNC; +} + struct bpf_call_arg_meta { struct bpf_map *map_ptr; bool raw_mode; @@ -248,6 +254,7 @@ struct bpf_call_arg_meta { u32 btf_id; struct btf *ret_btf; u32 ret_btf_id; + u32 subprogno; }; struct btf *btf_vmlinux; @@ -427,6 +434,7 @@ static bool reg_type_not_null(enum bpf_reg_type type) return type == PTR_TO_SOCKET || type == PTR_TO_TCP_SOCK || type == PTR_TO_MAP_VALUE || + type == PTR_TO_MAP_KEY || type == PTR_TO_SOCK_COMMON; } @@ -469,7 +477,8 @@ static bool arg_type_may_be_null(enum bpf_arg_type type) type == ARG_PTR_TO_MEM_OR_NULL || type == ARG_PTR_TO_CTX_OR_NULL || type == ARG_PTR_TO_SOCKET_OR_NULL || - type == ARG_PTR_TO_ALLOC_MEM_OR_NULL; + type == ARG_PTR_TO_ALLOC_MEM_OR_NULL || + type == ARG_PTR_TO_STACK_OR_NULL; } /* Determine whether the function releases some resources allocated by another @@ -552,6 +561,8 @@ static const char * const reg_type_str[] = { [PTR_TO_RDONLY_BUF_OR_NULL] = "rdonly_buf_or_null", [PTR_TO_RDWR_BUF] = "rdwr_buf", [PTR_TO_RDWR_BUF_OR_NULL] = "rdwr_buf_or_null", + [PTR_TO_FUNC] = "func", + [PTR_TO_MAP_KEY] = "map_key", }; static char slot_type_char[] = { @@ -623,6 +634,7 @@ static void print_verifier_state(struct bpf_verifier_env *env, if (type_is_pkt_pointer(t)) verbose(env, ",r=%d", reg->range); else if (t == CONST_PTR_TO_MAP || + t == PTR_TO_MAP_KEY || t == PTR_TO_MAP_VALUE || t == PTR_TO_MAP_VALUE_OR_NULL) verbose(env, ",ks=%d,vs=%d", @@ -1555,6 +1567,19 @@ static int check_subprogs(struct bpf_verifier_env *env) /* determine subprog starts. 
The end is one before the next starts */ for (i = 0; i < insn_cnt; i++) { + if (bpf_pseudo_func(insn + i)) { + if (!env->bpf_capable) { + verbose(env, + "function pointers are allowed for CAP_BPF and CAP_SYS_ADMIN\n"); + return -EPERM; + } + ret = add_subprog(env, i + insn[i].imm + 1); + if (ret < 0) + return ret; + /* remember subprog */ + insn[i + 1].imm = ret; + continue; + } if (!bpf_pseudo_call(insn + i)) continue; if (!env->bpf_capable) { @@ -2286,6 +2311,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_PERCPU_BTF_ID: case PTR_TO_MEM: case PTR_TO_MEM_OR_NULL: + case PTR_TO_FUNC: + case PTR_TO_MAP_KEY: return true; default: return false; @@ -2890,6 +2917,10 @@ static int __check_mem_access(struct bpf_verifier_env *env, int regno, reg = &cur_regs(env)[regno]; switch (reg->type) { + case PTR_TO_MAP_KEY: + verbose(env, "invalid access to map key, key_size=%d off=%d size=%d\n", + mem_size, off, size); + break; case PTR_TO_MAP_VALUE: verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n", mem_size, off, size); @@ -3295,6 +3326,9 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, case PTR_TO_FLOW_KEYS: pointer_desc = "flow keys "; break; + case PTR_TO_MAP_KEY: + pointer_desc = "key "; + break; case PTR_TO_MAP_VALUE: pointer_desc = "value "; break; @@ -3396,7 +3430,7 @@ process_func: continue_func: subprog_end = subprog[idx + 1].start; for (; i < subprog_end; i++) { - if (!bpf_pseudo_call(insn + i)) + if (!bpf_pseudo_call(insn + i) && !bpf_pseudo_func(insn + i)) continue; /* remember insn and function to return to */ ret_insn[frame] = i + 1; @@ -3833,7 +3867,19 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn /* for access checks, reg->off is just part of off */ off += reg->off; - if (reg->type == PTR_TO_MAP_VALUE) { + if (reg->type == PTR_TO_MAP_KEY) { + if (t == BPF_WRITE) { + verbose(env, "write to change key R%d not allowed\n", regno); + return -EACCES; + } + + err = check_mem_region_access(env, regno, off, size, + reg->map_ptr->key_size, false); + if (err) + return err; + if (value_regno >= 0) + mark_reg_unknown(env, regs, value_regno); + } else if (reg->type == PTR_TO_MAP_VALUE) { if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { verbose(env, "R%d leaks addr into map\n", value_regno); @@ -4249,6 +4295,9 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, case PTR_TO_PACKET_META: return check_packet_access(env, regno, reg->off, access_size, zero_size_allowed); + case PTR_TO_MAP_KEY: + return check_mem_region_access(env, regno, reg->off, access_size, + reg->map_ptr->key_size, false); case PTR_TO_MAP_VALUE: if (check_map_access_type(env, regno, reg->off, access_size, meta && meta->raw_mode ? 
BPF_WRITE : @@ -4465,6 +4514,7 @@ static const struct bpf_reg_types map_key_value_types = { PTR_TO_STACK, PTR_TO_PACKET, PTR_TO_PACKET_META, + PTR_TO_MAP_KEY, PTR_TO_MAP_VALUE, }, }; @@ -4496,6 +4546,7 @@ static const struct bpf_reg_types mem_types = { PTR_TO_STACK, PTR_TO_PACKET, PTR_TO_PACKET_META, + PTR_TO_MAP_KEY, PTR_TO_MAP_VALUE, PTR_TO_MEM, PTR_TO_RDONLY_BUF, @@ -4508,6 +4559,7 @@ static const struct bpf_reg_types int_ptr_types = { PTR_TO_STACK, PTR_TO_PACKET, PTR_TO_PACKET_META, + PTR_TO_MAP_KEY, PTR_TO_MAP_VALUE, }, }; @@ -4520,6 +4572,8 @@ static const struct bpf_reg_types const_map_ptr_types = { .types = { CONST_PTR_T static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID } }; static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALUE } }; static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_PERCPU_BTF_ID } }; +static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } }; +static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } }; static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_MAP_KEY] = &map_key_value_types, @@ -4548,6 +4602,8 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_INT] = &int_ptr_types, [ARG_PTR_TO_LONG] = &int_ptr_types, [ARG_PTR_TO_PERCPU_BTF_ID] = &percpu_btf_ptr_types, + [ARG_PTR_TO_FUNC] = &func_ptr_types, + [ARG_PTR_TO_STACK_OR_NULL] = &stack_ptr_types, }; static int check_reg_type(struct bpf_verifier_env *env, u32 regno, @@ -4729,6 +4785,8 @@ skip_type_check: verbose(env, "verifier internal error\n"); return -EFAULT; } + } else if (arg_type == ARG_PTR_TO_FUNC) { + meta->subprogno = reg->subprogno; } else if (arg_type_is_mem_ptr(arg_type)) { /* The access to this pointer is only checked when we hit the * next is_mem_size argument below. @@ -5375,6 +5433,35 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return __check_func_call(env, insn, insn_idx, subprog, set_callee_state); } +static int set_map_elem_callback_state(struct bpf_verifier_env *env, + struct bpf_func_state *caller, + struct bpf_func_state *callee, + int insn_idx) +{ + struct bpf_insn_aux_data *insn_aux = &env->insn_aux_data[insn_idx]; + struct bpf_map *map; + int err; + + if (bpf_map_ptr_poisoned(insn_aux)) { + verbose(env, "tail_call abusing map_ptr\n"); + return -EINVAL; + } + + map = BPF_MAP_PTR(insn_aux->map_ptr_state); + if (!map->ops->map_set_for_each_callback_args || + !map->ops->map_for_each_callback) { + verbose(env, "callback function not allowed for map\n"); + return -ENOTSUPP; + } + + err = map->ops->map_set_for_each_callback_args(env, caller, callee); + if (err) + return err; + + callee->in_callback_fn = true; + return 0; +} + static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) { struct bpf_verifier_state *state = env->cur_state; @@ -5397,8 +5484,22 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) state->curframe--; caller = state->frame[state->curframe]; - /* return to the caller whatever r0 had in the callee */ - caller->regs[BPF_REG_0] = *r0; + if (callee->in_callback_fn) { + /* enforce R0 return value range [0, 1]. 
*/ + struct tnum range = tnum_range(0, 1); + + if (r0->type != SCALAR_VALUE) { + verbose(env, "R0 not a scalar value\n"); + return -EACCES; + } + if (!tnum_in(range, r0->var_off)) { + verbose_invalid_scalar(env, r0, &range, "callback return", "R0"); + return -EINVAL; + } + } else { + /* return to the caller whatever r0 had in the callee */ + caller->regs[BPF_REG_0] = *r0; + } /* Transfer references to the caller */ err = transfer_reference_state(caller, callee); @@ -5453,7 +5554,8 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, func_id != BPF_FUNC_map_delete_elem && func_id != BPF_FUNC_map_push_elem && func_id != BPF_FUNC_map_pop_elem && - func_id != BPF_FUNC_map_peek_elem) + func_id != BPF_FUNC_map_peek_elem && + func_id != BPF_FUNC_for_each_map_elem) return 0; if (map == NULL) { @@ -5534,15 +5636,18 @@ static int check_reference_leak(struct bpf_verifier_env *env) return state->acquired_refs ? -EINVAL : 0; } -static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx) +static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn, + int *insn_idx_p) { const struct bpf_func_proto *fn = NULL; struct bpf_reg_state *regs; struct bpf_call_arg_meta meta; + int insn_idx = *insn_idx_p; bool changes_data; - int i, err; + int i, err, func_id; /* find function prototype */ + func_id = insn->imm; if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) { verbose(env, "invalid func %s#%d\n", func_id_name(func_id), func_id); @@ -5638,6 +5743,13 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn return -EINVAL; } + if (func_id == BPF_FUNC_for_each_map_elem) { + err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, + set_map_elem_callback_state); + if (err < 0) + return -EINVAL; + } + /* reset caller saved regs */ for (i = 0; i < CALLER_SAVED_REGS; i++) { mark_reg_not_init(env, regs, caller_saved[i]); @@ -5891,6 +6003,19 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, else *ptr_limit = -off; return 0; + case PTR_TO_MAP_KEY: + /* Currently, this code is not exercised as the only use + * is bpf_for_each_map_elem() helper which requires + * bpf_capble. The code has been tested manually for + * future use. 
+ */ + if (mask_to_left) { + *ptr_limit = ptr_reg->umax_value + ptr_reg->off; + } else { + off = ptr_reg->smin_value + ptr_reg->off; + *ptr_limit = ptr_reg->map_ptr->key_size - off; + } + return 0; case PTR_TO_MAP_VALUE: if (mask_to_left) { *ptr_limit = ptr_reg->umax_value + ptr_reg->off; @@ -6092,6 +6217,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, verbose(env, "R%d pointer arithmetic on %s prohibited\n", dst, reg_type_str[ptr_reg->type]); return -EACCES; + case PTR_TO_MAP_KEY: case PTR_TO_MAP_VALUE: if (!env->allow_ptr_leaks && !known && (smin_val < 0) != (smax_val < 0)) { verbose(env, "R%d has unknown scalar with mixed signed bounds, pointer arithmetic with it prohibited for !root\n", @@ -8271,6 +8397,24 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) return 0; } + if (insn->src_reg == BPF_PSEUDO_FUNC) { + struct bpf_prog_aux *aux = env->prog->aux; + u32 subprogno = insn[1].imm; + + if (!aux->func_info) { + verbose(env, "missing btf func_info\n"); + return -EINVAL; + } + if (aux->func_info_aux[subprogno].linkage != BTF_FUNC_STATIC) { + verbose(env, "callback function not static\n"); + return -EINVAL; + } + + dst_reg->type = PTR_TO_FUNC; + dst_reg->subprogno = subprogno; + return 0; + } + map = env->used_maps[aux->map_index]; mark_reg_known_zero(env, regs, insn->dst_reg); dst_reg->map_ptr = map; @@ -8657,6 +8801,9 @@ static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env) struct bpf_insn *insns = env->prog->insnsi; int ret; + if (bpf_pseudo_func(insns + t)) + return visit_func_call_insn(t, insn_cnt, insns, env, true); + /* All non-branch instructions have a single fall-through edge. */ if (BPF_CLASS(insns[t].code) != BPF_JMP && BPF_CLASS(insns[t].code) != BPF_JMP32) @@ -9277,6 +9424,7 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, */ return false; } + case PTR_TO_MAP_KEY: case PTR_TO_MAP_VALUE: /* If the new min/max/var_off satisfy the old ones and * everything else matches, we are OK. @@ -10123,10 +10271,9 @@ static int do_check(struct bpf_verifier_env *env) if (insn->src_reg == BPF_PSEUDO_CALL) err = check_func_call(env, insn, &env->insn_idx); else - err = check_helper_call(env, insn->imm, env->insn_idx); + err = check_helper_call(env, insn, &env->insn_idx); if (err) return err; - } else if (opcode == BPF_JA) { if (BPF_SRC(insn->code) != BPF_K || insn->imm != 0 || @@ -10555,6 +10702,12 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) goto next_insn; } + if (insn[0].src_reg == BPF_PSEUDO_FUNC) { + aux = &env->insn_aux_data[i]; + aux->ptr_type = PTR_TO_FUNC; + goto next_insn; + } + /* In final convert_pseudo_ld_imm64() step, this is * converted into regular 64-bit imm load insn. 
*/ @@ -10687,9 +10840,13 @@ static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env) int insn_cnt = env->prog->len; int i; - for (i = 0; i < insn_cnt; i++, insn++) - if (insn->code == (BPF_LD | BPF_IMM | BPF_DW)) - insn->src_reg = 0; + for (i = 0; i < insn_cnt; i++, insn++) { + if (insn->code != (BPF_LD | BPF_IMM | BPF_DW)) + continue; + if (insn->src_reg == BPF_PSEUDO_FUNC) + continue; + insn->src_reg = 0; + } } /* single env->prog->insni[off] instruction was replaced with the range @@ -11330,6 +11487,12 @@ static int jit_subprogs(struct bpf_verifier_env *env) return 0; for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { + if (bpf_pseudo_func(insn)) { + env->insn_aux_data[i].call_imm = insn->imm; + /* subprog is encoded in insn[1].imm */ + continue; + } + if (!bpf_pseudo_call(insn)) continue; /* Upon error here we cannot fall back to interpreter but @@ -11459,6 +11622,12 @@ static int jit_subprogs(struct bpf_verifier_env *env) for (i = 0; i < env->subprog_cnt; i++) { insn = func[i]->insnsi; for (j = 0; j < func[i]->len; j++, insn++) { + if (bpf_pseudo_func(insn)) { + subprog = insn[1].imm; + insn[0].imm = (u32)(long)func[subprog]->bpf_func; + insn[1].imm = ((u64)(long)func[subprog]->bpf_func) >> 32; + continue; + } if (!bpf_pseudo_call(insn)) continue; subprog = insn->off; @@ -11504,6 +11673,11 @@ static int jit_subprogs(struct bpf_verifier_env *env) * later look the same as if they were interpreted only. */ for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { + if (bpf_pseudo_func(insn)) { + insn[0].imm = env->insn_aux_data[i].call_imm; + insn[1].imm = find_subprog(env, i + insn[0].imm + 1); + continue; + } if (!bpf_pseudo_call(insn)) continue; insn->off = env->insn_aux_data[i].call_imm; @@ -11568,6 +11742,14 @@ static int fixup_call_args(struct bpf_verifier_env *env) return -EINVAL; } for (i = 0; i < prog->len; i++, insn++) { + if (bpf_pseudo_func(insn)) { + /* When JIT fails the progs with callback calls + * have to be rejected, since interpreter doesn't support them yet. + */ + verbose(env, "callbacks are not allowed in non-JITed programs\n"); + return -EINVAL; + } + if (!bpf_pseudo_call(insn)) continue; depth = get_callee_stack_depth(env, insn, i); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index e9701744d8e4..0d23755c2747 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1371,6 +1371,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_task_storage_get_proto; case BPF_FUNC_task_storage_delete: return &bpf_task_storage_delete_proto; + case BPF_FUNC_for_each_map_elem: + return &bpf_for_each_map_elem_proto; default: return NULL; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 79c893310492..b89af20cfa19 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -393,6 +393,15 @@ enum bpf_link_type { * is struct/union. */ #define BPF_PSEUDO_BTF_ID 3 +/* insn[0].src_reg: BPF_PSEUDO_FUNC + * insn[0].imm: insn offset to the func + * insn[1].imm: 0 + * insn[0].off: 0 + * insn[1].off: 0 + * ldimm64 rewrite: address of the function + * verifier type: PTR_TO_FUNC. 
+ */ +#define BPF_PSEUDO_FUNC 4 /* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative * offset to another bpf function @@ -3909,6 +3918,34 @@ union bpf_attr { * * **BPF_MTU_CHK_RET_FRAG_NEEDED** * * **BPF_MTU_CHK_RET_SEGS_TOOBIG** * + * long bpf_for_each_map_elem(struct bpf_map *map, void *callback_fn, void *callback_ctx, u64 flags) + * Description + * For each element in **map**, call **callback_fn** function with + * **map**, **callback_ctx** and other map-specific parameters. + * The **callback_fn** should be a static function and + * the **callback_ctx** should be a pointer to the stack. + * The **flags** is used to control certain aspects of the helper. + * Currently, the **flags** must be 0. + * + * The following are a list of supported map types and their + * respective expected callback signatures: + * + * BPF_MAP_TYPE_HASH, BPF_MAP_TYPE_PERCPU_HASH, + * BPF_MAP_TYPE_LRU_HASH, BPF_MAP_TYPE_LRU_PERCPU_HASH, + * BPF_MAP_TYPE_ARRAY, BPF_MAP_TYPE_PERCPU_ARRAY + * + * long (\*callback_fn)(struct bpf_map \*map, const void \*key, void \*value, void \*ctx); + * + * For per_cpu maps, the map_value is the value on the cpu where the + * bpf_prog is running. + * + * If **callback_fn** return 0, the helper will continue to the next + * element. If return value is 1, the helper will skip the rest of + * elements and return. Other return values are not used now. + * + * Return + * The number of traversed map elements for success, **-EINVAL** for + * invalid **flags**. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4075,6 +4112,7 @@ union bpf_attr { FN(ima_inode_hash), \ FN(sock_from_file), \ FN(check_mtu), \ + FN(for_each_map_elem), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper From 314ee05e2fc601a7bece14376547d2b7a04bab67 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 26 Feb 2021 12:49:27 -0800 Subject: [PATCH 36/90] bpf: Add hashtab support for bpf_for_each_map_elem() helper This patch added support for hashmap, percpu hashmap, lru hashmap and percpu lru hashmap. 
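For reference, a minimal sketch of how a BPF program can drive this helper over a hash map (illustrative only: the map, callback and field names below are made up, but the pattern matches the for_each selftest added later in this series, and it assumes libbpf headers that already declare bpf_for_each_map_elem()):

  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>

  char _license[] SEC("license") = "GPL";

  struct {
          __uint(type, BPF_MAP_TYPE_HASH);
          __uint(max_entries, 16);
          __type(key, __u32);
          __type(value, __u64);
  } hmap SEC(".maps");

  struct cb_ctx {
          __u64 sum;      /* output written by the callback */
  };

  /* must be a static function; called once per element,
   * return 0 to continue, 1 to stop the iteration early
   */
  static __u64 sum_elem(struct bpf_map *map, __u32 *key, __u64 *val,
                        struct cb_ctx *ctx)
  {
          ctx->sum += *val;
          return 0;
  }

  SEC("tracepoint/syscalls/sys_enter_getpid")
  int sum_map(void *ctx)
  {
          struct cb_ctx data = {};

          /* iterate all elements; flags must currently be 0 */
          bpf_for_each_map_elem(&hmap, sum_elem, &data, 0);
          return 0;
  }

The callback context lives on the caller's stack, which is why the helper's third argument is verified as ARG_PTR_TO_STACK_OR_NULL.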
Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210226204927.3885020-1-yhs@fb.com --- include/linux/bpf.h | 4 +++ kernel/bpf/hashtab.c | 65 +++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 27 ++++++++++++++++++ 3 files changed, 96 insertions(+) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index aeb1b93a4d75..4c730863fa77 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1397,6 +1397,10 @@ void bpf_iter_map_show_fdinfo(const struct bpf_iter_aux_info *aux, int bpf_iter_map_fill_link_info(const struct bpf_iter_aux_info *aux, struct bpf_link_info *info); +int map_set_for_each_callback_args(struct bpf_verifier_env *env, + struct bpf_func_state *caller, + struct bpf_func_state *callee); + int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value); int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value); int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index d63912e73ad9..330d721dd2af 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1869,6 +1869,63 @@ static const struct bpf_iter_seq_info iter_seq_info = { .seq_priv_size = sizeof(struct bpf_iter_seq_hash_map_info), }; +static int bpf_for_each_hash_elem(struct bpf_map *map, void *callback_fn, + void *callback_ctx, u64 flags) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct hlist_nulls_head *head; + struct hlist_nulls_node *n; + struct htab_elem *elem; + u32 roundup_key_size; + int i, num_elems = 0; + void __percpu *pptr; + struct bucket *b; + void *key, *val; + bool is_percpu; + u64 ret = 0; + + if (flags != 0) + return -EINVAL; + + is_percpu = htab_is_percpu(htab); + + roundup_key_size = round_up(map->key_size, 8); + /* disable migration so percpu value prepared here will be the + * same as the one seen by the bpf program with bpf_map_lookup_elem(). 
+ */ + if (is_percpu) + migrate_disable(); + for (i = 0; i < htab->n_buckets; i++) { + b = &htab->buckets[i]; + rcu_read_lock(); + head = &b->head; + hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) { + key = elem->key; + if (is_percpu) { + /* current cpu value for percpu map */ + pptr = htab_elem_get_ptr(elem, map->key_size); + val = this_cpu_ptr(pptr); + } else { + val = elem->key + roundup_key_size; + } + num_elems++; + ret = BPF_CAST_CALL(callback_fn)((u64)(long)map, + (u64)(long)key, (u64)(long)val, + (u64)(long)callback_ctx, 0); + /* return value: 0 - continue, 1 - stop and return */ + if (ret) { + rcu_read_unlock(); + goto out; + } + } + rcu_read_unlock(); + } +out: + if (is_percpu) + migrate_enable(); + return num_elems; +} + static int htab_map_btf_id; const struct bpf_map_ops htab_map_ops = { .map_meta_equal = bpf_map_meta_equal, @@ -1881,6 +1938,8 @@ const struct bpf_map_ops htab_map_ops = { .map_delete_elem = htab_map_delete_elem, .map_gen_lookup = htab_map_gen_lookup, .map_seq_show_elem = htab_map_seq_show_elem, + .map_set_for_each_callback_args = map_set_for_each_callback_args, + .map_for_each_callback = bpf_for_each_hash_elem, BATCH_OPS(htab), .map_btf_name = "bpf_htab", .map_btf_id = &htab_map_btf_id, @@ -1900,6 +1959,8 @@ const struct bpf_map_ops htab_lru_map_ops = { .map_delete_elem = htab_lru_map_delete_elem, .map_gen_lookup = htab_lru_map_gen_lookup, .map_seq_show_elem = htab_map_seq_show_elem, + .map_set_for_each_callback_args = map_set_for_each_callback_args, + .map_for_each_callback = bpf_for_each_hash_elem, BATCH_OPS(htab_lru), .map_btf_name = "bpf_htab", .map_btf_id = &htab_lru_map_btf_id, @@ -2019,6 +2080,8 @@ const struct bpf_map_ops htab_percpu_map_ops = { .map_update_elem = htab_percpu_map_update_elem, .map_delete_elem = htab_map_delete_elem, .map_seq_show_elem = htab_percpu_map_seq_show_elem, + .map_set_for_each_callback_args = map_set_for_each_callback_args, + .map_for_each_callback = bpf_for_each_hash_elem, BATCH_OPS(htab_percpu), .map_btf_name = "bpf_htab", .map_btf_id = &htab_percpu_map_btf_id, @@ -2036,6 +2099,8 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = { .map_update_elem = htab_lru_percpu_map_update_elem, .map_delete_elem = htab_lru_map_delete_elem, .map_seq_show_elem = htab_percpu_map_seq_show_elem, + .map_set_for_each_callback_args = map_set_for_each_callback_args, + .map_for_each_callback = bpf_for_each_hash_elem, BATCH_OPS(htab_lru_percpu), .map_btf_name = "bpf_htab", .map_btf_id = &htab_lru_percpu_map_btf_id, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 53afe9461b03..9fe90ce52a65 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5403,6 +5403,33 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn return 0; } +int map_set_for_each_callback_args(struct bpf_verifier_env *env, + struct bpf_func_state *caller, + struct bpf_func_state *callee) +{ + /* bpf_for_each_map_elem(struct bpf_map *map, void *callback_fn, + * void *callback_ctx, u64 flags); + * callback_fn(struct bpf_map *map, void *key, void *value, + * void *callback_ctx); + */ + callee->regs[BPF_REG_1] = caller->regs[BPF_REG_1]; + + callee->regs[BPF_REG_2].type = PTR_TO_MAP_KEY; + __mark_reg_known_zero(&callee->regs[BPF_REG_2]); + callee->regs[BPF_REG_2].map_ptr = caller->regs[BPF_REG_1].map_ptr; + + callee->regs[BPF_REG_3].type = PTR_TO_MAP_VALUE; + __mark_reg_known_zero(&callee->regs[BPF_REG_3]); + callee->regs[BPF_REG_3].map_ptr = caller->regs[BPF_REG_1].map_ptr; + + /* pointer to stack or null */ + 
callee->regs[BPF_REG_4] = caller->regs[BPF_REG_3]; + + /* unused */ + __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); + return 0; +} + static int set_callee_state(struct bpf_verifier_env *env, struct bpf_func_state *caller, struct bpf_func_state *callee, int insn_idx) From 06dcdcd4b9e84ee78d865c928b1d43bb71829251 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 26 Feb 2021 12:49:28 -0800 Subject: [PATCH 37/90] bpf: Add arraymap support for bpf_for_each_map_elem() helper This patch added support for arraymap and percpu arraymap. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210226204928.3885192-1-yhs@fb.com --- kernel/bpf/arraymap.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 1f8453343bf2..463d25e1e67e 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -625,6 +625,42 @@ static const struct bpf_iter_seq_info iter_seq_info = { .seq_priv_size = sizeof(struct bpf_iter_seq_array_map_info), }; +static int bpf_for_each_array_elem(struct bpf_map *map, void *callback_fn, + void *callback_ctx, u64 flags) +{ + u32 i, key, num_elems = 0; + struct bpf_array *array; + bool is_percpu; + u64 ret = 0; + void *val; + + if (flags != 0) + return -EINVAL; + + is_percpu = map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; + array = container_of(map, struct bpf_array, map); + if (is_percpu) + migrate_disable(); + for (i = 0; i < map->max_entries; i++) { + if (is_percpu) + val = this_cpu_ptr(array->pptrs[i]); + else + val = array->value + array->elem_size * i; + num_elems++; + key = i; + ret = BPF_CAST_CALL(callback_fn)((u64)(long)map, + (u64)(long)&key, (u64)(long)val, + (u64)(long)callback_ctx, 0); + /* return value: 0 - continue, 1 - stop and return */ + if (ret) + break; + } + + if (is_percpu) + migrate_enable(); + return num_elems; +} + static int array_map_btf_id; const struct bpf_map_ops array_map_ops = { .map_meta_equal = array_map_meta_equal, @@ -643,6 +679,8 @@ const struct bpf_map_ops array_map_ops = { .map_check_btf = array_map_check_btf, .map_lookup_batch = generic_map_lookup_batch, .map_update_batch = generic_map_update_batch, + .map_set_for_each_callback_args = map_set_for_each_callback_args, + .map_for_each_callback = bpf_for_each_array_elem, .map_btf_name = "bpf_array", .map_btf_id = &array_map_btf_id, .iter_seq_info = &iter_seq_info, @@ -660,6 +698,8 @@ const struct bpf_map_ops percpu_array_map_ops = { .map_delete_elem = array_map_delete_elem, .map_seq_show_elem = percpu_array_map_seq_show_elem, .map_check_btf = array_map_check_btf, + .map_set_for_each_callback_args = map_set_for_each_callback_args, + .map_for_each_callback = bpf_for_each_array_elem, .map_btf_name = "bpf_array", .map_btf_id = &percpu_array_map_btf_id, .iter_seq_info = &iter_seq_info, From b8f871fa32ad392759bc70090fa8c60d9f10625c Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 26 Feb 2021 12:49:29 -0800 Subject: [PATCH 38/90] libbpf: Move function is_ldimm64() earlier in libbpf.c Move function is_ldimm64() close to the beginning of libbpf.c, so it can be reused by later code and the next patch as well. There is no functionality change. 
Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210226204929.3885295-1-yhs@fb.com --- tools/lib/bpf/libbpf.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index d43cc3f29dae..21a3eedf070d 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -574,6 +574,11 @@ static bool insn_is_subprog_call(const struct bpf_insn *insn) insn->off == 0; } +static bool is_ldimm64(struct bpf_insn *insn) +{ + return insn->code == (BPF_LD | BPF_IMM | BPF_DW); +} + static int bpf_object__init_prog(struct bpf_object *obj, struct bpf_program *prog, const char *name, size_t sec_idx, const char *sec_name, @@ -3395,7 +3400,7 @@ static int bpf_program__record_reloc(struct bpf_program *prog, return 0; } - if (insn->code != (BPF_LD | BPF_IMM | BPF_DW)) { + if (!is_ldimm64(insn)) { pr_warn("prog '%s': invalid relo against '%s' for insns[%d].code 0x%x\n", prog->name, sym_name, insn_idx, insn->code); return -LIBBPF_ERRNO__RELOC; @@ -5566,11 +5571,6 @@ static void bpf_core_poison_insn(struct bpf_program *prog, int relo_idx, insn->imm = 195896080; /* => 0xbad2310 => "bad relo" */ } -static bool is_ldimm64(struct bpf_insn *insn) -{ - return insn->code == (BPF_LD | BPF_IMM | BPF_DW); -} - static int insn_bpf_size_to_bytes(struct bpf_insn *insn) { switch (BPF_SIZE(insn->code)) { From 53eddb5e04ac5c53a4ccef9f1f900562a5a75246 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 26 Feb 2021 12:49:30 -0800 Subject: [PATCH 39/90] libbpf: Support subprog address relocation A new relocation RELO_SUBPROG_ADDR is added to capture subprog addresses loaded with ld_imm64 insns. Such ld_imm64 insns are marked with BPF_PSEUDO_FUNC and will be passed to kernel. For bpf_for_each_map_elem() case, kernel will check that the to-be-used subprog address must be a static function and replace it with proper actual jited func address. 
Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210226204930.3885367-1-yhs@fb.com --- tools/lib/bpf/libbpf.c | 64 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 61 insertions(+), 3 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 21a3eedf070d..62d9ed56b081 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -188,6 +188,7 @@ enum reloc_type { RELO_CALL, RELO_DATA, RELO_EXTERN, + RELO_SUBPROG_ADDR, }; struct reloc_desc { @@ -579,6 +580,11 @@ static bool is_ldimm64(struct bpf_insn *insn) return insn->code == (BPF_LD | BPF_IMM | BPF_DW); } +static bool insn_is_pseudo_func(struct bpf_insn *insn) +{ + return is_ldimm64(insn) && insn->src_reg == BPF_PSEUDO_FUNC; +} + static int bpf_object__init_prog(struct bpf_object *obj, struct bpf_program *prog, const char *name, size_t sec_idx, const char *sec_name, @@ -2979,6 +2985,23 @@ static bool sym_is_extern(const GElf_Sym *sym) GELF_ST_TYPE(sym->st_info) == STT_NOTYPE; } +static bool sym_is_subprog(const GElf_Sym *sym, int text_shndx) +{ + int bind = GELF_ST_BIND(sym->st_info); + int type = GELF_ST_TYPE(sym->st_info); + + /* in .text section */ + if (sym->st_shndx != text_shndx) + return false; + + /* local function */ + if (bind == STB_LOCAL && type == STT_SECTION) + return true; + + /* global function */ + return bind == STB_GLOBAL && type == STT_FUNC; +} + static int find_extern_btf_id(const struct btf *btf, const char *ext_name) { const struct btf_type *t; @@ -3435,6 +3458,23 @@ static int bpf_program__record_reloc(struct bpf_program *prog, return -LIBBPF_ERRNO__RELOC; } + /* loading subprog addresses */ + if (sym_is_subprog(sym, obj->efile.text_shndx)) { + /* global_func: sym->st_value = offset in the section, insn->imm = 0. + * local_func: sym->st_value = 0, insn->imm = offset in the section. 
+ */ + if ((sym->st_value % BPF_INSN_SZ) || (insn->imm % BPF_INSN_SZ)) { + pr_warn("prog '%s': bad subprog addr relo against '%s' at offset %zu+%d\n", + prog->name, sym_name, (size_t)sym->st_value, insn->imm); + return -LIBBPF_ERRNO__RELOC; + } + + reloc_desc->type = RELO_SUBPROG_ADDR; + reloc_desc->insn_idx = insn_idx; + reloc_desc->sym_off = sym->st_value; + return 0; + } + type = bpf_object__section_to_libbpf_map_type(obj, shdr_idx); sym_sec_name = elf_sec_name(obj, elf_sec_by_idx(obj, shdr_idx)); @@ -6172,6 +6212,10 @@ bpf_object__relocate_data(struct bpf_object *obj, struct bpf_program *prog) } relo->processed = true; break; + case RELO_SUBPROG_ADDR: + insn[0].src_reg = BPF_PSEUDO_FUNC; + /* will be handled as a follow up pass */ + break; case RELO_CALL: /* will be handled as a follow up pass */ break; @@ -6358,11 +6402,11 @@ bpf_object__reloc_code(struct bpf_object *obj, struct bpf_program *main_prog, for (insn_idx = 0; insn_idx < prog->sec_insn_cnt; insn_idx++) { insn = &main_prog->insns[prog->sub_insn_off + insn_idx]; - if (!insn_is_subprog_call(insn)) + if (!insn_is_subprog_call(insn) && !insn_is_pseudo_func(insn)) continue; relo = find_prog_insn_relo(prog, insn_idx); - if (relo && relo->type != RELO_CALL) { + if (relo && relo->type != RELO_CALL && relo->type != RELO_SUBPROG_ADDR) { pr_warn("prog '%s': unexpected relo for insn #%zu, type %d\n", prog->name, insn_idx, relo->type); return -LIBBPF_ERRNO__RELOC; @@ -6374,8 +6418,22 @@ bpf_object__reloc_code(struct bpf_object *obj, struct bpf_program *main_prog, * call always has imm = -1, but for static functions * relocation is against STT_SECTION and insn->imm * points to a start of a static function + * + * for subprog addr relocation, the relo->sym_off + insn->imm is + * the byte offset in the corresponding section. */ - sub_insn_idx = relo->sym_off / BPF_INSN_SZ + insn->imm + 1; + if (relo->type == RELO_CALL) + sub_insn_idx = relo->sym_off / BPF_INSN_SZ + insn->imm + 1; + else + sub_insn_idx = (relo->sym_off + insn->imm) / BPF_INSN_SZ; + } else if (insn_is_pseudo_func(insn)) { + /* + * RELO_SUBPROG_ADDR relo is always emitted even if both + * functions are in the same section, so it shouldn't reach here. + */ + pr_warn("prog '%s': missing subprog addr relo for insn #%zu\n", + prog->name, insn_idx); + return -LIBBPF_ERRNO__RELOC; } else { /* if subprogram call is to a static function within * the same ELF section, there won't be any relocation From f1f9f0d8d737b9a1c5d15635bf5696643626fd39 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 26 Feb 2021 12:49:31 -0800 Subject: [PATCH 40/90] bpftool: Print subprog address properly With later hashmap example, using bpftool xlated output may look like: int dump_task(struct bpf_iter__task * ctx): ; struct task_struct *task = ctx->task; 0: (79) r2 = *(u64 *)(r1 +8) ; if (task == (void *)0 || called > 0) ... 19: (18) r2 = subprog[+17] 30: (18) r2 = subprog[+25] ... 36: (95) exit __u64 check_hash_elem(struct bpf_map * map, __u32 * key, __u64 * val, struct callback_ctx * data): ; struct bpf_iter__task *ctx = data->ctx; 37: (79) r5 = *(u64 *)(r4 +0) ... 55: (95) exit __u64 check_percpu_elem(struct bpf_map * map, __u32 * key, __u64 * val, void * unused): ; check_percpu_elem(struct bpf_map *map, __u32 *key, __u64 *val, void *unused) 56: (bf) r6 = r3 ... 
83: (18) r2 = subprog[-47] Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210226204931.3885458-1-yhs@fb.com --- tools/bpf/bpftool/xlated_dumper.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/bpf/bpftool/xlated_dumper.c b/tools/bpf/bpftool/xlated_dumper.c index 8608cd68cdd0..6fc3e6f7f40c 100644 --- a/tools/bpf/bpftool/xlated_dumper.c +++ b/tools/bpf/bpftool/xlated_dumper.c @@ -196,6 +196,9 @@ static const char *print_imm(void *private_data, else if (insn->src_reg == BPF_PSEUDO_MAP_VALUE) snprintf(dd->scratch_buff, sizeof(dd->scratch_buff), "map[id:%u][0]+%u", insn->imm, (insn + 1)->imm); + else if (insn->src_reg == BPF_PSEUDO_FUNC) + snprintf(dd->scratch_buff, sizeof(dd->scratch_buff), + "subprog[%+d]", insn->imm); else snprintf(dd->scratch_buff, sizeof(dd->scratch_buff), "0x%llx", (unsigned long long)full_imm); From 9de7f0fdab326a37c9f741f0f6c0f1cbc320a5ab Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 26 Feb 2021 12:49:33 -0800 Subject: [PATCH 41/90] selftests/bpf: Add hashmap test for bpf_for_each_map_elem() helper A test case is added for hashmap and percpu hashmap. The test also exercises nested bpf_for_each_map_elem() calls like bpf_prog: bpf_for_each_map_elem(func1) func1: bpf_for_each_map_elem(func2) func2: $ ./test_progs -n 45 #45/1 hash_map:OK #45 for_each:OK Summary: 1/1 PASSED, 0 SKIPPED, 0 FAILED Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210226204933.3885657-1-yhs@fb.com --- .../selftests/bpf/prog_tests/for_each.c | 73 ++++++++++++++ .../bpf/progs/for_each_hash_map_elem.c | 95 +++++++++++++++++++ tools/testing/selftests/bpf/test_progs.h | 11 +++ 3 files changed, 179 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/for_each.c create mode 100644 tools/testing/selftests/bpf/progs/for_each_hash_map_elem.c diff --git a/tools/testing/selftests/bpf/prog_tests/for_each.c b/tools/testing/selftests/bpf/prog_tests/for_each.c new file mode 100644 index 000000000000..aa847cd9f71f --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/for_each.c @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ +#include +#include +#include "for_each_hash_map_elem.skel.h" + +static unsigned int duration; + +static void test_hash_map(void) +{ + int i, err, hashmap_fd, max_entries, percpu_map_fd; + struct for_each_hash_map_elem *skel; + __u64 *percpu_valbuf = NULL; + __u32 key, num_cpus, retval; + __u64 val; + + skel = for_each_hash_map_elem__open_and_load(); + if (!ASSERT_OK_PTR(skel, "for_each_hash_map_elem__open_and_load")) + return; + + hashmap_fd = bpf_map__fd(skel->maps.hashmap); + max_entries = bpf_map__max_entries(skel->maps.hashmap); + for (i = 0; i < max_entries; i++) { + key = i; + val = i + 1; + err = bpf_map_update_elem(hashmap_fd, &key, &val, BPF_ANY); + if (!ASSERT_OK(err, "map_update")) + goto out; + } + + num_cpus = bpf_num_possible_cpus(); + percpu_map_fd = bpf_map__fd(skel->maps.percpu_map); + percpu_valbuf = malloc(sizeof(__u64) * num_cpus); + if (!ASSERT_OK_PTR(percpu_valbuf, "percpu_valbuf")) + goto out; + + key = 1; + for (i = 0; i < num_cpus; i++) + percpu_valbuf[i] = i + 1; + err = bpf_map_update_elem(percpu_map_fd, &key, percpu_valbuf, BPF_ANY); + if (!ASSERT_OK(err, "percpu_map_update")) + goto out; + + err = bpf_prog_test_run(bpf_program__fd(skel->progs.test_pkt_access), + 1, &pkt_v4, sizeof(pkt_v4), NULL, NULL, + &retval, &duration); + 
if (CHECK(err || retval, "ipv4", "err %d errno %d retval %d\n", + err, errno, retval)) + goto out; + + ASSERT_EQ(skel->bss->hashmap_output, 4, "hashmap_output"); + ASSERT_EQ(skel->bss->hashmap_elems, max_entries, "hashmap_elems"); + + key = 1; + err = bpf_map_lookup_elem(hashmap_fd, &key, &val); + ASSERT_ERR(err, "hashmap_lookup"); + + ASSERT_EQ(skel->bss->percpu_called, 1, "percpu_called"); + ASSERT_LT(skel->bss->cpu, num_cpus, "num_cpus"); + ASSERT_EQ(skel->bss->percpu_map_elems, 1, "percpu_map_elems"); + ASSERT_EQ(skel->bss->percpu_key, 1, "percpu_key"); + ASSERT_EQ(skel->bss->percpu_val, skel->bss->cpu + 1, "percpu_val"); + ASSERT_EQ(skel->bss->percpu_output, 100, "percpu_output"); +out: + free(percpu_valbuf); + for_each_hash_map_elem__destroy(skel); +} + +void test_for_each(void) +{ + if (test__start_subtest("hash_map")) + test_hash_map(); +} diff --git a/tools/testing/selftests/bpf/progs/for_each_hash_map_elem.c b/tools/testing/selftests/bpf/progs/for_each_hash_map_elem.c new file mode 100644 index 000000000000..913dd91aafff --- /dev/null +++ b/tools/testing/selftests/bpf/progs/for_each_hash_map_elem.c @@ -0,0 +1,95 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ +#include "vmlinux.h" +#include + +char _license[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 3); + __type(key, __u32); + __type(value, __u64); +} hashmap SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_HASH); + __uint(max_entries, 1); + __type(key, __u32); + __type(value, __u64); +} percpu_map SEC(".maps"); + +struct callback_ctx { + struct __sk_buff *ctx; + int input; + int output; +}; + +static __u64 +check_hash_elem(struct bpf_map *map, __u32 *key, __u64 *val, + struct callback_ctx *data) +{ + struct __sk_buff *skb = data->ctx; + __u32 k; + __u64 v; + + if (skb) { + k = *key; + v = *val; + if (skb->len == 10000 && k == 10 && v == 10) + data->output = 3; /* impossible path */ + else + data->output = 4; + } else { + data->output = data->input; + bpf_map_delete_elem(map, key); + } + + return 0; +} + +__u32 cpu = 0; +__u32 percpu_called = 0; +__u32 percpu_key = 0; +__u64 percpu_val = 0; +int percpu_output = 0; + +static __u64 +check_percpu_elem(struct bpf_map *map, __u32 *key, __u64 *val, + struct callback_ctx *unused) +{ + struct callback_ctx data; + + percpu_called++; + cpu = bpf_get_smp_processor_id(); + percpu_key = *key; + percpu_val = *val; + + data.ctx = 0; + data.input = 100; + data.output = 0; + bpf_for_each_map_elem(&hashmap, check_hash_elem, &data, 0); + percpu_output = data.output; + + return 0; +} + +int hashmap_output = 0; +int hashmap_elems = 0; +int percpu_map_elems = 0; + +SEC("classifier") +int test_pkt_access(struct __sk_buff *skb) +{ + struct callback_ctx data; + + data.ctx = skb; + data.input = 10; + data.output = 0; + hashmap_elems = bpf_for_each_map_elem(&hashmap, check_hash_elem, &data, 0); + hashmap_output = data.output; + + percpu_map_elems = bpf_for_each_map_elem(&percpu_map, check_percpu_elem, + (void *)0, 0); + return 0; +} diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h index f7c2fd89d01a..e87c8546230e 100644 --- a/tools/testing/selftests/bpf/test_progs.h +++ b/tools/testing/selftests/bpf/test_progs.h @@ -152,6 +152,17 @@ extern int test__join_cgroup(const char *path); ___ok; \ }) +#define ASSERT_LT(actual, expected, name) ({ \ + static int duration = 0; \ + typeof(actual) ___act = (actual); \ + typeof(expected) ___exp = (expected); \ + bool ___ok = ___act < 
___exp; \ + CHECK(!___ok, (name), \ + "unexpected %s: actual %lld >= expected %lld\n", \ + (name), (long long)(___act), (long long)(___exp)); \ + ___ok; \ +}) + #define ASSERT_STREQ(actual, expected, name) ({ \ static int duration = 0; \ const char *___act = actual; \ From 6b9e3331347ee9e84fe5c71d3eba7ec204f9bb25 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 26 Feb 2021 12:49:34 -0800 Subject: [PATCH 42/90] selftests/bpf: Add arraymap test for bpf_for_each_map_elem() helper A test is added for arraymap and percpu arraymap. The test also exercises the early return for the helper which does not traverse all elements. $ ./test_progs -n 45 #45/1 hash_map:OK #45/2 array_map:OK #45 for_each:OK Summary: 1/2 PASSED, 0 SKIPPED, 0 FAILED Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210226204934.3885756-1-yhs@fb.com --- .../selftests/bpf/prog_tests/for_each.c | 57 +++++++++++++++++ .../bpf/progs/for_each_array_map_elem.c | 61 +++++++++++++++++++ 2 files changed, 118 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/for_each_array_map_elem.c diff --git a/tools/testing/selftests/bpf/prog_tests/for_each.c b/tools/testing/selftests/bpf/prog_tests/for_each.c index aa847cd9f71f..68eb12a287d4 100644 --- a/tools/testing/selftests/bpf/prog_tests/for_each.c +++ b/tools/testing/selftests/bpf/prog_tests/for_each.c @@ -3,6 +3,7 @@ #include #include #include "for_each_hash_map_elem.skel.h" +#include "for_each_array_map_elem.skel.h" static unsigned int duration; @@ -66,8 +67,64 @@ out: for_each_hash_map_elem__destroy(skel); } +static void test_array_map(void) +{ + __u32 key, num_cpus, max_entries, retval; + int i, arraymap_fd, percpu_map_fd, err; + struct for_each_array_map_elem *skel; + __u64 *percpu_valbuf = NULL; + __u64 val, expected_total; + + skel = for_each_array_map_elem__open_and_load(); + if (!ASSERT_OK_PTR(skel, "for_each_array_map_elem__open_and_load")) + return; + + arraymap_fd = bpf_map__fd(skel->maps.arraymap); + expected_total = 0; + max_entries = bpf_map__max_entries(skel->maps.arraymap); + for (i = 0; i < max_entries; i++) { + key = i; + val = i + 1; + /* skip the last iteration for expected total */ + if (i != max_entries - 1) + expected_total += val; + err = bpf_map_update_elem(arraymap_fd, &key, &val, BPF_ANY); + if (!ASSERT_OK(err, "map_update")) + goto out; + } + + num_cpus = bpf_num_possible_cpus(); + percpu_map_fd = bpf_map__fd(skel->maps.percpu_map); + percpu_valbuf = malloc(sizeof(__u64) * num_cpus); + if (!ASSERT_OK_PTR(percpu_valbuf, "percpu_valbuf")) + goto out; + + key = 0; + for (i = 0; i < num_cpus; i++) + percpu_valbuf[i] = i + 1; + err = bpf_map_update_elem(percpu_map_fd, &key, percpu_valbuf, BPF_ANY); + if (!ASSERT_OK(err, "percpu_map_update")) + goto out; + + err = bpf_prog_test_run(bpf_program__fd(skel->progs.test_pkt_access), + 1, &pkt_v4, sizeof(pkt_v4), NULL, NULL, + &retval, &duration); + if (CHECK(err || retval, "ipv4", "err %d errno %d retval %d\n", + err, errno, retval)) + goto out; + + ASSERT_EQ(skel->bss->arraymap_output, expected_total, "array_output"); + ASSERT_EQ(skel->bss->cpu + 1, skel->bss->percpu_val, "percpu_val"); + +out: + free(percpu_valbuf); + for_each_array_map_elem__destroy(skel); +} + void test_for_each(void) { if (test__start_subtest("hash_map")) test_hash_map(); + if (test__start_subtest("array_map")) + test_array_map(); } diff --git a/tools/testing/selftests/bpf/progs/for_each_array_map_elem.c 
b/tools/testing/selftests/bpf/progs/for_each_array_map_elem.c new file mode 100644 index 000000000000..75e8e1069fe7 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/for_each_array_map_elem.c @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ +#include "vmlinux.h" +#include + +char _license[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 3); + __type(key, __u32); + __type(value, __u64); +} arraymap SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, 1); + __type(key, __u32); + __type(value, __u64); +} percpu_map SEC(".maps"); + +struct callback_ctx { + int output; +}; + +static __u64 +check_array_elem(struct bpf_map *map, __u32 *key, __u64 *val, + struct callback_ctx *data) +{ + data->output += *val; + if (*key == 1) + return 1; /* stop the iteration */ + return 0; +} + +__u32 cpu = 0; +__u64 percpu_val = 0; + +static __u64 +check_percpu_elem(struct bpf_map *map, __u32 *key, __u64 *val, + struct callback_ctx *data) +{ + cpu = bpf_get_smp_processor_id(); + percpu_val = *val; + return 0; +} + +u32 arraymap_output = 0; + +SEC("classifier") +int test_pkt_access(struct __sk_buff *skb) +{ + struct callback_ctx data; + + data.output = 0; + bpf_for_each_map_elem(&arraymap, check_array_elem, &data, 0); + arraymap_output = data.output; + + bpf_for_each_map_elem(&percpu_map, check_percpu_elem, (void *)0, 0); + return 0; +} From 04883a079968e6250a4549f6fadb6427c534885e Mon Sep 17 00:00:00 2001 From: Ian Denhardt Date: Tue, 23 Feb 2021 21:15:31 -0500 Subject: [PATCH 43/90] tools, bpf_asm: Hard error on out of range jumps Per discussion at [0] this was originally introduced as a warning due to concerns about breaking existing code, but a hard error probably makes more sense, especially given that concerns about breakage were only speculation. [0] https://lore.kernel.org/bpf/c964892195a6b91d20a67691448567ef528ffa6d.camel@linux.ibm.com/T/#t Signed-off-by: Ian Denhardt Signed-off-by: Daniel Borkmann Acked-by: Ilya Leoshkevich Link: https://lore.kernel.org/bpf/a6b6c7516f5d559049d669968e953b4a8d7adea3.1614201868.git.ian@zenhack.net --- tools/bpf/bpf_exp.y | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/bpf/bpf_exp.y b/tools/bpf/bpf_exp.y index 8d48e896be50..8d03e5245da5 100644 --- a/tools/bpf/bpf_exp.y +++ b/tools/bpf/bpf_exp.y @@ -549,9 +549,11 @@ static uint8_t bpf_encode_jt_jf_offset(int off, int i) { int delta = off - i - 1; - if (delta < 0 || delta > 255) - fprintf(stderr, "warning: insn #%d jumps to insn #%d, " + if (delta < 0 || delta > 255) { + fprintf(stderr, "error: insn #%d jumps to insn #%d, " "which is out of range\n", i, off); + exit(1); + } return (uint8_t) delta; } From 85e142cb42a1e7b33971bf035dae432d8670c46b Mon Sep 17 00:00:00 2001 From: Ian Denhardt Date: Tue, 23 Feb 2021 21:24:00 -0500 Subject: [PATCH 44/90] tools, bpf_asm: Exit non-zero on errors ... so callers can correctly detect failure. 
Signed-off-by: Ian Denhardt Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/b0bea780bc292f29e7b389dd062f20adc2a2d634.1614201868.git.ian@zenhack.net --- tools/bpf/bpf_exp.y | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/bpf/bpf_exp.y b/tools/bpf/bpf_exp.y index 8d03e5245da5..dfb7254a24e8 100644 --- a/tools/bpf/bpf_exp.y +++ b/tools/bpf/bpf_exp.y @@ -185,13 +185,13 @@ ldx | OP_LDXB number '*' '(' '[' number ']' '&' number ')' { if ($2 != 4 || $9 != 0xf) { fprintf(stderr, "ldxb offset not supported!\n"); - exit(0); + exit(1); } else { bpf_set_curr_instr(BPF_LDX | BPF_MSH | BPF_B, 0, 0, $6); } } | OP_LDX number '*' '(' '[' number ']' '&' number ')' { if ($2 != 4 || $9 != 0xf) { fprintf(stderr, "ldxb offset not supported!\n"); - exit(0); + exit(1); } else { bpf_set_curr_instr(BPF_LDX | BPF_MSH | BPF_B, 0, 0, $6); } } ; @@ -472,7 +472,7 @@ static void bpf_assert_max(void) { if (curr_instr >= BPF_MAXINSNS) { fprintf(stderr, "only max %u insns allowed!\n", BPF_MAXINSNS); - exit(0); + exit(1); } } @@ -522,7 +522,7 @@ static int bpf_find_insns_offset(const char *label) if (ret == -ENOENT) { fprintf(stderr, "no such label \'%s\'!\n", label); - exit(0); + exit(1); } return ret; From 303dcc25b5c782547eb13b9f29426de843dd6f34 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 2 Mar 2021 16:40:10 -0800 Subject: [PATCH 45/90] tools/runqslower: Allow substituting custom vmlinux.h for the build Just like was done for bpftool and selftests in ec23eb705620 ("tools/bpftool: Allow substituting custom vmlinux.h for the build") and ca4db6389d61 ("selftests/bpf: Allow substituting custom vmlinux.h for selftests build"), allow to provide pre-generated vmlinux.h for runqslower build. Signed-off-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210303004010.653954-1-andrii@kernel.org --- tools/bpf/runqslower/Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/bpf/runqslower/Makefile b/tools/bpf/runqslower/Makefile index c96ba90c6f01..3818ec511fd2 100644 --- a/tools/bpf/runqslower/Makefile +++ b/tools/bpf/runqslower/Makefile @@ -69,12 +69,16 @@ $(OUTPUT) $(BPFOBJ_OUTPUT) $(BPFTOOL_OUTPUT): $(QUIET_MKDIR)mkdir -p $@ $(OUTPUT)/vmlinux.h: $(VMLINUX_BTF_PATH) | $(OUTPUT) $(BPFTOOL) +ifeq ($(VMLINUX_H),) $(Q)if [ ! -e "$(VMLINUX_BTF_PATH)" ] ; then \ echo "Couldn't find kernel BTF; set VMLINUX_BTF to" \ "specify its location." 
>&2; \ exit 1;\ fi $(QUIET_GEN)$(BPFTOOL) btf dump file $(VMLINUX_BTF_PATH) format c > $@ +else + $(Q)cp "$(VMLINUX_H)" $@ +endif $(BPFOBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(BPFOBJ_OUTPUT) $(Q)$(MAKE) $(submake_extras) -C $(LIBBPF_SRC) OUTPUT=$(BPFOBJ_OUTPUT) $@ From 6ed6e1c761f6c8391af654facbbbf1748ae9f386 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 1 Mar 2021 10:48:05 -0800 Subject: [PATCH 46/90] skmsg: Add function doc for skb->_sk_redir This should fix the following warning: include/linux/skbuff.h:932: warning: Function parameter or member '_sk_redir' not described in 'sk_buff' Reported-by: Lorenz Bauer Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Lorenz Bauer Link: https://lore.kernel.org/bpf/20210301184805.8174-1-xiyou.wangcong@gmail.com --- include/linux/skbuff.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index bd84f799c952..0503c917d773 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -656,6 +656,7 @@ typedef unsigned char *sk_buff_data_t; * @protocol: Packet protocol from driver * @destructor: Destruct function * @tcp_tsorted_anchor: list structure for TCP (tp->tsorted_sent_queue) + * @_sk_redir: socket redirection information for skmsg * @_nfct: Associated connection, if any (with nfctinfo bits) * @nf_bridge: Saved data about a bridged frame - see br_netfilter.c * @skb_iif: ifindex of device we arrived on From 86a35af628e539f7ae9796f1984761e8d3232ac0 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 26 Feb 2021 14:38:10 -0800 Subject: [PATCH 47/90] selftests/bpf: Add a verifier scale test with unknown bounded loop The original bcc pull request https://github.com/iovisor/bcc/pull/3270 exposed a verifier failure with Clang 12/13 while Clang 4 works fine. Further investigation exposed two issues: Issue 1: LLVM may generate code which uses less refined value. The issue is fixed in LLVM patch: https://reviews.llvm.org/D97479 Issue 2: Spills with initial value 0 are marked as precise which makes later state pruning less effective. This is my rough initial analysis and further investigation is needed to find how to improve verifier pruning in such cases. With the above LLVM patch, for the new loop6.c test, which has smaller loop bound compared to original test, I got: $ test_progs -s -n 10/16 ... stack depth 64 processed 390735 insns (limit 1000000) max_states_per_insn 87 total_states 8658 peak_states 964 mark_read 6 #10/16 loop6.o:OK Use the original loop bound, i.e., commenting out "#define WORKAROUND", I got: $ test_progs -s -n 10/16 ... BPF program is too large. Processed 1000001 insn stack depth 64 processed 1000001 insns (limit 1000000) max_states_per_insn 91 total_states 23176 peak_states 5069 mark_read 6 ... #10/16 loop6.o:FAIL The purpose of this patch is to provide a regression test for the above LLVM fix and also provide a test case for further analyzing the verifier pruning issue. 
Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann Cc: Zhenwei Pi Link: https://lore.kernel.org/bpf/20210226223810.236472-1-yhs@fb.com --- tools/testing/selftests/bpf/README.rst | 39 ++++++++ .../bpf/prog_tests/bpf_verif_scale.c | 1 + tools/testing/selftests/bpf/progs/loop6.c | 99 +++++++++++++++++++ 3 files changed, 139 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/loop6.c diff --git a/tools/testing/selftests/bpf/README.rst b/tools/testing/selftests/bpf/README.rst index fd148b8410fa..dbc8f6cc5c67 100644 --- a/tools/testing/selftests/bpf/README.rst +++ b/tools/testing/selftests/bpf/README.rst @@ -111,6 +111,45 @@ available in 10.0.1. The patch is available in llvm 11.0.0 trunk. __ https://reviews.llvm.org/D78466 +bpf_verif_scale/loop6.o test failure with Clang 12 +================================================== + +With Clang 12, the following bpf_verif_scale test failed: + * ``bpf_verif_scale/loop6.o`` + +The verifier output looks like + +.. code-block:: c + + R1 type=ctx expected=fp + The sequence of 8193 jumps is too complex. + +The reason is compiler generating the following code + +.. code-block:: c + + ; for (i = 0; (i < VIRTIO_MAX_SGS) && (i < num); i++) { + 14: 16 05 40 00 00 00 00 00 if w5 == 0 goto +64 + 15: bc 51 00 00 00 00 00 00 w1 = w5 + 16: 04 01 00 00 ff ff ff ff w1 += -1 + 17: 67 05 00 00 20 00 00 00 r5 <<= 32 + 18: 77 05 00 00 20 00 00 00 r5 >>= 32 + 19: a6 01 01 00 05 00 00 00 if w1 < 5 goto +1 + 20: b7 05 00 00 06 00 00 00 r5 = 6 + 00000000000000a8 : + 21: b7 02 00 00 00 00 00 00 r2 = 0 + 22: b7 01 00 00 00 00 00 00 r1 = 0 + ; for (i = 0; (i < VIRTIO_MAX_SGS) && (i < num); i++) { + 23: 7b 1a e0 ff 00 00 00 00 *(u64 *)(r10 - 32) = r1 + 24: 7b 5a c0 ff 00 00 00 00 *(u64 *)(r10 - 64) = r5 + +Note that insn #15 has w1 = w5 and w1 is refined later but +r5(w5) is eventually saved on stack at insn #24 for later use. +This cause later verifier failure. The bug has been `fixed`__ in +Clang 13. + +__ https://reviews.llvm.org/D97479 + BPF CO-RE-based tests and Clang version ======================================= diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c b/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c index e698ee6bb6c2..3d002c245d2b 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c @@ -76,6 +76,7 @@ void test_bpf_verif_scale(void) { "loop2.o", BPF_PROG_TYPE_RAW_TRACEPOINT }, { "loop4.o", BPF_PROG_TYPE_SCHED_CLS }, { "loop5.o", BPF_PROG_TYPE_SCHED_CLS }, + { "loop6.o", BPF_PROG_TYPE_KPROBE }, /* partial unroll. 19k insn in a loop. * Total program size 20.8k insn. diff --git a/tools/testing/selftests/bpf/progs/loop6.c b/tools/testing/selftests/bpf/progs/loop6.c new file mode 100644 index 000000000000..2a7141ac1656 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/loop6.c @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include + +char _license[] SEC("license") = "GPL"; + +/* typically virtio scsi has max SGs of 6 */ +#define VIRTIO_MAX_SGS 6 + +/* Verifier will fail with SG_MAX = 128. The failure can be + * workarounded with a smaller SG_MAX, e.g. 10. 
+ */ +#define WORKAROUND +#ifdef WORKAROUND +#define SG_MAX 10 +#else +/* typically virtio blk has max SEG of 128 */ +#define SG_MAX 128 +#endif + +#define SG_CHAIN 0x01UL +#define SG_END 0x02UL + +struct scatterlist { + unsigned long page_link; + unsigned int offset; + unsigned int length; +}; + +#define sg_is_chain(sg) ((sg)->page_link & SG_CHAIN) +#define sg_is_last(sg) ((sg)->page_link & SG_END) +#define sg_chain_ptr(sg) \ + ((struct scatterlist *) ((sg)->page_link & ~(SG_CHAIN | SG_END))) + +static inline struct scatterlist *__sg_next(struct scatterlist *sgp) +{ + struct scatterlist sg; + + bpf_probe_read_kernel(&sg, sizeof(sg), sgp); + if (sg_is_last(&sg)) + return NULL; + + sgp++; + + bpf_probe_read_kernel(&sg, sizeof(sg), sgp); + if (sg_is_chain(&sg)) + sgp = sg_chain_ptr(&sg); + + return sgp; +} + +static inline struct scatterlist *get_sgp(struct scatterlist **sgs, int i) +{ + struct scatterlist *sgp; + + bpf_probe_read_kernel(&sgp, sizeof(sgp), sgs + i); + return sgp; +} + +int config = 0; +int result = 0; + +SEC("kprobe/virtqueue_add_sgs") +BPF_KPROBE(trace_virtqueue_add_sgs, void *unused, struct scatterlist **sgs, + unsigned int out_sgs, unsigned int in_sgs) +{ + struct scatterlist *sgp = NULL; + __u64 length1 = 0, length2 = 0; + unsigned int i, n, len; + + if (config != 0) + return 0; + + for (i = 0; (i < VIRTIO_MAX_SGS) && (i < out_sgs); i++) { + for (n = 0, sgp = get_sgp(sgs, i); sgp && (n < SG_MAX); + sgp = __sg_next(sgp)) { + bpf_probe_read_kernel(&len, sizeof(len), &sgp->length); + length1 += len; + n++; + } + } + + for (i = 0; (i < VIRTIO_MAX_SGS) && (i < in_sgs); i++) { + for (n = 0, sgp = get_sgp(sgs, i); sgp && (n < SG_MAX); + sgp = __sg_next(sgp)) { + bpf_probe_read_kernel(&len, sizeof(len), &sgp->length); + length2 += len; + n++; + } + } + + config = 1; + result = length2 - length1; + return 0; +} From 8fd886911a6a99acf4a8facf619a2e7b5225be78 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Fri, 26 Feb 2021 21:22:47 +0100 Subject: [PATCH 48/90] bpf: Add BTF_KIND_FLOAT to uapi Add a new kind value and expand the kind bitfield. 
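For illustration only (not part of the patch, and assuming a uapi header that already carries this change), a raw FLOAT type record under the widened 5-bit kind field is a bare struct btf_type with no extra type data following it; the string-section offset below is a placeholder:

    #include <linux/btf.h>

    /* BTF_KIND_FLOAT == 16 still fits once the kind mask is widened to 0x1f */
    static const struct btf_type float_type = {
        .name_off = 1,                 /* offset of "float" in the string section (placeholder) */
        .info = BTF_KIND_FLOAT << 24,  /* vlen = 0, kind_flag = 0 */
        .size = 4,                     /* byte size of the floating-point type */
    };
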
Signed-off-by: Ilya Leoshkevich Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210226202256.116518-2-iii@linux.ibm.com --- include/uapi/linux/btf.h | 5 +++-- tools/include/uapi/linux/btf.h | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h index 5a667107ad2c..d27b1708efe9 100644 --- a/include/uapi/linux/btf.h +++ b/include/uapi/linux/btf.h @@ -52,7 +52,7 @@ struct btf_type { }; }; -#define BTF_INFO_KIND(info) (((info) >> 24) & 0x0f) +#define BTF_INFO_KIND(info) (((info) >> 24) & 0x1f) #define BTF_INFO_VLEN(info) ((info) & 0xffff) #define BTF_INFO_KFLAG(info) ((info) >> 31) @@ -72,7 +72,8 @@ struct btf_type { #define BTF_KIND_FUNC_PROTO 13 /* Function Proto */ #define BTF_KIND_VAR 14 /* Variable */ #define BTF_KIND_DATASEC 15 /* Section */ -#define BTF_KIND_MAX BTF_KIND_DATASEC +#define BTF_KIND_FLOAT 16 /* Floating point */ +#define BTF_KIND_MAX BTF_KIND_FLOAT #define NR_BTF_KINDS (BTF_KIND_MAX + 1) /* For some specific BTF_KIND, "struct btf_type" is immediately diff --git a/tools/include/uapi/linux/btf.h b/tools/include/uapi/linux/btf.h index 5a667107ad2c..d27b1708efe9 100644 --- a/tools/include/uapi/linux/btf.h +++ b/tools/include/uapi/linux/btf.h @@ -52,7 +52,7 @@ struct btf_type { }; }; -#define BTF_INFO_KIND(info) (((info) >> 24) & 0x0f) +#define BTF_INFO_KIND(info) (((info) >> 24) & 0x1f) #define BTF_INFO_VLEN(info) ((info) & 0xffff) #define BTF_INFO_KFLAG(info) ((info) >> 31) @@ -72,7 +72,8 @@ struct btf_type { #define BTF_KIND_FUNC_PROTO 13 /* Function Proto */ #define BTF_KIND_VAR 14 /* Variable */ #define BTF_KIND_DATASEC 15 /* Section */ -#define BTF_KIND_MAX BTF_KIND_DATASEC +#define BTF_KIND_FLOAT 16 /* Floating point */ +#define BTF_KIND_MAX BTF_KIND_FLOAT #define NR_BTF_KINDS (BTF_KIND_MAX + 1) /* For some specific BTF_KIND, "struct btf_type" is immediately From 1b1ce92b24331b569a444858fc487a1ca19dc778 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Fri, 26 Feb 2021 21:22:48 +0100 Subject: [PATCH 49/90] libbpf: Fix whitespace in btf_add_composite() comment Remove trailing space. Signed-off-by: Ilya Leoshkevich Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210226202256.116518-3-iii@linux.ibm.com --- tools/lib/bpf/btf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index d9c10830d749..0797ab714830 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -1883,7 +1883,7 @@ static int btf_add_composite(struct btf *btf, int kind, const char *name, __u32 * - *byte_sz* - size of the struct, in bytes; * * Struct initially has no fields in it. Fields can be added by - * btf__add_field() right after btf__add_struct() succeeds. + * btf__add_field() right after btf__add_struct() succeeds. * * Returns: * - >0, type ID of newly added BTF type; From 22541a9eeb0d968c133aaebd95fa59da3208e705 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Fri, 26 Feb 2021 21:22:49 +0100 Subject: [PATCH 50/90] libbpf: Add BTF_KIND_FLOAT support The logic follows that of BTF_KIND_INT most of the time. Sanitization replaces BTF_KIND_FLOATs with equally-sized empty BTF_KIND_STRUCTs on older kernels, for example, the following: [4] FLOAT 'float' size=4 becomes the following: [4] STRUCT '(anon)' size=4 vlen=0 With dwarves patch [1] and this patch, the older kernels, which were failing with the floating-point-related errors, will now start working correctly. 
[1] https://github.com/iii-i/dwarves/commit/btf-kind-float-v2 Signed-off-by: Ilya Leoshkevich Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210226202256.116518-4-iii@linux.ibm.com --- tools/lib/bpf/btf.c | 49 +++++++++++++++++++++++++++++++++ tools/lib/bpf/btf.h | 6 ++++ tools/lib/bpf/btf_dump.c | 4 +++ tools/lib/bpf/libbpf.c | 29 ++++++++++++++++++- tools/lib/bpf/libbpf.map | 5 ++++ tools/lib/bpf/libbpf_internal.h | 2 ++ 6 files changed, 94 insertions(+), 1 deletion(-) diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 0797ab714830..3aa58f2ac183 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -291,6 +291,7 @@ static int btf_type_size(const struct btf_type *t) case BTF_KIND_PTR: case BTF_KIND_TYPEDEF: case BTF_KIND_FUNC: + case BTF_KIND_FLOAT: return base_size; case BTF_KIND_INT: return base_size + sizeof(__u32); @@ -338,6 +339,7 @@ static int btf_bswap_type_rest(struct btf_type *t) case BTF_KIND_PTR: case BTF_KIND_TYPEDEF: case BTF_KIND_FUNC: + case BTF_KIND_FLOAT: return 0; case BTF_KIND_INT: *(__u32 *)(t + 1) = bswap_32(*(__u32 *)(t + 1)); @@ -578,6 +580,7 @@ __s64 btf__resolve_size(const struct btf *btf, __u32 type_id) case BTF_KIND_UNION: case BTF_KIND_ENUM: case BTF_KIND_DATASEC: + case BTF_KIND_FLOAT: size = t->size; goto done; case BTF_KIND_PTR: @@ -621,6 +624,7 @@ int btf__align_of(const struct btf *btf, __u32 id) switch (kind) { case BTF_KIND_INT: case BTF_KIND_ENUM: + case BTF_KIND_FLOAT: return min(btf_ptr_sz(btf), (size_t)t->size); case BTF_KIND_PTR: return btf_ptr_sz(btf); @@ -1756,6 +1760,47 @@ int btf__add_int(struct btf *btf, const char *name, size_t byte_sz, int encoding return btf_commit_type(btf, sz); } +/* + * Append new BTF_KIND_FLOAT type with: + * - *name* - non-empty, non-NULL type name; + * - *sz* - size of the type, in bytes; + * Returns: + * - >0, type ID of newly added BTF type; + * - <0, on error. 
+ */ +int btf__add_float(struct btf *btf, const char *name, size_t byte_sz) +{ + struct btf_type *t; + int sz, name_off; + + /* non-empty name */ + if (!name || !name[0]) + return -EINVAL; + + /* byte_sz must be one of the explicitly allowed values */ + if (byte_sz != 2 && byte_sz != 4 && byte_sz != 8 && byte_sz != 12 && + byte_sz != 16) + return -EINVAL; + + if (btf_ensure_modifiable(btf)) + return -ENOMEM; + + sz = sizeof(struct btf_type); + t = btf_add_type_mem(btf, sz); + if (!t) + return -ENOMEM; + + name_off = btf__add_str(btf, name); + if (name_off < 0) + return name_off; + + t->name_off = name_off; + t->info = btf_type_info(BTF_KIND_FLOAT, 0, 0); + t->size = byte_sz; + + return btf_commit_type(btf, sz); +} + /* it's completely legal to append BTF types with type IDs pointing forward to * types that haven't been appended yet, so we only make sure that id looks * sane, we can't guarantee that ID will always be valid @@ -3626,6 +3671,7 @@ static int btf_dedup_prep(struct btf_dedup *d) case BTF_KIND_FWD: case BTF_KIND_TYPEDEF: case BTF_KIND_FUNC: + case BTF_KIND_FLOAT: h = btf_hash_common(t); break; case BTF_KIND_INT: @@ -3722,6 +3768,7 @@ static int btf_dedup_prim_type(struct btf_dedup *d, __u32 type_id) break; case BTF_KIND_FWD: + case BTF_KIND_FLOAT: h = btf_hash_common(t); for_each_dedup_cand(d, hash_entry, h) { cand_id = (__u32)(long)hash_entry->value; @@ -3983,6 +4030,7 @@ static int btf_dedup_is_equiv(struct btf_dedup *d, __u32 cand_id, return btf_compat_enum(cand_type, canon_type); case BTF_KIND_FWD: + case BTF_KIND_FLOAT: return btf_equal_common(cand_type, canon_type); case BTF_KIND_CONST: @@ -4479,6 +4527,7 @@ static int btf_dedup_remap_type(struct btf_dedup *d, __u32 type_id) switch (btf_kind(t)) { case BTF_KIND_INT: case BTF_KIND_ENUM: + case BTF_KIND_FLOAT: break; case BTF_KIND_FWD: diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h index 1237bcd1dd17..029a9cfc8c2d 100644 --- a/tools/lib/bpf/btf.h +++ b/tools/lib/bpf/btf.h @@ -95,6 +95,7 @@ LIBBPF_API int btf__find_str(struct btf *btf, const char *s); LIBBPF_API int btf__add_str(struct btf *btf, const char *s); LIBBPF_API int btf__add_int(struct btf *btf, const char *name, size_t byte_sz, int encoding); +LIBBPF_API int btf__add_float(struct btf *btf, const char *name, size_t byte_sz); LIBBPF_API int btf__add_ptr(struct btf *btf, int ref_type_id); LIBBPF_API int btf__add_array(struct btf *btf, int index_type_id, int elem_type_id, __u32 nr_elems); @@ -294,6 +295,11 @@ static inline bool btf_is_datasec(const struct btf_type *t) return btf_kind(t) == BTF_KIND_DATASEC; } +static inline bool btf_is_float(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_FLOAT; +} + static inline __u8 btf_int_encoding(const struct btf_type *t) { return BTF_INT_ENCODING(*(__u32 *)(t + 1)); diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c index 2f9d685bd522..5e957fcceee6 100644 --- a/tools/lib/bpf/btf_dump.c +++ b/tools/lib/bpf/btf_dump.c @@ -279,6 +279,7 @@ static int btf_dump_mark_referenced(struct btf_dump *d) case BTF_KIND_INT: case BTF_KIND_ENUM: case BTF_KIND_FWD: + case BTF_KIND_FLOAT: break; case BTF_KIND_VOLATILE: @@ -453,6 +454,7 @@ static int btf_dump_order_type(struct btf_dump *d, __u32 id, bool through_ptr) switch (btf_kind(t)) { case BTF_KIND_INT: + case BTF_KIND_FLOAT: tstate->order_state = ORDERED; return 0; @@ -1133,6 +1135,7 @@ skip_mod: case BTF_KIND_STRUCT: case BTF_KIND_UNION: case BTF_KIND_TYPEDEF: + case BTF_KIND_FLOAT: goto done; default: pr_warn("unexpected type in decl chain, kind:%u, 
id:[%u]\n", @@ -1247,6 +1250,7 @@ static void btf_dump_emit_type_chain(struct btf_dump *d, switch (kind) { case BTF_KIND_INT: + case BTF_KIND_FLOAT: btf_dump_emit_mods(d, decls); name = btf_name_of(d, t->name_off); btf_dump_printf(d, "%s", name); diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 62d9ed56b081..2f351d3ad3e7 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -178,6 +178,8 @@ enum kern_feature_id { FEAT_PROG_BIND_MAP, /* Kernel support for module BTFs */ FEAT_MODULE_BTF, + /* BTF_KIND_FLOAT support */ + FEAT_BTF_FLOAT, __FEAT_CNT, }; @@ -1946,6 +1948,7 @@ static const char *btf_kind_str(const struct btf_type *t) case BTF_KIND_FUNC_PROTO: return "func_proto"; case BTF_KIND_VAR: return "var"; case BTF_KIND_DATASEC: return "datasec"; + case BTF_KIND_FLOAT: return "float"; default: return "unknown"; } } @@ -2395,15 +2398,17 @@ static bool btf_needs_sanitization(struct bpf_object *obj) { bool has_func_global = kernel_supports(FEAT_BTF_GLOBAL_FUNC); bool has_datasec = kernel_supports(FEAT_BTF_DATASEC); + bool has_float = kernel_supports(FEAT_BTF_FLOAT); bool has_func = kernel_supports(FEAT_BTF_FUNC); - return !has_func || !has_datasec || !has_func_global; + return !has_func || !has_datasec || !has_func_global || !has_float; } static void bpf_object__sanitize_btf(struct bpf_object *obj, struct btf *btf) { bool has_func_global = kernel_supports(FEAT_BTF_GLOBAL_FUNC); bool has_datasec = kernel_supports(FEAT_BTF_DATASEC); + bool has_float = kernel_supports(FEAT_BTF_FLOAT); bool has_func = kernel_supports(FEAT_BTF_FUNC); struct btf_type *t; int i, j, vlen; @@ -2456,6 +2461,13 @@ static void bpf_object__sanitize_btf(struct bpf_object *obj, struct btf *btf) } else if (!has_func_global && btf_is_func(t)) { /* replace BTF_FUNC_GLOBAL with BTF_FUNC_STATIC */ t->info = BTF_INFO_ENC(BTF_KIND_FUNC, 0, 0); + } else if (!has_float && btf_is_float(t)) { + /* replace FLOAT with an equally-sized empty STRUCT; + * since C compilers do not accept e.g. 
"float" as a + * valid struct name, make it anonymous + */ + t->name_off = 0; + t->info = BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 0); } } } @@ -3927,6 +3939,18 @@ static int probe_kern_btf_datasec(void) strs, sizeof(strs))); } +static int probe_kern_btf_float(void) +{ + static const char strs[] = "\0float"; + __u32 types[] = { + /* float */ + BTF_TYPE_FLOAT_ENC(1, 4), + }; + + return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types), + strs, sizeof(strs))); +} + static int probe_kern_array_mmap(void) { struct bpf_create_map_attr attr = { @@ -4106,6 +4130,9 @@ static struct kern_feature_desc { [FEAT_MODULE_BTF] = { "module BTF support", probe_module_btf, }, + [FEAT_BTF_FLOAT] = { + "BTF_KIND_FLOAT support", probe_kern_btf_float, + }, }; static bool kernel_supports(enum kern_feature_id feat_id) diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 1c0fd2dd233a..ec898f464ab9 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -350,3 +350,8 @@ LIBBPF_0.3.0 { xsk_setup_xdp_prog; xsk_socket__update_xskmap; } LIBBPF_0.2.0; + +LIBBPF_0.4.0 { + global: + btf__add_float; +} LIBBPF_0.3.0; diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h index 969d0ac592ba..343f6eb05637 100644 --- a/tools/lib/bpf/libbpf_internal.h +++ b/tools/lib/bpf/libbpf_internal.h @@ -31,6 +31,8 @@ #define BTF_MEMBER_ENC(name, type, bits_offset) (name), (type), (bits_offset) #define BTF_PARAM_ENC(name, type) (name), (type) #define BTF_VAR_SECINFO_ENC(type, offset, size) (type), (offset), (size) +#define BTF_TYPE_FLOAT_ENC(name, sz) \ + BTF_TYPE_ENC(name, BTF_INFO_ENC(BTF_KIND_FLOAT, 0, 0), sz) #ifndef likely #define likely(x) __builtin_expect(!!(x), 1) From 737e0f919a8d2a313618d8ac67d50e8223bc5d74 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Fri, 26 Feb 2021 21:22:50 +0100 Subject: [PATCH 51/90] tools/bpftool: Add BTF_KIND_FLOAT support Only dumping support needs to be adjusted, the code structure follows that of BTF_KIND_INT. 
Example plain and JSON outputs: [4] FLOAT 'float' size=4 {"id":4,"kind":"FLOAT","name":"float","size":4} Signed-off-by: Ilya Leoshkevich Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210226202256.116518-5-iii@linux.ibm.com --- tools/bpf/bpftool/btf.c | 8 ++++++++ tools/bpf/bpftool/btf_dumper.c | 1 + 2 files changed, 9 insertions(+) diff --git a/tools/bpf/bpftool/btf.c b/tools/bpf/bpftool/btf.c index fe9e7b3a4b50..985610c3f193 100644 --- a/tools/bpf/bpftool/btf.c +++ b/tools/bpf/bpftool/btf.c @@ -36,6 +36,7 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = { [BTF_KIND_FUNC_PROTO] = "FUNC_PROTO", [BTF_KIND_VAR] = "VAR", [BTF_KIND_DATASEC] = "DATASEC", + [BTF_KIND_FLOAT] = "FLOAT", }; struct btf_attach_table { @@ -327,6 +328,13 @@ static int dump_btf_type(const struct btf *btf, __u32 id, jsonw_end_array(w); break; } + case BTF_KIND_FLOAT: { + if (json_output) + jsonw_uint_field(w, "size", t->size); + else + printf(" size=%u", t->size); + break; + } default: break; } diff --git a/tools/bpf/bpftool/btf_dumper.c b/tools/bpf/bpftool/btf_dumper.c index 0e9310727281..7ca54d046362 100644 --- a/tools/bpf/bpftool/btf_dumper.c +++ b/tools/bpf/bpftool/btf_dumper.c @@ -596,6 +596,7 @@ static int __btf_dumper_type_only(const struct btf *btf, __u32 type_id, switch (BTF_INFO_KIND(t->info)) { case BTF_KIND_INT: case BTF_KIND_TYPEDEF: + case BTF_KIND_FLOAT: BTF_PRINT_ARG("%s ", btf__name_by_offset(btf, t->name_off)); break; case BTF_KIND_STRUCT: From eea154a852e827c003215f7beed3e10f05471a86 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Fri, 26 Feb 2021 21:22:51 +0100 Subject: [PATCH 52/90] selftests/bpf: Use the 25th bit in the "invalid BTF_INFO" test The bit being checked by this test is no longer reserved after introducing BTF_KIND_FLOAT, so use the next one instead. Signed-off-by: Ilya Leoshkevich Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210226202256.116518-6-iii@linux.ibm.com --- tools/testing/selftests/bpf/prog_tests/btf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/btf.c b/tools/testing/selftests/bpf/prog_tests/btf.c index 6a7ee7420701..c29406736138 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf.c +++ b/tools/testing/selftests/bpf/prog_tests/btf.c @@ -1903,7 +1903,7 @@ static struct btf_raw_test raw_tests[] = { .raw_types = { /* int */ /* [1] */ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), - BTF_TYPE_ENC(0, 0x10000000, 4), + BTF_TYPE_ENC(0, 0x20000000, 4), BTF_END_RAW, }, .str_sec = "", From b1828f0b04828aa8cccadf00a702f459caefeed9 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Fri, 26 Feb 2021 21:22:52 +0100 Subject: [PATCH 53/90] bpf: Add BTF_KIND_FLOAT support On the kernel side, introduce a new btf_kind_operations. It is similar to that of BTF_KIND_INT, however, it does not need to handle encodings and bit offsets. Do not implement printing, since the kernel does not know how to format floating-point values. 
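As a rough userspace sketch (not part of this patch), the new kernel-side checks can be exercised by building a BTF blob with the btf__add_float() API added earlier in this series and loading it; error handling is abbreviated and the type name is arbitrary:

    #include <stdio.h>
    #include <bpf/btf.h>
    #include <bpf/libbpf.h>

    int main(void)
    {
        struct btf *btf = btf__new_empty();
        int id, err;

        if (libbpf_get_error(btf))
            return 1;

        /* new libbpf API from this series; size must be 2, 4, 8, 12 or 16 */
        id = btf__add_float(btf, "double", 8);
        if (id < 0) {
            btf__free(btf);
            return 1;
        }

        /* the kernel runs btf_float_check_meta() while verifying the blob */
        err = btf__load(btf);
        printf("float type id %d, kernel load %s\n", id, err ? "rejected" : "accepted");

        btf__free(btf);
        return err ? 1 : 0;
    }
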
Signed-off-by: Ilya Leoshkevich Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210226202256.116518-7-iii@linux.ibm.com --- kernel/bpf/btf.c | 83 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 81 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 16e8148a28e2..c914d68f3bf8 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -173,7 +173,7 @@ #define BITS_ROUNDUP_BYTES(bits) \ (BITS_ROUNDDOWN_BYTES(bits) + !!BITS_PER_BYTE_MASKED(bits)) -#define BTF_INFO_MASK 0x8f00ffff +#define BTF_INFO_MASK 0x9f00ffff #define BTF_INT_MASK 0x0fffffff #define BTF_TYPE_ID_VALID(type_id) ((type_id) <= BTF_MAX_TYPE) #define BTF_STR_OFFSET_VALID(name_off) ((name_off) <= BTF_MAX_NAME_OFFSET) @@ -280,6 +280,7 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = { [BTF_KIND_FUNC_PROTO] = "FUNC_PROTO", [BTF_KIND_VAR] = "VAR", [BTF_KIND_DATASEC] = "DATASEC", + [BTF_KIND_FLOAT] = "FLOAT", }; static const char *btf_type_str(const struct btf_type *t) @@ -574,6 +575,7 @@ static bool btf_type_has_size(const struct btf_type *t) case BTF_KIND_UNION: case BTF_KIND_ENUM: case BTF_KIND_DATASEC: + case BTF_KIND_FLOAT: return true; } @@ -1704,6 +1706,7 @@ __btf_resolve_size(const struct btf *btf, const struct btf_type *type, case BTF_KIND_STRUCT: case BTF_KIND_UNION: case BTF_KIND_ENUM: + case BTF_KIND_FLOAT: size = type->size; goto resolved; @@ -1849,7 +1852,7 @@ static int btf_df_check_kflag_member(struct btf_verifier_env *env, return -EINVAL; } -/* Used for ptr, array and struct/union type members. +/* Used for ptr, array struct/union and float type members. * int, enum and modifier types have their specific callback functions. */ static int btf_generic_check_kflag_member(struct btf_verifier_env *env, @@ -3675,6 +3678,81 @@ static const struct btf_kind_operations datasec_ops = { .show = btf_datasec_show, }; +static s32 btf_float_check_meta(struct btf_verifier_env *env, + const struct btf_type *t, + u32 meta_left) +{ + if (btf_type_vlen(t)) { + btf_verifier_log_type(env, t, "vlen != 0"); + return -EINVAL; + } + + if (btf_type_kflag(t)) { + btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); + return -EINVAL; + } + + if (t->size != 2 && t->size != 4 && t->size != 8 && t->size != 12 && + t->size != 16) { + btf_verifier_log_type(env, t, "Invalid type_size"); + return -EINVAL; + } + + btf_verifier_log_type(env, t, NULL); + + return 0; +} + +static int btf_float_check_member(struct btf_verifier_env *env, + const struct btf_type *struct_type, + const struct btf_member *member, + const struct btf_type *member_type) +{ + u64 start_offset_bytes; + u64 end_offset_bytes; + u64 misalign_bits; + u64 align_bytes; + u64 align_bits; + + /* Different architectures have different alignment requirements, so + * here we check only for the reasonable minimum. This way we ensure + * that types after CO-RE can pass the kernel BTF verifier. 
+ */ + align_bytes = min_t(u64, sizeof(void *), member_type->size); + align_bits = align_bytes * BITS_PER_BYTE; + div64_u64_rem(member->offset, align_bits, &misalign_bits); + if (misalign_bits) { + btf_verifier_log_member(env, struct_type, member, + "Member is not properly aligned"); + return -EINVAL; + } + + start_offset_bytes = member->offset / BITS_PER_BYTE; + end_offset_bytes = start_offset_bytes + member_type->size; + if (end_offset_bytes > struct_type->size) { + btf_verifier_log_member(env, struct_type, member, + "Member exceeds struct_size"); + return -EINVAL; + } + + return 0; +} + +static void btf_float_log(struct btf_verifier_env *env, + const struct btf_type *t) +{ + btf_verifier_log(env, "size=%u", t->size); +} + +static const struct btf_kind_operations float_ops = { + .check_meta = btf_float_check_meta, + .resolve = btf_df_resolve, + .check_member = btf_float_check_member, + .check_kflag_member = btf_generic_check_kflag_member, + .log_details = btf_float_log, + .show = btf_df_show, +}; + static int btf_func_proto_check(struct btf_verifier_env *env, const struct btf_type *t) { @@ -3808,6 +3886,7 @@ static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = { [BTF_KIND_FUNC_PROTO] = &func_proto_ops, [BTF_KIND_VAR] = &var_ops, [BTF_KIND_DATASEC] = &datasec_ops, + [BTF_KIND_FLOAT] = &float_ops, }; static s32 btf_check_meta(struct btf_verifier_env *env, From 7e72aad3a15c06e40e3ccd2352e5010e978f1acf Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Fri, 26 Feb 2021 21:22:53 +0100 Subject: [PATCH 54/90] selftest/bpf: Add BTF_KIND_FLOAT tests Test the good variants as well as the potential malformed ones. Signed-off-by: Ilya Leoshkevich Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210226202256.116518-8-iii@linux.ibm.com --- tools/testing/selftests/bpf/btf_helpers.c | 4 + tools/testing/selftests/bpf/prog_tests/btf.c | 131 +++++++++++++++++++ tools/testing/selftests/bpf/test_btf.h | 3 + 3 files changed, 138 insertions(+) diff --git a/tools/testing/selftests/bpf/btf_helpers.c b/tools/testing/selftests/bpf/btf_helpers.c index 48f90490f922..b692e6ead9b5 100644 --- a/tools/testing/selftests/bpf/btf_helpers.c +++ b/tools/testing/selftests/bpf/btf_helpers.c @@ -23,6 +23,7 @@ static const char * const btf_kind_str_mapping[] = { [BTF_KIND_FUNC_PROTO] = "FUNC_PROTO", [BTF_KIND_VAR] = "VAR", [BTF_KIND_DATASEC] = "DATASEC", + [BTF_KIND_FLOAT] = "FLOAT", }; static const char *btf_kind_str(__u16 kind) @@ -173,6 +174,9 @@ int fprintf_btf_type_raw(FILE *out, const struct btf *btf, __u32 id) } break; } + case BTF_KIND_FLOAT: + fprintf(out, " size=%u", t->size); + break; default: break; } diff --git a/tools/testing/selftests/bpf/prog_tests/btf.c b/tools/testing/selftests/bpf/prog_tests/btf.c index c29406736138..11d98d3cf949 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf.c +++ b/tools/testing/selftests/bpf/prog_tests/btf.c @@ -3531,6 +3531,136 @@ static struct btf_raw_test raw_tests[] = { .max_entries = 1, }, +{ + .descr = "float test #1, well-formed", + .raw_types = { + BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4), + /* [1] */ + BTF_TYPE_FLOAT_ENC(NAME_TBD, 2), /* [2] */ + BTF_TYPE_FLOAT_ENC(NAME_TBD, 4), /* [3] */ + BTF_TYPE_FLOAT_ENC(NAME_TBD, 8), /* [4] */ + BTF_TYPE_FLOAT_ENC(NAME_TBD, 12), /* [5] */ + BTF_TYPE_FLOAT_ENC(NAME_TBD, 16), /* [6] */ + BTF_STRUCT_ENC(NAME_TBD, 5, 48), /* [7] */ + BTF_MEMBER_ENC(NAME_TBD, 2, 0), + BTF_MEMBER_ENC(NAME_TBD, 3, 32), + BTF_MEMBER_ENC(NAME_TBD, 4, 64), 
+ BTF_MEMBER_ENC(NAME_TBD, 5, 128), + BTF_MEMBER_ENC(NAME_TBD, 6, 256), + BTF_END_RAW, + }, + BTF_STR_SEC("\0int\0_Float16\0float\0double\0_Float80\0long_double" + "\0floats\0a\0b\0c\0d\0e"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "float_type_check_btf", + .key_size = sizeof(int), + .value_size = 48, + .key_type_id = 1, + .value_type_id = 7, + .max_entries = 1, +}, +{ + .descr = "float test #2, invalid vlen", + .raw_types = { + BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4), + /* [1] */ + BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_FLOAT, 0, 1), 4), + /* [2] */ + BTF_END_RAW, + }, + BTF_STR_SEC("\0int\0float"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "float_type_check_btf", + .key_size = sizeof(int), + .value_size = 4, + .key_type_id = 1, + .value_type_id = 2, + .max_entries = 1, + .btf_load_err = true, + .err_str = "vlen != 0", +}, +{ + .descr = "float test #3, invalid kind_flag", + .raw_types = { + BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4), + /* [1] */ + BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_FLOAT, 1, 0), 4), + /* [2] */ + BTF_END_RAW, + }, + BTF_STR_SEC("\0int\0float"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "float_type_check_btf", + .key_size = sizeof(int), + .value_size = 4, + .key_type_id = 1, + .value_type_id = 2, + .max_entries = 1, + .btf_load_err = true, + .err_str = "Invalid btf_info kind_flag", +}, +{ + .descr = "float test #4, member does not fit", + .raw_types = { + BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4), + /* [1] */ + BTF_TYPE_FLOAT_ENC(NAME_TBD, 4), /* [2] */ + BTF_STRUCT_ENC(NAME_TBD, 1, 2), /* [3] */ + BTF_MEMBER_ENC(NAME_TBD, 2, 0), + BTF_END_RAW, + }, + BTF_STR_SEC("\0int\0float\0floats\0x"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "float_type_check_btf", + .key_size = sizeof(int), + .value_size = 4, + .key_type_id = 1, + .value_type_id = 3, + .max_entries = 1, + .btf_load_err = true, + .err_str = "Member exceeds struct_size", +}, +{ + .descr = "float test #5, member is not properly aligned", + .raw_types = { + BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4), + /* [1] */ + BTF_TYPE_FLOAT_ENC(NAME_TBD, 4), /* [2] */ + BTF_STRUCT_ENC(NAME_TBD, 1, 8), /* [3] */ + BTF_MEMBER_ENC(NAME_TBD, 2, 8), + BTF_END_RAW, + }, + BTF_STR_SEC("\0int\0float\0floats\0x"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "float_type_check_btf", + .key_size = sizeof(int), + .value_size = 4, + .key_type_id = 1, + .value_type_id = 3, + .max_entries = 1, + .btf_load_err = true, + .err_str = "Member is not properly aligned", +}, +{ + .descr = "float test #6, invalid size", + .raw_types = { + BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4), + /* [1] */ + BTF_TYPE_FLOAT_ENC(NAME_TBD, 6), /* [2] */ + BTF_END_RAW, + }, + BTF_STR_SEC("\0int\0float"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "float_type_check_btf", + .key_size = sizeof(int), + .value_size = 6, + .key_type_id = 1, + .value_type_id = 2, + .max_entries = 1, + .btf_load_err = true, + .err_str = "Invalid type_size", +}, + }; /* struct btf_raw_test raw_tests[] */ static const char *get_next_str(const char *start, const char *end) @@ -6630,6 +6760,7 @@ static int btf_type_size(const struct btf_type *t) case BTF_KIND_PTR: case BTF_KIND_TYPEDEF: case BTF_KIND_FUNC: + case BTF_KIND_FLOAT: return base_size; case BTF_KIND_INT: return base_size + sizeof(__u32); diff --git a/tools/testing/selftests/bpf/test_btf.h b/tools/testing/selftests/bpf/test_btf.h index 2023725f1962..e2394eea4b7f 100644 --- a/tools/testing/selftests/bpf/test_btf.h +++ 
b/tools/testing/selftests/bpf/test_btf.h @@ -66,4 +66,7 @@ #define BTF_FUNC_ENC(name, func_proto) \ BTF_TYPE_ENC(name, BTF_INFO_ENC(BTF_KIND_FUNC, 0, 0), func_proto) +#define BTF_TYPE_FLOAT_ENC(name, sz) \ + BTF_TYPE_ENC(name, BTF_INFO_ENC(BTF_KIND_FLOAT, 0, 0), sz) + #endif /* _TEST_BTF_H */ From 7999cf7df899caf244236dcc11cce844347dab4a Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Fri, 26 Feb 2021 21:22:54 +0100 Subject: [PATCH 55/90] selftests/bpf: Add BTF_KIND_FLOAT to the existing deduplication tests Check that floats don't interfere with struct deduplication, that they are not merged with another kinds and that floats of different sizes are not merged with each other. Suggested-by: Andrii Nakryiko Signed-off-by: Ilya Leoshkevich Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210226202256.116518-9-iii@linux.ibm.com --- tools/testing/selftests/bpf/prog_tests/btf.c | 43 ++++++++++++++------ 1 file changed, 31 insertions(+), 12 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/btf.c b/tools/testing/selftests/bpf/prog_tests/btf.c index 11d98d3cf949..0457ae32b270 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf.c +++ b/tools/testing/selftests/bpf/prog_tests/btf.c @@ -6411,11 +6411,12 @@ const struct btf_dedup_test dedup_tests[] = { /* int[16] */ BTF_TYPE_ARRAY_ENC(1, 1, 16), /* [2] */ /* struct s { */ - BTF_STRUCT_ENC(NAME_NTH(2), 4, 84), /* [3] */ + BTF_STRUCT_ENC(NAME_NTH(2), 5, 88), /* [3] */ BTF_MEMBER_ENC(NAME_NTH(3), 4, 0), /* struct s *next; */ BTF_MEMBER_ENC(NAME_NTH(4), 5, 64), /* const int *a; */ BTF_MEMBER_ENC(NAME_NTH(5), 2, 128), /* int b[16]; */ BTF_MEMBER_ENC(NAME_NTH(6), 1, 640), /* int c; */ + BTF_MEMBER_ENC(NAME_NTH(8), 13, 672), /* float d; */ /* ptr -> [3] struct s */ BTF_PTR_ENC(3), /* [4] */ /* ptr -> [6] const int */ @@ -6426,39 +6427,43 @@ const struct btf_dedup_test dedup_tests[] = { /* full copy of the above */ BTF_TYPE_INT_ENC(NAME_NTH(1), BTF_INT_SIGNED, 0, 32, 4), /* [7] */ BTF_TYPE_ARRAY_ENC(7, 7, 16), /* [8] */ - BTF_STRUCT_ENC(NAME_NTH(2), 4, 84), /* [9] */ + BTF_STRUCT_ENC(NAME_NTH(2), 5, 88), /* [9] */ BTF_MEMBER_ENC(NAME_NTH(3), 10, 0), BTF_MEMBER_ENC(NAME_NTH(4), 11, 64), BTF_MEMBER_ENC(NAME_NTH(5), 8, 128), BTF_MEMBER_ENC(NAME_NTH(6), 7, 640), + BTF_MEMBER_ENC(NAME_NTH(8), 13, 672), BTF_PTR_ENC(9), /* [10] */ BTF_PTR_ENC(12), /* [11] */ BTF_CONST_ENC(7), /* [12] */ + BTF_TYPE_FLOAT_ENC(NAME_NTH(7), 4), /* [13] */ BTF_END_RAW, }, - BTF_STR_SEC("\0int\0s\0next\0a\0b\0c\0"), + BTF_STR_SEC("\0int\0s\0next\0a\0b\0c\0float\0d"), }, .expect = { .raw_types = { /* int */ - BTF_TYPE_INT_ENC(NAME_NTH(4), BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + BTF_TYPE_INT_ENC(NAME_NTH(5), BTF_INT_SIGNED, 0, 32, 4), /* [1] */ /* int[16] */ BTF_TYPE_ARRAY_ENC(1, 1, 16), /* [2] */ /* struct s { */ - BTF_STRUCT_ENC(NAME_NTH(6), 4, 84), /* [3] */ - BTF_MEMBER_ENC(NAME_NTH(5), 4, 0), /* struct s *next; */ + BTF_STRUCT_ENC(NAME_NTH(8), 5, 88), /* [3] */ + BTF_MEMBER_ENC(NAME_NTH(7), 4, 0), /* struct s *next; */ BTF_MEMBER_ENC(NAME_NTH(1), 5, 64), /* const int *a; */ BTF_MEMBER_ENC(NAME_NTH(2), 2, 128), /* int b[16]; */ BTF_MEMBER_ENC(NAME_NTH(3), 1, 640), /* int c; */ + BTF_MEMBER_ENC(NAME_NTH(4), 7, 672), /* float d; */ /* ptr -> [3] struct s */ BTF_PTR_ENC(3), /* [4] */ /* ptr -> [6] const int */ BTF_PTR_ENC(6), /* [5] */ /* const -> [1] int */ BTF_CONST_ENC(1), /* [6] */ + BTF_TYPE_FLOAT_ENC(NAME_NTH(7), 4), /* [7] */ BTF_END_RAW, }, - BTF_STR_SEC("\0a\0b\0c\0int\0next\0s"), + 
BTF_STR_SEC("\0a\0b\0c\0d\0int\0float\0next\0s"), }, .opts = { .dont_resolve_fwds = false, @@ -6579,9 +6584,10 @@ const struct btf_dedup_test dedup_tests[] = { BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 1), BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 8), BTF_FUNC_ENC(NAME_TBD, 12), /* [13] func */ + BTF_TYPE_FLOAT_ENC(NAME_TBD, 2), /* [14] float */ BTF_END_RAW, }, - BTF_STR_SEC("\0A\0B\0C\0D\0E\0F\0G\0H\0I\0J\0K\0L\0M"), + BTF_STR_SEC("\0A\0B\0C\0D\0E\0F\0G\0H\0I\0J\0K\0L\0M\0N"), }, .expect = { .raw_types = { @@ -6604,16 +6610,17 @@ const struct btf_dedup_test dedup_tests[] = { BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 1), BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 8), BTF_FUNC_ENC(NAME_TBD, 12), /* [13] func */ + BTF_TYPE_FLOAT_ENC(NAME_TBD, 2), /* [14] float */ BTF_END_RAW, }, - BTF_STR_SEC("\0A\0B\0C\0D\0E\0F\0G\0H\0I\0J\0K\0L\0M"), + BTF_STR_SEC("\0A\0B\0C\0D\0E\0F\0G\0H\0I\0J\0K\0L\0M\0N"), }, .opts = { .dont_resolve_fwds = false, }, }, { - .descr = "dedup: no int duplicates", + .descr = "dedup: no int/float duplicates", .input = { .raw_types = { BTF_TYPE_INT_ENC(NAME_NTH(1), BTF_INT_SIGNED, 0, 32, 8), @@ -6628,9 +6635,15 @@ const struct btf_dedup_test dedup_tests[] = { BTF_TYPE_INT_ENC(NAME_NTH(1), BTF_INT_SIGNED, 0, 27, 8), /* different byte size */ BTF_TYPE_INT_ENC(NAME_NTH(1), BTF_INT_SIGNED, 0, 32, 4), + /* all allowed sizes */ + BTF_TYPE_FLOAT_ENC(NAME_NTH(3), 2), + BTF_TYPE_FLOAT_ENC(NAME_NTH(3), 4), + BTF_TYPE_FLOAT_ENC(NAME_NTH(3), 8), + BTF_TYPE_FLOAT_ENC(NAME_NTH(3), 12), + BTF_TYPE_FLOAT_ENC(NAME_NTH(3), 16), BTF_END_RAW, }, - BTF_STR_SEC("\0int\0some other int"), + BTF_STR_SEC("\0int\0some other int\0float"), }, .expect = { .raw_types = { @@ -6646,9 +6659,15 @@ const struct btf_dedup_test dedup_tests[] = { BTF_TYPE_INT_ENC(NAME_NTH(1), BTF_INT_SIGNED, 0, 27, 8), /* different byte size */ BTF_TYPE_INT_ENC(NAME_NTH(1), BTF_INT_SIGNED, 0, 32, 4), + /* all allowed sizes */ + BTF_TYPE_FLOAT_ENC(NAME_NTH(3), 2), + BTF_TYPE_FLOAT_ENC(NAME_NTH(3), 4), + BTF_TYPE_FLOAT_ENC(NAME_NTH(3), 8), + BTF_TYPE_FLOAT_ENC(NAME_NTH(3), 12), + BTF_TYPE_FLOAT_ENC(NAME_NTH(3), 16), BTF_END_RAW, }, - BTF_STR_SEC("\0int\0some other int"), + BTF_STR_SEC("\0int\0some other int\0float"), }, .opts = { .dont_resolve_fwds = false, From 6be6a0baffc1357b6d2023155753f111624c4fec Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Fri, 26 Feb 2021 21:22:56 +0100 Subject: [PATCH 56/90] bpf: Document BTF_KIND_FLOAT in btf.rst Also document the expansion of the kind bitfield. Signed-off-by: Ilya Leoshkevich Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210226202256.116518-11-iii@linux.ibm.com --- Documentation/bpf/btf.rst | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/Documentation/bpf/btf.rst b/Documentation/bpf/btf.rst index 44dc789de2b4..846354cd2d69 100644 --- a/Documentation/bpf/btf.rst +++ b/Documentation/bpf/btf.rst @@ -84,6 +84,7 @@ sequentially and type id is assigned to each recognized type starting from id #define BTF_KIND_FUNC_PROTO 13 /* Function Proto */ #define BTF_KIND_VAR 14 /* Variable */ #define BTF_KIND_DATASEC 15 /* Section */ + #define BTF_KIND_FLOAT 16 /* Floating point */ Note that the type section encodes debug info, not just pure types. ``BTF_KIND_FUNC`` is not a type, and it represents a defined subprogram. @@ -95,8 +96,8 @@ Each type contains the following common data:: /* "info" bits arrangement * bits 0-15: vlen (e.g. # of struct's members) * bits 16-23: unused - * bits 24-27: kind (e.g. 
int, ptr, array...etc) - * bits 28-30: unused + * bits 24-28: kind (e.g. int, ptr, array...etc) + * bits 29-30: unused * bit 31: kind_flag, currently used by * struct, union and fwd */ @@ -452,6 +453,18 @@ map definition. * ``offset``: the in-section offset of the variable * ``size``: the size of the variable in bytes +2.2.16 BTF_KIND_FLOAT +~~~~~~~~~~~~~~~~~~~~~ + +``struct btf_type`` encoding requirement: + * ``name_off``: any valid offset + * ``info.kind_flag``: 0 + * ``info.kind``: BTF_KIND_FLOAT + * ``info.vlen``: 0 + * ``size``: the size of the float type in bytes: 2, 4, 8, 12 or 16. + +No additional type data follow ``btf_type``. + 3. BTF Kernel API ***************** From 7799e4d9d84f6f8231dfd9dca4da5f4b2f0aa932 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Tue, 2 Mar 2021 09:19:33 -0800 Subject: [PATCH 57/90] bpf: Import syscall arg documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These descriptions are present in the man-pages project from the original submissions around 2015-2016. Import them so that they can be kept up to date as developers extend the bpf syscall commands. These descriptions follow the pattern used by scripts/bpf_helpers_doc.py so that we can take advantage of the parser to generate more up-to-date man page writing based upon these headers. Some minor wording adjustments were made to make the descriptions more consistent for the description / return format. Signed-off-by: Joe Stringer Signed-off-by: Alexei Starovoitov Reviewed-by: Quentin Monnet Acked-by: Toke Høiland-Jørgensen Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210302171947.2268128-2-joe@cilium.io Co-authored-by: Alexei Starovoitov Co-authored-by: Michael Kerrisk --- include/uapi/linux/bpf.h | 122 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 121 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b89af20cfa19..fb16c590e6d9 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -93,7 +93,127 @@ union bpf_iter_link_info { } map; }; -/* BPF syscall commands, see bpf(2) man-page for details. */ +/* BPF syscall commands, see bpf(2) man-page for more details. */ +/** + * DOC: eBPF Syscall Preamble + * + * The operation to be performed by the **bpf**\ () system call is determined + * by the *cmd* argument. Each operation takes an accompanying argument, + * provided via *attr*, which is a pointer to a union of type *bpf_attr* (see + * below). The size argument is the size of the union pointed to by *attr*. + */ +/** + * DOC: eBPF Syscall Commands + * + * BPF_MAP_CREATE + * Description + * Create a map and return a file descriptor that refers to the + * map. The close-on-exec file descriptor flag (see **fcntl**\ (2)) + * is automatically enabled for the new file descriptor. + * + * Applying **close**\ (2) to the file descriptor returned by + * **BPF_MAP_CREATE** will delete the map (but see NOTES). + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_MAP_LOOKUP_ELEM + * Description + * Look up an element with a given *key* in the map referred to + * by the file descriptor *map_fd*. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_MAP_UPDATE_ELEM + * Description + * Create or update an element (key/value pair) in a specified map. 
+ * + * The *flags* argument should be specified as one of the + * following: + * + * **BPF_ANY** + * Create a new element or update an existing element. + * **BPF_NOEXIST** + * Create a new element only if it did not exist. + * **BPF_EXIST** + * Update an existing element. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * May set *errno* to **EINVAL**, **EPERM**, **ENOMEM**, + * **E2BIG**, **EEXIST**, or **ENOENT**. + * + * **E2BIG** + * The number of elements in the map reached the + * *max_entries* limit specified at map creation time. + * **EEXIST** + * If *flags* specifies **BPF_NOEXIST** and the element + * with *key* already exists in the map. + * **ENOENT** + * If *flags* specifies **BPF_EXIST** and the element with + * *key* does not exist in the map. + * + * BPF_MAP_DELETE_ELEM + * Description + * Look up and delete an element by key in a specified map. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_MAP_GET_NEXT_KEY + * Description + * Look up an element by key in a specified map and return the key + * of the next element. Can be used to iterate over all elements + * in the map. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * The following cases can be used to iterate over all elements of + * the map: + * + * * If *key* is not found, the operation returns zero and sets + * the *next_key* pointer to the key of the first element. + * * If *key* is found, the operation returns zero and sets the + * *next_key* pointer to the key of the next element. + * * If *key* is the last element, returns -1 and *errno* is set + * to **ENOENT**. + * + * May set *errno* to **ENOMEM**, **EFAULT**, **EPERM**, or + * **EINVAL** on error. + * + * BPF_PROG_LOAD + * Description + * Verify and load an eBPF program, returning a new file + * descriptor associated with the program. + * + * Applying **close**\ (2) to the file descriptor returned by + * **BPF_PROG_LOAD** will unload the eBPF program (but see NOTES). + * + * The close-on-exec file descriptor flag (see **fcntl**\ (2)) is + * automatically enabled for the new file descriptor. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * NOTES + * eBPF objects (maps and programs) can be shared between processes. + * For example, after **fork**\ (2), the child inherits file descriptors + * referring to the same eBPF objects. In addition, file descriptors + * referring to eBPF objects can be transferred over UNIX domain sockets. + * File descriptors referring to eBPF objects can be duplicated in the + * usual way, using **dup**\ (2) and similar calls. An eBPF object is + * deallocated only after all file descriptors referring to the object + * have been closed. + */ enum bpf_cmd { BPF_MAP_CREATE, BPF_MAP_LOOKUP_ELEM, From f67c9cbf6c581468f6c7144d497565cfc7918c31 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Tue, 2 Mar 2021 09:19:34 -0800 Subject: [PATCH 58/90] bpf: Add minimal bpf() command documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce high-level descriptions of the intent and return codes of the bpf() syscall commands. Subsequent patches may further flesh out the content to provide a more useful programming reference. 
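To complement these descriptions, here is a minimal userspace sketch (not part of this patch) that drives a few of the documented commands through the raw syscall; the sys_bpf() wrapper is local to the example:

    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/bpf.h>

    static long sys_bpf(enum bpf_cmd cmd, union bpf_attr *attr)
    {
        return syscall(__NR_bpf, cmd, attr, sizeof(*attr));
    }

    int main(void)
    {
        union bpf_attr attr;
        int key = 1, val = 42, out = 0, map_fd;

        /* BPF_MAP_CREATE: returns a new map fd with close-on-exec set */
        memset(&attr, 0, sizeof(attr));
        attr.map_type = BPF_MAP_TYPE_ARRAY;
        attr.key_size = sizeof(key);
        attr.value_size = sizeof(val);
        attr.max_entries = 2;
        map_fd = sys_bpf(BPF_MAP_CREATE, &attr);
        if (map_fd < 0)
            return 1;

        /* BPF_MAP_UPDATE_ELEM with BPF_ANY: create or update the element */
        memset(&attr, 0, sizeof(attr));
        attr.map_fd = map_fd;
        attr.key = (__u64)(unsigned long)&key;
        attr.value = (__u64)(unsigned long)&val;
        attr.flags = BPF_ANY;
        if (sys_bpf(BPF_MAP_UPDATE_ELEM, &attr))
            return 1;

        /* BPF_MAP_LOOKUP_ELEM: fills *value on success */
        memset(&attr, 0, sizeof(attr));
        attr.map_fd = map_fd;
        attr.key = (__u64)(unsigned long)&key;
        attr.value = (__u64)(unsigned long)&out;
        if (sys_bpf(BPF_MAP_LOOKUP_ELEM, &attr))
            return 1;

        printf("key %d -> value %d\n", key, out);
        return 0;
    }

In practice most callers go through libbpf wrappers rather than the raw syscall, but the documented semantics above apply either way.
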
Signed-off-by: Joe Stringer Signed-off-by: Alexei Starovoitov Reviewed-by: Quentin Monnet Acked-by: Toke Høiland-Jørgensen Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210302171947.2268128-3-joe@cilium.io --- include/uapi/linux/bpf.h | 368 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 368 insertions(+) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index fb16c590e6d9..052bbfe65f77 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -204,6 +204,374 @@ union bpf_iter_link_info { * A new file descriptor (a nonnegative integer), or -1 if an * error occurred (in which case, *errno* is set appropriately). * + * BPF_OBJ_PIN + * Description + * Pin an eBPF program or map referred by the specified *bpf_fd* + * to the provided *pathname* on the filesystem. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_OBJ_GET + * Description + * Open a file descriptor for the eBPF object pinned to the + * specified *pathname*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_PROG_ATTACH + * Description + * Attach an eBPF program to a *target_fd* at the specified + * *attach_type* hook. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_PROG_DETACH + * Description + * Detach the eBPF program associated with the *target_fd* at the + * hook specified by *attach_type*. The program must have been + * previously attached using **BPF_PROG_ATTACH**. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_PROG_TEST_RUN + * Description + * Run an eBPF program a number of times against a provided + * program context and return the modified program context and + * duration of the test run. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_PROG_GET_NEXT_ID + * Description + * Fetch the next eBPF program currently loaded into the kernel. + * + * Looks for the eBPF program with an id greater than *start_id* + * and updates *next_id* on success. If no other eBPF programs + * remain with ids higher than *start_id*, returns -1 and sets + * *errno* to **ENOENT**. + * + * Return + * Returns zero on success. On error, or when no id remains, -1 + * is returned and *errno* is set appropriately. + * + * BPF_MAP_GET_NEXT_ID + * Description + * Fetch the next eBPF map currently loaded into the kernel. + * + * Looks for the eBPF map with an id greater than *start_id* + * and updates *next_id* on success. If no other eBPF maps + * remain with ids higher than *start_id*, returns -1 and sets + * *errno* to **ENOENT**. + * + * Return + * Returns zero on success. On error, or when no id remains, -1 + * is returned and *errno* is set appropriately. + * + * BPF_PROG_GET_FD_BY_ID + * Description + * Open a file descriptor for the eBPF program corresponding to + * *prog_id*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_MAP_GET_FD_BY_ID + * Description + * Open a file descriptor for the eBPF map corresponding to + * *map_id*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). 
+ * + * BPF_OBJ_GET_INFO_BY_FD + * Description + * Obtain information about the eBPF object corresponding to + * *bpf_fd*. + * + * Populates up to *info_len* bytes of *info*, which will be in + * one of the following formats depending on the eBPF object type + * of *bpf_fd*: + * + * * **struct bpf_prog_info** + * * **struct bpf_map_info** + * * **struct bpf_btf_info** + * * **struct bpf_link_info** + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_PROG_QUERY + * Description + * Obtain information about eBPF programs associated with the + * specified *attach_type* hook. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_RAW_TRACEPOINT_OPEN + * Description + * Attach an eBPF program to a tracepoint *name* to access kernel + * internal arguments of the tracepoint in their raw form. + * + * The *prog_fd* must be a valid file descriptor associated with + * a loaded eBPF program of type **BPF_PROG_TYPE_RAW_TRACEPOINT**. + * + * No ABI guarantees are made about the content of tracepoint + * arguments exposed to the corresponding eBPF program. + * + * Applying **close**\ (2) to the file descriptor returned by + * **BPF_RAW_TRACEPOINT_OPEN** will delete the map (but see NOTES). + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_BTF_LOAD + * Description + * Verify and load BPF Type Format (BTF) metadata into the kernel, + * returning a new file descriptor associated with the metadata. + * BTF is described in more detail at + * https://www.kernel.org/doc/html/latest/bpf/btf.html. + * + * The *btf* parameter must point to valid memory providing + * *btf_size* bytes of BTF binary metadata. + * + * The returned file descriptor can be passed to other **bpf**\ () + * subcommands such as **BPF_PROG_LOAD** or **BPF_MAP_CREATE** to + * associate the BTF with those objects. + * + * Similar to **BPF_PROG_LOAD**, **BPF_BTF_LOAD** has optional + * parameters to specify a *btf_log_buf*, *btf_log_size* and + * *btf_log_level* which allow the kernel to return freeform log + * output regarding the BTF verification process. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_BTF_GET_FD_BY_ID + * Description + * Open a file descriptor for the BPF Type Format (BTF) + * corresponding to *btf_id*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_TASK_FD_QUERY + * Description + * Obtain information about eBPF programs associated with the + * target process identified by *pid* and *fd*. + * + * If the *pid* and *fd* are associated with a tracepoint, kprobe + * or uprobe perf event, then the *prog_id* and *fd_type* will + * be populated with the eBPF program id and file descriptor type + * of type **bpf_task_fd_type**. If associated with a kprobe or + * uprobe, the *probe_offset* and *probe_addr* will also be + * populated. Optionally, if *buf* is provided, then up to + * *buf_len* bytes of *buf* will be populated with the name of + * the tracepoint, kprobe or uprobe. + * + * The resulting *prog_id* may be introspected in deeper detail + * using **BPF_PROG_GET_FD_BY_ID** and **BPF_OBJ_GET_INFO_BY_FD**. + * + * Return + * Returns zero on success. 
On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_MAP_LOOKUP_AND_DELETE_ELEM + * Description + * Look up an element with the given *key* in the map referred to + * by the file descriptor *fd*, and if found, delete the element. + * + * The **BPF_MAP_TYPE_QUEUE** and **BPF_MAP_TYPE_STACK** map types + * implement this command as a "pop" operation, deleting the top + * element rather than one corresponding to *key*. + * The *key* and *key_len* parameters should be zeroed when + * issuing this operation for these map types. + * + * This command is only valid for the following map types: + * * **BPF_MAP_TYPE_QUEUE** + * * **BPF_MAP_TYPE_STACK** + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_MAP_FREEZE + * Description + * Freeze the permissions of the specified map. + * + * Write permissions may be frozen by passing zero *flags*. + * Upon success, no future syscall invocations may alter the + * map state of *map_fd*. Write operations from eBPF programs + * are still possible for a frozen map. + * + * Not supported for maps of type **BPF_MAP_TYPE_STRUCT_OPS**. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_BTF_GET_NEXT_ID + * Description + * Fetch the next BPF Type Format (BTF) object currently loaded + * into the kernel. + * + * Looks for the BTF object with an id greater than *start_id* + * and updates *next_id* on success. If no other BTF objects + * remain with ids higher than *start_id*, returns -1 and sets + * *errno* to **ENOENT**. + * + * Return + * Returns zero on success. On error, or when no id remains, -1 + * is returned and *errno* is set appropriately. + * + * BPF_MAP_LOOKUP_BATCH + * Description + * Iterate and fetch multiple elements in a map. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_MAP_LOOKUP_AND_DELETE_BATCH + * Description + * Iterate and delete multiple elements in a map. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_MAP_UPDATE_BATCH + * Description + * Iterate and update multiple elements in a map. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_MAP_DELETE_BATCH + * Description + * Iterate and delete multiple elements in a map. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_LINK_CREATE + * Description + * Attach an eBPF program to a *target_fd* at the specified + * *attach_type* hook and return a file descriptor handle for + * managing the link. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_LINK_UPDATE + * Description + * Update the eBPF program in the specified *link_fd* to + * *new_prog_fd*. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_LINK_GET_FD_BY_ID + * Description + * Open a file descriptor for the eBPF Link corresponding to + * *link_id*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_LINK_GET_NEXT_ID + * Description + * Fetch the next eBPF link currently loaded into the kernel. 
+ * + * Looks for the eBPF link with an id greater than *start_id* + * and updates *next_id* on success. If no other eBPF links + * remain with ids higher than *start_id*, returns -1 and sets + * *errno* to **ENOENT**. + * + * Return + * Returns zero on success. On error, or when no id remains, -1 + * is returned and *errno* is set appropriately. + * + * BPF_ENABLE_STATS + * Description + * Enable eBPF runtime statistics gathering. + * + * Runtime statistics gathering for the eBPF runtime is disabled + * by default to minimize the corresponding performance overhead. + * This command enables statistics globally. + * + * Multiple programs may independently enable statistics. + * After gathering the desired statistics, eBPF runtime statistics + * may be disabled again by calling **close**\ (2) for the file + * descriptor returned by this function. Statistics will only be + * disabled system-wide when all outstanding file descriptors + * returned by prior calls for this subcommand are closed. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_ITER_CREATE + * Description + * Create an iterator on top of the specified *link_fd* (as + * previously created using **BPF_LINK_CREATE**) and return a + * file descriptor that can be used to trigger the iteration. + * + * If the resulting file descriptor is pinned to the filesystem + * using **BPF_OBJ_PIN**, then subsequent **read**\ (2) syscalls + * for that path will trigger the iterator to read kernel state + * using the eBPF program attached to *link_fd*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_LINK_DETACH + * Description + * Forcefully detach the specified *link_fd* from its + * corresponding attachment point. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_PROG_BIND_MAP + * Description + * Bind a map to the lifetime of an eBPF program. + * + * The map identified by *map_fd* is bound to the program + * identified by *prog_fd* and only released when *prog_fd* is + * released. This may be used in cases where metadata should be + * associated with a program which otherwise does not contain any + * references to the map (for example, embedded in the eBPF + * program instructions). + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * * NOTES * eBPF objects (maps and programs) can be shared between processes. * For example, after **fork**\ (2), the child inherits file descriptors From 6690523bccb3e44cfcc4b2c995767e6814046e34 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Tue, 2 Mar 2021 09:19:35 -0800 Subject: [PATCH 59/90] bpf: Document BPF_F_LOCK in syscall commands MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Document the meaning of the BPF_F_LOCK flag for the map lookup/update descriptions. Based on commit 96049f3afd50 ("bpf: introduce BPF_F_LOCK flag"). 
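For context, a minimal BPF-side sketch (not part of this patch) of the kind of spin-locked map value these flags operate on; userspace would pair it with bpf_map_lookup_elem_flags(map_fd, &key, &val, BPF_F_LOCK), and the map, struct and section names are illustrative:

    #include <linux/bpf.h>
    #include <bpf/bpf_helpers.h>

    struct locked_val {
        struct bpf_spin_lock lock;  /* the lock BPF_F_LOCK ops take and do not copy out */
        __u32 counter;
    };

    struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __uint(max_entries, 16);
        __type(key, __u32);
        __type(value, struct locked_val);
    } locked_map SEC(".maps");

    SEC("classifier")
    int bump(struct __sk_buff *skb)
    {
        __u32 key = 0;
        struct locked_val *v = bpf_map_lookup_elem(&locked_map, &key);

        if (!v)
            return 0;
        bpf_spin_lock(&v->lock);
        v->counter++;
        bpf_spin_unlock(&v->lock);
        return 0;
    }

    char _license[] SEC("license") = "GPL";
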
Signed-off-by: Joe Stringer Signed-off-by: Alexei Starovoitov Reviewed-by: Quentin Monnet Acked-by: Toke Høiland-Jørgensen Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210302171947.2268128-4-joe@cilium.io --- include/uapi/linux/bpf.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 052bbfe65f77..eb9f059f0569 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -123,6 +123,14 @@ union bpf_iter_link_info { * Look up an element with a given *key* in the map referred to * by the file descriptor *map_fd*. * + * The *flags* argument may be specified as one of the + * following: + * + * **BPF_F_LOCK** + * Look up the value of a spin-locked map without + * returning the lock. This must be specified if the + * elements contain a spinlock. + * * Return * Returns zero on success. On error, -1 is returned and *errno* * is set appropriately. @@ -140,6 +148,8 @@ union bpf_iter_link_info { * Create a new element only if it did not exist. * **BPF_EXIST** * Update an existing element. + * **BPF_F_LOCK** + * Update a spin_lock-ed map element. * * Return * Returns zero on success. On error, -1 is returned and *errno* From 8aacb3c8d1a32b23c82645051bba55f0ae6c103b Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Tue, 2 Mar 2021 09:19:36 -0800 Subject: [PATCH 60/90] bpf: Document BPF_PROG_PIN syscall command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit b2197755b263 ("bpf: add support for persistent maps/progs") contains the original implementation and git logs, used as reference for this documentation. Also pull in the filename restriction as documented in commit 6d8cb045cde6 ("bpf: comment why dots in filenames under BPF virtual FS are not allowed") Signed-off-by: Joe Stringer Signed-off-by: Alexei Starovoitov Reviewed-by: Quentin Monnet Acked-by: Toke Høiland-Jørgensen Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210302171947.2268128-5-joe@cilium.io --- include/uapi/linux/bpf.h | 36 +++++++++++++++++++++++++++++------- 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index eb9f059f0569..6946dde90c56 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -219,6 +219,22 @@ union bpf_iter_link_info { * Pin an eBPF program or map referred by the specified *bpf_fd* * to the provided *pathname* on the filesystem. * + * The *pathname* argument must not contain a dot ("."). + * + * On success, *pathname* retains a reference to the eBPF object, + * preventing deallocation of the object when the original + * *bpf_fd* is closed. This allow the eBPF object to live beyond + * **close**\ (\ *bpf_fd*\ ), and hence the lifetime of the parent + * process. + * + * Applying **unlink**\ (2) or similar calls to the *pathname* + * unpins the object from the filesystem, removing the reference. + * If no other file descriptors or filesystem nodes refer to the + * same object, it will be deallocated (see NOTES). + * + * The filesystem type for the parent directory of *pathname* must + * be **BPF_FS_MAGIC**. + * * Return * Returns zero on success. On error, -1 is returned and *errno* * is set appropriately. @@ -584,13 +600,19 @@ union bpf_iter_link_info { * * NOTES * eBPF objects (maps and programs) can be shared between processes. - * For example, after **fork**\ (2), the child inherits file descriptors - * referring to the same eBPF objects. 
In addition, file descriptors - * referring to eBPF objects can be transferred over UNIX domain sockets. - * File descriptors referring to eBPF objects can be duplicated in the - * usual way, using **dup**\ (2) and similar calls. An eBPF object is - * deallocated only after all file descriptors referring to the object - * have been closed. + * + * * After **fork**\ (2), the child inherits file descriptors + * referring to the same eBPF objects. + * * File descriptors referring to eBPF objects can be transferred over + * **unix**\ (7) domain sockets. + * * File descriptors referring to eBPF objects can be duplicated in the + * usual way, using **dup**\ (2) and similar calls. + * * File descriptors referring to eBPF objects can be pinned to the + * filesystem using the **BPF_OBJ_PIN** command of **bpf**\ (2). + * + * An eBPF object is deallocated only after all file descriptors referring + * to the object have been closed and no references remain pinned to the + * filesystem or attached (for example, bound to a program or device). */ enum bpf_cmd { BPF_MAP_CREATE, From 32e76b187a90de5809d68c2ef3e3964176dacaf0 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Tue, 2 Mar 2021 09:19:37 -0800 Subject: [PATCH 61/90] bpf: Document BPF_PROG_ATTACH syscall command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Document the prog attach command in more detail, based on git commits: * commit f4324551489e ("bpf: add BPF_PROG_ATTACH and BPF_PROG_DETACH commands") * commit 4f738adba30a ("bpf: create tcp_bpf_ulp allowing BPF to monitor socket TX/RX data") * commit f4364dcfc86d ("media: rc: introduce BPF_PROG_LIRC_MODE2") * commit d58e468b1112 ("flow_dissector: implements flow dissector BPF hook") Signed-off-by: Joe Stringer Signed-off-by: Alexei Starovoitov Reviewed-by: Quentin Monnet Acked-by: Toke Høiland-Jørgensen Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210302171947.2268128-6-joe@cilium.io --- include/uapi/linux/bpf.h | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6946dde90c56..a8f2964ec885 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -253,6 +253,43 @@ union bpf_iter_link_info { * Attach an eBPF program to a *target_fd* at the specified * *attach_type* hook. * + * The *attach_type* specifies the eBPF attachment point to + * attach the program to, and must be one of *bpf_attach_type* + * (see below). + * + * The *attach_bpf_fd* must be a valid file descriptor for a + * loaded eBPF program of a cgroup, flow dissector, LIRC, sockmap + * or sock_ops type corresponding to the specified *attach_type*. + * + * The *target_fd* must be a valid file descriptor for a kernel + * object which depends on the attach type of *attach_bpf_fd*: + * + * **BPF_PROG_TYPE_CGROUP_DEVICE**, + * **BPF_PROG_TYPE_CGROUP_SKB**, + * **BPF_PROG_TYPE_CGROUP_SOCK**, + * **BPF_PROG_TYPE_CGROUP_SOCK_ADDR**, + * **BPF_PROG_TYPE_CGROUP_SOCKOPT**, + * **BPF_PROG_TYPE_CGROUP_SYSCTL**, + * **BPF_PROG_TYPE_SOCK_OPS** + * + * Control Group v2 hierarchy with the eBPF controller + * enabled. Requires the kernel to be compiled with + * **CONFIG_CGROUP_BPF**. + * + * **BPF_PROG_TYPE_FLOW_DISSECTOR** + * + * Network namespace (eg /proc/self/ns/net). + * + * **BPF_PROG_TYPE_LIRC_MODE2** + * + * LIRC device path (eg /dev/lircN). Requires the kernel + * to be compiled with **CONFIG_BPF_LIRC_MODE2**. 
+ * + * **BPF_PROG_TYPE_SK_SKB**, + * **BPF_PROG_TYPE_SK_MSG** + * + * eBPF map of socket type (eg **BPF_MAP_TYPE_SOCKHASH**). + * * Return * Returns zero on success. On error, -1 is returned and *errno* * is set appropriately. From 2a3fdca4e3bc7a01316277ba26f4090c4b19bf7c Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Tue, 2 Mar 2021 09:19:38 -0800 Subject: [PATCH 62/90] bpf: Document BPF_PROG_TEST_RUN syscall command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Based on a brief read of the corresponding source code. Signed-off-by: Joe Stringer Signed-off-by: Alexei Starovoitov Reviewed-by: Quentin Monnet Acked-by: Toke Høiland-Jørgensen Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210302171947.2268128-7-joe@cilium.io --- include/uapi/linux/bpf.h | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a8f2964ec885..a6cd6650e23d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -306,14 +306,22 @@ union bpf_iter_link_info { * * BPF_PROG_TEST_RUN * Description - * Run an eBPF program a number of times against a provided - * program context and return the modified program context and - * duration of the test run. + * Run the eBPF program associated with the *prog_fd* a *repeat* + * number of times against a provided program context *ctx_in* and + * data *data_in*, and return the modified program context + * *ctx_out*, *data_out* (for example, packet data), result of the + * execution *retval*, and *duration* of the test run. * * Return * Returns zero on success. On error, -1 is returned and *errno* * is set appropriately. * + * **ENOSPC** + * Either *data_size_out* or *ctx_size_out* is too small. + * **ENOTSUPP** + * This command is not supported by the program type of + * the program referred to by *prog_fd*. + * * BPF_PROG_GET_NEXT_ID * Description * Fetch the next eBPF program currently loaded into the kernel. From 5d999994e05d62d4f53059540652014cf83cddfe Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Tue, 2 Mar 2021 09:19:39 -0800 Subject: [PATCH 63/90] bpf: Document BPF_PROG_QUERY syscall command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 468e2f64d220 ("bpf: introduce BPF_PROG_QUERY command") originally introduced this, but there have been several additions since then. Unlike BPF_PROG_ATTACH, it appears that the sockmap progs are not able to be queried so far. Signed-off-by: Joe Stringer Signed-off-by: Alexei Starovoitov Reviewed-by: Quentin Monnet Acked-by: Toke Høiland-Jørgensen Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210302171947.2268128-8-joe@cilium.io --- include/uapi/linux/bpf.h | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a6cd6650e23d..0cf92ef011f1 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -389,6 +389,43 @@ union bpf_iter_link_info { * Obtain information about eBPF programs associated with the * specified *attach_type* hook. 
* + * The *target_fd* must be a valid file descriptor for a kernel + * object which depends on the attach type of *attach_bpf_fd*: + * + * **BPF_PROG_TYPE_CGROUP_DEVICE**, + * **BPF_PROG_TYPE_CGROUP_SKB**, + * **BPF_PROG_TYPE_CGROUP_SOCK**, + * **BPF_PROG_TYPE_CGROUP_SOCK_ADDR**, + * **BPF_PROG_TYPE_CGROUP_SOCKOPT**, + * **BPF_PROG_TYPE_CGROUP_SYSCTL**, + * **BPF_PROG_TYPE_SOCK_OPS** + * + * Control Group v2 hierarchy with the eBPF controller + * enabled. Requires the kernel to be compiled with + * **CONFIG_CGROUP_BPF**. + * + * **BPF_PROG_TYPE_FLOW_DISSECTOR** + * + * Network namespace (eg /proc/self/ns/net). + * + * **BPF_PROG_TYPE_LIRC_MODE2** + * + * LIRC device path (eg /dev/lircN). Requires the kernel + * to be compiled with **CONFIG_BPF_LIRC_MODE2**. + * + * **BPF_PROG_QUERY** always fetches the number of programs + * attached and the *attach_flags* which were used to attach those + * programs. Additionally, if *prog_ids* is nonzero and the number + * of attached programs is less than *prog_cnt*, populates + * *prog_ids* with the eBPF program ids of the programs attached + * at *target_fd*. + * + * The following flags may alter the result: + * + * **BPF_F_QUERY_EFFECTIVE** + * Only return information regarding programs which are + * currently effective at the specified *target_fd*. + * * Return * Returns zero on success. On error, -1 is returned and *errno* * is set appropriately. From 0cb804547927c05f6aa7e28c8d4a1e02fec1a6d4 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Tue, 2 Mar 2021 09:19:40 -0800 Subject: [PATCH 64/90] bpf: Document BPF_MAP_*_BATCH syscall commands MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Based roughly on the following commits: * Commit cb4d03ab499d ("bpf: Add generic support for lookup batch op") * Commit 057996380a42 ("bpf: Add batch ops to all htab bpf map") * Commit aa2e93b8e58e ("bpf: Add generic support for update and delete batch ops") Signed-off-by: Joe Stringer Signed-off-by: Alexei Starovoitov Reviewed-by: Quentin Monnet Acked-by: Toke Høiland-Jørgensen Acked-by: Brian Vazquez Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210302171947.2268128-9-joe@cilium.io --- include/uapi/linux/bpf.h | 114 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 111 insertions(+), 3 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0cf92ef011f1..c8b9d19fce22 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -553,13 +553,55 @@ union bpf_iter_link_info { * Description * Iterate and fetch multiple elements in a map. * + * Two opaque values are used to manage batch operations, + * *in_batch* and *out_batch*. Initially, *in_batch* must be set + * to NULL to begin the batched operation. After each subsequent + * **BPF_MAP_LOOKUP_BATCH**, the caller should pass the resultant + * *out_batch* as the *in_batch* for the next operation to + * continue iteration from the current point. + * + * The *keys* and *values* are output parameters which must point + * to memory large enough to hold *count* items based on the key + * and value size of the map *map_fd*. The *keys* buffer must be + * of *key_size* * *count*. The *values* buffer must be of + * *value_size* * *count*. + * + * The *elem_flags* argument may be specified as one of the + * following: + * + * **BPF_F_LOCK** + * Look up the value of a spin-locked map without + * returning the lock. This must be specified if the + * elements contain a spinlock. 
+ * + * On success, *count* elements from the map are copied into the + * user buffer, with the keys copied into *keys* and the values + * copied into the corresponding indices in *values*. + * + * If an error is returned and *errno* is not **EFAULT**, *count* + * is set to the number of successfully processed elements. + * * Return * Returns zero on success. On error, -1 is returned and *errno* * is set appropriately. * + * May set *errno* to **ENOSPC** to indicate that *keys* or + * *values* is too small to dump an entire bucket during + * iteration of a hash-based map type. + * * BPF_MAP_LOOKUP_AND_DELETE_BATCH * Description - * Iterate and delete multiple elements in a map. + * Iterate and delete all elements in a map. + * + * This operation has the same behavior as + * **BPF_MAP_LOOKUP_BATCH** with two exceptions: + * + * * Every element that is successfully returned is also deleted + * from the map. This is at least *count* elements. Note that + * *count* is both an input and an output parameter. + * * Upon returning with *errno* set to **EFAULT**, up to + * *count* elements may be deleted without returning the keys + * and values of the deleted elements. * * Return * Returns zero on success. On error, -1 is returned and *errno* @@ -567,15 +609,81 @@ union bpf_iter_link_info { * * BPF_MAP_UPDATE_BATCH * Description - * Iterate and update multiple elements in a map. + * Update multiple elements in a map by *key*. + * + * The *keys* and *values* are input parameters which must point + * to memory large enough to hold *count* items based on the key + * and value size of the map *map_fd*. The *keys* buffer must be + * of *key_size* * *count*. The *values* buffer must be of + * *value_size* * *count*. + * + * Each element specified in *keys* is sequentially updated to the + * value in the corresponding index in *values*. The *in_batch* + * and *out_batch* parameters are ignored and should be zeroed. + * + * The *elem_flags* argument should be specified as one of the + * following: + * + * **BPF_ANY** + * Create new elements or update a existing elements. + * **BPF_NOEXIST** + * Create new elements only if they do not exist. + * **BPF_EXIST** + * Update existing elements. + * **BPF_F_LOCK** + * Update spin_lock-ed map elements. This must be + * specified if the map value contains a spinlock. + * + * On success, *count* elements from the map are updated. + * + * If an error is returned and *errno* is not **EFAULT**, *count* + * is set to the number of successfully processed elements. * * Return * Returns zero on success. On error, -1 is returned and *errno* * is set appropriately. * + * May set *errno* to **EINVAL**, **EPERM**, **ENOMEM**, or + * **E2BIG**. **E2BIG** indicates that the number of elements in + * the map reached the *max_entries* limit specified at map + * creation time. + * + * May set *errno* to one of the following error codes under + * specific circumstances: + * + * **EEXIST** + * If *flags* specifies **BPF_NOEXIST** and the element + * with *key* already exists in the map. + * **ENOENT** + * If *flags* specifies **BPF_EXIST** and the element with + * *key* does not exist in the map. + * * BPF_MAP_DELETE_BATCH * Description - * Iterate and delete multiple elements in a map. + * Delete multiple elements in a map by *key*. + * + * The *keys* parameter is an input parameter which must point + * to memory large enough to hold *count* items based on the key + * size of the map *map_fd*, that is, *key_size* * *count*. 
+ * + * Each element specified in *keys* is sequentially deleted. The + * *in_batch*, *out_batch*, and *values* parameters are ignored + * and should be zeroed. + * + * The *elem_flags* argument may be specified as one of the + * following: + * + * **BPF_F_LOCK** + * Look up the value of a spin-locked map without + * returning the lock. This must be specified if the + * elements contain a spinlock. + * + * On success, *count* elements from the map are deleted. + * + * If an error is returned and *errno* is not **EFAULT**, *count* + * is set to the number of successfully processed elements. If + * *errno* is **EFAULT**, up to *count* elements may have been + * deleted. * * Return * Returns zero on success. On error, -1 is returned and *errno* From 923a932c982fd71856f80dbeaaa3ca41a75e89e0 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Tue, 2 Mar 2021 09:19:41 -0800 Subject: [PATCH 65/90] scripts/bpf: Abstract eBPF API target parameter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Abstract out the target parameter so that, in upcoming commits, more than just the existing "helpers" target can be called to generate specific portions of docs from the eBPF UAPI headers. Signed-off-by: Joe Stringer Signed-off-by: Alexei Starovoitov Reviewed-by: Quentin Monnet Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20210302171947.2268128-10-joe@cilium.io --- MAINTAINERS | 1 + include/uapi/linux/bpf.h | 2 +- scripts/{bpf_helpers_doc.py => bpf_doc.py} | 91 +++++++++++++++------- tools/bpf/Makefile.helpers | 2 +- tools/include/uapi/linux/bpf.h | 2 +- tools/lib/bpf/Makefile | 2 +- tools/perf/MANIFEST | 2 +- 7 files changed, 69 insertions(+), 33 deletions(-) rename scripts/{bpf_helpers_doc.py => bpf_doc.py} (92%) diff --git a/MAINTAINERS b/MAINTAINERS index a50a543e3c81..8d56c7044067 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3223,6 +3223,7 @@ F: net/core/filter.c F: net/sched/act_bpf.c F: net/sched/cls_bpf.c F: samples/bpf/ +F: scripts/bpf_doc.py F: tools/bpf/ F: tools/lib/bpf/ F: tools/testing/selftests/bpf/ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c8b9d19fce22..63a56ed6a785 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1439,7 +1439,7 @@ union bpf_attr { * parsed and used to produce a manual page. The workflow is the following, * and requires the rst2man utility: * - * $ ./scripts/bpf_helpers_doc.py \ + * $ ./scripts/bpf_doc.py \ * --filename include/uapi/linux/bpf.h > /tmp/bpf-helpers.rst * $ rst2man /tmp/bpf-helpers.rst > /tmp/bpf-helpers.7 * $ man /tmp/bpf-helpers.7 diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_doc.py similarity index 92% rename from scripts/bpf_helpers_doc.py rename to scripts/bpf_doc.py index 867ada23281c..5a4f68aab335 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_doc.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: GPL-2.0-only # # Copyright (C) 2018-2019 Netronome Systems, Inc. +# Copyright (C) 2021 Isovalent, Inc. # In case user attempts to run with Python 2. from __future__ import print_function @@ -165,10 +166,11 @@ class Printer(object): """ A generic class for printers. Printers should be created with an array of Helper objects, and implement a way to print them in the desired fashion.
- @helpers: array of Helper objects to print to standard output + @parser: A HeaderParser with objects to print to standard output """ - def __init__(self, helpers): - self.helpers = helpers + def __init__(self, parser): + self.parser = parser + self.elements = [] def print_header(self): pass @@ -181,19 +183,23 @@ class Printer(object): def print_all(self): self.print_header() - for helper in self.helpers: - self.print_one(helper) + for elem in self.elements: + self.print_one(elem) self.print_footer() + class PrinterRST(Printer): """ - A printer for dumping collected information about helpers as a ReStructured - Text page compatible with the rst2man program, which can be used to - generate a manual page for the helpers. - @helpers: array of Helper objects to print to standard output + A generic class for printers that print ReStructured Text. Printers should + be created with a HeaderParser object, and implement a way to print API + elements in the desired fashion. + @parser: A HeaderParser with objects to print to standard output """ - def print_header(self): - header = '''\ + def __init__(self, parser): + self.parser = parser + + def print_license(self): + license = '''\ .. Copyright (C) All BPF authors and contributors from 2014 to present. .. See git log include/uapi/linux/bpf.h in kernel tree for details. .. @@ -221,9 +227,39 @@ class PrinterRST(Printer): .. .. Please do not edit this file. It was generated from the documentation .. located in file include/uapi/linux/bpf.h of the Linux kernel sources -.. (helpers description), and from scripts/bpf_helpers_doc.py in the same +.. (helpers description), and from scripts/bpf_doc.py in the same .. repository (header and footer). +''' + print(license) + def print_elem(self, elem): + if (elem.desc): + print('\tDescription') + # Do not strip all newline characters: formatted code at the end of + # a section must be followed by a blank line. + for line in re.sub('\n$', '', elem.desc, count=1).split('\n'): + print('{}{}'.format('\t\t' if line else '', line)) + + if (elem.ret): + print('\tReturn') + for line in elem.ret.rstrip().split('\n'): + print('{}{}'.format('\t\t' if line else '', line)) + + print('') + + +class PrinterHelpersRST(PrinterRST): + """ + A printer for dumping collected information about helpers as a ReStructured + Text page compatible with the rst2man program, which can be used to + generate a manual page for the helpers. + @parser: A HeaderParser with Helper objects to print to standard output + """ + def __init__(self, parser): + self.elements = parser.helpers + + def print_header(self): + header = '''\ =========== BPF-HELPERS =========== @@ -264,6 +300,7 @@ kernel at the top). HELPERS ======= ''' + PrinterRST.print_license(self) print(header) def print_footer(self): @@ -380,27 +417,19 @@ SEE ALSO def print_one(self, helper): self.print_proto(helper) + self.print_elem(helper) - if (helper.desc): - print('\tDescription') - # Do not strip all newline characters: formatted code at the end of - # a section must be followed by a blank line. - for line in re.sub('\n$', '', helper.desc, count=1).split('\n'): - print('{}{}'.format('\t\t' if line else '', line)) - if (helper.ret): - print('\tReturn') - for line in helper.ret.rstrip().split('\n'): - print('{}{}'.format('\t\t' if line else '', line)) - print('') class PrinterHelpers(Printer): """ A printer for dumping collected information about helpers as C header to be included from BPF program. 
- @helpers: array of Helper objects to print to standard output + @parser: A HeaderParser with Helper objects to print to standard output """ + def __init__(self, parser): + self.elements = parser.helpers type_fwds = [ 'struct bpf_fib_lookup', @@ -511,7 +540,7 @@ class PrinterHelpers(Printer): def print_header(self): header = '''\ -/* This is auto-generated file. See bpf_helpers_doc.py for details. */ +/* This is auto-generated file. See bpf_doc.py for details. */ /* Forward declarations of BPF structs */''' @@ -589,8 +618,12 @@ script = os.path.abspath(sys.argv[0]) linuxRoot = os.path.dirname(os.path.dirname(script)) bpfh = os.path.join(linuxRoot, 'include/uapi/linux/bpf.h') +printers = { + 'helpers': PrinterHelpersRST, +} + argParser = argparse.ArgumentParser(description=""" -Parse eBPF header file and generate documentation for eBPF helper functions. +Parse eBPF header file and generate documentation for the eBPF API. The RST-formatted output produced can be turned into a manual page with the rst2man utility. """) @@ -601,6 +634,8 @@ if (os.path.isfile(bpfh)): default=bpfh) else: argParser.add_argument('--filename', help='path to include/uapi/linux/bpf.h') +argParser.add_argument('target', nargs='?', default='helpers', + choices=printers.keys(), help='eBPF API target') args = argParser.parse_args() # Parse file. @@ -609,7 +644,7 @@ headerParser.run() # Print formatted output to standard output. if args.header: - printer = PrinterHelpers(headerParser.helpers) + printer = PrinterHelpers(headerParser) else: - printer = PrinterRST(headerParser.helpers) + printer = printers[args.target](headerParser) printer.print_all() diff --git a/tools/bpf/Makefile.helpers b/tools/bpf/Makefile.helpers index 854d084026dd..a26599022fd6 100644 --- a/tools/bpf/Makefile.helpers +++ b/tools/bpf/Makefile.helpers @@ -35,7 +35,7 @@ man7: $(DOC_MAN7) RST2MAN_DEP := $(shell command -v rst2man 2>/dev/null) $(OUTPUT)$(HELPERS_RST): $(UP2DIR)../../include/uapi/linux/bpf.h - $(QUIET_GEN)$(UP2DIR)../../scripts/bpf_helpers_doc.py --filename $< > $@ + $(QUIET_GEN)$(UP2DIR)../../scripts/bpf_doc.py --filename $< > $@ $(OUTPUT)%.7: $(OUTPUT)%.rst ifndef RST2MAN_DEP diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index b89af20cfa19..b4c5c529ad17 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -729,7 +729,7 @@ union bpf_attr { * parsed and used to produce a manual page. 
The workflow is the following, * and requires the rst2man utility: * - * $ ./scripts/bpf_helpers_doc.py \ + * $ ./scripts/bpf_doc.py \ * --filename include/uapi/linux/bpf.h > /tmp/bpf-helpers.rst * $ rst2man /tmp/bpf-helpers.rst > /tmp/bpf-helpers.7 * $ man /tmp/bpf-helpers.7 diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile index 887a494ad5fc..8170f88e8ea6 100644 --- a/tools/lib/bpf/Makefile +++ b/tools/lib/bpf/Makefile @@ -158,7 +158,7 @@ $(BPF_IN_STATIC): force $(BPF_HELPER_DEFS) $(Q)$(MAKE) $(build)=libbpf OUTPUT=$(STATIC_OBJDIR) $(BPF_HELPER_DEFS): $(srctree)/tools/include/uapi/linux/bpf.h - $(QUIET_GEN)$(srctree)/scripts/bpf_helpers_doc.py --header \ + $(QUIET_GEN)$(srctree)/scripts/bpf_doc.py --header \ --file $(srctree)/tools/include/uapi/linux/bpf.h > $(BPF_HELPER_DEFS) $(OUTPUT)libbpf.so: $(OUTPUT)libbpf.so.$(LIBBPF_VERSION) diff --git a/tools/perf/MANIFEST b/tools/perf/MANIFEST index 5d7b947320fb..f05c4d48fd7e 100644 --- a/tools/perf/MANIFEST +++ b/tools/perf/MANIFEST @@ -20,4 +20,4 @@ tools/lib/bitmap.c tools/lib/str_error_r.c tools/lib/vsprintf.c tools/lib/zalloc.c -scripts/bpf_helpers_doc.py +scripts/bpf_doc.py From a67882a221e348ab1c925b47efdfec8b11272d3f Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Tue, 2 Mar 2021 09:19:42 -0800 Subject: [PATCH 66/90] scripts/bpf: Add syscall commands printer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a new target to bpf_doc.py to support generating the list of syscall commands directly from the UAPI headers. Assuming that developer submissions keep the main header up to date, this should allow the man pages to be automatically generated based on the latest API changes rather than requiring someone to separately go back through the API and describe each command. Signed-off-by: Joe Stringer Signed-off-by: Alexei Starovoitov Reviewed-by: Quentin Monnet Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20210302171947.2268128-11-joe@cilium.io --- scripts/bpf_doc.py | 100 +++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 91 insertions(+), 9 deletions(-) diff --git a/scripts/bpf_doc.py b/scripts/bpf_doc.py index 5a4f68aab335..2d94025b38e9 100755 --- a/scripts/bpf_doc.py +++ b/scripts/bpf_doc.py @@ -14,6 +14,9 @@ import sys, os class NoHelperFound(BaseException): pass +class NoSyscallCommandFound(BaseException): + pass + class ParsingError(BaseException): def __init__(self, line='', reader=None): if reader: @@ -23,18 +26,27 @@ class ParsingError(BaseException): else: BaseException.__init__(self, 'Error parsing line: %s' % line) -class Helper(object): + +class APIElement(object): """ - An object representing the description of an eBPF helper function. - @proto: function prototype of the helper function - @desc: textual description of the helper function - @ret: description of the return value of the helper function + An object representing the description of an aspect of the eBPF API. + @proto: prototype of the API symbol + @desc: textual description of the symbol + @ret: (optional) description of any associated return value """ def __init__(self, proto='', desc='', ret=''): self.proto = proto self.desc = desc self.ret = ret + +class Helper(APIElement): + """ + An object representing the description of an eBPF helper function. 
+ @proto: function prototype of the helper function + @desc: textual description of the helper function + @ret: description of the return value of the helper function + """ def proto_break_down(self): """ Break down helper function protocol into smaller chunks: return type, @@ -61,6 +73,7 @@ class Helper(object): return res + class HeaderParser(object): """ An object used to parse a file in order to extract the documentation of a @@ -73,6 +86,13 @@ class HeaderParser(object): self.reader = open(filename, 'r') self.line = '' self.helpers = [] + self.commands = [] + + def parse_element(self): + proto = self.parse_symbol() + desc = self.parse_desc() + ret = self.parse_ret() + return APIElement(proto=proto, desc=desc, ret=ret) def parse_helper(self): proto = self.parse_proto() @@ -80,6 +100,18 @@ class HeaderParser(object): ret = self.parse_ret() return Helper(proto=proto, desc=desc, ret=ret) + def parse_symbol(self): + p = re.compile(' \* ?(.+)$') + capture = p.match(self.line) + if not capture: + raise NoSyscallCommandFound + end_re = re.compile(' \* ?NOTES$') + end = end_re.match(self.line) + if end: + raise NoSyscallCommandFound + self.line = self.reader.readline() + return capture.group(1) + def parse_proto(self): # Argument can be of shape: # - "void" @@ -141,16 +173,29 @@ class HeaderParser(object): break return ret - def run(self): - # Advance to start of helper function descriptions. - offset = self.reader.read().find('* Start of BPF helper function descriptions:') + def seek_to(self, target, help_message): + self.reader.seek(0) + offset = self.reader.read().find(target) if offset == -1: - raise Exception('Could not find start of eBPF helper descriptions list') + raise Exception(help_message) self.reader.seek(offset) self.reader.readline() self.reader.readline() self.line = self.reader.readline() + def parse_syscall(self): + self.seek_to('* DOC: eBPF Syscall Commands', + 'Could not find start of eBPF syscall descriptions list') + while True: + try: + command = self.parse_element() + self.commands.append(command) + except NoSyscallCommandFound: + break + + def parse_helpers(self): + self.seek_to('* Start of BPF helper function descriptions:', + 'Could not find start of eBPF helper descriptions list') while True: try: helper = self.parse_helper() @@ -158,6 +203,9 @@ class HeaderParser(object): except NoHelperFound: break + def run(self): + self.parse_syscall() + self.parse_helpers() self.reader.close() ############################################################################### @@ -420,6 +468,37 @@ SEE ALSO self.print_elem(helper) +class PrinterSyscallRST(PrinterRST): + """ + A printer for dumping collected information about the syscall API as a + ReStructured Text page compatible with the rst2man program, which can be + used to generate a manual page for the syscall. 
+ @parser: A HeaderParser with APIElement objects to print to standard + output + """ + def __init__(self, parser): + self.elements = parser.commands + + def print_header(self): + header = '''\ +=== +bpf +=== +------------------------------------------------------------------------------- +Perform a command on an extended BPF object +------------------------------------------------------------------------------- + +:Manual section: 2 + +COMMANDS +======== +''' + PrinterRST.print_license(self) + print(header) + + def print_one(self, command): + print('**%s**' % (command.proto)) + self.print_elem(command) class PrinterHelpers(Printer): @@ -620,6 +699,7 @@ bpfh = os.path.join(linuxRoot, 'include/uapi/linux/bpf.h') printers = { 'helpers': PrinterHelpersRST, + 'syscall': PrinterSyscallRST, } argParser = argparse.ArgumentParser(description=""" @@ -644,6 +724,8 @@ headerParser.run() # Print formatted output to standard output. if args.header: + if args.target != 'helpers': + raise NotImplementedError('Only helpers header generation is supported') printer = PrinterHelpers(headerParser) else: printer = printers[args.target](headerParser) From a01d935b2e0916d32cb567f90c32a0c4eb46993c Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Tue, 2 Mar 2021 09:19:43 -0800 Subject: [PATCH 67/90] tools/bpf: Remove bpf-helpers from bpftool docs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This logic is used for validating the manual pages from selftests, so move the infra under tools/testing/selftests/bpf/ and rely on selftests for validation rather than tying it into the bpftool build. Signed-off-by: Joe Stringer Signed-off-by: Alexei Starovoitov Reviewed-by: Quentin Monnet Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20210302171947.2268128-12-joe@cilium.io --- tools/bpf/bpftool/.gitignore | 1 - tools/bpf/bpftool/Documentation/Makefile | 11 +++---- tools/testing/selftests/bpf/.gitignore | 1 + tools/testing/selftests/bpf/Makefile | 20 +++++++++--- .../selftests/bpf/Makefile.docs} | 32 ++++++++++--------- .../selftests/bpf/test_bpftool_build.sh | 21 ------------ tools/testing/selftests/bpf/test_doc_build.sh | 13 ++++++++ 7 files changed, 50 insertions(+), 49 deletions(-) rename tools/{bpf/Makefile.helpers => testing/selftests/bpf/Makefile.docs} (64%) create mode 100755 tools/testing/selftests/bpf/test_doc_build.sh diff --git a/tools/bpf/bpftool/.gitignore b/tools/bpf/bpftool/.gitignore index 944cb4b7c95d..05ce4446b780 100644 --- a/tools/bpf/bpftool/.gitignore +++ b/tools/bpf/bpftool/.gitignore @@ -3,7 +3,6 @@ /bootstrap/ /bpftool bpftool*.8 -bpf-helpers.* FEATURE-DUMP.bpftool feature libbpf diff --git a/tools/bpf/bpftool/Documentation/Makefile b/tools/bpf/bpftool/Documentation/Makefile index f33cb02de95c..c49487905ceb 100644 --- a/tools/bpf/bpftool/Documentation/Makefile +++ b/tools/bpf/bpftool/Documentation/Makefile @@ -16,15 +16,12 @@ prefix ?= /usr/local mandir ?= $(prefix)/man man8dir = $(mandir)/man8 -# Load targets for building eBPF helpers man page. 
-include ../../Makefile.helpers - MAN8_RST = $(wildcard bpftool*.rst) _DOC_MAN8 = $(patsubst %.rst,%.8,$(MAN8_RST)) DOC_MAN8 = $(addprefix $(OUTPUT),$(_DOC_MAN8)) -man: man8 helpers +man: man8 man8: $(DOC_MAN8) RST2MAN_DEP := $(shell command -v rst2man 2>/dev/null) @@ -46,16 +43,16 @@ ifndef RST2MAN_DEP endif $(QUIET_GEN)( cat $< ; printf "%b" $(call see_also,$<) ) | rst2man $(RST2MAN_OPTS) > $@ -clean: helpers-clean +clean: $(call QUIET_CLEAN, Documentation) $(Q)$(RM) $(DOC_MAN8) -install: man helpers-install +install: man $(call QUIET_INSTALL, Documentation-man) $(Q)$(INSTALL) -d -m 755 $(DESTDIR)$(man8dir) $(Q)$(INSTALL) -m 644 $(DOC_MAN8) $(DESTDIR)$(man8dir) -uninstall: helpers-uninstall +uninstall: $(call QUIET_UNINST, Documentation-man) $(Q)$(RM) $(addprefix $(DESTDIR)$(man8dir)/,$(_DOC_MAN8)) $(Q)$(RMDIR) $(DESTDIR)$(man8dir) diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index c0c48fdb9ac1..a0d5ec3cfc24 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -1,4 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only +bpf-helpers* test_verifier test_maps test_lru_map diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index a81af15e4ded..b5827464c6b5 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -68,6 +68,7 @@ TEST_PROGS := test_kmod.sh \ test_bpftool_build.sh \ test_bpftool.sh \ test_bpftool_metadata.sh \ + test_docs_build.sh \ test_xsk.sh TEST_PROGS_EXTENDED := with_addr.sh \ @@ -103,6 +104,7 @@ override define CLEAN $(call msg,CLEAN) $(Q)$(RM) -r $(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED) $(TEST_GEN_FILES) $(EXTRA_CLEAN) $(Q)$(MAKE) -C bpf_testmod clean + $(Q)$(MAKE) docs-clean endef include ../lib.mk @@ -180,6 +182,7 @@ $(OUTPUT)/runqslower: $(BPFOBJ) | $(DEFAULT_BPFTOOL) cp $(SCRATCH_DIR)/runqslower $@ $(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED): $(OUTPUT)/test_stub.o $(BPFOBJ) +$(TEST_GEN_FILES): docs $(OUTPUT)/test_dev_cgroup: cgroup_helpers.c $(OUTPUT)/test_skb_cgroup_id_user: cgroup_helpers.c @@ -200,11 +203,16 @@ $(DEFAULT_BPFTOOL): $(wildcard $(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile) \ CC=$(HOSTCC) LD=$(HOSTLD) \ OUTPUT=$(HOST_BUILD_DIR)/bpftool/ \ prefix= DESTDIR=$(HOST_SCRATCH_DIR)/ install - $(Q)mkdir -p $(BUILD_DIR)/bpftool/Documentation - $(Q)RST2MAN_OPTS="--exit-status=1" $(MAKE) $(submake_extras) \ - -C $(BPFTOOLDIR)/Documentation \ - OUTPUT=$(BUILD_DIR)/bpftool/Documentation/ \ - prefix= DESTDIR=$(SCRATCH_DIR)/ install + +docs: + $(Q)RST2MAN_OPTS="--exit-status=1" $(MAKE) $(submake_extras) \ + -f Makefile.docs \ + prefix= OUTPUT=$(OUTPUT)/ DESTDIR=$(OUTPUT)/ $@ + +docs-clean: + $(Q)$(MAKE) $(submake_extras) \ + -f Makefile.docs \ + prefix= OUTPUT=$(OUTPUT)/ DESTDIR=$(OUTPUT)/ $@ $(BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \ ../../../include/uapi/linux/bpf.h \ @@ -477,3 +485,5 @@ EXTRA_CLEAN := $(TEST_CUSTOM_PROGS) $(SCRATCH_DIR) $(HOST_SCRATCH_DIR) \ prog_tests/tests.h map_tests/tests.h verifier/tests.h \ feature \ $(addprefix $(OUTPUT)/,*.o *.skel.h no_alu32 bpf_gcc bpf_testmod.ko) + +.PHONY: docs docs-clean diff --git a/tools/bpf/Makefile.helpers b/tools/testing/selftests/bpf/Makefile.docs similarity index 64% rename from tools/bpf/Makefile.helpers rename to tools/testing/selftests/bpf/Makefile.docs index a26599022fd6..546c4a763b46 100644 --- a/tools/bpf/Makefile.helpers +++ b/tools/testing/selftests/bpf/Makefile.docs @@ -1,13 +1,7 @@ # SPDX-License-Identifier: 
GPL-2.0-only -ifndef allow-override - include ../scripts/Makefile.include - include ../scripts/utilities.mak -else - # Assume Makefile.helpers is being run from bpftool/Documentation - # subdirectory. Go up two more directories to fetch bpf.h header and - # associated script. - UP2DIR := ../../ -endif + +include ../../../scripts/Makefile.include +include ../../../scripts/utilities.mak INSTALL ?= install RM ?= rm -f @@ -29,13 +23,21 @@ MAN7_RST = $(HELPERS_RST) _DOC_MAN7 = $(patsubst %.rst,%.7,$(MAN7_RST)) DOC_MAN7 = $(addprefix $(OUTPUT),$(_DOC_MAN7)) +DOCTARGETS := helpers + +docs: $(DOCTARGETS) helpers: man7 man7: $(DOC_MAN7) RST2MAN_DEP := $(shell command -v rst2man 2>/dev/null) -$(OUTPUT)$(HELPERS_RST): $(UP2DIR)../../include/uapi/linux/bpf.h - $(QUIET_GEN)$(UP2DIR)../../scripts/bpf_doc.py --filename $< > $@ +# Configure make rules for the man page bpf-$1.$2. +# $1 - target for scripts/bpf_doc.py +# $2 - man page section to generate the troff file +define DOCS_RULES = +$(OUTPUT)bpf-$1.rst: ../../../../include/uapi/linux/bpf.h + $$(QUIET_GEN)../../../../scripts/bpf_doc.py $1 \ + --filename $$< > $$@ $(OUTPUT)%.7: $(OUTPUT)%.rst ifndef RST2MAN_DEP @@ -43,18 +45,18 @@ ifndef RST2MAN_DEP endif $(QUIET_GEN)rst2man $< > $@ -helpers-clean: +docs-clean: $(call QUIET_CLEAN, eBPF_helpers-manpage) $(Q)$(RM) $(DOC_MAN7) $(OUTPUT)$(HELPERS_RST) -helpers-install: helpers +docs-install: helpers $(call QUIET_INSTALL, eBPF_helpers-manpage) $(Q)$(INSTALL) -d -m 755 $(DESTDIR)$(man7dir) $(Q)$(INSTALL) -m 644 $(DOC_MAN7) $(DESTDIR)$(man7dir) -helpers-uninstall: +docs-uninstall: $(call QUIET_UNINST, eBPF_helpers-manpage) $(Q)$(RM) $(addprefix $(DESTDIR)$(man7dir)/,$(_DOC_MAN7)) $(Q)$(RMDIR) $(DESTDIR)$(man7dir) -.PHONY: helpers helpers-clean helpers-install helpers-uninstall +.PHONY: docs docs-clean docs-install docs-uninstall diff --git a/tools/testing/selftests/bpf/test_bpftool_build.sh b/tools/testing/selftests/bpf/test_bpftool_build.sh index 2db3c60e1e61..ac349a5cea7e 100755 --- a/tools/testing/selftests/bpf/test_bpftool_build.sh +++ b/tools/testing/selftests/bpf/test_bpftool_build.sh @@ -85,23 +85,6 @@ make_with_tmpdir() { echo } -make_doc_and_clean() { - echo -e "\$PWD: $PWD" - echo -e "command: make -s $* doc >/dev/null" - RST2MAN_OPTS="--exit-status=1" make $J -s $* doc - if [ $? -ne 0 ] ; then - ERROR=1 - printf "FAILURE: Errors or warnings when building documentation\n" - fi - ( - if [ $# -ge 1 ] ; then - cd ${@: -1} - fi - make -s doc-clean - ) - echo -} - echo "Trying to build bpftool" echo -e "... through kbuild\n" @@ -162,7 +145,3 @@ make_and_clean make_with_tmpdir OUTPUT make_with_tmpdir O - -echo -e "Checking documentation build\n" -# From tools/bpf/bpftool -make_doc_and_clean diff --git a/tools/testing/selftests/bpf/test_doc_build.sh b/tools/testing/selftests/bpf/test_doc_build.sh new file mode 100755 index 000000000000..7eb940a7b2eb --- /dev/null +++ b/tools/testing/selftests/bpf/test_doc_build.sh @@ -0,0 +1,13 @@ +#!/bin/bash +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) + +# Assume script is located under tools/testing/selftests/bpf/. We want to start +# build attempts from the top of kernel repository. 
+SCRIPT_REL_PATH=$(realpath --relative-to=$PWD $0) +SCRIPT_REL_DIR=$(dirname $SCRIPT_REL_PATH) +KDIR_ROOT_DIR=$(realpath $PWD/$SCRIPT_REL_DIR/../../../../) +cd $KDIR_ROOT_DIR + +for tgt in docs docs-clean; do + make -s -C $PWD/$SCRIPT_REL_DIR $tgt; +done From 62b379a233a79e6f4d2e8b14750ae8fa13b8caf8 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Tue, 2 Mar 2021 09:19:44 -0800 Subject: [PATCH 68/90] selftests/bpf: Templatize man page generation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, the Makefile here was only targeting a single manual page so it just hardcoded a bunch of individual rules to specifically handle build, clean, install, uninstall for that particular page. Upcoming commits will generate manual pages for an additional section, so this commit prepares the makefile first by converting the existing targets into an evaluated set of targets based on the manual page name and section. Signed-off-by: Joe Stringer Signed-off-by: Alexei Starovoitov Reviewed-by: Quentin Monnet Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20210302171947.2268128-13-joe@cilium.io --- tools/testing/selftests/bpf/Makefile.docs | 40 ++++++++++++++--------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/tools/testing/selftests/bpf/Makefile.docs b/tools/testing/selftests/bpf/Makefile.docs index 546c4a763b46..f39ad19317c8 100644 --- a/tools/testing/selftests/bpf/Makefile.docs +++ b/tools/testing/selftests/bpf/Makefile.docs @@ -39,24 +39,34 @@ $(OUTPUT)bpf-$1.rst: ../../../../include/uapi/linux/bpf.h $$(QUIET_GEN)../../../../scripts/bpf_doc.py $1 \ --filename $$< > $$@ -$(OUTPUT)%.7: $(OUTPUT)%.rst +$(OUTPUT)%.$2: $(OUTPUT)%.rst ifndef RST2MAN_DEP - $(error "rst2man not found, but required to generate man pages") + $$(error "rst2man not found, but required to generate man pages") endif - $(QUIET_GEN)rst2man $< > $@ + $$(QUIET_GEN)rst2man $$< > $$@ -docs-clean: - $(call QUIET_CLEAN, eBPF_helpers-manpage) - $(Q)$(RM) $(DOC_MAN7) $(OUTPUT)$(HELPERS_RST) +docs-clean-$1: + $$(call QUIET_CLEAN, eBPF_$1-manpage) + $(Q)$(RM) $$(DOC_MAN$2) $(OUTPUT)bpf-$1.rst -docs-install: helpers - $(call QUIET_INSTALL, eBPF_helpers-manpage) - $(Q)$(INSTALL) -d -m 755 $(DESTDIR)$(man7dir) - $(Q)$(INSTALL) -m 644 $(DOC_MAN7) $(DESTDIR)$(man7dir) +docs-install-$1: docs + $$(call QUIET_INSTALL, eBPF_$1-manpage) + $(Q)$(INSTALL) -d -m 755 $(DESTDIR)$$(man$2dir) + $(Q)$(INSTALL) -m 644 $$(DOC_MAN$2) $(DESTDIR)$$(man$2dir) -docs-uninstall: - $(call QUIET_UNINST, eBPF_helpers-manpage) - $(Q)$(RM) $(addprefix $(DESTDIR)$(man7dir)/,$(_DOC_MAN7)) - $(Q)$(RMDIR) $(DESTDIR)$(man7dir) +docs-uninstall-$1: + $$(call QUIET_UNINST, eBPF_$1-manpage) + $(Q)$(RM) $$(addprefix $(DESTDIR)$$(man$2dir)/,$$(_DOC_MAN$2)) + $(Q)$(RMDIR) $(DESTDIR)$$(man$2dir) -.PHONY: docs docs-clean docs-install docs-uninstall +.PHONY: $1 docs-clean-$1 docs-install-$1 docs-uninstall-$1 +endef + +# Create the make targets to generate manual pages by name and section +$(eval $(call DOCS_RULES,helpers,7)) + +docs-clean: $(foreach doctarget,$(DOCTARGETS), docs-clean-$(doctarget)) +docs-install: $(foreach doctarget,$(DOCTARGETS), docs-install-$(doctarget)) +docs-uninstall: $(foreach doctarget,$(DOCTARGETS), docs-uninstall-$(doctarget)) + +.PHONY: docs docs-clean docs-install docs-uninstall man7 From accbd33a9b0328777899a85d148040e4d8921d87 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Tue, 2 Mar 2021 09:19:45 -0800 Subject: [PATCH 69/90] selftests/bpf: Test syscall command parsing 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add building of the bpf(2) syscall commands documentation as part of the docs building step in the build. This allows us to pick up on potential parse errors from the docs generator script as part of selftests. The generated manual pages here are not intended for distribution, they are just a fragment that can be integrated into the other static text of bpf(2) to form the full manual page. Signed-off-by: Joe Stringer Signed-off-by: Alexei Starovoitov Reviewed-by: Quentin Monnet Acked-by: Toke Høiland-Jørgensen Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210302171947.2268128-14-joe@cilium.io --- tools/testing/selftests/bpf/.gitignore | 1 + tools/testing/selftests/bpf/Makefile.docs | 14 ++++++++++++-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index a0d5ec3cfc24..4866f6a21901 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only bpf-helpers* +bpf-syscall* test_verifier test_maps test_lru_map diff --git a/tools/testing/selftests/bpf/Makefile.docs b/tools/testing/selftests/bpf/Makefile.docs index f39ad19317c8..ccf260021e83 100644 --- a/tools/testing/selftests/bpf/Makefile.docs +++ b/tools/testing/selftests/bpf/Makefile.docs @@ -15,18 +15,27 @@ endif prefix ?= /usr/local mandir ?= $(prefix)/man +man2dir = $(mandir)/man2 man7dir = $(mandir)/man7 +SYSCALL_RST = bpf-syscall.rst +MAN2_RST = $(SYSCALL_RST) + HELPERS_RST = bpf-helpers.rst MAN7_RST = $(HELPERS_RST) +_DOC_MAN2 = $(patsubst %.rst,%.2,$(MAN2_RST)) +DOC_MAN2 = $(addprefix $(OUTPUT),$(_DOC_MAN2)) + _DOC_MAN7 = $(patsubst %.rst,%.7,$(MAN7_RST)) DOC_MAN7 = $(addprefix $(OUTPUT),$(_DOC_MAN7)) -DOCTARGETS := helpers +DOCTARGETS := helpers syscall docs: $(DOCTARGETS) +syscall: man2 helpers: man7 +man2: $(DOC_MAN2) man7: $(DOC_MAN7) RST2MAN_DEP := $(shell command -v rst2man 2>/dev/null) @@ -64,9 +73,10 @@ endef # Create the make targets to generate manual pages by name and section $(eval $(call DOCS_RULES,helpers,7)) +$(eval $(call DOCS_RULES,syscall,2)) docs-clean: $(foreach doctarget,$(DOCTARGETS), docs-clean-$(doctarget)) docs-install: $(foreach doctarget,$(DOCTARGETS), docs-install-$(doctarget)) docs-uninstall: $(foreach doctarget,$(DOCTARGETS), docs-uninstall-$(doctarget)) -.PHONY: docs docs-clean docs-install docs-uninstall man7 +.PHONY: docs docs-clean docs-install docs-uninstall man2 man7 From 6197e5b7b1b5acd1e9b04bdf3527c694d84a27e2 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Tue, 2 Mar 2021 09:19:46 -0800 Subject: [PATCH 70/90] docs/bpf: Add bpf() syscall command reference MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Generate the syscall command reference from the UAPI header file and include it in the main bpf docs page. 
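The new Documentation/userspace-api/ebpf/syscall.rst page pulls its body from the kernel-doc "DOC:" comment blocks that earlier patches in this series added to include/uapi/linux/bpf.h. Abbreviated, the block consumed here has the following shape (excerpt only, shown for orientation):

  /**
   * DOC: eBPF Syscall Commands
   *
   * BPF_MAP_CREATE
   *	Description
   *		Create a map and return a file descriptor that refers to the
   *		map. The close-on-exec file descriptor flag (see **fcntl**\ (2))
   *		is automatically enabled for the new file descriptor.
   *
   *	Return
   *		A new file descriptor (a nonnegative integer), or -1 if an
   *		error occurred (in which case, *errno* is set appropriately).
   * ...
   */

Since the previous patch wired the "syscall" target of scripts/bpf_doc.py into the selftests docs build, formatting mistakes in this block also surface as selftests build failures.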
Signed-off-by: Joe Stringer Signed-off-by: Alexei Starovoitov Reviewed-by: Quentin Monnet Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20210302171947.2268128-15-joe@cilium.io --- Documentation/bpf/index.rst | 9 +++++--- Documentation/userspace-api/ebpf/index.rst | 17 ++++++++++++++ Documentation/userspace-api/ebpf/syscall.rst | 24 ++++++++++++++++++++ Documentation/userspace-api/index.rst | 1 + MAINTAINERS | 1 + 5 files changed, 49 insertions(+), 3 deletions(-) create mode 100644 Documentation/userspace-api/ebpf/index.rst create mode 100644 Documentation/userspace-api/ebpf/syscall.rst diff --git a/Documentation/bpf/index.rst b/Documentation/bpf/index.rst index 4f2874b729c3..a702f67dd45f 100644 --- a/Documentation/bpf/index.rst +++ b/Documentation/bpf/index.rst @@ -12,9 +12,6 @@ BPF instruction-set. The Cilium project also maintains a `BPF and XDP Reference Guide`_ that goes into great technical depth about the BPF Architecture. -The primary info for the bpf syscall is available in the `man-pages`_ -for `bpf(2)`_. - BPF Type Format (BTF) ===================== @@ -35,6 +32,12 @@ Two sets of Questions and Answers (Q&A) are maintained. bpf_design_QA bpf_devel_QA +Syscall API +=========== + +The primary info for the bpf syscall is available in the `man-pages`_ +for `bpf(2)`_. For more information about the userspace API, see +Documentation/userspace-api/ebpf/index.rst. Helper functions ================ diff --git a/Documentation/userspace-api/ebpf/index.rst b/Documentation/userspace-api/ebpf/index.rst new file mode 100644 index 000000000000..473dfba78116 --- /dev/null +++ b/Documentation/userspace-api/ebpf/index.rst @@ -0,0 +1,17 @@ +.. SPDX-License-Identifier: GPL-2.0 + +eBPF Userspace API +================== + +eBPF is a kernel mechanism to provide a sandboxed runtime environment in the +Linux kernel for runtime extension and instrumentation without changing kernel +source code or loading kernel modules. eBPF programs can be attached to various +kernel subsystems, including networking, tracing and Linux security modules +(LSM). + +For internal kernel documentation on eBPF, see Documentation/bpf/index.rst. + +.. toctree:: + :maxdepth: 1 + + syscall diff --git a/Documentation/userspace-api/ebpf/syscall.rst b/Documentation/userspace-api/ebpf/syscall.rst new file mode 100644 index 000000000000..ea9918084221 --- /dev/null +++ b/Documentation/userspace-api/ebpf/syscall.rst @@ -0,0 +1,24 @@ +.. SPDX-License-Identifier: GPL-2.0 + +eBPF Syscall +------------ + +:Authors: - Alexei Starovoitov + - Joe Stringer + - Michael Kerrisk + +The primary info for the bpf syscall is available in the `man-pages`_ +for `bpf(2)`_. + +bpf() subcommand reference +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. kernel-doc:: include/uapi/linux/bpf.h + :doc: eBPF Syscall Preamble + +.. kernel-doc:: include/uapi/linux/bpf.h + :doc: eBPF Syscall Commands + +.. Links: +.. _man-pages: https://www.kernel.org/doc/man-pages/ +.. _bpf(2): https://man7.org/linux/man-pages/man2/bpf.2.html diff --git a/Documentation/userspace-api/index.rst b/Documentation/userspace-api/index.rst index d29b020e5622..1e2438b7afa0 100644 --- a/Documentation/userspace-api/index.rst +++ b/Documentation/userspace-api/index.rst @@ -21,6 +21,7 @@ place where this information is gathered. 
unshare spec_ctrl accelerators/ocxl + ebpf/index ioctl/index iommu media/index diff --git a/MAINTAINERS b/MAINTAINERS index 8d56c7044067..4446d1455354 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3209,6 +3209,7 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git T: git git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git F: Documentation/bpf/ F: Documentation/networking/filter.rst +F: Documentation/userspace-api/ebpf/ F: arch/*/net/* F: include/linux/bpf* F: include/linux/filter.h From 242029f42691e05ac09b31b98221421bd564375e Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Tue, 2 Mar 2021 09:19:47 -0800 Subject: [PATCH 71/90] tools: Sync uapi bpf.h header with latest changes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Synchronize the header after all of the recent changes. Signed-off-by: Joe Stringer Signed-off-by: Alexei Starovoitov Reviewed-by: Quentin Monnet Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20210302171947.2268128-16-joe@cilium.io --- tools/include/uapi/linux/bpf.h | 712 ++++++++++++++++++++++++++++++++- 1 file changed, 711 insertions(+), 1 deletion(-) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index b4c5c529ad17..63a56ed6a785 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -93,7 +93,717 @@ union bpf_iter_link_info { } map; }; -/* BPF syscall commands, see bpf(2) man-page for details. */ +/* BPF syscall commands, see bpf(2) man-page for more details. */ +/** + * DOC: eBPF Syscall Preamble + * + * The operation to be performed by the **bpf**\ () system call is determined + * by the *cmd* argument. Each operation takes an accompanying argument, + * provided via *attr*, which is a pointer to a union of type *bpf_attr* (see + * below). The size argument is the size of the union pointed to by *attr*. + */ +/** + * DOC: eBPF Syscall Commands + * + * BPF_MAP_CREATE + * Description + * Create a map and return a file descriptor that refers to the + * map. The close-on-exec file descriptor flag (see **fcntl**\ (2)) + * is automatically enabled for the new file descriptor. + * + * Applying **close**\ (2) to the file descriptor returned by + * **BPF_MAP_CREATE** will delete the map (but see NOTES). + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_MAP_LOOKUP_ELEM + * Description + * Look up an element with a given *key* in the map referred to + * by the file descriptor *map_fd*. + * + * The *flags* argument may be specified as one of the + * following: + * + * **BPF_F_LOCK** + * Look up the value of a spin-locked map without + * returning the lock. This must be specified if the + * elements contain a spinlock. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_MAP_UPDATE_ELEM + * Description + * Create or update an element (key/value pair) in a specified map. + * + * The *flags* argument should be specified as one of the + * following: + * + * **BPF_ANY** + * Create a new element or update an existing element. + * **BPF_NOEXIST** + * Create a new element only if it did not exist. + * **BPF_EXIST** + * Update an existing element. + * **BPF_F_LOCK** + * Update a spin_lock-ed map element. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. 
+ * + * May set *errno* to **EINVAL**, **EPERM**, **ENOMEM**, + * **E2BIG**, **EEXIST**, or **ENOENT**. + * + * **E2BIG** + * The number of elements in the map reached the + * *max_entries* limit specified at map creation time. + * **EEXIST** + * If *flags* specifies **BPF_NOEXIST** and the element + * with *key* already exists in the map. + * **ENOENT** + * If *flags* specifies **BPF_EXIST** and the element with + * *key* does not exist in the map. + * + * BPF_MAP_DELETE_ELEM + * Description + * Look up and delete an element by key in a specified map. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_MAP_GET_NEXT_KEY + * Description + * Look up an element by key in a specified map and return the key + * of the next element. Can be used to iterate over all elements + * in the map. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * The following cases can be used to iterate over all elements of + * the map: + * + * * If *key* is not found, the operation returns zero and sets + * the *next_key* pointer to the key of the first element. + * * If *key* is found, the operation returns zero and sets the + * *next_key* pointer to the key of the next element. + * * If *key* is the last element, returns -1 and *errno* is set + * to **ENOENT**. + * + * May set *errno* to **ENOMEM**, **EFAULT**, **EPERM**, or + * **EINVAL** on error. + * + * BPF_PROG_LOAD + * Description + * Verify and load an eBPF program, returning a new file + * descriptor associated with the program. + * + * Applying **close**\ (2) to the file descriptor returned by + * **BPF_PROG_LOAD** will unload the eBPF program (but see NOTES). + * + * The close-on-exec file descriptor flag (see **fcntl**\ (2)) is + * automatically enabled for the new file descriptor. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_OBJ_PIN + * Description + * Pin an eBPF program or map referred by the specified *bpf_fd* + * to the provided *pathname* on the filesystem. + * + * The *pathname* argument must not contain a dot ("."). + * + * On success, *pathname* retains a reference to the eBPF object, + * preventing deallocation of the object when the original + * *bpf_fd* is closed. This allow the eBPF object to live beyond + * **close**\ (\ *bpf_fd*\ ), and hence the lifetime of the parent + * process. + * + * Applying **unlink**\ (2) or similar calls to the *pathname* + * unpins the object from the filesystem, removing the reference. + * If no other file descriptors or filesystem nodes refer to the + * same object, it will be deallocated (see NOTES). + * + * The filesystem type for the parent directory of *pathname* must + * be **BPF_FS_MAGIC**. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_OBJ_GET + * Description + * Open a file descriptor for the eBPF object pinned to the + * specified *pathname*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_PROG_ATTACH + * Description + * Attach an eBPF program to a *target_fd* at the specified + * *attach_type* hook. + * + * The *attach_type* specifies the eBPF attachment point to + * attach the program to, and must be one of *bpf_attach_type* + * (see below). 
+ * + * The *attach_bpf_fd* must be a valid file descriptor for a + * loaded eBPF program of a cgroup, flow dissector, LIRC, sockmap + * or sock_ops type corresponding to the specified *attach_type*. + * + * The *target_fd* must be a valid file descriptor for a kernel + * object which depends on the attach type of *attach_bpf_fd*: + * + * **BPF_PROG_TYPE_CGROUP_DEVICE**, + * **BPF_PROG_TYPE_CGROUP_SKB**, + * **BPF_PROG_TYPE_CGROUP_SOCK**, + * **BPF_PROG_TYPE_CGROUP_SOCK_ADDR**, + * **BPF_PROG_TYPE_CGROUP_SOCKOPT**, + * **BPF_PROG_TYPE_CGROUP_SYSCTL**, + * **BPF_PROG_TYPE_SOCK_OPS** + * + * Control Group v2 hierarchy with the eBPF controller + * enabled. Requires the kernel to be compiled with + * **CONFIG_CGROUP_BPF**. + * + * **BPF_PROG_TYPE_FLOW_DISSECTOR** + * + * Network namespace (eg /proc/self/ns/net). + * + * **BPF_PROG_TYPE_LIRC_MODE2** + * + * LIRC device path (eg /dev/lircN). Requires the kernel + * to be compiled with **CONFIG_BPF_LIRC_MODE2**. + * + * **BPF_PROG_TYPE_SK_SKB**, + * **BPF_PROG_TYPE_SK_MSG** + * + * eBPF map of socket type (eg **BPF_MAP_TYPE_SOCKHASH**). + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_PROG_DETACH + * Description + * Detach the eBPF program associated with the *target_fd* at the + * hook specified by *attach_type*. The program must have been + * previously attached using **BPF_PROG_ATTACH**. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_PROG_TEST_RUN + * Description + * Run the eBPF program associated with the *prog_fd* a *repeat* + * number of times against a provided program context *ctx_in* and + * data *data_in*, and return the modified program context + * *ctx_out*, *data_out* (for example, packet data), result of the + * execution *retval*, and *duration* of the test run. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * **ENOSPC** + * Either *data_size_out* or *ctx_size_out* is too small. + * **ENOTSUPP** + * This command is not supported by the program type of + * the program referred to by *prog_fd*. + * + * BPF_PROG_GET_NEXT_ID + * Description + * Fetch the next eBPF program currently loaded into the kernel. + * + * Looks for the eBPF program with an id greater than *start_id* + * and updates *next_id* on success. If no other eBPF programs + * remain with ids higher than *start_id*, returns -1 and sets + * *errno* to **ENOENT**. + * + * Return + * Returns zero on success. On error, or when no id remains, -1 + * is returned and *errno* is set appropriately. + * + * BPF_MAP_GET_NEXT_ID + * Description + * Fetch the next eBPF map currently loaded into the kernel. + * + * Looks for the eBPF map with an id greater than *start_id* + * and updates *next_id* on success. If no other eBPF maps + * remain with ids higher than *start_id*, returns -1 and sets + * *errno* to **ENOENT**. + * + * Return + * Returns zero on success. On error, or when no id remains, -1 + * is returned and *errno* is set appropriately. + * + * BPF_PROG_GET_FD_BY_ID + * Description + * Open a file descriptor for the eBPF program corresponding to + * *prog_id*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_MAP_GET_FD_BY_ID + * Description + * Open a file descriptor for the eBPF map corresponding to + * *map_id*. 
+ * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_OBJ_GET_INFO_BY_FD + * Description + * Obtain information about the eBPF object corresponding to + * *bpf_fd*. + * + * Populates up to *info_len* bytes of *info*, which will be in + * one of the following formats depending on the eBPF object type + * of *bpf_fd*: + * + * * **struct bpf_prog_info** + * * **struct bpf_map_info** + * * **struct bpf_btf_info** + * * **struct bpf_link_info** + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_PROG_QUERY + * Description + * Obtain information about eBPF programs associated with the + * specified *attach_type* hook. + * + * The *target_fd* must be a valid file descriptor for a kernel + * object which depends on the attach type of *attach_bpf_fd*: + * + * **BPF_PROG_TYPE_CGROUP_DEVICE**, + * **BPF_PROG_TYPE_CGROUP_SKB**, + * **BPF_PROG_TYPE_CGROUP_SOCK**, + * **BPF_PROG_TYPE_CGROUP_SOCK_ADDR**, + * **BPF_PROG_TYPE_CGROUP_SOCKOPT**, + * **BPF_PROG_TYPE_CGROUP_SYSCTL**, + * **BPF_PROG_TYPE_SOCK_OPS** + * + * Control Group v2 hierarchy with the eBPF controller + * enabled. Requires the kernel to be compiled with + * **CONFIG_CGROUP_BPF**. + * + * **BPF_PROG_TYPE_FLOW_DISSECTOR** + * + * Network namespace (eg /proc/self/ns/net). + * + * **BPF_PROG_TYPE_LIRC_MODE2** + * + * LIRC device path (eg /dev/lircN). Requires the kernel + * to be compiled with **CONFIG_BPF_LIRC_MODE2**. + * + * **BPF_PROG_QUERY** always fetches the number of programs + * attached and the *attach_flags* which were used to attach those + * programs. Additionally, if *prog_ids* is nonzero and the number + * of attached programs is less than *prog_cnt*, populates + * *prog_ids* with the eBPF program ids of the programs attached + * at *target_fd*. + * + * The following flags may alter the result: + * + * **BPF_F_QUERY_EFFECTIVE** + * Only return information regarding programs which are + * currently effective at the specified *target_fd*. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_RAW_TRACEPOINT_OPEN + * Description + * Attach an eBPF program to a tracepoint *name* to access kernel + * internal arguments of the tracepoint in their raw form. + * + * The *prog_fd* must be a valid file descriptor associated with + * a loaded eBPF program of type **BPF_PROG_TYPE_RAW_TRACEPOINT**. + * + * No ABI guarantees are made about the content of tracepoint + * arguments exposed to the corresponding eBPF program. + * + * Applying **close**\ (2) to the file descriptor returned by + * **BPF_RAW_TRACEPOINT_OPEN** will delete the map (but see NOTES). + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_BTF_LOAD + * Description + * Verify and load BPF Type Format (BTF) metadata into the kernel, + * returning a new file descriptor associated with the metadata. + * BTF is described in more detail at + * https://www.kernel.org/doc/html/latest/bpf/btf.html. + * + * The *btf* parameter must point to valid memory providing + * *btf_size* bytes of BTF binary metadata. + * + * The returned file descriptor can be passed to other **bpf**\ () + * subcommands such as **BPF_PROG_LOAD** or **BPF_MAP_CREATE** to + * associate the BTF with those objects. 
+ * + * Similar to **BPF_PROG_LOAD**, **BPF_BTF_LOAD** has optional + * parameters to specify a *btf_log_buf*, *btf_log_size* and + * *btf_log_level* which allow the kernel to return freeform log + * output regarding the BTF verification process. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_BTF_GET_FD_BY_ID + * Description + * Open a file descriptor for the BPF Type Format (BTF) + * corresponding to *btf_id*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_TASK_FD_QUERY + * Description + * Obtain information about eBPF programs associated with the + * target process identified by *pid* and *fd*. + * + * If the *pid* and *fd* are associated with a tracepoint, kprobe + * or uprobe perf event, then the *prog_id* and *fd_type* will + * be populated with the eBPF program id and file descriptor type + * of type **bpf_task_fd_type**. If associated with a kprobe or + * uprobe, the *probe_offset* and *probe_addr* will also be + * populated. Optionally, if *buf* is provided, then up to + * *buf_len* bytes of *buf* will be populated with the name of + * the tracepoint, kprobe or uprobe. + * + * The resulting *prog_id* may be introspected in deeper detail + * using **BPF_PROG_GET_FD_BY_ID** and **BPF_OBJ_GET_INFO_BY_FD**. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_MAP_LOOKUP_AND_DELETE_ELEM + * Description + * Look up an element with the given *key* in the map referred to + * by the file descriptor *fd*, and if found, delete the element. + * + * The **BPF_MAP_TYPE_QUEUE** and **BPF_MAP_TYPE_STACK** map types + * implement this command as a "pop" operation, deleting the top + * element rather than one corresponding to *key*. + * The *key* and *key_len* parameters should be zeroed when + * issuing this operation for these map types. + * + * This command is only valid for the following map types: + * * **BPF_MAP_TYPE_QUEUE** + * * **BPF_MAP_TYPE_STACK** + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_MAP_FREEZE + * Description + * Freeze the permissions of the specified map. + * + * Write permissions may be frozen by passing zero *flags*. + * Upon success, no future syscall invocations may alter the + * map state of *map_fd*. Write operations from eBPF programs + * are still possible for a frozen map. + * + * Not supported for maps of type **BPF_MAP_TYPE_STRUCT_OPS**. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_BTF_GET_NEXT_ID + * Description + * Fetch the next BPF Type Format (BTF) object currently loaded + * into the kernel. + * + * Looks for the BTF object with an id greater than *start_id* + * and updates *next_id* on success. If no other BTF objects + * remain with ids higher than *start_id*, returns -1 and sets + * *errno* to **ENOENT**. + * + * Return + * Returns zero on success. On error, or when no id remains, -1 + * is returned and *errno* is set appropriately. + * + * BPF_MAP_LOOKUP_BATCH + * Description + * Iterate and fetch multiple elements in a map. + * + * Two opaque values are used to manage batch operations, + * *in_batch* and *out_batch*. Initially, *in_batch* must be set + * to NULL to begin the batched operation. 
After each subsequent + * **BPF_MAP_LOOKUP_BATCH**, the caller should pass the resultant + * *out_batch* as the *in_batch* for the next operation to + * continue iteration from the current point. + * + * The *keys* and *values* are output parameters which must point + * to memory large enough to hold *count* items based on the key + * and value size of the map *map_fd*. The *keys* buffer must be + * of *key_size* * *count*. The *values* buffer must be of + * *value_size* * *count*. + * + * The *elem_flags* argument may be specified as one of the + * following: + * + * **BPF_F_LOCK** + * Look up the value of a spin-locked map without + * returning the lock. This must be specified if the + * elements contain a spinlock. + * + * On success, *count* elements from the map are copied into the + * user buffer, with the keys copied into *keys* and the values + * copied into the corresponding indices in *values*. + * + * If an error is returned and *errno* is not **EFAULT**, *count* + * is set to the number of successfully processed elements. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * May set *errno* to **ENOSPC** to indicate that *keys* or + * *values* is too small to dump an entire bucket during + * iteration of a hash-based map type. + * + * BPF_MAP_LOOKUP_AND_DELETE_BATCH + * Description + * Iterate and delete all elements in a map. + * + * This operation has the same behavior as + * **BPF_MAP_LOOKUP_BATCH** with two exceptions: + * + * * Every element that is successfully returned is also deleted + * from the map. This is at least *count* elements. Note that + * *count* is both an input and an output parameter. + * * Upon returning with *errno* set to **EFAULT**, up to + * *count* elements may be deleted without returning the keys + * and values of the deleted elements. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_MAP_UPDATE_BATCH + * Description + * Update multiple elements in a map by *key*. + * + * The *keys* and *values* are input parameters which must point + * to memory large enough to hold *count* items based on the key + * and value size of the map *map_fd*. The *keys* buffer must be + * of *key_size* * *count*. The *values* buffer must be of + * *value_size* * *count*. + * + * Each element specified in *keys* is sequentially updated to the + * value in the corresponding index in *values*. The *in_batch* + * and *out_batch* parameters are ignored and should be zeroed. + * + * The *elem_flags* argument should be specified as one of the + * following: + * + * **BPF_ANY** + * Create new elements or update a existing elements. + * **BPF_NOEXIST** + * Create new elements only if they do not exist. + * **BPF_EXIST** + * Update existing elements. + * **BPF_F_LOCK** + * Update spin_lock-ed map elements. This must be + * specified if the map value contains a spinlock. + * + * On success, *count* elements from the map are updated. + * + * If an error is returned and *errno* is not **EFAULT**, *count* + * is set to the number of successfully processed elements. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * May set *errno* to **EINVAL**, **EPERM**, **ENOMEM**, or + * **E2BIG**. **E2BIG** indicates that the number of elements in + * the map reached the *max_entries* limit specified at map + * creation time. 
+ * + * May set *errno* to one of the following error codes under + * specific circumstances: + * + * **EEXIST** + * If *flags* specifies **BPF_NOEXIST** and the element + * with *key* already exists in the map. + * **ENOENT** + * If *flags* specifies **BPF_EXIST** and the element with + * *key* does not exist in the map. + * + * BPF_MAP_DELETE_BATCH + * Description + * Delete multiple elements in a map by *key*. + * + * The *keys* parameter is an input parameter which must point + * to memory large enough to hold *count* items based on the key + * size of the map *map_fd*, that is, *key_size* * *count*. + * + * Each element specified in *keys* is sequentially deleted. The + * *in_batch*, *out_batch*, and *values* parameters are ignored + * and should be zeroed. + * + * The *elem_flags* argument may be specified as one of the + * following: + * + * **BPF_F_LOCK** + * Look up the value of a spin-locked map without + * returning the lock. This must be specified if the + * elements contain a spinlock. + * + * On success, *count* elements from the map are updated. + * + * If an error is returned and *errno* is not **EFAULT**, *count* + * is set to the number of successfully processed elements. If + * *errno* is **EFAULT**, up to *count* elements may be been + * deleted. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_LINK_CREATE + * Description + * Attach an eBPF program to a *target_fd* at the specified + * *attach_type* hook and return a file descriptor handle for + * managing the link. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_LINK_UPDATE + * Description + * Update the eBPF program in the specified *link_fd* to + * *new_prog_fd*. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_LINK_GET_FD_BY_ID + * Description + * Open a file descriptor for the eBPF Link corresponding to + * *link_id*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_LINK_GET_NEXT_ID + * Description + * Fetch the next eBPF link currently loaded into the kernel. + * + * Looks for the eBPF link with an id greater than *start_id* + * and updates *next_id* on success. If no other eBPF links + * remain with ids higher than *start_id*, returns -1 and sets + * *errno* to **ENOENT**. + * + * Return + * Returns zero on success. On error, or when no id remains, -1 + * is returned and *errno* is set appropriately. + * + * BPF_ENABLE_STATS + * Description + * Enable eBPF runtime statistics gathering. + * + * Runtime statistics gathering for the eBPF runtime is disabled + * by default to minimize the corresponding performance overhead. + * This command enables statistics globally. + * + * Multiple programs may independently enable statistics. + * After gathering the desired statistics, eBPF runtime statistics + * may be disabled again by calling **close**\ (2) for the file + * descriptor returned by this function. Statistics will only be + * disabled system-wide when all outstanding file descriptors + * returned by prior calls for this subcommand are closed. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). 
+ * + * BPF_ITER_CREATE + * Description + * Create an iterator on top of the specified *link_fd* (as + * previously created using **BPF_LINK_CREATE**) and return a + * file descriptor that can be used to trigger the iteration. + * + * If the resulting file descriptor is pinned to the filesystem + * using **BPF_OBJ_PIN**, then subsequent **read**\ (2) syscalls + * for that path will trigger the iterator to read kernel state + * using the eBPF program attached to *link_fd*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_LINK_DETACH + * Description + * Forcefully detach the specified *link_fd* from its + * corresponding attachment point. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_PROG_BIND_MAP + * Description + * Bind a map to the lifetime of an eBPF program. + * + * The map identified by *map_fd* is bound to the program + * identified by *prog_fd* and only released when *prog_fd* is + * released. This may be used in cases where metadata should be + * associated with a program which otherwise does not contain any + * references to the map (for example, embedded in the eBPF + * program instructions). + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * NOTES + * eBPF objects (maps and programs) can be shared between processes. + * + * * After **fork**\ (2), the child inherits file descriptors + * referring to the same eBPF objects. + * * File descriptors referring to eBPF objects can be transferred over + * **unix**\ (7) domain sockets. + * * File descriptors referring to eBPF objects can be duplicated in the + * usual way, using **dup**\ (2) and similar calls. + * * File descriptors referring to eBPF objects can be pinned to the + * filesystem using the **BPF_OBJ_PIN** command of **bpf**\ (2). + * + * An eBPF object is deallocated only after all file descriptors referring + * to the object have been closed and no references remain pinned to the + * filesystem or attached (for example, bound to a program or device). + */ enum bpf_cmd { BPF_MAP_CREATE, BPF_MAP_LOOKUP_ELEM, From 607b9cc92bd7208338d714a22b8082fe83bcb177 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Wed, 3 Mar 2021 10:18:12 +0000 Subject: [PATCH 72/90] bpf: Consolidate shared test timing code Share the timing / signal interruption logic between different implementations of PROG_TEST_RUN. There is a change in behaviour as well. We check the loop exit condition before checking for pending signals. This resolves an edge case where a signal arrives during the last iteration. Instead of aborting with EINTR we return the successful result to user space. 
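As an illustration of the semantics documented above, and of the behaviour change this commit describes, a minimal user-space sketch that drives BPF_PROG_TEST_RUN through the raw bpf(2) syscall could look as follows. The program type, packet buffer size and error handling are assumptions made for the example only, not part of this patch:

  #include <errno.h>
  #include <stdio.h>
  #include <string.h>
  #include <unistd.h>
  #include <sys/syscall.h>
  #include <linux/bpf.h>

  /* Run an already loaded program (e.g. BPF_PROG_TYPE_SCHED_CLS) against a
   * packet buffer 'repeat' times; 'duration' comes back as the average
   * runtime in nanoseconds over all iterations. */
  static int prog_test_run_avg(int prog_fd, void *pkt, __u32 pkt_len, __u32 repeat)
  {
          union bpf_attr attr;
          char out[1500];

          memset(&attr, 0, sizeof(attr));
          attr.test.prog_fd = prog_fd;
          attr.test.repeat = repeat;                 /* 0 is treated as 1 */
          attr.test.data_in = (__u64)(unsigned long)pkt;
          attr.test.data_size_in = pkt_len;
          attr.test.data_out = (__u64)(unsigned long)out;
          attr.test.data_size_out = sizeof(out);     /* ENOSPC if too small */

          if (syscall(__NR_bpf, BPF_PROG_TEST_RUN, &attr, sizeof(attr)) < 0)
                  return -errno;                     /* e.g. EINTR on signal */

          printf("retval=%u avg_ns=%u out_len=%u\n",
                 attr.test.retval, attr.test.duration, attr.test.data_size_out);
          return 0;
  }

With the consolidated loop below, a signal that arrives only during the final iteration no longer turns an otherwise completed run into -EINTR for such a caller.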
Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210303101816.36774-2-lmb@cloudflare.com --- net/bpf/test_run.c | 141 +++++++++++++++++++++++++-------------------- 1 file changed, 78 insertions(+), 63 deletions(-) diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 58bcb8c849d5..eb3c78cd4d7c 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -16,14 +16,78 @@ #define CREATE_TRACE_POINTS #include +struct bpf_test_timer { + enum { NO_PREEMPT, NO_MIGRATE } mode; + u32 i; + u64 time_start, time_spent; +}; + +static void bpf_test_timer_enter(struct bpf_test_timer *t) + __acquires(rcu) +{ + rcu_read_lock(); + if (t->mode == NO_PREEMPT) + preempt_disable(); + else + migrate_disable(); + + t->time_start = ktime_get_ns(); +} + +static void bpf_test_timer_leave(struct bpf_test_timer *t) + __releases(rcu) +{ + t->time_start = 0; + + if (t->mode == NO_PREEMPT) + preempt_enable(); + else + migrate_enable(); + rcu_read_unlock(); +} + +static bool bpf_test_timer_continue(struct bpf_test_timer *t, u32 repeat, int *err, u32 *duration) + __must_hold(rcu) +{ + t->i++; + if (t->i >= repeat) { + /* We're done. */ + t->time_spent += ktime_get_ns() - t->time_start; + do_div(t->time_spent, t->i); + *duration = t->time_spent > U32_MAX ? U32_MAX : (u32)t->time_spent; + *err = 0; + goto reset; + } + + if (signal_pending(current)) { + /* During iteration: we've been cancelled, abort. */ + *err = -EINTR; + goto reset; + } + + if (need_resched()) { + /* During iteration: we need to reschedule between runs. */ + t->time_spent += ktime_get_ns() - t->time_start; + bpf_test_timer_leave(t); + cond_resched(); + bpf_test_timer_enter(t); + } + + /* Do another round. */ + return true; + +reset: + t->i = 0; + return false; +} + static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *retval, u32 *time, bool xdp) { struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = { NULL }; + struct bpf_test_timer t = { NO_MIGRATE }; enum bpf_cgroup_storage_type stype; - u64 time_start, time_spent = 0; - int ret = 0; - u32 i; + int ret; for_each_cgroup_storage_type(stype) { storage[stype] = bpf_cgroup_storage_alloc(prog, stype); @@ -38,40 +102,16 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, if (!repeat) repeat = 1; - rcu_read_lock(); - migrate_disable(); - time_start = ktime_get_ns(); - for (i = 0; i < repeat; i++) { + bpf_test_timer_enter(&t); + do { bpf_cgroup_storage_set(storage); if (xdp) *retval = bpf_prog_run_xdp(prog, ctx); else *retval = BPF_PROG_RUN(prog, ctx); - - if (signal_pending(current)) { - ret = -EINTR; - break; - } - - if (need_resched()) { - time_spent += ktime_get_ns() - time_start; - migrate_enable(); - rcu_read_unlock(); - - cond_resched(); - - rcu_read_lock(); - migrate_disable(); - time_start = ktime_get_ns(); - } - } - time_spent += ktime_get_ns() - time_start; - migrate_enable(); - rcu_read_unlock(); - - do_div(time_spent, repeat); - *time = time_spent > U32_MAX ? 
U32_MAX : (u32)time_spent; + } while (bpf_test_timer_continue(&t, repeat, &ret, time)); + bpf_test_timer_leave(&t); for_each_cgroup_storage_type(stype) bpf_cgroup_storage_free(storage[stype]); @@ -674,18 +714,17 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { + struct bpf_test_timer t = { NO_PREEMPT }; u32 size = kattr->test.data_size_in; struct bpf_flow_dissector ctx = {}; u32 repeat = kattr->test.repeat; struct bpf_flow_keys *user_ctx; struct bpf_flow_keys flow_keys; - u64 time_start, time_spent = 0; const struct ethhdr *eth; unsigned int flags = 0; u32 retval, duration; void *data; int ret; - u32 i; if (prog->type != BPF_PROG_TYPE_FLOW_DISSECTOR) return -EINVAL; @@ -721,39 +760,15 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, ctx.data = data; ctx.data_end = (__u8 *)data + size; - rcu_read_lock(); - preempt_disable(); - time_start = ktime_get_ns(); - for (i = 0; i < repeat; i++) { + bpf_test_timer_enter(&t); + do { retval = bpf_flow_dissect(prog, &ctx, eth->h_proto, ETH_HLEN, size, flags); + } while (bpf_test_timer_continue(&t, repeat, &ret, &duration)); + bpf_test_timer_leave(&t); - if (signal_pending(current)) { - preempt_enable(); - rcu_read_unlock(); - - ret = -EINTR; - goto out; - } - - if (need_resched()) { - time_spent += ktime_get_ns() - time_start; - preempt_enable(); - rcu_read_unlock(); - - cond_resched(); - - rcu_read_lock(); - preempt_disable(); - time_start = ktime_get_ns(); - } - } - time_spent += ktime_get_ns() - time_start; - preempt_enable(); - rcu_read_unlock(); - - do_div(time_spent, repeat); - duration = time_spent > U32_MAX ? U32_MAX : (u32)time_spent; + if (ret < 0) + goto out; ret = bpf_test_finish(kattr, uattr, &flow_keys, sizeof(flow_keys), retval, duration); From 7c32e8f8bc33a5f4b113a630857e46634e3e143b Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Wed, 3 Mar 2021 10:18:13 +0000 Subject: [PATCH 73/90] bpf: Add PROG_TEST_RUN support for sk_lookup programs Allow to pass sk_lookup programs to PROG_TEST_RUN. User space provides the full bpf_sk_lookup struct as context. Since the context includes a socket pointer that can't be exposed to user space we define that PROG_TEST_RUN returns the cookie of the selected socket or zero in place of the socket pointer. We don't support testing programs that select a reuseport socket, since this would mean running another (unrelated) BPF program from the sk_lookup test handler. 
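To make the new interface concrete, here is a rough user-space sketch, modelled on the selftest converted later in this series, of exercising an sk_lookup program and reading back the cookie of the socket it selected. The addresses and ports are illustrative values, not mandated by the patch:

  #include <arpa/inet.h>
  #include <netinet/in.h>
  #include <stdio.h>
  #include <bpf/bpf.h>
  #include <bpf/libbpf.h>
  #include <linux/bpf.h>

  static void test_lookup_prog(int prog_fd)
  {
          struct bpf_sk_lookup ctx = {
                  .family      = AF_INET,
                  .protocol    = IPPROTO_TCP,
                  .remote_ip4  = htonl(0x7f000002),  /* 127.0.0.2, network order */
                  .remote_port = htons(8008),        /* network order */
                  .local_ip4   = htonl(0x7f000001),  /* 127.0.0.1 */
                  .local_port  = 7007,               /* host order, as at the hook */
          };
          DECLARE_LIBBPF_OPTS(bpf_test_run_opts, opts,
                  .ctx_in = &ctx, .ctx_size_in = sizeof(ctx),
                  .ctx_out = &ctx, .ctx_size_out = sizeof(ctx),
          );

          if (bpf_prog_test_run_opts(prog_fd, &opts))
                  return;

          /* cookie overlays the kernel-only sk pointer: non-zero iff the
           * program selected a socket; compare against SO_COOKIE of the
           * expected listener to identify it. */
          printf("retval=%u cookie=%llu\n", opts.retval,
                 (unsigned long long)ctx.cookie);
  }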
Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210303101816.36774-3-lmb@cloudflare.com --- include/linux/bpf.h | 10 ++++ include/uapi/linux/bpf.h | 5 +- net/bpf/test_run.c | 105 +++++++++++++++++++++++++++++++++ net/core/filter.c | 1 + tools/include/uapi/linux/bpf.h | 5 +- 5 files changed, 124 insertions(+), 2 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4c730863fa77..c931bc97019d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1491,6 +1491,9 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, int bpf_prog_test_run_raw_tp(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); +int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, + const union bpf_attr *kattr, + union bpf_attr __user *uattr); bool btf_ctx_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info); @@ -1692,6 +1695,13 @@ static inline int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, return -ENOTSUPP; } +static inline int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, + const union bpf_attr *kattr, + union bpf_attr __user *uattr) +{ + return -ENOTSUPP; +} + static inline void bpf_map_put(struct bpf_map *map) { } diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 63a56ed6a785..7f530e349aff 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5953,7 +5953,10 @@ struct bpf_pidns_info { /* User accessible data for SK_LOOKUP programs. Add new fields at the end. */ struct bpf_sk_lookup { - __bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket */ + union { + __bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket */ + __u64 cookie; /* Non-zero if socket was selected in PROG_TEST_RUN */ + }; __u32 family; /* Protocol family (AF_INET, AF_INET6) */ __u32 protocol; /* IP protocol (IPPROTO_TCP, IPPROTO_UDP) */ diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index eb3c78cd4d7c..0abdd67f44b1 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -10,8 +10,10 @@ #include #include #include +#include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -781,3 +783,106 @@ out: kfree(data); return ret; } + +int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, const union bpf_attr *kattr, + union bpf_attr __user *uattr) +{ + struct bpf_test_timer t = { NO_PREEMPT }; + struct bpf_prog_array *progs = NULL; + struct bpf_sk_lookup_kern ctx = {}; + u32 repeat = kattr->test.repeat; + struct bpf_sk_lookup *user_ctx; + u32 retval, duration; + int ret = -EINVAL; + + if (prog->type != BPF_PROG_TYPE_SK_LOOKUP) + return -EINVAL; + + if (kattr->test.flags || kattr->test.cpu) + return -EINVAL; + + if (kattr->test.data_in || kattr->test.data_size_in || kattr->test.data_out || + kattr->test.data_size_out) + return -EINVAL; + + if (!repeat) + repeat = 1; + + user_ctx = bpf_ctx_init(kattr, sizeof(*user_ctx)); + if (IS_ERR(user_ctx)) + return PTR_ERR(user_ctx); + + if (!user_ctx) + return -EINVAL; + + if (user_ctx->sk) + goto out; + + if (!range_is_zero(user_ctx, offsetofend(typeof(*user_ctx), local_port), sizeof(*user_ctx))) + goto out; + + if (user_ctx->local_port > U16_MAX || user_ctx->remote_port > U16_MAX) { + ret = -ERANGE; + goto out; + } + + ctx.family = (u16)user_ctx->family; + ctx.protocol = (u16)user_ctx->protocol; + ctx.dport = (u16)user_ctx->local_port; + ctx.sport = (__force __be16)user_ctx->remote_port; + + switch (ctx.family) { + case AF_INET: + ctx.v4.daddr = (__force 
__be32)user_ctx->local_ip4; + ctx.v4.saddr = (__force __be32)user_ctx->remote_ip4; + break; + +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + ctx.v6.daddr = (struct in6_addr *)user_ctx->local_ip6; + ctx.v6.saddr = (struct in6_addr *)user_ctx->remote_ip6; + break; +#endif + + default: + ret = -EAFNOSUPPORT; + goto out; + } + + progs = bpf_prog_array_alloc(1, GFP_KERNEL); + if (!progs) { + ret = -ENOMEM; + goto out; + } + + progs->items[0].prog = prog; + + bpf_test_timer_enter(&t); + do { + ctx.selected_sk = NULL; + retval = BPF_PROG_SK_LOOKUP_RUN_ARRAY(progs, ctx, BPF_PROG_RUN); + } while (bpf_test_timer_continue(&t, repeat, &ret, &duration)); + bpf_test_timer_leave(&t); + + if (ret < 0) + goto out; + + user_ctx->cookie = 0; + if (ctx.selected_sk) { + if (ctx.selected_sk->sk_reuseport && !ctx.no_reuseport) { + ret = -EOPNOTSUPP; + goto out; + } + + user_ctx->cookie = sock_gen_cookie(ctx.selected_sk); + } + + ret = bpf_test_finish(kattr, uattr, NULL, 0, retval, duration); + if (!ret) + ret = bpf_ctx_finish(kattr, uattr, user_ctx, sizeof(*user_ctx)); + +out: + bpf_prog_array_free(progs); + kfree(user_ctx); + return ret; +} diff --git a/net/core/filter.c b/net/core/filter.c index 13bcf248ee7b..a526db494c62 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -10457,6 +10457,7 @@ static u32 sk_lookup_convert_ctx_access(enum bpf_access_type type, } const struct bpf_prog_ops sk_lookup_prog_ops = { + .test_run = bpf_prog_test_run_sk_lookup, }; const struct bpf_verifier_ops sk_lookup_verifier_ops = { diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 63a56ed6a785..7f530e349aff 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5953,7 +5953,10 @@ struct bpf_pidns_info { /* User accessible data for SK_LOOKUP programs. Add new fields at the end. */ struct bpf_sk_lookup { - __bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket */ + union { + __bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket */ + __u64 cookie; /* Non-zero if socket was selected in PROG_TEST_RUN */ + }; __u32 family; /* Protocol family (AF_INET, AF_INET6) */ __u32 protocol; /* IP protocol (IPPROTO_TCP, IPPROTO_UDP) */ From 509b2937bce90089fd2785db9f27951a3d850c34 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Wed, 3 Mar 2021 10:18:14 +0000 Subject: [PATCH 74/90] selftests: bpf: Convert sk_lookup ctx access tests to PROG_TEST_RUN Convert the selftests for sk_lookup narrow context access to use PROG_TEST_RUN instead of creating actual sockets. This ensures that ctx is populated correctly when using PROG_TEST_RUN. Assert concrete values since we now control remote_ip and remote_port. 
Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210303101816.36774-4-lmb@cloudflare.com --- .../selftests/bpf/prog_tests/sk_lookup.c | 83 +++++++++++++++---- .../selftests/bpf/progs/test_sk_lookup.c | 62 +++++++++----- 2 files changed, 109 insertions(+), 36 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/sk_lookup.c b/tools/testing/selftests/bpf/prog_tests/sk_lookup.c index 9ff0412e1fd3..45c82db3c58c 100644 --- a/tools/testing/selftests/bpf/prog_tests/sk_lookup.c +++ b/tools/testing/selftests/bpf/prog_tests/sk_lookup.c @@ -241,6 +241,48 @@ fail: return -1; } +static __u64 socket_cookie(int fd) +{ + __u64 cookie; + socklen_t cookie_len = sizeof(cookie); + + if (CHECK(getsockopt(fd, SOL_SOCKET, SO_COOKIE, &cookie, &cookie_len) < 0, + "getsockopt(SO_COOKIE)", "%s\n", strerror(errno))) + return 0; + return cookie; +} + +static int fill_sk_lookup_ctx(struct bpf_sk_lookup *ctx, const char *local_ip, __u16 local_port, + const char *remote_ip, __u16 remote_port) +{ + void *local, *remote; + int err; + + memset(ctx, 0, sizeof(*ctx)); + ctx->local_port = local_port; + ctx->remote_port = htons(remote_port); + + if (is_ipv6(local_ip)) { + ctx->family = AF_INET6; + local = &ctx->local_ip6[0]; + remote = &ctx->remote_ip6[0]; + } else { + ctx->family = AF_INET; + local = &ctx->local_ip4; + remote = &ctx->remote_ip4; + } + + err = inet_pton(ctx->family, local_ip, local); + if (CHECK(err != 1, "inet_pton", "local_ip failed\n")) + return 1; + + err = inet_pton(ctx->family, remote_ip, remote); + if (CHECK(err != 1, "inet_pton", "remote_ip failed\n")) + return 1; + + return 0; +} + static int send_byte(int fd) { ssize_t n; @@ -1009,18 +1051,27 @@ static void test_drop_on_reuseport(struct test_sk_lookup *skel) static void run_sk_assign(struct test_sk_lookup *skel, struct bpf_program *lookup_prog, - const char *listen_ip, const char *connect_ip) + const char *remote_ip, const char *local_ip) { - int client_fd, peer_fd, server_fds[MAX_SERVERS] = { -1 }; - struct bpf_link *lookup_link; + int server_fds[MAX_SERVERS] = { -1 }; + struct bpf_sk_lookup ctx; + __u64 server_cookie; int i, err; - lookup_link = attach_lookup_prog(lookup_prog); - if (!lookup_link) + DECLARE_LIBBPF_OPTS(bpf_test_run_opts, opts, + .ctx_in = &ctx, + .ctx_size_in = sizeof(ctx), + .ctx_out = &ctx, + .ctx_size_out = sizeof(ctx), + ); + + if (fill_sk_lookup_ctx(&ctx, local_ip, EXT_PORT, remote_ip, INT_PORT)) return; + ctx.protocol = IPPROTO_TCP; + for (i = 0; i < ARRAY_SIZE(server_fds); i++) { - server_fds[i] = make_server(SOCK_STREAM, listen_ip, 0, NULL); + server_fds[i] = make_server(SOCK_STREAM, local_ip, 0, NULL); if (server_fds[i] < 0) goto close_servers; @@ -1030,23 +1081,25 @@ static void run_sk_assign(struct test_sk_lookup *skel, goto close_servers; } - client_fd = make_client(SOCK_STREAM, connect_ip, EXT_PORT); - if (client_fd < 0) + server_cookie = socket_cookie(server_fds[SERVER_B]); + if (!server_cookie) + return; + + err = bpf_prog_test_run_opts(bpf_program__fd(lookup_prog), &opts); + if (CHECK(err, "test_run", "failed with error %d\n", errno)) goto close_servers; - peer_fd = accept(server_fds[SERVER_B], NULL, NULL); - if (CHECK(peer_fd < 0, "accept", "failed\n")) - goto close_client; + if (CHECK(ctx.cookie == 0, "ctx.cookie", "no socket selected\n")) + goto close_servers; + + CHECK(ctx.cookie != server_cookie, "ctx.cookie", + "selected sk %llu instead of %llu\n", ctx.cookie, server_cookie); - close(peer_fd); -close_client: - close(client_fd); close_servers: for (i = 
0; i < ARRAY_SIZE(server_fds); i++) { if (server_fds[i] != -1) close(server_fds[i]); } - bpf_link__destroy(lookup_link); } static void run_sk_assign_v4(struct test_sk_lookup *skel, diff --git a/tools/testing/selftests/bpf/progs/test_sk_lookup.c b/tools/testing/selftests/bpf/progs/test_sk_lookup.c index 1032b292af5b..ac6f7f205e25 100644 --- a/tools/testing/selftests/bpf/progs/test_sk_lookup.c +++ b/tools/testing/selftests/bpf/progs/test_sk_lookup.c @@ -64,6 +64,10 @@ static const int PROG_DONE = 1; static const __u32 KEY_SERVER_A = SERVER_A; static const __u32 KEY_SERVER_B = SERVER_B; +static const __u16 SRC_PORT = bpf_htons(8008); +static const __u32 SRC_IP4 = IP4(127, 0, 0, 2); +static const __u32 SRC_IP6[] = IP6(0xfd000000, 0x0, 0x0, 0x00000002); + static const __u16 DST_PORT = 7007; /* Host byte order */ static const __u32 DST_IP4 = IP4(127, 0, 0, 1); static const __u32 DST_IP6[] = IP6(0xfd000000, 0x0, 0x0, 0x00000001); @@ -398,11 +402,12 @@ int ctx_narrow_access(struct bpf_sk_lookup *ctx) if (LSW(ctx->protocol, 0) != IPPROTO_TCP) return SK_DROP; - /* Narrow loads from remote_port field. Expect non-0 value. */ - if (LSB(ctx->remote_port, 0) == 0 && LSB(ctx->remote_port, 1) == 0 && - LSB(ctx->remote_port, 2) == 0 && LSB(ctx->remote_port, 3) == 0) + /* Narrow loads from remote_port field. Expect SRC_PORT. */ + if (LSB(ctx->remote_port, 0) != ((SRC_PORT >> 0) & 0xff) || + LSB(ctx->remote_port, 1) != ((SRC_PORT >> 8) & 0xff) || + LSB(ctx->remote_port, 2) != 0 || LSB(ctx->remote_port, 3) != 0) return SK_DROP; - if (LSW(ctx->remote_port, 0) == 0) + if (LSW(ctx->remote_port, 0) != SRC_PORT) return SK_DROP; /* Narrow loads from local_port field. Expect DST_PORT. */ @@ -415,11 +420,14 @@ int ctx_narrow_access(struct bpf_sk_lookup *ctx) /* Narrow loads from IPv4 fields */ if (v4) { - /* Expect non-0.0.0.0 in remote_ip4 */ - if (LSB(ctx->remote_ip4, 0) == 0 && LSB(ctx->remote_ip4, 1) == 0 && - LSB(ctx->remote_ip4, 2) == 0 && LSB(ctx->remote_ip4, 3) == 0) + /* Expect SRC_IP4 in remote_ip4 */ + if (LSB(ctx->remote_ip4, 0) != ((SRC_IP4 >> 0) & 0xff) || + LSB(ctx->remote_ip4, 1) != ((SRC_IP4 >> 8) & 0xff) || + LSB(ctx->remote_ip4, 2) != ((SRC_IP4 >> 16) & 0xff) || + LSB(ctx->remote_ip4, 3) != ((SRC_IP4 >> 24) & 0xff)) return SK_DROP; - if (LSW(ctx->remote_ip4, 0) == 0 && LSW(ctx->remote_ip4, 1) == 0) + if (LSW(ctx->remote_ip4, 0) != ((SRC_IP4 >> 0) & 0xffff) || + LSW(ctx->remote_ip4, 1) != ((SRC_IP4 >> 16) & 0xffff)) return SK_DROP; /* Expect DST_IP4 in local_ip4 */ @@ -448,20 +456,32 @@ int ctx_narrow_access(struct bpf_sk_lookup *ctx) /* Narrow loads from IPv6 fields */ if (!v4) { - /* Expect non-:: IP in remote_ip6 */ - if (LSB(ctx->remote_ip6[0], 0) == 0 && LSB(ctx->remote_ip6[0], 1) == 0 && - LSB(ctx->remote_ip6[0], 2) == 0 && LSB(ctx->remote_ip6[0], 3) == 0 && - LSB(ctx->remote_ip6[1], 0) == 0 && LSB(ctx->remote_ip6[1], 1) == 0 && - LSB(ctx->remote_ip6[1], 2) == 0 && LSB(ctx->remote_ip6[1], 3) == 0 && - LSB(ctx->remote_ip6[2], 0) == 0 && LSB(ctx->remote_ip6[2], 1) == 0 && - LSB(ctx->remote_ip6[2], 2) == 0 && LSB(ctx->remote_ip6[2], 3) == 0 && - LSB(ctx->remote_ip6[3], 0) == 0 && LSB(ctx->remote_ip6[3], 1) == 0 && - LSB(ctx->remote_ip6[3], 2) == 0 && LSB(ctx->remote_ip6[3], 3) == 0) + /* Expect SRC_IP6 in remote_ip6 */ + if (LSB(ctx->remote_ip6[0], 0) != ((SRC_IP6[0] >> 0) & 0xff) || + LSB(ctx->remote_ip6[0], 1) != ((SRC_IP6[0] >> 8) & 0xff) || + LSB(ctx->remote_ip6[0], 2) != ((SRC_IP6[0] >> 16) & 0xff) || + LSB(ctx->remote_ip6[0], 3) != ((SRC_IP6[0] >> 24) & 0xff) || + LSB(ctx->remote_ip6[1], 0) 
!= ((SRC_IP6[1] >> 0) & 0xff) || + LSB(ctx->remote_ip6[1], 1) != ((SRC_IP6[1] >> 8) & 0xff) || + LSB(ctx->remote_ip6[1], 2) != ((SRC_IP6[1] >> 16) & 0xff) || + LSB(ctx->remote_ip6[1], 3) != ((SRC_IP6[1] >> 24) & 0xff) || + LSB(ctx->remote_ip6[2], 0) != ((SRC_IP6[2] >> 0) & 0xff) || + LSB(ctx->remote_ip6[2], 1) != ((SRC_IP6[2] >> 8) & 0xff) || + LSB(ctx->remote_ip6[2], 2) != ((SRC_IP6[2] >> 16) & 0xff) || + LSB(ctx->remote_ip6[2], 3) != ((SRC_IP6[2] >> 24) & 0xff) || + LSB(ctx->remote_ip6[3], 0) != ((SRC_IP6[3] >> 0) & 0xff) || + LSB(ctx->remote_ip6[3], 1) != ((SRC_IP6[3] >> 8) & 0xff) || + LSB(ctx->remote_ip6[3], 2) != ((SRC_IP6[3] >> 16) & 0xff) || + LSB(ctx->remote_ip6[3], 3) != ((SRC_IP6[3] >> 24) & 0xff)) return SK_DROP; - if (LSW(ctx->remote_ip6[0], 0) == 0 && LSW(ctx->remote_ip6[0], 1) == 0 && - LSW(ctx->remote_ip6[1], 0) == 0 && LSW(ctx->remote_ip6[1], 1) == 0 && - LSW(ctx->remote_ip6[2], 0) == 0 && LSW(ctx->remote_ip6[2], 1) == 0 && - LSW(ctx->remote_ip6[3], 0) == 0 && LSW(ctx->remote_ip6[3], 1) == 0) + if (LSW(ctx->remote_ip6[0], 0) != ((SRC_IP6[0] >> 0) & 0xffff) || + LSW(ctx->remote_ip6[0], 1) != ((SRC_IP6[0] >> 16) & 0xffff) || + LSW(ctx->remote_ip6[1], 0) != ((SRC_IP6[1] >> 0) & 0xffff) || + LSW(ctx->remote_ip6[1], 1) != ((SRC_IP6[1] >> 16) & 0xffff) || + LSW(ctx->remote_ip6[2], 0) != ((SRC_IP6[2] >> 0) & 0xffff) || + LSW(ctx->remote_ip6[2], 1) != ((SRC_IP6[2] >> 16) & 0xffff) || + LSW(ctx->remote_ip6[3], 0) != ((SRC_IP6[3] >> 0) & 0xffff) || + LSW(ctx->remote_ip6[3], 1) != ((SRC_IP6[3] >> 16) & 0xffff)) return SK_DROP; /* Expect DST_IP6 in local_ip6 */ if (LSB(ctx->local_ip6[0], 0) != ((DST_IP6[0] >> 0) & 0xff) || From abab306ff04b570e817be3f026068fe9a5d99fbb Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Wed, 3 Mar 2021 10:18:15 +0000 Subject: [PATCH 75/90] selftests: bpf: Check that PROG_TEST_RUN repeats as requested Extend a simple prog_run test to check that PROG_TEST_RUN adheres to the requested repetitions. Convert it to use BPF skeleton. 
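The same pattern is usable outside the selftest. A minimal sketch, assuming libbpf and an already loaded program fd, of correlating test runs with the kernel's run-time statistics:

  #include <stdio.h>
  #include <unistd.h>
  #include <bpf/bpf.h>

  static void show_run_stats(int prog_fd)
  {
          struct bpf_prog_info info = {};
          __u32 info_len = sizeof(info);
          /* Run-time statistics stay enabled while this fd is held open. */
          int stats_fd = bpf_enable_stats(BPF_STATS_RUN_TIME);

          if (stats_fd < 0)
                  return;

          /* ... trigger the program, e.g. via BPF_PROG_TEST_RUN ... */

          if (!bpf_obj_get_info_by_fd(prog_fd, &info, &info_len))
                  printf("run_cnt=%llu run_time_ns=%llu\n",
                         info.run_cnt, info.run_time_ns);
          close(stats_fd);
  }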
Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210303101816.36774-5-lmb@cloudflare.com --- .../selftests/bpf/prog_tests/prog_run_xattr.c | 51 +++++++++++++++---- 1 file changed, 42 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/prog_run_xattr.c b/tools/testing/selftests/bpf/prog_tests/prog_run_xattr.c index 935a294f049a..131d7f7eeb42 100644 --- a/tools/testing/selftests/bpf/prog_tests/prog_run_xattr.c +++ b/tools/testing/selftests/bpf/prog_tests/prog_run_xattr.c @@ -2,12 +2,31 @@ #include #include +#include "test_pkt_access.skel.h" + +static const __u32 duration; + +static void check_run_cnt(int prog_fd, __u64 run_cnt) +{ + struct bpf_prog_info info = {}; + __u32 info_len = sizeof(info); + int err; + + err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len); + if (CHECK(err, "get_prog_info", "failed to get bpf_prog_info for fd %d\n", prog_fd)) + return; + + CHECK(run_cnt != info.run_cnt, "run_cnt", + "incorrect number of repetitions, want %llu have %llu\n", run_cnt, info.run_cnt); +} + void test_prog_run_xattr(void) { - const char *file = "./test_pkt_access.o"; - struct bpf_object *obj; - char buf[10]; - int err; + struct test_pkt_access *skel; + int err, stats_fd = -1; + char buf[10] = {}; + __u64 run_cnt = 0; + struct bpf_prog_test_run_attr tattr = { .repeat = 1, .data_in = &pkt_v4, @@ -16,12 +35,15 @@ void test_prog_run_xattr(void) .data_size_out = 5, }; - err = bpf_prog_load(file, BPF_PROG_TYPE_SCHED_CLS, &obj, - &tattr.prog_fd); - if (CHECK_ATTR(err, "load", "err %d errno %d\n", err, errno)) + stats_fd = bpf_enable_stats(BPF_STATS_RUN_TIME); + if (CHECK_ATTR(stats_fd < 0, "enable_stats", "failed %d\n", errno)) return; - memset(buf, 0, sizeof(buf)); + skel = test_pkt_access__open_and_load(); + if (CHECK_ATTR(!skel, "open_and_load", "failed\n")) + goto cleanup; + + tattr.prog_fd = bpf_program__fd(skel->progs.test_pkt_access); err = bpf_prog_test_run_xattr(&tattr); CHECK_ATTR(err != -1 || errno != ENOSPC || tattr.retval, "run", @@ -34,8 +56,12 @@ void test_prog_run_xattr(void) CHECK_ATTR(buf[5] != 0, "overflow", "BPF_PROG_TEST_RUN ignored size hint\n"); + run_cnt += tattr.repeat; + check_run_cnt(tattr.prog_fd, run_cnt); + tattr.data_out = NULL; tattr.data_size_out = 0; + tattr.repeat = 2; errno = 0; err = bpf_prog_test_run_xattr(&tattr); @@ -46,5 +72,12 @@ void test_prog_run_xattr(void) err = bpf_prog_test_run_xattr(&tattr); CHECK_ATTR(err != -EINVAL, "run_wrong_size_out", "err %d\n", err); - bpf_object__close(obj); + run_cnt += tattr.repeat; + check_run_cnt(tattr.prog_fd, run_cnt); + +cleanup: + if (skel) + test_pkt_access__destroy(skel); + if (stats_fd != -1) + close(stats_fd); } From b4f894633fa14d7d46ba7676f950b90a401504bb Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Wed, 3 Mar 2021 10:18:16 +0000 Subject: [PATCH 76/90] selftests: bpf: Don't run sk_lookup in verifier tests sk_lookup doesn't allow setting data_in for bpf_prog_run. This doesn't play well with the verifier tests, since they always set a 64 byte input buffer. Allow not running verifier tests by setting bpf_test.runs to a negative value and don't run the ctx access case for sk_lookup. We have dedicated ctx access tests so skipping here doesn't reduce coverage. 
Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210303101816.36774-6-lmb@cloudflare.com --- tools/testing/selftests/bpf/test_verifier.c | 4 ++-- tools/testing/selftests/bpf/verifier/ctx_sk_lookup.c | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 58b5a349d3ba..1512092e1e68 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -105,7 +105,7 @@ struct bpf_test { enum bpf_prog_type prog_type; uint8_t flags; void (*fill_helper)(struct bpf_test *self); - uint8_t runs; + int runs; #define bpf_testdata_struct_t \ struct { \ uint32_t retval, retval_unpriv; \ @@ -1165,7 +1165,7 @@ static void do_test_single(struct bpf_test *test, bool unpriv, run_errs = 0; run_successes = 0; - if (!alignment_prevented_execution && fd_prog >= 0) { + if (!alignment_prevented_execution && fd_prog >= 0 && test->runs >= 0) { uint32_t expected_val; int i; diff --git a/tools/testing/selftests/bpf/verifier/ctx_sk_lookup.c b/tools/testing/selftests/bpf/verifier/ctx_sk_lookup.c index fb13ca2d5606..d78627be060f 100644 --- a/tools/testing/selftests/bpf/verifier/ctx_sk_lookup.c +++ b/tools/testing/selftests/bpf/verifier/ctx_sk_lookup.c @@ -239,6 +239,7 @@ .result = ACCEPT, .prog_type = BPF_PROG_TYPE_SK_LOOKUP, .expected_attach_type = BPF_SK_LOOKUP, + .runs = -1, }, /* invalid 8-byte reads from a 4-byte fields in bpf_sk_lookup */ { From 46ac034f769fcd50d3d554041a3879a0cdf2ee57 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Wed, 3 Mar 2021 15:20:35 +0800 Subject: [PATCH 77/90] bpf: Simplify the calculation of variables Fix the following coccicheck warnings: ./tools/bpf/bpf_dbg.c:1201:55-57: WARNING !A || A && B is equivalent to !A || B. Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/1614756035-111280-1-git-send-email-jiapeng.chong@linux.alibaba.com --- tools/bpf/bpf_dbg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/bpf/bpf_dbg.c b/tools/bpf/bpf_dbg.c index a07dfc479270..00e560a17baf 100644 --- a/tools/bpf/bpf_dbg.c +++ b/tools/bpf/bpf_dbg.c @@ -1198,7 +1198,7 @@ static int cmd_run(char *num) else return CMD_OK; bpf_reset(); - } while (pcap_next_pkt() && (!has_limit || (has_limit && ++i < pkts))); + } while (pcap_next_pkt() && (!has_limit || (++i < pkts))); rl_printf("bpf passes:%u fails:%u\n", pass, fail); From bce8623135fbe54bd86797df72cb85bfe4118b6e Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Wed, 3 Mar 2021 15:52:10 +0800 Subject: [PATCH 78/90] selftests/bpf: Simplify the calculation of variables Fix the following coccicheck warnings: ./tools/testing/selftests/bpf/test_sockmap.c:735:35-37: WARNING !A || A && B is equivalent to !A || B. 
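The equivalence behind the warning can be checked exhaustively; the throwaway snippet below is purely illustrative and not part of the patch:

  #include <assert.h>

  int main(void)
  {
          for (int a = 0; a <= 1; a++)
                  for (int b = 0; b <= 1; b++)
                          /* !A || (A && B) simplifies to !A || B */
                          assert((!a || (a && b)) == (!a || b));
          return 0;
  }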
Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/1614757930-17197-1-git-send-email-jiapeng.chong@linux.alibaba.com --- tools/testing/selftests/bpf/test_sockmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c index 427ca00a3217..eefd445b96fc 100644 --- a/tools/testing/selftests/bpf/test_sockmap.c +++ b/tools/testing/selftests/bpf/test_sockmap.c @@ -732,7 +732,7 @@ static int sendmsg_test(struct sockmap_options *opt) * socket is not a valid test. So in this case lets not * enable kTLS but still run the test. */ - if (!txmsg_redir || (txmsg_redir && txmsg_ingress)) { + if (!txmsg_redir || txmsg_ingress) { err = sockmap_init_ktls(opt->verbose, rx_fd); if (err) return err; From d01b59c9ae94560fbcceaafeef39784d72765033 Mon Sep 17 00:00:00 2001 From: Xuesen Huang Date: Thu, 4 Mar 2021 14:40:46 +0800 Subject: [PATCH 79/90] bpf: Add bpf_skb_adjust_room flag BPF_F_ADJ_ROOM_ENCAP_L2_ETH bpf_skb_adjust_room sets the inner_protocol as skb->protocol for packets encapsulation. But that is not appropriate when pushing Ethernet header. Add an option to further specify encap L2 type and set the inner_protocol as ETH_P_TEB. Suggested-by: Willem de Bruijn Signed-off-by: Xuesen Huang Signed-off-by: Zhiyong Cheng Signed-off-by: Li Wang Signed-off-by: Daniel Borkmann Acked-by: Willem de Bruijn Link: https://lore.kernel.org/bpf/20210304064046.6232-1-hxseverything@gmail.com --- include/uapi/linux/bpf.h | 5 +++++ net/core/filter.c | 11 ++++++++++- tools/include/uapi/linux/bpf.h | 5 +++++ 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7f530e349aff..2d3036e292a9 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2484,6 +2484,10 @@ union bpf_attr { * Use with ENCAP_L3/L4 flags to further specify the tunnel * type; *len* is the length of the inner MAC header. * + * * **BPF_F_ADJ_ROOM_ENCAP_L2_ETH**: + * Use with BPF_F_ADJ_ROOM_ENCAP_L2 flag to further specify the + * L2 type as Ethernet. + * * A call to this helper is susceptible to change the underlying * packet buffer. 
Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be @@ -4916,6 +4920,7 @@ enum { BPF_F_ADJ_ROOM_ENCAP_L4_GRE = (1ULL << 3), BPF_F_ADJ_ROOM_ENCAP_L4_UDP = (1ULL << 4), BPF_F_ADJ_ROOM_NO_CSUM_RESET = (1ULL << 5), + BPF_F_ADJ_ROOM_ENCAP_L2_ETH = (1ULL << 6), }; enum { diff --git a/net/core/filter.c b/net/core/filter.c index a526db494c62..588b19ba0da8 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3409,6 +3409,7 @@ static u32 bpf_skb_net_base_len(const struct sk_buff *skb) BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \ BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \ BPF_F_ADJ_ROOM_ENCAP_L4_UDP | \ + BPF_F_ADJ_ROOM_ENCAP_L2_ETH | \ BPF_F_ADJ_ROOM_ENCAP_L2( \ BPF_ADJ_ROOM_ENCAP_L2_MASK)) @@ -3445,6 +3446,10 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP) return -EINVAL; + if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH && + inner_mac_len < ETH_HLEN) + return -EINVAL; + if (skb->encapsulation) return -EALREADY; @@ -3463,7 +3468,11 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, skb->inner_mac_header = inner_net - inner_mac_len; skb->inner_network_header = inner_net; skb->inner_transport_header = inner_trans; - skb_set_inner_protocol(skb, skb->protocol); + + if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH) + skb_set_inner_protocol(skb, htons(ETH_P_TEB)); + else + skb_set_inner_protocol(skb, skb->protocol); skb->encapsulation = 1; skb_set_network_header(skb, mac_len); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 7f530e349aff..2d3036e292a9 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2484,6 +2484,10 @@ union bpf_attr { * Use with ENCAP_L3/L4 flags to further specify the tunnel * type; *len* is the length of the inner MAC header. * + * * **BPF_F_ADJ_ROOM_ENCAP_L2_ETH**: + * Use with BPF_F_ADJ_ROOM_ENCAP_L2 flag to further specify the + * L2 type as Ethernet. + * * A call to this helper is susceptible to change the underlying * packet buffer. Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be @@ -4916,6 +4920,7 @@ enum { BPF_F_ADJ_ROOM_ENCAP_L4_GRE = (1ULL << 3), BPF_F_ADJ_ROOM_ENCAP_L4_UDP = (1ULL << 4), BPF_F_ADJ_ROOM_NO_CSUM_RESET = (1ULL << 5), + BPF_F_ADJ_ROOM_ENCAP_L2_ETH = (1ULL << 6), }; enum { From 256becd450172eec74566f1aa7819ce80181d7e1 Mon Sep 17 00:00:00 2001 From: Xuesen Huang Date: Fri, 5 Mar 2021 20:33:47 +0800 Subject: [PATCH 80/90] selftests, bpf: Extend test_tc_tunnel test with vxlan Add BPF_F_ADJ_ROOM_ENCAP_L2_ETH flag to the existing tests which encapsulates the ethernet as the inner l2 header. Update a vxlan encapsulation test case. 
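For orientation, a compressed sketch of what the updated test exercises is shown below. It is not the selftest itself: the section name, license and numeric details are illustrative, and construction of the outer IPv4/UDP/VXLAN/Ethernet headers is elided:

  #include <linux/bpf.h>
  #include <linux/if_ether.h>
  #include <linux/ip.h>
  #include <linux/udp.h>
  #include <linux/pkt_cls.h>
  #include <bpf/bpf_helpers.h>

  struct vxlanhdr {
          __be32 vx_flags;
          __be32 vx_vni;
  } __attribute__((packed));

  SEC("classifier")
  int encap_vxlan_sketch(struct __sk_buff *skb)
  {
          int olen = sizeof(struct iphdr) + sizeof(struct udphdr) +
                     sizeof(struct vxlanhdr) + ETH_HLEN;
          __u64 flags = BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 |
                        BPF_F_ADJ_ROOM_ENCAP_L4_UDP |
                        BPF_F_ADJ_ROOM_ENCAP_L2_ETH |
                        BPF_F_ADJ_ROOM_ENCAP_L2(ETH_HLEN + sizeof(struct vxlanhdr));

          /* With ENCAP_L2_ETH the kernel records ETH_P_TEB as the inner
           * protocol instead of skb->protocol, so GSO of the tunnelled
           * frames stays correct; writing the prepared outer headers with
           * bpf_skb_store_bytes() is elided here. */
          if (bpf_skb_adjust_room(skb, olen, BPF_ADJ_ROOM_NET, flags))
                  return TC_ACT_SHOT;

          return TC_ACT_OK;
  }

  char _license[] SEC("license") = "GPL";

The key point the test verifies is the last flag line: without BPF_F_ADJ_ROOM_ENCAP_L2_ETH the VXLAN payload would be tagged with the outer skb->protocol rather than ETH_P_TEB.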
Signed-off-by: Xuesen Huang Signed-off-by: Li Wang Signed-off-by: Daniel Borkmann Acked-by: Willem de Bruijn Link: https://lore.kernel.org/bpf/20210305123347.15311-1-hxseverything@gmail.com --- .../selftests/bpf/progs/test_tc_tunnel.c | 113 +++++++++++++++--- tools/testing/selftests/bpf/test_tc_tunnel.sh | 15 ++- 2 files changed, 111 insertions(+), 17 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c index 37bce7a7c394..84cd63259554 100644 --- a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c +++ b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c @@ -24,14 +24,29 @@ static const int cfg_port = 8000; static const int cfg_udp_src = 20000; +#define L2_PAD_SZ (sizeof(struct vxlanhdr) + ETH_HLEN) + #define UDP_PORT 5555 #define MPLS_OVER_UDP_PORT 6635 #define ETH_OVER_UDP_PORT 7777 +#define VXLAN_UDP_PORT 8472 + +#define EXTPROTO_VXLAN 0x1 + +#define VXLAN_N_VID (1u << 24) +#define VXLAN_VNI_MASK bpf_htonl((VXLAN_N_VID - 1) << 8) +#define VXLAN_FLAGS 0x8 +#define VXLAN_VNI 1 /* MPLS label 1000 with S bit (last label) set and ttl of 255. */ static const __u32 mpls_label = __bpf_constant_htonl(1000 << 12 | MPLS_LS_S_MASK | 0xff); +struct vxlanhdr { + __be32 vx_flags; + __be32 vx_vni; +} __attribute__((packed)); + struct gre_hdr { __be16 flags; __be16 protocol; @@ -45,13 +60,13 @@ union l4hdr { struct v4hdr { struct iphdr ip; union l4hdr l4hdr; - __u8 pad[16]; /* enough space for L2 header */ + __u8 pad[L2_PAD_SZ]; /* space for L2 header / vxlan header ... */ } __attribute__((packed)); struct v6hdr { struct ipv6hdr ip; union l4hdr l4hdr; - __u8 pad[16]; /* enough space for L2 header */ + __u8 pad[L2_PAD_SZ]; /* space for L2 header / vxlan header ... */ } __attribute__((packed)); static __always_inline void set_ipv4_csum(struct iphdr *iph) @@ -69,14 +84,15 @@ static __always_inline void set_ipv4_csum(struct iphdr *iph) iph->check = ~((csum & 0xffff) + (csum >> 16)); } -static __always_inline int encap_ipv4(struct __sk_buff *skb, __u8 encap_proto, - __u16 l2_proto) +static __always_inline int __encap_ipv4(struct __sk_buff *skb, __u8 encap_proto, + __u16 l2_proto, __u16 ext_proto) { __u16 udp_dst = UDP_PORT; struct iphdr iph_inner; struct v4hdr h_outer; struct tcphdr tcph; int olen, l2_len; + __u8 *l2_hdr = NULL; int tcp_off; __u64 flags; @@ -141,7 +157,11 @@ static __always_inline int encap_ipv4(struct __sk_buff *skb, __u8 encap_proto, break; case ETH_P_TEB: l2_len = ETH_HLEN; - udp_dst = ETH_OVER_UDP_PORT; + if (ext_proto & EXTPROTO_VXLAN) { + udp_dst = VXLAN_UDP_PORT; + l2_len += sizeof(struct vxlanhdr); + } else + udp_dst = ETH_OVER_UDP_PORT; break; } flags |= BPF_F_ADJ_ROOM_ENCAP_L2(l2_len); @@ -171,14 +191,26 @@ static __always_inline int encap_ipv4(struct __sk_buff *skb, __u8 encap_proto, } /* add L2 encap (if specified) */ + l2_hdr = (__u8 *)&h_outer + olen; switch (l2_proto) { case ETH_P_MPLS_UC: - *((__u32 *)((__u8 *)&h_outer + olen)) = mpls_label; + *(__u32 *)l2_hdr = mpls_label; break; case ETH_P_TEB: - if (bpf_skb_load_bytes(skb, 0, (__u8 *)&h_outer + olen, - ETH_HLEN)) + flags |= BPF_F_ADJ_ROOM_ENCAP_L2_ETH; + + if (ext_proto & EXTPROTO_VXLAN) { + struct vxlanhdr *vxlan_hdr = (struct vxlanhdr *)l2_hdr; + + vxlan_hdr->vx_flags = VXLAN_FLAGS; + vxlan_hdr->vx_vni = bpf_htonl((VXLAN_VNI & VXLAN_VNI_MASK) << 8); + + l2_hdr += sizeof(struct vxlanhdr); + } + + if (bpf_skb_load_bytes(skb, 0, l2_hdr, ETH_HLEN)) return TC_ACT_SHOT; + break; } olen += l2_len; @@ -214,14 +246,21 @@ static __always_inline int 
encap_ipv4(struct __sk_buff *skb, __u8 encap_proto, return TC_ACT_OK; } -static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto, +static __always_inline int encap_ipv4(struct __sk_buff *skb, __u8 encap_proto, __u16 l2_proto) +{ + return __encap_ipv4(skb, encap_proto, l2_proto, 0); +} + +static __always_inline int __encap_ipv6(struct __sk_buff *skb, __u8 encap_proto, + __u16 l2_proto, __u16 ext_proto) { __u16 udp_dst = UDP_PORT; struct ipv6hdr iph_inner; struct v6hdr h_outer; struct tcphdr tcph; int olen, l2_len; + __u8 *l2_hdr = NULL; __u16 tot_len; __u64 flags; @@ -249,7 +288,11 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto, break; case ETH_P_TEB: l2_len = ETH_HLEN; - udp_dst = ETH_OVER_UDP_PORT; + if (ext_proto & EXTPROTO_VXLAN) { + udp_dst = VXLAN_UDP_PORT; + l2_len += sizeof(struct vxlanhdr); + } else + udp_dst = ETH_OVER_UDP_PORT; break; } flags |= BPF_F_ADJ_ROOM_ENCAP_L2(l2_len); @@ -267,7 +310,7 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto, h_outer.l4hdr.udp.source = __bpf_constant_htons(cfg_udp_src); h_outer.l4hdr.udp.dest = bpf_htons(udp_dst); tot_len = bpf_ntohs(iph_inner.payload_len) + sizeof(iph_inner) + - sizeof(h_outer.l4hdr.udp); + sizeof(h_outer.l4hdr.udp) + l2_len; h_outer.l4hdr.udp.check = 0; h_outer.l4hdr.udp.len = bpf_htons(tot_len); break; @@ -278,13 +321,24 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto, } /* add L2 encap (if specified) */ + l2_hdr = (__u8 *)&h_outer + olen; switch (l2_proto) { case ETH_P_MPLS_UC: - *((__u32 *)((__u8 *)&h_outer + olen)) = mpls_label; + *(__u32 *)l2_hdr = mpls_label; break; case ETH_P_TEB: - if (bpf_skb_load_bytes(skb, 0, (__u8 *)&h_outer + olen, - ETH_HLEN)) + flags |= BPF_F_ADJ_ROOM_ENCAP_L2_ETH; + + if (ext_proto & EXTPROTO_VXLAN) { + struct vxlanhdr *vxlan_hdr = (struct vxlanhdr *)l2_hdr; + + vxlan_hdr->vx_flags = VXLAN_FLAGS; + vxlan_hdr->vx_vni = bpf_htonl((VXLAN_VNI & VXLAN_VNI_MASK) << 8); + + l2_hdr += sizeof(struct vxlanhdr); + } + + if (bpf_skb_load_bytes(skb, 0, l2_hdr, ETH_HLEN)) return TC_ACT_SHOT; break; } @@ -309,6 +363,12 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto, return TC_ACT_OK; } +static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto, + __u16 l2_proto) +{ + return __encap_ipv6(skb, encap_proto, l2_proto, 0); +} + SEC("encap_ipip_none") int __encap_ipip_none(struct __sk_buff *skb) { @@ -372,6 +432,17 @@ int __encap_udp_eth(struct __sk_buff *skb) return TC_ACT_OK; } +SEC("encap_vxlan_eth") +int __encap_vxlan_eth(struct __sk_buff *skb) +{ + if (skb->protocol == __bpf_constant_htons(ETH_P_IP)) + return __encap_ipv4(skb, IPPROTO_UDP, + ETH_P_TEB, + EXTPROTO_VXLAN); + else + return TC_ACT_OK; +} + SEC("encap_sit_none") int __encap_sit_none(struct __sk_buff *skb) { @@ -444,6 +515,17 @@ int __encap_ip6udp_eth(struct __sk_buff *skb) return TC_ACT_OK; } +SEC("encap_ip6vxlan_eth") +int __encap_ip6vxlan_eth(struct __sk_buff *skb) +{ + if (skb->protocol == __bpf_constant_htons(ETH_P_IPV6)) + return __encap_ipv6(skb, IPPROTO_UDP, + ETH_P_TEB, + EXTPROTO_VXLAN); + else + return TC_ACT_OK; +} + static int decap_internal(struct __sk_buff *skb, int off, int len, char proto) { char buf[sizeof(struct v6hdr)]; @@ -479,6 +561,9 @@ static int decap_internal(struct __sk_buff *skb, int off, int len, char proto) case ETH_OVER_UDP_PORT: olen += ETH_HLEN; break; + case VXLAN_UDP_PORT: + olen += ETH_HLEN + sizeof(struct vxlanhdr); + break; } break; 
default: diff --git a/tools/testing/selftests/bpf/test_tc_tunnel.sh b/tools/testing/selftests/bpf/test_tc_tunnel.sh index 7c76b841b17b..c9dde9b9d987 100755 --- a/tools/testing/selftests/bpf/test_tc_tunnel.sh +++ b/tools/testing/selftests/bpf/test_tc_tunnel.sh @@ -44,8 +44,8 @@ setup() { # clamp route to reserve room for tunnel headers ip -netns "${ns1}" -4 route flush table main ip -netns "${ns1}" -6 route flush table main - ip -netns "${ns1}" -4 route add "${ns2_v4}" mtu 1458 dev veth1 - ip -netns "${ns1}" -6 route add "${ns2_v6}" mtu 1438 dev veth1 + ip -netns "${ns1}" -4 route add "${ns2_v4}" mtu 1450 dev veth1 + ip -netns "${ns1}" -6 route add "${ns2_v6}" mtu 1430 dev veth1 sleep 1 @@ -105,6 +105,12 @@ if [[ "$#" -eq "0" ]]; then echo "sit" $0 ipv6 sit none 100 + echo "ip4 vxlan" + $0 ipv4 vxlan eth 2000 + + echo "ip6 vxlan" + $0 ipv6 ip6vxlan eth 2000 + for mac in none mpls eth ; do echo "ip gre $mac" $0 ipv4 gre $mac 100 @@ -214,6 +220,9 @@ if [[ "$tuntype" =~ "udp" ]]; then targs="encap fou encap-sport auto encap-dport $dport" elif [[ "$tuntype" =~ "gre" && "$mac" == "eth" ]]; then ttype=$gretaptype +elif [[ "$tuntype" =~ "vxlan" && "$mac" == "eth" ]]; then + ttype="vxlan" + targs="id 1 dstport 8472 udp6zerocsumrx" else ttype=$tuntype targs="" @@ -242,7 +251,7 @@ if [[ "$tuntype" == "ip6udp" && "$mac" == "mpls" ]]; then elif [[ "$tuntype" =~ "udp" && "$mac" == "eth" ]]; then # No support for TEB fou tunnel; expect failure. expect_tun_fail=1 -elif [[ "$tuntype" =~ "gre" && "$mac" == "eth" ]]; then +elif [[ "$tuntype" =~ (gre|vxlan) && "$mac" == "eth" ]]; then # Share ethernet address between tunnel/veth2 so L2 decap works. ethaddr=$(ip netns exec "${ns2}" ip link show veth2 | \ awk '/ether/ { print $2 }') From 299194a91451263020c73dd2a3b7e0218c88dbd0 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 5 Mar 2021 14:40:50 +0100 Subject: [PATCH 81/90] selftests/bpf: Fix test_attach_probe for powerpc uprobes When testing uprobes we the test gets GEP (Global Entry Point) address from kallsyms, but then the function is called locally so the uprobe is not triggered. Fixing this by adjusting the address to LEP (Local Entry Point) for powerpc arch plus instruction check stolen from ppc_function_entry function pointed out and explained by Michael and Naveen. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Acked-by: Naveen N. Rao Link: https://lore.kernel.org/bpf/20210305134050.139840-1-jolsa@kernel.org --- .../selftests/bpf/prog_tests/attach_probe.c | 40 ++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/attach_probe.c b/tools/testing/selftests/bpf/prog_tests/attach_probe.c index a0ee87c8e1ea..9dc4e3dfbcf3 100644 --- a/tools/testing/selftests/bpf/prog_tests/attach_probe.c +++ b/tools/testing/selftests/bpf/prog_tests/attach_probe.c @@ -2,6 +2,44 @@ #include #include "test_attach_probe.skel.h" +#if defined(__powerpc64__) && defined(_CALL_ELF) && _CALL_ELF == 2 + +#define OP_RT_RA_MASK 0xffff0000UL +#define LIS_R2 0x3c400000UL +#define ADDIS_R2_R12 0x3c4c0000UL +#define ADDI_R2_R2 0x38420000UL + +static ssize_t get_offset(ssize_t addr, ssize_t base) +{ + u32 *insn = (u32 *) addr; + + /* + * A PPC64 ABIv2 function may have a local and a global entry + * point. We need to use the local entry point when patching + * functions, so identify and step over the global entry point + * sequence. 
+ * + * The global entry point sequence is always of the form: + * + * addis r2,r12,XXXX + * addi r2,r2,XXXX + * + * A linker optimisation may convert the addis to lis: + * + * lis r2,XXXX + * addi r2,r2,XXXX + */ + if ((((*insn & OP_RT_RA_MASK) == ADDIS_R2_R12) || + ((*insn & OP_RT_RA_MASK) == LIS_R2)) && + ((*(insn + 1) & OP_RT_RA_MASK) == ADDI_R2_R2)) + return (ssize_t)(insn + 2) - base; + else + return addr - base; +} +#else +#define get_offset(addr, base) (addr - base) +#endif + ssize_t get_base_addr() { size_t start, offset; char buf[256]; @@ -36,7 +74,7 @@ void test_attach_probe(void) if (CHECK(base_addr < 0, "get_base_addr", "failed to find base addr: %zd", base_addr)) return; - uprobe_offset = (size_t)&get_base_addr - base_addr; + uprobe_offset = get_offset((size_t)&get_base_addr, base_addr); skel = test_attach_probe__open_and_load(); if (CHECK(!skel, "skel_open", "failed to open skeleton\n")) From a23b3f5697e6cf8affa7adf3e967e5ab569ea757 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Fri, 5 Mar 2021 10:41:12 +0100 Subject: [PATCH 82/90] xsk: Update rings for load-acquire/store-release barriers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, the AF_XDP rings uses general smp_{r,w,}mb() barriers on the kernel-side. On most modern architectures load-acquire/store-release barriers perform better, and results in simpler code for circular ring buffers. This change updates the XDP socket rings to use load-acquire/store-release barriers. It is important to note that changing from the old smp_{r,w,}mb() barriers, to load-acquire/store-release barriers does not break compatibility. The old semantics work with the new one, and vice versa. As pointed out by "Documentation/memory-barriers.txt" in the "SMP BARRIER PAIRING" section: "General barriers pair with each other, though they also pair with most other types of barriers, albeit without multicopy atomicity. An acquire barrier pairs with a release barrier, but both may also pair with other barriers, including of course general barriers." How different barriers behaves and pairs is outlined in "tools/memory-model/Documentation/cheatsheet.txt". In order to make sure that compatibility is not broken, LKMM herd7 based litmus tests can be constructed and verified. We generalize the XDP socket ring to a one entry ring, and create two scenarios; One where the ring is full, where only the consumer can proceed, followed by the producer. One where the ring is empty, where only the producer can proceed, followed by the consumer. Each scenario is then expanded to four different tests: general producer/general consumer, general producer/acqrel consumer, acqrel producer/general consumer, acqrel producer/acqrel consumer. In total eight tests. The empty ring test: C spsc-rb+empty // Simple one entry ring: // prod cons allowed action prod cons // 0 0 => prod => 1 0 // 0 1 => cons => 0 0 // 1 0 => cons => 1 1 // 1 1 => prod => 0 1 {} // We start at prod==0, cons==0, data==0, i.e. nothing has been // written to the ring. From here only the producer can start, and // should write 1. Afterwards, consumer can continue and read 1 to // data. Can we enter state prod==1, cons==1, but consumer observed // the incorrect value of 0? P0(int *prod, int *cons, int *data) { ... producer } P1(int *prod, int *cons, int *data) { ... 
consumer } exists( 1:d=0 /\ prod=1 /\ cons=1 ); The full ring test: C spsc-rb+full // Simple one entry ring: // prod cons allowed action prod cons // 0 0 => prod => 1 0 // 0 1 => cons => 0 0 // 1 0 => cons => 1 1 // 1 1 => prod => 0 1 { prod = 1; } // We start at prod==1, cons==0, data==1, i.e. producer has // written 0, so from here only the consumer can start, and should // consume 0. Afterwards, producer can continue and write 1 to // data. Can we enter state prod==0, cons==1, but consumer observed // the write of 1? P0(int *prod, int *cons, int *data) { ... producer } P1(int *prod, int *cons, int *data) { ... consumer } exists( 1:d=1 /\ prod=0 /\ cons=1 ); where P0 and P1 are: P0(int *prod, int *cons, int *data) { int p; p = READ_ONCE(*prod); if (READ_ONCE(*cons) == p) { WRITE_ONCE(*data, 1); smp_wmb(); WRITE_ONCE(*prod, p ^ 1); } } P0(int *prod, int *cons, int *data) { int p; p = READ_ONCE(*prod); if (READ_ONCE(*cons) == p) { WRITE_ONCE(*data, 1); smp_store_release(prod, p ^ 1); } } P1(int *prod, int *cons, int *data) { int c; int d = -1; c = READ_ONCE(*cons); if (READ_ONCE(*prod) != c) { smp_rmb(); d = READ_ONCE(*data); smp_mb(); WRITE_ONCE(*cons, c ^ 1); } } P1(int *prod, int *cons, int *data) { int c; int d = -1; c = READ_ONCE(*cons); if (smp_load_acquire(prod) != c) { d = READ_ONCE(*data); smp_store_release(cons, c ^ 1); } } The full LKMM litmus tests are found at [1]. On x86-64 systems the l2fwd AF_XDP xdpsock sample performance increases by 1%. This is mostly due to that the smp_mb() is removed, which is a relatively expensive operation on these platforms. Weakly-ordered platforms, such as ARM64 might benefit even more. [1] https://github.com/bjoto/litmus-xsk Signed-off-by: Björn Töpel Signed-off-by: Andrii Nakryiko Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20210305094113.413544-2-bjorn.topel@gmail.com --- net/xdp/xsk_queue.h | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 2823b7c3302d..2ac3802c2cd7 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -47,19 +47,18 @@ struct xsk_queue { u64 queue_empty_descs; }; -/* The structure of the shared state of the rings are the same as the - * ring buffer in kernel/events/ring_buffer.c. For the Rx and completion - * ring, the kernel is the producer and user space is the consumer. For - * the Tx and fill rings, the kernel is the consumer and user space is - * the producer. +/* The structure of the shared state of the rings are a simple + * circular buffer, as outlined in + * Documentation/core-api/circular-buffers.rst. For the Rx and + * completion ring, the kernel is the producer and user space is the + * consumer. For the Tx and fill rings, the kernel is the consumer and + * user space is the producer. * * producer consumer * - * if (LOAD ->consumer) { LOAD ->producer - * (A) smp_rmb() (C) + * if (LOAD ->consumer) { (A) LOAD.acq ->producer (C) * STORE $data LOAD $data - * smp_wmb() (B) smp_mb() (D) - * STORE ->producer STORE ->consumer + * STORE.rel ->producer (B) STORE.rel ->consumer (D) * } * * (A) pairs with (D), and (B) pairs with (C). @@ -78,7 +77,8 @@ struct xsk_queue { * * (A) is a control dependency that separates the load of ->consumer * from the stores of $data. In case ->consumer indicates there is no - * room in the buffer to store $data we do not. So no barrier is needed. + * room in the buffer to store $data we do not. The dependency will + * order both of the stores after the loads. 
So no barrier is needed. * * (D) protects the load of the data to be observed to happen after the * store of the consumer pointer. If we did not have this memory @@ -227,15 +227,13 @@ static inline u32 xskq_cons_read_desc_batch(struct xsk_queue *q, static inline void __xskq_cons_release(struct xsk_queue *q) { - smp_mb(); /* D, matches A */ - WRITE_ONCE(q->ring->consumer, q->cached_cons); + smp_store_release(&q->ring->consumer, q->cached_cons); /* D, matchees A */ } static inline void __xskq_cons_peek(struct xsk_queue *q) { /* Refresh the local pointer */ - q->cached_prod = READ_ONCE(q->ring->producer); - smp_rmb(); /* C, matches B */ + q->cached_prod = smp_load_acquire(&q->ring->producer); /* C, matches B */ } static inline void xskq_cons_get_entries(struct xsk_queue *q) @@ -397,9 +395,7 @@ static inline int xskq_prod_reserve_desc(struct xsk_queue *q, static inline void __xskq_prod_submit(struct xsk_queue *q, u32 idx) { - smp_wmb(); /* B, matches C */ - - WRITE_ONCE(q->ring->producer, idx); + smp_store_release(&q->ring->producer, idx); /* B, matches C */ } static inline void xskq_prod_submit(struct xsk_queue *q) From 291471dd1559528a4c2ad5026eff94ed1030562b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Fri, 5 Mar 2021 10:41:13 +0100 Subject: [PATCH 83/90] libbpf, xsk: Add libbpf_smp_store_release libbpf_smp_load_acquire MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that the AF_XDP rings have load-acquire/store-release semantics, move libbpf to that as well. The library-internal libbpf_smp_{load_acquire,store_release} are only valid for 32-bit words on ARM64. Also, remove the barriers that are no longer in use. Signed-off-by: Björn Töpel Signed-off-by: Andrii Nakryiko Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20210305094113.413544-3-bjorn.topel@gmail.com --- tools/lib/bpf/libbpf_util.h | 72 +++++++++++++++++++++++++------------ tools/lib/bpf/xsk.h | 17 +++------ 2 files changed, 55 insertions(+), 34 deletions(-) diff --git a/tools/lib/bpf/libbpf_util.h b/tools/lib/bpf/libbpf_util.h index 59c779c5790c..94a0d7bb6f3c 100644 --- a/tools/lib/bpf/libbpf_util.h +++ b/tools/lib/bpf/libbpf_util.h @@ -5,6 +5,7 @@ #define __LIBBPF_LIBBPF_UTIL_H #include +#include #ifdef __cplusplus extern "C" { @@ -15,29 +16,56 @@ extern "C" { * application that uses libbpf. */ #if defined(__i386__) || defined(__x86_64__) -# define libbpf_smp_rmb() asm volatile("" : : : "memory") -# define libbpf_smp_wmb() asm volatile("" : : : "memory") -# define libbpf_smp_mb() \ - asm volatile("lock; addl $0,-4(%%rsp)" : : : "memory", "cc") -/* Hinders stores to be observed before older loads. 
*/ -# define libbpf_smp_rwmb() asm volatile("" : : : "memory") +# define libbpf_smp_store_release(p, v) \ + do { \ + asm volatile("" : : : "memory"); \ + WRITE_ONCE(*p, v); \ + } while (0) +# define libbpf_smp_load_acquire(p) \ + ({ \ + typeof(*p) ___p1 = READ_ONCE(*p); \ + asm volatile("" : : : "memory"); \ + ___p1; \ + }) #elif defined(__aarch64__) -# define libbpf_smp_rmb() asm volatile("dmb ishld" : : : "memory") -# define libbpf_smp_wmb() asm volatile("dmb ishst" : : : "memory") -# define libbpf_smp_mb() asm volatile("dmb ish" : : : "memory") -# define libbpf_smp_rwmb() libbpf_smp_mb() -#elif defined(__arm__) -/* These are only valid for armv7 and above */ -# define libbpf_smp_rmb() asm volatile("dmb ish" : : : "memory") -# define libbpf_smp_wmb() asm volatile("dmb ishst" : : : "memory") -# define libbpf_smp_mb() asm volatile("dmb ish" : : : "memory") -# define libbpf_smp_rwmb() libbpf_smp_mb() -#else -/* Architecture missing native barrier functions. */ -# define libbpf_smp_rmb() __sync_synchronize() -# define libbpf_smp_wmb() __sync_synchronize() -# define libbpf_smp_mb() __sync_synchronize() -# define libbpf_smp_rwmb() __sync_synchronize() +# define libbpf_smp_store_release(p, v) \ + asm volatile ("stlr %w1, %0" : "=Q" (*p) : "r" (v) : "memory") +# define libbpf_smp_load_acquire(p) \ + ({ \ + typeof(*p) ___p1; \ + asm volatile ("ldar %w0, %1" \ + : "=r" (___p1) : "Q" (*p) : "memory"); \ + __p1; \ + }) +#elif defined(__riscv) +# define libbpf_smp_store_release(p, v) \ + do { \ + asm volatile ("fence rw,w" : : : "memory"); \ + WRITE_ONCE(*p, v); \ + } while (0) +# define libbpf_smp_load_acquire(p) \ + ({ \ + typeof(*p) ___p1 = READ_ONCE(*p); \ + asm volatile ("fence r,rw" : : : "memory"); \ + ___p1; \ + }) +#endif + +#ifndef libbpf_smp_store_release +#define libbpf_smp_store_release(p, v) \ + do { \ + __sync_synchronize(); \ + WRITE_ONCE(*p, v); \ + } while (0) +#endif + +#ifndef libbpf_smp_load_acquire +#define libbpf_smp_load_acquire(p) \ + ({ \ + typeof(*p) ___p1 = READ_ONCE(*p); \ + __sync_synchronize(); \ + ___p1; \ + }) #endif #ifdef __cplusplus diff --git a/tools/lib/bpf/xsk.h b/tools/lib/bpf/xsk.h index e9f121f5d129..a9fdea87b5cd 100644 --- a/tools/lib/bpf/xsk.h +++ b/tools/lib/bpf/xsk.h @@ -96,7 +96,8 @@ static inline __u32 xsk_prod_nb_free(struct xsk_ring_prod *r, __u32 nb) * this function. Without this optimization it whould have been * free_entries = r->cached_prod - r->cached_cons + r->size. */ - r->cached_cons = *r->consumer + r->size; + r->cached_cons = libbpf_smp_load_acquire(r->consumer); + r->cached_cons += r->size; return r->cached_cons - r->cached_prod; } @@ -106,7 +107,7 @@ static inline __u32 xsk_cons_nb_avail(struct xsk_ring_cons *r, __u32 nb) __u32 entries = r->cached_prod - r->cached_cons; if (entries == 0) { - r->cached_prod = *r->producer; + r->cached_prod = libbpf_smp_load_acquire(r->producer); entries = r->cached_prod - r->cached_cons; } @@ -129,9 +130,7 @@ static inline void xsk_ring_prod__submit(struct xsk_ring_prod *prod, __u32 nb) /* Make sure everything has been written to the ring before indicating * this to the kernel by writing the producer pointer. 
*/ - libbpf_smp_wmb(); - - *prod->producer += nb; + libbpf_smp_store_release(prod->producer, *prod->producer + nb); } static inline __u32 xsk_ring_cons__peek(struct xsk_ring_cons *cons, __u32 nb, __u32 *idx) @@ -139,11 +138,6 @@ static inline __u32 xsk_ring_cons__peek(struct xsk_ring_cons *cons, __u32 nb, __ __u32 entries = xsk_cons_nb_avail(cons, nb); if (entries > 0) { - /* Make sure we do not speculatively read the data before - * we have received the packet buffers from the ring. - */ - libbpf_smp_rmb(); - *idx = cons->cached_cons; cons->cached_cons += entries; } @@ -161,9 +155,8 @@ static inline void xsk_ring_cons__release(struct xsk_ring_cons *cons, __u32 nb) /* Make sure data has been read before indicating we are done * with the entries by updating the consumer pointer. */ - libbpf_smp_rwmb(); + libbpf_smp_store_release(cons->consumer, *cons->consumer + nb); - *cons->consumer += nb; } static inline void *xsk_umem__get_data(void *umem_area, __u64 addr) From a6aac408c56112f73d28ea8567c29b2a7fe8fccc Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Mon, 8 Mar 2021 19:25:22 +0100 Subject: [PATCH 84/90] libbpf: Fix arm64 build The macro for libbpf_smp_store_release() doesn't build on arm64, fix it. Fixes: 291471dd1559 ("libbpf, xsk: Add libbpf_smp_store_release libbpf_smp_load_acquire") Signed-off-by: Jean-Philippe Brucker Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210308182521.155536-1-jean-philippe@linaro.org --- tools/lib/bpf/libbpf_util.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/lib/bpf/libbpf_util.h b/tools/lib/bpf/libbpf_util.h index 94a0d7bb6f3c..cfbcfc063c81 100644 --- a/tools/lib/bpf/libbpf_util.h +++ b/tools/lib/bpf/libbpf_util.h @@ -35,7 +35,7 @@ extern "C" { typeof(*p) ___p1; \ asm volatile ("ldar %w0, %1" \ : "=r" (___p1) : "Q" (*p) : "memory"); \ - __p1; \ + ___p1; \ }) #elif defined(__riscv) # define libbpf_smp_store_release(p, v) \ From a0d73acc1e4bc1c542701e37b2e0e233fe6a271d Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Mon, 8 Mar 2021 19:28:31 +0100 Subject: [PATCH 85/90] selftests/bpf: Fix typo in Makefile The selftest build fails when trying to install the scripts: rsync: [sender] link_stat "tools/testing/selftests/bpf/test_docs_build.sh" failed: No such file or directory (2) Fix the filename. Fixes: a01d935b2e09 ("tools/bpf: Remove bpf-helpers from bpftool docs") Signed-off-by: Jean-Philippe Brucker Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210308182830.155784-1-jean-philippe@linaro.org --- tools/testing/selftests/bpf/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index b5827464c6b5..c3999587bc23 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -68,7 +68,7 @@ TEST_PROGS := test_kmod.sh \ test_bpftool_build.sh \ test_bpftool.sh \ test_bpftool_metadata.sh \ - test_docs_build.sh \ + test_doc_build.sh \ test_xsk.sh TEST_PROGS_EXTENDED := with_addr.sh \ From 3fcd50d6f9a963a21e142d71be135eff6a374d2d Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Tue, 9 Mar 2021 01:56:48 +0100 Subject: [PATCH 86/90] selftests/bpf: Add BTF_KIND_FLOAT to test_core_reloc_size Verify that bpf_core_field_size() is working correctly with floats. Also document the required clang version. 
Suggested-by: John Fastabend Signed-off-by: Ilya Leoshkevich Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210309005649.162480-2-iii@linux.ibm.com --- tools/testing/selftests/bpf/README.rst | 9 +++++++++ tools/testing/selftests/bpf/prog_tests/core_reloc.c | 1 + tools/testing/selftests/bpf/progs/core_reloc_types.h | 5 +++++ tools/testing/selftests/bpf/progs/test_core_reloc_size.c | 3 +++ 4 files changed, 18 insertions(+) diff --git a/tools/testing/selftests/bpf/README.rst b/tools/testing/selftests/bpf/README.rst index dbc8f6cc5c67..3464161c8eea 100644 --- a/tools/testing/selftests/bpf/README.rst +++ b/tools/testing/selftests/bpf/README.rst @@ -170,3 +170,12 @@ failures: .. _2: https://reviews.llvm.org/D85174 .. _3: https://reviews.llvm.org/D83878 .. _4: https://reviews.llvm.org/D83242 + +Floating-point tests and Clang version +====================================== + +Certain selftests, e.g. core_reloc, require support for the floating-point +types, which was introduced in `Clang 13`__. The older Clang versions will +either crash when compiling these tests, or generate an incorrect BTF. + +__ https://reviews.llvm.org/D83289 diff --git a/tools/testing/selftests/bpf/prog_tests/core_reloc.c b/tools/testing/selftests/bpf/prog_tests/core_reloc.c index 06eb956ff7bb..d94dcead72e6 100644 --- a/tools/testing/selftests/bpf/prog_tests/core_reloc.c +++ b/tools/testing/selftests/bpf/prog_tests/core_reloc.c @@ -266,6 +266,7 @@ static int duration = 0; .arr_elem_sz = sizeof(((type *)0)->arr_field[0]), \ .ptr_sz = 8, /* always 8-byte pointer for BPF */ \ .enum_sz = sizeof(((type *)0)->enum_field), \ + .float_sz = sizeof(((type *)0)->float_field), \ } #define SIZE_CASE(name) { \ diff --git a/tools/testing/selftests/bpf/progs/core_reloc_types.h b/tools/testing/selftests/bpf/progs/core_reloc_types.h index 9a2850850121..9982eb969048 100644 --- a/tools/testing/selftests/bpf/progs/core_reloc_types.h +++ b/tools/testing/selftests/bpf/progs/core_reloc_types.h @@ -807,6 +807,7 @@ struct core_reloc_size_output { int arr_elem_sz; int ptr_sz; int enum_sz; + int float_sz; }; struct core_reloc_size { @@ -816,6 +817,7 @@ struct core_reloc_size { int arr_field[4]; void *ptr_field; enum { VALUE = 123 } enum_field; + float float_field; }; struct core_reloc_size___diff_sz { @@ -825,6 +827,7 @@ struct core_reloc_size___diff_sz { char arr_field[10]; void *ptr_field; enum { OTHER_VALUE = 0xFFFFFFFFFFFFFFFF } enum_field; + double float_field; }; /* Error case of two candidates with the fields (int_field) at the same @@ -839,6 +842,7 @@ struct core_reloc_size___err_ambiguous1 { int arr_field[4]; void *ptr_field; enum { VALUE___1 = 123 } enum_field; + float float_field; }; struct core_reloc_size___err_ambiguous2 { @@ -850,6 +854,7 @@ struct core_reloc_size___err_ambiguous2 { int arr_field[4]; void *ptr_field; enum { VALUE___2 = 123 } enum_field; + float float_field; }; /* diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_size.c b/tools/testing/selftests/bpf/progs/test_core_reloc_size.c index d7fb6cfc7891..7b2d576aeea1 100644 --- a/tools/testing/selftests/bpf/progs/test_core_reloc_size.c +++ b/tools/testing/selftests/bpf/progs/test_core_reloc_size.c @@ -21,6 +21,7 @@ struct core_reloc_size_output { int arr_elem_sz; int ptr_sz; int enum_sz; + int float_sz; }; struct core_reloc_size { @@ -30,6 +31,7 @@ struct core_reloc_size { int arr_field[4]; void *ptr_field; enum { VALUE = 123 } enum_field; + float float_field; }; SEC("raw_tracepoint/sys_enter") @@ -45,6 +47,7 @@ int test_core_size(void *ctx) 
out->arr_elem_sz = bpf_core_field_size(in->arr_field[0]); out->ptr_sz = bpf_core_field_size(in->ptr_field); out->enum_sz = bpf_core_field_size(in->enum_field); + out->float_sz = bpf_core_field_size(in->float_field); return 0; } From ccb0e23ca27445e2408f52944660f9e5e5d1c4b1 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Tue, 9 Mar 2021 01:56:49 +0100 Subject: [PATCH 87/90] selftests/bpf: Add BTF_KIND_FLOAT to btf_dump_test_case_syntax Check that dumping various floating-point types produces a valid C code. Suggested-by: Andrii Nakryiko Signed-off-by: Ilya Leoshkevich Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210309005649.162480-3-iii@linux.ibm.com --- .../selftests/bpf/progs/btf_dump_test_case_syntax.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c b/tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c index 31975c96e2c9..12b40dc81e14 100644 --- a/tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c +++ b/tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c @@ -205,6 +205,12 @@ struct struct_with_embedded_stuff { int t[11]; }; +struct float_struct { + float f; + const double *d; + volatile long double *ld; +}; + struct root_struct { enum e1 _1; enum e2 _2; @@ -219,6 +225,7 @@ struct root_struct { union_fwd_t *_12; union_fwd_ptr_t _13; struct struct_with_embedded_stuff _14; + struct float_struct _15; }; /* ------ END-EXPECTED-OUTPUT ------ */ From 11d39cfeecfc9d92a5faa2a55c228e796478e0cb Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 8 Mar 2021 20:43:22 -0800 Subject: [PATCH 88/90] selftests/bpf: Fix compiler warning in BPF_KPROBE definition in loop6.c Add missing return type to BPF_KPROBE definition. Without it, compiler generates the following warning: progs/loop6.c:68:12: warning: type specifier missing, defaults to 'int' [-Wimplicit-int] BPF_KPROBE(trace_virtqueue_add_sgs, void *unused, struct scatterlist **sgs, ^ 1 warning generated. Fixes: 86a35af628e5 ("selftests/bpf: Add a verifier scale test with unknown bounded loop") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210309044322.3487636-1-andrii@kernel.org --- tools/testing/selftests/bpf/progs/loop6.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/loop6.c b/tools/testing/selftests/bpf/progs/loop6.c index 2a7141ac1656..38de0331e6b4 100644 --- a/tools/testing/selftests/bpf/progs/loop6.c +++ b/tools/testing/selftests/bpf/progs/loop6.c @@ -65,8 +65,8 @@ int config = 0; int result = 0; SEC("kprobe/virtqueue_add_sgs") -BPF_KPROBE(trace_virtqueue_add_sgs, void *unused, struct scatterlist **sgs, - unsigned int out_sgs, unsigned int in_sgs) +int BPF_KPROBE(trace_virtqueue_add_sgs, void *unused, struct scatterlist **sgs, + unsigned int out_sgs, unsigned int in_sgs) { struct scatterlist *sgp = NULL; __u64 length1 = 0, length2 = 0; From e6a4750ffe9d701c4d55212b14b615e63571d235 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Mon, 8 Mar 2021 12:29:06 +0100 Subject: [PATCH 89/90] bpf, xdp: Make bpf_redirect_map() a map operation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently the bpf_redirect_map() implementation dispatches to the correct map-lookup function via a switch-statement. To avoid the dispatching, this change adds bpf_redirect_map() as a map operation. 
Each map provides its bpf_redirect_map() version, and correct function is automatically selected by the BPF verifier. A nice side-effect of the code movement is that the map lookup functions are now local to the map implementation files, which removes one additional function call. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann Acked-by: Jesper Dangaard Brouer Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20210308112907.559576-2-bjorn.topel@gmail.com --- include/linux/bpf.h | 26 ++++++-------------------- include/linux/filter.h | 27 +++++++++++++++++++++++++++ include/net/xdp_sock.h | 19 ------------------- kernel/bpf/cpumap.c | 8 +++++++- kernel/bpf/devmap.c | 16 ++++++++++++++-- kernel/bpf/verifier.c | 13 +++++++++++-- net/core/filter.c | 39 +-------------------------------------- net/xdp/xskmap.c | 16 ++++++++++++++++ 8 files changed, 82 insertions(+), 82 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c931bc97019d..a25730eaa148 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -118,6 +118,9 @@ struct bpf_map_ops { void *owner, u32 size); struct bpf_local_storage __rcu ** (*map_owner_storage_ptr)(void *owner); + /* Misc helpers.*/ + int (*map_redirect)(struct bpf_map *map, u32 ifindex, u64 flags); + /* map_meta_equal must be implemented for maps that can be * used as an inner map. It is a runtime check to ensure * an inner map can be inserted to an outer map. @@ -1450,9 +1453,9 @@ struct btf *bpf_get_btf_vmlinux(void); /* Map specifics */ struct xdp_buff; struct sk_buff; +struct bpf_dtab_netdev; +struct bpf_cpu_map_entry; -struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key); -struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key); void __dev_flush(void); int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, struct net_device *dev_rx); @@ -1462,7 +1465,6 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, struct bpf_prog *xdp_prog); bool dev_map_can_have_prog(struct bpf_map *map); -struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key); void __cpu_map_flush(void); int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, struct net_device *dev_rx); @@ -1593,17 +1595,6 @@ static inline int bpf_obj_get_user(const char __user *pathname, int flags) return -EOPNOTSUPP; } -static inline struct net_device *__dev_map_lookup_elem(struct bpf_map *map, - u32 key) -{ - return NULL; -} - -static inline struct net_device *__dev_map_hash_lookup_elem(struct bpf_map *map, - u32 key) -{ - return NULL; -} static inline bool dev_map_can_have_prog(struct bpf_map *map) { return false; @@ -1615,6 +1606,7 @@ static inline void __dev_flush(void) struct xdp_buff; struct bpf_dtab_netdev; +struct bpf_cpu_map_entry; static inline int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, @@ -1639,12 +1631,6 @@ static inline int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, return 0; } -static inline -struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key) -{ - return NULL; -} - static inline void __cpu_map_flush(void) { } diff --git a/include/linux/filter.h b/include/linux/filter.h index 3b00fc906ccd..008691fd3b58 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1472,4 +1472,31 @@ static inline bool bpf_sk_lookup_run_v6(struct net *net, int protocol, } #endif /* IS_ENABLED(CONFIG_IPV6) */ +static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u32 
ifindex, u64 flags, + void *lookup_elem(struct bpf_map *map, u32 key)) +{ + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + + /* Lower bits of the flags are used as return code on lookup failure */ + if (unlikely(flags > XDP_TX)) + return XDP_ABORTED; + + ri->tgt_value = lookup_elem(map, ifindex); + if (unlikely(!ri->tgt_value)) { + /* If the lookup fails we want to clear out the state in the + * redirect_info struct completely, so that if an eBPF program + * performs multiple lookups, the last one always takes + * precedence. + */ + WRITE_ONCE(ri->map, NULL); + return flags; + } + + ri->flags = flags; + ri->tgt_index = ifindex; + WRITE_ONCE(ri->map, map); + + return XDP_REDIRECT; +} + #endif /* __LINUX_FILTER_H__ */ diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index cc17bc957548..9c0722c6d7ac 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -80,19 +80,6 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp); void __xsk_map_flush(void); -static inline struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, - u32 key) -{ - struct xsk_map *m = container_of(map, struct xsk_map, map); - struct xdp_sock *xs; - - if (key >= map->max_entries) - return NULL; - - xs = READ_ONCE(m->xsk_map[key]); - return xs; -} - #else static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) @@ -109,12 +96,6 @@ static inline void __xsk_map_flush(void) { } -static inline struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, - u32 key) -{ - return NULL; -} - #endif /* CONFIG_XDP_SOCKETS */ #endif /* _LINUX_XDP_SOCK_H */ diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 5d1469de6921..7352d4160b7f 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -563,7 +563,7 @@ static void cpu_map_free(struct bpf_map *map) kfree(cmap); } -struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key) +static void *__cpu_map_lookup_elem(struct bpf_map *map, u32 key) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); struct bpf_cpu_map_entry *rcpu; @@ -600,6 +600,11 @@ static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key) return 0; } +static int cpu_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags) +{ + return __bpf_xdp_redirect_map(map, ifindex, flags, __cpu_map_lookup_elem); +} + static int cpu_map_btf_id; const struct bpf_map_ops cpu_map_ops = { .map_meta_equal = bpf_map_meta_equal, @@ -612,6 +617,7 @@ const struct bpf_map_ops cpu_map_ops = { .map_check_btf = map_check_no_btf, .map_btf_name = "bpf_cpu_map", .map_btf_id = &cpu_map_btf_id, + .map_redirect = cpu_map_redirect, }; static void bq_flush_to_queue(struct xdp_bulk_queue *bq) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 85d9d1b72a33..f7f42448259f 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -258,7 +258,7 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key) return 0; } -struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key) +static void *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct hlist_head *head = dev_map_index_hash(dtab, key); @@ -392,7 +392,7 @@ void __dev_flush(void) * update happens in parallel here a dev_put wont happen until after reading the * ifindex. 
*/ -struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key) +static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *obj; @@ -735,6 +735,16 @@ static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value, map, key, value, map_flags); } +static int dev_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags) +{ + return __bpf_xdp_redirect_map(map, ifindex, flags, __dev_map_lookup_elem); +} + +static int dev_hash_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags) +{ + return __bpf_xdp_redirect_map(map, ifindex, flags, __dev_map_hash_lookup_elem); +} + static int dev_map_btf_id; const struct bpf_map_ops dev_map_ops = { .map_meta_equal = bpf_map_meta_equal, @@ -747,6 +757,7 @@ const struct bpf_map_ops dev_map_ops = { .map_check_btf = map_check_no_btf, .map_btf_name = "bpf_dtab", .map_btf_id = &dev_map_btf_id, + .map_redirect = dev_map_redirect, }; static int dev_map_hash_map_btf_id; @@ -761,6 +772,7 @@ const struct bpf_map_ops dev_map_hash_ops = { .map_check_btf = map_check_no_btf, .map_btf_name = "bpf_dtab", .map_btf_id = &dev_map_hash_map_btf_id, + .map_redirect = dev_hash_map_redirect, }; static void dev_map_hash_remove_netdev(struct bpf_dtab *dtab, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9fe90ce52a65..97eb0b2435b8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5582,7 +5582,8 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, func_id != BPF_FUNC_map_push_elem && func_id != BPF_FUNC_map_pop_elem && func_id != BPF_FUNC_map_peek_elem && - func_id != BPF_FUNC_for_each_map_elem) + func_id != BPF_FUNC_for_each_map_elem && + func_id != BPF_FUNC_redirect_map) return 0; if (map == NULL) { @@ -12017,7 +12018,8 @@ static int do_misc_fixups(struct bpf_verifier_env *env) insn->imm == BPF_FUNC_map_delete_elem || insn->imm == BPF_FUNC_map_push_elem || insn->imm == BPF_FUNC_map_pop_elem || - insn->imm == BPF_FUNC_map_peek_elem)) { + insn->imm == BPF_FUNC_map_peek_elem || + insn->imm == BPF_FUNC_redirect_map)) { aux = &env->insn_aux_data[i + delta]; if (bpf_map_ptr_poisoned(aux)) goto patch_call_imm; @@ -12059,6 +12061,9 @@ static int do_misc_fixups(struct bpf_verifier_env *env) (int (*)(struct bpf_map *map, void *value))NULL)); BUILD_BUG_ON(!__same_type(ops->map_peek_elem, (int (*)(struct bpf_map *map, void *value))NULL)); + BUILD_BUG_ON(!__same_type(ops->map_redirect, + (int (*)(struct bpf_map *map, u32 ifindex, u64 flags))NULL)); + patch_map_ops_generic: switch (insn->imm) { case BPF_FUNC_map_lookup_elem: @@ -12085,6 +12090,10 @@ patch_map_ops_generic: insn->imm = BPF_CAST_CALL(ops->map_peek_elem) - __bpf_call_base; continue; + case BPF_FUNC_redirect_map: + insn->imm = BPF_CAST_CALL(ops->map_redirect) - + __bpf_call_base; + continue; } goto patch_call_imm; diff --git a/net/core/filter.c b/net/core/filter.c index 588b19ba0da8..183b0aa6b027 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3943,22 +3943,6 @@ void xdp_do_flush(void) } EXPORT_SYMBOL_GPL(xdp_do_flush); -static inline void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index) -{ - switch (map->map_type) { - case BPF_MAP_TYPE_DEVMAP: - return __dev_map_lookup_elem(map, index); - case BPF_MAP_TYPE_DEVMAP_HASH: - return __dev_map_hash_lookup_elem(map, index); - case BPF_MAP_TYPE_CPUMAP: - return __cpu_map_lookup_elem(map, index); - case BPF_MAP_TYPE_XSKMAP: - return __xsk_map_lookup_elem(map, index); - default: - 
return NULL; - } -} - void bpf_clear_redirect_map(struct bpf_map *map) { struct bpf_redirect_info *ri; @@ -4112,28 +4096,7 @@ static const struct bpf_func_proto bpf_xdp_redirect_proto = { BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, u64, flags) { - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); - - /* Lower bits of the flags are used as return code on lookup failure */ - if (unlikely(flags > XDP_TX)) - return XDP_ABORTED; - - ri->tgt_value = __xdp_map_lookup_elem(map, ifindex); - if (unlikely(!ri->tgt_value)) { - /* If the lookup fails we want to clear out the state in the - * redirect_info struct completely, so that if an eBPF program - * performs multiple lookups, the last one always takes - * precedence. - */ - WRITE_ONCE(ri->map, NULL); - return flags; - } - - ri->flags = flags; - ri->tgt_index = ifindex; - WRITE_ONCE(ri->map, map); - - return XDP_REDIRECT; + return map->ops->map_redirect(map, ifindex, flags); } static const struct bpf_func_proto bpf_xdp_redirect_map_proto = { diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c index 113fd9017203..fbeb4870f798 100644 --- a/net/xdp/xskmap.c +++ b/net/xdp/xskmap.c @@ -125,6 +125,16 @@ static int xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) return insn - insn_buf; } +static void *__xsk_map_lookup_elem(struct bpf_map *map, u32 key) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + + if (key >= map->max_entries) + return NULL; + + return READ_ONCE(m->xsk_map[key]); +} + static void *xsk_map_lookup_elem(struct bpf_map *map, void *key) { WARN_ON_ONCE(!rcu_read_lock_held()); @@ -215,6 +225,11 @@ static int xsk_map_delete_elem(struct bpf_map *map, void *key) return 0; } +static int xsk_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags) +{ + return __bpf_xdp_redirect_map(map, ifindex, flags, __xsk_map_lookup_elem); +} + void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, struct xdp_sock **map_entry) { @@ -247,4 +262,5 @@ const struct bpf_map_ops xsk_map_ops = { .map_check_btf = map_check_no_btf, .map_btf_name = "xsk_map", .map_btf_id = &xsk_map_btf_id, + .map_redirect = xsk_map_redirect, }; From ee75aef23afe6e88497151c127c13ed69f41aaa2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Mon, 8 Mar 2021 12:29:07 +0100 Subject: [PATCH 90/90] bpf, xdp: Restructure redirect actions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The XDP_REDIRECT implementations for maps and non-maps are fairly similar, but obviously need to take different code paths depending on if the target is using a map or not. Today, the redirect targets for XDP either uses a map, or is based on ifindex. Here, the map type and id are added to bpf_redirect_info, instead of the actual map. Map type, map item/ifindex, and the map_id (if any) is passed to xdp_do_redirect(). For ifindex-based redirect, used by the bpf_redirect() XDP BFP helper, a special map type/id are used. Map type of UNSPEC together with map id equal to INT_MAX has the special meaning of an ifindex based redirect. Note that valid map ids are 1 inclusive, INT_MAX exclusive ([1,INT_MAX[). In addition to making the code easier to follow, using explicit type and id in bpf_redirect_info has a slight positive performance impact by avoiding a pointer indirection for the map type lookup, and instead use the cacheline for bpf_redirect_info. Since the actual map is not passed via bpf_redirect_info anymore, the map lookup is only done in the BPF helper. 
This means that the bpf_clear_redirect_map() function can be removed. The actual map item is RCU protected. The bpf_redirect_info flags member is not used by XDP, and not read/written any more. The map member is only written to when required/used, and not unconditionally. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann Reviewed-by: Maciej Fijalkowski Acked-by: Jesper Dangaard Brouer Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20210308112907.559576-3-bjorn.topel@gmail.com --- include/linux/filter.h | 10 ++- include/trace/events/xdp.h | 62 ++++++++------ kernel/bpf/cpumap.c | 1 - kernel/bpf/devmap.c | 1 - net/core/filter.c | 168 ++++++++++++++++--------------------- net/xdp/xskmap.c | 1 - 6 files changed, 115 insertions(+), 128 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index 008691fd3b58..b2b85b2cad8e 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -646,7 +646,8 @@ struct bpf_redirect_info { u32 flags; u32 tgt_index; void *tgt_value; - struct bpf_map *map; + u32 map_id; + enum bpf_map_type map_type; u32 kern_flags; struct bpf_nh_params nh; }; @@ -1488,13 +1489,14 @@ static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u32 ifind * performs multiple lookups, the last one always takes * precedence. */ - WRITE_ONCE(ri->map, NULL); + ri->map_id = INT_MAX; /* Valid map id idr range: [1,INT_MAX[ */ + ri->map_type = BPF_MAP_TYPE_UNSPEC; return flags; } - ri->flags = flags; ri->tgt_index = ifindex; - WRITE_ONCE(ri->map, map); + ri->map_id = map->id; + ri->map_type = map->map_type; return XDP_REDIRECT; } diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index 76a97176ab81..fcad3645a70b 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -86,19 +86,15 @@ struct _bpf_dtab_netdev { }; #endif /* __DEVMAP_OBJ_TYPE */ -#define devmap_ifindex(tgt, map) \ - (((map->map_type == BPF_MAP_TYPE_DEVMAP || \ - map->map_type == BPF_MAP_TYPE_DEVMAP_HASH)) ? \ - ((struct _bpf_dtab_netdev *)tgt)->dev->ifindex : 0) - DECLARE_EVENT_CLASS(xdp_redirect_template, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, const void *tgt, int err, - const struct bpf_map *map, u32 index), + enum bpf_map_type map_type, + u32 map_id, u32 index), - TP_ARGS(dev, xdp, tgt, err, map, index), + TP_ARGS(dev, xdp, tgt, err, map_type, map_id, index), TP_STRUCT__entry( __field(int, prog_id) @@ -111,14 +107,22 @@ DECLARE_EVENT_CLASS(xdp_redirect_template, ), TP_fast_assign( + u32 ifindex = 0, map_index = index; + + if (map_type == BPF_MAP_TYPE_DEVMAP || map_type == BPF_MAP_TYPE_DEVMAP_HASH) { + ifindex = ((struct _bpf_dtab_netdev *)tgt)->dev->ifindex; + } else if (map_type == BPF_MAP_TYPE_UNSPEC && map_id == INT_MAX) { + ifindex = index; + map_index = 0; + } + __entry->prog_id = xdp->aux->id; __entry->act = XDP_REDIRECT; __entry->ifindex = dev->ifindex; __entry->err = err; - __entry->to_ifindex = map ? devmap_ifindex(tgt, map) : - index; - __entry->map_id = map ? map->id : 0; - __entry->map_index = map ? 
index : 0; + __entry->to_ifindex = ifindex; + __entry->map_id = map_id; + __entry->map_index = map_index; ), TP_printk("prog_id=%d action=%s ifindex=%d to_ifindex=%d err=%d" @@ -133,45 +137,49 @@ DEFINE_EVENT(xdp_redirect_template, xdp_redirect, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, const void *tgt, int err, - const struct bpf_map *map, u32 index), - TP_ARGS(dev, xdp, tgt, err, map, index) + enum bpf_map_type map_type, + u32 map_id, u32 index), + TP_ARGS(dev, xdp, tgt, err, map_type, map_id, index) ); DEFINE_EVENT(xdp_redirect_template, xdp_redirect_err, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, const void *tgt, int err, - const struct bpf_map *map, u32 index), - TP_ARGS(dev, xdp, tgt, err, map, index) + enum bpf_map_type map_type, + u32 map_id, u32 index), + TP_ARGS(dev, xdp, tgt, err, map_type, map_id, index) ); -#define _trace_xdp_redirect(dev, xdp, to) \ - trace_xdp_redirect(dev, xdp, NULL, 0, NULL, to) +#define _trace_xdp_redirect(dev, xdp, to) \ + trace_xdp_redirect(dev, xdp, NULL, 0, BPF_MAP_TYPE_UNSPEC, INT_MAX, to) -#define _trace_xdp_redirect_err(dev, xdp, to, err) \ - trace_xdp_redirect_err(dev, xdp, NULL, err, NULL, to) +#define _trace_xdp_redirect_err(dev, xdp, to, err) \ + trace_xdp_redirect_err(dev, xdp, NULL, err, BPF_MAP_TYPE_UNSPEC, INT_MAX, to) -#define _trace_xdp_redirect_map(dev, xdp, to, map, index) \ - trace_xdp_redirect(dev, xdp, to, 0, map, index) +#define _trace_xdp_redirect_map(dev, xdp, to, map_type, map_id, index) \ + trace_xdp_redirect(dev, xdp, to, 0, map_type, map_id, index) -#define _trace_xdp_redirect_map_err(dev, xdp, to, map, index, err) \ - trace_xdp_redirect_err(dev, xdp, to, err, map, index) +#define _trace_xdp_redirect_map_err(dev, xdp, to, map_type, map_id, index, err) \ + trace_xdp_redirect_err(dev, xdp, to, err, map_type, map_id, index) /* not used anymore, but kept around so as not to break old programs */ DEFINE_EVENT(xdp_redirect_template, xdp_redirect_map, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, const void *tgt, int err, - const struct bpf_map *map, u32 index), - TP_ARGS(dev, xdp, tgt, err, map, index) + enum bpf_map_type map_type, + u32 map_id, u32 index), + TP_ARGS(dev, xdp, tgt, err, map_type, map_id, index) ); DEFINE_EVENT(xdp_redirect_template, xdp_redirect_map_err, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, const void *tgt, int err, - const struct bpf_map *map, u32 index), - TP_ARGS(dev, xdp, tgt, err, map, index) + enum bpf_map_type map_type, + u32 map_id, u32 index), + TP_ARGS(dev, xdp, tgt, err, map_type, map_id, index) ); TRACE_EVENT(xdp_cpumap_kthread, diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 7352d4160b7f..0cf2791d5099 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -543,7 +543,6 @@ static void cpu_map_free(struct bpf_map *map) * complete. */ - bpf_clear_redirect_map(map); synchronize_rcu(); /* For cpu_map the remote CPUs can still be using the entries diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index f7f42448259f..7a5ad7331c3b 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -197,7 +197,6 @@ static void dev_map_free(struct bpf_map *map) list_del_rcu(&dtab->list); spin_unlock(&dev_map_lock); - bpf_clear_redirect_map(map); synchronize_rcu(); /* Make sure prior __dev_map_entry_free() have completed. 
*/ diff --git a/net/core/filter.c b/net/core/filter.c index 183b0aa6b027..b6732000d8a2 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3918,23 +3918,6 @@ static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = { .arg2_type = ARG_ANYTHING, }; -static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, - struct bpf_map *map, struct xdp_buff *xdp) -{ - switch (map->map_type) { - case BPF_MAP_TYPE_DEVMAP: - case BPF_MAP_TYPE_DEVMAP_HASH: - return dev_map_enqueue(fwd, xdp, dev_rx); - case BPF_MAP_TYPE_CPUMAP: - return cpu_map_enqueue(fwd, xdp, dev_rx); - case BPF_MAP_TYPE_XSKMAP: - return __xsk_map_redirect(fwd, xdp); - default: - return -EBADRQC; - } - return 0; -} - void xdp_do_flush(void) { __dev_flush(); @@ -3943,55 +3926,52 @@ void xdp_do_flush(void) } EXPORT_SYMBOL_GPL(xdp_do_flush); -void bpf_clear_redirect_map(struct bpf_map *map) -{ - struct bpf_redirect_info *ri; - int cpu; - - for_each_possible_cpu(cpu) { - ri = per_cpu_ptr(&bpf_redirect_info, cpu); - /* Avoid polluting remote cacheline due to writes if - * not needed. Once we pass this test, we need the - * cmpxchg() to make sure it hasn't been changed in - * the meantime by remote CPU. - */ - if (unlikely(READ_ONCE(ri->map) == map)) - cmpxchg(&ri->map, map, NULL); - } -} - int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); - struct bpf_map *map = READ_ONCE(ri->map); - u32 index = ri->tgt_index; + enum bpf_map_type map_type = ri->map_type; void *fwd = ri->tgt_value; + u32 map_id = ri->map_id; int err; - ri->tgt_index = 0; - ri->tgt_value = NULL; - WRITE_ONCE(ri->map, NULL); + ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */ + ri->map_type = BPF_MAP_TYPE_UNSPEC; - if (unlikely(!map)) { - fwd = dev_get_by_index_rcu(dev_net(dev), index); - if (unlikely(!fwd)) { - err = -EINVAL; - goto err; + switch (map_type) { + case BPF_MAP_TYPE_DEVMAP: + fallthrough; + case BPF_MAP_TYPE_DEVMAP_HASH: + err = dev_map_enqueue(fwd, xdp, dev); + break; + case BPF_MAP_TYPE_CPUMAP: + err = cpu_map_enqueue(fwd, xdp, dev); + break; + case BPF_MAP_TYPE_XSKMAP: + err = __xsk_map_redirect(fwd, xdp); + break; + case BPF_MAP_TYPE_UNSPEC: + if (map_id == INT_MAX) { + fwd = dev_get_by_index_rcu(dev_net(dev), ri->tgt_index); + if (unlikely(!fwd)) { + err = -EINVAL; + break; + } + err = dev_xdp_enqueue(fwd, xdp, dev); + break; } - - err = dev_xdp_enqueue(fwd, xdp, dev); - } else { - err = __bpf_tx_xdp_map(dev, fwd, map, xdp); + fallthrough; + default: + err = -EBADRQC; } if (unlikely(err)) goto err; - _trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index); + _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index); return 0; err: - _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err); + _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err); return err; } EXPORT_SYMBOL_GPL(xdp_do_redirect); @@ -4000,41 +3980,36 @@ static int xdp_do_generic_redirect_map(struct net_device *dev, struct sk_buff *skb, struct xdp_buff *xdp, struct bpf_prog *xdp_prog, - struct bpf_map *map) + void *fwd, + enum bpf_map_type map_type, u32 map_id) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); - u32 index = ri->tgt_index; - void *fwd = ri->tgt_value; - int err = 0; + int err; - ri->tgt_index = 0; - ri->tgt_value = NULL; - WRITE_ONCE(ri->map, NULL); - - if (map->map_type == BPF_MAP_TYPE_DEVMAP || - map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { - struct bpf_dtab_netdev *dst 
= fwd; - - err = dev_map_generic_redirect(dst, skb, xdp_prog); + switch (map_type) { + case BPF_MAP_TYPE_DEVMAP: + fallthrough; + case BPF_MAP_TYPE_DEVMAP_HASH: + err = dev_map_generic_redirect(fwd, skb, xdp_prog); if (unlikely(err)) goto err; - } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { - struct xdp_sock *xs = fwd; - - err = xsk_generic_rcv(xs, xdp); + break; + case BPF_MAP_TYPE_XSKMAP: + err = xsk_generic_rcv(fwd, xdp); if (err) goto err; consume_skb(skb); - } else { + break; + default: /* TODO: Handle BPF_MAP_TYPE_CPUMAP */ err = -EBADRQC; goto err; } - _trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index); + _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index); return 0; err: - _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err); + _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err); return err; } @@ -4042,31 +4017,34 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); - struct bpf_map *map = READ_ONCE(ri->map); - u32 index = ri->tgt_index; - struct net_device *fwd; - int err = 0; + enum bpf_map_type map_type = ri->map_type; + void *fwd = ri->tgt_value; + u32 map_id = ri->map_id; + int err; - if (map) - return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog, - map); - ri->tgt_index = 0; - fwd = dev_get_by_index_rcu(dev_net(dev), index); - if (unlikely(!fwd)) { - err = -EINVAL; - goto err; + ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */ + ri->map_type = BPF_MAP_TYPE_UNSPEC; + + if (map_type == BPF_MAP_TYPE_UNSPEC && map_id == INT_MAX) { + fwd = dev_get_by_index_rcu(dev_net(dev), ri->tgt_index); + if (unlikely(!fwd)) { + err = -EINVAL; + goto err; + } + + err = xdp_ok_fwd_dev(fwd, skb->len); + if (unlikely(err)) + goto err; + + skb->dev = fwd; + _trace_xdp_redirect(dev, xdp_prog, ri->tgt_index); + generic_xdp_tx(skb, xdp_prog); + return 0; } - err = xdp_ok_fwd_dev(fwd, skb->len); - if (unlikely(err)) - goto err; - - skb->dev = fwd; - _trace_xdp_redirect(dev, xdp_prog, index); - generic_xdp_tx(skb, xdp_prog); - return 0; + return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog, fwd, map_type, map_id); err: - _trace_xdp_redirect_err(dev, xdp_prog, index, err); + _trace_xdp_redirect_err(dev, xdp_prog, ri->tgt_index, err); return err; } @@ -4077,10 +4055,12 @@ BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags) if (unlikely(flags)) return XDP_ABORTED; - ri->flags = flags; + /* NB! Map type UNSPEC and map_id == INT_MAX (never generated + * by map_idr) is used for ifindex based XDP redirect. + */ ri->tgt_index = ifindex; - ri->tgt_value = NULL; - WRITE_ONCE(ri->map, NULL); + ri->map_id = INT_MAX; + ri->map_type = BPF_MAP_TYPE_UNSPEC; return XDP_REDIRECT; } diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c index fbeb4870f798..67b4ce504852 100644 --- a/net/xdp/xskmap.c +++ b/net/xdp/xskmap.c @@ -87,7 +87,6 @@ static void xsk_map_free(struct bpf_map *map) { struct xsk_map *m = container_of(map, struct xsk_map, map); - bpf_clear_redirect_map(map); synchronize_net(); bpf_map_area_free(m); }
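
For illustration only (not part of the series), a minimal sketch of the program-side usage that now goes through the new map_redirect operation; the map name, its size and the fixed key are hypothetical, the helper call itself is unchanged for programs:

  #include <linux/bpf.h>
  #include <bpf/bpf_helpers.h>

  /* Hypothetical devmap: key = slot index, value = egress ifindex. */
  struct {
  	__uint(type, BPF_MAP_TYPE_DEVMAP);
  	__uint(max_entries, 64);
  	__type(key, __u32);
  	__type(value, __u32);
  } tx_port SEC(".maps");

  SEC("xdp")
  int xdp_redirect_prog(struct xdp_md *ctx)
  {
  	/* After this series the verifier patches this call to
  	 * dev_map_redirect(), which records map_type/map_id in
  	 * bpf_redirect_info instead of a map pointer;
  	 * xdp_do_redirect() then switches on the recorded map type.
  	 * The lower bits of the flags (here XDP_PASS) are still the
  	 * return code used on lookup failure.
  	 */
  	return bpf_redirect_map(&tx_port, 0, XDP_PASS);
  }

  char _license[] SEC("license") = "GPL";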