From b22bf1b9979a608827dea98c61ed9ec297bcc513 Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Tue, 4 Jan 2022 18:59:29 +0100 Subject: [PATCH 01/41] bpftool: Refactor misc. feature probe There is currently a single miscellaneous feature probe, HAVE_LARGE_INSN_LIMIT, to check for the 1M instructions limit in the verifier. Subsequent patches will add additional miscellaneous probes, which follow the same pattern at the existing probe. This patch therefore refactors the probe to avoid code duplication in subsequent patches. The BPF program type and the checked error numbers in the HAVE_LARGE_INSN_LIMIT probe are changed to better generalize to other probes. The feature probe retains its current behavior despite those changes. Signed-off-by: Paul Chaignon Signed-off-by: Daniel Borkmann Reviewed-by: Quentin Monnet Link: https://lore.kernel.org/bpf/956c9329a932c75941194f91790d01f31dfbe01b.1641314075.git.paul@isovalent.com --- tools/bpf/bpftool/feature.c | 45 ++++++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 16 deletions(-) diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c index 6719b9282eca..3da97a02f455 100644 --- a/tools/bpf/bpftool/feature.c +++ b/tools/bpf/bpftool/feature.c @@ -642,6 +642,30 @@ probe_helpers_for_progtype(enum bpf_prog_type prog_type, bool supported_type, printf("\n"); } +static void +probe_misc_feature(struct bpf_insn *insns, size_t len, + const char *define_prefix, __u32 ifindex, + const char *feat_name, const char *plain_name, + const char *define_name) +{ + LIBBPF_OPTS(bpf_prog_load_opts, opts, + .prog_ifindex = ifindex, + ); + bool res; + int fd; + + errno = 0; + fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, NULL, "GPL", + insns, len, &opts); + res = fd >= 0 || !errno; + + if (fd >= 0) + close(fd); + + print_bool_feature(feat_name, plain_name, define_name, res, + define_prefix); +} + /* * Probe for availability of kernel commit (5.3): * @@ -649,29 +673,18 @@ probe_helpers_for_progtype(enum bpf_prog_type prog_type, bool supported_type, */ static void probe_large_insn_limit(const char *define_prefix, __u32 ifindex) { - LIBBPF_OPTS(bpf_prog_load_opts, opts, - .prog_ifindex = ifindex, - ); struct bpf_insn insns[BPF_MAXINSNS + 1]; - bool res; - int i, fd; + int i; for (i = 0; i < BPF_MAXINSNS; i++) insns[i] = BPF_MOV64_IMM(BPF_REG_0, 1); insns[BPF_MAXINSNS] = BPF_EXIT_INSN(); - errno = 0; - fd = bpf_prog_load(BPF_PROG_TYPE_SCHED_CLS, NULL, "GPL", - insns, ARRAY_SIZE(insns), &opts); - res = fd >= 0 || (errno != E2BIG && errno != EINVAL); - - if (fd >= 0) - close(fd); - - print_bool_feature("have_large_insn_limit", + probe_misc_feature(insns, ARRAY_SIZE(insns), + define_prefix, ifindex, + "have_large_insn_limit", "Large program size limit", - "LARGE_INSN_LIMIT", - res, define_prefix); + "LARGE_INSN_LIMIT"); } static void From c04fb2b0bd9275969be3b0a95f9c3ef76b1bfb73 Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Tue, 4 Jan 2022 18:59:57 +0100 Subject: [PATCH 02/41] bpftool: Probe for bounded loop support This patch introduces a new probe to check whether the verifier supports bounded loops as introduced in commit 2589726d12a1 ("bpf: introduce bounded loops"). This patch will allow BPF users such as Cilium to probe for loop support on startup and only unconditionally unroll loops on older kernels. The results are displayed as part of the miscellaneous section, as shown below. 
$ bpftool feature probe | grep loops Bounded loop support is available $ bpftool feature probe macro | grep LOOPS #define HAVE_BOUNDED_LOOPS $ bpftool feature probe -j | jq .misc { "have_large_insn_limit": true, "have_bounded_loops": true } Signed-off-by: Paul Chaignon Signed-off-by: Daniel Borkmann Reviewed-by: Quentin Monnet Link: https://lore.kernel.org/bpf/f7807c0b27d79f48e71de7b5a99c680ca4bd0151.1641314075.git.paul@isovalent.com --- tools/bpf/bpftool/feature.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c index 3da97a02f455..03579d113042 100644 --- a/tools/bpf/bpftool/feature.c +++ b/tools/bpf/bpftool/feature.c @@ -687,6 +687,27 @@ static void probe_large_insn_limit(const char *define_prefix, __u32 ifindex) "LARGE_INSN_LIMIT"); } +/* + * Probe for bounded loop support introduced in commit 2589726d12a1 + * ("bpf: introduce bounded loops"). + */ +static void +probe_bounded_loops(const char *define_prefix, __u32 ifindex) +{ + struct bpf_insn insns[4] = { + BPF_MOV64_IMM(BPF_REG_0, 10), + BPF_ALU64_IMM(BPF_SUB, BPF_REG_0, 1), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, -2), + BPF_EXIT_INSN() + }; + + probe_misc_feature(insns, ARRAY_SIZE(insns), + define_prefix, ifindex, + "have_bounded_loops", + "Bounded loop support", + "BOUNDED_LOOPS"); +} + static void section_system_config(enum probe_component target, const char *define_prefix) { @@ -801,6 +822,7 @@ static void section_misc(const char *define_prefix, __u32 ifindex) "/*** eBPF misc features ***/", define_prefix); probe_large_insn_limit(define_prefix, ifindex); + probe_bounded_loops(define_prefix, ifindex); print_end_section(); } From 0fd800b2456cf90ed738a1260b53acaa8843b5ae Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Tue, 4 Jan 2022 19:00:13 +0100 Subject: [PATCH 03/41] bpftool: Probe for instruction set extensions This patch introduces new probes to check whether the kernel supports instruction set extensions v2 and v3. The first introduced eBPF instructions BPF_J{LT,LE,SLT,SLE} in commit 92b31a9af73b ("bpf: add BPF_J{LT,LE,SLT,SLE} instructions"). The second introduces 32-bit variants of all jump instructions in commit 092ed0968bb6 ("bpf: verifier support JMP32"). These probes are useful for userspace BPF projects that want to use newer instruction set extensions on newer kernels, to reduce the programs' sizes or their complexity. LLVM already provides an mcpu=probe option to automatically probe the kernel and select the newest-supported instruction set extension. That is however not flexible enough for all use cases. For example, in Cilium, we only want to use the v3 instruction set extension on v5.10+, even though it is supported on all kernels v5.1+. Signed-off-by: Paul Chaignon Signed-off-by: Daniel Borkmann Reviewed-by: Quentin Monnet Link: https://lore.kernel.org/bpf/3bfedcd9898c1f41ac67ca61f144fec84c6c3a92.1641314075.git.paul@isovalent.com --- tools/bpf/bpftool/feature.c | 44 +++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c index 03579d113042..e999159fa28d 100644 --- a/tools/bpf/bpftool/feature.c +++ b/tools/bpf/bpftool/feature.c @@ -708,6 +708,48 @@ probe_bounded_loops(const char *define_prefix, __u32 ifindex) "BOUNDED_LOOPS"); } +/* + * Probe for the v2 instruction set extension introduced in commit 92b31a9af73b + * ("bpf: add BPF_J{LT,LE,SLT,SLE} instructions"). 
+ */ +static void +probe_v2_isa_extension(const char *define_prefix, __u32 ifindex) +{ + struct bpf_insn insns[4] = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_JMP_IMM(BPF_JLT, BPF_REG_0, 0, 1), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN() + }; + + probe_misc_feature(insns, ARRAY_SIZE(insns), + define_prefix, ifindex, + "have_v2_isa_extension", + "ISA extension v2", + "V2_ISA_EXTENSION"); +} + +/* + * Probe for the v3 instruction set extension introduced in commit 092ed0968bb6 + * ("bpf: verifier support JMP32"). + */ +static void +probe_v3_isa_extension(const char *define_prefix, __u32 ifindex) +{ + struct bpf_insn insns[4] = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_JMP32_IMM(BPF_JLT, BPF_REG_0, 0, 1), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN() + }; + + probe_misc_feature(insns, ARRAY_SIZE(insns), + define_prefix, ifindex, + "have_v3_isa_extension", + "ISA extension v3", + "V3_ISA_EXTENSION"); +} + static void section_system_config(enum probe_component target, const char *define_prefix) { @@ -823,6 +865,8 @@ static void section_misc(const char *define_prefix, __u32 ifindex) define_prefix); probe_large_insn_limit(define_prefix, ifindex); probe_bounded_loops(define_prefix, ifindex); + probe_v2_isa_extension(define_prefix, ifindex); + probe_v3_isa_extension(define_prefix, ifindex); print_end_section(); } From 5e22dd18626726028a93ff1350a8a71a00fd843d Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 4 Jan 2022 13:10:30 +0100 Subject: [PATCH 04/41] bpf/selftests: Fix namespace mount setup in tc_redirect The tc_redirect umounts /sys in the new namespace, which can be mounted as shared and cause global umount. The lazy umount also takes down mounted trees under /sys like debugfs, which won't be available after sysfs mounts again and could cause fails in other tests. # cat /proc/self/mountinfo | grep debugfs 34 23 0:7 / /sys/kernel/debug rw,nosuid,nodev,noexec,relatime shared:14 - debugfs debugfs rw # cat /proc/self/mountinfo | grep sysfs 23 86 0:22 / /sys rw,nosuid,nodev,noexec,relatime shared:2 - sysfs sysfs rw # mount | grep debugfs debugfs on /sys/kernel/debug type debugfs (rw,nosuid,nodev,noexec,relatime) # ./test_progs -t tc_redirect #164 tc_redirect:OK Summary: 1/4 PASSED, 0 SKIPPED, 0 FAILED # mount | grep debugfs # cat /proc/self/mountinfo | grep debugfs # cat /proc/self/mountinfo | grep sysfs 25 86 0:22 / /sys rw,relatime shared:2 - sysfs sysfs rw Making the sysfs private under the new namespace so the umount won't trigger the global sysfs umount. Reported-by: Hangbin Liu Signed-off-by: Jiri Olsa Signed-off-by: Daniel Borkmann Cc: Jussi Maki Link: https://lore.kernel.org/bpf/20220104121030.138216-1-jolsa@kernel.org --- tools/testing/selftests/bpf/prog_tests/tc_redirect.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c index 4b18b73df10b..c2426df58e17 100644 --- a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c +++ b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c @@ -105,6 +105,13 @@ static int setns_by_fd(int nsfd) if (!ASSERT_OK(err, "unshare")) return err; + /* Make our /sys mount private, so the following umount won't + * trigger the global umount in case it's shared. 
+ */ + err = mount("none", "/sys", NULL, MS_PRIVATE, NULL); + if (!ASSERT_OK(err, "remount private /sys")) + return err; + err = umount2("/sys", MNT_DETACH); if (!ASSERT_OK(err, "umount2 /sys")) return err; From e4a41c2c1fa916547e63440c73a51a5eb06247af Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Fri, 31 Dec 2021 23:10:18 +0800 Subject: [PATCH 05/41] bpf, arm64: Use emit_addr_mov_i64() for BPF_PSEUDO_FUNC The following error is reported when running "./test_progs -t for_each" under arm64: bpf_jit: multi-func JIT bug 58 != 56 [...] JIT doesn't support bpf-to-bpf calls The root cause is the size of BPF_PSEUDO_FUNC instruction increases from 2 to 3 after the address of called bpf-function is settled and there are two bpf-to-bpf calls in test_pkt_access. The generated instructions are shown below: 0x48: 21 00 C0 D2 movz x1, #0x1, lsl #32 0x4c: 21 00 80 F2 movk x1, #0x1 0x48: E1 3F C0 92 movn x1, #0x1ff, lsl #32 0x4c: 41 FE A2 F2 movk x1, #0x17f2, lsl #16 0x50: 81 70 9F F2 movk x1, #0xfb84 Fixing it by using emit_addr_mov_i64() for BPF_PSEUDO_FUNC, so the size of jited image will not change. Fixes: 69c087ba6225 ("bpf: Add bpf_for_each_map_elem() helper") Signed-off-by: Hou Tao Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20211231151018.3781550-1-houtao1@huawei.com --- arch/arm64/net/bpf_jit_comp.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 07aad85848fa..e96d4d87291f 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -792,7 +792,10 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, u64 imm64; imm64 = (u64)insn1.imm << 32 | (u32)imm; - emit_a64_mov_i64(dst, imm64, ctx); + if (bpf_pseudo_func(insn)) + emit_addr_mov_i64(dst, imm64, ctx); + else + emit_a64_mov_i64(dst, imm64, ctx); return 1; } From 5b2c5540b8110eea0d67a78fb0ddb9654c58daeb Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 4 Jan 2022 12:59:18 -0800 Subject: [PATCH 06/41] bpf, sockmap: Fix return codes from tcp_bpf_recvmsg_parser() Applications can be confused slightly because we do not always return the same error code as expected, e.g. what the TCP stack normally returns. For example on a sock err sk->sk_err instead of returning the sock_error we return EAGAIN. This usually means the application will 'try again' instead of aborting immediately. Another example, when a shutdown event is received we should immediately abort instead of waiting for data when the user provides a timeout. These tend to not be fatal, applications usually recover, but introduces bogus errors to the user or introduces unexpected latency. Before 'c5d2177a72a16' we fell back to the TCP stack when no data was available so we managed to catch many of the cases here, although with the extra latency cost of calling tcp_msg_wait_data() first. To fix lets duplicate the error handling in TCP stack into tcp_bpf so that we get the same error codes. These were found in our CI tests that run applications against sockmap and do longer lived testing, at least compared to test_sockmap that does short-lived ping/pong tests, and in some of our test clusters we deploy. Its non-trivial to do these in a shorter form CI tests that would be appropriate for BPF selftests, but we are looking into it so we can ensure this keeps working going forward. As a preview one idea is to pull in the packetdrill testing which catches some of this. 
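From the application side the problem shows up in the return value of recvmsg() on a sockmap socket. The sketch below is illustrative only (not part of the patch) and shows the kind of read loop that gets confused when a peer shutdown is reported as EAGAIN instead of 0; the function name drain() is made up for the example and it assumes <errno.h> and <sys/socket.h>.

/* Illustrative: read until EOF on a socket attached to a sockmap.
 * Before this fix a peer shutdown could surface as EAGAIN (after the
 * receive timeout ran out) instead of 0, so a loop like this keeps
 * retrying instead of terminating cleanly.
 */
static ssize_t drain(int fd, char *buf, size_t len)
{
	for (;;) {
		ssize_t n = recv(fd, buf, len, 0);

		if (n > 0)
			continue;	/* got data, keep reading */
		if (n == 0)
			return 0;	/* orderly shutdown, done */
		if (errno == EAGAIN || errno == EINTR)
			continue;	/* treated as transient, retry */
		return -1;		/* hard error */
	}
}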
Fixes: c5d2177a72a16 ("bpf, sockmap: Fix race in ingress receive verdict with redirect to self") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220104205918.286416-1-john.fastabend@gmail.com --- net/ipv4/tcp_bpf.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index f70aa0932bd6..9b9b02052fd3 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -196,12 +196,39 @@ static int tcp_bpf_recvmsg_parser(struct sock *sk, long timeo; int data; + if (sock_flag(sk, SOCK_DONE)) + goto out; + + if (sk->sk_err) { + copied = sock_error(sk); + goto out; + } + + if (sk->sk_shutdown & RCV_SHUTDOWN) + goto out; + + if (sk->sk_state == TCP_CLOSE) { + copied = -ENOTCONN; + goto out; + } + timeo = sock_rcvtimeo(sk, nonblock); + if (!timeo) { + copied = -EAGAIN; + goto out; + } + + if (signal_pending(current)) { + copied = sock_intr_errno(timeo); + goto out; + } + data = tcp_msg_wait_data(sk, psock, timeo); if (data && !sk_psock_queue_empty(psock)) goto msg_bytes_ready; copied = -EAGAIN; } +out: release_sock(sk); sk_psock_put(sk, psock); return copied; From 218d747a4142f281a256687bb513a135c905867b Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 4 Jan 2022 13:46:45 -0800 Subject: [PATCH 07/41] bpf, sockmap: Fix double bpf_prog_put on error case in map_link sock_map_link() is called to update a sockmap entry with a sk. But, if the sock_map_init_proto() call fails then we return an error to the map_update op against the sockmap. In the error path though we need to cleanup psock and dec the refcnt on any programs associated with the map, because we refcnt them early in the update process to ensure they are pinned for the psock. (This avoids a race where user deletes programs while also updating the map with new socks.) In current code we do the prog refcnt dec explicitely by calling bpf_prog_put() when the program was found in the map. But, after commit '38207a5e81230' in this error path we've already done the prog to psock assignment so the programs have a reference from the psock as well. This then causes the psock tear down logic, invoked by sk_psock_put() in the error path, to similarly call bpf_prog_put on the programs there. To be explicit this logic does the prog->psock assignment: if (msg_*) psock_set_prog(...) Then the error path under the out_progs label does a similar check and dec with: if (msg_*) bpf_prog_put(...) And the teardown logic sk_psock_put() does ... psock_set_prog(msg_*, NULL) ... triggering another bpf_prog_put(...). Then KASAN gives us this splat, found by syzbot because we've created an inbalance between bpf_prog_inc and bpf_prog_put calling put twice on the program. BUG: KASAN: vmalloc-out-of-bounds in __bpf_prog_put kernel/bpf/syscall.c:1812 [inline] BUG: KASAN: vmalloc-out-of-bounds in __bpf_prog_put kernel/bpf/syscall.c:1812 [inline] kernel/bpf/syscall.c:1829 BUG: KASAN: vmalloc-out-of-bounds in bpf_prog_put+0x8c/0x4f0 kernel/bpf/syscall.c:1829 kernel/bpf/syscall.c:1829 Read of size 8 at addr ffffc90000e76038 by task syz-executor020/3641 To fix clean up error path so it doesn't try to do the bpf_prog_put in the error path once progs are assigned then it relies on the normal psock tear down logic to do complete cleanup. For completness we also cover the case whereh sk_psock_init_strp() fails, but this is not expected because it indicates an incorrect socket type and should be caught earlier. 
Fixes: 38207a5e8123 ("bpf, sockmap: Attach map progs to psock early for feature probes") Reported-by: syzbot+bb73e71cf4b8fd376a4f@syzkaller.appspotmail.com Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220104214645.290900-1-john.fastabend@gmail.com --- net/core/sock_map.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 9618ab6d7cc9..1827669eedd6 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -292,15 +292,23 @@ static int sock_map_link(struct bpf_map *map, struct sock *sk) if (skb_verdict) psock_set_prog(&psock->progs.skb_verdict, skb_verdict); + /* msg_* and stream_* programs references tracked in psock after this + * point. Reference dec and cleanup will occur through psock destructor + */ ret = sock_map_init_proto(sk, psock); - if (ret < 0) - goto out_drop; + if (ret < 0) { + sk_psock_put(sk, psock); + goto out; + } write_lock_bh(&sk->sk_callback_lock); if (stream_parser && stream_verdict && !psock->saved_data_ready) { ret = sk_psock_init_strp(sk, psock); - if (ret) - goto out_unlock_drop; + if (ret) { + write_unlock_bh(&sk->sk_callback_lock); + sk_psock_put(sk, psock); + goto out; + } sk_psock_start_strp(sk, psock); } else if (!stream_parser && stream_verdict && !psock->saved_data_ready) { sk_psock_start_verdict(sk,psock); @@ -309,10 +317,6 @@ static int sock_map_link(struct bpf_map *map, struct sock *sk) } write_unlock_bh(&sk->sk_callback_lock); return 0; -out_unlock_drop: - write_unlock_bh(&sk->sk_callback_lock); -out_drop: - sk_psock_put(sk, psock); out_progs: if (skb_verdict) bpf_prog_put(skb_verdict); @@ -325,6 +329,7 @@ static int sock_map_link(struct bpf_map *map, struct sock *sk) out_put_stream_verdict: if (stream_verdict) bpf_prog_put(stream_verdict); +out: return ret; } From e60b0d12a95dcf16a63225cead4541567f5cb517 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 5 Jan 2022 11:35:13 -0800 Subject: [PATCH 08/41] bpf: Don't promote bogus looking registers after null check. If we ever get to a point again where we convert a bogus looking _or_null typed register containing a non-zero fixed or variable offset, then lets not reset these bounds to zero since they are not and also don't promote the register to a type, but instead leave it as _or_null. Converting to a unknown register could be an avenue as well, but then if we run into this case it would allow to leak a kernel pointer this way. Fixes: f1174f77b50c ("bpf/verifier: rework value tracking") Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b70c66c6db3b..c8d9e761173b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9079,15 +9079,15 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, { if (type_may_be_null(reg->type) && reg->id == id && !WARN_ON_ONCE(!reg->id)) { - /* Old offset (both fixed and variable parts) should - * have been known-zero, because we don't allow pointer - * arithmetic on pointers that might be NULL. - */ if (WARN_ON_ONCE(reg->smin_value || reg->smax_value || !tnum_equals_const(reg->var_off, 0) || reg->off)) { - __mark_reg_known_zero(reg); - reg->off = 0; + /* Old offset (both fixed and variable parts) should + * have been known-zero, because we don't allow pointer + * arithmetic on pointers that might be NULL. 
If we + * see this happening, don't convert the register. + */ + return; } if (is_null) { reg->type = SCALAR_VALUE; From ca796fe66f7fceff17679ee6cc5fe4b4023de44d Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 5 Jan 2022 11:33:34 -0800 Subject: [PATCH 09/41] bpf, selftests: Add verifier test for mem_or_null register with offset. Add a new test case with mem_or_null typed register with off > 0 to ensure it gets rejected by the verifier: # ./test_verifier 1011 #1009/u check with invalid reg offset 0 OK #1009/p check with invalid reg offset 0 OK Summary: 2 PASSED, 0 SKIPPED, 0 FAILED Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/verifier/spill_fill.c | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tools/testing/selftests/bpf/verifier/spill_fill.c b/tools/testing/selftests/bpf/verifier/spill_fill.c index 6c907144311f..1a8eb9672bd1 100644 --- a/tools/testing/selftests/bpf/verifier/spill_fill.c +++ b/tools/testing/selftests/bpf/verifier/spill_fill.c @@ -58,6 +58,34 @@ .result = ACCEPT, .result_unpriv = ACCEPT, }, +{ + "check with invalid reg offset 0", + .insns = { + /* reserve 8 byte ringbuf memory */ + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_MOV64_IMM(BPF_REG_2, 8), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ringbuf_reserve), + /* store a pointer to the reserved memory in R6 */ + BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), + /* add invalid offset to memory or NULL */ + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 1), + /* check whether the reservation was successful */ + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), + /* should not be able to access *(R7) = 0 */ + BPF_ST_MEM(BPF_W, BPF_REG_6, 0, 0), + /* submit the reserved ringbuf memory */ + BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ringbuf_submit), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_ringbuf = { 1 }, + .result = REJECT, + .errstr = "R0 pointer arithmetic on mem_or_null prohibited", +}, { "check corrupted spill/fill", .insns = { From 62e4683849b6516c71e91f36e4fc0393a5883cfb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 3 Jan 2022 19:35:51 +0100 Subject: [PATCH 10/41] bpf, docs: Add a setion to explain the basic instruction encoding The eBPF instruction set document does not currently document the basic instruction encoding. Add a section to do that. Signed-off-by: Christoph Hellwig Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220103183556.41040-2-hch@lst.de --- Documentation/bpf/instruction-set.rst | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/Documentation/bpf/instruction-set.rst b/Documentation/bpf/instruction-set.rst index 1af51143ff9f..80f42984b594 100644 --- a/Documentation/bpf/instruction-set.rst +++ b/Documentation/bpf/instruction-set.rst @@ -19,8 +19,22 @@ The eBPF calling convention is defined as: R0 - R5 are scratch registers and eBPF programs needs to spill/fill them if necessary across calls. 
+Instruction encoding +==================== + +eBPF uses 64-bit instructions with the following encoding: + + ============= ======= =============== ==================== ============ + 32 bits (MSB) 16 bits 4 bits 4 bits 8 bits (LSB) + ============= ======= =============== ==================== ============ + immediate offset source register destination register opcode + ============= ======= =============== ==================== ============ + +Note that most instructions do not use all of the fields. +Unused fields shall be cleared to zero. + Instruction classes -=================== +------------------- The three LSB bits of the 'opcode' field store the instruction class: From be3193cded9d5c030be1713bf52d307427e88d19 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 3 Jan 2022 19:35:52 +0100 Subject: [PATCH 11/41] bpf, docs: Add subsections for ALU and JMP instructions Add a little more stucture to the ALU/JMP documentation with sections and improve the example text. Signed-off-by: Christoph Hellwig Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220103183556.41040-3-hch@lst.de --- Documentation/bpf/instruction-set.rst | 52 ++++++++++++++++----------- 1 file changed, 32 insertions(+), 20 deletions(-) diff --git a/Documentation/bpf/instruction-set.rst b/Documentation/bpf/instruction-set.rst index 80f42984b594..03bf3c6c5577 100644 --- a/Documentation/bpf/instruction-set.rst +++ b/Documentation/bpf/instruction-set.rst @@ -74,7 +74,13 @@ The 4th bit encodes the source operand: The four MSB bits store the operation code. -For class BPF_ALU or BPF_ALU64: + +Arithmetic instructions +----------------------- + +BPF_ALU uses 32-bit wide operands while BPF_ALU64 uses 64-bit wide operands for +otherwise identical operations. +The code field encodes the operation as below: ======== ===== ========================= code value description @@ -95,7 +101,29 @@ For class BPF_ALU or BPF_ALU64: BPF_END 0xd0 endianness conversion ======== ===== ========================= -For class BPF_JMP or BPF_JMP32: +BPF_ADD | BPF_X | BPF_ALU means:: + + dst_reg = (u32) dst_reg + (u32) src_reg; + +BPF_ADD | BPF_X | BPF_ALU64 means:: + + dst_reg = dst_reg + src_reg + +BPF_XOR | BPF_K | BPF_ALU means:: + + src_reg = (u32) src_reg ^ (u32) imm32 + +BPF_XOR | BPF_K | BPF_ALU64 means:: + + src_reg = src_reg ^ imm32 + + +Jump instructions +----------------- + +BPF_JMP32 uses 32-bit wide operands while BPF_JMP uses 64-bit wide operands for +otherwise identical operations. +The code field encodes the operation as below: ======== ===== ========================= code value description @@ -116,24 +144,8 @@ For class BPF_JMP or BPF_JMP32: BPF_JSLE 0xd0 signed '<=' ======== ===== ========================= -So BPF_ADD | BPF_X | BPF_ALU means:: - - dst_reg = (u32) dst_reg + (u32) src_reg; - -Similarly, BPF_XOR | BPF_K | BPF_ALU means:: - - src_reg = (u32) src_reg ^ (u32) imm32 - -eBPF is using BPF_MOV | BPF_X | BPF_ALU to represent A = B moves. BPF_ALU64 -is used to mean exactly the same operations as BPF_ALU, but with 64-bit wide -operands instead. So BPF_ADD | BPF_X | BPF_ALU64 means 64-bit addition, i.e.:: - - dst_reg = dst_reg + src_reg - -BPF_JMP | BPF_EXIT means function exit only. The eBPF program needs to store -the return value into register R0 before doing a BPF_EXIT. Class 6 is used as -BPF_JMP32 to mean exactly the same operations as BPF_JMP, but with 32-bit wide -operands for the comparisons instead. +The eBPF program needs to store the return value into register R0 before doing a +BPF_EXIT. 
Load and store instructions From 894cda554c3c3dc836f3cc873c47a465ba9433b4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 3 Jan 2022 19:35:53 +0100 Subject: [PATCH 12/41] bpf, docs: Document the opcode classes Add a description for each opcode class. Signed-off-by: Christoph Hellwig Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220103183556.41040-4-hch@lst.de --- Documentation/bpf/instruction-set.rst | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/Documentation/bpf/instruction-set.rst b/Documentation/bpf/instruction-set.rst index 03bf3c6c5577..2987cbb07f7f 100644 --- a/Documentation/bpf/instruction-set.rst +++ b/Documentation/bpf/instruction-set.rst @@ -38,18 +38,18 @@ Instruction classes The three LSB bits of the 'opcode' field store the instruction class: - ========= ===== - class value - ========= ===== - BPF_LD 0x00 - BPF_LDX 0x01 - BPF_ST 0x02 - BPF_STX 0x03 - BPF_ALU 0x04 - BPF_JMP 0x05 - BPF_JMP32 0x06 - BPF_ALU64 0x07 - ========= ===== + ========= ===== =============================== + class value description + ========= ===== =============================== + BPF_LD 0x00 non-standard load operations + BPF_LDX 0x01 load into register operations + BPF_ST 0x02 store from immediate operations + BPF_STX 0x03 store from register operations + BPF_ALU 0x04 32-bit arithmetic operations + BPF_JMP 0x05 64-bit jump operations + BPF_JMP32 0x06 32-bit jump operations + BPF_ALU64 0x07 64-bit arithmetic operations + ========= ===== =============================== Arithmetic and jump instructions ================================ From 03c517ee9eedd95472c36c6291fc97368b48c9e4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 3 Jan 2022 19:35:54 +0100 Subject: [PATCH 13/41] bpf, docs: Fully document the ALU opcodes Add pseudo-code to document all the different BPF_ALU / BPF_ALU64 opcodes. Signed-off-by: Christoph Hellwig Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220103183556.41040-5-hch@lst.de --- Documentation/bpf/instruction-set.rst | 30 +++++++++++++-------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/Documentation/bpf/instruction-set.rst b/Documentation/bpf/instruction-set.rst index 2987cbb07f7f..efba4d193185 100644 --- a/Documentation/bpf/instruction-set.rst +++ b/Documentation/bpf/instruction-set.rst @@ -82,24 +82,24 @@ BPF_ALU uses 32-bit wide operands while BPF_ALU64 uses 64-bit wide operands for otherwise identical operations. 
The code field encodes the operation as below: - ======== ===== ========================= + ======== ===== ========================== code value description - ======== ===== ========================= - BPF_ADD 0x00 - BPF_SUB 0x10 - BPF_MUL 0x20 - BPF_DIV 0x30 - BPF_OR 0x40 - BPF_AND 0x50 - BPF_LSH 0x60 - BPF_RSH 0x70 - BPF_NEG 0x80 - BPF_MOD 0x90 - BPF_XOR 0xa0 - BPF_MOV 0xb0 mov reg to reg + ======== ===== ========================== + BPF_ADD 0x00 dst += src + BPF_SUB 0x10 dst -= src + BPF_MUL 0x20 dst \*= src + BPF_DIV 0x30 dst /= src + BPF_OR 0x40 dst \|= src + BPF_AND 0x50 dst &= src + BPF_LSH 0x60 dst <<= src + BPF_RSH 0x70 dst >>= src + BPF_NEG 0x80 dst = ~src + BPF_MOD 0x90 dst %= src + BPF_XOR 0xa0 dst ^= src + BPF_MOV 0xb0 dst = src BPF_ARSH 0xc0 sign extending shift right BPF_END 0xd0 endianness conversion - ======== ===== ========================= + ======== ===== ========================== BPF_ADD | BPF_X | BPF_ALU means:: From 9e533e22b5700097e84b8a841d9e1c251cc132c6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 3 Jan 2022 19:35:55 +0100 Subject: [PATCH 14/41] bpf, docs: Fully document the JMP opcodes Add pseudo-code to document all the different BPF_JMP / BPF_JMP64 opcodes. Signed-off-by: Christoph Hellwig Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220103183556.41040-6-hch@lst.de --- Documentation/bpf/instruction-set.rst | 34 +++++++++++++-------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/Documentation/bpf/instruction-set.rst b/Documentation/bpf/instruction-set.rst index efba4d193185..88e8d6a9195c 100644 --- a/Documentation/bpf/instruction-set.rst +++ b/Documentation/bpf/instruction-set.rst @@ -125,24 +125,24 @@ BPF_JMP32 uses 32-bit wide operands while BPF_JMP uses 64-bit wide operands for otherwise identical operations. The code field encodes the operation as below: - ======== ===== ========================= - code value description - ======== ===== ========================= - BPF_JA 0x00 BPF_JMP only - BPF_JEQ 0x10 - BPF_JGT 0x20 - BPF_JGE 0x30 - BPF_JSET 0x40 - BPF_JNE 0x50 jump '!=' - BPF_JSGT 0x60 signed '>' - BPF_JSGE 0x70 signed '>=' + ======== ===== ========================= ============ + code value description notes + ======== ===== ========================= ============ + BPF_JA 0x00 PC += off BPF_JMP only + BPF_JEQ 0x10 PC += off if dst == src + BPF_JGT 0x20 PC += off if dst > src unsigned + BPF_JGE 0x30 PC += off if dst >= src unsigned + BPF_JSET 0x40 PC += off if dst & src + BPF_JNE 0x50 PC += off if dst != src + BPF_JSGT 0x60 PC += off if dst > src signed + BPF_JSGE 0x70 PC += off if dst >= src signed BPF_CALL 0x80 function call - BPF_EXIT 0x90 function return - BPF_JLT 0xa0 unsigned '<' - BPF_JLE 0xb0 unsigned '<=' - BPF_JSLT 0xc0 signed '<' - BPF_JSLE 0xd0 signed '<=' - ======== ===== ========================= + BPF_EXIT 0x90 function / program return BPF_JMP only + BPF_JLT 0xa0 PC += off if dst < src unsigned + BPF_JLE 0xb0 PC += off if dst <= src unsigned + BPF_JSLT 0xc0 PC += off if dst < src signed + BPF_JSLE 0xd0 PC += off if dst <= src signed + ======== ===== ========================= ============ The eBPF program needs to store the return value into register R0 before doing a BPF_EXIT. From 58d8a3fc4a40dcfebf333ab2dc2c7c338249be51 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 3 Jan 2022 19:35:56 +0100 Subject: [PATCH 15/41] bpf, docs: Fully document the JMP mode modifiers Add a description for all the modifiers. 
Signed-off-by: Christoph Hellwig Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220103183556.41040-7-hch@lst.de --- Documentation/bpf/instruction-set.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Documentation/bpf/instruction-set.rst b/Documentation/bpf/instruction-set.rst index 88e8d6a9195c..3704836fe6df 100644 --- a/Documentation/bpf/instruction-set.rst +++ b/Documentation/bpf/instruction-set.rst @@ -173,15 +173,15 @@ The size modifier is one of: The mode modifier is one of: - ============= ===== ===================== + ============= ===== ==================================== mode modifier value description - ============= ===== ===================== + ============= ===== ==================================== BPF_IMM 0x00 used for 64-bit mov - BPF_ABS 0x20 - BPF_IND 0x40 - BPF_MEM 0x60 + BPF_ABS 0x20 legacy BPF packet access + BPF_IND 0x40 legacy BPF packet access + BPF_MEM 0x60 all normal load and store operations BPF_ATOMIC 0xc0 atomic operations - ============= ===== ===================== + ============= ===== ==================================== BPF_MEM | | BPF_STX means:: From a5bebc4f00dee47113eed48098c68e88b5ba70e8 Mon Sep 17 00:00:00 2001 From: Kris Van Hees Date: Wed, 5 Jan 2022 16:01:50 -0500 Subject: [PATCH 16/41] bpf: Fix verifier support for validation of async callbacks Commit bfc6bb74e4f1 ("bpf: Implement verifier support for validation of async callbacks.") added support for BPF_FUNC_timer_set_callback to the __check_func_call() function. The test in __check_func_call() is flaweed because it can mis-interpret a regular BPF-to-BPF pseudo-call as a BPF_FUNC_timer_set_callback callback call. Consider the conditional in the code: if (insn->code == (BPF_JMP | BPF_CALL) && insn->imm == BPF_FUNC_timer_set_callback) { The BPF_FUNC_timer_set_callback has value 170. This means that if you have a BPF program that contains a pseudo-call with an instruction delta of 170, this conditional will be found to be true by the verifier, and it will interpret the pseudo-call as a callback. This leads to a mess with the verification of the program because it makes the wrong assumptions about the nature of this call. Solution: include an explicit check to ensure that insn->src_reg == 0. This ensures that calls cannot be mis-interpreted as an async callback call. Fixes: bfc6bb74e4f1 ("bpf: Implement verifier support for validation of async callbacks.") Signed-off-by: Kris Van Hees Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220105210150.GH1559@oracle.com --- kernel/bpf/verifier.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c8d9e761173b..bfb45381fb3f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6031,6 +6031,7 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn } if (insn->code == (BPF_JMP | BPF_CALL) && + insn->src_reg == 0 && insn->imm == BPF_FUNC_timer_set_callback) { struct bpf_verifier_state *async_cb; From 04c350b1ae6bdb12b84009a4d0bf5ab4e621c47b Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 4 Jan 2022 10:31:48 +0900 Subject: [PATCH 17/41] bpf: Fix SO_RCVBUF/SO_SNDBUF handling in _bpf_setsockopt(). The commit 4057765f2dee ("sock: consistent handling of extreme SO_SNDBUF/SO_RCVBUF values") added a change to prevent underflow in setsockopt() around SO_SNDBUF/SO_RCVBUF. This patch adds the same change to _bpf_setsockopt(). 
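For context, the clamped path is reached from BPF programs that call the bpf_setsockopt() helper. Below is a minimal sketch (illustrative, not part of the patch) of a sock_ops program doing so; the program name and the locally defined socket constants are assumptions made for the example.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

/* Values match asm-generic/socket.h; defined here because BPF programs
 * typically cannot include the libc socket headers.
 */
#define SOL_SOCKET	1
#define SO_RCVBUF	8

SEC("sockops")
int set_rcvbuf(struct bpf_sock_ops *skops)
{
	int val = 0x7fffffff;	/* INT_MAX: without the clamp, val * 2 underflows */

	if (skops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB)
		bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
	return 1;
}

char LICENSE[] SEC("license") = "GPL";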
Fixes: 4057765f2dee ("sock: consistent handling of extreme SO_SNDBUF/SO_RCVBUF values") Signed-off-by: Kuniyuki Iwashima Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220104013153.97906-2-kuniyu@amazon.co.jp --- net/core/filter.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/core/filter.c b/net/core/filter.c index 606ab5a98a1a..368fe28c8dc6 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4741,12 +4741,14 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, switch (optname) { case SO_RCVBUF: val = min_t(u32, val, sysctl_rmem_max); + val = min_t(int, val, INT_MAX / 2); sk->sk_userlocks |= SOCK_RCVBUF_LOCK; WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF)); break; case SO_SNDBUF: val = min_t(u32, val, sysctl_wmem_max); + val = min_t(int, val, INT_MAX / 2); sk->sk_userlocks |= SOCK_SNDBUF_LOCK; WRITE_ONCE(sk->sk_sndbuf, max_t(int, val * 2, SOCK_MIN_SNDBUF)); From 28479934f26bcf9ddeb94125e05ddc5c4312b1f3 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 4 Jan 2022 10:31:49 +0900 Subject: [PATCH 18/41] bpf: Add SO_RCVBUF/SO_SNDBUF in _bpf_getsockopt(). This patch exposes SO_RCVBUF/SO_SNDBUF through bpf_getsockopt(). Signed-off-by: Kuniyuki Iwashima Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220104013153.97906-3-kuniyu@amazon.co.jp --- net/core/filter.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/core/filter.c b/net/core/filter.c index 368fe28c8dc6..cac2be559ab0 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4969,6 +4969,12 @@ static int _bpf_getsockopt(struct sock *sk, int level, int optname, goto err_clear; switch (optname) { + case SO_RCVBUF: + *((int *)optval) = sk->sk_rcvbuf; + break; + case SO_SNDBUF: + *((int *)optval) = sk->sk_sndbuf; + break; case SO_MARK: *((int *)optval) = sk->sk_mark; break; From 7218c28c87f57c131879a75a226b9033ac90b266 Mon Sep 17 00:00:00 2001 From: Christy Lee Date: Wed, 29 Dec 2021 12:41:56 -0800 Subject: [PATCH 19/41] libbpf: Deprecate bpf_perf_event_read_simple() API With perf_buffer__poll() and perf_buffer__consume() APIs available, there is no reason to expose bpf_perf_event_read_simple() API to users. If users need custom perf buffer, they could re-implement the function. Mark bpf_perf_event_read_simple() and move the logic to a new static function so it can still be called by other functions in the same file. 
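For callers that used bpf_perf_event_read_simple() to drain a PERF_EVENT_ARRAY themselves, the replacement path looks roughly like the sketch below. It assumes the libbpf 1.0 signature of perf_buffer__new() (older releases passed the callbacks through perf_buffer_opts instead), and the helper name consume_events() is made up for the example.

#include <errno.h>
#include <stdio.h>
#include <bpf/libbpf.h>

static void on_sample(void *ctx, int cpu, void *data, __u32 size)
{
	printf("cpu %d: %u byte record\n", cpu, size);
}

static int consume_events(int perf_map_fd)
{
	struct perf_buffer *pb;
	int err;

	/* 8 pages of ring buffer per CPU; on_sample() runs from poll */
	pb = perf_buffer__new(perf_map_fd, 8, on_sample, NULL, NULL, NULL);
	if (!pb)
		return -errno;

	/* poll until an error occurs; poll returns the number of records handled */
	while ((err = perf_buffer__poll(pb, 100 /* ms */)) >= 0)
		;

	perf_buffer__free(pb);
	return err;
}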
[0] Closes: https://github.com/libbpf/libbpf/issues/310 Signed-off-by: Christy Lee Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211229204156.13569-1-christylee@fb.com --- tools/lib/bpf/libbpf.c | 22 ++++++++++++++-------- tools/lib/bpf/libbpf.h | 1 + 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 9cb99d1e2385..1d02ba7f11b4 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -10676,10 +10676,10 @@ struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_map *map) return link; } -enum bpf_perf_event_ret -bpf_perf_event_read_simple(void *mmap_mem, size_t mmap_size, size_t page_size, - void **copy_mem, size_t *copy_size, - bpf_perf_event_print_t fn, void *private_data) +static enum bpf_perf_event_ret +perf_event_read_simple(void *mmap_mem, size_t mmap_size, size_t page_size, + void **copy_mem, size_t *copy_size, + bpf_perf_event_print_t fn, void *private_data) { struct perf_event_mmap_page *header = mmap_mem; __u64 data_head = ring_buffer_read_head(header); @@ -10724,6 +10724,12 @@ bpf_perf_event_read_simple(void *mmap_mem, size_t mmap_size, size_t page_size, return libbpf_err(ret); } +__attribute__((alias("perf_event_read_simple"))) +enum bpf_perf_event_ret +bpf_perf_event_read_simple(void *mmap_mem, size_t mmap_size, size_t page_size, + void **copy_mem, size_t *copy_size, + bpf_perf_event_print_t fn, void *private_data); + struct perf_buffer; struct perf_buffer_params { @@ -11132,10 +11138,10 @@ static int perf_buffer__process_records(struct perf_buffer *pb, { enum bpf_perf_event_ret ret; - ret = bpf_perf_event_read_simple(cpu_buf->base, pb->mmap_size, - pb->page_size, &cpu_buf->buf, - &cpu_buf->buf_size, - perf_buffer__process_record, cpu_buf); + ret = perf_event_read_simple(cpu_buf->base, pb->mmap_size, + pb->page_size, &cpu_buf->buf, + &cpu_buf->buf_size, + perf_buffer__process_record, cpu_buf); if (ret != LIBBPF_PERF_EVENT_CONT) return ret; return 0; diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 85dfef88b3d2..ddf1cc9e7803 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -1026,6 +1026,7 @@ LIBBPF_API int perf_buffer__buffer_fd(const struct perf_buffer *pb, size_t buf_i typedef enum bpf_perf_event_ret (*bpf_perf_event_print_t)(struct perf_event_header *hdr, void *private_data); +LIBBPF_DEPRECATED_SINCE(0, 8, "use perf_buffer__poll() or perf_buffer__consume() instead") LIBBPF_API enum bpf_perf_event_ret bpf_perf_event_read_simple(void *mmap_mem, size_t mmap_size, size_t page_size, void **copy_mem, size_t *copy_size, From 71cff670baff5cc6a6eeb0181e2cc55579c5e1e0 Mon Sep 17 00:00:00 2001 From: Qiang Wang Date: Mon, 27 Dec 2021 21:07:12 +0800 Subject: [PATCH 20/41] libbpf: Use probe_name for legacy kprobe Fix a bug in commit 46ed5fc33db9, which wrongly used the func_name instead of probe_name to register legacy kprobe. 
Fixes: 46ed5fc33db9 ("libbpf: Refactor and simplify legacy kprobe code") Co-developed-by: Chengming Zhou Signed-off-by: Qiang Wang Signed-off-by: Chengming Zhou Signed-off-by: Andrii Nakryiko Tested-by: Hengqi Chen Reviewed-by: Hengqi Chen Link: https://lore.kernel.org/bpf/20211227130713.66933-1-wangqiang.wq.frank@bytedance.com --- tools/lib/bpf/libbpf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 1d02ba7f11b4..26e49e6aa5b1 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -10017,7 +10017,7 @@ bpf_program__attach_kprobe_opts(const struct bpf_program *prog, gen_kprobe_legacy_event_name(probe_name, sizeof(probe_name), func_name, offset); - legacy_probe = strdup(func_name); + legacy_probe = strdup(probe_name); if (!legacy_probe) return libbpf_err_ptr(-ENOMEM); From 51a33c60f1c22c0d2dafad774315ba1537765442 Mon Sep 17 00:00:00 2001 From: Qiang Wang Date: Mon, 27 Dec 2021 21:07:13 +0800 Subject: [PATCH 21/41] libbpf: Support repeated legacy kprobes on same function If repeated legacy kprobes on same function in one process, libbpf will register using the same probe name and got -EBUSY error. So append index to the probe name format to fix this problem. Co-developed-by: Chengming Zhou Signed-off-by: Qiang Wang Signed-off-by: Chengming Zhou Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211227130713.66933-2-wangqiang.wq.frank@bytedance.com --- tools/lib/bpf/libbpf.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 26e49e6aa5b1..7f10dd501a52 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -9916,7 +9916,10 @@ static int append_to_file(const char *file, const char *fmt, ...) static void gen_kprobe_legacy_event_name(char *buf, size_t buf_sz, const char *kfunc_name, size_t offset) { - snprintf(buf, buf_sz, "libbpf_%u_%s_0x%zx", getpid(), kfunc_name, offset); + static int index = 0; + + snprintf(buf, buf_sz, "libbpf_%u_%s_0x%zx_%d", getpid(), kfunc_name, offset, + __sync_fetch_and_add(&index, 1)); } static int add_kprobe_event_legacy(const char *probe_name, bool retprobe, From 9855c131b9c8b0327ff5182f88bb1991f212415b Mon Sep 17 00:00:00 2001 From: Christy Lee Date: Tue, 4 Jan 2022 16:06:01 -0800 Subject: [PATCH 22/41] libbpf 1.0: Deprecate bpf_map__is_offload_neutral() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Deprecate bpf_map__is_offload_neutral(). It’s most probably broken already. PERF_EVENT_ARRAY isn’t the only map that’s not suitable for hardware offloading. Applications can directly check map type instead. 
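The direct check that replaces the deprecated helper is short; the bpftool hunk below does exactly this, and a standalone sketch (the function name set_offload_ifindex() is made up for the example) would look like:

#include <bpf/libbpf.h>

/* Mark every offloadable map with the target ifindex; perf event arrays
 * stay on the host, without relying on bpf_map__is_offload_neutral().
 */
static void set_offload_ifindex(struct bpf_object *obj, __u32 ifindex)
{
	struct bpf_map *map;

	bpf_object__for_each_map(map, obj) {
		if (bpf_map__type(map) != BPF_MAP_TYPE_PERF_EVENT_ARRAY)
			bpf_map__set_ifindex(map, ifindex);
	}
}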
[0] Closes: https://github.com/libbpf/libbpf/issues/306 Signed-off-by: Christy Lee Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220105000601.2090044-1-christylee@fb.com --- tools/bpf/bpftool/prog.c | 2 +- tools/lib/bpf/libbpf.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index f874896c4154..2a21d50516bc 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -1655,7 +1655,7 @@ static int load_with_options(int argc, char **argv, bool first_prog_only) j = 0; idx = 0; bpf_object__for_each_map(map, obj) { - if (!bpf_map__is_offload_neutral(map)) + if (bpf_map__type(map) != BPF_MAP_TYPE_PERF_EVENT_ARRAY) bpf_map__set_ifindex(map, ifindex); if (j < old_map_fds && idx == map_replace[j].idx) { diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index ddf1cc9e7803..88dd943ba545 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -744,6 +744,7 @@ LIBBPF_API void *bpf_map__priv(const struct bpf_map *map); LIBBPF_API int bpf_map__set_initial_value(struct bpf_map *map, const void *data, size_t size); LIBBPF_API const void *bpf_map__initial_value(struct bpf_map *map, size_t *psize); +LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_map__type() instead") LIBBPF_API bool bpf_map__is_offload_neutral(const struct bpf_map *map); /** From 5f6082642814050352a3e29f8713796b55ebf788 Mon Sep 17 00:00:00 2001 From: Christy Lee Date: Tue, 4 Jan 2022 16:31:20 -0800 Subject: [PATCH 23/41] libbpf 1.0: Deprecate bpf_object__find_map_by_offset() API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit API created with simplistic assumptions about BPF map definitions. It hasn’t worked for a while, deprecate it in preparation for libbpf 1.0. [0] Closes: https://github.com/libbpf/libbpf/issues/302 Signed-off-by: Christy Lee Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220105003120.2222673-1-christylee@fb.com --- tools/lib/bpf/libbpf.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 88dd943ba545..8b9bc5e90c2b 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -677,7 +677,8 @@ bpf_object__find_map_fd_by_name(const struct bpf_object *obj, const char *name); * Get bpf_map through the offset of corresponding struct bpf_map_def * in the BPF object file. */ -LIBBPF_API struct bpf_map * +LIBBPF_API LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_object__find_map_by_name() instead") +struct bpf_map * bpf_object__find_map_by_offset(struct bpf_object *obj, size_t offset); LIBBPF_API LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_object__next_map() instead") From 2741a0493c04067d7acb0e44035aa27618b7d204 Mon Sep 17 00:00:00 2001 From: Ong Boon Leong Date: Thu, 30 Dec 2021 11:54:41 +0800 Subject: [PATCH 24/41] samples/bpf: xdpsock: Add VLAN support for Tx-only operation In multi-queue environment testing, the support for VLAN-tag based steering is useful. So, this patch adds the capability to add VLAN tag (VLAN ID and Priority) to the generated Tx frame. To set the VLAN ID=10 and Priority=2 for Tx only through TxQ=3: $ xdpsock -i eth0 -t -N -z -q 3 -V -J 10 -K 2 If VLAN ID (-J) and Priority (-K) is set, it default to VLAN ID = 1 VLAN Priority = 0. 
For example, VLAN-tagged Tx only, xdp copy mode through TxQ=1: $ xdpsock -i eth0 -t -N -c -q 1 -V Signed-off-by: Ong Boon Leong Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20211230035447.523177-2-boon.leong.ong@intel.com --- samples/bpf/xdpsock_user.c | 88 ++++++++++++++++++++++++++++++++------ 1 file changed, 74 insertions(+), 14 deletions(-) diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index 616d663d55aa..d5e298ccf491 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -56,6 +56,12 @@ #define DEBUG_HEXDUMP 0 +#define VLAN_PRIO_MASK 0xe000 /* Priority Code Point */ +#define VLAN_PRIO_SHIFT 13 +#define VLAN_VID_MASK 0x0fff /* VLAN Identifier */ +#define VLAN_VID__DEFAULT 1 +#define VLAN_PRI__DEFAULT 0 + typedef __u64 u64; typedef __u32 u32; typedef __u16 u16; @@ -81,6 +87,9 @@ static u32 opt_batch_size = 64; static int opt_pkt_count; static u16 opt_pkt_size = MIN_PKT_SIZE; static u32 opt_pkt_fill_pattern = 0x12345678; +static bool opt_vlan_tag; +static u16 opt_pkt_vlan_id = VLAN_VID__DEFAULT; +static u16 opt_pkt_vlan_pri = VLAN_PRI__DEFAULT; static bool opt_extra_stats; static bool opt_quiet; static bool opt_app_stats; @@ -101,6 +110,14 @@ static u32 prog_id; static bool opt_busy_poll; static bool opt_reduced_cap; +struct vlan_ethhdr { + unsigned char h_dest[6]; + unsigned char h_source[6]; + __be16 h_vlan_proto; + __be16 h_vlan_TCI; + __be16 h_vlan_encapsulated_proto; +}; + struct xsk_ring_stats { unsigned long rx_npkts; unsigned long tx_npkts; @@ -740,11 +757,13 @@ static inline u16 udp_csum(u32 saddr, u32 daddr, u32 len, #define ETH_FCS_SIZE 4 -#define PKT_HDR_SIZE (sizeof(struct ethhdr) + sizeof(struct iphdr) + \ +#define ETH_HDR_SIZE (opt_vlan_tag ? sizeof(struct vlan_ethhdr) : \ + sizeof(struct ethhdr)) +#define PKT_HDR_SIZE (ETH_HDR_SIZE + sizeof(struct iphdr) + \ sizeof(struct udphdr)) #define PKT_SIZE (opt_pkt_size - ETH_FCS_SIZE) -#define IP_PKT_SIZE (PKT_SIZE - sizeof(struct ethhdr)) +#define IP_PKT_SIZE (PKT_SIZE - ETH_HDR_SIZE) #define UDP_PKT_SIZE (IP_PKT_SIZE - sizeof(struct iphdr)) #define UDP_PKT_DATA_SIZE (UDP_PKT_SIZE - sizeof(struct udphdr)) @@ -752,17 +771,42 @@ static u8 pkt_data[XSK_UMEM__DEFAULT_FRAME_SIZE]; static void gen_eth_hdr_data(void) { - struct udphdr *udp_hdr = (struct udphdr *)(pkt_data + - sizeof(struct ethhdr) + - sizeof(struct iphdr)); - struct iphdr *ip_hdr = (struct iphdr *)(pkt_data + - sizeof(struct ethhdr)); - struct ethhdr *eth_hdr = (struct ethhdr *)pkt_data; + struct udphdr *udp_hdr; + struct iphdr *ip_hdr; + + if (opt_vlan_tag) { + struct vlan_ethhdr *veth_hdr = (struct vlan_ethhdr *)pkt_data; + u16 vlan_tci = 0; + + udp_hdr = (struct udphdr *)(pkt_data + + sizeof(struct vlan_ethhdr) + + sizeof(struct iphdr)); + ip_hdr = (struct iphdr *)(pkt_data + + sizeof(struct vlan_ethhdr)); + + /* ethernet & VLAN header */ + memcpy(veth_hdr->h_dest, "\x3c\xfd\xfe\x9e\x7f\x71", ETH_ALEN); + memcpy(veth_hdr->h_source, "\xec\xb1\xd7\x98\x3a\xc0", ETH_ALEN); + veth_hdr->h_vlan_proto = htons(ETH_P_8021Q); + vlan_tci = opt_pkt_vlan_id & VLAN_VID_MASK; + vlan_tci |= (opt_pkt_vlan_pri << VLAN_PRIO_SHIFT) & VLAN_PRIO_MASK; + veth_hdr->h_vlan_TCI = htons(vlan_tci); + veth_hdr->h_vlan_encapsulated_proto = htons(ETH_P_IP); + } else { + struct ethhdr *eth_hdr = (struct ethhdr *)pkt_data; + + udp_hdr = (struct udphdr *)(pkt_data + + sizeof(struct ethhdr) + + sizeof(struct iphdr)); + ip_hdr = (struct iphdr *)(pkt_data + + sizeof(struct ethhdr)); + + /* ethernet header */ + 
memcpy(eth_hdr->h_dest, "\x3c\xfd\xfe\x9e\x7f\x71", ETH_ALEN); + memcpy(eth_hdr->h_source, "\xec\xb1\xd7\x98\x3a\xc0", ETH_ALEN); + eth_hdr->h_proto = htons(ETH_P_IP); + } - /* ethernet header */ - memcpy(eth_hdr->h_dest, "\x3c\xfd\xfe\x9e\x7f\x71", ETH_ALEN); - memcpy(eth_hdr->h_source, "\xec\xb1\xd7\x98\x3a\xc0", ETH_ALEN); - eth_hdr->h_proto = htons(ETH_P_IP); /* IP header */ ip_hdr->version = IPVERSION; @@ -920,6 +964,9 @@ static struct option long_options[] = { {"tx-pkt-count", required_argument, 0, 'C'}, {"tx-pkt-size", required_argument, 0, 's'}, {"tx-pkt-pattern", required_argument, 0, 'P'}, + {"tx-vlan", no_argument, 0, 'V'}, + {"tx-vlan-id", required_argument, 0, 'J'}, + {"tx-vlan-pri", required_argument, 0, 'K'}, {"extra-stats", no_argument, 0, 'x'}, {"quiet", no_argument, 0, 'Q'}, {"app-stats", no_argument, 0, 'a'}, @@ -960,6 +1007,9 @@ static void usage(const char *prog) " (Default: %d bytes)\n" " Min size: %d, Max size %d.\n" " -P, --tx-pkt-pattern=nPacket fill pattern. Default: 0x%x\n" + " -V, --tx-vlan Send VLAN tagged packets (For -t|--txonly)\n" + " -J, --tx-vlan-id=n Tx VLAN ID [1-4095]. Default: %d (For -V|--tx-vlan)\n" + " -K, --tx-vlan-pri=n Tx VLAN Priority [0-7]. Default: %d (For -V|--tx-vlan)\n" " -x, --extra-stats Display extra statistics.\n" " -Q, --quiet Do not display any stats.\n" " -a, --app-stats Display application (syscall) statistics.\n" @@ -969,7 +1019,8 @@ static void usage(const char *prog) "\n"; fprintf(stderr, str, prog, XSK_UMEM__DEFAULT_FRAME_SIZE, opt_batch_size, MIN_PKT_SIZE, MIN_PKT_SIZE, - XSK_UMEM__DEFAULT_FRAME_SIZE, opt_pkt_fill_pattern); + XSK_UMEM__DEFAULT_FRAME_SIZE, opt_pkt_fill_pattern, + VLAN_VID__DEFAULT, VLAN_PRI__DEFAULT); exit(EXIT_FAILURE); } @@ -981,7 +1032,7 @@ static void parse_command_line(int argc, char **argv) opterr = 0; for (;;) { - c = getopt_long(argc, argv, "Frtli:q:pSNn:czf:muMd:b:C:s:P:xQaI:BR", + c = getopt_long(argc, argv, "Frtli:q:pSNn:czf:muMd:b:C:s:P:VJ:K:xQaI:BR", long_options, &option_index); if (c == -1) break; @@ -1062,6 +1113,15 @@ static void parse_command_line(int argc, char **argv) case 'P': opt_pkt_fill_pattern = strtol(optarg, NULL, 16); break; + case 'V': + opt_vlan_tag = true; + break; + case 'J': + opt_pkt_vlan_id = atoi(optarg); + break; + case 'K': + opt_pkt_vlan_pri = atoi(optarg); + break; case 'x': opt_extra_stats = 1; break; From 6440a6c23f6c72c57dbdf7928d92d3fc1aef6edc Mon Sep 17 00:00:00 2001 From: Ong Boon Leong Date: Thu, 30 Dec 2021 11:54:42 +0800 Subject: [PATCH 25/41] samples/bpf: xdpsock: Add Dest and Src MAC setting for Tx-only operation To set Dest MAC address (-G|--tx-dmac) only: $ xdpsock -i eth0 -t -N -z -G aa:bb:cc:dd:ee:ff To set Source MAC address (-H|--tx-smac) only: $ xdpsock -i eth0 -t -N -z -H 11:22:33:44:55:66 To set both Dest and Source MAC address: $ xdpsock -i eth0 -t -N -z -G aa:bb:cc:dd:ee:ff \ -H 11:22:33:44:55:66 The default Dest and Source MAC address remain the same as before. 
Signed-off-by: Ong Boon Leong Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/bpf/20211230035447.523177-3-boon.leong.ong@intel.com --- samples/bpf/xdpsock_user.c | 35 ++++++++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index d5e298ccf491..c9a8748a460f 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -90,6 +91,10 @@ static u32 opt_pkt_fill_pattern = 0x12345678; static bool opt_vlan_tag; static u16 opt_pkt_vlan_id = VLAN_VID__DEFAULT; static u16 opt_pkt_vlan_pri = VLAN_PRI__DEFAULT; +static struct ether_addr opt_txdmac = {{ 0x3c, 0xfd, 0xfe, + 0x9e, 0x7f, 0x71 }}; +static struct ether_addr opt_txsmac = {{ 0xec, 0xb1, 0xd7, + 0x98, 0x3a, 0xc0 }}; static bool opt_extra_stats; static bool opt_quiet; static bool opt_app_stats; @@ -785,8 +790,8 @@ static void gen_eth_hdr_data(void) sizeof(struct vlan_ethhdr)); /* ethernet & VLAN header */ - memcpy(veth_hdr->h_dest, "\x3c\xfd\xfe\x9e\x7f\x71", ETH_ALEN); - memcpy(veth_hdr->h_source, "\xec\xb1\xd7\x98\x3a\xc0", ETH_ALEN); + memcpy(veth_hdr->h_dest, &opt_txdmac, ETH_ALEN); + memcpy(veth_hdr->h_source, &opt_txsmac, ETH_ALEN); veth_hdr->h_vlan_proto = htons(ETH_P_8021Q); vlan_tci = opt_pkt_vlan_id & VLAN_VID_MASK; vlan_tci |= (opt_pkt_vlan_pri << VLAN_PRIO_SHIFT) & VLAN_PRIO_MASK; @@ -802,8 +807,8 @@ static void gen_eth_hdr_data(void) sizeof(struct ethhdr)); /* ethernet header */ - memcpy(eth_hdr->h_dest, "\x3c\xfd\xfe\x9e\x7f\x71", ETH_ALEN); - memcpy(eth_hdr->h_source, "\xec\xb1\xd7\x98\x3a\xc0", ETH_ALEN); + memcpy(eth_hdr->h_dest, &opt_txdmac, ETH_ALEN); + memcpy(eth_hdr->h_source, &opt_txsmac, ETH_ALEN); eth_hdr->h_proto = htons(ETH_P_IP); } @@ -967,6 +972,8 @@ static struct option long_options[] = { {"tx-vlan", no_argument, 0, 'V'}, {"tx-vlan-id", required_argument, 0, 'J'}, {"tx-vlan-pri", required_argument, 0, 'K'}, + {"tx-dmac", required_argument, 0, 'G'}, + {"tx-smac", required_argument, 0, 'H'}, {"extra-stats", no_argument, 0, 'x'}, {"quiet", no_argument, 0, 'Q'}, {"app-stats", no_argument, 0, 'a'}, @@ -1010,6 +1017,8 @@ static void usage(const char *prog) " -V, --tx-vlan Send VLAN tagged packets (For -t|--txonly)\n" " -J, --tx-vlan-id=n Tx VLAN ID [1-4095]. Default: %d (For -V|--tx-vlan)\n" " -K, --tx-vlan-pri=n Tx VLAN Priority [0-7]. 
Default: %d (For -V|--tx-vlan)\n" + " -G, --tx-dmac= Dest MAC addr of TX frame in aa:bb:cc:dd:ee:ff format (For -V|--tx-vlan)\n" + " -H, --tx-smac= Src MAC addr of TX frame in aa:bb:cc:dd:ee:ff format (For -V|--tx-vlan)\n" " -x, --extra-stats Display extra statistics.\n" " -Q, --quiet Do not display any stats.\n" " -a, --app-stats Display application (syscall) statistics.\n" @@ -1032,7 +1041,7 @@ static void parse_command_line(int argc, char **argv) opterr = 0; for (;;) { - c = getopt_long(argc, argv, "Frtli:q:pSNn:czf:muMd:b:C:s:P:VJ:K:xQaI:BR", + c = getopt_long(argc, argv, "Frtli:q:pSNn:czf:muMd:b:C:s:P:VJ:K:G:H:xQaI:BR", long_options, &option_index); if (c == -1) break; @@ -1122,6 +1131,22 @@ static void parse_command_line(int argc, char **argv) case 'K': opt_pkt_vlan_pri = atoi(optarg); break; + case 'G': + if (!ether_aton_r(optarg, + (struct ether_addr *)&opt_txdmac)) { + fprintf(stderr, "Invalid dmac address:%s\n", + optarg); + usage(basename(argv[0])); + } + break; + case 'H': + if (!ether_aton_r(optarg, + (struct ether_addr *)&opt_txsmac)) { + fprintf(stderr, "Invalid smac address:%s\n", + optarg); + usage(basename(argv[0])); + } + break; case 'x': opt_extra_stats = 1; break; From 5a3882542acda1ac5f0a22dddf7f7f8533d3a8cc Mon Sep 17 00:00:00 2001 From: Ong Boon Leong Date: Thu, 30 Dec 2021 11:54:43 +0800 Subject: [PATCH 26/41] samples/bpf: xdpsock: Add clockid selection support User specifies the clock selection by using -w CLOCK or --clock=CLOCK where CLOCK=[REALTIME, TAI, BOOTTIME, MONOTONIC]. The default CLOCK selection is MONOTONIC. The implementation of clock selection parsing is borrowed from iproute2/tc/q_taprio.c Signed-off-by: Ong Boon Leong Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211230035447.523177-4-boon.leong.ong@intel.com --- samples/bpf/xdpsock_user.c | 40 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index c9a8748a460f..e6e9a20375cb 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -114,6 +114,7 @@ static u32 opt_num_xsks = 1; static u32 prog_id; static bool opt_busy_poll; static bool opt_reduced_cap; +static clockid_t opt_clock = CLOCK_MONOTONIC; struct vlan_ethhdr { unsigned char h_dest[6]; @@ -178,15 +179,40 @@ struct xsk_socket_info { u32 outstanding_tx; }; +static const struct clockid_map { + const char *name; + clockid_t clockid; +} clockids_map[] = { + { "REALTIME", CLOCK_REALTIME }, + { "TAI", CLOCK_TAI }, + { "BOOTTIME", CLOCK_BOOTTIME }, + { "MONOTONIC", CLOCK_MONOTONIC }, + { NULL } +}; + static int num_socks; struct xsk_socket_info *xsks[MAX_SOCKS]; int sock; +static int get_clockid(clockid_t *id, const char *name) +{ + const struct clockid_map *clk; + + for (clk = clockids_map; clk->name; clk++) { + if (strcasecmp(clk->name, name) == 0) { + *id = clk->clockid; + return 0; + } + } + + return -1; +} + static unsigned long get_nsecs(void) { struct timespec ts; - clock_gettime(CLOCK_MONOTONIC, &ts); + clock_gettime(opt_clock, &ts); return ts.tv_sec * 1000000000UL + ts.tv_nsec; } @@ -965,6 +991,7 @@ static struct option long_options[] = { {"shared-umem", no_argument, 0, 'M'}, {"force", no_argument, 0, 'F'}, {"duration", required_argument, 0, 'd'}, + {"clock", required_argument, 0, 'w'}, {"batch-size", required_argument, 0, 'b'}, {"tx-pkt-count", required_argument, 0, 'C'}, {"tx-pkt-size", required_argument, 0, 's'}, @@ -1006,6 +1033,7 @@ static void usage(const char *prog) " -F, --force Force 
loading the XDP prog\n" " -d, --duration=n Duration in secs to run command.\n" " Default: forever.\n" + " -w, --clock=CLOCK Clock NAME (default MONOTONIC).\n" " -b, --batch-size=n Batch size for sending or receiving\n" " packets. Default: %d\n" " -C, --tx-pkt-count=n Number of packets to send.\n" @@ -1041,7 +1069,7 @@ static void parse_command_line(int argc, char **argv) opterr = 0; for (;;) { - c = getopt_long(argc, argv, "Frtli:q:pSNn:czf:muMd:b:C:s:P:VJ:K:G:H:xQaI:BR", + c = getopt_long(argc, argv, "Frtli:q:pSNn:w:czf:muMd:b:C:s:P:VJ:K:G:H:xQaI:BR", long_options, &option_index); if (c == -1) break; @@ -1075,6 +1103,14 @@ static void parse_command_line(int argc, char **argv) case 'n': opt_interval = atoi(optarg); break; + case 'w': + if (get_clockid(&opt_clock, optarg)) { + fprintf(stderr, + "ERROR: Invalid clock %s. Default to CLOCK_MONOTONIC.\n", + optarg); + opt_clock = CLOCK_MONOTONIC; + } + break; case 'z': opt_xdp_bind_flags |= XDP_ZEROCOPY; break; From fa0d27a1d5a8c1f07b0229348b0d178233694fbc Mon Sep 17 00:00:00 2001 From: Ong Boon Leong Date: Thu, 30 Dec 2021 11:54:44 +0800 Subject: [PATCH 27/41] samples/bpf: xdpsock: Add cyclic TX operation capability Tx cycle time is in micro-seconds unit. By combining the batch size (-b M) and Tx cycle time (-T|--tx-cycle N), xdpsock now can transmit batch-size of packets every N-us periodically. Cyclic TX operation is not applicable if --poll mode is used. To transmit 16 packets every 1ms cycle time for total of 100000 packets silently: $ xdpsock -i eth0 -T -N -z -T 1000 -b 16 -C 100000 To print cyclic TX schedule variance stats, use --app-stats|-a: $ xdpsock -i eth0 -T -N -z -T 1000 -b 16 -C 100000 -a sock0@eth0:0 txonly xdp-drv pps pkts 0.00 rx 0 0 tx 0 100000 calls/s count rx empty polls 0 0 fill fail polls 0 0 copy tx sendtos 0 0 tx wakeup sendtos 0 6254 opt polls 0 0 period min ave max cycle Cyclic TX 1000000 53507 75334 712642 6250 Signed-off-by: Ong Boon Leong Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211230035447.523177-5-boon.leong.ong@intel.com --- samples/bpf/xdpsock_user.c | 87 +++++++++++++++++++++++++++++++++++--- 1 file changed, 81 insertions(+), 6 deletions(-) diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index e6e9a20375cb..a2a42ec4b0e9 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -63,12 +63,19 @@ #define VLAN_VID__DEFAULT 1 #define VLAN_PRI__DEFAULT 0 +#define NSEC_PER_SEC 1000000000UL +#define NSEC_PER_USEC 1000 + typedef __u64 u64; typedef __u32 u32; typedef __u16 u16; typedef __u8 u8; static unsigned long prev_time; +static long tx_cycle_diff_min; +static long tx_cycle_diff_max; +static double tx_cycle_diff_ave; +static long tx_cycle_cnt; enum benchmark_type { BENCH_RXDROP = 0, @@ -115,6 +122,7 @@ static u32 prog_id; static bool opt_busy_poll; static bool opt_reduced_cap; static clockid_t opt_clock = CLOCK_MONOTONIC; +static unsigned long opt_tx_cycle_ns; struct vlan_ethhdr { unsigned char h_dest[6]; @@ -305,6 +313,15 @@ static void dump_app_stats(long dt) xsks[i]->app_stats.prev_tx_wakeup_sendtos = xsks[i]->app_stats.tx_wakeup_sendtos; xsks[i]->app_stats.prev_opt_polls = xsks[i]->app_stats.opt_polls; } + + if (opt_tx_cycle_ns) { + printf("\n%-18s %-10s %-10s %-10s %-10s %-10s\n", + "", "period", "min", "ave", "max", "cycle"); + printf("%-18s %-10lu %-10lu %-10lu %-10lu %-10lu\n", + "Cyclic TX", opt_tx_cycle_ns, tx_cycle_diff_min, + (long)(tx_cycle_diff_ave / tx_cycle_cnt), + tx_cycle_diff_max, tx_cycle_cnt); + } } static bool 
get_interrupt_number(void) @@ -1001,6 +1018,7 @@ static struct option long_options[] = { {"tx-vlan-pri", required_argument, 0, 'K'}, {"tx-dmac", required_argument, 0, 'G'}, {"tx-smac", required_argument, 0, 'H'}, + {"tx-cycle", required_argument, 0, 'T'}, {"extra-stats", no_argument, 0, 'x'}, {"quiet", no_argument, 0, 'Q'}, {"app-stats", no_argument, 0, 'a'}, @@ -1047,6 +1065,7 @@ static void usage(const char *prog) " -K, --tx-vlan-pri=n Tx VLAN Priority [0-7]. Default: %d (For -V|--tx-vlan)\n" " -G, --tx-dmac= Dest MAC addr of TX frame in aa:bb:cc:dd:ee:ff format (For -V|--tx-vlan)\n" " -H, --tx-smac= Src MAC addr of TX frame in aa:bb:cc:dd:ee:ff format (For -V|--tx-vlan)\n" + " -T, --tx-cycle=n Tx cycle time in micro-seconds (For -t|--txonly).\n" " -x, --extra-stats Display extra statistics.\n" " -Q, --quiet Do not display any stats.\n" " -a, --app-stats Display application (syscall) statistics.\n" @@ -1069,7 +1088,7 @@ static void parse_command_line(int argc, char **argv) opterr = 0; for (;;) { - c = getopt_long(argc, argv, "Frtli:q:pSNn:w:czf:muMd:b:C:s:P:VJ:K:G:H:xQaI:BR", + c = getopt_long(argc, argv, "Frtli:q:pSNn:w:czf:muMd:b:C:s:P:VJ:K:G:H:T:xQaI:BR", long_options, &option_index); if (c == -1) break; @@ -1183,6 +1202,10 @@ static void parse_command_line(int argc, char **argv) usage(basename(argv[0])); } break; + case 'T': + opt_tx_cycle_ns = atoi(optarg); + opt_tx_cycle_ns *= NSEC_PER_USEC; + break; case 'x': opt_extra_stats = 1; break; @@ -1388,7 +1411,7 @@ static void rx_drop_all(void) } } -static void tx_only(struct xsk_socket_info *xsk, u32 *frame_nb, int batch_size) +static int tx_only(struct xsk_socket_info *xsk, u32 *frame_nb, int batch_size) { u32 idx; unsigned int i; @@ -1397,7 +1420,7 @@ static void tx_only(struct xsk_socket_info *xsk, u32 *frame_nb, int batch_size) batch_size) { complete_tx_only(xsk, batch_size); if (benchmark_done) - return; + return 0; } for (i = 0; i < batch_size; i++) { @@ -1413,6 +1436,8 @@ static void tx_only(struct xsk_socket_info *xsk, u32 *frame_nb, int batch_size) *frame_nb += batch_size; *frame_nb %= NUM_FRAMES; complete_tx_only(xsk, batch_size); + + return batch_size; } static inline int get_batch_size(int pkt_cnt) @@ -1446,16 +1471,39 @@ static void tx_only_all(void) { struct pollfd fds[MAX_SOCKS] = {}; u32 frame_nb[MAX_SOCKS] = {}; + unsigned long next_tx_ns = 0; int pkt_cnt = 0; int i, ret; + if (opt_poll && opt_tx_cycle_ns) { + fprintf(stderr, + "Error: --poll and --tx-cycles are both set\n"); + return; + } + for (i = 0; i < num_socks; i++) { fds[0].fd = xsk_socket__fd(xsks[i]->xsk); fds[0].events = POLLOUT; } + if (opt_tx_cycle_ns) { + /* Align Tx time to micro-second boundary */ + next_tx_ns = (get_nsecs() / NSEC_PER_USEC + 1) * + NSEC_PER_USEC; + next_tx_ns += opt_tx_cycle_ns; + + /* Initialize periodic Tx scheduling variance */ + tx_cycle_diff_min = 1000000000; + tx_cycle_diff_max = 0; + tx_cycle_diff_ave = 0.0; + } + while ((opt_pkt_count && pkt_cnt < opt_pkt_count) || !opt_pkt_count) { int batch_size = get_batch_size(pkt_cnt); + struct timespec next; + int tx_cnt = 0; + long diff; + int err; if (opt_poll) { for (i = 0; i < num_socks; i++) @@ -1468,13 +1516,40 @@ static void tx_only_all(void) continue; } - for (i = 0; i < num_socks; i++) - tx_only(xsks[i], &frame_nb[i], batch_size); + if (opt_tx_cycle_ns) { + next.tv_sec = next_tx_ns / NSEC_PER_SEC; + next.tv_nsec = next_tx_ns % NSEC_PER_SEC; + err = clock_nanosleep(opt_clock, TIMER_ABSTIME, &next, NULL); + if (err) { + if (err != EINTR) + fprintf(stderr, + "clock_nanosleep failed. 
Err:%d errno:%d\n", + err, errno); + break; + } - pkt_cnt += batch_size; + /* Measure periodic Tx scheduling variance */ + diff = get_nsecs() - next_tx_ns; + if (diff < tx_cycle_diff_min) + tx_cycle_diff_min = diff; + + if (diff > tx_cycle_diff_max) + tx_cycle_diff_max = diff; + + tx_cycle_diff_ave += (double)diff; + tx_cycle_cnt++; + } + + for (i = 0; i < num_socks; i++) + tx_cnt += tx_only(xsks[i], &frame_nb[i], batch_size); + + pkt_cnt += tx_cnt; if (benchmark_done) break; + + if (opt_tx_cycle_ns) + next_tx_ns += opt_tx_cycle_ns; } if (opt_pkt_count) From fa24d0b1d57825d1a5b802339728d4d8ac20b6d6 Mon Sep 17 00:00:00 2001 From: Ong Boon Leong Date: Thu, 30 Dec 2021 11:54:45 +0800 Subject: [PATCH 28/41] samples/bpf: xdpsock: Add sched policy and priority support By default, TX schedule policy is SCHED_OTHER (round-robin time-sharing). To improve TX cyclic scheduling, we add SCHED_FIFO policy and its priority by using -W FIFO or --policy=FIFO and -U or --schpri=. A) From xdpsock --app-stats, for SCHED_OTHER policy: $ xdpsock -i eth0 -t -N -z -T 1000 -b 16 -C 100000 -a period min ave max cycle Cyclic TX 1000000 53507 75334 712642 6250 B) For SCHED_FIFO policy and schpri=50: $ xdpsock -i eth0 -t -N -z -T 1000 -b 16 -C 100000 -a -W FIFO -U 50 period min ave max cycle Cyclic TX 1000000 3699 24859 54397 6250 Signed-off-by: Ong Boon Leong Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211230035447.523177-6-boon.leong.ong@intel.com --- samples/bpf/xdpsock_user.c | 61 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 59 insertions(+), 2 deletions(-) diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index a2a42ec4b0e9..b7d0f536f974 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include @@ -66,6 +67,8 @@ #define NSEC_PER_SEC 1000000000UL #define NSEC_PER_USEC 1000 +#define SCHED_PRI__DEFAULT 0 + typedef __u64 u64; typedef __u32 u32; typedef __u16 u16; @@ -123,6 +126,8 @@ static bool opt_busy_poll; static bool opt_reduced_cap; static clockid_t opt_clock = CLOCK_MONOTONIC; static unsigned long opt_tx_cycle_ns; +static int opt_schpolicy = SCHED_OTHER; +static int opt_schprio = SCHED_PRI__DEFAULT; struct vlan_ethhdr { unsigned char h_dest[6]; @@ -198,6 +203,15 @@ static const struct clockid_map { { NULL } }; +static const struct sched_map { + const char *name; + int policy; +} schmap[] = { + { "OTHER", SCHED_OTHER }, + { "FIFO", SCHED_FIFO }, + { NULL } +}; + static int num_socks; struct xsk_socket_info *xsks[MAX_SOCKS]; int sock; @@ -216,6 +230,20 @@ static int get_clockid(clockid_t *id, const char *name) return -1; } +static int get_schpolicy(int *policy, const char *name) +{ + const struct sched_map *sch; + + for (sch = schmap; sch->name; sch++) { + if (strcasecmp(sch->name, name) == 0) { + *policy = sch->policy; + return 0; + } + } + + return -1; +} + static unsigned long get_nsecs(void) { struct timespec ts; @@ -1019,6 +1047,8 @@ static struct option long_options[] = { {"tx-dmac", required_argument, 0, 'G'}, {"tx-smac", required_argument, 0, 'H'}, {"tx-cycle", required_argument, 0, 'T'}, + {"policy", required_argument, 0, 'W'}, + {"schpri", required_argument, 0, 'U'}, {"extra-stats", no_argument, 0, 'x'}, {"quiet", no_argument, 0, 'Q'}, {"app-stats", no_argument, 0, 'a'}, @@ -1066,6 +1096,8 @@ static void usage(const char *prog) " -G, --tx-dmac= Dest MAC addr of TX frame in aa:bb:cc:dd:ee:ff format (For -V|--tx-vlan)\n" " -H, --tx-smac= Src MAC addr of TX frame in 
aa:bb:cc:dd:ee:ff format (For -V|--tx-vlan)\n" " -T, --tx-cycle=n Tx cycle time in micro-seconds (For -t|--txonly).\n" + " -W, --policy=POLICY Schedule policy. Default: SCHED_OTHER\n" + " -U, --schpri=n Schedule priority. Default: %d\n" " -x, --extra-stats Display extra statistics.\n" " -Q, --quiet Do not display any stats.\n" " -a, --app-stats Display application (syscall) statistics.\n" @@ -1076,7 +1108,8 @@ static void usage(const char *prog) fprintf(stderr, str, prog, XSK_UMEM__DEFAULT_FRAME_SIZE, opt_batch_size, MIN_PKT_SIZE, MIN_PKT_SIZE, XSK_UMEM__DEFAULT_FRAME_SIZE, opt_pkt_fill_pattern, - VLAN_VID__DEFAULT, VLAN_PRI__DEFAULT); + VLAN_VID__DEFAULT, VLAN_PRI__DEFAULT, + SCHED_PRI__DEFAULT); exit(EXIT_FAILURE); } @@ -1088,7 +1121,8 @@ static void parse_command_line(int argc, char **argv) opterr = 0; for (;;) { - c = getopt_long(argc, argv, "Frtli:q:pSNn:w:czf:muMd:b:C:s:P:VJ:K:G:H:T:xQaI:BR", + c = getopt_long(argc, argv, + "Frtli:q:pSNn:w:czf:muMd:b:C:s:P:VJ:K:G:H:T:W:U:xQaI:BR", long_options, &option_index); if (c == -1) break; @@ -1206,6 +1240,17 @@ static void parse_command_line(int argc, char **argv) opt_tx_cycle_ns = atoi(optarg); opt_tx_cycle_ns *= NSEC_PER_USEC; break; + case 'W': + if (get_schpolicy(&opt_schpolicy, optarg)) { + fprintf(stderr, + "ERROR: Invalid policy %s. Default to SCHED_OTHER.\n", + optarg); + opt_schpolicy = SCHED_OTHER; + } + break; + case 'U': + opt_schprio = atoi(optarg); + break; case 'x': opt_extra_stats = 1; break; @@ -1780,6 +1825,7 @@ int main(int argc, char **argv) struct __user_cap_data_struct data[2] = { { 0 } }; struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; bool rx = false, tx = false; + struct sched_param schparam; struct xsk_umem_info *umem; struct bpf_object *obj; int xsks_map_fd = 0; @@ -1881,6 +1927,16 @@ int main(int argc, char **argv) prev_time = get_nsecs(); start_time = prev_time; + /* Configure sched priority for better wake-up accuracy */ + memset(&schparam, 0, sizeof(schparam)); + schparam.sched_priority = opt_schprio; + ret = sched_setscheduler(0, opt_schpolicy, &schparam); + if (ret) { + fprintf(stderr, "Error(%d) in setting priority(%d): %s\n", + errno, opt_schprio, strerror(errno)); + goto out; + } + if (opt_bench == BENCH_RXDROP) rx_drop_all(); else if (opt_bench == BENCH_TXONLY) @@ -1888,6 +1944,7 @@ int main(int argc, char **argv) else l2fwd_all(); +out: benchmark_done = true; if (!opt_quiet) From 8121e78932018df48758985e00651e16ff34ae5f Mon Sep 17 00:00:00 2001 From: Ong Boon Leong Date: Thu, 30 Dec 2021 11:54:46 +0800 Subject: [PATCH 29/41] samples/bpf: xdpsock: Add time-out for cleaning Tx When user sets tx-pkt-count and in case where there are invalid Tx frame, the complete_tx_only_all() process polls indefinitely. So, this patch adds a time-out mechanism into the process so that the application can terminate automatically after it retries 3*polling interval duration. v1->v2: Thanks to Jesper's and Song Liu's suggestion. 
- clean-up git message to remove polling log - make the Tx time-out retries configurable with 1s granularity Signed-off-by: Ong Boon Leong Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211230035447.523177-7-boon.leong.ong@intel.com --- samples/bpf/xdpsock_user.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index b7d0f536f974..319cb3cdb226 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -113,6 +113,7 @@ static u32 irq_no; static int irqs_at_init = -1; static int opt_poll; static int opt_interval = 1; +static int opt_retries = 3; static u32 opt_xdp_bind_flags = XDP_USE_NEED_WAKEUP; static u32 opt_umem_flags; static int opt_unaligned_chunks; @@ -1028,6 +1029,7 @@ static struct option long_options[] = { {"xdp-skb", no_argument, 0, 'S'}, {"xdp-native", no_argument, 0, 'N'}, {"interval", required_argument, 0, 'n'}, + {"retries", required_argument, 0, 'O'}, {"zero-copy", no_argument, 0, 'z'}, {"copy", no_argument, 0, 'c'}, {"frame-size", required_argument, 0, 'f'}, @@ -1072,6 +1074,7 @@ static void usage(const char *prog) " -S, --xdp-skb=n Use XDP skb-mod\n" " -N, --xdp-native=n Enforce XDP native mode\n" " -n, --interval=n Specify statistics update interval (default 1 sec).\n" + " -O, --retries=n Specify time-out retries (1s interval) attempt (default 3).\n" " -z, --zero-copy Force zero-copy mode.\n" " -c, --copy Force copy mode.\n" " -m, --no-need-wakeup Turn off use of driver need wakeup flag.\n" @@ -1122,7 +1125,7 @@ static void parse_command_line(int argc, char **argv) for (;;) { c = getopt_long(argc, argv, - "Frtli:q:pSNn:w:czf:muMd:b:C:s:P:VJ:K:G:H:T:W:U:xQaI:BR", + "Frtli:q:pSNn:w:O:czf:muMd:b:C:s:P:VJ:K:G:H:T:W:U:xQaI:BR", long_options, &option_index); if (c == -1) break; @@ -1164,6 +1167,9 @@ static void parse_command_line(int argc, char **argv) opt_clock = CLOCK_MONOTONIC; } break; + case 'O': + opt_retries = atoi(optarg); + break; case 'z': opt_xdp_bind_flags |= XDP_ZEROCOPY; break; @@ -1509,7 +1515,8 @@ static void complete_tx_only_all(void) pending = !!xsks[i]->outstanding_tx; } } - } while (pending); + sleep(1); + } while (pending && opt_retries-- > 0); } static void tx_only_all(void) From eb68db45b747756c351ea84e9af55a69468d0549 Mon Sep 17 00:00:00 2001 From: Ong Boon Leong Date: Thu, 30 Dec 2021 11:54:47 +0800 Subject: [PATCH 30/41] samples/bpf: xdpsock: Add timestamp for Tx-only operation It may be useful to add timestamp for Tx packets for continuous or cyclic transmit operation. The timestamp and sequence ID of a Tx packet are stored according to pktgen header format. To enable per-packet timestamp, use -y|--tstamp option. If timestamp is off, pktgen header is not included in the UDP payload. This means receiving side can use the magic number for pktgen for differentiation. The implementation supports both VLAN tagged and untagged option. By default, the minimum packet size is set at 64B. However, if VLAN tagged is on (-V), the minimum packet size is increased to 66B just so to fit the pktgen_hdr size. Added hex_dump() into the code path just for future cross-checking. As before, simply change to "#define DEBUG_HEXDUMP 1" to inspect the accuracy of TX packet. 
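For illustration, a receiving application could detect the pktgen header in the UDP payload and recover the Tx timestamp roughly as follows. This is a sketch only; the field layout mirrors the struct added in this patch, 0xbe9be955 is the pktgen magic it uses, and decode_pktgen() is a hypothetical helper:

  #include <stdint.h>
  #include <stdbool.h>
  #include <arpa/inet.h>

  struct pktgen_hdr {            /* all fields big-endian on the wire */
          uint32_t pgh_magic;
          uint32_t seq_num;
          uint32_t tv_sec;
          uint32_t tv_usec;
  };

  /* payload points at the start of the UDP payload of a received frame. */
  static bool decode_pktgen(const void *payload, uint32_t *seq, uint64_t *tx_ns)
  {
          const struct pktgen_hdr *pgh = payload;

          if (ntohl(pgh->pgh_magic) != 0xbe9be955)
                  return false;
          *seq = ntohl(pgh->seq_num);
          *tx_ns = (uint64_t)ntohl(pgh->tv_sec) * 1000000000ULL +
                   (uint64_t)ntohl(pgh->tv_usec) * 1000ULL;
          return true;
  }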
Signed-off-by: Ong Boon Leong Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211230035447.523177-8-boon.leong.ong@intel.com --- samples/bpf/xdpsock_user.c | 77 +++++++++++++++++++++++++++++++++----- 1 file changed, 68 insertions(+), 9 deletions(-) diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index 319cb3cdb226..aa50864e4415 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -111,6 +111,7 @@ static bool opt_app_stats; static const char *opt_irq_str = ""; static u32 irq_no; static int irqs_at_init = -1; +static u32 sequence; static int opt_poll; static int opt_interval = 1; static int opt_retries = 3; @@ -129,6 +130,7 @@ static clockid_t opt_clock = CLOCK_MONOTONIC; static unsigned long opt_tx_cycle_ns; static int opt_schpolicy = SCHED_OTHER; static int opt_schprio = SCHED_PRI__DEFAULT; +static bool opt_tstamp; struct vlan_ethhdr { unsigned char h_dest[6]; @@ -138,6 +140,14 @@ struct vlan_ethhdr { __be16 h_vlan_encapsulated_proto; }; +#define PKTGEN_MAGIC 0xbe9be955 +struct pktgen_hdr { + __be32 pgh_magic; + __be32 seq_num; + __be32 tv_sec; + __be32 tv_usec; +}; + struct xsk_ring_stats { unsigned long rx_npkts; unsigned long tx_npkts; @@ -836,18 +846,25 @@ static inline u16 udp_csum(u32 saddr, u32 daddr, u32 len, #define ETH_HDR_SIZE (opt_vlan_tag ? sizeof(struct vlan_ethhdr) : \ sizeof(struct ethhdr)) +#define PKTGEN_HDR_SIZE (opt_tstamp ? sizeof(struct pktgen_hdr) : 0) #define PKT_HDR_SIZE (ETH_HDR_SIZE + sizeof(struct iphdr) + \ - sizeof(struct udphdr)) + sizeof(struct udphdr) + PKTGEN_HDR_SIZE) +#define PKTGEN_HDR_OFFSET (ETH_HDR_SIZE + sizeof(struct iphdr) + \ + sizeof(struct udphdr)) +#define PKTGEN_SIZE_MIN (PKTGEN_HDR_OFFSET + sizeof(struct pktgen_hdr) + \ + ETH_FCS_SIZE) #define PKT_SIZE (opt_pkt_size - ETH_FCS_SIZE) #define IP_PKT_SIZE (PKT_SIZE - ETH_HDR_SIZE) #define UDP_PKT_SIZE (IP_PKT_SIZE - sizeof(struct iphdr)) -#define UDP_PKT_DATA_SIZE (UDP_PKT_SIZE - sizeof(struct udphdr)) +#define UDP_PKT_DATA_SIZE (UDP_PKT_SIZE - \ + (sizeof(struct udphdr) + PKTGEN_HDR_SIZE)) static u8 pkt_data[XSK_UMEM__DEFAULT_FRAME_SIZE]; static void gen_eth_hdr_data(void) { + struct pktgen_hdr *pktgen_hdr; struct udphdr *udp_hdr; struct iphdr *ip_hdr; @@ -860,7 +877,10 @@ static void gen_eth_hdr_data(void) sizeof(struct iphdr)); ip_hdr = (struct iphdr *)(pkt_data + sizeof(struct vlan_ethhdr)); - + pktgen_hdr = (struct pktgen_hdr *)(pkt_data + + sizeof(struct vlan_ethhdr) + + sizeof(struct iphdr) + + sizeof(struct udphdr)); /* ethernet & VLAN header */ memcpy(veth_hdr->h_dest, &opt_txdmac, ETH_ALEN); memcpy(veth_hdr->h_source, &opt_txsmac, ETH_ALEN); @@ -877,7 +897,10 @@ static void gen_eth_hdr_data(void) sizeof(struct iphdr)); ip_hdr = (struct iphdr *)(pkt_data + sizeof(struct ethhdr)); - + pktgen_hdr = (struct pktgen_hdr *)(pkt_data + + sizeof(struct ethhdr) + + sizeof(struct iphdr) + + sizeof(struct udphdr)); /* ethernet header */ memcpy(eth_hdr->h_dest, &opt_txdmac, ETH_ALEN); memcpy(eth_hdr->h_source, &opt_txsmac, ETH_ALEN); @@ -906,6 +929,9 @@ static void gen_eth_hdr_data(void) udp_hdr->dest = htons(0x1000); udp_hdr->len = htons(UDP_PKT_SIZE); + if (opt_tstamp) + pktgen_hdr->pgh_magic = htonl(PKTGEN_MAGIC); + /* UDP data */ memset32_htonl(pkt_data + PKT_HDR_SIZE, opt_pkt_fill_pattern, UDP_PKT_DATA_SIZE); @@ -1049,6 +1075,7 @@ static struct option long_options[] = { {"tx-dmac", required_argument, 0, 'G'}, {"tx-smac", required_argument, 0, 'H'}, {"tx-cycle", required_argument, 0, 'T'}, + {"tstamp", no_argument, 0, 'y'}, 
{"policy", required_argument, 0, 'W'}, {"schpri", required_argument, 0, 'U'}, {"extra-stats", no_argument, 0, 'x'}, @@ -1099,6 +1126,7 @@ static void usage(const char *prog) " -G, --tx-dmac= Dest MAC addr of TX frame in aa:bb:cc:dd:ee:ff format (For -V|--tx-vlan)\n" " -H, --tx-smac= Src MAC addr of TX frame in aa:bb:cc:dd:ee:ff format (For -V|--tx-vlan)\n" " -T, --tx-cycle=n Tx cycle time in micro-seconds (For -t|--txonly).\n" + " -y, --tstamp Add time-stamp to packet (For -t|--txonly).\n" " -W, --policy=POLICY Schedule policy. Default: SCHED_OTHER\n" " -U, --schpri=n Schedule priority. Default: %d\n" " -x, --extra-stats Display extra statistics.\n" @@ -1125,7 +1153,7 @@ static void parse_command_line(int argc, char **argv) for (;;) { c = getopt_long(argc, argv, - "Frtli:q:pSNn:w:O:czf:muMd:b:C:s:P:VJ:K:G:H:T:W:U:xQaI:BR", + "Frtli:q:pSNn:w:O:czf:muMd:b:C:s:P:VJ:K:G:H:T:yW:U:xQaI:BR", long_options, &option_index); if (c == -1) break; @@ -1246,6 +1274,9 @@ static void parse_command_line(int argc, char **argv) opt_tx_cycle_ns = atoi(optarg); opt_tx_cycle_ns *= NSEC_PER_USEC; break; + case 'y': + opt_tstamp = 1; + break; case 'W': if (get_schpolicy(&opt_schpolicy, optarg)) { fprintf(stderr, @@ -1462,9 +1493,10 @@ static void rx_drop_all(void) } } -static int tx_only(struct xsk_socket_info *xsk, u32 *frame_nb, int batch_size) +static int tx_only(struct xsk_socket_info *xsk, u32 *frame_nb, + int batch_size, unsigned long tx_ns) { - u32 idx; + u32 idx, tv_sec, tv_usec; unsigned int i; while (xsk_ring_prod__reserve(&xsk->tx, batch_size, &idx) < @@ -1474,11 +1506,31 @@ static int tx_only(struct xsk_socket_info *xsk, u32 *frame_nb, int batch_size) return 0; } + if (opt_tstamp) { + tv_sec = (u32)(tx_ns / NSEC_PER_SEC); + tv_usec = (u32)((tx_ns % NSEC_PER_SEC) / 1000); + } + for (i = 0; i < batch_size; i++) { struct xdp_desc *tx_desc = xsk_ring_prod__tx_desc(&xsk->tx, idx + i); tx_desc->addr = (*frame_nb + i) * opt_xsk_frame_size; tx_desc->len = PKT_SIZE; + + if (opt_tstamp) { + struct pktgen_hdr *pktgen_hdr; + u64 addr = tx_desc->addr; + char *pkt; + + pkt = xsk_umem__get_data(xsk->umem->buffer, addr); + pktgen_hdr = (struct pktgen_hdr *)(pkt + PKTGEN_HDR_OFFSET); + + pktgen_hdr->seq_num = htonl(sequence++); + pktgen_hdr->tv_sec = htonl(tv_sec); + pktgen_hdr->tv_usec = htonl(tv_usec); + + hex_dump(pkt, PKT_SIZE, addr); + } } xsk_ring_prod__submit(&xsk->tx, batch_size); @@ -1552,6 +1604,7 @@ static void tx_only_all(void) while ((opt_pkt_count && pkt_cnt < opt_pkt_count) || !opt_pkt_count) { int batch_size = get_batch_size(pkt_cnt); + unsigned long tx_ns = 0; struct timespec next; int tx_cnt = 0; long diff; @@ -1581,7 +1634,8 @@ static void tx_only_all(void) } /* Measure periodic Tx scheduling variance */ - diff = get_nsecs() - next_tx_ns; + tx_ns = get_nsecs(); + diff = tx_ns - next_tx_ns; if (diff < tx_cycle_diff_min) tx_cycle_diff_min = diff; @@ -1590,10 +1644,12 @@ static void tx_only_all(void) tx_cycle_diff_ave += (double)diff; tx_cycle_cnt++; + } else if (opt_tstamp) { + tx_ns = get_nsecs(); } for (i = 0; i < num_socks; i++) - tx_cnt += tx_only(xsks[i], &frame_nb[i], batch_size); + tx_cnt += tx_only(xsks[i], &frame_nb[i], batch_size, tx_ns); pkt_cnt += tx_cnt; @@ -1895,6 +1951,9 @@ int main(int argc, char **argv) apply_setsockopt(xsks[i]); if (opt_bench == BENCH_TXONLY) { + if (opt_tstamp && opt_pkt_size < PKTGEN_SIZE_MIN) + opt_pkt_size = PKTGEN_SIZE_MIN; + gen_eth_hdr_data(); for (i = 0; i < NUM_FRAMES; i++) From 4a48ef70b93b8c7ed5190adfca18849e76387b80 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Mon, 3 Jan 2022 16:08:06 +0100 Subject: [PATCH 31/41] xdp: Allow registering memory model without rxq reference MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The functions that register an XDP memory model take a struct xdp_rxq as parameter, but the RXQ is not actually used for anything other than pulling out the struct xdp_mem_info that it embeds. So refactor the register functions and export variants that just take a pointer to the xdp_mem_info. This is in preparation for enabling XDP_REDIRECT in bpf_prog_run(), using a page_pool instance that is not connected to any network device. Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220103150812.87914-2-toke@redhat.com --- include/net/xdp.h | 3 ++ net/core/xdp.c | 92 +++++++++++++++++++++++++++++++---------------- 2 files changed, 65 insertions(+), 30 deletions(-) diff --git a/include/net/xdp.h b/include/net/xdp.h index 447f9b1578f3..8f0812e4996d 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -260,6 +260,9 @@ bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq); int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, enum xdp_mem_type type, void *allocator); void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq); +int xdp_reg_mem_model(struct xdp_mem_info *mem, + enum xdp_mem_type type, void *allocator); +void xdp_unreg_mem_model(struct xdp_mem_info *mem); /* Drivers not supporting XDP metadata can use this helper, which * rejects any room expansion for metadata as a result. diff --git a/net/core/xdp.c b/net/core/xdp.c index 7fe1df85f505..58089f6d2c7a 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -110,20 +110,15 @@ static void mem_allocator_disconnect(void *allocator) mutex_unlock(&mem_id_lock); } -void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) +void xdp_unreg_mem_model(struct xdp_mem_info *mem) { struct xdp_mem_allocator *xa; - int type = xdp_rxq->mem.type; - int id = xdp_rxq->mem.id; + int type = mem->type; + int id = mem->id; /* Reset mem info to defaults */ - xdp_rxq->mem.id = 0; - xdp_rxq->mem.type = 0; - - if (xdp_rxq->reg_state != REG_STATE_REGISTERED) { - WARN(1, "Missing register, driver bug"); - return; - } + mem->id = 0; + mem->type = 0; if (id == 0) return; @@ -135,6 +130,17 @@ void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) rcu_read_unlock(); } } +EXPORT_SYMBOL_GPL(xdp_unreg_mem_model); + +void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) +{ + if (xdp_rxq->reg_state != REG_STATE_REGISTERED) { + WARN(1, "Missing register, driver bug"); + return; + } + + xdp_unreg_mem_model(&xdp_rxq->mem); +} EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg_mem_model); void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq) @@ -259,28 +265,24 @@ static bool __is_supported_mem_type(enum xdp_mem_type type) return true; } -int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, - enum xdp_mem_type type, void *allocator) +static struct xdp_mem_allocator *__xdp_reg_mem_model(struct xdp_mem_info *mem, + enum xdp_mem_type type, + void *allocator) { struct xdp_mem_allocator *xdp_alloc; gfp_t gfp = GFP_KERNEL; int id, errno, ret; void *ptr; - if (xdp_rxq->reg_state != REG_STATE_REGISTERED) { - WARN(1, "Missing register, driver bug"); - return -EFAULT; - } - if (!__is_supported_mem_type(type)) - return -EOPNOTSUPP; + return ERR_PTR(-EOPNOTSUPP); - xdp_rxq->mem.type = type; + mem->type = type; if (!allocator) { if (type == 
MEM_TYPE_PAGE_POOL) - return -EINVAL; /* Setup time check page_pool req */ - return 0; + return ERR_PTR(-EINVAL); /* Setup time check page_pool req */ + return NULL; } /* Delay init of rhashtable to save memory if feature isn't used */ @@ -290,13 +292,13 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, mutex_unlock(&mem_id_lock); if (ret < 0) { WARN_ON(1); - return ret; + return ERR_PTR(ret); } } xdp_alloc = kzalloc(sizeof(*xdp_alloc), gfp); if (!xdp_alloc) - return -ENOMEM; + return ERR_PTR(-ENOMEM); mutex_lock(&mem_id_lock); id = __mem_id_cyclic_get(gfp); @@ -304,15 +306,15 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, errno = id; goto err; } - xdp_rxq->mem.id = id; - xdp_alloc->mem = xdp_rxq->mem; + mem->id = id; + xdp_alloc->mem = *mem; xdp_alloc->allocator = allocator; /* Insert allocator into ID lookup table */ ptr = rhashtable_insert_slow(mem_id_ht, &id, &xdp_alloc->node); if (IS_ERR(ptr)) { - ida_simple_remove(&mem_id_pool, xdp_rxq->mem.id); - xdp_rxq->mem.id = 0; + ida_simple_remove(&mem_id_pool, mem->id); + mem->id = 0; errno = PTR_ERR(ptr); goto err; } @@ -322,13 +324,43 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, mutex_unlock(&mem_id_lock); - trace_mem_connect(xdp_alloc, xdp_rxq); - return 0; + return xdp_alloc; err: mutex_unlock(&mem_id_lock); kfree(xdp_alloc); - return errno; + return ERR_PTR(errno); } + +int xdp_reg_mem_model(struct xdp_mem_info *mem, + enum xdp_mem_type type, void *allocator) +{ + struct xdp_mem_allocator *xdp_alloc; + + xdp_alloc = __xdp_reg_mem_model(mem, type, allocator); + if (IS_ERR(xdp_alloc)) + return PTR_ERR(xdp_alloc); + return 0; +} +EXPORT_SYMBOL_GPL(xdp_reg_mem_model); + +int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, + enum xdp_mem_type type, void *allocator) +{ + struct xdp_mem_allocator *xdp_alloc; + + if (xdp_rxq->reg_state != REG_STATE_REGISTERED) { + WARN(1, "Missing register, driver bug"); + return -EFAULT; + } + + xdp_alloc = __xdp_reg_mem_model(&xdp_rxq->mem, type, allocator); + if (IS_ERR(xdp_alloc)) + return PTR_ERR(xdp_alloc); + + trace_mem_connect(xdp_alloc, xdp_rxq); + return 0; +} + EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model); /* XDP RX runs under NAPI protection, and in different delivery error From 35b2e549894b7ef0b6e7f3a70c2ab75b767cfce9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Mon, 3 Jan 2022 16:08:07 +0100 Subject: [PATCH 32/41] page_pool: Add callback to init pages when they are allocated MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a new callback function to page_pool that, if set, will be called every time a new page is allocated. This will be used from bpf_test_run() to initialise the page data with the data provided by userspace when running XDP programs with redirect turned on. 
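As a sketch of how a page_pool creator might wire up the two new fields (hypothetical caller-side snippet; dev, template_buf and TEMPLATE_LEN are placeholders and not part of this patch):

  static void init_page_data(struct page *page, void *arg)
  {
          /* Seed every freshly allocated page with a caller-provided template. */
          memcpy(page_address(page), arg, TEMPLATE_LEN);
  }

  struct page_pool_params pp_params = {
          .order         = 0,
          .pool_size     = 256,
          .nid           = NUMA_NO_NODE,
          .dev           = dev,
          .dma_dir       = DMA_BIDIRECTIONAL,
          .init_callback = init_page_data,
          .init_arg      = template_buf,
  };
  struct page_pool *pp = page_pool_create(&pp_params);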
Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/bpf/20220103150812.87914-3-toke@redhat.com --- include/net/page_pool.h | 2 ++ net/core/page_pool.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/include/net/page_pool.h b/include/net/page_pool.h index a4082406a003..d807b6800a4a 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -80,6 +80,8 @@ struct page_pool_params { enum dma_data_direction dma_dir; /* DMA mapping direction */ unsigned int max_len; /* max DMA sync memory size */ unsigned int offset; /* DMA addr offset */ + void (*init_callback)(struct page *page, void *arg); + void *init_arg; }; struct page_pool { diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 1a6978427d6c..f53786f6666d 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -217,6 +217,8 @@ static void page_pool_set_pp_info(struct page_pool *pool, { page->pp = pool; page->pp_magic |= PP_SIGNATURE; + if (pool->p.init_callback) + pool->p.init_callback(page, pool->p.init_arg); } static void page_pool_clear_pp_info(struct page *page) From 64693ec7774e471f817a725686d93903e919a2e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Mon, 3 Jan 2022 16:08:08 +0100 Subject: [PATCH 33/41] page_pool: Store the XDP mem id MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Store the XDP mem ID inside the page_pool struct so it can be retrieved later for use in bpf_prog_run(). Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/bpf/20220103150812.87914-4-toke@redhat.com --- include/net/page_pool.h | 9 +++++++-- net/core/page_pool.c | 4 +++- net/core/xdp.c | 2 +- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/include/net/page_pool.h b/include/net/page_pool.h index d807b6800a4a..79a805542d0f 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -96,6 +96,7 @@ struct page_pool { unsigned int frag_offset; struct page *frag_page; long frag_users; + u32 xdp_mem_id; /* * Data structure for allocation side @@ -170,9 +171,12 @@ bool page_pool_return_skb_page(struct page *page); struct page_pool *page_pool_create(const struct page_pool_params *params); +struct xdp_mem_info; + #ifdef CONFIG_PAGE_POOL void page_pool_destroy(struct page_pool *pool); -void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *)); +void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *), + struct xdp_mem_info *mem); void page_pool_release_page(struct page_pool *pool, struct page *page); void page_pool_put_page_bulk(struct page_pool *pool, void **data, int count); @@ -182,7 +186,8 @@ static inline void page_pool_destroy(struct page_pool *pool) } static inline void page_pool_use_xdp_mem(struct page_pool *pool, - void (*disconnect)(void *)) + void (*disconnect)(void *), + struct xdp_mem_info *mem) { } static inline void page_pool_release_page(struct page_pool *pool, diff --git a/net/core/page_pool.c b/net/core/page_pool.c index f53786f6666d..7347d5c7dbe0 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -693,10 +693,12 @@ static void page_pool_release_retry(struct work_struct *wq) schedule_delayed_work(&pool->release_dw, DEFER_TIME); } -void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *)) +void page_pool_use_xdp_mem(struct page_pool *pool, void 
(*disconnect)(void *), + struct xdp_mem_info *mem) { refcount_inc(&pool->user_cnt); pool->disconnect = disconnect; + pool->xdp_mem_id = mem->id; } void page_pool_destroy(struct page_pool *pool) diff --git a/net/core/xdp.c b/net/core/xdp.c index 58089f6d2c7a..7aba35504986 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -320,7 +320,7 @@ static struct xdp_mem_allocator *__xdp_reg_mem_model(struct xdp_mem_info *mem, } if (type == MEM_TYPE_PAGE_POOL) - page_pool_use_xdp_mem(allocator, mem_allocator_disconnect); + page_pool_use_xdp_mem(allocator, mem_allocator_disconnect, mem); mutex_unlock(&mem_id_lock); From d53ad5d8b218a885e95080d4d3d556b16b91b1b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Mon, 3 Jan 2022 16:08:09 +0100 Subject: [PATCH 34/41] xdp: Move conversion to xdp_frame out of map functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All map redirect functions except XSK maps convert xdp_buff to xdp_frame before enqueueing it. So move this conversion of out the map functions and into xdp_do_redirect(). This removes a bit of duplicated code, but more importantly it makes it possible to support caller-allocated xdp_frame structures, which will be added in a subsequent commit. Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220103150812.87914-5-toke@redhat.com --- include/linux/bpf.h | 20 ++++++++++---------- kernel/bpf/cpumap.c | 8 +------- kernel/bpf/devmap.c | 32 +++++++++++--------------------- net/core/filter.c | 24 +++++++++++++++++------- 4 files changed, 39 insertions(+), 45 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 26753139d5b4..6e947cd91152 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1669,17 +1669,17 @@ void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth); struct btf *bpf_get_btf_vmlinux(void); /* Map specifics */ -struct xdp_buff; +struct xdp_frame; struct sk_buff; struct bpf_dtab_netdev; struct bpf_cpu_map_entry; void __dev_flush(void); -int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, +int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx); -int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, +int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf, struct net_device *dev_rx); -int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx, +int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx, struct bpf_map *map, bool exclude_ingress); int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, struct bpf_prog *xdp_prog); @@ -1688,7 +1688,7 @@ int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb, bool exclude_ingress); void __cpu_map_flush(void); -int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, +int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf, struct net_device *dev_rx); int cpu_map_generic_redirect(struct bpf_cpu_map_entry *rcpu, struct sk_buff *skb); @@ -1866,26 +1866,26 @@ static inline void __dev_flush(void) { } -struct xdp_buff; +struct xdp_frame; struct bpf_dtab_netdev; struct bpf_cpu_map_entry; static inline -int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, +int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx) { return 0; } static inline -int dev_map_enqueue(struct bpf_dtab_netdev *dst, 
struct xdp_buff *xdp, +int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf, struct net_device *dev_rx) { return 0; } static inline -int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx, +int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx, struct bpf_map *map, bool exclude_ingress) { return 0; @@ -1913,7 +1913,7 @@ static inline void __cpu_map_flush(void) } static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, - struct xdp_buff *xdp, + struct xdp_frame *xdpf, struct net_device *dev_rx) { return 0; diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 0421061d95f1..b3e6b9422238 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -746,15 +746,9 @@ static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) list_add(&bq->flush_node, flush_list); } -int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, +int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf, struct net_device *dev_rx) { - struct xdp_frame *xdpf; - - xdpf = xdp_convert_buff_to_frame(xdp); - if (unlikely(!xdpf)) - return -EOVERFLOW; - /* Info needed when constructing SKB on remote CPU */ xdpf->dev_rx = dev_rx; diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 6feea293ff10..fe019dbdb3f0 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -467,24 +467,19 @@ static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf, bq->q[bq->count++] = xdpf; } -static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, +static inline int __xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx, struct bpf_prog *xdp_prog) { - struct xdp_frame *xdpf; int err; if (!dev->netdev_ops->ndo_xdp_xmit) return -EOPNOTSUPP; - err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data); + err = xdp_ok_fwd_dev(dev, xdpf->len); if (unlikely(err)) return err; - xdpf = xdp_convert_buff_to_frame(xdp); - if (unlikely(!xdpf)) - return -EOVERFLOW; - bq_enqueue(dev, xdpf, dev_rx, xdp_prog); return 0; } @@ -520,27 +515,27 @@ static u32 dev_map_bpf_prog_run_skb(struct sk_buff *skb, struct bpf_dtab_netdev return act; } -int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, +int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx) { - return __xdp_enqueue(dev, xdp, dev_rx, NULL); + return __xdp_enqueue(dev, xdpf, dev_rx, NULL); } -int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, +int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf, struct net_device *dev_rx) { struct net_device *dev = dst->dev; - return __xdp_enqueue(dev, xdp, dev_rx, dst->xdp_prog); + return __xdp_enqueue(dev, xdpf, dev_rx, dst->xdp_prog); } -static bool is_valid_dst(struct bpf_dtab_netdev *obj, struct xdp_buff *xdp) +static bool is_valid_dst(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf) { if (!obj || !obj->dev->netdev_ops->ndo_xdp_xmit) return false; - if (xdp_ok_fwd_dev(obj->dev, xdp->data_end - xdp->data)) + if (xdp_ok_fwd_dev(obj->dev, xdpf->len)) return false; return true; @@ -586,14 +581,13 @@ static int get_upper_ifindexes(struct net_device *dev, int *indexes) return n; } -int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx, +int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx, struct bpf_map *map, bool exclude_ingress) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *dst, *last_dst = NULL; 
int excluded_devices[1+MAX_NEST_DEV]; struct hlist_head *head; - struct xdp_frame *xdpf; int num_excluded = 0; unsigned int i; int err; @@ -603,15 +597,11 @@ int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx, excluded_devices[num_excluded++] = dev_rx->ifindex; } - xdpf = xdp_convert_buff_to_frame(xdp); - if (unlikely(!xdpf)) - return -EOVERFLOW; - if (map->map_type == BPF_MAP_TYPE_DEVMAP) { for (i = 0; i < map->max_entries; i++) { dst = rcu_dereference_check(dtab->netdev_map[i], rcu_read_lock_bh_held()); - if (!is_valid_dst(dst, xdp)) + if (!is_valid_dst(dst, xdpf)) continue; if (is_ifindex_excluded(excluded_devices, num_excluded, dst->dev->ifindex)) @@ -634,7 +624,7 @@ int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx, head = dev_map_index_hash(dtab, i); hlist_for_each_entry_rcu(dst, head, index_hlist, lockdep_is_held(&dtab->index_lock)) { - if (!is_valid_dst(dst, xdp)) + if (!is_valid_dst(dst, xdpf)) continue; if (is_ifindex_excluded(excluded_devices, num_excluded, diff --git a/net/core/filter.c b/net/core/filter.c index cac2be559ab0..e2b83056246c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3964,12 +3964,24 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, enum bpf_map_type map_type = ri->map_type; void *fwd = ri->tgt_value; u32 map_id = ri->map_id; + struct xdp_frame *xdpf; struct bpf_map *map; int err; ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */ ri->map_type = BPF_MAP_TYPE_UNSPEC; + if (map_type == BPF_MAP_TYPE_XSKMAP) { + err = __xsk_map_redirect(fwd, xdp); + goto out; + } + + xdpf = xdp_convert_buff_to_frame(xdp); + if (unlikely(!xdpf)) { + err = -EOVERFLOW; + goto err; + } + switch (map_type) { case BPF_MAP_TYPE_DEVMAP: fallthrough; @@ -3977,17 +3989,14 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, map = READ_ONCE(ri->map); if (unlikely(map)) { WRITE_ONCE(ri->map, NULL); - err = dev_map_enqueue_multi(xdp, dev, map, + err = dev_map_enqueue_multi(xdpf, dev, map, ri->flags & BPF_F_EXCLUDE_INGRESS); } else { - err = dev_map_enqueue(fwd, xdp, dev); + err = dev_map_enqueue(fwd, xdpf, dev); } break; case BPF_MAP_TYPE_CPUMAP: - err = cpu_map_enqueue(fwd, xdp, dev); - break; - case BPF_MAP_TYPE_XSKMAP: - err = __xsk_map_redirect(fwd, xdp); + err = cpu_map_enqueue(fwd, xdpf, dev); break; case BPF_MAP_TYPE_UNSPEC: if (map_id == INT_MAX) { @@ -3996,7 +4005,7 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, err = -EINVAL; break; } - err = dev_xdp_enqueue(fwd, xdp, dev); + err = dev_xdp_enqueue(fwd, xdpf, dev); break; } fallthrough; @@ -4004,6 +4013,7 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, err = -EBADRQC; } +out: if (unlikely(err)) goto err; From 1372d34ccf6dd480332b2bcb2fd59a2b9a0df415 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Mon, 3 Jan 2022 16:08:10 +0100 Subject: [PATCH 35/41] xdp: Add xdp_do_redirect_frame() for pre-computed xdp_frames MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add an xdp_do_redirect_frame() variant which supports pre-computed xdp_frame structures. This will be used in bpf_prog_run() to avoid having to write to the xdp_frame structure when the XDP program doesn't modify the frame boundaries. 
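A rough sketch of the intended call pattern (illustrative only; the actual bpf_test_run() wiring is added later in this series, and prog, dev, xdp and xdpf are assumed to be set up by the caller):

  act = bpf_prog_run_xdp(prog, xdp);
  switch (act) {
  case XDP_REDIRECT:
          /* The program did not change the frame boundaries, so reuse the
           * pre-computed xdp_frame instead of converting the xdp_buff again.
           */
          err = xdp_do_redirect_frame(dev, xdp, xdpf, prog);
          break;
  default:
          /* ... other verdicts handled as usual ... */
          break;
  }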
Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220103150812.87914-6-toke@redhat.com --- include/linux/filter.h | 4 +++ net/core/filter.c | 65 +++++++++++++++++++++++++++++++++++------- 2 files changed, 58 insertions(+), 11 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index 60eec80fa1d4..71fa57b88bfc 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1019,6 +1019,10 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *prog); +int xdp_do_redirect_frame(struct net_device *dev, + struct xdp_buff *xdp, + struct xdp_frame *xdpf, + struct bpf_prog *prog); void xdp_do_flush(void); /* The xdp_do_flush_map() helper has been renamed to drop the _map suffix, as diff --git a/net/core/filter.c b/net/core/filter.c index e2b83056246c..4603b7cd3cd1 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3957,26 +3957,44 @@ u32 xdp_master_redirect(struct xdp_buff *xdp) } EXPORT_SYMBOL_GPL(xdp_master_redirect); -int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog) +static inline int __xdp_do_redirect_xsk(struct bpf_redirect_info *ri, + struct net_device *dev, + struct xdp_buff *xdp, + struct bpf_prog *xdp_prog) +{ + enum bpf_map_type map_type = ri->map_type; + void *fwd = ri->tgt_value; + u32 map_id = ri->map_id; + int err; + + ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */ + ri->map_type = BPF_MAP_TYPE_UNSPEC; + + err = __xsk_map_redirect(fwd, xdp); + if (unlikely(err)) + goto err; + + _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index); + return 0; +err: + _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err); + return err; +} + +static __always_inline int __xdp_do_redirect_frame(struct bpf_redirect_info *ri, + struct net_device *dev, + struct xdp_frame *xdpf, + struct bpf_prog *xdp_prog) { - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); enum bpf_map_type map_type = ri->map_type; void *fwd = ri->tgt_value; u32 map_id = ri->map_id; - struct xdp_frame *xdpf; struct bpf_map *map; int err; ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */ ri->map_type = BPF_MAP_TYPE_UNSPEC; - if (map_type == BPF_MAP_TYPE_XSKMAP) { - err = __xsk_map_redirect(fwd, xdp); - goto out; - } - - xdpf = xdp_convert_buff_to_frame(xdp); if (unlikely(!xdpf)) { err = -EOVERFLOW; goto err; @@ -4013,7 +4031,6 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, err = -EBADRQC; } -out: if (unlikely(err)) goto err; @@ -4023,8 +4040,34 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err); return err; } + +int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, + struct bpf_prog *xdp_prog) +{ + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + enum bpf_map_type map_type = ri->map_type; + + if (map_type == BPF_MAP_TYPE_XSKMAP) + return __xdp_do_redirect_xsk(ri, dev, xdp, xdp_prog); + + return __xdp_do_redirect_frame(ri, dev, xdp_convert_buff_to_frame(xdp), + xdp_prog); +} EXPORT_SYMBOL_GPL(xdp_do_redirect); +int xdp_do_redirect_frame(struct net_device *dev, struct xdp_buff *xdp, + struct xdp_frame *xdpf, struct bpf_prog *xdp_prog) +{ + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + enum bpf_map_type map_type = ri->map_type; + + if 
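For reference, a minimal usage sketch of the batch update API with the now const-qualified key/value arrays (map_fd, keys, values and nr_elems are assumed to be prepared by the caller):

  __u32 count = nr_elems;
  LIBBPF_OPTS(bpf_map_batch_opts, opts,
          .elem_flags = BPF_ANY,
  );

  err = bpf_map_update_batch(map_fd, keys, values, &count, &opts);
  if (err)
          fprintf(stderr, "batch update failed: %d (%u elements updated)\n",
                  err, count);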
(map_type == BPF_MAP_TYPE_XSKMAP) + return __xdp_do_redirect_xsk(ri, dev, xdp, xdp_prog); + + return __xdp_do_redirect_frame(ri, dev, xdpf, xdp_prog); +} +EXPORT_SYMBOL_GPL(xdp_do_redirect_frame); + static int xdp_do_generic_redirect_map(struct net_device *dev, struct sk_buff *skb, struct xdp_buff *xdp, From 70bc793382a0e37ba4e35e4d1a317b280b829a44 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 6 Jan 2022 12:51:56 -0800 Subject: [PATCH 36/41] selftests/bpf: Don't rely on preserving volatile in PT_REGS macros in loop3 PT_REGS*() macro on some architectures force-cast struct pt_regs to other types (user_pt_regs, etc) and might drop volatile modifiers, if any. Volatile isn't really required as pt_regs value isn't supposed to change during the BPF program run, so this is correct behavior. But progs/loop3.c relies on that volatile modifier to ensure that loop is preserved. Fix loop3.c by declaring i and sum variables as volatile instead. It preserves the loop and makes the test pass on all architectures (including s390x which is currently broken). Fixes: 3cc31d794097 ("libbpf: Normalize PT_REGS_xxx() macro definitions") Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220106205156.955373-1-andrii@kernel.org --- tools/testing/selftests/bpf/progs/loop3.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/loop3.c b/tools/testing/selftests/bpf/progs/loop3.c index 76e93b31c14b..717dab14322b 100644 --- a/tools/testing/selftests/bpf/progs/loop3.c +++ b/tools/testing/selftests/bpf/progs/loop3.c @@ -12,9 +12,9 @@ char _license[] SEC("license") = "GPL"; SEC("raw_tracepoint/consume_skb") -int while_true(volatile struct pt_regs* ctx) +int while_true(struct pt_regs *ctx) { - __u64 i = 0, sum = 0; + volatile __u64 i = 0, sum = 0; do { i++; sum += PT_REGS_RC(ctx); From e59618f0f46fa6cf86d5b82380e0f453756b282b Mon Sep 17 00:00:00 2001 From: Grant Seltzer Date: Thu, 6 Jan 2022 15:13:05 -0500 Subject: [PATCH 37/41] libbpf: Add documentation for bpf_map batch operations This adds documention for: - bpf_map_delete_batch() - bpf_map_lookup_batch() - bpf_map_lookup_and_delete_batch() - bpf_map_update_batch() This also updates the public API for the `keys` parameter of `bpf_map_delete_batch()`, and both the `keys` and `values` parameters of `bpf_map_update_batch()` to be constants. 
Signed-off-by: Grant Seltzer Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20220106201304.112675-1-grantseltzer@gmail.com --- tools/lib/bpf/bpf.c | 8 +-- tools/lib/bpf/bpf.h | 115 +++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 117 insertions(+), 6 deletions(-) diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 9b64eed2b003..550b4cbb6c99 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -691,11 +691,11 @@ static int bpf_map_batch_common(int cmd, int fd, void *in_batch, return libbpf_err_errno(ret); } -int bpf_map_delete_batch(int fd, void *keys, __u32 *count, +int bpf_map_delete_batch(int fd, const void *keys, __u32 *count, const struct bpf_map_batch_opts *opts) { return bpf_map_batch_common(BPF_MAP_DELETE_BATCH, fd, NULL, - NULL, keys, NULL, count, opts); + NULL, (void *)keys, NULL, count, opts); } int bpf_map_lookup_batch(int fd, void *in_batch, void *out_batch, void *keys, @@ -715,11 +715,11 @@ int bpf_map_lookup_and_delete_batch(int fd, void *in_batch, void *out_batch, count, opts); } -int bpf_map_update_batch(int fd, void *keys, void *values, __u32 *count, +int bpf_map_update_batch(int fd, const void *keys, const void *values, __u32 *count, const struct bpf_map_batch_opts *opts) { return bpf_map_batch_common(BPF_MAP_UPDATE_BATCH, fd, NULL, NULL, - keys, values, count, opts); + (void *)keys, (void *)values, count, opts); } int bpf_obj_pin(int fd, const char *pathname) diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index 00619f64a040..14e0d97ad2cf 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -254,17 +254,128 @@ struct bpf_map_batch_opts { }; #define bpf_map_batch_opts__last_field flags -LIBBPF_API int bpf_map_delete_batch(int fd, void *keys, + +/** + * @brief **bpf_map_delete_batch()** allows for batch deletion of multiple + * elements in a BPF map. + * + * @param fd BPF map file descriptor + * @param keys pointer to an array of *count* keys + * @param count input and output parameter; on input **count** represents the + * number of elements in the map to delete in batch; + * on output if a non-EFAULT error is returned, **count** represents the number of deleted + * elements if the output **count** value is not equal to the input **count** value + * If EFAULT is returned, **count** should not be trusted to be correct. + * @param opts options for configuring the way the batch deletion works + * @return 0, on success; negative error code, otherwise (errno is also set to + * the error code) + */ +LIBBPF_API int bpf_map_delete_batch(int fd, const void *keys, __u32 *count, const struct bpf_map_batch_opts *opts); + +/** + * @brief **bpf_map_lookup_batch()** allows for batch lookup of BPF map elements. + * + * The parameter *in_batch* is the address of the first element in the batch to read. + * *out_batch* is an output parameter that should be passed as *in_batch* to subsequent + * calls to **bpf_map_lookup_batch()**. NULL can be passed for *in_batch* to indicate + * that the batched lookup starts from the beginning of the map. + * + * The *keys* and *values* are output parameters which must point to memory large enough to + * hold *count* items based on the key and value size of the map *map_fd*. The *keys* + * buffer must be of *key_size* * *count*. The *values* buffer must be of + * *value_size* * *count*. 
+ * + * @param fd BPF map file descriptor + * @param in_batch address of the first element in batch to read, can pass NULL to + * indicate that the batched lookup starts from the beginning of the map. + * @param out_batch output parameter that should be passed to next call as *in_batch* + * @param keys pointer to an array large enough for *count* keys + * @param values pointer to an array large enough for *count* values + * @param count input and output parameter; on input it's the number of elements + * in the map to read in batch; on output it's the number of elements that were + * successfully read. + * If a non-EFAULT error is returned, count will be set as the number of elements + * that were read before the error occurred. + * If EFAULT is returned, **count** should not be trusted to be correct. + * @param opts options for configuring the way the batch lookup works + * @return 0, on success; negative error code, otherwise (errno is also set to + * the error code) + */ LIBBPF_API int bpf_map_lookup_batch(int fd, void *in_batch, void *out_batch, void *keys, void *values, __u32 *count, const struct bpf_map_batch_opts *opts); + +/** + * @brief **bpf_map_lookup_and_delete_batch()** allows for batch lookup and deletion + * of BPF map elements where each element is deleted after being retrieved. + * + * @param fd BPF map file descriptor + * @param in_batch address of the first element in batch to read, can pass NULL to + * get address of the first element in *out_batch* + * @param out_batch output parameter that should be passed to next call as *in_batch* + * @param keys pointer to an array of *count* keys + * @param values pointer to an array large enough for *count* values + * @param count input and output parameter; on input it's the number of elements + * in the map to read and delete in batch; on output it represents the number of + * elements that were successfully read and deleted + * If a non-**EFAULT** error code is returned and if the output **count** value + * is not equal to the input **count** value, up to **count** elements may + * have been deleted. + * if **EFAULT** is returned up to *count* elements may have been deleted without + * being returned via the *keys* and *values* output parameters. + * @param opts options for configuring the way the batch lookup and delete works + * @return 0, on success; negative error code, otherwise (errno is also set to + * the error code) + */ LIBBPF_API int bpf_map_lookup_and_delete_batch(int fd, void *in_batch, void *out_batch, void *keys, void *values, __u32 *count, const struct bpf_map_batch_opts *opts); -LIBBPF_API int bpf_map_update_batch(int fd, void *keys, void *values, + +/** + * @brief **bpf_map_update_batch()** updates multiple elements in a map + * by specifying keys and their corresponding values. + * + * The *keys* and *values* parameters must point to memory large enough + * to hold *count* items based on the key and value size of the map. + * + * The *opts* parameter can be used to control how *bpf_map_update_batch()* + * should handle keys that either do or do not already exist in the map. + * In particular the *flags* parameter of *bpf_map_batch_opts* can be + * one of the following: + * + * Note that *count* is an input and output parameter, where on output it + * represents how many elements were successfully updated. Also note that if + * **EFAULT** then *count* should not be trusted to be correct. + * + * **BPF_ANY** + * Create new elements or update existing. 
+ * + * **BPF_NOEXIST** + * Create new elements only if they do not exist. + * + * **BPF_EXIST** + * Update existing elements. + * + * **BPF_F_LOCK** + * Update spin_lock-ed map elements. This must be + * specified if the map value contains a spinlock. + * + * @param fd BPF map file descriptor + * @param keys pointer to an array of *count* keys + * @param values pointer to an array of *count* values + * @param count input and output parameter; on input it's the number of elements + * in the map to update in batch; on output if a non-EFAULT error is returned, + * **count** represents the number of updated elements if the output **count** + * value is not equal to the input **count** value. + * If EFAULT is returned, **count** should not be trusted to be correct. + * @param opts options for configuring the way the batch update works + * @return 0, on success; negative error code, otherwise (errno is also set to + * the error code) + */ +LIBBPF_API int bpf_map_update_batch(int fd, const void *keys, const void *values, __u32 *count, const struct bpf_map_batch_opts *opts); From 44bab87d8ca6f0544a9f8fc97bdf33aa5b3c899e Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Thu, 6 Jan 2022 12:55:25 -0800 Subject: [PATCH 38/41] bpf/selftests: Test bpf_d_path on rdonly_mem. The second parameter of bpf_d_path() can only accept writable memories. Rdonly_mem obtained from bpf_per_cpu_ptr() can not be passed into bpf_d_path for modification. This patch adds a selftest to verify this behavior. Signed-off-by: Hao Luo Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20220106205525.2116218-1-haoluo@google.com --- .../testing/selftests/bpf/prog_tests/d_path.c | 22 ++++++++++++++- .../bpf/progs/test_d_path_check_rdonly_mem.c | 28 +++++++++++++++++++ 2 files changed, 49 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/progs/test_d_path_check_rdonly_mem.c diff --git a/tools/testing/selftests/bpf/prog_tests/d_path.c b/tools/testing/selftests/bpf/prog_tests/d_path.c index 0a577a248d34..32fc5b3b5cf6 100644 --- a/tools/testing/selftests/bpf/prog_tests/d_path.c +++ b/tools/testing/selftests/bpf/prog_tests/d_path.c @@ -9,6 +9,7 @@ #define MAX_FILES 7 #include "test_d_path.skel.h" +#include "test_d_path_check_rdonly_mem.skel.h" static int duration; @@ -99,7 +100,7 @@ static int trigger_fstat_events(pid_t pid) return ret; } -void test_d_path(void) +static void test_d_path_basic(void) { struct test_d_path__bss *bss; struct test_d_path *skel; @@ -155,3 +156,22 @@ void test_d_path(void) cleanup: test_d_path__destroy(skel); } + +static void test_d_path_check_rdonly_mem(void) +{ + struct test_d_path_check_rdonly_mem *skel; + + skel = test_d_path_check_rdonly_mem__open_and_load(); + ASSERT_ERR_PTR(skel, "unexpected_load_overwriting_rdonly_mem"); + + test_d_path_check_rdonly_mem__destroy(skel); +} + +void test_d_path(void) +{ + if (test__start_subtest("basic")) + test_d_path_basic(); + + if (test__start_subtest("check_rdonly_mem")) + test_d_path_check_rdonly_mem(); +} diff --git a/tools/testing/selftests/bpf/progs/test_d_path_check_rdonly_mem.c b/tools/testing/selftests/bpf/progs/test_d_path_check_rdonly_mem.c new file mode 100644 index 000000000000..27c27cff6a3a --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_d_path_check_rdonly_mem.c @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022 Google */ + +#include "vmlinux.h" +#include +#include + +extern const int bpf_prog_active __ksym; + +SEC("fentry/security_inode_getattr") +int 
BPF_PROG(d_path_check_rdonly_mem, struct path *path, struct kstat *stat, + __u32 request_mask, unsigned int query_flags) +{ + void *active; + __u32 cpu; + + cpu = bpf_get_smp_processor_id(); + active = (void *)bpf_per_cpu_ptr(&bpf_prog_active, cpu); + if (active) { + /* FAIL here! 'active' points to readonly memory. bpf helpers + * that update its arguments can not write into it. + */ + bpf_d_path(path, active, sizeof(int)); + } + return 0; +} + +char _license[] SEC("license") = "GPL"; From 91a760b26926265a60c77ddf016529bcf3e17a04 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Thu, 6 Jan 2022 21:20:20 +0800 Subject: [PATCH 39/41] net: bpf: Handle return value of BPF_CGROUP_RUN_PROG_INET{4,6}_POST_BIND() The return value of BPF_CGROUP_RUN_PROG_INET{4,6}_POST_BIND() in __inet_bind() is not handled properly. When the return value is non-zero, it will set inet_saddr and inet_rcv_saddr to 0 and exit: err = BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk); if (err) { inet->inet_saddr = inet->inet_rcv_saddr = 0; goto out_release_sock; } Let's take UDP as an example and see what will happen. A UDP socket will be added to 'udp_prot.h.udp_table->hash' and 'udp_prot.h.udp_table->hash2' after sk->sk_prot->get_port() succeeds. If 'inet->inet_rcv_saddr' is specified here, then 'sk' will be in an 'hslot2' of 'hash2' that it doesn't belong to (because inet_saddr is changed to 0), and received UDP packets will not be passed to this sock. If 'inet->inet_rcv_saddr' is not specified here, the sock will work fine, as it can receive packets properly, which is weird, as the 'bind()' has already failed. To undo the get_port() operation, introduce the 'put_port' field for 'struct proto'. For TCP proto, it is inet_put_port(); for UDP proto, it is udp_lib_unhash(); for icmp proto, it is ping_unhash(). Therefore, after a sys_bind() failure caused by BPF_CGROUP_RUN_PROG_INET4_POST_BIND(), the socket will be unbound, which means that it can try to bind to another port.
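To make the effect of put_port concrete, here is a minimal userspace sketch of the sequence this change fixes (and which the selftest added later in this series exercises). The assumption that an attached BPF_CGROUP_INET4_POST_BIND program rejects port 4098, as well as the port numbers themselves, are hypothetical.

#include <arpa/inet.h>
#include <errno.h>
#include <sys/socket.h>
#include <unistd.h>

static int bind_with_retry(void)
{
	struct sockaddr_in addr = {
		.sin_family = AF_INET,
		.sin_addr.s_addr = htonl(INADDR_LOOPBACK),
		.sin_port = htons(4098),	/* assumed to be rejected by BPF */
	};
	int fd, err;

	fd = socket(AF_INET, SOCK_DGRAM, 0);
	if (fd < 0)
		return -1;

	err = bind(fd, (struct sockaddr *)&addr, sizeof(addr));
	if (err && errno == EPERM) {
		/* Rejected by the cgroup BPF program. Before this patch the
		 * socket stayed hashed on the rejected port with its source
		 * address zeroed; with put_port it is fully unbound, so a
		 * retry on another port can succeed.
		 */
		addr.sin_port = htons(5000);
		err = bind(fd, (struct sockaddr *)&addr, sizeof(addr));
	}

	if (err) {
		close(fd);
		return -1;
	}
	return fd;
}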
Signed-off-by: Menglong Dong Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220106132022.3470772-2-imagedong@tencent.com --- include/net/sock.h | 1 + net/ipv4/af_inet.c | 2 ++ net/ipv4/ping.c | 1 + net/ipv4/tcp_ipv4.c | 1 + net/ipv4/udp.c | 1 + net/ipv6/af_inet6.c | 2 ++ net/ipv6/ping.c | 1 + net/ipv6/tcp_ipv6.c | 1 + net/ipv6/udp.c | 1 + 9 files changed, 11 insertions(+) diff --git a/include/net/sock.h b/include/net/sock.h index 7b4b4237e6e0..ff9b508d9c5f 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1209,6 +1209,7 @@ struct proto { void (*unhash)(struct sock *sk); void (*rehash)(struct sock *sk); int (*get_port)(struct sock *sk, unsigned short snum); + void (*put_port)(struct sock *sk); #ifdef CONFIG_BPF_SYSCALL int (*psock_update_sk_prot)(struct sock *sk, struct sk_psock *psock, diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index f53184767ee7..9c465bac1eb0 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -531,6 +531,8 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, err = BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk); if (err) { inet->inet_saddr = inet->inet_rcv_saddr = 0; + if (sk->sk_prot->put_port) + sk->sk_prot->put_port(sk); goto out_release_sock; } } diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index e540b0dcf085..0e56df3a45e2 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -994,6 +994,7 @@ struct proto ping_prot = { .hash = ping_hash, .unhash = ping_unhash, .get_port = ping_get_port, + .put_port = ping_unhash, .obj_size = sizeof(struct inet_sock), }; EXPORT_SYMBOL(ping_prot); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ac10e4cdd8d0..9861786b8336 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -3076,6 +3076,7 @@ struct proto tcp_prot = { .hash = inet_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, + .put_port = inet_put_port, #ifdef CONFIG_BPF_SYSCALL .psock_update_sk_prot = tcp_bpf_update_proto, #endif diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 7b18a6f42f18..c2a4411d2b04 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2927,6 +2927,7 @@ struct proto udp_prot = { .unhash = udp_lib_unhash, .rehash = udp_v4_rehash, .get_port = udp_v4_get_port, + .put_port = udp_lib_unhash, #ifdef CONFIG_BPF_SYSCALL .psock_update_sk_prot = udp_bpf_update_proto, #endif diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index d1636425654e..8fe7900f1949 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -413,6 +413,8 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, if (err) { sk->sk_ipv6only = saved_ipv6only; inet_reset_saddr(sk); + if (sk->sk_prot->put_port) + sk->sk_prot->put_port(sk); goto out; } } diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index 6ac88fe24a8e..9256f6ba87ef 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -177,6 +177,7 @@ struct proto pingv6_prot = { .hash = ping_hash, .unhash = ping_unhash, .get_port = ping_get_port, + .put_port = ping_unhash, .obj_size = sizeof(struct raw6_sock), }; EXPORT_SYMBOL_GPL(pingv6_prot); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 1ac243d18c2b..075ee8a2df3b 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2181,6 +2181,7 @@ struct proto tcpv6_prot = { .hash = inet6_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, + .put_port = inet_put_port, #ifdef CONFIG_BPF_SYSCALL .psock_update_sk_prot = tcp_bpf_update_proto, #endif diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 1accc06abc54..90718a924ca8 100644 --- a/net/ipv6/udp.c 
+++ b/net/ipv6/udp.c @@ -1732,6 +1732,7 @@ struct proto udpv6_prot = { .unhash = udp_lib_unhash, .rehash = udp_v6_rehash, .get_port = udp_v6_get_port, + .put_port = udp_lib_unhash, #ifdef CONFIG_BPF_SYSCALL .psock_update_sk_prot = udp_bpf_update_proto, #endif From 6fd92c7f0c3846340fee20f62dacb17d0a15c0d3 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Thu, 6 Jan 2022 21:20:21 +0800 Subject: [PATCH 40/41] bpf: selftests: Use C99 initializers in test_sock.c Use C99 initializers for the initialization of 'tests' in test_sock.c. Signed-off-by: Menglong Dong Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220106132022.3470772-3-imagedong@tencent.com --- tools/testing/selftests/bpf/test_sock.c | 220 ++++++++++-------------- 1 file changed, 92 insertions(+), 128 deletions(-) diff --git a/tools/testing/selftests/bpf/test_sock.c b/tools/testing/selftests/bpf/test_sock.c index e8edd3dd3ec2..94f9b126f5ed 100644 --- a/tools/testing/selftests/bpf/test_sock.c +++ b/tools/testing/selftests/bpf/test_sock.c @@ -46,7 +46,7 @@ struct sock_test { static struct sock_test tests[] = { { - "bind4 load with invalid access: src_ip6", + .descr = "bind4 load with invalid access: src_ip6", .insns = { BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, @@ -54,16 +54,12 @@ static struct sock_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 1), BPF_EXIT_INSN(), }, - BPF_CGROUP_INET4_POST_BIND, - BPF_CGROUP_INET4_POST_BIND, - 0, - 0, - NULL, - 0, - LOAD_REJECT, + .expected_attach_type = BPF_CGROUP_INET4_POST_BIND, + .attach_type = BPF_CGROUP_INET4_POST_BIND, + .result = LOAD_REJECT, }, { - "bind4 load with invalid access: mark", + .descr = "bind4 load with invalid access: mark", .insns = { BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, @@ -71,16 +67,12 @@ static struct sock_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 1), BPF_EXIT_INSN(), }, - BPF_CGROUP_INET4_POST_BIND, - BPF_CGROUP_INET4_POST_BIND, - 0, - 0, - NULL, - 0, - LOAD_REJECT, + .expected_attach_type = BPF_CGROUP_INET4_POST_BIND, + .attach_type = BPF_CGROUP_INET4_POST_BIND, + .result = LOAD_REJECT, }, { - "bind6 load with invalid access: src_ip4", + .descr = "bind6 load with invalid access: src_ip4", .insns = { BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, @@ -88,16 +80,12 @@ static struct sock_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 1), BPF_EXIT_INSN(), }, - BPF_CGROUP_INET6_POST_BIND, - BPF_CGROUP_INET6_POST_BIND, - 0, - 0, - NULL, - 0, - LOAD_REJECT, + .expected_attach_type = BPF_CGROUP_INET6_POST_BIND, + .attach_type = BPF_CGROUP_INET6_POST_BIND, + .result = LOAD_REJECT, }, { - "sock_create load with invalid access: src_port", + .descr = "sock_create load with invalid access: src_port", .insns = { BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, @@ -105,128 +93,106 @@ static struct sock_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 1), BPF_EXIT_INSN(), }, - BPF_CGROUP_INET_SOCK_CREATE, - BPF_CGROUP_INET_SOCK_CREATE, - 0, - 0, - NULL, - 0, - LOAD_REJECT, + .expected_attach_type = BPF_CGROUP_INET_SOCK_CREATE, + .attach_type = BPF_CGROUP_INET_SOCK_CREATE, + .result = LOAD_REJECT, }, { - "sock_create load w/o expected_attach_type (compat mode)", + .descr = "sock_create load w/o expected_attach_type (compat mode)", .insns = { BPF_MOV64_IMM(BPF_REG_0, 1), BPF_EXIT_INSN(), }, - 0, - BPF_CGROUP_INET_SOCK_CREATE, - AF_INET, - SOCK_STREAM, - "127.0.0.1", - 8097, - SUCCESS, + .expected_attach_type = 0, + .attach_type = 
BPF_CGROUP_INET_SOCK_CREATE, + .domain = AF_INET, + .type = SOCK_STREAM, + .ip = "127.0.0.1", + .port = 8097, + .result = SUCCESS, }, { - "sock_create load w/ expected_attach_type", + .descr = "sock_create load w/ expected_attach_type", .insns = { BPF_MOV64_IMM(BPF_REG_0, 1), BPF_EXIT_INSN(), }, - BPF_CGROUP_INET_SOCK_CREATE, - BPF_CGROUP_INET_SOCK_CREATE, - AF_INET, - SOCK_STREAM, - "127.0.0.1", - 8097, - SUCCESS, + .expected_attach_type = BPF_CGROUP_INET_SOCK_CREATE, + .attach_type = BPF_CGROUP_INET_SOCK_CREATE, + .domain = AF_INET, + .type = SOCK_STREAM, + .ip = "127.0.0.1", + .port = 8097, + .result = SUCCESS, }, { - "attach type mismatch bind4 vs bind6", + .descr = "attach type mismatch bind4 vs bind6", .insns = { BPF_MOV64_IMM(BPF_REG_0, 1), BPF_EXIT_INSN(), }, - BPF_CGROUP_INET4_POST_BIND, - BPF_CGROUP_INET6_POST_BIND, - 0, - 0, - NULL, - 0, - ATTACH_REJECT, + .expected_attach_type = BPF_CGROUP_INET4_POST_BIND, + .attach_type = BPF_CGROUP_INET6_POST_BIND, + .result = ATTACH_REJECT, }, { - "attach type mismatch bind6 vs bind4", + .descr = "attach type mismatch bind6 vs bind4", .insns = { BPF_MOV64_IMM(BPF_REG_0, 1), BPF_EXIT_INSN(), }, - BPF_CGROUP_INET6_POST_BIND, - BPF_CGROUP_INET4_POST_BIND, - 0, - 0, - NULL, - 0, - ATTACH_REJECT, + .expected_attach_type = BPF_CGROUP_INET6_POST_BIND, + .attach_type = BPF_CGROUP_INET4_POST_BIND, + .result = ATTACH_REJECT, }, { - "attach type mismatch default vs bind4", + .descr = "attach type mismatch default vs bind4", .insns = { BPF_MOV64_IMM(BPF_REG_0, 1), BPF_EXIT_INSN(), }, - 0, - BPF_CGROUP_INET4_POST_BIND, - 0, - 0, - NULL, - 0, - ATTACH_REJECT, + .expected_attach_type = 0, + .attach_type = BPF_CGROUP_INET4_POST_BIND, + .result = ATTACH_REJECT, }, { - "attach type mismatch bind6 vs sock_create", + .descr = "attach type mismatch bind6 vs sock_create", .insns = { BPF_MOV64_IMM(BPF_REG_0, 1), BPF_EXIT_INSN(), }, - BPF_CGROUP_INET6_POST_BIND, - BPF_CGROUP_INET_SOCK_CREATE, - 0, - 0, - NULL, - 0, - ATTACH_REJECT, + .expected_attach_type = BPF_CGROUP_INET6_POST_BIND, + .attach_type = BPF_CGROUP_INET_SOCK_CREATE, + .result = ATTACH_REJECT, }, { - "bind4 reject all", + .descr = "bind4 reject all", .insns = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - BPF_CGROUP_INET4_POST_BIND, - BPF_CGROUP_INET4_POST_BIND, - AF_INET, - SOCK_STREAM, - "0.0.0.0", - 0, - BIND_REJECT, + .expected_attach_type = BPF_CGROUP_INET4_POST_BIND, + .attach_type = BPF_CGROUP_INET4_POST_BIND, + .domain = AF_INET, + .type = SOCK_STREAM, + .ip = "0.0.0.0", + .result = BIND_REJECT, }, { - "bind6 reject all", + .descr = "bind6 reject all", .insns = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - BPF_CGROUP_INET6_POST_BIND, - BPF_CGROUP_INET6_POST_BIND, - AF_INET6, - SOCK_STREAM, - "::", - 0, - BIND_REJECT, + .expected_attach_type = BPF_CGROUP_INET6_POST_BIND, + .attach_type = BPF_CGROUP_INET6_POST_BIND, + .domain = AF_INET6, + .type = SOCK_STREAM, + .ip = "::", + .result = BIND_REJECT, }, { - "bind6 deny specific IP & port", + .descr = "bind6 deny specific IP & port", .insns = { BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), @@ -247,16 +213,16 @@ static struct sock_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 1), BPF_EXIT_INSN(), }, - BPF_CGROUP_INET6_POST_BIND, - BPF_CGROUP_INET6_POST_BIND, - AF_INET6, - SOCK_STREAM, - "::1", - 8193, - BIND_REJECT, + .expected_attach_type = BPF_CGROUP_INET6_POST_BIND, + .attach_type = BPF_CGROUP_INET6_POST_BIND, + .domain = AF_INET6, + .type = SOCK_STREAM, + .ip = "::1", + .port = 8193, + .result = BIND_REJECT, }, { - "bind4 allow specific IP & 
port", + .descr = "bind4 allow specific IP & port", .insns = { BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), @@ -277,41 +243,39 @@ static struct sock_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - BPF_CGROUP_INET4_POST_BIND, - BPF_CGROUP_INET4_POST_BIND, - AF_INET, - SOCK_STREAM, - "127.0.0.1", - 4098, - SUCCESS, + .expected_attach_type = BPF_CGROUP_INET4_POST_BIND, + .attach_type = BPF_CGROUP_INET4_POST_BIND, + .domain = AF_INET, + .type = SOCK_STREAM, + .ip = "127.0.0.1", + .port = 4098, + .result = SUCCESS, }, { - "bind4 allow all", + .descr = "bind4 allow all", .insns = { BPF_MOV64_IMM(BPF_REG_0, 1), BPF_EXIT_INSN(), }, - BPF_CGROUP_INET4_POST_BIND, - BPF_CGROUP_INET4_POST_BIND, - AF_INET, - SOCK_STREAM, - "0.0.0.0", - 0, - SUCCESS, + .expected_attach_type = BPF_CGROUP_INET4_POST_BIND, + .attach_type = BPF_CGROUP_INET4_POST_BIND, + .domain = AF_INET, + .type = SOCK_STREAM, + .ip = "0.0.0.0", + .result = SUCCESS, }, { - "bind6 allow all", + .descr = "bind6 allow all", .insns = { BPF_MOV64_IMM(BPF_REG_0, 1), BPF_EXIT_INSN(), }, - BPF_CGROUP_INET6_POST_BIND, - BPF_CGROUP_INET6_POST_BIND, - AF_INET6, - SOCK_STREAM, - "::", - 0, - SUCCESS, + .expected_attach_type = BPF_CGROUP_INET6_POST_BIND, + .attach_type = BPF_CGROUP_INET6_POST_BIND, + .domain = AF_INET6, + .type = SOCK_STREAM, + .ip = "::", + .result = SUCCESS, }, }; From f7342481749365d9ac5f24fb971659a64e045bb5 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Thu, 6 Jan 2022 21:20:22 +0800 Subject: [PATCH 41/41] bpf: selftests: Add bind retry for post_bind{4, 6} With previous patch, kernel is able to 'put_port' after sys_bind() fails. Add the test for that case: rebind another port after sys_bind() fails. If the bind success, it means previous bind operation is already undoed. Signed-off-by: Menglong Dong Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220106132022.3470772-4-imagedong@tencent.com --- tools/testing/selftests/bpf/test_sock.c | 150 ++++++++++++++++++++---- 1 file changed, 130 insertions(+), 20 deletions(-) diff --git a/tools/testing/selftests/bpf/test_sock.c b/tools/testing/selftests/bpf/test_sock.c index 94f9b126f5ed..fe10f8134278 100644 --- a/tools/testing/selftests/bpf/test_sock.c +++ b/tools/testing/selftests/bpf/test_sock.c @@ -35,12 +35,15 @@ struct sock_test { /* Endpoint to bind() to */ const char *ip; unsigned short port; + unsigned short port_retry; /* Expected test result */ enum { LOAD_REJECT, ATTACH_REJECT, BIND_REJECT, SUCCESS, + RETRY_SUCCESS, + RETRY_REJECT } result; }; @@ -251,6 +254,99 @@ static struct sock_test tests[] = { .port = 4098, .result = SUCCESS, }, + { + .descr = "bind4 deny specific IP & port of TCP, and retry", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + + /* if (ip == expected && port == expected) */ + BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, + offsetof(struct bpf_sock, src_ip4)), + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, + __bpf_constant_ntohl(0x7F000001), 4), + BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, + offsetof(struct bpf_sock, src_port)), + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 0x1002, 2), + + /* return DENY; */ + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_JMP_A(1), + + /* else return ALLOW; */ + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .expected_attach_type = BPF_CGROUP_INET4_POST_BIND, + .attach_type = BPF_CGROUP_INET4_POST_BIND, + .domain = AF_INET, + .type = SOCK_STREAM, + .ip = "127.0.0.1", + .port = 4098, + .port_retry = 5000, + .result = RETRY_SUCCESS, + }, + { + .descr = "bind4 deny specific IP & port of UDP, and retry", + .insns = { + 
BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + + /* if (ip == expected && port == expected) */ + BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, + offsetof(struct bpf_sock, src_ip4)), + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, + __bpf_constant_ntohl(0x7F000001), 4), + BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, + offsetof(struct bpf_sock, src_port)), + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 0x1002, 2), + + /* return DENY; */ + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_JMP_A(1), + + /* else return ALLOW; */ + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .expected_attach_type = BPF_CGROUP_INET4_POST_BIND, + .attach_type = BPF_CGROUP_INET4_POST_BIND, + .domain = AF_INET, + .type = SOCK_DGRAM, + .ip = "127.0.0.1", + .port = 4098, + .port_retry = 5000, + .result = RETRY_SUCCESS, + }, + { + .descr = "bind6 deny specific IP & port, and retry", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + + /* if (ip == expected && port == expected) */ + BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, + offsetof(struct bpf_sock, src_ip6[3])), + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, + __bpf_constant_ntohl(0x00000001), 4), + BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, + offsetof(struct bpf_sock, src_port)), + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 0x2001, 2), + + /* return DENY; */ + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_JMP_A(1), + + /* else return ALLOW; */ + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .expected_attach_type = BPF_CGROUP_INET6_POST_BIND, + .attach_type = BPF_CGROUP_INET6_POST_BIND, + .domain = AF_INET6, + .type = SOCK_STREAM, + .ip = "::1", + .port = 8193, + .port_retry = 9000, + .result = RETRY_SUCCESS, + }, { .descr = "bind4 allow all", .insns = { @@ -315,14 +411,15 @@ static int attach_sock_prog(int cgfd, int progfd, return bpf_prog_attach(progfd, cgfd, attach_type, BPF_F_ALLOW_OVERRIDE); } -static int bind_sock(int domain, int type, const char *ip, unsigned short port) +static int bind_sock(int domain, int type, const char *ip, + unsigned short port, unsigned short port_retry) { struct sockaddr_storage addr; struct sockaddr_in6 *addr6; struct sockaddr_in *addr4; int sockfd = -1; socklen_t len; - int err = 0; + int res = SUCCESS; sockfd = socket(domain, type, 0); if (sockfd < 0) @@ -348,21 +445,44 @@ static int bind_sock(int domain, int type, const char *ip, unsigned short port) goto err; } - if (bind(sockfd, (const struct sockaddr *)&addr, len) == -1) - goto err; + if (bind(sockfd, (const struct sockaddr *)&addr, len) == -1) { + /* sys_bind() may fail for different reasons, errno has to be + * checked to confirm that BPF program rejected it. + */ + if (errno != EPERM) + goto err; + if (port_retry) + goto retry; + res = BIND_REJECT; + goto out; + } + goto out; +retry: + if (domain == AF_INET) + addr4->sin_port = htons(port_retry); + else + addr6->sin6_port = htons(port_retry); + if (bind(sockfd, (const struct sockaddr *)&addr, len) == -1) { + if (errno != EPERM) + goto err; + res = RETRY_REJECT; + } else { + res = RETRY_SUCCESS; + } goto out; err: - err = -1; + res = -1; out: close(sockfd); - return err; + return res; } static int run_test_case(int cgfd, const struct sock_test *test) { int progfd = -1; int err = 0; + int res; printf("Test case: %s .. ", test->descr); progfd = load_sock_prog(test->insns, test->expected_attach_type); @@ -380,21 +500,11 @@ static int run_test_case(int cgfd, const struct sock_test *test) goto err; } - if (bind_sock(test->domain, test->type, test->ip, test->port) == -1) { - /* sys_bind() may fail for different reasons, errno has to be - * checked to confirm that BPF program rejected it. 
- */ - if (test->result == BIND_REJECT && errno == EPERM) - goto out; - else - goto err; - } + res = bind_sock(test->domain, test->type, test->ip, test->port, + test->port_retry); + if (res > 0 && test->result == res) + goto out; - - if (test->result != SUCCESS) - goto err; - - goto out; err: err = -1; out: