From b22bf1b9979a608827dea98c61ed9ec297bcc513 Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Tue, 4 Jan 2022 18:59:29 +0100 Subject: [PATCH 01/41] bpftool: Refactor misc. feature probe There is currently a single miscellaneous feature probe, HAVE_LARGE_INSN_LIMIT, to check for the 1M instructions limit in the verifier. Subsequent patches will add additional miscellaneous probes, which follow the same pattern at the existing probe. This patch therefore refactors the probe to avoid code duplication in subsequent patches. The BPF program type and the checked error numbers in the HAVE_LARGE_INSN_LIMIT probe are changed to better generalize to other probes. The feature probe retains its current behavior despite those changes. Signed-off-by: Paul Chaignon Signed-off-by: Daniel Borkmann Reviewed-by: Quentin Monnet Link: https://lore.kernel.org/bpf/956c9329a932c75941194f91790d01f31dfbe01b.1641314075.git.paul@isovalent.com --- tools/bpf/bpftool/feature.c | 45 ++++++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 16 deletions(-) diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c index 6719b9282eca..3da97a02f455 100644 --- a/tools/bpf/bpftool/feature.c +++ b/tools/bpf/bpftool/feature.c @@ -642,6 +642,30 @@ probe_helpers_for_progtype(enum bpf_prog_type prog_type, bool supported_type, printf("\n"); } +static void +probe_misc_feature(struct bpf_insn *insns, size_t len, + const char *define_prefix, __u32 ifindex, + const char *feat_name, const char *plain_name, + const char *define_name) +{ + LIBBPF_OPTS(bpf_prog_load_opts, opts, + .prog_ifindex = ifindex, + ); + bool res; + int fd; + + errno = 0; + fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, NULL, "GPL", + insns, len, &opts); + res = fd >= 0 || !errno; + + if (fd >= 0) + close(fd); + + print_bool_feature(feat_name, plain_name, define_name, res, + define_prefix); +} + /* * Probe for availability of kernel commit (5.3): * @@ -649,29 +673,18 @@ probe_helpers_for_progtype(enum bpf_prog_type prog_type, bool supported_type, */ static void probe_large_insn_limit(const char *define_prefix, __u32 ifindex) { - LIBBPF_OPTS(bpf_prog_load_opts, opts, - .prog_ifindex = ifindex, - ); struct bpf_insn insns[BPF_MAXINSNS + 1]; - bool res; - int i, fd; + int i; for (i = 0; i < BPF_MAXINSNS; i++) insns[i] = BPF_MOV64_IMM(BPF_REG_0, 1); insns[BPF_MAXINSNS] = BPF_EXIT_INSN(); - errno = 0; - fd = bpf_prog_load(BPF_PROG_TYPE_SCHED_CLS, NULL, "GPL", - insns, ARRAY_SIZE(insns), &opts); - res = fd >= 0 || (errno != E2BIG && errno != EINVAL); - - if (fd >= 0) - close(fd); - - print_bool_feature("have_large_insn_limit", + probe_misc_feature(insns, ARRAY_SIZE(insns), + define_prefix, ifindex, + "have_large_insn_limit", "Large program size limit", - "LARGE_INSN_LIMIT", - res, define_prefix); + "LARGE_INSN_LIMIT"); } static void From c04fb2b0bd9275969be3b0a95f9c3ef76b1bfb73 Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Tue, 4 Jan 2022 18:59:57 +0100 Subject: [PATCH 02/41] bpftool: Probe for bounded loop support This patch introduces a new probe to check whether the verifier supports bounded loops as introduced in commit 2589726d12a1 ("bpf: introduce bounded loops"). This patch will allow BPF users such as Cilium to probe for loop support on startup and only unconditionally unroll loops on older kernels. The results are displayed as part of the miscellaneous section, as shown below. 
$ bpftool feature probe | grep loops Bounded loop support is available $ bpftool feature probe macro | grep LOOPS #define HAVE_BOUNDED_LOOPS $ bpftool feature probe -j | jq .misc { "have_large_insn_limit": true, "have_bounded_loops": true } Signed-off-by: Paul Chaignon Signed-off-by: Daniel Borkmann Reviewed-by: Quentin Monnet Link: https://lore.kernel.org/bpf/f7807c0b27d79f48e71de7b5a99c680ca4bd0151.1641314075.git.paul@isovalent.com --- tools/bpf/bpftool/feature.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c index 3da97a02f455..03579d113042 100644 --- a/tools/bpf/bpftool/feature.c +++ b/tools/bpf/bpftool/feature.c @@ -687,6 +687,27 @@ static void probe_large_insn_limit(const char *define_prefix, __u32 ifindex) "LARGE_INSN_LIMIT"); } +/* + * Probe for bounded loop support introduced in commit 2589726d12a1 + * ("bpf: introduce bounded loops"). + */ +static void +probe_bounded_loops(const char *define_prefix, __u32 ifindex) +{ + struct bpf_insn insns[4] = { + BPF_MOV64_IMM(BPF_REG_0, 10), + BPF_ALU64_IMM(BPF_SUB, BPF_REG_0, 1), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, -2), + BPF_EXIT_INSN() + }; + + probe_misc_feature(insns, ARRAY_SIZE(insns), + define_prefix, ifindex, + "have_bounded_loops", + "Bounded loop support", + "BOUNDED_LOOPS"); +} + static void section_system_config(enum probe_component target, const char *define_prefix) { @@ -801,6 +822,7 @@ static void section_misc(const char *define_prefix, __u32 ifindex) "/*** eBPF misc features ***/", define_prefix); probe_large_insn_limit(define_prefix, ifindex); + probe_bounded_loops(define_prefix, ifindex); print_end_section(); } From 0fd800b2456cf90ed738a1260b53acaa8843b5ae Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Tue, 4 Jan 2022 19:00:13 +0100 Subject: [PATCH 03/41] bpftool: Probe for instruction set extensions This patch introduces new probes to check whether the kernel supports instruction set extensions v2 and v3. The first introduced eBPF instructions BPF_J{LT,LE,SLT,SLE} in commit 92b31a9af73b ("bpf: add BPF_J{LT,LE,SLT,SLE} instructions"). The second introduces 32-bit variants of all jump instructions in commit 092ed0968bb6 ("bpf: verifier support JMP32"). These probes are useful for userspace BPF projects that want to use newer instruction set extensions on newer kernels, to reduce the programs' sizes or their complexity. LLVM already provides an mcpu=probe option to automatically probe the kernel and select the newest-supported instruction set extension. That is however not flexible enough for all use cases. For example, in Cilium, we only want to use the v3 instruction set extension on v5.10+, even though it is supported on all kernels v5.1+. Signed-off-by: Paul Chaignon Signed-off-by: Daniel Borkmann Reviewed-by: Quentin Monnet Link: https://lore.kernel.org/bpf/3bfedcd9898c1f41ac67ca61f144fec84c6c3a92.1641314075.git.paul@isovalent.com --- tools/bpf/bpftool/feature.c | 44 +++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c index 03579d113042..e999159fa28d 100644 --- a/tools/bpf/bpftool/feature.c +++ b/tools/bpf/bpftool/feature.c @@ -708,6 +708,48 @@ probe_bounded_loops(const char *define_prefix, __u32 ifindex) "BOUNDED_LOOPS"); } +/* + * Probe for the v2 instruction set extension introduced in commit 92b31a9af73b + * ("bpf: add BPF_J{LT,LE,SLT,SLE} instructions"). 
+ */ +static void +probe_v2_isa_extension(const char *define_prefix, __u32 ifindex) +{ + struct bpf_insn insns[4] = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_JMP_IMM(BPF_JLT, BPF_REG_0, 0, 1), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN() + }; + + probe_misc_feature(insns, ARRAY_SIZE(insns), + define_prefix, ifindex, + "have_v2_isa_extension", + "ISA extension v2", + "V2_ISA_EXTENSION"); +} + +/* + * Probe for the v3 instruction set extension introduced in commit 092ed0968bb6 + * ("bpf: verifier support JMP32"). + */ +static void +probe_v3_isa_extension(const char *define_prefix, __u32 ifindex) +{ + struct bpf_insn insns[4] = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_JMP32_IMM(BPF_JLT, BPF_REG_0, 0, 1), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN() + }; + + probe_misc_feature(insns, ARRAY_SIZE(insns), + define_prefix, ifindex, + "have_v3_isa_extension", + "ISA extension v3", + "V3_ISA_EXTENSION"); +} + static void section_system_config(enum probe_component target, const char *define_prefix) { @@ -823,6 +865,8 @@ static void section_misc(const char *define_prefix, __u32 ifindex) define_prefix); probe_large_insn_limit(define_prefix, ifindex); probe_bounded_loops(define_prefix, ifindex); + probe_v2_isa_extension(define_prefix, ifindex); + probe_v3_isa_extension(define_prefix, ifindex); print_end_section(); } From 5e22dd18626726028a93ff1350a8a71a00fd843d Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 4 Jan 2022 13:10:30 +0100 Subject: [PATCH 04/41] bpf/selftests: Fix namespace mount setup in tc_redirect The tc_redirect umounts /sys in the new namespace, which can be mounted as shared and cause global umount. The lazy umount also takes down mounted trees under /sys like debugfs, which won't be available after sysfs mounts again and could cause fails in other tests. # cat /proc/self/mountinfo | grep debugfs 34 23 0:7 / /sys/kernel/debug rw,nosuid,nodev,noexec,relatime shared:14 - debugfs debugfs rw # cat /proc/self/mountinfo | grep sysfs 23 86 0:22 / /sys rw,nosuid,nodev,noexec,relatime shared:2 - sysfs sysfs rw # mount | grep debugfs debugfs on /sys/kernel/debug type debugfs (rw,nosuid,nodev,noexec,relatime) # ./test_progs -t tc_redirect #164 tc_redirect:OK Summary: 1/4 PASSED, 0 SKIPPED, 0 FAILED # mount | grep debugfs # cat /proc/self/mountinfo | grep debugfs # cat /proc/self/mountinfo | grep sysfs 25 86 0:22 / /sys rw,relatime shared:2 - sysfs sysfs rw Making the sysfs private under the new namespace so the umount won't trigger the global sysfs umount. Reported-by: Hangbin Liu Signed-off-by: Jiri Olsa Signed-off-by: Daniel Borkmann Cc: Jussi Maki Link: https://lore.kernel.org/bpf/20220104121030.138216-1-jolsa@kernel.org --- tools/testing/selftests/bpf/prog_tests/tc_redirect.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c index 4b18b73df10b..c2426df58e17 100644 --- a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c +++ b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c @@ -105,6 +105,13 @@ static int setns_by_fd(int nsfd) if (!ASSERT_OK(err, "unshare")) return err; + /* Make our /sys mount private, so the following umount won't + * trigger the global umount in case it's shared. 
+ */ + err = mount("none", "/sys", NULL, MS_PRIVATE, NULL); + if (!ASSERT_OK(err, "remount private /sys")) + return err; + err = umount2("/sys", MNT_DETACH); if (!ASSERT_OK(err, "umount2 /sys")) return err; From e4a41c2c1fa916547e63440c73a51a5eb06247af Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Fri, 31 Dec 2021 23:10:18 +0800 Subject: [PATCH 05/41] bpf, arm64: Use emit_addr_mov_i64() for BPF_PSEUDO_FUNC The following error is reported when running "./test_progs -t for_each" under arm64: bpf_jit: multi-func JIT bug 58 != 56 [...] JIT doesn't support bpf-to-bpf calls The root cause is the size of BPF_PSEUDO_FUNC instruction increases from 2 to 3 after the address of called bpf-function is settled and there are two bpf-to-bpf calls in test_pkt_access. The generated instructions are shown below: 0x48: 21 00 C0 D2 movz x1, #0x1, lsl #32 0x4c: 21 00 80 F2 movk x1, #0x1 0x48: E1 3F C0 92 movn x1, #0x1ff, lsl #32 0x4c: 41 FE A2 F2 movk x1, #0x17f2, lsl #16 0x50: 81 70 9F F2 movk x1, #0xfb84 Fixing it by using emit_addr_mov_i64() for BPF_PSEUDO_FUNC, so the size of jited image will not change. Fixes: 69c087ba6225 ("bpf: Add bpf_for_each_map_elem() helper") Signed-off-by: Hou Tao Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20211231151018.3781550-1-houtao1@huawei.com --- arch/arm64/net/bpf_jit_comp.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 07aad85848fa..e96d4d87291f 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -792,7 +792,10 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, u64 imm64; imm64 = (u64)insn1.imm << 32 | (u32)imm; - emit_a64_mov_i64(dst, imm64, ctx); + if (bpf_pseudo_func(insn)) + emit_addr_mov_i64(dst, imm64, ctx); + else + emit_a64_mov_i64(dst, imm64, ctx); return 1; } From 5b2c5540b8110eea0d67a78fb0ddb9654c58daeb Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 4 Jan 2022 12:59:18 -0800 Subject: [PATCH 06/41] bpf, sockmap: Fix return codes from tcp_bpf_recvmsg_parser() Applications can be confused slightly because we do not always return the same error code as expected, e.g. what the TCP stack normally returns. For example on a sock err sk->sk_err instead of returning the sock_error we return EAGAIN. This usually means the application will 'try again' instead of aborting immediately. Another example, when a shutdown event is received we should immediately abort instead of waiting for data when the user provides a timeout. These tend to not be fatal, applications usually recover, but introduces bogus errors to the user or introduces unexpected latency. Before 'c5d2177a72a16' we fell back to the TCP stack when no data was available so we managed to catch many of the cases here, although with the extra latency cost of calling tcp_msg_wait_data() first. To fix lets duplicate the error handling in TCP stack into tcp_bpf so that we get the same error codes. These were found in our CI tests that run applications against sockmap and do longer lived testing, at least compared to test_sockmap that does short-lived ping/pong tests, and in some of our test clusters we deploy. Its non-trivial to do these in a shorter form CI tests that would be appropriate for BPF selftests, but we are looking into it so we can ensure this keeps working going forward. As a preview one idea is to pull in the packetdrill testing which catches some of this. 
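From the application side the problem shows up in the return value of recvmsg() on a sockmap socket. The sketch below is illustrative only (not part of the patch) and shows the kind of read loop that gets confused when a peer shutdown is reported as EAGAIN instead of 0; the function name drain() is made up for the example and it assumes <errno.h> and <sys/socket.h>.

/* Illustrative: read until EOF on a socket attached to a sockmap.
 * Before this fix a peer shutdown could surface as EAGAIN (after the
 * receive timeout ran out) instead of 0, so a loop like this keeps
 * retrying instead of terminating cleanly.
 */
static ssize_t drain(int fd, char *buf, size_t len)
{
	for (;;) {
		ssize_t n = recv(fd, buf, len, 0);

		if (n > 0)
			continue;	/* got data, keep reading */
		if (n == 0)
			return 0;	/* orderly shutdown, done */
		if (errno == EAGAIN || errno == EINTR)
			continue;	/* treated as transient, retry */
		return -1;		/* hard error */
	}
}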
Fixes: c5d2177a72a16 ("bpf, sockmap: Fix race in ingress receive verdict with redirect to self") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220104205918.286416-1-john.fastabend@gmail.com --- net/ipv4/tcp_bpf.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index f70aa0932bd6..9b9b02052fd3 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -196,12 +196,39 @@ static int tcp_bpf_recvmsg_parser(struct sock *sk, long timeo; int data; + if (sock_flag(sk, SOCK_DONE)) + goto out; + + if (sk->sk_err) { + copied = sock_error(sk); + goto out; + } + + if (sk->sk_shutdown & RCV_SHUTDOWN) + goto out; + + if (sk->sk_state == TCP_CLOSE) { + copied = -ENOTCONN; + goto out; + } + timeo = sock_rcvtimeo(sk, nonblock); + if (!timeo) { + copied = -EAGAIN; + goto out; + } + + if (signal_pending(current)) { + copied = sock_intr_errno(timeo); + goto out; + } + data = tcp_msg_wait_data(sk, psock, timeo); if (data && !sk_psock_queue_empty(psock)) goto msg_bytes_ready; copied = -EAGAIN; } +out: release_sock(sk); sk_psock_put(sk, psock); return copied; From 218d747a4142f281a256687bb513a135c905867b Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 4 Jan 2022 13:46:45 -0800 Subject: [PATCH 07/41] bpf, sockmap: Fix double bpf_prog_put on error case in map_link sock_map_link() is called to update a sockmap entry with a sk. But, if the sock_map_init_proto() call fails then we return an error to the map_update op against the sockmap. In the error path though we need to cleanup psock and dec the refcnt on any programs associated with the map, because we refcnt them early in the update process to ensure they are pinned for the psock. (This avoids a race where user deletes programs while also updating the map with new socks.) In current code we do the prog refcnt dec explicitely by calling bpf_prog_put() when the program was found in the map. But, after commit '38207a5e81230' in this error path we've already done the prog to psock assignment so the programs have a reference from the psock as well. This then causes the psock tear down logic, invoked by sk_psock_put() in the error path, to similarly call bpf_prog_put on the programs there. To be explicit this logic does the prog->psock assignment: if (msg_*) psock_set_prog(...) Then the error path under the out_progs label does a similar check and dec with: if (msg_*) bpf_prog_put(...) And the teardown logic sk_psock_put() does ... psock_set_prog(msg_*, NULL) ... triggering another bpf_prog_put(...). Then KASAN gives us this splat, found by syzbot because we've created an inbalance between bpf_prog_inc and bpf_prog_put calling put twice on the program. BUG: KASAN: vmalloc-out-of-bounds in __bpf_prog_put kernel/bpf/syscall.c:1812 [inline] BUG: KASAN: vmalloc-out-of-bounds in __bpf_prog_put kernel/bpf/syscall.c:1812 [inline] kernel/bpf/syscall.c:1829 BUG: KASAN: vmalloc-out-of-bounds in bpf_prog_put+0x8c/0x4f0 kernel/bpf/syscall.c:1829 kernel/bpf/syscall.c:1829 Read of size 8 at addr ffffc90000e76038 by task syz-executor020/3641 To fix clean up error path so it doesn't try to do the bpf_prog_put in the error path once progs are assigned then it relies on the normal psock tear down logic to do complete cleanup. For completness we also cover the case whereh sk_psock_init_strp() fails, but this is not expected because it indicates an incorrect socket type and should be caught earlier. 
Fixes: 38207a5e8123 ("bpf, sockmap: Attach map progs to psock early for feature probes") Reported-by: syzbot+bb73e71cf4b8fd376a4f@syzkaller.appspotmail.com Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220104214645.290900-1-john.fastabend@gmail.com --- net/core/sock_map.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 9618ab6d7cc9..1827669eedd6 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -292,15 +292,23 @@ static int sock_map_link(struct bpf_map *map, struct sock *sk) if (skb_verdict) psock_set_prog(&psock->progs.skb_verdict, skb_verdict); + /* msg_* and stream_* programs references tracked in psock after this + * point. Reference dec and cleanup will occur through psock destructor + */ ret = sock_map_init_proto(sk, psock); - if (ret < 0) - goto out_drop; + if (ret < 0) { + sk_psock_put(sk, psock); + goto out; + } write_lock_bh(&sk->sk_callback_lock); if (stream_parser && stream_verdict && !psock->saved_data_ready) { ret = sk_psock_init_strp(sk, psock); - if (ret) - goto out_unlock_drop; + if (ret) { + write_unlock_bh(&sk->sk_callback_lock); + sk_psock_put(sk, psock); + goto out; + } sk_psock_start_strp(sk, psock); } else if (!stream_parser && stream_verdict && !psock->saved_data_ready) { sk_psock_start_verdict(sk,psock); @@ -309,10 +317,6 @@ static int sock_map_link(struct bpf_map *map, struct sock *sk) } write_unlock_bh(&sk->sk_callback_lock); return 0; -out_unlock_drop: - write_unlock_bh(&sk->sk_callback_lock); -out_drop: - sk_psock_put(sk, psock); out_progs: if (skb_verdict) bpf_prog_put(skb_verdict); @@ -325,6 +329,7 @@ static int sock_map_link(struct bpf_map *map, struct sock *sk) out_put_stream_verdict: if (stream_verdict) bpf_prog_put(stream_verdict); +out: return ret; } From e60b0d12a95dcf16a63225cead4541567f5cb517 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 5 Jan 2022 11:35:13 -0800 Subject: [PATCH 08/41] bpf: Don't promote bogus looking registers after null check. If we ever get to a point again where we convert a bogus looking _or_null typed register containing a non-zero fixed or variable offset, then lets not reset these bounds to zero since they are not and also don't promote the register to a type, but instead leave it as _or_null. Converting to a unknown register could be an avenue as well, but then if we run into this case it would allow to leak a kernel pointer this way. Fixes: f1174f77b50c ("bpf/verifier: rework value tracking") Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b70c66c6db3b..c8d9e761173b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9079,15 +9079,15 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, { if (type_may_be_null(reg->type) && reg->id == id && !WARN_ON_ONCE(!reg->id)) { - /* Old offset (both fixed and variable parts) should - * have been known-zero, because we don't allow pointer - * arithmetic on pointers that might be NULL. - */ if (WARN_ON_ONCE(reg->smin_value || reg->smax_value || !tnum_equals_const(reg->var_off, 0) || reg->off)) { - __mark_reg_known_zero(reg); - reg->off = 0; + /* Old offset (both fixed and variable parts) should + * have been known-zero, because we don't allow pointer + * arithmetic on pointers that might be NULL. 
If we + * see this happening, don't convert the register. + */ + return; } if (is_null) { reg->type = SCALAR_VALUE; From ca796fe66f7fceff17679ee6cc5fe4b4023de44d Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 5 Jan 2022 11:33:34 -0800 Subject: [PATCH 09/41] bpf, selftests: Add verifier test for mem_or_null register with offset. Add a new test case with mem_or_null typed register with off > 0 to ensure it gets rejected by the verifier: # ./test_verifier 1011 #1009/u check with invalid reg offset 0 OK #1009/p check with invalid reg offset 0 OK Summary: 2 PASSED, 0 SKIPPED, 0 FAILED Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/verifier/spill_fill.c | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tools/testing/selftests/bpf/verifier/spill_fill.c b/tools/testing/selftests/bpf/verifier/spill_fill.c index 6c907144311f..1a8eb9672bd1 100644 --- a/tools/testing/selftests/bpf/verifier/spill_fill.c +++ b/tools/testing/selftests/bpf/verifier/spill_fill.c @@ -58,6 +58,34 @@ .result = ACCEPT, .result_unpriv = ACCEPT, }, +{ + "check with invalid reg offset 0", + .insns = { + /* reserve 8 byte ringbuf memory */ + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_MOV64_IMM(BPF_REG_2, 8), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ringbuf_reserve), + /* store a pointer to the reserved memory in R6 */ + BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), + /* add invalid offset to memory or NULL */ + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 1), + /* check whether the reservation was successful */ + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), + /* should not be able to access *(R7) = 0 */ + BPF_ST_MEM(BPF_W, BPF_REG_6, 0, 0), + /* submit the reserved ringbuf memory */ + BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ringbuf_submit), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_ringbuf = { 1 }, + .result = REJECT, + .errstr = "R0 pointer arithmetic on mem_or_null prohibited", +}, { "check corrupted spill/fill", .insns = { From 62e4683849b6516c71e91f36e4fc0393a5883cfb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 3 Jan 2022 19:35:51 +0100 Subject: [PATCH 10/41] bpf, docs: Add a setion to explain the basic instruction encoding The eBPF instruction set document does not currently document the basic instruction encoding. Add a section to do that. Signed-off-by: Christoph Hellwig Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220103183556.41040-2-hch@lst.de --- Documentation/bpf/instruction-set.rst | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/Documentation/bpf/instruction-set.rst b/Documentation/bpf/instruction-set.rst index 1af51143ff9f..80f42984b594 100644 --- a/Documentation/bpf/instruction-set.rst +++ b/Documentation/bpf/instruction-set.rst @@ -19,8 +19,22 @@ The eBPF calling convention is defined as: R0 - R5 are scratch registers and eBPF programs needs to spill/fill them if necessary across calls. 
+Instruction encoding +==================== + +eBPF uses 64-bit instructions with the following encoding: + + ============= ======= =============== ==================== ============ + 32 bits (MSB) 16 bits 4 bits 4 bits 8 bits (LSB) + ============= ======= =============== ==================== ============ + immediate offset source register destination register opcode + ============= ======= =============== ==================== ============ + +Note that most instructions do not use all of the fields. +Unused fields shall be cleared to zero. + Instruction classes -=================== +------------------- The three LSB bits of the 'opcode' field store the instruction class: From be3193cded9d5c030be1713bf52d307427e88d19 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 3 Jan 2022 19:35:52 +0100 Subject: [PATCH 11/41] bpf, docs: Add subsections for ALU and JMP instructions Add a little more stucture to the ALU/JMP documentation with sections and improve the example text. Signed-off-by: Christoph Hellwig Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220103183556.41040-3-hch@lst.de --- Documentation/bpf/instruction-set.rst | 52 ++++++++++++++++----------- 1 file changed, 32 insertions(+), 20 deletions(-) diff --git a/Documentation/bpf/instruction-set.rst b/Documentation/bpf/instruction-set.rst index 80f42984b594..03bf3c6c5577 100644 --- a/Documentation/bpf/instruction-set.rst +++ b/Documentation/bpf/instruction-set.rst @@ -74,7 +74,13 @@ The 4th bit encodes the source operand: The four MSB bits store the operation code. -For class BPF_ALU or BPF_ALU64: + +Arithmetic instructions +----------------------- + +BPF_ALU uses 32-bit wide operands while BPF_ALU64 uses 64-bit wide operands for +otherwise identical operations. +The code field encodes the operation as below: ======== ===== ========================= code value description @@ -95,7 +101,29 @@ For class BPF_ALU or BPF_ALU64: BPF_END 0xd0 endianness conversion ======== ===== ========================= -For class BPF_JMP or BPF_JMP32: +BPF_ADD | BPF_X | BPF_ALU means:: + + dst_reg = (u32) dst_reg + (u32) src_reg; + +BPF_ADD | BPF_X | BPF_ALU64 means:: + + dst_reg = dst_reg + src_reg + +BPF_XOR | BPF_K | BPF_ALU means:: + + src_reg = (u32) src_reg ^ (u32) imm32 + +BPF_XOR | BPF_K | BPF_ALU64 means:: + + src_reg = src_reg ^ imm32 + + +Jump instructions +----------------- + +BPF_JMP32 uses 32-bit wide operands while BPF_JMP uses 64-bit wide operands for +otherwise identical operations. +The code field encodes the operation as below: ======== ===== ========================= code value description @@ -116,24 +144,8 @@ For class BPF_JMP or BPF_JMP32: BPF_JSLE 0xd0 signed '<=' ======== ===== ========================= -So BPF_ADD | BPF_X | BPF_ALU means:: - - dst_reg = (u32) dst_reg + (u32) src_reg; - -Similarly, BPF_XOR | BPF_K | BPF_ALU means:: - - src_reg = (u32) src_reg ^ (u32) imm32 - -eBPF is using BPF_MOV | BPF_X | BPF_ALU to represent A = B moves. BPF_ALU64 -is used to mean exactly the same operations as BPF_ALU, but with 64-bit wide -operands instead. So BPF_ADD | BPF_X | BPF_ALU64 means 64-bit addition, i.e.:: - - dst_reg = dst_reg + src_reg - -BPF_JMP | BPF_EXIT means function exit only. The eBPF program needs to store -the return value into register R0 before doing a BPF_EXIT. Class 6 is used as -BPF_JMP32 to mean exactly the same operations as BPF_JMP, but with 32-bit wide -operands for the comparisons instead. +The eBPF program needs to store the return value into register R0 before doing a +BPF_EXIT. 
Load and store instructions From 894cda554c3c3dc836f3cc873c47a465ba9433b4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 3 Jan 2022 19:35:53 +0100 Subject: [PATCH 12/41] bpf, docs: Document the opcode classes Add a description for each opcode class. Signed-off-by: Christoph Hellwig Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220103183556.41040-4-hch@lst.de --- Documentation/bpf/instruction-set.rst | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/Documentation/bpf/instruction-set.rst b/Documentation/bpf/instruction-set.rst index 03bf3c6c5577..2987cbb07f7f 100644 --- a/Documentation/bpf/instruction-set.rst +++ b/Documentation/bpf/instruction-set.rst @@ -38,18 +38,18 @@ Instruction classes The three LSB bits of the 'opcode' field store the instruction class: - ========= ===== - class value - ========= ===== - BPF_LD 0x00 - BPF_LDX 0x01 - BPF_ST 0x02 - BPF_STX 0x03 - BPF_ALU 0x04 - BPF_JMP 0x05 - BPF_JMP32 0x06 - BPF_ALU64 0x07 - ========= ===== + ========= ===== =============================== + class value description + ========= ===== =============================== + BPF_LD 0x00 non-standard load operations + BPF_LDX 0x01 load into register operations + BPF_ST 0x02 store from immediate operations + BPF_STX 0x03 store from register operations + BPF_ALU 0x04 32-bit arithmetic operations + BPF_JMP 0x05 64-bit jump operations + BPF_JMP32 0x06 32-bit jump operations + BPF_ALU64 0x07 64-bit arithmetic operations + ========= ===== =============================== Arithmetic and jump instructions ================================ From 03c517ee9eedd95472c36c6291fc97368b48c9e4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 3 Jan 2022 19:35:54 +0100 Subject: [PATCH 13/41] bpf, docs: Fully document the ALU opcodes Add pseudo-code to document all the different BPF_ALU / BPF_ALU64 opcodes. Signed-off-by: Christoph Hellwig Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220103183556.41040-5-hch@lst.de --- Documentation/bpf/instruction-set.rst | 30 +++++++++++++-------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/Documentation/bpf/instruction-set.rst b/Documentation/bpf/instruction-set.rst index 2987cbb07f7f..efba4d193185 100644 --- a/Documentation/bpf/instruction-set.rst +++ b/Documentation/bpf/instruction-set.rst @@ -82,24 +82,24 @@ BPF_ALU uses 32-bit wide operands while BPF_ALU64 uses 64-bit wide operands for otherwise identical operations. 
The code field encodes the operation as below: - ======== ===== ========================= + ======== ===== ========================== code value description - ======== ===== ========================= - BPF_ADD 0x00 - BPF_SUB 0x10 - BPF_MUL 0x20 - BPF_DIV 0x30 - BPF_OR 0x40 - BPF_AND 0x50 - BPF_LSH 0x60 - BPF_RSH 0x70 - BPF_NEG 0x80 - BPF_MOD 0x90 - BPF_XOR 0xa0 - BPF_MOV 0xb0 mov reg to reg + ======== ===== ========================== + BPF_ADD 0x00 dst += src + BPF_SUB 0x10 dst -= src + BPF_MUL 0x20 dst \*= src + BPF_DIV 0x30 dst /= src + BPF_OR 0x40 dst \|= src + BPF_AND 0x50 dst &= src + BPF_LSH 0x60 dst <<= src + BPF_RSH 0x70 dst >>= src + BPF_NEG 0x80 dst = ~src + BPF_MOD 0x90 dst %= src + BPF_XOR 0xa0 dst ^= src + BPF_MOV 0xb0 dst = src BPF_ARSH 0xc0 sign extending shift right BPF_END 0xd0 endianness conversion - ======== ===== ========================= + ======== ===== ========================== BPF_ADD | BPF_X | BPF_ALU means:: From 9e533e22b5700097e84b8a841d9e1c251cc132c6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 3 Jan 2022 19:35:55 +0100 Subject: [PATCH 14/41] bpf, docs: Fully document the JMP opcodes Add pseudo-code to document all the different BPF_JMP / BPF_JMP64 opcodes. Signed-off-by: Christoph Hellwig Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220103183556.41040-6-hch@lst.de --- Documentation/bpf/instruction-set.rst | 34 +++++++++++++-------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/Documentation/bpf/instruction-set.rst b/Documentation/bpf/instruction-set.rst index efba4d193185..88e8d6a9195c 100644 --- a/Documentation/bpf/instruction-set.rst +++ b/Documentation/bpf/instruction-set.rst @@ -125,24 +125,24 @@ BPF_JMP32 uses 32-bit wide operands while BPF_JMP uses 64-bit wide operands for otherwise identical operations. The code field encodes the operation as below: - ======== ===== ========================= - code value description - ======== ===== ========================= - BPF_JA 0x00 BPF_JMP only - BPF_JEQ 0x10 - BPF_JGT 0x20 - BPF_JGE 0x30 - BPF_JSET 0x40 - BPF_JNE 0x50 jump '!=' - BPF_JSGT 0x60 signed '>' - BPF_JSGE 0x70 signed '>=' + ======== ===== ========================= ============ + code value description notes + ======== ===== ========================= ============ + BPF_JA 0x00 PC += off BPF_JMP only + BPF_JEQ 0x10 PC += off if dst == src + BPF_JGT 0x20 PC += off if dst > src unsigned + BPF_JGE 0x30 PC += off if dst >= src unsigned + BPF_JSET 0x40 PC += off if dst & src + BPF_JNE 0x50 PC += off if dst != src + BPF_JSGT 0x60 PC += off if dst > src signed + BPF_JSGE 0x70 PC += off if dst >= src signed BPF_CALL 0x80 function call - BPF_EXIT 0x90 function return - BPF_JLT 0xa0 unsigned '<' - BPF_JLE 0xb0 unsigned '<=' - BPF_JSLT 0xc0 signed '<' - BPF_JSLE 0xd0 signed '<=' - ======== ===== ========================= + BPF_EXIT 0x90 function / program return BPF_JMP only + BPF_JLT 0xa0 PC += off if dst < src unsigned + BPF_JLE 0xb0 PC += off if dst <= src unsigned + BPF_JSLT 0xc0 PC += off if dst < src signed + BPF_JSLE 0xd0 PC += off if dst <= src signed + ======== ===== ========================= ============ The eBPF program needs to store the return value into register R0 before doing a BPF_EXIT. From 58d8a3fc4a40dcfebf333ab2dc2c7c338249be51 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 3 Jan 2022 19:35:56 +0100 Subject: [PATCH 15/41] bpf, docs: Fully document the JMP mode modifiers Add a description for all the modifiers. 
Signed-off-by: Christoph Hellwig Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220103183556.41040-7-hch@lst.de --- Documentation/bpf/instruction-set.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Documentation/bpf/instruction-set.rst b/Documentation/bpf/instruction-set.rst index 88e8d6a9195c..3704836fe6df 100644 --- a/Documentation/bpf/instruction-set.rst +++ b/Documentation/bpf/instruction-set.rst @@ -173,15 +173,15 @@ The size modifier is one of: The mode modifier is one of: - ============= ===== ===================== + ============= ===== ==================================== mode modifier value description - ============= ===== ===================== + ============= ===== ==================================== BPF_IMM 0x00 used for 64-bit mov - BPF_ABS 0x20 - BPF_IND 0x40 - BPF_MEM 0x60 + BPF_ABS 0x20 legacy BPF packet access + BPF_IND 0x40 legacy BPF packet access + BPF_MEM 0x60 all normal load and store operations BPF_ATOMIC 0xc0 atomic operations - ============= ===== ===================== + ============= ===== ==================================== BPF_MEM | | BPF_STX means:: From a5bebc4f00dee47113eed48098c68e88b5ba70e8 Mon Sep 17 00:00:00 2001 From: Kris Van Hees Date: Wed, 5 Jan 2022 16:01:50 -0500 Subject: [PATCH 16/41] bpf: Fix verifier support for validation of async callbacks Commit bfc6bb74e4f1 ("bpf: Implement verifier support for validation of async callbacks.") added support for BPF_FUNC_timer_set_callback to the __check_func_call() function. The test in __check_func_call() is flaweed because it can mis-interpret a regular BPF-to-BPF pseudo-call as a BPF_FUNC_timer_set_callback callback call. Consider the conditional in the code: if (insn->code == (BPF_JMP | BPF_CALL) && insn->imm == BPF_FUNC_timer_set_callback) { The BPF_FUNC_timer_set_callback has value 170. This means that if you have a BPF program that contains a pseudo-call with an instruction delta of 170, this conditional will be found to be true by the verifier, and it will interpret the pseudo-call as a callback. This leads to a mess with the verification of the program because it makes the wrong assumptions about the nature of this call. Solution: include an explicit check to ensure that insn->src_reg == 0. This ensures that calls cannot be mis-interpreted as an async callback call. Fixes: bfc6bb74e4f1 ("bpf: Implement verifier support for validation of async callbacks.") Signed-off-by: Kris Van Hees Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220105210150.GH1559@oracle.com --- kernel/bpf/verifier.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c8d9e761173b..bfb45381fb3f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6031,6 +6031,7 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn } if (insn->code == (BPF_JMP | BPF_CALL) && + insn->src_reg == 0 && insn->imm == BPF_FUNC_timer_set_callback) { struct bpf_verifier_state *async_cb; From 04c350b1ae6bdb12b84009a4d0bf5ab4e621c47b Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 4 Jan 2022 10:31:48 +0900 Subject: [PATCH 17/41] bpf: Fix SO_RCVBUF/SO_SNDBUF handling in _bpf_setsockopt(). The commit 4057765f2dee ("sock: consistent handling of extreme SO_SNDBUF/SO_RCVBUF values") added a change to prevent underflow in setsockopt() around SO_SNDBUF/SO_RCVBUF. This patch adds the same change to _bpf_setsockopt(). 
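For context, the clamped path is reached from BPF programs that call the bpf_setsockopt() helper. Below is a minimal sketch (illustrative, not part of the patch) of a sock_ops program doing so; the program name and the locally defined socket constants are assumptions made for the example.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

/* Values match asm-generic/socket.h; defined here because BPF programs
 * typically cannot include the libc socket headers.
 */
#define SOL_SOCKET	1
#define SO_RCVBUF	8

SEC("sockops")
int set_rcvbuf(struct bpf_sock_ops *skops)
{
	int val = 0x7fffffff;	/* INT_MAX: without the clamp, val * 2 underflows */

	if (skops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB)
		bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
	return 1;
}

char LICENSE[] SEC("license") = "GPL";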
Fixes: 4057765f2dee ("sock: consistent handling of extreme SO_SNDBUF/SO_RCVBUF values") Signed-off-by: Kuniyuki Iwashima Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220104013153.97906-2-kuniyu@amazon.co.jp --- net/core/filter.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/core/filter.c b/net/core/filter.c index 606ab5a98a1a..368fe28c8dc6 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4741,12 +4741,14 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, switch (optname) { case SO_RCVBUF: val = min_t(u32, val, sysctl_rmem_max); + val = min_t(int, val, INT_MAX / 2); sk->sk_userlocks |= SOCK_RCVBUF_LOCK; WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF)); break; case SO_SNDBUF: val = min_t(u32, val, sysctl_wmem_max); + val = min_t(int, val, INT_MAX / 2); sk->sk_userlocks |= SOCK_SNDBUF_LOCK; WRITE_ONCE(sk->sk_sndbuf, max_t(int, val * 2, SOCK_MIN_SNDBUF)); From 28479934f26bcf9ddeb94125e05ddc5c4312b1f3 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 4 Jan 2022 10:31:49 +0900 Subject: [PATCH 18/41] bpf: Add SO_RCVBUF/SO_SNDBUF in _bpf_getsockopt(). This patch exposes SO_RCVBUF/SO_SNDBUF through bpf_getsockopt(). Signed-off-by: Kuniyuki Iwashima Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220104013153.97906-3-kuniyu@amazon.co.jp --- net/core/filter.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/core/filter.c b/net/core/filter.c index 368fe28c8dc6..cac2be559ab0 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4969,6 +4969,12 @@ static int _bpf_getsockopt(struct sock *sk, int level, int optname, goto err_clear; switch (optname) { + case SO_RCVBUF: + *((int *)optval) = sk->sk_rcvbuf; + break; + case SO_SNDBUF: + *((int *)optval) = sk->sk_sndbuf; + break; case SO_MARK: *((int *)optval) = sk->sk_mark; break; From 7218c28c87f57c131879a75a226b9033ac90b266 Mon Sep 17 00:00:00 2001 From: Christy Lee Date: Wed, 29 Dec 2021 12:41:56 -0800 Subject: [PATCH 19/41] libbpf: Deprecate bpf_perf_event_read_simple() API With perf_buffer__poll() and perf_buffer__consume() APIs available, there is no reason to expose bpf_perf_event_read_simple() API to users. If users need custom perf buffer, they could re-implement the function. Mark bpf_perf_event_read_simple() and move the logic to a new static function so it can still be called by other functions in the same file. 
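For callers that used bpf_perf_event_read_simple() to drain a PERF_EVENT_ARRAY themselves, the replacement path looks roughly like the sketch below. It assumes the libbpf 1.0 signature of perf_buffer__new() (older releases passed the callbacks through perf_buffer_opts instead), and the helper name consume_events() is made up for the example.

#include <errno.h>
#include <stdio.h>
#include <bpf/libbpf.h>

static void on_sample(void *ctx, int cpu, void *data, __u32 size)
{
	printf("cpu %d: %u byte record\n", cpu, size);
}

static int consume_events(int perf_map_fd)
{
	struct perf_buffer *pb;
	int err;

	/* 8 pages of ring buffer per CPU; on_sample() runs from poll */
	pb = perf_buffer__new(perf_map_fd, 8, on_sample, NULL, NULL, NULL);
	if (!pb)
		return -errno;

	/* poll until an error occurs; poll returns the number of records handled */
	while ((err = perf_buffer__poll(pb, 100 /* ms */)) >= 0)
		;

	perf_buffer__free(pb);
	return err;
}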
[0] Closes: https://github.com/libbpf/libbpf/issues/310 Signed-off-by: Christy Lee Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211229204156.13569-1-christylee@fb.com --- tools/lib/bpf/libbpf.c | 22 ++++++++++++++-------- tools/lib/bpf/libbpf.h | 1 + 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 9cb99d1e2385..1d02ba7f11b4 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -10676,10 +10676,10 @@ struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_map *map) return link; } -enum bpf_perf_event_ret -bpf_perf_event_read_simple(void *mmap_mem, size_t mmap_size, size_t page_size, - void **copy_mem, size_t *copy_size, - bpf_perf_event_print_t fn, void *private_data) +static enum bpf_perf_event_ret +perf_event_read_simple(void *mmap_mem, size_t mmap_size, size_t page_size, + void **copy_mem, size_t *copy_size, + bpf_perf_event_print_t fn, void *private_data) { struct perf_event_mmap_page *header = mmap_mem; __u64 data_head = ring_buffer_read_head(header); @@ -10724,6 +10724,12 @@ bpf_perf_event_read_simple(void *mmap_mem, size_t mmap_size, size_t page_size, return libbpf_err(ret); } +__attribute__((alias("perf_event_read_simple"))) +enum bpf_perf_event_ret +bpf_perf_event_read_simple(void *mmap_mem, size_t mmap_size, size_t page_size, + void **copy_mem, size_t *copy_size, + bpf_perf_event_print_t fn, void *private_data); + struct perf_buffer; struct perf_buffer_params { @@ -11132,10 +11138,10 @@ static int perf_buffer__process_records(struct perf_buffer *pb, { enum bpf_perf_event_ret ret; - ret = bpf_perf_event_read_simple(cpu_buf->base, pb->mmap_size, - pb->page_size, &cpu_buf->buf, - &cpu_buf->buf_size, - perf_buffer__process_record, cpu_buf); + ret = perf_event_read_simple(cpu_buf->base, pb->mmap_size, + pb->page_size, &cpu_buf->buf, + &cpu_buf->buf_size, + perf_buffer__process_record, cpu_buf); if (ret != LIBBPF_PERF_EVENT_CONT) return ret; return 0; diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 85dfef88b3d2..ddf1cc9e7803 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -1026,6 +1026,7 @@ LIBBPF_API int perf_buffer__buffer_fd(const struct perf_buffer *pb, size_t buf_i typedef enum bpf_perf_event_ret (*bpf_perf_event_print_t)(struct perf_event_header *hdr, void *private_data); +LIBBPF_DEPRECATED_SINCE(0, 8, "use perf_buffer__poll() or perf_buffer__consume() instead") LIBBPF_API enum bpf_perf_event_ret bpf_perf_event_read_simple(void *mmap_mem, size_t mmap_size, size_t page_size, void **copy_mem, size_t *copy_size, From 71cff670baff5cc6a6eeb0181e2cc55579c5e1e0 Mon Sep 17 00:00:00 2001 From: Qiang Wang Date: Mon, 27 Dec 2021 21:07:12 +0800 Subject: [PATCH 20/41] libbpf: Use probe_name for legacy kprobe Fix a bug in commit 46ed5fc33db9, which wrongly used the func_name instead of probe_name to register legacy kprobe. 
Fixes: 46ed5fc33db9 ("libbpf: Refactor and simplify legacy kprobe code") Co-developed-by: Chengming Zhou Signed-off-by: Qiang Wang Signed-off-by: Chengming Zhou Signed-off-by: Andrii Nakryiko Tested-by: Hengqi Chen Reviewed-by: Hengqi Chen Link: https://lore.kernel.org/bpf/20211227130713.66933-1-wangqiang.wq.frank@bytedance.com --- tools/lib/bpf/libbpf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 1d02ba7f11b4..26e49e6aa5b1 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -10017,7 +10017,7 @@ bpf_program__attach_kprobe_opts(const struct bpf_program *prog, gen_kprobe_legacy_event_name(probe_name, sizeof(probe_name), func_name, offset); - legacy_probe = strdup(func_name); + legacy_probe = strdup(probe_name); if (!legacy_probe) return libbpf_err_ptr(-ENOMEM); From 51a33c60f1c22c0d2dafad774315ba1537765442 Mon Sep 17 00:00:00 2001 From: Qiang Wang Date: Mon, 27 Dec 2021 21:07:13 +0800 Subject: [PATCH 21/41] libbpf: Support repeated legacy kprobes on same function If repeated legacy kprobes on same function in one process, libbpf will register using the same probe name and got -EBUSY error. So append index to the probe name format to fix this problem. Co-developed-by: Chengming Zhou Signed-off-by: Qiang Wang Signed-off-by: Chengming Zhou Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211227130713.66933-2-wangqiang.wq.frank@bytedance.com --- tools/lib/bpf/libbpf.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 26e49e6aa5b1..7f10dd501a52 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -9916,7 +9916,10 @@ static int append_to_file(const char *file, const char *fmt, ...) static void gen_kprobe_legacy_event_name(char *buf, size_t buf_sz, const char *kfunc_name, size_t offset) { - snprintf(buf, buf_sz, "libbpf_%u_%s_0x%zx", getpid(), kfunc_name, offset); + static int index = 0; + + snprintf(buf, buf_sz, "libbpf_%u_%s_0x%zx_%d", getpid(), kfunc_name, offset, + __sync_fetch_and_add(&index, 1)); } static int add_kprobe_event_legacy(const char *probe_name, bool retprobe, From 9855c131b9c8b0327ff5182f88bb1991f212415b Mon Sep 17 00:00:00 2001 From: Christy Lee Date: Tue, 4 Jan 2022 16:06:01 -0800 Subject: [PATCH 22/41] libbpf 1.0: Deprecate bpf_map__is_offload_neutral() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Deprecate bpf_map__is_offload_neutral(). It’s most probably broken already. PERF_EVENT_ARRAY isn’t the only map that’s not suitable for hardware offloading. Applications can directly check map type instead. 
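The direct check that replaces the deprecated helper is short; the bpftool hunk below does exactly this, and a standalone sketch (the function name set_offload_ifindex() is made up for the example) would look like:

#include <bpf/libbpf.h>

/* Mark every offloadable map with the target ifindex; perf event arrays
 * stay on the host, without relying on bpf_map__is_offload_neutral().
 */
static void set_offload_ifindex(struct bpf_object *obj, __u32 ifindex)
{
	struct bpf_map *map;

	bpf_object__for_each_map(map, obj) {
		if (bpf_map__type(map) != BPF_MAP_TYPE_PERF_EVENT_ARRAY)
			bpf_map__set_ifindex(map, ifindex);
	}
}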
[0] Closes: https://github.com/libbpf/libbpf/issues/306 Signed-off-by: Christy Lee Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220105000601.2090044-1-christylee@fb.com --- tools/bpf/bpftool/prog.c | 2 +- tools/lib/bpf/libbpf.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index f874896c4154..2a21d50516bc 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -1655,7 +1655,7 @@ static int load_with_options(int argc, char **argv, bool first_prog_only) j = 0; idx = 0; bpf_object__for_each_map(map, obj) { - if (!bpf_map__is_offload_neutral(map)) + if (bpf_map__type(map) != BPF_MAP_TYPE_PERF_EVENT_ARRAY) bpf_map__set_ifindex(map, ifindex); if (j < old_map_fds && idx == map_replace[j].idx) { diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index ddf1cc9e7803..88dd943ba545 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -744,6 +744,7 @@ LIBBPF_API void *bpf_map__priv(const struct bpf_map *map); LIBBPF_API int bpf_map__set_initial_value(struct bpf_map *map, const void *data, size_t size); LIBBPF_API const void *bpf_map__initial_value(struct bpf_map *map, size_t *psize); +LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_map__type() instead") LIBBPF_API bool bpf_map__is_offload_neutral(const struct bpf_map *map); /** From 5f6082642814050352a3e29f8713796b55ebf788 Mon Sep 17 00:00:00 2001 From: Christy Lee Date: Tue, 4 Jan 2022 16:31:20 -0800 Subject: [PATCH 23/41] libbpf 1.0: Deprecate bpf_object__find_map_by_offset() API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit API created with simplistic assumptions about BPF map definitions. It hasn’t worked for a while, deprecate it in preparation for libbpf 1.0. [0] Closes: https://github.com/libbpf/libbpf/issues/302 Signed-off-by: Christy Lee Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220105003120.2222673-1-christylee@fb.com --- tools/lib/bpf/libbpf.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 88dd943ba545..8b9bc5e90c2b 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -677,7 +677,8 @@ bpf_object__find_map_fd_by_name(const struct bpf_object *obj, const char *name); * Get bpf_map through the offset of corresponding struct bpf_map_def * in the BPF object file. */ -LIBBPF_API struct bpf_map * +LIBBPF_API LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_object__find_map_by_name() instead") +struct bpf_map * bpf_object__find_map_by_offset(struct bpf_object *obj, size_t offset); LIBBPF_API LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_object__next_map() instead") From 2741a0493c04067d7acb0e44035aa27618b7d204 Mon Sep 17 00:00:00 2001 From: Ong Boon Leong Date: Thu, 30 Dec 2021 11:54:41 +0800 Subject: [PATCH 24/41] samples/bpf: xdpsock: Add VLAN support for Tx-only operation In multi-queue environment testing, the support for VLAN-tag based steering is useful. So, this patch adds the capability to add VLAN tag (VLAN ID and Priority) to the generated Tx frame. To set the VLAN ID=10 and Priority=2 for Tx only through TxQ=3: $ xdpsock -i eth0 -t -N -z -q 3 -V -J 10 -K 2 If VLAN ID (-J) and Priority (-K) is set, it default to VLAN ID = 1 VLAN Priority = 0. 
For example, VLAN-tagged Tx only, xdp copy mode through TxQ=1: $ xdpsock -i eth0 -t -N -c -q 1 -V Signed-off-by: Ong Boon Leong Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20211230035447.523177-2-boon.leong.ong@intel.com --- samples/bpf/xdpsock_user.c | 88 ++++++++++++++++++++++++++++++++------ 1 file changed, 74 insertions(+), 14 deletions(-) diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index 616d663d55aa..d5e298ccf491 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -56,6 +56,12 @@ #define DEBUG_HEXDUMP 0 +#define VLAN_PRIO_MASK 0xe000 /* Priority Code Point */ +#define VLAN_PRIO_SHIFT 13 +#define VLAN_VID_MASK 0x0fff /* VLAN Identifier */ +#define VLAN_VID__DEFAULT 1 +#define VLAN_PRI__DEFAULT 0 + typedef __u64 u64; typedef __u32 u32; typedef __u16 u16; @@ -81,6 +87,9 @@ static u32 opt_batch_size = 64; static int opt_pkt_count; static u16 opt_pkt_size = MIN_PKT_SIZE; static u32 opt_pkt_fill_pattern = 0x12345678; +static bool opt_vlan_tag; +static u16 opt_pkt_vlan_id = VLAN_VID__DEFAULT; +static u16 opt_pkt_vlan_pri = VLAN_PRI__DEFAULT; static bool opt_extra_stats; static bool opt_quiet; static bool opt_app_stats; @@ -101,6 +110,14 @@ static u32 prog_id; static bool opt_busy_poll; static bool opt_reduced_cap; +struct vlan_ethhdr { + unsigned char h_dest[6]; + unsigned char h_source[6]; + __be16 h_vlan_proto; + __be16 h_vlan_TCI; + __be16 h_vlan_encapsulated_proto; +}; + struct xsk_ring_stats { unsigned long rx_npkts; unsigned long tx_npkts; @@ -740,11 +757,13 @@ static inline u16 udp_csum(u32 saddr, u32 daddr, u32 len, #define ETH_FCS_SIZE 4 -#define PKT_HDR_SIZE (sizeof(struct ethhdr) + sizeof(struct iphdr) + \ +#define ETH_HDR_SIZE (opt_vlan_tag ? sizeof(struct vlan_ethhdr) : \ + sizeof(struct ethhdr)) +#define PKT_HDR_SIZE (ETH_HDR_SIZE + sizeof(struct iphdr) + \ sizeof(struct udphdr)) #define PKT_SIZE (opt_pkt_size - ETH_FCS_SIZE) -#define IP_PKT_SIZE (PKT_SIZE - sizeof(struct ethhdr)) +#define IP_PKT_SIZE (PKT_SIZE - ETH_HDR_SIZE) #define UDP_PKT_SIZE (IP_PKT_SIZE - sizeof(struct iphdr)) #define UDP_PKT_DATA_SIZE (UDP_PKT_SIZE - sizeof(struct udphdr)) @@ -752,17 +771,42 @@ static u8 pkt_data[XSK_UMEM__DEFAULT_FRAME_SIZE]; static void gen_eth_hdr_data(void) { - struct udphdr *udp_hdr = (struct udphdr *)(pkt_data + - sizeof(struct ethhdr) + - sizeof(struct iphdr)); - struct iphdr *ip_hdr = (struct iphdr *)(pkt_data + - sizeof(struct ethhdr)); - struct ethhdr *eth_hdr = (struct ethhdr *)pkt_data; + struct udphdr *udp_hdr; + struct iphdr *ip_hdr; + + if (opt_vlan_tag) { + struct vlan_ethhdr *veth_hdr = (struct vlan_ethhdr *)pkt_data; + u16 vlan_tci = 0; + + udp_hdr = (struct udphdr *)(pkt_data + + sizeof(struct vlan_ethhdr) + + sizeof(struct iphdr)); + ip_hdr = (struct iphdr *)(pkt_data + + sizeof(struct vlan_ethhdr)); + + /* ethernet & VLAN header */ + memcpy(veth_hdr->h_dest, "\x3c\xfd\xfe\x9e\x7f\x71", ETH_ALEN); + memcpy(veth_hdr->h_source, "\xec\xb1\xd7\x98\x3a\xc0", ETH_ALEN); + veth_hdr->h_vlan_proto = htons(ETH_P_8021Q); + vlan_tci = opt_pkt_vlan_id & VLAN_VID_MASK; + vlan_tci |= (opt_pkt_vlan_pri << VLAN_PRIO_SHIFT) & VLAN_PRIO_MASK; + veth_hdr->h_vlan_TCI = htons(vlan_tci); + veth_hdr->h_vlan_encapsulated_proto = htons(ETH_P_IP); + } else { + struct ethhdr *eth_hdr = (struct ethhdr *)pkt_data; + + udp_hdr = (struct udphdr *)(pkt_data + + sizeof(struct ethhdr) + + sizeof(struct iphdr)); + ip_hdr = (struct iphdr *)(pkt_data + + sizeof(struct ethhdr)); + + /* ethernet header */ + 
memcpy(eth_hdr->h_dest, "\x3c\xfd\xfe\x9e\x7f\x71", ETH_ALEN); + memcpy(eth_hdr->h_source, "\xec\xb1\xd7\x98\x3a\xc0", ETH_ALEN); + eth_hdr->h_proto = htons(ETH_P_IP); + } - /* ethernet header */ - memcpy(eth_hdr->h_dest, "\x3c\xfd\xfe\x9e\x7f\x71", ETH_ALEN); - memcpy(eth_hdr->h_source, "\xec\xb1\xd7\x98\x3a\xc0", ETH_ALEN); - eth_hdr->h_proto = htons(ETH_P_IP); /* IP header */ ip_hdr->version = IPVERSION; @@ -920,6 +964,9 @@ static struct option long_options[] = { {"tx-pkt-count", required_argument, 0, 'C'}, {"tx-pkt-size", required_argument, 0, 's'}, {"tx-pkt-pattern", required_argument, 0, 'P'}, + {"tx-vlan", no_argument, 0, 'V'}, + {"tx-vlan-id", required_argument, 0, 'J'}, + {"tx-vlan-pri", required_argument, 0, 'K'}, {"extra-stats", no_argument, 0, 'x'}, {"quiet", no_argument, 0, 'Q'}, {"app-stats", no_argument, 0, 'a'}, @@ -960,6 +1007,9 @@ static void usage(const char *prog) " (Default: %d bytes)\n" " Min size: %d, Max size %d.\n" " -P, --tx-pkt-pattern=nPacket fill pattern. Default: 0x%x\n" + " -V, --tx-vlan Send VLAN tagged packets (For -t|--txonly)\n" + " -J, --tx-vlan-id=n Tx VLAN ID [1-4095]. Default: %d (For -V|--tx-vlan)\n" + " -K, --tx-vlan-pri=n Tx VLAN Priority [0-7]. Default: %d (For -V|--tx-vlan)\n" " -x, --extra-stats Display extra statistics.\n" " -Q, --quiet Do not display any stats.\n" " -a, --app-stats Display application (syscall) statistics.\n" @@ -969,7 +1019,8 @@ static void usage(const char *prog) "\n"; fprintf(stderr, str, prog, XSK_UMEM__DEFAULT_FRAME_SIZE, opt_batch_size, MIN_PKT_SIZE, MIN_PKT_SIZE, - XSK_UMEM__DEFAULT_FRAME_SIZE, opt_pkt_fill_pattern); + XSK_UMEM__DEFAULT_FRAME_SIZE, opt_pkt_fill_pattern, + VLAN_VID__DEFAULT, VLAN_PRI__DEFAULT); exit(EXIT_FAILURE); } @@ -981,7 +1032,7 @@ static void parse_command_line(int argc, char **argv) opterr = 0; for (;;) { - c = getopt_long(argc, argv, "Frtli:q:pSNn:czf:muMd:b:C:s:P:xQaI:BR", + c = getopt_long(argc, argv, "Frtli:q:pSNn:czf:muMd:b:C:s:P:VJ:K:xQaI:BR", long_options, &option_index); if (c == -1) break; @@ -1062,6 +1113,15 @@ static void parse_command_line(int argc, char **argv) case 'P': opt_pkt_fill_pattern = strtol(optarg, NULL, 16); break; + case 'V': + opt_vlan_tag = true; + break; + case 'J': + opt_pkt_vlan_id = atoi(optarg); + break; + case 'K': + opt_pkt_vlan_pri = atoi(optarg); + break; case 'x': opt_extra_stats = 1; break; From 6440a6c23f6c72c57dbdf7928d92d3fc1aef6edc Mon Sep 17 00:00:00 2001 From: Ong Boon Leong Date: Thu, 30 Dec 2021 11:54:42 +0800 Subject: [PATCH 25/41] samples/bpf: xdpsock: Add Dest and Src MAC setting for Tx-only operation To set Dest MAC address (-G|--tx-dmac) only: $ xdpsock -i eth0 -t -N -z -G aa:bb:cc:dd:ee:ff To set Source MAC address (-H|--tx-smac) only: $ xdpsock -i eth0 -t -N -z -H 11:22:33:44:55:66 To set both Dest and Source MAC address: $ xdpsock -i eth0 -t -N -z -G aa:bb:cc:dd:ee:ff \ -H 11:22:33:44:55:66 The default Dest and Source MAC address remain the same as before. 
Signed-off-by: Ong Boon Leong Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/bpf/20211230035447.523177-3-boon.leong.ong@intel.com --- samples/bpf/xdpsock_user.c | 35 ++++++++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index d5e298ccf491..c9a8748a460f 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -90,6 +91,10 @@ static u32 opt_pkt_fill_pattern = 0x12345678; static bool opt_vlan_tag; static u16 opt_pkt_vlan_id = VLAN_VID__DEFAULT; static u16 opt_pkt_vlan_pri = VLAN_PRI__DEFAULT; +static struct ether_addr opt_txdmac = {{ 0x3c, 0xfd, 0xfe, + 0x9e, 0x7f, 0x71 }}; +static struct ether_addr opt_txsmac = {{ 0xec, 0xb1, 0xd7, + 0x98, 0x3a, 0xc0 }}; static bool opt_extra_stats; static bool opt_quiet; static bool opt_app_stats; @@ -785,8 +790,8 @@ static void gen_eth_hdr_data(void) sizeof(struct vlan_ethhdr)); /* ethernet & VLAN header */ - memcpy(veth_hdr->h_dest, "\x3c\xfd\xfe\x9e\x7f\x71", ETH_ALEN); - memcpy(veth_hdr->h_source, "\xec\xb1\xd7\x98\x3a\xc0", ETH_ALEN); + memcpy(veth_hdr->h_dest, &opt_txdmac, ETH_ALEN); + memcpy(veth_hdr->h_source, &opt_txsmac, ETH_ALEN); veth_hdr->h_vlan_proto = htons(ETH_P_8021Q); vlan_tci = opt_pkt_vlan_id & VLAN_VID_MASK; vlan_tci |= (opt_pkt_vlan_pri << VLAN_PRIO_SHIFT) & VLAN_PRIO_MASK; @@ -802,8 +807,8 @@ static void gen_eth_hdr_data(void) sizeof(struct ethhdr)); /* ethernet header */ - memcpy(eth_hdr->h_dest, "\x3c\xfd\xfe\x9e\x7f\x71", ETH_ALEN); - memcpy(eth_hdr->h_source, "\xec\xb1\xd7\x98\x3a\xc0", ETH_ALEN); + memcpy(eth_hdr->h_dest, &opt_txdmac, ETH_ALEN); + memcpy(eth_hdr->h_source, &opt_txsmac, ETH_ALEN); eth_hdr->h_proto = htons(ETH_P_IP); } @@ -967,6 +972,8 @@ static struct option long_options[] = { {"tx-vlan", no_argument, 0, 'V'}, {"tx-vlan-id", required_argument, 0, 'J'}, {"tx-vlan-pri", required_argument, 0, 'K'}, + {"tx-dmac", required_argument, 0, 'G'}, + {"tx-smac", required_argument, 0, 'H'}, {"extra-stats", no_argument, 0, 'x'}, {"quiet", no_argument, 0, 'Q'}, {"app-stats", no_argument, 0, 'a'}, @@ -1010,6 +1017,8 @@ static void usage(const char *prog) " -V, --tx-vlan Send VLAN tagged packets (For -t|--txonly)\n" " -J, --tx-vlan-id=n Tx VLAN ID [1-4095]. Default: %d (For -V|--tx-vlan)\n" " -K, --tx-vlan-pri=n Tx VLAN Priority [0-7]. 
Default: %d (For -V|--tx-vlan)\n" + " -G, --tx-dmac= Dest MAC addr of TX frame in aa:bb:cc:dd:ee:ff format (For -V|--tx-vlan)\n" + " -H, --tx-smac= Src MAC addr of TX frame in aa:bb:cc:dd:ee:ff format (For -V|--tx-vlan)\n" " -x, --extra-stats Display extra statistics.\n" " -Q, --quiet Do not display any stats.\n" " -a, --app-stats Display application (syscall) statistics.\n" @@ -1032,7 +1041,7 @@ static void parse_command_line(int argc, char **argv) opterr = 0; for (;;) { - c = getopt_long(argc, argv, "Frtli:q:pSNn:czf:muMd:b:C:s:P:VJ:K:xQaI:BR", + c = getopt_long(argc, argv, "Frtli:q:pSNn:czf:muMd:b:C:s:P:VJ:K:G:H:xQaI:BR", long_options, &option_index); if (c == -1) break; @@ -1122,6 +1131,22 @@ static void parse_command_line(int argc, char **argv) case 'K': opt_pkt_vlan_pri = atoi(optarg); break; + case 'G': + if (!ether_aton_r(optarg, + (struct ether_addr *)&opt_txdmac)) { + fprintf(stderr, "Invalid dmac address:%s\n", + optarg); + usage(basename(argv[0])); + } + break; + case 'H': + if (!ether_aton_r(optarg, + (struct ether_addr *)&opt_txsmac)) { + fprintf(stderr, "Invalid smac address:%s\n", + optarg); + usage(basename(argv[0])); + } + break; case 'x': opt_extra_stats = 1; break; From 5a3882542acda1ac5f0a22dddf7f7f8533d3a8cc Mon Sep 17 00:00:00 2001 From: Ong Boon Leong Date: Thu, 30 Dec 2021 11:54:43 +0800 Subject: [PATCH 26/41] samples/bpf: xdpsock: Add clockid selection support User specifies the clock selection by using -w CLOCK or --clock=CLOCK where CLOCK=[REALTIME, TAI, BOOTTIME, MONOTONIC]. The default CLOCK selection is MONOTONIC. The implementation of clock selection parsing is borrowed from iproute2/tc/q_taprio.c Signed-off-by: Ong Boon Leong Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211230035447.523177-4-boon.leong.ong@intel.com --- samples/bpf/xdpsock_user.c | 40 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index c9a8748a460f..e6e9a20375cb 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -114,6 +114,7 @@ static u32 opt_num_xsks = 1; static u32 prog_id; static bool opt_busy_poll; static bool opt_reduced_cap; +static clockid_t opt_clock = CLOCK_MONOTONIC; struct vlan_ethhdr { unsigned char h_dest[6]; @@ -178,15 +179,40 @@ struct xsk_socket_info { u32 outstanding_tx; }; +static const struct clockid_map { + const char *name; + clockid_t clockid; +} clockids_map[] = { + { "REALTIME", CLOCK_REALTIME }, + { "TAI", CLOCK_TAI }, + { "BOOTTIME", CLOCK_BOOTTIME }, + { "MONOTONIC", CLOCK_MONOTONIC }, + { NULL } +}; + static int num_socks; struct xsk_socket_info *xsks[MAX_SOCKS]; int sock; +static int get_clockid(clockid_t *id, const char *name) +{ + const struct clockid_map *clk; + + for (clk = clockids_map; clk->name; clk++) { + if (strcasecmp(clk->name, name) == 0) { + *id = clk->clockid; + return 0; + } + } + + return -1; +} + static unsigned long get_nsecs(void) { struct timespec ts; - clock_gettime(CLOCK_MONOTONIC, &ts); + clock_gettime(opt_clock, &ts); return ts.tv_sec * 1000000000UL + ts.tv_nsec; } @@ -965,6 +991,7 @@ static struct option long_options[] = { {"shared-umem", no_argument, 0, 'M'}, {"force", no_argument, 0, 'F'}, {"duration", required_argument, 0, 'd'}, + {"clock", required_argument, 0, 'w'}, {"batch-size", required_argument, 0, 'b'}, {"tx-pkt-count", required_argument, 0, 'C'}, {"tx-pkt-size", required_argument, 0, 's'}, @@ -1006,6 +1033,7 @@ static void usage(const char *prog) " -F, --force Force 
loading the XDP prog\n" " -d, --duration=n Duration in secs to run command.\n" " Default: forever.\n" + " -w, --clock=CLOCK Clock NAME (default MONOTONIC).\n" " -b, --batch-size=n Batch size for sending or receiving\n" " packets. Default: %d\n" " -C, --tx-pkt-count=n Number of packets to send.\n" @@ -1041,7 +1069,7 @@ static void parse_command_line(int argc, char **argv) opterr = 0; for (;;) { - c = getopt_long(argc, argv, "Frtli:q:pSNn:czf:muMd:b:C:s:P:VJ:K:G:H:xQaI:BR", + c = getopt_long(argc, argv, "Frtli:q:pSNn:w:czf:muMd:b:C:s:P:VJ:K:G:H:xQaI:BR", long_options, &option_index); if (c == -1) break; @@ -1075,6 +1103,14 @@ static void parse_command_line(int argc, char **argv) case 'n': opt_interval = atoi(optarg); break; + case 'w': + if (get_clockid(&opt_clock, optarg)) { + fprintf(stderr, + "ERROR: Invalid clock %s. Default to CLOCK_MONOTONIC.\n", + optarg); + opt_clock = CLOCK_MONOTONIC; + } + break; case 'z': opt_xdp_bind_flags |= XDP_ZEROCOPY; break; From fa0d27a1d5a8c1f07b0229348b0d178233694fbc Mon Sep 17 00:00:00 2001 From: Ong Boon Leong Date: Thu, 30 Dec 2021 11:54:44 +0800 Subject: [PATCH 27/41] samples/bpf: xdpsock: Add cyclic TX operation capability Tx cycle time is in micro-seconds unit. By combining the batch size (-b M) and Tx cycle time (-T|--tx-cycle N), xdpsock now can transmit batch-size of packets every N-us periodically. Cyclic TX operation is not applicable if --poll mode is used. To transmit 16 packets every 1ms cycle time for total of 100000 packets silently: $ xdpsock -i eth0 -T -N -z -T 1000 -b 16 -C 100000 To print cyclic TX schedule variance stats, use --app-stats|-a: $ xdpsock -i eth0 -T -N -z -T 1000 -b 16 -C 100000 -a sock0@eth0:0 txonly xdp-drv pps pkts 0.00 rx 0 0 tx 0 100000 calls/s count rx empty polls 0 0 fill fail polls 0 0 copy tx sendtos 0 0 tx wakeup sendtos 0 6254 opt polls 0 0 period min ave max cycle Cyclic TX 1000000 53507 75334 712642 6250 Signed-off-by: Ong Boon Leong Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211230035447.523177-5-boon.leong.ong@intel.com --- samples/bpf/xdpsock_user.c | 87 +++++++++++++++++++++++++++++++++++--- 1 file changed, 81 insertions(+), 6 deletions(-) diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index e6e9a20375cb..a2a42ec4b0e9 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -63,12 +63,19 @@ #define VLAN_VID__DEFAULT 1 #define VLAN_PRI__DEFAULT 0 +#define NSEC_PER_SEC 1000000000UL +#define NSEC_PER_USEC 1000 + typedef __u64 u64; typedef __u32 u32; typedef __u16 u16; typedef __u8 u8; static unsigned long prev_time; +static long tx_cycle_diff_min; +static long tx_cycle_diff_max; +static double tx_cycle_diff_ave; +static long tx_cycle_cnt; enum benchmark_type { BENCH_RXDROP = 0, @@ -115,6 +122,7 @@ static u32 prog_id; static bool opt_busy_poll; static bool opt_reduced_cap; static clockid_t opt_clock = CLOCK_MONOTONIC; +static unsigned long opt_tx_cycle_ns; struct vlan_ethhdr { unsigned char h_dest[6]; @@ -305,6 +313,15 @@ static void dump_app_stats(long dt) xsks[i]->app_stats.prev_tx_wakeup_sendtos = xsks[i]->app_stats.tx_wakeup_sendtos; xsks[i]->app_stats.prev_opt_polls = xsks[i]->app_stats.opt_polls; } + + if (opt_tx_cycle_ns) { + printf("\n%-18s %-10s %-10s %-10s %-10s %-10s\n", + "", "period", "min", "ave", "max", "cycle"); + printf("%-18s %-10lu %-10lu %-10lu %-10lu %-10lu\n", + "Cyclic TX", opt_tx_cycle_ns, tx_cycle_diff_min, + (long)(tx_cycle_diff_ave / tx_cycle_cnt), + tx_cycle_diff_max, tx_cycle_cnt); + } } static bool 
get_interrupt_number(void) @@ -1001,6 +1018,7 @@ static struct option long_options[] = { {"tx-vlan-pri", required_argument, 0, 'K'}, {"tx-dmac", required_argument, 0, 'G'}, {"tx-smac", required_argument, 0, 'H'}, + {"tx-cycle", required_argument, 0, 'T'}, {"extra-stats", no_argument, 0, 'x'}, {"quiet", no_argument, 0, 'Q'}, {"app-stats", no_argument, 0, 'a'}, @@ -1047,6 +1065,7 @@ static void usage(const char *prog) " -K, --tx-vlan-pri=n Tx VLAN Priority [0-7]. Default: %d (For -V|--tx-vlan)\n" " -G, --tx-dmac= Dest MAC addr of TX frame in aa:bb:cc:dd:ee:ff format (For -V|--tx-vlan)\n" " -H, --tx-smac= Src MAC addr of TX frame in aa:bb:cc:dd:ee:ff format (For -V|--tx-vlan)\n" + " -T, --tx-cycle=n Tx cycle time in micro-seconds (For -t|--txonly).\n" " -x, --extra-stats Display extra statistics.\n" " -Q, --quiet Do not display any stats.\n" " -a, --app-stats Display application (syscall) statistics.\n" @@ -1069,7 +1088,7 @@ static void parse_command_line(int argc, char **argv) opterr = 0; for (;;) { - c = getopt_long(argc, argv, "Frtli:q:pSNn:w:czf:muMd:b:C:s:P:VJ:K:G:H:xQaI:BR", + c = getopt_long(argc, argv, "Frtli:q:pSNn:w:czf:muMd:b:C:s:P:VJ:K:G:H:T:xQaI:BR", long_options, &option_index); if (c == -1) break; @@ -1183,6 +1202,10 @@ static void parse_command_line(int argc, char **argv) usage(basename(argv[0])); } break; + case 'T': + opt_tx_cycle_ns = atoi(optarg); + opt_tx_cycle_ns *= NSEC_PER_USEC; + break; case 'x': opt_extra_stats = 1; break; @@ -1388,7 +1411,7 @@ static void rx_drop_all(void) } } -static void tx_only(struct xsk_socket_info *xsk, u32 *frame_nb, int batch_size) +static int tx_only(struct xsk_socket_info *xsk, u32 *frame_nb, int batch_size) { u32 idx; unsigned int i; @@ -1397,7 +1420,7 @@ static void tx_only(struct xsk_socket_info *xsk, u32 *frame_nb, int batch_size) batch_size) { complete_tx_only(xsk, batch_size); if (benchmark_done) - return; + return 0; } for (i = 0; i < batch_size; i++) { @@ -1413,6 +1436,8 @@ static void tx_only(struct xsk_socket_info *xsk, u32 *frame_nb, int batch_size) *frame_nb += batch_size; *frame_nb %= NUM_FRAMES; complete_tx_only(xsk, batch_size); + + return batch_size; } static inline int get_batch_size(int pkt_cnt) @@ -1446,16 +1471,39 @@ static void tx_only_all(void) { struct pollfd fds[MAX_SOCKS] = {}; u32 frame_nb[MAX_SOCKS] = {}; + unsigned long next_tx_ns = 0; int pkt_cnt = 0; int i, ret; + if (opt_poll && opt_tx_cycle_ns) { + fprintf(stderr, + "Error: --poll and --tx-cycles are both set\n"); + return; + } + for (i = 0; i < num_socks; i++) { fds[0].fd = xsk_socket__fd(xsks[i]->xsk); fds[0].events = POLLOUT; } + if (opt_tx_cycle_ns) { + /* Align Tx time to micro-second boundary */ + next_tx_ns = (get_nsecs() / NSEC_PER_USEC + 1) * + NSEC_PER_USEC; + next_tx_ns += opt_tx_cycle_ns; + + /* Initialize periodic Tx scheduling variance */ + tx_cycle_diff_min = 1000000000; + tx_cycle_diff_max = 0; + tx_cycle_diff_ave = 0.0; + } + while ((opt_pkt_count && pkt_cnt < opt_pkt_count) || !opt_pkt_count) { int batch_size = get_batch_size(pkt_cnt); + struct timespec next; + int tx_cnt = 0; + long diff; + int err; if (opt_poll) { for (i = 0; i < num_socks; i++) @@ -1468,13 +1516,40 @@ static void tx_only_all(void) continue; } - for (i = 0; i < num_socks; i++) - tx_only(xsks[i], &frame_nb[i], batch_size); + if (opt_tx_cycle_ns) { + next.tv_sec = next_tx_ns / NSEC_PER_SEC; + next.tv_nsec = next_tx_ns % NSEC_PER_SEC; + err = clock_nanosleep(opt_clock, TIMER_ABSTIME, &next, NULL); + if (err) { + if (err != EINTR) + fprintf(stderr, + "clock_nanosleep failed. 
Err:%d errno:%d\n", + err, errno); + break; + } - pkt_cnt += batch_size; + /* Measure periodic Tx scheduling variance */ + diff = get_nsecs() - next_tx_ns; + if (diff < tx_cycle_diff_min) + tx_cycle_diff_min = diff; + + if (diff > tx_cycle_diff_max) + tx_cycle_diff_max = diff; + + tx_cycle_diff_ave += (double)diff; + tx_cycle_cnt++; + } + + for (i = 0; i < num_socks; i++) + tx_cnt += tx_only(xsks[i], &frame_nb[i], batch_size); + + pkt_cnt += tx_cnt; if (benchmark_done) break; + + if (opt_tx_cycle_ns) + next_tx_ns += opt_tx_cycle_ns; } if (opt_pkt_count) From fa24d0b1d57825d1a5b802339728d4d8ac20b6d6 Mon Sep 17 00:00:00 2001 From: Ong Boon Leong Date: Thu, 30 Dec 2021 11:54:45 +0800 Subject: [PATCH 28/41] samples/bpf: xdpsock: Add sched policy and priority support By default, TX schedule policy is SCHED_OTHER (round-robin time-sharing). To improve TX cyclic scheduling, we add SCHED_FIFO policy and its priority by using -W FIFO or --policy=FIFO and -U or --schpri=. A) From xdpsock --app-stats, for SCHED_OTHER policy: $ xdpsock -i eth0 -t -N -z -T 1000 -b 16 -C 100000 -a period min ave max cycle Cyclic TX 1000000 53507 75334 712642 6250 B) For SCHED_FIFO policy and schpri=50: $ xdpsock -i eth0 -t -N -z -T 1000 -b 16 -C 100000 -a -W FIFO -U 50 period min ave max cycle Cyclic TX 1000000 3699 24859 54397 6250 Signed-off-by: Ong Boon Leong Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211230035447.523177-6-boon.leong.ong@intel.com --- samples/bpf/xdpsock_user.c | 61 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 59 insertions(+), 2 deletions(-) diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index a2a42ec4b0e9..b7d0f536f974 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include @@ -66,6 +67,8 @@ #define NSEC_PER_SEC 1000000000UL #define NSEC_PER_USEC 1000 +#define SCHED_PRI__DEFAULT 0 + typedef __u64 u64; typedef __u32 u32; typedef __u16 u16; @@ -123,6 +126,8 @@ static bool opt_busy_poll; static bool opt_reduced_cap; static clockid_t opt_clock = CLOCK_MONOTONIC; static unsigned long opt_tx_cycle_ns; +static int opt_schpolicy = SCHED_OTHER; +static int opt_schprio = SCHED_PRI__DEFAULT; struct vlan_ethhdr { unsigned char h_dest[6]; @@ -198,6 +203,15 @@ static const struct clockid_map { { NULL } }; +static const struct sched_map { + const char *name; + int policy; +} schmap[] = { + { "OTHER", SCHED_OTHER }, + { "FIFO", SCHED_FIFO }, + { NULL } +}; + static int num_socks; struct xsk_socket_info *xsks[MAX_SOCKS]; int sock; @@ -216,6 +230,20 @@ static int get_clockid(clockid_t *id, const char *name) return -1; } +static int get_schpolicy(int *policy, const char *name) +{ + const struct sched_map *sch; + + for (sch = schmap; sch->name; sch++) { + if (strcasecmp(sch->name, name) == 0) { + *policy = sch->policy; + return 0; + } + } + + return -1; +} + static unsigned long get_nsecs(void) { struct timespec ts; @@ -1019,6 +1047,8 @@ static struct option long_options[] = { {"tx-dmac", required_argument, 0, 'G'}, {"tx-smac", required_argument, 0, 'H'}, {"tx-cycle", required_argument, 0, 'T'}, + {"policy", required_argument, 0, 'W'}, + {"schpri", required_argument, 0, 'U'}, {"extra-stats", no_argument, 0, 'x'}, {"quiet", no_argument, 0, 'Q'}, {"app-stats", no_argument, 0, 'a'}, @@ -1066,6 +1096,8 @@ static void usage(const char *prog) " -G, --tx-dmac= Dest MAC addr of TX frame in aa:bb:cc:dd:ee:ff format (For -V|--tx-vlan)\n" " -H, --tx-smac= Src MAC addr of TX frame in 
aa:bb:cc:dd:ee:ff format (For -V|--tx-vlan)\n" " -T, --tx-cycle=n Tx cycle time in micro-seconds (For -t|--txonly).\n" + " -W, --policy=POLICY Schedule policy. Default: SCHED_OTHER\n" + " -U, --schpri=n Schedule priority. Default: %d\n" " -x, --extra-stats Display extra statistics.\n" " -Q, --quiet Do not display any stats.\n" " -a, --app-stats Display application (syscall) statistics.\n" @@ -1076,7 +1108,8 @@ static void usage(const char *prog) fprintf(stderr, str, prog, XSK_UMEM__DEFAULT_FRAME_SIZE, opt_batch_size, MIN_PKT_SIZE, MIN_PKT_SIZE, XSK_UMEM__DEFAULT_FRAME_SIZE, opt_pkt_fill_pattern, - VLAN_VID__DEFAULT, VLAN_PRI__DEFAULT); + VLAN_VID__DEFAULT, VLAN_PRI__DEFAULT, + SCHED_PRI__DEFAULT); exit(EXIT_FAILURE); } @@ -1088,7 +1121,8 @@ static void parse_command_line(int argc, char **argv) opterr = 0; for (;;) { - c = getopt_long(argc, argv, "Frtli:q:pSNn:w:czf:muMd:b:C:s:P:VJ:K:G:H:T:xQaI:BR", + c = getopt_long(argc, argv, + "Frtli:q:pSNn:w:czf:muMd:b:C:s:P:VJ:K:G:H:T:W:U:xQaI:BR", long_options, &option_index); if (c == -1) break; @@ -1206,6 +1240,17 @@ static void parse_command_line(int argc, char **argv) opt_tx_cycle_ns = atoi(optarg); opt_tx_cycle_ns *= NSEC_PER_USEC; break; + case 'W': + if (get_schpolicy(&opt_schpolicy, optarg)) { + fprintf(stderr, + "ERROR: Invalid policy %s. Default to SCHED_OTHER.\n", + optarg); + opt_schpolicy = SCHED_OTHER; + } + break; + case 'U': + opt_schprio = atoi(optarg); + break; case 'x': opt_extra_stats = 1; break; @@ -1780,6 +1825,7 @@ int main(int argc, char **argv) struct __user_cap_data_struct data[2] = { { 0 } }; struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; bool rx = false, tx = false; + struct sched_param schparam; struct xsk_umem_info *umem; struct bpf_object *obj; int xsks_map_fd = 0; @@ -1881,6 +1927,16 @@ int main(int argc, char **argv) prev_time = get_nsecs(); start_time = prev_time; + /* Configure sched priority for better wake-up accuracy */ + memset(&schparam, 0, sizeof(schparam)); + schparam.sched_priority = opt_schprio; + ret = sched_setscheduler(0, opt_schpolicy, &schparam); + if (ret) { + fprintf(stderr, "Error(%d) in setting priority(%d): %s\n", + errno, opt_schprio, strerror(errno)); + goto out; + } + if (opt_bench == BENCH_RXDROP) rx_drop_all(); else if (opt_bench == BENCH_TXONLY) @@ -1888,6 +1944,7 @@ int main(int argc, char **argv) else l2fwd_all(); +out: benchmark_done = true; if (!opt_quiet) From 8121e78932018df48758985e00651e16ff34ae5f Mon Sep 17 00:00:00 2001 From: Ong Boon Leong Date: Thu, 30 Dec 2021 11:54:46 +0800 Subject: [PATCH 29/41] samples/bpf: xdpsock: Add time-out for cleaning Tx When user sets tx-pkt-count and in case where there are invalid Tx frame, the complete_tx_only_all() process polls indefinitely. So, this patch adds a time-out mechanism into the process so that the application can terminate automatically after it retries 3*polling interval duration. v1->v2: Thanks to Jesper's and Song Liu's suggestion. 
- clean-up git message to remove polling log - make the Tx time-out retries configurable with 1s granularity Signed-off-by: Ong Boon Leong Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211230035447.523177-7-boon.leong.ong@intel.com --- samples/bpf/xdpsock_user.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index b7d0f536f974..319cb3cdb226 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -113,6 +113,7 @@ static u32 irq_no; static int irqs_at_init = -1; static int opt_poll; static int opt_interval = 1; +static int opt_retries = 3; static u32 opt_xdp_bind_flags = XDP_USE_NEED_WAKEUP; static u32 opt_umem_flags; static int opt_unaligned_chunks; @@ -1028,6 +1029,7 @@ static struct option long_options[] = { {"xdp-skb", no_argument, 0, 'S'}, {"xdp-native", no_argument, 0, 'N'}, {"interval", required_argument, 0, 'n'}, + {"retries", required_argument, 0, 'O'}, {"zero-copy", no_argument, 0, 'z'}, {"copy", no_argument, 0, 'c'}, {"frame-size", required_argument, 0, 'f'}, @@ -1072,6 +1074,7 @@ static void usage(const char *prog) " -S, --xdp-skb=n Use XDP skb-mod\n" " -N, --xdp-native=n Enforce XDP native mode\n" " -n, --interval=n Specify statistics update interval (default 1 sec).\n" + " -O, --retries=n Specify time-out retries (1s interval) attempt (default 3).\n" " -z, --zero-copy Force zero-copy mode.\n" " -c, --copy Force copy mode.\n" " -m, --no-need-wakeup Turn off use of driver need wakeup flag.\n" @@ -1122,7 +1125,7 @@ static void parse_command_line(int argc, char **argv) for (;;) { c = getopt_long(argc, argv, - "Frtli:q:pSNn:w:czf:muMd:b:C:s:P:VJ:K:G:H:T:W:U:xQaI:BR", + "Frtli:q:pSNn:w:O:czf:muMd:b:C:s:P:VJ:K:G:H:T:W:U:xQaI:BR", long_options, &option_index); if (c == -1) break; @@ -1164,6 +1167,9 @@ static void parse_command_line(int argc, char **argv) opt_clock = CLOCK_MONOTONIC; } break; + case 'O': + opt_retries = atoi(optarg); + break; case 'z': opt_xdp_bind_flags |= XDP_ZEROCOPY; break; @@ -1509,7 +1515,8 @@ static void complete_tx_only_all(void) pending = !!xsks[i]->outstanding_tx; } } - } while (pending); + sleep(1); + } while (pending && opt_retries-- > 0); } static void tx_only_all(void) From eb68db45b747756c351ea84e9af55a69468d0549 Mon Sep 17 00:00:00 2001 From: Ong Boon Leong Date: Thu, 30 Dec 2021 11:54:47 +0800 Subject: [PATCH 30/41] samples/bpf: xdpsock: Add timestamp for Tx-only operation It may be useful to add timestamp for Tx packets for continuous or cyclic transmit operation. The timestamp and sequence ID of a Tx packet are stored according to pktgen header format. To enable per-packet timestamp, use -y|--tstamp option. If timestamp is off, pktgen header is not included in the UDP payload. This means receiving side can use the magic number for pktgen for differentiation. The implementation supports both VLAN tagged and untagged option. By default, the minimum packet size is set at 64B. However, if VLAN tagged is on (-V), the minimum packet size is increased to 66B just so to fit the pktgen_hdr size. Added hex_dump() into the code path just for future cross-checking. As before, simply change to "#define DEBUG_HEXDUMP 1" to inspect the accuracy of TX packet. 
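For illustration, a receiving application could detect the pktgen header in the UDP payload and recover the Tx timestamp roughly as follows. This is a sketch only; the field layout mirrors the struct added in this patch, 0xbe9be955 is the pktgen magic it uses, and decode_pktgen() is a hypothetical helper:

  #include <stdint.h>
  #include <stdbool.h>
  #include <arpa/inet.h>

  struct pktgen_hdr {            /* all fields big-endian on the wire */
          uint32_t pgh_magic;
          uint32_t seq_num;
          uint32_t tv_sec;
          uint32_t tv_usec;
  };

  /* payload points at the start of the UDP payload of a received frame. */
  static bool decode_pktgen(const void *payload, uint32_t *seq, uint64_t *tx_ns)
  {
          const struct pktgen_hdr *pgh = payload;

          if (ntohl(pgh->pgh_magic) != 0xbe9be955)
                  return false;
          *seq = ntohl(pgh->seq_num);
          *tx_ns = (uint64_t)ntohl(pgh->tv_sec) * 1000000000ULL +
                   (uint64_t)ntohl(pgh->tv_usec) * 1000ULL;
          return true;
  }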
Signed-off-by: Ong Boon Leong Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211230035447.523177-8-boon.leong.ong@intel.com --- samples/bpf/xdpsock_user.c | 77 +++++++++++++++++++++++++++++++++----- 1 file changed, 68 insertions(+), 9 deletions(-) diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index 319cb3cdb226..aa50864e4415 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -111,6 +111,7 @@ static bool opt_app_stats; static const char *opt_irq_str = ""; static u32 irq_no; static int irqs_at_init = -1; +static u32 sequence; static int opt_poll; static int opt_interval = 1; static int opt_retries = 3; @@ -129,6 +130,7 @@ static clockid_t opt_clock = CLOCK_MONOTONIC; static unsigned long opt_tx_cycle_ns; static int opt_schpolicy = SCHED_OTHER; static int opt_schprio = SCHED_PRI__DEFAULT; +static bool opt_tstamp; struct vlan_ethhdr { unsigned char h_dest[6]; @@ -138,6 +140,14 @@ struct vlan_ethhdr { __be16 h_vlan_encapsulated_proto; }; +#define PKTGEN_MAGIC 0xbe9be955 +struct pktgen_hdr { + __be32 pgh_magic; + __be32 seq_num; + __be32 tv_sec; + __be32 tv_usec; +}; + struct xsk_ring_stats { unsigned long rx_npkts; unsigned long tx_npkts; @@ -836,18 +846,25 @@ static inline u16 udp_csum(u32 saddr, u32 daddr, u32 len, #define ETH_HDR_SIZE (opt_vlan_tag ? sizeof(struct vlan_ethhdr) : \ sizeof(struct ethhdr)) +#define PKTGEN_HDR_SIZE (opt_tstamp ? sizeof(struct pktgen_hdr) : 0) #define PKT_HDR_SIZE (ETH_HDR_SIZE + sizeof(struct iphdr) + \ - sizeof(struct udphdr)) + sizeof(struct udphdr) + PKTGEN_HDR_SIZE) +#define PKTGEN_HDR_OFFSET (ETH_HDR_SIZE + sizeof(struct iphdr) + \ + sizeof(struct udphdr)) +#define PKTGEN_SIZE_MIN (PKTGEN_HDR_OFFSET + sizeof(struct pktgen_hdr) + \ + ETH_FCS_SIZE) #define PKT_SIZE (opt_pkt_size - ETH_FCS_SIZE) #define IP_PKT_SIZE (PKT_SIZE - ETH_HDR_SIZE) #define UDP_PKT_SIZE (IP_PKT_SIZE - sizeof(struct iphdr)) -#define UDP_PKT_DATA_SIZE (UDP_PKT_SIZE - sizeof(struct udphdr)) +#define UDP_PKT_DATA_SIZE (UDP_PKT_SIZE - \ + (sizeof(struct udphdr) + PKTGEN_HDR_SIZE)) static u8 pkt_data[XSK_UMEM__DEFAULT_FRAME_SIZE]; static void gen_eth_hdr_data(void) { + struct pktgen_hdr *pktgen_hdr; struct udphdr *udp_hdr; struct iphdr *ip_hdr; @@ -860,7 +877,10 @@ static void gen_eth_hdr_data(void) sizeof(struct iphdr)); ip_hdr = (struct iphdr *)(pkt_data + sizeof(struct vlan_ethhdr)); - + pktgen_hdr = (struct pktgen_hdr *)(pkt_data + + sizeof(struct vlan_ethhdr) + + sizeof(struct iphdr) + + sizeof(struct udphdr)); /* ethernet & VLAN header */ memcpy(veth_hdr->h_dest, &opt_txdmac, ETH_ALEN); memcpy(veth_hdr->h_source, &opt_txsmac, ETH_ALEN); @@ -877,7 +897,10 @@ static void gen_eth_hdr_data(void) sizeof(struct iphdr)); ip_hdr = (struct iphdr *)(pkt_data + sizeof(struct ethhdr)); - + pktgen_hdr = (struct pktgen_hdr *)(pkt_data + + sizeof(struct ethhdr) + + sizeof(struct iphdr) + + sizeof(struct udphdr)); /* ethernet header */ memcpy(eth_hdr->h_dest, &opt_txdmac, ETH_ALEN); memcpy(eth_hdr->h_source, &opt_txsmac, ETH_ALEN); @@ -906,6 +929,9 @@ static void gen_eth_hdr_data(void) udp_hdr->dest = htons(0x1000); udp_hdr->len = htons(UDP_PKT_SIZE); + if (opt_tstamp) + pktgen_hdr->pgh_magic = htonl(PKTGEN_MAGIC); + /* UDP data */ memset32_htonl(pkt_data + PKT_HDR_SIZE, opt_pkt_fill_pattern, UDP_PKT_DATA_SIZE); @@ -1049,6 +1075,7 @@ static struct option long_options[] = { {"tx-dmac", required_argument, 0, 'G'}, {"tx-smac", required_argument, 0, 'H'}, {"tx-cycle", required_argument, 0, 'T'}, + {"tstamp", no_argument, 0, 'y'}, 
{"policy", required_argument, 0, 'W'}, {"schpri", required_argument, 0, 'U'}, {"extra-stats", no_argument, 0, 'x'}, @@ -1099,6 +1126,7 @@ static void usage(const char *prog) " -G, --tx-dmac= Dest MAC addr of TX frame in aa:bb:cc:dd:ee:ff format (For -V|--tx-vlan)\n" " -H, --tx-smac= Src MAC addr of TX frame in aa:bb:cc:dd:ee:ff format (For -V|--tx-vlan)\n" " -T, --tx-cycle=n Tx cycle time in micro-seconds (For -t|--txonly).\n" + " -y, --tstamp Add time-stamp to packet (For -t|--txonly).\n" " -W, --policy=POLICY Schedule policy. Default: SCHED_OTHER\n" " -U, --schpri=n Schedule priority. Default: %d\n" " -x, --extra-stats Display extra statistics.\n" @@ -1125,7 +1153,7 @@ static void parse_command_line(int argc, char **argv) for (;;) { c = getopt_long(argc, argv, - "Frtli:q:pSNn:w:O:czf:muMd:b:C:s:P:VJ:K:G:H:T:W:U:xQaI:BR", + "Frtli:q:pSNn:w:O:czf:muMd:b:C:s:P:VJ:K:G:H:T:yW:U:xQaI:BR", long_options, &option_index); if (c == -1) break; @@ -1246,6 +1274,9 @@ static void parse_command_line(int argc, char **argv) opt_tx_cycle_ns = atoi(optarg); opt_tx_cycle_ns *= NSEC_PER_USEC; break; + case 'y': + opt_tstamp = 1; + break; case 'W': if (get_schpolicy(&opt_schpolicy, optarg)) { fprintf(stderr, @@ -1462,9 +1493,10 @@ static void rx_drop_all(void) } } -static int tx_only(struct xsk_socket_info *xsk, u32 *frame_nb, int batch_size) +static int tx_only(struct xsk_socket_info *xsk, u32 *frame_nb, + int batch_size, unsigned long tx_ns) { - u32 idx; + u32 idx, tv_sec, tv_usec; unsigned int i; while (xsk_ring_prod__reserve(&xsk->tx, batch_size, &idx) < @@ -1474,11 +1506,31 @@ static int tx_only(struct xsk_socket_info *xsk, u32 *frame_nb, int batch_size) return 0; } + if (opt_tstamp) { + tv_sec = (u32)(tx_ns / NSEC_PER_SEC); + tv_usec = (u32)((tx_ns % NSEC_PER_SEC) / 1000); + } + for (i = 0; i < batch_size; i++) { struct xdp_desc *tx_desc = xsk_ring_prod__tx_desc(&xsk->tx, idx + i); tx_desc->addr = (*frame_nb + i) * opt_xsk_frame_size; tx_desc->len = PKT_SIZE; + + if (opt_tstamp) { + struct pktgen_hdr *pktgen_hdr; + u64 addr = tx_desc->addr; + char *pkt; + + pkt = xsk_umem__get_data(xsk->umem->buffer, addr); + pktgen_hdr = (struct pktgen_hdr *)(pkt + PKTGEN_HDR_OFFSET); + + pktgen_hdr->seq_num = htonl(sequence++); + pktgen_hdr->tv_sec = htonl(tv_sec); + pktgen_hdr->tv_usec = htonl(tv_usec); + + hex_dump(pkt, PKT_SIZE, addr); + } } xsk_ring_prod__submit(&xsk->tx, batch_size); @@ -1552,6 +1604,7 @@ static void tx_only_all(void) while ((opt_pkt_count && pkt_cnt < opt_pkt_count) || !opt_pkt_count) { int batch_size = get_batch_size(pkt_cnt); + unsigned long tx_ns = 0; struct timespec next; int tx_cnt = 0; long diff; @@ -1581,7 +1634,8 @@ static void tx_only_all(void) } /* Measure periodic Tx scheduling variance */ - diff = get_nsecs() - next_tx_ns; + tx_ns = get_nsecs(); + diff = tx_ns - next_tx_ns; if (diff < tx_cycle_diff_min) tx_cycle_diff_min = diff; @@ -1590,10 +1644,12 @@ static void tx_only_all(void) tx_cycle_diff_ave += (double)diff; tx_cycle_cnt++; + } else if (opt_tstamp) { + tx_ns = get_nsecs(); } for (i = 0; i < num_socks; i++) - tx_cnt += tx_only(xsks[i], &frame_nb[i], batch_size); + tx_cnt += tx_only(xsks[i], &frame_nb[i], batch_size, tx_ns); pkt_cnt += tx_cnt; @@ -1895,6 +1951,9 @@ int main(int argc, char **argv) apply_setsockopt(xsks[i]); if (opt_bench == BENCH_TXONLY) { + if (opt_tstamp && opt_pkt_size < PKTGEN_SIZE_MIN) + opt_pkt_size = PKTGEN_SIZE_MIN; + gen_eth_hdr_data(); for (i = 0; i < NUM_FRAMES; i++) From 4a48ef70b93b8c7ed5190adfca18849e76387b80 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Mon, 3 Jan 2022 16:08:06 +0100 Subject: [PATCH 31/41] xdp: Allow registering memory model without rxq reference MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The functions that register an XDP memory model take a struct xdp_rxq as parameter, but the RXQ is not actually used for anything other than pulling out the struct xdp_mem_info that it embeds. So refactor the register functions and export variants that just take a pointer to the xdp_mem_info. This is in preparation for enabling XDP_REDIRECT in bpf_prog_run(), using a page_pool instance that is not connected to any network device. Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220103150812.87914-2-toke@redhat.com --- include/net/xdp.h | 3 ++ net/core/xdp.c | 92 +++++++++++++++++++++++++++++++---------------- 2 files changed, 65 insertions(+), 30 deletions(-) diff --git a/include/net/xdp.h b/include/net/xdp.h index 447f9b1578f3..8f0812e4996d 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -260,6 +260,9 @@ bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq); int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, enum xdp_mem_type type, void *allocator); void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq); +int xdp_reg_mem_model(struct xdp_mem_info *mem, + enum xdp_mem_type type, void *allocator); +void xdp_unreg_mem_model(struct xdp_mem_info *mem); /* Drivers not supporting XDP metadata can use this helper, which * rejects any room expansion for metadata as a result. diff --git a/net/core/xdp.c b/net/core/xdp.c index 7fe1df85f505..58089f6d2c7a 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -110,20 +110,15 @@ static void mem_allocator_disconnect(void *allocator) mutex_unlock(&mem_id_lock); } -void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) +void xdp_unreg_mem_model(struct xdp_mem_info *mem) { struct xdp_mem_allocator *xa; - int type = xdp_rxq->mem.type; - int id = xdp_rxq->mem.id; + int type = mem->type; + int id = mem->id; /* Reset mem info to defaults */ - xdp_rxq->mem.id = 0; - xdp_rxq->mem.type = 0; - - if (xdp_rxq->reg_state != REG_STATE_REGISTERED) { - WARN(1, "Missing register, driver bug"); - return; - } + mem->id = 0; + mem->type = 0; if (id == 0) return; @@ -135,6 +130,17 @@ void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) rcu_read_unlock(); } } +EXPORT_SYMBOL_GPL(xdp_unreg_mem_model); + +void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) +{ + if (xdp_rxq->reg_state != REG_STATE_REGISTERED) { + WARN(1, "Missing register, driver bug"); + return; + } + + xdp_unreg_mem_model(&xdp_rxq->mem); +} EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg_mem_model); void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq) @@ -259,28 +265,24 @@ static bool __is_supported_mem_type(enum xdp_mem_type type) return true; } -int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, - enum xdp_mem_type type, void *allocator) +static struct xdp_mem_allocator *__xdp_reg_mem_model(struct xdp_mem_info *mem, + enum xdp_mem_type type, + void *allocator) { struct xdp_mem_allocator *xdp_alloc; gfp_t gfp = GFP_KERNEL; int id, errno, ret; void *ptr; - if (xdp_rxq->reg_state != REG_STATE_REGISTERED) { - WARN(1, "Missing register, driver bug"); - return -EFAULT; - } - if (!__is_supported_mem_type(type)) - return -EOPNOTSUPP; + return ERR_PTR(-EOPNOTSUPP); - xdp_rxq->mem.type = type; + mem->type = type; if (!allocator) { if (type == 
MEM_TYPE_PAGE_POOL) - return -EINVAL; /* Setup time check page_pool req */ - return 0; + return ERR_PTR(-EINVAL); /* Setup time check page_pool req */ + return NULL; } /* Delay init of rhashtable to save memory if feature isn't used */ @@ -290,13 +292,13 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, mutex_unlock(&mem_id_lock); if (ret < 0) { WARN_ON(1); - return ret; + return ERR_PTR(ret); } } xdp_alloc = kzalloc(sizeof(*xdp_alloc), gfp); if (!xdp_alloc) - return -ENOMEM; + return ERR_PTR(-ENOMEM); mutex_lock(&mem_id_lock); id = __mem_id_cyclic_get(gfp); @@ -304,15 +306,15 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, errno = id; goto err; } - xdp_rxq->mem.id = id; - xdp_alloc->mem = xdp_rxq->mem; + mem->id = id; + xdp_alloc->mem = *mem; xdp_alloc->allocator = allocator; /* Insert allocator into ID lookup table */ ptr = rhashtable_insert_slow(mem_id_ht, &id, &xdp_alloc->node); if (IS_ERR(ptr)) { - ida_simple_remove(&mem_id_pool, xdp_rxq->mem.id); - xdp_rxq->mem.id = 0; + ida_simple_remove(&mem_id_pool, mem->id); + mem->id = 0; errno = PTR_ERR(ptr); goto err; } @@ -322,13 +324,43 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, mutex_unlock(&mem_id_lock); - trace_mem_connect(xdp_alloc, xdp_rxq); - return 0; + return xdp_alloc; err: mutex_unlock(&mem_id_lock); kfree(xdp_alloc); - return errno; + return ERR_PTR(errno); } + +int xdp_reg_mem_model(struct xdp_mem_info *mem, + enum xdp_mem_type type, void *allocator) +{ + struct xdp_mem_allocator *xdp_alloc; + + xdp_alloc = __xdp_reg_mem_model(mem, type, allocator); + if (IS_ERR(xdp_alloc)) + return PTR_ERR(xdp_alloc); + return 0; +} +EXPORT_SYMBOL_GPL(xdp_reg_mem_model); + +int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, + enum xdp_mem_type type, void *allocator) +{ + struct xdp_mem_allocator *xdp_alloc; + + if (xdp_rxq->reg_state != REG_STATE_REGISTERED) { + WARN(1, "Missing register, driver bug"); + return -EFAULT; + } + + xdp_alloc = __xdp_reg_mem_model(&xdp_rxq->mem, type, allocator); + if (IS_ERR(xdp_alloc)) + return PTR_ERR(xdp_alloc); + + trace_mem_connect(xdp_alloc, xdp_rxq); + return 0; +} + EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model); /* XDP RX runs under NAPI protection, and in different delivery error From 35b2e549894b7ef0b6e7f3a70c2ab75b767cfce9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Mon, 3 Jan 2022 16:08:07 +0100 Subject: [PATCH 32/41] page_pool: Add callback to init pages when they are allocated MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a new callback function to page_pool that, if set, will be called every time a new page is allocated. This will be used from bpf_test_run() to initialise the page data with the data provided by userspace when running XDP programs with redirect turned on. 
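As a sketch of how a page_pool creator might wire up the two new fields (hypothetical caller-side snippet; dev, template_buf and TEMPLATE_LEN are placeholders and not part of this patch):

  static void init_page_data(struct page *page, void *arg)
  {
          /* Seed every freshly allocated page with a caller-provided template. */
          memcpy(page_address(page), arg, TEMPLATE_LEN);
  }

  struct page_pool_params pp_params = {
          .order         = 0,
          .pool_size     = 256,
          .nid           = NUMA_NO_NODE,
          .dev           = dev,
          .dma_dir       = DMA_BIDIRECTIONAL,
          .init_callback = init_page_data,
          .init_arg      = template_buf,
  };
  struct page_pool *pp = page_pool_create(&pp_params);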
Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/bpf/20220103150812.87914-3-toke@redhat.com --- include/net/page_pool.h | 2 ++ net/core/page_pool.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/include/net/page_pool.h b/include/net/page_pool.h index a4082406a003..d807b6800a4a 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -80,6 +80,8 @@ struct page_pool_params { enum dma_data_direction dma_dir; /* DMA mapping direction */ unsigned int max_len; /* max DMA sync memory size */ unsigned int offset; /* DMA addr offset */ + void (*init_callback)(struct page *page, void *arg); + void *init_arg; }; struct page_pool { diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 1a6978427d6c..f53786f6666d 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -217,6 +217,8 @@ static void page_pool_set_pp_info(struct page_pool *pool, { page->pp = pool; page->pp_magic |= PP_SIGNATURE; + if (pool->p.init_callback) + pool->p.init_callback(page, pool->p.init_arg); } static void page_pool_clear_pp_info(struct page *page) From 64693ec7774e471f817a725686d93903e919a2e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Mon, 3 Jan 2022 16:08:08 +0100 Subject: [PATCH 33/41] page_pool: Store the XDP mem id MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Store the XDP mem ID inside the page_pool struct so it can be retrieved later for use in bpf_prog_run(). Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/bpf/20220103150812.87914-4-toke@redhat.com --- include/net/page_pool.h | 9 +++++++-- net/core/page_pool.c | 4 +++- net/core/xdp.c | 2 +- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/include/net/page_pool.h b/include/net/page_pool.h index d807b6800a4a..79a805542d0f 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -96,6 +96,7 @@ struct page_pool { unsigned int frag_offset; struct page *frag_page; long frag_users; + u32 xdp_mem_id; /* * Data structure for allocation side @@ -170,9 +171,12 @@ bool page_pool_return_skb_page(struct page *page); struct page_pool *page_pool_create(const struct page_pool_params *params); +struct xdp_mem_info; + #ifdef CONFIG_PAGE_POOL void page_pool_destroy(struct page_pool *pool); -void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *)); +void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *), + struct xdp_mem_info *mem); void page_pool_release_page(struct page_pool *pool, struct page *page); void page_pool_put_page_bulk(struct page_pool *pool, void **data, int count); @@ -182,7 +186,8 @@ static inline void page_pool_destroy(struct page_pool *pool) } static inline void page_pool_use_xdp_mem(struct page_pool *pool, - void (*disconnect)(void *)) + void (*disconnect)(void *), + struct xdp_mem_info *mem) { } static inline void page_pool_release_page(struct page_pool *pool, diff --git a/net/core/page_pool.c b/net/core/page_pool.c index f53786f6666d..7347d5c7dbe0 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -693,10 +693,12 @@ static void page_pool_release_retry(struct work_struct *wq) schedule_delayed_work(&pool->release_dw, DEFER_TIME); } -void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *)) +void page_pool_use_xdp_mem(struct page_pool *pool, void 
(*disconnect)(void *), + struct xdp_mem_info *mem) { refcount_inc(&pool->user_cnt); pool->disconnect = disconnect; + pool->xdp_mem_id = mem->id; } void page_pool_destroy(struct page_pool *pool) diff --git a/net/core/xdp.c b/net/core/xdp.c index 58089f6d2c7a..7aba35504986 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -320,7 +320,7 @@ static struct xdp_mem_allocator *__xdp_reg_mem_model(struct xdp_mem_info *mem, } if (type == MEM_TYPE_PAGE_POOL) - page_pool_use_xdp_mem(allocator, mem_allocator_disconnect); + page_pool_use_xdp_mem(allocator, mem_allocator_disconnect, mem); mutex_unlock(&mem_id_lock); From d53ad5d8b218a885e95080d4d3d556b16b91b1b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Mon, 3 Jan 2022 16:08:09 +0100 Subject: [PATCH 34/41] xdp: Move conversion to xdp_frame out of map functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All map redirect functions except XSK maps convert xdp_buff to xdp_frame before enqueueing it. So move this conversion of out the map functions and into xdp_do_redirect(). This removes a bit of duplicated code, but more importantly it makes it possible to support caller-allocated xdp_frame structures, which will be added in a subsequent commit. Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220103150812.87914-5-toke@redhat.com --- include/linux/bpf.h | 20 ++++++++++---------- kernel/bpf/cpumap.c | 8 +------- kernel/bpf/devmap.c | 32 +++++++++++--------------------- net/core/filter.c | 24 +++++++++++++++++------- 4 files changed, 39 insertions(+), 45 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 26753139d5b4..6e947cd91152 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1669,17 +1669,17 @@ void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth); struct btf *bpf_get_btf_vmlinux(void); /* Map specifics */ -struct xdp_buff; +struct xdp_frame; struct sk_buff; struct bpf_dtab_netdev; struct bpf_cpu_map_entry; void __dev_flush(void); -int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, +int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx); -int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, +int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf, struct net_device *dev_rx); -int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx, +int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx, struct bpf_map *map, bool exclude_ingress); int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, struct bpf_prog *xdp_prog); @@ -1688,7 +1688,7 @@ int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb, bool exclude_ingress); void __cpu_map_flush(void); -int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, +int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf, struct net_device *dev_rx); int cpu_map_generic_redirect(struct bpf_cpu_map_entry *rcpu, struct sk_buff *skb); @@ -1866,26 +1866,26 @@ static inline void __dev_flush(void) { } -struct xdp_buff; +struct xdp_frame; struct bpf_dtab_netdev; struct bpf_cpu_map_entry; static inline -int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, +int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx) { return 0; } static inline -int dev_map_enqueue(struct bpf_dtab_netdev *dst, 
struct xdp_buff *xdp, +int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf, struct net_device *dev_rx) { return 0; } static inline -int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx, +int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx, struct bpf_map *map, bool exclude_ingress) { return 0; @@ -1913,7 +1913,7 @@ static inline void __cpu_map_flush(void) } static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, - struct xdp_buff *xdp, + struct xdp_frame *xdpf, struct net_device *dev_rx) { return 0; diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 0421061d95f1..b3e6b9422238 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -746,15 +746,9 @@ static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) list_add(&bq->flush_node, flush_list); } -int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, +int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf, struct net_device *dev_rx) { - struct xdp_frame *xdpf; - - xdpf = xdp_convert_buff_to_frame(xdp); - if (unlikely(!xdpf)) - return -EOVERFLOW; - /* Info needed when constructing SKB on remote CPU */ xdpf->dev_rx = dev_rx; diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 6feea293ff10..fe019dbdb3f0 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -467,24 +467,19 @@ static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf, bq->q[bq->count++] = xdpf; } -static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, +static inline int __xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx, struct bpf_prog *xdp_prog) { - struct xdp_frame *xdpf; int err; if (!dev->netdev_ops->ndo_xdp_xmit) return -EOPNOTSUPP; - err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data); + err = xdp_ok_fwd_dev(dev, xdpf->len); if (unlikely(err)) return err; - xdpf = xdp_convert_buff_to_frame(xdp); - if (unlikely(!xdpf)) - return -EOVERFLOW; - bq_enqueue(dev, xdpf, dev_rx, xdp_prog); return 0; } @@ -520,27 +515,27 @@ static u32 dev_map_bpf_prog_run_skb(struct sk_buff *skb, struct bpf_dtab_netdev return act; } -int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, +int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx) { - return __xdp_enqueue(dev, xdp, dev_rx, NULL); + return __xdp_enqueue(dev, xdpf, dev_rx, NULL); } -int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, +int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf, struct net_device *dev_rx) { struct net_device *dev = dst->dev; - return __xdp_enqueue(dev, xdp, dev_rx, dst->xdp_prog); + return __xdp_enqueue(dev, xdpf, dev_rx, dst->xdp_prog); } -static bool is_valid_dst(struct bpf_dtab_netdev *obj, struct xdp_buff *xdp) +static bool is_valid_dst(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf) { if (!obj || !obj->dev->netdev_ops->ndo_xdp_xmit) return false; - if (xdp_ok_fwd_dev(obj->dev, xdp->data_end - xdp->data)) + if (xdp_ok_fwd_dev(obj->dev, xdpf->len)) return false; return true; @@ -586,14 +581,13 @@ static int get_upper_ifindexes(struct net_device *dev, int *indexes) return n; } -int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx, +int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx, struct bpf_map *map, bool exclude_ingress) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *dst, *last_dst = NULL; 
int excluded_devices[1+MAX_NEST_DEV]; struct hlist_head *head; - struct xdp_frame *xdpf; int num_excluded = 0; unsigned int i; int err; @@ -603,15 +597,11 @@ int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx, excluded_devices[num_excluded++] = dev_rx->ifindex; } - xdpf = xdp_convert_buff_to_frame(xdp); - if (unlikely(!xdpf)) - return -EOVERFLOW; - if (map->map_type == BPF_MAP_TYPE_DEVMAP) { for (i = 0; i < map->max_entries; i++) { dst = rcu_dereference_check(dtab->netdev_map[i], rcu_read_lock_bh_held()); - if (!is_valid_dst(dst, xdp)) + if (!is_valid_dst(dst, xdpf)) continue; if (is_ifindex_excluded(excluded_devices, num_excluded, dst->dev->ifindex)) @@ -634,7 +624,7 @@ int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx, head = dev_map_index_hash(dtab, i); hlist_for_each_entry_rcu(dst, head, index_hlist, lockdep_is_held(&dtab->index_lock)) { - if (!is_valid_dst(dst, xdp)) + if (!is_valid_dst(dst, xdpf)) continue; if (is_ifindex_excluded(excluded_devices, num_excluded, diff --git a/net/core/filter.c b/net/core/filter.c index cac2be559ab0..e2b83056246c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3964,12 +3964,24 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, enum bpf_map_type map_type = ri->map_type; void *fwd = ri->tgt_value; u32 map_id = ri->map_id; + struct xdp_frame *xdpf; struct bpf_map *map; int err; ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */ ri->map_type = BPF_MAP_TYPE_UNSPEC; + if (map_type == BPF_MAP_TYPE_XSKMAP) { + err = __xsk_map_redirect(fwd, xdp); + goto out; + } + + xdpf = xdp_convert_buff_to_frame(xdp); + if (unlikely(!xdpf)) { + err = -EOVERFLOW; + goto err; + } + switch (map_type) { case BPF_MAP_TYPE_DEVMAP: fallthrough; @@ -3977,17 +3989,14 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, map = READ_ONCE(ri->map); if (unlikely(map)) { WRITE_ONCE(ri->map, NULL); - err = dev_map_enqueue_multi(xdp, dev, map, + err = dev_map_enqueue_multi(xdpf, dev, map, ri->flags & BPF_F_EXCLUDE_INGRESS); } else { - err = dev_map_enqueue(fwd, xdp, dev); + err = dev_map_enqueue(fwd, xdpf, dev); } break; case BPF_MAP_TYPE_CPUMAP: - err = cpu_map_enqueue(fwd, xdp, dev); - break; - case BPF_MAP_TYPE_XSKMAP: - err = __xsk_map_redirect(fwd, xdp); + err = cpu_map_enqueue(fwd, xdpf, dev); break; case BPF_MAP_TYPE_UNSPEC: if (map_id == INT_MAX) { @@ -3996,7 +4005,7 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, err = -EINVAL; break; } - err = dev_xdp_enqueue(fwd, xdp, dev); + err = dev_xdp_enqueue(fwd, xdpf, dev); break; } fallthrough; @@ -4004,6 +4013,7 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, err = -EBADRQC; } +out: if (unlikely(err)) goto err; From 1372d34ccf6dd480332b2bcb2fd59a2b9a0df415 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Mon, 3 Jan 2022 16:08:10 +0100 Subject: [PATCH 35/41] xdp: Add xdp_do_redirect_frame() for pre-computed xdp_frames MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add an xdp_do_redirect_frame() variant which supports pre-computed xdp_frame structures. This will be used in bpf_prog_run() to avoid having to write to the xdp_frame structure when the XDP program doesn't modify the frame boundaries. 
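A rough sketch of the intended call pattern (illustrative only; the actual bpf_test_run() wiring is added later in this series, and prog, dev, xdp and xdpf are assumed to be set up by the caller):

  act = bpf_prog_run_xdp(prog, xdp);
  switch (act) {
  case XDP_REDIRECT:
          /* The program did not change the frame boundaries, so reuse the
           * pre-computed xdp_frame instead of converting the xdp_buff again.
           */
          err = xdp_do_redirect_frame(dev, xdp, xdpf, prog);
          break;
  default:
          /* ... other verdicts handled as usual ... */
          break;
  }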
Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220103150812.87914-6-toke@redhat.com --- include/linux/filter.h | 4 +++ net/core/filter.c | 65 +++++++++++++++++++++++++++++++++++------- 2 files changed, 58 insertions(+), 11 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index 60eec80fa1d4..71fa57b88bfc 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1019,6 +1019,10 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *prog); +int xdp_do_redirect_frame(struct net_device *dev, + struct xdp_buff *xdp, + struct xdp_frame *xdpf, + struct bpf_prog *prog); void xdp_do_flush(void); /* The xdp_do_flush_map() helper has been renamed to drop the _map suffix, as diff --git a/net/core/filter.c b/net/core/filter.c index e2b83056246c..4603b7cd3cd1 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3957,26 +3957,44 @@ u32 xdp_master_redirect(struct xdp_buff *xdp) } EXPORT_SYMBOL_GPL(xdp_master_redirect); -int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog) +static inline int __xdp_do_redirect_xsk(struct bpf_redirect_info *ri, + struct net_device *dev, + struct xdp_buff *xdp, + struct bpf_prog *xdp_prog) +{ + enum bpf_map_type map_type = ri->map_type; + void *fwd = ri->tgt_value; + u32 map_id = ri->map_id; + int err; + + ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */ + ri->map_type = BPF_MAP_TYPE_UNSPEC; + + err = __xsk_map_redirect(fwd, xdp); + if (unlikely(err)) + goto err; + + _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index); + return 0; +err: + _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err); + return err; +} + +static __always_inline int __xdp_do_redirect_frame(struct bpf_redirect_info *ri, + struct net_device *dev, + struct xdp_frame *xdpf, + struct bpf_prog *xdp_prog) { - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); enum bpf_map_type map_type = ri->map_type; void *fwd = ri->tgt_value; u32 map_id = ri->map_id; - struct xdp_frame *xdpf; struct bpf_map *map; int err; ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */ ri->map_type = BPF_MAP_TYPE_UNSPEC; - if (map_type == BPF_MAP_TYPE_XSKMAP) { - err = __xsk_map_redirect(fwd, xdp); - goto out; - } - - xdpf = xdp_convert_buff_to_frame(xdp); if (unlikely(!xdpf)) { err = -EOVERFLOW; goto err; @@ -4013,7 +4031,6 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, err = -EBADRQC; } -out: if (unlikely(err)) goto err; @@ -4023,8 +4040,34 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err); return err; } + +int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, + struct bpf_prog *xdp_prog) +{ + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + enum bpf_map_type map_type = ri->map_type; + + if (map_type == BPF_MAP_TYPE_XSKMAP) + return __xdp_do_redirect_xsk(ri, dev, xdp, xdp_prog); + + return __xdp_do_redirect_frame(ri, dev, xdp_convert_buff_to_frame(xdp), + xdp_prog); +} EXPORT_SYMBOL_GPL(xdp_do_redirect); +int xdp_do_redirect_frame(struct net_device *dev, struct xdp_buff *xdp, + struct xdp_frame *xdpf, struct bpf_prog *xdp_prog) +{ + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + enum bpf_map_type map_type = ri->map_type; + + if 
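For reference, a minimal usage sketch of the batch update API with the now const-qualified key/value arrays (map_fd, keys, values and nr_elems are assumed to be prepared by the caller):

  __u32 count = nr_elems;
  LIBBPF_OPTS(bpf_map_batch_opts, opts,
          .elem_flags = BPF_ANY,
  );

  err = bpf_map_update_batch(map_fd, keys, values, &count, &opts);
  if (err)
          fprintf(stderr, "batch update failed: %d (%u elements updated)\n",
                  err, count);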
(map_type == BPF_MAP_TYPE_XSKMAP) + return __xdp_do_redirect_xsk(ri, dev, xdp, xdp_prog); + + return __xdp_do_redirect_frame(ri, dev, xdpf, xdp_prog); +} +EXPORT_SYMBOL_GPL(xdp_do_redirect_frame); + static int xdp_do_generic_redirect_map(struct net_device *dev, struct sk_buff *skb, struct xdp_buff *xdp, From 70bc793382a0e37ba4e35e4d1a317b280b829a44 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 6 Jan 2022 12:51:56 -0800 Subject: [PATCH 36/41] selftests/bpf: Don't rely on preserving volatile in PT_REGS macros in loop3 PT_REGS*() macro on some architectures force-cast struct pt_regs to other types (user_pt_regs, etc) and might drop volatile modifiers, if any. Volatile isn't really required as pt_regs value isn't supposed to change during the BPF program run, so this is correct behavior. But progs/loop3.c relies on that volatile modifier to ensure that loop is preserved. Fix loop3.c by declaring i and sum variables as volatile instead. It preserves the loop and makes the test pass on all architectures (including s390x which is currently broken). Fixes: 3cc31d794097 ("libbpf: Normalize PT_REGS_xxx() macro definitions") Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220106205156.955373-1-andrii@kernel.org --- tools/testing/selftests/bpf/progs/loop3.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/loop3.c b/tools/testing/selftests/bpf/progs/loop3.c index 76e93b31c14b..717dab14322b 100644 --- a/tools/testing/selftests/bpf/progs/loop3.c +++ b/tools/testing/selftests/bpf/progs/loop3.c @@ -12,9 +12,9 @@ char _license[] SEC("license") = "GPL"; SEC("raw_tracepoint/consume_skb") -int while_true(volatile struct pt_regs* ctx) +int while_true(struct pt_regs *ctx) { - __u64 i = 0, sum = 0; + volatile __u64 i = 0, sum = 0; do { i++; sum += PT_REGS_RC(ctx); From e59618f0f46fa6cf86d5b82380e0f453756b282b Mon Sep 17 00:00:00 2001 From: Grant Seltzer Date: Thu, 6 Jan 2022 15:13:05 -0500 Subject: [PATCH 37/41] libbpf: Add documentation for bpf_map batch operations This adds documention for: - bpf_map_delete_batch() - bpf_map_lookup_batch() - bpf_map_lookup_and_delete_batch() - bpf_map_update_batch() This also updates the public API for the `keys` parameter of `bpf_map_delete_batch()`, and both the `keys` and `values` parameters of `bpf_map_update_batch()` to be constants. 
Signed-off-by: Grant Seltzer Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20220106201304.112675-1-grantseltzer@gmail.com --- tools/lib/bpf/bpf.c | 8 +-- tools/lib/bpf/bpf.h | 115 +++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 117 insertions(+), 6 deletions(-) diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 9b64eed2b003..550b4cbb6c99 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -691,11 +691,11 @@ static int bpf_map_batch_common(int cmd, int fd, void *in_batch, return libbpf_err_errno(ret); } -int bpf_map_delete_batch(int fd, void *keys, __u32 *count, +int bpf_map_delete_batch(int fd, const void *keys, __u32 *count, const struct bpf_map_batch_opts *opts) { return bpf_map_batch_common(BPF_MAP_DELETE_BATCH, fd, NULL, - NULL, keys, NULL, count, opts); + NULL, (void *)keys, NULL, count, opts); } int bpf_map_lookup_batch(int fd, void *in_batch, void *out_batch, void *keys, @@ -715,11 +715,11 @@ int bpf_map_lookup_and_delete_batch(int fd, void *in_batch, void *out_batch, count, opts); } -int bpf_map_update_batch(int fd, void *keys, void *values, __u32 *count, +int bpf_map_update_batch(int fd, const void *keys, const void *values, __u32 *count, const struct bpf_map_batch_opts *opts) { return bpf_map_batch_common(BPF_MAP_UPDATE_BATCH, fd, NULL, NULL, - keys, values, count, opts); + (void *)keys, (void *)values, count, opts); } int bpf_obj_pin(int fd, const char *pathname) diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index 00619f64a040..14e0d97ad2cf 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -254,17 +254,128 @@ struct bpf_map_batch_opts { }; #define bpf_map_batch_opts__last_field flags -LIBBPF_API int bpf_map_delete_batch(int fd, void *keys, + +/** + * @brief **bpf_map_delete_batch()** allows for batch deletion of multiple + * elements in a BPF map. + * + * @param fd BPF map file descriptor + * @param keys pointer to an array of *count* keys + * @param count input and output parameter; on input **count** represents the + * number of elements in the map to delete in batch; + * on output if a non-EFAULT error is returned, **count** represents the number of deleted + * elements if the output **count** value is not equal to the input **count** value + * If EFAULT is returned, **count** should not be trusted to be correct. + * @param opts options for configuring the way the batch deletion works + * @return 0, on success; negative error code, otherwise (errno is also set to + * the error code) + */ +LIBBPF_API int bpf_map_delete_batch(int fd, const void *keys, __u32 *count, const struct bpf_map_batch_opts *opts); + +/** + * @brief **bpf_map_lookup_batch()** allows for batch lookup of BPF map elements. + * + * The parameter *in_batch* is the address of the first element in the batch to read. + * *out_batch* is an output parameter that should be passed as *in_batch* to subsequent + * calls to **bpf_map_lookup_batch()**. NULL can be passed for *in_batch* to indicate + * that the batched lookup starts from the beginning of the map. + * + * The *keys* and *values* are output parameters which must point to memory large enough to + * hold *count* items based on the key and value size of the map *map_fd*. The *keys* + * buffer must be of *key_size* * *count*. The *values* buffer must be of + * *value_size* * *count*. 
+ * + * @param fd BPF map file descriptor + * @param in_batch address of the first element in batch to read, can pass NULL to + * indicate that the batched lookup starts from the beginning of the map. + * @param out_batch output parameter that should be passed to next call as *in_batch* + * @param keys pointer to an array large enough for *count* keys + * @param values pointer to an array large enough for *count* values + * @param count input and output parameter; on input it's the number of elements + * in the map to read in batch; on output it's the number of elements that were + * successfully read. + * If a non-EFAULT error is returned, count will be set as the number of elements + * that were read before the error occurred. + * If EFAULT is returned, **count** should not be trusted to be correct. + * @param opts options for configuring the way the batch lookup works + * @return 0, on success; negative error code, otherwise (errno is also set to + * the error code) + */ LIBBPF_API int bpf_map_lookup_batch(int fd, void *in_batch, void *out_batch, void *keys, void *values, __u32 *count, const struct bpf_map_batch_opts *opts); + +/** + * @brief **bpf_map_lookup_and_delete_batch()** allows for batch lookup and deletion + * of BPF map elements where each element is deleted after being retrieved. + * + * @param fd BPF map file descriptor + * @param in_batch address of the first element in batch to read, can pass NULL to + * get address of the first element in *out_batch* + * @param out_batch output parameter that should be passed to next call as *in_batch* + * @param keys pointer to an array of *count* keys + * @param values pointer to an array large enough for *count* values + * @param count input and output parameter; on input it's the number of elements + * in the map to read and delete in batch; on output it represents the number of + * elements that were successfully read and deleted + * If a non-**EFAULT** error code is returned and if the output **count** value + * is not equal to the input **count** value, up to **count** elements may + * have been deleted. + * if **EFAULT** is returned up to *count* elements may have been deleted without + * being returned via the *keys* and *values* output parameters. + * @param opts options for configuring the way the batch lookup and delete works + * @return 0, on success; negative error code, otherwise (errno is also set to + * the error code) + */ LIBBPF_API int bpf_map_lookup_and_delete_batch(int fd, void *in_batch, void *out_batch, void *keys, void *values, __u32 *count, const struct bpf_map_batch_opts *opts); -LIBBPF_API int bpf_map_update_batch(int fd, void *keys, void *values, + +/** + * @brief **bpf_map_update_batch()** updates multiple elements in a map + * by specifying keys and their corresponding values. + * + * The *keys* and *values* parameters must point to memory large enough + * to hold *count* items based on the key and value size of the map. + * + * The *opts* parameter can be used to control how *bpf_map_update_batch()* + * should handle keys that either do or do not already exist in the map. + * In particular the *flags* parameter of *bpf_map_batch_opts* can be + * one of the following: + * + * Note that *count* is an input and output parameter, where on output it + * represents how many elements were successfully updated. Also note that if + * **EFAULT** then *count* should not be trusted to be correct. + * + * **BPF_ANY** + * Create new elements or update existing. 
+ * + * **BPF_NOEXIST** + * Create new elements only if they do not exist. + * + * **BPF_EXIST** + * Update existing elements. + * + * **BPF_F_LOCK** + * Update spin_lock-ed map elements. This must be + * specified if the map value contains a spinlock. + * + * @param fd BPF map file descriptor + * @param keys pointer to an array of *count* keys + * @param values pointer to an array of *count* values + * @param count input and output parameter; on input it's the number of elements + * in the map to update in batch; on output if a non-EFAULT error is returned, + * **count** represents the number of updated elements if the output **count** + * value is not equal to the input **count** value. + * If EFAULT is returned, **count** should not be trusted to be correct. + * @param opts options for configuring the way the batch update works + * @return 0, on success; negative error code, otherwise (errno is also set to + * the error code) + */ +LIBBPF_API int bpf_map_update_batch(int fd, const void *keys, const void *values, __u32 *count, const struct bpf_map_batch_opts *opts); From 44bab87d8ca6f0544a9f8fc97bdf33aa5b3c899e Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Thu, 6 Jan 2022 12:55:25 -0800 Subject: [PATCH 38/41] bpf/selftests: Test bpf_d_path on rdonly_mem. The second parameter of bpf_d_path() can only accept writable memories. Rdonly_mem obtained from bpf_per_cpu_ptr() can not be passed into bpf_d_path for modification. This patch adds a selftest to verify this behavior. Signed-off-by: Hao Luo Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20220106205525.2116218-1-haoluo@google.com --- .../testing/selftests/bpf/prog_tests/d_path.c | 22 ++++++++++++++- .../bpf/progs/test_d_path_check_rdonly_mem.c | 28 +++++++++++++++++++ 2 files changed, 49 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/progs/test_d_path_check_rdonly_mem.c diff --git a/tools/testing/selftests/bpf/prog_tests/d_path.c b/tools/testing/selftests/bpf/prog_tests/d_path.c index 0a577a248d34..32fc5b3b5cf6 100644 --- a/tools/testing/selftests/bpf/prog_tests/d_path.c +++ b/tools/testing/selftests/bpf/prog_tests/d_path.c @@ -9,6 +9,7 @@ #define MAX_FILES 7 #include "test_d_path.skel.h" +#include "test_d_path_check_rdonly_mem.skel.h" static int duration; @@ -99,7 +100,7 @@ static int trigger_fstat_events(pid_t pid) return ret; } -void test_d_path(void) +static void test_d_path_basic(void) { struct test_d_path__bss *bss; struct test_d_path *skel; @@ -155,3 +156,22 @@ void test_d_path(void) cleanup: test_d_path__destroy(skel); } + +static void test_d_path_check_rdonly_mem(void) +{ + struct test_d_path_check_rdonly_mem *skel; + + skel = test_d_path_check_rdonly_mem__open_and_load(); + ASSERT_ERR_PTR(skel, "unexpected_load_overwriting_rdonly_mem"); + + test_d_path_check_rdonly_mem__destroy(skel); +} + +void test_d_path(void) +{ + if (test__start_subtest("basic")) + test_d_path_basic(); + + if (test__start_subtest("check_rdonly_mem")) + test_d_path_check_rdonly_mem(); +} diff --git a/tools/testing/selftests/bpf/progs/test_d_path_check_rdonly_mem.c b/tools/testing/selftests/bpf/progs/test_d_path_check_rdonly_mem.c new file mode 100644 index 000000000000..27c27cff6a3a --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_d_path_check_rdonly_mem.c @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022 Google */ + +#include "vmlinux.h" +#include +#include + +extern const int bpf_prog_active __ksym; + +SEC("fentry/security_inode_getattr") +int 
BPF_PROG(d_path_check_rdonly_mem, struct path *path, struct kstat *stat, + __u32 request_mask, unsigned int query_flags) +{ + void *active; + __u32 cpu; + + cpu = bpf_get_smp_processor_id(); + active = (void *)bpf_per_cpu_ptr(&bpf_prog_active, cpu); + if (active) { + /* FAIL here! 'active' points to readonly memory. bpf helpers + * that update its arguments can not write into it. + */ + bpf_d_path(path, active, sizeof(int)); + } + return 0; +} + +char _license[] SEC("license") = "GPL"; From 91a760b26926265a60c77ddf016529bcf3e17a04 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Thu, 6 Jan 2022 21:20:20 +0800 Subject: [PATCH 39/41] net: bpf: Handle return value of BPF_CGROUP_RUN_PROG_INET{4,6}_POST_BIND() The return value of BPF_CGROUP_RUN_PROG_INET{4,6}_POST_BIND() in __inet_bind() is not handled properly. When the return value is non-zero, it will set inet_saddr and inet_rcv_saddr to 0 and exit: err = BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk); if (err) { inet->inet_saddr = inet->inet_rcv_saddr = 0; goto out_release_sock; } Let's take UDP as an example and see what will happen. A UDP socket will be added to 'udp_prot.h.udp_table->hash' and 'udp_prot.h.udp_table->hash2' after sk->sk_prot->get_port() succeeds. If 'inet->inet_rcv_saddr' is specified here, then 'sk' will be in an 'hslot2' of 'hash2' that it doesn't belong to (because inet_saddr is changed to 0), and received UDP packets will not be passed to this sock. If 'inet->inet_rcv_saddr' is not specified here, the sock will work fine, as it can receive packets properly, which is weird, as the 'bind()' has already failed. To undo the get_port() operation, introduce the 'put_port' field for 'struct proto'. For TCP proto, it is inet_put_port(); for UDP proto, it is udp_lib_unhash(); for icmp proto, it is ping_unhash(). Therefore, after a sys_bind() failure caused by BPF_CGROUP_RUN_PROG_INET4_POST_BIND(), the socket will be unbound, which means that it can try to bind to another port.
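To make the effect of put_port concrete, here is a minimal userspace sketch of the sequence this change fixes (and which the selftest added later in this series exercises). The assumption that an attached BPF_CGROUP_INET4_POST_BIND program rejects port 4098, as well as the port numbers themselves, are hypothetical.

#include <arpa/inet.h>
#include <errno.h>
#include <sys/socket.h>
#include <unistd.h>

static int bind_with_retry(void)
{
	struct sockaddr_in addr = {
		.sin_family = AF_INET,
		.sin_addr.s_addr = htonl(INADDR_LOOPBACK),
		.sin_port = htons(4098),	/* assumed to be rejected by BPF */
	};
	int fd, err;

	fd = socket(AF_INET, SOCK_DGRAM, 0);
	if (fd < 0)
		return -1;

	err = bind(fd, (struct sockaddr *)&addr, sizeof(addr));
	if (err && errno == EPERM) {
		/* Rejected by the cgroup BPF program. Before this patch the
		 * socket stayed hashed on the rejected port with its source
		 * address zeroed; with put_port it is fully unbound, so a
		 * retry on another port can succeed.
		 */
		addr.sin_port = htons(5000);
		err = bind(fd, (struct sockaddr *)&addr, sizeof(addr));
	}

	if (err) {
		close(fd);
		return -1;
	}
	return fd;
}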
Signed-off-by: Menglong Dong Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220106132022.3470772-2-imagedong@tencent.com --- include/net/sock.h | 1 + net/ipv4/af_inet.c | 2 ++ net/ipv4/ping.c | 1 + net/ipv4/tcp_ipv4.c | 1 + net/ipv4/udp.c | 1 + net/ipv6/af_inet6.c | 2 ++ net/ipv6/ping.c | 1 + net/ipv6/tcp_ipv6.c | 1 + net/ipv6/udp.c | 1 + 9 files changed, 11 insertions(+) diff --git a/include/net/sock.h b/include/net/sock.h index 7b4b4237e6e0..ff9b508d9c5f 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1209,6 +1209,7 @@ struct proto { void (*unhash)(struct sock *sk); void (*rehash)(struct sock *sk); int (*get_port)(struct sock *sk, unsigned short snum); + void (*put_port)(struct sock *sk); #ifdef CONFIG_BPF_SYSCALL int (*psock_update_sk_prot)(struct sock *sk, struct sk_psock *psock, diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index f53184767ee7..9c465bac1eb0 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -531,6 +531,8 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, err = BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk); if (err) { inet->inet_saddr = inet->inet_rcv_saddr = 0; + if (sk->sk_prot->put_port) + sk->sk_prot->put_port(sk); goto out_release_sock; } } diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index e540b0dcf085..0e56df3a45e2 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -994,6 +994,7 @@ struct proto ping_prot = { .hash = ping_hash, .unhash = ping_unhash, .get_port = ping_get_port, + .put_port = ping_unhash, .obj_size = sizeof(struct inet_sock), }; EXPORT_SYMBOL(ping_prot); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ac10e4cdd8d0..9861786b8336 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -3076,6 +3076,7 @@ struct proto tcp_prot = { .hash = inet_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, + .put_port = inet_put_port, #ifdef CONFIG_BPF_SYSCALL .psock_update_sk_prot = tcp_bpf_update_proto, #endif diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 7b18a6f42f18..c2a4411d2b04 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2927,6 +2927,7 @@ struct proto udp_prot = { .unhash = udp_lib_unhash, .rehash = udp_v4_rehash, .get_port = udp_v4_get_port, + .put_port = udp_lib_unhash, #ifdef CONFIG_BPF_SYSCALL .psock_update_sk_prot = udp_bpf_update_proto, #endif diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index d1636425654e..8fe7900f1949 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -413,6 +413,8 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, if (err) { sk->sk_ipv6only = saved_ipv6only; inet_reset_saddr(sk); + if (sk->sk_prot->put_port) + sk->sk_prot->put_port(sk); goto out; } } diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index 6ac88fe24a8e..9256f6ba87ef 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -177,6 +177,7 @@ struct proto pingv6_prot = { .hash = ping_hash, .unhash = ping_unhash, .get_port = ping_get_port, + .put_port = ping_unhash, .obj_size = sizeof(struct raw6_sock), }; EXPORT_SYMBOL_GPL(pingv6_prot); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 1ac243d18c2b..075ee8a2df3b 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2181,6 +2181,7 @@ struct proto tcpv6_prot = { .hash = inet6_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, + .put_port = inet_put_port, #ifdef CONFIG_BPF_SYSCALL .psock_update_sk_prot = tcp_bpf_update_proto, #endif diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 1accc06abc54..90718a924ca8 100644 --- a/net/ipv6/udp.c 
+++ b/net/ipv6/udp.c @@ -1732,6 +1732,7 @@ struct proto udpv6_prot = { .unhash = udp_lib_unhash, .rehash = udp_v6_rehash, .get_port = udp_v6_get_port, + .put_port = udp_lib_unhash, #ifdef CONFIG_BPF_SYSCALL .psock_update_sk_prot = udp_bpf_update_proto, #endif From 6fd92c7f0c3846340fee20f62dacb17d0a15c0d3 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Thu, 6 Jan 2022 21:20:21 +0800 Subject: [PATCH 40/41] bpf: selftests: Use C99 initializers in test_sock.c Use C99 initializers for the initialization of 'tests' in test_sock.c. Signed-off-by: Menglong Dong Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220106132022.3470772-3-imagedong@tencent.com --- tools/testing/selftests/bpf/test_sock.c | 220 ++++++++++-------------- 1 file changed, 92 insertions(+), 128 deletions(-) diff --git a/tools/testing/selftests/bpf/test_sock.c b/tools/testing/selftests/bpf/test_sock.c index e8edd3dd3ec2..94f9b126f5ed 100644 --- a/tools/testing/selftests/bpf/test_sock.c +++ b/tools/testing/selftests/bpf/test_sock.c @@ -46,7 +46,7 @@ struct sock_test { static struct sock_test tests[] = { { - "bind4 load with invalid access: src_ip6", + .descr = "bind4 load with invalid access: src_ip6", .insns = { BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, @@ -54,16 +54,12 @@ static struct sock_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 1), BPF_EXIT_INSN(), }, - BPF_CGROUP_INET4_POST_BIND, - BPF_CGROUP_INET4_POST_BIND, - 0, - 0, - NULL, - 0, - LOAD_REJECT, + .expected_attach_type = BPF_CGROUP_INET4_POST_BIND, + .attach_type = BPF_CGROUP_INET4_POST_BIND, + .result = LOAD_REJECT, }, { - "bind4 load with invalid access: mark", + .descr = "bind4 load with invalid access: mark", .insns = { BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, @@ -71,16 +67,12 @@ static struct sock_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 1), BPF_EXIT_INSN(), }, - BPF_CGROUP_INET4_POST_BIND, - BPF_CGROUP_INET4_POST_BIND, - 0, - 0, - NULL, - 0, - LOAD_REJECT, + .expected_attach_type = BPF_CGROUP_INET4_POST_BIND, + .attach_type = BPF_CGROUP_INET4_POST_BIND, + .result = LOAD_REJECT, }, { - "bind6 load with invalid access: src_ip4", + .descr = "bind6 load with invalid access: src_ip4", .insns = { BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, @@ -88,16 +80,12 @@ static struct sock_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 1), BPF_EXIT_INSN(), }, - BPF_CGROUP_INET6_POST_BIND, - BPF_CGROUP_INET6_POST_BIND, - 0, - 0, - NULL, - 0, - LOAD_REJECT, + .expected_attach_type = BPF_CGROUP_INET6_POST_BIND, + .attach_type = BPF_CGROUP_INET6_POST_BIND, + .result = LOAD_REJECT, }, { - "sock_create load with invalid access: src_port", + .descr = "sock_create load with invalid access: src_port", .insns = { BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, @@ -105,128 +93,106 @@ static struct sock_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 1), BPF_EXIT_INSN(), }, - BPF_CGROUP_INET_SOCK_CREATE, - BPF_CGROUP_INET_SOCK_CREATE, - 0, - 0, - NULL, - 0, - LOAD_REJECT, + .expected_attach_type = BPF_CGROUP_INET_SOCK_CREATE, + .attach_type = BPF_CGROUP_INET_SOCK_CREATE, + .result = LOAD_REJECT, }, { - "sock_create load w/o expected_attach_type (compat mode)", + .descr = "sock_create load w/o expected_attach_type (compat mode)", .insns = { BPF_MOV64_IMM(BPF_REG_0, 1), BPF_EXIT_INSN(), }, - 0, - BPF_CGROUP_INET_SOCK_CREATE, - AF_INET, - SOCK_STREAM, - "127.0.0.1", - 8097, - SUCCESS, + .expected_attach_type = 0, + .attach_type = 
BPF_CGROUP_INET_SOCK_CREATE, + .domain = AF_INET, + .type = SOCK_STREAM, + .ip = "127.0.0.1", + .port = 8097, + .result = SUCCESS, }, { - "sock_create load w/ expected_attach_type", + .descr = "sock_create load w/ expected_attach_type", .insns = { BPF_MOV64_IMM(BPF_REG_0, 1), BPF_EXIT_INSN(), }, - BPF_CGROUP_INET_SOCK_CREATE, - BPF_CGROUP_INET_SOCK_CREATE, - AF_INET, - SOCK_STREAM, - "127.0.0.1", - 8097, - SUCCESS, + .expected_attach_type = BPF_CGROUP_INET_SOCK_CREATE, + .attach_type = BPF_CGROUP_INET_SOCK_CREATE, + .domain = AF_INET, + .type = SOCK_STREAM, + .ip = "127.0.0.1", + .port = 8097, + .result = SUCCESS, }, { - "attach type mismatch bind4 vs bind6", + .descr = "attach type mismatch bind4 vs bind6", .insns = { BPF_MOV64_IMM(BPF_REG_0, 1), BPF_EXIT_INSN(), }, - BPF_CGROUP_INET4_POST_BIND, - BPF_CGROUP_INET6_POST_BIND, - 0, - 0, - NULL, - 0, - ATTACH_REJECT, + .expected_attach_type = BPF_CGROUP_INET4_POST_BIND, + .attach_type = BPF_CGROUP_INET6_POST_BIND, + .result = ATTACH_REJECT, }, { - "attach type mismatch bind6 vs bind4", + .descr = "attach type mismatch bind6 vs bind4", .insns = { BPF_MOV64_IMM(BPF_REG_0, 1), BPF_EXIT_INSN(), }, - BPF_CGROUP_INET6_POST_BIND, - BPF_CGROUP_INET4_POST_BIND, - 0, - 0, - NULL, - 0, - ATTACH_REJECT, + .expected_attach_type = BPF_CGROUP_INET6_POST_BIND, + .attach_type = BPF_CGROUP_INET4_POST_BIND, + .result = ATTACH_REJECT, }, { - "attach type mismatch default vs bind4", + .descr = "attach type mismatch default vs bind4", .insns = { BPF_MOV64_IMM(BPF_REG_0, 1), BPF_EXIT_INSN(), }, - 0, - BPF_CGROUP_INET4_POST_BIND, - 0, - 0, - NULL, - 0, - ATTACH_REJECT, + .expected_attach_type = 0, + .attach_type = BPF_CGROUP_INET4_POST_BIND, + .result = ATTACH_REJECT, }, { - "attach type mismatch bind6 vs sock_create", + .descr = "attach type mismatch bind6 vs sock_create", .insns = { BPF_MOV64_IMM(BPF_REG_0, 1), BPF_EXIT_INSN(), }, - BPF_CGROUP_INET6_POST_BIND, - BPF_CGROUP_INET_SOCK_CREATE, - 0, - 0, - NULL, - 0, - ATTACH_REJECT, + .expected_attach_type = BPF_CGROUP_INET6_POST_BIND, + .attach_type = BPF_CGROUP_INET_SOCK_CREATE, + .result = ATTACH_REJECT, }, { - "bind4 reject all", + .descr = "bind4 reject all", .insns = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - BPF_CGROUP_INET4_POST_BIND, - BPF_CGROUP_INET4_POST_BIND, - AF_INET, - SOCK_STREAM, - "0.0.0.0", - 0, - BIND_REJECT, + .expected_attach_type = BPF_CGROUP_INET4_POST_BIND, + .attach_type = BPF_CGROUP_INET4_POST_BIND, + .domain = AF_INET, + .type = SOCK_STREAM, + .ip = "0.0.0.0", + .result = BIND_REJECT, }, { - "bind6 reject all", + .descr = "bind6 reject all", .insns = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - BPF_CGROUP_INET6_POST_BIND, - BPF_CGROUP_INET6_POST_BIND, - AF_INET6, - SOCK_STREAM, - "::", - 0, - BIND_REJECT, + .expected_attach_type = BPF_CGROUP_INET6_POST_BIND, + .attach_type = BPF_CGROUP_INET6_POST_BIND, + .domain = AF_INET6, + .type = SOCK_STREAM, + .ip = "::", + .result = BIND_REJECT, }, { - "bind6 deny specific IP & port", + .descr = "bind6 deny specific IP & port", .insns = { BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), @@ -247,16 +213,16 @@ static struct sock_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 1), BPF_EXIT_INSN(), }, - BPF_CGROUP_INET6_POST_BIND, - BPF_CGROUP_INET6_POST_BIND, - AF_INET6, - SOCK_STREAM, - "::1", - 8193, - BIND_REJECT, + .expected_attach_type = BPF_CGROUP_INET6_POST_BIND, + .attach_type = BPF_CGROUP_INET6_POST_BIND, + .domain = AF_INET6, + .type = SOCK_STREAM, + .ip = "::1", + .port = 8193, + .result = BIND_REJECT, }, { - "bind4 allow specific IP & 
port", + .descr = "bind4 allow specific IP & port", .insns = { BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), @@ -277,41 +243,39 @@ static struct sock_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - BPF_CGROUP_INET4_POST_BIND, - BPF_CGROUP_INET4_POST_BIND, - AF_INET, - SOCK_STREAM, - "127.0.0.1", - 4098, - SUCCESS, + .expected_attach_type = BPF_CGROUP_INET4_POST_BIND, + .attach_type = BPF_CGROUP_INET4_POST_BIND, + .domain = AF_INET, + .type = SOCK_STREAM, + .ip = "127.0.0.1", + .port = 4098, + .result = SUCCESS, }, { - "bind4 allow all", + .descr = "bind4 allow all", .insns = { BPF_MOV64_IMM(BPF_REG_0, 1), BPF_EXIT_INSN(), }, - BPF_CGROUP_INET4_POST_BIND, - BPF_CGROUP_INET4_POST_BIND, - AF_INET, - SOCK_STREAM, - "0.0.0.0", - 0, - SUCCESS, + .expected_attach_type = BPF_CGROUP_INET4_POST_BIND, + .attach_type = BPF_CGROUP_INET4_POST_BIND, + .domain = AF_INET, + .type = SOCK_STREAM, + .ip = "0.0.0.0", + .result = SUCCESS, }, { - "bind6 allow all", + .descr = "bind6 allow all", .insns = { BPF_MOV64_IMM(BPF_REG_0, 1), BPF_EXIT_INSN(), }, - BPF_CGROUP_INET6_POST_BIND, - BPF_CGROUP_INET6_POST_BIND, - AF_INET6, - SOCK_STREAM, - "::", - 0, - SUCCESS, + .expected_attach_type = BPF_CGROUP_INET6_POST_BIND, + .attach_type = BPF_CGROUP_INET6_POST_BIND, + .domain = AF_INET6, + .type = SOCK_STREAM, + .ip = "::", + .result = SUCCESS, }, }; From f7342481749365d9ac5f24fb971659a64e045bb5 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Thu, 6 Jan 2022 21:20:22 +0800 Subject: [PATCH 41/41] bpf: selftests: Add bind retry for post_bind{4, 6} With previous patch, kernel is able to 'put_port' after sys_bind() fails. Add the test for that case: rebind another port after sys_bind() fails. If the bind success, it means previous bind operation is already undoed. Signed-off-by: Menglong Dong Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220106132022.3470772-4-imagedong@tencent.com --- tools/testing/selftests/bpf/test_sock.c | 150 ++++++++++++++++++++---- 1 file changed, 130 insertions(+), 20 deletions(-) diff --git a/tools/testing/selftests/bpf/test_sock.c b/tools/testing/selftests/bpf/test_sock.c index 94f9b126f5ed..fe10f8134278 100644 --- a/tools/testing/selftests/bpf/test_sock.c +++ b/tools/testing/selftests/bpf/test_sock.c @@ -35,12 +35,15 @@ struct sock_test { /* Endpoint to bind() to */ const char *ip; unsigned short port; + unsigned short port_retry; /* Expected test result */ enum { LOAD_REJECT, ATTACH_REJECT, BIND_REJECT, SUCCESS, + RETRY_SUCCESS, + RETRY_REJECT } result; }; @@ -251,6 +254,99 @@ static struct sock_test tests[] = { .port = 4098, .result = SUCCESS, }, + { + .descr = "bind4 deny specific IP & port of TCP, and retry", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + + /* if (ip == expected && port == expected) */ + BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, + offsetof(struct bpf_sock, src_ip4)), + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, + __bpf_constant_ntohl(0x7F000001), 4), + BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, + offsetof(struct bpf_sock, src_port)), + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 0x1002, 2), + + /* return DENY; */ + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_JMP_A(1), + + /* else return ALLOW; */ + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .expected_attach_type = BPF_CGROUP_INET4_POST_BIND, + .attach_type = BPF_CGROUP_INET4_POST_BIND, + .domain = AF_INET, + .type = SOCK_STREAM, + .ip = "127.0.0.1", + .port = 4098, + .port_retry = 5000, + .result = RETRY_SUCCESS, + }, + { + .descr = "bind4 deny specific IP & port of UDP, and retry", + .insns = { + 
BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + + /* if (ip == expected && port == expected) */ + BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, + offsetof(struct bpf_sock, src_ip4)), + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, + __bpf_constant_ntohl(0x7F000001), 4), + BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, + offsetof(struct bpf_sock, src_port)), + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 0x1002, 2), + + /* return DENY; */ + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_JMP_A(1), + + /* else return ALLOW; */ + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .expected_attach_type = BPF_CGROUP_INET4_POST_BIND, + .attach_type = BPF_CGROUP_INET4_POST_BIND, + .domain = AF_INET, + .type = SOCK_DGRAM, + .ip = "127.0.0.1", + .port = 4098, + .port_retry = 5000, + .result = RETRY_SUCCESS, + }, + { + .descr = "bind6 deny specific IP & port, and retry", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + + /* if (ip == expected && port == expected) */ + BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, + offsetof(struct bpf_sock, src_ip6[3])), + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, + __bpf_constant_ntohl(0x00000001), 4), + BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, + offsetof(struct bpf_sock, src_port)), + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 0x2001, 2), + + /* return DENY; */ + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_JMP_A(1), + + /* else return ALLOW; */ + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .expected_attach_type = BPF_CGROUP_INET6_POST_BIND, + .attach_type = BPF_CGROUP_INET6_POST_BIND, + .domain = AF_INET6, + .type = SOCK_STREAM, + .ip = "::1", + .port = 8193, + .port_retry = 9000, + .result = RETRY_SUCCESS, + }, { .descr = "bind4 allow all", .insns = { @@ -315,14 +411,15 @@ static int attach_sock_prog(int cgfd, int progfd, return bpf_prog_attach(progfd, cgfd, attach_type, BPF_F_ALLOW_OVERRIDE); } -static int bind_sock(int domain, int type, const char *ip, unsigned short port) +static int bind_sock(int domain, int type, const char *ip, + unsigned short port, unsigned short port_retry) { struct sockaddr_storage addr; struct sockaddr_in6 *addr6; struct sockaddr_in *addr4; int sockfd = -1; socklen_t len; - int err = 0; + int res = SUCCESS; sockfd = socket(domain, type, 0); if (sockfd < 0) @@ -348,21 +445,44 @@ static int bind_sock(int domain, int type, const char *ip, unsigned short port) goto err; } - if (bind(sockfd, (const struct sockaddr *)&addr, len) == -1) - goto err; + if (bind(sockfd, (const struct sockaddr *)&addr, len) == -1) { + /* sys_bind() may fail for different reasons, errno has to be + * checked to confirm that BPF program rejected it. + */ + if (errno != EPERM) + goto err; + if (port_retry) + goto retry; + res = BIND_REJECT; + goto out; + } + goto out; +retry: + if (domain == AF_INET) + addr4->sin_port = htons(port_retry); + else + addr6->sin6_port = htons(port_retry); + if (bind(sockfd, (const struct sockaddr *)&addr, len) == -1) { + if (errno != EPERM) + goto err; + res = RETRY_REJECT; + } else { + res = RETRY_SUCCESS; + } goto out; err: - err = -1; + res = -1; out: close(sockfd); - return err; + return res; } static int run_test_case(int cgfd, const struct sock_test *test) { int progfd = -1; int err = 0; + int res; printf("Test case: %s .. ", test->descr); progfd = load_sock_prog(test->insns, test->expected_attach_type); @@ -380,21 +500,11 @@ static int run_test_case(int cgfd, const struct sock_test *test) goto err; } - if (bind_sock(test->domain, test->type, test->ip, test->port) == -1) { - /* sys_bind() may fail for different reasons, errno has to be - * checked to confirm that BPF program rejected it. 
- */ - if (test->result == BIND_REJECT && errno == EPERM) - goto out; - else - goto err; - } + res = bind_sock(test->domain, test->type, test->ip, test->port, + test->port_retry); + if (res > 0 && test->result == res) + goto out; - - if (test->result != SUCCESS) - goto err; - - goto out; err: err = -1; out: