From 80847a71b270b172814dc7562f3eb59507fca61e Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 26 Mar 2021 00:41:34 +0100
Subject: [PATCH 01/68] bpf: Undo ptr_to_map_key alu sanitation for now

Remove PTR_TO_MAP_KEY for the time being from being sanitized on pointer ALU
through sanitize_ptr_alu() mainly for 3 reasons:

  1) It's currently unused and not available from unprivileged. However that by
     itself is not yet a strong reason to drop the code.

  2) Commit 69c087ba6225 ("bpf: Add bpf_for_each_map_elem() helper") implemented
     the sanitation not fully correct in that unlike stack or map_value pointer
     it doesn't probe whether the access to the map key /after/ the simulated ALU
     operation is still in bounds. This means that the generated mask can truncate
     the offset in the non-speculative domain whereas it should only truncate in
     the speculative domain. The verifier should instead reject such program as
     we do for other types.

  3) Given the recent fixes from f232326f6966 ("bpf: Prohibit alu ops for pointer
     types not defining ptr_limit"), 10d2bb2e6b1d ("bpf: Fix off-by-one for area
     size in creating mask to left"), b5871dca250c ("bpf: Simplify alu_limit masking
     for pointer arithmetic") as well as 1b1597e64e1a ("bpf: Add sanity check for
     upper ptr_limit") the code changed quite a bit and the merge in efd13b71a3fa
     broke the PTR_TO_MAP_KEY case due to an incorrect merge conflict.

Remove the relevant pieces for the time being and we can rework the PTR_TO_MAP_KEY
case once everything settles.

Fixes: efd13b71a3fa ("Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net")
Fixes: 69c087ba6225 ("bpf: Add bpf_for_each_map_elem() helper")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/verifier.c | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 210169c25ead..85f9f842d15c 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -6062,19 +6062,6 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg,
 		else
 			*ptr_limit = -off - 1;
 		return *ptr_limit >= max ? -ERANGE : 0;
-	case PTR_TO_MAP_KEY:
-		/* Currently, this code is not exercised as the only use
-		 * is bpf_for_each_map_elem() helper which requires
-		 * bpf_capble. The code has been tested manually for
-		 * future use.
-		 */
-		if (mask_to_left) {
-			*ptr_limit = ptr_reg->umax_value + ptr_reg->off;
-		} else {
-			off = ptr_reg->smin_value + ptr_reg->off;
-			*ptr_limit = ptr_reg->map_ptr->key_size - off;
-		}
-		return 0;
 	case PTR_TO_MAP_VALUE:
 		max = ptr_reg->map_ptr->value_size;
 		if (mask_to_left) {
@@ -6281,7 +6268,6 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 		verbose(env, "R%d pointer arithmetic on %s prohibited\n",
 			dst, reg_type_str[ptr_reg->type]);
 		return -EACCES;
-	case PTR_TO_MAP_KEY:
 	case PTR_TO_MAP_VALUE:
 		if (!env->allow_ptr_leaks && !known && (smin_val < 0) != (smax_val < 0)) {
 			verbose(env, "R%d has unknown scalar with mixed signed bounds, pointer arithmetic with it prohibited for !root\n",

From 5d9034938720a15fa0f62db3e195c0c473c72c1b Mon Sep 17 00:00:00 2001
From: Ricardo Ribalda <ribalda@chromium.org>
Date: Thu, 18 Mar 2021 21:22:22 +0100
Subject: [PATCH 02/68] bpf: Fix typo 'accesible' into 'accessible'

Trivial fix.

Signed-off-by: Ricardo Ribalda <ribalda@chromium.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20210318202223.164873-8-ribalda@chromium.org
---
 include/linux/bpf.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 39dce9d3c3a5..24678d6ecbcf 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -56,7 +56,7 @@ struct bpf_iter_seq_info {
 	u32 seq_priv_size;
 };
 
-/* map is generic key/value storage optionally accesible by eBPF programs */
+/* map is generic key/value storage optionally accessible by eBPF programs */
 struct bpf_map_ops {
 	/* funcs callable from userspace (via syscall) */
 	int (*map_alloc_check)(union bpf_attr *attr);

From a46410d5e4975d701d526397156fa0815747dc2f Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 24 Mar 2021 10:29:41 -0700
Subject: [PATCH 03/68] libbpf: Constify few bpf_program getters

bpf_program__get_type() and bpf_program__get_expected_attach_type() shouldn't
modify given bpf_program, so mark input parameter as const struct bpf_program.
This eliminates unnecessary compilation warnings or explicit casts in user
programs.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20210324172941.2609884-1-andrii@kernel.org
---
 tools/lib/bpf/libbpf.c | 4 ++--
 tools/lib/bpf/libbpf.h | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index c0fd2c718a7d..10a0a67699f1 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -8458,7 +8458,7 @@ int bpf_program__nth_fd(const struct bpf_program *prog, int n)
 	return fd;
 }
 
-enum bpf_prog_type bpf_program__get_type(struct bpf_program *prog)
+enum bpf_prog_type bpf_program__get_type(const struct bpf_program *prog)
 {
 	return prog->type;
 }
@@ -8503,7 +8503,7 @@ BPF_PROG_TYPE_FNS(extension, BPF_PROG_TYPE_EXT);
 BPF_PROG_TYPE_FNS(sk_lookup, BPF_PROG_TYPE_SK_LOOKUP);
 
 enum bpf_attach_type
-bpf_program__get_expected_attach_type(struct bpf_program *prog)
+bpf_program__get_expected_attach_type(const struct bpf_program *prog)
 {
 	return prog->expected_attach_type;
 }
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index a1a424b9b8ff..89ade7d7b31c 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -361,12 +361,12 @@ LIBBPF_API int bpf_program__set_struct_ops(struct bpf_program *prog);
 LIBBPF_API int bpf_program__set_extension(struct bpf_program *prog);
 LIBBPF_API int bpf_program__set_sk_lookup(struct bpf_program *prog);
 
-LIBBPF_API enum bpf_prog_type bpf_program__get_type(struct bpf_program *prog);
+LIBBPF_API enum bpf_prog_type bpf_program__get_type(const struct bpf_program *prog);
 LIBBPF_API void bpf_program__set_type(struct bpf_program *prog,
 				      enum bpf_prog_type type);
 
 LIBBPF_API enum bpf_attach_type
-bpf_program__get_expected_attach_type(struct bpf_program *prog);
+bpf_program__get_expected_attach_type(const struct bpf_program *prog);
 LIBBPF_API void
 bpf_program__set_expected_attach_type(struct bpf_program *prog,
 				      enum bpf_attach_type type);

From b910eaaaa4b89976ef02e5d6448f3f73dc671d91 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Mon, 22 Mar 2021 22:51:46 -0700
Subject: [PATCH 04/68] bpf: Fix NULL pointer dereference in
 bpf_get_local_storage() helper

Jiri Olsa reported a bug ([1]) in kernel where cgroup local
storage pointer may be NULL in bpf_get_local_storage() helper.
There are two issues uncovered by this bug:
  (1). kprobe or tracepoint prog incorrectly sets cgroup local storage
       before prog run,
  (2). due to change from preempt_disable to migrate_disable,
       preemption is possible and percpu storage might be overwritten
       by other tasks.

This issue (1) is fixed in [2]. This patch tried to address issue (2).
The following shows how things can go wrong:
  task 1:   bpf_cgroup_storage_set() for percpu local storage
         preemption happens
  task 2:   bpf_cgroup_storage_set() for percpu local storage
         preemption happens
  task 1:   run bpf program

task 1 will effectively use the percpu local storage setting by task 2
which will be either NULL or incorrect ones.

Instead of just one common local storage per cpu, this patch fixed
the issue by permitting 8 local storages per cpu and each local
storage is identified by a task_struct pointer. This way, we
allow at most 8 nested preemption between bpf_cgroup_storage_set()
and bpf_cgroup_storage_unset(). The percpu local storage slot
is released (calling bpf_cgroup_storage_unset()) by the same task
after bpf program finished running.
bpf_test_run() is also fixed to use the new bpf_cgroup_storage_set()
interface.

The patch is tested on top of [2] with reproducer in [1].
Without this patch, kernel will emit error in 2-3 minutes.
With this patch, after one hour, still no error.

 [1] https://lore.kernel.org/bpf/CAKH8qBuXCfUz=w8L+Fj74OaUpbosO29niYwTki7e3Ag044_aww@mail.gmail.com/T
 [2] https://lore.kernel.org/bpf/20210309185028.3763817-1-yhs@fb.com

Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Roman Gushchin <guro@fb.com>
Link: https://lore.kernel.org/bpf/20210323055146.3334476-1-yhs@fb.com
---
 include/linux/bpf-cgroup.h | 57 ++++++++++++++++++++++++++++++++------
 include/linux/bpf.h        | 22 ++++++++++++---
 kernel/bpf/helpers.c       | 15 +++++++---
 kernel/bpf/local_storage.c |  5 ++--
 net/bpf/test_run.c         |  6 +++-
 5 files changed, 86 insertions(+), 19 deletions(-)

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index c42e02b4d84b..6a29fe11485d 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -20,14 +20,25 @@ struct bpf_sock_ops_kern;
 struct bpf_cgroup_storage;
 struct ctl_table;
 struct ctl_table_header;
+struct task_struct;
 
 #ifdef CONFIG_CGROUP_BPF
 
 extern struct static_key_false cgroup_bpf_enabled_key[MAX_BPF_ATTACH_TYPE];
 #define cgroup_bpf_enabled(type) static_branch_unlikely(&cgroup_bpf_enabled_key[type])
 
-DECLARE_PER_CPU(struct bpf_cgroup_storage*,
-		bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]);
+#define BPF_CGROUP_STORAGE_NEST_MAX	8
+
+struct bpf_cgroup_storage_info {
+	struct task_struct *task;
+	struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE];
+};
+
+/* For each cpu, permit maximum BPF_CGROUP_STORAGE_NEST_MAX number of tasks
+ * to use bpf cgroup storage simultaneously.
+ */
+DECLARE_PER_CPU(struct bpf_cgroup_storage_info,
+		bpf_cgroup_storage_info[BPF_CGROUP_STORAGE_NEST_MAX]);
 
 #define for_each_cgroup_storage_type(stype) \
 	for (stype = 0; stype < MAX_BPF_CGROUP_STORAGE_TYPE; stype++)
@@ -161,13 +172,42 @@ static inline enum bpf_cgroup_storage_type cgroup_storage_type(
 	return BPF_CGROUP_STORAGE_SHARED;
 }
 
-static inline void bpf_cgroup_storage_set(struct bpf_cgroup_storage
-					  *storage[MAX_BPF_CGROUP_STORAGE_TYPE])
+static inline int bpf_cgroup_storage_set(struct bpf_cgroup_storage
+					 *storage[MAX_BPF_CGROUP_STORAGE_TYPE])
 {
 	enum bpf_cgroup_storage_type stype;
+	int i, err = 0;
 
-	for_each_cgroup_storage_type(stype)
-		this_cpu_write(bpf_cgroup_storage[stype], storage[stype]);
+	preempt_disable();
+	for (i = 0; i < BPF_CGROUP_STORAGE_NEST_MAX; i++) {
+		if (unlikely(this_cpu_read(bpf_cgroup_storage_info[i].task) != NULL))
+			continue;
+
+		this_cpu_write(bpf_cgroup_storage_info[i].task, current);
+		for_each_cgroup_storage_type(stype)
+			this_cpu_write(bpf_cgroup_storage_info[i].storage[stype],
+				       storage[stype]);
+		goto out;
+	}
+	err = -EBUSY;
+	WARN_ON_ONCE(1);
+
+out:
+	preempt_enable();
+	return err;
+}
+
+static inline void bpf_cgroup_storage_unset(void)
+{
+	int i;
+
+	for (i = 0; i < BPF_CGROUP_STORAGE_NEST_MAX; i++) {
+		if (unlikely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current))
+			continue;
+
+		this_cpu_write(bpf_cgroup_storage_info[i].task, NULL);
+		return;
+	}
 }
 
 struct bpf_cgroup_storage *
@@ -448,8 +488,9 @@ static inline int cgroup_bpf_prog_query(const union bpf_attr *attr,
 	return -EINVAL;
 }
 
-static inline void bpf_cgroup_storage_set(
-	struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) {}
+static inline int bpf_cgroup_storage_set(
+	struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) { return 0; }
+static inline void bpf_cgroup_storage_unset(void) {}
 static inline int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux,
 					    struct bpf_map *map) { return 0; }
 static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 24678d6ecbcf..5a0801b420ca 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1106,6 +1106,13 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array,
 /* BPF program asks to set CN on the packet. */
 #define BPF_RET_SET_CN						(1 << 0)
 
+/* For BPF_PROG_RUN_ARRAY_FLAGS and __BPF_PROG_RUN_ARRAY,
+ * if bpf_cgroup_storage_set() failed, the rest of programs
+ * will not execute. This should be a really rare scenario
+ * as it requires BPF_CGROUP_STORAGE_NEST_MAX number of
+ * preemptions all between bpf_cgroup_storage_set() and
+ * bpf_cgroup_storage_unset() on the same cpu.
+ */
 #define BPF_PROG_RUN_ARRAY_FLAGS(array, ctx, func, ret_flags)		\
 	({								\
 		struct bpf_prog_array_item *_item;			\
@@ -1118,10 +1125,12 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array,
 		_array = rcu_dereference(array);			\
 		_item = &_array->items[0];				\
 		while ((_prog = READ_ONCE(_item->prog))) {		\
-			bpf_cgroup_storage_set(_item->cgroup_storage);	\
+			if (unlikely(bpf_cgroup_storage_set(_item->cgroup_storage)))	\
+				break;					\
 			func_ret = func(_prog, ctx);			\
 			_ret &= (func_ret & 1);				\
 			*(ret_flags) |= (func_ret >> 1);			\
+			bpf_cgroup_storage_unset();			\
 			_item++;					\
 		}							\
 		rcu_read_unlock();					\
@@ -1142,9 +1151,14 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array,
 			goto _out;			\
 		_item = &_array->items[0];		\
 		while ((_prog = READ_ONCE(_item->prog))) {		\
-			if (set_cg_storage)		\
-				bpf_cgroup_storage_set(_item->cgroup_storage);	\
-			_ret &= func(_prog, ctx);	\
+			if (!set_cg_storage) {			\
+				_ret &= func(_prog, ctx);	\
+			} else {				\
+				if (unlikely(bpf_cgroup_storage_set(_item->cgroup_storage)))	\
+					break;			\
+				_ret &= func(_prog, ctx);	\
+				bpf_cgroup_storage_unset();	\
+			}				\
 			_item++;			\
 		}					\
 _out:							\
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 074800226327..f306611c4ddf 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -382,8 +382,8 @@ const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto = {
 };
 
 #ifdef CONFIG_CGROUP_BPF
-DECLARE_PER_CPU(struct bpf_cgroup_storage*,
-		bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]);
+DECLARE_PER_CPU(struct bpf_cgroup_storage_info,
+		bpf_cgroup_storage_info[BPF_CGROUP_STORAGE_NEST_MAX]);
 
 BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
 {
@@ -392,10 +392,17 @@ BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
 	 * verifier checks that its value is correct.
 	 */
 	enum bpf_cgroup_storage_type stype = cgroup_storage_type(map);
-	struct bpf_cgroup_storage *storage;
+	struct bpf_cgroup_storage *storage = NULL;
 	void *ptr;
+	int i;
 
-	storage = this_cpu_read(bpf_cgroup_storage[stype]);
+	for (i = 0; i < BPF_CGROUP_STORAGE_NEST_MAX; i++) {
+		if (unlikely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current))
+			continue;
+
+		storage = this_cpu_read(bpf_cgroup_storage_info[i].storage[stype]);
+		break;
+	}
 
 	if (stype == BPF_CGROUP_STORAGE_SHARED)
 		ptr = &READ_ONCE(storage->buf)->data[0];
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index 2d4f9ac12377..bd11db9774c3 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -9,10 +9,11 @@
 #include <linux/slab.h>
 #include <uapi/linux/btf.h>
 
-DEFINE_PER_CPU(struct bpf_cgroup_storage*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]);
-
 #ifdef CONFIG_CGROUP_BPF
 
+DEFINE_PER_CPU(struct bpf_cgroup_storage_info,
+	       bpf_cgroup_storage_info[BPF_CGROUP_STORAGE_NEST_MAX]);
+
 #include "../cgroup/cgroup-internal.h"
 
 #define LOCAL_STORAGE_CREATE_FLAG_MASK					\
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 0abdd67f44b1..4aabf71cd95d 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -106,12 +106,16 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat,
 
 	bpf_test_timer_enter(&t);
 	do {
-		bpf_cgroup_storage_set(storage);
+		ret = bpf_cgroup_storage_set(storage);
+		if (ret)
+			break;
 
 		if (xdp)
 			*retval = bpf_prog_run_xdp(prog, ctx);
 		else
 			*retval = BPF_PROG_RUN(prog, ctx);
+
+		bpf_cgroup_storage_unset();
 	} while (bpf_test_timer_continue(&t, repeat, &ret, time));
 	bpf_test_timer_leave(&t);
 

From cff908463d91a6b2fb8c8ab6c41d9c308c29fd42 Mon Sep 17 00:00:00 2001
From: KP Singh <kpsingh@kernel.org>
Date: Mon, 22 Mar 2021 17:07:20 +0000
Subject: [PATCH 05/68] selftests/bpf: Better error messages for ima_setup.sh
 failures

The current implementation uses the CHECK_FAIL macro which does not
provide useful error messages when the script fails. Use the CHECK macro
instead and provide more descriptive messages to aid debugging.

Signed-off-by: KP Singh <kpsingh@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20210322170720.2926715-1-kpsingh@kernel.org
---
 tools/testing/selftests/bpf/prog_tests/test_ima.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/bpf/prog_tests/test_ima.c b/tools/testing/selftests/bpf/prog_tests/test_ima.c
index b54bc0c351b7..0252f61d611a 100644
--- a/tools/testing/selftests/bpf/prog_tests/test_ima.c
+++ b/tools/testing/selftests/bpf/prog_tests/test_ima.c
@@ -68,7 +68,8 @@ void test_test_ima(void)
 		goto close_prog;
 
 	snprintf(cmd, sizeof(cmd), "./ima_setup.sh setup %s", measured_dir);
-	if (CHECK_FAIL(system(cmd)))
+	err = system(cmd);
+	if (CHECK(err, "failed to run command", "%s, errno = %d\n", cmd, errno))
 		goto close_clean;
 
 	err = run_measured_process(measured_dir, &skel->bss->monitored_pid);
@@ -81,7 +82,8 @@ void test_test_ima(void)
 
 close_clean:
 	snprintf(cmd, sizeof(cmd), "./ima_setup.sh cleanup %s", measured_dir);
-	CHECK_FAIL(system(cmd));
+	err = system(cmd);
+	CHECK(err, "failed to run command", "%s, errno = %d\n", cmd, errno);
 close_prog:
 	ima__destroy(skel);
 }

From f56387c534cc54d2578b962692e574e3edd8c3f6 Mon Sep 17 00:00:00 2001
From: Pedro Tammela <pctammela@mojatatu.com>
Date: Mon, 22 Mar 2021 23:50:53 -0300
Subject: [PATCH 06/68] bpf: Add support for batched ops in LPM trie maps

Suggested-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Pedro Tammela <pctammela@mojatatu.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20210323025058.315763-2-pctammela@gmail.com
---
 kernel/bpf/lpm_trie.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index cec792a17e5f..1b7b8a6f34ee 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -726,6 +726,9 @@ const struct bpf_map_ops trie_map_ops = {
 	.map_lookup_elem = trie_lookup_elem,
 	.map_update_elem = trie_update_elem,
 	.map_delete_elem = trie_delete_elem,
+	.map_lookup_batch = generic_map_lookup_batch,
+	.map_update_batch = generic_map_update_batch,
+	.map_delete_batch = generic_map_delete_batch,
 	.map_check_btf = trie_check_btf,
 	.map_btf_name = "lpm_trie",
 	.map_btf_id = &trie_map_btf_id,

From e9bd8cbd970bedd3cca8ee334c76ab90feb78760 Mon Sep 17 00:00:00 2001
From: Pedro Tammela <pctammela@mojatatu.com>
Date: Mon, 22 Mar 2021 23:50:54 -0300
Subject: [PATCH 07/68] bpf: selftests: Add tests for batched ops in LPM trie
 maps

Uses the already existing infrastructure for testing batched ops.
The testing code is essentially the same, with minor tweaks for this use
case.

Suggested-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Pedro Tammela <pctammela@mojatatu.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20210323025058.315763-3-pctammela@gmail.com
---
 .../bpf/map_tests/lpm_trie_map_batch_ops.c    | 158 ++++++++++++++++++
 1 file changed, 158 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/map_tests/lpm_trie_map_batch_ops.c

diff --git a/tools/testing/selftests/bpf/map_tests/lpm_trie_map_batch_ops.c b/tools/testing/selftests/bpf/map_tests/lpm_trie_map_batch_ops.c
new file mode 100644
index 000000000000..2e986e5e4cac
--- /dev/null
+++ b/tools/testing/selftests/bpf/map_tests/lpm_trie_map_batch_ops.c
@@ -0,0 +1,158 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <arpa/inet.h>
+#include <linux/bpf.h>
+#include <netinet/in.h>
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include <test_maps.h>
+
+struct test_lpm_key {
+	__u32 prefix;
+	struct in_addr ipv4;
+};
+
+static void map_batch_update(int map_fd, __u32 max_entries,
+			     struct test_lpm_key *keys, int *values)
+{
+	__u32 i;
+	int err;
+	char buff[16] = { 0 };
+	DECLARE_LIBBPF_OPTS(bpf_map_batch_opts, opts,
+		.elem_flags = 0,
+		.flags = 0,
+	);
+
+	for (i = 0; i < max_entries; i++) {
+		keys[i].prefix = 32;
+		snprintf(buff, 16, "192.168.1.%d", i + 1);
+		inet_pton(AF_INET, buff, &keys[i].ipv4);
+		values[i] = i + 1;
+	}
+
+	err = bpf_map_update_batch(map_fd, keys, values, &max_entries, &opts);
+	CHECK(err, "bpf_map_update_batch()", "error:%s\n", strerror(errno));
+}
+
+static void map_batch_verify(int *visited, __u32 max_entries,
+			     struct test_lpm_key *keys, int *values)
+{
+	char buff[16] = { 0 };
+	int lower_byte = 0;
+	__u32 i;
+
+	memset(visited, 0, max_entries * sizeof(*visited));
+	for (i = 0; i < max_entries; i++) {
+		inet_ntop(AF_INET, &keys[i].ipv4, buff, 32);
+		CHECK(sscanf(buff, "192.168.1.%d", &lower_byte) == EOF,
+		      "sscanf()", "error: i %d\n", i);
+		CHECK(lower_byte != values[i], "key/value checking",
+		      "error: i %d key %s value %d\n", i, buff, values[i]);
+		visited[i] = 1;
+	}
+	for (i = 0; i < max_entries; i++) {
+		CHECK(visited[i] != 1, "visited checking",
+		      "error: keys array at index %d missing\n", i);
+	}
+}
+
+void test_lpm_trie_map_batch_ops(void)
+{
+	struct bpf_create_map_attr xattr = {
+		.name = "lpm_trie_map",
+		.map_type = BPF_MAP_TYPE_LPM_TRIE,
+		.key_size = sizeof(struct test_lpm_key),
+		.value_size = sizeof(int),
+		.map_flags = BPF_F_NO_PREALLOC,
+	};
+	struct test_lpm_key *keys, key;
+	int map_fd, *values, *visited;
+	__u32 step, count, total, total_success;
+	const __u32 max_entries = 10;
+	__u64 batch = 0;
+	int err;
+	DECLARE_LIBBPF_OPTS(bpf_map_batch_opts, opts,
+		.elem_flags = 0,
+		.flags = 0,
+	);
+
+	xattr.max_entries = max_entries;
+	map_fd = bpf_create_map_xattr(&xattr);
+	CHECK(map_fd == -1, "bpf_create_map_xattr()", "error:%s\n",
+	      strerror(errno));
+
+	keys = malloc(max_entries * sizeof(struct test_lpm_key));
+	values = malloc(max_entries * sizeof(int));
+	visited = malloc(max_entries * sizeof(int));
+	CHECK(!keys || !values || !visited, "malloc()", "error:%s\n",
+	      strerror(errno));
+
+	total_success = 0;
+	for (step = 1; step < max_entries; step++) {
+		map_batch_update(map_fd, max_entries, keys, values);
+		map_batch_verify(visited, max_entries, keys, values);
+		memset(keys, 0, max_entries * sizeof(*keys));
+		memset(values, 0, max_entries * sizeof(*values));
+		batch = 0;
+		total = 0;
+		/* iteratively lookup/delete elements with 'step'
+		 * elements each.
+		 */
+		count = step;
+		while (true) {
+			err = bpf_map_lookup_batch(map_fd,
+				total ? &batch : NULL, &batch,
+				keys + total, values + total, &count, &opts);
+
+			CHECK((err && errno != ENOENT), "lookup with steps",
+			      "error: %s\n", strerror(errno));
+
+			total += count;
+			if (err)
+				break;
+		}
+
+		CHECK(total != max_entries, "lookup with steps",
+		      "total = %u, max_entries = %u\n", total, max_entries);
+
+		map_batch_verify(visited, max_entries, keys, values);
+
+		total = 0;
+		count = step;
+		while (total < max_entries) {
+			if (max_entries - total < step)
+				count = max_entries - total;
+			err = bpf_map_delete_batch(map_fd, keys + total, &count,
+						   &opts);
+			CHECK((err && errno != ENOENT), "delete batch",
+			      "error: %s\n", strerror(errno));
+			total += count;
+			if (err)
+				break;
+		}
+		CHECK(total != max_entries, "delete with steps",
+		      "total = %u, max_entries = %u\n", total, max_entries);
+
+		/* check map is empty, errono == ENOENT */
+		err = bpf_map_get_next_key(map_fd, NULL, &key);
+		CHECK(!err || errno != ENOENT, "bpf_map_get_next_key()",
+		      "error: %s\n", strerror(errno));
+
+		total_success++;
+	}
+
+	CHECK(total_success == 0, "check total_success",
+	      "unexpected failure\n");
+
+	printf("%s:PASS\n", __func__);
+
+	free(keys);
+	free(values);
+	free(visited);
+}

From 155f556d64b1a48710f01305e14bb860734ed1e3 Mon Sep 17 00:00:00 2001
From: Rafael David Tinoco <rafaeldtinoco@ubuntu.com>
Date: Tue, 23 Mar 2021 01:09:52 -0300
Subject: [PATCH 08/68] libbpf: Add bpf object kern_version attribute setter

Unfortunately some distros don't have their kernel version defined
accurately in <linux/version.h> due to different long term support
reasons.

It is important to have a way to override the bpf kern_version
attribute during runtime: some old kernels might still check for
kern_version attribute during bpf_prog_load().

Signed-off-by: Rafael David Tinoco <rafaeldtinoco@ubuntu.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20210323040952.2118241-1-rafaeldtinoco@ubuntu.com
---
 tools/lib/bpf/libbpf.c   | 10 ++++++++++
 tools/lib/bpf/libbpf.h   |  1 +
 tools/lib/bpf/libbpf.map |  1 +
 3 files changed, 12 insertions(+)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 10a0a67699f1..cebb0e852cf8 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -8270,6 +8270,16 @@ int bpf_object__btf_fd(const struct bpf_object *obj)
 	return obj->btf ? btf__fd(obj->btf) : -1;
 }
 
+int bpf_object__set_kversion(struct bpf_object *obj, __u32 kern_version)
+{
+	if (obj->loaded)
+		return -EINVAL;
+
+	obj->kern_version = kern_version;
+
+	return 0;
+}
+
 int bpf_object__set_priv(struct bpf_object *obj, void *priv,
 			 bpf_object_clear_priv_t clear_priv)
 {
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index 89ade7d7b31c..f500621d28e5 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -143,6 +143,7 @@ LIBBPF_API int bpf_object__unload(struct bpf_object *obj);
 
 LIBBPF_API const char *bpf_object__name(const struct bpf_object *obj);
 LIBBPF_API unsigned int bpf_object__kversion(const struct bpf_object *obj);
+LIBBPF_API int bpf_object__set_kversion(struct bpf_object *obj, __u32 kern_version);
 
 struct btf;
 LIBBPF_API struct btf *bpf_object__btf(const struct bpf_object *obj);
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index 279ae861f568..f5990f7208ce 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -359,4 +359,5 @@ LIBBPF_0.4.0 {
 		bpf_linker__finalize;
 		bpf_linker__free;
 		bpf_linker__new;
+		bpf_object__set_kversion;
 } LIBBPF_0.3.0;

From b0c407ec503b8e97bbffd65cc4bed39e008c192f Mon Sep 17 00:00:00 2001
From: Lu Wei <luwei32@huawei.com>
Date: Wed, 24 Mar 2021 16:31:47 +0800
Subject: [PATCH 09/68] bpf: Remove unused headers

The header <linux/version.h> is useless in sampleip_kern.c
and trace_event_kern.c, remove it.

Signed-off-by: Lu Wei <luwei32@huawei.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20210324083147.149278-1-luwei32@huawei.com
---
 samples/bpf/sampleip_kern.c    | 1 -
 samples/bpf/trace_event_kern.c | 1 -
 2 files changed, 2 deletions(-)

diff --git a/samples/bpf/sampleip_kern.c b/samples/bpf/sampleip_kern.c
index f24806ac24e7..a3f8a3998e0a 100644
--- a/samples/bpf/sampleip_kern.c
+++ b/samples/bpf/sampleip_kern.c
@@ -4,7 +4,6 @@
  * modify it under the terms of version 2 of the GNU General Public
  * License as published by the Free Software Foundation.
  */
-#include <linux/version.h>
 #include <linux/ptrace.h>
 #include <uapi/linux/bpf.h>
 #include <uapi/linux/bpf_perf_event.h>
diff --git a/samples/bpf/trace_event_kern.c b/samples/bpf/trace_event_kern.c
index 7d3c66fb3f88..0bba5fcd7d24 100644
--- a/samples/bpf/trace_event_kern.c
+++ b/samples/bpf/trace_event_kern.c
@@ -5,7 +5,6 @@
  * License as published by the Free Software Foundation.
  */
 #include <linux/ptrace.h>
-#include <linux/version.h>
 #include <uapi/linux/bpf.h>
 #include <uapi/linux/bpf_perf_event.h>
 #include <uapi/linux/perf_event.h>

From fcb8d0d7587e0f2b7439d6c14a380fd17a450f96 Mon Sep 17 00:00:00 2001
From: Wan Jiabing <wanjiabing@vivo.com>
Date: Thu, 25 Mar 2021 15:06:02 +0800
Subject: [PATCH 10/68] bpf: struct sock is declared twice in bpf_sk_storage
 header

struct sock has been declared twice, therefore remove the duplicate.

Signed-off-by: Wan Jiabing <wanjiabing@vivo.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20210325070602.858024-1-wanjiabing@vivo.com
---
 include/net/bpf_sk_storage.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/include/net/bpf_sk_storage.h b/include/net/bpf_sk_storage.h
index 0e85713f56df..2926f1f00d65 100644
--- a/include/net/bpf_sk_storage.h
+++ b/include/net/bpf_sk_storage.h
@@ -27,7 +27,6 @@ struct bpf_local_storage_elem;
 struct bpf_sk_storage_diag;
 struct sk_buff;
 struct nlattr;
-struct sock;
 
 #ifdef CONFIG_BPF_SYSCALL
 int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk);

From 36e7985160782bc683001afe09e33a288435def0 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 25 Mar 2021 21:30:36 -0700
Subject: [PATCH 11/68] libbpf: Preserve empty DATASEC BTFs during static
 linking

Ensure that BPF static linker preserves all DATASEC BTF types, even if some of
them might not have any variable information at all. This may happen if the
compiler promotes local initialized variable contents into .rodata section and
there are no global or static functions in the program.

For example,

  $ cat t.c
  struct t { char a; char b; char c; };
  void bar(struct t*);
  void find() {
     struct t tmp = {1, 2, 3};
     bar(&tmp);
  }

  $ clang -target bpf -O2 -g -S t.c
         .long   104                             # BTF_KIND_DATASEC(id = 8)
         .long   251658240                       # 0xf000000
         .long   0

         .ascii  ".rodata"                       # string offset=104

  $ clang -target bpf -O2 -g -c t.c
  $ readelf -S t.o | grep data
     [ 4] .rodata           PROGBITS         0000000000000000  00000090

Fixes: 8fd27bf69b86 ("libbpf: Add BPF static linker BTF and BTF.ext support")
Reported-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20210326043036.3081011-1-andrii@kernel.org
---
 tools/lib/bpf/linker.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/tools/lib/bpf/linker.c b/tools/lib/bpf/linker.c
index 5e0aa2f2c0ca..a29d62ff8041 100644
--- a/tools/lib/bpf/linker.c
+++ b/tools/lib/bpf/linker.c
@@ -94,6 +94,7 @@ struct dst_sec {
 	int sec_sym_idx;
 
 	/* section's DATASEC variable info, emitted on BTF finalization */
+	bool has_btf;
 	int sec_var_cnt;
 	struct btf_var_secinfo *sec_vars;
 
@@ -1436,6 +1437,16 @@ static int linker_append_btf(struct bpf_linker *linker, struct src_obj *obj)
 			continue;
 		dst_sec = &linker->secs[src_sec->dst_id];
 
+		/* Mark section as having BTF regardless of the presence of
+		 * variables. In some cases compiler might generate empty BTF
+		 * with no variables information. E.g., when promoting local
+		 * array/structure variable initial values and BPF object
+		 * file otherwise has no read-only static variables in
+		 * .rodata. We need to preserve such empty BTF and just set
+		 * correct section size.
+		 */
+		dst_sec->has_btf = true;
+
 		t = btf__type_by_id(obj->btf, src_sec->sec_type_id);
 		src_var = btf_var_secinfos(t);
 		n = btf_vlen(t);
@@ -1717,7 +1728,7 @@ static int finalize_btf(struct bpf_linker *linker)
 	for (i = 1; i < linker->sec_cnt; i++) {
 		struct dst_sec *sec = &linker->secs[i];
 
-		if (!sec->sec_var_cnt)
+		if (!sec->has_btf)
 			continue;
 
 		id = btf__add_datasec(btf, sec->sec_name, sec->sec_sz);

From e16301fbe1837c9594f9c1957c28fd1bb18fbd15 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Wed, 24 Mar 2021 18:51:30 -0700
Subject: [PATCH 12/68] bpf: Simplify freeing logic in linfo and jited_linfo

This patch simplifies the linfo freeing logic by combining
"bpf_prog_free_jited_linfo()" and "bpf_prog_free_unused_jited_linfo()"
into the new "bpf_prog_jit_attempt_done()".
It is a prep work for the kernel function call support.  In a later
patch, freeing the kernel function call descriptors will also
be done in the "bpf_prog_jit_attempt_done()".

"bpf_prog_free_linfo()" is removed since it is only called by
"__bpf_prog_put_noref()".  The kvfree() are directly called
instead.

It also takes this chance to s/kcalloc/kvcalloc/ for the jited_linfo
allocation.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20210325015130.1544323-1-kafai@fb.com
---
 include/linux/filter.h |  3 +--
 kernel/bpf/core.c      | 35 ++++++++++++-----------------------
 kernel/bpf/syscall.c   |  3 ++-
 kernel/bpf/verifier.c  |  4 ++--
 4 files changed, 17 insertions(+), 28 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index b2b85b2cad8e..0d9c710eb050 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -877,8 +877,7 @@ void bpf_prog_free_linfo(struct bpf_prog *prog);
 void bpf_prog_fill_jited_linfo(struct bpf_prog *prog,
 			       const u32 *insn_to_jit_off);
 int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog);
-void bpf_prog_free_jited_linfo(struct bpf_prog *prog);
-void bpf_prog_free_unused_jited_linfo(struct bpf_prog *prog);
+void bpf_prog_jit_attempt_done(struct bpf_prog *prog);
 
 struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags);
 struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 75244ecb2389..a35eb3d7b126 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -143,25 +143,22 @@ int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog)
 	if (!prog->aux->nr_linfo || !prog->jit_requested)
 		return 0;
 
-	prog->aux->jited_linfo = kcalloc(prog->aux->nr_linfo,
-					 sizeof(*prog->aux->jited_linfo),
-					 GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
+	prog->aux->jited_linfo = kvcalloc(prog->aux->nr_linfo,
+					  sizeof(*prog->aux->jited_linfo),
+					  GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
 	if (!prog->aux->jited_linfo)
 		return -ENOMEM;
 
 	return 0;
 }
 
-void bpf_prog_free_jited_linfo(struct bpf_prog *prog)
+void bpf_prog_jit_attempt_done(struct bpf_prog *prog)
 {
-	kfree(prog->aux->jited_linfo);
-	prog->aux->jited_linfo = NULL;
-}
-
-void bpf_prog_free_unused_jited_linfo(struct bpf_prog *prog)
-{
-	if (prog->aux->jited_linfo && !prog->aux->jited_linfo[0])
-		bpf_prog_free_jited_linfo(prog);
+	if (prog->aux->jited_linfo &&
+	    (!prog->jited || !prog->aux->jited_linfo[0])) {
+		kvfree(prog->aux->jited_linfo);
+		prog->aux->jited_linfo = NULL;
+	}
 }
 
 /* The jit engine is responsible to provide an array
@@ -217,12 +214,6 @@ void bpf_prog_fill_jited_linfo(struct bpf_prog *prog,
 			insn_to_jit_off[linfo[i].insn_off - insn_start - 1];
 }
 
-void bpf_prog_free_linfo(struct bpf_prog *prog)
-{
-	bpf_prog_free_jited_linfo(prog);
-	kvfree(prog->aux->linfo);
-}
-
 struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
 				  gfp_t gfp_extra_flags)
 {
@@ -1866,15 +1857,13 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
 			return fp;
 
 		fp = bpf_int_jit_compile(fp);
-		if (!fp->jited) {
-			bpf_prog_free_jited_linfo(fp);
+		bpf_prog_jit_attempt_done(fp);
 #ifdef CONFIG_BPF_JIT_ALWAYS_ON
+		if (!fp->jited) {
 			*err = -ENOTSUPP;
 			return fp;
-#endif
-		} else {
-			bpf_prog_free_unused_jited_linfo(fp);
 		}
+#endif
 	} else {
 		*err = bpf_prog_offload_compile(fp);
 		if (*err)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 250503482cda..eaf85bf51c5a 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1694,7 +1694,8 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred)
 {
 	bpf_prog_kallsyms_del_all(prog);
 	btf_put(prog->aux->btf);
-	bpf_prog_free_linfo(prog);
+	kvfree(prog->aux->jited_linfo);
+	kvfree(prog->aux->linfo);
 	if (prog->aux->attach_btf)
 		btf_put(prog->aux->attach_btf);
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 85f9f842d15c..b7df3f06a279 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -11741,7 +11741,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 	prog->bpf_func = func[0]->bpf_func;
 	prog->aux->func = func;
 	prog->aux->func_cnt = env->subprog_cnt;
-	bpf_prog_free_unused_jited_linfo(prog);
+	bpf_prog_jit_attempt_done(prog);
 	return 0;
 out_free:
 	for (i = 0; i < env->subprog_cnt; i++) {
@@ -11764,7 +11764,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 		insn->off = 0;
 		insn->imm = env->insn_aux_data[i].call_imm;
 	}
-	bpf_prog_free_jited_linfo(prog);
+	bpf_prog_jit_attempt_done(prog);
 	return err;
 }
 

From 34747c4120418143097d4343312a0ca96c986d86 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Wed, 24 Mar 2021 18:51:36 -0700
Subject: [PATCH 13/68] bpf: Refactor btf_check_func_arg_match

This patch moved the subprog specific logic from
btf_check_func_arg_match() to the new btf_check_subprog_arg_match().
The core logic is left in btf_check_func_arg_match() which
will be reused later to check the kernel function call.

The "if (!btf_type_is_ptr(t))" is checked first to improve the
indentation which will be useful for a later patch.

Some of the "btf_kind_str[]" usages is replaced with the shortcut
"btf_type_str(t)".

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20210325015136.1544504-1-kafai@fb.com
---
 include/linux/bpf.h   |   4 +-
 include/linux/btf.h   |   5 ++
 kernel/bpf/btf.c      | 191 ++++++++++++++++++++++--------------------
 kernel/bpf/verifier.c |   4 +-
 4 files changed, 111 insertions(+), 93 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 5a0801b420ca..eaae618a90b5 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1545,8 +1545,8 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
 			   struct btf_func_model *m);
 
 struct bpf_reg_state;
-int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog,
-			     struct bpf_reg_state *regs);
+int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog,
+				struct bpf_reg_state *regs);
 int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
 			  struct bpf_reg_state *reg);
 int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog,
diff --git a/include/linux/btf.h b/include/linux/btf.h
index 9c1b52738bbe..8a05687a4ee2 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -141,6 +141,11 @@ static inline bool btf_type_is_enum(const struct btf_type *t)
 	return BTF_INFO_KIND(t->info) == BTF_KIND_ENUM;
 }
 
+static inline bool btf_type_is_scalar(const struct btf_type *t)
+{
+	return btf_type_is_int(t) || btf_type_is_enum(t);
+}
+
 static inline bool btf_type_is_typedef(const struct btf_type *t)
 {
 	return BTF_INFO_KIND(t->info) == BTF_KIND_TYPEDEF;
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 369faeddf1df..3c489adacf3b 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -4377,7 +4377,7 @@ static u8 bpf_ctx_convert_map[] = {
 #undef BPF_LINK_TYPE
 
 static const struct btf_member *
-btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf,
+btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
 		      const struct btf_type *t, enum bpf_prog_type prog_type,
 		      int arg)
 {
@@ -5362,6 +5362,99 @@ int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *pr
 	return btf_check_func_type_match(log, btf1, t1, btf2, t2);
 }
 
+static int btf_check_func_arg_match(struct bpf_verifier_env *env,
+				    const struct btf *btf, u32 func_id,
+				    struct bpf_reg_state *regs,
+				    bool ptr_to_mem_ok)
+{
+	struct bpf_verifier_log *log = &env->log;
+	const char *func_name, *ref_tname;
+	const struct btf_type *t, *ref_t;
+	const struct btf_param *args;
+	u32 i, nargs;
+
+	t = btf_type_by_id(btf, func_id);
+	if (!t || !btf_type_is_func(t)) {
+		/* These checks were already done by the verifier while loading
+		 * struct bpf_func_info
+		 */
+		bpf_log(log, "BTF of func_id %u doesn't point to KIND_FUNC\n",
+			func_id);
+		return -EFAULT;
+	}
+	func_name = btf_name_by_offset(btf, t->name_off);
+
+	t = btf_type_by_id(btf, t->type);
+	if (!t || !btf_type_is_func_proto(t)) {
+		bpf_log(log, "Invalid BTF of func %s\n", func_name);
+		return -EFAULT;
+	}
+	args = (const struct btf_param *)(t + 1);
+	nargs = btf_type_vlen(t);
+	if (nargs > MAX_BPF_FUNC_REG_ARGS) {
+		bpf_log(log, "Function %s has %d > %d args\n", func_name, nargs,
+			MAX_BPF_FUNC_REG_ARGS);
+		return -EINVAL;
+	}
+
+	/* check that BTF function arguments match actual types that the
+	 * verifier sees.
+	 */
+	for (i = 0; i < nargs; i++) {
+		u32 regno = i + 1;
+		struct bpf_reg_state *reg = &regs[regno];
+
+		t = btf_type_skip_modifiers(btf, args[i].type, NULL);
+		if (btf_type_is_scalar(t)) {
+			if (reg->type == SCALAR_VALUE)
+				continue;
+			bpf_log(log, "R%d is not a scalar\n", regno);
+			return -EINVAL;
+		}
+
+		if (!btf_type_is_ptr(t)) {
+			bpf_log(log, "Unrecognized arg#%d type %s\n",
+				i, btf_type_str(t));
+			return -EINVAL;
+		}
+
+		ref_t = btf_type_skip_modifiers(btf, t->type, NULL);
+		ref_tname = btf_name_by_offset(btf, ref_t->name_off);
+		if (btf_get_prog_ctx_type(log, btf, t, env->prog->type, i)) {
+			/* If function expects ctx type in BTF check that caller
+			 * is passing PTR_TO_CTX.
+			 */
+			if (reg->type != PTR_TO_CTX) {
+				bpf_log(log,
+					"arg#%d expected pointer to ctx, but got %s\n",
+					i, btf_type_str(t));
+				return -EINVAL;
+			}
+			if (check_ctx_reg(env, reg, regno))
+				return -EINVAL;
+		} else if (ptr_to_mem_ok) {
+			const struct btf_type *resolve_ret;
+			u32 type_size;
+
+			resolve_ret = btf_resolve_size(btf, ref_t, &type_size);
+			if (IS_ERR(resolve_ret)) {
+				bpf_log(log,
+					"arg#%d reference type('%s %s') size cannot be determined: %ld\n",
+					i, btf_type_str(ref_t), ref_tname,
+					PTR_ERR(resolve_ret));
+				return -EINVAL;
+			}
+
+			if (check_mem_reg(env, reg, regno, type_size))
+				return -EINVAL;
+		} else {
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
 /* Compare BTF of a function with given bpf_reg_state.
  * Returns:
  * EFAULT - there is a verifier bug. Abort verification.
@@ -5369,17 +5462,14 @@ int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *pr
  * 0 - BTF matches with what bpf_reg_state expects.
  * Only PTR_TO_CTX and SCALAR_VALUE states are recognized.
  */
-int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog,
-			     struct bpf_reg_state *regs)
+int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog,
+				struct bpf_reg_state *regs)
 {
-	struct bpf_verifier_log *log = &env->log;
 	struct bpf_prog *prog = env->prog;
 	struct btf *btf = prog->aux->btf;
-	const struct btf_param *args;
-	const struct btf_type *t, *ref_t;
-	u32 i, nargs, btf_id, type_size;
-	const char *tname;
 	bool is_global;
+	u32 btf_id;
+	int err;
 
 	if (!prog->aux->func_info)
 		return -EINVAL;
@@ -5391,93 +5481,16 @@ int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog,
 	if (prog->aux->func_info_aux[subprog].unreliable)
 		return -EINVAL;
 
-	t = btf_type_by_id(btf, btf_id);
-	if (!t || !btf_type_is_func(t)) {
-		/* These checks were already done by the verifier while loading
-		 * struct bpf_func_info
-		 */
-		bpf_log(log, "BTF of func#%d doesn't point to KIND_FUNC\n",
-			subprog);
-		return -EFAULT;
-	}
-	tname = btf_name_by_offset(btf, t->name_off);
-
-	t = btf_type_by_id(btf, t->type);
-	if (!t || !btf_type_is_func_proto(t)) {
-		bpf_log(log, "Invalid BTF of func %s\n", tname);
-		return -EFAULT;
-	}
-	args = (const struct btf_param *)(t + 1);
-	nargs = btf_type_vlen(t);
-	if (nargs > MAX_BPF_FUNC_REG_ARGS) {
-		bpf_log(log, "Function %s has %d > %d args\n", tname, nargs,
-			MAX_BPF_FUNC_REG_ARGS);
-		goto out;
-	}
-
 	is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
-	/* check that BTF function arguments match actual types that the
-	 * verifier sees.
-	 */
-	for (i = 0; i < nargs; i++) {
-		struct bpf_reg_state *reg = &regs[i + 1];
+	err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global);
 
-		t = btf_type_by_id(btf, args[i].type);
-		while (btf_type_is_modifier(t))
-			t = btf_type_by_id(btf, t->type);
-		if (btf_type_is_int(t) || btf_type_is_enum(t)) {
-			if (reg->type == SCALAR_VALUE)
-				continue;
-			bpf_log(log, "R%d is not a scalar\n", i + 1);
-			goto out;
-		}
-		if (btf_type_is_ptr(t)) {
-			/* If function expects ctx type in BTF check that caller
-			 * is passing PTR_TO_CTX.
-			 */
-			if (btf_get_prog_ctx_type(log, btf, t, prog->type, i)) {
-				if (reg->type != PTR_TO_CTX) {
-					bpf_log(log,
-						"arg#%d expected pointer to ctx, but got %s\n",
-						i, btf_kind_str[BTF_INFO_KIND(t->info)]);
-					goto out;
-				}
-				if (check_ctx_reg(env, reg, i + 1))
-					goto out;
-				continue;
-			}
-
-			if (!is_global)
-				goto out;
-
-			t = btf_type_skip_modifiers(btf, t->type, NULL);
-
-			ref_t = btf_resolve_size(btf, t, &type_size);
-			if (IS_ERR(ref_t)) {
-				bpf_log(log,
-				    "arg#%d reference type('%s %s') size cannot be determined: %ld\n",
-				    i, btf_type_str(t), btf_name_by_offset(btf, t->name_off),
-					PTR_ERR(ref_t));
-				goto out;
-			}
-
-			if (check_mem_reg(env, reg, i + 1, type_size))
-				goto out;
-
-			continue;
-		}
-		bpf_log(log, "Unrecognized arg#%d type %s\n",
-			i, btf_kind_str[BTF_INFO_KIND(t->info)]);
-		goto out;
-	}
-	return 0;
-out:
 	/* Compiler optimizations can remove arguments from static functions
 	 * or mismatched type can be passed into a global function.
 	 * In such cases mark the function as unreliable from BTF point of view.
 	 */
-	prog->aux->func_info_aux[subprog].unreliable = true;
-	return -EINVAL;
+	if (err)
+		prog->aux->func_info_aux[subprog].unreliable = true;
+	return err;
 }
 
 /* Convert BTF of a function into bpf_reg_state if possible
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index b7df3f06a279..b31e62daafbd 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5365,7 +5365,7 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 	func_info_aux = env->prog->aux->func_info_aux;
 	if (func_info_aux)
 		is_global = func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
-	err = btf_check_func_arg_match(env, subprog, caller->regs);
+	err = btf_check_subprog_arg_match(env, subprog, caller->regs);
 	if (err == -EFAULT)
 		return err;
 	if (is_global) {
@@ -12288,7 +12288,7 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
 		/* 1st arg to a function */
 		regs[BPF_REG_1].type = PTR_TO_CTX;
 		mark_reg_known_zero(env, regs, BPF_REG_1);
-		ret = btf_check_func_arg_match(env, subprog, regs);
+		ret = btf_check_subprog_arg_match(env, subprog, regs);
 		if (ret == -EFAULT)
 			/* unlikely verifier bug. abort.
 			 * ret == 0 and ret < 0 are sadly acceptable for

From e6ac2450d6dee3121cd8bbf2907b78a68a8a353d Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Wed, 24 Mar 2021 18:51:42 -0700
Subject: [PATCH 14/68] bpf: Support bpf program calling kernel function

This patch adds support to BPF verifier to allow bpf program calling
kernel function directly.

The use case included in this set is to allow bpf-tcp-cc to directly
call some tcp-cc helper functions (e.g. "tcp_cong_avoid_ai()").  Those
functions have already been used by some kernel tcp-cc implementations.

This set will also allow the bpf-tcp-cc program to directly call the
kernel tcp-cc implementation,  For example, a bpf_dctcp may only want to
implement its own dctcp_cwnd_event() and reuse other dctcp_*() directly
from the kernel tcp_dctcp.c instead of reimplementing (or
copy-and-pasting) them.

The tcp-cc kernel functions mentioned above will be white listed
for the struct_ops bpf-tcp-cc programs to use in a later patch.
The white listed functions are not bounded to a fixed ABI contract.
Those functions have already been used by the existing kernel tcp-cc.
If any of them has changed, both in-tree and out-of-tree kernel tcp-cc
implementations have to be changed.  The same goes for the struct_ops
bpf-tcp-cc programs which have to be adjusted accordingly.

This patch is to make the required changes in the bpf verifier.

First change is in btf.c, it adds a case in "btf_check_func_arg_match()".
When the passed in "btf->kernel_btf == true", it means matching the
verifier regs' states with a kernel function.  This will handle the
PTR_TO_BTF_ID reg.  It also maps PTR_TO_SOCK_COMMON, PTR_TO_SOCKET,
and PTR_TO_TCP_SOCK to its kernel's btf_id.

In the later libbpf patch, the insn calling a kernel function will
look like:

insn->code == (BPF_JMP | BPF_CALL)
insn->src_reg == BPF_PSEUDO_KFUNC_CALL /* <- new in this patch */
insn->imm == func_btf_id /* btf_id of the running kernel */

[ For the future calling function-in-kernel-module support, an array
  of module btf_fds can be passed at the load time and insn->off
  can be used to index into this array. ]

At the early stage of verifier, the verifier will collect all kernel
function calls into "struct bpf_kfunc_desc".  Those
descriptors are stored in "prog->aux->kfunc_tab" and will
be available to the JIT.  Since this "add" operation is similar
to the current "add_subprog()" and looking for the same insn->code,
they are done together in the new "add_subprog_and_kfunc()".

In the "do_check()" stage, the new "check_kfunc_call()" is added
to verify the kernel function call instruction:
1. Ensure the kernel function can be used by a particular BPF_PROG_TYPE.
   A new bpf_verifier_ops "check_kfunc_call" is added to do that.
   The bpf-tcp-cc struct_ops program will implement this function in
   a later patch.
2. Call "btf_check_kfunc_args_match()" to ensure the regs can be
   used as the args of a kernel function.
3. Mark the regs' type, subreg_def, and zext_dst.

At the later do_misc_fixups() stage, the new fixup_kfunc_call()
will replace the insn->imm with the function address (relative
to __bpf_call_base).  If needed, the jit can find the btf_func_model
by calling the new bpf_jit_find_kfunc_model(prog, insn).
With the imm set to the function address, "bpftool prog dump xlated"
will be able to display the kernel function calls the same way as
it displays other bpf helper calls.

gpl_compatible program is required to call kernel function.

This feature currently requires JIT.

The verifier selftests are adjusted because of the changes in
the verbose log in add_subprog_and_kfunc().

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20210325015142.1544736-1-kafai@fb.com
---
 arch/x86/net/bpf_jit_comp.c                   |   5 +
 include/linux/bpf.h                           |  24 ++
 include/linux/btf.h                           |   1 +
 include/linux/filter.h                        |   1 +
 include/uapi/linux/bpf.h                      |   4 +
 kernel/bpf/btf.c                              |  65 +++-
 kernel/bpf/core.c                             |  18 +-
 kernel/bpf/disasm.c                           |  13 +-
 kernel/bpf/syscall.c                          |   1 +
 kernel/bpf/verifier.c                         | 368 ++++++++++++++++--
 tools/include/uapi/linux/bpf.h                |   4 +
 tools/testing/selftests/bpf/verifier/calls.c  |  12 +-
 .../selftests/bpf/verifier/dead_code.c        |  10 +-
 13 files changed, 480 insertions(+), 46 deletions(-)

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index b35fc8023884..9eead60f0301 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -2346,3 +2346,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 					   tmp : orig_prog);
 	return prog;
 }
+
+bool bpf_jit_supports_kfunc_call(void)
+{
+	return true;
+}
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index eaae618a90b5..b5b7967e3ff3 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -427,6 +427,7 @@ enum bpf_reg_type {
 	PTR_TO_PERCPU_BTF_ID,	 /* reg points to a percpu kernel variable */
 	PTR_TO_FUNC,		 /* reg points to a bpf program function */
 	PTR_TO_MAP_KEY,		 /* reg points to a map element key */
+	__BPF_REG_TYPE_MAX,
 };
 
 /* The information passed from prog-specific *_is_valid_access
@@ -480,6 +481,7 @@ struct bpf_verifier_ops {
 				 const struct btf_type *t, int off, int size,
 				 enum bpf_access_type atype,
 				 u32 *next_btf_id);
+	bool (*check_kfunc_call)(u32 kfunc_btf_id);
 };
 
 struct bpf_prog_offload_ops {
@@ -796,6 +798,8 @@ struct btf_mod_pair {
 	struct module *module;
 };
 
+struct bpf_kfunc_desc_tab;
+
 struct bpf_prog_aux {
 	atomic64_t refcnt;
 	u32 used_map_cnt;
@@ -832,6 +836,7 @@ struct bpf_prog_aux {
 	struct bpf_prog **func;
 	void *jit_data; /* JIT specific data. arch dependent */
 	struct bpf_jit_poke_descriptor *poke_tab;
+	struct bpf_kfunc_desc_tab *kfunc_tab;
 	u32 size_poke_tab;
 	struct bpf_ksym ksym;
 	const struct bpf_prog_ops *ops;
@@ -1547,6 +1552,9 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
 struct bpf_reg_state;
 int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog,
 				struct bpf_reg_state *regs);
+int btf_check_kfunc_arg_match(struct bpf_verifier_env *env,
+			      const struct btf *btf, u32 func_id,
+			      struct bpf_reg_state *regs);
 int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
 			  struct bpf_reg_state *reg);
 int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog,
@@ -1557,6 +1565,10 @@ struct bpf_link *bpf_link_by_id(u32 id);
 
 const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id);
 void bpf_task_storage_free(struct task_struct *task);
+bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog);
+const struct btf_func_model *
+bpf_jit_find_kfunc_model(const struct bpf_prog *prog,
+			 const struct bpf_insn *insn);
 #else /* !CONFIG_BPF_SYSCALL */
 static inline struct bpf_prog *bpf_prog_get(u32 ufd)
 {
@@ -1737,6 +1749,18 @@ bpf_base_func_proto(enum bpf_func_id func_id)
 static inline void bpf_task_storage_free(struct task_struct *task)
 {
 }
+
+static inline bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog)
+{
+	return false;
+}
+
+static inline const struct btf_func_model *
+bpf_jit_find_kfunc_model(const struct bpf_prog *prog,
+			 const struct bpf_insn *insn)
+{
+	return NULL;
+}
 #endif /* CONFIG_BPF_SYSCALL */
 
 void __bpf_free_used_btfs(struct bpf_prog_aux *aux,
diff --git a/include/linux/btf.h b/include/linux/btf.h
index 8a05687a4ee2..3bac66e0183a 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -110,6 +110,7 @@ const struct btf_type *btf_type_resolve_func_ptr(const struct btf *btf,
 const struct btf_type *
 btf_resolve_size(const struct btf *btf, const struct btf_type *type,
 		 u32 *type_size);
+const char *btf_type_str(const struct btf_type *t);
 
 #define for_each_member(i, struct_type, member)			\
 	for (i = 0, member = btf_type_member(struct_type);	\
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 0d9c710eb050..eecfd82db648 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -918,6 +918,7 @@ u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog);
 void bpf_jit_compile(struct bpf_prog *prog);
 bool bpf_jit_needs_zext(void);
+bool bpf_jit_supports_kfunc_call(void);
 bool bpf_helper_changes_pkt_data(void *func);
 
 static inline bool bpf_dump_raw_ok(const struct cred *cred)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 008edc1dc8c1..598716742593 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1117,6 +1117,10 @@ enum bpf_link_type {
  * offset to another bpf function
  */
 #define BPF_PSEUDO_CALL		1
+/* when bpf_call->src_reg == BPF_PSEUDO_KFUNC_CALL,
+ * bpf_call->imm == btf_id of a BTF_KIND_FUNC in the running kernel
+ */
+#define BPF_PSEUDO_KFUNC_CALL	2
 
 /* flags for BPF_MAP_UPDATE_ELEM command */
 enum {
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 3c489adacf3b..ec8afc4bc560 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -283,7 +283,7 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = {
 	[BTF_KIND_FLOAT]	= "FLOAT",
 };
 
-static const char *btf_type_str(const struct btf_type *t)
+const char *btf_type_str(const struct btf_type *t)
 {
 	return btf_kind_str[BTF_INFO_KIND(t->info)];
 }
@@ -5362,6 +5362,14 @@ int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *pr
 	return btf_check_func_type_match(log, btf1, t1, btf2, t2);
 }
 
+static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = {
+#ifdef CONFIG_NET
+	[PTR_TO_SOCKET] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK],
+	[PTR_TO_SOCK_COMMON] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON],
+	[PTR_TO_TCP_SOCK] = &btf_sock_ids[BTF_SOCK_TYPE_TCP],
+#endif
+};
+
 static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 				    const struct btf *btf, u32 func_id,
 				    struct bpf_reg_state *regs,
@@ -5371,12 +5379,12 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 	const char *func_name, *ref_tname;
 	const struct btf_type *t, *ref_t;
 	const struct btf_param *args;
-	u32 i, nargs;
+	u32 i, nargs, ref_id;
 
 	t = btf_type_by_id(btf, func_id);
 	if (!t || !btf_type_is_func(t)) {
 		/* These checks were already done by the verifier while loading
-		 * struct bpf_func_info
+		 * struct bpf_func_info or in add_kfunc_call().
 		 */
 		bpf_log(log, "BTF of func_id %u doesn't point to KIND_FUNC\n",
 			func_id);
@@ -5418,9 +5426,49 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 			return -EINVAL;
 		}
 
-		ref_t = btf_type_skip_modifiers(btf, t->type, NULL);
+		ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id);
 		ref_tname = btf_name_by_offset(btf, ref_t->name_off);
-		if (btf_get_prog_ctx_type(log, btf, t, env->prog->type, i)) {
+		if (btf_is_kernel(btf)) {
+			const struct btf_type *reg_ref_t;
+			const struct btf *reg_btf;
+			const char *reg_ref_tname;
+			u32 reg_ref_id;
+
+			if (!btf_type_is_struct(ref_t)) {
+				bpf_log(log, "kernel function %s args#%d pointer type %s %s is not supported\n",
+					func_name, i, btf_type_str(ref_t),
+					ref_tname);
+				return -EINVAL;
+			}
+
+			if (reg->type == PTR_TO_BTF_ID) {
+				reg_btf = reg->btf;
+				reg_ref_id = reg->btf_id;
+			} else if (reg2btf_ids[reg->type]) {
+				reg_btf = btf_vmlinux;
+				reg_ref_id = *reg2btf_ids[reg->type];
+			} else {
+				bpf_log(log, "kernel function %s args#%d expected pointer to %s %s but R%d is not a pointer to btf_id\n",
+					func_name, i,
+					btf_type_str(ref_t), ref_tname, regno);
+				return -EINVAL;
+			}
+
+			reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id,
+							    &reg_ref_id);
+			reg_ref_tname = btf_name_by_offset(reg_btf,
+							   reg_ref_t->name_off);
+			if (!btf_struct_ids_match(log, reg_btf, reg_ref_id,
+						  reg->off, btf, ref_id)) {
+				bpf_log(log, "kernel function %s args#%d expected pointer to %s %s but R%d has a pointer to %s %s\n",
+					func_name, i,
+					btf_type_str(ref_t), ref_tname,
+					regno, btf_type_str(reg_ref_t),
+					reg_ref_tname);
+				return -EINVAL;
+			}
+		} else if (btf_get_prog_ctx_type(log, btf, t,
+						 env->prog->type, i)) {
 			/* If function expects ctx type in BTF check that caller
 			 * is passing PTR_TO_CTX.
 			 */
@@ -5493,6 +5541,13 @@ int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog,
 	return err;
 }
 
+int btf_check_kfunc_arg_match(struct bpf_verifier_env *env,
+			      const struct btf *btf, u32 func_id,
+			      struct bpf_reg_state *regs)
+{
+	return btf_check_func_arg_match(env, btf, func_id, regs, false);
+}
+
 /* Convert BTF of a function into bpf_reg_state if possible
  * Returns:
  * EFAULT - there is a verifier bug. Abort verification.
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index a35eb3d7b126..f5423251c118 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -159,6 +159,9 @@ void bpf_prog_jit_attempt_done(struct bpf_prog *prog)
 		kvfree(prog->aux->jited_linfo);
 		prog->aux->jited_linfo = NULL;
 	}
+
+	kfree(prog->aux->kfunc_tab);
+	prog->aux->kfunc_tab = NULL;
 }
 
 /* The jit engine is responsible to provide an array
@@ -1840,9 +1843,15 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
 	/* In case of BPF to BPF calls, verifier did all the prep
 	 * work with regards to JITing, etc.
 	 */
+	bool jit_needed = false;
+
 	if (fp->bpf_func)
 		goto finalize;
 
+	if (IS_ENABLED(CONFIG_BPF_JIT_ALWAYS_ON) ||
+	    bpf_prog_has_kfunc_call(fp))
+		jit_needed = true;
+
 	bpf_prog_select_func(fp);
 
 	/* eBPF JITs can rewrite the program in case constant
@@ -1858,12 +1867,10 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
 
 		fp = bpf_int_jit_compile(fp);
 		bpf_prog_jit_attempt_done(fp);
-#ifdef CONFIG_BPF_JIT_ALWAYS_ON
-		if (!fp->jited) {
+		if (!fp->jited && jit_needed) {
 			*err = -ENOTSUPP;
 			return fp;
 		}
-#endif
 	} else {
 		*err = bpf_prog_offload_compile(fp);
 		if (*err)
@@ -2343,6 +2350,11 @@ bool __weak bpf_jit_needs_zext(void)
 	return false;
 }
 
+bool __weak bpf_jit_supports_kfunc_call(void)
+{
+	return false;
+}
+
 /* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call
  * skb_copy_bits(), so provide a weak definition of it for NET-less config.
  */
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
index 3acc7e0b6916..dad821c8ecd0 100644
--- a/kernel/bpf/disasm.c
+++ b/kernel/bpf/disasm.c
@@ -19,16 +19,23 @@ static const char *__func_get_name(const struct bpf_insn_cbs *cbs,
 {
 	BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID);
 
-	if (insn->src_reg != BPF_PSEUDO_CALL &&
+	if (!insn->src_reg &&
 	    insn->imm >= 0 && insn->imm < __BPF_FUNC_MAX_ID &&
 	    func_id_str[insn->imm])
 		return func_id_str[insn->imm];
 
-	if (cbs && cbs->cb_call)
-		return cbs->cb_call(cbs->private_data, insn);
+	if (cbs && cbs->cb_call) {
+		const char *res;
+
+		res = cbs->cb_call(cbs->private_data, insn);
+		if (res)
+			return res;
+	}
 
 	if (insn->src_reg == BPF_PSEUDO_CALL)
 		snprintf(buff, len, "%+d", insn->imm);
+	else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL)
+		snprintf(buff, len, "kernel-function");
 
 	return buff;
 }
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index eaf85bf51c5a..9603de81811a 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1696,6 +1696,7 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred)
 	btf_put(prog->aux->btf);
 	kvfree(prog->aux->jited_linfo);
 	kvfree(prog->aux->linfo);
+	kfree(prog->aux->kfunc_tab);
 	if (prog->aux->attach_btf)
 		btf_put(prog->aux->attach_btf);
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index b31e62daafbd..852541a435ef 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -234,6 +234,12 @@ static bool bpf_pseudo_call(const struct bpf_insn *insn)
 	       insn->src_reg == BPF_PSEUDO_CALL;
 }
 
+static bool bpf_pseudo_kfunc_call(const struct bpf_insn *insn)
+{
+	return insn->code == (BPF_JMP | BPF_CALL) &&
+	       insn->src_reg == BPF_PSEUDO_KFUNC_CALL;
+}
+
 static bool bpf_pseudo_func(const struct bpf_insn *insn)
 {
 	return insn->code == (BPF_LD | BPF_IMM | BPF_DW) &&
@@ -1554,47 +1560,205 @@ static int add_subprog(struct bpf_verifier_env *env, int off)
 		verbose(env, "too many subprograms\n");
 		return -E2BIG;
 	}
+	/* determine subprog starts. The end is one before the next starts */
 	env->subprog_info[env->subprog_cnt++].start = off;
 	sort(env->subprog_info, env->subprog_cnt,
 	     sizeof(env->subprog_info[0]), cmp_subprogs, NULL);
 	return env->subprog_cnt - 1;
 }
 
-static int check_subprogs(struct bpf_verifier_env *env)
+struct bpf_kfunc_desc {
+	struct btf_func_model func_model;
+	u32 func_id;
+	s32 imm;
+};
+
+#define MAX_KFUNC_DESCS 256
+struct bpf_kfunc_desc_tab {
+	struct bpf_kfunc_desc descs[MAX_KFUNC_DESCS];
+	u32 nr_descs;
+};
+
+static int kfunc_desc_cmp_by_id(const void *a, const void *b)
+{
+	const struct bpf_kfunc_desc *d0 = a;
+	const struct bpf_kfunc_desc *d1 = b;
+
+	/* func_id is not greater than BTF_MAX_TYPE */
+	return d0->func_id - d1->func_id;
+}
+
+static const struct bpf_kfunc_desc *
+find_kfunc_desc(const struct bpf_prog *prog, u32 func_id)
+{
+	struct bpf_kfunc_desc desc = {
+		.func_id = func_id,
+	};
+	struct bpf_kfunc_desc_tab *tab;
+
+	tab = prog->aux->kfunc_tab;
+	return bsearch(&desc, tab->descs, tab->nr_descs,
+		       sizeof(tab->descs[0]), kfunc_desc_cmp_by_id);
+}
+
+static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id)
+{
+	const struct btf_type *func, *func_proto;
+	struct bpf_kfunc_desc_tab *tab;
+	struct bpf_prog_aux *prog_aux;
+	struct bpf_kfunc_desc *desc;
+	const char *func_name;
+	unsigned long addr;
+	int err;
+
+	prog_aux = env->prog->aux;
+	tab = prog_aux->kfunc_tab;
+	if (!tab) {
+		if (!btf_vmlinux) {
+			verbose(env, "calling kernel function is not supported without CONFIG_DEBUG_INFO_BTF\n");
+			return -ENOTSUPP;
+		}
+
+		if (!env->prog->jit_requested) {
+			verbose(env, "JIT is required for calling kernel function\n");
+			return -ENOTSUPP;
+		}
+
+		if (!bpf_jit_supports_kfunc_call()) {
+			verbose(env, "JIT does not support calling kernel function\n");
+			return -ENOTSUPP;
+		}
+
+		if (!env->prog->gpl_compatible) {
+			verbose(env, "cannot call kernel function from non-GPL compatible program\n");
+			return -EINVAL;
+		}
+
+		tab = kzalloc(sizeof(*tab), GFP_KERNEL);
+		if (!tab)
+			return -ENOMEM;
+		prog_aux->kfunc_tab = tab;
+	}
+
+	if (find_kfunc_desc(env->prog, func_id))
+		return 0;
+
+	if (tab->nr_descs == MAX_KFUNC_DESCS) {
+		verbose(env, "too many different kernel function calls\n");
+		return -E2BIG;
+	}
+
+	func = btf_type_by_id(btf_vmlinux, func_id);
+	if (!func || !btf_type_is_func(func)) {
+		verbose(env, "kernel btf_id %u is not a function\n",
+			func_id);
+		return -EINVAL;
+	}
+	func_proto = btf_type_by_id(btf_vmlinux, func->type);
+	if (!func_proto || !btf_type_is_func_proto(func_proto)) {
+		verbose(env, "kernel function btf_id %u does not have a valid func_proto\n",
+			func_id);
+		return -EINVAL;
+	}
+
+	func_name = btf_name_by_offset(btf_vmlinux, func->name_off);
+	addr = kallsyms_lookup_name(func_name);
+	if (!addr) {
+		verbose(env, "cannot find address for kernel function %s\n",
+			func_name);
+		return -EINVAL;
+	}
+
+	desc = &tab->descs[tab->nr_descs++];
+	desc->func_id = func_id;
+	desc->imm = BPF_CAST_CALL(addr) - __bpf_call_base;
+	err = btf_distill_func_proto(&env->log, btf_vmlinux,
+				     func_proto, func_name,
+				     &desc->func_model);
+	if (!err)
+		sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]),
+		     kfunc_desc_cmp_by_id, NULL);
+	return err;
+}
+
+static int kfunc_desc_cmp_by_imm(const void *a, const void *b)
+{
+	const struct bpf_kfunc_desc *d0 = a;
+	const struct bpf_kfunc_desc *d1 = b;
+
+	if (d0->imm > d1->imm)
+		return 1;
+	else if (d0->imm < d1->imm)
+		return -1;
+	return 0;
+}
+
+static void sort_kfunc_descs_by_imm(struct bpf_prog *prog)
+{
+	struct bpf_kfunc_desc_tab *tab;
+
+	tab = prog->aux->kfunc_tab;
+	if (!tab)
+		return;
+
+	sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]),
+	     kfunc_desc_cmp_by_imm, NULL);
+}
+
+bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog)
+{
+	return !!prog->aux->kfunc_tab;
+}
+
+const struct btf_func_model *
+bpf_jit_find_kfunc_model(const struct bpf_prog *prog,
+			 const struct bpf_insn *insn)
+{
+	const struct bpf_kfunc_desc desc = {
+		.imm = insn->imm,
+	};
+	const struct bpf_kfunc_desc *res;
+	struct bpf_kfunc_desc_tab *tab;
+
+	tab = prog->aux->kfunc_tab;
+	res = bsearch(&desc, tab->descs, tab->nr_descs,
+		      sizeof(tab->descs[0]), kfunc_desc_cmp_by_imm);
+
+	return res ? &res->func_model : NULL;
+}
+
+static int add_subprog_and_kfunc(struct bpf_verifier_env *env)
 {
-	int i, ret, subprog_start, subprog_end, off, cur_subprog = 0;
 	struct bpf_subprog_info *subprog = env->subprog_info;
 	struct bpf_insn *insn = env->prog->insnsi;
-	int insn_cnt = env->prog->len;
+	int i, ret, insn_cnt = env->prog->len;
 
 	/* Add entry function. */
 	ret = add_subprog(env, 0);
-	if (ret < 0)
+	if (ret)
 		return ret;
 
-	/* determine subprog starts. The end is one before the next starts */
-	for (i = 0; i < insn_cnt; i++) {
-		if (bpf_pseudo_func(insn + i)) {
-			if (!env->bpf_capable) {
-				verbose(env,
-					"function pointers are allowed for CAP_BPF and CAP_SYS_ADMIN\n");
-				return -EPERM;
-			}
-			ret = add_subprog(env, i + insn[i].imm + 1);
-			if (ret < 0)
-				return ret;
-			/* remember subprog */
-			insn[i + 1].imm = ret;
-			continue;
-		}
-		if (!bpf_pseudo_call(insn + i))
+	for (i = 0; i < insn_cnt; i++, insn++) {
+		if (!bpf_pseudo_func(insn) && !bpf_pseudo_call(insn) &&
+		    !bpf_pseudo_kfunc_call(insn))
 			continue;
+
 		if (!env->bpf_capable) {
-			verbose(env,
-				"function calls to other bpf functions are allowed for CAP_BPF and CAP_SYS_ADMIN\n");
+			verbose(env, "loading/calling other bpf or kernel functions are allowed for CAP_BPF and CAP_SYS_ADMIN\n");
 			return -EPERM;
 		}
-		ret = add_subprog(env, i + insn[i].imm + 1);
+
+		if (bpf_pseudo_func(insn)) {
+			ret = add_subprog(env, i + insn->imm + 1);
+			if (ret >= 0)
+				/* remember subprog */
+				insn[1].imm = ret;
+		} else if (bpf_pseudo_call(insn)) {
+			ret = add_subprog(env, i + insn->imm + 1);
+		} else {
+			ret = add_kfunc_call(env, insn->imm);
+		}
+
 		if (ret < 0)
 			return ret;
 	}
@@ -1608,6 +1772,16 @@ static int check_subprogs(struct bpf_verifier_env *env)
 		for (i = 0; i < env->subprog_cnt; i++)
 			verbose(env, "func#%d @%d\n", i, subprog[i].start);
 
+	return 0;
+}
+
+static int check_subprogs(struct bpf_verifier_env *env)
+{
+	int i, subprog_start, subprog_end, off, cur_subprog = 0;
+	struct bpf_subprog_info *subprog = env->subprog_info;
+	struct bpf_insn *insn = env->prog->insnsi;
+	int insn_cnt = env->prog->len;
+
 	/* now check that all jumps are within the same subprog */
 	subprog_start = subprog[cur_subprog].start;
 	subprog_end = subprog[cur_subprog + 1].start;
@@ -1916,6 +2090,17 @@ static int get_prev_insn_idx(struct bpf_verifier_state *st, int i,
 	return i;
 }
 
+static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn)
+{
+	const struct btf_type *func;
+
+	if (insn->src_reg != BPF_PSEUDO_KFUNC_CALL)
+		return NULL;
+
+	func = btf_type_by_id(btf_vmlinux, insn->imm);
+	return btf_name_by_offset(btf_vmlinux, func->name_off);
+}
+
 /* For given verifier state backtrack_insn() is called from the last insn to
  * the first insn. Its purpose is to compute a bitmask of registers and
  * stack slots that needs precision in the parent verifier state.
@@ -1924,6 +2109,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
 			  u32 *reg_mask, u64 *stack_mask)
 {
 	const struct bpf_insn_cbs cbs = {
+		.cb_call	= disasm_kfunc_name,
 		.cb_print	= verbose,
 		.private_data	= env,
 	};
@@ -5960,6 +6146,98 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 	return 0;
 }
 
+/* mark_btf_func_reg_size() is used when the reg size is determined by
+ * the BTF func_proto's return value size and argument.
+ */
+static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32 regno,
+				   size_t reg_size)
+{
+	struct bpf_reg_state *reg = &cur_regs(env)[regno];
+
+	if (regno == BPF_REG_0) {
+		/* Function return value */
+		reg->live |= REG_LIVE_WRITTEN;
+		reg->subreg_def = reg_size == sizeof(u64) ?
+			DEF_NOT_SUBREG : env->insn_idx + 1;
+	} else {
+		/* Function argument */
+		if (reg_size == sizeof(u64)) {
+			mark_insn_zext(env, reg);
+			mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
+		} else {
+			mark_reg_read(env, reg, reg->parent, REG_LIVE_READ32);
+		}
+	}
+}
+
+static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn)
+{
+	const struct btf_type *t, *func, *func_proto, *ptr_type;
+	struct bpf_reg_state *regs = cur_regs(env);
+	const char *func_name, *ptr_type_name;
+	u32 i, nargs, func_id, ptr_type_id;
+	const struct btf_param *args;
+	int err;
+
+	func_id = insn->imm;
+	func = btf_type_by_id(btf_vmlinux, func_id);
+	func_name = btf_name_by_offset(btf_vmlinux, func->name_off);
+	func_proto = btf_type_by_id(btf_vmlinux, func->type);
+
+	if (!env->ops->check_kfunc_call ||
+	    !env->ops->check_kfunc_call(func_id)) {
+		verbose(env, "calling kernel function %s is not allowed\n",
+			func_name);
+		return -EACCES;
+	}
+
+	/* Check the arguments */
+	err = btf_check_kfunc_arg_match(env, btf_vmlinux, func_id, regs);
+	if (err)
+		return err;
+
+	for (i = 0; i < CALLER_SAVED_REGS; i++)
+		mark_reg_not_init(env, regs, caller_saved[i]);
+
+	/* Check return type */
+	t = btf_type_skip_modifiers(btf_vmlinux, func_proto->type, NULL);
+	if (btf_type_is_scalar(t)) {
+		mark_reg_unknown(env, regs, BPF_REG_0);
+		mark_btf_func_reg_size(env, BPF_REG_0, t->size);
+	} else if (btf_type_is_ptr(t)) {
+		ptr_type = btf_type_skip_modifiers(btf_vmlinux, t->type,
+						   &ptr_type_id);
+		if (!btf_type_is_struct(ptr_type)) {
+			ptr_type_name = btf_name_by_offset(btf_vmlinux,
+							   ptr_type->name_off);
+			verbose(env, "kernel function %s returns pointer type %s %s is not supported\n",
+				func_name, btf_type_str(ptr_type),
+				ptr_type_name);
+			return -EINVAL;
+		}
+		mark_reg_known_zero(env, regs, BPF_REG_0);
+		regs[BPF_REG_0].btf = btf_vmlinux;
+		regs[BPF_REG_0].type = PTR_TO_BTF_ID;
+		regs[BPF_REG_0].btf_id = ptr_type_id;
+		mark_btf_func_reg_size(env, BPF_REG_0, sizeof(void *));
+	} /* else { add_kfunc_call() ensures it is btf_type_is_void(t) } */
+
+	nargs = btf_type_vlen(func_proto);
+	args = (const struct btf_param *)(func_proto + 1);
+	for (i = 0; i < nargs; i++) {
+		u32 regno = i + 1;
+
+		t = btf_type_skip_modifiers(btf_vmlinux, args[i].type, NULL);
+		if (btf_type_is_ptr(t))
+			mark_btf_func_reg_size(env, regno, sizeof(void *));
+		else
+			/* scalar. ensured by btf_check_kfunc_arg_match() */
+			mark_btf_func_reg_size(env, regno, t->size);
+	}
+
+	return 0;
+}
+
 static bool signed_add_overflows(s64 a, s64 b)
 {
 	/* Do the add in u64, where overflow is well-defined */
@@ -10162,6 +10440,7 @@ static int do_check(struct bpf_verifier_env *env)
 
 		if (env->log.level & BPF_LOG_LEVEL) {
 			const struct bpf_insn_cbs cbs = {
+				.cb_call	= disasm_kfunc_name,
 				.cb_print	= verbose,
 				.private_data	= env,
 			};
@@ -10309,7 +10588,8 @@ static int do_check(struct bpf_verifier_env *env)
 				if (BPF_SRC(insn->code) != BPF_K ||
 				    insn->off != 0 ||
 				    (insn->src_reg != BPF_REG_0 &&
-				     insn->src_reg != BPF_PSEUDO_CALL) ||
+				     insn->src_reg != BPF_PSEUDO_CALL &&
+				     insn->src_reg != BPF_PSEUDO_KFUNC_CALL) ||
 				    insn->dst_reg != BPF_REG_0 ||
 				    class == BPF_JMP32) {
 					verbose(env, "BPF_CALL uses reserved fields\n");
@@ -10324,6 +10604,8 @@ static int do_check(struct bpf_verifier_env *env)
 				}
 				if (insn->src_reg == BPF_PSEUDO_CALL)
 					err = check_func_call(env, insn, &env->insn_idx);
+				else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL)
+					err = check_kfunc_call(env, insn);
 				else
 					err = check_helper_call(env, insn, &env->insn_idx);
 				if (err)
@@ -11634,6 +11916,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 		func[i]->aux->name[0] = 'F';
 		func[i]->aux->stack_depth = env->subprog_info[i].stack_depth;
 		func[i]->jit_requested = 1;
+		func[i]->aux->kfunc_tab = prog->aux->kfunc_tab;
 		func[i]->aux->linfo = prog->aux->linfo;
 		func[i]->aux->nr_linfo = prog->aux->nr_linfo;
 		func[i]->aux->jited_linfo = prog->aux->jited_linfo;
@@ -11773,6 +12056,7 @@ static int fixup_call_args(struct bpf_verifier_env *env)
 #ifndef CONFIG_BPF_JIT_ALWAYS_ON
 	struct bpf_prog *prog = env->prog;
 	struct bpf_insn *insn = prog->insnsi;
+	bool has_kfunc_call = bpf_prog_has_kfunc_call(prog);
 	int i, depth;
 #endif
 	int err = 0;
@@ -11786,6 +12070,10 @@ static int fixup_call_args(struct bpf_verifier_env *env)
 			return err;
 	}
 #ifndef CONFIG_BPF_JIT_ALWAYS_ON
+	if (has_kfunc_call) {
+		verbose(env, "calling kernel functions are not allowed in non-JITed programs\n");
+		return -EINVAL;
+	}
 	if (env->subprog_cnt > 1 && env->prog->aux->tail_call_reachable) {
 		/* When JIT fails the progs with bpf2bpf calls and tail_calls
 		 * have to be rejected, since interpreter doesn't support them yet.
@@ -11814,6 +12102,26 @@ static int fixup_call_args(struct bpf_verifier_env *env)
 	return err;
 }
 
+static int fixup_kfunc_call(struct bpf_verifier_env *env,
+			    struct bpf_insn *insn)
+{
+	const struct bpf_kfunc_desc *desc;
+
+	/* insn->imm has the btf func_id. Replace it with
+	 * an address (relative to __bpf_base_call).
+	 */
+	desc = find_kfunc_desc(env->prog, insn->imm);
+	if (!desc) {
+		verbose(env, "verifier internal error: kernel function descriptor not found for func_id %u\n",
+			insn->imm);
+		return -EFAULT;
+	}
+
+	insn->imm = desc->imm;
+
+	return 0;
+}
+
 /* Do various post-verification rewrites in a single program pass.
  * These rewrites simplify JIT and interpreter implementations.
  */
@@ -11949,6 +12257,12 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 			continue;
 		if (insn->src_reg == BPF_PSEUDO_CALL)
 			continue;
+		if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
+			ret = fixup_kfunc_call(env, insn);
+			if (ret)
+				return ret;
+			continue;
+		}
 
 		if (insn->imm == BPF_FUNC_get_route_realm)
 			prog->dst_needed = 1;
@@ -12178,6 +12492,8 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 		}
 	}
 
+	sort_kfunc_descs_by_imm(env->prog);
+
 	return 0;
 }
 
@@ -12883,6 +13199,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
 	if (!env->explored_states)
 		goto skip_full_check;
 
+	ret = add_subprog_and_kfunc(env);
+	if (ret < 0)
+		goto skip_full_check;
+
 	ret = check_subprogs(env);
 	if (ret < 0)
 		goto skip_full_check;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 2d3036e292a9..ab9f2233607c 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1117,6 +1117,10 @@ enum bpf_link_type {
  * offset to another bpf function
  */
 #define BPF_PSEUDO_CALL		1
+/* when bpf_call->src_reg == BPF_PSEUDO_KFUNC_CALL,
+ * bpf_call->imm == btf_id of a BTF_KIND_FUNC in the running kernel
+ */
+#define BPF_PSEUDO_KFUNC_CALL	2
 
 /* flags for BPF_MAP_UPDATE_ELEM command */
 enum {
diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c
index eb888c8479c3..336a749673d1 100644
--- a/tools/testing/selftests/bpf/verifier/calls.c
+++ b/tools/testing/selftests/bpf/verifier/calls.c
@@ -19,7 +19,7 @@
 	BPF_MOV64_IMM(BPF_REG_0, 2),
 	BPF_EXIT_INSN(),
 	},
-	.errstr_unpriv = "function calls to other bpf functions are allowed for",
+	.errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for",
 	.result_unpriv = REJECT,
 	.result = ACCEPT,
 	.retval = 1,
@@ -136,7 +136,7 @@
 {
 	"calls: wrong src reg",
 	.insns = {
-	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 2, 0, 0),
+	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 3, 0, 0),
 	BPF_MOV64_IMM(BPF_REG_0, 1),
 	BPF_EXIT_INSN(),
 	},
@@ -397,7 +397,7 @@
 	BPF_MOV64_IMM(BPF_REG_0, 1),
 	BPF_EXIT_INSN(),
 	},
-	.errstr_unpriv = "function calls to other bpf functions are allowed for",
+	.errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for",
 	.fixup_map_hash_48b = { 3 },
 	.result_unpriv = REJECT,
 	.result = ACCEPT,
@@ -1977,7 +1977,7 @@
 	BPF_EXIT_INSN(),
 	},
 	.prog_type = BPF_PROG_TYPE_SOCKET_FILTER,
-	.errstr_unpriv = "function calls to other bpf functions are allowed for",
+	.errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for",
 	.result_unpriv = REJECT,
 	.result = ACCEPT,
 },
@@ -2003,7 +2003,7 @@
 	BPF_EXIT_INSN(),
 	},
 	.prog_type = BPF_PROG_TYPE_SOCKET_FILTER,
-	.errstr_unpriv = "function calls to other bpf functions are allowed for",
+	.errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for",
 	.errstr = "!read_ok",
 	.result = REJECT,
 },
@@ -2028,7 +2028,7 @@
 	BPF_EXIT_INSN(),
 	},
 	.prog_type = BPF_PROG_TYPE_SOCKET_FILTER,
-	.errstr_unpriv = "function calls to other bpf functions are allowed for",
+	.errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for",
 	.errstr = "!read_ok",
 	.result = REJECT,
 },
diff --git a/tools/testing/selftests/bpf/verifier/dead_code.c b/tools/testing/selftests/bpf/verifier/dead_code.c
index 5cf361d8eb1c..17fe33a75034 100644
--- a/tools/testing/selftests/bpf/verifier/dead_code.c
+++ b/tools/testing/selftests/bpf/verifier/dead_code.c
@@ -85,7 +85,7 @@
 	BPF_MOV64_IMM(BPF_REG_0, 12),
 	BPF_EXIT_INSN(),
 	},
-	.errstr_unpriv = "function calls to other bpf functions are allowed for",
+	.errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for",
 	.result_unpriv = REJECT,
 	.result = ACCEPT,
 	.retval = 7,
@@ -103,7 +103,7 @@
 	BPF_MOV64_IMM(BPF_REG_0, 12),
 	BPF_EXIT_INSN(),
 	},
-	.errstr_unpriv = "function calls to other bpf functions are allowed for",
+	.errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for",
 	.result_unpriv = REJECT,
 	.result = ACCEPT,
 	.retval = 7,
@@ -121,7 +121,7 @@
 	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, -5),
 	BPF_EXIT_INSN(),
 	},
-	.errstr_unpriv = "function calls to other bpf functions are allowed for",
+	.errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for",
 	.result_unpriv = REJECT,
 	.result = ACCEPT,
 	.retval = 7,
@@ -137,7 +137,7 @@
 	BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
 	BPF_EXIT_INSN(),
 	},
-	.errstr_unpriv = "function calls to other bpf functions are allowed for",
+	.errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for",
 	.result_unpriv = REJECT,
 	.result = ACCEPT,
 	.retval = 2,
@@ -152,7 +152,7 @@
 	BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
 	BPF_EXIT_INSN(),
 	},
-	.errstr_unpriv = "function calls to other bpf functions are allowed for",
+	.errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for",
 	.result_unpriv = REJECT,
 	.result = ACCEPT,
 	.retval = 2,

From 797b84f727bce9c64ea2c85899c1ba283df54c16 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Wed, 24 Mar 2021 18:51:49 -0700
Subject: [PATCH 15/68] bpf: Support kernel function call in x86-32

This patch adds kernel function call support to the x86-32 bpf jit.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20210325015149.1545267-1-kafai@fb.com
---
 arch/x86/net/bpf_jit_comp32.c | 198 ++++++++++++++++++++++++++++++++++
 1 file changed, 198 insertions(+)

diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c
index d17b67c69f89..0a7a2870f111 100644
--- a/arch/x86/net/bpf_jit_comp32.c
+++ b/arch/x86/net/bpf_jit_comp32.c
@@ -1390,6 +1390,19 @@ static inline void emit_push_r64(const u8 src[], u8 **pprog)
 	*pprog = prog;
 }
 
+static void emit_push_r32(const u8 src[], u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+
+	/* mov ecx,dword ptr [ebp+off] */
+	EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX), STACK_VAR(src_lo));
+	/* push ecx */
+	EMIT1(0x51);
+
+	*pprog = prog;
+}
+
 static u8 get_cond_jmp_opcode(const u8 op, bool is_cmp_lo)
 {
 	u8 jmp_cond;
@@ -1459,6 +1472,174 @@ static u8 get_cond_jmp_opcode(const u8 op, bool is_cmp_lo)
 	return jmp_cond;
 }
 
+/* i386 kernel compiles with "-mregparm=3".  From gcc document:
+ *
+ * ==== snippet ====
+ * regparm (number)
+ *	On x86-32 targets, the regparm attribute causes the compiler
+ *	to pass arguments number one to (number) if they are of integral
+ *	type in registers EAX, EDX, and ECX instead of on the stack.
+ *	Functions that take a variable number of arguments continue
+ *	to be passed all of their arguments on the stack.
+ * ==== snippet ====
+ *
+ * The first three args of a function will be considered for
+ * putting into the 32bit register EAX, EDX, and ECX.
+ *
+ * Two 32bit registers are used to pass a 64bit arg.
+ *
+ * For example,
+ * void foo(u32 a, u32 b, u32 c, u32 d):
+ *	u32 a: EAX
+ *	u32 b: EDX
+ *	u32 c: ECX
+ *	u32 d: stack
+ *
+ * void foo(u64 a, u32 b, u32 c):
+ *	u64 a: EAX (lo32) EDX (hi32)
+ *	u32 b: ECX
+ *	u32 c: stack
+ *
+ * void foo(u32 a, u64 b, u32 c):
+ *	u32 a: EAX
+ *	u64 b: EDX (lo32) ECX (hi32)
+ *	u32 c: stack
+ *
+ * void foo(u32 a, u32 b, u64 c):
+ *	u32 a: EAX
+ *	u32 b: EDX
+ *	u64 c: stack
+ *
+ * The return value will be stored in the EAX (and EDX for 64bit value).
+ *
+ * For example,
+ * u32 foo(u32 a, u32 b, u32 c):
+ *	return value: EAX
+ *
+ * u64 foo(u32 a, u32 b, u32 c):
+ *	return value: EAX (lo32) EDX (hi32)
+ *
+ * Notes:
+ *	The verifier only accepts function having integer and pointers
+ *	as its args and return value, so it does not have
+ *	struct-by-value.
+ *
+ * emit_kfunc_call() finds out the btf_func_model by calling
+ * bpf_jit_find_kfunc_model().  A btf_func_model
+ * has the details about the number of args, size of each arg,
+ * and the size of the return value.
+ *
+ * It first decides how many args can be passed by EAX, EDX, and ECX.
+ * That will decide what args should be pushed to the stack:
+ * [first_stack_regno, last_stack_regno] are the bpf regnos
+ * that should be pushed to the stack.
+ *
+ * It will first push all args to the stack because the push
+ * will need to use ECX.  Then, it moves
+ * [BPF_REG_1, first_stack_regno) to EAX, EDX, and ECX.
+ *
+ * When emitting a call (0xE8), it needs to figure out
+ * the jmp_offset relative to the jit-insn address immediately
+ * following the call (0xE8) instruction.  At this point, it knows
+ * the end of the jit-insn address after completely translated the
+ * current (BPF_JMP | BPF_CALL) bpf-insn.  It is passed as "end_addr"
+ * to the emit_kfunc_call().  Thus, it can learn the "immediate-follow-call"
+ * address by figuring out how many jit-insn is generated between
+ * the call (0xE8) and the end_addr:
+ *	- 0-1 jit-insn (3 bytes each) to restore the esp pointer if there
+ *	  is arg pushed to the stack.
+ *	- 0-2 jit-insns (3 bytes each) to handle the return value.
+ */
+static int emit_kfunc_call(const struct bpf_prog *bpf_prog, u8 *end_addr,
+			   const struct bpf_insn *insn, u8 **pprog)
+{
+	const u8 arg_regs[] = { IA32_EAX, IA32_EDX, IA32_ECX };
+	int i, cnt = 0, first_stack_regno, last_stack_regno;
+	int free_arg_regs = ARRAY_SIZE(arg_regs);
+	const struct btf_func_model *fm;
+	int bytes_in_stack = 0;
+	const u8 *cur_arg_reg;
+	u8 *prog = *pprog;
+	s64 jmp_offset;
+
+	fm = bpf_jit_find_kfunc_model(bpf_prog, insn);
+	if (!fm)
+		return -EINVAL;
+
+	first_stack_regno = BPF_REG_1;
+	for (i = 0; i < fm->nr_args; i++) {
+		int regs_needed = fm->arg_size[i] > sizeof(u32) ? 2 : 1;
+
+		if (regs_needed > free_arg_regs)
+			break;
+
+		free_arg_regs -= regs_needed;
+		first_stack_regno++;
+	}
+
+	/* Push the args to the stack */
+	last_stack_regno = BPF_REG_0 + fm->nr_args;
+	for (i = last_stack_regno; i >= first_stack_regno; i--) {
+		if (fm->arg_size[i - 1] > sizeof(u32)) {
+			emit_push_r64(bpf2ia32[i], &prog);
+			bytes_in_stack += 8;
+		} else {
+			emit_push_r32(bpf2ia32[i], &prog);
+			bytes_in_stack += 4;
+		}
+	}
+
+	cur_arg_reg = &arg_regs[0];
+	for (i = BPF_REG_1; i < first_stack_regno; i++) {
+		/* mov e[adc]x,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, *cur_arg_reg++),
+		      STACK_VAR(bpf2ia32[i][0]));
+		if (fm->arg_size[i - 1] > sizeof(u32))
+			/* mov e[adc]x,dword ptr [ebp+off] */
+			EMIT3(0x8B, add_2reg(0x40, IA32_EBP, *cur_arg_reg++),
+			      STACK_VAR(bpf2ia32[i][1]));
+	}
+
+	if (bytes_in_stack)
+		/* add esp,"bytes_in_stack" */
+		end_addr -= 3;
+
+	/* mov dword ptr [ebp+off],edx */
+	if (fm->ret_size > sizeof(u32))
+		end_addr -= 3;
+
+	/* mov dword ptr [ebp+off],eax */
+	if (fm->ret_size)
+		end_addr -= 3;
+
+	jmp_offset = (u8 *)__bpf_call_base + insn->imm - end_addr;
+	if (!is_simm32(jmp_offset)) {
+		pr_err("unsupported BPF kernel function jmp_offset:%lld\n",
+		       jmp_offset);
+		return -EINVAL;
+	}
+
+	EMIT1_off32(0xE8, jmp_offset);
+
+	if (fm->ret_size)
+		/* mov dword ptr [ebp+off],eax */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(bpf2ia32[BPF_REG_0][0]));
+
+	if (fm->ret_size > sizeof(u32))
+		/* mov dword ptr [ebp+off],edx */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EDX),
+		      STACK_VAR(bpf2ia32[BPF_REG_0][1]));
+
+	if (bytes_in_stack)
+		/* add esp,"bytes_in_stack" */
+		EMIT3(0x83, add_1reg(0xC0, IA32_ESP), bytes_in_stack);
+
+	*pprog = prog;
+
+	return 0;
+}
+
 static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 		  int oldproglen, struct jit_context *ctx)
 {
@@ -1888,6 +2069,18 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 			if (insn->src_reg == BPF_PSEUDO_CALL)
 				goto notyet;
 
+			if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
+				int err;
+
+				err = emit_kfunc_call(bpf_prog,
+						      image + addrs[i],
+						      insn, &prog);
+
+				if (err)
+					return err;
+				break;
+			}
+
 			func = (u8 *) __bpf_call_base + imm32;
 			jmp_offset = func - (image + addrs[i]);
 
@@ -2393,3 +2586,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 					   tmp : orig_prog);
 	return prog;
 }
+
+bool bpf_jit_supports_kfunc_call(void)
+{
+	return true;
+}

From d22f6ad18709e93622b6115ec9a5e42ed96b5d82 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Wed, 24 Mar 2021 18:51:55 -0700
Subject: [PATCH 16/68] tcp: Rename bictcp function prefix to cubictcp

The cubic functions in tcp_cubic.c are using the bictcp prefix as
in tcp_bic.c.  This patch gives it the proper name cubictcp
because the later patch will allow the bpf prog to directly
call the cubictcp implementation.  Renaming them will avoid
the name collision when trying to find the intended
one to call during bpf prog load time.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20210325015155.1545532-1-kafai@fb.com
---
 net/ipv4/tcp_cubic.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index ffcbe46dacdb..4a30deaa9a37 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -124,7 +124,7 @@ static inline void bictcp_hystart_reset(struct sock *sk)
 	ca->sample_cnt = 0;
 }
 
-static void bictcp_init(struct sock *sk)
+static void cubictcp_init(struct sock *sk)
 {
 	struct bictcp *ca = inet_csk_ca(sk);
 
@@ -137,7 +137,7 @@ static void bictcp_init(struct sock *sk)
 		tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
 }
 
-static void bictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event)
+static void cubictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event)
 {
 	if (event == CA_EVENT_TX_START) {
 		struct bictcp *ca = inet_csk_ca(sk);
@@ -319,7 +319,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd, u32 acked)
 	ca->cnt = max(ca->cnt, 2U);
 }
 
-static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+static void cubictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct bictcp *ca = inet_csk_ca(sk);
@@ -338,7 +338,7 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
 	tcp_cong_avoid_ai(tp, ca->cnt, acked);
 }
 
-static u32 bictcp_recalc_ssthresh(struct sock *sk)
+static u32 cubictcp_recalc_ssthresh(struct sock *sk)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 	struct bictcp *ca = inet_csk_ca(sk);
@@ -355,7 +355,7 @@ static u32 bictcp_recalc_ssthresh(struct sock *sk)
 	return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U);
 }
 
-static void bictcp_state(struct sock *sk, u8 new_state)
+static void cubictcp_state(struct sock *sk, u8 new_state)
 {
 	if (new_state == TCP_CA_Loss) {
 		bictcp_reset(inet_csk_ca(sk));
@@ -442,7 +442,7 @@ static void hystart_update(struct sock *sk, u32 delay)
 	}
 }
 
-static void bictcp_acked(struct sock *sk, const struct ack_sample *sample)
+static void cubictcp_acked(struct sock *sk, const struct ack_sample *sample)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 	struct bictcp *ca = inet_csk_ca(sk);
@@ -471,13 +471,13 @@ static void bictcp_acked(struct sock *sk, const struct ack_sample *sample)
 }
 
 static struct tcp_congestion_ops cubictcp __read_mostly = {
-	.init		= bictcp_init,
-	.ssthresh	= bictcp_recalc_ssthresh,
-	.cong_avoid	= bictcp_cong_avoid,
-	.set_state	= bictcp_state,
+	.init		= cubictcp_init,
+	.ssthresh	= cubictcp_recalc_ssthresh,
+	.cong_avoid	= cubictcp_cong_avoid,
+	.set_state	= cubictcp_state,
 	.undo_cwnd	= tcp_reno_undo_cwnd,
-	.cwnd_event	= bictcp_cwnd_event,
-	.pkts_acked     = bictcp_acked,
+	.cwnd_event	= cubictcp_cwnd_event,
+	.pkts_acked     = cubictcp_acked,
 	.owner		= THIS_MODULE,
 	.name		= "cubic",
 };

From e78aea8b2170be1b88c96a4d138422986a737336 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Wed, 24 Mar 2021 18:52:01 -0700
Subject: [PATCH 17/68] bpf: tcp: Put some tcp cong functions in allowlist for
 bpf-tcp-cc

This patch puts some tcp cong helper functions, tcp_slow_start()
and tcp_cong_avoid_ai(), into the allowlist for the bpf-tcp-cc
program.

A few tcp cc implementation functions are also put into the
allowlist.  A potential use case is the bpf-tcp-cc implementation
may only want to override a subset of a tcp_congestion_ops.  For others,
the bpf-tcp-cc can directly call the kernel counter parts instead of
re-implementing (or copy-and-pasting) them to the bpf program.

They will only be available to the bpf-tcp-cc typed program.
The allowlist functions are not bounded to a fixed ABI contract.
When any of them has changed, the bpf-tcp-cc program has to be changed
like any in-tree/out-of-tree kernel tcp-cc implementations do also.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20210325015201.1546345-1-kafai@fb.com
---
 net/ipv4/bpf_tcp_ca.c | 41 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)

diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
index d520e61649c8..40520b77a307 100644
--- a/net/ipv4/bpf_tcp_ca.c
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -5,6 +5,7 @@
 #include <linux/bpf_verifier.h>
 #include <linux/bpf.h>
 #include <linux/btf.h>
+#include <linux/btf_ids.h>
 #include <linux/filter.h>
 #include <net/tcp.h>
 #include <net/bpf_sk_storage.h>
@@ -178,10 +179,50 @@ bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id,
 	}
 }
 
+BTF_SET_START(bpf_tcp_ca_kfunc_ids)
+BTF_ID(func, tcp_reno_ssthresh)
+BTF_ID(func, tcp_reno_cong_avoid)
+BTF_ID(func, tcp_reno_undo_cwnd)
+BTF_ID(func, tcp_slow_start)
+BTF_ID(func, tcp_cong_avoid_ai)
+#if IS_BUILTIN(CONFIG_TCP_CONG_CUBIC)
+BTF_ID(func, cubictcp_init)
+BTF_ID(func, cubictcp_recalc_ssthresh)
+BTF_ID(func, cubictcp_cong_avoid)
+BTF_ID(func, cubictcp_state)
+BTF_ID(func, cubictcp_cwnd_event)
+BTF_ID(func, cubictcp_acked)
+#endif
+#if IS_BUILTIN(CONFIG_TCP_CONG_DCTCP)
+BTF_ID(func, dctcp_init)
+BTF_ID(func, dctcp_update_alpha)
+BTF_ID(func, dctcp_cwnd_event)
+BTF_ID(func, dctcp_ssthresh)
+BTF_ID(func, dctcp_cwnd_undo)
+BTF_ID(func, dctcp_state)
+#endif
+#if IS_BUILTIN(CONFIG_TCP_CONG_BBR)
+BTF_ID(func, bbr_init)
+BTF_ID(func, bbr_main)
+BTF_ID(func, bbr_sndbuf_expand)
+BTF_ID(func, bbr_undo_cwnd)
+BTF_ID(func, bbr_cwnd_even),
+BTF_ID(func, bbr_ssthresh)
+BTF_ID(func, bbr_min_tso_segs)
+BTF_ID(func, bbr_set_state)
+#endif
+BTF_SET_END(bpf_tcp_ca_kfunc_ids)
+
+static bool bpf_tcp_ca_check_kfunc_call(u32 kfunc_btf_id)
+{
+	return btf_id_set_contains(&bpf_tcp_ca_kfunc_ids, kfunc_btf_id);
+}
+
 static const struct bpf_verifier_ops bpf_tcp_ca_verifier_ops = {
 	.get_func_proto		= bpf_tcp_ca_get_func_proto,
 	.is_valid_access	= bpf_tcp_ca_is_valid_access,
 	.btf_struct_access	= bpf_tcp_ca_btf_struct_access,
+	.check_kfunc_call	= bpf_tcp_ca_check_kfunc_call,
 };
 
 static int bpf_tcp_ca_init_member(const struct btf_type *t,

From 933d1aa32409ef4209c8065ee4ede68236659cd2 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Wed, 24 Mar 2021 18:52:07 -0700
Subject: [PATCH 18/68] libbpf: Refactor bpf_object__resolve_ksyms_btf_id

This patch refactors most of the logic from
bpf_object__resolve_ksyms_btf_id() into a new function
bpf_object__resolve_ksym_var_btf_id().
It is to get ready for a later patch adding
bpf_object__resolve_ksym_func_btf_id() which resolves
a kernel function to the running kernel btf_id.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20210325015207.1546749-1-kafai@fb.com
---
 tools/lib/bpf/libbpf.c | 130 ++++++++++++++++++++++-------------------
 1 file changed, 70 insertions(+), 60 deletions(-)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index cebb0e852cf8..e289155510ca 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -7395,75 +7395,85 @@ static int bpf_object__read_kallsyms_file(struct bpf_object *obj)
 	return err;
 }
 
+static int bpf_object__resolve_ksym_var_btf_id(struct bpf_object *obj,
+					       struct extern_desc *ext)
+{
+	const struct btf_type *targ_var, *targ_type;
+	__u32 targ_type_id, local_type_id;
+	const char *targ_var_name;
+	int i, id, btf_fd, err;
+	struct btf *btf;
+
+	btf = obj->btf_vmlinux;
+	btf_fd = 0;
+	id = btf__find_by_name_kind(btf, ext->name, BTF_KIND_VAR);
+	if (id == -ENOENT) {
+		err = load_module_btfs(obj);
+		if (err)
+			return err;
+
+		for (i = 0; i < obj->btf_module_cnt; i++) {
+			btf = obj->btf_modules[i].btf;
+			/* we assume module BTF FD is always >0 */
+			btf_fd = obj->btf_modules[i].fd;
+			id = btf__find_by_name_kind(btf, ext->name, BTF_KIND_VAR);
+			if (id != -ENOENT)
+				break;
+		}
+	}
+	if (id <= 0) {
+		pr_warn("extern (var ksym) '%s': failed to find BTF ID in kernel BTF(s).\n",
+			ext->name);
+		return -ESRCH;
+	}
+
+	/* find local type_id */
+	local_type_id = ext->ksym.type_id;
+
+	/* find target type_id */
+	targ_var = btf__type_by_id(btf, id);
+	targ_var_name = btf__name_by_offset(btf, targ_var->name_off);
+	targ_type = skip_mods_and_typedefs(btf, targ_var->type, &targ_type_id);
+
+	err = bpf_core_types_are_compat(obj->btf, local_type_id,
+					btf, targ_type_id);
+	if (err <= 0) {
+		const struct btf_type *local_type;
+		const char *targ_name, *local_name;
+
+		local_type = btf__type_by_id(obj->btf, local_type_id);
+		local_name = btf__name_by_offset(obj->btf, local_type->name_off);
+		targ_name = btf__name_by_offset(btf, targ_type->name_off);
+
+		pr_warn("extern (var ksym) '%s': incompatible types, expected [%d] %s %s, but kernel has [%d] %s %s\n",
+			ext->name, local_type_id,
+			btf_kind_str(local_type), local_name, targ_type_id,
+			btf_kind_str(targ_type), targ_name);
+		return -EINVAL;
+	}
+
+	ext->is_set = true;
+	ext->ksym.kernel_btf_obj_fd = btf_fd;
+	ext->ksym.kernel_btf_id = id;
+	pr_debug("extern (var ksym) '%s': resolved to [%d] %s %s\n",
+		 ext->name, id, btf_kind_str(targ_var), targ_var_name);
+
+	return 0;
+}
+
 static int bpf_object__resolve_ksyms_btf_id(struct bpf_object *obj)
 {
 	struct extern_desc *ext;
-	struct btf *btf;
-	int i, j, id, btf_fd, err;
+	int i, err;
 
 	for (i = 0; i < obj->nr_extern; i++) {
-		const struct btf_type *targ_var, *targ_type;
-		__u32 targ_type_id, local_type_id;
-		const char *targ_var_name;
-		int ret;
-
 		ext = &obj->externs[i];
 		if (ext->type != EXT_KSYM || !ext->ksym.type_id)
 			continue;
 
-		btf = obj->btf_vmlinux;
-		btf_fd = 0;
-		id = btf__find_by_name_kind(btf, ext->name, BTF_KIND_VAR);
-		if (id == -ENOENT) {
-			err = load_module_btfs(obj);
-			if (err)
-				return err;
-
-			for (j = 0; j < obj->btf_module_cnt; j++) {
-				btf = obj->btf_modules[j].btf;
-				/* we assume module BTF FD is always >0 */
-				btf_fd = obj->btf_modules[j].fd;
-				id = btf__find_by_name_kind(btf, ext->name, BTF_KIND_VAR);
-				if (id != -ENOENT)
-					break;
-			}
-		}
-		if (id <= 0) {
-			pr_warn("extern (ksym) '%s': failed to find BTF ID in kernel BTF(s).\n",
-				ext->name);
-			return -ESRCH;
-		}
-
-		/* find local type_id */
-		local_type_id = ext->ksym.type_id;
-
-		/* find target type_id */
-		targ_var = btf__type_by_id(btf, id);
-		targ_var_name = btf__name_by_offset(btf, targ_var->name_off);
-		targ_type = skip_mods_and_typedefs(btf, targ_var->type, &targ_type_id);
-
-		ret = bpf_core_types_are_compat(obj->btf, local_type_id,
-						btf, targ_type_id);
-		if (ret <= 0) {
-			const struct btf_type *local_type;
-			const char *targ_name, *local_name;
-
-			local_type = btf__type_by_id(obj->btf, local_type_id);
-			local_name = btf__name_by_offset(obj->btf, local_type->name_off);
-			targ_name = btf__name_by_offset(btf, targ_type->name_off);
-
-			pr_warn("extern (ksym) '%s': incompatible types, expected [%d] %s %s, but kernel has [%d] %s %s\n",
-				ext->name, local_type_id,
-				btf_kind_str(local_type), local_name, targ_type_id,
-				btf_kind_str(targ_type), targ_name);
-			return -EINVAL;
-		}
-
-		ext->is_set = true;
-		ext->ksym.kernel_btf_obj_fd = btf_fd;
-		ext->ksym.kernel_btf_id = id;
-		pr_debug("extern (ksym) '%s': resolved to [%d] %s %s\n",
-			 ext->name, id, btf_kind_str(targ_var), targ_var_name);
+		err = bpf_object__resolve_ksym_var_btf_id(obj, ext);
+		if (err)
+			return err;
 	}
 	return 0;
 }

From 774e132e83d0f10a7ebbfe7db1debdaed6013f83 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Wed, 24 Mar 2021 18:52:14 -0700
Subject: [PATCH 19/68] libbpf: Refactor codes for finding btf id of a kernel
 symbol

This patch refactors code, that finds kernel btf_id by kind
and symbol name, to a new function find_ksym_btf_id().

It also adds a new helper __btf_kind_str() to return
a string by the numeric kind value.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20210325015214.1547069-1-kafai@fb.com
---
 tools/lib/bpf/libbpf.c | 44 +++++++++++++++++++++++++++++++-----------
 1 file changed, 33 insertions(+), 11 deletions(-)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index e289155510ca..2f5595ae0b84 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -1921,9 +1921,9 @@ resolve_func_ptr(const struct btf *btf, __u32 id, __u32 *res_id)
 	return btf_is_func_proto(t) ? t : NULL;
 }
 
-static const char *btf_kind_str(const struct btf_type *t)
+static const char *__btf_kind_str(__u16 kind)
 {
-	switch (btf_kind(t)) {
+	switch (kind) {
 	case BTF_KIND_UNKN: return "void";
 	case BTF_KIND_INT: return "int";
 	case BTF_KIND_PTR: return "ptr";
@@ -1945,6 +1945,11 @@ static const char *btf_kind_str(const struct btf_type *t)
 	}
 }
 
+static const char *btf_kind_str(const struct btf_type *t)
+{
+	return __btf_kind_str(btf_kind(t));
+}
+
 /*
  * Fetch integer attribute of BTF map definition. Such attributes are
  * represented using a pointer to an array, in which dimensionality of array
@@ -7395,18 +7400,17 @@ static int bpf_object__read_kallsyms_file(struct bpf_object *obj)
 	return err;
 }
 
-static int bpf_object__resolve_ksym_var_btf_id(struct bpf_object *obj,
-					       struct extern_desc *ext)
+static int find_ksym_btf_id(struct bpf_object *obj, const char *ksym_name,
+			    __u16 kind, struct btf **res_btf,
+			    int *res_btf_fd)
 {
-	const struct btf_type *targ_var, *targ_type;
-	__u32 targ_type_id, local_type_id;
-	const char *targ_var_name;
 	int i, id, btf_fd, err;
 	struct btf *btf;
 
 	btf = obj->btf_vmlinux;
 	btf_fd = 0;
-	id = btf__find_by_name_kind(btf, ext->name, BTF_KIND_VAR);
+	id = btf__find_by_name_kind(btf, ksym_name, kind);
+
 	if (id == -ENOENT) {
 		err = load_module_btfs(obj);
 		if (err)
@@ -7416,17 +7420,35 @@ static int bpf_object__resolve_ksym_var_btf_id(struct bpf_object *obj,
 			btf = obj->btf_modules[i].btf;
 			/* we assume module BTF FD is always >0 */
 			btf_fd = obj->btf_modules[i].fd;
-			id = btf__find_by_name_kind(btf, ext->name, BTF_KIND_VAR);
+			id = btf__find_by_name_kind(btf, ksym_name, kind);
 			if (id != -ENOENT)
 				break;
 		}
 	}
 	if (id <= 0) {
-		pr_warn("extern (var ksym) '%s': failed to find BTF ID in kernel BTF(s).\n",
-			ext->name);
+		pr_warn("extern (%s ksym) '%s': failed to find BTF ID in kernel BTF(s).\n",
+			__btf_kind_str(kind), ksym_name);
 		return -ESRCH;
 	}
 
+	*res_btf = btf;
+	*res_btf_fd = btf_fd;
+	return id;
+}
+
+static int bpf_object__resolve_ksym_var_btf_id(struct bpf_object *obj,
+					       struct extern_desc *ext)
+{
+	const struct btf_type *targ_var, *targ_type;
+	__u32 targ_type_id, local_type_id;
+	const char *targ_var_name;
+	int id, btf_fd = 0, err;
+	struct btf *btf = NULL;
+
+	id = find_ksym_btf_id(obj, ext->name, BTF_KIND_VAR, &btf, &btf_fd);
+	if (id < 0)
+		return id;
+
 	/* find local type_id */
 	local_type_id = ext->ksym.type_id;
 

From 0c091e5c2d37696589a3e0131a809b5499899995 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Wed, 24 Mar 2021 18:52:21 -0700
Subject: [PATCH 20/68] libbpf: Rename RELO_EXTERN to RELO_EXTERN_VAR

This patch renames RELO_EXTERN to RELO_EXTERN_VAR.
It is to avoid the confusion with a later patch adding
RELO_EXTERN_FUNC.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20210325015221.1547722-1-kafai@fb.com
---
 tools/lib/bpf/libbpf.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 2f5595ae0b84..0ac53dce37b7 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -185,7 +185,7 @@ enum reloc_type {
 	RELO_LD64,
 	RELO_CALL,
 	RELO_DATA,
-	RELO_EXTERN,
+	RELO_EXTERN_VAR,
 	RELO_SUBPROG_ADDR,
 };
 
@@ -3455,7 +3455,7 @@ static int bpf_program__record_reloc(struct bpf_program *prog,
 		}
 		pr_debug("prog '%s': found extern #%d '%s' (sym %d) for insn #%u\n",
 			 prog->name, i, ext->name, ext->sym_idx, insn_idx);
-		reloc_desc->type = RELO_EXTERN;
+		reloc_desc->type = RELO_EXTERN_VAR;
 		reloc_desc->insn_idx = insn_idx;
 		reloc_desc->sym_off = i; /* sym_off stores extern index */
 		return 0;
@@ -6218,7 +6218,7 @@ bpf_object__relocate_data(struct bpf_object *obj, struct bpf_program *prog)
 			insn[0].imm = obj->maps[relo->map_idx].fd;
 			relo->processed = true;
 			break;
-		case RELO_EXTERN:
+		case RELO_EXTERN_VAR:
 			ext = &obj->externs[relo->sym_off];
 			if (ext->type == EXT_KCFG) {
 				insn[0].src_reg = BPF_PSEUDO_MAP_VALUE;

From aa0b8d43e9537d371cbd3f272d3403f2b15201af Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Wed, 24 Mar 2021 18:52:27 -0700
Subject: [PATCH 21/68] libbpf: Record extern sym relocation first

This patch records the extern sym relocs first before recording
subprog relocs.  The later patch will have relocs for extern
kernel function call which is also using BPF_JMP | BPF_CALL.
It will be easier to handle the extern symbols first in
the later patch.

is_call_insn() helper is added.  The existing is_ldimm64() helper
is renamed to is_ldimm64_insn() for consistency.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20210325015227.1548623-1-kafai@fb.com
---
 tools/lib/bpf/libbpf.c | 63 +++++++++++++++++++++++-------------------
 1 file changed, 34 insertions(+), 29 deletions(-)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 0ac53dce37b7..e1615c274250 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -573,14 +573,19 @@ static bool insn_is_subprog_call(const struct bpf_insn *insn)
 	       insn->off == 0;
 }
 
-static bool is_ldimm64(struct bpf_insn *insn)
+static bool is_ldimm64_insn(struct bpf_insn *insn)
 {
 	return insn->code == (BPF_LD | BPF_IMM | BPF_DW);
 }
 
+static bool is_call_insn(const struct bpf_insn *insn)
+{
+	return insn->code == (BPF_JMP | BPF_CALL);
+}
+
 static bool insn_is_pseudo_func(struct bpf_insn *insn)
 {
-	return is_ldimm64(insn) && insn->src_reg == BPF_PSEUDO_FUNC;
+	return is_ldimm64_insn(insn) && insn->src_reg == BPF_PSEUDO_FUNC;
 }
 
 static int
@@ -3408,31 +3413,7 @@ static int bpf_program__record_reloc(struct bpf_program *prog,
 
 	reloc_desc->processed = false;
 
-	/* sub-program call relocation */
-	if (insn->code == (BPF_JMP | BPF_CALL)) {
-		if (insn->src_reg != BPF_PSEUDO_CALL) {
-			pr_warn("prog '%s': incorrect bpf_call opcode\n", prog->name);
-			return -LIBBPF_ERRNO__RELOC;
-		}
-		/* text_shndx can be 0, if no default "main" program exists */
-		if (!shdr_idx || shdr_idx != obj->efile.text_shndx) {
-			sym_sec_name = elf_sec_name(obj, elf_sec_by_idx(obj, shdr_idx));
-			pr_warn("prog '%s': bad call relo against '%s' in section '%s'\n",
-				prog->name, sym_name, sym_sec_name);
-			return -LIBBPF_ERRNO__RELOC;
-		}
-		if (sym->st_value % BPF_INSN_SZ) {
-			pr_warn("prog '%s': bad call relo against '%s' at offset %zu\n",
-				prog->name, sym_name, (size_t)sym->st_value);
-			return -LIBBPF_ERRNO__RELOC;
-		}
-		reloc_desc->type = RELO_CALL;
-		reloc_desc->insn_idx = insn_idx;
-		reloc_desc->sym_off = sym->st_value;
-		return 0;
-	}
-
-	if (!is_ldimm64(insn)) {
+	if (!is_call_insn(insn) && !is_ldimm64_insn(insn)) {
 		pr_warn("prog '%s': invalid relo against '%s' for insns[%d].code 0x%x\n",
 			prog->name, sym_name, insn_idx, insn->code);
 		return -LIBBPF_ERRNO__RELOC;
@@ -3461,6 +3442,30 @@ static int bpf_program__record_reloc(struct bpf_program *prog,
 		return 0;
 	}
 
+	/* sub-program call relocation */
+	if (is_call_insn(insn)) {
+		if (insn->src_reg != BPF_PSEUDO_CALL) {
+			pr_warn("prog '%s': incorrect bpf_call opcode\n", prog->name);
+			return -LIBBPF_ERRNO__RELOC;
+		}
+		/* text_shndx can be 0, if no default "main" program exists */
+		if (!shdr_idx || shdr_idx != obj->efile.text_shndx) {
+			sym_sec_name = elf_sec_name(obj, elf_sec_by_idx(obj, shdr_idx));
+			pr_warn("prog '%s': bad call relo against '%s' in section '%s'\n",
+				prog->name, sym_name, sym_sec_name);
+			return -LIBBPF_ERRNO__RELOC;
+		}
+		if (sym->st_value % BPF_INSN_SZ) {
+			pr_warn("prog '%s': bad call relo against '%s' at offset %zu\n",
+				prog->name, sym_name, (size_t)sym->st_value);
+			return -LIBBPF_ERRNO__RELOC;
+		}
+		reloc_desc->type = RELO_CALL;
+		reloc_desc->insn_idx = insn_idx;
+		reloc_desc->sym_off = sym->st_value;
+		return 0;
+	}
+
 	if (!shdr_idx || shdr_idx >= SHN_LORESERVE) {
 		pr_warn("prog '%s': invalid relo against '%s' in special section 0x%x; forgot to initialize global var?..\n",
 			prog->name, sym_name, shdr_idx);
@@ -5700,7 +5705,7 @@ static int bpf_core_patch_insn(struct bpf_program *prog,
 		/* poison second part of ldimm64 to avoid confusing error from
 		 * verifier about "unknown opcode 00"
 		 */
-		if (is_ldimm64(insn))
+		if (is_ldimm64_insn(insn))
 			bpf_core_poison_insn(prog, relo_idx, insn_idx + 1, insn + 1);
 		bpf_core_poison_insn(prog, relo_idx, insn_idx, insn);
 		return 0;
@@ -5776,7 +5781,7 @@ static int bpf_core_patch_insn(struct bpf_program *prog,
 	case BPF_LD: {
 		__u64 imm;
 
-		if (!is_ldimm64(insn) ||
+		if (!is_ldimm64_insn(insn) ||
 		    insn[0].src_reg != 0 || insn[0].off != 0 ||
 		    insn_idx + 1 >= prog->insns_cnt ||
 		    insn[1].code != 0 || insn[1].dst_reg != 0 ||

From 5bd022ec01f060f30672cc6383b8b04e75a4310d Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Wed, 24 Mar 2021 18:52:34 -0700
Subject: [PATCH 22/68] libbpf: Support extern kernel function

This patch is to make libbpf able to handle the following extern
kernel function declaration and do the needed relocations before
loading the bpf program to the kernel.

extern int foo(struct sock *) __attribute__((section(".ksyms")))

In the collect extern phase, needed changes is made to
bpf_object__collect_externs() and find_extern_btf_id() to collect
extern function in ".ksyms" section.  The func in the BTF datasec also
needs to be replaced by an int var.  The idea is similar to the existing
handling in extern var.  In case the BTF may not have a var, a dummy ksym
var is added at the beginning of bpf_object__collect_externs()
if there is func under ksyms datasec.  It will also change the
func linkage from extern to global which the kernel can support.
It also assigns a param name if it does not have one.

In the collect relo phase, it will record the kernel function
call as RELO_EXTERN_FUNC.

bpf_object__resolve_ksym_func_btf_id() is added to find the func
btf_id of the running kernel.

During actual relocation, it will patch the BPF_CALL instruction with
src_reg = BPF_PSEUDO_FUNC_CALL and insn->imm set to the running
kernel func's btf_id.

The required LLVM patch: https://reviews.llvm.org/D93563

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20210325015234.1548923-1-kafai@fb.com
---
 tools/lib/bpf/libbpf.c | 174 ++++++++++++++++++++++++++++++++++++++---
 1 file changed, 162 insertions(+), 12 deletions(-)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index e1615c274250..7aad78dbb4b4 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -186,6 +186,7 @@ enum reloc_type {
 	RELO_CALL,
 	RELO_DATA,
 	RELO_EXTERN_VAR,
+	RELO_EXTERN_FUNC,
 	RELO_SUBPROG_ADDR,
 };
 
@@ -1955,6 +1956,11 @@ static const char *btf_kind_str(const struct btf_type *t)
 	return __btf_kind_str(btf_kind(t));
 }
 
+static enum btf_func_linkage btf_func_linkage(const struct btf_type *t)
+{
+	return (enum btf_func_linkage)BTF_INFO_VLEN(t->info);
+}
+
 /*
  * Fetch integer attribute of BTF map definition. Such attributes are
  * represented using a pointer to an array, in which dimensionality of array
@@ -3019,7 +3025,7 @@ static bool sym_is_subprog(const GElf_Sym *sym, int text_shndx)
 static int find_extern_btf_id(const struct btf *btf, const char *ext_name)
 {
 	const struct btf_type *t;
-	const char *var_name;
+	const char *tname;
 	int i, n;
 
 	if (!btf)
@@ -3029,14 +3035,18 @@ static int find_extern_btf_id(const struct btf *btf, const char *ext_name)
 	for (i = 1; i <= n; i++) {
 		t = btf__type_by_id(btf, i);
 
-		if (!btf_is_var(t))
+		if (!btf_is_var(t) && !btf_is_func(t))
 			continue;
 
-		var_name = btf__name_by_offset(btf, t->name_off);
-		if (strcmp(var_name, ext_name))
+		tname = btf__name_by_offset(btf, t->name_off);
+		if (strcmp(tname, ext_name))
 			continue;
 
-		if (btf_var(t)->linkage != BTF_VAR_GLOBAL_EXTERN)
+		if (btf_is_var(t) &&
+		    btf_var(t)->linkage != BTF_VAR_GLOBAL_EXTERN)
+			return -EINVAL;
+
+		if (btf_is_func(t) && btf_func_linkage(t) != BTF_FUNC_EXTERN)
 			return -EINVAL;
 
 		return i;
@@ -3149,12 +3159,48 @@ static int find_int_btf_id(const struct btf *btf)
 	return 0;
 }
 
+static int add_dummy_ksym_var(struct btf *btf)
+{
+	int i, int_btf_id, sec_btf_id, dummy_var_btf_id;
+	const struct btf_var_secinfo *vs;
+	const struct btf_type *sec;
+
+	sec_btf_id = btf__find_by_name_kind(btf, KSYMS_SEC,
+					    BTF_KIND_DATASEC);
+	if (sec_btf_id < 0)
+		return 0;
+
+	sec = btf__type_by_id(btf, sec_btf_id);
+	vs = btf_var_secinfos(sec);
+	for (i = 0; i < btf_vlen(sec); i++, vs++) {
+		const struct btf_type *vt;
+
+		vt = btf__type_by_id(btf, vs->type);
+		if (btf_is_func(vt))
+			break;
+	}
+
+	/* No func in ksyms sec.  No need to add dummy var. */
+	if (i == btf_vlen(sec))
+		return 0;
+
+	int_btf_id = find_int_btf_id(btf);
+	dummy_var_btf_id = btf__add_var(btf,
+					"dummy_ksym",
+					BTF_VAR_GLOBAL_ALLOCATED,
+					int_btf_id);
+	if (dummy_var_btf_id < 0)
+		pr_warn("cannot create a dummy_ksym var\n");
+
+	return dummy_var_btf_id;
+}
+
 static int bpf_object__collect_externs(struct bpf_object *obj)
 {
 	struct btf_type *sec, *kcfg_sec = NULL, *ksym_sec = NULL;
 	const struct btf_type *t;
 	struct extern_desc *ext;
-	int i, n, off;
+	int i, n, off, dummy_var_btf_id;
 	const char *ext_name, *sec_name;
 	Elf_Scn *scn;
 	GElf_Shdr sh;
@@ -3166,6 +3212,10 @@ static int bpf_object__collect_externs(struct bpf_object *obj)
 	if (elf_sec_hdr(obj, scn, &sh))
 		return -LIBBPF_ERRNO__FORMAT;
 
+	dummy_var_btf_id = add_dummy_ksym_var(obj->btf);
+	if (dummy_var_btf_id < 0)
+		return dummy_var_btf_id;
+
 	n = sh.sh_size / sh.sh_entsize;
 	pr_debug("looking for externs among %d symbols...\n", n);
 
@@ -3210,6 +3260,11 @@ static int bpf_object__collect_externs(struct bpf_object *obj)
 		sec_name = btf__name_by_offset(obj->btf, sec->name_off);
 
 		if (strcmp(sec_name, KCONFIG_SEC) == 0) {
+			if (btf_is_func(t)) {
+				pr_warn("extern function %s is unsupported under %s section\n",
+					ext->name, KCONFIG_SEC);
+				return -ENOTSUP;
+			}
 			kcfg_sec = sec;
 			ext->type = EXT_KCFG;
 			ext->kcfg.sz = btf__resolve_size(obj->btf, t->type);
@@ -3231,6 +3286,11 @@ static int bpf_object__collect_externs(struct bpf_object *obj)
 				return -ENOTSUP;
 			}
 		} else if (strcmp(sec_name, KSYMS_SEC) == 0) {
+			if (btf_is_func(t) && ext->is_weak) {
+				pr_warn("extern weak function %s is unsupported\n",
+					ext->name);
+				return -ENOTSUP;
+			}
 			ksym_sec = sec;
 			ext->type = EXT_KSYM;
 			skip_mods_and_typedefs(obj->btf, t->type,
@@ -3257,7 +3317,14 @@ static int bpf_object__collect_externs(struct bpf_object *obj)
 		 * extern variables in DATASEC
 		 */
 		int int_btf_id = find_int_btf_id(obj->btf);
+		/* For extern function, a dummy_var added earlier
+		 * will be used to replace the vs->type and
+		 * its name string will be used to refill
+		 * the missing param's name.
+		 */
+		const struct btf_type *dummy_var;
 
+		dummy_var = btf__type_by_id(obj->btf, dummy_var_btf_id);
 		for (i = 0; i < obj->nr_extern; i++) {
 			ext = &obj->externs[i];
 			if (ext->type != EXT_KSYM)
@@ -3276,12 +3343,32 @@ static int bpf_object__collect_externs(struct bpf_object *obj)
 			ext_name = btf__name_by_offset(obj->btf, vt->name_off);
 			ext = find_extern_by_name(obj, ext_name);
 			if (!ext) {
-				pr_warn("failed to find extern definition for BTF var '%s'\n",
-					ext_name);
+				pr_warn("failed to find extern definition for BTF %s '%s'\n",
+					btf_kind_str(vt), ext_name);
 				return -ESRCH;
 			}
-			btf_var(vt)->linkage = BTF_VAR_GLOBAL_ALLOCATED;
-			vt->type = int_btf_id;
+			if (btf_is_func(vt)) {
+				const struct btf_type *func_proto;
+				struct btf_param *param;
+				int j;
+
+				func_proto = btf__type_by_id(obj->btf,
+							     vt->type);
+				param = btf_params(func_proto);
+				/* Reuse the dummy_var string if the
+				 * func proto does not have param name.
+				 */
+				for (j = 0; j < btf_vlen(func_proto); j++)
+					if (param[j].type && !param[j].name_off)
+						param[j].name_off =
+							dummy_var->name_off;
+				vs->type = dummy_var_btf_id;
+				vt->info &= ~0xffff;
+				vt->info |= BTF_FUNC_GLOBAL;
+			} else {
+				btf_var(vt)->linkage = BTF_VAR_GLOBAL_ALLOCATED;
+				vt->type = int_btf_id;
+			}
 			vs->offset = off;
 			vs->size = sizeof(int);
 		}
@@ -3436,7 +3523,10 @@ static int bpf_program__record_reloc(struct bpf_program *prog,
 		}
 		pr_debug("prog '%s': found extern #%d '%s' (sym %d) for insn #%u\n",
 			 prog->name, i, ext->name, ext->sym_idx, insn_idx);
-		reloc_desc->type = RELO_EXTERN_VAR;
+		if (insn->code == (BPF_JMP | BPF_CALL))
+			reloc_desc->type = RELO_EXTERN_FUNC;
+		else
+			reloc_desc->type = RELO_EXTERN_VAR;
 		reloc_desc->insn_idx = insn_idx;
 		reloc_desc->sym_off = i; /* sym_off stores extern index */
 		return 0;
@@ -6241,6 +6331,12 @@ bpf_object__relocate_data(struct bpf_object *obj, struct bpf_program *prog)
 			}
 			relo->processed = true;
 			break;
+		case RELO_EXTERN_FUNC:
+			ext = &obj->externs[relo->sym_off];
+			insn[0].src_reg = BPF_PSEUDO_KFUNC_CALL;
+			insn[0].imm = ext->ksym.kernel_btf_id;
+			relo->processed = true;
+			break;
 		case RELO_SUBPROG_ADDR:
 			insn[0].src_reg = BPF_PSEUDO_FUNC;
 			/* will be handled as a follow up pass */
@@ -7361,6 +7457,7 @@ static int bpf_object__read_kallsyms_file(struct bpf_object *obj)
 {
 	char sym_type, sym_name[500];
 	unsigned long long sym_addr;
+	const struct btf_type *t;
 	struct extern_desc *ext;
 	int ret, err = 0;
 	FILE *f;
@@ -7387,6 +7484,10 @@ static int bpf_object__read_kallsyms_file(struct bpf_object *obj)
 		if (!ext || ext->type != EXT_KSYM)
 			continue;
 
+		t = btf__type_by_id(obj->btf, ext->btf_id);
+		if (!btf_is_var(t))
+			continue;
+
 		if (ext->is_set && ext->ksym.addr != sym_addr) {
 			pr_warn("extern (ksym) '%s' resolution is ambiguous: 0x%llx or 0x%llx\n",
 				sym_name, ext->ksym.addr, sym_addr);
@@ -7488,8 +7589,53 @@ static int bpf_object__resolve_ksym_var_btf_id(struct bpf_object *obj,
 	return 0;
 }
 
+static int bpf_object__resolve_ksym_func_btf_id(struct bpf_object *obj,
+						struct extern_desc *ext)
+{
+	int local_func_proto_id, kfunc_proto_id, kfunc_id;
+	const struct btf_type *kern_func;
+	struct btf *kern_btf = NULL;
+	int ret, kern_btf_fd = 0;
+
+	local_func_proto_id = ext->ksym.type_id;
+
+	kfunc_id = find_ksym_btf_id(obj, ext->name, BTF_KIND_FUNC,
+				    &kern_btf, &kern_btf_fd);
+	if (kfunc_id < 0) {
+		pr_warn("extern (func ksym) '%s': not found in kernel BTF\n",
+			ext->name);
+		return kfunc_id;
+	}
+
+	if (kern_btf != obj->btf_vmlinux) {
+		pr_warn("extern (func ksym) '%s': function in kernel module is not supported\n",
+			ext->name);
+		return -ENOTSUP;
+	}
+
+	kern_func = btf__type_by_id(kern_btf, kfunc_id);
+	kfunc_proto_id = kern_func->type;
+
+	ret = bpf_core_types_are_compat(obj->btf, local_func_proto_id,
+					kern_btf, kfunc_proto_id);
+	if (ret <= 0) {
+		pr_warn("extern (func ksym) '%s': func_proto [%d] incompatible with kernel [%d]\n",
+			ext->name, local_func_proto_id, kfunc_proto_id);
+		return -EINVAL;
+	}
+
+	ext->is_set = true;
+	ext->ksym.kernel_btf_obj_fd = kern_btf_fd;
+	ext->ksym.kernel_btf_id = kfunc_id;
+	pr_debug("extern (func ksym) '%s': resolved to kernel [%d]\n",
+		 ext->name, kfunc_id);
+
+	return 0;
+}
+
 static int bpf_object__resolve_ksyms_btf_id(struct bpf_object *obj)
 {
+	const struct btf_type *t;
 	struct extern_desc *ext;
 	int i, err;
 
@@ -7498,7 +7644,11 @@ static int bpf_object__resolve_ksyms_btf_id(struct bpf_object *obj)
 		if (ext->type != EXT_KSYM || !ext->ksym.type_id)
 			continue;
 
-		err = bpf_object__resolve_ksym_var_btf_id(obj, ext);
+		t = btf__type_by_id(obj->btf, ext->btf_id);
+		if (btf_is_var(t))
+			err = bpf_object__resolve_ksym_var_btf_id(obj, ext);
+		else
+			err = bpf_object__resolve_ksym_func_btf_id(obj, ext);
 		if (err)
 			return err;
 	}

From 39cd9e0f6783fd2dd2b0e95500e34575b3707ed8 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Wed, 24 Mar 2021 18:52:40 -0700
Subject: [PATCH 23/68] bpf: selftests: Rename bictcp to bpf_cubic

As a similar chanage in the kernel, this patch gives the proper
name to the bpf cubic.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20210325015240.1550074-1-kafai@fb.com
---
 tools/testing/selftests/bpf/progs/bpf_cubic.c | 30 +++++++++----------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/tools/testing/selftests/bpf/progs/bpf_cubic.c b/tools/testing/selftests/bpf/progs/bpf_cubic.c
index 6939bfd8690f..33c4d2bded64 100644
--- a/tools/testing/selftests/bpf/progs/bpf_cubic.c
+++ b/tools/testing/selftests/bpf/progs/bpf_cubic.c
@@ -174,8 +174,8 @@ static __always_inline void bictcp_hystart_reset(struct sock *sk)
  * as long as it is used in one of the func ptr
  * under SEC(".struct_ops").
  */
-SEC("struct_ops/bictcp_init")
-void BPF_PROG(bictcp_init, struct sock *sk)
+SEC("struct_ops/bpf_cubic_init")
+void BPF_PROG(bpf_cubic_init, struct sock *sk)
 {
 	struct bictcp *ca = inet_csk_ca(sk);
 
@@ -192,7 +192,7 @@ void BPF_PROG(bictcp_init, struct sock *sk)
  * The remaining tcp-cubic functions have an easier way.
  */
 SEC("no-sec-prefix-bictcp_cwnd_event")
-void BPF_PROG(bictcp_cwnd_event, struct sock *sk, enum tcp_ca_event event)
+void BPF_PROG(bpf_cubic_cwnd_event, struct sock *sk, enum tcp_ca_event event)
 {
 	if (event == CA_EVENT_TX_START) {
 		struct bictcp *ca = inet_csk_ca(sk);
@@ -384,7 +384,7 @@ static __always_inline void bictcp_update(struct bictcp *ca, __u32 cwnd,
 }
 
 /* Or simply use the BPF_STRUCT_OPS to avoid the SEC boiler plate. */
-void BPF_STRUCT_OPS(bictcp_cong_avoid, struct sock *sk, __u32 ack, __u32 acked)
+void BPF_STRUCT_OPS(bpf_cubic_cong_avoid, struct sock *sk, __u32 ack, __u32 acked)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct bictcp *ca = inet_csk_ca(sk);
@@ -403,7 +403,7 @@ void BPF_STRUCT_OPS(bictcp_cong_avoid, struct sock *sk, __u32 ack, __u32 acked)
 	tcp_cong_avoid_ai(tp, ca->cnt, acked);
 }
 
-__u32 BPF_STRUCT_OPS(bictcp_recalc_ssthresh, struct sock *sk)
+__u32 BPF_STRUCT_OPS(bpf_cubic_recalc_ssthresh, struct sock *sk)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 	struct bictcp *ca = inet_csk_ca(sk);
@@ -420,7 +420,7 @@ __u32 BPF_STRUCT_OPS(bictcp_recalc_ssthresh, struct sock *sk)
 	return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U);
 }
 
-void BPF_STRUCT_OPS(bictcp_state, struct sock *sk, __u8 new_state)
+void BPF_STRUCT_OPS(bpf_cubic_state, struct sock *sk, __u8 new_state)
 {
 	if (new_state == TCP_CA_Loss) {
 		bictcp_reset(inet_csk_ca(sk));
@@ -496,7 +496,7 @@ static __always_inline void hystart_update(struct sock *sk, __u32 delay)
 	}
 }
 
-void BPF_STRUCT_OPS(bictcp_acked, struct sock *sk,
+void BPF_STRUCT_OPS(bpf_cubic_acked, struct sock *sk,
 		    const struct ack_sample *sample)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
@@ -525,7 +525,7 @@ void BPF_STRUCT_OPS(bictcp_acked, struct sock *sk,
 		hystart_update(sk, delay);
 }
 
-__u32 BPF_STRUCT_OPS(tcp_reno_undo_cwnd, struct sock *sk)
+__u32 BPF_STRUCT_OPS(bpf_cubic_undo_cwnd, struct sock *sk)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 
@@ -534,12 +534,12 @@ __u32 BPF_STRUCT_OPS(tcp_reno_undo_cwnd, struct sock *sk)
 
 SEC(".struct_ops")
 struct tcp_congestion_ops cubic = {
-	.init		= (void *)bictcp_init,
-	.ssthresh	= (void *)bictcp_recalc_ssthresh,
-	.cong_avoid	= (void *)bictcp_cong_avoid,
-	.set_state	= (void *)bictcp_state,
-	.undo_cwnd	= (void *)tcp_reno_undo_cwnd,
-	.cwnd_event	= (void *)bictcp_cwnd_event,
-	.pkts_acked     = (void *)bictcp_acked,
+	.init		= (void *)bpf_cubic_init,
+	.ssthresh	= (void *)bpf_cubic_recalc_ssthresh,
+	.cong_avoid	= (void *)bpf_cubic_cong_avoid,
+	.set_state	= (void *)bpf_cubic_state,
+	.undo_cwnd	= (void *)bpf_cubic_undo_cwnd,
+	.cwnd_event	= (void *)bpf_cubic_cwnd_event,
+	.pkts_acked     = (void *)bpf_cubic_acked,
 	.name		= "bpf_cubic",
 };

From 78e60bbbe8e8f614b6a453d8a780d9e6f77749a8 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Wed, 24 Mar 2021 18:52:46 -0700
Subject: [PATCH 24/68] bpf: selftests: Bpf_cubic and bpf_dctcp calling kernel
 functions

This patch removes the bpf implementation of tcp_slow_start()
and tcp_cong_avoid_ai().  Instead, it directly uses the kernel
implementation.

It also replaces the bpf_cubic_undo_cwnd implementation by directly
calling tcp_reno_undo_cwnd().  bpf_dctcp also directly calls
tcp_reno_cong_avoid() instead.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20210325015246.1551062-1-kafai@fb.com
---
 tools/testing/selftests/bpf/bpf_tcp_helpers.h | 29 ++-----------------
 tools/testing/selftests/bpf/progs/bpf_cubic.c |  6 ++--
 tools/testing/selftests/bpf/progs/bpf_dctcp.c | 22 ++++----------
 3 files changed, 11 insertions(+), 46 deletions(-)

diff --git a/tools/testing/selftests/bpf/bpf_tcp_helpers.h b/tools/testing/selftests/bpf/bpf_tcp_helpers.h
index 91f0fac632f4..029589c008c9 100644
--- a/tools/testing/selftests/bpf/bpf_tcp_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_tcp_helpers.h
@@ -187,16 +187,6 @@ struct tcp_congestion_ops {
 	typeof(y) __y = (y);			\
 	__x == 0 ? __y : ((__y == 0) ? __x : min(__x, __y)); })
 
-static __always_inline __u32 tcp_slow_start(struct tcp_sock *tp, __u32 acked)
-{
-	__u32 cwnd = min(tp->snd_cwnd + acked, tp->snd_ssthresh);
-
-	acked -= cwnd - tp->snd_cwnd;
-	tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp);
-
-	return acked;
-}
-
 static __always_inline bool tcp_in_slow_start(const struct tcp_sock *tp)
 {
 	return tp->snd_cwnd < tp->snd_ssthresh;
@@ -213,22 +203,7 @@ static __always_inline bool tcp_is_cwnd_limited(const struct sock *sk)
 	return !!BPF_CORE_READ_BITFIELD(tp, is_cwnd_limited);
 }
 
-static __always_inline void tcp_cong_avoid_ai(struct tcp_sock *tp, __u32 w, __u32 acked)
-{
-	/* If credits accumulated at a higher w, apply them gently now. */
-	if (tp->snd_cwnd_cnt >= w) {
-		tp->snd_cwnd_cnt = 0;
-		tp->snd_cwnd++;
-	}
-
-	tp->snd_cwnd_cnt += acked;
-	if (tp->snd_cwnd_cnt >= w) {
-		__u32 delta = tp->snd_cwnd_cnt / w;
-
-		tp->snd_cwnd_cnt -= delta * w;
-		tp->snd_cwnd += delta;
-	}
-	tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_cwnd_clamp);
-}
+extern __u32 tcp_slow_start(struct tcp_sock *tp, __u32 acked) __ksym;
+extern void tcp_cong_avoid_ai(struct tcp_sock *tp, __u32 w, __u32 acked) __ksym;
 
 #endif
diff --git a/tools/testing/selftests/bpf/progs/bpf_cubic.c b/tools/testing/selftests/bpf/progs/bpf_cubic.c
index 33c4d2bded64..f62df4d023f9 100644
--- a/tools/testing/selftests/bpf/progs/bpf_cubic.c
+++ b/tools/testing/selftests/bpf/progs/bpf_cubic.c
@@ -525,11 +525,11 @@ void BPF_STRUCT_OPS(bpf_cubic_acked, struct sock *sk,
 		hystart_update(sk, delay);
 }
 
+extern __u32 tcp_reno_undo_cwnd(struct sock *sk) __ksym;
+
 __u32 BPF_STRUCT_OPS(bpf_cubic_undo_cwnd, struct sock *sk)
 {
-	const struct tcp_sock *tp = tcp_sk(sk);
-
-	return max(tp->snd_cwnd, tp->prior_cwnd);
+	return tcp_reno_undo_cwnd(sk);
 }
 
 SEC(".struct_ops")
diff --git a/tools/testing/selftests/bpf/progs/bpf_dctcp.c b/tools/testing/selftests/bpf/progs/bpf_dctcp.c
index 4dc1a967776a..fd42247da8b4 100644
--- a/tools/testing/selftests/bpf/progs/bpf_dctcp.c
+++ b/tools/testing/selftests/bpf/progs/bpf_dctcp.c
@@ -194,22 +194,12 @@ __u32 BPF_PROG(dctcp_cwnd_undo, struct sock *sk)
 	return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd);
 }
 
-SEC("struct_ops/tcp_reno_cong_avoid")
-void BPF_PROG(tcp_reno_cong_avoid, struct sock *sk, __u32 ack, __u32 acked)
+extern void tcp_reno_cong_avoid(struct sock *sk, __u32 ack, __u32 acked) __ksym;
+
+SEC("struct_ops/dctcp_reno_cong_avoid")
+void BPF_PROG(dctcp_cong_avoid, struct sock *sk, __u32 ack, __u32 acked)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
-
-	if (!tcp_is_cwnd_limited(sk))
-		return;
-
-	/* In "safe" area, increase. */
-	if (tcp_in_slow_start(tp)) {
-		acked = tcp_slow_start(tp, acked);
-		if (!acked)
-			return;
-	}
-	/* In dangerous area, increase slowly. */
-	tcp_cong_avoid_ai(tp, tp->snd_cwnd, acked);
+	tcp_reno_cong_avoid(sk, ack, acked);
 }
 
 SEC(".struct_ops")
@@ -226,7 +216,7 @@ struct tcp_congestion_ops dctcp = {
 	.in_ack_event   = (void *)dctcp_update_alpha,
 	.cwnd_event	= (void *)dctcp_cwnd_event,
 	.ssthresh	= (void *)dctcp_ssthresh,
-	.cong_avoid	= (void *)tcp_reno_cong_avoid,
+	.cong_avoid	= (void *)dctcp_cong_avoid,
 	.undo_cwnd	= (void *)dctcp_cwnd_undo,
 	.set_state	= (void *)dctcp_state,
 	.flags		= TCP_CONG_NEEDS_ECN,

From 7bd1590d4eba1583f6ee85e8cfe556505f761e19 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Wed, 24 Mar 2021 18:52:52 -0700
Subject: [PATCH 25/68] bpf: selftests: Add kfunc_call test

This patch adds a few kernel function bpf_kfunc_call_test*() for the
selftest's test_run purpose.  They will be allowed for tc_cls prog.

The selftest calling the kernel function bpf_kfunc_call_test*()
is also added in this patch.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20210325015252.1551395-1-kafai@fb.com
---
 include/linux/bpf.h                           |  6 ++
 net/bpf/test_run.c                            | 28 +++++++++
 net/core/filter.c                             |  1 +
 .../selftests/bpf/prog_tests/kfunc_call.c     | 59 +++++++++++++++++++
 .../selftests/bpf/progs/kfunc_call_test.c     | 47 +++++++++++++++
 .../bpf/progs/kfunc_call_test_subprog.c       | 42 +++++++++++++
 6 files changed, 183 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/kfunc_call.c
 create mode 100644 tools/testing/selftests/bpf/progs/kfunc_call_test.c
 create mode 100644 tools/testing/selftests/bpf/progs/kfunc_call_test_subprog.c

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index b5b7967e3ff3..9fdd839b418c 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1532,6 +1532,7 @@ int bpf_prog_test_run_raw_tp(struct bpf_prog *prog,
 int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog,
 				const union bpf_attr *kattr,
 				union bpf_attr __user *uattr);
+bool bpf_prog_test_check_kfunc_call(u32 kfunc_id);
 bool btf_ctx_access(int off, int size, enum bpf_access_type type,
 		    const struct bpf_prog *prog,
 		    struct bpf_insn_access_aux *info);
@@ -1731,6 +1732,11 @@ static inline int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog,
 	return -ENOTSUPP;
 }
 
+static inline bool bpf_prog_test_check_kfunc_call(u32 kfunc_id)
+{
+	return false;
+}
+
 static inline void bpf_map_put(struct bpf_map *map)
 {
 }
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 4aabf71cd95d..a5d72c48fb66 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -2,6 +2,7 @@
 /* Copyright (c) 2017 Facebook
  */
 #include <linux/bpf.h>
+#include <linux/btf_ids.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/etherdevice.h>
@@ -213,10 +214,37 @@ int noinline bpf_modify_return_test(int a, int *b)
 	*b += 1;
 	return a + *b;
 }
+
+u64 noinline bpf_kfunc_call_test1(struct sock *sk, u32 a, u64 b, u32 c, u64 d)
+{
+	return a + b + c + d;
+}
+
+int noinline bpf_kfunc_call_test2(struct sock *sk, u32 a, u32 b)
+{
+	return a + b;
+}
+
+struct sock * noinline bpf_kfunc_call_test3(struct sock *sk)
+{
+	return sk;
+}
+
 __diag_pop();
 
 ALLOW_ERROR_INJECTION(bpf_modify_return_test, ERRNO);
 
+BTF_SET_START(test_sk_kfunc_ids)
+BTF_ID(func, bpf_kfunc_call_test1)
+BTF_ID(func, bpf_kfunc_call_test2)
+BTF_ID(func, bpf_kfunc_call_test3)
+BTF_SET_END(test_sk_kfunc_ids)
+
+bool bpf_prog_test_check_kfunc_call(u32 kfunc_id)
+{
+	return btf_id_set_contains(&test_sk_kfunc_ids, kfunc_id);
+}
+
 static void *bpf_test_init(const union bpf_attr *kattr, u32 size,
 			   u32 headroom, u32 tailroom)
 {
diff --git a/net/core/filter.c b/net/core/filter.c
index 17dc159ec40c..cae56d08a670 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -9813,6 +9813,7 @@ const struct bpf_verifier_ops tc_cls_act_verifier_ops = {
 	.convert_ctx_access	= tc_cls_act_convert_ctx_access,
 	.gen_prologue		= tc_cls_act_prologue,
 	.gen_ld_abs		= bpf_gen_ld_abs,
+	.check_kfunc_call	= bpf_prog_test_check_kfunc_call,
 };
 
 const struct bpf_prog_ops tc_cls_act_prog_ops = {
diff --git a/tools/testing/selftests/bpf/prog_tests/kfunc_call.c b/tools/testing/selftests/bpf/prog_tests/kfunc_call.c
new file mode 100644
index 000000000000..7fc0951ee75f
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/kfunc_call.c
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Facebook */
+#include <test_progs.h>
+#include <network_helpers.h>
+#include "kfunc_call_test.skel.h"
+#include "kfunc_call_test_subprog.skel.h"
+
+static void test_main(void)
+{
+	struct kfunc_call_test *skel;
+	int prog_fd, retval, err;
+
+	skel = kfunc_call_test__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "skel"))
+		return;
+
+	prog_fd = bpf_program__fd(skel->progs.kfunc_call_test1);
+	err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4),
+				NULL, NULL, (__u32 *)&retval, NULL);
+	ASSERT_OK(err, "bpf_prog_test_run(test1)");
+	ASSERT_EQ(retval, 12, "test1-retval");
+
+	prog_fd = bpf_program__fd(skel->progs.kfunc_call_test2);
+	err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4),
+				NULL, NULL, (__u32 *)&retval, NULL);
+	ASSERT_OK(err, "bpf_prog_test_run(test2)");
+	ASSERT_EQ(retval, 3, "test2-retval");
+
+	kfunc_call_test__destroy(skel);
+}
+
+static void test_subprog(void)
+{
+	struct kfunc_call_test_subprog *skel;
+	int prog_fd, retval, err;
+
+	skel = kfunc_call_test_subprog__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "skel"))
+		return;
+
+	prog_fd = bpf_program__fd(skel->progs.kfunc_call_test1);
+	err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4),
+				NULL, NULL, (__u32 *)&retval, NULL);
+	ASSERT_OK(err, "bpf_prog_test_run(test1)");
+	ASSERT_EQ(retval, 10, "test1-retval");
+	ASSERT_NEQ(skel->data->active_res, -1, "active_res");
+	ASSERT_EQ(skel->data->sk_state, BPF_TCP_CLOSE, "sk_state");
+
+	kfunc_call_test_subprog__destroy(skel);
+}
+
+void test_kfunc_call(void)
+{
+	if (test__start_subtest("main"))
+		test_main();
+
+	if (test__start_subtest("subprog"))
+		test_subprog();
+}
diff --git a/tools/testing/selftests/bpf/progs/kfunc_call_test.c b/tools/testing/selftests/bpf/progs/kfunc_call_test.c
new file mode 100644
index 000000000000..470f8723e463
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/kfunc_call_test.c
@@ -0,0 +1,47 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Facebook */
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include "bpf_tcp_helpers.h"
+
+extern int bpf_kfunc_call_test2(struct sock *sk, __u32 a, __u32 b) __ksym;
+extern __u64 bpf_kfunc_call_test1(struct sock *sk, __u32 a, __u64 b,
+				  __u32 c, __u64 d) __ksym;
+
+SEC("classifier")
+int kfunc_call_test2(struct __sk_buff *skb)
+{
+	struct bpf_sock *sk = skb->sk;
+
+	if (!sk)
+		return -1;
+
+	sk = bpf_sk_fullsock(sk);
+	if (!sk)
+		return -1;
+
+	return bpf_kfunc_call_test2((struct sock *)sk, 1, 2);
+}
+
+SEC("classifier")
+int kfunc_call_test1(struct __sk_buff *skb)
+{
+	struct bpf_sock *sk = skb->sk;
+	__u64 a = 1ULL << 32;
+	__u32 ret;
+
+	if (!sk)
+		return -1;
+
+	sk = bpf_sk_fullsock(sk);
+	if (!sk)
+		return -1;
+
+	a = bpf_kfunc_call_test1((struct sock *)sk, 1, a | 2, 3, a | 4);
+	ret = a >> 32;   /* ret should be 2 */
+	ret += (__u32)a; /* ret should be 12 */
+
+	return ret;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/kfunc_call_test_subprog.c b/tools/testing/selftests/bpf/progs/kfunc_call_test_subprog.c
new file mode 100644
index 000000000000..b2dcb7d9cb03
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/kfunc_call_test_subprog.c
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Facebook */
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include "bpf_tcp_helpers.h"
+
+extern const int bpf_prog_active __ksym;
+extern __u64 bpf_kfunc_call_test1(struct sock *sk, __u32 a, __u64 b,
+				  __u32 c, __u64 d) __ksym;
+extern struct sock *bpf_kfunc_call_test3(struct sock *sk) __ksym;
+int active_res = -1;
+int sk_state = -1;
+
+int __noinline f1(struct __sk_buff *skb)
+{
+	struct bpf_sock *sk = skb->sk;
+	int *active;
+
+	if (!sk)
+		return -1;
+
+	sk = bpf_sk_fullsock(sk);
+	if (!sk)
+		return -1;
+
+	active = (int *)bpf_per_cpu_ptr(&bpf_prog_active,
+					bpf_get_smp_processor_id());
+	if (active)
+		active_res = *active;
+
+	sk_state = bpf_kfunc_call_test3((struct sock *)sk)->__sk_common.skc_state;
+
+	return (__u32)bpf_kfunc_call_test1((struct sock *)sk, 1, 2, 3, 4);
+}
+
+SEC("classifier")
+int kfunc_call_test1(struct __sk_buff *skb)
+{
+	return f1(skb);
+}
+
+char _license[] SEC("license") = "GPL";

From 7e32a09fdcb38d97f148f72bf78a3b49e8d1612d Mon Sep 17 00:00:00 2001
From: Atul Gopinathan <atulgopinathan@gmail.com>
Date: Sun, 28 Mar 2021 17:35:15 +0530
Subject: [PATCH 26/68] bpf: tcp: Remove comma which is causing build error

Currently, building the bpf-next source with the CONFIG_BPF_SYSCALL
enabled is causing a compilation error:

"net/ipv4/bpf_tcp_ca.c:209:28: error: expected identifier or '(' before
',' token"

Fix this by removing an unnecessary comma.

Fixes: e78aea8b2170 ("bpf: tcp: Put some tcp cong functions in allowlist for bpf-tcp-cc")
Reported-by: syzbot+0b74d8ec3bf0cc4e4209@syzkaller.appspotmail.com
Signed-off-by: Atul Gopinathan <atulgopinathan@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20210328120515.113895-1-atulgopinathan@gmail.com
---
 net/ipv4/bpf_tcp_ca.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
index 40520b77a307..12777d444d0f 100644
--- a/net/ipv4/bpf_tcp_ca.c
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -206,7 +206,7 @@ BTF_ID(func, bbr_init)
 BTF_ID(func, bbr_main)
 BTF_ID(func, bbr_sndbuf_expand)
 BTF_ID(func, bbr_undo_cwnd)
-BTF_ID(func, bbr_cwnd_even),
+BTF_ID(func, bbr_cwnd_even)
 BTF_ID(func, bbr_ssthresh)
 BTF_ID(func, bbr_min_tso_segs)
 BTF_ID(func, bbr_set_state)

From 21cfd2db9f51c0454d44a103ff12398c2236d3a8 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Sun, 28 Mar 2021 17:32:13 -0700
Subject: [PATCH 27/68] bpf: tcp: Fix an error in the bpf_tcp_ca_kfunc_ids list

There is a typo in the bbr function, s/even/event/.
This patch fixes it.

Fixes: e78aea8b2170 ("bpf: tcp: Put some tcp cong functions in allowlist for bpf-tcp-cc")
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20210329003213.2274210-1-kafai@fb.com
---
 net/ipv4/bpf_tcp_ca.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
index 12777d444d0f..6bb7b335ff9f 100644
--- a/net/ipv4/bpf_tcp_ca.c
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -206,7 +206,7 @@ BTF_ID(func, bbr_init)
 BTF_ID(func, bbr_main)
 BTF_ID(func, bbr_sndbuf_expand)
 BTF_ID(func, bbr_undo_cwnd)
-BTF_ID(func, bbr_cwnd_even)
+BTF_ID(func, bbr_cwnd_event)
 BTF_ID(func, bbr_ssthresh)
 BTF_ID(func, bbr_min_tso_segs)
 BTF_ID(func, bbr_set_state)

From 7aae231ac93b9d9c45487dcafd844fa756069f3b Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Mon, 29 Mar 2021 15:13:57 -0700
Subject: [PATCH 28/68] bpf: tcp: Limit calling some tcp cc functions to
 CONFIG_DYNAMIC_FTRACE

pahole currently only generates the btf_id for external function and
ftrace-able function.  Some functions in the bpf_tcp_ca_kfunc_ids
are static (e.g. cubictcp_init).  Thus, unless CONFIG_DYNAMIC_FTRACE
is set, btf_ids for those functions will not be generated and the
compilation fails during resolve_btfids.

This patch limits those functions to CONFIG_DYNAMIC_FTRACE.  I will
address the pahole generation in a followup and then remove the
CONFIG_DYNAMIC_FTRACE limitation.

Fixes: e78aea8b2170 ("bpf: tcp: Put some tcp cong functions in allowlist for bpf-tcp-cc")
Reported-by: Cong Wang <xiyou.wangcong@gmail.com>
Reported-by: Lorenz Bauer <lmb@cloudflare.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20210329221357.834438-1-kafai@fb.com
---
 net/ipv4/bpf_tcp_ca.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
index 6bb7b335ff9f..dff4f0eb96b0 100644
--- a/net/ipv4/bpf_tcp_ca.c
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -185,6 +185,7 @@ BTF_ID(func, tcp_reno_cong_avoid)
 BTF_ID(func, tcp_reno_undo_cwnd)
 BTF_ID(func, tcp_slow_start)
 BTF_ID(func, tcp_cong_avoid_ai)
+#ifdef CONFIG_DYNAMIC_FTRACE
 #if IS_BUILTIN(CONFIG_TCP_CONG_CUBIC)
 BTF_ID(func, cubictcp_init)
 BTF_ID(func, cubictcp_recalc_ssthresh)
@@ -211,6 +212,7 @@ BTF_ID(func, bbr_ssthresh)
 BTF_ID(func, bbr_min_tso_segs)
 BTF_ID(func, bbr_set_state)
 #endif
+#endif  /* CONFIG_DYNAMIC_FTRACE */
 BTF_SET_END(bpf_tcp_ca_kfunc_ids)
 
 static bool bpf_tcp_ca_check_kfunc_call(u32 kfunc_btf_id)

From 5bdca94ff30d99168c3c09394da664c3c37c6834 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Mon, 29 Mar 2021 22:41:50 -0700
Subject: [PATCH 29/68] bpf: Update bpf_design_QA.rst to clarify the kfunc call
 is not ABI

This patch updates bpf_design_QA.rst to clarify that the kernel
function callable by bpf program is not an ABI.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Song Liu <songliubraving@fb.com>
Link: https://lore.kernel.org/bpf/20210330054150.2933542-1-kafai@fb.com
---
 Documentation/bpf/bpf_design_QA.rst | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/Documentation/bpf/bpf_design_QA.rst b/Documentation/bpf/bpf_design_QA.rst
index 0e15f9b05c9d..437de2a7a5de 100644
--- a/Documentation/bpf/bpf_design_QA.rst
+++ b/Documentation/bpf/bpf_design_QA.rst
@@ -258,3 +258,18 @@ Q: Can BPF functionality such as new program or map types, new
 helpers, etc be added out of kernel module code?
 
 A: NO.
+
+Q: Directly calling kernel function is an ABI?
+----------------------------------------------
+Q: Some kernel functions (e.g. tcp_slow_start) can be called
+by BPF programs.  Do these kernel functions become an ABI?
+
+A: NO.
+
+The kernel function protos will change and the bpf programs will be
+rejected by the verifier.  Also, for example, some of the bpf-callable
+kernel functions have already been used by other kernel tcp
+cc (congestion-control) implementations.  If any of these kernel
+functions has changed, both the in-tree and out-of-tree kernel tcp cc
+implementations have to be changed.  The same goes for the bpf
+programs and they have to be adjusted accordingly.

From 2ba4badca9977b64c966b0177920daadbd5501fe Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Mon, 29 Mar 2021 22:41:56 -0700
Subject: [PATCH 30/68] bpf: selftests: Update clang requirement in README.rst
 for testing kfunc call

This patch updates the README.rst to specify the clang requirement
to compile the bpf selftests that call kernel function.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Song Liu <songliubraving@fb.com>
Link: https://lore.kernel.org/bpf/20210330054156.2933804-1-kafai@fb.com
---
 tools/testing/selftests/bpf/README.rst | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/tools/testing/selftests/bpf/README.rst b/tools/testing/selftests/bpf/README.rst
index 3464161c8eea..65fe318d1e71 100644
--- a/tools/testing/selftests/bpf/README.rst
+++ b/tools/testing/selftests/bpf/README.rst
@@ -179,3 +179,17 @@ types, which was introduced in `Clang 13`__. The older Clang versions will
 either crash when compiling these tests, or generate an incorrect BTF.
 
 __  https://reviews.llvm.org/D83289
+
+Kernel function call test and Clang version
+===========================================
+
+Some selftests (e.g. kfunc_call and bpf_tcp_ca) require a LLVM support
+to generate extern function in BTF.  It was introduced in `Clang 13`__.
+
+Without it, the error from compiling bpf selftests looks like:
+
+.. code-block:: console
+
+  libbpf: failed to find BTF for extern 'tcp_slow_start' [25] section: -2
+
+__ https://reviews.llvm.org/D93563

From 05d817031ff9686a8206039b19e37616cf9e1d44 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Fri, 26 Mar 2021 21:25:02 -0700
Subject: [PATCH 31/68] libbpf: Fix memory leak when emitting final btf_ext

Free temporary allocated memory used to construct finalized .BTF.ext data.
Found by Coverity static analysis on libbpf's Github repo.

Fixes: 8fd27bf69b86 ("libbpf: Add BPF static linker BTF and BTF.ext support")
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Song Liu <songliubraving@fb.com>
Link: https://lore.kernel.org/bpf/20210327042502.969745-1-andrii@kernel.org
---
 tools/lib/bpf/linker.c | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/tools/lib/bpf/linker.c b/tools/lib/bpf/linker.c
index a29d62ff8041..46b16cbdcda3 100644
--- a/tools/lib/bpf/linker.c
+++ b/tools/lib/bpf/linker.c
@@ -1906,8 +1906,10 @@ static int finalize_btf_ext(struct bpf_linker *linker)
 			struct dst_sec *sec = &linker->secs[i];
 
 			sz = emit_btf_ext_data(linker, cur, sec->sec_name, &sec->func_info);
-			if (sz < 0)
-				return sz;
+			if (sz < 0) {
+				err = sz;
+				goto out;
+			}
 
 			cur += sz;
 		}
@@ -1921,8 +1923,10 @@ static int finalize_btf_ext(struct bpf_linker *linker)
 			struct dst_sec *sec = &linker->secs[i];
 
 			sz = emit_btf_ext_data(linker, cur, sec->sec_name, &sec->line_info);
-			if (sz < 0)
-				return sz;
+			if (sz < 0) {
+				err = sz;
+				goto out;
+			}
 
 			cur += sz;
 		}
@@ -1936,8 +1940,10 @@ static int finalize_btf_ext(struct bpf_linker *linker)
 			struct dst_sec *sec = &linker->secs[i];
 
 			sz = emit_btf_ext_data(linker, cur, sec->sec_name, &sec->core_relo_info);
-			if (sz < 0)
-				return sz;
+			if (sz < 0) {
+				err = sz;
+				goto out;
+			}
 
 			cur += sz;
 		}
@@ -1948,8 +1954,10 @@ static int finalize_btf_ext(struct bpf_linker *linker)
 	if (err) {
 		linker->btf_ext = NULL;
 		pr_warn("failed to parse final .BTF.ext data: %d\n", err);
-		return err;
+		goto out;
 	}
 
-	return 0;
+out:
+	free(data);
+	return err;
 }

From 9f33df73a929ed91dddad036f518d690b3094eda Mon Sep 17 00:00:00 2001
From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Date: Tue, 30 Mar 2021 00:43:00 +0200
Subject: [PATCH 32/68] selftests: xsk: Don't call worker_pkt_dump() for stats
 test

For TEST_TYPE_STATS, worker_pkt_validate() that places frames onto
pkt_buf is not called. Therefore, when dump mode is set, don't call
worker_pkt_dump() for mentioned test type, so that it won't crash on
pkt_buf() access.

Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20210329224316.17793-2-maciej.fijalkowski@intel.com
---
 tools/testing/selftests/bpf/xdpxceiver.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c
index 1e21a3172687..09429ed2ddf6 100644
--- a/tools/testing/selftests/bpf/xdpxceiver.c
+++ b/tools/testing/selftests/bpf/xdpxceiver.c
@@ -999,7 +999,7 @@ static void testapp_validate(void)
 	pthread_join(t1, NULL);
 	pthread_join(t0, NULL);
 
-	if (debug_pkt_dump) {
+	if (debug_pkt_dump && test_type != TEST_TYPE_STATS) {
 		worker_pkt_dump();
 		for (int iter = 0; iter < num_frames - 1; iter++) {
 			free(pkt_buf[iter]->payload);

From e623bfdef713ddda1b9f57d1759df3a1cd97255a Mon Sep 17 00:00:00 2001
From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Date: Tue, 30 Mar 2021 00:43:01 +0200
Subject: [PATCH 33/68] selftests: xsk: Remove struct ifaceconfigobj

ifaceconfigobj is not really useful, it is possible to keep the
functionality and simplify the code.

Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20210329224316.17793-3-maciej.fijalkowski@intel.com
---
 tools/testing/selftests/bpf/xdpxceiver.c | 63 +++++++++++-------------
 tools/testing/selftests/bpf/xdpxceiver.h |  9 ----
 2 files changed, 29 insertions(+), 43 deletions(-)

diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c
index 09429ed2ddf6..e2ff3cd48ccf 100644
--- a/tools/testing/selftests/bpf/xdpxceiver.c
+++ b/tools/testing/selftests/bpf/xdpxceiver.c
@@ -93,6 +93,13 @@ typedef __u16 __sum16;
 #include "xdpxceiver.h"
 #include "../kselftest.h"
 
+static const char *MAC1 = "\x00\x0A\x56\x9E\xEE\x62";
+static const char *MAC2 = "\x00\x0A\x56\x9E\xEE\x61";
+static const char *IP1 = "192.168.100.162";
+static const char *IP2 = "192.168.100.161";
+static const u16 UDP_PORT1 = 2020;
+static const u16 UDP_PORT2 = 2121;
+
 static void __exit_with_error(int error, const char *file, const char *func, int line)
 {
 	if (configured_mode == TEST_MODE_UNCONFIGURED) {
@@ -1053,25 +1060,24 @@ static void testapp_stats(void)
 	print_ksft_result();
 }
 
-static void init_iface_config(struct ifaceconfigobj *ifaceconfig)
+static void init_iface(struct ifobject *ifobj, const char *dst_mac,
+		       const char *src_mac, const char *dst_ip,
+		       const char *src_ip, const u16 dst_port,
+		       const u16 src_port)
 {
-	/*Init interface0 */
-	ifdict[0]->fv.vector = tx;
-	memcpy(ifdict[0]->dst_mac, ifaceconfig->dst_mac, ETH_ALEN);
-	memcpy(ifdict[0]->src_mac, ifaceconfig->src_mac, ETH_ALEN);
-	ifdict[0]->dst_ip = ifaceconfig->dst_ip.s_addr;
-	ifdict[0]->src_ip = ifaceconfig->src_ip.s_addr;
-	ifdict[0]->dst_port = ifaceconfig->dst_port;
-	ifdict[0]->src_port = ifaceconfig->src_port;
+	struct in_addr ip;
 
-	/*Init interface1 */
-	ifdict[1]->fv.vector = rx;
-	memcpy(ifdict[1]->dst_mac, ifaceconfig->src_mac, ETH_ALEN);
-	memcpy(ifdict[1]->src_mac, ifaceconfig->dst_mac, ETH_ALEN);
-	ifdict[1]->dst_ip = ifaceconfig->src_ip.s_addr;
-	ifdict[1]->src_ip = ifaceconfig->dst_ip.s_addr;
-	ifdict[1]->dst_port = ifaceconfig->src_port;
-	ifdict[1]->src_port = ifaceconfig->dst_port;
+	memcpy(ifobj->dst_mac, dst_mac, ETH_ALEN);
+	memcpy(ifobj->src_mac, src_mac, ETH_ALEN);
+
+	inet_aton(dst_ip, &ip);
+	ifobj->dst_ip = ip.s_addr;
+
+	inet_aton(src_ip, &ip);
+	ifobj->src_ip = ip.s_addr;
+
+	ifobj->dst_port = dst_port;
+	ifobj->src_port = src_port;
 }
 
 static void *nsdisablemodethread(void *args)
@@ -1175,26 +1181,11 @@ static void run_pkt_test(int mode, int type)
 int main(int argc, char **argv)
 {
 	struct rlimit _rlim = { RLIM_INFINITY, RLIM_INFINITY };
+	int i, j;
 
 	if (setrlimit(RLIMIT_MEMLOCK, &_rlim))
 		exit_with_error(errno);
 
-	const char *MAC1 = "\x00\x0A\x56\x9E\xEE\x62";
-	const char *MAC2 = "\x00\x0A\x56\x9E\xEE\x61";
-	const char *IP1 = "192.168.100.162";
-	const char *IP2 = "192.168.100.161";
-	u16 UDP_DST_PORT = 2020;
-	u16 UDP_SRC_PORT = 2121;
-	int i, j;
-
-	ifaceconfig = malloc(sizeof(struct ifaceconfigobj));
-	memcpy(ifaceconfig->dst_mac, MAC1, ETH_ALEN);
-	memcpy(ifaceconfig->src_mac, MAC2, ETH_ALEN);
-	inet_aton(IP1, &ifaceconfig->dst_ip);
-	inet_aton(IP2, &ifaceconfig->src_ip);
-	ifaceconfig->dst_port = UDP_DST_PORT;
-	ifaceconfig->src_port = UDP_SRC_PORT;
-
 	for (int i = 0; i < MAX_INTERFACES; i++) {
 		ifdict[i] = malloc(sizeof(struct ifobject));
 		if (!ifdict[i])
@@ -1209,7 +1200,11 @@ int main(int argc, char **argv)
 
 	num_frames = ++opt_pkt_count;
 
-	init_iface_config(ifaceconfig);
+	ifdict[0]->fv.vector = tx;
+	init_iface(ifdict[0], MAC1, MAC2, IP1, IP2, UDP_PORT1, UDP_PORT2);
+
+	ifdict[1]->fv.vector = rx;
+	init_iface(ifdict[1], MAC2, MAC1, IP2, IP1, UDP_PORT2, UDP_PORT1);
 
 	disable_xdp_mode(XDP_FLAGS_DRV_MODE);
 
diff --git a/tools/testing/selftests/bpf/xdpxceiver.h b/tools/testing/selftests/bpf/xdpxceiver.h
index 30314ef305c2..8f9308099318 100644
--- a/tools/testing/selftests/bpf/xdpxceiver.h
+++ b/tools/testing/selftests/bpf/xdpxceiver.h
@@ -125,15 +125,6 @@ struct generic_data {
 	u32 seqnum;
 };
 
-struct ifaceconfigobj {
-	u8 dst_mac[ETH_ALEN];
-	u8 src_mac[ETH_ALEN];
-	struct in_addr dst_ip;
-	struct in_addr src_ip;
-	u16 src_port;
-	u16 dst_port;
-} *ifaceconfig;
-
 struct ifobject {
 	int ifindex;
 	int ifdict_index;

From 7519c387e69d367075bf493de8a9ea427c9d2a1b Mon Sep 17 00:00:00 2001
From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Date: Tue, 30 Mar 2021 00:43:02 +0200
Subject: [PATCH 34/68] selftests: xsk: Remove unused function

Probably it was ported from xdpsock but is not used anywhere.

Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20210329224316.17793-4-maciej.fijalkowski@intel.com
---
 tools/testing/selftests/bpf/xdpxceiver.c | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c
index e2ff3cd48ccf..593e8be41fce 100644
--- a/tools/testing/selftests/bpf/xdpxceiver.c
+++ b/tools/testing/selftests/bpf/xdpxceiver.c
@@ -153,19 +153,6 @@ static void *memset32_htonl(void *dest, u32 val, u32 size)
 	return dest;
 }
 
-/*
- * This function code has been taken from
- * Linux kernel lib/checksum.c
- */
-static inline unsigned short from32to16(unsigned int x)
-{
-	/* add up 16-bit and 16-bit for 16+c bit */
-	x = (x & 0xffff) + (x >> 16);
-	/* add up carry.. */
-	x = (x & 0xffff) + (x >> 16);
-	return x;
-}
-
 /*
  * Fold a partial checksum
  * This function code has been taken from

From 965d2cb0f675e5f60af441c9bd430c284d66179d Mon Sep 17 00:00:00 2001
From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Date: Tue, 30 Mar 2021 00:43:03 +0200
Subject: [PATCH 35/68] selftests: xsk: Remove inline keyword from source file

Follow the kernel coding style guidelines and let compiler do the
decision about inlining.

Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20210329224316.17793-5-maciej.fijalkowski@intel.com
---
 tools/testing/selftests/bpf/xdpxceiver.c | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c
index 593e8be41fce..77880abd818c 100644
--- a/tools/testing/selftests/bpf/xdpxceiver.c
+++ b/tools/testing/selftests/bpf/xdpxceiver.c
@@ -158,7 +158,7 @@ static void *memset32_htonl(void *dest, u32 val, u32 size)
  * This function code has been taken from
  * Linux kernel include/asm-generic/checksum.h
  */
-static inline __u16 csum_fold(__u32 csum)
+static __u16 csum_fold(__u32 csum)
 {
 	u32 sum = (__force u32)csum;
 
@@ -171,7 +171,7 @@ static inline __u16 csum_fold(__u32 csum)
  * This function code has been taken from
  * Linux kernel lib/checksum.c
  */
-static inline u32 from64to32(u64 x)
+static u32 from64to32(u64 x)
 {
 	/* add up 32-bit and 32-bit for 32+c bit */
 	x = (x & 0xffffffff) + (x >> 32);
@@ -180,13 +180,11 @@ static inline u32 from64to32(u64 x)
 	return (u32)x;
 }
 
-__u32 csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __u32 sum);
-
 /*
  * This function code has been taken from
  * Linux kernel lib/checksum.c
  */
-__u32 csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __u32 sum)
+static __u32 csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __u32 sum)
 {
 	unsigned long long s = (__force u32)sum;
 
@@ -204,13 +202,12 @@ __u32 csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __u3
  * This function has been taken from
  * Linux kernel include/asm-generic/checksum.h
  */
-static inline __u16
-csum_tcpudp_magic(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __u32 sum)
+static __u16 csum_tcpudp_magic(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __u32 sum)
 {
 	return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum));
 }
 
-static inline u16 udp_csum(u32 saddr, u32 daddr, u32 len, u8 proto, u16 *udp_pkt)
+static u16 udp_csum(u32 saddr, u32 daddr, u32 len, u8 proto, u16 *udp_pkt)
 {
 	u32 csum = 0;
 	u32 cnt = 0;
@@ -500,7 +497,7 @@ static void kick_tx(struct xsk_socket_info *xsk)
 	exit_with_error(errno);
 }
 
-static inline void complete_tx_only(struct xsk_socket_info *xsk, int batch_size)
+static void complete_tx_only(struct xsk_socket_info *xsk, int batch_size)
 {
 	unsigned int rcvd;
 	u32 idx;
@@ -605,7 +602,7 @@ static void tx_only(struct xsk_socket_info *xsk, u32 *frameptr, int batch_size)
 	complete_tx_only(xsk, batch_size);
 }
 
-static inline int get_batch_size(int pkt_cnt)
+static int get_batch_size(int pkt_cnt)
 {
 	if (!opt_pkt_count)
 		return BATCH_SIZE;

From aa2d61c154f9387883664e3873bc0700419640a3 Mon Sep 17 00:00:00 2001
From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Date: Tue, 30 Mar 2021 00:43:04 +0200
Subject: [PATCH 36/68] selftests: xsk: Simplify frame traversal in dumping
 thread
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Store offsets to each layer in a separate variables rather than compute
them every single time.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20210329224316.17793-6-maciej.fijalkowski@intel.com
---
 tools/testing/selftests/bpf/xdpxceiver.c | 47 +++++++++++-------------
 1 file changed, 21 insertions(+), 26 deletions(-)

diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c
index 77880abd818c..ae2f7d71f041 100644
--- a/tools/testing/selftests/bpf/xdpxceiver.c
+++ b/tools/testing/selftests/bpf/xdpxceiver.c
@@ -658,45 +658,40 @@ static void tx_only_all(struct ifobject *ifobject)
 
 static void worker_pkt_dump(void)
 {
-	struct in_addr ipaddr;
+	struct ethhdr *ethhdr;
+	struct iphdr *iphdr;
+	struct udphdr *udphdr;
+	char s[128];
+	int payload;
+	void *ptr;
 
 	fprintf(stdout, "---------------------------------------\n");
 	for (int iter = 0; iter < num_frames - 1; iter++) {
+		ptr = pkt_buf[iter]->payload;
+		ethhdr = ptr;
+		iphdr = ptr + sizeof(*ethhdr);
+		udphdr = ptr + sizeof(*ethhdr) + sizeof(*iphdr);
+
 		/*extract L2 frame */
 		fprintf(stdout, "DEBUG>> L2: dst mac: ");
 		for (int i = 0; i < ETH_ALEN; i++)
-			fprintf(stdout, "%02X", ((struct ethhdr *)
-						 pkt_buf[iter]->payload)->h_dest[i]);
+			fprintf(stdout, "%02X", ethhdr->h_dest[i]);
 
 		fprintf(stdout, "\nDEBUG>> L2: src mac: ");
 		for (int i = 0; i < ETH_ALEN; i++)
-			fprintf(stdout, "%02X", ((struct ethhdr *)
-						 pkt_buf[iter]->payload)->h_source[i]);
+			fprintf(stdout, "%02X", ethhdr->h_source[i]);
 
 		/*extract L3 frame */
-		fprintf(stdout, "\nDEBUG>> L3: ip_hdr->ihl: %02X\n",
-			((struct iphdr *)(pkt_buf[iter]->payload + sizeof(struct ethhdr)))->ihl);
-
-		ipaddr.s_addr =
-		    ((struct iphdr *)(pkt_buf[iter]->payload + sizeof(struct ethhdr)))->saddr;
-		fprintf(stdout, "DEBUG>> L3: ip_hdr->saddr: %s\n", inet_ntoa(ipaddr));
-
-		ipaddr.s_addr =
-		    ((struct iphdr *)(pkt_buf[iter]->payload + sizeof(struct ethhdr)))->daddr;
-		fprintf(stdout, "DEBUG>> L3: ip_hdr->daddr: %s\n", inet_ntoa(ipaddr));
-
+		fprintf(stdout, "\nDEBUG>> L3: ip_hdr->ihl: %02X\n", iphdr->ihl);
+		fprintf(stdout, "DEBUG>> L3: ip_hdr->saddr: %s\n",
+			inet_ntop(AF_INET, &iphdr->saddr, s, sizeof(s)));
+		fprintf(stdout, "DEBUG>> L3: ip_hdr->daddr: %s\n",
+			inet_ntop(AF_INET, &iphdr->daddr, s, sizeof(s)));
 		/*extract L4 frame */
-		fprintf(stdout, "DEBUG>> L4: udp_hdr->src: %d\n",
-			ntohs(((struct udphdr *)(pkt_buf[iter]->payload +
-						 sizeof(struct ethhdr) +
-						 sizeof(struct iphdr)))->source));
-
-		fprintf(stdout, "DEBUG>> L4: udp_hdr->dst: %d\n",
-			ntohs(((struct udphdr *)(pkt_buf[iter]->payload +
-						 sizeof(struct ethhdr) +
-						 sizeof(struct iphdr)))->dest));
+		fprintf(stdout, "DEBUG>> L4: udp_hdr->src: %d\n", ntohs(udphdr->source));
+		fprintf(stdout, "DEBUG>> L4: udp_hdr->dst: %d\n", ntohs(udphdr->dest));
 		/*extract L5 frame */
-		int payload = *((uint32_t *)(pkt_buf[iter]->payload + PKT_HDR_SIZE));
+		payload = *((uint32_t *)(ptr + PKT_HDR_SIZE));
 
 		if (payload == EOT) {
 			print_verbose("End-of-transmission frame received\n");

From 10397994d30f2de51bfd9321ed9ddb789464f572 Mon Sep 17 00:00:00 2001
From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Date: Tue, 30 Mar 2021 00:43:05 +0200
Subject: [PATCH 37/68] libbpf: xsk: Use bpf_link

Currently, if there are multiple xdpsock instances running on a single
interface and in case one of the instances is terminated, the rest of
them are left in an inoperable state due to the fact of unloaded XDP
prog from interface.

Consider the scenario below:

// load xdp prog and xskmap and add entry to xskmap at idx 10
$ sudo ./xdpsock -i ens801f0 -t -q 10

// add entry to xskmap at idx 11
$ sudo ./xdpsock -i ens801f0 -t -q 11

terminate one of the processes and another one is unable to work due to
the fact that the XDP prog was unloaded from interface.

To address that, step away from setting bpf prog in favour of bpf_link.
This means that refcounting of BPF resources will be done automatically
by bpf_link itself.

Provide backward compatibility by checking if underlying system is
bpf_link capable. Do this by looking up/creating bpf_link on loopback
device. If it failed in any way, stick with netlink-based XDP prog.
therwise, use bpf_link-based logic.

When setting up BPF resources during xsk socket creation, check whether
bpf_link for a given ifindex already exists via set of calls to
bpf_link_get_next_id -> bpf_link_get_fd_by_id -> bpf_obj_get_info_by_fd
and comparing the ifindexes from bpf_link and xsk socket.

For case where resources exist but they are not AF_XDP related, bail out
and ask user to remove existing prog and then retry.

Lastly, do a bit of refactoring within __xsk_setup_xdp_prog and pull out
existing code branches based on prog_id value onto separate functions
that are responsible for resource initialization if prog_id was 0 and
for lookup existing resources for non-zero prog_id as that implies that
XDP program is present on the underlying net device. This in turn makes
it easier to follow, especially the teardown part of both branches.

Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20210329224316.17793-7-maciej.fijalkowski@intel.com
---
 tools/lib/bpf/xsk.c | 266 ++++++++++++++++++++++++++++++++++++--------
 1 file changed, 217 insertions(+), 49 deletions(-)

diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c
index 526fc35c0b23..95da0e19f4a5 100644
--- a/tools/lib/bpf/xsk.c
+++ b/tools/lib/bpf/xsk.c
@@ -28,6 +28,7 @@
 #include <sys/mman.h>
 #include <sys/socket.h>
 #include <sys/types.h>
+#include <linux/if_link.h>
 
 #include "bpf.h"
 #include "libbpf.h"
@@ -70,8 +71,10 @@ struct xsk_ctx {
 	int ifindex;
 	struct list_head list;
 	int prog_fd;
+	int link_fd;
 	int xsks_map_fd;
 	char ifname[IFNAMSIZ];
+	bool has_bpf_link;
 };
 
 struct xsk_socket {
@@ -409,7 +412,7 @@ static int xsk_load_xdp_prog(struct xsk_socket *xsk)
 	static const int log_buf_size = 16 * 1024;
 	struct xsk_ctx *ctx = xsk->ctx;
 	char log_buf[log_buf_size];
-	int err, prog_fd;
+	int prog_fd;
 
 	/* This is the fallback C-program:
 	 * SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx)
@@ -499,14 +502,41 @@ static int xsk_load_xdp_prog(struct xsk_socket *xsk)
 		return prog_fd;
 	}
 
-	err = bpf_set_link_xdp_fd(xsk->ctx->ifindex, prog_fd,
-				  xsk->config.xdp_flags);
+	ctx->prog_fd = prog_fd;
+	return 0;
+}
+
+static int xsk_create_bpf_link(struct xsk_socket *xsk)
+{
+	DECLARE_LIBBPF_OPTS(bpf_link_create_opts, opts);
+	struct xsk_ctx *ctx = xsk->ctx;
+	__u32 prog_id = 0;
+	int link_fd;
+	int err;
+
+	err = bpf_get_link_xdp_id(ctx->ifindex, &prog_id, xsk->config.xdp_flags);
 	if (err) {
-		close(prog_fd);
+		pr_warn("getting XDP prog id failed\n");
 		return err;
 	}
 
-	ctx->prog_fd = prog_fd;
+	/* if there's a netlink-based XDP prog loaded on interface, bail out
+	 * and ask user to do the removal by himself
+	 */
+	if (prog_id) {
+		pr_warn("Netlink-based XDP prog detected, please unload it in order to launch AF_XDP prog\n");
+		return -EINVAL;
+	}
+
+	opts.flags = xsk->config.xdp_flags & ~(XDP_FLAGS_UPDATE_IF_NOEXIST | XDP_FLAGS_REPLACE);
+
+	link_fd = bpf_link_create(ctx->prog_fd, ctx->ifindex, BPF_XDP, &opts);
+	if (link_fd < 0) {
+		pr_warn("bpf_link_create failed: %s\n", strerror(errno));
+		return link_fd;
+	}
+
+	ctx->link_fd = link_fd;
 	return 0;
 }
 
@@ -625,7 +655,6 @@ static int xsk_lookup_bpf_maps(struct xsk_socket *xsk)
 		close(fd);
 	}
 
-	err = 0;
 	if (ctx->xsks_map_fd == -1)
 		err = -ENOENT;
 
@@ -642,6 +671,98 @@ static int xsk_set_bpf_maps(struct xsk_socket *xsk)
 				   &xsk->fd, 0);
 }
 
+static int xsk_link_lookup(int ifindex, __u32 *prog_id, int *link_fd)
+{
+	struct bpf_link_info link_info;
+	__u32 link_len;
+	__u32 id = 0;
+	int err;
+	int fd;
+
+	while (true) {
+		err = bpf_link_get_next_id(id, &id);
+		if (err) {
+			if (errno == ENOENT) {
+				err = 0;
+				break;
+			}
+			pr_warn("can't get next link: %s\n", strerror(errno));
+			break;
+		}
+
+		fd = bpf_link_get_fd_by_id(id);
+		if (fd < 0) {
+			if (errno == ENOENT)
+				continue;
+			pr_warn("can't get link by id (%u): %s\n", id, strerror(errno));
+			err = -errno;
+			break;
+		}
+
+		link_len = sizeof(struct bpf_link_info);
+		memset(&link_info, 0, link_len);
+		err = bpf_obj_get_info_by_fd(fd, &link_info, &link_len);
+		if (err) {
+			pr_warn("can't get link info: %s\n", strerror(errno));
+			close(fd);
+			break;
+		}
+		if (link_info.type == BPF_LINK_TYPE_XDP) {
+			if (link_info.xdp.ifindex == ifindex) {
+				*link_fd = fd;
+				if (prog_id)
+					*prog_id = link_info.prog_id;
+				break;
+			}
+		}
+		close(fd);
+	}
+
+	return err;
+}
+
+static bool xsk_probe_bpf_link(void)
+{
+	DECLARE_LIBBPF_OPTS(bpf_link_create_opts, opts,
+			    .flags = XDP_FLAGS_SKB_MODE);
+	struct bpf_load_program_attr prog_attr;
+	struct bpf_insn insns[2] = {
+		BPF_MOV64_IMM(BPF_REG_0, XDP_PASS),
+		BPF_EXIT_INSN()
+	};
+	int prog_fd, link_fd = -1;
+	int ifindex_lo = 1;
+	bool ret = false;
+	int err;
+
+	err = xsk_link_lookup(ifindex_lo, NULL, &link_fd);
+	if (err)
+		return ret;
+
+	if (link_fd >= 0)
+		return true;
+
+	memset(&prog_attr, 0, sizeof(prog_attr));
+	prog_attr.prog_type = BPF_PROG_TYPE_XDP;
+	prog_attr.insns = insns;
+	prog_attr.insns_cnt = ARRAY_SIZE(insns);
+	prog_attr.license = "GPL";
+
+	prog_fd = bpf_load_program_xattr(&prog_attr, NULL, 0);
+	if (prog_fd < 0)
+		return ret;
+
+	link_fd = bpf_link_create(prog_fd, ifindex_lo, BPF_XDP, &opts);
+	close(prog_fd);
+
+	if (link_fd >= 0) {
+		ret = true;
+		close(link_fd);
+	}
+
+	return ret;
+}
+
 static int xsk_create_xsk_struct(int ifindex, struct xsk_socket *xsk)
 {
 	char ifname[IFNAMSIZ];
@@ -663,65 +784,109 @@ static int xsk_create_xsk_struct(int ifindex, struct xsk_socket *xsk)
 	ctx->ifname[IFNAMSIZ - 1] = 0;
 
 	xsk->ctx = ctx;
+	xsk->ctx->has_bpf_link = xsk_probe_bpf_link();
 
 	return 0;
 }
 
-static int __xsk_setup_xdp_prog(struct xsk_socket *_xdp,
-				int *xsks_map_fd)
+static int xsk_init_xdp_res(struct xsk_socket *xsk,
+			    int *xsks_map_fd)
+{
+	struct xsk_ctx *ctx = xsk->ctx;
+	int err;
+
+	err = xsk_create_bpf_maps(xsk);
+	if (err)
+		return err;
+
+	err = xsk_load_xdp_prog(xsk);
+	if (err)
+		goto err_load_xdp_prog;
+
+	if (ctx->has_bpf_link)
+		err = xsk_create_bpf_link(xsk);
+	else
+		err = bpf_set_link_xdp_fd(xsk->ctx->ifindex, ctx->prog_fd,
+					  xsk->config.xdp_flags);
+
+	if (err)
+		goto err_attach_xdp_prog;
+
+	if (!xsk->rx)
+		return err;
+
+	err = xsk_set_bpf_maps(xsk);
+	if (err)
+		goto err_set_bpf_maps;
+
+	return err;
+
+err_set_bpf_maps:
+	if (ctx->has_bpf_link)
+		close(ctx->link_fd);
+	else
+		bpf_set_link_xdp_fd(ctx->ifindex, -1, 0);
+err_attach_xdp_prog:
+	close(ctx->prog_fd);
+err_load_xdp_prog:
+	xsk_delete_bpf_maps(xsk);
+	return err;
+}
+
+static int xsk_lookup_xdp_res(struct xsk_socket *xsk, int *xsks_map_fd, int prog_id)
+{
+	struct xsk_ctx *ctx = xsk->ctx;
+	int err;
+
+	ctx->prog_fd = bpf_prog_get_fd_by_id(prog_id);
+	if (ctx->prog_fd < 0) {
+		err = -errno;
+		goto err_prog_fd;
+	}
+	err = xsk_lookup_bpf_maps(xsk);
+	if (err)
+		goto err_lookup_maps;
+
+	if (!xsk->rx)
+		return err;
+
+	err = xsk_set_bpf_maps(xsk);
+	if (err)
+		goto err_set_maps;
+
+	return err;
+
+err_set_maps:
+	close(ctx->xsks_map_fd);
+err_lookup_maps:
+	close(ctx->prog_fd);
+err_prog_fd:
+	if (ctx->has_bpf_link)
+		close(ctx->link_fd);
+	return err;
+}
+
+static int __xsk_setup_xdp_prog(struct xsk_socket *_xdp, int *xsks_map_fd)
 {
 	struct xsk_socket *xsk = _xdp;
 	struct xsk_ctx *ctx = xsk->ctx;
 	__u32 prog_id = 0;
 	int err;
 
-	err = bpf_get_link_xdp_id(ctx->ifindex, &prog_id,
-				  xsk->config.xdp_flags);
+	if (ctx->has_bpf_link)
+		err = xsk_link_lookup(ctx->ifindex, &prog_id, &ctx->link_fd);
+	else
+		err = bpf_get_link_xdp_id(ctx->ifindex, &prog_id, xsk->config.xdp_flags);
+
 	if (err)
 		return err;
 
-	if (!prog_id) {
-		err = xsk_create_bpf_maps(xsk);
-		if (err)
-			return err;
+	err = !prog_id ? xsk_init_xdp_res(xsk, xsks_map_fd) :
+			 xsk_lookup_xdp_res(xsk, xsks_map_fd, prog_id);
 
-		err = xsk_load_xdp_prog(xsk);
-		if (err) {
-			goto err_load_xdp_prog;
-		}
-	} else {
-		ctx->prog_fd = bpf_prog_get_fd_by_id(prog_id);
-		if (ctx->prog_fd < 0)
-			return -errno;
-		err = xsk_lookup_bpf_maps(xsk);
-		if (err) {
-			close(ctx->prog_fd);
-			return err;
-		}
-	}
-
-	if (xsk->rx) {
-		err = xsk_set_bpf_maps(xsk);
-		if (err) {
-			if (!prog_id) {
-				goto err_set_bpf_maps;
-			} else {
-				close(ctx->prog_fd);
-				return err;
-			}
-		}
-	}
-	if (xsks_map_fd)
+	if (!err && xsks_map_fd)
 		*xsks_map_fd = ctx->xsks_map_fd;
 
-	return 0;
-
-err_set_bpf_maps:
-	close(ctx->prog_fd);
-	bpf_set_link_xdp_fd(ctx->ifindex, -1, 0);
-err_load_xdp_prog:
-	xsk_delete_bpf_maps(xsk);
-
 	return err;
 }
 
@@ -898,6 +1063,7 @@ int xsk_socket__create_shared(struct xsk_socket **xsk_ptr,
 		}
 	}
 	xsk->ctx = ctx;
+	xsk->ctx->has_bpf_link = xsk_probe_bpf_link();
 
 	if (rx) {
 		err = setsockopt(xsk->fd, SOL_XDP, XDP_RX_RING,
@@ -1054,6 +1220,8 @@ void xsk_socket__delete(struct xsk_socket *xsk)
 	if (ctx->prog_fd != -1) {
 		xsk_delete_bpf_maps(xsk);
 		close(ctx->prog_fd);
+		if (ctx->has_bpf_link)
+			close(ctx->link_fd);
 	}
 
 	err = xsk_get_mmap_offsets(xsk->fd, &off);

From c9d27c9e8dc7ccced65007a4d5d3640cad42adfc Mon Sep 17 00:00:00 2001
From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Date: Tue, 30 Mar 2021 00:43:06 +0200
Subject: [PATCH 38/68] samples: bpf: Do not unload prog within xdpsock

With the introduction of bpf_link in xsk's libbpf part, there's no
further need for explicit unload of prog on xdpsock's termination. When
process dies, the bpf_link's refcount will be decremented and resources
will be unloaded/freed under the hood in case when there are no more
active users.

While at it, don't dump stats on error path.

Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20210329224316.17793-8-maciej.fijalkowski@intel.com
---
 samples/bpf/xdpsock_user.c | 63 +++++++++++---------------------------
 1 file changed, 18 insertions(+), 45 deletions(-)

diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c
index 1e2a1105d0e6..aa696854be78 100644
--- a/samples/bpf/xdpsock_user.c
+++ b/samples/bpf/xdpsock_user.c
@@ -96,7 +96,6 @@ static int opt_xsk_frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE;
 static int opt_timeout = 1000;
 static bool opt_need_wakeup = true;
 static u32 opt_num_xsks = 1;
-static u32 prog_id;
 static bool opt_busy_poll;
 static bool opt_reduced_cap;
 
@@ -462,59 +461,37 @@ static void *poller(void *arg)
 	return NULL;
 }
 
-static void remove_xdp_program(void)
-{
-	u32 curr_prog_id = 0;
-	int cmd = CLOSE_CONN;
-
-	if (bpf_get_link_xdp_id(opt_ifindex, &curr_prog_id, opt_xdp_flags)) {
-		printf("bpf_get_link_xdp_id failed\n");
-		exit(EXIT_FAILURE);
-	}
-	if (prog_id == curr_prog_id)
-		bpf_set_link_xdp_fd(opt_ifindex, -1, opt_xdp_flags);
-	else if (!curr_prog_id)
-		printf("couldn't find a prog id on a given interface\n");
-	else
-		printf("program on interface changed, not removing\n");
-
-	if (opt_reduced_cap) {
-		if (write(sock, &cmd, sizeof(int)) < 0) {
-			fprintf(stderr, "Error writing into stream socket: %s", strerror(errno));
-			exit(EXIT_FAILURE);
-		}
-	}
-}
-
 static void int_exit(int sig)
 {
 	benchmark_done = true;
 }
 
-static void xdpsock_cleanup(void)
-{
-	struct xsk_umem *umem = xsks[0]->umem->umem;
-	int i;
-
-	dump_stats();
-	for (i = 0; i < num_socks; i++)
-		xsk_socket__delete(xsks[i]->xsk);
-	(void)xsk_umem__delete(umem);
-	remove_xdp_program();
-}
-
 static void __exit_with_error(int error, const char *file, const char *func,
 			      int line)
 {
 	fprintf(stderr, "%s:%s:%i: errno: %d/\"%s\"\n", file, func,
 		line, error, strerror(error));
-	dump_stats();
-	remove_xdp_program();
 	exit(EXIT_FAILURE);
 }
 
-#define exit_with_error(error) __exit_with_error(error, __FILE__, __func__, \
-						 __LINE__)
+#define exit_with_error(error) __exit_with_error(error, __FILE__, __func__, __LINE__)
+
+static void xdpsock_cleanup(void)
+{
+	struct xsk_umem *umem = xsks[0]->umem->umem;
+	int i, cmd = CLOSE_CONN;
+
+	dump_stats();
+	for (i = 0; i < num_socks; i++)
+		xsk_socket__delete(xsks[i]->xsk);
+	(void)xsk_umem__delete(umem);
+
+	if (opt_reduced_cap) {
+		if (write(sock, &cmd, sizeof(int)) < 0)
+			exit_with_error(errno);
+	}
+}
+
 static void swap_mac_addresses(void *data)
 {
 	struct ether_header *eth = (struct ether_header *)data;
@@ -880,10 +857,6 @@ static struct xsk_socket_info *xsk_configure_socket(struct xsk_umem_info *umem,
 	if (ret)
 		exit_with_error(-ret);
 
-	ret = bpf_get_link_xdp_id(opt_ifindex, &prog_id, opt_xdp_flags);
-	if (ret)
-		exit_with_error(-ret);
-
 	xsk->app_stats.rx_empty_polls = 0;
 	xsk->app_stats.fill_fail_polls = 0;
 	xsk->app_stats.copy_tx_sendtos = 0;

From ef9280789773c974b45f809d58b47b481f2cf9f5 Mon Sep 17 00:00:00 2001
From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Date: Tue, 30 Mar 2021 00:43:07 +0200
Subject: [PATCH 39/68] selftests: xsk: Remove thread for netns switch

Currently, there is a dedicated thread for following remote ns operations:
- grabbing the ifindex of the interface moved to remote netns
- removing xdp prog from that interface

With bpf_link usage in place, this can be simply omitted, so remove
mentioned thread, as BPF resources will be managed by bpf_link itself,
so there's no further need for creating the thread that will switch to
remote netns and do the cleanup.

Keep most of the logic for switching the ns, though, but make
switch_namespace() return the fd so that it will be possible to close it
at the process termination time. Get rid of logic around making sure
that it's possible to switch ns in validate_interfaces().

Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20210329224316.17793-9-maciej.fijalkowski@intel.com
---
 tools/testing/selftests/bpf/xdpxceiver.c | 123 +++--------------------
 tools/testing/selftests/bpf/xdpxceiver.h |  10 +-
 2 files changed, 14 insertions(+), 119 deletions(-)

diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c
index ae2f7d71f041..a856b2935d66 100644
--- a/tools/testing/selftests/bpf/xdpxceiver.c
+++ b/tools/testing/selftests/bpf/xdpxceiver.c
@@ -355,12 +355,15 @@ static void usage(const char *prog)
 	ksft_print_msg(str, prog);
 }
 
-static bool switch_namespace(int idx)
+static int switch_namespace(const char *nsname)
 {
 	char fqns[26] = "/var/run/netns/";
 	int nsfd;
 
-	strncat(fqns, ifdict[idx]->nsname, sizeof(fqns) - strlen(fqns) - 1);
+	if (!nsname || strlen(nsname) == 0)
+		return -1;
+
+	strncat(fqns, nsname, sizeof(fqns) - strlen(fqns) - 1);
 	nsfd = open(fqns, O_RDONLY);
 
 	if (nsfd == -1)
@@ -369,26 +372,9 @@ static bool switch_namespace(int idx)
 	if (setns(nsfd, 0) == -1)
 		exit_with_error(errno);
 
-	return true;
-}
+	print_verbose("NS switched: %s\n", nsname);
 
-static void *nsswitchthread(void *args)
-{
-	struct targs *targs = args;
-
-	targs->retptr = false;
-
-	if (switch_namespace(targs->idx)) {
-		ifdict[targs->idx]->ifindex = if_nametoindex(ifdict[targs->idx]->ifname);
-		if (!ifdict[targs->idx]->ifindex) {
-			ksft_test_result_fail("ERROR: [%s] interface \"%s\" does not exist\n",
-					      __func__, ifdict[targs->idx]->ifname);
-		} else {
-			print_verbose("Interface found: %s\n", ifdict[targs->idx]->ifname);
-			targs->retptr = true;
-		}
-	}
-	pthread_exit(NULL);
+	return nsfd;
 }
 
 static int validate_interfaces(void)
@@ -400,33 +386,6 @@ static int validate_interfaces(void)
 			ret = false;
 			ksft_test_result_fail("ERROR: interfaces: -i <int>,<ns> -i <int>,<ns>.");
 		}
-		if (strcmp(ifdict[i]->nsname, "")) {
-			struct targs *targs;
-
-			targs = malloc(sizeof(*targs));
-			if (!targs)
-				exit_with_error(errno);
-
-			targs->idx = i;
-			if (pthread_create(&ns_thread, NULL, nsswitchthread, targs))
-				exit_with_error(errno);
-
-			pthread_join(ns_thread, NULL);
-
-			if (targs->retptr)
-				print_verbose("NS switched: %s\n", ifdict[i]->nsname);
-
-			free(targs);
-		} else {
-			ifdict[i]->ifindex = if_nametoindex(ifdict[i]->ifname);
-			if (!ifdict[i]->ifindex) {
-				ksft_test_result_fail
-				    ("ERROR: interface \"%s\" does not exist\n", ifdict[i]->ifname);
-				ret = false;
-			} else {
-				print_verbose("Interface found: %s\n", ifdict[i]->ifname);
-			}
-		}
 	}
 	return ret;
 }
@@ -843,8 +802,7 @@ static void *worker_testapp_validate(void *arg)
 		if (bufs == MAP_FAILED)
 			exit_with_error(errno);
 
-		if (strcmp(ifobject->nsname, ""))
-			switch_namespace(ifobject->ifdict_index);
+		ifobject->ns_fd = switch_namespace(ifobject->nsname);
 	}
 
 	if (ifobject->fv.vector == tx) {
@@ -1059,60 +1017,6 @@ static void init_iface(struct ifobject *ifobj, const char *dst_mac,
 	ifobj->src_port = src_port;
 }
 
-static void *nsdisablemodethread(void *args)
-{
-	struct targs *targs = args;
-
-	targs->retptr = false;
-
-	if (switch_namespace(targs->idx)) {
-		targs->retptr = bpf_set_link_xdp_fd(ifdict[targs->idx]->ifindex, -1, targs->flags);
-	} else {
-		targs->retptr = errno;
-		print_verbose("Failed to switch namespace to %s\n", ifdict[targs->idx]->nsname);
-	}
-
-	pthread_exit(NULL);
-}
-
-static void disable_xdp_mode(int mode)
-{
-	int err = 0;
-	__u32 flags = XDP_FLAGS_UPDATE_IF_NOEXIST | mode;
-	char *mode_str = mode & XDP_FLAGS_SKB_MODE ? "skb" : "drv";
-
-	for (int i = 0; i < MAX_INTERFACES; i++) {
-		if (strcmp(ifdict[i]->nsname, "")) {
-			struct targs *targs;
-
-			targs = malloc(sizeof(*targs));
-			memset(targs, 0, sizeof(*targs));
-			if (!targs)
-				exit_with_error(errno);
-
-			targs->idx = i;
-			targs->flags = flags;
-			if (pthread_create(&ns_thread, NULL, nsdisablemodethread, targs))
-				exit_with_error(errno);
-
-			pthread_join(ns_thread, NULL);
-			err = targs->retptr;
-			free(targs);
-		} else {
-			err = bpf_set_link_xdp_fd(ifdict[i]->ifindex, -1, flags);
-		}
-
-		if (err) {
-			print_verbose("Failed to disable %s mode on interface %s\n",
-						mode_str, ifdict[i]->ifname);
-			exit_with_error(err);
-		}
-
-		print_verbose("Disabled %s mode for interface: %s\n", mode_str, ifdict[i]->ifname);
-		configured_mode = mode & XDP_FLAGS_SKB_MODE ? TEST_MODE_DRV : TEST_MODE_SKB;
-	}
-}
-
 static void run_pkt_test(int mode, int type)
 {
 	test_type = type;
@@ -1132,13 +1036,9 @@ static void run_pkt_test(int mode, int type)
 
 	switch (mode) {
 	case (TEST_MODE_SKB):
-		if (configured_mode == TEST_MODE_DRV)
-			disable_xdp_mode(XDP_FLAGS_DRV_MODE);
 		xdp_flags |= XDP_FLAGS_SKB_MODE;
 		break;
 	case (TEST_MODE_DRV):
-		if (configured_mode == TEST_MODE_SKB)
-			disable_xdp_mode(XDP_FLAGS_SKB_MODE);
 		xdp_flags |= XDP_FLAGS_DRV_MODE;
 		break;
 	default:
@@ -1185,8 +1085,6 @@ int main(int argc, char **argv)
 	ifdict[1]->fv.vector = rx;
 	init_iface(ifdict[1], MAC2, MAC1, IP2, IP1, UDP_PORT2, UDP_PORT1);
 
-	disable_xdp_mode(XDP_FLAGS_DRV_MODE);
-
 	ksft_set_plan(TEST_MODE_MAX * TEST_TYPE_MAX);
 
 	for (i = 0; i < TEST_MODE_MAX; i++) {
@@ -1194,8 +1092,11 @@ int main(int argc, char **argv)
 			run_pkt_test(i, j);
 	}
 
-	for (int i = 0; i < MAX_INTERFACES; i++)
+	for (int i = 0; i < MAX_INTERFACES; i++) {
+		if (ifdict[i]->ns_fd != -1)
+			close(ifdict[i]->ns_fd);
 		free(ifdict[i]);
+	}
 
 	ksft_exit_pass();
 
diff --git a/tools/testing/selftests/bpf/xdpxceiver.h b/tools/testing/selftests/bpf/xdpxceiver.h
index 8f9308099318..493f7498d40e 100644
--- a/tools/testing/selftests/bpf/xdpxceiver.h
+++ b/tools/testing/selftests/bpf/xdpxceiver.h
@@ -126,7 +126,7 @@ struct generic_data {
 };
 
 struct ifobject {
-	int ifindex;
+	int ns_fd;
 	int ifdict_index;
 	char ifname[MAX_INTERFACE_NAME_CHARS];
 	char nsname[MAX_INTERFACES_NAMESPACE_CHARS];
@@ -150,15 +150,9 @@ pthread_mutex_t sync_mutex;
 pthread_mutex_t sync_mutex_tx;
 pthread_cond_t signal_rx_condition;
 pthread_cond_t signal_tx_condition;
-pthread_t t0, t1, ns_thread;
+pthread_t t0, t1;
 pthread_attr_t attr;
 
-struct targs {
-	u8 retptr;
-	int idx;
-	u32 flags;
-};
-
 TAILQ_HEAD(head_s, pkt) head = TAILQ_HEAD_INITIALIZER(head);
 struct head_s *head_p;
 struct pkt {

From 9866bcd6635c264aaf3de1d89e44773a269048eb Mon Sep 17 00:00:00 2001
From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Date: Tue, 30 Mar 2021 00:43:08 +0200
Subject: [PATCH 40/68] selftests: xsk: Split worker thread

Let's a have a separate Tx/Rx worker threads instead of a one common
thread packed with Tx/Rx specific checks.

Move mmap for umem buffer space and a switch_namespace() call to
thread_common_ops.

This also allows for a bunch of simplifactions that are the subject of
the next commits. The final result will be a code base that is much
easier to follow.

Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20210329224316.17793-10-maciej.fijalkowski@intel.com
---
 tools/testing/selftests/bpf/xdpxceiver.c | 170 +++++++++++------------
 1 file changed, 84 insertions(+), 86 deletions(-)

diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c
index a856b2935d66..cb6583b8ad67 100644
--- a/tools/testing/selftests/bpf/xdpxceiver.c
+++ b/tools/testing/selftests/bpf/xdpxceiver.c
@@ -760,6 +760,15 @@ static void thread_common_ops(struct ifobject *ifobject, void *bufs, pthread_mut
 	int ctr = 0;
 	int ret;
 
+	pthread_attr_setstacksize(&attr, THREAD_STACK);
+
+	bufs = mmap(NULL, num_frames * XSK_UMEM__DEFAULT_FRAME_SIZE,
+		    PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	if (bufs == MAP_FAILED)
+		exit_with_error(errno);
+
+	ifobject->ns_fd = switch_namespace(ifobject->nsname);
+
 	xsk_configure_umem(ifobject, bufs, num_frames * XSK_UMEM__DEFAULT_FRAME_SIZE);
 	ret = xsk_configure_socket(ifobject);
 
@@ -782,9 +791,12 @@ static void thread_common_ops(struct ifobject *ifobject, void *bufs, pthread_mut
 
 	if (ctr >= SOCK_RECONF_CTR)
 		exit_with_error(ret);
+
+	print_verbose("Interface [%s] vector [%s]\n",
+		      ifobject->ifname, ifobject->fv.vector == tx ? "Tx" : "Rx");
 }
 
-static void *worker_testapp_validate(void *arg)
+static void *worker_testapp_validate_tx(void *arg)
 {
 	struct udphdr *udp_hdr =
 	    (struct udphdr *)(pkt_data + sizeof(struct ethhdr) + sizeof(struct iphdr));
@@ -792,98 +804,84 @@ static void *worker_testapp_validate(void *arg)
 	struct ethhdr *eth_hdr = (struct ethhdr *)pkt_data;
 	struct ifobject *ifobject = (struct ifobject *)arg;
 	struct generic_data data;
+	int spinningrxctr = 0;
 	void *bufs = NULL;
 
-	pthread_attr_setstacksize(&attr, THREAD_STACK);
+	if (!bidi_pass)
+		thread_common_ops(ifobject, bufs, &sync_mutex_tx, &spinning_tx);
 
-	if (!bidi_pass) {
-		bufs = mmap(NULL, num_frames * XSK_UMEM__DEFAULT_FRAME_SIZE,
-			    PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-		if (bufs == MAP_FAILED)
+	while (atomic_load(&spinning_rx) && spinningrxctr < SOCK_RECONF_CTR) {
+		spinningrxctr++;
+		usleep(USLEEP_MAX);
+	}
+
+	for (int i = 0; i < num_frames; i++) {
+		/*send EOT frame */
+		if (i == (num_frames - 1))
+			data.seqnum = -1;
+		else
+			data.seqnum = i;
+		gen_udp_hdr(&data, ifobject, udp_hdr);
+		gen_ip_hdr(ifobject, ip_hdr);
+		gen_udp_csum(udp_hdr, ip_hdr);
+		gen_eth_hdr(ifobject, eth_hdr);
+		gen_eth_frame(ifobject->umem, i * XSK_UMEM__DEFAULT_FRAME_SIZE);
+	}
+
+	print_verbose("Sending %d packets on interface %s\n",
+		      (opt_pkt_count - 1), ifobject->ifname);
+	tx_only_all(ifobject);
+
+	if (test_type != TEST_TYPE_BIDI || bidi_pass) {
+		xsk_socket__delete(ifobject->xsk->xsk);
+		(void)xsk_umem__delete(ifobject->umem->umem);
+	}
+	pthread_exit(NULL);
+}
+
+static void *worker_testapp_validate_rx(void *arg)
+{
+	struct ifobject *ifobject = (struct ifobject *)arg;
+	struct pollfd fds[MAX_SOCKS] = { };
+	void *bufs = NULL;
+
+	if (!bidi_pass)
+		thread_common_ops(ifobject, bufs, &sync_mutex_tx, &spinning_rx);
+
+	if (stat_test_type != STAT_TEST_RX_FILL_EMPTY)
+		xsk_populate_fill_ring(ifobject->umem);
+
+	TAILQ_INIT(&head);
+	if (debug_pkt_dump) {
+		pkt_buf = calloc(num_frames, sizeof(*pkt_buf));
+		if (!pkt_buf)
 			exit_with_error(errno);
-
-		ifobject->ns_fd = switch_namespace(ifobject->nsname);
 	}
 
-	if (ifobject->fv.vector == tx) {
-		int spinningrxctr = 0;
+	fds[0].fd = xsk_socket__fd(ifobject->xsk->xsk);
+	fds[0].events = POLLIN;
 
-		if (!bidi_pass)
-			thread_common_ops(ifobject, bufs, &sync_mutex_tx, &spinning_tx);
+	pthread_mutex_lock(&sync_mutex);
+	pthread_cond_signal(&signal_rx_condition);
+	pthread_mutex_unlock(&sync_mutex);
 
-		while (atomic_load(&spinning_rx) && spinningrxctr < SOCK_RECONF_CTR) {
-			spinningrxctr++;
-			usleep(USLEEP_MAX);
+	while (1) {
+		if (test_type != TEST_TYPE_STATS) {
+			rx_pkt(ifobject->xsk, fds);
+			worker_pkt_validate();
+		} else {
+			worker_stats_validate(ifobject);
 		}
-
-		print_verbose("Interface [%s] vector [Tx]\n", ifobject->ifname);
-		for (int i = 0; i < num_frames; i++) {
-			/*send EOT frame */
-			if (i == (num_frames - 1))
-				data.seqnum = -1;
-			else
-				data.seqnum = i;
-			gen_udp_hdr(&data, ifobject, udp_hdr);
-			gen_ip_hdr(ifobject, ip_hdr);
-			gen_udp_csum(udp_hdr, ip_hdr);
-			gen_eth_hdr(ifobject, eth_hdr);
-			gen_eth_frame(ifobject->umem, i * XSK_UMEM__DEFAULT_FRAME_SIZE);
-		}
-
-		print_verbose("Sending %d packets on interface %s\n",
-			       (opt_pkt_count - 1), ifobject->ifname);
-		tx_only_all(ifobject);
-	} else if (ifobject->fv.vector == rx) {
-		struct pollfd fds[MAX_SOCKS] = { };
-		int ret;
-
-		if (!bidi_pass)
-			thread_common_ops(ifobject, bufs, &sync_mutex_tx, &spinning_rx);
-
-		print_verbose("Interface [%s] vector [Rx]\n", ifobject->ifname);
-		if (stat_test_type != STAT_TEST_RX_FILL_EMPTY)
-			xsk_populate_fill_ring(ifobject->umem);
-
-		TAILQ_INIT(&head);
-		if (debug_pkt_dump) {
-			pkt_buf = calloc(num_frames, sizeof(*pkt_buf));
-			if (!pkt_buf)
-				exit_with_error(errno);
-		}
-
-		fds[0].fd = xsk_socket__fd(ifobject->xsk->xsk);
-		fds[0].events = POLLIN;
-
-		pthread_mutex_lock(&sync_mutex);
-		pthread_cond_signal(&signal_rx_condition);
-		pthread_mutex_unlock(&sync_mutex);
-
-		while (1) {
-			if (test_type == TEST_TYPE_POLL) {
-				ret = poll(fds, 1, POLL_TMOUT);
-				if (ret <= 0)
-					continue;
-			}
-
-			if (test_type != TEST_TYPE_STATS) {
-				rx_pkt(ifobject->xsk, fds);
-				worker_pkt_validate();
-			} else {
-				worker_stats_validate(ifobject);
-			}
-
-			if (sigvar)
-				break;
-		}
-
-		if (test_type != TEST_TYPE_STATS)
-			print_verbose("Received %d packets on interface %s\n",
-				pkt_counter, ifobject->ifname);
-
-		if (test_type == TEST_TYPE_TEARDOWN)
-			print_verbose("Destroying socket\n");
+		if (sigvar)
+			break;
 	}
 
+	print_verbose("Received %d packets on interface %s\n",
+		      pkt_counter, ifobject->ifname);
+
+	if (test_type == TEST_TYPE_TEARDOWN)
+		print_verbose("Destroying socket\n");
+
 	if ((test_type != TEST_TYPE_BIDI) || bidi_pass) {
 		xsk_socket__delete(ifobject->xsk->xsk);
 		(void)xsk_umem__delete(ifobject->umem->umem);
@@ -911,12 +909,12 @@ static void testapp_validate(void)
 
 	/*Spawn RX thread */
 	if (!bidi || !bidi_pass) {
-		if (pthread_create(&t0, &attr, worker_testapp_validate, ifdict[1]))
+		if (pthread_create(&t0, &attr, worker_testapp_validate_rx, ifdict[1]))
 			exit_with_error(errno);
 	} else if (bidi && bidi_pass) {
 		/*switch Tx/Rx vectors */
 		ifdict[0]->fv.vector = rx;
-		if (pthread_create(&t0, &attr, worker_testapp_validate, ifdict[0]))
+		if (pthread_create(&t0, &attr, worker_testapp_validate_rx, ifdict[0]))
 			exit_with_error(errno);
 	}
 
@@ -931,12 +929,12 @@ static void testapp_validate(void)
 
 	/*Spawn TX thread */
 	if (!bidi || !bidi_pass) {
-		if (pthread_create(&t1, &attr, worker_testapp_validate, ifdict[0]))
+		if (pthread_create(&t1, &attr, worker_testapp_validate_tx, ifdict[0]))
 			exit_with_error(errno);
 	} else if (bidi && bidi_pass) {
 		/*switch Tx/Rx vectors */
 		ifdict[1]->fv.vector = tx;
-		if (pthread_create(&t1, &attr, worker_testapp_validate, ifdict[1]))
+		if (pthread_create(&t1, &attr, worker_testapp_validate_tx, ifdict[1]))
 			exit_with_error(errno);
 	}
 

From 99f9bcb65705dda07288b759c569d30d8a4f297c Mon Sep 17 00:00:00 2001
From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Date: Tue, 30 Mar 2021 00:43:09 +0200
Subject: [PATCH 41/68] selftests: xsk: Remove Tx synchronization resources

Tx thread needs to be started after the Rx side is fully initialized so
that packets are not xmitted until xsk Rx socket is ready to be used.

It can be observed that atomic variable spinning_tx is not checked from
Rx side in any way, so thread_common_ops can be modified to only address
the spinning_rx. This means that spinning_tx can be removed altogheter.

signal_tx_condition is never utilized, so simply remove it.

Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20210329224316.17793-11-maciej.fijalkowski@intel.com
---
 tools/testing/selftests/bpf/xdpxceiver.c | 15 +++++++--------
 tools/testing/selftests/bpf/xdpxceiver.h |  2 --
 2 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c
index cb6583b8ad67..9f7ee9a38752 100644
--- a/tools/testing/selftests/bpf/xdpxceiver.c
+++ b/tools/testing/selftests/bpf/xdpxceiver.c
@@ -126,7 +126,6 @@ static void pthread_init_mutex(void)
 	pthread_mutex_init(&sync_mutex, NULL);
 	pthread_mutex_init(&sync_mutex_tx, NULL);
 	pthread_cond_init(&signal_rx_condition, NULL);
-	pthread_cond_init(&signal_tx_condition, NULL);
 }
 
 static void pthread_destroy_mutex(void)
@@ -134,7 +133,6 @@ static void pthread_destroy_mutex(void)
 	pthread_mutex_destroy(&sync_mutex);
 	pthread_mutex_destroy(&sync_mutex_tx);
 	pthread_cond_destroy(&signal_rx_condition);
-	pthread_cond_destroy(&signal_tx_condition);
 }
 
 static void *memset32_htonl(void *dest, u32 val, u32 size)
@@ -754,8 +752,7 @@ static void worker_pkt_validate(void)
 	}
 }
 
-static void thread_common_ops(struct ifobject *ifobject, void *bufs, pthread_mutex_t *mutexptr,
-			      atomic_int *spinningptr)
+static void thread_common_ops(struct ifobject *ifobject, void *bufs, pthread_mutex_t *mutexptr)
 {
 	int ctr = 0;
 	int ret;
@@ -780,13 +777,15 @@ static void thread_common_ops(struct ifobject *ifobject, void *bufs, pthread_mut
 	 */
 	pthread_mutex_lock(mutexptr);
 	while (ret && ctr < SOCK_RECONF_CTR) {
-		atomic_store(spinningptr, 1);
+		if (ifobject->fv.vector == rx)
+			atomic_store(&spinning_rx, 1);
 		xsk_configure_umem(ifobject, bufs, num_frames * XSK_UMEM__DEFAULT_FRAME_SIZE);
 		ret = xsk_configure_socket(ifobject);
 		usleep(USLEEP_MAX);
 		ctr++;
 	}
-	atomic_store(spinningptr, 0);
+	if (ifobject->fv.vector == rx)
+		atomic_store(&spinning_rx, 0);
 	pthread_mutex_unlock(mutexptr);
 
 	if (ctr >= SOCK_RECONF_CTR)
@@ -808,7 +807,7 @@ static void *worker_testapp_validate_tx(void *arg)
 	void *bufs = NULL;
 
 	if (!bidi_pass)
-		thread_common_ops(ifobject, bufs, &sync_mutex_tx, &spinning_tx);
+		thread_common_ops(ifobject, bufs, &sync_mutex_tx);
 
 	while (atomic_load(&spinning_rx) && spinningrxctr < SOCK_RECONF_CTR) {
 		spinningrxctr++;
@@ -846,7 +845,7 @@ static void *worker_testapp_validate_rx(void *arg)
 	void *bufs = NULL;
 
 	if (!bidi_pass)
-		thread_common_ops(ifobject, bufs, &sync_mutex_tx, &spinning_rx);
+		thread_common_ops(ifobject, bufs, &sync_mutex_tx);
 
 	if (stat_test_type != STAT_TEST_RX_FILL_EMPTY)
 		xsk_populate_fill_ring(ifobject->umem);
diff --git a/tools/testing/selftests/bpf/xdpxceiver.h b/tools/testing/selftests/bpf/xdpxceiver.h
index 493f7498d40e..483be41229c6 100644
--- a/tools/testing/selftests/bpf/xdpxceiver.h
+++ b/tools/testing/selftests/bpf/xdpxceiver.h
@@ -144,12 +144,10 @@ struct ifobject {
 static struct ifobject *ifdict[MAX_INTERFACES];
 
 /*threads*/
-atomic_int spinning_tx;
 atomic_int spinning_rx;
 pthread_mutex_t sync_mutex;
 pthread_mutex_t sync_mutex_tx;
 pthread_cond_t signal_rx_condition;
-pthread_cond_t signal_tx_condition;
 pthread_t t0, t1;
 pthread_attr_t attr;
 

From 9445f8c765838edf84dd0d3910ff309bdab8f95f Mon Sep 17 00:00:00 2001
From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Date: Tue, 30 Mar 2021 00:43:10 +0200
Subject: [PATCH 42/68] selftests: xsk: Refactor teardown/bidi test cases and
 testapp_validate

Currently, there is a testapp_sockets() that acts like a wrapper around
testapp_validate() and it is called for bidi and teardown test types.
Other test types call testapp_validate() directly.

Split testapp_sockets() onto two separate functions so a bunch of bidi
specific logic can be moved there and out of testapp_validate() itself.

Introduce function pointer to ifobject struct which will be used for
assigning the Rx/Tx function that is assigned to worker thread. Let's
also have a global ifobject Rx/Tx pointers so it's easier to swap the
vectors on a second run of a bi-directional test. Thread creation now is
easey to follow.

switching_notify variable is useless, info about vector switch can be
printed based on bidi_pass state.

Last but not least, init/destroy synchronization variables only once,
not per each test.

Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20210329224316.17793-12-maciej.fijalkowski@intel.com
---
 tools/testing/selftests/bpf/xdpxceiver.c | 117 ++++++++++++++---------
 tools/testing/selftests/bpf/xdpxceiver.h |  14 +--
 2 files changed, 78 insertions(+), 53 deletions(-)

diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c
index 9f7ee9a38752..c0ebca8f55f8 100644
--- a/tools/testing/selftests/bpf/xdpxceiver.c
+++ b/tools/testing/selftests/bpf/xdpxceiver.c
@@ -896,26 +896,10 @@ static void testapp_validate(void)
 	pthread_attr_init(&attr);
 	pthread_attr_setstacksize(&attr, THREAD_STACK);
 
-	if ((test_type == TEST_TYPE_BIDI) && bidi_pass) {
-		pthread_init_mutex();
-		if (!switching_notify) {
-			print_verbose("Switching Tx/Rx vectors\n");
-			switching_notify++;
-		}
-	}
-
 	pthread_mutex_lock(&sync_mutex);
 
 	/*Spawn RX thread */
-	if (!bidi || !bidi_pass) {
-		if (pthread_create(&t0, &attr, worker_testapp_validate_rx, ifdict[1]))
-			exit_with_error(errno);
-	} else if (bidi && bidi_pass) {
-		/*switch Tx/Rx vectors */
-		ifdict[0]->fv.vector = rx;
-		if (pthread_create(&t0, &attr, worker_testapp_validate_rx, ifdict[0]))
-			exit_with_error(errno);
-	}
+	pthread_create(&t0, &attr, ifdict_rx->func_ptr, ifdict_rx);
 
 	if (clock_gettime(CLOCK_REALTIME, &max_wait))
 		exit_with_error(errno);
@@ -927,15 +911,7 @@ static void testapp_validate(void)
 	pthread_mutex_unlock(&sync_mutex);
 
 	/*Spawn TX thread */
-	if (!bidi || !bidi_pass) {
-		if (pthread_create(&t1, &attr, worker_testapp_validate_tx, ifdict[0]))
-			exit_with_error(errno);
-	} else if (bidi && bidi_pass) {
-		/*switch Tx/Rx vectors */
-		ifdict[1]->fv.vector = tx;
-		if (pthread_create(&t1, &attr, worker_testapp_validate_tx, ifdict[1]))
-			exit_with_error(errno);
-	}
+	pthread_create(&t1, &attr, ifdict_tx->func_ptr, ifdict_tx);
 
 	pthread_join(t1, NULL);
 	pthread_join(t0, NULL);
@@ -953,21 +929,56 @@ static void testapp_validate(void)
 		print_ksft_result();
 }
 
-static void testapp_sockets(void)
+static void testapp_teardown(void)
 {
-	for (int i = 0; i < ((test_type == TEST_TYPE_TEARDOWN) ? MAX_TEARDOWN_ITER : MAX_BIDI_ITER);
-	     i++) {
+	int i;
+
+	for (i = 0; i < MAX_TEARDOWN_ITER; i++) {
 		pkt_counter = 0;
 		prev_pkt = -1;
 		sigvar = 0;
 		print_verbose("Creating socket\n");
 		testapp_validate();
-		test_type == TEST_TYPE_BIDI ? bidi_pass++ : bidi_pass;
 	}
 
 	print_ksft_result();
 }
 
+static void swap_vectors(struct ifobject *ifobj1, struct ifobject *ifobj2)
+{
+	void *(*tmp_func_ptr)(void *) = ifobj1->func_ptr;
+	enum fvector tmp_vector = ifobj1->fv.vector;
+
+	ifobj1->func_ptr = ifobj2->func_ptr;
+	ifobj1->fv.vector = ifobj2->fv.vector;
+
+	ifobj2->func_ptr = tmp_func_ptr;
+	ifobj2->fv.vector = tmp_vector;
+
+	ifdict_tx = ifobj1;
+	ifdict_rx = ifobj2;
+}
+
+static void testapp_bidi(void)
+{
+	for (int i = 0; i < MAX_BIDI_ITER; i++) {
+		pkt_counter = 0;
+		prev_pkt = -1;
+		sigvar = 0;
+		print_verbose("Creating socket\n");
+		testapp_validate();
+		if (!bidi_pass) {
+			print_verbose("Switching Tx/Rx vectors\n");
+			swap_vectors(ifdict[1], ifdict[0]);
+		}
+		bidi_pass++;
+	}
+
+	swap_vectors(ifdict[0], ifdict[1]);
+
+	print_ksft_result();
+}
+
 static void testapp_stats(void)
 {
 	for (int i = 0; i < STAT_TEST_TYPE_MAX; i++) {
@@ -997,7 +1008,7 @@ static void testapp_stats(void)
 static void init_iface(struct ifobject *ifobj, const char *dst_mac,
 		       const char *src_mac, const char *dst_ip,
 		       const char *src_ip, const u16 dst_port,
-		       const u16 src_port)
+		       const u16 src_port, enum fvector vector)
 {
 	struct in_addr ip;
 
@@ -1012,6 +1023,16 @@ static void init_iface(struct ifobject *ifobj, const char *dst_mac,
 
 	ifobj->dst_port = dst_port;
 	ifobj->src_port = src_port;
+
+	if (vector == tx) {
+		ifobj->fv.vector = tx;
+		ifobj->func_ptr = worker_testapp_validate_tx;
+		ifdict_tx = ifobj;
+	} else {
+		ifobj->fv.vector = rx;
+		ifobj->func_ptr = worker_testapp_validate_rx;
+		ifdict_rx = ifobj;
+	}
 }
 
 static void run_pkt_test(int mode, int type)
@@ -1021,11 +1042,8 @@ static void run_pkt_test(int mode, int type)
 	/* reset defaults after potential previous test */
 	xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
 	pkt_counter = 0;
-	switching_notify = 0;
 	bidi_pass = 0;
 	prev_pkt = -1;
-	ifdict[0]->fv.vector = tx;
-	ifdict[1]->fv.vector = rx;
 	sigvar = 0;
 	stat_test_type = -1;
 	rxqsize = XSK_RING_CONS__DEFAULT_NUM_DESCS;
@@ -1042,16 +1060,20 @@ static void run_pkt_test(int mode, int type)
 		break;
 	}
 
-	pthread_init_mutex();
-
-	if (test_type == TEST_TYPE_STATS)
+	switch (test_type) {
+	case TEST_TYPE_STATS:
 		testapp_stats();
-	else if ((test_type != TEST_TYPE_TEARDOWN) && (test_type != TEST_TYPE_BIDI))
+		break;
+	case TEST_TYPE_TEARDOWN:
+		testapp_teardown();
+		break;
+	case TEST_TYPE_BIDI:
+		testapp_bidi();
+		break;
+	default:
 		testapp_validate();
-	else
-		testapp_sockets();
-
-	pthread_destroy_mutex();
+		break;
+	}
 }
 
 int main(int argc, char **argv)
@@ -1076,14 +1098,13 @@ int main(int argc, char **argv)
 
 	num_frames = ++opt_pkt_count;
 
-	ifdict[0]->fv.vector = tx;
-	init_iface(ifdict[0], MAC1, MAC2, IP1, IP2, UDP_PORT1, UDP_PORT2);
-
-	ifdict[1]->fv.vector = rx;
-	init_iface(ifdict[1], MAC2, MAC1, IP2, IP1, UDP_PORT2, UDP_PORT1);
+	init_iface(ifdict[0], MAC1, MAC2, IP1, IP2, UDP_PORT1, UDP_PORT2, tx);
+	init_iface(ifdict[1], MAC2, MAC1, IP2, IP1, UDP_PORT2, UDP_PORT1, rx);
 
 	ksft_set_plan(TEST_MODE_MAX * TEST_TYPE_MAX);
 
+	pthread_init_mutex();
+
 	for (i = 0; i < TEST_MODE_MAX; i++) {
 		for (j = 0; j < TEST_TYPE_MAX; j++)
 			run_pkt_test(i, j);
@@ -1095,6 +1116,8 @@ int main(int argc, char **argv)
 		free(ifdict[i]);
 	}
 
+	pthread_destroy_mutex();
+
 	ksft_exit_pass();
 
 	return 0;
diff --git a/tools/testing/selftests/bpf/xdpxceiver.h b/tools/testing/selftests/bpf/xdpxceiver.h
index 483be41229c6..3945746900af 100644
--- a/tools/testing/selftests/bpf/xdpxceiver.h
+++ b/tools/testing/selftests/bpf/xdpxceiver.h
@@ -77,7 +77,6 @@ enum STAT_TEST_TYPES {
 static int configured_mode = TEST_MODE_UNCONFIGURED;
 static u8 debug_pkt_dump;
 static u32 num_frames;
-static u8 switching_notify;
 static u8 bidi_pass;
 static int test_type;
 
@@ -126,22 +125,25 @@ struct generic_data {
 };
 
 struct ifobject {
-	int ns_fd;
-	int ifdict_index;
 	char ifname[MAX_INTERFACE_NAME_CHARS];
 	char nsname[MAX_INTERFACES_NAMESPACE_CHARS];
-	struct flow_vector fv;
 	struct xsk_socket_info *xsk;
 	struct xsk_umem_info *umem;
-	u8 dst_mac[ETH_ALEN];
-	u8 src_mac[ETH_ALEN];
+	void *(*func_ptr)(void *arg);
+	struct flow_vector fv;
+	int ns_fd;
+	int ifdict_index;
 	u32 dst_ip;
 	u32 src_ip;
 	u16 src_port;
 	u16 dst_port;
+	u8 dst_mac[ETH_ALEN];
+	u8 src_mac[ETH_ALEN];
 };
 
 static struct ifobject *ifdict[MAX_INTERFACES];
+static struct ifobject *ifdict_rx;
+static struct ifobject *ifdict_tx;
 
 /*threads*/
 atomic_int spinning_rx;

From 0464b1ed07677f869005bde3fade1580b48de67e Mon Sep 17 00:00:00 2001
From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Date: Tue, 30 Mar 2021 00:43:11 +0200
Subject: [PATCH 43/68] selftests: xsk: Remove sync_mutex_tx and atomic var

Although thread_common_ops() are called in both Tx and Rx threads,
testapp_validate() will not spawn Tx thread until Rx thread signals that
it has finished its initialization via condition variable.

Therefore, locking in thread_common_ops is not needed and furthermore Tx
thread does not have to spin on atomic variable.

Note that this simplification wouldn't be possible if there would still
be a common worker thread.

Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20210329224316.17793-13-maciej.fijalkowski@intel.com
---
 tools/testing/selftests/bpf/xdpxceiver.c | 31 ++++++------------------
 tools/testing/selftests/bpf/xdpxceiver.h |  2 --
 2 files changed, 7 insertions(+), 26 deletions(-)

diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c
index c0ebca8f55f8..35e14ad8097e 100644
--- a/tools/testing/selftests/bpf/xdpxceiver.c
+++ b/tools/testing/selftests/bpf/xdpxceiver.c
@@ -121,17 +121,15 @@ static void __exit_with_error(int error, const char *file, const char *func, int
 			       test_type == TEST_TYPE_BIDI ? "Bi-directional Sockets" : "",\
 			       test_type == TEST_TYPE_STATS ? "Stats" : ""))
 
-static void pthread_init_mutex(void)
+static void init_sync_resources(void)
 {
 	pthread_mutex_init(&sync_mutex, NULL);
-	pthread_mutex_init(&sync_mutex_tx, NULL);
 	pthread_cond_init(&signal_rx_condition, NULL);
 }
 
-static void pthread_destroy_mutex(void)
+static void destroy_sync_resources(void)
 {
 	pthread_mutex_destroy(&sync_mutex);
-	pthread_mutex_destroy(&sync_mutex_tx);
 	pthread_cond_destroy(&signal_rx_condition);
 }
 
@@ -752,7 +750,7 @@ static void worker_pkt_validate(void)
 	}
 }
 
-static void thread_common_ops(struct ifobject *ifobject, void *bufs, pthread_mutex_t *mutexptr)
+static void thread_common_ops(struct ifobject *ifobject, void *bufs)
 {
 	int ctr = 0;
 	int ret;
@@ -771,22 +769,13 @@ static void thread_common_ops(struct ifobject *ifobject, void *bufs, pthread_mut
 
 	/* Retry Create Socket if it fails as xsk_socket__create()
 	 * is asynchronous
-	 *
-	 * Essential to lock Mutex here to prevent Tx thread from
-	 * entering before Rx and causing a deadlock
 	 */
-	pthread_mutex_lock(mutexptr);
 	while (ret && ctr < SOCK_RECONF_CTR) {
-		if (ifobject->fv.vector == rx)
-			atomic_store(&spinning_rx, 1);
 		xsk_configure_umem(ifobject, bufs, num_frames * XSK_UMEM__DEFAULT_FRAME_SIZE);
 		ret = xsk_configure_socket(ifobject);
 		usleep(USLEEP_MAX);
 		ctr++;
 	}
-	if (ifobject->fv.vector == rx)
-		atomic_store(&spinning_rx, 0);
-	pthread_mutex_unlock(mutexptr);
 
 	if (ctr >= SOCK_RECONF_CTR)
 		exit_with_error(ret);
@@ -803,16 +792,10 @@ static void *worker_testapp_validate_tx(void *arg)
 	struct ethhdr *eth_hdr = (struct ethhdr *)pkt_data;
 	struct ifobject *ifobject = (struct ifobject *)arg;
 	struct generic_data data;
-	int spinningrxctr = 0;
 	void *bufs = NULL;
 
 	if (!bidi_pass)
-		thread_common_ops(ifobject, bufs, &sync_mutex_tx);
-
-	while (atomic_load(&spinning_rx) && spinningrxctr < SOCK_RECONF_CTR) {
-		spinningrxctr++;
-		usleep(USLEEP_MAX);
-	}
+		thread_common_ops(ifobject, bufs);
 
 	for (int i = 0; i < num_frames; i++) {
 		/*send EOT frame */
@@ -845,7 +828,7 @@ static void *worker_testapp_validate_rx(void *arg)
 	void *bufs = NULL;
 
 	if (!bidi_pass)
-		thread_common_ops(ifobject, bufs, &sync_mutex_tx);
+		thread_common_ops(ifobject, bufs);
 
 	if (stat_test_type != STAT_TEST_RX_FILL_EMPTY)
 		xsk_populate_fill_ring(ifobject->umem);
@@ -1103,7 +1086,7 @@ int main(int argc, char **argv)
 
 	ksft_set_plan(TEST_MODE_MAX * TEST_TYPE_MAX);
 
-	pthread_init_mutex();
+	init_sync_resources();
 
 	for (i = 0; i < TEST_MODE_MAX; i++) {
 		for (j = 0; j < TEST_TYPE_MAX; j++)
@@ -1116,7 +1099,7 @@ int main(int argc, char **argv)
 		free(ifdict[i]);
 	}
 
-	pthread_destroy_mutex();
+	destroy_sync_resources();
 
 	ksft_exit_pass();
 
diff --git a/tools/testing/selftests/bpf/xdpxceiver.h b/tools/testing/selftests/bpf/xdpxceiver.h
index 3945746900af..e304229d8a4c 100644
--- a/tools/testing/selftests/bpf/xdpxceiver.h
+++ b/tools/testing/selftests/bpf/xdpxceiver.h
@@ -146,9 +146,7 @@ static struct ifobject *ifdict_rx;
 static struct ifobject *ifdict_tx;
 
 /*threads*/
-atomic_int spinning_rx;
 pthread_mutex_t sync_mutex;
-pthread_mutex_t sync_mutex_tx;
 pthread_cond_t signal_rx_condition;
 pthread_t t0, t1;
 pthread_attr_t attr;

From 34829eec3b698219a5cbc09a174b2a7407b3b4c1 Mon Sep 17 00:00:00 2001
From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Date: Tue, 30 Mar 2021 00:43:12 +0200
Subject: [PATCH 44/68] veth: Implement ethtool's get_channels() callback

Libbpf's xsk part calls get_channels() API to retrieve the queue count
of the underlying driver so that XSKMAP is sized accordingly.

Implement that in veth so multi queue scenarios can work properly.

Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20210329224316.17793-14-maciej.fijalkowski@intel.com
---
 drivers/net/veth.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index 91b73db37555..9e525646df1d 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -218,6 +218,17 @@ static void veth_get_ethtool_stats(struct net_device *dev,
 	}
 }
 
+static void veth_get_channels(struct net_device *dev,
+			      struct ethtool_channels *channels)
+{
+	channels->tx_count = dev->real_num_tx_queues;
+	channels->rx_count = dev->real_num_rx_queues;
+	channels->max_tx = dev->real_num_tx_queues;
+	channels->max_rx = dev->real_num_rx_queues;
+	channels->combined_count = min(dev->real_num_rx_queues, dev->real_num_tx_queues);
+	channels->max_combined = min(dev->real_num_rx_queues, dev->real_num_tx_queues);
+}
+
 static const struct ethtool_ops veth_ethtool_ops = {
 	.get_drvinfo		= veth_get_drvinfo,
 	.get_link		= ethtool_op_get_link,
@@ -226,6 +237,7 @@ static const struct ethtool_ops veth_ethtool_ops = {
 	.get_ethtool_stats	= veth_get_ethtool_stats,
 	.get_link_ksettings	= veth_get_link_ksettings,
 	.get_ts_info		= ethtool_op_get_ts_info,
+	.get_channels		= veth_get_channels,
 };
 
 /* general routines */

From 27e1ca2525de264901b5c2d9d0c4403c3fe8608c Mon Sep 17 00:00:00 2001
From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Date: Tue, 30 Mar 2021 00:43:13 +0200
Subject: [PATCH 45/68] selftests: xsk: Implement bpf_link test

Introduce a test that is supposed to verify the persistence of BPF
resources based on underlying bpf_link usage.

Test will:
1) create and bind two sockets on queue ids 0 and 1
2) run a traffic on queue ids 0
3) remove xsk sockets from queue 0 on both veth interfaces
4) run a traffic on queues ids 1

Running traffic successfully on qids 1 means that BPF resources were
not removed on step 3).

In order to make it work, change the command that creates veth pair to
have the 4 queue pairs by default.

Introduce the arrays of xsks and umems to ifobject struct but keep a
pointers to single entities, so rest of the logic around Rx/Tx can be
kept as-is.

For umem handling, double the size of mmapped space and split that
between the two sockets.

Rename also bidi_pass to a variable 'second_step' of a boolean type as
it's now used also for the test that is introduced here and it doesn't
have anything in common with bi-directional testing.

Drop opt_queue command line argument as it wasn't working before anyway.

Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20210329224316.17793-15-maciej.fijalkowski@intel.com
---
 tools/testing/selftests/bpf/test_xsk.sh  |   3 +-
 tools/testing/selftests/bpf/xdpxceiver.c | 179 +++++++++++++++++------
 tools/testing/selftests/bpf/xdpxceiver.h |   7 +-
 3 files changed, 139 insertions(+), 50 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_xsk.sh b/tools/testing/selftests/bpf/test_xsk.sh
index 56d4474e2c83..46633a3bfb0b 100755
--- a/tools/testing/selftests/bpf/test_xsk.sh
+++ b/tools/testing/selftests/bpf/test_xsk.sh
@@ -107,7 +107,7 @@ setup_vethPairs() {
 	        echo "setting up ${VETH0}: namespace: ${NS0}"
 	fi
 	ip netns add ${NS1}
-	ip link add ${VETH0} type veth peer name ${VETH1}
+	ip link add ${VETH0} numtxqueues 4 numrxqueues 4 type veth peer name ${VETH1} numtxqueues 4 numrxqueues 4
 	if [ -f /proc/net/if_inet6 ]; then
 		echo 1 > /proc/sys/net/ipv6/conf/${VETH0}/disable_ipv6
 	fi
@@ -118,6 +118,7 @@ setup_vethPairs() {
 	ip netns exec ${NS1} ip link set ${VETH1} mtu ${MTU}
 	ip link set ${VETH0} mtu ${MTU}
 	ip netns exec ${NS1} ip link set ${VETH1} up
+	ip netns exec ${NS1} ip link set dev lo up
 	ip link set ${VETH0} up
 }
 
diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c
index 35e14ad8097e..5c0d6dbd076f 100644
--- a/tools/testing/selftests/bpf/xdpxceiver.c
+++ b/tools/testing/selftests/bpf/xdpxceiver.c
@@ -41,8 +41,12 @@
  *            Reduce the size of the RX ring to a fraction of the fill ring size.
  *       iv.  fill queue empty
  *            Do not populate the fill queue and then try to receive pkts.
+ *    f. bpf_link resource persistence
+ *       Configure sockets at indexes 0 and 1, run a traffic on queue ids 0,
+ *       then remove xsk sockets from queue 0 on both veth interfaces and
+ *       finally run a traffic on queues ids 1
  *
- * Total tests: 10
+ * Total tests: 12
  *
  * Flow:
  * -----
@@ -115,11 +119,12 @@ static void __exit_with_error(int error, const char *file, const char *func, int
 #define exit_with_error(error) __exit_with_error(error, __FILE__, __func__, __LINE__)
 
 #define print_ksft_result(void)\
-	(ksft_test_result_pass("PASS: %s %s %s%s%s\n", configured_mode ? "DRV" : "SKB",\
+	(ksft_test_result_pass("PASS: %s %s %s%s%s%s\n", configured_mode ? "DRV" : "SKB",\
 			       test_type == TEST_TYPE_POLL ? "POLL" : "NOPOLL",\
 			       test_type == TEST_TYPE_TEARDOWN ? "Socket Teardown" : "",\
 			       test_type == TEST_TYPE_BIDI ? "Bi-directional Sockets" : "",\
-			       test_type == TEST_TYPE_STATS ? "Stats" : ""))
+			       test_type == TEST_TYPE_STATS ? "Stats" : "",\
+			       test_type == TEST_TYPE_BPF_RES ? "BPF RES" : ""))
 
 static void init_sync_resources(void)
 {
@@ -258,9 +263,8 @@ static void gen_eth_frame(struct xsk_umem_info *umem, u64 addr)
 	memcpy(xsk_umem__get_data(umem->buffer, addr), pkt_data, PKT_SIZE);
 }
 
-static void xsk_configure_umem(struct ifobject *data, void *buffer, u64 size)
+static void xsk_configure_umem(struct ifobject *data, void *buffer, int idx)
 {
-	int ret;
 	struct xsk_umem_config cfg = {
 		.fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
 		.comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
@@ -268,17 +272,22 @@ static void xsk_configure_umem(struct ifobject *data, void *buffer, u64 size)
 		.frame_headroom = frame_headroom,
 		.flags = XSK_UMEM__DEFAULT_FLAGS
 	};
+	int size = num_frames * XSK_UMEM__DEFAULT_FRAME_SIZE;
+	struct xsk_umem_info *umem;
+	int ret;
 
-	data->umem = calloc(1, sizeof(struct xsk_umem_info));
-	if (!data->umem)
+	umem = calloc(1, sizeof(struct xsk_umem_info));
+	if (!umem)
 		exit_with_error(errno);
 
-	ret = xsk_umem__create(&data->umem->umem, buffer, size,
-			       &data->umem->fq, &data->umem->cq, &cfg);
+	ret = xsk_umem__create(&umem->umem, buffer, size,
+			       &umem->fq, &umem->cq, &cfg);
 	if (ret)
 		exit_with_error(ret);
 
-	data->umem->buffer = buffer;
+	umem->buffer = buffer;
+
+	data->umem_arr[idx] = umem;
 }
 
 static void xsk_populate_fill_ring(struct xsk_umem_info *umem)
@@ -294,18 +303,19 @@ static void xsk_populate_fill_ring(struct xsk_umem_info *umem)
 	xsk_ring_prod__submit(&umem->fq, XSK_RING_PROD__DEFAULT_NUM_DESCS);
 }
 
-static int xsk_configure_socket(struct ifobject *ifobject)
+static int xsk_configure_socket(struct ifobject *ifobject, int idx)
 {
 	struct xsk_socket_config cfg;
+	struct xsk_socket_info *xsk;
 	struct xsk_ring_cons *rxr;
 	struct xsk_ring_prod *txr;
 	int ret;
 
-	ifobject->xsk = calloc(1, sizeof(struct xsk_socket_info));
-	if (!ifobject->xsk)
+	xsk = calloc(1, sizeof(struct xsk_socket_info));
+	if (!xsk)
 		exit_with_error(errno);
 
-	ifobject->xsk->umem = ifobject->umem;
+	xsk->umem = ifobject->umem;
 	cfg.rx_size = rxqsize;
 	cfg.tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
 	cfg.libbpf_flags = 0;
@@ -313,19 +323,20 @@ static int xsk_configure_socket(struct ifobject *ifobject)
 	cfg.bind_flags = xdp_bind_flags;
 
 	if (test_type != TEST_TYPE_BIDI) {
-		rxr = (ifobject->fv.vector == rx) ? &ifobject->xsk->rx : NULL;
-		txr = (ifobject->fv.vector == tx) ? &ifobject->xsk->tx : NULL;
+		rxr = (ifobject->fv.vector == rx) ? &xsk->rx : NULL;
+		txr = (ifobject->fv.vector == tx) ? &xsk->tx : NULL;
 	} else {
-		rxr = &ifobject->xsk->rx;
-		txr = &ifobject->xsk->tx;
+		rxr = &xsk->rx;
+		txr = &xsk->tx;
 	}
 
-	ret = xsk_socket__create(&ifobject->xsk->xsk, ifobject->ifname,
-				 opt_queue, ifobject->umem->umem, rxr, txr, &cfg);
-
+	ret = xsk_socket__create(&xsk->xsk, ifobject->ifname, idx,
+				 ifobject->umem->umem, rxr, txr, &cfg);
 	if (ret)
 		return 1;
 
+	ifobject->xsk_arr[idx] = xsk;
+
 	return 0;
 }
 
@@ -393,7 +404,7 @@ static void parse_command_line(int argc, char **argv)
 	opterr = 0;
 
 	for (;;) {
-		c = getopt_long(argc, argv, "i:q:DC:v", long_options, &option_index);
+		c = getopt_long(argc, argv, "i:DC:v", long_options, &option_index);
 
 		if (c == -1)
 			break;
@@ -413,9 +424,6 @@ static void parse_command_line(int argc, char **argv)
 				       MAX_INTERFACES_NAMESPACE_CHARS);
 			interface_index++;
 			break;
-		case 'q':
-			opt_queue = atoi(optarg);
-			break;
 		case 'D':
 			debug_pkt_dump = 1;
 			break;
@@ -752,27 +760,33 @@ static void worker_pkt_validate(void)
 
 static void thread_common_ops(struct ifobject *ifobject, void *bufs)
 {
+	int umem_sz = num_frames * XSK_UMEM__DEFAULT_FRAME_SIZE;
 	int ctr = 0;
 	int ret;
 
 	pthread_attr_setstacksize(&attr, THREAD_STACK);
 
-	bufs = mmap(NULL, num_frames * XSK_UMEM__DEFAULT_FRAME_SIZE,
+	ifobject->ns_fd = switch_namespace(ifobject->nsname);
+
+	if (test_type == TEST_TYPE_BPF_RES)
+		umem_sz *= 2;
+
+	bufs = mmap(NULL, umem_sz,
 		    PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 	if (bufs == MAP_FAILED)
 		exit_with_error(errno);
 
-	ifobject->ns_fd = switch_namespace(ifobject->nsname);
-
-	xsk_configure_umem(ifobject, bufs, num_frames * XSK_UMEM__DEFAULT_FRAME_SIZE);
-	ret = xsk_configure_socket(ifobject);
+	xsk_configure_umem(ifobject, bufs, 0);
+	ifobject->umem = ifobject->umem_arr[0];
+	ret = xsk_configure_socket(ifobject, 0);
 
 	/* Retry Create Socket if it fails as xsk_socket__create()
 	 * is asynchronous
 	 */
 	while (ret && ctr < SOCK_RECONF_CTR) {
-		xsk_configure_umem(ifobject, bufs, num_frames * XSK_UMEM__DEFAULT_FRAME_SIZE);
-		ret = xsk_configure_socket(ifobject);
+		xsk_configure_umem(ifobject, bufs, 0);
+		ifobject->umem = ifobject->umem_arr[0];
+		ret = xsk_configure_socket(ifobject, 0);
 		usleep(USLEEP_MAX);
 		ctr++;
 	}
@@ -780,10 +794,34 @@ static void thread_common_ops(struct ifobject *ifobject, void *bufs)
 	if (ctr >= SOCK_RECONF_CTR)
 		exit_with_error(ret);
 
+	ifobject->umem = ifobject->umem_arr[0];
+	ifobject->xsk = ifobject->xsk_arr[0];
+
+	if (test_type == TEST_TYPE_BPF_RES) {
+		xsk_configure_umem(ifobject, (u8 *)bufs + (umem_sz / 2), 1);
+		ifobject->umem = ifobject->umem_arr[1];
+		ret = xsk_configure_socket(ifobject, 1);
+	}
+
+	ifobject->umem = ifobject->umem_arr[0];
+	ifobject->xsk = ifobject->xsk_arr[0];
 	print_verbose("Interface [%s] vector [%s]\n",
 		      ifobject->ifname, ifobject->fv.vector == tx ? "Tx" : "Rx");
 }
 
+static bool testapp_is_test_two_stepped(void)
+{
+	return (test_type != TEST_TYPE_BIDI && test_type != TEST_TYPE_BPF_RES) || second_step;
+}
+
+static void testapp_cleanup_xsk_res(struct ifobject *ifobj)
+{
+	if (testapp_is_test_two_stepped()) {
+		xsk_socket__delete(ifobj->xsk->xsk);
+		(void)xsk_umem__delete(ifobj->umem->umem);
+	}
+}
+
 static void *worker_testapp_validate_tx(void *arg)
 {
 	struct udphdr *udp_hdr =
@@ -794,7 +832,7 @@ static void *worker_testapp_validate_tx(void *arg)
 	struct generic_data data;
 	void *bufs = NULL;
 
-	if (!bidi_pass)
+	if (!second_step)
 		thread_common_ops(ifobject, bufs);
 
 	for (int i = 0; i < num_frames; i++) {
@@ -814,10 +852,7 @@ static void *worker_testapp_validate_tx(void *arg)
 		      (opt_pkt_count - 1), ifobject->ifname);
 	tx_only_all(ifobject);
 
-	if (test_type != TEST_TYPE_BIDI || bidi_pass) {
-		xsk_socket__delete(ifobject->xsk->xsk);
-		(void)xsk_umem__delete(ifobject->umem->umem);
-	}
+	testapp_cleanup_xsk_res(ifobject);
 	pthread_exit(NULL);
 }
 
@@ -827,7 +862,7 @@ static void *worker_testapp_validate_rx(void *arg)
 	struct pollfd fds[MAX_SOCKS] = { };
 	void *bufs = NULL;
 
-	if (!bidi_pass)
+	if (!second_step)
 		thread_common_ops(ifobject, bufs);
 
 	if (stat_test_type != STAT_TEST_RX_FILL_EMPTY)
@@ -864,10 +899,7 @@ static void *worker_testapp_validate_rx(void *arg)
 	if (test_type == TEST_TYPE_TEARDOWN)
 		print_verbose("Destroying socket\n");
 
-	if ((test_type != TEST_TYPE_BIDI) || bidi_pass) {
-		xsk_socket__delete(ifobject->xsk->xsk);
-		(void)xsk_umem__delete(ifobject->umem->umem);
-	}
+	testapp_cleanup_xsk_res(ifobject);
 	pthread_exit(NULL);
 }
 
@@ -875,6 +907,7 @@ static void testapp_validate(void)
 {
 	struct timespec max_wait = { 0, 0 };
 	bool bidi = test_type == TEST_TYPE_BIDI;
+	bool bpf = test_type == TEST_TYPE_BPF_RES;
 
 	pthread_attr_init(&attr);
 	pthread_attr_setstacksize(&attr, THREAD_STACK);
@@ -908,7 +941,7 @@ static void testapp_validate(void)
 		free(pkt_buf);
 	}
 
-	if (!(test_type == TEST_TYPE_TEARDOWN) && !bidi && !(test_type == TEST_TYPE_STATS))
+	if (!(test_type == TEST_TYPE_TEARDOWN) && !bidi && !bpf && !(test_type == TEST_TYPE_STATS))
 		print_ksft_result();
 }
 
@@ -950,11 +983,11 @@ static void testapp_bidi(void)
 		sigvar = 0;
 		print_verbose("Creating socket\n");
 		testapp_validate();
-		if (!bidi_pass) {
+		if (!second_step) {
 			print_verbose("Switching Tx/Rx vectors\n");
 			swap_vectors(ifdict[1], ifdict[0]);
 		}
-		bidi_pass++;
+		second_step = true;
 	}
 
 	swap_vectors(ifdict[0], ifdict[1]);
@@ -962,6 +995,36 @@ static void testapp_bidi(void)
 	print_ksft_result();
 }
 
+static void swap_xsk_res(void)
+{
+	xsk_socket__delete(ifdict_tx->xsk->xsk);
+	xsk_umem__delete(ifdict_tx->umem->umem);
+	xsk_socket__delete(ifdict_rx->xsk->xsk);
+	xsk_umem__delete(ifdict_rx->umem->umem);
+	ifdict_tx->umem = ifdict_tx->umem_arr[1];
+	ifdict_tx->xsk = ifdict_tx->xsk_arr[1];
+	ifdict_rx->umem = ifdict_rx->umem_arr[1];
+	ifdict_rx->xsk = ifdict_rx->xsk_arr[1];
+}
+
+static void testapp_bpf_res(void)
+{
+	int i;
+
+	for (i = 0; i < MAX_BPF_ITER; i++) {
+		pkt_counter = 0;
+		prev_pkt = -1;
+		sigvar = 0;
+		print_verbose("Creating socket\n");
+		testapp_validate();
+		if (!second_step)
+			swap_xsk_res();
+		second_step = true;
+	}
+
+	print_ksft_result();
+}
+
 static void testapp_stats(void)
 {
 	for (int i = 0; i < STAT_TEST_TYPE_MAX; i++) {
@@ -1025,13 +1088,15 @@ static void run_pkt_test(int mode, int type)
 	/* reset defaults after potential previous test */
 	xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
 	pkt_counter = 0;
-	bidi_pass = 0;
+	second_step = 0;
 	prev_pkt = -1;
 	sigvar = 0;
 	stat_test_type = -1;
 	rxqsize = XSK_RING_CONS__DEFAULT_NUM_DESCS;
 	frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM;
 
+	configured_mode = mode;
+
 	switch (mode) {
 	case (TEST_MODE_SKB):
 		xdp_flags |= XDP_FLAGS_SKB_MODE;
@@ -1053,6 +1118,9 @@ static void run_pkt_test(int mode, int type)
 	case TEST_TYPE_BIDI:
 		testapp_bidi();
 		break;
+	case TEST_TYPE_BPF_RES:
+		testapp_bpf_res();
+		break;
 	default:
 		testapp_validate();
 		break;
@@ -1062,6 +1130,7 @@ static void run_pkt_test(int mode, int type)
 int main(int argc, char **argv)
 {
 	struct rlimit _rlim = { RLIM_INFINITY, RLIM_INFINITY };
+	bool failure = false;
 	int i, j;
 
 	if (setrlimit(RLIMIT_MEMLOCK, &_rlim))
@@ -1073,6 +1142,16 @@ int main(int argc, char **argv)
 			exit_with_error(errno);
 
 		ifdict[i]->ifdict_index = i;
+		ifdict[i]->xsk_arr = calloc(2, sizeof(struct xsk_socket_info *));
+		if (!ifdict[i]->xsk_arr) {
+			failure = true;
+			goto cleanup;
+		}
+		ifdict[i]->umem_arr = calloc(2, sizeof(struct xsk_umem_info *));
+		if (!ifdict[i]->umem_arr) {
+			failure = true;
+			goto cleanup;
+		}
 	}
 
 	setlocale(LC_ALL, "");
@@ -1093,13 +1172,19 @@ int main(int argc, char **argv)
 			run_pkt_test(i, j);
 	}
 
+	destroy_sync_resources();
+
+cleanup:
 	for (int i = 0; i < MAX_INTERFACES; i++) {
 		if (ifdict[i]->ns_fd != -1)
 			close(ifdict[i]->ns_fd);
+		free(ifdict[i]->xsk_arr);
+		free(ifdict[i]->umem_arr);
 		free(ifdict[i]);
 	}
 
-	destroy_sync_resources();
+	if (failure)
+		exit_with_error(errno);
 
 	ksft_exit_pass();
 
diff --git a/tools/testing/selftests/bpf/xdpxceiver.h b/tools/testing/selftests/bpf/xdpxceiver.h
index e304229d8a4c..e431ecb9bb95 100644
--- a/tools/testing/selftests/bpf/xdpxceiver.h
+++ b/tools/testing/selftests/bpf/xdpxceiver.h
@@ -23,6 +23,7 @@
 #define MAX_SOCKS 1
 #define MAX_TEARDOWN_ITER 10
 #define MAX_BIDI_ITER 2
+#define MAX_BPF_ITER 2
 #define PKT_HDR_SIZE (sizeof(struct ethhdr) + sizeof(struct iphdr) + \
 			sizeof(struct udphdr))
 #define MIN_PKT_SIZE 64
@@ -63,6 +64,7 @@ enum TEST_TYPES {
 	TEST_TYPE_TEARDOWN,
 	TEST_TYPE_BIDI,
 	TEST_TYPE_STATS,
+	TEST_TYPE_BPF_RES,
 	TEST_TYPE_MAX
 };
 
@@ -77,10 +79,9 @@ enum STAT_TEST_TYPES {
 static int configured_mode = TEST_MODE_UNCONFIGURED;
 static u8 debug_pkt_dump;
 static u32 num_frames;
-static u8 bidi_pass;
+static bool second_step;
 static int test_type;
 
-static int opt_queue;
 static int opt_pkt_count;
 static u8 opt_verbose;
 
@@ -128,6 +129,8 @@ struct ifobject {
 	char ifname[MAX_INTERFACE_NAME_CHARS];
 	char nsname[MAX_INTERFACES_NAMESPACE_CHARS];
 	struct xsk_socket_info *xsk;
+	struct xsk_socket_info **xsk_arr;
+	struct xsk_umem_info **umem_arr;
 	struct xsk_umem_info *umem;
 	void *(*func_ptr)(void *arg);
 	struct flow_vector fv;

From 7651910257c8fb1ec76b50bef0330fcf739105c7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= <bjorn.topel@intel.com>
Date: Tue, 30 Mar 2021 00:43:14 +0200
Subject: [PATCH 46/68] selftests: xsk: Remove thread attribute
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There is really no reason to have a non-default thread stack
size. Remove that.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20210329224316.17793-16-maciej.fijalkowski@intel.com
---
 tools/testing/selftests/bpf/xdpxceiver.c | 9 ++-------
 tools/testing/selftests/bpf/xdpxceiver.h | 2 --
 2 files changed, 2 insertions(+), 9 deletions(-)

diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c
index 5c0d6dbd076f..ec09b8fe1178 100644
--- a/tools/testing/selftests/bpf/xdpxceiver.c
+++ b/tools/testing/selftests/bpf/xdpxceiver.c
@@ -764,8 +764,6 @@ static void thread_common_ops(struct ifobject *ifobject, void *bufs)
 	int ctr = 0;
 	int ret;
 
-	pthread_attr_setstacksize(&attr, THREAD_STACK);
-
 	ifobject->ns_fd = switch_namespace(ifobject->nsname);
 
 	if (test_type == TEST_TYPE_BPF_RES)
@@ -909,13 +907,10 @@ static void testapp_validate(void)
 	bool bidi = test_type == TEST_TYPE_BIDI;
 	bool bpf = test_type == TEST_TYPE_BPF_RES;
 
-	pthread_attr_init(&attr);
-	pthread_attr_setstacksize(&attr, THREAD_STACK);
-
 	pthread_mutex_lock(&sync_mutex);
 
 	/*Spawn RX thread */
-	pthread_create(&t0, &attr, ifdict_rx->func_ptr, ifdict_rx);
+	pthread_create(&t0, NULL, ifdict_rx->func_ptr, ifdict_rx);
 
 	if (clock_gettime(CLOCK_REALTIME, &max_wait))
 		exit_with_error(errno);
@@ -927,7 +922,7 @@ static void testapp_validate(void)
 	pthread_mutex_unlock(&sync_mutex);
 
 	/*Spawn TX thread */
-	pthread_create(&t1, &attr, ifdict_tx->func_ptr, ifdict_tx);
+	pthread_create(&t1, NULL, ifdict_tx->func_ptr, ifdict_tx);
 
 	pthread_join(t1, NULL);
 	pthread_join(t0, NULL);
diff --git a/tools/testing/selftests/bpf/xdpxceiver.h b/tools/testing/selftests/bpf/xdpxceiver.h
index e431ecb9bb95..78863820fb81 100644
--- a/tools/testing/selftests/bpf/xdpxceiver.h
+++ b/tools/testing/selftests/bpf/xdpxceiver.h
@@ -37,7 +37,6 @@
 #define TMOUT_SEC (3)
 #define EOT (-1)
 #define USLEEP_MAX 200000
-#define THREAD_STACK 60000000
 #define SOCK_RECONF_CTR 10
 #define BATCH_SIZE 64
 #define POLL_TMOUT 1000
@@ -152,7 +151,6 @@ static struct ifobject *ifdict_tx;
 pthread_mutex_t sync_mutex;
 pthread_cond_t signal_rx_condition;
 pthread_t t0, t1;
-pthread_attr_t attr;
 
 TAILQ_HEAD(head_s, pkt) head = TAILQ_HEAD_INITIALIZER(head);
 struct head_s *head_p;

From 96539f1c5efb0022b94412e8623722aad23dee6b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= <bjorn.topel@intel.com>
Date: Tue, 30 Mar 2021 00:43:15 +0200
Subject: [PATCH 47/68] selftests: xsk: Remove mutex and condition variable
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The usage of the condition variable is broken, and overkill. Replace it
with a pthread barrier.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20210329224316.17793-17-maciej.fijalkowski@intel.com
---
 tools/testing/selftests/bpf/xdpxceiver.c | 33 ++++--------------------
 tools/testing/selftests/bpf/xdpxceiver.h |  3 +--
 2 files changed, 6 insertions(+), 30 deletions(-)

diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c
index ec09b8fe1178..6d87bdf3574a 100644
--- a/tools/testing/selftests/bpf/xdpxceiver.c
+++ b/tools/testing/selftests/bpf/xdpxceiver.c
@@ -126,18 +126,6 @@ static void __exit_with_error(int error, const char *file, const char *func, int
 			       test_type == TEST_TYPE_STATS ? "Stats" : "",\
 			       test_type == TEST_TYPE_BPF_RES ? "BPF RES" : ""))
 
-static void init_sync_resources(void)
-{
-	pthread_mutex_init(&sync_mutex, NULL);
-	pthread_cond_init(&signal_rx_condition, NULL);
-}
-
-static void destroy_sync_resources(void)
-{
-	pthread_mutex_destroy(&sync_mutex);
-	pthread_cond_destroy(&signal_rx_condition);
-}
-
 static void *memset32_htonl(void *dest, u32 val, u32 size)
 {
 	u32 *ptr = (u32 *)dest;
@@ -876,9 +864,7 @@ static void *worker_testapp_validate_rx(void *arg)
 	fds[0].fd = xsk_socket__fd(ifobject->xsk->xsk);
 	fds[0].events = POLLIN;
 
-	pthread_mutex_lock(&sync_mutex);
-	pthread_cond_signal(&signal_rx_condition);
-	pthread_mutex_unlock(&sync_mutex);
+	pthread_barrier_wait(&barr);
 
 	while (1) {
 		if (test_type != TEST_TYPE_STATS) {
@@ -903,23 +889,18 @@ static void *worker_testapp_validate_rx(void *arg)
 
 static void testapp_validate(void)
 {
-	struct timespec max_wait = { 0, 0 };
 	bool bidi = test_type == TEST_TYPE_BIDI;
 	bool bpf = test_type == TEST_TYPE_BPF_RES;
 
-	pthread_mutex_lock(&sync_mutex);
+	if (pthread_barrier_init(&barr, NULL, 2))
+		exit_with_error(errno);
 
 	/*Spawn RX thread */
 	pthread_create(&t0, NULL, ifdict_rx->func_ptr, ifdict_rx);
 
-	if (clock_gettime(CLOCK_REALTIME, &max_wait))
+	pthread_barrier_wait(&barr);
+	if (pthread_barrier_destroy(&barr))
 		exit_with_error(errno);
-	max_wait.tv_sec += TMOUT_SEC;
-
-	if (pthread_cond_timedwait(&signal_rx_condition, &sync_mutex, &max_wait) == ETIMEDOUT)
-		exit_with_error(errno);
-
-	pthread_mutex_unlock(&sync_mutex);
 
 	/*Spawn TX thread */
 	pthread_create(&t1, NULL, ifdict_tx->func_ptr, ifdict_tx);
@@ -1160,15 +1141,11 @@ int main(int argc, char **argv)
 
 	ksft_set_plan(TEST_MODE_MAX * TEST_TYPE_MAX);
 
-	init_sync_resources();
-
 	for (i = 0; i < TEST_MODE_MAX; i++) {
 		for (j = 0; j < TEST_TYPE_MAX; j++)
 			run_pkt_test(i, j);
 	}
 
-	destroy_sync_resources();
-
 cleanup:
 	for (int i = 0; i < MAX_INTERFACES; i++) {
 		if (ifdict[i]->ns_fd != -1)
diff --git a/tools/testing/selftests/bpf/xdpxceiver.h b/tools/testing/selftests/bpf/xdpxceiver.h
index 78863820fb81..ef219c0785eb 100644
--- a/tools/testing/selftests/bpf/xdpxceiver.h
+++ b/tools/testing/selftests/bpf/xdpxceiver.h
@@ -148,8 +148,7 @@ static struct ifobject *ifdict_rx;
 static struct ifobject *ifdict_tx;
 
 /*threads*/
-pthread_mutex_t sync_mutex;
-pthread_cond_t signal_rx_condition;
+pthread_barrier_t barr;
 pthread_t t0, t1;
 
 TAILQ_HEAD(head_s, pkt) head = TAILQ_HEAD_INITIALIZER(head);

From ae6b6a17800f34dd5215286b44a4e99a0a1cf862 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= <bjorn.topel@intel.com>
Date: Tue, 30 Mar 2021 00:43:16 +0200
Subject: [PATCH 48/68] selftests: xsk: Remove unused defines
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Remove two unused defines.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20210329224316.17793-18-maciej.fijalkowski@intel.com
---
 tools/testing/selftests/bpf/xdpxceiver.c | 7 +++----
 tools/testing/selftests/bpf/xdpxceiver.h | 2 --
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c
index 6d87bdf3574a..1135fb980814 100644
--- a/tools/testing/selftests/bpf/xdpxceiver.c
+++ b/tools/testing/selftests/bpf/xdpxceiver.c
@@ -456,7 +456,7 @@ static void complete_tx_only(struct xsk_socket_info *xsk, int batch_size)
 	if (!xsk->outstanding_tx)
 		return;
 
-	if (!NEED_WAKEUP || xsk_ring_prod__needs_wakeup(&xsk->tx))
+	if (xsk_ring_prod__needs_wakeup(&xsk->tx))
 		kick_tx(xsk);
 
 	rcvd = xsk_ring_cons__peek(&xsk->umem->cq, batch_size, &idx);
@@ -544,9 +544,8 @@ static void tx_only(struct xsk_socket_info *xsk, u32 *frameptr, int batch_size)
 	xsk_ring_prod__submit(&xsk->tx, batch_size);
 	if (!tx_invalid_test) {
 		xsk->outstanding_tx += batch_size;
-	} else {
-		if (!NEED_WAKEUP || xsk_ring_prod__needs_wakeup(&xsk->tx))
-			kick_tx(xsk);
+	} else if (xsk_ring_prod__needs_wakeup(&xsk->tx)) {
+		kick_tx(xsk);
 	}
 	*frameptr += batch_size;
 	*frameptr %= num_frames;
diff --git a/tools/testing/selftests/bpf/xdpxceiver.h b/tools/testing/selftests/bpf/xdpxceiver.h
index ef219c0785eb..6c428b276ab6 100644
--- a/tools/testing/selftests/bpf/xdpxceiver.h
+++ b/tools/testing/selftests/bpf/xdpxceiver.h
@@ -34,13 +34,11 @@
 #define IP_PKT_TOS 0x9
 #define UDP_PKT_SIZE (IP_PKT_SIZE - sizeof(struct iphdr))
 #define UDP_PKT_DATA_SIZE (UDP_PKT_SIZE - sizeof(struct udphdr))
-#define TMOUT_SEC (3)
 #define EOT (-1)
 #define USLEEP_MAX 200000
 #define SOCK_RECONF_CTR 10
 #define BATCH_SIZE 64
 #define POLL_TMOUT 1000
-#define NEED_WAKEUP true
 #define DEFAULT_PKT_CNT 10000
 #define RX_FULL_RXQSIZE 32
 

From 913d55037616659c04763e756f948fcbaef0bbee Mon Sep 17 00:00:00 2001
From: He Fengqing <hefengqing@huawei.com>
Date: Tue, 30 Mar 2021 02:48:43 +0000
Subject: [PATCH 49/68] bpf: Remove unused bpf_load_pointer

Remove unused bpf_load_pointer function in filter.h. The last user of it has
been removed with 24dea04767e6 ("bpf, x32: remove ld_abs/ld_ind").

Signed-off-by: He Fengqing <hefengqing@huawei.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Song Liu <songliubraving@fb.com>
Link: https://lore.kernel.org/bpf/20210330024843.3479844-1-hefengqing@huawei.com
---
 include/linux/filter.h | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index eecfd82db648..9a09547bc7ba 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1246,15 +1246,6 @@ static inline u16 bpf_anc_helper(const struct sock_filter *ftest)
 void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb,
 					   int k, unsigned int size);
 
-static inline void *bpf_load_pointer(const struct sk_buff *skb, int k,
-				     unsigned int size, void *buffer)
-{
-	if (k >= 0)
-		return skb_header_pointer(skb, k, size, buffer);
-
-	return bpf_internal_load_pointer_neg_helper(skb, k, size);
-}
-
 static inline int bpf_tell_extensions(void)
 {
 	return SKF_AD_MAX;

From 235fc0e36d35270e1ff426f26bbea6154b8863a5 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Fri, 26 Mar 2021 19:43:48 +0000
Subject: [PATCH 50/68] bpf: Remove redundant assignment of variable id

The variable id is being assigned a value that is never read, the
assignment is redundant and can be removed.

Addresses-Coverity: ("Unused value")
Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Song Liu <songliubraving@fb.com>
Link: https://lore.kernel.org/bpf/20210326194348.623782-1-colin.king@canonical.com
---
 kernel/bpf/btf.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index ec8afc4bc560..0600ed325fa0 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -789,7 +789,6 @@ static const struct btf_type *btf_type_skip_qualifiers(const struct btf *btf,
 
 	while (btf_type_is_modifier(t) &&
 	       BTF_INFO_KIND(t->info) != BTF_KIND_TYPEDEF) {
-		id = t->type;
 		t = btf_type_by_id(btf, t->type);
 	}
 

From 63f8af0fc34197a276674fa0d4d865aeff1f0172 Mon Sep 17 00:00:00 2001
From: KP Singh <kpsingh@kernel.org>
Date: Tue, 23 Mar 2021 01:47:52 +0000
Subject: [PATCH 51/68] selftests/bpf: Add an option for a debug shell in
 vmtest.sh

The newly introduced -s command line option starts an interactive shell.
If a command is specified, the shell is started after the command
finishes executing. It's useful to have a shell especially when
debugging failing tests or developing new tests.

Since the user may terminate the VM forcefully, an extra "sync" is added
after the execution of the command to persist any logs from the command
into the log file.

Signed-off-by: KP Singh <kpsingh@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20210323014752.3198283-1-kpsingh@kernel.org
---
 tools/testing/selftests/bpf/vmtest.sh | 39 +++++++++++++++++++--------
 1 file changed, 28 insertions(+), 11 deletions(-)

diff --git a/tools/testing/selftests/bpf/vmtest.sh b/tools/testing/selftests/bpf/vmtest.sh
index 22554894db99..8889b3f55236 100755
--- a/tools/testing/selftests/bpf/vmtest.sh
+++ b/tools/testing/selftests/bpf/vmtest.sh
@@ -24,15 +24,15 @@ EXIT_STATUS_FILE="${LOG_FILE_BASE}.exit_status"
 usage()
 {
 	cat <<EOF
-Usage: $0 [-i] [-d <output_dir>] -- [<command>]
+Usage: $0 [-i] [-s] [-d <output_dir>] -- [<command>]
 
 <command> is the command you would normally run when you are in
 tools/testing/selftests/bpf. e.g:
 
 	$0 -- ./test_progs -t test_lsm
 
-If no command is specified, "${DEFAULT_COMMAND}" will be run by
-default.
+If no command is specified and a debug shell (-s) is not requested,
+"${DEFAULT_COMMAND}" will be run by default.
 
 If you build your kernel using KBUILD_OUTPUT= or O= options, these
 can be passed as environment variables to the script:
@@ -49,6 +49,9 @@ Options:
 	-d)		Update the output directory (default: ${OUTPUT_DIR})
 	-j)		Number of jobs for compilation, similar to -j in make
 			(default: ${NUM_COMPILE_JOBS})
+	-s)		Instead of powering off the VM, start an interactive
+			shell. If <command> is specified, the shell runs after
+			the command finishes executing
 EOF
 }
 
@@ -149,6 +152,7 @@ update_init_script()
 	local init_script_dir="${OUTPUT_DIR}/${MOUNT_DIR}/etc/rcS.d"
 	local init_script="${init_script_dir}/S50-startup"
 	local command="$1"
+	local exit_command="$2"
 
 	mount_image
 
@@ -162,9 +166,10 @@ EOF
 
 	fi
 
-	sudo bash -c "cat >${init_script}" <<EOF
-#!/bin/bash
+	sudo bash -c "echo '#!/bin/bash' > ${init_script}"
 
+	if [[ "${command}" != "" ]]; then
+		sudo bash -c "cat >>${init_script}" <<EOF
 # Have a default value in the exit status file
 # incase the VM is forcefully stopped.
 echo "130" > "/root/${EXIT_STATUS_FILE}"
@@ -175,9 +180,12 @@ echo "130" > "/root/${EXIT_STATUS_FILE}"
 	stdbuf -oL -eL ${command}
 	echo "\$?" > "/root/${EXIT_STATUS_FILE}"
 } 2>&1 | tee "/root/${LOG_FILE}"
-poweroff -f
+# Ensure that the logs are written to disk
+sync
 EOF
+	fi
 
+	sudo bash -c "echo ${exit_command} >> ${init_script}"
 	sudo chmod a+x "${init_script}"
 	unmount_image
 }
@@ -277,8 +285,10 @@ main()
 	local kernel_bzimage="${kernel_checkout}/${X86_BZIMAGE}"
 	local command="${DEFAULT_COMMAND}"
 	local update_image="no"
+	local exit_command="poweroff -f"
+	local debug_shell="no"
 
-	while getopts 'hkid:j:' opt; do
+	while getopts 'hskid:j:' opt; do
 		case ${opt} in
 		i)
 			update_image="yes"
@@ -289,6 +299,11 @@ main()
 		j)
 			NUM_COMPILE_JOBS="$OPTARG"
 			;;
+		s)
+			command=""
+			debug_shell="yes"
+			exit_command="bash"
+			;;
 		h)
 			usage
 			exit 0
@@ -307,7 +322,7 @@ main()
 	done
 	shift $((OPTIND -1))
 
-	if [[ $# -eq 0 ]]; then
+	if [[ $# -eq 0  && "${debug_shell}" == "no" ]]; then
 		echo "No command specified, will run ${DEFAULT_COMMAND} in the vm"
 	else
 		command="$@"
@@ -355,10 +370,12 @@ main()
 	fi
 
 	update_selftests "${kernel_checkout}" "${make_command}"
-	update_init_script "${command}"
+	update_init_script "${command}" "${exit_command}"
 	run_vm "${kernel_bzimage}"
-	copy_logs
-	echo "Logs saved in ${OUTPUT_DIR}/${LOG_FILE}"
+	if [[ "${command}" != "" ]]; then
+		copy_logs
+		echo "Logs saved in ${OUTPUT_DIR}/${LOG_FILE}"
+	fi
 }
 
 catch()

From e27bfefb21f28d5295432f042b5d9d7871100c35 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Mon, 29 Mar 2021 15:31:43 -0700
Subject: [PATCH 52/68] tools/resolve_btfids: Fix warnings

* make eprintf static, used only in main.c
* initialize ret in eprintf
* remove unused *tmp

v3:
* remove another err (Song Liu)

v2:
* remove unused 'int err = -1'

Signed-off-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Song Liu <songliubraving@fb.com>
Link: https://lore.kernel.org/bpf/20210329223143.3659983-1-sdf@google.com
---
 tools/bpf/resolve_btfids/main.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c
index 80d966cfcaa1..7550fd9c3188 100644
--- a/tools/bpf/resolve_btfids/main.c
+++ b/tools/bpf/resolve_btfids/main.c
@@ -115,10 +115,10 @@ struct object {
 
 static int verbose;
 
-int eprintf(int level, int var, const char *fmt, ...)
+static int eprintf(int level, int var, const char *fmt, ...)
 {
 	va_list args;
-	int ret;
+	int ret = 0;
 
 	if (var >= level) {
 		va_start(args, fmt);
@@ -385,7 +385,7 @@ static int elf_collect(struct object *obj)
 static int symbols_collect(struct object *obj)
 {
 	Elf_Scn *scn = NULL;
-	int n, i, err = 0;
+	int n, i;
 	GElf_Shdr sh;
 	char *name;
 
@@ -402,11 +402,10 @@ static int symbols_collect(struct object *obj)
 	 * Scan symbols and look for the ones starting with
 	 * __BTF_ID__* over .BTF_ids section.
 	 */
-	for (i = 0; !err && i < n; i++) {
-		char *tmp, *prefix;
+	for (i = 0; i < n; i++) {
+		char *prefix;
 		struct btf_id *id;
 		GElf_Sym sym;
-		int err = -1;
 
 		if (!gelf_getsym(obj->efile.symbols, i, &sym))
 			return -1;

From 37f0e514db660f03f8982b8f4fbbd4b2740abe7d Mon Sep 17 00:00:00 2001
From: Cong Wang <cong.wang@bytedance.com>
Date: Tue, 30 Mar 2021 19:32:22 -0700
Subject: [PATCH 53/68] skmsg: Lock ingress_skb when purging

Currently we purge the ingress_skb queue only when psock
refcnt goes down to 0, so locking the queue is not necessary,
but in order to be called during ->close, we have to lock it
here.

Signed-off-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Jakub Sitnicki <jakub@cloudflare.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20210331023237.41094-2-xiyou.wangcong@gmail.com
---
 net/core/skmsg.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 07f54015238a..bebf84ed4e30 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -634,7 +634,7 @@ static void sk_psock_zap_ingress(struct sk_psock *psock)
 {
 	struct sk_buff *skb;
 
-	while ((skb = __skb_dequeue(&psock->ingress_skb)) != NULL) {
+	while ((skb = skb_dequeue(&psock->ingress_skb)) != NULL) {
 		skb_bpf_redirect_clear(skb);
 		kfree_skb(skb);
 	}

From b01fd6e802b6d0a635176f943315670b679d8d7b Mon Sep 17 00:00:00 2001
From: Cong Wang <cong.wang@bytedance.com>
Date: Tue, 30 Mar 2021 19:32:23 -0700
Subject: [PATCH 54/68] skmsg: Introduce a spinlock to protect ingress_msg

Currently we rely on lock_sock to protect ingress_msg,
it is too big for this, we can actually just use a spinlock
to protect this list like protecting other skb queues.

__tcp_bpf_recvmsg() is still special because of peeking,
it still has to use lock_sock.

Signed-off-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Jakub Sitnicki <jakub@cloudflare.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20210331023237.41094-3-xiyou.wangcong@gmail.com
---
 include/linux/skmsg.h | 46 +++++++++++++++++++++++++++++++++++++++++++
 net/core/skmsg.c      |  3 +++
 net/ipv4/tcp_bpf.c    | 18 ++++++-----------
 3 files changed, 55 insertions(+), 12 deletions(-)

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index 6c09d94be2e9..f2d45a73b2b2 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -89,6 +89,7 @@ struct sk_psock {
 #endif
 	struct sk_buff_head		ingress_skb;
 	struct list_head		ingress_msg;
+	spinlock_t			ingress_lock;
 	unsigned long			state;
 	struct list_head		link;
 	spinlock_t			link_lock;
@@ -284,7 +285,45 @@ static inline struct sk_psock *sk_psock(const struct sock *sk)
 static inline void sk_psock_queue_msg(struct sk_psock *psock,
 				      struct sk_msg *msg)
 {
+	spin_lock_bh(&psock->ingress_lock);
 	list_add_tail(&msg->list, &psock->ingress_msg);
+	spin_unlock_bh(&psock->ingress_lock);
+}
+
+static inline struct sk_msg *sk_psock_dequeue_msg(struct sk_psock *psock)
+{
+	struct sk_msg *msg;
+
+	spin_lock_bh(&psock->ingress_lock);
+	msg = list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list);
+	if (msg)
+		list_del(&msg->list);
+	spin_unlock_bh(&psock->ingress_lock);
+	return msg;
+}
+
+static inline struct sk_msg *sk_psock_peek_msg(struct sk_psock *psock)
+{
+	struct sk_msg *msg;
+
+	spin_lock_bh(&psock->ingress_lock);
+	msg = list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list);
+	spin_unlock_bh(&psock->ingress_lock);
+	return msg;
+}
+
+static inline struct sk_msg *sk_psock_next_msg(struct sk_psock *psock,
+					       struct sk_msg *msg)
+{
+	struct sk_msg *ret;
+
+	spin_lock_bh(&psock->ingress_lock);
+	if (list_is_last(&msg->list, &psock->ingress_msg))
+		ret = NULL;
+	else
+		ret = list_next_entry(msg, list);
+	spin_unlock_bh(&psock->ingress_lock);
+	return ret;
 }
 
 static inline bool sk_psock_queue_empty(const struct sk_psock *psock)
@@ -292,6 +331,13 @@ static inline bool sk_psock_queue_empty(const struct sk_psock *psock)
 	return psock ? list_empty(&psock->ingress_msg) : true;
 }
 
+static inline void kfree_sk_msg(struct sk_msg *msg)
+{
+	if (msg->skb)
+		consume_skb(msg->skb);
+	kfree(msg);
+}
+
 static inline void sk_psock_report_error(struct sk_psock *psock, int err)
 {
 	struct sock *sk = psock->sk;
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index bebf84ed4e30..305dddc51857 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -592,6 +592,7 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node)
 
 	INIT_WORK(&psock->work, sk_psock_backlog);
 	INIT_LIST_HEAD(&psock->ingress_msg);
+	spin_lock_init(&psock->ingress_lock);
 	skb_queue_head_init(&psock->ingress_skb);
 
 	sk_psock_set_state(psock, SK_PSOCK_TX_ENABLED);
@@ -638,7 +639,9 @@ static void sk_psock_zap_ingress(struct sk_psock *psock)
 		skb_bpf_redirect_clear(skb);
 		kfree_skb(skb);
 	}
+	spin_lock_bh(&psock->ingress_lock);
 	__sk_psock_purge_ingress_msg(psock);
+	spin_unlock_bh(&psock->ingress_lock);
 }
 
 static void sk_psock_link_destroy(struct sk_psock *psock)
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index 17c322b875fd..ae980716d896 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -18,9 +18,7 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
 	struct sk_msg *msg_rx;
 	int i, copied = 0;
 
-	msg_rx = list_first_entry_or_null(&psock->ingress_msg,
-					  struct sk_msg, list);
-
+	msg_rx = sk_psock_peek_msg(psock);
 	while (copied != len) {
 		struct scatterlist *sge;
 
@@ -68,22 +66,18 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
 		} while (i != msg_rx->sg.end);
 
 		if (unlikely(peek)) {
-			if (msg_rx == list_last_entry(&psock->ingress_msg,
-						      struct sk_msg, list))
+			msg_rx = sk_psock_next_msg(psock, msg_rx);
+			if (!msg_rx)
 				break;
-			msg_rx = list_next_entry(msg_rx, list);
 			continue;
 		}
 
 		msg_rx->sg.start = i;
 		if (!sge->length && msg_rx->sg.start == msg_rx->sg.end) {
-			list_del(&msg_rx->list);
-			if (msg_rx->skb)
-				consume_skb(msg_rx->skb);
-			kfree(msg_rx);
+			msg_rx = sk_psock_dequeue_msg(psock);
+			kfree_sk_msg(msg_rx);
 		}
-		msg_rx = list_first_entry_or_null(&psock->ingress_msg,
-						  struct sk_msg, list);
+		msg_rx = sk_psock_peek_msg(psock);
 	}
 
 	return copied;

From 0739cd28f2645e814586c7536ba5da9825cb8029 Mon Sep 17 00:00:00 2001
From: Cong Wang <cong.wang@bytedance.com>
Date: Tue, 30 Mar 2021 19:32:24 -0700
Subject: [PATCH 55/68] net: Introduce skb_send_sock() for sock_map

We only have skb_send_sock_locked() which requires callers
to use lock_sock(). Introduce a variant skb_send_sock()
which locks on its own, callers do not need to lock it
any more. This will save us from adding a ->sendmsg_locked
for each protocol.

To reuse the code, pass function pointers to __skb_send_sock()
and build skb_send_sock() and skb_send_sock_locked() on top.

Signed-off-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Reviewed-by: Jakub Sitnicki <jakub@cloudflare.com>
Link: https://lore.kernel.org/bpf/20210331023237.41094-4-xiyou.wangcong@gmail.com
---
 include/linux/skbuff.h |  1 +
 net/core/skbuff.c      | 55 ++++++++++++++++++++++++++++++++++++------
 2 files changed, 49 insertions(+), 7 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index c8def85fcc22..dbf820a50a39 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3626,6 +3626,7 @@ int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset,
 		    unsigned int flags);
 int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset,
 			 int len);
+int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len);
 void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to);
 unsigned int skb_zerocopy_headlen(const struct sk_buff *from);
 int skb_zerocopy(struct sk_buff *to, struct sk_buff *from,
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index e8320b5d651a..3ad9e8425ab2 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2500,9 +2500,32 @@ int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset,
 }
 EXPORT_SYMBOL_GPL(skb_splice_bits);
 
-/* Send skb data on a socket. Socket must be locked. */
-int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset,
-			 int len)
+static int sendmsg_unlocked(struct sock *sk, struct msghdr *msg,
+			    struct kvec *vec, size_t num, size_t size)
+{
+	struct socket *sock = sk->sk_socket;
+
+	if (!sock)
+		return -EINVAL;
+	return kernel_sendmsg(sock, msg, vec, num, size);
+}
+
+static int sendpage_unlocked(struct sock *sk, struct page *page, int offset,
+			     size_t size, int flags)
+{
+	struct socket *sock = sk->sk_socket;
+
+	if (!sock)
+		return -EINVAL;
+	return kernel_sendpage(sock, page, offset, size, flags);
+}
+
+typedef int (*sendmsg_func)(struct sock *sk, struct msghdr *msg,
+			    struct kvec *vec, size_t num, size_t size);
+typedef int (*sendpage_func)(struct sock *sk, struct page *page, int offset,
+			     size_t size, int flags);
+static int __skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset,
+			   int len, sendmsg_func sendmsg, sendpage_func sendpage)
 {
 	unsigned int orig_len = len;
 	struct sk_buff *head = skb;
@@ -2522,7 +2545,8 @@ int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset,
 		memset(&msg, 0, sizeof(msg));
 		msg.msg_flags = MSG_DONTWAIT;
 
-		ret = kernel_sendmsg_locked(sk, &msg, &kv, 1, slen);
+		ret = INDIRECT_CALL_2(sendmsg, kernel_sendmsg_locked,
+				      sendmsg_unlocked, sk, &msg, &kv, 1, slen);
 		if (ret <= 0)
 			goto error;
 
@@ -2553,9 +2577,11 @@ int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset,
 		slen = min_t(size_t, len, skb_frag_size(frag) - offset);
 
 		while (slen) {
-			ret = kernel_sendpage_locked(sk, skb_frag_page(frag),
-						     skb_frag_off(frag) + offset,
-						     slen, MSG_DONTWAIT);
+			ret = INDIRECT_CALL_2(sendpage, kernel_sendpage_locked,
+					      sendpage_unlocked, sk,
+					      skb_frag_page(frag),
+					      skb_frag_off(frag) + offset,
+					      slen, MSG_DONTWAIT);
 			if (ret <= 0)
 				goto error;
 
@@ -2587,8 +2613,23 @@ int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset,
 error:
 	return orig_len == len ? ret : orig_len - len;
 }
+
+/* Send skb data on a socket. Socket must be locked. */
+int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset,
+			 int len)
+{
+	return __skb_send_sock(sk, skb, offset, len, kernel_sendmsg_locked,
+			       kernel_sendpage_locked);
+}
 EXPORT_SYMBOL_GPL(skb_send_sock_locked);
 
+/* Send skb data on a socket. Socket must be unlocked. */
+int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len)
+{
+	return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked,
+			       sendpage_unlocked);
+}
+
 /**
  *	skb_store_bits - store bits from kernel buffer to skb
  *	@skb: destination buffer

From 799aa7f98d53e0f541fa6b4dc9aa47b4ff2178e3 Mon Sep 17 00:00:00 2001
From: Cong Wang <cong.wang@bytedance.com>
Date: Tue, 30 Mar 2021 19:32:25 -0700
Subject: [PATCH 56/68] skmsg: Avoid lock_sock() in sk_psock_backlog()

We do not have to lock the sock to avoid losing sk_socket,
instead we can purge all the ingress queues when we close
the socket. Sending or receiving packets after orphaning
socket makes no sense.

We do purge these queues when psock refcnt reaches zero but
here we want to purge them explicitly in sock_map_close().
There are also some nasty race conditions on testing bit
SK_PSOCK_TX_ENABLED and queuing/canceling the psock work,
we can expand psock->ingress_lock a bit to protect them too.

As noticed by John, we still have to lock the psock->work,
because the same work item could be running concurrently on
different CPU's.

Signed-off-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20210331023237.41094-5-xiyou.wangcong@gmail.com
---
 include/linux/skmsg.h |  2 ++
 net/core/skmsg.c      | 50 +++++++++++++++++++++++++++++--------------
 net/core/sock_map.c   |  1 +
 3 files changed, 37 insertions(+), 16 deletions(-)

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index f2d45a73b2b2..7382c4b518d7 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -99,6 +99,7 @@ struct sk_psock {
 	void (*saved_write_space)(struct sock *sk);
 	void (*saved_data_ready)(struct sock *sk);
 	struct proto			*sk_proto;
+	struct mutex			work_mutex;
 	struct sk_psock_work_state	work_state;
 	struct work_struct		work;
 	union {
@@ -347,6 +348,7 @@ static inline void sk_psock_report_error(struct sk_psock *psock, int err)
 }
 
 struct sk_psock *sk_psock_init(struct sock *sk, int node);
+void sk_psock_stop(struct sk_psock *psock, bool wait);
 
 #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
 int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock);
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 305dddc51857..9c25020086a9 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -497,7 +497,7 @@ static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb,
 	if (!ingress) {
 		if (!sock_writeable(psock->sk))
 			return -EAGAIN;
-		return skb_send_sock_locked(psock->sk, skb, off, len);
+		return skb_send_sock(psock->sk, skb, off, len);
 	}
 	return sk_psock_skb_ingress(psock, skb);
 }
@@ -511,8 +511,7 @@ static void sk_psock_backlog(struct work_struct *work)
 	u32 len, off;
 	int ret;
 
-	/* Lock sock to avoid losing sk_socket during loop. */
-	lock_sock(psock->sk);
+	mutex_lock(&psock->work_mutex);
 	if (state->skb) {
 		skb = state->skb;
 		len = state->len;
@@ -529,7 +528,7 @@ static void sk_psock_backlog(struct work_struct *work)
 		skb_bpf_redirect_clear(skb);
 		do {
 			ret = -EIO;
-			if (likely(psock->sk->sk_socket))
+			if (!sock_flag(psock->sk, SOCK_DEAD))
 				ret = sk_psock_handle_skb(psock, skb, off,
 							  len, ingress);
 			if (ret <= 0) {
@@ -553,7 +552,7 @@ static void sk_psock_backlog(struct work_struct *work)
 			kfree_skb(skb);
 	}
 end:
-	release_sock(psock->sk);
+	mutex_unlock(&psock->work_mutex);
 }
 
 struct sk_psock *sk_psock_init(struct sock *sk, int node)
@@ -591,6 +590,7 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node)
 	spin_lock_init(&psock->link_lock);
 
 	INIT_WORK(&psock->work, sk_psock_backlog);
+	mutex_init(&psock->work_mutex);
 	INIT_LIST_HEAD(&psock->ingress_msg);
 	spin_lock_init(&psock->ingress_lock);
 	skb_queue_head_init(&psock->ingress_skb);
@@ -631,7 +631,7 @@ static void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
 	}
 }
 
-static void sk_psock_zap_ingress(struct sk_psock *psock)
+static void __sk_psock_zap_ingress(struct sk_psock *psock)
 {
 	struct sk_buff *skb;
 
@@ -639,9 +639,7 @@ static void sk_psock_zap_ingress(struct sk_psock *psock)
 		skb_bpf_redirect_clear(skb);
 		kfree_skb(skb);
 	}
-	spin_lock_bh(&psock->ingress_lock);
 	__sk_psock_purge_ingress_msg(psock);
-	spin_unlock_bh(&psock->ingress_lock);
 }
 
 static void sk_psock_link_destroy(struct sk_psock *psock)
@@ -654,6 +652,18 @@ static void sk_psock_link_destroy(struct sk_psock *psock)
 	}
 }
 
+void sk_psock_stop(struct sk_psock *psock, bool wait)
+{
+	spin_lock_bh(&psock->ingress_lock);
+	sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
+	sk_psock_cork_free(psock);
+	__sk_psock_zap_ingress(psock);
+	spin_unlock_bh(&psock->ingress_lock);
+
+	if (wait)
+		cancel_work_sync(&psock->work);
+}
+
 static void sk_psock_done_strp(struct sk_psock *psock);
 
 static void sk_psock_destroy_deferred(struct work_struct *gc)
@@ -665,12 +675,12 @@ static void sk_psock_destroy_deferred(struct work_struct *gc)
 	sk_psock_done_strp(psock);
 
 	cancel_work_sync(&psock->work);
+	mutex_destroy(&psock->work_mutex);
 
 	psock_progs_drop(&psock->progs);
 
 	sk_psock_link_destroy(psock);
 	sk_psock_cork_free(psock);
-	sk_psock_zap_ingress(psock);
 
 	if (psock->sk_redir)
 		sock_put(psock->sk_redir);
@@ -688,8 +698,7 @@ static void sk_psock_destroy(struct rcu_head *rcu)
 
 void sk_psock_drop(struct sock *sk, struct sk_psock *psock)
 {
-	sk_psock_cork_free(psock);
-	sk_psock_zap_ingress(psock);
+	sk_psock_stop(psock, false);
 
 	write_lock_bh(&sk->sk_callback_lock);
 	sk_psock_restore_proto(sk, psock);
@@ -699,7 +708,6 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock)
 	else if (psock->progs.stream_verdict)
 		sk_psock_stop_verdict(sk, psock);
 	write_unlock_bh(&sk->sk_callback_lock);
-	sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
 
 	call_rcu(&psock->rcu, sk_psock_destroy);
 }
@@ -770,14 +778,20 @@ static void sk_psock_skb_redirect(struct sk_buff *skb)
 	 * error that caused the pipe to break. We can't send a packet on
 	 * a socket that is in this state so we drop the skb.
 	 */
-	if (!psock_other || sock_flag(sk_other, SOCK_DEAD) ||
-	    !sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) {
+	if (!psock_other || sock_flag(sk_other, SOCK_DEAD)) {
+		kfree_skb(skb);
+		return;
+	}
+	spin_lock_bh(&psock_other->ingress_lock);
+	if (!sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) {
+		spin_unlock_bh(&psock_other->ingress_lock);
 		kfree_skb(skb);
 		return;
 	}
 
 	skb_queue_tail(&psock_other->ingress_skb, skb);
 	schedule_work(&psock_other->work);
+	spin_unlock_bh(&psock_other->ingress_lock);
 }
 
 static void sk_psock_tls_verdict_apply(struct sk_buff *skb, struct sock *sk, int verdict)
@@ -845,8 +859,12 @@ static void sk_psock_verdict_apply(struct sk_psock *psock,
 			err = sk_psock_skb_ingress_self(psock, skb);
 		}
 		if (err < 0) {
-			skb_queue_tail(&psock->ingress_skb, skb);
-			schedule_work(&psock->work);
+			spin_lock_bh(&psock->ingress_lock);
+			if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
+				skb_queue_tail(&psock->ingress_skb, skb);
+				schedule_work(&psock->work);
+			}
+			spin_unlock_bh(&psock->ingress_lock);
 		}
 		break;
 	case __SK_REDIRECT:
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index dd53a7771d7e..e564fdeaada1 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -1540,6 +1540,7 @@ void sock_map_close(struct sock *sk, long timeout)
 	saved_close = psock->saved_close;
 	sock_map_remove_links(sk, psock);
 	rcu_read_unlock();
+	sk_psock_stop(psock, true);
 	release_sock(sk);
 	saved_close(sk, timeout);
 }

From 7786dfc41a74e0567557b5c4a28fc8482f5f5691 Mon Sep 17 00:00:00 2001
From: Cong Wang <cong.wang@bytedance.com>
Date: Tue, 30 Mar 2021 19:32:26 -0700
Subject: [PATCH 57/68] skmsg: Use rcu work for destroying psock

The RCU callback sk_psock_destroy() only queues work psock->gc,
so we can just switch to rcu work to simplify the code.

Signed-off-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20210331023237.41094-6-xiyou.wangcong@gmail.com
---
 include/linux/skmsg.h |  5 +----
 net/core/skmsg.c      | 17 +++++------------
 2 files changed, 6 insertions(+), 16 deletions(-)

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index 7382c4b518d7..e7aba150539d 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -102,10 +102,7 @@ struct sk_psock {
 	struct mutex			work_mutex;
 	struct sk_psock_work_state	work_state;
 	struct work_struct		work;
-	union {
-		struct rcu_head		rcu;
-		struct work_struct	gc;
-	};
+	struct rcu_work			rwork;
 };
 
 int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len,
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 9c25020086a9..d43d43905d2c 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -666,10 +666,10 @@ void sk_psock_stop(struct sk_psock *psock, bool wait)
 
 static void sk_psock_done_strp(struct sk_psock *psock);
 
-static void sk_psock_destroy_deferred(struct work_struct *gc)
+static void sk_psock_destroy(struct work_struct *work)
 {
-	struct sk_psock *psock = container_of(gc, struct sk_psock, gc);
-
+	struct sk_psock *psock = container_of(to_rcu_work(work),
+					      struct sk_psock, rwork);
 	/* No sk_callback_lock since already detached. */
 
 	sk_psock_done_strp(psock);
@@ -688,14 +688,6 @@ static void sk_psock_destroy_deferred(struct work_struct *gc)
 	kfree(psock);
 }
 
-static void sk_psock_destroy(struct rcu_head *rcu)
-{
-	struct sk_psock *psock = container_of(rcu, struct sk_psock, rcu);
-
-	INIT_WORK(&psock->gc, sk_psock_destroy_deferred);
-	schedule_work(&psock->gc);
-}
-
 void sk_psock_drop(struct sock *sk, struct sk_psock *psock)
 {
 	sk_psock_stop(psock, false);
@@ -709,7 +701,8 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock)
 		sk_psock_stop_verdict(sk, psock);
 	write_unlock_bh(&sk->sk_callback_lock);
 
-	call_rcu(&psock->rcu, sk_psock_destroy);
+	INIT_RCU_WORK(&psock->rwork, sk_psock_destroy);
+	queue_rcu_work(system_wq, &psock->rwork);
 }
 EXPORT_SYMBOL_GPL(sk_psock_drop);
 

From 190179f65ba8bc18dc1d38435b7932505ca5544f Mon Sep 17 00:00:00 2001
From: Cong Wang <cong.wang@bytedance.com>
Date: Tue, 30 Mar 2021 19:32:27 -0700
Subject: [PATCH 58/68] skmsg: Use GFP_KERNEL in sk_psock_create_ingress_msg()

This function is only called in process context.

Signed-off-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20210331023237.41094-7-xiyou.wangcong@gmail.com
---
 net/core/skmsg.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index d43d43905d2c..656eceab73bc 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -410,7 +410,7 @@ static struct sk_msg *sk_psock_create_ingress_msg(struct sock *sk,
 	if (!sk_rmem_schedule(sk, skb, skb->truesize))
 		return NULL;
 
-	msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_ATOMIC);
+	msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_KERNEL);
 	if (unlikely(!msg))
 		return NULL;
 

From 2004fdbd8a2b56757691717639f86d0eea3ab5b4 Mon Sep 17 00:00:00 2001
From: Cong Wang <cong.wang@bytedance.com>
Date: Tue, 30 Mar 2021 19:32:28 -0700
Subject: [PATCH 59/68] sock_map: Simplify sock_map_link() a bit

sock_map_link() passes down map progs, but it is confusing
to see both map progs and psock progs. Make the map progs
more obvious by retrieving it directly with sock_map_progs()
inside sock_map_link(). Now it is aligned with
sock_map_link_no_progs() too.

Signed-off-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20210331023237.41094-8-xiyou.wangcong@gmail.com
---
 net/core/sock_map.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index e564fdeaada1..d06face0f16c 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -26,6 +26,7 @@ struct bpf_stab {
 
 static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
 				struct bpf_prog *old, u32 which);
+static struct sk_psock_progs *sock_map_progs(struct bpf_map *map);
 
 static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
 {
@@ -224,10 +225,10 @@ static struct sk_psock *sock_map_psock_get_checked(struct sock *sk)
 	return psock;
 }
 
-static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
-			 struct sock *sk)
+static int sock_map_link(struct bpf_map *map, struct sock *sk)
 {
 	struct bpf_prog *msg_parser, *stream_parser, *stream_verdict;
+	struct sk_psock_progs *progs = sock_map_progs(map);
 	struct sk_psock *psock;
 	int ret;
 
@@ -492,7 +493,7 @@ static int sock_map_update_common(struct bpf_map *map, u32 idx,
 	 * and sk_write_space callbacks overridden.
 	 */
 	if (sock_map_redirect_allowed(sk))
-		ret = sock_map_link(map, &stab->progs, sk);
+		ret = sock_map_link(map, sk);
 	else
 		ret = sock_map_link_no_progs(map, sk);
 	if (ret < 0)
@@ -1004,7 +1005,7 @@ static int sock_hash_update_common(struct bpf_map *map, void *key,
 	 * and sk_write_space callbacks overridden.
 	 */
 	if (sock_map_redirect_allowed(sk))
-		ret = sock_map_link(map, &htab->progs, sk);
+		ret = sock_map_link(map, sk);
 	else
 		ret = sock_map_link_no_progs(map, sk);
 	if (ret < 0)

From b017055255d620b365299c3824610e0098414664 Mon Sep 17 00:00:00 2001
From: Cong Wang <cong.wang@bytedance.com>
Date: Tue, 30 Mar 2021 19:32:29 -0700
Subject: [PATCH 60/68] sock_map: Kill sock_map_link_no_progs()

Now we can fold sock_map_link_no_progs() into sock_map_link()
and get rid of sock_map_link_no_progs().

Signed-off-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20210331023237.41094-9-xiyou.wangcong@gmail.com
---
 net/core/sock_map.c | 55 +++++++++++++--------------------------------
 1 file changed, 15 insertions(+), 40 deletions(-)

diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index d06face0f16c..42d797291d34 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -225,13 +225,24 @@ static struct sk_psock *sock_map_psock_get_checked(struct sock *sk)
 	return psock;
 }
 
+static bool sock_map_redirect_allowed(const struct sock *sk);
+
 static int sock_map_link(struct bpf_map *map, struct sock *sk)
 {
-	struct bpf_prog *msg_parser, *stream_parser, *stream_verdict;
 	struct sk_psock_progs *progs = sock_map_progs(map);
+	struct bpf_prog *stream_verdict = NULL;
+	struct bpf_prog *stream_parser = NULL;
+	struct bpf_prog *msg_parser = NULL;
 	struct sk_psock *psock;
 	int ret;
 
+	/* Only sockets we can redirect into/from in BPF need to hold
+	 * refs to parser/verdict progs and have their sk_data_ready
+	 * and sk_write_space callbacks overridden.
+	 */
+	if (!sock_map_redirect_allowed(sk))
+		goto no_progs;
+
 	stream_verdict = READ_ONCE(progs->stream_verdict);
 	if (stream_verdict) {
 		stream_verdict = bpf_prog_inc_not_zero(stream_verdict);
@@ -257,6 +268,7 @@ static int sock_map_link(struct bpf_map *map, struct sock *sk)
 		}
 	}
 
+no_progs:
 	psock = sock_map_psock_get_checked(sk);
 	if (IS_ERR(psock)) {
 		ret = PTR_ERR(psock);
@@ -316,27 +328,6 @@ static int sock_map_link(struct bpf_map *map, struct sock *sk)
 	return ret;
 }
 
-static int sock_map_link_no_progs(struct bpf_map *map, struct sock *sk)
-{
-	struct sk_psock *psock;
-	int ret;
-
-	psock = sock_map_psock_get_checked(sk);
-	if (IS_ERR(psock))
-		return PTR_ERR(psock);
-
-	if (!psock) {
-		psock = sk_psock_init(sk, map->numa_node);
-		if (IS_ERR(psock))
-			return PTR_ERR(psock);
-	}
-
-	ret = sock_map_init_proto(sk, psock);
-	if (ret < 0)
-		sk_psock_put(sk, psock);
-	return ret;
-}
-
 static void sock_map_free(struct bpf_map *map)
 {
 	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
@@ -467,8 +458,6 @@ static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next)
 	return 0;
 }
 
-static bool sock_map_redirect_allowed(const struct sock *sk);
-
 static int sock_map_update_common(struct bpf_map *map, u32 idx,
 				  struct sock *sk, u64 flags)
 {
@@ -488,14 +477,7 @@ static int sock_map_update_common(struct bpf_map *map, u32 idx,
 	if (!link)
 		return -ENOMEM;
 
-	/* Only sockets we can redirect into/from in BPF need to hold
-	 * refs to parser/verdict progs and have their sk_data_ready
-	 * and sk_write_space callbacks overridden.
-	 */
-	if (sock_map_redirect_allowed(sk))
-		ret = sock_map_link(map, sk);
-	else
-		ret = sock_map_link_no_progs(map, sk);
+	ret = sock_map_link(map, sk);
 	if (ret < 0)
 		goto out_free;
 
@@ -1000,14 +982,7 @@ static int sock_hash_update_common(struct bpf_map *map, void *key,
 	if (!link)
 		return -ENOMEM;
 
-	/* Only sockets we can redirect into/from in BPF need to hold
-	 * refs to parser/verdict progs and have their sk_data_ready
-	 * and sk_write_space callbacks overridden.
-	 */
-	if (sock_map_redirect_allowed(sk))
-		ret = sock_map_link(map, sk);
-	else
-		ret = sock_map_link_no_progs(map, sk);
+	ret = sock_map_link(map, sk);
 	if (ret < 0)
 		goto out_free;
 

From a7ba4558e69a3c2ae4ca521f015832ef44799538 Mon Sep 17 00:00:00 2001
From: Cong Wang <cong.wang@bytedance.com>
Date: Tue, 30 Mar 2021 19:32:30 -0700
Subject: [PATCH 61/68] sock_map: Introduce BPF_SK_SKB_VERDICT

Reusing BPF_SK_SKB_STREAM_VERDICT is possible but its name is
confusing and more importantly we still want to distinguish them
from user-space. So we can just reuse the stream verdict code but
introduce a new type of eBPF program, skb_verdict. Users are not
allowed to attach stream_verdict and skb_verdict programs to the
same map.

Signed-off-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20210331023237.41094-10-xiyou.wangcong@gmail.com
---
 include/linux/skmsg.h          |  2 ++
 include/uapi/linux/bpf.h       |  1 +
 kernel/bpf/syscall.c           |  1 +
 net/core/skmsg.c               |  4 +++-
 net/core/sock_map.c            | 28 ++++++++++++++++++++++++++++
 tools/bpf/bpftool/common.c     |  1 +
 tools/bpf/bpftool/prog.c       |  1 +
 tools/include/uapi/linux/bpf.h |  1 +
 8 files changed, 38 insertions(+), 1 deletion(-)

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index e7aba150539d..c83dbc2d81d9 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -58,6 +58,7 @@ struct sk_psock_progs {
 	struct bpf_prog			*msg_parser;
 	struct bpf_prog			*stream_parser;
 	struct bpf_prog			*stream_verdict;
+	struct bpf_prog			*skb_verdict;
 };
 
 enum sk_psock_state_bits {
@@ -487,6 +488,7 @@ static inline void psock_progs_drop(struct sk_psock_progs *progs)
 	psock_set_prog(&progs->msg_parser, NULL);
 	psock_set_prog(&progs->stream_parser, NULL);
 	psock_set_prog(&progs->stream_verdict, NULL);
+	psock_set_prog(&progs->skb_verdict, NULL);
 }
 
 int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 598716742593..49371eba98ba 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -957,6 +957,7 @@ enum bpf_attach_type {
 	BPF_XDP_CPUMAP,
 	BPF_SK_LOOKUP,
 	BPF_XDP,
+	BPF_SK_SKB_VERDICT,
 	__MAX_BPF_ATTACH_TYPE
 };
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 9603de81811a..6428634da57e 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2948,6 +2948,7 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
 		return BPF_PROG_TYPE_SK_MSG;
 	case BPF_SK_SKB_STREAM_PARSER:
 	case BPF_SK_SKB_STREAM_VERDICT:
+	case BPF_SK_SKB_VERDICT:
 		return BPF_PROG_TYPE_SK_SKB;
 	case BPF_LIRC_MODE2:
 		return BPF_PROG_TYPE_LIRC_MODE2;
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 656eceab73bc..a045812d7c78 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -697,7 +697,7 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock)
 	rcu_assign_sk_user_data(sk, NULL);
 	if (psock->progs.stream_parser)
 		sk_psock_stop_strp(sk, psock);
-	else if (psock->progs.stream_verdict)
+	else if (psock->progs.stream_verdict || psock->progs.skb_verdict)
 		sk_psock_stop_verdict(sk, psock);
 	write_unlock_bh(&sk->sk_callback_lock);
 
@@ -1024,6 +1024,8 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb,
 	}
 	skb_set_owner_r(skb, sk);
 	prog = READ_ONCE(psock->progs.stream_verdict);
+	if (!prog)
+		prog = READ_ONCE(psock->progs.skb_verdict);
 	if (likely(prog)) {
 		skb_dst_drop(skb);
 		skb_bpf_redirect_clear(skb);
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index 42d797291d34..c2a0411e08a8 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -156,6 +156,8 @@ static void sock_map_del_link(struct sock *sk,
 				strp_stop = true;
 			if (psock->saved_data_ready && stab->progs.stream_verdict)
 				verdict_stop = true;
+			if (psock->saved_data_ready && stab->progs.skb_verdict)
+				verdict_stop = true;
 			list_del(&link->list);
 			sk_psock_free_link(link);
 		}
@@ -232,6 +234,7 @@ static int sock_map_link(struct bpf_map *map, struct sock *sk)
 	struct sk_psock_progs *progs = sock_map_progs(map);
 	struct bpf_prog *stream_verdict = NULL;
 	struct bpf_prog *stream_parser = NULL;
+	struct bpf_prog *skb_verdict = NULL;
 	struct bpf_prog *msg_parser = NULL;
 	struct sk_psock *psock;
 	int ret;
@@ -268,6 +271,15 @@ static int sock_map_link(struct bpf_map *map, struct sock *sk)
 		}
 	}
 
+	skb_verdict = READ_ONCE(progs->skb_verdict);
+	if (skb_verdict) {
+		skb_verdict = bpf_prog_inc_not_zero(skb_verdict);
+		if (IS_ERR(skb_verdict)) {
+			ret = PTR_ERR(skb_verdict);
+			goto out_put_msg_parser;
+		}
+	}
+
 no_progs:
 	psock = sock_map_psock_get_checked(sk);
 	if (IS_ERR(psock)) {
@@ -278,6 +290,9 @@ static int sock_map_link(struct bpf_map *map, struct sock *sk)
 	if (psock) {
 		if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) ||
 		    (stream_parser  && READ_ONCE(psock->progs.stream_parser)) ||
+		    (skb_verdict && READ_ONCE(psock->progs.skb_verdict)) ||
+		    (skb_verdict && READ_ONCE(psock->progs.stream_verdict)) ||
+		    (stream_verdict && READ_ONCE(psock->progs.skb_verdict)) ||
 		    (stream_verdict && READ_ONCE(psock->progs.stream_verdict))) {
 			sk_psock_put(sk, psock);
 			ret = -EBUSY;
@@ -309,6 +324,9 @@ static int sock_map_link(struct bpf_map *map, struct sock *sk)
 	} else if (!stream_parser && stream_verdict && !psock->saved_data_ready) {
 		psock_set_prog(&psock->progs.stream_verdict, stream_verdict);
 		sk_psock_start_verdict(sk,psock);
+	} else if (!stream_verdict && skb_verdict && !psock->saved_data_ready) {
+		psock_set_prog(&psock->progs.skb_verdict, skb_verdict);
+		sk_psock_start_verdict(sk, psock);
 	}
 	write_unlock_bh(&sk->sk_callback_lock);
 	return 0;
@@ -317,6 +335,9 @@ static int sock_map_link(struct bpf_map *map, struct sock *sk)
 out_drop:
 	sk_psock_put(sk, psock);
 out_progs:
+	if (skb_verdict)
+		bpf_prog_put(skb_verdict);
+out_put_msg_parser:
 	if (msg_parser)
 		bpf_prog_put(msg_parser);
 out_put_stream_parser:
@@ -1442,8 +1463,15 @@ static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
 		break;
 #endif
 	case BPF_SK_SKB_STREAM_VERDICT:
+		if (progs->skb_verdict)
+			return -EBUSY;
 		pprog = &progs->stream_verdict;
 		break;
+	case BPF_SK_SKB_VERDICT:
+		if (progs->stream_verdict)
+			return -EBUSY;
+		pprog = &progs->skb_verdict;
+		break;
 	default:
 		return -EOPNOTSUPP;
 	}
diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c
index 65303664417e..1828bba19020 100644
--- a/tools/bpf/bpftool/common.c
+++ b/tools/bpf/bpftool/common.c
@@ -57,6 +57,7 @@ const char * const attach_type_name[__MAX_BPF_ATTACH_TYPE] = {
 
 	[BPF_SK_SKB_STREAM_PARSER]	= "sk_skb_stream_parser",
 	[BPF_SK_SKB_STREAM_VERDICT]	= "sk_skb_stream_verdict",
+	[BPF_SK_SKB_VERDICT]		= "sk_skb_verdict",
 	[BPF_SK_MSG_VERDICT]		= "sk_msg_verdict",
 	[BPF_LIRC_MODE2]		= "lirc_mode2",
 	[BPF_FLOW_DISSECTOR]		= "flow_dissector",
diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index f2b915b20546..3f067d2d7584 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -76,6 +76,7 @@ enum dump_mode {
 static const char * const attach_type_strings[] = {
 	[BPF_SK_SKB_STREAM_PARSER] = "stream_parser",
 	[BPF_SK_SKB_STREAM_VERDICT] = "stream_verdict",
+	[BPF_SK_SKB_VERDICT] = "skb_verdict",
 	[BPF_SK_MSG_VERDICT] = "msg_verdict",
 	[BPF_FLOW_DISSECTOR] = "flow_dissector",
 	[__MAX_BPF_ATTACH_TYPE] = NULL,
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index ab9f2233607c..69902603012c 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -957,6 +957,7 @@ enum bpf_attach_type {
 	BPF_XDP_CPUMAP,
 	BPF_SK_LOOKUP,
 	BPF_XDP,
+	BPF_SK_SKB_VERDICT,
 	__MAX_BPF_ATTACH_TYPE
 };
 

From 8a59f9d1e3d4340659fdfee8879dc09a6f2546e1 Mon Sep 17 00:00:00 2001
From: Cong Wang <cong.wang@bytedance.com>
Date: Tue, 30 Mar 2021 19:32:31 -0700
Subject: [PATCH 62/68] sock: Introduce sk->sk_prot->psock_update_sk_prot()

Currently sockmap calls into each protocol to update the struct
proto and replace it. This certainly won't work when the protocol
is implemented as a module, for example, AF_UNIX.

Introduce a new ops sk->sk_prot->psock_update_sk_prot(), so each
protocol can implement its own way to replace the struct proto.
This also helps get rid of symbol dependencies on CONFIG_INET.

Signed-off-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20210331023237.41094-11-xiyou.wangcong@gmail.com
---
 include/linux/skmsg.h | 18 +++---------------
 include/net/sock.h    |  3 +++
 include/net/tcp.h     |  1 +
 include/net/udp.h     |  1 +
 net/core/skmsg.c      |  5 -----
 net/core/sock_map.c   | 24 ++++--------------------
 net/ipv4/tcp_bpf.c    | 24 +++++++++++++++++++++---
 net/ipv4/tcp_ipv4.c   |  3 +++
 net/ipv4/udp.c        |  3 +++
 net/ipv4/udp_bpf.c    | 15 +++++++++++++--
 net/ipv6/tcp_ipv6.c   |  3 +++
 net/ipv6/udp.c        |  3 +++
 12 files changed, 58 insertions(+), 45 deletions(-)

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index c83dbc2d81d9..5e800ddc2dc6 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -99,6 +99,7 @@ struct sk_psock {
 	void (*saved_close)(struct sock *sk, long timeout);
 	void (*saved_write_space)(struct sock *sk);
 	void (*saved_data_ready)(struct sock *sk);
+	int  (*psock_update_sk_prot)(struct sock *sk, bool restore);
 	struct proto			*sk_proto;
 	struct mutex			work_mutex;
 	struct sk_psock_work_state	work_state;
@@ -395,25 +396,12 @@ static inline void sk_psock_cork_free(struct sk_psock *psock)
 	}
 }
 
-static inline void sk_psock_update_proto(struct sock *sk,
-					 struct sk_psock *psock,
-					 struct proto *ops)
-{
-	/* Pairs with lockless read in sk_clone_lock() */
-	WRITE_ONCE(sk->sk_prot, ops);
-}
-
 static inline void sk_psock_restore_proto(struct sock *sk,
 					  struct sk_psock *psock)
 {
 	sk->sk_prot->unhash = psock->saved_unhash;
-	if (inet_csk_has_ulp(sk)) {
-		tcp_update_ulp(sk, psock->sk_proto, psock->saved_write_space);
-	} else {
-		sk->sk_write_space = psock->saved_write_space;
-		/* Pairs with lockless read in sk_clone_lock() */
-		WRITE_ONCE(sk->sk_prot, psock->sk_proto);
-	}
+	if (psock->psock_update_sk_prot)
+		psock->psock_update_sk_prot(sk, true);
 }
 
 static inline void sk_psock_set_state(struct sk_psock *psock,
diff --git a/include/net/sock.h b/include/net/sock.h
index 0b6266fd6bf6..8b4155e756c2 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1184,6 +1184,9 @@ struct proto {
 	void			(*unhash)(struct sock *sk);
 	void			(*rehash)(struct sock *sk);
 	int			(*get_port)(struct sock *sk, unsigned short snum);
+#ifdef CONFIG_BPF_SYSCALL
+	int			(*psock_update_sk_prot)(struct sock *sk, bool restore);
+#endif
 
 	/* Keeping track of sockets in use */
 #ifdef CONFIG_PROC_FS
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 075de26f449d..2efa4e5ea23d 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2203,6 +2203,7 @@ struct sk_psock;
 
 #ifdef CONFIG_BPF_SYSCALL
 struct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock);
+int tcp_bpf_update_proto(struct sock *sk, bool restore);
 void tcp_bpf_clone(const struct sock *sk, struct sock *newsk);
 #endif /* CONFIG_BPF_SYSCALL */
 
diff --git a/include/net/udp.h b/include/net/udp.h
index d4d064c59232..df7cc1edc200 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -518,6 +518,7 @@ static inline struct sk_buff *udp_rcv_segment(struct sock *sk,
 #ifdef CONFIG_BPF_SYSCALL
 struct sk_psock;
 struct proto *udp_bpf_get_proto(struct sock *sk, struct sk_psock *psock);
+int udp_bpf_update_proto(struct sock *sk, bool restore);
 #endif
 
 #endif	/* _UDP_H */
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index a045812d7c78..9fc83f7cc1a0 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -562,11 +562,6 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node)
 
 	write_lock_bh(&sk->sk_callback_lock);
 
-	if (inet_csk_has_ulp(sk)) {
-		psock = ERR_PTR(-EINVAL);
-		goto out;
-	}
-
 	if (sk->sk_user_data) {
 		psock = ERR_PTR(-EBUSY);
 		goto out;
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index c2a0411e08a8..2915c7c8778b 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -185,26 +185,10 @@ static void sock_map_unref(struct sock *sk, void *link_raw)
 
 static int sock_map_init_proto(struct sock *sk, struct sk_psock *psock)
 {
-	struct proto *prot;
-
-	switch (sk->sk_type) {
-	case SOCK_STREAM:
-		prot = tcp_bpf_get_proto(sk, psock);
-		break;
-
-	case SOCK_DGRAM:
-		prot = udp_bpf_get_proto(sk, psock);
-		break;
-
-	default:
+	if (!sk->sk_prot->psock_update_sk_prot)
 		return -EINVAL;
-	}
-
-	if (IS_ERR(prot))
-		return PTR_ERR(prot);
-
-	sk_psock_update_proto(sk, psock, prot);
-	return 0;
+	psock->psock_update_sk_prot = sk->sk_prot->psock_update_sk_prot;
+	return sk->sk_prot->psock_update_sk_prot(sk, false);
 }
 
 static struct sk_psock *sock_map_psock_get_checked(struct sock *sk)
@@ -556,7 +540,7 @@ static bool sock_map_redirect_allowed(const struct sock *sk)
 
 static bool sock_map_sk_is_suitable(const struct sock *sk)
 {
-	return sk_is_tcp(sk) || sk_is_udp(sk);
+	return !!sk->sk_prot->psock_update_sk_prot;
 }
 
 static bool sock_map_sk_state_allowed(const struct sock *sk)
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index ae980716d896..ac8cfbaeacd2 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -595,20 +595,38 @@ static int tcp_bpf_assert_proto_ops(struct proto *ops)
 	       ops->sendpage == tcp_sendpage ? 0 : -ENOTSUPP;
 }
 
-struct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock)
+int tcp_bpf_update_proto(struct sock *sk, bool restore)
 {
+	struct sk_psock *psock = sk_psock(sk);
 	int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4;
 	int config = psock->progs.msg_parser   ? TCP_BPF_TX   : TCP_BPF_BASE;
 
+	if (restore) {
+		if (inet_csk_has_ulp(sk)) {
+			tcp_update_ulp(sk, psock->sk_proto, psock->saved_write_space);
+		} else {
+			sk->sk_write_space = psock->saved_write_space;
+			/* Pairs with lockless read in sk_clone_lock() */
+			WRITE_ONCE(sk->sk_prot, psock->sk_proto);
+		}
+		return 0;
+	}
+
+	if (inet_csk_has_ulp(sk))
+		return -EINVAL;
+
 	if (sk->sk_family == AF_INET6) {
 		if (tcp_bpf_assert_proto_ops(psock->sk_proto))
-			return ERR_PTR(-EINVAL);
+			return -EINVAL;
 
 		tcp_bpf_check_v6_needs_rebuild(psock->sk_proto);
 	}
 
-	return &tcp_bpf_prots[family][config];
+	/* Pairs with lockless read in sk_clone_lock() */
+	WRITE_ONCE(sk->sk_prot, &tcp_bpf_prots[family][config]);
+	return 0;
 }
+EXPORT_SYMBOL_GPL(tcp_bpf_update_proto);
 
 /* If a child got cloned from a listening socket that had tcp_bpf
  * protocol callbacks installed, we need to restore the callbacks to
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index daad4f99db32..dfc6d1c0e710 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2806,6 +2806,9 @@ struct proto tcp_prot = {
 	.hash			= inet_hash,
 	.unhash			= inet_unhash,
 	.get_port		= inet_csk_get_port,
+#ifdef CONFIG_BPF_SYSCALL
+	.psock_update_sk_prot	= tcp_bpf_update_proto,
+#endif
 	.enter_memory_pressure	= tcp_enter_memory_pressure,
 	.leave_memory_pressure	= tcp_leave_memory_pressure,
 	.stream_memory_free	= tcp_stream_memory_free,
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 4a0478b17243..38952aaee3a1 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2849,6 +2849,9 @@ struct proto udp_prot = {
 	.unhash			= udp_lib_unhash,
 	.rehash			= udp_v4_rehash,
 	.get_port		= udp_v4_get_port,
+#ifdef CONFIG_BPF_SYSCALL
+	.psock_update_sk_prot	= udp_bpf_update_proto,
+#endif
 	.memory_allocated	= &udp_memory_allocated,
 	.sysctl_mem		= sysctl_udp_mem,
 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_udp_wmem_min),
diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c
index 7a94791efc1a..6001f93cd3a0 100644
--- a/net/ipv4/udp_bpf.c
+++ b/net/ipv4/udp_bpf.c
@@ -41,12 +41,23 @@ static int __init udp_bpf_v4_build_proto(void)
 }
 core_initcall(udp_bpf_v4_build_proto);
 
-struct proto *udp_bpf_get_proto(struct sock *sk, struct sk_psock *psock)
+int udp_bpf_update_proto(struct sock *sk, bool restore)
 {
 	int family = sk->sk_family == AF_INET ? UDP_BPF_IPV4 : UDP_BPF_IPV6;
+	struct sk_psock *psock = sk_psock(sk);
+
+	if (restore) {
+		sk->sk_write_space = psock->saved_write_space;
+		/* Pairs with lockless read in sk_clone_lock() */
+		WRITE_ONCE(sk->sk_prot, psock->sk_proto);
+		return 0;
+	}
 
 	if (sk->sk_family == AF_INET6)
 		udp_bpf_check_v6_needs_rebuild(psock->sk_proto);
 
-	return &udp_bpf_prots[family];
+	/* Pairs with lockless read in sk_clone_lock() */
+	WRITE_ONCE(sk->sk_prot, &udp_bpf_prots[family]);
+	return 0;
 }
+EXPORT_SYMBOL_GPL(udp_bpf_update_proto);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index d0f007741e8e..bff22d6ef516 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -2139,6 +2139,9 @@ struct proto tcpv6_prot = {
 	.hash			= inet6_hash,
 	.unhash			= inet_unhash,
 	.get_port		= inet_csk_get_port,
+#ifdef CONFIG_BPF_SYSCALL
+	.psock_update_sk_prot	= tcp_bpf_update_proto,
+#endif
 	.enter_memory_pressure	= tcp_enter_memory_pressure,
 	.leave_memory_pressure	= tcp_leave_memory_pressure,
 	.stream_memory_free	= tcp_stream_memory_free,
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index d25e5a9252fd..ef2c75bb4771 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1713,6 +1713,9 @@ struct proto udpv6_prot = {
 	.unhash			= udp_lib_unhash,
 	.rehash			= udp_v6_rehash,
 	.get_port		= udp_v6_get_port,
+#ifdef CONFIG_BPF_SYSCALL
+	.psock_update_sk_prot	= udp_bpf_update_proto,
+#endif
 	.memory_allocated	= &udp_memory_allocated,
 	.sysctl_mem		= sysctl_udp_mem,
 	.sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_udp_wmem_min),

From d7f571188ecf25c244789b883c878ec7c64b5b08 Mon Sep 17 00:00:00 2001
From: Cong Wang <cong.wang@bytedance.com>
Date: Tue, 30 Mar 2021 19:32:32 -0700
Subject: [PATCH 63/68] udp: Implement ->read_sock() for sockmap

This is similar to tcp_read_sock(), except we do not need
to worry about connections, we just need to retrieve skb
from UDP receive queue.

Note, the return value of ->read_sock() is unused in
sk_psock_verdict_data_ready(), and UDP still does not
support splice() due to lack of ->splice_read(), so users
can not reach udp_read_sock() directly.

Signed-off-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20210331023237.41094-12-xiyou.wangcong@gmail.com
---
 include/net/udp.h   |  2 ++
 net/ipv4/af_inet.c  |  1 +
 net/ipv4/udp.c      | 29 +++++++++++++++++++++++++++++
 net/ipv6/af_inet6.c |  1 +
 4 files changed, 33 insertions(+)

diff --git a/include/net/udp.h b/include/net/udp.h
index df7cc1edc200..347b62a753c3 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -329,6 +329,8 @@ struct sock *__udp6_lib_lookup(struct net *net,
 			       struct sk_buff *skb);
 struct sock *udp6_lib_lookup_skb(const struct sk_buff *skb,
 				 __be16 sport, __be16 dport);
+int udp_read_sock(struct sock *sk, read_descriptor_t *desc,
+		  sk_read_actor_t recv_actor);
 
 /* UDP uses skb->dev_scratch to cache as much information as possible and avoid
  * possibly multiple cache miss on dequeue()
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 1355e6c0d567..f17870ee558b 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1070,6 +1070,7 @@ const struct proto_ops inet_dgram_ops = {
 	.setsockopt	   = sock_common_setsockopt,
 	.getsockopt	   = sock_common_getsockopt,
 	.sendmsg	   = inet_sendmsg,
+	.read_sock	   = udp_read_sock,
 	.recvmsg	   = inet_recvmsg,
 	.mmap		   = sock_no_mmap,
 	.sendpage	   = inet_sendpage,
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 38952aaee3a1..4d02f6839e38 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1782,6 +1782,35 @@ struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags,
 }
 EXPORT_SYMBOL(__skb_recv_udp);
 
+int udp_read_sock(struct sock *sk, read_descriptor_t *desc,
+		  sk_read_actor_t recv_actor)
+{
+	int copied = 0;
+
+	while (1) {
+		struct sk_buff *skb;
+		int err, used;
+
+		skb = skb_recv_udp(sk, 0, 1, &err);
+		if (!skb)
+			return err;
+		used = recv_actor(desc, skb, 0, skb->len);
+		if (used <= 0) {
+			if (!copied)
+				copied = used;
+			break;
+		} else if (used <= skb->len) {
+			copied += used;
+		}
+
+		if (!desc->count)
+			break;
+	}
+
+	return copied;
+}
+EXPORT_SYMBOL(udp_read_sock);
+
 /*
  * 	This should be easy, if there is something there we
  * 	return it, otherwise we block.
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 802f5111805a..71de739b4a9e 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -714,6 +714,7 @@ const struct proto_ops inet6_dgram_ops = {
 	.getsockopt	   = sock_common_getsockopt,	/* ok		*/
 	.sendmsg	   = inet6_sendmsg,		/* retpoline's sake */
 	.recvmsg	   = inet6_recvmsg,		/* retpoline's sake */
+	.read_sock	   = udp_read_sock,
 	.mmap		   = sock_no_mmap,
 	.sendpage	   = sock_no_sendpage,
 	.set_peek_off	   = sk_set_peek_off,

From 2bc793e3272a13e337416c057cb81c5396ad91d1 Mon Sep 17 00:00:00 2001
From: Cong Wang <cong.wang@bytedance.com>
Date: Tue, 30 Mar 2021 19:32:33 -0700
Subject: [PATCH 64/68] skmsg: Extract __tcp_bpf_recvmsg() and
 tcp_bpf_wait_data()

Although these two functions are only used by TCP, they are not
specific to TCP at all, both operate on skmsg and ingress_msg,
so fit in net/core/skmsg.c very well.

And we will need them for non-TCP, so rename and move them to
skmsg.c and export them to modules.

Signed-off-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20210331023237.41094-13-xiyou.wangcong@gmail.com
---
 include/linux/skmsg.h |   4 ++
 include/net/tcp.h     |   2 -
 net/core/skmsg.c      |  98 +++++++++++++++++++++++++++++++++++++++++
 net/ipv4/tcp_bpf.c    | 100 +-----------------------------------------
 net/tls/tls_sw.c      |   4 +-
 5 files changed, 106 insertions(+), 102 deletions(-)

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index 5e800ddc2dc6..f78e90a04a69 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -125,6 +125,10 @@ int sk_msg_zerocopy_from_iter(struct sock *sk, struct iov_iter *from,
 			      struct sk_msg *msg, u32 bytes);
 int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from,
 			     struct sk_msg *msg, u32 bytes);
+int sk_msg_wait_data(struct sock *sk, struct sk_psock *psock, int flags,
+		     long timeo, int *err);
+int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
+		   int len, int flags);
 
 static inline void sk_msg_check_to_free(struct sk_msg *msg, u32 i, u32 bytes)
 {
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 2efa4e5ea23d..31b1696c62ba 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2209,8 +2209,6 @@ void tcp_bpf_clone(const struct sock *sk, struct sock *newsk);
 
 int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, u32 bytes,
 			  int flags);
-int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
-		      struct msghdr *msg, int len, int flags);
 #endif /* CONFIG_NET_SOCK_MSG */
 
 #if !defined(CONFIG_BPF_SYSCALL) || !defined(CONFIG_NET_SOCK_MSG)
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 9fc83f7cc1a0..92a83c02562a 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -399,6 +399,104 @@ int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from,
 }
 EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter);
 
+int sk_msg_wait_data(struct sock *sk, struct sk_psock *psock, int flags,
+		     long timeo, int *err)
+{
+	DEFINE_WAIT_FUNC(wait, woken_wake_function);
+	int ret = 0;
+
+	if (sk->sk_shutdown & RCV_SHUTDOWN)
+		return 1;
+
+	if (!timeo)
+		return ret;
+
+	add_wait_queue(sk_sleep(sk), &wait);
+	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+	ret = sk_wait_event(sk, &timeo,
+			    !list_empty(&psock->ingress_msg) ||
+			    !skb_queue_empty(&sk->sk_receive_queue), &wait);
+	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+	remove_wait_queue(sk_sleep(sk), &wait);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(sk_msg_wait_data);
+
+/* Receive sk_msg from psock->ingress_msg to @msg. */
+int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
+		   int len, int flags)
+{
+	struct iov_iter *iter = &msg->msg_iter;
+	int peek = flags & MSG_PEEK;
+	struct sk_msg *msg_rx;
+	int i, copied = 0;
+
+	msg_rx = sk_psock_peek_msg(psock);
+	while (copied != len) {
+		struct scatterlist *sge;
+
+		if (unlikely(!msg_rx))
+			break;
+
+		i = msg_rx->sg.start;
+		do {
+			struct page *page;
+			int copy;
+
+			sge = sk_msg_elem(msg_rx, i);
+			copy = sge->length;
+			page = sg_page(sge);
+			if (copied + copy > len)
+				copy = len - copied;
+			copy = copy_page_to_iter(page, sge->offset, copy, iter);
+			if (!copy)
+				return copied ? copied : -EFAULT;
+
+			copied += copy;
+			if (likely(!peek)) {
+				sge->offset += copy;
+				sge->length -= copy;
+				if (!msg_rx->skb)
+					sk_mem_uncharge(sk, copy);
+				msg_rx->sg.size -= copy;
+
+				if (!sge->length) {
+					sk_msg_iter_var_next(i);
+					if (!msg_rx->skb)
+						put_page(page);
+				}
+			} else {
+				/* Lets not optimize peek case if copy_page_to_iter
+				 * didn't copy the entire length lets just break.
+				 */
+				if (copy != sge->length)
+					return copied;
+				sk_msg_iter_var_next(i);
+			}
+
+			if (copied == len)
+				break;
+		} while (i != msg_rx->sg.end);
+
+		if (unlikely(peek)) {
+			msg_rx = sk_psock_next_msg(psock, msg_rx);
+			if (!msg_rx)
+				break;
+			continue;
+		}
+
+		msg_rx->sg.start = i;
+		if (!sge->length && msg_rx->sg.start == msg_rx->sg.end) {
+			msg_rx = sk_psock_dequeue_msg(psock);
+			kfree_sk_msg(msg_rx);
+		}
+		msg_rx = sk_psock_peek_msg(psock);
+	}
+
+	return copied;
+}
+EXPORT_SYMBOL_GPL(sk_msg_recvmsg);
+
 static struct sk_msg *sk_psock_create_ingress_msg(struct sock *sk,
 						  struct sk_buff *skb)
 {
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index ac8cfbaeacd2..3d622a0d0753 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -10,80 +10,6 @@
 #include <net/inet_common.h>
 #include <net/tls.h>
 
-int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
-		      struct msghdr *msg, int len, int flags)
-{
-	struct iov_iter *iter = &msg->msg_iter;
-	int peek = flags & MSG_PEEK;
-	struct sk_msg *msg_rx;
-	int i, copied = 0;
-
-	msg_rx = sk_psock_peek_msg(psock);
-	while (copied != len) {
-		struct scatterlist *sge;
-
-		if (unlikely(!msg_rx))
-			break;
-
-		i = msg_rx->sg.start;
-		do {
-			struct page *page;
-			int copy;
-
-			sge = sk_msg_elem(msg_rx, i);
-			copy = sge->length;
-			page = sg_page(sge);
-			if (copied + copy > len)
-				copy = len - copied;
-			copy = copy_page_to_iter(page, sge->offset, copy, iter);
-			if (!copy)
-				return copied ? copied : -EFAULT;
-
-			copied += copy;
-			if (likely(!peek)) {
-				sge->offset += copy;
-				sge->length -= copy;
-				if (!msg_rx->skb)
-					sk_mem_uncharge(sk, copy);
-				msg_rx->sg.size -= copy;
-
-				if (!sge->length) {
-					sk_msg_iter_var_next(i);
-					if (!msg_rx->skb)
-						put_page(page);
-				}
-			} else {
-				/* Lets not optimize peek case if copy_page_to_iter
-				 * didn't copy the entire length lets just break.
-				 */
-				if (copy != sge->length)
-					return copied;
-				sk_msg_iter_var_next(i);
-			}
-
-			if (copied == len)
-				break;
-		} while (i != msg_rx->sg.end);
-
-		if (unlikely(peek)) {
-			msg_rx = sk_psock_next_msg(psock, msg_rx);
-			if (!msg_rx)
-				break;
-			continue;
-		}
-
-		msg_rx->sg.start = i;
-		if (!sge->length && msg_rx->sg.start == msg_rx->sg.end) {
-			msg_rx = sk_psock_dequeue_msg(psock);
-			kfree_sk_msg(msg_rx);
-		}
-		msg_rx = sk_psock_peek_msg(psock);
-	}
-
-	return copied;
-}
-EXPORT_SYMBOL_GPL(__tcp_bpf_recvmsg);
-
 static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock,
 			   struct sk_msg *msg, u32 apply_bytes, int flags)
 {
@@ -237,28 +163,6 @@ static bool tcp_bpf_stream_read(const struct sock *sk)
 	return !empty;
 }
 
-static int tcp_bpf_wait_data(struct sock *sk, struct sk_psock *psock,
-			     int flags, long timeo, int *err)
-{
-	DEFINE_WAIT_FUNC(wait, woken_wake_function);
-	int ret = 0;
-
-	if (sk->sk_shutdown & RCV_SHUTDOWN)
-		return 1;
-
-	if (!timeo)
-		return ret;
-
-	add_wait_queue(sk_sleep(sk), &wait);
-	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
-	ret = sk_wait_event(sk, &timeo,
-			    !list_empty(&psock->ingress_msg) ||
-			    !skb_queue_empty(&sk->sk_receive_queue), &wait);
-	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
-	remove_wait_queue(sk_sleep(sk), &wait);
-	return ret;
-}
-
 static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 		    int nonblock, int flags, int *addr_len)
 {
@@ -278,13 +182,13 @@ static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 	}
 	lock_sock(sk);
 msg_bytes_ready:
-	copied = __tcp_bpf_recvmsg(sk, psock, msg, len, flags);
+	copied = sk_msg_recvmsg(sk, psock, msg, len, flags);
 	if (!copied) {
 		int data, err = 0;
 		long timeo;
 
 		timeo = sock_rcvtimeo(sk, nonblock);
-		data = tcp_bpf_wait_data(sk, psock, flags, timeo, &err);
+		data = sk_msg_wait_data(sk, psock, flags, timeo, &err);
 		if (data) {
 			if (!sk_psock_queue_empty(psock))
 				goto msg_bytes_ready;
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 01d933ae5f16..1dcb34dfd56b 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -1789,8 +1789,8 @@ int tls_sw_recvmsg(struct sock *sk,
 		skb = tls_wait_data(sk, psock, flags, timeo, &err);
 		if (!skb) {
 			if (psock) {
-				int ret = __tcp_bpf_recvmsg(sk, psock,
-							    msg, len, flags);
+				int ret = sk_msg_recvmsg(sk, psock, msg, len,
+							 flags);
 
 				if (ret > 0) {
 					decrypted += ret;

From 1f5be6b3b063767202f815d5571f7d03729f1282 Mon Sep 17 00:00:00 2001
From: Cong Wang <cong.wang@bytedance.com>
Date: Tue, 30 Mar 2021 19:32:34 -0700
Subject: [PATCH 65/68] udp: Implement udp_bpf_recvmsg() for sockmap

We have to implement udp_bpf_recvmsg() to replace the ->recvmsg()
to retrieve skmsg from ingress_msg.

Signed-off-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20210331023237.41094-14-xiyou.wangcong@gmail.com
---
 net/ipv4/udp_bpf.c | 64 +++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 63 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c
index 6001f93cd3a0..7d5c4ebf42fe 100644
--- a/net/ipv4/udp_bpf.c
+++ b/net/ipv4/udp_bpf.c
@@ -4,6 +4,68 @@
 #include <linux/skmsg.h>
 #include <net/sock.h>
 #include <net/udp.h>
+#include <net/inet_common.h>
+
+#include "udp_impl.h"
+
+static struct proto *udpv6_prot_saved __read_mostly;
+
+static int sk_udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+			  int noblock, int flags, int *addr_len)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+	if (sk->sk_family == AF_INET6)
+		return udpv6_prot_saved->recvmsg(sk, msg, len, noblock, flags,
+						 addr_len);
+#endif
+	return udp_prot.recvmsg(sk, msg, len, noblock, flags, addr_len);
+}
+
+static int udp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+			   int nonblock, int flags, int *addr_len)
+{
+	struct sk_psock *psock;
+	int copied, ret;
+
+	if (unlikely(flags & MSG_ERRQUEUE))
+		return inet_recv_error(sk, msg, len, addr_len);
+
+	psock = sk_psock_get(sk);
+	if (unlikely(!psock))
+		return sk_udp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
+
+	lock_sock(sk);
+	if (sk_psock_queue_empty(psock)) {
+		ret = sk_udp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
+		goto out;
+	}
+
+msg_bytes_ready:
+	copied = sk_msg_recvmsg(sk, psock, msg, len, flags);
+	if (!copied) {
+		int data, err = 0;
+		long timeo;
+
+		timeo = sock_rcvtimeo(sk, nonblock);
+		data = sk_msg_wait_data(sk, psock, flags, timeo, &err);
+		if (data) {
+			if (!sk_psock_queue_empty(psock))
+				goto msg_bytes_ready;
+			ret = sk_udp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
+			goto out;
+		}
+		if (err) {
+			ret = err;
+			goto out;
+		}
+		copied = -EAGAIN;
+	}
+	ret = copied;
+out:
+	release_sock(sk);
+	sk_psock_put(sk, psock);
+	return ret;
+}
 
 enum {
 	UDP_BPF_IPV4,
@@ -11,7 +73,6 @@ enum {
 	UDP_BPF_NUM_PROTS,
 };
 
-static struct proto *udpv6_prot_saved __read_mostly;
 static DEFINE_SPINLOCK(udpv6_prot_lock);
 static struct proto udp_bpf_prots[UDP_BPF_NUM_PROTS];
 
@@ -20,6 +81,7 @@ static void udp_bpf_rebuild_protos(struct proto *prot, const struct proto *base)
 	*prot        = *base;
 	prot->unhash = sock_map_unhash;
 	prot->close  = sock_map_close;
+	prot->recvmsg = udp_bpf_recvmsg;
 }
 
 static void udp_bpf_check_v6_needs_rebuild(struct proto *ops)

From 122e6c79efe1c25816118aca9cfabe54e99c2432 Mon Sep 17 00:00:00 2001
From: Cong Wang <cong.wang@bytedance.com>
Date: Tue, 30 Mar 2021 19:32:35 -0700
Subject: [PATCH 66/68] sock_map: Update sock type checks for UDP

Now UDP supports sockmap and redirection, we can safely update
the sock type checks for it accordingly.

Signed-off-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20210331023237.41094-15-xiyou.wangcong@gmail.com
---
 net/core/sock_map.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index 2915c7c8778b..3d190d22b0d8 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -535,7 +535,10 @@ static bool sk_is_udp(const struct sock *sk)
 
 static bool sock_map_redirect_allowed(const struct sock *sk)
 {
-	return sk_is_tcp(sk) && sk->sk_state != TCP_LISTEN;
+	if (sk_is_tcp(sk))
+		return sk->sk_state != TCP_LISTEN;
+	else
+		return sk->sk_state == TCP_ESTABLISHED;
 }
 
 static bool sock_map_sk_is_suitable(const struct sock *sk)

From d6378af615275435ce6e390a538c980ac19b6659 Mon Sep 17 00:00:00 2001
From: Cong Wang <cong.wang@bytedance.com>
Date: Tue, 30 Mar 2021 19:32:36 -0700
Subject: [PATCH 67/68] selftests/bpf: Add a test case for udp sockmap

Add a test case to ensure redirection between two UDP sockets work.

Signed-off-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20210331023237.41094-16-xiyou.wangcong@gmail.com
---
 .../selftests/bpf/prog_tests/sockmap_listen.c | 136 ++++++++++++++++++
 .../selftests/bpf/progs/test_sockmap_listen.c |  22 +++
 2 files changed, 158 insertions(+)

diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c
index c26e6bf05e49..648d9ae898d2 100644
--- a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c
+++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c
@@ -1603,6 +1603,141 @@ static void test_reuseport(struct test_sockmap_listen *skel,
 	}
 }
 
+static void udp_redir_to_connected(int family, int sotype, int sock_mapfd,
+				   int verd_mapfd, enum redir_mode mode)
+{
+	const char *log_prefix = redir_mode_str(mode);
+	struct sockaddr_storage addr;
+	int c0, c1, p0, p1;
+	unsigned int pass;
+	socklen_t len;
+	int err, n;
+	u64 value;
+	u32 key;
+	char b;
+
+	zero_verdict_count(verd_mapfd);
+
+	p0 = socket_loopback(family, sotype | SOCK_NONBLOCK);
+	if (p0 < 0)
+		return;
+	len = sizeof(addr);
+	err = xgetsockname(p0, sockaddr(&addr), &len);
+	if (err)
+		goto close_peer0;
+
+	c0 = xsocket(family, sotype | SOCK_NONBLOCK, 0);
+	if (c0 < 0)
+		goto close_peer0;
+	err = xconnect(c0, sockaddr(&addr), len);
+	if (err)
+		goto close_cli0;
+	err = xgetsockname(c0, sockaddr(&addr), &len);
+	if (err)
+		goto close_cli0;
+	err = xconnect(p0, sockaddr(&addr), len);
+	if (err)
+		goto close_cli0;
+
+	p1 = socket_loopback(family, sotype | SOCK_NONBLOCK);
+	if (p1 < 0)
+		goto close_cli0;
+	err = xgetsockname(p1, sockaddr(&addr), &len);
+	if (err)
+		goto close_cli0;
+
+	c1 = xsocket(family, sotype | SOCK_NONBLOCK, 0);
+	if (c1 < 0)
+		goto close_peer1;
+	err = xconnect(c1, sockaddr(&addr), len);
+	if (err)
+		goto close_cli1;
+	err = xgetsockname(c1, sockaddr(&addr), &len);
+	if (err)
+		goto close_cli1;
+	err = xconnect(p1, sockaddr(&addr), len);
+	if (err)
+		goto close_cli1;
+
+	key = 0;
+	value = p0;
+	err = xbpf_map_update_elem(sock_mapfd, &key, &value, BPF_NOEXIST);
+	if (err)
+		goto close_cli1;
+
+	key = 1;
+	value = p1;
+	err = xbpf_map_update_elem(sock_mapfd, &key, &value, BPF_NOEXIST);
+	if (err)
+		goto close_cli1;
+
+	n = write(c1, "a", 1);
+	if (n < 0)
+		FAIL_ERRNO("%s: write", log_prefix);
+	if (n == 0)
+		FAIL("%s: incomplete write", log_prefix);
+	if (n < 1)
+		goto close_cli1;
+
+	key = SK_PASS;
+	err = xbpf_map_lookup_elem(verd_mapfd, &key, &pass);
+	if (err)
+		goto close_cli1;
+	if (pass != 1)
+		FAIL("%s: want pass count 1, have %d", log_prefix, pass);
+
+	n = read(mode == REDIR_INGRESS ? p0 : c0, &b, 1);
+	if (n < 0)
+		FAIL_ERRNO("%s: read", log_prefix);
+	if (n == 0)
+		FAIL("%s: incomplete read", log_prefix);
+
+close_cli1:
+	xclose(c1);
+close_peer1:
+	xclose(p1);
+close_cli0:
+	xclose(c0);
+close_peer0:
+	xclose(p0);
+}
+
+static void udp_skb_redir_to_connected(struct test_sockmap_listen *skel,
+				       struct bpf_map *inner_map, int family)
+{
+	int verdict = bpf_program__fd(skel->progs.prog_skb_verdict);
+	int verdict_map = bpf_map__fd(skel->maps.verdict_map);
+	int sock_map = bpf_map__fd(inner_map);
+	int err;
+
+	err = xbpf_prog_attach(verdict, sock_map, BPF_SK_SKB_VERDICT, 0);
+	if (err)
+		return;
+
+	skel->bss->test_ingress = false;
+	udp_redir_to_connected(family, SOCK_DGRAM, sock_map, verdict_map,
+			       REDIR_EGRESS);
+	skel->bss->test_ingress = true;
+	udp_redir_to_connected(family, SOCK_DGRAM, sock_map, verdict_map,
+			       REDIR_INGRESS);
+
+	xbpf_prog_detach2(verdict, sock_map, BPF_SK_SKB_VERDICT);
+}
+
+static void test_udp_redir(struct test_sockmap_listen *skel, struct bpf_map *map,
+			   int family)
+{
+	const char *family_name, *map_name;
+	char s[MAX_TEST_NAME];
+
+	family_name = family_str(family);
+	map_name = map_type_str(map);
+	snprintf(s, sizeof(s), "%s %s %s", map_name, family_name, __func__);
+	if (!test__start_subtest(s))
+		return;
+	udp_skb_redir_to_connected(skel, map, family);
+}
+
 static void run_tests(struct test_sockmap_listen *skel, struct bpf_map *map,
 		      int family)
 {
@@ -1611,6 +1746,7 @@ static void run_tests(struct test_sockmap_listen *skel, struct bpf_map *map,
 	test_redir(skel, map, family, SOCK_STREAM);
 	test_reuseport(skel, map, family, SOCK_STREAM);
 	test_reuseport(skel, map, family, SOCK_DGRAM);
+	test_udp_redir(skel, map, family);
 }
 
 void test_sockmap_listen(void)
diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_listen.c b/tools/testing/selftests/bpf/progs/test_sockmap_listen.c
index fa221141e9c1..a39eba9f5201 100644
--- a/tools/testing/selftests/bpf/progs/test_sockmap_listen.c
+++ b/tools/testing/selftests/bpf/progs/test_sockmap_listen.c
@@ -29,6 +29,7 @@ struct {
 } verdict_map SEC(".maps");
 
 static volatile bool test_sockmap; /* toggled by user-space */
+static volatile bool test_ingress; /* toggled by user-space */
 
 SEC("sk_skb/stream_parser")
 int prog_stream_parser(struct __sk_buff *skb)
@@ -55,6 +56,27 @@ int prog_stream_verdict(struct __sk_buff *skb)
 	return verdict;
 }
 
+SEC("sk_skb/skb_verdict")
+int prog_skb_verdict(struct __sk_buff *skb)
+{
+	unsigned int *count;
+	__u32 zero = 0;
+	int verdict;
+
+	if (test_sockmap)
+		verdict = bpf_sk_redirect_map(skb, &sock_map, zero,
+					      test_ingress ? BPF_F_INGRESS : 0);
+	else
+		verdict = bpf_sk_redirect_hash(skb, &sock_hash, &zero,
+					       test_ingress ? BPF_F_INGRESS : 0);
+
+	count = bpf_map_lookup_elem(&verdict_map, &verdict);
+	if (count)
+		(*count)++;
+
+	return verdict;
+}
+
 SEC("sk_msg")
 int prog_msg_verdict(struct sk_msg_md *msg)
 {

From 8d7cb74f2ccb5486ab8c631a8fcdc7621bbbc42c Mon Sep 17 00:00:00 2001
From: Cong Wang <cong.wang@bytedance.com>
Date: Tue, 30 Mar 2021 19:32:37 -0700
Subject: [PATCH 68/68] selftests/bpf: Add a test case for loading
 BPF_SK_SKB_VERDICT

This adds a test case to ensure BPF_SK_SKB_VERDICT and
BPF_SK_STREAM_VERDICT will never be attached at the same time.

Signed-off-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20210331023237.41094-17-xiyou.wangcong@gmail.com
---
 .../selftests/bpf/prog_tests/sockmap_basic.c  | 40 +++++++++++++++++++
 .../progs/test_sockmap_skb_verdict_attach.c   | 18 +++++++++
 2 files changed, 58 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/progs/test_sockmap_skb_verdict_attach.c

diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c
index b8b48cac2ac3..ab77596b64e3 100644
--- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c
+++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c
@@ -7,6 +7,7 @@
 #include "test_skmsg_load_helpers.skel.h"
 #include "test_sockmap_update.skel.h"
 #include "test_sockmap_invalid_update.skel.h"
+#include "test_sockmap_skb_verdict_attach.skel.h"
 #include "bpf_iter_sockmap.skel.h"
 
 #define TCP_REPAIR		19	/* TCP sock is under repair right now */
@@ -281,6 +282,39 @@ static void test_sockmap_copy(enum bpf_map_type map_type)
 	bpf_iter_sockmap__destroy(skel);
 }
 
+static void test_sockmap_skb_verdict_attach(enum bpf_attach_type first,
+					    enum bpf_attach_type second)
+{
+	struct test_sockmap_skb_verdict_attach *skel;
+	int err, map, verdict;
+
+	skel = test_sockmap_skb_verdict_attach__open_and_load();
+	if (CHECK_FAIL(!skel)) {
+		perror("test_sockmap_skb_verdict_attach__open_and_load");
+		return;
+	}
+
+	verdict = bpf_program__fd(skel->progs.prog_skb_verdict);
+	map = bpf_map__fd(skel->maps.sock_map);
+
+	err = bpf_prog_attach(verdict, map, first, 0);
+	if (CHECK_FAIL(err)) {
+		perror("bpf_prog_attach");
+		goto out;
+	}
+
+	err = bpf_prog_attach(verdict, map, second, 0);
+	assert(err == -1 && errno == EBUSY);
+
+	err = bpf_prog_detach2(verdict, map, first);
+	if (CHECK_FAIL(err)) {
+		perror("bpf_prog_detach2");
+		goto out;
+	}
+out:
+	test_sockmap_skb_verdict_attach__destroy(skel);
+}
+
 void test_sockmap_basic(void)
 {
 	if (test__start_subtest("sockmap create_update_free"))
@@ -301,4 +335,10 @@ void test_sockmap_basic(void)
 		test_sockmap_copy(BPF_MAP_TYPE_SOCKMAP);
 	if (test__start_subtest("sockhash copy"))
 		test_sockmap_copy(BPF_MAP_TYPE_SOCKHASH);
+	if (test__start_subtest("sockmap skb_verdict attach")) {
+		test_sockmap_skb_verdict_attach(BPF_SK_SKB_VERDICT,
+						BPF_SK_SKB_STREAM_VERDICT);
+		test_sockmap_skb_verdict_attach(BPF_SK_SKB_STREAM_VERDICT,
+						BPF_SK_SKB_VERDICT);
+	}
 }
diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_skb_verdict_attach.c b/tools/testing/selftests/bpf/progs/test_sockmap_skb_verdict_attach.c
new file mode 100644
index 000000000000..2d31f66e4f23
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_sockmap_skb_verdict_attach.c
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+
+struct {
+	__uint(type, BPF_MAP_TYPE_SOCKMAP);
+	__uint(max_entries, 2);
+	__type(key, __u32);
+	__type(value, __u64);
+} sock_map SEC(".maps");
+
+SEC("sk_skb/skb_verdict")
+int prog_skb_verdict(struct __sk_buff *skb)
+{
+	return SK_DROP;
+}
+
+char _license[] SEC("license") = "GPL";