From 0db63c0b86e981a1e97d2596d64ceceba1a5470e Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 26 Apr 2024 17:25:44 -0700 Subject: [PATCH 001/119] bpf: Fix verifier assumptions about socket->sk The verifier assumes that 'sk' field in 'struct socket' is valid and non-NULL when 'socket' pointer itself is trusted and non-NULL. That may not be the case when socket was just created and passed to LSM socket_accept hook. Fix this verifier assumption and adjust tests. Reported-by: Liam Wisehart Acked-by: Kumar Kartikeya Dwivedi Fixes: 6fcd486b3a0a ("bpf: Refactor RCU enforcement in the verifier.") Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/r/20240427002544.68803-1-alexei.starovoitov@gmail.com Signed-off-by: Martin KaFai Lau --- kernel/bpf/verifier.c | 23 +++++++++++++++---- .../bpf/progs/bench_local_storage_create.c | 5 ++-- .../selftests/bpf/progs/local_storage.c | 20 ++++++++-------- .../testing/selftests/bpf/progs/lsm_cgroup.c | 8 +++++-- 4 files changed, 38 insertions(+), 18 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 87ff414899cf..5d42db05315e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2368,6 +2368,8 @@ static void mark_btf_ld_reg(struct bpf_verifier_env *env, regs[regno].type = PTR_TO_BTF_ID | flag; regs[regno].btf = btf; regs[regno].btf_id = btf_id; + if (type_may_be_null(flag)) + regs[regno].id = ++env->id_gen; } #define DEF_NOT_SUBREG (0) @@ -5400,8 +5402,6 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, */ mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, kptr_field->kptr.btf, kptr_field->kptr.btf_id, btf_ld_kptr_type(env, kptr_field)); - /* For mark_ptr_or_null_reg */ - val_reg->id = ++env->id_gen; } else if (class == BPF_STX) { val_reg = reg_state(env, value_regno); if (!register_is_null(val_reg) && @@ -5719,7 +5719,8 @@ static bool is_trusted_reg(const struct bpf_reg_state *reg) return true; /* Types listed in the reg2btf_ids are always trusted */ - if (reg2btf_ids[base_type(reg->type)]) + if (reg2btf_ids[base_type(reg->type)] && + !bpf_type_has_unsafe_modifiers(reg->type)) return true; /* If a register is not referenced, it is trusted if it has the @@ -6339,6 +6340,7 @@ static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val, #define BTF_TYPE_SAFE_RCU(__type) __PASTE(__type, __safe_rcu) #define BTF_TYPE_SAFE_RCU_OR_NULL(__type) __PASTE(__type, __safe_rcu_or_null) #define BTF_TYPE_SAFE_TRUSTED(__type) __PASTE(__type, __safe_trusted) +#define BTF_TYPE_SAFE_TRUSTED_OR_NULL(__type) __PASTE(__type, __safe_trusted_or_null) /* * Allow list few fields as RCU trusted or full trusted. @@ -6402,7 +6404,7 @@ BTF_TYPE_SAFE_TRUSTED(struct dentry) { struct inode *d_inode; }; -BTF_TYPE_SAFE_TRUSTED(struct socket) { +BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct socket) { struct sock *sk; }; @@ -6437,11 +6439,20 @@ static bool type_is_trusted(struct bpf_verifier_env *env, BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct linux_binprm)); BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct file)); BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct dentry)); - BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct socket)); return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_trusted"); } +static bool type_is_trusted_or_null(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, + const char *field_name, u32 btf_id) +{ + BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct socket)); + + return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, + "__safe_trusted_or_null"); +} + static int check_ptr_to_btf_access(struct bpf_verifier_env *env, struct bpf_reg_state *regs, int regno, int off, int size, @@ -6550,6 +6561,8 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, */ if (type_is_trusted(env, reg, field_name, btf_id)) { flag |= PTR_TRUSTED; + } else if (type_is_trusted_or_null(env, reg, field_name, btf_id)) { + flag |= PTR_TRUSTED | PTR_MAYBE_NULL; } else if (in_rcu_cs(env) && !type_may_be_null(reg->type)) { if (type_is_rcu(env, reg, field_name, btf_id)) { /* ignore __rcu tag and mark it MEM_RCU */ diff --git a/tools/testing/selftests/bpf/progs/bench_local_storage_create.c b/tools/testing/selftests/bpf/progs/bench_local_storage_create.c index e4bfbba6c193..c8ec0d0368e4 100644 --- a/tools/testing/selftests/bpf/progs/bench_local_storage_create.c +++ b/tools/testing/selftests/bpf/progs/bench_local_storage_create.c @@ -61,14 +61,15 @@ SEC("lsm.s/socket_post_create") int BPF_PROG(socket_post_create, struct socket *sock, int family, int type, int protocol, int kern) { + struct sock *sk = sock->sk; struct storage *stg; __u32 pid; pid = bpf_get_current_pid_tgid() >> 32; - if (pid != bench_pid) + if (pid != bench_pid || !sk) return 0; - stg = bpf_sk_storage_get(&sk_storage_map, sock->sk, NULL, + stg = bpf_sk_storage_get(&sk_storage_map, sk, NULL, BPF_LOCAL_STORAGE_GET_F_CREATE); if (stg) diff --git a/tools/testing/selftests/bpf/progs/local_storage.c b/tools/testing/selftests/bpf/progs/local_storage.c index e5e3a8b8dd07..637e75df2e14 100644 --- a/tools/testing/selftests/bpf/progs/local_storage.c +++ b/tools/testing/selftests/bpf/progs/local_storage.c @@ -140,11 +140,12 @@ int BPF_PROG(socket_bind, struct socket *sock, struct sockaddr *address, { __u32 pid = bpf_get_current_pid_tgid() >> 32; struct local_storage *storage; + struct sock *sk = sock->sk; - if (pid != monitored_pid) + if (pid != monitored_pid || !sk) return 0; - storage = bpf_sk_storage_get(&sk_storage_map, sock->sk, 0, 0); + storage = bpf_sk_storage_get(&sk_storage_map, sk, 0, 0); if (!storage) return 0; @@ -155,24 +156,24 @@ int BPF_PROG(socket_bind, struct socket *sock, struct sockaddr *address, /* This tests that we can associate multiple elements * with the local storage. */ - storage = bpf_sk_storage_get(&sk_storage_map2, sock->sk, 0, + storage = bpf_sk_storage_get(&sk_storage_map2, sk, 0, BPF_LOCAL_STORAGE_GET_F_CREATE); if (!storage) return 0; - if (bpf_sk_storage_delete(&sk_storage_map2, sock->sk)) + if (bpf_sk_storage_delete(&sk_storage_map2, sk)) return 0; - storage = bpf_sk_storage_get(&sk_storage_map2, sock->sk, 0, + storage = bpf_sk_storage_get(&sk_storage_map2, sk, 0, BPF_LOCAL_STORAGE_GET_F_CREATE); if (!storage) return 0; - if (bpf_sk_storage_delete(&sk_storage_map, sock->sk)) + if (bpf_sk_storage_delete(&sk_storage_map, sk)) return 0; /* Ensure that the sk_storage_map is disconnected from the storage. */ - if (!sock->sk->sk_bpf_storage || sock->sk->sk_bpf_storage->smap) + if (!sk->sk_bpf_storage || sk->sk_bpf_storage->smap) return 0; sk_storage_result = 0; @@ -185,11 +186,12 @@ int BPF_PROG(socket_post_create, struct socket *sock, int family, int type, { __u32 pid = bpf_get_current_pid_tgid() >> 32; struct local_storage *storage; + struct sock *sk = sock->sk; - if (pid != monitored_pid) + if (pid != monitored_pid || !sk) return 0; - storage = bpf_sk_storage_get(&sk_storage_map, sock->sk, 0, + storage = bpf_sk_storage_get(&sk_storage_map, sk, 0, BPF_LOCAL_STORAGE_GET_F_CREATE); if (!storage) return 0; diff --git a/tools/testing/selftests/bpf/progs/lsm_cgroup.c b/tools/testing/selftests/bpf/progs/lsm_cgroup.c index 02c11d16b692..d7598538aa2d 100644 --- a/tools/testing/selftests/bpf/progs/lsm_cgroup.c +++ b/tools/testing/selftests/bpf/progs/lsm_cgroup.c @@ -103,11 +103,15 @@ static __always_inline int real_bind(struct socket *sock, int addrlen) { struct sockaddr_ll sa = {}; + struct sock *sk = sock->sk; - if (sock->sk->__sk_common.skc_family != AF_PACKET) + if (!sk) return 1; - if (sock->sk->sk_kern_sock) + if (sk->__sk_common.skc_family != AF_PACKET) + return 1; + + if (sk->sk_kern_sock) return 1; bpf_probe_read_kernel(&sa, sizeof(sa), address); From 397658ddc88ce3c21d2aa3bed8e15fc69dfec946 Mon Sep 17 00:00:00 2001 From: Tao Chen Date: Mon, 29 Apr 2024 00:10:32 +0800 Subject: [PATCH 002/119] samples/bpf: Add valid info for VMLINUX_BTF When I use the command 'make M=samples/bpf' to compile samples/bpf code in ubuntu 22.04, the error info occured: Cannot find a vmlinux for VMLINUX_BTF at any of " /home/ubuntu/code/linux/vmlinux", build the kernel or set VMLINUX_BTF or VMLINUX_H variable Others often encounter this kind of issue, new kernel has the vmlinux, so we can set the path in error info which seems more intuitive, like: Cannot find a vmlinux for VMLINUX_BTF at any of " /home/ubuntu/code/linux/vmlinux", buiild the kernel or set VMLINUX_BTF like "VMLINUX_BTF=/sys/kernel/btf/vmlinux" or VMLINUX_H variable Signed-off-by: Tao Chen Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240428161032.239043-1-chen.dylane@gmail.com --- samples/bpf/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 933f6c3fe6b0..9aa027b144df 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -337,7 +337,7 @@ $(obj)/vmlinux.h: $(VMLINUX_BTF) $(BPFTOOL) ifeq ($(VMLINUX_H),) ifeq ($(VMLINUX_BTF),) $(error Cannot find a vmlinux for VMLINUX_BTF at any of "$(VMLINUX_BTF_PATHS)",\ - build the kernel or set VMLINUX_BTF or VMLINUX_H variable) + build the kernel or set VMLINUX_BTF like "VMLINUX_BTF=/sys/kernel/btf/vmlinux" or VMLINUX_H variable) endif $(Q)$(BPFTOOL) btf dump file $(VMLINUX_BTF) format c > $@ else From cb01621b6d91567ac74c8b95e4db731febdbdec3 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 29 Apr 2024 15:13:22 +0300 Subject: [PATCH 003/119] bpf: Use struct_size() Use struct_size() instead of hand writing it. This is less verbose and more robust. Signed-off-by: Andy Shevchenko Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20240429121323.3818497-1-andriy.shevchenko@linux.intel.com --- kernel/bpf/core.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 95c7fd093e55..a8ca6dd6e614 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -2455,13 +2456,14 @@ EXPORT_SYMBOL(bpf_empty_prog_array); struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags) { - if (prog_cnt) - return kzalloc(sizeof(struct bpf_prog_array) + - sizeof(struct bpf_prog_array_item) * - (prog_cnt + 1), - flags); + struct bpf_prog_array *p; - return &bpf_empty_prog_array.hdr; + if (prog_cnt) + p = kzalloc(struct_size(p, items, prog_cnt + 1), flags); + else + p = &bpf_empty_prog_array.hdr; + + return p; } void bpf_prog_array_free(struct bpf_prog_array *progs) From a3034872cd90a6881ad4e10ca6d30e1215a99ada Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 29 Apr 2024 15:00:05 +0300 Subject: [PATCH 004/119] bpf: Switch to krealloc_array() Let the krealloc_array() copy the original data and check for a multiplication overflow. Signed-off-by: Andy Shevchenko Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20240429120005.3539116-1-andriy.shevchenko@linux.intel.com --- kernel/bpf/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index a8ca6dd6e614..99b8b1c9a248 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -850,7 +850,7 @@ int bpf_jit_add_poke_descriptor(struct bpf_prog *prog, return -EINVAL; } - tab = krealloc(tab, size * sizeof(*poke), GFP_KERNEL); + tab = krealloc_array(tab, size, sizeof(*poke), GFP_KERNEL); if (!tab) return -ENOMEM; From 19468ed51488dae19254e8a67c75d583b05fa5e3 Mon Sep 17 00:00:00 2001 From: Viktor Malik Date: Mon, 29 Apr 2024 13:23:11 +0200 Subject: [PATCH 005/119] selftests/bpf: Run cgroup1_hierarchy test in own mount namespace The cgroup1_hierarchy test uses setup_classid_environment to setup cgroupv1 environment. The problem is that the environment is set in /sys/fs/cgroup and therefore, if not run under an own mount namespace, effectively deletes all system cgroups: $ ls /sys/fs/cgroup | wc -l 27 $ sudo ./test_progs -t cgroup1_hierarchy #41/1 cgroup1_hierarchy/test_cgroup1_hierarchy:OK #41/2 cgroup1_hierarchy/test_root_cgid:OK #41/3 cgroup1_hierarchy/test_invalid_level:OK #41/4 cgroup1_hierarchy/test_invalid_cgid:OK #41/5 cgroup1_hierarchy/test_invalid_hid:OK #41/6 cgroup1_hierarchy/test_invalid_cgrp_name:OK #41/7 cgroup1_hierarchy/test_invalid_cgrp_name2:OK #41/8 cgroup1_hierarchy/test_sleepable_prog:OK #41 cgroup1_hierarchy:OK Summary: 1/8 PASSED, 0 SKIPPED, 0 FAILED $ ls /sys/fs/cgroup | wc -l 1 To avoid this, run setup_cgroup_environment first which will create an own mount namespace. This only affects the cgroupv1_hierarchy test as all other cgroup1 test progs already run setup_cgroup_environment prior to running setup_classid_environment. Also add a comment to the header of setup_classid_environment to warn against this invalid usage in future. Fixes: 360769233cc9 ("selftests/bpf: Add selftests for cgroup1 hierarchy") Signed-off-by: Viktor Malik Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240429112311.402497-1-vmalik@redhat.com --- tools/testing/selftests/bpf/cgroup_helpers.c | 3 +++ tools/testing/selftests/bpf/prog_tests/cgroup1_hierarchy.c | 7 ++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/cgroup_helpers.c b/tools/testing/selftests/bpf/cgroup_helpers.c index f2952a65dcc2..23bb9a9e6a7d 100644 --- a/tools/testing/selftests/bpf/cgroup_helpers.c +++ b/tools/testing/selftests/bpf/cgroup_helpers.c @@ -508,6 +508,9 @@ int cgroup_setup_and_join(const char *path) { /** * setup_classid_environment() - Setup the cgroupv1 net_cls environment * + * This function should only be called in a custom mount namespace, e.g. + * created by running setup_cgroup_environment. + * * After calling this function, cleanup_classid_environment should be called * once testing is complete. * diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup1_hierarchy.c b/tools/testing/selftests/bpf/prog_tests/cgroup1_hierarchy.c index 74d6d7546f40..25332e596750 100644 --- a/tools/testing/selftests/bpf/prog_tests/cgroup1_hierarchy.c +++ b/tools/testing/selftests/bpf/prog_tests/cgroup1_hierarchy.c @@ -87,9 +87,12 @@ void test_cgroup1_hierarchy(void) goto destroy; /* Setup cgroup1 hierarchy */ + err = setup_cgroup_environment(); + if (!ASSERT_OK(err, "setup_cgroup_environment")) + goto destroy; err = setup_classid_environment(); if (!ASSERT_OK(err, "setup_classid_environment")) - goto destroy; + goto cleanup_cgroup; err = join_classid(); if (!ASSERT_OK(err, "join_cgroup1")) @@ -153,6 +156,8 @@ void test_cgroup1_hierarchy(void) cleanup: cleanup_classid_environment(); +cleanup_cgroup: + cleanup_cgroup_environment(); destroy: test_cgroup1_hierarchy__destroy(skel); } From 237c522c1d5d19e8d3057a38ce690c753020c7d1 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Mon, 29 Apr 2024 15:07:33 +0800 Subject: [PATCH 006/119] selftests/bpf: Free strdup memory in test_sockmap The strdup() function returns a pointer to a new string which is a duplicate of the string "ptr". Memory for the new string is obtained with malloc(), and need to be freed with free(). This patch adds these missing "free(ptr)" in check_whitelist() and check_blacklist() to avoid memory leaks in test_sockmap.c. Signed-off-by: Geliang Tang Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/b76f2f4c550aebe4ab8ea73d23c4cbe4f06ea996.1714374022.git.tanggeliang@kylinos.cn --- tools/testing/selftests/bpf/test_sockmap.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c index 43612de44fbf..92752f5eeded 100644 --- a/tools/testing/selftests/bpf/test_sockmap.c +++ b/tools/testing/selftests/bpf/test_sockmap.c @@ -1887,10 +1887,13 @@ static int check_whitelist(struct _test *t, struct sockmap_options *opt) while (entry) { if ((opt->prepend && strstr(opt->prepend, entry) != 0) || strstr(opt->map, entry) != 0 || - strstr(t->title, entry) != 0) + strstr(t->title, entry) != 0) { + free(ptr); return 0; + } entry = strtok(NULL, ","); } + free(ptr); return -EINVAL; } @@ -1907,10 +1910,13 @@ static int check_blacklist(struct _test *t, struct sockmap_options *opt) while (entry) { if ((opt->prepend && strstr(opt->prepend, entry) != 0) || strstr(opt->map, entry) != 0 || - strstr(t->title, entry) != 0) + strstr(t->title, entry) != 0) { + free(ptr); return 0; + } entry = strtok(NULL, ","); } + free(ptr); return -EINVAL; } From 25927d0a1bec5091d371693c9fdd9640478837de Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Mon, 29 Apr 2024 15:07:34 +0800 Subject: [PATCH 007/119] selftests/bpf: Free strdup memory in veristat The strdup() function returns a pointer to a new string which is a duplicate of the string "input". Memory for the new string is obtained with malloc(), and need to be freed with free(). This patch adds these missing "free(input)" in parse_stats() to avoid memory leak in veristat.c. Signed-off-by: Geliang Tang Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/ded44f8865cd7f337f52fc5fb0a5fbed7d6bd641.1714374022.git.tanggeliang@kylinos.cn --- tools/testing/selftests/bpf/veristat.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/veristat.c b/tools/testing/selftests/bpf/veristat.c index 244d4996e06e..b2854238d4a0 100644 --- a/tools/testing/selftests/bpf/veristat.c +++ b/tools/testing/selftests/bpf/veristat.c @@ -792,10 +792,13 @@ static int parse_stats(const char *stats_str, struct stat_specs *specs) while ((next = strtok_r(state ? NULL : input, ",", &state))) { err = parse_stat(next, specs); - if (err) + if (err) { + free(input); return err; + } } + free(input); return 0; } From cfd3bfe9507b4aa39f7e86772e60b50b799e490e Mon Sep 17 00:00:00 2001 From: Dmitrii Bundin Date: Sat, 20 Apr 2024 07:24:57 +0300 Subject: [PATCH 008/119] bpf: Include linux/types.h for u32 Inclusion of the header linux/btf_ids.h relies on indirect inclusion of the header linux/types.h. Including it directly on the top level helps to avoid potential problems if linux/types.h hasn't been included before. The main motivation to introduce this it is to avoid similar problems that have shown up in the bpftool where GNU libc indirectly pulls linux/types.h causing compile error of the form: error: unknown type name 'u32' u32 cnt; ^~~ The bpftool compile error was fixed in 62248b22d01e ("tools/resolve_btfids: fix build with musl libc"). Signed-off-by: Dmitrii Bundin Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240420042457.3198883-1-dmitrii.bundin.a@gmail.com --- include/linux/btf_ids.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h index e24aabfe8ecc..c0e3e1426a82 100644 --- a/include/linux/btf_ids.h +++ b/include/linux/btf_ids.h @@ -3,6 +3,8 @@ #ifndef _LINUX_BTF_IDS_H #define _LINUX_BTF_IDS_H +#include /* for u32 */ + struct btf_id_set { u32 cnt; u32 ids[]; From f973fccd43d34b096077d5d21d051ef75b22a7ea Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sat, 27 Apr 2024 20:09:53 -0700 Subject: [PATCH 009/119] libbpf: handle nulled-out program in struct_ops correctly If struct_ops has one of program callbacks set declaratively and host kernel is old and doesn't support this callback, libbpf will allow to load such struct_ops as long as that callback was explicitly nulled-out (presumably through skeleton). This is all working correctly, except we won't reset corresponding program slot to NULL before bailing out, which will lead to libbpf not detecting that BPF program has to be not auto-loaded. Fix this by unconditionally resetting corresponding program slot to NULL. Fixes: c911fc61a7ce ("libbpf: Skip zeroed or null fields if not found in the kernel type.") Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240428030954.3918764-1-andrii@kernel.org Signed-off-by: Martin KaFai Lau --- tools/lib/bpf/libbpf.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 97eb6e5dd7c8..898d5d34ecea 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -1148,6 +1148,7 @@ static int bpf_map__init_kern_struct_ops(struct bpf_map *map) * presented in the kernel BTF. */ if (libbpf_is_mem_zeroed(mdata, msize)) { + st_ops->progs[i] = NULL; pr_info("struct_ops %s: member %s not found in kernel, skipping it as it's set to zero\n", map->name, mname); continue; From 1bba3b3d373dbafae891e7cb06b8c82c8d62aba1 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sat, 27 Apr 2024 20:09:54 -0700 Subject: [PATCH 010/119] selftests/bpf: validate nulled-out struct_ops program is handled properly Add a selftests validating that it's possible to have some struct_ops callback set declaratively, then disable it (by setting to NULL) programmatically. Libbpf should detect that such program should not be loaded. Otherwise, it will unnecessarily fail the loading when the host kernel does not have the type information. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240428030954.3918764-2-andrii@kernel.org Signed-off-by: Martin KaFai Lau --- .../bpf/prog_tests/test_struct_ops_module.c | 18 ++++++++++++++++-- .../selftests/bpf/progs/struct_ops_module.c | 7 +++++++ 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c index 7cf2b9ddd3e1..bd39586abd5a 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c +++ b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c @@ -66,6 +66,7 @@ static void test_struct_ops_load(void) * auto-loading, or it will fail to load. */ bpf_program__set_autoload(skel->progs.test_2, false); + bpf_map__set_autocreate(skel->maps.testmod_zeroed, false); err = struct_ops_module__load(skel); if (!ASSERT_OK(err, "struct_ops_module_load")) @@ -103,6 +104,10 @@ static void test_struct_ops_not_zeroed(void) if (!ASSERT_OK_PTR(skel, "struct_ops_module_open")) return; + skel->struct_ops.testmod_zeroed->zeroed = 0; + /* zeroed_op prog should be not loaded automatically now */ + skel->struct_ops.testmod_zeroed->zeroed_op = NULL; + err = struct_ops_module__load(skel); ASSERT_OK(err, "struct_ops_module_load"); @@ -118,6 +123,7 @@ static void test_struct_ops_not_zeroed(void) * value of "zeroed" is non-zero. */ skel->struct_ops.testmod_zeroed->zeroed = 0xdeadbeef; + skel->struct_ops.testmod_zeroed->zeroed_op = NULL; err = struct_ops_module__load(skel); ASSERT_ERR(err, "struct_ops_module_load_not_zeroed"); @@ -148,15 +154,23 @@ static void test_struct_ops_incompatible(void) { struct struct_ops_module *skel; struct bpf_link *link; + int err; - skel = struct_ops_module__open_and_load(); - if (!ASSERT_OK_PTR(skel, "open_and_load")) + skel = struct_ops_module__open(); + if (!ASSERT_OK_PTR(skel, "struct_ops_module_open")) return; + bpf_map__set_autocreate(skel->maps.testmod_zeroed, false); + + err = struct_ops_module__load(skel); + if (!ASSERT_OK(err, "skel_load")) + goto cleanup; + link = bpf_map__attach_struct_ops(skel->maps.testmod_incompatible); if (ASSERT_OK_PTR(link, "attach_struct_ops")) bpf_link__destroy(link); +cleanup: struct_ops_module__destroy(skel); } diff --git a/tools/testing/selftests/bpf/progs/struct_ops_module.c b/tools/testing/selftests/bpf/progs/struct_ops_module.c index 63b065dae002..40109be2b3ae 100644 --- a/tools/testing/selftests/bpf/progs/struct_ops_module.c +++ b/tools/testing/selftests/bpf/progs/struct_ops_module.c @@ -63,10 +63,17 @@ struct bpf_testmod_ops___zeroed { int zeroed; }; +SEC("?struct_ops/test_3") +int BPF_PROG(zeroed_op) +{ + return 1; +} + SEC(".struct_ops.link") struct bpf_testmod_ops___zeroed testmod_zeroed = { .test_1 = (void *)test_1, .test_2 = (void *)test_2_v2, + .zeroed_op = (void *)zeroed_op, }; struct bpf_testmod_ops___incompatible { From b98a5c68ccaa94e93b9e898091fe2cf21c1500e6 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Tue, 30 Apr 2024 12:43:24 +0200 Subject: [PATCH 011/119] bpf: Do not walk twice the map on free If someone stores both a timer and a workqueue in a map, on free we would walk it twice. Add a check in array_map_free_timers_wq and free the timers and workqueues if they are present. Fixes: 246331e3f1ea ("bpf: allow struct bpf_wq to be embedded in arraymaps and hashmaps") Signed-off-by: Benjamin Tissoires Signed-off-by: Daniel Borkmann Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/bpf/20240430-bpf-next-v3-1-27afe7f3b17c@kernel.org --- kernel/bpf/arraymap.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 580d07b15471..feabc0193852 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -436,13 +436,14 @@ static void array_map_free_timers_wq(struct bpf_map *map) /* We don't reset or free fields other than timer and workqueue * on uref dropping to zero. */ - if (btf_record_has_field(map->record, BPF_TIMER)) - for (i = 0; i < array->map.max_entries; i++) - bpf_obj_free_timer(map->record, array_map_elem_ptr(array, i)); - - if (btf_record_has_field(map->record, BPF_WORKQUEUE)) - for (i = 0; i < array->map.max_entries; i++) - bpf_obj_free_workqueue(map->record, array_map_elem_ptr(array, i)); + if (btf_record_has_field(map->record, BPF_TIMER | BPF_WORKQUEUE)) { + for (i = 0; i < array->map.max_entries; i++) { + if (btf_record_has_field(map->record, BPF_TIMER)) + bpf_obj_free_timer(map->record, array_map_elem_ptr(array, i)); + if (btf_record_has_field(map->record, BPF_WORKQUEUE)) + bpf_obj_free_workqueue(map->record, array_map_elem_ptr(array, i)); + } + } } /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ From a891711d0166133ec5120615fcf365d9745d82b2 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Tue, 30 Apr 2024 12:43:25 +0200 Subject: [PATCH 012/119] bpf: Do not walk twice the hash map on free If someone stores both a timer and a workqueue in a hash map, on free, we would walk it twice. Add a check in htab_free_malloced_timers_or_wq and free the timers and workqueues if they are present. Fixes: 246331e3f1ea ("bpf: allow struct bpf_wq to be embedded in arraymaps and hashmaps") Signed-off-by: Benjamin Tissoires Signed-off-by: Daniel Borkmann Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/bpf/20240430-bpf-next-v3-2-27afe7f3b17c@kernel.org --- kernel/bpf/hashtab.c | 49 ++++++++++++-------------------------------- 1 file changed, 13 insertions(+), 36 deletions(-) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 0179183c543a..06115f8728e8 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -221,13 +221,11 @@ static bool htab_has_extra_elems(struct bpf_htab *htab) return !htab_is_percpu(htab) && !htab_is_lru(htab); } -static void htab_free_prealloced_timers(struct bpf_htab *htab) +static void htab_free_prealloced_timers_and_wq(struct bpf_htab *htab) { u32 num_entries = htab->map.max_entries; int i; - if (!btf_record_has_field(htab->map.record, BPF_TIMER)) - return; if (htab_has_extra_elems(htab)) num_entries += num_possible_cpus(); @@ -235,27 +233,12 @@ static void htab_free_prealloced_timers(struct bpf_htab *htab) struct htab_elem *elem; elem = get_htab_elem(htab, i); - bpf_obj_free_timer(htab->map.record, elem->key + round_up(htab->map.key_size, 8)); - cond_resched(); - } -} - -static void htab_free_prealloced_wq(struct bpf_htab *htab) -{ - u32 num_entries = htab->map.max_entries; - int i; - - if (!btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) - return; - if (htab_has_extra_elems(htab)) - num_entries += num_possible_cpus(); - - for (i = 0; i < num_entries; i++) { - struct htab_elem *elem; - - elem = get_htab_elem(htab, i); - bpf_obj_free_workqueue(htab->map.record, - elem->key + round_up(htab->map.key_size, 8)); + if (btf_record_has_field(htab->map.record, BPF_TIMER)) + bpf_obj_free_timer(htab->map.record, + elem->key + round_up(htab->map.key_size, 8)); + if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) + bpf_obj_free_workqueue(htab->map.record, + elem->key + round_up(htab->map.key_size, 8)); cond_resched(); } } @@ -1515,7 +1498,7 @@ static void delete_all_elements(struct bpf_htab *htab) migrate_enable(); } -static void htab_free_malloced_timers_or_wq(struct bpf_htab *htab, bool is_timer) +static void htab_free_malloced_timers_and_wq(struct bpf_htab *htab) { int i; @@ -1527,10 +1510,10 @@ static void htab_free_malloced_timers_or_wq(struct bpf_htab *htab, bool is_timer hlist_nulls_for_each_entry(l, n, head, hash_node) { /* We only free timer on uref dropping to zero */ - if (is_timer) + if (btf_record_has_field(htab->map.record, BPF_TIMER)) bpf_obj_free_timer(htab->map.record, l->key + round_up(htab->map.key_size, 8)); - else + if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) bpf_obj_free_workqueue(htab->map.record, l->key + round_up(htab->map.key_size, 8)); } @@ -1544,17 +1527,11 @@ static void htab_map_free_timers_and_wq(struct bpf_map *map) struct bpf_htab *htab = container_of(map, struct bpf_htab, map); /* We only free timer and workqueue on uref dropping to zero */ - if (btf_record_has_field(htab->map.record, BPF_TIMER)) { + if (btf_record_has_field(htab->map.record, BPF_TIMER | BPF_WORKQUEUE)) { if (!htab_is_prealloc(htab)) - htab_free_malloced_timers_or_wq(htab, true); + htab_free_malloced_timers_and_wq(htab); else - htab_free_prealloced_timers(htab); - } - if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) { - if (!htab_is_prealloc(htab)) - htab_free_malloced_timers_or_wq(htab, false); - else - htab_free_prealloced_wq(htab); + htab_free_prealloced_timers_and_wq(htab); } } From 05cbc217aafbc631a6c2fab4accf95850cb48358 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Tue, 30 Apr 2024 12:43:26 +0200 Subject: [PATCH 013/119] selftests/bpf: Drop an unused local variable Some copy/paste leftover, this is never used. Fixes: e3d9eac99afd ("selftests/bpf: wq: add bpf_wq_init() checks") Signed-off-by: Benjamin Tissoires Signed-off-by: Daniel Borkmann Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/bpf/20240430-bpf-next-v3-3-27afe7f3b17c@kernel.org --- tools/testing/selftests/bpf/prog_tests/wq.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/wq.c b/tools/testing/selftests/bpf/prog_tests/wq.c index c4bacd3160e1..99e438fe12ac 100644 --- a/tools/testing/selftests/bpf/prog_tests/wq.c +++ b/tools/testing/selftests/bpf/prog_tests/wq.c @@ -36,7 +36,5 @@ void serial_test_wq(void) void serial_test_failures_wq(void) { - LIBBPF_OPTS(bpf_test_run_opts, topts); - RUN_TESTS(wq_failures); } From 535a3692ba7245792e6f23654507865d4293c850 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 30 Apr 2024 13:28:24 +0200 Subject: [PATCH 014/119] bpf: Add support for kprobe session attach Adding support to attach bpf program for entry and return probe of the same function. This is common use case which at the moment requires to create two kprobe multi links. Adding new BPF_TRACE_KPROBE_SESSION attach type that instructs kernel to attach single link program to both entry and exit probe. It's possible to control execution of the bpf program on return probe simply by returning zero or non zero from the entry bpf program execution to execute or not the bpf program on return probe respectively. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240430112830.1184228-2-jolsa@kernel.org --- include/uapi/linux/bpf.h | 1 + kernel/bpf/syscall.c | 7 ++++++- kernel/trace/bpf_trace.c | 28 ++++++++++++++++++++-------- tools/include/uapi/linux/bpf.h | 1 + 4 files changed, 28 insertions(+), 9 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d94a72593ead..90706a47f6ff 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1115,6 +1115,7 @@ enum bpf_attach_type { BPF_CGROUP_UNIX_GETSOCKNAME, BPF_NETKIT_PRIMARY, BPF_NETKIT_PEER, + BPF_TRACE_KPROBE_SESSION, __MAX_BPF_ATTACH_TYPE }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index f655adf42e39..13ad74ecf2cd 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -4016,11 +4016,15 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI && attach_type != BPF_TRACE_KPROBE_MULTI) return -EINVAL; + if (prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION && + attach_type != BPF_TRACE_KPROBE_SESSION) + return -EINVAL; if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI && attach_type != BPF_TRACE_UPROBE_MULTI) return -EINVAL; if (attach_type != BPF_PERF_EVENT && attach_type != BPF_TRACE_KPROBE_MULTI && + attach_type != BPF_TRACE_KPROBE_SESSION && attach_type != BPF_TRACE_UPROBE_MULTI) return -EINVAL; return 0; @@ -5281,7 +5285,8 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) case BPF_PROG_TYPE_KPROBE: if (attr->link_create.attach_type == BPF_PERF_EVENT) ret = bpf_perf_link_attach(attr, prog); - else if (attr->link_create.attach_type == BPF_TRACE_KPROBE_MULTI) + else if (attr->link_create.attach_type == BPF_TRACE_KPROBE_MULTI || + attr->link_create.attach_type == BPF_TRACE_KPROBE_SESSION) ret = bpf_kprobe_multi_link_attach(attr, prog); else if (attr->link_create.attach_type == BPF_TRACE_UPROBE_MULTI) ret = bpf_uprobe_multi_link_attach(attr, prog); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 0ba722b57af3..06a9671834b6 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1631,6 +1631,17 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } } +static bool is_kprobe_multi(const struct bpf_prog *prog) +{ + return prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI || + prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION; +} + +static inline bool is_kprobe_session(const struct bpf_prog *prog) +{ + return prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION; +} + static const struct bpf_func_proto * kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -1646,13 +1657,13 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_override_return_proto; #endif case BPF_FUNC_get_func_ip: - if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI) + if (is_kprobe_multi(prog)) return &bpf_get_func_ip_proto_kprobe_multi; if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI) return &bpf_get_func_ip_proto_uprobe_multi; return &bpf_get_func_ip_proto_kprobe; case BPF_FUNC_get_attach_cookie: - if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI) + if (is_kprobe_multi(prog)) return &bpf_get_attach_cookie_proto_kmulti; if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI) return &bpf_get_attach_cookie_proto_umulti; @@ -2834,10 +2845,11 @@ kprobe_multi_link_handler(struct fprobe *fp, unsigned long fentry_ip, void *data) { struct bpf_kprobe_multi_link *link; + int err; link = container_of(fp, struct bpf_kprobe_multi_link, fp); - kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs); - return 0; + err = kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs); + return is_kprobe_session(link->link.prog) ? err : 0; } static void @@ -2981,7 +2993,7 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr if (sizeof(u64) != sizeof(void *)) return -EOPNOTSUPP; - if (prog->expected_attach_type != BPF_TRACE_KPROBE_MULTI) + if (!is_kprobe_multi(prog)) return -EINVAL; flags = attr->link_create.kprobe_multi.flags; @@ -3062,10 +3074,10 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr if (err) goto error; - if (flags & BPF_F_KPROBE_MULTI_RETURN) - link->fp.exit_handler = kprobe_multi_link_exit_handler; - else + if (!(flags & BPF_F_KPROBE_MULTI_RETURN)) link->fp.entry_handler = kprobe_multi_link_handler; + if ((flags & BPF_F_KPROBE_MULTI_RETURN) || is_kprobe_session(prog)) + link->fp.exit_handler = kprobe_multi_link_exit_handler; link->addrs = addrs; link->cookies = cookies; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index d94a72593ead..90706a47f6ff 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1115,6 +1115,7 @@ enum bpf_attach_type { BPF_CGROUP_UNIX_GETSOCKNAME, BPF_NETKIT_PRIMARY, BPF_NETKIT_PEER, + BPF_TRACE_KPROBE_SESSION, __MAX_BPF_ATTACH_TYPE }; From adf46d88ae4b2557f7e2e02547a25fb866935711 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 30 Apr 2024 13:28:25 +0200 Subject: [PATCH 015/119] bpf: Add support for kprobe session context Adding struct bpf_session_run_ctx object to hold session related data, which is atm is_return bool and data pointer coming in following changes. Placing bpf_session_run_ctx layer in between bpf_run_ctx and bpf_kprobe_multi_run_ctx so the session data can be retrieved regardless of if it's kprobe_multi or uprobe_multi link, which support is coming in future. This way both kprobe_multi and uprobe_multi can use same kfuncs to access the session data. Adding bpf_session_is_return kfunc that returns true if the bpf program is executed from the exit probe of the kprobe multi link attached in wrapper mode. It returns false otherwise. Adding new kprobe hook for kprobe program type. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240430112830.1184228-3-jolsa@kernel.org --- kernel/bpf/btf.c | 3 ++ kernel/trace/bpf_trace.c | 67 +++++++++++++++++++++++++++++++++++----- 2 files changed, 63 insertions(+), 7 deletions(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 8291fbfd27b1..821063660d9f 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -218,6 +218,7 @@ enum btf_kfunc_hook { BTF_KFUNC_HOOK_SOCKET_FILTER, BTF_KFUNC_HOOK_LWT, BTF_KFUNC_HOOK_NETFILTER, + BTF_KFUNC_HOOK_KPROBE, BTF_KFUNC_HOOK_MAX, }; @@ -8157,6 +8158,8 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type) return BTF_KFUNC_HOOK_LWT; case BPF_PROG_TYPE_NETFILTER: return BTF_KFUNC_HOOK_NETFILTER; + case BPF_PROG_TYPE_KPROBE: + return BTF_KFUNC_HOOK_KPROBE; default: return BTF_KFUNC_HOOK_MAX; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 06a9671834b6..3e88212bbf84 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2596,6 +2596,11 @@ static int __init bpf_event_init(void) fs_initcall(bpf_event_init); #endif /* CONFIG_MODULES */ +struct bpf_session_run_ctx { + struct bpf_run_ctx run_ctx; + bool is_return; +}; + #ifdef CONFIG_FPROBE struct bpf_kprobe_multi_link { struct bpf_link link; @@ -2609,7 +2614,7 @@ struct bpf_kprobe_multi_link { }; struct bpf_kprobe_multi_run_ctx { - struct bpf_run_ctx run_ctx; + struct bpf_session_run_ctx session_ctx; struct bpf_kprobe_multi_link *link; unsigned long entry_ip; }; @@ -2788,7 +2793,8 @@ static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx) if (WARN_ON_ONCE(!ctx)) return 0; - run_ctx = container_of(current->bpf_ctx, struct bpf_kprobe_multi_run_ctx, run_ctx); + run_ctx = container_of(current->bpf_ctx, struct bpf_kprobe_multi_run_ctx, + session_ctx.run_ctx); link = run_ctx->link; if (!link->cookies) return 0; @@ -2805,15 +2811,20 @@ static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx) { struct bpf_kprobe_multi_run_ctx *run_ctx; - run_ctx = container_of(current->bpf_ctx, struct bpf_kprobe_multi_run_ctx, run_ctx); + run_ctx = container_of(current->bpf_ctx, struct bpf_kprobe_multi_run_ctx, + session_ctx.run_ctx); return run_ctx->entry_ip; } static int kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link, - unsigned long entry_ip, struct pt_regs *regs) + unsigned long entry_ip, struct pt_regs *regs, + bool is_return) { struct bpf_kprobe_multi_run_ctx run_ctx = { + .session_ctx = { + .is_return = is_return, + }, .link = link, .entry_ip = entry_ip, }; @@ -2828,7 +2839,7 @@ kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link, migrate_disable(); rcu_read_lock(); - old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); + old_run_ctx = bpf_set_run_ctx(&run_ctx.session_ctx.run_ctx); err = bpf_prog_run(link->link.prog, regs); bpf_reset_run_ctx(old_run_ctx); rcu_read_unlock(); @@ -2848,7 +2859,7 @@ kprobe_multi_link_handler(struct fprobe *fp, unsigned long fentry_ip, int err; link = container_of(fp, struct bpf_kprobe_multi_link, fp); - err = kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs); + err = kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs, false); return is_kprobe_session(link->link.prog) ? err : 0; } @@ -2860,7 +2871,7 @@ kprobe_multi_link_exit_handler(struct fprobe *fp, unsigned long fentry_ip, struct bpf_kprobe_multi_link *link; link = container_of(fp, struct bpf_kprobe_multi_link, fp); - kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs); + kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs, true); } static int symbols_cmp_r(const void *a, const void *b, const void *priv) @@ -3503,3 +3514,45 @@ static u64 bpf_uprobe_multi_entry_ip(struct bpf_run_ctx *ctx) return 0; } #endif /* CONFIG_UPROBES */ + +#ifdef CONFIG_FPROBE +__bpf_kfunc_start_defs(); + +__bpf_kfunc bool bpf_session_is_return(void) +{ + struct bpf_session_run_ctx *session_ctx; + + session_ctx = container_of(current->bpf_ctx, struct bpf_session_run_ctx, run_ctx); + return session_ctx->is_return; +} + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(kprobe_multi_kfunc_set_ids) +BTF_ID_FLAGS(func, bpf_session_is_return) +BTF_KFUNCS_END(kprobe_multi_kfunc_set_ids) + +static int bpf_kprobe_multi_filter(const struct bpf_prog *prog, u32 kfunc_id) +{ + if (!btf_id_set8_contains(&kprobe_multi_kfunc_set_ids, kfunc_id)) + return 0; + + if (!is_kprobe_session(prog)) + return -EACCES; + + return 0; +} + +static const struct btf_kfunc_id_set bpf_kprobe_multi_kfunc_set = { + .owner = THIS_MODULE, + .set = &kprobe_multi_kfunc_set_ids, + .filter = bpf_kprobe_multi_filter, +}; + +static int __init bpf_kprobe_multi_kfuncs_init(void) +{ + return register_btf_kfunc_id_set(BPF_PROG_TYPE_KPROBE, &bpf_kprobe_multi_kfunc_set); +} + +late_initcall(bpf_kprobe_multi_kfuncs_init); +#endif From 5c919acef85147886eb2abf86fb147f94680a8b0 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 30 Apr 2024 13:28:26 +0200 Subject: [PATCH 016/119] bpf: Add support for kprobe session cookie Adding support for cookie within the session of kprobe multi entry and return program. The session cookie is u64 value and can be retrieved be new kfunc bpf_session_cookie, which returns pointer to the cookie value. The bpf program can use the pointer to store (on entry) and load (on return) the value. The cookie value is implemented via fprobe feature that allows to share values between entry and return ftrace fprobe callbacks. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240430112830.1184228-4-jolsa@kernel.org --- kernel/bpf/verifier.c | 7 +++++++ kernel/trace/bpf_trace.c | 19 ++++++++++++++++--- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5d42db05315e..7360f04f9ec7 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11063,6 +11063,7 @@ enum special_kfunc_type { KF_bpf_preempt_disable, KF_bpf_preempt_enable, KF_bpf_iter_css_task_new, + KF_bpf_session_cookie, }; BTF_SET_START(special_kfunc_set) @@ -11123,6 +11124,7 @@ BTF_ID(func, bpf_iter_css_task_new) #else BTF_ID_UNUSED #endif +BTF_ID(func, bpf_session_cookie) static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta) { @@ -12294,6 +12296,11 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } } + if (meta.func_id == special_kfunc_list[KF_bpf_session_cookie]) { + meta.r0_size = sizeof(u64); + meta.r0_rdonly = false; + } + if (is_bpf_wq_set_callback_impl_kfunc(meta.func_id)) { err = push_callback_call(env, insn, insn_idx, meta.subprogno, set_timer_callback_state); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 3e88212bbf84..f5154c051d2c 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2599,6 +2599,7 @@ fs_initcall(bpf_event_init); struct bpf_session_run_ctx { struct bpf_run_ctx run_ctx; bool is_return; + void *data; }; #ifdef CONFIG_FPROBE @@ -2819,11 +2820,12 @@ static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx) static int kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link, unsigned long entry_ip, struct pt_regs *regs, - bool is_return) + bool is_return, void *data) { struct bpf_kprobe_multi_run_ctx run_ctx = { .session_ctx = { .is_return = is_return, + .data = data, }, .link = link, .entry_ip = entry_ip, @@ -2859,7 +2861,7 @@ kprobe_multi_link_handler(struct fprobe *fp, unsigned long fentry_ip, int err; link = container_of(fp, struct bpf_kprobe_multi_link, fp); - err = kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs, false); + err = kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs, false, data); return is_kprobe_session(link->link.prog) ? err : 0; } @@ -2871,7 +2873,7 @@ kprobe_multi_link_exit_handler(struct fprobe *fp, unsigned long fentry_ip, struct bpf_kprobe_multi_link *link; link = container_of(fp, struct bpf_kprobe_multi_link, fp); - kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs, true); + kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs, true, data); } static int symbols_cmp_r(const void *a, const void *b, const void *priv) @@ -3089,6 +3091,8 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr link->fp.entry_handler = kprobe_multi_link_handler; if ((flags & BPF_F_KPROBE_MULTI_RETURN) || is_kprobe_session(prog)) link->fp.exit_handler = kprobe_multi_link_exit_handler; + if (is_kprobe_session(prog)) + link->fp.entry_data_size = sizeof(u64); link->addrs = addrs; link->cookies = cookies; @@ -3526,10 +3530,19 @@ __bpf_kfunc bool bpf_session_is_return(void) return session_ctx->is_return; } +__bpf_kfunc __u64 *bpf_session_cookie(void) +{ + struct bpf_session_run_ctx *session_ctx; + + session_ctx = container_of(current->bpf_ctx, struct bpf_session_run_ctx, run_ctx); + return session_ctx->data; +} + __bpf_kfunc_end_defs(); BTF_KFUNCS_START(kprobe_multi_kfunc_set_ids) BTF_ID_FLAGS(func, bpf_session_is_return) +BTF_ID_FLAGS(func, bpf_session_cookie) BTF_KFUNCS_END(kprobe_multi_kfunc_set_ids) static int bpf_kprobe_multi_filter(const struct bpf_prog *prog, u32 kfunc_id) From 2ca178f02b2f4e523e970894def16282e4adbc39 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 30 Apr 2024 13:28:27 +0200 Subject: [PATCH 017/119] libbpf: Add support for kprobe session attach Adding support to attach program in kprobe session mode with bpf_program__attach_kprobe_multi_opts function. Adding session bool to bpf_kprobe_multi_opts struct that allows to load and attach the bpf program via kprobe session. the attachment to create kprobe multi session. Also adding new program loader section that allows: SEC("kprobe.session/bpf_fentry_test*") and loads/attaches kprobe program as kprobe session. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240430112830.1184228-5-jolsa@kernel.org --- tools/lib/bpf/bpf.c | 1 + tools/lib/bpf/libbpf.c | 39 +++++++++++++++++++++++++++++++++++++-- tools/lib/bpf/libbpf.h | 4 +++- 3 files changed, 41 insertions(+), 3 deletions(-) diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index c9f4e04f38fe..466a29d80124 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -766,6 +766,7 @@ int bpf_link_create(int prog_fd, int target_fd, return libbpf_err(-EINVAL); break; case BPF_TRACE_KPROBE_MULTI: + case BPF_TRACE_KPROBE_SESSION: attr.link_create.kprobe_multi.flags = OPTS_GET(opts, kprobe_multi.flags, 0); attr.link_create.kprobe_multi.cnt = OPTS_GET(opts, kprobe_multi.cnt, 0); attr.link_create.kprobe_multi.syms = ptr_to_u64(OPTS_GET(opts, kprobe_multi.syms, 0)); diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 898d5d34ecea..16dae279a900 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -9273,6 +9273,7 @@ static int attach_tp(const struct bpf_program *prog, long cookie, struct bpf_lin static int attach_raw_tp(const struct bpf_program *prog, long cookie, struct bpf_link **link); static int attach_trace(const struct bpf_program *prog, long cookie, struct bpf_link **link); static int attach_kprobe_multi(const struct bpf_program *prog, long cookie, struct bpf_link **link); +static int attach_kprobe_session(const struct bpf_program *prog, long cookie, struct bpf_link **link); static int attach_uprobe_multi(const struct bpf_program *prog, long cookie, struct bpf_link **link); static int attach_lsm(const struct bpf_program *prog, long cookie, struct bpf_link **link); static int attach_iter(const struct bpf_program *prog, long cookie, struct bpf_link **link); @@ -9289,6 +9290,7 @@ static const struct bpf_sec_def section_defs[] = { SEC_DEF("uretprobe.s+", KPROBE, 0, SEC_SLEEPABLE, attach_uprobe), SEC_DEF("kprobe.multi+", KPROBE, BPF_TRACE_KPROBE_MULTI, SEC_NONE, attach_kprobe_multi), SEC_DEF("kretprobe.multi+", KPROBE, BPF_TRACE_KPROBE_MULTI, SEC_NONE, attach_kprobe_multi), + SEC_DEF("kprobe.session+", KPROBE, BPF_TRACE_KPROBE_SESSION, SEC_NONE, attach_kprobe_session), SEC_DEF("uprobe.multi+", KPROBE, BPF_TRACE_UPROBE_MULTI, SEC_NONE, attach_uprobe_multi), SEC_DEF("uretprobe.multi+", KPROBE, BPF_TRACE_UPROBE_MULTI, SEC_NONE, attach_uprobe_multi), SEC_DEF("uprobe.multi.s+", KPROBE, BPF_TRACE_UPROBE_MULTI, SEC_SLEEPABLE, attach_uprobe_multi), @@ -11381,13 +11383,14 @@ bpf_program__attach_kprobe_multi_opts(const struct bpf_program *prog, struct kprobe_multi_resolve res = { .pattern = pattern, }; + enum bpf_attach_type attach_type; struct bpf_link *link = NULL; char errmsg[STRERR_BUFSIZE]; const unsigned long *addrs; int err, link_fd, prog_fd; + bool retprobe, session; const __u64 *cookies; const char **syms; - bool retprobe; size_t cnt; if (!OPTS_VALID(opts, bpf_kprobe_multi_opts)) @@ -11426,6 +11429,12 @@ bpf_program__attach_kprobe_multi_opts(const struct bpf_program *prog, } retprobe = OPTS_GET(opts, retprobe, false); + session = OPTS_GET(opts, session, false); + + if (retprobe && session) + return libbpf_err_ptr(-EINVAL); + + attach_type = session ? BPF_TRACE_KPROBE_SESSION : BPF_TRACE_KPROBE_MULTI; lopts.kprobe_multi.syms = syms; lopts.kprobe_multi.addrs = addrs; @@ -11440,7 +11449,7 @@ bpf_program__attach_kprobe_multi_opts(const struct bpf_program *prog, } link->detach = &bpf_link__detach_fd; - link_fd = bpf_link_create(prog_fd, 0, BPF_TRACE_KPROBE_MULTI, &lopts); + link_fd = bpf_link_create(prog_fd, 0, attach_type, &lopts); if (link_fd < 0) { err = -errno; pr_warn("prog '%s': failed to attach: %s\n", @@ -11546,6 +11555,32 @@ static int attach_kprobe_multi(const struct bpf_program *prog, long cookie, stru return libbpf_get_error(*link); } +static int attach_kprobe_session(const struct bpf_program *prog, long cookie, + struct bpf_link **link) +{ + LIBBPF_OPTS(bpf_kprobe_multi_opts, opts, .session = true); + const char *spec; + char *pattern; + int n; + + *link = NULL; + + /* no auto-attach for SEC("kprobe.session") */ + if (strcmp(prog->sec_name, "kprobe.session") == 0) + return 0; + + spec = prog->sec_name + sizeof("kprobe.session/") - 1; + n = sscanf(spec, "%m[a-zA-Z0-9_.*?]", &pattern); + if (n < 1) { + pr_warn("kprobe session pattern is invalid: %s\n", pattern); + return -EINVAL; + } + + *link = bpf_program__attach_kprobe_multi_opts(prog, pattern, &opts); + free(pattern); + return *link ? 0 : -errno; +} + static int attach_uprobe_multi(const struct bpf_program *prog, long cookie, struct bpf_link **link) { char *probe_type = NULL, *binary_path = NULL, *func_name = NULL; diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 1333ae20ebe6..c3f77d9260fe 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -539,10 +539,12 @@ struct bpf_kprobe_multi_opts { size_t cnt; /* create return kprobes */ bool retprobe; + /* create session kprobes */ + bool session; size_t :0; }; -#define bpf_kprobe_multi_opts__last_field retprobe +#define bpf_kprobe_multi_opts__last_field session LIBBPF_API struct bpf_link * bpf_program__attach_kprobe_multi_opts(const struct bpf_program *prog, From 7b94965429f2fa32a83e1275c6bf6ed0add08603 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 30 Apr 2024 13:28:28 +0200 Subject: [PATCH 018/119] libbpf: Add kprobe session attach type name to attach_type_name Adding kprobe session attach type name to attach_type_name, so libbpf_bpf_attach_type_str returns proper string name for BPF_TRACE_KPROBE_SESSION attach type. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240430112830.1184228-6-jolsa@kernel.org --- tools/lib/bpf/libbpf.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 16dae279a900..7667671187e9 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -132,6 +132,7 @@ static const char * const attach_type_name[] = { [BPF_TRACE_UPROBE_MULTI] = "trace_uprobe_multi", [BPF_NETKIT_PRIMARY] = "netkit_primary", [BPF_NETKIT_PEER] = "netkit_peer", + [BPF_TRACE_KPROBE_SESSION] = "trace_kprobe_session", }; static const char * const link_type_name[] = { From 0983b1697aefbf69f465f907b934b89bbce467ea Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 30 Apr 2024 13:28:29 +0200 Subject: [PATCH 019/119] selftests/bpf: Add kprobe session test Adding kprobe session test and testing that the entry program return value controls execution of the return probe program. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240430112830.1184228-7-jolsa@kernel.org --- tools/testing/selftests/bpf/bpf_kfuncs.h | 2 + .../bpf/prog_tests/kprobe_multi_test.c | 39 +++++++++ .../bpf/progs/kprobe_multi_session.c | 79 +++++++++++++++++++ 3 files changed, 120 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/kprobe_multi_session.c diff --git a/tools/testing/selftests/bpf/bpf_kfuncs.h b/tools/testing/selftests/bpf/bpf_kfuncs.h index 14ebe7d9e1a3..cb946d9fd63a 100644 --- a/tools/testing/selftests/bpf/bpf_kfuncs.h +++ b/tools/testing/selftests/bpf/bpf_kfuncs.h @@ -75,4 +75,6 @@ extern void bpf_key_put(struct bpf_key *key) __ksym; extern int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_ptr, struct bpf_dynptr *sig_ptr, struct bpf_key *trusted_keyring) __ksym; + +extern bool bpf_session_is_return(void) __ksym __weak; #endif diff --git a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c index 51628455b6f5..42d6592f1e7f 100644 --- a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c +++ b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c @@ -4,6 +4,7 @@ #include "trace_helpers.h" #include "kprobe_multi_empty.skel.h" #include "kprobe_multi_override.skel.h" +#include "kprobe_multi_session.skel.h" #include "bpf/libbpf_internal.h" #include "bpf/hashmap.h" @@ -326,6 +327,42 @@ cleanup: kprobe_multi__destroy(skel); } +static void test_session_skel_api(void) +{ + struct kprobe_multi_session *skel = NULL; + LIBBPF_OPTS(bpf_kprobe_multi_opts, opts); + LIBBPF_OPTS(bpf_test_run_opts, topts); + struct bpf_link *link = NULL; + int i, err, prog_fd; + + skel = kprobe_multi_session__open_and_load(); + if (!ASSERT_OK_PTR(skel, "kprobe_multi_session__open_and_load")) + return; + + skel->bss->pid = getpid(); + + err = kprobe_multi_session__attach(skel); + if (!ASSERT_OK(err, " kprobe_multi_session__attach")) + goto cleanup; + + prog_fd = bpf_program__fd(skel->progs.trigger); + err = bpf_prog_test_run_opts(prog_fd, &topts); + ASSERT_OK(err, "test_run"); + ASSERT_EQ(topts.retval, 0, "test_run"); + + /* bpf_fentry_test1-4 trigger return probe, result is 2 */ + for (i = 0; i < 4; i++) + ASSERT_EQ(skel->bss->kprobe_session_result[i], 2, "kprobe_session_result"); + + /* bpf_fentry_test5-8 trigger only entry probe, result is 1 */ + for (i = 4; i < 8; i++) + ASSERT_EQ(skel->bss->kprobe_session_result[i], 1, "kprobe_session_result"); + +cleanup: + bpf_link__destroy(link); + kprobe_multi_session__destroy(skel); +} + static size_t symbol_hash(long key, void *ctx __maybe_unused) { return str_hash((const char *) key); @@ -690,4 +727,6 @@ void test_kprobe_multi_test(void) test_attach_api_fails(); if (test__start_subtest("attach_override")) test_attach_override(); + if (test__start_subtest("session")) + test_session_skel_api(); } diff --git a/tools/testing/selftests/bpf/progs/kprobe_multi_session.c b/tools/testing/selftests/bpf/progs/kprobe_multi_session.c new file mode 100644 index 000000000000..bbba9eb46551 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/kprobe_multi_session.c @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include "bpf_kfuncs.h" + +#define ARRAY_SIZE(x) (int)(sizeof(x) / sizeof((x)[0])) + +char _license[] SEC("license") = "GPL"; + +extern const void bpf_fentry_test1 __ksym; +extern const void bpf_fentry_test2 __ksym; +extern const void bpf_fentry_test3 __ksym; +extern const void bpf_fentry_test4 __ksym; +extern const void bpf_fentry_test5 __ksym; +extern const void bpf_fentry_test6 __ksym; +extern const void bpf_fentry_test7 __ksym; +extern const void bpf_fentry_test8 __ksym; + +int pid = 0; + +__u64 kprobe_session_result[8]; + +static int session_check(void *ctx) +{ + unsigned int i; + __u64 addr; + const void *kfuncs[] = { + &bpf_fentry_test1, + &bpf_fentry_test2, + &bpf_fentry_test3, + &bpf_fentry_test4, + &bpf_fentry_test5, + &bpf_fentry_test6, + &bpf_fentry_test7, + &bpf_fentry_test8, + }; + + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 1; + + addr = bpf_get_func_ip(ctx); + + for (i = 0; i < ARRAY_SIZE(kfuncs); i++) { + if (kfuncs[i] == (void *) addr) { + kprobe_session_result[i]++; + break; + } + } + + /* + * Force probes for function bpf_fentry_test[5-8] not to + * install and execute the return probe + */ + if (((const void *) addr == &bpf_fentry_test5) || + ((const void *) addr == &bpf_fentry_test6) || + ((const void *) addr == &bpf_fentry_test7) || + ((const void *) addr == &bpf_fentry_test8)) + return 1; + + return 0; +} + +/* + * No tests in here, just to trigger 'bpf_fentry_test*' + * through tracing test_run + */ +SEC("fentry/bpf_modify_return_test") +int BPF_PROG(trigger) +{ + return 0; +} + +SEC("kprobe.session/bpf_fentry_test*") +int test_kprobe(struct pt_regs *ctx) +{ + return session_check(ctx); +} From a3a5113393ccfad2eb23ca091aa6e55b5bd67eb4 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 30 Apr 2024 13:28:30 +0200 Subject: [PATCH 020/119] selftests/bpf: Add kprobe session cookie test Adding kprobe session test that verifies the cookie value get properly propagated from entry to return program. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240430112830.1184228-8-jolsa@kernel.org --- tools/testing/selftests/bpf/bpf_kfuncs.h | 1 + .../bpf/prog_tests/kprobe_multi_test.c | 35 +++++++++++ .../bpf/progs/kprobe_multi_session_cookie.c | 58 +++++++++++++++++++ 3 files changed, 94 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/kprobe_multi_session_cookie.c diff --git a/tools/testing/selftests/bpf/bpf_kfuncs.h b/tools/testing/selftests/bpf/bpf_kfuncs.h index cb946d9fd63a..be91a6919315 100644 --- a/tools/testing/selftests/bpf/bpf_kfuncs.h +++ b/tools/testing/selftests/bpf/bpf_kfuncs.h @@ -77,4 +77,5 @@ extern int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_ptr, struct bpf_key *trusted_keyring) __ksym; extern bool bpf_session_is_return(void) __ksym __weak; +extern long *bpf_session_cookie(void) __ksym __weak; #endif diff --git a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c index 42d6592f1e7f..960c9323d1e0 100644 --- a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c +++ b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c @@ -5,6 +5,7 @@ #include "kprobe_multi_empty.skel.h" #include "kprobe_multi_override.skel.h" #include "kprobe_multi_session.skel.h" +#include "kprobe_multi_session_cookie.skel.h" #include "bpf/libbpf_internal.h" #include "bpf/hashmap.h" @@ -363,6 +364,38 @@ cleanup: kprobe_multi_session__destroy(skel); } +static void test_session_cookie_skel_api(void) +{ + struct kprobe_multi_session_cookie *skel = NULL; + LIBBPF_OPTS(bpf_kprobe_multi_opts, opts); + LIBBPF_OPTS(bpf_test_run_opts, topts); + struct bpf_link *link = NULL; + int err, prog_fd; + + skel = kprobe_multi_session_cookie__open_and_load(); + if (!ASSERT_OK_PTR(skel, "fentry_raw_skel_load")) + return; + + skel->bss->pid = getpid(); + + err = kprobe_multi_session_cookie__attach(skel); + if (!ASSERT_OK(err, " kprobe_multi_wrapper__attach")) + goto cleanup; + + prog_fd = bpf_program__fd(skel->progs.trigger); + err = bpf_prog_test_run_opts(prog_fd, &topts); + ASSERT_OK(err, "test_run"); + ASSERT_EQ(topts.retval, 0, "test_run"); + + ASSERT_EQ(skel->bss->test_kprobe_1_result, 1, "test_kprobe_1_result"); + ASSERT_EQ(skel->bss->test_kprobe_2_result, 2, "test_kprobe_2_result"); + ASSERT_EQ(skel->bss->test_kprobe_3_result, 3, "test_kprobe_3_result"); + +cleanup: + bpf_link__destroy(link); + kprobe_multi_session_cookie__destroy(skel); +} + static size_t symbol_hash(long key, void *ctx __maybe_unused) { return str_hash((const char *) key); @@ -729,4 +762,6 @@ void test_kprobe_multi_test(void) test_attach_override(); if (test__start_subtest("session")) test_session_skel_api(); + if (test__start_subtest("session_cookie")) + test_session_cookie_skel_api(); } diff --git a/tools/testing/selftests/bpf/progs/kprobe_multi_session_cookie.c b/tools/testing/selftests/bpf/progs/kprobe_multi_session_cookie.c new file mode 100644 index 000000000000..d49070803e22 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/kprobe_multi_session_cookie.c @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include "bpf_kfuncs.h" + +char _license[] SEC("license") = "GPL"; + +int pid = 0; + +__u64 test_kprobe_1_result = 0; +__u64 test_kprobe_2_result = 0; +__u64 test_kprobe_3_result = 0; + +/* + * No tests in here, just to trigger 'bpf_fentry_test*' + * through tracing test_run + */ +SEC("fentry/bpf_modify_return_test") +int BPF_PROG(trigger) +{ + return 0; +} + +static int check_cookie(__u64 val, __u64 *result) +{ + long *cookie; + + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 1; + + cookie = bpf_session_cookie(); + + if (bpf_session_is_return()) + *result = *cookie == val ? val : 0; + else + *cookie = val; + return 0; +} + +SEC("kprobe.session/bpf_fentry_test1") +int test_kprobe_1(struct pt_regs *ctx) +{ + return check_cookie(1, &test_kprobe_1_result); +} + +SEC("kprobe.session/bpf_fentry_test1") +int test_kprobe_2(struct pt_regs *ctx) +{ + return check_cookie(2, &test_kprobe_2_result); +} + +SEC("kprobe.session/bpf_fentry_test1") +int test_kprobe_3(struct pt_regs *ctx) +{ + return check_cookie(3, &test_kprobe_3_result); +} From 95b88500b97ca8bafc0b9c8e79e9716c2ddc40c6 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 25 Apr 2024 11:23:41 +0800 Subject: [PATCH 021/119] selftests/bpf: Add opts argument for __start_server This patch adds network_helper_opts parameter for __start_server() instead of "int protocol" and "int timeout_ms". This not only reduces the number of parameters, but also makes it more flexible. Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/127d2f0929980b41f757dcfebe1b667e6bfb43f1.1714014697.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/network_helpers.c | 27 ++++++++++--------- 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index a9e4905a2115..05af3b87ea45 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -80,19 +80,19 @@ int settimeo(int fd, int timeout_ms) #define save_errno_close(fd) ({ int __save = errno; close(fd); errno = __save; }) -static int __start_server(int type, int protocol, const struct sockaddr *addr, - socklen_t addrlen, int timeout_ms, bool reuseport) +static int __start_server(int type, const struct sockaddr *addr, socklen_t addrlen, + bool reuseport, const struct network_helper_opts *opts) { int on = 1; int fd; - fd = socket(addr->sa_family, type, protocol); + fd = socket(addr->sa_family, type, opts->proto); if (fd < 0) { log_err("Failed to create server socket"); return -1; } - if (settimeo(fd, timeout_ms)) + if (settimeo(fd, opts->timeout_ms)) goto error_close; if (reuseport && @@ -123,14 +123,17 @@ error_close: static int start_server_proto(int family, int type, int protocol, const char *addr_str, __u16 port, int timeout_ms) { + struct network_helper_opts opts = { + .timeout_ms = timeout_ms, + .proto = protocol, + }; struct sockaddr_storage addr; socklen_t addrlen; if (make_sockaddr(family, addr_str, port, &addr, &addrlen)) return -1; - return __start_server(type, protocol, (struct sockaddr *)&addr, - addrlen, timeout_ms, false); + return __start_server(type, (struct sockaddr *)&addr, addrlen, false, &opts); } int start_server(int family, int type, const char *addr_str, __u16 port, @@ -149,6 +152,9 @@ int start_mptcp_server(int family, const char *addr_str, __u16 port, int *start_reuseport_server(int family, int type, const char *addr_str, __u16 port, int timeout_ms, unsigned int nr_listens) { + struct network_helper_opts opts = { + .timeout_ms = timeout_ms, + }; struct sockaddr_storage addr; unsigned int nr_fds = 0; socklen_t addrlen; @@ -164,8 +170,7 @@ int *start_reuseport_server(int family, int type, const char *addr_str, if (!fds) return NULL; - fds[0] = __start_server(type, 0, (struct sockaddr *)&addr, addrlen, - timeout_ms, true); + fds[0] = __start_server(type, (struct sockaddr *)&addr, addrlen, true, &opts); if (fds[0] == -1) goto close_fds; nr_fds = 1; @@ -174,8 +179,7 @@ int *start_reuseport_server(int family, int type, const char *addr_str, goto close_fds; for (; nr_fds < nr_listens; nr_fds++) { - fds[nr_fds] = __start_server(type, 0, (struct sockaddr *)&addr, - addrlen, timeout_ms, true); + fds[nr_fds] = __start_server(type, (struct sockaddr *)&addr, addrlen, true, &opts); if (fds[nr_fds] == -1) goto close_fds; } @@ -193,8 +197,7 @@ int start_server_addr(int type, const struct sockaddr_storage *addr, socklen_t l if (!opts) opts = &default_opts; - return __start_server(type, 0, (struct sockaddr *)addr, len, - opts->timeout_ms, 0); + return __start_server(type, (struct sockaddr *)addr, len, 0, opts); } void free_fds(int *fds, unsigned int nr_close_fds) From 044032ee6c4e786746058aaf5527be13e831cc5c Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 25 Apr 2024 11:23:42 +0800 Subject: [PATCH 022/119] selftests/bpf: Make start_mptcp_server static start_mptcp_server() shouldn't be a public helper, it only be used in MPTCP tests. This patch moves it into prog_tests/mptcp.c, and implenments it using make_sockaddr() and start_server_addr() instead of using start_server_proto(). Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/50ec7049e280c60a2924937940851f8fee2b73b8.1714014697.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/network_helpers.c | 7 ------- tools/testing/selftests/bpf/network_helpers.h | 2 -- tools/testing/selftests/bpf/prog_tests/mptcp.c | 16 ++++++++++++++++ 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index 05af3b87ea45..aa23692029c4 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -142,13 +142,6 @@ int start_server(int family, int type, const char *addr_str, __u16 port, return start_server_proto(family, type, 0, addr_str, port, timeout_ms); } -int start_mptcp_server(int family, const char *addr_str, __u16 port, - int timeout_ms) -{ - return start_server_proto(family, SOCK_STREAM, IPPROTO_MPTCP, addr_str, - port, timeout_ms); -} - int *start_reuseport_server(int family, int type, const char *addr_str, __u16 port, int timeout_ms, unsigned int nr_listens) { diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h index 5a8c5cf4ec1a..c62b54daa914 100644 --- a/tools/testing/selftests/bpf/network_helpers.h +++ b/tools/testing/selftests/bpf/network_helpers.h @@ -49,8 +49,6 @@ extern struct ipv6_packet pkt_v6; int settimeo(int fd, int timeout_ms); int start_server(int family, int type, const char *addr, __u16 port, int timeout_ms); -int start_mptcp_server(int family, const char *addr, __u16 port, - int timeout_ms); int *start_reuseport_server(int family, int type, const char *addr_str, __u16 port, int timeout_ms, unsigned int nr_listens); diff --git a/tools/testing/selftests/bpf/prog_tests/mptcp.c b/tools/testing/selftests/bpf/prog_tests/mptcp.c index 4e0f69295872..274d2e033e39 100644 --- a/tools/testing/selftests/bpf/prog_tests/mptcp.c +++ b/tools/testing/selftests/bpf/prog_tests/mptcp.c @@ -82,6 +82,22 @@ static void cleanup_netns(struct nstoken *nstoken) SYS_NOFAIL("ip netns del %s", NS_TEST); } +static int start_mptcp_server(int family, const char *addr_str, __u16 port, + int timeout_ms) +{ + struct network_helper_opts opts = { + .timeout_ms = timeout_ms, + .proto = IPPROTO_MPTCP, + }; + struct sockaddr_storage addr; + socklen_t addrlen; + + if (make_sockaddr(family, addr_str, port, &addr, &addrlen)) + return -1; + + return start_server_addr(SOCK_STREAM, &addr, addrlen, &opts); +} + static int verify_tsk(int map_fd, int client_fd) { int err, cfd = client_fd; From 8405e6980f21e2b75f232e970edd76bc50cf1491 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 25 Apr 2024 11:23:43 +0800 Subject: [PATCH 023/119] selftests/bpf: Drop start_server_proto helper Protocol can be set by __start_server() helper directly now, this makes the heler start_server_proto() useless. This patch drops it, and implenments start_server() using make_sockaddr() and __start_server(). Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/55d8a04e0bb8240a5fda2da3e9bdffe6fc8547b2.1714014697.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/network_helpers.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index aa23692029c4..054d26e383e0 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -120,12 +120,11 @@ error_close: return -1; } -static int start_server_proto(int family, int type, int protocol, - const char *addr_str, __u16 port, int timeout_ms) +int start_server(int family, int type, const char *addr_str, __u16 port, + int timeout_ms) { struct network_helper_opts opts = { .timeout_ms = timeout_ms, - .proto = protocol, }; struct sockaddr_storage addr; socklen_t addrlen; @@ -136,12 +135,6 @@ static int start_server_proto(int family, int type, int protocol, return __start_server(type, (struct sockaddr *)&addr, addrlen, false, &opts); } -int start_server(int family, int type, const char *addr_str, __u16 port, - int timeout_ms) -{ - return start_server_proto(family, type, 0, addr_str, port, timeout_ms); -} - int *start_reuseport_server(int family, int type, const char *addr_str, __u16 port, int timeout_ms, unsigned int nr_listens) { From 8f8a024272f3e335854515b41638bdf89c6d3146 Mon Sep 17 00:00:00 2001 From: Viktor Malik Date: Tue, 30 Apr 2024 11:38:06 +0200 Subject: [PATCH 024/119] libbpf: support "module: Function" syntax for tracing programs In some situations, it is useful to explicitly specify a kernel module to search for a tracing program target (e.g. when a function of the same name exists in multiple modules or in vmlinux). This patch enables that by allowing the "module:function" syntax for the find_kernel_btf_id function. Thanks to this, the syntax can be used both from a SEC macro (i.e. `SEC(fentry/module:function)`) and via the bpf_program__set_attach_target API call. Signed-off-by: Viktor Malik Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/9085a8cb9a552de98e554deb22ff7e977d025440.1714469650.git.vmalik@redhat.com --- tools/lib/bpf/libbpf.c | 35 ++++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 7667671187e9..109b33fb68dc 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -9862,16 +9862,28 @@ static int find_kernel_btf_id(struct bpf_object *obj, const char *attach_name, enum bpf_attach_type attach_type, int *btf_obj_fd, int *btf_type_id) { - int ret, i; + int ret, i, mod_len; + const char *fn_name, *mod_name = NULL; - ret = find_attach_btf_id(obj->btf_vmlinux, attach_name, attach_type); - if (ret > 0) { - *btf_obj_fd = 0; /* vmlinux BTF */ - *btf_type_id = ret; - return 0; + fn_name = strchr(attach_name, ':'); + if (fn_name) { + mod_name = attach_name; + mod_len = fn_name - mod_name; + fn_name++; + } + + if (!mod_name || strncmp(mod_name, "vmlinux", mod_len) == 0) { + ret = find_attach_btf_id(obj->btf_vmlinux, + mod_name ? fn_name : attach_name, + attach_type); + if (ret > 0) { + *btf_obj_fd = 0; /* vmlinux BTF */ + *btf_type_id = ret; + return 0; + } + if (ret != -ENOENT) + return ret; } - if (ret != -ENOENT) - return ret; ret = load_module_btfs(obj); if (ret) @@ -9880,7 +9892,12 @@ static int find_kernel_btf_id(struct bpf_object *obj, const char *attach_name, for (i = 0; i < obj->btf_module_cnt; i++) { const struct module_btf *mod = &obj->btf_modules[i]; - ret = find_attach_btf_id(mod->btf, attach_name, attach_type); + if (mod_name && strncmp(mod->name, mod_name, mod_len) != 0) + continue; + + ret = find_attach_btf_id(mod->btf, + mod_name ? fn_name : attach_name, + attach_type); if (ret > 0) { *btf_obj_fd = mod->fd; *btf_type_id = ret; From 960635887c967338fd567def3e7905a294f5002b Mon Sep 17 00:00:00 2001 From: Viktor Malik Date: Tue, 30 Apr 2024 11:38:07 +0200 Subject: [PATCH 025/119] selftests/bpf: add tests for the "module: Function" syntax The previous patch added support for the "module:function" syntax for tracing programs. This adds tests for explicitly specifying the module name via the SEC macro and via the bpf_program__set_attach_target call. Signed-off-by: Viktor Malik Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/8a076168ed847f7c8a6c25715737b1fea84e38be.1714469650.git.vmalik@redhat.com --- .../selftests/bpf/prog_tests/module_attach.c | 6 +++++ .../selftests/bpf/progs/test_module_attach.c | 23 +++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/module_attach.c b/tools/testing/selftests/bpf/prog_tests/module_attach.c index f53d658ed080..6d391d95f96e 100644 --- a/tools/testing/selftests/bpf/prog_tests/module_attach.c +++ b/tools/testing/selftests/bpf/prog_tests/module_attach.c @@ -51,6 +51,10 @@ void test_module_attach(void) 0, "bpf_testmod_test_read"); ASSERT_OK(err, "set_attach_target"); + err = bpf_program__set_attach_target(skel->progs.handle_fentry_explicit_manual, + 0, "bpf_testmod:bpf_testmod_test_read"); + ASSERT_OK(err, "set_attach_target_explicit"); + err = test_module_attach__load(skel); if (CHECK(err, "skel_load", "failed to load skeleton\n")) return; @@ -70,6 +74,8 @@ void test_module_attach(void) ASSERT_EQ(bss->tp_btf_read_sz, READ_SZ, "tp_btf"); ASSERT_EQ(bss->fentry_read_sz, READ_SZ, "fentry"); ASSERT_EQ(bss->fentry_manual_read_sz, READ_SZ, "fentry_manual"); + ASSERT_EQ(bss->fentry_explicit_read_sz, READ_SZ, "fentry_explicit"); + ASSERT_EQ(bss->fentry_explicit_manual_read_sz, READ_SZ, "fentry_explicit_manual"); ASSERT_EQ(bss->fexit_read_sz, READ_SZ, "fexit"); ASSERT_EQ(bss->fexit_ret, -EIO, "fexit_tet"); ASSERT_EQ(bss->fmod_ret_read_sz, READ_SZ, "fmod_ret"); diff --git a/tools/testing/selftests/bpf/progs/test_module_attach.c b/tools/testing/selftests/bpf/progs/test_module_attach.c index 8a1b50f3a002..cc1a012d038f 100644 --- a/tools/testing/selftests/bpf/progs/test_module_attach.c +++ b/tools/testing/selftests/bpf/progs/test_module_attach.c @@ -73,6 +73,29 @@ int BPF_PROG(handle_fentry_manual, return 0; } +__u32 fentry_explicit_read_sz = 0; + +SEC("fentry/bpf_testmod:bpf_testmod_test_read") +int BPF_PROG(handle_fentry_explicit, + struct file *file, struct kobject *kobj, + struct bin_attribute *bin_attr, char *buf, loff_t off, size_t len) +{ + fentry_explicit_read_sz = len; + return 0; +} + + +__u32 fentry_explicit_manual_read_sz = 0; + +SEC("fentry") +int BPF_PROG(handle_fentry_explicit_manual, + struct file *file, struct kobject *kobj, + struct bin_attribute *bin_attr, char *buf, loff_t off, size_t len) +{ + fentry_explicit_manual_read_sz = len; + return 0; +} + __u32 fexit_read_sz = 0; int fexit_ret = 0; From 0737df6de94661ae55fd3343ce9abec32c687e62 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 30 Apr 2024 21:17:06 -0700 Subject: [PATCH 026/119] libbpf: better fix for handling nulled-out struct_ops program Previous attempt to fix the handling of nulled-out (from skeleton) struct_ops program is working well only if struct_ops program is defined as non-autoloaded by default (i.e., has SEC("?struct_ops") annotation, with question mark). Unfortunately, that fix is incomplete due to how bpf_object_adjust_struct_ops_autoload() is marking referenced or non-referenced struct_ops program as autoloaded (or not). Because bpf_object_adjust_struct_ops_autoload() is run after bpf_map__init_kern_struct_ops() step, which sets program slot to NULL, such programs won't be considered "referenced", and so its autoload property won't be changed. This all sounds convoluted and it is, but the desire is to have as natural behavior (as far as struct_ops usage is concerned) as possible. This fix is redoing the original fix but makes it work for autoloaded-by-default struct_ops programs as well. We achieve this by forcing prog->autoload to false if prog was declaratively set for some struct_ops map, but then nulled-out from skeleton (programmatically). This achieves desired effect of not autoloading it. If such program is still referenced somewhere else (different struct_ops map or different callback field), it will get its autoload property adjusted by bpf_object_adjust_struct_ops_autoload() later. We also fix selftest, which accidentally used SEC("?struct_ops") annotation. It was meant to use autoload-by-default program from the very beginning. Fixes: f973fccd43d3 ("libbpf: handle nulled-out program in struct_ops correctly") Cc: Kui-Feng Lee Cc: Eduard Zingerman Cc: Martin KaFai Lau Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240501041706.3712608-1-andrii@kernel.org Signed-off-by: Martin KaFai Lau --- tools/lib/bpf/libbpf.c | 37 +++++++++++++------ .../selftests/bpf/progs/struct_ops_module.c | 2 +- 2 files changed, 27 insertions(+), 12 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 109b33fb68dc..4ffc8873d1ad 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -1128,6 +1128,7 @@ static int bpf_map__init_kern_struct_ops(struct bpf_map *map) const struct btf_type *mtype, *kern_mtype; __u32 mtype_id, kern_mtype_id; void *mdata, *kern_mdata; + struct bpf_program *prog; __s64 msize, kern_msize; __u32 moff, kern_moff; __u32 kern_member_idx; @@ -1145,19 +1146,35 @@ static int bpf_map__init_kern_struct_ops(struct bpf_map *map) kern_member = find_member_by_name(kern_btf, kern_type, mname); if (!kern_member) { - /* Skip all zeros or null fields if they are not - * presented in the kernel BTF. - */ - if (libbpf_is_mem_zeroed(mdata, msize)) { - st_ops->progs[i] = NULL; - pr_info("struct_ops %s: member %s not found in kernel, skipping it as it's set to zero\n", + if (!libbpf_is_mem_zeroed(mdata, msize)) { + pr_warn("struct_ops init_kern %s: Cannot find member %s in kernel BTF\n", map->name, mname); - continue; + return -ENOTSUP; } - pr_warn("struct_ops init_kern %s: Cannot find member %s in kernel BTF\n", + prog = st_ops->progs[i]; + if (prog) { + /* If we had declaratively set struct_ops callback, we need to + * first validate that it's actually a struct_ops program. + * And then force its autoload to false, because it doesn't have + * a chance of succeeding from POV of the current struct_ops map. + * If this program is still referenced somewhere else, though, + * then bpf_object_adjust_struct_ops_autoload() will update its + * autoload accordingly. + */ + if (!is_valid_st_ops_program(obj, prog)) { + pr_warn("struct_ops init_kern %s: member %s is declaratively assigned a non-struct_ops program\n", + map->name, mname); + return -EINVAL; + } + prog->autoload = false; + st_ops->progs[i] = NULL; + } + + /* Skip all-zero/NULL fields if they are not present in the kernel BTF */ + pr_info("struct_ops %s: member %s not found in kernel, skipping it as it's set to zero\n", map->name, mname); - return -ENOTSUP; + continue; } kern_member_idx = kern_member - btf_members(kern_type); @@ -1183,8 +1200,6 @@ static int bpf_map__init_kern_struct_ops(struct bpf_map *map) } if (btf_is_ptr(mtype)) { - struct bpf_program *prog; - /* Update the value from the shadow type */ prog = *(void **)mdata; st_ops->progs[i] = prog; diff --git a/tools/testing/selftests/bpf/progs/struct_ops_module.c b/tools/testing/selftests/bpf/progs/struct_ops_module.c index 40109be2b3ae..4c56d4a9d9f4 100644 --- a/tools/testing/selftests/bpf/progs/struct_ops_module.c +++ b/tools/testing/selftests/bpf/progs/struct_ops_module.c @@ -63,7 +63,7 @@ struct bpf_testmod_ops___zeroed { int zeroed; }; -SEC("?struct_ops/test_3") +SEC("struct_ops/test_3") int BPF_PROG(zeroed_op) { return 1; From ac2f438c2a85acd07e0ac7dc2f69d45bda1bb498 Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Wed, 1 May 2024 10:01:30 -0700 Subject: [PATCH 027/119] bpf: crypto: fix build when CONFIG_CRYPTO=m Crypto subsytem can be build as a module. In this case we still have to build BPF crypto framework otherwise the build will fail. Fixes: 3e1c6f35409f ("bpf: make common crypto API for TC/XDP programs") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202405011634.4JK40epY-lkp@intel.com/ Signed-off-by: Vadim Fedorenko Link: https://lore.kernel.org/r/20240501170130.1682309-1-vadfed@meta.com Signed-off-by: Martin KaFai Lau --- kernel/bpf/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 85786fd97d2a..7eb9ad3a3ae6 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -44,7 +44,7 @@ obj-$(CONFIG_BPF_SYSCALL) += bpf_struct_ops.o obj-$(CONFIG_BPF_SYSCALL) += cpumask.o obj-${CONFIG_BPF_LSM} += bpf_lsm.o endif -ifeq ($(CONFIG_CRYPTO),y) +ifneq ($(CONFIG_CRYPTO),) obj-$(CONFIG_BPF_SYSCALL) += crypto.o endif obj-$(CONFIG_BPF_PRELOAD) += preload/ From 5a3941f84b8f91bb1e111499d803b32188d33e5d Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 2 May 2024 09:55:40 +0200 Subject: [PATCH 028/119] libbpf: Fix error message in attach_kprobe_session We just failed to retrieve pattern, so we need to print spec instead. Fixes: 2ca178f02b2f ("libbpf: Add support for kprobe session attach") Reported-by: Andrii Nakryiko Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240502075541.1425761-1-jolsa@kernel.org --- tools/lib/bpf/libbpf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 4ffc8873d1ad..68c87aed8335 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -11605,7 +11605,7 @@ static int attach_kprobe_session(const struct bpf_program *prog, long cookie, spec = prog->sec_name + sizeof("kprobe.session/") - 1; n = sscanf(spec, "%m[a-zA-Z0-9_.*?]", &pattern); if (n < 1) { - pr_warn("kprobe session pattern is invalid: %s\n", pattern); + pr_warn("kprobe session pattern is invalid: %s\n", spec); return -EINVAL; } From 7c13ef16e87ac2e44d16c0468b1191bceb06f95c Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 2 May 2024 09:55:41 +0200 Subject: [PATCH 029/119] libbpf: Fix error message in attach_kprobe_multi We just failed to retrieve pattern, so we need to print spec instead. Fixes: ddc6b04989eb ("libbpf: Add bpf_program__attach_kprobe_multi_opts function") Reported-by: Andrii Nakryiko Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240502075541.1425761-2-jolsa@kernel.org --- tools/lib/bpf/libbpf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 68c87aed8335..a3566a456bc8 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -11579,7 +11579,7 @@ static int attach_kprobe_multi(const struct bpf_program *prog, long cookie, stru n = sscanf(spec, "%m[a-zA-Z0-9_.*?]", &pattern); if (n < 1) { - pr_warn("kprobe multi pattern is invalid: %s\n", pattern); + pr_warn("kprobe multi pattern is invalid: %s\n", spec); return -EINVAL; } From 08e90da6872a9f9f63ca2911bbce6883b6fc1a19 Mon Sep 17 00:00:00 2001 From: "Jose E. Marchesi" Date: Thu, 2 May 2024 16:08:31 +0200 Subject: [PATCH 030/119] bpf: Missing trailing slash in tools/testing/selftests/bpf/Makefile tools/lib/bpf/Makefile assumes that the patch in OUTPUT is a directory and that it includes a trailing slash. This seems to be a common expectation for OUTPUT among all the Makefiles. In the rule for runqslower in tools/testing/selftests/bpf/Makefile the variable BPFTOOL_OUTPUT is set to a directory name that lacks a trailing slash. This results in a malformed BPF_HELPER_DEFS being defined in lib/bpf/Makefile. This problem becomes evident when a file like tools/lib/bpf/bpf_tracing.h gets updated. This patch fixes the problem by adding the missing slash in the value for BPFTOOL_OUTPUT in the $(OUTPUT)/runqslower rule. Regtested by running selftests in bpf-next master and building samples/bpf programs. Signed-off-by: Jose E. Marchesi Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240502140831.23915-1-jose.marchesi@oracle.com --- tools/testing/selftests/bpf/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 82247aeef857..ba28d42b74db 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -262,7 +262,7 @@ $(OUTPUT)/runqslower: $(BPFOBJ) | $(DEFAULT_BPFTOOL) $(RUNQSLOWER_OUTPUT) $(Q)$(MAKE) $(submake_extras) -C $(TOOLSDIR)/bpf/runqslower \ OUTPUT=$(RUNQSLOWER_OUTPUT) VMLINUX_BTF=$(VMLINUX_BTF) \ BPFTOOL_OUTPUT=$(HOST_BUILD_DIR)/bpftool/ \ - BPFOBJ_OUTPUT=$(BUILD_DIR)/libbpf \ + BPFOBJ_OUTPUT=$(BUILD_DIR)/libbpf/ \ BPFOBJ=$(BPFOBJ) BPF_INCLUDE=$(INCLUDE_DIR) \ EXTRA_CFLAGS='-g $(OPT_FLAGS) $(SAN_CFLAGS)' \ EXTRA_LDFLAGS='$(SAN_LDFLAGS)' && \ From 8e667a065daa6f4c01eadc20f3815f7bf13255bc Mon Sep 17 00:00:00 2001 From: Jordan Rife Date: Mon, 29 Apr 2024 16:45:18 -0500 Subject: [PATCH 031/119] selftests/bpf: Fix bind program for big endian systems Without this fix, the bind4 and bind6 programs will reject bind attempts on big endian systems. This patch ensures that CI tests pass for the s390x architecture. Signed-off-by: Jordan Rife Link: https://lore.kernel.org/r/20240429214529.2644801-2-jrife@google.com Signed-off-by: Martin KaFai Lau --- .../testing/selftests/bpf/progs/bind4_prog.c | 18 ++++++++++-------- .../testing/selftests/bpf/progs/bind6_prog.c | 18 ++++++++++-------- tools/testing/selftests/bpf/progs/bind_prog.h | 19 +++++++++++++++++++ 3 files changed, 39 insertions(+), 16 deletions(-) create mode 100644 tools/testing/selftests/bpf/progs/bind_prog.h diff --git a/tools/testing/selftests/bpf/progs/bind4_prog.c b/tools/testing/selftests/bpf/progs/bind4_prog.c index a487f60b73ac..66005c1a5b36 100644 --- a/tools/testing/selftests/bpf/progs/bind4_prog.c +++ b/tools/testing/selftests/bpf/progs/bind4_prog.c @@ -12,6 +12,8 @@ #include #include +#include "bind_prog.h" + #define SERV4_IP 0xc0a801feU /* 192.168.1.254 */ #define SERV4_PORT 4040 #define SERV4_REWRITE_IP 0x7f000001U /* 127.0.0.1 */ @@ -118,23 +120,23 @@ int bind_v4_prog(struct bpf_sock_addr *ctx) // u8 narrow loads: user_ip4 = 0; - user_ip4 |= ((volatile __u8 *)&ctx->user_ip4)[0] << 0; - user_ip4 |= ((volatile __u8 *)&ctx->user_ip4)[1] << 8; - user_ip4 |= ((volatile __u8 *)&ctx->user_ip4)[2] << 16; - user_ip4 |= ((volatile __u8 *)&ctx->user_ip4)[3] << 24; + user_ip4 |= load_byte(ctx->user_ip4, 0, sizeof(user_ip4)); + user_ip4 |= load_byte(ctx->user_ip4, 1, sizeof(user_ip4)); + user_ip4 |= load_byte(ctx->user_ip4, 2, sizeof(user_ip4)); + user_ip4 |= load_byte(ctx->user_ip4, 3, sizeof(user_ip4)); if (ctx->user_ip4 != user_ip4) return 0; user_port = 0; - user_port |= ((volatile __u8 *)&ctx->user_port)[0] << 0; - user_port |= ((volatile __u8 *)&ctx->user_port)[1] << 8; + user_port |= load_byte(ctx->user_port, 0, sizeof(user_port)); + user_port |= load_byte(ctx->user_port, 1, sizeof(user_port)); if (ctx->user_port != user_port) return 0; // u16 narrow loads: user_ip4 = 0; - user_ip4 |= ((volatile __u16 *)&ctx->user_ip4)[0] << 0; - user_ip4 |= ((volatile __u16 *)&ctx->user_ip4)[1] << 16; + user_ip4 |= load_word(ctx->user_ip4, 0, sizeof(user_ip4)); + user_ip4 |= load_word(ctx->user_ip4, 1, sizeof(user_ip4)); if (ctx->user_ip4 != user_ip4) return 0; diff --git a/tools/testing/selftests/bpf/progs/bind6_prog.c b/tools/testing/selftests/bpf/progs/bind6_prog.c index d62cd9e9cf0e..9c86c712348c 100644 --- a/tools/testing/selftests/bpf/progs/bind6_prog.c +++ b/tools/testing/selftests/bpf/progs/bind6_prog.c @@ -12,6 +12,8 @@ #include #include +#include "bind_prog.h" + #define SERV6_IP_0 0xfaceb00c /* face:b00c:1234:5678::abcd */ #define SERV6_IP_1 0x12345678 #define SERV6_IP_2 0x00000000 @@ -129,25 +131,25 @@ int bind_v6_prog(struct bpf_sock_addr *ctx) // u8 narrow loads: for (i = 0; i < 4; i++) { user_ip6 = 0; - user_ip6 |= ((volatile __u8 *)&ctx->user_ip6[i])[0] << 0; - user_ip6 |= ((volatile __u8 *)&ctx->user_ip6[i])[1] << 8; - user_ip6 |= ((volatile __u8 *)&ctx->user_ip6[i])[2] << 16; - user_ip6 |= ((volatile __u8 *)&ctx->user_ip6[i])[3] << 24; + user_ip6 |= load_byte(ctx->user_ip6[i], 0, sizeof(user_ip6)); + user_ip6 |= load_byte(ctx->user_ip6[i], 1, sizeof(user_ip6)); + user_ip6 |= load_byte(ctx->user_ip6[i], 2, sizeof(user_ip6)); + user_ip6 |= load_byte(ctx->user_ip6[i], 3, sizeof(user_ip6)); if (ctx->user_ip6[i] != user_ip6) return 0; } user_port = 0; - user_port |= ((volatile __u8 *)&ctx->user_port)[0] << 0; - user_port |= ((volatile __u8 *)&ctx->user_port)[1] << 8; + user_port |= load_byte(ctx->user_port, 0, sizeof(user_port)); + user_port |= load_byte(ctx->user_port, 1, sizeof(user_port)); if (ctx->user_port != user_port) return 0; // u16 narrow loads: for (i = 0; i < 4; i++) { user_ip6 = 0; - user_ip6 |= ((volatile __u16 *)&ctx->user_ip6[i])[0] << 0; - user_ip6 |= ((volatile __u16 *)&ctx->user_ip6[i])[1] << 16; + user_ip6 |= load_word(ctx->user_ip6[i], 0, sizeof(user_ip6)); + user_ip6 |= load_word(ctx->user_ip6[i], 1, sizeof(user_ip6)); if (ctx->user_ip6[i] != user_ip6) return 0; } diff --git a/tools/testing/selftests/bpf/progs/bind_prog.h b/tools/testing/selftests/bpf/progs/bind_prog.h new file mode 100644 index 000000000000..e830caa940c3 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bind_prog.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __BIND_PROG_H__ +#define __BIND_PROG_H__ + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define load_byte(src, b, s) \ + (((volatile __u8 *)&(src))[b] << 8 * b) +#define load_word(src, w, s) \ + (((volatile __u16 *)&(src))[w] << 16 * w) +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#define load_byte(src, b, s) \ + (((volatile __u8 *)&(src))[(b) + (sizeof(src) - (s))] << 8 * ((s) - (b) - 1)) +#define load_word(src, w, s) \ + (((volatile __u16 *)&(src))[w] << 16 * (((s) / 2) - (w) - 1)) +#else +# error "Fix your compiler's __BYTE_ORDER__?!" +#endif + +#endif From bbb1cfdd02249dc8cf878e86a523b28814ed36c0 Mon Sep 17 00:00:00 2001 From: Jordan Rife Date: Mon, 29 Apr 2024 16:45:19 -0500 Subject: [PATCH 032/119] selftests/bpf: Implement socket kfuncs for bpf_testmod This patch adds a set of kfuncs to bpf_testmod that can be used to manipulate a socket from kernel space. Signed-off-by: Jordan Rife Link: https://lore.kernel.org/r/20240429214529.2644801-3-jrife@google.com Signed-off-by: Martin KaFai Lau --- .../selftests/bpf/bpf_testmod/bpf_testmod.c | 255 ++++++++++++++++++ .../bpf/bpf_testmod/bpf_testmod_kfunc.h | 27 ++ 2 files changed, 282 insertions(+) diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c index eb2b78552ca2..e93013fc7bf4 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c @@ -10,18 +10,30 @@ #include #include #include +#include +#include +#include +#include +#include +#include +#include +#include #include "bpf_testmod.h" #include "bpf_testmod_kfunc.h" #define CREATE_TRACE_POINTS #include "bpf_testmod-events.h" +#define CONNECT_TIMEOUT_SEC 1 + typedef int (*func_proto_typedef)(long); typedef int (*func_proto_typedef_nested1)(func_proto_typedef); typedef int (*func_proto_typedef_nested2)(func_proto_typedef_nested1); DEFINE_PER_CPU(int, bpf_testmod_ksym_percpu) = 123; long bpf_testmod_test_struct_arg_result; +static DEFINE_MUTEX(sock_lock); +static struct socket *sock; struct bpf_testmod_struct_arg_1 { int a; @@ -498,6 +510,237 @@ __bpf_kfunc void bpf_kfunc_call_test_sleepable(void) { } +__bpf_kfunc int bpf_kfunc_init_sock(struct init_sock_args *args) +{ + int proto; + int err; + + mutex_lock(&sock_lock); + + if (sock) { + pr_err("%s called without releasing old sock", __func__); + err = -EPERM; + goto out; + } + + switch (args->af) { + case AF_INET: + case AF_INET6: + proto = args->type == SOCK_STREAM ? IPPROTO_TCP : IPPROTO_UDP; + break; + case AF_UNIX: + proto = PF_UNIX; + break; + default: + pr_err("invalid address family %d\n", args->af); + err = -EINVAL; + goto out; + } + + err = sock_create_kern(current->nsproxy->net_ns, args->af, args->type, + proto, &sock); + + if (!err) + /* Set timeout for call to kernel_connect() to prevent it from hanging, + * and consider the connection attempt failed if it returns + * -EINPROGRESS. + */ + sock->sk->sk_sndtimeo = CONNECT_TIMEOUT_SEC * HZ; +out: + mutex_unlock(&sock_lock); + + return err; +} + +__bpf_kfunc void bpf_kfunc_close_sock(void) +{ + mutex_lock(&sock_lock); + + if (sock) { + sock_release(sock); + sock = NULL; + } + + mutex_unlock(&sock_lock); +} + +__bpf_kfunc int bpf_kfunc_call_kernel_connect(struct addr_args *args) +{ + int err; + + if (args->addrlen > sizeof(args->addr)) + return -EINVAL; + + mutex_lock(&sock_lock); + + if (!sock) { + pr_err("%s called without initializing sock", __func__); + err = -EPERM; + goto out; + } + + err = kernel_connect(sock, (struct sockaddr *)&args->addr, + args->addrlen, 0); +out: + mutex_unlock(&sock_lock); + + return err; +} + +__bpf_kfunc int bpf_kfunc_call_kernel_bind(struct addr_args *args) +{ + int err; + + if (args->addrlen > sizeof(args->addr)) + return -EINVAL; + + mutex_lock(&sock_lock); + + if (!sock) { + pr_err("%s called without initializing sock", __func__); + err = -EPERM; + goto out; + } + + err = kernel_bind(sock, (struct sockaddr *)&args->addr, args->addrlen); +out: + mutex_unlock(&sock_lock); + + return err; +} + +__bpf_kfunc int bpf_kfunc_call_kernel_listen(void) +{ + int err; + + mutex_lock(&sock_lock); + + if (!sock) { + pr_err("%s called without initializing sock", __func__); + err = -EPERM; + goto out; + } + + err = kernel_listen(sock, 128); +out: + mutex_unlock(&sock_lock); + + return err; +} + +__bpf_kfunc int bpf_kfunc_call_kernel_sendmsg(struct sendmsg_args *args) +{ + struct msghdr msg = { + .msg_name = &args->addr.addr, + .msg_namelen = args->addr.addrlen, + }; + struct kvec iov; + int err; + + if (args->addr.addrlen > sizeof(args->addr.addr) || + args->msglen > sizeof(args->msg)) + return -EINVAL; + + iov.iov_base = args->msg; + iov.iov_len = args->msglen; + + mutex_lock(&sock_lock); + + if (!sock) { + pr_err("%s called without initializing sock", __func__); + err = -EPERM; + goto out; + } + + err = kernel_sendmsg(sock, &msg, &iov, 1, args->msglen); + args->addr.addrlen = msg.msg_namelen; +out: + mutex_unlock(&sock_lock); + + return err; +} + +__bpf_kfunc int bpf_kfunc_call_sock_sendmsg(struct sendmsg_args *args) +{ + struct msghdr msg = { + .msg_name = &args->addr.addr, + .msg_namelen = args->addr.addrlen, + }; + struct kvec iov; + int err; + + if (args->addr.addrlen > sizeof(args->addr.addr) || + args->msglen > sizeof(args->msg)) + return -EINVAL; + + iov.iov_base = args->msg; + iov.iov_len = args->msglen; + + iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &iov, 1, args->msglen); + mutex_lock(&sock_lock); + + if (!sock) { + pr_err("%s called without initializing sock", __func__); + err = -EPERM; + goto out; + } + + err = sock_sendmsg(sock, &msg); + args->addr.addrlen = msg.msg_namelen; +out: + mutex_unlock(&sock_lock); + + return err; +} + +__bpf_kfunc int bpf_kfunc_call_kernel_getsockname(struct addr_args *args) +{ + int err; + + mutex_lock(&sock_lock); + + if (!sock) { + pr_err("%s called without initializing sock", __func__); + err = -EPERM; + goto out; + } + + err = kernel_getsockname(sock, (struct sockaddr *)&args->addr); + if (err < 0) + goto out; + + args->addrlen = err; + err = 0; +out: + mutex_unlock(&sock_lock); + + return err; +} + +__bpf_kfunc int bpf_kfunc_call_kernel_getpeername(struct addr_args *args) +{ + int err; + + mutex_lock(&sock_lock); + + if (!sock) { + pr_err("%s called without initializing sock", __func__); + err = -EPERM; + goto out; + } + + err = kernel_getpeername(sock, (struct sockaddr *)&args->addr); + if (err < 0) + goto out; + + args->addrlen = err; + err = 0; +out: + mutex_unlock(&sock_lock); + + return err; +} + BTF_KFUNCS_START(bpf_testmod_check_kfunc_ids) BTF_ID_FLAGS(func, bpf_testmod_test_mod_kfunc) BTF_ID_FLAGS(func, bpf_kfunc_call_test1) @@ -525,6 +768,15 @@ BTF_ID_FLAGS(func, bpf_kfunc_call_test_destructive, KF_DESTRUCTIVE) BTF_ID_FLAGS(func, bpf_kfunc_call_test_static_unused_arg) BTF_ID_FLAGS(func, bpf_kfunc_call_test_offset) BTF_ID_FLAGS(func, bpf_kfunc_call_test_sleepable, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_init_sock, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_close_sock, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_connect, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_bind, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_listen, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_sendmsg, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_call_sock_sendmsg, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_getsockname, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_getpeername, KF_SLEEPABLE) BTF_KFUNCS_END(bpf_testmod_check_kfunc_ids) static int bpf_testmod_ops_init(struct btf *btf) @@ -655,6 +907,8 @@ static int bpf_testmod_init(void) return ret; if (bpf_fentry_test1(0) < 0) return -EINVAL; + sock = NULL; + mutex_init(&sock_lock); return sysfs_create_bin_file(kernel_kobj, &bin_attr_bpf_testmod_file); } @@ -668,6 +922,7 @@ static void bpf_testmod_exit(void) while (refcount_read(&prog_test_struct.cnt) > 1) msleep(20); + bpf_kfunc_close_sock(); sysfs_remove_bin_file(kernel_kobj, &bin_attr_bpf_testmod_file); } diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h index ce5cd763561c..b0d586a6751f 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod_kfunc.h @@ -64,6 +64,22 @@ struct prog_test_fail3 { char arr2[]; }; +struct init_sock_args { + int af; + int type; +}; + +struct addr_args { + char addr[sizeof(struct __kernel_sockaddr_storage)]; + int addrlen; +}; + +struct sendmsg_args { + struct addr_args addr; + char msg[10]; + int msglen; +}; + struct prog_test_ref_kfunc * bpf_kfunc_call_test_acquire(unsigned long *scalar_ptr) __ksym; void bpf_kfunc_call_test_release(struct prog_test_ref_kfunc *p) __ksym; @@ -107,4 +123,15 @@ void bpf_kfunc_call_test_fail3(struct prog_test_fail3 *p); void bpf_kfunc_call_test_mem_len_fail1(void *mem, int len); void bpf_kfunc_common_test(void) __ksym; + +int bpf_kfunc_init_sock(struct init_sock_args *args) __ksym; +void bpf_kfunc_close_sock(void) __ksym; +int bpf_kfunc_call_kernel_connect(struct addr_args *args) __ksym; +int bpf_kfunc_call_kernel_bind(struct addr_args *args) __ksym; +int bpf_kfunc_call_kernel_listen(void) __ksym; +int bpf_kfunc_call_kernel_sendmsg(struct sendmsg_args *args) __ksym; +int bpf_kfunc_call_sock_sendmsg(struct sendmsg_args *args) __ksym; +int bpf_kfunc_call_kernel_getsockname(struct addr_args *args) __ksym; +int bpf_kfunc_call_kernel_getpeername(struct addr_args *args) __ksym; + #endif /* _BPF_TESTMOD_KFUNC_H */ From 15b6671efa508ff9c1fb995452913f8de85db73b Mon Sep 17 00:00:00 2001 From: Jordan Rife Date: Mon, 29 Apr 2024 16:45:20 -0500 Subject: [PATCH 033/119] selftests/bpf: Implement BPF programs for kernel socket operations This patch lays out a set of SYSCALL programs that can be used to invoke the socket operation kfuncs in bpf_testmod, allowing a test program to manipulate kernel socket operations from userspace. Signed-off-by: Jordan Rife Link: https://lore.kernel.org/r/20240429214529.2644801-4-jrife@google.com Signed-off-by: Martin KaFai Lau --- .../selftests/bpf/progs/sock_addr_kern.c | 65 +++++++++++++++++++ 1 file changed, 65 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/sock_addr_kern.c diff --git a/tools/testing/selftests/bpf/progs/sock_addr_kern.c b/tools/testing/selftests/bpf/progs/sock_addr_kern.c new file mode 100644 index 000000000000..8386bb15ccdc --- /dev/null +++ b/tools/testing/selftests/bpf/progs/sock_addr_kern.c @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Google LLC */ +#include +#include +#include "../bpf_testmod/bpf_testmod_kfunc.h" + +SEC("syscall") +int init_sock(struct init_sock_args *args) +{ + bpf_kfunc_init_sock(args); + + return 0; +} + +SEC("syscall") +int close_sock(void *ctx) +{ + bpf_kfunc_close_sock(); + + return 0; +} + +SEC("syscall") +int kernel_connect(struct addr_args *args) +{ + return bpf_kfunc_call_kernel_connect(args); +} + +SEC("syscall") +int kernel_bind(struct addr_args *args) +{ + return bpf_kfunc_call_kernel_bind(args); +} + +SEC("syscall") +int kernel_listen(struct addr_args *args) +{ + return bpf_kfunc_call_kernel_listen(); +} + +SEC("syscall") +int kernel_sendmsg(struct sendmsg_args *args) +{ + return bpf_kfunc_call_kernel_sendmsg(args); +} + +SEC("syscall") +int sock_sendmsg(struct sendmsg_args *args) +{ + return bpf_kfunc_call_sock_sendmsg(args); +} + +SEC("syscall") +int kernel_getsockname(struct addr_args *args) +{ + return bpf_kfunc_call_kernel_getsockname(args); +} + +SEC("syscall") +int kernel_getpeername(struct addr_args *args) +{ + return bpf_kfunc_call_kernel_getpeername(args); +} + +char _license[] SEC("license") = "GPL"; From 8a9d22b8aeb2182cfe83991f11a88b3351084d3e Mon Sep 17 00:00:00 2001 From: Jordan Rife Date: Mon, 29 Apr 2024 16:45:21 -0500 Subject: [PATCH 034/119] selftests/bpf: Move IPv4 and IPv6 sockaddr test cases This patch lays the groundwork for testing IPv4 and IPv6 sockaddr hooks and their interaction with both socket syscalls and kernel functions (e.g. kernel_connect, kernel_bind, etc.). It moves some of the test cases from the old-style bpf/test_sock_addr.c self test into the sock_addr prog_test in a step towards fully retiring bpf/test_sock_addr.c. We will expand the test dimensions in the sock_addr prog_test in a later patch series in order to migrate the remaining test cases. Signed-off-by: Jordan Rife Link: https://lore.kernel.org/r/20240429214529.2644801-5-jrife@google.com Signed-off-by: Martin KaFai Lau --- .../selftests/bpf/prog_tests/sock_addr.c | 390 ++++++++++++------ tools/testing/selftests/bpf/test_sock_addr.c | 192 --------- 2 files changed, 268 insertions(+), 314 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/sock_addr.c b/tools/testing/selftests/bpf/prog_tests/sock_addr.c index 61668e0f11b0..bf1ad18262d8 100644 --- a/tools/testing/selftests/bpf/prog_tests/sock_addr.c +++ b/tools/testing/selftests/bpf/prog_tests/sock_addr.c @@ -3,16 +3,44 @@ #include "test_progs.h" +#include "bind4_prog.skel.h" +#include "bind6_prog.skel.h" #include "connect_unix_prog.skel.h" +#include "connect4_prog.skel.h" +#include "connect6_prog.skel.h" +#include "sendmsg4_prog.skel.h" +#include "sendmsg6_prog.skel.h" +#include "recvmsg4_prog.skel.h" +#include "recvmsg6_prog.skel.h" #include "sendmsg_unix_prog.skel.h" #include "recvmsg_unix_prog.skel.h" #include "getsockname_unix_prog.skel.h" #include "getpeername_unix_prog.skel.h" #include "network_helpers.h" +#define TEST_NS "sock_addr" +#define TEST_IF_PREFIX "test_sock_addr" +#define TEST_IPV4 "127.0.0.4" +#define TEST_IPV6 "::6" + +#define SERV4_IP "192.168.1.254" +#define SERV4_REWRITE_IP "127.0.0.1" +#define SRC4_IP "172.16.0.1" +#define SRC4_REWRITE_IP TEST_IPV4 +#define SERV4_PORT 4040 +#define SERV4_REWRITE_PORT 4444 + +#define SERV6_IP "face:b00c:1234:5678::abcd" +#define SERV6_REWRITE_IP "::1" +#define SERV6_V4MAPPED_IP "::ffff:192.168.0.4" +#define SRC6_IP "::1" +#define SRC6_REWRITE_IP TEST_IPV6 +#define SERV6_PORT 6060 +#define SERV6_REWRITE_PORT 6666 + #define SERVUN_ADDRESS "bpf_cgroup_unix_test" #define SERVUN_REWRITE_ADDRESS "bpf_cgroup_unix_test_rewrite" -#define SRCUN_ADDRESS "bpf_cgroup_unix_test_src" +#define SRCUN_ADDRESS "bpf_cgroup_unix_test_src" enum sock_addr_test_type { SOCK_ADDR_TEST_BIND, @@ -43,130 +71,148 @@ struct sock_addr_test { const char *expected_src_addr; }; -static void *connect_unix_prog_load(int cgroup_fd) -{ - struct connect_unix_prog *skel; - - skel = connect_unix_prog__open_and_load(); - if (!ASSERT_OK_PTR(skel, "skel_open")) - goto cleanup; - - skel->links.connect_unix_prog = bpf_program__attach_cgroup( - skel->progs.connect_unix_prog, cgroup_fd); - if (!ASSERT_OK_PTR(skel->links.connect_unix_prog, "prog_attach")) - goto cleanup; - - return skel; -cleanup: - connect_unix_prog__destroy(skel); - return NULL; +#define BPF_SKEL_FUNCS(skel_name, prog_name) \ +static void *skel_name##_load(int cgroup_fd) \ +{ \ + struct skel_name *skel; \ + skel = skel_name##__open_and_load(); \ + if (!ASSERT_OK_PTR(skel, "skel_open")) \ + goto cleanup; \ + skel->links.prog_name = bpf_program__attach_cgroup( \ + skel->progs.prog_name, cgroup_fd); \ + if (!ASSERT_OK_PTR(skel->links.prog_name, "prog_attach")) \ + goto cleanup; \ + return skel; \ +cleanup: \ + skel_name##__destroy(skel); \ + return NULL; \ +} \ +static void skel_name##_destroy(void *skel) \ +{ \ + skel_name##__destroy(skel); \ } -static void connect_unix_prog_destroy(void *skel) -{ - connect_unix_prog__destroy(skel); -} - -static void *sendmsg_unix_prog_load(int cgroup_fd) -{ - struct sendmsg_unix_prog *skel; - - skel = sendmsg_unix_prog__open_and_load(); - if (!ASSERT_OK_PTR(skel, "skel_open")) - goto cleanup; - - skel->links.sendmsg_unix_prog = bpf_program__attach_cgroup( - skel->progs.sendmsg_unix_prog, cgroup_fd); - if (!ASSERT_OK_PTR(skel->links.sendmsg_unix_prog, "prog_attach")) - goto cleanup; - - return skel; -cleanup: - sendmsg_unix_prog__destroy(skel); - return NULL; -} - -static void sendmsg_unix_prog_destroy(void *skel) -{ - sendmsg_unix_prog__destroy(skel); -} - -static void *recvmsg_unix_prog_load(int cgroup_fd) -{ - struct recvmsg_unix_prog *skel; - - skel = recvmsg_unix_prog__open_and_load(); - if (!ASSERT_OK_PTR(skel, "skel_open")) - goto cleanup; - - skel->links.recvmsg_unix_prog = bpf_program__attach_cgroup( - skel->progs.recvmsg_unix_prog, cgroup_fd); - if (!ASSERT_OK_PTR(skel->links.recvmsg_unix_prog, "prog_attach")) - goto cleanup; - - return skel; -cleanup: - recvmsg_unix_prog__destroy(skel); - return NULL; -} - -static void recvmsg_unix_prog_destroy(void *skel) -{ - recvmsg_unix_prog__destroy(skel); -} - -static void *getsockname_unix_prog_load(int cgroup_fd) -{ - struct getsockname_unix_prog *skel; - - skel = getsockname_unix_prog__open_and_load(); - if (!ASSERT_OK_PTR(skel, "skel_open")) - goto cleanup; - - skel->links.getsockname_unix_prog = bpf_program__attach_cgroup( - skel->progs.getsockname_unix_prog, cgroup_fd); - if (!ASSERT_OK_PTR(skel->links.getsockname_unix_prog, "prog_attach")) - goto cleanup; - - return skel; -cleanup: - getsockname_unix_prog__destroy(skel); - return NULL; -} - -static void getsockname_unix_prog_destroy(void *skel) -{ - getsockname_unix_prog__destroy(skel); -} - -static void *getpeername_unix_prog_load(int cgroup_fd) -{ - struct getpeername_unix_prog *skel; - - skel = getpeername_unix_prog__open_and_load(); - if (!ASSERT_OK_PTR(skel, "skel_open")) - goto cleanup; - - skel->links.getpeername_unix_prog = bpf_program__attach_cgroup( - skel->progs.getpeername_unix_prog, cgroup_fd); - if (!ASSERT_OK_PTR(skel->links.getpeername_unix_prog, "prog_attach")) - goto cleanup; - - return skel; -cleanup: - getpeername_unix_prog__destroy(skel); - return NULL; -} - -static void getpeername_unix_prog_destroy(void *skel) -{ - getpeername_unix_prog__destroy(skel); -} +BPF_SKEL_FUNCS(bind4_prog, bind_v4_prog); +BPF_SKEL_FUNCS(bind6_prog, bind_v6_prog); +BPF_SKEL_FUNCS(connect4_prog, connect_v4_prog); +BPF_SKEL_FUNCS(connect6_prog, connect_v6_prog); +BPF_SKEL_FUNCS(connect_unix_prog, connect_unix_prog); +BPF_SKEL_FUNCS(sendmsg4_prog, sendmsg_v4_prog); +BPF_SKEL_FUNCS(sendmsg6_prog, sendmsg_v6_prog); +BPF_SKEL_FUNCS(sendmsg_unix_prog, sendmsg_unix_prog); +BPF_SKEL_FUNCS(recvmsg4_prog, recvmsg4_prog); +BPF_SKEL_FUNCS(recvmsg6_prog, recvmsg6_prog); +BPF_SKEL_FUNCS(recvmsg_unix_prog, recvmsg_unix_prog); +BPF_SKEL_FUNCS(getsockname_unix_prog, getsockname_unix_prog); +BPF_SKEL_FUNCS(getpeername_unix_prog, getpeername_unix_prog); static struct sock_addr_test tests[] = { + /* bind - system calls */ + { + SOCK_ADDR_TEST_BIND, + "bind4: bind (stream)", + bind4_prog_load, + bind4_prog_destroy, + AF_INET, + SOCK_STREAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + }, + { + SOCK_ADDR_TEST_BIND, + "bind4: bind (dgram)", + bind4_prog_load, + bind4_prog_destroy, + AF_INET, + SOCK_DGRAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + }, + { + SOCK_ADDR_TEST_BIND, + "bind6: bind (stream)", + bind6_prog_load, + bind6_prog_destroy, + AF_INET6, + SOCK_STREAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + }, + { + SOCK_ADDR_TEST_BIND, + "bind6: bind (dgram)", + bind6_prog_load, + bind6_prog_destroy, + AF_INET6, + SOCK_DGRAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + }, + + /* connect - system calls */ { SOCK_ADDR_TEST_CONNECT, - "connect_unix", + "connect4: connect (stream)", + connect4_prog_load, + connect4_prog_destroy, + AF_INET, + SOCK_STREAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SRC4_REWRITE_IP, + }, + { + SOCK_ADDR_TEST_CONNECT, + "connect4: connect (dgram)", + connect4_prog_load, + connect4_prog_destroy, + AF_INET, + SOCK_DGRAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SRC4_REWRITE_IP, + }, + { + SOCK_ADDR_TEST_CONNECT, + "connect6: connect (stream)", + connect6_prog_load, + connect6_prog_destroy, + AF_INET6, + SOCK_STREAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SRC6_REWRITE_IP, + }, + { + SOCK_ADDR_TEST_CONNECT, + "connect6: connect (dgram)", + connect6_prog_load, + connect6_prog_destroy, + AF_INET6, + SOCK_DGRAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SRC6_REWRITE_IP, + }, + { + SOCK_ADDR_TEST_CONNECT, + "connect_unix: connect (stream)", connect_unix_prog_load, connect_unix_prog_destroy, AF_UNIX, @@ -177,9 +223,37 @@ static struct sock_addr_test tests[] = { 0, NULL, }, + + /* sendmsg - system calls */ { SOCK_ADDR_TEST_SENDMSG, - "sendmsg_unix", + "sendmsg4: sendmsg (dgram)", + sendmsg4_prog_load, + sendmsg4_prog_destroy, + AF_INET, + SOCK_DGRAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SRC4_REWRITE_IP, + }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg6: sendmsg (dgram)", + sendmsg6_prog_load, + sendmsg6_prog_destroy, + AF_INET6, + SOCK_DGRAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SRC6_REWRITE_IP, + }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg_unix: sendmsg (dgram)", sendmsg_unix_prog_load, sendmsg_unix_prog_destroy, AF_UNIX, @@ -190,9 +264,37 @@ static struct sock_addr_test tests[] = { 0, NULL, }, + + /* recvmsg - system calls */ { SOCK_ADDR_TEST_RECVMSG, - "recvmsg_unix-dgram", + "recvmsg4: recvfrom (dgram)", + recvmsg4_prog_load, + recvmsg4_prog_destroy, + AF_INET, + SOCK_DGRAM, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SERV4_IP, + }, + { + SOCK_ADDR_TEST_RECVMSG, + "recvmsg6: recvfrom (dgram)", + recvmsg6_prog_load, + recvmsg6_prog_destroy, + AF_INET6, + SOCK_DGRAM, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SERV6_IP, + }, + { + SOCK_ADDR_TEST_RECVMSG, + "recvmsg_unix: recvfrom (dgram)", recvmsg_unix_prog_load, recvmsg_unix_prog_destroy, AF_UNIX, @@ -205,7 +307,7 @@ static struct sock_addr_test tests[] = { }, { SOCK_ADDR_TEST_RECVMSG, - "recvmsg_unix-stream", + "recvmsg_unix: recvfrom (stream)", recvmsg_unix_prog_load, recvmsg_unix_prog_destroy, AF_UNIX, @@ -216,6 +318,8 @@ static struct sock_addr_test tests[] = { 0, SERVUN_ADDRESS, }, + + /* getsockname - system calls */ { SOCK_ADDR_TEST_GETSOCKNAME, "getsockname_unix", @@ -229,6 +333,8 @@ static struct sock_addr_test tests[] = { 0, NULL, }, + + /* getpeername - system calls */ { SOCK_ADDR_TEST_GETPEERNAME, "getpeername_unix", @@ -558,8 +664,44 @@ cleanup: close(serv); } +static int setup_test_env(struct nstoken **tok) +{ + int err; + + SYS_NOFAIL("ip netns delete %s", TEST_NS); + SYS(fail, "ip netns add %s", TEST_NS); + *tok = open_netns(TEST_NS); + if (!ASSERT_OK_PTR(*tok, "netns token")) + goto fail; + + SYS(fail, "ip link add dev %s1 type veth peer name %s2", TEST_IF_PREFIX, + TEST_IF_PREFIX); + SYS(fail, "ip link set lo up"); + SYS(fail, "ip link set %s1 up", TEST_IF_PREFIX); + SYS(fail, "ip link set %s2 up", TEST_IF_PREFIX); + SYS(fail, "ip -4 addr add %s/8 dev %s1", TEST_IPV4, TEST_IF_PREFIX); + SYS(fail, "ip -6 addr add %s/128 nodad dev %s1", TEST_IPV6, TEST_IF_PREFIX); + + err = 0; + goto out; +fail: + err = -1; + close_netns(*tok); + *tok = NULL; + SYS_NOFAIL("ip netns delete %s", TEST_NS); +out: + return err; +} + +static void cleanup_test_env(struct nstoken *tok) +{ + close_netns(tok); + SYS_NOFAIL("ip netns delete %s", TEST_NS); +} + void test_sock_addr(void) { + struct nstoken *tok = NULL; int cgroup_fd = -1; void *skel; @@ -567,6 +709,9 @@ void test_sock_addr(void) if (!ASSERT_GE(cgroup_fd, 0, "join_cgroup")) goto cleanup; + if (!ASSERT_OK(setup_test_env(&tok), "setup_test_env")) + goto cleanup; + for (size_t i = 0; i < ARRAY_SIZE(tests); ++i) { struct sock_addr_test *test = &tests[i]; @@ -607,6 +752,7 @@ void test_sock_addr(void) } cleanup: + cleanup_test_env(tok); if (cgroup_fd >= 0) close(cgroup_fd); } diff --git a/tools/testing/selftests/bpf/test_sock_addr.c b/tools/testing/selftests/bpf/test_sock_addr.c index c412de84b88f..aa2198a0f24d 100644 --- a/tools/testing/selftests/bpf/test_sock_addr.c +++ b/tools/testing/selftests/bpf/test_sock_addr.c @@ -97,11 +97,7 @@ static int sendmsg_deny_prog_load(const struct sock_addr_test *test); static int recvmsg_allow_prog_load(const struct sock_addr_test *test); static int recvmsg_deny_prog_load(const struct sock_addr_test *test); static int sendmsg4_rw_asm_prog_load(const struct sock_addr_test *test); -static int recvmsg4_rw_c_prog_load(const struct sock_addr_test *test); -static int sendmsg4_rw_c_prog_load(const struct sock_addr_test *test); static int sendmsg6_rw_asm_prog_load(const struct sock_addr_test *test); -static int recvmsg6_rw_c_prog_load(const struct sock_addr_test *test); -static int sendmsg6_rw_c_prog_load(const struct sock_addr_test *test); static int sendmsg6_rw_v4mapped_prog_load(const struct sock_addr_test *test); static int sendmsg6_rw_wildcard_prog_load(const struct sock_addr_test *test); @@ -135,34 +131,6 @@ static struct sock_addr_test tests[] = { NULL, ATTACH_REJECT, }, - { - "bind4: rewrite IP & TCP port in", - bind4_prog_load, - BPF_CGROUP_INET4_BIND, - BPF_CGROUP_INET4_BIND, - AF_INET, - SOCK_STREAM, - SERV4_IP, - SERV4_PORT, - SERV4_REWRITE_IP, - SERV4_REWRITE_PORT, - NULL, - SUCCESS, - }, - { - "bind4: rewrite IP & UDP port in", - bind4_prog_load, - BPF_CGROUP_INET4_BIND, - BPF_CGROUP_INET4_BIND, - AF_INET, - SOCK_DGRAM, - SERV4_IP, - SERV4_PORT, - SERV4_REWRITE_IP, - SERV4_REWRITE_PORT, - NULL, - SUCCESS, - }, { "bind6: load prog with wrong expected attach type", bind6_prog_load, @@ -191,34 +159,6 @@ static struct sock_addr_test tests[] = { NULL, ATTACH_REJECT, }, - { - "bind6: rewrite IP & TCP port in", - bind6_prog_load, - BPF_CGROUP_INET6_BIND, - BPF_CGROUP_INET6_BIND, - AF_INET6, - SOCK_STREAM, - SERV6_IP, - SERV6_PORT, - SERV6_REWRITE_IP, - SERV6_REWRITE_PORT, - NULL, - SUCCESS, - }, - { - "bind6: rewrite IP & UDP port in", - bind6_prog_load, - BPF_CGROUP_INET6_BIND, - BPF_CGROUP_INET6_BIND, - AF_INET6, - SOCK_DGRAM, - SERV6_IP, - SERV6_PORT, - SERV6_REWRITE_IP, - SERV6_REWRITE_PORT, - NULL, - SUCCESS, - }, /* connect */ { @@ -249,34 +189,6 @@ static struct sock_addr_test tests[] = { NULL, ATTACH_REJECT, }, - { - "connect4: rewrite IP & TCP port", - connect4_prog_load, - BPF_CGROUP_INET4_CONNECT, - BPF_CGROUP_INET4_CONNECT, - AF_INET, - SOCK_STREAM, - SERV4_IP, - SERV4_PORT, - SERV4_REWRITE_IP, - SERV4_REWRITE_PORT, - SRC4_REWRITE_IP, - SUCCESS, - }, - { - "connect4: rewrite IP & UDP port", - connect4_prog_load, - BPF_CGROUP_INET4_CONNECT, - BPF_CGROUP_INET4_CONNECT, - AF_INET, - SOCK_DGRAM, - SERV4_IP, - SERV4_PORT, - SERV4_REWRITE_IP, - SERV4_REWRITE_PORT, - SRC4_REWRITE_IP, - SUCCESS, - }, { "connect6: load prog with wrong expected attach type", connect6_prog_load, @@ -305,34 +217,6 @@ static struct sock_addr_test tests[] = { NULL, ATTACH_REJECT, }, - { - "connect6: rewrite IP & TCP port", - connect6_prog_load, - BPF_CGROUP_INET6_CONNECT, - BPF_CGROUP_INET6_CONNECT, - AF_INET6, - SOCK_STREAM, - SERV6_IP, - SERV6_PORT, - SERV6_REWRITE_IP, - SERV6_REWRITE_PORT, - SRC6_REWRITE_IP, - SUCCESS, - }, - { - "connect6: rewrite IP & UDP port", - connect6_prog_load, - BPF_CGROUP_INET6_CONNECT, - BPF_CGROUP_INET6_CONNECT, - AF_INET6, - SOCK_DGRAM, - SERV6_IP, - SERV6_PORT, - SERV6_REWRITE_IP, - SERV6_REWRITE_PORT, - SRC6_REWRITE_IP, - SUCCESS, - }, /* sendmsg */ { @@ -377,20 +261,6 @@ static struct sock_addr_test tests[] = { SRC4_REWRITE_IP, SUCCESS, }, - { - "sendmsg4: rewrite IP & port (C)", - sendmsg4_rw_c_prog_load, - BPF_CGROUP_UDP4_SENDMSG, - BPF_CGROUP_UDP4_SENDMSG, - AF_INET, - SOCK_DGRAM, - SERV4_IP, - SERV4_PORT, - SERV4_REWRITE_IP, - SERV4_REWRITE_PORT, - SRC4_REWRITE_IP, - SUCCESS, - }, { "sendmsg4: deny call", sendmsg_deny_prog_load, @@ -447,20 +317,6 @@ static struct sock_addr_test tests[] = { SRC6_REWRITE_IP, SUCCESS, }, - { - "sendmsg6: rewrite IP & port (C)", - sendmsg6_rw_c_prog_load, - BPF_CGROUP_UDP6_SENDMSG, - BPF_CGROUP_UDP6_SENDMSG, - AF_INET6, - SOCK_DGRAM, - SERV6_IP, - SERV6_PORT, - SERV6_REWRITE_IP, - SERV6_REWRITE_PORT, - SRC6_REWRITE_IP, - SUCCESS, - }, { "sendmsg6: IPv4-mapped IPv6", sendmsg6_rw_v4mapped_prog_load, @@ -575,34 +431,6 @@ static struct sock_addr_test tests[] = { NULL, LOAD_REJECT, }, - { - "recvmsg4: rewrite IP & port (C)", - recvmsg4_rw_c_prog_load, - BPF_CGROUP_UDP4_RECVMSG, - BPF_CGROUP_UDP4_RECVMSG, - AF_INET, - SOCK_DGRAM, - SERV4_REWRITE_IP, - SERV4_REWRITE_PORT, - SERV4_REWRITE_IP, - SERV4_REWRITE_PORT, - SERV4_IP, - SUCCESS, - }, - { - "recvmsg6: rewrite IP & port (C)", - recvmsg6_rw_c_prog_load, - BPF_CGROUP_UDP6_RECVMSG, - BPF_CGROUP_UDP6_RECVMSG, - AF_INET6, - SOCK_DGRAM, - SERV6_REWRITE_IP, - SERV6_REWRITE_PORT, - SERV6_REWRITE_IP, - SERV6_REWRITE_PORT, - SERV6_IP, - SUCCESS, - }, }; static int load_insns(const struct sock_addr_test *test, @@ -761,16 +589,6 @@ static int sendmsg4_rw_asm_prog_load(const struct sock_addr_test *test) return load_insns(test, insns, ARRAY_SIZE(insns)); } -static int recvmsg4_rw_c_prog_load(const struct sock_addr_test *test) -{ - return load_path(test, RECVMSG4_PROG_PATH); -} - -static int sendmsg4_rw_c_prog_load(const struct sock_addr_test *test) -{ - return load_path(test, SENDMSG4_PROG_PATH); -} - static int sendmsg6_rw_dst_asm_prog_load(const struct sock_addr_test *test, const char *rw_dst_ip) { @@ -829,11 +647,6 @@ static int sendmsg6_rw_asm_prog_load(const struct sock_addr_test *test) return sendmsg6_rw_dst_asm_prog_load(test, SERV6_REWRITE_IP); } -static int recvmsg6_rw_c_prog_load(const struct sock_addr_test *test) -{ - return load_path(test, RECVMSG6_PROG_PATH); -} - static int sendmsg6_rw_v4mapped_prog_load(const struct sock_addr_test *test) { return sendmsg6_rw_dst_asm_prog_load(test, SERV6_V4MAPPED_IP); @@ -844,11 +657,6 @@ static int sendmsg6_rw_wildcard_prog_load(const struct sock_addr_test *test) return sendmsg6_rw_dst_asm_prog_load(test, WILDCARD6_IP); } -static int sendmsg6_rw_c_prog_load(const struct sock_addr_test *test) -{ - return load_path(test, SENDMSG6_PROG_PATH); -} - static int cmp_addr(const struct sockaddr_storage *addr1, const struct sockaddr_storage *addr2, int cmp_port) { From 524e05ac4e14bb50b4f442e1fe88540abc9b72fd Mon Sep 17 00:00:00 2001 From: Jordan Rife Date: Mon, 29 Apr 2024 16:45:22 -0500 Subject: [PATCH 035/119] selftests/bpf: Make sock configurable for each test case In order to reuse the same test code for both socket system calls (e.g. connect(), bind(), etc.) and kernel socket functions (e.g. kernel_connect(), kernel_bind(), etc.), this patch introduces the "ops" field to sock_addr_test. This field allows each test cases to configure the set of functions used in the test case to create, manipulate, and tear down a socket. Signed-off-by: Jordan Rife Link: https://lore.kernel.org/r/20240429214529.2644801-6-jrife@google.com Signed-off-by: Martin KaFai Lau --- .../selftests/bpf/prog_tests/sock_addr.c | 140 ++++++++++++------ 1 file changed, 98 insertions(+), 42 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/sock_addr.c b/tools/testing/selftests/bpf/prog_tests/sock_addr.c index bf1ad18262d8..8f678fa413f5 100644 --- a/tools/testing/selftests/bpf/prog_tests/sock_addr.c +++ b/tools/testing/selftests/bpf/prog_tests/sock_addr.c @@ -54,12 +54,64 @@ enum sock_addr_test_type { typedef void *(*load_fn)(int cgroup_fd); typedef void (*destroy_fn)(void *skel); +struct sock_ops { + int (*connect_to_addr)(int type, const struct sockaddr_storage *addr, + socklen_t addrlen, + const struct network_helper_opts *opts); + int (*start_server)(int family, int type, const char *addr_str, + __u16 port, int timeout_ms); + int (*socket)(int famil, int type, int protocol); + int (*bind)(int fd, struct sockaddr *addr, socklen_t addrlen); + int (*getsockname)(int fd, struct sockaddr *addr, socklen_t *addrlen); + int (*getpeername)(int fd, struct sockaddr *addr, socklen_t *addrlen); + int (*sendmsg)(int fd, struct sockaddr *addr, socklen_t addrlen, + char *msg, int msglen); + int (*close)(int fd); +}; + +static int user_sendmsg(int fd, struct sockaddr *addr, socklen_t addrlen, + char *msg, int msglen) +{ + struct msghdr hdr; + struct iovec iov; + + memset(&iov, 0, sizeof(iov)); + iov.iov_base = msg; + iov.iov_len = msglen; + + memset(&hdr, 0, sizeof(hdr)); + hdr.msg_name = (void *)addr; + hdr.msg_namelen = addrlen; + hdr.msg_iov = &iov; + hdr.msg_iovlen = 1; + + return sendmsg(fd, &hdr, 0); +} + +static int user_bind(int fd, struct sockaddr *addr, socklen_t addrlen) +{ + return bind(fd, (const struct sockaddr *)addr, addrlen); +} + +struct sock_ops user_ops = { + .connect_to_addr = connect_to_addr, + .start_server = start_server, + .socket = socket, + .bind = user_bind, + .getsockname = getsockname, + .getpeername = getpeername, + .sendmsg = user_sendmsg, + .close = close, +}; + struct sock_addr_test { enum sock_addr_test_type type; const char *name; /* BPF prog properties */ load_fn loadfn; destroy_fn destroyfn; + /* Socket operations */ + struct sock_ops *ops; /* Socket properties */ int socket_family; int socket_type; @@ -113,6 +165,7 @@ static struct sock_addr_test tests[] = { "bind4: bind (stream)", bind4_prog_load, bind4_prog_destroy, + &user_ops, AF_INET, SOCK_STREAM, SERV4_IP, @@ -125,6 +178,7 @@ static struct sock_addr_test tests[] = { "bind4: bind (dgram)", bind4_prog_load, bind4_prog_destroy, + &user_ops, AF_INET, SOCK_DGRAM, SERV4_IP, @@ -137,6 +191,7 @@ static struct sock_addr_test tests[] = { "bind6: bind (stream)", bind6_prog_load, bind6_prog_destroy, + &user_ops, AF_INET6, SOCK_STREAM, SERV6_IP, @@ -149,6 +204,7 @@ static struct sock_addr_test tests[] = { "bind6: bind (dgram)", bind6_prog_load, bind6_prog_destroy, + &user_ops, AF_INET6, SOCK_DGRAM, SERV6_IP, @@ -163,6 +219,7 @@ static struct sock_addr_test tests[] = { "connect4: connect (stream)", connect4_prog_load, connect4_prog_destroy, + &user_ops, AF_INET, SOCK_STREAM, SERV4_IP, @@ -176,6 +233,7 @@ static struct sock_addr_test tests[] = { "connect4: connect (dgram)", connect4_prog_load, connect4_prog_destroy, + &user_ops, AF_INET, SOCK_DGRAM, SERV4_IP, @@ -189,6 +247,7 @@ static struct sock_addr_test tests[] = { "connect6: connect (stream)", connect6_prog_load, connect6_prog_destroy, + &user_ops, AF_INET6, SOCK_STREAM, SERV6_IP, @@ -202,6 +261,7 @@ static struct sock_addr_test tests[] = { "connect6: connect (dgram)", connect6_prog_load, connect6_prog_destroy, + &user_ops, AF_INET6, SOCK_DGRAM, SERV6_IP, @@ -215,6 +275,7 @@ static struct sock_addr_test tests[] = { "connect_unix: connect (stream)", connect_unix_prog_load, connect_unix_prog_destroy, + &user_ops, AF_UNIX, SOCK_STREAM, SERVUN_ADDRESS, @@ -230,6 +291,7 @@ static struct sock_addr_test tests[] = { "sendmsg4: sendmsg (dgram)", sendmsg4_prog_load, sendmsg4_prog_destroy, + &user_ops, AF_INET, SOCK_DGRAM, SERV4_IP, @@ -243,6 +305,7 @@ static struct sock_addr_test tests[] = { "sendmsg6: sendmsg (dgram)", sendmsg6_prog_load, sendmsg6_prog_destroy, + &user_ops, AF_INET6, SOCK_DGRAM, SERV6_IP, @@ -256,6 +319,7 @@ static struct sock_addr_test tests[] = { "sendmsg_unix: sendmsg (dgram)", sendmsg_unix_prog_load, sendmsg_unix_prog_destroy, + &user_ops, AF_UNIX, SOCK_DGRAM, SERVUN_ADDRESS, @@ -271,6 +335,7 @@ static struct sock_addr_test tests[] = { "recvmsg4: recvfrom (dgram)", recvmsg4_prog_load, recvmsg4_prog_destroy, + &user_ops, AF_INET, SOCK_DGRAM, SERV4_REWRITE_IP, @@ -284,6 +349,7 @@ static struct sock_addr_test tests[] = { "recvmsg6: recvfrom (dgram)", recvmsg6_prog_load, recvmsg6_prog_destroy, + &user_ops, AF_INET6, SOCK_DGRAM, SERV6_REWRITE_IP, @@ -297,6 +363,7 @@ static struct sock_addr_test tests[] = { "recvmsg_unix: recvfrom (dgram)", recvmsg_unix_prog_load, recvmsg_unix_prog_destroy, + &user_ops, AF_UNIX, SOCK_DGRAM, SERVUN_REWRITE_ADDRESS, @@ -310,6 +377,7 @@ static struct sock_addr_test tests[] = { "recvmsg_unix: recvfrom (stream)", recvmsg_unix_prog_load, recvmsg_unix_prog_destroy, + &user_ops, AF_UNIX, SOCK_STREAM, SERVUN_REWRITE_ADDRESS, @@ -325,6 +393,7 @@ static struct sock_addr_test tests[] = { "getsockname_unix", getsockname_unix_prog_load, getsockname_unix_prog_destroy, + &user_ops, AF_UNIX, SOCK_STREAM, SERVUN_ADDRESS, @@ -340,6 +409,7 @@ static struct sock_addr_test tests[] = { "getpeername_unix", getpeername_unix_prog_load, getpeername_unix_prog_destroy, + &user_ops, AF_UNIX, SOCK_STREAM, SERVUN_ADDRESS, @@ -400,26 +470,15 @@ static int cmp_sock_addr(info_fn fn, int sock1, return cmp_addr(&addr1, len1, addr2, addr2_len, cmp_port); } -static int cmp_local_addr(int sock1, const struct sockaddr_storage *addr2, - socklen_t addr2_len, bool cmp_port) -{ - return cmp_sock_addr(getsockname, sock1, addr2, addr2_len, cmp_port); -} - -static int cmp_peer_addr(int sock1, const struct sockaddr_storage *addr2, - socklen_t addr2_len, bool cmp_port) -{ - return cmp_sock_addr(getpeername, sock1, addr2, addr2_len, cmp_port); -} - static void test_bind(struct sock_addr_test *test) { struct sockaddr_storage expected_addr; socklen_t expected_addr_len = sizeof(struct sockaddr_storage); int serv = -1, client = -1, err; - serv = start_server(test->socket_family, test->socket_type, - test->requested_addr, test->requested_port, 0); + serv = test->ops->start_server(test->socket_family, test->socket_type, + test->requested_addr, + test->requested_port, 0); if (!ASSERT_GE(serv, 0, "start_server")) goto cleanup; @@ -429,7 +488,8 @@ static void test_bind(struct sock_addr_test *test) if (!ASSERT_EQ(err, 0, "make_sockaddr")) goto cleanup; - err = cmp_local_addr(serv, &expected_addr, expected_addr_len, true); + err = cmp_sock_addr(test->ops->getsockname, serv, &expected_addr, + expected_addr_len, true); if (!ASSERT_EQ(err, 0, "cmp_local_addr")) goto cleanup; @@ -442,7 +502,7 @@ cleanup: if (client != -1) close(client); if (serv != -1) - close(serv); + test->ops->close(serv); } static void test_connect(struct sock_addr_test *test) @@ -463,7 +523,8 @@ static void test_connect(struct sock_addr_test *test) if (!ASSERT_EQ(err, 0, "make_sockaddr")) goto cleanup; - client = connect_to_addr(test->socket_type, &addr, addr_len, NULL); + client = test->ops->connect_to_addr(test->socket_type, &addr, addr_len, + NULL); if (!ASSERT_GE(client, 0, "connect_to_addr")) goto cleanup; @@ -479,18 +540,21 @@ static void test_connect(struct sock_addr_test *test) goto cleanup; } - err = cmp_peer_addr(client, &expected_addr, expected_addr_len, true); + err = cmp_sock_addr(test->ops->getpeername, client, &expected_addr, + expected_addr_len, true); if (!ASSERT_EQ(err, 0, "cmp_peer_addr")) goto cleanup; if (test->expected_src_addr) { - err = cmp_local_addr(client, &expected_src_addr, expected_src_addr_len, false); + err = cmp_sock_addr(test->ops->getsockname, client, + &expected_src_addr, expected_src_addr_len, + false); if (!ASSERT_EQ(err, 0, "cmp_local_addr")) goto cleanup; } cleanup: if (client != -1) - close(client); + test->ops->close(client); if (serv != -1) close(serv); } @@ -500,8 +564,6 @@ static void test_xmsg(struct sock_addr_test *test) struct sockaddr_storage addr, src_addr; socklen_t addr_len = sizeof(struct sockaddr_storage), src_addr_len = sizeof(struct sockaddr_storage); - struct msghdr hdr; - struct iovec iov; char data = 'a'; int serv = -1, client = -1, err; @@ -514,7 +576,7 @@ static void test_xmsg(struct sock_addr_test *test) if (!ASSERT_GE(serv, 0, "start_server")) goto cleanup; - client = socket(test->socket_family, test->socket_type, 0); + client = test->ops->socket(test->socket_family, test->socket_type, 0); if (!ASSERT_GE(client, 0, "socket")) goto cleanup; @@ -524,7 +586,8 @@ static void test_xmsg(struct sock_addr_test *test) if (!ASSERT_EQ(err, 0, "make_sockaddr")) goto cleanup; - err = bind(client, (const struct sockaddr *) &src_addr, src_addr_len); + err = test->ops->bind(client, (struct sockaddr *)&src_addr, + src_addr_len); if (!ASSERT_OK(err, "bind")) goto cleanup; } @@ -535,17 +598,8 @@ static void test_xmsg(struct sock_addr_test *test) goto cleanup; if (test->socket_type == SOCK_DGRAM) { - memset(&iov, 0, sizeof(iov)); - iov.iov_base = &data; - iov.iov_len = sizeof(data); - - memset(&hdr, 0, sizeof(hdr)); - hdr.msg_name = (void *)&addr; - hdr.msg_namelen = addr_len; - hdr.msg_iov = &iov; - hdr.msg_iovlen = 1; - - err = sendmsg(client, &hdr, 0); + err = test->ops->sendmsg(client, (struct sockaddr *)&addr, + addr_len, &data, sizeof(data)); if (!ASSERT_EQ(err, sizeof(data), "sendmsg")) goto cleanup; } else { @@ -596,7 +650,7 @@ static void test_xmsg(struct sock_addr_test *test) cleanup: if (client != -1) - close(client); + test->ops->close(client); if (serv != -1) close(serv); } @@ -607,7 +661,7 @@ static void test_getsockname(struct sock_addr_test *test) socklen_t expected_addr_len = sizeof(struct sockaddr_storage); int serv = -1, err; - serv = start_server(test->socket_family, test->socket_type, + serv = test->ops->start_server(test->socket_family, test->socket_type, test->requested_addr, test->requested_port, 0); if (!ASSERT_GE(serv, 0, "start_server")) goto cleanup; @@ -618,13 +672,13 @@ static void test_getsockname(struct sock_addr_test *test) if (!ASSERT_EQ(err, 0, "make_sockaddr")) goto cleanup; - err = cmp_local_addr(serv, &expected_addr, expected_addr_len, true); + err = cmp_sock_addr(test->ops->getsockname, serv, &expected_addr, expected_addr_len, true); if (!ASSERT_EQ(err, 0, "cmp_local_addr")) goto cleanup; cleanup: if (serv != -1) - close(serv); + test->ops->close(serv); } static void test_getpeername(struct sock_addr_test *test) @@ -644,7 +698,8 @@ static void test_getpeername(struct sock_addr_test *test) if (!ASSERT_EQ(err, 0, "make_sockaddr")) goto cleanup; - client = connect_to_addr(test->socket_type, &addr, addr_len, NULL); + client = test->ops->connect_to_addr(test->socket_type, &addr, addr_len, + NULL); if (!ASSERT_GE(client, 0, "connect_to_addr")) goto cleanup; @@ -653,13 +708,14 @@ static void test_getpeername(struct sock_addr_test *test) if (!ASSERT_EQ(err, 0, "make_sockaddr")) goto cleanup; - err = cmp_peer_addr(client, &expected_addr, expected_addr_len, true); + err = cmp_sock_addr(test->ops->getpeername, client, &expected_addr, + expected_addr_len, true); if (!ASSERT_EQ(err, 0, "cmp_peer_addr")) goto cleanup; cleanup: if (client != -1) - close(client); + test->ops->close(client); if (serv != -1) close(serv); } From e0c8a7e7526ff1526d4dcc7d71aad4e7fe2ec767 Mon Sep 17 00:00:00 2001 From: Jordan Rife Date: Mon, 29 Apr 2024 16:45:23 -0500 Subject: [PATCH 036/119] selftests/bpf: Add kernel socket operation tests This patch creates two sets of sock_ops that call out to the SYSCALL hooks in the sock_addr_kern BPF program and uses them to construct test cases for the range of supported operations (kernel_connect(), kernel_bind(), kernel_sendms(), sock_sendmsg(), kernel_getsockname(), kenel_getpeername()). This ensures that these interact with BPF sockaddr hooks as intended. Beyond this it also ensures that these operations do not modify their address parameter, providing regression coverage for the issues addressed by this set of patches: - commit 0bdf399342c5("net: Avoid address overwrite in kernel_connect") - commit 86a7e0b69bd5("net: prevent rewrite of msg_name in sock_sendmsg()") - commit c889a99a21bf("net: prevent address rewrite in kernel_bind()") - commit 01b2885d9415("net: Save and restore msg_namelen in sock_sendmsg") Signed-off-by: Jordan Rife Link: https://lore.kernel.org/r/20240429214529.2644801-7-jrife@google.com Signed-off-by: Martin KaFai Lau --- .../selftests/bpf/prog_tests/sock_addr.c | 472 ++++++++++++++++++ 1 file changed, 472 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/sock_addr.c b/tools/testing/selftests/bpf/prog_tests/sock_addr.c index 8f678fa413f5..9c709c33f889 100644 --- a/tools/testing/selftests/bpf/prog_tests/sock_addr.c +++ b/tools/testing/selftests/bpf/prog_tests/sock_addr.c @@ -3,6 +3,7 @@ #include "test_progs.h" +#include "sock_addr_kern.skel.h" #include "bind4_prog.skel.h" #include "bind6_prog.skel.h" #include "connect_unix_prog.skel.h" @@ -54,6 +55,216 @@ enum sock_addr_test_type { typedef void *(*load_fn)(int cgroup_fd); typedef void (*destroy_fn)(void *skel); +static int cmp_addr(const struct sockaddr_storage *addr1, socklen_t addr1_len, + const struct sockaddr_storage *addr2, socklen_t addr2_len, + bool cmp_port); + +struct init_sock_args { + int af; + int type; +}; + +struct addr_args { + char addr[sizeof(struct sockaddr_storage)]; + int addrlen; +}; + +struct sendmsg_args { + struct addr_args addr; + char msg[10]; + int msglen; +}; + +static struct sock_addr_kern *skel; + +static int run_bpf_prog(const char *prog_name, void *ctx, int ctx_size) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + struct bpf_program *prog; + int prog_fd, err; + + topts.ctx_in = ctx; + topts.ctx_size_in = ctx_size; + + prog = bpf_object__find_program_by_name(skel->obj, prog_name); + if (!ASSERT_OK_PTR(prog, "bpf_object__find_program_by_name")) + goto err; + + prog_fd = bpf_program__fd(prog); + err = bpf_prog_test_run_opts(prog_fd, &topts); + if (!ASSERT_OK(err, prog_name)) + goto err; + + err = topts.retval; + goto out; +err: + err = -1; +out: + return err; +} + +static int kernel_init_sock(int af, int type, int protocol) +{ + struct init_sock_args args = { + .af = af, + .type = type, + }; + + return run_bpf_prog("init_sock", &args, sizeof(args)); +} + +static int kernel_close_sock(int fd) +{ + return run_bpf_prog("close_sock", NULL, 0); +} + +static int sock_addr_op(const char *name, struct sockaddr *addr, + socklen_t *addrlen, bool expect_change) +{ + struct addr_args args; + int err; + + if (addrlen) + args.addrlen = *addrlen; + + if (addr) + memcpy(&args.addr, addr, *addrlen); + + err = run_bpf_prog(name, &args, sizeof(args)); + + if (!expect_change && addr) + if (!ASSERT_EQ(cmp_addr((struct sockaddr_storage *)addr, + *addrlen, + (struct sockaddr_storage *)&args.addr, + args.addrlen, 1), + 0, "address_param_modified")) + return -1; + + if (addrlen) + *addrlen = args.addrlen; + + if (addr) + memcpy(addr, &args.addr, *addrlen); + + return err; +} + +static int send_msg_op(const char *name, struct sockaddr *addr, + socklen_t addrlen, const char *msg, int msglen) +{ + struct sendmsg_args args; + int err; + + memset(&args, 0, sizeof(args)); + memcpy(&args.addr.addr, addr, addrlen); + args.addr.addrlen = addrlen; + memcpy(args.msg, msg, msglen); + args.msglen = msglen; + + err = run_bpf_prog(name, &args, sizeof(args)); + + if (!ASSERT_EQ(cmp_addr((struct sockaddr_storage *)addr, + addrlen, + (struct sockaddr_storage *)&args.addr.addr, + args.addr.addrlen, 1), + 0, "address_param_modified")) + return -1; + + return err; +} + +static int kernel_connect(struct sockaddr *addr, socklen_t addrlen) +{ + return sock_addr_op("kernel_connect", addr, &addrlen, false); +} + +static int kernel_bind(int fd, struct sockaddr *addr, socklen_t addrlen) +{ + return sock_addr_op("kernel_bind", addr, &addrlen, false); +} + +static int kernel_listen(void) +{ + return sock_addr_op("kernel_listen", NULL, NULL, false); +} + +static int kernel_sendmsg(int fd, struct sockaddr *addr, socklen_t addrlen, + char *msg, int msglen) +{ + return send_msg_op("kernel_sendmsg", addr, addrlen, msg, msglen); +} + +static int sock_sendmsg(int fd, struct sockaddr *addr, socklen_t addrlen, + char *msg, int msglen) +{ + return send_msg_op("sock_sendmsg", addr, addrlen, msg, msglen); +} + +static int kernel_getsockname(int fd, struct sockaddr *addr, socklen_t *addrlen) +{ + return sock_addr_op("kernel_getsockname", addr, addrlen, true); +} + +static int kernel_getpeername(int fd, struct sockaddr *addr, socklen_t *addrlen) +{ + return sock_addr_op("kernel_getpeername", addr, addrlen, true); +} + +int kernel_connect_to_addr(int type, const struct sockaddr_storage *addr, socklen_t addrlen, + const struct network_helper_opts *opts) +{ + int err; + + if (!ASSERT_OK(kernel_init_sock(addr->ss_family, type, 0), + "kernel_init_sock")) + goto err; + + if (!ASSERT_OK(kernel_connect((struct sockaddr *)addr, addrlen), + "kernel_connect")) + goto err; + + /* Test code expects a "file descriptor" on success. */ + err = 1; + goto out; +err: + err = -1; + ASSERT_OK(kernel_close_sock(0), "kernel_close_sock"); +out: + return err; +} + +int kernel_start_server(int family, int type, const char *addr_str, __u16 port, + int timeout_ms) +{ + struct sockaddr_storage addr; + socklen_t addrlen; + int err; + + if (!ASSERT_OK(kernel_init_sock(family, type, 0), "kernel_init_sock")) + goto err; + + if (make_sockaddr(family, addr_str, port, &addr, &addrlen)) + goto err; + + if (!ASSERT_OK(kernel_bind(0, (struct sockaddr *)&addr, addrlen), + "kernel_bind")) + goto err; + + if (type == SOCK_STREAM) { + if (!ASSERT_OK(kernel_listen(), "kernel_listen")) + goto err; + } + + /* Test code expects a "file descriptor" on success. */ + err = 1; + goto out; +err: + err = -1; + ASSERT_OK(kernel_close_sock(0), "kernel_close_sock"); +out: + return err; +} + struct sock_ops { int (*connect_to_addr)(int type, const struct sockaddr_storage *addr, socklen_t addrlen, @@ -104,6 +315,28 @@ struct sock_ops user_ops = { .close = close, }; +struct sock_ops kern_ops_sock_sendmsg = { + .connect_to_addr = kernel_connect_to_addr, + .start_server = kernel_start_server, + .socket = kernel_init_sock, + .bind = kernel_bind, + .getsockname = kernel_getsockname, + .getpeername = kernel_getpeername, + .sendmsg = sock_sendmsg, + .close = kernel_close_sock, +}; + +struct sock_ops kern_ops_kernel_sendmsg = { + .connect_to_addr = kernel_connect_to_addr, + .start_server = kernel_start_server, + .socket = kernel_init_sock, + .bind = kernel_bind, + .getsockname = kernel_getsockname, + .getpeername = kernel_getpeername, + .sendmsg = kernel_sendmsg, + .close = kernel_close_sock, +}; + struct sock_addr_test { enum sock_addr_test_type type; const char *name; @@ -213,6 +446,60 @@ static struct sock_addr_test tests[] = { SERV6_REWRITE_PORT, }, + /* bind - kernel calls */ + { + SOCK_ADDR_TEST_BIND, + "bind4: kernel_bind (stream)", + bind4_prog_load, + bind4_prog_destroy, + &kern_ops_sock_sendmsg, + AF_INET, + SOCK_STREAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + }, + { + SOCK_ADDR_TEST_BIND, + "bind4: kernel_bind (dgram)", + bind4_prog_load, + bind4_prog_destroy, + &kern_ops_sock_sendmsg, + AF_INET, + SOCK_DGRAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + }, + { + SOCK_ADDR_TEST_BIND, + "bind6: kernel_bind (stream)", + bind6_prog_load, + bind6_prog_destroy, + &kern_ops_sock_sendmsg, + AF_INET6, + SOCK_STREAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + }, + { + SOCK_ADDR_TEST_BIND, + "bind6: kernel_bind (dgram)", + bind6_prog_load, + bind6_prog_destroy, + &kern_ops_sock_sendmsg, + AF_INET6, + SOCK_DGRAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + }, + /* connect - system calls */ { SOCK_ADDR_TEST_CONNECT, @@ -285,6 +572,78 @@ static struct sock_addr_test tests[] = { NULL, }, + /* connect - kernel calls */ + { + SOCK_ADDR_TEST_CONNECT, + "connect4: kernel_connect (stream)", + connect4_prog_load, + connect4_prog_destroy, + &kern_ops_sock_sendmsg, + AF_INET, + SOCK_STREAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SRC4_REWRITE_IP, + }, + { + SOCK_ADDR_TEST_CONNECT, + "connect4: kernel_connect (dgram)", + connect4_prog_load, + connect4_prog_destroy, + &kern_ops_sock_sendmsg, + AF_INET, + SOCK_DGRAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SRC4_REWRITE_IP, + }, + { + SOCK_ADDR_TEST_CONNECT, + "connect6: kernel_connect (stream)", + connect6_prog_load, + connect6_prog_destroy, + &kern_ops_sock_sendmsg, + AF_INET6, + SOCK_STREAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SRC6_REWRITE_IP, + }, + { + SOCK_ADDR_TEST_CONNECT, + "connect6: kernel_connect (dgram)", + connect6_prog_load, + connect6_prog_destroy, + &kern_ops_sock_sendmsg, + AF_INET6, + SOCK_DGRAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SRC6_REWRITE_IP, + }, + { + SOCK_ADDR_TEST_CONNECT, + "connect_unix: kernel_connect (dgram)", + connect_unix_prog_load, + connect_unix_prog_destroy, + &kern_ops_sock_sendmsg, + AF_UNIX, + SOCK_STREAM, + SERVUN_ADDRESS, + 0, + SERVUN_REWRITE_ADDRESS, + 0, + NULL, + }, + /* sendmsg - system calls */ { SOCK_ADDR_TEST_SENDMSG, @@ -329,6 +688,94 @@ static struct sock_addr_test tests[] = { NULL, }, + /* sendmsg - kernel calls (sock_sendmsg) */ + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg4: sock_sendmsg (dgram)", + sendmsg4_prog_load, + sendmsg4_prog_destroy, + &kern_ops_sock_sendmsg, + AF_INET, + SOCK_DGRAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SRC4_REWRITE_IP, + }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg6: sock_sendmsg (dgram)", + sendmsg6_prog_load, + sendmsg6_prog_destroy, + &kern_ops_sock_sendmsg, + AF_INET6, + SOCK_DGRAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SRC6_REWRITE_IP, + }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg_unix: sock_sendmsg (dgram)", + sendmsg_unix_prog_load, + sendmsg_unix_prog_destroy, + &kern_ops_sock_sendmsg, + AF_UNIX, + SOCK_DGRAM, + SERVUN_ADDRESS, + 0, + SERVUN_REWRITE_ADDRESS, + 0, + NULL, + }, + + /* sendmsg - kernel calls (kernel_sendmsg) */ + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg4: kernel_sendmsg (dgram)", + sendmsg4_prog_load, + sendmsg4_prog_destroy, + &kern_ops_kernel_sendmsg, + AF_INET, + SOCK_DGRAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SRC4_REWRITE_IP, + }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg6: kernel_sendmsg (dgram)", + sendmsg6_prog_load, + sendmsg6_prog_destroy, + &kern_ops_kernel_sendmsg, + AF_INET6, + SOCK_DGRAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SRC6_REWRITE_IP, + }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg_unix: sock_sendmsg (dgram)", + sendmsg_unix_prog_load, + sendmsg_unix_prog_destroy, + &kern_ops_kernel_sendmsg, + AF_UNIX, + SOCK_DGRAM, + SERVUN_ADDRESS, + 0, + SERVUN_REWRITE_ADDRESS, + 0, + NULL, + }, + /* recvmsg - system calls */ { SOCK_ADDR_TEST_RECVMSG, @@ -470,6 +917,27 @@ static int cmp_sock_addr(info_fn fn, int sock1, return cmp_addr(&addr1, len1, addr2, addr2_len, cmp_port); } +static int load_sock_addr_kern(void) +{ + int err; + + skel = sock_addr_kern__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel")) + goto err; + + err = 0; + goto out; +err: + err = -1; +out: + return err; +} + +static void unload_sock_addr_kern(void) +{ + sock_addr_kern__destroy(skel); +} + static void test_bind(struct sock_addr_test *test) { struct sockaddr_storage expected_addr; @@ -768,6 +1236,9 @@ void test_sock_addr(void) if (!ASSERT_OK(setup_test_env(&tok), "setup_test_env")) goto cleanup; + if (!ASSERT_OK(load_sock_addr_kern(), "load_sock_addr_kern")) + goto cleanup; + for (size_t i = 0; i < ARRAY_SIZE(tests); ++i) { struct sock_addr_test *test = &tests[i]; @@ -808,6 +1279,7 @@ void test_sock_addr(void) } cleanup: + unload_sock_addr_kern(); cleanup_test_env(tok); if (cgroup_fd >= 0) close(cgroup_fd); From 57bfc7605ca5b102ba336779ae9adbc5bbba1d96 Mon Sep 17 00:00:00 2001 From: Miao Xu Date: Wed, 1 May 2024 21:23:16 -0700 Subject: [PATCH 037/119] tcp: Add new args for cong_control in tcp_congestion_ops This patch adds two new arguments for cong_control of struct tcp_congestion_ops: - ack - flag These two arguments are inherited from the caller tcp_cong_control in tcp_intput.c. One use case of them is to update cwnd and pacing rate inside cong_control based on the info they provide. For example, the flag can be used to decide if it is the right time to raise or reduce a sender's cwnd. Reviewed-by: Eric Dumazet Signed-off-by: Miao Xu Link: https://lore.kernel.org/r/20240502042318.801932-2-miaxu@meta.com Signed-off-by: Martin KaFai Lau --- include/net/tcp.h | 2 +- net/ipv4/bpf_tcp_ca.c | 3 ++- net/ipv4/tcp_bbr.c | 2 +- net/ipv4/tcp_input.c | 2 +- tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c | 6 +++--- 5 files changed, 8 insertions(+), 7 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index fe98fb01879b..7294da8fb780 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1172,7 +1172,7 @@ struct tcp_congestion_ops { /* call when packets are delivered to update cwnd and pacing rate, * after all the ca_state processing. (optional) */ - void (*cong_control)(struct sock *sk, const struct rate_sample *rs); + void (*cong_control)(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs); /* new value of cwnd after loss (required) */ diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index 7f518ea5f4ac..6bd7f8db189a 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -307,7 +307,8 @@ static u32 bpf_tcp_ca_min_tso_segs(struct sock *sk) return 0; } -static void bpf_tcp_ca_cong_control(struct sock *sk, const struct rate_sample *rs) +static void bpf_tcp_ca_cong_control(struct sock *sk, u32 ack, int flag, + const struct rate_sample *rs) { } diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 7e52ab24e40a..760941e55153 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -1024,7 +1024,7 @@ static void bbr_update_model(struct sock *sk, const struct rate_sample *rs) bbr_update_gains(sk); } -__bpf_kfunc static void bbr_main(struct sock *sk, const struct rate_sample *rs) +__bpf_kfunc static void bbr_main(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs) { struct bbr *bbr = inet_csk_ca(sk); u32 bw; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 53e1150f706f..23ccfc7b1d3c 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3541,7 +3541,7 @@ static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked, const struct inet_connection_sock *icsk = inet_csk(sk); if (icsk->icsk_ca_ops->cong_control) { - icsk->icsk_ca_ops->cong_control(sk, rs); + icsk->icsk_ca_ops->cong_control(sk, ack, flag, rs); return; } diff --git a/tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c b/tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c index fcfbfe0336b4..52b610357309 100644 --- a/tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c +++ b/tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c @@ -5,7 +5,7 @@ #include extern void bbr_init(struct sock *sk) __ksym; -extern void bbr_main(struct sock *sk, const struct rate_sample *rs) __ksym; +extern void bbr_main(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs) __ksym; extern u32 bbr_sndbuf_expand(struct sock *sk) __ksym; extern u32 bbr_undo_cwnd(struct sock *sk) __ksym; extern void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) __ksym; @@ -42,9 +42,9 @@ void BPF_PROG(in_ack_event, struct sock *sk, u32 flags) } SEC("struct_ops/cong_control") -void BPF_PROG(cong_control, struct sock *sk, const struct rate_sample *rs) +void BPF_PROG(cong_control, struct sock *sk, u32 ack, int flag, const struct rate_sample *rs) { - bbr_main(sk, rs); + bbr_main(sk, ack, flag, rs); } SEC("struct_ops/cong_avoid") From 0325cbd21e3c26eff88bc7da303ffb46b4f5d294 Mon Sep 17 00:00:00 2001 From: Miao Xu Date: Wed, 1 May 2024 21:23:17 -0700 Subject: [PATCH 038/119] bpf: tcp: Allow to write tp->snd_cwnd_stamp in bpf_tcp_ca This patch allows the write of tp->snd_cwnd_stamp in a bpf tcp ca program. An use case of writing this field is to keep track of the time whenever tp->snd_cwnd is raised or reduced inside the `cong_control` callback. Reviewed-by: Eric Dumazet Signed-off-by: Miao Xu Link: https://lore.kernel.org/r/20240502042318.801932-3-miaxu@meta.com Signed-off-by: Martin KaFai Lau --- net/ipv4/bpf_tcp_ca.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index 6bd7f8db189a..18227757ec0c 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -107,6 +107,9 @@ static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log, case offsetof(struct tcp_sock, snd_cwnd_cnt): end = offsetofend(struct tcp_sock, snd_cwnd_cnt); break; + case offsetof(struct tcp_sock, snd_cwnd_stamp): + end = offsetofend(struct tcp_sock, snd_cwnd_stamp); + break; case offsetof(struct tcp_sock, snd_ssthresh): end = offsetofend(struct tcp_sock, snd_ssthresh); break; From 96c3490d6423b7f24d356e24a61c24de69f3de77 Mon Sep 17 00:00:00 2001 From: Miao Xu Date: Wed, 1 May 2024 21:23:18 -0700 Subject: [PATCH 039/119] selftests/bpf: Add test for the use of new args in cong_control This patch adds a selftest to show the usage of the new arguments in cong_control. For simplicity's sake, the testing example reuses cubic's kernel functions. Signed-off-by: Miao Xu Link: https://lore.kernel.org/r/20240502042318.801932-4-miaxu@meta.com Signed-off-by: Martin KaFai Lau --- .../selftests/bpf/prog_tests/bpf_tcp_ca.c | 24 +++ .../selftests/bpf/progs/bpf_cc_cubic.c | 199 ++++++++++++++++++ .../selftests/bpf/progs/bpf_tracing_net.h | 10 + 3 files changed, 233 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/bpf_cc_cubic.c diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c index 907bac46c774..0aca02532794 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c @@ -14,6 +14,7 @@ #include "tcp_ca_incompl_cong_ops.skel.h" #include "tcp_ca_unsupp_cong_op.skel.h" #include "tcp_ca_kfunc.skel.h" +#include "bpf_cc_cubic.skel.h" #ifndef ENOTSUPP #define ENOTSUPP 524 @@ -452,6 +453,27 @@ static void test_tcp_ca_kfunc(void) tcp_ca_kfunc__destroy(skel); } +static void test_cc_cubic(void) +{ + struct bpf_cc_cubic *cc_cubic_skel; + struct bpf_link *link; + + cc_cubic_skel = bpf_cc_cubic__open_and_load(); + if (!ASSERT_OK_PTR(cc_cubic_skel, "bpf_cc_cubic__open_and_load")) + return; + + link = bpf_map__attach_struct_ops(cc_cubic_skel->maps.cc_cubic); + if (!ASSERT_OK_PTR(link, "bpf_map__attach_struct_ops")) { + bpf_cc_cubic__destroy(cc_cubic_skel); + return; + } + + do_test("bpf_cc_cubic", NULL); + + bpf_link__destroy(link); + bpf_cc_cubic__destroy(cc_cubic_skel); +} + void test_bpf_tcp_ca(void) { if (test__start_subtest("dctcp")) @@ -482,4 +504,6 @@ void test_bpf_tcp_ca(void) test_link_replace(); if (test__start_subtest("tcp_ca_kfunc")) test_tcp_ca_kfunc(); + if (test__start_subtest("cc_cubic")) + test_cc_cubic(); } diff --git a/tools/testing/selftests/bpf/progs/bpf_cc_cubic.c b/tools/testing/selftests/bpf/progs/bpf_cc_cubic.c new file mode 100644 index 000000000000..5c7697c70e2a --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_cc_cubic.c @@ -0,0 +1,199 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* Highlights: + * 1. The major difference between this bpf program and tcp_cubic.c + * is that this bpf program relies on `cong_control` rather than + * `cong_avoid` in the struct tcp_congestion_ops. + * 2. Logic such as tcp_cwnd_reduction, tcp_cong_avoid, and + * tcp_update_pacing_rate is bypassed when `cong_control` is + * defined, so moving these logic to `cong_control`. + * 3. WARNING: This bpf program is NOT the same as tcp_cubic.c. + * The main purpose is to show use cases of the arguments in + * `cong_control`. For simplicity's sake, it reuses tcp cubic's + * kernel functions. + */ + +#include "vmlinux.h" + +#include +#include +#include "bpf_tracing_net.h" + +#define BPF_STRUCT_OPS(name, args...) \ +SEC("struct_ops/"#name) \ +BPF_PROG(name, args) + +#define USEC_PER_SEC 1000000UL +#define TCP_PACING_SS_RATIO (200) +#define TCP_PACING_CA_RATIO (120) +#define TCP_REORDERING (12) + +#define min(a, b) ((a) < (b) ? (a) : (b)) +#define max(a, b) ((a) > (b) ? (a) : (b)) +#define after(seq2, seq1) before(seq1, seq2) + +extern void cubictcp_init(struct sock *sk) __ksym; +extern void cubictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event) __ksym; +extern __u32 cubictcp_recalc_ssthresh(struct sock *sk) __ksym; +extern void cubictcp_state(struct sock *sk, __u8 new_state) __ksym; +extern __u32 tcp_reno_undo_cwnd(struct sock *sk) __ksym; +extern void cubictcp_acked(struct sock *sk, const struct ack_sample *sample) __ksym; +extern void cubictcp_cong_avoid(struct sock *sk, __u32 ack, __u32 acked) __ksym; + +static struct inet_connection_sock *inet_csk(const struct sock *sk) +{ + return (struct inet_connection_sock *)sk; +} + +static struct tcp_sock *tcp_sk(const struct sock *sk) +{ + return (struct tcp_sock *)sk; +} + +static bool before(__u32 seq1, __u32 seq2) +{ + return (__s32)(seq1-seq2) < 0; +} + +static __u64 div64_u64(__u64 dividend, __u64 divisor) +{ + return dividend / divisor; +} + +static void tcp_update_pacing_rate(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + __u64 rate; + + /* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */ + rate = (__u64)tp->mss_cache * ((USEC_PER_SEC / 100) << 3); + + /* current rate is (cwnd * mss) / srtt + * In Slow Start [1], set sk_pacing_rate to 200 % the current rate. + * In Congestion Avoidance phase, set it to 120 % the current rate. + * + * [1] : Normal Slow Start condition is (tp->snd_cwnd < tp->snd_ssthresh) + * If snd_cwnd >= (tp->snd_ssthresh / 2), we are approaching + * end of slow start and should slow down. + */ + if (tp->snd_cwnd < tp->snd_ssthresh / 2) + rate *= TCP_PACING_SS_RATIO; + else + rate *= TCP_PACING_CA_RATIO; + + rate *= max(tp->snd_cwnd, tp->packets_out); + + if (tp->srtt_us) + rate = div64_u64(rate, (__u64)tp->srtt_us); + + sk->sk_pacing_rate = min(rate, sk->sk_max_pacing_rate); +} + +static void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, + int newly_lost, int flag) +{ + struct tcp_sock *tp = tcp_sk(sk); + int sndcnt = 0; + __u32 pkts_in_flight = tp->packets_out - (tp->sacked_out + tp->lost_out) + tp->retrans_out; + int delta = tp->snd_ssthresh - pkts_in_flight; + + if (newly_acked_sacked <= 0 || !tp->prior_cwnd) + return; + + __u32 prr_delivered = tp->prr_delivered + newly_acked_sacked; + + if (delta < 0) { + __u64 dividend = + (__u64)tp->snd_ssthresh * prr_delivered + tp->prior_cwnd - 1; + sndcnt = (__u32)div64_u64(dividend, (__u64)tp->prior_cwnd) - tp->prr_out; + } else { + sndcnt = max(prr_delivered - tp->prr_out, newly_acked_sacked); + if (flag & FLAG_SND_UNA_ADVANCED && !newly_lost) + sndcnt++; + sndcnt = min(delta, sndcnt); + } + /* Force a fast retransmit upon entering fast recovery */ + sndcnt = max(sndcnt, (tp->prr_out ? 0 : 1)); + tp->snd_cwnd = pkts_in_flight + sndcnt; +} + +/* Decide wheather to run the increase function of congestion control. */ +static bool tcp_may_raise_cwnd(const struct sock *sk, const int flag) +{ + if (tcp_sk(sk)->reordering > TCP_REORDERING) + return flag & FLAG_FORWARD_PROGRESS; + + return flag & FLAG_DATA_ACKED; +} + +void BPF_STRUCT_OPS(bpf_cubic_init, struct sock *sk) +{ + cubictcp_init(sk); +} + +void BPF_STRUCT_OPS(bpf_cubic_cwnd_event, struct sock *sk, enum tcp_ca_event event) +{ + cubictcp_cwnd_event(sk, event); +} + +void BPF_STRUCT_OPS(bpf_cubic_cong_control, struct sock *sk, __u32 ack, int flag, + const struct rate_sample *rs) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (((1<icsk_ca_state)) { + /* Reduce cwnd if state mandates */ + tcp_cwnd_reduction(sk, rs->acked_sacked, rs->losses, flag); + + if (!before(tp->snd_una, tp->high_seq)) { + /* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */ + if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH && + inet_csk(sk)->icsk_ca_state == TCP_CA_CWR) { + tp->snd_cwnd = tp->snd_ssthresh; + tp->snd_cwnd_stamp = tcp_jiffies32; + } + } + } else if (tcp_may_raise_cwnd(sk, flag)) { + /* Advance cwnd if state allows */ + cubictcp_cong_avoid(sk, ack, rs->acked_sacked); + tp->snd_cwnd_stamp = tcp_jiffies32; + } + + tcp_update_pacing_rate(sk); +} + +__u32 BPF_STRUCT_OPS(bpf_cubic_recalc_ssthresh, struct sock *sk) +{ + return cubictcp_recalc_ssthresh(sk); +} + +void BPF_STRUCT_OPS(bpf_cubic_state, struct sock *sk, __u8 new_state) +{ + cubictcp_state(sk, new_state); +} + +void BPF_STRUCT_OPS(bpf_cubic_acked, struct sock *sk, + const struct ack_sample *sample) +{ + cubictcp_acked(sk, sample); +} + +__u32 BPF_STRUCT_OPS(bpf_cubic_undo_cwnd, struct sock *sk) +{ + return tcp_reno_undo_cwnd(sk); +} + +SEC(".struct_ops") +struct tcp_congestion_ops cc_cubic = { + .init = (void *)bpf_cubic_init, + .ssthresh = (void *)bpf_cubic_recalc_ssthresh, + .cong_control = (void *)bpf_cubic_cong_control, + .set_state = (void *)bpf_cubic_state, + .undo_cwnd = (void *)bpf_cubic_undo_cwnd, + .cwnd_event = (void *)bpf_cubic_cwnd_event, + .pkts_acked = (void *)bpf_cubic_acked, + .name = "bpf_cc_cubic", +}; + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h index 7001965d1cc3..f9ec630dfcd5 100644 --- a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h +++ b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h @@ -80,6 +80,14 @@ #define TCP_INFINITE_SSTHRESH 0x7fffffff #define TCP_PINGPONG_THRESH 3 +#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */ +#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */ +#define FLAG_DATA_SACKED 0x20 /* New SACK. */ +#define FLAG_SND_UNA_ADVANCED \ + 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ +#define FLAG_ACKED (FLAG_DATA_ACKED | FLAG_SYN_ACKED) +#define FLAG_FORWARD_PROGRESS (FLAG_ACKED | FLAG_DATA_SACKED) + #define fib_nh_dev nh_common.nhc_dev #define fib_nh_gw_family nh_common.nhc_gw_family #define fib_nh_gw6 nh_common.nhc_gw.ipv6 @@ -119,4 +127,6 @@ #define tw_v6_daddr __tw_common.skc_v6_daddr #define tw_v6_rcv_saddr __tw_common.skc_v6_rcv_saddr +#define tcp_jiffies32 ((__u32)bpf_jiffies64()) + #endif From 00f0e08f23fc007f4a5a71cd7e37fcdb15af0c1b Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 30 Apr 2024 13:19:51 -0700 Subject: [PATCH 040/119] libbpf: fix potential overflow in ring__consume_n() ringbuf_process_ring() return int64_t, while ring__consume_n() assigns it to int. It's highly unlikely, but possible for ringbuf_process_ring() to return value larger than INT_MAX, so use int64_t. ring__consume_n() does check INT_MAX before returning int result to the user. Fixes: 4d22ea94ea33 ("libbpf: Add ring__consume_n / ring_buffer__consume_n") Signed-off-by: Andrii Nakryiko Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20240430201952.888293-1-andrii@kernel.org Signed-off-by: Martin KaFai Lau --- tools/lib/bpf/ringbuf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/lib/bpf/ringbuf.c b/tools/lib/bpf/ringbuf.c index 99e44cf02321..37c5a2d86a78 100644 --- a/tools/lib/bpf/ringbuf.c +++ b/tools/lib/bpf/ringbuf.c @@ -405,7 +405,7 @@ int ring__map_fd(const struct ring *r) int ring__consume_n(struct ring *r, size_t n) { - int res; + int64_t res; res = ringbuf_process_ring(r, n); if (res < 0) From 087d757fb4736ecbd3e42eebf9b39d5225d4a2ee Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 30 Apr 2024 13:19:52 -0700 Subject: [PATCH 041/119] libbpf: fix ring_buffer__consume_n() return result logic Add INT_MAX check to ring_buffer__consume_n(). We do the similar check to handle int return result of all these ring buffer APIs in other APIs and ring_buffer__consume_n() is missing one. This patch fixes this omission. Signed-off-by: Andrii Nakryiko Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20240430201952.888293-2-andrii@kernel.org Signed-off-by: Martin KaFai Lau --- tools/lib/bpf/ringbuf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/lib/bpf/ringbuf.c b/tools/lib/bpf/ringbuf.c index 37c5a2d86a78..bfd8dac4c0cc 100644 --- a/tools/lib/bpf/ringbuf.c +++ b/tools/lib/bpf/ringbuf.c @@ -301,7 +301,7 @@ int ring_buffer__consume_n(struct ring_buffer *rb, size_t n) if (n == 0) break; } - return res; + return res > INT_MAX ? INT_MAX : res; } /* Consume available ring buffer(s) data without event polling. From cf9bea94f6b2934d409511c05337010b137316a3 Mon Sep 17 00:00:00 2001 From: "Jose E. Marchesi" Date: Sun, 28 Apr 2024 13:25:59 +0200 Subject: [PATCH 042/119] libbpf: Fix bpf_ksym_exists() in GCC The macro bpf_ksym_exists is defined in bpf_helpers.h as: #define bpf_ksym_exists(sym) ({ \ _Static_assert(!__builtin_constant_p(!!sym), #sym " should be marked as __weak"); \ !!sym; \ }) The purpose of the macro is to determine whether a given symbol has been defined, given the address of the object associated with the symbol. It also has a compile-time check to make sure the object whose address is passed to the macro has been declared as weak, which makes the check on `sym' meaningful. As it happens, the check for weak doesn't work in GCC in all cases, because __builtin_constant_p not always folds at parse time when optimizing. This is because optimizations that happen later in the compilation process, like inlining, may make a previously non-constant expression a constant. This results in errors like the following when building the selftests with GCC: bpf_helpers.h:190:24: error: expression in static assertion is not constant 190 | _Static_assert(!__builtin_constant_p(!!sym), #sym " should be marked as __weak"); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~ Fortunately recent versions of GCC support a __builtin_has_attribute that can be used to directly check for the __weak__ attribute. This patch changes bpf_helpers.h to use that builtin when building with a recent enough GCC, and to omit the check if GCC is too old to support the builtin. The macro used for GCC becomes: #define bpf_ksym_exists(sym) ({ \ _Static_assert(__builtin_has_attribute (*sym, __weak__), #sym " should be marked as __weak"); \ !!sym; \ }) Note that since bpf_ksym_exists is designed to get the address of the object associated with symbol SYM, we pass *sym to __builtin_has_attribute instead of sym. When an expression is passed to __builtin_has_attribute then it is the type of the passed expression that is checked for the specified attribute. The expression itself is not evaluated. This accommodates well with the existing usages of the macro: - For function objects: struct task_struct *bpf_task_acquire(struct task_struct *p) __ksym __weak; [...] bpf_ksym_exists(bpf_task_acquire) - For variable objects: extern const struct rq runqueues __ksym __weak; /* typed */ [...] bpf_ksym_exists(&runqueues) Note also that BPF support was added in GCC 10 and support for __builtin_has_attribute in GCC 9. Locally tested in bpf-next master branch. No regressions. Signed-of-by: Jose E. Marchesi Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20240428112559.10518-1-jose.marchesi@oracle.com --- tools/lib/bpf/bpf_helpers.h | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h index 62e1c0cc4a59..305c62817dd3 100644 --- a/tools/lib/bpf/bpf_helpers.h +++ b/tools/lib/bpf/bpf_helpers.h @@ -186,10 +186,21 @@ enum libbpf_tristate { #define __kptr __attribute__((btf_type_tag("kptr"))) #define __percpu_kptr __attribute__((btf_type_tag("percpu_kptr"))) -#define bpf_ksym_exists(sym) ({ \ - _Static_assert(!__builtin_constant_p(!!sym), #sym " should be marked as __weak"); \ - !!sym; \ +#if defined (__clang__) +#define bpf_ksym_exists(sym) ({ \ + _Static_assert(!__builtin_constant_p(!!sym), \ + #sym " should be marked as __weak"); \ + !!sym; \ }) +#elif __GNUC__ > 8 +#define bpf_ksym_exists(sym) ({ \ + _Static_assert(__builtin_has_attribute (*sym, __weak__), \ + #sym " should be marked as __weak"); \ + !!sym; \ +}) +#else +#define bpf_ksym_exists(sym) !!sym +#endif #define __arg_ctx __attribute__((btf_decl_tag("arg:ctx"))) #define __arg_nonnull __attribute((btf_decl_tag("arg:nonnull"))) From a9e7715ce8b3a62a2133e47e87107632a26ad1e2 Mon Sep 17 00:00:00 2001 From: "Jose E. Marchesi" Date: Thu, 2 May 2024 19:09:25 +0200 Subject: [PATCH 043/119] libbpf: Avoid casts from pointers to enums in bpf_tracing.h [Differences from V1: - Do not introduce a global typedef, as this is a public header. - Keep the void* casts in BPF_KPROBE_READ_RET_IP and BPF_KRETPROBE_READ_RET_IP, as these are necessary for converting to a const void* argument of bpf_probe_read_kernel.] The BPF_PROG, BPF_KPROBE and BPF_KSYSCALL macros defined in tools/lib/bpf/bpf_tracing.h use a clever hack in order to provide a convenient way to define entry points for BPF programs as if they were normal C functions that get typed actual arguments, instead of as elements in a single "context" array argument. For example, PPF_PROGS allows writing: SEC("struct_ops/cwnd_event") void BPF_PROG(cwnd_event, struct sock *sk, enum tcp_ca_event event) { bbr_cwnd_event(sk, event); dctcp_cwnd_event(sk, event); cubictcp_cwnd_event(sk, event); } That expands into a pair of functions: void ____cwnd_event (unsigned long long *ctx, struct sock *sk, enum tcp_ca_event event) { bbr_cwnd_event(sk, event); dctcp_cwnd_event(sk, event); cubictcp_cwnd_event(sk, event); } void cwnd_event (unsigned long long *ctx) { _Pragma("GCC diagnostic push") _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") return ____cwnd_event(ctx, (void*)ctx[0], (void*)ctx[1]); _Pragma("GCC diagnostic pop") } Note how the 64-bit unsigned integers in the incoming CTX get casted to a void pointer, and then implicitly converted to whatever type of the actual argument in the wrapped function. In this case: Arg1: unsigned long long -> void * -> struct sock * Arg2: unsigned long long -> void * -> enum tcp_ca_event The behavior of GCC and clang when facing such conversions differ: pointer -> pointer Allowed by the C standard. GCC: no warning nor error. clang: no warning nor error. pointer -> integer type [C standard says the result of this conversion is implementation defined, and it may lead to unaligned pointer etc.] GCC: error: integer from pointer without a cast [-Wint-conversion] clang: error: incompatible pointer to integer conversion [-Wint-conversion] pointer -> enumerated type GCC: error: incompatible types in assigment (*) clang: error: incompatible pointer to integer conversion [-Wint-conversion] These macros work because converting pointers to pointers is allowed, and converting pointers to integers also works provided a suitable integer type even if it is implementation defined, much like casting a pointer to uintptr_t is guaranteed to work by the C standard. The conversion errors emitted by both compilers by default are silenced by the pragmas. However, the GCC error marked with (*) above when assigning a pointer to an enumerated value is not associated with the -Wint-conversion warning, and it is not possible to turn it off. This is preventing building the BPF kernel selftests with GCC. This patch fixes this by avoiding intermediate casts to void*, replaced with casts to `unsigned long long', which is an integer type capable of safely store a BPF pointer, much like the standard uintptr_t. Testing performed in bpf-next master: - vmtest.sh -- ./test_verifier - vmtest.sh -- ./test_progs - make M=samples/bpf No regressions. Signed-off-by: Jose E. Marchesi Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240502170925.3194-1-jose.marchesi@oracle.com --- tools/lib/bpf/bpf_tracing.h | 70 ++++++++++++++++++------------------- tools/lib/bpf/usdt.bpf.h | 24 ++++++------- 2 files changed, 47 insertions(+), 47 deletions(-) diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h index 1c13f8e88833..9314fa95f04e 100644 --- a/tools/lib/bpf/bpf_tracing.h +++ b/tools/lib/bpf/bpf_tracing.h @@ -633,18 +633,18 @@ struct pt_regs; #endif #define ___bpf_ctx_cast0() ctx -#define ___bpf_ctx_cast1(x) ___bpf_ctx_cast0(), (void *)ctx[0] -#define ___bpf_ctx_cast2(x, args...) ___bpf_ctx_cast1(args), (void *)ctx[1] -#define ___bpf_ctx_cast3(x, args...) ___bpf_ctx_cast2(args), (void *)ctx[2] -#define ___bpf_ctx_cast4(x, args...) ___bpf_ctx_cast3(args), (void *)ctx[3] -#define ___bpf_ctx_cast5(x, args...) ___bpf_ctx_cast4(args), (void *)ctx[4] -#define ___bpf_ctx_cast6(x, args...) ___bpf_ctx_cast5(args), (void *)ctx[5] -#define ___bpf_ctx_cast7(x, args...) ___bpf_ctx_cast6(args), (void *)ctx[6] -#define ___bpf_ctx_cast8(x, args...) ___bpf_ctx_cast7(args), (void *)ctx[7] -#define ___bpf_ctx_cast9(x, args...) ___bpf_ctx_cast8(args), (void *)ctx[8] -#define ___bpf_ctx_cast10(x, args...) ___bpf_ctx_cast9(args), (void *)ctx[9] -#define ___bpf_ctx_cast11(x, args...) ___bpf_ctx_cast10(args), (void *)ctx[10] -#define ___bpf_ctx_cast12(x, args...) ___bpf_ctx_cast11(args), (void *)ctx[11] +#define ___bpf_ctx_cast1(x) ___bpf_ctx_cast0(), ctx[0] +#define ___bpf_ctx_cast2(x, args...) ___bpf_ctx_cast1(args), ctx[1] +#define ___bpf_ctx_cast3(x, args...) ___bpf_ctx_cast2(args), ctx[2] +#define ___bpf_ctx_cast4(x, args...) ___bpf_ctx_cast3(args), ctx[3] +#define ___bpf_ctx_cast5(x, args...) ___bpf_ctx_cast4(args), ctx[4] +#define ___bpf_ctx_cast6(x, args...) ___bpf_ctx_cast5(args), ctx[5] +#define ___bpf_ctx_cast7(x, args...) ___bpf_ctx_cast6(args), ctx[6] +#define ___bpf_ctx_cast8(x, args...) ___bpf_ctx_cast7(args), ctx[7] +#define ___bpf_ctx_cast9(x, args...) ___bpf_ctx_cast8(args), ctx[8] +#define ___bpf_ctx_cast10(x, args...) ___bpf_ctx_cast9(args), ctx[9] +#define ___bpf_ctx_cast11(x, args...) ___bpf_ctx_cast10(args), ctx[10] +#define ___bpf_ctx_cast12(x, args...) ___bpf_ctx_cast11(args), ctx[11] #define ___bpf_ctx_cast(args...) ___bpf_apply(___bpf_ctx_cast, ___bpf_narg(args))(args) /* @@ -786,14 +786,14 @@ ____##name(unsigned long long *ctx ___bpf_ctx_decl(args)) struct pt_regs; #define ___bpf_kprobe_args0() ctx -#define ___bpf_kprobe_args1(x) ___bpf_kprobe_args0(), (void *)PT_REGS_PARM1(ctx) -#define ___bpf_kprobe_args2(x, args...) ___bpf_kprobe_args1(args), (void *)PT_REGS_PARM2(ctx) -#define ___bpf_kprobe_args3(x, args...) ___bpf_kprobe_args2(args), (void *)PT_REGS_PARM3(ctx) -#define ___bpf_kprobe_args4(x, args...) ___bpf_kprobe_args3(args), (void *)PT_REGS_PARM4(ctx) -#define ___bpf_kprobe_args5(x, args...) ___bpf_kprobe_args4(args), (void *)PT_REGS_PARM5(ctx) -#define ___bpf_kprobe_args6(x, args...) ___bpf_kprobe_args5(args), (void *)PT_REGS_PARM6(ctx) -#define ___bpf_kprobe_args7(x, args...) ___bpf_kprobe_args6(args), (void *)PT_REGS_PARM7(ctx) -#define ___bpf_kprobe_args8(x, args...) ___bpf_kprobe_args7(args), (void *)PT_REGS_PARM8(ctx) +#define ___bpf_kprobe_args1(x) ___bpf_kprobe_args0(), (unsigned long long)PT_REGS_PARM1(ctx) +#define ___bpf_kprobe_args2(x, args...) ___bpf_kprobe_args1(args), (unsigned long long)PT_REGS_PARM2(ctx) +#define ___bpf_kprobe_args3(x, args...) ___bpf_kprobe_args2(args), (unsigned long long)PT_REGS_PARM3(ctx) +#define ___bpf_kprobe_args4(x, args...) ___bpf_kprobe_args3(args), (unsigned long long)PT_REGS_PARM4(ctx) +#define ___bpf_kprobe_args5(x, args...) ___bpf_kprobe_args4(args), (unsigned long long)PT_REGS_PARM5(ctx) +#define ___bpf_kprobe_args6(x, args...) ___bpf_kprobe_args5(args), (unsigned long long)PT_REGS_PARM6(ctx) +#define ___bpf_kprobe_args7(x, args...) ___bpf_kprobe_args6(args), (unsigned long long)PT_REGS_PARM7(ctx) +#define ___bpf_kprobe_args8(x, args...) ___bpf_kprobe_args7(args), (unsigned long long)PT_REGS_PARM8(ctx) #define ___bpf_kprobe_args(args...) ___bpf_apply(___bpf_kprobe_args, ___bpf_narg(args))(args) /* @@ -821,7 +821,7 @@ static __always_inline typeof(name(0)) \ ____##name(struct pt_regs *ctx, ##args) #define ___bpf_kretprobe_args0() ctx -#define ___bpf_kretprobe_args1(x) ___bpf_kretprobe_args0(), (void *)PT_REGS_RC(ctx) +#define ___bpf_kretprobe_args1(x) ___bpf_kretprobe_args0(), (unsigned long long)PT_REGS_RC(ctx) #define ___bpf_kretprobe_args(args...) ___bpf_apply(___bpf_kretprobe_args, ___bpf_narg(args))(args) /* @@ -845,24 +845,24 @@ static __always_inline typeof(name(0)) ____##name(struct pt_regs *ctx, ##args) /* If kernel has CONFIG_ARCH_HAS_SYSCALL_WRAPPER, read pt_regs directly */ #define ___bpf_syscall_args0() ctx -#define ___bpf_syscall_args1(x) ___bpf_syscall_args0(), (void *)PT_REGS_PARM1_SYSCALL(regs) -#define ___bpf_syscall_args2(x, args...) ___bpf_syscall_args1(args), (void *)PT_REGS_PARM2_SYSCALL(regs) -#define ___bpf_syscall_args3(x, args...) ___bpf_syscall_args2(args), (void *)PT_REGS_PARM3_SYSCALL(regs) -#define ___bpf_syscall_args4(x, args...) ___bpf_syscall_args3(args), (void *)PT_REGS_PARM4_SYSCALL(regs) -#define ___bpf_syscall_args5(x, args...) ___bpf_syscall_args4(args), (void *)PT_REGS_PARM5_SYSCALL(regs) -#define ___bpf_syscall_args6(x, args...) ___bpf_syscall_args5(args), (void *)PT_REGS_PARM6_SYSCALL(regs) -#define ___bpf_syscall_args7(x, args...) ___bpf_syscall_args6(args), (void *)PT_REGS_PARM7_SYSCALL(regs) +#define ___bpf_syscall_args1(x) ___bpf_syscall_args0(), (unsigned long long)PT_REGS_PARM1_SYSCALL(regs) +#define ___bpf_syscall_args2(x, args...) ___bpf_syscall_args1(args), (unsigned long long)PT_REGS_PARM2_SYSCALL(regs) +#define ___bpf_syscall_args3(x, args...) ___bpf_syscall_args2(args), (unsigned long long)PT_REGS_PARM3_SYSCALL(regs) +#define ___bpf_syscall_args4(x, args...) ___bpf_syscall_args3(args), (unsigned long long)PT_REGS_PARM4_SYSCALL(regs) +#define ___bpf_syscall_args5(x, args...) ___bpf_syscall_args4(args), (unsigned long long)PT_REGS_PARM5_SYSCALL(regs) +#define ___bpf_syscall_args6(x, args...) ___bpf_syscall_args5(args), (unsigned long long)PT_REGS_PARM6_SYSCALL(regs) +#define ___bpf_syscall_args7(x, args...) ___bpf_syscall_args6(args), (unsigned long long)PT_REGS_PARM7_SYSCALL(regs) #define ___bpf_syscall_args(args...) ___bpf_apply(___bpf_syscall_args, ___bpf_narg(args))(args) /* If kernel doesn't have CONFIG_ARCH_HAS_SYSCALL_WRAPPER, we have to BPF_CORE_READ from pt_regs */ #define ___bpf_syswrap_args0() ctx -#define ___bpf_syswrap_args1(x) ___bpf_syswrap_args0(), (void *)PT_REGS_PARM1_CORE_SYSCALL(regs) -#define ___bpf_syswrap_args2(x, args...) ___bpf_syswrap_args1(args), (void *)PT_REGS_PARM2_CORE_SYSCALL(regs) -#define ___bpf_syswrap_args3(x, args...) ___bpf_syswrap_args2(args), (void *)PT_REGS_PARM3_CORE_SYSCALL(regs) -#define ___bpf_syswrap_args4(x, args...) ___bpf_syswrap_args3(args), (void *)PT_REGS_PARM4_CORE_SYSCALL(regs) -#define ___bpf_syswrap_args5(x, args...) ___bpf_syswrap_args4(args), (void *)PT_REGS_PARM5_CORE_SYSCALL(regs) -#define ___bpf_syswrap_args6(x, args...) ___bpf_syswrap_args5(args), (void *)PT_REGS_PARM6_CORE_SYSCALL(regs) -#define ___bpf_syswrap_args7(x, args...) ___bpf_syswrap_args6(args), (void *)PT_REGS_PARM7_CORE_SYSCALL(regs) +#define ___bpf_syswrap_args1(x) ___bpf_syswrap_args0(), (unsigned long long)PT_REGS_PARM1_CORE_SYSCALL(regs) +#define ___bpf_syswrap_args2(x, args...) ___bpf_syswrap_args1(args), (unsigned long long)PT_REGS_PARM2_CORE_SYSCALL(regs) +#define ___bpf_syswrap_args3(x, args...) ___bpf_syswrap_args2(args), (unsigned long long)PT_REGS_PARM3_CORE_SYSCALL(regs) +#define ___bpf_syswrap_args4(x, args...) ___bpf_syswrap_args3(args), (unsigned long long)PT_REGS_PARM4_CORE_SYSCALL(regs) +#define ___bpf_syswrap_args5(x, args...) ___bpf_syswrap_args4(args), (unsigned long long)PT_REGS_PARM5_CORE_SYSCALL(regs) +#define ___bpf_syswrap_args6(x, args...) ___bpf_syswrap_args5(args), (unsigned long long)PT_REGS_PARM6_CORE_SYSCALL(regs) +#define ___bpf_syswrap_args7(x, args...) ___bpf_syswrap_args6(args), (unsigned long long)PT_REGS_PARM7_CORE_SYSCALL(regs) #define ___bpf_syswrap_args(args...) ___bpf_apply(___bpf_syswrap_args, ___bpf_narg(args))(args) /* diff --git a/tools/lib/bpf/usdt.bpf.h b/tools/lib/bpf/usdt.bpf.h index f6763300b26a..76359bcdc94a 100644 --- a/tools/lib/bpf/usdt.bpf.h +++ b/tools/lib/bpf/usdt.bpf.h @@ -214,18 +214,18 @@ long bpf_usdt_cookie(struct pt_regs *ctx) /* we rely on ___bpf_apply() and ___bpf_narg() macros already defined in bpf_tracing.h */ #define ___bpf_usdt_args0() ctx -#define ___bpf_usdt_args1(x) ___bpf_usdt_args0(), ({ long _x; bpf_usdt_arg(ctx, 0, &_x); (void *)_x; }) -#define ___bpf_usdt_args2(x, args...) ___bpf_usdt_args1(args), ({ long _x; bpf_usdt_arg(ctx, 1, &_x); (void *)_x; }) -#define ___bpf_usdt_args3(x, args...) ___bpf_usdt_args2(args), ({ long _x; bpf_usdt_arg(ctx, 2, &_x); (void *)_x; }) -#define ___bpf_usdt_args4(x, args...) ___bpf_usdt_args3(args), ({ long _x; bpf_usdt_arg(ctx, 3, &_x); (void *)_x; }) -#define ___bpf_usdt_args5(x, args...) ___bpf_usdt_args4(args), ({ long _x; bpf_usdt_arg(ctx, 4, &_x); (void *)_x; }) -#define ___bpf_usdt_args6(x, args...) ___bpf_usdt_args5(args), ({ long _x; bpf_usdt_arg(ctx, 5, &_x); (void *)_x; }) -#define ___bpf_usdt_args7(x, args...) ___bpf_usdt_args6(args), ({ long _x; bpf_usdt_arg(ctx, 6, &_x); (void *)_x; }) -#define ___bpf_usdt_args8(x, args...) ___bpf_usdt_args7(args), ({ long _x; bpf_usdt_arg(ctx, 7, &_x); (void *)_x; }) -#define ___bpf_usdt_args9(x, args...) ___bpf_usdt_args8(args), ({ long _x; bpf_usdt_arg(ctx, 8, &_x); (void *)_x; }) -#define ___bpf_usdt_args10(x, args...) ___bpf_usdt_args9(args), ({ long _x; bpf_usdt_arg(ctx, 9, &_x); (void *)_x; }) -#define ___bpf_usdt_args11(x, args...) ___bpf_usdt_args10(args), ({ long _x; bpf_usdt_arg(ctx, 10, &_x); (void *)_x; }) -#define ___bpf_usdt_args12(x, args...) ___bpf_usdt_args11(args), ({ long _x; bpf_usdt_arg(ctx, 11, &_x); (void *)_x; }) +#define ___bpf_usdt_args1(x) ___bpf_usdt_args0(), ({ long _x; bpf_usdt_arg(ctx, 0, &_x); _x; }) +#define ___bpf_usdt_args2(x, args...) ___bpf_usdt_args1(args), ({ long _x; bpf_usdt_arg(ctx, 1, &_x); _x; }) +#define ___bpf_usdt_args3(x, args...) ___bpf_usdt_args2(args), ({ long _x; bpf_usdt_arg(ctx, 2, &_x); _x; }) +#define ___bpf_usdt_args4(x, args...) ___bpf_usdt_args3(args), ({ long _x; bpf_usdt_arg(ctx, 3, &_x); _x; }) +#define ___bpf_usdt_args5(x, args...) ___bpf_usdt_args4(args), ({ long _x; bpf_usdt_arg(ctx, 4, &_x); _x; }) +#define ___bpf_usdt_args6(x, args...) ___bpf_usdt_args5(args), ({ long _x; bpf_usdt_arg(ctx, 5, &_x); _x; }) +#define ___bpf_usdt_args7(x, args...) ___bpf_usdt_args6(args), ({ long _x; bpf_usdt_arg(ctx, 6, &_x); _x; }) +#define ___bpf_usdt_args8(x, args...) ___bpf_usdt_args7(args), ({ long _x; bpf_usdt_arg(ctx, 7, &_x); _x; }) +#define ___bpf_usdt_args9(x, args...) ___bpf_usdt_args8(args), ({ long _x; bpf_usdt_arg(ctx, 8, &_x); _x; }) +#define ___bpf_usdt_args10(x, args...) ___bpf_usdt_args9(args), ({ long _x; bpf_usdt_arg(ctx, 9, &_x); _x; }) +#define ___bpf_usdt_args11(x, args...) ___bpf_usdt_args10(args), ({ long _x; bpf_usdt_arg(ctx, 10, &_x); _x; }) +#define ___bpf_usdt_args12(x, args...) ___bpf_usdt_args11(args), ({ long _x; bpf_usdt_arg(ctx, 11, &_x); _x; }) #define ___bpf_usdt_args(args...) ___bpf_apply(___bpf_usdt_args, ___bpf_narg(args))(args) /* From 8e6d9ae2e09f1f6ba65614a5e5c5a2a2e335dcba Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 3 May 2024 17:50:45 -0700 Subject: [PATCH 044/119] selftests/bpf: Use bpf_tracing.h instead of bpf_tcp_helpers.h The bpf programs that this patch changes require the BPF_PROG macro. The BPF_PROG macro is defined in the libbpf's bpf_tracing.h. Some tests include bpf_tcp_helpers.h which includes bpf_tracing.h. They don't need other things from bpf_tcp_helpers.h other than bpf_tracing.h. This patch simplifies it by directly including the bpf_tracing.h. The motivation of this unnecessary code churn is to retire the bpf_tcp_helpers.h by directly using vmlinux.h. Right now, the main usage of the bpf_tcp_helpers.h is the partial kernel socket definitions (e.g. socket, sock, tcp_sock). While the test cases continue to grow, fields are kept adding to those partial socket definitions (e.g. the recent bpf_cc_cubic.c test which tried to extend bpf_tcp_helpers.c but eventually used the vmlinux.h instead). The idea is to retire bpf_tcp_helpers.c and consistently use vmlinux.h for the tests that require the kernel sockets. This patch tackles the obvious tests that can directly use bpf_tracing.h instead of bpf_tcp_helpers.h. Signed-off-by: Martin KaFai Lau Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240504005045.848376-1-martin.lau@linux.dev --- tools/testing/selftests/bpf/progs/timer.c | 3 ++- tools/testing/selftests/bpf/progs/timer_failure.c | 2 +- tools/testing/selftests/bpf/progs/timer_mim.c | 2 +- tools/testing/selftests/bpf/progs/timer_mim_reject.c | 2 +- 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/timer.c b/tools/testing/selftests/bpf/progs/timer.c index f615da97df26..4c677c001258 100644 --- a/tools/testing/selftests/bpf/progs/timer.c +++ b/tools/testing/selftests/bpf/progs/timer.c @@ -2,9 +2,10 @@ /* Copyright (c) 2021 Facebook */ #include #include +#include #include #include -#include "bpf_tcp_helpers.h" +#include char _license[] SEC("license") = "GPL"; struct hmap_elem { diff --git a/tools/testing/selftests/bpf/progs/timer_failure.c b/tools/testing/selftests/bpf/progs/timer_failure.c index 0996c2486f05..5a2e9dabf1c6 100644 --- a/tools/testing/selftests/bpf/progs/timer_failure.c +++ b/tools/testing/selftests/bpf/progs/timer_failure.c @@ -5,8 +5,8 @@ #include #include #include +#include #include "bpf_misc.h" -#include "bpf_tcp_helpers.h" char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/timer_mim.c b/tools/testing/selftests/bpf/progs/timer_mim.c index 2fee7ab105ef..50ebc3f68522 100644 --- a/tools/testing/selftests/bpf/progs/timer_mim.c +++ b/tools/testing/selftests/bpf/progs/timer_mim.c @@ -4,7 +4,7 @@ #include #include #include -#include "bpf_tcp_helpers.h" +#include char _license[] SEC("license") = "GPL"; struct hmap_elem { diff --git a/tools/testing/selftests/bpf/progs/timer_mim_reject.c b/tools/testing/selftests/bpf/progs/timer_mim_reject.c index 5d648e3d8a41..dd3f1ed6d6e6 100644 --- a/tools/testing/selftests/bpf/progs/timer_mim_reject.c +++ b/tools/testing/selftests/bpf/progs/timer_mim_reject.c @@ -4,7 +4,7 @@ #include #include #include -#include "bpf_tcp_helpers.h" +#include char _license[] SEC("license") = "GPL"; struct hmap_elem { From e549b39a0ab8880d7ae6c6495b00fc1cb8f36174 Mon Sep 17 00:00:00 2001 From: Michal Schmidt Date: Mon, 6 May 2024 16:50:22 +0200 Subject: [PATCH 045/119] selftests/bpf: Fix pointer arithmetic in test_xdp_do_redirect MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cast operation has a higher precedence than addition. The code here wants to zero the 2nd half of the 64-bit metadata, but due to a pointer arithmetic mistake, it writes the zero at offset 16 instead. Just adding parentheses around "data + 4" would fix this, but I think this will be slightly better readable with array syntax. I was unable to test this with tools/testing/selftests/bpf/vmtest.sh, because my glibc is newer than glibc in the provided VM image. So I just checked the difference in the compiled code. objdump -S tools/testing/selftests/bpf/xdp_do_redirect.test.o: - *((__u32 *)data) = 0x42; /* metadata test value */ + ((__u32 *)data)[0] = 0x42; /* metadata test value */ be7: 48 8d 85 30 fc ff ff lea -0x3d0(%rbp),%rax bee: c7 00 42 00 00 00 movl $0x42,(%rax) - *((__u32 *)data + 4) = 0; + ((__u32 *)data)[1] = 0; bf4: 48 8d 85 30 fc ff ff lea -0x3d0(%rbp),%rax - bfb: 48 83 c0 10 add $0x10,%rax + bfb: 48 83 c0 04 add $0x4,%rax bff: c7 00 00 00 00 00 movl $0x0,(%rax) Fixes: 5640b6d89434 ("selftests/bpf: fix "metadata marker" getting overwritten by the netstack") Signed-off-by: Michal Schmidt Signed-off-by: Andrii Nakryiko Reviewed-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20240506145023.214248-1-mschmidt@redhat.com --- tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c b/tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c index 498d3bdaa4b0..bad0ea167be7 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c @@ -107,8 +107,8 @@ void test_xdp_do_redirect(void) .attach_point = BPF_TC_INGRESS); memcpy(&data[sizeof(__u64)], &pkt_udp, sizeof(pkt_udp)); - *((__u32 *)data) = 0x42; /* metadata test value */ - *((__u32 *)data + 4) = 0; + ((__u32 *)data)[0] = 0x42; /* metadata test value */ + ((__u32 *)data)[1] = 0; skel = test_xdp_do_redirect__open(); if (!ASSERT_OK_PTR(skel, "skel")) From 41b307ad756e1b7b618bf9d9c1cce3595705ede4 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Sun, 5 May 2024 16:00:54 -0700 Subject: [PATCH 046/119] bpftool, selftests/hid/bpf: Fix 29 clang warnings When building either tools/bpf/bpftool, or tools/testing/selftests/hid, (the same Makefile is used for these), clang generates many instances of the following: "clang: warning: -lLLVM-17: 'linker' input unused" Quentin points out that the LLVM version is only required in $(LIBS), not in $(CFLAGS), so the fix is to remove it from CFLAGS. Suggested-by: Quentin Monnet Signed-off-by: John Hubbard Signed-off-by: Andrii Nakryiko Acked-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20240505230054.13813-1-jhubbard@nvidia.com --- tools/bpf/bpftool/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile index b67454b45a49..dfa4f1bebbb3 100644 --- a/tools/bpf/bpftool/Makefile +++ b/tools/bpf/bpftool/Makefile @@ -147,7 +147,7 @@ ifeq ($(feature-llvm),1) # If LLVM is available, use it for JIT disassembly CFLAGS += -DHAVE_LLVM_SUPPORT LLVM_CONFIG_LIB_COMPONENTS := mcdisassembler all-targets - CFLAGS += $(shell $(LLVM_CONFIG) --cflags --libs $(LLVM_CONFIG_LIB_COMPONENTS)) + CFLAGS += $(shell $(LLVM_CONFIG) --cflags) LIBS += $(shell $(LLVM_CONFIG) --libs $(LLVM_CONFIG_LIB_COMPONENTS)) ifeq ($(shell $(LLVM_CONFIG) --shared-mode),static) LIBS += $(shell $(LLVM_CONFIG) --system-libs $(LLVM_CONFIG_LIB_COMPONENTS)) From d786957ebd3fb4cfd9147dbcccd1e8f3871b45ce Mon Sep 17 00:00:00 2001 From: Cupertino Miranda Date: Mon, 6 May 2024 15:18:44 +0100 Subject: [PATCH 047/119] bpf/verifier: replace calls to mark_reg_unknown. In order to further simplify the code in adjust_scalar_min_max_vals all the calls to mark_reg_unknown are replaced by __mark_reg_unknown. static void mark_reg_unknown(struct bpf_verifier_env *env, struct bpf_reg_state *regs, u32 regno) { if (WARN_ON(regno >= MAX_BPF_REG)) { ... mark all regs not init ... return; } __mark_reg_unknown(env, regs + regno); } The 'regno >= MAX_BPF_REG' does not apply to adjust_scalar_min_max_vals(), because it is only called from the following stack: - check_alu_op - adjust_reg_min_max_vals - adjust_scalar_min_max_vals The check_alu_op() does check_reg_arg() which verifies that both src and dst register numbers are within bounds. Signed-off-by: Cupertino Miranda Acked-by: Eduard Zingerman Cc: Yonghong Song Cc: Alexei Starovoitov Cc: David Faust Cc: Jose Marchesi Cc: Elena Zannoni Cc: Andrii Nakryiko Link: https://lore.kernel.org/r/20240506141849.185293-2-cupertino.miranda@oracle.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7360f04f9ec7..41c66cc6db80 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -13887,7 +13887,6 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, struct bpf_reg_state *dst_reg, struct bpf_reg_state src_reg) { - struct bpf_reg_state *regs = cur_regs(env); u8 opcode = BPF_OP(insn->code); bool src_known; s64 smin_val, smax_val; @@ -13994,7 +13993,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, /* Shifts greater than 31 or 63 are undefined. * This includes shifts by a negative number. */ - mark_reg_unknown(env, regs, insn->dst_reg); + __mark_reg_unknown(env, dst_reg); break; } if (alu32) @@ -14007,7 +14006,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, /* Shifts greater than 31 or 63 are undefined. * This includes shifts by a negative number. */ - mark_reg_unknown(env, regs, insn->dst_reg); + __mark_reg_unknown(env, dst_reg); break; } if (alu32) @@ -14020,7 +14019,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, /* Shifts greater than 31 or 63 are undefined. * This includes shifts by a negative number. */ - mark_reg_unknown(env, regs, insn->dst_reg); + __mark_reg_unknown(env, dst_reg); break; } if (alu32) @@ -14029,7 +14028,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, scalar_min_max_arsh(dst_reg, &src_reg); break; default: - mark_reg_unknown(env, regs, insn->dst_reg); + __mark_reg_unknown(env, dst_reg); break; } From 0922c78f592c60e5a8fe6ab968479def124d4ff3 Mon Sep 17 00:00:00 2001 From: Cupertino Miranda Date: Mon, 6 May 2024 15:18:45 +0100 Subject: [PATCH 048/119] bpf/verifier: refactor checks for range computation Split range computation checks in its own function, isolating pessimitic range set for dst_reg and failing return to a single point. Signed-off-by: Cupertino Miranda Acked-by: Eduard Zingerman Cc: Yonghong Song Cc: Alexei Starovoitov Cc: David Faust Cc: Jose Marchesi Cc: Elena Zannoni Cc: Andrii Nakryiko bpf/verifier: improve code after range computation recent changes. Link: https://lore.kernel.org/r/20240506141849.185293-3-cupertino.miranda@oracle.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 109 +++++++++++++++++------------------------- 1 file changed, 45 insertions(+), 64 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 41c66cc6db80..bdaf0413bf06 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -13878,6 +13878,50 @@ static void scalar_min_max_arsh(struct bpf_reg_state *dst_reg, __update_reg_bounds(dst_reg); } +static bool is_safe_to_compute_dst_reg_range(struct bpf_insn *insn, + const struct bpf_reg_state *src_reg) +{ + bool src_is_const = false; + u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32; + + if (insn_bitness == 32) { + if (tnum_subreg_is_const(src_reg->var_off) + && src_reg->s32_min_value == src_reg->s32_max_value + && src_reg->u32_min_value == src_reg->u32_max_value) + src_is_const = true; + } else { + if (tnum_is_const(src_reg->var_off) + && src_reg->smin_value == src_reg->smax_value + && src_reg->umin_value == src_reg->umax_value) + src_is_const = true; + } + + switch (BPF_OP(insn->code)) { + case BPF_ADD: + case BPF_SUB: + case BPF_AND: + return true; + + /* Compute range for the following only if the src_reg is const. + */ + case BPF_XOR: + case BPF_OR: + case BPF_MUL: + return src_is_const; + + /* Shift operators range is only computable if shift dimension operand + * is a constant. Shifts greater than 31 or 63 are undefined. This + * includes shifts by a negative number. + */ + case BPF_LSH: + case BPF_RSH: + case BPF_ARSH: + return (src_is_const && src_reg->umax_value < insn_bitness); + default: + return false; + } +} + /* WARNING: This function does calculations on 64-bit values, but the actual * execution may occur on 32-bit values. Therefore, things like bitshifts * need extra checks in the 32-bit case. @@ -13888,51 +13932,10 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, struct bpf_reg_state src_reg) { u8 opcode = BPF_OP(insn->code); - bool src_known; - s64 smin_val, smax_val; - u64 umin_val, umax_val; - s32 s32_min_val, s32_max_val; - u32 u32_min_val, u32_max_val; - u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32; bool alu32 = (BPF_CLASS(insn->code) != BPF_ALU64); int ret; - smin_val = src_reg.smin_value; - smax_val = src_reg.smax_value; - umin_val = src_reg.umin_value; - umax_val = src_reg.umax_value; - - s32_min_val = src_reg.s32_min_value; - s32_max_val = src_reg.s32_max_value; - u32_min_val = src_reg.u32_min_value; - u32_max_val = src_reg.u32_max_value; - - if (alu32) { - src_known = tnum_subreg_is_const(src_reg.var_off); - if ((src_known && - (s32_min_val != s32_max_val || u32_min_val != u32_max_val)) || - s32_min_val > s32_max_val || u32_min_val > u32_max_val) { - /* Taint dst register if offset had invalid bounds - * derived from e.g. dead branches. - */ - __mark_reg_unknown(env, dst_reg); - return 0; - } - } else { - src_known = tnum_is_const(src_reg.var_off); - if ((src_known && - (smin_val != smax_val || umin_val != umax_val)) || - smin_val > smax_val || umin_val > umax_val) { - /* Taint dst register if offset had invalid bounds - * derived from e.g. dead branches. - */ - __mark_reg_unknown(env, dst_reg); - return 0; - } - } - - if (!src_known && - opcode != BPF_ADD && opcode != BPF_SUB && opcode != BPF_AND) { + if (!is_safe_to_compute_dst_reg_range(insn, &src_reg)) { __mark_reg_unknown(env, dst_reg); return 0; } @@ -13989,46 +13992,24 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, scalar_min_max_xor(dst_reg, &src_reg); break; case BPF_LSH: - if (umax_val >= insn_bitness) { - /* Shifts greater than 31 or 63 are undefined. - * This includes shifts by a negative number. - */ - __mark_reg_unknown(env, dst_reg); - break; - } if (alu32) scalar32_min_max_lsh(dst_reg, &src_reg); else scalar_min_max_lsh(dst_reg, &src_reg); break; case BPF_RSH: - if (umax_val >= insn_bitness) { - /* Shifts greater than 31 or 63 are undefined. - * This includes shifts by a negative number. - */ - __mark_reg_unknown(env, dst_reg); - break; - } if (alu32) scalar32_min_max_rsh(dst_reg, &src_reg); else scalar_min_max_rsh(dst_reg, &src_reg); break; case BPF_ARSH: - if (umax_val >= insn_bitness) { - /* Shifts greater than 31 or 63 are undefined. - * This includes shifts by a negative number. - */ - __mark_reg_unknown(env, dst_reg); - break; - } if (alu32) scalar32_min_max_arsh(dst_reg, &src_reg); else scalar_min_max_arsh(dst_reg, &src_reg); break; default: - __mark_reg_unknown(env, dst_reg); break; } From 138cc42c05d11fd5ee82ee1606d2c9823373a926 Mon Sep 17 00:00:00 2001 From: Cupertino Miranda Date: Mon, 6 May 2024 15:18:46 +0100 Subject: [PATCH 049/119] bpf/verifier: improve XOR and OR range computation Range for XOR and OR operators would not be attempted unless src_reg would resolve to a single value, i.e. a known constant value. This condition is unnecessary, and the following XOR/OR operator handling could compute a possible better range. Acked-by: Eduard Zingerman Signed-off-by: Cupertino Miranda Cc: Yonghong Song Cc: Alexei Starovoitov Cc: David Faust Cc: Jose Marchesi Cc: Elena Zannoni Cc: Andrii Nakryiko Link: https://lore.kernel.org/r/20240506141849.185293-4-cupertino.miranda@oracle.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bdaf0413bf06..1f6deb3e44c5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -13900,12 +13900,12 @@ static bool is_safe_to_compute_dst_reg_range(struct bpf_insn *insn, case BPF_ADD: case BPF_SUB: case BPF_AND: + case BPF_XOR: + case BPF_OR: return true; /* Compute range for the following only if the src_reg is const. */ - case BPF_XOR: - case BPF_OR: case BPF_MUL: return src_is_const; From 5ec9a7d13f49b9c1c5ba854244d1f2ba414cf139 Mon Sep 17 00:00:00 2001 From: Cupertino Miranda Date: Mon, 6 May 2024 15:18:47 +0100 Subject: [PATCH 050/119] selftests/bpf: XOR and OR range computation tests. Added a test for bound computation in XOR and OR when non constant values are used and both registers have bounded ranges. Signed-off-by: Cupertino Miranda Acked-by: Eduard Zingerman Cc: Yonghong Song Cc: Alexei Starovoitov Cc: David Faust Cc: Jose Marchesi Cc: Elena Zannoni Cc: Andrii Nakryiko Link: https://lore.kernel.org/r/20240506141849.185293-5-cupertino.miranda@oracle.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/progs/verifier_bounds.c | 42 +++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/verifier_bounds.c b/tools/testing/selftests/bpf/progs/verifier_bounds.c index 960998f16306..7d570acf23ee 100644 --- a/tools/testing/selftests/bpf/progs/verifier_bounds.c +++ b/tools/testing/selftests/bpf/progs/verifier_bounds.c @@ -885,6 +885,48 @@ l1_%=: r0 = 0; \ : __clobber_all); } +SEC("socket") +__description("bounds check for non const xor src dst") +__success __log_level(2) +__msg("5: (af) r0 ^= r6 ; R0_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=431,var_off=(0x0; 0x1af))") +__naked void non_const_xor_src_dst(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r6 = r0; \ + call %[bpf_get_prandom_u32]; \ + r6 &= 0xaf; \ + r0 &= 0x1a0; \ + r0 ^= r6; \ + exit; \ +" : + : __imm(bpf_map_lookup_elem), + __imm_addr(map_hash_8b), + __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("bounds check for non const or src dst") +__success __log_level(2) +__msg("5: (4f) r0 |= r6 ; R0_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=431,var_off=(0x0; 0x1af))") +__naked void non_const_or_src_dst(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r6 = r0; \ + call %[bpf_get_prandom_u32]; \ + r6 &= 0xaf; \ + r0 &= 0x1a0; \ + r0 |= r6; \ + exit; \ +" : + : __imm(bpf_map_lookup_elem), + __imm_addr(map_hash_8b), + __imm(bpf_get_prandom_u32) + : __clobber_all); +} + SEC("socket") __description("bounds checks after 32-bit truncation. test 1") __success __failure_unpriv __msg_unpriv("R0 leaks addr") From 41d047a871062f1a4d1871a1908d380c14e75428 Mon Sep 17 00:00:00 2001 From: Cupertino Miranda Date: Mon, 6 May 2024 15:18:48 +0100 Subject: [PATCH 051/119] bpf/verifier: relax MUL range computation check MUL instruction required that src_reg would be a known value (i.e. src_reg would be a const value). The condition in this case can be relaxed, since the range computation algorithm used in current code already supports a proper range computation for any valid range value on its operands. Signed-off-by: Cupertino Miranda Acked-by: Eduard Zingerman Acked-by: Andrii Nakryiko Cc: Yonghong Song Cc: Alexei Starovoitov Cc: David Faust Cc: Jose Marchesi Cc: Elena Zannoni Link: https://lore.kernel.org/r/20240506141849.185293-6-cupertino.miranda@oracle.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1f6deb3e44c5..9e3aba08984e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -13902,12 +13902,8 @@ static bool is_safe_to_compute_dst_reg_range(struct bpf_insn *insn, case BPF_AND: case BPF_XOR: case BPF_OR: - return true; - - /* Compute range for the following only if the src_reg is const. - */ case BPF_MUL: - return src_is_const; + return true; /* Shift operators range is only computable if shift dimension operand * is a constant. Shifts greater than 31 or 63 are undefined. This From 92956786b4e26ea22e5b3c1c86cc71f5c9b3b9d8 Mon Sep 17 00:00:00 2001 From: Cupertino Miranda Date: Mon, 6 May 2024 15:18:49 +0100 Subject: [PATCH 052/119] selftests/bpf: MUL range computation tests. Added a test for bound computation in MUL when non constant values are used and both registers have bounded ranges. Signed-off-by: Cupertino Miranda Acked-by: Eduard Zingerman Acked-by: Andrii Nakryiko Cc: Yonghong Song Cc: Alexei Starovoitov Cc: David Faust Cc: Jose Marchesi Cc: Elena Zannoni Link: https://lore.kernel.org/r/20240506141849.185293-7-cupertino.miranda@oracle.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/progs/verifier_bounds.c | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/verifier_bounds.c b/tools/testing/selftests/bpf/progs/verifier_bounds.c index 7d570acf23ee..a0bb7fb40ea5 100644 --- a/tools/testing/selftests/bpf/progs/verifier_bounds.c +++ b/tools/testing/selftests/bpf/progs/verifier_bounds.c @@ -927,6 +927,27 @@ __naked void non_const_or_src_dst(void) : __clobber_all); } +SEC("socket") +__description("bounds check for non const mul regs") +__success __log_level(2) +__msg("5: (2f) r0 *= r6 ; R0_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=3825,var_off=(0x0; 0xfff))") +__naked void non_const_mul_regs(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r6 = r0; \ + call %[bpf_get_prandom_u32]; \ + r6 &= 0xff; \ + r0 &= 0x0f; \ + r0 *= r6; \ + exit; \ +" : + : __imm(bpf_map_lookup_elem), + __imm_addr(map_hash_8b), + __imm(bpf_get_prandom_u32) + : __clobber_all); +} + SEC("socket") __description("bounds checks after 32-bit truncation. test 1") __success __failure_unpriv __msg_unpriv("R0 leaks addr") From 75b0fbf15d8466be618a997cae774eef445c0c7d Mon Sep 17 00:00:00 2001 From: Haiyue Wang Date: Tue, 7 May 2024 14:33:39 +0800 Subject: [PATCH 053/119] bpf: Remove redundant page mask of vmf->address As the comment described in "struct vm_fault": ".address" : 'Faulting virtual address - masked' ".real_address" : 'Faulting virtual address - unmasked' The link [1] said: "Whatever the routes, all architectures end up to the invocation of handle_mm_fault() which, in turn, (likely) ends up calling __handle_mm_fault() to carry out the actual work of allocating the page tables." __handle_mm_fault() does address assignment: .address = address & PAGE_MASK, .real_address = address, This is debug dump by running `./test_progs -a "*arena*"`: [ 69.767494] arena fault: vmf->address = 10000001d000, vmf->real_address = 10000001d008 [ 69.767496] arena fault: vmf->address = 10000001c000, vmf->real_address = 10000001c008 [ 69.767499] arena fault: vmf->address = 10000001b000, vmf->real_address = 10000001b008 [ 69.767501] arena fault: vmf->address = 10000001a000, vmf->real_address = 10000001a008 [ 69.767504] arena fault: vmf->address = 100000019000, vmf->real_address = 100000019008 [ 69.769388] arena fault: vmf->address = 10000001e000, vmf->real_address = 10000001e1e8 So we can use the value of 'vmf->address' to do BPF arena kernel address space cast directly. [1] https://docs.kernel.org/mm/page_tables.html Signed-off-by: Haiyue Wang Link: https://lore.kernel.org/r/20240507063358.8048-1-haiyue.wang@intel.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/arena.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 6c81630c5293..f5953f1a95cd 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -251,7 +251,7 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf) int ret; kbase = bpf_arena_get_kern_vm_start(arena); - kaddr = kbase + (u32)(vmf->address & PAGE_MASK); + kaddr = kbase + (u32)(vmf->address); guard(mutex)(&arena->lock); page = vmalloc_to_page((void *)kaddr); From 2ce987e1650216638b2b5f44948c6efea67038ae Mon Sep 17 00:00:00 2001 From: "Jose E. Marchesi" Date: Tue, 7 May 2024 09:42:26 +0200 Subject: [PATCH 054/119] bpf: Avoid __hidden__ attribute in static object An object defined as `static' defaults to hidden visibility. If additionally the visibility(__weak__) compiler attribute is applied to the declaration of the object, GCC warns that the attribute gets ignored. This patch removes the only instance of this problem among the BPF selftests. Tested in bpf-next master. Signed-off-by: Jose E. Marchesi Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20240507074227.4523-2-jose.marchesi@oracle.com --- tools/testing/selftests/bpf/progs/cpumask_common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/progs/cpumask_common.h b/tools/testing/selftests/bpf/progs/cpumask_common.h index c705d8112a35..b979e91f55f0 100644 --- a/tools/testing/selftests/bpf/progs/cpumask_common.h +++ b/tools/testing/selftests/bpf/progs/cpumask_common.h @@ -9,7 +9,7 @@ int err; -#define private(name) SEC(".bss." #name) __hidden __attribute__((aligned(8))) +#define private(name) SEC(".bss." #name) __attribute__((aligned(8))) private(MASK) static struct bpf_cpumask __kptr * global_mask; struct __cpumask_map_value { From b0fbdf759da05a35b67fd27b8859738b79af25d6 Mon Sep 17 00:00:00 2001 From: "Jose E. Marchesi" Date: Tue, 7 May 2024 09:42:27 +0200 Subject: [PATCH 055/119] bpf: Disable some `attribute ignored' warnings in GCC This patch modifies selftests/bpf/Makefile to pass -Wno-attributes to GCC. This is because of the following attributes which are ignored: - btf_decl_tag - btf_type_tag There are many of these. At the moment none of these are recognized/handled by gcc-bpf. We are aware that btf_decl_tag is necessary for some of the selftest harness to communicate test failure/success. Support for it is in progress in GCC upstream: https://gcc.gnu.org/pipermail/gcc-patches/2024-May/650482.html However, the GCC master branch is not yet open, so the series above (currently under review upstream) wont be able to make it there until 14.1 gets released, probably mid next week. As for btf_type_tag, more extensive work will be needed in GCC upstream to support it in both BTF and DWARF. We have a WIP big patch for that, but that is not needed to compile/build the selftests. - used There are SEC macros defined in the selftests as: #define SEC(N) __attribute__((section(N),used)) The SEC macro is used for both functions and global variables. According to the GCC documentation `used' attribute is really only meaningful for functions, and it warns when the attribute is used for other global objects, like for example ctl_array in test_xdp_noinline.c. Ignoring this is benign. - align_value In progs/test_cls_redirect.c:127 there is: typedef uint8_t *net_ptr __attribute__((align_value(8))); GCC warns that it is ignoring this attribute, because it is not implemented by GCC. I think ignoring this attribute in GCC is benign, because according to the clang documentation [1] its purpose seems to be merely declarative and doesn't seem to translate into extra checks at run-time, only to perhaps better optimized code ("runtime behavior is undefined if the pointed memory object is not aligned to the specified alignment"). [1] https://clang.llvm.org/docs/AttributeReference.html#align-value Tested in bpf-next master. Signed-off-by: Jose E. Marchesi Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20240507074227.4523-3-jose.marchesi@oracle.com --- tools/testing/selftests/bpf/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index ba28d42b74db..5d9c906bc3cb 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -431,7 +431,7 @@ endef # Build BPF object using GCC define GCC_BPF_BUILD_RULE $(call msg,GCC-BPF,$(TRUNNER_BINARY),$2) - $(Q)$(BPF_GCC) $3 -O2 -c $1 -o $2 + $(Q)$(BPF_GCC) $3 -Wno-attributes -O2 -c $1 -o $2 endef SKEL_BLACKLIST := btf__% test_pinning_invalid.c test_sk_assign.c From 675b4e24bc50f4600b6bf3527fdbaa1f73498334 Mon Sep 17 00:00:00 2001 From: "Jose E. Marchesi" Date: Tue, 7 May 2024 11:50:11 +0200 Subject: [PATCH 056/119] bpf: Temporarily define BPF_NO_PRESEVE_ACCESS_INDEX for GCC The vmlinux.h file generated by bpftool makes use of compiler pragmas in order to install the CO-RE preserve_access_index in all the struct types derived from the BTF info: #ifndef __VMLINUX_H__ #define __VMLINUX_H__ #ifndef BPF_NO_PRESERVE_ACCESS_INDEX #pragma clang attribute push (__attribute__((preserve_access_index)), apply_t = record #endif [... type definitions generated from kernel BTF ... ] #ifndef BPF_NO_PRESERVE_ACCESS_INDEX #pragma clang attribute pop #endif The `clang attribute push/pop' pragmas are specific to clang/llvm and are not supported by GCC. At the moment the BTF dumping services in libbpf do not support dicriminating between types dumped because they are directly referred and types dumped because they are dependencies. A suitable API is being worked now. See [1] and [2]. In the interim, this patch changes the selftests/bpf Makefile so it passes -DBPF_NO_PRESERVE_ACCESS_INDEX to GCC when it builds the selftests. This workaround is temporary, and may have an impact on the results of the GCC-built tests. [1] https://lore.kernel.org/bpf/20240503111836.25275-1-jose.marchesi@oracle.com/T/#u [2] https://lore.kernel.org/bpf/20240504205510.24785-1-jose.marchesi@oracle.com/T/#u Tested in bpf-next master. No regressions. Signed-off-by: Jose E. Marchesi Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240507095011.15867-1-jose.marchesi@oracle.com --- tools/testing/selftests/bpf/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 5d9c906bc3cb..f0c429cf4424 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -431,7 +431,7 @@ endef # Build BPF object using GCC define GCC_BPF_BUILD_RULE $(call msg,GCC-BPF,$(TRUNNER_BINARY),$2) - $(Q)$(BPF_GCC) $3 -Wno-attributes -O2 -c $1 -o $2 + $(Q)$(BPF_GCC) $3 -DBPF_NO_PRESERVE_ACCESS_INDEX -Wno-attributes -O2 -c $1 -o $2 endef SKEL_BLACKLIST := btf__% test_pinning_invalid.c test_sk_assign.c From 207cf6e649ee551ab3bdb1cfe1b2848e6a4337a5 Mon Sep 17 00:00:00 2001 From: Cupertino Miranda Date: Tue, 7 May 2024 13:22:19 +0100 Subject: [PATCH 057/119] selftests/bpf: Add CFLAGS per source file and runner This patch adds support to specify CFLAGS per source file and per test runner. Signed-off-by: Cupertino Miranda Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20240507122220.207820-2-cupertino.miranda@oracle.com --- tools/testing/selftests/bpf/Makefile | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index f0c429cf4424..511ea84139a3 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -81,11 +81,11 @@ TEST_INST_SUBDIRS += bpf_gcc # The following tests contain C code that, although technically legal, # triggers GCC warnings that cannot be disabled: declaration of # anonymous struct types in function parameter lists. -progs/btf_dump_test_case_bitfields.c-CFLAGS := -Wno-error -progs/btf_dump_test_case_namespacing.c-CFLAGS := -Wno-error -progs/btf_dump_test_case_packing.c-CFLAGS := -Wno-error -progs/btf_dump_test_case_padding.c-CFLAGS := -Wno-error -progs/btf_dump_test_case_syntax.c-CFLAGS := -Wno-error +progs/btf_dump_test_case_bitfields.c-bpf_gcc-CFLAGS := -Wno-error +progs/btf_dump_test_case_namespacing.c-bpf_gcc-CFLAGS := -Wno-error +progs/btf_dump_test_case_packing.c-bpf_gcc-CFLAGS := -Wno-error +progs/btf_dump_test_case_padding.c-bpf_gcc-CFLAGS := -Wno-error +progs/btf_dump_test_case_syntax.c-bpf_gcc-CFLAGS := -Wno-error endif ifneq ($(CLANG_CPUV4),) @@ -470,7 +470,7 @@ LINKED_BPF_SRCS := $(patsubst %.bpf.o,%.c,$(foreach skel,$(LINKED_SKELS),$($(ske # $eval()) and pass control to DEFINE_TEST_RUNNER_RULES. # Parameters: # $1 - test runner base binary name (e.g., test_progs) -# $2 - test runner extra "flavor" (e.g., no_alu32, cpuv4, gcc-bpf, etc) +# $2 - test runner extra "flavor" (e.g., no_alu32, cpuv4, bpf_gcc, etc) define DEFINE_TEST_RUNNER TRUNNER_OUTPUT := $(OUTPUT)$(if $2,/)$2 @@ -498,7 +498,7 @@ endef # Using TRUNNER_XXX variables, provided by callers of DEFINE_TEST_RUNNER and # set up by DEFINE_TEST_RUNNER itself, create test runner build rules with: # $1 - test runner base binary name (e.g., test_progs) -# $2 - test runner extra "flavor" (e.g., no_alu32, cpuv4, gcc-bpf, etc) +# $2 - test runner extra "flavor" (e.g., no_alu32, cpuv4, bpf_gcc, etc) define DEFINE_TEST_RUNNER_RULES ifeq ($($(TRUNNER_OUTPUT)-dir),) @@ -521,7 +521,8 @@ $(TRUNNER_BPF_OBJS): $(TRUNNER_OUTPUT)/%.bpf.o: \ | $(TRUNNER_OUTPUT) $$(BPFOBJ) $$(call $(TRUNNER_BPF_BUILD_RULE),$$<,$$@, \ $(TRUNNER_BPF_CFLAGS) \ - $$($$<-CFLAGS)) + $$($$<-CFLAGS) \ + $$($$<-$2-CFLAGS)) $(TRUNNER_BPF_SKELS): %.skel.h: %.bpf.o $(BPFTOOL) | $(TRUNNER_OUTPUT) $$(call msg,GEN-SKEL,$(TRUNNER_BINARY),$$@) From b2e086cb28aa358f7b5564888304908aff735827 Mon Sep 17 00:00:00 2001 From: Cupertino Miranda Date: Tue, 7 May 2024 13:22:20 +0100 Subject: [PATCH 058/119] selftests/bpf: Change functions definitions to support GCC The test_xdp_noinline.c contains 2 functions that use more then 5 arguments. This patch collapses the 2 last arguments in an array. Also in GCC and ipa_sra optimization increases the number of arguments used in function encap_v4. This pass disables the optimization for that particular file. Signed-off-by: Cupertino Miranda Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20240507122220.207820-3-cupertino.miranda@oracle.com --- .../selftests/bpf/progs/test_xdp_noinline.c | 27 ++++++++++++++----- 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/test_xdp_noinline.c b/tools/testing/selftests/bpf/progs/test_xdp_noinline.c index 5c7e4758a0ca..fad94e41cef9 100644 --- a/tools/testing/selftests/bpf/progs/test_xdp_noinline.c +++ b/tools/testing/selftests/bpf/progs/test_xdp_noinline.c @@ -318,6 +318,14 @@ bool encap_v6(struct xdp_md *xdp, struct ctl_value *cval, return true; } +#ifndef __clang__ +#pragma GCC push_options +/* GCC optimization collapses functions and increases the number of arguments + * beyond the compatible amount supported by BPF. + */ +#pragma GCC optimize("-fno-ipa-sra") +#endif + static __attribute__ ((noinline)) bool encap_v4(struct xdp_md *xdp, struct ctl_value *cval, struct packet_description *pckt, @@ -372,6 +380,10 @@ bool encap_v4(struct xdp_md *xdp, struct ctl_value *cval, return true; } +#ifndef __clang__ +#pragma GCC pop_options +#endif + static __attribute__ ((noinline)) int swap_mac_and_send(void *data, void *data_end) { @@ -588,12 +600,13 @@ static void connection_table_lookup(struct real_definition **real, __attribute__ ((noinline)) static int process_l3_headers_v6(struct packet_description *pckt, __u8 *protocol, __u64 off, - __u16 *pkt_bytes, void *data, - void *data_end) + __u16 *pkt_bytes, void *extra_args[2]) { struct ipv6hdr *ip6h; __u64 iph_len; int action; + void *data = extra_args[0]; + void *data_end = extra_args[1]; ip6h = data + off; if (ip6h + 1 > data_end) @@ -619,11 +632,12 @@ static int process_l3_headers_v6(struct packet_description *pckt, __attribute__ ((noinline)) static int process_l3_headers_v4(struct packet_description *pckt, __u8 *protocol, __u64 off, - __u16 *pkt_bytes, void *data, - void *data_end) + __u16 *pkt_bytes, void *extra_args[2]) { struct iphdr *iph; int action; + void *data = extra_args[0]; + void *data_end = extra_args[1]; iph = data + off; if (iph + 1 > data_end) @@ -666,13 +680,14 @@ static int process_packet(void *data, __u64 off, void *data_end, __u8 protocol; __u32 vip_num; int action; + void *extra_args[2] = { data, data_end }; if (is_ipv6) action = process_l3_headers_v6(&pckt, &protocol, off, - &pkt_bytes, data, data_end); + &pkt_bytes, extra_args); else action = process_l3_headers_v4(&pckt, &protocol, off, - &pkt_bytes, data, data_end); + &pkt_bytes, extra_args); if (action >= 0) return action; protocol = pckt.flow.proto; From 8374b56b1df5566d19d645e49da2bf31b660bcfd Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 6 May 2024 17:13:29 -0700 Subject: [PATCH 059/119] libbpf: remove unnecessary struct_ops prog validity check libbpf ensures that BPF program references set in map->st_ops->progs[i] during open phase are always valid STRUCT_OPS programs. This is done in bpf_object__collect_st_ops_relos(). So there is no need to double-check that in bpf_map__init_kern_struct_ops(). Simplify the code by removing unnecessary check. Also, we avoid using local prog variable to keep code similar to the upcoming fix, which adds similar logic in another part of bpf_map__init_kern_struct_ops(). Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240507001335.1445325-2-andrii@kernel.org Signed-off-by: Martin KaFai Lau --- tools/lib/bpf/libbpf.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index a3566a456bc8..7d77a1b158ce 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -1152,22 +1152,15 @@ static int bpf_map__init_kern_struct_ops(struct bpf_map *map) return -ENOTSUP; } - prog = st_ops->progs[i]; - if (prog) { + if (st_ops->progs[i]) { /* If we had declaratively set struct_ops callback, we need to - * first validate that it's actually a struct_ops program. - * And then force its autoload to false, because it doesn't have + * force its autoload to false, because it doesn't have * a chance of succeeding from POV of the current struct_ops map. * If this program is still referenced somewhere else, though, * then bpf_object_adjust_struct_ops_autoload() will update its * autoload accordingly. */ - if (!is_valid_st_ops_program(obj, prog)) { - pr_warn("struct_ops init_kern %s: member %s is declaratively assigned a non-struct_ops program\n", - map->name, mname); - return -EINVAL; - } - prog->autoload = false; + st_ops->progs[i]->autoload = false; st_ops->progs[i] = NULL; } From e18e2e70dbd1ee3099049557060067b6ec703efa Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 6 May 2024 17:13:30 -0700 Subject: [PATCH 060/119] libbpf: handle yet another corner case of nulling out struct_ops program There is yet another corner case where user can set STRUCT_OPS program reference in STRUCT_OPS map to NULL, but libbpf will fail to disable autoload for such BPF program. This time it's the case of "new" kernel which has type information about callback field, but user explicitly nulled-out program reference from user-space after opening BPF object. Fix, hopefully, the last remaining unhandled case. Fixes: 0737df6de946 ("libbpf: better fix for handling nulled-out struct_ops program") Fixes: f973fccd43d3 ("libbpf: handle nulled-out program in struct_ops correctly") Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240507001335.1445325-3-andrii@kernel.org Signed-off-by: Martin KaFai Lau --- tools/lib/bpf/libbpf.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 7d77a1b158ce..04de4fb81785 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -1193,11 +1193,19 @@ static int bpf_map__init_kern_struct_ops(struct bpf_map *map) } if (btf_is_ptr(mtype)) { - /* Update the value from the shadow type */ prog = *(void **)mdata; + /* just like for !kern_member case above, reset declaratively + * set (at compile time) program's autload to false, + * if user replaced it with another program or NULL + */ + if (st_ops->progs[i] && st_ops->progs[i] != prog) + st_ops->progs[i]->autoload = false; + + /* Update the value from the shadow type */ st_ops->progs[i] = prog; if (!prog) continue; + if (!is_valid_st_ops_program(obj, prog)) { pr_warn("struct_ops init_kern %s: member %s is not a struct_ops program\n", map->name, mname); From 9d66d60e968d85742569d025a2fb509cb57333bb Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 6 May 2024 17:13:31 -0700 Subject: [PATCH 061/119] selftests/bpf: add another struct_ops callback use case test Add a test which tests the case that was just fixed. Kernel has full type information about callback, but user explicitly nulls out the reference to declaratively set BPF program reference. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240507001335.1445325-4-andrii@kernel.org Signed-off-by: Martin KaFai Lau --- .../bpf/prog_tests/test_struct_ops_module.c | 27 +++++++++++++++++++ .../bpf/progs/struct_ops_nulled_out_cb.c | 22 +++++++++++++++ 2 files changed, 49 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/struct_ops_nulled_out_cb.c diff --git a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c index bd39586abd5a..f3c61ebad323 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c +++ b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c @@ -4,6 +4,7 @@ #include #include "struct_ops_module.skel.h" +#include "struct_ops_nulled_out_cb.skel.h" static void check_map_info(struct bpf_map_info *info) { @@ -174,6 +175,30 @@ cleanup: struct_ops_module__destroy(skel); } +/* validate that it's ok to "turn off" callback that kernel supports */ +static void test_struct_ops_nulled_out_cb(void) +{ + struct struct_ops_nulled_out_cb *skel; + int err; + + skel = struct_ops_nulled_out_cb__open(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + + /* kernel knows about test_1, but we still null it out */ + skel->struct_ops.ops->test_1 = NULL; + + err = struct_ops_nulled_out_cb__load(skel); + if (!ASSERT_OK(err, "skel_load")) + goto cleanup; + + ASSERT_FALSE(bpf_program__autoload(skel->progs.test_1_turn_off), "prog_autoload"); + ASSERT_LT(bpf_program__fd(skel->progs.test_1_turn_off), 0, "prog_fd"); + +cleanup: + struct_ops_nulled_out_cb__destroy(skel); +} + void serial_test_struct_ops_module(void) { if (test__start_subtest("test_struct_ops_load")) @@ -182,5 +207,7 @@ void serial_test_struct_ops_module(void) test_struct_ops_not_zeroed(); if (test__start_subtest("test_struct_ops_incompatible")) test_struct_ops_incompatible(); + if (test__start_subtest("test_struct_ops_null_out_cb")) + test_struct_ops_nulled_out_cb(); } diff --git a/tools/testing/selftests/bpf/progs/struct_ops_nulled_out_cb.c b/tools/testing/selftests/bpf/progs/struct_ops_nulled_out_cb.c new file mode 100644 index 000000000000..fa2021388485 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/struct_ops_nulled_out_cb.c @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ +#include +#include +#include "../bpf_testmod/bpf_testmod.h" + +char _license[] SEC("license") = "GPL"; + +int rand; +int arr[1]; + +SEC("struct_ops/test_1") +int BPF_PROG(test_1_turn_off) +{ + return arr[rand]; /* potentially way out of range access */ +} + +SEC(".struct_ops.link") +struct bpf_testmod_ops ops = { + .test_1 = (void *)test_1_turn_off, +}; + From 548c2ede0dc81cb8c86f3a72c1c63fe1c179cbfe Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 6 May 2024 17:13:32 -0700 Subject: [PATCH 062/119] libbpf: fix libbpf_strerror_r() handling unknown errors strerror_r(), used from libbpf-specific libbpf_strerror_r() wrapper is documented to return error in two different ways, depending on glibc version. Take that into account when handling strerror_r()'s own errors, which happens when we pass some non-standard (internal) kernel error to it. Before this patch we'd have "ERROR: strerror_r(524)=22", which is quite confusing. Now for the same situation we'll see a bit less visually scary "unknown error (-524)". At least we won't confuse user with irrelevant EINVAL (22). Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240507001335.1445325-5-andrii@kernel.org Signed-off-by: Martin KaFai Lau --- tools/lib/bpf/str_error.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/tools/lib/bpf/str_error.c b/tools/lib/bpf/str_error.c index 146da01979c7..5e6a1e27ddf9 100644 --- a/tools/lib/bpf/str_error.c +++ b/tools/lib/bpf/str_error.c @@ -2,6 +2,7 @@ #undef _GNU_SOURCE #include #include +#include #include "str_error.h" /* make sure libbpf doesn't use kernel-only integer typedefs */ @@ -15,7 +16,18 @@ char *libbpf_strerror_r(int err, char *dst, int len) { int ret = strerror_r(err < 0 ? -err : err, dst, len); - if (ret) - snprintf(dst, len, "ERROR: strerror_r(%d)=%d", err, ret); + /* on glibc <2.13, ret == -1 and errno is set, if strerror_r() can't + * handle the error, on glibc >=2.13 *positive* (errno-like) error + * code is returned directly + */ + if (ret == -1) + ret = errno; + if (ret) { + if (ret == EINVAL) + /* strerror_r() doesn't recognize this specific error */ + snprintf(dst, len, "unknown error (%d)", err < 0 ? err : -err); + else + snprintf(dst, len, "ERROR: strerror_r(%d)=%d", err, ret); + } return dst; } From c78420bafe7cf9ce14fa7ceb40ce62e1372e661d Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 6 May 2024 17:13:33 -0700 Subject: [PATCH 063/119] libbpf: improve early detection of doomed-to-fail BPF program loading Extend libbpf's pre-load checks for BPF programs, detecting more typical conditions that are destinated to cause BPF program failure. This is an opportunity to provide more helpful and actionable error message to users, instead of potentially very confusing BPF verifier log and/or error. In this case, we detect struct_ops BPF program that was not referenced anywhere, but still attempted to be loaded (according to libbpf logic). Suggest that the program might need to be used in some struct_ops variable. User will get a message of the following kind: libbpf: prog 'test_1_forgotten': SEC("struct_ops") program isn't referenced anywhere, did you forget to use it? Suggested-by: Tejun Heo Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240507001335.1445325-6-andrii@kernel.org Signed-off-by: Martin KaFai Lau --- tools/lib/bpf/libbpf.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 04de4fb81785..5401f2df463d 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -7372,7 +7372,11 @@ static int bpf_object_load_prog(struct bpf_object *obj, struct bpf_program *prog __u32 log_level = prog->log_level; int ret, err; - if (prog->type == BPF_PROG_TYPE_UNSPEC) { + /* Be more helpful by rejecting programs that can't be validated early + * with more meaningful and actionable error message. + */ + switch (prog->type) { + case BPF_PROG_TYPE_UNSPEC: /* * The program type must be set. Most likely we couldn't find a proper * section definition at load time, and thus we didn't infer the type. @@ -7380,6 +7384,15 @@ static int bpf_object_load_prog(struct bpf_object *obj, struct bpf_program *prog pr_warn("prog '%s': missing BPF prog type, check ELF section name '%s'\n", prog->name, prog->sec_name); return -EINVAL; + case BPF_PROG_TYPE_STRUCT_OPS: + if (prog->attach_btf_id == 0) { + pr_warn("prog '%s': SEC(\"struct_ops\") program isn't referenced anywhere, did you forget to use it?\n", + prog->name); + return -EINVAL; + } + break; + default: + break; } if (!insns || !insns_cnt) From 41df0733ea414a49094258adab4d600db0420731 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 6 May 2024 17:13:34 -0700 Subject: [PATCH 064/119] selftests/bpf: validate struct_ops early failure detection logic Add a simple test that validates that libbpf will reject isolated struct_ops program early with helpful warning message. Also validate that explicit use of such BPF program through BPF skeleton after BPF object is open won't trigger any warnings. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240507001335.1445325-7-andrii@kernel.org Signed-off-by: Martin KaFai Lau --- .../bpf/prog_tests/test_struct_ops_module.c | 45 +++++++++++++++++++ .../bpf/progs/struct_ops_forgotten_cb.c | 19 ++++++++ 2 files changed, 64 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/struct_ops_forgotten_cb.c diff --git a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c index f3c61ebad323..3785b648c8ad 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c +++ b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c @@ -5,6 +5,7 @@ #include "struct_ops_module.skel.h" #include "struct_ops_nulled_out_cb.skel.h" +#include "struct_ops_forgotten_cb.skel.h" static void check_map_info(struct bpf_map_info *info) { @@ -199,6 +200,48 @@ cleanup: struct_ops_nulled_out_cb__destroy(skel); } +/* validate that libbpf generates reasonable error message if struct_ops is + * not referenced in any struct_ops map + */ +static void test_struct_ops_forgotten_cb(void) +{ + struct struct_ops_forgotten_cb *skel; + char *log; + int err; + + skel = struct_ops_forgotten_cb__open(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + + start_libbpf_log_capture(); + + err = struct_ops_forgotten_cb__load(skel); + if (!ASSERT_ERR(err, "skel_load")) + goto cleanup; + + log = stop_libbpf_log_capture(); + ASSERT_HAS_SUBSTR(log, + "prog 'test_1_forgotten': SEC(\"struct_ops\") program isn't referenced anywhere, did you forget to use it?", + "libbpf_log"); + free(log); + + struct_ops_forgotten_cb__destroy(skel); + + /* now let's programmatically use it, we should be fine now */ + skel = struct_ops_forgotten_cb__open(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + + skel->struct_ops.ops->test_1 = skel->progs.test_1_forgotten; /* not anymore */ + + err = struct_ops_forgotten_cb__load(skel); + if (!ASSERT_OK(err, "skel_load")) + goto cleanup; + +cleanup: + struct_ops_forgotten_cb__destroy(skel); +} + void serial_test_struct_ops_module(void) { if (test__start_subtest("test_struct_ops_load")) @@ -209,5 +252,7 @@ void serial_test_struct_ops_module(void) test_struct_ops_incompatible(); if (test__start_subtest("test_struct_ops_null_out_cb")) test_struct_ops_nulled_out_cb(); + if (test__start_subtest("struct_ops_forgotten_cb")) + test_struct_ops_forgotten_cb(); } diff --git a/tools/testing/selftests/bpf/progs/struct_ops_forgotten_cb.c b/tools/testing/selftests/bpf/progs/struct_ops_forgotten_cb.c new file mode 100644 index 000000000000..3c822103bd40 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/struct_ops_forgotten_cb.c @@ -0,0 +1,19 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ +#include +#include +#include "../bpf_testmod/bpf_testmod.h" + +char _license[] SEC("license") = "GPL"; + +SEC("struct_ops/test_1") +int BPF_PROG(test_1_forgotten) +{ + return 0; +} + +SEC(".struct_ops.link") +struct bpf_testmod_ops ops = { + /* we forgot to reference test_1_forgotten above, oops */ +}; + From 7b9959b8cdbc40b31b4c66bb900ec8d5e5b305bd Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 6 May 2024 17:13:35 -0700 Subject: [PATCH 065/119] selftests/bpf: shorten subtest names for struct_ops_module test Drive-by clean up, we shouldn't use meaningless "test_" prefix for subtest names. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240507001335.1445325-8-andrii@kernel.org Signed-off-by: Martin KaFai Lau --- .../selftests/bpf/prog_tests/test_struct_ops_module.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c index 3785b648c8ad..29e183a80f49 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c +++ b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c @@ -244,13 +244,13 @@ cleanup: void serial_test_struct_ops_module(void) { - if (test__start_subtest("test_struct_ops_load")) + if (test__start_subtest("struct_ops_load")) test_struct_ops_load(); - if (test__start_subtest("test_struct_ops_not_zeroed")) + if (test__start_subtest("struct_ops_not_zeroed")) test_struct_ops_not_zeroed(); - if (test__start_subtest("test_struct_ops_incompatible")) + if (test__start_subtest("struct_ops_incompatible")) test_struct_ops_incompatible(); - if (test__start_subtest("test_struct_ops_null_out_cb")) + if (test__start_subtest("struct_ops_null_out_cb")) test_struct_ops_nulled_out_cb(); if (test__start_subtest("struct_ops_forgotten_cb")) test_struct_ops_forgotten_cb(); From e612b5c1d3ee325aff991b4078b4999bf6bac096 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Fri, 26 Apr 2024 16:11:16 +0000 Subject: [PATCH 066/119] bpf, arm64: Add support for lse atomics in bpf_arena When LSE atomics are available, BPF atomic instructions are implemented as single ARM64 atomic instructions, therefore it is easy to enable these in bpf_arena using the currently available exception handling setup. LL_SC atomics use loops and therefore would need more work to enable in bpf_arena. Enable LSE atomics based instructions in bpf_arena and use the bpf_jit_supports_insn() callback to reject atomics in bpf_arena if LSE atomics are not available. All atomics and arena_atomics selftests are passing: [root@ip-172-31-2-216 bpf]# ./test_progs -a atomics,arena_atomics #3/1 arena_atomics/add:OK #3/2 arena_atomics/sub:OK #3/3 arena_atomics/and:OK #3/4 arena_atomics/or:OK #3/5 arena_atomics/xor:OK #3/6 arena_atomics/cmpxchg:OK #3/7 arena_atomics/xchg:OK #3 arena_atomics:OK #10/1 atomics/add:OK #10/2 atomics/sub:OK #10/3 atomics/and:OK #10/4 atomics/or:OK #10/5 atomics/xor:OK #10/6 atomics/cmpxchg:OK #10/7 atomics/xchg:OK #10 atomics:OK Summary: 2/14 PASSED, 0 SKIPPED, 0 FAILED Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20240426161116.441-1-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- arch/arm64/net/bpf_jit_comp.c | 48 ++++++++++++++++---- tools/testing/selftests/bpf/DENYLIST.aarch64 | 1 - 2 files changed, 40 insertions(+), 9 deletions(-) diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 76b91f36c729..53347d4217f4 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -494,20 +494,26 @@ static int emit_bpf_tail_call(struct jit_ctx *ctx) static int emit_lse_atomic(const struct bpf_insn *insn, struct jit_ctx *ctx) { const u8 code = insn->code; + const u8 arena_vm_base = bpf2a64[ARENA_VM_START]; const u8 dst = bpf2a64[insn->dst_reg]; const u8 src = bpf2a64[insn->src_reg]; const u8 tmp = bpf2a64[TMP_REG_1]; const u8 tmp2 = bpf2a64[TMP_REG_2]; const bool isdw = BPF_SIZE(code) == BPF_DW; + const bool arena = BPF_MODE(code) == BPF_PROBE_ATOMIC; const s16 off = insn->off; - u8 reg; + u8 reg = dst; - if (!off) { - reg = dst; - } else { - emit_a64_mov_i(1, tmp, off, ctx); - emit(A64_ADD(1, tmp, tmp, dst), ctx); - reg = tmp; + if (off || arena) { + if (off) { + emit_a64_mov_i(1, tmp, off, ctx); + emit(A64_ADD(1, tmp, tmp, dst), ctx); + reg = tmp; + } + if (arena) { + emit(A64_ADD(1, tmp, reg, arena_vm_base), ctx); + reg = tmp; + } } switch (insn->imm) { @@ -576,6 +582,12 @@ static int emit_ll_sc_atomic(const struct bpf_insn *insn, struct jit_ctx *ctx) u8 reg; s32 jmp_offset; + if (BPF_MODE(code) == BPF_PROBE_ATOMIC) { + /* ll_sc based atomics don't support unsafe pointers yet. */ + pr_err_once("unknown atomic opcode %02x\n", code); + return -EINVAL; + } + if (!off) { reg = dst; } else { @@ -777,7 +789,8 @@ static int add_exception_handler(const struct bpf_insn *insn, if (BPF_MODE(insn->code) != BPF_PROBE_MEM && BPF_MODE(insn->code) != BPF_PROBE_MEMSX && - BPF_MODE(insn->code) != BPF_PROBE_MEM32) + BPF_MODE(insn->code) != BPF_PROBE_MEM32 && + BPF_MODE(insn->code) != BPF_PROBE_ATOMIC) return 0; if (!ctx->prog->aux->extable || @@ -1474,12 +1487,18 @@ emit_cond_jmp: case BPF_STX | BPF_ATOMIC | BPF_W: case BPF_STX | BPF_ATOMIC | BPF_DW: + case BPF_STX | BPF_PROBE_ATOMIC | BPF_W: + case BPF_STX | BPF_PROBE_ATOMIC | BPF_DW: if (cpus_have_cap(ARM64_HAS_LSE_ATOMICS)) ret = emit_lse_atomic(insn, ctx); else ret = emit_ll_sc_atomic(insn, ctx); if (ret) return ret; + + ret = add_exception_handler(insn, ctx, dst); + if (ret) + return ret; break; default: @@ -2527,6 +2546,19 @@ bool bpf_jit_supports_arena(void) return true; } +bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena) +{ + if (!in_arena) + return true; + switch (insn->code) { + case BPF_STX | BPF_ATOMIC | BPF_W: + case BPF_STX | BPF_ATOMIC | BPF_DW: + if (!cpus_have_cap(ARM64_HAS_LSE_ATOMICS)) + return false; + } + return true; +} + void bpf_jit_free(struct bpf_prog *prog) { if (prog->jited) { diff --git a/tools/testing/selftests/bpf/DENYLIST.aarch64 b/tools/testing/selftests/bpf/DENYLIST.aarch64 index cf657fc35619..0445ac38bc07 100644 --- a/tools/testing/selftests/bpf/DENYLIST.aarch64 +++ b/tools/testing/selftests/bpf/DENYLIST.aarch64 @@ -10,4 +10,3 @@ fill_link_info/kprobe_multi_link_info # bpf_program__attach_kprobe_mu fill_link_info/kretprobe_multi_link_info # bpf_program__attach_kprobe_multi_opts unexpected error: -95 fill_link_info/kprobe_multi_invalid_ubuff # bpf_program__attach_kprobe_multi_opts unexpected error: -95 missed/kprobe_recursion # missed_kprobe_recursion__attach unexpected error: -95 (errno 95) -arena_atomics From cd3fc3b9782130a5bc1dc3dfccffbc1657637a93 Mon Sep 17 00:00:00 2001 From: "Jose E. Marchesi" Date: Tue, 7 May 2024 20:47:56 +0200 Subject: [PATCH 067/119] bpf: avoid uninitialized warnings in verifier_global_subprogs.c [Changes from V1: - The warning to disable is -Wmaybe-uninitialized, not -Wuninitialized. - This warning is only supported in GCC.] The BPF selftest verifier_global_subprogs.c contains code that purposedly performs out of bounds access to memory, to check whether the kernel verifier is able to catch them. For example: __noinline int global_unsupp(const int *mem) { if (!mem) return 0; return mem[100]; /* BOOM */ } With -O1 and higher and no inlining, GCC notices this fact and emits a "maybe uninitialized" warning. This is by design. Note that the emission of these warnings is highly dependent on the precise optimizations that are performed. This patch adds a compiler pragma to verifier_global_subprogs.c to ignore these warnings. Tested in bpf-next master. No regressions. Signed-off-by: Jose E. Marchesi Cc: david.faust@oracle.com Cc: cupertino.miranda@oracle.com Cc: Yonghong Song Cc: Eduard Zingerman Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20240507184756.1772-1-jose.marchesi@oracle.com Signed-off-by: Alexei Starovoitov --- .../testing/selftests/bpf/progs/verifier_global_subprogs.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c b/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c index baff5ffe9405..a9fc30ed4d73 100644 --- a/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c +++ b/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c @@ -8,6 +8,13 @@ #include "xdp_metadata.h" #include "bpf_kfuncs.h" +/* The compiler may be able to detect the access to uninitialized + memory in the routines performing out of bound memory accesses and + emit warnings about it. This is the case of GCC. */ +#if !defined(__clang__) +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#endif + int arr[1]; int unkn_idx; const volatile bool call_dead_subprog = false; From 1209a523f6914404e8941d9e04caa42be7cab8d5 Mon Sep 17 00:00:00 2001 From: "Jose E. Marchesi" Date: Wed, 8 May 2024 12:35:51 +0200 Subject: [PATCH 068/119] bpf: avoid UB in usages of the __imm_insn macro [Changes from V2: - no-strict-aliasing is only applied when building with GCC. - cpumask_failure.c is excluded, as it doesn't use __imm_insn.] The __imm_insn macro is defined in bpf_misc.h as: #define __imm_insn(name, expr) [name]"i"(*(long *)&(expr)) This may lead to type-punning and strict aliasing rules violations in it's typical usage where the address of a struct bpf_insn is passed as expr, like in: __imm_insn(st_mem, BPF_ST_MEM(BPF_W, BPF_REG_1, offsetof(struct __sk_buff, mark), 42)) Where: #define BPF_ST_MEM(SIZE, DST, OFF, IMM) \ ((struct bpf_insn) { \ .code = BPF_ST | BPF_SIZE(SIZE) | BPF_MEM, \ .dst_reg = DST, \ .src_reg = 0, \ .off = OFF, \ .imm = IMM }) In all the actual instances of this in the BPF selftests the value is fed to a volatile asm statement as soon as it gets read from memory, and thus it is unlikely anti-aliasing rules breakage may lead to misguided optimizations. However, GCC detects the potential problem (indirectly) by issuing a warning stating that a temporary is used uninitialized, where the temporary corresponds to the memory read by *(long *). This patch adds -fno-strict-aliasing to the compilation flags of the particular selftests that do type punning via __imm_insn, only for GCC. Tested in master bpf-next. No regressions. Signed-off-by: Jose E. Marchesi Cc: david.faust@oracle.com Cc: cupertino.miranda@oracle.com Cc: Yonghong Song Cc: Eduard Zingerman Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20240508103551.14955-1-jose.marchesi@oracle.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 511ea84139a3..e962a0bd8a78 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -86,6 +86,19 @@ progs/btf_dump_test_case_namespacing.c-bpf_gcc-CFLAGS := -Wno-error progs/btf_dump_test_case_packing.c-bpf_gcc-CFLAGS := -Wno-error progs/btf_dump_test_case_padding.c-bpf_gcc-CFLAGS := -Wno-error progs/btf_dump_test_case_syntax.c-bpf_gcc-CFLAGS := -Wno-error + +# The following tests do type-punning, via the __imm_insn macro, from +# `struct bpf_insn' to long and then uses the value. This triggers an +# "is used uninitialized" warning in GCC due to strict-aliasing +# rules. +progs/verifier_ref_tracking.c-bpf_gcc-CFLAGS := -fno-strict-aliasing +progs/verifier_unpriv.c-bpf_gcc-CFLAGS := -fno-strict-aliasing +progs/verifier_cgroup_storage.c-bpf_gcc-CFLAGS := -fno-strict-aliasing +progs/verifier_ld_ind.c-bpf_gcc-CFLAGS := -fno-strict-aliasing +progs/verifier_map_ret_val.c-bpf_gcc-CFLAGS := -fno-strict-aliasing +progs/verifier_spill_fill.c-bpf_gcc-CFLAGS := -fno-strict-aliasing +progs/verifier_subprog_precision.c-bpf_gcc-CFLAGS := -fno-strict-aliasing +progs/verifier_uninit.c-bpf_gcc-CFLAGS := -fno-strict-aliasing endif ifneq ($(CLANG_CPUV4),) From 911edc69c832161b62a8ad10a6972290157a7bd3 Mon Sep 17 00:00:00 2001 From: "Jose E. Marchesi" Date: Wed, 8 May 2024 13:03:32 +0200 Subject: [PATCH 069/119] bpf: guard BPF_NO_PRESERVE_ACCESS_INDEX in skb_pkt_end.c This little patch is a follow-up to: https://lore.kernel.org/bpf/20240507095011.15867-1-jose.marchesi@oracle.com/T/#u The temporary workaround of passing -DBPF_NO_PRESERVE_ACCESS_INDEX when building with GCC triggers a redefinition preprocessor error when building progs/skb_pkt_end.c. This patch adds a guard to avoid redefinition. Signed-off-by: Jose E. Marchesi Cc: david.faust@oracle.com Cc: cupertino.miranda@oracle.com Cc: Eduard Zingerman Cc: Yonghong Song Cc: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20240508110332.17332-1-jose.marchesi@oracle.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/skb_pkt_end.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/skb_pkt_end.c b/tools/testing/selftests/bpf/progs/skb_pkt_end.c index 992b7861003a..db4abd2682fc 100644 --- a/tools/testing/selftests/bpf/progs/skb_pkt_end.c +++ b/tools/testing/selftests/bpf/progs/skb_pkt_end.c @@ -1,5 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 +#ifndef BPF_NO_PRESERVE_ACCESS_INDEX #define BPF_NO_PRESERVE_ACCESS_INDEX +#endif #include #include #include From 009367099eb61a4fc2af44d4eb06b6b4de7de6db Mon Sep 17 00:00:00 2001 From: "Jose E. Marchesi" Date: Wed, 8 May 2024 12:13:13 +0200 Subject: [PATCH 070/119] bpf: Avoid uninitialized value in BPF_CORE_READ_BITFIELD [Changes from V1: - Use a default branch in the switch statement to initialize `val'.] GCC warns that `val' may be used uninitialized in the BPF_CRE_READ_BITFIELD macro, defined in bpf_core_read.h as: [...] unsigned long long val; \ [...] \ switch (__CORE_RELO(s, field, BYTE_SIZE)) { \ case 1: val = *(const unsigned char *)p; break; \ case 2: val = *(const unsigned short *)p; break; \ case 4: val = *(const unsigned int *)p; break; \ case 8: val = *(const unsigned long long *)p; break; \ } \ [...] val; \ } \ This patch adds a default entry in the switch statement that sets `val' to zero in order to avoid the warning, and random values to be used in case __builtin_preserve_field_info returns unexpected values for BPF_FIELD_BYTE_SIZE. Tested in bpf-next master. No regressions. Signed-off-by: Jose E. Marchesi Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240508101313.16662-1-jose.marchesi@oracle.com --- tools/lib/bpf/bpf_core_read.h | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/lib/bpf/bpf_core_read.h b/tools/lib/bpf/bpf_core_read.h index b5c7ce5c243a..c0e13cdf9660 100644 --- a/tools/lib/bpf/bpf_core_read.h +++ b/tools/lib/bpf/bpf_core_read.h @@ -104,6 +104,7 @@ enum bpf_enum_value_kind { case 2: val = *(const unsigned short *)p; break; \ case 4: val = *(const unsigned int *)p; break; \ case 8: val = *(const unsigned long long *)p; break; \ + default: val = 0; break; \ } \ val <<= __CORE_RELO(s, field, LSHIFT_U64); \ if (__CORE_RELO(s, field, SIGNED)) \ From c0338e609e6e8aff8a7052c90cff83a6bc792ebc Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 9 May 2024 10:50:17 -0700 Subject: [PATCH 071/119] selftests/bpf: Remove bpf_tracing_net.h usages from two networking tests This patch removes the bpf_tracing_net.h usage from the networking tests, fib_lookup and test_lwt_redirect. Instead of using the (copied) macro TC_ACT_SHOT and ETH_HLEN from bpf_tracing_net.h, they can directly use the ones defined in the network header files under linux/. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240509175026.3423614-2-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/fib_lookup.c | 2 +- tools/testing/selftests/bpf/progs/test_lwt_redirect.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/fib_lookup.c b/tools/testing/selftests/bpf/progs/fib_lookup.c index c4514dd58c62..7b5dd2214ff4 100644 --- a/tools/testing/selftests/bpf/progs/fib_lookup.c +++ b/tools/testing/selftests/bpf/progs/fib_lookup.c @@ -3,8 +3,8 @@ #include #include +#include #include -#include "bpf_tracing_net.h" struct bpf_fib_lookup fib_params = {}; int fib_lookup_ret = 0; diff --git a/tools/testing/selftests/bpf/progs/test_lwt_redirect.c b/tools/testing/selftests/bpf/progs/test_lwt_redirect.c index 8c895122f293..83439b87b766 100644 --- a/tools/testing/selftests/bpf/progs/test_lwt_redirect.c +++ b/tools/testing/selftests/bpf/progs/test_lwt_redirect.c @@ -3,7 +3,7 @@ #include #include #include -#include "bpf_tracing_net.h" +#include /* We don't care about whether the packet can be received by network stack. * Just care if the packet is sent to the correct device at correct direction From cbaec46df6c08a2fab6be03d093d3d6ce74adc9a Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 9 May 2024 10:50:18 -0700 Subject: [PATCH 072/119] selftests/bpf: Add a few tcp helper functions and macros to bpf_tracing_net.h This patch adds a few tcp related helper functions to bpf_tracing_net.h. They will be useful for both tcp-cc and network tracing related bpf progs. They have already been in the bpf_tcp_helpers.h. This change is needed to retire the bpf_tcp_helpers.h and consolidate all tests to vmlinux.h (i.e. bpf_tracing_net.h). Some of the helpers (tcp_sk and inet_csk) are also defined in bpf_cc_cubic.c and they are removed. While at it, remove the vmlinux.h from bpf_cc_cubic.c. bpf_tracing_net.h (which has vmlinux.h after this patch) is enough and will be consistent with the other tcp-cc tests in the later patches. The other TCP_* macro additions will be needed for the bpf_dctcp changes in the later patch. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240509175026.3423614-3-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/progs/bpf_cc_cubic.c | 14 +------ .../selftests/bpf/progs/bpf_tracing_net.h | 41 +++++++++++++++++++ 2 files changed, 42 insertions(+), 13 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/bpf_cc_cubic.c b/tools/testing/selftests/bpf/progs/bpf_cc_cubic.c index 5c7697c70e2a..2004be380683 100644 --- a/tools/testing/selftests/bpf/progs/bpf_cc_cubic.c +++ b/tools/testing/selftests/bpf/progs/bpf_cc_cubic.c @@ -13,11 +13,9 @@ * kernel functions. */ -#include "vmlinux.h" - +#include "bpf_tracing_net.h" #include #include -#include "bpf_tracing_net.h" #define BPF_STRUCT_OPS(name, args...) \ SEC("struct_ops/"#name) \ @@ -40,16 +38,6 @@ extern __u32 tcp_reno_undo_cwnd(struct sock *sk) __ksym; extern void cubictcp_acked(struct sock *sk, const struct ack_sample *sample) __ksym; extern void cubictcp_cong_avoid(struct sock *sk, __u32 ack, __u32 acked) __ksym; -static struct inet_connection_sock *inet_csk(const struct sock *sk) -{ - return (struct inet_connection_sock *)sk; -} - -static struct tcp_sock *tcp_sk(const struct sock *sk) -{ - return (struct tcp_sock *)sk; -} - static bool before(__u32 seq1, __u32 seq2) { return (__s32)(seq1-seq2) < 0; diff --git a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h index f9ec630dfcd5..ba4ca0334586 100644 --- a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h +++ b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h @@ -2,6 +2,9 @@ #ifndef __BPF_TRACING_NET_H__ #define __BPF_TRACING_NET_H__ +#include +#include + #define AF_INET 2 #define AF_INET6 10 @@ -46,6 +49,13 @@ #define TCP_CA_NAME_MAX 16 #define TCP_NAGLE_OFF 1 +#define TCP_ECN_OK 1 +#define TCP_ECN_QUEUE_CWR 2 +#define TCP_ECN_DEMAND_CWR 4 +#define TCP_ECN_SEEN 8 + +#define TCP_CONG_NEEDS_ECN 0x2 + #define ICSK_TIME_RETRANS 1 #define ICSK_TIME_PROBE0 3 #define ICSK_TIME_LOSS_PROBE 5 @@ -129,4 +139,35 @@ #define tcp_jiffies32 ((__u32)bpf_jiffies64()) +static inline struct inet_connection_sock *inet_csk(const struct sock *sk) +{ + return (struct inet_connection_sock *)sk; +} + +static inline void *inet_csk_ca(const struct sock *sk) +{ + return (void *)inet_csk(sk)->icsk_ca_priv; +} + +static inline struct tcp_sock *tcp_sk(const struct sock *sk) +{ + return (struct tcp_sock *)sk; +} + +static inline bool tcp_in_slow_start(const struct tcp_sock *tp) +{ + return tp->snd_cwnd < tp->snd_ssthresh; +} + +static inline bool tcp_is_cwnd_limited(const struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + + /* If in slow start, ensure cwnd grows to twice what was ACKed. */ + if (tcp_in_slow_start(tp)) + return tp->snd_cwnd < 2 * tp->max_packets_out; + + return !!BPF_CORE_READ_BITFIELD(tp, is_cwnd_limited); +} + #endif From cc5b18ce1714160be3e0e3b9440a6306dc87e5c4 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 9 May 2024 10:50:19 -0700 Subject: [PATCH 073/119] selftests/bpf: Reuse the tcp_sk() from the bpf_tracing_net.h This patch removes the individual tcp_sk implementations from the tcp-cc tests. The tcp_sk() implementation from the bpf_tracing_net.h is reused instead. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240509175026.3423614-4-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- .../testing/selftests/bpf/progs/tcp_ca_incompl_cong_ops.c | 8 +------- tools/testing/selftests/bpf/progs/tcp_ca_update.c | 8 +------- .../testing/selftests/bpf/progs/tcp_ca_write_sk_pacing.c | 8 +------- 3 files changed, 3 insertions(+), 21 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/tcp_ca_incompl_cong_ops.c b/tools/testing/selftests/bpf/progs/tcp_ca_incompl_cong_ops.c index 7bb872fb22dd..d6467fcb1deb 100644 --- a/tools/testing/selftests/bpf/progs/tcp_ca_incompl_cong_ops.c +++ b/tools/testing/selftests/bpf/progs/tcp_ca_incompl_cong_ops.c @@ -1,17 +1,11 @@ // SPDX-License-Identifier: GPL-2.0 -#include "vmlinux.h" - +#include "bpf_tracing_net.h" #include #include char _license[] SEC("license") = "GPL"; -static inline struct tcp_sock *tcp_sk(const struct sock *sk) -{ - return (struct tcp_sock *)sk; -} - SEC("struct_ops/incompl_cong_ops_ssthresh") __u32 BPF_PROG(incompl_cong_ops_ssthresh, struct sock *sk) { diff --git a/tools/testing/selftests/bpf/progs/tcp_ca_update.c b/tools/testing/selftests/bpf/progs/tcp_ca_update.c index b93a0ed33057..8581cad321b6 100644 --- a/tools/testing/selftests/bpf/progs/tcp_ca_update.c +++ b/tools/testing/selftests/bpf/progs/tcp_ca_update.c @@ -1,7 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 -#include "vmlinux.h" - +#include "bpf_tracing_net.h" #include #include @@ -10,11 +9,6 @@ char _license[] SEC("license") = "GPL"; int ca1_cnt = 0; int ca2_cnt = 0; -static inline struct tcp_sock *tcp_sk(const struct sock *sk) -{ - return (struct tcp_sock *)sk; -} - SEC("struct_ops/ca_update_1_init") void BPF_PROG(ca_update_1_init, struct sock *sk) { diff --git a/tools/testing/selftests/bpf/progs/tcp_ca_write_sk_pacing.c b/tools/testing/selftests/bpf/progs/tcp_ca_write_sk_pacing.c index 0724a79cec78..4a369439335e 100644 --- a/tools/testing/selftests/bpf/progs/tcp_ca_write_sk_pacing.c +++ b/tools/testing/selftests/bpf/progs/tcp_ca_write_sk_pacing.c @@ -1,7 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 -#include "vmlinux.h" - +#include "bpf_tracing_net.h" #include #include @@ -11,11 +10,6 @@ char _license[] SEC("license") = "GPL"; #define min(a, b) ((a) < (b) ? (a) : (b)) -static inline struct tcp_sock *tcp_sk(const struct sock *sk) -{ - return (struct tcp_sock *)sk; -} - static inline unsigned int tcp_left_out(const struct tcp_sock *tp) { return tp->sacked_out + tp->lost_out; From 7d3851a31832bf8dc776a78494b788518734ad0f Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 9 May 2024 10:50:20 -0700 Subject: [PATCH 074/119] selftests/bpf: Sanitize the SEC and inline usages in the bpf-tcp-cc tests It is needed to remove the BPF_STRUCT_OPS usages from the tcp-cc tests because it is defined in bpf_tcp_helpers.h which is going to be retired. While at it, this patch consolidates all tcp-cc struct_ops programs to use the SEC("struct_ops") + BPF_PROG(). It also removes the unnecessary __always_inline usages from the tcp-cc tests. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240509175026.3423614-5-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/progs/bpf_cc_cubic.c | 28 +++++++------ tools/testing/selftests/bpf/progs/bpf_cubic.c | 42 +++++++++---------- tools/testing/selftests/bpf/progs/bpf_dctcp.c | 26 ++++++------ .../selftests/bpf/progs/bpf_dctcp_release.c | 3 +- .../selftests/bpf/progs/bpf_tcp_nogpl.c | 3 +- .../bpf/progs/tcp_ca_incompl_cong_ops.c | 4 +- .../selftests/bpf/progs/tcp_ca_kfunc.c | 22 +++++----- .../bpf/progs/tcp_ca_unsupp_cong_op.c | 2 +- .../selftests/bpf/progs/tcp_ca_update.c | 10 ++--- .../bpf/progs/tcp_ca_write_sk_pacing.c | 12 +++--- 10 files changed, 77 insertions(+), 75 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/bpf_cc_cubic.c b/tools/testing/selftests/bpf/progs/bpf_cc_cubic.c index 2004be380683..1654a530aa3d 100644 --- a/tools/testing/selftests/bpf/progs/bpf_cc_cubic.c +++ b/tools/testing/selftests/bpf/progs/bpf_cc_cubic.c @@ -17,10 +17,6 @@ #include #include -#define BPF_STRUCT_OPS(name, args...) \ -SEC("struct_ops/"#name) \ -BPF_PROG(name, args) - #define USEC_PER_SEC 1000000UL #define TCP_PACING_SS_RATIO (200) #define TCP_PACING_CA_RATIO (120) @@ -114,18 +110,21 @@ static bool tcp_may_raise_cwnd(const struct sock *sk, const int flag) return flag & FLAG_DATA_ACKED; } -void BPF_STRUCT_OPS(bpf_cubic_init, struct sock *sk) +SEC("struct_ops") +void BPF_PROG(bpf_cubic_init, struct sock *sk) { cubictcp_init(sk); } -void BPF_STRUCT_OPS(bpf_cubic_cwnd_event, struct sock *sk, enum tcp_ca_event event) +SEC("struct_ops") +void BPF_PROG(bpf_cubic_cwnd_event, struct sock *sk, enum tcp_ca_event event) { cubictcp_cwnd_event(sk, event); } -void BPF_STRUCT_OPS(bpf_cubic_cong_control, struct sock *sk, __u32 ack, int flag, - const struct rate_sample *rs) +SEC("struct_ops") +void BPF_PROG(bpf_cubic_cong_control, struct sock *sk, __u32 ack, int flag, + const struct rate_sample *rs) { struct tcp_sock *tp = tcp_sk(sk); @@ -151,23 +150,26 @@ void BPF_STRUCT_OPS(bpf_cubic_cong_control, struct sock *sk, __u32 ack, int flag tcp_update_pacing_rate(sk); } -__u32 BPF_STRUCT_OPS(bpf_cubic_recalc_ssthresh, struct sock *sk) +SEC("struct_ops") +__u32 BPF_PROG(bpf_cubic_recalc_ssthresh, struct sock *sk) { return cubictcp_recalc_ssthresh(sk); } -void BPF_STRUCT_OPS(bpf_cubic_state, struct sock *sk, __u8 new_state) +SEC("struct_ops") +void BPF_PROG(bpf_cubic_state, struct sock *sk, __u8 new_state) { cubictcp_state(sk, new_state); } -void BPF_STRUCT_OPS(bpf_cubic_acked, struct sock *sk, - const struct ack_sample *sample) +SEC("struct_ops") +void BPF_PROG(bpf_cubic_acked, struct sock *sk, const struct ack_sample *sample) { cubictcp_acked(sk, sample); } -__u32 BPF_STRUCT_OPS(bpf_cubic_undo_cwnd, struct sock *sk) +SEC("struct_ops") +__u32 BPF_PROG(bpf_cubic_undo_cwnd, struct sock *sk) { return tcp_reno_undo_cwnd(sk); } diff --git a/tools/testing/selftests/bpf/progs/bpf_cubic.c b/tools/testing/selftests/bpf/progs/bpf_cubic.c index c997e3e3d3fb..53a98b609e5f 100644 --- a/tools/testing/selftests/bpf/progs/bpf_cubic.c +++ b/tools/testing/selftests/bpf/progs/bpf_cubic.c @@ -91,7 +91,7 @@ struct bictcp { __u32 curr_rtt; /* the minimum rtt of current round */ }; -static inline void bictcp_reset(struct bictcp *ca) +static void bictcp_reset(struct bictcp *ca) { ca->cnt = 0; ca->last_max_cwnd = 0; @@ -112,7 +112,7 @@ extern unsigned long CONFIG_HZ __kconfig; #define USEC_PER_SEC 1000000UL #define USEC_PER_JIFFY (USEC_PER_SEC / HZ) -static __always_inline __u64 div64_u64(__u64 dividend, __u64 divisor) +static __u64 div64_u64(__u64 dividend, __u64 divisor) { return dividend / divisor; } @@ -120,7 +120,7 @@ static __always_inline __u64 div64_u64(__u64 dividend, __u64 divisor) #define div64_ul div64_u64 #define BITS_PER_U64 (sizeof(__u64) * 8) -static __always_inline int fls64(__u64 x) +static int fls64(__u64 x) { int num = BITS_PER_U64 - 1; @@ -153,12 +153,12 @@ static __always_inline int fls64(__u64 x) return num + 1; } -static __always_inline __u32 bictcp_clock_us(const struct sock *sk) +static __u32 bictcp_clock_us(const struct sock *sk) { return tcp_sk(sk)->tcp_mstamp; } -static __always_inline void bictcp_hystart_reset(struct sock *sk) +static void bictcp_hystart_reset(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); @@ -169,8 +169,7 @@ static __always_inline void bictcp_hystart_reset(struct sock *sk) ca->sample_cnt = 0; } -/* "struct_ops/" prefix is a requirement */ -SEC("struct_ops/bpf_cubic_init") +SEC("struct_ops") void BPF_PROG(bpf_cubic_init, struct sock *sk) { struct bictcp *ca = inet_csk_ca(sk); @@ -184,8 +183,7 @@ void BPF_PROG(bpf_cubic_init, struct sock *sk) tcp_sk(sk)->snd_ssthresh = initial_ssthresh; } -/* "struct_ops" prefix is a requirement */ -SEC("struct_ops/bpf_cubic_cwnd_event") +SEC("struct_ops") void BPF_PROG(bpf_cubic_cwnd_event, struct sock *sk, enum tcp_ca_event event) { if (event == CA_EVENT_TX_START) { @@ -230,7 +228,7 @@ static const __u8 v[] = { * Newton-Raphson iteration. * Avg err ~= 0.195% */ -static __always_inline __u32 cubic_root(__u64 a) +static __u32 cubic_root(__u64 a) { __u32 x, b, shift; @@ -263,8 +261,7 @@ static __always_inline __u32 cubic_root(__u64 a) /* * Compute congestion window to use. */ -static __always_inline void bictcp_update(struct bictcp *ca, __u32 cwnd, - __u32 acked) +static void bictcp_update(struct bictcp *ca, __u32 cwnd, __u32 acked) { __u32 delta, bic_target, max_cnt; __u64 offs, t; @@ -377,8 +374,8 @@ tcp_friendliness: ca->cnt = max(ca->cnt, 2U); } -/* Or simply use the BPF_STRUCT_OPS to avoid the SEC boiler plate. */ -void BPF_STRUCT_OPS(bpf_cubic_cong_avoid, struct sock *sk, __u32 ack, __u32 acked) +SEC("struct_ops") +void BPF_PROG(bpf_cubic_cong_avoid, struct sock *sk, __u32 ack, __u32 acked) { struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); @@ -397,7 +394,8 @@ void BPF_STRUCT_OPS(bpf_cubic_cong_avoid, struct sock *sk, __u32 ack, __u32 acke tcp_cong_avoid_ai(tp, ca->cnt, acked); } -__u32 BPF_STRUCT_OPS(bpf_cubic_recalc_ssthresh, struct sock *sk) +SEC("struct_ops") +__u32 BPF_PROG(bpf_cubic_recalc_ssthresh, struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); @@ -414,7 +412,8 @@ __u32 BPF_STRUCT_OPS(bpf_cubic_recalc_ssthresh, struct sock *sk) return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U); } -void BPF_STRUCT_OPS(bpf_cubic_state, struct sock *sk, __u8 new_state) +SEC("struct_ops") +void BPF_PROG(bpf_cubic_state, struct sock *sk, __u8 new_state) { if (new_state == TCP_CA_Loss) { bictcp_reset(inet_csk_ca(sk)); @@ -433,7 +432,7 @@ void BPF_STRUCT_OPS(bpf_cubic_state, struct sock *sk, __u8 new_state) * We apply another 100% factor because @rate is doubled at this point. * We cap the cushion to 1ms. */ -static __always_inline __u32 hystart_ack_delay(struct sock *sk) +static __u32 hystart_ack_delay(struct sock *sk) { unsigned long rate; @@ -444,7 +443,7 @@ static __always_inline __u32 hystart_ack_delay(struct sock *sk) div64_ul((__u64)GSO_MAX_SIZE * 4 * USEC_PER_SEC, rate)); } -static __always_inline void hystart_update(struct sock *sk, __u32 delay) +static void hystart_update(struct sock *sk, __u32 delay) { struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); @@ -492,8 +491,8 @@ static __always_inline void hystart_update(struct sock *sk, __u32 delay) int bpf_cubic_acked_called = 0; -void BPF_STRUCT_OPS(bpf_cubic_acked, struct sock *sk, - const struct ack_sample *sample) +SEC("struct_ops") +void BPF_PROG(bpf_cubic_acked, struct sock *sk, const struct ack_sample *sample) { const struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); @@ -524,7 +523,8 @@ void BPF_STRUCT_OPS(bpf_cubic_acked, struct sock *sk, extern __u32 tcp_reno_undo_cwnd(struct sock *sk) __ksym; -__u32 BPF_STRUCT_OPS(bpf_cubic_undo_cwnd, struct sock *sk) +SEC("struct_ops") +__u32 BPF_PROG(bpf_cubic_undo_cwnd, struct sock *sk) { return tcp_reno_undo_cwnd(sk); } diff --git a/tools/testing/selftests/bpf/progs/bpf_dctcp.c b/tools/testing/selftests/bpf/progs/bpf_dctcp.c index 460682759aed..b74dbb121384 100644 --- a/tools/testing/selftests/bpf/progs/bpf_dctcp.c +++ b/tools/testing/selftests/bpf/progs/bpf_dctcp.c @@ -48,8 +48,7 @@ struct dctcp { static unsigned int dctcp_shift_g = 4; /* g = 1/2^4 */ static unsigned int dctcp_alpha_on_init = DCTCP_MAX_ALPHA; -static __always_inline void dctcp_reset(const struct tcp_sock *tp, - struct dctcp *ca) +static void dctcp_reset(const struct tcp_sock *tp, struct dctcp *ca) { ca->next_seq = tp->snd_nxt; @@ -57,7 +56,7 @@ static __always_inline void dctcp_reset(const struct tcp_sock *tp, ca->old_delivered_ce = tp->delivered_ce; } -SEC("struct_ops/dctcp_init") +SEC("struct_ops") void BPF_PROG(dctcp_init, struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); @@ -104,7 +103,7 @@ void BPF_PROG(dctcp_init, struct sock *sk) dctcp_reset(tp, ca); } -SEC("struct_ops/dctcp_ssthresh") +SEC("struct_ops") __u32 BPF_PROG(dctcp_ssthresh, struct sock *sk) { struct dctcp *ca = inet_csk_ca(sk); @@ -114,7 +113,7 @@ __u32 BPF_PROG(dctcp_ssthresh, struct sock *sk) return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->dctcp_alpha) >> 11U), 2U); } -SEC("struct_ops/dctcp_update_alpha") +SEC("struct_ops") void BPF_PROG(dctcp_update_alpha, struct sock *sk, __u32 flags) { const struct tcp_sock *tp = tcp_sk(sk); @@ -144,7 +143,7 @@ void BPF_PROG(dctcp_update_alpha, struct sock *sk, __u32 flags) } } -static __always_inline void dctcp_react_to_loss(struct sock *sk) +static void dctcp_react_to_loss(struct sock *sk) { struct dctcp *ca = inet_csk_ca(sk); struct tcp_sock *tp = tcp_sk(sk); @@ -153,7 +152,7 @@ static __always_inline void dctcp_react_to_loss(struct sock *sk) tp->snd_ssthresh = max(tp->snd_cwnd >> 1U, 2U); } -SEC("struct_ops/dctcp_state") +SEC("struct_ops") void BPF_PROG(dctcp_state, struct sock *sk, __u8 new_state) { if (new_state == TCP_CA_Recovery && @@ -164,7 +163,7 @@ void BPF_PROG(dctcp_state, struct sock *sk, __u8 new_state) */ } -static __always_inline void dctcp_ece_ack_cwr(struct sock *sk, __u32 ce_state) +static void dctcp_ece_ack_cwr(struct sock *sk, __u32 ce_state) { struct tcp_sock *tp = tcp_sk(sk); @@ -179,9 +178,8 @@ static __always_inline void dctcp_ece_ack_cwr(struct sock *sk, __u32 ce_state) * S: 0 <- last pkt was non-CE * 1 <- last pkt was CE */ -static __always_inline -void dctcp_ece_ack_update(struct sock *sk, enum tcp_ca_event evt, - __u32 *prior_rcv_nxt, __u32 *ce_state) +static void dctcp_ece_ack_update(struct sock *sk, enum tcp_ca_event evt, + __u32 *prior_rcv_nxt, __u32 *ce_state) { __u32 new_ce_state = (evt == CA_EVENT_ECN_IS_CE) ? 1 : 0; @@ -201,7 +199,7 @@ void dctcp_ece_ack_update(struct sock *sk, enum tcp_ca_event evt, dctcp_ece_ack_cwr(sk, new_ce_state); } -SEC("struct_ops/dctcp_cwnd_event") +SEC("struct_ops") void BPF_PROG(dctcp_cwnd_event, struct sock *sk, enum tcp_ca_event ev) { struct dctcp *ca = inet_csk_ca(sk); @@ -220,7 +218,7 @@ void BPF_PROG(dctcp_cwnd_event, struct sock *sk, enum tcp_ca_event ev) } } -SEC("struct_ops/dctcp_cwnd_undo") +SEC("struct_ops") __u32 BPF_PROG(dctcp_cwnd_undo, struct sock *sk) { const struct dctcp *ca = inet_csk_ca(sk); @@ -230,7 +228,7 @@ __u32 BPF_PROG(dctcp_cwnd_undo, struct sock *sk) extern void tcp_reno_cong_avoid(struct sock *sk, __u32 ack, __u32 acked) __ksym; -SEC("struct_ops/dctcp_reno_cong_avoid") +SEC("struct_ops") void BPF_PROG(dctcp_cong_avoid, struct sock *sk, __u32 ack, __u32 acked) { tcp_reno_cong_avoid(sk, ack, acked); diff --git a/tools/testing/selftests/bpf/progs/bpf_dctcp_release.c b/tools/testing/selftests/bpf/progs/bpf_dctcp_release.c index d836f7c372f0..a946b070bb06 100644 --- a/tools/testing/selftests/bpf/progs/bpf_dctcp_release.c +++ b/tools/testing/selftests/bpf/progs/bpf_dctcp_release.c @@ -13,7 +13,8 @@ char _license[] SEC("license") = "GPL"; const char cubic[] = "cubic"; -void BPF_STRUCT_OPS(dctcp_nouse_release, struct sock *sk) +SEC("struct_ops") +void BPF_PROG(dctcp_nouse_release, struct sock *sk) { bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION, (void *)cubic, sizeof(cubic)); diff --git a/tools/testing/selftests/bpf/progs/bpf_tcp_nogpl.c b/tools/testing/selftests/bpf/progs/bpf_tcp_nogpl.c index 2ecd833dcd41..633164e704dd 100644 --- a/tools/testing/selftests/bpf/progs/bpf_tcp_nogpl.c +++ b/tools/testing/selftests/bpf/progs/bpf_tcp_nogpl.c @@ -8,7 +8,8 @@ char _license[] SEC("license") = "X"; -void BPF_STRUCT_OPS(nogpltcp_init, struct sock *sk) +SEC("struct_ops") +void BPF_PROG(nogpltcp_init, struct sock *sk) { } diff --git a/tools/testing/selftests/bpf/progs/tcp_ca_incompl_cong_ops.c b/tools/testing/selftests/bpf/progs/tcp_ca_incompl_cong_ops.c index d6467fcb1deb..0016c90e9c13 100644 --- a/tools/testing/selftests/bpf/progs/tcp_ca_incompl_cong_ops.c +++ b/tools/testing/selftests/bpf/progs/tcp_ca_incompl_cong_ops.c @@ -6,13 +6,13 @@ char _license[] SEC("license") = "GPL"; -SEC("struct_ops/incompl_cong_ops_ssthresh") +SEC("struct_ops") __u32 BPF_PROG(incompl_cong_ops_ssthresh, struct sock *sk) { return tcp_sk(sk)->snd_ssthresh; } -SEC("struct_ops/incompl_cong_ops_undo_cwnd") +SEC("struct_ops") __u32 BPF_PROG(incompl_cong_ops_undo_cwnd, struct sock *sk) { return tcp_sk(sk)->snd_cwnd; diff --git a/tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c b/tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c index 52b610357309..f95862f570b7 100644 --- a/tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c +++ b/tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c @@ -27,7 +27,7 @@ extern void cubictcp_state(struct sock *sk, u8 new_state) __ksym; extern void cubictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event) __ksym; extern void cubictcp_acked(struct sock *sk, const struct ack_sample *sample) __ksym; -SEC("struct_ops/init") +SEC("struct_ops") void BPF_PROG(init, struct sock *sk) { bbr_init(sk); @@ -35,38 +35,38 @@ void BPF_PROG(init, struct sock *sk) cubictcp_init(sk); } -SEC("struct_ops/in_ack_event") +SEC("struct_ops") void BPF_PROG(in_ack_event, struct sock *sk, u32 flags) { dctcp_update_alpha(sk, flags); } -SEC("struct_ops/cong_control") +SEC("struct_ops") void BPF_PROG(cong_control, struct sock *sk, u32 ack, int flag, const struct rate_sample *rs) { bbr_main(sk, ack, flag, rs); } -SEC("struct_ops/cong_avoid") +SEC("struct_ops") void BPF_PROG(cong_avoid, struct sock *sk, u32 ack, u32 acked) { cubictcp_cong_avoid(sk, ack, acked); } -SEC("struct_ops/sndbuf_expand") +SEC("struct_ops") u32 BPF_PROG(sndbuf_expand, struct sock *sk) { return bbr_sndbuf_expand(sk); } -SEC("struct_ops/undo_cwnd") +SEC("struct_ops") u32 BPF_PROG(undo_cwnd, struct sock *sk) { bbr_undo_cwnd(sk); return dctcp_cwnd_undo(sk); } -SEC("struct_ops/cwnd_event") +SEC("struct_ops") void BPF_PROG(cwnd_event, struct sock *sk, enum tcp_ca_event event) { bbr_cwnd_event(sk, event); @@ -74,7 +74,7 @@ void BPF_PROG(cwnd_event, struct sock *sk, enum tcp_ca_event event) cubictcp_cwnd_event(sk, event); } -SEC("struct_ops/ssthresh") +SEC("struct_ops") u32 BPF_PROG(ssthresh, struct sock *sk) { bbr_ssthresh(sk); @@ -82,13 +82,13 @@ u32 BPF_PROG(ssthresh, struct sock *sk) return cubictcp_recalc_ssthresh(sk); } -SEC("struct_ops/min_tso_segs") +SEC("struct_ops") u32 BPF_PROG(min_tso_segs, struct sock *sk) { return bbr_min_tso_segs(sk); } -SEC("struct_ops/set_state") +SEC("struct_ops") void BPF_PROG(set_state, struct sock *sk, u8 new_state) { bbr_set_state(sk, new_state); @@ -96,7 +96,7 @@ void BPF_PROG(set_state, struct sock *sk, u8 new_state) cubictcp_state(sk, new_state); } -SEC("struct_ops/pkts_acked") +SEC("struct_ops") void BPF_PROG(pkts_acked, struct sock *sk, const struct ack_sample *sample) { cubictcp_acked(sk, sample); diff --git a/tools/testing/selftests/bpf/progs/tcp_ca_unsupp_cong_op.c b/tools/testing/selftests/bpf/progs/tcp_ca_unsupp_cong_op.c index c06f4a41c21a..54f916a931c6 100644 --- a/tools/testing/selftests/bpf/progs/tcp_ca_unsupp_cong_op.c +++ b/tools/testing/selftests/bpf/progs/tcp_ca_unsupp_cong_op.c @@ -7,7 +7,7 @@ char _license[] SEC("license") = "GPL"; -SEC("struct_ops/unsupp_cong_op_get_info") +SEC("struct_ops") size_t BPF_PROG(unsupp_cong_op_get_info, struct sock *sk, u32 ext, int *attr, union tcp_cc_info *info) { diff --git a/tools/testing/selftests/bpf/progs/tcp_ca_update.c b/tools/testing/selftests/bpf/progs/tcp_ca_update.c index 8581cad321b6..e4bd82bc0d01 100644 --- a/tools/testing/selftests/bpf/progs/tcp_ca_update.c +++ b/tools/testing/selftests/bpf/progs/tcp_ca_update.c @@ -9,31 +9,31 @@ char _license[] SEC("license") = "GPL"; int ca1_cnt = 0; int ca2_cnt = 0; -SEC("struct_ops/ca_update_1_init") +SEC("struct_ops") void BPF_PROG(ca_update_1_init, struct sock *sk) { ca1_cnt++; } -SEC("struct_ops/ca_update_2_init") +SEC("struct_ops") void BPF_PROG(ca_update_2_init, struct sock *sk) { ca2_cnt++; } -SEC("struct_ops/ca_update_cong_control") +SEC("struct_ops") void BPF_PROG(ca_update_cong_control, struct sock *sk, const struct rate_sample *rs) { } -SEC("struct_ops/ca_update_ssthresh") +SEC("struct_ops") __u32 BPF_PROG(ca_update_ssthresh, struct sock *sk) { return tcp_sk(sk)->snd_ssthresh; } -SEC("struct_ops/ca_update_undo_cwnd") +SEC("struct_ops") __u32 BPF_PROG(ca_update_undo_cwnd, struct sock *sk) { return tcp_sk(sk)->snd_cwnd; diff --git a/tools/testing/selftests/bpf/progs/tcp_ca_write_sk_pacing.c b/tools/testing/selftests/bpf/progs/tcp_ca_write_sk_pacing.c index 4a369439335e..a58b5194fc89 100644 --- a/tools/testing/selftests/bpf/progs/tcp_ca_write_sk_pacing.c +++ b/tools/testing/selftests/bpf/progs/tcp_ca_write_sk_pacing.c @@ -10,17 +10,17 @@ char _license[] SEC("license") = "GPL"; #define min(a, b) ((a) < (b) ? (a) : (b)) -static inline unsigned int tcp_left_out(const struct tcp_sock *tp) +static unsigned int tcp_left_out(const struct tcp_sock *tp) { return tp->sacked_out + tp->lost_out; } -static inline unsigned int tcp_packets_in_flight(const struct tcp_sock *tp) +static unsigned int tcp_packets_in_flight(const struct tcp_sock *tp) { return tp->packets_out - tcp_left_out(tp) + tp->retrans_out; } -SEC("struct_ops/write_sk_pacing_init") +SEC("struct_ops") void BPF_PROG(write_sk_pacing_init, struct sock *sk) { #ifdef ENABLE_ATOMICS_TESTS @@ -31,7 +31,7 @@ void BPF_PROG(write_sk_pacing_init, struct sock *sk) #endif } -SEC("struct_ops/write_sk_pacing_cong_control") +SEC("struct_ops") void BPF_PROG(write_sk_pacing_cong_control, struct sock *sk, const struct rate_sample *rs) { @@ -43,13 +43,13 @@ void BPF_PROG(write_sk_pacing_cong_control, struct sock *sk, tp->app_limited = (tp->delivered + tcp_packets_in_flight(tp)) ?: 1; } -SEC("struct_ops/write_sk_pacing_ssthresh") +SEC("struct_ops") __u32 BPF_PROG(write_sk_pacing_ssthresh, struct sock *sk) { return tcp_sk(sk)->snd_ssthresh; } -SEC("struct_ops/write_sk_pacing_undo_cwnd") +SEC("struct_ops") __u32 BPF_PROG(write_sk_pacing_undo_cwnd, struct sock *sk) { return tcp_sk(sk)->snd_cwnd; From b1d87ae9b0d3d91767d85183e40c96f4229a6c21 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 9 May 2024 10:50:21 -0700 Subject: [PATCH 075/119] selftests/bpf: Rename tcp-cc private struct in bpf_cubic and bpf_dctcp The "struct bictcp" and "struct dctcp" are private to the bpf prog and they are stored in the private buffer in inet_csk(sk)->icsk_ca_priv. Hence, there is no bpf CO-RE required. The same struct name exists in the vmlinux.h. To reuse vmlinux.h, they need to be renamed such that the bpf prog logic will be immuned from the kernel tcp-cc changes. This patch adds a "bpf_" prefix to them. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240509175026.3423614-6-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/bpf_cubic.c | 20 +++++++++---------- tools/testing/selftests/bpf/progs/bpf_dctcp.c | 16 +++++++-------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/bpf_cubic.c b/tools/testing/selftests/bpf/progs/bpf_cubic.c index 53a98b609e5f..53872e2d2c52 100644 --- a/tools/testing/selftests/bpf/progs/bpf_cubic.c +++ b/tools/testing/selftests/bpf/progs/bpf_cubic.c @@ -70,7 +70,7 @@ static const __u64 cube_factor = (__u64)(1ull << (10+3*BICTCP_HZ)) / (bic_scale * 10); /* BIC TCP Parameters */ -struct bictcp { +struct bpf_bictcp { __u32 cnt; /* increase cwnd by 1 after ACKs */ __u32 last_max_cwnd; /* last maximum snd_cwnd */ __u32 last_cwnd; /* the last snd_cwnd */ @@ -91,7 +91,7 @@ struct bictcp { __u32 curr_rtt; /* the minimum rtt of current round */ }; -static void bictcp_reset(struct bictcp *ca) +static void bictcp_reset(struct bpf_bictcp *ca) { ca->cnt = 0; ca->last_max_cwnd = 0; @@ -161,7 +161,7 @@ static __u32 bictcp_clock_us(const struct sock *sk) static void bictcp_hystart_reset(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - struct bictcp *ca = inet_csk_ca(sk); + struct bpf_bictcp *ca = inet_csk_ca(sk); ca->round_start = ca->last_ack = bictcp_clock_us(sk); ca->end_seq = tp->snd_nxt; @@ -172,7 +172,7 @@ static void bictcp_hystart_reset(struct sock *sk) SEC("struct_ops") void BPF_PROG(bpf_cubic_init, struct sock *sk) { - struct bictcp *ca = inet_csk_ca(sk); + struct bpf_bictcp *ca = inet_csk_ca(sk); bictcp_reset(ca); @@ -187,7 +187,7 @@ SEC("struct_ops") void BPF_PROG(bpf_cubic_cwnd_event, struct sock *sk, enum tcp_ca_event event) { if (event == CA_EVENT_TX_START) { - struct bictcp *ca = inet_csk_ca(sk); + struct bpf_bictcp *ca = inet_csk_ca(sk); __u32 now = tcp_jiffies32; __s32 delta; @@ -261,7 +261,7 @@ static __u32 cubic_root(__u64 a) /* * Compute congestion window to use. */ -static void bictcp_update(struct bictcp *ca, __u32 cwnd, __u32 acked) +static void bictcp_update(struct bpf_bictcp *ca, __u32 cwnd, __u32 acked) { __u32 delta, bic_target, max_cnt; __u64 offs, t; @@ -378,7 +378,7 @@ SEC("struct_ops") void BPF_PROG(bpf_cubic_cong_avoid, struct sock *sk, __u32 ack, __u32 acked) { struct tcp_sock *tp = tcp_sk(sk); - struct bictcp *ca = inet_csk_ca(sk); + struct bpf_bictcp *ca = inet_csk_ca(sk); if (!tcp_is_cwnd_limited(sk)) return; @@ -398,7 +398,7 @@ SEC("struct_ops") __u32 BPF_PROG(bpf_cubic_recalc_ssthresh, struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); - struct bictcp *ca = inet_csk_ca(sk); + struct bpf_bictcp *ca = inet_csk_ca(sk); ca->epoch_start = 0; /* end of epoch */ @@ -446,7 +446,7 @@ static __u32 hystart_ack_delay(struct sock *sk) static void hystart_update(struct sock *sk, __u32 delay) { struct tcp_sock *tp = tcp_sk(sk); - struct bictcp *ca = inet_csk_ca(sk); + struct bpf_bictcp *ca = inet_csk_ca(sk); __u32 threshold; if (hystart_detect & HYSTART_ACK_TRAIN) { @@ -495,7 +495,7 @@ SEC("struct_ops") void BPF_PROG(bpf_cubic_acked, struct sock *sk, const struct ack_sample *sample) { const struct tcp_sock *tp = tcp_sk(sk); - struct bictcp *ca = inet_csk_ca(sk); + struct bpf_bictcp *ca = inet_csk_ca(sk); __u32 delay; bpf_cubic_acked_called = 1; diff --git a/tools/testing/selftests/bpf/progs/bpf_dctcp.c b/tools/testing/selftests/bpf/progs/bpf_dctcp.c index b74dbb121384..a8673b7ff35d 100644 --- a/tools/testing/selftests/bpf/progs/bpf_dctcp.c +++ b/tools/testing/selftests/bpf/progs/bpf_dctcp.c @@ -35,7 +35,7 @@ struct { #define DCTCP_MAX_ALPHA 1024U -struct dctcp { +struct bpf_dctcp { __u32 old_delivered; __u32 old_delivered_ce; __u32 prior_rcv_nxt; @@ -48,7 +48,7 @@ struct dctcp { static unsigned int dctcp_shift_g = 4; /* g = 1/2^4 */ static unsigned int dctcp_alpha_on_init = DCTCP_MAX_ALPHA; -static void dctcp_reset(const struct tcp_sock *tp, struct dctcp *ca) +static void dctcp_reset(const struct tcp_sock *tp, struct bpf_dctcp *ca) { ca->next_seq = tp->snd_nxt; @@ -60,7 +60,7 @@ SEC("struct_ops") void BPF_PROG(dctcp_init, struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); - struct dctcp *ca = inet_csk_ca(sk); + struct bpf_dctcp *ca = inet_csk_ca(sk); int *stg; if (!(tp->ecn_flags & TCP_ECN_OK) && fallback[0]) { @@ -106,7 +106,7 @@ void BPF_PROG(dctcp_init, struct sock *sk) SEC("struct_ops") __u32 BPF_PROG(dctcp_ssthresh, struct sock *sk) { - struct dctcp *ca = inet_csk_ca(sk); + struct bpf_dctcp *ca = inet_csk_ca(sk); struct tcp_sock *tp = tcp_sk(sk); ca->loss_cwnd = tp->snd_cwnd; @@ -117,7 +117,7 @@ SEC("struct_ops") void BPF_PROG(dctcp_update_alpha, struct sock *sk, __u32 flags) { const struct tcp_sock *tp = tcp_sk(sk); - struct dctcp *ca = inet_csk_ca(sk); + struct bpf_dctcp *ca = inet_csk_ca(sk); /* Expired RTT */ if (!before(tp->snd_una, ca->next_seq)) { @@ -145,7 +145,7 @@ void BPF_PROG(dctcp_update_alpha, struct sock *sk, __u32 flags) static void dctcp_react_to_loss(struct sock *sk) { - struct dctcp *ca = inet_csk_ca(sk); + struct bpf_dctcp *ca = inet_csk_ca(sk); struct tcp_sock *tp = tcp_sk(sk); ca->loss_cwnd = tp->snd_cwnd; @@ -202,7 +202,7 @@ static void dctcp_ece_ack_update(struct sock *sk, enum tcp_ca_event evt, SEC("struct_ops") void BPF_PROG(dctcp_cwnd_event, struct sock *sk, enum tcp_ca_event ev) { - struct dctcp *ca = inet_csk_ca(sk); + struct bpf_dctcp *ca = inet_csk_ca(sk); switch (ev) { case CA_EVENT_ECN_IS_CE: @@ -221,7 +221,7 @@ void BPF_PROG(dctcp_cwnd_event, struct sock *sk, enum tcp_ca_event ev) SEC("struct_ops") __u32 BPF_PROG(dctcp_cwnd_undo, struct sock *sk) { - const struct dctcp *ca = inet_csk_ca(sk); + const struct bpf_dctcp *ca = inet_csk_ca(sk); return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd); } From a824c9a8a4d9a654d62674a8425c0f1abc9c3d33 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 9 May 2024 10:50:22 -0700 Subject: [PATCH 076/119] selftests/bpf: Use bpf_tracing_net.h in bpf_cubic This patch uses bpf_tracing_net.h (i.e. vmlinux.h) in bpf_cubic. This will allow to retire the bpf_tcp_helpers.h and consolidate tcp-cc tests to vmlinux.h. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240509175026.3423614-7-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/bpf_cubic.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/bpf_cubic.c b/tools/testing/selftests/bpf/progs/bpf_cubic.c index 53872e2d2c52..d665b8a15cc4 100644 --- a/tools/testing/selftests/bpf/progs/bpf_cubic.c +++ b/tools/testing/selftests/bpf/progs/bpf_cubic.c @@ -14,14 +14,22 @@ * "ca->ack_cnt / delta" operation. */ -#include -#include -#include -#include "bpf_tcp_helpers.h" +#include "bpf_tracing_net.h" +#include char _license[] SEC("license") = "GPL"; #define clamp(val, lo, hi) min((typeof(val))max(val, lo), hi) +#define min(a, b) ((a) < (b) ? (a) : (b)) +#define max(a, b) ((a) > (b) ? (a) : (b)) +static bool before(__u32 seq1, __u32 seq2) +{ + return (__s32)(seq1-seq2) < 0; +} +#define after(seq2, seq1) before(seq1, seq2) + +extern __u32 tcp_slow_start(struct tcp_sock *tp, __u32 acked) __ksym; +extern void tcp_cong_avoid_ai(struct tcp_sock *tp, __u32 w, __u32 acked) __ksym; #define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation * max_cwnd = snd_cwnd * beta From 6ad4e6e94697e960630594907666bc09e78a3b8a Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 9 May 2024 10:50:23 -0700 Subject: [PATCH 077/119] selftests/bpf: Use bpf_tracing_net.h in bpf_dctcp This patch uses bpf_tracing_net.h (i.e. vmlinux.h) in bpf_dctcp. This will allow to retire the bpf_tcp_helpers.h and consolidate tcp-cc tests to vmlinux.h. It will have a dup on min/max macros with the bpf_cubic. It could be further refactored in the future. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240509175026.3423614-8-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/bpf_dctcp.c | 22 +++++++++++++------ 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/bpf_dctcp.c b/tools/testing/selftests/bpf/progs/bpf_dctcp.c index a8673b7ff35d..3c9ffe340312 100644 --- a/tools/testing/selftests/bpf/progs/bpf_dctcp.c +++ b/tools/testing/selftests/bpf/progs/bpf_dctcp.c @@ -6,15 +6,23 @@ * the kernel BPF logic. */ -#include -#include -#include -#include -#include -#include +#include "bpf_tracing_net.h" #include #include -#include "bpf_tcp_helpers.h" + +#ifndef EBUSY +#define EBUSY 16 +#endif +#define min(a, b) ((a) < (b) ? (a) : (b)) +#define max(a, b) ((a) > (b) ? (a) : (b)) +#define min_not_zero(x, y) ({ \ + typeof(x) __x = (x); \ + typeof(y) __y = (y); \ + __x == 0 ? __y : ((__y == 0) ? __x : min(__x, __y)); }) +static bool before(__u32 seq1, __u32 seq2) +{ + return (__s32)(seq1-seq2) < 0; +} char _license[] SEC("license") = "GPL"; From 6eee55aa769c241182da73a391980f51edba27dc Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 9 May 2024 10:50:24 -0700 Subject: [PATCH 078/119] selftests/bpf: Remove bpf_tcp_helpers.h usages from other misc bpf tcp-cc tests This patch removed the final few bpf_tcp_helpers.h usages in some misc bpf tcp-cc tests and replace it with bpf_tracing_net.h (i.e. vmlinux.h) Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240509175026.3423614-9-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/bpf_dctcp_release.c | 7 +------ tools/testing/selftests/bpf/progs/bpf_tcp_nogpl.c | 5 +---- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/bpf_dctcp_release.c b/tools/testing/selftests/bpf/progs/bpf_dctcp_release.c index a946b070bb06..c91763f248b2 100644 --- a/tools/testing/selftests/bpf/progs/bpf_dctcp_release.c +++ b/tools/testing/selftests/bpf/progs/bpf_dctcp_release.c @@ -1,14 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2021 Facebook */ -#include -#include -#include -#include -#include +#include "bpf_tracing_net.h" #include #include -#include "bpf_tcp_helpers.h" char _license[] SEC("license") = "GPL"; const char cubic[] = "cubic"; diff --git a/tools/testing/selftests/bpf/progs/bpf_tcp_nogpl.c b/tools/testing/selftests/bpf/progs/bpf_tcp_nogpl.c index 633164e704dd..8a7a4c1b54e8 100644 --- a/tools/testing/selftests/bpf/progs/bpf_tcp_nogpl.c +++ b/tools/testing/selftests/bpf/progs/bpf_tcp_nogpl.c @@ -1,10 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 -#include -#include -#include +#include "bpf_tracing_net.h" #include -#include "bpf_tcp_helpers.h" char _license[] SEC("license") = "X"; From c075c9c4af289bb5956b0164283a85cf9c293c8e Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 9 May 2024 10:50:25 -0700 Subject: [PATCH 079/119] selftests/bpf: Remove the bpf_tcp_helpers.h usages from other non tcp-cc tests The patch removes the remaining bpf_tcp_helpers.h usages in the non tcp-cc networking tests. It either replaces it with bpf_tracing_net.h or just removed it because the test is not actually using any kernel sockets. For the later, the missing macro (mainly SOL_TCP) is defined locally. An exception is the test_sock_fields which is testing the "struct bpf_sock" type instead of the kernel sock type. Whenever "vmlinux.h" is used instead, it hits a verifier error on doing arithmetic on the sock_common pointer: ; return !a6[0] && !a6[1] && !a6[2] && a6[3] == bpf_htonl(1); @ test_sock_fields.c:54 21: (61) r2 = *(u32 *)(r1 +28) ; R1_w=sock_common() R2_w=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff)) 22: (56) if w2 != 0x0 goto pc-6 ; R2_w=0 23: (b7) r3 = 28 ; R3_w=28 24: (bf) r2 = r1 ; R1_w=sock_common() R2_w=sock_common() 25: (0f) r2 += r3 R2 pointer arithmetic on sock_common prohibited Hence, instead of including bpf_tracing_net.h, the test_sock_fields test defines a tcp_sock with one lsndtime field in it. Another highlight is, in sockopt_qos_to_cc.c, the tcp_cc_eq() is replaced by bpf_strncmp(). tcp_cc_eq() was a workaround in bpf_tcp_helpers.h before bpf_strncmp had been added. The SOL_IPV6 addition to bpf_tracing_net.h is needed by the test_tcpbpf_kern test. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240509175026.3423614-10-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/progs/bpf_tracing_net.h | 1 + .../testing/selftests/bpf/progs/connect4_prog.c | 6 ++++-- tools/testing/selftests/bpf/progs/mptcp_sock.c | 4 ++-- .../selftests/bpf/progs/sockopt_qos_to_cc.c | 16 ++++++---------- .../bpf/progs/test_btf_skc_cls_ingress.c | 16 +++++----------- .../selftests/bpf/progs/test_sock_fields.c | 5 ++++- .../selftests/bpf/progs/test_tcpbpf_kern.c | 13 +------------ 7 files changed, 23 insertions(+), 38 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h index ba4ca0334586..59843b430f76 100644 --- a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h +++ b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h @@ -25,6 +25,7 @@ #define IP_TOS 1 +#define SOL_IPV6 41 #define IPV6_TCLASS 67 #define IPV6_AUTOFLOWLABEL 70 diff --git a/tools/testing/selftests/bpf/progs/connect4_prog.c b/tools/testing/selftests/bpf/progs/connect4_prog.c index 7ef49ec04838..bec529da7c9d 100644 --- a/tools/testing/selftests/bpf/progs/connect4_prog.c +++ b/tools/testing/selftests/bpf/progs/connect4_prog.c @@ -14,8 +14,6 @@ #include #include -#include "bpf_tcp_helpers.h" - #define SRC_REWRITE_IP4 0x7f000004U #define DST_REWRITE_IP4 0x7f000001U #define DST_REWRITE_PORT4 4444 @@ -32,6 +30,10 @@ #define IFNAMSIZ 16 #endif +#ifndef SOL_TCP +#define SOL_TCP 6 +#endif + __attribute__ ((noinline)) __weak int do_bind(struct bpf_sock_addr *ctx) { diff --git a/tools/testing/selftests/bpf/progs/mptcp_sock.c b/tools/testing/selftests/bpf/progs/mptcp_sock.c index 91a0d7eff2ac..f3acb90588c7 100644 --- a/tools/testing/selftests/bpf/progs/mptcp_sock.c +++ b/tools/testing/selftests/bpf/progs/mptcp_sock.c @@ -2,9 +2,9 @@ /* Copyright (c) 2020, Tessares SA. */ /* Copyright (c) 2022, SUSE. */ -#include +#include "bpf_tracing_net.h" #include -#include "bpf_tcp_helpers.h" +#include char _license[] SEC("license") = "GPL"; __u32 token = 0; diff --git a/tools/testing/selftests/bpf/progs/sockopt_qos_to_cc.c b/tools/testing/selftests/bpf/progs/sockopt_qos_to_cc.c index 83753b00a556..5c3614333b01 100644 --- a/tools/testing/selftests/bpf/progs/sockopt_qos_to_cc.c +++ b/tools/testing/selftests/bpf/progs/sockopt_qos_to_cc.c @@ -1,24 +1,20 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2021 Facebook */ -#include -#include -#include -#include -#include -#include "bpf_tcp_helpers.h" +#include "bpf_tracing_net.h" char _license[] SEC("license") = "GPL"; __s32 page_size = 0; +const char cc_reno[TCP_CA_NAME_MAX] = "reno"; +const char cc_cubic[TCP_CA_NAME_MAX] = "cubic"; + SEC("cgroup/setsockopt") int sockopt_qos_to_cc(struct bpf_sockopt *ctx) { void *optval_end = ctx->optval_end; int *optval = ctx->optval; char buf[TCP_CA_NAME_MAX]; - char cc_reno[TCP_CA_NAME_MAX] = "reno"; - char cc_cubic[TCP_CA_NAME_MAX] = "cubic"; if (ctx->level != SOL_IPV6 || ctx->optname != IPV6_TCLASS) goto out; @@ -29,11 +25,11 @@ int sockopt_qos_to_cc(struct bpf_sockopt *ctx) if (bpf_getsockopt(ctx->sk, SOL_TCP, TCP_CONGESTION, &buf, sizeof(buf))) return 0; - if (!tcp_cc_eq(buf, cc_cubic)) + if (bpf_strncmp(buf, sizeof(buf), cc_cubic)) return 0; if (*optval == 0x2d) { - if (bpf_setsockopt(ctx->sk, SOL_TCP, TCP_CONGESTION, &cc_reno, + if (bpf_setsockopt(ctx->sk, SOL_TCP, TCP_CONGESTION, (void *)&cc_reno, sizeof(cc_reno))) return 0; } diff --git a/tools/testing/selftests/bpf/progs/test_btf_skc_cls_ingress.c b/tools/testing/selftests/bpf/progs/test_btf_skc_cls_ingress.c index e2bea4da194b..f0759efff6ef 100644 --- a/tools/testing/selftests/bpf/progs/test_btf_skc_cls_ingress.c +++ b/tools/testing/selftests/bpf/progs/test_btf_skc_cls_ingress.c @@ -1,19 +1,13 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2020 Facebook */ -#include -#include -#include -#include -#include -#include -#include -#include -#include - +#include "bpf_tracing_net.h" #include #include -#include "bpf_tcp_helpers.h" + +#ifndef ENOENT +#define ENOENT 2 +#endif struct sockaddr_in6 srv_sa6 = {}; __u16 listen_tp_sport = 0; diff --git a/tools/testing/selftests/bpf/progs/test_sock_fields.c b/tools/testing/selftests/bpf/progs/test_sock_fields.c index f75e531bf36f..196844be349c 100644 --- a/tools/testing/selftests/bpf/progs/test_sock_fields.c +++ b/tools/testing/selftests/bpf/progs/test_sock_fields.c @@ -7,7 +7,6 @@ #include #include -#include "bpf_tcp_helpers.h" enum bpf_linum_array_idx { EGRESS_LINUM_IDX, @@ -42,6 +41,10 @@ struct { __type(value, struct bpf_spinlock_cnt); } sk_pkt_out_cnt10 SEC(".maps"); +struct tcp_sock { + __u32 lsndtime; +} __attribute__((preserve_access_index)); + struct bpf_tcp_sock listen_tp = {}; struct sockaddr_in6 srv_sa6 = {}; struct bpf_tcp_sock cli_tp = {}; diff --git a/tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c b/tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c index a3f3f43fc195..6935f32eeb8f 100644 --- a/tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c +++ b/tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c @@ -1,18 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include "bpf_tracing_net.h" #include #include -#include "bpf_tcp_helpers.h" #include "test_tcpbpf.h" struct tcpbpf_globals global = {}; From 6a650816b098a15c4690a22e3889858264d01aa8 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 9 May 2024 10:50:26 -0700 Subject: [PATCH 080/119] selftests/bpf: Retire bpf_tcp_helpers.h The previous patches have consolidated the tests to use bpf_tracing_net.h (i.e. vmlinux.h) instead of bpf_tcp_helpers.h. This patch can finally retire the bpf_tcp_helpers.h from the repository. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240509175026.3423614-11-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/bpf_tcp_helpers.h | 241 ------------------ 1 file changed, 241 deletions(-) delete mode 100644 tools/testing/selftests/bpf/bpf_tcp_helpers.h diff --git a/tools/testing/selftests/bpf/bpf_tcp_helpers.h b/tools/testing/selftests/bpf/bpf_tcp_helpers.h deleted file mode 100644 index 82a7c9de95f9..000000000000 --- a/tools/testing/selftests/bpf/bpf_tcp_helpers.h +++ /dev/null @@ -1,241 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __BPF_TCP_HELPERS_H -#define __BPF_TCP_HELPERS_H - -#include -#include -#include -#include -#include - -#define BPF_STRUCT_OPS(name, args...) \ -SEC("struct_ops/"#name) \ -BPF_PROG(name, args) - -#ifndef SOL_TCP -#define SOL_TCP 6 -#endif - -#ifndef TCP_CA_NAME_MAX -#define TCP_CA_NAME_MAX 16 -#endif - -#define tcp_jiffies32 ((__u32)bpf_jiffies64()) - -struct sock_common { - unsigned char skc_state; - __u16 skc_num; -} __attribute__((preserve_access_index)); - -enum sk_pacing { - SK_PACING_NONE = 0, - SK_PACING_NEEDED = 1, - SK_PACING_FQ = 2, -}; - -struct sock { - struct sock_common __sk_common; -#define sk_state __sk_common.skc_state - unsigned long sk_pacing_rate; - __u32 sk_pacing_status; /* see enum sk_pacing */ -} __attribute__((preserve_access_index)); - -struct inet_sock { - struct sock sk; -} __attribute__((preserve_access_index)); - -struct inet_connection_sock { - struct inet_sock icsk_inet; - __u8 icsk_ca_state:6, - icsk_ca_setsockopt:1, - icsk_ca_dst_locked:1; - struct { - __u8 pending; - } icsk_ack; - __u64 icsk_ca_priv[104 / sizeof(__u64)]; -} __attribute__((preserve_access_index)); - -struct request_sock { - struct sock_common __req_common; -} __attribute__((preserve_access_index)); - -struct tcp_sock { - struct inet_connection_sock inet_conn; - - __u32 rcv_nxt; - __u32 snd_nxt; - __u32 snd_una; - __u32 window_clamp; - __u8 ecn_flags; - __u32 delivered; - __u32 delivered_ce; - __u32 snd_cwnd; - __u32 snd_cwnd_cnt; - __u32 snd_cwnd_clamp; - __u32 snd_ssthresh; - __u8 syn_data:1, /* SYN includes data */ - syn_fastopen:1, /* SYN includes Fast Open option */ - syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */ - syn_fastopen_ch:1, /* Active TFO re-enabling probe */ - syn_data_acked:1,/* data in SYN is acked by SYN-ACK */ - save_syn:1, /* Save headers of SYN packet */ - is_cwnd_limited:1,/* forward progress limited by snd_cwnd? */ - syn_smc:1; /* SYN includes SMC */ - __u32 max_packets_out; - __u32 lsndtime; - __u32 prior_cwnd; - __u64 tcp_mstamp; /* most recent packet received/sent */ - bool is_mptcp; -} __attribute__((preserve_access_index)); - -static __always_inline struct inet_connection_sock *inet_csk(const struct sock *sk) -{ - return (struct inet_connection_sock *)sk; -} - -static __always_inline void *inet_csk_ca(const struct sock *sk) -{ - return (void *)inet_csk(sk)->icsk_ca_priv; -} - -static __always_inline struct tcp_sock *tcp_sk(const struct sock *sk) -{ - return (struct tcp_sock *)sk; -} - -static __always_inline bool before(__u32 seq1, __u32 seq2) -{ - return (__s32)(seq1-seq2) < 0; -} -#define after(seq2, seq1) before(seq1, seq2) - -#define TCP_ECN_OK 1 -#define TCP_ECN_QUEUE_CWR 2 -#define TCP_ECN_DEMAND_CWR 4 -#define TCP_ECN_SEEN 8 - -enum inet_csk_ack_state_t { - ICSK_ACK_SCHED = 1, - ICSK_ACK_TIMER = 2, - ICSK_ACK_PUSHED = 4, - ICSK_ACK_PUSHED2 = 8, - ICSK_ACK_NOW = 16 /* Send the next ACK immediately (once) */ -}; - -enum tcp_ca_event { - CA_EVENT_TX_START = 0, - CA_EVENT_CWND_RESTART = 1, - CA_EVENT_COMPLETE_CWR = 2, - CA_EVENT_LOSS = 3, - CA_EVENT_ECN_NO_CE = 4, - CA_EVENT_ECN_IS_CE = 5, -}; - -struct ack_sample { - __u32 pkts_acked; - __s32 rtt_us; - __u32 in_flight; -} __attribute__((preserve_access_index)); - -struct rate_sample { - __u64 prior_mstamp; /* starting timestamp for interval */ - __u32 prior_delivered; /* tp->delivered at "prior_mstamp" */ - __s32 delivered; /* number of packets delivered over interval */ - long interval_us; /* time for tp->delivered to incr "delivered" */ - __u32 snd_interval_us; /* snd interval for delivered packets */ - __u32 rcv_interval_us; /* rcv interval for delivered packets */ - long rtt_us; /* RTT of last (S)ACKed packet (or -1) */ - int losses; /* number of packets marked lost upon ACK */ - __u32 acked_sacked; /* number of packets newly (S)ACKed upon ACK */ - __u32 prior_in_flight; /* in flight before this ACK */ - bool is_app_limited; /* is sample from packet with bubble in pipe? */ - bool is_retrans; /* is sample from retransmission? */ - bool is_ack_delayed; /* is this (likely) a delayed ACK? */ -} __attribute__((preserve_access_index)); - -#define TCP_CA_NAME_MAX 16 -#define TCP_CONG_NEEDS_ECN 0x2 - -struct tcp_congestion_ops { - char name[TCP_CA_NAME_MAX]; - __u32 flags; - - /* initialize private data (optional) */ - void (*init)(struct sock *sk); - /* cleanup private data (optional) */ - void (*release)(struct sock *sk); - - /* return slow start threshold (required) */ - __u32 (*ssthresh)(struct sock *sk); - /* do new cwnd calculation (required) */ - void (*cong_avoid)(struct sock *sk, __u32 ack, __u32 acked); - /* call before changing ca_state (optional) */ - void (*set_state)(struct sock *sk, __u8 new_state); - /* call when cwnd event occurs (optional) */ - void (*cwnd_event)(struct sock *sk, enum tcp_ca_event ev); - /* call when ack arrives (optional) */ - void (*in_ack_event)(struct sock *sk, __u32 flags); - /* new value of cwnd after loss (required) */ - __u32 (*undo_cwnd)(struct sock *sk); - /* hook for packet ack accounting (optional) */ - void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample); - /* override sysctl_tcp_min_tso_segs */ - __u32 (*min_tso_segs)(struct sock *sk); - /* returns the multiplier used in tcp_sndbuf_expand (optional) */ - __u32 (*sndbuf_expand)(struct sock *sk); - /* call when packets are delivered to update cwnd and pacing rate, - * after all the ca_state processing. (optional) - */ - void (*cong_control)(struct sock *sk, const struct rate_sample *rs); - void *owner; -}; - -#define min(a, b) ((a) < (b) ? (a) : (b)) -#define max(a, b) ((a) > (b) ? (a) : (b)) -#define min_not_zero(x, y) ({ \ - typeof(x) __x = (x); \ - typeof(y) __y = (y); \ - __x == 0 ? __y : ((__y == 0) ? __x : min(__x, __y)); }) - -static __always_inline bool tcp_in_slow_start(const struct tcp_sock *tp) -{ - return tp->snd_cwnd < tp->snd_ssthresh; -} - -static __always_inline bool tcp_is_cwnd_limited(const struct sock *sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - - /* If in slow start, ensure cwnd grows to twice what was ACKed. */ - if (tcp_in_slow_start(tp)) - return tp->snd_cwnd < 2 * tp->max_packets_out; - - return !!BPF_CORE_READ_BITFIELD(tp, is_cwnd_limited); -} - -static __always_inline bool tcp_cc_eq(const char *a, const char *b) -{ - int i; - - for (i = 0; i < TCP_CA_NAME_MAX; i++) { - if (a[i] != b[i]) - return false; - if (!a[i]) - break; - } - - return true; -} - -extern __u32 tcp_slow_start(struct tcp_sock *tp, __u32 acked) __ksym; -extern void tcp_cong_avoid_ai(struct tcp_sock *tp, __u32 w, __u32 acked) __ksym; - -struct mptcp_sock { - struct inet_connection_sock sk; - - __u32 token; - struct sock *first; - char ca_name[TCP_CA_NAME_MAX]; -} __attribute__((preserve_access_index)); - -#endif From 20434d2d896f85b38fa1fe91b8739afcd9cde3b3 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Sun, 5 May 2024 19:35:08 +0800 Subject: [PATCH 081/119] selftests/bpf: Add post_socket_cb for network_helper_opts __start_server() sets SO_REUSPORT through setsockopt() when the parameter 'reuseport' is set. This patch makes it more flexible by adding a function pointer post_socket_cb into struct network_helper_opts. The 'const struct post_socket_opts *cb_opts' args in the post_socket_cb is for the future extension. The 'reuseport' parameter can be dropped. Now the original start_reuseport_server() can be implemented by setting a newly defined reuseport_cb() function pointer to post_socket_cb filed of struct network_helper_opts. Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/470cb82f209f055fc7fb39c66c6b090b5b7ed2b2.1714907662.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/network_helpers.c | 24 ++++++++++++------- tools/testing/selftests/bpf/network_helpers.h | 3 +++ 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index 054d26e383e0..35250e6cde7f 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -81,9 +81,8 @@ int settimeo(int fd, int timeout_ms) #define save_errno_close(fd) ({ int __save = errno; close(fd); errno = __save; }) static int __start_server(int type, const struct sockaddr *addr, socklen_t addrlen, - bool reuseport, const struct network_helper_opts *opts) + const struct network_helper_opts *opts) { - int on = 1; int fd; fd = socket(addr->sa_family, type, opts->proto); @@ -95,9 +94,8 @@ static int __start_server(int type, const struct sockaddr *addr, socklen_t addrl if (settimeo(fd, opts->timeout_ms)) goto error_close; - if (reuseport && - setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &on, sizeof(on))) { - log_err("Failed to set SO_REUSEPORT"); + if (opts->post_socket_cb && opts->post_socket_cb(fd, NULL)) { + log_err("Failed to call post_socket_cb"); goto error_close; } @@ -132,7 +130,14 @@ int start_server(int family, int type, const char *addr_str, __u16 port, if (make_sockaddr(family, addr_str, port, &addr, &addrlen)) return -1; - return __start_server(type, (struct sockaddr *)&addr, addrlen, false, &opts); + return __start_server(type, (struct sockaddr *)&addr, addrlen, &opts); +} + +static int reuseport_cb(int fd, const struct post_socket_opts *opts) +{ + int on = 1; + + return setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &on, sizeof(on)); } int *start_reuseport_server(int family, int type, const char *addr_str, @@ -140,6 +145,7 @@ int *start_reuseport_server(int family, int type, const char *addr_str, { struct network_helper_opts opts = { .timeout_ms = timeout_ms, + .post_socket_cb = reuseport_cb, }; struct sockaddr_storage addr; unsigned int nr_fds = 0; @@ -156,7 +162,7 @@ int *start_reuseport_server(int family, int type, const char *addr_str, if (!fds) return NULL; - fds[0] = __start_server(type, (struct sockaddr *)&addr, addrlen, true, &opts); + fds[0] = __start_server(type, (struct sockaddr *)&addr, addrlen, &opts); if (fds[0] == -1) goto close_fds; nr_fds = 1; @@ -165,7 +171,7 @@ int *start_reuseport_server(int family, int type, const char *addr_str, goto close_fds; for (; nr_fds < nr_listens; nr_fds++) { - fds[nr_fds] = __start_server(type, (struct sockaddr *)&addr, addrlen, true, &opts); + fds[nr_fds] = __start_server(type, (struct sockaddr *)&addr, addrlen, &opts); if (fds[nr_fds] == -1) goto close_fds; } @@ -183,7 +189,7 @@ int start_server_addr(int type, const struct sockaddr_storage *addr, socklen_t l if (!opts) opts = &default_opts; - return __start_server(type, (struct sockaddr *)addr, len, 0, opts); + return __start_server(type, (struct sockaddr *)addr, len, opts); } void free_fds(int *fds, unsigned int nr_close_fds) diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h index c62b54daa914..883c7ea9d8d5 100644 --- a/tools/testing/selftests/bpf/network_helpers.h +++ b/tools/testing/selftests/bpf/network_helpers.h @@ -21,6 +21,8 @@ typedef __u16 __sum16; #define VIP_NUM 5 #define MAGIC_BYTES 123 +struct post_socket_opts {}; + struct network_helper_opts { const char *cc; int timeout_ms; @@ -28,6 +30,7 @@ struct network_helper_opts { bool noconnect; int type; int proto; + int (*post_socket_cb)(int fd, const struct post_socket_opts *opts); }; /* ipv4 test vector */ From 5166b3e3e30a8eb93f7182283ed4db719bdfde1a Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Sun, 5 May 2024 19:35:09 +0800 Subject: [PATCH 082/119] selftests/bpf: Use start_server_addr in sockopt_inherit Include network_helpers.h in prog_tests/sockopt_inherit.c, use public helper start_server_addr() instead of the local defined function start_server(). This can avoid duplicate code. Add a helper custom_cb() to set SOL_CUSTOM sockopt looply, set it to post_socket_cb pointer of struct network_helper_opts, and pass it to start_server_addr(). Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/687af66f743a0bf15cdba372c5f71fe64863219e.1714907662.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau --- .../bpf/prog_tests/sockopt_inherit.c | 33 +++++++------------ 1 file changed, 12 insertions(+), 21 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c b/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c index 917f486db826..51901359b603 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c +++ b/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include #include "cgroup_helpers.h" +#include "network_helpers.h" #include "sockopt_inherit.skel.h" @@ -98,47 +99,36 @@ static void *server_thread(void *arg) return (void *)(long)err; } -static int start_server(void) +static int custom_cb(int fd, const struct post_socket_opts *opts) { - struct sockaddr_in addr = { - .sin_family = AF_INET, - .sin_addr.s_addr = htonl(INADDR_LOOPBACK), - }; char buf; int err; - int fd; int i; - fd = socket(AF_INET, SOCK_STREAM, 0); - if (fd < 0) { - log_err("Failed to create server socket"); - return -1; - } - for (i = CUSTOM_INHERIT1; i <= CUSTOM_LISTENER; i++) { buf = 0x01; err = setsockopt(fd, SOL_CUSTOM, i, &buf, 1); if (err) { log_err("Failed to call setsockopt(%d)", i); - close(fd); return -1; } } - if (bind(fd, (const struct sockaddr *)&addr, sizeof(addr)) < 0) { - log_err("Failed to bind socket"); - close(fd); - return -1; - } - - return fd; + return 0; } static void run_test(int cgroup_fd) { struct bpf_link *link_getsockopt = NULL; struct bpf_link *link_setsockopt = NULL; + struct network_helper_opts opts = { + .post_socket_cb = custom_cb, + }; int server_fd = -1, client_fd; + struct sockaddr_in addr = { + .sin_family = AF_INET, + .sin_addr.s_addr = htonl(INADDR_LOOPBACK), + }; struct sockopt_inherit *obj; void *server_err; pthread_t tid; @@ -160,7 +150,8 @@ static void run_test(int cgroup_fd) if (!ASSERT_OK_PTR(link_setsockopt, "cg-attach-setsockopt")) goto close_bpf_object; - server_fd = start_server(); + server_fd = start_server_addr(SOCK_STREAM, (struct sockaddr_storage *)&addr, + sizeof(addr), &opts); if (!ASSERT_GE(server_fd, 0, "start_server")) goto close_bpf_object; From 49e1fa8dbd81340f610057be3f3909f24c232807 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Sun, 5 May 2024 19:35:10 +0800 Subject: [PATCH 083/119] selftests/bpf: Use start_server_addr in test_tcp_check_syncookie Include network_helpers.h in test_tcp_check_syncookie_user.c, use public helper start_server_addr() in it instead of the local defined function start_server(). This can avoid duplicate code. Add two helpers v6only_true() and v6only_false() to set IPV6_V6ONLY sockopt to true or false, set them to post_socket_cb pointer of struct network_helper_opts, and pass it to start_server_setsockopt(). In order to use functions defined in network_helpers.c, Makefile needs to be updated too. Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/e0c5324f5da84f453f47543536e70f126eaa8678.1714907662.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/Makefile | 1 + .../bpf/test_tcp_check_syncookie_user.c | 68 +++++++------------ 2 files changed, 25 insertions(+), 44 deletions(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index e962a0bd8a78..135023a357b3 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -310,6 +310,7 @@ $(OUTPUT)/flow_dissector_load: $(TESTING_HELPERS) $(OUTPUT)/test_maps: $(TESTING_HELPERS) $(OUTPUT)/test_verifier: $(TESTING_HELPERS) $(CAP_HELPERS) $(UNPRIV_HELPERS) $(OUTPUT)/xsk.o: $(BPFOBJ) +$(OUTPUT)/test_tcp_check_syncookie_user: $(NETWORK_HELPERS) BPFTOOL ?= $(DEFAULT_BPFTOOL) $(DEFAULT_BPFTOOL): $(wildcard $(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile) \ diff --git a/tools/testing/selftests/bpf/test_tcp_check_syncookie_user.c b/tools/testing/selftests/bpf/test_tcp_check_syncookie_user.c index 32df93747095..bf60bc267cbc 100644 --- a/tools/testing/selftests/bpf/test_tcp_check_syncookie_user.c +++ b/tools/testing/selftests/bpf/test_tcp_check_syncookie_user.c @@ -16,44 +16,7 @@ #include #include "cgroup_helpers.h" - -static int start_server(const struct sockaddr *addr, socklen_t len, bool dual) -{ - int mode = !dual; - int fd; - - fd = socket(addr->sa_family, SOCK_STREAM, 0); - if (fd == -1) { - log_err("Failed to create server socket"); - goto out; - } - - if (addr->sa_family == AF_INET6) { - if (setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, (char *)&mode, - sizeof(mode)) == -1) { - log_err("Failed to set the dual-stack mode"); - goto close_out; - } - } - - if (bind(fd, addr, len) == -1) { - log_err("Failed to bind server socket"); - goto close_out; - } - - if (listen(fd, 128) == -1) { - log_err("Failed to listen on server socket"); - goto close_out; - } - - goto out; - -close_out: - close(fd); - fd = -1; -out: - return fd; -} +#include "network_helpers.h" static int connect_to_server(const struct sockaddr *addr, socklen_t len) { @@ -216,8 +179,23 @@ static bool get_port(int server_fd, in_port_t *port) return true; } +static int v6only_true(int fd, const struct post_socket_opts *opts) +{ + int mode = true; + + return setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &mode, sizeof(mode)); +} + +static int v6only_false(int fd, const struct post_socket_opts *opts) +{ + int mode = false; + + return setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &mode, sizeof(mode)); +} + int main(int argc, char **argv) { + struct network_helper_opts opts = { 0 }; struct sockaddr_in addr4; struct sockaddr_in6 addr6; struct sockaddr_in addr4dual; @@ -259,18 +237,20 @@ int main(int argc, char **argv) addr6dual.sin6_addr = in6addr_any; addr6dual.sin6_port = 0; - server = start_server((const struct sockaddr *)&addr4, sizeof(addr4), - false); + server = start_server_addr(SOCK_STREAM, (struct sockaddr_storage *)&addr4, + sizeof(addr4), NULL); if (server == -1 || !get_port(server, &addr4.sin_port)) goto err; - server_v6 = start_server((const struct sockaddr *)&addr6, - sizeof(addr6), false); + opts.post_socket_cb = v6only_true; + server_v6 = start_server_addr(SOCK_STREAM, (struct sockaddr_storage *)&addr6, + sizeof(addr6), &opts); if (server_v6 == -1 || !get_port(server_v6, &addr6.sin6_port)) goto err; - server_dual = start_server((const struct sockaddr *)&addr6dual, - sizeof(addr6dual), true); + opts.post_socket_cb = v6only_false; + server_dual = start_server_addr(SOCK_STREAM, (struct sockaddr_storage *)&addr6dual, + sizeof(addr6dual), &opts); if (server_dual == -1 || !get_port(server_dual, &addr4dual.sin_port)) goto err; From 5059c73eca67e686dea42af079c41857cb00a5a6 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Sun, 5 May 2024 19:35:11 +0800 Subject: [PATCH 084/119] selftests/bpf: Use connect_to_fd in sockopt_inherit This patch uses public helper connect_to_fd() exported in network_helpers.h instead of the local defined function connect_to_server() in prog_tests/sockopt_inherit.c. This can avoid duplicate code. Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/71db79127cc160b0643fd9a12c70ae019ae076a1.1714907662.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau --- .../bpf/prog_tests/sockopt_inherit.c | 31 +------------------ 1 file changed, 1 insertion(+), 30 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c b/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c index 51901359b603..1d3a20f01b60 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c +++ b/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c @@ -10,35 +10,6 @@ #define CUSTOM_INHERIT2 1 #define CUSTOM_LISTENER 2 -static int connect_to_server(int server_fd) -{ - struct sockaddr_storage addr; - socklen_t len = sizeof(addr); - int fd; - - fd = socket(AF_INET, SOCK_STREAM, 0); - if (fd < 0) { - log_err("Failed to create client socket"); - return -1; - } - - if (getsockname(server_fd, (struct sockaddr *)&addr, &len)) { - log_err("Failed to get server addr"); - goto out; - } - - if (connect(fd, (const struct sockaddr *)&addr, len) < 0) { - log_err("Fail to connect to server"); - goto out; - } - - return fd; - -out: - close(fd); - return -1; -} - static int verify_sockopt(int fd, int optname, const char *msg, char expected) { socklen_t optlen = 1; @@ -164,7 +135,7 @@ static void run_test(int cgroup_fd) pthread_cond_wait(&server_started, &server_started_mtx); pthread_mutex_unlock(&server_started_mtx); - client_fd = connect_to_server(server_fd); + client_fd = connect_to_fd(server_fd, 0); if (!ASSERT_GE(client_fd, 0, "connect_to_server")) goto close_server_fd; From 65a3f0df44dd3db0f77e6ccff0a126969abc0da4 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Sun, 5 May 2024 19:35:12 +0800 Subject: [PATCH 085/119] selftests/bpf: Use connect_to_fd in test_tcp_check_syncookie This patch uses public helper connect_to_fd() exported in network_helpers.h instead of the local defined function connect_to_server() in test_tcp_check_syncookie_user.c. This can avoid duplicate code. Then the arguments "addr" and "len" of run_test() become useless, drop them too. Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/e0ae6b790ac0abc7193aadfb2660c8c9eb0fe1f0.1714907662.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau --- .../bpf/test_tcp_check_syncookie_user.c | 38 +++---------------- 1 file changed, 5 insertions(+), 33 deletions(-) diff --git a/tools/testing/selftests/bpf/test_tcp_check_syncookie_user.c b/tools/testing/selftests/bpf/test_tcp_check_syncookie_user.c index bf60bc267cbc..b928474f3bf9 100644 --- a/tools/testing/selftests/bpf/test_tcp_check_syncookie_user.c +++ b/tools/testing/selftests/bpf/test_tcp_check_syncookie_user.c @@ -18,30 +18,6 @@ #include "cgroup_helpers.h" #include "network_helpers.h" -static int connect_to_server(const struct sockaddr *addr, socklen_t len) -{ - int fd = -1; - - fd = socket(addr->sa_family, SOCK_STREAM, 0); - if (fd == -1) { - log_err("Failed to create client socket"); - goto out; - } - - if (connect(fd, (const struct sockaddr *)addr, len) == -1) { - log_err("Fail to connect to server"); - goto close_out; - } - - goto out; - -close_out: - close(fd); - fd = -1; -out: - return fd; -} - static int get_map_fd_by_prog_id(int prog_id, bool *xdp) { struct bpf_prog_info info = {}; @@ -80,8 +56,7 @@ err: return map_fd; } -static int run_test(int server_fd, int results_fd, bool xdp, - const struct sockaddr *addr, socklen_t len) +static int run_test(int server_fd, int results_fd, bool xdp) { int client = -1, srv_client = -1; int ret = 0; @@ -107,7 +82,7 @@ static int run_test(int server_fd, int results_fd, bool xdp, goto err; } - client = connect_to_server(addr, len); + client = connect_to_fd(server_fd, 0); if (client == -1) goto err; @@ -254,16 +229,13 @@ int main(int argc, char **argv) if (server_dual == -1 || !get_port(server_dual, &addr4dual.sin_port)) goto err; - if (run_test(server, results, xdp, - (const struct sockaddr *)&addr4, sizeof(addr4))) + if (run_test(server, results, xdp)) goto err; - if (run_test(server_v6, results, xdp, - (const struct sockaddr *)&addr6, sizeof(addr6))) + if (run_test(server_v6, results, xdp)) goto err; - if (run_test(server_dual, results, xdp, - (const struct sockaddr *)&addr4dual, sizeof(addr4dual))) + if (run_test(server_dual, results, xdp)) goto err; printf("ok\n"); From 7abbf38cd8edb92bc72fe3405f8a0bf19f7761c2 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Sun, 5 May 2024 19:35:13 +0800 Subject: [PATCH 086/119] selftests/bpf: Drop get_port in test_tcp_check_syncookie The arguments "addr" and "len" of run_test() have dropped. This makes function get_port() useless. Drop it from test_tcp_check_syncookie_user.c. Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/a9b5c8064ab4cbf0f68886fe0e4706428b8d0d47.1714907662.git.tanggeliang@kylinos.cn Signed-off-by: Martin KaFai Lau --- .../bpf/test_tcp_check_syncookie_user.c | 21 +++---------------- 1 file changed, 3 insertions(+), 18 deletions(-) diff --git a/tools/testing/selftests/bpf/test_tcp_check_syncookie_user.c b/tools/testing/selftests/bpf/test_tcp_check_syncookie_user.c index b928474f3bf9..7b5fc98838cd 100644 --- a/tools/testing/selftests/bpf/test_tcp_check_syncookie_user.c +++ b/tools/testing/selftests/bpf/test_tcp_check_syncookie_user.c @@ -139,21 +139,6 @@ out: return ret; } -static bool get_port(int server_fd, in_port_t *port) -{ - struct sockaddr_in addr; - socklen_t len = sizeof(addr); - - if (getsockname(server_fd, (struct sockaddr *)&addr, &len)) { - log_err("Failed to get server addr"); - return false; - } - - /* sin_port and sin6_port are located at the same offset. */ - *port = addr.sin_port; - return true; -} - static int v6only_true(int fd, const struct post_socket_opts *opts) { int mode = true; @@ -214,19 +199,19 @@ int main(int argc, char **argv) server = start_server_addr(SOCK_STREAM, (struct sockaddr_storage *)&addr4, sizeof(addr4), NULL); - if (server == -1 || !get_port(server, &addr4.sin_port)) + if (server == -1) goto err; opts.post_socket_cb = v6only_true; server_v6 = start_server_addr(SOCK_STREAM, (struct sockaddr_storage *)&addr6, sizeof(addr6), &opts); - if (server_v6 == -1 || !get_port(server_v6, &addr6.sin6_port)) + if (server_v6 == -1) goto err; opts.post_socket_cb = v6only_false; server_dual = start_server_addr(SOCK_STREAM, (struct sockaddr_storage *)&addr6dual, sizeof(addr6dual), &opts); - if (server_dual == -1 || !get_port(server_dual, &addr4dual.sin_port)) + if (server_dual == -1) goto err; if (run_test(server, results, xdp)) From fcd1ed89a0439c45e1336bd9649485c44b7597c7 Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Tue, 7 May 2024 14:55:14 +0100 Subject: [PATCH 087/119] kbuild,bpf: Switch to using --btf_features for pahole v1.26 and later The btf_features list can be used for pahole v1.26 and later - it is useful because if a feature is not yet implemented it will not exit with a failure message. This will allow us to add feature requests to the pahole options without having to check pahole versions in future; if the version of pahole supports the feature it will be added. Signed-off-by: Alan Maguire Signed-off-by: Andrii Nakryiko Tested-by: Eduard Zingerman Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240507135514.490467-1-alan.maguire@oracle.com --- scripts/Makefile.btf | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/scripts/Makefile.btf b/scripts/Makefile.btf index 82377e470aed..2d6e5ed9081e 100644 --- a/scripts/Makefile.btf +++ b/scripts/Makefile.btf @@ -3,6 +3,8 @@ pahole-ver := $(CONFIG_PAHOLE_VERSION) pahole-flags-y := +ifeq ($(call test-le, $(pahole-ver), 125),y) + # pahole 1.18 through 1.21 can't handle zero-sized per-CPU vars ifeq ($(call test-le, $(pahole-ver), 121),y) pahole-flags-$(call test-ge, $(pahole-ver), 118) += --skip_encoding_btf_vars @@ -12,8 +14,17 @@ pahole-flags-$(call test-ge, $(pahole-ver), 121) += --btf_gen_floats pahole-flags-$(call test-ge, $(pahole-ver), 122) += -j +ifeq ($(pahole-ver), 125) +pahole-flags-y += --skip_encoding_btf_inconsistent_proto --btf_gen_optimized +endif + +else + +# Switch to using --btf_features for v1.26 and later. +pahole-flags-$(call test-ge, $(pahole-ver), 126) = -j --btf_features=encode_force,var,float,enum64,decl_tag,type_tag,optimized_func,consistent_func + +endif + pahole-flags-$(CONFIG_PAHOLE_HAS_LANG_EXCLUDE) += --lang_exclude=rust -pahole-flags-$(call test-ge, $(pahole-ver), 125) += --skip_encoding_btf_inconsistent_proto --btf_gen_optimized - export PAHOLE_FLAGS := $(pahole-flags-y) From f122668ddcce450c2585f0be4bf4478d6fd6176b Mon Sep 17 00:00:00 2001 From: Shahab Vahedi Date: Tue, 30 Apr 2024 16:56:04 +0200 Subject: [PATCH 088/119] ARC: Add eBPF JIT support This will add eBPF JIT support to the 32-bit ARCv2 processors. The implementation is qualified by running the BPF tests on a Synopsys HSDK board with "ARC HS38 v2.1c at 500 MHz" as the 4-core CPU. The test_bpf.ko reports 2-10 fold improvements in execution time of its tests. For instance: test_bpf: #33 tcpdump port 22 jited:0 704 1766 2104 PASS test_bpf: #33 tcpdump port 22 jited:1 120 224 260 PASS test_bpf: #141 ALU_DIV_X: 4294967295 / 4294967295 = 1 jited:0 238 PASS test_bpf: #141 ALU_DIV_X: 4294967295 / 4294967295 = 1 jited:1 23 PASS test_bpf: #776 JMP32_JGE_K: all ... magnitudes jited:0 2034681 PASS test_bpf: #776 JMP32_JGE_K: all ... magnitudes jited:1 1020022 PASS Deployment and structure ------------------------ The related codes are added to "arch/arc/net": - bpf_jit.h -- The interface that a back-end translator must provide - bpf_jit_core.c -- Knows how to handle the input eBPF byte stream - bpf_jit_arcv2.c -- The back-end code that knows the translation logic The bpf_int_jit_compile() at the end of bpf_jit_core.c is the entrance to the whole process. Normally, the translation is done in one pass, namely the "normal pass". In case some relocations are not known during this pass, some data (arc_jit_data) is allocated for the next pass to come. This possible next (and last) pass is called the "extra pass". 1. Normal pass # The necessary pass 1a. Dry run # Get the whole JIT length, epilogue offset, etc. 1b. Emit phase # Allocate memory and start emitting instructions 2. Extra pass # Only needed if there are relocations to be fixed 2a. Patch relocations Support status -------------- The JIT compiler supports BPF instructions up to "cpu=v4". However, it does not yet provide support for: - Tail calls - Atomic operations - 64-bit division/remainder - BPF_PROBE_MEM* (exception table) The result of "test_bpf" test suite on an HSDK board is: hsdk-lnx# insmod test_bpf.ko test_suite=test_bpf test_bpf: Summary: 863 PASSED, 186 FAILED, [851/851 JIT'ed] All the failing test cases are due to the ones that were not JIT'ed. Categorically, they can be represented as: .-----------.------------.-------------. | test type | opcodes | # of cases | |-----------+------------+-------------| | atomic | 0xC3, 0xDB | 149 | | div64 | 0x37, 0x3F | 22 | | mod64 | 0x97, 0x9F | 15 | `-----------^------------+-------------| | (total) 186 | `-------------' Setup: build config ------------------- The following configs must be set to have a working JIT test: CONFIG_BPF_JIT=y CONFIG_BPF_JIT_ALWAYS_ON=y CONFIG_TEST_BPF=m The following options are not necessary for the tests module, but are good to have: CONFIG_DEBUG_INFO=y # prerequisite for below CONFIG_DEBUG_INFO_BTF=y # so bpftool can generate vmlinux.h CONFIG_FTRACE=y # CONFIG_BPF_SYSCALL=y # all these options lead to CONFIG_KPROBE_EVENTS=y # having CONFIG_BPF_EVENTS=y CONFIG_PERF_EVENTS=y # Some BPF programs provide data through /sys/kernel/debug: CONFIG_DEBUG_FS=y arc# mount -t debugfs debugfs /sys/kernel/debug Setup: elfutils --------------- The libdw.{so,a} library that is used by pahole for processing the final binary must come from elfutils 0.189 or newer. The support for ARCv2 [1] has been added since that version. [1] https://sourceware.org/git/?p=elfutils.git;a=commit;h=de3d46b3e7 Setup: pahole ------------- The line below in linux/scripts/Makefile.btf must be commented out: pahole-flags-$(call test-ge, $(pahole-ver), 121) += --btf_gen_floats Or else, the build will fail: $ make V=1 ... BTF .btf.vmlinux.bin.o pahole -J --btf_gen_floats \ -j --lang_exclude=rust \ --skip_encoding_btf_inconsistent_proto \ --btf_gen_optimized .tmp_vmlinux.btf Complex, interval and imaginary float types are not supported Encountered error while encoding BTF. ... BTFIDS vmlinux ./tools/bpf/resolve_btfids/resolve_btfids vmlinux libbpf: failed to find '.BTF' ELF section in vmlinux FAILED: load BTF from vmlinux: No data available This is due to the fact that the ARC toolchains generate "complex float" DIE entries in libgcc and at the moment, pahole can't handle such entries. Running the tests ----------------- host$ scp /bld/linux/lib/test_bpf.ko arc: arc # sysctl net.core.bpf_jit_enable=1 arc # insmod test_bpf.ko test_suite=test_bpf ... test_bpf: #1048 Staggered jumps: JMP32_JSLE_X jited:1 697811 PASS test_bpf: Summary: 863 PASSED, 186 FAILED, [851/851 JIT'ed] Acknowledgments --------------- - Claudiu Zissulescu for his unwavering support - Yuriy Kolerov for testing and troubleshooting - Vladimir Isaev for the pahole workaround - Sergey Matyukevich for paving the road by adding the interpreter support Signed-off-by: Shahab Vahedi Link: https://lore.kernel.org/r/20240430145604.38592-1-list+bpf@vahedi.org Signed-off-by: Alexei Starovoitov --- Documentation/admin-guide/sysctl/net.rst | 1 + Documentation/networking/filter.rst | 4 +- MAINTAINERS | 6 + arch/arc/Kbuild | 1 + arch/arc/Kconfig | 1 + arch/arc/net/Makefile | 6 + arch/arc/net/bpf_jit.h | 164 ++ arch/arc/net/bpf_jit_arcv2.c | 3005 ++++++++++++++++++++++ arch/arc/net/bpf_jit_core.c | 1425 ++++++++++ 9 files changed, 4611 insertions(+), 2 deletions(-) create mode 100644 arch/arc/net/Makefile create mode 100644 arch/arc/net/bpf_jit.h create mode 100644 arch/arc/net/bpf_jit_arcv2.c create mode 100644 arch/arc/net/bpf_jit_core.c diff --git a/Documentation/admin-guide/sysctl/net.rst b/Documentation/admin-guide/sysctl/net.rst index 7250c0542828..7b0c4291c686 100644 --- a/Documentation/admin-guide/sysctl/net.rst +++ b/Documentation/admin-guide/sysctl/net.rst @@ -72,6 +72,7 @@ two flavors of JITs, the newer eBPF JIT currently supported on: - riscv64 - riscv32 - loongarch64 + - arc And the older cBPF JIT supported on the following archs: diff --git a/Documentation/networking/filter.rst b/Documentation/networking/filter.rst index 7d8c5380492f..8eb9a5d40f31 100644 --- a/Documentation/networking/filter.rst +++ b/Documentation/networking/filter.rst @@ -513,7 +513,7 @@ JIT compiler ------------ The Linux kernel has a built-in BPF JIT compiler for x86_64, SPARC, -PowerPC, ARM, ARM64, MIPS, RISC-V and s390 and can be enabled through +PowerPC, ARM, ARM64, MIPS, RISC-V, s390, and ARC and can be enabled through CONFIG_BPF_JIT. The JIT compiler is transparently invoked for each attached filter from user space or for internal kernel users if it has been previously enabled by root:: @@ -650,7 +650,7 @@ before a conversion to the new layout is being done behind the scenes! Currently, the classic BPF format is being used for JITing on most 32-bit architectures, whereas x86-64, aarch64, s390x, powerpc64, -sparc64, arm32, riscv64, riscv32, loongarch64 perform JIT compilation +sparc64, arm32, riscv64, riscv32, loongarch64, arc perform JIT compilation from eBPF instruction set. Testing diff --git a/MAINTAINERS b/MAINTAINERS index 943921d642ad..b6a946d24f00 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3712,6 +3712,12 @@ S: Maintained F: Documentation/devicetree/bindings/iio/imu/bosch,bmi323.yaml F: drivers/iio/imu/bmi323/ +BPF JIT for ARC +M: Shahab Vahedi +L: bpf@vger.kernel.org +S: Maintained +F: arch/arc/net/ + BPF JIT for ARM M: Russell King M: Puranjay Mohan diff --git a/arch/arc/Kbuild b/arch/arc/Kbuild index b94102fff68b..20ea7dd482d4 100644 --- a/arch/arc/Kbuild +++ b/arch/arc/Kbuild @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 obj-y += kernel/ obj-y += mm/ +obj-y += net/ # for cleaning subdir- += boot diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index 99d2845f3feb..551fc65a9898 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -52,6 +52,7 @@ config ARC select PCI_SYSCALL if PCI select HAVE_ARCH_JUMP_LABEL if ISA_ARCV2 && !CPU_ENDIAN_BE32 select TRACE_IRQFLAGS_SUPPORT + select HAVE_EBPF_JIT if ISA_ARCV2 config LOCKDEP_SUPPORT def_bool y diff --git a/arch/arc/net/Makefile b/arch/arc/net/Makefile new file mode 100644 index 000000000000..ea5790952e9a --- /dev/null +++ b/arch/arc/net/Makefile @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: GPL-2.0-only + +ifeq ($(CONFIG_ISA_ARCV2),y) + obj-$(CONFIG_BPF_JIT) += bpf_jit_core.o + obj-$(CONFIG_BPF_JIT) += bpf_jit_arcv2.o +endif diff --git a/arch/arc/net/bpf_jit.h b/arch/arc/net/bpf_jit.h new file mode 100644 index 000000000000..ec44873c42d1 --- /dev/null +++ b/arch/arc/net/bpf_jit.h @@ -0,0 +1,164 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * The interface that a back-end should provide to bpf_jit_core.c. + * + * Copyright (c) 2024 Synopsys Inc. + * Author: Shahab Vahedi + */ + +#ifndef _ARC_BPF_JIT_H +#define _ARC_BPF_JIT_H + +#include +#include + +/* Print debug info and assert. */ +//#define ARC_BPF_JIT_DEBUG + +/* Determine the address type of the target. */ +#ifdef CONFIG_ISA_ARCV2 +#define ARC_ADDR u32 +#endif + +/* + * For the translation of some BPF instructions, a temporary register + * might be needed for some interim data. + */ +#define JIT_REG_TMP MAX_BPF_JIT_REG + +/* + * Buffer access: If buffer "b" is not NULL, advance by "n" bytes. + * + * This macro must be used in any place that potentially requires a + * "buf + len". This way, we make sure that the "buf" argument for + * the underlying "arc_*(buf, ...)" ends up as NULL instead of something + * like "0+4" or "0+8", etc. Those "arc_*()" functions check their "buf" + * value to decide if instructions should be emitted or not. + */ +#define BUF(b, n) (((b) != NULL) ? ((b) + (n)) : (b)) + +/************** Functions that the back-end must provide **************/ +/* Extension for 32-bit operations. */ +inline u8 zext(u8 *buf, u8 rd); +/***** Moves *****/ +u8 mov_r32(u8 *buf, u8 rd, u8 rs, u8 sign_ext); +u8 mov_r32_i32(u8 *buf, u8 reg, s32 imm); +u8 mov_r64(u8 *buf, u8 rd, u8 rs, u8 sign_ext); +u8 mov_r64_i32(u8 *buf, u8 reg, s32 imm); +u8 mov_r64_i64(u8 *buf, u8 reg, u32 lo, u32 hi); +/***** Loads and stores *****/ +u8 load_r(u8 *buf, u8 rd, u8 rs, s16 off, u8 size, bool sign_ext); +u8 store_r(u8 *buf, u8 rd, u8 rs, s16 off, u8 size); +u8 store_i(u8 *buf, s32 imm, u8 rd, s16 off, u8 size); +/***** Addition *****/ +u8 add_r32(u8 *buf, u8 rd, u8 rs); +u8 add_r32_i32(u8 *buf, u8 rd, s32 imm); +u8 add_r64(u8 *buf, u8 rd, u8 rs); +u8 add_r64_i32(u8 *buf, u8 rd, s32 imm); +/***** Subtraction *****/ +u8 sub_r32(u8 *buf, u8 rd, u8 rs); +u8 sub_r32_i32(u8 *buf, u8 rd, s32 imm); +u8 sub_r64(u8 *buf, u8 rd, u8 rs); +u8 sub_r64_i32(u8 *buf, u8 rd, s32 imm); +/***** Multiplication *****/ +u8 mul_r32(u8 *buf, u8 rd, u8 rs); +u8 mul_r32_i32(u8 *buf, u8 rd, s32 imm); +u8 mul_r64(u8 *buf, u8 rd, u8 rs); +u8 mul_r64_i32(u8 *buf, u8 rd, s32 imm); +/***** Division *****/ +u8 div_r32(u8 *buf, u8 rd, u8 rs, bool sign_ext); +u8 div_r32_i32(u8 *buf, u8 rd, s32 imm, bool sign_ext); +/***** Remainder *****/ +u8 mod_r32(u8 *buf, u8 rd, u8 rs, bool sign_ext); +u8 mod_r32_i32(u8 *buf, u8 rd, s32 imm, bool sign_ext); +/***** Bitwise AND *****/ +u8 and_r32(u8 *buf, u8 rd, u8 rs); +u8 and_r32_i32(u8 *buf, u8 rd, s32 imm); +u8 and_r64(u8 *buf, u8 rd, u8 rs); +u8 and_r64_i32(u8 *buf, u8 rd, s32 imm); +/***** Bitwise OR *****/ +u8 or_r32(u8 *buf, u8 rd, u8 rs); +u8 or_r32_i32(u8 *buf, u8 rd, s32 imm); +u8 or_r64(u8 *buf, u8 rd, u8 rs); +u8 or_r64_i32(u8 *buf, u8 rd, s32 imm); +/***** Bitwise XOR *****/ +u8 xor_r32(u8 *buf, u8 rd, u8 rs); +u8 xor_r32_i32(u8 *buf, u8 rd, s32 imm); +u8 xor_r64(u8 *buf, u8 rd, u8 rs); +u8 xor_r64_i32(u8 *buf, u8 rd, s32 imm); +/***** Bitwise Negate *****/ +u8 neg_r32(u8 *buf, u8 r); +u8 neg_r64(u8 *buf, u8 r); +/***** Bitwise left shift *****/ +u8 lsh_r32(u8 *buf, u8 rd, u8 rs); +u8 lsh_r32_i32(u8 *buf, u8 rd, u8 imm); +u8 lsh_r64(u8 *buf, u8 rd, u8 rs); +u8 lsh_r64_i32(u8 *buf, u8 rd, s32 imm); +/***** Bitwise right shift (logical) *****/ +u8 rsh_r32(u8 *buf, u8 rd, u8 rs); +u8 rsh_r32_i32(u8 *buf, u8 rd, u8 imm); +u8 rsh_r64(u8 *buf, u8 rd, u8 rs); +u8 rsh_r64_i32(u8 *buf, u8 rd, s32 imm); +/***** Bitwise right shift (arithmetic) *****/ +u8 arsh_r32(u8 *buf, u8 rd, u8 rs); +u8 arsh_r32_i32(u8 *buf, u8 rd, u8 imm); +u8 arsh_r64(u8 *buf, u8 rd, u8 rs); +u8 arsh_r64_i32(u8 *buf, u8 rd, s32 imm); +/***** Frame related *****/ +u32 mask_for_used_regs(u8 bpf_reg, bool is_call); +u8 arc_prologue(u8 *buf, u32 usage, u16 frame_size); +u8 arc_epilogue(u8 *buf, u32 usage, u16 frame_size); +/***** Jumps *****/ +/* + * Different sorts of conditions (ARC enum as opposed to BPF_*). + * + * Do not change the order of enums here. ARC_CC_SLE+1 is used + * to determine the number of JCCs. + */ +enum ARC_CC { + ARC_CC_UGT = 0, /* unsigned > */ + ARC_CC_UGE, /* unsigned >= */ + ARC_CC_ULT, /* unsigned < */ + ARC_CC_ULE, /* unsigned <= */ + ARC_CC_SGT, /* signed > */ + ARC_CC_SGE, /* signed >= */ + ARC_CC_SLT, /* signed < */ + ARC_CC_SLE, /* signed <= */ + ARC_CC_AL, /* always */ + ARC_CC_EQ, /* == */ + ARC_CC_NE, /* != */ + ARC_CC_SET, /* test */ + ARC_CC_LAST +}; + +/* + * A few notes: + * + * - check_jmp_*() are prerequisites before calling the gen_jmp_*(). + * They return "true" if the jump is possible and "false" otherwise. + * + * - The notion of "*_off" is to emphasize that these parameters are + * merely offsets in the JIT stream and not absolute addresses. One + * can look at them as addresses if the JIT code would start from + * address 0x0000_0000. Nonetheless, since the buffer address for the + * JIT is on a word-aligned address, this works and actually makes + * things simpler (offsets are in the range of u32 which is more than + * enough). + */ +bool check_jmp_32(u32 curr_off, u32 targ_off, u8 cond); +bool check_jmp_64(u32 curr_off, u32 targ_off, u8 cond); +u8 gen_jmp_32(u8 *buf, u8 rd, u8 rs, u8 cond, u32 c_off, u32 t_off); +u8 gen_jmp_64(u8 *buf, u8 rd, u8 rs, u8 cond, u32 c_off, u32 t_off); +/***** Miscellaneous *****/ +u8 gen_func_call(u8 *buf, ARC_ADDR func_addr, bool external_func); +u8 arc_to_bpf_return(u8 *buf); +/* + * - Perform byte swaps on "rd" based on the "size". + * - If "force" is set, do it unconditionally. Otherwise, consider the + * desired "endian"ness and the host endianness. + * - For data "size"s up to 32 bits, perform a zero-extension if asked + * by the "do_zext" boolean. + */ +u8 gen_swap(u8 *buf, u8 rd, u8 size, u8 endian, bool force, bool do_zext); + +#endif /* _ARC_BPF_JIT_H */ diff --git a/arch/arc/net/bpf_jit_arcv2.c b/arch/arc/net/bpf_jit_arcv2.c new file mode 100644 index 000000000000..31bfb6e9ce00 --- /dev/null +++ b/arch/arc/net/bpf_jit_arcv2.c @@ -0,0 +1,3005 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * The ARCv2 backend of Just-In-Time compiler for eBPF bytecode. + * + * Copyright (c) 2024 Synopsys Inc. + * Author: Shahab Vahedi + */ +#include +#include "bpf_jit.h" + +/* ARC core registers. */ +enum { + ARC_R_0, ARC_R_1, ARC_R_2, ARC_R_3, ARC_R_4, ARC_R_5, + ARC_R_6, ARC_R_7, ARC_R_8, ARC_R_9, ARC_R_10, ARC_R_11, + ARC_R_12, ARC_R_13, ARC_R_14, ARC_R_15, ARC_R_16, ARC_R_17, + ARC_R_18, ARC_R_19, ARC_R_20, ARC_R_21, ARC_R_22, ARC_R_23, + ARC_R_24, ARC_R_25, ARC_R_26, ARC_R_FP, ARC_R_SP, ARC_R_ILINK, + ARC_R_30, ARC_R_BLINK, + /* + * Having ARC_R_IMM encoded as source register means there is an + * immediate that must be interpreted from the next 4 bytes. If + * encoded as the destination register though, it implies that the + * output of the operation is not assigned to any register. The + * latter is helpful if we only care about updating the CPU status + * flags. + */ + ARC_R_IMM = 62 +}; + +/* + * Remarks about the rationale behind the chosen mapping: + * + * - BPF_REG_{1,2,3,4} are the argument registers and must be mapped to + * argument registers in ARCv2 ABI: r0-r7. The r7 registers is the last + * argument register in the ABI. Therefore BPF_REG_5, as the fifth + * argument, must be pushed onto the stack. This is a must for calling + * in-kernel functions. + * + * - In ARCv2 ABI, the return value is in r0 for 32-bit results and (r1,r0) + * for 64-bit results. However, because they're already used for BPF_REG_1, + * the next available scratch registers, r8 and r9, are the best candidates + * for BPF_REG_0. After a "call" to a(n) (in-kernel) function, the result + * is "mov"ed to these registers. At a BPF_EXIT, their value is "mov"ed to + * (r1,r0). + * It is worth mentioning that scratch registers are the best choice for + * BPF_REG_0, because it is very popular in BPF instruction encoding. + * + * - JIT_REG_TMP is an artifact needed to translate some BPF instructions. + * Its life span is one single BPF instruction. Since during the + * analyze_reg_usage(), it is not known if temporary registers are used, + * it is mapped to ARC's scratch registers: r10 and r11. Therefore, they + * don't matter in analysing phase and don't need saving. This temporary + * register is added as yet another index in the bpf2arc array, so it will + * unfold like the rest of registers during the code generation process. + * + * - Mapping of callee-saved BPF registers, BPF_REG_{6,7,8,9}, starts from + * (r15,r14) register pair. The (r13,r12) is not a good choice, because + * in ARCv2 ABI, r12 is not a callee-saved register and this can cause + * problem when calling an in-kernel function. Theoretically, the mapping + * could start from (r14,r13), but it is not a conventional ARCv2 register + * pair. To have a future proof design, I opted for this arrangement. + * If/when we decide to add ARCv2 instructions that do use register pairs, + * the mapping, hopefully, doesn't need to be revisited. + */ +const u8 bpf2arc[][2] = { + /* Return value from in-kernel function, and exit value from eBPF */ + [BPF_REG_0] = {ARC_R_8, ARC_R_9}, + /* Arguments from eBPF program to in-kernel function */ + [BPF_REG_1] = {ARC_R_0, ARC_R_1}, + [BPF_REG_2] = {ARC_R_2, ARC_R_3}, + [BPF_REG_3] = {ARC_R_4, ARC_R_5}, + [BPF_REG_4] = {ARC_R_6, ARC_R_7}, + /* Remaining arguments, to be passed on the stack per 32-bit ABI */ + [BPF_REG_5] = {ARC_R_22, ARC_R_23}, + /* Callee-saved registers that in-kernel function will preserve */ + [BPF_REG_6] = {ARC_R_14, ARC_R_15}, + [BPF_REG_7] = {ARC_R_16, ARC_R_17}, + [BPF_REG_8] = {ARC_R_18, ARC_R_19}, + [BPF_REG_9] = {ARC_R_20, ARC_R_21}, + /* Read-only frame pointer to access the eBPF stack. 32-bit only. */ + [BPF_REG_FP] = {ARC_R_FP, }, + /* Register for blinding constants */ + [BPF_REG_AX] = {ARC_R_24, ARC_R_25}, + /* Temporary registers for internal use */ + [JIT_REG_TMP] = {ARC_R_10, ARC_R_11} +}; + +#define ARC_CALLEE_SAVED_REG_FIRST ARC_R_13 +#define ARC_CALLEE_SAVED_REG_LAST ARC_R_25 + +#define REG_LO(r) (bpf2arc[(r)][0]) +#define REG_HI(r) (bpf2arc[(r)][1]) + +/* + * To comply with ARCv2 ABI, BPF's arg5 must be put on stack. After which, + * the stack needs to be restored by ARG5_SIZE. + */ +#define ARG5_SIZE 8 + +/* Instruction lengths in bytes. */ +enum { + INSN_len_normal = 4, /* Normal instructions length. */ + INSN_len_imm = 4 /* Length of an extra 32-bit immediate. */ +}; + +/* ZZ defines the size of operation in encodings that it is used. */ +enum { + ZZ_1_byte = 1, + ZZ_2_byte = 2, + ZZ_4_byte = 0, + ZZ_8_byte = 3 +}; + +/* + * AA is mostly about address write back mode. It determines if the + * address in question should be updated before usage or after: + * addr += offset; data = *addr; + * data = *addr; addr += offset; + * + * In "scaling" mode, the effective address will become the sum + * of "address" + "index"*"size". The "size" is specified by the + * "ZZ" field. There is no write back when AA is set for scaling: + * data = *(addr + offset<= 0) +#define IN_S9_RANGE(x) ((x) <= (0x100 - 1) && (x) >= -0x100) +#define IN_S12_RANGE(x) ((x) <= (0x800 - 1) && (x) >= -0x800) +#define IN_S21_RANGE(x) ((x) <= (0x100000 - 1) && (x) >= -0x100000) +#define IN_S25_RANGE(x) ((x) <= (0x1000000 - 1) && (x) >= -0x1000000) + +/* Operands in most of the encodings. */ +#define OP_A(x) ((x) & 0x03f) +#define OP_B(x) ((((x) & 0x07) << 24) | (((x) & 0x38) << 9)) +#define OP_C(x) (((x) & 0x03f) << 6) +#define OP_IMM (OP_C(ARC_R_IMM)) +#define COND(x) (OP_A((x) & 31)) +#define FLAG(x) (((x) & 1) << 15) + +/* + * The 4-byte encoding of "mov b,c": + * + * 0010_0bbb 0000_1010 0BBB_cccc cc00_0000 + * + * b: BBBbbb destination register + * c: cccccc source register + */ +#define OPC_MOV 0x200a0000 + +/* + * The 4-byte encoding of "mov b,s12" (used for moving small immediates): + * + * 0010_0bbb 1000_1010 0BBB_ssss ssSS_SSSS + * + * b: BBBbbb destination register + * s: SSSSSSssssss source immediate (signed) + */ +#define OPC_MOVI 0x208a0000 +#define MOVI_S12(x) ((((x) & 0xfc0) >> 6) | (((x) & 0x3f) << 6)) + +/* + * The 4-byte encoding of "mov[.qq] b,u6", used for conditional + * moving of even smaller immediates: + * + * 0010_0bbb 1100_1010 0BBB_cccc cciq_qqqq + * + * qq: qqqqq condition code + * i: If set, c is considered a 6-bit immediate, else a reg. + * + * b: BBBbbb destination register + * c: cccccc source + */ +#define OPC_MOV_CC 0x20ca0000 +#define MOV_CC_I BIT(5) +#define OPC_MOVU_CC (OPC_MOV_CC | MOV_CC_I) + +/* + * The 4-byte encoding of "sexb b,c" (8-bit sign extension): + * + * 0010_0bbb 0010_1111 0BBB_cccc cc00_0101 + * + * b: BBBbbb destination register + * c: cccccc source register + */ +#define OPC_SEXB 0x202f0005 + +/* + * The 4-byte encoding of "sexh b,c" (16-bit sign extension): + * + * 0010_0bbb 0010_1111 0BBB_cccc cc00_0110 + * + * b: BBBbbb destination register + * c: cccccc source register + */ +#define OPC_SEXH 0x202f0006 + +/* + * The 4-byte encoding of "ld[zz][.x][.aa] c,[b,s9]": + * + * 0001_0bbb ssss_ssss SBBB_0aaz zxcc_cccc + * + * zz: size mode + * aa: address write back mode + * x: extension mode + * + * s9: S_ssss_ssss 9-bit signed number + * b: BBBbbb source reg for address + * c: cccccc destination register + */ +#define OPC_LOAD 0x10000000 +#define LOAD_X(x) ((x) << 6) +#define LOAD_ZZ(x) ((x) << 7) +#define LOAD_AA(x) ((x) << 9) +#define LOAD_S9(x) ((((x) & 0x0ff) << 16) | (((x) & 0x100) << 7)) +#define LOAD_C(x) ((x) & 0x03f) +/* Unsigned and signed loads. */ +#define OPC_LDU (OPC_LOAD | LOAD_X(X_zero)) +#define OPC_LDS (OPC_LOAD | LOAD_X(X_sign)) +/* 32-bit load. */ +#define OPC_LD32 (OPC_LDU | LOAD_ZZ(ZZ_4_byte)) +/* "pop reg" is merely a "ld.ab reg,[sp,4]". */ +#define OPC_POP \ + (OPC_LD32 | LOAD_AA(AA_post) | LOAD_S9(4) | OP_B(ARC_R_SP)) + +/* + * The 4-byte encoding of "st[zz][.aa] c,[b,s9]": + * + * 0001_1bbb ssss_ssss SBBB_cccc cc0a_azz0 + * + * zz: zz size mode + * aa: aa address write back mode + * + * s9: S_ssss_ssss 9-bit signed number + * b: BBBbbb source reg for address + * c: cccccc source reg to be stored + */ +#define OPC_STORE 0x18000000 +#define STORE_ZZ(x) ((x) << 1) +#define STORE_AA(x) ((x) << 3) +#define STORE_S9(x) ((((x) & 0x0ff) << 16) | (((x) & 0x100) << 7)) +/* 32-bit store. */ +#define OPC_ST32 (OPC_STORE | STORE_ZZ(ZZ_4_byte)) +/* "push reg" is merely a "st.aw reg,[sp,-4]". */ +#define OPC_PUSH \ + (OPC_ST32 | STORE_AA(AA_pre) | STORE_S9(-4) | OP_B(ARC_R_SP)) + +/* + * The 4-byte encoding of "add a,b,c": + * + * 0010_0bbb 0i00_0000 fBBB_cccc ccaa_aaaa + * + * f: indicates if flags (carry, etc.) should be updated + * i: If set, c is considered a 6-bit immediate, else a reg. + * + * a: aaaaaa result + * b: BBBbbb the 1st input operand + * c: cccccc the 2nd input operand + */ +#define OPC_ADD 0x20000000 +/* Addition with updating the pertinent flags in "status32" register. */ +#define OPC_ADDF (OPC_ADD | FLAG(1)) +#define ADDI BIT(22) +#define ADDI_U6(x) OP_C(x) +#define OPC_ADDI (OPC_ADD | ADDI) +#define OPC_ADDIF (OPC_ADDI | FLAG(1)) +#define OPC_ADD_I (OPC_ADD | OP_IMM) + +/* + * The 4-byte encoding of "adc a,b,c" (addition with carry): + * + * 0010_0bbb 0i00_0001 0BBB_cccc ccaa_aaaa + * + * i: if set, c is considered a 6-bit immediate, else a reg. + * + * a: aaaaaa result + * b: BBBbbb the 1st input operand + * c: cccccc the 2nd input operand + */ +#define OPC_ADC 0x20010000 +#define ADCI BIT(22) +#define ADCI_U6(x) OP_C(x) +#define OPC_ADCI (OPC_ADC | ADCI) + +/* + * The 4-byte encoding of "sub a,b,c": + * + * 0010_0bbb 0i00_0010 fBBB_cccc ccaa_aaaa + * + * f: indicates if flags (carry, etc.) should be updated + * i: if set, c is considered a 6-bit immediate, else a reg. + * + * a: aaaaaa result + * b: BBBbbb the 1st input operand + * c: cccccc the 2nd input operand + */ +#define OPC_SUB 0x20020000 +/* Subtraction with updating the pertinent flags in "status32" register. */ +#define OPC_SUBF (OPC_SUB | FLAG(1)) +#define SUBI BIT(22) +#define SUBI_U6(x) OP_C(x) +#define OPC_SUBI (OPC_SUB | SUBI) +#define OPC_SUB_I (OPC_SUB | OP_IMM) + +/* + * The 4-byte encoding of "sbc a,b,c" (subtraction with carry): + * + * 0010_0bbb 0000_0011 fBBB_cccc ccaa_aaaa + * + * f: indicates if flags (carry, etc.) should be updated + * + * a: aaaaaa result + * b: BBBbbb the 1st input operand + * c: cccccc the 2nd input operand + */ +#define OPC_SBC 0x20030000 + +/* + * The 4-byte encoding of "cmp[.qq] b,c": + * + * 0010_0bbb 1100_1100 1BBB_cccc cc0q_qqqq + * + * qq: qqqqq condition code + * + * b: BBBbbb the 1st operand + * c: cccccc the 2nd operand + */ +#define OPC_CMP 0x20cc8000 + +/* + * The 4-byte encoding of "neg a,b": + * + * 0010_0bbb 0100_1110 0BBB_0000 00aa_aaaa + * + * a: aaaaaa result + * b: BBBbbb input + */ +#define OPC_NEG 0x204e0000 + +/* + * The 4-byte encoding of "mpy a,b,c". + * mpy is the signed 32-bit multiplication with the lower 32-bit + * of the product as the result. + * + * 0010_0bbb 0001_1010 0BBB_cccc ccaa_aaaa + * + * a: aaaaaa result + * b: BBBbbb the 1st input operand + * c: cccccc the 2nd input operand + */ +#define OPC_MPY 0x201a0000 +#define OPC_MPYI (OPC_MPY | OP_IMM) + +/* + * The 4-byte encoding of "mpydu a,b,c". + * mpydu is the unsigned 32-bit multiplication with the lower 32-bit of + * the product in register "a" and the higher 32-bit in register "a+1". + * + * 0010_1bbb 0001_1001 0BBB_cccc ccaa_aaaa + * + * a: aaaaaa 64-bit result in registers (R_a+1,R_a) + * b: BBBbbb the 1st input operand + * c: cccccc the 2nd input operand + */ +#define OPC_MPYDU 0x28190000 +#define OPC_MPYDUI (OPC_MPYDU | OP_IMM) + +/* + * The 4-byte encoding of "divu a,b,c" (unsigned division): + * + * 0010_1bbb 0000_0101 0BBB_cccc ccaa_aaaa + * + * a: aaaaaa result (quotient) + * b: BBBbbb the 1st input operand + * c: cccccc the 2nd input operand (divisor) + */ +#define OPC_DIVU 0x28050000 +#define OPC_DIVUI (OPC_DIVU | OP_IMM) + +/* + * The 4-byte encoding of "div a,b,c" (signed division): + * + * 0010_1bbb 0000_0100 0BBB_cccc ccaa_aaaa + * + * a: aaaaaa result (quotient) + * b: BBBbbb the 1st input operand + * c: cccccc the 2nd input operand (divisor) + */ +#define OPC_DIVS 0x28040000 +#define OPC_DIVSI (OPC_DIVS | OP_IMM) + +/* + * The 4-byte encoding of "remu a,b,c" (unsigned remainder): + * + * 0010_1bbb 0000_1001 0BBB_cccc ccaa_aaaa + * + * a: aaaaaa result (remainder) + * b: BBBbbb the 1st input operand + * c: cccccc the 2nd input operand (divisor) + */ +#define OPC_REMU 0x28090000 +#define OPC_REMUI (OPC_REMU | OP_IMM) + +/* + * The 4-byte encoding of "rem a,b,c" (signed remainder): + * + * 0010_1bbb 0000_1000 0BBB_cccc ccaa_aaaa + * + * a: aaaaaa result (remainder) + * b: BBBbbb the 1st input operand + * c: cccccc the 2nd input operand (divisor) + */ +#define OPC_REMS 0x28080000 +#define OPC_REMSI (OPC_REMS | OP_IMM) + +/* + * The 4-byte encoding of "and a,b,c": + * + * 0010_0bbb 0000_0100 fBBB_cccc ccaa_aaaa + * + * f: indicates if zero and negative flags should be updated + * + * a: aaaaaa result + * b: BBBbbb the 1st input operand + * c: cccccc the 2nd input operand + */ +#define OPC_AND 0x20040000 +#define OPC_ANDI (OPC_AND | OP_IMM) + +/* + * The 4-byte encoding of "tst[.qq] b,c". + * Checks if the two input operands have any bit set at the same + * position. + * + * 0010_0bbb 1100_1011 1BBB_cccc cc0q_qqqq + * + * qq: qqqqq condition code + * + * b: BBBbbb the 1st input operand + * c: cccccc the 2nd input operand + */ +#define OPC_TST 0x20cb8000 + +/* + * The 4-byte encoding of "or a,b,c": + * + * 0010_0bbb 0000_0101 0BBB_cccc ccaa_aaaa + * + * a: aaaaaa result + * b: BBBbbb the 1st input operand + * c: cccccc the 2nd input operand + */ +#define OPC_OR 0x20050000 +#define OPC_ORI (OPC_OR | OP_IMM) + +/* + * The 4-byte encoding of "xor a,b,c": + * + * 0010_0bbb 0000_0111 0BBB_cccc ccaa_aaaa + * + * a: aaaaaa result + * b: BBBbbb the 1st input operand + * c: cccccc the 2nd input operand + */ +#define OPC_XOR 0x20070000 +#define OPC_XORI (OPC_XOR | OP_IMM) + +/* + * The 4-byte encoding of "not b,c": + * + * 0010_0bbb 0010_1111 0BBB_cccc cc00_1010 + * + * b: BBBbbb result + * c: cccccc input + */ +#define OPC_NOT 0x202f000a + +/* + * The 4-byte encoding of "btst b,u6": + * + * 0010_0bbb 0101_0001 1BBB_uuuu uu00_0000 + * + * b: BBBbbb input number to check + * u6: uuuuuu 6-bit unsigned number specifying bit position to check + */ +#define OPC_BTSTU6 0x20518000 +#define BTST_U6(x) (OP_C((x) & 63)) + +/* + * The 4-byte encoding of "asl[.qq] b,b,c" (arithmetic shift left): + * + * 0010_1bbb 0i00_0000 0BBB_cccc ccaa_aaaa + * + * i: if set, c is considered a 5-bit immediate, else a reg. + * + * b: BBBbbb result and the first operand (number to be shifted) + * c: cccccc amount to be shifted + */ +#define OPC_ASL 0x28000000 +#define ASL_I BIT(22) +#define ASLI_U6(x) OP_C((x) & 31) +#define OPC_ASLI (OPC_ASL | ASL_I) + +/* + * The 4-byte encoding of "asr a,b,c" (arithmetic shift right): + * + * 0010_1bbb 0i00_0010 0BBB_cccc ccaa_aaaa + * + * i: if set, c is considered a 6-bit immediate, else a reg. + * + * a: aaaaaa result + * b: BBBbbb first input: number to be shifted + * c: cccccc second input: amount to be shifted + */ +#define OPC_ASR 0x28020000 +#define ASR_I ASL_I +#define ASRI_U6(x) ASLI_U6(x) +#define OPC_ASRI (OPC_ASR | ASR_I) + +/* + * The 4-byte encoding of "lsr a,b,c" (logical shift right): + * + * 0010_1bbb 0i00_0001 0BBB_cccc ccaa_aaaa + * + * i: if set, c is considered a 6-bit immediate, else a reg. + * + * a: aaaaaa result + * b: BBBbbb first input: number to be shifted + * c: cccccc second input: amount to be shifted + */ +#define OPC_LSR 0x28010000 +#define LSR_I ASL_I +#define LSRI_U6(x) ASLI_U6(x) +#define OPC_LSRI (OPC_LSR | LSR_I) + +/* + * The 4-byte encoding of "swape b,c": + * + * 0010_1bbb 0010_1111 0bbb_cccc cc00_1001 + * + * b: BBBbbb destination register + * c: cccccc source register + */ +#define OPC_SWAPE 0x282f0009 + +/* + * Encoding for jump to an address in register: + * j reg_c + * + * 0010_0000 1110_0000 0000_cccc cc00_0000 + * + * c: cccccc register holding the destination address + */ +#define OPC_JMP 0x20e00000 +/* Jump to "branch-and-link" register, which effectively is a "return". */ +#define OPC_J_BLINK (OPC_JMP | OP_C(ARC_R_BLINK)) + +/* + * Encoding for jump-and-link to an address in register: + * jl reg_c + * + * 0010_0000 0010_0010 0000_cccc cc00_0000 + * + * c: cccccc register holding the destination address + */ +#define OPC_JL 0x20220000 + +/* + * Encoding for (conditional) branch to an offset from the current location + * that is word aligned: (PC & 0xffff_fffc) + s21 + * B[qq] s21 + * + * 0000_0sss ssss_sss0 SSSS_SSSS SS0q_qqqq + * + * qq: qqqqq condition code + * s21: SSSS SSSS_SSss ssss_ssss The displacement (21-bit signed) + * + * The displacement is supposed to be 16-bit (2-byte) aligned. Therefore, + * it should be a multiple of 2. Hence, there is an implied '0' bit at its + * LSB: S_SSSS SSSS_Ssss ssss_sss0 + */ +#define OPC_BCC 0x00000000 +#define BCC_S21(d) ((((d) & 0x7fe) << 16) | (((d) & 0x1ff800) >> 5)) + +/* + * Encoding for unconditional branch to an offset from the current location + * that is word aligned: (PC & 0xffff_fffc) + s25 + * B s25 + * + * 0000_0sss ssss_sss1 SSSS_SSSS SS00_TTTT + * + * s25: TTTT SSSS SSSS_SSss ssss_ssss The displacement (25-bit signed) + * + * The displacement is supposed to be 16-bit (2-byte) aligned. Therefore, + * it should be a multiple of 2. Hence, there is an implied '0' bit at its + * LSB: T TTTS_SSSS SSSS_Ssss ssss_sss0 + */ +#define OPC_B 0x00010000 +#define B_S25(d) ((((d) & 0x1e00000) >> 21) | BCC_S21(d)) + +static inline void emit_2_bytes(u8 *buf, u16 bytes) +{ + *((u16 *)buf) = bytes; +} + +static inline void emit_4_bytes(u8 *buf, u32 bytes) +{ + emit_2_bytes(buf, bytes >> 16); + emit_2_bytes(buf + 2, bytes & 0xffff); +} + +static inline u8 bpf_to_arc_size(u8 size) +{ + switch (size) { + case BPF_B: + return ZZ_1_byte; + case BPF_H: + return ZZ_2_byte; + case BPF_W: + return ZZ_4_byte; + case BPF_DW: + return ZZ_8_byte; + default: + return ZZ_4_byte; + } +} + +/************** Encoders (Deal with ARC regs) ************/ + +/* Move an immediate to register with a 4-byte instruction. */ +static u8 arc_movi_r(u8 *buf, u8 reg, s16 imm) +{ + const u32 insn = OPC_MOVI | OP_B(reg) | MOVI_S12(imm); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* rd <- rs */ +static u8 arc_mov_r(u8 *buf, u8 rd, u8 rs) +{ + const u32 insn = OPC_MOV | OP_B(rd) | OP_C(rs); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* The emitted code may have different sizes based on "imm". */ +static u8 arc_mov_i(u8 *buf, u8 rd, s32 imm) +{ + const u32 insn = OPC_MOV | OP_B(rd) | OP_IMM; + + if (IN_S12_RANGE(imm)) + return arc_movi_r(buf, rd, imm); + + if (buf) { + emit_4_bytes(buf, insn); + emit_4_bytes(buf + INSN_len_normal, imm); + } + return INSN_len_normal + INSN_len_imm; +} + +/* The emitted code will always have the same size (8). */ +static u8 arc_mov_i_fixed(u8 *buf, u8 rd, s32 imm) +{ + const u32 insn = OPC_MOV | OP_B(rd) | OP_IMM; + + if (buf) { + emit_4_bytes(buf, insn); + emit_4_bytes(buf + INSN_len_normal, imm); + } + return INSN_len_normal + INSN_len_imm; +} + +/* Conditional move. */ +static u8 arc_mov_cc_r(u8 *buf, u8 cc, u8 rd, u8 rs) +{ + const u32 insn = OPC_MOV_CC | OP_B(rd) | OP_C(rs) | COND(cc); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* Conditional move of a small immediate to rd. */ +static u8 arc_movu_cc_r(u8 *buf, u8 cc, u8 rd, u8 imm) +{ + const u32 insn = OPC_MOVU_CC | OP_B(rd) | OP_C(imm) | COND(cc); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* Sign extension from a byte. */ +static u8 arc_sexb_r(u8 *buf, u8 rd, u8 rs) +{ + const u32 insn = OPC_SEXB | OP_B(rd) | OP_C(rs); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* Sign extension from two bytes. */ +static u8 arc_sexh_r(u8 *buf, u8 rd, u8 rs) +{ + const u32 insn = OPC_SEXH | OP_B(rd) | OP_C(rs); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* st reg, [reg_mem, off] */ +static u8 arc_st_r(u8 *buf, u8 reg, u8 reg_mem, s16 off, u8 zz) +{ + const u32 insn = OPC_STORE | STORE_ZZ(zz) | OP_C(reg) | + OP_B(reg_mem) | STORE_S9(off); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* st.aw reg, [sp, -4] */ +static u8 arc_push_r(u8 *buf, u8 reg) +{ + const u32 insn = OPC_PUSH | OP_C(reg); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* ld reg, [reg_mem, off] (unsigned) */ +static u8 arc_ld_r(u8 *buf, u8 reg, u8 reg_mem, s16 off, u8 zz) +{ + const u32 insn = OPC_LDU | LOAD_ZZ(zz) | LOAD_C(reg) | + OP_B(reg_mem) | LOAD_S9(off); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* ld.x reg, [reg_mem, off] (sign extend) */ +static u8 arc_ldx_r(u8 *buf, u8 reg, u8 reg_mem, s16 off, u8 zz) +{ + const u32 insn = OPC_LDS | LOAD_ZZ(zz) | LOAD_C(reg) | + OP_B(reg_mem) | LOAD_S9(off); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* ld.ab reg,[sp,4] */ +static u8 arc_pop_r(u8 *buf, u8 reg) +{ + const u32 insn = OPC_POP | LOAD_C(reg); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* add Ra,Ra,Rc */ +static u8 arc_add_r(u8 *buf, u8 ra, u8 rc) +{ + const u32 insn = OPC_ADD | OP_A(ra) | OP_B(ra) | OP_C(rc); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* add.f Ra,Ra,Rc */ +static u8 arc_addf_r(u8 *buf, u8 ra, u8 rc) +{ + const u32 insn = OPC_ADDF | OP_A(ra) | OP_B(ra) | OP_C(rc); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* add.f Ra,Ra,u6 */ +static u8 arc_addif_r(u8 *buf, u8 ra, u8 u6) +{ + const u32 insn = OPC_ADDIF | OP_A(ra) | OP_B(ra) | ADDI_U6(u6); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* add Ra,Ra,u6 */ +static u8 arc_addi_r(u8 *buf, u8 ra, u8 u6) +{ + const u32 insn = OPC_ADDI | OP_A(ra) | OP_B(ra) | ADDI_U6(u6); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* add Ra,Rb,imm */ +static u8 arc_add_i(u8 *buf, u8 ra, u8 rb, s32 imm) +{ + const u32 insn = OPC_ADD_I | OP_A(ra) | OP_B(rb); + + if (buf) { + emit_4_bytes(buf, insn); + emit_4_bytes(buf + INSN_len_normal, imm); + } + return INSN_len_normal + INSN_len_imm; +} + +/* adc Ra,Ra,Rc */ +static u8 arc_adc_r(u8 *buf, u8 ra, u8 rc) +{ + const u32 insn = OPC_ADC | OP_A(ra) | OP_B(ra) | OP_C(rc); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* adc Ra,Ra,u6 */ +static u8 arc_adci_r(u8 *buf, u8 ra, u8 u6) +{ + const u32 insn = OPC_ADCI | OP_A(ra) | OP_B(ra) | ADCI_U6(u6); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* sub Ra,Ra,Rc */ +static u8 arc_sub_r(u8 *buf, u8 ra, u8 rc) +{ + const u32 insn = OPC_SUB | OP_A(ra) | OP_B(ra) | OP_C(rc); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* sub.f Ra,Ra,Rc */ +static u8 arc_subf_r(u8 *buf, u8 ra, u8 rc) +{ + const u32 insn = OPC_SUBF | OP_A(ra) | OP_B(ra) | OP_C(rc); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* sub Ra,Ra,u6 */ +static u8 arc_subi_r(u8 *buf, u8 ra, u8 u6) +{ + const u32 insn = OPC_SUBI | OP_A(ra) | OP_B(ra) | SUBI_U6(u6); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* sub Ra,Ra,imm */ +static u8 arc_sub_i(u8 *buf, u8 ra, s32 imm) +{ + const u32 insn = OPC_SUB_I | OP_A(ra) | OP_B(ra); + + if (buf) { + emit_4_bytes(buf, insn); + emit_4_bytes(buf + INSN_len_normal, imm); + } + return INSN_len_normal + INSN_len_imm; +} + +/* sbc Ra,Ra,Rc */ +static u8 arc_sbc_r(u8 *buf, u8 ra, u8 rc) +{ + const u32 insn = OPC_SBC | OP_A(ra) | OP_B(ra) | OP_C(rc); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* cmp Rb,Rc */ +static u8 arc_cmp_r(u8 *buf, u8 rb, u8 rc) +{ + const u32 insn = OPC_CMP | OP_B(rb) | OP_C(rc); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* + * cmp.z Rb,Rc + * + * This "cmp.z" variant of compare instruction is used on lower + * 32-bits of register pairs after "cmp"ing their upper parts. If the + * upper parts are equal (z), then this one will proceed to check the + * rest. + */ +static u8 arc_cmpz_r(u8 *buf, u8 rb, u8 rc) +{ + const u32 insn = OPC_CMP | OP_B(rb) | OP_C(rc) | CC_equal; + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* neg Ra,Rb */ +static u8 arc_neg_r(u8 *buf, u8 ra, u8 rb) +{ + const u32 insn = OPC_NEG | OP_A(ra) | OP_B(rb); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* mpy Ra,Rb,Rc */ +static u8 arc_mpy_r(u8 *buf, u8 ra, u8 rb, u8 rc) +{ + const u32 insn = OPC_MPY | OP_A(ra) | OP_B(rb) | OP_C(rc); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* mpy Ra,Rb,imm */ +static u8 arc_mpy_i(u8 *buf, u8 ra, u8 rb, s32 imm) +{ + const u32 insn = OPC_MPYI | OP_A(ra) | OP_B(rb); + + if (buf) { + emit_4_bytes(buf, insn); + emit_4_bytes(buf + INSN_len_normal, imm); + } + return INSN_len_normal + INSN_len_imm; +} + +/* mpydu Ra,Ra,Rc */ +static u8 arc_mpydu_r(u8 *buf, u8 ra, u8 rc) +{ + const u32 insn = OPC_MPYDU | OP_A(ra) | OP_B(ra) | OP_C(rc); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* mpydu Ra,Ra,imm */ +static u8 arc_mpydu_i(u8 *buf, u8 ra, s32 imm) +{ + const u32 insn = OPC_MPYDUI | OP_A(ra) | OP_B(ra); + + if (buf) { + emit_4_bytes(buf, insn); + emit_4_bytes(buf + INSN_len_normal, imm); + } + return INSN_len_normal + INSN_len_imm; +} + +/* divu Rd,Rd,Rs */ +static u8 arc_divu_r(u8 *buf, u8 rd, u8 rs) +{ + const u32 insn = OPC_DIVU | OP_A(rd) | OP_B(rd) | OP_C(rs); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* divu Rd,Rd,imm */ +static u8 arc_divu_i(u8 *buf, u8 rd, s32 imm) +{ + const u32 insn = OPC_DIVUI | OP_A(rd) | OP_B(rd); + + if (buf) { + emit_4_bytes(buf, insn); + emit_4_bytes(buf + INSN_len_normal, imm); + } + return INSN_len_normal + INSN_len_imm; +} + +/* div Rd,Rd,Rs */ +static u8 arc_divs_r(u8 *buf, u8 rd, u8 rs) +{ + const u32 insn = OPC_DIVS | OP_A(rd) | OP_B(rd) | OP_C(rs); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* div Rd,Rd,imm */ +static u8 arc_divs_i(u8 *buf, u8 rd, s32 imm) +{ + const u32 insn = OPC_DIVSI | OP_A(rd) | OP_B(rd); + + if (buf) { + emit_4_bytes(buf, insn); + emit_4_bytes(buf + INSN_len_normal, imm); + } + return INSN_len_normal + INSN_len_imm; +} + +/* remu Rd,Rd,Rs */ +static u8 arc_remu_r(u8 *buf, u8 rd, u8 rs) +{ + const u32 insn = OPC_REMU | OP_A(rd) | OP_B(rd) | OP_C(rs); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* remu Rd,Rd,imm */ +static u8 arc_remu_i(u8 *buf, u8 rd, s32 imm) +{ + const u32 insn = OPC_REMUI | OP_A(rd) | OP_B(rd); + + if (buf) { + emit_4_bytes(buf, insn); + emit_4_bytes(buf + INSN_len_normal, imm); + } + return INSN_len_normal + INSN_len_imm; +} + +/* rem Rd,Rd,Rs */ +static u8 arc_rems_r(u8 *buf, u8 rd, u8 rs) +{ + const u32 insn = OPC_REMS | OP_A(rd) | OP_B(rd) | OP_C(rs); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* rem Rd,Rd,imm */ +static u8 arc_rems_i(u8 *buf, u8 rd, s32 imm) +{ + const u32 insn = OPC_REMSI | OP_A(rd) | OP_B(rd); + + if (buf) { + emit_4_bytes(buf, insn); + emit_4_bytes(buf + INSN_len_normal, imm); + } + return INSN_len_normal + INSN_len_imm; +} + +/* and Rd,Rd,Rs */ +static u8 arc_and_r(u8 *buf, u8 rd, u8 rs) +{ + const u32 insn = OPC_AND | OP_A(rd) | OP_B(rd) | OP_C(rs); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* and Rd,Rd,limm */ +static u8 arc_and_i(u8 *buf, u8 rd, s32 imm) +{ + const u32 insn = OPC_ANDI | OP_A(rd) | OP_B(rd); + + if (buf) { + emit_4_bytes(buf, insn); + emit_4_bytes(buf + INSN_len_normal, imm); + } + return INSN_len_normal + INSN_len_imm; +} + +/* tst Rd,Rs */ +static u8 arc_tst_r(u8 *buf, u8 rd, u8 rs) +{ + const u32 insn = OPC_TST | OP_B(rd) | OP_C(rs); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* + * This particular version, "tst.z ...", is meant to be used after a + * "tst" on the low 32-bit of register pairs. If that "tst" is not + * zero, then we don't need to test the upper 32-bits lest it sets + * the zero flag. + */ +static u8 arc_tstz_r(u8 *buf, u8 rd, u8 rs) +{ + const u32 insn = OPC_TST | OP_B(rd) | OP_C(rs) | CC_equal; + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +static u8 arc_or_r(u8 *buf, u8 rd, u8 rs1, u8 rs2) +{ + const u32 insn = OPC_OR | OP_A(rd) | OP_B(rs1) | OP_C(rs2); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +static u8 arc_or_i(u8 *buf, u8 rd, s32 imm) +{ + const u32 insn = OPC_ORI | OP_A(rd) | OP_B(rd); + + if (buf) { + emit_4_bytes(buf, insn); + emit_4_bytes(buf + INSN_len_normal, imm); + } + return INSN_len_normal + INSN_len_imm; +} + +static u8 arc_xor_r(u8 *buf, u8 rd, u8 rs) +{ + const u32 insn = OPC_XOR | OP_A(rd) | OP_B(rd) | OP_C(rs); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +static u8 arc_xor_i(u8 *buf, u8 rd, s32 imm) +{ + const u32 insn = OPC_XORI | OP_A(rd) | OP_B(rd); + + if (buf) { + emit_4_bytes(buf, insn); + emit_4_bytes(buf + INSN_len_normal, imm); + } + return INSN_len_normal + INSN_len_imm; +} + +static u8 arc_not_r(u8 *buf, u8 rd, u8 rs) +{ + const u32 insn = OPC_NOT | OP_B(rd) | OP_C(rs); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +static u8 arc_btst_i(u8 *buf, u8 rs, u8 imm) +{ + const u32 insn = OPC_BTSTU6 | OP_B(rs) | BTST_U6(imm); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +static u8 arc_asl_r(u8 *buf, u8 rd, u8 rs1, u8 rs2) +{ + const u32 insn = OPC_ASL | OP_A(rd) | OP_B(rs1) | OP_C(rs2); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +static u8 arc_asli_r(u8 *buf, u8 rd, u8 rs, u8 imm) +{ + const u32 insn = OPC_ASLI | OP_A(rd) | OP_B(rs) | ASLI_U6(imm); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +static u8 arc_asr_r(u8 *buf, u8 rd, u8 rs1, u8 rs2) +{ + const u32 insn = OPC_ASR | OP_A(rd) | OP_B(rs1) | OP_C(rs2); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +static u8 arc_asri_r(u8 *buf, u8 rd, u8 rs, u8 imm) +{ + const u32 insn = OPC_ASRI | OP_A(rd) | OP_B(rs) | ASRI_U6(imm); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +static u8 arc_lsr_r(u8 *buf, u8 rd, u8 rs1, u8 rs2) +{ + const u32 insn = OPC_LSR | OP_A(rd) | OP_B(rs1) | OP_C(rs2); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +static u8 arc_lsri_r(u8 *buf, u8 rd, u8 rs, u8 imm) +{ + const u32 insn = OPC_LSRI | OP_A(rd) | OP_B(rs) | LSRI_U6(imm); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +static u8 arc_swape_r(u8 *buf, u8 r) +{ + const u32 insn = OPC_SWAPE | OP_B(r) | OP_C(r); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +static u8 arc_jmp_return(u8 *buf) +{ + if (buf) + emit_4_bytes(buf, OPC_J_BLINK); + return INSN_len_normal; +} + +static u8 arc_jl(u8 *buf, u8 reg) +{ + const u32 insn = OPC_JL | OP_C(reg); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* + * Conditional jump to an address that is max 21 bits away (signed). + * + * b s21 + */ +static u8 arc_bcc(u8 *buf, u8 cc, int offset) +{ + const u32 insn = OPC_BCC | BCC_S21(offset) | COND(cc); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/* + * Unconditional jump to an address that is max 25 bits away (signed). + * + * b s25 + */ +static u8 arc_b(u8 *buf, s32 offset) +{ + const u32 insn = OPC_B | B_S25(offset); + + if (buf) + emit_4_bytes(buf, insn); + return INSN_len_normal; +} + +/************* Packers (Deal with BPF_REGs) **************/ + +inline u8 zext(u8 *buf, u8 rd) +{ + if (rd != BPF_REG_FP) + return arc_movi_r(buf, REG_HI(rd), 0); + else + return 0; +} + +u8 mov_r32(u8 *buf, u8 rd, u8 rs, u8 sign_ext) +{ + u8 len = 0; + + if (sign_ext) { + if (sign_ext == 8) + len = arc_sexb_r(buf, REG_LO(rd), REG_LO(rs)); + else if (sign_ext == 16) + len = arc_sexh_r(buf, REG_LO(rd), REG_LO(rs)); + else if (sign_ext == 32 && rd != rs) + len = arc_mov_r(buf, REG_LO(rd), REG_LO(rs)); + + return len; + } + + /* Unsigned move. */ + + if (rd != rs) + len = arc_mov_r(buf, REG_LO(rd), REG_LO(rs)); + + return len; +} + +u8 mov_r32_i32(u8 *buf, u8 reg, s32 imm) +{ + return arc_mov_i(buf, REG_LO(reg), imm); +} + +u8 mov_r64(u8 *buf, u8 rd, u8 rs, u8 sign_ext) +{ + u8 len = 0; + + if (sign_ext) { + /* First handle the low 32-bit part. */ + len = mov_r32(buf, rd, rs, sign_ext); + + /* Now propagate the sign bit of LO to HI. */ + if (sign_ext == 8 || sign_ext == 16 || sign_ext == 32) { + len += arc_asri_r(BUF(buf, len), + REG_HI(rd), REG_LO(rd), 31); + } + + return len; + } + + /* Unsigned move. */ + + if (rd == rs) + return 0; + + len = arc_mov_r(buf, REG_LO(rd), REG_LO(rs)); + + if (rs != BPF_REG_FP) + len += arc_mov_r(BUF(buf, len), REG_HI(rd), REG_HI(rs)); + /* BPF_REG_FP is mapped to 32-bit "fp" register. */ + else + len += arc_movi_r(BUF(buf, len), REG_HI(rd), 0); + + return len; +} + +/* Sign extend the 32-bit immediate into 64-bit register pair. */ +u8 mov_r64_i32(u8 *buf, u8 reg, s32 imm) +{ + u8 len = 0; + + len = arc_mov_i(buf, REG_LO(reg), imm); + + /* BPF_REG_FP is mapped to 32-bit "fp" register. */ + if (reg != BPF_REG_FP) { + if (imm >= 0) + len += arc_movi_r(BUF(buf, len), REG_HI(reg), 0); + else + len += arc_movi_r(BUF(buf, len), REG_HI(reg), -1); + } + + return len; +} + +/* + * This is merely used for translation of "LD R, IMM64" instructions + * of the BPF. These sort of instructions are sometimes used for + * relocations. If during the normal pass, the relocation value is + * not known, the BPF instruction may look something like: + * + * LD R <- 0x0000_0001_0000_0001 + * + * Which will nicely translate to two 4-byte ARC instructions: + * + * mov R_lo, 1 # imm is small enough to be s12 + * mov R_hi, 1 # same + * + * However, during the extra pass, the IMM64 will have changed + * to the resolved address and looks something like: + * + * LD R <- 0x0000_0000_1234_5678 + * + * Now, the translated code will require 12 bytes: + * + * mov R_lo, 0x12345678 # this is an 8-byte instruction + * mov R_hi, 0 # still 4 bytes + * + * Which in practice will result in overwriting the following + * instruction. To avoid such cases, we will always emit codes + * with fixed sizes. + */ +u8 mov_r64_i64(u8 *buf, u8 reg, u32 lo, u32 hi) +{ + u8 len; + + len = arc_mov_i_fixed(buf, REG_LO(reg), lo); + len += arc_mov_i_fixed(BUF(buf, len), REG_HI(reg), hi); + + return len; +} + +/* + * If the "off"set is too big (doesn't encode as S9) for: + * + * {ld,st} r, [rm, off] + * + * Then emit: + * + * add r10, REG_LO(rm), off + * + * and make sure that r10 becomes the effective address: + * + * {ld,st} r, [r10, 0] + */ +static u8 adjust_mem_access(u8 *buf, s16 *off, u8 size, + u8 rm, u8 *arc_reg_mem) +{ + u8 len = 0; + *arc_reg_mem = REG_LO(rm); + + if (!IN_S9_RANGE(*off) || + (size == BPF_DW && !IN_S9_RANGE(*off + 4))) { + len += arc_add_i(BUF(buf, len), + REG_LO(JIT_REG_TMP), REG_LO(rm), (u32)(*off)); + *arc_reg_mem = REG_LO(JIT_REG_TMP); + *off = 0; + } + + return len; +} + +/* store rs, [rd, off] */ +u8 store_r(u8 *buf, u8 rs, u8 rd, s16 off, u8 size) +{ + u8 len, arc_reg_mem; + + len = adjust_mem_access(buf, &off, size, rd, &arc_reg_mem); + + if (size == BPF_DW) { + len += arc_st_r(BUF(buf, len), REG_LO(rs), arc_reg_mem, + off, ZZ_4_byte); + len += arc_st_r(BUF(buf, len), REG_HI(rs), arc_reg_mem, + off + 4, ZZ_4_byte); + } else { + u8 zz = bpf_to_arc_size(size); + + len += arc_st_r(BUF(buf, len), REG_LO(rs), arc_reg_mem, + off, zz); + } + + return len; +} + +/* + * For {8,16,32}-bit stores: + * mov r21, imm + * st r21, [...] + * For 64-bit stores: + * mov r21, imm + * st r21, [...] + * mov r21, {0,-1} + * st r21, [...+4] + */ +u8 store_i(u8 *buf, s32 imm, u8 rd, s16 off, u8 size) +{ + u8 len, arc_reg_mem; + /* REG_LO(JIT_REG_TMP) might be used by "adjust_mem_access()". */ + const u8 arc_rs = REG_HI(JIT_REG_TMP); + + len = adjust_mem_access(buf, &off, size, rd, &arc_reg_mem); + + if (size == BPF_DW) { + len += arc_mov_i(BUF(buf, len), arc_rs, imm); + len += arc_st_r(BUF(buf, len), arc_rs, arc_reg_mem, + off, ZZ_4_byte); + imm = (imm >= 0 ? 0 : -1); + len += arc_mov_i(BUF(buf, len), arc_rs, imm); + len += arc_st_r(BUF(buf, len), arc_rs, arc_reg_mem, + off + 4, ZZ_4_byte); + } else { + u8 zz = bpf_to_arc_size(size); + + len += arc_mov_i(BUF(buf, len), arc_rs, imm); + len += arc_st_r(BUF(buf, len), arc_rs, arc_reg_mem, off, zz); + } + + return len; +} + +/* + * For the calling convention of a little endian machine, the LO part + * must be on top of the stack. + */ +static u8 push_r64(u8 *buf, u8 reg) +{ + u8 len = 0; + +#ifdef __LITTLE_ENDIAN + /* BPF_REG_FP is mapped to 32-bit "fp" register. */ + if (reg != BPF_REG_FP) + len += arc_push_r(BUF(buf, len), REG_HI(reg)); + len += arc_push_r(BUF(buf, len), REG_LO(reg)); +#else + len += arc_push_r(BUF(buf, len), REG_LO(reg)); + if (reg != BPF_REG_FP) + len += arc_push_r(BUF(buf, len), REG_HI(reg)); +#endif + + return len; +} + +/* load rd, [rs, off] */ +u8 load_r(u8 *buf, u8 rd, u8 rs, s16 off, u8 size, bool sign_ext) +{ + u8 len, arc_reg_mem; + + len = adjust_mem_access(buf, &off, size, rs, &arc_reg_mem); + + if (size == BPF_B || size == BPF_H || size == BPF_W) { + const u8 zz = bpf_to_arc_size(size); + + /* Use LD.X only if the data size is less than 32-bit. */ + if (sign_ext && (zz == ZZ_1_byte || zz == ZZ_2_byte)) { + len += arc_ldx_r(BUF(buf, len), REG_LO(rd), + arc_reg_mem, off, zz); + } else { + len += arc_ld_r(BUF(buf, len), REG_LO(rd), + arc_reg_mem, off, zz); + } + + if (sign_ext) { + /* Propagate the sign bit to the higher reg. */ + len += arc_asri_r(BUF(buf, len), + REG_HI(rd), REG_LO(rd), 31); + } else { + len += arc_movi_r(BUF(buf, len), REG_HI(rd), 0); + } + } else if (size == BPF_DW) { + /* + * We are about to issue 2 consecutive loads: + * + * ld rx, [rb, off+0] + * ld ry, [rb, off+4] + * + * If "rx" and "rb" are the same registers, then the order + * should change to guarantee that "rb" remains intact + * during these 2 operations: + * + * ld ry, [rb, off+4] + * ld rx, [rb, off+0] + */ + if (REG_LO(rd) != arc_reg_mem) { + len += arc_ld_r(BUF(buf, len), REG_LO(rd), arc_reg_mem, + off, ZZ_4_byte); + len += arc_ld_r(BUF(buf, len), REG_HI(rd), arc_reg_mem, + off + 4, ZZ_4_byte); + } else { + len += arc_ld_r(BUF(buf, len), REG_HI(rd), arc_reg_mem, + off + 4, ZZ_4_byte); + len += arc_ld_r(BUF(buf, len), REG_LO(rd), arc_reg_mem, + off, ZZ_4_byte); + } + } + + return len; +} + +u8 add_r32(u8 *buf, u8 rd, u8 rs) +{ + return arc_add_r(buf, REG_LO(rd), REG_LO(rs)); +} + +u8 add_r32_i32(u8 *buf, u8 rd, s32 imm) +{ + if (IN_U6_RANGE(imm)) + return arc_addi_r(buf, REG_LO(rd), imm); + else + return arc_add_i(buf, REG_LO(rd), REG_LO(rd), imm); +} + +u8 add_r64(u8 *buf, u8 rd, u8 rs) +{ + u8 len; + + len = arc_addf_r(buf, REG_LO(rd), REG_LO(rs)); + len += arc_adc_r(BUF(buf, len), REG_HI(rd), REG_HI(rs)); + return len; +} + +u8 add_r64_i32(u8 *buf, u8 rd, s32 imm) +{ + u8 len; + + if (IN_U6_RANGE(imm)) { + len = arc_addif_r(buf, REG_LO(rd), imm); + len += arc_adci_r(BUF(buf, len), REG_HI(rd), 0); + } else { + len = mov_r64_i32(buf, JIT_REG_TMP, imm); + len += add_r64(BUF(buf, len), rd, JIT_REG_TMP); + } + return len; +} + +u8 sub_r32(u8 *buf, u8 rd, u8 rs) +{ + return arc_sub_r(buf, REG_LO(rd), REG_LO(rs)); +} + +u8 sub_r32_i32(u8 *buf, u8 rd, s32 imm) +{ + if (IN_U6_RANGE(imm)) + return arc_subi_r(buf, REG_LO(rd), imm); + else + return arc_sub_i(buf, REG_LO(rd), imm); +} + +u8 sub_r64(u8 *buf, u8 rd, u8 rs) +{ + u8 len; + + len = arc_subf_r(buf, REG_LO(rd), REG_LO(rs)); + len += arc_sbc_r(BUF(buf, len), REG_HI(rd), REG_HI(rs)); + return len; +} + +u8 sub_r64_i32(u8 *buf, u8 rd, s32 imm) +{ + u8 len; + + len = mov_r64_i32(buf, JIT_REG_TMP, imm); + len += sub_r64(BUF(buf, len), rd, JIT_REG_TMP); + return len; +} + +static u8 cmp_r32(u8 *buf, u8 rd, u8 rs) +{ + return arc_cmp_r(buf, REG_LO(rd), REG_LO(rs)); +} + +u8 neg_r32(u8 *buf, u8 r) +{ + return arc_neg_r(buf, REG_LO(r), REG_LO(r)); +} + +/* In a two's complement system, -r is (~r + 1). */ +u8 neg_r64(u8 *buf, u8 r) +{ + u8 len; + + len = arc_not_r(buf, REG_LO(r), REG_LO(r)); + len += arc_not_r(BUF(buf, len), REG_HI(r), REG_HI(r)); + len += add_r64_i32(BUF(buf, len), r, 1); + return len; +} + +u8 mul_r32(u8 *buf, u8 rd, u8 rs) +{ + return arc_mpy_r(buf, REG_LO(rd), REG_LO(rd), REG_LO(rs)); +} + +u8 mul_r32_i32(u8 *buf, u8 rd, s32 imm) +{ + return arc_mpy_i(buf, REG_LO(rd), REG_LO(rd), imm); +} + +/* + * MUL B, C + * -------- + * mpy t0, B_hi, C_lo + * mpy t1, B_lo, C_hi + * mpydu B_lo, B_lo, C_lo + * add B_hi, B_hi, t0 + * add B_hi, B_hi, t1 + */ +u8 mul_r64(u8 *buf, u8 rd, u8 rs) +{ + const u8 t0 = REG_LO(JIT_REG_TMP); + const u8 t1 = REG_HI(JIT_REG_TMP); + const u8 C_lo = REG_LO(rs); + const u8 C_hi = REG_HI(rs); + const u8 B_lo = REG_LO(rd); + const u8 B_hi = REG_HI(rd); + u8 len; + + len = arc_mpy_r(buf, t0, B_hi, C_lo); + len += arc_mpy_r(BUF(buf, len), t1, B_lo, C_hi); + len += arc_mpydu_r(BUF(buf, len), B_lo, C_lo); + len += arc_add_r(BUF(buf, len), B_hi, t0); + len += arc_add_r(BUF(buf, len), B_hi, t1); + + return len; +} + +/* + * MUL B, imm + * ---------- + * + * To get a 64-bit result from a signed 64x32 multiplication: + * + * B_hi B_lo * + * sign imm + * ----------------------------- + * HI(B_lo*imm) LO(B_lo*imm) + + * B_hi*imm + + * B_lo*sign + * ----------------------------- + * res_hi res_lo + * + * mpy t1, B_lo, sign(imm) + * mpy t0, B_hi, imm + * mpydu B_lo, B_lo, imm + * add B_hi, B_hi, t0 + * add B_hi, B_hi, t1 + * + * Note: We can't use signed double multiplication, "mpyd", instead of an + * unsigned version, "mpydu", and then get rid of the sign adjustments + * calculated in "t1". The signed multiplication, "mpyd", will consider + * both operands, "B_lo" and "imm", as signed inputs. However, for this + * 64x32 multiplication, "B_lo" must be treated as an unsigned number. + */ +u8 mul_r64_i32(u8 *buf, u8 rd, s32 imm) +{ + const u8 t0 = REG_LO(JIT_REG_TMP); + const u8 t1 = REG_HI(JIT_REG_TMP); + const u8 B_lo = REG_LO(rd); + const u8 B_hi = REG_HI(rd); + u8 len = 0; + + if (imm == 1) + return 0; + + /* Is the sign-extension of the immediate "-1"? */ + if (imm < 0) + len += arc_neg_r(BUF(buf, len), t1, B_lo); + + len += arc_mpy_i(BUF(buf, len), t0, B_hi, imm); + len += arc_mpydu_i(BUF(buf, len), B_lo, imm); + len += arc_add_r(BUF(buf, len), B_hi, t0); + + /* Add the "sign*B_lo" part, if necessary. */ + if (imm < 0) + len += arc_add_r(BUF(buf, len), B_hi, t1); + + return len; +} + +u8 div_r32(u8 *buf, u8 rd, u8 rs, bool sign_ext) +{ + if (sign_ext) + return arc_divs_r(buf, REG_LO(rd), REG_LO(rs)); + else + return arc_divu_r(buf, REG_LO(rd), REG_LO(rs)); +} + +u8 div_r32_i32(u8 *buf, u8 rd, s32 imm, bool sign_ext) +{ + if (imm == 0) + return 0; + + if (sign_ext) + return arc_divs_i(buf, REG_LO(rd), imm); + else + return arc_divu_i(buf, REG_LO(rd), imm); +} + +u8 mod_r32(u8 *buf, u8 rd, u8 rs, bool sign_ext) +{ + if (sign_ext) + return arc_rems_r(buf, REG_LO(rd), REG_LO(rs)); + else + return arc_remu_r(buf, REG_LO(rd), REG_LO(rs)); +} + +u8 mod_r32_i32(u8 *buf, u8 rd, s32 imm, bool sign_ext) +{ + if (imm == 0) + return 0; + + if (sign_ext) + return arc_rems_i(buf, REG_LO(rd), imm); + else + return arc_remu_i(buf, REG_LO(rd), imm); +} + +u8 and_r32(u8 *buf, u8 rd, u8 rs) +{ + return arc_and_r(buf, REG_LO(rd), REG_LO(rs)); +} + +u8 and_r32_i32(u8 *buf, u8 rd, s32 imm) +{ + return arc_and_i(buf, REG_LO(rd), imm); +} + +u8 and_r64(u8 *buf, u8 rd, u8 rs) +{ + u8 len; + + len = arc_and_r(buf, REG_LO(rd), REG_LO(rs)); + len += arc_and_r(BUF(buf, len), REG_HI(rd), REG_HI(rs)); + return len; +} + +u8 and_r64_i32(u8 *buf, u8 rd, s32 imm) +{ + u8 len; + + len = mov_r64_i32(buf, JIT_REG_TMP, imm); + len += and_r64(BUF(buf, len), rd, JIT_REG_TMP); + return len; +} + +static u8 tst_r32(u8 *buf, u8 rd, u8 rs) +{ + return arc_tst_r(buf, REG_LO(rd), REG_LO(rs)); +} + +u8 or_r32(u8 *buf, u8 rd, u8 rs) +{ + return arc_or_r(buf, REG_LO(rd), REG_LO(rd), REG_LO(rs)); +} + +u8 or_r32_i32(u8 *buf, u8 rd, s32 imm) +{ + return arc_or_i(buf, REG_LO(rd), imm); +} + +u8 or_r64(u8 *buf, u8 rd, u8 rs) +{ + u8 len; + + len = arc_or_r(buf, REG_LO(rd), REG_LO(rd), REG_LO(rs)); + len += arc_or_r(BUF(buf, len), REG_HI(rd), REG_HI(rd), REG_HI(rs)); + return len; +} + +u8 or_r64_i32(u8 *buf, u8 rd, s32 imm) +{ + u8 len; + + len = mov_r64_i32(buf, JIT_REG_TMP, imm); + len += or_r64(BUF(buf, len), rd, JIT_REG_TMP); + return len; +} + +u8 xor_r32(u8 *buf, u8 rd, u8 rs) +{ + return arc_xor_r(buf, REG_LO(rd), REG_LO(rs)); +} + +u8 xor_r32_i32(u8 *buf, u8 rd, s32 imm) +{ + return arc_xor_i(buf, REG_LO(rd), imm); +} + +u8 xor_r64(u8 *buf, u8 rd, u8 rs) +{ + u8 len; + + len = arc_xor_r(buf, REG_LO(rd), REG_LO(rs)); + len += arc_xor_r(BUF(buf, len), REG_HI(rd), REG_HI(rs)); + return len; +} + +u8 xor_r64_i32(u8 *buf, u8 rd, s32 imm) +{ + u8 len; + + len = mov_r64_i32(buf, JIT_REG_TMP, imm); + len += xor_r64(BUF(buf, len), rd, JIT_REG_TMP); + return len; +} + +/* "asl a,b,c" --> "a = (b << (c & 31))". */ +u8 lsh_r32(u8 *buf, u8 rd, u8 rs) +{ + return arc_asl_r(buf, REG_LO(rd), REG_LO(rd), REG_LO(rs)); +} + +u8 lsh_r32_i32(u8 *buf, u8 rd, u8 imm) +{ + return arc_asli_r(buf, REG_LO(rd), REG_LO(rd), imm); +} + +/* + * algorithm + * --------- + * if (n <= 32) + * to_hi = lo >> (32-n) # (32-n) is the negate of "n" in a 5-bit width. + * lo <<= n + * hi <<= n + * hi |= to_hi + * else + * hi = lo << (n-32) + * lo = 0 + * + * assembly translation for "LSH B, C" + * (heavily influenced by ARC gcc) + * ----------------------------------- + * not t0, C_lo # The first 3 lines are almost the same as: + * lsr t1, B_lo, 1 # neg t0, C_lo + * lsr t1, t1, t0 # lsr t1, B_lo, t0 --> t1 is "to_hi" + * mov t0, C_lo* # with one important difference. In "neg" + * asl B_lo, B_lo, t0 # version, when C_lo=0, t1 becomes B_lo while + * asl B_hi, B_hi, t0 # it should be 0. The "not" approach instead, + * or B_hi, B_hi, t1 # "shift"s t1 once and 31 times, practically + * btst t0, 5 # setting it to 0 when C_lo=0. + * mov.ne B_hi, B_lo** + * mov.ne B_lo, 0 + * + * *The "mov t0, C_lo" is necessary to cover the cases that C is the same + * register as B. + * + * **ARC performs a shift in this manner: B <<= (C & 31) + * For 32<=n<64, "n-32" and "n&31" are the same. Therefore, "B << n" and + * "B << (n-32)" yield the same results. e.g. the results of "B << 35" and + * "B << 3" are the same. + * + * The behaviour is undefined for n >= 64. + */ +u8 lsh_r64(u8 *buf, u8 rd, u8 rs) +{ + const u8 t0 = REG_LO(JIT_REG_TMP); + const u8 t1 = REG_HI(JIT_REG_TMP); + const u8 C_lo = REG_LO(rs); + const u8 B_lo = REG_LO(rd); + const u8 B_hi = REG_HI(rd); + u8 len; + + len = arc_not_r(buf, t0, C_lo); + len += arc_lsri_r(BUF(buf, len), t1, B_lo, 1); + len += arc_lsr_r(BUF(buf, len), t1, t1, t0); + len += arc_mov_r(BUF(buf, len), t0, C_lo); + len += arc_asl_r(BUF(buf, len), B_lo, B_lo, t0); + len += arc_asl_r(BUF(buf, len), B_hi, B_hi, t0); + len += arc_or_r(BUF(buf, len), B_hi, B_hi, t1); + len += arc_btst_i(BUF(buf, len), t0, 5); + len += arc_mov_cc_r(BUF(buf, len), CC_unequal, B_hi, B_lo); + len += arc_movu_cc_r(BUF(buf, len), CC_unequal, B_lo, 0); + + return len; +} + +/* + * if (n < 32) + * to_hi = B_lo >> 32-n # extract upper n bits + * lo <<= n + * hi <<=n + * hi |= to_hi + * else if (n < 64) + * hi = lo << n-32 + * lo = 0 + */ +u8 lsh_r64_i32(u8 *buf, u8 rd, s32 imm) +{ + const u8 t0 = REG_LO(JIT_REG_TMP); + const u8 B_lo = REG_LO(rd); + const u8 B_hi = REG_HI(rd); + const u8 n = (u8)imm; + u8 len = 0; + + if (n == 0) { + return 0; + } else if (n <= 31) { + len = arc_lsri_r(buf, t0, B_lo, 32 - n); + len += arc_asli_r(BUF(buf, len), B_lo, B_lo, n); + len += arc_asli_r(BUF(buf, len), B_hi, B_hi, n); + len += arc_or_r(BUF(buf, len), B_hi, B_hi, t0); + } else if (n <= 63) { + len = arc_asli_r(buf, B_hi, B_lo, n - 32); + len += arc_movi_r(BUF(buf, len), B_lo, 0); + } + /* n >= 64 is undefined behaviour. */ + + return len; +} + +/* "lsr a,b,c" --> "a = (b >> (c & 31))". */ +u8 rsh_r32(u8 *buf, u8 rd, u8 rs) +{ + return arc_lsr_r(buf, REG_LO(rd), REG_LO(rd), REG_LO(rs)); +} + +u8 rsh_r32_i32(u8 *buf, u8 rd, u8 imm) +{ + return arc_lsri_r(buf, REG_LO(rd), REG_LO(rd), imm); +} + +/* + * For better commentary, see lsh_r64(). + * + * algorithm + * --------- + * if (n <= 32) + * to_lo = hi << (32-n) + * hi >>= n + * lo >>= n + * lo |= to_lo + * else + * lo = hi >> (n-32) + * hi = 0 + * + * RSH B,C + * ---------- + * not t0, C_lo + * asl t1, B_hi, 1 + * asl t1, t1, t0 + * mov t0, C_lo + * lsr B_hi, B_hi, t0 + * lsr B_lo, B_lo, t0 + * or B_lo, B_lo, t1 + * btst t0, 5 + * mov.ne B_lo, B_hi + * mov.ne B_hi, 0 + */ +u8 rsh_r64(u8 *buf, u8 rd, u8 rs) +{ + const u8 t0 = REG_LO(JIT_REG_TMP); + const u8 t1 = REG_HI(JIT_REG_TMP); + const u8 C_lo = REG_LO(rs); + const u8 B_lo = REG_LO(rd); + const u8 B_hi = REG_HI(rd); + u8 len; + + len = arc_not_r(buf, t0, C_lo); + len += arc_asli_r(BUF(buf, len), t1, B_hi, 1); + len += arc_asl_r(BUF(buf, len), t1, t1, t0); + len += arc_mov_r(BUF(buf, len), t0, C_lo); + len += arc_lsr_r(BUF(buf, len), B_hi, B_hi, t0); + len += arc_lsr_r(BUF(buf, len), B_lo, B_lo, t0); + len += arc_or_r(BUF(buf, len), B_lo, B_lo, t1); + len += arc_btst_i(BUF(buf, len), t0, 5); + len += arc_mov_cc_r(BUF(buf, len), CC_unequal, B_lo, B_hi); + len += arc_movu_cc_r(BUF(buf, len), CC_unequal, B_hi, 0); + + return len; +} + +/* + * if (n < 32) + * to_lo = B_lo << 32-n # extract lower n bits, right-padded with 32-n 0s + * lo >>=n + * hi >>=n + * hi |= to_lo + * else if (n < 64) + * lo = hi >> n-32 + * hi = 0 + */ +u8 rsh_r64_i32(u8 *buf, u8 rd, s32 imm) +{ + const u8 t0 = REG_LO(JIT_REG_TMP); + const u8 B_lo = REG_LO(rd); + const u8 B_hi = REG_HI(rd); + const u8 n = (u8)imm; + u8 len = 0; + + if (n == 0) { + return 0; + } else if (n <= 31) { + len = arc_asli_r(buf, t0, B_hi, 32 - n); + len += arc_lsri_r(BUF(buf, len), B_lo, B_lo, n); + len += arc_lsri_r(BUF(buf, len), B_hi, B_hi, n); + len += arc_or_r(BUF(buf, len), B_lo, B_lo, t0); + } else if (n <= 63) { + len = arc_lsri_r(buf, B_lo, B_hi, n - 32); + len += arc_movi_r(BUF(buf, len), B_hi, 0); + } + /* n >= 64 is undefined behaviour. */ + + return len; +} + +/* "asr a,b,c" --> "a = (b s>> (c & 31))". */ +u8 arsh_r32(u8 *buf, u8 rd, u8 rs) +{ + return arc_asr_r(buf, REG_LO(rd), REG_LO(rd), REG_LO(rs)); +} + +u8 arsh_r32_i32(u8 *buf, u8 rd, u8 imm) +{ + return arc_asri_r(buf, REG_LO(rd), REG_LO(rd), imm); +} + +/* + * For comparison, see rsh_r64(). + * + * algorithm + * --------- + * if (n <= 32) + * to_lo = hi << (32-n) + * hi s>>= n + * lo >>= n + * lo |= to_lo + * else + * hi_sign = hi s>>31 + * lo = hi s>> (n-32) + * hi = hi_sign + * + * ARSH B,C + * ---------- + * not t0, C_lo + * asl t1, B_hi, 1 + * asl t1, t1, t0 + * mov t0, C_lo + * asr B_hi, B_hi, t0 + * lsr B_lo, B_lo, t0 + * or B_lo, B_lo, t1 + * btst t0, 5 + * asr t0, B_hi, 31 # now, t0 = 0 or -1 based on B_hi's sign + * mov.ne B_lo, B_hi + * mov.ne B_hi, t0 + */ +u8 arsh_r64(u8 *buf, u8 rd, u8 rs) +{ + const u8 t0 = REG_LO(JIT_REG_TMP); + const u8 t1 = REG_HI(JIT_REG_TMP); + const u8 C_lo = REG_LO(rs); + const u8 B_lo = REG_LO(rd); + const u8 B_hi = REG_HI(rd); + u8 len; + + len = arc_not_r(buf, t0, C_lo); + len += arc_asli_r(BUF(buf, len), t1, B_hi, 1); + len += arc_asl_r(BUF(buf, len), t1, t1, t0); + len += arc_mov_r(BUF(buf, len), t0, C_lo); + len += arc_asr_r(BUF(buf, len), B_hi, B_hi, t0); + len += arc_lsr_r(BUF(buf, len), B_lo, B_lo, t0); + len += arc_or_r(BUF(buf, len), B_lo, B_lo, t1); + len += arc_btst_i(BUF(buf, len), t0, 5); + len += arc_asri_r(BUF(buf, len), t0, B_hi, 31); + len += arc_mov_cc_r(BUF(buf, len), CC_unequal, B_lo, B_hi); + len += arc_mov_cc_r(BUF(buf, len), CC_unequal, B_hi, t0); + + return len; +} + +/* + * if (n < 32) + * to_lo = lo << 32-n # extract lower n bits, right-padded with 32-n 0s + * lo >>=n + * hi s>>=n + * hi |= to_lo + * else if (n < 64) + * lo = hi s>> n-32 + * hi = (lo[msb] ? -1 : 0) + */ +u8 arsh_r64_i32(u8 *buf, u8 rd, s32 imm) +{ + const u8 t0 = REG_LO(JIT_REG_TMP); + const u8 B_lo = REG_LO(rd); + const u8 B_hi = REG_HI(rd); + const u8 n = (u8)imm; + u8 len = 0; + + if (n == 0) { + return 0; + } else if (n <= 31) { + len = arc_asli_r(buf, t0, B_hi, 32 - n); + len += arc_lsri_r(BUF(buf, len), B_lo, B_lo, n); + len += arc_asri_r(BUF(buf, len), B_hi, B_hi, n); + len += arc_or_r(BUF(buf, len), B_lo, B_lo, t0); + } else if (n <= 63) { + len = arc_asri_r(buf, B_lo, B_hi, n - 32); + len += arc_movi_r(BUF(buf, len), B_hi, -1); + len += arc_btst_i(BUF(buf, len), B_lo, 31); + len += arc_movu_cc_r(BUF(buf, len), CC_equal, B_hi, 0); + } + /* n >= 64 is undefined behaviour. */ + + return len; +} + +u8 gen_swap(u8 *buf, u8 rd, u8 size, u8 endian, bool force, bool do_zext) +{ + u8 len = 0; +#ifdef __BIG_ENDIAN + const u8 host_endian = BPF_FROM_BE; +#else + const u8 host_endian = BPF_FROM_LE; +#endif + if (host_endian != endian || force) { + switch (size) { + case 16: + /* + * r = B4B3_B2B1 << 16 --> r = B2B1_0000 + * then, swape(r) would become the desired 0000_B1B2 + */ + len = arc_asli_r(buf, REG_LO(rd), REG_LO(rd), 16); + fallthrough; + case 32: + len += arc_swape_r(BUF(buf, len), REG_LO(rd)); + if (do_zext) + len += zext(BUF(buf, len), rd); + break; + case 64: + /* + * swap "hi" and "lo": + * hi ^= lo; + * lo ^= hi; + * hi ^= lo; + * and then swap the bytes in "hi" and "lo". + */ + len = arc_xor_r(buf, REG_HI(rd), REG_LO(rd)); + len += arc_xor_r(BUF(buf, len), REG_LO(rd), REG_HI(rd)); + len += arc_xor_r(BUF(buf, len), REG_HI(rd), REG_LO(rd)); + len += arc_swape_r(BUF(buf, len), REG_LO(rd)); + len += arc_swape_r(BUF(buf, len), REG_HI(rd)); + break; + default: + /* The caller must have handled this. */ + } + } else { + /* + * If the same endianness, there's not much to do other + * than zeroing out the upper bytes based on the "size". + */ + switch (size) { + case 16: + len = arc_and_i(buf, REG_LO(rd), 0xffff); + fallthrough; + case 32: + if (do_zext) + len += zext(BUF(buf, len), rd); + break; + case 64: + break; + default: + /* The caller must have handled this. */ + } + } + + return len; +} + +/* + * To create a frame, all that is needed is: + * + * push fp + * mov fp, sp + * sub sp, + * + * "push fp" is taken care of separately while saving the clobbered registers. + * All that remains is copying SP value to FP and shrinking SP's address space + * for any possible function call to come. + */ +static inline u8 frame_create(u8 *buf, u16 size) +{ + u8 len; + + len = arc_mov_r(buf, ARC_R_FP, ARC_R_SP); + if (IN_U6_RANGE(size)) + len += arc_subi_r(BUF(buf, len), ARC_R_SP, size); + else + len += arc_sub_i(BUF(buf, len), ARC_R_SP, size); + return len; +} + +/* + * mov sp, fp + * + * The value of SP upon entering was copied to FP. + */ +static inline u8 frame_restore(u8 *buf) +{ + return arc_mov_r(buf, ARC_R_SP, ARC_R_FP); +} + +/* + * Going from a JITed code to the native caller: + * + * mov ARC_ABI_RET_lo, BPF_REG_0_lo # r0 <- r8 + * mov ARC_ABI_RET_hi, BPF_REG_0_hi # r1 <- r9 + */ +static u8 bpf_to_arc_return(u8 *buf) +{ + u8 len; + + len = arc_mov_r(buf, ARC_R_0, REG_LO(BPF_REG_0)); + len += arc_mov_r(BUF(buf, len), ARC_R_1, REG_HI(BPF_REG_0)); + return len; +} + +/* + * Coming back from an external (in-kernel) function to the JITed code: + * + * mov ARC_ABI_RET_lo, BPF_REG_0_lo # r8 <- r0 + * mov ARC_ABI_RET_hi, BPF_REG_0_hi # r9 <- r1 + */ +u8 arc_to_bpf_return(u8 *buf) +{ + u8 len; + + len = arc_mov_r(buf, REG_LO(BPF_REG_0), ARC_R_0); + len += arc_mov_r(BUF(buf, len), REG_HI(BPF_REG_0), ARC_R_1); + return len; +} + +/* + * This translation leads to: + * + * mov r10, addr # always an 8-byte instruction + * jl [r10] + * + * The length of the "mov" must be fixed (8), otherwise it may diverge + * during the normal and extra passes: + * + * normal pass extra pass + * + * 180: mov r10,0 | 180: mov r10,0x700578d8 + * 184: jl [r10] | 188: jl [r10] + * 188: add.f r16,r16,0x1 | 18c: adc r17,r17,0 + * 18c: adc r17,r17,0 | + * + * In the above example, the change from "r10 <- 0" to "r10 <- 0x700578d8" + * has led to an increase in the length of the "mov" instruction. + * Inadvertently, that caused the loss of the "add.f" instruction. + */ +static u8 jump_and_link(u8 *buf, u32 addr) +{ + u8 len; + + len = arc_mov_i_fixed(buf, REG_LO(JIT_REG_TMP), addr); + len += arc_jl(BUF(buf, len), REG_LO(JIT_REG_TMP)); + return len; +} + +/* + * This function determines which ARC registers must be saved and restored. + * It does so by looking into: + * + * "bpf_reg": The clobbered (destination) BPF register + * "is_call": Indicator if the current instruction is a call + * + * When a register of interest is clobbered, its corresponding bit position + * in return value, "usage", is set to true. + */ +u32 mask_for_used_regs(u8 bpf_reg, bool is_call) +{ + u32 usage = 0; + + /* BPF registers that must be saved. */ + if (bpf_reg >= BPF_REG_6 && bpf_reg <= BPF_REG_9) { + usage |= BIT(REG_LO(bpf_reg)); + usage |= BIT(REG_HI(bpf_reg)); + /* + * Using the frame pointer register implies that it should + * be saved and reinitialised with the current frame data. + */ + } else if (bpf_reg == BPF_REG_FP) { + usage |= BIT(REG_LO(BPF_REG_FP)); + /* Could there be some ARC registers that must to be saved? */ + } else { + if (REG_LO(bpf_reg) >= ARC_CALLEE_SAVED_REG_FIRST && + REG_LO(bpf_reg) <= ARC_CALLEE_SAVED_REG_LAST) + usage |= BIT(REG_LO(bpf_reg)); + + if (REG_HI(bpf_reg) >= ARC_CALLEE_SAVED_REG_FIRST && + REG_HI(bpf_reg) <= ARC_CALLEE_SAVED_REG_LAST) + usage |= BIT(REG_HI(bpf_reg)); + } + + /* A "call" indicates that ARC's "blink" reg must be saved. */ + usage |= is_call ? BIT(ARC_R_BLINK) : 0; + + return usage; +} + +/* + * push blink # if blink is marked as clobbered + * push r[0-n] # if r[i] is marked as clobbered + * push fp # if fp is marked as clobbered + * mov fp, sp # if frame_size > 0 (clobbers fp) + * sub sp, # same as above + */ +u8 arc_prologue(u8 *buf, u32 usage, u16 frame_size) +{ + u8 len = 0; + u32 gp_regs = 0; + + /* Deal with blink first. */ + if (usage & BIT(ARC_R_BLINK)) + len += arc_push_r(BUF(buf, len), ARC_R_BLINK); + + gp_regs = usage & ~(BIT(ARC_R_BLINK) | BIT(ARC_R_FP)); + while (gp_regs) { + u8 reg = __builtin_ffs(gp_regs) - 1; + + len += arc_push_r(BUF(buf, len), reg); + gp_regs &= ~BIT(reg); + } + + /* Deal with fp last. */ + if ((usage & BIT(ARC_R_FP)) || frame_size > 0) + len += arc_push_r(BUF(buf, len), ARC_R_FP); + + if (frame_size > 0) + len += frame_create(BUF(buf, len), frame_size); + +#ifdef ARC_BPF_JIT_DEBUG + if ((usage & BIT(ARC_R_FP)) && frame_size == 0) { + pr_err("FP is being saved while there is no frame."); + BUG(); + } +#endif + + return len; +} + +/* + * mov sp, fp # if frame_size > 0 + * pop fp # if fp is marked as clobbered + * pop r[n-0] # if r[i] is marked as clobbered + * pop blink # if blink is marked as clobbered + * mov r0, r8 # always: ABI_return <- BPF_return + * mov r1, r9 # continuation of above + * j [blink] # always + * + * "fp being marked as clobbered" and "frame_size > 0" are the two sides of + * the same coin. + */ +u8 arc_epilogue(u8 *buf, u32 usage, u16 frame_size) +{ + u32 len = 0; + u32 gp_regs = 0; + +#ifdef ARC_BPF_JIT_DEBUG + if ((usage & BIT(ARC_R_FP)) && frame_size == 0) { + pr_err("FP is being saved while there is no frame."); + BUG(); + } +#endif + + if (frame_size > 0) + len += frame_restore(BUF(buf, len)); + + /* Deal with fp first. */ + if ((usage & BIT(ARC_R_FP)) || frame_size > 0) + len += arc_pop_r(BUF(buf, len), ARC_R_FP); + + gp_regs = usage & ~(BIT(ARC_R_BLINK) | BIT(ARC_R_FP)); + while (gp_regs) { + /* "usage" is 32-bit, each bit indicating an ARC register. */ + u8 reg = 31 - __builtin_clz(gp_regs); + + len += arc_pop_r(BUF(buf, len), reg); + gp_regs &= ~BIT(reg); + } + + /* Deal with blink last. */ + if (usage & BIT(ARC_R_BLINK)) + len += arc_pop_r(BUF(buf, len), ARC_R_BLINK); + + /* Wrap up the return value and jump back to the caller. */ + len += bpf_to_arc_return(BUF(buf, len)); + len += arc_jmp_return(BUF(buf, len)); + + return len; +} + +/* + * For details on the algorithm, see the comments of "gen_jcc_64()". + * + * This data structure is holding information for jump translations. + * + * jit_off: How many bytes into the current JIT address, "b"ranch insn. occurs + * cond: The condition that the ARC branch instruction must use + * + * e.g.: + * + * BPF_JGE R1, R0, @target + * ------------------------ + * | + * v + * 0x1000: cmp r3, r1 # 0x1000 is the JIT address for "BPF_JGE ..." insn + * 0x1004: bhi @target # first jump (branch higher) + * 0x1008: blo @end # second jump acting as a skip (end is 0x1014) + * 0x100C: cmp r2, r0 # the lower 32 bits are evaluated + * 0x1010: bhs @target # third jump (branch higher or same) + * 0x1014: ... + * + * The jit_off(set) of the "bhi" is 4 bytes. + * The cond(ition) for the "bhi" is "CC_great_u". + * + * The jit_off(set) is necessary for calculating the exact displacement + * to the "target" address: + * + * jit_address + jit_off(set) - @target + * 0x1000 + 4 - @target + */ +#define JCC64_NR_OF_JMPS 3 /* Number of jumps in jcc64 template. */ +#define JCC64_INSNS_TO_END 3 /* Number of insn. inclusive the 2nd jmp to end. */ +#define JCC64_SKIP_JMP 1 /* Index of the "skip" jump to "end". */ +const struct { + /* + * "jit_off" is common between all "jmp[]" and is coupled with + * "cond" of each "jmp[]" instance. e.g.: + * + * arcv2_64_jccs.jit_off[1] + * arcv2_64_jccs.jmp[ARC_CC_UGT].cond[1] + * + * Are indicating that the second jump in JITed code of "UGT" + * is at offset "jit_off[1]" while its condition is "cond[1]". + */ + u8 jit_off[JCC64_NR_OF_JMPS]; + + struct { + u8 cond[JCC64_NR_OF_JMPS]; + } jmp[ARC_CC_SLE + 1]; +} arcv2_64_jccs = { + .jit_off = { + INSN_len_normal * 1, + INSN_len_normal * 2, + INSN_len_normal * 4 + }, + /* + * cmp rd_hi, rs_hi + * bhi @target # 1: u> + * blo @end # 2: u< + * cmp rd_lo, rs_lo + * bhi @target # 3: u> + * end: + */ + .jmp[ARC_CC_UGT] = { + .cond = {CC_great_u, CC_less_u, CC_great_u} + }, + /* + * cmp rd_hi, rs_hi + * bhi @target # 1: u> + * blo @end # 2: u< + * cmp rd_lo, rs_lo + * bhs @target # 3: u>= + * end: + */ + .jmp[ARC_CC_UGE] = { + .cond = {CC_great_u, CC_less_u, CC_great_eq_u} + }, + /* + * cmp rd_hi, rs_hi + * blo @target # 1: u< + * bhi @end # 2: u> + * cmp rd_lo, rs_lo + * blo @target # 3: u< + * end: + */ + .jmp[ARC_CC_ULT] = { + .cond = {CC_less_u, CC_great_u, CC_less_u} + }, + /* + * cmp rd_hi, rs_hi + * blo @target # 1: u< + * bhi @end # 2: u> + * cmp rd_lo, rs_lo + * bls @target # 3: u<= + * end: + */ + .jmp[ARC_CC_ULE] = { + .cond = {CC_less_u, CC_great_u, CC_less_eq_u} + }, + /* + * cmp rd_hi, rs_hi + * bgt @target # 1: s> + * blt @end # 2: s< + * cmp rd_lo, rs_lo + * bhi @target # 3: u> + * end: + */ + .jmp[ARC_CC_SGT] = { + .cond = {CC_great_s, CC_less_s, CC_great_u} + }, + /* + * cmp rd_hi, rs_hi + * bgt @target # 1: s> + * blt @end # 2: s< + * cmp rd_lo, rs_lo + * bhs @target # 3: u>= + * end: + */ + .jmp[ARC_CC_SGE] = { + .cond = {CC_great_s, CC_less_s, CC_great_eq_u} + }, + /* + * cmp rd_hi, rs_hi + * blt @target # 1: s< + * bgt @end # 2: s> + * cmp rd_lo, rs_lo + * blo @target # 3: u< + * end: + */ + .jmp[ARC_CC_SLT] = { + .cond = {CC_less_s, CC_great_s, CC_less_u} + }, + /* + * cmp rd_hi, rs_hi + * blt @target # 1: s< + * bgt @end # 2: s> + * cmp rd_lo, rs_lo + * bls @target # 3: u<= + * end: + */ + .jmp[ARC_CC_SLE] = { + .cond = {CC_less_s, CC_great_s, CC_less_eq_u} + } +}; + +/* + * The displacement (offset) for ARC's "b"ranch instruction is the distance + * from the aligned version of _current_ instruction (PCL) to the target + * instruction: + * + * DISP = TARGET - PCL # PCL is the word aligned PC + */ +static inline s32 get_displacement(u32 curr_off, u32 targ_off) +{ + return (s32)(targ_off - (curr_off & ~3L)); +} + +/* + * "disp"lacement should be: + * + * 1. 16-bit aligned. + * 2. fit in S25, because no "condition code" is supposed to be encoded. + */ +static inline bool is_valid_far_disp(s32 disp) +{ + return (!(disp & 1) && IN_S25_RANGE(disp)); +} + +/* + * "disp"lacement should be: + * + * 1. 16-bit aligned. + * 2. fit in S21, because "condition code" is supposed to be encoded too. + */ +static inline bool is_valid_near_disp(s32 disp) +{ + return (!(disp & 1) && IN_S21_RANGE(disp)); +} + +/* + * cmp rd_hi, rs_hi + * cmp.z rd_lo, rs_lo + * b{eq,ne} @target + * | | + * | `--> "eq" param is false (JNE) + * `-----> "eq" param is true (JEQ) + */ +static int gen_j_eq_64(u8 *buf, u8 rd, u8 rs, bool eq, + u32 curr_off, u32 targ_off) +{ + s32 disp; + u8 len = 0; + + len += arc_cmp_r(BUF(buf, len), REG_HI(rd), REG_HI(rs)); + len += arc_cmpz_r(BUF(buf, len), REG_LO(rd), REG_LO(rs)); + disp = get_displacement(curr_off + len, targ_off); + len += arc_bcc(BUF(buf, len), eq ? CC_equal : CC_unequal, disp); + + return len; +} + +/* + * tst rd_hi, rs_hi + * tst.z rd_lo, rs_lo + * bne @target + */ +static u8 gen_jset_64(u8 *buf, u8 rd, u8 rs, u32 curr_off, u32 targ_off) +{ + u8 len = 0; + s32 disp; + + len += arc_tst_r(BUF(buf, len), REG_HI(rd), REG_HI(rs)); + len += arc_tstz_r(BUF(buf, len), REG_LO(rd), REG_LO(rs)); + disp = get_displacement(curr_off + len, targ_off); + len += arc_bcc(BUF(buf, len), CC_unequal, disp); + + return len; +} + +/* + * Verify if all the jumps for a JITed jcc64 operation are valid, + * by consulting the data stored at "arcv2_64_jccs". + */ +static bool check_jcc_64(u32 curr_off, u32 targ_off, u8 cond) +{ + size_t i; + + if (cond >= ARC_CC_LAST) + return false; + + for (i = 0; i < JCC64_NR_OF_JMPS; i++) { + u32 from, to; + + from = curr_off + arcv2_64_jccs.jit_off[i]; + /* for the 2nd jump, we jump to the end of block. */ + if (i != JCC64_SKIP_JMP) + to = targ_off; + else + to = from + (JCC64_INSNS_TO_END * INSN_len_normal); + /* There is a "cc" in the instruction, so a "near" jump. */ + if (!is_valid_near_disp(get_displacement(from, to))) + return false; + } + + return true; +} + +/* Can the jump from "curr_off" to "targ_off" actually happen? */ +bool check_jmp_64(u32 curr_off, u32 targ_off, u8 cond) +{ + s32 disp; + + switch (cond) { + case ARC_CC_UGT: + case ARC_CC_UGE: + case ARC_CC_ULT: + case ARC_CC_ULE: + case ARC_CC_SGT: + case ARC_CC_SGE: + case ARC_CC_SLT: + case ARC_CC_SLE: + return check_jcc_64(curr_off, targ_off, cond); + case ARC_CC_EQ: + case ARC_CC_NE: + case ARC_CC_SET: + /* + * The "jump" for the JITed BPF_J{SET,EQ,NE} is actually the + * 3rd instruction. See comments of "gen_j{set,_eq}_64()". + */ + curr_off += 2 * INSN_len_normal; + disp = get_displacement(curr_off, targ_off); + /* There is a "cc" field in the issued instruction. */ + return is_valid_near_disp(disp); + case ARC_CC_AL: + disp = get_displacement(curr_off, targ_off); + return is_valid_far_disp(disp); + default: + return false; + } +} + +/* + * The template for the 64-bit jumps with the following BPF conditions + * + * u< u<= u> u>= s< s<= s> s>= + * + * Looks like below: + * + * cmp rd_hi, rs_hi + * b @target + * b @end + * cmp rd_lo, rs_lo # if execution reaches here, r{d,s}_hi are equal + * b @target + * end: + * + * "c1" is the condition that JIT is handling minus the equality part. + * For instance if we have to translate an "unsigned greater or equal", + * then "c1" will be "unsigned greater". We won't know about equality + * until all 64-bits of data (higeher and lower registers) are processed. + * + * "c2" is the counter logic of "c1". For instance, if "c1" is originated + * from "s>", then "c2" would be "s<". Notice that equality doesn't play + * a role here either, because the lower 32 bits are not processed yet. + * + * "c3" is the unsigned version of "c1", no matter if the BPF condition + * was signed or unsigned. An unsigned version is necessary, because the + * MSB of the lower 32 bits does not reflect a sign in the whole 64-bit + * scheme. Otherwise, 64-bit comparisons like + * (0x0000_0000,0x8000_0000) s>= (0x0000_0000,0x0000_0000) + * would yield an incorrect result. Finally, if there is an equality + * check in the BPF condition, it will be reflected in "c3". + * + * You can find all the instances of this template where the + * "arcv2_64_jccs" is getting initialised. + */ +static u8 gen_jcc_64(u8 *buf, u8 rd, u8 rs, u8 cond, + u32 curr_off, u32 targ_off) +{ + s32 disp; + u32 end_off; + const u8 *cc = arcv2_64_jccs.jmp[cond].cond; + u8 len = 0; + + /* cmp rd_hi, rs_hi */ + len += arc_cmp_r(buf, REG_HI(rd), REG_HI(rs)); + + /* b @target */ + disp = get_displacement(curr_off + len, targ_off); + len += arc_bcc(BUF(buf, len), cc[0], disp); + + /* b @end */ + end_off = curr_off + len + (JCC64_INSNS_TO_END * INSN_len_normal); + disp = get_displacement(curr_off + len, end_off); + len += arc_bcc(BUF(buf, len), cc[1], disp); + + /* cmp rd_lo, rs_lo */ + len += arc_cmp_r(BUF(buf, len), REG_LO(rd), REG_LO(rs)); + + /* b @target */ + disp = get_displacement(curr_off + len, targ_off); + len += arc_bcc(BUF(buf, len), cc[2], disp); + + return len; +} + +/* + * This function only applies the necessary logic to make the proper + * translations. All the sanity checks must have already been done + * by calling the check_jmp_64(). + */ +u8 gen_jmp_64(u8 *buf, u8 rd, u8 rs, u8 cond, u32 curr_off, u32 targ_off) +{ + u8 len = 0; + bool eq = false; + s32 disp; + + switch (cond) { + case ARC_CC_AL: + disp = get_displacement(curr_off, targ_off); + len = arc_b(buf, disp); + break; + case ARC_CC_UGT: + case ARC_CC_UGE: + case ARC_CC_ULT: + case ARC_CC_ULE: + case ARC_CC_SGT: + case ARC_CC_SGE: + case ARC_CC_SLT: + case ARC_CC_SLE: + len = gen_jcc_64(buf, rd, rs, cond, curr_off, targ_off); + break; + case ARC_CC_EQ: + eq = true; + fallthrough; + case ARC_CC_NE: + len = gen_j_eq_64(buf, rd, rs, eq, curr_off, targ_off); + break; + case ARC_CC_SET: + len = gen_jset_64(buf, rd, rs, curr_off, targ_off); + break; + default: +#ifdef ARC_BPF_JIT_DEBUG + pr_err("64-bit jump condition is not known."); + BUG(); +#endif + } + return len; +} + +/* + * The condition codes to use when generating JIT instructions + * for 32-bit jumps. + * + * The "ARC_CC_AL" index is not really used by the code, but it + * is here for the sake of completeness. + * + * The "ARC_CC_SET" becomes "CC_unequal" because of the "tst" + * instruction that precedes the conditional branch. + */ +const u8 arcv2_32_jmps[ARC_CC_LAST] = { + [ARC_CC_UGT] = CC_great_u, + [ARC_CC_UGE] = CC_great_eq_u, + [ARC_CC_ULT] = CC_less_u, + [ARC_CC_ULE] = CC_less_eq_u, + [ARC_CC_SGT] = CC_great_s, + [ARC_CC_SGE] = CC_great_eq_s, + [ARC_CC_SLT] = CC_less_s, + [ARC_CC_SLE] = CC_less_eq_s, + [ARC_CC_AL] = CC_always, + [ARC_CC_EQ] = CC_equal, + [ARC_CC_NE] = CC_unequal, + [ARC_CC_SET] = CC_unequal +}; + +/* Can the jump from "curr_off" to "targ_off" actually happen? */ +bool check_jmp_32(u32 curr_off, u32 targ_off, u8 cond) +{ + u8 addendum; + s32 disp; + + if (cond >= ARC_CC_LAST) + return false; + + /* + * The unconditional jump happens immediately, while the rest + * are either preceded by a "cmp" or "tst" instruction. + */ + addendum = (cond == ARC_CC_AL) ? 0 : INSN_len_normal; + disp = get_displacement(curr_off + addendum, targ_off); + + if (ARC_CC_AL) + return is_valid_far_disp(disp); + else + return is_valid_near_disp(disp); +} + +/* + * The JITed code for 32-bit (conditional) branches: + * + * ARC_CC_AL @target + * b @jit_targ_addr + * + * ARC_CC_SET rd, rs, @target + * tst rd, rs + * bnz @jit_targ_addr + * + * ARC_CC_xx rd, rs, @target + * cmp rd, rs + * b @jit_targ_addr # cc = arcv2_32_jmps[xx] + */ +u8 gen_jmp_32(u8 *buf, u8 rd, u8 rs, u8 cond, u32 curr_off, u32 targ_off) +{ + s32 disp; + u8 len = 0; + + /* + * Although this must have already been checked by "check_jmp_32()", + * we're not going to risk accessing "arcv2_32_jmps" array without + * the boundary check. + */ + if (cond >= ARC_CC_LAST) { +#ifdef ARC_BPF_JIT_DEBUG + pr_err("32-bit jump condition is not known."); + BUG(); +#endif + return 0; + } + + /* If there is a "condition", issue the "cmp" or "tst" first. */ + if (cond != ARC_CC_AL) { + if (cond == ARC_CC_SET) + len = tst_r32(buf, rd, rs); + else + len = cmp_r32(buf, rd, rs); + /* + * The issued instruction affects the "disp"lacement as + * it alters the "curr_off" by its "len"gth. The "curr_off" + * should always point to the jump instruction. + */ + disp = get_displacement(curr_off + len, targ_off); + len += arc_bcc(BUF(buf, len), arcv2_32_jmps[cond], disp); + } else { + /* The straight forward unconditional jump. */ + disp = get_displacement(curr_off, targ_off); + len = arc_b(buf, disp); + } + + return len; +} + +/* + * Generate code for functions calls. There can be two types of calls: + * + * - Calling another BPF function + * - Calling an in-kernel function which is compiled by ARC gcc + * + * In the later case, we must comply to ARCv2 ABI and handle arguments + * and return values accordingly. + */ +u8 gen_func_call(u8 *buf, ARC_ADDR func_addr, bool external_func) +{ + u8 len = 0; + + /* + * In case of an in-kernel function call, always push the 5th + * argument onto the stack, because that's where the ABI dictates + * it should be found. If the callee doesn't really use it, no harm + * is done. The stack is readjusted either way after the call. + */ + if (external_func) + len += push_r64(BUF(buf, len), BPF_REG_5); + + len += jump_and_link(BUF(buf, len), func_addr); + + if (external_func) + len += arc_add_i(BUF(buf, len), ARC_R_SP, ARC_R_SP, ARG5_SIZE); + + return len; +} diff --git a/arch/arc/net/bpf_jit_core.c b/arch/arc/net/bpf_jit_core.c new file mode 100644 index 000000000000..6f6b4ffccf2c --- /dev/null +++ b/arch/arc/net/bpf_jit_core.c @@ -0,0 +1,1425 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * The back-end-agnostic part of Just-In-Time compiler for eBPF bytecode. + * + * Copyright (c) 2024 Synopsys Inc. + * Author: Shahab Vahedi + */ +#include +#include "bpf_jit.h" + +/* + * Check for the return value. A pattern used often in this file. + * There must be a "ret" variable of type "int" in the scope. + */ +#define CHECK_RET(cmd) \ + do { \ + ret = (cmd); \ + if (ret < 0) \ + return ret; \ + } while (0) + +#ifdef ARC_BPF_JIT_DEBUG +/* Dumps bytes in /var/log/messages at KERN_INFO level (4). */ +static void dump_bytes(const u8 *buf, u32 len, const char *header) +{ + u8 line[64]; + size_t i, j; + + pr_info("-----------------[ %s ]-----------------\n", header); + + for (i = 0, j = 0; i < len; i++) { + /* Last input byte? */ + if (i == len - 1) { + j += scnprintf(line + j, 64 - j, "0x%02x", buf[i]); + pr_info("%s\n", line); + break; + } + /* End of line? */ + else if (i % 8 == 7) { + j += scnprintf(line + j, 64 - j, "0x%02x", buf[i]); + pr_info("%s\n", line); + j = 0; + } else { + j += scnprintf(line + j, 64 - j, "0x%02x, ", buf[i]); + } + } +} +#endif /* ARC_BPF_JIT_DEBUG */ + +/********************* JIT context ***********************/ + +/* + * buf: Translated instructions end up here. + * len: The length of whole block in bytes. + * index: The offset at which the _next_ instruction may be put. + */ +struct jit_buffer { + u8 *buf; + u32 len; + u32 index; +}; + +/* + * This is a subset of "struct jit_context" that its information is deemed + * necessary for the next extra pass to come. + * + * bpf_header: Needed to finally lock the region. + * bpf2insn: Used to find the translation for instructions of interest. + * + * Things like "jit.buf" and "jit.len" can be retrieved respectively from + * "prog->bpf_func" and "prog->jited_len". + */ +struct arc_jit_data { + struct bpf_binary_header *bpf_header; + u32 *bpf2insn; +}; + +/* + * The JIT pertinent context that is used by different functions. + * + * prog: The current eBPF program being handled. + * orig_prog: The original eBPF program before any possible change. + * jit: The JIT buffer and its length. + * bpf_header: The JITed program header. "jit.buf" points inside it. + * emit: If set, opcodes are written to memory; else, a dry-run. + * do_zext: If true, 32-bit sub-regs must be zero extended. + * bpf2insn: Maps BPF insn indices to their counterparts in jit.buf. + * bpf2insn_valid: Indicates if "bpf2ins" is populated with the mappings. + * jit_data: A piece of memory to transfer data to the next pass. + * arc_regs_clobbered: Each bit status determines if that arc reg is clobbered. + * save_blink: Whether ARC's "blink" register needs to be saved. + * frame_size: Derived from "prog->aux->stack_depth". + * epilogue_offset: Used by early "return"s in the code to jump here. + * need_extra_pass: A forecast if an "extra_pass" will occur. + * is_extra_pass: Indicates if the current pass is an extra pass. + * user_bpf_prog: True, if VM opcodes come from a real program. + * blinded: True if "constant blinding" step returned a new "prog". + * success: Indicates if the whole JIT went OK. + */ +struct jit_context { + struct bpf_prog *prog; + struct bpf_prog *orig_prog; + struct jit_buffer jit; + struct bpf_binary_header *bpf_header; + bool emit; + bool do_zext; + u32 *bpf2insn; + bool bpf2insn_valid; + struct arc_jit_data *jit_data; + u32 arc_regs_clobbered; + bool save_blink; + u16 frame_size; + u32 epilogue_offset; + bool need_extra_pass; + bool is_extra_pass; + bool user_bpf_prog; + bool blinded; + bool success; +}; + +/* + * If we're in ARC_BPF_JIT_DEBUG mode and the debug level is right, dump the + * input BPF stream. "bpf_jit_dump()" is not fully suited for this purpose. + */ +static void vm_dump(const struct bpf_prog *prog) +{ +#ifdef ARC_BPF_JIT_DEBUG + if (bpf_jit_enable > 1) + dump_bytes((u8 *)prog->insns, 8 * prog->len, " VM "); +#endif +} + +/* + * If the right level of debug is set, dump the bytes. There are 2 variants + * of this function: + * + * 1. Use the standard bpf_jit_dump() which is meant only for JITed code. + * 2. Use the dump_bytes() to match its "vm_dump()" instance. + */ +static void jit_dump(const struct jit_context *ctx) +{ +#ifdef ARC_BPF_JIT_DEBUG + u8 header[8]; +#endif + const int pass = ctx->is_extra_pass ? 2 : 1; + + if (bpf_jit_enable <= 1 || !ctx->prog->jited) + return; + +#ifdef ARC_BPF_JIT_DEBUG + scnprintf(header, sizeof(header), "JIT:%d", pass); + dump_bytes(ctx->jit.buf, ctx->jit.len, header); + pr_info("\n"); +#else + bpf_jit_dump(ctx->prog->len, ctx->jit.len, pass, ctx->jit.buf); +#endif +} + +/* Initialise the context so there's no garbage. */ +static int jit_ctx_init(struct jit_context *ctx, struct bpf_prog *prog) +{ + memset(ctx, 0, sizeof(ctx)); + + ctx->orig_prog = prog; + + /* If constant blinding was requested but failed, scram. */ + ctx->prog = bpf_jit_blind_constants(prog); + if (IS_ERR(ctx->prog)) + return PTR_ERR(ctx->prog); + ctx->blinded = (ctx->prog == ctx->orig_prog ? false : true); + + /* If the verifier doesn't zero-extend, then we have to do it. */ + ctx->do_zext = !ctx->prog->aux->verifier_zext; + + ctx->is_extra_pass = ctx->prog->jited; + ctx->user_bpf_prog = ctx->prog->is_func; + + return 0; +} + +/* + * Only after the first iteration of normal pass (the dry-run), + * there are valid offsets in ctx->bpf2insn array. + */ +static inline bool offsets_available(const struct jit_context *ctx) +{ + return ctx->bpf2insn_valid; +} + +/* + * "*mem" should be freed when there is no "extra pass" to come, + * or the compilation terminated abruptly. A few of such memory + * allocations are: ctx->jit_data and ctx->bpf2insn. + */ +static inline void maybe_free(struct jit_context *ctx, void **mem) +{ + if (*mem) { + if (!ctx->success || !ctx->need_extra_pass) { + kfree(*mem); + *mem = NULL; + } + } +} + +/* + * Free memories based on the status of the context. + * + * A note about "bpf_header": On successful runs, "bpf_header" is + * not freed, because "jit.buf", a sub-array of it, is returned as + * the "bpf_func". However, "bpf_header" is lost and nothing points + * to it. This should not cause a leakage, because apparently + * "bpf_header" can be revived by "bpf_jit_binary_hdr()". This is + * how "bpf_jit_free()" in "kernel/bpf/core.c" releases the memory. + */ +static void jit_ctx_cleanup(struct jit_context *ctx) +{ + if (ctx->blinded) { + /* if all went well, release the orig_prog. */ + if (ctx->success) + bpf_jit_prog_release_other(ctx->prog, ctx->orig_prog); + else + bpf_jit_prog_release_other(ctx->orig_prog, ctx->prog); + } + + maybe_free(ctx, (void **)&ctx->bpf2insn); + maybe_free(ctx, (void **)&ctx->jit_data); + + if (!ctx->bpf2insn) + ctx->bpf2insn_valid = false; + + /* Freeing "bpf_header" is enough. "jit.buf" is a sub-array of it. */ + if (!ctx->success && ctx->bpf_header) { + bpf_jit_binary_free(ctx->bpf_header); + ctx->bpf_header = NULL; + ctx->jit.buf = NULL; + ctx->jit.index = 0; + ctx->jit.len = 0; + } + + ctx->emit = false; + ctx->do_zext = false; +} + +/* + * Analyse the register usage and record the frame size. + * The register usage is determined by consulting the back-end. + */ +static void analyze_reg_usage(struct jit_context *ctx) +{ + size_t i; + u32 usage = 0; + const struct bpf_insn *insn = ctx->prog->insnsi; + + for (i = 0; i < ctx->prog->len; i++) { + u8 bpf_reg; + bool call; + + bpf_reg = insn[i].dst_reg; + call = (insn[i].code == (BPF_JMP | BPF_CALL)) ? true : false; + usage |= mask_for_used_regs(bpf_reg, call); + } + + ctx->arc_regs_clobbered = usage; + ctx->frame_size = ctx->prog->aux->stack_depth; +} + +/* Verify that no instruction will be emitted when there is no buffer. */ +static inline int jit_buffer_check(const struct jit_context *ctx) +{ + if (ctx->emit) { + if (!ctx->jit.buf) { + pr_err("bpf-jit: inconsistence state; no " + "buffer to emit instructions.\n"); + return -EINVAL; + } else if (ctx->jit.index > ctx->jit.len) { + pr_err("bpf-jit: estimated JIT length is less " + "than the emitted instructions.\n"); + return -EFAULT; + } + } + return 0; +} + +/* On a dry-run (emit=false), "jit.len" is growing gradually. */ +static inline void jit_buffer_update(struct jit_context *ctx, u32 n) +{ + if (!ctx->emit) + ctx->jit.len += n; + else + ctx->jit.index += n; +} + +/* Based on "emit", determine the address where instructions are emitted. */ +static inline u8 *effective_jit_buf(const struct jit_context *ctx) +{ + return ctx->emit ? (ctx->jit.buf + ctx->jit.index) : NULL; +} + +/* Prologue based on context variables set by "analyze_reg_usage()". */ +static int handle_prologue(struct jit_context *ctx) +{ + int ret; + u8 *buf = effective_jit_buf(ctx); + u32 len = 0; + + CHECK_RET(jit_buffer_check(ctx)); + + len = arc_prologue(buf, ctx->arc_regs_clobbered, ctx->frame_size); + jit_buffer_update(ctx, len); + + return 0; +} + +/* The counter part for "handle_prologue()". */ +static int handle_epilogue(struct jit_context *ctx) +{ + int ret; + u8 *buf = effective_jit_buf(ctx); + u32 len = 0; + + CHECK_RET(jit_buffer_check(ctx)); + + len = arc_epilogue(buf, ctx->arc_regs_clobbered, ctx->frame_size); + jit_buffer_update(ctx, len); + + return 0; +} + +/* Tell which number of the BPF instruction we are dealing with. */ +static inline s32 get_index_for_insn(const struct jit_context *ctx, + const struct bpf_insn *insn) +{ + return (insn - ctx->prog->insnsi); +} + +/* + * In most of the cases, the "offset" is read from "insn->off". However, + * if it is an unconditional BPF_JMP32, then it comes from "insn->imm". + * + * (Courtesy of "cpu=v4" support) + */ +static inline s32 get_offset(const struct bpf_insn *insn) +{ + if ((BPF_CLASS(insn->code) == BPF_JMP32) && + (BPF_OP(insn->code) == BPF_JA)) + return insn->imm; + else + return insn->off; +} + +/* + * Determine to which number of the BPF instruction we're jumping to. + * + * The "offset" is interpreted as the "number" of BPF instructions + * from the _next_ BPF instruction. e.g.: + * + * 4 means 4 instructions after the next insn + * 0 means 0 instructions after the next insn -> fallthrough. + * -1 means 1 instruction before the next insn -> jmp to current insn. + * + * Another way to look at this, "offset" is the number of instructions + * that exist between the current instruction and the target instruction. + * + * It is worth noting that a "mov r,i64", which is 16-byte long, is + * treated as two instructions long, therefore "offset" needn't be + * treated specially for those. Everything is uniform. + */ +static inline s32 get_target_index_for_insn(const struct jit_context *ctx, + const struct bpf_insn *insn) +{ + return (get_index_for_insn(ctx, insn) + 1) + get_offset(insn); +} + +/* Is there an immediate operand encoded in the "insn"? */ +static inline bool has_imm(const struct bpf_insn *insn) +{ + return BPF_SRC(insn->code) == BPF_K; +} + +/* Is the last BPF instruction? */ +static inline bool is_last_insn(const struct bpf_prog *prog, u32 idx) +{ + return idx == (prog->len - 1); +} + +/* + * Invocation of this function, conditionally signals the need for + * an extra pass. The conditions that must be met are: + * + * 1. The current pass itself shouldn't be an extra pass. + * 2. The stream of bytes being JITed must come from a user program. + */ +static inline void set_need_for_extra_pass(struct jit_context *ctx) +{ + if (!ctx->is_extra_pass) + ctx->need_extra_pass = ctx->user_bpf_prog; +} + +/* + * Check if the "size" is valid and then transfer the control to + * the back-end for the swap. + */ +static int handle_swap(u8 *buf, u8 rd, u8 size, u8 endian, + bool force, bool do_zext, u8 *len) +{ + /* Sanity check on the size. */ + switch (size) { + case 16: + case 32: + case 64: + break; + default: + pr_err("bpf-jit: invalid size for swap.\n"); + return -EINVAL; + } + + *len = gen_swap(buf, rd, size, endian, force, do_zext); + + return 0; +} + +/* Checks if the (instruction) index is in valid range. */ +static inline bool check_insn_idx_valid(const struct jit_context *ctx, + const s32 idx) +{ + return (idx >= 0 && idx < ctx->prog->len); +} + +/* + * Decouple the back-end from BPF by converting BPF conditions + * to internal enum. ARC_CC_* start from 0 and are used as index + * to an array. BPF_J* usage must end after this conversion. + */ +static int bpf_cond_to_arc(const u8 op, u8 *arc_cc) +{ + switch (op) { + case BPF_JA: + *arc_cc = ARC_CC_AL; + break; + case BPF_JEQ: + *arc_cc = ARC_CC_EQ; + break; + case BPF_JGT: + *arc_cc = ARC_CC_UGT; + break; + case BPF_JGE: + *arc_cc = ARC_CC_UGE; + break; + case BPF_JSET: + *arc_cc = ARC_CC_SET; + break; + case BPF_JNE: + *arc_cc = ARC_CC_NE; + break; + case BPF_JSGT: + *arc_cc = ARC_CC_SGT; + break; + case BPF_JSGE: + *arc_cc = ARC_CC_SGE; + break; + case BPF_JLT: + *arc_cc = ARC_CC_ULT; + break; + case BPF_JLE: + *arc_cc = ARC_CC_ULE; + break; + case BPF_JSLT: + *arc_cc = ARC_CC_SLT; + break; + case BPF_JSLE: + *arc_cc = ARC_CC_SLE; + break; + default: + pr_err("bpf-jit: can't handle condition 0x%02X\n", op); + return -EINVAL; + } + return 0; +} + +/* + * Check a few things for a supposedly "jump" instruction: + * + * 0. "insn" is a "jump" instruction, but not the "call/exit" variant. + * 1. The current "insn" index is in valid range. + * 2. The index of target instruction is in valid range. + */ +static int check_bpf_jump(const struct jit_context *ctx, + const struct bpf_insn *insn) +{ + const u8 class = BPF_CLASS(insn->code); + const u8 op = BPF_OP(insn->code); + + /* Must be a jmp(32) instruction that is not a "call/exit". */ + if ((class != BPF_JMP && class != BPF_JMP32) || + (op == BPF_CALL || op == BPF_EXIT)) { + pr_err("bpf-jit: not a jump instruction.\n"); + return -EINVAL; + } + + if (!check_insn_idx_valid(ctx, get_index_for_insn(ctx, insn))) { + pr_err("bpf-jit: the bpf jump insn is not in prog.\n"); + return -EINVAL; + } + + if (!check_insn_idx_valid(ctx, get_target_index_for_insn(ctx, insn))) { + pr_err("bpf-jit: bpf jump label is out of range.\n"); + return -EINVAL; + } + + return 0; +} + +/* + * Based on input "insn", consult "ctx->bpf2insn" to get the + * related index (offset) of the translation in JIT stream. + */ +static u32 get_curr_jit_off(const struct jit_context *ctx, + const struct bpf_insn *insn) +{ + const s32 idx = get_index_for_insn(ctx, insn); +#ifdef ARC_BPF_JIT_DEBUG + BUG_ON(!offsets_available(ctx) || !check_insn_idx_valid(ctx, idx)); +#endif + return ctx->bpf2insn[idx]; +} + +/* + * The input "insn" must be a jump instruction. + * + * Based on input "insn", consult "ctx->bpf2insn" to get the + * related JIT index (offset) of "target instruction" that + * "insn" would jump to. + */ +static u32 get_targ_jit_off(const struct jit_context *ctx, + const struct bpf_insn *insn) +{ + const s32 tidx = get_target_index_for_insn(ctx, insn); +#ifdef ARC_BPF_JIT_DEBUG + BUG_ON(!offsets_available(ctx) || !check_insn_idx_valid(ctx, tidx)); +#endif + return ctx->bpf2insn[tidx]; +} + +/* + * This function will return 0 for a feasible jump. + * + * Consult the back-end to check if it finds it feasible to emit + * the necessary instructions based on "cond" and the displacement + * between the "from_off" and the "to_off". + */ +static int feasible_jit_jump(u32 from_off, u32 to_off, u8 cond, bool j32) +{ + int ret = 0; + + if (j32) { + if (!check_jmp_32(from_off, to_off, cond)) + ret = -EFAULT; + } else { + if (!check_jmp_64(from_off, to_off, cond)) + ret = -EFAULT; + } + + if (ret != 0) + pr_err("bpf-jit: the JIT displacement is not OK.\n"); + + return ret; +} + +/* + * This jump handler performs the following steps: + * + * 1. Compute ARC's internal condition code from BPF's + * 2. Determine the bitness of the operation (32 vs. 64) + * 3. Sanity check on BPF stream + * 4. Sanity check on what is supposed to be JIT's displacement + * 5. And finally, emit the necessary instructions + * + * The last two steps are performed through the back-end. + * The value of steps 1 and 2 are necessary inputs for the back-end. + */ +static int handle_jumps(const struct jit_context *ctx, + const struct bpf_insn *insn, + u8 *len) +{ + u8 cond; + int ret = 0; + u8 *buf = effective_jit_buf(ctx); + const bool j32 = (BPF_CLASS(insn->code) == BPF_JMP32) ? true : false; + const u8 rd = insn->dst_reg; + u8 rs = insn->src_reg; + u32 curr_off = 0, targ_off = 0; + + *len = 0; + + /* Map the BPF condition to internal enum. */ + CHECK_RET(bpf_cond_to_arc(BPF_OP(insn->code), &cond)); + + /* Sanity check on the BPF byte stream. */ + CHECK_RET(check_bpf_jump(ctx, insn)); + + /* + * Move the immediate into a temporary register _now_ for 2 reasons: + * + * 1. "gen_jmp_{32,64}()" deal with operands in registers. + * + * 2. The "len" parameter will grow so that the current jit offset + * (curr_off) will have increased to a point where the necessary + * instructions can be inserted by "gen_jmp_{32,64}()". + */ + if (has_imm(insn) && cond != ARC_CC_AL) { + if (j32) { + *len += mov_r32_i32(BUF(buf, *len), JIT_REG_TMP, + insn->imm); + } else { + *len += mov_r64_i32(BUF(buf, *len), JIT_REG_TMP, + insn->imm); + } + rs = JIT_REG_TMP; + } + + /* If the offsets are known, check if the branch can occur. */ + if (offsets_available(ctx)) { + curr_off = get_curr_jit_off(ctx, insn) + *len; + targ_off = get_targ_jit_off(ctx, insn); + + /* Sanity check on the back-end side. */ + CHECK_RET(feasible_jit_jump(curr_off, targ_off, cond, j32)); + } + + if (j32) { + *len += gen_jmp_32(BUF(buf, *len), rd, rs, cond, + curr_off, targ_off); + } else { + *len += gen_jmp_64(BUF(buf, *len), rd, rs, cond, + curr_off, targ_off); + } + + return ret; +} + +/* Jump to translated epilogue address. */ +static int handle_jmp_epilogue(struct jit_context *ctx, + const struct bpf_insn *insn, u8 *len) +{ + u8 *buf = effective_jit_buf(ctx); + u32 curr_off = 0, epi_off = 0; + + /* Check the offset only if the data is available. */ + if (offsets_available(ctx)) { + curr_off = get_curr_jit_off(ctx, insn); + epi_off = ctx->epilogue_offset; + + if (!check_jmp_64(curr_off, epi_off, ARC_CC_AL)) { + pr_err("bpf-jit: epilogue offset is not valid.\n"); + return -EINVAL; + } + } + + /* Jump to "epilogue offset" (rd and rs don't matter). */ + *len = gen_jmp_64(buf, 0, 0, ARC_CC_AL, curr_off, epi_off); + + return 0; +} + +/* Try to get the resolved address and generate the instructions. */ +static int handle_call(struct jit_context *ctx, + const struct bpf_insn *insn, + u8 *len) +{ + int ret; + bool in_kernel_func, fixed = false; + u64 addr = 0; + u8 *buf = effective_jit_buf(ctx); + + ret = bpf_jit_get_func_addr(ctx->prog, insn, ctx->is_extra_pass, + &addr, &fixed); + if (ret < 0) { + pr_err("bpf-jit: can't get the address for call.\n"); + return ret; + } + in_kernel_func = (fixed ? true : false); + + /* No valuable address retrieved (yet). */ + if (!fixed && !addr) + set_need_for_extra_pass(ctx); + + *len = gen_func_call(buf, (ARC_ADDR)addr, in_kernel_func); + + if (insn->src_reg != BPF_PSEUDO_CALL) { + /* Assigning ABI's return reg to JIT's return reg. */ + *len += arc_to_bpf_return(BUF(buf, *len)); + } + + return 0; +} + +/* + * Try to generate instructions for loading a 64-bit immediate. + * These sort of instructions are usually associated with the 64-bit + * relocations: R_BPF_64_64. Therefore, signal the need for an extra + * pass if the circumstances are right. + */ +static int handle_ld_imm64(struct jit_context *ctx, + const struct bpf_insn *insn, + u8 *len) +{ + const s32 idx = get_index_for_insn(ctx, insn); + u8 *buf = effective_jit_buf(ctx); + + /* We're about to consume 2 VM instructions. */ + if (is_last_insn(ctx->prog, idx)) { + pr_err("bpf-jit: need more data for 64-bit immediate.\n"); + return -EINVAL; + } + + *len = mov_r64_i64(buf, insn->dst_reg, insn->imm, (insn + 1)->imm); + + if (bpf_pseudo_func(insn)) + set_need_for_extra_pass(ctx); + + return 0; +} + +/* + * Handles one eBPF instruction at a time. To make this function faster, + * it does not call "jit_buffer_check()". Else, it would call it for every + * instruction. As a result, it should not be invoked directly. Only + * "handle_body()", that has already executed the "check", may call this + * function. + * + * If the "ret" value is negative, something has went wrong. Else, + * it mostly holds the value 0 and rarely 1. Number 1 signals + * the loop in "handle_body()" to skip the next instruction, because + * it has been consumed as part of a 64-bit immediate value. + */ +static int handle_insn(struct jit_context *ctx, u32 idx) +{ + const struct bpf_insn *insn = &ctx->prog->insnsi[idx]; + const u8 code = insn->code; + const u8 dst = insn->dst_reg; + const u8 src = insn->src_reg; + const s16 off = insn->off; + const s32 imm = insn->imm; + u8 *buf = effective_jit_buf(ctx); + u8 len = 0; + int ret = 0; + + switch (code) { + /* dst += src (32-bit) */ + case BPF_ALU | BPF_ADD | BPF_X: + len = add_r32(buf, dst, src); + break; + /* dst += imm (32-bit) */ + case BPF_ALU | BPF_ADD | BPF_K: + len = add_r32_i32(buf, dst, imm); + break; + /* dst -= src (32-bit) */ + case BPF_ALU | BPF_SUB | BPF_X: + len = sub_r32(buf, dst, src); + break; + /* dst -= imm (32-bit) */ + case BPF_ALU | BPF_SUB | BPF_K: + len = sub_r32_i32(buf, dst, imm); + break; + /* dst = -dst (32-bit) */ + case BPF_ALU | BPF_NEG: + len = neg_r32(buf, dst); + break; + /* dst *= src (32-bit) */ + case BPF_ALU | BPF_MUL | BPF_X: + len = mul_r32(buf, dst, src); + break; + /* dst *= imm (32-bit) */ + case BPF_ALU | BPF_MUL | BPF_K: + len = mul_r32_i32(buf, dst, imm); + break; + /* dst /= src (32-bit) */ + case BPF_ALU | BPF_DIV | BPF_X: + len = div_r32(buf, dst, src, off == 1); + break; + /* dst /= imm (32-bit) */ + case BPF_ALU | BPF_DIV | BPF_K: + len = div_r32_i32(buf, dst, imm, off == 1); + break; + /* dst %= src (32-bit) */ + case BPF_ALU | BPF_MOD | BPF_X: + len = mod_r32(buf, dst, src, off == 1); + break; + /* dst %= imm (32-bit) */ + case BPF_ALU | BPF_MOD | BPF_K: + len = mod_r32_i32(buf, dst, imm, off == 1); + break; + /* dst &= src (32-bit) */ + case BPF_ALU | BPF_AND | BPF_X: + len = and_r32(buf, dst, src); + break; + /* dst &= imm (32-bit) */ + case BPF_ALU | BPF_AND | BPF_K: + len = and_r32_i32(buf, dst, imm); + break; + /* dst |= src (32-bit) */ + case BPF_ALU | BPF_OR | BPF_X: + len = or_r32(buf, dst, src); + break; + /* dst |= imm (32-bit) */ + case BPF_ALU | BPF_OR | BPF_K: + len = or_r32_i32(buf, dst, imm); + break; + /* dst ^= src (32-bit) */ + case BPF_ALU | BPF_XOR | BPF_X: + len = xor_r32(buf, dst, src); + break; + /* dst ^= imm (32-bit) */ + case BPF_ALU | BPF_XOR | BPF_K: + len = xor_r32_i32(buf, dst, imm); + break; + /* dst <<= src (32-bit) */ + case BPF_ALU | BPF_LSH | BPF_X: + len = lsh_r32(buf, dst, src); + break; + /* dst <<= imm (32-bit) */ + case BPF_ALU | BPF_LSH | BPF_K: + len = lsh_r32_i32(buf, dst, imm); + break; + /* dst >>= src (32-bit) [unsigned] */ + case BPF_ALU | BPF_RSH | BPF_X: + len = rsh_r32(buf, dst, src); + break; + /* dst >>= imm (32-bit) [unsigned] */ + case BPF_ALU | BPF_RSH | BPF_K: + len = rsh_r32_i32(buf, dst, imm); + break; + /* dst >>= src (32-bit) [signed] */ + case BPF_ALU | BPF_ARSH | BPF_X: + len = arsh_r32(buf, dst, src); + break; + /* dst >>= imm (32-bit) [signed] */ + case BPF_ALU | BPF_ARSH | BPF_K: + len = arsh_r32_i32(buf, dst, imm); + break; + /* dst = src (32-bit) */ + case BPF_ALU | BPF_MOV | BPF_X: + len = mov_r32(buf, dst, src, (u8)off); + break; + /* dst = imm32 (32-bit) */ + case BPF_ALU | BPF_MOV | BPF_K: + len = mov_r32_i32(buf, dst, imm); + break; + /* dst = swap(dst) */ + case BPF_ALU | BPF_END | BPF_FROM_LE: + case BPF_ALU | BPF_END | BPF_FROM_BE: + case BPF_ALU64 | BPF_END | BPF_FROM_LE: { + CHECK_RET(handle_swap(buf, dst, imm, BPF_SRC(code), + BPF_CLASS(code) == BPF_ALU64, + ctx->do_zext, &len)); + break; + } + /* dst += src (64-bit) */ + case BPF_ALU64 | BPF_ADD | BPF_X: + len = add_r64(buf, dst, src); + break; + /* dst += imm32 (64-bit) */ + case BPF_ALU64 | BPF_ADD | BPF_K: + len = add_r64_i32(buf, dst, imm); + break; + /* dst -= src (64-bit) */ + case BPF_ALU64 | BPF_SUB | BPF_X: + len = sub_r64(buf, dst, src); + break; + /* dst -= imm32 (64-bit) */ + case BPF_ALU64 | BPF_SUB | BPF_K: + len = sub_r64_i32(buf, dst, imm); + break; + /* dst = -dst (64-bit) */ + case BPF_ALU64 | BPF_NEG: + len = neg_r64(buf, dst); + break; + /* dst *= src (64-bit) */ + case BPF_ALU64 | BPF_MUL | BPF_X: + len = mul_r64(buf, dst, src); + break; + /* dst *= imm32 (64-bit) */ + case BPF_ALU64 | BPF_MUL | BPF_K: + len = mul_r64_i32(buf, dst, imm); + break; + /* dst &= src (64-bit) */ + case BPF_ALU64 | BPF_AND | BPF_X: + len = and_r64(buf, dst, src); + break; + /* dst &= imm32 (64-bit) */ + case BPF_ALU64 | BPF_AND | BPF_K: + len = and_r64_i32(buf, dst, imm); + break; + /* dst |= src (64-bit) */ + case BPF_ALU64 | BPF_OR | BPF_X: + len = or_r64(buf, dst, src); + break; + /* dst |= imm32 (64-bit) */ + case BPF_ALU64 | BPF_OR | BPF_K: + len = or_r64_i32(buf, dst, imm); + break; + /* dst ^= src (64-bit) */ + case BPF_ALU64 | BPF_XOR | BPF_X: + len = xor_r64(buf, dst, src); + break; + /* dst ^= imm32 (64-bit) */ + case BPF_ALU64 | BPF_XOR | BPF_K: + len = xor_r64_i32(buf, dst, imm); + break; + /* dst <<= src (64-bit) */ + case BPF_ALU64 | BPF_LSH | BPF_X: + len = lsh_r64(buf, dst, src); + break; + /* dst <<= imm32 (64-bit) */ + case BPF_ALU64 | BPF_LSH | BPF_K: + len = lsh_r64_i32(buf, dst, imm); + break; + /* dst >>= src (64-bit) [unsigned] */ + case BPF_ALU64 | BPF_RSH | BPF_X: + len = rsh_r64(buf, dst, src); + break; + /* dst >>= imm32 (64-bit) [unsigned] */ + case BPF_ALU64 | BPF_RSH | BPF_K: + len = rsh_r64_i32(buf, dst, imm); + break; + /* dst >>= src (64-bit) [signed] */ + case BPF_ALU64 | BPF_ARSH | BPF_X: + len = arsh_r64(buf, dst, src); + break; + /* dst >>= imm32 (64-bit) [signed] */ + case BPF_ALU64 | BPF_ARSH | BPF_K: + len = arsh_r64_i32(buf, dst, imm); + break; + /* dst = src (64-bit) */ + case BPF_ALU64 | BPF_MOV | BPF_X: + len = mov_r64(buf, dst, src, (u8)off); + break; + /* dst = imm32 (sign extend to 64-bit) */ + case BPF_ALU64 | BPF_MOV | BPF_K: + len = mov_r64_i32(buf, dst, imm); + break; + /* dst = imm64 */ + case BPF_LD | BPF_DW | BPF_IMM: + CHECK_RET(handle_ld_imm64(ctx, insn, &len)); + /* Tell the loop to skip the next instruction. */ + ret = 1; + break; + /* dst = *(size *)(src + off) */ + case BPF_LDX | BPF_MEM | BPF_W: + case BPF_LDX | BPF_MEM | BPF_H: + case BPF_LDX | BPF_MEM | BPF_B: + case BPF_LDX | BPF_MEM | BPF_DW: + len = load_r(buf, dst, src, off, BPF_SIZE(code), false); + break; + case BPF_LDX | BPF_MEMSX | BPF_W: + case BPF_LDX | BPF_MEMSX | BPF_H: + case BPF_LDX | BPF_MEMSX | BPF_B: + len = load_r(buf, dst, src, off, BPF_SIZE(code), true); + break; + /* *(size *)(dst + off) = src */ + case BPF_STX | BPF_MEM | BPF_W: + case BPF_STX | BPF_MEM | BPF_H: + case BPF_STX | BPF_MEM | BPF_B: + case BPF_STX | BPF_MEM | BPF_DW: + len = store_r(buf, src, dst, off, BPF_SIZE(code)); + break; + case BPF_ST | BPF_MEM | BPF_W: + case BPF_ST | BPF_MEM | BPF_H: + case BPF_ST | BPF_MEM | BPF_B: + case BPF_ST | BPF_MEM | BPF_DW: + len = store_i(buf, imm, dst, off, BPF_SIZE(code)); + break; + case BPF_JMP | BPF_JA: + case BPF_JMP | BPF_JEQ | BPF_X: + case BPF_JMP | BPF_JEQ | BPF_K: + case BPF_JMP | BPF_JNE | BPF_X: + case BPF_JMP | BPF_JNE | BPF_K: + case BPF_JMP | BPF_JSET | BPF_X: + case BPF_JMP | BPF_JSET | BPF_K: + case BPF_JMP | BPF_JGT | BPF_X: + case BPF_JMP | BPF_JGT | BPF_K: + case BPF_JMP | BPF_JGE | BPF_X: + case BPF_JMP | BPF_JGE | BPF_K: + case BPF_JMP | BPF_JSGT | BPF_X: + case BPF_JMP | BPF_JSGT | BPF_K: + case BPF_JMP | BPF_JSGE | BPF_X: + case BPF_JMP | BPF_JSGE | BPF_K: + case BPF_JMP | BPF_JLT | BPF_X: + case BPF_JMP | BPF_JLT | BPF_K: + case BPF_JMP | BPF_JLE | BPF_X: + case BPF_JMP | BPF_JLE | BPF_K: + case BPF_JMP | BPF_JSLT | BPF_X: + case BPF_JMP | BPF_JSLT | BPF_K: + case BPF_JMP | BPF_JSLE | BPF_X: + case BPF_JMP | BPF_JSLE | BPF_K: + case BPF_JMP32 | BPF_JA: + case BPF_JMP32 | BPF_JEQ | BPF_X: + case BPF_JMP32 | BPF_JEQ | BPF_K: + case BPF_JMP32 | BPF_JNE | BPF_X: + case BPF_JMP32 | BPF_JNE | BPF_K: + case BPF_JMP32 | BPF_JSET | BPF_X: + case BPF_JMP32 | BPF_JSET | BPF_K: + case BPF_JMP32 | BPF_JGT | BPF_X: + case BPF_JMP32 | BPF_JGT | BPF_K: + case BPF_JMP32 | BPF_JGE | BPF_X: + case BPF_JMP32 | BPF_JGE | BPF_K: + case BPF_JMP32 | BPF_JSGT | BPF_X: + case BPF_JMP32 | BPF_JSGT | BPF_K: + case BPF_JMP32 | BPF_JSGE | BPF_X: + case BPF_JMP32 | BPF_JSGE | BPF_K: + case BPF_JMP32 | BPF_JLT | BPF_X: + case BPF_JMP32 | BPF_JLT | BPF_K: + case BPF_JMP32 | BPF_JLE | BPF_X: + case BPF_JMP32 | BPF_JLE | BPF_K: + case BPF_JMP32 | BPF_JSLT | BPF_X: + case BPF_JMP32 | BPF_JSLT | BPF_K: + case BPF_JMP32 | BPF_JSLE | BPF_X: + case BPF_JMP32 | BPF_JSLE | BPF_K: + CHECK_RET(handle_jumps(ctx, insn, &len)); + break; + case BPF_JMP | BPF_CALL: + CHECK_RET(handle_call(ctx, insn, &len)); + break; + + case BPF_JMP | BPF_EXIT: + /* If this is the last instruction, epilogue will follow. */ + if (is_last_insn(ctx->prog, idx)) + break; + CHECK_RET(handle_jmp_epilogue(ctx, insn, &len)); + break; + default: + pr_err("bpf-jit: can't handle instruction code 0x%02X\n", code); + return -EOPNOTSUPP; + } + + if (BPF_CLASS(code) == BPF_ALU) { + /* + * Skip the "swap" instructions. Even 64-bit swaps are of type + * BPF_ALU (and not BPF_ALU64). Therefore, for the swaps, one + * has to look at the "size" of the operations rather than the + * ALU type. "gen_swap()" specifically takes care of that. + */ + if (BPF_OP(code) != BPF_END && ctx->do_zext) + len += zext(BUF(buf, len), dst); + } + + jit_buffer_update(ctx, len); + + return ret; +} + +static int handle_body(struct jit_context *ctx) +{ + int ret; + bool populate_bpf2insn = false; + const struct bpf_prog *prog = ctx->prog; + + CHECK_RET(jit_buffer_check(ctx)); + + /* + * Record the mapping for the instructions during the dry-run. + * Doing it this way allows us to have the mapping ready for + * the jump instructions during the real compilation phase. + */ + if (!ctx->emit) + populate_bpf2insn = true; + + for (u32 i = 0; i < prog->len; i++) { + /* During the dry-run, jit.len grows gradually per BPF insn. */ + if (populate_bpf2insn) + ctx->bpf2insn[i] = ctx->jit.len; + + CHECK_RET(handle_insn(ctx, i)); + if (ret > 0) { + /* "ret" is 1 if two (64-bit) chunks were consumed. */ + ctx->bpf2insn[i + 1] = ctx->bpf2insn[i]; + i++; + } + } + + /* If bpf2insn had to be populated, then it is done at this point. */ + if (populate_bpf2insn) + ctx->bpf2insn_valid = true; + + return 0; +} + +/* + * Initialize the memory with "unimp_s" which is the mnemonic for + * "unimplemented" instruction and always raises an exception. + * + * The instruction is 2 bytes. If "size" is odd, there is not much + * that can be done about the last byte in "area". Because, the + * CPU always fetches instructions in two bytes. Therefore, the + * byte beyond the last one is going to accompany it during a + * possible fetch. In the most likely case of a little endian + * system, that beyond-byte will become the major opcode and + * we have no control over its initialisation. + */ +static void fill_ill_insn(void *area, unsigned int size) +{ + const u16 unimp_s = 0x79e0; + + if (size & 1) { + *((u8 *)area + (size - 1)) = 0xff; + size -= 1; + } + + memset16(area, unimp_s, size >> 1); +} + +/* Piece of memory that can be allocated at the beginning of jit_prepare(). */ +static int jit_prepare_early_mem_alloc(struct jit_context *ctx) +{ + ctx->bpf2insn = kcalloc(ctx->prog->len, sizeof(ctx->jit.len), + GFP_KERNEL); + + if (!ctx->bpf2insn) { + pr_err("bpf-jit: could not allocate memory for " + "mapping of the instructions.\n"); + return -ENOMEM; + } + + return 0; +} + +/* + * Memory allocations that rely on parameters known at the end of + * jit_prepare(). + */ +static int jit_prepare_final_mem_alloc(struct jit_context *ctx) +{ + const size_t alignment = sizeof(u32); + + ctx->bpf_header = bpf_jit_binary_alloc(ctx->jit.len, &ctx->jit.buf, + alignment, fill_ill_insn); + if (!ctx->bpf_header) { + pr_err("bpf-jit: could not allocate memory for translation.\n"); + return -ENOMEM; + } + + if (ctx->need_extra_pass) { + ctx->jit_data = kzalloc(sizeof(*ctx->jit_data), GFP_KERNEL); + if (!ctx->jit_data) + return -ENOMEM; + } + + return 0; +} + +/* + * The first phase of the translation without actually emitting any + * instruction. It helps in getting a forecast on some aspects, such + * as the length of the whole program or where the epilogue starts. + * + * Whenever the necessary parameters are known, memories are allocated. + */ +static int jit_prepare(struct jit_context *ctx) +{ + int ret; + + /* Dry run. */ + ctx->emit = false; + + CHECK_RET(jit_prepare_early_mem_alloc(ctx)); + + /* Get the length of prologue section after some register analysis. */ + analyze_reg_usage(ctx); + CHECK_RET(handle_prologue(ctx)); + + CHECK_RET(handle_body(ctx)); + + /* Record at which offset epilogue begins. */ + ctx->epilogue_offset = ctx->jit.len; + + /* Process the epilogue section now. */ + CHECK_RET(handle_epilogue(ctx)); + + CHECK_RET(jit_prepare_final_mem_alloc(ctx)); + + return 0; +} + +/* + * All the "handle_*()" functions have been called before by the + * "jit_prepare()". If there was an error, we would know by now. + * Therefore, no extra error checking at this point, other than + * a sanity check at the end that expects the calculated length + * (jit.len) to be equal to the length of generated instructions + * (jit.index). + */ +static int jit_compile(struct jit_context *ctx) +{ + int ret; + + /* Let there be code. */ + ctx->emit = true; + + CHECK_RET(handle_prologue(ctx)); + + CHECK_RET(handle_body(ctx)); + + CHECK_RET(handle_epilogue(ctx)); + + if (ctx->jit.index != ctx->jit.len) { + pr_err("bpf-jit: divergence between the phases; " + "%u vs. %u (bytes).\n", + ctx->jit.len, ctx->jit.index); + return -EFAULT; + } + + return 0; +} + +/* + * Calling this function implies a successful JIT. A successful + * translation is signaled by setting the right parameters: + * + * prog->jited=1, prog->jited_len=..., prog->bpf_func=... + */ +static int jit_finalize(struct jit_context *ctx) +{ + struct bpf_prog *prog = ctx->prog; + + /* We're going to need this information for the "do_extra_pass()". */ + if (ctx->need_extra_pass) { + ctx->jit_data->bpf_header = ctx->bpf_header; + ctx->jit_data->bpf2insn = ctx->bpf2insn; + prog->aux->jit_data = (void *)ctx->jit_data; + } else { + /* + * If things seem finalised, then mark the JITed memory + * as R-X and flush it. + */ + if (bpf_jit_binary_lock_ro(ctx->bpf_header)) { + pr_err("bpf-jit: Could not lock the JIT memory.\n"); + return -EFAULT; + } + flush_icache_range((unsigned long)ctx->bpf_header, + (unsigned long) + BUF(ctx->jit.buf, ctx->jit.len)); + prog->aux->jit_data = NULL; + bpf_prog_fill_jited_linfo(prog, ctx->bpf2insn); + } + + ctx->success = true; + prog->bpf_func = (void *)ctx->jit.buf; + prog->jited_len = ctx->jit.len; + prog->jited = 1; + + jit_ctx_cleanup(ctx); + jit_dump(ctx); + + return 0; +} + +/* + * A lenient verification for the existence of JIT context in "prog". + * Apparently the JIT internals, namely jit_subprogs() in bpf/verifier.c, + * may request for a second compilation although nothing needs to be done. + */ +static inline int check_jit_context(const struct bpf_prog *prog) +{ + if (!prog->aux->jit_data) { + pr_notice("bpf-jit: no jit data for the extra pass.\n"); + return 1; + } else { + return 0; + } +} + +/* Reuse the previous pass's data. */ +static int jit_resume_context(struct jit_context *ctx) +{ + struct arc_jit_data *jdata = + (struct arc_jit_data *)ctx->prog->aux->jit_data; + + if (!jdata) { + pr_err("bpf-jit: no jit data for the extra pass.\n"); + return -EINVAL; + } + + ctx->jit.buf = (u8 *)ctx->prog->bpf_func; + ctx->jit.len = ctx->prog->jited_len; + ctx->bpf_header = jdata->bpf_header; + ctx->bpf2insn = (u32 *)jdata->bpf2insn; + ctx->bpf2insn_valid = ctx->bpf2insn ? true : false; + ctx->jit_data = jdata; + + return 0; +} + +/* + * Patch in the new addresses. The instructions of interest are: + * + * - call + * - ld r64, imm64 + * + * For "call"s, it resolves the addresses one more time through the + * handle_call(). + * + * For 64-bit immediate loads, it just retranslates them, because the BPF + * core in kernel might have changed the value since the normal pass. + */ +static int jit_patch_relocations(struct jit_context *ctx) +{ + const u8 bpf_opc_call = BPF_JMP | BPF_CALL; + const u8 bpf_opc_ldi64 = BPF_LD | BPF_DW | BPF_IMM; + const struct bpf_prog *prog = ctx->prog; + int ret; + + ctx->emit = true; + for (u32 i = 0; i < prog->len; i++) { + const struct bpf_insn *insn = &prog->insnsi[i]; + u8 dummy; + /* + * Adjust "ctx.jit.index", so "gen_*()" functions below + * can use it for their output addresses. + */ + ctx->jit.index = ctx->bpf2insn[i]; + + if (insn->code == bpf_opc_call) { + CHECK_RET(handle_call(ctx, insn, &dummy)); + } else if (insn->code == bpf_opc_ldi64) { + CHECK_RET(handle_ld_imm64(ctx, insn, &dummy)); + /* Skip the next instruction. */ + ++i; + } + } + return 0; +} + +/* + * A normal pass that involves a "dry-run" phase, jit_prepare(), + * to get the necessary data for the real compilation phase, + * jit_compile(). + */ +static struct bpf_prog *do_normal_pass(struct bpf_prog *prog) +{ + struct jit_context ctx; + + /* Bail out if JIT is disabled. */ + if (!prog->jit_requested) + return prog; + + if (jit_ctx_init(&ctx, prog)) { + jit_ctx_cleanup(&ctx); + return prog; + } + + /* Get the lengths and allocate buffer. */ + if (jit_prepare(&ctx)) { + jit_ctx_cleanup(&ctx); + return prog; + } + + if (jit_compile(&ctx)) { + jit_ctx_cleanup(&ctx); + return prog; + } + + if (jit_finalize(&ctx)) { + jit_ctx_cleanup(&ctx); + return prog; + } + + return ctx.prog; +} + +/* + * If there are multi-function BPF programs that call each other, + * their translated addresses are not known all at once. Therefore, + * an extra pass is needed to consult the bpf_jit_get_func_addr() + * again to get the newly translated addresses in order to resolve + * the "call"s. + */ +static struct bpf_prog *do_extra_pass(struct bpf_prog *prog) +{ + struct jit_context ctx; + + /* Skip if there's no context to resume from. */ + if (check_jit_context(prog)) + return prog; + + if (jit_ctx_init(&ctx, prog)) { + jit_ctx_cleanup(&ctx); + return prog; + } + + if (jit_resume_context(&ctx)) { + jit_ctx_cleanup(&ctx); + return prog; + } + + if (jit_patch_relocations(&ctx)) { + jit_ctx_cleanup(&ctx); + return prog; + } + + if (jit_finalize(&ctx)) { + jit_ctx_cleanup(&ctx); + return prog; + } + + return ctx.prog; +} + +/* + * This function may be invoked twice for the same stream of BPF + * instructions. The "extra pass" happens, when there are "call"s + * involved that their addresses are not known during the first + * invocation. + */ +struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) +{ + vm_dump(prog); + + /* Was this program already translated? */ + if (!prog->jited) + return do_normal_pass(prog); + else + return do_extra_pass(prog); + + return prog; +} From 19c56d4e5be102cd118162b9f72d9c6d353e76fc Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Thu, 2 May 2024 15:18:51 +0000 Subject: [PATCH 089/119] riscv, bpf: add internal-only MOV instruction to resolve per-CPU addrs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Support an instruction for resolving absolute addresses of per-CPU data from their per-CPU offsets. This instruction is internal-only and users are not allowed to use them directly. They will only be used for internal inlining optimizations for now between BPF verifier and BPF JITs. RISC-V uses generic per-cpu implementation where the offsets for CPUs are kept in an array called __per_cpu_offset[cpu_number]. RISCV stores the address of the task_struct in TP register. The first element in task_struct is struct thread_info, and we can get the cpu number by reading from the TP register + offsetof(struct thread_info, cpu). Once we have the cpu number in a register we read the offset for that cpu from address: &__per_cpu_offset + cpu_number << 3. Then we add this offset to the destination register. To measure the improvement from this change, the benchmark in [1] was used on Qemu: Before: glob-arr-inc : 1.127 ± 0.013M/s arr-inc : 1.121 ± 0.004M/s hash-inc : 0.681 ± 0.052M/s After: glob-arr-inc : 1.138 ± 0.011M/s arr-inc : 1.366 ± 0.006M/s hash-inc : 0.676 ± 0.001M/s [1] https://github.com/anakryiko/linux/commit/8dec900975ef Signed-off-by: Puranjay Mohan Acked-by: Björn Töpel Link: https://lore.kernel.org/r/20240502151854.9810-2-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- arch/riscv/net/bpf_jit_comp64.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c index 15e482f2c657..1f0159963b3e 100644 --- a/arch/riscv/net/bpf_jit_comp64.c +++ b/arch/riscv/net/bpf_jit_comp64.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "bpf_jit.h" #define RV_FENTRY_NINSNS 2 @@ -1089,6 +1090,24 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, emit_or(RV_REG_T1, rd, RV_REG_T1, ctx); emit_mv(rd, RV_REG_T1, ctx); break; + } else if (insn_is_mov_percpu_addr(insn)) { + if (rd != rs) + emit_mv(rd, rs, ctx); +#ifdef CONFIG_SMP + /* Load current CPU number in T1 */ + emit_ld(RV_REG_T1, offsetof(struct thread_info, cpu), + RV_REG_TP, ctx); + /* << 3 because offsets are 8 bytes */ + emit_slli(RV_REG_T1, RV_REG_T1, 3, ctx); + /* Load address of __per_cpu_offset array in T2 */ + emit_addr(RV_REG_T2, (u64)&__per_cpu_offset, extra_pass, ctx); + /* Add offset of current CPU to __per_cpu_offset */ + emit_add(RV_REG_T1, RV_REG_T2, RV_REG_T1, ctx); + /* Load __per_cpu_offset[cpu] in T1 */ + emit_ld(RV_REG_T1, 0, RV_REG_T1, ctx); + /* Add the offset to Rd */ + emit_add(rd, rd, RV_REG_T1, ctx); +#endif } if (imm == 1) { /* Special mov32 for zext */ @@ -2038,3 +2057,8 @@ bool bpf_jit_supports_arena(void) { return true; } + +bool bpf_jit_supports_percpu_insn(void) +{ + return true; +} From 2ddec2c80b4402c293c7e6e0881cecaaf77e8cec Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Thu, 2 May 2024 15:18:52 +0000 Subject: [PATCH 090/119] riscv, bpf: inline bpf_get_smp_processor_id() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Inline the calls to bpf_get_smp_processor_id() in the riscv bpf jit. RISCV saves the pointer to the CPU's task_struct in the TP (thread pointer) register. This makes it trivial to get the CPU's processor id. As thread_info is the first member of task_struct, we can read the processor id from TP + offsetof(struct thread_info, cpu). RISCV64 JIT output for `call bpf_get_smp_processor_id` ====================================================== Before After -------- ------- auipc t1,0x848c ld a5,32(tp) jalr 604(t1) mv a5,a0 Benchmark using [1] on Qemu. ./benchs/run_bench_trigger.sh glob-arr-inc arr-inc hash-inc +---------------+------------------+------------------+--------------+ | Name | Before | After | % change | |---------------+------------------+------------------+--------------| | glob-arr-inc | 1.077 ± 0.006M/s | 1.336 ± 0.010M/s | + 24.04% | | arr-inc | 1.078 ± 0.002M/s | 1.332 ± 0.015M/s | + 23.56% | | hash-inc | 0.494 ± 0.004M/s | 0.653 ± 0.001M/s | + 32.18% | +---------------+------------------+------------------+--------------+ NOTE: This benchmark includes changes from this patch and the previous patch that implemented the per-cpu insn. [1] https://github.com/anakryiko/linux/commit/8dec900975ef Signed-off-by: Puranjay Mohan Acked-by: Kumar Kartikeya Dwivedi Acked-by: Andrii Nakryiko Acked-by: Björn Töpel Link: https://lore.kernel.org/r/20240502151854.9810-3-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- arch/riscv/net/bpf_jit_comp64.c | 26 ++++++++++++++++++++++++++ include/linux/filter.h | 1 + kernel/bpf/core.c | 11 +++++++++++ kernel/bpf/verifier.c | 4 ++++ 4 files changed, 42 insertions(+) diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c index 1f0159963b3e..a46ec7fb4489 100644 --- a/arch/riscv/net/bpf_jit_comp64.c +++ b/arch/riscv/net/bpf_jit_comp64.c @@ -1493,6 +1493,22 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, bool fixed_addr; u64 addr; + /* Inline calls to bpf_get_smp_processor_id() + * + * RV_REG_TP holds the address of the current CPU's task_struct and thread_info is + * at offset 0 in task_struct. + * Load cpu from thread_info: + * Set R0 to ((struct thread_info *)(RV_REG_TP))->cpu + * + * This replicates the implementation of raw_smp_processor_id() on RISCV + */ + if (insn->src_reg == 0 && insn->imm == BPF_FUNC_get_smp_processor_id) { + /* Load current CPU number in R0 */ + emit_ld(bpf_to_rv_reg(BPF_REG_0, ctx), offsetof(struct thread_info, cpu), + RV_REG_TP, ctx); + break; + } + mark_call(ctx); ret = bpf_jit_get_func_addr(ctx->prog, insn, extra_pass, &addr, &fixed_addr); @@ -2062,3 +2078,13 @@ bool bpf_jit_supports_percpu_insn(void) { return true; } + +bool bpf_jit_inlines_helper_call(s32 imm) +{ + switch (imm) { + case BPF_FUNC_get_smp_processor_id: + return true; + default: + return false; + } +} diff --git a/include/linux/filter.h b/include/linux/filter.h index 7a27f19bf44d..3e19bb62ed1a 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -993,6 +993,7 @@ u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog); void bpf_jit_compile(struct bpf_prog *prog); bool bpf_jit_needs_zext(void); +bool bpf_jit_inlines_helper_call(s32 imm); bool bpf_jit_supports_subprog_tailcalls(void); bool bpf_jit_supports_percpu_insn(void); bool bpf_jit_supports_kfunc_call(void); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 99b8b1c9a248..aa59af9f9bd9 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2941,6 +2941,17 @@ bool __weak bpf_jit_needs_zext(void) return false; } +/* Return true if the JIT inlines the call to the helper corresponding to + * the imm. + * + * The verifier will not patch the insn->imm for the call to the helper if + * this returns true. + */ +bool __weak bpf_jit_inlines_helper_call(s32 imm) +{ + return false; +} + /* Return TRUE if the JIT backend supports mixing bpf2bpf and tailcalls. */ bool __weak bpf_jit_supports_subprog_tailcalls(void) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9e3aba08984e..1658ca4136a3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -19996,6 +19996,10 @@ static int do_misc_fixups(struct bpf_verifier_env *env) goto next_insn; } + /* Skip inlining the helper call if the JIT does it. */ + if (bpf_jit_inlines_helper_call(insn->imm)) + goto next_insn; + if (insn->imm == BPF_FUNC_get_route_realm) prog->dst_needed = 1; if (insn->imm == BPF_FUNC_get_prandom_u32) From 7a4c32222b0e14349a6311e72bf6ebd3e1d1064b Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Thu, 2 May 2024 15:18:53 +0000 Subject: [PATCH 091/119] arm64, bpf: add internal-only MOV instruction to resolve per-CPU addrs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Support an instruction for resolving absolute addresses of per-CPU data from their per-CPU offsets. This instruction is internal-only and users are not allowed to use them directly. They will only be used for internal inlining optimizations for now between BPF verifier and BPF JITs. Since commit 7158627686f0 ("arm64: percpu: implement optimised pcpu access using tpidr_el1"), the per-cpu offset for the CPU is stored in the tpidr_el1/2 register of that CPU. To support this BPF instruction in the ARM64 JIT, the following ARM64 instructions are emitted: mov dst, src // Move src to dst, if src != dst mrs tmp, tpidr_el1/2 // Move per-cpu offset of the current cpu in tmp. add dst, dst, tmp // Add the per cpu offset to the dst. To measure the performance improvement provided by this change, the benchmark in [1] was used: Before: glob-arr-inc : 23.597 ± 0.012M/s arr-inc : 23.173 ± 0.019M/s hash-inc : 12.186 ± 0.028M/s After: glob-arr-inc : 23.819 ± 0.034M/s arr-inc : 23.285 ± 0.017M/s hash-inc : 12.419 ± 0.011M/s [1] https://github.com/anakryiko/linux/commit/8dec900975ef Signed-off-by: Puranjay Mohan Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240502151854.9810-4-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- arch/arm64/include/asm/insn.h | 7 +++++++ arch/arm64/lib/insn.c | 11 +++++++++++ arch/arm64/net/bpf_jit.h | 6 ++++++ arch/arm64/net/bpf_jit_comp.c | 14 ++++++++++++++ 4 files changed, 38 insertions(+) diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h index db1aeacd4cd9..8de0e39b29f3 100644 --- a/arch/arm64/include/asm/insn.h +++ b/arch/arm64/include/asm/insn.h @@ -135,6 +135,11 @@ enum aarch64_insn_special_register { AARCH64_INSN_SPCLREG_SP_EL2 = 0xF210 }; +enum aarch64_insn_system_register { + AARCH64_INSN_SYSREG_TPIDR_EL1 = 0x4684, + AARCH64_INSN_SYSREG_TPIDR_EL2 = 0x6682, +}; + enum aarch64_insn_variant { AARCH64_INSN_VARIANT_32BIT, AARCH64_INSN_VARIANT_64BIT @@ -686,6 +691,8 @@ u32 aarch64_insn_gen_cas(enum aarch64_insn_register result, } #endif u32 aarch64_insn_gen_dmb(enum aarch64_insn_mb_type type); +u32 aarch64_insn_gen_mrs(enum aarch64_insn_register result, + enum aarch64_insn_system_register sysreg); s32 aarch64_get_branch_offset(u32 insn); u32 aarch64_set_branch_offset(u32 insn, s32 offset); diff --git a/arch/arm64/lib/insn.c b/arch/arm64/lib/insn.c index a635ab83fee3..b008a9b46a7f 100644 --- a/arch/arm64/lib/insn.c +++ b/arch/arm64/lib/insn.c @@ -1515,3 +1515,14 @@ u32 aarch64_insn_gen_dmb(enum aarch64_insn_mb_type type) return insn; } + +u32 aarch64_insn_gen_mrs(enum aarch64_insn_register result, + enum aarch64_insn_system_register sysreg) +{ + u32 insn = aarch64_insn_get_mrs_value(); + + insn &= ~GENMASK(19, 0); + insn |= sysreg << 5; + return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, + insn, result); +} diff --git a/arch/arm64/net/bpf_jit.h b/arch/arm64/net/bpf_jit.h index 23b1b34db088..b627ef7188c7 100644 --- a/arch/arm64/net/bpf_jit.h +++ b/arch/arm64/net/bpf_jit.h @@ -297,4 +297,10 @@ #define A64_ADR(Rd, offset) \ aarch64_insn_gen_adr(0, offset, Rd, AARCH64_INSN_ADR_TYPE_ADR) +/* MRS */ +#define A64_MRS_TPIDR_EL1(Rt) \ + aarch64_insn_gen_mrs(Rt, AARCH64_INSN_SYSREG_TPIDR_EL1) +#define A64_MRS_TPIDR_EL2(Rt) \ + aarch64_insn_gen_mrs(Rt, AARCH64_INSN_SYSREG_TPIDR_EL2) + #endif /* _BPF_JIT_H */ diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 53347d4217f4..4e7954e9829d 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -890,6 +890,15 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, emit(A64_ORR(1, tmp, dst, tmp), ctx); emit(A64_MOV(1, dst, tmp), ctx); break; + } else if (insn_is_mov_percpu_addr(insn)) { + if (dst != src) + emit(A64_MOV(1, dst, src), ctx); + if (cpus_have_cap(ARM64_HAS_VIRT_HOST_EXTN)) + emit(A64_MRS_TPIDR_EL2(tmp), ctx); + else + emit(A64_MRS_TPIDR_EL1(tmp), ctx); + emit(A64_ADD(1, dst, dst, tmp), ctx); + break; } switch (insn->off) { case 0: @@ -2559,6 +2568,11 @@ bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena) return true; } +bool bpf_jit_supports_percpu_insn(void) +{ + return true; +} + void bpf_jit_free(struct bpf_prog *prog) { if (prog->jited) { From 75fe4c0b3e181f5e3b990128013ac192fdfd4012 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Thu, 2 May 2024 15:18:54 +0000 Subject: [PATCH 092/119] bpf, arm64: inline bpf_get_smp_processor_id() helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Inline calls to bpf_get_smp_processor_id() helper in the JIT by emitting a read from struct thread_info. The SP_EL0 system register holds the pointer to the task_struct and thread_info is the first member of this struct. We can read the cpu number from the thread_info. Here is how the ARM64 JITed assembly changes after this commit: ARM64 JIT =========== BEFORE AFTER -------- ------- int cpu = bpf_get_smp_processor_id(); int cpu = bpf_get_smp_processor_id(); mov x10, #0xfffffffffffff4d0 mrs x10, sp_el0 movk x10, #0x802b, lsl #16 ldr w7, [x10, #24] movk x10, #0x8000, lsl #32 blr x10 add x7, x0, #0x0 Performance improvement using benchmark[1] ./benchs/run_bench_trigger.sh glob-arr-inc arr-inc hash-inc +---------------+-------------------+-------------------+--------------+ | Name | Before | After | % change | |---------------+-------------------+-------------------+--------------| | glob-arr-inc | 23.380 ± 1.675M/s | 25.893 ± 0.026M/s | + 10.74% | | arr-inc | 23.928 ± 0.034M/s | 25.213 ± 0.063M/s | + 5.37% | | hash-inc | 12.352 ± 0.005M/s | 12.609 ± 0.013M/s | + 2.08% | +---------------+-------------------+-------------------+--------------+ [1] https://github.com/anakryiko/linux/commit/8dec900975ef Signed-off-by: Puranjay Mohan Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240502151854.9810-5-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- arch/arm64/include/asm/insn.h | 1 + arch/arm64/net/bpf_jit.h | 2 ++ arch/arm64/net/bpf_jit_comp.c | 25 +++++++++++++++++++++++++ 3 files changed, 28 insertions(+) diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h index 8de0e39b29f3..8c0a36f72d6f 100644 --- a/arch/arm64/include/asm/insn.h +++ b/arch/arm64/include/asm/insn.h @@ -138,6 +138,7 @@ enum aarch64_insn_special_register { enum aarch64_insn_system_register { AARCH64_INSN_SYSREG_TPIDR_EL1 = 0x4684, AARCH64_INSN_SYSREG_TPIDR_EL2 = 0x6682, + AARCH64_INSN_SYSREG_SP_EL0 = 0x4208, }; enum aarch64_insn_variant { diff --git a/arch/arm64/net/bpf_jit.h b/arch/arm64/net/bpf_jit.h index b627ef7188c7..b22ab2f97a30 100644 --- a/arch/arm64/net/bpf_jit.h +++ b/arch/arm64/net/bpf_jit.h @@ -302,5 +302,7 @@ aarch64_insn_gen_mrs(Rt, AARCH64_INSN_SYSREG_TPIDR_EL1) #define A64_MRS_TPIDR_EL2(Rt) \ aarch64_insn_gen_mrs(Rt, AARCH64_INSN_SYSREG_TPIDR_EL2) +#define A64_MRS_SP_EL0(Rt) \ + aarch64_insn_gen_mrs(Rt, AARCH64_INSN_SYSREG_SP_EL0) #endif /* _BPF_JIT_H */ diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 4e7954e9829d..47151414a450 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -1228,6 +1228,21 @@ emit_cond_jmp: const u8 r0 = bpf2a64[BPF_REG_0]; bool func_addr_fixed; u64 func_addr; + u32 cpu_offset; + + /* Implement helper call to bpf_get_smp_processor_id() inline */ + if (insn->src_reg == 0 && insn->imm == BPF_FUNC_get_smp_processor_id) { + cpu_offset = offsetof(struct thread_info, cpu); + + emit(A64_MRS_SP_EL0(tmp), ctx); + if (is_lsi_offset(cpu_offset, 2)) { + emit(A64_LDR32I(r0, tmp, cpu_offset), ctx); + } else { + emit_a64_mov_i(1, tmp2, cpu_offset, ctx); + emit(A64_LDR32(r0, tmp, tmp2), ctx); + } + break; + } ret = bpf_jit_get_func_addr(ctx->prog, insn, extra_pass, &func_addr, &func_addr_fixed); @@ -2573,6 +2588,16 @@ bool bpf_jit_supports_percpu_insn(void) return true; } +bool bpf_jit_inlines_helper_call(s32 imm) +{ + switch (imm) { + case BPF_FUNC_get_smp_processor_id: + return true; + default: + return false; + } +} + void bpf_jit_free(struct bpf_prog *prog) { if (prog->jited) { From 68378982f0b21de02ac3c6a11e2420badefcb4bc Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Tue, 7 May 2024 02:02:49 +0200 Subject: [PATCH 093/119] s390/bpf: Emit a barrier for BPF_FETCH instructions BPF_ATOMIC_OP() macro documentation states that "BPF_ADD | BPF_FETCH" should be the same as atomic_fetch_add(), which is currently not the case on s390x: the serialization instruction "bcr 14,0" is missing. This applies to "and", "or" and "xor" variants too. s390x is allowed to reorder stores with subsequent fetches from different addresses, so code relying on BPF_FETCH acting as a barrier, for example: stw [%r0], 1 afadd [%r1], %r2 ldxw %r3, [%r4] may be broken. Fix it by emitting "bcr 14,0". Note that a separate serialization instruction is not needed for BPF_XCHG and BPF_CMPXCHG, because COMPARE AND SWAP performs serialization itself. Fixes: ba3b86b9cef0 ("s390/bpf: Implement new atomic ops") Reported-by: Puranjay Mohan Closes: https://lore.kernel.org/bpf/mb61p34qvq3wf.fsf@kernel.org/ Signed-off-by: Ilya Leoshkevich Reviewed-by: Puranjay Mohan Link: https://lore.kernel.org/r/20240507000557.12048-1-iii@linux.ibm.com Signed-off-by: Alexei Starovoitov --- arch/s390/net/bpf_jit_comp.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index fa2f824e3b06..4be8f5cadd02 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -1427,8 +1427,12 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, EMIT6_DISP_LH(0xeb000000, is32 ? (op32) : (op64), \ (insn->imm & BPF_FETCH) ? src_reg : REG_W0, \ src_reg, dst_reg, off); \ - if (is32 && (insn->imm & BPF_FETCH)) \ - EMIT_ZERO(src_reg); \ + if (insn->imm & BPF_FETCH) { \ + /* bcr 14,0 - see atomic_fetch_{add,and,or,xor}() */ \ + _EMIT2(0x07e0); \ + if (is32) \ + EMIT_ZERO(src_reg); \ + } \ } while (0) case BPF_ADD: case BPF_ADD | BPF_FETCH: From 80c5a07ae673a740ef7ef0fe1ab588075a25ce8d Mon Sep 17 00:00:00 2001 From: Xiao Wang Date: Tue, 7 May 2024 19:16:18 +0800 Subject: [PATCH 094/119] riscv, bpf: Fix typo in comment We can use either "instruction" or "insn" in the comment. Signed-off-by: Xiao Wang Reviewed-by: Pu Lehui Link: https://lore.kernel.org/r/20240507111618.437121-1-xiao.w.wang@intel.com Signed-off-by: Alexei Starovoitov --- arch/riscv/net/bpf_jit.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/riscv/net/bpf_jit.h b/arch/riscv/net/bpf_jit.h index 5fc374ed98ea..fdbf88ca8b70 100644 --- a/arch/riscv/net/bpf_jit.h +++ b/arch/riscv/net/bpf_jit.h @@ -608,7 +608,7 @@ static inline u32 rv_nop(void) return rv_i_insn(0, 0, 0, 0, 0x13); } -/* RVC instrutions. */ +/* RVC instructions. */ static inline u16 rvc_addi4spn(u8 rd, u32 imm10) { @@ -737,7 +737,7 @@ static inline u16 rvc_swsp(u32 imm8, u8 rs2) return rv_css_insn(0x6, imm, rs2, 0x2); } -/* RVZBB instrutions. */ +/* RVZBB instructions. */ static inline u32 rvzbb_sextb(u8 rd, u8 rs1) { return rv_i_insn(0x604, rs1, 1, rd, 0x13); From 20a759df3bba35bf5c3ddec0c02ad69b603b584c Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Sun, 5 May 2024 20:16:33 +0000 Subject: [PATCH 095/119] riscv, bpf: make some atomic operations fully ordered The BPF atomic operations with the BPF_FETCH modifier along with BPF_XCHG and BPF_CMPXCHG are fully ordered but the RISC-V JIT implements all atomic operations except BPF_CMPXCHG with relaxed ordering. Section 8.1 of the "The RISC-V Instruction Set Manual Volume I: Unprivileged ISA" [1], titled, "Specifying Ordering of Atomic Instructions" says: | To provide more efficient support for release consistency [5], each | atomic instruction has two bits, aq and rl, used to specify additional | memory ordering constraints as viewed by other RISC-V harts. and | If only the aq bit is set, the atomic memory operation is treated as | an acquire access. | If only the rl bit is set, the atomic memory operation is treated as a | release access. | | If both the aq and rl bits are set, the atomic memory operation is | sequentially consistent. Fix this by setting both aq and rl bits as 1 for operations with BPF_FETCH and BPF_XCHG. [1] https://riscv.org/wp-content/uploads/2017/05/riscv-spec-v2.2.pdf Fixes: dd642ccb45ec ("riscv, bpf: Implement more atomic operations for RV64") Signed-off-by: Puranjay Mohan Reviewed-by: Pu Lehui Link: https://lore.kernel.org/r/20240505201633.123115-1-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- arch/riscv/net/bpf_jit_comp64.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c index a46ec7fb4489..e51b291b5baa 100644 --- a/arch/riscv/net/bpf_jit_comp64.c +++ b/arch/riscv/net/bpf_jit_comp64.c @@ -504,33 +504,33 @@ static void emit_atomic(u8 rd, u8 rs, s16 off, s32 imm, bool is64, break; /* src_reg = atomic_fetch_(dst_reg + off16, src_reg) */ case BPF_ADD | BPF_FETCH: - emit(is64 ? rv_amoadd_d(rs, rs, rd, 0, 0) : - rv_amoadd_w(rs, rs, rd, 0, 0), ctx); + emit(is64 ? rv_amoadd_d(rs, rs, rd, 1, 1) : + rv_amoadd_w(rs, rs, rd, 1, 1), ctx); if (!is64) emit_zextw(rs, rs, ctx); break; case BPF_AND | BPF_FETCH: - emit(is64 ? rv_amoand_d(rs, rs, rd, 0, 0) : - rv_amoand_w(rs, rs, rd, 0, 0), ctx); + emit(is64 ? rv_amoand_d(rs, rs, rd, 1, 1) : + rv_amoand_w(rs, rs, rd, 1, 1), ctx); if (!is64) emit_zextw(rs, rs, ctx); break; case BPF_OR | BPF_FETCH: - emit(is64 ? rv_amoor_d(rs, rs, rd, 0, 0) : - rv_amoor_w(rs, rs, rd, 0, 0), ctx); + emit(is64 ? rv_amoor_d(rs, rs, rd, 1, 1) : + rv_amoor_w(rs, rs, rd, 1, 1), ctx); if (!is64) emit_zextw(rs, rs, ctx); break; case BPF_XOR | BPF_FETCH: - emit(is64 ? rv_amoxor_d(rs, rs, rd, 0, 0) : - rv_amoxor_w(rs, rs, rd, 0, 0), ctx); + emit(is64 ? rv_amoxor_d(rs, rs, rd, 1, 1) : + rv_amoxor_w(rs, rs, rd, 1, 1), ctx); if (!is64) emit_zextw(rs, rs, ctx); break; /* src_reg = atomic_xchg(dst_reg + off16, src_reg); */ case BPF_XCHG: - emit(is64 ? rv_amoswap_d(rs, rs, rd, 0, 0) : - rv_amoswap_w(rs, rs, rd, 0, 0), ctx); + emit(is64 ? rv_amoswap_d(rs, rs, rd, 1, 1) : + rv_amoswap_w(rs, rs, rd, 1, 1), ctx); if (!is64) emit_zextw(rs, rs, ctx); break; From 73964e9085bbea517a675d5d8ceeb1e609a34748 Mon Sep 17 00:00:00 2001 From: Jordan Rife Date: Fri, 10 May 2024 14:02:18 -0500 Subject: [PATCH 096/119] selftests/bpf: Migrate recvmsg* return code tests to verifier_sock_addr.c This set of tests check that the BPF verifier rejects programs with invalid return codes (recvmsg4 and recvmsg6 hooks can only return 1). This patch replaces the tests in test_sock_addr.c with verifier_sock_addr.c, a new verifier prog_tests for sockaddr hooks, in a step towards fully retiring test_sock_addr.c. Signed-off-by: Jordan Rife Link: https://lore.kernel.org/r/20240510190246.3247730-2-jrife@google.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/verifier.c | 2 + .../selftests/bpf/progs/verifier_sock_addr.c | 37 ++++++++++ tools/testing/selftests/bpf/test_sock_addr.c | 70 ------------------- 3 files changed, 39 insertions(+), 70 deletions(-) create mode 100644 tools/testing/selftests/bpf/progs/verifier_sock_addr.c diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index c4f9f306646e..c60db8beeb73 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -66,6 +66,7 @@ #include "verifier_sdiv.skel.h" #include "verifier_search_pruning.skel.h" #include "verifier_sock.skel.h" +#include "verifier_sock_addr.skel.h" #include "verifier_spill_fill.skel.h" #include "verifier_spin_lock.skel.h" #include "verifier_stack_ptr.skel.h" @@ -181,6 +182,7 @@ void test_verifier_scalar_ids(void) { RUN(verifier_scalar_ids); } void test_verifier_sdiv(void) { RUN(verifier_sdiv); } void test_verifier_search_pruning(void) { RUN(verifier_search_pruning); } void test_verifier_sock(void) { RUN(verifier_sock); } +void test_verifier_sock_addr(void) { RUN(verifier_sock_addr); } void test_verifier_spill_fill(void) { RUN(verifier_spill_fill); } void test_verifier_spin_lock(void) { RUN(verifier_spin_lock); } void test_verifier_stack_ptr(void) { RUN(verifier_stack_ptr); } diff --git a/tools/testing/selftests/bpf/progs/verifier_sock_addr.c b/tools/testing/selftests/bpf/progs/verifier_sock_addr.c new file mode 100644 index 000000000000..5081fa723d3a --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_sock_addr.c @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Google LLC */ + +#include +#include +#include +#include "bpf_misc.h" + +SEC("cgroup/recvmsg4") +__success +int recvmsg4_good_return_code(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/recvmsg4") +__failure __msg("At program exit the register R0 has smin=0 smax=0 should have been in [1, 1]") +int recvmsg4_bad_return_code(struct bpf_sock_addr *ctx) +{ + return 0; +} + +SEC("cgroup/recvmsg6") +__success +int recvmsg6_good_return_code(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/recvmsg6") +__failure __msg("At program exit the register R0 has smin=0 smax=0 should have been in [1, 1]") +int recvmsg6_bad_return_code(struct bpf_sock_addr *ctx) +{ + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_sock_addr.c b/tools/testing/selftests/bpf/test_sock_addr.c index aa2198a0f24d..40e33167bec2 100644 --- a/tools/testing/selftests/bpf/test_sock_addr.c +++ b/tools/testing/selftests/bpf/test_sock_addr.c @@ -94,8 +94,6 @@ static int connect4_prog_load(const struct sock_addr_test *test); static int connect6_prog_load(const struct sock_addr_test *test); static int sendmsg_allow_prog_load(const struct sock_addr_test *test); static int sendmsg_deny_prog_load(const struct sock_addr_test *test); -static int recvmsg_allow_prog_load(const struct sock_addr_test *test); -static int recvmsg_deny_prog_load(const struct sock_addr_test *test); static int sendmsg4_rw_asm_prog_load(const struct sock_addr_test *test); static int sendmsg6_rw_asm_prog_load(const struct sock_addr_test *test); static int sendmsg6_rw_v4mapped_prog_load(const struct sock_addr_test *test); @@ -373,64 +371,6 @@ static struct sock_addr_test tests[] = { SRC6_REWRITE_IP, SYSCALL_EPERM, }, - - /* recvmsg */ - { - "recvmsg4: return code ok", - recvmsg_allow_prog_load, - BPF_CGROUP_UDP4_RECVMSG, - BPF_CGROUP_UDP4_RECVMSG, - AF_INET, - SOCK_DGRAM, - NULL, - 0, - NULL, - 0, - NULL, - ATTACH_OKAY, - }, - { - "recvmsg4: return code !ok", - recvmsg_deny_prog_load, - BPF_CGROUP_UDP4_RECVMSG, - BPF_CGROUP_UDP4_RECVMSG, - AF_INET, - SOCK_DGRAM, - NULL, - 0, - NULL, - 0, - NULL, - LOAD_REJECT, - }, - { - "recvmsg6: return code ok", - recvmsg_allow_prog_load, - BPF_CGROUP_UDP6_RECVMSG, - BPF_CGROUP_UDP6_RECVMSG, - AF_INET6, - SOCK_DGRAM, - NULL, - 0, - NULL, - 0, - NULL, - ATTACH_OKAY, - }, - { - "recvmsg6: return code !ok", - recvmsg_deny_prog_load, - BPF_CGROUP_UDP6_RECVMSG, - BPF_CGROUP_UDP6_RECVMSG, - AF_INET6, - SOCK_DGRAM, - NULL, - 0, - NULL, - 0, - NULL, - LOAD_REJECT, - }, }; static int load_insns(const struct sock_addr_test *test, @@ -527,16 +467,6 @@ static int sendmsg_deny_prog_load(const struct sock_addr_test *test) return xmsg_ret_only_prog_load(test, /*rc*/ 0); } -static int recvmsg_allow_prog_load(const struct sock_addr_test *test) -{ - return xmsg_ret_only_prog_load(test, /*rc*/ 1); -} - -static int recvmsg_deny_prog_load(const struct sock_addr_test *test) -{ - return xmsg_ret_only_prog_load(test, /*rc*/ 0); -} - static int sendmsg4_rw_asm_prog_load(const struct sock_addr_test *test) { struct sockaddr_in dst4_rw_addr; From 86b65c6db0190fb6c119e83da4de0eccf74fb1ff Mon Sep 17 00:00:00 2001 From: Jordan Rife Date: Fri, 10 May 2024 14:02:19 -0500 Subject: [PATCH 097/119] selftests/bpf: Use program name for skel load/destroy functions In preparation to migrate tests from bpf/test_sock_addr.c to sock_addr.c, update BPF_SKEL_FUNCS so that it generates functions based on prog_name instead of skel_name. This allows us to differentiate between programs in the same skeleton. Signed-off-by: Jordan Rife Link: https://lore.kernel.org/r/20240510190246.3247730-3-jrife@google.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/sock_addr.c | 96 ++++++++++--------- 1 file changed, 50 insertions(+), 46 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/sock_addr.c b/tools/testing/selftests/bpf/prog_tests/sock_addr.c index 9c709c33f889..039c3e38e1bc 100644 --- a/tools/testing/selftests/bpf/prog_tests/sock_addr.c +++ b/tools/testing/selftests/bpf/prog_tests/sock_addr.c @@ -357,7 +357,7 @@ struct sock_addr_test { }; #define BPF_SKEL_FUNCS(skel_name, prog_name) \ -static void *skel_name##_load(int cgroup_fd) \ +static void *prog_name##_load(int cgroup_fd) \ { \ struct skel_name *skel; \ skel = skel_name##__open_and_load(); \ @@ -372,7 +372,7 @@ cleanup: \ skel_name##__destroy(skel); \ return NULL; \ } \ -static void skel_name##_destroy(void *skel) \ +static void prog_name##_destroy(void *skel) \ { \ skel_name##__destroy(skel); \ } @@ -396,8 +396,8 @@ static struct sock_addr_test tests[] = { { SOCK_ADDR_TEST_BIND, "bind4: bind (stream)", - bind4_prog_load, - bind4_prog_destroy, + bind_v4_prog_load, + bind_v4_prog_destroy, &user_ops, AF_INET, SOCK_STREAM, @@ -405,12 +405,13 @@ static struct sock_addr_test tests[] = { SERV4_PORT, SERV4_REWRITE_IP, SERV4_REWRITE_PORT, + NULL, }, { SOCK_ADDR_TEST_BIND, "bind4: bind (dgram)", - bind4_prog_load, - bind4_prog_destroy, + bind_v4_prog_load, + bind_v4_prog_destroy, &user_ops, AF_INET, SOCK_DGRAM, @@ -418,12 +419,13 @@ static struct sock_addr_test tests[] = { SERV4_PORT, SERV4_REWRITE_IP, SERV4_REWRITE_PORT, + NULL, }, { SOCK_ADDR_TEST_BIND, "bind6: bind (stream)", - bind6_prog_load, - bind6_prog_destroy, + bind_v6_prog_load, + bind_v6_prog_destroy, &user_ops, AF_INET6, SOCK_STREAM, @@ -431,12 +433,13 @@ static struct sock_addr_test tests[] = { SERV6_PORT, SERV6_REWRITE_IP, SERV6_REWRITE_PORT, + NULL, }, { SOCK_ADDR_TEST_BIND, "bind6: bind (dgram)", - bind6_prog_load, - bind6_prog_destroy, + bind_v6_prog_load, + bind_v6_prog_destroy, &user_ops, AF_INET6, SOCK_DGRAM, @@ -444,14 +447,15 @@ static struct sock_addr_test tests[] = { SERV6_PORT, SERV6_REWRITE_IP, SERV6_REWRITE_PORT, + NULL, }, /* bind - kernel calls */ { SOCK_ADDR_TEST_BIND, "bind4: kernel_bind (stream)", - bind4_prog_load, - bind4_prog_destroy, + bind_v4_prog_load, + bind_v4_prog_destroy, &kern_ops_sock_sendmsg, AF_INET, SOCK_STREAM, @@ -463,8 +467,8 @@ static struct sock_addr_test tests[] = { { SOCK_ADDR_TEST_BIND, "bind4: kernel_bind (dgram)", - bind4_prog_load, - bind4_prog_destroy, + bind_v4_prog_load, + bind_v4_prog_destroy, &kern_ops_sock_sendmsg, AF_INET, SOCK_DGRAM, @@ -476,8 +480,8 @@ static struct sock_addr_test tests[] = { { SOCK_ADDR_TEST_BIND, "bind6: kernel_bind (stream)", - bind6_prog_load, - bind6_prog_destroy, + bind_v6_prog_load, + bind_v6_prog_destroy, &kern_ops_sock_sendmsg, AF_INET6, SOCK_STREAM, @@ -489,8 +493,8 @@ static struct sock_addr_test tests[] = { { SOCK_ADDR_TEST_BIND, "bind6: kernel_bind (dgram)", - bind6_prog_load, - bind6_prog_destroy, + bind_v6_prog_load, + bind_v6_prog_destroy, &kern_ops_sock_sendmsg, AF_INET6, SOCK_DGRAM, @@ -504,8 +508,8 @@ static struct sock_addr_test tests[] = { { SOCK_ADDR_TEST_CONNECT, "connect4: connect (stream)", - connect4_prog_load, - connect4_prog_destroy, + connect_v4_prog_load, + connect_v4_prog_destroy, &user_ops, AF_INET, SOCK_STREAM, @@ -518,8 +522,8 @@ static struct sock_addr_test tests[] = { { SOCK_ADDR_TEST_CONNECT, "connect4: connect (dgram)", - connect4_prog_load, - connect4_prog_destroy, + connect_v4_prog_load, + connect_v4_prog_destroy, &user_ops, AF_INET, SOCK_DGRAM, @@ -532,8 +536,8 @@ static struct sock_addr_test tests[] = { { SOCK_ADDR_TEST_CONNECT, "connect6: connect (stream)", - connect6_prog_load, - connect6_prog_destroy, + connect_v6_prog_load, + connect_v6_prog_destroy, &user_ops, AF_INET6, SOCK_STREAM, @@ -546,8 +550,8 @@ static struct sock_addr_test tests[] = { { SOCK_ADDR_TEST_CONNECT, "connect6: connect (dgram)", - connect6_prog_load, - connect6_prog_destroy, + connect_v6_prog_load, + connect_v6_prog_destroy, &user_ops, AF_INET6, SOCK_DGRAM, @@ -576,8 +580,8 @@ static struct sock_addr_test tests[] = { { SOCK_ADDR_TEST_CONNECT, "connect4: kernel_connect (stream)", - connect4_prog_load, - connect4_prog_destroy, + connect_v4_prog_load, + connect_v4_prog_destroy, &kern_ops_sock_sendmsg, AF_INET, SOCK_STREAM, @@ -590,8 +594,8 @@ static struct sock_addr_test tests[] = { { SOCK_ADDR_TEST_CONNECT, "connect4: kernel_connect (dgram)", - connect4_prog_load, - connect4_prog_destroy, + connect_v4_prog_load, + connect_v4_prog_destroy, &kern_ops_sock_sendmsg, AF_INET, SOCK_DGRAM, @@ -604,8 +608,8 @@ static struct sock_addr_test tests[] = { { SOCK_ADDR_TEST_CONNECT, "connect6: kernel_connect (stream)", - connect6_prog_load, - connect6_prog_destroy, + connect_v6_prog_load, + connect_v6_prog_destroy, &kern_ops_sock_sendmsg, AF_INET6, SOCK_STREAM, @@ -618,8 +622,8 @@ static struct sock_addr_test tests[] = { { SOCK_ADDR_TEST_CONNECT, "connect6: kernel_connect (dgram)", - connect6_prog_load, - connect6_prog_destroy, + connect_v6_prog_load, + connect_v6_prog_destroy, &kern_ops_sock_sendmsg, AF_INET6, SOCK_DGRAM, @@ -648,8 +652,8 @@ static struct sock_addr_test tests[] = { { SOCK_ADDR_TEST_SENDMSG, "sendmsg4: sendmsg (dgram)", - sendmsg4_prog_load, - sendmsg4_prog_destroy, + sendmsg_v4_prog_load, + sendmsg_v4_prog_destroy, &user_ops, AF_INET, SOCK_DGRAM, @@ -662,8 +666,8 @@ static struct sock_addr_test tests[] = { { SOCK_ADDR_TEST_SENDMSG, "sendmsg6: sendmsg (dgram)", - sendmsg6_prog_load, - sendmsg6_prog_destroy, + sendmsg_v6_prog_load, + sendmsg_v6_prog_destroy, &user_ops, AF_INET6, SOCK_DGRAM, @@ -692,8 +696,8 @@ static struct sock_addr_test tests[] = { { SOCK_ADDR_TEST_SENDMSG, "sendmsg4: sock_sendmsg (dgram)", - sendmsg4_prog_load, - sendmsg4_prog_destroy, + sendmsg_v4_prog_load, + sendmsg_v4_prog_destroy, &kern_ops_sock_sendmsg, AF_INET, SOCK_DGRAM, @@ -706,8 +710,8 @@ static struct sock_addr_test tests[] = { { SOCK_ADDR_TEST_SENDMSG, "sendmsg6: sock_sendmsg (dgram)", - sendmsg6_prog_load, - sendmsg6_prog_destroy, + sendmsg_v6_prog_load, + sendmsg_v6_prog_destroy, &kern_ops_sock_sendmsg, AF_INET6, SOCK_DGRAM, @@ -736,8 +740,8 @@ static struct sock_addr_test tests[] = { { SOCK_ADDR_TEST_SENDMSG, "sendmsg4: kernel_sendmsg (dgram)", - sendmsg4_prog_load, - sendmsg4_prog_destroy, + sendmsg_v4_prog_load, + sendmsg_v4_prog_destroy, &kern_ops_kernel_sendmsg, AF_INET, SOCK_DGRAM, @@ -750,8 +754,8 @@ static struct sock_addr_test tests[] = { { SOCK_ADDR_TEST_SENDMSG, "sendmsg6: kernel_sendmsg (dgram)", - sendmsg6_prog_load, - sendmsg6_prog_destroy, + sendmsg_v6_prog_load, + sendmsg_v6_prog_destroy, &kern_ops_kernel_sendmsg, AF_INET6, SOCK_DGRAM, From 5eff48f33fb733de9b88a5381e0428f3e873c670 Mon Sep 17 00:00:00 2001 From: Jordan Rife Date: Fri, 10 May 2024 14:02:20 -0500 Subject: [PATCH 098/119] selftests/bpf: Handle LOAD_REJECT test cases In preparation to move test cases from bpf/test_sock_addr.c that expect LOAD_REJECT, this patch adds expected_attach_type and extends load_fn to accept an expected attach type and a flag indicating whether or not rejection is expected. Signed-off-by: Jordan Rife Link: https://lore.kernel.org/r/20240510190246.3247730-4-jrife@google.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/sock_addr.c | 103 +++++++++++++++++- 1 file changed, 98 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/sock_addr.c b/tools/testing/selftests/bpf/prog_tests/sock_addr.c index 039c3e38e1bc..3033641fd756 100644 --- a/tools/testing/selftests/bpf/prog_tests/sock_addr.c +++ b/tools/testing/selftests/bpf/prog_tests/sock_addr.c @@ -52,7 +52,9 @@ enum sock_addr_test_type { SOCK_ADDR_TEST_GETPEERNAME, }; -typedef void *(*load_fn)(int cgroup_fd); +typedef void *(*load_fn)(int cgroup_fd, + enum bpf_attach_type attach_type, + bool expect_reject); typedef void (*destroy_fn)(void *skel); static int cmp_addr(const struct sockaddr_storage *addr1, socklen_t addr1_len, @@ -343,6 +345,7 @@ struct sock_addr_test { /* BPF prog properties */ load_fn loadfn; destroy_fn destroyfn; + enum bpf_attach_type attach_type; /* Socket operations */ struct sock_ops *ops; /* Socket properties */ @@ -354,15 +357,34 @@ struct sock_addr_test { const char *expected_addr; unsigned short expected_port; const char *expected_src_addr; + /* Expected test result */ + enum { + LOAD_REJECT, + ATTACH_REJECT, + SYSCALL_EPERM, + SYSCALL_ENOTSUPP, + SUCCESS, + } expected_result; }; #define BPF_SKEL_FUNCS(skel_name, prog_name) \ -static void *prog_name##_load(int cgroup_fd) \ +static void *prog_name##_load(int cgroup_fd, \ + enum bpf_attach_type attach_type, \ + bool expect_reject) \ { \ - struct skel_name *skel; \ - skel = skel_name##__open_and_load(); \ + struct skel_name *skel = skel_name##__open(); \ if (!ASSERT_OK_PTR(skel, "skel_open")) \ goto cleanup; \ + if (!ASSERT_OK(bpf_program__set_expected_attach_type(skel->progs.prog_name, \ + attach_type), \ + "set_expected_attach_type")) \ + goto cleanup; \ + if (skel_name##__load(skel)) { \ + ASSERT_TRUE(expect_reject, "unexpected rejection"); \ + goto cleanup; \ + } \ + if (!ASSERT_FALSE(expect_reject, "expected rejection")) \ + goto cleanup; \ skel->links.prog_name = bpf_program__attach_cgroup( \ skel->progs.prog_name, cgroup_fd); \ if (!ASSERT_OK_PTR(skel->links.prog_name, "prog_attach")) \ @@ -398,6 +420,7 @@ static struct sock_addr_test tests[] = { "bind4: bind (stream)", bind_v4_prog_load, bind_v4_prog_destroy, + BPF_CGROUP_INET4_BIND, &user_ops, AF_INET, SOCK_STREAM, @@ -406,12 +429,14 @@ static struct sock_addr_test tests[] = { SERV4_REWRITE_IP, SERV4_REWRITE_PORT, NULL, + SUCCESS, }, { SOCK_ADDR_TEST_BIND, "bind4: bind (dgram)", bind_v4_prog_load, bind_v4_prog_destroy, + BPF_CGROUP_INET4_BIND, &user_ops, AF_INET, SOCK_DGRAM, @@ -420,12 +445,14 @@ static struct sock_addr_test tests[] = { SERV4_REWRITE_IP, SERV4_REWRITE_PORT, NULL, + SUCCESS, }, { SOCK_ADDR_TEST_BIND, "bind6: bind (stream)", bind_v6_prog_load, bind_v6_prog_destroy, + BPF_CGROUP_INET6_BIND, &user_ops, AF_INET6, SOCK_STREAM, @@ -434,12 +461,14 @@ static struct sock_addr_test tests[] = { SERV6_REWRITE_IP, SERV6_REWRITE_PORT, NULL, + SUCCESS, }, { SOCK_ADDR_TEST_BIND, "bind6: bind (dgram)", bind_v6_prog_load, bind_v6_prog_destroy, + BPF_CGROUP_INET6_BIND, &user_ops, AF_INET6, SOCK_DGRAM, @@ -448,6 +477,7 @@ static struct sock_addr_test tests[] = { SERV6_REWRITE_IP, SERV6_REWRITE_PORT, NULL, + SUCCESS, }, /* bind - kernel calls */ @@ -456,6 +486,7 @@ static struct sock_addr_test tests[] = { "bind4: kernel_bind (stream)", bind_v4_prog_load, bind_v4_prog_destroy, + BPF_CGROUP_INET4_BIND, &kern_ops_sock_sendmsg, AF_INET, SOCK_STREAM, @@ -463,12 +494,15 @@ static struct sock_addr_test tests[] = { SERV4_PORT, SERV4_REWRITE_IP, SERV4_REWRITE_PORT, + NULL, + SUCCESS, }, { SOCK_ADDR_TEST_BIND, "bind4: kernel_bind (dgram)", bind_v4_prog_load, bind_v4_prog_destroy, + BPF_CGROUP_INET4_BIND, &kern_ops_sock_sendmsg, AF_INET, SOCK_DGRAM, @@ -476,12 +510,15 @@ static struct sock_addr_test tests[] = { SERV4_PORT, SERV4_REWRITE_IP, SERV4_REWRITE_PORT, + NULL, + SUCCESS, }, { SOCK_ADDR_TEST_BIND, "bind6: kernel_bind (stream)", bind_v6_prog_load, bind_v6_prog_destroy, + BPF_CGROUP_INET6_BIND, &kern_ops_sock_sendmsg, AF_INET6, SOCK_STREAM, @@ -489,12 +526,15 @@ static struct sock_addr_test tests[] = { SERV6_PORT, SERV6_REWRITE_IP, SERV6_REWRITE_PORT, + NULL, + SUCCESS, }, { SOCK_ADDR_TEST_BIND, "bind6: kernel_bind (dgram)", bind_v6_prog_load, bind_v6_prog_destroy, + BPF_CGROUP_INET6_BIND, &kern_ops_sock_sendmsg, AF_INET6, SOCK_DGRAM, @@ -502,6 +542,8 @@ static struct sock_addr_test tests[] = { SERV6_PORT, SERV6_REWRITE_IP, SERV6_REWRITE_PORT, + NULL, + SUCCESS, }, /* connect - system calls */ @@ -510,6 +552,7 @@ static struct sock_addr_test tests[] = { "connect4: connect (stream)", connect_v4_prog_load, connect_v4_prog_destroy, + BPF_CGROUP_INET4_CONNECT, &user_ops, AF_INET, SOCK_STREAM, @@ -518,12 +561,14 @@ static struct sock_addr_test tests[] = { SERV4_REWRITE_IP, SERV4_REWRITE_PORT, SRC4_REWRITE_IP, + SUCCESS, }, { SOCK_ADDR_TEST_CONNECT, "connect4: connect (dgram)", connect_v4_prog_load, connect_v4_prog_destroy, + BPF_CGROUP_INET4_CONNECT, &user_ops, AF_INET, SOCK_DGRAM, @@ -532,12 +577,14 @@ static struct sock_addr_test tests[] = { SERV4_REWRITE_IP, SERV4_REWRITE_PORT, SRC4_REWRITE_IP, + SUCCESS, }, { SOCK_ADDR_TEST_CONNECT, "connect6: connect (stream)", connect_v6_prog_load, connect_v6_prog_destroy, + BPF_CGROUP_INET6_CONNECT, &user_ops, AF_INET6, SOCK_STREAM, @@ -546,12 +593,14 @@ static struct sock_addr_test tests[] = { SERV6_REWRITE_IP, SERV6_REWRITE_PORT, SRC6_REWRITE_IP, + SUCCESS, }, { SOCK_ADDR_TEST_CONNECT, "connect6: connect (dgram)", connect_v6_prog_load, connect_v6_prog_destroy, + BPF_CGROUP_INET6_CONNECT, &user_ops, AF_INET6, SOCK_DGRAM, @@ -560,12 +609,14 @@ static struct sock_addr_test tests[] = { SERV6_REWRITE_IP, SERV6_REWRITE_PORT, SRC6_REWRITE_IP, + SUCCESS, }, { SOCK_ADDR_TEST_CONNECT, "connect_unix: connect (stream)", connect_unix_prog_load, connect_unix_prog_destroy, + BPF_CGROUP_UNIX_CONNECT, &user_ops, AF_UNIX, SOCK_STREAM, @@ -574,6 +625,7 @@ static struct sock_addr_test tests[] = { SERVUN_REWRITE_ADDRESS, 0, NULL, + SUCCESS, }, /* connect - kernel calls */ @@ -582,6 +634,7 @@ static struct sock_addr_test tests[] = { "connect4: kernel_connect (stream)", connect_v4_prog_load, connect_v4_prog_destroy, + BPF_CGROUP_INET4_CONNECT, &kern_ops_sock_sendmsg, AF_INET, SOCK_STREAM, @@ -590,12 +643,14 @@ static struct sock_addr_test tests[] = { SERV4_REWRITE_IP, SERV4_REWRITE_PORT, SRC4_REWRITE_IP, + SUCCESS, }, { SOCK_ADDR_TEST_CONNECT, "connect4: kernel_connect (dgram)", connect_v4_prog_load, connect_v4_prog_destroy, + BPF_CGROUP_INET4_CONNECT, &kern_ops_sock_sendmsg, AF_INET, SOCK_DGRAM, @@ -604,12 +659,14 @@ static struct sock_addr_test tests[] = { SERV4_REWRITE_IP, SERV4_REWRITE_PORT, SRC4_REWRITE_IP, + SUCCESS, }, { SOCK_ADDR_TEST_CONNECT, "connect6: kernel_connect (stream)", connect_v6_prog_load, connect_v6_prog_destroy, + BPF_CGROUP_INET6_CONNECT, &kern_ops_sock_sendmsg, AF_INET6, SOCK_STREAM, @@ -618,12 +675,14 @@ static struct sock_addr_test tests[] = { SERV6_REWRITE_IP, SERV6_REWRITE_PORT, SRC6_REWRITE_IP, + SUCCESS, }, { SOCK_ADDR_TEST_CONNECT, "connect6: kernel_connect (dgram)", connect_v6_prog_load, connect_v6_prog_destroy, + BPF_CGROUP_INET6_CONNECT, &kern_ops_sock_sendmsg, AF_INET6, SOCK_DGRAM, @@ -632,12 +691,14 @@ static struct sock_addr_test tests[] = { SERV6_REWRITE_IP, SERV6_REWRITE_PORT, SRC6_REWRITE_IP, + SUCCESS, }, { SOCK_ADDR_TEST_CONNECT, "connect_unix: kernel_connect (dgram)", connect_unix_prog_load, connect_unix_prog_destroy, + BPF_CGROUP_UNIX_CONNECT, &kern_ops_sock_sendmsg, AF_UNIX, SOCK_STREAM, @@ -646,6 +707,7 @@ static struct sock_addr_test tests[] = { SERVUN_REWRITE_ADDRESS, 0, NULL, + SUCCESS, }, /* sendmsg - system calls */ @@ -654,6 +716,7 @@ static struct sock_addr_test tests[] = { "sendmsg4: sendmsg (dgram)", sendmsg_v4_prog_load, sendmsg_v4_prog_destroy, + BPF_CGROUP_UDP4_SENDMSG, &user_ops, AF_INET, SOCK_DGRAM, @@ -662,12 +725,14 @@ static struct sock_addr_test tests[] = { SERV4_REWRITE_IP, SERV4_REWRITE_PORT, SRC4_REWRITE_IP, + SUCCESS, }, { SOCK_ADDR_TEST_SENDMSG, "sendmsg6: sendmsg (dgram)", sendmsg_v6_prog_load, sendmsg_v6_prog_destroy, + BPF_CGROUP_UDP6_SENDMSG, &user_ops, AF_INET6, SOCK_DGRAM, @@ -676,12 +741,14 @@ static struct sock_addr_test tests[] = { SERV6_REWRITE_IP, SERV6_REWRITE_PORT, SRC6_REWRITE_IP, + SUCCESS, }, { SOCK_ADDR_TEST_SENDMSG, "sendmsg_unix: sendmsg (dgram)", sendmsg_unix_prog_load, sendmsg_unix_prog_destroy, + BPF_CGROUP_UNIX_SENDMSG, &user_ops, AF_UNIX, SOCK_DGRAM, @@ -690,6 +757,7 @@ static struct sock_addr_test tests[] = { SERVUN_REWRITE_ADDRESS, 0, NULL, + SUCCESS, }, /* sendmsg - kernel calls (sock_sendmsg) */ @@ -698,6 +766,7 @@ static struct sock_addr_test tests[] = { "sendmsg4: sock_sendmsg (dgram)", sendmsg_v4_prog_load, sendmsg_v4_prog_destroy, + BPF_CGROUP_UDP4_SENDMSG, &kern_ops_sock_sendmsg, AF_INET, SOCK_DGRAM, @@ -706,12 +775,14 @@ static struct sock_addr_test tests[] = { SERV4_REWRITE_IP, SERV4_REWRITE_PORT, SRC4_REWRITE_IP, + SUCCESS, }, { SOCK_ADDR_TEST_SENDMSG, "sendmsg6: sock_sendmsg (dgram)", sendmsg_v6_prog_load, sendmsg_v6_prog_destroy, + BPF_CGROUP_UDP6_SENDMSG, &kern_ops_sock_sendmsg, AF_INET6, SOCK_DGRAM, @@ -720,12 +791,14 @@ static struct sock_addr_test tests[] = { SERV6_REWRITE_IP, SERV6_REWRITE_PORT, SRC6_REWRITE_IP, + SUCCESS, }, { SOCK_ADDR_TEST_SENDMSG, "sendmsg_unix: sock_sendmsg (dgram)", sendmsg_unix_prog_load, sendmsg_unix_prog_destroy, + BPF_CGROUP_UNIX_SENDMSG, &kern_ops_sock_sendmsg, AF_UNIX, SOCK_DGRAM, @@ -734,6 +807,7 @@ static struct sock_addr_test tests[] = { SERVUN_REWRITE_ADDRESS, 0, NULL, + SUCCESS, }, /* sendmsg - kernel calls (kernel_sendmsg) */ @@ -742,6 +816,7 @@ static struct sock_addr_test tests[] = { "sendmsg4: kernel_sendmsg (dgram)", sendmsg_v4_prog_load, sendmsg_v4_prog_destroy, + BPF_CGROUP_UDP4_SENDMSG, &kern_ops_kernel_sendmsg, AF_INET, SOCK_DGRAM, @@ -750,12 +825,14 @@ static struct sock_addr_test tests[] = { SERV4_REWRITE_IP, SERV4_REWRITE_PORT, SRC4_REWRITE_IP, + SUCCESS, }, { SOCK_ADDR_TEST_SENDMSG, "sendmsg6: kernel_sendmsg (dgram)", sendmsg_v6_prog_load, sendmsg_v6_prog_destroy, + BPF_CGROUP_UDP6_SENDMSG, &kern_ops_kernel_sendmsg, AF_INET6, SOCK_DGRAM, @@ -764,12 +841,14 @@ static struct sock_addr_test tests[] = { SERV6_REWRITE_IP, SERV6_REWRITE_PORT, SRC6_REWRITE_IP, + SUCCESS, }, { SOCK_ADDR_TEST_SENDMSG, "sendmsg_unix: sock_sendmsg (dgram)", sendmsg_unix_prog_load, sendmsg_unix_prog_destroy, + BPF_CGROUP_UNIX_SENDMSG, &kern_ops_kernel_sendmsg, AF_UNIX, SOCK_DGRAM, @@ -778,6 +857,7 @@ static struct sock_addr_test tests[] = { SERVUN_REWRITE_ADDRESS, 0, NULL, + SUCCESS, }, /* recvmsg - system calls */ @@ -786,6 +866,7 @@ static struct sock_addr_test tests[] = { "recvmsg4: recvfrom (dgram)", recvmsg4_prog_load, recvmsg4_prog_destroy, + BPF_CGROUP_UDP4_RECVMSG, &user_ops, AF_INET, SOCK_DGRAM, @@ -794,12 +875,14 @@ static struct sock_addr_test tests[] = { SERV4_REWRITE_IP, SERV4_REWRITE_PORT, SERV4_IP, + SUCCESS, }, { SOCK_ADDR_TEST_RECVMSG, "recvmsg6: recvfrom (dgram)", recvmsg6_prog_load, recvmsg6_prog_destroy, + BPF_CGROUP_UDP6_RECVMSG, &user_ops, AF_INET6, SOCK_DGRAM, @@ -808,12 +891,14 @@ static struct sock_addr_test tests[] = { SERV6_REWRITE_IP, SERV6_REWRITE_PORT, SERV6_IP, + SUCCESS, }, { SOCK_ADDR_TEST_RECVMSG, "recvmsg_unix: recvfrom (dgram)", recvmsg_unix_prog_load, recvmsg_unix_prog_destroy, + BPF_CGROUP_UNIX_RECVMSG, &user_ops, AF_UNIX, SOCK_DGRAM, @@ -822,12 +907,14 @@ static struct sock_addr_test tests[] = { SERVUN_REWRITE_ADDRESS, 0, SERVUN_ADDRESS, + SUCCESS, }, { SOCK_ADDR_TEST_RECVMSG, "recvmsg_unix: recvfrom (stream)", recvmsg_unix_prog_load, recvmsg_unix_prog_destroy, + BPF_CGROUP_UNIX_RECVMSG, &user_ops, AF_UNIX, SOCK_STREAM, @@ -836,6 +923,7 @@ static struct sock_addr_test tests[] = { SERVUN_REWRITE_ADDRESS, 0, SERVUN_ADDRESS, + SUCCESS, }, /* getsockname - system calls */ @@ -844,6 +932,7 @@ static struct sock_addr_test tests[] = { "getsockname_unix", getsockname_unix_prog_load, getsockname_unix_prog_destroy, + BPF_CGROUP_UNIX_GETSOCKNAME, &user_ops, AF_UNIX, SOCK_STREAM, @@ -852,6 +941,7 @@ static struct sock_addr_test tests[] = { SERVUN_REWRITE_ADDRESS, 0, NULL, + SUCCESS, }, /* getpeername - system calls */ @@ -860,6 +950,7 @@ static struct sock_addr_test tests[] = { "getpeername_unix", getpeername_unix_prog_load, getpeername_unix_prog_destroy, + BPF_CGROUP_UNIX_GETPEERNAME, &user_ops, AF_UNIX, SOCK_STREAM, @@ -868,6 +959,7 @@ static struct sock_addr_test tests[] = { SERVUN_REWRITE_ADDRESS, 0, NULL, + SUCCESS, }, }; @@ -1249,7 +1341,8 @@ void test_sock_addr(void) if (!test__start_subtest(test->name)) continue; - skel = test->loadfn(cgroup_fd); + skel = test->loadfn(cgroup_fd, test->attach_type, + test->expected_result == LOAD_REJECT); if (!skel) continue; From 5a047b2226c0511d4528d1467dc90f08fffafc38 Mon Sep 17 00:00:00 2001 From: Jordan Rife Date: Fri, 10 May 2024 14:02:21 -0500 Subject: [PATCH 099/119] selftests/bpf: Handle ATTACH_REJECT test cases In preparation to move test cases from bpf/test_sock_addr.c that expect ATTACH_REJECT, this patch adds BPF_SKEL_FUNCS_RAW to generate load and destroy functions that use bpf_prog_attach() to control the attach_type. The normal load functions use bpf_program__attach_cgroup which does not have the same degree of control over the attach type, as bpf_program_attach_fd() calls bpf_link_create() with the attach type extracted from prog using bpf_program__expected_attach_type(). It is currently not possible to modify the attach type before bpf_program__attach_cgroup() is called, since bpf_program__set_expected_attach_type() has no effect after the program is loaded. Signed-off-by: Jordan Rife Link: https://lore.kernel.org/r/20240510190246.3247730-5-jrife@google.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/sock_addr.c | 35 ++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/sock_addr.c b/tools/testing/selftests/bpf/prog_tests/sock_addr.c index 3033641fd756..53440458f365 100644 --- a/tools/testing/selftests/bpf/prog_tests/sock_addr.c +++ b/tools/testing/selftests/bpf/prog_tests/sock_addr.c @@ -367,6 +367,38 @@ struct sock_addr_test { } expected_result; }; +#define BPF_SKEL_FUNCS_RAW(skel_name, prog_name) \ +static void *prog_name##_load_raw(int cgroup_fd, \ + enum bpf_attach_type attach_type, \ + bool expect_reject) \ +{ \ + struct skel_name *skel = skel_name##__open(); \ + int prog_fd = -1; \ + if (!ASSERT_OK_PTR(skel, "skel_open")) \ + goto cleanup; \ + if (!ASSERT_OK(skel_name##__load(skel), "load")) \ + goto cleanup; \ + prog_fd = bpf_program__fd(skel->progs.prog_name); \ + if (!ASSERT_GT(prog_fd, 0, "prog_fd")) \ + goto cleanup; \ + if (bpf_prog_attach(prog_fd, cgroup_fd, attach_type, \ + BPF_F_ALLOW_OVERRIDE), "bpf_prog_attach") { \ + ASSERT_TRUE(expect_reject, "unexpected rejection"); \ + goto cleanup; \ + } \ + if (!ASSERT_FALSE(expect_reject, "expected rejection")) \ + goto cleanup; \ +cleanup: \ + if (prog_fd > 0) \ + bpf_prog_detach(cgroup_fd, attach_type); \ + skel_name##__destroy(skel); \ + return NULL; \ +} \ +static void prog_name##_destroy_raw(void *progfd) \ +{ \ + /* No-op. *_load_raw does all cleanup. */ \ +} \ + #define BPF_SKEL_FUNCS(skel_name, prog_name) \ static void *prog_name##_load(int cgroup_fd, \ enum bpf_attach_type attach_type, \ @@ -1342,7 +1374,8 @@ void test_sock_addr(void) continue; skel = test->loadfn(cgroup_fd, test->attach_type, - test->expected_result == LOAD_REJECT); + test->expected_result == LOAD_REJECT || + test->expected_result == ATTACH_REJECT); if (!skel) continue; From a2618c0d854235deaac2325cf8200a55274afa2b Mon Sep 17 00:00:00 2001 From: Jordan Rife Date: Fri, 10 May 2024 14:02:22 -0500 Subject: [PATCH 100/119] selftests/bpf: Handle SYSCALL_EPERM and SYSCALL_ENOTSUPP test cases In preparation to move test cases from bpf/test_sock_addr.c that expect system calls to return ENOTSUPP or EPERM, this patch propagates errno from relevant system calls up to test_sock_addr() where the result can be checked. Signed-off-by: Jordan Rife Link: https://lore.kernel.org/r/20240510190246.3247730-6-jrife@google.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/sock_addr.c | 78 ++++++++++++++----- 1 file changed, 58 insertions(+), 20 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/sock_addr.c b/tools/testing/selftests/bpf/prog_tests/sock_addr.c index 53440458f365..626be900a8fd 100644 --- a/tools/testing/selftests/bpf/prog_tests/sock_addr.c +++ b/tools/testing/selftests/bpf/prog_tests/sock_addr.c @@ -19,6 +19,10 @@ #include "getpeername_unix_prog.skel.h" #include "network_helpers.h" +#ifndef ENOTSUPP +# define ENOTSUPP 524 +#endif + #define TEST_NS "sock_addr" #define TEST_IF_PREFIX "test_sock_addr" #define TEST_IPV4 "127.0.0.4" @@ -43,6 +47,8 @@ #define SERVUN_REWRITE_ADDRESS "bpf_cgroup_unix_test_rewrite" #define SRCUN_ADDRESS "bpf_cgroup_unix_test_src" +#define save_errno_do(op) ({ int __save = errno; op; errno = __save; }) + enum sock_addr_test_type { SOCK_ADDR_TEST_BIND, SOCK_ADDR_TEST_CONNECT, @@ -98,6 +104,7 @@ static int run_bpf_prog(const char *prog_name, void *ctx, int ctx_size) goto err; err = topts.retval; + errno = -topts.retval; goto out; err: err = -1; @@ -221,8 +228,7 @@ int kernel_connect_to_addr(int type, const struct sockaddr_storage *addr, sockle "kernel_init_sock")) goto err; - if (!ASSERT_OK(kernel_connect((struct sockaddr *)addr, addrlen), - "kernel_connect")) + if (kernel_connect((struct sockaddr *)addr, addrlen) < 0) goto err; /* Test code expects a "file descriptor" on success. */ @@ -230,7 +236,7 @@ int kernel_connect_to_addr(int type, const struct sockaddr_storage *addr, sockle goto out; err: err = -1; - ASSERT_OK(kernel_close_sock(0), "kernel_close_sock"); + save_errno_do(ASSERT_OK(kernel_close_sock(0), "kernel_close_sock")); out: return err; } @@ -248,8 +254,7 @@ int kernel_start_server(int family, int type, const char *addr_str, __u16 port, if (make_sockaddr(family, addr_str, port, &addr, &addrlen)) goto err; - if (!ASSERT_OK(kernel_bind(0, (struct sockaddr *)&addr, addrlen), - "kernel_bind")) + if (kernel_bind(0, (struct sockaddr *)&addr, addrlen) < 0) goto err; if (type == SOCK_STREAM) { @@ -262,7 +267,7 @@ int kernel_start_server(int family, int type, const char *addr_str, __u16 port, goto out; err: err = -1; - ASSERT_OK(kernel_close_sock(0), "kernel_close_sock"); + save_errno_do(ASSERT_OK(kernel_close_sock(0), "kernel_close_sock")); out: return err; } @@ -1066,7 +1071,7 @@ static void unload_sock_addr_kern(void) sock_addr_kern__destroy(skel); } -static void test_bind(struct sock_addr_test *test) +static int test_bind(struct sock_addr_test *test) { struct sockaddr_storage expected_addr; socklen_t expected_addr_len = sizeof(struct sockaddr_storage); @@ -1075,8 +1080,10 @@ static void test_bind(struct sock_addr_test *test) serv = test->ops->start_server(test->socket_family, test->socket_type, test->requested_addr, test->requested_port, 0); - if (!ASSERT_GE(serv, 0, "start_server")) - goto cleanup; + if (serv < 0) { + err = errno; + goto err; + } err = make_sockaddr(test->socket_family, test->expected_addr, test->expected_port, @@ -1095,13 +1102,17 @@ static void test_bind(struct sock_addr_test *test) goto cleanup; cleanup: + err = 0; +err: if (client != -1) close(client); if (serv != -1) test->ops->close(serv); + + return err; } -static void test_connect(struct sock_addr_test *test) +static int test_connect(struct sock_addr_test *test) { struct sockaddr_storage addr, expected_addr, expected_src_addr; socklen_t addr_len = sizeof(struct sockaddr_storage), @@ -1121,8 +1132,10 @@ static void test_connect(struct sock_addr_test *test) client = test->ops->connect_to_addr(test->socket_type, &addr, addr_len, NULL); - if (!ASSERT_GE(client, 0, "connect_to_addr")) - goto cleanup; + if (client < 0) { + err = errno; + goto err; + } err = make_sockaddr(test->socket_family, test->expected_addr, test->expected_port, &expected_addr, &expected_addr_len); @@ -1149,13 +1162,17 @@ static void test_connect(struct sock_addr_test *test) goto cleanup; } cleanup: + err = 0; +err: if (client != -1) test->ops->close(client); if (serv != -1) close(serv); + + return err; } -static void test_xmsg(struct sock_addr_test *test) +static int test_xmsg(struct sock_addr_test *test) { struct sockaddr_storage addr, src_addr; socklen_t addr_len = sizeof(struct sockaddr_storage), @@ -1196,6 +1213,11 @@ static void test_xmsg(struct sock_addr_test *test) if (test->socket_type == SOCK_DGRAM) { err = test->ops->sendmsg(client, (struct sockaddr *)&addr, addr_len, &data, sizeof(data)); + if (err < 0) { + err = errno; + goto err; + } + if (!ASSERT_EQ(err, sizeof(data), "sendmsg")) goto cleanup; } else { @@ -1245,13 +1267,17 @@ static void test_xmsg(struct sock_addr_test *test) } cleanup: + err = 0; +err: if (client != -1) test->ops->close(client); if (serv != -1) close(serv); + + return err; } -static void test_getsockname(struct sock_addr_test *test) +static int test_getsockname(struct sock_addr_test *test) { struct sockaddr_storage expected_addr; socklen_t expected_addr_len = sizeof(struct sockaddr_storage); @@ -1275,9 +1301,11 @@ static void test_getsockname(struct sock_addr_test *test) cleanup: if (serv != -1) test->ops->close(serv); + + return 0; } -static void test_getpeername(struct sock_addr_test *test) +static int test_getpeername(struct sock_addr_test *test) { struct sockaddr_storage addr, expected_addr; socklen_t addr_len = sizeof(struct sockaddr_storage), @@ -1314,6 +1342,8 @@ cleanup: test->ops->close(client); if (serv != -1) close(serv); + + return 0; } static int setup_test_env(struct nstoken **tok) @@ -1369,6 +1399,7 @@ void test_sock_addr(void) for (size_t i = 0; i < ARRAY_SIZE(tests); ++i) { struct sock_addr_test *test = &tests[i]; + int err; if (!test__start_subtest(test->name)) continue; @@ -1385,26 +1416,33 @@ void test_sock_addr(void) * the future. */ case SOCK_ADDR_TEST_BIND: - test_bind(test); + err = test_bind(test); break; case SOCK_ADDR_TEST_CONNECT: - test_connect(test); + err = test_connect(test); break; case SOCK_ADDR_TEST_SENDMSG: case SOCK_ADDR_TEST_RECVMSG: - test_xmsg(test); + err = test_xmsg(test); break; case SOCK_ADDR_TEST_GETSOCKNAME: - test_getsockname(test); + err = test_getsockname(test); break; case SOCK_ADDR_TEST_GETPEERNAME: - test_getpeername(test); + err = test_getpeername(test); break; default: ASSERT_TRUE(false, "Unknown sock addr test type"); break; } + if (test->expected_result == SYSCALL_EPERM) + ASSERT_EQ(err, EPERM, "socket operation returns EPERM"); + else if (test->expected_result == SYSCALL_ENOTSUPP) + ASSERT_EQ(err, ENOTSUPP, "socket operation returns ENOTSUPP"); + else if (test->expected_result == SUCCESS) + ASSERT_OK(err, "socket operation succeeds"); + test->destroyfn(skel); } From d1b24fcf1c16290ce8cac467be2f7d6773de9da4 Mon Sep 17 00:00:00 2001 From: Jordan Rife Date: Fri, 10 May 2024 14:02:23 -0500 Subject: [PATCH 101/119] selftests/bpf: Migrate WILDCARD_IP test Move wildcard IP sendmsg test case out of bpf/test_sock_addr.c into prog_tests/sock_addr.c. Signed-off-by: Jordan Rife Link: https://lore.kernel.org/r/20240510190246.3247730-7-jrife@google.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/sock_addr.c | 50 +++++++++++++++++++ .../selftests/bpf/progs/sendmsg6_prog.c | 6 +++ tools/testing/selftests/bpf/test_sock_addr.c | 20 -------- 3 files changed, 56 insertions(+), 20 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/sock_addr.c b/tools/testing/selftests/bpf/prog_tests/sock_addr.c index 626be900a8fd..37e9ef5a5ae1 100644 --- a/tools/testing/selftests/bpf/prog_tests/sock_addr.c +++ b/tools/testing/selftests/bpf/prog_tests/sock_addr.c @@ -40,6 +40,7 @@ #define SERV6_V4MAPPED_IP "::ffff:192.168.0.4" #define SRC6_IP "::1" #define SRC6_REWRITE_IP TEST_IPV6 +#define WILDCARD6_IP "::" #define SERV6_PORT 6060 #define SERV6_REWRITE_PORT 6666 @@ -443,6 +444,7 @@ BPF_SKEL_FUNCS(connect6_prog, connect_v6_prog); BPF_SKEL_FUNCS(connect_unix_prog, connect_unix_prog); BPF_SKEL_FUNCS(sendmsg4_prog, sendmsg_v4_prog); BPF_SKEL_FUNCS(sendmsg6_prog, sendmsg_v6_prog); +BPF_SKEL_FUNCS(sendmsg6_prog, sendmsg_v6_preserve_dst_prog); BPF_SKEL_FUNCS(sendmsg_unix_prog, sendmsg_unix_prog); BPF_SKEL_FUNCS(recvmsg4_prog, recvmsg4_prog); BPF_SKEL_FUNCS(recvmsg6_prog, recvmsg6_prog); @@ -780,6 +782,22 @@ static struct sock_addr_test tests[] = { SRC6_REWRITE_IP, SUCCESS, }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg6: sendmsg [::] (BSD'ism) (dgram)", + sendmsg_v6_preserve_dst_prog_load, + sendmsg_v6_preserve_dst_prog_destroy, + BPF_CGROUP_UDP6_SENDMSG, + &user_ops, + AF_INET6, + SOCK_DGRAM, + WILDCARD6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_PORT, + SRC6_IP, + SUCCESS, + }, { SOCK_ADDR_TEST_SENDMSG, "sendmsg_unix: sendmsg (dgram)", @@ -830,6 +848,22 @@ static struct sock_addr_test tests[] = { SRC6_REWRITE_IP, SUCCESS, }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg6: sock_sendmsg [::] (BSD'ism) (dgram)", + sendmsg_v6_preserve_dst_prog_load, + sendmsg_v6_preserve_dst_prog_destroy, + BPF_CGROUP_UDP6_SENDMSG, + &kern_ops_sock_sendmsg, + AF_INET6, + SOCK_DGRAM, + WILDCARD6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_PORT, + SRC6_IP, + SUCCESS, + }, { SOCK_ADDR_TEST_SENDMSG, "sendmsg_unix: sock_sendmsg (dgram)", @@ -880,6 +914,22 @@ static struct sock_addr_test tests[] = { SRC6_REWRITE_IP, SUCCESS, }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg6: kernel_sendmsg [::] (BSD'ism) (dgram)", + sendmsg_v6_preserve_dst_prog_load, + sendmsg_v6_preserve_dst_prog_destroy, + BPF_CGROUP_UDP6_SENDMSG, + &kern_ops_kernel_sendmsg, + AF_INET6, + SOCK_DGRAM, + WILDCARD6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_PORT, + SRC6_IP, + SUCCESS, + }, { SOCK_ADDR_TEST_SENDMSG, "sendmsg_unix: sock_sendmsg (dgram)", diff --git a/tools/testing/selftests/bpf/progs/sendmsg6_prog.c b/tools/testing/selftests/bpf/progs/sendmsg6_prog.c index bf9b46b806f6..03956a654ce5 100644 --- a/tools/testing/selftests/bpf/progs/sendmsg6_prog.c +++ b/tools/testing/selftests/bpf/progs/sendmsg6_prog.c @@ -59,4 +59,10 @@ int sendmsg_v6_prog(struct bpf_sock_addr *ctx) return 1; } +SEC("cgroup/sendmsg6") +int sendmsg_v6_preserve_dst_prog(struct bpf_sock_addr *ctx) +{ + return 1; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_sock_addr.c b/tools/testing/selftests/bpf/test_sock_addr.c index 40e33167bec2..ab8ef02c9c55 100644 --- a/tools/testing/selftests/bpf/test_sock_addr.c +++ b/tools/testing/selftests/bpf/test_sock_addr.c @@ -92,7 +92,6 @@ static int bind4_prog_load(const struct sock_addr_test *test); static int bind6_prog_load(const struct sock_addr_test *test); static int connect4_prog_load(const struct sock_addr_test *test); static int connect6_prog_load(const struct sock_addr_test *test); -static int sendmsg_allow_prog_load(const struct sock_addr_test *test); static int sendmsg_deny_prog_load(const struct sock_addr_test *test); static int sendmsg4_rw_asm_prog_load(const struct sock_addr_test *test); static int sendmsg6_rw_asm_prog_load(const struct sock_addr_test *test); @@ -343,20 +342,6 @@ static struct sock_addr_test tests[] = { SRC6_REWRITE_IP, SUCCESS, }, - { - "sendmsg6: preserve dst IP = [::] (BSD'ism)", - sendmsg_allow_prog_load, - BPF_CGROUP_UDP6_SENDMSG, - BPF_CGROUP_UDP6_SENDMSG, - AF_INET6, - SOCK_DGRAM, - WILDCARD6_IP, - SERV6_PORT, - SERV6_REWRITE_IP, - SERV6_PORT, - SRC6_IP, - SUCCESS, - }, { "sendmsg6: deny call", sendmsg_deny_prog_load, @@ -457,11 +442,6 @@ static int xmsg_ret_only_prog_load(const struct sock_addr_test *test, return load_insns(test, insns, ARRAY_SIZE(insns)); } -static int sendmsg_allow_prog_load(const struct sock_addr_test *test) -{ - return xmsg_ret_only_prog_load(test, /*rc*/ 1); -} - static int sendmsg_deny_prog_load(const struct sock_addr_test *test) { return xmsg_ret_only_prog_load(test, /*rc*/ 0); From f46a10483b27cc5a62b45e7e727445de6430e785 Mon Sep 17 00:00:00 2001 From: Jordan Rife Date: Fri, 10 May 2024 14:02:24 -0500 Subject: [PATCH 102/119] selftests/bpf: Migrate sendmsg deny test cases This set of tests checks that sendmsg calls are rejected (return -EPERM) when the sendmsg* hook returns 0. Replace those in bpf/test_sock_addr.c with corresponding tests in prog_tests/sock_addr.c. Signed-off-by: Jordan Rife Link: https://lore.kernel.org/r/20240510190246.3247730-8-jrife@google.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/sock_addr.c | 98 +++++++++++++++++++ .../selftests/bpf/progs/sendmsg4_prog.c | 6 ++ .../selftests/bpf/progs/sendmsg6_prog.c | 6 ++ tools/testing/selftests/bpf/test_sock_addr.c | 45 --------- 4 files changed, 110 insertions(+), 45 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/sock_addr.c b/tools/testing/selftests/bpf/prog_tests/sock_addr.c index 37e9ef5a5ae1..634f7a31b35d 100644 --- a/tools/testing/selftests/bpf/prog_tests/sock_addr.c +++ b/tools/testing/selftests/bpf/prog_tests/sock_addr.c @@ -443,7 +443,9 @@ BPF_SKEL_FUNCS(connect4_prog, connect_v4_prog); BPF_SKEL_FUNCS(connect6_prog, connect_v6_prog); BPF_SKEL_FUNCS(connect_unix_prog, connect_unix_prog); BPF_SKEL_FUNCS(sendmsg4_prog, sendmsg_v4_prog); +BPF_SKEL_FUNCS(sendmsg4_prog, sendmsg_v4_deny_prog); BPF_SKEL_FUNCS(sendmsg6_prog, sendmsg_v6_prog); +BPF_SKEL_FUNCS(sendmsg6_prog, sendmsg_v6_deny_prog); BPF_SKEL_FUNCS(sendmsg6_prog, sendmsg_v6_preserve_dst_prog); BPF_SKEL_FUNCS(sendmsg_unix_prog, sendmsg_unix_prog); BPF_SKEL_FUNCS(recvmsg4_prog, recvmsg4_prog); @@ -766,6 +768,22 @@ static struct sock_addr_test tests[] = { SRC4_REWRITE_IP, SUCCESS, }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg4: sendmsg deny (dgram)", + sendmsg_v4_deny_prog_load, + sendmsg_v4_deny_prog_destroy, + BPF_CGROUP_UDP4_SENDMSG, + &user_ops, + AF_INET, + SOCK_DGRAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SRC4_REWRITE_IP, + SYSCALL_EPERM, + }, { SOCK_ADDR_TEST_SENDMSG, "sendmsg6: sendmsg (dgram)", @@ -798,6 +816,22 @@ static struct sock_addr_test tests[] = { SRC6_IP, SUCCESS, }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg6: sendmsg deny (dgram)", + sendmsg_v6_deny_prog_load, + sendmsg_v6_deny_prog_destroy, + BPF_CGROUP_UDP6_SENDMSG, + &user_ops, + AF_INET6, + SOCK_DGRAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SRC6_REWRITE_IP, + SYSCALL_EPERM, + }, { SOCK_ADDR_TEST_SENDMSG, "sendmsg_unix: sendmsg (dgram)", @@ -832,6 +866,22 @@ static struct sock_addr_test tests[] = { SRC4_REWRITE_IP, SUCCESS, }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg4: sock_sendmsg deny (dgram)", + sendmsg_v4_deny_prog_load, + sendmsg_v4_deny_prog_destroy, + BPF_CGROUP_UDP4_SENDMSG, + &kern_ops_sock_sendmsg, + AF_INET, + SOCK_DGRAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SRC4_REWRITE_IP, + SYSCALL_EPERM, + }, { SOCK_ADDR_TEST_SENDMSG, "sendmsg6: sock_sendmsg (dgram)", @@ -864,6 +914,22 @@ static struct sock_addr_test tests[] = { SRC6_IP, SUCCESS, }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg6: sock_sendmsg deny (dgram)", + sendmsg_v6_deny_prog_load, + sendmsg_v6_deny_prog_destroy, + BPF_CGROUP_UDP6_SENDMSG, + &kern_ops_sock_sendmsg, + AF_INET6, + SOCK_DGRAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SRC6_REWRITE_IP, + SYSCALL_EPERM, + }, { SOCK_ADDR_TEST_SENDMSG, "sendmsg_unix: sock_sendmsg (dgram)", @@ -898,6 +964,22 @@ static struct sock_addr_test tests[] = { SRC4_REWRITE_IP, SUCCESS, }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg4: kernel_sendmsg deny (dgram)", + sendmsg_v4_deny_prog_load, + sendmsg_v4_deny_prog_destroy, + BPF_CGROUP_UDP4_SENDMSG, + &kern_ops_kernel_sendmsg, + AF_INET, + SOCK_DGRAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SRC4_REWRITE_IP, + SYSCALL_EPERM, + }, { SOCK_ADDR_TEST_SENDMSG, "sendmsg6: kernel_sendmsg (dgram)", @@ -930,6 +1012,22 @@ static struct sock_addr_test tests[] = { SRC6_IP, SUCCESS, }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg6: kernel_sendmsg deny (dgram)", + sendmsg_v6_deny_prog_load, + sendmsg_v6_deny_prog_destroy, + BPF_CGROUP_UDP6_SENDMSG, + &kern_ops_kernel_sendmsg, + AF_INET6, + SOCK_DGRAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SRC6_REWRITE_IP, + SYSCALL_EPERM, + }, { SOCK_ADDR_TEST_SENDMSG, "sendmsg_unix: sock_sendmsg (dgram)", diff --git a/tools/testing/selftests/bpf/progs/sendmsg4_prog.c b/tools/testing/selftests/bpf/progs/sendmsg4_prog.c index 351e79aef2fa..edc159598a0e 100644 --- a/tools/testing/selftests/bpf/progs/sendmsg4_prog.c +++ b/tools/testing/selftests/bpf/progs/sendmsg4_prog.c @@ -49,4 +49,10 @@ int sendmsg_v4_prog(struct bpf_sock_addr *ctx) return 1; } +SEC("cgroup/sendmsg4") +int sendmsg_v4_deny_prog(struct bpf_sock_addr *ctx) +{ + return 0; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/sendmsg6_prog.c b/tools/testing/selftests/bpf/progs/sendmsg6_prog.c index 03956a654ce5..0c1825cb994d 100644 --- a/tools/testing/selftests/bpf/progs/sendmsg6_prog.c +++ b/tools/testing/selftests/bpf/progs/sendmsg6_prog.c @@ -65,4 +65,10 @@ int sendmsg_v6_preserve_dst_prog(struct bpf_sock_addr *ctx) return 1; } +SEC("cgroup/sendmsg6") +int sendmsg_v6_deny_prog(struct bpf_sock_addr *ctx) +{ + return 0; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_sock_addr.c b/tools/testing/selftests/bpf/test_sock_addr.c index ab8ef02c9c55..91d88358090e 100644 --- a/tools/testing/selftests/bpf/test_sock_addr.c +++ b/tools/testing/selftests/bpf/test_sock_addr.c @@ -92,7 +92,6 @@ static int bind4_prog_load(const struct sock_addr_test *test); static int bind6_prog_load(const struct sock_addr_test *test); static int connect4_prog_load(const struct sock_addr_test *test); static int connect6_prog_load(const struct sock_addr_test *test); -static int sendmsg_deny_prog_load(const struct sock_addr_test *test); static int sendmsg4_rw_asm_prog_load(const struct sock_addr_test *test); static int sendmsg6_rw_asm_prog_load(const struct sock_addr_test *test); static int sendmsg6_rw_v4mapped_prog_load(const struct sock_addr_test *test); @@ -258,20 +257,6 @@ static struct sock_addr_test tests[] = { SRC4_REWRITE_IP, SUCCESS, }, - { - "sendmsg4: deny call", - sendmsg_deny_prog_load, - BPF_CGROUP_UDP4_SENDMSG, - BPF_CGROUP_UDP4_SENDMSG, - AF_INET, - SOCK_DGRAM, - SERV4_IP, - SERV4_PORT, - SERV4_REWRITE_IP, - SERV4_REWRITE_PORT, - SRC4_REWRITE_IP, - SYSCALL_EPERM, - }, { "sendmsg6: load prog with wrong expected attach type", sendmsg6_rw_asm_prog_load, @@ -342,20 +327,6 @@ static struct sock_addr_test tests[] = { SRC6_REWRITE_IP, SUCCESS, }, - { - "sendmsg6: deny call", - sendmsg_deny_prog_load, - BPF_CGROUP_UDP6_SENDMSG, - BPF_CGROUP_UDP6_SENDMSG, - AF_INET6, - SOCK_DGRAM, - SERV6_IP, - SERV6_PORT, - SERV6_REWRITE_IP, - SERV6_REWRITE_PORT, - SRC6_REWRITE_IP, - SYSCALL_EPERM, - }, }; static int load_insns(const struct sock_addr_test *test, @@ -431,22 +402,6 @@ static int connect6_prog_load(const struct sock_addr_test *test) return load_path(test, CONNECT6_PROG_PATH); } -static int xmsg_ret_only_prog_load(const struct sock_addr_test *test, - int32_t rc) -{ - struct bpf_insn insns[] = { - /* return rc */ - BPF_MOV64_IMM(BPF_REG_0, rc), - BPF_EXIT_INSN(), - }; - return load_insns(test, insns, ARRAY_SIZE(insns)); -} - -static int sendmsg_deny_prog_load(const struct sock_addr_test *test) -{ - return xmsg_ret_only_prog_load(test, /*rc*/ 0); -} - static int sendmsg4_rw_asm_prog_load(const struct sock_addr_test *test) { struct sockaddr_in dst4_rw_addr; From 54462e8452f139e313e315959e005408cd31a4e6 Mon Sep 17 00:00:00 2001 From: Jordan Rife Date: Fri, 10 May 2024 14:02:25 -0500 Subject: [PATCH 103/119] selftests/bpf: Migrate sendmsg6 v4 mapped address tests Migrate test case from bpf/test_sock_addr.c ensuring that sendmsg returns -ENOTSUPP when sending to an IPv4-mapped IPv6 address to prog_tests/sock_addr.c. Signed-off-by: Jordan Rife Link: https://lore.kernel.org/r/20240510190246.3247730-9-jrife@google.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/sock_addr.c | 17 +++++++++++++ .../selftests/bpf/progs/sendmsg6_prog.c | 25 +++++++++++++++++++ tools/testing/selftests/bpf/test_sock_addr.c | 20 --------------- 3 files changed, 42 insertions(+), 20 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/sock_addr.c b/tools/testing/selftests/bpf/prog_tests/sock_addr.c index 634f7a31b35d..f096203171b1 100644 --- a/tools/testing/selftests/bpf/prog_tests/sock_addr.c +++ b/tools/testing/selftests/bpf/prog_tests/sock_addr.c @@ -447,6 +447,7 @@ BPF_SKEL_FUNCS(sendmsg4_prog, sendmsg_v4_deny_prog); BPF_SKEL_FUNCS(sendmsg6_prog, sendmsg_v6_prog); BPF_SKEL_FUNCS(sendmsg6_prog, sendmsg_v6_deny_prog); BPF_SKEL_FUNCS(sendmsg6_prog, sendmsg_v6_preserve_dst_prog); +BPF_SKEL_FUNCS(sendmsg6_prog, sendmsg_v6_v4mapped_prog); BPF_SKEL_FUNCS(sendmsg_unix_prog, sendmsg_unix_prog); BPF_SKEL_FUNCS(recvmsg4_prog, recvmsg4_prog); BPF_SKEL_FUNCS(recvmsg6_prog, recvmsg6_prog); @@ -832,6 +833,22 @@ static struct sock_addr_test tests[] = { SRC6_REWRITE_IP, SYSCALL_EPERM, }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg6: sendmsg IPv4-mapped IPv6 (dgram)", + sendmsg_v6_v4mapped_prog_load, + sendmsg_v6_v4mapped_prog_destroy, + BPF_CGROUP_UDP6_SENDMSG, + &user_ops, + AF_INET6, + SOCK_DGRAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SRC6_REWRITE_IP, + SYSCALL_ENOTSUPP, + }, { SOCK_ADDR_TEST_SENDMSG, "sendmsg_unix: sendmsg (dgram)", diff --git a/tools/testing/selftests/bpf/progs/sendmsg6_prog.c b/tools/testing/selftests/bpf/progs/sendmsg6_prog.c index 0c1825cb994d..7611d9e17dd1 100644 --- a/tools/testing/selftests/bpf/progs/sendmsg6_prog.c +++ b/tools/testing/selftests/bpf/progs/sendmsg6_prog.c @@ -20,6 +20,11 @@ #define DST_REWRITE_IP6_2 0 #define DST_REWRITE_IP6_3 1 +#define DST_REWRITE_IP6_V4_MAPPED_0 0 +#define DST_REWRITE_IP6_V4_MAPPED_1 0 +#define DST_REWRITE_IP6_V4_MAPPED_2 0x0000FFFF +#define DST_REWRITE_IP6_V4_MAPPED_3 0xc0a80004 // 192.168.0.4 + #define DST_REWRITE_PORT6 6666 SEC("cgroup/sendmsg6") @@ -59,6 +64,26 @@ int sendmsg_v6_prog(struct bpf_sock_addr *ctx) return 1; } +SEC("cgroup/sendmsg6") +int sendmsg_v6_v4mapped_prog(struct bpf_sock_addr *ctx) +{ + /* Rewrite source. */ + ctx->msg_src_ip6[0] = bpf_htonl(SRC_REWRITE_IP6_0); + ctx->msg_src_ip6[1] = bpf_htonl(SRC_REWRITE_IP6_1); + ctx->msg_src_ip6[2] = bpf_htonl(SRC_REWRITE_IP6_2); + ctx->msg_src_ip6[3] = bpf_htonl(SRC_REWRITE_IP6_3); + + /* Rewrite destination. */ + ctx->user_ip6[0] = bpf_htonl(DST_REWRITE_IP6_V4_MAPPED_0); + ctx->user_ip6[1] = bpf_htonl(DST_REWRITE_IP6_V4_MAPPED_1); + ctx->user_ip6[2] = bpf_htonl(DST_REWRITE_IP6_V4_MAPPED_2); + ctx->user_ip6[3] = bpf_htonl(DST_REWRITE_IP6_V4_MAPPED_3); + + ctx->user_port = bpf_htons(DST_REWRITE_PORT6); + + return 1; +} + SEC("cgroup/sendmsg6") int sendmsg_v6_preserve_dst_prog(struct bpf_sock_addr *ctx) { diff --git a/tools/testing/selftests/bpf/test_sock_addr.c b/tools/testing/selftests/bpf/test_sock_addr.c index 91d88358090e..4ead113753f8 100644 --- a/tools/testing/selftests/bpf/test_sock_addr.c +++ b/tools/testing/selftests/bpf/test_sock_addr.c @@ -94,7 +94,6 @@ static int connect4_prog_load(const struct sock_addr_test *test); static int connect6_prog_load(const struct sock_addr_test *test); static int sendmsg4_rw_asm_prog_load(const struct sock_addr_test *test); static int sendmsg6_rw_asm_prog_load(const struct sock_addr_test *test); -static int sendmsg6_rw_v4mapped_prog_load(const struct sock_addr_test *test); static int sendmsg6_rw_wildcard_prog_load(const struct sock_addr_test *test); static struct sock_addr_test tests[] = { @@ -299,20 +298,6 @@ static struct sock_addr_test tests[] = { SRC6_REWRITE_IP, SUCCESS, }, - { - "sendmsg6: IPv4-mapped IPv6", - sendmsg6_rw_v4mapped_prog_load, - BPF_CGROUP_UDP6_SENDMSG, - BPF_CGROUP_UDP6_SENDMSG, - AF_INET6, - SOCK_DGRAM, - SERV6_IP, - SERV6_PORT, - SERV6_REWRITE_IP, - SERV6_REWRITE_PORT, - SRC6_REWRITE_IP, - SYSCALL_ENOTSUPP, - }, { "sendmsg6: set dst IP = [::] (BSD'ism)", sendmsg6_rw_wildcard_prog_load, @@ -512,11 +497,6 @@ static int sendmsg6_rw_asm_prog_load(const struct sock_addr_test *test) return sendmsg6_rw_dst_asm_prog_load(test, SERV6_REWRITE_IP); } -static int sendmsg6_rw_v4mapped_prog_load(const struct sock_addr_test *test) -{ - return sendmsg6_rw_dst_asm_prog_load(test, SERV6_V4MAPPED_IP); -} - static int sendmsg6_rw_wildcard_prog_load(const struct sock_addr_test *test) { return sendmsg6_rw_dst_asm_prog_load(test, WILDCARD6_IP); From 8eaf8056a44b28a7b198aa699e35854bbec2c452 Mon Sep 17 00:00:00 2001 From: Jordan Rife Date: Fri, 10 May 2024 14:02:26 -0500 Subject: [PATCH 104/119] selftests/bpf: Migrate wildcard destination rewrite test Migrate test case from bpf/test_sock_addr.c ensuring that sendmsg respects when sendmsg6 hooks rewrite the destination IP with the IPv6 wildcard IP, [::]. Signed-off-by: Jordan Rife Link: https://lore.kernel.org/r/20240510190246.3247730-10-jrife@google.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/sock_addr.c | 17 ++++++++++++++++ .../selftests/bpf/progs/sendmsg6_prog.c | 20 +++++++++++++++++++ tools/testing/selftests/bpf/test_sock_addr.c | 20 ------------------- 3 files changed, 37 insertions(+), 20 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/sock_addr.c b/tools/testing/selftests/bpf/prog_tests/sock_addr.c index f096203171b1..e3c450d11b9e 100644 --- a/tools/testing/selftests/bpf/prog_tests/sock_addr.c +++ b/tools/testing/selftests/bpf/prog_tests/sock_addr.c @@ -448,6 +448,7 @@ BPF_SKEL_FUNCS(sendmsg6_prog, sendmsg_v6_prog); BPF_SKEL_FUNCS(sendmsg6_prog, sendmsg_v6_deny_prog); BPF_SKEL_FUNCS(sendmsg6_prog, sendmsg_v6_preserve_dst_prog); BPF_SKEL_FUNCS(sendmsg6_prog, sendmsg_v6_v4mapped_prog); +BPF_SKEL_FUNCS(sendmsg6_prog, sendmsg_v6_wildcard_prog); BPF_SKEL_FUNCS(sendmsg_unix_prog, sendmsg_unix_prog); BPF_SKEL_FUNCS(recvmsg4_prog, recvmsg4_prog); BPF_SKEL_FUNCS(recvmsg6_prog, recvmsg6_prog); @@ -849,6 +850,22 @@ static struct sock_addr_test tests[] = { SRC6_REWRITE_IP, SYSCALL_ENOTSUPP, }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg6: sendmsg dst IP = [::] (BSD'ism) (dgram)", + sendmsg_v6_wildcard_prog_load, + sendmsg_v6_wildcard_prog_destroy, + BPF_CGROUP_UDP6_SENDMSG, + &user_ops, + AF_INET6, + SOCK_DGRAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SRC6_REWRITE_IP, + SUCCESS, + }, { SOCK_ADDR_TEST_SENDMSG, "sendmsg_unix: sendmsg (dgram)", diff --git a/tools/testing/selftests/bpf/progs/sendmsg6_prog.c b/tools/testing/selftests/bpf/progs/sendmsg6_prog.c index 7611d9e17dd1..36a7f960799f 100644 --- a/tools/testing/selftests/bpf/progs/sendmsg6_prog.c +++ b/tools/testing/selftests/bpf/progs/sendmsg6_prog.c @@ -84,6 +84,26 @@ int sendmsg_v6_v4mapped_prog(struct bpf_sock_addr *ctx) return 1; } +SEC("cgroup/sendmsg6") +int sendmsg_v6_wildcard_prog(struct bpf_sock_addr *ctx) +{ + /* Rewrite source. */ + ctx->msg_src_ip6[0] = bpf_htonl(SRC_REWRITE_IP6_0); + ctx->msg_src_ip6[1] = bpf_htonl(SRC_REWRITE_IP6_1); + ctx->msg_src_ip6[2] = bpf_htonl(SRC_REWRITE_IP6_2); + ctx->msg_src_ip6[3] = bpf_htonl(SRC_REWRITE_IP6_3); + + /* Rewrite destination. */ + ctx->user_ip6[0] = bpf_htonl(0); + ctx->user_ip6[1] = bpf_htonl(0); + ctx->user_ip6[2] = bpf_htonl(0); + ctx->user_ip6[3] = bpf_htonl(0); + + ctx->user_port = bpf_htons(DST_REWRITE_PORT6); + + return 1; +} + SEC("cgroup/sendmsg6") int sendmsg_v6_preserve_dst_prog(struct bpf_sock_addr *ctx) { diff --git a/tools/testing/selftests/bpf/test_sock_addr.c b/tools/testing/selftests/bpf/test_sock_addr.c index 4ead113753f8..85fb2a793be5 100644 --- a/tools/testing/selftests/bpf/test_sock_addr.c +++ b/tools/testing/selftests/bpf/test_sock_addr.c @@ -94,7 +94,6 @@ static int connect4_prog_load(const struct sock_addr_test *test); static int connect6_prog_load(const struct sock_addr_test *test); static int sendmsg4_rw_asm_prog_load(const struct sock_addr_test *test); static int sendmsg6_rw_asm_prog_load(const struct sock_addr_test *test); -static int sendmsg6_rw_wildcard_prog_load(const struct sock_addr_test *test); static struct sock_addr_test tests[] = { /* bind */ @@ -298,20 +297,6 @@ static struct sock_addr_test tests[] = { SRC6_REWRITE_IP, SUCCESS, }, - { - "sendmsg6: set dst IP = [::] (BSD'ism)", - sendmsg6_rw_wildcard_prog_load, - BPF_CGROUP_UDP6_SENDMSG, - BPF_CGROUP_UDP6_SENDMSG, - AF_INET6, - SOCK_DGRAM, - SERV6_IP, - SERV6_PORT, - SERV6_REWRITE_IP, - SERV6_REWRITE_PORT, - SRC6_REWRITE_IP, - SUCCESS, - }, }; static int load_insns(const struct sock_addr_test *test, @@ -497,11 +482,6 @@ static int sendmsg6_rw_asm_prog_load(const struct sock_addr_test *test) return sendmsg6_rw_dst_asm_prog_load(test, SERV6_REWRITE_IP); } -static int sendmsg6_rw_wildcard_prog_load(const struct sock_addr_test *test) -{ - return sendmsg6_rw_dst_asm_prog_load(test, WILDCARD6_IP); -} - static int cmp_addr(const struct sockaddr_storage *addr1, const struct sockaddr_storage *addr2, int cmp_port) { From b0f3af0bffefc54650d9fb10810fc2f974365dfd Mon Sep 17 00:00:00 2001 From: Jordan Rife Date: Fri, 10 May 2024 14:02:27 -0500 Subject: [PATCH 105/119] selftests/bpf: Migrate expected_attach_type tests Migrates tests from progs/test_sock_addr.c ensuring that programs fail to load when the expected attach type does not match. Signed-off-by: Jordan Rife Link: https://lore.kernel.org/r/20240510190246.3247730-11-jrife@google.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/sock_addr.c | 96 +++++++++++++++++++ tools/testing/selftests/bpf/test_sock_addr.c | 84 ---------------- 2 files changed, 96 insertions(+), 84 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/sock_addr.c b/tools/testing/selftests/bpf/prog_tests/sock_addr.c index e3c450d11b9e..8c7c56f99754 100644 --- a/tools/testing/selftests/bpf/prog_tests/sock_addr.c +++ b/tools/testing/selftests/bpf/prog_tests/sock_addr.c @@ -490,6 +490,22 @@ static struct sock_addr_test tests[] = { NULL, SUCCESS, }, + { + SOCK_ADDR_TEST_BIND, + "bind4: load prog with wrong expected attach type", + bind_v4_prog_load, + bind_v4_prog_destroy, + BPF_CGROUP_INET6_BIND, + &user_ops, + AF_INET, + SOCK_STREAM, + NULL, + 0, + NULL, + 0, + NULL, + LOAD_REJECT, + }, { SOCK_ADDR_TEST_BIND, "bind6: bind (stream)", @@ -522,6 +538,22 @@ static struct sock_addr_test tests[] = { NULL, SUCCESS, }, + { + SOCK_ADDR_TEST_BIND, + "bind6: load prog with wrong expected attach type", + bind_v6_prog_load, + bind_v6_prog_destroy, + BPF_CGROUP_INET4_BIND, + &user_ops, + AF_INET6, + SOCK_STREAM, + NULL, + 0, + NULL, + 0, + NULL, + LOAD_REJECT, + }, /* bind - kernel calls */ { @@ -622,6 +654,22 @@ static struct sock_addr_test tests[] = { SRC4_REWRITE_IP, SUCCESS, }, + { + SOCK_ADDR_TEST_CONNECT, + "connect4: load prog with wrong expected attach type", + connect_v4_prog_load, + connect_v4_prog_destroy, + BPF_CGROUP_INET6_CONNECT, + &user_ops, + AF_INET, + SOCK_STREAM, + NULL, + 0, + NULL, + 0, + NULL, + LOAD_REJECT, + }, { SOCK_ADDR_TEST_CONNECT, "connect6: connect (stream)", @@ -654,6 +702,22 @@ static struct sock_addr_test tests[] = { SRC6_REWRITE_IP, SUCCESS, }, + { + SOCK_ADDR_TEST_CONNECT, + "connect6: load prog with wrong expected attach type", + connect_v6_prog_load, + connect_v6_prog_destroy, + BPF_CGROUP_INET4_CONNECT, + &user_ops, + AF_INET6, + SOCK_STREAM, + NULL, + 0, + NULL, + 0, + NULL, + LOAD_REJECT, + }, { SOCK_ADDR_TEST_CONNECT, "connect_unix: connect (stream)", @@ -786,6 +850,22 @@ static struct sock_addr_test tests[] = { SRC4_REWRITE_IP, SYSCALL_EPERM, }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg4: load prog with wrong expected attach type", + sendmsg_v4_prog_load, + sendmsg_v4_prog_destroy, + BPF_CGROUP_UDP6_SENDMSG, + &user_ops, + AF_INET, + SOCK_DGRAM, + NULL, + 0, + NULL, + 0, + NULL, + LOAD_REJECT, + }, { SOCK_ADDR_TEST_SENDMSG, "sendmsg6: sendmsg (dgram)", @@ -866,6 +946,22 @@ static struct sock_addr_test tests[] = { SRC6_REWRITE_IP, SUCCESS, }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg6: load prog with wrong expected attach type", + sendmsg_v6_prog_load, + sendmsg_v6_prog_destroy, + BPF_CGROUP_UDP4_SENDMSG, + &user_ops, + AF_INET6, + SOCK_DGRAM, + NULL, + 0, + NULL, + 0, + NULL, + LOAD_REJECT, + }, { SOCK_ADDR_TEST_SENDMSG, "sendmsg_unix: sendmsg (dgram)", diff --git a/tools/testing/selftests/bpf/test_sock_addr.c b/tools/testing/selftests/bpf/test_sock_addr.c index 85fb2a793be5..4ecbc72477f1 100644 --- a/tools/testing/selftests/bpf/test_sock_addr.c +++ b/tools/testing/selftests/bpf/test_sock_addr.c @@ -97,20 +97,6 @@ static int sendmsg6_rw_asm_prog_load(const struct sock_addr_test *test); static struct sock_addr_test tests[] = { /* bind */ - { - "bind4: load prog with wrong expected attach type", - bind4_prog_load, - BPF_CGROUP_INET6_BIND, - BPF_CGROUP_INET4_BIND, - AF_INET, - SOCK_STREAM, - NULL, - 0, - NULL, - 0, - NULL, - LOAD_REJECT, - }, { "bind4: attach prog with wrong attach type", bind4_prog_load, @@ -125,20 +111,6 @@ static struct sock_addr_test tests[] = { NULL, ATTACH_REJECT, }, - { - "bind6: load prog with wrong expected attach type", - bind6_prog_load, - BPF_CGROUP_INET4_BIND, - BPF_CGROUP_INET6_BIND, - AF_INET6, - SOCK_STREAM, - NULL, - 0, - NULL, - 0, - NULL, - LOAD_REJECT, - }, { "bind6: attach prog with wrong attach type", bind6_prog_load, @@ -155,20 +127,6 @@ static struct sock_addr_test tests[] = { }, /* connect */ - { - "connect4: load prog with wrong expected attach type", - connect4_prog_load, - BPF_CGROUP_INET6_CONNECT, - BPF_CGROUP_INET4_CONNECT, - AF_INET, - SOCK_STREAM, - NULL, - 0, - NULL, - 0, - NULL, - LOAD_REJECT, - }, { "connect4: attach prog with wrong attach type", connect4_prog_load, @@ -183,20 +141,6 @@ static struct sock_addr_test tests[] = { NULL, ATTACH_REJECT, }, - { - "connect6: load prog with wrong expected attach type", - connect6_prog_load, - BPF_CGROUP_INET4_CONNECT, - BPF_CGROUP_INET6_CONNECT, - AF_INET6, - SOCK_STREAM, - NULL, - 0, - NULL, - 0, - NULL, - LOAD_REJECT, - }, { "connect6: attach prog with wrong attach type", connect6_prog_load, @@ -213,20 +157,6 @@ static struct sock_addr_test tests[] = { }, /* sendmsg */ - { - "sendmsg4: load prog with wrong expected attach type", - sendmsg4_rw_asm_prog_load, - BPF_CGROUP_UDP6_SENDMSG, - BPF_CGROUP_UDP4_SENDMSG, - AF_INET, - SOCK_DGRAM, - NULL, - 0, - NULL, - 0, - NULL, - LOAD_REJECT, - }, { "sendmsg4: attach prog with wrong attach type", sendmsg4_rw_asm_prog_load, @@ -255,20 +185,6 @@ static struct sock_addr_test tests[] = { SRC4_REWRITE_IP, SUCCESS, }, - { - "sendmsg6: load prog with wrong expected attach type", - sendmsg6_rw_asm_prog_load, - BPF_CGROUP_UDP4_SENDMSG, - BPF_CGROUP_UDP6_SENDMSG, - AF_INET6, - SOCK_DGRAM, - NULL, - 0, - NULL, - 0, - NULL, - LOAD_REJECT, - }, { "sendmsg6: attach prog with wrong attach type", sendmsg6_rw_asm_prog_load, From cded71f595c0c4396acc9657911c5aa2a289a8dc Mon Sep 17 00:00:00 2001 From: Jordan Rife Date: Fri, 10 May 2024 14:02:28 -0500 Subject: [PATCH 106/119] selftests/bpf: Migrate ATTACH_REJECT test cases Migrate test case from bpf/test_sock_addr.c ensuring that program attachment fails when using an inappropriate attach type. Signed-off-by: Jordan Rife Link: https://lore.kernel.org/r/20240510190246.3247730-12-jrife@google.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/sock_addr.c | 102 ++++++++++++ tools/testing/selftests/bpf/test_sock_addr.c | 146 ------------------ 2 files changed, 102 insertions(+), 146 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/sock_addr.c b/tools/testing/selftests/bpf/prog_tests/sock_addr.c index 8c7c56f99754..ebd5e58e38c5 100644 --- a/tools/testing/selftests/bpf/prog_tests/sock_addr.c +++ b/tools/testing/selftests/bpf/prog_tests/sock_addr.c @@ -438,13 +438,19 @@ static void prog_name##_destroy(void *skel) \ } BPF_SKEL_FUNCS(bind4_prog, bind_v4_prog); +BPF_SKEL_FUNCS_RAW(bind4_prog, bind_v4_prog); BPF_SKEL_FUNCS(bind6_prog, bind_v6_prog); +BPF_SKEL_FUNCS_RAW(bind6_prog, bind_v6_prog); BPF_SKEL_FUNCS(connect4_prog, connect_v4_prog); +BPF_SKEL_FUNCS_RAW(connect4_prog, connect_v4_prog); BPF_SKEL_FUNCS(connect6_prog, connect_v6_prog); +BPF_SKEL_FUNCS_RAW(connect6_prog, connect_v6_prog); BPF_SKEL_FUNCS(connect_unix_prog, connect_unix_prog); BPF_SKEL_FUNCS(sendmsg4_prog, sendmsg_v4_prog); +BPF_SKEL_FUNCS_RAW(sendmsg4_prog, sendmsg_v4_prog); BPF_SKEL_FUNCS(sendmsg4_prog, sendmsg_v4_deny_prog); BPF_SKEL_FUNCS(sendmsg6_prog, sendmsg_v6_prog); +BPF_SKEL_FUNCS_RAW(sendmsg6_prog, sendmsg_v6_prog); BPF_SKEL_FUNCS(sendmsg6_prog, sendmsg_v6_deny_prog); BPF_SKEL_FUNCS(sendmsg6_prog, sendmsg_v6_preserve_dst_prog); BPF_SKEL_FUNCS(sendmsg6_prog, sendmsg_v6_v4mapped_prog); @@ -506,6 +512,22 @@ static struct sock_addr_test tests[] = { NULL, LOAD_REJECT, }, + { + SOCK_ADDR_TEST_BIND, + "bind4: attach prog with wrong attach type", + bind_v4_prog_load_raw, + bind_v4_prog_destroy_raw, + BPF_CGROUP_INET6_BIND, + &user_ops, + AF_INET, + SOCK_STREAM, + NULL, + 0, + NULL, + 0, + NULL, + ATTACH_REJECT, + }, { SOCK_ADDR_TEST_BIND, "bind6: bind (stream)", @@ -554,6 +576,22 @@ static struct sock_addr_test tests[] = { NULL, LOAD_REJECT, }, + { + SOCK_ADDR_TEST_BIND, + "bind6: attach prog with wrong attach type", + bind_v6_prog_load_raw, + bind_v6_prog_destroy_raw, + BPF_CGROUP_INET4_BIND, + &user_ops, + AF_INET, + SOCK_STREAM, + NULL, + 0, + NULL, + 0, + NULL, + ATTACH_REJECT, + }, /* bind - kernel calls */ { @@ -670,6 +708,22 @@ static struct sock_addr_test tests[] = { NULL, LOAD_REJECT, }, + { + SOCK_ADDR_TEST_CONNECT, + "connect4: attach prog with wrong attach type", + connect_v4_prog_load_raw, + connect_v4_prog_destroy_raw, + BPF_CGROUP_INET6_CONNECT, + &user_ops, + AF_INET, + SOCK_STREAM, + NULL, + 0, + NULL, + 0, + NULL, + ATTACH_REJECT, + }, { SOCK_ADDR_TEST_CONNECT, "connect6: connect (stream)", @@ -718,6 +772,22 @@ static struct sock_addr_test tests[] = { NULL, LOAD_REJECT, }, + { + SOCK_ADDR_TEST_CONNECT, + "connect6: attach prog with wrong attach type", + connect_v6_prog_load_raw, + connect_v6_prog_destroy_raw, + BPF_CGROUP_INET4_CONNECT, + &user_ops, + AF_INET, + SOCK_STREAM, + NULL, + 0, + NULL, + 0, + NULL, + ATTACH_REJECT, + }, { SOCK_ADDR_TEST_CONNECT, "connect_unix: connect (stream)", @@ -866,6 +936,22 @@ static struct sock_addr_test tests[] = { NULL, LOAD_REJECT, }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg4: attach prog with wrong attach type", + sendmsg_v4_prog_load_raw, + sendmsg_v4_prog_destroy_raw, + BPF_CGROUP_UDP6_SENDMSG, + &user_ops, + AF_INET, + SOCK_DGRAM, + NULL, + 0, + NULL, + 0, + NULL, + ATTACH_REJECT, + }, { SOCK_ADDR_TEST_SENDMSG, "sendmsg6: sendmsg (dgram)", @@ -962,6 +1048,22 @@ static struct sock_addr_test tests[] = { NULL, LOAD_REJECT, }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg6: attach prog with wrong attach type", + sendmsg_v6_prog_load_raw, + sendmsg_v6_prog_destroy_raw, + BPF_CGROUP_UDP4_SENDMSG, + &user_ops, + AF_INET6, + SOCK_DGRAM, + NULL, + 0, + NULL, + 0, + NULL, + ATTACH_REJECT, + }, { SOCK_ADDR_TEST_SENDMSG, "sendmsg_unix: sendmsg (dgram)", diff --git a/tools/testing/selftests/bpf/test_sock_addr.c b/tools/testing/selftests/bpf/test_sock_addr.c index 4ecbc72477f1..311eda4f4864 100644 --- a/tools/testing/selftests/bpf/test_sock_addr.c +++ b/tools/testing/selftests/bpf/test_sock_addr.c @@ -88,89 +88,11 @@ struct sock_addr_test { } expected_result; }; -static int bind4_prog_load(const struct sock_addr_test *test); -static int bind6_prog_load(const struct sock_addr_test *test); -static int connect4_prog_load(const struct sock_addr_test *test); -static int connect6_prog_load(const struct sock_addr_test *test); static int sendmsg4_rw_asm_prog_load(const struct sock_addr_test *test); static int sendmsg6_rw_asm_prog_load(const struct sock_addr_test *test); static struct sock_addr_test tests[] = { - /* bind */ - { - "bind4: attach prog with wrong attach type", - bind4_prog_load, - BPF_CGROUP_INET4_BIND, - BPF_CGROUP_INET6_BIND, - AF_INET, - SOCK_STREAM, - NULL, - 0, - NULL, - 0, - NULL, - ATTACH_REJECT, - }, - { - "bind6: attach prog with wrong attach type", - bind6_prog_load, - BPF_CGROUP_INET6_BIND, - BPF_CGROUP_INET4_BIND, - AF_INET, - SOCK_STREAM, - NULL, - 0, - NULL, - 0, - NULL, - ATTACH_REJECT, - }, - - /* connect */ - { - "connect4: attach prog with wrong attach type", - connect4_prog_load, - BPF_CGROUP_INET4_CONNECT, - BPF_CGROUP_INET6_CONNECT, - AF_INET, - SOCK_STREAM, - NULL, - 0, - NULL, - 0, - NULL, - ATTACH_REJECT, - }, - { - "connect6: attach prog with wrong attach type", - connect6_prog_load, - BPF_CGROUP_INET6_CONNECT, - BPF_CGROUP_INET4_CONNECT, - AF_INET, - SOCK_STREAM, - NULL, - 0, - NULL, - 0, - NULL, - ATTACH_REJECT, - }, - /* sendmsg */ - { - "sendmsg4: attach prog with wrong attach type", - sendmsg4_rw_asm_prog_load, - BPF_CGROUP_UDP4_SENDMSG, - BPF_CGROUP_UDP6_SENDMSG, - AF_INET, - SOCK_DGRAM, - NULL, - 0, - NULL, - 0, - NULL, - ATTACH_REJECT, - }, { "sendmsg4: rewrite IP & port (asm)", sendmsg4_rw_asm_prog_load, @@ -185,20 +107,6 @@ static struct sock_addr_test tests[] = { SRC4_REWRITE_IP, SUCCESS, }, - { - "sendmsg6: attach prog with wrong attach type", - sendmsg6_rw_asm_prog_load, - BPF_CGROUP_UDP6_SENDMSG, - BPF_CGROUP_UDP4_SENDMSG, - AF_INET6, - SOCK_DGRAM, - NULL, - 0, - NULL, - 0, - NULL, - ATTACH_REJECT, - }, { "sendmsg6: rewrite IP & port (asm)", sendmsg6_rw_asm_prog_load, @@ -234,60 +142,6 @@ static int load_insns(const struct sock_addr_test *test, return ret; } -static int load_path(const struct sock_addr_test *test, const char *path) -{ - struct bpf_object *obj; - struct bpf_program *prog; - int err; - - obj = bpf_object__open_file(path, NULL); - err = libbpf_get_error(obj); - if (err) { - log_err(">>> Opening BPF object (%s) error.\n", path); - return -1; - } - - prog = bpf_object__next_program(obj, NULL); - if (!prog) - goto err_out; - - bpf_program__set_type(prog, BPF_PROG_TYPE_CGROUP_SOCK_ADDR); - bpf_program__set_expected_attach_type(prog, test->expected_attach_type); - bpf_program__set_flags(prog, testing_prog_flags()); - - err = bpf_object__load(obj); - if (err) { - if (test->expected_result != LOAD_REJECT) - log_err(">>> Loading program (%s) error.\n", path); - goto err_out; - } - - return bpf_program__fd(prog); -err_out: - bpf_object__close(obj); - return -1; -} - -static int bind4_prog_load(const struct sock_addr_test *test) -{ - return load_path(test, BIND4_PROG_PATH); -} - -static int bind6_prog_load(const struct sock_addr_test *test) -{ - return load_path(test, BIND6_PROG_PATH); -} - -static int connect4_prog_load(const struct sock_addr_test *test) -{ - return load_path(test, CONNECT4_PROG_PATH); -} - -static int connect6_prog_load(const struct sock_addr_test *test) -{ - return load_path(test, CONNECT6_PROG_PATH); -} - static int sendmsg4_rw_asm_prog_load(const struct sock_addr_test *test) { struct sockaddr_in dst4_rw_addr; From 9c3f17862faef89696d26655a6d10f90137df42e Mon Sep 17 00:00:00 2001 From: Jordan Rife Date: Fri, 10 May 2024 14:02:29 -0500 Subject: [PATCH 107/119] selftests/bpf: Remove redundant sendmsg test cases Remove these test cases completely, as the same behavior is already covered by other sendmsg* test cases in prog_tests/sock_addr.c. This just rewrites the destination address similar to sendmsg_v4_prog and sendmsg_v6_prog. Signed-off-by: Jordan Rife Link: https://lore.kernel.org/r/20240510190246.3247730-13-jrife@google.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_sock_addr.c | 161 ------------------- 1 file changed, 161 deletions(-) diff --git a/tools/testing/selftests/bpf/test_sock_addr.c b/tools/testing/selftests/bpf/test_sock_addr.c index 311eda4f4864..a2b587273331 100644 --- a/tools/testing/selftests/bpf/test_sock_addr.c +++ b/tools/testing/selftests/bpf/test_sock_addr.c @@ -88,170 +88,9 @@ struct sock_addr_test { } expected_result; }; -static int sendmsg4_rw_asm_prog_load(const struct sock_addr_test *test); -static int sendmsg6_rw_asm_prog_load(const struct sock_addr_test *test); - static struct sock_addr_test tests[] = { - /* sendmsg */ - { - "sendmsg4: rewrite IP & port (asm)", - sendmsg4_rw_asm_prog_load, - BPF_CGROUP_UDP4_SENDMSG, - BPF_CGROUP_UDP4_SENDMSG, - AF_INET, - SOCK_DGRAM, - SERV4_IP, - SERV4_PORT, - SERV4_REWRITE_IP, - SERV4_REWRITE_PORT, - SRC4_REWRITE_IP, - SUCCESS, - }, - { - "sendmsg6: rewrite IP & port (asm)", - sendmsg6_rw_asm_prog_load, - BPF_CGROUP_UDP6_SENDMSG, - BPF_CGROUP_UDP6_SENDMSG, - AF_INET6, - SOCK_DGRAM, - SERV6_IP, - SERV6_PORT, - SERV6_REWRITE_IP, - SERV6_REWRITE_PORT, - SRC6_REWRITE_IP, - SUCCESS, - }, }; -static int load_insns(const struct sock_addr_test *test, - const struct bpf_insn *insns, size_t insns_cnt) -{ - LIBBPF_OPTS(bpf_prog_load_opts, opts); - int ret; - - opts.expected_attach_type = test->expected_attach_type; - opts.log_buf = bpf_log_buf; - opts.log_size = BPF_LOG_BUF_SIZE; - - ret = bpf_prog_load(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, NULL, "GPL", insns, insns_cnt, &opts); - if (ret < 0 && test->expected_result != LOAD_REJECT) { - log_err(">>> Loading program error.\n" - ">>> Verifier output:\n%s\n-------\n", bpf_log_buf); - } - - return ret; -} - -static int sendmsg4_rw_asm_prog_load(const struct sock_addr_test *test) -{ - struct sockaddr_in dst4_rw_addr; - struct in_addr src4_rw_ip; - - if (inet_pton(AF_INET, SRC4_REWRITE_IP, (void *)&src4_rw_ip) != 1) { - log_err("Invalid IPv4: %s", SRC4_REWRITE_IP); - return -1; - } - - if (make_sockaddr(AF_INET, SERV4_REWRITE_IP, SERV4_REWRITE_PORT, - (struct sockaddr_storage *)&dst4_rw_addr, - NULL) == -1) - return -1; - - struct bpf_insn insns[] = { - BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), - - /* if (sk.family == AF_INET && */ - BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, - offsetof(struct bpf_sock_addr, family)), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, AF_INET, 8), - - /* sk.type == SOCK_DGRAM) { */ - BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, - offsetof(struct bpf_sock_addr, type)), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, SOCK_DGRAM, 6), - - /* msg_src_ip4 = src4_rw_ip */ - BPF_MOV32_IMM(BPF_REG_7, src4_rw_ip.s_addr), - BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_7, - offsetof(struct bpf_sock_addr, msg_src_ip4)), - - /* user_ip4 = dst4_rw_addr.sin_addr */ - BPF_MOV32_IMM(BPF_REG_7, dst4_rw_addr.sin_addr.s_addr), - BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_7, - offsetof(struct bpf_sock_addr, user_ip4)), - - /* user_port = dst4_rw_addr.sin_port */ - BPF_MOV32_IMM(BPF_REG_7, dst4_rw_addr.sin_port), - BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_7, - offsetof(struct bpf_sock_addr, user_port)), - /* } */ - - /* return 1 */ - BPF_MOV64_IMM(BPF_REG_0, 1), - BPF_EXIT_INSN(), - }; - - return load_insns(test, insns, ARRAY_SIZE(insns)); -} - -static int sendmsg6_rw_dst_asm_prog_load(const struct sock_addr_test *test, - const char *rw_dst_ip) -{ - struct sockaddr_in6 dst6_rw_addr; - struct in6_addr src6_rw_ip; - - if (inet_pton(AF_INET6, SRC6_REWRITE_IP, (void *)&src6_rw_ip) != 1) { - log_err("Invalid IPv6: %s", SRC6_REWRITE_IP); - return -1; - } - - if (make_sockaddr(AF_INET6, rw_dst_ip, SERV6_REWRITE_PORT, - (struct sockaddr_storage *)&dst6_rw_addr, - NULL) == -1) - return -1; - - struct bpf_insn insns[] = { - BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), - - /* if (sk.family == AF_INET6) { */ - BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, - offsetof(struct bpf_sock_addr, family)), - BPF_JMP_IMM(BPF_JNE, BPF_REG_7, AF_INET6, 18), - -#define STORE_IPV6_WORD_N(DST, SRC, N) \ - BPF_MOV32_IMM(BPF_REG_7, SRC[N]), \ - BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_7, \ - offsetof(struct bpf_sock_addr, DST[N])) - -#define STORE_IPV6(DST, SRC) \ - STORE_IPV6_WORD_N(DST, SRC, 0), \ - STORE_IPV6_WORD_N(DST, SRC, 1), \ - STORE_IPV6_WORD_N(DST, SRC, 2), \ - STORE_IPV6_WORD_N(DST, SRC, 3) - - STORE_IPV6(msg_src_ip6, src6_rw_ip.s6_addr32), - STORE_IPV6(user_ip6, dst6_rw_addr.sin6_addr.s6_addr32), - - /* user_port = dst6_rw_addr.sin6_port */ - BPF_MOV32_IMM(BPF_REG_7, dst6_rw_addr.sin6_port), - BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_7, - offsetof(struct bpf_sock_addr, user_port)), - - /* } */ - - /* return 1 */ - BPF_MOV64_IMM(BPF_REG_0, 1), - BPF_EXIT_INSN(), - }; - - return load_insns(test, insns, ARRAY_SIZE(insns)); -} - -static int sendmsg6_rw_asm_prog_load(const struct sock_addr_test *test) -{ - return sendmsg6_rw_dst_asm_prog_load(test, SERV6_REWRITE_IP); -} - static int cmp_addr(const struct sockaddr_storage *addr1, const struct sockaddr_storage *addr2, int cmp_port) { From 61ecfdfce2647281e7d14119bfa529922ce2d8b2 Mon Sep 17 00:00:00 2001 From: Jordan Rife Date: Fri, 10 May 2024 14:02:30 -0500 Subject: [PATCH 108/119] selftests/bpf: Retire test_sock_addr.(c|sh) Fully remove test_sock_addr.c and test_sock_addr.sh, as test coverage has been fully moved to prog_tests/sock_addr.c. Signed-off-by: Jordan Rife Link: https://lore.kernel.org/r/20240510190246.3247730-14-jrife@google.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/.gitignore | 1 - tools/testing/selftests/bpf/Makefile | 4 +- tools/testing/selftests/bpf/test_sock_addr.c | 574 ------------------ tools/testing/selftests/bpf/test_sock_addr.sh | 58 -- 4 files changed, 1 insertion(+), 636 deletions(-) delete mode 100644 tools/testing/selftests/bpf/test_sock_addr.c delete mode 100755 tools/testing/selftests/bpf/test_sock_addr.sh diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index f1aebabfb017..5025401323af 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -17,7 +17,6 @@ test_dev_cgroup test_verifier_log feature test_sock -test_sock_addr urandom_read test_sockmap test_lirc_mode2_user diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 135023a357b3..ed381b0197fe 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -115,7 +115,6 @@ TEST_PROGS := test_kmod.sh \ test_xdp_redirect_multi.sh \ test_xdp_meta.sh \ test_xdp_veth.sh \ - test_sock_addr.sh \ test_tunnel.sh \ test_lwt_seg6local.sh \ test_lirc_mode2.sh \ @@ -140,7 +139,7 @@ TEST_PROGS_EXTENDED := with_addr.sh \ test_xdp_vlan.sh test_bpftool.py # Compile but not part of 'make run_tests' -TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \ +TEST_GEN_PROGS_EXTENDED = test_skb_cgroup_id_user \ flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user \ test_lirc_mode2_user xdping test_cpp runqslower bench bpf_testmod.ko \ xskxceiver xdp_redirect_multi xdp_synproxy veristat xdp_hw_metadata \ @@ -296,7 +295,6 @@ NETWORK_HELPERS := $(OUTPUT)/network_helpers.o $(OUTPUT)/test_dev_cgroup: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_skb_cgroup_id_user: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_sock: $(CGROUP_HELPERS) $(TESTING_HELPERS) -$(OUTPUT)/test_sock_addr: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(NETWORK_HELPERS) $(OUTPUT)/test_sockmap: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_tcpnotify_user: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(TRACE_HELPERS) $(OUTPUT)/get_cgroup_id_user: $(CGROUP_HELPERS) $(TESTING_HELPERS) diff --git a/tools/testing/selftests/bpf/test_sock_addr.c b/tools/testing/selftests/bpf/test_sock_addr.c deleted file mode 100644 index a2b587273331..000000000000 --- a/tools/testing/selftests/bpf/test_sock_addr.c +++ /dev/null @@ -1,574 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -// Copyright (c) 2018 Facebook - -#define _GNU_SOURCE - -#include -#include -#include - -#include -#include -#include -#include -#include - -#include - -#include -#include - -#include "cgroup_helpers.h" -#include "network_helpers.h" -#include "testing_helpers.h" -#include "bpf_util.h" - -#ifndef ENOTSUPP -# define ENOTSUPP 524 -#endif - -#define CG_PATH "/foo" -#define CONNECT4_PROG_PATH "./connect4_prog.bpf.o" -#define CONNECT6_PROG_PATH "./connect6_prog.bpf.o" -#define SENDMSG4_PROG_PATH "./sendmsg4_prog.bpf.o" -#define SENDMSG6_PROG_PATH "./sendmsg6_prog.bpf.o" -#define RECVMSG4_PROG_PATH "./recvmsg4_prog.bpf.o" -#define RECVMSG6_PROG_PATH "./recvmsg6_prog.bpf.o" -#define BIND4_PROG_PATH "./bind4_prog.bpf.o" -#define BIND6_PROG_PATH "./bind6_prog.bpf.o" - -#define SERV4_IP "192.168.1.254" -#define SERV4_REWRITE_IP "127.0.0.1" -#define SRC4_IP "172.16.0.1" -#define SRC4_REWRITE_IP "127.0.0.4" -#define SERV4_PORT 4040 -#define SERV4_REWRITE_PORT 4444 - -#define SERV6_IP "face:b00c:1234:5678::abcd" -#define SERV6_REWRITE_IP "::1" -#define SERV6_V4MAPPED_IP "::ffff:192.168.0.4" -#define SRC6_IP "::1" -#define SRC6_REWRITE_IP "::6" -#define WILDCARD6_IP "::" -#define SERV6_PORT 6060 -#define SERV6_REWRITE_PORT 6666 - -#define INET_NTOP_BUF 40 - -struct sock_addr_test; - -typedef int (*load_fn)(const struct sock_addr_test *test); -typedef int (*info_fn)(int, struct sockaddr *, socklen_t *); - -char bpf_log_buf[BPF_LOG_BUF_SIZE]; - -struct sock_addr_test { - const char *descr; - /* BPF prog properties */ - load_fn loadfn; - enum bpf_attach_type expected_attach_type; - enum bpf_attach_type attach_type; - /* Socket properties */ - int domain; - int type; - /* IP:port pairs for BPF prog to override */ - const char *requested_ip; - unsigned short requested_port; - const char *expected_ip; - unsigned short expected_port; - const char *expected_src_ip; - /* Expected test result */ - enum { - LOAD_REJECT, - ATTACH_REJECT, - ATTACH_OKAY, - SYSCALL_EPERM, - SYSCALL_ENOTSUPP, - SUCCESS, - } expected_result; -}; - -static struct sock_addr_test tests[] = { -}; - -static int cmp_addr(const struct sockaddr_storage *addr1, - const struct sockaddr_storage *addr2, int cmp_port) -{ - const struct sockaddr_in *four1, *four2; - const struct sockaddr_in6 *six1, *six2; - - if (addr1->ss_family != addr2->ss_family) - return -1; - - if (addr1->ss_family == AF_INET) { - four1 = (const struct sockaddr_in *)addr1; - four2 = (const struct sockaddr_in *)addr2; - return !((four1->sin_port == four2->sin_port || !cmp_port) && - four1->sin_addr.s_addr == four2->sin_addr.s_addr); - } else if (addr1->ss_family == AF_INET6) { - six1 = (const struct sockaddr_in6 *)addr1; - six2 = (const struct sockaddr_in6 *)addr2; - return !((six1->sin6_port == six2->sin6_port || !cmp_port) && - !memcmp(&six1->sin6_addr, &six2->sin6_addr, - sizeof(struct in6_addr))); - } - - return -1; -} - -static int cmp_sock_addr(info_fn fn, int sock1, - const struct sockaddr_storage *addr2, int cmp_port) -{ - struct sockaddr_storage addr1; - socklen_t len1 = sizeof(addr1); - - memset(&addr1, 0, len1); - if (fn(sock1, (struct sockaddr *)&addr1, (socklen_t *)&len1) != 0) - return -1; - - return cmp_addr(&addr1, addr2, cmp_port); -} - -static int cmp_local_ip(int sock1, const struct sockaddr_storage *addr2) -{ - return cmp_sock_addr(getsockname, sock1, addr2, /*cmp_port*/ 0); -} - -static int cmp_local_addr(int sock1, const struct sockaddr_storage *addr2) -{ - return cmp_sock_addr(getsockname, sock1, addr2, /*cmp_port*/ 1); -} - -static int cmp_peer_addr(int sock1, const struct sockaddr_storage *addr2) -{ - return cmp_sock_addr(getpeername, sock1, addr2, /*cmp_port*/ 1); -} - -int init_pktinfo(int domain, struct cmsghdr *cmsg) -{ - struct in6_pktinfo *pktinfo6; - struct in_pktinfo *pktinfo4; - - if (domain == AF_INET) { - cmsg->cmsg_level = SOL_IP; - cmsg->cmsg_type = IP_PKTINFO; - cmsg->cmsg_len = CMSG_LEN(sizeof(struct in_pktinfo)); - pktinfo4 = (struct in_pktinfo *)CMSG_DATA(cmsg); - memset(pktinfo4, 0, sizeof(struct in_pktinfo)); - if (inet_pton(domain, SRC4_IP, - (void *)&pktinfo4->ipi_spec_dst) != 1) - return -1; - } else if (domain == AF_INET6) { - cmsg->cmsg_level = SOL_IPV6; - cmsg->cmsg_type = IPV6_PKTINFO; - cmsg->cmsg_len = CMSG_LEN(sizeof(struct in6_pktinfo)); - pktinfo6 = (struct in6_pktinfo *)CMSG_DATA(cmsg); - memset(pktinfo6, 0, sizeof(struct in6_pktinfo)); - if (inet_pton(domain, SRC6_IP, - (void *)&pktinfo6->ipi6_addr) != 1) - return -1; - } else { - return -1; - } - - return 0; -} - -static int sendmsg_to_server(int type, const struct sockaddr_storage *addr, - socklen_t addr_len, int set_cmsg, int flags, - int *syscall_err) -{ - union { - char buf[CMSG_SPACE(sizeof(struct in6_pktinfo))]; - struct cmsghdr align; - } control6; - union { - char buf[CMSG_SPACE(sizeof(struct in_pktinfo))]; - struct cmsghdr align; - } control4; - struct msghdr hdr; - struct iovec iov; - char data = 'a'; - int domain; - int fd = -1; - - domain = addr->ss_family; - - if (domain != AF_INET && domain != AF_INET6) { - log_err("Unsupported address family"); - goto err; - } - - fd = socket(domain, type, 0); - if (fd == -1) { - log_err("Failed to create client socket"); - goto err; - } - - memset(&iov, 0, sizeof(iov)); - iov.iov_base = &data; - iov.iov_len = sizeof(data); - - memset(&hdr, 0, sizeof(hdr)); - hdr.msg_name = (void *)addr; - hdr.msg_namelen = addr_len; - hdr.msg_iov = &iov; - hdr.msg_iovlen = 1; - - if (set_cmsg) { - if (domain == AF_INET) { - hdr.msg_control = &control4; - hdr.msg_controllen = sizeof(control4.buf); - } else if (domain == AF_INET6) { - hdr.msg_control = &control6; - hdr.msg_controllen = sizeof(control6.buf); - } - if (init_pktinfo(domain, CMSG_FIRSTHDR(&hdr))) { - log_err("Fail to init pktinfo"); - goto err; - } - } - - if (sendmsg(fd, &hdr, flags) != sizeof(data)) { - log_err("Fail to send message to server"); - *syscall_err = errno; - goto err; - } - - goto out; -err: - close(fd); - fd = -1; -out: - return fd; -} - -static int fastconnect_to_server(const struct sockaddr_storage *addr, - socklen_t addr_len) -{ - int sendmsg_err; - - return sendmsg_to_server(SOCK_STREAM, addr, addr_len, /*set_cmsg*/0, - MSG_FASTOPEN, &sendmsg_err); -} - -static int recvmsg_from_client(int sockfd, struct sockaddr_storage *src_addr) -{ - struct timeval tv; - struct msghdr hdr; - struct iovec iov; - char data[64]; - fd_set rfds; - - FD_ZERO(&rfds); - FD_SET(sockfd, &rfds); - - tv.tv_sec = 2; - tv.tv_usec = 0; - - if (select(sockfd + 1, &rfds, NULL, NULL, &tv) <= 0 || - !FD_ISSET(sockfd, &rfds)) - return -1; - - memset(&iov, 0, sizeof(iov)); - iov.iov_base = data; - iov.iov_len = sizeof(data); - - memset(&hdr, 0, sizeof(hdr)); - hdr.msg_name = src_addr; - hdr.msg_namelen = sizeof(struct sockaddr_storage); - hdr.msg_iov = &iov; - hdr.msg_iovlen = 1; - - return recvmsg(sockfd, &hdr, 0); -} - -static int init_addrs(const struct sock_addr_test *test, - struct sockaddr_storage *requested_addr, - struct sockaddr_storage *expected_addr, - struct sockaddr_storage *expected_src_addr) -{ - if (make_sockaddr(test->domain, test->expected_ip, test->expected_port, - expected_addr, NULL) == -1) - goto err; - - if (make_sockaddr(test->domain, test->requested_ip, test->requested_port, - requested_addr, NULL) == -1) - goto err; - - if (test->expected_src_ip && - make_sockaddr(test->domain, test->expected_src_ip, 0, - expected_src_addr, NULL) == -1) - goto err; - - return 0; -err: - return -1; -} - -static int run_bind_test_case(const struct sock_addr_test *test) -{ - socklen_t addr_len = sizeof(struct sockaddr_storage); - struct sockaddr_storage requested_addr; - struct sockaddr_storage expected_addr; - int clientfd = -1; - int servfd = -1; - int err = 0; - - if (init_addrs(test, &requested_addr, &expected_addr, NULL)) - goto err; - - servfd = start_server_addr(test->type, &requested_addr, addr_len, NULL); - if (servfd == -1) - goto err; - - if (cmp_local_addr(servfd, &expected_addr)) - goto err; - - /* Try to connect to server just in case */ - clientfd = connect_to_addr(test->type, &expected_addr, addr_len, NULL); - if (clientfd == -1) - goto err; - - goto out; -err: - err = -1; -out: - close(clientfd); - close(servfd); - return err; -} - -static int run_connect_test_case(const struct sock_addr_test *test) -{ - socklen_t addr_len = sizeof(struct sockaddr_storage); - struct sockaddr_storage expected_src_addr; - struct sockaddr_storage requested_addr; - struct sockaddr_storage expected_addr; - int clientfd = -1; - int servfd = -1; - int err = 0; - - if (init_addrs(test, &requested_addr, &expected_addr, - &expected_src_addr)) - goto err; - - /* Prepare server to connect to */ - servfd = start_server_addr(test->type, &expected_addr, addr_len, NULL); - if (servfd == -1) - goto err; - - clientfd = connect_to_addr(test->type, &requested_addr, addr_len, NULL); - if (clientfd == -1) - goto err; - - /* Make sure src and dst addrs were overridden properly */ - if (cmp_peer_addr(clientfd, &expected_addr)) - goto err; - - if (cmp_local_ip(clientfd, &expected_src_addr)) - goto err; - - if (test->type == SOCK_STREAM) { - /* Test TCP Fast Open scenario */ - clientfd = fastconnect_to_server(&requested_addr, addr_len); - if (clientfd == -1) - goto err; - - /* Make sure src and dst addrs were overridden properly */ - if (cmp_peer_addr(clientfd, &expected_addr)) - goto err; - - if (cmp_local_ip(clientfd, &expected_src_addr)) - goto err; - } - - goto out; -err: - err = -1; -out: - close(clientfd); - close(servfd); - return err; -} - -static int run_xmsg_test_case(const struct sock_addr_test *test, int max_cmsg) -{ - socklen_t addr_len = sizeof(struct sockaddr_storage); - struct sockaddr_storage expected_addr; - struct sockaddr_storage server_addr; - struct sockaddr_storage sendmsg_addr; - struct sockaddr_storage recvmsg_addr; - int clientfd = -1; - int servfd = -1; - int set_cmsg; - int err = 0; - - if (test->type != SOCK_DGRAM) - goto err; - - if (init_addrs(test, &sendmsg_addr, &server_addr, &expected_addr)) - goto err; - - /* Prepare server to sendmsg to */ - servfd = start_server_addr(test->type, &server_addr, addr_len, NULL); - if (servfd == -1) - goto err; - - for (set_cmsg = 0; set_cmsg <= max_cmsg; ++set_cmsg) { - if (clientfd >= 0) - close(clientfd); - - clientfd = sendmsg_to_server(test->type, &sendmsg_addr, - addr_len, set_cmsg, /*flags*/0, - &err); - if (err) - goto out; - else if (clientfd == -1) - goto err; - - /* Try to receive message on server instead of using - * getpeername(2) on client socket, to check that client's - * destination address was rewritten properly, since - * getpeername(2) doesn't work with unconnected datagram - * sockets. - * - * Get source address from recvmsg(2) as well to make sure - * source was rewritten properly: getsockname(2) can't be used - * since socket is unconnected and source defined for one - * specific packet may differ from the one used by default and - * returned by getsockname(2). - */ - if (recvmsg_from_client(servfd, &recvmsg_addr) == -1) - goto err; - - if (cmp_addr(&recvmsg_addr, &expected_addr, /*cmp_port*/0)) - goto err; - } - - goto out; -err: - err = -1; -out: - close(clientfd); - close(servfd); - return err; -} - -static int run_test_case(int cgfd, const struct sock_addr_test *test) -{ - int progfd = -1; - int err = 0; - - printf("Test case: %s .. ", test->descr); - - progfd = test->loadfn(test); - if (test->expected_result == LOAD_REJECT && progfd < 0) - goto out; - else if (test->expected_result == LOAD_REJECT || progfd < 0) - goto err; - - err = bpf_prog_attach(progfd, cgfd, test->attach_type, - BPF_F_ALLOW_OVERRIDE); - if (test->expected_result == ATTACH_REJECT && err) { - err = 0; /* error was expected, reset it */ - goto out; - } else if (test->expected_result == ATTACH_REJECT || err) { - goto err; - } else if (test->expected_result == ATTACH_OKAY) { - err = 0; - goto out; - } - - switch (test->attach_type) { - case BPF_CGROUP_INET4_BIND: - case BPF_CGROUP_INET6_BIND: - err = run_bind_test_case(test); - break; - case BPF_CGROUP_INET4_CONNECT: - case BPF_CGROUP_INET6_CONNECT: - err = run_connect_test_case(test); - break; - case BPF_CGROUP_UDP4_SENDMSG: - case BPF_CGROUP_UDP6_SENDMSG: - err = run_xmsg_test_case(test, 1); - break; - case BPF_CGROUP_UDP4_RECVMSG: - case BPF_CGROUP_UDP6_RECVMSG: - err = run_xmsg_test_case(test, 0); - break; - default: - goto err; - } - - if (test->expected_result == SYSCALL_EPERM && err == EPERM) { - err = 0; /* error was expected, reset it */ - goto out; - } - - if (test->expected_result == SYSCALL_ENOTSUPP && err == ENOTSUPP) { - err = 0; /* error was expected, reset it */ - goto out; - } - - if (err || test->expected_result != SUCCESS) - goto err; - - goto out; -err: - err = -1; -out: - /* Detaching w/o checking return code: best effort attempt. */ - if (progfd != -1) - bpf_prog_detach(cgfd, test->attach_type); - close(progfd); - printf("[%s]\n", err ? "FAIL" : "PASS"); - return err; -} - -static int run_tests(int cgfd) -{ - int passes = 0; - int fails = 0; - int i; - - for (i = 0; i < ARRAY_SIZE(tests); ++i) { - if (run_test_case(cgfd, &tests[i])) - ++fails; - else - ++passes; - } - printf("Summary: %d PASSED, %d FAILED\n", passes, fails); - return fails ? -1 : 0; -} - -int main(int argc, char **argv) -{ - int cgfd = -1; - int err = 0; - - if (argc < 2) { - fprintf(stderr, - "%s has to be run via %s.sh. Skip direct run.\n", - argv[0], argv[0]); - exit(err); - } - - cgfd = cgroup_setup_and_join(CG_PATH); - if (cgfd < 0) - goto err; - - /* Use libbpf 1.0 API mode */ - libbpf_set_strict_mode(LIBBPF_STRICT_ALL); - - if (run_tests(cgfd)) - goto err; - - goto out; -err: - err = -1; -out: - close(cgfd); - cleanup_cgroup_environment(); - return err; -} diff --git a/tools/testing/selftests/bpf/test_sock_addr.sh b/tools/testing/selftests/bpf/test_sock_addr.sh deleted file mode 100755 index 3b9fdb8094aa..000000000000 --- a/tools/testing/selftests/bpf/test_sock_addr.sh +++ /dev/null @@ -1,58 +0,0 @@ -#!/bin/sh - -set -eu - -ping_once() -{ - type ping${1} >/dev/null 2>&1 && PING="ping${1}" || PING="ping -${1}" - $PING -q -c 1 -W 1 ${2%%/*} >/dev/null 2>&1 -} - -wait_for_ip() -{ - local _i - echo -n "Wait for testing IPv4/IPv6 to become available " - for _i in $(seq ${MAX_PING_TRIES}); do - echo -n "." - if ping_once 4 ${TEST_IPv4} && ping_once 6 ${TEST_IPv6}; then - echo " OK" - return - fi - done - echo 1>&2 "ERROR: Timeout waiting for test IP to become available." - exit 1 -} - -setup() -{ - # Create testing interfaces not to interfere with current environment. - ip link add dev ${TEST_IF} type veth peer name ${TEST_IF_PEER} - ip link set ${TEST_IF} up - ip link set ${TEST_IF_PEER} up - - ip -4 addr add ${TEST_IPv4} dev ${TEST_IF} - ip -6 addr add ${TEST_IPv6} dev ${TEST_IF} - wait_for_ip -} - -cleanup() -{ - ip link del ${TEST_IF} 2>/dev/null || : - ip link del ${TEST_IF_PEER} 2>/dev/null || : -} - -main() -{ - trap cleanup EXIT 2 3 6 15 - setup - ./test_sock_addr setup_done -} - -BASENAME=$(basename $0 .sh) -TEST_IF="${BASENAME}1" -TEST_IF_PEER="${BASENAME}2" -TEST_IPv4="127.0.0.4/8" -TEST_IPv6="::6/128" -MAX_PING_TRIES=5 - -main From 1e0a8367c89f82816735973d0e65a3c8e1b43179 Mon Sep 17 00:00:00 2001 From: Jordan Rife Date: Fri, 10 May 2024 14:02:31 -0500 Subject: [PATCH 109/119] selftests/bpf: Expand sockaddr program return value tests This patch expands verifier coverage for program return values to cover bind, connect, sendmsg, getsockname, and getpeername hooks. It also rounds out the recvmsg coverage by adding test cases for recvmsg_unix hooks. Signed-off-by: Jordan Rife Link: https://lore.kernel.org/r/20240510190246.3247730-15-jrife@google.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/progs/verifier_sock_addr.c | 294 ++++++++++++++++++ 1 file changed, 294 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/verifier_sock_addr.c b/tools/testing/selftests/bpf/progs/verifier_sock_addr.c index 5081fa723d3a..9c31448a0f52 100644 --- a/tools/testing/selftests/bpf/progs/verifier_sock_addr.c +++ b/tools/testing/selftests/bpf/progs/verifier_sock_addr.c @@ -34,4 +34,298 @@ int recvmsg6_bad_return_code(struct bpf_sock_addr *ctx) return 0; } +SEC("cgroup/recvmsg_unix") +__success +int recvmsg_unix_good_return_code(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/recvmsg_unix") +__failure __msg("At program exit the register R0 has smin=0 smax=0 should have been in [1, 1]") +int recvmsg_unix_bad_return_code(struct bpf_sock_addr *ctx) +{ + return 0; +} + +SEC("cgroup/sendmsg4") +__success +int sendmsg4_good_return_code_0(struct bpf_sock_addr *ctx) +{ + return 0; +} + +SEC("cgroup/sendmsg4") +__success +int sendmsg4_good_return_code_1(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/sendmsg4") +__failure __msg("At program exit the register R0 has smin=2 smax=2 should have been in [0, 1]") +int sendmsg4_bad_return_code(struct bpf_sock_addr *ctx) +{ + return 2; +} + +SEC("cgroup/sendmsg6") +__success +int sendmsg6_good_return_code_0(struct bpf_sock_addr *ctx) +{ + return 0; +} + +SEC("cgroup/sendmsg6") +__success +int sendmsg6_good_return_code_1(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/sendmsg6") +__failure __msg("At program exit the register R0 has smin=2 smax=2 should have been in [0, 1]") +int sendmsg6_bad_return_code(struct bpf_sock_addr *ctx) +{ + return 2; +} + +SEC("cgroup/sendmsg_unix") +__success +int sendmsg_unix_good_return_code_0(struct bpf_sock_addr *ctx) +{ + return 0; +} + +SEC("cgroup/sendmsg_unix") +__success +int sendmsg_unix_good_return_code_1(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/sendmsg_unix") +__failure __msg("At program exit the register R0 has smin=2 smax=2 should have been in [0, 1]") +int sendmsg_unix_bad_return_code(struct bpf_sock_addr *ctx) +{ + return 2; +} + +SEC("cgroup/getpeername4") +__success +int getpeername4_good_return_code(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/getpeername4") +__failure __msg("At program exit the register R0 has smin=0 smax=0 should have been in [1, 1]") +int getpeername4_bad_return_code(struct bpf_sock_addr *ctx) +{ + return 0; +} + +SEC("cgroup/getpeername6") +__success +int getpeername6_good_return_code(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/getpeername6") +__failure __msg("At program exit the register R0 has smin=0 smax=0 should have been in [1, 1]") +int getpeername6_bad_return_code(struct bpf_sock_addr *ctx) +{ + return 0; +} + +SEC("cgroup/getpeername_unix") +__success +int getpeername_unix_good_return_code(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/getpeername_unix") +__failure __msg("At program exit the register R0 has smin=0 smax=0 should have been in [1, 1]") +int getpeername_unix_bad_return_code(struct bpf_sock_addr *ctx) +{ + return 0; +} + +SEC("cgroup/getsockname4") +__success +int getsockname4_good_return_code(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/getsockname4") +__failure __msg("At program exit the register R0 has smin=0 smax=0 should have been in [1, 1]") +int getsockname4_bad_return_code(struct bpf_sock_addr *ctx) +{ + return 0; +} + +SEC("cgroup/getsockname6") +__success +int getsockname6_good_return_code(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/getsockname6") +__failure __msg("At program exit the register R0 has smin=0 smax=0 should have been in [1, 1]") +int getsockname6_bad_return_code(struct bpf_sock_addr *ctx) +{ + return 0; +} + +SEC("cgroup/getsockname_unix") +__success +int getsockname_unix_good_return_code(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/getsockname_unix") +__failure __msg("At program exit the register R0 has smin=0 smax=0 should have been in [1, 1]") +int getsockname_unix_unix_bad_return_code(struct bpf_sock_addr *ctx) +{ + return 0; +} + +SEC("cgroup/bind4") +__success +int bind4_good_return_code_0(struct bpf_sock_addr *ctx) +{ + return 0; +} + +SEC("cgroup/bind4") +__success +int bind4_good_return_code_1(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/bind4") +__success +int bind4_good_return_code_2(struct bpf_sock_addr *ctx) +{ + return 2; +} + +SEC("cgroup/bind4") +__success +int bind4_good_return_code_3(struct bpf_sock_addr *ctx) +{ + return 3; +} + +SEC("cgroup/bind4") +__failure __msg("At program exit the register R0 has smin=4 smax=4 should have been in [0, 3]") +int bind4_bad_return_code(struct bpf_sock_addr *ctx) +{ + return 4; +} + +SEC("cgroup/bind6") +__success +int bind6_good_return_code_0(struct bpf_sock_addr *ctx) +{ + return 0; +} + +SEC("cgroup/bind6") +__success +int bind6_good_return_code_1(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/bind6") +__success +int bind6_good_return_code_2(struct bpf_sock_addr *ctx) +{ + return 2; +} + +SEC("cgroup/bind6") +__success +int bind6_good_return_code_3(struct bpf_sock_addr *ctx) +{ + return 3; +} + +SEC("cgroup/bind6") +__failure __msg("At program exit the register R0 has smin=4 smax=4 should have been in [0, 3]") +int bind6_bad_return_code(struct bpf_sock_addr *ctx) +{ + return 4; +} + +SEC("cgroup/connect4") +__success +int connect4_good_return_code_0(struct bpf_sock_addr *ctx) +{ + return 0; +} + +SEC("cgroup/connect4") +__success +int connect4_good_return_code_1(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/connect4") +__failure __msg("At program exit the register R0 has smin=2 smax=2 should have been in [0, 1]") +int connect4_bad_return_code(struct bpf_sock_addr *ctx) +{ + return 2; +} + +SEC("cgroup/connect6") +__success +int connect6_good_return_code_0(struct bpf_sock_addr *ctx) +{ + return 0; +} + +SEC("cgroup/connect6") +__success +int connect6_good_return_code_1(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/connect6") +__failure __msg("At program exit the register R0 has smin=2 smax=2 should have been in [0, 1]") +int connect6_bad_return_code(struct bpf_sock_addr *ctx) +{ + return 2; +} + +SEC("cgroup/connect_unix") +__success +int connect_unix_good_return_code_0(struct bpf_sock_addr *ctx) +{ + return 0; +} + +SEC("cgroup/connect_unix") +__success +int connect_unix_good_return_code_1(struct bpf_sock_addr *ctx) +{ + return 1; +} + +SEC("cgroup/connect_unix") +__failure __msg("At program exit the register R0 has smin=2 smax=2 should have been in [0, 1]") +int connect_unix_bad_return_code(struct bpf_sock_addr *ctx) +{ + return 2; +} + char _license[] SEC("license") = "GPL"; From dfb7539b47b501ccc0d23bae718500ada2157aee Mon Sep 17 00:00:00 2001 From: Jordan Rife Date: Fri, 10 May 2024 14:02:32 -0500 Subject: [PATCH 110/119] sefltests/bpf: Expand sockaddr hook deny tests This patch expands test coverage for EPERM tests to include connect and bind calls and rounds out the coverage for sendmsg by adding tests for sendmsg_unix. Signed-off-by: Jordan Rife Link: https://lore.kernel.org/r/20240510190246.3247730-16-jrife@google.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/sock_addr.c | 342 ++++++++++++++++++ .../testing/selftests/bpf/progs/bind4_prog.c | 6 + .../testing/selftests/bpf/progs/bind6_prog.c | 6 + .../selftests/bpf/progs/connect4_prog.c | 6 + .../selftests/bpf/progs/connect6_prog.c | 6 + .../selftests/bpf/progs/connect_unix_prog.c | 6 + .../selftests/bpf/progs/sendmsg_unix_prog.c | 6 + 7 files changed, 378 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/sock_addr.c b/tools/testing/selftests/bpf/prog_tests/sock_addr.c index ebd5e58e38c5..0477b4080b2e 100644 --- a/tools/testing/selftests/bpf/prog_tests/sock_addr.c +++ b/tools/testing/selftests/bpf/prog_tests/sock_addr.c @@ -439,13 +439,18 @@ static void prog_name##_destroy(void *skel) \ BPF_SKEL_FUNCS(bind4_prog, bind_v4_prog); BPF_SKEL_FUNCS_RAW(bind4_prog, bind_v4_prog); +BPF_SKEL_FUNCS(bind4_prog, bind_v4_deny_prog); BPF_SKEL_FUNCS(bind6_prog, bind_v6_prog); BPF_SKEL_FUNCS_RAW(bind6_prog, bind_v6_prog); +BPF_SKEL_FUNCS(bind6_prog, bind_v6_deny_prog); BPF_SKEL_FUNCS(connect4_prog, connect_v4_prog); BPF_SKEL_FUNCS_RAW(connect4_prog, connect_v4_prog); +BPF_SKEL_FUNCS(connect4_prog, connect_v4_deny_prog); BPF_SKEL_FUNCS(connect6_prog, connect_v6_prog); BPF_SKEL_FUNCS_RAW(connect6_prog, connect_v6_prog); +BPF_SKEL_FUNCS(connect6_prog, connect_v6_deny_prog); BPF_SKEL_FUNCS(connect_unix_prog, connect_unix_prog); +BPF_SKEL_FUNCS(connect_unix_prog, connect_unix_deny_prog); BPF_SKEL_FUNCS(sendmsg4_prog, sendmsg_v4_prog); BPF_SKEL_FUNCS_RAW(sendmsg4_prog, sendmsg_v4_prog); BPF_SKEL_FUNCS(sendmsg4_prog, sendmsg_v4_deny_prog); @@ -456,6 +461,7 @@ BPF_SKEL_FUNCS(sendmsg6_prog, sendmsg_v6_preserve_dst_prog); BPF_SKEL_FUNCS(sendmsg6_prog, sendmsg_v6_v4mapped_prog); BPF_SKEL_FUNCS(sendmsg6_prog, sendmsg_v6_wildcard_prog); BPF_SKEL_FUNCS(sendmsg_unix_prog, sendmsg_unix_prog); +BPF_SKEL_FUNCS(sendmsg_unix_prog, sendmsg_unix_deny_prog); BPF_SKEL_FUNCS(recvmsg4_prog, recvmsg4_prog); BPF_SKEL_FUNCS(recvmsg6_prog, recvmsg6_prog); BPF_SKEL_FUNCS(recvmsg_unix_prog, recvmsg_unix_prog); @@ -480,6 +486,22 @@ static struct sock_addr_test tests[] = { NULL, SUCCESS, }, + { + SOCK_ADDR_TEST_BIND, + "bind4: bind deny (stream)", + bind_v4_deny_prog_load, + bind_v4_deny_prog_destroy, + BPF_CGROUP_INET4_BIND, + &user_ops, + AF_INET, + SOCK_STREAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + NULL, + SYSCALL_EPERM, + }, { SOCK_ADDR_TEST_BIND, "bind4: bind (dgram)", @@ -496,6 +518,22 @@ static struct sock_addr_test tests[] = { NULL, SUCCESS, }, + { + SOCK_ADDR_TEST_BIND, + "bind4: bind deny (dgram)", + bind_v4_deny_prog_load, + bind_v4_deny_prog_destroy, + BPF_CGROUP_INET4_BIND, + &user_ops, + AF_INET, + SOCK_DGRAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + NULL, + SYSCALL_EPERM, + }, { SOCK_ADDR_TEST_BIND, "bind4: load prog with wrong expected attach type", @@ -544,6 +582,22 @@ static struct sock_addr_test tests[] = { NULL, SUCCESS, }, + { + SOCK_ADDR_TEST_BIND, + "bind6: bind deny (stream)", + bind_v6_deny_prog_load, + bind_v6_deny_prog_destroy, + BPF_CGROUP_INET6_BIND, + &user_ops, + AF_INET6, + SOCK_STREAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + NULL, + SYSCALL_EPERM, + }, { SOCK_ADDR_TEST_BIND, "bind6: bind (dgram)", @@ -560,6 +614,22 @@ static struct sock_addr_test tests[] = { NULL, SUCCESS, }, + { + SOCK_ADDR_TEST_BIND, + "bind6: bind deny (dgram)", + bind_v6_deny_prog_load, + bind_v6_deny_prog_destroy, + BPF_CGROUP_INET6_BIND, + &user_ops, + AF_INET6, + SOCK_DGRAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + NULL, + SYSCALL_EPERM, + }, { SOCK_ADDR_TEST_BIND, "bind6: load prog with wrong expected attach type", @@ -610,6 +680,22 @@ static struct sock_addr_test tests[] = { NULL, SUCCESS, }, + { + SOCK_ADDR_TEST_BIND, + "bind4: kernel_bind deny (stream)", + bind_v4_deny_prog_load, + bind_v4_deny_prog_destroy, + BPF_CGROUP_INET4_BIND, + &kern_ops_sock_sendmsg, + AF_INET, + SOCK_STREAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + NULL, + SYSCALL_EPERM, + }, { SOCK_ADDR_TEST_BIND, "bind4: kernel_bind (dgram)", @@ -626,6 +712,22 @@ static struct sock_addr_test tests[] = { NULL, SUCCESS, }, + { + SOCK_ADDR_TEST_BIND, + "bind4: kernel_bind deny (dgram)", + bind_v4_deny_prog_load, + bind_v4_deny_prog_destroy, + BPF_CGROUP_INET4_BIND, + &kern_ops_sock_sendmsg, + AF_INET, + SOCK_DGRAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + NULL, + SYSCALL_EPERM, + }, { SOCK_ADDR_TEST_BIND, "bind6: kernel_bind (stream)", @@ -642,6 +744,22 @@ static struct sock_addr_test tests[] = { NULL, SUCCESS, }, + { + SOCK_ADDR_TEST_BIND, + "bind6: kernel_bind deny (stream)", + bind_v6_deny_prog_load, + bind_v6_deny_prog_destroy, + BPF_CGROUP_INET6_BIND, + &kern_ops_sock_sendmsg, + AF_INET6, + SOCK_STREAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + NULL, + SYSCALL_EPERM, + }, { SOCK_ADDR_TEST_BIND, "bind6: kernel_bind (dgram)", @@ -658,6 +776,22 @@ static struct sock_addr_test tests[] = { NULL, SUCCESS, }, + { + SOCK_ADDR_TEST_BIND, + "bind6: kernel_bind deny (dgram)", + bind_v6_deny_prog_load, + bind_v6_deny_prog_destroy, + BPF_CGROUP_INET6_BIND, + &kern_ops_sock_sendmsg, + AF_INET6, + SOCK_DGRAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + NULL, + SYSCALL_EPERM, + }, /* connect - system calls */ { @@ -676,6 +810,22 @@ static struct sock_addr_test tests[] = { SRC4_REWRITE_IP, SUCCESS, }, + { + SOCK_ADDR_TEST_CONNECT, + "connect4: connect deny (stream)", + connect_v4_deny_prog_load, + connect_v4_deny_prog_destroy, + BPF_CGROUP_INET4_CONNECT, + &user_ops, + AF_INET, + SOCK_STREAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SRC4_REWRITE_IP, + SYSCALL_EPERM, + }, { SOCK_ADDR_TEST_CONNECT, "connect4: connect (dgram)", @@ -692,6 +842,22 @@ static struct sock_addr_test tests[] = { SRC4_REWRITE_IP, SUCCESS, }, + { + SOCK_ADDR_TEST_CONNECT, + "connect4: connect deny (dgram)", + connect_v4_deny_prog_load, + connect_v4_deny_prog_destroy, + BPF_CGROUP_INET4_CONNECT, + &user_ops, + AF_INET, + SOCK_DGRAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SRC4_REWRITE_IP, + SYSCALL_EPERM, + }, { SOCK_ADDR_TEST_CONNECT, "connect4: load prog with wrong expected attach type", @@ -740,6 +906,22 @@ static struct sock_addr_test tests[] = { SRC6_REWRITE_IP, SUCCESS, }, + { + SOCK_ADDR_TEST_CONNECT, + "connect6: connect deny (stream)", + connect_v6_deny_prog_load, + connect_v6_deny_prog_destroy, + BPF_CGROUP_INET6_CONNECT, + &user_ops, + AF_INET6, + SOCK_STREAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SRC6_REWRITE_IP, + SYSCALL_EPERM, + }, { SOCK_ADDR_TEST_CONNECT, "connect6: connect (dgram)", @@ -756,6 +938,22 @@ static struct sock_addr_test tests[] = { SRC6_REWRITE_IP, SUCCESS, }, + { + SOCK_ADDR_TEST_CONNECT, + "connect6: connect deny (dgram)", + connect_v6_deny_prog_load, + connect_v6_deny_prog_destroy, + BPF_CGROUP_INET6_CONNECT, + &user_ops, + AF_INET6, + SOCK_DGRAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SRC6_REWRITE_IP, + SYSCALL_EPERM, + }, { SOCK_ADDR_TEST_CONNECT, "connect6: load prog with wrong expected attach type", @@ -804,6 +1002,22 @@ static struct sock_addr_test tests[] = { NULL, SUCCESS, }, + { + SOCK_ADDR_TEST_CONNECT, + "connect_unix: connect deny (stream)", + connect_unix_deny_prog_load, + connect_unix_deny_prog_destroy, + BPF_CGROUP_UNIX_CONNECT, + &user_ops, + AF_UNIX, + SOCK_STREAM, + SERVUN_ADDRESS, + 0, + SERVUN_REWRITE_ADDRESS, + 0, + NULL, + SYSCALL_EPERM, + }, /* connect - kernel calls */ { @@ -822,6 +1036,22 @@ static struct sock_addr_test tests[] = { SRC4_REWRITE_IP, SUCCESS, }, + { + SOCK_ADDR_TEST_CONNECT, + "connect4: kernel_connect deny (stream)", + connect_v4_deny_prog_load, + connect_v4_deny_prog_destroy, + BPF_CGROUP_INET4_CONNECT, + &kern_ops_sock_sendmsg, + AF_INET, + SOCK_STREAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SRC4_REWRITE_IP, + SYSCALL_EPERM, + }, { SOCK_ADDR_TEST_CONNECT, "connect4: kernel_connect (dgram)", @@ -838,6 +1068,22 @@ static struct sock_addr_test tests[] = { SRC4_REWRITE_IP, SUCCESS, }, + { + SOCK_ADDR_TEST_CONNECT, + "connect4: kernel_connect deny (dgram)", + connect_v4_deny_prog_load, + connect_v4_deny_prog_destroy, + BPF_CGROUP_INET4_CONNECT, + &kern_ops_sock_sendmsg, + AF_INET, + SOCK_DGRAM, + SERV4_IP, + SERV4_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SRC4_REWRITE_IP, + SYSCALL_EPERM, + }, { SOCK_ADDR_TEST_CONNECT, "connect6: kernel_connect (stream)", @@ -854,6 +1100,22 @@ static struct sock_addr_test tests[] = { SRC6_REWRITE_IP, SUCCESS, }, + { + SOCK_ADDR_TEST_CONNECT, + "connect6: kernel_connect deny (stream)", + connect_v6_deny_prog_load, + connect_v6_deny_prog_destroy, + BPF_CGROUP_INET6_CONNECT, + &kern_ops_sock_sendmsg, + AF_INET6, + SOCK_STREAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SRC6_REWRITE_IP, + SYSCALL_EPERM, + }, { SOCK_ADDR_TEST_CONNECT, "connect6: kernel_connect (dgram)", @@ -870,6 +1132,22 @@ static struct sock_addr_test tests[] = { SRC6_REWRITE_IP, SUCCESS, }, + { + SOCK_ADDR_TEST_CONNECT, + "connect6: kernel_connect deny (dgram)", + connect_v6_deny_prog_load, + connect_v6_deny_prog_destroy, + BPF_CGROUP_INET6_CONNECT, + &kern_ops_sock_sendmsg, + AF_INET6, + SOCK_DGRAM, + SERV6_IP, + SERV6_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SRC6_REWRITE_IP, + SYSCALL_EPERM, + }, { SOCK_ADDR_TEST_CONNECT, "connect_unix: kernel_connect (dgram)", @@ -886,6 +1164,22 @@ static struct sock_addr_test tests[] = { NULL, SUCCESS, }, + { + SOCK_ADDR_TEST_CONNECT, + "connect_unix: kernel_connect deny (dgram)", + connect_unix_deny_prog_load, + connect_unix_deny_prog_destroy, + BPF_CGROUP_UNIX_CONNECT, + &kern_ops_sock_sendmsg, + AF_UNIX, + SOCK_STREAM, + SERVUN_ADDRESS, + 0, + SERVUN_REWRITE_ADDRESS, + 0, + NULL, + SYSCALL_EPERM, + }, /* sendmsg - system calls */ { @@ -1080,6 +1374,22 @@ static struct sock_addr_test tests[] = { NULL, SUCCESS, }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg_unix: sendmsg deny (dgram)", + sendmsg_unix_deny_prog_load, + sendmsg_unix_deny_prog_destroy, + BPF_CGROUP_UNIX_SENDMSG, + &user_ops, + AF_UNIX, + SOCK_DGRAM, + SERVUN_ADDRESS, + 0, + SERVUN_REWRITE_ADDRESS, + 0, + NULL, + SYSCALL_EPERM, + }, /* sendmsg - kernel calls (sock_sendmsg) */ { @@ -1178,6 +1488,22 @@ static struct sock_addr_test tests[] = { NULL, SUCCESS, }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg_unix: sock_sendmsg deny (dgram)", + sendmsg_unix_deny_prog_load, + sendmsg_unix_deny_prog_destroy, + BPF_CGROUP_UNIX_SENDMSG, + &kern_ops_sock_sendmsg, + AF_UNIX, + SOCK_DGRAM, + SERVUN_ADDRESS, + 0, + SERVUN_REWRITE_ADDRESS, + 0, + NULL, + SYSCALL_EPERM, + }, /* sendmsg - kernel calls (kernel_sendmsg) */ { @@ -1276,6 +1602,22 @@ static struct sock_addr_test tests[] = { NULL, SUCCESS, }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg_unix: kernel_sendmsg deny (dgram)", + sendmsg_unix_deny_prog_load, + sendmsg_unix_deny_prog_destroy, + BPF_CGROUP_UNIX_SENDMSG, + &kern_ops_kernel_sendmsg, + AF_UNIX, + SOCK_DGRAM, + SERVUN_ADDRESS, + 0, + SERVUN_REWRITE_ADDRESS, + 0, + NULL, + SYSCALL_EPERM, + }, /* recvmsg - system calls */ { diff --git a/tools/testing/selftests/bpf/progs/bind4_prog.c b/tools/testing/selftests/bpf/progs/bind4_prog.c index 66005c1a5b36..b7ddf8ec4ee8 100644 --- a/tools/testing/selftests/bpf/progs/bind4_prog.c +++ b/tools/testing/selftests/bpf/progs/bind4_prog.c @@ -158,4 +158,10 @@ int bind_v4_prog(struct bpf_sock_addr *ctx) return 1; } +SEC("cgroup/bind4") +int bind_v4_deny_prog(struct bpf_sock_addr *ctx) +{ + return 0; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/bind6_prog.c b/tools/testing/selftests/bpf/progs/bind6_prog.c index 9c86c712348c..501c3fc11d35 100644 --- a/tools/testing/selftests/bpf/progs/bind6_prog.c +++ b/tools/testing/selftests/bpf/progs/bind6_prog.c @@ -175,4 +175,10 @@ int bind_v6_prog(struct bpf_sock_addr *ctx) return 1; } +SEC("cgroup/bind6") +int bind_v6_deny_prog(struct bpf_sock_addr *ctx) +{ + return 0; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/connect4_prog.c b/tools/testing/selftests/bpf/progs/connect4_prog.c index bec529da7c9d..9e9ebf27b878 100644 --- a/tools/testing/selftests/bpf/progs/connect4_prog.c +++ b/tools/testing/selftests/bpf/progs/connect4_prog.c @@ -199,4 +199,10 @@ int connect_v4_prog(struct bpf_sock_addr *ctx) return do_bind(ctx) ? 1 : 0; } +SEC("cgroup/connect4") +int connect_v4_deny_prog(struct bpf_sock_addr *ctx) +{ + return 0; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/connect6_prog.c b/tools/testing/selftests/bpf/progs/connect6_prog.c index 40266d2c737c..e98573b00ddb 100644 --- a/tools/testing/selftests/bpf/progs/connect6_prog.c +++ b/tools/testing/selftests/bpf/progs/connect6_prog.c @@ -90,4 +90,10 @@ int connect_v6_prog(struct bpf_sock_addr *ctx) return 1; } +SEC("cgroup/connect6") +int connect_v6_deny_prog(struct bpf_sock_addr *ctx) +{ + return 0; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/connect_unix_prog.c b/tools/testing/selftests/bpf/progs/connect_unix_prog.c index 2ef0e0c46d17..ba60adadb335 100644 --- a/tools/testing/selftests/bpf/progs/connect_unix_prog.c +++ b/tools/testing/selftests/bpf/progs/connect_unix_prog.c @@ -36,4 +36,10 @@ int connect_unix_prog(struct bpf_sock_addr *ctx) return 1; } +SEC("cgroup/connect_unix") +int connect_unix_deny_prog(struct bpf_sock_addr *ctx) +{ + return 0; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/sendmsg_unix_prog.c b/tools/testing/selftests/bpf/progs/sendmsg_unix_prog.c index d8869b03dda9..332d0eb1116f 100644 --- a/tools/testing/selftests/bpf/progs/sendmsg_unix_prog.c +++ b/tools/testing/selftests/bpf/progs/sendmsg_unix_prog.c @@ -36,4 +36,10 @@ int sendmsg_unix_prog(struct bpf_sock_addr *ctx) return 1; } +SEC("cgroup/sendmsg_unix") +int sendmsg_unix_deny_prog(struct bpf_sock_addr *ctx) +{ + return 0; +} + char _license[] SEC("license") = "GPL"; From bc467e953e4fbafd94d04c355f875bf1adf438e2 Mon Sep 17 00:00:00 2001 From: Jordan Rife Date: Fri, 10 May 2024 14:02:33 -0500 Subject: [PATCH 111/119] selftests/bpf: Expand getsockname and getpeername tests This expands coverage for getsockname and getpeername hooks to include getsockname4, getsockname6, getpeername4, and getpeername6. Signed-off-by: Jordan Rife Link: https://lore.kernel.org/r/20240510190246.3247730-17-jrife@google.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/sock_addr.c | 304 +++++++++++++++++- .../selftests/bpf/progs/getpeername4_prog.c | 24 ++ .../selftests/bpf/progs/getpeername6_prog.c | 31 ++ .../selftests/bpf/progs/getsockname4_prog.c | 24 ++ .../selftests/bpf/progs/getsockname6_prog.c | 31 ++ 5 files changed, 412 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/bpf/progs/getpeername4_prog.c create mode 100644 tools/testing/selftests/bpf/progs/getpeername6_prog.c create mode 100644 tools/testing/selftests/bpf/progs/getsockname4_prog.c create mode 100644 tools/testing/selftests/bpf/progs/getsockname6_prog.c diff --git a/tools/testing/selftests/bpf/prog_tests/sock_addr.c b/tools/testing/selftests/bpf/prog_tests/sock_addr.c index 0477b4080b2e..a0a40bdcfe45 100644 --- a/tools/testing/selftests/bpf/prog_tests/sock_addr.c +++ b/tools/testing/selftests/bpf/prog_tests/sock_addr.c @@ -15,7 +15,11 @@ #include "recvmsg6_prog.skel.h" #include "sendmsg_unix_prog.skel.h" #include "recvmsg_unix_prog.skel.h" +#include "getsockname4_prog.skel.h" +#include "getsockname6_prog.skel.h" #include "getsockname_unix_prog.skel.h" +#include "getpeername4_prog.skel.h" +#include "getpeername6_prog.skel.h" #include "getpeername_unix_prog.skel.h" #include "network_helpers.h" @@ -466,7 +470,11 @@ BPF_SKEL_FUNCS(recvmsg4_prog, recvmsg4_prog); BPF_SKEL_FUNCS(recvmsg6_prog, recvmsg6_prog); BPF_SKEL_FUNCS(recvmsg_unix_prog, recvmsg_unix_prog); BPF_SKEL_FUNCS(getsockname_unix_prog, getsockname_unix_prog); +BPF_SKEL_FUNCS(getsockname4_prog, getsockname_v4_prog); +BPF_SKEL_FUNCS(getsockname6_prog, getsockname_v6_prog); BPF_SKEL_FUNCS(getpeername_unix_prog, getpeername_unix_prog); +BPF_SKEL_FUNCS(getpeername4_prog, getpeername_v4_prog); +BPF_SKEL_FUNCS(getpeername6_prog, getpeername_v6_prog); static struct sock_addr_test tests[] = { /* bind - system calls */ @@ -1688,7 +1696,71 @@ static struct sock_addr_test tests[] = { /* getsockname - system calls */ { SOCK_ADDR_TEST_GETSOCKNAME, - "getsockname_unix", + "getsockname4: getsockname (stream)", + getsockname_v4_prog_load, + getsockname_v4_prog_destroy, + BPF_CGROUP_INET4_GETSOCKNAME, + &user_ops, + AF_INET, + SOCK_STREAM, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SERV4_IP, + SERV4_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_GETSOCKNAME, + "getsockname4: getsockname (dgram)", + getsockname_v4_prog_load, + getsockname_v4_prog_destroy, + BPF_CGROUP_INET4_GETSOCKNAME, + &user_ops, + AF_INET, + SOCK_DGRAM, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SERV4_IP, + SERV4_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_GETSOCKNAME, + "getsockname6: getsockname (stream)", + getsockname_v6_prog_load, + getsockname_v6_prog_destroy, + BPF_CGROUP_INET6_GETSOCKNAME, + &user_ops, + AF_INET6, + SOCK_STREAM, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SERV6_IP, + SERV6_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_GETSOCKNAME, + "getsockname6: getsockname (dgram)", + getsockname_v6_prog_load, + getsockname_v6_prog_destroy, + BPF_CGROUP_INET6_GETSOCKNAME, + &user_ops, + AF_INET6, + SOCK_DGRAM, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SERV6_IP, + SERV6_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_GETSOCKNAME, + "getsockname_unix: getsockname", getsockname_unix_prog_load, getsockname_unix_prog_destroy, BPF_CGROUP_UNIX_GETSOCKNAME, @@ -1703,10 +1775,156 @@ static struct sock_addr_test tests[] = { SUCCESS, }, + /* getsockname - kernel calls */ + { + SOCK_ADDR_TEST_GETSOCKNAME, + "getsockname4: kernel_getsockname (stream)", + getsockname_v4_prog_load, + getsockname_v4_prog_destroy, + BPF_CGROUP_INET4_GETSOCKNAME, + &kern_ops_kernel_sendmsg, + AF_INET, + SOCK_STREAM, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SERV4_IP, + SERV4_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_GETSOCKNAME, + "getsockname4: kernel_getsockname (dgram)", + getsockname_v4_prog_load, + getsockname_v4_prog_destroy, + BPF_CGROUP_INET4_GETSOCKNAME, + &kern_ops_kernel_sendmsg, + AF_INET, + SOCK_DGRAM, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SERV4_IP, + SERV4_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_GETSOCKNAME, + "getsockname6: kernel_getsockname (stream)", + getsockname_v6_prog_load, + getsockname_v6_prog_destroy, + BPF_CGROUP_INET6_GETSOCKNAME, + &kern_ops_kernel_sendmsg, + AF_INET6, + SOCK_STREAM, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SERV6_IP, + SERV6_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_GETSOCKNAME, + "getsockname6: kernel_getsockname (dgram)", + getsockname_v6_prog_load, + getsockname_v6_prog_destroy, + BPF_CGROUP_INET6_GETSOCKNAME, + &kern_ops_kernel_sendmsg, + AF_INET6, + SOCK_DGRAM, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SERV6_IP, + SERV6_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_GETSOCKNAME, + "getsockname_unix: kernel_getsockname", + getsockname_unix_prog_load, + getsockname_unix_prog_destroy, + BPF_CGROUP_UNIX_GETSOCKNAME, + &kern_ops_kernel_sendmsg, + AF_UNIX, + SOCK_STREAM, + SERVUN_ADDRESS, + 0, + SERVUN_REWRITE_ADDRESS, + 0, + NULL, + SUCCESS, + }, + /* getpeername - system calls */ { SOCK_ADDR_TEST_GETPEERNAME, - "getpeername_unix", + "getpeername4: getpeername (stream)", + getpeername_v4_prog_load, + getpeername_v4_prog_destroy, + BPF_CGROUP_INET4_GETPEERNAME, + &user_ops, + AF_INET, + SOCK_STREAM, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SERV4_IP, + SERV4_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_GETPEERNAME, + "getpeername4: getpeername (dgram)", + getpeername_v4_prog_load, + getpeername_v4_prog_destroy, + BPF_CGROUP_INET4_GETPEERNAME, + &user_ops, + AF_INET, + SOCK_DGRAM, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SERV4_IP, + SERV4_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_GETPEERNAME, + "getpeername6: getpeername (stream)", + getpeername_v6_prog_load, + getpeername_v6_prog_destroy, + BPF_CGROUP_INET6_GETPEERNAME, + &user_ops, + AF_INET6, + SOCK_STREAM, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SERV6_IP, + SERV6_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_GETPEERNAME, + "getpeername6: getpeername (dgram)", + getpeername_v6_prog_load, + getpeername_v6_prog_destroy, + BPF_CGROUP_INET6_GETPEERNAME, + &user_ops, + AF_INET6, + SOCK_DGRAM, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SERV6_IP, + SERV6_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_GETPEERNAME, + "getpeername_unix: getpeername", getpeername_unix_prog_load, getpeername_unix_prog_destroy, BPF_CGROUP_UNIX_GETPEERNAME, @@ -1720,6 +1938,88 @@ static struct sock_addr_test tests[] = { NULL, SUCCESS, }, + + /* getpeername - kernel calls */ + { + SOCK_ADDR_TEST_GETPEERNAME, + "getpeername4: kernel_getpeername (stream)", + getpeername_v4_prog_load, + getpeername_v4_prog_destroy, + BPF_CGROUP_INET4_GETPEERNAME, + &kern_ops_kernel_sendmsg, + AF_INET, + SOCK_STREAM, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SERV4_IP, + SERV4_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_GETPEERNAME, + "getpeername4: kernel_getpeername (dgram)", + getpeername_v4_prog_load, + getpeername_v4_prog_destroy, + BPF_CGROUP_INET4_GETPEERNAME, + &kern_ops_kernel_sendmsg, + AF_INET, + SOCK_DGRAM, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SERV4_IP, + SERV4_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_GETPEERNAME, + "getpeername6: kernel_getpeername (stream)", + getpeername_v6_prog_load, + getpeername_v6_prog_destroy, + BPF_CGROUP_INET6_GETPEERNAME, + &kern_ops_kernel_sendmsg, + AF_INET6, + SOCK_STREAM, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SERV6_IP, + SERV6_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_GETPEERNAME, + "getpeername6: kernel_getpeername (dgram)", + getpeername_v6_prog_load, + getpeername_v6_prog_destroy, + BPF_CGROUP_INET6_GETPEERNAME, + &kern_ops_kernel_sendmsg, + AF_INET6, + SOCK_DGRAM, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SERV6_IP, + SERV6_PORT, + NULL, + SUCCESS, + }, + { + SOCK_ADDR_TEST_GETPEERNAME, + "getpeername_unix: kernel_getpeername", + getpeername_unix_prog_load, + getpeername_unix_prog_destroy, + BPF_CGROUP_UNIX_GETPEERNAME, + &kern_ops_kernel_sendmsg, + AF_UNIX, + SOCK_STREAM, + SERVUN_ADDRESS, + 0, + SERVUN_REWRITE_ADDRESS, + 0, + NULL, + SUCCESS, + }, }; typedef int (*info_fn)(int, struct sockaddr *, socklen_t *); diff --git a/tools/testing/selftests/bpf/progs/getpeername4_prog.c b/tools/testing/selftests/bpf/progs/getpeername4_prog.c new file mode 100644 index 000000000000..4c97208cd25d --- /dev/null +++ b/tools/testing/selftests/bpf/progs/getpeername4_prog.c @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Google LLC */ + +#include "vmlinux.h" + +#include +#include +#include +#include +#include "bpf_kfuncs.h" + +#define REWRITE_ADDRESS_IP4 0xc0a801fe // 192.168.1.254 +#define REWRITE_ADDRESS_PORT4 4040 + +SEC("cgroup/getpeername4") +int getpeername_v4_prog(struct bpf_sock_addr *ctx) +{ + ctx->user_ip4 = bpf_htonl(REWRITE_ADDRESS_IP4); + ctx->user_port = bpf_htons(REWRITE_ADDRESS_PORT4); + + return 1; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/getpeername6_prog.c b/tools/testing/selftests/bpf/progs/getpeername6_prog.c new file mode 100644 index 000000000000..070e4d7f636c --- /dev/null +++ b/tools/testing/selftests/bpf/progs/getpeername6_prog.c @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Google LLC */ + +#include "vmlinux.h" + +#include +#include +#include +#include +#include "bpf_kfuncs.h" + +#define REWRITE_ADDRESS_IP6_0 0xfaceb00c +#define REWRITE_ADDRESS_IP6_1 0x12345678 +#define REWRITE_ADDRESS_IP6_2 0x00000000 +#define REWRITE_ADDRESS_IP6_3 0x0000abcd + +#define REWRITE_ADDRESS_PORT6 6060 + +SEC("cgroup/getpeername6") +int getpeername_v6_prog(struct bpf_sock_addr *ctx) +{ + ctx->user_ip6[0] = bpf_htonl(REWRITE_ADDRESS_IP6_0); + ctx->user_ip6[1] = bpf_htonl(REWRITE_ADDRESS_IP6_1); + ctx->user_ip6[2] = bpf_htonl(REWRITE_ADDRESS_IP6_2); + ctx->user_ip6[3] = bpf_htonl(REWRITE_ADDRESS_IP6_3); + ctx->user_port = bpf_htons(REWRITE_ADDRESS_PORT6); + + return 1; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/getsockname4_prog.c b/tools/testing/selftests/bpf/progs/getsockname4_prog.c new file mode 100644 index 000000000000..e298487c6347 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/getsockname4_prog.c @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Google LLC */ + +#include "vmlinux.h" + +#include +#include +#include +#include +#include "bpf_kfuncs.h" + +#define REWRITE_ADDRESS_IP4 0xc0a801fe // 192.168.1.254 +#define REWRITE_ADDRESS_PORT4 4040 + +SEC("cgroup/getsockname4") +int getsockname_v4_prog(struct bpf_sock_addr *ctx) +{ + ctx->user_ip4 = bpf_htonl(REWRITE_ADDRESS_IP4); + ctx->user_port = bpf_htons(REWRITE_ADDRESS_PORT4); + + return 1; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/getsockname6_prog.c b/tools/testing/selftests/bpf/progs/getsockname6_prog.c new file mode 100644 index 000000000000..811d10cd5525 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/getsockname6_prog.c @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Google LLC */ + +#include "vmlinux.h" + +#include +#include +#include +#include +#include "bpf_kfuncs.h" + +#define REWRITE_ADDRESS_IP6_0 0xfaceb00c +#define REWRITE_ADDRESS_IP6_1 0x12345678 +#define REWRITE_ADDRESS_IP6_2 0x00000000 +#define REWRITE_ADDRESS_IP6_3 0x0000abcd + +#define REWRITE_ADDRESS_PORT6 6060 + +SEC("cgroup/getsockname6") +int getsockname_v6_prog(struct bpf_sock_addr *ctx) +{ + ctx->user_ip6[0] = bpf_htonl(REWRITE_ADDRESS_IP6_0); + ctx->user_ip6[1] = bpf_htonl(REWRITE_ADDRESS_IP6_1); + ctx->user_ip6[2] = bpf_htonl(REWRITE_ADDRESS_IP6_2); + ctx->user_ip6[3] = bpf_htonl(REWRITE_ADDRESS_IP6_3); + ctx->user_port = bpf_htons(REWRITE_ADDRESS_PORT6); + + return 1; +} + +char _license[] SEC("license") = "GPL"; From a3d3eb957ddc733d04c0da67024b1c30d8826cc2 Mon Sep 17 00:00:00 2001 From: Jordan Rife Date: Fri, 10 May 2024 14:02:34 -0500 Subject: [PATCH 112/119] selftests/bpf: Expand ATTACH_REJECT tests This expands coverage for ATTACH_REJECT tests to include connect_unix, sendmsg_unix, recvmsg*, getsockname*, and getpeername*. Signed-off-by: Jordan Rife Link: https://lore.kernel.org/r/20240510190246.3247730-18-jrife@google.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/sock_addr.c | 187 ++++++++++++++++++ 1 file changed, 187 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/sock_addr.c b/tools/testing/selftests/bpf/prog_tests/sock_addr.c index a0a40bdcfe45..b880c564a204 100644 --- a/tools/testing/selftests/bpf/prog_tests/sock_addr.c +++ b/tools/testing/selftests/bpf/prog_tests/sock_addr.c @@ -454,6 +454,7 @@ BPF_SKEL_FUNCS(connect6_prog, connect_v6_prog); BPF_SKEL_FUNCS_RAW(connect6_prog, connect_v6_prog); BPF_SKEL_FUNCS(connect6_prog, connect_v6_deny_prog); BPF_SKEL_FUNCS(connect_unix_prog, connect_unix_prog); +BPF_SKEL_FUNCS_RAW(connect_unix_prog, connect_unix_prog); BPF_SKEL_FUNCS(connect_unix_prog, connect_unix_deny_prog); BPF_SKEL_FUNCS(sendmsg4_prog, sendmsg_v4_prog); BPF_SKEL_FUNCS_RAW(sendmsg4_prog, sendmsg_v4_prog); @@ -465,16 +466,26 @@ BPF_SKEL_FUNCS(sendmsg6_prog, sendmsg_v6_preserve_dst_prog); BPF_SKEL_FUNCS(sendmsg6_prog, sendmsg_v6_v4mapped_prog); BPF_SKEL_FUNCS(sendmsg6_prog, sendmsg_v6_wildcard_prog); BPF_SKEL_FUNCS(sendmsg_unix_prog, sendmsg_unix_prog); +BPF_SKEL_FUNCS_RAW(sendmsg_unix_prog, sendmsg_unix_prog); BPF_SKEL_FUNCS(sendmsg_unix_prog, sendmsg_unix_deny_prog); BPF_SKEL_FUNCS(recvmsg4_prog, recvmsg4_prog); +BPF_SKEL_FUNCS_RAW(recvmsg4_prog, recvmsg4_prog); BPF_SKEL_FUNCS(recvmsg6_prog, recvmsg6_prog); +BPF_SKEL_FUNCS_RAW(recvmsg6_prog, recvmsg6_prog); BPF_SKEL_FUNCS(recvmsg_unix_prog, recvmsg_unix_prog); +BPF_SKEL_FUNCS_RAW(recvmsg_unix_prog, recvmsg_unix_prog); BPF_SKEL_FUNCS(getsockname_unix_prog, getsockname_unix_prog); +BPF_SKEL_FUNCS_RAW(getsockname_unix_prog, getsockname_unix_prog); BPF_SKEL_FUNCS(getsockname4_prog, getsockname_v4_prog); +BPF_SKEL_FUNCS_RAW(getsockname4_prog, getsockname_v4_prog); BPF_SKEL_FUNCS(getsockname6_prog, getsockname_v6_prog); +BPF_SKEL_FUNCS_RAW(getsockname6_prog, getsockname_v6_prog); BPF_SKEL_FUNCS(getpeername_unix_prog, getpeername_unix_prog); +BPF_SKEL_FUNCS_RAW(getpeername_unix_prog, getpeername_unix_prog); BPF_SKEL_FUNCS(getpeername4_prog, getpeername_v4_prog); +BPF_SKEL_FUNCS_RAW(getpeername4_prog, getpeername_v4_prog); BPF_SKEL_FUNCS(getpeername6_prog, getpeername_v6_prog); +BPF_SKEL_FUNCS_RAW(getpeername6_prog, getpeername_v6_prog); static struct sock_addr_test tests[] = { /* bind - system calls */ @@ -1026,6 +1037,22 @@ static struct sock_addr_test tests[] = { NULL, SYSCALL_EPERM, }, + { + SOCK_ADDR_TEST_CONNECT, + "connect_unix: attach prog with wrong attach type", + connect_unix_prog_load_raw, + connect_unix_prog_destroy_raw, + BPF_CGROUP_INET4_CONNECT, + &user_ops, + AF_UNIX, + SOCK_STREAM, + SERVUN_ADDRESS, + 0, + SERVUN_REWRITE_ADDRESS, + 0, + NULL, + ATTACH_REJECT, + }, /* connect - kernel calls */ { @@ -1398,6 +1425,22 @@ static struct sock_addr_test tests[] = { NULL, SYSCALL_EPERM, }, + { + SOCK_ADDR_TEST_SENDMSG, + "sendmsg_unix: attach prog with wrong attach type", + sendmsg_unix_prog_load_raw, + sendmsg_unix_prog_destroy_raw, + BPF_CGROUP_UDP4_SENDMSG, + &user_ops, + AF_UNIX, + SOCK_DGRAM, + SERVUN_ADDRESS, + 0, + SERVUN_REWRITE_ADDRESS, + 0, + NULL, + ATTACH_REJECT, + }, /* sendmsg - kernel calls (sock_sendmsg) */ { @@ -1644,6 +1687,22 @@ static struct sock_addr_test tests[] = { SERV4_IP, SUCCESS, }, + { + SOCK_ADDR_TEST_RECVMSG, + "recvmsg4: attach prog with wrong attach type", + recvmsg4_prog_load_raw, + recvmsg4_prog_destroy_raw, + BPF_CGROUP_UDP6_RECVMSG, + &user_ops, + AF_INET, + SOCK_DGRAM, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SERV4_IP, + ATTACH_REJECT, + }, { SOCK_ADDR_TEST_RECVMSG, "recvmsg6: recvfrom (dgram)", @@ -1660,6 +1719,22 @@ static struct sock_addr_test tests[] = { SERV6_IP, SUCCESS, }, + { + SOCK_ADDR_TEST_RECVMSG, + "recvmsg6: attach prog with wrong attach type", + recvmsg6_prog_load_raw, + recvmsg6_prog_destroy_raw, + BPF_CGROUP_UDP4_RECVMSG, + &user_ops, + AF_INET6, + SOCK_DGRAM, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SERV6_IP, + ATTACH_REJECT, + }, { SOCK_ADDR_TEST_RECVMSG, "recvmsg_unix: recvfrom (dgram)", @@ -1692,6 +1767,22 @@ static struct sock_addr_test tests[] = { SERVUN_ADDRESS, SUCCESS, }, + { + SOCK_ADDR_TEST_RECVMSG, + "recvmsg_unix: attach prog with wrong attach type", + recvmsg_unix_prog_load_raw, + recvmsg_unix_prog_destroy_raw, + BPF_CGROUP_UDP4_RECVMSG, + &user_ops, + AF_INET6, + SOCK_STREAM, + SERVUN_REWRITE_ADDRESS, + 0, + SERVUN_REWRITE_ADDRESS, + 0, + SERVUN_ADDRESS, + ATTACH_REJECT, + }, /* getsockname - system calls */ { @@ -1726,6 +1817,22 @@ static struct sock_addr_test tests[] = { NULL, SUCCESS, }, + { + SOCK_ADDR_TEST_GETSOCKNAME, + "getsockname4: attach prog with wrong attach type", + getsockname_v4_prog_load_raw, + getsockname_v4_prog_destroy_raw, + BPF_CGROUP_INET6_GETSOCKNAME, + &user_ops, + AF_INET, + SOCK_DGRAM, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SERV4_IP, + SERV4_PORT, + NULL, + ATTACH_REJECT, + }, { SOCK_ADDR_TEST_GETSOCKNAME, "getsockname6: getsockname (stream)", @@ -1758,6 +1865,22 @@ static struct sock_addr_test tests[] = { NULL, SUCCESS, }, + { + SOCK_ADDR_TEST_GETSOCKNAME, + "getsockname6: attach prog with wrong attach type", + getsockname_v6_prog_load_raw, + getsockname_v6_prog_destroy_raw, + BPF_CGROUP_INET4_GETSOCKNAME, + &user_ops, + AF_INET6, + SOCK_DGRAM, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SERV6_IP, + SERV6_PORT, + NULL, + ATTACH_REJECT, + }, { SOCK_ADDR_TEST_GETSOCKNAME, "getsockname_unix: getsockname", @@ -1774,6 +1897,22 @@ static struct sock_addr_test tests[] = { NULL, SUCCESS, }, + { + SOCK_ADDR_TEST_GETSOCKNAME, + "getsockname_unix: attach prog with wrong attach type", + getsockname_unix_prog_load_raw, + getsockname_unix_prog_destroy_raw, + BPF_CGROUP_INET4_GETSOCKNAME, + &user_ops, + AF_UNIX, + SOCK_STREAM, + SERVUN_ADDRESS, + 0, + SERVUN_REWRITE_ADDRESS, + 0, + NULL, + ATTACH_REJECT, + }, /* getsockname - kernel calls */ { @@ -1890,6 +2029,22 @@ static struct sock_addr_test tests[] = { NULL, SUCCESS, }, + { + SOCK_ADDR_TEST_GETPEERNAME, + "getpeername4: attach prog with wrong attach type", + getpeername_v4_prog_load_raw, + getpeername_v4_prog_destroy_raw, + BPF_CGROUP_INET6_GETSOCKNAME, + &user_ops, + AF_UNIX, + SOCK_DGRAM, + SERV4_REWRITE_IP, + SERV4_REWRITE_PORT, + SERV4_IP, + SERV4_PORT, + NULL, + ATTACH_REJECT, + }, { SOCK_ADDR_TEST_GETPEERNAME, "getpeername6: getpeername (stream)", @@ -1922,6 +2077,22 @@ static struct sock_addr_test tests[] = { NULL, SUCCESS, }, + { + SOCK_ADDR_TEST_GETPEERNAME, + "getpeername6: attach prog with wrong attach type", + getpeername_v6_prog_load_raw, + getpeername_v6_prog_destroy_raw, + BPF_CGROUP_INET4_GETSOCKNAME, + &user_ops, + AF_INET6, + SOCK_DGRAM, + SERV6_REWRITE_IP, + SERV6_REWRITE_PORT, + SERV6_IP, + SERV6_PORT, + NULL, + ATTACH_REJECT, + }, { SOCK_ADDR_TEST_GETPEERNAME, "getpeername_unix: getpeername", @@ -1938,6 +2109,22 @@ static struct sock_addr_test tests[] = { NULL, SUCCESS, }, + { + SOCK_ADDR_TEST_GETPEERNAME, + "getpeername_unix: attach prog with wrong attach type", + getpeername_unix_prog_load_raw, + getpeername_unix_prog_destroy_raw, + BPF_CGROUP_INET4_GETSOCKNAME, + &user_ops, + AF_UNIX, + SOCK_STREAM, + SERVUN_ADDRESS, + 0, + SERVUN_REWRITE_ADDRESS, + 0, + NULL, + ATTACH_REJECT, + }, /* getpeername - kernel calls */ { From bbe91a9f6889934e661fa924144c7023f0a1c4cf Mon Sep 17 00:00:00 2001 From: Tushar Vyavahare Date: Wed, 8 May 2024 10:41:23 +0000 Subject: [PATCH 113/119] tools: remove redundant ethtool.h from tooling infra Remove the redundant ethtool.h header file from tools/include/uapi/linux. The file is unnecessary as the system uses the kernel's include/uapi/linux/ethtool.h directly. Signed-off-by: Tushar Vyavahare Acked-by: Jakub Kicinski Link: https://lore.kernel.org/r/20240508104123.434769-1-tushar.vyavahare@intel.com Signed-off-by: Alexei Starovoitov --- tools/include/uapi/linux/ethtool.h | 2271 ---------------------------- 1 file changed, 2271 deletions(-) delete mode 100644 tools/include/uapi/linux/ethtool.h diff --git a/tools/include/uapi/linux/ethtool.h b/tools/include/uapi/linux/ethtool.h deleted file mode 100644 index 11fc18988bc2..000000000000 --- a/tools/include/uapi/linux/ethtool.h +++ /dev/null @@ -1,2271 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -/* - * ethtool.h: Defines for Linux ethtool. - * - * Copyright (C) 1998 David S. Miller (davem@redhat.com) - * Copyright 2001 Jeff Garzik - * Portions Copyright 2001 Sun Microsystems (thockin@sun.com) - * Portions Copyright 2002 Intel (eli.kupermann@intel.com, - * christopher.leech@intel.com, - * scott.feldman@intel.com) - * Portions Copyright (C) Sun Microsystems 2008 - */ - -#ifndef _UAPI_LINUX_ETHTOOL_H -#define _UAPI_LINUX_ETHTOOL_H - -#include -#include -#include - -#ifndef __KERNEL__ -#include /* for INT_MAX */ -#endif - -/* All structures exposed to userland should be defined such that they - * have the same layout for 32-bit and 64-bit userland. - */ - -/* Note on reserved space. - * Reserved fields must not be accessed directly by user space because - * they may be replaced by a different field in the future. They must - * be initialized to zero before making the request, e.g. via memset - * of the entire structure or implicitly by not being set in a structure - * initializer. - */ - -/** - * struct ethtool_cmd - DEPRECATED, link control and status - * This structure is DEPRECATED, please use struct ethtool_link_settings. - * @cmd: Command number = %ETHTOOL_GSET or %ETHTOOL_SSET - * @supported: Bitmask of %SUPPORTED_* flags for the link modes, - * physical connectors and other link features for which the - * interface supports autonegotiation or auto-detection. - * Read-only. - * @advertising: Bitmask of %ADVERTISED_* flags for the link modes, - * physical connectors and other link features that are - * advertised through autonegotiation or enabled for - * auto-detection. - * @speed: Low bits of the speed, 1Mb units, 0 to INT_MAX or SPEED_UNKNOWN - * @duplex: Duplex mode; one of %DUPLEX_* - * @port: Physical connector type; one of %PORT_* - * @phy_address: MDIO address of PHY (transceiver); 0 or 255 if not - * applicable. For clause 45 PHYs this is the PRTAD. - * @transceiver: Historically used to distinguish different possible - * PHY types, but not in a consistent way. Deprecated. - * @autoneg: Enable/disable autonegotiation and auto-detection; - * either %AUTONEG_DISABLE or %AUTONEG_ENABLE - * @mdio_support: Bitmask of %ETH_MDIO_SUPPORTS_* flags for the MDIO - * protocols supported by the interface; 0 if unknown. - * Read-only. - * @maxtxpkt: Historically used to report TX IRQ coalescing; now - * obsoleted by &struct ethtool_coalesce. Read-only; deprecated. - * @maxrxpkt: Historically used to report RX IRQ coalescing; now - * obsoleted by &struct ethtool_coalesce. Read-only; deprecated. - * @speed_hi: High bits of the speed, 1Mb units, 0 to INT_MAX or SPEED_UNKNOWN - * @eth_tp_mdix: Ethernet twisted-pair MDI(-X) status; one of - * %ETH_TP_MDI_*. If the status is unknown or not applicable, the - * value will be %ETH_TP_MDI_INVALID. Read-only. - * @eth_tp_mdix_ctrl: Ethernet twisted pair MDI(-X) control; one of - * %ETH_TP_MDI_*. If MDI(-X) control is not implemented, reads - * yield %ETH_TP_MDI_INVALID and writes may be ignored or rejected. - * When written successfully, the link should be renegotiated if - * necessary. - * @lp_advertising: Bitmask of %ADVERTISED_* flags for the link modes - * and other link features that the link partner advertised - * through autonegotiation; 0 if unknown or not applicable. - * Read-only. - * @reserved: Reserved for future use; see the note on reserved space. - * - * The link speed in Mbps is split between @speed and @speed_hi. Use - * the ethtool_cmd_speed() and ethtool_cmd_speed_set() functions to - * access it. - * - * If autonegotiation is disabled, the speed and @duplex represent the - * fixed link mode and are writable if the driver supports multiple - * link modes. If it is enabled then they are read-only; if the link - * is up they represent the negotiated link mode; if the link is down, - * the speed is 0, %SPEED_UNKNOWN or the highest enabled speed and - * @duplex is %DUPLEX_UNKNOWN or the best enabled duplex mode. - * - * Some hardware interfaces may have multiple PHYs and/or physical - * connectors fitted or do not allow the driver to detect which are - * fitted. For these interfaces @port and/or @phy_address may be - * writable, possibly dependent on @autoneg being %AUTONEG_DISABLE. - * Otherwise, attempts to write different values may be ignored or - * rejected. - * - * Users should assume that all fields not marked read-only are - * writable and subject to validation by the driver. They should use - * %ETHTOOL_GSET to get the current values before making specific - * changes and then applying them with %ETHTOOL_SSET. - * - * Deprecated fields should be ignored by both users and drivers. - */ -struct ethtool_cmd { - __u32 cmd; - __u32 supported; - __u32 advertising; - __u16 speed; - __u8 duplex; - __u8 port; - __u8 phy_address; - __u8 transceiver; - __u8 autoneg; - __u8 mdio_support; - __u32 maxtxpkt; - __u32 maxrxpkt; - __u16 speed_hi; - __u8 eth_tp_mdix; - __u8 eth_tp_mdix_ctrl; - __u32 lp_advertising; - __u32 reserved[2]; -}; - -static inline void ethtool_cmd_speed_set(struct ethtool_cmd *ep, - __u32 speed) -{ - ep->speed = (__u16)(speed & 0xFFFF); - ep->speed_hi = (__u16)(speed >> 16); -} - -static inline __u32 ethtool_cmd_speed(const struct ethtool_cmd *ep) -{ - return (ep->speed_hi << 16) | ep->speed; -} - -/* Device supports clause 22 register access to PHY or peripherals - * using the interface defined in . This should not be - * set if there are known to be no such peripherals present or if - * the driver only emulates clause 22 registers for compatibility. - */ -#define ETH_MDIO_SUPPORTS_C22 1 - -/* Device supports clause 45 register access to PHY or peripherals - * using the interface defined in and . - * This should not be set if there are known to be no such peripherals - * present. - */ -#define ETH_MDIO_SUPPORTS_C45 2 - -#define ETHTOOL_FWVERS_LEN 32 -#define ETHTOOL_BUSINFO_LEN 32 -#define ETHTOOL_EROMVERS_LEN 32 - -/** - * struct ethtool_drvinfo - general driver and device information - * @cmd: Command number = %ETHTOOL_GDRVINFO - * @driver: Driver short name. This should normally match the name - * in its bus driver structure (e.g. pci_driver::name). Must - * not be an empty string. - * @version: Driver version string; may be an empty string - * @fw_version: Firmware version string; driver defined; may be an - * empty string - * @erom_version: Expansion ROM version string; driver defined; may be - * an empty string - * @bus_info: Device bus address. This should match the dev_name() - * string for the underlying bus device, if there is one. May be - * an empty string. - * @reserved2: Reserved for future use; see the note on reserved space. - * @n_priv_flags: Number of flags valid for %ETHTOOL_GPFLAGS and - * %ETHTOOL_SPFLAGS commands; also the number of strings in the - * %ETH_SS_PRIV_FLAGS set - * @n_stats: Number of u64 statistics returned by the %ETHTOOL_GSTATS - * command; also the number of strings in the %ETH_SS_STATS set - * @testinfo_len: Number of results returned by the %ETHTOOL_TEST - * command; also the number of strings in the %ETH_SS_TEST set - * @eedump_len: Size of EEPROM accessible through the %ETHTOOL_GEEPROM - * and %ETHTOOL_SEEPROM commands, in bytes - * @regdump_len: Size of register dump returned by the %ETHTOOL_GREGS - * command, in bytes - * - * Users can use the %ETHTOOL_GSSET_INFO command to get the number of - * strings in any string set (from Linux 2.6.34). - */ -struct ethtool_drvinfo { - __u32 cmd; - char driver[32]; - char version[32]; - char fw_version[ETHTOOL_FWVERS_LEN]; - char bus_info[ETHTOOL_BUSINFO_LEN]; - char erom_version[ETHTOOL_EROMVERS_LEN]; - char reserved2[12]; - __u32 n_priv_flags; - __u32 n_stats; - __u32 testinfo_len; - __u32 eedump_len; - __u32 regdump_len; -}; - -#define SOPASS_MAX 6 - -/** - * struct ethtool_wolinfo - Wake-On-Lan configuration - * @cmd: Command number = %ETHTOOL_GWOL or %ETHTOOL_SWOL - * @supported: Bitmask of %WAKE_* flags for supported Wake-On-Lan modes. - * Read-only. - * @wolopts: Bitmask of %WAKE_* flags for enabled Wake-On-Lan modes. - * @sopass: SecureOn(tm) password; meaningful only if %WAKE_MAGICSECURE - * is set in @wolopts. - */ -struct ethtool_wolinfo { - __u32 cmd; - __u32 supported; - __u32 wolopts; - __u8 sopass[SOPASS_MAX]; -}; - -/* for passing single values */ -struct ethtool_value { - __u32 cmd; - __u32 data; -}; - -#define PFC_STORM_PREVENTION_AUTO 0xffff -#define PFC_STORM_PREVENTION_DISABLE 0 - -enum tunable_id { - ETHTOOL_ID_UNSPEC, - ETHTOOL_RX_COPYBREAK, - ETHTOOL_TX_COPYBREAK, - ETHTOOL_PFC_PREVENTION_TOUT, /* timeout in msecs */ - ETHTOOL_TX_COPYBREAK_BUF_SIZE, - /* - * Add your fresh new tunable attribute above and remember to update - * tunable_strings[] in net/ethtool/common.c - */ - __ETHTOOL_TUNABLE_COUNT, -}; - -enum tunable_type_id { - ETHTOOL_TUNABLE_UNSPEC, - ETHTOOL_TUNABLE_U8, - ETHTOOL_TUNABLE_U16, - ETHTOOL_TUNABLE_U32, - ETHTOOL_TUNABLE_U64, - ETHTOOL_TUNABLE_STRING, - ETHTOOL_TUNABLE_S8, - ETHTOOL_TUNABLE_S16, - ETHTOOL_TUNABLE_S32, - ETHTOOL_TUNABLE_S64, -}; - -struct ethtool_tunable { - __u32 cmd; - __u32 id; - __u32 type_id; - __u32 len; - void *data[]; -}; - -#define DOWNSHIFT_DEV_DEFAULT_COUNT 0xff -#define DOWNSHIFT_DEV_DISABLE 0 - -/* Time in msecs after which link is reported as down - * 0 = lowest time supported by the PHY - * 0xff = off, link down detection according to standard - */ -#define ETHTOOL_PHY_FAST_LINK_DOWN_ON 0 -#define ETHTOOL_PHY_FAST_LINK_DOWN_OFF 0xff - -/* Energy Detect Power Down (EDPD) is a feature supported by some PHYs, where - * the PHY's RX & TX blocks are put into a low-power mode when there is no - * link detected (typically cable is un-plugged). For RX, only a minimal - * link-detection is available, and for TX the PHY wakes up to send link pulses - * to avoid any lock-ups in case the peer PHY may also be running in EDPD mode. - * - * Some PHYs may support configuration of the wake-up interval for TX pulses, - * and some PHYs may support only disabling TX pulses entirely. For the latter - * a special value is required (ETHTOOL_PHY_EDPD_NO_TX) so that this can be - * configured from userspace (should the user want it). - * - * The interval units for TX wake-up are in milliseconds, since this should - * cover a reasonable range of intervals: - * - from 1 millisecond, which does not sound like much of a power-saver - * - to ~65 seconds which is quite a lot to wait for a link to come up when - * plugging a cable - */ -#define ETHTOOL_PHY_EDPD_DFLT_TX_MSECS 0xffff -#define ETHTOOL_PHY_EDPD_NO_TX 0xfffe -#define ETHTOOL_PHY_EDPD_DISABLE 0 - -enum phy_tunable_id { - ETHTOOL_PHY_ID_UNSPEC, - ETHTOOL_PHY_DOWNSHIFT, - ETHTOOL_PHY_FAST_LINK_DOWN, - ETHTOOL_PHY_EDPD, - /* - * Add your fresh new phy tunable attribute above and remember to update - * phy_tunable_strings[] in net/ethtool/common.c - */ - __ETHTOOL_PHY_TUNABLE_COUNT, -}; - -/** - * struct ethtool_regs - hardware register dump - * @cmd: Command number = %ETHTOOL_GREGS - * @version: Dump format version. This is driver-specific and may - * distinguish different chips/revisions. Drivers must use new - * version numbers whenever the dump format changes in an - * incompatible way. - * @len: On entry, the real length of @data. On return, the number of - * bytes used. - * @data: Buffer for the register dump - * - * Users should use %ETHTOOL_GDRVINFO to find the maximum length of - * a register dump for the interface. They must allocate the buffer - * immediately following this structure. - */ -struct ethtool_regs { - __u32 cmd; - __u32 version; - __u32 len; - __u8 data[]; -}; - -/** - * struct ethtool_eeprom - EEPROM dump - * @cmd: Command number = %ETHTOOL_GEEPROM, %ETHTOOL_GMODULEEEPROM or - * %ETHTOOL_SEEPROM - * @magic: A 'magic cookie' value to guard against accidental changes. - * The value passed in to %ETHTOOL_SEEPROM must match the value - * returned by %ETHTOOL_GEEPROM for the same device. This is - * unused when @cmd is %ETHTOOL_GMODULEEEPROM. - * @offset: Offset within the EEPROM to begin reading/writing, in bytes - * @len: On entry, number of bytes to read/write. On successful - * return, number of bytes actually read/written. In case of - * error, this may indicate at what point the error occurred. - * @data: Buffer to read/write from - * - * Users may use %ETHTOOL_GDRVINFO or %ETHTOOL_GMODULEINFO to find - * the length of an on-board or module EEPROM, respectively. They - * must allocate the buffer immediately following this structure. - */ -struct ethtool_eeprom { - __u32 cmd; - __u32 magic; - __u32 offset; - __u32 len; - __u8 data[]; -}; - -/** - * struct ethtool_eee - Energy Efficient Ethernet information - * @cmd: ETHTOOL_{G,S}EEE - * @supported: Mask of %SUPPORTED_* flags for the speed/duplex combinations - * for which there is EEE support. - * @advertised: Mask of %ADVERTISED_* flags for the speed/duplex combinations - * advertised as eee capable. - * @lp_advertised: Mask of %ADVERTISED_* flags for the speed/duplex - * combinations advertised by the link partner as eee capable. - * @eee_active: Result of the eee auto negotiation. - * @eee_enabled: EEE configured mode (enabled/disabled). - * @tx_lpi_enabled: Whether the interface should assert its tx lpi, given - * that eee was negotiated. - * @tx_lpi_timer: Time in microseconds the interface delays prior to asserting - * its tx lpi (after reaching 'idle' state). Effective only when eee - * was negotiated and tx_lpi_enabled was set. - * @reserved: Reserved for future use; see the note on reserved space. - */ -struct ethtool_eee { - __u32 cmd; - __u32 supported; - __u32 advertised; - __u32 lp_advertised; - __u32 eee_active; - __u32 eee_enabled; - __u32 tx_lpi_enabled; - __u32 tx_lpi_timer; - __u32 reserved[2]; -}; - -/** - * struct ethtool_modinfo - plugin module eeprom information - * @cmd: %ETHTOOL_GMODULEINFO - * @type: Standard the module information conforms to %ETH_MODULE_SFF_xxxx - * @eeprom_len: Length of the eeprom - * @reserved: Reserved for future use; see the note on reserved space. - * - * This structure is used to return the information to - * properly size memory for a subsequent call to %ETHTOOL_GMODULEEEPROM. - * The type code indicates the eeprom data format - */ -struct ethtool_modinfo { - __u32 cmd; - __u32 type; - __u32 eeprom_len; - __u32 reserved[8]; -}; - -/** - * struct ethtool_coalesce - coalescing parameters for IRQs and stats updates - * @cmd: ETHTOOL_{G,S}COALESCE - * @rx_coalesce_usecs: How many usecs to delay an RX interrupt after - * a packet arrives. - * @rx_max_coalesced_frames: Maximum number of packets to receive - * before an RX interrupt. - * @rx_coalesce_usecs_irq: Same as @rx_coalesce_usecs, except that - * this value applies while an IRQ is being serviced by the host. - * @rx_max_coalesced_frames_irq: Same as @rx_max_coalesced_frames, - * except that this value applies while an IRQ is being serviced - * by the host. - * @tx_coalesce_usecs: How many usecs to delay a TX interrupt after - * a packet is sent. - * @tx_max_coalesced_frames: Maximum number of packets to be sent - * before a TX interrupt. - * @tx_coalesce_usecs_irq: Same as @tx_coalesce_usecs, except that - * this value applies while an IRQ is being serviced by the host. - * @tx_max_coalesced_frames_irq: Same as @tx_max_coalesced_frames, - * except that this value applies while an IRQ is being serviced - * by the host. - * @stats_block_coalesce_usecs: How many usecs to delay in-memory - * statistics block updates. Some drivers do not have an - * in-memory statistic block, and in such cases this value is - * ignored. This value must not be zero. - * @use_adaptive_rx_coalesce: Enable adaptive RX coalescing. - * @use_adaptive_tx_coalesce: Enable adaptive TX coalescing. - * @pkt_rate_low: Threshold for low packet rate (packets per second). - * @rx_coalesce_usecs_low: How many usecs to delay an RX interrupt after - * a packet arrives, when the packet rate is below @pkt_rate_low. - * @rx_max_coalesced_frames_low: Maximum number of packets to be received - * before an RX interrupt, when the packet rate is below @pkt_rate_low. - * @tx_coalesce_usecs_low: How many usecs to delay a TX interrupt after - * a packet is sent, when the packet rate is below @pkt_rate_low. - * @tx_max_coalesced_frames_low: Maximum nuumber of packets to be sent before - * a TX interrupt, when the packet rate is below @pkt_rate_low. - * @pkt_rate_high: Threshold for high packet rate (packets per second). - * @rx_coalesce_usecs_high: How many usecs to delay an RX interrupt after - * a packet arrives, when the packet rate is above @pkt_rate_high. - * @rx_max_coalesced_frames_high: Maximum number of packets to be received - * before an RX interrupt, when the packet rate is above @pkt_rate_high. - * @tx_coalesce_usecs_high: How many usecs to delay a TX interrupt after - * a packet is sent, when the packet rate is above @pkt_rate_high. - * @tx_max_coalesced_frames_high: Maximum number of packets to be sent before - * a TX interrupt, when the packet rate is above @pkt_rate_high. - * @rate_sample_interval: How often to do adaptive coalescing packet rate - * sampling, measured in seconds. Must not be zero. - * - * Each pair of (usecs, max_frames) fields specifies that interrupts - * should be coalesced until - * (usecs > 0 && time_since_first_completion >= usecs) || - * (max_frames > 0 && completed_frames >= max_frames) - * - * It is illegal to set both usecs and max_frames to zero as this - * would cause interrupts to never be generated. To disable - * coalescing, set usecs = 0 and max_frames = 1. - * - * Some implementations ignore the value of max_frames and use the - * condition time_since_first_completion >= usecs - * - * This is deprecated. Drivers for hardware that does not support - * counting completions should validate that max_frames == !rx_usecs. - * - * Adaptive RX/TX coalescing is an algorithm implemented by some - * drivers to improve latency under low packet rates and improve - * throughput under high packet rates. Some drivers only implement - * one of RX or TX adaptive coalescing. Anything not implemented by - * the driver causes these values to be silently ignored. - * - * When the packet rate is below @pkt_rate_high but above - * @pkt_rate_low (both measured in packets per second) the - * normal {rx,tx}_* coalescing parameters are used. - */ -struct ethtool_coalesce { - __u32 cmd; - __u32 rx_coalesce_usecs; - __u32 rx_max_coalesced_frames; - __u32 rx_coalesce_usecs_irq; - __u32 rx_max_coalesced_frames_irq; - __u32 tx_coalesce_usecs; - __u32 tx_max_coalesced_frames; - __u32 tx_coalesce_usecs_irq; - __u32 tx_max_coalesced_frames_irq; - __u32 stats_block_coalesce_usecs; - __u32 use_adaptive_rx_coalesce; - __u32 use_adaptive_tx_coalesce; - __u32 pkt_rate_low; - __u32 rx_coalesce_usecs_low; - __u32 rx_max_coalesced_frames_low; - __u32 tx_coalesce_usecs_low; - __u32 tx_max_coalesced_frames_low; - __u32 pkt_rate_high; - __u32 rx_coalesce_usecs_high; - __u32 rx_max_coalesced_frames_high; - __u32 tx_coalesce_usecs_high; - __u32 tx_max_coalesced_frames_high; - __u32 rate_sample_interval; -}; - -/** - * struct ethtool_ringparam - RX/TX ring parameters - * @cmd: Command number = %ETHTOOL_GRINGPARAM or %ETHTOOL_SRINGPARAM - * @rx_max_pending: Maximum supported number of pending entries per - * RX ring. Read-only. - * @rx_mini_max_pending: Maximum supported number of pending entries - * per RX mini ring. Read-only. - * @rx_jumbo_max_pending: Maximum supported number of pending entries - * per RX jumbo ring. Read-only. - * @tx_max_pending: Maximum supported number of pending entries per - * TX ring. Read-only. - * @rx_pending: Current maximum number of pending entries per RX ring - * @rx_mini_pending: Current maximum number of pending entries per RX - * mini ring - * @rx_jumbo_pending: Current maximum number of pending entries per RX - * jumbo ring - * @tx_pending: Current maximum supported number of pending entries - * per TX ring - * - * If the interface does not have separate RX mini and/or jumbo rings, - * @rx_mini_max_pending and/or @rx_jumbo_max_pending will be 0. - * - * There may also be driver-dependent minimum values for the number - * of entries per ring. - */ -struct ethtool_ringparam { - __u32 cmd; - __u32 rx_max_pending; - __u32 rx_mini_max_pending; - __u32 rx_jumbo_max_pending; - __u32 tx_max_pending; - __u32 rx_pending; - __u32 rx_mini_pending; - __u32 rx_jumbo_pending; - __u32 tx_pending; -}; - -/** - * struct ethtool_channels - configuring number of network channel - * @cmd: ETHTOOL_{G,S}CHANNELS - * @max_rx: Read only. Maximum number of receive channel the driver support. - * @max_tx: Read only. Maximum number of transmit channel the driver support. - * @max_other: Read only. Maximum number of other channel the driver support. - * @max_combined: Read only. Maximum number of combined channel the driver - * support. Set of queues RX, TX or other. - * @rx_count: Valid values are in the range 1 to the max_rx. - * @tx_count: Valid values are in the range 1 to the max_tx. - * @other_count: Valid values are in the range 1 to the max_other. - * @combined_count: Valid values are in the range 1 to the max_combined. - * - * This can be used to configure RX, TX and other channels. - */ - -struct ethtool_channels { - __u32 cmd; - __u32 max_rx; - __u32 max_tx; - __u32 max_other; - __u32 max_combined; - __u32 rx_count; - __u32 tx_count; - __u32 other_count; - __u32 combined_count; -}; - -/** - * struct ethtool_pauseparam - Ethernet pause (flow control) parameters - * @cmd: Command number = %ETHTOOL_GPAUSEPARAM or %ETHTOOL_SPAUSEPARAM - * @autoneg: Flag to enable autonegotiation of pause frame use - * @rx_pause: Flag to enable reception of pause frames - * @tx_pause: Flag to enable transmission of pause frames - * - * Drivers should reject a non-zero setting of @autoneg when - * autoneogotiation is disabled (or not supported) for the link. - * - * If the link is autonegotiated, drivers should use - * mii_advertise_flowctrl() or similar code to set the advertised - * pause frame capabilities based on the @rx_pause and @tx_pause flags, - * even if @autoneg is zero. They should also allow the advertised - * pause frame capabilities to be controlled directly through the - * advertising field of &struct ethtool_cmd. - * - * If @autoneg is non-zero, the MAC is configured to send and/or - * receive pause frames according to the result of autonegotiation. - * Otherwise, it is configured directly based on the @rx_pause and - * @tx_pause flags. - */ -struct ethtool_pauseparam { - __u32 cmd; - __u32 autoneg; - __u32 rx_pause; - __u32 tx_pause; -}; - -/* Link extended state */ -enum ethtool_link_ext_state { - ETHTOOL_LINK_EXT_STATE_AUTONEG, - ETHTOOL_LINK_EXT_STATE_LINK_TRAINING_FAILURE, - ETHTOOL_LINK_EXT_STATE_LINK_LOGICAL_MISMATCH, - ETHTOOL_LINK_EXT_STATE_BAD_SIGNAL_INTEGRITY, - ETHTOOL_LINK_EXT_STATE_NO_CABLE, - ETHTOOL_LINK_EXT_STATE_CABLE_ISSUE, - ETHTOOL_LINK_EXT_STATE_EEPROM_ISSUE, - ETHTOOL_LINK_EXT_STATE_CALIBRATION_FAILURE, - ETHTOOL_LINK_EXT_STATE_POWER_BUDGET_EXCEEDED, - ETHTOOL_LINK_EXT_STATE_OVERHEAT, - ETHTOOL_LINK_EXT_STATE_MODULE, -}; - -/* More information in addition to ETHTOOL_LINK_EXT_STATE_AUTONEG. */ -enum ethtool_link_ext_substate_autoneg { - ETHTOOL_LINK_EXT_SUBSTATE_AN_NO_PARTNER_DETECTED = 1, - ETHTOOL_LINK_EXT_SUBSTATE_AN_ACK_NOT_RECEIVED, - ETHTOOL_LINK_EXT_SUBSTATE_AN_NEXT_PAGE_EXCHANGE_FAILED, - ETHTOOL_LINK_EXT_SUBSTATE_AN_NO_PARTNER_DETECTED_FORCE_MODE, - ETHTOOL_LINK_EXT_SUBSTATE_AN_FEC_MISMATCH_DURING_OVERRIDE, - ETHTOOL_LINK_EXT_SUBSTATE_AN_NO_HCD, -}; - -/* More information in addition to ETHTOOL_LINK_EXT_STATE_LINK_TRAINING_FAILURE. - */ -enum ethtool_link_ext_substate_link_training { - ETHTOOL_LINK_EXT_SUBSTATE_LT_KR_FRAME_LOCK_NOT_ACQUIRED = 1, - ETHTOOL_LINK_EXT_SUBSTATE_LT_KR_LINK_INHIBIT_TIMEOUT, - ETHTOOL_LINK_EXT_SUBSTATE_LT_KR_LINK_PARTNER_DID_NOT_SET_RECEIVER_READY, - ETHTOOL_LINK_EXT_SUBSTATE_LT_REMOTE_FAULT, -}; - -/* More information in addition to ETHTOOL_LINK_EXT_STATE_LINK_LOGICAL_MISMATCH. - */ -enum ethtool_link_ext_substate_link_logical_mismatch { - ETHTOOL_LINK_EXT_SUBSTATE_LLM_PCS_DID_NOT_ACQUIRE_BLOCK_LOCK = 1, - ETHTOOL_LINK_EXT_SUBSTATE_LLM_PCS_DID_NOT_ACQUIRE_AM_LOCK, - ETHTOOL_LINK_EXT_SUBSTATE_LLM_PCS_DID_NOT_GET_ALIGN_STATUS, - ETHTOOL_LINK_EXT_SUBSTATE_LLM_FC_FEC_IS_NOT_LOCKED, - ETHTOOL_LINK_EXT_SUBSTATE_LLM_RS_FEC_IS_NOT_LOCKED, -}; - -/* More information in addition to ETHTOOL_LINK_EXT_STATE_BAD_SIGNAL_INTEGRITY. - */ -enum ethtool_link_ext_substate_bad_signal_integrity { - ETHTOOL_LINK_EXT_SUBSTATE_BSI_LARGE_NUMBER_OF_PHYSICAL_ERRORS = 1, - ETHTOOL_LINK_EXT_SUBSTATE_BSI_UNSUPPORTED_RATE, - ETHTOOL_LINK_EXT_SUBSTATE_BSI_SERDES_REFERENCE_CLOCK_LOST, - ETHTOOL_LINK_EXT_SUBSTATE_BSI_SERDES_ALOS, -}; - -/* More information in addition to ETHTOOL_LINK_EXT_STATE_CABLE_ISSUE. */ -enum ethtool_link_ext_substate_cable_issue { - ETHTOOL_LINK_EXT_SUBSTATE_CI_UNSUPPORTED_CABLE = 1, - ETHTOOL_LINK_EXT_SUBSTATE_CI_CABLE_TEST_FAILURE, -}; - -/* More information in addition to ETHTOOL_LINK_EXT_STATE_MODULE. */ -enum ethtool_link_ext_substate_module { - ETHTOOL_LINK_EXT_SUBSTATE_MODULE_CMIS_NOT_READY = 1, -}; - -#define ETH_GSTRING_LEN 32 - -/** - * enum ethtool_stringset - string set ID - * @ETH_SS_TEST: Self-test result names, for use with %ETHTOOL_TEST - * @ETH_SS_STATS: Statistic names, for use with %ETHTOOL_GSTATS - * @ETH_SS_PRIV_FLAGS: Driver private flag names, for use with - * %ETHTOOL_GPFLAGS and %ETHTOOL_SPFLAGS - * @ETH_SS_NTUPLE_FILTERS: Previously used with %ETHTOOL_GRXNTUPLE; - * now deprecated - * @ETH_SS_FEATURES: Device feature names - * @ETH_SS_RSS_HASH_FUNCS: RSS hush function names - * @ETH_SS_TUNABLES: tunable names - * @ETH_SS_PHY_STATS: Statistic names, for use with %ETHTOOL_GPHYSTATS - * @ETH_SS_PHY_TUNABLES: PHY tunable names - * @ETH_SS_LINK_MODES: link mode names - * @ETH_SS_MSG_CLASSES: debug message class names - * @ETH_SS_WOL_MODES: wake-on-lan modes - * @ETH_SS_SOF_TIMESTAMPING: SOF_TIMESTAMPING_* flags - * @ETH_SS_TS_TX_TYPES: timestamping Tx types - * @ETH_SS_TS_RX_FILTERS: timestamping Rx filters - * @ETH_SS_UDP_TUNNEL_TYPES: UDP tunnel types - * @ETH_SS_STATS_STD: standardized stats - * @ETH_SS_STATS_ETH_PHY: names of IEEE 802.3 PHY statistics - * @ETH_SS_STATS_ETH_MAC: names of IEEE 802.3 MAC statistics - * @ETH_SS_STATS_ETH_CTRL: names of IEEE 802.3 MAC Control statistics - * @ETH_SS_STATS_RMON: names of RMON statistics - * - * @ETH_SS_COUNT: number of defined string sets - */ -enum ethtool_stringset { - ETH_SS_TEST = 0, - ETH_SS_STATS, - ETH_SS_PRIV_FLAGS, - ETH_SS_NTUPLE_FILTERS, - ETH_SS_FEATURES, - ETH_SS_RSS_HASH_FUNCS, - ETH_SS_TUNABLES, - ETH_SS_PHY_STATS, - ETH_SS_PHY_TUNABLES, - ETH_SS_LINK_MODES, - ETH_SS_MSG_CLASSES, - ETH_SS_WOL_MODES, - ETH_SS_SOF_TIMESTAMPING, - ETH_SS_TS_TX_TYPES, - ETH_SS_TS_RX_FILTERS, - ETH_SS_UDP_TUNNEL_TYPES, - ETH_SS_STATS_STD, - ETH_SS_STATS_ETH_PHY, - ETH_SS_STATS_ETH_MAC, - ETH_SS_STATS_ETH_CTRL, - ETH_SS_STATS_RMON, - - /* add new constants above here */ - ETH_SS_COUNT -}; - -/** - * enum ethtool_mac_stats_src - source of ethtool MAC statistics - * @ETHTOOL_MAC_STATS_SRC_AGGREGATE: - * if device supports a MAC merge layer, this retrieves the aggregate - * statistics of the eMAC and pMAC. Otherwise, it retrieves just the - * statistics of the single (express) MAC. - * @ETHTOOL_MAC_STATS_SRC_EMAC: - * if device supports a MM layer, this retrieves the eMAC statistics. - * Otherwise, it retrieves the statistics of the single (express) MAC. - * @ETHTOOL_MAC_STATS_SRC_PMAC: - * if device supports a MM layer, this retrieves the pMAC statistics. - */ -enum ethtool_mac_stats_src { - ETHTOOL_MAC_STATS_SRC_AGGREGATE, - ETHTOOL_MAC_STATS_SRC_EMAC, - ETHTOOL_MAC_STATS_SRC_PMAC, -}; - -/** - * enum ethtool_module_power_mode_policy - plug-in module power mode policy - * @ETHTOOL_MODULE_POWER_MODE_POLICY_HIGH: Module is always in high power mode. - * @ETHTOOL_MODULE_POWER_MODE_POLICY_AUTO: Module is transitioned by the host - * to high power mode when the first port using it is put administratively - * up and to low power mode when the last port using it is put - * administratively down. - */ -enum ethtool_module_power_mode_policy { - ETHTOOL_MODULE_POWER_MODE_POLICY_HIGH = 1, - ETHTOOL_MODULE_POWER_MODE_POLICY_AUTO, -}; - -/** - * enum ethtool_module_power_mode - plug-in module power mode - * @ETHTOOL_MODULE_POWER_MODE_LOW: Module is in low power mode. - * @ETHTOOL_MODULE_POWER_MODE_HIGH: Module is in high power mode. - */ -enum ethtool_module_power_mode { - ETHTOOL_MODULE_POWER_MODE_LOW = 1, - ETHTOOL_MODULE_POWER_MODE_HIGH, -}; - -/** - * enum ethtool_podl_pse_admin_state - operational state of the PoDL PSE - * functions. IEEE 802.3-2018 30.15.1.1.2 aPoDLPSEAdminState - * @ETHTOOL_PODL_PSE_ADMIN_STATE_UNKNOWN: state of PoDL PSE functions are - * unknown - * @ETHTOOL_PODL_PSE_ADMIN_STATE_DISABLED: PoDL PSE functions are disabled - * @ETHTOOL_PODL_PSE_ADMIN_STATE_ENABLED: PoDL PSE functions are enabled - */ -enum ethtool_podl_pse_admin_state { - ETHTOOL_PODL_PSE_ADMIN_STATE_UNKNOWN = 1, - ETHTOOL_PODL_PSE_ADMIN_STATE_DISABLED, - ETHTOOL_PODL_PSE_ADMIN_STATE_ENABLED, -}; - -/** - * enum ethtool_podl_pse_pw_d_status - power detection status of the PoDL PSE. - * IEEE 802.3-2018 30.15.1.1.3 aPoDLPSEPowerDetectionStatus: - * @ETHTOOL_PODL_PSE_PW_D_STATUS_UNKNOWN: PoDL PSE - * @ETHTOOL_PODL_PSE_PW_D_STATUS_DISABLED: "The enumeration “disabled” is - * asserted true when the PoDL PSE state diagram variable mr_pse_enable is - * false" - * @ETHTOOL_PODL_PSE_PW_D_STATUS_SEARCHING: "The enumeration “searching” is - * asserted true when either of the PSE state diagram variables - * pi_detecting or pi_classifying is true." - * @ETHTOOL_PODL_PSE_PW_D_STATUS_DELIVERING: "The enumeration “deliveringPower” - * is asserted true when the PoDL PSE state diagram variable pi_powered is - * true." - * @ETHTOOL_PODL_PSE_PW_D_STATUS_SLEEP: "The enumeration “sleep” is asserted - * true when the PoDL PSE state diagram variable pi_sleeping is true." - * @ETHTOOL_PODL_PSE_PW_D_STATUS_IDLE: "The enumeration “idle” is asserted true - * when the logical combination of the PoDL PSE state diagram variables - * pi_prebiased*!pi_sleeping is true." - * @ETHTOOL_PODL_PSE_PW_D_STATUS_ERROR: "The enumeration “error” is asserted - * true when the PoDL PSE state diagram variable overload_held is true." - */ -enum ethtool_podl_pse_pw_d_status { - ETHTOOL_PODL_PSE_PW_D_STATUS_UNKNOWN = 1, - ETHTOOL_PODL_PSE_PW_D_STATUS_DISABLED, - ETHTOOL_PODL_PSE_PW_D_STATUS_SEARCHING, - ETHTOOL_PODL_PSE_PW_D_STATUS_DELIVERING, - ETHTOOL_PODL_PSE_PW_D_STATUS_SLEEP, - ETHTOOL_PODL_PSE_PW_D_STATUS_IDLE, - ETHTOOL_PODL_PSE_PW_D_STATUS_ERROR, -}; - -/** - * enum ethtool_mm_verify_status - status of MAC Merge Verify function - * @ETHTOOL_MM_VERIFY_STATUS_UNKNOWN: - * verification status is unknown - * @ETHTOOL_MM_VERIFY_STATUS_INITIAL: - * the 802.3 Verify State diagram is in the state INIT_VERIFICATION - * @ETHTOOL_MM_VERIFY_STATUS_VERIFYING: - * the Verify State diagram is in the state VERIFICATION_IDLE, - * SEND_VERIFY or WAIT_FOR_RESPONSE - * @ETHTOOL_MM_VERIFY_STATUS_SUCCEEDED: - * indicates that the Verify State diagram is in the state VERIFIED - * @ETHTOOL_MM_VERIFY_STATUS_FAILED: - * the Verify State diagram is in the state VERIFY_FAIL - * @ETHTOOL_MM_VERIFY_STATUS_DISABLED: - * verification of preemption operation is disabled - */ -enum ethtool_mm_verify_status { - ETHTOOL_MM_VERIFY_STATUS_UNKNOWN, - ETHTOOL_MM_VERIFY_STATUS_INITIAL, - ETHTOOL_MM_VERIFY_STATUS_VERIFYING, - ETHTOOL_MM_VERIFY_STATUS_SUCCEEDED, - ETHTOOL_MM_VERIFY_STATUS_FAILED, - ETHTOOL_MM_VERIFY_STATUS_DISABLED, -}; - -/** - * struct ethtool_gstrings - string set for data tagging - * @cmd: Command number = %ETHTOOL_GSTRINGS - * @string_set: String set ID; one of &enum ethtool_stringset - * @len: On return, the number of strings in the string set - * @data: Buffer for strings. Each string is null-padded to a size of - * %ETH_GSTRING_LEN. - * - * Users must use %ETHTOOL_GSSET_INFO to find the number of strings in - * the string set. They must allocate a buffer of the appropriate - * size immediately following this structure. - */ -struct ethtool_gstrings { - __u32 cmd; - __u32 string_set; - __u32 len; - __u8 data[]; -}; - -/** - * struct ethtool_sset_info - string set information - * @cmd: Command number = %ETHTOOL_GSSET_INFO - * @reserved: Reserved for future use; see the note on reserved space. - * @sset_mask: On entry, a bitmask of string sets to query, with bits - * numbered according to &enum ethtool_stringset. On return, a - * bitmask of those string sets queried that are supported. - * @data: Buffer for string set sizes. On return, this contains the - * size of each string set that was queried and supported, in - * order of ID. - * - * Example: The user passes in @sset_mask = 0x7 (sets 0, 1, 2) and on - * return @sset_mask == 0x6 (sets 1, 2). Then @data[0] contains the - * size of set 1 and @data[1] contains the size of set 2. - * - * Users must allocate a buffer of the appropriate size (4 * number of - * sets queried) immediately following this structure. - */ -struct ethtool_sset_info { - __u32 cmd; - __u32 reserved; - __u64 sset_mask; - __u32 data[]; -}; - -/** - * enum ethtool_test_flags - flags definition of ethtool_test - * @ETH_TEST_FL_OFFLINE: if set perform online and offline tests, otherwise - * only online tests. - * @ETH_TEST_FL_FAILED: Driver set this flag if test fails. - * @ETH_TEST_FL_EXTERNAL_LB: Application request to perform external loopback - * test. - * @ETH_TEST_FL_EXTERNAL_LB_DONE: Driver performed the external loopback test - */ - -enum ethtool_test_flags { - ETH_TEST_FL_OFFLINE = (1 << 0), - ETH_TEST_FL_FAILED = (1 << 1), - ETH_TEST_FL_EXTERNAL_LB = (1 << 2), - ETH_TEST_FL_EXTERNAL_LB_DONE = (1 << 3), -}; - -/** - * struct ethtool_test - device self-test invocation - * @cmd: Command number = %ETHTOOL_TEST - * @flags: A bitmask of flags from &enum ethtool_test_flags. Some - * flags may be set by the user on entry; others may be set by - * the driver on return. - * @reserved: Reserved for future use; see the note on reserved space. - * @len: On return, the number of test results - * @data: Array of test results - * - * Users must use %ETHTOOL_GSSET_INFO or %ETHTOOL_GDRVINFO to find the - * number of test results that will be returned. They must allocate a - * buffer of the appropriate size (8 * number of results) immediately - * following this structure. - */ -struct ethtool_test { - __u32 cmd; - __u32 flags; - __u32 reserved; - __u32 len; - __u64 data[]; -}; - -/** - * struct ethtool_stats - device-specific statistics - * @cmd: Command number = %ETHTOOL_GSTATS - * @n_stats: On return, the number of statistics - * @data: Array of statistics - * - * Users must use %ETHTOOL_GSSET_INFO or %ETHTOOL_GDRVINFO to find the - * number of statistics that will be returned. They must allocate a - * buffer of the appropriate size (8 * number of statistics) - * immediately following this structure. - */ -struct ethtool_stats { - __u32 cmd; - __u32 n_stats; - __u64 data[]; -}; - -/** - * struct ethtool_perm_addr - permanent hardware address - * @cmd: Command number = %ETHTOOL_GPERMADDR - * @size: On entry, the size of the buffer. On return, the size of the - * address. The command fails if the buffer is too small. - * @data: Buffer for the address - * - * Users must allocate the buffer immediately following this structure. - * A buffer size of %MAX_ADDR_LEN should be sufficient for any address - * type. - */ -struct ethtool_perm_addr { - __u32 cmd; - __u32 size; - __u8 data[]; -}; - -/* boolean flags controlling per-interface behavior characteristics. - * When reading, the flag indicates whether or not a certain behavior - * is enabled/present. When writing, the flag indicates whether - * or not the driver should turn on (set) or off (clear) a behavior. - * - * Some behaviors may read-only (unconditionally absent or present). - * If such is the case, return EINVAL in the set-flags operation if the - * flag differs from the read-only value. - */ -enum ethtool_flags { - ETH_FLAG_TXVLAN = (1 << 7), /* TX VLAN offload enabled */ - ETH_FLAG_RXVLAN = (1 << 8), /* RX VLAN offload enabled */ - ETH_FLAG_LRO = (1 << 15), /* LRO is enabled */ - ETH_FLAG_NTUPLE = (1 << 27), /* N-tuple filters enabled */ - ETH_FLAG_RXHASH = (1 << 28), -}; - -/* The following structures are for supporting RX network flow - * classification and RX n-tuple configuration. Note, all multibyte - * fields, e.g., ip4src, ip4dst, psrc, pdst, spi, etc. are expected to - * be in network byte order. - */ - -/** - * struct ethtool_tcpip4_spec - flow specification for TCP/IPv4 etc. - * @ip4src: Source host - * @ip4dst: Destination host - * @psrc: Source port - * @pdst: Destination port - * @tos: Type-of-service - * - * This can be used to specify a TCP/IPv4, UDP/IPv4 or SCTP/IPv4 flow. - */ -struct ethtool_tcpip4_spec { - __be32 ip4src; - __be32 ip4dst; - __be16 psrc; - __be16 pdst; - __u8 tos; -}; - -/** - * struct ethtool_ah_espip4_spec - flow specification for IPsec/IPv4 - * @ip4src: Source host - * @ip4dst: Destination host - * @spi: Security parameters index - * @tos: Type-of-service - * - * This can be used to specify an IPsec transport or tunnel over IPv4. - */ -struct ethtool_ah_espip4_spec { - __be32 ip4src; - __be32 ip4dst; - __be32 spi; - __u8 tos; -}; - -#define ETH_RX_NFC_IP4 1 - -/** - * struct ethtool_usrip4_spec - general flow specification for IPv4 - * @ip4src: Source host - * @ip4dst: Destination host - * @l4_4_bytes: First 4 bytes of transport (layer 4) header - * @tos: Type-of-service - * @ip_ver: Value must be %ETH_RX_NFC_IP4; mask must be 0 - * @proto: Transport protocol number; mask must be 0 - */ -struct ethtool_usrip4_spec { - __be32 ip4src; - __be32 ip4dst; - __be32 l4_4_bytes; - __u8 tos; - __u8 ip_ver; - __u8 proto; -}; - -/** - * struct ethtool_tcpip6_spec - flow specification for TCP/IPv6 etc. - * @ip6src: Source host - * @ip6dst: Destination host - * @psrc: Source port - * @pdst: Destination port - * @tclass: Traffic Class - * - * This can be used to specify a TCP/IPv6, UDP/IPv6 or SCTP/IPv6 flow. - */ -struct ethtool_tcpip6_spec { - __be32 ip6src[4]; - __be32 ip6dst[4]; - __be16 psrc; - __be16 pdst; - __u8 tclass; -}; - -/** - * struct ethtool_ah_espip6_spec - flow specification for IPsec/IPv6 - * @ip6src: Source host - * @ip6dst: Destination host - * @spi: Security parameters index - * @tclass: Traffic Class - * - * This can be used to specify an IPsec transport or tunnel over IPv6. - */ -struct ethtool_ah_espip6_spec { - __be32 ip6src[4]; - __be32 ip6dst[4]; - __be32 spi; - __u8 tclass; -}; - -/** - * struct ethtool_usrip6_spec - general flow specification for IPv6 - * @ip6src: Source host - * @ip6dst: Destination host - * @l4_4_bytes: First 4 bytes of transport (layer 4) header - * @tclass: Traffic Class - * @l4_proto: Transport protocol number (nexthdr after any Extension Headers) - */ -struct ethtool_usrip6_spec { - __be32 ip6src[4]; - __be32 ip6dst[4]; - __be32 l4_4_bytes; - __u8 tclass; - __u8 l4_proto; -}; - -union ethtool_flow_union { - struct ethtool_tcpip4_spec tcp_ip4_spec; - struct ethtool_tcpip4_spec udp_ip4_spec; - struct ethtool_tcpip4_spec sctp_ip4_spec; - struct ethtool_ah_espip4_spec ah_ip4_spec; - struct ethtool_ah_espip4_spec esp_ip4_spec; - struct ethtool_usrip4_spec usr_ip4_spec; - struct ethtool_tcpip6_spec tcp_ip6_spec; - struct ethtool_tcpip6_spec udp_ip6_spec; - struct ethtool_tcpip6_spec sctp_ip6_spec; - struct ethtool_ah_espip6_spec ah_ip6_spec; - struct ethtool_ah_espip6_spec esp_ip6_spec; - struct ethtool_usrip6_spec usr_ip6_spec; - struct ethhdr ether_spec; - __u8 hdata[52]; -}; - -/** - * struct ethtool_flow_ext - additional RX flow fields - * @h_dest: destination MAC address - * @vlan_etype: VLAN EtherType - * @vlan_tci: VLAN tag control information - * @data: user defined data - * @padding: Reserved for future use; see the note on reserved space. - * - * Note, @vlan_etype, @vlan_tci, and @data are only valid if %FLOW_EXT - * is set in &struct ethtool_rx_flow_spec @flow_type. - * @h_dest is valid if %FLOW_MAC_EXT is set. - */ -struct ethtool_flow_ext { - __u8 padding[2]; - unsigned char h_dest[ETH_ALEN]; - __be16 vlan_etype; - __be16 vlan_tci; - __be32 data[2]; -}; - -/** - * struct ethtool_rx_flow_spec - classification rule for RX flows - * @flow_type: Type of match to perform, e.g. %TCP_V4_FLOW - * @h_u: Flow fields to match (dependent on @flow_type) - * @h_ext: Additional fields to match - * @m_u: Masks for flow field bits to be matched - * @m_ext: Masks for additional field bits to be matched - * Note, all additional fields must be ignored unless @flow_type - * includes the %FLOW_EXT or %FLOW_MAC_EXT flag - * (see &struct ethtool_flow_ext description). - * @ring_cookie: RX ring/queue index to deliver to, or %RX_CLS_FLOW_DISC - * if packets should be discarded, or %RX_CLS_FLOW_WAKE if the - * packets should be used for Wake-on-LAN with %WAKE_FILTER - * @location: Location of rule in the table. Locations must be - * numbered such that a flow matching multiple rules will be - * classified according to the first (lowest numbered) rule. - */ -struct ethtool_rx_flow_spec { - __u32 flow_type; - union ethtool_flow_union h_u; - struct ethtool_flow_ext h_ext; - union ethtool_flow_union m_u; - struct ethtool_flow_ext m_ext; - __u64 ring_cookie; - __u32 location; -}; - -/* How rings are laid out when accessing virtual functions or - * offloaded queues is device specific. To allow users to do flow - * steering and specify these queues the ring cookie is partitioned - * into a 32bit queue index with an 8 bit virtual function id. - * This also leaves the 3bytes for further specifiers. It is possible - * future devices may support more than 256 virtual functions if - * devices start supporting PCIe w/ARI. However at the moment I - * do not know of any devices that support this so I do not reserve - * space for this at this time. If a future patch consumes the next - * byte it should be aware of this possibility. - */ -#define ETHTOOL_RX_FLOW_SPEC_RING 0x00000000FFFFFFFFLL -#define ETHTOOL_RX_FLOW_SPEC_RING_VF 0x000000FF00000000LL -#define ETHTOOL_RX_FLOW_SPEC_RING_VF_OFF 32 -static inline __u64 ethtool_get_flow_spec_ring(__u64 ring_cookie) -{ - return ETHTOOL_RX_FLOW_SPEC_RING & ring_cookie; -} - -static inline __u64 ethtool_get_flow_spec_ring_vf(__u64 ring_cookie) -{ - return (ETHTOOL_RX_FLOW_SPEC_RING_VF & ring_cookie) >> - ETHTOOL_RX_FLOW_SPEC_RING_VF_OFF; -} - -/** - * struct ethtool_rxnfc - command to get or set RX flow classification rules - * @cmd: Specific command number - %ETHTOOL_GRXFH, %ETHTOOL_SRXFH, - * %ETHTOOL_GRXRINGS, %ETHTOOL_GRXCLSRLCNT, %ETHTOOL_GRXCLSRULE, - * %ETHTOOL_GRXCLSRLALL, %ETHTOOL_SRXCLSRLDEL or %ETHTOOL_SRXCLSRLINS - * @flow_type: Type of flow to be affected, e.g. %TCP_V4_FLOW - * @data: Command-dependent value - * @fs: Flow classification rule - * @rss_context: RSS context to be affected - * @rule_cnt: Number of rules to be affected - * @rule_locs: Array of used rule locations - * - * For %ETHTOOL_GRXFH and %ETHTOOL_SRXFH, @data is a bitmask indicating - * the fields included in the flow hash, e.g. %RXH_IP_SRC. The following - * structure fields must not be used, except that if @flow_type includes - * the %FLOW_RSS flag, then @rss_context determines which RSS context to - * act on. - * - * For %ETHTOOL_GRXRINGS, @data is set to the number of RX rings/queues - * on return. - * - * For %ETHTOOL_GRXCLSRLCNT, @rule_cnt is set to the number of defined - * rules on return. If @data is non-zero on return then it is the - * size of the rule table, plus the flag %RX_CLS_LOC_SPECIAL if the - * driver supports any special location values. If that flag is not - * set in @data then special location values should not be used. - * - * For %ETHTOOL_GRXCLSRULE, @fs.@location specifies the location of an - * existing rule on entry and @fs contains the rule on return; if - * @fs.@flow_type includes the %FLOW_RSS flag, then @rss_context is - * filled with the RSS context ID associated with the rule. - * - * For %ETHTOOL_GRXCLSRLALL, @rule_cnt specifies the array size of the - * user buffer for @rule_locs on entry. On return, @data is the size - * of the rule table, @rule_cnt is the number of defined rules, and - * @rule_locs contains the locations of the defined rules. Drivers - * must use the second parameter to get_rxnfc() instead of @rule_locs. - * - * For %ETHTOOL_SRXCLSRLINS, @fs specifies the rule to add or update. - * @fs.@location either specifies the location to use or is a special - * location value with %RX_CLS_LOC_SPECIAL flag set. On return, - * @fs.@location is the actual rule location. If @fs.@flow_type - * includes the %FLOW_RSS flag, @rss_context is the RSS context ID to - * use for flow spreading traffic which matches this rule. The value - * from the rxfh indirection table will be added to @fs.@ring_cookie - * to choose which ring to deliver to. - * - * For %ETHTOOL_SRXCLSRLDEL, @fs.@location specifies the location of an - * existing rule on entry. - * - * A driver supporting the special location values for - * %ETHTOOL_SRXCLSRLINS may add the rule at any suitable unused - * location, and may remove a rule at a later location (lower - * priority) that matches exactly the same set of flows. The special - * values are %RX_CLS_LOC_ANY, selecting any location; - * %RX_CLS_LOC_FIRST, selecting the first suitable location (maximum - * priority); and %RX_CLS_LOC_LAST, selecting the last suitable - * location (minimum priority). Additional special values may be - * defined in future and drivers must return -%EINVAL for any - * unrecognised value. - */ -struct ethtool_rxnfc { - __u32 cmd; - __u32 flow_type; - __u64 data; - struct ethtool_rx_flow_spec fs; - union { - __u32 rule_cnt; - __u32 rss_context; - }; - __u32 rule_locs[]; -}; - - -/** - * struct ethtool_rxfh_indir - command to get or set RX flow hash indirection - * @cmd: Specific command number - %ETHTOOL_GRXFHINDIR or %ETHTOOL_SRXFHINDIR - * @size: On entry, the array size of the user buffer, which may be zero. - * On return from %ETHTOOL_GRXFHINDIR, the array size of the hardware - * indirection table. - * @ring_index: RX ring/queue index for each hash value - * - * For %ETHTOOL_GRXFHINDIR, a @size of zero means that only the size - * should be returned. For %ETHTOOL_SRXFHINDIR, a @size of zero means - * the table should be reset to default values. This last feature - * is not supported by the original implementations. - */ -struct ethtool_rxfh_indir { - __u32 cmd; - __u32 size; - __u32 ring_index[]; -}; - -/** - * struct ethtool_rxfh - command to get/set RX flow hash indir or/and hash key. - * @cmd: Specific command number - %ETHTOOL_GRSSH or %ETHTOOL_SRSSH - * @rss_context: RSS context identifier. Context 0 is the default for normal - * traffic; other contexts can be referenced as the destination for RX flow - * classification rules. %ETH_RXFH_CONTEXT_ALLOC is used with command - * %ETHTOOL_SRSSH to allocate a new RSS context; on return this field will - * contain the ID of the newly allocated context. - * @indir_size: On entry, the array size of the user buffer for the - * indirection table, which may be zero, or (for %ETHTOOL_SRSSH), - * %ETH_RXFH_INDIR_NO_CHANGE. On return from %ETHTOOL_GRSSH, - * the array size of the hardware indirection table. - * @key_size: On entry, the array size of the user buffer for the hash key, - * which may be zero. On return from %ETHTOOL_GRSSH, the size of the - * hardware hash key. - * @hfunc: Defines the current RSS hash function used by HW (or to be set to). - * Valid values are one of the %ETH_RSS_HASH_*. - * @input_xfrm: Defines how the input data is transformed. Valid values are one - * of %RXH_XFRM_*. - * @rsvd8: Reserved for future use; see the note on reserved space. - * @rsvd32: Reserved for future use; see the note on reserved space. - * @rss_config: RX ring/queue index for each hash value i.e., indirection table - * of @indir_size __u32 elements, followed by hash key of @key_size - * bytes. - * - * For %ETHTOOL_GRSSH, a @indir_size and key_size of zero means that only the - * size should be returned. For %ETHTOOL_SRSSH, an @indir_size of - * %ETH_RXFH_INDIR_NO_CHANGE means that indir table setting is not requested - * and a @indir_size of zero means the indir table should be reset to default - * values (if @rss_context == 0) or that the RSS context should be deleted. - * An hfunc of zero means that hash function setting is not requested. - */ -struct ethtool_rxfh { - __u32 cmd; - __u32 rss_context; - __u32 indir_size; - __u32 key_size; - __u8 hfunc; - __u8 input_xfrm; - __u8 rsvd8[2]; - __u32 rsvd32; - __u32 rss_config[]; -}; -#define ETH_RXFH_CONTEXT_ALLOC 0xffffffff -#define ETH_RXFH_INDIR_NO_CHANGE 0xffffffff - -/** - * struct ethtool_rx_ntuple_flow_spec - specification for RX flow filter - * @flow_type: Type of match to perform, e.g. %TCP_V4_FLOW - * @h_u: Flow field values to match (dependent on @flow_type) - * @m_u: Masks for flow field value bits to be ignored - * @vlan_tag: VLAN tag to match - * @vlan_tag_mask: Mask for VLAN tag bits to be ignored - * @data: Driver-dependent data to match - * @data_mask: Mask for driver-dependent data bits to be ignored - * @action: RX ring/queue index to deliver to (non-negative) or other action - * (negative, e.g. %ETHTOOL_RXNTUPLE_ACTION_DROP) - * - * For flow types %TCP_V4_FLOW, %UDP_V4_FLOW and %SCTP_V4_FLOW, where - * a field value and mask are both zero this is treated as if all mask - * bits are set i.e. the field is ignored. - */ -struct ethtool_rx_ntuple_flow_spec { - __u32 flow_type; - union { - struct ethtool_tcpip4_spec tcp_ip4_spec; - struct ethtool_tcpip4_spec udp_ip4_spec; - struct ethtool_tcpip4_spec sctp_ip4_spec; - struct ethtool_ah_espip4_spec ah_ip4_spec; - struct ethtool_ah_espip4_spec esp_ip4_spec; - struct ethtool_usrip4_spec usr_ip4_spec; - struct ethhdr ether_spec; - __u8 hdata[72]; - } h_u, m_u; - - __u16 vlan_tag; - __u16 vlan_tag_mask; - __u64 data; - __u64 data_mask; - - __s32 action; -#define ETHTOOL_RXNTUPLE_ACTION_DROP (-1) /* drop packet */ -#define ETHTOOL_RXNTUPLE_ACTION_CLEAR (-2) /* clear filter */ -}; - -/** - * struct ethtool_rx_ntuple - command to set or clear RX flow filter - * @cmd: Command number - %ETHTOOL_SRXNTUPLE - * @fs: Flow filter specification - */ -struct ethtool_rx_ntuple { - __u32 cmd; - struct ethtool_rx_ntuple_flow_spec fs; -}; - -#define ETHTOOL_FLASH_MAX_FILENAME 128 -enum ethtool_flash_op_type { - ETHTOOL_FLASH_ALL_REGIONS = 0, -}; - -/* for passing firmware flashing related parameters */ -struct ethtool_flash { - __u32 cmd; - __u32 region; - char data[ETHTOOL_FLASH_MAX_FILENAME]; -}; - -/** - * struct ethtool_dump - used for retrieving, setting device dump - * @cmd: Command number - %ETHTOOL_GET_DUMP_FLAG, %ETHTOOL_GET_DUMP_DATA, or - * %ETHTOOL_SET_DUMP - * @version: FW version of the dump, filled in by driver - * @flag: driver dependent flag for dump setting, filled in by driver during - * get and filled in by ethtool for set operation. - * flag must be initialized by macro ETH_FW_DUMP_DISABLE value when - * firmware dump is disabled. - * @len: length of dump data, used as the length of the user buffer on entry to - * %ETHTOOL_GET_DUMP_DATA and this is returned as dump length by driver - * for %ETHTOOL_GET_DUMP_FLAG command - * @data: data collected for get dump data operation - */ -struct ethtool_dump { - __u32 cmd; - __u32 version; - __u32 flag; - __u32 len; - __u8 data[]; -}; - -#define ETH_FW_DUMP_DISABLE 0 - -/* for returning and changing feature sets */ - -/** - * struct ethtool_get_features_block - block with state of 32 features - * @available: mask of changeable features - * @requested: mask of features requested to be enabled if possible - * @active: mask of currently enabled features - * @never_changed: mask of features not changeable for any device - */ -struct ethtool_get_features_block { - __u32 available; - __u32 requested; - __u32 active; - __u32 never_changed; -}; - -/** - * struct ethtool_gfeatures - command to get state of device's features - * @cmd: command number = %ETHTOOL_GFEATURES - * @size: On entry, the number of elements in the features[] array; - * on return, the number of elements in features[] needed to hold - * all features - * @features: state of features - */ -struct ethtool_gfeatures { - __u32 cmd; - __u32 size; - struct ethtool_get_features_block features[]; -}; - -/** - * struct ethtool_set_features_block - block with request for 32 features - * @valid: mask of features to be changed - * @requested: values of features to be changed - */ -struct ethtool_set_features_block { - __u32 valid; - __u32 requested; -}; - -/** - * struct ethtool_sfeatures - command to request change in device's features - * @cmd: command number = %ETHTOOL_SFEATURES - * @size: array size of the features[] array - * @features: feature change masks - */ -struct ethtool_sfeatures { - __u32 cmd; - __u32 size; - struct ethtool_set_features_block features[]; -}; - -/** - * struct ethtool_ts_info - holds a device's timestamping and PHC association - * @cmd: command number = %ETHTOOL_GET_TS_INFO - * @so_timestamping: bit mask of the sum of the supported SO_TIMESTAMPING flags - * @phc_index: device index of the associated PHC, or -1 if there is none - * @tx_types: bit mask of the supported hwtstamp_tx_types enumeration values - * @tx_reserved: Reserved for future use; see the note on reserved space. - * @rx_filters: bit mask of the supported hwtstamp_rx_filters enumeration values - * @rx_reserved: Reserved for future use; see the note on reserved space. - * - * The bits in the 'tx_types' and 'rx_filters' fields correspond to - * the 'hwtstamp_tx_types' and 'hwtstamp_rx_filters' enumeration values, - * respectively. For example, if the device supports HWTSTAMP_TX_ON, - * then (1 << HWTSTAMP_TX_ON) in 'tx_types' will be set. - * - * Drivers should only report the filters they actually support without - * upscaling in the SIOCSHWTSTAMP ioctl. If the SIOCSHWSTAMP request for - * HWTSTAMP_FILTER_V1_SYNC is supported by HWTSTAMP_FILTER_V1_EVENT, then the - * driver should only report HWTSTAMP_FILTER_V1_EVENT in this op. - */ -struct ethtool_ts_info { - __u32 cmd; - __u32 so_timestamping; - __s32 phc_index; - __u32 tx_types; - __u32 tx_reserved[3]; - __u32 rx_filters; - __u32 rx_reserved[3]; -}; - -/* - * %ETHTOOL_SFEATURES changes features present in features[].valid to the - * values of corresponding bits in features[].requested. Bits in .requested - * not set in .valid or not changeable are ignored. - * - * Returns %EINVAL when .valid contains undefined or never-changeable bits - * or size is not equal to required number of features words (32-bit blocks). - * Returns >= 0 if request was completed; bits set in the value mean: - * %ETHTOOL_F_UNSUPPORTED - there were bits set in .valid that are not - * changeable (not present in %ETHTOOL_GFEATURES' features[].available) - * those bits were ignored. - * %ETHTOOL_F_WISH - some or all changes requested were recorded but the - * resulting state of bits masked by .valid is not equal to .requested. - * Probably there are other device-specific constraints on some features - * in the set. When %ETHTOOL_F_UNSUPPORTED is set, .valid is considered - * here as though ignored bits were cleared. - * %ETHTOOL_F_COMPAT - some or all changes requested were made by calling - * compatibility functions. Requested offload state cannot be properly - * managed by kernel. - * - * Meaning of bits in the masks are obtained by %ETHTOOL_GSSET_INFO (number of - * bits in the arrays - always multiple of 32) and %ETHTOOL_GSTRINGS commands - * for ETH_SS_FEATURES string set. First entry in the table corresponds to least - * significant bit in features[0] fields. Empty strings mark undefined features. - */ -enum ethtool_sfeatures_retval_bits { - ETHTOOL_F_UNSUPPORTED__BIT, - ETHTOOL_F_WISH__BIT, - ETHTOOL_F_COMPAT__BIT, -}; - -#define ETHTOOL_F_UNSUPPORTED (1 << ETHTOOL_F_UNSUPPORTED__BIT) -#define ETHTOOL_F_WISH (1 << ETHTOOL_F_WISH__BIT) -#define ETHTOOL_F_COMPAT (1 << ETHTOOL_F_COMPAT__BIT) - -#define MAX_NUM_QUEUE 4096 - -/** - * struct ethtool_per_queue_op - apply sub command to the queues in mask. - * @cmd: ETHTOOL_PERQUEUE - * @sub_command: the sub command which apply to each queues - * @queue_mask: Bitmap of the queues which sub command apply to - * @data: A complete command structure following for each of the queues addressed - */ -struct ethtool_per_queue_op { - __u32 cmd; - __u32 sub_command; - __u32 queue_mask[__KERNEL_DIV_ROUND_UP(MAX_NUM_QUEUE, 32)]; - char data[]; -}; - -/** - * struct ethtool_fecparam - Ethernet Forward Error Correction parameters - * @cmd: Command number = %ETHTOOL_GFECPARAM or %ETHTOOL_SFECPARAM - * @active_fec: FEC mode which is active on the port, single bit set, GET only. - * @fec: Bitmask of configured FEC modes. - * @reserved: Reserved for future extensions, ignore on GET, write 0 for SET. - * - * Note that @reserved was never validated on input and ethtool user space - * left it uninitialized when calling SET. Hence going forward it can only be - * used to return a value to userspace with GET. - * - * FEC modes supported by the device can be read via %ETHTOOL_GLINKSETTINGS. - * FEC settings are configured by link autonegotiation whenever it's enabled. - * With autoneg on %ETHTOOL_GFECPARAM can be used to read the current mode. - * - * When autoneg is disabled %ETHTOOL_SFECPARAM controls the FEC settings. - * It is recommended that drivers only accept a single bit set in @fec. - * When multiple bits are set in @fec drivers may pick mode in an implementation - * dependent way. Drivers should reject mixing %ETHTOOL_FEC_AUTO_BIT with other - * FEC modes, because it's unclear whether in this case other modes constrain - * AUTO or are independent choices. - * Drivers must reject SET requests if they support none of the requested modes. - * - * If device does not support FEC drivers may use %ETHTOOL_FEC_NONE instead - * of returning %EOPNOTSUPP from %ETHTOOL_GFECPARAM. - * - * See enum ethtool_fec_config_bits for definition of valid bits for both - * @fec and @active_fec. - */ -struct ethtool_fecparam { - __u32 cmd; - /* bitmask of FEC modes */ - __u32 active_fec; - __u32 fec; - __u32 reserved; -}; - -/** - * enum ethtool_fec_config_bits - flags definition of ethtool_fec_configuration - * @ETHTOOL_FEC_NONE_BIT: FEC mode configuration is not supported. Should not - * be used together with other bits. GET only. - * @ETHTOOL_FEC_AUTO_BIT: Select default/best FEC mode automatically, usually - * based link mode and SFP parameters read from module's - * EEPROM. This bit does _not_ mean autonegotiation. - * @ETHTOOL_FEC_OFF_BIT: No FEC Mode - * @ETHTOOL_FEC_RS_BIT: Reed-Solomon FEC Mode - * @ETHTOOL_FEC_BASER_BIT: Base-R/Reed-Solomon FEC Mode - * @ETHTOOL_FEC_LLRS_BIT: Low Latency Reed Solomon FEC Mode (25G/50G Ethernet - * Consortium) - */ -enum ethtool_fec_config_bits { - ETHTOOL_FEC_NONE_BIT, - ETHTOOL_FEC_AUTO_BIT, - ETHTOOL_FEC_OFF_BIT, - ETHTOOL_FEC_RS_BIT, - ETHTOOL_FEC_BASER_BIT, - ETHTOOL_FEC_LLRS_BIT, -}; - -#define ETHTOOL_FEC_NONE (1 << ETHTOOL_FEC_NONE_BIT) -#define ETHTOOL_FEC_AUTO (1 << ETHTOOL_FEC_AUTO_BIT) -#define ETHTOOL_FEC_OFF (1 << ETHTOOL_FEC_OFF_BIT) -#define ETHTOOL_FEC_RS (1 << ETHTOOL_FEC_RS_BIT) -#define ETHTOOL_FEC_BASER (1 << ETHTOOL_FEC_BASER_BIT) -#define ETHTOOL_FEC_LLRS (1 << ETHTOOL_FEC_LLRS_BIT) - -/* CMDs currently supported */ -#define ETHTOOL_GSET 0x00000001 /* DEPRECATED, Get settings. - * Please use ETHTOOL_GLINKSETTINGS - */ -#define ETHTOOL_SSET 0x00000002 /* DEPRECATED, Set settings. - * Please use ETHTOOL_SLINKSETTINGS - */ -#define ETHTOOL_GDRVINFO 0x00000003 /* Get driver info. */ -#define ETHTOOL_GREGS 0x00000004 /* Get NIC registers. */ -#define ETHTOOL_GWOL 0x00000005 /* Get wake-on-lan options. */ -#define ETHTOOL_SWOL 0x00000006 /* Set wake-on-lan options. */ -#define ETHTOOL_GMSGLVL 0x00000007 /* Get driver message level */ -#define ETHTOOL_SMSGLVL 0x00000008 /* Set driver msg level. */ -#define ETHTOOL_NWAY_RST 0x00000009 /* Restart autonegotiation. */ -/* Get link status for host, i.e. whether the interface *and* the - * physical port (if there is one) are up (ethtool_value). */ -#define ETHTOOL_GLINK 0x0000000a -#define ETHTOOL_GEEPROM 0x0000000b /* Get EEPROM data */ -#define ETHTOOL_SEEPROM 0x0000000c /* Set EEPROM data. */ -#define ETHTOOL_GCOALESCE 0x0000000e /* Get coalesce config */ -#define ETHTOOL_SCOALESCE 0x0000000f /* Set coalesce config. */ -#define ETHTOOL_GRINGPARAM 0x00000010 /* Get ring parameters */ -#define ETHTOOL_SRINGPARAM 0x00000011 /* Set ring parameters. */ -#define ETHTOOL_GPAUSEPARAM 0x00000012 /* Get pause parameters */ -#define ETHTOOL_SPAUSEPARAM 0x00000013 /* Set pause parameters. */ -#define ETHTOOL_GRXCSUM 0x00000014 /* Get RX hw csum enable (ethtool_value) */ -#define ETHTOOL_SRXCSUM 0x00000015 /* Set RX hw csum enable (ethtool_value) */ -#define ETHTOOL_GTXCSUM 0x00000016 /* Get TX hw csum enable (ethtool_value) */ -#define ETHTOOL_STXCSUM 0x00000017 /* Set TX hw csum enable (ethtool_value) */ -#define ETHTOOL_GSG 0x00000018 /* Get scatter-gather enable - * (ethtool_value) */ -#define ETHTOOL_SSG 0x00000019 /* Set scatter-gather enable - * (ethtool_value). */ -#define ETHTOOL_TEST 0x0000001a /* execute NIC self-test. */ -#define ETHTOOL_GSTRINGS 0x0000001b /* get specified string set */ -#define ETHTOOL_PHYS_ID 0x0000001c /* identify the NIC */ -#define ETHTOOL_GSTATS 0x0000001d /* get NIC-specific statistics */ -#define ETHTOOL_GTSO 0x0000001e /* Get TSO enable (ethtool_value) */ -#define ETHTOOL_STSO 0x0000001f /* Set TSO enable (ethtool_value) */ -#define ETHTOOL_GPERMADDR 0x00000020 /* Get permanent hardware address */ -#define ETHTOOL_GUFO 0x00000021 /* Get UFO enable (ethtool_value) */ -#define ETHTOOL_SUFO 0x00000022 /* Set UFO enable (ethtool_value) */ -#define ETHTOOL_GGSO 0x00000023 /* Get GSO enable (ethtool_value) */ -#define ETHTOOL_SGSO 0x00000024 /* Set GSO enable (ethtool_value) */ -#define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */ -#define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */ -#define ETHTOOL_GPFLAGS 0x00000027 /* Get driver-private flags bitmap */ -#define ETHTOOL_SPFLAGS 0x00000028 /* Set driver-private flags bitmap */ - -#define ETHTOOL_GRXFH 0x00000029 /* Get RX flow hash configuration */ -#define ETHTOOL_SRXFH 0x0000002a /* Set RX flow hash configuration */ -#define ETHTOOL_GGRO 0x0000002b /* Get GRO enable (ethtool_value) */ -#define ETHTOOL_SGRO 0x0000002c /* Set GRO enable (ethtool_value) */ -#define ETHTOOL_GRXRINGS 0x0000002d /* Get RX rings available for LB */ -#define ETHTOOL_GRXCLSRLCNT 0x0000002e /* Get RX class rule count */ -#define ETHTOOL_GRXCLSRULE 0x0000002f /* Get RX classification rule */ -#define ETHTOOL_GRXCLSRLALL 0x00000030 /* Get all RX classification rule */ -#define ETHTOOL_SRXCLSRLDEL 0x00000031 /* Delete RX classification rule */ -#define ETHTOOL_SRXCLSRLINS 0x00000032 /* Insert RX classification rule */ -#define ETHTOOL_FLASHDEV 0x00000033 /* Flash firmware to device */ -#define ETHTOOL_RESET 0x00000034 /* Reset hardware */ -#define ETHTOOL_SRXNTUPLE 0x00000035 /* Add an n-tuple filter to device */ -#define ETHTOOL_GRXNTUPLE 0x00000036 /* deprecated */ -#define ETHTOOL_GSSET_INFO 0x00000037 /* Get string set info */ -#define ETHTOOL_GRXFHINDIR 0x00000038 /* Get RX flow hash indir'n table */ -#define ETHTOOL_SRXFHINDIR 0x00000039 /* Set RX flow hash indir'n table */ - -#define ETHTOOL_GFEATURES 0x0000003a /* Get device offload settings */ -#define ETHTOOL_SFEATURES 0x0000003b /* Change device offload settings */ -#define ETHTOOL_GCHANNELS 0x0000003c /* Get no of channels */ -#define ETHTOOL_SCHANNELS 0x0000003d /* Set no of channels */ -#define ETHTOOL_SET_DUMP 0x0000003e /* Set dump settings */ -#define ETHTOOL_GET_DUMP_FLAG 0x0000003f /* Get dump settings */ -#define ETHTOOL_GET_DUMP_DATA 0x00000040 /* Get dump data */ -#define ETHTOOL_GET_TS_INFO 0x00000041 /* Get time stamping and PHC info */ -#define ETHTOOL_GMODULEINFO 0x00000042 /* Get plug-in module information */ -#define ETHTOOL_GMODULEEEPROM 0x00000043 /* Get plug-in module eeprom */ -#define ETHTOOL_GEEE 0x00000044 /* Get EEE settings */ -#define ETHTOOL_SEEE 0x00000045 /* Set EEE settings */ - -#define ETHTOOL_GRSSH 0x00000046 /* Get RX flow hash configuration */ -#define ETHTOOL_SRSSH 0x00000047 /* Set RX flow hash configuration */ -#define ETHTOOL_GTUNABLE 0x00000048 /* Get tunable configuration */ -#define ETHTOOL_STUNABLE 0x00000049 /* Set tunable configuration */ -#define ETHTOOL_GPHYSTATS 0x0000004a /* get PHY-specific statistics */ - -#define ETHTOOL_PERQUEUE 0x0000004b /* Set per queue options */ - -#define ETHTOOL_GLINKSETTINGS 0x0000004c /* Get ethtool_link_settings */ -#define ETHTOOL_SLINKSETTINGS 0x0000004d /* Set ethtool_link_settings */ -#define ETHTOOL_PHY_GTUNABLE 0x0000004e /* Get PHY tunable configuration */ -#define ETHTOOL_PHY_STUNABLE 0x0000004f /* Set PHY tunable configuration */ -#define ETHTOOL_GFECPARAM 0x00000050 /* Get FEC settings */ -#define ETHTOOL_SFECPARAM 0x00000051 /* Set FEC settings */ - -/* compatibility with older code */ -#define SPARC_ETH_GSET ETHTOOL_GSET -#define SPARC_ETH_SSET ETHTOOL_SSET - -/* Link mode bit indices */ -enum ethtool_link_mode_bit_indices { - ETHTOOL_LINK_MODE_10baseT_Half_BIT = 0, - ETHTOOL_LINK_MODE_10baseT_Full_BIT = 1, - ETHTOOL_LINK_MODE_100baseT_Half_BIT = 2, - ETHTOOL_LINK_MODE_100baseT_Full_BIT = 3, - ETHTOOL_LINK_MODE_1000baseT_Half_BIT = 4, - ETHTOOL_LINK_MODE_1000baseT_Full_BIT = 5, - ETHTOOL_LINK_MODE_Autoneg_BIT = 6, - ETHTOOL_LINK_MODE_TP_BIT = 7, - ETHTOOL_LINK_MODE_AUI_BIT = 8, - ETHTOOL_LINK_MODE_MII_BIT = 9, - ETHTOOL_LINK_MODE_FIBRE_BIT = 10, - ETHTOOL_LINK_MODE_BNC_BIT = 11, - ETHTOOL_LINK_MODE_10000baseT_Full_BIT = 12, - ETHTOOL_LINK_MODE_Pause_BIT = 13, - ETHTOOL_LINK_MODE_Asym_Pause_BIT = 14, - ETHTOOL_LINK_MODE_2500baseX_Full_BIT = 15, - ETHTOOL_LINK_MODE_Backplane_BIT = 16, - ETHTOOL_LINK_MODE_1000baseKX_Full_BIT = 17, - ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT = 18, - ETHTOOL_LINK_MODE_10000baseKR_Full_BIT = 19, - ETHTOOL_LINK_MODE_10000baseR_FEC_BIT = 20, - ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT = 21, - ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT = 22, - ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT = 23, - ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT = 24, - ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT = 25, - ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT = 26, - ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT = 27, - ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT = 28, - ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT = 29, - ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT = 30, - ETHTOOL_LINK_MODE_25000baseCR_Full_BIT = 31, - - /* Last allowed bit for __ETHTOOL_LINK_MODE_LEGACY_MASK is bit - * 31. Please do NOT define any SUPPORTED_* or ADVERTISED_* - * macro for bits > 31. The only way to use indices > 31 is to - * use the new ETHTOOL_GLINKSETTINGS/ETHTOOL_SLINKSETTINGS API. - */ - - ETHTOOL_LINK_MODE_25000baseKR_Full_BIT = 32, - ETHTOOL_LINK_MODE_25000baseSR_Full_BIT = 33, - ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT = 34, - ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT = 35, - ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT = 36, - ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT = 37, - ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT = 38, - ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT = 39, - ETHTOOL_LINK_MODE_50000baseSR2_Full_BIT = 40, - ETHTOOL_LINK_MODE_1000baseX_Full_BIT = 41, - ETHTOOL_LINK_MODE_10000baseCR_Full_BIT = 42, - ETHTOOL_LINK_MODE_10000baseSR_Full_BIT = 43, - ETHTOOL_LINK_MODE_10000baseLR_Full_BIT = 44, - ETHTOOL_LINK_MODE_10000baseLRM_Full_BIT = 45, - ETHTOOL_LINK_MODE_10000baseER_Full_BIT = 46, - ETHTOOL_LINK_MODE_2500baseT_Full_BIT = 47, - ETHTOOL_LINK_MODE_5000baseT_Full_BIT = 48, - - ETHTOOL_LINK_MODE_FEC_NONE_BIT = 49, - ETHTOOL_LINK_MODE_FEC_RS_BIT = 50, - ETHTOOL_LINK_MODE_FEC_BASER_BIT = 51, - ETHTOOL_LINK_MODE_50000baseKR_Full_BIT = 52, - ETHTOOL_LINK_MODE_50000baseSR_Full_BIT = 53, - ETHTOOL_LINK_MODE_50000baseCR_Full_BIT = 54, - ETHTOOL_LINK_MODE_50000baseLR_ER_FR_Full_BIT = 55, - ETHTOOL_LINK_MODE_50000baseDR_Full_BIT = 56, - ETHTOOL_LINK_MODE_100000baseKR2_Full_BIT = 57, - ETHTOOL_LINK_MODE_100000baseSR2_Full_BIT = 58, - ETHTOOL_LINK_MODE_100000baseCR2_Full_BIT = 59, - ETHTOOL_LINK_MODE_100000baseLR2_ER2_FR2_Full_BIT = 60, - ETHTOOL_LINK_MODE_100000baseDR2_Full_BIT = 61, - ETHTOOL_LINK_MODE_200000baseKR4_Full_BIT = 62, - ETHTOOL_LINK_MODE_200000baseSR4_Full_BIT = 63, - ETHTOOL_LINK_MODE_200000baseLR4_ER4_FR4_Full_BIT = 64, - ETHTOOL_LINK_MODE_200000baseDR4_Full_BIT = 65, - ETHTOOL_LINK_MODE_200000baseCR4_Full_BIT = 66, - ETHTOOL_LINK_MODE_100baseT1_Full_BIT = 67, - ETHTOOL_LINK_MODE_1000baseT1_Full_BIT = 68, - ETHTOOL_LINK_MODE_400000baseKR8_Full_BIT = 69, - ETHTOOL_LINK_MODE_400000baseSR8_Full_BIT = 70, - ETHTOOL_LINK_MODE_400000baseLR8_ER8_FR8_Full_BIT = 71, - ETHTOOL_LINK_MODE_400000baseDR8_Full_BIT = 72, - ETHTOOL_LINK_MODE_400000baseCR8_Full_BIT = 73, - ETHTOOL_LINK_MODE_FEC_LLRS_BIT = 74, - ETHTOOL_LINK_MODE_100000baseKR_Full_BIT = 75, - ETHTOOL_LINK_MODE_100000baseSR_Full_BIT = 76, - ETHTOOL_LINK_MODE_100000baseLR_ER_FR_Full_BIT = 77, - ETHTOOL_LINK_MODE_100000baseCR_Full_BIT = 78, - ETHTOOL_LINK_MODE_100000baseDR_Full_BIT = 79, - ETHTOOL_LINK_MODE_200000baseKR2_Full_BIT = 80, - ETHTOOL_LINK_MODE_200000baseSR2_Full_BIT = 81, - ETHTOOL_LINK_MODE_200000baseLR2_ER2_FR2_Full_BIT = 82, - ETHTOOL_LINK_MODE_200000baseDR2_Full_BIT = 83, - ETHTOOL_LINK_MODE_200000baseCR2_Full_BIT = 84, - ETHTOOL_LINK_MODE_400000baseKR4_Full_BIT = 85, - ETHTOOL_LINK_MODE_400000baseSR4_Full_BIT = 86, - ETHTOOL_LINK_MODE_400000baseLR4_ER4_FR4_Full_BIT = 87, - ETHTOOL_LINK_MODE_400000baseDR4_Full_BIT = 88, - ETHTOOL_LINK_MODE_400000baseCR4_Full_BIT = 89, - ETHTOOL_LINK_MODE_100baseFX_Half_BIT = 90, - ETHTOOL_LINK_MODE_100baseFX_Full_BIT = 91, - ETHTOOL_LINK_MODE_10baseT1L_Full_BIT = 92, - ETHTOOL_LINK_MODE_800000baseCR8_Full_BIT = 93, - ETHTOOL_LINK_MODE_800000baseKR8_Full_BIT = 94, - ETHTOOL_LINK_MODE_800000baseDR8_Full_BIT = 95, - ETHTOOL_LINK_MODE_800000baseDR8_2_Full_BIT = 96, - ETHTOOL_LINK_MODE_800000baseSR8_Full_BIT = 97, - ETHTOOL_LINK_MODE_800000baseVR8_Full_BIT = 98, - ETHTOOL_LINK_MODE_10baseT1S_Full_BIT = 99, - ETHTOOL_LINK_MODE_10baseT1S_Half_BIT = 100, - ETHTOOL_LINK_MODE_10baseT1S_P2MP_Half_BIT = 101, - - /* must be last entry */ - __ETHTOOL_LINK_MODE_MASK_NBITS -}; - -#define __ETHTOOL_LINK_MODE_LEGACY_MASK(base_name) \ - (1UL << (ETHTOOL_LINK_MODE_ ## base_name ## _BIT)) - -/* DEPRECATED macros. Please migrate to - * ETHTOOL_GLINKSETTINGS/ETHTOOL_SLINKSETTINGS API. Please do NOT - * define any new SUPPORTED_* macro for bits > 31. - */ -#define SUPPORTED_10baseT_Half __ETHTOOL_LINK_MODE_LEGACY_MASK(10baseT_Half) -#define SUPPORTED_10baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10baseT_Full) -#define SUPPORTED_100baseT_Half __ETHTOOL_LINK_MODE_LEGACY_MASK(100baseT_Half) -#define SUPPORTED_100baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(100baseT_Full) -#define SUPPORTED_1000baseT_Half __ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseT_Half) -#define SUPPORTED_1000baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseT_Full) -#define SUPPORTED_Autoneg __ETHTOOL_LINK_MODE_LEGACY_MASK(Autoneg) -#define SUPPORTED_TP __ETHTOOL_LINK_MODE_LEGACY_MASK(TP) -#define SUPPORTED_AUI __ETHTOOL_LINK_MODE_LEGACY_MASK(AUI) -#define SUPPORTED_MII __ETHTOOL_LINK_MODE_LEGACY_MASK(MII) -#define SUPPORTED_FIBRE __ETHTOOL_LINK_MODE_LEGACY_MASK(FIBRE) -#define SUPPORTED_BNC __ETHTOOL_LINK_MODE_LEGACY_MASK(BNC) -#define SUPPORTED_10000baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseT_Full) -#define SUPPORTED_Pause __ETHTOOL_LINK_MODE_LEGACY_MASK(Pause) -#define SUPPORTED_Asym_Pause __ETHTOOL_LINK_MODE_LEGACY_MASK(Asym_Pause) -#define SUPPORTED_2500baseX_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(2500baseX_Full) -#define SUPPORTED_Backplane __ETHTOOL_LINK_MODE_LEGACY_MASK(Backplane) -#define SUPPORTED_1000baseKX_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseKX_Full) -#define SUPPORTED_10000baseKX4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseKX4_Full) -#define SUPPORTED_10000baseKR_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseKR_Full) -#define SUPPORTED_10000baseR_FEC __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseR_FEC) -#define SUPPORTED_20000baseMLD2_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(20000baseMLD2_Full) -#define SUPPORTED_20000baseKR2_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(20000baseKR2_Full) -#define SUPPORTED_40000baseKR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseKR4_Full) -#define SUPPORTED_40000baseCR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseCR4_Full) -#define SUPPORTED_40000baseSR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseSR4_Full) -#define SUPPORTED_40000baseLR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseLR4_Full) -#define SUPPORTED_56000baseKR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseKR4_Full) -#define SUPPORTED_56000baseCR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseCR4_Full) -#define SUPPORTED_56000baseSR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseSR4_Full) -#define SUPPORTED_56000baseLR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseLR4_Full) -/* Please do not define any new SUPPORTED_* macro for bits > 31, see - * notice above. - */ - -/* - * DEPRECATED macros. Please migrate to - * ETHTOOL_GLINKSETTINGS/ETHTOOL_SLINKSETTINGS API. Please do NOT - * define any new ADERTISE_* macro for bits > 31. - */ -#define ADVERTISED_10baseT_Half __ETHTOOL_LINK_MODE_LEGACY_MASK(10baseT_Half) -#define ADVERTISED_10baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10baseT_Full) -#define ADVERTISED_100baseT_Half __ETHTOOL_LINK_MODE_LEGACY_MASK(100baseT_Half) -#define ADVERTISED_100baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(100baseT_Full) -#define ADVERTISED_1000baseT_Half __ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseT_Half) -#define ADVERTISED_1000baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseT_Full) -#define ADVERTISED_Autoneg __ETHTOOL_LINK_MODE_LEGACY_MASK(Autoneg) -#define ADVERTISED_TP __ETHTOOL_LINK_MODE_LEGACY_MASK(TP) -#define ADVERTISED_AUI __ETHTOOL_LINK_MODE_LEGACY_MASK(AUI) -#define ADVERTISED_MII __ETHTOOL_LINK_MODE_LEGACY_MASK(MII) -#define ADVERTISED_FIBRE __ETHTOOL_LINK_MODE_LEGACY_MASK(FIBRE) -#define ADVERTISED_BNC __ETHTOOL_LINK_MODE_LEGACY_MASK(BNC) -#define ADVERTISED_10000baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseT_Full) -#define ADVERTISED_Pause __ETHTOOL_LINK_MODE_LEGACY_MASK(Pause) -#define ADVERTISED_Asym_Pause __ETHTOOL_LINK_MODE_LEGACY_MASK(Asym_Pause) -#define ADVERTISED_2500baseX_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(2500baseX_Full) -#define ADVERTISED_Backplane __ETHTOOL_LINK_MODE_LEGACY_MASK(Backplane) -#define ADVERTISED_1000baseKX_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseKX_Full) -#define ADVERTISED_10000baseKX4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseKX4_Full) -#define ADVERTISED_10000baseKR_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseKR_Full) -#define ADVERTISED_10000baseR_FEC __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseR_FEC) -#define ADVERTISED_20000baseMLD2_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(20000baseMLD2_Full) -#define ADVERTISED_20000baseKR2_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(20000baseKR2_Full) -#define ADVERTISED_40000baseKR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseKR4_Full) -#define ADVERTISED_40000baseCR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseCR4_Full) -#define ADVERTISED_40000baseSR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseSR4_Full) -#define ADVERTISED_40000baseLR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseLR4_Full) -#define ADVERTISED_56000baseKR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseKR4_Full) -#define ADVERTISED_56000baseCR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseCR4_Full) -#define ADVERTISED_56000baseSR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseSR4_Full) -#define ADVERTISED_56000baseLR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseLR4_Full) -/* Please do not define any new ADVERTISED_* macro for bits > 31, see - * notice above. - */ - -/* The following are all involved in forcing a particular link - * mode for the device for setting things. When getting the - * devices settings, these indicate the current mode and whether - * it was forced up into this mode or autonegotiated. - */ - -/* The forced speed, in units of 1Mb. All values 0 to INT_MAX are legal. - * Update drivers/net/phy/phy.c:phy_speed_to_str() and - * drivers/net/bonding/bond_3ad.c:__get_link_speed() when adding new values. - */ -#define SPEED_10 10 -#define SPEED_100 100 -#define SPEED_1000 1000 -#define SPEED_2500 2500 -#define SPEED_5000 5000 -#define SPEED_10000 10000 -#define SPEED_14000 14000 -#define SPEED_20000 20000 -#define SPEED_25000 25000 -#define SPEED_40000 40000 -#define SPEED_50000 50000 -#define SPEED_56000 56000 -#define SPEED_100000 100000 -#define SPEED_200000 200000 -#define SPEED_400000 400000 -#define SPEED_800000 800000 - -#define SPEED_UNKNOWN -1 - -static inline int ethtool_validate_speed(__u32 speed) -{ - return speed <= INT_MAX || speed == (__u32)SPEED_UNKNOWN; -} - -/* Duplex, half or full. */ -#define DUPLEX_HALF 0x00 -#define DUPLEX_FULL 0x01 -#define DUPLEX_UNKNOWN 0xff - -static inline int ethtool_validate_duplex(__u8 duplex) -{ - switch (duplex) { - case DUPLEX_HALF: - case DUPLEX_FULL: - case DUPLEX_UNKNOWN: - return 1; - } - - return 0; -} - -#define MASTER_SLAVE_CFG_UNSUPPORTED 0 -#define MASTER_SLAVE_CFG_UNKNOWN 1 -#define MASTER_SLAVE_CFG_MASTER_PREFERRED 2 -#define MASTER_SLAVE_CFG_SLAVE_PREFERRED 3 -#define MASTER_SLAVE_CFG_MASTER_FORCE 4 -#define MASTER_SLAVE_CFG_SLAVE_FORCE 5 -#define MASTER_SLAVE_STATE_UNSUPPORTED 0 -#define MASTER_SLAVE_STATE_UNKNOWN 1 -#define MASTER_SLAVE_STATE_MASTER 2 -#define MASTER_SLAVE_STATE_SLAVE 3 -#define MASTER_SLAVE_STATE_ERR 4 - -/* These are used to throttle the rate of data on the phy interface when the - * native speed of the interface is higher than the link speed. These should - * not be used for phy interfaces which natively support multiple speeds (e.g. - * MII or SGMII). - */ -/* No rate matching performed. */ -#define RATE_MATCH_NONE 0 -/* The phy sends pause frames to throttle the MAC. */ -#define RATE_MATCH_PAUSE 1 -/* The phy asserts CRS to prevent the MAC from transmitting. */ -#define RATE_MATCH_CRS 2 -/* The MAC is programmed with a sufficiently-large IPG. */ -#define RATE_MATCH_OPEN_LOOP 3 - -/* Which connector port. */ -#define PORT_TP 0x00 -#define PORT_AUI 0x01 -#define PORT_MII 0x02 -#define PORT_FIBRE 0x03 -#define PORT_BNC 0x04 -#define PORT_DA 0x05 -#define PORT_NONE 0xef -#define PORT_OTHER 0xff - -/* Which transceiver to use. */ -#define XCVR_INTERNAL 0x00 /* PHY and MAC are in the same package */ -#define XCVR_EXTERNAL 0x01 /* PHY and MAC are in different packages */ -#define XCVR_DUMMY1 0x02 -#define XCVR_DUMMY2 0x03 -#define XCVR_DUMMY3 0x04 - -/* Enable or disable autonegotiation. */ -#define AUTONEG_DISABLE 0x00 -#define AUTONEG_ENABLE 0x01 - -/* MDI or MDI-X status/control - if MDI/MDI_X/AUTO is set then - * the driver is required to renegotiate link - */ -#define ETH_TP_MDI_INVALID 0x00 /* status: unknown; control: unsupported */ -#define ETH_TP_MDI 0x01 /* status: MDI; control: force MDI */ -#define ETH_TP_MDI_X 0x02 /* status: MDI-X; control: force MDI-X */ -#define ETH_TP_MDI_AUTO 0x03 /* control: auto-select */ - -/* Wake-On-Lan options. */ -#define WAKE_PHY (1 << 0) -#define WAKE_UCAST (1 << 1) -#define WAKE_MCAST (1 << 2) -#define WAKE_BCAST (1 << 3) -#define WAKE_ARP (1 << 4) -#define WAKE_MAGIC (1 << 5) -#define WAKE_MAGICSECURE (1 << 6) /* only meaningful if WAKE_MAGIC */ -#define WAKE_FILTER (1 << 7) - -#define WOL_MODE_COUNT 8 - -/* RSS hash function data - * XOR the corresponding source and destination fields of each specified - * protocol. Both copies of the XOR'ed fields are fed into the RSS and RXHASH - * calculation. Note that this XORing reduces the input set entropy and could - * be exploited to reduce the RSS queue spread. - */ -#define RXH_XFRM_SYM_XOR (1 << 0) -#define RXH_XFRM_NO_CHANGE 0xff - -/* L2-L4 network traffic flow types */ -#define TCP_V4_FLOW 0x01 /* hash or spec (tcp_ip4_spec) */ -#define UDP_V4_FLOW 0x02 /* hash or spec (udp_ip4_spec) */ -#define SCTP_V4_FLOW 0x03 /* hash or spec (sctp_ip4_spec) */ -#define AH_ESP_V4_FLOW 0x04 /* hash only */ -#define TCP_V6_FLOW 0x05 /* hash or spec (tcp_ip6_spec; nfc only) */ -#define UDP_V6_FLOW 0x06 /* hash or spec (udp_ip6_spec; nfc only) */ -#define SCTP_V6_FLOW 0x07 /* hash or spec (sctp_ip6_spec; nfc only) */ -#define AH_ESP_V6_FLOW 0x08 /* hash only */ -#define AH_V4_FLOW 0x09 /* hash or spec (ah_ip4_spec) */ -#define ESP_V4_FLOW 0x0a /* hash or spec (esp_ip4_spec) */ -#define AH_V6_FLOW 0x0b /* hash or spec (ah_ip6_spec; nfc only) */ -#define ESP_V6_FLOW 0x0c /* hash or spec (esp_ip6_spec; nfc only) */ -#define IPV4_USER_FLOW 0x0d /* spec only (usr_ip4_spec) */ -#define IP_USER_FLOW IPV4_USER_FLOW -#define IPV6_USER_FLOW 0x0e /* spec only (usr_ip6_spec; nfc only) */ -#define IPV4_FLOW 0x10 /* hash only */ -#define IPV6_FLOW 0x11 /* hash only */ -#define ETHER_FLOW 0x12 /* spec only (ether_spec) */ - -/* Used for GTP-U IPv4 and IPv6. - * The format of GTP packets only includes - * elements such as TEID and GTP version. - * It is primarily intended for data communication of the UE. - */ -#define GTPU_V4_FLOW 0x13 /* hash only */ -#define GTPU_V6_FLOW 0x14 /* hash only */ - -/* Use for GTP-C IPv4 and v6. - * The format of these GTP packets does not include TEID. - * Primarily expected to be used for communication - * to create sessions for UE data communication, - * commonly referred to as CSR (Create Session Request). - */ -#define GTPC_V4_FLOW 0x15 /* hash only */ -#define GTPC_V6_FLOW 0x16 /* hash only */ - -/* Use for GTP-C IPv4 and v6. - * Unlike GTPC_V4_FLOW, the format of these GTP packets includes TEID. - * After session creation, it becomes this packet. - * This is mainly used for requests to realize UE handover. - */ -#define GTPC_TEID_V4_FLOW 0x17 /* hash only */ -#define GTPC_TEID_V6_FLOW 0x18 /* hash only */ - -/* Use for GTP-U and extended headers for the PSC (PDU Session Container). - * The format of these GTP packets includes TEID and QFI. - * In 5G communication using UPF (User Plane Function), - * data communication with this extended header is performed. - */ -#define GTPU_EH_V4_FLOW 0x19 /* hash only */ -#define GTPU_EH_V6_FLOW 0x1a /* hash only */ - -/* Use for GTP-U IPv4 and v6 PSC (PDU Session Container) extended headers. - * This differs from GTPU_EH_V(4|6)_FLOW in that it is distinguished by - * UL/DL included in the PSC. - * There are differences in the data included based on Downlink/Uplink, - * and can be used to distinguish packets. - * The functions described so far are useful when you want to - * handle communication from the mobile network in UPF, PGW, etc. - */ -#define GTPU_UL_V4_FLOW 0x1b /* hash only */ -#define GTPU_UL_V6_FLOW 0x1c /* hash only */ -#define GTPU_DL_V4_FLOW 0x1d /* hash only */ -#define GTPU_DL_V6_FLOW 0x1e /* hash only */ - -/* Flag to enable additional fields in struct ethtool_rx_flow_spec */ -#define FLOW_EXT 0x80000000 -#define FLOW_MAC_EXT 0x40000000 -/* Flag to enable RSS spreading of traffic matching rule (nfc only) */ -#define FLOW_RSS 0x20000000 - -/* L3-L4 network traffic flow hash options */ -#define RXH_L2DA (1 << 1) -#define RXH_VLAN (1 << 2) -#define RXH_L3_PROTO (1 << 3) -#define RXH_IP_SRC (1 << 4) -#define RXH_IP_DST (1 << 5) -#define RXH_L4_B_0_1 (1 << 6) /* src port in case of TCP/UDP/SCTP */ -#define RXH_L4_B_2_3 (1 << 7) /* dst port in case of TCP/UDP/SCTP */ -#define RXH_GTP_TEID (1 << 8) /* teid in case of GTP */ -#define RXH_DISCARD (1 << 31) - -#define RX_CLS_FLOW_DISC 0xffffffffffffffffULL -#define RX_CLS_FLOW_WAKE 0xfffffffffffffffeULL - -/* Special RX classification rule insert location values */ -#define RX_CLS_LOC_SPECIAL 0x80000000 /* flag */ -#define RX_CLS_LOC_ANY 0xffffffff -#define RX_CLS_LOC_FIRST 0xfffffffe -#define RX_CLS_LOC_LAST 0xfffffffd - -/* EEPROM Standards for plug in modules */ -#define ETH_MODULE_SFF_8079 0x1 -#define ETH_MODULE_SFF_8079_LEN 256 -#define ETH_MODULE_SFF_8472 0x2 -#define ETH_MODULE_SFF_8472_LEN 512 -#define ETH_MODULE_SFF_8636 0x3 -#define ETH_MODULE_SFF_8636_LEN 256 -#define ETH_MODULE_SFF_8436 0x4 -#define ETH_MODULE_SFF_8436_LEN 256 - -#define ETH_MODULE_SFF_8636_MAX_LEN 640 -#define ETH_MODULE_SFF_8436_MAX_LEN 640 - -/* Reset flags */ -/* The reset() operation must clear the flags for the components which - * were actually reset. On successful return, the flags indicate the - * components which were not reset, either because they do not exist - * in the hardware or because they cannot be reset independently. The - * driver must never reset any components that were not requested. - */ -enum ethtool_reset_flags { - /* These flags represent components dedicated to the interface - * the command is addressed to. Shift any flag left by - * ETH_RESET_SHARED_SHIFT to reset a shared component of the - * same type. - */ - ETH_RESET_MGMT = 1 << 0, /* Management processor */ - ETH_RESET_IRQ = 1 << 1, /* Interrupt requester */ - ETH_RESET_DMA = 1 << 2, /* DMA engine */ - ETH_RESET_FILTER = 1 << 3, /* Filtering/flow direction */ - ETH_RESET_OFFLOAD = 1 << 4, /* Protocol offload */ - ETH_RESET_MAC = 1 << 5, /* Media access controller */ - ETH_RESET_PHY = 1 << 6, /* Transceiver/PHY */ - ETH_RESET_RAM = 1 << 7, /* RAM shared between - * multiple components */ - ETH_RESET_AP = 1 << 8, /* Application processor */ - - ETH_RESET_DEDICATED = 0x0000ffff, /* All components dedicated to - * this interface */ - ETH_RESET_ALL = 0xffffffff, /* All components used by this - * interface, even if shared */ -}; -#define ETH_RESET_SHARED_SHIFT 16 - - -/** - * struct ethtool_link_settings - link control and status - * - * IMPORTANT, Backward compatibility notice: When implementing new - * user-space tools, please first try %ETHTOOL_GLINKSETTINGS, and - * if it succeeds use %ETHTOOL_SLINKSETTINGS to change link - * settings; do not use %ETHTOOL_SSET if %ETHTOOL_GLINKSETTINGS - * succeeded: stick to %ETHTOOL_GLINKSETTINGS/%SLINKSETTINGS in - * that case. Conversely, if %ETHTOOL_GLINKSETTINGS fails, use - * %ETHTOOL_GSET to query and %ETHTOOL_SSET to change link - * settings; do not use %ETHTOOL_SLINKSETTINGS if - * %ETHTOOL_GLINKSETTINGS failed: stick to - * %ETHTOOL_GSET/%ETHTOOL_SSET in that case. - * - * @cmd: Command number = %ETHTOOL_GLINKSETTINGS or %ETHTOOL_SLINKSETTINGS - * @speed: Link speed (Mbps) - * @duplex: Duplex mode; one of %DUPLEX_* - * @port: Physical connector type; one of %PORT_* - * @phy_address: MDIO address of PHY (transceiver); 0 or 255 if not - * applicable. For clause 45 PHYs this is the PRTAD. - * @autoneg: Enable/disable autonegotiation and auto-detection; - * either %AUTONEG_DISABLE or %AUTONEG_ENABLE - * @mdio_support: Bitmask of %ETH_MDIO_SUPPORTS_* flags for the MDIO - * protocols supported by the interface; 0 if unknown. - * Read-only. - * @eth_tp_mdix: Ethernet twisted-pair MDI(-X) status; one of - * %ETH_TP_MDI_*. If the status is unknown or not applicable, the - * value will be %ETH_TP_MDI_INVALID. Read-only. - * @eth_tp_mdix_ctrl: Ethernet twisted pair MDI(-X) control; one of - * %ETH_TP_MDI_*. If MDI(-X) control is not implemented, reads - * yield %ETH_TP_MDI_INVALID and writes may be ignored or rejected. - * When written successfully, the link should be renegotiated if - * necessary. - * @link_mode_masks_nwords: Number of 32-bit words for each of the - * supported, advertising, lp_advertising link mode bitmaps. For - * %ETHTOOL_GLINKSETTINGS: on entry, number of words passed by user - * (>= 0); on return, if handshake in progress, negative if - * request size unsupported by kernel: absolute value indicates - * kernel expected size and all the other fields but cmd - * are 0; otherwise (handshake completed), strictly positive - * to indicate size used by kernel and cmd field stays - * %ETHTOOL_GLINKSETTINGS, all other fields populated by driver. For - * %ETHTOOL_SLINKSETTINGS: must be valid on entry, ie. a positive - * value returned previously by %ETHTOOL_GLINKSETTINGS, otherwise - * refused. For drivers: ignore this field (use kernel's - * __ETHTOOL_LINK_MODE_MASK_NBITS instead), any change to it will - * be overwritten by kernel. - * @transceiver: Used to distinguish different possible PHY types, - * reported consistently by PHYLIB. Read-only. - * @master_slave_cfg: Master/slave port mode. - * @master_slave_state: Master/slave port state. - * @rate_matching: Rate adaptation performed by the PHY - * @reserved: Reserved for future use; see the note on reserved space. - * @link_mode_masks: Variable length bitmaps. - * - * If autonegotiation is disabled, the speed and @duplex represent the - * fixed link mode and are writable if the driver supports multiple - * link modes. If it is enabled then they are read-only; if the link - * is up they represent the negotiated link mode; if the link is down, - * the speed is 0, %SPEED_UNKNOWN or the highest enabled speed and - * @duplex is %DUPLEX_UNKNOWN or the best enabled duplex mode. - * - * Some hardware interfaces may have multiple PHYs and/or physical - * connectors fitted or do not allow the driver to detect which are - * fitted. For these interfaces @port and/or @phy_address may be - * writable, possibly dependent on @autoneg being %AUTONEG_DISABLE. - * Otherwise, attempts to write different values may be ignored or - * rejected. - * - * Deprecated %ethtool_cmd fields transceiver, maxtxpkt and maxrxpkt - * are not available in %ethtool_link_settings. These fields will be - * always set to zero in %ETHTOOL_GSET reply and %ETHTOOL_SSET will - * fail if any of them is set to non-zero value. - * - * Users should assume that all fields not marked read-only are - * writable and subject to validation by the driver. They should use - * %ETHTOOL_GLINKSETTINGS to get the current values before making specific - * changes and then applying them with %ETHTOOL_SLINKSETTINGS. - * - * Drivers that implement %get_link_ksettings and/or - * %set_link_ksettings should ignore the @cmd - * and @link_mode_masks_nwords fields (any change to them overwritten - * by kernel), and rely only on kernel's internal - * %__ETHTOOL_LINK_MODE_MASK_NBITS and - * %ethtool_link_mode_mask_t. Drivers that implement - * %set_link_ksettings() should validate all fields other than @cmd - * and @link_mode_masks_nwords that are not described as read-only or - * deprecated, and must ignore all fields described as read-only. - * - * @link_mode_masks is divided into three bitfields, each of length - * @link_mode_masks_nwords: - * - supported: Bitmap with each bit meaning given by - * %ethtool_link_mode_bit_indices for the link modes, physical - * connectors and other link features for which the interface - * supports autonegotiation or auto-detection. Read-only. - * - advertising: Bitmap with each bit meaning given by - * %ethtool_link_mode_bit_indices for the link modes, physical - * connectors and other link features that are advertised through - * autonegotiation or enabled for auto-detection. - * - lp_advertising: Bitmap with each bit meaning given by - * %ethtool_link_mode_bit_indices for the link modes, and other - * link features that the link partner advertised through - * autonegotiation; 0 if unknown or not applicable. Read-only. - */ -struct ethtool_link_settings { - __u32 cmd; - __u32 speed; - __u8 duplex; - __u8 port; - __u8 phy_address; - __u8 autoneg; - __u8 mdio_support; - __u8 eth_tp_mdix; - __u8 eth_tp_mdix_ctrl; - __s8 link_mode_masks_nwords; - __u8 transceiver; - __u8 master_slave_cfg; - __u8 master_slave_state; - __u8 rate_matching; - __u32 reserved[7]; - __u32 link_mode_masks[]; - /* layout of link_mode_masks fields: - * __u32 map_supported[link_mode_masks_nwords]; - * __u32 map_advertising[link_mode_masks_nwords]; - * __u32 map_lp_advertising[link_mode_masks_nwords]; - */ -}; -#endif /* _UAPI_LINUX_ETHTOOL_H */ From 792a04bed41caec79c787d105b0d442351b3bcc8 Mon Sep 17 00:00:00 2001 From: David Faust Date: Wed, 8 May 2024 12:35:12 -0700 Subject: [PATCH 114/119] bpf: avoid gcc overflow warning in test_xdp_vlan.c This patch fixes an integer overflow warning raised by GCC in xdp_prognum1 of progs/test_xdp_vlan.c: GCC-BPF [test_maps] test_xdp_vlan.bpf.o progs/test_xdp_vlan.c: In function 'xdp_prognum1': progs/test_xdp_vlan.c:163:25: error: integer overflow in expression '(short int)(((__builtin_constant_p((int)vlan_hdr->h_vlan_TCI)) != 0 ? (int)(short unsigned int)((short int)((int)vlan_hdr->h_vlan_TCI << 8 >> 8) << 8 | (short int)((int)vlan_hdr->h_vlan_TCI << 0 >> 8 << 0)) & 61440 : (int)__builtin_bswap16(vlan_hdr->h_vlan_TCI) & 61440) << 8 >> 8) << 8' of type 'short int' results in '0' [-Werror=overflow] 163 | bpf_htons((bpf_ntohs(vlan_hdr->h_vlan_TCI) & 0xf000) | ^~~~~~~~~ The problem lies with the expansion of the bpf_htons macro and the expression passed into it. The bpf_htons macro (and similarly the bpf_ntohs macro) expand to a ternary operation using either __builtin_bswap16 or ___bpf_swab16 to swap the bytes, depending on whether the expression is constant. For an expression, with 'value' as a u16, like: bpf_htons (value & 0xf000) The entire (value & 0xf000) is 'x' in the expansion of ___bpf_swab16 and we get as one part of the expanded swab16: ((__u16)(value & 0xf000) << 8 >> 8 << 8 This will always evaluate to 0, which is intentional since this subexpression deals with the byte guaranteed to be 0 by the mask. However, GCC warns because the precise reason this always evaluates to 0 is an overflow. Specifically, the plain 0xf000 in the expression is a signed 32-bit integer, which causes 'value' to also be promoted to a signed 32-bit integer, and the combination of the 8-bit left shift and down-cast back to __u16 results in a signed overflow (really a 'warning: overflow in conversion from int to __u16' which is propegated up through the rest of the expression leading to the ultimate overflow warning above), which is a valid warning despite being the intended result of this code. Clang does not warn on this case, likely because it performs constant folding later in the compilation process relative to GCC. It seems that by the time clang does constant folding for this expression, the side of the ternary with this overflow has already been discarded. Fortunately, this warning is easily silenced by simply making the 0xf000 mask explicitly unsigned. This has no impact on the result. Signed-off-by: David Faust Cc: jose.marchesi@oracle.com Cc: cupertino.miranda@oracle.com Cc: Eduard Zingerman Cc: Yonghong Song Link: https://lore.kernel.org/r/20240508193512.152759-1-david.faust@oracle.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/test_xdp_vlan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/progs/test_xdp_vlan.c b/tools/testing/selftests/bpf/progs/test_xdp_vlan.c index f3ec8086482d..a7588302268d 100644 --- a/tools/testing/selftests/bpf/progs/test_xdp_vlan.c +++ b/tools/testing/selftests/bpf/progs/test_xdp_vlan.c @@ -160,7 +160,7 @@ int xdp_prognum1(struct xdp_md *ctx) /* Modifying VLAN, preserve top 4 bits */ vlan_hdr->h_vlan_TCI = - bpf_htons((bpf_ntohs(vlan_hdr->h_vlan_TCI) & 0xf000) + bpf_htons((bpf_ntohs(vlan_hdr->h_vlan_TCI) & 0xf000U) | TO_VLAN); } From 5ddafcc377f98778acc08f660dee6400aece6a62 Mon Sep 17 00:00:00 2001 From: Cupertino Miranda Date: Fri, 10 May 2024 19:38:50 +0100 Subject: [PATCH 115/119] selftests/bpf: Fix a few tests for GCC related warnings. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch corrects a few warnings to allow selftests to compile for GCC. -- progs/cpumask_failure.c -- progs/bpf_misc.h:136:22: error: ‘cpumask’ is used uninitialized [-Werror=uninitialized] 136 | #define __sink(expr) asm volatile("" : "+g"(expr)) | ^~~ progs/cpumask_failure.c:68:9: note: in expansion of macro ‘__sink’ 68 | __sink(cpumask); The macro __sink(cpumask) with the '+' contraint modifier forces the the compiler to expect a read and write from cpumask. GCC detects that cpumask is never initialized and reports an error. This patch removes the spurious non required definitions of cpumask. -- progs/dynptr_fail.c -- progs/dynptr_fail.c:1444:9: error: ‘ptr1’ may be used uninitialized [-Werror=maybe-uninitialized] 1444 | bpf_dynptr_clone(&ptr1, &ptr2); Many of the tests in the file are related to the detection of uninitialized pointers by the verifier. GCC is able to detect possible uninitialized values, and reports this as an error. The patch initializes all of the previous uninitialized structs. -- progs/test_tunnel_kern.c -- progs/test_tunnel_kern.c:590:9: error: array subscript 1 is outside array bounds of ‘struct geneve_opt[1]’ [-Werror=array-bounds=] 590 | *(int *) &gopt.opt_data = bpf_htonl(0xdeadbeef); | ^~~~~~~~~~~~~~~~~~~~~~~ progs/test_tunnel_kern.c:575:27: note: at offset 4 into object ‘gopt’ of size 4 575 | struct geneve_opt gopt; This tests accesses beyond the defined data for the struct geneve_opt which contains as last field "u8 opt_data[0]" which clearly does not get reserved space (in stack) in the function header. This pattern is repeated in ip6geneve_set_tunnel and geneve_set_tunnel functions. GCC is able to see this and emits a warning. The patch introduces a local struct that allocates enough space to safely allow the write to opt_data field. -- progs/jeq_infer_not_null_fail.c -- progs/jeq_infer_not_null_fail.c:21:40: error: array subscript ‘struct bpf_map[0]’ is partly outside array bounds of ‘struct [1]’ [-Werror=array-bounds=] 21 | struct bpf_map *inner_map = map->inner_map_meta; | ^~ progs/jeq_infer_not_null_fail.c:14:3: note: object ‘m_hash’ of size 32 14 | } m_hash SEC(".maps"); This example defines m_hash in the context of the compilation unit and casts it to struct bpf_map which is much smaller than the size of struct bpf_map. It errors out in GCC when it attempts to access an element that would be defined in struct bpf_map outsize of the defined limits for m_hash. This patch disables the warning through a GCC pragma. This changes were tested in bpf-next master selftests without any regressions. Signed-off-by: Cupertino Miranda Cc: jose.marchesi@oracle.com Cc: david.faust@oracle.com Cc: Yonghong Song Cc: Eduard Zingerman Cc: Andrii Nakryiko Link: https://lore.kernel.org/r/20240510183850.286661-2-cupertino.miranda@oracle.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/progs/cpumask_failure.c | 3 -- .../testing/selftests/bpf/progs/dynptr_fail.c | 12 ++--- .../bpf/progs/jeq_infer_not_null_fail.c | 4 ++ .../selftests/bpf/progs/test_tunnel_kern.c | 47 +++++++++++-------- 4 files changed, 37 insertions(+), 29 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/cpumask_failure.c b/tools/testing/selftests/bpf/progs/cpumask_failure.c index a9bf6ea336cf..a988d2823b52 100644 --- a/tools/testing/selftests/bpf/progs/cpumask_failure.c +++ b/tools/testing/selftests/bpf/progs/cpumask_failure.c @@ -61,11 +61,8 @@ SEC("tp_btf/task_newtask") __failure __msg("bpf_cpumask_set_cpu args#1 expected pointer to STRUCT bpf_cpumask") int BPF_PROG(test_mutate_cpumask, struct task_struct *task, u64 clone_flags) { - struct bpf_cpumask *cpumask; - /* Can't set the CPU of a non-struct bpf_cpumask. */ bpf_cpumask_set_cpu(0, (struct bpf_cpumask *)task->cpus_ptr); - __sink(cpumask); return 0; } diff --git a/tools/testing/selftests/bpf/progs/dynptr_fail.c b/tools/testing/selftests/bpf/progs/dynptr_fail.c index 7ce7e827d5f0..66a60bfb5867 100644 --- a/tools/testing/selftests/bpf/progs/dynptr_fail.c +++ b/tools/testing/selftests/bpf/progs/dynptr_fail.c @@ -80,7 +80,7 @@ SEC("?raw_tp") __failure __msg("Unreleased reference id=2") int ringbuf_missing_release1(void *ctx) { - struct bpf_dynptr ptr; + struct bpf_dynptr ptr = {}; bpf_ringbuf_reserve_dynptr(&ringbuf, val, 0, &ptr); @@ -1385,7 +1385,7 @@ SEC("?raw_tp") __failure __msg("Expected an initialized dynptr as arg #1") int dynptr_adjust_invalid(void *ctx) { - struct bpf_dynptr ptr; + struct bpf_dynptr ptr = {}; /* this should fail */ bpf_dynptr_adjust(&ptr, 1, 2); @@ -1398,7 +1398,7 @@ SEC("?raw_tp") __failure __msg("Expected an initialized dynptr as arg #1") int dynptr_is_null_invalid(void *ctx) { - struct bpf_dynptr ptr; + struct bpf_dynptr ptr = {}; /* this should fail */ bpf_dynptr_is_null(&ptr); @@ -1411,7 +1411,7 @@ SEC("?raw_tp") __failure __msg("Expected an initialized dynptr as arg #1") int dynptr_is_rdonly_invalid(void *ctx) { - struct bpf_dynptr ptr; + struct bpf_dynptr ptr = {}; /* this should fail */ bpf_dynptr_is_rdonly(&ptr); @@ -1424,7 +1424,7 @@ SEC("?raw_tp") __failure __msg("Expected an initialized dynptr as arg #1") int dynptr_size_invalid(void *ctx) { - struct bpf_dynptr ptr; + struct bpf_dynptr ptr = {}; /* this should fail */ bpf_dynptr_size(&ptr); @@ -1437,7 +1437,7 @@ SEC("?raw_tp") __failure __msg("Expected an initialized dynptr as arg #1") int clone_invalid1(void *ctx) { - struct bpf_dynptr ptr1; + struct bpf_dynptr ptr1 = {}; struct bpf_dynptr ptr2; /* this should fail */ diff --git a/tools/testing/selftests/bpf/progs/jeq_infer_not_null_fail.c b/tools/testing/selftests/bpf/progs/jeq_infer_not_null_fail.c index f46965053acb..4d619bea9c75 100644 --- a/tools/testing/selftests/bpf/progs/jeq_infer_not_null_fail.c +++ b/tools/testing/selftests/bpf/progs/jeq_infer_not_null_fail.c @@ -4,6 +4,10 @@ #include #include "bpf_misc.h" +#ifndef __clang__ +#pragma GCC diagnostic ignored "-Warray-bounds" +#endif + char _license[] SEC("license") = "GPL"; struct { diff --git a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c index 3e436e6f7312..3f5abcf3ff13 100644 --- a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c +++ b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c @@ -567,12 +567,18 @@ int ip6vxlan_get_tunnel_src(struct __sk_buff *skb) return TC_ACT_OK; } +struct local_geneve_opt { + struct geneve_opt gopt; + int data; +}; + SEC("tc") int geneve_set_tunnel(struct __sk_buff *skb) { int ret; struct bpf_tunnel_key key; - struct geneve_opt gopt; + struct local_geneve_opt local_gopt; + struct geneve_opt *gopt = (struct geneve_opt *) &local_gopt; __builtin_memset(&key, 0x0, sizeof(key)); key.remote_ipv4 = 0xac100164; /* 172.16.1.100 */ @@ -580,14 +586,14 @@ int geneve_set_tunnel(struct __sk_buff *skb) key.tunnel_tos = 0; key.tunnel_ttl = 64; - __builtin_memset(&gopt, 0x0, sizeof(gopt)); - gopt.opt_class = bpf_htons(0x102); /* Open Virtual Networking (OVN) */ - gopt.type = 0x08; - gopt.r1 = 0; - gopt.r2 = 0; - gopt.r3 = 0; - gopt.length = 2; /* 4-byte multiple */ - *(int *) &gopt.opt_data = bpf_htonl(0xdeadbeef); + __builtin_memset(gopt, 0x0, sizeof(local_gopt)); + gopt->opt_class = bpf_htons(0x102); /* Open Virtual Networking (OVN) */ + gopt->type = 0x08; + gopt->r1 = 0; + gopt->r2 = 0; + gopt->r3 = 0; + gopt->length = 2; /* 4-byte multiple */ + *(int *) &gopt->opt_data = bpf_htonl(0xdeadbeef); ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key), BPF_F_ZERO_CSUM_TX); @@ -596,7 +602,7 @@ int geneve_set_tunnel(struct __sk_buff *skb) return TC_ACT_SHOT; } - ret = bpf_skb_set_tunnel_opt(skb, &gopt, sizeof(gopt)); + ret = bpf_skb_set_tunnel_opt(skb, gopt, sizeof(local_gopt)); if (ret < 0) { log_err(ret); return TC_ACT_SHOT; @@ -631,7 +637,8 @@ SEC("tc") int ip6geneve_set_tunnel(struct __sk_buff *skb) { struct bpf_tunnel_key key; - struct geneve_opt gopt; + struct local_geneve_opt local_gopt; + struct geneve_opt *gopt = (struct geneve_opt *) &local_gopt; int ret; __builtin_memset(&key, 0x0, sizeof(key)); @@ -647,16 +654,16 @@ int ip6geneve_set_tunnel(struct __sk_buff *skb) return TC_ACT_SHOT; } - __builtin_memset(&gopt, 0x0, sizeof(gopt)); - gopt.opt_class = bpf_htons(0x102); /* Open Virtual Networking (OVN) */ - gopt.type = 0x08; - gopt.r1 = 0; - gopt.r2 = 0; - gopt.r3 = 0; - gopt.length = 2; /* 4-byte multiple */ - *(int *) &gopt.opt_data = bpf_htonl(0xfeedbeef); + __builtin_memset(gopt, 0x0, sizeof(local_gopt)); + gopt->opt_class = bpf_htons(0x102); /* Open Virtual Networking (OVN) */ + gopt->type = 0x08; + gopt->r1 = 0; + gopt->r2 = 0; + gopt->r3 = 0; + gopt->length = 2; /* 4-byte multiple */ + *(int *) &gopt->opt_data = bpf_htonl(0xfeedbeef); - ret = bpf_skb_set_tunnel_opt(skb, &gopt, sizeof(gopt)); + ret = bpf_skb_set_tunnel_opt(skb, gopt, sizeof(gopt)); if (ret < 0) { log_err(ret); return TC_ACT_SHOT; From a3c1c95538e22283ef6fa529e3ffa0e6d47ee190 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Sat, 11 May 2024 16:50:24 +0800 Subject: [PATCH 116/119] selftests/bpf: Free strdup memory in xdp_hw_metadata The strdup() function returns a pointer to a new string which is a duplicate of the string "ifname". Memory for the new string is obtained with malloc(), and need to be freed with free(). This patch adds this missing "free(saved_hwtstamp_ifname)" in cleanup() to avoid a potential memory leak in xdp_hw_metadata.c. Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/af9bcccb96655e82de5ce2b4510b88c9c8ed5ed0.1715417367.git.tanggeliang@kylinos.cn Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/xdp_hw_metadata.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/bpf/xdp_hw_metadata.c b/tools/testing/selftests/bpf/xdp_hw_metadata.c index 0859fe727da7..6f9956eed797 100644 --- a/tools/testing/selftests/bpf/xdp_hw_metadata.c +++ b/tools/testing/selftests/bpf/xdp_hw_metadata.c @@ -581,6 +581,8 @@ static void cleanup(void) if (bpf_obj) xdp_hw_metadata__destroy(bpf_obj); + + free((void *)saved_hwtstamp_ifname); } static void handle_signal(int sig) From 73868988c90d2701587ab2a48b5858ab935afb17 Mon Sep 17 00:00:00 2001 From: "Jose E. Marchesi" Date: Sat, 11 May 2024 23:22:13 +0200 Subject: [PATCH 117/119] bpf: disable strict aliasing in test_global_func9.c The BPF selftest test_global_func9.c performs type punning and breaks srict-aliasing rules. In particular, given: int global_func9(struct __sk_buff *skb) { int result = 0; [...] { const struct C c = {.x = skb->len, .y = skb->family }; result |= foo((const struct S *)&c); } } When building with strict-aliasing enabled (the default) the initialization of `c' gets optimized away in its entirely: [... no initialization of `c' ...] r1 = r10 r1 += -40 call foo w0 |= w6 Since GCC knows that `foo' accesses s->x, we get a "maybe uninitialized" warning. On the other hand, when strict-aliasing is disabled GCC only optimizes away the store to `.y': r1 = *(u32 *) (r6+0) *(u32 *) (r10+-40) = r1 ; This is .x = skb->len in `c' r1 = r10 r1 += -40 call foo w0 |= w6 In this case the warning is not emitted, because s-> is initialized. This patch disables strict aliasing in this test when building with GCC. clang seems to not optimize this particular code even when strict aliasing is enabled. Tested in bpf-next master. Signed-off-by: Jose E. Marchesi Cc: david.faust@oracle.com Cc: cupertino.miranda@oracle.com Cc: Yonghong Song Cc: Eduard Zingerman Link: https://lore.kernel.org/r/20240511212213.23418-1-jose.marchesi@oracle.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index ed381b0197fe..e0b3887b3d2d 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -53,6 +53,7 @@ progs/syscall.c-CFLAGS := -fno-strict-aliasing progs/test_pkt_md_access.c-CFLAGS := -fno-strict-aliasing progs/test_sk_lookup.c-CFLAGS := -fno-strict-aliasing progs/timer_crash.c-CFLAGS := -fno-strict-aliasing +progs/test_global_func9.c-CFLAGS := -fno-strict-aliasing ifneq ($(LLVM),) # Silence some warnings when compiled with clang From 6a2f786e6905007e82bac212296deca29815916d Mon Sep 17 00:00:00 2001 From: "Jose E. Marchesi" Date: Sat, 11 May 2024 23:23:49 +0200 Subject: [PATCH 118/119] bpf: ignore expected GCC warning in test_global_func10.c The BPF selftest global_func10 in progs/test_global_func10.c contains: struct Small { long x; }; struct Big { long x; long y; }; [...] __noinline int foo(const struct Big *big) { if (!big) return 0; return bpf_get_prandom_u32() < big->y; } [...] SEC("cgroup_skb/ingress") __failure __msg("invalid indirect access to stack") int global_func10(struct __sk_buff *skb) { const struct Small small = {.x = skb->len }; return foo((struct Big *)&small) ? 1 : 0; } GCC emits a "maybe uninitialized" warning for the code above, because it knows `foo' accesses `big->y'. Since the purpose of this selftest is to check that the verifier will fail on this sort of invalid memory access, this patch just silences the compiler warning. Tested in bpf-next master. No regressions. Signed-off-by: Jose E. Marchesi Cc: david.faust@oracle.com Cc: cupertino.miranda@oracle.com Cc: Yonghong Song Cc: Eduard Zingerman Link: https://lore.kernel.org/r/20240511212349.23549-1-jose.marchesi@oracle.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/test_global_func10.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/test_global_func10.c b/tools/testing/selftests/bpf/progs/test_global_func10.c index 8fba3f3649e2..5da001ca57a5 100644 --- a/tools/testing/selftests/bpf/progs/test_global_func10.c +++ b/tools/testing/selftests/bpf/progs/test_global_func10.c @@ -4,6 +4,10 @@ #include #include "bpf_misc.h" +#if !defined(__clang__) +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#endif + struct Small { long x; }; From ba39486d2c43ba7c103c438540aa56c8bde3b6c7 Mon Sep 17 00:00:00 2001 From: "Jose E. Marchesi" Date: Sat, 11 May 2024 23:22:43 +0200 Subject: [PATCH 119/119] bpf: make list_for_each_entry portable [Changes from V1: - The __compat_break has been abandoned in favor of a more readable can_loop macro that can be used anywhere, including loop conditions.] The macro list_for_each_entry is defined in bpf_arena_list.h as follows: #define list_for_each_entry(pos, head, member) \ for (void * ___tmp = (pos = list_entry_safe((head)->first, \ typeof(*(pos)), member), \ (void *)0); \ pos && ({ ___tmp = (void *)pos->member.next; 1; }); \ cond_break, \ pos = list_entry_safe((void __arena *)___tmp, typeof(*(pos)), member)) The macro cond_break, in turn, expands to a statement expression that contains a `break' statement. Compound statement expressions, and the subsequent ability of placing statements in the header of a `for' loop, are GNU extensions. Unfortunately, clang implements this GNU extension differently than GCC: - In GCC the `break' statement is bound to the containing "breakable" context in which the defining `for' appears. If there is no such context, GCC emits a warning: break statement without enclosing `for' o `switch' statement. - In clang the `break' statement is bound to the defining `for'. If the defining `for' is itself inside some breakable construct, then clang emits a -Wgcc-compat warning. This patch adds a new macro can_loop to bpf_experimental, that implements the same logic than cond_break but evaluates to a boolean expression. The patch also changes all the current instances of usage of cond_break withing the header of loop accordingly. Tested in bpf-next master. No regressions. Signed-off-by: Jose E. Marchesi Cc: david.faust@oracle.com Cc: cupertino.miranda@oracle.com Cc: Alexei Starovoitov Link: https://lore.kernel.org/r/20240511212243.23477-1-jose.marchesi@oracle.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/bpf_arena_list.h | 4 +-- .../testing/selftests/bpf/bpf_experimental.h | 33 +++++++++++++++++-- .../testing/selftests/bpf/progs/arena_list.c | 2 +- .../bpf/progs/verifier_iterating_callbacks.c | 9 +++-- 4 files changed, 38 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/bpf/bpf_arena_list.h b/tools/testing/selftests/bpf/bpf_arena_list.h index b99b9f408eff..85dbc3ea4da5 100644 --- a/tools/testing/selftests/bpf/bpf_arena_list.h +++ b/tools/testing/selftests/bpf/bpf_arena_list.h @@ -29,6 +29,7 @@ static inline void *bpf_iter_num_new(struct bpf_iter_num *it, int i, int j) { re static inline void bpf_iter_num_destroy(struct bpf_iter_num *it) {} static inline bool bpf_iter_num_next(struct bpf_iter_num *it) { return true; } #define cond_break ({}) +#define can_loop true #endif /* Safely walk link list elements. Deletion of elements is allowed. */ @@ -36,8 +37,7 @@ static inline bool bpf_iter_num_next(struct bpf_iter_num *it) { return true; } for (void * ___tmp = (pos = list_entry_safe((head)->first, \ typeof(*(pos)), member), \ (void *)0); \ - pos && ({ ___tmp = (void *)pos->member.next; 1; }); \ - cond_break, \ + pos && ({ ___tmp = (void *)pos->member.next; 1; }) && can_loop; \ pos = list_entry_safe((void __arena *)___tmp, typeof(*(pos)), member)) static inline void list_add_head(arena_list_node_t *n, arena_list_head_t *h) diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h index 8b9cc87be4c4..3d9e4b8c6b81 100644 --- a/tools/testing/selftests/bpf/bpf_experimental.h +++ b/tools/testing/selftests/bpf/bpf_experimental.h @@ -326,19 +326,48 @@ l_true: \ }) #endif +/* + * Note that cond_break can only be portably used in the body of a breakable + * construct, whereas can_loop can be used anywhere. + */ #ifdef __BPF_FEATURE_MAY_GOTO +#define can_loop \ + ({ __label__ l_break, l_continue; \ + bool ret = true; \ + asm volatile goto("may_goto %l[l_break]" \ + :::: l_break); \ + goto l_continue; \ + l_break: ret = false; \ + l_continue:; \ + ret; \ + }) + #define cond_break \ ({ __label__ l_break, l_continue; \ - asm volatile goto("may_goto %l[l_break]" \ + asm volatile goto("may_goto %l[l_break]" \ :::: l_break); \ goto l_continue; \ l_break: break; \ l_continue:; \ }) #else +#define can_loop \ + ({ __label__ l_break, l_continue; \ + bool ret = true; \ + asm volatile goto("1:.byte 0xe5; \ + .byte 0; \ + .long ((%l[l_break] - 1b - 8) / 8) & 0xffff; \ + .short 0" \ + :::: l_break); \ + goto l_continue; \ + l_break: ret = false; \ + l_continue:; \ + ret; \ + }) + #define cond_break \ ({ __label__ l_break, l_continue; \ - asm volatile goto("1:.byte 0xe5; \ + asm volatile goto("1:.byte 0xe5; \ .byte 0; \ .long ((%l[l_break] - 1b - 8) / 8) & 0xffff; \ .short 0" \ diff --git a/tools/testing/selftests/bpf/progs/arena_list.c b/tools/testing/selftests/bpf/progs/arena_list.c index c0422c58cee2..93bd0600eba0 100644 --- a/tools/testing/selftests/bpf/progs/arena_list.c +++ b/tools/testing/selftests/bpf/progs/arena_list.c @@ -49,7 +49,7 @@ int arena_list_add(void *ctx) list_head = &global_head; - for (i = zero; i < cnt; cond_break, i++) { + for (i = zero; i < cnt && can_loop; i++) { struct elem __arena *n = bpf_alloc(sizeof(*n)); test_val++; diff --git a/tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c b/tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c index 99e561f18f9b..bd676d7e615f 100644 --- a/tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c +++ b/tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c @@ -318,7 +318,7 @@ int cond_break1(const void *ctx) unsigned long i; unsigned int sum = 0; - for (i = zero; i < ARR_SZ; cond_break, i++) + for (i = zero; i < ARR_SZ && can_loop; i++) sum += i; for (i = zero; i < ARR_SZ; i++) { barrier_var(i); @@ -336,12 +336,11 @@ int cond_break2(const void *ctx) int i, j; int sum = 0; - for (i = zero; i < 1000; cond_break, i++) + for (i = zero; i < 1000 && can_loop; i++) for (j = zero; j < 1000; j++) { sum += i + j; cond_break; - } - + } return sum; } @@ -349,7 +348,7 @@ static __noinline int loop(void) { int i, sum = 0; - for (i = zero; i <= 1000000; i++, cond_break) + for (i = zero; i <= 1000000 && can_loop; i++) sum += i; return sum;