Merge branch 'bpf: Add socket destroy capability'

Aditi Ghag says:

====================

This patch set adds the capability to destroy sockets in BPF. We plan to
use the capability in Cilium to force client sockets to reconnect when
their remote load-balancing backends are deleted. The other use case is
on-the-fly policy enforcement, where existing socket connections that are
no longer allowed by policy need to be terminated.

The use cases, and more details around the selected approach, were
presented at LPC 2022 -
https://lpc.events/event/16/contributions/1358/.
RFC discussion -
https://lore.kernel.org/netdev/CABG=zsBEh-P4NXk23eBJw7eajB5YJeRS7oPXnTAzs=yob4EMoQ@mail.gmail.com/T/#u.
v8 patch series -
https://lore.kernel.org/bpf/20230517175359.527917-1-aditi.ghag@isovalent.com/

v9 highlights:
Addressed review comments:
Martin:
- Rearranged the kfunc filter patch, and added the missing break
  statement.
- Squashed the extended selftest/bpf patch.
Yonghong:
- Revised commit message for patch 1.

(The notes below are the same as in the v8 patch series and are still
relevant. Refer to earlier versions of the patch series for other notes.)
- I hit a snag while writing the kfunc where the verifier complained about the
  `sock_common` type passed from the TCP iterator. With kfuncs, there don't
  seem to be any options available to pass BTF type hints to the verifier
  (equivalent of `ARG_PTR_TO_BTF_ID_SOCK_COMMON`, as was the case with the
  helper). As a result, I changed the argument type of the sock_destroy
  kfunc to `sock_common` (see the caller sketch after the commit summary
  below).
====================

Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Martin KaFai Lau 2023-05-19 17:45:47 -07:00
commit 18f558876f
13 changed files with 794 additions and 79 deletions
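
As a quick illustration of the `sock_common` argument discussed in the cover
letter, a minimal BPF-side caller could look like the sketch below. This is an
illustrative sketch only, modeled on the selftest programs added later in this
series; the program name is made up, and bpf_tracing_net.h is the selftests
header that provides AF_INET6.

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include "bpf_tracing_net.h"	/* selftests header: AF_INET6, etc. */

/* The kfunc takes a struct sock_common *, per the cover letter note. */
int bpf_sock_destroy(struct sock_common *sk) __ksym;

SEC("iter/tcp")
int destroy_tcp6_sockets(struct bpf_iter__tcp *ctx)
{
	struct sock_common *sk_common = ctx->sk_common;

	if (!sk_common || sk_common->skc_family != AF_INET6)
		return 0;

	/* A real program would filter by socket cookie or port, as the
	 * selftests below do, before destroying the socket.
	 */
	bpf_sock_destroy(sk_common);
	return 0;
}

char _license[] SEC("license") = "GPL";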

View file

@ -98,10 +98,14 @@ struct btf_type;
union bpf_attr;
struct btf_show;
struct btf_id_set;
struct bpf_prog;
typedef int (*btf_kfunc_filter_t)(const struct bpf_prog *prog, u32 kfunc_id);
struct btf_kfunc_id_set {
struct module *owner;
struct btf_id_set8 *set;
btf_kfunc_filter_t filter;
};
struct btf_id_dtor_kfunc {
@ -479,7 +483,6 @@ static inline void *btf_id_set8_contains(const struct btf_id_set8 *set, u32 id)
return bsearch(&id, set->pairs, set->cnt, sizeof(set->pairs[0]), btf_id_cmp_func);
}
struct bpf_prog;
struct bpf_verifier_log;
#ifdef CONFIG_BPF_SYSCALL
@ -487,10 +490,10 @@ const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id);
const char *btf_name_by_offset(const struct btf *btf, u32 offset);
struct btf *btf_parse_vmlinux(void);
struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog);
u32 *btf_kfunc_id_set_contains(const struct btf *btf,
enum bpf_prog_type prog_type,
u32 kfunc_btf_id);
u32 *btf_kfunc_is_modify_return(const struct btf *btf, u32 kfunc_btf_id);
u32 *btf_kfunc_id_set_contains(const struct btf *btf, u32 kfunc_btf_id,
const struct bpf_prog *prog);
u32 *btf_kfunc_is_modify_return(const struct btf *btf, u32 kfunc_btf_id,
const struct bpf_prog *prog);
int register_btf_kfunc_id_set(enum bpf_prog_type prog_type,
const struct btf_kfunc_id_set *s);
int register_btf_fmodret_id_set(const struct btf_kfunc_id_set *kset);
@ -517,8 +520,9 @@ static inline const char *btf_name_by_offset(const struct btf *btf,
return NULL;
}
static inline u32 *btf_kfunc_id_set_contains(const struct btf *btf,
enum bpf_prog_type prog_type,
u32 kfunc_btf_id)
u32 kfunc_btf_id,
struct bpf_prog *prog)
{
return NULL;
}

View file

@ -437,7 +437,6 @@ struct udp_seq_afinfo {
struct udp_iter_state {
struct seq_net_private p;
int bucket;
struct udp_seq_afinfo *bpf_seq_afinfo;
};
void *udp_seq_start(struct seq_file *seq, loff_t *pos);

View file

@ -222,10 +222,17 @@ enum btf_kfunc_hook {
enum {
BTF_KFUNC_SET_MAX_CNT = 256,
BTF_DTOR_KFUNC_MAX_CNT = 256,
BTF_KFUNC_FILTER_MAX_CNT = 16,
};
struct btf_kfunc_hook_filter {
btf_kfunc_filter_t filters[BTF_KFUNC_FILTER_MAX_CNT];
u32 nr_filters;
};
struct btf_kfunc_set_tab {
struct btf_id_set8 *sets[BTF_KFUNC_HOOK_MAX];
struct btf_kfunc_hook_filter hook_filters[BTF_KFUNC_HOOK_MAX];
};
struct btf_id_dtor_kfunc_tab {
@ -7669,9 +7676,12 @@ static int btf_check_kfunc_protos(struct btf *btf, u32 func_id, u32 func_flags)
/* Kernel Function (kfunc) BTF ID set registration API */
static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
struct btf_id_set8 *add_set)
const struct btf_kfunc_id_set *kset)
{
struct btf_kfunc_hook_filter *hook_filter;
struct btf_id_set8 *add_set = kset->set;
bool vmlinux_set = !btf_is_module(btf);
bool add_filter = !!kset->filter;
struct btf_kfunc_set_tab *tab;
struct btf_id_set8 *set;
u32 set_cnt;
@ -7686,6 +7696,24 @@ static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
return 0;
tab = btf->kfunc_set_tab;
if (tab && add_filter) {
u32 i;
hook_filter = &tab->hook_filters[hook];
for (i = 0; i < hook_filter->nr_filters; i++) {
if (hook_filter->filters[i] == kset->filter) {
add_filter = false;
break;
}
}
if (add_filter && hook_filter->nr_filters == BTF_KFUNC_FILTER_MAX_CNT) {
ret = -E2BIG;
goto end;
}
}
if (!tab) {
tab = kzalloc(sizeof(*tab), GFP_KERNEL | __GFP_NOWARN);
if (!tab)
@ -7708,7 +7736,7 @@ static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
*/
if (!vmlinux_set) {
tab->sets[hook] = add_set;
return 0;
goto do_add_filter;
}
/* In case of vmlinux sets, there may be more than one set being
@ -7750,6 +7778,11 @@ static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
sort(set->pairs, set->cnt, sizeof(set->pairs[0]), btf_id_cmp_func, NULL);
do_add_filter:
if (add_filter) {
hook_filter = &tab->hook_filters[hook];
hook_filter->filters[hook_filter->nr_filters++] = kset->filter;
}
return 0;
end:
btf_free_kfunc_set_tab(btf);
@ -7758,15 +7791,22 @@ static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
static u32 *__btf_kfunc_id_set_contains(const struct btf *btf,
enum btf_kfunc_hook hook,
u32 kfunc_btf_id)
u32 kfunc_btf_id,
const struct bpf_prog *prog)
{
struct btf_kfunc_hook_filter *hook_filter;
struct btf_id_set8 *set;
u32 *id;
u32 *id, i;
if (hook >= BTF_KFUNC_HOOK_MAX)
return NULL;
if (!btf->kfunc_set_tab)
return NULL;
hook_filter = &btf->kfunc_set_tab->hook_filters[hook];
for (i = 0; i < hook_filter->nr_filters; i++) {
if (hook_filter->filters[i](prog, kfunc_btf_id))
return NULL;
}
set = btf->kfunc_set_tab->sets[hook];
if (!set)
return NULL;
@ -7821,23 +7861,25 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
* protection for looking up a well-formed btf->kfunc_set_tab.
*/
u32 *btf_kfunc_id_set_contains(const struct btf *btf,
enum bpf_prog_type prog_type,
u32 kfunc_btf_id)
u32 kfunc_btf_id,
const struct bpf_prog *prog)
{
enum bpf_prog_type prog_type = resolve_prog_type(prog);
enum btf_kfunc_hook hook;
u32 *kfunc_flags;
kfunc_flags = __btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_COMMON, kfunc_btf_id);
kfunc_flags = __btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_COMMON, kfunc_btf_id, prog);
if (kfunc_flags)
return kfunc_flags;
hook = bpf_prog_type_to_kfunc_hook(prog_type);
return __btf_kfunc_id_set_contains(btf, hook, kfunc_btf_id);
return __btf_kfunc_id_set_contains(btf, hook, kfunc_btf_id, prog);
}
u32 *btf_kfunc_is_modify_return(const struct btf *btf, u32 kfunc_btf_id)
u32 *btf_kfunc_is_modify_return(const struct btf *btf, u32 kfunc_btf_id,
const struct bpf_prog *prog)
{
return __btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_FMODRET, kfunc_btf_id);
return __btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_FMODRET, kfunc_btf_id, prog);
}
static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook,
@ -7868,7 +7910,8 @@ static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook,
goto err_out;
}
ret = btf_populate_kfunc_set(btf, hook, kset->set);
ret = btf_populate_kfunc_set(btf, hook, kset);
err_out:
btf_put(btf);
return ret;

View file

@ -10939,7 +10939,7 @@ static int fetch_kfunc_meta(struct bpf_verifier_env *env,
*kfunc_name = func_name;
func_proto = btf_type_by_id(desc_btf, func->type);
kfunc_flags = btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog), func_id);
kfunc_flags = btf_kfunc_id_set_contains(desc_btf, func_id, env->prog);
if (!kfunc_flags) {
return -EACCES;
}
@ -19010,7 +19010,8 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
* in the fmodret id set with the KF_SLEEPABLE flag.
*/
else {
u32 *flags = btf_kfunc_is_modify_return(btf, btf_id);
u32 *flags = btf_kfunc_is_modify_return(btf, btf_id,
prog);
if (flags && (*flags & KF_SLEEPABLE))
ret = 0;
@ -19038,7 +19039,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
return -EINVAL;
}
ret = -EINVAL;
if (btf_kfunc_is_modify_return(btf, btf_id) ||
if (btf_kfunc_is_modify_return(btf, btf_id, prog) ||
!check_attach_modify_return(addr, tname))
ret = 0;
if (ret) {

View file

@ -11723,3 +11723,66 @@ static int __init bpf_kfunc_init(void)
return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp);
}
late_initcall(bpf_kfunc_init);
/* Disables missing prototype warnings */
__diag_push();
__diag_ignore_all("-Wmissing-prototypes",
"Global functions as their definitions will be in vmlinux BTF");
/* bpf_sock_destroy: Destroy the given socket with ECONNABORTED error code.
*
* The function expects a non-NULL pointer to a socket, and invokes the
* protocol specific socket destroy handlers.
*
* The helper can only be called from BPF contexts that have acquired the socket
* locks.
*
* Parameters:
* @sock: Pointer to socket to be destroyed
*
* Return:
* On error, may return EPROTONOSUPPORT, EINVAL.
* EPROTONOSUPPORT if protocol specific destroy handler is not supported.
* 0 otherwise
*/
__bpf_kfunc int bpf_sock_destroy(struct sock_common *sock)
{
struct sock *sk = (struct sock *)sock;
/* The locking semantics that allow for synchronous execution of the
* destroy handlers are only supported for TCP and UDP.
* Supporting protocols will need to acquire sock lock in the BPF context
* prior to invoking this kfunc.
*/
if (!sk->sk_prot->diag_destroy || (sk->sk_protocol != IPPROTO_TCP &&
sk->sk_protocol != IPPROTO_UDP))
return -EOPNOTSUPP;
return sk->sk_prot->diag_destroy(sk, ECONNABORTED);
}
__diag_pop()
BTF_SET8_START(bpf_sk_iter_kfunc_ids)
BTF_ID_FLAGS(func, bpf_sock_destroy, KF_TRUSTED_ARGS)
BTF_SET8_END(bpf_sk_iter_kfunc_ids)
static int tracing_iter_filter(const struct bpf_prog *prog, u32 kfunc_id)
{
if (btf_id_set8_contains(&bpf_sk_iter_kfunc_ids, kfunc_id) &&
prog->expected_attach_type != BPF_TRACE_ITER)
return -EACCES;
return 0;
}
static const struct btf_kfunc_id_set bpf_sk_iter_kfunc_set = {
.owner = THIS_MODULE,
.set = &bpf_sk_iter_kfunc_ids,
.filter = tracing_iter_filter,
};
static int init_subsystem(void)
{
return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_sk_iter_kfunc_set);
}
late_initcall(init_subsystem);

View file

@ -4682,8 +4682,10 @@ int tcp_abort(struct sock *sk, int err)
return 0;
}
/* Don't race with userspace socket closes such as tcp_close. */
lock_sock(sk);
/* BPF context ensures sock locking. */
if (!has_current_bpf_ctx())
/* Don't race with userspace socket closes such as tcp_close. */
lock_sock(sk);
if (sk->sk_state == TCP_LISTEN) {
tcp_set_state(sk, TCP_CLOSE);
@ -4707,7 +4709,8 @@ int tcp_abort(struct sock *sk, int err)
bh_unlock_sock(sk);
local_bh_enable();
tcp_write_queue_purge(sk);
release_sock(sk);
if (!has_current_bpf_ctx())
release_sock(sk);
return 0;
}
EXPORT_SYMBOL_GPL(tcp_abort);

View file

@ -2962,7 +2962,6 @@ static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
struct bpf_iter_meta meta;
struct bpf_prog *prog;
struct sock *sk = v;
bool slow;
uid_t uid;
int ret;
@ -2970,7 +2969,7 @@ static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
return 0;
if (sk_fullsock(sk))
slow = lock_sock_fast(sk);
lock_sock(sk);
if (unlikely(sk_unhashed(sk))) {
ret = SEQ_SKIP;
@ -2994,7 +2993,7 @@ static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
unlock:
if (sk_fullsock(sk))
unlock_sock_fast(sk, slow);
release_sock(sk);
return ret;
}
@ -3356,7 +3355,7 @@ static struct bpf_iter_reg tcp_reg_info = {
.ctx_arg_info_size = 1,
.ctx_arg_info = {
{ offsetof(struct bpf_iter__tcp, sk_common),
PTR_TO_BTF_ID_OR_NULL },
PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
},
.get_func_proto = bpf_iter_tcp_get_func_proto,
.seq_info = &tcp_seq_info,

View file

@ -2930,7 +2930,8 @@ EXPORT_SYMBOL(udp_poll);
int udp_abort(struct sock *sk, int err)
{
lock_sock(sk);
if (!has_current_bpf_ctx())
lock_sock(sk);
/* udp{v6}_destroy_sock() sets it under the sk lock, avoid racing
* with close()
@ -2943,7 +2944,8 @@ int udp_abort(struct sock *sk, int err)
__udp_disconnect(sk, 0);
out:
release_sock(sk);
if (!has_current_bpf_ctx())
release_sock(sk);
return 0;
}
@ -2988,9 +2990,30 @@ EXPORT_SYMBOL(udp_prot);
/* ------------------------------------------------------------------------ */
#ifdef CONFIG_PROC_FS
static struct udp_table *udp_get_table_afinfo(struct udp_seq_afinfo *afinfo,
struct net *net)
static unsigned short seq_file_family(const struct seq_file *seq);
static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
{
unsigned short family = seq_file_family(seq);
/* AF_UNSPEC is used as a match all */
return ((family == AF_UNSPEC || family == sk->sk_family) &&
net_eq(sock_net(sk), seq_file_net(seq)));
}
#ifdef CONFIG_BPF_SYSCALL
static const struct seq_operations bpf_iter_udp_seq_ops;
#endif
static struct udp_table *udp_get_table_seq(struct seq_file *seq,
struct net *net)
{
const struct udp_seq_afinfo *afinfo;
#ifdef CONFIG_BPF_SYSCALL
if (seq->op == &bpf_iter_udp_seq_ops)
return net->ipv4.udp_table;
#endif
afinfo = pde_data(file_inode(seq->file));
return afinfo->udp_table ? : net->ipv4.udp_table;
}
@ -2998,16 +3021,10 @@ static struct sock *udp_get_first(struct seq_file *seq, int start)
{
struct udp_iter_state *state = seq->private;
struct net *net = seq_file_net(seq);
struct udp_seq_afinfo *afinfo;
struct udp_table *udptable;
struct sock *sk;
if (state->bpf_seq_afinfo)
afinfo = state->bpf_seq_afinfo;
else
afinfo = pde_data(file_inode(seq->file));
udptable = udp_get_table_afinfo(afinfo, net);
udptable = udp_get_table_seq(seq, net);
for (state->bucket = start; state->bucket <= udptable->mask;
++state->bucket) {
@ -3018,10 +3035,7 @@ static struct sock *udp_get_first(struct seq_file *seq, int start)
spin_lock_bh(&hslot->lock);
sk_for_each(sk, &hslot->head) {
if (!net_eq(sock_net(sk), net))
continue;
if (afinfo->family == AF_UNSPEC ||
sk->sk_family == afinfo->family)
if (seq_sk_match(seq, sk))
goto found;
}
spin_unlock_bh(&hslot->lock);
@ -3035,22 +3049,14 @@ static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk)
{
struct udp_iter_state *state = seq->private;
struct net *net = seq_file_net(seq);
struct udp_seq_afinfo *afinfo;
struct udp_table *udptable;
if (state->bpf_seq_afinfo)
afinfo = state->bpf_seq_afinfo;
else
afinfo = pde_data(file_inode(seq->file));
do {
sk = sk_next(sk);
} while (sk && (!net_eq(sock_net(sk), net) ||
(afinfo->family != AF_UNSPEC &&
sk->sk_family != afinfo->family)));
} while (sk && !seq_sk_match(seq, sk));
if (!sk) {
udptable = udp_get_table_afinfo(afinfo, net);
udptable = udp_get_table_seq(seq, net);
if (state->bucket <= udptable->mask)
spin_unlock_bh(&udptable->hash[state->bucket].lock);
@ -3096,15 +3102,9 @@ EXPORT_SYMBOL(udp_seq_next);
void udp_seq_stop(struct seq_file *seq, void *v)
{
struct udp_iter_state *state = seq->private;
struct udp_seq_afinfo *afinfo;
struct udp_table *udptable;
if (state->bpf_seq_afinfo)
afinfo = state->bpf_seq_afinfo;
else
afinfo = pde_data(file_inode(seq->file));
udptable = udp_get_table_afinfo(afinfo, seq_file_net(seq));
udptable = udp_get_table_seq(seq, seq_file_net(seq));
if (state->bucket <= udptable->mask)
spin_unlock_bh(&udptable->hash[state->bucket].lock);
@ -3157,6 +3157,143 @@ struct bpf_iter__udp {
int bucket __aligned(8);
};
struct bpf_udp_iter_state {
struct udp_iter_state state;
unsigned int cur_sk;
unsigned int end_sk;
unsigned int max_sk;
int offset;
struct sock **batch;
bool st_bucket_done;
};
static int bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter,
unsigned int new_batch_sz);
static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
{
struct bpf_udp_iter_state *iter = seq->private;
struct udp_iter_state *state = &iter->state;
struct net *net = seq_file_net(seq);
struct udp_table *udptable;
unsigned int batch_sks = 0;
bool resized = false;
struct sock *sk;
/* The current batch is done, so advance the bucket. */
if (iter->st_bucket_done) {
state->bucket++;
iter->offset = 0;
}
udptable = udp_get_table_seq(seq, net);
again:
/* New batch for the next bucket.
* Iterate over the hash table to find a bucket with sockets matching
* the iterator attributes, and return the first matching socket from
* the bucket. The remaining matched sockets from the bucket are batched
* before releasing the bucket lock. This allows BPF programs that are
* called in seq_show to acquire the bucket lock if needed.
*/
iter->cur_sk = 0;
iter->end_sk = 0;
iter->st_bucket_done = false;
batch_sks = 0;
for (; state->bucket <= udptable->mask; state->bucket++) {
struct udp_hslot *hslot2 = &udptable->hash2[state->bucket];
if (hlist_empty(&hslot2->head)) {
iter->offset = 0;
continue;
}
spin_lock_bh(&hslot2->lock);
udp_portaddr_for_each_entry(sk, &hslot2->head) {
if (seq_sk_match(seq, sk)) {
/* Resume from the last iterated socket at the
* offset in the bucket before iterator was stopped.
*/
if (iter->offset) {
--iter->offset;
continue;
}
if (iter->end_sk < iter->max_sk) {
sock_hold(sk);
iter->batch[iter->end_sk++] = sk;
}
batch_sks++;
}
}
spin_unlock_bh(&hslot2->lock);
if (iter->end_sk)
break;
/* Reset the current bucket's offset before moving to the next bucket. */
iter->offset = 0;
}
/* All done: no batch made. */
if (!iter->end_sk)
return NULL;
if (iter->end_sk == batch_sks) {
/* Batching is done for the current bucket; return the first
* socket to be iterated from the batch.
*/
iter->st_bucket_done = true;
goto done;
}
if (!resized && !bpf_iter_udp_realloc_batch(iter, batch_sks * 3 / 2)) {
resized = true;
/* After allocating a larger batch, retry one more time to grab
* the whole bucket.
*/
state->bucket--;
goto again;
}
done:
return iter->batch[0];
}
static void *bpf_iter_udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct bpf_udp_iter_state *iter = seq->private;
struct sock *sk;
/* Whenever seq_next() is called, the iter->cur_sk is
* done with seq_show(), so unref the iter->cur_sk.
*/
if (iter->cur_sk < iter->end_sk) {
sock_put(iter->batch[iter->cur_sk++]);
++iter->offset;
}
/* After updating iter->cur_sk, check if there are more sockets
* available in the current bucket batch.
*/
if (iter->cur_sk < iter->end_sk)
sk = iter->batch[iter->cur_sk];
else
/* Prepare a new batch. */
sk = bpf_iter_udp_batch(seq);
++*pos;
return sk;
}
static void *bpf_iter_udp_seq_start(struct seq_file *seq, loff_t *pos)
{
/* bpf iter does not support lseek, so it always
* continue from where it was stop()-ped.
*/
if (*pos)
return bpf_iter_udp_batch(seq);
return SEQ_START_TOKEN;
}
static int udp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
struct udp_sock *udp_sk, uid_t uid, int bucket)
{
@ -3177,18 +3314,37 @@ static int bpf_iter_udp_seq_show(struct seq_file *seq, void *v)
struct bpf_prog *prog;
struct sock *sk = v;
uid_t uid;
int ret;
if (v == SEQ_START_TOKEN)
return 0;
lock_sock(sk);
if (unlikely(sk_unhashed(sk))) {
ret = SEQ_SKIP;
goto unlock;
}
uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
meta.seq = seq;
prog = bpf_iter_get_info(&meta, false);
return udp_prog_seq_show(prog, &meta, v, uid, state->bucket);
ret = udp_prog_seq_show(prog, &meta, v, uid, state->bucket);
unlock:
release_sock(sk);
return ret;
}
static void bpf_iter_udp_put_batch(struct bpf_udp_iter_state *iter)
{
while (iter->cur_sk < iter->end_sk)
sock_put(iter->batch[iter->cur_sk++]);
}
static void bpf_iter_udp_seq_stop(struct seq_file *seq, void *v)
{
struct bpf_udp_iter_state *iter = seq->private;
struct bpf_iter_meta meta;
struct bpf_prog *prog;
@ -3199,17 +3355,35 @@ static void bpf_iter_udp_seq_stop(struct seq_file *seq, void *v)
(void)udp_prog_seq_show(prog, &meta, v, 0, 0);
}
udp_seq_stop(seq, v);
if (iter->cur_sk < iter->end_sk) {
bpf_iter_udp_put_batch(iter);
iter->st_bucket_done = false;
}
}
static const struct seq_operations bpf_iter_udp_seq_ops = {
.start = udp_seq_start,
.next = udp_seq_next,
.start = bpf_iter_udp_seq_start,
.next = bpf_iter_udp_seq_next,
.stop = bpf_iter_udp_seq_stop,
.show = bpf_iter_udp_seq_show,
};
#endif
static unsigned short seq_file_family(const struct seq_file *seq)
{
const struct udp_seq_afinfo *afinfo;
#ifdef CONFIG_BPF_SYSCALL
/* BPF iterator: bpf programs to filter sockets. */
if (seq->op == &bpf_iter_udp_seq_ops)
return AF_UNSPEC;
#endif
/* Proc fs iterator */
afinfo = pde_data(file_inode(seq->file));
return afinfo->family;
}
const struct seq_operations udp_seq_ops = {
.start = udp_seq_start,
.next = udp_seq_next,
@ -3418,38 +3592,55 @@ static struct pernet_operations __net_initdata udp_sysctl_ops = {
DEFINE_BPF_ITER_FUNC(udp, struct bpf_iter_meta *meta,
struct udp_sock *udp_sk, uid_t uid, int bucket)
static int bpf_iter_init_udp(void *priv_data, struct bpf_iter_aux_info *aux)
static int bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter,
unsigned int new_batch_sz)
{
struct udp_iter_state *st = priv_data;
struct udp_seq_afinfo *afinfo;
int ret;
struct sock **new_batch;
afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN);
if (!afinfo)
new_batch = kvmalloc_array(new_batch_sz, sizeof(*new_batch),
GFP_USER | __GFP_NOWARN);
if (!new_batch)
return -ENOMEM;
afinfo->family = AF_UNSPEC;
afinfo->udp_table = NULL;
st->bpf_seq_afinfo = afinfo;
bpf_iter_udp_put_batch(iter);
kvfree(iter->batch);
iter->batch = new_batch;
iter->max_sk = new_batch_sz;
return 0;
}
#define INIT_BATCH_SZ 16
static int bpf_iter_init_udp(void *priv_data, struct bpf_iter_aux_info *aux)
{
struct bpf_udp_iter_state *iter = priv_data;
int ret;
ret = bpf_iter_init_seq_net(priv_data, aux);
if (ret)
kfree(afinfo);
return ret;
ret = bpf_iter_udp_realloc_batch(iter, INIT_BATCH_SZ);
if (ret)
bpf_iter_fini_seq_net(priv_data);
return ret;
}
static void bpf_iter_fini_udp(void *priv_data)
{
struct udp_iter_state *st = priv_data;
struct bpf_udp_iter_state *iter = priv_data;
kfree(st->bpf_seq_afinfo);
bpf_iter_fini_seq_net(priv_data);
kvfree(iter->batch);
}
static const struct bpf_iter_seq_info udp_seq_info = {
.seq_ops = &bpf_iter_udp_seq_ops,
.init_seq_private = bpf_iter_init_udp,
.fini_seq_private = bpf_iter_fini_udp,
.seq_priv_size = sizeof(struct udp_iter_state),
.seq_priv_size = sizeof(struct bpf_udp_iter_state),
};
static struct bpf_iter_reg udp_reg_info = {
@ -3457,7 +3648,7 @@ static struct bpf_iter_reg udp_reg_info = {
.ctx_arg_info_size = 1,
.ctx_arg_info = {
{ offsetof(struct bpf_iter__udp, udp_sk),
PTR_TO_BTF_ID_OR_NULL },
PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
},
.seq_info = &udp_seq_info,
};

View file

@ -427,3 +427,26 @@ void close_netns(struct nstoken *token)
close(token->orig_netns_fd);
free(token);
}
int get_socket_local_port(int sock_fd)
{
struct sockaddr_storage addr;
socklen_t addrlen = sizeof(addr);
int err;
err = getsockname(sock_fd, (struct sockaddr *)&addr, &addrlen);
if (err < 0)
return err;
if (addr.ss_family == AF_INET) {
struct sockaddr_in *sin = (struct sockaddr_in *)&addr;
return sin->sin_port;
} else if (addr.ss_family == AF_INET6) {
struct sockaddr_in6 *sin = (struct sockaddr_in6 *)&addr;
return sin->sin6_port;
}
return -1;
}

View file

@ -56,6 +56,7 @@ int fastopen_connect(int server_fd, const char *data, unsigned int data_len,
int make_sockaddr(int family, const char *addr_str, __u16 port,
struct sockaddr_storage *addr, socklen_t *len);
char *ping_command(int family);
int get_socket_local_port(int sock_fd);
struct nstoken;
/**

View file

@ -0,0 +1,221 @@
// SPDX-License-Identifier: GPL-2.0
#include <test_progs.h>
#include <bpf/bpf_endian.h>
#include "sock_destroy_prog.skel.h"
#include "sock_destroy_prog_fail.skel.h"
#include "network_helpers.h"
#define TEST_NS "sock_destroy_netns"
static void start_iter_sockets(struct bpf_program *prog)
{
struct bpf_link *link;
char buf[50] = {};
int iter_fd, len;
link = bpf_program__attach_iter(prog, NULL);
if (!ASSERT_OK_PTR(link, "attach_iter"))
return;
iter_fd = bpf_iter_create(bpf_link__fd(link));
if (!ASSERT_GE(iter_fd, 0, "create_iter"))
goto free_link;
while ((len = read(iter_fd, buf, sizeof(buf))) > 0)
;
ASSERT_GE(len, 0, "read");
close(iter_fd);
free_link:
bpf_link__destroy(link);
}
static void test_tcp_client(struct sock_destroy_prog *skel)
{
int serv = -1, clien = -1, accept_serv = -1, n;
serv = start_server(AF_INET6, SOCK_STREAM, NULL, 0, 0);
if (!ASSERT_GE(serv, 0, "start_server"))
goto cleanup;
clien = connect_to_fd(serv, 0);
if (!ASSERT_GE(clien, 0, "connect_to_fd"))
goto cleanup;
accept_serv = accept(serv, NULL, NULL);
if (!ASSERT_GE(accept_serv, 0, "serv accept"))
goto cleanup;
n = send(clien, "t", 1, 0);
if (!ASSERT_EQ(n, 1, "client send"))
goto cleanup;
/* Run iterator program that destroys connected client sockets. */
start_iter_sockets(skel->progs.iter_tcp6_client);
n = send(clien, "t", 1, 0);
if (!ASSERT_LT(n, 0, "client_send on destroyed socket"))
goto cleanup;
ASSERT_EQ(errno, ECONNABORTED, "error code on destroyed socket");
cleanup:
if (clien != -1)
close(clien);
if (accept_serv != -1)
close(accept_serv);
if (serv != -1)
close(serv);
}
static void test_tcp_server(struct sock_destroy_prog *skel)
{
int serv = -1, clien = -1, accept_serv = -1, n, serv_port;
serv = start_server(AF_INET6, SOCK_STREAM, NULL, 0, 0);
if (!ASSERT_GE(serv, 0, "start_server"))
goto cleanup;
serv_port = get_socket_local_port(serv);
if (!ASSERT_GE(serv_port, 0, "get_sock_local_port"))
goto cleanup;
skel->bss->serv_port = (__be16) serv_port;
clien = connect_to_fd(serv, 0);
if (!ASSERT_GE(clien, 0, "connect_to_fd"))
goto cleanup;
accept_serv = accept(serv, NULL, NULL);
if (!ASSERT_GE(accept_serv, 0, "serv accept"))
goto cleanup;
n = send(clien, "t", 1, 0);
if (!ASSERT_EQ(n, 1, "client send"))
goto cleanup;
/* Run iterator program that destroys server sockets. */
start_iter_sockets(skel->progs.iter_tcp6_server);
n = send(clien, "t", 1, 0);
if (!ASSERT_LT(n, 0, "client_send on destroyed socket"))
goto cleanup;
ASSERT_EQ(errno, ECONNRESET, "error code on destroyed socket");
cleanup:
if (clien != -1)
close(clien);
if (accept_serv != -1)
close(accept_serv);
if (serv != -1)
close(serv);
}
static void test_udp_client(struct sock_destroy_prog *skel)
{
int serv = -1, clien = -1, n = 0;
serv = start_server(AF_INET6, SOCK_DGRAM, NULL, 0, 0);
if (!ASSERT_GE(serv, 0, "start_server"))
goto cleanup;
clien = connect_to_fd(serv, 0);
if (!ASSERT_GE(clien, 0, "connect_to_fd"))
goto cleanup;
n = send(clien, "t", 1, 0);
if (!ASSERT_EQ(n, 1, "client send"))
goto cleanup;
/* Run iterator program that destroys sockets. */
start_iter_sockets(skel->progs.iter_udp6_client);
n = send(clien, "t", 1, 0);
if (!ASSERT_LT(n, 0, "client_send on destroyed socket"))
goto cleanup;
/* UDP sockets have an overriding error code after they are disconnected,
* so we don't check for ECONNABORTED error code.
*/
cleanup:
if (clien != -1)
close(clien);
if (serv != -1)
close(serv);
}
static void test_udp_server(struct sock_destroy_prog *skel)
{
int *listen_fds = NULL, n, i, serv_port;
unsigned int num_listens = 5;
char buf[1];
/* Start reuseport servers. */
listen_fds = start_reuseport_server(AF_INET6, SOCK_DGRAM,
"::1", 0, 0, num_listens);
if (!ASSERT_OK_PTR(listen_fds, "start_reuseport_server"))
goto cleanup;
serv_port = get_socket_local_port(listen_fds[0]);
if (!ASSERT_GE(serv_port, 0, "get_sock_local_port"))
goto cleanup;
skel->bss->serv_port = (__be16) serv_port;
/* Run iterator program that destroys server sockets. */
start_iter_sockets(skel->progs.iter_udp6_server);
for (i = 0; i < num_listens; ++i) {
n = read(listen_fds[i], buf, sizeof(buf));
if (!ASSERT_EQ(n, -1, "read") ||
!ASSERT_EQ(errno, ECONNABORTED, "error code on destroyed socket"))
break;
}
ASSERT_EQ(i, num_listens, "server socket");
cleanup:
free_fds(listen_fds, num_listens);
}
void test_sock_destroy(void)
{
struct sock_destroy_prog *skel;
struct nstoken *nstoken = NULL;
int cgroup_fd;
skel = sock_destroy_prog__open_and_load();
if (!ASSERT_OK_PTR(skel, "skel_open"))
return;
cgroup_fd = test__join_cgroup("/sock_destroy");
if (!ASSERT_GE(cgroup_fd, 0, "join_cgroup"))
goto cleanup;
skel->links.sock_connect = bpf_program__attach_cgroup(
skel->progs.sock_connect, cgroup_fd);
if (!ASSERT_OK_PTR(skel->links.sock_connect, "prog_attach"))
goto cleanup;
SYS(cleanup, "ip netns add %s", TEST_NS);
SYS(cleanup, "ip -net %s link set dev lo up", TEST_NS);
nstoken = open_netns(TEST_NS);
if (!ASSERT_OK_PTR(nstoken, "open_netns"))
goto cleanup;
if (test__start_subtest("tcp_client"))
test_tcp_client(skel);
if (test__start_subtest("tcp_server"))
test_tcp_server(skel);
if (test__start_subtest("udp_client"))
test_udp_client(skel);
if (test__start_subtest("udp_server"))
test_udp_server(skel);
RUN_TESTS(sock_destroy_prog_fail);
cleanup:
if (nstoken)
close_netns(nstoken);
SYS_NOFAIL("ip netns del " TEST_NS " &> /dev/null");
if (cgroup_fd >= 0)
close(cgroup_fd);
sock_destroy_prog__destroy(skel);
}

View file

@ -0,0 +1,145 @@
// SPDX-License-Identifier: GPL-2.0
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>
#include "bpf_tracing_net.h"
__be16 serv_port = 0;
int bpf_sock_destroy(struct sock_common *sk) __ksym;
struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__uint(max_entries, 1);
__type(key, __u32);
__type(value, __u64);
} tcp_conn_sockets SEC(".maps");
struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__uint(max_entries, 1);
__type(key, __u32);
__type(value, __u64);
} udp_conn_sockets SEC(".maps");
SEC("cgroup/connect6")
int sock_connect(struct bpf_sock_addr *ctx)
{
__u64 sock_cookie = 0;
int key = 0;
__u32 keyc = 0;
if (ctx->family != AF_INET6 || ctx->user_family != AF_INET6)
return 1;
sock_cookie = bpf_get_socket_cookie(ctx);
if (ctx->protocol == IPPROTO_TCP)
bpf_map_update_elem(&tcp_conn_sockets, &key, &sock_cookie, 0);
else if (ctx->protocol == IPPROTO_UDP)
bpf_map_update_elem(&udp_conn_sockets, &keyc, &sock_cookie, 0);
else
return 1;
return 1;
}
SEC("iter/tcp")
int iter_tcp6_client(struct bpf_iter__tcp *ctx)
{
struct sock_common *sk_common = ctx->sk_common;
__u64 sock_cookie = 0;
__u64 *val;
int key = 0;
if (!sk_common)
return 0;
if (sk_common->skc_family != AF_INET6)
return 0;
sock_cookie = bpf_get_socket_cookie(sk_common);
val = bpf_map_lookup_elem(&tcp_conn_sockets, &key);
if (!val)
return 0;
/* Destroy connected client sockets. */
if (sock_cookie == *val)
bpf_sock_destroy(sk_common);
return 0;
}
SEC("iter/tcp")
int iter_tcp6_server(struct bpf_iter__tcp *ctx)
{
struct sock_common *sk_common = ctx->sk_common;
const struct inet_connection_sock *icsk;
const struct inet_sock *inet;
struct tcp6_sock *tcp_sk;
__be16 srcp;
if (!sk_common)
return 0;
if (sk_common->skc_family != AF_INET6)
return 0;
tcp_sk = bpf_skc_to_tcp6_sock(sk_common);
if (!tcp_sk)
return 0;
icsk = &tcp_sk->tcp.inet_conn;
inet = &icsk->icsk_inet;
srcp = inet->inet_sport;
/* Destroy server sockets. */
if (srcp == serv_port)
bpf_sock_destroy(sk_common);
return 0;
}
SEC("iter/udp")
int iter_udp6_client(struct bpf_iter__udp *ctx)
{
struct udp_sock *udp_sk = ctx->udp_sk;
struct sock *sk = (struct sock *) udp_sk;
__u64 sock_cookie = 0, *val;
int key = 0;
if (!sk)
return 0;
sock_cookie = bpf_get_socket_cookie(sk);
val = bpf_map_lookup_elem(&udp_conn_sockets, &key);
if (!val)
return 0;
/* Destroy connected client sockets. */
if (sock_cookie == *val)
bpf_sock_destroy((struct sock_common *)sk);
return 0;
}
SEC("iter/udp")
int iter_udp6_server(struct bpf_iter__udp *ctx)
{
struct udp_sock *udp_sk = ctx->udp_sk;
struct sock *sk = (struct sock *) udp_sk;
struct inet_sock *inet;
__be16 srcp;
if (!sk)
return 0;
inet = &udp_sk->inet;
srcp = inet->inet_sport;
if (srcp == serv_port)
bpf_sock_destroy((struct sock_common *)sk);
return 0;
}
char _license[] SEC("license") = "GPL";

View file

@ -0,0 +1,22 @@
// SPDX-License-Identifier: GPL-2.0
#include "vmlinux.h"
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_helpers.h>
#include "bpf_misc.h"
char _license[] SEC("license") = "GPL";
int bpf_sock_destroy(struct sock_common *sk) __ksym;
SEC("tp_btf/tcp_destroy_sock")
__failure __msg("calling kernel function bpf_sock_destroy is not allowed")
int BPF_PROG(trace_tcp_destroy_sock, struct sock *sk)
{
/* should not load */
bpf_sock_destroy((struct sock_common *)sk);
return 0;
}
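
With the series applied, the new coverage can typically be exercised from
tools/testing/selftests/bpf via the test_progs runner, e.g.
./test_progs -t sock_destroy (assuming the usual test name derived from
test_sock_destroy()). The negative program above is expected to be rejected at
load time with the "calling kernel function bpf_sock_destroy is not allowed"
verifier message enforced by the new kfunc filter.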