Daniel Borkmann says:

====================
pull-request: bpf 2022-10-03

We've added 10 non-merge commits during the last 23 day(s) which contain
a total of 14 files changed, 130 insertions(+), 69 deletions(-).

The main changes are:

1) Fix dynptr helper API to gate behind CAP_BPF given it was not intended
   for unprivileged BPF programs, from Kumar Kartikeya Dwivedi.

2) Fix need_wakeup flag inheritance from umem buffer pool for shared xsk
   sockets, from Jalal Mostafa.

3) Fix truncated last_member_type_id in btf_struct_resolve() which had a
   wrong storage type, from Lorenz Bauer.

4) Fix xsk back-pressure mechanism on tx when amount of produced
   descriptors to CQ is lower than what was grabbed from xsk tx ring,
   from Maciej Fijalkowski.

5) Fix wrong cgroup attach flags being displayed to effective progs,
   from Pu Lehui.

* https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf:
  xsk: Inherit need_wakeup flag for shared sockets
  bpf: Gate dynptr API behind CAP_BPF
  selftests/bpf: Adapt cgroup effective query uapi change
  bpftool: Fix wrong cgroup attach flags being assigned to effective progs
  bpf, cgroup: Reject prog_attach_flags array when effective query
  bpf: Ensure correct locking around vulnerable function find_vpid()
  bpf: btf: fix truncated last_member_type_id in btf_struct_resolve
  selftests/xsk: Add missing close() on netns fd
  xsk: Fix backpressure mechanism on Tx
  MAINTAINERS: Add include/linux/tnum.h to BPF CORE
====================

Link: https://lore.kernel.org/r/20221003201957.13149-1-daniel@iogearbox.net
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
commit ad061cf422 (Jakub Kicinski, 2022-10-03 16:17:44 -07:00)
14 changed files with 130 additions and 69 deletions


@@ -3825,6 +3825,7 @@ F: kernel/bpf/dispatcher.c
 F: kernel/bpf/trampoline.c
 F: include/linux/bpf*
 F: include/linux/filter.h
+F: include/linux/tnum.h

 BPF [BTF]
 M: Martin KaFai Lau <martin.lau@linux.dev>


@@ -95,7 +95,7 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs,
                                                 struct xdp_umem *umem);
 int xp_assign_dev(struct xsk_buff_pool *pool, struct net_device *dev,
                   u16 queue_id, u16 flags);
-int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_umem *umem,
+int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_sock *umem_xs,
                          struct net_device *dev, u16 queue_id);
 int xp_alloc_tx_descs(struct xsk_buff_pool *pool, struct xdp_sock *xs);
 void xp_destroy(struct xsk_buff_pool *pool);


@@ -1233,7 +1233,7 @@ enum {
 /* Query effective (directly attached + inherited from ancestor cgroups)
  * programs that will be executed for events within a cgroup.
- * attach_flags with this flag are returned only for directly attached programs.
+ * attach_flags with this flag are always returned 0.
  */
 #define BPF_F_QUERY_EFFECTIVE   (1U << 0)

@@ -1432,7 +1432,10 @@ union bpf_attr {
                 __u32           attach_flags;
                 __aligned_u64   prog_ids;
                 __u32           prog_cnt;
-                __aligned_u64   prog_attach_flags; /* output: per-program attach_flags */
+                /* output: per-program attach_flags.
+                 * not allowed to be set during effective query.
+                 */
+                __aligned_u64   prog_attach_flags;
         } query;

         struct { /* anonymous struct used by BPF_RAW_TRACEPOINT_OPEN command */
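
To make the new contract concrete, here is a hedged userspace sketch (ours, not part of this series; the function name and buffer size are illustrative), using the same bpf_prog_query_opts() libbpf wrapper that the bpftool change below relies on. Under BPF_F_QUERY_EFFECTIVE, prog_attach_flags must stay unset, since the kernel now rejects the combination with -EINVAL and reports attach_flags as 0:

/* Sketch only: an effective query via libbpf. */
#include <bpf/bpf.h>

static int query_effective_progs(int cgroup_fd)
{
        __u32 prog_ids[64];
        LIBBPF_OPTS(bpf_prog_query_opts, opts,
                .query_flags = BPF_F_QUERY_EFFECTIVE,
                .prog_ids = prog_ids,
                .prog_cnt = 64);

        /* opts.prog_attach_flags is intentionally left NULL: pairing it
         * with BPF_F_QUERY_EFFECTIVE now fails with -EINVAL, and
         * opts.attach_flags always comes back 0 for effective queries.
         */
        return bpf_prog_query_opts(cgroup_fd, BPF_CGROUP_INET_EGRESS, &opts);
}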


@@ -3128,7 +3128,7 @@ static int btf_struct_resolve(struct btf_verifier_env *env,
         if (v->next_member) {
                 const struct btf_type *last_member_type;
                 const struct btf_member *last_member;
-                u16 last_member_type_id;
+                u32 last_member_type_id;

                 last_member = btf_type_member(v->t) + v->next_member - 1;
                 last_member_type_id = last_member->type;


@@ -1020,6 +1020,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
                               union bpf_attr __user *uattr)
 {
         __u32 __user *prog_attach_flags = u64_to_user_ptr(attr->query.prog_attach_flags);
+        bool effective_query = attr->query.query_flags & BPF_F_QUERY_EFFECTIVE;
         __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
         enum bpf_attach_type type = attr->query.attach_type;
         enum cgroup_bpf_attach_type from_atype, to_atype;
@@ -1029,8 +1030,12 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
         int total_cnt = 0;
         u32 flags;

+        if (effective_query && prog_attach_flags)
+                return -EINVAL;
+
         if (type == BPF_LSM_CGROUP) {
-                if (attr->query.prog_cnt && prog_ids && !prog_attach_flags)
+                if (!effective_query && attr->query.prog_cnt &&
+                    prog_ids && !prog_attach_flags)
                         return -EINVAL;

                 from_atype = CGROUP_LSM_START;
@@ -1045,7 +1050,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
         }

         for (atype = from_atype; atype <= to_atype; atype++) {
-                if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) {
+                if (effective_query) {
                         effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
                                                               lockdep_is_held(&cgroup_mutex));
                         total_cnt += bpf_prog_array_length(effective);
@@ -1054,6 +1059,8 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
                 }
         }

+        /* always output uattr->query.attach_flags as 0 during effective query */
+        flags = effective_query ? 0 : flags;
         if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)))
                 return -EFAULT;
         if (copy_to_user(&uattr->query.prog_cnt, &total_cnt, sizeof(total_cnt)))
@@ -1068,7 +1075,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
         }

         for (atype = from_atype; atype <= to_atype && total_cnt; atype++) {
-                if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) {
+                if (effective_query) {
                         effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
                                                               lockdep_is_held(&cgroup_mutex));
                         cnt = min_t(int, bpf_prog_array_length(effective), total_cnt);
@@ -1090,15 +1097,16 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
                                 if (++i == cnt)
                                         break;
                         }
-                }

-                if (prog_attach_flags) {
-                        flags = cgrp->bpf.flags[atype];
+                        if (prog_attach_flags) {
+                                flags = cgrp->bpf.flags[atype];

-                        for (i = 0; i < cnt; i++)
-                                if (copy_to_user(prog_attach_flags + i, &flags, sizeof(flags)))
-                                        return -EFAULT;
-                        prog_attach_flags += cnt;
+                                for (i = 0; i < cnt; i++)
+                                        if (copy_to_user(prog_attach_flags + i,
+                                                         &flags, sizeof(flags)))
+                                                return -EFAULT;
+                                prog_attach_flags += cnt;
+                        }
                 }

                 prog_ids += cnt;
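
For contrast, a hedged sketch (ours, not from the patch; names and sizes are illustrative) of the non-effective BPF_LSM_CGROUP case that the check above still enforces: when prog_ids is supplied without BPF_F_QUERY_EFFECTIVE, a prog_attach_flags array has to accompany it, and the kernel then fills one flags value per program from cgrp->bpf.flags[atype]:

#include <bpf/bpf.h>

static int query_lsm_cgroup_progs(int cgroup_fd)
{
        __u32 prog_ids[16], prog_attach_flags[16];
        LIBBPF_OPTS(bpf_prog_query_opts, opts,
                .prog_ids = prog_ids,
                .prog_attach_flags = prog_attach_flags,
                .prog_cnt = 16);

        /* query_flags == 0: per-program attach flags are returned;
         * omitting prog_attach_flags here while passing prog_ids
         * would yield -EINVAL for BPF_LSM_CGROUP.
         */
        return bpf_prog_query_opts(cgroup_fd, BPF_LSM_CGROUP, &opts);
}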


@@ -1627,26 +1627,12 @@ bpf_base_func_proto(enum bpf_func_id func_id)
                 return &bpf_ringbuf_discard_proto;
         case BPF_FUNC_ringbuf_query:
                 return &bpf_ringbuf_query_proto;
-        case BPF_FUNC_ringbuf_reserve_dynptr:
-                return &bpf_ringbuf_reserve_dynptr_proto;
-        case BPF_FUNC_ringbuf_submit_dynptr:
-                return &bpf_ringbuf_submit_dynptr_proto;
-        case BPF_FUNC_ringbuf_discard_dynptr:
-                return &bpf_ringbuf_discard_dynptr_proto;
         case BPF_FUNC_for_each_map_elem:
                 return &bpf_for_each_map_elem_proto;
         case BPF_FUNC_loop:
                 return &bpf_loop_proto;
         case BPF_FUNC_strncmp:
                 return &bpf_strncmp_proto;
-        case BPF_FUNC_dynptr_from_mem:
-                return &bpf_dynptr_from_mem_proto;
-        case BPF_FUNC_dynptr_read:
-                return &bpf_dynptr_read_proto;
-        case BPF_FUNC_dynptr_write:
-                return &bpf_dynptr_write_proto;
-        case BPF_FUNC_dynptr_data:
-                return &bpf_dynptr_data_proto;
         default:
                 break;
         }
@@ -1675,6 +1661,20 @@ bpf_base_func_proto(enum bpf_func_id func_id)
                 return &bpf_timer_cancel_proto;
         case BPF_FUNC_kptr_xchg:
                 return &bpf_kptr_xchg_proto;
+        case BPF_FUNC_ringbuf_reserve_dynptr:
+                return &bpf_ringbuf_reserve_dynptr_proto;
+        case BPF_FUNC_ringbuf_submit_dynptr:
+                return &bpf_ringbuf_submit_dynptr_proto;
+        case BPF_FUNC_ringbuf_discard_dynptr:
+                return &bpf_ringbuf_discard_dynptr_proto;
+        case BPF_FUNC_dynptr_from_mem:
+                return &bpf_dynptr_from_mem_proto;
+        case BPF_FUNC_dynptr_read:
+                return &bpf_dynptr_read_proto;
+        case BPF_FUNC_dynptr_write:
+                return &bpf_dynptr_write_proto;
+        case BPF_FUNC_dynptr_data:
+                return &bpf_dynptr_data_proto;
         default:
                 break;
         }
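
The two hunks above move the dynptr cases out of the unconditionally reachable switch and into the part of bpf_base_func_proto() that runs only after the capability check. A simplified, self-contained model of that gating pattern (stub names are ours, not kernel code):

#include <stdbool.h>
#include <stddef.h>

enum func_id { FUNC_RINGBUF_QUERY, FUNC_DYNPTR_READ };

static bool capable_stub(void) { return false; } /* stand-in for bpf_capable() */

static const char *base_func_proto(enum func_id id)
{
        switch (id) {
        case FUNC_RINGBUF_QUERY:        /* still available unprivileged */
                return "ringbuf_query";
        default:
                break;
        }

        if (!capable_stub())
                return NULL;            /* CAP_BPF required past this point */

        switch (id) {
        case FUNC_DYNPTR_READ:          /* dynptr helpers now resolve here */
                return "dynptr_read";
        default:
                return NULL;
        }
}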


@@ -4395,7 +4395,9 @@ static int bpf_task_fd_query(const union bpf_attr *attr,
         if (attr->task_fd_query.flags != 0)
                 return -EINVAL;

+        rcu_read_lock();
         task = get_pid_task(find_vpid(pid), PIDTYPE_PID);
+        rcu_read_unlock();
         if (!task)
                 return -ENOENT;


@@ -355,16 +355,15 @@ static u32 xsk_tx_peek_release_fallback(struct xsk_buff_pool *pool, u32 max_entries)
         return nb_pkts;
 }

-u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 max_entries)
+u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 nb_pkts)
 {
         struct xdp_sock *xs;
-        u32 nb_pkts;

         rcu_read_lock();
         if (!list_is_singular(&pool->xsk_tx_list)) {
                 /* Fallback to the non-batched version */
                 rcu_read_unlock();
-                return xsk_tx_peek_release_fallback(pool, max_entries);
+                return xsk_tx_peek_release_fallback(pool, nb_pkts);
         }

         xs = list_first_or_null_rcu(&pool->xsk_tx_list, struct xdp_sock, tx_list);
@@ -373,12 +372,7 @@ u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 max_entries)
                 goto out;
         }

-        max_entries = xskq_cons_nb_entries(xs->tx, max_entries);
-        nb_pkts = xskq_cons_read_desc_batch(xs->tx, pool, max_entries);
-        if (!nb_pkts) {
-                xs->tx->queue_empty_descs++;
-                goto out;
-        }
+        nb_pkts = xskq_cons_nb_entries(xs->tx, nb_pkts);

         /* This is the backpressure mechanism for the Tx path. Try to
          * reserve space in the completion queue for all packets, but
@@ -386,12 +380,18 @@ u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 max_entries)
          * packets. This avoids having to implement any buffering in
          * the Tx path.
          */
-        nb_pkts = xskq_prod_reserve_addr_batch(pool->cq, pool->tx_descs, nb_pkts);
+        nb_pkts = xskq_prod_nb_free(pool->cq, nb_pkts);
         if (!nb_pkts)
                 goto out;

-        xskq_cons_release_n(xs->tx, max_entries);
+        nb_pkts = xskq_cons_read_desc_batch(xs->tx, pool, nb_pkts);
+        if (!nb_pkts) {
+                xs->tx->queue_empty_descs++;
+                goto out;
+        }
+
         __xskq_cons_release(xs->tx);
+        xskq_prod_write_addr_batch(pool->cq, pool->tx_descs, nb_pkts);
         xs->sk.sk_write_space(&xs->sk);

 out:
@@ -954,8 +954,8 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
                         goto out_unlock;
                 }

-                err = xp_assign_dev_shared(xs->pool, umem_xs->umem,
-                                           dev, qid);
+                err = xp_assign_dev_shared(xs->pool, umem_xs, dev,
+                                           qid);
                 if (err) {
                         xp_destroy(xs->pool);
                         xs->pool = NULL;
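
Net effect of the reordering above, shown as a condensed, annotated view of the same calls (the annotations are ours): completion-queue space is now claimed before any descriptors are consumed from the Tx ring, so nothing grabbed from the ring can be dropped when the CQ runs short:

nb_pkts = xskq_cons_nb_entries(xs->tx, nb_pkts);            /* what the Tx ring offers */
nb_pkts = xskq_prod_nb_free(pool->cq, nb_pkts);             /* cap by free CQ slots */
nb_pkts = xskq_cons_read_desc_batch(xs->tx, pool, nb_pkts); /* consume only that many */
__xskq_cons_release(xs->tx);
xskq_prod_write_addr_batch(pool->cq, pool->tx_descs, nb_pkts); /* can no longer fail */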


@@ -212,17 +212,18 @@ int xp_assign_dev(struct xsk_buff_pool *pool,
         return err;
 }

-int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_umem *umem,
+int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_sock *umem_xs,
                          struct net_device *dev, u16 queue_id)
 {
         u16 flags;
+        struct xdp_umem *umem = umem_xs->umem;

         /* One fill and completion ring required for each queue id. */
         if (!pool->fq || !pool->cq)
                 return -EINVAL;

         flags = umem->zc ? XDP_ZEROCOPY : XDP_COPY;
-        if (pool->uses_need_wakeup)
+        if (umem_xs->pool->uses_need_wakeup)
                 flags |= XDP_USE_NEED_WAKEUP;

         return xp_assign_dev(pool, dev, queue_id, flags);
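
From userspace, this matters when binding a second socket that shares the first one's umem. A hedged sketch (ours; the function and parameter names are illustrative, the sockaddr_xdp layout is the uapi one from linux/if_xdp.h): after this fix, the pool created for the shared socket inherits the original pool's need_wakeup setting.

#include <sys/socket.h>
#include <linux/if_xdp.h>

static int bind_shared_xsk(int second_xsk_fd, int first_xsk_fd,
                           unsigned int ifindex, unsigned int queue_id)
{
        struct sockaddr_xdp sxdp = {
                .sxdp_family = AF_XDP,
                .sxdp_ifindex = ifindex,
                .sxdp_queue_id = queue_id,
                .sxdp_flags = XDP_SHARED_UMEM,
                .sxdp_shared_umem_fd = first_xsk_fd,
        };

        /* The kernel now carries uses_need_wakeup over from the first
         * socket's pool when setting up the shared binding.
         */
        return bind(second_xsk_fd, (struct sockaddr *)&sxdp, sizeof(sxdp));
}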


@@ -205,6 +205,11 @@ static inline bool xskq_cons_read_desc(struct xsk_queue *q,
         return false;
 }

+static inline void xskq_cons_release_n(struct xsk_queue *q, u32 cnt)
+{
+        q->cached_cons += cnt;
+}
+
 static inline u32 xskq_cons_read_desc_batch(struct xsk_queue *q, struct xsk_buff_pool *pool,
                                             u32 max)
 {
@@ -226,6 +231,8 @@ static inline u32 xskq_cons_read_desc_batch(struct xsk_queue *q, struct xsk_buff_pool *pool,
                 cached_cons++;
         }

+        /* Release valid plus any invalid entries */
+        xskq_cons_release_n(q, cached_cons - q->cached_cons);
         return nb_entries;
 }

@@ -291,11 +298,6 @@ static inline void xskq_cons_release(struct xsk_queue *q)
         q->cached_cons++;
 }

-static inline void xskq_cons_release_n(struct xsk_queue *q, u32 cnt)
-{
-        q->cached_cons += cnt;
-}
-
 static inline u32 xskq_cons_present_entries(struct xsk_queue *q)
 {
         /* No barriers needed since data is not accessed */
@@ -350,21 +352,17 @@ static inline int xskq_prod_reserve_addr(struct xsk_queue *q, u64 addr)
         return 0;
 }

-static inline u32 xskq_prod_reserve_addr_batch(struct xsk_queue *q, struct xdp_desc *descs,
-                                               u32 max)
+static inline void xskq_prod_write_addr_batch(struct xsk_queue *q, struct xdp_desc *descs,
+                                              u32 nb_entries)
 {
         struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
-        u32 nb_entries, i, cached_prod;
-
-        nb_entries = xskq_prod_nb_free(q, max);
+        u32 i, cached_prod;

         /* A, matches D */
         cached_prod = q->cached_prod;
         for (i = 0; i < nb_entries; i++)
                 ring->desc[cached_prod++ & q->ring_mask] = descs[i].addr;
         q->cached_prod = cached_prod;
-
-        return nb_entries;
 }

 static inline int xskq_prod_reserve_desc(struct xsk_queue *q,


@@ -136,8 +136,8 @@ static int show_bpf_prog(int id, enum bpf_attach_type attach_type,
                 jsonw_string_field(json_wtr, "attach_type", attach_type_str);
         else
                 jsonw_uint_field(json_wtr, "attach_type", attach_type);
-        jsonw_string_field(json_wtr, "attach_flags",
-                           attach_flags_str);
+        if (!(query_flags & BPF_F_QUERY_EFFECTIVE))
+                jsonw_string_field(json_wtr, "attach_flags", attach_flags_str);
         jsonw_string_field(json_wtr, "name", prog_name);
         if (attach_btf_name)
                 jsonw_string_field(json_wtr, "attach_btf_name", attach_btf_name);
@@ -150,7 +150,10 @@ static int show_bpf_prog(int id, enum bpf_attach_type attach_type,
                         printf("%-15s", attach_type_str);
                 else
                         printf("type %-10u", attach_type);
-                printf(" %-15s %-15s", attach_flags_str, prog_name);
+                if (query_flags & BPF_F_QUERY_EFFECTIVE)
+                        printf(" %-15s", prog_name);
+                else
+                        printf(" %-15s %-15s", attach_flags_str, prog_name);
                 if (attach_btf_name)
                         printf(" %-15s", attach_btf_name);
                 else if (info.attach_btf_id)
@@ -195,6 +198,32 @@ static int cgroup_has_attached_progs(int cgroup_fd)
         return no_prog ? 0 : 1;
 }

+static int show_effective_bpf_progs(int cgroup_fd, enum bpf_attach_type type,
+                                    int level)
+{
+        LIBBPF_OPTS(bpf_prog_query_opts, p);
+        __u32 prog_ids[1024] = {0};
+        __u32 iter;
+        int ret;
+
+        p.query_flags = query_flags;
+        p.prog_cnt = ARRAY_SIZE(prog_ids);
+        p.prog_ids = prog_ids;
+
+        ret = bpf_prog_query_opts(cgroup_fd, type, &p);
+        if (ret)
+                return ret;
+
+        if (p.prog_cnt == 0)
+                return 0;
+
+        for (iter = 0; iter < p.prog_cnt; iter++)
+                show_bpf_prog(prog_ids[iter], type, NULL, level);
+
+        return 0;
+}
+
 static int show_attached_bpf_progs(int cgroup_fd, enum bpf_attach_type type,
                                    int level)
 {
@@ -245,6 +274,14 @@ static int show_attached_bpf_progs(int cgroup_fd, enum bpf_attach_type type,
         return 0;
 }

+static int show_bpf_progs(int cgroup_fd, enum bpf_attach_type type,
+                          int level)
+{
+        return query_flags & BPF_F_QUERY_EFFECTIVE ?
+               show_effective_bpf_progs(cgroup_fd, type, level) :
+               show_attached_bpf_progs(cgroup_fd, type, level);
+}
+
 static int do_show(int argc, char **argv)
 {
         enum bpf_attach_type type;
@@ -292,6 +329,8 @@ static int do_show(int argc, char **argv)

         if (json_output)
                 jsonw_start_array(json_wtr);
+        else if (query_flags & BPF_F_QUERY_EFFECTIVE)
+                printf("%-8s %-15s %-15s\n", "ID", "AttachType", "Name");
         else
                 printf("%-8s %-15s %-15s %-15s\n", "ID", "AttachType",
                        "AttachFlags", "Name");
@@ -304,7 +343,7 @@ static int do_show(int argc, char **argv)
                  * If we were able to get the show for at least one
                  * attach type, let's return 0.
                  */
-                if (show_attached_bpf_progs(cgroup_fd, type, 0) == 0)
+                if (show_bpf_progs(cgroup_fd, type, 0) == 0)
                         ret = 0;
         }
@@ -362,7 +401,7 @@ static int do_show_tree_fn(const char *fpath, const struct stat *sb,
         btf_vmlinux = libbpf_find_kernel_btf();

         for (type = 0; type < __MAX_BPF_ATTACH_TYPE; type++)
-                show_attached_bpf_progs(cgroup_fd, type, ftw->level);
+                show_bpf_progs(cgroup_fd, type, ftw->level);

         if (errno == EINVAL)
                 /* Last attach type does not support query.
@@ -436,6 +475,11 @@ static int do_show_tree(int argc, char **argv)

         if (json_output)
                 jsonw_start_array(json_wtr);
+        else if (query_flags & BPF_F_QUERY_EFFECTIVE)
+                printf("%s\n"
+                       "%-8s %-15s %-15s\n",
+                       "CgroupPath",
+                       "ID", "AttachType", "Name");
         else
                 printf("%s\n"
                        "%-8s %-15s %-15s %-15s\n",


@@ -1233,7 +1233,7 @@ enum {
 /* Query effective (directly attached + inherited from ancestor cgroups)
  * programs that will be executed for events within a cgroup.
- * attach_flags with this flag are returned only for directly attached programs.
+ * attach_flags with this flag are always returned 0.
  */
 #define BPF_F_QUERY_EFFECTIVE   (1U << 0)

@@ -1432,7 +1432,10 @@ union bpf_attr {
                 __u32           attach_flags;
                 __aligned_u64   prog_ids;
                 __u32           prog_cnt;
-                __aligned_u64   prog_attach_flags; /* output: per-program attach_flags */
+                /* output: per-program attach_flags.
+                 * not allowed to be set during effective query.
+                 */
+                __aligned_u64   prog_attach_flags;
         } query;

         struct { /* anonymous struct used by BPF_RAW_TRACEPOINT_OPEN command */


@@ -71,10 +71,9 @@ void serial_test_cgroup_link(void)

         ping_and_check(cg_nr, 0);

-        /* query the number of effective progs and attach flags in root cg */
+        /* query the number of attached progs and attach flags in root cg */
         err = bpf_prog_query(cgs[0].fd, BPF_CGROUP_INET_EGRESS,
-                             BPF_F_QUERY_EFFECTIVE, &attach_flags, NULL,
-                             &prog_cnt);
+                             0, &attach_flags, NULL, &prog_cnt);
         CHECK_FAIL(err);
         CHECK_FAIL(attach_flags != BPF_F_ALLOW_MULTI);
         if (CHECK(prog_cnt != 1, "effect_cnt", "exp %d, got %d\n", 1, prog_cnt))
@@ -85,17 +84,15 @@ void serial_test_cgroup_link(void)
                              BPF_F_QUERY_EFFECTIVE, NULL, NULL,
                              &prog_cnt);
         CHECK_FAIL(err);
-        CHECK_FAIL(attach_flags != BPF_F_ALLOW_MULTI);
         if (CHECK(prog_cnt != cg_nr, "effect_cnt", "exp %d, got %d\n",
                   cg_nr, prog_cnt))
                 goto cleanup;

         /* query the effective prog IDs in last cg */
         err = bpf_prog_query(cgs[last_cg].fd, BPF_CGROUP_INET_EGRESS,
-                             BPF_F_QUERY_EFFECTIVE, &attach_flags,
-                             prog_ids, &prog_cnt);
+                             BPF_F_QUERY_EFFECTIVE, NULL, prog_ids,
+                             &prog_cnt);
         CHECK_FAIL(err);
-        CHECK_FAIL(attach_flags != BPF_F_ALLOW_MULTI);
         if (CHECK(prog_cnt != cg_nr, "effect_cnt", "exp %d, got %d\n",
                   cg_nr, prog_cnt))
                 goto cleanup;


@@ -1606,6 +1606,8 @@ static struct ifobject *ifobject_create(void)
         if (!ifobj->umem)
                 goto out_umem;

+        ifobj->ns_fd = -1;
+
         return ifobj;

 out_umem:
@@ -1617,6 +1619,8 @@ static struct ifobject *ifobject_create(void)

 static void ifobject_delete(struct ifobject *ifobj)
 {
+        if (ifobj->ns_fd != -1)
+                close(ifobj->ns_fd);
         free(ifobj->umem);
         free(ifobj->xsk_arr);
         free(ifobj);