Merge branch 'allow-bpf_refcount_acquire-of-mapval-obtained-via-direct-ld'

Dave Marchevsky says:

====================
Allow bpf_refcount_acquire of mapval obtained via direct LD

Consider this BPF program:

  struct cgv_node {
    int d;
    struct bpf_refcount r;
    struct bpf_rb_node rb;
  };

  struct val_stash {
    struct cgv_node __kptr *v;
  };

  struct {
    __uint(type, BPF_MAP_TYPE_ARRAY);
    __type(key, int);
    __type(value, struct val_stash);
    __uint(max_entries, 10);
  } array_map SEC(".maps");

  long bpf_program(void *ctx)
  {
    struct val_stash *mapval;
    struct cgv_node *p;
    int idx = 0;

    mapval = bpf_map_lookup_elem(&array_map, &idx);
    if (!mapval || !mapval->v) { /* omitted */ }

    p = bpf_refcount_acquire(mapval->v); /* Verification FAILs here */

    /* Add p to some tree */
    return 0;
  }

Verification fails on the refcount_acquire:

  160: (79) r1 = *(u64 *)(r8 +8)        ; R1_w=untrusted_ptr_or_null_cgv_node(id=11,off=0,imm=0) R8_w=map_value(id=10,off=0,ks=8,vs=16,imm=0) refs=6
  161: (b7) r2 = 0                      ; R2_w=0 refs=6
  162: (85) call bpf_refcount_acquire_impl#117824
  arg#0 is neither owning or non-owning ref

The above verifier dump is actually from sched_ext's scx_flatcg [0],
which is the motivating use case for this series' changes. Specifically,
scx_flatcg stashes an rb_node type w/ cgroup-specific info (struct
cgv_node) in a map when the cgroup is created, then later puts that
cgroup's node in an rbtree in .enqueue (see the sketch below). Making
struct cgv_node refcounted would simplify the code a bit by allowing us
to remove the kptr_xchg's, but "later puts that cgroup's node in an
rbtree" is not possible without a refcount_acquire, which suffers from
the above verification failure.

If we get rid of the PTR_UNTRUSTED flag and instead add MEM_ALLOC |
NON_OWN_REF, mapval->v becomes a non-owning ref and verification
succeeds. This is safe to do because of the most recent set of refcount
changes [1], which modified bpf_obj_drop behavior so that a refcounted
graph node's underlying memory is not reused until after an RCU grace
period. Once mapval->v has the correct flags it _is_ a non-owning
reference, and the motivating example verifies successfully; a sketch
of the resulting pattern follows the links below.

  [0]: 52911e1040/tools/sched_ext/scx_flatcg.bpf.c (L275)
  [1]: https://lore.kernel.org/bpf/20230821193311.3290257-1-davemarchevsky@fb.com/
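
As a sketch of the pattern this change enables (again with hypothetical
groot / glock / cgv_node_less names, continuing the example program
above), the stashed node can now be acquired and added to the rbtree
without ever being kptr_xchg'd out of the map:

  p = bpf_refcount_acquire(mapval->v); /* mapval->v is a non-owning ref */
  if (!p)
    return 0;

  bpf_spin_lock(&glock);
  bpf_rbtree_add(&groot, &p->rb, cgv_node_less);
  bpf_spin_unlock(&glock);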

Summary of patches:
  * Patch 1 fixes an issue with bpf_refcount_acquire verification
    letting MAYBE_NULL ptrs through
    * Patch 2 tests Patch 1's fix
  * Patch 3 broadens the use of "free only after RCU GP" to all
    user-allocated types
  * Patch 4 is a small nonfunctional refactoring
  * Patch 5 changes verifier to mark direct LD of stashed graph node
    kptr as non-owning ref
    * Patch 6 tests Patch 5's verifier changes

Changelog:

v1 -> v2: https://lore.kernel.org/bpf/20231025214007.2920506-1-davemarchevsky@fb.com/

Series title changed to "Allow bpf_refcount_acquire of mapval obtained via
direct LD". V1's title was mistakenly truncated.

  * Patch 5 ("bpf: Mark direct ld of stashed bpf_{rb,list}_node as non-owning ref")
    * Direct LD of percpu kptr should not have MEM_ALLOC flag (Yonghong)
  * Patch 6 ("selftests/bpf: Test bpf_refcount_acquire of node obtained via direct ld")
    * Test read from stashed value as well
====================

Link: https://lore.kernel.org/r/20231107085639.3016113-1-davemarchevsky@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
commit 3f6d04d742
Alexei Starovoitov, 2023-11-09 10:45:45 -08:00
7 changed files with 162 additions and 19 deletions


@@ -186,8 +186,8 @@ enum btf_field_type {
BPF_LIST_NODE = (1 << 6),
BPF_RB_ROOT = (1 << 7),
BPF_RB_NODE = (1 << 8),
BPF_GRAPH_NODE_OR_ROOT = BPF_LIST_NODE | BPF_LIST_HEAD |
BPF_RB_NODE | BPF_RB_ROOT,
BPF_GRAPH_NODE = BPF_RB_NODE | BPF_LIST_NODE,
BPF_GRAPH_ROOT = BPF_RB_ROOT | BPF_LIST_HEAD,
BPF_REFCOUNT = (1 << 9),
};


@@ -3840,9 +3840,6 @@ end:
return ERR_PTR(ret);
}
#define GRAPH_ROOT_MASK (BPF_LIST_HEAD | BPF_RB_ROOT)
#define GRAPH_NODE_MASK (BPF_LIST_NODE | BPF_RB_NODE)
int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec)
{
int i;
@@ -3855,13 +3852,13 @@ int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec)
* Hence we only need to ensure that bpf_{list_head,rb_root} ownership
* does not form cycles.
*/
if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & GRAPH_ROOT_MASK))
if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & BPF_GRAPH_ROOT))
return 0;
for (i = 0; i < rec->cnt; i++) {
struct btf_struct_meta *meta;
u32 btf_id;
if (!(rec->fields[i].type & GRAPH_ROOT_MASK))
if (!(rec->fields[i].type & BPF_GRAPH_ROOT))
continue;
btf_id = rec->fields[i].graph_root.value_btf_id;
meta = btf_find_struct_meta(btf, btf_id);
@@ -3873,7 +3870,7 @@ int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec)
* to check ownership cycle for a type unless it's also a
* node type.
*/
if (!(rec->field_mask & GRAPH_NODE_MASK))
if (!(rec->field_mask & BPF_GRAPH_NODE))
continue;
/* We need to ensure ownership acyclicity among all types. The
@@ -3909,7 +3906,7 @@ int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec)
* - A is both an root and node.
* - B is only an node.
*/
if (meta->record->field_mask & GRAPH_ROOT_MASK)
if (meta->record->field_mask & BPF_GRAPH_ROOT)
return -ELOOP;
}
return 0;


@@ -1937,10 +1937,7 @@ void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu)
ma = &bpf_global_percpu_ma;
else
ma = &bpf_global_ma;
if (rec && rec->refcount_off >= 0)
bpf_mem_free_rcu(ma, p);
else
bpf_mem_free(ma, p);
bpf_mem_free_rcu(ma, p);
}
__bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign)
@@ -2520,7 +2517,7 @@ BTF_ID_FLAGS(func, bpf_obj_new_impl, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_percpu_obj_new_impl, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_obj_drop_impl, KF_RELEASE)
BTF_ID_FLAGS(func, bpf_percpu_obj_drop_impl, KF_RELEASE)
BTF_ID_FLAGS(func, bpf_refcount_acquire_impl, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_refcount_acquire_impl, KF_ACQUIRE | KF_RET_NULL | KF_RCU)
BTF_ID_FLAGS(func, bpf_list_push_front_impl)
BTF_ID_FLAGS(func, bpf_list_push_back_impl)
BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL)


@@ -5557,10 +5557,23 @@ BTF_SET_END(rcu_protected_types)
static bool rcu_protected_object(const struct btf *btf, u32 btf_id)
{
if (!btf_is_kernel(btf))
return false;
return true;
return btf_id_set_contains(&rcu_protected_types, btf_id);
}
static struct btf_record *kptr_pointee_btf_record(struct btf_field *kptr_field)
{
struct btf_struct_meta *meta;
if (btf_is_kernel(kptr_field->kptr.btf))
return NULL;
meta = btf_find_struct_meta(kptr_field->kptr.btf,
kptr_field->kptr.btf_id);
return meta ? meta->record : NULL;
}
static bool rcu_safe_kptr(const struct btf_field *field)
{
const struct btf_field_kptr *kptr = &field->kptr;
@@ -5571,12 +5584,25 @@ static bool rcu_safe_kptr(const struct btf_field *field)
static u32 btf_ld_kptr_type(struct bpf_verifier_env *env, struct btf_field *kptr_field)
{
struct btf_record *rec;
u32 ret;
ret = PTR_MAYBE_NULL;
if (rcu_safe_kptr(kptr_field) && in_rcu_cs(env)) {
if (kptr_field->type != BPF_KPTR_PERCPU)
return PTR_MAYBE_NULL | MEM_RCU;
return PTR_MAYBE_NULL | MEM_RCU | MEM_PERCPU;
ret |= MEM_RCU;
if (kptr_field->type == BPF_KPTR_PERCPU)
ret |= MEM_PERCPU;
else if (!btf_is_kernel(kptr_field->kptr.btf))
ret |= MEM_ALLOC;
rec = kptr_pointee_btf_record(kptr_field);
if (rec && btf_record_has_field(rec, BPF_GRAPH_NODE))
ret |= NON_OWN_REF;
} else {
ret |= PTR_UNTRUSTED;
}
return PTR_MAYBE_NULL | PTR_UNTRUSTED;
return ret;
}
static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,


@@ -73,6 +73,37 @@ static void test_local_kptr_stash_unstash(void)
local_kptr_stash__destroy(skel);
}
static void test_refcount_acquire_without_unstash(void)
{
LIBBPF_OPTS(bpf_test_run_opts, opts,
.data_in = &pkt_v4,
.data_size_in = sizeof(pkt_v4),
.repeat = 1,
);
struct local_kptr_stash *skel;
int ret;
skel = local_kptr_stash__open_and_load();
if (!ASSERT_OK_PTR(skel, "local_kptr_stash__open_and_load"))
return;
ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.refcount_acquire_without_unstash),
&opts);
ASSERT_OK(ret, "refcount_acquire_without_unstash run");
ASSERT_EQ(opts.retval, 2, "refcount_acquire_without_unstash retval");
ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.stash_refcounted_node), &opts);
ASSERT_OK(ret, "stash_refcounted_node run");
ASSERT_OK(opts.retval, "stash_refcounted_node retval");
ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.refcount_acquire_without_unstash),
&opts);
ASSERT_OK(ret, "refcount_acquire_without_unstash (2) run");
ASSERT_EQ(opts.retval, 42, "refcount_acquire_without_unstash (2) retval");
local_kptr_stash__destroy(skel);
}
static void test_local_kptr_stash_fail(void)
{
RUN_TESTS(local_kptr_stash_fail);
@@ -86,6 +117,8 @@ void test_local_kptr_stash(void)
test_local_kptr_stash_plain();
if (test__start_subtest("local_kptr_stash_unstash"))
test_local_kptr_stash_unstash();
if (test__start_subtest("refcount_acquire_without_unstash"))
test_refcount_acquire_without_unstash();
if (test__start_subtest("local_kptr_stash_fail"))
test_local_kptr_stash_fail();
}


@@ -14,6 +14,24 @@ struct node_data {
struct bpf_rb_node node;
};
struct refcounted_node {
long data;
struct bpf_rb_node rb_node;
struct bpf_refcount refcount;
};
struct stash {
struct bpf_spin_lock l;
struct refcounted_node __kptr *stashed;
};
struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__type(key, int);
__type(value, struct stash);
__uint(max_entries, 10);
} refcounted_node_stash SEC(".maps");
struct plain_local {
long key;
long data;
@@ -38,6 +56,7 @@ struct map_value {
* Had to do the same w/ bpf_kfunc_call_test_release below
*/
struct node_data *just_here_because_btf_bug;
struct refcounted_node *just_here_because_btf_bug2;
struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
@@ -132,4 +151,56 @@ long stash_test_ref_kfunc(void *ctx)
return 0;
}
SEC("tc")
long refcount_acquire_without_unstash(void *ctx)
{
struct refcounted_node *p;
struct stash *s;
int ret = 0;
s = bpf_map_lookup_elem(&refcounted_node_stash, &ret);
if (!s)
return 1;
if (!s->stashed)
/* refcount_acquire failure is expected when no refcounted_node
* has been stashed before this program executes
*/
return 2;
p = bpf_refcount_acquire(s->stashed);
if (!p)
return 3;
ret = s->stashed ? s->stashed->data : -1;
bpf_obj_drop(p);
return ret;
}
/* Helper for refcount_acquire_without_unstash test */
SEC("tc")
long stash_refcounted_node(void *ctx)
{
struct refcounted_node *p;
struct stash *s;
int key = 0;
s = bpf_map_lookup_elem(&refcounted_node_stash, &key);
if (!s)
return 1;
p = bpf_obj_new(typeof(*p));
if (!p)
return 2;
p->data = 42;
p = bpf_kptr_xchg(&s->stashed, p);
if (p) {
bpf_obj_drop(p);
return 3;
}
return 0;
}
char _license[] SEC("license") = "GPL";


@@ -53,6 +53,25 @@ long rbtree_refcounted_node_ref_escapes(void *ctx)
return 0;
}
SEC("?tc")
__failure __msg("Possibly NULL pointer passed to trusted arg0")
long refcount_acquire_maybe_null(void *ctx)
{
struct node_acquire *n, *m;
n = bpf_obj_new(typeof(*n));
/* Intentionally not testing !n
* it's MAYBE_NULL for refcount_acquire
*/
m = bpf_refcount_acquire(n);
if (m)
bpf_obj_drop(m);
if (n)
bpf_obj_drop(n);
return 0;
}
SEC("?tc")
__failure __msg("Unreleased reference id=3 alloc_insn=9")
long rbtree_refcounted_node_ref_escapes_owning_input(void *ctx)