Merge branch 'bpf-fixes-for-per-cpu-kptr'

Hou Tao says:

====================
bpf: Fixes for per-cpu kptr

From: Hou Tao <houtao1@huawei.com>

Hi,

The patchset aims to fix the problems found during the review of the
per-cpu kptr patch set [0]. Patch #1 moves the acquisition of pcpu_lock
to after the invocation of pcpu_chunk_addr_search(); it is a
micro-optimization for free_percpu() and is included here because the
same logic is used by the newly-added API pcpu_alloc_size(). Patch #2
introduces pcpu_alloc_size() for dynamic per-cpu areas. Patches #3 and
#4 use pcpu_alloc_size() to check whether unit_size matches the size of
the underlying per-cpu area and to select a matching bpf_mem_cache.
Patch #5 fixes the freeing of per-cpu kptrs when they are freed during
map destruction. The last patch adds test cases for these problems.
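
In short, patches #3 and #4 make the freeing path derive the cache
index from the real size of the allocation being freed. Abridged from
the bpf_mem_alloc changes below (for a per-cpu object, the pointer
passed to bpf_mem_free() refers to the slot that stores the per-cpu
pointer, which sits right after the llist_node):

  static notrace int bpf_mem_free_idx(void *ptr, bool percpu)
  {
          size_t size;

          if (percpu)
                  /* real size of the dynamic per-cpu area */
                  size = pcpu_alloc_size(*((void **)ptr));
          else
                  size = ksize(ptr - LLIST_NODE_SZ);
          return bpf_mem_cache_idx(size);
  }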

Please see individual patches for details. And comments are always
welcome.

Change Log:
v3:
 * rebased on bpf-next
 * patch 2: update API document to note that pcpu_alloc_size() doesn't
            support statically allocated per-cpu area. (Dennis)
 * patch 1 & 2: add Acked-by from Dennis

v2: https://lore.kernel.org/bpf/20231018113343.2446300-1-houtao@huaweicloud.com/
  * add a new patch "don't acquire pcpu_lock for pcpu_chunk_addr_search()"
  * patch 2: change type of bit_off and end to unsigned long (Andrew)
  * patch 2: rename the new API to pcpu_alloc_size() and follow the 80-column convention (Dennis)
  * patch 5: move the common declaration into bpf.h (Stanislav, Alexei)

v1: https://lore.kernel.org/bpf/20231007135106.3031284-1-houtao@huaweicloud.com/

[0]: https://lore.kernel.org/bpf/20230827152729.1995219-1-yonghong.song@linux.dev
====================

Link: https://lore.kernel.org/r/20231020133202.4043247-1-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Alexei Starovoitov 2023-10-20 10:12:55 -07:00
commit cf559a416f
9 changed files with 271 additions and 37 deletions


@@ -2058,6 +2058,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec);
bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b);
void bpf_obj_free_timer(const struct btf_record *rec, void *obj);
void bpf_obj_free_fields(const struct btf_record *rec, void *obj);
void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu);
struct bpf_map *bpf_map_get(u32 ufd);
struct bpf_map *bpf_map_get_with_uref(u32 ufd);


@@ -11,6 +11,7 @@ struct bpf_mem_caches;
struct bpf_mem_alloc {
struct bpf_mem_caches __percpu *caches;
struct bpf_mem_cache __percpu *cache;
bool percpu;
struct work_struct work;
};


@@ -132,6 +132,7 @@ extern void __init setup_per_cpu_areas(void);
extern void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp) __alloc_size(1);
extern void __percpu *__alloc_percpu(size_t size, size_t align) __alloc_size(1);
extern void free_percpu(void __percpu *__pdata);
extern size_t pcpu_alloc_size(void __percpu *__pdata);
DEFINE_FREE(free_percpu, void __percpu *, free_percpu(_T))


@@ -1811,8 +1811,6 @@ bpf_base_func_proto(enum bpf_func_id func_id)
}
}
void __bpf_obj_drop_impl(void *p, const struct btf_record *rec);
void bpf_list_head_free(const struct btf_field *field, void *list_head,
struct bpf_spin_lock *spin_lock)
{
@@ -1844,7 +1842,7 @@ unlock:
* bpf_list_head which needs to be freed.
*/
migrate_disable();
__bpf_obj_drop_impl(obj, field->graph_root.value_rec);
__bpf_obj_drop_impl(obj, field->graph_root.value_rec, false);
migrate_enable();
}
}
@@ -1883,7 +1881,7 @@ void bpf_rb_root_free(const struct btf_field *field, void *rb_root,
migrate_disable();
__bpf_obj_drop_impl(obj, field->graph_root.value_rec);
__bpf_obj_drop_impl(obj, field->graph_root.value_rec, false);
migrate_enable();
}
}
@@ -1915,8 +1913,10 @@ __bpf_kfunc void *bpf_percpu_obj_new_impl(u64 local_type_id__k, void *meta__ign)
}
/* Must be called under migrate_disable(), as required by bpf_mem_free */
void __bpf_obj_drop_impl(void *p, const struct btf_record *rec)
void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu)
{
struct bpf_mem_alloc *ma;
if (rec && rec->refcount_off >= 0 &&
!refcount_dec_and_test((refcount_t *)(p + rec->refcount_off))) {
/* Object is refcounted and refcount_dec didn't result in 0
@@ -1928,10 +1928,14 @@ void __bpf_obj_drop_impl(void *p, const struct btf_record *rec)
if (rec)
bpf_obj_free_fields(rec, p);
if (rec && rec->refcount_off >= 0)
bpf_mem_free_rcu(&bpf_global_ma, p);
if (percpu)
ma = &bpf_global_percpu_ma;
else
bpf_mem_free(&bpf_global_ma, p);
ma = &bpf_global_ma;
if (rec && rec->refcount_off >= 0)
bpf_mem_free_rcu(ma, p);
else
bpf_mem_free(ma, p);
}
__bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign)
@@ -1939,7 +1943,7 @@ __bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign)
struct btf_struct_meta *meta = meta__ign;
void *p = p__alloc;
__bpf_obj_drop_impl(p, meta ? meta->record : NULL);
__bpf_obj_drop_impl(p, meta ? meta->record : NULL, false);
}
__bpf_kfunc void bpf_percpu_obj_drop_impl(void *p__alloc, void *meta__ign)
@@ -1983,7 +1987,7 @@ static int __bpf_list_add(struct bpf_list_node_kern *node,
*/
if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) {
/* Only called from BPF prog, no need to migrate_disable */
__bpf_obj_drop_impl((void *)n - off, rec);
__bpf_obj_drop_impl((void *)n - off, rec, false);
return -EINVAL;
}
@@ -2082,7 +2086,7 @@ static int __bpf_rbtree_add(struct bpf_rb_root *root,
*/
if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) {
/* Only called from BPF prog, no need to migrate_disable */
__bpf_obj_drop_impl((void *)n - off, rec);
__bpf_obj_drop_impl((void *)n - off, rec, false);
return -EINVAL;
}


@@ -491,21 +491,17 @@ static int check_obj_size(struct bpf_mem_cache *c, unsigned int idx)
struct llist_node *first;
unsigned int obj_size;
/* For per-cpu allocator, the size of free objects in free list doesn't
* match with unit_size and now there is no way to get the size of
* per-cpu pointer saved in free object, so just skip the checking.
*/
if (c->percpu_size)
return 0;
first = c->free_llist.first;
if (!first)
return 0;
obj_size = ksize(first);
if (c->percpu_size)
obj_size = pcpu_alloc_size(((void **)first)[1]);
else
obj_size = ksize(first);
if (obj_size != c->unit_size) {
WARN_ONCE(1, "bpf_mem_cache[%u]: unexpected object size %u, expect %u\n",
idx, obj_size, c->unit_size);
WARN_ONCE(1, "bpf_mem_cache[%u]: percpu %d, unexpected object size %u, expect %u\n",
idx, c->percpu_size, obj_size, c->unit_size);
return -EINVAL;
}
return 0;
@@ -529,6 +525,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
/* room for llist_node and per-cpu pointer */
if (percpu)
percpu_size = LLIST_NODE_SZ + sizeof(void *);
ma->percpu = percpu;
if (size) {
pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL);
@@ -878,6 +875,17 @@ void notrace *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size)
return !ret ? NULL : ret + LLIST_NODE_SZ;
}
static notrace int bpf_mem_free_idx(void *ptr, bool percpu)
{
size_t size;
if (percpu)
size = pcpu_alloc_size(*((void **)ptr));
else
size = ksize(ptr - LLIST_NODE_SZ);
return bpf_mem_cache_idx(size);
}
void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr)
{
int idx;
@@ -885,7 +893,7 @@ void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr)
if (!ptr)
return;
idx = bpf_mem_cache_idx(ksize(ptr - LLIST_NODE_SZ));
idx = bpf_mem_free_idx(ptr, ma->percpu);
if (idx < 0)
return;
@@ -899,7 +907,7 @@ void notrace bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr)
if (!ptr)
return;
idx = bpf_mem_cache_idx(ksize(ptr - LLIST_NODE_SZ));
idx = bpf_mem_free_idx(ptr, ma->percpu);
if (idx < 0)
return;
@@ -973,6 +981,12 @@ void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags)
return !ret ? NULL : ret + LLIST_NODE_SZ;
}
/* The alignment of dynamic per-cpu area is 8, so c->unit_size and the
* actual size of dynamic per-cpu area will always be matched and there is
* no need to adjust size_index for per-cpu allocation. However for the
* simplicity of the implementation, use an unified size_index for both
* kmalloc and per-cpu allocation.
*/
static __init int bpf_mem_cache_adjust_size(void)
{
unsigned int size;


@@ -626,8 +626,6 @@ void bpf_obj_free_timer(const struct btf_record *rec, void *obj)
bpf_timer_cancel_and_free(obj + rec->timer_off);
}
extern void __bpf_obj_drop_impl(void *p, const struct btf_record *rec);
void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
{
const struct btf_field *fields;
@@ -662,8 +660,8 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
field->kptr.btf_id);
migrate_disable();
__bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ?
pointee_struct_meta->record :
NULL);
pointee_struct_meta->record : NULL,
fields[i].type == BPF_KPTR_PERCPU);
migrate_enable();
} else {
field->kptr.dtor(xchgd_field);


@@ -2244,6 +2244,37 @@ static void pcpu_balance_workfn(struct work_struct *work)
mutex_unlock(&pcpu_alloc_mutex);
}
/**
* pcpu_alloc_size - the size of the dynamic percpu area
* @ptr: pointer to the dynamic percpu area
*
* Returns the size of the @ptr allocation. This is undefined for statically
* defined percpu variables as there is no corresponding chunk->bound_map.
*
* RETURNS:
* The size of the dynamic percpu area.
*
* CONTEXT:
* Can be called from atomic context.
*/
size_t pcpu_alloc_size(void __percpu *ptr)
{
struct pcpu_chunk *chunk;
unsigned long bit_off, end;
void *addr;
if (!ptr)
return 0;
addr = __pcpu_ptr_to_addr(ptr);
/* No pcpu_lock here: ptr has not been freed, so chunk is still alive */
chunk = pcpu_chunk_addr_search(addr);
bit_off = (addr - chunk->base_addr) / PCPU_MIN_ALLOC_SIZE;
end = find_next_bit(chunk->bound_map, pcpu_chunk_map_bits(chunk),
bit_off + 1);
return (end - bit_off) * PCPU_MIN_ALLOC_SIZE;
}
/**
* free_percpu - free percpu area
* @ptr: pointer to area to free
@@ -2267,12 +2298,10 @@ void free_percpu(void __percpu *ptr)
kmemleak_free_percpu(ptr);
addr = __pcpu_ptr_to_addr(ptr);
spin_lock_irqsave(&pcpu_lock, flags);
chunk = pcpu_chunk_addr_search(addr);
off = addr - chunk->base_addr;
spin_lock_irqsave(&pcpu_lock, flags);
size = pcpu_free_area(chunk, off);
pcpu_memcg_free_hook(chunk, off, size);


@@ -9,9 +9,10 @@
#include "test_bpf_ma.skel.h"
void test_test_bpf_ma(void)
static void do_bpf_ma_test(const char *name)
{
struct test_bpf_ma *skel;
struct bpf_program *prog;
struct btf *btf;
int i, err;
@@ -34,6 +35,11 @@ void test_test_bpf_ma(void)
skel->rodata->data_btf_ids[i] = id;
}
prog = bpf_object__find_program_by_name(skel->obj, name);
if (!ASSERT_OK_PTR(prog, "invalid prog name"))
goto out;
bpf_program__set_autoload(prog, true);
err = test_bpf_ma__load(skel);
if (!ASSERT_OK(err, "load"))
goto out;
@@ -48,3 +54,15 @@ void test_test_bpf_ma(void)
out:
test_bpf_ma__destroy(skel);
}
void test_test_bpf_ma(void)
{
if (test__start_subtest("batch_alloc_free"))
do_bpf_ma_test("test_batch_alloc_free");
if (test__start_subtest("free_through_map_free"))
do_bpf_ma_test("test_free_through_map_free");
if (test__start_subtest("batch_percpu_alloc_free"))
do_bpf_ma_test("test_batch_percpu_alloc_free");
if (test__start_subtest("percpu_free_through_map_free"))
do_bpf_ma_test("test_percpu_free_through_map_free");
}


@@ -37,10 +37,20 @@ int pid = 0;
__type(key, int); \
__type(value, struct map_value_##_size); \
__uint(max_entries, 128); \
} array_##_size SEC(".maps");
} array_##_size SEC(".maps")
static __always_inline void batch_alloc_free(struct bpf_map *map, unsigned int batch,
unsigned int idx)
#define DEFINE_ARRAY_WITH_PERCPU_KPTR(_size) \
struct map_value_percpu_##_size { \
struct bin_data_##_size __percpu_kptr * data; \
}; \
struct { \
__uint(type, BPF_MAP_TYPE_ARRAY); \
__type(key, int); \
__type(value, struct map_value_percpu_##_size); \
__uint(max_entries, 128); \
} array_percpu_##_size SEC(".maps")
static __always_inline void batch_alloc(struct bpf_map *map, unsigned int batch, unsigned int idx)
{
struct generic_map_value *value;
unsigned int i, key;
@@ -65,6 +75,14 @@ static __always_inline void batch_alloc_free(struct bpf_map *map, unsigned int b
return;
}
}
}
static __always_inline void batch_free(struct bpf_map *map, unsigned int batch, unsigned int idx)
{
struct generic_map_value *value;
unsigned int i, key;
void *old;
for (i = 0; i < batch; i++) {
key = i;
value = bpf_map_lookup_elem(map, &key);
@@ -81,8 +99,72 @@ static __always_inline void batch_alloc_free(struct bpf_map *map, unsigned int b
}
}
static __always_inline void batch_percpu_alloc(struct bpf_map *map, unsigned int batch,
unsigned int idx)
{
struct generic_map_value *value;
unsigned int i, key;
void *old, *new;
for (i = 0; i < batch; i++) {
key = i;
value = bpf_map_lookup_elem(map, &key);
if (!value) {
err = 1;
return;
}
/* per-cpu allocator may not be able to refill in time */
new = bpf_percpu_obj_new_impl(data_btf_ids[idx], NULL);
if (!new)
continue;
old = bpf_kptr_xchg(&value->data, new);
if (old) {
bpf_percpu_obj_drop(old);
err = 2;
return;
}
}
}
static __always_inline void batch_percpu_free(struct bpf_map *map, unsigned int batch,
unsigned int idx)
{
struct generic_map_value *value;
unsigned int i, key;
void *old;
for (i = 0; i < batch; i++) {
key = i;
value = bpf_map_lookup_elem(map, &key);
if (!value) {
err = 3;
return;
}
old = bpf_kptr_xchg(&value->data, NULL);
if (!old)
continue;
bpf_percpu_obj_drop(old);
}
}
#define CALL_BATCH_ALLOC(size, batch, idx) \
batch_alloc((struct bpf_map *)(&array_##size), batch, idx)
#define CALL_BATCH_ALLOC_FREE(size, batch, idx) \
batch_alloc_free((struct bpf_map *)(&array_##size), batch, idx)
do { \
batch_alloc((struct bpf_map *)(&array_##size), batch, idx); \
batch_free((struct bpf_map *)(&array_##size), batch, idx); \
} while (0)
#define CALL_BATCH_PERCPU_ALLOC(size, batch, idx) \
batch_percpu_alloc((struct bpf_map *)(&array_percpu_##size), batch, idx)
#define CALL_BATCH_PERCPU_ALLOC_FREE(size, batch, idx) \
do { \
batch_percpu_alloc((struct bpf_map *)(&array_percpu_##size), batch, idx); \
batch_percpu_free((struct bpf_map *)(&array_percpu_##size), batch, idx); \
} while (0)
DEFINE_ARRAY_WITH_KPTR(8);
DEFINE_ARRAY_WITH_KPTR(16);
@@ -97,8 +179,21 @@ DEFINE_ARRAY_WITH_KPTR(1024);
DEFINE_ARRAY_WITH_KPTR(2048);
DEFINE_ARRAY_WITH_KPTR(4096);
SEC("fentry/" SYS_PREFIX "sys_nanosleep")
int test_bpf_mem_alloc_free(void *ctx)
/* per-cpu kptr doesn't support bin_data_8 which is a zero-sized array */
DEFINE_ARRAY_WITH_PERCPU_KPTR(16);
DEFINE_ARRAY_WITH_PERCPU_KPTR(32);
DEFINE_ARRAY_WITH_PERCPU_KPTR(64);
DEFINE_ARRAY_WITH_PERCPU_KPTR(96);
DEFINE_ARRAY_WITH_PERCPU_KPTR(128);
DEFINE_ARRAY_WITH_PERCPU_KPTR(192);
DEFINE_ARRAY_WITH_PERCPU_KPTR(256);
DEFINE_ARRAY_WITH_PERCPU_KPTR(512);
DEFINE_ARRAY_WITH_PERCPU_KPTR(1024);
DEFINE_ARRAY_WITH_PERCPU_KPTR(2048);
DEFINE_ARRAY_WITH_PERCPU_KPTR(4096);
SEC("?fentry/" SYS_PREFIX "sys_nanosleep")
int test_batch_alloc_free(void *ctx)
{
if ((u32)bpf_get_current_pid_tgid() != pid)
return 0;
@@ -121,3 +216,76 @@ int test_bpf_mem_alloc_free(void *ctx)
return 0;
}
SEC("?fentry/" SYS_PREFIX "sys_nanosleep")
int test_free_through_map_free(void *ctx)
{
if ((u32)bpf_get_current_pid_tgid() != pid)
return 0;
/* Alloc 128 8-bytes objects in batch to trigger refilling,
* then free these objects through map free.
*/
CALL_BATCH_ALLOC(8, 128, 0);
CALL_BATCH_ALLOC(16, 128, 1);
CALL_BATCH_ALLOC(32, 128, 2);
CALL_BATCH_ALLOC(64, 128, 3);
CALL_BATCH_ALLOC(96, 128, 4);
CALL_BATCH_ALLOC(128, 128, 5);
CALL_BATCH_ALLOC(192, 128, 6);
CALL_BATCH_ALLOC(256, 128, 7);
CALL_BATCH_ALLOC(512, 64, 8);
CALL_BATCH_ALLOC(1024, 32, 9);
CALL_BATCH_ALLOC(2048, 16, 10);
CALL_BATCH_ALLOC(4096, 8, 11);
return 0;
}
SEC("?fentry/" SYS_PREFIX "sys_nanosleep")
int test_batch_percpu_alloc_free(void *ctx)
{
if ((u32)bpf_get_current_pid_tgid() != pid)
return 0;
/* Alloc 128 16-bytes per-cpu objects in batch to trigger refilling,
* then free 128 16-bytes per-cpu objects in batch to trigger freeing.
*/
CALL_BATCH_PERCPU_ALLOC_FREE(16, 128, 1);
CALL_BATCH_PERCPU_ALLOC_FREE(32, 128, 2);
CALL_BATCH_PERCPU_ALLOC_FREE(64, 128, 3);
CALL_BATCH_PERCPU_ALLOC_FREE(96, 128, 4);
CALL_BATCH_PERCPU_ALLOC_FREE(128, 128, 5);
CALL_BATCH_PERCPU_ALLOC_FREE(192, 128, 6);
CALL_BATCH_PERCPU_ALLOC_FREE(256, 128, 7);
CALL_BATCH_PERCPU_ALLOC_FREE(512, 64, 8);
CALL_BATCH_PERCPU_ALLOC_FREE(1024, 32, 9);
CALL_BATCH_PERCPU_ALLOC_FREE(2048, 16, 10);
CALL_BATCH_PERCPU_ALLOC_FREE(4096, 8, 11);
return 0;
}
SEC("?fentry/" SYS_PREFIX "sys_nanosleep")
int test_percpu_free_through_map_free(void *ctx)
{
if ((u32)bpf_get_current_pid_tgid() != pid)
return 0;
/* Alloc 128 16-bytes per-cpu objects in batch to trigger refilling,
* then free these object through map free.
*/
CALL_BATCH_PERCPU_ALLOC(16, 128, 1);
CALL_BATCH_PERCPU_ALLOC(32, 128, 2);
CALL_BATCH_PERCPU_ALLOC(64, 128, 3);
CALL_BATCH_PERCPU_ALLOC(96, 128, 4);
CALL_BATCH_PERCPU_ALLOC(128, 128, 5);
CALL_BATCH_PERCPU_ALLOC(192, 128, 6);
CALL_BATCH_PERCPU_ALLOC(256, 128, 7);
CALL_BATCH_PERCPU_ALLOC(512, 64, 8);
CALL_BATCH_PERCPU_ALLOC(1024, 32, 9);
CALL_BATCH_PERCPU_ALLOC(2048, 16, 10);
CALL_BATCH_PERCPU_ALLOC(4096, 8, 11);
return 0;
}