bpf: struct_ops supports more than one page for trampolines.

The BPF struct_ops previously only allowed one page of trampolines.
Each function pointer of a struct_ops is implemented by a struct_ops
bpf program. Each struct_ops bpf program requires a trampoline.
The following selftest patch shows each page can hold a little more
than 20 trampolines.

While one page is more than enough for the tcp-cc usecase,
the sched_ext use case shows that one page is not always enough and hits
the one page limit. This patch overcomes the one page limit by allocating
another page when needed and it is limited to a total of
MAX_IMAGE_PAGES (8) pages which is more than enough for
reasonable usages.

The variable st_map->image has been changed to st_map->image_pages, and
its type has been changed to an array of pointers to pages.

Signed-off-by: Kui-Feng Lee <thinker.li@gmail.com>
Link: https://lore.kernel.org/r/20240224223418.526631-3-thinker.li@gmail.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
This commit is contained in:
Kui-Feng Lee 2024-02-24 14:34:17 -08:00 committed by Martin KaFai Lau
parent 73e4f9e615
commit 187e2af05a
3 changed files with 96 additions and 50 deletions

View File

@ -1763,7 +1763,9 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
struct bpf_tramp_link *link,
const struct btf_func_model *model,
void *stub_func,
void *image, void *image_end);
void **image, u32 *image_off,
bool allow_alloc);
void bpf_struct_ops_image_free(void *image);
static inline bool bpf_try_module_get(const void *data, struct module *owner)
{
if (owner == BPF_MODULE_OWNER)

View File

@ -18,6 +18,8 @@ struct bpf_struct_ops_value {
char data[] ____cacheline_aligned_in_smp;
};
#define MAX_TRAMP_IMAGE_PAGES 8
struct bpf_struct_ops_map {
struct bpf_map map;
struct rcu_head rcu;
@ -30,12 +32,11 @@ struct bpf_struct_ops_map {
*/
struct bpf_link **links;
u32 links_cnt;
/* image is a page that has all the trampolines
u32 image_pages_cnt;
/* image_pages is an array of pages that has all the trampolines
* that stores the func args before calling the bpf_prog.
* A PAGE_SIZE "image" is enough to store all trampoline for
* "links[]".
*/
void *image;
void *image_pages[MAX_TRAMP_IMAGE_PAGES];
/* The owner moduler's btf. */
struct btf *btf;
/* uvalue->data stores the kernel struct
@ -116,6 +117,31 @@ static bool is_valid_value_type(struct btf *btf, s32 value_id,
return true;
}
static void *bpf_struct_ops_image_alloc(void)
{
void *image;
int err;
err = bpf_jit_charge_modmem(PAGE_SIZE);
if (err)
return ERR_PTR(err);
image = arch_alloc_bpf_trampoline(PAGE_SIZE);
if (!image) {
bpf_jit_uncharge_modmem(PAGE_SIZE);
return ERR_PTR(-ENOMEM);
}
return image;
}
void bpf_struct_ops_image_free(void *image)
{
if (image) {
arch_free_bpf_trampoline(image, PAGE_SIZE);
bpf_jit_uncharge_modmem(PAGE_SIZE);
}
}
#define MAYBE_NULL_SUFFIX "__nullable"
#define MAX_STUB_NAME 128
@ -461,6 +487,15 @@ static void bpf_struct_ops_map_put_progs(struct bpf_struct_ops_map *st_map)
}
}
static void bpf_struct_ops_map_free_image(struct bpf_struct_ops_map *st_map)
{
int i;
for (i = 0; i < st_map->image_pages_cnt; i++)
bpf_struct_ops_image_free(st_map->image_pages[i]);
st_map->image_pages_cnt = 0;
}
static int check_zero_holes(const struct btf *btf, const struct btf_type *t, void *data)
{
const struct btf_member *member;
@ -506,9 +541,12 @@ const struct bpf_link_ops bpf_struct_ops_link_lops = {
int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
struct bpf_tramp_link *link,
const struct btf_func_model *model,
void *stub_func, void *image, void *image_end)
void *stub_func,
void **_image, u32 *_image_off,
bool allow_alloc)
{
u32 flags = BPF_TRAMP_F_INDIRECT;
u32 image_off = *_image_off, flags = BPF_TRAMP_F_INDIRECT;
void *image = *_image;
int size;
tlinks[BPF_TRAMP_FENTRY].links[0] = link;
@ -518,12 +556,32 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
flags |= BPF_TRAMP_F_RET_FENTRY_RET;
size = arch_bpf_trampoline_size(model, flags, tlinks, NULL);
if (size < 0)
return size;
if (size > (unsigned long)image_end - (unsigned long)image)
return -E2BIG;
return arch_prepare_bpf_trampoline(NULL, image, image_end,
if (size <= 0)
return size ? : -EFAULT;
/* Allocate image buffer if necessary */
if (!image || size > PAGE_SIZE - image_off) {
if (!allow_alloc)
return -E2BIG;
image = bpf_struct_ops_image_alloc();
if (IS_ERR(image))
return PTR_ERR(image);
image_off = 0;
}
size = arch_prepare_bpf_trampoline(NULL, image + image_off,
image + PAGE_SIZE,
model, flags, tlinks, stub_func);
if (size <= 0) {
if (image != *_image)
bpf_struct_ops_image_free(image);
return size ? : -EFAULT;
}
*_image = image;
*_image_off = image_off + size;
return 0;
}
static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
@ -539,8 +597,8 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
struct bpf_tramp_links *tlinks;
void *udata, *kdata;
int prog_fd, err;
void *image, *image_end;
u32 i;
u32 i, trampoline_start, image_off = 0;
void *cur_image = NULL, *image = NULL;
if (flags)
return -EINVAL;
@ -578,8 +636,6 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
udata = &uvalue->data;
kdata = &kvalue->data;
image = st_map->image;
image_end = st_map->image + PAGE_SIZE;
module_type = btf_type_by_id(btf_vmlinux, st_ops_ids[IDX_MODULE_ID]);
for_each_member(i, t, member) {
@ -658,15 +714,24 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
&bpf_struct_ops_link_lops, prog);
st_map->links[i] = &link->link;
trampoline_start = image_off;
err = bpf_struct_ops_prepare_trampoline(tlinks, link,
&st_ops->func_models[i],
*(void **)(st_ops->cfi_stubs + moff),
image, image_end);
&st_ops->func_models[i],
*(void **)(st_ops->cfi_stubs + moff),
&image, &image_off,
st_map->image_pages_cnt < MAX_TRAMP_IMAGE_PAGES);
if (err)
goto reset_unlock;
if (cur_image != image) {
st_map->image_pages[st_map->image_pages_cnt++] = image;
cur_image = image;
trampoline_start = 0;
}
if (err < 0)
goto reset_unlock;
*(void **)(kdata + moff) = image + cfi_get_offset();
image += err;
*(void **)(kdata + moff) = image + trampoline_start + cfi_get_offset();
/* put prog_id to udata */
*(unsigned long *)(udata + moff) = prog->aux->id;
@ -677,10 +742,11 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
if (err)
goto reset_unlock;
}
for (i = 0; i < st_map->image_pages_cnt; i++)
arch_protect_bpf_trampoline(st_map->image_pages[i], PAGE_SIZE);
if (st_map->map.map_flags & BPF_F_LINK) {
err = 0;
arch_protect_bpf_trampoline(st_map->image, PAGE_SIZE);
/* Let bpf_link handle registration & unregistration.
*
* Pair with smp_load_acquire() during lookup_elem().
@ -689,7 +755,6 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
goto unlock;
}
arch_protect_bpf_trampoline(st_map->image, PAGE_SIZE);
err = st_ops->reg(kdata);
if (likely(!err)) {
/* This refcnt increment on the map here after
@ -712,9 +777,9 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
* there was a race in registering the struct_ops (under the same name) to
* a sub-system through different struct_ops's maps.
*/
arch_unprotect_bpf_trampoline(st_map->image, PAGE_SIZE);
reset_unlock:
bpf_struct_ops_map_free_image(st_map);
bpf_struct_ops_map_put_progs(st_map);
memset(uvalue, 0, map->value_size);
memset(kvalue, 0, map->value_size);
@ -781,10 +846,7 @@ static void __bpf_struct_ops_map_free(struct bpf_map *map)
if (st_map->links)
bpf_struct_ops_map_put_progs(st_map);
bpf_map_area_free(st_map->links);
if (st_map->image) {
arch_free_bpf_trampoline(st_map->image, PAGE_SIZE);
bpf_jit_uncharge_modmem(PAGE_SIZE);
}
bpf_struct_ops_map_free_image(st_map);
bpf_map_area_free(st_map->uvalue);
bpf_map_area_free(st_map);
}
@ -894,20 +956,6 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
st_map->st_ops_desc = st_ops_desc;
map = &st_map->map;
ret = bpf_jit_charge_modmem(PAGE_SIZE);
if (ret)
goto errout_free;
st_map->image = arch_alloc_bpf_trampoline(PAGE_SIZE);
if (!st_map->image) {
/* __bpf_struct_ops_map_free() uses st_map->image as flag
* for "charged or not". In this case, we need to unchange
* here.
*/
bpf_jit_uncharge_modmem(PAGE_SIZE);
ret = -ENOMEM;
goto errout_free;
}
st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE);
st_map->links_cnt = btf_type_vlen(t);
st_map->links =

View File

@ -91,6 +91,7 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr,
struct bpf_tramp_link *link = NULL;
void *image = NULL;
unsigned int op_idx;
u32 image_off = 0;
int prog_ret;
s32 type_id;
int err;
@ -114,12 +115,6 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr,
goto out;
}
image = arch_alloc_bpf_trampoline(PAGE_SIZE);
if (!image) {
err = -ENOMEM;
goto out;
}
link = kzalloc(sizeof(*link), GFP_USER);
if (!link) {
err = -ENOMEM;
@ -133,7 +128,8 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr,
err = bpf_struct_ops_prepare_trampoline(tlinks, link,
&st_ops->func_models[op_idx],
&dummy_ops_test_ret_function,
image, image + PAGE_SIZE);
&image, &image_off,
true);
if (err < 0)
goto out;
@ -147,7 +143,7 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr,
err = -EFAULT;
out:
kfree(args);
arch_free_bpf_trampoline(image, PAGE_SIZE);
bpf_struct_ops_image_free(image);
if (link)
bpf_link_put(&link->link);
kfree(tlinks);