bpf: Introduce sleepable BPF programs

Introduce sleepable BPF programs that can request such property for themselves
via BPF_F_SLEEPABLE flag at program load time. In such case they will be able
to use helpers like bpf_copy_from_user() that might sleep. At present only
fentry/fexit/fmod_ret and lsm programs can request to be sleepable and only
when they are attached to kernel functions that are known to allow sleeping.

The non-sleepable programs are relying on implicit rcu_read_lock() and
migrate_disable() to protect life time of programs, maps that they use and
per-cpu kernel structures used to pass info between bpf programs and the
kernel. The sleepable programs cannot be enclosed into rcu_read_lock().
migrate_disable() maps to preempt_disable() in non-RT kernels, so the progs
should not be enclosed in migrate_disable() as well. Therefore
rcu_read_lock_trace is used to protect the life time of sleepable progs.

There are many networking and tracing program types. In many cases the
'struct bpf_prog *' pointer itself is rcu protected within some other kernel
data structure and the kernel code is using rcu_dereference() to load that
program pointer and call BPF_PROG_RUN() on it. All these cases are not touched.
Instead sleepable bpf programs are allowed with bpf trampoline only. The
program pointers are hard-coded into generated assembly of bpf trampoline and
synchronize_rcu_tasks_trace() is used to protect the life time of the program.
The same trampoline can hold both sleepable and non-sleepable progs.

When rcu_read_lock_trace is held it means that some sleepable bpf program is
running from bpf trampoline. Those programs can use bpf arrays and preallocated
hash/lru maps. These map types are waiting on programs to complete via
synchronize_rcu_tasks_trace();

Updates to trampoline now has to do synchronize_rcu_tasks_trace() and
synchronize_rcu_tasks() to wait for sleepable progs to finish and for
trampoline assembly to finish.

This is the first step of introducing sleepable progs. Eventually dynamically
allocated hash maps can be allowed and networking program types can become
sleepable too.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Acked-by: KP Singh <kpsingh@google.com>
Link: https://lore.kernel.org/bpf/20200827220114.69225-3-alexei.starovoitov@gmail.com
This commit is contained in:
Alexei Starovoitov 2020-08-27 15:01:11 -07:00 committed by Daniel Borkmann
parent 76cd61739f
commit 1e6c62a882
10 changed files with 162 additions and 25 deletions

View File

@ -1379,10 +1379,15 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
u8 *prog = *pprog; u8 *prog = *pprog;
int cnt = 0; int cnt = 0;
if (emit_call(&prog, __bpf_prog_enter, prog)) if (p->aux->sleepable) {
return -EINVAL; if (emit_call(&prog, __bpf_prog_enter_sleepable, prog))
/* remember prog start time returned by __bpf_prog_enter */ return -EINVAL;
emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0); } else {
if (emit_call(&prog, __bpf_prog_enter, prog))
return -EINVAL;
/* remember prog start time returned by __bpf_prog_enter */
emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0);
}
/* arg1: lea rdi, [rbp - stack_size] */ /* arg1: lea rdi, [rbp - stack_size] */
EMIT4(0x48, 0x8D, 0x7D, -stack_size); EMIT4(0x48, 0x8D, 0x7D, -stack_size);
@ -1402,13 +1407,18 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
if (mod_ret) if (mod_ret)
emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8); emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8);
/* arg1: mov rdi, progs[i] */ if (p->aux->sleepable) {
emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, if (emit_call(&prog, __bpf_prog_exit_sleepable, prog))
(u32) (long) p); return -EINVAL;
/* arg2: mov rsi, rbx <- start time in nsec */ } else {
emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6); /* arg1: mov rdi, progs[i] */
if (emit_call(&prog, __bpf_prog_exit, prog)) emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32,
return -EINVAL; (u32) (long) p);
/* arg2: mov rsi, rbx <- start time in nsec */
emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6);
if (emit_call(&prog, __bpf_prog_exit, prog))
return -EINVAL;
}
*pprog = prog; *pprog = prog;
return 0; return 0;

View File

@ -539,6 +539,8 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end,
/* these two functions are called from generated trampoline */ /* these two functions are called from generated trampoline */
u64 notrace __bpf_prog_enter(void); u64 notrace __bpf_prog_enter(void);
void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start); void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start);
void notrace __bpf_prog_enter_sleepable(void);
void notrace __bpf_prog_exit_sleepable(void);
struct bpf_ksym { struct bpf_ksym {
unsigned long start; unsigned long start;
@ -734,6 +736,7 @@ struct bpf_prog_aux {
bool offload_requested; bool offload_requested;
bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */ bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */
bool func_proto_unreliable; bool func_proto_unreliable;
bool sleepable;
enum bpf_tramp_prog_type trampoline_prog_type; enum bpf_tramp_prog_type trampoline_prog_type;
struct bpf_trampoline *trampoline; struct bpf_trampoline *trampoline;
struct hlist_node tramp_hlist; struct hlist_node tramp_hlist;

View File

@ -346,6 +346,14 @@ enum bpf_link_type {
/* The verifier internal test flag. Behavior is undefined */ /* The verifier internal test flag. Behavior is undefined */
#define BPF_F_TEST_STATE_FREQ (1U << 3) #define BPF_F_TEST_STATE_FREQ (1U << 3)
/* If BPF_F_SLEEPABLE is used in BPF_PROG_LOAD command, the verifier will
* restrict map and helper usage for such programs. Sleepable BPF programs can
* only be attached to hooks where kernel execution context allows sleeping.
* Such programs are allowed to use helpers that may sleep like
* bpf_copy_from_user().
*/
#define BPF_F_SLEEPABLE (1U << 4)
/* When BPF ldimm64's insn[0].src_reg != 0 then this can have /* When BPF ldimm64's insn[0].src_reg != 0 then this can have
* two extensions: * two extensions:
* *

View File

@ -1691,6 +1691,7 @@ config BPF_SYSCALL
bool "Enable bpf() system call" bool "Enable bpf() system call"
select BPF select BPF
select IRQ_WORK select IRQ_WORK
select TASKS_TRACE_RCU
default n default n
help help
Enable the bpf() system call that allows to manipulate eBPF Enable the bpf() system call that allows to manipulate eBPF

View File

@ -10,6 +10,7 @@
#include <linux/filter.h> #include <linux/filter.h>
#include <linux/perf_event.h> #include <linux/perf_event.h>
#include <uapi/linux/btf.h> #include <uapi/linux/btf.h>
#include <linux/rcupdate_trace.h>
#include "map_in_map.h" #include "map_in_map.h"

View File

@ -9,6 +9,7 @@
#include <linux/rculist_nulls.h> #include <linux/rculist_nulls.h>
#include <linux/random.h> #include <linux/random.h>
#include <uapi/linux/btf.h> #include <uapi/linux/btf.h>
#include <linux/rcupdate_trace.h>
#include "percpu_freelist.h" #include "percpu_freelist.h"
#include "bpf_lru_list.h" #include "bpf_lru_list.h"
#include "map_in_map.h" #include "map_in_map.h"
@ -577,8 +578,7 @@ static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
struct htab_elem *l; struct htab_elem *l;
u32 hash, key_size; u32 hash, key_size;
/* Must be called with rcu_read_lock. */ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held());
WARN_ON_ONCE(!rcu_read_lock_held());
key_size = map->key_size; key_size = map->key_size;
@ -941,7 +941,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
/* unknown flags */ /* unknown flags */
return -EINVAL; return -EINVAL;
WARN_ON_ONCE(!rcu_read_lock_held()); WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held());
key_size = map->key_size; key_size = map->key_size;
@ -1032,7 +1032,7 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
/* unknown flags */ /* unknown flags */
return -EINVAL; return -EINVAL;
WARN_ON_ONCE(!rcu_read_lock_held()); WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held());
key_size = map->key_size; key_size = map->key_size;
@ -1220,7 +1220,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
u32 hash, key_size; u32 hash, key_size;
int ret = -ENOENT; int ret = -ENOENT;
WARN_ON_ONCE(!rcu_read_lock_held()); WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held());
key_size = map->key_size; key_size = map->key_size;
@ -1252,7 +1252,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
u32 hash, key_size; u32 hash, key_size;
int ret = -ENOENT; int ret = -ENOENT;
WARN_ON_ONCE(!rcu_read_lock_held()); WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held());
key_size = map->key_size; key_size = map->key_size;

View File

@ -29,6 +29,7 @@
#include <linux/bpf_lsm.h> #include <linux/bpf_lsm.h>
#include <linux/poll.h> #include <linux/poll.h>
#include <linux/bpf-netns.h> #include <linux/bpf-netns.h>
#include <linux/rcupdate_trace.h>
#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
(map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
@ -1731,10 +1732,14 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred)
btf_put(prog->aux->btf); btf_put(prog->aux->btf);
bpf_prog_free_linfo(prog); bpf_prog_free_linfo(prog);
if (deferred) if (deferred) {
call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); if (prog->aux->sleepable)
else call_rcu_tasks_trace(&prog->aux->rcu, __bpf_prog_put_rcu);
else
call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
} else {
__bpf_prog_put_rcu(&prog->aux->rcu); __bpf_prog_put_rcu(&prog->aux->rcu);
}
} }
static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
@ -2104,6 +2109,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT | if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT |
BPF_F_ANY_ALIGNMENT | BPF_F_ANY_ALIGNMENT |
BPF_F_TEST_STATE_FREQ | BPF_F_TEST_STATE_FREQ |
BPF_F_SLEEPABLE |
BPF_F_TEST_RND_HI32)) BPF_F_TEST_RND_HI32))
return -EINVAL; return -EINVAL;
@ -2159,6 +2165,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
} }
prog->aux->offload_requested = !!attr->prog_ifindex; prog->aux->offload_requested = !!attr->prog_ifindex;
prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE;
err = security_bpf_prog_alloc(prog->aux); err = security_bpf_prog_alloc(prog->aux);
if (err) if (err)

View File

@ -7,6 +7,8 @@
#include <linux/rbtree_latch.h> #include <linux/rbtree_latch.h>
#include <linux/perf_event.h> #include <linux/perf_event.h>
#include <linux/btf.h> #include <linux/btf.h>
#include <linux/rcupdate_trace.h>
#include <linux/rcupdate_wait.h>
/* dummy _ops. The verifier will operate on target program's ops. */ /* dummy _ops. The verifier will operate on target program's ops. */
const struct bpf_verifier_ops bpf_extension_verifier_ops = { const struct bpf_verifier_ops bpf_extension_verifier_ops = {
@ -210,9 +212,12 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr)
* updates to trampoline would change the code from underneath the * updates to trampoline would change the code from underneath the
* preempted task. Hence wait for tasks to voluntarily schedule or go * preempted task. Hence wait for tasks to voluntarily schedule or go
* to userspace. * to userspace.
* The same trampoline can hold both sleepable and non-sleepable progs.
* synchronize_rcu_tasks_trace() is needed to make sure all sleepable
* programs finish executing.
* Wait for these two grace periods together.
*/ */
synchronize_rcu_mult(call_rcu_tasks, call_rcu_tasks_trace);
synchronize_rcu_tasks();
err = arch_prepare_bpf_trampoline(new_image, new_image + PAGE_SIZE / 2, err = arch_prepare_bpf_trampoline(new_image, new_image + PAGE_SIZE / 2,
&tr->func.model, flags, tprogs, &tr->func.model, flags, tprogs,
@ -344,7 +349,14 @@ void bpf_trampoline_put(struct bpf_trampoline *tr)
if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FEXIT]))) if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FEXIT])))
goto out; goto out;
bpf_image_ksym_del(&tr->ksym); bpf_image_ksym_del(&tr->ksym);
/* wait for tasks to get out of trampoline before freeing it */ /* This code will be executed when all bpf progs (both sleepable and
* non-sleepable) went through
* bpf_prog_put()->call_rcu[_tasks_trace]()->bpf_prog_free_deferred().
* Hence no need for another synchronize_rcu_tasks_trace() here,
* but synchronize_rcu_tasks() is still needed, since trampoline
* may not have had any sleepable programs and we need to wait
* for tasks to get out of trampoline code before freeing it.
*/
synchronize_rcu_tasks(); synchronize_rcu_tasks();
bpf_jit_free_exec(tr->image); bpf_jit_free_exec(tr->image);
hlist_del(&tr->hlist); hlist_del(&tr->hlist);
@ -394,6 +406,16 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start)
rcu_read_unlock(); rcu_read_unlock();
} }
void notrace __bpf_prog_enter_sleepable(void)
{
rcu_read_lock_trace();
}
void notrace __bpf_prog_exit_sleepable(void)
{
rcu_read_unlock_trace();
}
int __weak int __weak
arch_prepare_bpf_trampoline(void *image, void *image_end, arch_prepare_bpf_trampoline(void *image, void *image_end,
const struct btf_func_model *m, u32 flags, const struct btf_func_model *m, u32 flags,

View File

@ -21,6 +21,7 @@
#include <linux/ctype.h> #include <linux/ctype.h>
#include <linux/error-injection.h> #include <linux/error-injection.h>
#include <linux/bpf_lsm.h> #include <linux/bpf_lsm.h>
#include <linux/btf_ids.h>
#include "disasm.h" #include "disasm.h"
@ -9367,6 +9368,23 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
return -EINVAL; return -EINVAL;
} }
if (prog->aux->sleepable)
switch (map->map_type) {
case BPF_MAP_TYPE_HASH:
case BPF_MAP_TYPE_LRU_HASH:
case BPF_MAP_TYPE_ARRAY:
if (!is_preallocated_map(map)) {
verbose(env,
"Sleepable programs can only use preallocated hash maps\n");
return -EINVAL;
}
break;
default:
verbose(env,
"Sleepable programs can only use array and hash maps\n");
return -EINVAL;
}
return 0; return 0;
} }
@ -10985,6 +11003,36 @@ static int check_attach_modify_return(struct bpf_prog *prog, unsigned long addr)
return -EINVAL; return -EINVAL;
} }
/* non exhaustive list of sleepable bpf_lsm_*() functions */
BTF_SET_START(btf_sleepable_lsm_hooks)
#ifdef CONFIG_BPF_LSM
BTF_ID(func, bpf_lsm_file_mprotect)
BTF_ID(func, bpf_lsm_bprm_committed_creds)
#endif
BTF_SET_END(btf_sleepable_lsm_hooks)
static int check_sleepable_lsm_hook(u32 btf_id)
{
return btf_id_set_contains(&btf_sleepable_lsm_hooks, btf_id);
}
/* list of non-sleepable functions that are otherwise on
* ALLOW_ERROR_INJECTION list
*/
BTF_SET_START(btf_non_sleepable_error_inject)
/* Three functions below can be called from sleepable and non-sleepable context.
* Assume non-sleepable from bpf safety point of view.
*/
BTF_ID(func, __add_to_page_cache_locked)
BTF_ID(func, should_fail_alloc_page)
BTF_ID(func, should_failslab)
BTF_SET_END(btf_non_sleepable_error_inject)
static int check_non_sleepable_error_inject(u32 btf_id)
{
return btf_id_set_contains(&btf_non_sleepable_error_inject, btf_id);
}
static int check_attach_btf_id(struct bpf_verifier_env *env) static int check_attach_btf_id(struct bpf_verifier_env *env)
{ {
struct bpf_prog *prog = env->prog; struct bpf_prog *prog = env->prog;
@ -11002,6 +11050,12 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
long addr; long addr;
u64 key; u64 key;
if (prog->aux->sleepable && prog->type != BPF_PROG_TYPE_TRACING &&
prog->type != BPF_PROG_TYPE_LSM) {
verbose(env, "Only fentry/fexit/fmod_ret and lsm programs can be sleepable\n");
return -EINVAL;
}
if (prog->type == BPF_PROG_TYPE_STRUCT_OPS) if (prog->type == BPF_PROG_TYPE_STRUCT_OPS)
return check_struct_ops_btf_id(env); return check_struct_ops_btf_id(env);
@ -11210,13 +11264,36 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
} }
} }
if (prog->expected_attach_type == BPF_MODIFY_RETURN) { if (prog->aux->sleepable) {
ret = -EINVAL;
switch (prog->type) {
case BPF_PROG_TYPE_TRACING:
/* fentry/fexit/fmod_ret progs can be sleepable only if they are
* attached to ALLOW_ERROR_INJECTION and are not in denylist.
*/
if (!check_non_sleepable_error_inject(btf_id) &&
within_error_injection_list(addr))
ret = 0;
break;
case BPF_PROG_TYPE_LSM:
/* LSM progs check that they are attached to bpf_lsm_*() funcs.
* Only some of them are sleepable.
*/
if (check_sleepable_lsm_hook(btf_id))
ret = 0;
break;
default:
break;
}
if (ret)
verbose(env, "%s is not sleepable\n",
prog->aux->attach_func_name);
} else if (prog->expected_attach_type == BPF_MODIFY_RETURN) {
ret = check_attach_modify_return(prog, addr); ret = check_attach_modify_return(prog, addr);
if (ret) if (ret)
verbose(env, "%s() is not modifiable\n", verbose(env, "%s() is not modifiable\n",
prog->aux->attach_func_name); prog->aux->attach_func_name);
} }
if (ret) if (ret)
goto out; goto out;
tr->func.addr = (void *)addr; tr->func.addr = (void *)addr;

View File

@ -346,6 +346,14 @@ enum bpf_link_type {
/* The verifier internal test flag. Behavior is undefined */ /* The verifier internal test flag. Behavior is undefined */
#define BPF_F_TEST_STATE_FREQ (1U << 3) #define BPF_F_TEST_STATE_FREQ (1U << 3)
/* If BPF_F_SLEEPABLE is used in BPF_PROG_LOAD command, the verifier will
* restrict map and helper usage for such programs. Sleepable BPF programs can
* only be attached to hooks where kernel execution context allows sleeping.
* Such programs are allowed to use helpers that may sleep like
* bpf_copy_from_user().
*/
#define BPF_F_SLEEPABLE (1U << 4)
/* When BPF ldimm64's insn[0].src_reg != 0 then this can have /* When BPF ldimm64's insn[0].src_reg != 0 then this can have
* two extensions: * two extensions:
* *