diff --git a/include/linux/bpf.h b/include/linux/bpf.h index da8c64ca8dc9..2ae39a3e9ead 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -50,6 +50,7 @@ struct bpf_map { const struct bpf_map_ops *ops; struct work_struct work; atomic_t usercnt; + struct bpf_map *inner_map_meta; }; struct bpf_map_type_list { @@ -276,6 +277,8 @@ int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value); int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, void *key, void *value, u64 map_flags); void bpf_fd_array_map_clear(struct bpf_map *map); +int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, + void *key, void *value, u64 map_flags); /* memcpy that is used with 8-byte aligned pointers, power-of-8 size and * forced to use 'long' read/writes to try to atomically copy long counters. diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0539a0ceef38..ce6f029ac368 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -96,6 +96,8 @@ enum bpf_map_type { BPF_MAP_TYPE_LRU_HASH, BPF_MAP_TYPE_LRU_PERCPU_HASH, BPF_MAP_TYPE_LPM_TRIE, + BPF_MAP_TYPE_ARRAY_OF_MAPS, + BPF_MAP_TYPE_HASH_OF_MAPS, }; enum bpf_prog_type { @@ -152,6 +154,7 @@ union bpf_attr { __u32 value_size; /* size of value in bytes */ __u32 max_entries; /* max number of entries in a map */ __u32 map_flags; /* prealloc or not */ + __u32 inner_map_fd; /* fd pointing to the inner map */ }; struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index e1ce4f4fd7fd..e1e5e658f2db 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,7 +1,7 @@ obj-y := core.o obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o -obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o +obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o ifeq ($(CONFIG_PERF_EVENTS),y) 
obj-$(CONFIG_BPF_SYSCALL) += stackmap.o endif diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index bcf9955fac95..bc9da93db403 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -17,6 +17,8 @@ #include #include +#include "map_in_map.h" + static void bpf_array_free_percpu(struct bpf_array *array) { int i; @@ -117,20 +119,17 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key) /* emit BPF instructions equivalent to C code of array_map_lookup_elem() */ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { - struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_insn *insn = insn_buf; - u32 elem_size = array->elem_size; + u32 elem_size = round_up(map->value_size, 8); const int ret = BPF_REG_0; const int map_ptr = BPF_REG_1; const int index = BPF_REG_2; *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); - *insn++ = BPF_JMP_IMM(BPF_JGE, ret, array->map.max_entries, - elem_size == 1 ? 
2 : 3); - if (elem_size == 1) { - /* nop */ - } else if (is_power_of_2(elem_size)) { + *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3); + + if (is_power_of_2(elem_size)) { *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size)); } else { *insn++ = BPF_ALU64_IMM(BPF_MUL, ret, elem_size); @@ -605,3 +604,64 @@ static int __init register_cgroup_array_map(void) } late_initcall(register_cgroup_array_map); #endif + +static struct bpf_map *array_of_map_alloc(union bpf_attr *attr) +{ + struct bpf_map *map, *inner_map_meta; + + inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd); + if (IS_ERR(inner_map_meta)) + return inner_map_meta; + + map = fd_array_map_alloc(attr); + if (IS_ERR(map)) { + bpf_map_meta_free(inner_map_meta); + return map; + } + + map->inner_map_meta = inner_map_meta; + + return map; +} + +static void array_of_map_free(struct bpf_map *map) +{ + /* map->inner_map_meta is only accessed by syscall which + * is protected by fdget/fdput. + */ + bpf_map_meta_free(map->inner_map_meta); + bpf_fd_array_map_clear(map); + fd_array_map_free(map); +} + +static void *array_of_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_map **inner_map = array_map_lookup_elem(map, key); + + if (!inner_map) + return NULL; + + return READ_ONCE(*inner_map); +} + +static const struct bpf_map_ops array_of_map_ops = { + .map_alloc = array_of_map_alloc, + .map_free = array_of_map_free, + .map_get_next_key = array_map_get_next_key, + .map_lookup_elem = array_of_map_lookup_elem, + .map_delete_elem = fd_array_map_delete_elem, + .map_fd_get_ptr = bpf_map_fd_get_ptr, + .map_fd_put_ptr = bpf_map_fd_put_ptr, +}; + +static struct bpf_map_type_list array_of_map_type __ro_after_init = { + .ops = &array_of_map_ops, + .type = BPF_MAP_TYPE_ARRAY_OF_MAPS, +}; + +static int __init register_array_of_map(void) +{ + bpf_register_map_type(&array_of_map_type); + return 0; +} +late_initcall(register_array_of_map); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 
000153acb6d5..343fb5394c95 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -16,6 +16,7 @@ #include #include "percpu_freelist.h" #include "bpf_lru_list.h" +#include "map_in_map.h" struct bucket { struct hlist_nulls_head head; @@ -88,6 +89,11 @@ static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size return *(void __percpu **)(l->key + key_size); } +static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l) +{ + return *(void **)(l->key + roundup(map->key_size, 8)); +} + static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i) { return (struct htab_elem *) (htab->elems + i * htab->elem_size); @@ -603,6 +609,14 @@ static void htab_elem_free_rcu(struct rcu_head *head) static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) { + struct bpf_map *map = &htab->map; + + if (map->ops->map_fd_put_ptr) { + void *ptr = fd_htab_map_get_ptr(map, l); + + map->ops->map_fd_put_ptr(ptr); + } + if (l->state == HTAB_EXTRA_ELEM_USED) { l->state = HTAB_EXTRA_ELEM_FREE; return; @@ -1057,6 +1071,7 @@ static void delete_all_elements(struct bpf_htab *htab) } } } + /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ static void htab_map_free(struct bpf_map *map) { @@ -1213,12 +1228,118 @@ static struct bpf_map_type_list htab_lru_percpu_type __ro_after_init = { .type = BPF_MAP_TYPE_LRU_PERCPU_HASH, }; +static struct bpf_map *fd_htab_map_alloc(union bpf_attr *attr) +{ + struct bpf_map *map; + + if (attr->value_size != sizeof(u32)) + return ERR_PTR(-EINVAL); + + /* pointer is stored internally */ + attr->value_size = sizeof(void *); + map = htab_map_alloc(attr); + attr->value_size = sizeof(u32); + + return map; +} + +static void fd_htab_map_free(struct bpf_map *map) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct hlist_nulls_node *n; + struct hlist_nulls_head *head; + struct htab_elem *l; + int i; + + for (i = 0; i < htab->n_buckets; i++) { + head 
= select_bucket(htab, i); + + hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { + void *ptr = fd_htab_map_get_ptr(map, l); + + map->ops->map_fd_put_ptr(ptr); + } + } + + htab_map_free(map); +} + +/* only called from syscall */ +int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, + void *key, void *value, u64 map_flags) +{ + void *ptr; + int ret; + u32 ufd = *(u32 *)value; + + ptr = map->ops->map_fd_get_ptr(map, map_file, ufd); + if (IS_ERR(ptr)) + return PTR_ERR(ptr); + + ret = htab_map_update_elem(map, key, &ptr, map_flags); + if (ret) + map->ops->map_fd_put_ptr(ptr); + + return ret; +} + +static struct bpf_map *htab_of_map_alloc(union bpf_attr *attr) +{ + struct bpf_map *map, *inner_map_meta; + + inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd); + if (IS_ERR(inner_map_meta)) + return inner_map_meta; + + map = fd_htab_map_alloc(attr); + if (IS_ERR(map)) { + bpf_map_meta_free(inner_map_meta); + return map; + } + + map->inner_map_meta = inner_map_meta; + + return map; +} + +static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_map **inner_map = htab_map_lookup_elem(map, key); + + if (!inner_map) + return NULL; + + return READ_ONCE(*inner_map); +} + +static void htab_of_map_free(struct bpf_map *map) +{ + bpf_map_meta_free(map->inner_map_meta); + fd_htab_map_free(map); +} + +static const struct bpf_map_ops htab_of_map_ops = { + .map_alloc = htab_of_map_alloc, + .map_free = htab_of_map_free, + .map_get_next_key = htab_map_get_next_key, + .map_lookup_elem = htab_of_map_lookup_elem, + .map_delete_elem = htab_map_delete_elem, + .map_fd_get_ptr = bpf_map_fd_get_ptr, + .map_fd_put_ptr = bpf_map_fd_put_ptr, +}; + +static struct bpf_map_type_list htab_of_map_type __ro_after_init = { + .ops = &htab_of_map_ops, + .type = BPF_MAP_TYPE_HASH_OF_MAPS, +}; + static int __init register_htab_map(void) { bpf_register_map_type(&htab_type); bpf_register_map_type(&htab_percpu_type); bpf_register_map_type(&htab_lru_type); 
bpf_register_map_type(&htab_lru_percpu_type); + bpf_register_map_type(&htab_of_map_type); return 0; } late_initcall(register_htab_map); diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c new file mode 100644 index 000000000000..59bcdf821ae4 --- /dev/null +++ b/kernel/bpf/map_in_map.c @@ -0,0 +1,97 @@ +/* Copyright (c) 2017 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include <linux/slab.h> +#include <linux/bpf.h> + +#include "map_in_map.h" + +struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) +{ + struct bpf_map *inner_map, *inner_map_meta; + struct fd f; + + f = fdget(inner_map_ufd); + inner_map = __bpf_map_get(f); + if (IS_ERR(inner_map)) + return inner_map; + + /* prog_array->owner_prog_type and owner_jited + * is a runtime binding. Doing static check alone + * in the verifier is not enough. + */ + if (inner_map->map_type == BPF_MAP_TYPE_PROG_ARRAY) { + fdput(f); + return ERR_PTR(-ENOTSUPP); + } + + /* Does not support >1 level map-in-map */ + if (inner_map->inner_map_meta) { + fdput(f); + return ERR_PTR(-EINVAL); + } + + inner_map_meta = kzalloc(sizeof(*inner_map_meta), GFP_USER); + if (!inner_map_meta) { + fdput(f); + return ERR_PTR(-ENOMEM); + } + + inner_map_meta->map_type = inner_map->map_type; + inner_map_meta->key_size = inner_map->key_size; + inner_map_meta->value_size = inner_map->value_size; + inner_map_meta->map_flags = inner_map->map_flags; + inner_map_meta->ops = inner_map->ops; + inner_map_meta->max_entries = inner_map->max_entries; + + fdput(f); + return inner_map_meta; +} + +void bpf_map_meta_free(struct bpf_map *map_meta) +{ + kfree(map_meta); +} + +bool bpf_map_meta_equal(const struct bpf_map *meta0, + const struct bpf_map *meta1) +{ + /* No need to compare ops because it is covered by map_type */ + return meta0->map_type == meta1->map_type && + meta0->key_size == meta1->key_size && + 
meta0->value_size == meta1->value_size && + meta0->map_flags == meta1->map_flags && + meta0->max_entries == meta1->max_entries; +} + +void *bpf_map_fd_get_ptr(struct bpf_map *map, + struct file *map_file /* not used */, + int ufd) +{ + struct bpf_map *inner_map; + struct fd f; + + f = fdget(ufd); + inner_map = __bpf_map_get(f); + if (IS_ERR(inner_map)) + return inner_map; + + if (bpf_map_meta_equal(map->inner_map_meta, inner_map)) + inner_map = bpf_map_inc(inner_map, false); + else + inner_map = ERR_PTR(-EINVAL); + + fdput(f); + return inner_map; +} + +void bpf_map_fd_put_ptr(void *ptr) +{ + /* ptr->ops->map_free() has to go through one + * rcu grace period by itself. + */ + bpf_map_put(ptr); +} diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h new file mode 100644 index 000000000000..177fadb689dc --- /dev/null +++ b/kernel/bpf/map_in_map.h @@ -0,0 +1,23 @@ +/* Copyright (c) 2017 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ */ +#ifndef __MAP_IN_MAP_H__ +#define __MAP_IN_MAP_H__ + +#include <linux/types.h> + +struct file; +struct bpf_map; + +struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd); +void bpf_map_meta_free(struct bpf_map *map_meta); +bool bpf_map_meta_equal(const struct bpf_map *meta0, + const struct bpf_map *meta1); +void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file, + int ufd); +void bpf_map_fd_put_ptr(void *ptr); + +#endif diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 48c914b983bd..c35ebfe6d84d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -215,7 +215,7 @@ int bpf_map_new_fd(struct bpf_map *map) offsetof(union bpf_attr, CMD##_LAST_FIELD) - \ sizeof(attr->CMD##_LAST_FIELD)) != NULL -#define BPF_MAP_CREATE_LAST_FIELD map_flags +#define BPF_MAP_CREATE_LAST_FIELD inner_map_fd /* called via syscall */ static int map_create(union bpf_attr *attr) { @@ -352,6 +352,9 @@ static int map_lookup_elem(union bpf_attr *attr) err = bpf_percpu_array_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { err = bpf_stackmap_copy(map, key, value); + } else if (map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS || + map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { + err = -ENOTSUPP; } else { rcu_read_lock(); ptr = map->ops->map_lookup_elem(map, key); @@ -438,11 +441,17 @@ static int map_update_elem(union bpf_attr *attr) err = bpf_percpu_array_update(map, key, value, attr->flags); } else if (map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || map->map_type == BPF_MAP_TYPE_PROG_ARRAY || - map->map_type == BPF_MAP_TYPE_CGROUP_ARRAY) { + map->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || + map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) { rcu_read_lock(); err = bpf_fd_array_map_update_elem(map, f.file, key, value, attr->flags); rcu_read_unlock(); + } else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { + rcu_read_lock(); + err = bpf_fd_htab_map_update_elem(map, f.file, key, value, + attr->flags); + rcu_read_unlock(); } else { rcu_read_lock(); err = 
map->ops->map_update_elem(map, key, value, attr->flags); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 90bf46787603..09923cc5c7c7 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -143,6 +143,8 @@ struct bpf_verifier_stack_elem { #define BPF_COMPLEXITY_LIMIT_INSNS 65536 #define BPF_COMPLEXITY_LIMIT_STACK 1024 +#define BPF_MAP_PTR_POISON ((void *)0xeB9F + POISON_POINTER_DELTA) + struct bpf_call_arg_meta { struct bpf_map *map_ptr; bool raw_mode; @@ -1197,6 +1199,11 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) func_id != BPF_FUNC_current_task_under_cgroup) goto error; break; + case BPF_MAP_TYPE_ARRAY_OF_MAPS: + case BPF_MAP_TYPE_HASH_OF_MAPS: + if (func_id != BPF_FUNC_map_lookup_elem) + goto error; + break; default: break; } @@ -1357,6 +1363,8 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) } else if (fn->ret_type == RET_VOID) { regs[BPF_REG_0].type = NOT_INIT; } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) { + struct bpf_insn_aux_data *insn_aux; + regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; regs[BPF_REG_0].max_value = regs[BPF_REG_0].min_value = 0; /* remember map_ptr, so that check_map_access() @@ -1369,7 +1377,11 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) } regs[BPF_REG_0].map_ptr = meta.map_ptr; regs[BPF_REG_0].id = ++env->id_gen; - env->insn_aux_data[insn_idx].map_ptr = meta.map_ptr; + insn_aux = &env->insn_aux_data[insn_idx]; + if (!insn_aux->map_ptr) + insn_aux->map_ptr = meta.map_ptr; + else if (insn_aux->map_ptr != meta.map_ptr) + insn_aux->map_ptr = BPF_MAP_PTR_POISON; } else { verbose("unknown return type %d of func %s#%d\n", fn->ret_type, func_id_name(func_id), func_id); @@ -2093,14 +2105,19 @@ static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id, struct bpf_reg_state *reg = &regs[regno]; if (reg->type == PTR_TO_MAP_VALUE_OR_NULL && reg->id == id) { - reg->type = type; + if (type == UNKNOWN_VALUE) 
{ + __mark_reg_unknown_value(regs, regno); + } else if (reg->map_ptr->inner_map_meta) { + reg->type = CONST_PTR_TO_MAP; + reg->map_ptr = reg->map_ptr->inner_map_meta; + } else { + reg->type = type; + } /* We don't need id from this point onwards anymore, thus we * should better reset it, so that state pruning has chances * to take effect. */ reg->id = 0; - if (type == UNKNOWN_VALUE) - __mark_reg_unknown_value(regs, regno); } } @@ -3025,16 +3042,33 @@ static int do_check(struct bpf_verifier_env *env) return 0; } +static int check_map_prealloc(struct bpf_map *map) +{ + return (map->map_type != BPF_MAP_TYPE_HASH && + map->map_type != BPF_MAP_TYPE_PERCPU_HASH && + map->map_type != BPF_MAP_TYPE_HASH_OF_MAPS) || + !(map->map_flags & BPF_F_NO_PREALLOC); +} + static int check_map_prog_compatibility(struct bpf_map *map, struct bpf_prog *prog) { - if (prog->type == BPF_PROG_TYPE_PERF_EVENT && - (map->map_type == BPF_MAP_TYPE_HASH || - map->map_type == BPF_MAP_TYPE_PERCPU_HASH) && - (map->map_flags & BPF_F_NO_PREALLOC)) { - verbose("perf_event programs can only use preallocated hash map\n"); - return -EINVAL; + /* Make sure that BPF_PROG_TYPE_PERF_EVENT programs only use + * preallocated hash maps, since doing memory allocation + * in overflow_handler can crash depending on where nmi got + * triggered. 
+ */ + if (prog->type == BPF_PROG_TYPE_PERF_EVENT) { + if (!check_map_prealloc(map)) { + verbose("perf_event programs can only use preallocated hash map\n"); + return -EINVAL; + } + if (map->inner_map_meta && + !check_map_prealloc(map->inner_map_meta)) { + verbose("perf_event programs can only use preallocated inner hash map\n"); + return -EINVAL; + } } return 0; } @@ -3307,7 +3341,8 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) if (ebpf_jit_enabled() && insn->imm == BPF_FUNC_map_lookup_elem) { map_ptr = env->insn_aux_data[i + delta].map_ptr; - if (!map_ptr->ops->map_gen_lookup) + if (map_ptr == BPF_MAP_PTR_POISON || + !map_ptr->ops->map_gen_lookup) goto patch_call_imm; cnt = map_ptr->ops->map_gen_lookup(map_ptr, insn_buf); diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 09e9d535bd74..91c1d616d975 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -34,6 +34,7 @@ hostprogs-y += sampleip hostprogs-y += tc_l2_redirect hostprogs-y += lwt_len_hist hostprogs-y += xdp_tx_iptunnel +hostprogs-y += test_map_in_map # Libbpf dependencies LIBBPF := ../../tools/lib/bpf/bpf.o @@ -72,6 +73,7 @@ sampleip-objs := bpf_load.o $(LIBBPF) sampleip_user.o tc_l2_redirect-objs := bpf_load.o $(LIBBPF) tc_l2_redirect_user.o lwt_len_hist-objs := bpf_load.o $(LIBBPF) lwt_len_hist_user.o xdp_tx_iptunnel-objs := bpf_load.o $(LIBBPF) xdp_tx_iptunnel_user.o +test_map_in_map-objs := bpf_load.o $(LIBBPF) test_map_in_map_user.o # Tell kbuild to always build the programs always := $(hostprogs-y) @@ -105,6 +107,7 @@ always += trace_event_kern.o always += sampleip_kern.o always += lwt_len_hist_kern.o always += xdp_tx_iptunnel_kern.o +always += test_map_in_map_kern.o HOSTCFLAGS += -I$(objtree)/usr/include HOSTCFLAGS += -I$(srctree)/tools/lib/ @@ -139,6 +142,7 @@ HOSTLOADLIBES_sampleip += -lelf HOSTLOADLIBES_tc_l2_redirect += -l elf HOSTLOADLIBES_lwt_len_hist += -l elf HOSTLOADLIBES_xdp_tx_iptunnel += -lelf +HOSTLOADLIBES_test_map_in_map += -lelf # Allows pointing 
LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline: # make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h index faaffe2e139a..52de9d88c021 100644 --- a/samples/bpf/bpf_helpers.h +++ b/samples/bpf/bpf_helpers.h @@ -80,6 +80,7 @@ struct bpf_map_def { unsigned int value_size; unsigned int max_entries; unsigned int map_flags; + unsigned int inner_map_idx; }; static int (*bpf_skb_load_bytes)(void *ctx, int off, void *to, int len) = diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c index b86ee54da2d1..dcdce1270d38 100644 --- a/samples/bpf/bpf_load.c +++ b/samples/bpf/bpf_load.c @@ -43,6 +43,7 @@ struct bpf_map_def { unsigned int value_size; unsigned int max_entries; unsigned int map_flags; + unsigned int inner_map_idx; }; static int populate_prog_array(const char *event, int prog_fd) @@ -198,11 +199,22 @@ static int load_maps(struct bpf_map_def *maps, int len) for (i = 0; i < len / sizeof(struct bpf_map_def); i++) { - map_fd[i] = bpf_create_map(maps[i].type, - maps[i].key_size, - maps[i].value_size, - maps[i].max_entries, - maps[i].map_flags); + if (maps[i].type == BPF_MAP_TYPE_ARRAY_OF_MAPS || + maps[i].type == BPF_MAP_TYPE_HASH_OF_MAPS) { + int inner_map_fd = map_fd[maps[i].inner_map_idx]; + + map_fd[i] = bpf_create_map_in_map(maps[i].type, + maps[i].key_size, + inner_map_fd, + maps[i].max_entries, + maps[i].map_flags); + } else { + map_fd[i] = bpf_create_map(maps[i].type, + maps[i].key_size, + maps[i].value_size, + maps[i].max_entries, + maps[i].map_flags); + } if (map_fd[i] < 0) { printf("failed to create a map: %d %s\n", errno, strerror(errno)); diff --git a/samples/bpf/test_map_in_map_kern.c b/samples/bpf/test_map_in_map_kern.c new file mode 100644 index 000000000000..42c44d091dd1 --- /dev/null +++ b/samples/bpf/test_map_in_map_kern.c @@ -0,0 +1,173 @@ +/* + * Copyright (c) 2017 Facebook + * + * This program is free software; you can redistribute 
it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#define KBUILD_MODNAME "foo" +#include <linux/ptrace.h> +#include <linux/version.h> +#include <uapi/linux/bpf.h> +#include <uapi/linux/in6.h> +#include "bpf_helpers.h" + +#define MAX_NR_PORTS 65536 + +/* map #0 */ +struct bpf_map_def SEC("maps") port_a = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(int), + .max_entries = MAX_NR_PORTS, +}; + +/* map #1 */ +struct bpf_map_def SEC("maps") port_h = { + .type = BPF_MAP_TYPE_HASH, + .key_size = sizeof(u32), + .value_size = sizeof(int), + .max_entries = 1, +}; + +/* map #2 */ +struct bpf_map_def SEC("maps") reg_result_h = { + .type = BPF_MAP_TYPE_HASH, + .key_size = sizeof(u32), + .value_size = sizeof(int), + .max_entries = 1, +}; + +/* map #3 */ +struct bpf_map_def SEC("maps") inline_result_h = { + .type = BPF_MAP_TYPE_HASH, + .key_size = sizeof(u32), + .value_size = sizeof(int), + .max_entries = 1, +}; + +/* map #4 */ /* Test case #0 */ +struct bpf_map_def SEC("maps") a_of_port_a = { + .type = BPF_MAP_TYPE_ARRAY_OF_MAPS, + .key_size = sizeof(u32), + .inner_map_idx = 0, /* map_fd[0] is port_a */ + .max_entries = MAX_NR_PORTS, +}; + +/* map #5 */ /* Test case #1 */ +struct bpf_map_def SEC("maps") h_of_port_a = { + .type = BPF_MAP_TYPE_HASH_OF_MAPS, + .key_size = sizeof(u32), + .inner_map_idx = 0, /* map_fd[0] is port_a */ + .max_entries = 1, +}; + +/* map #6 */ /* Test case #2 */ +struct bpf_map_def SEC("maps") h_of_port_h = { + .type = BPF_MAP_TYPE_HASH_OF_MAPS, + .key_size = sizeof(u32), + .inner_map_idx = 1, /* map_fd[1] is port_h */ + .max_entries = 1, +}; + +static __always_inline int do_reg_lookup(void *inner_map, u32 port) +{ + int *result; + + result = bpf_map_lookup_elem(inner_map, &port); + return result ? 
*result : -ENOENT; +} + +static __always_inline int do_inline_array_lookup(void *inner_map, u32 port) +{ + int *result; + + if (inner_map != &port_a) + return -EINVAL; + + result = bpf_map_lookup_elem(&port_a, &port); + return result ? *result : -ENOENT; +} + +static __always_inline int do_inline_hash_lookup(void *inner_map, u32 port) +{ + int *result; + + if (inner_map != &port_h) + return -EINVAL; + + result = bpf_map_lookup_elem(&port_h, &port); + return result ? *result : -ENOENT; +} + +SEC("kprobe/sys_connect") +int trace_sys_connect(struct pt_regs *ctx) +{ + struct sockaddr_in6 *in6; + u16 test_case, port, dst6[8]; + int addrlen, ret, inline_ret, ret_key = 0; + u32 port_key; + void *outer_map, *inner_map; + bool inline_hash = false; + + in6 = (struct sockaddr_in6 *)PT_REGS_PARM2(ctx); + addrlen = (int)PT_REGS_PARM3(ctx); + + if (addrlen != sizeof(*in6)) + return 0; + + ret = bpf_probe_read(dst6, sizeof(dst6), &in6->sin6_addr); + if (ret) { + inline_ret = ret; + goto done; + } + + if (dst6[0] != 0xdead || dst6[1] != 0xbeef) + return 0; + + test_case = dst6[7]; + + ret = bpf_probe_read(&port, sizeof(port), &in6->sin6_port); + if (ret) { + inline_ret = ret; + goto done; + } + + port_key = port; + + ret = -ENOENT; + if (test_case == 0) { + outer_map = &a_of_port_a; + } else if (test_case == 1) { + outer_map = &h_of_port_a; + } else if (test_case == 2) { + outer_map = &h_of_port_h; + } else { + ret = __LINE__; + inline_ret = ret; + goto done; + } + + inner_map = bpf_map_lookup_elem(outer_map, &port_key); + if (!inner_map) { + ret = __LINE__; + inline_ret = ret; + goto done; + } + + ret = do_reg_lookup(inner_map, port_key); + + if (test_case == 0 || test_case == 1) + inline_ret = do_inline_array_lookup(inner_map, port_key); + else + inline_ret = do_inline_hash_lookup(inner_map, port_key); + +done: + bpf_map_update_elem(&reg_result_h, &ret_key, &ret, BPF_ANY); + bpf_map_update_elem(&inline_result_h, &ret_key, &inline_ret, BPF_ANY); + + return 0; +} + +char _license[] 
SEC("license") = "GPL"; +u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/samples/bpf/test_map_in_map_user.c b/samples/bpf/test_map_in_map_user.c new file mode 100644 index 000000000000..f62fdc2bd428 --- /dev/null +++ b/samples/bpf/test_map_in_map_user.c @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2017 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include <sys/resource.h> +#include <sys/socket.h> +#include <arpa/inet.h> +#include <stdint.h> +#include <assert.h> +#include <errno.h> +#include <stdlib.h> +#include <stdio.h> +#include "libbpf.h" +#include "bpf_load.h" + +#define PORT_A (map_fd[0]) +#define PORT_H (map_fd[1]) +#define REG_RESULT_H (map_fd[2]) +#define INLINE_RESULT_H (map_fd[3]) +#define A_OF_PORT_A (map_fd[4]) /* Test case #0 */ +#define H_OF_PORT_A (map_fd[5]) /* Test case #1 */ +#define H_OF_PORT_H (map_fd[6]) /* Test case #2 */ + +static const char * const test_names[] = { + "Array of Array", + "Hash of Array", + "Hash of Hash", +}; + +#define NR_TESTS (sizeof(test_names) / sizeof(*test_names)) + +static void populate_map(uint32_t port_key, int magic_result) +{ + int ret; + + ret = bpf_map_update_elem(PORT_A, &port_key, &magic_result, BPF_ANY); + assert(!ret); + + ret = bpf_map_update_elem(PORT_H, &port_key, &magic_result, + BPF_NOEXIST); + assert(!ret); + + ret = bpf_map_update_elem(A_OF_PORT_A, &port_key, &PORT_A, BPF_ANY); + assert(!ret); + + ret = bpf_map_update_elem(H_OF_PORT_A, &port_key, &PORT_A, BPF_NOEXIST); + assert(!ret); + + ret = bpf_map_update_elem(H_OF_PORT_H, &port_key, &PORT_H, BPF_NOEXIST); + assert(!ret); +} + +static void test_map_in_map(void) +{ + struct sockaddr_in6 in6 = { .sin6_family = AF_INET6 }; + uint32_t result_key = 0, port_key; + int result, inline_result; + int magic_result = 0xfaceb00c; + int ret; + int i; + + port_key = rand() & 0x00FF; + populate_map(port_key, magic_result); + + in6.sin6_addr.s6_addr16[0] = 0xdead; + 
in6.sin6_addr.s6_addr16[1] = 0xbeef; + in6.sin6_port = port_key; + + for (i = 0; i < NR_TESTS; i++) { + printf("%s: ", test_names[i]); + + in6.sin6_addr.s6_addr16[7] = i; + ret = connect(-1, (struct sockaddr *)&in6, sizeof(in6)); + assert(ret == -1 && errno == EBADF); + + ret = bpf_map_lookup_elem(REG_RESULT_H, &result_key, &result); + assert(!ret); + + ret = bpf_map_lookup_elem(INLINE_RESULT_H, &result_key, + &inline_result); + assert(!ret); + + if (result != magic_result || inline_result != magic_result) { + printf("Error. result:%d inline_result:%d\n", + result, inline_result); + exit(1); + } + + bpf_map_delete_elem(REG_RESULT_H, &result_key); + bpf_map_delete_elem(INLINE_RESULT_H, &result_key); + + printf("Pass\n"); + } +} + +int main(int argc, char **argv) +{ + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + char filename[256]; + + assert(!setrlimit(RLIMIT_MEMLOCK, &r)); + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + + if (load_bpf_file(filename)) { + printf("%s", bpf_log_buf); + return 1; + } + + test_map_in_map(); + + return 0; +} diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 0539a0ceef38..ce6f029ac368 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -96,6 +96,8 @@ enum bpf_map_type { BPF_MAP_TYPE_LRU_HASH, BPF_MAP_TYPE_LRU_PERCPU_HASH, BPF_MAP_TYPE_LPM_TRIE, + BPF_MAP_TYPE_ARRAY_OF_MAPS, + BPF_MAP_TYPE_HASH_OF_MAPS, }; enum bpf_prog_type { @@ -152,6 +154,7 @@ union bpf_attr { __u32 value_size; /* size of value in bytes */ __u32 max_entries; /* max number of entries in a map */ __u32 map_flags; /* prealloc or not */ + __u32 inner_map_fd; /* fd pointing to the inner map */ }; struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 207c2eeddab0..9b58d20e8c93 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -69,6 +69,23 @@ int bpf_create_map(enum bpf_map_type map_type, int key_size, 
return sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
 }
 
+int bpf_create_map_in_map(enum bpf_map_type map_type, int key_size,
+			  int inner_map_fd, int max_entries, __u32 map_flags)
+{	/* BPF_MAP_CREATE wrapper that also passes inner_map_fd; returns the sys_bpf() result */
+	union bpf_attr attr;
+
+	memset(&attr, '\0', sizeof(attr));
+
+	attr.map_type = map_type;
+	attr.key_size = key_size;
+	attr.value_size = 4;	/* fixed, not caller-controlled; presumably the size of a map fd - confirm */
+	attr.inner_map_fd = inner_map_fd;
+	attr.max_entries = max_entries;
+	attr.map_flags = map_flags;
+
+	return sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
+}
+
 int bpf_load_program(enum bpf_prog_type type, const struct bpf_insn *insns,
 		     size_t insns_cnt, const char *license,
 		     __u32 kern_version, char *log_buf, size_t log_buf_sz)
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index 09c3dcac0496..93f021932623 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -26,6 +26,8 @@
 
 int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size,
 		   int max_entries, __u32 map_flags);
+int bpf_create_map_in_map(enum bpf_map_type map_type, int key_size,
+			  int inner_map_fd, int max_entries, __u32 map_flags);
 
 /* Recommend log buffer size */
 #define BPF_LOG_BUF_SIZE 65536
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index d1555e4240c0..f4f43c98cf7f 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -38,6 +38,7 @@
 
 #define MAX_INSNS 512
 #define MAX_FIXUPS 8
+#define MAX_NR_MAPS 4	/* size of the per-test map fd array in do_test_single() */
 
 struct bpf_test {
 	const char *descr;
@@ -45,6 +46,7 @@ struct bpf_test {
 	int fixup_map1[MAX_FIXUPS];
 	int fixup_map2[MAX_FIXUPS];
 	int fixup_prog[MAX_FIXUPS];
+	int fixup_map_in_map[MAX_FIXUPS];	/* insn indexes whose imm gets the outer map fd */
 	const char *errstr;
 	const char *errstr_unpriv;
 	enum {
@@ -4452,7 +4454,76 @@ static struct bpf_test tests[] = {
 		.errstr = "R0 min value is negative, either use unsigned index or do a if (index >=0) check.",
 		.result = REJECT,
 		.result_unpriv = REJECT,
-	}
+	},
+	{
+		"map in map access",
+		.insns = {
+			BPF_ST_MEM(0, BPF_REG_10, -4, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),	/* insn 3: imm patched via .fixup_map_in_map */
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 5),	/* R0 == 0: skip the inner lookup */
+			BPF_ST_MEM(0, BPF_REG_10, -4, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),	/* R1 = result of the outer lookup */
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_map_lookup_elem),
+			BPF_MOV64_IMM(BPF_REG_0, 0),	/* R0 = 0; MOV64_REG(R0, 0) would be r0 = r0 */
+			BPF_EXIT_INSN(),
+		},
+		.fixup_map_in_map = { 3 },
+		.result = ACCEPT,
+	},
+	{
+		"invalid inner map pointer",
+		.insns = {
+			BPF_ST_MEM(0, BPF_REG_10, -4, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),	/* insn 3: imm patched via .fixup_map_in_map */
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6),	/* R0 == 0: skip the inner lookup */
+			BPF_ST_MEM(0, BPF_REG_10, -4, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8),	/* arithmetic on the looked-up pointer: must be rejected */
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_map_lookup_elem),
+			BPF_MOV64_IMM(BPF_REG_0, 0),	/* R0 = 0; MOV64_REG(R0, 0) would be r0 = r0 */
+			BPF_EXIT_INSN(),
+		},
+		.fixup_map_in_map = { 3 },
+		.errstr = "R1 type=inv expected=map_ptr",
+		.errstr_unpriv = "R1 pointer arithmetic prohibited",
+		.result = REJECT,
+	},
+	{
+		"forgot null checking on the inner map pointer",
+		.insns = {
+			BPF_ST_MEM(0, BPF_REG_10, -4, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),	/* insn 3: imm patched via .fixup_map_in_map */
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_map_lookup_elem),
+			BPF_ST_MEM(0, BPF_REG_10, -4, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),	/* R0 passed on without a NULL check */
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_map_lookup_elem),
+			BPF_MOV64_IMM(BPF_REG_0, 0),	/* R0 = 0; MOV64_REG(R0, 0) would be r0 = r0 */
+			BPF_EXIT_INSN(),
+		},
+		.fixup_map_in_map = { 3 },
+		.errstr = "R1 type=map_value_or_null expected=map_ptr",
+		.result = REJECT,
+	},
 };
 
 static int probe_filter_length(const struct bpf_insn *fp)
@@ -4489,42 +4560,73 @@ static int create_prog_array(void)
 	return fd;
 }
 
+static int create_map_in_map(void)
+{	/* returns the outer (array-of-maps) fd, or a negative value on failure */
+	int inner_map_fd, outer_map_fd;
+
+	inner_map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(int),
+				      sizeof(int), 1, 0);
+	if (inner_map_fd < 0) {
+		printf("Failed to create array '%s'!\n", strerror(errno));
+		return inner_map_fd;
+	}
+
+	outer_map_fd = bpf_create_map_in_map(BPF_MAP_TYPE_ARRAY_OF_MAPS,
+					     sizeof(int), inner_map_fd, 1, 0);
+	if (outer_map_fd < 0)
+		printf("Failed to create array of maps '%s'!\n",
+		       strerror(errno));
+
+	close(inner_map_fd);	/* closed on both success and failure; only the outer fd is returned */
+
+	return outer_map_fd;
+}
+
 static char bpf_vlog[32768];
 
 static void do_test_fixup(struct bpf_test *test, struct bpf_insn *prog,
-			  int *fd_f1, int *fd_f2, int *fd_f3)
+			  int *map_fds)	/* [0]=map1, [1]=map2, [2]=prog array, [3]=map in map */
 {
 	int *fixup_map1 = test->fixup_map1;
 	int *fixup_map2 = test->fixup_map2;
 	int *fixup_prog = test->fixup_prog;
+	int *fixup_map_in_map = test->fixup_map_in_map;
 
 	/* Allocating HTs with 1 elem is fine here, since we only test
 	 * for verifier and not do a runtime lookup, so the only thing
 	 * that really matters is value size in this case.
 	 */
 	if (*fixup_map1) {
-		*fd_f1 = create_map(sizeof(long long), 1);
+		map_fds[0] = create_map(sizeof(long long), 1);
 		do {
-			prog[*fixup_map1].imm = *fd_f1;
+			prog[*fixup_map1].imm = map_fds[0];
 			fixup_map1++;
 		} while (*fixup_map1);
 	}
 
 	if (*fixup_map2) {
-		*fd_f2 = create_map(sizeof(struct test_val), 1);
+		map_fds[1] = create_map(sizeof(struct test_val), 1);
 		do {
-			prog[*fixup_map2].imm = *fd_f2;
+			prog[*fixup_map2].imm = map_fds[1];
 			fixup_map2++;
 		} while (*fixup_map2);
 	}
 
 	if (*fixup_prog) {
-		*fd_f3 = create_prog_array();
+		map_fds[2] = create_prog_array();
 		do {
-			prog[*fixup_prog].imm = *fd_f3;
+			prog[*fixup_prog].imm = map_fds[2];
 			fixup_prog++;
 		} while (*fixup_prog);
 	}
+
+	if (*fixup_map_in_map) {	/* a zero entry ends the list, as for the other fixups */
+		map_fds[3] = create_map_in_map();
+		do {
+			prog[*fixup_map_in_map].imm = map_fds[3];
+			fixup_map_in_map++;
+		} while (*fixup_map_in_map);
+	}
 }
 
 static void do_test_single(struct bpf_test *test, bool unpriv,
@@ -4533,11 +4635,15 @@ static void do_test_single(struct bpf_test *test, bool unpriv,
 	struct bpf_insn *prog = test->insns;
 	int prog_len = probe_filter_length(prog);
 	int prog_type = test->prog_type;
-	int fd_f1 = -1, fd_f2 = -1, fd_f3 = -1;
+	int map_fds[MAX_NR_MAPS];
 	int fd_prog, expected_ret;
 	const char *expected_err;
+	int i;
 
-	do_test_fixup(test, prog, &fd_f1, &fd_f2, &fd_f3);
+	for (i = 0; i < MAX_NR_MAPS; i++)
+		map_fds[i] = -1;	/* -1 marks slots that do_test_fixup() does not fill */
+
+	do_test_fixup(test, prog, map_fds);
 
 	fd_prog = bpf_load_program(prog_type ? : BPF_PROG_TYPE_SOCKET_FILTER,
 				   prog, prog_len, "GPL", 0, bpf_vlog,
@@ -4568,9 +4674,8 @@ static void do_test_single(struct bpf_test *test, bool unpriv,
 	printf("OK\n");
 close_fds:
 	close(fd_prog);
-	close(fd_f1);
-	close(fd_f2);
-	close(fd_f3);
+	for (i = 0; i < MAX_NR_MAPS; i++)
+		close(map_fds[i]);	/* also called for slots still holding -1 */
 	sched_yield();
 	return;
 fail_log: