From fad73a1a35ea61f13607a391aca669caad8c04ca Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 22 Mar 2017 10:00:32 -0700 Subject: [PATCH 1/4] bpf: Fix and simplifications on inline map lookup Fix in verifier: For the same bpf_map_lookup_elem() instruction (i.e. "call 1"), a broken case is "a different type of map could be used for the same lookup instruction". For example, an array in one case and a hashmap in another. We have to resort to the old dynamic call behavior in this case. The fix is to check for collision on insn_aux->map_ptr. If there is collision, don't inline the map lookup. Please see the "do_reg_lookup()" in test_map_in_map_kern.c in the later patch for how-to trigger the above case. Simplifications on array_map_gen_lookup(): 1. Calculate elem_size from map->value_size. It removes the need for 'struct bpf_array' which makes the later map-in-map implementation easier. 2. Remove the 'elem_size == 1' test Fixes: 81ed18ab3098 ("bpf: add helper inlining infra and optimize map_array lookup") Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/arraymap.c | 11 ++++------- kernel/bpf/verifier.c | 13 +++++++++++-- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index bcf9955fac95..4d7d5d0ed76a 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -117,20 +117,17 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key) /* emit BPF instructions equivalent to C code of array_map_lookup_elem() */ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { - struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_insn *insn = insn_buf; - u32 elem_size = array->elem_size; + u32 elem_size = round_up(map->value_size, 8); const int ret = BPF_REG_0; const int map_ptr = BPF_REG_1; const int index = BPF_REG_2; *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); - *insn++ = BPF_JMP_IMM(BPF_JGE, ret, array->map.max_entries, - elem_size == 1 ? 2 : 3); - if (elem_size == 1) { - /* nop */ - } else if (is_power_of_2(elem_size)) { + *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3); + + if (is_power_of_2(elem_size)) { *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size)); } else { *insn++ = BPF_ALU64_IMM(BPF_MUL, ret, elem_size); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 90bf46787603..9bf82267f2f9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -143,6 +143,8 @@ struct bpf_verifier_stack_elem { #define BPF_COMPLEXITY_LIMIT_INSNS 65536 #define BPF_COMPLEXITY_LIMIT_STACK 1024 +#define BPF_MAP_PTR_POISON ((void *)0xeB9F + POISON_POINTER_DELTA) + struct bpf_call_arg_meta { struct bpf_map *map_ptr; bool raw_mode; @@ -1357,6 +1359,8 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) } else if (fn->ret_type == RET_VOID) { regs[BPF_REG_0].type = NOT_INIT; } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) { + struct bpf_insn_aux_data *insn_aux; + regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; regs[BPF_REG_0].max_value = regs[BPF_REG_0].min_value = 0; /* remember map_ptr, so that check_map_access() @@ -1369,7 +1373,11 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) } regs[BPF_REG_0].map_ptr = meta.map_ptr; regs[BPF_REG_0].id = ++env->id_gen; - env->insn_aux_data[insn_idx].map_ptr = meta.map_ptr; + insn_aux = &env->insn_aux_data[insn_idx]; + if (!insn_aux->map_ptr) + insn_aux->map_ptr = meta.map_ptr; + else if (insn_aux->map_ptr != meta.map_ptr) + insn_aux->map_ptr = BPF_MAP_PTR_POISON; } else { verbose("unknown return type %d of func %s#%d\n", fn->ret_type, func_id_name(func_id), func_id); @@ -3307,7 +3315,8 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) if (ebpf_jit_enabled() && insn->imm == BPF_FUNC_map_lookup_elem) { map_ptr = env->insn_aux_data[i + delta].map_ptr; - if (!map_ptr->ops->map_gen_lookup) + if (map_ptr == BPF_MAP_PTR_POISON || + !map_ptr->ops->map_gen_lookup) goto patch_call_imm; cnt = map_ptr->ops->map_gen_lookup(map_ptr, insn_buf); From 56f668dfe00dcf086734f1c42ea999398fad6572 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 22 Mar 2017 10:00:33 -0700 Subject: [PATCH 2/4] bpf: Add array of maps support This patch adds a few helper funcs to enable map-in-map support (i.e. outer_map->inner_map). The first outer_map type BPF_MAP_TYPE_ARRAY_OF_MAPS is also added in this patch. The next patch will introduce a hash of maps type. Any bpf map type can be acted as an inner_map. The exception is BPF_MAP_TYPE_PROG_ARRAY because the extra level of indirection makes it harder to verify the owner_prog_type and owner_jited. Multi-level map-in-map is not supported (i.e. map->map is ok but not map->map->map). When adding an inner_map to an outer_map, it currently checks the map_type, key_size, value_size, map_flags, max_entries and ops. The verifier also uses those map's properties to do static analysis. map_flags is needed because we need to ensure BPF_PROG_TYPE_PERF_EVENT is using a preallocated hashtab for the inner_hash also. ops and max_entries are needed to generate inlined map-lookup instructions. For simplicity reason, a simple '==' test is used for both map_flags and max_entries. The equality of ops is implied by the equality of map_type. During outer_map creation time, an inner_map_fd is needed to create an outer_map. However, the inner_map_fd's life time does not depend on the outer_map. The inner_map_fd is merely used to initialize the inner_map_meta of the outer_map. Also, for the outer_map: * It allows element update and delete from syscall * It allows element lookup from bpf_prog The above is similar to the current fd_array pattern. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 2 + kernel/bpf/Makefile | 2 +- kernel/bpf/arraymap.c | 63 ++++++++++++++++++++++++++ kernel/bpf/map_in_map.c | 97 ++++++++++++++++++++++++++++++++++++++++ kernel/bpf/map_in_map.h | 23 ++++++++++ kernel/bpf/syscall.c | 7 ++- kernel/bpf/verifier.c | 42 +++++++++++++---- 8 files changed, 225 insertions(+), 12 deletions(-) create mode 100644 kernel/bpf/map_in_map.c create mode 100644 kernel/bpf/map_in_map.h diff --git a/include/linux/bpf.h b/include/linux/bpf.h index da8c64ca8dc9..3f3cdf9b15e8 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -50,6 +50,7 @@ struct bpf_map { const struct bpf_map_ops *ops; struct work_struct work; atomic_t usercnt; + struct bpf_map *inner_map_meta; }; struct bpf_map_type_list { diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0539a0ceef38..1701ec1e7de3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -96,6 +96,7 @@ enum bpf_map_type { BPF_MAP_TYPE_LRU_HASH, BPF_MAP_TYPE_LRU_PERCPU_HASH, BPF_MAP_TYPE_LPM_TRIE, + BPF_MAP_TYPE_ARRAY_OF_MAPS, }; enum bpf_prog_type { @@ -152,6 +153,7 @@ union bpf_attr { __u32 value_size; /* size of value in bytes */ __u32 max_entries; /* max number of entries in a map */ __u32 map_flags; /* prealloc or not */ + __u32 inner_map_fd; /* fd pointing to the inner map */ }; struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index e1ce4f4fd7fd..e1e5e658f2db 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,7 +1,7 @@ obj-y := core.o obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o -obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o +obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o endif diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 4d7d5d0ed76a..bc9da93db403 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -17,6 +17,8 @@ #include #include +#include "map_in_map.h" + static void bpf_array_free_percpu(struct bpf_array *array) { int i; @@ -602,3 +604,64 @@ static int __init register_cgroup_array_map(void) } late_initcall(register_cgroup_array_map); #endif + +static struct bpf_map *array_of_map_alloc(union bpf_attr *attr) +{ + struct bpf_map *map, *inner_map_meta; + + inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd); + if (IS_ERR(inner_map_meta)) + return inner_map_meta; + + map = fd_array_map_alloc(attr); + if (IS_ERR(map)) { + bpf_map_meta_free(inner_map_meta); + return map; + } + + map->inner_map_meta = inner_map_meta; + + return map; +} + +static void array_of_map_free(struct bpf_map *map) +{ + /* map->inner_map_meta is only accessed by syscall which + * is protected by fdget/fdput. + */ + bpf_map_meta_free(map->inner_map_meta); + bpf_fd_array_map_clear(map); + fd_array_map_free(map); +} + +static void *array_of_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_map **inner_map = array_map_lookup_elem(map, key); + + if (!inner_map) + return NULL; + + return READ_ONCE(*inner_map); +} + +static const struct bpf_map_ops array_of_map_ops = { + .map_alloc = array_of_map_alloc, + .map_free = array_of_map_free, + .map_get_next_key = array_map_get_next_key, + .map_lookup_elem = array_of_map_lookup_elem, + .map_delete_elem = fd_array_map_delete_elem, + .map_fd_get_ptr = bpf_map_fd_get_ptr, + .map_fd_put_ptr = bpf_map_fd_put_ptr, +}; + +static struct bpf_map_type_list array_of_map_type __ro_after_init = { + .ops = &array_of_map_ops, + .type = BPF_MAP_TYPE_ARRAY_OF_MAPS, +}; + +static int __init register_array_of_map(void) +{ + bpf_register_map_type(&array_of_map_type); + return 0; +} +late_initcall(register_array_of_map); diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c new file mode 100644 index 000000000000..59bcdf821ae4 --- /dev/null +++ b/kernel/bpf/map_in_map.c @@ -0,0 +1,97 @@ +/* Copyright (c) 2017 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include +#include + +#include "map_in_map.h" + +struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) +{ + struct bpf_map *inner_map, *inner_map_meta; + struct fd f; + + f = fdget(inner_map_ufd); + inner_map = __bpf_map_get(f); + if (IS_ERR(inner_map)) + return inner_map; + + /* prog_array->owner_prog_type and owner_jited + * is a runtime binding. Doing static check alone + * in the verifier is not enough. + */ + if (inner_map->map_type == BPF_MAP_TYPE_PROG_ARRAY) { + fdput(f); + return ERR_PTR(-ENOTSUPP); + } + + /* Does not support >1 level map-in-map */ + if (inner_map->inner_map_meta) { + fdput(f); + return ERR_PTR(-EINVAL); + } + + inner_map_meta = kzalloc(sizeof(*inner_map_meta), GFP_USER); + if (!inner_map_meta) { + fdput(f); + return ERR_PTR(-ENOMEM); + } + + inner_map_meta->map_type = inner_map->map_type; + inner_map_meta->key_size = inner_map->key_size; + inner_map_meta->value_size = inner_map->value_size; + inner_map_meta->map_flags = inner_map->map_flags; + inner_map_meta->ops = inner_map->ops; + inner_map_meta->max_entries = inner_map->max_entries; + + fdput(f); + return inner_map_meta; +} + +void bpf_map_meta_free(struct bpf_map *map_meta) +{ + kfree(map_meta); +} + +bool bpf_map_meta_equal(const struct bpf_map *meta0, + const struct bpf_map *meta1) +{ + /* No need to compare ops because it is covered by map_type */ + return meta0->map_type == meta1->map_type && + meta0->key_size == meta1->key_size && + meta0->value_size == meta1->value_size && + meta0->map_flags == meta1->map_flags && + meta0->max_entries == meta1->max_entries; +} + +void *bpf_map_fd_get_ptr(struct bpf_map *map, + struct file *map_file /* not used */, + int ufd) +{ + struct bpf_map *inner_map; + struct fd f; + + f = fdget(ufd); + inner_map = __bpf_map_get(f); + if (IS_ERR(inner_map)) + return inner_map; + + if (bpf_map_meta_equal(map->inner_map_meta, inner_map)) + inner_map = bpf_map_inc(inner_map, false); + else + inner_map = ERR_PTR(-EINVAL); + + fdput(f); + return inner_map; +} + +void bpf_map_fd_put_ptr(void *ptr) +{ + /* ptr->ops->map_free() has to go through one + * rcu grace period by itself. + */ + bpf_map_put(ptr); +} diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h new file mode 100644 index 000000000000..177fadb689dc --- /dev/null +++ b/kernel/bpf/map_in_map.h @@ -0,0 +1,23 @@ +/* Copyright (c) 2017 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#ifndef __MAP_IN_MAP_H__ +#define __MAP_IN_MAP_H__ + +#include + +struct file; +struct bpf_map; + +struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd); +void bpf_map_meta_free(struct bpf_map *map_meta); +bool bpf_map_meta_equal(const struct bpf_map *meta0, + const struct bpf_map *meta1); +void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file, + int ufd); +void bpf_map_fd_put_ptr(void *ptr); + +#endif diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 48c914b983bd..6e24fdf1f373 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -215,7 +215,7 @@ int bpf_map_new_fd(struct bpf_map *map) offsetof(union bpf_attr, CMD##_LAST_FIELD) - \ sizeof(attr->CMD##_LAST_FIELD)) != NULL -#define BPF_MAP_CREATE_LAST_FIELD map_flags +#define BPF_MAP_CREATE_LAST_FIELD inner_map_fd /* called via syscall */ static int map_create(union bpf_attr *attr) { @@ -352,6 +352,8 @@ static int map_lookup_elem(union bpf_attr *attr) err = bpf_percpu_array_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { err = bpf_stackmap_copy(map, key, value); + } else if (map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) { + err = -ENOTSUPP; } else { rcu_read_lock(); ptr = map->ops->map_lookup_elem(map, key); @@ -438,7 +440,8 @@ static int map_update_elem(union bpf_attr *attr) err = bpf_percpu_array_update(map, key, value, attr->flags); } else if (map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || map->map_type == BPF_MAP_TYPE_PROG_ARRAY || - map->map_type == BPF_MAP_TYPE_CGROUP_ARRAY) { + map->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || + map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) { rcu_read_lock(); err = bpf_fd_array_map_update_elem(map, f.file, key, value, attr->flags); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9bf82267f2f9..3b8f528c5473 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1199,6 +1199,9 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) func_id != BPF_FUNC_current_task_under_cgroup) goto error; break; + case BPF_MAP_TYPE_ARRAY_OF_MAPS: + if (func_id != BPF_FUNC_map_lookup_elem) + goto error; default: break; } @@ -2101,14 +2104,19 @@ static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id, struct bpf_reg_state *reg = ®s[regno]; if (reg->type == PTR_TO_MAP_VALUE_OR_NULL && reg->id == id) { - reg->type = type; + if (type == UNKNOWN_VALUE) { + __mark_reg_unknown_value(regs, regno); + } else if (reg->map_ptr->inner_map_meta) { + reg->type = CONST_PTR_TO_MAP; + reg->map_ptr = reg->map_ptr->inner_map_meta; + } else { + reg->type = type; + } /* We don't need id from this point onwards anymore, thus we * should better reset it, so that state pruning has chances * to take effect. */ reg->id = 0; - if (type == UNKNOWN_VALUE) - __mark_reg_unknown_value(regs, regno); } } @@ -3033,16 +3041,32 @@ process_bpf_exit: return 0; } +static int check_map_prealloc(struct bpf_map *map) +{ + return (map->map_type != BPF_MAP_TYPE_HASH && + map->map_type != BPF_MAP_TYPE_PERCPU_HASH) || + !(map->map_flags & BPF_F_NO_PREALLOC); +} + static int check_map_prog_compatibility(struct bpf_map *map, struct bpf_prog *prog) { - if (prog->type == BPF_PROG_TYPE_PERF_EVENT && - (map->map_type == BPF_MAP_TYPE_HASH || - map->map_type == BPF_MAP_TYPE_PERCPU_HASH) && - (map->map_flags & BPF_F_NO_PREALLOC)) { - verbose("perf_event programs can only use preallocated hash map\n"); - return -EINVAL; + /* Make sure that BPF_PROG_TYPE_PERF_EVENT programs only use + * preallocated hash maps, since doing memory allocation + * in overflow_handler can crash depending on where nmi got + * triggered. + */ + if (prog->type == BPF_PROG_TYPE_PERF_EVENT) { + if (!check_map_prealloc(map)) { + verbose("perf_event programs can only use preallocated hash map\n"); + return -EINVAL; + } + if (map->inner_map_meta && + !check_map_prealloc(map->inner_map_meta)) { + verbose("perf_event programs can only use preallocated inner hash map\n"); + return -EINVAL; + } } return 0; } From bcc6b1b7ebf857a9fe56202e2be3361131588c15 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 22 Mar 2017 10:00:34 -0700 Subject: [PATCH 3/4] bpf: Add hash of maps support This patch adds hash of maps support (hashmap->bpf_map). BPF_MAP_TYPE_HASH_OF_MAPS is added. A map-in-map contains a pointer to another map and lets call this pointer 'inner_map_ptr'. Notes on deleting inner_map_ptr from a hash map: 1. For BPF_F_NO_PREALLOC map-in-map, when deleting an inner_map_ptr, the htab_elem itself will go through a rcu grace period and the inner_map_ptr resides in the htab_elem. 2. For pre-allocated htab_elem (!BPF_F_NO_PREALLOC), when deleting an inner_map_ptr, the htab_elem may get reused immediately. This situation is similar to the existing prealloc-ated use cases. However, the bpf_map_fd_put_ptr() calls bpf_map_put() which calls inner_map->ops->map_free(inner_map) which will go through a rcu grace period (i.e. all bpf_map's map_free currently goes through a rcu grace period). Hence, the inner_map_ptr is still safe for the rcu reader side. This patch also includes BPF_MAP_TYPE_HASH_OF_MAPS to the check_map_prealloc() in the verifier. preallocation is a must for BPF_PROG_TYPE_PERF_EVENT. Hence, even we don't expect heavy updates to map-in-map, enforcing BPF_F_NO_PREALLOC for map-in-map is impossible without disallowing BPF_PROG_TYPE_PERF_EVENT from using map-in-map first. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/bpf.h | 2 + include/uapi/linux/bpf.h | 1 + kernel/bpf/hashtab.c | 121 +++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 8 ++- kernel/bpf/verifier.c | 4 +- 5 files changed, 134 insertions(+), 2 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 3f3cdf9b15e8..2ae39a3e9ead 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -277,6 +277,8 @@ int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value); int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, void *key, void *value, u64 map_flags); void bpf_fd_array_map_clear(struct bpf_map *map); +int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, + void *key, void *value, u64 map_flags); /* memcpy that is used with 8-byte aligned pointers, power-of-8 size and * forced to use 'long' read/writes to try to atomically copy long counters. diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1701ec1e7de3..ce6f029ac368 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -97,6 +97,7 @@ enum bpf_map_type { BPF_MAP_TYPE_LRU_PERCPU_HASH, BPF_MAP_TYPE_LPM_TRIE, BPF_MAP_TYPE_ARRAY_OF_MAPS, + BPF_MAP_TYPE_HASH_OF_MAPS, }; enum bpf_prog_type { diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 000153acb6d5..343fb5394c95 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -16,6 +16,7 @@ #include #include "percpu_freelist.h" #include "bpf_lru_list.h" +#include "map_in_map.h" struct bucket { struct hlist_nulls_head head; @@ -88,6 +89,11 @@ static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size return *(void __percpu **)(l->key + key_size); } +static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l) +{ + return *(void **)(l->key + roundup(map->key_size, 8)); +} + static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i) { return (struct htab_elem *) (htab->elems + i * htab->elem_size); @@ -603,6 +609,14 @@ static void htab_elem_free_rcu(struct rcu_head *head) static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) { + struct bpf_map *map = &htab->map; + + if (map->ops->map_fd_put_ptr) { + void *ptr = fd_htab_map_get_ptr(map, l); + + map->ops->map_fd_put_ptr(ptr); + } + if (l->state == HTAB_EXTRA_ELEM_USED) { l->state = HTAB_EXTRA_ELEM_FREE; return; @@ -1057,6 +1071,7 @@ static void delete_all_elements(struct bpf_htab *htab) } } } + /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ static void htab_map_free(struct bpf_map *map) { @@ -1213,12 +1228,118 @@ static struct bpf_map_type_list htab_lru_percpu_type __ro_after_init = { .type = BPF_MAP_TYPE_LRU_PERCPU_HASH, }; +static struct bpf_map *fd_htab_map_alloc(union bpf_attr *attr) +{ + struct bpf_map *map; + + if (attr->value_size != sizeof(u32)) + return ERR_PTR(-EINVAL); + + /* pointer is stored internally */ + attr->value_size = sizeof(void *); + map = htab_map_alloc(attr); + attr->value_size = sizeof(u32); + + return map; +} + +static void fd_htab_map_free(struct bpf_map *map) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct hlist_nulls_node *n; + struct hlist_nulls_head *head; + struct htab_elem *l; + int i; + + for (i = 0; i < htab->n_buckets; i++) { + head = select_bucket(htab, i); + + hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { + void *ptr = fd_htab_map_get_ptr(map, l); + + map->ops->map_fd_put_ptr(ptr); + } + } + + htab_map_free(map); +} + +/* only called from syscall */ +int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, + void *key, void *value, u64 map_flags) +{ + void *ptr; + int ret; + u32 ufd = *(u32 *)value; + + ptr = map->ops->map_fd_get_ptr(map, map_file, ufd); + if (IS_ERR(ptr)) + return PTR_ERR(ptr); + + ret = htab_map_update_elem(map, key, &ptr, map_flags); + if (ret) + map->ops->map_fd_put_ptr(ptr); + + return ret; +} + +static struct bpf_map *htab_of_map_alloc(union bpf_attr *attr) +{ + struct bpf_map *map, *inner_map_meta; + + inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd); + if (IS_ERR(inner_map_meta)) + return inner_map_meta; + + map = fd_htab_map_alloc(attr); + if (IS_ERR(map)) { + bpf_map_meta_free(inner_map_meta); + return map; + } + + map->inner_map_meta = inner_map_meta; + + return map; +} + +static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_map **inner_map = htab_map_lookup_elem(map, key); + + if (!inner_map) + return NULL; + + return READ_ONCE(*inner_map); +} + +static void htab_of_map_free(struct bpf_map *map) +{ + bpf_map_meta_free(map->inner_map_meta); + fd_htab_map_free(map); +} + +static const struct bpf_map_ops htab_of_map_ops = { + .map_alloc = htab_of_map_alloc, + .map_free = htab_of_map_free, + .map_get_next_key = htab_map_get_next_key, + .map_lookup_elem = htab_of_map_lookup_elem, + .map_delete_elem = htab_map_delete_elem, + .map_fd_get_ptr = bpf_map_fd_get_ptr, + .map_fd_put_ptr = bpf_map_fd_put_ptr, +}; + +static struct bpf_map_type_list htab_of_map_type __ro_after_init = { + .ops = &htab_of_map_ops, + .type = BPF_MAP_TYPE_HASH_OF_MAPS, +}; + static int __init register_htab_map(void) { bpf_register_map_type(&htab_type); bpf_register_map_type(&htab_percpu_type); bpf_register_map_type(&htab_lru_type); bpf_register_map_type(&htab_lru_percpu_type); + bpf_register_map_type(&htab_of_map_type); return 0; } late_initcall(register_htab_map); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 6e24fdf1f373..c35ebfe6d84d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -352,7 +352,8 @@ static int map_lookup_elem(union bpf_attr *attr) err = bpf_percpu_array_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { err = bpf_stackmap_copy(map, key, value); - } else if (map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) { + } else if (map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS || + map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { err = -ENOTSUPP; } else { rcu_read_lock(); @@ -446,6 +447,11 @@ static int map_update_elem(union bpf_attr *attr) err = bpf_fd_array_map_update_elem(map, f.file, key, value, attr->flags); rcu_read_unlock(); + } else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { + rcu_read_lock(); + err = bpf_fd_htab_map_update_elem(map, f.file, key, value, + attr->flags); + rcu_read_unlock(); } else { rcu_read_lock(); err = map->ops->map_update_elem(map, key, value, attr->flags); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3b8f528c5473..09923cc5c7c7 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1200,6 +1200,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) goto error; break; case BPF_MAP_TYPE_ARRAY_OF_MAPS: + case BPF_MAP_TYPE_HASH_OF_MAPS: if (func_id != BPF_FUNC_map_lookup_elem) goto error; default: @@ -3044,7 +3045,8 @@ process_bpf_exit: static int check_map_prealloc(struct bpf_map *map) { return (map->map_type != BPF_MAP_TYPE_HASH && - map->map_type != BPF_MAP_TYPE_PERCPU_HASH) || + map->map_type != BPF_MAP_TYPE_PERCPU_HASH && + map->map_type != BPF_MAP_TYPE_HASH_OF_MAPS) || !(map->map_flags & BPF_F_NO_PREALLOC); } From fb30d4b71214aa1811e997f8f753b14b46d5b912 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 22 Mar 2017 10:00:35 -0700 Subject: [PATCH 4/4] bpf: Add tests for map-in-map Test cases for array of maps and hash of maps. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- samples/bpf/Makefile | 4 + samples/bpf/bpf_helpers.h | 1 + samples/bpf/bpf_load.c | 22 ++- samples/bpf/test_map_in_map_kern.c | 173 ++++++++++++++++++++ samples/bpf/test_map_in_map_user.c | 116 +++++++++++++ tools/include/uapi/linux/bpf.h | 3 + tools/lib/bpf/bpf.c | 17 ++ tools/lib/bpf/bpf.h | 2 + tools/testing/selftests/bpf/test_verifier.c | 131 +++++++++++++-- 9 files changed, 451 insertions(+), 18 deletions(-) create mode 100644 samples/bpf/test_map_in_map_kern.c create mode 100644 samples/bpf/test_map_in_map_user.c diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 09e9d535bd74..91c1d616d975 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -34,6 +34,7 @@ hostprogs-y += sampleip hostprogs-y += tc_l2_redirect hostprogs-y += lwt_len_hist hostprogs-y += xdp_tx_iptunnel +hostprogs-y += test_map_in_map # Libbpf dependencies LIBBPF := ../../tools/lib/bpf/bpf.o @@ -72,6 +73,7 @@ sampleip-objs := bpf_load.o $(LIBBPF) sampleip_user.o tc_l2_redirect-objs := bpf_load.o $(LIBBPF) tc_l2_redirect_user.o lwt_len_hist-objs := bpf_load.o $(LIBBPF) lwt_len_hist_user.o xdp_tx_iptunnel-objs := bpf_load.o $(LIBBPF) xdp_tx_iptunnel_user.o +test_map_in_map-objs := bpf_load.o $(LIBBPF) test_map_in_map_user.o # Tell kbuild to always build the programs always := $(hostprogs-y) @@ -105,6 +107,7 @@ always += trace_event_kern.o always += sampleip_kern.o always += lwt_len_hist_kern.o always += xdp_tx_iptunnel_kern.o +always += test_map_in_map_kern.o HOSTCFLAGS += -I$(objtree)/usr/include HOSTCFLAGS += -I$(srctree)/tools/lib/ @@ -139,6 +142,7 @@ HOSTLOADLIBES_sampleip += -lelf HOSTLOADLIBES_tc_l2_redirect += -l elf HOSTLOADLIBES_lwt_len_hist += -l elf HOSTLOADLIBES_xdp_tx_iptunnel += -lelf +HOSTLOADLIBES_test_map_in_map += -lelf # Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline: # make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h index faaffe2e139a..52de9d88c021 100644 --- a/samples/bpf/bpf_helpers.h +++ b/samples/bpf/bpf_helpers.h @@ -80,6 +80,7 @@ struct bpf_map_def { unsigned int value_size; unsigned int max_entries; unsigned int map_flags; + unsigned int inner_map_idx; }; static int (*bpf_skb_load_bytes)(void *ctx, int off, void *to, int len) = diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c index b86ee54da2d1..dcdce1270d38 100644 --- a/samples/bpf/bpf_load.c +++ b/samples/bpf/bpf_load.c @@ -43,6 +43,7 @@ struct bpf_map_def { unsigned int value_size; unsigned int max_entries; unsigned int map_flags; + unsigned int inner_map_idx; }; static int populate_prog_array(const char *event, int prog_fd) @@ -198,11 +199,22 @@ static int load_maps(struct bpf_map_def *maps, int len) for (i = 0; i < len / sizeof(struct bpf_map_def); i++) { - map_fd[i] = bpf_create_map(maps[i].type, - maps[i].key_size, - maps[i].value_size, - maps[i].max_entries, - maps[i].map_flags); + if (maps[i].type == BPF_MAP_TYPE_ARRAY_OF_MAPS || + maps[i].type == BPF_MAP_TYPE_HASH_OF_MAPS) { + int inner_map_fd = map_fd[maps[i].inner_map_idx]; + + map_fd[i] = bpf_create_map_in_map(maps[i].type, + maps[i].key_size, + inner_map_fd, + maps[i].max_entries, + maps[i].map_flags); + } else { + map_fd[i] = bpf_create_map(maps[i].type, + maps[i].key_size, + maps[i].value_size, + maps[i].max_entries, + maps[i].map_flags); + } if (map_fd[i] < 0) { printf("failed to create a map: %d %s\n", errno, strerror(errno)); diff --git a/samples/bpf/test_map_in_map_kern.c b/samples/bpf/test_map_in_map_kern.c new file mode 100644 index 000000000000..42c44d091dd1 --- /dev/null +++ b/samples/bpf/test_map_in_map_kern.c @@ -0,0 +1,173 @@ +/* + * Copyright (c) 2017 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#define KBUILD_MODNAME "foo" +#include +#include +#include +#include +#include "bpf_helpers.h" + +#define MAX_NR_PORTS 65536 + +/* map #0 */ +struct bpf_map_def SEC("maps") port_a = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(int), + .max_entries = MAX_NR_PORTS, +}; + +/* map #1 */ +struct bpf_map_def SEC("maps") port_h = { + .type = BPF_MAP_TYPE_HASH, + .key_size = sizeof(u32), + .value_size = sizeof(int), + .max_entries = 1, +}; + +/* map #2 */ +struct bpf_map_def SEC("maps") reg_result_h = { + .type = BPF_MAP_TYPE_HASH, + .key_size = sizeof(u32), + .value_size = sizeof(int), + .max_entries = 1, +}; + +/* map #3 */ +struct bpf_map_def SEC("maps") inline_result_h = { + .type = BPF_MAP_TYPE_HASH, + .key_size = sizeof(u32), + .value_size = sizeof(int), + .max_entries = 1, +}; + +/* map #4 */ /* Test case #0 */ +struct bpf_map_def SEC("maps") a_of_port_a = { + .type = BPF_MAP_TYPE_ARRAY_OF_MAPS, + .key_size = sizeof(u32), + .inner_map_idx = 0, /* map_fd[0] is port_a */ + .max_entries = MAX_NR_PORTS, +}; + +/* map #5 */ /* Test case #1 */ +struct bpf_map_def SEC("maps") h_of_port_a = { + .type = BPF_MAP_TYPE_HASH_OF_MAPS, + .key_size = sizeof(u32), + .inner_map_idx = 0, /* map_fd[0] is port_a */ + .max_entries = 1, +}; + +/* map #6 */ /* Test case #2 */ +struct bpf_map_def SEC("maps") h_of_port_h = { + .type = BPF_MAP_TYPE_HASH_OF_MAPS, + .key_size = sizeof(u32), + .inner_map_idx = 1, /* map_fd[1] is port_h */ + .max_entries = 1, +}; + +static __always_inline int do_reg_lookup(void *inner_map, u32 port) +{ + int *result; + + result = bpf_map_lookup_elem(inner_map, &port); + return result ? *result : -ENOENT; +} + +static __always_inline int do_inline_array_lookup(void *inner_map, u32 port) +{ + int *result; + + if (inner_map != &port_a) + return -EINVAL; + + result = bpf_map_lookup_elem(&port_a, &port); + return result ? *result : -ENOENT; +} + +static __always_inline int do_inline_hash_lookup(void *inner_map, u32 port) +{ + int *result; + + if (inner_map != &port_h) + return -EINVAL; + + result = bpf_map_lookup_elem(&port_h, &port); + return result ? *result : -ENOENT; +} + +SEC("kprobe/sys_connect") +int trace_sys_connect(struct pt_regs *ctx) +{ + struct sockaddr_in6 *in6; + u16 test_case, port, dst6[8]; + int addrlen, ret, inline_ret, ret_key = 0; + u32 port_key; + void *outer_map, *inner_map; + bool inline_hash = false; + + in6 = (struct sockaddr_in6 *)PT_REGS_PARM2(ctx); + addrlen = (int)PT_REGS_PARM3(ctx); + + if (addrlen != sizeof(*in6)) + return 0; + + ret = bpf_probe_read(dst6, sizeof(dst6), &in6->sin6_addr); + if (ret) { + inline_ret = ret; + goto done; + } + + if (dst6[0] != 0xdead || dst6[1] != 0xbeef) + return 0; + + test_case = dst6[7]; + + ret = bpf_probe_read(&port, sizeof(port), &in6->sin6_port); + if (ret) { + inline_ret = ret; + goto done; + } + + port_key = port; + + ret = -ENOENT; + if (test_case == 0) { + outer_map = &a_of_port_a; + } else if (test_case == 1) { + outer_map = &h_of_port_a; + } else if (test_case == 2) { + outer_map = &h_of_port_h; + } else { + ret = __LINE__; + inline_ret = ret; + goto done; + } + + inner_map = bpf_map_lookup_elem(outer_map, &port_key); + if (!inner_map) { + ret = __LINE__; + inline_ret = ret; + goto done; + } + + ret = do_reg_lookup(inner_map, port_key); + + if (test_case == 0 || test_case == 1) + inline_ret = do_inline_array_lookup(inner_map, port_key); + else + inline_ret = do_inline_hash_lookup(inner_map, port_key); + +done: + bpf_map_update_elem(®_result_h, &ret_key, &ret, BPF_ANY); + bpf_map_update_elem(&inline_result_h, &ret_key, &inline_ret, BPF_ANY); + + return 0; +} + +char _license[] SEC("license") = "GPL"; +u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/samples/bpf/test_map_in_map_user.c b/samples/bpf/test_map_in_map_user.c new file mode 100644 index 000000000000..f62fdc2bd428 --- /dev/null +++ b/samples/bpf/test_map_in_map_user.c @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2017 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include "libbpf.h" +#include "bpf_load.h" + +#define PORT_A (map_fd[0]) +#define PORT_H (map_fd[1]) +#define REG_RESULT_H (map_fd[2]) +#define INLINE_RESULT_H (map_fd[3]) +#define A_OF_PORT_A (map_fd[4]) /* Test case #0 */ +#define H_OF_PORT_A (map_fd[5]) /* Test case #1 */ +#define H_OF_PORT_H (map_fd[6]) /* Test case #2 */ + +static const char * const test_names[] = { + "Array of Array", + "Hash of Array", + "Hash of Hash", +}; + +#define NR_TESTS (sizeof(test_names) / sizeof(*test_names)) + +static void populate_map(uint32_t port_key, int magic_result) +{ + int ret; + + ret = bpf_map_update_elem(PORT_A, &port_key, &magic_result, BPF_ANY); + assert(!ret); + + ret = bpf_map_update_elem(PORT_H, &port_key, &magic_result, + BPF_NOEXIST); + assert(!ret); + + ret = bpf_map_update_elem(A_OF_PORT_A, &port_key, &PORT_A, BPF_ANY); + assert(!ret); + + ret = bpf_map_update_elem(H_OF_PORT_A, &port_key, &PORT_A, BPF_NOEXIST); + assert(!ret); + + ret = bpf_map_update_elem(H_OF_PORT_H, &port_key, &PORT_H, BPF_NOEXIST); + assert(!ret); +} + +static void test_map_in_map(void) +{ + struct sockaddr_in6 in6 = { .sin6_family = AF_INET6 }; + uint32_t result_key = 0, port_key; + int result, inline_result; + int magic_result = 0xfaceb00c; + int ret; + int i; + + port_key = rand() & 0x00FF; + populate_map(port_key, magic_result); + + in6.sin6_addr.s6_addr16[0] = 0xdead; + in6.sin6_addr.s6_addr16[1] = 0xbeef; + in6.sin6_port = port_key; + + for (i = 0; i < NR_TESTS; i++) { + printf("%s: ", test_names[i]); + + in6.sin6_addr.s6_addr16[7] = i; + ret = connect(-1, (struct sockaddr *)&in6, sizeof(in6)); + assert(ret == -1 && errno == EBADF); + + ret = bpf_map_lookup_elem(REG_RESULT_H, &result_key, &result); + assert(!ret); + + ret = bpf_map_lookup_elem(INLINE_RESULT_H, &result_key, + &inline_result); + assert(!ret); + + if (result != magic_result || inline_result != magic_result) { + printf("Error. result:%d inline_result:%d\n", + result, inline_result); + exit(1); + } + + bpf_map_delete_elem(REG_RESULT_H, &result_key); + bpf_map_delete_elem(INLINE_RESULT_H, &result_key); + + printf("Pass\n"); + } +} + +int main(int argc, char **argv) +{ + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + char filename[256]; + + assert(!setrlimit(RLIMIT_MEMLOCK, &r)); + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + + if (load_bpf_file(filename)) { + printf("%s", bpf_log_buf); + return 1; + } + + test_map_in_map(); + + return 0; +} diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 0539a0ceef38..ce6f029ac368 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -96,6 +96,8 @@ enum bpf_map_type { BPF_MAP_TYPE_LRU_HASH, BPF_MAP_TYPE_LRU_PERCPU_HASH, BPF_MAP_TYPE_LPM_TRIE, + BPF_MAP_TYPE_ARRAY_OF_MAPS, + BPF_MAP_TYPE_HASH_OF_MAPS, }; enum bpf_prog_type { @@ -152,6 +154,7 @@ union bpf_attr { __u32 value_size; /* size of value in bytes */ __u32 max_entries; /* max number of entries in a map */ __u32 map_flags; /* prealloc or not */ + __u32 inner_map_fd; /* fd pointing to the inner map */ }; struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 207c2eeddab0..9b58d20e8c93 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -69,6 +69,23 @@ int bpf_create_map(enum bpf_map_type map_type, int key_size, return sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr)); } +int bpf_create_map_in_map(enum bpf_map_type map_type, int key_size, + int inner_map_fd, int max_entries, __u32 map_flags) +{ + union bpf_attr attr; + + memset(&attr, '\0', sizeof(attr)); + + attr.map_type = map_type; + attr.key_size = key_size; + attr.value_size = 4; + attr.inner_map_fd = inner_map_fd; + attr.max_entries = max_entries; + attr.map_flags = map_flags; + + return sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr)); +} + int bpf_load_program(enum bpf_prog_type type, const struct bpf_insn *insns, size_t insns_cnt, const char *license, __u32 kern_version, char *log_buf, size_t log_buf_sz) diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index 09c3dcac0496..93f021932623 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -26,6 +26,8 @@ int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size, int max_entries, __u32 map_flags); +int bpf_create_map_in_map(enum bpf_map_type map_type, int key_size, + int inner_map_fd, int max_entries, __u32 map_flags); /* Recommend log buffer size */ #define BPF_LOG_BUF_SIZE 65536 diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index d1555e4240c0..f4f43c98cf7f 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -38,6 +38,7 @@ #define MAX_INSNS 512 #define MAX_FIXUPS 8 +#define MAX_NR_MAPS 4 struct bpf_test { const char *descr; @@ -45,6 +46,7 @@ struct bpf_test { int fixup_map1[MAX_FIXUPS]; int fixup_map2[MAX_FIXUPS]; int fixup_prog[MAX_FIXUPS]; + int fixup_map_in_map[MAX_FIXUPS]; const char *errstr; const char *errstr_unpriv; enum { @@ -4452,7 +4454,76 @@ static struct bpf_test tests[] = { .errstr = "R0 min value is negative, either use unsigned index or do a if (index >=0) check.", .result = REJECT, .result_unpriv = REJECT, - } + }, + { + "map in map access", + .insns = { + BPF_ST_MEM(0, BPF_REG_10, -4, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 5), + BPF_ST_MEM(0, BPF_REG_10, -4, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_MOV64_REG(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_in_map = { 3 }, + .result = ACCEPT, + }, + { + "invalid inner map pointer", + .insns = { + BPF_ST_MEM(0, BPF_REG_10, -4, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), + BPF_ST_MEM(0, BPF_REG_10, -4, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_MOV64_REG(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_in_map = { 3 }, + .errstr = "R1 type=inv expected=map_ptr", + .errstr_unpriv = "R1 pointer arithmetic prohibited", + .result = REJECT, + }, + { + "forgot null checking on the inner map pointer", + .insns = { + BPF_ST_MEM(0, BPF_REG_10, -4, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_ST_MEM(0, BPF_REG_10, -4, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_MOV64_REG(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_in_map = { 3 }, + .errstr = "R1 type=map_value_or_null expected=map_ptr", + .result = REJECT, + }, }; static int probe_filter_length(const struct bpf_insn *fp) @@ -4489,42 +4560,73 @@ static int create_prog_array(void) return fd; } +static int create_map_in_map(void) +{ + int inner_map_fd, outer_map_fd; + + inner_map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(int), + sizeof(int), 1, 0); + if (inner_map_fd < 0) { + printf("Failed to create array '%s'!\n", strerror(errno)); + return inner_map_fd; + } + + outer_map_fd = bpf_create_map_in_map(BPF_MAP_TYPE_ARRAY_OF_MAPS, + sizeof(int), inner_map_fd, 1, 0); + if (outer_map_fd < 0) + printf("Failed to create array of maps '%s'!\n", + strerror(errno)); + + close(inner_map_fd); + + return outer_map_fd; +} + static char bpf_vlog[32768]; static void do_test_fixup(struct bpf_test *test, struct bpf_insn *prog, - int *fd_f1, int *fd_f2, int *fd_f3) + int *map_fds) { int *fixup_map1 = test->fixup_map1; int *fixup_map2 = test->fixup_map2; int *fixup_prog = test->fixup_prog; + int *fixup_map_in_map = test->fixup_map_in_map; /* Allocating HTs with 1 elem is fine here, since we only test * for verifier and not do a runtime lookup, so the only thing * that really matters is value size in this case. */ if (*fixup_map1) { - *fd_f1 = create_map(sizeof(long long), 1); + map_fds[0] = create_map(sizeof(long long), 1); do { - prog[*fixup_map1].imm = *fd_f1; + prog[*fixup_map1].imm = map_fds[0]; fixup_map1++; } while (*fixup_map1); } if (*fixup_map2) { - *fd_f2 = create_map(sizeof(struct test_val), 1); + map_fds[1] = create_map(sizeof(struct test_val), 1); do { - prog[*fixup_map2].imm = *fd_f2; + prog[*fixup_map2].imm = map_fds[1]; fixup_map2++; } while (*fixup_map2); } if (*fixup_prog) { - *fd_f3 = create_prog_array(); + map_fds[2] = create_prog_array(); do { - prog[*fixup_prog].imm = *fd_f3; + prog[*fixup_prog].imm = map_fds[2]; fixup_prog++; } while (*fixup_prog); } + + if (*fixup_map_in_map) { + map_fds[3] = create_map_in_map(); + do { + prog[*fixup_map_in_map].imm = map_fds[3]; + fixup_map_in_map++; + } while (*fixup_map_in_map); + } } static void do_test_single(struct bpf_test *test, bool unpriv, @@ -4533,11 +4635,15 @@ static void do_test_single(struct bpf_test *test, bool unpriv, struct bpf_insn *prog = test->insns; int prog_len = probe_filter_length(prog); int prog_type = test->prog_type; - int fd_f1 = -1, fd_f2 = -1, fd_f3 = -1; + int map_fds[MAX_NR_MAPS]; int fd_prog, expected_ret; const char *expected_err; + int i; - do_test_fixup(test, prog, &fd_f1, &fd_f2, &fd_f3); + for (i = 0; i < MAX_NR_MAPS; i++) + map_fds[i] = -1; + + do_test_fixup(test, prog, map_fds); fd_prog = bpf_load_program(prog_type ? : BPF_PROG_TYPE_SOCKET_FILTER, prog, prog_len, "GPL", 0, bpf_vlog, @@ -4568,9 +4674,8 @@ static void do_test_single(struct bpf_test *test, bool unpriv, printf("OK\n"); close_fds: close(fd_prog); - close(fd_f1); - close(fd_f2); - close(fd_f3); + for (i = 0; i < MAX_NR_MAPS; i++) + close(map_fds[i]); sched_yield(); return; fail_log: