Merge branch 'bpf-map-in-map'

Martin KaFai Lau says:

====================
bpf: Add map-in-map support

This patchset adds map-in-map support (map->map).
One use case is the (vips -> webservers) mapping in an L4 load balancer, so
that different vips can be backed by different sets of webservers.

Please refer to the individual commit log for details.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
Committed by David S. Miller on 2017-03-22 15:45:46 -07:00
commit 9bdf64d511
18 changed files with 823 additions and 39 deletions
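
For orientation, a minimal user-space sketch of the pattern this enables
(an editor's illustration, not part of the commit; names and sizes are
hypothetical, and it assumes the bpf_create_map()/bpf_create_map_in_map()
wrappers from tools/lib/bpf further down in this merge):

    #include <linux/bpf.h>
    #include "bpf.h"    /* tools/lib/bpf wrappers */

    int setup_vip_table(void)
    {
        __u32 vip = 1;
        int inner_fd, outer_fd;

        /* inner map: the webserver set backing one vip */
        inner_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(__u32),
                                  sizeof(__u32), 16, 0);
        if (inner_fd < 0)
            return inner_fd;

        /* outer map: vip -> inner map, templated on inner_fd */
        outer_fd = bpf_create_map_in_map(BPF_MAP_TYPE_HASH_OF_MAPS,
                                         sizeof(__u32), inner_fd, 256, 0);
        if (outer_fd < 0)
            return outer_fd;

        /* from the syscall side, the outer map's "value" is an inner map fd */
        return bpf_map_update_elem(outer_fd, &vip, &inner_fd, BPF_ANY);
    }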

include/linux/bpf.h

@@ -50,6 +50,7 @@ struct bpf_map {
const struct bpf_map_ops *ops;
struct work_struct work;
atomic_t usercnt;
struct bpf_map *inner_map_meta;
};
struct bpf_map_type_list {
@@ -276,6 +277,8 @@ int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value);
int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
void *key, void *value, u64 map_flags);
void bpf_fd_array_map_clear(struct bpf_map *map);
int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file,
void *key, void *value, u64 map_flags);
/* memcpy that is used with 8-byte aligned pointers, power-of-8 size and
* forced to use 'long' read/writes to try to atomically copy long counters.

include/uapi/linux/bpf.h

@@ -96,6 +96,8 @@ enum bpf_map_type {
BPF_MAP_TYPE_LRU_HASH,
BPF_MAP_TYPE_LRU_PERCPU_HASH,
BPF_MAP_TYPE_LPM_TRIE,
BPF_MAP_TYPE_ARRAY_OF_MAPS,
BPF_MAP_TYPE_HASH_OF_MAPS,
};
enum bpf_prog_type {
@@ -152,6 +154,7 @@ union bpf_attr {
__u32 value_size; /* size of value in bytes */
__u32 max_entries; /* max number of entries in a map */
__u32 map_flags; /* prealloc or not */
__u32 inner_map_fd; /* fd pointing to the inner map */
};
struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
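
At the syscall level the new field rides in the same BPF_MAP_CREATE
attribute. Per the kernel side of this merge, an *_OF_MAPS map must be
created with value_size == 4 (user space reads and writes map fds), and
inner_map_fd names a template map whose type, key/value size, flags and
max_entries every later inner map must match. A raw-syscall sketch (an
editor's illustration; create_outer_map() is hypothetical):

    #include <string.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/bpf.h>

    static int create_outer_map(int inner_map_fd)
    {
        union bpf_attr attr;

        memset(&attr, 0, sizeof(attr));
        attr.map_type     = BPF_MAP_TYPE_ARRAY_OF_MAPS;
        attr.key_size     = sizeof(__u32);
        attr.value_size   = sizeof(__u32);  /* must be 4: values are map fds */
        attr.max_entries  = 8;
        attr.inner_map_fd = inner_map_fd;   /* template for allowed inner maps */

        return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
    }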

kernel/bpf/Makefile

@@ -1,7 +1,7 @@
obj-y := core.o
obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o
obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o
obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
ifeq ($(CONFIG_PERF_EVENTS),y)
obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
endif

kernel/bpf/arraymap.c

@@ -17,6 +17,8 @@
#include <linux/filter.h>
#include <linux/perf_event.h>
#include "map_in_map.h"
static void bpf_array_free_percpu(struct bpf_array *array)
{
int i;
@@ -117,20 +119,17 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key)
/* emit BPF instructions equivalent to C code of array_map_lookup_elem() */
static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
struct bpf_insn *insn = insn_buf;
u32 elem_size = array->elem_size;
u32 elem_size = round_up(map->value_size, 8);
const int ret = BPF_REG_0;
const int map_ptr = BPF_REG_1;
const int index = BPF_REG_2;
*insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
*insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
*insn++ = BPF_JMP_IMM(BPF_JGE, ret, array->map.max_entries,
elem_size == 1 ? 2 : 3);
if (elem_size == 1) {
/* nop */
} else if (is_power_of_2(elem_size)) {
*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3);
if (is_power_of_2(elem_size)) {
*insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size));
} else {
*insn++ = BPF_ALU64_IMM(BPF_MUL, ret, elem_size);
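
Worked example of the new arithmetic (editor's aside): array elements are
stored 8-byte aligned, so the inlined lookup derives
elem_size = round_up(map->value_size, 8) instead of dereferencing
struct bpf_array, and since the rounded size can never be 1, the old
elem_size == 1 special case disappears. A standalone illustration:

    #include <stdint.h>
    #include <stdio.h>

    /* offset of element 'index', as the emitted instructions compute it */
    static uint64_t elem_offset(uint32_t value_size, uint32_t index)
    {
        uint32_t elem_size = (value_size + 7) & ~7u; /* round_up(x, 8) */

        /* power-of-2 elem_size becomes BPF_LSH by ilog2(elem_size),
         * anything else a BPF_MUL */
        return (uint64_t)index * elem_size;
    }

    int main(void)
    {
        printf("%llu\n", (unsigned long long)elem_offset(6, 5));  /* 5*8  = 40  */
        printf("%llu\n", (unsigned long long)elem_offset(20, 5)); /* 5*24 = 120 */
        return 0;
    }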
@@ -605,3 +604,64 @@ static int __init register_cgroup_array_map(void)
}
late_initcall(register_cgroup_array_map);
#endif
static struct bpf_map *array_of_map_alloc(union bpf_attr *attr)
{
struct bpf_map *map, *inner_map_meta;
inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd);
if (IS_ERR(inner_map_meta))
return inner_map_meta;
map = fd_array_map_alloc(attr);
if (IS_ERR(map)) {
bpf_map_meta_free(inner_map_meta);
return map;
}
map->inner_map_meta = inner_map_meta;
return map;
}
static void array_of_map_free(struct bpf_map *map)
{
/* map->inner_map_meta is only accessed by syscall which
* is protected by fdget/fdput.
*/
bpf_map_meta_free(map->inner_map_meta);
bpf_fd_array_map_clear(map);
fd_array_map_free(map);
}
static void *array_of_map_lookup_elem(struct bpf_map *map, void *key)
{
struct bpf_map **inner_map = array_map_lookup_elem(map, key);
if (!inner_map)
return NULL;
return READ_ONCE(*inner_map);
}
static const struct bpf_map_ops array_of_map_ops = {
.map_alloc = array_of_map_alloc,
.map_free = array_of_map_free,
.map_get_next_key = array_map_get_next_key,
.map_lookup_elem = array_of_map_lookup_elem,
.map_delete_elem = fd_array_map_delete_elem,
.map_fd_get_ptr = bpf_map_fd_get_ptr,
.map_fd_put_ptr = bpf_map_fd_put_ptr,
};
static struct bpf_map_type_list array_of_map_type __ro_after_init = {
.ops = &array_of_map_ops,
.type = BPF_MAP_TYPE_ARRAY_OF_MAPS,
};
static int __init register_array_of_map(void)
{
bpf_register_map_type(&array_of_map_type);
return 0;
}
late_initcall(register_array_of_map);

kernel/bpf/hashtab.c

@@ -16,6 +16,7 @@
#include <linux/rculist_nulls.h>
#include "percpu_freelist.h"
#include "bpf_lru_list.h"
#include "map_in_map.h"
struct bucket {
struct hlist_nulls_head head;
@@ -88,6 +89,11 @@ static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size
return *(void __percpu **)(l->key + key_size);
}
static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l)
{
return *(void **)(l->key + roundup(map->key_size, 8));
}
static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i)
{
return (struct htab_elem *) (htab->elems + i * htab->elem_size);
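
For reference, an editor's sketch of the element layout that
fd_htab_map_get_ptr() walks, assuming a hypothetical key_size of 6:

    /*
     * l->key + 0 : 6 key bytes
     * l->key + 6 : 2 bytes of padding      (roundup(6, 8) == 8)
     * l->key + 8 : struct bpf_map *        (the stored inner map pointer)
     */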
@@ -603,6 +609,14 @@ static void htab_elem_free_rcu(struct rcu_head *head)
static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
{
struct bpf_map *map = &htab->map;
if (map->ops->map_fd_put_ptr) {
void *ptr = fd_htab_map_get_ptr(map, l);
map->ops->map_fd_put_ptr(ptr);
}
if (l->state == HTAB_EXTRA_ELEM_USED) {
l->state = HTAB_EXTRA_ELEM_FREE;
return;
@@ -1057,6 +1071,7 @@ static void delete_all_elements(struct bpf_htab *htab)
}
}
}
/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void htab_map_free(struct bpf_map *map)
{
@@ -1213,12 +1228,118 @@ static struct bpf_map_type_list htab_lru_percpu_type __ro_after_init = {
.type = BPF_MAP_TYPE_LRU_PERCPU_HASH,
};
static struct bpf_map *fd_htab_map_alloc(union bpf_attr *attr)
{
struct bpf_map *map;
if (attr->value_size != sizeof(u32))
return ERR_PTR(-EINVAL);
/* pointer is stored internally */
attr->value_size = sizeof(void *);
map = htab_map_alloc(attr);
attr->value_size = sizeof(u32);
return map;
}
static void fd_htab_map_free(struct bpf_map *map)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct hlist_nulls_node *n;
struct hlist_nulls_head *head;
struct htab_elem *l;
int i;
for (i = 0; i < htab->n_buckets; i++) {
head = select_bucket(htab, i);
hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
void *ptr = fd_htab_map_get_ptr(map, l);
map->ops->map_fd_put_ptr(ptr);
}
}
htab_map_free(map);
}
/* only called from syscall */
int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file,
void *key, void *value, u64 map_flags)
{
void *ptr;
int ret;
u32 ufd = *(u32 *)value;
ptr = map->ops->map_fd_get_ptr(map, map_file, ufd);
if (IS_ERR(ptr))
return PTR_ERR(ptr);
ret = htab_map_update_elem(map, key, &ptr, map_flags);
if (ret)
map->ops->map_fd_put_ptr(ptr);
return ret;
}
static struct bpf_map *htab_of_map_alloc(union bpf_attr *attr)
{
struct bpf_map *map, *inner_map_meta;
inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd);
if (IS_ERR(inner_map_meta))
return inner_map_meta;
map = fd_htab_map_alloc(attr);
if (IS_ERR(map)) {
bpf_map_meta_free(inner_map_meta);
return map;
}
map->inner_map_meta = inner_map_meta;
return map;
}
static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key)
{
struct bpf_map **inner_map = htab_map_lookup_elem(map, key);
if (!inner_map)
return NULL;
return READ_ONCE(*inner_map);
}
static void htab_of_map_free(struct bpf_map *map)
{
bpf_map_meta_free(map->inner_map_meta);
fd_htab_map_free(map);
}
static const struct bpf_map_ops htab_of_map_ops = {
.map_alloc = htab_of_map_alloc,
.map_free = htab_of_map_free,
.map_get_next_key = htab_map_get_next_key,
.map_lookup_elem = htab_of_map_lookup_elem,
.map_delete_elem = htab_map_delete_elem,
.map_fd_get_ptr = bpf_map_fd_get_ptr,
.map_fd_put_ptr = bpf_map_fd_put_ptr,
};
static struct bpf_map_type_list htab_of_map_type __ro_after_init = {
.ops = &htab_of_map_ops,
.type = BPF_MAP_TYPE_HASH_OF_MAPS,
};
static int __init register_htab_map(void)
{
bpf_register_map_type(&htab_type);
bpf_register_map_type(&htab_percpu_type);
bpf_register_map_type(&htab_lru_type);
bpf_register_map_type(&htab_lru_percpu_type);
bpf_register_map_type(&htab_of_map_type);
return 0;
}
late_initcall(register_htab_map);
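
Editor's note on the value_size dance in fd_htab_map_alloc() above: user
space creates and updates the map with 4-byte values (map fds), but each
element internally stores a struct bpf_map pointer, so attr->value_size is
widened to sizeof(void *) only for the htab_map_alloc() call and then
restored. The two views, sketched (outer_fd/inner_fd are hypothetical):

    /* user-space view: the value is a 4-byte fd ... */
    __u32 fd_value = inner_fd;
    bpf_map_update_elem(outer_fd, &key, &fd_value, BPF_ANY);

    /* ... kernel view: bpf_fd_htab_map_update_elem() resolves the fd via
     * map_fd_get_ptr() and stores the resulting pointer in the element */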

kernel/bpf/map_in_map.c (new file, 97 lines)

@@ -0,0 +1,97 @@
/* Copyright (c) 2017 Facebook
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
* License as published by the Free Software Foundation.
*/
#include <linux/slab.h>
#include <linux/bpf.h>
#include "map_in_map.h"
struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
{
struct bpf_map *inner_map, *inner_map_meta;
struct fd f;
f = fdget(inner_map_ufd);
inner_map = __bpf_map_get(f);
if (IS_ERR(inner_map))
return inner_map;
/* prog_array->owner_prog_type and owner_jited
* is a runtime binding. Doing static check alone
* in the verifier is not enough.
*/
if (inner_map->map_type == BPF_MAP_TYPE_PROG_ARRAY) {
fdput(f);
return ERR_PTR(-ENOTSUPP);
}
/* Does not support >1 level map-in-map */
if (inner_map->inner_map_meta) {
fdput(f);
return ERR_PTR(-EINVAL);
}
inner_map_meta = kzalloc(sizeof(*inner_map_meta), GFP_USER);
if (!inner_map_meta) {
fdput(f);
return ERR_PTR(-ENOMEM);
}
inner_map_meta->map_type = inner_map->map_type;
inner_map_meta->key_size = inner_map->key_size;
inner_map_meta->value_size = inner_map->value_size;
inner_map_meta->map_flags = inner_map->map_flags;
inner_map_meta->ops = inner_map->ops;
inner_map_meta->max_entries = inner_map->max_entries;
fdput(f);
return inner_map_meta;
}
void bpf_map_meta_free(struct bpf_map *map_meta)
{
kfree(map_meta);
}
bool bpf_map_meta_equal(const struct bpf_map *meta0,
const struct bpf_map *meta1)
{
/* No need to compare ops because it is covered by map_type */
return meta0->map_type == meta1->map_type &&
meta0->key_size == meta1->key_size &&
meta0->value_size == meta1->value_size &&
meta0->map_flags == meta1->map_flags &&
meta0->max_entries == meta1->max_entries;
}
void *bpf_map_fd_get_ptr(struct bpf_map *map,
struct file *map_file /* not used */,
int ufd)
{
struct bpf_map *inner_map;
struct fd f;
f = fdget(ufd);
inner_map = __bpf_map_get(f);
if (IS_ERR(inner_map))
return inner_map;
if (bpf_map_meta_equal(map->inner_map_meta, inner_map))
inner_map = bpf_map_inc(inner_map, false);
else
inner_map = ERR_PTR(-EINVAL);
fdput(f);
return inner_map;
}
void bpf_map_fd_put_ptr(void *ptr)
{
/* ptr->ops->map_free() has to go through one
* rcu grace period by itself.
*/
bpf_map_put(ptr);
}

kernel/bpf/map_in_map.h (new file, 23 lines)

@@ -0,0 +1,23 @@
/* Copyright (c) 2017 Facebook
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
* License as published by the Free Software Foundation.
*/
#ifndef __MAP_IN_MAP_H__
#define __MAP_IN_MAP_H__
#include <linux/types.h>
struct file;
struct bpf_map;
struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd);
void bpf_map_meta_free(struct bpf_map *map_meta);
bool bpf_map_meta_equal(const struct bpf_map *meta0,
const struct bpf_map *meta1);
void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file,
int ufd);
void bpf_map_fd_put_ptr(void *ptr);
#endif

kernel/bpf/syscall.c

@@ -215,7 +215,7 @@ int bpf_map_new_fd(struct bpf_map *map)
offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
sizeof(attr->CMD##_LAST_FIELD)) != NULL
#define BPF_MAP_CREATE_LAST_FIELD map_flags
#define BPF_MAP_CREATE_LAST_FIELD inner_map_fd
/* called via syscall */
static int map_create(union bpf_attr *attr)
{
@@ -352,6 +352,9 @@ static int map_lookup_elem(union bpf_attr *attr)
err = bpf_percpu_array_copy(map, key, value);
} else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
err = bpf_stackmap_copy(map, key, value);
} else if (map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS ||
map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
err = -ENOTSUPP;
} else {
rcu_read_lock();
ptr = map->ops->map_lookup_elem(map, key);
@@ -438,11 +441,17 @@ static int map_update_elem(union bpf_attr *attr)
err = bpf_percpu_array_update(map, key, value, attr->flags);
} else if (map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY ||
map->map_type == BPF_MAP_TYPE_PROG_ARRAY ||
map->map_type == BPF_MAP_TYPE_CGROUP_ARRAY) {
map->map_type == BPF_MAP_TYPE_CGROUP_ARRAY ||
map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) {
rcu_read_lock();
err = bpf_fd_array_map_update_elem(map, f.file, key, value,
attr->flags);
rcu_read_unlock();
} else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
rcu_read_lock();
err = bpf_fd_htab_map_update_elem(map, f.file, key, value,
attr->flags);
rcu_read_unlock();
} else {
rcu_read_lock();
err = map->ops->map_update_elem(map, key, value, attr->flags);
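
Note the asymmetry these two hunks create (editor's note): user space can
write an outer-map element (the value being an inner map's fd, routed
through the fd-based helpers above), but reading one back is refused; only
BPF programs follow the stored pointer. A sketch:

    __u32 key = 0, out;
    int err;

    /* fails with -ENOTSUPP for ARRAY_OF_MAPS / HASH_OF_MAPS maps */
    err = bpf_map_lookup_elem(outer_fd, &key, &out);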

kernel/bpf/verifier.c

@@ -143,6 +143,8 @@ struct bpf_verifier_stack_elem {
#define BPF_COMPLEXITY_LIMIT_INSNS 65536
#define BPF_COMPLEXITY_LIMIT_STACK 1024
#define BPF_MAP_PTR_POISON ((void *)0xeB9F + POISON_POINTER_DELTA)
struct bpf_call_arg_meta {
struct bpf_map *map_ptr;
bool raw_mode;
@@ -1197,6 +1199,10 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
func_id != BPF_FUNC_current_task_under_cgroup)
goto error;
break;
case BPF_MAP_TYPE_ARRAY_OF_MAPS:
case BPF_MAP_TYPE_HASH_OF_MAPS:
if (func_id != BPF_FUNC_map_lookup_elem)
goto error;
default:
break;
}
@@ -1357,6 +1363,8 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
} else if (fn->ret_type == RET_VOID) {
regs[BPF_REG_0].type = NOT_INIT;
} else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) {
struct bpf_insn_aux_data *insn_aux;
regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
regs[BPF_REG_0].max_value = regs[BPF_REG_0].min_value = 0;
/* remember map_ptr, so that check_map_access()
@@ -1369,7 +1377,11 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
}
regs[BPF_REG_0].map_ptr = meta.map_ptr;
regs[BPF_REG_0].id = ++env->id_gen;
env->insn_aux_data[insn_idx].map_ptr = meta.map_ptr;
insn_aux = &env->insn_aux_data[insn_idx];
if (!insn_aux->map_ptr)
insn_aux->map_ptr = meta.map_ptr;
else if (insn_aux->map_ptr != meta.map_ptr)
insn_aux->map_ptr = BPF_MAP_PTR_POISON;
} else {
verbose("unknown return type %d of func %s#%d\n",
fn->ret_type, func_id_name(func_id), func_id);
@@ -2093,14 +2105,19 @@ static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id,
struct bpf_reg_state *reg = &regs[regno];
if (reg->type == PTR_TO_MAP_VALUE_OR_NULL && reg->id == id) {
reg->type = type;
if (type == UNKNOWN_VALUE) {
__mark_reg_unknown_value(regs, regno);
} else if (reg->map_ptr->inner_map_meta) {
reg->type = CONST_PTR_TO_MAP;
reg->map_ptr = reg->map_ptr->inner_map_meta;
} else {
reg->type = type;
}
/* We don't need id from this point onwards anymore, thus we
* should better reset it, so that state pruning has chances
* to take effect.
*/
reg->id = 0;
if (type == UNKNOWN_VALUE)
__mark_reg_unknown_value(regs, regno);
}
}
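
In program terms (editor's illustration, hypothetical map names), the
mark_map_reg() change is what makes the two-level lookup type-check: after
the NULL check, the register holding the outer lookup's result is retyped
from map_value_or_null to a const pointer to the inner map's template, so
it can be passed as the map argument of a second lookup. A fragment:

    __u32 okey = 1, ikey = 2;
    void *inner;
    int *val;

    inner = bpf_map_lookup_elem(&outer_map, &okey); /* map_value_or_null */
    if (!inner)
        return 0;
    /* 'inner' is now CONST_PTR_TO_MAP, typed by outer_map's inner_map_meta */
    val = bpf_map_lookup_elem(inner, &ikey);
    if (!val)
        return 0;
    return *val;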
@@ -3025,16 +3042,33 @@ process_bpf_exit:
return 0;
}
static int check_map_prealloc(struct bpf_map *map)
{
return (map->map_type != BPF_MAP_TYPE_HASH &&
map->map_type != BPF_MAP_TYPE_PERCPU_HASH &&
map->map_type != BPF_MAP_TYPE_HASH_OF_MAPS) ||
!(map->map_flags & BPF_F_NO_PREALLOC);
}
static int check_map_prog_compatibility(struct bpf_map *map,
struct bpf_prog *prog)
{
if (prog->type == BPF_PROG_TYPE_PERF_EVENT &&
(map->map_type == BPF_MAP_TYPE_HASH ||
map->map_type == BPF_MAP_TYPE_PERCPU_HASH) &&
(map->map_flags & BPF_F_NO_PREALLOC)) {
verbose("perf_event programs can only use preallocated hash map\n");
return -EINVAL;
/* Make sure that BPF_PROG_TYPE_PERF_EVENT programs only use
* preallocated hash maps, since doing memory allocation
* in overflow_handler can crash depending on where nmi got
* triggered.
*/
if (prog->type == BPF_PROG_TYPE_PERF_EVENT) {
if (!check_map_prealloc(map)) {
verbose("perf_event programs can only use preallocated hash map\n");
return -EINVAL;
}
if (map->inner_map_meta &&
!check_map_prealloc(map->inner_map_meta)) {
verbose("perf_event programs can only use preallocated inner hash map\n");
return -EINVAL;
}
}
return 0;
}
@@ -3307,7 +3341,8 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
if (ebpf_jit_enabled() && insn->imm == BPF_FUNC_map_lookup_elem) {
map_ptr = env->insn_aux_data[i + delta].map_ptr;
if (!map_ptr->ops->map_gen_lookup)
if (map_ptr == BPF_MAP_PTR_POISON ||
!map_ptr->ops->map_gen_lookup)
goto patch_call_imm;
cnt = map_ptr->ops->map_gen_lookup(map_ptr, insn_buf);
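
The poison value ties the verifier pieces together (editor's note):
inlining map_gen_lookup() is only sound when a call site always sees the
same map, so a lookup reachable with different map pointers has its
insn_aux->map_ptr flipped to BPF_MAP_PTR_POISON and is left as a real
helper call. A hypothetical fragment that triggers this:

    /* one bpf_map_lookup_elem() call site, two possible maps */
    void *m = flag ? (void *)&map_a : (void *)&map_b;
    int *v = bpf_map_lookup_elem(m, &key); /* stays an out-of-line call */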

samples/bpf/Makefile

@@ -34,6 +34,7 @@ hostprogs-y += sampleip
hostprogs-y += tc_l2_redirect
hostprogs-y += lwt_len_hist
hostprogs-y += xdp_tx_iptunnel
hostprogs-y += test_map_in_map
# Libbpf dependencies
LIBBPF := ../../tools/lib/bpf/bpf.o
@@ -72,6 +73,7 @@ sampleip-objs := bpf_load.o $(LIBBPF) sampleip_user.o
tc_l2_redirect-objs := bpf_load.o $(LIBBPF) tc_l2_redirect_user.o
lwt_len_hist-objs := bpf_load.o $(LIBBPF) lwt_len_hist_user.o
xdp_tx_iptunnel-objs := bpf_load.o $(LIBBPF) xdp_tx_iptunnel_user.o
test_map_in_map-objs := bpf_load.o $(LIBBPF) test_map_in_map_user.o
# Tell kbuild to always build the programs
always := $(hostprogs-y)
@@ -105,6 +107,7 @@ always += trace_event_kern.o
always += sampleip_kern.o
always += lwt_len_hist_kern.o
always += xdp_tx_iptunnel_kern.o
always += test_map_in_map_kern.o
HOSTCFLAGS += -I$(objtree)/usr/include
HOSTCFLAGS += -I$(srctree)/tools/lib/
@@ -139,6 +142,7 @@ HOSTLOADLIBES_sampleip += -lelf
HOSTLOADLIBES_tc_l2_redirect += -l elf
HOSTLOADLIBES_lwt_len_hist += -l elf
HOSTLOADLIBES_xdp_tx_iptunnel += -lelf
HOSTLOADLIBES_test_map_in_map += -lelf
# Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline:
# make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang

samples/bpf/bpf_helpers.h

@@ -80,6 +80,7 @@ struct bpf_map_def {
unsigned int value_size;
unsigned int max_entries;
unsigned int map_flags;
unsigned int inner_map_idx;
};
static int (*bpf_skb_load_bytes)(void *ctx, int off, void *to, int len) =

samples/bpf/bpf_load.c

@@ -43,6 +43,7 @@ struct bpf_map_def {
unsigned int value_size;
unsigned int max_entries;
unsigned int map_flags;
unsigned int inner_map_idx;
};
static int populate_prog_array(const char *event, int prog_fd)
@@ -198,11 +199,22 @@ static int load_maps(struct bpf_map_def *maps, int len)
for (i = 0; i < len / sizeof(struct bpf_map_def); i++) {
map_fd[i] = bpf_create_map(maps[i].type,
maps[i].key_size,
maps[i].value_size,
maps[i].max_entries,
maps[i].map_flags);
if (maps[i].type == BPF_MAP_TYPE_ARRAY_OF_MAPS ||
maps[i].type == BPF_MAP_TYPE_HASH_OF_MAPS) {
int inner_map_fd = map_fd[maps[i].inner_map_idx];
map_fd[i] = bpf_create_map_in_map(maps[i].type,
maps[i].key_size,
inner_map_fd,
maps[i].max_entries,
maps[i].map_flags);
} else {
map_fd[i] = bpf_create_map(maps[i].type,
maps[i].key_size,
maps[i].value_size,
maps[i].max_entries,
maps[i].map_flags);
}
if (map_fd[i] < 0) {
printf("failed to create a map: %d %s\n",
errno, strerror(errno));

samples/bpf/test_map_in_map_kern.c (new file)

@@ -0,0 +1,173 @@
/*
* Copyright (c) 2017 Facebook
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
* License as published by the Free Software Foundation.
*/
#define KBUILD_MODNAME "foo"
#include <linux/ptrace.h>
#include <linux/version.h>
#include <uapi/linux/bpf.h>
#include <uapi/linux/in6.h>
#include "bpf_helpers.h"
#define MAX_NR_PORTS 65536
/* map #0 */
struct bpf_map_def SEC("maps") port_a = {
.type = BPF_MAP_TYPE_ARRAY,
.key_size = sizeof(u32),
.value_size = sizeof(int),
.max_entries = MAX_NR_PORTS,
};
/* map #1 */
struct bpf_map_def SEC("maps") port_h = {
.type = BPF_MAP_TYPE_HASH,
.key_size = sizeof(u32),
.value_size = sizeof(int),
.max_entries = 1,
};
/* map #2 */
struct bpf_map_def SEC("maps") reg_result_h = {
.type = BPF_MAP_TYPE_HASH,
.key_size = sizeof(u32),
.value_size = sizeof(int),
.max_entries = 1,
};
/* map #3 */
struct bpf_map_def SEC("maps") inline_result_h = {
.type = BPF_MAP_TYPE_HASH,
.key_size = sizeof(u32),
.value_size = sizeof(int),
.max_entries = 1,
};
/* map #4 */ /* Test case #0 */
struct bpf_map_def SEC("maps") a_of_port_a = {
.type = BPF_MAP_TYPE_ARRAY_OF_MAPS,
.key_size = sizeof(u32),
.inner_map_idx = 0, /* map_fd[0] is port_a */
.max_entries = MAX_NR_PORTS,
};
/* map #5 */ /* Test case #1 */
struct bpf_map_def SEC("maps") h_of_port_a = {
.type = BPF_MAP_TYPE_HASH_OF_MAPS,
.key_size = sizeof(u32),
.inner_map_idx = 0, /* map_fd[0] is port_a */
.max_entries = 1,
};
/* map #6 */ /* Test case #2 */
struct bpf_map_def SEC("maps") h_of_port_h = {
.type = BPF_MAP_TYPE_HASH_OF_MAPS,
.key_size = sizeof(u32),
.inner_map_idx = 1, /* map_fd[1] is port_h */
.max_entries = 1,
};
static __always_inline int do_reg_lookup(void *inner_map, u32 port)
{
int *result;
result = bpf_map_lookup_elem(inner_map, &port);
return result ? *result : -ENOENT;
}
static __always_inline int do_inline_array_lookup(void *inner_map, u32 port)
{
int *result;
if (inner_map != &port_a)
return -EINVAL;
result = bpf_map_lookup_elem(&port_a, &port);
return result ? *result : -ENOENT;
}
static __always_inline int do_inline_hash_lookup(void *inner_map, u32 port)
{
int *result;
if (inner_map != &port_h)
return -EINVAL;
result = bpf_map_lookup_elem(&port_h, &port);
return result ? *result : -ENOENT;
}
SEC("kprobe/sys_connect")
int trace_sys_connect(struct pt_regs *ctx)
{
struct sockaddr_in6 *in6;
u16 test_case, port, dst6[8];
int addrlen, ret, inline_ret, ret_key = 0;
u32 port_key;
void *outer_map, *inner_map;
bool inline_hash = false;
in6 = (struct sockaddr_in6 *)PT_REGS_PARM2(ctx);
addrlen = (int)PT_REGS_PARM3(ctx);
if (addrlen != sizeof(*in6))
return 0;
ret = bpf_probe_read(dst6, sizeof(dst6), &in6->sin6_addr);
if (ret) {
inline_ret = ret;
goto done;
}
if (dst6[0] != 0xdead || dst6[1] != 0xbeef)
return 0;
test_case = dst6[7];
ret = bpf_probe_read(&port, sizeof(port), &in6->sin6_port);
if (ret) {
inline_ret = ret;
goto done;
}
port_key = port;
ret = -ENOENT;
if (test_case == 0) {
outer_map = &a_of_port_a;
} else if (test_case == 1) {
outer_map = &h_of_port_a;
} else if (test_case == 2) {
outer_map = &h_of_port_h;
} else {
ret = __LINE__;
inline_ret = ret;
goto done;
}
inner_map = bpf_map_lookup_elem(outer_map, &port_key);
if (!inner_map) {
ret = __LINE__;
inline_ret = ret;
goto done;
}
ret = do_reg_lookup(inner_map, port_key);
if (test_case == 0 || test_case == 1)
inline_ret = do_inline_array_lookup(inner_map, port_key);
else
inline_ret = do_inline_hash_lookup(inner_map, port_key);
done:
bpf_map_update_elem(&reg_result_h, &ret_key, &ret, BPF_ANY);
bpf_map_update_elem(&inline_result_h, &ret_key, &inline_ret, BPF_ANY);
return 0;
}
char _license[] SEC("license") = "GPL";
u32 _version SEC("version") = LINUX_VERSION_CODE;

samples/bpf/test_map_in_map_user.c (new file)

@@ -0,0 +1,116 @@
/*
* Copyright (c) 2017 Facebook
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
* License as published by the Free Software Foundation.
*/
#include <sys/resource.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <stdint.h>
#include <assert.h>
#include <errno.h>
#include <stdlib.h>
#include <stdio.h>
#include "libbpf.h"
#include "bpf_load.h"
#define PORT_A (map_fd[0])
#define PORT_H (map_fd[1])
#define REG_RESULT_H (map_fd[2])
#define INLINE_RESULT_H (map_fd[3])
#define A_OF_PORT_A (map_fd[4]) /* Test case #0 */
#define H_OF_PORT_A (map_fd[5]) /* Test case #1 */
#define H_OF_PORT_H (map_fd[6]) /* Test case #2 */
static const char * const test_names[] = {
"Array of Array",
"Hash of Array",
"Hash of Hash",
};
#define NR_TESTS (sizeof(test_names) / sizeof(*test_names))
static void populate_map(uint32_t port_key, int magic_result)
{
int ret;
ret = bpf_map_update_elem(PORT_A, &port_key, &magic_result, BPF_ANY);
assert(!ret);
ret = bpf_map_update_elem(PORT_H, &port_key, &magic_result,
BPF_NOEXIST);
assert(!ret);
ret = bpf_map_update_elem(A_OF_PORT_A, &port_key, &PORT_A, BPF_ANY);
assert(!ret);
ret = bpf_map_update_elem(H_OF_PORT_A, &port_key, &PORT_A, BPF_NOEXIST);
assert(!ret);
ret = bpf_map_update_elem(H_OF_PORT_H, &port_key, &PORT_H, BPF_NOEXIST);
assert(!ret);
}
static void test_map_in_map(void)
{
struct sockaddr_in6 in6 = { .sin6_family = AF_INET6 };
uint32_t result_key = 0, port_key;
int result, inline_result;
int magic_result = 0xfaceb00c;
int ret;
int i;
port_key = rand() & 0x00FF;
populate_map(port_key, magic_result);
in6.sin6_addr.s6_addr16[0] = 0xdead;
in6.sin6_addr.s6_addr16[1] = 0xbeef;
in6.sin6_port = port_key;
for (i = 0; i < NR_TESTS; i++) {
printf("%s: ", test_names[i]);
in6.sin6_addr.s6_addr16[7] = i;
ret = connect(-1, (struct sockaddr *)&in6, sizeof(in6));
assert(ret == -1 && errno == EBADF);
ret = bpf_map_lookup_elem(REG_RESULT_H, &result_key, &result);
assert(!ret);
ret = bpf_map_lookup_elem(INLINE_RESULT_H, &result_key,
&inline_result);
assert(!ret);
if (result != magic_result || inline_result != magic_result) {
printf("Error. result:%d inline_result:%d\n",
result, inline_result);
exit(1);
}
bpf_map_delete_elem(REG_RESULT_H, &result_key);
bpf_map_delete_elem(INLINE_RESULT_H, &result_key);
printf("Pass\n");
}
}
int main(int argc, char **argv)
{
struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
char filename[256];
assert(!setrlimit(RLIMIT_MEMLOCK, &r));
snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
if (load_bpf_file(filename)) {
printf("%s", bpf_log_buf);
return 1;
}
test_map_in_map();
return 0;
}

tools/include/uapi/linux/bpf.h

@@ -96,6 +96,8 @@ enum bpf_map_type {
BPF_MAP_TYPE_LRU_HASH,
BPF_MAP_TYPE_LRU_PERCPU_HASH,
BPF_MAP_TYPE_LPM_TRIE,
BPF_MAP_TYPE_ARRAY_OF_MAPS,
BPF_MAP_TYPE_HASH_OF_MAPS,
};
enum bpf_prog_type {
@@ -152,6 +154,7 @@ union bpf_attr {
__u32 value_size; /* size of value in bytes */
__u32 max_entries; /* max number of entries in a map */
__u32 map_flags; /* prealloc or not */
__u32 inner_map_fd; /* fd pointing to the inner map */
};
struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */

tools/lib/bpf/bpf.c

@@ -69,6 +69,23 @@ int bpf_create_map(enum bpf_map_type map_type, int key_size,
return sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
}
int bpf_create_map_in_map(enum bpf_map_type map_type, int key_size,
int inner_map_fd, int max_entries, __u32 map_flags)
{
union bpf_attr attr;
memset(&attr, '\0', sizeof(attr));
attr.map_type = map_type;
attr.key_size = key_size;
attr.value_size = 4;
attr.inner_map_fd = inner_map_fd;
attr.max_entries = max_entries;
attr.map_flags = map_flags;
return sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
}
int bpf_load_program(enum bpf_prog_type type, const struct bpf_insn *insns,
size_t insns_cnt, const char *license,
__u32 kern_version, char *log_buf, size_t log_buf_sz)

tools/lib/bpf/bpf.h

@@ -26,6 +26,8 @@
int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size,
int max_entries, __u32 map_flags);
int bpf_create_map_in_map(enum bpf_map_type map_type, int key_size,
int inner_map_fd, int max_entries, __u32 map_flags);
/* Recommend log buffer size */
#define BPF_LOG_BUF_SIZE 65536

tools/testing/selftests/bpf/test_verifier.c

@@ -38,6 +38,7 @@
#define MAX_INSNS 512
#define MAX_FIXUPS 8
#define MAX_NR_MAPS 4
struct bpf_test {
const char *descr;
@@ -45,6 +46,7 @@ struct bpf_test {
int fixup_map1[MAX_FIXUPS];
int fixup_map2[MAX_FIXUPS];
int fixup_prog[MAX_FIXUPS];
int fixup_map_in_map[MAX_FIXUPS];
const char *errstr;
const char *errstr_unpriv;
enum {
@@ -4452,7 +4454,76 @@ static struct bpf_test tests[] = {
.errstr = "R0 min value is negative, either use unsigned index or do a if (index >=0) check.",
.result = REJECT,
.result_unpriv = REJECT,
}
},
{
"map in map access",
.insns = {
BPF_ST_MEM(0, BPF_REG_10, -4, 0),
BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),
BPF_LD_MAP_FD(BPF_REG_1, 0),
BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
BPF_FUNC_map_lookup_elem),
BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 5),
BPF_ST_MEM(0, BPF_REG_10, -4, 0),
BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),
BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
BPF_FUNC_map_lookup_elem),
BPF_MOV64_REG(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
.fixup_map_in_map = { 3 },
.result = ACCEPT,
},
{
"invalid inner map pointer",
.insns = {
BPF_ST_MEM(0, BPF_REG_10, -4, 0),
BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),
BPF_LD_MAP_FD(BPF_REG_1, 0),
BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
BPF_FUNC_map_lookup_elem),
BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6),
BPF_ST_MEM(0, BPF_REG_10, -4, 0),
BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),
BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8),
BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
BPF_FUNC_map_lookup_elem),
BPF_MOV64_REG(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
.fixup_map_in_map = { 3 },
.errstr = "R1 type=inv expected=map_ptr",
.errstr_unpriv = "R1 pointer arithmetic prohibited",
.result = REJECT,
},
{
"forgot null checking on the inner map pointer",
.insns = {
BPF_ST_MEM(0, BPF_REG_10, -4, 0),
BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),
BPF_LD_MAP_FD(BPF_REG_1, 0),
BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
BPF_FUNC_map_lookup_elem),
BPF_ST_MEM(0, BPF_REG_10, -4, 0),
BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),
BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
BPF_FUNC_map_lookup_elem),
BPF_MOV64_REG(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
.fixup_map_in_map = { 3 },
.errstr = "R1 type=map_value_or_null expected=map_ptr",
.result = REJECT,
},
};
static int probe_filter_length(const struct bpf_insn *fp)
@@ -4489,42 +4560,73 @@ static int create_prog_array(void)
return fd;
}
static int create_map_in_map(void)
{
int inner_map_fd, outer_map_fd;
inner_map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(int),
sizeof(int), 1, 0);
if (inner_map_fd < 0) {
printf("Failed to create array '%s'!\n", strerror(errno));
return inner_map_fd;
}
outer_map_fd = bpf_create_map_in_map(BPF_MAP_TYPE_ARRAY_OF_MAPS,
sizeof(int), inner_map_fd, 1, 0);
if (outer_map_fd < 0)
printf("Failed to create array of maps '%s'!\n",
strerror(errno));
close(inner_map_fd);
return outer_map_fd;
}
static char bpf_vlog[32768];
static void do_test_fixup(struct bpf_test *test, struct bpf_insn *prog,
int *fd_f1, int *fd_f2, int *fd_f3)
int *map_fds)
{
int *fixup_map1 = test->fixup_map1;
int *fixup_map2 = test->fixup_map2;
int *fixup_prog = test->fixup_prog;
int *fixup_map_in_map = test->fixup_map_in_map;
/* Allocating HTs with 1 elem is fine here, since we only test
* for verifier and not do a runtime lookup, so the only thing
* that really matters is value size in this case.
*/
if (*fixup_map1) {
*fd_f1 = create_map(sizeof(long long), 1);
map_fds[0] = create_map(sizeof(long long), 1);
do {
prog[*fixup_map1].imm = *fd_f1;
prog[*fixup_map1].imm = map_fds[0];
fixup_map1++;
} while (*fixup_map1);
}
if (*fixup_map2) {
*fd_f2 = create_map(sizeof(struct test_val), 1);
map_fds[1] = create_map(sizeof(struct test_val), 1);
do {
prog[*fixup_map2].imm = *fd_f2;
prog[*fixup_map2].imm = map_fds[1];
fixup_map2++;
} while (*fixup_map2);
}
if (*fixup_prog) {
*fd_f3 = create_prog_array();
map_fds[2] = create_prog_array();
do {
prog[*fixup_prog].imm = *fd_f3;
prog[*fixup_prog].imm = map_fds[2];
fixup_prog++;
} while (*fixup_prog);
}
if (*fixup_map_in_map) {
map_fds[3] = create_map_in_map();
do {
prog[*fixup_map_in_map].imm = map_fds[3];
fixup_map_in_map++;
} while (*fixup_map_in_map);
}
}
static void do_test_single(struct bpf_test *test, bool unpriv,
@@ -4533,11 +4635,15 @@ static void do_test_single(struct bpf_test *test, bool unpriv,
struct bpf_insn *prog = test->insns;
int prog_len = probe_filter_length(prog);
int prog_type = test->prog_type;
int fd_f1 = -1, fd_f2 = -1, fd_f3 = -1;
int map_fds[MAX_NR_MAPS];
int fd_prog, expected_ret;
const char *expected_err;
int i;
do_test_fixup(test, prog, &fd_f1, &fd_f2, &fd_f3);
for (i = 0; i < MAX_NR_MAPS; i++)
map_fds[i] = -1;
do_test_fixup(test, prog, map_fds);
fd_prog = bpf_load_program(prog_type ? : BPF_PROG_TYPE_SOCKET_FILTER,
prog, prog_len, "GPL", 0, bpf_vlog,
@@ -4568,9 +4674,8 @@ static void do_test_single(struct bpf_test *test, bool unpriv,
printf("OK\n");
close_fds:
close(fd_prog);
close(fd_f1);
close(fd_f2);
close(fd_f3);
for (i = 0; i < MAX_NR_MAPS; i++)
close(map_fds[i]);
sched_yield();
return;
fail_log: