linux-stable/kernel/bpf/syscall.c

/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */
#include <linux/bpf.h>
#include <linux/syscalls.h>
#include <linux/slab.h>
#include <linux/anon_inodes.h>
#include <linux/file.h>

static LIST_HEAD(bpf_map_types);

/* Walk the registered map types and let the first one whose ->type equals
 * attr->map_type build the map through its ->map_alloc() callback.
 *
 * Returns the new map with ->ops and ->map_type filled in, the ERR_PTR
 * handed back by ->map_alloc() if allocation failed, or ERR_PTR(-EINVAL)
 * when no registered type matches.
 */
static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
{
	struct bpf_map_type_list *entry;
	struct bpf_map *new_map;

	list_for_each_entry(entry, &bpf_map_types, list_node) {
		if (entry->type != attr->map_type)
			continue;

		new_map = entry->ops->map_alloc(attr);
		if (!IS_ERR(new_map)) {
			new_map->ops = entry->ops;
			new_map->map_type = attr->map_type;
		}
		/* either the initialised map or the allocator's ERR_PTR */
		return new_map;
	}

	return ERR_PTR(-EINVAL);
}

/* Make a map implementation visible to find_and_alloc_map() by linking @tl
 * into the bpf_map_types list. Meant for boot time registration of the
 * different map implementations; no locking is taken here.
 */
void bpf_register_map_type(struct bpf_map_type_list *tl)
{
	struct list_head *node = &tl->list_node;

	list_add(node, &bpf_map_types);
}

/* Workqueue callback queued by bpf_map_put(): recover the map that embeds
 * @work and release it through the implementation's own ->map_free().
 */
static void bpf_map_free_deferred(struct work_struct *work)
{
	struct bpf_map *dead_map;

	dead_map = container_of(work, struct bpf_map, work);

	/* freeing is implementation dependent */
	dead_map->ops->map_free(dead_map);
}

/* Drop one reference on @map. Once the last reference is gone the teardown
 * is handed to a workqueue, because the underlying map implementation's
 * ops->map_free() might sleep.
 */
void bpf_map_put(struct bpf_map *map)
{
	/* somebody else still holds a reference: nothing more to do */
	if (!atomic_dec_and_test(&map->refcnt))
		return;

	INIT_WORK(&map->work, bpf_map_free_deferred);
	schedule_work(&map->work);
}

/* ->release() handler for map files: drops the reference on the map kept
 * in filp->private_data. @inode is unused. Always returns 0.
 */
static int bpf_map_release(struct inode *inode, struct file *filp)
{
	bpf_map_put(filp->private_data);
	return 0;
}

/* File operations attached to every map fd created by map_create().
 * bpf_map_get() also compares a file's f_op against this table to decide
 * whether the file refers to a bpf map at all.
 */
static const struct file_operations bpf_map_fops = {
	.release = bpf_map_release,
};

/* helper macro to check that unused fields 'union bpf_attr' are zero
 *
 * Requires a 'union bpf_attr *attr' in scope at the use site and a
 * CMD##_LAST_FIELD define naming the last field the command consumes.
 * Evaluates to non-zero when any byte of *attr located after that field
 * is non-zero, and to zero otherwise.
 *
 * The whole expansion is parenthesized so the trailing '!= NULL' cannot
 * bind to operators surrounding the macro invocation.
 */
#define CHECK_ATTR(CMD) \
	(memchr_inv((void *) &attr->CMD##_LAST_FIELD + \
		    sizeof(attr->CMD##_LAST_FIELD), 0, \
		    sizeof(*attr) - \
		    offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
		    sizeof(attr->CMD##_LAST_FIELD)) != NULL)

#define BPF_MAP_CREATE_LAST_FIELD max_entries
/* BPF_MAP_CREATE handler, called via syscall.
 *
 * Rejects the request with -EINVAL when attr fields past max_entries are
 * non-zero, allocates a map of the requested type, gives it an initial
 * reference and wraps it in an anonymous-inode file descriptor.
 *
 * Returns the new fd on success, the error from find_and_alloc_map(), or
 * the negative value from anon_inode_getfd() (the map is freed in that
 * case).
 */
static int map_create(union bpf_attr *attr)
{
	struct bpf_map *map;
	int fd;

	if (CHECK_ATTR(BPF_MAP_CREATE))
		return -EINVAL;

	/* find map type and init map: hashtable vs rbtree vs bloom vs ... */
	map = find_and_alloc_map(attr);
	if (IS_ERR(map))
		return PTR_ERR(map);

	atomic_set(&map->refcnt, 1);

	fd = anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR | O_CLOEXEC);
	if (fd < 0)
		/* failed to allocate fd: no file owns the map, free it here */
		map->ops->map_free(map);

	return fd;
}

/* if error is returned, fd is released.
 * On success caller should complete fd access with matching fdput()
 */
struct bpf_map *bpf_map_get(struct fd f)
{
	struct bpf_map *map;

	if (!f.file)
		return ERR_PTR(-EBADF);

	if (f.file->f_op != &bpf_map_fops) {
		fdput(f);
		return ERR_PTR(-EINVAL);
	}

	map = f.file->private_data;

	return map;
}

/* Turn a user pointer that travelled inside an __aligned_u64 field back
 * into a 'void __user *', going through unsigned long on the way.
 */
static void __user *u64_to_ptr(__u64 val)
{
	unsigned long addr = (unsigned long) val;

	return (void __user *) addr;
}

/* last field in 'union bpf_attr' used by this command */
#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value

/* BPF_MAP_LOOKUP_ELEM handler: look up the key at attr->key in the map
 * referenced by attr->map_fd and copy the element to attr->value.
 *
 * Returns 0 on success, -EINVAL when attr fields past 'value' are
 * non-zero, the error from bpf_map_get() for a bad fd, -ENOMEM when a
 * kernel buffer cannot be allocated, -EFAULT when a user copy fails and
 * -ESRCH when the map has no element for the key.
 */
static int map_lookup_elem(union bpf_attr *attr)
{
	void __user *ukey = u64_to_ptr(attr->key);
	void __user *uvalue = u64_to_ptr(attr->value);
	int ufd = attr->map_fd;
	struct bpf_map *map;
	void *key, *value, *ptr;
	struct fd f;
	int err;

	if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
		return -EINVAL;

	/* grab the fd only after the attr check, so the early return above
	 * cannot leak the reference fdget() may take
	 */
	f = fdget(ufd);
	map = bpf_map_get(f);
	if (IS_ERR(map))
		return PTR_ERR(map);

	err = -ENOMEM;
	key = kmalloc(map->key_size, GFP_USER);
	if (!key)
		goto err_put;

	err = -EFAULT;
	if (copy_from_user(key, ukey, map->key_size) != 0)
		goto free_key;

	/* bounce buffer: the element is only safe to touch under
	 * rcu_read_lock(), but copy_to_user() may fault and sleep, which is
	 * not allowed inside an RCU read-side critical section
	 */
	err = -ENOMEM;
	value = kmalloc(map->value_size, GFP_USER);
	if (!value)
		goto free_key;

	rcu_read_lock();
	ptr = map->ops->map_lookup_elem(map, key);
	if (ptr)
		memcpy(value, ptr, map->value_size);
	rcu_read_unlock();

	err = -ESRCH;
	if (!ptr)
		goto free_value;

	err = -EFAULT;
	if (copy_to_user(uvalue, value, map->value_size) != 0)
		goto free_value;

	err = 0;

free_value:
	kfree(value);
free_key:
	kfree(key);
err_put:
	fdput(f);
	return err;
}

#define BPF_MAP_UPDATE_ELEM_LAST_FIELD value

/* BPF_MAP_UPDATE_ELEM handler: copy the key and value from the user
 * buffers at attr->key / attr->value and pass them to the
 * ->map_update_elem() callback of the map referenced by attr->map_fd.
 *
 * Returns -EINVAL when attr fields past 'value' are non-zero, the error
 * from bpf_map_get() for a bad fd, -ENOMEM when a kernel buffer cannot be
 * allocated, -EFAULT when a user copy fails, otherwise whatever
 * ->map_update_elem() returns.
 */
static int map_update_elem(union bpf_attr *attr)
{
	void __user *ukey = u64_to_ptr(attr->key);
	void __user *uvalue = u64_to_ptr(attr->value);
	int ufd = attr->map_fd;
	struct bpf_map *map;
	void *key, *value;
	struct fd f;
	int err;

	if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM))
		return -EINVAL;

	/* grab the fd only after the attr check, so the early return above
	 * cannot leak the reference fdget() may take
	 */
	f = fdget(ufd);
	map = bpf_map_get(f);
	if (IS_ERR(map))
		return PTR_ERR(map);

	err = -ENOMEM;
	key = kmalloc(map->key_size, GFP_USER);
	if (!key)
		goto err_put;

	err = -EFAULT;
	if (copy_from_user(key, ukey, map->key_size) != 0)
		goto free_key;

	err = -ENOMEM;
	value = kmalloc(map->value_size, GFP_USER);
	if (!value)
		goto free_key;

	err = -EFAULT;
	if (copy_from_user(value, uvalue, map->value_size) != 0)
		goto free_value;

	/* eBPF program that use maps are running under rcu_read_lock(),
	 * therefore all map accessors rely on this fact, so do the same here
	 */
	rcu_read_lock();
	err = map->ops->map_update_elem(map, key, value);
	rcu_read_unlock();

free_value:
	kfree(value);
free_key:
	kfree(key);
err_put:
	fdput(f);
	return err;
}

#define BPF_MAP_DELETE_ELEM_LAST_FIELD key

/* BPF_MAP_DELETE_ELEM handler: copy the key from the user buffer at
 * attr->key and pass it to the ->map_delete_elem() callback of the map
 * referenced by attr->map_fd.
 *
 * Returns -EINVAL when attr fields past 'key' are non-zero, the error
 * from bpf_map_get() for a bad fd, -ENOMEM when the key buffer cannot be
 * allocated, -EFAULT when the user copy fails, otherwise whatever
 * ->map_delete_elem() returns.
 */
static int map_delete_elem(union bpf_attr *attr)
{
	void __user *ukey = u64_to_ptr(attr->key);
	int ufd = attr->map_fd;
	struct bpf_map *map;
	struct fd f;
	void *key;
	int err;

	if (CHECK_ATTR(BPF_MAP_DELETE_ELEM))
		return -EINVAL;

	/* grab the fd only after the attr check, so the early return above
	 * cannot leak the reference fdget() may take
	 */
	f = fdget(ufd);
	map = bpf_map_get(f);
	if (IS_ERR(map))
		return PTR_ERR(map);

	err = -ENOMEM;
	key = kmalloc(map->key_size, GFP_USER);
	if (!key)
		goto err_put;

	err = -EFAULT;
	if (copy_from_user(key, ukey, map->key_size) != 0)
		goto free_key;

	/* map accessors are called under rcu_read_lock() */
	rcu_read_lock();
	err = map->ops->map_delete_elem(map, key);
	rcu_read_unlock();

free_key:
	kfree(key);
err_put:
	fdput(f);
	return err;
}

/* last field in 'union bpf_attr' used by this command */
#define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key

/* BPF_MAP_GET_NEXT_KEY handler: copy the key from the user buffer at
 * attr->key, ask the ->map_get_next_key() callback of the map referenced
 * by attr->map_fd for the following key and copy it to attr->next_key.
 *
 * Returns 0 on success, -EINVAL when attr fields past 'next_key' are
 * non-zero, the error from bpf_map_get() for a bad fd, -ENOMEM when a
 * kernel buffer cannot be allocated, -EFAULT when a user copy fails, or
 * the non-zero value returned by ->map_get_next_key().
 */
static int map_get_next_key(union bpf_attr *attr)
{
	void __user *ukey = u64_to_ptr(attr->key);
	void __user *unext_key = u64_to_ptr(attr->next_key);
	int ufd = attr->map_fd;
	struct bpf_map *map;
	void *key, *next_key;
	struct fd f;
	int err;

	if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY))
		return -EINVAL;

	/* grab the fd only after the attr check, so the early return above
	 * cannot leak the reference fdget() may take
	 */
	f = fdget(ufd);
	map = bpf_map_get(f);
	if (IS_ERR(map))
		return PTR_ERR(map);

	err = -ENOMEM;
	key = kmalloc(map->key_size, GFP_USER);
	if (!key)
		goto err_put;

	err = -EFAULT;
	if (copy_from_user(key, ukey, map->key_size) != 0)
		goto free_key;

	err = -ENOMEM;
	next_key = kmalloc(map->key_size, GFP_USER);
	if (!next_key)
		goto free_key;

	/* the result lands in a kernel buffer, so the user copy below
	 * happens after the RCU read-side section has ended
	 */
	rcu_read_lock();
	err = map->ops->map_get_next_key(map, key, next_key);
	rcu_read_unlock();
	if (err)
		goto free_next_key;

	err = -EFAULT;
	if (copy_to_user(unext_key, next_key, map->key_size) != 0)
		goto free_next_key;

	err = 0;

free_next_key:
	kfree(next_key);
free_key:
	kfree(key);
err_put:
	fdput(f);
	return err;
}

/* bpf(2) entry point: copies 'union bpf_attr' in from user space and
 * dispatches on @cmd to the matching map handler.
 *
 * @cmd:   one of the BPF_MAP_* commands handled in the switch below
 * @uattr: user pointer to the attributes
 * @size:  number of bytes user space provides at @uattr
 *
 * Returns the handler's result, -EPERM without CAP_SYS_ADMIN, -EFAULT
 * when @uattr cannot be read, -E2BIG when @size exceeds PAGE_SIZE or when
 * bytes beyond the kernel's sizeof(union bpf_attr) are non-zero, and
 * -EINVAL for an unknown @cmd.
 */
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
	union bpf_attr attr = {};
	int err;

	/* the syscall is limited to root temporarily. This restriction will be
	 * lifted when security audit is clean. Note that eBPF+tracing must have
	 * this restriction, since it may pass kernel data to user space
	 */
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (!access_ok(VERIFY_READ, uattr, 1))
		return -EFAULT;

	if (size > PAGE_SIZE)	/* silly large */
		return -E2BIG;

	/* User space handed us a bigger struct than we know of: every byte
	 * past our sizeof(attr) has to be zero, i.e. new user space must
	 * not rely on kernel feature extensions we dont know about yet.
	 */
	if (size > sizeof(attr)) {
		unsigned char __user *cur = (void __user *)uattr + sizeof(attr);
		unsigned char __user *limit = (void __user *)uattr + size;
		unsigned char byte;

		while (cur < limit) {
			err = get_user(byte, cur);
			if (err)
				return err;
			if (byte)
				return -E2BIG;
			cur++;
		}
		/* only the part we understand gets copied below */
		size = sizeof(attr);
	}

	/* copy attributes from user space, may be less than sizeof(bpf_attr);
	 * the remainder of attr keeps its zero initialisation
	 */
	if (copy_from_user(&attr, uattr, size) != 0)
		return -EFAULT;

	switch (cmd) {
	case BPF_MAP_CREATE:
		return map_create(&attr);
	case BPF_MAP_LOOKUP_ELEM:
		return map_lookup_elem(&attr);
	case BPF_MAP_UPDATE_ELEM:
		return map_update_elem(&attr);
	case BPF_MAP_DELETE_ELEM:
		return map_delete_elem(&attr);
	case BPF_MAP_GET_NEXT_KEY:
		return map_get_next_key(&attr);
	default:
		return -EINVAL;
	}
}
bpf: introduce BPF syscall and maps BPF syscall is a multiplexor for a range of different operations on eBPF. This patch introduces syscall with single command to create a map. Next patch adds commands to access maps. 'maps' is a generic storage of different types for sharing data between kernel and userspace. Userspace example: /* this syscall wrapper creates a map with given type and attributes * and returns map_fd on success. * use close(map_fd) to delete the map */ int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size, int max_entries) { union bpf_attr attr = { .map_type = map_type, .key_size = key_size, .value_size = value_size, .max_entries = max_entries }; return bpf(BPF_MAP_CREATE, &attr, sizeof(attr)); } 'union bpf_attr' is backwards compatible with future extensions. More details in Documentation/networking/filter.txt and in manpage Signed-off-by: Alexei Starovoitov <ast@plumgrid.com> Signed-off-by: David S. Miller <davem@davemloft.net> 2014-09-26 07:16:57 +00:00			`/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com`
			`*`
			`* This program is free software; you can redistribute it and/or`
			`* modify it under the terms of version 2 of the GNU General Public`
			`* License as published by the Free Software Foundation.`
			`*`
			`* This program is distributed in the hope that it will be useful, but`
			`* WITHOUT ANY WARRANTY; without even the implied warranty of`
			`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`* General Public License for more details.`
			`*/`
			`#include <linux/bpf.h>`
			`#include <linux/syscalls.h>`
			`#include <linux/slab.h>`
			`#include <linux/anon_inodes.h>`
bpf: add lookup/update/delete/iterate methods to BPF maps 'maps' is a generic storage of different types for sharing data between kernel and userspace. The maps are accessed from user space via BPF syscall, which has commands: - create a map with given type and attributes fd = bpf(BPF_MAP_CREATE, union bpf_attr attr, u32 size) returns fd or negative error - lookup key in a given map referenced by fd err = bpf(BPF_MAP_LOOKUP_ELEM, union bpf_attr attr, u32 size) using attr->map_fd, attr->key, attr->value returns zero and stores found elem into value or negative error - create or update key/value pair in a given map err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr attr, u32 size) using attr->map_fd, attr->key, attr->value returns zero or negative error - find and delete element by key in a given map err = bpf(BPF_MAP_DELETE_ELEM, union bpf_attr attr, u32 size) using attr->map_fd, attr->key - iterate map elements (based on input key return next_key) err = bpf(BPF_MAP_GET_NEXT_KEY, union bpf_attr *attr, u32 size) using attr->map_fd, attr->key, attr->next_key - close(fd) deletes the map Signed-off-by: Alexei Starovoitov <ast@plumgrid.com> Signed-off-by: David S. Miller <davem@davemloft.net> 2014-09-26 07:16:59 +00:00			`#include <linux/file.h>`
bpf: introduce BPF syscall and maps BPF syscall is a multiplexor for a range of different operations on eBPF. This patch introduces syscall with single command to create a map. Next patch adds commands to access maps. 'maps' is a generic storage of different types for sharing data between kernel and userspace. Userspace example: /* this syscall wrapper creates a map with given type and attributes * and returns map_fd on success. * use close(map_fd) to delete the map */ int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size, int max_entries) { union bpf_attr attr = { .map_type = map_type, .key_size = key_size, .value_size = value_size, .max_entries = max_entries }; return bpf(BPF_MAP_CREATE, &attr, sizeof(attr)); } 'union bpf_attr' is backwards compatible with future extensions. More details in Documentation/networking/filter.txt and in manpage Signed-off-by: Alexei Starovoitov <ast@plumgrid.com> Signed-off-by: David S. Miller <davem@davemloft.net> 2014-09-26 07:16:57 +00:00
			`static LIST_HEAD(bpf_map_types);`

			`static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)`
			`{`
			`struct bpf_map_type_list *tl;`
			`struct bpf_map *map;`

			`list_for_each_entry(tl, &bpf_map_types, list_node) {`
			`if (tl->type == attr->map_type) {`
			`map = tl->ops->map_alloc(attr);`
			`if (IS_ERR(map))`
			`return map;`
			`map->ops = tl->ops;`
			`map->map_type = attr->map_type;`
			`return map;`
			`}`
			`}`
			`return ERR_PTR(-EINVAL);`
			`}`

			`/* boot time registration of different map implementations */`
			`void bpf_register_map_type(struct bpf_map_type_list *tl)`
			`{`
			`list_add(&tl->list_node, &bpf_map_types);`
			`}`

			`/* called from workqueue */`
			`static void bpf_map_free_deferred(struct work_struct *work)`
			`{`
			`struct bpf_map *map = container_of(work, struct bpf_map, work);`

			`/* implementation dependent freeing */`
			`map->ops->map_free(map);`
			`}`

			`/* decrement map refcnt and schedule it for freeing via workqueue`
			`* (underlying map implementation ops->map_free() might sleep)`
			`*/`
			`void bpf_map_put(struct bpf_map *map)`
			`{`
			`if (atomic_dec_and_test(&map->refcnt)) {`
			`INIT_WORK(&map->work, bpf_map_free_deferred);`
			`schedule_work(&map->work);`
			`}`
			`}`

			`static int bpf_map_release(struct inode *inode, struct file *filp)`
			`{`
			`struct bpf_map *map = filp->private_data;`

			`bpf_map_put(map);`
			`return 0;`
			`}`

			`static const struct file_operations bpf_map_fops = {`
			`.release = bpf_map_release,`
			`};`

			`/* helper macro to check that unused fields 'union bpf_attr' are zero */`
			`#define CHECK_ATTR(CMD) \`
			`memchr_inv((void *) &attr->CMD##_LAST_FIELD + \`
			`sizeof(attr->CMD##_LAST_FIELD), 0, \`
			`sizeof(*attr) - \`
			`offsetof(union bpf_attr, CMD##_LAST_FIELD) - \`
			`sizeof(attr->CMD##_LAST_FIELD)) != NULL`

			`#define BPF_MAP_CREATE_LAST_FIELD max_entries`
			`/* called via syscall */`
			`static int map_create(union bpf_attr *attr)`
			`{`
			`struct bpf_map *map;`
			`int err;`

			`err = CHECK_ATTR(BPF_MAP_CREATE);`
			`if (err)`
			`return -EINVAL;`

			`/* find map type and init map: hashtable vs rbtree vs bloom vs ... */`
			`map = find_and_alloc_map(attr);`
			`if (IS_ERR(map))`
			`return PTR_ERR(map);`

			`atomic_set(&map->refcnt, 1);`

			`err = anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR | O_CLOEXEC);`

			`if (err < 0)`
			`/* failed to allocate fd */`
			`goto free_map;`

			`return err;`

			`free_map:`
			`map->ops->map_free(map);`
			`return err;`
			`}`

bpf: add lookup/update/delete/iterate methods to BPF maps 'maps' is a generic storage of different types for sharing data between kernel and userspace. The maps are accessed from user space via BPF syscall, which has commands: - create a map with given type and attributes fd = bpf(BPF_MAP_CREATE, union bpf_attr attr, u32 size) returns fd or negative error - lookup key in a given map referenced by fd err = bpf(BPF_MAP_LOOKUP_ELEM, union bpf_attr attr, u32 size) using attr->map_fd, attr->key, attr->value returns zero and stores found elem into value or negative error - create or update key/value pair in a given map err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr attr, u32 size) using attr->map_fd, attr->key, attr->value returns zero or negative error - find and delete element by key in a given map err = bpf(BPF_MAP_DELETE_ELEM, union bpf_attr attr, u32 size) using attr->map_fd, attr->key - iterate map elements (based on input key return next_key) err = bpf(BPF_MAP_GET_NEXT_KEY, union bpf_attr *attr, u32 size) using attr->map_fd, attr->key, attr->next_key - close(fd) deletes the map Signed-off-by: Alexei Starovoitov <ast@plumgrid.com> Signed-off-by: David S. Miller <davem@davemloft.net> 2014-09-26 07:16:59 +00:00			`/* if error is returned, fd is released.`
			`* On success caller should complete fd access with matching fdput()`
			`*/`
			`struct bpf_map *bpf_map_get(struct fd f)`
			`{`
			`struct bpf_map *map;`

			`if (!f.file)`
			`return ERR_PTR(-EBADF);`

			`if (f.file->f_op != &bpf_map_fops) {`
			`fdput(f);`
			`return ERR_PTR(-EINVAL);`
			`}`

			`map = f.file->private_data;`

			`return map;`
			`}`

			`/* helper to convert user pointers passed inside __aligned_u64 fields */`
			`static void __user *u64_to_ptr(__u64 val)`
			`{`
			`return (void __user *) (unsigned long) val;`
			`}`

			`/* last field in 'union bpf_attr' used by this command */`
			`#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value`

			`static int map_lookup_elem(union bpf_attr *attr)`
			`{`
			`void __user *ukey = u64_to_ptr(attr->key);`
			`void __user *uvalue = u64_to_ptr(attr->value);`
			`int ufd = attr->map_fd;`
			`struct fd f = fdget(ufd);`
			`struct bpf_map *map;`
			`void *key, *value;`
			`int err;`

			`if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))`
			`return -EINVAL;`

			`map = bpf_map_get(f);`
			`if (IS_ERR(map))`
			`return PTR_ERR(map);`

			`err = -ENOMEM;`
			`key = kmalloc(map->key_size, GFP_USER);`
			`if (!key)`
			`goto err_put;`

			`err = -EFAULT;`
			`if (copy_from_user(key, ukey, map->key_size) != 0)`
			`goto free_key;`

			`err = -ESRCH;`
			`rcu_read_lock();`
			`value = map->ops->map_lookup_elem(map, key);`
			`if (!value)`
			`goto err_unlock;`

			`err = -EFAULT;`
			`if (copy_to_user(uvalue, value, map->value_size) != 0)`
			`goto err_unlock;`

			`err = 0;`

			`err_unlock:`
			`rcu_read_unlock();`
			`free_key:`
			`kfree(key);`
			`err_put:`
			`fdput(f);`
			`return err;`
			`}`

			`#define BPF_MAP_UPDATE_ELEM_LAST_FIELD value`

			`static int map_update_elem(union bpf_attr *attr)`
			`{`
			`void __user *ukey = u64_to_ptr(attr->key);`
			`void __user *uvalue = u64_to_ptr(attr->value);`
			`int ufd = attr->map_fd;`
			`struct fd f = fdget(ufd);`
			`struct bpf_map *map;`
			`void *key, *value;`
			`int err;`

			`if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM))`
			`return -EINVAL;`

			`map = bpf_map_get(f);`
			`if (IS_ERR(map))`
			`return PTR_ERR(map);`

			`err = -ENOMEM;`
			`key = kmalloc(map->key_size, GFP_USER);`
			`if (!key)`
			`goto err_put;`

			`err = -EFAULT;`
			`if (copy_from_user(key, ukey, map->key_size) != 0)`
			`goto free_key;`

			`err = -ENOMEM;`
			`value = kmalloc(map->value_size, GFP_USER);`
			`if (!value)`
			`goto free_key;`

			`err = -EFAULT;`
			`if (copy_from_user(value, uvalue, map->value_size) != 0)`
			`goto free_value;`

			`/* eBPF program that use maps are running under rcu_read_lock(),`
			`* therefore all map accessors rely on this fact, so do the same here`
			`*/`
			`rcu_read_lock();`
			`err = map->ops->map_update_elem(map, key, value);`
			`rcu_read_unlock();`

			`free_value:`
			`kfree(value);`
			`free_key:`
			`kfree(key);`
			`err_put:`
			`fdput(f);`
			`return err;`
			`}`

			`#define BPF_MAP_DELETE_ELEM_LAST_FIELD key`

			`static int map_delete_elem(union bpf_attr *attr)`
			`{`
			`void __user *ukey = u64_to_ptr(attr->key);`
			`int ufd = attr->map_fd;`
			`struct fd f = fdget(ufd);`
			`struct bpf_map *map;`
			`void *key;`
			`int err;`

			`if (CHECK_ATTR(BPF_MAP_DELETE_ELEM))`
			`return -EINVAL;`

			`map = bpf_map_get(f);`
			`if (IS_ERR(map))`
			`return PTR_ERR(map);`

			`err = -ENOMEM;`
			`key = kmalloc(map->key_size, GFP_USER);`
			`if (!key)`
			`goto err_put;`

			`err = -EFAULT;`
			`if (copy_from_user(key, ukey, map->key_size) != 0)`
			`goto free_key;`

			`rcu_read_lock();`
			`err = map->ops->map_delete_elem(map, key);`
			`rcu_read_unlock();`

			`free_key:`
			`kfree(key);`
			`err_put:`
			`fdput(f);`
			`return err;`
			`}`

			`/* last field in 'union bpf_attr' used by this command */`
			`#define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key`

			`static int map_get_next_key(union bpf_attr *attr)`
			`{`
			`void __user *ukey = u64_to_ptr(attr->key);`
			`void __user *unext_key = u64_to_ptr(attr->next_key);`
			`int ufd = attr->map_fd;`
			`struct fd f = fdget(ufd);`
			`struct bpf_map *map;`
			`void *key, *next_key;`
			`int err;`

			`if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY))`
			`return -EINVAL;`

			`map = bpf_map_get(f);`
			`if (IS_ERR(map))`
			`return PTR_ERR(map);`

			`err = -ENOMEM;`
			`key = kmalloc(map->key_size, GFP_USER);`
			`if (!key)`
			`goto err_put;`

			`err = -EFAULT;`
			`if (copy_from_user(key, ukey, map->key_size) != 0)`
			`goto free_key;`

			`err = -ENOMEM;`
			`next_key = kmalloc(map->key_size, GFP_USER);`
			`if (!next_key)`
			`goto free_key;`

			`rcu_read_lock();`
			`err = map->ops->map_get_next_key(map, key, next_key);`
			`rcu_read_unlock();`
			`if (err)`
			`goto free_next_key;`

			`err = -EFAULT;`
			`if (copy_to_user(unext_key, next_key, map->key_size) != 0)`
			`goto free_next_key;`

			`err = 0;`

			`free_next_key:`
			`kfree(next_key);`
			`free_key:`
			`kfree(key);`
			`err_put:`
			`fdput(f);`
			`return err;`
			`}`

bpf: introduce BPF syscall and maps BPF syscall is a multiplexor for a range of different operations on eBPF. This patch introduces syscall with single command to create a map. Next patch adds commands to access maps. 'maps' is a generic storage of different types for sharing data between kernel and userspace. Userspace example: /* this syscall wrapper creates a map with given type and attributes * and returns map_fd on success. * use close(map_fd) to delete the map */ int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size, int max_entries) { union bpf_attr attr = { .map_type = map_type, .key_size = key_size, .value_size = value_size, .max_entries = max_entries }; return bpf(BPF_MAP_CREATE, &attr, sizeof(attr)); } 'union bpf_attr' is backwards compatible with future extensions. More details in Documentation/networking/filter.txt and in manpage Signed-off-by: Alexei Starovoitov <ast@plumgrid.com> Signed-off-by: David S. Miller <davem@davemloft.net> 2014-09-26 07:16:57 +00:00			`SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)`
			`{`
			`union bpf_attr attr = {};`
			`int err;`

			`/* the syscall is limited to root temporarily. This restriction will be`
			`* lifted when security audit is clean. Note that eBPF+tracing must have`
			`* this restriction, since it may pass kernel data to user space`
			`*/`
			`if (!capable(CAP_SYS_ADMIN))`
			`return -EPERM;`

			`if (!access_ok(VERIFY_READ, uattr, 1))`
			`return -EFAULT;`

			`if (size > PAGE_SIZE) /* silly large */`
			`return -E2BIG;`

			`/* If we're handed a bigger struct than we know of,`
			`* ensure all the unknown bits are 0 - i.e. new`
			`* user-space does not rely on any kernel feature`
			`* extensions we dont know about yet.`
			`*/`
			`if (size > sizeof(attr)) {`
			`unsigned char __user *addr;`
			`unsigned char __user *end;`
			`unsigned char val;`

			`addr = (void __user *)uattr + sizeof(attr);`
			`end = (void __user *)uattr + size;`

			`for (; addr < end; addr++) {`
			`err = get_user(val, addr);`
			`if (err)`
			`return err;`
			`if (val)`
			`return -E2BIG;`
			`}`
			`size = sizeof(attr);`
			`}`

			`/* copy attributes from user space, may be less than sizeof(bpf_attr) */`
			`if (copy_from_user(&attr, uattr, size) != 0)`
			`return -EFAULT;`

			`switch (cmd) {`
			`case BPF_MAP_CREATE:`
			`err = map_create(&attr);`
			`break;`
bpf: add lookup/update/delete/iterate methods to BPF maps 'maps' is a generic storage of different types for sharing data between kernel and userspace. The maps are accessed from user space via BPF syscall, which has commands: - create a map with given type and attributes fd = bpf(BPF_MAP_CREATE, union bpf_attr attr, u32 size) returns fd or negative error - lookup key in a given map referenced by fd err = bpf(BPF_MAP_LOOKUP_ELEM, union bpf_attr attr, u32 size) using attr->map_fd, attr->key, attr->value returns zero and stores found elem into value or negative error - create or update key/value pair in a given map err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr attr, u32 size) using attr->map_fd, attr->key, attr->value returns zero or negative error - find and delete element by key in a given map err = bpf(BPF_MAP_DELETE_ELEM, union bpf_attr attr, u32 size) using attr->map_fd, attr->key - iterate map elements (based on input key return next_key) err = bpf(BPF_MAP_GET_NEXT_KEY, union bpf_attr *attr, u32 size) using attr->map_fd, attr->key, attr->next_key - close(fd) deletes the map Signed-off-by: Alexei Starovoitov <ast@plumgrid.com> Signed-off-by: David S. Miller <davem@davemloft.net> 2014-09-26 07:16:59 +00:00			`case BPF_MAP_LOOKUP_ELEM:`
			`err = map_lookup_elem(&attr);`
			`break;`
			`case BPF_MAP_UPDATE_ELEM:`
			`err = map_update_elem(&attr);`
			`break;`
			`case BPF_MAP_DELETE_ELEM:`
			`err = map_delete_elem(&attr);`
			`break;`
			`case BPF_MAP_GET_NEXT_KEY:`
			`err = map_get_next_key(&attr);`
			`break;`
bpf: introduce BPF syscall and maps BPF syscall is a multiplexor for a range of different operations on eBPF. This patch introduces syscall with single command to create a map. Next patch adds commands to access maps. 'maps' is a generic storage of different types for sharing data between kernel and userspace. Userspace example: /* this syscall wrapper creates a map with given type and attributes * and returns map_fd on success. * use close(map_fd) to delete the map */ int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size, int max_entries) { union bpf_attr attr = { .map_type = map_type, .key_size = key_size, .value_size = value_size, .max_entries = max_entries }; return bpf(BPF_MAP_CREATE, &attr, sizeof(attr)); } 'union bpf_attr' is backwards compatible with future extensions. More details in Documentation/networking/filter.txt and in manpage Signed-off-by: Alexei Starovoitov <ast@plumgrid.com> Signed-off-by: David S. Miller <davem@davemloft.net> 2014-09-26 07:16:57 +00:00			`default:`
			`err = -EINVAL;`
			`break;`
			`}`

			`return err;`
			`}`