2014-09-26 00:16:57 -07:00
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
*
* This program is free software ; you can redistribute it and / or
* modify it under the terms of version 2 of the GNU General Public
* License as published by the Free Software Foundation .
*
* This program is distributed in the hope that it will be useful , but
* WITHOUT ANY WARRANTY ; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the GNU
* General Public License for more details .
*/
# include <linux/bpf.h>
# include <linux/syscalls.h>
# include <linux/slab.h>
# include <linux/string.h>
# include <linux/anon_inodes.h>
bpf: add lookup/update/delete/iterate methods to BPF maps
'maps' is a generic storage of different types for sharing data between kernel
and userspace.
The maps are accessed from user space via BPF syscall, which has commands:
- create a map with given type and attributes
fd = bpf(BPF_MAP_CREATE, union bpf_attr *attr, u32 size)
returns fd or negative error
- lookup key in a given map referenced by fd
err = bpf(BPF_MAP_LOOKUP_ELEM, union bpf_attr *attr, u32 size)
using attr->map_fd, attr->key, attr->value
returns zero and stores found elem into value or negative error
- create or update key/value pair in a given map
err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size)
using attr->map_fd, attr->key, attr->value
returns zero or negative error
- find and delete element by key in a given map
err = bpf(BPF_MAP_DELETE_ELEM, union bpf_attr *attr, u32 size)
using attr->map_fd, attr->key
- iterate map elements (based on input key return next_key)
err = bpf(BPF_MAP_GET_NEXT_KEY, union bpf_attr *attr, u32 size)
using attr->map_fd, attr->key, attr->next_key
- close(fd) deletes the map
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-09-26 00:16:59 -07:00
# include <linux/file.h>
2014-09-26 00:17:00 -07:00
# include <linux/license.h>
# include <linux/filter.h>
2014-09-26 00:16:57 -07:00
/* every map implementation registered via bpf_register_map_type() */
static LIST_HEAD(bpf_map_types);

/* Search the registered map types for attr->map_type and let the matching
 * implementation allocate the map.
 *
 * Returns the new map with ->ops and ->map_type filled in, the ERR_PTR
 * produced by the implementation's map_alloc() callback, or
 * ERR_PTR(-EINVAL) when no registered type matches.
 */
static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
{
	struct bpf_map_type_list *entry;

	list_for_each_entry(entry, &bpf_map_types, list_node) {
		struct bpf_map *new_map;

		if (entry->type != attr->map_type)
			continue;

		new_map = entry->ops->map_alloc(attr);
		if (!IS_ERR(new_map)) {
			/* only a successfully allocated map gets initialized */
			new_map->ops = entry->ops;
			new_map->map_type = attr->map_type;
		}
		return new_map;
	}

	return ERR_PTR(-EINVAL);
}
/* boot time registration of different map implementations */
void bpf_register_map_type ( struct bpf_map_type_list * tl )
{
list_add ( & tl - > list_node , & bpf_map_types ) ;
}
/* called from workqueue */
static void bpf_map_free_deferred ( struct work_struct * work )
{
struct bpf_map * map = container_of ( work , struct bpf_map , work ) ;
/* implementation dependent freeing */
map - > ops - > map_free ( map ) ;
}
/* decrement map refcnt and schedule it for freeing via workqueue
* ( unrelying map implementation ops - > map_free ( ) might sleep )
*/
void bpf_map_put ( struct bpf_map * map )
{
if ( atomic_dec_and_test ( & map - > refcnt ) ) {
INIT_WORK ( & map - > work , bpf_map_free_deferred ) ;
schedule_work ( & map - > work ) ;
}
}
static int bpf_map_release ( struct inode * inode , struct file * filp )
{
struct bpf_map * map = filp - > private_data ;
bpf_map_put ( map ) ;
return 0 ;
}
/* File operations of the anonymous inode created by map_create().
 * Only ->release is provided; bpf_map_get() compares a file's f_op against
 * this table to recognize map file descriptors.
 */
static const struct file_operations bpf_map_fops = {
. release = bpf_map_release ,
} ;
/* helper macro to check that unused fields of 'union bpf_attr' are zero.
 *
 * CMD is a command name for which a CMD_LAST_FIELD macro naming the last
 * field used by that command has been defined. The macro expects a local
 * 'union bpf_attr *attr' in scope and uses memchr_inv() to scan every byte
 * from the end of that last field to the end of the union, looking for a
 * byte that is not 0.
 *
 * Evaluates to non-zero (true) when such a byte is found, i.e. when the
 * caller passed data in fields this command does not use.
 */
# define CHECK_ATTR(CMD) \
memchr_inv ( ( void * ) & attr - > CMD # # _LAST_FIELD + \
sizeof ( attr - > CMD # # _LAST_FIELD ) , 0 , \
sizeof ( * attr ) - \
offsetof ( union bpf_attr , CMD # # _LAST_FIELD ) - \
sizeof ( attr - > CMD # # _LAST_FIELD ) ) ! = NULL
# define BPF_MAP_CREATE_LAST_FIELD max_entries
/* called via syscall */
static int map_create ( union bpf_attr * attr )
{
struct bpf_map * map ;
int err ;
err = CHECK_ATTR ( BPF_MAP_CREATE ) ;
if ( err )
return - EINVAL ;
/* find map type and init map: hashtable vs rbtree vs bloom vs ... */
map = find_and_alloc_map ( attr ) ;
if ( IS_ERR ( map ) )
return PTR_ERR ( map ) ;
atomic_set ( & map - > refcnt , 1 ) ;
err = anon_inode_getfd ( " bpf-map " , & bpf_map_fops , map , O_RDWR | O_CLOEXEC ) ;
if ( err < 0 )
/* failed to allocate fd */
goto free_map ;
return err ;
free_map :
map - > ops - > map_free ( map ) ;
return err ;
}
bpf: add lookup/update/delete/iterate methods to BPF maps
'maps' is a generic storage of different types for sharing data between kernel
and userspace.
The maps are accessed from user space via BPF syscall, which has commands:
- create a map with given type and attributes
fd = bpf(BPF_MAP_CREATE, union bpf_attr *attr, u32 size)
returns fd or negative error
- lookup key in a given map referenced by fd
err = bpf(BPF_MAP_LOOKUP_ELEM, union bpf_attr *attr, u32 size)
using attr->map_fd, attr->key, attr->value
returns zero and stores found elem into value or negative error
- create or update key/value pair in a given map
err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size)
using attr->map_fd, attr->key, attr->value
returns zero or negative error
- find and delete element by key in a given map
err = bpf(BPF_MAP_DELETE_ELEM, union bpf_attr *attr, u32 size)
using attr->map_fd, attr->key
- iterate map elements (based on input key return next_key)
err = bpf(BPF_MAP_GET_NEXT_KEY, union bpf_attr *attr, u32 size)
using attr->map_fd, attr->key, attr->next_key
- close(fd) deletes the map
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-09-26 00:16:59 -07:00
/* if error is returned, fd is released.
* On success caller should complete fd access with matching fdput ( )
*/
struct bpf_map * bpf_map_get ( struct fd f )
{
struct bpf_map * map ;
if ( ! f . file )
return ERR_PTR ( - EBADF ) ;
if ( f . file - > f_op ! = & bpf_map_fops ) {
fdput ( f ) ;
return ERR_PTR ( - EINVAL ) ;
}
map = f . file - > private_data ;
return map ;
}
/* helper to convert user pointers passed inside __aligned_u64 fields:
 * the value is narrowed to unsigned long before being cast to a pointer
 */
static void __user *u64_to_ptr(__u64 val)
{
	unsigned long addr = (unsigned long)val;

	return (void __user *)addr;
}
/* last field in 'union bpf_attr' used by this command */
# define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value
static int map_lookup_elem ( union bpf_attr * attr )
{
void __user * ukey = u64_to_ptr ( attr - > key ) ;
void __user * uvalue = u64_to_ptr ( attr - > value ) ;
int ufd = attr - > map_fd ;
struct fd f = fdget ( ufd ) ;
struct bpf_map * map ;
void * key , * value ;
int err ;
if ( CHECK_ATTR ( BPF_MAP_LOOKUP_ELEM ) )
return - EINVAL ;
map = bpf_map_get ( f ) ;
if ( IS_ERR ( map ) )
return PTR_ERR ( map ) ;
err = - ENOMEM ;
key = kmalloc ( map - > key_size , GFP_USER ) ;
if ( ! key )
goto err_put ;
err = - EFAULT ;
if ( copy_from_user ( key , ukey , map - > key_size ) ! = 0 )
goto free_key ;
err = - ESRCH ;
rcu_read_lock ( ) ;
value = map - > ops - > map_lookup_elem ( map , key ) ;
if ( ! value )
goto err_unlock ;
err = - EFAULT ;
if ( copy_to_user ( uvalue , value , map - > value_size ) ! = 0 )
goto err_unlock ;
err = 0 ;
err_unlock :
rcu_read_unlock ( ) ;
free_key :
kfree ( key ) ;
err_put :
fdput ( f ) ;
return err ;
}
# define BPF_MAP_UPDATE_ELEM_LAST_FIELD value
static int map_update_elem ( union bpf_attr * attr )
{
void __user * ukey = u64_to_ptr ( attr - > key ) ;
void __user * uvalue = u64_to_ptr ( attr - > value ) ;
int ufd = attr - > map_fd ;
struct fd f = fdget ( ufd ) ;
struct bpf_map * map ;
void * key , * value ;
int err ;
if ( CHECK_ATTR ( BPF_MAP_UPDATE_ELEM ) )
return - EINVAL ;
map = bpf_map_get ( f ) ;
if ( IS_ERR ( map ) )
return PTR_ERR ( map ) ;
err = - ENOMEM ;
key = kmalloc ( map - > key_size , GFP_USER ) ;
if ( ! key )
goto err_put ;
err = - EFAULT ;
if ( copy_from_user ( key , ukey , map - > key_size ) ! = 0 )
goto free_key ;
err = - ENOMEM ;
value = kmalloc ( map - > value_size , GFP_USER ) ;
if ( ! value )
goto free_key ;
err = - EFAULT ;
if ( copy_from_user ( value , uvalue , map - > value_size ) ! = 0 )
goto free_value ;
/* eBPF program that use maps are running under rcu_read_lock(),
* therefore all map accessors rely on this fact , so do the same here
*/
rcu_read_lock ( ) ;
err = map - > ops - > map_update_elem ( map , key , value ) ;
rcu_read_unlock ( ) ;
free_value :
kfree ( value ) ;
free_key :
kfree ( key ) ;
err_put :
fdput ( f ) ;
return err ;
}
# define BPF_MAP_DELETE_ELEM_LAST_FIELD key
static int map_delete_elem ( union bpf_attr * attr )
{
void __user * ukey = u64_to_ptr ( attr - > key ) ;
int ufd = attr - > map_fd ;
struct fd f = fdget ( ufd ) ;
struct bpf_map * map ;
void * key ;
int err ;
if ( CHECK_ATTR ( BPF_MAP_DELETE_ELEM ) )
return - EINVAL ;
map = bpf_map_get ( f ) ;
if ( IS_ERR ( map ) )
return PTR_ERR ( map ) ;
err = - ENOMEM ;
key = kmalloc ( map - > key_size , GFP_USER ) ;
if ( ! key )
goto err_put ;
err = - EFAULT ;
if ( copy_from_user ( key , ukey , map - > key_size ) ! = 0 )
goto free_key ;
rcu_read_lock ( ) ;
err = map - > ops - > map_delete_elem ( map , key ) ;
rcu_read_unlock ( ) ;
free_key :
kfree ( key ) ;
err_put :
fdput ( f ) ;
return err ;
}
/* last field in 'union bpf_attr' used by this command */
# define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key
static int map_get_next_key ( union bpf_attr * attr )
{
void __user * ukey = u64_to_ptr ( attr - > key ) ;
void __user * unext_key = u64_to_ptr ( attr - > next_key ) ;
int ufd = attr - > map_fd ;
struct fd f = fdget ( ufd ) ;
struct bpf_map * map ;
void * key , * next_key ;
int err ;
if ( CHECK_ATTR ( BPF_MAP_GET_NEXT_KEY ) )
return - EINVAL ;
map = bpf_map_get ( f ) ;
if ( IS_ERR ( map ) )
return PTR_ERR ( map ) ;
err = - ENOMEM ;
key = kmalloc ( map - > key_size , GFP_USER ) ;
if ( ! key )
goto err_put ;
err = - EFAULT ;
if ( copy_from_user ( key , ukey , map - > key_size ) ! = 0 )
goto free_key ;
err = - ENOMEM ;
next_key = kmalloc ( map - > key_size , GFP_USER ) ;
if ( ! next_key )
goto free_key ;
rcu_read_lock ( ) ;
err = map - > ops - > map_get_next_key ( map , key , next_key ) ;
rcu_read_unlock ( ) ;
if ( err )
goto free_next_key ;
err = - EFAULT ;
if ( copy_to_user ( unext_key , next_key , map - > key_size ) ! = 0 )
goto free_next_key ;
err = 0 ;
free_next_key :
kfree ( next_key ) ;
free_key :
kfree ( key ) ;
err_put :
fdput ( f ) ;
return err ;
}
2014-09-26 00:17:00 -07:00
/* every program type registered via bpf_register_prog_type() */
static LIST_HEAD(bpf_prog_types);

/* Look up @type among the registered program types and, when found, record
 * the type and its ops in prog->aux.
 *
 * Returns 0 on success or -EINVAL when the type is not registered.
 */
static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
{
	struct bpf_prog_type_list *entry;

	list_for_each_entry(entry, &bpf_prog_types, list_node) {
		if (entry->type != type)
			continue;

		prog->aux->ops = entry->ops;
		prog->aux->prog_type = type;
		return 0;
	}

	return -EINVAL;
}
void bpf_register_prog_type ( struct bpf_prog_type_list * tl )
{
list_add ( & tl - > list_node , & bpf_prog_types ) ;
}
2014-09-26 00:17:01 -07:00
/* fixup insn->imm field of bpf_call instructions:
* if ( insn - > imm = = BPF_FUNC_map_lookup_elem )
* insn - > imm = bpf_map_lookup_elem - __bpf_call_base ;
* else if ( insn - > imm = = BPF_FUNC_map_update_elem )
* insn - > imm = bpf_map_update_elem - __bpf_call_base ;
* else . . .
*
* this function is called after eBPF program passed verification
*/
static void fixup_bpf_calls ( struct bpf_prog * prog )
{
const struct bpf_func_proto * fn ;
int i ;
for ( i = 0 ; i < prog - > len ; i + + ) {
struct bpf_insn * insn = & prog - > insnsi [ i ] ;
if ( insn - > code = = ( BPF_JMP | BPF_CALL ) ) {
/* we reach here when program has bpf_call instructions
* and it passed bpf_check ( ) , means that
* ops - > get_func_proto must have been supplied , check it
*/
BUG_ON ( ! prog - > aux - > ops - > get_func_proto ) ;
fn = prog - > aux - > ops - > get_func_proto ( insn - > imm ) ;
/* all functions that have prototype and verifier allowed
* programs to call them , must be real in - kernel functions
*/
BUG_ON ( ! fn - > func ) ;
insn - > imm = fn - > func - __bpf_call_base ;
}
}
}
2014-09-26 00:17:00 -07:00
/* drop refcnt on maps used by eBPF program and free auxilary data */
static void free_used_maps ( struct bpf_prog_aux * aux )
{
int i ;
for ( i = 0 ; i < aux - > used_map_cnt ; i + + )
bpf_map_put ( aux - > used_maps [ i ] ) ;
kfree ( aux - > used_maps ) ;
}
void bpf_prog_put ( struct bpf_prog * prog )
{
if ( atomic_dec_and_test ( & prog - > aux - > refcnt ) ) {
free_used_maps ( prog - > aux ) ;
bpf_prog_free ( prog ) ;
}
}
static int bpf_prog_release ( struct inode * inode , struct file * filp )
{
struct bpf_prog * prog = filp - > private_data ;
bpf_prog_put ( prog ) ;
return 0 ;
}
/* File operations of the anonymous inode created by bpf_prog_load().
 * Only ->release is provided; get_prog() compares a file's f_op against
 * this table to recognize program file descriptors.
 */
static const struct file_operations bpf_prog_fops = {
. release = bpf_prog_release ,
} ;
static struct bpf_prog * get_prog ( struct fd f )
{
struct bpf_prog * prog ;
if ( ! f . file )
return ERR_PTR ( - EBADF ) ;
if ( f . file - > f_op ! = & bpf_prog_fops ) {
fdput ( f ) ;
return ERR_PTR ( - EINVAL ) ;
}
prog = f . file - > private_data ;
return prog ;
}
/* called by sockets/tracing/seccomp before attaching program to an event.
 *
 * Resolves @ufd to its program and takes an additional reference on it;
 * pairs with bpf_prog_put(). Returns the program, or the ERR_PTR from
 * get_prog() when @ufd does not refer to a bpf program.
 */
struct bpf_prog *bpf_prog_get(u32 ufd)
{
	struct fd f = fdget(ufd);
	struct bpf_prog *prog = get_prog(f);

	if (!IS_ERR(prog)) {
		/* hold the program itself, then let go of the fd */
		atomic_inc(&prog->aux->refcnt);
		fdput(f);
	}

	return prog;
}
/* last field in 'union bpf_attr' used by this command */
bpf: verifier (add ability to receive verification log)
add optional attributes for BPF_PROG_LOAD syscall:
union bpf_attr {
struct {
...
__u32 log_level; /* verbosity level of eBPF verifier */
__u32 log_size; /* size of user buffer */
__aligned_u64 log_buf; /* user supplied 'char *buffer' */
};
};
when log_level > 0 the verifier will return its verification log in the user
supplied buffer 'log_buf' which can be used by program author to analyze why
verifier rejected given program.
'Understanding eBPF verifier messages' section of Documentation/networking/filter.txt
provides several examples of these messages, like the program:
BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
BPF_LD_MAP_FD(BPF_REG_1, 0),
BPF_CALL_FUNC(BPF_FUNC_map_lookup_elem),
BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
BPF_ST_MEM(BPF_DW, BPF_REG_0, 4, 0),
BPF_EXIT_INSN(),
will be rejected with the following multi-line message in log_buf:
0: (7a) *(u64 *)(r10 -8) = 0
1: (bf) r2 = r10
2: (07) r2 += -8
3: (b7) r1 = 0
4: (85) call 1
5: (15) if r0 == 0x0 goto pc+1
R0=map_ptr R10=fp
6: (7a) *(u64 *)(r0 +4) = 0
misaligned access off 4 size 8
The format of the output can change at any time as verifier evolves.
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-09-26 00:17:03 -07:00
# define BPF_PROG_LOAD_LAST_FIELD log_buf
2014-09-26 00:17:00 -07:00
/* BPF_PROG_LOAD handler.
 *
 * Copies the license string and the instructions from user space, binds the
 * program to its registered type, runs the verifier, patches call
 * instructions, selects the runtime and finally wraps the program in an
 * anonymous-inode file descriptor.
 *
 * Returns the new fd on success, or a negative error:
 *   -EINVAL  fields past 'log_buf' are non-zero, insn_cnt >= BPF_MAXINSNS,
 *            or the program type is not registered
 *   -EFAULT  the license or the instructions could not be read
 *   -ENOMEM  the program could not be allocated
 *   or whatever bpf_check() / anon_inode_getfd() report.
 */
static int bpf_prog_load ( union bpf_attr * attr )
{
enum bpf_prog_type type = attr - > prog_type ;
struct bpf_prog * prog ;
int err ;
char license [ 128 ] ;
bool is_gpl ;
if ( CHECK_ATTR ( BPF_PROG_LOAD ) )
return - EINVAL ;
/* copy eBPF program license from user space */
if ( strncpy_from_user ( license , u64_to_ptr ( attr - > license ) ,
sizeof ( license ) - 1 ) < 0 )
return - EFAULT ;
/* at most sizeof(license) - 1 bytes were copied; force termination */
license [ sizeof ( license ) - 1 ] = 0 ;
/* eBPF programs must be GPL compatible to use GPL-ed functions */
is_gpl = license_is_gpl_compatible ( license ) ;
if ( attr - > insn_cnt > = BPF_MAXINSNS )
return - EINVAL ;
/* plain bpf_prog allocation */
prog = bpf_prog_alloc ( bpf_prog_size ( attr - > insn_cnt ) , GFP_USER ) ;
if ( ! prog )
return - ENOMEM ;
prog - > len = attr - > insn_cnt ;
err = - EFAULT ;
if ( copy_from_user ( prog - > insns , u64_to_ptr ( attr - > insns ) ,
prog - > len * sizeof ( struct bpf_insn ) ) ! = 0 )
goto free_prog ;
prog - > orig_prog = NULL ;
prog - > jited = false ;
/* initial reference, dropped by bpf_prog_release() via bpf_prog_put() */
atomic_set ( & prog - > aux - > refcnt , 1 ) ;
prog - > aux - > is_gpl_compatible = is_gpl ;
/* find program type: socket_filter vs tracing_filter */
err = find_prog_type ( type , prog ) ;
if ( err < 0 )
goto free_prog ;
/* run eBPF verifier */
2014-09-26 00:17:02 -07:00
err = bpf_check ( prog , attr ) ;
2014-09-26 00:17:00 -07:00
if ( err < 0 )
goto free_used_maps ;
2014-09-26 00:17:01 -07:00
/* fixup BPF_CALL->imm field */
fixup_bpf_calls ( prog ) ;
2014-09-26 00:17:00 -07:00
/* eBPF program is ready to be JITed */
bpf_prog_select_runtime ( prog ) ;
err = anon_inode_getfd ( " bpf-prog " , & bpf_prog_fops , prog , O_RDWR | O_CLOEXEC ) ;
if ( err < 0 )
/* failed to allocate fd */
goto free_used_maps ;
return err ;
/* the label shares its name with the free_used_maps() function it calls;
 * it is reached once bpf_check() has run, on its failure or on fd failure
 */
free_used_maps :
free_used_maps ( prog - > aux ) ;
free_prog :
bpf_prog_free ( prog ) ;
return err ;
}
2014-09-26 00:16:57 -07:00
/* bpf(2) system call entry point.
 *
 * @cmd:   which BPF_* command to run
 * @uattr: user pointer to the command's 'union bpf_attr'
 * @size:  number of bytes user space supplied at @uattr
 *
 * Copies at most sizeof(union bpf_attr) bytes of attributes into a
 * zero-initialized local copy and dispatches to the per-command handler.
 *
 * Returns the handler's result, or:
 *   -EPERM   caller lacks CAP_SYS_ADMIN
 *   -EFAULT  @uattr is not readable
 *   -E2BIG   @size exceeds PAGE_SIZE, or bytes beyond the attr layout known
 *            to this kernel are non-zero
 *   -EINVAL  unknown @cmd
 */
SYSCALL_DEFINE3 ( bpf , int , cmd , union bpf_attr __user * , uattr , unsigned int , size )
{
/* zero-initialized: when size < sizeof(attr) the tail stays zero */
union bpf_attr attr = { } ;
int err ;
/* the syscall is limited to root temporarily. This restriction will be
 * lifted when security audit is clean. Note that eBPF+tracing must have
 * this restriction, since it may pass kernel data to user space
 */
if ( ! capable ( CAP_SYS_ADMIN ) )
return - EPERM ;
if ( ! access_ok ( VERIFY_READ , uattr , 1 ) )
return - EFAULT ;
if ( size > PAGE_SIZE ) /* silly large */
return - E2BIG ;
/* If we're handed a bigger struct than we know of,
 * ensure all the unknown bits are 0 - i.e. new
 * user-space does not rely on any kernel feature
 * extensions we don't know about yet.
 */
if ( size > sizeof ( attr ) ) {
unsigned char __user * addr ;
unsigned char __user * end ;
unsigned char val ;
addr = ( void __user * ) uattr + sizeof ( attr ) ;
end = ( void __user * ) uattr + size ;
/* check the excess bytes one at a time */
for ( ; addr < end ; addr + + ) {
err = get_user ( val , addr ) ;
if ( err )
return err ;
if ( val )
return - E2BIG ;
}
/* the excess was all zero: only copy the part we understand */
size = sizeof ( attr ) ;
}
/* copy attributes from user space, may be less than sizeof(bpf_attr) */
if ( copy_from_user ( & attr , uattr , size ) ! = 0 )
return - EFAULT ;
/* dispatch to the handler of the requested command */
switch ( cmd ) {
case BPF_MAP_CREATE :
err = map_create ( & attr ) ;
break ;
bpf: add lookup/update/delete/iterate methods to BPF maps
'maps' is a generic storage of different types for sharing data between kernel
and userspace.
The maps are accessed from user space via BPF syscall, which has commands:
- create a map with given type and attributes
fd = bpf(BPF_MAP_CREATE, union bpf_attr *attr, u32 size)
returns fd or negative error
- lookup key in a given map referenced by fd
err = bpf(BPF_MAP_LOOKUP_ELEM, union bpf_attr *attr, u32 size)
using attr->map_fd, attr->key, attr->value
returns zero and stores found elem into value or negative error
- create or update key/value pair in a given map
err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size)
using attr->map_fd, attr->key, attr->value
returns zero or negative error
- find and delete element by key in a given map
err = bpf(BPF_MAP_DELETE_ELEM, union bpf_attr *attr, u32 size)
using attr->map_fd, attr->key
- iterate map elements (based on input key return next_key)
err = bpf(BPF_MAP_GET_NEXT_KEY, union bpf_attr *attr, u32 size)
using attr->map_fd, attr->key, attr->next_key
- close(fd) deletes the map
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-09-26 00:16:59 -07:00
case BPF_MAP_LOOKUP_ELEM :
err = map_lookup_elem ( & attr ) ;
break ;
case BPF_MAP_UPDATE_ELEM :
err = map_update_elem ( & attr ) ;
break ;
case BPF_MAP_DELETE_ELEM :
err = map_delete_elem ( & attr ) ;
break ;
case BPF_MAP_GET_NEXT_KEY :
err = map_get_next_key ( & attr ) ;
break ;
2014-09-26 00:17:00 -07:00
case BPF_PROG_LOAD :
err = bpf_prog_load ( & attr ) ;
break ;
2014-09-26 00:16:57 -07:00
default :
err = - EINVAL ;
break ;
}
return err ;
}