2019-06-04 10:10:45 +02:00
// SPDX-License-Identifier: GPL-2.0-only
2017-01-21 17:26:11 +01:00
/*
* Longest prefix match list implementation
*
* Copyright ( c ) 2016 , 2017 Daniel Mack
* Copyright ( c ) 2016 David Herrmann
*/
# include <linux/bpf.h>
2018-08-12 01:59:17 +02:00
# include <linux/btf.h>
2017-01-21 17:26:11 +01:00
# include <linux/err.h>
# include <linux/slab.h>
# include <linux/spinlock.h>
# include <linux/vmalloc.h>
# include <net/ipv6.h>
2018-08-12 01:59:17 +02:00
# include <uapi/linux/btf.h>
2022-04-25 21:32:47 +08:00
# include <linux/btf_ids.h>
2017-01-21 17:26:11 +01:00
/* Set in lpm_trie_node.flags for an intermediate node, i.e. one that only
 * exists to branch the traversal and carries no value.
 */
#define LPM_TREE_NODE_FLAG_IM BIT(0)

struct lpm_trie_node;

/* One node of the trie.
 *
 * @rcu:	head passed to kfree_rcu() when the node is released
 * @child:	RCU-protected children, indexed by the key bit that follows
 *		this node's prefix
 * @prefixlen:	number of leading bits of @data that are significant
 * @flags:	LPM_TREE_NODE_FLAG_* bits
 * @data:	trie->data_size bytes of key data; nodes allocated with a
 *		value carry map.value_size bytes of value right after it
 */
struct lpm_trie_node {
	struct rcu_head rcu;
	struct lpm_trie_node __rcu *child[2];
	u32 prefixlen;
	u32 flags;
	u8 data[];
};
/* Per-map state of an LPM trie.
 *
 * @map:		embedded generic map; container_of() on it yields the trie
 * @root:		RCU-protected top node, NULL while the trie is empty
 * @n_entries:		number of elements counted against map.max_entries
 * @max_prefixlen:	largest accepted prefix length, data_size * 8
 * @data_size:		number of key data bytes (key_size minus the
 *			bpf_lpm_trie_key header)
 * @lock:		taken by update and delete around tree modifications
 */
struct lpm_trie {
	struct bpf_map map;
	struct lpm_trie_node __rcu *root;
	size_t n_entries;
	size_t max_prefixlen;
	size_t data_size;
	spinlock_t lock;
};
/* This trie implements a longest prefix match algorithm that can be used to
* match IP addresses to a stored set of ranges .
*
* Data stored in @ data of struct bpf_lpm_key and struct lpm_trie_node is
* interpreted as big endian , so data [ 0 ] stores the most significant byte .
*
* Match ranges are internally stored in instances of struct lpm_trie_node
* which each contain their prefix length as well as two pointers that may
* lead to more nodes containing more specific matches . Each node also stores
* a value that is defined by and returned to userspace via the update_elem
* and lookup functions .
*
* For instance , let ' s start with a trie that was created with a prefix length
* of 32 , so it can be used for IPv4 addresses , and one single element that
* matches 192.168 .0 .0 / 16. The data array would hence contain
* [ 0xc0 , 0xa8 , 0x00 , 0x00 ] in big - endian notation . This documentation will
* stick to IP - address notation for readability though .
*
* As the trie is empty initially, the new node (1) will be placed as root
* node, denoted as (R) in the example below. As there are no other nodes, both
* child pointers are %NULL.
*
* + - - - - - - - - - - - - - - - - +
* | ( 1 ) ( R ) |
* | 192.168 .0 .0 / 16 |
* | value : 1 |
* | [ 0 ] [ 1 ] |
* + - - - - - - - - - - - - - - - - +
*
* Next, let's add a new node (2) matching 192.168.0.0/24. As there is already
* a node with the same data and a smaller prefix (i.e., a less specific one),
* node (2) will become a child of (1). The child index depends on the next bit
* that is outside of what (1) matches, and that bit is 0, so (2) will be
* child[0] of (1):
*
* + - - - - - - - - - - - - - - - - +
* | ( 1 ) ( R ) |
* | 192.168 .0 .0 / 16 |
* | value : 1 |
* | [ 0 ] [ 1 ] |
* + - - - - - - - - - - - - - - - - +
* |
* + - - - - - - - - - - - - - - - - +
* | ( 2 ) |
* | 192.168 .0 .0 / 24 |
* | value : 2 |
* | [ 0 ] [ 1 ] |
* + - - - - - - - - - - - - - - - - +
*
* The child [ 1 ] slot of ( 1 ) could be filled with another node which has bit # 17
* ( the next bit after the ones that ( 1 ) matches on ) set to 1. For instance ,
* 192.168 .128 .0 / 24 :
*
* + - - - - - - - - - - - - - - - - +
* | ( 1 ) ( R ) |
* | 192.168 .0 .0 / 16 |
* | value : 1 |
* | [ 0 ] [ 1 ] |
* + - - - - - - - - - - - - - - - - +
* | |
* + - - - - - - - - - - - - - - - - + + - - - - - - - - - - - - - - - - - - +
* | ( 2 ) | | ( 3 ) |
* | 192.168 .0 .0 / 24 | | 192.168 .128 .0 / 24 |
* | value : 2 | | value : 3 |
* | [ 0 ] [ 1 ] | | [ 0 ] [ 1 ] |
* + - - - - - - - - - - - - - - - - + + - - - - - - - - - - - - - - - - - - +
*
* Let's add another node (5) to the game for 192.168.1.0/24. In order to place
* it, node (1) is looked at first, and because of the semantics laid out
* above (bit #17 is 0), (5) would normally be attached to (1) as child[0].
* However, that slot is already allocated, so a new node (4) is needed in
* between. That node does not have a value attached to it and it will never be
* returned to users as result of a lookup. It is only there to differentiate
* the traversal further. It will get a prefix as wide as necessary to
* distinguish its two children:
*
* + - - - - - - - - - - - - - - - - +
* | ( 1 ) ( R ) |
* | 192.168 .0 .0 / 16 |
* | value : 1 |
* | [ 0 ] [ 1 ] |
* + - - - - - - - - - - - - - - - - +
* | |
* + - - - - - - - - - - - - - - - - + + - - - - - - - - - - - - - - - - - - +
* | ( 4 ) ( I ) | | ( 3 ) |
* | 192.168 .0 .0 / 23 | | 192.168 .128 .0 / 24 |
* | value : - - - | | value : 3 |
* | [ 0 ] [ 1 ] | | [ 0 ] [ 1 ] |
* + - - - - - - - - - - - - - - - - + + - - - - - - - - - - - - - - - - - - +
* | |
* + - - - - - - - - - - - - - - - - + + - - - - - - - - - - - - - - - - +
* | ( 2 ) | | ( 5 ) |
* | 192.168 .0 .0 / 24 | | 192.168 .1 .0 / 24 |
* | value : 2 | | value : 5 |
* | [ 0 ] [ 1 ] | | [ 0 ] [ 1 ] |
* + - - - - - - - - - - - - - - - - + + - - - - - - - - - - - - - - - - +
*
* 192.168 .1 .1 / 32 would be a child of ( 5 ) etc .
*
* An intermediate node will be turned into a ' real ' node on demand . In the
* example above , ( 4 ) would be re - used if 192.168 .0 .0 / 23 is added to the trie .
*
* A fully populated trie would have a height of 32 nodes , as the trie was
* created with a prefix length of 32.
*
* The lookup starts at the root node . If the current node matches and if there
* is a child that can be used to become more specific , the trie is traversed
* downwards . The last node in the traversal that is a non - intermediate one is
* returned .
*/
/* Return bit @index of @data as 0 or 1. Bit 0 is the most significant bit
 * of data[0], matching the big-endian interpretation of key data.
 */
static inline int extract_bit(const u8 *data, size_t index)
{
	const u8 byte = data[index / 8];
	const unsigned int shift = 7 - (index % 8);

	return (byte >> shift) & 1;
}
/**
* longest_prefix_match ( ) - determine the longest prefix
* @ trie : The trie to get internal sizes from
* @ node : The node to operate on
* @ key : The key to compare to @ node
*
* Determine the longest prefix of @ node that matches the bits in @ key .
*/
static size_t longest_prefix_match ( const struct lpm_trie * trie ,
const struct lpm_trie_node * node ,
const struct bpf_lpm_trie_key * key )
{
2018-11-21 21:39:52 -08:00
u32 limit = min ( node - > prefixlen , key - > prefixlen ) ;
u32 prefixlen = 0 , i = 0 ;
BUILD_BUG_ON ( offsetof ( struct lpm_trie_node , data ) % sizeof ( u32 ) ) ;
BUILD_BUG_ON ( offsetof ( struct bpf_lpm_trie_key , data ) % sizeof ( u32 ) ) ;
# if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && defined(CONFIG_64BIT)
/* data_size >= 16 has very small probability.
* We do not use a loop for optimal code generation .
*/
if ( trie - > data_size > = 8 ) {
u64 diff = be64_to_cpu ( * ( __be64 * ) node - > data ^
* ( __be64 * ) key - > data ) ;
prefixlen = 64 - fls64 ( diff ) ;
if ( prefixlen > = limit )
return limit ;
if ( diff )
return prefixlen ;
i = 8 ;
}
# endif
while ( trie - > data_size > = i + 4 ) {
u32 diff = be32_to_cpu ( * ( __be32 * ) & node - > data [ i ] ^
* ( __be32 * ) & key - > data [ i ] ) ;
prefixlen + = 32 - fls ( diff ) ;
if ( prefixlen > = limit )
return limit ;
if ( diff )
return prefixlen ;
i + = 4 ;
}
2017-01-21 17:26:11 +01:00
2018-11-21 21:39:52 -08:00
if ( trie - > data_size > = i + 2 ) {
u16 diff = be16_to_cpu ( * ( __be16 * ) & node - > data [ i ] ^
* ( __be16 * ) & key - > data [ i ] ) ;
2017-01-21 17:26:11 +01:00
2018-11-21 21:39:52 -08:00
prefixlen + = 16 - fls ( diff ) ;
if ( prefixlen > = limit )
return limit ;
if ( diff )
return prefixlen ;
i + = 2 ;
}
2017-01-21 17:26:11 +01:00
2018-11-21 21:39:52 -08:00
if ( trie - > data_size > = i + 1 ) {
prefixlen + = 8 - fls ( node - > data [ i ] ^ key - > data [ i ] ) ;
2017-01-21 17:26:11 +01:00
2018-11-21 21:39:52 -08:00
if ( prefixlen > = limit )
return limit ;
2017-01-21 17:26:11 +01:00
}
return prefixlen ;
}
/* Called from syscall or from eBPF program */
static void * trie_lookup_elem ( struct bpf_map * map , void * _key )
{
struct lpm_trie * trie = container_of ( map , struct lpm_trie , map ) ;
struct lpm_trie_node * node , * found = NULL ;
struct bpf_lpm_trie_key * key = _key ;
/* Start walking the trie from the root node ... */
2021-06-24 18:05:54 +02:00
for ( node = rcu_dereference_check ( trie - > root , rcu_read_lock_bh_held ( ) ) ;
node ; ) {
2017-01-21 17:26:11 +01:00
unsigned int next_bit ;
size_t matchlen ;
/* Determine the longest prefix of @node that matches @key.
* If it ' s the maximum possible prefix for this trie , we have
* an exact match and can return it directly .
*/
matchlen = longest_prefix_match ( trie , node , key ) ;
if ( matchlen = = trie - > max_prefixlen ) {
found = node ;
break ;
}
/* If the number of bits that match is smaller than the prefix
* length of @ node , bail out and return the node we have seen
* last in the traversal ( ie , the parent ) .
*/
if ( matchlen < node - > prefixlen )
break ;
/* Consider this node as return candidate unless it is an
* artificially added intermediate one .
*/
if ( ! ( node - > flags & LPM_TREE_NODE_FLAG_IM ) )
found = node ;
/* If the node match is fully satisfied, let's see if we can
* become more specific . Determine the next bit in the key and
* traverse down .
*/
next_bit = extract_bit ( key - > data , node - > prefixlen ) ;
2021-06-24 18:05:54 +02:00
node = rcu_dereference_check ( node - > child [ next_bit ] ,
rcu_read_lock_bh_held ( ) ) ;
2017-01-21 17:26:11 +01:00
}
if ( ! found )
return NULL ;
return found - > data + trie - > data_size ;
}
static struct lpm_trie_node * lpm_trie_node_alloc ( const struct lpm_trie * trie ,
const void * value )
{
struct lpm_trie_node * node ;
size_t size = sizeof ( struct lpm_trie_node ) + trie - > data_size ;
if ( value )
size + = trie - > map . value_size ;
bpf: Make non-preallocated allocation low priority
GFP_ATOMIC doesn't cooperate well with memcg pressure so far, especially
if we allocate too much GFP_ATOMIC memory. For example, when we set the
memcg limit to limit a non-preallocated bpf memory, the GFP_ATOMIC can
easily break the memcg limit by force charge. So it is very dangerous to
use GFP_ATOMIC in non-preallocated case. One way to make it safe is to
remove __GFP_HIGH from GFP_ATOMIC, IOW, use (__GFP_ATOMIC |
__GFP_KSWAPD_RECLAIM) instead, then it will be limited if we allocate
too much memory. There's a plan to completely remove __GFP_ATOMIC in the
mm side[1], so let's use GFP_NOWAIT instead.
We introduced BPF_F_NO_PREALLOC is because full map pre-allocation is
too memory expensive for some cases. That means removing __GFP_HIGH
doesn't break the rule of BPF_F_NO_PREALLOC, but has the same goal with
it-avoiding issues caused by too much memory. So let's remove it.
This fix can also apply to other run-time allocations, for example, the
allocation in lpm trie, local storage and devmap. So let fix it
consistently over the bpf code
It also fixes a typo in the comment.
[1]. https://lore.kernel.org/linux-mm/163712397076.13692.4727608274002939094@noble.neil.brown.name/
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: NeilBrown <neilb@suse.de>
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Link: https://lore.kernel.org/r/20220709154457.57379-2-laoar.shao@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2022-07-09 15:44:56 +00:00
node = bpf_map_kmalloc_node ( & trie - > map , size , GFP_NOWAIT | __GFP_NOWARN ,
2020-12-01 13:58:39 -08:00
trie - > map . numa_node ) ;
2017-01-21 17:26:11 +01:00
if ( ! node )
return NULL ;
node - > flags = 0 ;
if ( value )
memcpy ( node - > data + trie - > data_size , value ,
trie - > map . value_size ) ;
return node ;
}
/* Account for one additional real (non-intermediate) element.
 * The caller must hold trie->lock.
 *
 * Returns -ENOENT if @flags is BPF_EXIST (there is no element to replace),
 * -ENOSPC if the trie already holds map.max_entries elements, and 0 after
 * incrementing n_entries otherwise.
 */
static int trie_check_add_elem(struct lpm_trie *trie, u64 flags)
{
	if (flags == BPF_EXIST)
		return -ENOENT;
	if (trie->n_entries == trie->map.max_entries)
		return -ENOSPC;
	trie->n_entries++;
	return 0;
}

/* Called from syscall or from eBPF program
 *
 * Insert the element for @_key, or replace it if it already exists.
 *
 * @flags: BPF_ANY inserts or replaces, BPF_NOEXIST only inserts and
 *         BPF_EXIST only replaces.
 *
 * The capacity and flag checks are made only once the walk has shown
 * whether a real element is being added, so replacing an existing element
 * succeeds even when the trie is full.
 *
 * Returns 0 on success, -EINVAL for unknown flags or an over-long prefix,
 * -ENOMEM if a node cannot be allocated, -ENOSPC if a new element would
 * exceed map.max_entries, -EEXIST if BPF_NOEXIST was given for an existing
 * element and -ENOENT if BPF_EXIST was given for a missing one.
 */
static int trie_update_elem(struct bpf_map *map,
			    void *_key, void *value, u64 flags)
{
	struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
	struct lpm_trie_node *node, *im_node, *new_node = NULL;
	struct lpm_trie_node __rcu **slot;
	struct bpf_lpm_trie_key *key = _key;
	unsigned long irq_flags;
	unsigned int next_bit;
	size_t matchlen = 0;
	int ret = 0;

	if (unlikely(flags > BPF_EXIST))
		return -EINVAL;

	if (key->prefixlen > trie->max_prefixlen)
		return -EINVAL;

	spin_lock_irqsave(&trie->lock, irq_flags);

	/* Allocate and fill a new node */
	new_node = lpm_trie_node_alloc(trie, value);
	if (!new_node) {
		ret = -ENOMEM;
		goto out;
	}

	new_node->prefixlen = key->prefixlen;
	RCU_INIT_POINTER(new_node->child[0], NULL);
	RCU_INIT_POINTER(new_node->child[1], NULL);
	memcpy(new_node->data, key->data, trie->data_size);

	/* Now find a slot to attach the new node. To do that, walk the tree
	 * from the root and match as many bits as possible for each node until
	 * we either find an empty slot or a slot that needs to be replaced by
	 * an intermediate node.
	 */
	slot = &trie->root;

	while ((node = rcu_dereference_protected(*slot,
					lockdep_is_held(&trie->lock)))) {
		matchlen = longest_prefix_match(trie, node, key);

		if (node->prefixlen != matchlen ||
		    node->prefixlen == key->prefixlen ||
		    node->prefixlen == trie->max_prefixlen)
			break;

		next_bit = extract_bit(key->data, node->prefixlen);
		slot = &node->child[next_bit];
	}

	/* If the slot is empty (a free child pointer or an empty root),
	 * simply assign the @new_node to that slot and be done.
	 */
	if (!node) {
		ret = trie_check_add_elem(trie, flags);
		if (ret)
			goto out;

		rcu_assign_pointer(*slot, new_node);
		goto out;
	}

	/* If the slot we picked already exists, replace it with @new_node
	 * which already has the correct data array set.
	 */
	if (node->prefixlen == matchlen) {
		if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) {
			/* A real element is overwritten in place: the element
			 * count does not change, so no capacity check.
			 */
			if (flags == BPF_NOEXIST) {
				ret = -EEXIST;
				goto out;
			}
		} else {
			/* An intermediate node is promoted to a real one,
			 * which adds an element.
			 */
			ret = trie_check_add_elem(trie, flags);
			if (ret)
				goto out;
		}

		new_node->child[0] = node->child[0];
		new_node->child[1] = node->child[1];

		rcu_assign_pointer(*slot, new_node);
		kfree_rcu(node, rcu);

		goto out;
	}

	/* Every remaining case adds a new element to the trie */
	ret = trie_check_add_elem(trie, flags);
	if (ret)
		goto out;

	/* If the new node matches the prefix completely, it must be inserted
	 * as an ancestor. Simply insert it between @node and *@slot.
	 */
	if (matchlen == key->prefixlen) {
		next_bit = extract_bit(node->data, matchlen);
		rcu_assign_pointer(new_node->child[next_bit], node);
		rcu_assign_pointer(*slot, new_node);
		goto out;
	}

	im_node = lpm_trie_node_alloc(trie, NULL);
	if (!im_node) {
		/* Undo the accounting done by trie_check_add_elem() */
		trie->n_entries--;
		ret = -ENOMEM;
		goto out;
	}

	im_node->prefixlen = matchlen;
	im_node->flags |= LPM_TREE_NODE_FLAG_IM;
	memcpy(im_node->data, node->data, trie->data_size);

	/* Now determine which child to install in which slot */
	if (extract_bit(key->data, matchlen)) {
		rcu_assign_pointer(im_node->child[0], node);
		rcu_assign_pointer(im_node->child[1], new_node);
	} else {
		rcu_assign_pointer(im_node->child[0], new_node);
		rcu_assign_pointer(im_node->child[1], node);
	}

	/* Finally, assign the intermediate node to the determined slot */
	rcu_assign_pointer(*slot, im_node);

out:
	/* On failure @new_node was never published, so free it directly */
	if (ret)
		kfree(new_node);

	spin_unlock_irqrestore(&trie->lock, irq_flags);

	return ret;
}
2017-09-18 15:30:55 -04:00
/* Called from syscall or from eBPF program */
static int trie_delete_elem ( struct bpf_map * map , void * _key )
2017-01-21 17:26:11 +01:00
{
2017-09-18 15:30:55 -04:00
struct lpm_trie * trie = container_of ( map , struct lpm_trie , map ) ;
struct bpf_lpm_trie_key * key = _key ;
2017-09-21 18:43:29 -04:00
struct lpm_trie_node __rcu * * trim , * * trim2 ;
struct lpm_trie_node * node , * parent ;
2017-09-18 15:30:55 -04:00
unsigned long irq_flags ;
unsigned int next_bit ;
size_t matchlen = 0 ;
int ret = 0 ;
if ( key - > prefixlen > trie - > max_prefixlen )
return - EINVAL ;
2020-02-24 15:01:52 +01:00
spin_lock_irqsave ( & trie - > lock , irq_flags ) ;
2017-09-18 15:30:55 -04:00
/* Walk the tree looking for an exact key/length match and keeping
2017-09-21 18:43:29 -04:00
* track of the path we traverse . We will need to know the node
* we wish to delete , and the slot that points to the node we want
* to delete . We may also need to know the nodes parent and the
* slot that contains it .
2017-09-18 15:30:55 -04:00
*/
trim = & trie - > root ;
2017-09-21 18:43:29 -04:00
trim2 = trim ;
parent = NULL ;
while ( ( node = rcu_dereference_protected (
* trim , lockdep_is_held ( & trie - > lock ) ) ) ) {
2017-09-18 15:30:55 -04:00
matchlen = longest_prefix_match ( trie , node , key ) ;
if ( node - > prefixlen ! = matchlen | |
node - > prefixlen = = key - > prefixlen )
break ;
2017-09-21 18:43:29 -04:00
parent = node ;
trim2 = trim ;
2017-09-18 15:30:55 -04:00
next_bit = extract_bit ( key - > data , node - > prefixlen ) ;
2017-09-21 18:43:29 -04:00
trim = & node - > child [ next_bit ] ;
2017-09-18 15:30:55 -04:00
}
if ( ! node | | node - > prefixlen ! = key - > prefixlen | |
2019-02-22 14:19:08 +01:00
node - > prefixlen ! = matchlen | |
2017-09-18 15:30:55 -04:00
( node - > flags & LPM_TREE_NODE_FLAG_IM ) ) {
ret = - ENOENT ;
goto out ;
}
trie - > n_entries - - ;
2017-09-21 18:43:29 -04:00
/* If the node we are removing has two children, simply mark it
2017-09-18 15:30:55 -04:00
* as intermediate and we are done .
*/
2017-09-21 18:43:29 -04:00
if ( rcu_access_pointer ( node - > child [ 0 ] ) & &
2017-09-18 15:30:55 -04:00
rcu_access_pointer ( node - > child [ 1 ] ) ) {
node - > flags | = LPM_TREE_NODE_FLAG_IM ;
goto out ;
}
2017-09-21 18:43:29 -04:00
/* If the parent of the node we are about to delete is an intermediate
* node , and the deleted node doesn ' t have any children , we can delete
* the intermediate parent as well and promote its other child
* up the tree . Doing this maintains the invariant that all
* intermediate nodes have exactly 2 children and that there are no
* unnecessary intermediate nodes in the tree .
2017-09-18 15:30:55 -04:00
*/
2017-09-21 18:43:29 -04:00
if ( parent & & ( parent - > flags & LPM_TREE_NODE_FLAG_IM ) & &
! node - > child [ 0 ] & & ! node - > child [ 1 ] ) {
if ( node = = rcu_access_pointer ( parent - > child [ 0 ] ) )
rcu_assign_pointer (
* trim2 , rcu_access_pointer ( parent - > child [ 1 ] ) ) ;
else
rcu_assign_pointer (
* trim2 , rcu_access_pointer ( parent - > child [ 0 ] ) ) ;
kfree_rcu ( parent , rcu ) ;
2017-09-18 15:30:55 -04:00
kfree_rcu ( node , rcu ) ;
2017-09-21 18:43:29 -04:00
goto out ;
2017-09-18 15:30:55 -04:00
}
2017-09-21 18:43:29 -04:00
/* The node we are removing has either zero or one child. If there
* is a child , move it into the removed node ' s slot then delete
* the node . Otherwise just clear the slot and delete the node .
*/
if ( node - > child [ 0 ] )
rcu_assign_pointer ( * trim , rcu_access_pointer ( node - > child [ 0 ] ) ) ;
else if ( node - > child [ 1 ] )
rcu_assign_pointer ( * trim , rcu_access_pointer ( node - > child [ 1 ] ) ) ;
else
RCU_INIT_POINTER ( * trim , NULL ) ;
kfree_rcu ( node , rcu ) ;
2017-09-18 15:30:55 -04:00
out :
2020-02-24 15:01:52 +01:00
spin_unlock_irqrestore ( & trie - > lock , irq_flags ) ;
2017-09-18 15:30:55 -04:00
return ret ;
2017-01-21 17:26:11 +01:00
}
2017-02-08 01:19:43 +01:00
/* Bounds on the number of key data bytes that follow the key header */
#define LPM_DATA_SIZE_MIN	1
#define LPM_DATA_SIZE_MAX	256

/* Bounds on value_size; the maximum leaves room for the node header and
 * the largest key data within KMALLOC_MAX_SIZE.
 */
#define LPM_VAL_SIZE_MIN	1
#define LPM_VAL_SIZE_MAX	(KMALLOC_MAX_SIZE - LPM_DATA_SIZE_MAX - \
				 sizeof(struct lpm_trie_node))

/* key_size as seen by userspace: key header plus X bytes of data */
#define LPM_KEY_SIZE(X)		(sizeof(struct bpf_lpm_trie_key) + (X))
#define LPM_KEY_SIZE_MIN	LPM_KEY_SIZE(LPM_DATA_SIZE_MIN)
#define LPM_KEY_SIZE_MAX	LPM_KEY_SIZE(LPM_DATA_SIZE_MAX)

/* Map creation flags accepted by trie_alloc() */
#define LPM_CREATE_FLAG_MASK	(BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE | \
				 BPF_F_ACCESS_MASK)
2017-08-18 11:28:00 -07:00
2017-01-21 17:26:11 +01:00
static struct bpf_map * trie_alloc ( union bpf_attr * attr )
{
struct lpm_trie * trie ;
2020-05-13 16:03:54 -07:00
if ( ! bpf_capable ( ) )
2017-01-21 17:26:11 +01:00
return ERR_PTR ( - EPERM ) ;
/* check sanity of attributes */
if ( attr - > max_entries = = 0 | |
2017-08-18 11:28:00 -07:00
! ( attr - > map_flags & BPF_F_NO_PREALLOC ) | |
attr - > map_flags & ~ LPM_CREATE_FLAG_MASK | |
bpf: add program side {rd, wr}only support for maps
This work adds two new map creation flags BPF_F_RDONLY_PROG
and BPF_F_WRONLY_PROG in order to allow for read-only or
write-only BPF maps from a BPF program side.
Today we have BPF_F_RDONLY and BPF_F_WRONLY, but this only
applies to system call side, meaning the BPF program has full
read/write access to the map as usual while bpf(2) calls with
map fd can either only read or write into the map depending
on the flags. BPF_F_RDONLY_PROG and BPF_F_WRONLY_PROG allows
for the exact opposite such that verifier is going to reject
program loads if write into a read-only map or a read into a
write-only map is detected. For read-only map case also some
helpers are forbidden for programs that would alter the map
state such as map deletion, update, etc. As opposed to the two
BPF_F_RDONLY / BPF_F_WRONLY flags, BPF_F_RDONLY_PROG as well
as BPF_F_WRONLY_PROG really do correspond to the map lifetime.
We've enabled this generic map extension to various non-special
maps holding normal user data: array, hash, lru, lpm, local
storage, queue and stack. Further generic map types could be
followed up in future depending on use-case. Main use case
here is to forbid writes into .rodata map values from verifier
side.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2019-04-09 23:20:05 +02:00
! bpf_map_flags_access_ok ( attr - > map_flags ) | |
2017-02-08 01:19:43 +01:00
attr - > key_size < LPM_KEY_SIZE_MIN | |
attr - > key_size > LPM_KEY_SIZE_MAX | |
attr - > value_size < LPM_VAL_SIZE_MIN | |
attr - > value_size > LPM_VAL_SIZE_MAX )
2017-01-21 17:26:11 +01:00
return ERR_PTR ( - EINVAL ) ;
2022-08-10 15:18:29 +00:00
trie = bpf_map_area_alloc ( sizeof ( * trie ) , NUMA_NO_NODE ) ;
2017-01-21 17:26:11 +01:00
if ( ! trie )
return ERR_PTR ( - ENOMEM ) ;
/* copy mandatory map attributes */
2018-01-11 20:29:06 -08:00
bpf_map_init_from_attr ( & trie - > map , attr ) ;
2017-01-21 17:26:11 +01:00
trie - > data_size = attr - > key_size -
offsetof ( struct bpf_lpm_trie_key , data ) ;
trie - > max_prefixlen = trie - > data_size * 8 ;
2020-02-24 15:01:52 +01:00
spin_lock_init ( & trie - > lock ) ;
2017-01-21 17:26:11 +01:00
return & trie - > map ;
}
static void trie_free ( struct bpf_map * map )
{
struct lpm_trie * trie = container_of ( map , struct lpm_trie , map ) ;
struct lpm_trie_node __rcu * * slot ;
struct lpm_trie_node * node ;
/* Always start at the root and walk down to a node that has no
* children . Then free that node , nullify its reference in the parent
* and start over .
*/
for ( ; ; ) {
slot = & trie - > root ;
for ( ; ; ) {
2018-02-22 10:10:35 -08:00
node = rcu_dereference_protected ( * slot , 1 ) ;
2017-01-21 17:26:11 +01:00
if ( ! node )
2018-02-13 19:00:21 -08:00
goto out ;
2017-01-21 17:26:11 +01:00
if ( rcu_access_pointer ( node - > child [ 0 ] ) ) {
slot = & node - > child [ 0 ] ;
continue ;
}
if ( rcu_access_pointer ( node - > child [ 1 ] ) ) {
slot = & node - > child [ 1 ] ;
continue ;
}
kfree ( node ) ;
RCU_INIT_POINTER ( * slot , NULL ) ;
break ;
}
}
2018-02-13 19:00:21 -08:00
out :
2022-08-10 15:18:29 +00:00
bpf_map_area_free ( trie ) ;
2017-01-21 17:26:11 +01:00
}
2018-01-18 15:08:50 -08:00
static int trie_get_next_key ( struct bpf_map * map , void * _key , void * _next_key )
2017-03-05 09:41:08 -08:00
{
2018-01-26 15:06:07 -08:00
struct lpm_trie_node * node , * next_node = NULL , * parent , * search_root ;
2018-01-18 15:08:50 -08:00
struct lpm_trie * trie = container_of ( map , struct lpm_trie , map ) ;
struct bpf_lpm_trie_key * key = _key , * next_key = _next_key ;
struct lpm_trie_node * * node_stack = NULL ;
int err = 0 , stack_ptr = - 1 ;
unsigned int next_bit ;
size_t matchlen ;
/* The get_next_key follows postorder. For the 4 node example in
* the top of this file , the trie_get_next_key ( ) returns the following
* one after another :
* 192.168 .0 .0 / 24
* 192.168 .1 .0 / 24
* 192.168 .128 .0 / 24
* 192.168 .0 .0 / 16
*
* The idea is to return more specific keys before less specific ones .
*/
/* Empty trie */
2018-01-26 15:06:07 -08:00
search_root = rcu_dereference ( trie - > root ) ;
if ( ! search_root )
2018-01-18 15:08:50 -08:00
return - ENOENT ;
/* For invalid key, find the leftmost node in the trie */
2018-01-26 15:06:07 -08:00
if ( ! key | | key - > prefixlen > trie - > max_prefixlen )
2018-01-18 15:08:50 -08:00
goto find_leftmost ;
treewide: kmalloc() -> kmalloc_array()
The kmalloc() function has a 2-factor argument form, kmalloc_array(). This
patch replaces cases of:
kmalloc(a * b, gfp)
with:
kmalloc_array(a * b, gfp)
as well as handling cases of:
kmalloc(a * b * c, gfp)
with:
kmalloc(array3_size(a, b, c), gfp)
as it's slightly less ugly than:
kmalloc_array(array_size(a, b), c, gfp)
This does, however, attempt to ignore constant size factors like:
kmalloc(4 * 1024, gfp)
though any constants defined via macros get caught up in the conversion.
Any factors with a sizeof() of "unsigned char", "char", and "u8" were
dropped, since they're redundant.
The tools/ directory was manually excluded, since it has its own
implementation of kmalloc().
The Coccinelle script used for this was:
// Fix redundant parens around sizeof().
@@
type TYPE;
expression THING, E;
@@
(
kmalloc(
- (sizeof(TYPE)) * E
+ sizeof(TYPE) * E
, ...)
|
kmalloc(
- (sizeof(THING)) * E
+ sizeof(THING) * E
, ...)
)
// Drop single-byte sizes and redundant parens.
@@
expression COUNT;
typedef u8;
typedef __u8;
@@
(
kmalloc(
- sizeof(u8) * (COUNT)
+ COUNT
, ...)
|
kmalloc(
- sizeof(__u8) * (COUNT)
+ COUNT
, ...)
|
kmalloc(
- sizeof(char) * (COUNT)
+ COUNT
, ...)
|
kmalloc(
- sizeof(unsigned char) * (COUNT)
+ COUNT
, ...)
|
kmalloc(
- sizeof(u8) * COUNT
+ COUNT
, ...)
|
kmalloc(
- sizeof(__u8) * COUNT
+ COUNT
, ...)
|
kmalloc(
- sizeof(char) * COUNT
+ COUNT
, ...)
|
kmalloc(
- sizeof(unsigned char) * COUNT
+ COUNT
, ...)
)
// 2-factor product with sizeof(type/expression) and identifier or constant.
@@
type TYPE;
expression THING;
identifier COUNT_ID;
constant COUNT_CONST;
@@
(
- kmalloc
+ kmalloc_array
(
- sizeof(TYPE) * (COUNT_ID)
+ COUNT_ID, sizeof(TYPE)
, ...)
|
- kmalloc
+ kmalloc_array
(
- sizeof(TYPE) * COUNT_ID
+ COUNT_ID, sizeof(TYPE)
, ...)
|
- kmalloc
+ kmalloc_array
(
- sizeof(TYPE) * (COUNT_CONST)
+ COUNT_CONST, sizeof(TYPE)
, ...)
|
- kmalloc
+ kmalloc_array
(
- sizeof(TYPE) * COUNT_CONST
+ COUNT_CONST, sizeof(TYPE)
, ...)
|
- kmalloc
+ kmalloc_array
(
- sizeof(THING) * (COUNT_ID)
+ COUNT_ID, sizeof(THING)
, ...)
|
- kmalloc
+ kmalloc_array
(
- sizeof(THING) * COUNT_ID
+ COUNT_ID, sizeof(THING)
, ...)
|
- kmalloc
+ kmalloc_array
(
- sizeof(THING) * (COUNT_CONST)
+ COUNT_CONST, sizeof(THING)
, ...)
|
- kmalloc
+ kmalloc_array
(
- sizeof(THING) * COUNT_CONST
+ COUNT_CONST, sizeof(THING)
, ...)
)
// 2-factor product, only identifiers.
@@
identifier SIZE, COUNT;
@@
- kmalloc
+ kmalloc_array
(
- SIZE * COUNT
+ COUNT, SIZE
, ...)
// 3-factor product with 1 sizeof(type) or sizeof(expression), with
// redundant parens removed.
@@
expression THING;
identifier STRIDE, COUNT;
type TYPE;
@@
(
kmalloc(
- sizeof(TYPE) * (COUNT) * (STRIDE)
+ array3_size(COUNT, STRIDE, sizeof(TYPE))
, ...)
|
kmalloc(
- sizeof(TYPE) * (COUNT) * STRIDE
+ array3_size(COUNT, STRIDE, sizeof(TYPE))
, ...)
|
kmalloc(
- sizeof(TYPE) * COUNT * (STRIDE)
+ array3_size(COUNT, STRIDE, sizeof(TYPE))
, ...)
|
kmalloc(
- sizeof(TYPE) * COUNT * STRIDE
+ array3_size(COUNT, STRIDE, sizeof(TYPE))
, ...)
|
kmalloc(
- sizeof(THING) * (COUNT) * (STRIDE)
+ array3_size(COUNT, STRIDE, sizeof(THING))
, ...)
|
kmalloc(
- sizeof(THING) * (COUNT) * STRIDE
+ array3_size(COUNT, STRIDE, sizeof(THING))
, ...)
|
kmalloc(
- sizeof(THING) * COUNT * (STRIDE)
+ array3_size(COUNT, STRIDE, sizeof(THING))
, ...)
|
kmalloc(
- sizeof(THING) * COUNT * STRIDE
+ array3_size(COUNT, STRIDE, sizeof(THING))
, ...)
)
// 3-factor product with 2 sizeof(variable), with redundant parens removed.
@@
expression THING1, THING2;
identifier COUNT;
type TYPE1, TYPE2;
@@
(
kmalloc(
- sizeof(TYPE1) * sizeof(TYPE2) * COUNT
+ array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2))
, ...)
|
kmalloc(
- sizeof(TYPE1) * sizeof(THING2) * (COUNT)
+ array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2))
, ...)
|
kmalloc(
- sizeof(THING1) * sizeof(THING2) * COUNT
+ array3_size(COUNT, sizeof(THING1), sizeof(THING2))
, ...)
|
kmalloc(
- sizeof(THING1) * sizeof(THING2) * (COUNT)
+ array3_size(COUNT, sizeof(THING1), sizeof(THING2))
, ...)
|
kmalloc(
- sizeof(TYPE1) * sizeof(THING2) * COUNT
+ array3_size(COUNT, sizeof(TYPE1), sizeof(THING2))
, ...)
|
kmalloc(
- sizeof(TYPE1) * sizeof(THING2) * (COUNT)
+ array3_size(COUNT, sizeof(TYPE1), sizeof(THING2))
, ...)
)
// 3-factor product, only identifiers, with redundant parens removed.
@@
identifier STRIDE, SIZE, COUNT;
@@
(
kmalloc(
- (COUNT) * STRIDE * SIZE
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kmalloc(
- COUNT * (STRIDE) * SIZE
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kmalloc(
- COUNT * STRIDE * (SIZE)
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kmalloc(
- (COUNT) * (STRIDE) * SIZE
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kmalloc(
- COUNT * (STRIDE) * (SIZE)
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kmalloc(
- (COUNT) * STRIDE * (SIZE)
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kmalloc(
- (COUNT) * (STRIDE) * (SIZE)
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kmalloc(
- COUNT * STRIDE * SIZE
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
)
// Any remaining multi-factor products, first at least 3-factor products,
// when they're not all constants...
@@
expression E1, E2, E3;
constant C1, C2, C3;
@@
(
kmalloc(C1 * C2 * C3, ...)
|
kmalloc(
- (E1) * E2 * E3
+ array3_size(E1, E2, E3)
, ...)
|
kmalloc(
- (E1) * (E2) * E3
+ array3_size(E1, E2, E3)
, ...)
|
kmalloc(
- (E1) * (E2) * (E3)
+ array3_size(E1, E2, E3)
, ...)
|
kmalloc(
- E1 * E2 * E3
+ array3_size(E1, E2, E3)
, ...)
)
// And then all remaining 2 factors products when they're not all constants,
// keeping sizeof() as the second factor argument.
@@
expression THING, E1, E2;
type TYPE;
constant C1, C2, C3;
@@
(
kmalloc(sizeof(THING) * C2, ...)
|
kmalloc(sizeof(TYPE) * C2, ...)
|
kmalloc(C1 * C2 * C3, ...)
|
kmalloc(C1 * C2, ...)
|
- kmalloc
+ kmalloc_array
(
- sizeof(TYPE) * (E2)
+ E2, sizeof(TYPE)
, ...)
|
- kmalloc
+ kmalloc_array
(
- sizeof(TYPE) * E2
+ E2, sizeof(TYPE)
, ...)
|
- kmalloc
+ kmalloc_array
(
- sizeof(THING) * (E2)
+ E2, sizeof(THING)
, ...)
|
- kmalloc
+ kmalloc_array
(
- sizeof(THING) * E2
+ E2, sizeof(THING)
, ...)
|
- kmalloc
+ kmalloc_array
(
- (E1) * E2
+ E1, E2
, ...)
|
- kmalloc
+ kmalloc_array
(
- (E1) * (E2)
+ E1, E2
, ...)
|
- kmalloc
+ kmalloc_array
(
- E1 * E2
+ E1, E2
, ...)
)
Signed-off-by: Kees Cook <keescook@chromium.org>
2018-06-12 13:55:00 -07:00
node_stack = kmalloc_array ( trie - > max_prefixlen ,
sizeof ( struct lpm_trie_node * ) ,
GFP_ATOMIC | __GFP_NOWARN ) ;
2018-01-18 15:08:50 -08:00
if ( ! node_stack )
return - ENOMEM ;
/* Try to find the exact node for the given key */
2018-01-26 15:06:07 -08:00
for ( node = search_root ; node ; ) {
2018-01-18 15:08:50 -08:00
node_stack [ + + stack_ptr ] = node ;
matchlen = longest_prefix_match ( trie , node , key ) ;
if ( node - > prefixlen ! = matchlen | |
node - > prefixlen = = key - > prefixlen )
break ;
next_bit = extract_bit ( key - > data , node - > prefixlen ) ;
node = rcu_dereference ( node - > child [ next_bit ] ) ;
}
if ( ! node | | node - > prefixlen ! = key - > prefixlen | |
2018-01-26 15:06:07 -08:00
( node - > flags & LPM_TREE_NODE_FLAG_IM ) )
2018-01-18 15:08:50 -08:00
goto find_leftmost ;
/* The node with the exactly-matching key has been found,
* find the first node in postorder after the matched node .
*/
node = node_stack [ stack_ptr ] ;
while ( stack_ptr > 0 ) {
parent = node_stack [ stack_ptr - 1 ] ;
2018-01-26 15:06:07 -08:00
if ( rcu_dereference ( parent - > child [ 0 ] ) = = node ) {
search_root = rcu_dereference ( parent - > child [ 1 ] ) ;
if ( search_root )
goto find_leftmost ;
2018-01-18 15:08:50 -08:00
}
if ( ! ( parent - > flags & LPM_TREE_NODE_FLAG_IM ) ) {
next_node = parent ;
goto do_copy ;
}
node = parent ;
stack_ptr - - ;
}
/* did not find anything */
err = - ENOENT ;
goto free_stack ;
find_leftmost :
/* Find the leftmost non-intermediate node, all intermediate nodes
* have exactly two children, so this function will never return NULL.
*/
2018-01-26 15:06:07 -08:00
for ( node = search_root ; node ; ) {
2019-06-08 12:54:19 -07:00
if ( node - > flags & LPM_TREE_NODE_FLAG_IM ) {
node = rcu_dereference ( node - > child [ 0 ] ) ;
} else {
2018-01-18 15:08:50 -08:00
next_node = node ;
2019-06-08 12:54:19 -07:00
node = rcu_dereference ( node - > child [ 0 ] ) ;
if ( ! node )
node = rcu_dereference ( next_node - > child [ 1 ] ) ;
}
2018-01-18 15:08:50 -08:00
}
do_copy :
next_key - > prefixlen = next_node - > prefixlen ;
memcpy ( ( void * ) next_key + offsetof ( struct bpf_lpm_trie_key , data ) ,
next_node - > data , trie - > data_size ) ;
free_stack :
kfree ( node_stack ) ;
return err ;
2017-03-05 09:41:08 -08:00
}
2018-08-12 01:59:17 +02:00
static int trie_check_btf ( const struct bpf_map * map ,
2018-12-10 15:43:00 -08:00
const struct btf * btf ,
2018-08-12 01:59:17 +02:00
const struct btf_type * key_type ,
const struct btf_type * value_type )
{
/* Keys must have struct bpf_lpm_trie_key embedded. */
return BTF_INFO_KIND ( key_type - > info ) ! = BTF_KIND_STRUCT ?
- EINVAL : 0 ;
}
2022-04-25 21:32:47 +08:00
/* Declares trie_map_btf_ids via BTF_ID_LIST_SINGLE for struct lpm_trie;
 * trie_map_ops.map_btf_id below points at element 0 of this list.
 */
BTF_ID_LIST_SINGLE ( trie_map_btf_ids , struct , lpm_trie )
2017-04-11 15:34:58 +02:00
const struct bpf_map_ops trie_map_ops = {
2020-08-27 18:18:06 -07:00
. map_meta_equal = bpf_map_meta_equal ,
2017-01-21 17:26:11 +01:00
. map_alloc = trie_alloc ,
. map_free = trie_free ,
2017-03-05 09:41:08 -08:00
. map_get_next_key = trie_get_next_key ,
2017-01-21 17:26:11 +01:00
. map_lookup_elem = trie_lookup_elem ,
. map_update_elem = trie_update_elem ,
. map_delete_elem = trie_delete_elem ,
2021-03-22 23:50:53 -03:00
. map_lookup_batch = generic_map_lookup_batch ,
. map_update_batch = generic_map_update_batch ,
. map_delete_batch = generic_map_delete_batch ,
2018-08-12 01:59:17 +02:00
. map_check_btf = trie_check_btf ,
2022-04-25 21:32:47 +08:00
. map_btf_id = & trie_map_btf_ids [ 0 ] ,
2017-01-21 17:26:11 +01:00
} ;