/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 * Authors:	Lotsa people, from code originally in tcp
 */
#ifndef _INET6_HASHTABLES_H
#define _INET6_HASHTABLES_H

#if IS_ENABLED(CONFIG_IPV6)
#include <linux/in6.h>
#include <linux/ipv6.h>
#include <linux/types.h>
#include <linux/jhash.h>

#include <net/inet_sock.h>

#include <net/ipv6.h>
#include <net/netns/hash.h>
struct inet_hashinfo ;
2013-10-19 23:48:52 +04:00
static inline unsigned int __inet6_ehashfn ( const u32 lhash ,
const u16 lport ,
const u32 fhash ,
const __be16 fport ,
const u32 initval )
2005-08-12 16:26:18 +04:00
{
2013-10-19 23:48:52 +04:00
const u32 ports = ( ( ( u32 ) lport ) < < 16 ) | ( __force u32 ) fport ;
return jhash_3words ( lhash , fhash , ports , initval ) ;
2005-08-12 16:26:18 +04:00
}
/*
* Sockets in TCP_CLOSE state are _always_ taken out of the hash , so
* we need not check it for TCP lookups anymore , thanks Alexey . - DaveM
*
* The sockhash lock must be held as a reader here .
*/
2013-09-21 21:22:41 +04:00
struct sock * __inet6_lookup_established ( struct net * net ,
struct inet_hashinfo * hashinfo ,
const struct in6_addr * saddr ,
const __be16 sport ,
const struct in6_addr * daddr ,
2017-08-07 18:44:21 +03:00
const u16 hnum , const int dif ,
const int sdif ) ;
2013-09-21 21:22:41 +04:00
2023-07-20 18:30:08 +03:00
typedef u32 ( inet6_ehashfn_t ) ( const struct net * net ,
const struct in6_addr * laddr , const u16 lport ,
const struct in6_addr * faddr , const __be16 fport ) ;
inet6_ehashfn_t inet6_ehashfn ;
INDIRECT_CALLABLE_DECLARE ( inet6_ehashfn_t udp6_ehashfn ) ;
2023-07-20 18:30:07 +03:00
struct sock * inet6_lookup_reuseport ( struct net * net , struct sock * sk ,
struct sk_buff * skb , int doff ,
const struct in6_addr * saddr ,
__be16 sport ,
const struct in6_addr * daddr ,
2023-07-20 18:30:08 +03:00
unsigned short hnum ,
inet6_ehashfn_t * ehashfn ) ;
2023-07-20 18:30:07 +03:00
2013-09-21 21:22:41 +04:00
struct sock *inet6_lookup_listener(struct net *net,
				   struct inet_hashinfo *hashinfo,
				   struct sk_buff *skb, int doff,
				   const struct in6_addr *saddr,
				   const __be16 sport,
				   const struct in6_addr *daddr,
				   const unsigned short hnum,
				   const int dif, const int sdif);
struct sock *inet6_lookup_run_sk_lookup(struct net *net,
					int protocol,
					struct sk_buff *skb, int doff,
					const struct in6_addr *saddr,
					const __be16 sport,
					const struct in6_addr *daddr,
					const u16 hnum, const int dif,
					inet6_ehashfn_t *ehashfn);
static inline struct sock * __inet6_lookup ( struct net * net ,
struct inet_hashinfo * hashinfo ,
2016-02-10 19:50:38 +03:00
struct sk_buff * skb , int doff ,
2005-08-12 16:26:18 +04:00
const struct in6_addr * saddr ,
2006-11-08 11:20:00 +03:00
const __be16 sport ,
2005-08-12 16:26:18 +04:00
const struct in6_addr * daddr ,
const u16 hnum ,
2017-08-07 18:44:21 +03:00
const int dif , const int sdif ,
2016-04-01 18:52:17 +03:00
bool * refcounted )
2005-08-12 16:26:18 +04:00
{
2008-01-31 16:07:21 +03:00
struct sock * sk = __inet6_lookup_established ( net , hashinfo , saddr ,
2017-08-07 18:44:21 +03:00
sport , daddr , hnum ,
dif , sdif ) ;
2016-04-01 18:52:17 +03:00
* refcounted = true ;
2005-08-12 16:26:18 +04:00
if ( sk )
return sk ;
2016-04-01 18:52:17 +03:00
* refcounted = false ;
2016-02-10 19:50:38 +03:00
return inet6_lookup_listener ( net , hashinfo , skb , doff , saddr , sport ,
2017-08-07 18:44:21 +03:00
daddr , hnum , dif , sdif ) ;
2005-08-12 16:26:18 +04:00
}
bpf, net: Support SO_REUSEPORT sockets with bpf_sk_assign
Currently the bpf_sk_assign helper in tc BPF context refuses SO_REUSEPORT
sockets. This means we can't use the helper to steer traffic to Envoy,
which configures SO_REUSEPORT on its sockets. In turn, we're blocked
from removing TPROXY from our setup.
The reason that bpf_sk_assign refuses such sockets is that the
bpf_sk_lookup helpers don't execute SK_REUSEPORT programs. Instead,
one of the reuseport sockets is selected by hash. This could cause
dispatch to the "wrong" socket:
sk = bpf_sk_lookup_tcp(...) // select SO_REUSEPORT by hash
bpf_sk_assign(skb, sk) // SK_REUSEPORT wasn't executed
Fixing this isn't as simple as invoking SK_REUSEPORT from the lookup
helpers unfortunately. In the tc context, L2 headers are at the start
of the skb, while SK_REUSEPORT expects L3 headers instead.
Instead, we execute the SK_REUSEPORT program when the assigned socket
is pulled out of the skb, further up the stack. This creates some
trickiness with regards to refcounting as bpf_sk_assign will put both
refcounted and RCU freed sockets in skb->sk. reuseport sockets are RCU
freed. We can infer that the sk_assigned socket is RCU freed if the
reuseport lookup succeeds, but convincing yourself of this fact isn't
straight forward. Therefore we defensively check refcounting on the
sk_assign sock even though it's probably not required in practice.
Fixes: 8e368dc72e86 ("bpf: Fix use of sk->sk_reuseport from sk_assign")
Fixes: cf7fbe660f2d ("bpf: Add socket assign support")
Co-developed-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Joe Stringer <joe@cilium.io>
Link: https://lore.kernel.org/bpf/CACAyw98+qycmpQzKupquhkxbvWK4OFyDuuLMBNROnfWMZxUWeA@mail.gmail.com/
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Signed-off-by: Lorenz Bauer <lmb@isovalent.com>
Link: https://lore.kernel.org/r/20230720-so-reuseport-v6-7-7021b683cdae@isovalent.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
2023-07-20 18:30:11 +03:00
static inline
struct sock * inet6_steal_sock ( struct net * net , struct sk_buff * skb , int doff ,
const struct in6_addr * saddr , const __be16 sport ,
const struct in6_addr * daddr , const __be16 dport ,
bool * refcounted , inet6_ehashfn_t * ehashfn )
{
struct sock * sk , * reuse_sk ;
bool prefetched ;
sk = skb_steal_sock ( skb , refcounted , & prefetched ) ;
if ( ! sk )
return NULL ;
2023-08-15 11:53:41 +03:00
if ( ! prefetched | | ! sk_fullsock ( sk ) )
bpf, net: Support SO_REUSEPORT sockets with bpf_sk_assign
Currently the bpf_sk_assign helper in tc BPF context refuses SO_REUSEPORT
sockets. This means we can't use the helper to steer traffic to Envoy,
which configures SO_REUSEPORT on its sockets. In turn, we're blocked
from removing TPROXY from our setup.
The reason that bpf_sk_assign refuses such sockets is that the
bpf_sk_lookup helpers don't execute SK_REUSEPORT programs. Instead,
one of the reuseport sockets is selected by hash. This could cause
dispatch to the "wrong" socket:
sk = bpf_sk_lookup_tcp(...) // select SO_REUSEPORT by hash
bpf_sk_assign(skb, sk) // SK_REUSEPORT wasn't executed
Fixing this isn't as simple as invoking SK_REUSEPORT from the lookup
helpers unfortunately. In the tc context, L2 headers are at the start
of the skb, while SK_REUSEPORT expects L3 headers instead.
Instead, we execute the SK_REUSEPORT program when the assigned socket
is pulled out of the skb, further up the stack. This creates some
trickiness with regards to refcounting as bpf_sk_assign will put both
refcounted and RCU freed sockets in skb->sk. reuseport sockets are RCU
freed. We can infer that the sk_assigned socket is RCU freed if the
reuseport lookup succeeds, but convincing yourself of this fact isn't
straight forward. Therefore we defensively check refcounting on the
sk_assign sock even though it's probably not required in practice.
Fixes: 8e368dc72e86 ("bpf: Fix use of sk->sk_reuseport from sk_assign")
Fixes: cf7fbe660f2d ("bpf: Add socket assign support")
Co-developed-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Joe Stringer <joe@cilium.io>
Link: https://lore.kernel.org/bpf/CACAyw98+qycmpQzKupquhkxbvWK4OFyDuuLMBNROnfWMZxUWeA@mail.gmail.com/
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Signed-off-by: Lorenz Bauer <lmb@isovalent.com>
Link: https://lore.kernel.org/r/20230720-so-reuseport-v6-7-7021b683cdae@isovalent.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
2023-07-20 18:30:11 +03:00
return sk ;
if ( sk - > sk_protocol = = IPPROTO_TCP ) {
if ( sk - > sk_state ! = TCP_LISTEN )
return sk ;
} else if ( sk - > sk_protocol = = IPPROTO_UDP ) {
if ( sk - > sk_state ! = TCP_CLOSE )
return sk ;
} else {
return sk ;
}
reuse_sk = inet6_lookup_reuseport ( net , sk , skb , doff ,
saddr , sport , daddr , ntohs ( dport ) ,
ehashfn ) ;
if ( ! reuse_sk )
return sk ;
/* We've chosen a new reuseport sock which is never refcounted. This
* implies that sk also isn ' t refcounted .
*/
WARN_ON_ONCE ( * refcounted ) ;
return reuse_sk ;
}
2008-10-07 22:41:57 +04:00
static inline struct sock * __inet6_lookup_skb ( struct inet_hashinfo * hashinfo ,
2016-02-10 19:50:38 +03:00
struct sk_buff * skb , int doff ,
2008-10-07 22:41:57 +04:00
const __be16 sport ,
2014-10-17 20:17:20 +04:00
const __be16 dport ,
2017-08-07 18:44:21 +03:00
int iif , int sdif ,
2016-04-01 18:52:17 +03:00
bool * refcounted )
2008-10-07 22:41:57 +04:00
{
bpf, net: Support SO_REUSEPORT sockets with bpf_sk_assign
Currently the bpf_sk_assign helper in tc BPF context refuses SO_REUSEPORT
sockets. This means we can't use the helper to steer traffic to Envoy,
which configures SO_REUSEPORT on its sockets. In turn, we're blocked
from removing TPROXY from our setup.
The reason that bpf_sk_assign refuses such sockets is that the
bpf_sk_lookup helpers don't execute SK_REUSEPORT programs. Instead,
one of the reuseport sockets is selected by hash. This could cause
dispatch to the "wrong" socket:
sk = bpf_sk_lookup_tcp(...) // select SO_REUSEPORT by hash
bpf_sk_assign(skb, sk) // SK_REUSEPORT wasn't executed
Fixing this isn't as simple as invoking SK_REUSEPORT from the lookup
helpers unfortunately. In the tc context, L2 headers are at the start
of the skb, while SK_REUSEPORT expects L3 headers instead.
Instead, we execute the SK_REUSEPORT program when the assigned socket
is pulled out of the skb, further up the stack. This creates some
trickiness with regards to refcounting as bpf_sk_assign will put both
refcounted and RCU freed sockets in skb->sk. reuseport sockets are RCU
freed. We can infer that the sk_assigned socket is RCU freed if the
reuseport lookup succeeds, but convincing yourself of this fact isn't
straight forward. Therefore we defensively check refcounting on the
sk_assign sock even though it's probably not required in practice.
Fixes: 8e368dc72e86 ("bpf: Fix use of sk->sk_reuseport from sk_assign")
Fixes: cf7fbe660f2d ("bpf: Add socket assign support")
Co-developed-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Joe Stringer <joe@cilium.io>
Link: https://lore.kernel.org/bpf/CACAyw98+qycmpQzKupquhkxbvWK4OFyDuuLMBNROnfWMZxUWeA@mail.gmail.com/
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Signed-off-by: Lorenz Bauer <lmb@isovalent.com>
Link: https://lore.kernel.org/r/20230720-so-reuseport-v6-7-7021b683cdae@isovalent.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
2023-07-20 18:30:11 +03:00
struct net * net = dev_net ( skb_dst ( skb ) - > dev ) ;
const struct ipv6hdr * ip6h = ipv6_hdr ( skb ) ;
struct sock * sk ;
sk = inet6_steal_sock ( net , skb , doff , & ip6h - > saddr , sport , & ip6h - > daddr , dport ,
refcounted , inet6_ehashfn ) ;
if ( IS_ERR ( sk ) )
return NULL ;
2012-07-26 16:18:11 +04:00
if ( sk )
2008-10-07 23:41:01 +04:00
return sk ;
2012-07-26 16:18:11 +04:00
bpf, net: Support SO_REUSEPORT sockets with bpf_sk_assign
Currently the bpf_sk_assign helper in tc BPF context refuses SO_REUSEPORT
sockets. This means we can't use the helper to steer traffic to Envoy,
which configures SO_REUSEPORT on its sockets. In turn, we're blocked
from removing TPROXY from our setup.
The reason that bpf_sk_assign refuses such sockets is that the
bpf_sk_lookup helpers don't execute SK_REUSEPORT programs. Instead,
one of the reuseport sockets is selected by hash. This could cause
dispatch to the "wrong" socket:
sk = bpf_sk_lookup_tcp(...) // select SO_REUSEPORT by hash
bpf_sk_assign(skb, sk) // SK_REUSEPORT wasn't executed
Fixing this isn't as simple as invoking SK_REUSEPORT from the lookup
helpers unfortunately. In the tc context, L2 headers are at the start
of the skb, while SK_REUSEPORT expects L3 headers instead.
Instead, we execute the SK_REUSEPORT program when the assigned socket
is pulled out of the skb, further up the stack. This creates some
trickiness with regards to refcounting as bpf_sk_assign will put both
refcounted and RCU freed sockets in skb->sk. reuseport sockets are RCU
freed. We can infer that the sk_assigned socket is RCU freed if the
reuseport lookup succeeds, but convincing yourself of this fact isn't
straight forward. Therefore we defensively check refcounting on the
sk_assign sock even though it's probably not required in practice.
Fixes: 8e368dc72e86 ("bpf: Fix use of sk->sk_reuseport from sk_assign")
Fixes: cf7fbe660f2d ("bpf: Add socket assign support")
Co-developed-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Joe Stringer <joe@cilium.io>
Link: https://lore.kernel.org/bpf/CACAyw98+qycmpQzKupquhkxbvWK4OFyDuuLMBNROnfWMZxUWeA@mail.gmail.com/
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Signed-off-by: Lorenz Bauer <lmb@isovalent.com>
Link: https://lore.kernel.org/r/20230720-so-reuseport-v6-7-7021b683cdae@isovalent.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
2023-07-20 18:30:11 +03:00
return __inet6_lookup ( net , hashinfo , skb ,
doff , & ip6h - > saddr , sport ,
& ip6h - > daddr , ntohs ( dport ) ,
2017-08-07 18:44:21 +03:00
iif , sdif , refcounted ) ;
2008-10-07 22:41:57 +04:00
}
2013-09-21 21:22:41 +04:00
struct sock * inet6_lookup ( struct net * net , struct inet_hashinfo * hashinfo ,
2016-02-10 19:50:38 +03:00
struct sk_buff * skb , int doff ,
2013-09-21 21:22:41 +04:00
const struct in6_addr * saddr , const __be16 sport ,
const struct in6_addr * daddr , const __be16 dport ,
const int dif ) ;
2016-02-10 19:50:36 +03:00
int inet6_hash ( struct sock * sk ) ;
2014-11-04 21:59:47 +03:00
2022-05-13 21:55:49 +03:00
static inline bool inet6_match ( struct net * net , const struct sock * sk ,
const struct in6_addr * saddr ,
const struct in6_addr * daddr ,
const __portpair ports ,
const int dif , const int sdif )
{
if ( ! net_eq ( sock_net ( sk ) , net ) | |
sk - > sk_family ! = AF_INET6 | |
sk - > sk_portpair ! = ports | |
! ipv6_addr_equal ( & sk - > sk_v6_daddr , saddr ) | |
! ipv6_addr_equal ( & sk - > sk_v6_rcv_saddr , daddr ) )
return false ;
2022-07-25 21:14:42 +03:00
/* READ_ONCE() paired with WRITE_ONCE() in sock_bindtoindex_locked() */
return inet_sk_bound_dev_eq ( net , READ_ONCE ( sk - > sk_bound_dev_if ) , dif ,
sdif ) ;
2022-05-13 21:55:49 +03:00
}
# endif /* IS_ENABLED(CONFIG_IPV6) */
# endif /* _INET6_HASHTABLES_H */