From fe38d2a1c8bee0b3a0be40de5b621a28200612e5 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 17 Jan 2017 07:51:01 -0800 Subject: [PATCH 1/6] inet: collapse ipv4/v6 rcv_saddr_equal functions into one We pass these per-protocol equal functions around in various places, but we can just have one function that checks the sk->sk_family and then do the right comparison function. I've also changed the ipv4 version to not cast to inet_sock since it is unneeded. Signed-off-by: Josef Bacik Signed-off-by: David S. Miller --- include/net/addrconf.h | 4 +- include/net/inet_hashtables.h | 5 +-- include/net/udp.h | 1 - net/ipv4/inet_connection_sock.c | 72 ++++++++++++++++++++++++++++++++ net/ipv4/inet_hashtables.c | 16 +++---- net/ipv4/udp.c | 58 ++++++------------------- net/ipv6/inet6_connection_sock.c | 4 +- net/ipv6/inet6_hashtables.c | 46 +------------------- net/ipv6/udp.c | 2 +- 9 files changed, 95 insertions(+), 113 deletions(-) diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 8f998afc1384..17c6fd84e287 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -88,9 +88,7 @@ int __ipv6_get_lladdr(struct inet6_dev *idev, struct in6_addr *addr, u32 banned_flags); int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr, u32 banned_flags); -int ipv4_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, - bool match_wildcard); -int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, +int inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, bool match_wildcard); void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr); void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr); diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 0574493e3899..756ed1692906 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -203,10 +203,7 @@ void inet_hashinfo_init(struct inet_hashinfo *h); bool 
inet_ehash_insert(struct sock *sk, struct sock *osk); bool inet_ehash_nolisten(struct sock *sk, struct sock *osk); -int __inet_hash(struct sock *sk, struct sock *osk, - int (*saddr_same)(const struct sock *sk1, - const struct sock *sk2, - bool match_wildcard)); +int __inet_hash(struct sock *sk, struct sock *osk); int inet_hash(struct sock *sk); void inet_unhash(struct sock *sk); diff --git a/include/net/udp.h b/include/net/udp.h index 1661791e8ca1..c9d8b8e848e0 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -204,7 +204,6 @@ static inline void udp_lib_close(struct sock *sk, long timeout) } int udp_lib_get_port(struct sock *sk, unsigned short snum, - int (*)(const struct sock *, const struct sock *, bool), unsigned int hash2_nulladdr); u32 udp_flow_hashrnd(void); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 19ea045c50ed..ba597cb504ff 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -31,6 +31,78 @@ const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n"; EXPORT_SYMBOL(inet_csk_timer_bug_msg); #endif +#if IS_ENABLED(CONFIG_IPV6) +/* match_wildcard == true: IPV6_ADDR_ANY equals to any IPv6 addresses if IPv6 + * only, and any IPv4 addresses if not IPv6 only + * match_wildcard == false: addresses must be exactly the same, i.e. + * IPV6_ADDR_ANY only equals to IPV6_ADDR_ANY, + * and 0.0.0.0 equals to 0.0.0.0 only + */ +static int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, + bool match_wildcard) +{ + const struct in6_addr *sk2_rcv_saddr6 = inet6_rcv_saddr(sk2); + int sk2_ipv6only = inet_v6_ipv6only(sk2); + int addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr); + int addr_type2 = sk2_rcv_saddr6 ? 
ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED; + + /* if both are mapped, treat as IPv4 */ + if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) { + if (!sk2_ipv6only) { + if (sk->sk_rcv_saddr == sk2->sk_rcv_saddr) + return 1; + if (!sk->sk_rcv_saddr || !sk2->sk_rcv_saddr) + return match_wildcard; + } + return 0; + } + + if (addr_type == IPV6_ADDR_ANY && addr_type2 == IPV6_ADDR_ANY) + return 1; + + if (addr_type2 == IPV6_ADDR_ANY && match_wildcard && + !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED)) + return 1; + + if (addr_type == IPV6_ADDR_ANY && match_wildcard && + !(ipv6_only_sock(sk) && addr_type2 == IPV6_ADDR_MAPPED)) + return 1; + + if (sk2_rcv_saddr6 && + ipv6_addr_equal(&sk->sk_v6_rcv_saddr, sk2_rcv_saddr6)) + return 1; + + return 0; +} +#endif + +/* match_wildcard == true: 0.0.0.0 equals to any IPv4 addresses + * match_wildcard == false: addresses must be exactly the same, i.e. + * 0.0.0.0 only equals to 0.0.0.0 + */ +static int ipv4_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, + bool match_wildcard) +{ + if (!ipv6_only_sock(sk2)) { + if (sk->sk_rcv_saddr == sk2->sk_rcv_saddr) + return 1; + if (!sk->sk_rcv_saddr || !sk2->sk_rcv_saddr) + return match_wildcard; + } + return 0; +} + +int inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, + bool match_wildcard) +{ +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) + return ipv6_rcv_saddr_equal(sk, sk2, match_wildcard); +#endif + return ipv4_rcv_saddr_equal(sk, sk2, match_wildcard); +} +EXPORT_SYMBOL(inet_rcv_saddr_equal); + void inet_get_local_port_range(struct net *net, int *low, int *high) { unsigned int seq; diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index ca97835bfec4..2ef9b010bd34 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -435,10 +435,7 @@ bool inet_ehash_nolisten(struct sock *sk, struct sock *osk) EXPORT_SYMBOL_GPL(inet_ehash_nolisten); static int 
inet_reuseport_add_sock(struct sock *sk, - struct inet_listen_hashbucket *ilb, - int (*saddr_same)(const struct sock *sk1, - const struct sock *sk2, - bool match_wildcard)) + struct inet_listen_hashbucket *ilb) { struct inet_bind_bucket *tb = inet_csk(sk)->icsk_bind_hash; struct sock *sk2; @@ -451,7 +448,7 @@ static int inet_reuseport_add_sock(struct sock *sk, sk2->sk_bound_dev_if == sk->sk_bound_dev_if && inet_csk(sk2)->icsk_bind_hash == tb && sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) && - saddr_same(sk, sk2, false)) + inet_rcv_saddr_equal(sk, sk2, false)) return reuseport_add_sock(sk, sk2); } @@ -461,10 +458,7 @@ static int inet_reuseport_add_sock(struct sock *sk, return 0; } -int __inet_hash(struct sock *sk, struct sock *osk, - int (*saddr_same)(const struct sock *sk1, - const struct sock *sk2, - bool match_wildcard)) +int __inet_hash(struct sock *sk, struct sock *osk) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; struct inet_listen_hashbucket *ilb; @@ -479,7 +473,7 @@ int __inet_hash(struct sock *sk, struct sock *osk, spin_lock(&ilb->lock); if (sk->sk_reuseport) { - err = inet_reuseport_add_sock(sk, ilb, saddr_same); + err = inet_reuseport_add_sock(sk, ilb); if (err) goto unlock; } @@ -503,7 +497,7 @@ int inet_hash(struct sock *sk) if (sk->sk_state != TCP_CLOSE) { local_bh_disable(); - err = __inet_hash(sk, NULL, ipv4_rcv_saddr_equal); + err = __inet_hash(sk, NULL); local_bh_enable(); } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 4318d72e0248..d6dddcf59e79 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -137,11 +137,7 @@ EXPORT_SYMBOL(udp_memory_allocated); static int udp_lib_lport_inuse(struct net *net, __u16 num, const struct udp_hslot *hslot, unsigned long *bitmap, - struct sock *sk, - int (*saddr_comp)(const struct sock *sk1, - const struct sock *sk2, - bool match_wildcard), - unsigned int log) + struct sock *sk, unsigned int log) { struct sock *sk2; kuid_t uid = sock_i_uid(sk); @@ -153,7 +149,7 @@ static int 
udp_lib_lport_inuse(struct net *net, __u16 num, (!sk2->sk_reuse || !sk->sk_reuse) && (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && - saddr_comp(sk, sk2, true)) { + inet_rcv_saddr_equal(sk, sk2, true)) { if (sk2->sk_reuseport && sk->sk_reuseport && !rcu_access_pointer(sk->sk_reuseport_cb) && uid_eq(uid, sock_i_uid(sk2))) { @@ -176,10 +172,7 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num, */ static int udp_lib_lport_inuse2(struct net *net, __u16 num, struct udp_hslot *hslot2, - struct sock *sk, - int (*saddr_comp)(const struct sock *sk1, - const struct sock *sk2, - bool match_wildcard)) + struct sock *sk) { struct sock *sk2; kuid_t uid = sock_i_uid(sk); @@ -193,7 +186,7 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num, (!sk2->sk_reuse || !sk->sk_reuse) && (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && - saddr_comp(sk, sk2, true)) { + inet_rcv_saddr_equal(sk, sk2, true)) { if (sk2->sk_reuseport && sk->sk_reuseport && !rcu_access_pointer(sk->sk_reuseport_cb) && uid_eq(uid, sock_i_uid(sk2))) { @@ -208,10 +201,7 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num, return res; } -static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot, - int (*saddr_same)(const struct sock *sk1, - const struct sock *sk2, - bool match_wildcard)) +static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot) { struct net *net = sock_net(sk); kuid_t uid = sock_i_uid(sk); @@ -225,7 +215,7 @@ static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot, (udp_sk(sk2)->udp_port_hash == udp_sk(sk)->udp_port_hash) && (sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) && - (*saddr_same)(sk, sk2, false)) { + inet_rcv_saddr_equal(sk, sk2, false)) { return reuseport_add_sock(sk, sk2); } } @@ -241,14 +231,10 @@ static int udp_reuseport_add_sock(struct sock *sk, struct 
udp_hslot *hslot, * * @sk: socket struct in question * @snum: port number to look up - * @saddr_comp: AF-dependent comparison of bound local IP addresses * @hash2_nulladdr: AF-dependent hash value in secondary hash chains, * with NULL address */ int udp_lib_get_port(struct sock *sk, unsigned short snum, - int (*saddr_comp)(const struct sock *sk1, - const struct sock *sk2, - bool match_wildcard), unsigned int hash2_nulladdr) { struct udp_hslot *hslot, *hslot2; @@ -277,7 +263,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, bitmap_zero(bitmap, PORTS_PER_CHAIN); spin_lock_bh(&hslot->lock); udp_lib_lport_inuse(net, snum, hslot, bitmap, sk, - saddr_comp, udptable->log); + udptable->log); snum = first; /* @@ -310,12 +296,11 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, if (hslot->count < hslot2->count) goto scan_primary_hash; - exist = udp_lib_lport_inuse2(net, snum, hslot2, - sk, saddr_comp); + exist = udp_lib_lport_inuse2(net, snum, hslot2, sk); if (!exist && (hash2_nulladdr != slot2)) { hslot2 = udp_hashslot2(udptable, hash2_nulladdr); exist = udp_lib_lport_inuse2(net, snum, hslot2, - sk, saddr_comp); + sk); } if (exist) goto fail_unlock; @@ -323,8 +308,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, goto found; } scan_primary_hash: - if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk, - saddr_comp, 0)) + if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk, 0)) goto fail_unlock; } found: @@ -333,7 +317,7 @@ found: udp_sk(sk)->udp_portaddr_hash ^= snum; if (sk_unhashed(sk)) { if (sk->sk_reuseport && - udp_reuseport_add_sock(sk, hslot, saddr_comp)) { + udp_reuseport_add_sock(sk, hslot)) { inet_sk(sk)->inet_num = 0; udp_sk(sk)->udp_port_hash = 0; udp_sk(sk)->udp_portaddr_hash ^= snum; @@ -365,24 +349,6 @@ fail: } EXPORT_SYMBOL(udp_lib_get_port); -/* match_wildcard == true: 0.0.0.0 equals to any IPv4 addresses - * match_wildcard == false: addresses must be exactly the same, i.e. 
- * 0.0.0.0 only equals to 0.0.0.0 - */ -int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2, - bool match_wildcard) -{ - struct inet_sock *inet1 = inet_sk(sk1), *inet2 = inet_sk(sk2); - - if (!ipv6_only_sock(sk2)) { - if (inet1->inet_rcv_saddr == inet2->inet_rcv_saddr) - return 1; - if (!inet1->inet_rcv_saddr || !inet2->inet_rcv_saddr) - return match_wildcard; - } - return 0; -} - static u32 udp4_portaddr_hash(const struct net *net, __be32 saddr, unsigned int port) { @@ -398,7 +364,7 @@ int udp_v4_get_port(struct sock *sk, unsigned short snum) /* precompute partial secondary hash */ udp_sk(sk)->udp_portaddr_hash = hash2_partial; - return udp_lib_get_port(sk, snum, ipv4_rcv_saddr_equal, hash2_nulladdr); + return udp_lib_get_port(sk, snum, hash2_nulladdr); } static int compute_score(struct sock *sk, struct net *net, diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index 7396e75e161b..55ee2ea2aee0 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -54,12 +54,12 @@ int inet6_csk_bind_conflict(const struct sock *sk, (sk2->sk_state != TCP_TIME_WAIT && !uid_eq(uid, sock_i_uid((struct sock *)sk2))))) { - if (ipv6_rcv_saddr_equal(sk, sk2, true)) + if (inet_rcv_saddr_equal(sk, sk2, true)) break; } if (!relax && reuse && sk2->sk_reuse && sk2->sk_state != TCP_LISTEN && - ipv6_rcv_saddr_equal(sk, sk2, true)) + inet_rcv_saddr_equal(sk, sk2, true)) break; } } diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 02761c9fe43e..d0900918a19e 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -268,54 +268,10 @@ int inet6_hash(struct sock *sk) if (sk->sk_state != TCP_CLOSE) { local_bh_disable(); - err = __inet_hash(sk, NULL, ipv6_rcv_saddr_equal); + err = __inet_hash(sk, NULL); local_bh_enable(); } return err; } EXPORT_SYMBOL_GPL(inet6_hash); - -/* match_wildcard == true: IPV6_ADDR_ANY equals to any IPv6 addresses if IPv6 - * only, and any IPv4 
addresses if not IPv6 only - * match_wildcard == false: addresses must be exactly the same, i.e. - * IPV6_ADDR_ANY only equals to IPV6_ADDR_ANY, - * and 0.0.0.0 equals to 0.0.0.0 only - */ -int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, - bool match_wildcard) -{ - const struct in6_addr *sk2_rcv_saddr6 = inet6_rcv_saddr(sk2); - int sk2_ipv6only = inet_v6_ipv6only(sk2); - int addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr); - int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED; - - /* if both are mapped, treat as IPv4 */ - if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) { - if (!sk2_ipv6only) { - if (sk->sk_rcv_saddr == sk2->sk_rcv_saddr) - return 1; - if (!sk->sk_rcv_saddr || !sk2->sk_rcv_saddr) - return match_wildcard; - } - return 0; - } - - if (addr_type == IPV6_ADDR_ANY && addr_type2 == IPV6_ADDR_ANY) - return 1; - - if (addr_type2 == IPV6_ADDR_ANY && match_wildcard && - !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED)) - return 1; - - if (addr_type == IPV6_ADDR_ANY && match_wildcard && - !(ipv6_only_sock(sk) && addr_type2 == IPV6_ADDR_MAPPED)) - return 1; - - if (sk2_rcv_saddr6 && - ipv6_addr_equal(&sk->sk_v6_rcv_saddr, sk2_rcv_saddr6)) - return 1; - - return 0; -} -EXPORT_SYMBOL_GPL(ipv6_rcv_saddr_equal); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 4d5c4eee4b3f..05d69324862e 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -103,7 +103,7 @@ int udp_v6_get_port(struct sock *sk, unsigned short snum) /* precompute partial secondary hash */ udp_sk(sk)->udp_portaddr_hash = hash2_partial; - return udp_lib_get_port(sk, snum, ipv6_rcv_saddr_equal, hash2_nulladdr); + return udp_lib_get_port(sk, snum, hash2_nulladdr); } static void udp_v6_rehash(struct sock *sk) From aa078842b702b4a45111f028a604a6c8f69cb27d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 17 Jan 2017 07:51:02 -0800 Subject: [PATCH 2/6] inet: drop ->bind_conflict The only difference between 
inet6_csk_bind_conflict and inet_csk_bind_conflict is how they check the rcv_saddr, so delete this call back and simply change inet_csk_bind_conflict to call inet_rcv_saddr_equal. Signed-off-by: Josef Bacik Signed-off-by: David S. Miller --- include/net/inet6_connection_sock.h | 5 ---- include/net/inet_connection_sock.h | 6 ----- net/dccp/ipv4.c | 1 - net/dccp/ipv6.c | 2 -- net/ipv4/inet_connection_sock.c | 22 +++++----------- net/ipv4/tcp_ipv4.c | 1 - net/ipv6/inet6_connection_sock.c | 40 ----------------------------- net/ipv6/tcp_ipv6.c | 2 -- 8 files changed, 7 insertions(+), 72 deletions(-) diff --git a/include/net/inet6_connection_sock.h b/include/net/inet6_connection_sock.h index 3212b39b5bfc..8ec87b62257b 100644 --- a/include/net/inet6_connection_sock.h +++ b/include/net/inet6_connection_sock.h @@ -15,16 +15,11 @@ #include -struct inet_bind_bucket; struct request_sock; struct sk_buff; struct sock; struct sockaddr; -int inet6_csk_bind_conflict(const struct sock *sk, - const struct inet_bind_bucket *tb, bool relax, - bool soreuseport_ok); - struct dst_entry *inet6_csk_route_req(const struct sock *sk, struct flowi6 *fl6, const struct request_sock *req, u8 proto); diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 84b2edde09b1..826f198374f8 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -62,9 +62,6 @@ struct inet_connection_sock_af_ops { char __user *optval, int __user *optlen); #endif void (*addr2sockaddr)(struct sock *sk, struct sockaddr *); - int (*bind_conflict)(const struct sock *sk, - const struct inet_bind_bucket *tb, - bool relax, bool soreuseport_ok); void (*mtu_reduced)(struct sock *sk); }; @@ -263,9 +260,6 @@ inet_csk_rto_backoff(const struct inet_connection_sock *icsk, struct sock *inet_csk_accept(struct sock *sk, int flags, int *err); -int inet_csk_bind_conflict(const struct sock *sk, - const struct inet_bind_bucket *tb, bool relax, - bool soreuseport_ok); int 
inet_csk_get_port(struct sock *sk, unsigned short snum); struct dst_entry *inet_csk_route_req(const struct sock *sk, struct flowi4 *fl4, diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index d859a5c36e70..b043ec833785 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -904,7 +904,6 @@ static const struct inet_connection_sock_af_ops dccp_ipv4_af_ops = { .getsockopt = ip_getsockopt, .addr2sockaddr = inet_csk_addr2sockaddr, .sockaddr_len = sizeof(struct sockaddr_in), - .bind_conflict = inet_csk_bind_conflict, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_ip_setsockopt, .compat_getsockopt = compat_ip_getsockopt, diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index adfc790f7193..08bcdc3d1717 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -937,7 +937,6 @@ static const struct inet_connection_sock_af_ops dccp_ipv6_af_ops = { .getsockopt = ipv6_getsockopt, .addr2sockaddr = inet6_csk_addr2sockaddr, .sockaddr_len = sizeof(struct sockaddr_in6), - .bind_conflict = inet6_csk_bind_conflict, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_ipv6_setsockopt, .compat_getsockopt = compat_ipv6_getsockopt, @@ -958,7 +957,6 @@ static const struct inet_connection_sock_af_ops dccp_ipv6_mapped = { .getsockopt = ipv6_getsockopt, .addr2sockaddr = inet6_csk_addr2sockaddr, .sockaddr_len = sizeof(struct sockaddr_in6), - .bind_conflict = inet6_csk_bind_conflict, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_ipv6_setsockopt, .compat_getsockopt = compat_ipv6_getsockopt, diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index ba597cb504ff..a1c9055769fc 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -116,9 +116,9 @@ void inet_get_local_port_range(struct net *net, int *low, int *high) } EXPORT_SYMBOL(inet_get_local_port_range); -int inet_csk_bind_conflict(const struct sock *sk, - const struct inet_bind_bucket *tb, bool relax, - bool reuseport_ok) +static int inet_csk_bind_conflict(const struct sock *sk, + const struct 
inet_bind_bucket *tb, + bool relax, bool reuseport_ok) { struct sock *sk2; bool reuse = sk->sk_reuse; @@ -134,7 +134,6 @@ int inet_csk_bind_conflict(const struct sock *sk, sk_for_each_bound(sk2, &tb->owners) { if (sk != sk2 && - !inet_v6_ipv6only(sk2) && (!sk->sk_bound_dev_if || !sk2->sk_bound_dev_if || sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { @@ -144,23 +143,18 @@ int inet_csk_bind_conflict(const struct sock *sk, rcu_access_pointer(sk->sk_reuseport_cb) || (sk2->sk_state != TCP_TIME_WAIT && !uid_eq(uid, sock_i_uid(sk2))))) { - - if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr || - sk2->sk_rcv_saddr == sk->sk_rcv_saddr) + if (inet_rcv_saddr_equal(sk, sk2, true)) break; } if (!relax && reuse && sk2->sk_reuse && sk2->sk_state != TCP_LISTEN) { - - if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr || - sk2->sk_rcv_saddr == sk->sk_rcv_saddr) + if (inet_rcv_saddr_equal(sk, sk2, true)) break; } } } return sk2 != NULL; } -EXPORT_SYMBOL_GPL(inet_csk_bind_conflict); /* Obtain a reference to a local port for the given sock, * if snum is zero it means select any available local port. 
@@ -239,8 +233,7 @@ other_parity_scan: smallest_size = tb->num_owners; smallest_port = port; } - if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false, - reuseport_ok)) + if (!inet_csk_bind_conflict(sk, tb, false, reuseport_ok)) goto tb_found; goto next_port; } @@ -281,8 +274,7 @@ tb_found: sk->sk_reuseport && uid_eq(tb->fastuid, uid))) && smallest_size == -1) goto success; - if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true, - reuseport_ok)) { + if (inet_csk_bind_conflict(sk, tb, true, reuseport_ok)) { if ((reuse || (tb->fastreuseport > 0 && sk->sk_reuseport && diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 63214136cf1c..3644fc117691 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1817,7 +1817,6 @@ const struct inet_connection_sock_af_ops ipv4_specific = { .getsockopt = ip_getsockopt, .addr2sockaddr = inet_csk_addr2sockaddr, .sockaddr_len = sizeof(struct sockaddr_in), - .bind_conflict = inet_csk_bind_conflict, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_ip_setsockopt, .compat_getsockopt = compat_ip_getsockopt, diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index 55ee2ea2aee0..97074c459fe6 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -28,46 +28,6 @@ #include #include -int inet6_csk_bind_conflict(const struct sock *sk, - const struct inet_bind_bucket *tb, bool relax, - bool reuseport_ok) -{ - const struct sock *sk2; - bool reuse = !!sk->sk_reuse; - bool reuseport = !!sk->sk_reuseport && reuseport_ok; - kuid_t uid = sock_i_uid((struct sock *)sk); - - /* We must walk the whole port owner list in this case. -DaveM */ - /* - * See comment in inet_csk_bind_conflict about sock lookup - * vs net namespaces issues. 
- */ - sk_for_each_bound(sk2, &tb->owners) { - if (sk != sk2 && - (!sk->sk_bound_dev_if || - !sk2->sk_bound_dev_if || - sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { - if ((!reuse || !sk2->sk_reuse || - sk2->sk_state == TCP_LISTEN) && - (!reuseport || !sk2->sk_reuseport || - rcu_access_pointer(sk->sk_reuseport_cb) || - (sk2->sk_state != TCP_TIME_WAIT && - !uid_eq(uid, - sock_i_uid((struct sock *)sk2))))) { - if (inet_rcv_saddr_equal(sk, sk2, true)) - break; - } - if (!relax && reuse && sk2->sk_reuse && - sk2->sk_state != TCP_LISTEN && - inet_rcv_saddr_equal(sk, sk2, true)) - break; - } - } - - return sk2 != NULL; -} -EXPORT_SYMBOL_GPL(inet6_csk_bind_conflict); - struct dst_entry *inet6_csk_route_req(const struct sock *sk, struct flowi6 *fl6, const struct request_sock *req, diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index fc14e04028bf..f72100eedd5d 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1621,7 +1621,6 @@ static const struct inet_connection_sock_af_ops ipv6_specific = { .getsockopt = ipv6_getsockopt, .addr2sockaddr = inet6_csk_addr2sockaddr, .sockaddr_len = sizeof(struct sockaddr_in6), - .bind_conflict = inet6_csk_bind_conflict, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_ipv6_setsockopt, .compat_getsockopt = compat_ipv6_getsockopt, @@ -1652,7 +1651,6 @@ static const struct inet_connection_sock_af_ops ipv6_mapped = { .getsockopt = ipv6_getsockopt, .addr2sockaddr = inet6_csk_addr2sockaddr, .sockaddr_len = sizeof(struct sockaddr_in6), - .bind_conflict = inet6_csk_bind_conflict, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_ipv6_setsockopt, .compat_getsockopt = compat_ipv6_getsockopt, From b9470c27607bed1ad3450de789c154f225530112 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 17 Jan 2017 07:51:03 -0800 Subject: [PATCH 3/6] inet: kill smallest_size and smallest_port In inet_csk_get_port we seem to be using smallest_port to figure out where the best place to look for a SO_REUSEPORT sk that matches with an existing set 
of SO_REUSEPORT's. However if we get to the logic if (smallest_size != -1) { port = smallest_port; goto have_port; } we will do a useless search, because we would have already done the inet_csk_bind_conflict for that port and it would have returned 1, otherwise we would have gone to found_tb and succeeded. Since this logic makes us do yet another trip through inet_csk_bind_conflict for a port we know won't work just delete this code and save us the time. Signed-off-by: Josef Bacik Signed-off-by: David S. Miller --- include/net/inet_hashtables.h | 1 - net/ipv4/inet_connection_sock.c | 26 ++++---------------------- net/ipv4/inet_hashtables.c | 3 --- 3 files changed, 4 insertions(+), 26 deletions(-) diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 756ed1692906..3fc0366743da 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -80,7 +80,6 @@ struct inet_bind_bucket { signed char fastreuse; signed char fastreuseport; kuid_t fastuid; - int num_owners; struct hlist_node node; struct hlist_head owners; }; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index a1c9055769fc..d3523661c905 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -165,7 +165,6 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum) bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN; struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo; int ret = 1, attempts = 5, port = snum; - int smallest_size = -1, smallest_port; struct inet_bind_hashbucket *head; struct net *net = sock_net(sk); int i, low, high, attempt_half; @@ -175,7 +174,6 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum) bool reuseport_ok = !!snum; if (port) { -have_port: head = &hinfo->bhash[inet_bhashfn(net, port, hinfo->bhash_size)]; spin_lock_bh(&head->lock); @@ -209,8 +207,6 @@ other_half_scan: * We do the opposite to not pollute connect() users. 
*/ offset |= 1U; - smallest_size = -1; - smallest_port = low; /* avoid compiler warning */ other_parity_scan: port = low + offset; @@ -224,15 +220,6 @@ other_parity_scan: spin_lock_bh(&head->lock); inet_bind_bucket_for_each(tb, &head->chain) if (net_eq(ib_net(tb), net) && tb->port == port) { - if (((tb->fastreuse > 0 && reuse) || - (tb->fastreuseport > 0 && - sk->sk_reuseport && - !rcu_access_pointer(sk->sk_reuseport_cb) && - uid_eq(tb->fastuid, uid))) && - (tb->num_owners < smallest_size || smallest_size == -1)) { - smallest_size = tb->num_owners; - smallest_port = port; - } if (!inet_csk_bind_conflict(sk, tb, false, reuseport_ok)) goto tb_found; goto next_port; @@ -243,10 +230,6 @@ next_port: cond_resched(); } - if (smallest_size != -1) { - port = smallest_port; - goto have_port; - } offset--; if (!(offset & 1)) goto other_parity_scan; @@ -268,19 +251,18 @@ tb_found: if (sk->sk_reuse == SK_FORCE_REUSE) goto success; - if (((tb->fastreuse > 0 && reuse) || + if ((tb->fastreuse > 0 && reuse) || (tb->fastreuseport > 0 && !rcu_access_pointer(sk->sk_reuseport_cb) && - sk->sk_reuseport && uid_eq(tb->fastuid, uid))) && - smallest_size == -1) + sk->sk_reuseport && uid_eq(tb->fastuid, uid))) goto success; if (inet_csk_bind_conflict(sk, tb, true, reuseport_ok)) { if ((reuse || (tb->fastreuseport > 0 && sk->sk_reuseport && !rcu_access_pointer(sk->sk_reuseport_cb) && - uid_eq(tb->fastuid, uid))) && - !snum && smallest_size != -1 && --attempts >= 0) { + uid_eq(tb->fastuid, uid))) && !snum && + --attempts >= 0) { spin_unlock_bh(&head->lock); goto again; } diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 2ef9b010bd34..8bea74298173 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -73,7 +73,6 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, tb->port = snum; tb->fastreuse = 0; tb->fastreuseport = 0; - tb->num_owners = 0; INIT_HLIST_HEAD(&tb->owners); hlist_add_head(&tb->node, &head->chain); } @@ 
-96,7 +95,6 @@ void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, { inet_sk(sk)->inet_num = snum; sk_add_bind_node(sk, &tb->owners); - tb->num_owners++; inet_csk(sk)->icsk_bind_hash = tb; } @@ -114,7 +112,6 @@ static void __inet_put_port(struct sock *sk) spin_lock(&head->lock); tb = inet_csk(sk)->icsk_bind_hash; __sk_del_bind_node(sk); - tb->num_owners--; inet_csk(sk)->icsk_bind_hash = NULL; inet_sk(sk)->inet_num = 0; inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); From 6cd66616834c89b8a6c8a182c4c99e5478cf6d6b Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 17 Jan 2017 07:51:04 -0800 Subject: [PATCH 4/6] inet: don't check for bind conflicts twice when searching for a port This is just wasted time, we've already found a tb that doesn't have a bind conflict, and we don't drop the head lock so scanning again isn't going to give us a different answer. Instead move the tb->reuse setting logic outside of the found_tb path and put it in the success: path. Then make it so that we don't goto again if we find a bind conflict in the found_tb path as we won't reach this anymore when we are scanning for an ephemeral port. Signed-off-by: Josef Bacik Signed-off-by: David S. 
Miller --- net/ipv4/inet_connection_sock.c | 31 +++++++++++-------------------- 1 file changed, 11 insertions(+), 20 deletions(-) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index d3523661c905..f7e844d84836 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -164,7 +164,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum) { bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN; struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo; - int ret = 1, attempts = 5, port = snum; + int ret = 1, port = snum; struct inet_bind_hashbucket *head; struct net *net = sock_net(sk); int i, low, high, attempt_half; @@ -183,7 +183,6 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum) goto tb_not_found; } -again: attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0; other_half_scan: inet_get_local_port_range(net, &low, &high); @@ -221,7 +220,7 @@ other_parity_scan: inet_bind_bucket_for_each(tb, &head->chain) if (net_eq(ib_net(tb), net) && tb->port == port) { if (!inet_csk_bind_conflict(sk, tb, false, reuseport_ok)) - goto tb_found; + goto success; goto next_port; } goto tb_not_found; @@ -256,23 +255,11 @@ tb_found: !rcu_access_pointer(sk->sk_reuseport_cb) && sk->sk_reuseport && uid_eq(tb->fastuid, uid))) goto success; - if (inet_csk_bind_conflict(sk, tb, true, reuseport_ok)) { - if ((reuse || - (tb->fastreuseport > 0 && - sk->sk_reuseport && - !rcu_access_pointer(sk->sk_reuseport_cb) && - uid_eq(tb->fastuid, uid))) && !snum && - --attempts >= 0) { - spin_unlock_bh(&head->lock); - goto again; - } + if (inet_csk_bind_conflict(sk, tb, true, reuseport_ok)) goto fail_unlock; - } - if (!reuse) - tb->fastreuse = 0; - if (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid)) - tb->fastreuseport = 0; - } else { + } +success: + if (!hlist_empty(&tb->owners)) { tb->fastreuse = reuse; if (sk->sk_reuseport) { tb->fastreuseport = 1; @@ -280,8 +267,12 @@ tb_found: } else { tb->fastreuseport = 0; } + } else { + 
if (!reuse) + tb->fastreuse = 0; + if (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid)) + tb->fastreuseport = 0; } -success: if (!inet_csk(sk)->icsk_bind_hash) inet_bind_hash(sk, tb, port); WARN_ON(inet_csk(sk)->icsk_bind_hash != tb); From 289141b7688b71dc69b8d7a54bf67a4d7bc79f96 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 17 Jan 2017 07:51:05 -0800 Subject: [PATCH 5/6] inet: split inet_csk_get_port into two functions inet_csk_get_port does two different things, it either scans for an open port, or it tries to see if the specified port is available for use. Since these two operations have different rules and are basically independent lets split them into two different functions to make them both more readable. Signed-off-by: Josef Bacik Signed-off-by: David S. Miller --- net/ipv4/inet_connection_sock.c | 66 ++++++++++++++++++++++----------- 1 file changed, 44 insertions(+), 22 deletions(-) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index f7e844d84836..bbe28920e2d8 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -156,33 +156,21 @@ static int inet_csk_bind_conflict(const struct sock *sk, return sk2 != NULL; } -/* Obtain a reference to a local port for the given sock, - * if snum is zero it means select any available local port. - * We try to allocate an odd port (and leave even ports for connect()) +/* + * Find an open port number for the socket. Returns with the + * inet_bind_hashbucket lock held. 
*/ -int inet_csk_get_port(struct sock *sk, unsigned short snum) +static struct inet_bind_hashbucket * +inet_csk_find_open_port(struct sock *sk, struct inet_bind_bucket **tb_ret, int *port_ret) { - bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN; struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo; - int ret = 1, port = snum; + int port = 0; struct inet_bind_hashbucket *head; struct net *net = sock_net(sk); int i, low, high, attempt_half; struct inet_bind_bucket *tb; - kuid_t uid = sock_i_uid(sk); u32 remaining, offset; - bool reuseport_ok = !!snum; - if (port) { - head = &hinfo->bhash[inet_bhashfn(net, port, - hinfo->bhash_size)]; - spin_lock_bh(&head->lock); - inet_bind_bucket_for_each(tb, &head->chain) - if (net_eq(ib_net(tb), net) && tb->port == port) - goto tb_found; - - goto tb_not_found; - } attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0; other_half_scan: inet_get_local_port_range(net, &low, &high); @@ -219,11 +207,12 @@ other_parity_scan: spin_lock_bh(&head->lock); inet_bind_bucket_for_each(tb, &head->chain) if (net_eq(ib_net(tb), net) && tb->port == port) { - if (!inet_csk_bind_conflict(sk, tb, false, reuseport_ok)) + if (!inet_csk_bind_conflict(sk, tb, false, false)) goto success; goto next_port; } - goto tb_not_found; + tb = NULL; + goto success; next_port: spin_unlock_bh(&head->lock); cond_resched(); @@ -238,8 +227,41 @@ next_port: attempt_half = 2; goto other_half_scan; } - return ret; + return NULL; +success: + *port_ret = port; + *tb_ret = tb; + return head; +} +/* Obtain a reference to a local port for the given sock, + * if snum is zero it means select any available local port. 
+ * We try to allocate an odd port (and leave even ports for connect()) + */ +int inet_csk_get_port(struct sock *sk, unsigned short snum) +{ + bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN; + struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo; + int ret = 1, port = snum; + struct inet_bind_hashbucket *head; + struct net *net = sock_net(sk); + struct inet_bind_bucket *tb = NULL; + kuid_t uid = sock_i_uid(sk); + + if (!port) { + head = inet_csk_find_open_port(sk, &tb, &port); + if (!head) + return ret; + if (!tb) + goto tb_not_found; + goto success; + } + head = &hinfo->bhash[inet_bhashfn(net, port, + hinfo->bhash_size)]; + spin_lock_bh(&head->lock); + inet_bind_bucket_for_each(tb, &head->chain) + if (net_eq(ib_net(tb), net) && tb->port == port) + goto tb_found; tb_not_found: tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, net, head, port); @@ -255,7 +277,7 @@ tb_found: !rcu_access_pointer(sk->sk_reuseport_cb) && sk->sk_reuseport && uid_eq(tb->fastuid, uid))) goto success; - if (inet_csk_bind_conflict(sk, tb, true, reuseport_ok)) + if (inet_csk_bind_conflict(sk, tb, true, true)) goto fail_unlock; } success: From 637bc8bbe6c0a288a596edfdcdd5657c72a848db Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 17 Jan 2017 07:51:06 -0800 Subject: [PATCH 6/6] inet: reset tb->fastreuseport when adding a reuseport sk If we have non reuseport sockets on a tb we will set tb->fastreuseport to 0 and never set it again. Which means that in the future if we end up adding a bunch of reuseport sk's to that tb we'll have to do the expensive scan every time. Instead add the ipv4/ipv6 saddr fields to the bind bucket, as well as the family so we know what comparison to make, and the ipv6 only setting so we can make sure to compare with new sockets appropriately. Once one sk has made it onto the list we know that there are no potential bind conflicts on the owners list that match that sk's rcv_addr. 
So copy the sk's information into our bind bucket and set tb->fastreuseport to FASTREUSEPORT_STRICT so we know we have to do an extra check for subsequent reuseport sockets and skip the expensive bind conflict check. Signed-off-by: Josef Bacik Signed-off-by: David S. Miller --- include/net/inet_hashtables.h | 9 +++ net/ipv4/inet_connection_sock.c | 106 ++++++++++++++++++++++++++------ 2 files changed, 95 insertions(+), 20 deletions(-) diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 3fc0366743da..1178931288cb 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -74,12 +74,21 @@ struct inet_ehash_bucket { * users logged onto your box, isn't it nice to know that new data * ports are created in O(1) time? I thought so. ;-) -DaveM */ +#define FASTREUSEPORT_ANY 1 +#define FASTREUSEPORT_STRICT 2 + struct inet_bind_bucket { possible_net_t ib_net; unsigned short port; signed char fastreuse; signed char fastreuseport; kuid_t fastuid; +#if IS_ENABLED(CONFIG_IPV6) + struct in6_addr fast_v6_rcv_saddr; +#endif + __be32 fast_rcv_saddr; + unsigned short fast_sk_family; + bool fast_ipv6_only; struct hlist_node node; struct hlist_head owners; }; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index bbe28920e2d8..096a085611ab 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -38,20 +38,21 @@ EXPORT_SYMBOL(inet_csk_timer_bug_msg); * IPV6_ADDR_ANY only equals to IPV6_ADDR_ANY, * and 0.0.0.0 equals to 0.0.0.0 only */ -static int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, +static int ipv6_rcv_saddr_equal(const struct in6_addr *sk1_rcv_saddr6, + const struct in6_addr *sk2_rcv_saddr6, + __be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr, + bool sk1_ipv6only, bool sk2_ipv6only, bool match_wildcard) { - const struct in6_addr *sk2_rcv_saddr6 = inet6_rcv_saddr(sk2); - int sk2_ipv6only = inet_v6_ipv6only(sk2); - int addr_type =
ipv6_addr_type(&sk->sk_v6_rcv_saddr); + int addr_type = ipv6_addr_type(sk1_rcv_saddr6); int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED; /* if both are mapped, treat as IPv4 */ if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) { if (!sk2_ipv6only) { - if (sk->sk_rcv_saddr == sk2->sk_rcv_saddr) + if (sk1_rcv_saddr == sk2_rcv_saddr) return 1; - if (!sk->sk_rcv_saddr || !sk2->sk_rcv_saddr) + if (!sk1_rcv_saddr || !sk2_rcv_saddr) return match_wildcard; } return 0; @@ -65,11 +66,11 @@ static int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, return 1; if (addr_type == IPV6_ADDR_ANY && match_wildcard && - !(ipv6_only_sock(sk) && addr_type2 == IPV6_ADDR_MAPPED)) + !(sk1_ipv6only && addr_type2 == IPV6_ADDR_MAPPED)) return 1; if (sk2_rcv_saddr6 && - ipv6_addr_equal(&sk->sk_v6_rcv_saddr, sk2_rcv_saddr6)) + ipv6_addr_equal(sk1_rcv_saddr6, sk2_rcv_saddr6)) return 1; return 0; @@ -80,13 +81,13 @@ static int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, * match_wildcard == false: addresses must be exactly the same, i.e. 
* 0.0.0.0 only equals to 0.0.0.0 */ -static int ipv4_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, - bool match_wildcard) +static int ipv4_rcv_saddr_equal(__be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr, + bool sk2_ipv6only, bool match_wildcard) { - if (!ipv6_only_sock(sk2)) { - if (sk->sk_rcv_saddr == sk2->sk_rcv_saddr) + if (!sk2_ipv6only) { + if (sk1_rcv_saddr == sk2_rcv_saddr) return 1; - if (!sk->sk_rcv_saddr || !sk2->sk_rcv_saddr) + if (!sk1_rcv_saddr || !sk2_rcv_saddr) return match_wildcard; } return 0; @@ -97,9 +98,16 @@ int inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, { #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) - return ipv6_rcv_saddr_equal(sk, sk2, match_wildcard); + return ipv6_rcv_saddr_equal(&sk->sk_v6_rcv_saddr, + &sk2->sk_v6_rcv_saddr, + sk->sk_rcv_saddr, + sk2->sk_rcv_saddr, + ipv6_only_sock(sk), + ipv6_only_sock(sk2), + match_wildcard); #endif - return ipv4_rcv_saddr_equal(sk, sk2, match_wildcard); + return ipv4_rcv_saddr_equal(sk->sk_rcv_saddr, sk2->sk_rcv_saddr, + ipv6_only_sock(sk2), match_wildcard); } EXPORT_SYMBOL(inet_rcv_saddr_equal); @@ -234,6 +242,39 @@ success: return head; } +static inline int sk_reuseport_match(struct inet_bind_bucket *tb, + struct sock *sk) +{ + kuid_t uid = sock_i_uid(sk); + + if (tb->fastreuseport <= 0) + return 0; + if (!sk->sk_reuseport) + return 0; + if (rcu_access_pointer(sk->sk_reuseport_cb)) + return 0; + if (!uid_eq(tb->fastuid, uid)) + return 0; + /* We only need to check the rcv_saddr if this tb was once marked + * without fastreuseport and then was reset, as we can only know that + * the fast_*rcv_saddr doesn't have any conflicts with the socks on the + * owners list. 
+ */ + if (tb->fastreuseport == FASTREUSEPORT_ANY) + return 1; +#if IS_ENABLED(CONFIG_IPV6) + if (tb->fast_sk_family == AF_INET6) + return ipv6_rcv_saddr_equal(&tb->fast_v6_rcv_saddr, + &sk->sk_v6_rcv_saddr, + tb->fast_rcv_saddr, + sk->sk_rcv_saddr, + tb->fast_ipv6_only, + ipv6_only_sock(sk), true); +#endif + return ipv4_rcv_saddr_equal(tb->fast_rcv_saddr, sk->sk_rcv_saddr, + ipv6_only_sock(sk), true); +} + /* Obtain a reference to a local port for the given sock, * if snum is zero it means select any available local port. * We try to allocate an odd port (and leave even ports for connect()) @@ -273,9 +314,7 @@ tb_found: goto success; if ((tb->fastreuse > 0 && reuse) || - (tb->fastreuseport > 0 && - !rcu_access_pointer(sk->sk_reuseport_cb) && - sk->sk_reuseport && uid_eq(tb->fastuid, uid))) + sk_reuseport_match(tb, sk)) goto success; if (inet_csk_bind_conflict(sk, tb, true, true)) goto fail_unlock; @@ -284,16 +323,43 @@ success: if (!hlist_empty(&tb->owners)) { tb->fastreuse = reuse; if (sk->sk_reuseport) { - tb->fastreuseport = 1; + tb->fastreuseport = FASTREUSEPORT_ANY; tb->fastuid = uid; + tb->fast_rcv_saddr = sk->sk_rcv_saddr; + tb->fast_ipv6_only = ipv6_only_sock(sk); +#if IS_ENABLED(CONFIG_IPV6) + tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr; +#endif } else { tb->fastreuseport = 0; } } else { if (!reuse) tb->fastreuse = 0; - if (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid)) + if (sk->sk_reuseport) { + /* We didn't match or we don't have fastreuseport set on + * the tb, but we have sk_reuseport set on this socket + * and we know that there are no bind conflicts with + * this socket in this tb, so reset our tb's reuseport + * settings so that any subsequent sockets that match + * our current socket will be put on the fast path. + * + * If we reset we need to set FASTREUSEPORT_STRICT so we + * do extra checking for all subsequent sk_reuseport + * socks. 
+ */ + if (!sk_reuseport_match(tb, sk)) { + tb->fastreuseport = FASTREUSEPORT_STRICT; + tb->fastuid = uid; + tb->fast_rcv_saddr = sk->sk_rcv_saddr; + tb->fast_ipv6_only = ipv6_only_sock(sk); +#if IS_ENABLED(CONFIG_IPV6) + tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr; +#endif + } + } else { tb->fastreuseport = 0; + } } if (!inet_csk(sk)->icsk_bind_hash) inet_bind_hash(sk, tb, port);