From 8e2b8a9fa512709e6fee744dcd4e2a20ee7f5c56 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Tue, 19 Dec 2023 22:31:04 +0100 Subject: [PATCH 1/4] mptcp: don't overwrite sock_ops in mptcp_is_tcpsk() Eric Dumazet suggests: > The fact that mptcp_is_tcpsk() was able to write over sock->ops was a > bit strange to me. > mptcp_is_tcpsk() should answer a question, with a read-only argument. re-factor code to avoid overwriting sock_ops inside that function. Also, change the helper name to reflect the semantics and to disambiguate from its dual, sk_is_mptcp(). While at it, collapse mptcp_stream_accept() and mptcp_accept() into a single function, where fallback / non-fallback are separated into a single sk_is_mptcp() conditional. Link: https://github.com/multipath-tcp/mptcp_net-next/issues/432 Suggested-by: Eric Dumazet Signed-off-by: Davide Caratti Acked-by: Paolo Abeni Signed-off-by: Matthieu Baerts Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 108 ++++++++++++++++++------------------------- 1 file changed, 44 insertions(+), 64 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 5cd5c3f535a8..91e5845d80a9 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -55,28 +55,14 @@ static u64 mptcp_wnd_end(const struct mptcp_sock *msk) return READ_ONCE(msk->wnd_end); } -static bool mptcp_is_tcpsk(struct sock *sk) +static const struct proto_ops *mptcp_fallback_tcp_ops(const struct sock *sk) { - struct socket *sock = sk->sk_socket; - - if (unlikely(sk->sk_prot == &tcp_prot)) { - /* we are being invoked after mptcp_accept() has - * accepted a non-mp-capable flow: sk is a tcp_sk, - * not an mptcp one. - * - * Hand the socket over to tcp so all further socket ops - * bypass mptcp. - */ - WRITE_ONCE(sock->ops, &inet_stream_ops); - return true; #if IS_ENABLED(CONFIG_MPTCP_IPV6) - } else if (unlikely(sk->sk_prot == &tcpv6_prot)) { - WRITE_ONCE(sock->ops, &inet6_stream_ops); - return true; + if (sk->sk_prot == &tcpv6_prot) + return &inet6_stream_ops; #endif - } - - return false; + WARN_ON_ONCE(sk->sk_prot != &tcp_prot); + return &inet_stream_ops; } static int __mptcp_socket_create(struct mptcp_sock *msk) @@ -3258,44 +3244,6 @@ void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk) WRITE_ONCE(msk->wnd_end, msk->snd_nxt + tcp_sk(ssk)->snd_wnd); } -static struct sock *mptcp_accept(struct sock *ssk, int flags, int *err, - bool kern) -{ - struct sock *newsk; - - pr_debug("ssk=%p, listener=%p", ssk, mptcp_subflow_ctx(ssk)); - newsk = inet_csk_accept(ssk, flags, err, kern); - if (!newsk) - return NULL; - - pr_debug("newsk=%p, subflow is mptcp=%d", newsk, sk_is_mptcp(newsk)); - if (sk_is_mptcp(newsk)) { - struct mptcp_subflow_context *subflow; - struct sock *new_mptcp_sock; - - subflow = mptcp_subflow_ctx(newsk); - new_mptcp_sock = subflow->conn; - - /* is_mptcp should be false if subflow->conn is missing, see - * subflow_syn_recv_sock() - */ - if (WARN_ON_ONCE(!new_mptcp_sock)) { - tcp_sk(newsk)->is_mptcp = 0; - goto out; - } - - newsk = new_mptcp_sock; - MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPCAPABLEPASSIVEACK); - } else { - MPTCP_INC_STATS(sock_net(ssk), - MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK); - } - -out: - newsk->sk_kern_sock = kern; - return newsk; -} - void mptcp_destroy_common(struct mptcp_sock *msk, unsigned int flags) { struct mptcp_subflow_context *subflow, *tmp; @@ -3739,7 +3687,6 @@ static struct proto mptcp_prot = { .connect = mptcp_connect, .disconnect = mptcp_disconnect, .close = mptcp_close, - .accept = mptcp_accept, .setsockopt = mptcp_setsockopt, .getsockopt = mptcp_getsockopt, .shutdown = mptcp_shutdown, @@ -3849,18 +3796,36 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, if (!ssk) return -EINVAL; - newsk = mptcp_accept(ssk, flags, &err, kern); + pr_debug("ssk=%p, listener=%p", ssk, mptcp_subflow_ctx(ssk)); + newsk = inet_csk_accept(ssk, flags, &err, kern); if (!newsk) return err; - lock_sock(newsk); - - __inet_accept(sock, newsock, newsk); - if (!mptcp_is_tcpsk(newsock->sk)) { - struct mptcp_sock *msk = mptcp_sk(newsk); + pr_debug("newsk=%p, subflow is mptcp=%d", newsk, sk_is_mptcp(newsk)); + if (sk_is_mptcp(newsk)) { struct mptcp_subflow_context *subflow; + struct sock *new_mptcp_sock; + + subflow = mptcp_subflow_ctx(newsk); + new_mptcp_sock = subflow->conn; + + /* is_mptcp should be false if subflow->conn is missing, see + * subflow_syn_recv_sock() + */ + if (WARN_ON_ONCE(!new_mptcp_sock)) { + tcp_sk(newsk)->is_mptcp = 0; + goto tcpfallback; + } + + newsk = new_mptcp_sock; + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPCAPABLEPASSIVEACK); + + newsk->sk_kern_sock = kern; + lock_sock(newsk); + __inet_accept(sock, newsock, newsk); set_bit(SOCK_CUSTOM_SOCKOPT, &newsock->flags); + msk = mptcp_sk(newsk); msk->in_accept_queue = 0; /* set ssk->sk_socket of accept()ed flows to mptcp socket. @@ -3882,6 +3847,21 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, if (unlikely(list_is_singular(&msk->conn_list))) inet_sk_state_store(newsk, TCP_CLOSE); } + } else { + MPTCP_INC_STATS(sock_net(ssk), + MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK); +tcpfallback: + newsk->sk_kern_sock = kern; + lock_sock(newsk); + __inet_accept(sock, newsock, newsk); + /* we are being invoked after accepting a non-mp-capable + * flow: sk is a tcp_sk, not an mptcp one. + * + * Hand the socket over to tcp so all further socket ops + * bypass mptcp. + */ + WRITE_ONCE(newsock->sk->sk_socket->ops, + mptcp_fallback_tcp_ops(newsock->sk)); } release_sock(newsk); From 57d3117ca80f46d5c5af285ca75360eaf28df172 Mon Sep 17 00:00:00 2001 From: Maxim Galaganov Date: Tue, 19 Dec 2023 22:31:05 +0100 Subject: [PATCH 2/4] mptcp: rename mptcp_setsockopt_sol_ip_set_transparent() Next patch extends this function so that it's not specific to IP_TRANSPARENT. Change function name to mptcp_setsockopt_sol_ip_set(). Reviewed-by: Mat Martineau Signed-off-by: Maxim Galaganov Signed-off-by: Matthieu Baerts Signed-off-by: David S. Miller --- net/mptcp/sockopt.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index cabe856b2a45..a4bf337e6f77 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -683,8 +683,8 @@ static int mptcp_setsockopt_sol_tcp_nodelay(struct mptcp_sock *msk, sockptr_t op return 0; } -static int mptcp_setsockopt_sol_ip_set_transparent(struct mptcp_sock *msk, int optname, - sockptr_t optval, unsigned int optlen) +static int mptcp_setsockopt_sol_ip_set(struct mptcp_sock *msk, int optname, + sockptr_t optval, unsigned int optlen) { struct sock *sk = (struct sock *)msk; struct sock *ssk; @@ -755,7 +755,7 @@ static int mptcp_setsockopt_v4(struct mptcp_sock *msk, int optname, switch (optname) { case IP_FREEBIND: case IP_TRANSPARENT: - return mptcp_setsockopt_sol_ip_set_transparent(msk, optname, optval, optlen); + return mptcp_setsockopt_sol_ip_set(msk, optname, optval, optlen); case IP_TOS: return mptcp_setsockopt_v4_set_tos(msk, optname, optval, optlen); } From c85636a2926424c70e13925db734eaca14f6367c Mon Sep 17 00:00:00 2001 From: Maxim Galaganov Date: Tue, 19 Dec 2023 22:31:06 +0100 Subject: [PATCH 3/4] mptcp: sockopt: support IP_LOCAL_PORT_RANGE and IP_BIND_ADDRESS_NO_PORT Support for IP_BIND_ADDRESS_NO_PORT sockopt was introduced in [1]. Recently [2] allowed its value to be accessed without locking the socket. Support for (newer) IP_LOCAL_PORT_RANGE sockopt was introduced in [3]. In the same series a selftest was added in [4]. This selftest also covers the IP_BIND_ADDRESS_NO_PORT sockopt. This patch enables getsockopt()/setsockopt() on MPTCP sockets for these socket options, syncing set values to subflows in sync_socket_options(). Ephemeral port range is synced to subflows, enabling NAT usecase described in [3]. [1] commit 90c337da1524 ("inet: add IP_BIND_ADDRESS_NO_PORT to overcome bind(0) limitations") [2] commit ca571e2eb7eb ("inet: move inet->bind_address_no_port to inet->inet_flags") [3] commit 91d0b78c5177 ("inet: Add IP_LOCAL_PORT_RANGE socket option") [4] commit ae5439658cce ("selftests/net: Cover the IP_LOCAL_PORT_RANGE socket option") Signed-off-by: Maxim Galaganov Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts Signed-off-by: David S. Miller --- net/mptcp/sockopt.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index a4bf337e6f77..c40f1428e602 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -440,6 +440,8 @@ static bool mptcp_supported_sockopt(int level, int optname) /* should work fine */ case IP_FREEBIND: case IP_TRANSPARENT: + case IP_BIND_ADDRESS_NO_PORT: + case IP_LOCAL_PORT_RANGE: /* the following are control cmsg related */ case IP_PKTINFO: @@ -455,7 +457,6 @@ static bool mptcp_supported_sockopt(int level, int optname) /* common stuff that need some love */ case IP_TOS: case IP_TTL: - case IP_BIND_ADDRESS_NO_PORT: case IP_MTU_DISCOVER: case IP_RECVERR: @@ -710,6 +711,14 @@ static int mptcp_setsockopt_sol_ip_set(struct mptcp_sock *msk, int optname, inet_assign_bit(TRANSPARENT, ssk, inet_test_bit(TRANSPARENT, sk)); break; + case IP_BIND_ADDRESS_NO_PORT: + inet_assign_bit(BIND_ADDRESS_NO_PORT, ssk, + inet_test_bit(BIND_ADDRESS_NO_PORT, sk)); + break; + case IP_LOCAL_PORT_RANGE: + WRITE_ONCE(inet_sk(ssk)->local_port_range, + READ_ONCE(inet_sk(sk)->local_port_range)); + break; default: release_sock(sk); WARN_ON_ONCE(1); @@ -755,6 +764,8 @@ static int mptcp_setsockopt_v4(struct mptcp_sock *msk, int optname, switch (optname) { case IP_FREEBIND: case IP_TRANSPARENT: + case IP_BIND_ADDRESS_NO_PORT: + case IP_LOCAL_PORT_RANGE: return mptcp_setsockopt_sol_ip_set(msk, optname, optval, optlen); case IP_TOS: return mptcp_setsockopt_v4_set_tos(msk, optname, optval, optlen); @@ -1350,6 +1361,12 @@ static int mptcp_getsockopt_v4(struct mptcp_sock *msk, int optname, switch (optname) { case IP_TOS: return mptcp_put_int_option(msk, optval, optlen, READ_ONCE(inet_sk(sk)->tos)); + case IP_BIND_ADDRESS_NO_PORT: + return mptcp_put_int_option(msk, optval, optlen, + inet_test_bit(BIND_ADDRESS_NO_PORT, sk)); + case IP_LOCAL_PORT_RANGE: + return mptcp_put_int_option(msk, optval, optlen, + READ_ONCE(inet_sk(sk)->local_port_range)); } return -EOPNOTSUPP; @@ -1450,6 +1467,8 @@ static void sync_socket_options(struct mptcp_sock *msk, struct sock *ssk) inet_assign_bit(TRANSPARENT, ssk, inet_test_bit(TRANSPARENT, sk)); inet_assign_bit(FREEBIND, ssk, inet_test_bit(FREEBIND, sk)); + inet_assign_bit(BIND_ADDRESS_NO_PORT, ssk, inet_test_bit(BIND_ADDRESS_NO_PORT, sk)); + WRITE_ONCE(inet_sk(ssk)->local_port_range, READ_ONCE(inet_sk(sk)->local_port_range)); } void mptcp_sockopt_sync_locked(struct mptcp_sock *msk, struct sock *ssk) From 122db5e3634b85f6caeca19345e0adbdf79cb257 Mon Sep 17 00:00:00 2001 From: Maxim Galaganov Date: Tue, 19 Dec 2023 22:31:07 +0100 Subject: [PATCH 4/4] selftests/net: add MPTCP coverage for IP_LOCAL_PORT_RANGE Since previous commit, MPTCP has support for IP_BIND_ADDRESS_NO_PORT and IP_LOCAL_PORT_RANGE sockopts. Add ip4_mptcp and ip6_mptcp fixture variants to ip_local_port_range selftest to provide selftest coverage for these sockopts. Acked-by: Mat Martineau Signed-off-by: Maxim Galaganov Signed-off-by: Matthieu Baerts Signed-off-by: David S. Miller --- tools/testing/selftests/net/ip_local_port_range.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tools/testing/selftests/net/ip_local_port_range.c b/tools/testing/selftests/net/ip_local_port_range.c index 75e3fdacdf73..0f217a1cc837 100644 --- a/tools/testing/selftests/net/ip_local_port_range.c +++ b/tools/testing/selftests/net/ip_local_port_range.c @@ -146,6 +146,12 @@ FIXTURE_VARIANT_ADD(ip_local_port_range, ip4_stcp) { .so_protocol = IPPROTO_SCTP, }; +FIXTURE_VARIANT_ADD(ip_local_port_range, ip4_mptcp) { + .so_domain = AF_INET, + .so_type = SOCK_STREAM, + .so_protocol = IPPROTO_MPTCP, +}; + FIXTURE_VARIANT_ADD(ip_local_port_range, ip6_tcp) { .so_domain = AF_INET6, .so_type = SOCK_STREAM, @@ -164,6 +170,12 @@ FIXTURE_VARIANT_ADD(ip_local_port_range, ip6_stcp) { .so_protocol = IPPROTO_SCTP, }; +FIXTURE_VARIANT_ADD(ip_local_port_range, ip6_mptcp) { + .so_domain = AF_INET6, + .so_type = SOCK_STREAM, + .so_protocol = IPPROTO_MPTCP, +}; + TEST_F(ip_local_port_range, invalid_option_value) { __u16 val16;