2019-05-27 09:55:01 +03:00
// SPDX-License-Identifier: GPL-2.0-or-later
2007-04-27 02:48:28 +04:00
/* connection-level event handling
 *
 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */
2016-06-02 22:08:52 +03:00
# define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
2007-04-27 02:48:28 +04:00
# include <linux/module.h>
# include <linux/net.h>
# include <linux/skbuff.h>
# include <linux/errqueue.h>
# include <net/sock.h>
# include <net/af_rxrpc.h>
# include <net/ip.h>
# include "ar-internal.h"
2016-08-23 17:27:25 +03:00
/*
* Retransmit terminal ACK or ABORT of the previous call .
*/
2016-08-30 11:49:28 +03:00
static void rxrpc_conn_retransmit_call ( struct rxrpc_connection * conn ,
2017-11-24 13:18:41 +03:00
struct sk_buff * skb ,
unsigned int channel )
2016-08-23 17:27:25 +03:00
{
2017-11-24 13:18:41 +03:00
struct rxrpc_skb_priv * sp = skb ? rxrpc_skb ( skb ) : NULL ;
2016-08-23 17:27:25 +03:00
struct rxrpc_channel * chan ;
struct msghdr msg ;
2017-11-29 17:40:41 +03:00
struct kvec iov [ 3 ] ;
2016-08-23 17:27:25 +03:00
struct {
struct rxrpc_wire_header whdr ;
union {
2017-11-29 17:40:41 +03:00
__be32 abort_code ;
struct rxrpc_ackpacket ack ;
2016-08-23 17:27:25 +03:00
} ;
} __attribute__ ( ( packed ) ) pkt ;
2017-11-29 17:40:41 +03:00
struct rxrpc_ackinfo ack_info ;
2016-08-23 17:27:25 +03:00
size_t len ;
2018-05-11 01:26:01 +03:00
int ret , ioc ;
2017-11-29 17:40:41 +03:00
u32 serial , mtu , call_id , padding ;
2016-08-23 17:27:25 +03:00
_enter ( " %d " , conn - > debug_id ) ;
2017-11-24 13:18:41 +03:00
chan = & conn - > channels [ channel ] ;
2016-08-23 17:27:25 +03:00
/* If the last call got moved on whilst we were waiting to run, just
* ignore this packet .
*/
call_id = READ_ONCE ( chan - > last_call ) ;
/* Sync with __rxrpc_disconnect_call() */
smp_rmb ( ) ;
2017-11-24 13:18:41 +03:00
if ( skb & & call_id ! = sp - > hdr . callNumber )
2016-08-23 17:27:25 +03:00
return ;
msg . msg_name = & conn - > params . peer - > srx . transport ;
msg . msg_namelen = conn - > params . peer - > srx . transport_len ;
msg . msg_control = NULL ;
msg . msg_controllen = 0 ;
msg . msg_flags = 0 ;
2017-11-29 17:40:41 +03:00
iov [ 0 ] . iov_base = & pkt ;
iov [ 0 ] . iov_len = sizeof ( pkt . whdr ) ;
iov [ 1 ] . iov_base = & padding ;
iov [ 1 ] . iov_len = 3 ;
iov [ 2 ] . iov_base = & ack_info ;
iov [ 2 ] . iov_len = sizeof ( ack_info ) ;
2017-11-24 13:18:41 +03:00
pkt . whdr . epoch = htonl ( conn - > proto . epoch ) ;
2018-06-06 16:59:14 +03:00
pkt . whdr . cid = htonl ( conn - > proto . cid | channel ) ;
2017-11-24 13:18:41 +03:00
pkt . whdr . callNumber = htonl ( call_id ) ;
2016-08-23 17:27:25 +03:00
pkt . whdr . seq = 0 ;
pkt . whdr . type = chan - > last_type ;
pkt . whdr . flags = conn - > out_clientflag ;
pkt . whdr . userStatus = 0 ;
pkt . whdr . securityIndex = conn - > security_ix ;
pkt . whdr . _rsvd = 0 ;
2017-06-05 16:30:49 +03:00
pkt . whdr . serviceId = htons ( conn - > service_id ) ;
2016-08-23 17:27:25 +03:00
len = sizeof ( pkt . whdr ) ;
switch ( chan - > last_type ) {
case RXRPC_PACKET_TYPE_ABORT :
2017-11-29 17:40:41 +03:00
pkt . abort_code = htonl ( chan - > last_abort ) ;
iov [ 0 ] . iov_len + = sizeof ( pkt . abort_code ) ;
len + = sizeof ( pkt . abort_code ) ;
ioc = 1 ;
2016-08-23 17:27:25 +03:00
break ;
case RXRPC_PACKET_TYPE_ACK :
mtu = conn - > params . peer - > if_mtu ;
mtu - = conn - > params . peer - > hdrsize ;
pkt . ack . bufferSpace = 0 ;
2017-11-24 13:18:41 +03:00
pkt . ack . maxSkew = htons ( skb ? skb - > priority : 0 ) ;
pkt . ack . firstPacket = htonl ( chan - > last_seq + 1 ) ;
pkt . ack . previousPacket = htonl ( chan - > last_seq ) ;
pkt . ack . serial = htonl ( skb ? sp - > hdr . serial : 0 ) ;
pkt . ack . reason = skb ? RXRPC_ACK_DUPLICATE : RXRPC_ACK_IDLE ;
2016-08-23 17:27:25 +03:00
pkt . ack . nAcks = 0 ;
2017-11-29 17:40:41 +03:00
ack_info . rxMTU = htonl ( rxrpc_rx_mtu ) ;
ack_info . maxMTU = htonl ( mtu ) ;
ack_info . rwind = htonl ( rxrpc_rx_window_size ) ;
ack_info . jumbo_max = htonl ( rxrpc_rx_jumbo_max ) ;
2016-09-24 20:05:27 +03:00
pkt . whdr . flags | = RXRPC_SLOW_START_OK ;
2017-11-29 17:40:41 +03:00
padding = 0 ;
iov [ 0 ] . iov_len + = sizeof ( pkt . ack ) ;
len + = sizeof ( pkt . ack ) + 3 + sizeof ( ack_info ) ;
ioc = 3 ;
2016-08-23 17:27:25 +03:00
break ;
2017-11-29 17:40:41 +03:00
default :
return ;
2016-08-23 17:27:25 +03:00
}
/* Resync with __rxrpc_disconnect_call() and check that the last call
* didn ' t get advanced whilst we were filling out the packets .
*/
smp_rmb ( ) ;
if ( READ_ONCE ( chan - > last_call ) ! = call_id )
return ;
serial = atomic_inc_return ( & conn - > serial ) ;
pkt . whdr . serial = htonl ( serial ) ;
switch ( chan - > last_type ) {
case RXRPC_PACKET_TYPE_ABORT :
2018-10-08 17:46:17 +03:00
_proto ( " Tx ABORT %%%u { %d } [re] " , serial , conn - > abort_code ) ;
2016-08-23 17:27:25 +03:00
break ;
case RXRPC_PACKET_TYPE_ACK :
2018-07-23 19:18:37 +03:00
trace_rxrpc_tx_ack ( chan - > call_debug_id , serial ,
2018-07-23 19:18:36 +03:00
ntohl ( pkt . ack . firstPacket ) ,
ntohl ( pkt . ack . serial ) ,
pkt . ack . reason , 0 ) ;
2016-08-23 17:27:25 +03:00
_proto ( " Tx ACK %%%u [re] " , serial ) ;
break ;
}
2018-05-11 01:26:01 +03:00
ret = kernel_sendmsg ( conn - > params . local - > socket , & msg , iov , ioc , len ) ;
2018-08-08 13:30:02 +03:00
conn - > params . peer - > last_tx_at = ktime_get_seconds ( ) ;
2018-05-11 01:26:01 +03:00
if ( ret < 0 )
2018-07-23 19:18:37 +03:00
trace_rxrpc_tx_fail ( chan - > call_debug_id , serial , ret ,
rxrpc_tx_point_call_final_resend ) ;
else
trace_rxrpc_tx_packet ( chan - > call_debug_id , & pkt . whdr ,
rxrpc_tx_point_call_final_resend ) ;
2018-05-11 01:26:01 +03:00
2016-08-23 17:27:25 +03:00
_leave ( " " ) ;
}
2007-04-27 02:48:28 +04:00
/*
* pass a connection - level abort onto all calls on that connection
*/
2016-08-30 11:49:28 +03:00
static void rxrpc_abort_calls ( struct rxrpc_connection * conn ,
2019-04-12 18:34:09 +03:00
enum rxrpc_call_completion compl ,
rxrpc_serial_t serial )
2007-04-27 02:48:28 +04:00
{
struct rxrpc_call * call ;
rxrpc: Rewrite the data and ack handling code
Rewrite the data and ack handling code such that:
(1) Parsing of received ACK and ABORT packets and the distribution and the
filing of DATA packets happens entirely within the data_ready context
called from the UDP socket. This allows us to process and discard ACK
and ABORT packets much more quickly (they're no longer stashed on a
queue for a background thread to process).
(2) We avoid calling skb_clone(), pskb_pull() and pskb_trim(). We instead
keep track of the offset and length of the content of each packet in
the sk_buff metadata. This means we don't do any allocation in the
receive path.
(3) Jumbo DATA packet parsing is now done in data_ready context. Rather
than cloning the packet once for each subpacket and pulling/trimming
it, we file the packet multiple times with an annotation for each
indicating which subpacket is there. From that we can directly
calculate the offset and length.
(4) A call's receive queue can be accessed without taking locks (memory
barriers do have to be used, though).
(5) Incoming calls are set up from preallocated resources and immediately
made live. They can than have packets queued upon them and ACKs
generated. If insufficient resources exist, DATA packet #1 is given a
BUSY reply and other DATA packets are discarded).
(6) sk_buffs no longer take a ref on their parent call.
To make this work, the following changes are made:
(1) Each call's receive buffer is now a circular buffer of sk_buff
pointers (rxtx_buffer) rather than a number of sk_buff_heads spread
between the call and the socket. This permits each sk_buff to be in
the buffer multiple times. The receive buffer is reused for the
transmit buffer.
(2) A circular buffer of annotations (rxtx_annotations) is kept parallel
to the data buffer. Transmission phase annotations indicate whether a
buffered packet has been ACK'd or not and whether it needs
retransmission.
Receive phase annotations indicate whether a slot holds a whole packet
or a jumbo subpacket and, if the latter, which subpacket. They also
note whether the packet has been decrypted in place.
(3) DATA packet window tracking is much simplified. Each phase has just
two numbers representing the window (rx_hard_ack/rx_top and
tx_hard_ack/tx_top).
The hard_ack number is the sequence number before base of the window,
representing the last packet the other side says it has consumed.
hard_ack starts from 0 and the first packet is sequence number 1.
The top number is the sequence number of the highest-numbered packet
residing in the buffer. Packets between hard_ack+1 and top are
soft-ACK'd to indicate they've been received, but not yet consumed.
Four macros, before(), before_eq(), after() and after_eq() are added
to compare sequence numbers within the window. This allows for the
top of the window to wrap when the hard-ack sequence number gets close
to the limit.
Two flags, RXRPC_CALL_RX_LAST and RXRPC_CALL_TX_LAST, are added also
to indicate when rx_top and tx_top point at the packets with the
LAST_PACKET bit set, indicating the end of the phase.
(4) Calls are queued on the socket 'receive queue' rather than packets.
This means that we don't need have to invent dummy packets to queue to
indicate abnormal/terminal states and we don't have to keep metadata
packets (such as ABORTs) around
(5) The offset and length of a (sub)packet's content are now passed to
the verify_packet security op. This is currently expected to decrypt
the packet in place and validate it.
However, there's now nowhere to store the revised offset and length of
the actual data within the decrypted blob (there may be a header and
padding to skip) because an sk_buff may represent multiple packets, so
a locate_data security op is added to retrieve these details from the
sk_buff content when needed.
(6) recvmsg() now has to handle jumbo subpackets, where each subpacket is
individually secured and needs to be individually decrypted. The code
to do this is broken out into rxrpc_recvmsg_data() and shared with the
kernel API. It now iterates over the call's receive buffer rather
than walking the socket receive queue.
Additional changes:
(1) The timers are condensed to a single timer that is set for the soonest
of three timeouts (delayed ACK generation, DATA retransmission and
call lifespan).
(2) Transmission of ACK and ABORT packets is effected immediately from
process-context socket ops/kernel API calls that cause them instead of
them being punted off to a background work item. The data_ready
handler still has to defer to the background, though.
(3) A shutdown op is added to the AF_RXRPC socket so that the AFS
filesystem can shut down the socket and flush its own work items
before closing the socket to deal with any in-progress service calls.
Future additional changes that will need to be considered:
(1) Make sure that a call doesn't hog the front of the queue by receiving
data from the network as fast as userspace is consuming it to the
exclusion of other calls.
(2) Transmit delayed ACKs from within recvmsg() when we've consumed
sufficiently more packets to avoid the background work item needing to
run.
Signed-off-by: David Howells <dhowells@redhat.com>
2016-09-08 13:10:12 +03:00
int i ;
2007-04-27 02:48:28 +04:00
2018-10-08 17:46:17 +03:00
_enter ( " {%d},%x " , conn - > debug_id , conn - > abort_code ) ;
2007-04-27 02:48:28 +04:00
rxrpc: Rewrite the client connection manager
Rewrite the rxrpc client connection manager so that it can support multiple
connections for a given security key to a peer. The following changes are
made:
(1) For each open socket, the code currently maintains an rbtree with the
connections placed into it, keyed by communications parameters. This
is tricky to maintain as connections can be culled from the tree or
replaced within it. Connections can require replacement for a number
of reasons, e.g. their IDs span too great a range for the IDR data
type to represent efficiently, the call ID numbers on that conn would
overflow or the conn got aborted.
This is changed so that there's now a connection bundle object placed
in the tree, keyed on the same parameters. The bundle, however, does
not need to be replaced.
(2) An rxrpc_bundle object can now manage the available channels for a set
of parallel connections. The lock that manages this is moved there
from the rxrpc_connection struct (channel_lock).
(3) There'a a dummy bundle for all incoming connections to share so that
they have a channel_lock too. It might be better to give each
incoming connection its own bundle. This bundle is not needed to
manage which channels incoming calls are made on because that's the
solely at whim of the client.
(4) The restrictions on how many client connections are around are
removed. Instead, a previous patch limits the number of client calls
that can be allocated. Ordinarily, client connections are reaped
after 2 minutes on the idle queue, but when more than a certain number
of connections are in existence, the reaper starts reaping them after
2s of idleness instead to get the numbers back down.
It could also be made such that new call allocations are forced to
wait until the number of outstanding connections subsides.
Signed-off-by: David Howells <dhowells@redhat.com>
2020-07-01 13:15:32 +03:00
spin_lock ( & conn - > bundle - > channel_lock ) ;
2007-04-27 02:48:28 +04:00
rxrpc: Call channels should have separate call number spaces
Each channel on a connection has a separate, independent number space from
which to allocate callNumber values. It is entirely possible, for example,
to have a connection with four active calls, each with call number 1.
Note that the callNumber values for any particular channel don't have to
start at 1, but they are supposed to increment monotonically for that
channel from a client's perspective and may not be reused once the call
number is transmitted (until the epoch cycles all the way back round).
Currently, however, call numbers are allocated on a per-connection basis
and, further, are held in an rb-tree. The rb-tree is redundant as the four
channel pointers in the rxrpc_connection struct are entirely capable of
pointing to all the calls currently in progress on a connection.
To this end, make the following changes:
(1) Handle call number allocation independently per channel.
(2) Get rid of the conn->calls rb-tree. This is overkill as a connection
may have a maximum of four calls in progress at any one time. Use the
pointers in the channels[] array instead, indexed by the channel
number from the packet.
(3) For each channel, save the result of the last call that was in
progress on that channel in conn->channels[] so that the final ACK or
ABORT packet can be replayed if necessary. Any call earlier than that
is just ignored. If we've seen the next call number in a packet, the
last one is most definitely defunct.
(4) When generating a RESPONSE packet for a connection, the call number
counter for each channel must be included in it.
(5) When parsing a RESPONSE packet for a connection, the call number
counters contained therein should be used to set the minimum expected
call numbers on each channel.
To do in future commits:
(1) Replay terminal packets based on the last call stored in
conn->channels[].
(2) Connections should be retired before the callNumber space on any
channel runs out.
(3) A server is expected to disregard or reject any new incoming call that
has a call number less than the current call number counter. The call
number counter for that channel must be advanced to the new call
number.
Note that the server cannot just require that the next call that it
sees on a channel be exactly the call number counter + 1 because then
there's a scenario that could cause a problem: The client transmits a
packet to initiate a connection, the network goes out, the server
sends an ACK (which gets lost), the client sends an ABORT (which also
gets lost); the network then reconnects, the client then reuses the
call number for the next call (it doesn't know the server already saw
the call number), but the server thinks it already has the first
packet of this call (it doesn't know that the client doesn't know that
it saw the call number the first time).
Signed-off-by: David Howells <dhowells@redhat.com>
2016-06-27 16:39:44 +03:00
for ( i = 0 ; i < RXRPC_MAXCALLS ; i + + ) {
call = rcu_dereference_protected (
conn - > channels [ i ] . call ,
rxrpc: Rewrite the client connection manager
Rewrite the rxrpc client connection manager so that it can support multiple
connections for a given security key to a peer. The following changes are
made:
(1) For each open socket, the code currently maintains an rbtree with the
connections placed into it, keyed by communications parameters. This
is tricky to maintain as connections can be culled from the tree or
replaced within it. Connections can require replacement for a number
of reasons, e.g. their IDs span too great a range for the IDR data
type to represent efficiently, the call ID numbers on that conn would
overflow or the conn got aborted.
This is changed so that there's now a connection bundle object placed
in the tree, keyed on the same parameters. The bundle, however, does
not need to be replaced.
(2) An rxrpc_bundle object can now manage the available channels for a set
of parallel connections. The lock that manages this is moved there
from the rxrpc_connection struct (channel_lock).
(3) There'a a dummy bundle for all incoming connections to share so that
they have a channel_lock too. It might be better to give each
incoming connection its own bundle. This bundle is not needed to
manage which channels incoming calls are made on because that's the
solely at whim of the client.
(4) The restrictions on how many client connections are around are
removed. Instead, a previous patch limits the number of client calls
that can be allocated. Ordinarily, client connections are reaped
after 2 minutes on the idle queue, but when more than a certain number
of connections are in existence, the reaper starts reaping them after
2s of idleness instead to get the numbers back down.
It could also be made such that new call allocations are forced to
wait until the number of outstanding connections subsides.
Signed-off-by: David Howells <dhowells@redhat.com>
2020-07-01 13:15:32 +03:00
lockdep_is_held ( & conn - > bundle - > channel_lock ) ) ;
2016-08-30 11:49:28 +03:00
if ( call ) {
2016-09-07 00:19:51 +03:00
if ( compl = = RXRPC_CALL_LOCALLY_ABORTED )
2018-03-28 01:03:00 +03:00
trace_rxrpc_abort ( call - > debug_id ,
" CON " , call - > cid ,
2016-09-07 00:19:51 +03:00
call - > call_id , 0 ,
2018-10-08 17:46:17 +03:00
conn - > abort_code ,
conn - > error ) ;
2019-04-12 18:34:09 +03:00
else
trace_rxrpc_rx_abort ( call , serial ,
conn - > abort_code ) ;
2020-06-04 00:21:16 +03:00
rxrpc_set_call_completion ( call , compl ,
conn - > abort_code ,
conn - > error ) ;
2007-04-27 02:48:28 +04:00
}
}
rxrpc: Rewrite the client connection manager
Rewrite the rxrpc client connection manager so that it can support multiple
connections for a given security key to a peer. The following changes are
made:
(1) For each open socket, the code currently maintains an rbtree with the
connections placed into it, keyed by communications parameters. This
is tricky to maintain as connections can be culled from the tree or
replaced within it. Connections can require replacement for a number
of reasons, e.g. their IDs span too great a range for the IDR data
type to represent efficiently, the call ID numbers on that conn would
overflow or the conn got aborted.
This is changed so that there's now a connection bundle object placed
in the tree, keyed on the same parameters. The bundle, however, does
not need to be replaced.
(2) An rxrpc_bundle object can now manage the available channels for a set
of parallel connections. The lock that manages this is moved there
from the rxrpc_connection struct (channel_lock).
(3) There'a a dummy bundle for all incoming connections to share so that
they have a channel_lock too. It might be better to give each
incoming connection its own bundle. This bundle is not needed to
manage which channels incoming calls are made on because that's the
solely at whim of the client.
(4) The restrictions on how many client connections are around are
removed. Instead, a previous patch limits the number of client calls
that can be allocated. Ordinarily, client connections are reaped
after 2 minutes on the idle queue, but when more than a certain number
of connections are in existence, the reaper starts reaping them after
2s of idleness instead to get the numbers back down.
It could also be made such that new call allocations are forced to
wait until the number of outstanding connections subsides.
Signed-off-by: David Howells <dhowells@redhat.com>
2020-07-01 13:15:32 +03:00
spin_unlock ( & conn - > bundle - > channel_lock ) ;
2007-04-27 02:48:28 +04:00
_leave ( " " ) ;
}
/*
* generate a connection - level abort
*/
static int rxrpc_abort_connection ( struct rxrpc_connection * conn ,
2017-04-06 12:11:56 +03:00
int error , u32 abort_code )
2007-04-27 02:48:28 +04:00
{
2016-03-04 18:53:46 +03:00
struct rxrpc_wire_header whdr ;
2007-04-27 02:48:28 +04:00
struct msghdr msg ;
struct kvec iov [ 2 ] ;
__be32 word ;
size_t len ;
2016-03-04 18:53:46 +03:00
u32 serial ;
2007-04-27 02:48:28 +04:00
int ret ;
_enter ( " %d,,%u,%u " , conn - > debug_id , error , abort_code ) ;
/* generate a connection-level abort */
spin_lock_bh ( & conn - > state_lock ) ;
2016-08-30 11:49:28 +03:00
if ( conn - > state > = RXRPC_CONN_REMOTELY_ABORTED ) {
2007-04-27 02:48:28 +04:00
spin_unlock_bh ( & conn - > state_lock ) ;
_leave ( " = 0 [already dead] " ) ;
return 0 ;
}
2018-10-08 17:46:17 +03:00
conn - > error = error ;
conn - > abort_code = abort_code ;
2016-08-30 11:49:28 +03:00
conn - > state = RXRPC_CONN_LOCALLY_ABORTED ;
rxrpc: Rewrite the client connection manager
Rewrite the rxrpc client connection manager so that it can support multiple
connections for a given security key to a peer. The following changes are
made:
(1) For each open socket, the code currently maintains an rbtree with the
connections placed into it, keyed by communications parameters. This
is tricky to maintain as connections can be culled from the tree or
replaced within it. Connections can require replacement for a number
of reasons, e.g. their IDs span too great a range for the IDR data
type to represent efficiently, the call ID numbers on that conn would
overflow or the conn got aborted.
This is changed so that there's now a connection bundle object placed
in the tree, keyed on the same parameters. The bundle, however, does
not need to be replaced.
(2) An rxrpc_bundle object can now manage the available channels for a set
of parallel connections. The lock that manages this is moved there
from the rxrpc_connection struct (channel_lock).
(3) There'a a dummy bundle for all incoming connections to share so that
they have a channel_lock too. It might be better to give each
incoming connection its own bundle. This bundle is not needed to
manage which channels incoming calls are made on because that's the
solely at whim of the client.
(4) The restrictions on how many client connections are around are
removed. Instead, a previous patch limits the number of client calls
that can be allocated. Ordinarily, client connections are reaped
after 2 minutes on the idle queue, but when more than a certain number
of connections are in existence, the reaper starts reaping them after
2s of idleness instead to get the numbers back down.
It could also be made such that new call allocations are forced to
wait until the number of outstanding connections subsides.
Signed-off-by: David Howells <dhowells@redhat.com>
2020-07-01 13:15:32 +03:00
set_bit ( RXRPC_CONN_DONT_REUSE , & conn - > flags ) ;
2016-08-30 11:49:28 +03:00
spin_unlock_bh ( & conn - > state_lock ) ;
2016-04-04 16:00:36 +03:00
msg . msg_name = & conn - > params . peer - > srx . transport ;
msg . msg_namelen = conn - > params . peer - > srx . transport_len ;
2007-04-27 02:48:28 +04:00
msg . msg_control = NULL ;
msg . msg_controllen = 0 ;
msg . msg_flags = 0 ;
2016-04-04 16:00:36 +03:00
whdr . epoch = htonl ( conn - > proto . epoch ) ;
whdr . cid = htonl ( conn - > proto . cid ) ;
2016-03-04 18:53:46 +03:00
whdr . callNumber = 0 ;
whdr . seq = 0 ;
whdr . type = RXRPC_PACKET_TYPE_ABORT ;
whdr . flags = conn - > out_clientflag ;
whdr . userStatus = 0 ;
whdr . securityIndex = conn - > security_ix ;
whdr . _rsvd = 0 ;
2017-06-05 16:30:49 +03:00
whdr . serviceId = htons ( conn - > service_id ) ;
2007-04-27 02:48:28 +04:00
2018-10-08 17:46:17 +03:00
word = htonl ( conn - > abort_code ) ;
2007-04-27 02:48:28 +04:00
2016-03-04 18:53:46 +03:00
iov [ 0 ] . iov_base = & whdr ;
iov [ 0 ] . iov_len = sizeof ( whdr ) ;
2007-04-27 02:48:28 +04:00
iov [ 1 ] . iov_base = & word ;
iov [ 1 ] . iov_len = sizeof ( word ) ;
len = iov [ 0 ] . iov_len + iov [ 1 ] . iov_len ;
2016-03-04 18:53:46 +03:00
serial = atomic_inc_return ( & conn - > serial ) ;
2019-04-12 18:34:09 +03:00
rxrpc_abort_calls ( conn , RXRPC_CALL_LOCALLY_ABORTED , serial ) ;
2016-03-04 18:53:46 +03:00
whdr . serial = htonl ( serial ) ;
2018-10-08 17:46:17 +03:00
_proto ( " Tx CONN ABORT %%%u { %d } " , serial , conn - > abort_code ) ;
2007-04-27 02:48:28 +04:00
2016-04-04 16:00:36 +03:00
ret = kernel_sendmsg ( conn - > params . local - > socket , & msg , iov , 2 , len ) ;
2007-04-27 02:48:28 +04:00
if ( ret < 0 ) {
2018-05-11 01:26:01 +03:00
trace_rxrpc_tx_fail ( conn - > debug_id , serial , ret ,
2018-07-23 19:18:37 +03:00
rxrpc_tx_point_conn_abort ) ;
2007-04-27 02:48:28 +04:00
_debug ( " sendmsg failed: %d " , ret ) ;
return - EAGAIN ;
}
2018-07-23 19:18:37 +03:00
trace_rxrpc_tx_packet ( conn - > debug_id , & whdr , rxrpc_tx_point_conn_abort ) ;
2018-08-08 13:30:02 +03:00
conn - > params . peer - > last_tx_at = ktime_get_seconds ( ) ;
2018-03-30 23:04:43 +03:00
2007-04-27 02:48:28 +04:00
_leave ( " = 0 " ) ;
return 0 ;
}
/*
* mark a call as being on a now - secured channel
rxrpc: Rewrite the data and ack handling code
Rewrite the data and ack handling code such that:
(1) Parsing of received ACK and ABORT packets and the distribution and the
filing of DATA packets happens entirely within the data_ready context
called from the UDP socket. This allows us to process and discard ACK
and ABORT packets much more quickly (they're no longer stashed on a
queue for a background thread to process).
(2) We avoid calling skb_clone(), pskb_pull() and pskb_trim(). We instead
keep track of the offset and length of the content of each packet in
the sk_buff metadata. This means we don't do any allocation in the
receive path.
(3) Jumbo DATA packet parsing is now done in data_ready context. Rather
than cloning the packet once for each subpacket and pulling/trimming
it, we file the packet multiple times with an annotation for each
indicating which subpacket is there. From that we can directly
calculate the offset and length.
(4) A call's receive queue can be accessed without taking locks (memory
barriers do have to be used, though).
(5) Incoming calls are set up from preallocated resources and immediately
made live. They can than have packets queued upon them and ACKs
generated. If insufficient resources exist, DATA packet #1 is given a
BUSY reply and other DATA packets are discarded).
(6) sk_buffs no longer take a ref on their parent call.
To make this work, the following changes are made:
(1) Each call's receive buffer is now a circular buffer of sk_buff
pointers (rxtx_buffer) rather than a number of sk_buff_heads spread
between the call and the socket. This permits each sk_buff to be in
the buffer multiple times. The receive buffer is reused for the
transmit buffer.
(2) A circular buffer of annotations (rxtx_annotations) is kept parallel
to the data buffer. Transmission phase annotations indicate whether a
buffered packet has been ACK'd or not and whether it needs
retransmission.
Receive phase annotations indicate whether a slot holds a whole packet
or a jumbo subpacket and, if the latter, which subpacket. They also
note whether the packet has been decrypted in place.
(3) DATA packet window tracking is much simplified. Each phase has just
two numbers representing the window (rx_hard_ack/rx_top and
tx_hard_ack/tx_top).
The hard_ack number is the sequence number before base of the window,
representing the last packet the other side says it has consumed.
hard_ack starts from 0 and the first packet is sequence number 1.
The top number is the sequence number of the highest-numbered packet
residing in the buffer. Packets between hard_ack+1 and top are
soft-ACK'd to indicate they've been received, but not yet consumed.
Four macros, before(), before_eq(), after() and after_eq() are added
to compare sequence numbers within the window. This allows for the
top of the window to wrap when the hard-ack sequence number gets close
to the limit.
Two flags, RXRPC_CALL_RX_LAST and RXRPC_CALL_TX_LAST, are added also
to indicate when rx_top and tx_top point at the packets with the
LAST_PACKET bit set, indicating the end of the phase.
(4) Calls are queued on the socket 'receive queue' rather than packets.
This means that we don't need have to invent dummy packets to queue to
indicate abnormal/terminal states and we don't have to keep metadata
packets (such as ABORTs) around
(5) The offset and length of a (sub)packet's content are now passed to
the verify_packet security op. This is currently expected to decrypt
the packet in place and validate it.
However, there's now nowhere to store the revised offset and length of
the actual data within the decrypted blob (there may be a header and
padding to skip) because an sk_buff may represent multiple packets, so
a locate_data security op is added to retrieve these details from the
sk_buff content when needed.
(6) recvmsg() now has to handle jumbo subpackets, where each subpacket is
individually secured and needs to be individually decrypted. The code
to do this is broken out into rxrpc_recvmsg_data() and shared with the
kernel API. It now iterates over the call's receive buffer rather
than walking the socket receive queue.
Additional changes:
(1) The timers are condensed to a single timer that is set for the soonest
of three timeouts (delayed ACK generation, DATA retransmission and
call lifespan).
(2) Transmission of ACK and ABORT packets is effected immediately from
process-context socket ops/kernel API calls that cause them instead of
them being punted off to a background work item. The data_ready
handler still has to defer to the background, though.
(3) A shutdown op is added to the AF_RXRPC socket so that the AFS
filesystem can shut down the socket and flush its own work items
before closing the socket to deal with any in-progress service calls.
Future additional changes that will need to be considered:
(1) Make sure that a call doesn't hog the front of the queue by receiving
data from the network as fast as userspace is consuming it to the
exclusion of other calls.
(2) Transmit delayed ACKs from within recvmsg() when we've consumed
sufficiently more packets to avoid the background work item needing to
run.
Signed-off-by: David Howells <dhowells@redhat.com>
2016-09-08 13:10:12 +03:00
* - must be called with BH ' s disabled .
2007-04-27 02:48:28 +04:00
*/
2008-12-11 02:18:31 +03:00
static void rxrpc_call_is_secure ( struct rxrpc_call * call )
2007-04-27 02:48:28 +04:00
{
_enter ( " %p " , call ) ;
if ( call ) {
rxrpc: Rewrite the data and ack handling code
Rewrite the data and ack handling code such that:
(1) Parsing of received ACK and ABORT packets and the distribution and the
filing of DATA packets happens entirely within the data_ready context
called from the UDP socket. This allows us to process and discard ACK
and ABORT packets much more quickly (they're no longer stashed on a
queue for a background thread to process).
(2) We avoid calling skb_clone(), pskb_pull() and pskb_trim(). We instead
keep track of the offset and length of the content of each packet in
the sk_buff metadata. This means we don't do any allocation in the
receive path.
(3) Jumbo DATA packet parsing is now done in data_ready context. Rather
than cloning the packet once for each subpacket and pulling/trimming
it, we file the packet multiple times with an annotation for each
indicating which subpacket is there. From that we can directly
calculate the offset and length.
(4) A call's receive queue can be accessed without taking locks (memory
barriers do have to be used, though).
(5) Incoming calls are set up from preallocated resources and immediately
made live. They can then have packets queued upon them and ACKs
generated. If insufficient resources exist, DATA packet #1 is given a
BUSY reply and other DATA packets are discarded.
(6) sk_buffs no longer take a ref on their parent call.
To make this work, the following changes are made:
(1) Each call's receive buffer is now a circular buffer of sk_buff
pointers (rxtx_buffer) rather than a number of sk_buff_heads spread
between the call and the socket. This permits each sk_buff to be in
the buffer multiple times. The receive buffer is reused for the
transmit buffer.
(2) A circular buffer of annotations (rxtx_annotations) is kept parallel
to the data buffer. Transmission phase annotations indicate whether a
buffered packet has been ACK'd or not and whether it needs
retransmission.
Receive phase annotations indicate whether a slot holds a whole packet
or a jumbo subpacket and, if the latter, which subpacket. They also
note whether the packet has been decrypted in place.
(3) DATA packet window tracking is much simplified. Each phase has just
two numbers representing the window (rx_hard_ack/rx_top and
tx_hard_ack/tx_top).
The hard_ack number is the sequence number before the base of the window,
representing the last packet the other side says it has consumed.
hard_ack starts from 0 and the first packet is sequence number 1.
The top number is the sequence number of the highest-numbered packet
residing in the buffer. Packets between hard_ack+1 and top are
soft-ACK'd to indicate they've been received, but not yet consumed.
Four macros, before(), before_eq(), after() and after_eq() are added
to compare sequence numbers within the window. This allows for the
top of the window to wrap when the hard-ack sequence number gets close
to the limit.
Two flags, RXRPC_CALL_RX_LAST and RXRPC_CALL_TX_LAST, are added also
to indicate when rx_top and tx_top point at the packets with the
LAST_PACKET bit set, indicating the end of the phase.
(4) Calls are queued on the socket 'receive queue' rather than packets.
This means that we don't have to invent dummy packets to queue to
indicate abnormal/terminal states and we don't have to keep metadata
packets (such as ABORTs) around.
(5) The offset and length of a (sub)packet's content are now passed to
the verify_packet security op. This is currently expected to decrypt
the packet in place and validate it.
However, there's now nowhere to store the revised offset and length of
the actual data within the decrypted blob (there may be a header and
padding to skip) because an sk_buff may represent multiple packets, so
a locate_data security op is added to retrieve these details from the
sk_buff content when needed.
(6) recvmsg() now has to handle jumbo subpackets, where each subpacket is
individually secured and needs to be individually decrypted. The code
to do this is broken out into rxrpc_recvmsg_data() and shared with the
kernel API. It now iterates over the call's receive buffer rather
than walking the socket receive queue.
Additional changes:
(1) The timers are condensed to a single timer that is set for the soonest
of three timeouts (delayed ACK generation, DATA retransmission and
call lifespan).
(2) Transmission of ACK and ABORT packets is effected immediately from
process-context socket ops/kernel API calls that cause them instead of
them being punted off to a background work item. The data_ready
handler still has to defer to the background, though.
(3) A shutdown op is added to the AF_RXRPC socket so that the AFS
filesystem can shut down the socket and flush its own work items
before closing the socket to deal with any in-progress service calls.
Future additional changes that will need to be considered:
(1) Make sure that a call doesn't hog the front of the queue by receiving
data from the network as fast as userspace is consuming it to the
exclusion of other calls.
(2) Transmit delayed ACKs from within recvmsg() when we've consumed
sufficiently more packets to avoid the background work item needing to
run.
Signed-off-by: David Howells <dhowells@redhat.com>
2016-09-08 13:10:12 +03:00
write_lock_bh ( & call - > state_lock ) ;
if ( call - > state = = RXRPC_CALL_SERVER_SECURING ) {
2020-09-30 23:27:18 +03:00
call - > state = RXRPC_CALL_SERVER_RECV_REQUEST ;
rxrpc: Rewrite the data and ack handling code
Rewrite the data and ack handling code such that:
(1) Parsing of received ACK and ABORT packets and the distribution and the
filing of DATA packets happens entirely within the data_ready context
called from the UDP socket. This allows us to process and discard ACK
and ABORT packets much more quickly (they're no longer stashed on a
queue for a background thread to process).
(2) We avoid calling skb_clone(), pskb_pull() and pskb_trim(). We instead
keep track of the offset and length of the content of each packet in
the sk_buff metadata. This means we don't do any allocation in the
receive path.
(3) Jumbo DATA packet parsing is now done in data_ready context. Rather
than cloning the packet once for each subpacket and pulling/trimming
it, we file the packet multiple times with an annotation for each
indicating which subpacket is there. From that we can directly
calculate the offset and length.
(4) A call's receive queue can be accessed without taking locks (memory
barriers do have to be used, though).
(5) Incoming calls are set up from preallocated resources and immediately
made live. They can then have packets queued upon them and ACKs
generated. If insufficient resources exist, DATA packet #1 is given a
BUSY reply and other DATA packets are discarded.
(6) sk_buffs no longer take a ref on their parent call.
To make this work, the following changes are made:
(1) Each call's receive buffer is now a circular buffer of sk_buff
pointers (rxtx_buffer) rather than a number of sk_buff_heads spread
between the call and the socket. This permits each sk_buff to be in
the buffer multiple times. The receive buffer is reused for the
transmit buffer.
(2) A circular buffer of annotations (rxtx_annotations) is kept parallel
to the data buffer. Transmission phase annotations indicate whether a
buffered packet has been ACK'd or not and whether it needs
retransmission.
Receive phase annotations indicate whether a slot holds a whole packet
or a jumbo subpacket and, if the latter, which subpacket. They also
note whether the packet has been decrypted in place.
(3) DATA packet window tracking is much simplified. Each phase has just
two numbers representing the window (rx_hard_ack/rx_top and
tx_hard_ack/tx_top).
The hard_ack number is the sequence number before the base of the window,
representing the last packet the other side says it has consumed.
hard_ack starts from 0 and the first packet is sequence number 1.
The top number is the sequence number of the highest-numbered packet
residing in the buffer. Packets between hard_ack+1 and top are
soft-ACK'd to indicate they've been received, but not yet consumed.
Four macros, before(), before_eq(), after() and after_eq() are added
to compare sequence numbers within the window. This allows for the
top of the window to wrap when the hard-ack sequence number gets close
to the limit.
Two flags, RXRPC_CALL_RX_LAST and RXRPC_CALL_TX_LAST, are added also
to indicate when rx_top and tx_top point at the packets with the
LAST_PACKET bit set, indicating the end of the phase.
(4) Calls are queued on the socket 'receive queue' rather than packets.
This means that we don't have to invent dummy packets to queue to
indicate abnormal/terminal states and we don't have to keep metadata
packets (such as ABORTs) around.
(5) The offset and length of a (sub)packet's content are now passed to
the verify_packet security op. This is currently expected to decrypt
the packet in place and validate it.
However, there's now nowhere to store the revised offset and length of
the actual data within the decrypted blob (there may be a header and
padding to skip) because an sk_buff may represent multiple packets, so
a locate_data security op is added to retrieve these details from the
sk_buff content when needed.
(6) recvmsg() now has to handle jumbo subpackets, where each subpacket is
individually secured and needs to be individually decrypted. The code
to do this is broken out into rxrpc_recvmsg_data() and shared with the
kernel API. It now iterates over the call's receive buffer rather
than walking the socket receive queue.
Additional changes:
(1) The timers are condensed to a single timer that is set for the soonest
of three timeouts (delayed ACK generation, DATA retransmission and
call lifespan).
(2) Transmission of ACK and ABORT packets is effected immediately from
process-context socket ops/kernel API calls that cause them instead of
them being punted off to a background work item. The data_ready
handler still has to defer to the background, though.
(3) A shutdown op is added to the AF_RXRPC socket so that the AFS
filesystem can shut down the socket and flush its own work items
before closing the socket to deal with any in-progress service calls.
Future additional changes that will need to be considered:
(1) Make sure that a call doesn't hog the front of the queue by receiving
data from the network as fast as userspace is consuming it to the
exclusion of other calls.
(2) Transmit delayed ACKs from within recvmsg() when we've consumed
sufficiently more packets to avoid the background work item needing to
run.
Signed-off-by: David Howells <dhowells@redhat.com>
2016-09-08 13:10:12 +03:00
rxrpc_notify_socket ( call ) ;
}
write_unlock_bh ( & call - > state_lock ) ;
2007-04-27 02:48:28 +04:00
}
}
/*
* connection - level Rx packet processor
*/
static int rxrpc_process_event ( struct rxrpc_connection * conn ,
struct sk_buff * skb ,
u32 * _abort_code )
{
struct rxrpc_skb_priv * sp = rxrpc_skb ( skb ) ;
2016-03-04 18:53:46 +03:00
__be32 wtmp ;
u32 abort_code ;
2007-04-27 02:48:28 +04:00
int loop , ret ;
2009-06-17 00:36:44 +04:00
if ( conn - > state > = RXRPC_CONN_REMOTELY_ABORTED ) {
rxrpc: Rewrite the data and ack handling code
Rewrite the data and ack handling code such that:
(1) Parsing of received ACK and ABORT packets and the distribution and the
filing of DATA packets happens entirely within the data_ready context
called from the UDP socket. This allows us to process and discard ACK
and ABORT packets much more quickly (they're no longer stashed on a
queue for a background thread to process).
(2) We avoid calling skb_clone(), pskb_pull() and pskb_trim(). We instead
keep track of the offset and length of the content of each packet in
the sk_buff metadata. This means we don't do any allocation in the
receive path.
(3) Jumbo DATA packet parsing is now done in data_ready context. Rather
than cloning the packet once for each subpacket and pulling/trimming
it, we file the packet multiple times with an annotation for each
indicating which subpacket is there. From that we can directly
calculate the offset and length.
(4) A call's receive queue can be accessed without taking locks (memory
barriers do have to be used, though).
(5) Incoming calls are set up from preallocated resources and immediately
made live. They can then have packets queued upon them and ACKs
generated. If insufficient resources exist, DATA packet #1 is given a
BUSY reply and other DATA packets are discarded.
(6) sk_buffs no longer take a ref on their parent call.
To make this work, the following changes are made:
(1) Each call's receive buffer is now a circular buffer of sk_buff
pointers (rxtx_buffer) rather than a number of sk_buff_heads spread
between the call and the socket. This permits each sk_buff to be in
the buffer multiple times. The receive buffer is reused for the
transmit buffer.
(2) A circular buffer of annotations (rxtx_annotations) is kept parallel
to the data buffer. Transmission phase annotations indicate whether a
buffered packet has been ACK'd or not and whether it needs
retransmission.
Receive phase annotations indicate whether a slot holds a whole packet
or a jumbo subpacket and, if the latter, which subpacket. They also
note whether the packet has been decrypted in place.
(3) DATA packet window tracking is much simplified. Each phase has just
two numbers representing the window (rx_hard_ack/rx_top and
tx_hard_ack/tx_top).
The hard_ack number is the sequence number before the base of the window,
representing the last packet the other side says it has consumed.
hard_ack starts from 0 and the first packet is sequence number 1.
The top number is the sequence number of the highest-numbered packet
residing in the buffer. Packets between hard_ack+1 and top are
soft-ACK'd to indicate they've been received, but not yet consumed.
Four macros, before(), before_eq(), after() and after_eq() are added
to compare sequence numbers within the window. This allows for the
top of the window to wrap when the hard-ack sequence number gets close
to the limit.
Two flags, RXRPC_CALL_RX_LAST and RXRPC_CALL_TX_LAST, are added also
to indicate when rx_top and tx_top point at the packets with the
LAST_PACKET bit set, indicating the end of the phase.
(4) Calls are queued on the socket 'receive queue' rather than packets.
This means that we don't have to invent dummy packets to queue to
indicate abnormal/terminal states and we don't have to keep metadata
packets (such as ABORTs) around.
(5) The offset and length of a (sub)packet's content are now passed to
the verify_packet security op. This is currently expected to decrypt
the packet in place and validate it.
However, there's now nowhere to store the revised offset and length of
the actual data within the decrypted blob (there may be a header and
padding to skip) because an sk_buff may represent multiple packets, so
a locate_data security op is added to retrieve these details from the
sk_buff content when needed.
(6) recvmsg() now has to handle jumbo subpackets, where each subpacket is
individually secured and needs to be individually decrypted. The code
to do this is broken out into rxrpc_recvmsg_data() and shared with the
kernel API. It now iterates over the call's receive buffer rather
than walking the socket receive queue.
Additional changes:
(1) The timers are condensed to a single timer that is set for the soonest
of three timeouts (delayed ACK generation, DATA retransmission and
call lifespan).
(2) Transmission of ACK and ABORT packets is effected immediately from
process-context socket ops/kernel API calls that cause them instead of
them being punted off to a background work item. The data_ready
handler still has to defer to the background, though.
(3) A shutdown op is added to the AF_RXRPC socket so that the AFS
filesystem can shut down the socket and flush its own work items
before closing the socket to deal with any in-progress service calls.
Future additional changes that will need to be considered:
(1) Make sure that a call doesn't hog the front of the queue by receiving
data from the network as fast as userspace is consuming it to the
exclusion of other calls.
(2) Transmit delayed ACKs from within recvmsg() when we've consumed
sufficiently more packets to avoid the background work item needing to
run.
Signed-off-by: David Howells <dhowells@redhat.com>
2016-09-08 13:10:12 +03:00
_leave ( " = -ECONNABORTED [%u] " , conn - > state ) ;
2007-04-27 02:48:28 +04:00
return - ECONNABORTED ;
2009-06-17 00:36:44 +04:00
}
2007-04-27 02:48:28 +04:00
2016-03-04 18:53:46 +03:00
_enter ( " {%d},{%u,%%%u}, " , conn - > debug_id , sp - > hdr . type , sp - > hdr . serial ) ;
2009-06-17 00:36:44 +04:00
2007-04-27 02:48:28 +04:00
switch ( sp - > hdr . type ) {
2016-08-23 17:27:25 +03:00
case RXRPC_PACKET_TYPE_DATA :
case RXRPC_PACKET_TYPE_ACK :
2017-11-24 13:18:41 +03:00
rxrpc_conn_retransmit_call ( conn , skb ,
sp - > hdr . cid & RXRPC_CHANNELMASK ) ;
2016-08-23 17:27:25 +03:00
return 0 ;
2017-03-16 19:27:10 +03:00
case RXRPC_PACKET_TYPE_BUSY :
/* Just ignore BUSY packets for now. */
return 0 ;
2007-04-27 02:48:28 +04:00
case RXRPC_PACKET_TYPE_ABORT :
2016-09-30 15:26:03 +03:00
if ( skb_copy_bits ( skb , sizeof ( struct rxrpc_wire_header ) ,
2017-04-06 12:12:00 +03:00
& wtmp , sizeof ( wtmp ) ) < 0 ) {
trace_rxrpc_rx_eproto ( NULL , sp - > hdr . serial ,
tracepoint_string ( " bad_abort " ) ) ;
2007-04-27 02:48:28 +04:00
return - EPROTO ;
2017-04-06 12:12:00 +03:00
}
2016-03-04 18:53:46 +03:00
abort_code = ntohl ( wtmp ) ;
_proto ( " Rx ABORT %%%u { ac=%d } " , sp - > hdr . serial , abort_code ) ;
2007-04-27 02:48:28 +04:00
2018-10-08 17:46:17 +03:00
conn - > error = - ECONNABORTED ;
conn - > abort_code = abort_code ;
2007-04-27 02:48:28 +04:00
conn - > state = RXRPC_CONN_REMOTELY_ABORTED ;
rxrpc: Rewrite the client connection manager
Rewrite the rxrpc client connection manager so that it can support multiple
connections for a given security key to a peer. The following changes are
made:
(1) For each open socket, the code currently maintains an rbtree with the
connections placed into it, keyed by communications parameters. This
is tricky to maintain as connections can be culled from the tree or
replaced within it. Connections can require replacement for a number
of reasons, e.g. their IDs span too great a range for the IDR data
type to represent efficiently, the call ID numbers on that conn would
overflow or the conn got aborted.
This is changed so that there's now a connection bundle object placed
in the tree, keyed on the same parameters. The bundle, however, does
not need to be replaced.
(2) An rxrpc_bundle object can now manage the available channels for a set
of parallel connections. The lock that manages this is moved there
from the rxrpc_connection struct (channel_lock).
(3) There's a dummy bundle for all incoming connections to share so that
they have a channel_lock too. It might be better to give each
incoming connection its own bundle. This bundle is not needed to
manage which channels incoming calls are made on because that's
solely at the whim of the client.
(4) The restrictions on how many client connections are around are
removed. Instead, a previous patch limits the number of client calls
that can be allocated. Ordinarily, client connections are reaped
after 2 minutes on the idle queue, but when more than a certain number
of connections are in existence, the reaper starts reaping them after
2s of idleness instead to get the numbers back down.
It could also be made such that new call allocations are forced to
wait until the number of outstanding connections subsides.
Signed-off-by: David Howells <dhowells@redhat.com>
2020-07-01 13:15:32 +03:00
set_bit ( RXRPC_CONN_DONT_REUSE , & conn - > flags ) ;
2019-04-12 18:34:09 +03:00
rxrpc_abort_calls ( conn , RXRPC_CALL_REMOTELY_ABORTED , sp - > hdr . serial ) ;
2007-04-27 02:48:28 +04:00
return - ECONNABORTED ;
case RXRPC_PACKET_TYPE_CHALLENGE :
2016-04-07 19:23:58 +03:00
return conn - > security - > respond_to_challenge ( conn , skb ,
_abort_code ) ;
2007-04-27 02:48:28 +04:00
case RXRPC_PACKET_TYPE_RESPONSE :
ret = conn - > security - > verify_response ( conn , skb , _abort_code ) ;
if ( ret < 0 )
return ret ;
2020-09-16 10:19:12 +03:00
ret = conn - > security - > init_connection_security (
conn , conn - > params . key - > payload . data [ 0 ] ) ;
2007-04-27 02:48:28 +04:00
if ( ret < 0 )
return ret ;
rxrpc: Rewrite the client connection manager
Rewrite the rxrpc client connection manager so that it can support multiple
connections for a given security key to a peer. The following changes are
made:
(1) For each open socket, the code currently maintains an rbtree with the
connections placed into it, keyed by communications parameters. This
is tricky to maintain as connections can be culled from the tree or
replaced within it. Connections can require replacement for a number
of reasons, e.g. their IDs span too great a range for the IDR data
type to represent efficiently, the call ID numbers on that conn would
overflow or the conn got aborted.
This is changed so that there's now a connection bundle object placed
in the tree, keyed on the same parameters. The bundle, however, does
not need to be replaced.
(2) An rxrpc_bundle object can now manage the available channels for a set
of parallel connections. The lock that manages this is moved there
from the rxrpc_connection struct (channel_lock).
(3) There's a dummy bundle for all incoming connections to share so that
they have a channel_lock too. It might be better to give each
incoming connection its own bundle. This bundle is not needed to
manage which channels incoming calls are made on because that's
solely at the whim of the client.
(4) The restrictions on how many client connections are around are
removed. Instead, a previous patch limits the number of client calls
that can be allocated. Ordinarily, client connections are reaped
after 2 minutes on the idle queue, but when more than a certain number
of connections are in existence, the reaper starts reaping them after
2s of idleness instead to get the numbers back down.
It could also be made such that new call allocations are forced to
wait until the number of outstanding connections subsides.
Signed-off-by: David Howells <dhowells@redhat.com>
2020-07-01 13:15:32 +03:00
spin_lock ( & conn - > bundle - > channel_lock ) ;
2020-10-01 13:57:40 +03:00
spin_lock_bh ( & conn - > state_lock ) ;
2007-04-27 02:48:28 +04:00
2016-06-27 12:32:02 +03:00
if ( conn - > state = = RXRPC_CONN_SERVICE_CHALLENGING ) {
conn - > state = RXRPC_CONN_SERVICE ;
2020-10-01 13:57:40 +03:00
spin_unlock_bh ( & conn - > state_lock ) ;
2007-04-27 02:48:28 +04:00
for ( loop = 0 ; loop < RXRPC_MAXCALLS ; loop + + )
2016-06-27 19:11:19 +03:00
rxrpc_call_is_secure (
rcu_dereference_protected (
rxrpc: Call channels should have separate call number spaces
Each channel on a connection has a separate, independent number space from
which to allocate callNumber values. It is entirely possible, for example,
to have a connection with four active calls, each with call number 1.
Note that the callNumber values for any particular channel don't have to
start at 1, but they are supposed to increment monotonically for that
channel from a client's perspective and may not be reused once the call
number is transmitted (until the epoch cycles all the way back round).
Currently, however, call numbers are allocated on a per-connection basis
and, further, are held in an rb-tree. The rb-tree is redundant as the four
channel pointers in the rxrpc_connection struct are entirely capable of
pointing to all the calls currently in progress on a connection.
To this end, make the following changes:
(1) Handle call number allocation independently per channel.
(2) Get rid of the conn->calls rb-tree. This is overkill as a connection
may have a maximum of four calls in progress at any one time. Use the
pointers in the channels[] array instead, indexed by the channel
number from the packet.
(3) For each channel, save the result of the last call that was in
progress on that channel in conn->channels[] so that the final ACK or
ABORT packet can be replayed if necessary. Any call earlier than that
is just ignored. If we've seen the next call number in a packet, the
last one is most definitely defunct.
(4) When generating a RESPONSE packet for a connection, the call number
counter for each channel must be included in it.
(5) When parsing a RESPONSE packet for a connection, the call number
counters contained therein should be used to set the minimum expected
call numbers on each channel.
To do in future commits:
(1) Replay terminal packets based on the last call stored in
conn->channels[].
(2) Connections should be retired before the callNumber space on any
channel runs out.
(3) A server is expected to disregard or reject any new incoming call that
has a call number less than the current call number counter. The call
number counter for that channel must be advanced to the new call
number.
Note that the server cannot just require that the next call that it
sees on a channel be exactly the call number counter + 1 because then
there's a scenario that could cause a problem: The client transmits a
packet to initiate a connection, the network goes out, the server
sends an ACK (which gets lost), the client sends an ABORT (which also
gets lost); the network then reconnects, the client then reuses the
call number for the next call (it doesn't know the server already saw
the call number), but the server thinks it already has the first
packet of this call (it doesn't know that the client doesn't know that
it saw the call number the first time).
Signed-off-by: David Howells <dhowells@redhat.com>
2016-06-27 16:39:44 +03:00
conn - > channels [ loop ] . call ,
rxrpc: Rewrite the client connection manager
Rewrite the rxrpc client connection manager so that it can support multiple
connections for a given security key to a peer. The following changes are
made:
(1) For each open socket, the code currently maintains an rbtree with the
connections placed into it, keyed by communications parameters. This
is tricky to maintain as connections can be culled from the tree or
replaced within it. Connections can require replacement for a number
of reasons, e.g. their IDs span too great a range for the IDR data
type to represent efficiently, the call ID numbers on that conn would
overflow or the conn got aborted.
This is changed so that there's now a connection bundle object placed
in the tree, keyed on the same parameters. The bundle, however, does
not need to be replaced.
(2) An rxrpc_bundle object can now manage the available channels for a set
of parallel connections. The lock that manages this is moved there
from the rxrpc_connection struct (channel_lock).
(3) There's a dummy bundle for all incoming connections to share so that
they have a channel_lock too. It might be better to give each
incoming connection its own bundle. This bundle is not needed to
manage which channels incoming calls are made on because that's
solely at the whim of the client.
(4) The restrictions on how many client connections are around are
removed. Instead, a previous patch limits the number of client calls
that can be allocated. Ordinarily, client connections are reaped
after 2 minutes on the idle queue, but when more than a certain number
of connections are in existence, the reaper starts reaping them after
2s of idleness instead to get the numbers back down.
It could also be made such that new call allocations are forced to
wait until the number of outstanding connections subsides.
Signed-off-by: David Howells <dhowells@redhat.com>
2020-07-01 13:15:32 +03:00
lockdep_is_held ( & conn - > bundle - > channel_lock ) ) ) ;
rxrpc: Rewrite the data and ack handling code
Rewrite the data and ack handling code such that:
(1) Parsing of received ACK and ABORT packets and the distribution and the
filing of DATA packets happens entirely within the data_ready context
called from the UDP socket. This allows us to process and discard ACK
and ABORT packets much more quickly (they're no longer stashed on a
queue for a background thread to process).
(2) We avoid calling skb_clone(), pskb_pull() and pskb_trim(). We instead
keep track of the offset and length of the content of each packet in
the sk_buff metadata. This means we don't do any allocation in the
receive path.
(3) Jumbo DATA packet parsing is now done in data_ready context. Rather
than cloning the packet once for each subpacket and pulling/trimming
it, we file the packet multiple times with an annotation for each
indicating which subpacket is there. From that we can directly
calculate the offset and length.
(4) A call's receive queue can be accessed without taking locks (memory
barriers do have to be used, though).
(5) Incoming calls are set up from preallocated resources and immediately
made live. They can then have packets queued upon them and ACKs
generated. If insufficient resources exist, DATA packet #1 is given a
BUSY reply and other DATA packets are discarded.
(6) sk_buffs no longer take a ref on their parent call.
To make this work, the following changes are made:
(1) Each call's receive buffer is now a circular buffer of sk_buff
pointers (rxtx_buffer) rather than a number of sk_buff_heads spread
between the call and the socket. This permits each sk_buff to be in
the buffer multiple times. The receive buffer is reused for the
transmit buffer.
(2) A circular buffer of annotations (rxtx_annotations) is kept parallel
to the data buffer. Transmission phase annotations indicate whether a
buffered packet has been ACK'd or not and whether it needs
retransmission.
Receive phase annotations indicate whether a slot holds a whole packet
or a jumbo subpacket and, if the latter, which subpacket. They also
note whether the packet has been decrypted in place.
(3) DATA packet window tracking is much simplified. Each phase has just
two numbers representing the window (rx_hard_ack/rx_top and
tx_hard_ack/tx_top).
The hard_ack number is the sequence number before the base of the window,
representing the last packet the other side says it has consumed.
hard_ack starts from 0 and the first packet is sequence number 1.
The top number is the sequence number of the highest-numbered packet
residing in the buffer. Packets between hard_ack+1 and top are
soft-ACK'd to indicate they've been received, but not yet consumed.
Four macros, before(), before_eq(), after() and after_eq() are added
to compare sequence numbers within the window. This allows for the
top of the window to wrap when the hard-ack sequence number gets close
to the limit.
Two flags, RXRPC_CALL_RX_LAST and RXRPC_CALL_TX_LAST, are added also
to indicate when rx_top and tx_top point at the packets with the
LAST_PACKET bit set, indicating the end of the phase.
(4) Calls are queued on the socket 'receive queue' rather than packets.
This means that we don't have to invent dummy packets to queue to
indicate abnormal/terminal states and we don't have to keep metadata
packets (such as ABORTs) around.
(5) The offset and length of a (sub)packet's content are now passed to
the verify_packet security op. This is currently expected to decrypt
the packet in place and validate it.
However, there's now nowhere to store the revised offset and length of
the actual data within the decrypted blob (there may be a header and
padding to skip) because an sk_buff may represent multiple packets, so
a locate_data security op is added to retrieve these details from the
sk_buff content when needed.
(6) recvmsg() now has to handle jumbo subpackets, where each subpacket is
individually secured and needs to be individually decrypted. The code
to do this is broken out into rxrpc_recvmsg_data() and shared with the
kernel API. It now iterates over the call's receive buffer rather
than walking the socket receive queue.
Additional changes:
(1) The timers are condensed to a single timer that is set for the soonest
of three timeouts (delayed ACK generation, DATA retransmission and
call lifespan).
(2) Transmission of ACK and ABORT packets is effected immediately from
process-context socket ops/kernel API calls that cause them instead of
them being punted off to a background work item. The data_ready
handler still has to defer to the background, though.
(3) A shutdown op is added to the AF_RXRPC socket so that the AFS
filesystem can shut down the socket and flush its own work items
before closing the socket to deal with any in-progress service calls.
Future additional changes that will need to be considered:
(1) Make sure that a call doesn't hog the front of the queue by receiving
data from the network as fast as userspace is consuming it to the
exclusion of other calls.
(2) Transmit delayed ACKs from within recvmsg() when we've consumed
sufficiently more packets to avoid the background work item needing to
run.
Signed-off-by: David Howells <dhowells@redhat.com>
2016-09-08 13:10:12 +03:00
} else {
2020-10-01 13:57:40 +03:00
spin_unlock_bh ( & conn - > state_lock ) ;
2007-04-27 02:48:28 +04:00
}
rxrpc: Rewrite the client connection manager
Rewrite the rxrpc client connection manager so that it can support multiple
connections for a given security key to a peer. The following changes are
made:
(1) For each open socket, the code currently maintains an rbtree with the
connections placed into it, keyed by communications parameters. This
is tricky to maintain as connections can be culled from the tree or
replaced within it. Connections can require replacement for a number
of reasons, e.g. their IDs span too great a range for the IDR data
type to represent efficiently, the call ID numbers on that conn would
overflow or the conn got aborted.
This is changed so that there's now a connection bundle object placed
in the tree, keyed on the same parameters. The bundle, however, does
not need to be replaced.
(2) An rxrpc_bundle object can now manage the available channels for a set
of parallel connections. The lock that manages this is moved there
from the rxrpc_connection struct (channel_lock).
(3) There's a dummy bundle for all incoming connections to share so that
they have a channel_lock too. It might be better to give each
incoming connection its own bundle. This bundle is not needed to
manage which channels incoming calls are made on because that's
solely at the whim of the client.
(4) The restrictions on how many client connections are around are
removed. Instead, a previous patch limits the number of client calls
that can be allocated. Ordinarily, client connections are reaped
after 2 minutes on the idle queue, but when more than a certain number
of connections are in existence, the reaper starts reaping them after
2s of idleness instead to get the numbers back down.
It could also be made such that new call allocations are forced to
wait until the number of outstanding connections subsides.
Signed-off-by: David Howells <dhowells@redhat.com>
2020-07-01 13:15:32 +03:00
spin_unlock ( & conn - > bundle - > channel_lock ) ;
2007-04-27 02:48:28 +04:00
return 0 ;
default :
2017-04-06 12:12:00 +03:00
trace_rxrpc_rx_eproto ( NULL , sp - > hdr . serial ,
tracepoint_string ( " bad_conn_pkt " ) ) ;
2007-04-27 02:48:28 +04:00
return - EPROTO ;
}
}
/*
* set up security and issue a challenge
*/
static void rxrpc_secure_connection ( struct rxrpc_connection * conn )
{
u32 abort_code ;
int ret ;
_enter ( " {%d} " , conn - > debug_id ) ;
ASSERT ( conn - > security_ix ! = 0 ) ;
if ( conn - > security - > issue_challenge ( conn ) < 0 ) {
abort_code = RX_CALL_DEAD ;
ret = - ENOMEM ;
goto abort ;
}
_leave ( " " ) ;
return ;
abort :
_debug ( " abort %d, %d " , ret , abort_code ) ;
2017-04-06 12:11:56 +03:00
rxrpc_abort_connection ( conn , ret , abort_code ) ;
2007-04-27 02:48:28 +04:00
_leave ( " [aborted] " ) ;
}
2017-11-24 13:18:41 +03:00
/*
* Process delayed final ACKs that we haven ' t subsumed into a subsequent call .
*/
2020-10-01 01:54:44 +03:00
void rxrpc_process_delayed_final_acks ( struct rxrpc_connection * conn , bool force )
2017-11-24 13:18:41 +03:00
{
unsigned long j = jiffies , next_j ;
unsigned int channel ;
bool set ;
again :
next_j = j + LONG_MAX ;
set = false ;
for ( channel = 0 ; channel < RXRPC_MAXCALLS ; channel + + ) {
struct rxrpc_channel * chan = & conn - > channels [ channel ] ;
unsigned long ack_at ;
if ( ! test_bit ( RXRPC_CONN_FINAL_ACK_0 + channel , & conn - > flags ) )
continue ;
smp_rmb ( ) ; /* vs rxrpc_disconnect_client_call */
ack_at = READ_ONCE ( chan - > final_ack_at ) ;
2020-10-01 01:54:44 +03:00
if ( time_before ( j , ack_at ) & & ! force ) {
2017-11-24 13:18:41 +03:00
if ( time_before ( ack_at , next_j ) ) {
next_j = ack_at ;
set = true ;
}
continue ;
}
if ( test_and_clear_bit ( RXRPC_CONN_FINAL_ACK_0 + channel ,
& conn - > flags ) )
rxrpc_conn_retransmit_call ( conn , NULL , channel ) ;
}
j = jiffies ;
if ( time_before_eq ( next_j , j ) )
goto again ;
if ( set )
rxrpc_reduce_conn_timer ( conn , next_j ) ;
}
2007-04-27 02:48:28 +04:00
/*
* connection - level event processor
*/
2020-01-31 00:50:36 +03:00
static void rxrpc_do_process_connection ( struct rxrpc_connection * conn )
2007-04-27 02:48:28 +04:00
{
struct sk_buff * skb ;
u32 abort_code = RX_PROTOCOL_ERROR ;
int ret ;
2016-06-27 12:32:03 +03:00
if ( test_and_clear_bit ( RXRPC_CONN_EV_CHALLENGE , & conn - > events ) )
2007-04-27 02:48:28 +04:00
rxrpc_secure_connection ( conn ) ;
2017-11-24 13:18:41 +03:00
/* Process delayed ACKs whose time has come. */
if ( conn - > flags & RXRPC_CONN_FINAL_ACK_MASK )
2020-10-01 01:54:44 +03:00
rxrpc_process_delayed_final_acks ( conn , false ) ;
2017-11-24 13:18:41 +03:00
2007-04-27 02:48:28 +04:00
/* go through the conn-level event packets, releasing the ref on this
* connection that each one has when we ' ve finished with it */
while ( ( skb = skb_dequeue ( & conn - > rx_queue ) ) ) {
2019-08-19 11:25:38 +03:00
rxrpc_see_skb ( skb , rxrpc_skb_seen ) ;
2007-04-27 02:48:28 +04:00
ret = rxrpc_process_event ( conn , skb , & abort_code ) ;
switch ( ret ) {
case - EPROTO :
case - EKEYEXPIRED :
case - EKEYREJECTED :
goto protocol_error ;
2018-02-08 18:59:07 +03:00
case - ENOMEM :
2007-04-27 02:48:28 +04:00
case - EAGAIN :
goto requeue_and_leave ;
case - ECONNABORTED :
default :
2019-08-19 11:25:38 +03:00
rxrpc_free_skb ( skb , rxrpc_skb_freed ) ;
2007-04-27 02:48:28 +04:00
break ;
}
}
return ;
requeue_and_leave :
skb_queue_head ( & conn - > rx_queue , skb ) ;
2020-01-31 00:50:36 +03:00
return ;
2007-04-27 02:48:28 +04:00
protocol_error :
2017-04-06 12:11:56 +03:00
if ( rxrpc_abort_connection ( conn , ret , abort_code ) < 0 )
2007-04-27 02:48:28 +04:00
goto requeue_and_leave ;
2019-08-19 11:25:38 +03:00
rxrpc_free_skb ( skb , rxrpc_skb_freed ) ;
2020-01-31 00:50:36 +03:00
return ;
}
void rxrpc_process_connection ( struct work_struct * work )
{
struct rxrpc_connection * conn =
container_of ( work , struct rxrpc_connection , processor ) ;
rxrpc_see_connection ( conn ) ;
if ( __rxrpc_use_local ( conn - > params . local ) ) {
rxrpc_do_process_connection ( conn ) ;
rxrpc_unuse_local ( conn - > params . local ) ;
}
rxrpc_put_connection ( conn ) ;
_leave ( " " ) ;
return ;
2007-04-27 02:48:28 +04:00
}