2006-01-02 21:04:38 +03:00
/*
* net / tipc / bcast . c : TIPC broadcast code
2007-02-09 17:25:21 +03:00
*
2015-02-05 16:36:43 +03:00
* Copyright ( c ) 2004 - 2006 , 2014 - 2015 , Ericsson AB
2006-01-02 21:04:38 +03:00
* Copyright ( c ) 2004 , Intel Corporation .
2011-01-07 21:00:11 +03:00
* Copyright ( c ) 2005 , 2010 - 2011 , Wind River Systems
2006-01-02 21:04:38 +03:00
* All rights reserved .
*
2006-01-11 15:30:43 +03:00
* Redistribution and use in source and binary forms , with or without
2006-01-02 21:04:38 +03:00
* modification , are permitted provided that the following conditions are met :
*
2006-01-11 15:30:43 +03:00
* 1. Redistributions of source code must retain the above copyright
* notice , this list of conditions and the following disclaimer .
* 2. Redistributions in binary form must reproduce the above copyright
* notice , this list of conditions and the following disclaimer in the
* documentation and / or other materials provided with the distribution .
* 3. Neither the names of the copyright holders nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission .
2006-01-02 21:04:38 +03:00
*
2006-01-11 15:30:43 +03:00
* Alternatively , this software may be distributed under the terms of the
* GNU General Public License ( " GPL " ) version 2 as published by the Free
* Software Foundation .
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS " AS IS "
* AND ANY EXPRESS OR IMPLIED WARRANTIES , INCLUDING , BUT NOT LIMITED TO , THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED . IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT , INDIRECT , INCIDENTAL , SPECIAL , EXEMPLARY , OR
* CONSEQUENTIAL DAMAGES ( INCLUDING , BUT NOT LIMITED TO , PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES ; LOSS OF USE , DATA , OR PROFITS ; OR BUSINESS
* INTERRUPTION ) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY , WHETHER IN
* CONTRACT , STRICT LIABILITY , OR TORT ( INCLUDING NEGLIGENCE OR OTHERWISE )
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE , EVEN IF ADVISED OF THE
2006-01-02 21:04:38 +03:00
* POSSIBILITY OF SUCH DAMAGE .
*/
2014-07-17 04:41:00 +04:00
# include "socket.h"
# include "msg.h"
2006-01-02 21:04:38 +03:00
# include "bcast.h"
2011-04-07 22:57:53 +04:00
# include "name_distr.h"
2015-01-09 10:27:07 +03:00
# include "core.h"
2006-01-02 21:04:38 +03:00
2014-03-27 08:54:35 +04:00
# define MAX_PKT_DEFAULT_MCAST 1500 /* bcast link max packet size (fixed) */
# define BCLINK_WIN_DEFAULT 20 /* bcast link window size (default) */
2006-01-02 21:04:38 +03:00
2010-05-11 18:30:07 +04:00
const char tipc_bclink_name [ ] = " broadcast-link " ;
2006-01-02 21:04:38 +03:00
2010-10-13 17:20:35 +04:00
static void tipc_nmap_diff ( struct tipc_node_map * nm_a ,
struct tipc_node_map * nm_b ,
struct tipc_node_map * nm_diff ) ;
2014-04-21 06:55:51 +04:00
static void tipc_nmap_add ( struct tipc_node_map * nm_ptr , u32 node ) ;
static void tipc_nmap_remove ( struct tipc_node_map * nm_ptr , u32 node ) ;
2006-01-02 21:04:38 +03:00
2015-01-09 10:27:07 +03:00
/*
 * tipc_bclink_lock - acquire the broadcast link lock of a namespace
 * @net: network namespace; its TIPC data is looked up with net_generic()
 *
 * Takes tn->bclink->lock with spin_lock_bh(). Pair every call with
 * tipc_bclink_unlock() on the same @net.
 */
static void tipc_bclink_lock ( struct net * net )
2014-05-05 04:56:15 +04:00
{
2015-01-09 10:27:07 +03:00
struct tipc_net * tn = net_generic ( net , tipc_net_id ) ;
spin_lock_bh ( & tn - > bclink - > lock ) ;
2014-05-05 04:56:15 +04:00
}
2015-01-09 10:27:07 +03:00
/*
 * tipc_bclink_unlock - release the broadcast link lock of a namespace
 * @net: network namespace; its TIPC data is looked up with net_generic()
 *
 * Releases tn->bclink->lock with spin_unlock_bh(), the counterpart of the
 * spin_lock_bh() call made in tipc_bclink_lock().
 */
static void tipc_bclink_unlock ( struct net * net )
2014-05-05 04:56:15 +04:00
{
2015-01-09 10:27:07 +03:00
struct tipc_net * tn = net_generic ( net , tipc_net_id ) ;
2014-05-05 04:56:17 +04:00
2015-01-09 10:27:07 +03:00
spin_unlock_bh ( & tn - > bclink - > lock ) ;
2014-05-05 04:56:17 +04:00
}
2015-02-05 16:36:44 +03:00
/*
 * tipc_bclink_input - hand broadcast link arrival data to the socket layer
 * @net: network namespace whose broadcast link queues are used
 *
 * Passes the arrvq and inputq queues of tn->bclink to tipc_sk_mcast_rcv().
 * No lock is taken in this function itself.
 */
void tipc_bclink_input ( struct net * net )
{
struct tipc_net * tn = net_generic ( net , tipc_net_id ) ;
tipc_sk_mcast_rcv ( net , & tn - > bclink - > arrvq , & tn - > bclink - > inputq ) ;
}
2014-07-17 04:41:00 +04:00
/*
 * tipc_bclink_get_mtu - report the broadcast link packet size limit
 *
 * Return: the compile-time constant MAX_PKT_DEFAULT_MCAST; the value does
 * not depend on any runtime state.
 */
uint tipc_bclink_get_mtu ( void )
{
return MAX_PKT_DEFAULT_MCAST ;
}
2006-03-21 09:37:04 +03:00
/*
 * bcbuf_acks - read the ack counter kept in a buffer control block
 * @buf: buffer whose TIPC_SKB_CB() handle field holds the counter
 *
 * The counter is stored in the pointer-typed handle field (see
 * bcbuf_set_acks()), so it is converted back through unsigned long.
 *
 * Return: the counter value as u32.
 */
static u32 bcbuf_acks ( struct sk_buff * buf )
2006-01-02 21:04:38 +03:00
{
2006-01-13 00:22:32 +03:00
return ( u32 ) ( unsigned long ) TIPC_SKB_CB ( buf ) - > handle ;
2006-01-02 21:04:38 +03:00
}
2006-03-21 09:37:04 +03:00
/*
 * bcbuf_set_acks - store an ack counter in a buffer control block
 * @buf: buffer whose TIPC_SKB_CB() handle field receives the counter
 * @acks: counter value to store
 *
 * The integer is widened to unsigned long and stored in the pointer-typed
 * handle field; bcbuf_acks() performs the reverse conversion.
 */
static void bcbuf_set_acks ( struct sk_buff * buf , u32 acks )
2006-01-02 21:04:38 +03:00
{
2006-01-13 00:22:32 +03:00
TIPC_SKB_CB ( buf ) - > handle = ( void * ) ( unsigned long ) acks ;
2006-01-02 21:04:38 +03:00
}
2006-03-21 09:37:04 +03:00
/*
 * bcbuf_decr_acks - decrement the ack counter of a buffer by one
 * @buf: buffer whose counter is updated
 *
 * There is no check for a zero counter here; the u32 arithmetic would wrap
 * if the counter were already zero.
 */
static void bcbuf_decr_acks ( struct sk_buff * buf )
2006-01-02 21:04:38 +03:00
{
bcbuf_set_acks ( buf , bcbuf_acks ( buf ) - 1 ) ;
}
2015-01-09 10:27:07 +03:00
/*
 * tipc_bclink_add_node - add a node to the broadcast node map
 * @net: network namespace owning the broadcast link
 * @addr: node address passed to tipc_nmap_add()
 *
 * Updates tn->bclink->bcast_nodes while holding the broadcast link lock,
 * which this function takes and releases itself.
 */
void tipc_bclink_add_node ( struct net * net , u32 addr )
2011-10-24 19:18:12 +04:00
{
2015-01-09 10:27:07 +03:00
struct tipc_net * tn = net_generic ( net , tipc_net_id ) ;
tipc_bclink_lock ( net ) ;
tipc_nmap_add ( & tn - > bclink - > bcast_nodes , addr ) ;
tipc_bclink_unlock ( net ) ;
2011-10-24 19:18:12 +04:00
}
2015-01-09 10:27:07 +03:00
/*
 * tipc_bclink_remove_node - remove a node from the broadcast node map
 * @net: network namespace owning the broadcast link
 * @addr: node address passed to tipc_nmap_remove()
 *
 * Updates tn->bclink->bcast_nodes while holding the broadcast link lock,
 * which this function takes and releases itself.
 */
void tipc_bclink_remove_node ( struct net * net , u32 addr )
2011-10-24 19:18:12 +04:00
{
2015-01-09 10:27:07 +03:00
struct tipc_net * tn = net_generic ( net , tipc_net_id ) ;
tipc_bclink_lock ( net ) ;
tipc_nmap_remove ( & tn - > bclink - > bcast_nodes , addr ) ;
tipc_bclink_unlock ( net ) ;
2011-10-24 19:18:12 +04:00
}
2006-01-02 21:04:38 +03:00
2015-01-09 10:27:07 +03:00
/*
 * bclink_set_last_sent - refresh the cached last-sent sequence number
 * @net: network namespace owning the broadcast link
 *
 * Stores the result in bcl->fsm_msg_cnt, which tipc_bclink_get_last_sent()
 * returns. If the backlog queue is non-empty the value is the sequence
 * number of its first buffer minus one; otherwise it is next_out_no minus
 * one. Both results are passed through mod().
 *
 * No lock is taken here; the visible callers in this file invoke it with
 * the broadcast link lock held.
 */
static void bclink_set_last_sent ( struct net * net )
2010-08-17 15:00:09 +04:00
{
2015-01-09 10:27:07 +03:00
struct tipc_net * tn = net_generic ( net , tipc_net_id ) ;
struct tipc_link * bcl = tn - > bcl ;
2015-03-13 23:08:10 +03:00
struct sk_buff * skb = skb_peek ( & bcl - > backlogq ) ;
2015-01-09 10:27:07 +03:00
2015-03-13 23:08:10 +03:00
if ( skb )
bcl - > fsm_msg_cnt = mod ( buf_seqno ( skb ) - 1 ) ;
2010-08-17 15:00:09 +04:00
else
bcl - > fsm_msg_cnt = mod ( bcl - > next_out_no - 1 ) ;
}
2015-01-09 10:27:07 +03:00
/*
 * tipc_bclink_get_last_sent - read the cached last-sent sequence number
 * @net: network namespace owning the broadcast link
 *
 * Return: the current value of tn->bcl->fsm_msg_cnt, as maintained by
 * bclink_set_last_sent(). The field is read without taking any lock in
 * this function.
 */
u32 tipc_bclink_get_last_sent ( struct net * net )
2010-08-17 15:00:09 +04:00
{
2015-01-09 10:27:07 +03:00
struct tipc_net * tn = net_generic ( net , tipc_net_id ) ;
return tn - > bcl - > fsm_msg_cnt ;
2010-08-17 15:00:09 +04:00
}
tipc: Major redesign of broadcast link ACK/NACK algorithms
Completely redesigns broadcast link ACK and NACK mechanisms to prevent
spurious retransmit requests in dual LAN networks, and to prevent the
broadcast link from stalling due to the failure of a receiving node to
acknowledge receiving a broadcast message or request its retransmission.
Note: These changes only impact the timing of when ACK and NACK messages
are sent, and not the basic broadcast link protocol itself, so inter-
operability with nodes using the "classic" algorithms is maintained.
The revised algorithms are as follows:
1) An explicit ACK message is still sent after receiving 16 in-sequence
messages, and implicit ACK information continues to be carried in other
unicast link message headers (including link state messages). However,
the timing of explicit ACKs is now based on the receiving node's absolute
network address rather than its relative network address to ensure that
the failure of another node does not delay the ACK beyond its 16 message
target.
2) A NACK message is now typically sent only when a message gap persists
for two consecutive incoming link state messages; this ensures that a
suspected gap is not confirmed until both LANs in a dual LAN network have
had an opportunity to deliver the message, thereby preventing spurious NACKs.
A NACK message can also be generated by the arrival of a single link state
message, if the deferred queue is so big that the current message gap
cannot be the result of "normal" mis-ordering due to the use of dual LANs
(or one LAN using a bonded interface). Since link state messages typically
arrive at different nodes at different times the problem of multiple nodes
issuing identical NACKs simultaneously is inherently avoided.
3) Nodes continue to "peek" at NACK messages sent by other nodes. If
another node requests retransmission of a message gap suspected (but not
yet confirmed) by the peeking node, the peeking node forgets about the
gap and does not generate a duplicate retransmit request. (If the peeking
node subsequently fails to receive the lost message, later link state
messages will cause it to rediscover and confirm the gap and send another
NACK.)
4) Message gap "equality" is now determined by the start of the gap only.
This is sufficient to deal with the most common cases of message loss,
and eliminates the need for complex end of gap computations.
5) A peeking node no longer tries to determine whether it should send a
complementary NACK, since the most common cases of message loss don't
require it to be sent. Consequently, the node no longer examines the
"broadcast tag" field of a NACK message when peeking.
Signed-off-by: Allan Stephens <allan.stephens@windriver.com>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
2011-10-27 22:17:53 +04:00
/*
 * bclink_update_last_sent - advance the last-sent value recorded for a node
 * @node: node whose bclink.last_sent field is updated
 * @seqno: candidate sequence number
 *
 * Sets node->bclink.last_sent to @seqno when less_eq(last_sent, seqno)
 * holds; otherwise the stored value is left unchanged, so the field never
 * moves backwards in the ordering defined by less_eq().
 */
static void bclink_update_last_sent ( struct tipc_node * node , u32 seqno )
2006-01-02 21:04:38 +03:00
{
tipc: Major redesign of broadcast link ACK/NACK algorithms
Completely redesigns broadcast link ACK and NACK mechanisms to prevent
spurious retransmit requests in dual LAN networks, and to prevent the
broadcast link from stalling due to the failure of a receiving node to
acknowledge receiving a broadcast message or request its retransmission.
Note: These changes only impact the timing of when ACK and NACK messages
are sent, and not the basic broadcast link protocol itself, so inter-
operability with nodes using the "classic" algorithms is maintained.
The revised algorithms are as follows:
1) An explicit ACK message is still sent after receiving 16 in-sequence
messages, and implicit ACK information continues to be carried in other
unicast link message headers (including link state messages). However,
the timing of explicit ACKs is now based on the receiving node's absolute
network address rather than its relative network address to ensure that
the failure of another node does not delay the ACK beyond its 16 message
target.
2) A NACK message is now typically sent only when a message gap persists
for two consecutive incoming link state messages; this ensures that a
suspected gap is not confirmed until both LANs in a dual LAN network have
had an opportunity to deliver the message, thereby preventing spurious NACKs.
A NACK message can also be generated by the arrival of a single link state
message, if the deferred queue is so big that the current message gap
cannot be the result of "normal" mis-ordering due to the use of dual LANs
(or one LAN using a bonded interface). Since link state messages typically
arrive at different nodes at different times the problem of multiple nodes
issuing identical NACKs simultaneously is inherently avoided.
3) Nodes continue to "peek" at NACK messages sent by other nodes. If
another node requests retransmission of a message gap suspected (but not
yet confirmed) by the peeking node, the peeking node forgets about the
gap and does not generate a duplicate retransmit request. (If the peeking
node subsequently fails to receive the lost message, later link state
messages will cause it to rediscover and confirm the gap and send another
NACK.)
4) Message gap "equality" is now determined by the start of the gap only.
This is sufficient to deal with the most common cases of message loss,
and eliminates the need for complex end of gap computations.
5) A peeking node no longer tries to determine whether it should send a
complementary NACK, since the most common cases of message loss don't
require it to be sent. Consequently, the node no longer examines the
"broadcast tag" field of a NACK message when peeking.
Signed-off-by: Allan Stephens <allan.stephens@windriver.com>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
2011-10-27 22:17:53 +04:00
node - > bclink . last_sent = less_eq ( node - > bclink . last_sent , seqno ) ?
seqno : node - > bclink . last_sent ;
2006-01-02 21:04:38 +03:00
}
2012-07-10 14:55:09 +04:00
/**
2011-01-18 21:53:16 +03:00
* tipc_bclink_retransmit_to - get most recent node to request retransmission
* @net: network namespace whose broadcast link is queried
*
2014-05-05 04:56:15 +04:00
* Called with bclink_lock locked
*
* Return: the retransmit_to pointer stored in tn->bclink; this function
* only reads the field and does not take the lock itself.
2011-01-18 21:53:16 +03:00
*/
2015-01-09 10:27:07 +03:00
struct tipc_node * tipc_bclink_retransmit_to ( struct net * net )
2011-01-18 21:53:16 +03:00
{
2015-01-09 10:27:07 +03:00
struct tipc_net * tn = net_generic ( net , tipc_net_id ) ;
return tn - > bclink - > retransmit_to ;
2011-01-18 21:53:16 +03:00
}
2007-02-09 17:25:21 +03:00
/**
2006-01-02 21:04:38 +03:00
* bclink_retransmit_pkt - retransmit broadcast packets
* @tn: TIPC per-namespace data; the broadcast link is taken from tn->bcl
* @ after : sequence number of last packet to * not * retransmit
* @ to : sequence number of last packet to retransmit
2007-02-09 17:25:21 +03:00
*
* Walks the transmit queue of the broadcast link. At the first buffer whose
* sequence number is more() than @after, tipc_link_retransmit() is called
* once with that buffer and a count of mod(to - after), and the walk stops.
* If no buffer satisfies the test, nothing is retransmitted.
*
2014-05-05 04:56:15 +04:00
* Called with bclink_lock locked
2006-01-02 21:04:38 +03:00
*/
2015-01-09 10:27:07 +03:00
static void bclink_retransmit_pkt ( struct tipc_net * tn , u32 after , u32 to )
2006-01-02 21:04:38 +03:00
{
2014-11-26 06:41:52 +03:00
struct sk_buff * skb ;
2015-01-09 10:27:07 +03:00
struct tipc_link * bcl = tn - > bcl ;
2006-01-02 21:04:38 +03:00
2015-03-13 23:08:10 +03:00
skb_queue_walk ( & bcl - > transmq , skb ) {
2015-01-09 10:26:58 +03:00
if ( more ( buf_seqno ( skb ) , after ) ) {
tipc_link_retransmit ( bcl , skb , mod ( to - after ) ) ;
2014-11-26 06:41:52 +03:00
break ;
2015-01-09 10:26:58 +03:00
}
2014-11-26 06:41:52 +03:00
}
2006-01-02 21:04:38 +03:00
}
tipc: fix bug in multicast congestion handling
One aim of commit 50100a5e39461b2a61d6040e73c384766c29975d ("tipc:
use pseudo message to wake up sockets after link congestion") was
to handle link congestion abatement in a uniform way for both unicast
and multicast transmit. However, the latter doesn't work correctly,
and has been broken since the referenced commit was applied.
If a user now sends a burst of multicast messages that is big
enough to cause broadcast link congestion, it will be put to sleep,
and not be woken up when the congestion abates as it should be.
This has two reasons. First, the flag that is used, TIPC_WAKEUP_USERS,
is set correctly, but in the wrong field. Instead of setting it in the
'action_flags' field of the arrival node struct, it is by mistake set
in the dummy node struct that is owned by the broadcast link, where it
will never be tested for. Second, we cannot use the same flag for waking
up unicast and multicast users, since the function tipc_node_unlock()
needs to pick the wakeup pseudo messages to deliver from different
queues. It must hence be able to distinguish between the two cases.
This commit solves this problem by adding a new flag
TIPC_WAKEUP_BCAST_USERS, and a new function tipc_bclink_wakeup_users().
The latter is to be called by tipc_node_unlock() when the named flag,
now set in the correct field, is encountered.
v2: using explicit 'unsigned int' declaration instead of 'uint', as
per comment from David Miller.
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-10-07 22:12:34 +04:00
/**
* tipc_bclink_wakeup_users - wake up pending users
* @net: network namespace whose broadcast link wakeup queue is processed
*
* Passes the wakeupq queue of tn->bclink->link to tipc_sk_rcv(). In this
* file the TIPC_WAKEUP_BCAST_USERS node flag is set by
* tipc_bclink_acknowledge() when buffers were released and that queue is
* not empty.
*
* Called with no locks taken
*/
2015-01-09 10:27:05 +03:00
void tipc_bclink_wakeup_users ( struct net * net )
tipc: fix bug in multicast congestion handling
One aim of commit 50100a5e39461b2a61d6040e73c384766c29975d ("tipc:
use pseudo message to wake up sockets after link congestion") was
to handle link congestion abatement in a uniform way for both unicast
and multicast transmit. However, the latter doesn't work correctly,
and has been broken since the referenced commit was applied.
If a user now sends a burst of multicast messages that is big
enough to cause broadcast link congestion, it will be put to sleep,
and not be waked up when the congestion abates as it should be.
This has two reasons. First, the flag that is used, TIPC_WAKEUP_USERS,
is set correctly, but in the wrong field. Instead of setting it in the
'action_flags' field of the arrival node struct, it is by mistake set
in the dummy node struct that is owned by the broadcast link, where it
will never tested for. Second, we cannot use the same flag for waking
up unicast and multicast users, since the function tipc_node_unlock()
needs to pick the wakeup pseudo messages to deliver from different
queues. It must hence be able to distinguish between the two cases.
This commit solves this problem by adding a new flag
TIPC_WAKEUP_BCAST_USERS, and a new function tipc_bclink_wakeup_user().
The latter is to be called by tipc_node_unlock() when the named flag,
now set in the correct field, is encountered.
v2: using explicit 'unsigned int' declaration instead of 'uint', as
per comment from David Miller.
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-10-07 22:12:34 +04:00
{
2015-01-09 10:27:07 +03:00
struct tipc_net * tn = net_generic ( net , tipc_net_id ) ;
2014-12-03 18:58:40 +03:00
tipc: resolve race problem at unicast message reception
TIPC handles message cardinality and sequencing at the link layer,
before passing messages upwards to the destination sockets. During the
upcall from link to socket no locks are held. It is therefore possible,
and we see it happen occasionally, that messages arriving in different
threads and delivered in sequence still bypass each other before they
reach the destination socket. This must not happen, since it violates
the sequentiality guarantee.
We solve this by adding a new input buffer queue to the link structure.
Arriving messages are added safely to the tail of that queue by the
link, while the head of the queue is consumed, also safely, by the
receiving socket. Sequentiality is secured per socket by only allowing
buffers to be dequeued inside the socket lock. Since there may be multiple
simultaneous readers of the queue, we use a 'filter' parameter to reduce
the risk that they peek the same buffer from the queue, hence also
reducing the risk of contention on the receiving socket locks.
This solves the sequentiality problem, and seems to cause no measurable
performance degradation.
A nice side effect of this change is that lock handling in the functions
tipc_rcv() and tipc_bcast_rcv() now becomes uniform, something that
will enable future simplifications of those functions.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-02-05 16:36:41 +03:00
tipc_sk_rcv ( net , & tn - > bclink - > link . wakeupq ) ;
tipc: fix bug in multicast congestion handling
One aim of commit 50100a5e39461b2a61d6040e73c384766c29975d ("tipc:
use pseudo message to wake up sockets after link congestion") was
to handle link congestion abatement in a uniform way for both unicast
and multicast transmit. However, the latter doesn't work correctly,
and has been broken since the referenced commit was applied.
If a user now sends a burst of multicast messages that is big
enough to cause broadcast link congestion, it will be put to sleep,
and not be waked up when the congestion abates as it should be.
This has two reasons. First, the flag that is used, TIPC_WAKEUP_USERS,
is set correctly, but in the wrong field. Instead of setting it in the
'action_flags' field of the arrival node struct, it is by mistake set
in the dummy node struct that is owned by the broadcast link, where it
will never tested for. Second, we cannot use the same flag for waking
up unicast and multicast users, since the function tipc_node_unlock()
needs to pick the wakeup pseudo messages to deliver from different
queues. It must hence be able to distinguish between the two cases.
This commit solves this problem by adding a new flag
TIPC_WAKEUP_BCAST_USERS, and a new function tipc_bclink_wakeup_user().
The latter is to be called by tipc_node_unlock() when the named flag,
now set in the correct field, is encountered.
v2: using explicit 'unsigned int' declaration instead of 'uint', as
per comment from David Miller.
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-10-07 22:12:34 +04:00
}
2007-02-09 17:25:21 +03:00
/**
2006-01-18 02:38:21 +03:00
* tipc_bclink_acknowledge - handle acknowledgement of broadcast packets
2006-01-02 21:04:38 +03:00
* @ n_ptr : node that sent acknowledgement info
* @ acked : broadcast sequence # that has been acknowledged
2007-02-09 17:25:21 +03:00
*
* Returns immediately, without taking the broadcast link lock, when
* n_ptr->bclink.recv_permitted is false. Otherwise the lock is taken for
* the rest of the function and released at the exit label.
*
* An @acked value of INVALID_LINK_SEQ is replaced by the cached last-sent
* number (fsm_msg_cnt) if the broadcast node map still has entries, or by
* next_out_no if it has none. Any other value is ignored unless it lies
* between the first transmit-queue buffer and fsm_msg_cnt and is beyond
* what the node has already acknowledged.
*
* For every transmit-queue buffer newly covered by @acked the per-buffer
* ack counter is decremented; a buffer whose counter reaches zero is
* unlinked and freed. n_ptr->bclink.acked is then set to @acked, backlog
* packets are pushed if the backlog queue is non-empty, and
* TIPC_WAKEUP_BCAST_USERS is set in n_ptr->action_flags if at least one
* buffer was released and the wakeup queue is non-empty.
*
2014-05-05 04:56:15 +04:00
* Node is locked , bclink_lock unlocked .
2006-01-02 21:04:38 +03:00
*/
2008-09-03 10:38:32 +04:00
void tipc_bclink_acknowledge ( struct tipc_node * n_ptr , u32 acked )
2006-01-02 21:04:38 +03:00
{
2014-11-26 06:41:52 +03:00
struct sk_buff * skb , * tmp ;
2006-01-02 21:04:38 +03:00
unsigned int released = 0 ;
2015-01-09 10:27:07 +03:00
struct net * net = n_ptr - > net ;
struct tipc_net * tn = net_generic ( net , tipc_net_id ) ;
2006-01-02 21:04:38 +03:00
2015-03-13 23:08:09 +03:00
if ( unlikely ( ! n_ptr - > bclink . recv_permitted ) )
return ;
2015-01-09 10:27:07 +03:00
tipc_bclink_lock ( net ) ;
2015-03-13 23:08:09 +03:00
2011-10-24 23:26:24 +04:00
/* Bail out if tx queue is empty (no clean up is required) */
2015-03-13 23:08:10 +03:00
skb = skb_peek ( & tn - > bcl - > transmq ) ;
2014-11-26 06:41:52 +03:00
if ( ! skb )
2011-10-24 23:26:24 +04:00
goto exit ;
/* Determine which messages need to be acknowledged */
if ( acked = = INVALID_LINK_SEQ ) {
/*
* Contact with specified node has been lost , so need to
* acknowledge sent messages only ( if other nodes still exist )
* or both sent and unsent messages ( otherwise )
*/
2015-01-09 10:27:07 +03:00
if ( tn - > bclink - > bcast_nodes . count )
acked = tn - > bcl - > fsm_msg_cnt ;
2011-10-24 23:26:24 +04:00
else
2015-01-09 10:27:07 +03:00
acked = tn - > bcl - > next_out_no ;
2011-10-24 23:26:24 +04:00
} else {
/*
* Bail out if specified sequence number does not correspond
* to a message that has been sent and not yet acknowledged
*/
2014-11-26 06:41:52 +03:00
if ( less ( acked , buf_seqno ( skb ) ) | |
2015-01-09 10:27:07 +03:00
less ( tn - > bcl - > fsm_msg_cnt , acked ) | |
2011-10-24 23:26:24 +04:00
less_eq ( acked , n_ptr - > bclink . acked ) )
goto exit ;
}
/* Skip over packets that node has previously acknowledged */
2015-03-13 23:08:10 +03:00
skb_queue_walk ( & tn - > bcl - > transmq , skb ) {
2014-11-26 06:41:52 +03:00
if ( more ( buf_seqno ( skb ) , n_ptr - > bclink . acked ) )
break ;
}
2006-01-02 21:04:38 +03:00
/* Update packets that node is now acknowledging */
2015-03-13 23:08:10 +03:00
skb_queue_walk_from_safe ( & tn - > bcl - > transmq , skb , tmp ) {
2014-11-26 06:41:52 +03:00
if ( more ( buf_seqno ( skb ) , acked ) )
break ;
2015-03-13 23:08:10 +03:00
bcbuf_decr_acks ( skb ) ;
bclink_set_last_sent ( net ) ;
2014-11-26 06:41:52 +03:00
if ( bcbuf_acks ( skb ) = = 0 ) {
2015-03-13 23:08:10 +03:00
__skb_unlink ( skb , & tn - > bcl - > transmq ) ;
2014-11-26 06:41:52 +03:00
kfree_skb ( skb ) ;
2006-01-02 21:04:38 +03:00
released = 1 ;
}
}
n_ptr - > bclink . acked = acked ;
/* Try resolving broadcast link congestion, if necessary */
2015-03-13 23:08:10 +03:00
if ( unlikely ( skb_peek ( & tn - > bcl - > backlogq ) ) ) {
2015-01-09 10:27:07 +03:00
tipc_link_push_packets ( tn - > bcl ) ;
bclink_set_last_sent ( net ) ;
2010-08-17 15:00:09 +04:00
}
tipc: resolve race problem at unicast message reception
TIPC handles message cardinality and sequencing at the link layer,
before passing messages upwards to the destination sockets. During the
upcall from link to socket no locks are held. It is therefore possible,
and we see it happen occasionally, that messages arriving in different
threads and delivered in sequence still bypass each other before they
reach the destination socket. This must not happen, since it violates
the sequentiality guarantee.
We solve this by adding a new input buffer queue to the link structure.
Arriving messages are added safely to the tail of that queue by the
link, while the head of the queue is consumed, also safely, by the
receiving socket. Sequentiality is secured per socket by only allowing
buffers to be dequeued inside the socket lock. Since there may be multiple
simultaneous readers of the queue, we use a 'filter' parameter to reduce
the risk that they peek the same buffer from the queue, hence also
reducing the risk of contention on the receiving socket locks.
This solves the sequentiality problem, and seems to cause no measurable
performance degradation.
A nice side effect of this change is that lock handling in the functions
tipc_rcv() and tipc_bcast_rcv() now becomes uniform, something that
will enable future simplifications of those functions.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-02-05 16:36:41 +03:00
if ( unlikely ( released & & ! skb_queue_empty ( & tn - > bcl - > wakeupq ) ) )
tipc: fix bug in multicast congestion handling
One aim of commit 50100a5e39461b2a61d6040e73c384766c29975d ("tipc:
use pseudo message to wake up sockets after link congestion") was
to handle link congestion abatement in a uniform way for both unicast
and multicast transmit. However, the latter doesn't work correctly,
and has been broken since the referenced commit was applied.
If a user now sends a burst of multicast messages that is big
enough to cause broadcast link congestion, it will be put to sleep,
and not be waked up when the congestion abates as it should be.
This has two reasons. First, the flag that is used, TIPC_WAKEUP_USERS,
is set correctly, but in the wrong field. Instead of setting it in the
'action_flags' field of the arrival node struct, it is by mistake set
in the dummy node struct that is owned by the broadcast link, where it
will never tested for. Second, we cannot use the same flag for waking
up unicast and multicast users, since the function tipc_node_unlock()
needs to pick the wakeup pseudo messages to deliver from different
queues. It must hence be able to distinguish between the two cases.
This commit solves this problem by adding a new flag
TIPC_WAKEUP_BCAST_USERS, and a new function tipc_bclink_wakeup_user().
The latter is to be called by tipc_node_unlock() when the named flag,
now set in the correct field, is encountered.
v2: using explicit 'unsigned int' declaration instead of 'uint', as
per comment from David Miller.
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-10-07 22:12:34 +04:00
n_ptr - > action_flags | = TIPC_WAKEUP_BCAST_USERS ;
2011-10-24 23:26:24 +04:00
exit :
2015-01-09 10:27:07 +03:00
tipc_bclink_unlock ( net ) ;
2006-01-02 21:04:38 +03:00
}
2012-07-10 14:55:09 +04:00
/**
tipc: Major redesign of broadcast link ACK/NACK algorithms
Completely redesigns broadcast link ACK and NACK mechanisms to prevent
spurious retransmit requests in dual LAN networks, and to prevent the
broadcast link from stalling due to the failure of a receiving node to
acknowledge receiving a broadcast message or request its retransmission.
Note: These changes only impact the timing of when ACK and NACK messages
are sent, and not the basic broadcast link protocol itself, so inter-
operability with nodes using the "classic" algorithms is maintained.
The revised algorithms are as follows:
1) An explicit ACK message is still sent after receiving 16 in-sequence
messages, and implicit ACK information continues to be carried in other
unicast link message headers (including link state messages). However,
the timing of explicit ACKs is now based on the receiving node's absolute
network address rather than its relative network address to ensure that
the failure of another node does not delay the ACK beyond its 16 message
target.
2) A NACK message is now typically sent only when a message gap persists
for two consecutive incoming link state messages; this ensures that a
suspected gap is not confirmed until both LANs in a dual LAN network have
had an opportunity to deliver the message, thereby preventing spurious NACKs.
A NACK message can also be generated by the arrival of a single link state
message, if the deferred queue is so big that the current message gap
cannot be the result of "normal" mis-ordering due to the use of dual LANs
(or one LAN using a bonded interface). Since link state messages typically
arrive at different nodes at different times the problem of multiple nodes
issuing identical NACKs simultaneously is inherently avoided.
3) Nodes continue to "peek" at NACK messages sent by other nodes. If
another node requests retransmission of a message gap suspected (but not
yet confirmed) by the peeking node, the peeking node forgets about the
gap and does not generate a duplicate retransmit request. (If the peeking
node subsequently fails to receive the lost message, later link state
messages will cause it to rediscover and confirm the gap and send another
NACK.)
4) Message gap "equality" is now determined by the start of the gap only.
This is sufficient to deal with the most common cases of message loss,
and eliminates the need for complex end of gap computations.
5) A peeking node no longer tries to determine whether it should send a
complementary NACK, since the most common cases of message loss don't
require it to be sent. Consequently, the node no longer examines the
"broadcast tag" field of a NACK message when peeking.
Signed-off-by: Allan Stephens <allan.stephens@windriver.com>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
2011-10-27 22:17:53 +04:00
* tipc_bclink_update_link_state - update broadcast link state
2007-02-09 17:25:21 +03:00
*
tipc: purge tipc_net_lock lock
Now tipc routing hierarchy comprises the structures 'node', 'link' and
'bearer'. The whole hierarchy is protected by a big read/write lock,
tipc_net_lock, to ensure that nothing is added or removed while code
is accessing any of these structures. Obviously the locking policy
makes node, link and bearer components closely bound together so that
their relationship becomes unnecessarily complex. In the worst case,
such locking policy not only has a negative influence on performance,
but also it's prone to lead to deadlock occasionally.
In order to decouple the complex relationship between bearer and node
as well as link, the locking policy is adjusted as follows:
- Bearer level
RTNL lock is used on update side, and RCU is used on read side.
Meanwhile, all bearer instances including broadcast bearer are
saved into bearer_list array.
- Node and link level
All node instances are saved into two tipc_node_list and node_htable
lists. The two lists are protected by node_list_lock on write side,
and they are guarded with RCU lock on read side. All members in node
structure including link instances are protected by node spin lock.
- The relationship between bearer and node
When link accesses bearer, it first needs to find the bearer with
its bearer identity from the bearer_list array. When bearer accesses
node, it can iterate the node_htable hash list with the node
address to find the corresponding node.
In the new locking policy, every component has its private locking
solution and the relationship between bearer and node is very simple,
that is, they can find each other with node address or bearer identity
from node_htable hash list or bearer_list array.
Now that all of the above changes have been made, tipc_net_lock can be
removed safely.
Signed-off-by: Ying Xue <ying.xue@windriver.com>
Reviewed-by: Jon Maloy <jon.maloy@ericsson.com>
Reviewed-by: Erik Hugne <erik.hugne@ericsson.com>
Tested-by: Erik Hugne <erik.hugne@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-04-21 06:55:48 +04:00
* RCU and node lock set
2006-01-02 21:04:38 +03:00
*/
2015-02-05 16:36:36 +03:00
void tipc_bclink_update_link_state ( struct tipc_node * n_ptr ,
2015-01-09 10:27:04 +03:00
u32 last_sent )
2006-01-02 21:04:38 +03:00
{
tipc: Major redesign of broadcast link ACK/NACK algorithms
Completely redesigns broadcast link ACK and NACK mechanisms to prevent
spurious retransmit requests in dual LAN networks, and to prevent the
broadcast link from stalling due to the failure of a receiving node to
acknowledge receiving a broadcast message or request its retransmission.
Note: These changes only impact the timing of when ACK and NACK messages
are sent, and not the basic broadcast link protocol itself, so inter-
operability with nodes using the "classic" algorithms is maintained.
The revised algorithms are as follows:
1) An explicit ACK message is still sent after receiving 16 in-sequence
messages, and implicit ACK information continues to be carried in other
unicast link message headers (including link state messages). However,
the timing of explicit ACKs is now based on the receiving node's absolute
network address rather than its relative network address to ensure that
the failure of another node does not delay the ACK beyond its 16 message
target.
2) A NACK message is now typically sent only when a message gap persists
for two consecutive incoming link state messages; this ensures that a
suspected gap is not confirmed until both LANs in a dual LAN network have
had an opportunity to deliver the message, thereby preventing spurious NACKs.
A NACK message can also be generated by the arrival of a single link state
message, if the deferred queue is so big that the current message gap
cannot be the result of "normal" mis-ordering due to the use of dual LANs
(or one LAN using a bonded interface). Since link state messages typically
arrive at different nodes at different times the problem of multiple nodes
issuing identical NACKs simultaneously is inherently avoided.
3) Nodes continue to "peek" at NACK messages sent by other nodes. If
another node requests retransmission of a message gap suspected (but not
yet confirmed) by the peeking node, the peeking node forgets about the
gap and does not generate a duplicate retransmit request. (If the peeking
node subsequently fails to receive the lost message, later link state
messages will cause it to rediscover and confirm the gap and send another
NACK.)
4) Message gap "equality" is now determined by the start of the gap only.
This is sufficient to deal with the most common cases of message loss,
and eliminates the need for complex end of gap computations.
5) A peeking node no longer tries to determine whether it should send a
complementary NACK, since the most common cases of message loss don't
require it to be sent. Consequently, the node no longer examines the
"broadcast tag" field of a NACK message when peeking.
Signed-off-by: Allan Stephens <allan.stephens@windriver.com>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
2011-10-27 22:17:53 +04:00
struct sk_buff * buf ;
2015-02-05 16:36:36 +03:00
struct net * net = n_ptr - > net ;
2015-01-09 10:27:04 +03:00
struct tipc_net * tn = net_generic ( net , tipc_net_id ) ;
2006-01-02 21:04:38 +03:00
tipc: Major redesign of broadcast link ACK/NACK algorithms
Completely redesigns broadcast link ACK and NACK mechanisms to prevent
spurious retransmit requests in dual LAN networks, and to prevent the
broadcast link from stalling due to the failure of a receiving node to
acknowledge receiving a broadcast message or request its retransmission.
Note: These changes only impact the timing of when ACK and NACK messages
are sent, and not the basic broadcast link protocol itself, so inter-
operability with nodes using the "classic" algorithms is maintained.
The revised algorithms are as follows:
1) An explicit ACK message is still sent after receiving 16 in-sequence
messages, and implicit ACK information continues to be carried in other
unicast link message headers (including link state messages). However,
the timing of explicit ACKs is now based on the receiving node's absolute
network address rather than its relative network address to ensure that
the failure of another node does not delay the ACK beyond its 16 message
target.
2) A NACK message is now typically sent only when a message gap persists
for two consecutive incoming link state messages; this ensures that a
suspected gap is not confirmed until both LANs in a dual LAN network have
had an opportunity to deliver the message, thereby preventing spurious NACKs.
A NACK message can also be generated by the arrival of a single link state
message, if the deferred queue is so big that the current message gap
cannot be the result of "normal" mis-ordering due to the use of dual LANs
(or one LAN using a bonded interface). Since link state messages typically
arrive at different nodes at different times the problem of multiple nodes
issuing identical NACKs simultaneously is inherently avoided.
3) Nodes continue to "peek" at NACK messages sent by other nodes. If
another node requests retransmission of a message gap suspected (but not
yet confirmed) by the peeking node, the peeking node forgets about the
gap and does not generate a duplicate retransmit request. (If the peeking
node subsequently fails to receive the lost message, later link state
messages will cause it to rediscover and confirm the gap and send another
NACK.)
4) Message gap "equality" is now determined by the start of the gap only.
This is sufficient to deal with the most common cases of message loss,
and eliminates the need for complex end of gap computations.
5) A peeking node no longer tries to determine whether it should send a
complementary NACK, since the most common cases of message loss don't
require it to be sent. Consequently, the node no longer examines the
"broadcast tag" field of a NACK message when peeking.
Signed-off-by: Allan Stephens <allan.stephens@windriver.com>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
2011-10-27 22:17:53 +04:00
/* Ignore "stale" link state info */
if ( less_eq ( last_sent , n_ptr - > bclink . last_in ) )
return ;
2006-01-02 21:04:38 +03:00
tipc: Major redesign of broadcast link ACK/NACK algorithms
Completely redesigns broadcast link ACK and NACK mechanisms to prevent
spurious retransmit requests in dual LAN networks, and to prevent the
broadcast link from stalling due to the failure of a receiving node to
acknowledge receiving a broadcast message or request its retransmission.
Note: These changes only impact the timing of when ACK and NACK messages
are sent, and not the basic broadcast link protocol itself, so inter-
operability with nodes using the "classic" algorithms is maintained.
The revised algorithms are as follows:
1) An explicit ACK message is still sent after receiving 16 in-sequence
messages, and implicit ACK information continues to be carried in other
unicast link message headers (including link state messages). However,
the timing of explicit ACKs is now based on the receiving node's absolute
network address rather than its relative network address to ensure that
the failure of another node does not delay the ACK beyond its 16 message
target.
2) A NACK message is now typically sent only when a message gap persists
for two consecutive incoming link state messages; this ensures that a
suspected gap is not confirmed until both LANs in a dual LAN network have
had an opportunity to deliver the message, thereby preventing spurious NACKs.
A NACK message can also be generated by the arrival of a single link state
message, if the deferred queue is so big that the current message gap
cannot be the result of "normal" mis-ordering due to the use of dual LANs
(or one LAN using a bonded interface). Since link state messages typically
arrive at different nodes at different times the problem of multiple nodes
issuing identical NACKs simultaneously is inherently avoided.
3) Nodes continue to "peek" at NACK messages sent by other nodes. If
another node requests retransmission of a message gap suspected (but not
yet confirmed) by the peeking node, the peeking node forgets about the
gap and does not generate a duplicate retransmit request. (If the peeking
node subsequently fails to receive the lost message, later link state
messages will cause it to rediscover and confirm the gap and send another
NACK.)
4) Message gap "equality" is now determined by the start of the gap only.
This is sufficient to deal with the most common cases of message loss,
and eliminates the need for complex end of gap computations.
5) A peeking node no longer tries to determine whether it should send a
complementary NACK, since the most common cases of message loss don't
require it to be sent. Consequently, the node no longer examines the
"broadcast tag" field of a NACK message when peeking.
Signed-off-by: Allan Stephens <allan.stephens@windriver.com>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
2011-10-27 22:17:53 +04:00
/* Update link synchronization state; quit if in sync */
bclink_update_last_sent ( n_ptr , last_sent ) ;
if ( n_ptr - > bclink . last_sent = = n_ptr - > bclink . last_in )
return ;
/* Update out-of-sync state; quit if loss is still unconfirmed */
if ( ( + + n_ptr - > bclink . oos_state ) = = 1 ) {
if ( n_ptr - > bclink . deferred_size < ( TIPC_MIN_LINK_WIN / 2 ) )
return ;
n_ptr - > bclink . oos_state + + ;
}
/* Don't NACK if one has been recently sent (or seen) */
if ( n_ptr - > bclink . oos_state & 0x1 )
2006-01-02 21:04:38 +03:00
return ;
tipc: Major redesign of broadcast link ACK/NACK algorithms
Completely redesigns broadcast link ACK and NACK mechanisms to prevent
spurious retransmit requests in dual LAN networks, and to prevent the
broadcast link from stalling due to the failure of a receiving node to
acknowledge receiving a broadcast message or request its retransmission.
Note: These changes only impact the timing of when ACK and NACK messages
are sent, and not the basic broadcast link protocol itself, so inter-
operability with nodes using the "classic" algorithms is maintained.
The revised algorithms are as follows:
1) An explicit ACK message is still sent after receiving 16 in-sequence
messages, and implicit ACK information continues to be carried in other
unicast link message headers (including link state messages). However,
the timing of explicit ACKs is now based on the receiving node's absolute
network address rather than its relative network address to ensure that
the failure of another node does not delay the ACK beyond its 16 message
target.
2) A NACK message is now typically sent only when a message gap persists
for two consecutive incoming link state messages; this ensures that a
suspected gap is not confirmed until both LANs in a dual LAN network have
had an opportunity to deliver the message, thereby preventing spurious NACKs.
A NACK message can also be generated by the arrival of a single link state
message, if the deferred queue is so big that the current message gap
cannot be the result of "normal" mis-ordering due to the use of dual LANs
(or one LAN using a bonded interface). Since link state messages typically
arrive at different nodes at different times the problem of multiple nodes
issuing identical NACKs simultaneously is inherently avoided.
3) Nodes continue to "peek" at NACK messages sent by other nodes. If
another node requests retransmission of a message gap suspected (but not
yet confirmed) by the peeking node, the peeking node forgets about the
gap and does not generate a duplicate retransmit request. (If the peeking
node subsequently fails to receive the lost message, later link state
messages will cause it to rediscover and confirm the gap and send another
NACK.)
4) Message gap "equality" is now determined by the start of the gap only.
This is sufficient to deal with the most common cases of message loss,
and eliminates the need for complex end of gap computations.
5) A peeking node no longer tries to determine whether it should send a
complementary NACK, since the most common cases of message loss don't
require it to be sent. Consequently, the node no longer examines the
"broadcast tag" field of a NACK message when peeking.
Signed-off-by: Allan Stephens <allan.stephens@windriver.com>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
2011-10-27 22:17:53 +04:00
/* Send NACK */
2010-10-13 17:20:35 +04:00
buf = tipc_buf_acquire ( INT_H_SIZE ) ;
2006-01-02 21:04:38 +03:00
if ( buf ) {
tipc: Major redesign of broadcast link ACK/NACK algorithms
Completely redesigns broadcast link ACK and NACK mechanisms to prevent
spurious retransmit requests in dual LAN networks, and to prevent the
broadcast link from stalling due to the failure of a receiving node to
acknowledge receiving a broadcast message or request its retransmission.
Note: These changes only impact the timing of when ACK and NACK messages
are sent, and not the basic broadcast link protocol itself, so inter-
operability with nodes using the "classic" algorithms is maintained.
The revised algorithms are as follows:
1) An explicit ACK message is still sent after receiving 16 in-sequence
messages, and implicit ACK information continues to be carried in other
unicast link message headers (including link state messages). However,
the timing of explicit ACKs is now based on the receiving node's absolute
network address rather than its relative network address to ensure that
the failure of another node does not delay the ACK beyond its 16 message
target.
2) A NACK message is now typically sent only when a message gap persists
for two consecutive incoming link state messages; this ensures that a
suspected gap is not confirmed until both LANs in a dual LAN network have
had an opportunity to deliver the message, thereby preventing spurious NACKs.
A NACK message can also be generated by the arrival of a single link state
message, if the deferred queue is so big that the current message gap
cannot be the result of "normal" mis-ordering due to the use of dual LANs
(or one LAN using a bonded interface). Since link state messages typically
arrive at different nodes at different times the problem of multiple nodes
issuing identical NACKs simultaneously is inherently avoided.
3) Nodes continue to "peek" at NACK messages sent by other nodes. If
another node requests retransmission of a message gap suspected (but not
yet confirmed) by the peeking node, the peeking node forgets about the
gap and does not generate a duplicate retransmit request. (If the peeking
node subsequently fails to receive the lost message, later link state
messages will cause it to rediscover and confirm the gap and send another
NACK.)
4) Message gap "equality" is now determined by the start of the gap only.
This is sufficient to deal with the most common cases of message loss,
and eliminates the need for complex end of gap computations.
5) A peeking node no longer tries to determine whether it should send a
complementary NACK, since the most common cases of message loss don't
require it to be sent. Consequently, the node no longer examines the
"broadcast tag" field of a NACK message when peeking.
Signed-off-by: Allan Stephens <allan.stephens@windriver.com>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
2011-10-27 22:17:53 +04:00
struct tipc_msg * msg = buf_msg ( buf ) ;
2015-03-13 23:08:10 +03:00
struct sk_buff * skb = skb_peek ( & n_ptr - > bclink . deferdq ) ;
2014-11-26 06:41:53 +03:00
u32 to = skb ? buf_seqno ( skb ) - 1 : n_ptr - > bclink . last_sent ;
tipc: Major redesign of broadcast link ACK/NACK algorithms
Completely redesigns broadcast link ACK and NACK mechanisms to prevent
spurious retransmit requests in dual LAN networks, and to prevent the
broadcast link from stalling due to the failure of a receiving node to
acknowledge receiving a broadcast message or request its retransmission.
Note: These changes only impact the timing of when ACK and NACK messages
are sent, and not the basic broadcast link protocol itself, so inter-
operability with nodes using the "classic" algorithms is maintained.
The revised algorithms are as follows:
1) An explicit ACK message is still sent after receiving 16 in-sequence
messages, and implicit ACK information continues to be carried in other
unicast link message headers (including link state messages). However,
the timing of explicit ACKs is now based on the receiving node's absolute
network address rather than its relative network address to ensure that
the failure of another node does not delay the ACK beyond its 16 message
target.
2) A NACK message is now typically sent only when a message gap persists
for two consecutive incoming link state messages; this ensures that a
suspected gap is not confirmed until both LANs in a dual LAN network have
had an opportunity to deliver the message, thereby preventing spurious NACKs.
A NACK message can also be generated by the arrival of a single link state
message, if the deferred queue is so big that the current message gap
cannot be the result of "normal" mis-ordering due to the use of dual LANs
(or one LAN using a bonded interface). Since link state messages typically
arrive at different nodes at different times the problem of multiple nodes
issuing identical NACKs simultaneously is inherently avoided.
3) Nodes continue to "peek" at NACK messages sent by other nodes. If
another node requests retransmission of a message gap suspected (but not
yet confirmed) by the peeking node, the peeking node forgets about the
gap and does not generate a duplicate retransmit request. (If the peeking
node subsequently fails to receive the lost message, later link state
messages will cause it to rediscover and confirm the gap and send another
NACK.)
4) Message gap "equality" is now determined by the start of the gap only.
This is sufficient to deal with the most common cases of message loss,
and eliminates the need for complex end of gap computations.
5) A peeking node no longer tries to determine whether it should send a
complementary NACK, since the most common cases of message loss don't
require it to be sent. Consequently, the node no longer examines the
"broadcast tag" field of a NACK message when peeking.
Signed-off-by: Allan Stephens <allan.stephens@windriver.com>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
2011-10-27 22:17:53 +04:00
2015-02-05 16:36:36 +03:00
tipc_msg_init ( tn - > own_addr , msg , BCAST_PROTOCOL , STATE_MSG ,
tipc: Major redesign of broadcast link ACK/NACK algorithms
Completely redesigns broadcast link ACK and NACK mechanisms to prevent
spurious retransmit requests in dual LAN networks, and to prevent the
broadcast link from stalling due to the failure of a receiving node to
acknowledge receiving a broadcast message or request its retransmission.
Note: These changes only impact the timing of when ACK and NACK messages
are sent, and not the basic broadcast link protocol itself, so inter-
operability with nodes using the "classic" algorithms is maintained.
The revised algorithms are as follows:
1) An explicit ACK message is still sent after receiving 16 in-sequence
messages, and implicit ACK information continues to be carried in other
unicast link message headers (including link state messages). However,
the timing of explicit ACKs is now based on the receiving node's absolute
network address rather than its relative network address to ensure that
the failure of another node does not delay the ACK beyond its 16 message
target.
2) A NACK message is now typically sent only when a message gap persists
for two consecutive incoming link state messages; this ensures that a
suspected gap is not confirmed until both LANs in a dual LAN network have
had an opportunity to deliver the message, thereby preventing spurious NACKs.
A NACK message can also be generated by the arrival of a single link state
message, if the deferred queue is so big that the current message gap
cannot be the result of "normal" mis-ordering due to the use of dual LANs
(or one LAN using a bonded interface). Since link state messages typically
arrive at different nodes at different times the problem of multiple nodes
issuing identical NACKs simultaneously is inherently avoided.
3) Nodes continue to "peek" at NACK messages sent by other nodes. If
another node requests retransmission of a message gap suspected (but not
yet confirmed) by the peeking node, the peeking node forgets about the
gap and does not generate a duplicate retransmit request. (If the peeking
node subsequently fails to receive the lost message, later link state
messages will cause it to rediscover and confirm the gap and send another
NACK.)
4) Message gap "equality" is now determined by the start of the gap only.
This is sufficient to deal with the most common cases of message loss,
and eliminates the need for complex end of gap computations.
5) A peeking node no longer tries to determine whether it should send a
complementary NACK, since the most common cases of message loss don't
require it to be sent. Consequently, the node no longer examines the
"broadcast tag" field of a NACK message when peeking.
Signed-off-by: Allan Stephens <allan.stephens@windriver.com>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
2011-10-27 22:17:53 +04:00
INT_H_SIZE , n_ptr - > addr ) ;
2011-01-26 00:12:39 +03:00
msg_set_non_seq ( msg , 1 ) ;
2015-01-09 10:27:04 +03:00
msg_set_mc_netid ( msg , tn - > net_id ) ;
tipc: Major redesign of broadcast link ACK/NACK algorithms
Completely redesigns broadcast link ACK and NACK mechanisms to prevent
spurious retransmit requests in dual LAN networks, and to prevent the
broadcast link from stalling due to the failure of a receiving node to
acknowledge receiving a broadcast message or request its retransmission.
Note: These changes only impact the timing of when ACK and NACK messages
are sent, and not the basic broadcast link protocol itself, so inter-
operability with nodes using the "classic" algorithms is maintained.
The revised algorithms are as follows:
1) An explicit ACK message is still sent after receiving 16 in-sequence
messages, and implicit ACK information continues to be carried in other
unicast link message headers (including link state messages). However,
the timing of explicit ACKs is now based on the receiving node's absolute
network address rather than its relative network address to ensure that
the failure of another node does not delay the ACK beyond its 16 message
target.
2) A NACK message is now typically sent only when a message gap persists
for two consecutive incoming link state messages; this ensures that a
suspected gap is not confirmed until both LANs in a dual LAN network have
had an opportunity to deliver the message, thereby preventing spurious NACKs.
A NACK message can also be generated by the arrival of a single link state
message, if the deferred queue is so big that the current message gap
cannot be the result of "normal" mis-ordering due to the use of dual LANs
(or one LAN using a bonded interface). Since link state messages typically
arrive at different nodes at different times the problem of multiple nodes
issuing identical NACKs simultaneously is inherently avoided.
3) Nodes continue to "peek" at NACK messages sent by other nodes. If
another node requests retransmission of a message gap suspected (but not
yet confirmed) by the peeking node, the peeking node forgets about the
gap and does not generate a duplicate retransmit request. (If the peeking
node subsequently fails to receive the lost message, later link state
messages will cause it to rediscover and confirm the gap and send another
NACK.)
4) Message gap "equality" is now determined by the start of the gap only.
This is sufficient to deal with the most common cases of message loss,
and eliminates the need for complex end of gap computations.
5) A peeking node no longer tries to determine whether it should send a
complementary NACK, since the most common cases of message loss don't
require it to be sent. Consequently, the node no longer examines the
"broadcast tag" field of a NACK message when peeking.
Signed-off-by: Allan Stephens <allan.stephens@windriver.com>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
2011-10-27 22:17:53 +04:00
msg_set_bcast_ack ( msg , n_ptr - > bclink . last_in ) ;
msg_set_bcgap_after ( msg , n_ptr - > bclink . last_in ) ;
2014-11-26 06:41:53 +03:00
msg_set_bcgap_to ( msg , to ) ;
2006-01-02 21:04:38 +03:00
2015-01-09 10:27:07 +03:00
tipc_bclink_lock ( net ) ;
2015-01-09 10:27:06 +03:00
tipc_bearer_send ( net , MAX_BEARERS , buf , NULL ) ;
2015-01-09 10:27:07 +03:00
tn - > bcl - > stats . sent_nacks + + ;
tipc_bclink_unlock ( net ) ;
2011-11-04 21:24:29 +04:00
kfree_skb ( buf ) ;
2006-01-02 21:04:38 +03:00
tipc: Major redesign of broadcast link ACK/NACK algorithms
Completely redesigns broadcast link ACK and NACK mechanisms to prevent
spurious retransmit requests in dual LAN networks, and to prevent the
broadcast link from stalling due to the failure of a receiving node to
acknowledge receiving a broadcast message or request its retransmission.
Note: These changes only impact the timing of when ACK and NACK messages
are sent, and not the basic broadcast link protocol itself, so inter-
operability with nodes using the "classic" algorithms is maintained.
The revised algorithms are as follows:
1) An explicit ACK message is still sent after receiving 16 in-sequence
messages, and implicit ACK information continues to be carried in other
unicast link message headers (including link state messages). However,
the timing of explicit ACKs is now based on the receiving node's absolute
network address rather than its relative network address to ensure that
the failure of another node does not delay the ACK beyond its 16 message
target.
2) A NACK message is now typically sent only when a message gap persists
for two consecutive incoming link state messages; this ensures that a
suspected gap is not confirmed until both LANs in a dual LAN network have
had an opportunity to deliver the message, thereby preventing spurious NACKs.
A NACK message can also be generated by the arrival of a single link state
message, if the deferred queue is so big that the current message gap
cannot be the result of "normal" mis-ordering due to the use of dual LANs
(or one LAN using a bonded interface). Since link state messages typically
arrive at different nodes at different times the problem of multiple nodes
issuing identical NACKs simultaneously is inherently avoided.
3) Nodes continue to "peek" at NACK messages sent by other nodes. If
another node requests retransmission of a message gap suspected (but not
yet confirmed) by the peeking node, the peeking node forgets about the
gap and does not generate a duplicate retransmit request. (If the peeking
node subsequently fails to receive the lost message, later link state
messages will cause it to rediscover and confirm the gap and send another
NACK.)
4) Message gap "equality" is now determined by the start of the gap only.
This is sufficient to deal with the most common cases of message loss,
and eliminates the need for complex end of gap computations.
5) A peeking node no longer tries to determine whether it should send a
complementary NACK, since the most common cases of message loss don't
require it to be sent. Consequently, the node no longer examines the
"broadcast tag" field of a NACK message when peeking.
Signed-off-by: Allan Stephens <allan.stephens@windriver.com>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
2011-10-27 22:17:53 +04:00
n_ptr - > bclink . oos_state + + ;
2006-01-02 21:04:38 +03:00
}
}
2012-07-10 14:55:09 +04:00
/**
tipc: Major redesign of broadcast link ACK/NACK algorithms
Completely redesigns broadcast link ACK and NACK mechanisms to prevent
spurious retransmit requests in dual LAN networks, and to prevent the
broadcast link from stalling due to the failure of a receiving node to
acknowledge receiving a broadcast message or request its retransmission.
Note: These changes only impact the timing of when ACK and NACK messages
are sent, and not the basic broadcast link protocol itself, so inter-
operability with nodes using the "classic" algorithms is maintained.
The revised algorithms are as follows:
1) An explicit ACK message is still sent after receiving 16 in-sequence
messages, and implicit ACK information continues to be carried in other
unicast link message headers (including link state messages). However,
the timing of explicit ACKs is now based on the receiving node's absolute
network address rather than its relative network address to ensure that
the failure of another node does not delay the ACK beyond its 16 message
target.
2) A NACK message is now typically sent only when a message gap persists
for two consecutive incoming link state messages; this ensures that a
suspected gap is not confirmed until both LANs in a dual LAN network have
had an opportunity to deliver the message, thereby preventing spurious NACKs.
A NACK message can also be generated by the arrival of a single link state
message, if the deferred queue is so big that the current message gap
cannot be the result of "normal" mis-ordering due to the use of dual LANs
(or one LAN using a bonded interface). Since link state messages typically
arrive at different nodes at different times the problem of multiple nodes
issuing identical NACKs simultaneously is inherently avoided.
3) Nodes continue to "peek" at NACK messages sent by other nodes. If
another node requests retransmission of a message gap suspected (but not
yet confirmed) by the peeking node, the peeking node forgets about the
gap and does not generate a duplicate retransmit request. (If the peeking
node subsequently fails to receive the lost message, later link state
messages will cause it to rediscover and confirm the gap and send another
NACK.)
4) Message gap "equality" is now determined by the start of the gap only.
This is sufficient to deal with the most common cases of message loss,
and eliminates the need for complex end of gap computations.
5) A peeking node no longer tries to determine whether it should send a
complementary NACK, since the most common cases of message loss don't
require it to be sent. Consequently, the node no longer examines the
"broadcast tag" field of a NACK message when peeking.
Signed-off-by: Allan Stephens <allan.stephens@windriver.com>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
2011-10-27 22:17:53 +04:00
* bclink_peek_nack - monitor retransmission requests sent by other nodes
2006-01-02 21:04:38 +03:00
*
tipc: Major redesign of broadcast link ACK/NACK algorithms
Completely redesigns broadcast link ACK and NACK mechanisms to prevent
spurious retransmit requests in dual LAN networks, and to prevent the
broadcast link from stalling due to the failure of a receiving node to
acknowledge receiving a broadcast message or request its retransmission.
Note: These changes only impact the timing of when ACK and NACK messages
are sent, and not the basic broadcast link protocol itself, so inter-
operability with nodes using the "classic" algorithms is maintained.
The revised algorithms are as follows:
1) An explicit ACK message is still sent after receiving 16 in-sequence
messages, and implicit ACK information continues to be carried in other
unicast link message headers (including link state messages). However,
the timing of explicit ACKs is now based on the receiving node's absolute
network address rather than its relative network address to ensure that
the failure of another node does not delay the ACK beyond its 16 message
target.
2) A NACK message is now typically sent only when a message gap persists
for two consecutive incoming link state messages; this ensures that a
suspected gap is not confirmed until both LANs in a dual LAN network have
had an opportunity to deliver the message, thereby preventing spurious NACKs.
A NACK message can also be generated by the arrival of a single link state
message, if the deferred queue is so big that the current message gap
cannot be the result of "normal" mis-ordering due to the use of dual LANs
(or one LAN using a bonded interface). Since link state messages typically
arrive at different nodes at different times the problem of multiple nodes
issuing identical NACKs simultaneously is inherently avoided.
3) Nodes continue to "peek" at NACK messages sent by other nodes. If
another node requests retransmission of a message gap suspected (but not
yet confirmed) by the peeking node, the peeking node forgets about the
gap and does not generate a duplicate retransmit request. (If the peeking
node subsequently fails to receive the lost message, later link state
messages will cause it to rediscover and confirm the gap and send another
NACK.)
4) Message gap "equality" is now determined by the start of the gap only.
This is sufficient to deal with the most common cases of message loss,
and eliminates the need for complex end of gap computations.
5) A peeking node no longer tries to determine whether it should send a
complementary NACK, since the most common cases of message loss don't
require it to be sent. Consequently, the node no longer examines the
"broadcast tag" field of a NACK message when peeking.
Signed-off-by: Allan Stephens <allan.stephens@windriver.com>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
2011-10-27 22:17:53 +04:00
* Delay any upcoming NACK by this node if another node has already
* requested the first message this node is going to ask for .
2006-01-02 21:04:38 +03:00
*/
2015-01-09 10:27:05 +03:00
static void bclink_peek_nack ( struct net * net , struct tipc_msg * msg )
2006-01-02 21:04:38 +03:00
{
/*
 * Inspect a NACK carried in msg . The node is looked up by the
 * destination - node field of the message ; when no such node is found
 * the function returns without doing anything .
 */
2015-01-09 10:27:05 +03:00
struct tipc_node * n_ptr = tipc_node_find ( net , msg_destnode ( msg ) ) ;
2006-01-02 21:04:38 +03:00
tipc: Major redesign of broadcast link ACK/NACK algorithms
Completely redesigns broadcast link ACK and NACK mechanisms to prevent
spurious retransmit requests in dual LAN networks, and to prevent the
broadcast link from stalling due to the failure of a receiving node to
acknowledge receiving a broadcast message or request its retransmission.
Note: These changes only impact the timing of when ACK and NACK messages
are sent, and not the basic broadcast link protocol itself, so inter-
operability with nodes using the "classic" algorithms is maintained.
The revised algorithms are as follows:
1) An explicit ACK message is still sent after receiving 16 in-sequence
messages, and implicit ACK information continues to be carried in other
unicast link message headers (including link state messages). However,
the timing of explicit ACKs is now based on the receiving node's absolute
network address rather than its relative network address to ensure that
the failure of another node does not delay the ACK beyond its 16 message
target.
2) A NACK message is now typically sent only when a message gap persists
for two consecutive incoming link state messages; this ensures that a
suspected gap is not confirmed until both LANs in a dual LAN network have
had an opportunity to deliver the message, thereby preventing spurious NACKs.
A NACK message can also be generated by the arrival of a single link state
message, if the deferred queue is so big that the current message gap
cannot be the result of "normal" mis-ordering due to the use of dual LANs
(or one LAN using a bonded interface). Since link state messages typically
arrive at different nodes at different times the problem of multiple nodes
issuing identical NACKs simultaneously is inherently avoided.
3) Nodes continue to "peek" at NACK messages sent by other nodes. If
another node requests retransmission of a message gap suspected (but not
yet confirmed) by the peeking node, the peeking node forgets about the
gap and does not generate a duplicate retransmit request. (If the peeking
node subsequently fails to receive the lost message, later link state
messages will cause it to rediscover and confirm the gap and send another
NACK.)
4) Message gap "equality" is now determined by the start of the gap only.
This is sufficient to deal with the most common cases of message loss,
and eliminates the need for complex end of gap computations.
5) A peeking node no longer tries to determine whether it should send a
complementary NACK, since the most common cases of message loss don't
require it to be sent. Consequently, the node no longer examines the
"broadcast tag" field of a NACK message when peeking.
Signed-off-by: Allan Stephens <allan.stephens@windriver.com>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
2011-10-27 22:17:53 +04:00
if ( unlikely ( ! n_ptr ) )
2006-01-02 21:04:38 +03:00
return ;
tipc: Major redesign of broadcast link ACK/NACK algorithms
Completely redesigns broadcast link ACK and NACK mechanisms to prevent
spurious retransmit requests in dual LAN networks, and to prevent the
broadcast link from stalling due to the failure of a receiving node to
acknowledge receiving a broadcast message or request its retransmission.
Note: These changes only impact the timing of when ACK and NACK messages
are sent, and not the basic broadcast link protocol itself, so inter-
operability with nodes using the "classic" algorithms is maintained.
The revised algorithms are as follows:
1) An explicit ACK message is still sent after receiving 16 in-sequence
messages, and implicit ACK information continues to be carried in other
unicast link message headers (including link state messages). However,
the timing of explicit ACKs is now based on the receiving node's absolute
network address rather than its relative network address to ensure that
the failure of another node does not delay the ACK beyond its 16 message
target.
2) A NACK message is now typically sent only when a message gap persists
for two consecutive incoming link state messages; this ensures that a
suspected gap is not confirmed until both LANs in a dual LAN network have
had an opportunity to deliver the message, thereby preventing spurious NACKs.
A NACK message can also be generated by the arrival of a single link state
message, if the deferred queue is so big that the current message gap
cannot be the result of "normal" mis-ordering due to the use of dual LANs
(or one LAN using a bonded interface). Since link state messages typically
arrive at different nodes at different times the problem of multiple nodes
issuing identical NACKs simultaneously is inherently avoided.
3) Nodes continue to "peek" at NACK messages sent by other nodes. If
another node requests retransmission of a message gap suspected (but not
yet confirmed) by the peeking node, the peeking node forgets about the
gap and does not generate a duplicate retransmit request. (If the peeking
node subsequently fails to receive the lost message, later link state
messages will cause it to rediscover and confirm the gap and send another
NACK.)
4) Message gap "equality" is now determined by the start of the gap only.
This is sufficient to deal with the most common cases of message loss,
and eliminates the need for complex end of gap computations.
5) A peeking node no longer tries to determine whether it should send a
complementary NACK, since the most common cases of message loss don't
require it to be sent. Consequently, the node no longer examines the
"broadcast tag" field of a NACK message when peeking.
Signed-off-by: Allan Stephens <allan.stephens@windriver.com>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
2011-10-27 22:17:53 +04:00
2006-01-18 02:38:21 +03:00
/*
 * With the node lock held : if reception is permitted for this node ,
 * its last_in differs from its last_sent , and last_in equals the gap
 * start ( msg_bcgap_after ) named in the peeked message , then oos_state
 * is set to 2 .
 * NOTE ( review ) : the meaning of the value 2 is not visible in this
 * function - confirm against the oos_state definition .
 */
tipc_node_lock ( n_ptr ) ;
2012-11-16 09:51:30 +04:00
if ( n_ptr - > bclink . recv_permitted & &
tipc: Major redesign of broadcast link ACK/NACK algorithms
Completely redesigns broadcast link ACK and NACK mechanisms to prevent
spurious retransmit requests in dual LAN networks, and to prevent the
broadcast link from stalling due to the failure of a receiving node to
acknowledge receiving a broadcast message or request its retransmission.
Note: These changes only impact the timing of when ACK and NACK messages
are sent, and not the basic broadcast link protocol itself, so inter-
operability with nodes using the "classic" algorithms is maintained.
The revised algorithms are as follows:
1) An explicit ACK message is still sent after receiving 16 in-sequence
messages, and implicit ACK information continues to be carried in other
unicast link message headers (including link state messages). However,
the timing of explicit ACKs is now based on the receiving node's absolute
network address rather than its relative network address to ensure that
the failure of another node does not delay the ACK beyond its 16 message
target.
2) A NACK message is now typically sent only when a message gap persists
for two consecutive incoming link state messages; this ensures that a
suspected gap is not confirmed until both LANs in a dual LAN network have
had an opportunity to deliver the message, thereby preventing spurious NACKs.
A NACK message can also be generated by the arrival of a single link state
message, if the deferred queue is so big that the current message gap
cannot be the result of "normal" mis-ordering due to the use of dual LANs
(or one LAN using a bonded interface). Since link state messages typically
arrive at different nodes at different times the problem of multiple nodes
issuing identical NACKs simultaneously is inherently avoided.
3) Nodes continue to "peek" at NACK messages sent by other nodes. If
another node requests retransmission of a message gap suspected (but not
yet confirmed) by the peeking node, the peeking node forgets about the
gap and does not generate a duplicate retransmit request. (If the peeking
node subsequently fails to receive the lost message, later link state
messages will cause it to rediscover and confirm the gap and send another
NACK.)
4) Message gap "equality" is now determined by the start of the gap only.
This is sufficient to deal with the most common cases of message loss,
and eliminates the need for complex end of gap computations.
5) A peeking node no longer tries to determine whether it should send a
complementary NACK, since the most common cases of message loss don't
require it to be sent. Consequently, the node no longer examines the
"broadcast tag" field of a NACK message when peeking.
Signed-off-by: Allan Stephens <allan.stephens@windriver.com>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
2011-10-27 22:17:53 +04:00
( n_ptr - > bclink . last_in ! = n_ptr - > bclink . last_sent ) & &
( n_ptr - > bclink . last_in = = msg_bcgap_after ( msg ) ) )
n_ptr - > bclink . oos_state = 2 ;
2006-01-18 02:38:21 +03:00
tipc_node_unlock ( n_ptr ) ;
2015-03-26 13:10:24 +03:00
/*
 * Hand the node obtained from tipc_node_find ( ) back once we are done
 * with it ( presumably this drops a reference - confirm against
 * tipc_node_put ) .
 */
tipc_node_put ( n_ptr ) ;
2006-01-02 21:04:38 +03:00
}
2015-02-05 16:36:44 +03:00
/* tipc_bclink_xmit - deliver buffer chain to all nodes in cluster
2014-07-17 04:41:03 +04:00
* and to identified node local sockets
2015-01-09 10:27:05 +03:00
* @ net : the applicable net namespace
2014-11-26 06:41:55 +03:00
* @ list : chain of buffers containing message
2014-07-17 04:41:00 +04:00
* Consumes the buffer chain , except when returning - ELINKCONG
* Returns 0 if success , otherwise errno : - ELINKCONG , - EHOSTUNREACH , - EMSGSIZE
*/
2015-01-09 10:27:05 +03:00
int tipc_bclink_xmit ( struct net * net , struct sk_buff_head * list )
2014-07-17 04:41:00 +04:00
{
2015-01-09 10:27:07 +03:00
struct tipc_net * tn = net_generic ( net , tipc_net_id ) ;
struct tipc_link * bcl = tn - > bcl ;
struct tipc_bclink * bclink = tn - > bclink ;
2014-07-17 04:41:00 +04:00
/* rc : result of the link transmit ; bc : set to 1 once the chain has been offered to the broadcast link */
int rc = 0 ;
int bc = 0 ;
2014-11-26 06:41:55 +03:00
struct sk_buff * skb ;
2015-02-05 16:36:44 +03:00
/* Function - local queues , used only to pass the clone to tipc_sk_mcast_rcv ( ) */
struct sk_buff_head arrvq ;
struct sk_buff_head inputq ;
2014-07-17 04:41:00 +04:00
/* Prepare clone of message for local node */
2014-11-26 06:41:55 +03:00
skb = tipc_msg_reassemble ( list ) ;
if ( unlikely ( ! skb ) ) {
/* No clone could be built : drop the chain and fail */
__skb_queue_purge ( list ) ;
2014-07-17 04:41:00 +04:00
return - EHOSTUNREACH ;
}
2015-02-05 16:36:44 +03:00
/* Broadcast to all nodes */
2014-07-17 04:41:00 +04:00
if ( likely ( bclink ) ) {
2015-01-09 10:27:07 +03:00
tipc_bclink_lock ( net ) ;
2014-07-17 04:41:00 +04:00
/* Transmit only when bcast_nodes . count is non - zero */
if ( likely ( bclink - > bcast_nodes . count ) ) {
2015-01-09 10:27:06 +03:00
rc = __tipc_link_xmit ( net , bcl , list ) ;
2014-07-17 04:41:00 +04:00
if ( likely ( ! rc ) ) {
2015-03-13 23:08:10 +03:00
/* Successful send : update last - sent and add the current transmit queue length to the statistics */
u32 len = skb_queue_len ( & bcl - > transmq ) ;
2014-11-26 06:41:52 +03:00
2015-01-09 10:27:07 +03:00
bclink_set_last_sent ( net ) ;
2014-07-17 04:41:00 +04:00
bcl - > stats . queue_sz_counts + + ;
2014-11-26 06:41:52 +03:00
bcl - > stats . accu_queue_sz + = len ;
2014-07-17 04:41:00 +04:00
}
bc = 1 ;
}
2015-01-09 10:27:07 +03:00
tipc_bclink_unlock ( net ) ;
2014-07-17 04:41:00 +04:00
}
/* The chain was never handed to the broadcast link , so release it here */
if ( unlikely ( ! bc ) )
2014-11-26 06:41:55 +03:00
__skb_queue_purge ( list ) ;
2014-07-17 04:41:00 +04:00
2015-02-05 16:36:44 +03:00
/* Link transmit reported an error : discard the local clone and propagate rc */
if ( unlikely ( rc ) ) {
2014-11-26 06:41:55 +03:00
kfree_skb ( skb ) ;
2015-02-05 16:36:44 +03:00
return rc ;
}
/* Deliver message clone */
__skb_queue_head_init ( & arrvq ) ;
skb_queue_head_init ( & inputq ) ;
__skb_queue_tail ( & arrvq , skb ) ;
tipc_sk_mcast_rcv ( net , & arrvq , & inputq ) ;
2014-07-17 04:41:00 +04:00
/* rc is 0 here ; any non - zero value was returned above */
return rc ;
}
2012-07-10 14:55:09 +04:00
/**
2011-10-28 00:43:09 +04:00
* bclink_accept_pkt - accept an incoming , in - sequence broadcast packet
* @ node : sending node whose broadcast receive state is updated
* @ seqno : sequence number of the packet being accepted
*
2014-05-05 04:56:15 +04:00
* Called with both sending node ' s lock and bclink_lock taken .
2011-10-28 00:43:09 +04:00
*/
static void bclink_accept_pkt ( struct tipc_node * node , u32 seqno )
{
2015-01-09 10:27:07 +03:00
struct tipc_net * tn = net_generic ( node - > net , tipc_net_id ) ;
2011-10-28 00:43:09 +04:00
/* Record seqno as the last packet received in sequence and reset oos_state to 0 */
bclink_update_last_sent ( node , seqno ) ;
node - > bclink . last_in = seqno ;
node - > bclink . oos_state = 0 ;
2015-01-09 10:27:07 +03:00
/* Count the accepted packet in the broadcast link statistics */
tn - > bcl - > stats . recv_info + + ;
2011-10-28 00:43:09 +04:00
/*
* Unicast an ACK periodically , ensuring that
* all nodes in the cluster don ' t ACK at the same time
*/
2015-01-09 10:27:10 +03:00
/*
 * The ACK is sent when ( seqno - own_addr ) is a multiple of
 * TIPC_MIN_LINK_WIN , as a STATE_MSG on the active_links entry selected
 * by the low bit of the node address .
 */
if ( ( ( seqno - tn - > own_addr ) % TIPC_MIN_LINK_WIN ) = = 0 ) {
2014-02-18 12:06:46 +04:00
tipc_link_proto_xmit ( node - > active_links [ node - > addr & 1 ] ,
tipc: simplify link mtu negotiation
When a link is being established, the two endpoints advertise their
respective interface MTU in the transmitted RESET and ACTIVATE messages.
If there is any difference, the lower of the two MTUs will be selected
for use by both endpoints.
However, as a remnant of earlier attempts to introduce TIPC level
routing. there also exists an MTU discovery mechanism. If an intermediate
node has a lower MTU than the two endpoints, they will discover this
through a bisectional approach, and finally adopt this MTU for common use.
Since there is no TIPC level routing, and probably never will be,
this mechanism doesn't make any sense, and only serves to make the
link level protocol unecessarily complex.
In this commit, we eliminate the MTU discovery algorithm,and fall back
to the simple MTU advertising approach. This change is fully backwards
compatible.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-04-02 16:33:02 +03:00
STATE_MSG , 0 , 0 , 0 , 0 ) ;
2015-01-09 10:27:07 +03:00
tn - > bcl - > stats . sent_acks + + ;
2011-10-28 00:43:09 +04:00
}
}
2012-07-10 14:55:09 +04:00
/**
2014-02-18 12:06:46 +04:00
* tipc_bclink_rcv - receive a broadcast packet , and deliver upwards
2007-02-09 17:25:21 +03:00
*
tipc: purge tipc_net_lock lock
Now tipc routing hierarchy comprises the structures 'node', 'link' and
'bearer'. The whole hierarchy is protected by a big read/write lock,
tipc_net_lock, to ensure that nothing is added or removed while code
is accessing any of these structures. Obviously the locking policy
makes node, link and bearer components closely bound together so that
their relationship becomes unnecessarily complex. In the worst case,
such locking policy not only has a negative influence on performance,
but also it's prone to lead to deadlock occasionally.
In order to decouple the complex relationship between bearer and node
as well as link, the locking policy is adjusted as follows:
- Bearer level
RTNL lock is used on update side, and RCU is used on read side.
Meanwhile, all bearer instances including broadcast bearer are
saved into bearer_list array.
- Node and link level
All node instances are saved into two tipc_node_list and node_htable
lists. The two lists are protected by node_list_lock on write side,
and they are guarded with RCU lock on read side. All members in node
structure including link instances are protected by node spin lock.
- The relationship between bearer and node
When link accesses bearer, it first needs to find the bearer with
its bearer identity from the bearer_list array. When bearer accesses
node, it can iterate the node_htable hash list with the node
address to find the corresponding node.
In the new locking policy, every component has its private locking
solution and the relationship between bearer and node is very simple,
that is, they can find each other with node address or bearer identity
from node_htable hash list or bearer_list array.
Until now above all changes have been done, so tipc_net_lock can be
removed safely.
Signed-off-by: Ying Xue <ying.xue@windriver.com>
Reviewed-by: Jon Maloy <jon.maloy@ericsson.com>
Reviewed-by: Erik Hugne <erik.hugne@ericsson.com>
Tested-by: Erik Hugne <erik.hugne@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-04-21 06:55:48 +04:00
* RCU is locked , no other locks set
2006-01-02 21:04:38 +03:00
*/
2015-01-09 10:27:04 +03:00
void tipc_bclink_rcv ( struct net * net , struct sk_buff * buf )
2006-06-26 10:40:01 +04:00
{
2015-01-09 10:27:04 +03:00
struct tipc_net * tn = net_generic ( net , tipc_net_id ) ;
2015-01-09 10:27:07 +03:00
struct tipc_link * bcl = tn - > bcl ;
2006-01-02 21:04:38 +03:00
struct tipc_msg * msg = buf_msg ( buf ) ;
2011-04-07 21:57:25 +04:00
struct tipc_node * node ;
2006-01-02 21:04:38 +03:00
u32 next_in ;
u32 seqno ;
2014-07-17 04:41:01 +04:00
int deferred = 0 ;
tipc: resolve race problem at unicast message reception
TIPC handles message cardinality and sequencing at the link layer,
before passing messages upwards to the destination sockets. During the
upcall from link to socket no locks are held. It is therefore possible,
and we see it happen occasionally, that messages arriving in different
threads and delivered in sequence still bypass each other before they
reach the destination socket. This must not happen, since it violates
the sequentiality guarantee.
We solve this by adding a new input buffer queue to the link structure.
Arriving messages are added safely to the tail of that queue by the
link, while the head of the queue is consumed, also safely, by the
receiving socket. Sequentiality is secured per socket by only allowing
buffers to be dequeued inside the socket lock. Since there may be multiple
simultaneous readers of the queue, we use a 'filter' parameter to reduce
the risk that they peek the same buffer from the queue, hence also
reducing the risk of contention on the receiving socket locks.
This solves the sequentiality problem, and seems to cause no measurable
performance degradation.
A nice side effect of this change is that lock handling in the functions
tipc_rcv() and tipc_bcast_rcv() now becomes uniform, something that
will enable future simplifications of those functions.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-02-05 16:36:41 +03:00
int pos = 0 ;
struct sk_buff * iskb ;
2015-02-05 16:36:44 +03:00
struct sk_buff_head * arrvq , * inputq ;
2006-01-02 21:04:38 +03:00
2011-04-07 21:57:25 +04:00
/* Screen out unwanted broadcast messages */
2015-01-09 10:27:04 +03:00
if ( msg_mc_netid ( msg ) ! = tn - > net_id )
2011-04-07 21:57:25 +04:00
goto exit ;
2015-01-09 10:27:05 +03:00
node = tipc_node_find ( net , msg_prevnode ( msg ) ) ;
2011-04-07 21:57:25 +04:00
if ( unlikely ( ! node ) )
goto exit ;
tipc_node_lock ( node ) ;
2012-11-16 09:51:30 +04:00
if ( unlikely ( ! node - > bclink . recv_permitted ) )
2011-04-07 21:57:25 +04:00
goto unlock ;
2006-01-02 21:04:38 +03:00
2011-10-26 23:33:44 +04:00
/* Handle broadcast protocol message */
2006-01-02 21:04:38 +03:00
if ( unlikely ( msg_user ( msg ) = = BCAST_PROTOCOL ) ) {
2011-04-07 22:57:53 +04:00
if ( msg_type ( msg ) ! = STATE_MSG )
goto unlock ;
2015-01-09 10:27:10 +03:00
if ( msg_destnode ( msg ) = = tn - > own_addr ) {
2006-01-18 02:38:21 +03:00
tipc_bclink_acknowledge ( node , msg_bcast_ack ( msg ) ) ;
2015-01-09 10:27:07 +03:00
tipc_bclink_lock ( net ) ;
2006-01-02 21:04:38 +03:00
bcl - > stats . recv_nacks + + ;
2015-01-09 10:27:07 +03:00
tn - > bclink - > retransmit_to = node ;
bclink_retransmit_pkt ( tn , msg_bcgap_after ( msg ) ,
2006-01-02 21:04:38 +03:00
msg_bcgap_to ( msg ) ) ;
2015-01-09 10:27:07 +03:00
tipc_bclink_unlock ( net ) ;
tipc: fix potential deadlock when all links are reset
[ 60.988363] ======================================================
[ 60.988754] [ INFO: possible circular locking dependency detected ]
[ 60.989152] 3.19.0+ #194 Not tainted
[ 60.989377] -------------------------------------------------------
[ 60.989781] swapper/3/0 is trying to acquire lock:
[ 60.990079] (&(&n_ptr->lock)->rlock){+.-...}, at: [<ffffffffa0006dca>] tipc_link_retransmit+0x1aa/0x240 [tipc]
[ 60.990743]
[ 60.990743] but task is already holding lock:
[ 60.991106] (&(&bclink->lock)->rlock){+.-...}, at: [<ffffffffa00004be>] tipc_bclink_lock+0x8e/0xa0 [tipc]
[ 60.991738]
[ 60.991738] which lock already depends on the new lock.
[ 60.991738]
[ 60.992174]
[ 60.992174] the existing dependency chain (in reverse order) is:
[ 60.992174]
-> #1 (&(&bclink->lock)->rlock){+.-...}:
[ 60.992174] [<ffffffff810a9c0c>] lock_acquire+0x9c/0x140
[ 60.992174] [<ffffffff8179c41f>] _raw_spin_lock_bh+0x3f/0x50
[ 60.992174] [<ffffffffa00004be>] tipc_bclink_lock+0x8e/0xa0 [tipc]
[ 60.992174] [<ffffffffa0000f57>] tipc_bclink_add_node+0x97/0xf0 [tipc]
[ 60.992174] [<ffffffffa0011815>] tipc_node_link_up+0xf5/0x110 [tipc]
[ 60.992174] [<ffffffffa0007783>] link_state_event+0x2b3/0x4f0 [tipc]
[ 60.992174] [<ffffffffa00193c0>] tipc_link_proto_rcv+0x24c/0x418 [tipc]
[ 60.992174] [<ffffffffa0008857>] tipc_rcv+0x827/0xac0 [tipc]
[ 60.992174] [<ffffffffa0002ca3>] tipc_l2_rcv_msg+0x73/0xd0 [tipc]
[ 60.992174] [<ffffffff81646e66>] __netif_receive_skb_core+0x746/0x980
[ 60.992174] [<ffffffff816470c1>] __netif_receive_skb+0x21/0x70
[ 60.992174] [<ffffffff81647295>] netif_receive_skb_internal+0x35/0x130
[ 60.992174] [<ffffffff81648218>] napi_gro_receive+0x158/0x1d0
[ 60.992174] [<ffffffff81559e05>] e1000_clean_rx_irq+0x155/0x490
[ 60.992174] [<ffffffff8155c1b7>] e1000_clean+0x267/0x990
[ 60.992174] [<ffffffff81647b60>] net_rx_action+0x150/0x360
[ 60.992174] [<ffffffff8105ec43>] __do_softirq+0x123/0x360
[ 60.992174] [<ffffffff8105f12e>] irq_exit+0x8e/0xb0
[ 60.992174] [<ffffffff8179f9f5>] do_IRQ+0x65/0x110
[ 60.992174] [<ffffffff8179da6f>] ret_from_intr+0x0/0x13
[ 60.992174] [<ffffffff8100de9f>] arch_cpu_idle+0xf/0x20
[ 60.992174] [<ffffffff8109dfa6>] cpu_startup_entry+0x2f6/0x3f0
[ 60.992174] [<ffffffff81033cda>] start_secondary+0x13a/0x150
[ 60.992174]
-> #0 (&(&n_ptr->lock)->rlock){+.-...}:
[ 60.992174] [<ffffffff810a8f7d>] __lock_acquire+0x163d/0x1ca0
[ 60.992174] [<ffffffff810a9c0c>] lock_acquire+0x9c/0x140
[ 60.992174] [<ffffffff8179c41f>] _raw_spin_lock_bh+0x3f/0x50
[ 60.992174] [<ffffffffa0006dca>] tipc_link_retransmit+0x1aa/0x240 [tipc]
[ 60.992174] [<ffffffffa0001e11>] tipc_bclink_rcv+0x611/0x640 [tipc]
[ 60.992174] [<ffffffffa0008646>] tipc_rcv+0x616/0xac0 [tipc]
[ 60.992174] [<ffffffffa0002ca3>] tipc_l2_rcv_msg+0x73/0xd0 [tipc]
[ 60.992174] [<ffffffff81646e66>] __netif_receive_skb_core+0x746/0x980
[ 60.992174] [<ffffffff816470c1>] __netif_receive_skb+0x21/0x70
[ 60.992174] [<ffffffff81647295>] netif_receive_skb_internal+0x35/0x130
[ 60.992174] [<ffffffff81648218>] napi_gro_receive+0x158/0x1d0
[ 60.992174] [<ffffffff81559e05>] e1000_clean_rx_irq+0x155/0x490
[ 60.992174] [<ffffffff8155c1b7>] e1000_clean+0x267/0x990
[ 60.992174] [<ffffffff81647b60>] net_rx_action+0x150/0x360
[ 60.992174] [<ffffffff8105ec43>] __do_softirq+0x123/0x360
[ 60.992174] [<ffffffff8105f12e>] irq_exit+0x8e/0xb0
[ 60.992174] [<ffffffff8179f9f5>] do_IRQ+0x65/0x110
[ 60.992174] [<ffffffff8179da6f>] ret_from_intr+0x0/0x13
[ 60.992174] [<ffffffff8100de9f>] arch_cpu_idle+0xf/0x20
[ 60.992174] [<ffffffff8109dfa6>] cpu_startup_entry+0x2f6/0x3f0
[ 60.992174] [<ffffffff81033cda>] start_secondary+0x13a/0x150
[ 60.992174]
[ 60.992174] other info that might help us debug this:
[ 60.992174]
[ 60.992174] Possible unsafe locking scenario:
[ 60.992174]
[ 60.992174] CPU0 CPU1
[ 60.992174] ---- ----
[ 60.992174] lock(&(&bclink->lock)->rlock);
[ 60.992174] lock(&(&n_ptr->lock)->rlock);
[ 60.992174] lock(&(&bclink->lock)->rlock);
[ 60.992174] lock(&(&n_ptr->lock)->rlock);
[ 60.992174]
[ 60.992174] *** DEADLOCK ***
[ 60.992174]
[ 60.992174] 3 locks held by swapper/3/0:
[ 60.992174] #0: (rcu_read_lock){......}, at: [<ffffffff81646791>] __netif_receive_skb_core+0x71/0x980
[ 60.992174] #1: (rcu_read_lock){......}, at: [<ffffffffa0002c35>] tipc_l2_rcv_msg+0x5/0xd0 [tipc]
[ 60.992174] #2: (&(&bclink->lock)->rlock){+.-...}, at: [<ffffffffa00004be>] tipc_bclink_lock+0x8e/0xa0 [tipc]
[ 60.992174]
The correct sequence of grabbing n_ptr->lock and bclink->lock
should be that the former is first held and the latter is then taken,
which exactly happened on CPU1. But especially when the retransmission
of broadcast link is failed, bclink->lock is first held in
tipc_bclink_rcv(), and n_ptr->lock is taken in link_retransmit_failure()
called by tipc_link_retransmit() subsequently, which is demonstrated on
CPU0. As a result, deadlock occurs.
If the order of holding the two locks happening on CPU0 is reversed, the
deadlock risk will be relieved. Therefore, the node lock taken in
link_retransmit_failure() originally is moved to tipc_bclink_rcv()
so that it's obtained before bclink lock. But the precondition of
the adjustment of node lock is that responding to bclink reset event
must be moved from tipc_bclink_unlock() to tipc_node_unlock().
Reviewed-by: Erik Hugne <erik.hugne@ericsson.com>
Signed-off-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-03-26 13:10:23 +03:00
tipc_node_unlock ( node ) ;
2006-01-02 21:04:38 +03:00
} else {
2011-04-07 21:57:25 +04:00
tipc_node_unlock ( node ) ;
2015-01-09 10:27:05 +03:00
bclink_peek_nack ( net , msg ) ;
2006-01-02 21:04:38 +03:00
}
2015-03-26 13:10:24 +03:00
tipc_node_put ( node ) ;
2011-04-07 21:57:25 +04:00
goto exit ;
2006-01-02 21:04:38 +03:00
}
2011-04-07 21:57:25 +04:00
/* Handle in-sequence broadcast message */
2006-01-02 21:04:38 +03:00
seqno = msg_seqno ( msg ) ;
2011-10-26 23:33:44 +04:00
next_in = mod ( node - > bclink . last_in + 1 ) ;
2015-02-05 16:36:44 +03:00
arrvq = & tn - > bclink - > arrvq ;
inputq = & tn - > bclink - > inputq ;
2006-01-02 21:04:38 +03:00
if ( likely ( seqno = = next_in ) ) {
2011-10-26 23:33:44 +04:00
receive :
tipc: Major redesign of broadcast link ACK/NACK algorithms
Completely redesigns broadcast link ACK and NACK mechanisms to prevent
spurious retransmit requests in dual LAN networks, and to prevent the
broadcast link from stalling due to the failure of a receiving node to
acknowledge receiving a broadcast message or request its retransmission.
Note: These changes only impact the timing of when ACK and NACK messages
are sent, and not the basic broadcast link protocol itself, so inter-
operability with nodes using the "classic" algorithms is maintained.
The revised algorithms are as follows:
1) An explicit ACK message is still sent after receiving 16 in-sequence
messages, and implicit ACK information continues to be carried in other
unicast link message headers (including link state messages). However,
the timing of explicit ACKs is now based on the receiving node's absolute
network address rather than its relative network address to ensure that
the failure of another node does not delay the ACK beyond its 16 message
target.
2) A NACK message is now typically sent only when a message gap persists
for two consecutive incoming link state messages; this ensures that a
suspected gap is not confirmed until both LANs in a dual LAN network have
had an opportunity to deliver the message, thereby preventing spurious NACKs.
A NACK message can also be generated by the arrival of a single link state
message, if the deferred queue is so big that the current message gap
cannot be the result of "normal" mis-ordering due to the use of dual LANs
(or one LAN using a bonded interface). Since link state messages typically
arrive at different nodes at different times the problem of multiple nodes
issuing identical NACKs simultaneously is inherently avoided.
3) Nodes continue to "peek" at NACK messages sent by other nodes. If
another node requests retransmission of a message gap suspected (but not
yet confirmed) by the peeking node, the peeking node forgets about the
gap and does not generate a duplicate retransmit request. (If the peeking
node subsequently fails to receive the lost message, later link state
messages will cause it to rediscover and confirm the gap and send another
NACK.)
4) Message gap "equality" is now determined by the start of the gap only.
This is sufficient to deal with the most common cases of message loss,
and eliminates the need for complex end of gap computations.
5) A peeking node no longer tries to determine whether it should send a
complementary NACK, since the most common cases of message loss don't
require it to be sent. Consequently, the node no longer examines the
"broadcast tag" field of a NACK message when peeking.
Signed-off-by: Allan Stephens <allan.stephens@windriver.com>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
2011-10-27 22:17:53 +04:00
/* Deliver message to destination */
2006-01-02 21:04:38 +03:00
if ( likely ( msg_isdata ( msg ) ) ) {
2015-01-09 10:27:07 +03:00
tipc_bclink_lock ( net ) ;
2011-10-28 00:43:09 +04:00
bclink_accept_pkt ( node , seqno ) ;
2015-02-05 16:36:44 +03:00
spin_lock_bh ( & inputq - > lock ) ;
__skb_queue_tail ( arrvq , buf ) ;
spin_unlock_bh ( & inputq - > lock ) ;
node - > action_flags | = TIPC_BCAST_MSG_EVT ;
2015-01-09 10:27:07 +03:00
tipc_bclink_unlock ( net ) ;
2006-01-18 02:38:21 +03:00
tipc_node_unlock ( node ) ;
2006-01-02 21:04:38 +03:00
} else if ( msg_user ( msg ) = = MSG_BUNDLER ) {
2015-01-09 10:27:07 +03:00
tipc_bclink_lock ( net ) ;
2011-10-28 00:43:09 +04:00
bclink_accept_pkt ( node , seqno ) ;
2006-01-02 21:04:38 +03:00
bcl - > stats . recv_bundles + + ;
bcl - > stats . recv_bundled + = msg_msgcnt ( msg ) ;
2015-02-05 16:36:44 +03:00
pos = 0 ;
while ( tipc_msg_extract ( buf , & iskb , & pos ) ) {
spin_lock_bh ( & inputq - > lock ) ;
__skb_queue_tail ( arrvq , iskb ) ;
spin_unlock_bh ( & inputq - > lock ) ;
}
node - > action_flags | = TIPC_BCAST_MSG_EVT ;
2015-01-09 10:27:07 +03:00
tipc_bclink_unlock ( net ) ;
2006-01-18 02:38:21 +03:00
tipc_node_unlock ( node ) ;
2006-01-02 21:04:38 +03:00
} else if ( msg_user ( msg ) = = MSG_FRAGMENTER ) {
2015-01-09 10:27:07 +03:00
tipc_bclink_lock ( net ) ;
2011-10-28 00:43:09 +04:00
bclink_accept_pkt ( node , seqno ) ;
2015-03-25 13:09:40 +03:00
tipc_buf_append ( & node - > bclink . reasm_buf , & buf ) ;
if ( unlikely ( ! buf & & ! node - > bclink . reasm_buf ) ) {
tipc_bclink_unlock ( net ) ;
goto unlock ;
}
2006-01-02 21:04:38 +03:00
bcl - > stats . recv_fragments + + ;
2014-05-14 13:39:12 +04:00
if ( buf ) {
2006-01-02 21:04:38 +03:00
bcl - > stats . recv_fragmented + + ;
tipc: message reassembly using fragment chain
When the first fragment of a long data message is received on a link, a
reassembly buffer large enough to hold the data from this and all subsequent
fragments of the message is allocated. The payload of each new fragment is
copied into this buffer upon arrival. When the last fragment is received, the
reassembled message is delivered upwards to the port/socket layer.
Not only is this an inefficient approach, but it may also cause bursts of
reassembly failures in low memory situations, since we may fail to allocate
the necessary large buffer in the first place. Furthermore, after 100 subsequent
such failures the link will be reset, something that in reality aggravates the
situation.
To remedy this problem, this patch introduces a different approach. Instead of
allocating a big reassembly buffer, we now append the arriving fragments
to a reassembly chain on the link, and deliver the whole chain up to the
socket layer once the last fragment has been received. This is safe because
the retransmission layer of a TIPC link always delivers packets in strict
uninterrupted order, to the reassembly layer as to all other upper layers.
Hence there can never be more than one fragment chain pending reassembly at
any given time in a link, and we can trust (but still verify) that the
fragments will be chained up in the correct order.
Signed-off-by: Erik Hugne <erik.hugne@ericsson.com>
Reviewed-by: Paul Gortmaker <paul.gortmaker@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-11-06 12:28:06 +04:00
msg = buf_msg ( buf ) ;
2015-01-09 10:27:07 +03:00
tipc_bclink_unlock ( net ) ;
2013-11-06 12:28:05 +04:00
goto receive ;
}
2015-01-09 10:27:07 +03:00
tipc_bclink_unlock ( net ) ;
2006-01-18 02:38:21 +03:00
tipc_node_unlock ( node ) ;
2006-01-02 21:04:38 +03:00
} else {
2015-01-09 10:27:07 +03:00
tipc_bclink_lock ( net ) ;
2011-10-28 00:43:09 +04:00
bclink_accept_pkt ( node , seqno ) ;
2015-01-09 10:27:07 +03:00
tipc_bclink_unlock ( net ) ;
2006-01-18 02:38:21 +03:00
tipc_node_unlock ( node ) ;
2011-11-04 21:24:29 +04:00
kfree_skb ( buf ) ;
2006-01-02 21:04:38 +03:00
}
2011-04-07 21:57:25 +04:00
buf = NULL ;
2011-10-26 23:33:44 +04:00
/* Determine new synchronization state */
2011-04-07 21:57:25 +04:00
tipc_node_lock ( node ) ;
2011-10-26 23:33:44 +04:00
if ( unlikely ( ! tipc_node_is_up ( node ) ) )
goto unlock ;
tipc: Major redesign of broadcast link ACK/NACK algorithms
Completely redesigns broadcast link ACK and NACK mechanisms to prevent
spurious retransmit requests in dual LAN networks, and to prevent the
broadcast link from stalling due to the failure of a receiving node to
acknowledge receiving a broadcast message or request its retransmission.
Note: These changes only impact the timing of when ACK and NACK messages
are sent, and not the basic broadcast link protocol itself, so inter-
operability with nodes using the "classic" algorithms is maintained.
The revised algorithms are as follows:
1) An explicit ACK message is still sent after receiving 16 in-sequence
messages, and implicit ACK information continues to be carried in other
unicast link message headers (including link state messages). However,
the timing of explicit ACKs is now based on the receiving node's absolute
network address rather than its relative network address to ensure that
the failure of another node does not delay the ACK beyond its 16 message
target.
2) A NACK message is now typically sent only when a message gap persists
for two consecutive incoming link state messages; this ensures that a
suspected gap is not confirmed until both LANs in a dual LAN network have
had an opportunity to deliver the message, thereby preventing spurious NACKs.
A NACK message can also be generated by the arrival of a single link state
message, if the deferred queue is so big that the current message gap
cannot be the result of "normal" mis-ordering due to the use of dual LANs
(or one LAN using a bonded interface). Since link state messages typically
arrive at different nodes at different times the problem of multiple nodes
issuing identical NACKs simultaneously is inherently avoided.
3) Nodes continue to "peek" at NACK messages sent by other nodes. If
another node requests retransmission of a message gap suspected (but not
yet confirmed) by the peeking node, the peeking node forgets about the
gap and does not generate a duplicate retransmit request. (If the peeking
node subsequently fails to receive the lost message, later link state
messages will cause it to rediscover and confirm the gap and send another
NACK.)
4) Message gap "equality" is now determined by the start of the gap only.
This is sufficient to deal with the most common cases of message loss,
and eliminates the need for complex end of gap computations.
5) A peeking node no longer tries to determine whether it should send a
complementary NACK, since the most common cases of message loss don't
require it to be sent. Consequently, the node no longer examines the
"broadcast tag" field of a NACK message when peeking.
Signed-off-by: Allan Stephens <allan.stephens@windriver.com>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
2011-10-27 22:17:53 +04:00
if ( node - > bclink . last_in = = node - > bclink . last_sent )
2011-10-26 23:33:44 +04:00
goto unlock ;
2015-03-13 23:08:10 +03:00
if ( skb_queue_empty ( & node - > bclink . deferdq ) ) {
tipc: Major redesign of broadcast link ACK/NACK algorithms
Completely redesigns broadcast link ACK and NACK mechanisms to prevent
spurious retransmit requests in dual LAN networks, and to prevent the
broadcast link from stalling due to the failure of a receiving node to
acknowledge receiving a broadcast message or request its retransmission.
Note: These changes only impact the timing of when ACK and NACK messages
are sent, and not the basic broadcast link protocol itself, so inter-
operability with nodes using the "classic" algorithms is maintained.
The revised algorithms are as follows:
1) An explicit ACK message is still sent after receiving 16 in-sequence
messages, and implicit ACK information continues to be carried in other
unicast link message headers (including link state messages). However,
the timing of explicit ACKs is now based on the receiving node's absolute
network address rather than its relative network address to ensure that
the failure of another node does not delay the ACK beyond its 16 message
target.
2) A NACK message is now typically sent only when a message gap persists
for two consecutive incoming link state messages; this ensures that a
suspected gap is not confirmed until both LANs in a dual LAN network have
had an opportunity to deliver the message, thereby preventing spurious NACKs.
A NACK message can also be generated by the arrival of a single link state
message, if the deferred queue is so big that the current message gap
cannot be the result of "normal" mis-ordering due to the use of dual LANs
(or one LAN using a bonded interface). Since link state messages typically
arrive at different nodes at different times the problem of multiple nodes
issuing identical NACKs simultaneously is inherently avoided.
3) Nodes continue to "peek" at NACK messages sent by other nodes. If
another node requests retransmission of a message gap suspected (but not
yet confirmed) by the peeking node, the peeking node forgets about the
gap and does not generate a duplicate retransmit request. (If the peeking
node subsequently fails to receive the lost message, later link state
messages will cause it to rediscover and confirm the gap and send another
NACK.)
4) Message gap "equality" is now determined by the start of the gap only.
This is sufficient to deal with the most common cases of message loss,
and eliminates the need for complex end of gap computations.
5) A peeking node no longer tries to determine whether it should send a
complementary NACK, since the most common cases of message loss don't
require it to be sent. Consequently, the node no longer examines the
"broadcast tag" field of a NACK message when peeking.
Signed-off-by: Allan Stephens <allan.stephens@windriver.com>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
2011-10-27 22:17:53 +04:00
node - > bclink . oos_state = 1 ;
goto unlock ;
}
2015-03-13 23:08:10 +03:00
msg = buf_msg ( skb_peek ( & node - > bclink . deferdq ) ) ;
2011-10-26 23:33:44 +04:00
seqno = msg_seqno ( msg ) ;
next_in = mod ( next_in + 1 ) ;
if ( seqno ! = next_in )
goto unlock ;
/* Take in-sequence message from deferred queue & deliver it */
2015-03-13 23:08:10 +03:00
buf = __skb_dequeue ( & node - > bclink . deferdq ) ;
2011-10-26 23:33:44 +04:00
goto receive ;
}
/* Handle out-of-sequence broadcast message */
if ( less ( next_in , seqno ) ) {
2015-03-13 23:08:10 +03:00
deferred = tipc_link_defer_pkt ( & node - > bclink . deferdq ,
2011-10-26 23:33:44 +04:00
buf ) ;
tipc: Major redesign of broadcast link ACK/NACK algorithms
Completely redesigns broadcast link ACK and NACK mechanisms to prevent
spurious retransmit requests in dual LAN networks, and to prevent the
broadcast link from stalling due to the failure of a receiving node to
acknowledge receiving a broadcast message or request its retransmission.
Note: These changes only impact the timing of when ACK and NACK messages
are sent, and not the basic broadcast link protocol itself, so inter-
operability with nodes using the "classic" algorithms is maintained.
The revised algorithms are as follows:
1) An explicit ACK message is still sent after receiving 16 in-sequence
messages, and implicit ACK information continues to be carried in other
unicast link message headers (including link state messages). However,
the timing of explicit ACKs is now based on the receiving node's absolute
network address rather than its relative network address to ensure that
the failure of another node does not delay the ACK beyond its 16 message
target.
2) A NACK message is now typically sent only when a message gap persists
for two consecutive incoming link state messages; this ensures that a
suspected gap is not confirmed until both LANs in a dual LAN network have
had an opportunity to deliver the message, thereby preventing spurious NACKs.
A NACK message can also be generated by the arrival of a single link state
message, if the deferred queue is so big that the current message gap
cannot be the result of "normal" mis-ordering due to the use of dual LANs
(or one LAN using a bonded interface). Since link state messages typically
arrive at different nodes at different times the problem of multiple nodes
issuing identical NACKs simultaneously is inherently avoided.
3) Nodes continue to "peek" at NACK messages sent by other nodes. If
another node requests retransmission of a message gap suspected (but not
yet confirmed) by the peeking node, the peeking node forgets about the
gap and does not generate a duplicate retransmit request. (If the peeking
node subsequently fails to receive the lost message, later link state
messages will cause it to rediscover and confirm the gap and send another
NACK.)
4) Message gap "equality" is now determined by the start of the gap only.
This is sufficient to deal with the most common cases of message loss,
and eliminates the need for complex end of gap computations.
5) A peeking node no longer tries to determine whether it should send a
complementary NACK, since the most common cases of message loss don't
require it to be sent. Consequently, the node no longer examines the
"broadcast tag" field of a NACK message when peeking.
Signed-off-by: Allan Stephens <allan.stephens@windriver.com>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
2011-10-27 22:17:53 +04:00
bclink_update_last_sent ( node , seqno ) ;
2011-04-07 21:57:25 +04:00
buf = NULL ;
2014-07-17 04:41:01 +04:00
}
2011-10-26 23:33:44 +04:00
2015-01-09 10:27:07 +03:00
tipc_bclink_lock ( net ) ;
2011-10-27 00:13:35 +04:00
2011-10-26 23:33:44 +04:00
if ( deferred )
bcl - > stats . deferred_recv + + ;
2011-10-26 23:57:26 +04:00
else
bcl - > stats . duplicates + + ;
2011-10-26 23:33:44 +04:00
2015-01-09 10:27:07 +03:00
tipc_bclink_unlock ( net ) ;
2011-10-27 00:13:35 +04:00
2011-04-07 21:57:25 +04:00
unlock :
2006-01-18 02:38:21 +03:00
tipc_node_unlock ( node ) ;
2015-03-26 13:10:24 +03:00
tipc_node_put ( node ) ;
2011-04-07 21:57:25 +04:00
exit :
2011-11-04 21:24:29 +04:00
kfree_skb ( buf ) ;
2006-01-02 21:04:38 +03:00
}
2008-09-03 10:38:32 +04:00
u32 tipc_bclink_acks_missing ( struct tipc_node * n_ptr )
2006-01-02 21:04:38 +03:00
{
2012-11-16 09:51:30 +04:00
return ( n_ptr - > bclink . recv_permitted & &
2015-01-09 10:27:07 +03:00
( tipc_bclink_get_last_sent ( n_ptr - > net ) ! = n_ptr - > bclink . acked ) ) ;
2006-01-02 21:04:38 +03:00
}
/**
2006-01-18 02:38:21 +03:00
* tipc_bcbearer_send - send a packet through the broadcast pseudo - bearer
2007-02-09 17:25:21 +03:00
*
2011-04-07 18:44:54 +04:00
* Send packet over as many bearers as necessary to reach all nodes
* that have joined the broadcast link .
2007-02-09 17:25:21 +03:00
*
2011-04-07 18:44:54 +04:00
* Returns 0 ( packet sent successfully ) under all circumstances ,
* since the broadcast link ' s pseudo - bearer never blocks
2006-01-02 21:04:38 +03:00
*/
2015-01-09 10:27:07 +03:00
static int tipc_bcbearer_send ( struct net * net , struct sk_buff * buf ,
struct tipc_bearer * unused1 ,
2006-03-21 09:37:52 +03:00
struct tipc_media_addr * unused2 )
2006-01-02 21:04:38 +03:00
{
int bp_index ;
tipc: ensure sequential message delivery across dual bearers
When we run broadcast packets over dual bearers/interfaces, the
current transmission code is flipping bearers between each sent
packet, with the purpose of leveraging the double bandwidth
available. The receiving bclink is resequencing the packets if
needed, so all messages are delivered upwards from the broadcast
link in the correct order, even if they may arrive in concurrent
interrupts.
However, at the moment of delivery upwards to the socket, we release
all spinlocks (bclink_lock, node_lock), so it is still possible
that arriving messages bypass each other before they reach the socket
queue.
We fix this by applying the same technique we are using for unicast
traffic. We use a link selector (i.e., the last bit of sending port
number) to ensure that messages from the same sender socket always are
sent over the same bearer. This guarantees sequential delivery between
socket pairs, which is sufficient to satisfy the protocol spec, as well
as all known user requirements.
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Reviewed-by: Erik Hugne <erik.hugne@ericsson.com>
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-07-17 04:41:04 +04:00
struct tipc_msg * msg = buf_msg ( buf ) ;
2015-01-09 10:27:04 +03:00
struct tipc_net * tn = net_generic ( net , tipc_net_id ) ;
2015-01-09 10:27:07 +03:00
struct tipc_bcbearer * bcbearer = tn - > bcbearer ;
struct tipc_bclink * bclink = tn - > bclink ;
2006-01-02 21:04:38 +03:00
2013-05-01 16:04:44 +04:00
/* Prepare broadcast link message for reliable transmission,
2011-04-07 18:44:54 +04:00
* if first time trying to send it ;
* preparation is skipped for broadcast link protocol messages
* since they are sent in an unreliable manner and don ' t need it
*/
2006-01-02 21:04:38 +03:00
if ( likely ( ! msg_non_seq ( buf_msg ( buf ) ) ) ) {
2011-10-24 19:18:12 +04:00
bcbuf_set_acks ( buf , bclink - > bcast_nodes . count ) ;
2008-06-05 04:54:48 +04:00
msg_set_non_seq ( msg , 1 ) ;
2015-01-09 10:27:04 +03:00
msg_set_mc_netid ( msg , tn - > net_id ) ;
2015-01-09 10:27:07 +03:00
tn - > bcl - > stats . sent_info + + ;
2011-10-24 19:18:12 +04:00
if ( WARN_ON ( ! bclink - > bcast_nodes . count ) ) {
2011-05-23 21:14:18 +04:00
dump_stack ( ) ;
return 0 ;
}
2006-01-02 21:04:38 +03:00
}
/* Send buffer over bearers until all targets reached */
2011-10-24 19:18:12 +04:00
bcbearer - > remains = bclink - > bcast_nodes ;
2006-01-02 21:04:38 +03:00
for ( bp_index = 0 ; bp_index < MAX_BEARERS ; bp_index + + ) {
2011-01-07 21:00:11 +03:00
struct tipc_bearer * p = bcbearer - > bpairs [ bp_index ] . primary ;
struct tipc_bearer * s = bcbearer - > bpairs [ bp_index ] . secondary ;
tipc: ensure sequential message delivery across dual bearers
When we run broadcast packets over dual bearers/interfaces, the
current transmission code is flipping bearers between each sent
packet, with the purpose of leveraging the double bandwidth
available. The receiving bclink is resequencing the packets if
needed, so all messages are delivered upwards from the broadcast
link in the correct order, even if they may arrive in concurrent
interrupts.
However, at the moment of delivery upwards to the socket, we release
all spinlocks (bclink_lock, node_lock), so it is still possible
that arriving messages bypass each other before they reach the socket
queue.
We fix this by applying the same technique we are using for unicast
traffic. We use a link selector (i.e., the last bit of sending port
number) to ensure that messages from the same sender socket always are
sent over the same bearer. This guarantees sequential delivery between
socket pairs, which is sufficient to satisfy the protocol spec, as well
as all known user requirements.
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Reviewed-by: Erik Hugne <erik.hugne@ericsson.com>
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-07-17 04:41:04 +04:00
struct tipc_bearer * bp [ 2 ] = { p , s } ;
struct tipc_bearer * b = bp [ msg_link_selector ( msg ) ] ;
tipc: pskb_copy() buffers when sending on more than one bearer
When sending packets, TIPC bearers use skb_clone() before writing their
hardware header. This will however NOT copy the data buffer.
So when the same packet is sent over multiple bearers (to reach multiple
nodes), the same socket buffer data will be treated by multiple
tipc_media drivers which will write their own hardware header through
dev_hard_header().
Most of the time this is not a problem, because by the time the
packet is processed by the second media, it has already been sent over
the first one. However, when the first transmission is delayed (e.g.
because of insufficient bandwidth or through a shaper), the next bearer
will overwrite the hardware header, resulting in the packet being sent:
a) with the wrong source address, when bearers of the same type,
e.g. ethernet, are involved
b) with a completely corrupt header, or even dropped, when bearers of
different types are involved.
So when the same socket buffer is to be sent multiple times, send a
pskb_copy() instead (from the second instance on), and release it
afterwards (the bearer will skb_clone() it anyway).
Signed-off-by: Gerlando Falauto <gerlando.falauto@keymile.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-05-01 16:04:46 +04:00
struct sk_buff * tbuf ;
2006-01-02 21:04:38 +03:00
if ( ! p )
2013-05-01 16:04:44 +04:00
break ; /* No more bearers to try */
tipc: ensure sequential message delivery across dual bearers
When we run broadcast packets over dual bearers/interfaces, the
current transmission code is flipping bearers between each sent
packet, with the purpose of leveraging the double bandwidth
available. The receiving bclink is resequencing the packets if
needed, so all messages are delivered upwards from the broadcast
link in the correct order, even if they may arrive in concurrent
interrupts.
However, at the moment of delivery upwards to the socket, we release
all spinlocks (bclink_lock, node_lock), so it is still possible
that arriving messages bypass each other before they reach the socket
queue.
We fix this by applying the same technique we are using for unicast
traffic. We use a link selector (i.e., the last bit of sending port
number) to ensure that messages from the same sender socket always are
sent over the same bearer. This guarantees sequential delivery between
socket pairs, which is sufficient to satisfy the protocol spec, as well
as all known user requirements.
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Reviewed-by: Erik Hugne <erik.hugne@ericsson.com>
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-07-17 04:41:04 +04:00
if ( ! b )
b = p ;
2013-05-01 16:04:45 +04:00
tipc_nmap_diff ( & bcbearer - > remains , & b - > nodes ,
2013-05-01 16:04:44 +04:00
& bcbearer - > remains_new ) ;
2006-06-26 10:53:20 +04:00
if ( bcbearer - > remains_new . count = = bcbearer - > remains . count )
2013-05-01 16:04:44 +04:00
continue ; /* Nothing added by bearer pair */
2006-01-02 21:04:38 +03:00
tipc: pskb_copy() buffers when sending on more than one bearer
When sending packets, TIPC bearers use skb_clone() before writing their
hardware header. This will however NOT copy the data buffer.
So when the same packet is sent over multiple bearers (to reach multiple
nodes), the same socket buffer data will be treated by multiple
tipc_media drivers which will write their own hardware header through
dev_hard_header().
Most of the time this is not a problem, because by the time the
packet is processed by the second media, it has already been sent over
the first one. However, when the first transmission is delayed (e.g.
because of insufficient bandwidth or through a shaper), the next bearer
will overwrite the hardware header, resulting in the packet being sent:
a) with the wrong source address, when bearers of the same type,
e.g. ethernet, are involved
b) with a completely corrupt header, or even dropped, when bearers of
different types are involved.
So when the same socket buffer is to be sent multiple times, send a
pskb_copy() instead (from the second instance on), and release it
afterwards (the bearer will skb_clone() it anyway).
Signed-off-by: Gerlando Falauto <gerlando.falauto@keymile.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-05-01 16:04:46 +04:00
if ( bp_index = = 0 ) {
/* Use original buffer for first bearer */
2015-01-09 10:27:06 +03:00
tipc_bearer_send ( net , b - > identity , buf , & b - > bcast_addr ) ;
tipc: pskb_copy() buffers when sending on more than one bearer
When sending packets, TIPC bearers use skb_clone() before writing their
hardware header. This will however NOT copy the data buffer.
So when the same packet is sent over multiple bearers (to reach multiple
nodes), the same socket buffer data will be treated by multiple
tipc_media drivers which will write their own hardware header through
dev_hard_header().
Most of the time this is not a problem, because by the time the
packet is processed by the second media, it has already been sent over
the first one. However, when the first transmission is delayed (e.g.
because of insufficient bandwidth or through a shaper), the next bearer
will overwrite the hardware header, resulting in the packet being sent:
a) with the wrong source address, when bearers of the same type,
e.g. ethernet, are involved
b) with a completely corrupt header, or even dropped, when bearers of
different types are involved.
So when the same socket buffer is to be sent multiple times, send a
pskb_copy() instead (from the second instance on), and release it
afterwards (the bearer will skb_clone() it anyway).
Signed-off-by: Gerlando Falauto <gerlando.falauto@keymile.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-05-01 16:04:46 +04:00
} else {
/* Avoid concurrent buffer access */
2014-06-12 02:36:26 +04:00
tbuf = pskb_copy_for_clone ( buf , GFP_ATOMIC ) ;
tipc: pskb_copy() buffers when sending on more than one bearer
When sending packets, TIPC bearers use skb_clone() before writing their
hardware header. This will however NOT copy the data buffer.
So when the same packet is sent over multiple bearers (to reach multiple
nodes), the same socket buffer data will be treated by multiple
tipc_media drivers which will write their own hardware header through
dev_hard_header().
Most of the time this is not a problem, because by the time the
packet is processed by the second media, it has already been sent over
the first one. However, when the first transmission is delayed (e.g.
because of insufficient bandwidth or through a shaper), the next bearer
will overwrite the hardware header, resulting in the packet being sent:
a) with the wrong source address, when bearers of the same type,
e.g. ethernet, are involved
b) with a completely corrupt header, or even dropped, when bearers of
different types are involved.
So when the same socket buffer is to be sent multiple times, send a
pskb_copy() instead (from the second instance on), and release it
afterwards (the bearer will skb_clone() it anyway).
Signed-off-by: Gerlando Falauto <gerlando.falauto@keymile.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-05-01 16:04:46 +04:00
if ( ! tbuf )
break ;
2015-01-09 10:27:06 +03:00
tipc_bearer_send ( net , b - > identity , tbuf ,
& b - > bcast_addr ) ;
tipc: pskb_copy() buffers when sending on more than one bearer
When sending packets, TIPC bearers use skb_clone() before writing their
hardware header. This will however NOT copy the data buffer.
So when the same packet is sent over multiple bearers (to reach multiple
nodes), the same socket buffer data will be treated by multiple
tipc_media drivers which will write their own hardware header through
dev_hard_header().
Most of the time this is not a problem, because by the time the
packet is processed by the second media, it has already been sent over
the first one. However, when the first transmission is delayed (e.g.
because of insufficient bandwidth or through a shaper), the next bearer
will overwrite the hardware header, resulting in the packet being sent:
a) with the wrong source address, when bearers of the same type,
e.g. ethernet, are involved
b) with a completely corrupt header, or even dropped, when bearers of
different types are involved.
So when the same socket buffer is to be sent multiple times, send a
pskb_copy() instead (from the second instance on), and release it
afterwards (the bearer will skb_clone() it anyway).
Signed-off-by: Gerlando Falauto <gerlando.falauto@keymile.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-05-01 16:04:46 +04:00
kfree_skb ( tbuf ) ; /* Bearer keeps a clone */
}
2006-06-26 10:53:20 +04:00
if ( bcbearer - > remains_new . count = = 0 )
2013-05-01 16:04:44 +04:00
break ; /* All targets reached */
2006-01-02 21:04:38 +03:00
2006-06-26 10:53:20 +04:00
bcbearer - > remains = bcbearer - > remains_new ;
2006-01-02 21:04:38 +03:00
}
2007-02-09 17:25:21 +03:00
2011-04-07 18:44:54 +04:00
return 0 ;
2006-01-02 21:04:38 +03:00
}
/**
2006-01-18 02:38:21 +03:00
* tipc_bcbearer_sort - create sets of bearer pairs used by broadcast bearer
2006-01-02 21:04:38 +03:00
*/
2015-01-09 10:27:06 +03:00
void tipc_bcbearer_sort ( struct net * net , struct tipc_node_map * nm_ptr ,
u32 node , bool action )
2006-01-02 21:04:38 +03:00
{
2015-01-09 10:27:06 +03:00
struct tipc_net * tn = net_generic ( net , tipc_net_id ) ;
2015-01-09 10:27:07 +03:00
struct tipc_bcbearer * bcbearer = tn - > bcbearer ;
2011-12-30 05:55:27 +04:00
struct tipc_bcbearer_pair * bp_temp = bcbearer - > bpairs_temp ;
struct tipc_bcbearer_pair * bp_curr ;
2014-04-21 06:55:45 +04:00
struct tipc_bearer * b ;
2006-01-02 21:04:38 +03:00
int b_index ;
int pri ;
2015-01-09 10:27:07 +03:00
tipc_bclink_lock ( net ) ;
2006-01-02 21:04:38 +03:00
2014-04-21 06:55:51 +04:00
if ( action )
tipc_nmap_add ( nm_ptr , node ) ;
else
tipc_nmap_remove ( nm_ptr , node ) ;
2006-01-02 21:04:38 +03:00
/* Group bearers by priority (can assume max of two per priority) */
memset ( bp_temp , 0 , sizeof ( bcbearer - > bpairs_temp ) ) ;
2014-04-21 06:55:45 +04:00
rcu_read_lock ( ) ;
2006-01-02 21:04:38 +03:00
for ( b_index = 0 ; b_index < MAX_BEARERS ; b_index + + ) {
2015-01-09 10:27:06 +03:00
b = rcu_dereference_rtnl ( tn - > bearer_list [ b_index ] ) ;
2014-03-27 08:54:34 +04:00
if ( ! b | | ! b - > nodes . count )
2006-01-02 21:04:38 +03:00
continue ;
if ( ! bp_temp [ b - > priority ] . primary )
bp_temp [ b - > priority ] . primary = b ;
else
bp_temp [ b - > priority ] . secondary = b ;
}
2014-04-21 06:55:45 +04:00
rcu_read_unlock ( ) ;
2006-01-02 21:04:38 +03:00
/* Create array of bearer pairs for broadcasting */
bp_curr = bcbearer - > bpairs ;
memset ( bcbearer - > bpairs , 0 , sizeof ( bcbearer - > bpairs ) ) ;
2006-01-14 00:22:22 +03:00
for ( pri = TIPC_MAX_LINK_PRI ; pri > = 0 ; pri - - ) {
2006-01-02 21:04:38 +03:00
if ( ! bp_temp [ pri ] . primary )
continue ;
bp_curr - > primary = bp_temp [ pri ] . primary ;
if ( bp_temp [ pri ] . secondary ) {
2006-01-18 02:38:21 +03:00
if ( tipc_nmap_equal ( & bp_temp [ pri ] . primary - > nodes ,
& bp_temp [ pri ] . secondary - > nodes ) ) {
2006-01-02 21:04:38 +03:00
bp_curr - > secondary = bp_temp [ pri ] . secondary ;
} else {
bp_curr + + ;
bp_curr - > primary = bp_temp [ pri ] . secondary ;
}
}
bp_curr + + ;
}
2015-01-09 10:27:07 +03:00
tipc_bclink_unlock ( net ) ;
2006-01-02 21:04:38 +03:00
}
2014-11-24 13:10:29 +03:00
static int __tipc_nl_add_bc_link_stat ( struct sk_buff * skb ,
struct tipc_stats * stats )
2014-11-20 12:29:12 +03:00
{
int i ;
struct nlattr * nest ;
struct nla_map {
__u32 key ;
__u32 val ;
} ;
struct nla_map map [ ] = {
{ TIPC_NLA_STATS_RX_INFO , stats - > recv_info } ,
{ TIPC_NLA_STATS_RX_FRAGMENTS , stats - > recv_fragments } ,
{ TIPC_NLA_STATS_RX_FRAGMENTED , stats - > recv_fragmented } ,
{ TIPC_NLA_STATS_RX_BUNDLES , stats - > recv_bundles } ,
{ TIPC_NLA_STATS_RX_BUNDLED , stats - > recv_bundled } ,
{ TIPC_NLA_STATS_TX_INFO , stats - > sent_info } ,
{ TIPC_NLA_STATS_TX_FRAGMENTS , stats - > sent_fragments } ,
{ TIPC_NLA_STATS_TX_FRAGMENTED , stats - > sent_fragmented } ,
{ TIPC_NLA_STATS_TX_BUNDLES , stats - > sent_bundles } ,
{ TIPC_NLA_STATS_TX_BUNDLED , stats - > sent_bundled } ,
{ TIPC_NLA_STATS_RX_NACKS , stats - > recv_nacks } ,
{ TIPC_NLA_STATS_RX_DEFERRED , stats - > deferred_recv } ,
{ TIPC_NLA_STATS_TX_NACKS , stats - > sent_nacks } ,
{ TIPC_NLA_STATS_TX_ACKS , stats - > sent_acks } ,
{ TIPC_NLA_STATS_RETRANSMITTED , stats - > retransmitted } ,
{ TIPC_NLA_STATS_DUPLICATES , stats - > duplicates } ,
{ TIPC_NLA_STATS_LINK_CONGS , stats - > link_congs } ,
{ TIPC_NLA_STATS_MAX_QUEUE , stats - > max_queue_sz } ,
{ TIPC_NLA_STATS_AVG_QUEUE , stats - > queue_sz_counts ?
( stats - > accu_queue_sz / stats - > queue_sz_counts ) : 0 }
} ;
nest = nla_nest_start ( skb , TIPC_NLA_LINK_STATS ) ;
if ( ! nest )
return - EMSGSIZE ;
for ( i = 0 ; i < ARRAY_SIZE ( map ) ; i + + )
if ( nla_put_u32 ( skb , map [ i ] . key , map [ i ] . val ) )
goto msg_full ;
nla_nest_end ( skb , nest ) ;
return 0 ;
msg_full :
nla_nest_cancel ( skb , nest ) ;
return - EMSGSIZE ;
}
2015-01-09 10:27:07 +03:00
int tipc_nl_add_bc_link ( struct net * net , struct tipc_nl_msg * msg )
2014-11-20 12:29:12 +03:00
{
int err ;
void * hdr ;
struct nlattr * attrs ;
struct nlattr * prop ;
2015-01-09 10:27:07 +03:00
struct tipc_net * tn = net_generic ( net , tipc_net_id ) ;
struct tipc_link * bcl = tn - > bcl ;
2014-11-20 12:29:12 +03:00
if ( ! bcl )
return 0 ;
2015-01-09 10:27:07 +03:00
tipc_bclink_lock ( net ) ;
2014-11-20 12:29:12 +03:00
2015-02-09 11:50:03 +03:00
hdr = genlmsg_put ( msg - > skb , msg - > portid , msg - > seq , & tipc_genl_family ,
2014-11-20 12:29:12 +03:00
NLM_F_MULTI , TIPC_NL_LINK_GET ) ;
if ( ! hdr )
return - EMSGSIZE ;
attrs = nla_nest_start ( msg - > skb , TIPC_NLA_LINK ) ;
if ( ! attrs )
goto msg_full ;
/* The broadcast link is always up */
if ( nla_put_flag ( msg - > skb , TIPC_NLA_LINK_UP ) )
goto attr_msg_full ;
if ( nla_put_flag ( msg - > skb , TIPC_NLA_LINK_BROADCAST ) )
goto attr_msg_full ;
if ( nla_put_string ( msg - > skb , TIPC_NLA_LINK_NAME , bcl - > name ) )
goto attr_msg_full ;
if ( nla_put_u32 ( msg - > skb , TIPC_NLA_LINK_RX , bcl - > next_in_no ) )
goto attr_msg_full ;
if ( nla_put_u32 ( msg - > skb , TIPC_NLA_LINK_TX , bcl - > next_out_no ) )
goto attr_msg_full ;
prop = nla_nest_start ( msg - > skb , TIPC_NLA_LINK_PROP ) ;
if ( ! prop )
goto attr_msg_full ;
tipc: introduce starvation free send algorithm
Currently, we only use a single counter; the length of the backlog
queue, to determine whether a message should be accepted to the queue
or not. Each time a message is being sent, the queue length is compared
to a threshold value for the message's importance priority. If the queue
length is beyond this threshold, the message is rejected. This algorithm
implies a risk of starvation of low importance senders during very high
load, because it may take a long time before the backlog queue has
decreased enough to accept a lower level message.
We now eliminate this risk by introducing a counter for each importance
priority. When a message is sent, we check only the queue level for that
particular message's priority. If that is ok, the message can be added
to the backlog, irrespective of the queue level for other priorities.
This way, each level is guaranteed a certain portion of the total
bandwidth, and any risk of starvation is eliminated.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Reviewed-by: Erik Hugne <erik.hugne@ericsson.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-03-25 19:07:24 +03:00
if ( nla_put_u32 ( msg - > skb , TIPC_NLA_PROP_WIN , bcl - > window ) )
2014-11-20 12:29:12 +03:00
goto prop_msg_full ;
nla_nest_end ( msg - > skb , prop ) ;
err = __tipc_nl_add_bc_link_stat ( msg - > skb , & bcl - > stats ) ;
if ( err )
goto attr_msg_full ;
2015-01-09 10:27:07 +03:00
tipc_bclink_unlock ( net ) ;
2014-11-20 12:29:12 +03:00
nla_nest_end ( msg - > skb , attrs ) ;
genlmsg_end ( msg - > skb , hdr ) ;
return 0 ;
prop_msg_full :
nla_nest_cancel ( msg - > skb , prop ) ;
attr_msg_full :
nla_nest_cancel ( msg - > skb , attrs ) ;
msg_full :
2015-01-09 10:27:07 +03:00
tipc_bclink_unlock ( net ) ;
2014-11-20 12:29:12 +03:00
genlmsg_cancel ( msg - > skb , hdr ) ;
return - EMSGSIZE ;
}
2006-01-02 21:04:38 +03:00
2015-01-09 10:27:07 +03:00
int tipc_bclink_reset_stats ( struct net * net )
2006-01-02 21:04:38 +03:00
{
2015-01-09 10:27:07 +03:00
struct tipc_net * tn = net_generic ( net , tipc_net_id ) ;
struct tipc_link * bcl = tn - > bcl ;
2006-01-02 21:04:38 +03:00
if ( ! bcl )
return - ENOPROTOOPT ;
2015-01-09 10:27:07 +03:00
tipc_bclink_lock ( net ) ;
2006-01-02 21:04:38 +03:00
memset ( & bcl - > stats , 0 , sizeof ( bcl - > stats ) ) ;
2015-01-09 10:27:07 +03:00
tipc_bclink_unlock ( net ) ;
2008-07-15 09:44:01 +04:00
return 0 ;
2006-01-02 21:04:38 +03:00
}
2015-01-09 10:27:07 +03:00
/**
 * tipc_bclink_set_queue_limits - set the window of the broadcast link
 * @net: network namespace owning the broadcast link
 * @limit: new window; must lie within
 *         [TIPC_MIN_LINK_WIN, TIPC_MAX_LINK_WIN]
 *
 * Returns 0 on success, -ENOPROTOOPT when the namespace has no broadcast
 * link pointer set, or -EINVAL when @limit is out of range.
 */
int tipc_bclink_set_queue_limits(struct net *net, u32 limit)
{
	struct tipc_net *tipc = net_generic(net, tipc_net_id);
	struct tipc_link *link = tipc->bcl;

	if (!link)
		return -ENOPROTOOPT;

	/* Reject values outside the supported window range */
	if (limit < TIPC_MIN_LINK_WIN)
		return -EINVAL;
	if (limit > TIPC_MAX_LINK_WIN)
		return -EINVAL;

	/* Apply the new limit under the broadcast link lock */
	tipc_bclink_lock(net);
	tipc_link_set_queue_limits(link, limit);
	tipc_bclink_unlock(net);

	return 0;
}
2015-05-06 14:58:55 +03:00
/**
 * tipc_nl_bc_link_set - apply netlink link properties to the broadcast link
 * @net: network namespace owning the broadcast link
 * @attrs: parsed link attributes; TIPC_NLA_LINK_PROP must be present
 *
 * Only the window property (TIPC_NLA_PROP_WIN) is handled here.
 *
 * Returns -EINVAL when the property attribute is missing, the error from
 * tipc_nl_parse_link_prop() when parsing fails, -EOPNOTSUPP when no window
 * property was supplied, otherwise the result of
 * tipc_bclink_set_queue_limits().
 */
int tipc_nl_bc_link_set(struct net *net, struct nlattr *attrs[])
{
	struct nlattr *props[TIPC_NLA_PROP_MAX + 1];
	struct nlattr *win_attr;
	int rc;

	if (!attrs[TIPC_NLA_LINK_PROP])
		return -EINVAL;

	rc = tipc_nl_parse_link_prop(attrs[TIPC_NLA_LINK_PROP], props);
	if (rc)
		return rc;

	win_attr = props[TIPC_NLA_PROP_WIN];
	if (!win_attr)
		return -EOPNOTSUPP;

	return tipc_bclink_set_queue_limits(net, nla_get_u32(win_attr));
}
2015-01-09 10:27:06 +03:00
int tipc_bclink_init ( struct net * net )
2006-01-02 21:04:38 +03:00
{
2015-01-09 10:27:06 +03:00
struct tipc_net * tn = net_generic ( net , tipc_net_id ) ;
2015-01-09 10:27:07 +03:00
struct tipc_bcbearer * bcbearer ;
struct tipc_bclink * bclink ;
struct tipc_link * bcl ;
2015-01-09 10:27:06 +03:00
2014-05-05 04:56:16 +04:00
bcbearer = kzalloc ( sizeof ( * bcbearer ) , GFP_ATOMIC ) ;
if ( ! bcbearer )
return - ENOMEM ;
bclink = kzalloc ( sizeof ( * bclink ) , GFP_ATOMIC ) ;
if ( ! bclink ) {
kfree ( bcbearer ) ;
return - ENOMEM ;
}
bcl = & bclink - > link ;
2006-01-02 21:04:38 +03:00
bcbearer - > bearer . media = & bcbearer - > media ;
2006-01-18 02:38:21 +03:00
bcbearer - > media . send_msg = tipc_bcbearer_send ;
2011-04-07 18:22:31 +04:00
sprintf ( bcbearer - > media . name , " tipc-broadcast " ) ;
2006-01-02 21:04:38 +03:00
2014-05-05 04:56:15 +04:00
spin_lock_init ( & bclink - > lock ) ;
2015-03-13 23:08:10 +03:00
__skb_queue_head_init ( & bcl - > transmq ) ;
__skb_queue_head_init ( & bcl - > backlogq ) ;
__skb_queue_head_init ( & bcl - > deferdq ) ;
tipc: resolve race problem at unicast message reception
TIPC handles message cardinality and sequencing at the link layer,
before passing messages upwards to the destination sockets. During the
upcall from link to socket no locks are held. It is therefore possible,
and we see it happen occasionally, that messages arriving in different
threads and delivered in sequence still bypass each other before they
reach the destination socket. This must not happen, since it violates
the sequentiality guarantee.
We solve this by adding a new input buffer queue to the link structure.
Arriving messages are added safely to the tail of that queue by the
link, while the head of the queue is consumed, also safely, by the
receiving socket. Sequentiality is secured per socket by only allowing
buffers to be dequeued inside the socket lock. Since there may be multiple
simultaneous readers of the queue, we use a 'filter' parameter to reduce
the risk that they peek the same buffer from the queue, hence also
reducing the risk of contention on the receiving socket locks.
This solves the sequentiality problem, and seems to cause no measurable
performance degradation.
A nice side effect of this change is that lock handling in the functions
tipc_rcv() and tipc_bcast_rcv() now becomes uniform, something that
will enable future simplifications of those functions.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-02-05 16:36:41 +03:00
skb_queue_head_init ( & bcl - > wakeupq ) ;
2006-01-02 21:04:38 +03:00
bcl - > next_out_no = 1 ;
2006-06-27 13:53:55 +04:00
spin_lock_init ( & bclink - > node . lock ) ;
2015-02-05 16:36:44 +03:00
__skb_queue_head_init ( & bclink - > arrvq ) ;
skb_queue_head_init ( & bclink - > inputq ) ;
2006-01-02 21:04:38 +03:00
bcl - > owner = & bclink - > node ;
2015-01-09 10:27:07 +03:00
bcl - > owner - > net = net ;
tipc: simplify link mtu negotiation
When a link is being established, the two endpoints advertise their
respective interface MTU in the transmitted RESET and ACTIVATE messages.
If there is any difference, the lower of the two MTUs will be selected
for use by both endpoints.
However, as a remnant of earlier attempts to introduce TIPC level
routing. there also exists an MTU discovery mechanism. If an intermediate
node has a lower MTU than the two endpoints, they will discover this
through a bisectional approach, and finally adopt this MTU for common use.
Since there is no TIPC level routing, and probably never will be,
this mechanism doesn't make any sense, and only serves to make the
link level protocol unecessarily complex.
In this commit, we eliminate the MTU discovery algorithm,and fall back
to the simple MTU advertising approach. This change is fully backwards
compatible.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-04-02 16:33:02 +03:00
bcl - > mtu = MAX_PKT_DEFAULT_MCAST ;
2006-01-18 02:38:21 +03:00
tipc_link_set_queue_limits ( bcl , BCLINK_WIN_DEFAULT ) ;
tipc: decouple the relationship between bearer and link
Currently on both paths of message transmission and reception, the
read lock of tipc_net_lock must be held before bearer is accessed,
while the write lock of tipc_net_lock has to be taken before bearer
is configured. Although it can ensure that bearer is always valid on
the two data paths, link and bearer is closely bound together.
So as the part of effort of removing tipc_net_lock, the locking
policy of bearer protection will be adjusted as below: on the two
data paths, RCU is used, and on the configuration path of bearer,
RTNL lock is applied.
Now RCU just covers the path of message reception. To make it possible
to protect the path of message transmission with RCU, link should not
use its stored bearer pointer to access bearer, but it should use the
bearer identity of its attached bearer as index to get bearer instance
from bearer_list array, which can help us decouple the relationship
between bearer and link. As a result, bearer on the path of message
transmission can be safely protected by RCU when we access bearer_list
array within RCU lock protection.
Signed-off-by: Ying Xue <ying.xue@windriver.com>
Reviewed-by: Jon Maloy <jon.maloy@ericsson.com>
Reviewed-by: Erik Hugne <erik.hugne@ericsson.com>
Tested-by: Erik Hugne <erik.hugne@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-04-21 06:55:46 +04:00
bcl - > bearer_id = MAX_BEARERS ;
2015-01-09 10:27:06 +03:00
rcu_assign_pointer ( tn - > bearer_list [ MAX_BEARERS ] , & bcbearer - > bearer ) ;
2006-01-02 21:04:38 +03:00
bcl - > state = WORKING_WORKING ;
2015-02-05 16:36:36 +03:00
bcl - > pmsg = ( struct tipc_msg * ) & bcl - > proto_msg ;
msg_set_prevnode ( bcl - > pmsg , tn - > own_addr ) ;
2009-03-19 05:11:29 +03:00
strlcpy ( bcl - > name , tipc_bclink_name , TIPC_MAX_LINK_NAME ) ;
2015-01-09 10:27:07 +03:00
tn - > bcbearer = bcbearer ;
tn - > bclink = bclink ;
tn - > bcl = bcl ;
2014-05-05 04:56:16 +04:00
return 0 ;
2006-01-02 21:04:38 +03:00
}
2015-01-09 10:27:06 +03:00
void tipc_bclink_stop ( struct net * net )
2006-01-02 21:04:38 +03:00
{
2015-01-09 10:27:06 +03:00
struct tipc_net * tn = net_generic ( net , tipc_net_id ) ;
2015-01-09 10:27:07 +03:00
tipc_bclink_lock ( net ) ;
tipc_link_purge_queues ( tn - > bcl ) ;
tipc_bclink_unlock ( net ) ;
2011-10-24 18:29:26 +04:00
2015-01-09 10:27:06 +03:00
RCU_INIT_POINTER ( tn - > bearer_list [ BCBEARER ] , NULL ) ;
2014-05-05 04:56:16 +04:00
synchronize_net ( ) ;
2015-01-09 10:27:07 +03:00
kfree ( tn - > bcbearer ) ;
kfree ( tn - > bclink ) ;
2006-01-02 21:04:38 +03:00
}
2010-05-11 18:30:14 +04:00
/**
* tipc_nmap_add - add a node to a node map
*/
2014-04-21 06:55:51 +04:00
static void tipc_nmap_add ( struct tipc_node_map * nm_ptr , u32 node )
2010-05-11 18:30:14 +04:00
{
int n = tipc_node ( node ) ;
int w = n / WSIZE ;
u32 mask = ( 1 < < ( n % WSIZE ) ) ;
if ( ( nm_ptr - > map [ w ] & mask ) = = 0 ) {
nm_ptr - > count + + ;
nm_ptr - > map [ w ] | = mask ;
}
}
/**
* tipc_nmap_remove - remove a node from a node map
*/
2014-04-21 06:55:51 +04:00
static void tipc_nmap_remove ( struct tipc_node_map * nm_ptr , u32 node )
2010-05-11 18:30:14 +04:00
{
int n = tipc_node ( node ) ;
int w = n / WSIZE ;
u32 mask = ( 1 < < ( n % WSIZE ) ) ;
if ( ( nm_ptr - > map [ w ] & mask ) ! = 0 ) {
nm_ptr - > map [ w ] & = ~ mask ;
nm_ptr - > count - - ;
}
}
/**
* tipc_nmap_diff - find differences between node maps
* @ nm_a : input node map A
* @ nm_b : input node map B
* @ nm_diff : output node map A - B ( i . e . nodes of A that are not in B )
*/
2010-10-13 17:20:35 +04:00
static void tipc_nmap_diff ( struct tipc_node_map * nm_a ,
struct tipc_node_map * nm_b ,
struct tipc_node_map * nm_diff )
2010-05-11 18:30:14 +04:00
{
int stop = ARRAY_SIZE ( nm_a - > map ) ;
int w ;
int b ;
u32 map ;
memset ( nm_diff , 0 , sizeof ( * nm_diff ) ) ;
for ( w = 0 ; w < stop ; w + + ) {
map = nm_a - > map [ w ] ^ ( nm_a - > map [ w ] & nm_b - > map [ w ] ) ;
nm_diff - > map [ w ] = map ;
if ( map ! = 0 ) {
for ( b = 0 ; b < WSIZE ; b + + ) {
if ( map & ( 1 < < b ) )
nm_diff - > count + + ;
}
}
}
}