2006-01-02 21:04:38 +03:00
/*
* net/tipc/addr.h: Include file for TIPC address utility routines
2007-02-09 17:25:21 +03:00
*
2018-03-22 22:42:50 +03:00
* Copyright ( c ) 2000 - 2006 , 2018 , Ericsson AB
2006-01-02 21:04:38 +03:00
* Copyright ( c ) 2004 - 2005 , Wind River Systems
2021-03-17 05:06:10 +03:00
* Copyright ( c ) 2020 - 2021 , Red Hat Inc
2006-01-02 21:04:38 +03:00
* All rights reserved .
*
2006-01-11 15:30:43 +03:00
* Redistribution and use in source and binary forms , with or without
2006-01-02 21:04:38 +03:00
* modification , are permitted provided that the following conditions are met :
*
2006-01-11 15:30:43 +03:00
* 1. Redistributions of source code must retain the above copyright
* notice , this list of conditions and the following disclaimer .
* 2. Redistributions in binary form must reproduce the above copyright
* notice , this list of conditions and the following disclaimer in the
* documentation and / or other materials provided with the distribution .
* 3. Neither the names of the copyright holders nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission .
2006-01-02 21:04:38 +03:00
*
2006-01-11 15:30:43 +03:00
* Alternatively , this software may be distributed under the terms of the
* GNU General Public License ( " GPL " ) version 2 as published by the Free
* Software Foundation .
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS " AS IS "
* AND ANY EXPRESS OR IMPLIED WARRANTIES , INCLUDING , BUT NOT LIMITED TO , THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED . IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT , INDIRECT , INCIDENTAL , SPECIAL , EXEMPLARY , OR
* CONSEQUENTIAL DAMAGES ( INCLUDING , BUT NOT LIMITED TO , PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES ; LOSS OF USE , DATA , OR PROFITS ; OR BUSINESS
* INTERRUPTION ) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY , WHETHER IN
* CONTRACT , STRICT LIABILITY , OR TORT ( INCLUDING NEGLIGENCE OR OTHERWISE )
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE , EVEN IF ADVISED OF THE
2006-01-02 21:04:38 +03:00
* POSSIBILITY OF SUCH DAMAGE .
*/
# ifndef _TIPC_ADDR_H
# define _TIPC_ADDR_H
2015-01-09 10:27:07 +03:00
# include <linux/types.h>
# include <linux/tipc.h>
2015-01-09 10:27:10 +03:00
# include <net/net_namespace.h>
# include <net/netns/generic.h>
2015-05-14 17:46:13 +03:00
# include "core.h"
2015-01-09 10:27:07 +03:00
2021-03-17 05:06:10 +03:00
/* struct tipc_uaddr - kernel-internal counterpart of struct sockaddr_tipc.
 *
 * The layout has to mirror struct sockaddr_tipc: each field must stay at the
 * same position and the overall size must stay the same.
 */
struct tipc_uaddr {
	unsigned short family;		/* AF_TIPC for a valid address */
	unsigned char addrtype;		/* TIPC_SERVICE_ADDR, TIPC_SERVICE_RANGE
					 * or TIPC_SOCKET_ADDR
					 */
	signed char scope;
	union {
		/* anonymous pair: service address followed by lookup node */
		struct {
			struct tipc_service_addr sa;
			u32 lookup_node;
		};
		struct tipc_service_range sr;
		struct tipc_socket_addr sk;
	};
};
/* tipc_uaddr - fill in a struct tipc_uaddr through its service range view
 * @ua: address to initialise
 * @atype: value stored in ua->addrtype
 * @scope: value stored in ua->scope
 * @type: value stored in ua->sr.type
 * @lower: value stored in ua->sr.lower
 * @upper: value stored in ua->sr.upper
 *
 * The family is always set to AF_TIPC.
 */
static inline void tipc_uaddr(struct tipc_uaddr *ua, u32 atype, u32 scope,
			      u32 type, u32 lower, u32 upper)
{
	struct tipc_service_range *range = &ua->sr;

	range->type = type;
	range->lower = lower;
	range->upper = upper;

	ua->scope = scope;
	ua->addrtype = atype;
	ua->family = AF_TIPC;
}
static inline bool tipc_uaddr_valid ( struct tipc_uaddr * ua , int len )
{
u32 atype ;
if ( len < sizeof ( struct sockaddr_tipc ) )
return false ;
atype = ua - > addrtype ;
if ( ua - > family ! = AF_TIPC )
return false ;
if ( atype = = TIPC_SERVICE_ADDR | | atype = = TIPC_SOCKET_ADDR )
return true ;
if ( atype = = TIPC_SERVICE_RANGE )
return ua - > sr . upper > = ua - > sr . lower ;
return false ;
}
2015-05-14 17:46:13 +03:00
static inline u32 tipc_own_addr ( struct net * net )
2018-03-22 22:42:50 +03:00
{
return tipc_net ( net ) - > node_addr ;
}
static inline u8 * tipc_own_id ( struct net * net )
2015-05-14 17:46:13 +03:00
{
2018-03-22 22:42:49 +03:00
struct tipc_net * tn = tipc_net ( net ) ;
2015-05-14 17:46:13 +03:00
2018-03-22 22:42:50 +03:00
if ( ! strlen ( tn - > node_id_string ) )
return NULL ;
return tn - > node_id ;
}
static inline char * tipc_own_id_string ( struct net * net )
{
return tipc_net ( net ) - > node_id_string ;
2015-05-14 17:46:13 +03:00
}
2011-02-23 19:44:49 +03:00
/* tipc_cluster_mask - keep only the TIPC_ZONE_CLUSTER_MASK bits of @addr */
static inline u32 tipc_cluster_mask(u32 addr)
{
	u32 masked = addr;

	masked &= TIPC_ZONE_CLUSTER_MASK;
	return masked;
}
2018-03-15 18:48:51 +03:00
/* tipc_node2scope - map a node value to a scope
 *
 * A non-zero @node gives TIPC_NODE_SCOPE, zero gives TIPC_CLUSTER_SCOPE.
 */
static inline int tipc_node2scope(u32 node)
{
	if (node)
		return TIPC_NODE_SCOPE;

	return TIPC_CLUSTER_SCOPE;
}
static inline int tipc_scope2node ( struct net * net , int sc )
{
return sc ! = TIPC_NODE_SCOPE ? 0 : tipc_own_addr ( net ) ;
}
2018-03-22 22:42:50 +03:00
/* in_own_node - non-zero when @addr is zero or equals the own address of @net */
static inline int in_own_node(struct net *net, u32 addr)
{
	const u32 self = tipc_own_addr(net);

	if (addr == self)
		return 1;

	return addr == 0;
}
tipc: allow closest-first lookup algorithm when legacy address is configured
The removal of an internal structure of the node address has an unwanted
side effect.
- Currently, if a user is sending an anycast message with destination
domain 0, the tipc_nametbl_translate() function will use the 'closest-
first' algorithm to first look for a node local destination, and only
when no such is found, will it resort to the cluster global 'round-
robin' lookup algorithm.
- Current users can get around this, and enforce unconditional use of
global round-robin by indicating a destination as Z.0.0 or Z.C.0.
- This option disappears when we make the node address flat, since the
lookup algorithm has no way of recognizing this case. So, as long as
there are node local destinations, the algorithm will always select
one of those, and there is nothing the sender can do to change this.
We solve this by eliminating the 'closest-first' option, which was never
a good idea anyway, for non-legacy users, but only for those. To
distinguish between legacy users and non-legacy users we introduce a new
flag 'legacy_addr_format' in struct tipc_core, to be set when the user
configures a legacy-style Z.C.N node address. Hence, when a legacy user
indicates a zero lookup domain 'closest-first' is selected, and in all
other cases we use 'round-robin'.
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-03-22 22:42:48 +03:00
/* Declared here; the definition is not part of this header.
 * NOTE(review): presumably reports whether @addr falls inside @domain, with
 * @legacy_format selecting the legacy Z.C.N address interpretation - confirm
 * against the definition.
 */
bool tipc_in_scope ( bool legacy_format , u32 domain , u32 addr ) ;
2018-03-22 22:42:50 +03:00
/* Node identity helpers; declared here, defined outside this header.
 * NOTE(review): the descriptions below are inferred from names and
 * signatures only - confirm against the definitions.
 */
/* presumably stores @id as the node identity of @net */
void tipc_set_node_id ( struct net * net , u8 * id ) ;
/* presumably stores @addr as the node address of @net */
void tipc_set_node_addr ( struct net * net , u32 addr ) ;
/* presumably formats @id into the caller-supplied buffer @str */
char * tipc_nodeid2string ( char * str , u8 * id ) ;
/* presumably derives a 32-bit hash from the identity at @id128 */
u32 tipc_node_id2hash ( u8 * id128 ) ;
tipc: add neighbor monitoring framework
TIPC based clusters are by default set up with full-mesh link
connectivity between all nodes. Those links are expected to provide
a short failure detection time, by default set to 1500 ms. Because
of this, the background load for neighbor monitoring in an N-node
cluster increases with a factor N on each node, while the overall
monitoring traffic through the network infrastructure increases at
a ~(N * (N - 1)) rate. Experience has shown that such clusters don't
scale well beyond ~100 nodes unless we significantly increase failure
discovery tolerance.
This commit introduces a framework and an algorithm that drastically
reduces this background load, while basically maintaining the original
failure detection times across the whole cluster. Using this algorithm,
background load will now grow at a rate of ~(2 * sqrt(N)) per node, and
at ~(2 * N * sqrt(N)) in traffic overhead. As an example, each node will
now have to actively monitor 38 neighbors in a 400-node cluster, instead
of as before 399.
This "Overlapping Ring Supervision Algorithm" is completely distributed
and employs no centralized or coordinated state. It goes as follows:
- Each node makes up a linearly ascending, circular list of all its N
known neighbors, based on their TIPC node identity. This algorithm
must be the same on all nodes.
- The node then selects the next M = sqrt(N) - 1 nodes downstream from
itself in the list, and chooses to actively monitor those. This is
called its "local monitoring domain".
- It creates a domain record describing the monitoring domain, and
piggy-backs this in the data area of all neighbor monitoring messages
(LINK_PROTOCOL/STATE) leaving that node. This means that all nodes in
the cluster eventually (default within 400 ms) will learn about
its monitoring domain.
- Whenever a node discovers a change in its local domain, e.g., a node
has been added or has gone down, it creates and sends out a new
version of its node record to inform all neighbors about the change.
- A node receiving a domain record from anybody outside its local domain
matches this against its own list (which may not look the same), and
chooses to not actively monitor those members of the received domain
record that are also present in its own list. Instead, it relies on
indications from the direct monitoring nodes if an indirectly
monitored node has gone up or down. If a node is indicated lost, the
receiving node temporarily activates its own direct monitoring towards
that node in order to confirm, or not, that it is actually gone.
- Since each node is actively monitoring sqrt(N) downstream neighbors,
each node is also actively monitored by the same number of upstream
neighbors. This means that all non-direct monitoring nodes normally
will receive sqrt(N) indications that a node is gone.
- A major drawback with ring monitoring is how it handles failures that
cause massive network partitionings. If both a lost node and all its
direct monitoring neighbors are inside the lost partition, the nodes in
the remaining partition will never receive indications about the loss.
To overcome this, each node also chooses to actively monitor some
nodes outside its local domain. Those nodes are called remote domain
"heads", and are selected in such a way that no node in the cluster
will be more than two direct monitoring hops away. Because of this,
each node, apart from monitoring the members of its local domain, will
also typically monitor sqrt(N) remote head nodes.
- As an optimization, local list status, domain status and domain
records are marked with a generation number. This saves senders from
unnecessarily conveying unaltered domain records, and receivers from
performing unneeded re-adaptations of their node monitoring list, such
as re-assigning domain heads.
- As a measure of caution we have added the possibility to disable the
new algorithm through configuration. We do this by keeping a threshold
value for the cluster size; a cluster that grows beyond this value
will switch from full-mesh to ring monitoring, and vice versa when
it shrinks below the value. This means that if the threshold is set to
a value larger than any anticipated cluster size (default size is 32)
the new algorithm is effectively disabled. A patch set for altering the
threshold value and for listing the table contents will follow shortly.
- This change is fully backwards compatible.
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-06-14 03:46:22 +03:00
2006-01-02 21:04:38 +03:00
# endif