/*
 * net/tipc/name_table.h: Include file for TIPC name table code
 *
 * Copyright (c) 2000-2006, 2014-2018, Ericsson AB
 * Copyright (c) 2004-2005, 2010-2011, Wind River Systems
 * Copyright (c) 2020-2021, Red Hat Inc
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the names of the copyright holders nor the names of its
 *    contributors may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
 *
 * Alternatively, this software may be distributed under the terms of the
 * GNU General Public License ("GPL") version 2 as published by the Free
 * Software Foundation.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

# ifndef _TIPC_NAME_TABLE_H
# define _TIPC_NAME_TABLE_H

/*
 * Forward declarations. Where the declarations in this header use these
 * types, they do so through pointers only, so the full definitions are
 * not needed here.
 * NOTE(review): struct tipc_plist is not referenced anywhere else in this
 * header - confirm whether this declaration is still required.
 */
struct tipc_subscription;
struct tipc_plist;
struct tipc_nlist;
struct tipc_group;
struct tipc_uaddr;

/*
 * TIPC name types reserved for internal TIPC use (both current and planned)
 */
#define TIPC_ZM_SRV		3	/* zone master service name type */
#define TIPC_PUBL_SCOPE_NUM	(TIPC_NODE_SCOPE + 1)
#define TIPC_NAMETBL_SIZE	1024	/* must be a power of 2 */

#define TIPC_ANY_SCOPE		10	/* Both node and cluster scope will match */

2006-01-02 21:04:38 +03:00
/**
2021-03-17 05:06:08 +03:00
* struct publication - info about a published service address or range
* @ sr : service range represented by this publication
* @ sk : address of socket bound to this publication
2018-03-15 18:48:55 +03:00
* @ scope : scope of publication , TIPC_NODE_SCOPE or TIPC_CLUSTER_SCOPE
* @ key : publication key , unique across the cluster
tipc: support in-order name publication events
It is observed that TIPC service binding order will not be kept in the
publication event report to user if the service is subscribed after the
bindings.
For example, services are bound by application in the following order:
Server: bound port A to {18888,66,66} scope 2
Server: bound port A to {18888,33,33} scope 2
Now, if a client subscribes to the service range (e.g. {18888, 0-100}),
it will get the 'TIPC_PUBLISHED' events in that binding order only when
the subscription is started before the bindings.
Otherwise, if started after the bindings, the events will arrive in the
opposite order:
Client: received event for published {18888,33,33}
Client: received event for published {18888,66,66}
For the latter case, it is clear that the bindings have existed in the
name table already, so when reported, the events' order will follow the
order of the rbtree binding nodes (- a node with lesser 'lower'/'upper'
range value will be first).
This is correct as we provide the tracking on a specific service status
(available or not), not the relationship between multiple services.
However, some users expect to see the same order of arriving events
irrespective of when the subscription is issued. This turns out to be
easy to fix. We now add functionality to ensure that publication events
always are issued in the same temporal order as the corresponding
bindings were performed.
v2: replace the unnecessary macro - 'publication_after()' with inline
function.
v3: reuse 'time_after32()' instead of reinventing the same exact code.
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-11-21 11:34:58 +03:00
* @ id : publication id
2018-03-15 18:48:55 +03:00
* @ binding_node : all publications from the same node which bound this one
2020-11-29 21:32:43 +03:00
* - Remote publications : in node - > publ_list ;
* Used by node / name distr to withdraw publications when node is lost
2018-03-15 18:48:55 +03:00
* - Local / node scope publications : in name_table - > node_scope list
* - Local / cluster scope publications : in name_table - > cluster_scope list
* @ binding_sock : all publications from the same socket which bound this one
* Used by socket to withdraw publications when socket is unbound / released
* @ local_publ : list of identical publications made from this node
* Used by closest_first and multicast receive lookup algorithms
* @ all_publ : all publications identical to this one , whatever node and scope
* Used by round - robin lookup algorithm
tipc: support in-order name publication events
It is observed that TIPC service binding order will not be kept in the
publication event report to user if the service is subscribed after the
bindings.
For example, services are bound by application in the following order:
Server: bound port A to {18888,66,66} scope 2
Server: bound port A to {18888,33,33} scope 2
Now, if a client subscribes to the service range (e.g. {18888, 0-100}),
it will get the 'TIPC_PUBLISHED' events in that binding order only when
the subscription is started before the bindings.
Otherwise, if started after the bindings, the events will arrive in the
opposite order:
Client: received event for published {18888,33,33}
Client: received event for published {18888,66,66}
For the latter case, it is clear that the bindings have existed in the
name table already, so when reported, the events' order will follow the
order of the rbtree binding nodes (- a node with lesser 'lower'/'upper'
range value will be first).
This is correct as we provide the tracking on a specific service status
(available or not), not the relationship between multiple services.
However, some users expect to see the same order of arriving events
irrespective of when the subscription is issued. This turns out to be
easy to fix. We now add functionality to ensure that publication events
always are issued in the same temporal order as the corresponding
bindings were performed.
v2: replace the unnecessary macro - 'publication_after()' with inline
function.
v3: reuse 'time_after32()' instead of reinventing the same exact code.
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-11-21 11:34:58 +03:00
* @ list : to form a list of publications in temporal order
2014-12-02 10:00:30 +03:00
* @ rcu : RCU callback head used for deferred freeing
2006-01-02 21:04:38 +03:00
*/
struct publication {
2021-03-17 05:06:08 +03:00
struct tipc_service_range sr ;
struct tipc_socket_addr sk ;
2021-03-17 05:06:12 +03:00
u16 scope ;
2006-01-02 21:04:38 +03:00
u32 key ;
tipc: support in-order name publication events
It is observed that TIPC service binding order will not be kept in the
publication event report to user if the service is subscribed after the
bindings.
For example, services are bound by application in the following order:
Server: bound port A to {18888,66,66} scope 2
Server: bound port A to {18888,33,33} scope 2
Now, if a client subscribes to the service range (e.g. {18888, 0-100}),
it will get the 'TIPC_PUBLISHED' events in that binding order only when
the subscription is started before the bindings.
Otherwise, if started after the bindings, the events will arrive in the
opposite order:
Client: received event for published {18888,33,33}
Client: received event for published {18888,66,66}
For the latter case, it is clear that the bindings have existed in the
name table already, so when reported, the events' order will follow the
order of the rbtree binding nodes (- a node with lesser 'lower'/'upper'
range value will be first).
This is correct as we provide the tracking on a specific service status
(available or not), not the relationship between multiple services.
However, some users expect to see the same order of arriving events
irrespective of when the subscription is issued. This turns out to be
easy to fix. We now add functionality to ensure that publication events
always are issued in the same temporal order as the corresponding
bindings were performed.
v2: replace the unnecessary macro - 'publication_after()' with inline
function.
v3: reuse 'time_after32()' instead of reinventing the same exact code.
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-11-21 11:34:58 +03:00
u32 id ;
2018-03-15 18:48:54 +03:00
struct list_head binding_node ;
2018-03-15 18:48:55 +03:00
struct list_head binding_sock ;
struct list_head local_publ ;
struct list_head all_publ ;
tipc: support in-order name publication events
It is observed that TIPC service binding order will not be kept in the
publication event report to user if the service is subscribed after the
bindings.
For example, services are bound by application in the following order:
Server: bound port A to {18888,66,66} scope 2
Server: bound port A to {18888,33,33} scope 2
Now, if a client subscribes to the service range (e.g. {18888, 0-100}),
it will get the 'TIPC_PUBLISHED' events in that binding order only when
the subscription is started before the bindings.
Otherwise, if started after the bindings, the events will arrive in the
opposite order:
Client: received event for published {18888,33,33}
Client: received event for published {18888,66,66}
For the latter case, it is clear that the bindings have existed in the
name table already, so when reported, the events' order will follow the
order of the rbtree binding nodes (- a node with lesser 'lower'/'upper'
range value will be first).
This is correct as we provide the tracking on a specific service status
(available or not), not the relationship between multiple services.
However, some users expect to see the same order of arriving events
irrespective of when the subscription is issued. This turns out to be
easy to fix. We now add functionality to ensure that publication events
always are issued in the same temporal order as the corresponding
bindings were performed.
v2: replace the unnecessary macro - 'publication_after()' with inline
function.
v3: reuse 'time_after32()' instead of reinventing the same exact code.
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-11-21 11:34:58 +03:00
struct list_head list ;
2014-12-02 10:00:30 +03:00
struct rcu_head rcu ;
2006-01-02 21:04:38 +03:00
} ;
2014-12-02 10:00:24 +03:00
/**
* struct name_table - table containing all existing port name publications
2020-11-29 21:32:43 +03:00
* @ services : name sequence hash lists
2018-03-15 18:48:52 +03:00
* @ node_scope : all local publications with node scope
2018-03-15 18:48:55 +03:00
* - used by name_distr during re - init of name table
2018-03-15 18:48:52 +03:00
* @ cluster_scope : all local publications with cluster scope
2018-03-15 18:48:55 +03:00
* - used by name_distr to send bulk updates to new nodes
* - used by name_distr during re - init of name table
2020-11-29 21:32:43 +03:00
* @ cluster_scope_lock : lock for accessing @ cluster_scope
2014-12-02 10:00:24 +03:00
* @ local_publ_count : number of publications issued by this node
2020-11-29 21:32:43 +03:00
* @ rc_dests : destination node counter
* @ snd_nxt : next sequence number to be used
2014-12-02 10:00:24 +03:00
*/
struct name_table {
2018-03-30 00:20:41 +03:00
struct hlist_head services [ TIPC_NAMETBL_SIZE ] ;
2018-03-15 18:48:52 +03:00
struct list_head node_scope ;
struct list_head cluster_scope ;
tipc: eliminate message disordering during binding table update
We have seen the following race scenario:
1) named_distribute() builds a "bulk" message, containing a PUBLISH
item for a certain publication. This is based on the contents of
the binding tables's 'cluster_scope' list.
2) tipc_named_withdraw() removes the same publication from the list,
bulds a WITHDRAW message and distributes it to all cluster nodes.
3) tipc_named_node_up(), which was calling named_distribute(), sends
out the bulk message built under 1)
4) The WITHDRAW message arrives at the just detected node, finds
no corresponding publication, and is dropped.
5) The PUBLISH item arrives at the same node, is added to its binding
table, and remains there forever.
This arrival disordering was earlier taken care of by the backlog queue,
originally added for a different purpose, which was removed in the
commit referred to below, but we now need a different solution.
In this commit, we replace the rcu lock protecting the 'cluster_scope'
list with a regular RW lock which comprises even the sending of the
bulk message. This both guarantees both the list integrity and the
message sending order. We will later add a commit which cleans up
this code further.
Note that this commit needs recently added commit d3092b2efca1 ("tipc:
fix unsafe rcu locking when accessing publication list") to apply
cleanly.
Fixes: 37922ea4a310 ("tipc: permit overlapping service ranges in name table")
Reported-by: Tuong Lien Tong <tuong.t.lien@dektech.com.au>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-10-19 20:55:40 +03:00
rwlock_t cluster_scope_lock ;
2014-12-02 10:00:24 +03:00
u32 local_publ_count ;
tipc: update a binding service via broadcast
Currently, updating binding table (add service binding to
name table/withdraw a service binding) is being sent over replicast.
However, if we are scaling up clusters to > 100 nodes/containers this
method is less affection because of looping through nodes in a cluster one
by one.
It is worth to use broadcast to update a binding service. This way, the
binding table can be updated on all peer nodes in one shot.
Broadcast is used when all peer nodes, as indicated by a new capability
flag TIPC_NAMED_BCAST, support reception of this message type.
Four problems need to be considered when introducing this feature.
1) When establishing a link to a new peer node we still update this by a
unicast 'bulk' update. This may lead to race conditions, where a later
broadcast publication/withdrawal bypass the 'bulk', resulting in
disordered publications, or even that a withdrawal may arrive before the
corresponding publication. We solve this by adding an 'is_last_bulk' bit
in the last bulk messages so that it can be distinguished from all other
messages. Only when this message has arrived do we open up for reception
of broadcast publications/withdrawals.
2) When a first legacy node is added to the cluster all distribution
will switch over to use the legacy 'replicast' method, while the
opposite happens when the last legacy node leaves the cluster. This
entails another risk of message disordering that has to be handled. We
solve this by adding a sequence number to the broadcast/replicast
messages, so that disordering can be discovered and corrected. Note
however that we don't need to consider potential message loss or
duplication at this protocol level.
3) Bulk messages don't contain any sequence numbers, and will always
arrive in order. Hence we must exempt those from the sequence number
control and deliver them unconditionally. We solve this by adding a new
'is_bulk' bit in those messages so that they can be recognized.
4) Legacy messages, which don't contain any new bits or sequence
numbers, but neither can arrive out of order, also need to be exempt
from the initial synchronization and sequence number check, and
delivered unconditionally. Therefore, we add another 'is_not_legacy' bit
to all new messages so that those can be distinguished from legacy
messages and the latter delivered directly.
v1->v2:
- fix warning issue reported by kbuild test robot <lkp@intel.com>
- add santiy check to drop the publication message with a sequence
number that is lower than the agreed synch point
Signed-off-by: kernel test robot <lkp@intel.com>
Signed-off-by: Hoang Huu Le <hoang.h.le@dektech.com.au>
Acked-by: Jon Maloy <jmaloy@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-06-17 09:56:05 +03:00
u32 rc_dests ;
u32 snd_nxt ;
2014-12-02 10:00:24 +03:00
} ;
2006-01-02 21:04:38 +03:00
2014-11-20 12:29:20 +03:00
int tipc_nl_name_table_dump ( struct sk_buff * skb , struct netlink_callback * cb ) ;
2021-03-17 05:06:15 +03:00
bool tipc_nametbl_lookup_anycast ( struct net * net , struct tipc_uaddr * ua ,
struct tipc_socket_addr * sk ) ;
2021-03-17 05:06:16 +03:00
void tipc_nametbl_lookup_mcast_sockets ( struct net * net , struct tipc_uaddr * ua ,
2021-06-02 20:44:26 +03:00
struct list_head * dports ) ;
2021-03-17 05:06:17 +03:00
void tipc_nametbl_lookup_mcast_nodes ( struct net * net , struct tipc_uaddr * ua ,
struct tipc_nlist * nodes ) ;
2021-03-17 05:06:18 +03:00
bool tipc_nametbl_lookup_group ( struct net * net , struct tipc_uaddr * ua ,
struct list_head * dsts , int * dstcnt ,
u32 exclude , bool mcast ) ;
tipc: introduce communication groups
As a preparation for introducing flow control for multicast and datagram
messaging we need a more strictly defined framework than we have now. A
socket must be able keep track of exactly how many and which other
sockets it is allowed to communicate with at any moment, and keep the
necessary state for those.
We therefore introduce a new concept we have named Communication Group.
Sockets can join a group via a new setsockopt() call TIPC_GROUP_JOIN.
The call takes four parameters: 'type' serves as group identifier,
'instance' serves as an logical member identifier, and 'scope' indicates
the visibility of the group (node/cluster/zone). Finally, 'flags' makes
it possible to set certain properties for the member. For now, there is
only one flag, indicating if the creator of the socket wants to receive
a copy of broadcast or multicast messages it is sending via the socket,
and if wants to be eligible as destination for its own anycasts.
A group is closed, i.e., sockets which have not joined a group will
not be able to send messages to or receive messages from members of
the group, and vice versa.
Any member of a group can send multicast ('group broadcast') messages
to all group members, optionally including itself, using the primitive
send(). The messages are received via the recvmsg() primitive. A socket
can only be member of one group at a time.
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-13 12:04:23 +03:00
void tipc_nametbl_build_group ( struct net * net , struct tipc_group * grp ,
2021-03-17 05:06:20 +03:00
struct tipc_uaddr * ua ) ;
2021-03-17 05:06:11 +03:00
struct publication * tipc_nametbl_publish ( struct net * net , struct tipc_uaddr * ua ,
struct tipc_socket_addr * sk , u32 key ) ;
2021-03-17 05:06:13 +03:00
void tipc_nametbl_withdraw ( struct net * net , struct tipc_uaddr * ua ,
struct tipc_socket_addr * sk , u32 key ) ;
2021-03-17 05:06:12 +03:00
struct publication * tipc_nametbl_insert_publ ( struct net * net ,
struct tipc_uaddr * ua ,
struct tipc_socket_addr * sk ,
u32 key ) ;
2021-03-17 05:06:13 +03:00
struct publication * tipc_nametbl_remove_publ ( struct net * net ,
struct tipc_uaddr * ua ,
struct tipc_socket_addr * sk ,
u32 key ) ;
2018-04-11 23:52:09 +03:00
bool tipc_nametbl_subscribe ( struct tipc_subscription * s ) ;
2011-12-30 05:43:44 +04:00
void tipc_nametbl_unsubscribe ( struct tipc_subscription * s ) ;
2015-01-09 10:27:09 +03:00
int tipc_nametbl_init ( struct net * net ) ;
void tipc_nametbl_stop ( struct net * net ) ;
2006-01-02 21:04:38 +03:00
2017-10-13 12:04:22 +03:00
struct tipc_dest {
2017-01-18 21:50:51 +03:00
struct list_head list ;
2018-08-27 04:32:26 +03:00
u32 port ;
u32 node ;
2017-01-18 21:50:51 +03:00
} ;
/*
 * Helpers operating on a list of struct tipc_dest entries.
 * Only the declarations are visible here; see the implementation for the
 * exact semantics of each return value.
 */
struct tipc_dest *tipc_dest_find(struct list_head *l, u32 node, u32 port);
bool tipc_dest_push(struct list_head *l, u32 node, u32 port);
bool tipc_dest_pop(struct list_head *l, u32 *node, u32 *port);
bool tipc_dest_del(struct list_head *l, u32 node, u32 port);
void tipc_dest_list_purge(struct list_head *l);
int tipc_dest_list_len(struct list_head *l);

# endif