2006-11-28 03:51:33 +03:00
/*
ctdb over TCP
Copyright ( C ) Andrew Tridgell 2006
This library is free software ; you can redistribute it and / or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation ; either
version 2 of the License , or ( at your option ) any later version .
This library is distributed in the hope that it will be useful ,
but WITHOUT ANY WARRANTY ; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the GNU
Lesser General Public License for more details .
You should have received a copy of the GNU Lesser General Public
License along with this library ; if not , write to the Free Software
Foundation , Inc . , 59 Temple Place , Suite 330 , Boston , MA 02111 - 1307 USA
*/
# include "includes.h"
# include "lib/events/events.h"
2007-01-23 03:38:45 +03:00
# include "lib/util/dlinklist.h"
# include "lib/tdb/include/tdb.h"
2006-11-28 03:51:33 +03:00
# include "system/network.h"
# include "system/filesys.h"
2007-01-23 03:38:45 +03:00
# include "../include/ctdb_private.h"
2006-11-28 03:51:33 +03:00
# include "ctdb_tcp.h"
2006-12-19 04:03:10 +03:00
2006-11-28 06:15:46 +03:00
/*
called when we fail to send a message to a node
*/
static void ctdb_tcp_node_dead ( struct event_context * ev , struct timed_event * te ,
struct timeval t , void * private )
{
struct ctdb_node * node = talloc_get_type ( private , struct ctdb_node ) ;
struct ctdb_tcp_node * tnode = talloc_get_type ( node - > private ,
struct ctdb_tcp_node ) ;
/* start a new connect cycle to try to re-establish the
link */
talloc_free ( tnode - > fde ) ;
close ( tnode - > fd ) ;
tnode - > fd = - 1 ;
event_add_timed ( node - > ctdb - > ev , node , timeval_zero ( ) ,
ctdb_tcp_node_connect , node ) ;
}
2006-11-28 03:51:33 +03:00
/*
called when socket becomes readable
*/
2006-11-28 06:15:46 +03:00
void ctdb_tcp_node_write ( struct event_context * ev , struct fd_event * fde ,
uint16_t flags , void * private )
2006-11-28 03:51:33 +03:00
{
struct ctdb_node * node = talloc_get_type ( private , struct ctdb_node ) ;
2006-11-28 06:15:46 +03:00
struct ctdb_tcp_node * tnode = talloc_get_type ( node - > private ,
struct ctdb_tcp_node ) ;
if ( flags & EVENT_FD_READ ) {
/* getting a read event on this fd in the current tcp model is
always an error , as we have separate read and write
sockets . In future we may combine them , but for now it must
mean that the socket is dead , so we try to reconnect */
2007-02-07 05:26:07 +03:00
node - > ctdb - > upcalls - > node_dead ( node ) ;
2006-11-28 06:15:46 +03:00
talloc_free ( tnode - > fde ) ;
close ( tnode - > fd ) ;
tnode - > fd = - 1 ;
event_add_timed ( node - > ctdb - > ev , node , timeval_zero ( ) ,
ctdb_tcp_node_connect , node ) ;
return ;
}
while ( tnode - > queue ) {
struct ctdb_tcp_packet * pkt = tnode - > queue ;
ssize_t n ;
n = write ( tnode - > fd , pkt - > data , pkt - > length ) ;
if ( n = = - 1 & & errno ! = EAGAIN & & errno ! = EWOULDBLOCK ) {
event_add_timed ( node - > ctdb - > ev , node , timeval_zero ( ) ,
ctdb_tcp_node_dead , node ) ;
EVENT_FD_NOT_WRITEABLE ( tnode - > fde ) ;
return ;
}
if ( n < = 0 ) return ;
if ( n ! = pkt - > length ) {
pkt - > length - = n ;
pkt - > data + = n ;
return ;
}
DLIST_REMOVE ( tnode - > queue , pkt ) ;
talloc_free ( pkt ) ;
}
EVENT_FD_NOT_WRITEABLE ( tnode - > fde ) ;
2006-11-28 03:51:33 +03:00
}
/*
called when an incoming connection is readable
*/
void ctdb_tcp_incoming_read ( struct event_context * ev , struct fd_event * fde ,
uint16_t flags , void * private )
{
struct ctdb_incoming * in = talloc_get_type ( private , struct ctdb_incoming ) ;
2006-11-28 06:15:46 +03:00
int num_ready = 0 ;
2006-12-19 04:03:10 +03:00
ssize_t nread ;
uint8_t * data , * data_base ;
2006-11-28 06:15:46 +03:00
if ( ioctl ( in - > fd , FIONREAD , & num_ready ) ! = 0 | |
num_ready = = 0 ) {
/* we've lost the link from another node. We don't
notify the upper layers , as we only want to trigger
a full node reorganisation when a send fails - that
allows nodes to restart without penalty as long as
the network is idle */
talloc_free ( in ) ;
return ;
}
2006-12-19 04:03:10 +03:00
in - > partial . data = talloc_realloc_size ( in , in - > partial . data ,
num_ready + in - > partial . length ) ;
if ( in - > partial . data = = NULL ) {
2006-11-28 06:15:46 +03:00
/* not much we can do except drop the socket */
2006-11-28 03:51:33 +03:00
talloc_free ( in ) ;
2006-11-28 06:15:46 +03:00
return ;
2006-11-28 03:51:33 +03:00
}
2006-11-28 06:15:46 +03:00
2006-12-19 04:03:10 +03:00
nread = read ( in - > fd , in - > partial . data + in - > partial . length , num_ready ) ;
if ( nread < = 0 ) {
/* the connection must be dead */
2006-11-28 06:15:46 +03:00
talloc_free ( in ) ;
return ;
}
2006-12-19 04:03:10 +03:00
data = in - > partial . data ;
nread + = in - > partial . length ;
in - > partial . data = NULL ;
in - > partial . length = 0 ;
if ( nread > = 4 & & * ( uint32_t * ) data = = nread ) {
/* most common case - we got a whole packet in one go
tell the ctdb layer above that we have a packet */
in - > ctdb - > upcalls - > recv_pkt ( in - > ctdb , data , nread ) ;
return ;
}
data_base = data ;
while ( nread > = 4 & & * ( uint32_t * ) data < = nread ) {
/* we have at least one packet */
uint8_t * d2 ;
uint32_t len ;
len = * ( uint32_t * ) data ;
d2 = talloc_memdup ( in , data , len ) ;
if ( d2 = = NULL ) {
/* sigh */
talloc_free ( in ) ;
return ;
}
in - > ctdb - > upcalls - > recv_pkt ( in - > ctdb , d2 , len ) ;
data + = len ;
nread - = len ;
}
2007-02-07 05:26:07 +03:00
if ( nread > 0 ) {
2006-12-19 04:03:10 +03:00
/* we have only part of a packet */
if ( data_base = = data ) {
in - > partial . data = data ;
in - > partial . length = nread ;
} else {
in - > partial . data = talloc_memdup ( in , data , nread ) ;
if ( in - > partial . data = = NULL ) {
talloc_free ( in ) ;
return ;
}
in - > partial . length = nread ;
talloc_free ( data_base ) ;
}
return ;
}
talloc_free ( data_base ) ;
2006-11-28 03:51:33 +03:00
}
2006-11-28 06:15:46 +03:00
/*
queue a packet for sending
*/
int ctdb_tcp_queue_pkt ( struct ctdb_node * node , uint8_t * data , uint32_t length )
{
struct ctdb_tcp_node * tnode = talloc_get_type ( node - > private ,
struct ctdb_tcp_node ) ;
struct ctdb_tcp_packet * pkt ;
2007-01-23 03:38:45 +03:00
uint32_t length2 ;
2006-12-19 04:07:07 +03:00
/* enforce the length and alignment rules from the tcp packet allocator */
2007-01-23 03:38:45 +03:00
length2 = ( length + ( CTDB_TCP_ALIGNMENT - 1 ) ) & ~ ( CTDB_TCP_ALIGNMENT - 1 ) ;
* ( uint32_t * ) data = length2 ;
if ( length2 ! = length ) {
memset ( data + length , 0 , length2 - length ) ;
}
2006-11-28 06:15:46 +03:00
/* if the queue is empty then try an immediate write, avoiding
queue overhead . This relies on non - blocking sockets */
2006-12-01 07:54:15 +03:00
if ( tnode - > queue = = NULL & & tnode - > fd ! = - 1 ) {
2007-01-23 03:38:45 +03:00
ssize_t n = write ( tnode - > fd , data , length2 ) ;
2006-11-28 06:15:46 +03:00
if ( n = = - 1 & & errno ! = EAGAIN & & errno ! = EWOULDBLOCK ) {
event_add_timed ( node - > ctdb - > ev , node , timeval_zero ( ) ,
ctdb_tcp_node_dead , node ) ;
/* yes, we report success, as the dead node is
handled via a separate event */
return 0 ;
}
if ( n > 0 ) {
data + = n ;
2007-01-23 03:38:45 +03:00
length2 - = n ;
2006-11-28 06:15:46 +03:00
}
2007-01-23 03:38:45 +03:00
if ( length2 = = 0 ) return 0 ;
2006-11-28 06:15:46 +03:00
}
pkt = talloc ( tnode , struct ctdb_tcp_packet ) ;
CTDB_NO_MEMORY ( node - > ctdb , pkt ) ;
2007-01-23 03:38:45 +03:00
pkt - > data = talloc_memdup ( pkt , data , length2 ) ;
2006-11-28 06:15:46 +03:00
CTDB_NO_MEMORY ( node - > ctdb , pkt - > data ) ;
2007-01-23 03:38:45 +03:00
pkt - > length = length2 ;
2006-11-28 06:15:46 +03:00
2006-12-01 07:54:15 +03:00
if ( tnode - > queue = = NULL & & tnode - > fd ! = - 1 ) {
2006-11-28 06:15:46 +03:00
EVENT_FD_WRITEABLE ( tnode - > fde ) ;
}
DLIST_ADD_END ( tnode - > queue , pkt , struct ctdb_tcp_packet * ) ;
return 0 ;
}