2007-10-29 03:09:36 +03:00
/*
* AF_INET / AF_INET6 SOCK_STREAM protocol layer ( tcp )
*
MEDIUM: samples: move payload-based fetches and ACLs to their own file
The file acl.c is a real mess, it both contains functions to parse and
process ACLs, and some sample extraction functions which act on buffers.
Some other payload analysers were arbitrarily dispatched to proto_tcp.c.
So now we're moving all payload-based fetches and ACLs to payload.c
which is capable of extracting data from buffers and relies on everything
that is protocol-independent. That way we can safely inflate this file
and only use the other ones when some fetches are really specific (eg:
HTTP, SSL, ...).
As a result of this cleanup, the following new sample fetches became
available even if they're not really useful :
always_false, always_true, rep_ssl_hello_type, rdp_cookie_cnt,
req_len, req_ssl_hello_type, req_ssl_sni, req_ssl_ver, wait_end
The function 'acl_fetch_nothing' was wrong and never used anywhere so it
was removed.
The "rdp_cookie" sample fetch used to have a mandatory argument while it
was optional in ACLs, which are supposed to iterate over RDP cookies. So
we're making it optional as a fetch too, and it will return the first one.
2013-01-08 00:59:07 +04:00
* Copyright 2000 - 2013 Willy Tarreau < w @ 1 wt . eu >
2007-10-29 03:09:36 +03:00
*
* This program is free software ; you can redistribute it and / or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation ; either version
* 2 of the License , or ( at your option ) any later version .
*
*/
# include <ctype.h>
# include <errno.h>
# include <stdio.h>
# include <stdlib.h>
# include <string.h>
# include <time.h>
# include <sys/param.h>
# include <sys/socket.h>
# include <sys/types.h>
2009-08-24 15:11:06 +04:00
# include <netinet/tcp.h>
2015-08-24 02:43:45 +03:00
# include <netinet/in.h>
2009-08-24 15:11:06 +04:00
2020-05-27 13:58:42 +03:00
# include <haproxy/api.h>
2020-06-09 10:07:15 +03:00
# include <haproxy/arg.h>
2020-06-04 19:02:10 +03:00
# include <haproxy/connection.h>
2020-05-27 17:10:29 +03:00
# include <haproxy/errors.h>
2020-06-09 10:07:15 +03:00
# include <haproxy/fd.h>
2020-06-04 18:05:57 +03:00
# include <haproxy/global.h>
2020-05-27 19:01:47 +03:00
# include <haproxy/list.h>
2020-06-04 15:58:24 +03:00
# include <haproxy/listener.h>
2020-06-04 23:01:04 +03:00
# include <haproxy/log.h>
2020-06-09 10:07:15 +03:00
# include <haproxy/namespace.h>
# include <haproxy/port_range.h>
# include <haproxy/proto_tcp.h>
# include <haproxy/protocol.h>
2020-06-04 23:29:18 +03:00
# include <haproxy/proxy-t.h>
2020-08-28 13:07:22 +03:00
# include <haproxy/sock.h>
2020-08-28 16:30:11 +03:00
# include <haproxy/sock_inet.h>
2020-06-03 19:09:46 +03:00
# include <haproxy/tools.h>
2007-10-29 03:09:36 +03:00
2010-10-22 18:06:11 +04:00
/* Forward declarations of the file-local TCP callbacks. They are referenced
 * by the proto_tcpv4 and proto_tcpv6 descriptors below through their
 * .listen, .rx_suspend, .rx_resume, .enable and .disable members, so they
 * must be declared before those initializers.
 */
static int tcp_bind_listener ( struct listener * listener , char * errmsg , int errlen ) ;
2020-09-25 18:12:32 +03:00
static int tcp_suspend_receiver ( struct receiver * rx ) ;
2020-09-25 20:40:31 +03:00
static int tcp_resume_receiver ( struct receiver * rx ) ;
2020-09-25 20:27:39 +03:00
static void tcp_enable_listener ( struct listener * listener ) ;
static void tcp_disable_listener ( struct listener * listener ) ;
2007-10-29 03:09:36 +03:00
/* Protocol descriptor for TCP over IPv4 (address family proto_fam_inet4,
 * SOCK_STREAM / IPPROTO_TCP). The tcp_* hooks are declared or defined in
 * this file; the default_* and sock_* hooks are shared helpers declared
 * elsewhere.
 * Note: must not be declared <const> as its list will be overwritten */
2020-12-08 16:13:11 +03:00
struct protocol proto_tcpv4 = {
2020-12-08 16:54:20 +03:00
. name = " tcpv4 " ,
/* connection layer: listener management and outgoing connection hooks */
2022-05-20 17:36:46 +03:00
. xprt_type = PROTO_TYPE_STREAM ,
2020-12-08 16:54:20 +03:00
. listen = tcp_bind_listener ,
. enable = tcp_enable_listener ,
. disable = tcp_disable_listener ,
. add = default_add_listener ,
. unbind = default_unbind_listener ,
. suspend = default_suspend_listener ,
. resume = default_resume_listener ,
. accept_conn = sock_accept_conn ,
2020-12-08 17:50:56 +03:00
. ctrl_init = sock_conn_ctrl_init ,
. ctrl_close = sock_conn_ctrl_close ,
2020-12-08 16:54:20 +03:00
. connect = tcp_connect_server ,
2020-12-11 18:19:12 +03:00
. drain = sock_drain ,
2020-12-11 19:02:50 +03:00
. check_events = sock_check_events ,
. ignore_events = sock_ignore_events ,
2020-12-08 16:54:20 +03:00
/* binding layer: TCP-specific receiver suspend/resume */
. rx_suspend = tcp_suspend_receiver ,
. rx_resume = tcp_resume_receiver ,
/* address family: IPv4 */
. fam = & proto_fam_inet4 ,
/* socket layer: stream socket over TCP, generic sock_* receiver hooks */
2021-10-27 18:05:36 +03:00
. proto_type = PROTO_TYPE_STREAM ,
2020-12-08 16:54:20 +03:00
. sock_type = SOCK_STREAM ,
. sock_prot = IPPROTO_TCP ,
. rx_enable = sock_enable ,
. rx_disable = sock_disable ,
. rx_unbind = sock_unbind ,
. rx_listening = sock_accepting_conn ,
. default_iocb = sock_accept_iocb ,
/* the receivers list head is initialized to refer to itself and the
 * receiver count starts at zero
 */
. receivers = LIST_HEAD_INIT ( proto_tcpv4 . receivers ) ,
. nb_receivers = 0 ,
2023-04-22 16:09:07 +03:00
/* SO_REUSEPORT support is only advertised when the platform headers define it */
# ifdef SO_REUSEPORT
. flags = PROTO_F_REUSEPORT_SUPPORTED ,
# endif
2007-10-29 03:09:36 +03:00
} ;
2018-11-25 21:14:37 +03:00
/* passes &proto_tcpv4 to protocol_register() via INITCALL1 at stage STG_REGISTER */
INITCALL1 ( STG_REGISTER , protocol_register , & proto_tcpv4 ) ;
2007-10-29 03:09:36 +03:00
/* Protocol descriptor for TCP over IPv6 (address family proto_fam_inet6,
 * SOCK_STREAM / IPPROTO_TCP). Apart from its name, its address family and
 * its own receivers list, it uses the same hooks as proto_tcpv4.
 * Note: must not be declared <const> as its list will be overwritten */
2020-12-08 16:13:11 +03:00
struct protocol proto_tcpv6 = {
2020-12-08 16:54:20 +03:00
. name = " tcpv6 " ,
/* connection layer: listener management and outgoing connection hooks */
2022-05-20 17:36:46 +03:00
. xprt_type = PROTO_TYPE_STREAM ,
2020-12-08 16:54:20 +03:00
. listen = tcp_bind_listener ,
. enable = tcp_enable_listener ,
. disable = tcp_disable_listener ,
. add = default_add_listener ,
. unbind = default_unbind_listener ,
. suspend = default_suspend_listener ,
. resume = default_resume_listener ,
. accept_conn = sock_accept_conn ,
2020-12-08 17:50:56 +03:00
. ctrl_init = sock_conn_ctrl_init ,
. ctrl_close = sock_conn_ctrl_close ,
2020-12-08 16:54:20 +03:00
. connect = tcp_connect_server ,
2020-12-11 18:19:12 +03:00
. drain = sock_drain ,
2020-12-11 19:02:50 +03:00
. check_events = sock_check_events ,
. ignore_events = sock_ignore_events ,
2020-12-08 16:54:20 +03:00
/* binding layer: TCP-specific receiver suspend/resume */
. rx_suspend = tcp_suspend_receiver ,
. rx_resume = tcp_resume_receiver ,
/* address family: IPv6 */
. fam = & proto_fam_inet6 ,
/* socket layer: stream socket over TCP, generic sock_* receiver hooks */
2021-10-27 18:05:36 +03:00
. proto_type = PROTO_TYPE_STREAM ,
2020-12-08 16:54:20 +03:00
. sock_type = SOCK_STREAM ,
. sock_prot = IPPROTO_TCP ,
. rx_enable = sock_enable ,
. rx_disable = sock_disable ,
. rx_unbind = sock_unbind ,
. rx_listening = sock_accepting_conn ,
. default_iocb = sock_accept_iocb ,
/* the receivers list head is initialized to refer to itself and the
 * receiver count starts at zero
 */
. receivers = LIST_HEAD_INIT ( proto_tcpv6 . receivers ) ,
. nb_receivers = 0 ,
2023-04-22 16:09:07 +03:00
/* SO_REUSEPORT support is only advertised when the platform headers define it */
# ifdef SO_REUSEPORT
. flags = PROTO_F_REUSEPORT_SUPPORTED ,
# endif
2007-10-29 03:09:36 +03:00
} ;
2018-11-25 21:14:37 +03:00
/* passes &proto_tcpv6 to protocol_register() via INITCALL1 at stage STG_REGISTER */
INITCALL1 ( STG_REGISTER , protocol_register , & proto_tcpv6 ) ;
2011-03-11 00:26:24 +03:00
/* Binds ipv4/ipv6 address <local> to socket <fd>, unless <flags> is set, in which
2008-01-13 20:40:14 +03:00
* case we try to bind < remote > . < flags > is a 2 - bit field consisting of :
* - 0 : ignore remote address ( may even be a NULL pointer )
* - 1 : use provided address
* - 2 : use provided port
* - 3 : use both
*
* The function supports multiple foreign binding methods :
* - linux_tproxy : we directly bind to the foreign address
* The second one can be used as a fallback for the first one .
* This function returns 0 when everything ' s OK , 1 if it could not bind , to the
* local address , 2 if it could not bind to the foreign address .
*/
2011-03-11 00:26:24 +03:00
int tcp_bind_socket ( int fd , int flags , struct sockaddr_storage * local , struct sockaddr_storage * remote )
2008-01-13 20:40:14 +03:00
{
2011-03-11 00:26:24 +03:00
struct sockaddr_storage bind_addr ;
2008-01-13 20:40:14 +03:00
int foreign_ok = 0 ;
int ret ;
2017-10-29 22:14:08 +03:00
static THREAD_LOCAL int ip_transp_working = 1 ;
static THREAD_LOCAL int ip6_transp_working = 1 ;
2013-05-09 00:49:23 +04:00
2012-07-13 16:34:59 +04:00
switch ( local - > ss_family ) {
case AF_INET :
if ( flags & & ip_transp_working ) {
2013-05-09 00:49:23 +04:00
/* This deserves some explanation. Some platforms will support
* multiple combinations of certain methods , so we try the
* supported ones until one succeeds .
*/
2020-08-28 18:23:40 +03:00
if ( sock_inet4_make_foreign ( fd ) )
2012-07-13 16:34:59 +04:00
foreign_ok = 1 ;
else
ip_transp_working = 0 ;
}
break ;
case AF_INET6 :
if ( flags & & ip6_transp_working ) {
2020-08-28 18:23:40 +03:00
if ( sock_inet6_make_foreign ( fd ) )
2012-07-13 16:34:59 +04:00
foreign_ok = 1 ;
else
ip6_transp_working = 0 ;
}
break ;
2008-01-13 20:40:14 +03:00
}
2013-05-09 00:49:23 +04:00
2008-01-13 20:40:14 +03:00
if ( flags ) {
memset ( & bind_addr , 0 , sizeof ( bind_addr ) ) ;
2011-04-19 09:20:57 +04:00
bind_addr . ss_family = remote - > ss_family ;
2011-03-11 00:26:24 +03:00
switch ( remote - > ss_family ) {
case AF_INET :
if ( flags & 1 )
( ( struct sockaddr_in * ) & bind_addr ) - > sin_addr = ( ( struct sockaddr_in * ) remote ) - > sin_addr ;
if ( flags & 2 )
( ( struct sockaddr_in * ) & bind_addr ) - > sin_port = ( ( struct sockaddr_in * ) remote ) - > sin_port ;
break ;
case AF_INET6 :
if ( flags & 1 )
( ( struct sockaddr_in6 * ) & bind_addr ) - > sin6_addr = ( ( struct sockaddr_in6 * ) remote ) - > sin6_addr ;
if ( flags & 2 )
( ( struct sockaddr_in6 * ) & bind_addr ) - > sin6_port = ( ( struct sockaddr_in6 * ) remote ) - > sin6_port ;
break ;
2011-12-17 00:25:11 +04:00
default :
/* we don't want to try to bind to an unknown address family */
foreign_ok = 0 ;
2011-03-11 00:26:24 +03:00
}
2008-01-13 20:40:14 +03:00
}
2011-06-24 10:11:37 +04:00
setsockopt ( fd , SOL_SOCKET , SO_REUSEADDR , & one , sizeof ( one ) ) ;
2008-01-13 20:40:14 +03:00
if ( foreign_ok ) {
2014-05-10 00:56:10 +04:00
if ( is_inet_addr ( & bind_addr ) ) {
2012-10-26 21:57:58 +04:00
ret = bind ( fd , ( struct sockaddr * ) & bind_addr , get_addr_len ( & bind_addr ) ) ;
if ( ret < 0 )
return 2 ;
}
2008-01-13 20:40:14 +03:00
}
else {
2014-05-10 00:56:10 +04:00
if ( is_inet_addr ( local ) ) {
2012-10-26 21:57:58 +04:00
ret = bind ( fd , ( struct sockaddr * ) local , get_addr_len ( local ) ) ;
if ( ret < 0 )
return 1 ;
}
2008-01-13 20:40:14 +03:00
}
if ( ! flags )
return 0 ;
if ( ! foreign_ok )
/* we could not bind to a foreign address */
return 2 ;
return 0 ;
}
2009-08-16 16:02:45 +04:00
/*
2012-08-31 00:23:13 +04:00
* This function initiates a TCP connection establishment to the target assigned
2019-07-17 16:41:35 +03:00
* to connection < conn > using ( si - > { target , dst } ) . A source address may be
* pointed to by conn - > src in case of transparent proxying . Normal source
2012-08-31 00:23:13 +04:00
* bind addresses are still determined locally ( due to the possible need of a
* source port ) . conn - > target may point either to a valid server or to a backend ,
2012-11-12 03:42:33 +04:00
* depending on conn - > target . Only OBJ_TYPE_PROXY and OBJ_TYPE_SERVER are
2012-11-24 13:24:27 +04:00
* supported . The < data > parameter is a boolean indicating whether there are data
* waiting for being sent or not , in order to adjust data write polling and on
2019-05-06 19:32:29 +03:00
* some platforms , the ability to avoid an empty initial ACK . The < flags > argument
* allows the caller to force using a delayed ACK when establishing the connection
2012-11-24 13:24:27 +04:00
* - 0 = no delayed ACK unless data are advertised and backend has tcp - smart - connect
2019-05-06 19:32:29 +03:00
* - CONNECT_DELACK_SMART_CONNECT = delayed ACK if backend has tcp - smart - connect , regardless of data
* - CONNECT_DELACK_ALWAYS = delayed ACK regardless of backend options
2010-03-29 21:36:59 +04:00
*
2013-10-24 23:45:00 +04:00
* Note that a pending send_proxy message accounts for data .
*
2009-08-16 16:02:45 +04:00
* It can return one of :
2015-04-03 02:14:29 +03:00
* - SF_ERR_NONE if everything ' s OK
* - SF_ERR_SRVTO if there are no more servers
* - SF_ERR_SRVCL if the connection was refused by the server
* - SF_ERR_PRXCOND if the connection has been limited by the proxy ( maxconn )
* - SF_ERR_RESOURCE if a system resource is lacking ( eg : fd limits , ports , . . . )
* - SF_ERR_INTERNAL for any other purely internal errors
2016-11-29 04:15:19 +03:00
* Additionally , in the case of SF_ERR_RESOURCE , an emergency log will be emitted .
2012-11-23 11:51:32 +04:00
*
2015-04-03 02:14:29 +03:00
* The connection ' s fd is inserted only when SF_ERR_NONE is returned , otherwise
2012-11-23 11:51:32 +04:00
* it ' s invalid and the caller has nothing to do .
2009-08-16 16:02:45 +04:00
*/
2011-03-03 20:27:32 +03:00
2019-05-06 19:32:29 +03:00
int tcp_connect_server ( struct connection * conn , int flags )
2009-08-16 16:02:45 +04:00
{
int fd ;
2011-03-05 00:04:29 +03:00
struct server * srv ;
struct proxy * be ;
2012-12-09 01:49:11 +04:00
struct conn_src * src ;
2017-01-24 01:36:45 +03:00
int use_fastopen = 0 ;
2019-05-22 14:44:48 +03:00
struct sockaddr_storage * addr ;
2011-03-05 00:04:29 +03:00
2022-05-02 18:45:12 +03:00
BUG_ON ( ! conn - > dst ) ;
2018-11-23 16:23:07 +03:00
conn - > flags | = CO_FL_WAIT_L4_CONN ; /* connection in progress */
2014-01-24 19:08:19 +04:00
2012-11-12 03:42:33 +04:00
switch ( obj_type ( conn - > target ) ) {
case OBJ_TYPE_PROXY :
2021-12-06 10:01:02 +03:00
be = __objt_proxy ( conn - > target ) ;
2011-03-05 00:04:29 +03:00
srv = NULL ;
break ;
2012-11-12 03:42:33 +04:00
case OBJ_TYPE_SERVER :
2021-12-06 10:01:02 +03:00
srv = __objt_server ( conn - > target ) ;
2011-03-05 00:04:29 +03:00
be = srv - > proxy ;
2017-01-24 01:36:45 +03:00
/* Make sure we check that we have data before activating
* TFO , or we could trigger a kernel issue whereby after
* a successful connect ( ) = = 0 , any subsequent connect ( )
* will return EINPROGRESS instead of EISCONN .
*/
use_fastopen = ( srv - > flags & SRV_F_FASTOPEN ) & &
( ( flags & ( CONNECT_CAN_USE_TFO | CONNECT_HAS_DATA ) ) = =
( CONNECT_CAN_USE_TFO | CONNECT_HAS_DATA ) ) ;
2011-03-05 00:04:29 +03:00
break ;
default :
2014-01-24 19:08:19 +04:00
conn - > flags | = CO_FL_ERROR ;
2015-04-03 02:14:29 +03:00
return SF_ERR_INTERNAL ;
2011-03-05 00:04:29 +03:00
}
2009-08-16 16:02:45 +04:00
2020-08-28 13:07:22 +03:00
fd = conn - > handle . fd = sock_create_server_socket ( conn ) ;
2014-11-17 17:11:45 +03:00
if ( fd = = - 1 ) {
2009-08-16 16:02:45 +04:00
qfprintf ( stderr , " Cannot get a server socket. \n " ) ;
2014-01-24 19:08:19 +04:00
if ( errno = = ENFILE ) {
conn - > err_code = CO_ER_SYS_FDLIM ;
2009-08-16 16:02:45 +04:00
send_log ( be , LOG_EMERG ,
2018-01-29 17:06:04 +03:00
" Proxy %s reached system FD limit (maxsock=%d). Please check system tunables. \n " ,
be - > id , global . maxsock ) ;
2014-01-24 19:08:19 +04:00
}
else if ( errno = = EMFILE ) {
conn - > err_code = CO_ER_PROC_FDLIM ;
2009-08-16 16:02:45 +04:00
send_log ( be , LOG_EMERG ,
2018-01-29 17:06:04 +03:00
" Proxy %s reached process FD limit (maxsock=%d). Please check 'ulimit-n' and restart. \n " ,
be - > id , global . maxsock ) ;
2014-01-24 19:08:19 +04:00
}
else if ( errno = = ENOBUFS | | errno = = ENOMEM ) {
conn - > err_code = CO_ER_SYS_MEMLIM ;
2009-08-16 16:02:45 +04:00
send_log ( be , LOG_EMERG ,
2018-01-29 17:06:04 +03:00
" Proxy %s reached system memory limit (maxsock=%d). Please check system tunables. \n " ,
be - > id , global . maxsock ) ;
2014-01-24 19:08:19 +04:00
}
else if ( errno = = EAFNOSUPPORT | | errno = = EPROTONOSUPPORT ) {
conn - > err_code = CO_ER_NOPROTO ;
}
else
conn - > err_code = CO_ER_SOCK_ERR ;
2009-08-16 16:02:45 +04:00
/* this is a resource error */
2014-01-24 19:08:19 +04:00
conn - > flags | = CO_FL_ERROR ;
2015-04-03 02:14:29 +03:00
return SF_ERR_RESOURCE ;
2009-08-16 16:02:45 +04:00
}
if ( fd > = global . maxsock ) {
/* do not log anything there, it's a normal condition when this option
* is used to serialize connections to a server !
*/
2017-11-24 18:50:31 +03:00
ha_alert ( " socket(): not enough free sockets. Raise -n argument. Giving up. \n " ) ;
2009-08-16 16:02:45 +04:00
close ( fd ) ;
2014-01-24 19:08:19 +04:00
conn - > err_code = CO_ER_CONF_FDLIM ;
conn - > flags | = CO_FL_ERROR ;
2015-04-03 02:14:29 +03:00
return SF_ERR_PRXCOND ; /* it is a configuration limit */
2009-08-16 16:02:45 +04:00
}
2022-04-26 11:24:14 +03:00
if ( fd_set_nonblock ( fd ) = = - 1 | |
2011-06-24 10:11:37 +04:00
( setsockopt ( fd , IPPROTO_TCP , TCP_NODELAY , & one , sizeof ( one ) ) = = - 1 ) ) {
2009-08-16 16:02:45 +04:00
qfprintf ( stderr , " Cannot set client socket to non blocking mode. \n " ) ;
close ( fd ) ;
2014-01-24 19:08:19 +04:00
conn - > err_code = CO_ER_SOCK_ERR ;
conn - > flags | = CO_FL_ERROR ;
2015-04-03 02:14:29 +03:00
return SF_ERR_INTERNAL ;
2009-08-16 16:02:45 +04:00
}
2022-04-26 11:24:14 +03:00
if ( master = = 1 & & fd_set_cloexec ( fd ) = = - 1 ) {
2018-11-27 14:02:37 +03:00
ha_alert ( " Cannot set CLOEXEC on client socket. \n " ) ;
close ( fd ) ;
conn - > err_code = CO_ER_SOCK_ERR ;
conn - > flags | = CO_FL_ERROR ;
return SF_ERR_INTERNAL ;
}
2020-07-09 05:13:20 +03:00
if ( be - > options & PR_O_TCP_SRV_KA ) {
2011-06-24 10:11:37 +04:00
setsockopt ( fd , SOL_SOCKET , SO_KEEPALIVE , & one , sizeof ( one ) ) ;
2009-08-16 16:02:45 +04:00
2020-07-09 06:58:51 +03:00
# ifdef TCP_KEEPCNT
2020-07-09 05:13:20 +03:00
if ( be - > srvtcpka_cnt )
setsockopt ( fd , IPPROTO_TCP , TCP_KEEPCNT , & be - > srvtcpka_cnt , sizeof ( be - > srvtcpka_cnt ) ) ;
2020-07-09 06:58:51 +03:00
# endif
2020-07-09 05:13:20 +03:00
2020-07-09 06:58:51 +03:00
# ifdef TCP_KEEPIDLE
2020-07-09 05:13:20 +03:00
if ( be - > srvtcpka_idle )
setsockopt ( fd , IPPROTO_TCP , TCP_KEEPIDLE , & be - > srvtcpka_idle , sizeof ( be - > srvtcpka_idle ) ) ;
2020-07-09 06:58:51 +03:00
# endif
2020-07-09 05:13:20 +03:00
2020-07-09 06:58:51 +03:00
# ifdef TCP_KEEPINTVL
2020-07-09 05:13:20 +03:00
if ( be - > srvtcpka_intvl )
setsockopt ( fd , IPPROTO_TCP , TCP_KEEPINTVL , & be - > srvtcpka_intvl , sizeof ( be - > srvtcpka_intvl ) ) ;
2020-07-09 06:58:51 +03:00
# endif
2020-07-09 05:13:20 +03:00
}
2009-08-16 16:02:45 +04:00
/* allow specific binding :
* - server - specific at first
* - proxy - specific next
*/
2012-12-09 01:49:11 +04:00
if ( srv & & srv - > conn_src . opts & CO_SRC_BIND )
src = & srv - > conn_src ;
else if ( be - > conn_src . opts & CO_SRC_BIND )
src = & be - > conn_src ;
else
src = NULL ;
if ( src ) {
2009-08-16 16:02:45 +04:00
int ret , flags = 0 ;
2019-07-17 16:41:35 +03:00
if ( conn - > src & & is_inet_addr ( conn - > src ) ) {
2012-12-09 01:49:11 +04:00
switch ( src - > opts & CO_SRC_TPROXY_MASK ) {
2012-12-09 01:29:20 +04:00
case CO_SRC_TPROXY_CLI :
2015-08-04 20:24:13 +03:00
case CO_SRC_TPROXY_ADDR :
2012-10-26 21:57:58 +04:00
flags = 3 ;
break ;
2012-12-09 01:29:20 +04:00
case CO_SRC_TPROXY_CIP :
case CO_SRC_TPROXY_DYN :
2012-10-26 21:57:58 +04:00
flags = 1 ;
break ;
}
2009-08-16 16:02:45 +04:00
}
2010-03-29 21:36:59 +04:00
2009-08-16 16:02:45 +04:00
# ifdef SO_BINDTODEVICE
/* Note: this might fail if not CAP_NET_RAW */
2012-12-09 01:49:11 +04:00
if ( src - > iface_name )
setsockopt ( fd , SOL_SOCKET , SO_BINDTODEVICE , src - > iface_name , src - > iface_len + 1 ) ;
2009-08-16 16:02:45 +04:00
# endif
2012-12-09 01:49:11 +04:00
if ( src - > sport_range ) {
2009-08-16 16:02:45 +04:00
int attempts = 10 ; /* should be more than enough to find a spare port */
2012-12-09 01:49:11 +04:00
struct sockaddr_storage sa ;
2009-08-16 16:02:45 +04:00
ret = 1 ;
2016-05-18 17:17:44 +03:00
memcpy ( & sa , & src - > source_addr , sizeof ( sa ) ) ;
2009-08-16 16:02:45 +04:00
do {
/* note: in case of retry, we may have to release a previously
* allocated port , hence this loop ' s construct .
*/
2009-10-18 09:25:52 +04:00
port_range_release_port ( fdinfo [ fd ] . port_range , fdinfo [ fd ] . local_port ) ;
fdinfo [ fd ] . port_range = NULL ;
2009-08-16 16:02:45 +04:00
if ( ! attempts )
break ;
attempts - - ;
2012-12-09 01:49:11 +04:00
fdinfo [ fd ] . local_port = port_range_alloc_port ( src - > sport_range ) ;
2014-01-24 19:08:19 +04:00
if ( ! fdinfo [ fd ] . local_port ) {
conn - > err_code = CO_ER_PORT_RANGE ;
2009-08-16 16:02:45 +04:00
break ;
2014-01-24 19:08:19 +04:00
}
2009-08-16 16:02:45 +04:00
2012-12-09 01:49:11 +04:00
fdinfo [ fd ] . port_range = src - > sport_range ;
set_host_port ( & sa , fdinfo [ fd ] . local_port ) ;
2009-08-16 16:02:45 +04:00
2019-07-17 16:41:35 +03:00
ret = tcp_bind_socket ( fd , flags , & sa , conn - > src ) ;
2014-01-24 19:08:19 +04:00
if ( ret ! = 0 )
conn - > err_code = CO_ER_CANT_BIND ;
2009-08-16 16:02:45 +04:00
} while ( ret ! = 0 ) ; /* binding NOK */
}
else {
2016-09-13 12:51:15 +03:00
# ifdef IP_BIND_ADDRESS_NO_PORT
2017-10-29 22:14:08 +03:00
static THREAD_LOCAL int bind_address_no_port = 1 ;
2021-03-31 09:45:47 +03:00
setsockopt ( fd , IPPROTO_IP , IP_BIND_ADDRESS_NO_PORT , ( const void * ) & bind_address_no_port , sizeof ( int ) ) ;
2016-09-13 12:51:15 +03:00
# endif
2019-07-17 16:41:35 +03:00
ret = tcp_bind_socket ( fd , flags , & src - > source_addr , conn - > src ) ;
2014-01-24 19:08:19 +04:00
if ( ret ! = 0 )
conn - > err_code = CO_ER_CANT_BIND ;
2009-08-16 16:02:45 +04:00
}
2012-12-09 01:49:11 +04:00
if ( unlikely ( ret ! = 0 ) ) {
2009-10-18 09:25:52 +04:00
port_range_release_port ( fdinfo [ fd ] . port_range , fdinfo [ fd ] . local_port ) ;
fdinfo [ fd ] . port_range = NULL ;
2009-08-16 16:02:45 +04:00
close ( fd ) ;
if ( ret = = 1 ) {
2017-11-24 18:50:31 +03:00
ha_alert ( " Cannot bind to source address before connect() for backend %s. Aborting. \n " ,
be - > id ) ;
2009-08-16 16:02:45 +04:00
send_log ( be , LOG_EMERG ,
2012-12-09 01:49:11 +04:00
" Cannot bind to source address before connect() for backend %s. \n " ,
2009-08-16 16:02:45 +04:00
be - > id ) ;
} else {
2017-11-24 18:50:31 +03:00
ha_alert ( " Cannot bind to tproxy source address before connect() for backend %s. Aborting. \n " ,
be - > id ) ;
2009-08-16 16:02:45 +04:00
send_log ( be , LOG_EMERG ,
2012-12-09 01:49:11 +04:00
" Cannot bind to tproxy source address before connect() for backend %s. \n " ,
2009-08-16 16:02:45 +04:00
be - > id ) ;
}
2014-01-24 19:08:19 +04:00
conn - > flags | = CO_FL_ERROR ;
2015-04-03 02:14:29 +03:00
return SF_ERR_RESOURCE ;
2009-08-16 16:02:45 +04:00
}
}
2009-08-24 15:11:06 +04:00
# if defined(TCP_QUICKACK)
2009-08-16 16:02:45 +04:00
/* disabling tcp quick ack now allows the first request to leave the
* machine with the first ACK . We only do this if there are pending
2012-11-24 13:24:27 +04:00
* data in the buffer .
2009-08-16 16:02:45 +04:00
*/
2019-05-06 19:32:29 +03:00
if ( flags & ( CONNECT_DELACK_ALWAYS ) | |
( ( flags & CONNECT_DELACK_SMART_CONNECT | |
( flags & CONNECT_HAS_DATA ) | | conn - > send_proxy_ofs ) & &
( be - > options2 & PR_O2_SMARTCON ) ) )
2011-06-24 10:11:37 +04:00
setsockopt ( fd , IPPROTO_TCP , TCP_QUICKACK , & zero , sizeof ( zero ) ) ;
2009-08-16 16:02:45 +04:00
# endif
2015-10-13 17:16:41 +03:00
# ifdef TCP_USER_TIMEOUT
/* there is not much more we can do here when it fails, it's still minor */
if ( srv & & srv - > tcp_ut )
setsockopt ( fd , IPPROTO_TCP , TCP_USER_TIMEOUT , & srv - > tcp_ut , sizeof ( srv - > tcp_ut ) ) ;
# endif
2017-01-24 01:36:45 +03:00
if ( use_fastopen ) {
# if defined(TCP_FASTOPEN_CONNECT)
setsockopt ( fd , IPPROTO_TCP , TCP_FASTOPEN_CONNECT , & one , sizeof ( one ) ) ;
# endif
}
2010-01-21 19:43:04 +03:00
if ( global . tune . server_sndbuf )
setsockopt ( fd , SOL_SOCKET , SO_SNDBUF , & global . tune . server_sndbuf , sizeof ( global . tune . server_sndbuf ) ) ;
if ( global . tune . server_rcvbuf )
setsockopt ( fd , SOL_SOCKET , SO_RCVBUF , & global . tune . server_rcvbuf , sizeof ( global . tune . server_rcvbuf ) ) ;
2019-07-17 16:41:35 +03:00
addr = ( conn - > flags & CO_FL_SOCKS4 ) ? & srv - > socks4_addr : conn - > dst ;
2019-05-22 14:44:48 +03:00
if ( connect ( fd , ( const struct sockaddr * ) addr , get_addr_len ( addr ) ) = = - 1 ) {
2017-01-25 16:12:22 +03:00
if ( errno = = EINPROGRESS | | errno = = EALREADY ) {
/* common case, let's wait for connect status */
conn - > flags | = CO_FL_WAIT_L4_CONN ;
}
else if ( errno = = EISCONN ) {
/* should normally not happen but if so, indicates that it's OK */
conn - > flags & = ~ CO_FL_WAIT_L4_CONN ;
}
2022-04-25 21:32:15 +03:00
else if ( errno = = EAGAIN | | errno = = EWOULDBLOCK | | errno = = EADDRINUSE | | errno = = EADDRNOTAVAIL ) {
2009-08-16 16:02:45 +04:00
char * msg ;
2022-04-25 21:32:15 +03:00
if ( errno = = EAGAIN | | errno = = EWOULDBLOCK | | errno = = EADDRNOTAVAIL ) {
2009-08-16 16:02:45 +04:00
msg = " no free ports " ;
2014-01-24 19:08:19 +04:00
conn - > err_code = CO_ER_FREE_PORTS ;
}
else {
2009-08-16 16:02:45 +04:00
msg = " local address already in use " ;
2014-01-24 19:08:19 +04:00
conn - > err_code = CO_ER_ADDR_INUSE ;
}
2009-08-16 16:02:45 +04:00
2012-12-09 02:03:28 +04:00
qfprintf ( stderr , " Connect() failed for backend %s: %s. \n " , be - > id , msg ) ;
2009-10-18 09:25:52 +04:00
port_range_release_port ( fdinfo [ fd ] . port_range , fdinfo [ fd ] . local_port ) ;
fdinfo [ fd ] . port_range = NULL ;
2009-08-16 16:02:45 +04:00
close ( fd ) ;
2012-12-09 02:03:28 +04:00
send_log ( be , LOG_ERR , " Connect() failed for backend %s: %s. \n " , be - > id , msg ) ;
2014-01-24 19:08:19 +04:00
conn - > flags | = CO_FL_ERROR ;
2015-04-03 02:14:29 +03:00
return SF_ERR_RESOURCE ;
2009-08-16 16:02:45 +04:00
} else if ( errno = = ETIMEDOUT ) {
//qfprintf(stderr,"Connect(): ETIMEDOUT");
2009-10-18 09:25:52 +04:00
port_range_release_port ( fdinfo [ fd ] . port_range , fdinfo [ fd ] . local_port ) ;
fdinfo [ fd ] . port_range = NULL ;
2009-08-16 16:02:45 +04:00
close ( fd ) ;
2014-01-24 19:08:19 +04:00
conn - > err_code = CO_ER_SOCK_ERR ;
conn - > flags | = CO_FL_ERROR ;
2015-04-03 02:14:29 +03:00
return SF_ERR_SRVTO ;
2009-08-16 16:02:45 +04:00
} else {
// (errno == ECONNREFUSED || errno == ENETUNREACH || errno == EACCES || errno == EPERM)
//qfprintf(stderr,"Connect(): %d", errno);
2009-10-18 09:25:52 +04:00
port_range_release_port ( fdinfo [ fd ] . port_range , fdinfo [ fd ] . local_port ) ;
fdinfo [ fd ] . port_range = NULL ;
2009-08-16 16:02:45 +04:00
close ( fd ) ;
2014-01-24 19:08:19 +04:00
conn - > err_code = CO_ER_SOCK_ERR ;
conn - > flags | = CO_FL_ERROR ;
2015-04-03 02:14:29 +03:00
return SF_ERR_SRVCL ;
2009-08-16 16:02:45 +04:00
}
}
2017-01-25 16:12:22 +03:00
else {
/* connect() == 0, this is great! */
conn - > flags & = ~ CO_FL_WAIT_L4_CONN ;
}
2009-08-16 16:02:45 +04:00
MAJOR: connection: add two new flags to indicate readiness of control/transport
Currently the control and transport layers of a connection are supposed
to be initialized when their respective pointers are not NULL. This will
not work anymore when we plan to reuse connections, because there is an
asymmetry between the accept() side and the connect() side :
- on accept() side, the fd is set first, then the ctrl layer then the
transport layer ; upon error, they must be undone in the reverse order,
then the FD must be closed. The FD must not be deleted if the control
layer was not yet initialized ;
- on the connect() side, the fd is set last and there is no reliable way
to know if it has been initialized or not. In practice it's initialized
to -1 first but this is hackish and supposes that local FDs only will
be used forever. Also, there are even less solutions for keeping trace
of the transport layer's state.
Also it is possible to support delayed close() when something (eg: logs)
tracks some information requiring the transport and/or control layers,
making it even more difficult to clean them.
So the proposed solution is to add two flags to the connection :
- CO_FL_CTRL_READY is set when the control layer is initialized (fd_insert)
and cleared after it's released (fd_delete).
- CO_FL_XPRT_READY is set when the transport layer is initialized (xprt->init)
and cleared after it's released (xprt->close).
The functions have been adapted to rely on this and not on the pointers
anymore. conn_xprt_close() was unused and dangerous : it did not close
the control layer (eg: the socket itself) but still marks the transport
layer as closed, preventing any future call to conn_full_close() from
finishing the job.
The problem comes from conn_full_close() in fact. It needs to close the
xprt and ctrl layers independently. After that we're still having an issue :
we don't know based on ->ctrl alone whether the fd was registered or not.
For this we use the two new flags CO_FL_XPRT_READY and CO_FL_CTRL_READY. We
now rely on this and not on conn->xprt nor conn->ctrl anymore to decide what
remains to be done on the connection.
In order not to miss some flag assignments, we introduce conn_ctrl_init()
to initialize the control layer, register the fd using fd_insert() and set
the flag, and conn_ctrl_close() which unregisters the fd and removes the
flag, but only if the transport layer was closed.
Similarly, at the transport layer, conn_xprt_init() calls ->init and sets
the flag, while conn_xprt_close() checks the flag, calls ->close and clears
the flag, regardless xprt_ctx or xprt_st. This also ensures that the ->init
and the ->close functions are called only once each and in the correct order.
Note that conn_xprt_close() does nothing if the transport layer is still
tracked.
conn_full_close() now simply calls conn_xprt_close() then conn_ctrl_close()
in turn, which do nothing if CO_FL_XPRT_TRACKED is set.
In order to handle the error path, we also provide conn_force_close() which
ignores CO_FL_XPRT_TRACKED and closes the transport and the control layers
in turns. All relevant instances of fd_delete() have been replaced with
conn_force_close(). Now we always know what state the connection is in and
we can expect to split its initialization.
2013-10-21 18:30:56 +04:00
conn_ctrl_init ( conn ) ; /* registers the FD */
2021-04-06 18:49:19 +03:00
HA_ATOMIC_OR ( & fdtab [ fd ] . state , FD_LINGER_RISK ) ; /* close hard if needed */
2012-08-31 15:54:11 +04:00
2020-03-04 18:38:00 +03:00
if ( conn - > flags & CO_FL_WAIT_L4_CONN ) {
fd_want_send ( fd ) ;
fd_cant_send ( fd ) ;
MINOR: connection: avoid a useless recvfrom() on outgoing connections
When a connect() doesn't immediately succeed (i.e. most of the time),
fd_cant_send() is called to enable polling. But given that we don't
mark that we cannot receive either, we end up performing a failed
recvfrom() immediately when the connect() is finally confirmed, as
indicated in issue #253.
This patch simply adds fd_cant_recv() as well so that we're only
notified once the recv path is ready. The reason it was not there
is purely historic, as in the past when there was the fd cache,
doing it would have caused a pending recv request to be placed into
the fd cache, hence a useless recvfrom() upon success (i.e. what
happens now).
Without this patch, forwarding 100k connections does this:
% time seconds usecs/call calls errors syscall
------ ----------- ----------- --------- --------- ----------------
17.51 0.704229 7 100000 100000 connect
16.75 0.673875 3 200000 sendto
16.24 0.653222 3 200036 close
10.82 0.435082 1 300000 100000 recvfrom
10.37 0.417266 1 300012 setsockopt
7.12 0.286511 1 199954 epoll_ctl
6.80 0.273447 2 100000 shutdown
5.34 0.214942 2 100005 socket
4.65 0.187137 1 105002 5002 accept4
3.35 0.134757 1 100004 fcntl
0.61 0.024585 4 5858 epoll_wait
With the patch:
% time seconds usecs/call calls errors syscall
------ ----------- ----------- --------- --------- ----------------
18.04 0.697365 6 100000 100000 connect
17.40 0.672471 3 200000 sendto
17.03 0.658134 3 200036 close
10.57 0.408459 1 300012 setsockopt
7.69 0.297270 1 200000 recvfrom
7.32 0.282934 1 199922 epoll_ctl
7.09 0.274027 2 100000 shutdown
5.59 0.216041 2 100005 socket
4.87 0.188352 1 104697 4697 accept4
3.35 0.129641 1 100004 fcntl
0.65 0.024959 4 5337 1 epoll_wait
Note the total disappearance of 1/3 of failed recvfrom() *without*
adding any extra syscall anywhere else.
The trace of an HTTP health check is now totally clean, with no useless
syscall at all anymore:
09:14:21.959255 connect(9, {sa_family=AF_INET, sin_port=htons(8000), sin_addr=inet_addr("127.0.0.1")}, 16) = -1 EINPROGRESS (Operation now in progress)
09:14:21.959292 epoll_ctl(4, EPOLL_CTL_ADD, 9, {EPOLLIN|EPOLLOUT|EPOLLRDHUP, {u32=9, u64=9}}) = 0
09:14:21.959315 epoll_wait(4, [{EPOLLOUT, {u32=9, u64=9}}], 200, 1000) = 1
09:14:21.959376 sendto(9, "OPTIONS / HTTP/1.0\r\ncontent-leng"..., 41, MSG_DONTWAIT|MSG_NOSIGNAL, NULL, 0) = 41
09:14:21.959436 epoll_wait(4, [{EPOLLOUT, {u32=9, u64=9}}], 200, 1000) = 1
09:14:21.959456 epoll_ctl(4, EPOLL_CTL_MOD, 9, {EPOLLIN|EPOLLRDHUP, {u32=9, u64=9}}) = 0
09:14:21.959512 epoll_wait(4, [{EPOLLIN|EPOLLRDHUP, {u32=9, u64=9}}], 200, 1000) = 1
09:14:21.959548 recvfrom(9, "HTTP/1.0 200\r\nContent-length: 0\r"..., 16320, 0, NULL, NULL) = 126
09:14:21.959570 close(9) = 0
With the edge-triggered poller, it gets even better:
09:29:15.776201 connect(9, {sa_family=AF_INET, sin_port=htons(8000), sin_addr=inet_addr("127.0.0.1")}, 16) = -1 EINPROGRESS (Operation now in progress)
09:29:15.776256 epoll_ctl(4, EPOLL_CTL_ADD, 9, {EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLET, {u32=9, u64=9}}) = 0
09:29:15.776287 epoll_wait(4, [{EPOLLOUT, {u32=9, u64=9}}], 200, 1000) = 1
09:29:15.776320 sendto(9, "OPTIONS / HTTP/1.0\r\ncontent-leng"..., 41, MSG_DONTWAIT|MSG_NOSIGNAL, NULL, 0) = 41
09:29:15.776374 epoll_wait(4, [{EPOLLIN|EPOLLOUT|EPOLLRDHUP, {u32=9, u64=9}}], 200, 1000) = 1
09:29:15.776406 recvfrom(9, "HTTP/1.0 200\r\nContent-length: 0\r"..., 16320, 0, NULL, NULL) = 126
09:29:15.776434 close(9) = 0
It could make sense to backport this patch to 2.2 and maybe 2.1 after
it has been sufficiently checked for absence of side effects in 2.3-dev,
as some people had reported an extra overhead like in issue #168.
2020-07-31 09:59:09 +03:00
fd_cant_recv ( fd ) ;
2020-03-04 18:38:00 +03:00
}
MEDIUM: connection: enable reading only once the connection is confirmed
In order to address the absurd polling sequence described in issue #253,
let's make sure we disable receiving on a connection until it's established.
Previously with bottom-top I/Os, we were almost certain that a connection
was ready when the first I/O was confirmed. Now we can enter various
functions, including process_stream(), which will attempt to read
something, will fail, and will then subscribe. But we don't want them
to try to receive if we know the connection didn't complete. The first
prerequisite for this is to mark the connection as not ready for receiving
until it's validated. But we don't want to mark it as not ready for sending
because we know that attempting I/Os later is extremely likely to work
without polling.
Once the connection is confirmed we re-enable recv readiness. In order
for this event to be taken into account, the call to tcp_connect_probe()
was moved earlier, between the attempt to send() and the attempt to recv().
This way if tcp_connect_probe() enables reading, we have a chance to
immediately fall back to this and read the possibly pending data.
Now the trace looks like the following. It's far from being perfect
but we've already saved one recvfrom() and one epollctl():
epoll_wait(3, [], 200, 0) = 0
socket(AF_INET, SOCK_STREAM, IPPROTO_TCP) = 7
fcntl(7, F_SETFL, O_RDONLY|O_NONBLOCK) = 0
setsockopt(7, SOL_TCP, TCP_NODELAY, [1], 4) = 0
connect(7, {sa_family=AF_INET, sin_port=htons(8000), sin_addr=inet_addr("127.0.0.1")}, 16) = -1 EINPROGRESS (Operation now in progress)
epoll_ctl(3, EPOLL_CTL_ADD, 7, {EPOLLIN|EPOLLOUT|EPOLLRDHUP, {u32=7, u64=7}}) = 0
epoll_wait(3, [{EPOLLOUT, {u32=7, u64=7}}], 200, 1000) = 1
connect(7, {sa_family=AF_INET, sin_port=htons(8000), sin_addr=inet_addr("127.0.0.1")}, 16) = 0
getsockopt(7, SOL_SOCKET, SO_ERROR, [0], [4]) = 0
sendto(7, "OPTIONS / HTTP/1.0\r\n\r\n", 22, MSG_DONTWAIT|MSG_NOSIGNAL, NULL, 0) = 22
epoll_ctl(3, EPOLL_CTL_MOD, 7, {EPOLLIN|EPOLLRDHUP, {u32=7, u64=7}}) = 0
epoll_wait(3, [{EPOLLIN|EPOLLRDHUP, {u32=7, u64=7}}], 200, 1000) = 1
getsockopt(7, SOL_SOCKET, SO_ERROR, [0], [4]) = 0
getsockopt(7, SOL_SOCKET, SO_ERROR, [0], [4]) = 0
recvfrom(7, "HTTP/1.0 200\r\nContent-length: 0\r\nX-req: size=22, time=0 ms\r\nX-rsp: id=dummy, code=200, cache=1, size=0, time=0 ms (0 real)\r\n\r\n", 16384, 0, NULL, NULL) = 126
close(7) = 0
2019-09-05 18:05:05 +03:00
2015-04-03 02:14:29 +03:00
return SF_ERR_NONE ; /* connection is OK */
2009-08-16 16:02:45 +04:00
}
2007-10-29 03:09:36 +03:00
/* This function tries to bind a TCPv4/v6 listener. It may return a warning or
2013-01-24 04:41:38 +04:00
* an error message in < errmsg > if the message is at most < errlen > bytes long
* ( including ' \0 ' ) . Note that < errmsg > may be NULL if < errlen > is also zero .
* The return value is composed from ERR_ABORT , ERR_WARN ,
2007-10-29 03:09:36 +03:00
* ERR_ALERT , ERR_RETRYABLE and ERR_FATAL . ERR_NONE indicates that everything
* was alright and that no message was returned . ERR_RETRYABLE means that an
* error occurred but that it may vanish after a retry ( eg : port in use ) , and
2012-04-07 04:39:26 +04:00
* ERR_FATAL indicates a non - fixable error . ERR_WARN and ERR_ALERT do not alter
2007-10-29 03:09:36 +03:00
* the meaning of the error , but just indicate that a message is present which
* should be displayed with the respective level . Last , ERR_ABORT indicates
* that it ' s pointless to try to start other listeners . No error message is
* returned if errlen is NULL .
*/
int tcp_bind_listener ( struct listener * listener , char * errmsg , int errlen )
{
int fd , err ;
2020-09-01 17:12:50 +03:00
int ready ;
2021-01-12 21:24:43 +03:00
struct buffer * msg = alloc_trash_chunk ( ) ;
2007-10-29 03:09:36 +03:00
2020-09-02 19:40:02 +03:00
err = ERR_NONE ;
2021-10-16 15:54:19 +03:00
if ( ! msg ) {
if ( errlen )
snprintf ( errmsg , errlen , " out of memory " ) ;
return ERR_ALERT | ERR_FATAL ;
}
2007-10-29 03:09:36 +03:00
/* ensure we never return garbage */
2013-01-24 04:41:38 +04:00
if ( errlen )
2007-10-29 03:09:36 +03:00
* errmsg = 0 ;
if ( listener - > state ! = LI_ASSIGNED )
return ERR_NONE ; /* already bound */
2020-09-02 19:40:02 +03:00
if ( ! ( listener - > rx . flags & RX_F_BOUND ) ) {
2021-01-12 21:24:43 +03:00
chunk_appendf ( msg , " %sreceiving socket not bound " , msg - > data ? " , " : " " ) ;
2020-09-02 19:40:02 +03:00
goto tcp_return ;
2007-10-29 03:09:36 +03:00
}
2023-02-27 18:40:54 +03:00
if ( listener - > rx . flags & RX_F_MUST_DUP )
goto done ;
2020-09-01 17:12:50 +03:00
fd = listener - > rx . fd ;
2007-10-29 03:09:36 +03:00
2023-01-12 21:37:07 +03:00
if ( listener - > bind_conf - > options & BC_O_NOLINGER )
2011-06-24 10:11:37 +04:00
setsockopt ( fd , SOL_SOCKET , SO_LINGER , & nolinger , sizeof ( struct linger ) ) ;
MINOR: tcp: When binding socket, attempt to reuse one from the old proc.
Try to reuse any socket from the old process, provided by the "-x" flag,
before binding a new one, assuming it is compatible.
"Compatible" here means same address and port, same namspace if any,
same interface if any, and that the following flags are the same :
LI_O_FOREIGN, LI_O_V6ONLY and LI_O_V4V6.
Also change tcp_bind_listener() to always enable/disable socket options,
instead of just doing so if it is in the configuration file, as the option
may have been removed, ie TCP_FASTOPEN may have been set in the old process,
and removed from the new configuration, so we have to disable it.
2017-04-05 23:39:56 +03:00
else {
struct linger tmplinger ;
socklen_t len = sizeof ( tmplinger ) ;
if ( getsockopt ( fd , SOL_SOCKET , SO_LINGER , & tmplinger , & len ) = = 0 & &
( tmplinger . l_onoff = = 1 | | tmplinger . l_linger = = 0 ) ) {
tmplinger . l_onoff = 0 ;
tmplinger . l_linger = 0 ;
setsockopt ( fd , SOL_SOCKET , SO_LINGER , & tmplinger ,
sizeof ( tmplinger ) ) ;
}
}
2008-12-01 01:15:34 +03:00
2009-08-24 15:11:06 +04:00
# if defined(TCP_MAXSEG)
2023-01-12 20:42:49 +03:00
if ( listener - > bind_conf - > maxseg > 0 ) {
2009-08-24 15:11:06 +04:00
if ( setsockopt ( fd , IPPROTO_TCP , TCP_MAXSEG ,
2023-01-12 20:42:49 +03:00
& listener - > bind_conf - > maxseg , sizeof ( listener - > bind_conf - > maxseg ) ) = = - 1 ) {
chunk_appendf ( msg , " %scannot set MSS to %d " , msg - > data ? " , " : " " , listener - > bind_conf - > maxseg ) ;
2009-06-14 20:48:19 +04:00
err | = ERR_WARN ;
}
2020-09-01 17:12:50 +03:00
} else {
/* we may want to try to restore the default MSS if the socket was inherited */
MINOR: tcp: When binding socket, attempt to reuse one from the old proc.
Try to reuse any socket from the old process, provided by the "-x" flag,
before binding a new one, assuming it is compatible.
"Compatible" here means same address and port, same namspace if any,
same interface if any, and that the following flags are the same :
LI_O_FOREIGN, LI_O_V6ONLY and LI_O_V4V6.
Also change tcp_bind_listener() to always enable/disable socket options,
instead of just doing so if it is in the configuration file, as the option
may have been removed, ie TCP_FASTOPEN may have been set in the old process,
and removed from the new configuration, so we have to disable it.
2017-04-05 23:39:56 +03:00
int tmpmaxseg = - 1 ;
int defaultmss ;
socklen_t len = sizeof ( tmpmaxseg ) ;
2020-08-27 08:48:42 +03:00
if ( listener - > rx . addr . ss_family = = AF_INET )
2020-08-28 19:03:10 +03:00
defaultmss = sock_inet_tcp_maxseg_default ;
MINOR: tcp: When binding socket, attempt to reuse one from the old proc.
Try to reuse any socket from the old process, provided by the "-x" flag,
before binding a new one, assuming it is compatible.
"Compatible" here means same address and port, same namspace if any,
same interface if any, and that the following flags are the same :
LI_O_FOREIGN, LI_O_V6ONLY and LI_O_V4V6.
Also change tcp_bind_listener() to always enable/disable socket options,
instead of just doing so if it is in the configuration file, as the option
may have been removed, ie TCP_FASTOPEN may have been set in the old process,
and removed from the new configuration, so we have to disable it.
2017-04-05 23:39:56 +03:00
else
2020-08-28 19:03:10 +03:00
defaultmss = sock_inet6_tcp_maxseg_default ;
MINOR: tcp: When binding socket, attempt to reuse one from the old proc.
Try to reuse any socket from the old process, provided by the "-x" flag,
before binding a new one, assuming it is compatible.
"Compatible" here means same address and port, same namspace if any,
same interface if any, and that the following flags are the same :
LI_O_FOREIGN, LI_O_V6ONLY and LI_O_V4V6.
Also change tcp_bind_listener() to always enable/disable socket options,
instead of just doing so if it is in the configuration file, as the option
may have been removed, ie TCP_FASTOPEN may have been set in the old process,
and removed from the new configuration, so we have to disable it.
2017-04-05 23:39:56 +03:00
getsockopt ( fd , IPPROTO_TCP , TCP_MAXSEG , & tmpmaxseg , & len ) ;
2020-02-12 17:53:04 +03:00
if ( defaultmss > 0 & &
tmpmaxseg ! = defaultmss & &
setsockopt ( fd , IPPROTO_TCP , TCP_MAXSEG , & defaultmss , sizeof ( defaultmss ) ) = = - 1 ) {
2021-10-14 12:39:18 +03:00
chunk_appendf ( msg , " %scannot set MSS to %d " , msg - > data ? " , " : " " , defaultmss ) ;
MINOR: tcp: When binding socket, attempt to reuse one from the old proc.
Try to reuse any socket from the old process, provided by the "-x" flag,
before binding a new one, assuming it is compatible.
"Compatible" here means same address and port, same namspace if any,
same interface if any, and that the following flags are the same :
LI_O_FOREIGN, LI_O_V6ONLY and LI_O_V4V6.
Also change tcp_bind_listener() to always enable/disable socket options,
instead of just doing so if it is in the configuration file, as the option
may have been removed, ie TCP_FASTOPEN may have been set in the old process,
and removed from the new configuration, so we have to disable it.
2017-04-05 23:39:56 +03:00
err | = ERR_WARN ;
}
2009-06-14 20:48:19 +04:00
}
2009-10-13 09:34:14 +04:00
# endif
2015-02-04 02:45:58 +03:00
# if defined(TCP_USER_TIMEOUT)
2023-01-12 20:42:49 +03:00
if ( listener - > bind_conf - > tcp_ut ) {
2015-02-04 02:45:58 +03:00
if ( setsockopt ( fd , IPPROTO_TCP , TCP_USER_TIMEOUT ,
2023-01-12 20:42:49 +03:00
& listener - > bind_conf - > tcp_ut , sizeof ( listener - > bind_conf - > tcp_ut ) ) = = - 1 ) {
2021-01-12 21:24:43 +03:00
chunk_appendf ( msg , " %scannot set TCP User Timeout " , msg - > data ? " , " : " " ) ;
2015-02-04 02:45:58 +03:00
err | = ERR_WARN ;
}
MINOR: tcp: When binding socket, attempt to reuse one from the old proc.
Try to reuse any socket from the old process, provided by the "-x" flag,
before binding a new one, assuming it is compatible.
"Compatible" here means same address and port, same namspace if any,
same interface if any, and that the following flags are the same :
LI_O_FOREIGN, LI_O_V6ONLY and LI_O_V4V6.
Also change tcp_bind_listener() to always enable/disable socket options,
instead of just doing so if it is in the configuration file, as the option
may have been removed, ie TCP_FASTOPEN may have been set in the old process,
and removed from the new configuration, so we have to disable it.
2017-04-05 23:39:56 +03:00
} else
setsockopt ( fd , IPPROTO_TCP , TCP_USER_TIMEOUT , & zero ,
sizeof ( zero ) ) ;
2015-02-04 02:45:58 +03:00
# endif
2009-10-13 09:34:14 +04:00
# if defined(TCP_DEFER_ACCEPT)
2023-01-12 21:42:48 +03:00
if ( listener - > bind_conf - > options & BC_O_DEF_ACCEPT ) {
2009-10-13 09:34:14 +04:00
/* defer accept by up to one second */
int accept_delay = 1 ;
if ( setsockopt ( fd , IPPROTO_TCP , TCP_DEFER_ACCEPT , & accept_delay , sizeof ( accept_delay ) ) = = - 1 ) {
2021-01-12 21:24:43 +03:00
chunk_appendf ( msg , " %scannot enable DEFER_ACCEPT " , msg - > data ? " , " : " " ) ;
2009-10-13 09:34:14 +04:00
err | = ERR_WARN ;
}
MINOR: tcp: When binding socket, attempt to reuse one from the old proc.
Try to reuse any socket from the old process, provided by the "-x" flag,
before binding a new one, assuming it is compatible.
"Compatible" here means same address and port, same namspace if any,
same interface if any, and that the following flags are the same :
LI_O_FOREIGN, LI_O_V6ONLY and LI_O_V4V6.
Also change tcp_bind_listener() to always enable/disable socket options,
instead of just doing so if it is in the configuration file, as the option
may have been removed, ie TCP_FASTOPEN may have been set in the old process,
and removed from the new configuration, so we have to disable it.
2017-04-05 23:39:56 +03:00
} else
setsockopt ( fd , IPPROTO_TCP , TCP_DEFER_ACCEPT , & zero ,
sizeof ( zero ) ) ;
2012-10-05 18:21:00 +04:00
# endif
# if defined(TCP_FASTOPEN)
2023-01-12 21:45:58 +03:00
if ( listener - > bind_conf - > options & BC_O_TCP_FO ) {
2012-10-05 18:21:00 +04:00
/* TFO needs a queue length, let's use the configured backlog */
2019-02-27 17:39:41 +03:00
int qlen = listener_backlog ( listener ) ;
2012-10-05 18:21:00 +04:00
if ( setsockopt ( fd , IPPROTO_TCP , TCP_FASTOPEN , & qlen , sizeof ( qlen ) ) = = - 1 ) {
2021-01-12 21:24:43 +03:00
chunk_appendf ( msg , " %scannot enable TCP_FASTOPEN " , msg - > data ? " , " : " " ) ;
2012-10-05 18:21:00 +04:00
err | = ERR_WARN ;
}
MINOR: tcp: When binding socket, attempt to reuse one from the old proc.
Try to reuse any socket from the old process, provided by the "-x" flag,
before binding a new one, assuming it is compatible.
"Compatible" here means same address and port, same namspace if any,
same interface if any, and that the following flags are the same :
LI_O_FOREIGN, LI_O_V6ONLY and LI_O_V4V6.
Also change tcp_bind_listener() to always enable/disable socket options,
instead of just doing so if it is in the configuration file, as the option
may have been removed, ie TCP_FASTOPEN may have been set in the old process,
and removed from the new configuration, so we have to disable it.
2017-04-05 23:39:56 +03:00
} else {
socklen_t len ;
int qlen ;
len = sizeof ( qlen ) ;
/* Only disable fast open if it was enabled, we don't want
* the kernel to create a fast open queue if there ' s none .
*/
if ( getsockopt ( fd , IPPROTO_TCP , TCP_FASTOPEN , & qlen , & len ) = = 0 & &
qlen ! = 0 ) {
if ( setsockopt ( fd , IPPROTO_TCP , TCP_FASTOPEN , & zero ,
sizeof ( zero ) ) = = - 1 ) {
2021-01-12 21:24:43 +03:00
chunk_appendf ( msg , " %scannot disable TCP_FASTOPEN " , msg - > data ? " , " : " " ) ;
MINOR: tcp: When binding socket, attempt to reuse one from the old proc.
Try to reuse any socket from the old process, provided by the "-x" flag,
before binding a new one, assuming it is compatible.
"Compatible" here means same address and port, same namspace if any,
same interface if any, and that the following flags are the same :
LI_O_FOREIGN, LI_O_V6ONLY and LI_O_V4V6.
Also change tcp_bind_listener() to always enable/disable socket options,
instead of just doing so if it is in the configuration file, as the option
may have been removed, ie TCP_FASTOPEN may have been set in the old process,
and removed from the new configuration, so we have to disable it.
2017-04-05 23:39:56 +03:00
err | = ERR_WARN ;
}
}
2012-10-05 18:21:00 +04:00
}
2007-10-29 03:09:36 +03:00
# endif
2020-10-13 18:42:21 +03:00
2020-10-15 10:19:43 +03:00
ready = sock_accepting_conn ( & listener - > rx ) > 0 ;
2013-03-11 02:51:38 +04:00
2020-09-01 17:12:50 +03:00
if ( ! ready & & /* only listen if not already done by external process */
2019-02-27 17:39:41 +03:00
listen ( fd , listener_backlog ( listener ) ) = = - 1 ) {
2007-10-29 03:09:36 +03:00
err | = ERR_RETRYABLE | ERR_ALERT ;
2021-01-12 21:24:43 +03:00
chunk_appendf ( msg , " %scannot listen to socket " , msg - > data ? " , " : " " ) ;
2007-10-29 03:09:36 +03:00
goto tcp_close_return ;
}
2008-12-01 01:15:34 +03:00
2021-02-06 15:11:11 +03:00
# if !defined(TCP_DEFER_ACCEPT) && defined(SO_ACCEPTFILTER)
/* the socket needs to listen first */
2023-01-12 21:42:48 +03:00
if ( listener - > bind_conf - > options & BC_O_DEF_ACCEPT ) {
2021-02-06 15:11:11 +03:00
struct accept_filter_arg accept ;
memset ( & accept , 0 , sizeof ( accept ) ) ;
2023-04-08 15:58:53 +03:00
strlcpy2 ( accept . af_name , " dataready " , sizeof ( accept . af_name ) ) ;
2021-02-06 15:11:11 +03:00
if ( setsockopt ( fd , SOL_SOCKET , SO_ACCEPTFILTER , & accept , sizeof ( accept ) ) = = - 1 ) {
2021-01-12 21:24:43 +03:00
chunk_appendf ( msg , " %scannot enable ACCEPT_FILTER " , msg - > data ? " , " : " " ) ;
2021-02-06 15:11:11 +03:00
err | = ERR_WARN ;
}
}
# endif
2009-08-24 15:11:06 +04:00
# if defined(TCP_QUICKACK)
2023-01-12 21:40:42 +03:00
if ( listener - > bind_conf - > options & BC_O_NOQUICKACK )
2011-06-24 10:11:37 +04:00
setsockopt ( fd , IPPROTO_TCP , TCP_QUICKACK , & zero , sizeof ( zero ) ) ;
MINOR: tcp: When binding socket, attempt to reuse one from the old proc.
Try to reuse any socket from the old process, provided by the "-x" flag,
before binding a new one, assuming it is compatible.
"Compatible" here means same address and port, same namspace if any,
same interface if any, and that the following flags are the same :
LI_O_FOREIGN, LI_O_V6ONLY and LI_O_V4V6.
Also change tcp_bind_listener() to always enable/disable socket options,
instead of just doing so if it is in the configuration file, as the option
may have been removed, ie TCP_FASTOPEN may have been set in the old process,
and removed from the new configuration, so we have to disable it.
2017-04-05 23:39:56 +03:00
else
setsockopt ( fd , IPPROTO_TCP , TCP_QUICKACK , & one , sizeof ( one ) ) ;
2009-06-14 14:07:01 +04:00
# endif
2023-02-27 18:40:54 +03:00
done :
2007-10-29 03:09:36 +03:00
/* the socket is ready */
2020-09-24 08:23:45 +03:00
listener_set_state ( listener , LI_LISTEN ) ;
2020-10-07 12:14:47 +03:00
goto tcp_return ;
2007-10-29 03:09:36 +03:00
2020-09-01 17:12:50 +03:00
tcp_close_return :
2021-01-12 21:24:43 +03:00
free_trash_chunk ( msg ) ;
msg = NULL ;
2020-09-01 17:12:50 +03:00
close ( fd ) ;
2020-09-02 19:40:02 +03:00
tcp_return :
2021-01-12 21:24:43 +03:00
if ( msg & & errlen & & msg - > data ) {
2010-11-01 21:26:01 +03:00
char pn [ INET6_ADDRSTRLEN ] ;
2020-08-27 08:48:42 +03:00
addr_to_str ( & listener - > rx . addr , pn , sizeof ( pn ) ) ;
2021-10-14 12:59:15 +03:00
snprintf ( errmsg , errlen , " %s for [%s:%d] " , msg - > area , pn , get_host_port ( & listener - > rx . addr ) ) ;
2010-11-01 21:26:01 +03:00
}
2021-01-12 21:24:43 +03:00
free_trash_chunk ( msg ) ;
msg = NULL ;
2007-10-29 03:09:36 +03:00
return err ;
}
2020-09-25 20:27:39 +03:00
/* Enable receipt of incoming connections for listener <l>. The receiver must
2020-11-04 15:59:04 +03:00
* still be valid .
2020-09-25 20:27:39 +03:00
*/
static void tcp_enable_listener ( struct listener * l )
{
2020-11-04 15:59:04 +03:00
fd_want_recv_safe ( l - > rx . fd ) ;
2020-09-25 20:27:39 +03:00
}
/* Disable receipt of incoming connections for listener <l>. The receiver must
2020-11-04 15:59:04 +03:00
* still be valid .
2020-09-25 20:27:39 +03:00
*/
static void tcp_disable_listener ( struct listener * l )
{
2020-11-04 15:59:04 +03:00
fd_stop_recv ( l - > rx . fd ) ;
2020-09-25 20:27:39 +03:00
}
2020-09-25 18:12:32 +03:00
/* Suspend a receiver. Returns < 0 in case of failure, 0 if the receiver
2023-01-16 13:47:01 +03:00
* was totally stopped , or > 0 if correctly suspended . Note that inherited FDs
* are neither suspended nor resumed , we only enable / disable polling on them .
2014-07-07 22:22:12 +04:00
*/
2020-09-25 18:12:32 +03:00
static int tcp_suspend_receiver ( struct receiver * rx )
2014-07-07 22:22:12 +04:00
{
2020-10-14 11:50:41 +03:00
const struct sockaddr sa = { . sa_family = AF_UNSPEC } ;
2020-10-13 18:42:21 +03:00
int ret ;
2020-10-08 17:51:09 +03:00
2023-01-16 13:47:01 +03:00
/* We never disconnect a shared FD otherwise we'd break it in the
2020-11-04 16:14:55 +03:00
* parent process and any possible subsequent worker inheriting it .
2023-01-16 13:47:01 +03:00
* Thus we just stop receiving from it .
2020-11-04 16:14:55 +03:00
*/
if ( rx - > flags & RX_F_INHERITED )
2023-01-16 13:47:01 +03:00
goto done ;
2020-11-04 16:14:55 +03:00
2020-10-13 17:34:19 +03:00
if ( connect ( rx - > fd , & sa , sizeof ( sa ) ) < 0 )
goto check_already_done ;
2023-01-16 13:47:01 +03:00
done :
2020-09-25 18:12:32 +03:00
fd_stop_recv ( rx - > fd ) ;
2014-07-07 22:22:12 +04:00
return 1 ;
2020-10-08 17:51:09 +03:00
check_already_done :
/* in case one of the shutdown() above fails, it might be because we're
* dealing with a socket that is shared with other processes doing the
* same . Let ' s check if it ' s still accepting connections .
*/
2020-10-15 10:19:43 +03:00
ret = sock_accepting_conn ( rx ) ;
2020-10-13 18:42:21 +03:00
if ( ret < = 0 ) {
/* unrecoverable or paused by another process */
2020-09-25 18:12:32 +03:00
fd_stop_recv ( rx - > fd ) ;
2020-10-13 18:42:21 +03:00
return ret = = 0 ;
2020-09-24 19:20:37 +03:00
}
2020-10-08 17:51:09 +03:00
2020-10-13 18:42:21 +03:00
/* still listening, that's not good */
2020-10-08 17:51:09 +03:00
return - 1 ;
2014-07-07 22:22:12 +04:00
}
2020-09-25 20:40:31 +03:00
/* Resume a receiver. Returns < 0 in case of failure, 0 if the receiver
2023-01-16 13:47:01 +03:00
* was totally stopped , or > 0 if correctly resumed . Note that inherited FDs
* are neither suspended nor resumed , we only enable / disable polling on them .
2020-09-25 20:40:31 +03:00
*/
static int tcp_resume_receiver ( struct receiver * rx )
{
struct listener * l = LIST_ELEM ( rx , struct listener * , rx ) ;
if ( rx - > fd < 0 )
return 0 ;
2023-01-16 13:47:01 +03:00
if ( ( rx - > flags & RX_F_INHERITED ) | | listen ( rx - > fd , listener_backlog ( l ) ) = = 0 ) {
2020-09-25 20:40:31 +03:00
fd_want_recv ( l - > rx . fd ) ;
return 1 ;
}
return - 1 ;
}
2007-10-29 03:09:36 +03:00
/*
 * Local variables:
 *  c-indent-level: 8
 *  c-basic-offset: 8
 * End:
 */