2007-10-29 03:09:36 +03:00
/*
* AF_INET / AF_INET6 SOCK_STREAM protocol layer ( tcp )
*
MEDIUM: samples: move payload-based fetches and ACLs to their own file
The file acl.c is a real mess, it both contains functions to parse and
process ACLs, and some sample extraction functions which act on buffers.
Some other payload analysers were arbitrarily dispatched to proto_tcp.c.
So now we're moving all payload-based fetches and ACLs to payload.c
which is capable of extracting data from buffers and rely on everything
that is protocol-independent. That way we can safely inflate this file
and only use the other ones when some fetches are really specific (eg:
HTTP, SSL, ...).
As a result of this cleanup, the following new sample fetches became
available even if they're not really useful :
always_false, always_true, rep_ssl_hello_type, rdp_cookie_cnt,
req_len, req_ssl_hello_type, req_ssl_sni, req_ssl_ver, wait_end
The function 'acl_fetch_nothing' was wrong and never used anywhere so it
was removed.
The "rdp_cookie" sample fetch used to have a mandatory argument while it
was optional in ACLs, which are supposed to iterate over RDP cookies. So
we're making it optional as a fetch too, and it will return the first one.
2013-01-08 00:59:07 +04:00
* Copyright 2000 - 2013 Willy Tarreau < w @ 1 wt . eu >
2007-10-29 03:09:36 +03:00
*
* This program is free software ; you can redistribute it and / or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation ; either version
* 2 of the License , or ( at your option ) any later version .
*
*/
2016-08-08 15:12:08 +03:00
/* this is to have tcp_info defined on systems using musl
* library , such as Alpine Linux
*/
# define _GNU_SOURCE
2007-10-29 03:09:36 +03:00
# include <ctype.h>
# include <errno.h>
# include <fcntl.h>
# include <stdio.h>
# include <stdlib.h>
# include <string.h>
# include <time.h>
# include <sys/param.h>
# include <sys/socket.h>
# include <sys/stat.h>
# include <sys/types.h>
# include <sys/un.h>
2009-08-24 15:11:06 +04:00
# include <netinet/tcp.h>
2015-08-24 02:43:45 +03:00
# include <netinet/in.h>
2009-08-24 15:11:06 +04:00
2007-10-29 03:09:36 +03:00
# include <common/compat.h>
# include <common/config.h>
# include <common/debug.h>
# include <common/errors.h>
# include <common/mini-clist.h>
# include <common/standard.h>
2014-11-17 17:11:45 +03:00
# include <common/namespace.h>
2007-10-29 03:09:36 +03:00
2016-11-25 17:49:32 +03:00
# include <types/action.h>
2015-08-24 02:43:45 +03:00
# include <types/connection.h>
2016-11-25 17:49:32 +03:00
# include <types/global.h>
# include <types/stream.h>
2007-10-29 03:09:36 +03:00
2012-04-20 16:45:49 +04:00
# include <proto/arg.h>
2012-08-24 21:22:53 +04:00
# include <proto/channel.h>
2012-07-06 16:29:45 +04:00
# include <proto/connection.h>
2012-09-03 00:34:23 +04:00
# include <proto/fd.h>
2012-09-13 00:58:11 +04:00
# include <proto/listener.h>
2009-08-16 16:02:45 +04:00
# include <proto/log.h>
# include <proto/port_range.h>
2012-09-13 00:58:11 +04:00
# include <proto/protocol.h>
2015-08-24 02:43:45 +03:00
# include <proto/proto_http.h>
2007-10-29 03:09:36 +03:00
# include <proto/proto_tcp.h>
[MAJOR] implement tcp request content inspection
Some people need to inspect contents of TCP requests before
deciding to forward a connection or not. A future extension
of this demand might consist in selecting a server farm
depending on the protocol detected in the request.
For this reason, a new state CL_STINSPECT has been added on
the client side. It is immediately entered upon accept() if
the statement "tcp-request inspect-delay <xxx>" is found in
the frontend configuration. Haproxy will then wait up to
this amount of time trying to find a matching ACL, and will
either accept or reject the connection depending on the
"tcp-request content <action> {if|unless}" rules, where
<action> is either "accept" or "reject".
Note that it only waits that long if no definitive verdict
can be found earlier. That generally implies calling a fetch()
function which does not have enough information to decode
some contents, or a match() function which only finds the
beginning of what it's looking for.
It is only at the ACL level that partial data may be processed
as such, because we need to distinguish between MISS and FAIL
*before* applying the term negation.
Thus it is enough to add "| ACL_PARTIAL" to the last argument
when calling acl_exec_cond() to indicate that we expect
ACL_PAT_MISS to be returned if some data is missing (for
fetch() or match()). This is the only case we may return
this value. For this reason, the ACL check in process_cli()
has become a lot simpler.
A new ACL "req_len" of type "int" has been added. Right now
it is already possible to drop requests which talk too early
(eg: for SMTP) or which don't talk at all (eg: HTTP/SSL).
Also, the acl fetch() functions have been extended in order
to permit reporting of missing data in case of fetch failure,
using the ACL_TEST_F_MAY_CHANGE flag.
The default behaviour is unchanged, and if no rule matches,
the request is accepted.
As a side effect, all layer 7 fetching functions have been
cleaned up so that they now check for the validity of the
layer 7 pointer before dereferencing it.
2008-07-15 01:54:42 +04:00
# include <proto/proxy.h>
2012-04-27 23:52:18 +04:00
# include <proto/sample.h>
2015-10-13 17:16:41 +03:00
# include <proto/server.h>
2010-06-05 21:13:27 +04:00
# include <proto/task.h>
2016-11-25 17:49:32 +03:00
# include <proto/tcp_rules.h>
2007-10-29 03:09:36 +03:00
2010-10-22 18:06:11 +04:00
/* Forward declarations of the file-local callbacks which are referenced by
 * the proto_tcpv4 and proto_tcpv6 protocol descriptors below (bind_all, bind
 * and add respectively).
 */
static int tcp_bind_listeners ( struct protocol * proto , char * errmsg , int errlen ) ;
static int tcp_bind_listener ( struct listener * listener , char * errmsg , int errlen ) ;
2017-09-15 08:55:51 +03:00
static void tcpv4_add_listener ( struct listener * listener , int port ) ;
static void tcpv6_add_listener ( struct listener * listener , int port ) ;
2007-10-29 03:09:36 +03:00
/* Protocol descriptor for TCP over IPv4 : AF_INET stream sockets using
 * IPPROTO_TCP. It maps the protocol callbacks to the TCP and listener
 * functions, tcpv4_add_listener being the <add> callback, and carries the
 * list of listeners of this protocol together with their count, which
 * starts at zero.
 * Note: must not be declared <const> as its list will be overwritten.
 */
static struct protocol proto_tcpv4 = {
. name = " tcpv4 " ,
. sock_domain = AF_INET ,
. sock_type = SOCK_STREAM ,
. sock_prot = IPPROTO_TCP ,
. sock_family = AF_INET ,
. sock_addrlen = sizeof ( struct sockaddr_in ) ,
/* 32 bits of IPv4 address, expressed in bytes */
. l3_addrlen = 32 / 8 ,
2012-05-07 23:22:09 +04:00
. accept = & listener_accept ,
2012-05-07 20:12:14 +04:00
. connect = tcp_connect_server ,
2010-10-22 18:06:11 +04:00
/* <bind> and <bind_all> are static functions declared earlier in this file */
. bind = tcp_bind_listener ,
2007-10-29 03:09:36 +03:00
. bind_all = tcp_bind_listeners ,
. unbind_all = unbind_all_listeners ,
. enable_all = enable_all_listeners ,
2012-05-11 18:16:40 +04:00
. get_src = tcp_get_src ,
. get_dst = tcp_get_dst ,
MEDIUM: protocol: implement a "drain" function in protocol layers
Since commit cfd97c6f was merged into 1.5-dev14 (BUG/MEDIUM: checks:
prevent TIME_WAITs from appearing also on timeouts), some valid health
checks sometimes used to show some TCP resets. For example, this HTTP
health check sent to a local server :
19:55:15.742818 IP 127.0.0.1.16568 > 127.0.0.1.8000: S 3355859679:3355859679(0) win 32792 <mss 16396,nop,nop,sackOK,nop,wscale 7>
19:55:15.742841 IP 127.0.0.1.8000 > 127.0.0.1.16568: S 1060952566:1060952566(0) ack 3355859680 win 32792 <mss 16396,nop,nop,sackOK,nop,wscale 7>
19:55:15.742863 IP 127.0.0.1.16568 > 127.0.0.1.8000: . ack 1 win 257
19:55:15.745402 IP 127.0.0.1.16568 > 127.0.0.1.8000: P 1:23(22) ack 1 win 257
19:55:15.745488 IP 127.0.0.1.8000 > 127.0.0.1.16568: FP 1:146(145) ack 23 win 257
19:55:15.747109 IP 127.0.0.1.16568 > 127.0.0.1.8000: R 23:23(0) ack 147 win 257
After some discussion with Chris Huang-Leaver, it appeared clear that
what we want is to only send the RST when we have no other choice, which
means when the server has not closed. So we still keep SYN/SYN-ACK/RST
for pure TCP checks, but don't want to see an RST emitted as above when
the server has already sent the FIN.
The solution against this consists in implementing a "drain" function at
the protocol layer, which, when defined, causes as much as possible of
the input socket buffer to be flushed to make recv() return zero so that
we know that the server's FIN was received and ACKed. On Linux, we can make
use of MSG_TRUNC on TCP sockets, which has the benefit of draining everything
at once without even copying data. On other platforms, we read up to one
buffer of data before the close. If recv() manages to get the final zero,
we don't disable lingering. Same for hard errors. Otherwise we do.
In practice, on HTTP health checks we generally find that the close was
pending and is returned upon first recv() call. The network trace becomes
cleaner :
19:55:23.650621 IP 127.0.0.1.16561 > 127.0.0.1.8000: S 3982804816:3982804816(0) win 32792 <mss 16396,nop,nop,sackOK,nop,wscale 7>
19:55:23.650644 IP 127.0.0.1.8000 > 127.0.0.1.16561: S 4082139313:4082139313(0) ack 3982804817 win 32792 <mss 16396,nop,nop,sackOK,nop,wscale 7>
19:55:23.650666 IP 127.0.0.1.16561 > 127.0.0.1.8000: . ack 1 win 257
19:55:23.651615 IP 127.0.0.1.16561 > 127.0.0.1.8000: P 1:23(22) ack 1 win 257
19:55:23.651696 IP 127.0.0.1.8000 > 127.0.0.1.16561: FP 1:146(145) ack 23 win 257
19:55:23.652628 IP 127.0.0.1.16561 > 127.0.0.1.8000: F 23:23(0) ack 147 win 257
19:55:23.652655 IP 127.0.0.1.8000 > 127.0.0.1.16561: . ack 24 win 257
This change should be backported to 1.4 which is where Chris encountered
this issue. The code is different, so probably the tcp_drain() function
will have to be put in the checks only.
2013-06-10 21:56:38 +04:00
. drain = tcp_drain ,
2014-07-07 22:22:12 +04:00
. pause = tcp_pause_listener ,
2017-09-15 08:55:51 +03:00
. add = tcpv4_add_listener ,
2007-10-29 03:09:36 +03:00
. listeners = LIST_HEAD_INIT ( proto_tcpv4 . listeners ) ,
. nb_listeners = 0 ,
} ;
/* Protocol descriptor for TCP over IPv6 : AF_INET6 stream sockets using
 * IPPROTO_TCP. It maps the protocol callbacks to the TCP and listener
 * functions, tcpv6_add_listener being the <add> callback, and carries the
 * list of listeners of this protocol together with their count, which
 * starts at zero.
 * Note: must not be declared <const> as its list will be overwritten.
 */
static struct protocol proto_tcpv6 = {
. name = " tcpv6 " ,
. sock_domain = AF_INET6 ,
. sock_type = SOCK_STREAM ,
. sock_prot = IPPROTO_TCP ,
. sock_family = AF_INET6 ,
. sock_addrlen = sizeof ( struct sockaddr_in6 ) ,
/* 128 bits of IPv6 address, expressed in bytes */
. l3_addrlen = 128 / 8 ,
2012-05-07 23:22:09 +04:00
. accept = & listener_accept ,
2012-05-07 20:12:14 +04:00
. connect = tcp_connect_server ,
2010-10-22 18:06:11 +04:00
/* <bind> and <bind_all> are static functions declared earlier in this file */
. bind = tcp_bind_listener ,
2007-10-29 03:09:36 +03:00
. bind_all = tcp_bind_listeners ,
. unbind_all = unbind_all_listeners ,
. enable_all = enable_all_listeners ,
2012-05-11 18:16:40 +04:00
. get_src = tcp_get_src ,
. get_dst = tcp_get_dst ,
MEDIUM: protocol: implement a "drain" function in protocol layers
Since commit cfd97c6f was merged into 1.5-dev14 (BUG/MEDIUM: checks:
prevent TIME_WAITs from appearing also on timeouts), some valid health
checks sometimes used to show some TCP resets. For example, this HTTP
health check sent to a local server :
19:55:15.742818 IP 127.0.0.1.16568 > 127.0.0.1.8000: S 3355859679:3355859679(0) win 32792 <mss 16396,nop,nop,sackOK,nop,wscale 7>
19:55:15.742841 IP 127.0.0.1.8000 > 127.0.0.1.16568: S 1060952566:1060952566(0) ack 3355859680 win 32792 <mss 16396,nop,nop,sackOK,nop,wscale 7>
19:55:15.742863 IP 127.0.0.1.16568 > 127.0.0.1.8000: . ack 1 win 257
19:55:15.745402 IP 127.0.0.1.16568 > 127.0.0.1.8000: P 1:23(22) ack 1 win 257
19:55:15.745488 IP 127.0.0.1.8000 > 127.0.0.1.16568: FP 1:146(145) ack 23 win 257
19:55:15.747109 IP 127.0.0.1.16568 > 127.0.0.1.8000: R 23:23(0) ack 147 win 257
After some discussion with Chris Huang-Leaver, it appeared clear that
what we want is to only send the RST when we have no other choice, which
means when the server has not closed. So we still keep SYN/SYN-ACK/RST
for pure TCP checks, but don't want to see an RST emitted as above when
the server has already sent the FIN.
The solution against this consists in implementing a "drain" function at
the protocol layer, which, when defined, causes as much as possible of
the input socket buffer to be flushed to make recv() return zero so that
we know that the server's FIN was received and ACKed. On Linux, we can make
use of MSG_TRUNC on TCP sockets, which has the benefit of draining everything
at once without even copying data. On other platforms, we read up to one
buffer of data before the close. If recv() manages to get the final zero,
we don't disable lingering. Same for hard errors. Otherwise we do.
In practice, on HTTP health checks we generally find that the close was
pending and is returned upon first recv() call. The network trace becomes
cleaner :
19:55:23.650621 IP 127.0.0.1.16561 > 127.0.0.1.8000: S 3982804816:3982804816(0) win 32792 <mss 16396,nop,nop,sackOK,nop,wscale 7>
19:55:23.650644 IP 127.0.0.1.8000 > 127.0.0.1.16561: S 4082139313:4082139313(0) ack 3982804817 win 32792 <mss 16396,nop,nop,sackOK,nop,wscale 7>
19:55:23.650666 IP 127.0.0.1.16561 > 127.0.0.1.8000: . ack 1 win 257
19:55:23.651615 IP 127.0.0.1.16561 > 127.0.0.1.8000: P 1:23(22) ack 1 win 257
19:55:23.651696 IP 127.0.0.1.8000 > 127.0.0.1.16561: FP 1:146(145) ack 23 win 257
19:55:23.652628 IP 127.0.0.1.16561 > 127.0.0.1.8000: F 23:23(0) ack 147 win 257
19:55:23.652655 IP 127.0.0.1.8000 > 127.0.0.1.16561: . ack 24 win 257
This change should be backported to 1.4 which is where Chris encountered
this issue. The code is different, so probably the tcp_drain() function
will have to be put in the checks only.
2013-06-10 21:56:38 +04:00
. drain = tcp_drain ,
2014-07-07 22:22:12 +04:00
. pause = tcp_pause_listener ,
2017-09-15 08:55:51 +03:00
. add = tcpv6_add_listener ,
2007-10-29 03:09:36 +03:00
. listeners = LIST_HEAD_INIT ( proto_tcpv6 . listeners ) ,
. nb_listeners = 0 ,
} ;
MINOR: tcp: When binding socket, attempt to reuse one from the old proc.
Try to reuse any socket from the old process, provided by the "-x" flag,
before binding a new one, assuming it is compatible.
"Compatible" here means same address and port, same namespace if any,
same interface if any, and that the following flags are the same :
LI_O_FOREIGN, LI_O_V6ONLY and LI_O_V4V6.
Also change tcp_bind_listener() to always enable/disable socket options,
instead of just doing so if it is in the configuration file, as the option
may have been removed, ie TCP_FASTOPEN may have been set in the old process,
and removed from the new configuration, so we have to disable it.
2017-04-05 23:39:56 +03:00
/* Default TCP parameters, got by opening a temporary TCP socket. */
# ifdef TCP_MAXSEG
2017-10-29 22:14:08 +03:00
/* Default TCP_MAXSEG values, one for IPv4 and one for IPv6 sockets. Both are
 * declared THREAD_LOCAL and start at -1.
 * NOTE(review): -1 presumably means "not probed yet" - confirm against the
 * code which fills them.
 */
static THREAD_LOCAL int default_tcp_maxseg = - 1 ;
static THREAD_LOCAL int default_tcp6_maxseg = - 1 ;
MINOR: tcp: When binding socket, attempt to reuse one from the old proc.
Try to reuse any socket from the old process, provided by the "-x" flag,
before binding a new one, assuming it is compatible.
"Compatible" here means same address and port, same namespace if any,
same interface if any, and that the following flags are the same :
LI_O_FOREIGN, LI_O_V6ONLY and LI_O_V4V6.
Also change tcp_bind_listener() to always enable/disable socket options,
instead of just doing so if it is in the configuration file, as the option
may have been removed, ie TCP_FASTOPEN may have been set in the old process,
and removed from the new configuration, so we have to disable it.
2017-04-05 23:39:56 +03:00
# endif
2011-03-11 00:26:24 +03:00
/* Binds socket <fd> to the ipv4/ipv6 address <local>, unless <flags> is set, in
2008-01-13 20:40:14 +03:00
* which case we try to bind to an address built from <remote> instead. <flags>
* is a 2-bit field consisting of :
* - 0 : ignore remote address ( may even be a NULL pointer )
* - 1 : use provided address
* - 2 : use provided port
* - 3 : use both
*
* Binding to a foreign address first requires that setsockopt() accepts one of
* IP_TRANSPARENT / IPV6_TRANSPARENT, IP_FREEBIND, IP_BINDANY / IPV6_BINDANY or
* SO_BINDANY on the socket. Those which are defined on the platform are tried
* in that order until one succeeds. When none is accepted for the family of
* <local>, this is recorded in a THREAD_LOCAL static and they are not tried
* again for that family.
*
* This function returns 0 when everything is OK, 1 if it could not bind to the
* local address, 2 if it could not bind to the foreign address. Note that when
* <flags> is set but foreign binding could not be enabled, the socket is still
* bound to <local> before 2 is returned.
*/
2011-03-11 00:26:24 +03:00
int tcp_bind_socket ( int fd , int flags , struct sockaddr_storage * local , struct sockaddr_storage * remote )
2008-01-13 20:40:14 +03:00
{
2011-03-11 00:26:24 +03:00
struct sockaddr_storage bind_addr ;
2008-01-13 20:40:14 +03:00
int foreign_ok = 0 ;
int ret ;
2017-10-29 22:14:08 +03:00
/* cleared the first time every available transparent-bind option has been
 * refused for the family, so that they are not attempted anymore
 */
static THREAD_LOCAL int ip_transp_working = 1 ;
static THREAD_LOCAL int ip6_transp_working = 1 ;
2013-05-09 00:49:23 +04:00
2012-07-13 16:34:59 +04:00
/* when a foreign bind is requested, first try to enable it on the socket
 * using the options matching the family of <local>
 */
switch ( local - > ss_family ) {
case AF_INET :
if ( flags & & ip_transp_working ) {
2013-05-09 00:49:23 +04:00
/* This deserves some explanation. Some platforms will support
* multiple combinations of certain methods , so we try the
* supported ones until one succeeds .
*/
if ( 0
# if defined(IP_TRANSPARENT)
| | ( setsockopt ( fd , SOL_IP , IP_TRANSPARENT , & one , sizeof ( one ) ) = = 0 )
# endif
# if defined(IP_FREEBIND)
| | ( setsockopt ( fd , SOL_IP , IP_FREEBIND , & one , sizeof ( one ) ) = = 0 )
2013-05-09 01:22:39 +04:00
# endif
# if defined(IP_BINDANY)
| | ( setsockopt ( fd , IPPROTO_IP , IP_BINDANY , & one , sizeof ( one ) ) = = 0 )
2013-05-09 01:30:23 +04:00
# endif
# if defined(SO_BINDANY)
| | ( setsockopt ( fd , SOL_SOCKET , SO_BINDANY , & one , sizeof ( one ) ) = = 0 )
2013-05-09 00:49:23 +04:00
# endif
)
2012-07-13 16:34:59 +04:00
foreign_ok = 1 ;
else
ip_transp_working = 0 ;
}
break ;
case AF_INET6 :
if ( flags & & ip6_transp_working ) {
2013-05-09 00:49:23 +04:00
/* same principle as for AF_INET above, with the IPv6 variants of
 * the options where they exist
 */
if ( 0
2016-09-09 10:41:15 +03:00
# if defined(IPV6_TRANSPARENT) && defined(SOL_IPV6)
2013-05-09 00:49:23 +04:00
| | ( setsockopt ( fd , SOL_IPV6 , IPV6_TRANSPARENT , & one , sizeof ( one ) ) = = 0 )
2013-05-09 01:22:39 +04:00
# endif
2014-03-04 00:10:51 +04:00
# if defined(IP_FREEBIND)
| | ( setsockopt ( fd , SOL_IP , IP_FREEBIND , & one , sizeof ( one ) ) = = 0 )
# endif
2013-05-09 01:22:39 +04:00
# if defined(IPV6_BINDANY)
| | ( setsockopt ( fd , IPPROTO_IPV6 , IPV6_BINDANY , & one , sizeof ( one ) ) = = 0 )
2013-05-09 01:30:23 +04:00
# endif
# if defined(SO_BINDANY)
| | ( setsockopt ( fd , SOL_SOCKET , SO_BINDANY , & one , sizeof ( one ) ) = = 0 )
2013-05-09 00:49:23 +04:00
# endif
)
2012-07-13 16:34:59 +04:00
foreign_ok = 1 ;
else
ip6_transp_working = 0 ;
}
break ;
2008-01-13 20:40:14 +03:00
}
2013-05-09 00:49:23 +04:00
2008-01-13 20:40:14 +03:00
/* build the address to bind to from <remote> : only the parts selected by
 * <flags> are copied, everything else remains zero
 */
if ( flags ) {
memset ( & bind_addr , 0 , sizeof ( bind_addr ) ) ;
2011-04-19 09:20:57 +04:00
bind_addr . ss_family = remote - > ss_family ;
2011-03-11 00:26:24 +03:00
switch ( remote - > ss_family ) {
case AF_INET :
if ( flags & 1 )
( ( struct sockaddr_in * ) & bind_addr ) - > sin_addr = ( ( struct sockaddr_in * ) remote ) - > sin_addr ;
if ( flags & 2 )
( ( struct sockaddr_in * ) & bind_addr ) - > sin_port = ( ( struct sockaddr_in * ) remote ) - > sin_port ;
break ;
case AF_INET6 :
if ( flags & 1 )
( ( struct sockaddr_in6 * ) & bind_addr ) - > sin6_addr = ( ( struct sockaddr_in6 * ) remote ) - > sin6_addr ;
if ( flags & 2 )
( ( struct sockaddr_in6 * ) & bind_addr ) - > sin6_port = ( ( struct sockaddr_in6 * ) remote ) - > sin6_port ;
break ;
2011-12-17 00:25:11 +04:00
default :
/* we don't want to try to bind to an unknown address family */
foreign_ok = 0 ;
2011-03-11 00:26:24 +03:00
}
2008-01-13 20:40:14 +03:00
}
2011-06-24 10:11:37 +04:00
/* best effort : the result of this setsockopt() is not checked */
setsockopt ( fd , SOL_SOCKET , SO_REUSEADDR , & one , sizeof ( one ) ) ;
2008-01-13 20:40:14 +03:00
/* bind to the address built from <remote> when foreign binding could be
 * enabled, otherwise fall back to <local>.
 * NOTE(review): bind() is skipped when is_inet_addr() returns false,
 * presumably for an unset address - confirm against its definition.
 */
if ( foreign_ok ) {
2014-05-10 00:56:10 +04:00
if ( is_inet_addr ( & bind_addr ) ) {
2012-10-26 21:57:58 +04:00
ret = bind ( fd , ( struct sockaddr * ) & bind_addr , get_addr_len ( & bind_addr ) ) ;
if ( ret < 0 )
return 2 ;
}
2008-01-13 20:40:14 +03:00
}
else {
2014-05-10 00:56:10 +04:00
if ( is_inet_addr ( local ) ) {
2012-10-26 21:57:58 +04:00
ret = bind ( fd , ( struct sockaddr * ) local , get_addr_len ( local ) ) ;
if ( ret < 0 )
return 1 ;
}
2008-01-13 20:40:14 +03:00
}
/* no foreign bind was requested : the local bind above is all we needed */
if ( ! flags )
return 0 ;
if ( ! foreign_ok )
/* we could not bind to a foreign address */
return 2 ;
return 0 ;
}
2014-11-17 17:11:45 +03:00
static int create_server_socket ( struct connection * conn )
{
2014-12-24 15:47:55 +03:00
const struct netns_entry * ns = NULL ;
2014-11-17 17:11:45 +03:00
2014-12-24 15:47:55 +03:00
# ifdef CONFIG_HAP_NS
if ( objt_server ( conn - > target ) ) {
if ( __objt_server ( conn - > target ) - > flags & SRV_F_USE_NS_FROM_PP )
ns = conn - > proxy_netns ;
else
ns = __objt_server ( conn - > target ) - > netns ;
}
# endif
2014-11-17 17:11:45 +03:00
return my_socketat ( ns , conn - > addr . to . ss_family , SOCK_STREAM , IPPROTO_TCP ) ;
}
2009-08-16 16:02:45 +04:00
/*
2012-08-31 00:23:13 +04:00
* This function initiates a TCP connection establishment to the target assigned
* to connection < conn > using ( conn - > { target , addr . to } ) . A source address may be
* pointed to by conn - > addr . from in case of transparent proxying . Normal source
* bind addresses are still determined locally ( due to the possible need of a
* source port ) . conn - > target may point either to a valid server or to a backend ,
2012-11-12 03:42:33 +04:00
* depending on conn - > target . Only OBJ_TYPE_PROXY and OBJ_TYPE_SERVER are
2012-11-24 13:24:27 +04:00
* supported . The < data > parameter is a boolean indicating whether there are data
* waiting for being sent or not , in order to adjust data write polling and on
* some platforms , the ability to avoid an empty initial ACK . The < delack > argument
* allows the caller to force using a delayed ACK when establishing the connection :
* - 0 = no delayed ACK unless data are advertised and backend has tcp - smart - connect
* - 1 = delayed ACK if backend has tcp - smart - connect , regardless of data
* - 2 = delayed ACK regardless of backend options
2010-03-29 21:36:59 +04:00
*
2013-10-24 23:45:00 +04:00
* Note that a pending send_proxy message accounts for data .
*
2009-08-16 16:02:45 +04:00
* It can return one of :
2015-04-03 02:14:29 +03:00
* - SF_ERR_NONE if everything ' s OK
* - SF_ERR_SRVTO if there are no more servers
* - SF_ERR_SRVCL if the connection was refused by the server
* - SF_ERR_PRXCOND if the connection has been limited by the proxy ( maxconn )
* - SF_ERR_RESOURCE if a system resource is lacking ( eg : fd limits , ports , . . . )
* - SF_ERR_INTERNAL for any other purely internal errors
2016-11-29 04:15:19 +03:00
* Additionally , in the case of SF_ERR_RESOURCE , an emergency log will be emitted .
2012-11-23 11:51:32 +04:00
*
2015-04-03 02:14:29 +03:00
* The connection ' s fd is inserted only when SF_ERR_NONE is returned , otherwise
2012-11-23 11:51:32 +04:00
* it ' s invalid and the caller has nothing to do .
2009-08-16 16:02:45 +04:00
*/
2011-03-03 20:27:32 +03:00
2012-11-24 13:24:27 +04:00
int tcp_connect_server ( struct connection * conn , int data , int delack )
2009-08-16 16:02:45 +04:00
{
int fd ;
2011-03-05 00:04:29 +03:00
struct server * srv ;
struct proxy * be ;
2012-12-09 01:49:11 +04:00
struct conn_src * src ;
2011-03-05 00:04:29 +03:00
2014-01-24 19:08:19 +04:00
conn - > flags = CO_FL_WAIT_L4_CONN ; /* connection in progress */
2012-11-12 03:42:33 +04:00
switch ( obj_type ( conn - > target ) ) {
case OBJ_TYPE_PROXY :
be = objt_proxy ( conn - > target ) ;
2011-03-05 00:04:29 +03:00
srv = NULL ;
break ;
2012-11-12 03:42:33 +04:00
case OBJ_TYPE_SERVER :
srv = objt_server ( conn - > target ) ;
2011-03-05 00:04:29 +03:00
be = srv - > proxy ;
break ;
default :
2014-01-24 19:08:19 +04:00
conn - > flags | = CO_FL_ERROR ;
2015-04-03 02:14:29 +03:00
return SF_ERR_INTERNAL ;
2011-03-05 00:04:29 +03:00
}
2009-08-16 16:02:45 +04:00
2017-08-24 15:31:19 +03:00
fd = conn - > handle . fd = create_server_socket ( conn ) ;
2014-11-17 17:11:45 +03:00
if ( fd = = - 1 ) {
2009-08-16 16:02:45 +04:00
qfprintf ( stderr , " Cannot get a server socket. \n " ) ;
2014-01-24 19:08:19 +04:00
if ( errno = = ENFILE ) {
conn - > err_code = CO_ER_SYS_FDLIM ;
2009-08-16 16:02:45 +04:00
send_log ( be , LOG_EMERG ,
2018-01-29 17:06:04 +03:00
" Proxy %s reached system FD limit (maxsock=%d). Please check system tunables. \n " ,
be - > id , global . maxsock ) ;
2014-01-24 19:08:19 +04:00
}
else if ( errno = = EMFILE ) {
conn - > err_code = CO_ER_PROC_FDLIM ;
2009-08-16 16:02:45 +04:00
send_log ( be , LOG_EMERG ,
2018-01-29 17:06:04 +03:00
" Proxy %s reached process FD limit (maxsock=%d). Please check 'ulimit-n' and restart. \n " ,
be - > id , global . maxsock ) ;
2014-01-24 19:08:19 +04:00
}
else if ( errno = = ENOBUFS | | errno = = ENOMEM ) {
conn - > err_code = CO_ER_SYS_MEMLIM ;
2009-08-16 16:02:45 +04:00
send_log ( be , LOG_EMERG ,
2018-01-29 17:06:04 +03:00
" Proxy %s reached system memory limit (maxsock=%d). Please check system tunables. \n " ,
be - > id , global . maxsock ) ;
2014-01-24 19:08:19 +04:00
}
else if ( errno = = EAFNOSUPPORT | | errno = = EPROTONOSUPPORT ) {
conn - > err_code = CO_ER_NOPROTO ;
}
else
conn - > err_code = CO_ER_SOCK_ERR ;
2009-08-16 16:02:45 +04:00
/* this is a resource error */
2014-01-24 19:08:19 +04:00
conn - > flags | = CO_FL_ERROR ;
2015-04-03 02:14:29 +03:00
return SF_ERR_RESOURCE ;
2009-08-16 16:02:45 +04:00
}
if ( fd > = global . maxsock ) {
/* do not log anything there, it's a normal condition when this option
* is used to serialize connections to a server !
*/
2017-11-24 18:50:31 +03:00
ha_alert ( " socket(): not enough free sockets. Raise -n argument. Giving up. \n " ) ;
2009-08-16 16:02:45 +04:00
close ( fd ) ;
2014-01-24 19:08:19 +04:00
conn - > err_code = CO_ER_CONF_FDLIM ;
conn - > flags | = CO_FL_ERROR ;
2015-04-03 02:14:29 +03:00
return SF_ERR_PRXCOND ; /* it is a configuration limit */
2009-08-16 16:02:45 +04:00
}
if ( ( fcntl ( fd , F_SETFL , O_NONBLOCK ) = = - 1 ) | |
2011-06-24 10:11:37 +04:00
( setsockopt ( fd , IPPROTO_TCP , TCP_NODELAY , & one , sizeof ( one ) ) = = - 1 ) ) {
2009-08-16 16:02:45 +04:00
qfprintf ( stderr , " Cannot set client socket to non blocking mode. \n " ) ;
close ( fd ) ;
2014-01-24 19:08:19 +04:00
conn - > err_code = CO_ER_SOCK_ERR ;
conn - > flags | = CO_FL_ERROR ;
2015-04-03 02:14:29 +03:00
return SF_ERR_INTERNAL ;
2009-08-16 16:02:45 +04:00
}
if ( be - > options & PR_O_TCP_SRV_KA )
2011-06-24 10:11:37 +04:00
setsockopt ( fd , SOL_SOCKET , SO_KEEPALIVE , & one , sizeof ( one ) ) ;
2009-08-16 16:02:45 +04:00
/* allow specific binding :
* - server - specific at first
* - proxy - specific next
*/
2012-12-09 01:49:11 +04:00
if ( srv & & srv - > conn_src . opts & CO_SRC_BIND )
src = & srv - > conn_src ;
else if ( be - > conn_src . opts & CO_SRC_BIND )
src = & be - > conn_src ;
else
src = NULL ;
if ( src ) {
2009-08-16 16:02:45 +04:00
int ret , flags = 0 ;
2014-05-10 00:56:10 +04:00
if ( is_inet_addr ( & conn - > addr . from ) ) {
2012-12-09 01:49:11 +04:00
switch ( src - > opts & CO_SRC_TPROXY_MASK ) {
2012-12-09 01:29:20 +04:00
case CO_SRC_TPROXY_CLI :
2015-08-04 20:24:13 +03:00
conn - > flags | = CO_FL_PRIVATE ;
/* fall through */
case CO_SRC_TPROXY_ADDR :
2012-10-26 21:57:58 +04:00
flags = 3 ;
break ;
2012-12-09 01:29:20 +04:00
case CO_SRC_TPROXY_CIP :
case CO_SRC_TPROXY_DYN :
2015-08-04 20:24:13 +03:00
conn - > flags | = CO_FL_PRIVATE ;
2012-10-26 21:57:58 +04:00
flags = 1 ;
break ;
}
2009-08-16 16:02:45 +04:00
}
2010-03-29 21:36:59 +04:00
2009-08-16 16:02:45 +04:00
# ifdef SO_BINDTODEVICE
/* Note: this might fail if not CAP_NET_RAW */
2012-12-09 01:49:11 +04:00
if ( src - > iface_name )
setsockopt ( fd , SOL_SOCKET , SO_BINDTODEVICE , src - > iface_name , src - > iface_len + 1 ) ;
2009-08-16 16:02:45 +04:00
# endif
2012-12-09 01:49:11 +04:00
if ( src - > sport_range ) {
2009-08-16 16:02:45 +04:00
int attempts = 10 ; /* should be more than enough to find a spare port */
2012-12-09 01:49:11 +04:00
struct sockaddr_storage sa ;
2009-08-16 16:02:45 +04:00
ret = 1 ;
2016-05-18 17:17:44 +03:00
memcpy ( & sa , & src - > source_addr , sizeof ( sa ) ) ;
2009-08-16 16:02:45 +04:00
do {
/* note: in case of retry, we may have to release a previously
* allocated port , hence this loop ' s construct .
*/
2009-10-18 09:25:52 +04:00
port_range_release_port ( fdinfo [ fd ] . port_range , fdinfo [ fd ] . local_port ) ;
fdinfo [ fd ] . port_range = NULL ;
2009-08-16 16:02:45 +04:00
if ( ! attempts )
break ;
attempts - - ;
2012-12-09 01:49:11 +04:00
fdinfo [ fd ] . local_port = port_range_alloc_port ( src - > sport_range ) ;
2014-01-24 19:08:19 +04:00
if ( ! fdinfo [ fd ] . local_port ) {
conn - > err_code = CO_ER_PORT_RANGE ;
2009-08-16 16:02:45 +04:00
break ;
2014-01-24 19:08:19 +04:00
}
2009-08-16 16:02:45 +04:00
2012-12-09 01:49:11 +04:00
fdinfo [ fd ] . port_range = src - > sport_range ;
set_host_port ( & sa , fdinfo [ fd ] . local_port ) ;
2009-08-16 16:02:45 +04:00
2012-12-09 01:49:11 +04:00
ret = tcp_bind_socket ( fd , flags , & sa , & conn - > addr . from ) ;
2014-01-24 19:08:19 +04:00
if ( ret ! = 0 )
conn - > err_code = CO_ER_CANT_BIND ;
2009-08-16 16:02:45 +04:00
} while ( ret ! = 0 ) ; /* binding NOK */
}
else {
2016-09-13 12:51:15 +03:00
# ifdef IP_BIND_ADDRESS_NO_PORT
2017-10-29 22:14:08 +03:00
static THREAD_LOCAL int bind_address_no_port = 1 ;
2016-09-13 12:51:15 +03:00
setsockopt ( fd , SOL_IP , IP_BIND_ADDRESS_NO_PORT , ( const void * ) & bind_address_no_port , sizeof ( int ) ) ;
# endif
2012-12-09 01:49:11 +04:00
ret = tcp_bind_socket ( fd , flags , & src - > source_addr , & conn - > addr . from ) ;
2014-01-24 19:08:19 +04:00
if ( ret ! = 0 )
conn - > err_code = CO_ER_CANT_BIND ;
2009-08-16 16:02:45 +04:00
}
2012-12-09 01:49:11 +04:00
if ( unlikely ( ret ! = 0 ) ) {
2009-10-18 09:25:52 +04:00
port_range_release_port ( fdinfo [ fd ] . port_range , fdinfo [ fd ] . local_port ) ;
fdinfo [ fd ] . port_range = NULL ;
2009-08-16 16:02:45 +04:00
close ( fd ) ;
if ( ret = = 1 ) {
2017-11-24 18:50:31 +03:00
ha_alert ( " Cannot bind to source address before connect() for backend %s. Aborting. \n " ,
be - > id ) ;
2009-08-16 16:02:45 +04:00
send_log ( be , LOG_EMERG ,
2012-12-09 01:49:11 +04:00
" Cannot bind to source address before connect() for backend %s. \n " ,
2009-08-16 16:02:45 +04:00
be - > id ) ;
} else {
2017-11-24 18:50:31 +03:00
ha_alert ( " Cannot bind to tproxy source address before connect() for backend %s. Aborting. \n " ,
be - > id ) ;
2009-08-16 16:02:45 +04:00
send_log ( be , LOG_EMERG ,
2012-12-09 01:49:11 +04:00
" Cannot bind to tproxy source address before connect() for backend %s. \n " ,
2009-08-16 16:02:45 +04:00
be - > id ) ;
}
2014-01-24 19:08:19 +04:00
conn - > flags | = CO_FL_ERROR ;
2015-04-03 02:14:29 +03:00
return SF_ERR_RESOURCE ;
2009-08-16 16:02:45 +04:00
}
}
2009-08-24 15:11:06 +04:00
# if defined(TCP_QUICKACK)
2009-08-16 16:02:45 +04:00
/* disabling tcp quick ack now allows the first request to leave the
* machine with the first ACK . We only do this if there are pending
2012-11-24 13:24:27 +04:00
* data in the buffer .
2009-08-16 16:02:45 +04:00
*/
2014-10-24 14:02:24 +04:00
if ( delack = = 2 | | ( ( delack | | data | | conn - > send_proxy_ofs ) & & ( be - > options2 & PR_O2_SMARTCON ) ) )
2011-06-24 10:11:37 +04:00
setsockopt ( fd , IPPROTO_TCP , TCP_QUICKACK , & zero , sizeof ( zero ) ) ;
2009-08-16 16:02:45 +04:00
# endif
2015-10-13 17:16:41 +03:00
# ifdef TCP_USER_TIMEOUT
/* there is not much more we can do here when it fails, it's still minor */
if ( srv & & srv - > tcp_ut )
setsockopt ( fd , IPPROTO_TCP , TCP_USER_TIMEOUT , & srv - > tcp_ut , sizeof ( srv - > tcp_ut ) ) ;
# endif
2010-01-21 19:43:04 +03:00
if ( global . tune . server_sndbuf )
setsockopt ( fd , SOL_SOCKET , SO_SNDBUF , & global . tune . server_sndbuf , sizeof ( global . tune . server_sndbuf ) ) ;
if ( global . tune . server_rcvbuf )
setsockopt ( fd , SOL_SOCKET , SO_RCVBUF , & global . tune . server_rcvbuf , sizeof ( global . tune . server_rcvbuf ) ) ;
2017-01-25 16:12:22 +03:00
if ( connect ( fd , ( struct sockaddr * ) & conn - > addr . to , get_addr_len ( & conn - > addr . to ) ) = = - 1 ) {
if ( errno = = EINPROGRESS | | errno = = EALREADY ) {
/* common case, let's wait for connect status */
conn - > flags | = CO_FL_WAIT_L4_CONN ;
}
else if ( errno = = EISCONN ) {
/* should normally not happen but if so, indicates that it's OK */
conn - > flags & = ~ CO_FL_WAIT_L4_CONN ;
}
else if ( errno = = EAGAIN | | errno = = EADDRINUSE | | errno = = EADDRNOTAVAIL ) {
2009-08-16 16:02:45 +04:00
char * msg ;
2014-01-24 19:08:19 +04:00
if ( errno = = EAGAIN | | errno = = EADDRNOTAVAIL ) {
2009-08-16 16:02:45 +04:00
msg = " no free ports " ;
2014-01-24 19:08:19 +04:00
conn - > err_code = CO_ER_FREE_PORTS ;
}
else {
2009-08-16 16:02:45 +04:00
msg = " local address already in use " ;
2014-01-24 19:08:19 +04:00
conn - > err_code = CO_ER_ADDR_INUSE ;
}
2009-08-16 16:02:45 +04:00
2012-12-09 02:03:28 +04:00
qfprintf ( stderr , " Connect() failed for backend %s: %s. \n " , be - > id , msg ) ;
2009-10-18 09:25:52 +04:00
port_range_release_port ( fdinfo [ fd ] . port_range , fdinfo [ fd ] . local_port ) ;
fdinfo [ fd ] . port_range = NULL ;
2009-08-16 16:02:45 +04:00
close ( fd ) ;
2012-12-09 02:03:28 +04:00
send_log ( be , LOG_ERR , " Connect() failed for backend %s: %s. \n " , be - > id , msg ) ;
2014-01-24 19:08:19 +04:00
conn - > flags | = CO_FL_ERROR ;
2015-04-03 02:14:29 +03:00
return SF_ERR_RESOURCE ;
2009-08-16 16:02:45 +04:00
} else if ( errno = = ETIMEDOUT ) {
//qfprintf(stderr,"Connect(): ETIMEDOUT");
2009-10-18 09:25:52 +04:00
port_range_release_port ( fdinfo [ fd ] . port_range , fdinfo [ fd ] . local_port ) ;
fdinfo [ fd ] . port_range = NULL ;
2009-08-16 16:02:45 +04:00
close ( fd ) ;
2014-01-24 19:08:19 +04:00
conn - > err_code = CO_ER_SOCK_ERR ;
conn - > flags | = CO_FL_ERROR ;
2015-04-03 02:14:29 +03:00
return SF_ERR_SRVTO ;
2009-08-16 16:02:45 +04:00
} else {
// (errno == ECONNREFUSED || errno == ENETUNREACH || errno == EACCES || errno == EPERM)
//qfprintf(stderr,"Connect(): %d", errno);
2009-10-18 09:25:52 +04:00
port_range_release_port ( fdinfo [ fd ] . port_range , fdinfo [ fd ] . local_port ) ;
fdinfo [ fd ] . port_range = NULL ;
2009-08-16 16:02:45 +04:00
close ( fd ) ;
2014-01-24 19:08:19 +04:00
conn - > err_code = CO_ER_SOCK_ERR ;
conn - > flags | = CO_FL_ERROR ;
2015-04-03 02:14:29 +03:00
return SF_ERR_SRVCL ;
2009-08-16 16:02:45 +04:00
}
}
2017-01-25 16:12:22 +03:00
else {
/* connect() == 0, this is great! */
conn - > flags & = ~ CO_FL_WAIT_L4_CONN ;
}
2009-08-16 16:02:45 +04:00
2012-12-08 21:53:44 +04:00
conn - > flags | = CO_FL_ADDR_TO_SET ;
2012-05-11 21:53:32 +04:00
2013-10-24 23:45:00 +04:00
/* Prepare to send a few handshakes related to the on-wire protocol. */
if ( conn - > send_proxy_ofs )
2013-10-25 00:01:26 +04:00
conn - > flags | = CO_FL_SEND_PROXY ;
2013-10-24 23:45:00 +04:00
MAJOR: connection: add two new flags to indicate readiness of control/transport
Currently the control and transport layers of a connection are supposed
to be initialized when their respective pointers are not NULL. This will
not work anymore when we plan to reuse connections, because there is an
asymmetry between the accept() side and the connect() side :
- on accept() side, the fd is set first, then the ctrl layer then the
transport layer ; upon error, they must be undone in the reverse order,
then the FD must be closed. The FD must not be deleted if the control
layer was not yet initialized ;
- on the connect() side, the fd is set last and there is no reliable way
to know if it has been initialized or not. In practice it's initialized
to -1 first but this is hackish and supposes that local FDs only will
be used forever. Also, there are even less solutions for keeping trace
of the transport layer's state.
Also it is possible to support delayed close() when something (eg: logs)
tracks some information requiring the transport and/or control layers,
making it even more difficult to clean them.
So the proposed solution is to add two flags to the connection :
- CO_FL_CTRL_READY is set when the control layer is initialized (fd_insert)
and cleared after it's released (fd_delete).
- CO_FL_XPRT_READY is set when the transport layer is initialized (xprt->init)
and cleared after it's released (xprt->close).
The functions have been adapted to rely on this and not on the pointers
anymore. conn_xprt_close() was unused and dangerous : it did not close
the control layer (eg: the socket itself) but still marks the transport
layer as closed, preventing any future call to conn_full_close() from
finishing the job.
The problem comes from conn_full_close() in fact. It needs to close the
xprt and ctrl layers independently. After that we're still having an issue :
we don't know based on ->ctrl alone whether the fd was registered or not.
For this we use the two new flags CO_FL_XPRT_READY and CO_FL_CTRL_READY. We
now rely on this and not on conn->xprt nor conn->ctrl anymore to decide what
remains to be done on the connection.
In order not to miss some flag assignments, we introduce conn_ctrl_init()
to initialize the control layer, register the fd using fd_insert() and set
the flag, and conn_ctrl_close() which unregisters the fd and removes the
flag, but only if the transport layer was closed.
Similarly, at the transport layer, conn_xprt_init() calls ->init and sets
the flag, while conn_xprt_close() checks the flag, calls ->close and clears
the flag, regardless xprt_ctx or xprt_st. This also ensures that the ->init
and the ->close functions are called only once each and in the correct order.
Note that conn_xprt_close() does nothing if the transport layer is still
tracked.
conn_full_close() now simply calls conn_xprt_close() then conn_ctrl_close()
in turn, which do nothing if CO_FL_XPRT_TRACKED is set.
In order to handle the error path, we also provide conn_force_close() which
ignores CO_FL_XPRT_TRACKED and closes the transport and the control layers
in turns. All relevant instances of fd_delete() have been replaced with
conn_force_close(). Now we always know what state the connection is in and
we can expect to split its initialization.
2013-10-21 18:30:56 +04:00
conn_ctrl_init ( conn ) ; /* registers the FD */
2013-12-15 17:19:38 +04:00
fdtab [ fd ] . linger_risk = 1 ; /* close hard if needed */
2012-08-31 15:54:11 +04:00
REORG: connection: rename the data layer the "transport layer"
While working on the changes required to make the health checks use the
new connections, it started to become obvious that some naming was not
logical at all in the connections. Specifically, it is not logical to
call the "data layer" the layer which is in charge for all the handshake
and which does not yet provide a data layer once established until a
session has allocated all the required buffers.
In fact, it's more a transport layer, which makes much more sense. The
transport layer offers a medium on which data can transit, and it offers
the functions to move these data when the upper layer requests this. And
it is the upper layer which iterates over the transport layer's functions
to move data which should be called the data layer.
The use case where it's obvious is with embryonic sessions : an incoming
SSL connection is accepted. Only the connection is allocated, not the
buffers nor stream interface, etc... The connection handles the SSL
handshake by itself. Once this handshake is complete, we can't use the
data functions because the buffers and stream interface are not there
yet. Hence we have to first call a specific function to complete the
session initialization, after which we'll be able to use the data
functions. This clearly proves that SSL here is only a transport layer
and that the stream interface constitutes the data layer.
A similar change will be performed to rename app_cb => data, but the
two could not be in the same commit for obvious reasons.
2012-10-03 02:19:48 +04:00
if ( conn_xprt_init ( conn ) < 0 ) {
2017-10-05 19:01:29 +03:00
conn_full_close ( conn ) ;
2014-01-24 19:08:19 +04:00
conn - > flags | = CO_FL_ERROR ;
2015-04-03 02:14:29 +03:00
return SF_ERR_RESOURCE ;
2012-09-06 16:04:41 +04:00
}
2012-08-31 15:54:11 +04:00
MEDIUM: ssl: Handle early data with OpenSSL 1.1.1
When compiled with Openssl >= 1.1.1, before attempting to do the handshake,
try to read any early data. If any early data is present, then we'll create
the session, read the data, and handle the request before we're doing the
handshake.
For this, we add a new connection flag, CO_FL_EARLY_SSL_HS, which is not
part of the CO_FL_HANDSHAKE set, allowing to proceed with a session even
before an SSL handshake is completed.
As early data do have security implication, we let the origin server know
the request comes from early data by adding the "Early-Data" header, as
specified in this draft from the HTTP working group :
https://datatracker.ietf.org/doc/html/draft-ietf-httpbis-replay
2017-09-22 19:26:28 +03:00
if ( conn - > flags & ( CO_FL_HANDSHAKE | CO_FL_WAIT_L4_CONN | CO_FL_EARLY_SSL_HS ) ) {
2017-01-25 16:12:22 +03:00
conn_sock_want_send ( conn ) ; /* for connect status, proxy protocol or SSL */
MEDIUM: ssl: Handle early data with OpenSSL 1.1.1
When compiled with Openssl >= 1.1.1, before attempting to do the handshake,
try to read any early data. If any early data is present, then we'll create
the session, read the data, and handle the request before we're doing the
handshake.
For this, we add a new connection flag, CO_FL_EARLY_SSL_HS, which is not
part of the CO_FL_HANDSHAKE set, allowing to proceed with a session even
before an SSL handshake is completed.
As early data do have security implication, we let the origin server know
the request comes from early data by adding the "Early-Data" header, as
specified in this draft from the HTTP working group :
https://datatracker.ietf.org/doc/html/draft-ietf-httpbis-replay
2017-09-22 19:26:28 +03:00
if ( conn - > flags & CO_FL_EARLY_SSL_HS )
conn_xprt_want_send ( conn ) ;
2017-01-25 16:12:22 +03:00
}
else {
/* If there's no more handshake, we need to notify the data
* layer when the connection is already OK otherwise we ' ll have
* no other opportunity to do it later ( eg : health checks ) .
*/
data = 1 ;
}
2012-08-31 00:23:13 +04:00
if ( data )
2017-09-13 19:30:23 +03:00
conn_xprt_want_send ( conn ) ; /* prepare to send data if any */
2009-08-16 16:02:45 +04:00
2015-04-03 02:14:29 +03:00
return SF_ERR_NONE ; /* connection is OK */
2009-08-16 16:02:45 +04:00
}
2012-05-11 18:16:40 +04:00
/*
 * Fetches the source address associated with socket <fd> and writes it into
 * <sa>, which can hold up to <salen> bytes. <dir> selects the point of view:
 * zero means we are the listener, so the source is the remote peer and
 * getpeername() is used; non-zero means we initiated the connection, so the
 * source is our local end and getsockname() is used. Returns 0 on success or
 * -1 on error, exactly as the underlying system call does.
 */
int tcp_get_src(int fd, struct sockaddr *sa, socklen_t salen, int dir)
{
	socklen_t len = salen;

	return dir ? getsockname(fd, sa, &len)
	           : getpeername(fd, sa, &len);
}
/*
 * Retrieves the original destination address for the socket <fd>, with <dir>
 * indicating if we're a listener (=0) or an initiator (!=0). For an initiator
 * the destination is the remote peer (getpeername()). For a listener it is
 * the local address (getsockname()); when built with TPROXY and
 * SO_ORIGINAL_DST is available, an IPv4 destination that was translated is
 * replaced by the address reported by SO_ORIGINAL_DST. It returns 0 in case
 * of success, -1 in case of error. The socket's destination address is stored
 * in <sa> for <salen> bytes.
 */
int tcp_get_dst(int fd, struct sockaddr *sa, socklen_t salen, int dir)
{
	int ret;

	if (dir)
		return getpeername(fd, sa, &salen);

	ret = getsockname(fd, sa, &salen);
	if (ret < 0)
		return ret;

#if defined(TPROXY) && defined(SO_ORIGINAL_DST)
	/* For TPROXY and Netfilter's NAT, we can retrieve the original
	 * IPv4 address before DNAT/REDIRECT. We must not do that with
	 * other families because v6-mapped IPv4 addresses are still
	 * reported as v4. A failing getsockopt() is not an error: the
	 * address already obtained from getsockname() is kept.
	 */
	if (sa->sa_family == AF_INET &&
	    getsockopt(fd, SOL_IP, SO_ORIGINAL_DST, sa, &salen) == 0)
		return 0;
#endif
	return ret;
}
MEDIUM: protocol: implement a "drain" function in protocol layers
Since commit cfd97c6f was merged into 1.5-dev14 (BUG/MEDIUM: checks:
prevent TIME_WAITs from appearing also on timeouts), some valid health
checks sometimes used to show some TCP resets. For example, this HTTP
health check sent to a local server :
19:55:15.742818 IP 127.0.0.1.16568 > 127.0.0.1.8000: S 3355859679:3355859679(0) win 32792 <mss 16396,nop,nop,sackOK,nop,wscale 7>
19:55:15.742841 IP 127.0.0.1.8000 > 127.0.0.1.16568: S 1060952566:1060952566(0) ack 3355859680 win 32792 <mss 16396,nop,nop,sackOK,nop,wscale 7>
19:55:15.742863 IP 127.0.0.1.16568 > 127.0.0.1.8000: . ack 1 win 257
19:55:15.745402 IP 127.0.0.1.16568 > 127.0.0.1.8000: P 1:23(22) ack 1 win 257
19:55:15.745488 IP 127.0.0.1.8000 > 127.0.0.1.16568: FP 1:146(145) ack 23 win 257
19:55:15.747109 IP 127.0.0.1.16568 > 127.0.0.1.8000: R 23:23(0) ack 147 win 257
After some discussion with Chris Huang-Leaver, it appeared clear that
what we want is to only send the RST when we have no other choice, which
means when the server has not closed. So we still keep SYN/SYN-ACK/RST
for pure TCP checks, but don't want to see an RST emitted as above when
the server has already sent the FIN.
The solution against this consists in implementing a "drain" function at
the protocol layer, which, when defined, causes as much as possible of
the input socket buffer to be flushed to make recv() return zero so that
we know that the server's FIN was received and ACKed. On Linux, we can make
use of MSG_TRUNC on TCP sockets, which has the benefit of draining everything
at once without even copying data. On other platforms, we read up to one
buffer of data before the close. If recv() manages to get the final zero,
we don't disable lingering. Same for hard errors. Otherwise we do.
In practice, on HTTP health checks we generally find that the close was
pending and is returned upon first recv() call. The network trace becomes
cleaner :
19:55:23.650621 IP 127.0.0.1.16561 > 127.0.0.1.8000: S 3982804816:3982804816(0) win 32792 <mss 16396,nop,nop,sackOK,nop,wscale 7>
19:55:23.650644 IP 127.0.0.1.8000 > 127.0.0.1.16561: S 4082139313:4082139313(0) ack 3982804817 win 32792 <mss 16396,nop,nop,sackOK,nop,wscale 7>
19:55:23.650666 IP 127.0.0.1.16561 > 127.0.0.1.8000: . ack 1 win 257
19:55:23.651615 IP 127.0.0.1.16561 > 127.0.0.1.8000: P 1:23(22) ack 1 win 257
19:55:23.651696 IP 127.0.0.1.8000 > 127.0.0.1.16561: FP 1:146(145) ack 23 win 257
19:55:23.652628 IP 127.0.0.1.16561 > 127.0.0.1.8000: F 23:23(0) ack 147 win 257
19:55:23.652655 IP 127.0.0.1.8000 > 127.0.0.1.16561: . ack 24 win 257
This change should be backported to 1.4 which is where Chris encountered
this issue. The code is different, so probably the tcp_drain() function
will have to be put in the checks only.
2013-06-10 21:56:38 +04:00
/* Tries to drain any pending incoming data from the socket to reach the
2014-01-20 14:26:12 +04:00
* receive shutdown . Returns positive if the shutdown was found , negative
* if EAGAIN was hit , otherwise zero . This is useful to decide whether we
* can close a connection cleanly are we must kill it hard .
MEDIUM: protocol: implement a "drain" function in protocol layers
Since commit cfd97c6f was merged into 1.5-dev14 (BUG/MEDIUM: checks:
prevent TIME_WAITs from appearing also on timeouts), some valid health
checks sometimes used to show some TCP resets. For example, this HTTP
health check sent to a local server :
19:55:15.742818 IP 127.0.0.1.16568 > 127.0.0.1.8000: S 3355859679:3355859679(0) win 32792 <mss 16396,nop,nop,sackOK,nop,wscale 7>
19:55:15.742841 IP 127.0.0.1.8000 > 127.0.0.1.16568: S 1060952566:1060952566(0) ack 3355859680 win 32792 <mss 16396,nop,nop,sackOK,nop,wscale 7>
19:55:15.742863 IP 127.0.0.1.16568 > 127.0.0.1.8000: . ack 1 win 257
19:55:15.745402 IP 127.0.0.1.16568 > 127.0.0.1.8000: P 1:23(22) ack 1 win 257
19:55:15.745488 IP 127.0.0.1.8000 > 127.0.0.1.16568: FP 1:146(145) ack 23 win 257
19:55:15.747109 IP 127.0.0.1.16568 > 127.0.0.1.8000: R 23:23(0) ack 147 win 257
After some discussion with Chris Huang-Leaver, it appeared clear that
what we want is to only send the RST when we have no other choice, which
means when the server has not closed. So we still keep SYN/SYN-ACK/RST
for pure TCP checks, but don't want to see an RST emitted as above when
the server has already sent the FIN.
The solution against this consists in implementing a "drain" function at
the protocol layer, which, when defined, causes as much as possible of
the input socket buffer to be flushed to make recv() return zero so that
we know that the server's FIN was received and ACKed. On Linux, we can make
use of MSG_TRUNC on TCP sockets, which has the benefit of draining everything
at once without even copying data. On other platforms, we read up to one
buffer of data before the close. If recv() manages to get the final zero,
we don't disable lingering. Same for hard errors. Otherwise we do.
In practice, on HTTP health checks we generally find that the close was
pending and is returned upon first recv() call. The network trace becomes
cleaner :
19:55:23.650621 IP 127.0.0.1.16561 > 127.0.0.1.8000: S 3982804816:3982804816(0) win 32792 <mss 16396,nop,nop,sackOK,nop,wscale 7>
19:55:23.650644 IP 127.0.0.1.8000 > 127.0.0.1.16561: S 4082139313:4082139313(0) ack 3982804817 win 32792 <mss 16396,nop,nop,sackOK,nop,wscale 7>
19:55:23.650666 IP 127.0.0.1.16561 > 127.0.0.1.8000: . ack 1 win 257
19:55:23.651615 IP 127.0.0.1.16561 > 127.0.0.1.8000: P 1:23(22) ack 1 win 257
19:55:23.651696 IP 127.0.0.1.8000 > 127.0.0.1.16561: FP 1:146(145) ack 23 win 257
19:55:23.652628 IP 127.0.0.1.16561 > 127.0.0.1.8000: F 23:23(0) ack 147 win 257
19:55:23.652655 IP 127.0.0.1.8000 > 127.0.0.1.16561: . ack 24 win 257
This change should be backported to 1.4 which is where Chris encountered
this issue. The code is different, so probably the tcp_drain() function
will have to be put in the checks only.
2013-06-10 21:56:38 +04:00
*/
int tcp_drain ( int fd )
{
int turns = 2 ;
int len ;
while ( turns ) {
# ifdef MSG_TRUNC_CLEARS_INPUT
len = recv ( fd , NULL , INT_MAX , MSG_DONTWAIT | MSG_NOSIGNAL | MSG_TRUNC ) ;
if ( len = = - 1 & & errno = = EFAULT )
# endif
len = recv ( fd , trash . str , trash . size , MSG_DONTWAIT | MSG_NOSIGNAL ) ;
2014-01-20 14:56:37 +04:00
if ( len = = 0 ) {
/* cool, shutdown received */
fdtab [ fd ] . linger_risk = 0 ;
MEDIUM: protocol: implement a "drain" function in protocol layers
Since commit cfd97c6f was merged into 1.5-dev14 (BUG/MEDIUM: checks:
prevent TIME_WAITs from appearing also on timeouts), some valid health
checks sometimes used to show some TCP resets. For example, this HTTP
health check sent to a local server :
19:55:15.742818 IP 127.0.0.1.16568 > 127.0.0.1.8000: S 3355859679:3355859679(0) win 32792 <mss 16396,nop,nop,sackOK,nop,wscale 7>
19:55:15.742841 IP 127.0.0.1.8000 > 127.0.0.1.16568: S 1060952566:1060952566(0) ack 3355859680 win 32792 <mss 16396,nop,nop,sackOK,nop,wscale 7>
19:55:15.742863 IP 127.0.0.1.16568 > 127.0.0.1.8000: . ack 1 win 257
19:55:15.745402 IP 127.0.0.1.16568 > 127.0.0.1.8000: P 1:23(22) ack 1 win 257
19:55:15.745488 IP 127.0.0.1.8000 > 127.0.0.1.16568: FP 1:146(145) ack 23 win 257
19:55:15.747109 IP 127.0.0.1.16568 > 127.0.0.1.8000: R 23:23(0) ack 147 win 257
After some discussion with Chris Huang-Leaver, it appeared clear that
what we want is to only send the RST when we have no other choice, which
means when the server has not closed. So we still keep SYN/SYN-ACK/RST
for pure TCP checks, but don't want to see an RST emitted as above when
the server has already sent the FIN.
The solution against this consists in implementing a "drain" function at
the protocol layer, which, when defined, causes as much as possible of
the input socket buffer to be flushed to make recv() return zero so that
we know that the server's FIN was received and ACKed. On Linux, we can make
use of MSG_TRUNC on TCP sockets, which has the benefit of draining everything
at once without even copying data. On other platforms, we read up to one
buffer of data before the close. If recv() manages to get the final zero,
we don't disable lingering. Same for hard errors. Otherwise we do.
In practice, on HTTP health checks we generally find that the close was
pending and is returned upon first recv() call. The network trace becomes
cleaner :
19:55:23.650621 IP 127.0.0.1.16561 > 127.0.0.1.8000: S 3982804816:3982804816(0) win 32792 <mss 16396,nop,nop,sackOK,nop,wscale 7>
19:55:23.650644 IP 127.0.0.1.8000 > 127.0.0.1.16561: S 4082139313:4082139313(0) ack 3982804817 win 32792 <mss 16396,nop,nop,sackOK,nop,wscale 7>
19:55:23.650666 IP 127.0.0.1.16561 > 127.0.0.1.8000: . ack 1 win 257
19:55:23.651615 IP 127.0.0.1.16561 > 127.0.0.1.8000: P 1:23(22) ack 1 win 257
19:55:23.651696 IP 127.0.0.1.8000 > 127.0.0.1.16561: FP 1:146(145) ack 23 win 257
19:55:23.652628 IP 127.0.0.1.16561 > 127.0.0.1.8000: F 23:23(0) ack 147 win 257
19:55:23.652655 IP 127.0.0.1.8000 > 127.0.0.1.16561: . ack 24 win 257
This change should be backported to 1.4 which is where Chris encountered
this issue. The code is different, so probably the tcp_drain() function
will have to be put in the checks only.
2013-06-10 21:56:38 +04:00
return 1 ;
2014-01-20 14:56:37 +04:00
}
MEDIUM: protocol: implement a "drain" function in protocol layers
Since commit cfd97c6f was merged into 1.5-dev14 (BUG/MEDIUM: checks:
prevent TIME_WAITs from appearing also on timeouts), some valid health
checks sometimes used to show some TCP resets. For example, this HTTP
health check sent to a local server :
19:55:15.742818 IP 127.0.0.1.16568 > 127.0.0.1.8000: S 3355859679:3355859679(0) win 32792 <mss 16396,nop,nop,sackOK,nop,wscale 7>
19:55:15.742841 IP 127.0.0.1.8000 > 127.0.0.1.16568: S 1060952566:1060952566(0) ack 3355859680 win 32792 <mss 16396,nop,nop,sackOK,nop,wscale 7>
19:55:15.742863 IP 127.0.0.1.16568 > 127.0.0.1.8000: . ack 1 win 257
19:55:15.745402 IP 127.0.0.1.16568 > 127.0.0.1.8000: P 1:23(22) ack 1 win 257
19:55:15.745488 IP 127.0.0.1.8000 > 127.0.0.1.16568: FP 1:146(145) ack 23 win 257
19:55:15.747109 IP 127.0.0.1.16568 > 127.0.0.1.8000: R 23:23(0) ack 147 win 257
After some discussion with Chris Huang-Leaver, it appeared clear that
what we want is to only send the RST when we have no other choice, which
means when the server has not closed. So we still keep SYN/SYN-ACK/RST
for pure TCP checks, but don't want to see an RST emitted as above when
the server has already sent the FIN.
The solution against this consists in implementing a "drain" function at
the protocol layer, which, when defined, causes as much as possible of
the input socket buffer to be flushed to make recv() return zero so that
we know that the server's FIN was received and ACKed. On Linux, we can make
use of MSG_TRUNC on TCP sockets, which has the benefit of draining everything
at once without even copying data. On other platforms, we read up to one
buffer of data before the close. If recv() manages to get the final zero,
we don't disable lingering. Same for hard errors. Otherwise we do.
In practice, on HTTP health checks we generally find that the close was
pending and is returned upon first recv() call. The network trace becomes
cleaner :
19:55:23.650621 IP 127.0.0.1.16561 > 127.0.0.1.8000: S 3982804816:3982804816(0) win 32792 <mss 16396,nop,nop,sackOK,nop,wscale 7>
19:55:23.650644 IP 127.0.0.1.8000 > 127.0.0.1.16561: S 4082139313:4082139313(0) ack 3982804817 win 32792 <mss 16396,nop,nop,sackOK,nop,wscale 7>
19:55:23.650666 IP 127.0.0.1.16561 > 127.0.0.1.8000: . ack 1 win 257
19:55:23.651615 IP 127.0.0.1.16561 > 127.0.0.1.8000: P 1:23(22) ack 1 win 257
19:55:23.651696 IP 127.0.0.1.8000 > 127.0.0.1.16561: FP 1:146(145) ack 23 win 257
19:55:23.652628 IP 127.0.0.1.16561 > 127.0.0.1.8000: F 23:23(0) ack 147 win 257
19:55:23.652655 IP 127.0.0.1.8000 > 127.0.0.1.16561: . ack 24 win 257
This change should be backported to 1.4 which is where Chris encountered
this issue. The code is different, so probably the tcp_drain() function
will have to be put in the checks only.
2013-06-10 21:56:38 +04:00
if ( len < 0 ) {
MAJOR: polling: rework the whole polling system
This commit heavily changes the polling system in order to definitely
fix the frequent breakage of SSL which needs to remember the last
EAGAIN before deciding whether to poll or not. Now we have a state per
direction for each FD, as opposed to a previous and current state
previously. An FD can have up to 8 different states for each direction,
each of which being the result of a 3-bit combination. These 3 bits
indicate a wish to access the FD, the readiness of the FD and the
subscription of the FD to the polling system.
This means that it will now be possible to remember the state of a
file descriptor across disable/enable sequences that generally happen
during forwarding, where enabling reading on a previously disabled FD
would result in forgetting the EAGAIN flag it met last time.
Several new state manipulation functions have been introduced or
adapted :
- fd_want_{recv,send} : enable receiving/sending on the FD regardless
of its state (sets the ACTIVE flag) ;
- fd_stop_{recv,send} : stop receiving/sending on the FD regardless
of its state (clears the ACTIVE flag) ;
- fd_cant_{recv,send} : report a failure to receive/send on the FD
corresponding to EAGAIN (clears the READY flag) ;
- fd_may_{recv,send} : report the ability to receive/send on the FD
as reported by poll() (sets the READY flag) ;
Some functions are used to report the current FD status :
- fd_{recv,send}_active
- fd_{recv,send}_ready
- fd_{recv,send}_polled
Some functions were removed :
- fd_ev_clr(), fd_ev_set(), fd_ev_rem(), fd_ev_wai()
The POLLHUP/POLLERR flags are now reported as ready so that the I/O layers
knows it can try to access the file descriptor to get this information.
In order to simplify the conditions to add/remove cache entries, a new
function fd_alloc_or_release_cache_entry() was created to be used from
pollers while scanning for updates.
The following pollers have been updated :
ev_select() : done, built, tested on Linux 3.10
ev_poll() : done, built, tested on Linux 3.10
ev_epoll() : done, built, tested on Linux 3.10 & 3.13
ev_kqueue() : done, built, tested on OpenBSD 5.2
2014-01-10 19:58:45 +04:00
if ( errno = = EAGAIN ) {
/* connection not closed yet */
fd_cant_recv ( fd ) ;
2014-01-20 14:26:12 +04:00
return - 1 ;
MAJOR: polling: rework the whole polling system
This commit heavily changes the polling system in order to definitely
fix the frequent breakage of SSL which needs to remember the last
EAGAIN before deciding whether to poll or not. Now we have a state per
direction for each FD, as opposed to a previous and current state
previously. An FD can have up to 8 different states for each direction,
each of which being the result of a 3-bit combination. These 3 bits
indicate a wish to access the FD, the readiness of the FD and the
subscription of the FD to the polling system.
This means that it will now be possible to remember the state of a
file descriptor across disable/enable sequences that generally happen
during forwarding, where enabling reading on a previously disabled FD
would result in forgetting the EAGAIN flag it met last time.
Several new state manipulation functions have been introduced or
adapted :
- fd_want_{recv,send} : enable receiving/sending on the FD regardless
of its state (sets the ACTIVE flag) ;
- fd_stop_{recv,send} : stop receiving/sending on the FD regardless
of its state (clears the ACTIVE flag) ;
- fd_cant_{recv,send} : report a failure to receive/send on the FD
corresponding to EAGAIN (clears the READY flag) ;
- fd_may_{recv,send} : report the ability to receive/send on the FD
as reported by poll() (sets the READY flag) ;
Some functions are used to report the current FD status :
- fd_{recv,send}_active
- fd_{recv,send}_ready
- fd_{recv,send}_polled
Some functions were removed :
- fd_ev_clr(), fd_ev_set(), fd_ev_rem(), fd_ev_wai()
The POLLHUP/POLLERR flags are now reported as ready so that the I/O layers
knows it can try to access the file descriptor to get this information.
In order to simplify the conditions to add/remove cache entries, a new
function fd_alloc_or_release_cache_entry() was created to be used from
pollers while scanning for updates.
The following pollers have been updated :
ev_select() : done, built, tested on Linux 3.10
ev_poll() : done, built, tested on Linux 3.10
ev_epoll() : done, built, tested on Linux 3.10 & 3.13
ev_kqueue() : done, built, tested on OpenBSD 5.2
2014-01-10 19:58:45 +04:00
}
MEDIUM: protocol: implement a "drain" function in protocol layers
Since commit cfd97c6f was merged into 1.5-dev14 (BUG/MEDIUM: checks:
prevent TIME_WAITs from appearing also on timeouts), some valid health
checks sometimes used to show some TCP resets. For example, this HTTP
health check sent to a local server :
19:55:15.742818 IP 127.0.0.1.16568 > 127.0.0.1.8000: S 3355859679:3355859679(0) win 32792 <mss 16396,nop,nop,sackOK,nop,wscale 7>
19:55:15.742841 IP 127.0.0.1.8000 > 127.0.0.1.16568: S 1060952566:1060952566(0) ack 3355859680 win 32792 <mss 16396,nop,nop,sackOK,nop,wscale 7>
19:55:15.742863 IP 127.0.0.1.16568 > 127.0.0.1.8000: . ack 1 win 257
19:55:15.745402 IP 127.0.0.1.16568 > 127.0.0.1.8000: P 1:23(22) ack 1 win 257
19:55:15.745488 IP 127.0.0.1.8000 > 127.0.0.1.16568: FP 1:146(145) ack 23 win 257
19:55:15.747109 IP 127.0.0.1.16568 > 127.0.0.1.8000: R 23:23(0) ack 147 win 257
After some discussion with Chris Huang-Leaver, it appeared clear that
what we want is to only send the RST when we have no other choice, which
means when the server has not closed. So we still keep SYN/SYN-ACK/RST
for pure TCP checks, but don't want to see an RST emitted as above when
the server has already sent the FIN.
The solution against this consists in implementing a "drain" function at
the protocol layer, which, when defined, causes as much as possible of
the input socket buffer to be flushed to make recv() return zero so that
we know that the server's FIN was received and ACKed. On Linux, we can make
use of MSG_TRUNC on TCP sockets, which has the benefit of draining everything
at once without even copying data. On other platforms, we read up to one
buffer of data before the close. If recv() manages to get the final zero,
we don't disable lingering. Same for hard errors. Otherwise we do.
In practice, on HTTP health checks we generally find that the close was
pending and is returned upon first recv() call. The network trace becomes
cleaner :
19:55:23.650621 IP 127.0.0.1.16561 > 127.0.0.1.8000: S 3982804816:3982804816(0) win 32792 <mss 16396,nop,nop,sackOK,nop,wscale 7>
19:55:23.650644 IP 127.0.0.1.8000 > 127.0.0.1.16561: S 4082139313:4082139313(0) ack 3982804817 win 32792 <mss 16396,nop,nop,sackOK,nop,wscale 7>
19:55:23.650666 IP 127.0.0.1.16561 > 127.0.0.1.8000: . ack 1 win 257
19:55:23.651615 IP 127.0.0.1.16561 > 127.0.0.1.8000: P 1:23(22) ack 1 win 257
19:55:23.651696 IP 127.0.0.1.8000 > 127.0.0.1.16561: FP 1:146(145) ack 23 win 257
19:55:23.652628 IP 127.0.0.1.16561 > 127.0.0.1.8000: F 23:23(0) ack 147 win 257
19:55:23.652655 IP 127.0.0.1.8000 > 127.0.0.1.16561: . ack 24 win 257
This change should be backported to 1.4 which is where Chris encountered
this issue. The code is different, so probably the tcp_drain() function
will have to be put in the checks only.
2013-06-10 21:56:38 +04:00
if ( errno = = EINTR ) /* oops, try again */
continue ;
/* other errors indicate a dead connection, fine. */
2014-01-20 14:56:37 +04:00
fdtab [ fd ] . linger_risk = 0 ;
MEDIUM: protocol: implement a "drain" function in protocol layers
Since commit cfd97c6f was merged into 1.5-dev14 (BUG/MEDIUM: checks:
prevent TIME_WAITs from appearing also on timeouts), some valid health
checks sometimes used to show some TCP resets. For example, this HTTP
health check sent to a local server :
19:55:15.742818 IP 127.0.0.1.16568 > 127.0.0.1.8000: S 3355859679:3355859679(0) win 32792 <mss 16396,nop,nop,sackOK,nop,wscale 7>
19:55:15.742841 IP 127.0.0.1.8000 > 127.0.0.1.16568: S 1060952566:1060952566(0) ack 3355859680 win 32792 <mss 16396,nop,nop,sackOK,nop,wscale 7>
19:55:15.742863 IP 127.0.0.1.16568 > 127.0.0.1.8000: . ack 1 win 257
19:55:15.745402 IP 127.0.0.1.16568 > 127.0.0.1.8000: P 1:23(22) ack 1 win 257
19:55:15.745488 IP 127.0.0.1.8000 > 127.0.0.1.16568: FP 1:146(145) ack 23 win 257
19:55:15.747109 IP 127.0.0.1.16568 > 127.0.0.1.8000: R 23:23(0) ack 147 win 257
After some discussion with Chris Huang-Leaver, it appeared clear that
what we want is to only send the RST when we have no other choice, which
means when the server has not closed. So we still keep SYN/SYN-ACK/RST
for pure TCP checks, but don't want to see an RST emitted as above when
the server has already sent the FIN.
The solution against this consists in implementing a "drain" function at
the protocol layer, which, when defined, causes as much as possible of
the input socket buffer to be flushed to make recv() return zero so that
we know that the server's FIN was received and ACKed. On Linux, we can make
use of MSG_TRUNC on TCP sockets, which has the benefit of draining everything
at once without even copying data. On other platforms, we read up to one
buffer of data before the close. If recv() manages to get the final zero,
we don't disable lingering. Same for hard errors. Otherwise we do.
In practice, on HTTP health checks we generally find that the close was
pending and is returned upon first recv() call. The network trace becomes
cleaner :
19:55:23.650621 IP 127.0.0.1.16561 > 127.0.0.1.8000: S 3982804816:3982804816(0) win 32792 <mss 16396,nop,nop,sackOK,nop,wscale 7>
19:55:23.650644 IP 127.0.0.1.8000 > 127.0.0.1.16561: S 4082139313:4082139313(0) ack 3982804817 win 32792 <mss 16396,nop,nop,sackOK,nop,wscale 7>
19:55:23.650666 IP 127.0.0.1.16561 > 127.0.0.1.8000: . ack 1 win 257
19:55:23.651615 IP 127.0.0.1.16561 > 127.0.0.1.8000: P 1:23(22) ack 1 win 257
19:55:23.651696 IP 127.0.0.1.8000 > 127.0.0.1.16561: FP 1:146(145) ack 23 win 257
19:55:23.652628 IP 127.0.0.1.16561 > 127.0.0.1.8000: F 23:23(0) ack 147 win 257
19:55:23.652655 IP 127.0.0.1.8000 > 127.0.0.1.16561: . ack 24 win 257
This change should be backported to 1.4 which is where Chris encountered
this issue. The code is different, so probably the tcp_drain() function
will have to be put in the checks only.
2013-06-10 21:56:38 +04:00
return 1 ;
}
/* OK we read some data, let's try again once */
turns - - ;
}
/* some data are still present, give up */
return 0 ;
}
2012-05-11 21:53:32 +04:00
/* This is the callback which is set when a connection establishment is pending
2013-12-04 19:11:04 +04:00
* and we have nothing to send . It updates the FD polling status . It returns 0
* if it fails in a fatal way or needs to poll to go further , otherwise it
* returns non - zero and removes the CO_FL_WAIT_L4_CONN flag from the connection ' s
* flags . In case of error , it sets CO_FL_ERROR and leaves the error code in
* errno . The error checking is done in two passes in order to limit the number
* of syscalls in the normal case :
* - if POLL_ERR was reported by the poller , we check for a pending error on
* the socket before proceeding . If found , it ' s assigned to errno so that
* upper layers can see it .
* - otherwise connect ( ) is used to check the connection state again , since
* the getsockopt return cannot reliably be used to know if the connection
* is still pending or ready . This one may often return an error as well ,
* since we don ' t always have POLL_ERR ( eg : OSX or cached events ) .
2012-05-11 21:53:32 +04:00
*/
2012-07-23 20:53:03 +04:00
int tcp_connect_probe ( struct connection * conn )
2012-05-11 21:53:32 +04:00
{
2017-08-24 15:31:19 +03:00
int fd = conn - > handle . fd ;
2013-12-04 19:11:04 +04:00
socklen_t lskerr ;
int skerr ;
2012-05-11 21:53:32 +04:00
2012-07-06 16:54:49 +04:00
if ( conn - > flags & CO_FL_ERROR )
2012-08-09 16:45:22 +04:00
return 0 ;
2012-05-11 21:53:32 +04:00
2014-01-23 16:50:42 +04:00
if ( ! conn_ctrl_ready ( conn ) )
MAJOR: connection: add two new flags to indicate readiness of control/transport
Currently the control and transport layers of a connection are supposed
to be initialized when their respective pointers are not NULL. This will
not work anymore when we plan to reuse connections, because there is an
asymmetry between the accept() side and the connect() side :
- on accept() side, the fd is set first, then the ctrl layer then the
transport layer ; upon error, they must be undone in the reverse order,
then the FD must be closed. The FD must not be deleted if the control
layer was not yet initialized ;
- on the connect() side, the fd is set last and there is no reliable way
to know if it has been initialized or not. In practice it's initialized
to -1 first but this is hackish and supposes that local FDs only will
be used forever. Also, there are even less solutions for keeping trace
of the transport layer's state.
Also it is possible to support delayed close() when something (eg: logs)
tracks some information requiring the transport and/or control layers,
making it even more difficult to clean them.
So the proposed solution is to add two flags to the connection :
- CO_FL_CTRL_READY is set when the control layer is initialized (fd_insert)
and cleared after it's released (fd_delete).
- CO_FL_XPRT_READY is set when the control layer is initialized (xprt->init)
and cleared after it's released (xprt->close).
The functions have been adapted to rely on this and not on the pointers
anymore. conn_xprt_close() was unused and dangerous : it did not close
the control layer (eg: the socket itself) but still marks the transport
layer as closed, preventing any future call to conn_full_close() from
finishing the job.
The problem comes from conn_full_close() in fact. It needs to close the
xprt and ctrl layers independantly. After that we're still having an issue :
we don't know based on ->ctrl alone whether the fd was registered or not.
For this we use the two new flags CO_FL_XPRT_READY and CO_FL_CTRL_READY. We
now rely on this and not on conn->xprt nor conn->ctrl anymore to decide what
remains to be done on the connection.
In order not to miss some flag assignments, we introduce conn_ctrl_init()
to initialize the control layer, register the fd using fd_insert() and set
the flag, and conn_ctrl_close() which unregisters the fd and removes the
flag, but only if the transport layer was closed.
Similarly, at the transport layer, conn_xprt_init() calls ->init and sets
the flag, while conn_xprt_close() checks the flag, calls ->close and clears
the flag, regardless xprt_ctx or xprt_st. This also ensures that the ->init
and the ->close functions are called only once each and in the correct order.
Note that conn_xprt_close() does nothing if the transport layer is still
tracked.
conn_full_close() now simply calls conn_xprt_close() then conn_full_close()
in turn, which do nothing if CO_FL_XPRT_TRACKED is set.
In order to handle the error path, we also provide conn_force_close() which
ignores CO_FL_XPRT_TRACKED and closes the transport and the control layers
in turns. All relevant instances of fd_delete() have been replaced with
conn_force_close(). Now we always know what state the connection is in and
we can expect to split its initialization.
2013-10-21 18:30:56 +04:00
return 0 ;
2012-07-06 16:54:49 +04:00
if ( ! ( conn - > flags & CO_FL_WAIT_L4_CONN ) )
2012-07-23 22:05:00 +04:00
return 1 ; /* strange we were called while ready */
2012-05-11 21:53:32 +04:00
2014-01-20 18:13:07 +04:00
if ( ! fd_send_ready ( fd ) )
return 0 ;
2013-12-04 19:11:04 +04:00
/* we might be the first witness of FD_POLL_ERR. Note that FD_POLL_HUP
* without FD_POLL_IN also indicates a hangup without input data meaning
* there was no connection .
*/
if ( fdtab [ fd ] . ev & FD_POLL_ERR | |
( fdtab [ fd ] . ev & ( FD_POLL_IN | FD_POLL_HUP ) ) = = FD_POLL_HUP ) {
skerr = 0 ;
lskerr = sizeof ( skerr ) ;
getsockopt ( fd , SOL_SOCKET , SO_ERROR , & skerr , & lskerr ) ;
errno = skerr ;
if ( errno = = EAGAIN )
errno = 0 ;
if ( errno )
goto out_error ;
}
2012-07-23 17:07:23 +04:00
2013-12-04 19:11:04 +04:00
/* Use connect() to check the state of the socket. This has the
* advantage of giving us the following info :
2012-07-06 19:12:34 +04:00
* - error
* - connecting ( EALREADY , EINPROGRESS )
* - connected ( EISCONN , 0 )
2012-05-11 21:53:32 +04:00
*/
2012-08-30 23:11:38 +04:00
if ( connect ( fd , ( struct sockaddr * ) & conn - > addr . to , get_addr_len ( & conn - > addr . to ) ) < 0 ) {
2012-08-17 19:33:53 +04:00
if ( errno = = EALREADY | | errno = = EINPROGRESS ) {
2012-12-10 20:03:52 +04:00
__conn_sock_stop_recv ( conn ) ;
2014-01-22 23:02:06 +04:00
fd_cant_send ( fd ) ;
2012-07-23 22:05:00 +04:00
return 0 ;
2012-08-17 19:33:53 +04:00
}
2012-05-20 20:35:19 +04:00
2012-07-06 19:12:34 +04:00
if ( errno & & errno ! = EISCONN )
2012-05-20 20:35:19 +04:00
goto out_error ;
2012-07-06 19:12:34 +04:00
/* otherwise we're connected */
2012-05-20 20:35:19 +04:00
}
2012-05-11 21:53:32 +04:00
2012-07-06 18:02:29 +04:00
/* The FD is ready now, we'll mark the connection as complete and
REORG: connection: rename the data layer the "transport layer"
While working on the changes required to make the health checks use the
new connections, it started to become obvious that some naming was not
logical at all in the connections. Specifically, it is not logical to
call the "data layer" the layer which is in charge for all the handshake
and which does not yet provide a data layer once established until a
session has allocated all the required buffers.
In fact, it's more a transport layer, which makes much more sense. The
transport layer offers a medium on which data can transit, and it offers
the functions to move these data when the upper layer requests this. And
it is the upper layer which iterates over the transport layer's functions
to move data which should be called the data layer.
The use case where it's obvious is with embryonic sessions : an incoming
SSL connection is accepted. Only the connection is allocated, not the
buffers nor stream interface, etc... The connection handles the SSL
handshake by itself. Once this handshake is complete, we can't use the
data functions because the buffers and stream interface are not there
yet. Hence we have to first call a specific function to complete the
session initialization, after which we'll be able to use the data
functions. This clearly proves that SSL here is only a transport layer
and that the stream interface constitutes the data layer.
A similar change will be performed to rename app_cb => data, but the
two could not be in the same commit for obvious reasons.
2012-10-03 02:19:48 +04:00
* forward the event to the transport layer which will notify the
* data layer .
2012-05-20 20:35:19 +04:00
*/
2012-07-06 16:54:49 +04:00
conn - > flags & = ~ CO_FL_WAIT_L4_CONN ;
2012-07-23 22:05:00 +04:00
return 1 ;
2012-05-11 21:53:32 +04:00
out_error :
2012-07-23 22:05:00 +04:00
/* Write error on the file descriptor. Report it to the connection
* and disable polling on this FD .
2012-05-11 21:53:32 +04:00
*/
2014-01-20 14:56:37 +04:00
fdtab [ fd ] . linger_risk = 0 ;
2013-12-05 02:44:10 +04:00
conn - > flags | = CO_FL_ERROR | CO_FL_SOCK_RD_SH | CO_FL_SOCK_WR_SH ;
2012-12-10 20:03:52 +04:00
__conn_sock_stop_both ( conn ) ;
2012-08-09 16:45:22 +04:00
return 0 ;
2012-05-11 21:53:32 +04:00
}
MINOR: tcp: When binding socket, attempt to reuse one from the old proc.
Try to reuse any socket from the old process, provided by the "-x" flag,
before binding a new one, assuming it is compatible.
"Compatible" here means same address and port, same namespace if any,
same interface if any, and that the following flags are the same :
LI_O_FOREIGN, LI_O_V6ONLY and LI_O_V4V6.
Also change tcp_bind_listener() to always enable/disable socket options,
instead of just doing so if it is in the configuration file, as the option
may have been removed, ie TCP_FASTOPEN may have been set in the old process,
and removed from the new configuration, so we have to disable it.
2017-04-05 23:39:56 +03:00
/* XXX: Should probably be elsewhere */
/* Compares the address family, the port and the IP address stored in <a> and
 * <b>. Only AF_INET and AF_INET6 are supported. Returns 0 when both addresses
 * designate the same family, port and address, and a non-zero value when they
 * differ or when the family is not one of the two supported ones. Neither
 * argument is modified.
 */
static int compare_sockaddr(const struct sockaddr_storage *a, const struct sockaddr_storage *b)
{
	if (a->ss_family != b->ss_family)
		return -1;

	switch (a->ss_family) {
	case AF_INET:
	{
		const struct sockaddr_in *a4 = (const void *)a;
		const struct sockaddr_in *b4 = (const void *)b;

		if (a4->sin_port != b4->sin_port)
			return -1;
		return memcmp(&a4->sin_addr, &b4->sin_addr, sizeof(a4->sin_addr));
	}
	case AF_INET6:
	{
		const struct sockaddr_in6 *a6 = (const void *)a;
		const struct sockaddr_in6 *b6 = (const void *)b;

		if (a6->sin6_port != b6->sin6_port)
			return -1;
		return memcmp(&a6->sin6_addr, &b6->sin6_addr, sizeof(a6->sin6_addr));
	}
	default:
		/* unsupported family: never reported as equal */
		return -1;
	}
}
# define LI_MANDATORY_FLAGS (LI_O_FOREIGN | LI_O_V6ONLY | LI_O_V4V6)
/* When binding the listeners, check if a socket has been sent to us by the
* previous process that we could reuse , instead of creating a new one .
*/
static int tcp_find_compatible_fd ( struct listener * l )
{
struct xfer_sock_list * xfer_sock = xfer_sock_list ;
int ret = - 1 ;
while ( xfer_sock ) {
if ( ! compare_sockaddr ( & xfer_sock - > addr , & l - > addr ) ) {
if ( ( l - > interface = = NULL & & xfer_sock - > iface = = NULL ) | |
( l - > interface ! = NULL & & xfer_sock - > iface ! = NULL & &
! strcmp ( l - > interface , xfer_sock - > iface ) ) ) {
if ( ( l - > options & LI_MANDATORY_FLAGS ) = =
( xfer_sock - > options & LI_MANDATORY_FLAGS ) ) {
if ( ( xfer_sock - > namespace = = NULL & &
l - > netns = = NULL )
# ifdef CONFIG_HAP_NS
| | ( xfer_sock - > namespace ! = NULL & &
l - > netns ! = NULL & &
! strcmp ( xfer_sock - > namespace ,
l - > netns - > node . key ) )
# endif
) {
break ;
}
}
}
}
xfer_sock = xfer_sock - > next ;
}
if ( xfer_sock ! = NULL ) {
ret = xfer_sock - > fd ;
if ( xfer_sock = = xfer_sock_list )
xfer_sock_list = xfer_sock - > next ;
if ( xfer_sock - > prev )
xfer_sock - > prev - > next = xfer_sock - > next ;
if ( xfer_sock - > next )
xfer_sock - > next - > prev = xfer_sock - > prev ;
free ( xfer_sock - > iface ) ;
free ( xfer_sock - > namespace ) ;
free ( xfer_sock ) ;
}
return ret ;
}
# undef L1_MANDATORY_FLAGS
2012-05-11 21:53:32 +04:00
2007-10-29 03:09:36 +03:00
/* This function tries to bind a TCPv4/v6 listener. It may return a warning or
2013-01-24 04:41:38 +04:00
* an error message in < errmsg > if the message is at most < errlen > bytes long
* ( including ' \0 ' ) . Note that < errmsg > may be NULL if < errlen > is also zero .
* The return value is composed from ERR_ABORT , ERR_WARN ,
2007-10-29 03:09:36 +03:00
* ERR_ALERT , ERR_RETRYABLE and ERR_FATAL . ERR_NONE indicates that everything
* was alright and that no message was returned . ERR_RETRYABLE means that an
* error occurred but that it may vanish after a retry ( eg : port in use ) , and
2012-04-07 04:39:26 +04:00
* ERR_FATAL indicates a non - fixable error . ERR_WARN and ERR_ALERT do not alter
2007-10-29 03:09:36 +03:00
* the meaning of the error , but just indicate that a message is present which
* should be displayed with the respective level . Last , ERR_ABORT indicates
* that it ' s pointless to try to start other listeners . No error message is
* returned if errlen is NULL .
*/
int tcp_bind_listener ( struct listener * listener , char * errmsg , int errlen )
{
__label__ tcp_return , tcp_close_return ;
int fd , err ;
2013-03-11 02:51:38 +04:00
int ext , ready ;
socklen_t ready_len ;
2007-10-29 03:09:36 +03:00
const char * msg = NULL ;
MINOR: tcp: When binding socket, attempt to reuse one from the old proc.
Try to reuse any socket from the old process, provided by the "-x" flag,
before binding a new one, assuming it is compatible.
"Compatible" here means same address and port, same namspace if any,
same interface if any, and that the following flags are the same :
LI_O_FOREIGN, LI_O_V6ONLY and LI_O_V4V6.
Also change tcp_bind_listener() to always enable/disable socket options,
instead of just doing so if it is in the configuration file, as the option
may have been removed, ie TCP_FASTOPEN may have been set in the old process,
and removed from the new configuration, so we have to disable it.
2017-04-05 23:39:56 +03:00
# ifdef TCP_MAXSEG
/* Create a temporary TCP socket to get default parameters we can't
* guess .
* */
ready_len = sizeof ( default_tcp_maxseg ) ;
if ( default_tcp_maxseg = = - 1 ) {
default_tcp_maxseg = - 2 ;
fd = socket ( AF_INET , SOCK_STREAM , IPPROTO_TCP ) ;
if ( fd < 0 )
2017-11-24 18:50:31 +03:00
ha_warning ( " Failed to create a temporary socket! \n " ) ;
MINOR: tcp: When binding socket, attempt to reuse one from the old proc.
Try to reuse any socket from the old process, provided by the "-x" flag,
before binding a new one, assuming it is compatible.
"Compatible" here means same address and port, same namspace if any,
same interface if any, and that the following flags are the same :
LI_O_FOREIGN, LI_O_V6ONLY and LI_O_V4V6.
Also change tcp_bind_listener() to always enable/disable socket options,
instead of just doing so if it is in the configuration file, as the option
may have been removed, ie TCP_FASTOPEN may have been set in the old process,
and removed from the new configuration, so we have to disable it.
2017-04-05 23:39:56 +03:00
else {
if ( getsockopt ( fd , IPPROTO_TCP , TCP_MAXSEG , & default_tcp_maxseg ,
& ready_len ) = = - 1 )
2017-11-24 18:50:31 +03:00
ha_warning ( " Failed to get the default value of TCP_MAXSEG \n " ) ;
MINOR: tcp: When binding socket, attempt to reuse one from the old proc.
Try to reuse any socket from the old process, provided by the "-x" flag,
before binding a new one, assuming it is compatible.
"Compatible" here means same address and port, same namspace if any,
same interface if any, and that the following flags are the same :
LI_O_FOREIGN, LI_O_V6ONLY and LI_O_V4V6.
Also change tcp_bind_listener() to always enable/disable socket options,
instead of just doing so if it is in the configuration file, as the option
may have been removed, ie TCP_FASTOPEN may have been set in the old process,
and removed from the new configuration, so we have to disable it.
2017-04-05 23:39:56 +03:00
}
close ( fd ) ;
}
if ( default_tcp6_maxseg = = - 1 ) {
default_tcp6_maxseg = - 2 ;
fd = socket ( AF_INET6 , SOCK_STREAM , IPPROTO_TCP ) ;
if ( fd > = 0 ) {
if ( getsockopt ( fd , IPPROTO_TCP , TCP_MAXSEG , & default_tcp6_maxseg ,
& ready_len ) = = - 1 )
2017-11-24 18:50:31 +03:00
ha_warning ( " Failed ot get the default value of TCP_MAXSEG for IPv6 \n " ) ;
MINOR: tcp: When binding socket, attempt to reuse one from the old proc.
Try to reuse any socket from the old process, provided by the "-x" flag,
before binding a new one, assuming it is compatible.
"Compatible" here means same address and port, same namspace if any,
same interface if any, and that the following flags are the same :
LI_O_FOREIGN, LI_O_V6ONLY and LI_O_V4V6.
Also change tcp_bind_listener() to always enable/disable socket options,
instead of just doing so if it is in the configuration file, as the option
may have been removed, ie TCP_FASTOPEN may have been set in the old process,
and removed from the new configuration, so we have to disable it.
2017-04-05 23:39:56 +03:00
close ( fd ) ;
}
}
# endif
2007-10-29 03:09:36 +03:00
/* ensure we never return garbage */
2013-01-24 04:41:38 +04:00
if ( errlen )
2007-10-29 03:09:36 +03:00
* errmsg = 0 ;
if ( listener - > state ! = LI_ASSIGNED )
return ERR_NONE ; /* already bound */
err = ERR_NONE ;
MINOR: tcp: When binding socket, attempt to reuse one from the old proc.
Try to reuse any socket from the old process, provided by the "-x" flag,
before binding a new one, assuming it is compatible.
"Compatible" here means same address and port, same namspace if any,
same interface if any, and that the following flags are the same :
LI_O_FOREIGN, LI_O_V6ONLY and LI_O_V4V6.
Also change tcp_bind_listener() to always enable/disable socket options,
instead of just doing so if it is in the configuration file, as the option
may have been removed, ie TCP_FASTOPEN may have been set in the old process,
and removed from the new configuration, so we have to disable it.
2017-04-05 23:39:56 +03:00
if ( listener - > fd = = - 1 )
listener - > fd = tcp_find_compatible_fd ( listener ) ;
2013-03-11 02:51:38 +04:00
/* if the listener already has an fd assigned, then we were offered the
* fd by an external process ( most likely the parent ) , and we don ' t want
* to create a new socket . However we still want to set a few flags on
* the socket .
*/
fd = listener - > fd ;
ext = ( fd > = 0 ) ;
2014-11-17 17:11:45 +03:00
if ( ! ext ) {
fd = my_socketat ( listener - > netns , listener - > addr . ss_family , SOCK_STREAM , IPPROTO_TCP ) ;
if ( fd = = - 1 ) {
err | = ERR_RETRYABLE | ERR_ALERT ;
msg = " cannot create listening socket " ;
goto tcp_return ;
}
2007-10-29 03:09:36 +03:00
}
2008-12-01 01:15:34 +03:00
2007-10-29 03:09:36 +03:00
if ( fd > = global . maxsock ) {
err | = ERR_FATAL | ERR_ABORT | ERR_ALERT ;
msg = " not enough free sockets (raise '-n' parameter) " ;
goto tcp_close_return ;
}
2009-06-14 17:24:37 +04:00
if ( fcntl ( fd , F_SETFL , O_NONBLOCK ) = = - 1 ) {
2007-10-29 03:09:36 +03:00
err | = ERR_FATAL | ERR_ALERT ;
msg = " cannot make socket non-blocking " ;
goto tcp_close_return ;
}
2013-03-11 02:51:38 +04:00
if ( ! ext & & setsockopt ( fd , SOL_SOCKET , SO_REUSEADDR , & one , sizeof ( one ) ) = = - 1 ) {
2007-10-29 03:09:36 +03:00
/* not fatal but should be reported */
msg = " cannot do so_reuseaddr " ;
err | = ERR_ALERT ;
}
if ( listener - > options & LI_O_NOLINGER )
2011-06-24 10:11:37 +04:00
setsockopt ( fd , SOL_SOCKET , SO_LINGER , & nolinger , sizeof ( struct linger ) ) ;
MINOR: tcp: When binding socket, attempt to reuse one from the old proc.
Try to reuse any socket from the old process, provided by the "-x" flag,
before binding a new one, assuming it is compatible.
"Compatible" here means same address and port, same namspace if any,
same interface if any, and that the following flags are the same :
LI_O_FOREIGN, LI_O_V6ONLY and LI_O_V4V6.
Also change tcp_bind_listener() to always enable/disable socket options,
instead of just doing so if it is in the configuration file, as the option
may have been removed, ie TCP_FASTOPEN may have been set in the old process,
and removed from the new configuration, so we have to disable it.
2017-04-05 23:39:56 +03:00
else {
struct linger tmplinger ;
socklen_t len = sizeof ( tmplinger ) ;
if ( getsockopt ( fd , SOL_SOCKET , SO_LINGER , & tmplinger , & len ) = = 0 & &
( tmplinger . l_onoff = = 1 | | tmplinger . l_linger = = 0 ) ) {
tmplinger . l_onoff = 0 ;
tmplinger . l_linger = 0 ;
setsockopt ( fd , SOL_SOCKET , SO_LINGER , & tmplinger ,
sizeof ( tmplinger ) ) ;
}
}
2008-12-01 01:15:34 +03:00
2007-10-29 03:09:36 +03:00
# ifdef SO_REUSEPORT
2016-09-13 00:42:20 +03:00
/* OpenBSD and Linux 3.9 support this. As it's present in old libc versions of
* Linux , it might return an error that we will silently ignore .
2007-10-29 03:09:36 +03:00
*/
2016-09-13 00:42:20 +03:00
if ( ! ext & & ( global . tune . options & GTUNE_USE_REUSEPORT ) )
2013-03-11 02:51:38 +04:00
setsockopt ( fd , SOL_SOCKET , SO_REUSEPORT , & one , sizeof ( one ) ) ;
2008-01-13 16:49:51 +03:00
# endif
2013-05-09 00:49:23 +04:00
2013-03-11 02:51:38 +04:00
if ( ! ext & & ( listener - > options & LI_O_FOREIGN ) ) {
2012-07-13 16:34:59 +04:00
switch ( listener - > addr . ss_family ) {
case AF_INET :
2013-05-09 00:49:23 +04:00
if ( 1
# if defined(IP_TRANSPARENT)
& & ( setsockopt ( fd , SOL_IP , IP_TRANSPARENT , & one , sizeof ( one ) ) = = - 1 )
# endif
# if defined(IP_FREEBIND)
& & ( setsockopt ( fd , SOL_IP , IP_FREEBIND , & one , sizeof ( one ) ) = = - 1 )
2013-05-09 01:22:39 +04:00
# endif
# if defined(IP_BINDANY)
& & ( setsockopt ( fd , IPPROTO_IP , IP_BINDANY , & one , sizeof ( one ) ) = = - 1 )
2013-05-09 01:30:23 +04:00
# endif
# if defined(SO_BINDANY)
& & ( setsockopt ( fd , SOL_SOCKET , SO_BINDANY , & one , sizeof ( one ) ) = = - 1 )
2013-05-09 00:49:23 +04:00
# endif
) {
2012-07-13 16:34:59 +04:00
msg = " cannot make listening socket transparent " ;
err | = ERR_ALERT ;
}
break ;
case AF_INET6 :
2013-05-09 00:49:23 +04:00
if ( 1
2016-09-09 10:41:15 +03:00
# if defined(IPV6_TRANSPARENT) && defined(SOL_IPV6)
2013-05-09 00:49:23 +04:00
& & ( setsockopt ( fd , SOL_IPV6 , IPV6_TRANSPARENT , & one , sizeof ( one ) ) = = - 1 )
2013-05-09 01:22:39 +04:00
# endif
2014-03-04 00:10:51 +04:00
# if defined(IP_FREEBIND)
& & ( setsockopt ( fd , SOL_IP , IP_FREEBIND , & one , sizeof ( one ) ) = = - 1 )
# endif
2013-05-09 01:22:39 +04:00
# if defined(IPV6_BINDANY)
& & ( setsockopt ( fd , IPPROTO_IPV6 , IPV6_BINDANY , & one , sizeof ( one ) ) = = - 1 )
2013-05-09 01:30:23 +04:00
# endif
# if defined(SO_BINDANY)
& & ( setsockopt ( fd , SOL_SOCKET , SO_BINDANY , & one , sizeof ( one ) ) = = - 1 )
2013-05-09 00:49:23 +04:00
# endif
) {
2012-07-13 16:34:59 +04:00
msg = " cannot make listening socket transparent " ;
err | = ERR_ALERT ;
}
break ;
}
2008-01-13 16:49:51 +03:00
}
2013-05-09 00:49:23 +04:00
2009-02-04 19:19:29 +03:00
# ifdef SO_BINDTODEVICE
/* Note: this might fail if not CAP_NET_RAW */
2013-03-11 02:51:38 +04:00
if ( ! ext & & listener - > interface ) {
2009-02-04 19:19:29 +03:00
if ( setsockopt ( fd , SOL_SOCKET , SO_BINDTODEVICE ,
2009-03-06 02:48:23 +03:00
listener - > interface , strlen ( listener - > interface ) + 1 ) = = - 1 ) {
2009-02-04 19:19:29 +03:00
msg = " cannot bind listener to device " ;
err | = ERR_WARN ;
}
}
2009-06-14 20:48:19 +04:00
# endif
2009-08-24 15:11:06 +04:00
# if defined(TCP_MAXSEG)
2010-12-24 17:26:39 +03:00
if ( listener - > maxseg > 0 ) {
2009-08-24 15:11:06 +04:00
if ( setsockopt ( fd , IPPROTO_TCP , TCP_MAXSEG ,
2009-06-14 20:48:19 +04:00
& listener - > maxseg , sizeof ( listener - > maxseg ) ) = = - 1 ) {
msg = " cannot set MSS " ;
err | = ERR_WARN ;
}
MINOR: tcp: When binding socket, attempt to reuse one from the old proc.
Try to reuse any socket from the old process, provided by the "-x" flag,
before binding a new one, assuming it is compatible.
"Compatible" here means same address and port, same namspace if any,
same interface if any, and that the following flags are the same :
LI_O_FOREIGN, LI_O_V6ONLY and LI_O_V4V6.
Also change tcp_bind_listener() to always enable/disable socket options,
instead of just doing so if it is in the configuration file, as the option
may have been removed, ie TCP_FASTOPEN may have been set in the old process,
and removed from the new configuration, so we have to disable it.
2017-04-05 23:39:56 +03:00
} else if ( ext ) {
int tmpmaxseg = - 1 ;
int defaultmss ;
socklen_t len = sizeof ( tmpmaxseg ) ;
if ( listener - > addr . ss_family = = AF_INET )
defaultmss = default_tcp_maxseg ;
else
defaultmss = default_tcp6_maxseg ;
getsockopt ( fd , IPPROTO_TCP , TCP_MAXSEG , & tmpmaxseg , & len ) ;
if ( tmpmaxseg ! = defaultmss & & setsockopt ( fd , IPPROTO_TCP ,
TCP_MAXSEG , & defaultmss ,
sizeof ( defaultmss ) ) = = - 1 ) {
msg = " cannot set MSS " ;
err | = ERR_WARN ;
}
2009-06-14 20:48:19 +04:00
}
2009-10-13 09:34:14 +04:00
# endif
2015-02-04 02:45:58 +03:00
# if defined(TCP_USER_TIMEOUT)
if ( listener - > tcp_ut ) {
if ( setsockopt ( fd , IPPROTO_TCP , TCP_USER_TIMEOUT ,
& listener - > tcp_ut , sizeof ( listener - > tcp_ut ) ) = = - 1 ) {
msg = " cannot set TCP User Timeout " ;
err | = ERR_WARN ;
}
MINOR: tcp: When binding socket, attempt to reuse one from the old proc.
Try to reuse any socket from the old process, provided by the "-x" flag,
before binding a new one, assuming it is compatible.
"Compatible" here means same address and port, same namspace if any,
same interface if any, and that the following flags are the same :
LI_O_FOREIGN, LI_O_V6ONLY and LI_O_V4V6.
Also change tcp_bind_listener() to always enable/disable socket options,
instead of just doing so if it is in the configuration file, as the option
may have been removed, ie TCP_FASTOPEN may have been set in the old process,
and removed from the new configuration, so we have to disable it.
2017-04-05 23:39:56 +03:00
} else
setsockopt ( fd , IPPROTO_TCP , TCP_USER_TIMEOUT , & zero ,
sizeof ( zero ) ) ;
2015-02-04 02:45:58 +03:00
# endif
2009-10-13 09:34:14 +04:00
# if defined(TCP_DEFER_ACCEPT)
if ( listener - > options & LI_O_DEF_ACCEPT ) {
/* defer accept by up to one second */
int accept_delay = 1 ;
if ( setsockopt ( fd , IPPROTO_TCP , TCP_DEFER_ACCEPT , & accept_delay , sizeof ( accept_delay ) ) = = - 1 ) {
msg = " cannot enable DEFER_ACCEPT " ;
err | = ERR_WARN ;
}
MINOR: tcp: When binding socket, attempt to reuse one from the old proc.
Try to reuse any socket from the old process, provided by the "-x" flag,
before binding a new one, assuming it is compatible.
"Compatible" here means same address and port, same namspace if any,
same interface if any, and that the following flags are the same :
LI_O_FOREIGN, LI_O_V6ONLY and LI_O_V4V6.
Also change tcp_bind_listener() to always enable/disable socket options,
instead of just doing so if it is in the configuration file, as the option
may have been removed, ie TCP_FASTOPEN may have been set in the old process,
and removed from the new configuration, so we have to disable it.
2017-04-05 23:39:56 +03:00
} else
setsockopt ( fd , IPPROTO_TCP , TCP_DEFER_ACCEPT , & zero ,
sizeof ( zero ) ) ;
2012-10-05 18:21:00 +04:00
# endif
# if defined(TCP_FASTOPEN)
if ( listener - > options & LI_O_TCP_FO ) {
/* TFO needs a queue length, let's use the configured backlog */
int qlen = listener - > backlog ? listener - > backlog : listener - > maxconn ;
if ( setsockopt ( fd , IPPROTO_TCP , TCP_FASTOPEN , & qlen , sizeof ( qlen ) ) = = - 1 ) {
msg = " cannot enable TCP_FASTOPEN " ;
err | = ERR_WARN ;
}
MINOR: tcp: When binding socket, attempt to reuse one from the old proc.
Try to reuse any socket from the old process, provided by the "-x" flag,
before binding a new one, assuming it is compatible.
"Compatible" here means same address and port, same namspace if any,
same interface if any, and that the following flags are the same :
LI_O_FOREIGN, LI_O_V6ONLY and LI_O_V4V6.
Also change tcp_bind_listener() to always enable/disable socket options,
instead of just doing so if it is in the configuration file, as the option
may have been removed, ie TCP_FASTOPEN may have been set in the old process,
and removed from the new configuration, so we have to disable it.
2017-04-05 23:39:56 +03:00
} else {
socklen_t len ;
int qlen ;
len = sizeof ( qlen ) ;
/* Only disable fast open if it was enabled, we don't want
* the kernel to create a fast open queue if there ' s none .
*/
if ( getsockopt ( fd , IPPROTO_TCP , TCP_FASTOPEN , & qlen , & len ) = = 0 & &
qlen ! = 0 ) {
if ( setsockopt ( fd , IPPROTO_TCP , TCP_FASTOPEN , & zero ,
sizeof ( zero ) ) = = - 1 ) {
msg = " cannot disable TCP_FASTOPEN " ;
err | = ERR_WARN ;
}
}
2012-10-05 18:21:00 +04:00
}
2007-10-29 03:09:36 +03:00
# endif
2012-11-24 14:55:28 +04:00
# if defined(IPV6_V6ONLY)
if ( listener - > options & LI_O_V6ONLY )
setsockopt ( fd , IPPROTO_IPV6 , IPV6_V6ONLY , & one , sizeof ( one ) ) ;
2012-11-24 18:07:23 +04:00
else if ( listener - > options & LI_O_V4V6 )
setsockopt ( fd , IPPROTO_IPV6 , IPV6_V6ONLY , & zero , sizeof ( zero ) ) ;
2012-11-24 14:55:28 +04:00
# endif
2013-03-11 02:51:38 +04:00
if ( ! ext & & bind ( fd , ( struct sockaddr * ) & listener - > addr , listener - > proto - > sock_addrlen ) = = - 1 ) {
2007-10-29 03:09:36 +03:00
err | = ERR_RETRYABLE | ERR_ALERT ;
msg = " cannot bind socket " ;
goto tcp_close_return ;
}
2008-12-01 01:15:34 +03:00
2013-03-11 02:51:38 +04:00
ready = 0 ;
ready_len = sizeof ( ready ) ;
if ( getsockopt ( fd , SOL_SOCKET , SO_ACCEPTCONN , & ready , & ready_len ) = = - 1 )
ready = 0 ;
if ( ! ( ext & & ready ) & & /* only listen if not already done by external process */
listen ( fd , listener - > backlog ? listener - > backlog : listener - > maxconn ) = = - 1 ) {
2007-10-29 03:09:36 +03:00
err | = ERR_RETRYABLE | ERR_ALERT ;
msg = " cannot listen to socket " ;
goto tcp_close_return ;
}
2008-12-01 01:15:34 +03:00
2009-08-24 15:11:06 +04:00
# if defined(TCP_QUICKACK)
2009-06-14 14:07:01 +04:00
if ( listener - > options & LI_O_NOQUICKACK )
2011-06-24 10:11:37 +04:00
setsockopt ( fd , IPPROTO_TCP , TCP_QUICKACK , & zero , sizeof ( zero ) ) ;
MINOR: tcp: When binding socket, attempt to reuse one from the old proc.
Try to reuse any socket from the old process, provided by the "-x" flag,
before binding a new one, assuming it is compatible.
"Compatible" here means same address and port, same namspace if any,
same interface if any, and that the following flags are the same :
LI_O_FOREIGN, LI_O_V6ONLY and LI_O_V4V6.
Also change tcp_bind_listener() to always enable/disable socket options,
instead of just doing so if it is in the configuration file, as the option
may have been removed, ie TCP_FASTOPEN may have been set in the old process,
and removed from the new configuration, so we have to disable it.
2017-04-05 23:39:56 +03:00
else
setsockopt ( fd , IPPROTO_TCP , TCP_QUICKACK , & one , sizeof ( one ) ) ;
2009-06-14 14:07:01 +04:00
# endif
2007-10-29 03:09:36 +03:00
/* the socket is ready */
listener - > fd = fd ;
listener - > state = LI_LISTEN ;
2008-08-30 01:36:51 +04:00
fdtab [ fd ] . owner = listener ; /* reference the listener instead of a task */
2012-07-06 14:25:58 +04:00
fdtab [ fd ] . iocb = listener - > proto - > accept ;
2017-11-24 12:08:09 +03:00
if ( listener - > bind_conf - > bind_thread [ relative_pid - 1 ] )
fd_insert ( fd , listener - > bind_conf - > bind_thread [ relative_pid - 1 ] ) ;
else
fd_insert ( fd , MAX_THREADS_MASK ) ;
2010-05-28 20:46:57 +04:00
2007-10-29 03:09:36 +03:00
tcp_return :
2010-11-01 21:26:01 +03:00
if ( msg & & errlen ) {
char pn [ INET6_ADDRSTRLEN ] ;
2011-09-05 02:36:48 +04:00
addr_to_str ( & listener - > addr , pn , sizeof ( pn ) ) ;
snprintf ( errmsg , errlen , " %s [%s:%d] " , msg , pn , get_host_port ( & listener - > addr ) ) ;
2010-11-01 21:26:01 +03:00
}
2007-10-29 03:09:36 +03:00
return err ;
tcp_close_return :
close ( fd ) ;
goto tcp_return ;
}
/* This function creates all TCP sockets bound to the protocol entry <proto>.
* It is intended to be used as the protocol ' s bind_all ( ) function .
* The sockets will be registered but not added to any fd_set , in order not to
* loose them across the fork ( ) . A call to enable_all_listeners ( ) is needed
* to complete initialization . The return value is composed from ERR_ * .
*/
2010-10-22 18:06:11 +04:00
static int tcp_bind_listeners ( struct protocol * proto , char * errmsg , int errlen )
2007-10-29 03:09:36 +03:00
{
struct listener * listener ;
int err = ERR_NONE ;
list_for_each_entry ( listener , & proto - > listeners , proto_list ) {
2010-10-22 18:06:11 +04:00
err | = tcp_bind_listener ( listener , errmsg , errlen ) ;
if ( err & ERR_ABORT )
2007-10-29 03:09:36 +03:00
break ;
}
return err ;
}
2017-09-15 08:44:44 +03:00
/* Add <listener> to the list of tcpv4 listeners, on port <port>. The
* listener ' s state is automatically updated from LI_INIT to LI_ASSIGNED .
* The number of listeners for the protocol is updated .
2007-10-29 03:09:36 +03:00
*/
2017-09-15 08:55:51 +03:00
static void tcpv4_add_listener ( struct listener * listener , int port )
2007-10-29 03:09:36 +03:00
{
if ( listener - > state ! = LI_INIT )
return ;
listener - > state = LI_ASSIGNED ;
listener - > proto = & proto_tcpv4 ;
2017-09-15 08:44:44 +03:00
( ( struct sockaddr_in * ) ( & listener - > addr ) ) - > sin_port = htons ( port ) ;
2007-10-29 03:09:36 +03:00
LIST_ADDQ ( & proto_tcpv4 . listeners , & listener - > proto_list ) ;
proto_tcpv4 . nb_listeners + + ;
}
2017-09-15 08:44:44 +03:00
/* Add <listener> to the list of tcpv6 listeners, on port <port>. The
* listener ' s state is automatically updated from LI_INIT to LI_ASSIGNED .
* The number of listeners for the protocol is updated .
2007-10-29 03:09:36 +03:00
*/
2017-09-15 08:55:51 +03:00
static void tcpv6_add_listener ( struct listener * listener , int port )
2007-10-29 03:09:36 +03:00
{
if ( listener - > state ! = LI_INIT )
return ;
listener - > state = LI_ASSIGNED ;
listener - > proto = & proto_tcpv6 ;
2017-09-15 08:44:44 +03:00
( ( struct sockaddr_in * ) ( & listener - > addr ) ) - > sin_port = htons ( port ) ;
2007-10-29 03:09:36 +03:00
LIST_ADDQ ( & proto_tcpv6 . listeners , & listener - > proto_list ) ;
proto_tcpv6 . nb_listeners + + ;
}
2014-07-07 22:22:12 +04:00
/* Pause a listener. Returns < 0 in case of failure, 0 if the listener
* was totally stopped , or > 0 if correctly paused .
*/
int tcp_pause_listener ( struct listener * l )
{
if ( shutdown ( l - > fd , SHUT_WR ) ! = 0 )
return - 1 ; /* Solaris dies here */
if ( listen ( l - > fd , l - > backlog ? l - > backlog : l - > maxconn ) ! = 0 )
return - 1 ; /* OpenBSD dies here */
if ( shutdown ( l - > fd , SHUT_RD ) ! = 0 )
return - 1 ; /* should always be OK */
return 1 ;
}
2016-05-25 02:48:42 +03:00
/*
2016-10-21 16:07:45 +03:00
* Execute the " set-src " action . May be called from { tcp , http } request .
* It only changes the address and tries to preserve the original port . If the
* previous family was neither AF_INET nor AF_INET6 , the port is set to zero .
2016-05-25 02:48:42 +03:00
*/
enum act_return tcp_action_req_set_src ( struct act_rule * rule , struct proxy * px ,
struct session * sess , struct stream * s , int flags )
{
struct connection * cli_conn ;
if ( ( cli_conn = objt_conn ( sess - > origin ) ) & & conn_ctrl_ready ( cli_conn ) ) {
struct sample * smp ;
smp = sample_fetch_as_type ( px , sess , s , SMP_OPT_DIR_REQ | SMP_OPT_FINAL , rule - > arg . expr , SMP_T_ADDR ) ;
if ( smp ) {
2016-10-21 16:07:45 +03:00
int port = get_net_port ( & cli_conn - > addr . from ) ;
2016-05-25 02:48:42 +03:00
if ( smp - > data . type = = SMP_T_IPV4 ) {
( ( struct sockaddr_in * ) & cli_conn - > addr . from ) - > sin_family = AF_INET ;
( ( struct sockaddr_in * ) & cli_conn - > addr . from ) - > sin_addr . s_addr = smp - > data . u . ipv4 . s_addr ;
2016-10-21 16:07:45 +03:00
( ( struct sockaddr_in * ) & cli_conn - > addr . from ) - > sin_port = port ;
2016-05-25 02:48:42 +03:00
} else if ( smp - > data . type = = SMP_T_IPV6 ) {
( ( struct sockaddr_in6 * ) & cli_conn - > addr . from ) - > sin6_family = AF_INET6 ;
memcpy ( & ( ( struct sockaddr_in6 * ) & cli_conn - > addr . from ) - > sin6_addr , & smp - > data . u . ipv6 , sizeof ( struct in6_addr ) ) ;
2016-10-21 16:07:45 +03:00
( ( struct sockaddr_in6 * ) & cli_conn - > addr . from ) - > sin6_port = port ;
2016-05-25 02:48:42 +03:00
}
}
2016-05-25 03:33:16 +03:00
cli_conn - > flags | = CO_FL_ADDR_FROM_SET ;
2016-05-25 02:48:42 +03:00
}
return ACT_RET_CONT ;
}
2016-05-25 03:34:07 +03:00
/*
2016-10-21 16:07:45 +03:00
* Execute the " set-dst " action . May be called from { tcp , http } request .
* It only changes the address and tries to preserve the original port . If the
* previous family was neither AF_INET nor AF_INET6 , the port is set to zero .
2016-05-25 03:34:07 +03:00
*/
enum act_return tcp_action_req_set_dst ( struct act_rule * rule , struct proxy * px ,
struct session * sess , struct stream * s , int flags )
{
struct connection * cli_conn ;
if ( ( cli_conn = objt_conn ( sess - > origin ) ) & & conn_ctrl_ready ( cli_conn ) ) {
struct sample * smp ;
smp = sample_fetch_as_type ( px , sess , s , SMP_OPT_DIR_REQ | SMP_OPT_FINAL , rule - > arg . expr , SMP_T_ADDR ) ;
if ( smp ) {
2016-10-21 16:07:45 +03:00
int port = get_net_port ( & cli_conn - > addr . to ) ;
2016-05-25 03:34:07 +03:00
if ( smp - > data . type = = SMP_T_IPV4 ) {
( ( struct sockaddr_in * ) & cli_conn - > addr . to ) - > sin_family = AF_INET ;
( ( struct sockaddr_in * ) & cli_conn - > addr . to ) - > sin_addr . s_addr = smp - > data . u . ipv4 . s_addr ;
} else if ( smp - > data . type = = SMP_T_IPV6 ) {
( ( struct sockaddr_in6 * ) & cli_conn - > addr . to ) - > sin6_family = AF_INET6 ;
memcpy ( & ( ( struct sockaddr_in6 * ) & cli_conn - > addr . to ) - > sin6_addr , & smp - > data . u . ipv6 , sizeof ( struct in6_addr ) ) ;
2016-10-21 16:07:45 +03:00
( ( struct sockaddr_in6 * ) & cli_conn - > addr . to ) - > sin6_port = port ;
2016-05-25 03:34:07 +03:00
}
cli_conn - > flags | = CO_FL_ADDR_TO_SET ;
}
}
return ACT_RET_CONT ;
}
2016-05-25 02:51:35 +03:00
/*
2016-10-21 16:07:45 +03:00
* Execute the " set-src-port " action . May be called from { tcp , http } request .
* We must test the sin_family before setting the port . If the address family
* is neither AF_INET nor AF_INET6 , the address is forced to AF_INET " 0.0.0.0 "
* and the port is assigned .
2016-05-25 02:51:35 +03:00
*/
enum act_return tcp_action_req_set_src_port ( struct act_rule * rule , struct proxy * px ,
struct session * sess , struct stream * s , int flags )
{
struct connection * cli_conn ;
if ( ( cli_conn = objt_conn ( sess - > origin ) ) & & conn_ctrl_ready ( cli_conn ) ) {
struct sample * smp ;
conn_get_from_addr ( cli_conn ) ;
smp = sample_fetch_as_type ( px , sess , s , SMP_OPT_DIR_REQ | SMP_OPT_FINAL , rule - > arg . expr , SMP_T_SINT ) ;
if ( smp ) {
2016-10-21 16:07:45 +03:00
if ( cli_conn - > addr . from . ss_family = = AF_INET6 ) {
2016-05-25 02:51:35 +03:00
( ( struct sockaddr_in6 * ) & cli_conn - > addr . from ) - > sin6_port = htons ( smp - > data . u . sint ) ;
2016-10-21 16:07:45 +03:00
} else {
if ( cli_conn - > addr . from . ss_family ! = AF_INET ) {
cli_conn - > addr . from . ss_family = AF_INET ;
( ( struct sockaddr_in * ) & cli_conn - > addr . from ) - > sin_addr . s_addr = 0 ;
}
( ( struct sockaddr_in * ) & cli_conn - > addr . from ) - > sin_port = htons ( smp - > data . u . sint ) ;
2016-05-25 02:51:35 +03:00
}
}
}
return ACT_RET_CONT ;
}
2016-05-25 03:34:07 +03:00
/*
2016-10-21 16:07:45 +03:00
* Execute the " set-dst-port " action . May be called from { tcp , http } request .
* We must test the sin_family before setting the port . If the address family
* is neither AF_INET nor AF_INET6 , the address is forced to AF_INET " 0.0.0.0 "
* and the port is assigned .
2016-05-25 03:34:07 +03:00
*/
enum act_return tcp_action_req_set_dst_port ( struct act_rule * rule , struct proxy * px ,
struct session * sess , struct stream * s , int flags )
{
struct connection * cli_conn ;
if ( ( cli_conn = objt_conn ( sess - > origin ) ) & & conn_ctrl_ready ( cli_conn ) ) {
struct sample * smp ;
conn_get_to_addr ( cli_conn ) ;
smp = sample_fetch_as_type ( px , sess , s , SMP_OPT_DIR_REQ | SMP_OPT_FINAL , rule - > arg . expr , SMP_T_SINT ) ;
if ( smp ) {
2017-10-04 00:16:36 +03:00
if ( cli_conn - > addr . to . ss_family = = AF_INET6 ) {
( ( struct sockaddr_in6 * ) & cli_conn - > addr . to ) - > sin6_port = htons ( smp - > data . u . sint ) ;
2016-10-21 16:07:45 +03:00
} else {
2017-10-04 00:16:36 +03:00
if ( cli_conn - > addr . to . ss_family ! = AF_INET ) {
cli_conn - > addr . to . ss_family = AF_INET ;
( ( struct sockaddr_in * ) & cli_conn - > addr . to ) - > sin_addr . s_addr = 0 ;
2016-10-21 16:07:45 +03:00
}
2017-10-04 00:16:36 +03:00
( ( struct sockaddr_in * ) & cli_conn - > addr . to ) - > sin_port = htons ( smp - > data . u . sint ) ;
2016-05-25 03:34:07 +03:00
}
}
}
return ACT_RET_CONT ;
}
2015-08-24 02:43:45 +03:00
/* Executes the "silent-drop" action. May be called from {tcp,http}{request,response} */
static enum act_return tcp_exec_action_silent_drop ( struct act_rule * rule , struct proxy * px , struct session * sess , struct stream * strm , int flags )
{
struct connection * conn = objt_conn ( sess - > origin ) ;
if ( ! conn )
goto out ;
if ( ! conn_ctrl_ready ( conn ) )
goto out ;
# ifdef TCP_QUICKACK
2015-09-29 19:15:01 +03:00
/* drain is needed only to send the quick ACK */
conn_sock_drain ( conn ) ;
2015-08-24 02:43:45 +03:00
/* re-enable quickack if it was disabled to ack all data and avoid
* retransmits from the client that might trigger a real reset .
*/
2017-08-24 15:31:19 +03:00
setsockopt ( conn - > handle . fd , SOL_TCP , TCP_QUICKACK , & one , sizeof ( one ) ) ;
2015-08-24 02:43:45 +03:00
# endif
/* lingering must absolutely be disabled so that we don't send a
* shutdown ( ) , this is critical to the TCP_REPAIR trick . When no stream
* is present , returning with ERR will cause lingering to be disabled .
*/
if ( strm )
strm - > si [ 0 ] . flags | = SI_FL_NOLINGER ;
2015-09-29 19:11:32 +03:00
/* We're on the client-facing side, we must force to disable lingering to
* ensure we will use an RST exclusively and kill any pending data .
*/
2017-08-24 15:31:19 +03:00
fdtab [ conn - > handle . fd ] . linger_risk = 1 ;
2015-09-29 19:11:32 +03:00
2015-08-24 02:43:45 +03:00
# ifdef TCP_REPAIR
2017-08-24 15:31:19 +03:00
if ( setsockopt ( conn - > handle . fd , SOL_TCP , TCP_REPAIR , & one , sizeof ( one ) ) = = 0 ) {
2015-08-24 02:43:45 +03:00
/* socket will be quiet now */
goto out ;
}
# endif
/* either TCP_REPAIR is not defined or it failed (eg: permissions).
* Let ' s fall back on the TTL trick , though it only works for routed
* network and has no effect on local net .
*/
# ifdef IP_TTL
2017-08-24 15:31:19 +03:00
setsockopt ( conn - > handle . fd , SOL_IP , IP_TTL , & one , sizeof ( one ) ) ;
2015-08-24 02:43:45 +03:00
# endif
out :
/* kill the stream if any */
if ( strm ) {
channel_abort ( & strm - > req ) ;
channel_abort ( & strm - > res ) ;
strm - > req . analysers = 0 ;
strm - > res . analysers = 0 ;
2017-06-02 16:33:24 +03:00
HA_ATOMIC_ADD ( & strm - > be - > be_counters . denied_req , 1 ) ;
2015-08-24 02:43:45 +03:00
if ( ! ( strm - > flags & SF_ERR_MASK ) )
strm - > flags | = SF_ERR_PRXCOND ;
if ( ! ( strm - > flags & SF_FINST_MASK ) )
strm - > flags | = SF_FINST_R ;
}
2017-06-02 16:33:24 +03:00
HA_ATOMIC_ADD ( & sess - > fe - > fe_counters . denied_req , 1 ) ;
2015-08-24 02:43:45 +03:00
if ( sess - > listener - > counters )
2017-05-30 16:36:50 +03:00
HA_ATOMIC_ADD ( & sess - > listener - > counters - > denied_req , 1 ) ;
2015-08-24 02:43:45 +03:00
return ACT_RET_STOP ;
}
2016-05-25 03:34:07 +03:00
/* parse "set-{src,dst}[-port]" action */
enum act_parse_ret tcp_parse_set_src_dst ( const char * * args , int * orig_arg , struct proxy * px , struct act_rule * rule , char * * err )
2016-05-25 02:48:42 +03:00
{
int cur_arg ;
struct sample_expr * expr ;
unsigned int where ;
cur_arg = * orig_arg ;
expr = sample_parse_expr ( ( char * * ) args , & cur_arg , px - > conf . args . file , px - > conf . args . line , err , & px - > conf . args ) ;
if ( ! expr )
return ACT_RET_PRS_ERR ;
where = 0 ;
2017-11-24 18:54:05 +03:00
if ( px - > cap & PR_CAP_FE )
2016-05-25 02:48:42 +03:00
where | = SMP_VAL_FE_HRQ_HDR ;
2017-11-24 18:54:05 +03:00
if ( px - > cap & PR_CAP_BE )
2016-05-25 02:48:42 +03:00
where | = SMP_VAL_BE_HRQ_HDR ;
if ( ! ( expr - > fetch - > val & where ) ) {
memprintf ( err ,
" fetch method '%s' extracts information from '%s', none of which is available here " ,
args [ cur_arg - 1 ] , sample_src_names ( expr - > fetch - > use ) ) ;
free ( expr ) ;
return ACT_RET_PRS_ERR ;
}
rule - > arg . expr = expr ;
rule - > action = ACT_CUSTOM ;
if ( ! strcmp ( args [ * orig_arg - 1 ] , " set-src " ) ) {
rule - > action_ptr = tcp_action_req_set_src ;
2016-05-25 02:51:35 +03:00
} else if ( ! strcmp ( args [ * orig_arg - 1 ] , " set-src-port " ) ) {
rule - > action_ptr = tcp_action_req_set_src_port ;
2016-05-25 03:34:07 +03:00
} else if ( ! strcmp ( args [ * orig_arg - 1 ] , " set-dst " ) ) {
rule - > action_ptr = tcp_action_req_set_dst ;
} else if ( ! strcmp ( args [ * orig_arg - 1 ] , " set-dst-port " ) ) {
rule - > action_ptr = tcp_action_req_set_dst_port ;
2016-05-25 02:48:42 +03:00
} else {
return ACT_RET_PRS_ERR ;
}
( * orig_arg ) + + ;
return ACT_RET_PRS_OK ;
}
2015-08-24 02:43:45 +03:00
/* Parse a "silent-drop" action. It takes no argument. It returns ACT_RET_PRS_OK on
* success , ACT_RET_PRS_ERR on error .
*/
static enum act_parse_ret tcp_parse_silent_drop ( const char * * args , int * orig_arg , struct proxy * px ,
struct act_rule * rule , char * * err )
{
rule - > action = ACT_CUSTOM ;
rule - > action_ptr = tcp_exec_action_silent_drop ;
return ACT_RET_PRS_OK ;
}
2010-05-24 22:55:15 +04:00
2012-04-24 01:13:20 +04:00
/************************************************************************/
MEDIUM: samples: move payload-based fetches and ACLs to their own file
The file acl.c is a real mess, it both contains functions to parse and
process ACLs, and some sample extraction functions which act on buffers.
Some other payload analysers were arbitrarily dispatched to proto_tcp.c.
So now we're moving all payload-based fetches and ACLs to payload.c
which is capable of extracting data from buffers and rely on everything
that is protocol-independent. That way we can safely inflate this file
and only use the other ones when some fetches are really specific (eg:
HTTP, SSL, ...).
As a result of this cleanup, the following new sample fetches became
available even if they're not really useful :
always_false, always_true, rep_ssl_hello_type, rdp_cookie_cnt,
req_len, req_ssl_hello_type, req_ssl_sni, req_ssl_ver, wait_end
The function 'acl_fetch_nothing' was wrong and never used anywhere so it
was removed.
The "rdp_cookie" sample fetch used to have a mandatory argument while it
was optional in ACLs, which are supposed to iterate over RDP cookies. So
we're making it optional as a fetch too, and it will return the first one.
2013-01-08 00:59:07 +04:00
/* All supported sample fetch functions must be declared here */
2012-04-24 01:13:20 +04:00
/************************************************************************/
2012-04-25 19:31:42 +04:00
/* fetch the connection's source IPv4/IPv6 address */
2015-07-24 10:12:15 +03:00
int smp_fetch_src ( const struct arg * args , struct sample * smp , const char * kw , void * private )
2010-05-24 22:55:15 +04:00
{
2015-05-11 16:20:49 +03:00
struct connection * cli_conn = objt_conn ( smp - > sess - > origin ) ;
2013-10-01 12:45:07 +04:00
if ( ! cli_conn )
return 0 ;
switch ( cli_conn - > addr . from . ss_family ) {
2011-12-16 20:49:52 +04:00
case AF_INET :
2015-08-19 10:07:19 +03:00
smp - > data . u . ipv4 = ( ( struct sockaddr_in * ) & cli_conn - > addr . from ) - > sin_addr ;
2015-08-19 10:00:18 +03:00
smp - > data . type = SMP_T_IPV4 ;
2011-12-16 20:49:52 +04:00
break ;
case AF_INET6 :
2015-08-19 10:07:19 +03:00
smp - > data . u . ipv6 = ( ( struct sockaddr_in6 * ) & cli_conn - > addr . from ) - > sin6_addr ;
2015-08-19 10:00:18 +03:00
smp - > data . type = SMP_T_IPV6 ;
2011-12-16 20:49:52 +04:00
break ;
default :
2010-10-22 19:14:01 +04:00
return 0 ;
2011-12-16 20:49:52 +04:00
}
2010-10-22 19:14:01 +04:00
2012-04-23 18:16:37 +04:00
smp - > flags = 0 ;
2010-05-24 22:55:15 +04:00
return 1 ;
}
2011-12-16 20:06:15 +04:00
/* set temp integer to the connection's source port */
2010-05-24 22:55:15 +04:00
static int
2015-05-11 16:42:45 +03:00
smp_fetch_sport ( const struct arg * args , struct sample * smp , const char * k , void * private )
2010-05-24 22:55:15 +04:00
{
2015-05-11 16:20:49 +03:00
struct connection * cli_conn = objt_conn ( smp - > sess - > origin ) ;
2013-10-01 12:45:07 +04:00
if ( ! cli_conn )
return 0 ;
2015-08-19 10:00:18 +03:00
smp - > data . type = SMP_T_SINT ;
2015-08-19 10:07:19 +03:00
if ( ! ( smp - > data . u . sint = get_host_port ( & cli_conn - > addr . from ) ) )
2010-10-22 19:14:01 +04:00
return 0 ;
2012-04-23 18:16:37 +04:00
smp - > flags = 0 ;
2010-05-24 22:55:15 +04:00
return 1 ;
}
2012-04-25 19:31:42 +04:00
/* fetch the connection's destination IPv4/IPv6 address */
2010-05-24 22:55:15 +04:00
static int
2015-05-11 16:42:45 +03:00
smp_fetch_dst ( const struct arg * args , struct sample * smp , const char * kw , void * private )
2010-05-24 22:55:15 +04:00
{
2015-05-11 16:20:49 +03:00
struct connection * cli_conn = objt_conn ( smp - > sess - > origin ) ;
2010-05-24 22:55:15 +04:00
2013-10-01 12:45:07 +04:00
if ( ! cli_conn )
return 0 ;
conn_get_to_addr ( cli_conn ) ;
switch ( cli_conn - > addr . to . ss_family ) {
2011-12-16 20:49:52 +04:00
case AF_INET :
2015-08-19 10:07:19 +03:00
smp - > data . u . ipv4 = ( ( struct sockaddr_in * ) & cli_conn - > addr . to ) - > sin_addr ;
2015-08-19 10:00:18 +03:00
smp - > data . type = SMP_T_IPV4 ;
2011-12-16 20:49:52 +04:00
break ;
case AF_INET6 :
2015-08-19 10:07:19 +03:00
smp - > data . u . ipv6 = ( ( struct sockaddr_in6 * ) & cli_conn - > addr . to ) - > sin6_addr ;
2015-08-19 10:00:18 +03:00
smp - > data . type = SMP_T_IPV6 ;
2011-12-16 20:49:52 +04:00
break ;
default :
2010-10-22 19:14:01 +04:00
return 0 ;
2011-12-16 20:49:52 +04:00
}
2010-10-22 19:14:01 +04:00
2012-04-23 18:16:37 +04:00
smp - > flags = 0 ;
2010-05-24 22:55:15 +04:00
return 1 ;
}
MINOR: tcp: add dst_is_local and src_is_local
It is sometimes needed in application server environments to easily tell
if a source is local to the machine or a remote one, without necessarily
knowing all the local addresses (dhcp, vrrp, etc). Similarly in transparent
proxy configurations it is sometimes desired to tell the difference between
local and remote destination addresses.
This patch adds two new sample fetch functions for this :
dst_is_local : boolean
Returns true if the destination address of the incoming connection is local
to the system, or false if the address doesn't exist on the system, meaning
that it was intercepted in transparent mode. It can be useful to apply
certain rules by default to forwarded traffic and other rules to the traffic
targeting the real address of the machine. For example the stats page could
be delivered only on this address, or SSH access could be locally redirected.
Please note that the check involves a few system calls, so it's better to do
it only once per connection.
src_is_local : boolean
Returns true if the source address of the incoming connection is local to the
system, or false if the address doesn't exist on the system, meaning that it
comes from a remote machine. Note that UNIX addresses are considered local.
It can be useful to apply certain access restrictions based on where the
client comes from (eg: require auth or https for remote machines). Please
note that the check involves a few system calls, so it's better to do it only
once per connection.
2016-08-09 17:46:18 +03:00
/* check if the destination address of the front connection is local to the
* system or if it was intercepted .
*/
int smp_fetch_dst_is_local ( const struct arg * args , struct sample * smp , const char * kw , void * private )
{
struct connection * conn = objt_conn ( smp - > sess - > origin ) ;
struct listener * li = smp - > sess - > listener ;
if ( ! conn )
return 0 ;
conn_get_to_addr ( conn ) ;
if ( ! ( conn - > flags & CO_FL_ADDR_TO_SET ) )
return 0 ;
smp - > data . type = SMP_T_BOOL ;
smp - > flags = 0 ;
smp - > data . u . sint = addr_is_local ( li - > netns , & conn - > addr . to ) ;
return smp - > data . u . sint > = 0 ;
}
/* check if the source address of the front connection is local to the system
* or not .
*/
int smp_fetch_src_is_local ( const struct arg * args , struct sample * smp , const char * kw , void * private )
{
struct connection * conn = objt_conn ( smp - > sess - > origin ) ;
struct listener * li = smp - > sess - > listener ;
if ( ! conn )
return 0 ;
conn_get_from_addr ( conn ) ;
if ( ! ( conn - > flags & CO_FL_ADDR_FROM_SET ) )
return 0 ;
smp - > data . type = SMP_T_BOOL ;
smp - > flags = 0 ;
smp - > data . u . sint = addr_is_local ( li - > netns , & conn - > addr . from ) ;
return smp - > data . u . sint > = 0 ;
}
2011-12-16 20:06:15 +04:00
/* set temp integer to the frontend connexion's destination port */
2010-05-24 22:55:15 +04:00
static int
2015-05-11 16:42:45 +03:00
smp_fetch_dport ( const struct arg * args , struct sample * smp , const char * kw , void * private )
2010-05-24 22:55:15 +04:00
{
2015-05-11 16:20:49 +03:00
struct connection * cli_conn = objt_conn ( smp - > sess - > origin ) ;
2013-10-01 12:45:07 +04:00
if ( ! cli_conn )
return 0 ;
conn_get_to_addr ( cli_conn ) ;
2010-05-24 22:55:15 +04:00
2015-08-19 10:00:18 +03:00
smp - > data . type = SMP_T_SINT ;
2015-08-19 10:07:19 +03:00
if ( ! ( smp - > data . u . sint = get_host_port ( & cli_conn - > addr . to ) ) )
2010-10-22 19:14:01 +04:00
return 0 ;
2012-04-23 18:16:37 +04:00
smp - > flags = 0 ;
2010-05-24 22:55:15 +04:00
return 1 ;
}
2016-07-24 21:16:50 +03:00
# ifdef TCP_INFO
/* Returns some tcp_info data is its avalaible. "dir" must be set to 0 if
* the client connection is require , otherwise it is set to 1. " val " represents
* the required value . Use 0 for rtt and 1 for rttavg . " unit " is the expected unit
* by default , the rtt is in us . Id " unit " is set to 0 , the unit is us , if it is
* set to 1 , the untis are milliseconds .
* If the function fails it returns 0 , otherwise it returns 1 and " result " is filled .
*/
static inline int get_tcp_info ( const struct arg * args , struct sample * smp ,
int dir , int val )
{
struct connection * conn ;
struct tcp_info info ;
socklen_t optlen ;
/* strm can be null. */
if ( ! smp - > strm )
return 0 ;
/* get the object associated with the stream interface.The
* object can be other thing than a connection . For example ,
* it be a appctx . */
2017-09-13 19:30:23 +03:00
conn = cs_conn ( objt_cs ( smp - > strm - > si [ dir ] . end ) ) ;
2016-07-24 21:16:50 +03:00
if ( ! conn )
return 0 ;
2017-09-13 19:30:23 +03:00
/* The fd may not be available for the tcp_info struct, and the
2016-07-24 21:16:50 +03:00
syscal can fail . */
optlen = sizeof ( info ) ;
2017-08-24 15:31:19 +03:00
if ( getsockopt ( conn - > handle . fd , SOL_TCP , TCP_INFO , & info , & optlen ) = = - 1 )
2016-07-24 21:16:50 +03:00
return 0 ;
/* extract the value. */
smp - > data . type = SMP_T_SINT ;
switch ( val ) {
2016-08-10 17:06:44 +03:00
case 0 : smp - > data . u . sint = info . tcpi_rtt ; break ;
case 1 : smp - > data . u . sint = info . tcpi_rttvar ; break ;
# if defined(__linux__)
/* these ones are common to all Linux versions */
case 2 : smp - > data . u . sint = info . tcpi_unacked ; break ;
case 3 : smp - > data . u . sint = info . tcpi_sacked ; break ;
case 4 : smp - > data . u . sint = info . tcpi_lost ; break ;
case 5 : smp - > data . u . sint = info . tcpi_retrans ; break ;
case 6 : smp - > data . u . sint = info . tcpi_fackets ; break ;
case 7 : smp - > data . u . sint = info . tcpi_reordering ; break ;
# elif defined(__FreeBSD__) || defined(__NetBSD__)
/* the ones are found on FreeBSD and NetBSD featuring TCP_INFO */
case 2 : smp - > data . u . sint = info . __tcpi_unacked ; break ;
case 3 : smp - > data . u . sint = info . __tcpi_sacked ; break ;
case 4 : smp - > data . u . sint = info . __tcpi_lost ; break ;
case 5 : smp - > data . u . sint = info . __tcpi_retrans ; break ;
case 6 : smp - > data . u . sint = info . __tcpi_fackets ; break ;
case 7 : smp - > data . u . sint = info . __tcpi_reordering ; break ;
# endif
2016-07-24 21:16:50 +03:00
default : return 0 ;
}
/* Convert the value as expected. */
if ( args ) {
if ( args [ 0 ] . type = = ARGT_STR ) {
if ( strcmp ( args [ 0 ] . data . str . str , " us " ) = = 0 ) {
/* Do nothing. */
} else if ( strcmp ( args [ 0 ] . data . str . str , " ms " ) = = 0 ) {
smp - > data . u . sint = ( smp - > data . u . sint + 500 ) / 1000 ;
} else
return 0 ;
} else if ( args [ 0 ] . type = = ARGT_STOP ) {
smp - > data . u . sint = ( smp - > data . u . sint + 500 ) / 1000 ;
} else
return 0 ;
}
return 1 ;
}
/* Fetches the mean rtt of the client connection: value 0 of get_tcp_info()
 * for direction 0. Returns 1 when it could be retrieved, otherwise 0.
 */
static int
smp_fetch_fc_rtt(const struct arg *args, struct sample *smp, const char *kw, void *private)
{
	return get_tcp_info(args, smp, 0, 0) ? 1 : 0;
}
/* Fetches the variance of the mean rtt of the client connection: value 1 of
 * get_tcp_info() for direction 0. Returns 1 when it could be retrieved,
 * otherwise 0.
 */
static int
smp_fetch_fc_rttvar(const struct arg *args, struct sample *smp, const char *kw, void *private)
{
	return get_tcp_info(args, smp, 0, 1) ? 1 : 0;
}
2016-08-10 17:06:44 +03:00
# if defined(__linux__) || defined(__FreeBSD__) || defined(__NetBSD__)
/* Fetches the unacked counter of the client connection: value 2 of
 * get_tcp_info() for direction 0. Returns 1 when it could be retrieved,
 * otherwise 0.
 */
static int
smp_fetch_fc_unacked(const struct arg *args, struct sample *smp, const char *kw, void *private)
{
	return get_tcp_info(args, smp, 0, 2) ? 1 : 0;
}
/* Fetches the sacked counter of the client connection: value 3 of
 * get_tcp_info() for direction 0. Returns 1 when it could be retrieved,
 * otherwise 0.
 */
static int
smp_fetch_fc_sacked(const struct arg *args, struct sample *smp, const char *kw, void *private)
{
	return get_tcp_info(args, smp, 0, 3) ? 1 : 0;
}
/* Fetches the lost counter of the client connection: value 4 of
 * get_tcp_info() for direction 0. Returns 1 when it could be retrieved,
 * otherwise 0.
 */
static int
smp_fetch_fc_lost(const struct arg *args, struct sample *smp, const char *kw, void *private)
{
	return get_tcp_info(args, smp, 0, 4) ? 1 : 0;
}
/* Fetches the retrans counter of the client connection: value 5 of
 * get_tcp_info() for direction 0. Returns 1 when it could be retrieved,
 * otherwise 0.
 */
static int
smp_fetch_fc_retrans(const struct arg *args, struct sample *smp, const char *kw, void *private)
{
	return get_tcp_info(args, smp, 0, 5) ? 1 : 0;
}
/* Fetches the fackets counter of the client connection: value 6 of
 * get_tcp_info() for direction 0. Returns 1 when it could be retrieved,
 * otherwise 0.
 */
static int
smp_fetch_fc_fackets(const struct arg *args, struct sample *smp, const char *kw, void *private)
{
	return get_tcp_info(args, smp, 0, 6) ? 1 : 0;
}
/* Fetches the reordering counter of the client connection: value 7 of
 * get_tcp_info() for direction 0. Returns 1 when it could be retrieved,
 * otherwise 0.
 */
static int
smp_fetch_fc_reordering(const struct arg *args, struct sample *smp, const char *kw, void *private)
{
	return get_tcp_info(args, smp, 0, 7) ? 1 : 0;
}
# endif // linux || freebsd || netbsd
# endif // TCP_INFO
2016-07-24 21:16:50 +03:00
2012-11-24 14:55:28 +04:00
# ifdef IPV6_V6ONLY
2012-11-24 18:07:23 +04:00
/* parse the "v4v6" bind keyword */
static int bind_parse_v4v6 ( char * * args , int cur_arg , struct proxy * px , struct bind_conf * conf , char * * err )
{
struct listener * l ;
list_for_each_entry ( l , & conf - > listeners , by_bind ) {
if ( l - > addr . ss_family = = AF_INET6 )
l - > options | = LI_O_V4V6 ;
}
return 0 ;
}
2012-11-24 14:55:28 +04:00
/* parse the "v6only" bind keyword */
static int bind_parse_v6only ( char * * args , int cur_arg , struct proxy * px , struct bind_conf * conf , char * * err )
{
struct listener * l ;
list_for_each_entry ( l , & conf - > listeners , by_bind ) {
if ( l - > addr . ss_family = = AF_INET6 )
l - > options | = LI_O_V6ONLY ;
}
return 0 ;
}
# endif
2013-05-09 00:49:23 +04:00
# ifdef CONFIG_HAP_TRANSPARENT
2012-09-13 01:27:21 +04:00
/* parse the "transparent" bind keyword */
2012-09-20 18:48:07 +04:00
static int bind_parse_transparent ( char * * args , int cur_arg , struct proxy * px , struct bind_conf * conf , char * * err )
2012-09-13 01:27:21 +04:00
{
struct listener * l ;
2012-09-20 18:48:07 +04:00
list_for_each_entry ( l , & conf - > listeners , by_bind ) {
if ( l - > addr . ss_family = = AF_INET | | l - > addr . ss_family = = AF_INET6 )
l - > options | = LI_O_FOREIGN ;
2012-09-13 01:27:21 +04:00
}
return 0 ;
}
# endif
# ifdef TCP_DEFER_ACCEPT
/* parse the "defer-accept" bind keyword */
2012-09-20 18:48:07 +04:00
static int bind_parse_defer_accept ( char * * args , int cur_arg , struct proxy * px , struct bind_conf * conf , char * * err )
2012-09-13 01:27:21 +04:00
{
struct listener * l ;
2012-09-20 18:48:07 +04:00
list_for_each_entry ( l , & conf - > listeners , by_bind ) {
if ( l - > addr . ss_family = = AF_INET | | l - > addr . ss_family = = AF_INET6 )
l - > options | = LI_O_DEF_ACCEPT ;
2012-09-13 01:27:21 +04:00
}
return 0 ;
}
# endif
2012-10-05 18:21:00 +04:00
# ifdef TCP_FASTOPEN
2013-02-14 02:35:39 +04:00
/* parse the "tfo" bind keyword */
2012-10-05 18:21:00 +04:00
static int bind_parse_tfo ( char * * args , int cur_arg , struct proxy * px , struct bind_conf * conf , char * * err )
{
struct listener * l ;
list_for_each_entry ( l , & conf - > listeners , by_bind ) {
if ( l - > addr . ss_family = = AF_INET | | l - > addr . ss_family = = AF_INET6 )
l - > options | = LI_O_TCP_FO ;
}
return 0 ;
}
# endif
2012-09-13 01:27:21 +04:00
# ifdef TCP_MAXSEG
/* parse the "mss" bind keyword */
2012-09-20 18:48:07 +04:00
static int bind_parse_mss ( char * * args , int cur_arg , struct proxy * px , struct bind_conf * conf , char * * err )
2012-09-13 01:27:21 +04:00
{
struct listener * l ;
int mss ;
if ( ! * args [ cur_arg + 1 ] ) {
2012-09-20 21:43:14 +04:00
memprintf ( err , " '%s' : missing MSS value " , args [ cur_arg ] ) ;
2012-09-13 01:27:21 +04:00
return ERR_ALERT | ERR_FATAL ;
}
mss = atoi ( args [ cur_arg + 1 ] ) ;
if ( ! mss | | abs ( mss ) > 65535 ) {
2012-09-20 21:43:14 +04:00
memprintf ( err , " '%s' : expects an MSS with and absolute value between 1 and 65535 " , args [ cur_arg ] ) ;
2012-09-13 01:27:21 +04:00
return ERR_ALERT | ERR_FATAL ;
}
2012-09-20 18:48:07 +04:00
list_for_each_entry ( l , & conf - > listeners , by_bind ) {
if ( l - > addr . ss_family = = AF_INET | | l - > addr . ss_family = = AF_INET6 )
l - > maxseg = mss ;
}
2012-09-13 01:27:21 +04:00
return 0 ;
}
# endif
2015-02-04 02:45:58 +03:00
# ifdef TCP_USER_TIMEOUT
/* parse the "tcp-ut" bind keyword */
static int bind_parse_tcp_ut ( char * * args , int cur_arg , struct proxy * px , struct bind_conf * conf , char * * err )
{
const char * ptr = NULL ;
struct listener * l ;
unsigned int timeout ;
if ( ! * args [ cur_arg + 1 ] ) {
memprintf ( err , " '%s' : missing TCP User Timeout value " , args [ cur_arg ] ) ;
return ERR_ALERT | ERR_FATAL ;
}
ptr = parse_time_err ( args [ cur_arg + 1 ] , & timeout , TIME_UNIT_MS ) ;
if ( ptr ) {
memprintf ( err , " '%s' : expects a positive delay in milliseconds " , args [ cur_arg ] ) ;
return ERR_ALERT | ERR_FATAL ;
}
list_for_each_entry ( l , & conf - > listeners , by_bind ) {
if ( l - > addr . ss_family = = AF_INET | | l - > addr . ss_family = = AF_INET6 )
l - > tcp_ut = timeout ;
}
return 0 ;
}
# endif
2012-09-13 01:27:21 +04:00
# ifdef SO_BINDTODEVICE
2015-02-04 02:45:58 +03:00
/* parse the "interface" bind keyword */
2012-09-20 18:48:07 +04:00
static int bind_parse_interface ( char * * args , int cur_arg , struct proxy * px , struct bind_conf * conf , char * * err )
2012-09-13 01:27:21 +04:00
{
struct listener * l ;
if ( ! * args [ cur_arg + 1 ] ) {
2012-09-20 21:43:14 +04:00
memprintf ( err , " '%s' : missing interface name " , args [ cur_arg ] ) ;
2012-09-13 01:27:21 +04:00
return ERR_ALERT | ERR_FATAL ;
}
2012-09-20 18:48:07 +04:00
list_for_each_entry ( l , & conf - > listeners , by_bind ) {
if ( l - > addr . ss_family = = AF_INET | | l - > addr . ss_family = = AF_INET6 )
l - > interface = strdup ( args [ cur_arg + 1 ] ) ;
}
2012-09-13 01:27:21 +04:00
return 0 ;
}
# endif
2014-11-17 17:11:45 +03:00
# ifdef CONFIG_HAP_NS
/* parse the "namespace" bind keyword */
static int bind_parse_namespace ( char * * args , int cur_arg , struct proxy * px , struct bind_conf * conf , char * * err )
{
struct listener * l ;
char * namespace = NULL ;
if ( ! * args [ cur_arg + 1 ] ) {
memprintf ( err , " '%s' : missing namespace id " , args [ cur_arg ] ) ;
return ERR_ALERT | ERR_FATAL ;
}
namespace = args [ cur_arg + 1 ] ;
list_for_each_entry ( l , & conf - > listeners , by_bind ) {
l - > netns = netns_store_lookup ( namespace , strlen ( namespace ) ) ;
if ( l - > netns = = NULL )
l - > netns = netns_store_insert ( namespace ) ;
if ( l - > netns = = NULL ) {
2017-11-24 18:50:31 +03:00
ha_alert ( " Cannot open namespace '%s'. \n " , args [ cur_arg + 1 ] ) ;
2014-11-17 17:11:45 +03:00
return ERR_ALERT | ERR_FATAL ;
}
}
return 0 ;
}
# endif
2015-10-13 17:16:41 +03:00
# ifdef TCP_USER_TIMEOUT
/* parse the "tcp-ut" server keyword */
static int srv_parse_tcp_ut ( char * * args , int * cur_arg , struct proxy * px , struct server * newsrv , char * * err )
{
const char * ptr = NULL ;
unsigned int timeout ;
if ( ! * args [ * cur_arg + 1 ] ) {
memprintf ( err , " '%s' : missing TCP User Timeout value " , args [ * cur_arg ] ) ;
return ERR_ALERT | ERR_FATAL ;
}
ptr = parse_time_err ( args [ * cur_arg + 1 ] , & timeout , TIME_UNIT_MS ) ;
if ( ptr ) {
memprintf ( err , " '%s' : expects a positive delay in milliseconds " , args [ * cur_arg ] ) ;
return ERR_ALERT | ERR_FATAL ;
}
if ( newsrv - > addr . ss_family = = AF_INET | | newsrv - > addr . ss_family = = AF_INET6 )
newsrv - > tcp_ut = timeout ;
return 0 ;
}
# endif
MEDIUM: samples: move payload-based fetches and ACLs to their own file
The file acl.c is a real mess, it both contains functions to parse and
process ACLs, and some sample extraction functions which act on buffers.
Some other payload analysers were arbitrarily dispatched to proto_tcp.c.
So now we're moving all payload-based fetches and ACLs to payload.c
which is capable of extracting data from buffers and rely on everything
that is protocol-independent. That way we can safely inflate this file
and only use the other ones when some fetches are really specific (eg:
HTTP, SSL, ...).
As a result of this cleanup, the following new sample fetches became
available even if they're not really useful :
always_false, always_true, rep_ssl_hello_type, rdp_cookie_cnt,
req_len, req_ssl_hello_type, req_ssl_sni, req_ssl_ver, wait_end
The function 'acl_fetch_nothing' was wrong and never used anywhere so it
was removed.
The "rdp_cookie" sample fetch used to have a mandatory argument while it
was optional in ACLs, which are supposed to iterate over RDP cookies. So
we're making it optional as a fetch too, and it will return the first one.
2013-01-08 00:59:07 +04:00
2012-04-25 19:31:42 +04:00
/* Sample fetch keywords provided by this file. The list is handed to
 * sample_register_fetches() from __tcp_protocol_init().
 *
 * Note: must not be declared <const> as its list will be overwritten.
 * Note: a fetch that may return several types must be declared with the
 * lowest common denominator, i.e. the type that can be cast into all the
 * other ones. For instance a v4/v6 address fetch must be declared v4, which
 * is why "dst" and "src" below are declared SMP_T_IPV4.
 */
2013-06-22 01:16:39 +04:00
static struct sample_fetch_kw_list sample_fetch_keywords = { ILH , {
MEDIUM: samples: move payload-based fetches and ACLs to their own file
The file acl.c is a real mess, it both contains functions to parse and
process ACLs, and some sample extraction functions which act on buffers.
Some other payload analysers were arbitrarily dispatched to proto_tcp.c.
So now we're moving all payload-based fetches and ACLs to payload.c
which is capable of extracting data from buffers and rely on everything
that is protocol-independant. That way we can safely inflate this file
and only use the other ones when some fetches are really specific (eg:
HTTP, SSL, ...).
As a result of this cleanup, the following new sample fetches became
available even if they're not really useful :
always_false, always_true, rep_ssl_hello_type, rdp_cookie_cnt,
req_len, req_ssl_hello_type, req_ssl_sni, req_ssl_ver, wait_end
The function 'acl_fetch_nothing' was wrong and never used anywhere so it
was removed.
The "rdp_cookie" sample fetch used to have a mandatory argument while it
was optional in ACLs, which are supposed to iterate over RDP cookies. So
we're making it optional as a fetch too, and it will return the first one.
2013-01-08 00:59:07 +04:00
/* address / port fetches: none of them declares an argument (0) */
{ " dst " , smp_fetch_dst , 0 , NULL , SMP_T_IPV4 , SMP_USE_L4CLI } ,
MINOR: tcp: add dst_is_local and src_is_local
It is sometimes needed in application server environments to easily tell
if a source is local to the machine or a remote one, without necessarily
knowing all the local addresses (dhcp, vrrp, etc). Similarly in transparent
proxy configurations it is sometimes desired to tell the difference between
local and remote destination addresses.
This patch adds two new sample fetch functions for this :
dst_is_local : boolean
Returns true if the destination address of the incoming connection is local
to the system, or false if the address doesn't exist on the system, meaning
that it was intercepted in transparent mode. It can be useful to apply
certain rules by default to forwarded traffic and other rules to the traffic
targetting the real address of the machine. For example the stats page could
be delivered only on this address, or SSH access could be locally redirected.
Please note that the check involves a few system calls, so it's better to do
it only once per connection.
src_is_local : boolean
Returns true if the source address of the incoming connection is local to the
system, or false if the address doesn't exist on the system, meaning that it
comes from a remote machine. Note that UNIX addresses are considered local.
It can be useful to apply certain access restrictions based on where the
client comes from (eg: require auth or https for remote machines). Please
note that the check involves a few system calls, so it's better to do it only
once per connection.
2016-08-09 17:46:18 +03:00
{ " dst_is_local " , smp_fetch_dst_is_local , 0 , NULL , SMP_T_BOOL , SMP_USE_L4CLI } ,
2015-07-07 00:43:03 +03:00
{ " dst_port " , smp_fetch_dport , 0 , NULL , SMP_T_SINT , SMP_USE_L4CLI } ,
MEDIUM: samples: move payload-based fetches and ACLs to their own file
The file acl.c is a real mess, it both contains functions to parse and
process ACLs, and some sample extraction functions which act on buffers.
Some other payload analysers were arbitrarily dispatched to proto_tcp.c.
So now we're moving all payload-based fetches and ACLs to payload.c
which is capable of extracting data from buffers and rely on everything
that is protocol-independant. That way we can safely inflate this file
and only use the other ones when some fetches are really specific (eg:
HTTP, SSL, ...).
As a result of this cleanup, the following new sample fetches became
available even if they're not really useful :
always_false, always_true, rep_ssl_hello_type, rdp_cookie_cnt,
req_len, req_ssl_hello_type, req_ssl_sni, req_ssl_ver, wait_end
The function 'acl_fetch_nothing' was wrong and never used anywhere so it
was removed.
The "rdp_cookie" sample fetch used to have a mandatory argument while it
was optional in ACLs, which are supposed to iterate over RDP cookies. So
we're making it optional as a fetch too, and it will return the first one.
2013-01-08 00:59:07 +04:00
{ " src " , smp_fetch_src , 0 , NULL , SMP_T_IPV4 , SMP_USE_L4CLI } ,
MINOR: tcp: add dst_is_local and src_is_local
It is sometimes needed in application server environments to easily tell
if a source is local to the machine or a remote one, without necessarily
knowing all the local addresses (dhcp, vrrp, etc). Similarly in transparent
proxy configurations it is sometimes desired to tell the difference between
local and remote destination addresses.
This patch adds two new sample fetch functions for this :
dst_is_local : boolean
Returns true if the destination address of the incoming connection is local
to the system, or false if the address doesn't exist on the system, meaning
that it was intercepted in transparent mode. It can be useful to apply
certain rules by default to forwarded traffic and other rules to the traffic
targetting the real address of the machine. For example the stats page could
be delivered only on this address, or SSH access could be locally redirected.
Please note that the check involves a few system calls, so it's better to do
it only once per connection.
src_is_local : boolean
Returns true if the source address of the incoming connection is local to the
system, or false if the address doesn't exist on the system, meaning that it
comes from a remote machine. Note that UNIX addresses are considered local.
It can be useful to apply certain access restrictions based on where the
client comes from (eg: require auth or https for remote machines). Please
note that the check involves a few system calls, so it's better to do it only
once per connection.
2016-08-09 17:46:18 +03:00
{ " src_is_local " , smp_fetch_src_is_local , 0 , NULL , SMP_T_BOOL , SMP_USE_L4CLI } ,
2015-07-07 00:43:03 +03:00
{ " src_port " , smp_fetch_sport , 0 , NULL , SMP_T_SINT , SMP_USE_L4CLI } ,
2016-07-24 21:16:50 +03:00
/* The fc_* fetches are only built when TCP_INFO is defined; all of them are
 * declared with ARG1(0, STR) and an SMP_T_SINT result. Everything except
 * fc_rtt and fc_rttvar is further restricted to Linux, FreeBSD and NetBSD.
 */
# ifdef TCP_INFO
2016-08-10 17:06:44 +03:00
{ " fc_rtt " , smp_fetch_fc_rtt , ARG1 ( 0 , STR ) , NULL , SMP_T_SINT , SMP_USE_L4CLI } ,
{ " fc_rttvar " , smp_fetch_fc_rttvar , ARG1 ( 0 , STR ) , NULL , SMP_T_SINT , SMP_USE_L4CLI } ,
# if defined(__linux__) || defined(__FreeBSD__) || defined(__NetBSD__)
{ " fc_unacked " , smp_fetch_fc_unacked , ARG1 ( 0 , STR ) , NULL , SMP_T_SINT , SMP_USE_L4CLI } ,
{ " fc_sacked " , smp_fetch_fc_sacked , ARG1 ( 0 , STR ) , NULL , SMP_T_SINT , SMP_USE_L4CLI } ,
{ " fc_retrans " , smp_fetch_fc_retrans , ARG1 ( 0 , STR ) , NULL , SMP_T_SINT , SMP_USE_L4CLI } ,
{ " fc_fackets " , smp_fetch_fc_fackets , ARG1 ( 0 , STR ) , NULL , SMP_T_SINT , SMP_USE_L4CLI } ,
{ " fc_lost " , smp_fetch_fc_lost , ARG1 ( 0 , STR ) , NULL , SMP_T_SINT , SMP_USE_L4CLI } ,
{ " fc_reordering " , smp_fetch_fc_reordering , ARG1 ( 0 , STR ) , NULL , SMP_T_SINT , SMP_USE_L4CLI } ,
# endif // linux || freebsd || netbsd
# endif // TCP_INFO
MEDIUM: samples: move payload-based fetches and ACLs to their own file
The file acl.c is a real mess, it both contains functions to parse and
process ACLs, and some sample extraction functions which act on buffers.
Some other payload analysers were arbitrarily dispatched to proto_tcp.c.
So now we're moving all payload-based fetches and ACLs to payload.c
which is capable of extracting data from buffers and rely on everything
that is protocol-independant. That way we can safely inflate this file
and only use the other ones when some fetches are really specific (eg:
HTTP, SSL, ...).
As a result of this cleanup, the following new sample fetches became
available even if they're not really useful :
always_false, always_true, rep_ssl_hello_type, rdp_cookie_cnt,
req_len, req_ssl_hello_type, req_ssl_sni, req_ssl_ver, wait_end
The function 'acl_fetch_nothing' was wrong and never used anywhere so it
was removed.
The "rdp_cookie" sample fetch used to have a mandatory argument while it
was optional in ACLs, which are supposed to iterate over RDP cookies. So
we're making it optional as a fetch too, and it will return the first one.
2013-01-08 00:59:07 +04:00
/* an empty entry terminates the list */
{ /* END */ } ,
2010-05-24 22:55:15 +04:00
} } ;
2012-09-13 01:27:21 +04:00
/************************************************************************/
/* All supported bind keywords must be declared here. */
/************************************************************************/
/* Note: must not be declared <const> as its list will be overwritten.
* Please take care of keeping this list alphabetically sorted , doing so helps
* all code contributors .
* Optional keywords are also declared with a NULL - > parse ( ) function so that
* the config parser can report an appropriate error when a known keyword was
* not enabled .
*/
2012-09-18 20:24:39 +04:00
static struct bind_kw_list bind_kws = { " TCP " , { } , {
2012-09-13 01:27:21 +04:00
# ifdef TCP_DEFER_ACCEPT
{ " defer-accept " , bind_parse_defer_accept , 0 } , /* wait for some data for 1 second max before doing accept */
# endif
# ifdef SO_BINDTODEVICE
{ " interface " , bind_parse_interface , 1 } , /* specifically bind to this interface */
# endif
# ifdef TCP_MAXSEG
{ " mss " , bind_parse_mss , 1 } , /* set MSS of listening socket */
# endif
2015-02-04 02:45:58 +03:00
# ifdef TCP_USER_TIMEOUT
{ " tcp-ut " , bind_parse_tcp_ut , 1 } , /* set User Timeout on listening socket */
# endif
2012-10-05 18:21:00 +04:00
# ifdef TCP_FASTOPEN
{ " tfo " , bind_parse_tfo , 0 } , /* enable TCP_FASTOPEN of listening socket */
# endif
2013-05-09 00:49:23 +04:00
# ifdef CONFIG_HAP_TRANSPARENT
2012-09-13 01:27:21 +04:00
{ " transparent " , bind_parse_transparent , 0 } , /* transparently bind to the specified addresses */
2012-11-24 14:55:28 +04:00
# endif
# ifdef IPV6_V6ONLY
2012-11-24 18:07:23 +04:00
{ " v4v6 " , bind_parse_v4v6 , 0 } , /* force socket to bind to IPv4+IPv6 */
2012-11-24 14:55:28 +04:00
{ " v6only " , bind_parse_v6only , 0 } , /* force socket to bind to IPv6 only */
2014-11-17 17:11:45 +03:00
# endif
# ifdef CONFIG_HAP_NS
{ " namespace " , bind_parse_namespace , 1 } ,
2012-09-13 01:27:21 +04:00
# endif
/* the versions with the NULL parse function*/
{ " defer-accept " , NULL , 0 } ,
{ " interface " , NULL , 1 } ,
{ " mss " , NULL , 1 } ,
{ " transparent " , NULL , 0 } ,
2012-11-24 18:07:23 +04:00
{ " v4v6 " , NULL , 0 } ,
2012-11-24 14:55:28 +04:00
{ " v6only " , NULL , 0 } ,
2012-09-13 01:27:21 +04:00
{ NULL , NULL , 0 } ,
} } ;
2015-10-13 17:16:41 +03:00
/* Server keywords provided by this file, handed to srv_register_keywords()
 * from __tcp_protocol_init(). "tcp-ut" is the only keyword and is only
 * present when TCP_USER_TIMEOUT is defined; otherwise the list holds nothing
 * but its terminating entry.
 */
static struct srv_kw_list srv_kws = { " TCP " , { } , {
# ifdef TCP_USER_TIMEOUT
2017-03-15 18:36:09 +03:00
{ " tcp-ut " , srv_parse_tcp_ut , 1 , 1 } , /* set TCP user timeout on server */
2015-10-13 17:16:41 +03:00
# endif
/* a NULL keyword terminates the list */
{ NULL , NULL , 0 } ,
} } ;
2015-08-24 02:43:45 +03:00
/* Action keywords handed to tcp_req_conn_keywords_register() from
 * __tcp_protocol_init(). All four "set-*" keywords share the same parser,
 * tcp_parse_set_src_dst().
 */
static struct action_kw_list tcp_req_conn_actions = { ILH , {
2016-05-25 02:48:42 +03:00
{ " silent-drop " , tcp_parse_silent_drop } ,
2016-05-25 03:34:07 +03:00
{ " set-src " , tcp_parse_set_src_dst } ,
{ " set-src-port " , tcp_parse_set_src_dst } ,
{ " set-dst " , tcp_parse_set_src_dst } ,
{ " set-dst-port " , tcp_parse_set_src_dst } ,
2015-08-24 02:43:45 +03:00
/* an empty entry terminates the list */
{ /* END */ }
} } ;
2016-10-21 17:37:51 +03:00
/* Action keywords handed to tcp_req_sess_keywords_register() from
 * __tcp_protocol_init(). All four "set-*" keywords share the same parser,
 * tcp_parse_set_src_dst().
 */
static struct action_kw_list tcp_req_sess_actions = { ILH , {
{ " silent-drop " , tcp_parse_silent_drop } ,
{ " set-src " , tcp_parse_set_src_dst } ,
{ " set-src-port " , tcp_parse_set_src_dst } ,
{ " set-dst " , tcp_parse_set_src_dst } ,
{ " set-dst-port " , tcp_parse_set_src_dst } ,
/* an empty entry terminates the list */
{ /* END */ }
} } ;
2015-08-24 02:43:45 +03:00
/* Action keywords handed to tcp_req_cont_keywords_register() from
 * __tcp_protocol_init(); only "silent-drop" is offered here.
 */
static struct action_kw_list tcp_req_cont_actions = { ILH , {
{ " silent-drop " , tcp_parse_silent_drop } ,
/* an empty entry terminates the list */
{ /* END */ }
} } ;
/* Action keywords handed to tcp_res_cont_keywords_register() from
 * __tcp_protocol_init(); only "silent-drop" is offered here.
 */
static struct action_kw_list tcp_res_cont_actions = { ILH , {
{ " silent-drop " , tcp_parse_silent_drop } ,
/* an empty entry terminates the list */
{ /* END */ }
} } ;
/* Action keywords handed to http_req_keywords_register() from
 * __tcp_protocol_init(). All four "set-*" keywords share the same parser,
 * tcp_parse_set_src_dst().
 */
static struct action_kw_list http_req_actions = { ILH , {
2016-05-25 02:48:42 +03:00
{ " silent-drop " , tcp_parse_silent_drop } ,
2016-05-25 03:34:07 +03:00
{ " set-src " , tcp_parse_set_src_dst } ,
{ " set-src-port " , tcp_parse_set_src_dst } ,
{ " set-dst " , tcp_parse_set_src_dst } ,
{ " set-dst-port " , tcp_parse_set_src_dst } ,
2015-08-24 02:43:45 +03:00
/* an empty entry terminates the list */
{ /* END */ }
} } ;
/* Action keywords handed to http_res_keywords_register() from
 * __tcp_protocol_init(); only "silent-drop" is offered here.
 */
static struct action_kw_list http_res_actions = { ILH , {
{ " silent-drop " , tcp_parse_silent_drop } ,
/* an empty entry terminates the list */
{ /* END */ }
} } ;
2007-10-29 03:09:36 +03:00
/* Module initializer. The constructor attribute makes the compiler arrange
 * for this function to run automatically at program start-up, before main().
 * It registers, in the order written below, the two TCP protocol descriptors,
 * the sample fetch keywords, the bind and server keywords, the six action
 * keyword lists, and finally a build-options string.
 */
__attribute__ ( ( constructor ) )
static void __tcp_protocol_init ( void )
{
/* protocol descriptors for TCP over IPv4 and over IPv6 */
protocol_register ( & proto_tcpv4 ) ;
protocol_register ( & proto_tcpv6 ) ;
2012-04-27 23:37:17 +04:00
sample_register_fetches ( & sample_fetch_keywords ) ;
2012-09-13 01:27:21 +04:00
bind_register_keywords ( & bind_kws ) ;
2015-10-13 17:16:41 +03:00
srv_register_keywords ( & srv_kws ) ;
2015-08-24 02:43:45 +03:00
/* one action keyword list per rule set */
tcp_req_conn_keywords_register ( & tcp_req_conn_actions ) ;
2016-10-21 17:37:51 +03:00
tcp_req_sess_keywords_register ( & tcp_req_sess_actions ) ;
2015-08-24 02:43:45 +03:00
tcp_req_cont_keywords_register ( & tcp_req_cont_actions ) ;
tcp_res_cont_keywords_register ( & tcp_res_cont_actions ) ;
http_req_keywords_register ( & http_req_actions ) ;
http_res_keywords_register ( & http_res_actions ) ;
2016-12-21 20:55:02 +03:00
/* The string below is assembled at compile time from adjacent string
 * literals: one name is appended for each transparent-proxy related socket
 * option macro that is defined on the build platform.
 * NOTE(review): when none of these macros is defined, nothing follows
 * "using:" in the resulting text.
 * NOTE(review): the meaning of the trailing 0 argument is not visible here;
 * confirm against hap_register_build_opts().
 */
hap_register_build_opts ( " Built with transparent proxy support using: "
# if defined(IP_TRANSPARENT)
" IP_TRANSPARENT "
# endif
# if defined(IPV6_TRANSPARENT)
" IPV6_TRANSPARENT "
# endif
# if defined(IP_FREEBIND)
" IP_FREEBIND "
# endif
# if defined(IP_BINDANY)
" IP_BINDANY "
# endif
# if defined(IPV6_BINDANY)
" IPV6_BINDANY "
# endif
# if defined(SO_BINDANY)
" SO_BINDANY "
# endif
" " , 0 ) ;
2007-10-29 03:09:36 +03:00
}
/*
* Local variables :
* c - indent - level : 8
* c - basic - offset : 8
* End :
*/