Merge branch 'tcpsplice'
This commit is contained in:
commit
733fef4add
19
Makefile
19
Makefile
@ -31,6 +31,9 @@ LD = gcc
|
||||
PCREDIR := $(shell pcre-config --prefix 2>/dev/null || :)
|
||||
#PCREDIR=/usr/local
|
||||
|
||||
# This is the directory hosting libtcpsplice.[ah] when USE_TCPSPLICE is set
|
||||
TCPSPLICEDIR :=
|
||||
|
||||
# This is for standard Linux 2.6 with netfilter and epoll()
|
||||
COPTS.linux26 = -DNETFILTER -DENABLE_POLL -DENABLE_EPOLL
|
||||
LIBS.linux26 =
|
||||
@ -117,6 +120,10 @@ endif
|
||||
# do not change this one, enable USE_* variables instead.
|
||||
OPTIONS =
|
||||
|
||||
ifneq ($(USE_TCPSPLICE),)
|
||||
OPTIONS += -DCONFIG_HAP_TCPSPLICE
|
||||
endif
|
||||
|
||||
ifneq ($(USE_CTTPROXY),)
|
||||
OPTIONS += -DCONFIG_HAP_CTTPROXY
|
||||
endif
|
||||
@ -169,8 +176,16 @@ TARGET_OPTS=$(COPTS.$(TARGET))
|
||||
REGEX_OPTS=$(COPTS.$(REGEX))
|
||||
CPU_OPTS=$(COPTS.$(CPU))
|
||||
|
||||
COPTS=-Iinclude $(ADDINC) $(CPU_OPTS) $(TARGET_OPTS) $(REGEX_OPTS) $(SMALL_OPTS) $(DEFINE) $(OPTIONS)
|
||||
LIBS=$(LIBS.$(TARGET)) $(LIBS.$(REGEX)) $(ADDLIB)
|
||||
COPTS = -Iinclude $(CPU_OPTS) $(TARGET_OPTS) $(REGEX_OPTS) $(SMALL_OPTS) $(DEFINE) $(OPTIONS)
|
||||
LIBS=$(LIBS.$(TARGET)) $(LIBS.$(REGEX))
|
||||
|
||||
ifneq ($(USE_TCPSPLICE),)
|
||||
COPTS += -I$(TCPSPLICEDIR)
|
||||
LIBS += -L$(TCPSPLICEDIR) -ltcpsplice
|
||||
endif
|
||||
|
||||
COPTS += $(ADDINC)
|
||||
LIBS += $(ADDLIB)
|
||||
|
||||
CFLAGS = -Wall $(COPTS) $(DEBUG)
|
||||
LDFLAGS = -g
|
||||
|
196
doc/tcp-splicing.txt
Normal file
196
doc/tcp-splicing.txt
Normal file
@ -0,0 +1,196 @@
|
||||
Using Linux TCP Splicing with HAProxy
|
||||
Willy Tarreau <w@1wt.eu>
|
||||
- 2007/01/06 -
|
||||
|
||||
|
||||
Alexandre Cassen has started a project called Linux Layer7 Switching (L7SW),
|
||||
whose goal is to provide kernel services to help userland proxies achieve
|
||||
very high performance. Right now, the project consists of a loadable kernel
|
||||
module providing TCP Splicing under Linux.
|
||||
|
||||
TCP Splicing is a method by which a userland proxy can tell the kernel that
|
||||
it considers it has no added value on the data part of a connection, and that
|
||||
the kernel can perform the transfers itself, thus relieving the proxy from
|
||||
a potentially heavy job. There are two advantages to this method :
|
||||
|
||||
- it reduces the number of process wakeups
|
||||
- it reduces the number of data copies between user-space and kernel buffers
|
||||
|
||||
This method is particularly suited to protocols in which data is sent till
|
||||
the end of the session. This is the case for FTP data for instance, and it
|
||||
is also the case for the BODY part of HTTP/1.0.
|
||||
|
||||
The great news is that haproxy has been designed from the beginning with a
|
||||
clear distinction between the headers and the DATA phase, so it was a child's
|
||||
game to add hooks to Alex's library in it.
|
||||
|
||||
Be careful! Both versions are to be considered BETA software ! Run them on
|
||||
your systems if you want, but do not complain if it crashes twice a day !
|
||||
Anyway, it seems stable on our test machines.
|
||||
|
||||
In order to use TCP Splicing on haproxy, you need :
|
||||
|
||||
- Linux Layer7 Switching code version 0.1.1 : [ http://linux-l7sw.sf.net/ ]
|
||||
- Haproxy version 1.3.5 : [ http://haproxy.1wt.eu/download/1.3/src/ ]
|
||||
|
||||
Then, you must untar both packages in any location, let's assume you'll
|
||||
be using /tmp. First extract l7sw and :
|
||||
|
||||
$ cd /tmp
|
||||
$ tar zxf layer7switch-0.1.1.tar.gz
|
||||
$ cd layer7switch-0.1.1
|
||||
|
||||
L7SW currently only supports Linux kernel 2.6.19+. If you prefer to use it
|
||||
on a more stable kernel, such as 2.6.16.X, you can apply this patch to the
|
||||
L7SW directory :
|
||||
|
||||
[ http://haproxy.1wt.eu/download/patches/tcp_splice-0.1.1-linux-2.6.16.diff ]
|
||||
|
||||
$ patch -p1 -d kernel < tcp_splice-0.1.1-linux-2.6.16.diff
|
||||
|
||||
Alternatively, if you prefer to run it on 2.4.33+, you can apply this patch
|
||||
to the L7SW directory :
|
||||
|
||||
[ http://haproxy.1wt.eu/download/patches/tcp_splice-0.1.1-linux-2.4.33.diff ]
|
||||
|
||||
$ patch -p1 -d kernel < tcp_splice-0.1.1-linux-2.4.33.diff
|
||||
|
||||
Then build the kernel module as described in the L7SW README. Basically, you
|
||||
just have to do this once your tree has been patched :
|
||||
|
||||
$ cd kernel
|
||||
$ make
|
||||
|
||||
You can either install the resulting module (tcp_splice) or load it now. During
|
||||
early testing periods, it might be preferable to avoid installing anything and
|
||||
just load it manually :
|
||||
|
||||
$ sudo insmod tcp_splice.*o
|
||||
$ cd ..
|
||||
|
||||
Now that the module is loaded, you need to build the libtcpsplice library on
|
||||
which haproxy currently relies :
|
||||
|
||||
$ cd userland/libtcpsplice
|
||||
$ make
|
||||
$ cd ..
|
||||
|
||||
For the adventurous, there's also a proof of concept in the userland/switchd
|
||||
directory, it may be useful if you encounter problems with haproxy for
|
||||
instance. But it is not needed at all here.
|
||||
|
||||
OK, L7SW is ready. Now you have to extract haproxy and tell it to build using
|
||||
libtcpsplice :
|
||||
|
||||
$ cd /tmp
|
||||
$ tar zxf haproxy-1.3.5.tar.gz
|
||||
$ cd haproxy-1.3.5
|
||||
$ make USE_TCPSPLICE=1 TCPSPLICEDIR=/tmp/layer7switch-0.1.1/userland/libtcpsplice
|
||||
|
||||
There are other options to make, which are hugely recommended, such as
|
||||
CPU=, REGEX=, and above all, TARGET= so that you use the best syscalls and
|
||||
functions for your system. Generally you will use TARGET=linux26, but 2.4 users
|
||||
with an epoll-patched kernel will use TARGET=linux24e. This is very important
|
||||
because failing to specify those options will disable important optimizations
|
||||
which might hide the tcpsplice benefits ! Please consult the haproxy's README.
|
||||
|
||||
Now that you have haproxy built with support for tcpsplice, and that the module
|
||||
is loaded, you have to write a config. There is an example in the 'examples'
|
||||
directory. Basically, you just have to add the "option tcpsplice" keyword BOTH
|
||||
in the frontend AND in the backend sections that you want to accelerate.
|
||||
|
||||
If the option is specified only in the frontend or in the backend, then no
|
||||
acceleration will be used. It is designed this way to allow some front-back
|
||||
combinations to use it without forcing others to use it. Of course, if you use
|
||||
a single "listen" section, you just have to specify it once.
|
||||
|
||||
As of now (l7sw-0.1.1 and haproxy-1.3.5), you need the CAP_NET_ADMIN capability
|
||||
to START and to RUN. For human beings, it means that you have to start haproxy
|
||||
as root and keep it running as root, so it must not drop its privileges. This
|
||||
is somewhat annoying, but we'll try to find a solution later.
|
||||
|
||||
Also, l7sw-0.1.1 does not yet support TCP window scaling nor SACK. So you have
|
||||
to disable both features on the proxy :
|
||||
|
||||
$ sudo sysctl -w net.ipv4.tcp_window_scaling=0
|
||||
$ sudo sysctl -w net.ipv4.tcp_sack=0
|
||||
$ sudo sysctl -w net.ipv4.tcp_dsack=0
|
||||
$ sudo sysctl -w net.ipv4.tcp_tw_recycle=1
|
||||
|
||||
You can now check that everything works as expected. Run "vmstat 1" or "top"
|
||||
in one terminal, and haproxy in another one :
|
||||
|
||||
$ sudo ./haproxy -f examples/tcp-splicing-sample.cfg
|
||||
|
||||
Transferring large files through it should not affect it much. You should observe
|
||||
something like 10% CPU instead of 95% when transferring 1 MB files at full
|
||||
speed. You can play with the tcpsplice option in the configuration to see the
|
||||
effects.
|
||||
|
||||
|
||||
Troubleshooting
|
||||
---------------
|
||||
|
||||
This software is still beta, and you will probably encounter some caveats.
|
||||
I personally ran into a few issues that we'll try to address with Alex. First
|
||||
of all, I had occasional lockups on my SMP machine which I never had on a UP
|
||||
one. So if you get problems on an SMP machine, please reboot it in UP and do
|
||||
not lose your time on this.
|
||||
|
||||
I also noticed that sometimes, some sessions remained established even after
|
||||
the end of the program. You might also see some situations where even after
|
||||
the proxy's exit, the traffic still passes through the system. It may happen
|
||||
when you have a limited source port range and that you reuse a TIME_WAIT
|
||||
session matching exactly the same source and destinations. This will need
|
||||
to be addressed too.
|
||||
|
||||
You can play with tcp_splice variables and timeouts here in /proc/sys/net/ :
|
||||
|
||||
$ ls /proc/sys/net/tcp_splice/
|
||||
debug_level timeout_established timeout_listen timeout_synsent
|
||||
timeout_close timeout_finwait timeout_synack timeout_timewait
|
||||
timeout_closewait timeout_lastack timeout_synrecv
|
||||
|
||||
$ sysctl net/tcp_splice
|
||||
net.tcp_splice.debug_level = 0
|
||||
net.tcp_splice.timeout_synack = 120
|
||||
net.tcp_splice.timeout_listen = 120
|
||||
net.tcp_splice.timeout_lastack = 30
|
||||
net.tcp_splice.timeout_closewait = 60
|
||||
net.tcp_splice.timeout_close = 10
|
||||
net.tcp_splice.timeout_timewait = 120
|
||||
net.tcp_splice.timeout_finwait = 120
|
||||
net.tcp_splice.timeout_synrecv = 60
|
||||
net.tcp_splice.timeout_synsent = 120
|
||||
net.tcp_splice.timeout_established = 900
|
||||
|
||||
You can also consult the full session list here :
|
||||
|
||||
$ head /proc/net/tcp_splice_conn
|
||||
FromIP FPrt ToIP TPrt LocalIP LPrt DestIP DPrt State Expires
|
||||
0A000301 4EBB 0A000302 1F40 0A000302 817B 0A000301 0050 CLOSE 7
|
||||
0A000301 4E9B 0A000302 1F40 0A000302 8165 0A000301 0050 CLOSE 7
|
||||
|
||||
Since a session exists at least in CLOSE state for 10 seconds, you just have
|
||||
to consult this entry less than 10 seconds after a test to see a session.
|
||||
|
||||
Please report your successes, failures, suggestions or fixes to the L7SW
|
||||
mailing list here (do not use the list to report other haproxy bugs) :
|
||||
|
||||
https://lists.sourceforge.net/lists/listinfo/linux-l7sw-devel
|
||||
|
||||
|
||||
Motivations
|
||||
-----------
|
||||
|
||||
I've always wanted haproxy to be the fastest and most reliable software load
|
||||
balancer available. L7SW is an opportunity to get a huge performance boost
|
||||
on high traffic sites (eg: photo sharing, streaming, ...). In turn, I find it a
|
||||
shame that Alex wastes his time redeveloping a proxy as a proof of concept for
|
||||
his kernel code. While it is a fun game to enter into, it really becomes harder
|
||||
when you need to get close to customers' needs. So by porting haproxy early to
|
||||
L7SW, I get both the opportunity to get an idea of what it will soon be capable
|
||||
of, and help Alex spend more time on the complex kernel part.
|
||||
|
||||
Have fun !
|
||||
Willy
|
82
examples/tcp-splicing-sample.cfg
Normal file
82
examples/tcp-splicing-sample.cfg
Normal file
@ -0,0 +1,82 @@
|
||||
#
|
||||
# This is a sample configuration
|
||||
# haproxy >= 1.3.5 required.
|
||||
#
|
||||
# It listens on 192.168.1.10:80, and directs all requests for Host 'img' or
|
||||
# URIs starting with /img or /css to a dedicated group of servers. URIs
|
||||
# starting with /admin/stats are directed to a backend dedicated to statistics.
|
||||
# TCP splicing is used on static objects to relieve the process from the heavy
|
||||
# job.
|
||||
#
|
||||
|
||||
global
|
||||
maxconn 10000
|
||||
log 127.0.0.1 local0
|
||||
uid 200
|
||||
gid 200
|
||||
chroot /var/empty
|
||||
daemon
|
||||
|
||||
|
||||
# The public 'www' address in the DMZ
|
||||
frontend public
|
||||
bind 192.168.1.10:80
|
||||
mode http
|
||||
log global
|
||||
option httplog
|
||||
option dontlognull
|
||||
option httpclose
|
||||
option tcpsplice
|
||||
monitor-uri /monitoruri
|
||||
maxconn 8000
|
||||
clitimeout 30000
|
||||
|
||||
# Host: will use a specific keyword soon
|
||||
reqisetbe ^Host:\ img static
|
||||
|
||||
# The URI will use a specific keyword soon
|
||||
reqisetbe ^[^\ ]*\ /(img|css)/ static
|
||||
reqisetbe ^[^\ ]*\ /admin/stats stats
|
||||
|
||||
default_backend dynamic
|
||||
|
||||
|
||||
# The static backend for 'Host: img', /img and /css.
|
||||
# TCP splicing is enabled on this backend because we don't expect to do
|
||||
# anything interesting with static objects, but we know they can eat much
|
||||
# bandwidth.
|
||||
backend static
|
||||
mode http
|
||||
balance roundrobin
|
||||
option tcpsplice
|
||||
contimeout 5000
|
||||
srvtimeout 5000
|
||||
redispatch
|
||||
retries 2
|
||||
option httpchk HEAD /favicon.ico
|
||||
server statsrv1 192.168.1.8:80 check inter 1000
|
||||
server statsrv2 192.168.1.9:80 check inter 1000
|
||||
|
||||
|
||||
backend dynamic
|
||||
mode http
|
||||
balance roundrobin
|
||||
contimeout 30000
|
||||
srvtimeout 30000
|
||||
redispatch
|
||||
retries 2
|
||||
option httpchk HEAD /login.php
|
||||
cookie DYNSRV insert indirect nocache
|
||||
fullconn 4000 # the servers will be used at full load above this number of connections
|
||||
server dynsrv1 192.168.1.1:80 minconn 50 maxconn 500 cookie s1 check inter 1000
|
||||
server dynsrv2 192.168.1.2:80 minconn 50 maxconn 500 cookie s2 check inter 1000
|
||||
server dynsrv3 192.168.1.3:80 minconn 50 maxconn 500 cookie s3 check inter 1000
|
||||
server dynsrv4 192.168.1.4:80 minconn 50 maxconn 500 cookie s4 check inter 1000
|
||||
|
||||
|
||||
backend stats
|
||||
log global
|
||||
mode http
|
||||
stats uri /
|
||||
balance roundrobin
|
||||
|
@ -57,6 +57,7 @@
|
||||
#define PR_O_TPXY_CIP 0x04000000 /* bind to the client's IP address when connect()ing */
|
||||
#define PR_O_TPXY_CLI 0x06000000 /* bind to the client's IP+port when connect()ing */
|
||||
#define PR_O_TPXY_MASK 0x06000000 /* bind to a non-local address when connect()ing */
|
||||
#define PR_O_TCPSPLICE 0x08000000 /* delegate data transfer to linux kernel's tcp_splice */
|
||||
|
||||
|
||||
#endif /* _TYPES_BACKEND_H */
|
||||
|
@ -42,6 +42,7 @@
|
||||
#define LSTCHK_CAP_BIND 0x00000001 /* check that we can bind to any port */
|
||||
#define LSTCHK_CTTPROXY 0x00000002 /* check that tproxy is enabled */
|
||||
#define LSTCHK_NETADM 0x00000004 /* check that we have CAP_NET_ADMIN */
|
||||
#define LSTCHK_TCPSPLICE 0x00000008 /* check that linux tcp_splice is enabled */
|
||||
|
||||
/* FIXME : this will have to be redefined correctly */
|
||||
struct global {
|
||||
|
@ -41,6 +41,10 @@
|
||||
#include <import/ip_tproxy.h>
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_HAP_TCPSPLICE
|
||||
#include <libtcpsplice.h>
|
||||
#endif
|
||||
|
||||
/*
|
||||
* This function recounts the number of usable active and backup servers for
|
||||
* proxy <p>. These numbers are returned into the p->srv_act and p->srv_bck.
|
||||
@ -364,6 +368,13 @@ int connect_server(struct session *s)
|
||||
return SN_ERR_PRXCOND; /* it is a configuration limit */
|
||||
}
|
||||
|
||||
#ifdef CONFIG_HAP_TCPSPLICE
|
||||
if ((s->fe->options & s->be->beprm->options) & PR_O_TCPSPLICE) {
|
||||
/* TCP splicing supported by both FE and BE */
|
||||
tcp_splice_initfd(s->cli_fd, fd);
|
||||
}
|
||||
#endif
|
||||
|
||||
if ((fcntl(fd, F_SETFL, O_NONBLOCK)==-1) ||
|
||||
(setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, (char *) &one, sizeof(one)) == -1)) {
|
||||
qfprintf(stderr,"Cannot set client socket to non blocking mode.\n");
|
||||
|
@ -94,6 +94,10 @@ static const struct {
|
||||
{ "allbackups", PR_O_USE_ALL_BK, PR_CAP_BE, 0 },
|
||||
{ "persist", PR_O_PERSIST, PR_CAP_BE, 0 },
|
||||
{ "forceclose", PR_O_FORCE_CLO | PR_O_HTTP_CLOSE, PR_CAP_BE, 0 },
|
||||
#ifdef CONFIG_HAP_TCPSPLICE
|
||||
{ "tcpsplice", PR_O_TCPSPLICE , PR_CAP_BE|PR_CAP_FE, LSTCHK_TCPSPLICE|LSTCHK_NETADM },
|
||||
#endif
|
||||
|
||||
{ NULL, 0, 0 }
|
||||
};
|
||||
|
||||
|
@ -90,6 +90,10 @@
|
||||
#include <proto/stream_sock.h>
|
||||
#include <proto/task.h>
|
||||
|
||||
#ifdef CONFIG_HAP_TCPSPLICE
|
||||
#include <libtcpsplice.h>
|
||||
#endif
|
||||
|
||||
/*********************************************************************/
|
||||
|
||||
/*********************************************************************/
|
||||
@ -751,6 +755,18 @@ int main(int argc, char **argv)
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef CONFIG_HAP_TCPSPLICE
|
||||
if (global.last_checks & LSTCHK_TCPSPLICE) {
|
||||
if (tcp_splice_start() < 0) {
|
||||
Alert("[%s.main()] Cannot enable tcp_splice.\n"
|
||||
" Make sure you have enough permissions and that the module is loadable.\n"
|
||||
" Alternatively, you may disable the 'tcpsplice' options in the configuration.\n"
|
||||
"", argv[0], global.gid);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
if (nb_oldpids)
|
||||
tell_old_pids(oldpids_sig);
|
||||
|
||||
|
@ -51,6 +51,9 @@
|
||||
#include <proto/session.h>
|
||||
#include <proto/task.h>
|
||||
|
||||
#ifdef CONFIG_HAP_TCPSPLICE
|
||||
#include <libtcpsplice.h>
|
||||
#endif
|
||||
|
||||
#define DEBUG_PARSE_NO_SPEEDUP
|
||||
#undef DEBUG_PARSE_NO_SPEEDUP
|
||||
@ -1800,6 +1803,12 @@ int process_srv(struct session *t)
|
||||
t->logs.t_close = t->logs.t_connect; /* to get a valid end date */
|
||||
sess_log(t);
|
||||
}
|
||||
#ifdef CONFIG_HAP_TCPSPLICE
|
||||
if ((t->fe->options & t->be->beprm->options) & PR_O_TCPSPLICE) {
|
||||
/* TCP splicing supported by both FE and BE */
|
||||
tcp_splice_splicefd(t->cli_fd, t->srv_fd, 0);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
else {
|
||||
t->srv_state = SV_STHEADERS;
|
||||
@ -1956,6 +1965,12 @@ int process_srv(struct session *t)
|
||||
t->srv_state = SV_STSHUTW;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_HAP_TCPSPLICE
|
||||
if ((t->fe->options & t->be->beprm->options) & PR_O_TCPSPLICE) {
|
||||
/* TCP splicing supported by both FE and BE */
|
||||
tcp_splice_splicefd(t->cli_fd, t->srv_fd, 0);
|
||||
}
|
||||
#endif
|
||||
/* if the user wants to log as soon as possible, without counting
|
||||
bytes from the server, then this is the right moment. */
|
||||
if (t->fe->to_log && !(t->logs.logwait & LW_BYTES)) {
|
||||
|
Loading…
Reference in New Issue
Block a user