diff --git a/Makefile b/Makefile index 195e6a9fc..48f4775a9 100644 --- a/Makefile +++ b/Makefile @@ -31,6 +31,9 @@ LD = gcc PCREDIR := $(shell pcre-config --prefix 2>/dev/null || :) #PCREDIR=/usr/local +# This is the directory hosting libtcpsplice.[ah] when USE_TCPSPLICE is set +TCPSPLICEDIR := + # This is for standard Linux 2.6 with netfilter and epoll() COPTS.linux26 = -DNETFILTER -DENABLE_POLL -DENABLE_EPOLL LIBS.linux26 = @@ -117,6 +120,10 @@ endif # do not change this one, enable USE_* variables instead. OPTIONS = +ifneq ($(USE_TCPSPLICE),) +OPTIONS += -DCONFIG_HAP_TCPSPLICE +endif + ifneq ($(USE_CTTPROXY),) OPTIONS += -DCONFIG_HAP_CTTPROXY endif @@ -169,8 +176,16 @@ TARGET_OPTS=$(COPTS.$(TARGET)) REGEX_OPTS=$(COPTS.$(REGEX)) CPU_OPTS=$(COPTS.$(CPU)) -COPTS=-Iinclude $(ADDINC) $(CPU_OPTS) $(TARGET_OPTS) $(REGEX_OPTS) $(SMALL_OPTS) $(DEFINE) $(OPTIONS) -LIBS=$(LIBS.$(TARGET)) $(LIBS.$(REGEX)) $(ADDLIB) +COPTS = -Iinclude $(CPU_OPTS) $(TARGET_OPTS) $(REGEX_OPTS) $(SMALL_OPTS) $(DEFINE) $(OPTIONS) +LIBS=$(LIBS.$(TARGET)) $(LIBS.$(REGEX)) + +ifneq ($(USE_TCPSPLICE),) +COPTS += -I$(TCPSPLICEDIR) +LIBS += -L$(TCPSPLICEDIR) -ltcpsplice +endif + +COPTS += $(ADDINC) +LIBS += $(ADDLIB) CFLAGS = -Wall $(COPTS) $(DEBUG) LDFLAGS = -g diff --git a/doc/tcp-splicing.txt b/doc/tcp-splicing.txt new file mode 100644 index 000000000..ffdb2565e --- /dev/null +++ b/doc/tcp-splicing.txt @@ -0,0 +1,196 @@ + Using Linux TCP Splicing with HAProxy + Willy Tarreau + - 2007/01/06 - + + +Alexandre Cassen has started a project called Linux Layer7 Switching (L7SW), +whose goal is to provide kernel services to help userland proxies achieving +very high performance. Right now, the project consists in a loadable kernel +module providing TCP Splicing under Linux. 
+ +TCP Splicing is a method by which a userland proxy can tell the kernel that +it considers it has no added value on the data part of a connection, and that +the kernel can perform the transfers itself, thus relieving the proxy from +a potentially heavy job. There are two advantages to this method : + + - it reduces the number of process wakeups + - it reduces the number of data copies between user-space and kernel buffers + +This method is particularly suited to protocols in which data is sent till +the end of the session. This is the case for FTP data for instance, and it +is also the case for the BODY part of HTTP/1.0. + +The great news is that haproxy has been designed from the beginning with a +clear distinction between the headers and the DATA phase, so it was a child's +game to add hooks to Alex's library in it. + +Be careful! Both versions are to be considered BETA software ! Run them on +your systems if you want, but do not complain if it crashes twice a day ! +Anyway, it seems stable on our test machines. + +In order to use TCP Splicing on haproxy, you need : + + - Linux Layer7 Switching code version 0.1.1 : [ http://linux-l7sw.sf.net/ ] + - Haproxy version 1.3.5 : [ http://haproxy.1wt.eu/download/1.3/src/ ] + +Then, you must untar both packages in any location, let's assume you'll +be using /tmp. First extract l7sw and : + + $ cd /tmp + $ tar zxf layer7switch-0.1.1.tar.gz + $ cd layer7switch-0.1.1 + +L7SW currently only supports Linux kernel 2.6.19+. 
If you prefer to use it +on a more stable kernel, such as 2.6.16.X, you can apply this patch to the +L7SW directory : + + [ http://haproxy.1wt.eu/download/patches/tcp_splice-0.1.1-linux-2.6.16.diff ] + + $ patch -p1 -d kernel < tcp_splice-0.1.1-linux-2.6.16.diff + +Alternatively, if you prefer to run it on 2.4.33+, you can apply this patch +to the L7SW directory : + + [ http://haproxy.1wt.eu/download/patches/tcp_splice-0.1.1-linux-2.4.33.diff ] + + $ patch -p1 -d kernel < tcp_splice-0.1.1-linux-2.4.33.diff + +Then build the kernel module as described in the L7SW README. Basically, you +just have to do this once your tree has been patched : + + $ cd kernel + $ make + +You can either install the resulting module (tcp_splice) or load it now. During +early testing periods, it might be preferable to avoid installing anything and +just load it manually : + + $ sudo insmod tcp_splice.*o + $ cd .. + +Now that the module is loaded, you need to build the libtcpsplice library on +which haproxy currently relies : + + $ cd userland/libtcpsplice + $ make + $ cd .. + +For the adventurous, there's also a proof of concept in the userland/switchd +directory, it may be useful if you encounter problems with haproxy for +instance. But it is not needed at all here. + +OK, L7SW is ready. Now you have to extract haproxy and tell it to build using +libtcpsplice : + + $ cd /tmp + $ tar zxf haproxy-1.3.5.tar.gz + $ cd haproxy-1.3.5 + $ make USE_TCPSPLICE=1 TCPSPLICEDIR=/tmp/layer7switch-0.1.1/userland/libtcpsplice + +There are other options to make, which are hugely recommended, such as +CPU=, REGEX=, and above all, TARGET= so that you use the best syscalls and +functions for your system. Generally you will use TARGET=linux26, but 2.4 users +with an epoll-patched kernel will use TARGET=linux24e. This is very important +because failing to specify those options will disable important optimizations +which might hide the tcpsplice benefits ! Please consult the haproxy's README. 
+ +Now that you have haproxy built with support for tcpsplice, and that the module +is loaded, you have to write a config. There is an example in the 'examples' +directory. Basically, you just have to add the "option tcpsplice" keyword BOTH +in the frontend AND in the backend sections that you want to accelerate. + +If the option is specified only in the frontend or in the backend, then no +acceleration will be used. It is designed this way to allow some front-back +combinations to use it without forcing others to use it. Of course, if you use +a single "listen" section, you just have to specify it once. + +As of now (l7sw-0.1.1 and haproxy-1.3.5), you need the CAP_NET_ADMIN capability +to START and to RUN. For human beings, it means that you have to start haproxy +as root and keep it running as root, so it must not drop its privileges. This +is somewhat annoying, but we'll try to find a solution later. + +Also, l7sw-0.1.1 does not yet support TCP window scaling nor SACK. So you have +to disable both features on the proxy : + + $ sudo sysctl -w net.ipv4.tcp_window_scaling=0 + $ sudo sysctl -w net.ipv4.tcp_sack=0 + $ sudo sysctl -w net.ipv4.tcp_dsack=0 + $ sudo sysctl -w net.ipv4.tcp_tw_recycle=1 + +You can now check that everything works as expected. Run "vmstat 1" or "top" +in one terminal, and haproxy in another one : + + $ sudo ./haproxy -f examples/tcp-splicing-sample.cfg + +Transferring large files through it should not affect it much. You should observe +something like 10% CPU instead of 95% when transferring 1 MB files at full +speed. You can play with the tcpsplice option in the configuration to see the +effects. + + +Troubleshooting +--------------- + +This software is still beta, and you will probably encounter some caveats. +I personally ran into a few issues that we'll try to address with Alex. First +of all, I had occasional lockups on my SMP machine which I never had on an UP +one. 
So if you get problems on an SMP machine, please reboot it in UP and do +not lose your time on this. + +I also noticed that sometimes, some sessions remained established even after +the end of the program. You might also see some situations where even after +the proxy's exit, the traffic still passes through the system. It may happen +when you have a limited source port range and that you reuse a TIME_WAIT +session matching exactly the same source and destinations. This will need +to be addressed too. + +You can play with tcp_splice variables and timeouts here in /proc/sys/net/ : + + $ ls /proc/sys/net/tcp_splice/ + debug_level timeout_established timeout_listen timeout_synsent + timeout_close timeout_finwait timeout_synack timeout_timewait + timeout_closewait timeout_lastack timeout_synrecv + + $ sysctl net/tcp_splice + net.tcp_splice.debug_level = 0 + net.tcp_splice.timeout_synack = 120 + net.tcp_splice.timeout_listen = 120 + net.tcp_splice.timeout_lastack = 30 + net.tcp_splice.timeout_closewait = 60 + net.tcp_splice.timeout_close = 10 + net.tcp_splice.timeout_timewait = 120 + net.tcp_splice.timeout_finwait = 120 + net.tcp_splice.timeout_synrecv = 60 + net.tcp_splice.timeout_synsent = 120 + net.tcp_splice.timeout_established = 900 + +You can also consult the full session list here : + +$ head /proc/net/tcp_splice_conn +FromIP FPrt ToIP TPrt LocalIP LPrt DestIP DPrt State Expires +0A000301 4EBB 0A000302 1F40 0A000302 817B 0A000301 0050 CLOSE 7 +0A000301 4E9B 0A000302 1F40 0A000302 8165 0A000301 0050 CLOSE 7 + +Since a session exists at least in CLOSE state for 10 seconds, you just have +to consult this entry less than 10 seconds after a test to see a session. 
+ +Please report your successes, failures, suggestions or fixes to the L7SW +mailing list here (do not use the list to report other haproxy bugs) : + + https://lists.sourceforge.net/lists/listinfo/linux-l7sw-devel + + +Motivations +----------- + +I've always wanted haproxy to be the fastest and most reliable software load +balancer available. L7SW is an opportunity to get a huge performance boost +on high traffic sites (eg: photo sharing, streaming, ...). In turn, I find it a +shame that Alex wastes his time redeveloping a proxy as a proof of concept for +his kernel code. While it is a fun game to enter into, it really becomes harder +when you need to get close to customers' needs. So by porting haproxy early to +L7SW, I get both the opportunity to get an idea of what it will soon be capable +of, and help Alex spend more time on the complex kernel part. + +Have fun ! +Willy diff --git a/examples/tcp-splicing-sample.cfg b/examples/tcp-splicing-sample.cfg new file mode 100644 index 000000000..84d55a307 --- /dev/null +++ b/examples/tcp-splicing-sample.cfg @@ -0,0 +1,82 @@ +# +# This is a sample configuration +# haproxy >= 1.3.5 required. +# +# It listens on 192.168.1.10:80, and directs all requests for Host 'img' or +# URIs starting with /img or /css to a dedicated group of servers. URIs +# starting with /admin/stats are directed to a backend dedicated to statistics. +# TCP splicing is used on static objects to relieve the process from the heavy +# job. 
+# + +global + maxconn 10000 + log 127.0.0.1 local0 + uid 200 + gid 200 + chroot /var/empty + daemon + + +# The public 'www' address in the DMZ +frontend public + bind 192.168.1.10:80 + mode http + log global + option httplog + option dontlognull + option httpclose + option tcpsplice + monitor-uri /monitoruri + maxconn 8000 + clitimeout 30000 + + # Host: will use a specific keyword soon + reqisetbe ^Host:\ img static + + # The URI will use a specific keyword soon + reqisetbe ^[^\ ]*\ /(img|css)/ static + reqisetbe ^[^\ ]*\ /admin/stats stats + + default_backend dynamic + + +# The static backend for 'Host: img', /img and /css. +# TCP splicing is enabled on this backend because we don't expect to do +# anything interesting with static objects, but we know they can eat much +# bandwidth. +backend static + mode http + balance roundrobin + option tcpsplice + contimeout 5000 + srvtimeout 5000 + redispatch + retries 2 + option httpchk HEAD /favicon.ico + server statsrv1 192.168.1.8:80 check inter 1000 + server statsrv2 192.168.1.9:80 check inter 1000 + + +backend dynamic + mode http + balance roundrobin + contimeout 30000 + srvtimeout 30000 + redispatch + retries 2 + option httpchk HEAD /login.php + cookie DYNSRV insert indirect nocache + fullconn 4000 # the servers will be used at full load above this number of connections + server dynsrv1 192.168.1.1:80 minconn 50 maxconn 500 cookie s1 check inter 1000 + server dynsrv2 192.168.1.2:80 minconn 50 maxconn 500 cookie s2 check inter 1000 + server dynsrv3 192.168.1.3:80 minconn 50 maxconn 500 cookie s3 check inter 1000 + server dynsrv4 192.168.1.4:80 minconn 50 maxconn 500 cookie s4 check inter 1000 + + +backend stats + log global + mode http + stats uri / + balance roundrobin + diff --git a/include/types/backend.h b/include/types/backend.h index f70a3707c..d6079ad66 100644 --- a/include/types/backend.h +++ b/include/types/backend.h @@ -57,6 +57,7 @@ #define PR_O_TPXY_CIP 0x04000000 /* bind to the client's IP address 
when connect()ing */ #define PR_O_TPXY_CLI 0x06000000 /* bind to the client's IP+port when connect()ing */ #define PR_O_TPXY_MASK 0x06000000 /* bind to a non-local address when connect()ing */ +#define PR_O_TCPSPLICE 0x08000000 /* delegate data transfer to linux kernel's tcp_splice */ #endif /* _TYPES_BACKEND_H */ diff --git a/include/types/global.h b/include/types/global.h index 40c6c997a..222d4feac 100644 --- a/include/types/global.h +++ b/include/types/global.h @@ -42,6 +42,7 @@ #define LSTCHK_CAP_BIND 0x00000001 /* check that we can bind to any port */ #define LSTCHK_CTTPROXY 0x00000002 /* check that tproxy is enabled */ #define LSTCHK_NETADM 0x00000004 /* check that we have CAP_NET_ADMIN */ +#define LSTCHK_TCPSPLICE 0x00000008 /* check that linux tcp_splice is enabled */ /* FIXME : this will have to be redefined correctly */ struct global { diff --git a/src/backend.c b/src/backend.c index f7dd67571..c0283d20a 100644 --- a/src/backend.c +++ b/src/backend.c @@ -41,6 +41,10 @@ #include #endif +#ifdef CONFIG_HAP_TCPSPLICE +#include +#endif + /* * This function recounts the number of usable active and backup servers for * proxy

. These numbers are returned into the p->srv_act and p->srv_bck. @@ -364,6 +368,13 @@ int connect_server(struct session *s) return SN_ERR_PRXCOND; /* it is a configuration limit */ } +#ifdef CONFIG_HAP_TCPSPLICE + if ((s->fe->options & s->be->beprm->options) & PR_O_TCPSPLICE) { + /* TCP splicing supported by both FE and BE */ + tcp_splice_initfd(s->cli_fd, fd); + } +#endif + if ((fcntl(fd, F_SETFL, O_NONBLOCK)==-1) || (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, (char *) &one, sizeof(one)) == -1)) { qfprintf(stderr,"Cannot set client socket to non blocking mode.\n"); diff --git a/src/cfgparse.c b/src/cfgparse.c index 5017d50d0..8b6e94d36 100644 --- a/src/cfgparse.c +++ b/src/cfgparse.c @@ -94,6 +94,10 @@ static const struct { { "allbackups", PR_O_USE_ALL_BK, PR_CAP_BE, 0 }, { "persist", PR_O_PERSIST, PR_CAP_BE, 0 }, { "forceclose", PR_O_FORCE_CLO | PR_O_HTTP_CLOSE, PR_CAP_BE, 0 }, +#ifdef CONFIG_HAP_TCPSPLICE + { "tcpsplice", PR_O_TCPSPLICE , PR_CAP_BE|PR_CAP_FE, LSTCHK_TCPSPLICE|LSTCHK_NETADM }, +#endif + { NULL, 0, 0 } }; diff --git a/src/haproxy.c b/src/haproxy.c index 047136422..19fffd9c5 100644 --- a/src/haproxy.c +++ b/src/haproxy.c @@ -90,6 +90,10 @@ #include #include +#ifdef CONFIG_HAP_TCPSPLICE +#include +#endif + /*********************************************************************/ /*********************************************************************/ @@ -751,6 +755,18 @@ int main(int argc, char **argv) #endif } +#ifdef CONFIG_HAP_TCPSPLICE + if (global.last_checks & LSTCHK_TCPSPLICE) { + if (tcp_splice_start() < 0) { + Alert("[%s.main()] Cannot enable tcp_splice.\n" + " Make sure you have enough permissions and that the module is loadable.\n" + " Alternatively, you may disable the 'tcpsplice' options in the configuration.\n" + "", argv[0], global.gid); + exit(1); + } + } +#endif + if (nb_oldpids) tell_old_pids(oldpids_sig); diff --git a/src/proto_http.c b/src/proto_http.c index 184d91aa2..b5f974a44 100644 --- a/src/proto_http.c +++ b/src/proto_http.c @@ 
-51,6 +51,9 @@ #include #include +#ifdef CONFIG_HAP_TCPSPLICE +#include +#endif #define DEBUG_PARSE_NO_SPEEDUP #undef DEBUG_PARSE_NO_SPEEDUP @@ -1800,6 +1803,12 @@ int process_srv(struct session *t) t->logs.t_close = t->logs.t_connect; /* to get a valid end date */ sess_log(t); } +#ifdef CONFIG_HAP_TCPSPLICE + if ((t->fe->options & t->be->beprm->options) & PR_O_TCPSPLICE) { + /* TCP splicing supported by both FE and BE */ + tcp_splice_splicefd(t->cli_fd, t->srv_fd, 0); + } +#endif } else { t->srv_state = SV_STHEADERS; @@ -1956,6 +1965,12 @@ int process_srv(struct session *t) t->srv_state = SV_STSHUTW; } +#ifdef CONFIG_HAP_TCPSPLICE + if ((t->fe->options & t->be->beprm->options) & PR_O_TCPSPLICE) { + /* TCP splicing supported by both FE and BE */ + tcp_splice_splicefd(t->cli_fd, t->srv_fd, 0); + } +#endif /* if the user wants to log as soon as possible, without counting bytes from the server, then this is the right moment. */ if (t->fe->to_log && !(t->logs.logwait & LW_BYTES)) {