From fb14edc215d200c0857f098f2d339b504ff69095 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Sun, 14 Jun 2009 15:24:37 +0200 Subject: [PATCH] [MEDIUM] stream_sock: implement tcp-cork for use during shutdowns on Linux Setting TCP_CORK on a socket before sending the last segment enables automatic merging of this segment with the FIN from the shutdown() call. Playing with TCP_CORK is not easy, though, as we have to track the status of the TCP_NODELAY flag since both are mutually exclusive. Doing so saves one more packet per session and offers about 5% more performance. There is no reason not to do it, so there is no associated option. --- include/types/fd.h | 8 ++++++++ src/backend.c | 1 + src/checks.c | 1 + src/client.c | 1 + src/proto_tcp.c | 5 ++--- src/stream_sock.c | 37 +++++++++++++++++++++++++++++++++++++ 6 files changed, 50 insertions(+), 3 deletions(-) diff --git a/include/types/fd.h b/include/types/fd.h index 8ae0f2be1..2bc258fa7 100644 --- a/include/types/fd.h +++ b/include/types/fd.h @@ -59,6 +59,13 @@ enum { #define FD_POLL_DATA (FD_POLL_IN | FD_POLL_OUT) #define FD_POLL_STICKY (FD_POLL_ERR | FD_POLL_HUP) +/* bit values for fdtab[fd].flags. Most of them are used to hold a value + * resulting from a behaviour change. 
+ */ +#define FD_FL_TCP 0x0001 /* socket is TCP */ +#define FD_FL_TCP_NODELAY 0x0002 +#define FD_FL_TCP_CORK 0x0004 + /* info about one given fd */ struct fdtab { struct { @@ -66,6 +73,7 @@ struct fdtab { struct buffer *b; /* read/write buffer */ } cb[DIR_SIZE]; void *owner; /* the session (or proxy) associated with this fd */ + unsigned short flags; /* various flags specifying the exact status of this fd */ unsigned char state; /* the state of this fd */ unsigned char ev; /* event seen in return of poll() : FD_POLL_* */ struct sockaddr *peeraddr; /* pointer to peer's network address, or NULL if unset */ diff --git a/src/backend.c b/src/backend.c index b830bdb92..5e78fd8df 100644 --- a/src/backend.c +++ b/src/backend.c @@ -1948,6 +1948,7 @@ int connect_server(struct session *s) fdtab[fd].owner = s->req->cons; fdtab[fd].state = FD_STCONN; /* connection in progress */ + fdtab[fd].flags = FD_FL_TCP | FD_FL_TCP_NODELAY; fdtab[fd].cb[DIR_RD].f = &stream_sock_read; fdtab[fd].cb[DIR_RD].b = s->rep; fdtab[fd].cb[DIR_WR].f = &stream_sock_write; diff --git a/src/checks.c b/src/checks.c index b4e985702..4022cad58 100644 --- a/src/checks.c +++ b/src/checks.c @@ -692,6 +692,7 @@ struct task *process_chk(struct task *t) fdtab[fd].peeraddr = (struct sockaddr *)&sa; fdtab[fd].peerlen = sizeof(sa); fdtab[fd].state = FD_STCONN; /* connection in progress */ + fdtab[fd].flags = FD_FL_TCP | FD_FL_TCP_NODELAY; EV_FD_SET(fd, DIR_WR); /* for connect status */ #ifdef DEBUG_FULL assert (!EV_FD_ISSET(fd, DIR_RD)); diff --git a/src/client.c b/src/client.c index 3e156eb78..346adc673 100644 --- a/src/client.c +++ b/src/client.c @@ -417,6 +417,7 @@ int event_accept(int fd) { fd_insert(cfd); fdtab[cfd].owner = &s->si[0]; fdtab[cfd].state = FD_STREADY; + fdtab[cfd].flags = FD_FL_TCP | FD_FL_TCP_NODELAY; fdtab[cfd].cb[DIR_RD].f = l->proto->read; fdtab[cfd].cb[DIR_RD].b = s->req; fdtab[cfd].cb[DIR_WR].f = l->proto->write; diff --git a/src/proto_tcp.c b/src/proto_tcp.c index ed0812c20..e9b3ae37b 
100644 --- a/src/proto_tcp.c +++ b/src/proto_tcp.c @@ -212,9 +212,7 @@ int tcp_bind_listener(struct listener *listener, char *errmsg, int errlen) goto tcp_close_return; } - if ((fcntl(fd, F_SETFL, O_NONBLOCK) == -1) || - (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, - (char *) &one, sizeof(one)) == -1)) { + if (fcntl(fd, F_SETFL, O_NONBLOCK) == -1) { err |= ERR_FATAL | ERR_ALERT; msg = "cannot make socket non-blocking"; goto tcp_close_return; @@ -281,6 +279,7 @@ int tcp_bind_listener(struct listener *listener, char *errmsg, int errlen) fdtab[fd].cb[DIR_RD].b = fdtab[fd].cb[DIR_WR].b = NULL; fdtab[fd].owner = listener; /* reference the listener instead of a task */ fdtab[fd].state = FD_STLISTEN; + fdtab[fd].flags = FD_FL_TCP; fdtab[fd].peeraddr = NULL; fdtab[fd].peerlen = 0; tcp_return: diff --git a/src/stream_sock.c b/src/stream_sock.c index 82bf8caf6..a3ef26992 100644 --- a/src/stream_sock.c +++ b/src/stream_sock.c @@ -16,6 +16,8 @@ #include #include +#include + #include #include #include @@ -569,6 +571,26 @@ static int stream_sock_write_loop(struct stream_interface *si, struct buffer *b) if (max > b->send_max) max = b->send_max; + +#ifdef TCP_CORK + /* + * Check if we want to cork output before sending. This typically occurs + * when there are data left in the buffer, or when we reached the end of + * buffer but we know we will close, so we try to merge the ongoing FIN + * with the last data segment. 
+ */ + if ((fdtab[si->fd].flags & (FD_FL_TCP|FD_FL_TCP_CORK)) == FD_FL_TCP) { + if (unlikely((b->send_max == b->l && + (b->flags & (BF_SHUTW|BF_SHUTW_NOW|BF_HIJACK|BF_WRITE_ENA|BF_SHUTR)) == + (BF_WRITE_ENA|BF_SHUTR)))) { + /* we have to unconditionally reset TCP_NODELAY for CORK */ + setsockopt(si->fd, IPPROTO_TCP, TCP_NODELAY, (char *) &zero, sizeof(zero)); + setsockopt(si->fd, SOL_TCP, TCP_CORK, (char *) &one, sizeof(one)); + fdtab[si->fd].flags = (fdtab[si->fd].flags & ~FD_FL_TCP_NODELAY) | FD_FL_TCP_CORK; + } + } +#endif + #ifndef MSG_NOSIGNAL { int skerr; @@ -628,6 +650,21 @@ static int stream_sock_write_loop(struct stream_interface *si, struct buffer *b) } } /* while (1) */ + /* check if we need to uncork the output, for instance when the + * output buffer is empty but not shutr(). + */ + if (unlikely((fdtab[si->fd].flags & (FD_FL_TCP|FD_FL_TCP_NODELAY)) == FD_FL_TCP && (b->flags & BF_EMPTY))) { + if ((b->flags & (BF_SHUTW|BF_SHUTW_NOW|BF_HIJACK|BF_WRITE_ENA|BF_SHUTR)) != (BF_WRITE_ENA|BF_SHUTR)) { +#ifdef TCP_CORK + if (fdtab[si->fd].flags & FD_FL_TCP_CORK) + setsockopt(si->fd, SOL_TCP, TCP_CORK, (char *) &zero, sizeof(zero)); +#endif + setsockopt(si->fd, IPPROTO_TCP, TCP_NODELAY, (char *) &one, sizeof(one)); + fdtab[si->fd].flags = (fdtab[si->fd].flags & ~FD_FL_TCP_CORK) | FD_FL_TCP_NODELAY; + } + } + + return retval; }