MAJOR: compression: integrate support for libslz

This library is designed to emit a zlib-compatible stream with no
memory usage and to favor resource savings over compression ratio.
While zlib requires 256 kB of RAM per compression context (and can only
support 4000 connections per GB of RAM), the stateless compression
offered by libslz does not need to retain buffers between subsequent
calls. In theory this slightly reduces the compression ratio but in
practice it does not have that much of an effect since the zlib
window is limited to 32kB.

Libslz is available at :

      http://git.1wt.eu/web?p=libslz.git

It was designed for web compression and provides a lot of savings
over zlib in haproxy. Here are the preliminary results on a single
core of a core2-quad 3.0 GHz in 32-bit for only 300 concurrent
sessions visiting the home page of www.haproxy.org (76 kB) with
the default 16kB buffers :

          BW In      BW Out     BW Saved   Ratio   memory VSZ/RSS
zlib      237 Mbps    92 Mbps   145 Mbps   2.58     84M /  69M
slz       733 Mbps   380 Mbps   353 Mbps   1.93    5.9M / 4.2M

So while the compression ratio is lower, the bandwidth savings are
much more important due to the significantly lower compression cost
which allows haproxy to consume even more data from the servers. In the
example above, zlib became the bottleneck at 24% of the output
bandwidth. Also the difference in memory usage is obvious.

More tests run on a single core of a core i5-3320M, with 500 concurrent
users and the default 16kB buffers :

At 100% CPU (no limit) :
          BW In      BW Out     BW Saved   Ratio   memory VSZ/RSS  hits/s
zlib      480 Mbps   188 Mbps   292 Mbps   2.55     130M / 101M     744
slz      1700 Mbps   810 Mbps   890 Mbps   2.10    23.7M / 9.7M    2382

At 85% CPU (limited) :
          BW In      BW Out     BW Saved   Ratio   memory VSZ/RSS  hits/s
zlib     1240 Mbps   976 Mbps   264 Mbps   1.27     130M / 100M    1738
slz      1600 Mbps   976 Mbps   624 Mbps   1.64    23.7M / 9.7M    2210

The most important benefit really happens when the CPU usage is
limited by "maxcompcpuusage" or the BW limited by "maxcomprate" :
in order to preserve resources, haproxy throttles the compression
ratio until usage is within limits. Since slz is much cheaper, the
average compression ratio is much higher and the input bandwidth
is quite higher for one Gbps output.

Other tests made with some reference files :

                           BW In     BW Out    BW Saved  Ratio  hits/s
daniels.html       zlib  1320 Mbps  163 Mbps  1157 Mbps   8.10    1925
                   slz   3600 Mbps  580 Mbps  3020 Mbps   6.20    5300

tv.com/listing     zlib   980 Mbps  124 Mbps   856 Mbps   7.90     310
                   slz   3300 Mbps  553 Mbps  2747 Mbps   5.97    1100

jquery.min.js      zlib   430 Mbps  180 Mbps   250 Mbps   2.39     547
                   slz   1470 Mbps  764 Mbps   706 Mbps   1.92    1815

bootstrap.min.css  zlib   790 Mbps  165 Mbps   625 Mbps   4.79     777
                   slz   2450 Mbps  650 Mbps  1800 Mbps   3.77    2400

So on top of saving a lot of memory, slz is consistently 2.5-3.5 times
faster than zlib and results in providing more savings for a fixed CPU
usage. For links smaller than 100 Mbps, zlib still provides a better
compression ratio, at the expense of a much higher CPU usage.

Larger input files provide slightly higher bandwidth for both libs, at
the expense of a bit more memory usage for zlib (it converges to 256kB
per connection).
This commit is contained in:
Willy Tarreau 2015-03-29 03:32:06 +02:00
parent 7b21877888
commit 418b8c0c41
4 changed files with 195 additions and 18 deletions

View File

@ -33,6 +33,7 @@
# USE_ACCEPT4 : enable use of accept4() on linux. Automatic.
# USE_MY_ACCEPT4 : use own implementation of accept4() if glibc < 2.10.
# USE_ZLIB : enable zlib library support.
# USE_SLZ : enable slz library instead of zlib (pick at most one).
# USE_CPU_AFFINITY : enable pinning processes to CPU on Linux. Automatic.
# USE_TFO : enable TCP fast open. Supported on Linux >= 3.7.
# USE_NS : enable network namespace support. Supported on Linux >= 2.6.24.
@ -448,6 +449,15 @@ OPTIONS_CFLAGS += -DUSE_GETADDRINFO
BUILD_OPTIONS += $(call ignore_implicit,USE_GETADDRINFO)
endif
ifneq ($(USE_SLZ),)
# Use SLZ_INC and SLZ_LIB to force path to slz.h and libslz.{a,so} if needed.
SLZ_INC =
SLZ_LIB =
OPTIONS_CFLAGS += -DUSE_SLZ $(if $(SLZ_INC),-I$(SLZ_INC))
BUILD_OPTIONS += $(call ignore_implicit,USE_SLZ)
OPTIONS_LDFLAGS += $(if $(SLZ_LIB),-L$(SLZ_LIB)) -lslz
endif
ifneq ($(USE_ZLIB),)
# Use ZLIB_INC and ZLIB_LIB to force path to zlib.h and libz.{a,so} if needed.
ZLIB_INC =

4
README
View File

@ -118,7 +118,9 @@ include additional libs with ADDLIB if needed (in this case for example libdl):
It is also possible to include native support for ZLIB to benefit from HTTP
compression. For this, pass "USE_ZLIB=1" on the "make" command line and ensure
that zlib is present on the system.
that zlib is present on the system. Alternatively it is possible to use libslz
for a faster, far less memory-hungry, but slightly less efficient compression,
by passing "USE_SLZ=1".
By default, the DEBUG variable is set to '-g' to enable debug symbols. It is
not wise to disable it on uncommon systems, because it's often the only way to

View File

@ -23,11 +23,11 @@
#ifndef _TYPES_COMP_H
#define _TYPES_COMP_H
#ifdef USE_ZLIB
#if defined(USE_SLZ)
#include <slz.h>
#elif defined(USE_ZLIB)
#include <zlib.h>
#endif /* USE_ZLIB */
#endif
struct comp {
struct comp_algo *algos;
@ -36,14 +36,19 @@ struct comp {
};
struct comp_ctx {
#ifdef USE_ZLIB
#if defined(USE_SLZ)
struct slz_stream strm;
const void *direct_ptr; /* NULL or pointer to beginning of data */
int direct_len; /* length of direct_ptr if not NULL */
struct buffer *queued; /* if not NULL, data already queued */
#elif defined(USE_ZLIB)
z_stream strm; /* zlib stream */
void *zlib_deflate_state;
void *zlib_window;
void *zlib_prev;
void *zlib_pending_buf;
void *zlib_head;
#endif /* USE_ZLIB */
#endif
int cur_lvl;
};

View File

@ -13,7 +13,9 @@
#include <stdio.h>
#ifdef USE_ZLIB
#if defined(USE_SLZ)
#include <slz.h>
#elif defined(USE_ZLIB)
/* Note: the crappy zlib and openssl libs both define the "free_func" type.
* That's a very clever idea to use such a generic name in general purpose
* libraries, really... The zlib one is easier to redefine than openssl's,
@ -61,7 +63,17 @@ static int identity_flush(struct comp_ctx *comp_ctx, struct buffer *out);
static int identity_finish(struct comp_ctx *comp_ctx, struct buffer *out);
static int identity_end(struct comp_ctx **comp_ctx);
#ifdef USE_ZLIB
#if defined(USE_SLZ)
static int rfc1950_init(struct comp_ctx **comp_ctx, int level);
static int rfc1951_init(struct comp_ctx **comp_ctx, int level);
static int rfc1952_init(struct comp_ctx **comp_ctx, int level);
static int rfc195x_add_data(struct comp_ctx *comp_ctx, const char *in_data, int in_len, struct buffer *out);
static int rfc195x_flush(struct comp_ctx *comp_ctx, struct buffer *out);
static int rfc195x_finish(struct comp_ctx *comp_ctx, struct buffer *out);
static int rfc195x_end(struct comp_ctx **comp_ctx);
#elif defined(USE_ZLIB)
static int gzip_init(struct comp_ctx **comp_ctx, int level);
static int raw_def_init(struct comp_ctx **comp_ctx, int level);
@ -77,7 +89,11 @@ static int deflate_end(struct comp_ctx **comp_ctx);
const struct comp_algo comp_algos[] =
{
{ "identity", 8, "identity", 8, identity_init, identity_add_data, identity_flush, identity_finish, identity_end },
#ifdef USE_ZLIB
#if defined(USE_SLZ)
{ "deflate", 7, "deflate", 7, rfc1950_init, rfc195x_add_data, rfc195x_flush, rfc195x_finish, rfc195x_end },
{ "raw-deflate", 11, "deflate", 7, rfc1951_init, rfc195x_add_data, rfc195x_flush, rfc195x_finish, rfc195x_end },
{ "gzip", 4, "gzip", 4, rfc1952_init, rfc195x_add_data, rfc195x_flush, rfc195x_finish, rfc195x_end },
#elif defined(USE_ZLIB)
{ "deflate", 7, "deflate", 7, deflate_init, deflate_add_data, deflate_flush, deflate_finish, deflate_end },
{ "raw-deflate", 11, "deflate", 7, raw_def_init, deflate_add_data, deflate_flush, deflate_finish, deflate_end },
{ "gzip", 4, "gzip", 4, gzip_init, deflate_add_data, deflate_flush, deflate_finish, deflate_end },
@ -221,7 +237,7 @@ int http_compression_buffer_end(struct session *s, struct buffer **in, struct bu
struct buffer *ib = *in, *ob = *out;
char *tail;
#ifdef USE_ZLIB
#if defined(USE_SLZ) || defined(USE_ZLIB)
int ret;
/* flush data here */
@ -357,7 +373,11 @@ static inline int init_comp_ctx(struct comp_ctx **comp_ctx)
*comp_ctx = pool_alloc2(pool_comp_ctx);
if (*comp_ctx == NULL)
return -1;
#ifdef USE_ZLIB
#if defined(USE_SLZ)
(*comp_ctx)->direct_ptr = NULL;
(*comp_ctx)->direct_len = 0;
(*comp_ctx)->queued = NULL;
#elif defined(USE_ZLIB)
zlib_used_memory += sizeof(struct comp_ctx);
strm = &(*comp_ctx)->strm;
@ -427,11 +447,6 @@ static int identity_finish(struct comp_ctx *comp_ctx, struct buffer *out)
return 0;
}
static int identity_reset(struct comp_ctx *comp_ctx)
{
return 0;
}
/*
* Deinit the algorithm
*/
@ -441,7 +456,148 @@ static int identity_end(struct comp_ctx **comp_ctx)
}
#ifdef USE_ZLIB
#ifdef USE_SLZ
/* Initializes an SLZ stream producing the gzip format (RFC1952). The level is
 * reduced to a boolean (0 = no compression, 1 = compress). Returns < 0 on
 * error (context allocation or stream init failure).
 */
static int rfc1952_init(struct comp_ctx **comp_ctx, int level)
{
	struct comp_ctx *ctx;

	if (init_comp_ctx(comp_ctx) < 0)
		return -1;

	ctx = *comp_ctx;
	ctx->cur_lvl = (level != 0);
	return slz_rfc1952_init(&ctx->strm, ctx->cur_lvl);
}
/* Initializes an SLZ stream producing the raw deflate format (RFC1951). The
 * level is reduced to a boolean (0 = no compression, 1 = compress). Returns
 * < 0 on error (context allocation or stream init failure).
 */
static int rfc1951_init(struct comp_ctx **comp_ctx, int level)
{
	struct comp_ctx *ctx;

	if (init_comp_ctx(comp_ctx) < 0)
		return -1;

	ctx = *comp_ctx;
	ctx->cur_lvl = (level != 0);
	return slz_rfc1951_init(&ctx->strm, ctx->cur_lvl);
}
/* Initializes an SLZ stream producing the zlib format (RFC1950). The level is
 * reduced to a boolean (0 = no compression, 1 = compress). Returns < 0 on
 * error (context allocation or stream init failure).
 */
static int rfc1950_init(struct comp_ctx **comp_ctx, int level)
{
	struct comp_ctx *ctx;

	if (init_comp_ctx(comp_ctx) < 0)
		return -1;

	ctx = *comp_ctx;
	ctx->cur_lvl = (level != 0);
	return slz_rfc1950_init(&ctx->strm, ctx->cur_lvl);
}
/* Return the size of consumed data or -1. The output buffer is unused at this
 * point, we only keep a reference to the input data or a copy of them if the
 * reference is already used.
 *
 * Strategy: the first chunk is only referenced (direct_ptr/direct_len, no
 * copy); as soon as a second chunk arrives, the referenced data is copied
 * into a shared temporary buffer and all subsequent chunks are appended
 * there until the next flush/finish wipes the state.
 *
 * NOTE(review): <tmpbuf> is a function-static buffer shared by every
 * compression context, and it is never released once allocated. This assumes
 * add_data()..flush() sequences for different streams are never interleaved
 * within one process — confirm against the caller.
 * NOTE(review): the memcpy()s below perform no bounds check; this relies on
 * the caller never feeding more than one buffer's worth of data between two
 * flushes — confirm.
 */
static int rfc195x_add_data(struct comp_ctx *comp_ctx, const char *in_data, int in_len, struct buffer *out)
{
	static struct buffer *tmpbuf = &buf_empty;

	if (in_len <= 0)
		return 0;

	if (comp_ctx->direct_ptr && !comp_ctx->queued) {
		/* data already being pointed to, we're in front of fragmented
		 * data and need a buffer now. We reuse the same buffer, as it's
		 * not used out of the scope of a series of add_data()*, end().
		 */
		if (unlikely(!tmpbuf->size)) {
			/* this is the first time we need the compression buffer */
			if (b_alloc(&tmpbuf) == NULL)
				return -1; /* no memory */
		}
		b_reset(tmpbuf);
		memcpy(bi_end(tmpbuf), comp_ctx->direct_ptr, comp_ctx->direct_len);
		tmpbuf->i += comp_ctx->direct_len;
		comp_ctx->direct_ptr = NULL;
		comp_ctx->direct_len = 0;
		comp_ctx->queued = tmpbuf;
		/* fall through buffer copy */
	}

	if (comp_ctx->queued) {
		/* data already pending: append the new chunk after it */
		memcpy(bi_end(comp_ctx->queued), in_data, in_len);
		comp_ctx->queued->i += in_len;
		return in_len;
	}

	/* first chunk since the last flush: keep a zero-copy reference */
	comp_ctx->direct_ptr = in_data;
	comp_ctx->direct_len = in_len;
	return in_len;
}
/* Compresses the data accumulated using add_data(), and optionally sends the
 * format-specific trailer if <finish> is non-null. <out> is expected to have a
 * large enough free non-wrapping space as verified by http_comp_buffer_init().
 * The number of bytes emitted is reported.
 *
 * After encoding, the pending-input state (direct_ptr/direct_len/queued) is
 * always cleared, and the compression level is adjusted: dropped by one step
 * when the global output rate limit is exceeded or the process is too busy,
 * raised again (up to 1, the highest level this implementation uses) when
 * resources allow.
 */
static int rfc195x_flush_or_finish(struct comp_ctx *comp_ctx, struct buffer *out, int finish)
{
	struct slz_stream *strm = &comp_ctx->strm;
	const char *in_ptr;
	int in_len;
	int out_len;

	/* prefer the queued copy when one exists, otherwise use the
	 * zero-copy reference taken by add_data()
	 */
	in_ptr = comp_ctx->direct_ptr;
	in_len = comp_ctx->direct_len;

	if (comp_ctx->queued) {
		in_ptr = comp_ctx->queued->p;
		in_len = comp_ctx->queued->i;
	}

	out_len = out->i;

	if (in_ptr)
		out->i += slz_encode(strm, bi_end(out), in_ptr, in_len, !finish);

	if (finish)
		out->i += slz_finish(strm, bi_end(out));

	out_len = out->i - out_len;

	/* very important, we must wipe the data we've just flushed */
	comp_ctx->direct_len = 0;
	comp_ctx->direct_ptr = NULL;
	comp_ctx->queued = NULL;

	/* Verify compression rate limiting and CPU usage */
	if ((global.comp_rate_lim > 0 && (read_freq_ctr(&global.comp_bps_out) > global.comp_rate_lim)) || /* rate */
	    (idle_pct < compress_min_idle)) {                                                             /* idle */
		/* too much load: step the level down toward 0 (pass-through) */
		if (comp_ctx->cur_lvl > 0)
			strm->level = --comp_ctx->cur_lvl;
	}
	else if (comp_ctx->cur_lvl < global.tune.comp_maxlevel && comp_ctx->cur_lvl < 1) {
		/* resources available again: step the level back up, capped at 1 */
		strm->level = ++comp_ctx->cur_lvl;
	}

	/* and that's all */
	return out_len;
}
/* Flushes the data accumulated by add_data() into <out> without emitting the
 * end-of-stream trailer. Returns the number of bytes emitted.
 */
static int rfc195x_flush(struct comp_ctx *comp_ctx, struct buffer *out)
{
	return rfc195x_flush_or_finish(comp_ctx, out, 0);
}
/* Flushes the data accumulated by add_data() into <out> and emits the
 * format-specific end-of-stream trailer. Returns the number of bytes emitted.
 */
static int rfc195x_finish(struct comp_ctx *comp_ctx, struct buffer *out)
{
	return rfc195x_flush_or_finish(comp_ctx, out, 1);
}
/* we just need to free the comp_ctx here, nothing was allocated
 * (SLZ streams hold no private memory, unlike zlib's). Always returns 0.
 */
static int rfc195x_end(struct comp_ctx **comp_ctx)
{
	deinit_comp_ctx(comp_ctx);
	return 0;
}
#elif defined(USE_ZLIB) /* ! USE_SLZ */
/*
* This is a tricky allocation function using the zlib.
* This is based on the allocation order in deflateInit2.
@ -719,6 +875,10 @@ static struct sample_fetch_kw_list sample_fetch_keywords = {ILH, {
/* Startup initializer, run automatically before main() via the constructor
 * attribute: registers the compression ACL and sample-fetch keywords, and,
 * when built with SLZ, builds the library's static lookup tables first.
 */
__attribute__((constructor))
static void __comp_fetch_init(void)
{
#ifdef USE_SLZ
	/* SLZ needs its CRC and distance tables computed once per process
	 * before any stream can be encoded.
	 */
	slz_make_crc_table();
	slz_prepare_dist_table();
#endif
	acl_register_keywords(&acl_kws);
	sample_register_fetches(&sample_fetch_keywords);
}