set.c: reimplemented decode_base62_golomb using Knuth's coroutines

Since the combined base62+golomb decoder is still the most expensive
routine, I have to consider very clever tricks to give it a boost.

In the routine, its "master logic" is executed on behalf of the base62
decoder: it makes bits from the string and passes them on to the "slave"
golomb routine.  The slave routine has to maintain its own state (doing
q or doing r); after the bits are processed, it returns and base62 takes
over.  When the slave routine is invoked again, it has to recover the
state and take the right path (q or r).  These seemingly cheap state
transitions can actually become relatively expensive, since the "if"
clause involves branch prediction which is not particularly accurate on
variable-length inputs.  This change demonstrates that it is possible to
get rid of the state-related instructions altogether.

Roughly, the idea is that, instead of calling putNbits(), we can invoke
"goto *putNbits", and the pointer will dispatch either to putNbitsQ or
putNbitsR label (we can do this with gcc's computed gotos).  However,
the goto will not return, and so the "putbits" guys will have to invoke
"goto getbits", and so on.  So it gets very similar to coroutines as
described in [Knuth 1997, vol. 1, p. 194].  Furthermore, one must
realize that computed gotos are not actually required: since the total
number of states is relatively small - roughly (q^r)x(reg^esc,align) -
it is possible to instantiate a few similar coroutines which pass
control directly to the right labels.

For example, the decoding is started with "get24q" coroutine - that is,
we're in the "Q" mode and we try to grab 24 bits (for the sake of the
example, I do not consider the initial align step).  If 24 bits are
obtained successfully, they are passed down to the "put24q" coroutine
which, as its name suggests, takes over in the "Q" mode immediately;
furthermore, in the "put24q" coroutine, the next call to get bits has to
be either "get24q" or "get24r" (depending on whether Q or R is processed
when no bits are left) - that is, the coroutine itself must "know" that
there are no base62 complications at this point.  The "get24r" is similar
to "get24q" except that it will invoke "put24r" instead of "put24q".  On
the other hand, consider that, in the beginning, only 12 bits have been
directly decoded (and the next 12 bits probably involve "Z").  We then
pass control to "put12q", which will in turn call either "get12q" or
"get12r" to handle irregular cases for the pending 12 bits (um, the
names "get12q" and "get12r" are a bit of a misnomer).

This change also removes another branch in golomb R->Q transition:

        r &= (1 << Mshift) - 1;
        *v++ = (q << Mshift) | r;
        q = 0;
        state = ST_VLEN;
-       if (left == 0)
-           return;
        bits >>= n - left;
        n = left;
    vlen:
        if (bits == 0) {
            q += n;
            return;
        }
        int vbits = __builtin_ffs(bits);
        ...

The first "no bits left" check is now removed and is performed implicitly
by the later "no need for bsf" check, with the result being far better
than I expected.  Perhaps it helps to understand that the condition
"left is exactly 0" rarely holds, yet the CPU still pays for the check.

So, the Q and R processing steps each now have exactly one branch (that is,
exactly one condition which completes the step).  Also, in the "put"
coroutines, I simply make a sequence of Q and R steps; this produces
a clean sequence of instructions which branches only when absolutely
necessary.

callgrind annotations for "apt-cache <<<unmet", previous commit:
2,671,717,564  PROGRAM TOTALS
1,059,874,219  lib/set.c:decode_base62_golomb
509,531,239  lib/set.c:rpmsetcmp

callgrind annotations for "apt-cache <<<unmet", this commit:
2,426,092,837  PROGRAM TOTALS
812,534,481  lib/set.c:decode_base62_golomb
509,531,239  lib/set.c:rpmsetcmp
This commit is contained in:
Alexey Tourbin 2012-03-01 21:24:43 +04:00
parent 63da57c20c
commit 568fe52e61

302
lib/set.c
View File

@ -335,7 +335,7 @@ int decode_golomb(int bitc, const char *bitv, int Mshift, unsigned *v)
}
// otherwise, incomplete value is not okay
if (bitc < Mshift)
return -10;
return -11;
// second part
unsigned r = 0;
int i;
@ -515,143 +515,193 @@ int decode_base62_golomb(const char *base62, int Mshift, unsigned *v)
unsigned q = 0;
unsigned r = 0;
int rfill = 0;
enum { ST_VLEN, ST_MBITS } state = ST_VLEN;
inline
void putNbits(unsigned c, int n)
{
if (state == ST_VLEN)
goto vlen;
r |= (c << rfill);
rfill += n;
rcheck: ;
int left = rfill - Mshift;
if (left < 0)
return;
r &= (1 << Mshift) - 1;
*v++ = (q << Mshift) | r;
q = 0;
state = ST_VLEN;
if (left == 0)
return;
c >>= n - left;
n = left;
vlen:
if (c == 0) {
q += n;
return;
}
int vbits = __builtin_ffs(c);
n -= vbits;
c >>= vbits;
q += vbits - 1;
r = c;
rfill = n;
state = ST_MBITS;
goto rcheck;
}
inline void put6bits(unsigned c) { putNbits(c, 6); }
inline void put10bits(unsigned c) { putNbits(c, 10); }
inline void put12bits(unsigned c) { putNbits(c, 12); }
inline void put24bits(unsigned c) { putNbits(c, 24); }
long c, w;
int n, vbits, left;
unsigned bits, morebits;
// need align
if (1 & (long) base62) {
long c = (unsigned char) *base62++;
int num6b = char_to_num[c];
if (num6b < 61) {
put6bits(num6b);
goto reg;
}
c = (unsigned char) *base62++;
bits = char_to_num[c];
if (bits < 61)
goto put6q_align;
else {
if (num6b == 0xff)
goto eol;
if (num6b == 0xee)
if (bits == 0xff)
goto eolq;
if (bits == 0xee)
return -1;
assert(num6b == 61);
goto esc;
assert(bits == 61);
goto esc1q;
}
}
// regular mode, process two-byte words
reg:
{
int num12b;
while (1) {
long w = *(unsigned short *) base62;
base62 += 2;
num12b = word_to_num[w];
if (num12b >= 0x1000)
break;
w = *(unsigned short *) base62;
base62 += 2;
int num12x = word_to_num[w];
if (num12x >= 0x1000) {
put12bits(num12b);
num12b = num12x;
break;
}
put24bits(num12b | (num12x << 12));
}
switch (num12b & 0xf000) {
case W_AZ:
put6bits(num12b & 0x0fff);
goto esc;
case W_ZA:
put10bits(num12b & 0x0fff);
goto reg;
case W_A0:
put6bits(num12b & 0x0fff);
goto eol;
case W_0X:
goto eol;
default:
return -1;
}
#define Get24(X) \
w = *(unsigned short *) base62; \
base62 += 2; \
bits = word_to_num[w]; \
if (bits >= 0x1000) \
goto gotNN ## X; \
w = *(unsigned short *) base62; \
base62 += 2; \
morebits = word_to_num[w]; \
if (morebits >= 0x1000) \
goto put12 ## X; \
bits |= (morebits << 12); \
goto put24 ## X
#define Get12(X) \
bits = morebits
#define GotNN(X) \
switch (bits & 0xf000) { \
case W_AZ: \
bits &= 0x0fff; \
goto put6 ## X ## _AZ; \
case W_ZA: \
bits &= 0x0fff; \
goto put10 ## X ## _ZA; \
case W_A0: \
bits &= 0x0fff; \
goto put6 ## X ## _A0; \
case W_0X: \
goto eol ## X; \
default: \
return -2; \
}
// make coroutines
get24q: Get24(q);
get24r: Get24(r);
get12q: Get12(q);
gotNNq: GotNN(q);
get12r: Get12(r);
gotNNr: GotNN(r);
// escape mode, handle 2 bytes one by one
esc:
{
// 1
int num6b = 61;
long c = (unsigned char) *base62++;
int num4b = char_to_num[c];
if (num4b == 0xff)
return -2;
if (num4b == 0xee)
return -3;
switch (num4b & (16 + 32)) {
case 0:
break;
case 16:
num6b = 62;
num4b &= ~16;
break;
case 32:
num6b = 63;
num4b &= ~32;
break;
default:
return -4;
}
put10bits(num6b | (num4b << 6));
// 2
c = (unsigned char) *base62++;
num6b = char_to_num[c];
if (num6b < 61) {
put6bits(num6b);
goto reg;
}
else {
if (num6b == 0xff)
goto eol;
if (num6b == 0xee)
return -1;
assert(num6b == 61);
goto esc;
}
#define Esc1(X) \
bits = 61; \
c = (unsigned char) *base62++; \
morebits = char_to_num[c]; \
if (morebits == 0xff) \
return -3; \
if (morebits == 0xee) \
return -4; \
switch (morebits & (16 + 32)) { \
case 0: \
break; \
case 16: \
bits = 62; \
morebits &= ~16; \
break; \
case 32: \
bits = 63; \
morebits &= ~32; \
break; \
default: \
return -5; \
} \
bits |= (morebits << 6); \
goto put10 ## X ## _esc1
#define Esc2(X) \
c = (unsigned char) *base62++; \
bits = char_to_num[c]; \
if (bits < 61) \
goto put6 ## X ## _esc2; \
else { \
if (bits == 0xff) \
goto eol ## X; \
if (bits == 0xee) \
return -6; \
goto esc1 ## X; \
}
eol:
if (state != ST_VLEN || q > 5)
// make coroutines
esc1q: Esc1(q);
esc2q: Esc2(q);
esc1r: Esc1(r);
esc2r: Esc2(r);
// golomb pieces
#define QInit(N) \
n = N
#define RInit(N) \
n = N; \
r |= (bits << rfill); \
rfill += n
#define RMake(Get) \
left = rfill - Mshift; \
if (left < 0) \
goto Get ## r; \
r &= (1 << Mshift) - 1; \
*v++ = (q << Mshift) | r; \
q = 0; \
bits >>= n - left; \
n = left
#define QMake(Get) \
if (bits == 0) { \
q += n; \
goto Get ## q; \
} \
vbits = __builtin_ffs(bits); \
n -= vbits; \
bits >>= vbits; \
q += vbits - 1; \
r = bits; \
rfill = n
// this assumes that minimum Mshift value is 7
#define Put24Q(Get) \
QInit(24); \
QMake(Get); RMake(Get); \
QMake(Get); RMake(Get); \
QMake(Get); RMake(Get); \
goto Get ## q
#define Put24R(Get) \
RInit(24); \
RMake(Get); \
QMake(Get); RMake(Get); \
QMake(Get); RMake(Get); \
QMake(Get); goto Get ## r
#define Put12Q(Get) \
QInit(12); \
QMake(Get); RMake(Get); \
QMake(Get); goto Get ## r
#define Put12R(Get) \
RInit(12); \
RMake(Get); \
QMake(Get); RMake(Get); \
QMake(Get); goto Get ## r
#define Put10Q(Get) \
QInit(10); \
QMake(Get); RMake(Get); \
QMake(Get); goto Get ## r
#define Put10R(Get) \
RInit(10); \
RMake(Get); \
QMake(Get); RMake(Get); \
QMake(Get); goto Get ## r
#define Put6Q(Get) \
QInit(6); \
QMake(Get); goto Get ## r
#define Put6R(Get) \
RInit(6); \
RMake(Get); \
QMake(Get); goto Get ## r
// make coroutines
put24q: Put24Q(get24);
put24r: Put24R(get24);
put12q: Put12Q(get12);
put12r: Put12R(get12);
put6q_align:
put6q_esc2: Put6Q(get24);
put6r_esc2: Put6R(get24);
put6q_AZ: Put6Q(esc1);
put6r_AZ: Put6R(esc1);
put10q_esc1: Put10Q(esc2);
put10r_esc1: Put10R(esc2);
put10q_ZA: Put10Q(get24);
put10r_ZA: Put10R(get24);
put6q_A0: Put6Q(eol);
put6r_A0: Put6R(eol);
// handle end of line and return
eolq:
if (q > 5)
return -10;
return v - v_start;
eolr:
return -11;
}
#ifdef SELF_TEST