/*
 * set.c - base62, golomb and set-string routines
 *
 * Copyright (C) 2010, 2011, 2012 Alexey Tourbin <at@altlinux.org>
 *
 * License: GPLv2+ or LGPL, see RPM COPYING
*/
#ifdef SELF_TEST
#undef NDEBUG
#include <stdio.h>
#endif
#include <string.h>
#include <stdlib.h>
#include <assert.h>
/*
 * Base62 routines - encode bits with alnum characters.
 *
 * This is a base64-based base62 implementation.  Values 0..61 are encoded
 * with '0'..'9', 'a'..'z', and 'A'..'Z'.  However, 'Z' is special: it will
 * also encode 62 and 63.  To achieve this, 'Z' will occupy two high bits in
 * the next character.  Thus 'Z' can be interpreted as an escape character
 * (which indicates that the next character must be handled specially).
 * Note that setting high bits to "00", "01" or "10" cannot contribute
 * to another 'Z' (which would require high bits set to "11").  This is
 * how multiple escapes are avoided.
 */
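/*
 * Worked example (added for illustration): a 6-bit group with value 62
 * cannot become a character of its own, so the encoder emits 'Z' and
 * presets the two high bits of the next character to "01"; that character
 * then encodes 16 plus the following four data bits.  Since the preset
 * high bits are never "11", the value of that next character stays below
 * 61 and can never be another 'Z' escape.
 */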
// Estimate base62 buffer size required to encode a given number of bits.
static inline
int encode_base62_size(int bitc)
{
	// In the worst case, which is ZxZxZx..., five bits can make a character;
	// the remaining bits can make a character, too.  And the string must be
	// null-terminated.
	return bitc / 5 + 2;
}
static
char *
put_digit(int c, char *base62)
{
	assert(c >= 0 && c <= 61);
	if (c < 10)
		*base62++ = c + '0';
	else if (c < 36)
		*base62++ = c - 10 + 'a';
	else if (c < 62)
		*base62++ = c - 36 + 'A';
	return base62;
}
// Main base62 encoding routine: pack bitv into base62 string.
static
int encode_base62(int bitc, const char *bitv, char *base62)
{
	char *base62_start = base62;
	int bits2 = 0; // number of high bits set
	int bits6 = 0; // number of regular bits set
	int num6b = 0; // pending 6-bit number
	while (bitc-- > 0) {
		num6b |= (*bitv++ << bits6++);
		if (bits6 + bits2 < 6)
			continue;
		switch (num6b) {
		case 61:
			// escape
			base62 = put_digit(61, base62);
			// extra "00...." high bits (in the next character)
			bits2 = 2;
			bits6 = 0;
			num6b = 0;
			break;
		case 62:
			base62 = put_digit(61, base62);
			// extra "01...." high bits
			bits2 = 2;
			bits6 = 0;
			num6b = 16;
			break;
		case 63:
			base62 = put_digit(61, base62);
			// extra "10...." high bits
			bits2 = 2;
			bits6 = 0;
			num6b = 32;
			break;
		default:
			assert(num6b < 61);
			base62 = put_digit(num6b, base62);
			bits2 = 0;
			bits6 = 0;
			num6b = 0;
			break;
		}
	}
	if (bits6 + bits2) {
		assert(num6b < 61);
		base62 = put_digit(num6b, base62);
	}
	*base62 = '\0';
	return base62 - base62_start;
}
// Estimate how many bits will result from decoding a base62 string.
static inline
int decode_base62_size(int len)
{
	// Each character will fill at most 6 bits.
	return len * 6;
}
// This table maps alnum characters to their numeric values.
static
const int char_to_num[256] = {
	[0 ... 255] = 0xee,
	[0] = 0xff,
#define C1(c, b) [c] = c - b
#define C2(c, b) C1(c, b), C1(c + 1, b)
#define C5(c, b) C1(c, b), C2(c + 1, b), C2(c + 3, b)
#define C10(c, b) C5(c, b), C5(c + 5, b)
	C10('0', '0'),
#define C26(c, b) C1(c, b), C5(c + 1, b), C10(c + 6, b), C10(c + 16, b)
	C26('a', 'a' - 10),
	C26('A', 'A' - 36),
};
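// For illustration (added note): with the initializers above,
// char_to_num['0'] == 0, char_to_num['a'] == 10 and char_to_num['Z'] == 61,
// while the terminating '\0' maps to 0xff and any non-alnum byte to 0xee.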
static
char *
put6bits(int c, char *bitv)
{
	*bitv++ = (c >> 0) & 1;
	*bitv++ = (c >> 1) & 1;
	*bitv++ = (c >> 2) & 1;
	*bitv++ = (c >> 3) & 1;
	*bitv++ = (c >> 4) & 1;
	*bitv++ = (c >> 5) & 1;
	return bitv;
}
static
char *
put4bits(int c, char *bitv)
{
	*bitv++ = (c >> 0) & 1;
	*bitv++ = (c >> 1) & 1;
	*bitv++ = (c >> 2) & 1;
	*bitv++ = (c >> 3) & 1;
	return bitv;
}
// Main base62 decoding routine: unpack base62 string into bitv[].
static
int decode_base62(const char *base62, char *bitv)
{
	char *bitv_start = bitv;
	while (1) {
		long c = (unsigned char) *base62++;
		int num6b = char_to_num[c];
		while (num6b < 61) {
			bitv = put6bits(num6b, bitv);
			c = (unsigned char) *base62++;
			num6b = char_to_num[c];
		}
		if (num6b == 0xff)
			break;
		if (num6b == 0xee)
			return -1;
		assert(num6b == 61);
		c = (unsigned char) *base62++;
		int num4b = char_to_num[c];
		if (num4b == 0xff)
			return -2;
		if (num4b == 0xee)
			return -3;
		switch (num4b & (16 + 32)) {
		case 0:
			break;
		case 16:
			num6b = 62;
			num4b &= ~16;
			break;
		case 32:
			num6b = 63;
			num4b &= ~32;
			break;
		default:
			return -4;
		}
		bitv = put6bits(num6b, bitv);
		bitv = put4bits(num4b, bitv);
	}
	return bitv - bitv_start;
}
#ifdef SELF_TEST
static
void test_base62()
{
	const char rnd_bitv[] = {
		1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1,
		1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1,
		0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0,
		0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0,
		// trigger some 'Z'
		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	};
	const int rnd_bitc = sizeof rnd_bitv;
	// encode
	char base62[encode_base62_size(rnd_bitc)];
	int len = encode_base62(rnd_bitc, rnd_bitv, base62);
	assert(len > 0);
	assert(len == (int) strlen(base62));
	fprintf(stderr, "len=%d base62=%s\n", len, base62);
	// The length cannot be shorter than 6 bits per symbol.
	assert(len >= rnd_bitc / 6);
	// Neither too long: each second character must fill at least 4 bits.
	assert(len <= rnd_bitc / 2 / 4 + rnd_bitc / 2 / 6 + 1);
	// decode
	char bitv[decode_base62_size(len)];
	int bitc = decode_base62(base62, bitv);
	fprintf(stderr, "rnd_bitc=%d bitc=%d\n", rnd_bitc, bitc);
	assert(bitc >= rnd_bitc);
	// Decoded bits must match.
	int i;
	for (i = 0; i < rnd_bitc; i++)
		assert(rnd_bitv[i] == bitv[i]);
	// The remaining bits must be zero bits.
	for (i = rnd_bitc; i < bitc; i++)
		assert(bitv[i] == 0);
	fprintf(stderr, "%s: base62 test OK\n", __FILE__);
}
#endif
/*
 * Golomb-Rice routines - compress integer values into bits.
 *
 * The idea is as follows.  Input values are assumed to be small integers.
 * Each value is split into two parts: an integer resulting from its higher
 * bits and an integer resulting from its lower bits (with the number of lower
 * bits specified by the Mshift parameter).  The first integer is then stored
 * in unary coding (which is a variable-length sequence of '0' followed by a
 * terminating '1'); the second part is stored in normal binary coding (using
 * Mshift bits).
 *
 * The method is justified by the fact that, since most of the values are
 * small, their first parts will be short (typically 1..3 bits).  In particular,
 * the method is known to be optimal for uniformly distributed hash values,
 * after the values are sorted and delta-encoded.  See e.g.
 * Putze, F.; Sanders, P.; Singler, J. (2007),
 * "Cache-, Hash- and Space-Efficient Bloom Filters",
 * http://algo2.iti.uni-karlsruhe.de/singler/publications/cacheefficientbloomfilters-wea2007.pdf
 */
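/*
 * Worked example (added for illustration): with Mshift = 7, the value 300
 * splits into q = 300 >> 7 = 2 and r = 300 & 127 = 44; it is then stored
 * as the unary prefix "001" (two zeroes and the terminating one) followed
 * by the 7 low bits of r, i.e. q + 1 + Mshift = 10 bits in total.
 */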
static
int log2i(int n)
{
	int m = 0;
	while (n >>= 1)
		m++;
	return m;
}
// Calculate Mshift parameter for encoding.
static
int encode_golomb_Mshift(int c, int bpp)
{
	// XXX Slightly better Mshift estimations are probably possible.
	// Recheck "Compression and coding algorithms" by Moffat & Turpin.
	int Mshift = bpp - log2i(c) - 1;
	// Adjust out-of-range values.
	if (Mshift < 7)
		Mshift = 7;
	if (Mshift > 31)
		Mshift = 31;
	assert(Mshift < bpp);
	return Mshift;
}
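// Worked example for the routine above (added for illustration): with
// c = 1000 values at bpp = 16 bits per value, log2i(1000) == 9, so the
// estimate is 16 - 9 - 1 = 6, which the range check then raises to 7.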
// Estimate how many bits can be filled up.
static inline
int encode_golomb_size(int c, int Mshift)
{
	// XXX No precise estimation.  However, we do not expect unary-encoded bits
	// to take more than binary-encoded Mshift bits.
	return Mshift * 2 * c + 16;
}
// Main golomb encoding routine: package integers into bits.
static
int encode_golomb(int c, const unsigned *v, int Mshift, char *bitv)
{
	char *bitv_start = bitv;
	const unsigned mask = (1 << Mshift) - 1;
	while (c > 0) {
		c--;
		unsigned v0 = *v++;
		int i;
		// first part: variable-length sequence
		unsigned q = v0 >> Mshift;
		for (i = 0; i < (int) q; i++)
			*bitv++ = 0;
		*bitv++ = 1;
		// second part: lower Mshift bits
		unsigned r = v0 & mask;
		for (i = 0; i < Mshift; i++)
			*bitv++ = (r >> i) & 1;
	}
	return bitv - bitv_start;
}
// Estimate how many values will emerge.
static inline
int decode_golomb_size(int bitc, int Mshift)
{
	// Each (Mshift + 1) bits can make a value.
	// The remaining bits cannot make a value, though.
	return bitc / (Mshift + 1);
}
// Main golomb decoding routine: unpackage bits into values.
static
int decode_golomb(int bitc, const char *bitv, int Mshift, unsigned *v)
{
	unsigned *v_start = v;
	// next value
	while (bitc > 0) {
		// first part
		unsigned q = 0;
		char bit = 0;
		while (bitc > 0) {
			bitc--;
			bit = *bitv++;
			if (bit == 0)
				q++;
			else
				break;
		}
		// trailing zero bits in the input are okay
		if (bitc == 0 && bit == 0) {
			// up to 5 bits can be used to complete last character
			if (q > 5)
				return -10;
			break;
		}
		// otherwise, incomplete value is not okay
		if (bitc < Mshift)
			return -11;
		// second part
		unsigned r = 0;
		int i;
		for (i = 0; i < Mshift; i++) {
			bitc--;
			if (*bitv++)
				r |= (1 << i);
		}
		// the value
		*v++ = (q << Mshift) | r;
	}
	return v - v_start;
}
#ifdef SELF_TEST
static
void test_golomb()
{
	const unsigned rnd_v[] = {
		// do re mi fa sol la si
		1, 2, 3, 4, 5, 6, 7,
		// koshka sela na taksi
		7, 6, 5, 4, 3, 2, 1,
	};
	const int rnd_c = sizeof rnd_v / sizeof *rnd_v;
	int bpp = 10;
	int Mshift = encode_golomb_Mshift(rnd_c, bpp);
	fprintf(stderr, "rnd_c=%d bpp=%d Mshift=%d\n", rnd_c, bpp, Mshift);
	assert(Mshift > 0);
	assert(Mshift < bpp);
	// encode
	int alloc_bitc = encode_golomb_size(rnd_c, Mshift);
	assert(alloc_bitc > rnd_c);
	char bitv[alloc_bitc];
	int bitc = encode_golomb(rnd_c, rnd_v, Mshift, bitv);
	fprintf(stderr, "alloc_bitc=%d bitc=%d\n", alloc_bitc, bitc);
	assert(bitc > rnd_c);
	assert(bitc <= alloc_bitc);
	// decode
	int alloc_c = decode_golomb_size(bitc, Mshift);
	assert(alloc_c >= rnd_c);
	unsigned v[alloc_c];
	int c = decode_golomb(bitc, bitv, Mshift, v);
	fprintf(stderr, "rnd_c=%d alloc_c=%d c=%d\n", rnd_c, alloc_c, c);
	assert(alloc_c >= c);
	// Decoded values must match.
	assert(rnd_c == c);
	int i;
	for (i = 0; i < c; i++)
		assert(rnd_v[i] == v[i]);
	// At the end of the day, did it save your money?
	int golomb_bpp = bitc / c;
	fprintf(stderr, "bpp=%d golomb_bpp=%d\n", bpp, golomb_bpp);
	assert(golomb_bpp < bpp);
	fprintf(stderr, "%s: golomb test OK\n", __FILE__);
}
#endif
/*
 * Combined base62+golomb decoding routine - implemented for efficiency.
 *
 * As Dmitry V. Levin once noticed, when it comes to speed, very few objections
 * can be made against complicating the code.  Which reminds me of Karl Marx,
 * who said that there is not a crime at which a capitalist will scruple for
 * the sake of 300 per cent profit, even at the chance of being hanged.  Anyway,
 * here Alexey Tourbin demonstrates that by using sophisticated - or should he
 * say "ridiculously complicated" - techniques it is indeed possible to gain
 * some profit, albeit of another kind.
 */
// Word types (when two bytes from base62 string cast to unsigned short).
enum {
	W_AA = 0x0000,
	W_AZ = 0x1000,
	W_ZA = 0x2000,
	W_A0 = 0x3000,
	W_0X = 0x4000,
	W_EE = 0xeeee,
};
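// In the table below, W_AA marks two regular alnum characters, W_AZ an
// alnum character followed by the 'Z' escape, W_ZA the escape followed by
// its continuation character, W_A0 an alnum character followed by the
// terminating NUL, W_0X a word whose first byte is already the NUL
// terminator (end of input), and W_EE anything invalid.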
// Combine two characters into array index (with respect to endianness).
#include <sys/types.h>
#if BYTE_ORDER && BYTE_ORDER == LITTLE_ENDIAN
#define CCI(c1, c2) ((c1) | ((c2) << 8))
#elif BYTE_ORDER && BYTE_ORDER == BIG_ENDIAN
#define CCI(c1, c2) ((c2) | ((c1) << 8))
#else
#error "unknown byte order"
#endif
// Maps base62 word into numeric value (decoded bits) ORed with word type.
static
const unsigned short word_to_num[65536] = {
	[0 ... 65535] = W_EE,
#define AA1(c1, c2, b1, b2) [CCI(c1, c2)] = (c1 - b1) | ((c2 - b2) << 6)
#define AA1x2(c1, c2, b1, b2) AA1(c1, c2, b1, b2), AA1(c1, c2 + 1, b1, b2)
#define AA1x3(c1, c2, b1, b2) AA1(c1, c2, b1, b2), AA1x2(c1, c2 + 1, b1, b2)
#define AA1x5(c1, c2, b1, b2) AA1x2(c1, c2, b1, b2), AA1x3(c1, c2 + 2, b1, b2)
#define AA1x10(c1, c2, b1, b2) AA1x5(c1, c2, b1, b2), AA1x5(c1, c2 + 5, b1, b2)
#define AA1x20(c1, c2, b1, b2) AA1x10(c1, c2, b1, b2), AA1x10(c1, c2 + 10, b1, b2)
#define AA1x25(c1, c2, b1, b2) AA1x5(c1, c2, b1, b2), AA1x20(c1, c2 + 5, b1, b2)
#define AA2x1(c1, c2, b1, b2) AA1(c1, c2, b1, b2), AA1(c1 + 1, c2, b1, b2)
#define AA3x1(c1, c2, b1, b2) AA1(c1, c2, b1, b2), AA2x1(c1 + 1, c2, b1, b2)
#define AA5x1(c1, c2, b1, b2) AA2x1(c1, c2, b1, b2), AA3x1(c1 + 2, c2, b1, b2)
#define AA10x1(c1, c2, b1, b2) AA5x1(c1, c2, b1, b2), AA5x1(c1 + 5, c2, b1, b2)
#define AA20x1(c1, c2, b1, b2) AA10x1(c1, c2, b1, b2), AA10x1(c1 + 10, c2, b1, b2)
#define AA25x1(c1, c2, b1, b2) AA5x1(c1, c2, b1, b2), AA20x1(c1 + 5, c2, b1, b2)
#define AA26x1(c1, c2, b1, b2) AA1(c1, c2, b1, b2), AA25x1(c1 + 1, c2, b1, b2)
#define AA2x5(c1, c2, b1, b2) AA1x5(c1, c2, b1, b2), AA1x5(c1 + 1, c2, b1, b2)
#define AA3x5(c1, c2, b1, b2) AA1x5(c1, c2, b1, b2), AA2x5(c1 + 1, c2, b1, b2)
#define AA5x5(c1, c2, b1, b2) AA2x5(c1, c2, b1, b2), AA3x5(c1 + 2, c2, b1, b2)
#define AA5x10(c1, c2, b1, b2) AA5x5(c1, c2, b1, b2), AA5x5(c1, c2 + 5, b1, b2)
#define AA10x5(c1, c2, b1, b2) AA5x5(c1, c2, b1, b2), AA5x5(c1 + 5, c2, b1, b2)
#define AA20x5(c1, c2, b1, b2) AA10x5(c1, c2, b1, b2), AA10x5(c1 + 10, c2, b1, b2)
#define AA25x5(c1, c2, b1, b2) AA5x5(c1, c2, b1, b2), AA20x5(c1 + 5, c2, b1, b2)
#define AA10x10(c1, c2, b1, b2) AA5x10(c1, c2, b1, b2), AA5x10(c1 + 5, c2, b1, b2)
#define AA10x20(c1, c2, b1, b2) AA10x10(c1, c2, b1, b2), AA10x10(c1, c2 + 10, b1, b2)
#define AA10x25(c1, c2, b1, b2) AA10x5(c1, c2, b1, b2), AA10x20(c1, c2 + 5, b1, b2)
#define AA10x26(c1, c2, b1, b2) AA10x1(c1, c2, b1, b2), AA10x25(c1, c2 + 1, b1, b2)
#define AA20x10(c1, c2, b1, b2) AA10x10(c1, c2, b1, b2), AA10x10(c1 + 10, c2, b1, b2)
#define AA25x10(c1, c2, b1, b2) AA5x10(c1, c2, b1, b2), AA20x10(c1 + 5, c2, b1, b2)
#define AA26x10(c1, c2, b1, b2) AA1x10(c1, c2, b1, b2), AA25x10(c1 + 1, c2, b1, b2)
#define AA25x20(c1, c2, b1, b2) AA25x10(c1, c2, b1, b2), AA25x10(c1, c2 + 10, b1, b2)
#define AA25x25(c1, c2, b1, b2) AA25x5(c1, c2, b1, b2), AA25x20(c1, c2 + 5, b1, b2)
#define AA25x26(c1, c2, b1, b2) AA25x1(c1, c2, b1, b2), AA25x25(c1, c2 + 1, b1, b2)
#define AA26x25(c1, c2, b1, b2) AA1x25(c1, c2, b1, b2), AA25x25(c1 + 1, c2, b1, b2)
#define AA26x26(c1, c2, b1, b2) AA26x1(c1, c2, b1, b2), AA26x25(c1, c2 + 1, b1, b2)
	AA10x10('0', '0', '0', '0'),
	AA10x26('0', 'a', '0', 'a' - 10),
	AA10x25('0', 'A', '0', 'A' - 36),
	AA26x10('a', '0', 'a' - 10, '0'),
	AA25x10('A', '0', 'A' - 36, '0'),
	AA26x26('a', 'a', 'a' - 10, 'a' - 10),
	AA26x25('a', 'A', 'a' - 10, 'A' - 36),
	AA25x26('A', 'a', 'A' - 36, 'a' - 10),
	AA25x25('A', 'A', 'A' - 36, 'A' - 36),
#define AZ1(c, b) [CCI(c, 'Z')] = (c - b) | W_AZ
#define AZ2(c, b) AZ1(c, b), AZ1(c + 1, b)
#define AZ5(c, b) AZ1(c, b), AZ2(c + 1, b), AZ2(c + 3, b)
#define AZ10(c, b) AZ5(c, b), AZ5(c + 5, b)
#define AZ25(c, b) AZ5(c, b), AZ10(c + 5, b), AZ10(c + 15, b)
#define AZ26(c, b) AZ1(c, b), AZ25(c + 1, b)
	AZ10('0', '0'),
	AZ26('a', 'a' - 10),
	AZ25('A', 'A' - 36),
#define ZA1(c, b) [CCI('Z', c)] = (61 + ((c - b) >> 4)) | (((c - b) & 0xf) << 6) | W_ZA
#define ZA2(c, b) ZA1(c, b), ZA1(c + 1, b)
#define ZA5(c, b) ZA1(c, b), ZA2(c + 1, b), ZA2(c + 3, b)
#define ZA10(c, b) ZA5(c, b), ZA5(c + 5, b)
#define ZA25(c, b) ZA5(c, b), ZA10(c + 5, b), ZA10(c + 15, b)
#define ZA26(c, b) ZA1(c, b), ZA25(c + 1, b)
	ZA10('0', '0'),
	ZA26('a', 'a' - 10),
	ZA25('A', 'A' - 36),
#define A01(c, b) [CCI(c, 0)] = (c - b) | W_A0
#define A02(c, b) A01(c, b), A01(c + 1, b)
#define A05(c, b) A01(c, b), A02(c + 1, b), A02(c + 3, b)
#define A010(c, b) A05(c, b), A05(c + 5, b)
#define A025(c, b) A05(c, b), A010(c + 5, b), A010(c + 15, b)
#define A026(c, b) A01(c, b), A025(c + 1, b)
	A010('0', '0'),
	A026('a', 'a' - 10),
	A025('A', 'A' - 36),
#define OX(c) [CCI(0, c)] = W_0X
#define OX4(c) OX(c), OX(c + 1), OX(c + 2), OX(c + 3)
#define OX16(c) OX4(c), OX4(c + 4), OX4(c + 8), OX4(c + 12)
#define OX64(c) OX16(c), OX16(c + 16), OX16(c + 32), OX16(c + 48)
#define OX256(c) OX64(c), OX64(c + 64), OX64(c + 128), OX64(c + 192)
	OX256('\0'),
};
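// Illustrative sketch (not from the original source; the helper name is
// hypothetical and it is never called - it only documents the lookup):
// two adjacent base62 characters are read as one 16-bit word and looked up
// in word_to_num, yielding the decoded bits plus the word type in a single
// step.  For the pair "10" the entry is ('1' - '0') | (('0' - '0') << 6) = 1
// with type W_AA on either byte order, because CCI mirrors the in-memory
// layout of the two bytes.
#ifdef SELF_TEST
static void example_word_lookup(void)
{
	unsigned short w;
	memcpy(&w, "10", 2);	// plain copy instead of the unaligned cast
	assert((word_to_num[w] & 0xf000) == W_AA);
	assert((word_to_num[w] & 0x0fff) == 1);
}
#endif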
// Combined base62+golomb decoding routine.
static
int decode_base62_golomb(const char *base62, int Mshift, unsigned *v)
{
	unsigned *v_start = v;
	unsigned mask = (1 << Mshift) - 1;
	unsigned q = 0;
	unsigned r = 0;
	int rfill = 0;
	long c, w;
	int n, vbits, left;
	unsigned bits, morebits;
	// need align
	if (1 & (long) base62) {
		c = (unsigned char) *base62++;
		bits = char_to_num[c];
		if (bits < 61)
			goto put6q_align;
else {
			if (bits == 0xff)
				goto eolq;
			if (bits == 0xee)
				return -1;
			assert(bits == 61);
			goto esc1q;
}
}
// regular mode, process two-byte words
#define Get24(X) \
	w = *(unsigned short *) base62; \
	base62 += 2; \
	bits = word_to_num[w]; \
	if (bits >= 0x1000) \
		goto gotNN##X; \
	w = *(unsigned short *) base62; \
	base62 += 2; \
	morebits = word_to_num[w]; \
	if (morebits >= 0x1000) \
		goto put12##X; \
	bits |= (morebits << 12); \
	goto put24##X
#define Get12(X) \
	bits = morebits
#define GotNN(X) \
	switch (bits & 0xf000) { \
	case W_AZ: \
		bits &= 0x0fff; \
		goto put6##X##_AZ; \
	case W_ZA: \
		bits &= 0x0fff; \
		goto put10##X##_ZA; \
	case W_A0: \
		bits &= 0x0fff; \
		goto put6##X##_A0; \
	case W_0X: \
		goto eol##X; \
	default: \
		return -2; \
}
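	// The labels below form Knuth-style coroutines (cf. TAOCP vol. 1,
	// p. 194): each getNN* label fetches the next chunk of bits from the
	// base62 string and jumps straight to a putNN* label that already
	// knows whether the unary part (q) or the binary part (r) of a value
	// is being decoded, so no explicit q/r state variable is consulted.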
	// make coroutines
get24q:	Get24(q);
get24r:	Get24(r);
get12q:	Get12(q);
gotNNq:	GotNN(q);
get12r:	Get12(r);
gotNNr:	GotNN(r);
// escape mode, handle 2 bytes one by one
# define Esc1(X) \
bits = 61 ; \
c = ( unsigned char ) * base62 + + ; \
morebits = char_to_num [ c ] ; \
if ( morebits = = 0xff ) \
return - 3 ; \
if ( morebits = = 0xee ) \
return - 4 ; \
switch ( morebits & ( 16 + 32 ) ) { \
case 0 : \
break ; \
case 16 : \
bits = 62 ; \
morebits & = ~ 16 ; \
break ; \
case 32 : \
bits = 63 ; \
morebits & = ~ 32 ; \
break ; \
default : \
return - 5 ; \
} \
bits | = ( morebits < < 6 ) ; \
goto put10 # # X # # _esc1
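A worked example of the arithmetic above (illustration only; 'b' is assumed to map to 11 in char_to_num, following the '0'..'9', 'a'..'z', 'A'..'Z' encoding order): for the input pair "Zb", the 'Z' stands for 61 and the following 'b' supplies four extra bits, so put10*_esc1 receives ten bits in total.
#include <assert.h>
int main(void)
{
    int bits = 61;                        /* default value carried by 'Z' */
    int morebits = 11;                    /* char_to_num['b'] (assumed) */
    assert((morebits & (16 + 32)) == 0);  /* high flag bits clear: keep 61 */
    bits |= morebits << 6;
    assert(bits == (11 << 6 | 61));       /* == 765, ten significant bits */
    return 0;
}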
# define Esc2(X) \
c = ( unsigned char ) * base62 + + ; \
bits = char_to_num [ c ] ; \
if ( bits < 61 ) \
goto put6 # # X # # _esc2 ; \
else { \
if ( bits = = 0xff ) \
goto eol # # X ; \
if ( bits = = 0xee ) \
return - 6 ; \
goto esc1 # # X ; \
2011-01-03 02:57:02 +03:00
}
2012-03-01 21:24:43 +04:00
// make coroutines
esc1q : Esc1 ( q ) ;
esc2q : Esc2 ( q ) ;
esc1r : Esc1 ( r ) ;
esc2r : Esc2 ( r ) ;
// golomb pieces
# define QInit(N) \
n = N
# define RInit(N) \
n = N ; \
r | = ( bits < < rfill ) ; \
rfill + = n
# define RMake(Get) \
left = rfill - Mshift ; \
if ( left < 0 ) \
goto Get # # r ; \
set.c: precompute r mask for putbits coroutines
callgrind annotations for "apt-shell <<<unmet", previous commit:
2,424,712,279 PROGRAM TOTALS
813,389,804 lib/set.c:decode_base62_golomb
496,701,778 lib/set.c:rpmsetcmp
callgrind annotations for "apt-shell <<<unmet", this commit:
2,406,630,571 PROGRAM TOTALS
795,320,289 lib/set.c:decode_base62_golomb
496,682,547 lib/set.c:rpmsetcmp
2012-03-08 22:27:33 +04:00
r & = mask ; \
2012-03-01 21:24:43 +04:00
* v + + = ( q < < Mshift ) | r ; \
q = 0 ; \
bits > > = n - left ; \
n = left
# define QMake(Get) \
if ( bits = = 0 ) { \
q + = n ; \
goto Get # # q ; \
} \
vbits = __builtin_ffs ( bits ) ; \
n - = vbits ; \
bits > > = vbits ; \
q + = vbits - 1 ; \
r = bits ; \
rfill = n
// this assumes that the minimum Mshift value is 7
# define Put24Q(Get) \
QInit ( 24 ) ; \
QMake ( Get ) ; RMake ( Get ) ; \
QMake ( Get ) ; RMake ( Get ) ; \
QMake ( Get ) ; RMake ( Get ) ; \
goto Get # # q
# define Put24R(Get) \
RInit ( 24 ) ; \
RMake ( Get ) ; \
QMake ( Get ) ; RMake ( Get ) ; \
QMake ( Get ) ; RMake ( Get ) ; \
QMake ( Get ) ; goto Get # # r
# define Put12Q(Get) \
QInit ( 12 ) ; \
QMake ( Get ) ; RMake ( Get ) ; \
QMake ( Get ) ; goto Get # # r
# define Put12R(Get) \
RInit ( 12 ) ; \
RMake ( Get ) ; \
QMake ( Get ) ; RMake ( Get ) ; \
QMake ( Get ) ; goto Get # # r
# define Put10Q(Get) \
QInit ( 10 ) ; \
QMake ( Get ) ; RMake ( Get ) ; \
QMake ( Get ) ; goto Get # # r
# define Put10R(Get) \
RInit ( 10 ) ; \
RMake ( Get ) ; \
QMake ( Get ) ; RMake ( Get ) ; \
QMake ( Get ) ; goto Get # # r
# define Put6Q(Get) \
QInit ( 6 ) ; \
QMake ( Get ) ; goto Get # # r
# define Put6R(Get) \
RInit ( 6 ) ; \
RMake ( Get ) ; \
QMake ( Get ) ; goto Get # # r
// make coroutines
put24q : Put24Q ( get24 ) ;
put24r : Put24R ( get24 ) ;
put12q : Put12Q ( get12 ) ;
put12r : Put12R ( get12 ) ;
put6q_align :
put6q_esc2 : Put6Q ( get24 ) ;
put6r_esc2 : Put6R ( get24 ) ;
put6q_AZ : Put6Q ( esc1 ) ;
put6r_AZ : Put6R ( esc1 ) ;
put10q_esc1 : Put10Q ( esc2 ) ;
put10r_esc1 : Put10R ( esc2 ) ;
put10q_ZA : Put10Q ( get24 ) ;
put10r_ZA : Put10R ( get24 ) ;
put6q_A0 : Put6Q ( eol ) ;
put6r_A0 : Put6R ( eol ) ;
// handle end of line and return
eolq :
if ( q > 5 )
2011-01-03 02:57:02 +03:00
return - 10 ;
return v - v_start ;
2012-03-01 21:24:43 +04:00
eolr :
return - 11 ;
2011-01-03 02:57:02 +03:00
}
2010-09-11 01:34:05 +04:00
# ifdef SELF_TEST
2011-01-03 02:57:02 +03:00
static
set.c: implemented two-bytes-at-a-time base62 decoding
callgrind annotations, 4.0.4-alt100.27:
1,899,576,194 PROGRAM TOTALS
694,132,522 decode_base62_golomb
583,376,772 rpmsetcmp
106,136,459 __GI_strcmp
102,459,362 __GI_strlen
...
callgrind annotations, this commit (built in hasher):
1,691,904,239 PROGRAM TOTALS
583,395,352 rpmsetcmp
486,433,168 decode_base62_golomb
106,122,657 __GI_strcmp
102,458,654 __GI_strlen
2011-05-27 06:36:14 +04:00
void test_word_table ( )
2010-09-11 01:34:05 +04:00
{
2011-05-27 06:36:14 +04:00
int i , j ;
for ( i = 0 ; i < 256 ; i + + )
for ( j = 0 ; j < 256 ; j + + ) {
unsigned char u [ 2 ] __attribute__ ( ( aligned ( 2 ) ) ) = { i , j } ;
unsigned short ix = * ( unsigned short * ) u ;
int w = word_to_num [ ix ] ;
if ( w < 0x1000 )
assert ( w = = ( char_to_num [ i ] | ( char_to_num [ j ] < < 6 ) ) ) ;
else
assert ( char_to_num [ i ] > = 61 | | char_to_num [ j ] > = 61 ) ;
}
fprintf ( stderr , " %s: word table test OK \n " , __FILE__ ) ;
2010-09-11 01:34:05 +04:00
}
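For reference, a hedged sketch of how a word table consistent with the invariant checked above could be filled (illustration only, not the actual set.c code; the _sketch names and the 0x1000 marker value are assumptions based on the test):
#include <string.h>
static unsigned short word_to_num_sketch[65536];
static void build_word_table_sketch(const unsigned char char_to_num[256])
{
    int i, j;
    for (i = 0; i < 256; i++)
        for (j = 0; j < 256; j++) {
            unsigned char u[2] = { i, j };
            unsigned short ix;
            memcpy(&ix, u, 2);  /* same native-endian 16-bit load as the decoder */
            if (char_to_num[i] < 61 && char_to_num[j] < 61)
                word_to_num_sketch[ix] = char_to_num[i] | (char_to_num[j] << 6);
            else
                word_to_num_sketch[ix] = 0x1000;  /* irregular pair: decode per char */
        }
}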
2011-01-03 02:57:02 +03:00
static
void test_base62_golomb ( )
{
const char str [ ] = " set:hdf7q2P5VZwtLGr9TKxhrEM1 " ;
const char * base62 = str + 4 + 2 ;
int Mshift = 10 ;
char bitv [ 256 ] ;
int bitc = decode_base62 ( base62 , bitv ) ;
assert ( bitc > 0 ) ;
unsigned v1 [ 32 ] , v2 [ 32 ] ;
int c1 = decode_golomb ( bitc , bitv , Mshift , v1 ) ;
assert ( c1 > 0 ) ;
int c2 = decode_base62_golomb ( base62 , Mshift , v2 ) ;
assert ( c2 > 0 ) ;
assert ( c1 = = c2 ) ;
int i ;
for ( i = 0 ; i < c1 ; i + + )
assert ( v1 [ i ] = = v2 [ i ] ) ;
fprintf ( stderr , " %s: base62_golomb test OK \n " , __FILE__ ) ;
}
2010-09-11 01:34:05 +04:00
# endif
/*
* Delta encoding routines - replace an increasing sequence of integer values
* by the sequence of their differences .
*/
static
void encode_delta ( int c , unsigned * v )
{
2010-11-16 16:51:07 +03:00
assert ( c > 0 ) ;
2011-01-03 07:27:29 +03:00
unsigned * v_end = v + c ;
unsigned v0 = * v + + ;
while ( v < v_end ) {
2010-09-11 01:34:05 +04:00
* v - = v0 ;
v0 + = * v + + ;
}
}
static
void decode_delta ( int c , unsigned * v )
{
2010-11-16 16:51:07 +03:00
assert ( c > 0 ) ;
2011-01-03 07:27:29 +03:00
unsigned * v_end = v + c ;
unsigned v0 = * v + + ;
while ( v < v_end ) {
2010-09-11 01:34:05 +04:00
* v + = v0 ;
v0 = * v + + ;
}
}
# ifdef SELF_TEST
2011-01-03 09:24:15 +03:00
static
2011-01-02 06:39:32 +03:00
void test_delta ( )
2010-09-11 01:34:05 +04:00
{
unsigned v [ ] = {
1 , 3 , 7 , 0
} ;
int c = 3 ;
encode_delta ( c , v ) ;
assert ( v [ 0 ] = = 1 ) ;
assert ( v [ 1 ] = = 2 ) ;
assert ( v [ 2 ] = = 4 ) ;
assert ( v [ 3 ] = = 0 ) ;
decode_delta ( c , v ) ;
assert ( v [ 0 ] = = 1 ) ;
assert ( v [ 1 ] = = 3 ) ;
assert ( v [ 2 ] = = 7 ) ;
assert ( v [ 3 ] = = 0 ) ;
fprintf ( stderr , " %s: delta test OK \n " , __FILE__ ) ;
}
# endif
/*
2011-06-11 00:37:58 +04:00
* Higher - level set - string routines - serialize integers into set - string .
2010-09-11 01:34:05 +04:00
*
* A set - string looks like this : " set:bMxyz... "
*
* The " set: " prefix marks set - versions in rpm ( to distinguish them between
* regular rpm versions ) . It is assumed to be stripped here .
*
* The next two characters ( denoted ' b ' and ' M ' ) encode two small integers
* in the range 7. .32 using ' a ' . . ' z ' . The first character encodes bpp .
* Valid bpp range is 10. .32 . The second character encodes Mshift . Valid
* Mshift range is 7. .31 . Also , valid Mshift must be less than bpp .
*
* The rest ( " xyz... " ) is a variable - length sequence of alnum characters .
* It encodes a ( sorted ) set of ( non - negative ) integer values , as follows :
* integers are delta - encoded , golomb - compressed and base62 - serialized .
*/
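A worked illustration of the two leading characters, using the self-test string from test_base62_golomb() above (the values follow directly from the 'a'..'z' mapping described here; nothing else is assumed):
#include <assert.h>
int main(void)
{
    const char *body = "hdf7q2P5VZwtLGr9TKxhrEM1";  /* "set:" already stripped */
    int bpp    = body[0] - 'a' + 7;   /* 'h' -> 14 */
    int Mshift = body[1] - 'a' + 7;   /* 'd' -> 10, the value used in the test */
    assert(bpp == 14);
    assert(Mshift == 10);
    assert(Mshift < bpp);             /* decode_set_init() below enforces this */
    return 0;
}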
static
int encode_set_size ( int c , int bpp )
{
int Mshift = encode_golomb_Mshift ( c , bpp ) ;
int bitc = encode_golomb_size ( c , Mshift ) ;
2011-01-02 06:39:32 +03:00
// two leading characters are special
2010-09-11 01:34:05 +04:00
return 2 + encode_base62_size ( bitc ) ;
}
static
int encode_set ( int c , unsigned * v , int bpp , char * base62 )
{
2011-01-02 06:39:32 +03:00
// XXX v is non-const due to encode_delta
2010-09-11 01:34:05 +04:00
int Mshift = encode_golomb_Mshift ( c , bpp ) ;
int bitc = encode_golomb_size ( c , Mshift ) ;
2011-01-02 06:39:32 +03:00
char bitv [ bitc ] ;
// bpp
2010-09-11 01:34:05 +04:00
if ( bpp < 10 | | bpp > 32 )
return - 1 ;
* base62 + + = bpp - 7 + ' a ' ;
2011-01-02 06:39:32 +03:00
// golomb parameter
2010-09-11 01:34:05 +04:00
if ( Mshift < 7 | | Mshift > 31 )
return - 2 ;
* base62 + + = Mshift - 7 + ' a ' ;
2011-01-02 06:39:32 +03:00
// delta
2010-09-11 01:34:05 +04:00
encode_delta ( c , v ) ;
2011-01-02 06:39:32 +03:00
// golomb
bitc = encode_golomb ( c , v , Mshift , bitv ) ;
2010-09-11 01:34:05 +04:00
# ifdef SELF_TEST
decode_delta ( c , v ) ;
# endif
if ( bitc < 0 )
return - 3 ;
2011-01-02 06:39:32 +03:00
// base62
int len = encode_base62 ( bitc , bitv , base62 ) ;
2010-09-11 01:34:05 +04:00
if ( len < 0 )
return - 4 ;
return 2 + len ;
}
static
int decode_set_init ( const char * str , int * pbpp , int * pMshift )
{
2011-01-02 06:39:32 +03:00
// 7..32 values encoded with 'a'..'z'
int bpp = * str + + + 7 - ' a ' ;
2010-09-11 01:34:05 +04:00
if ( bpp < 10 | | bpp > 32 )
return - 1 ;
2011-01-02 06:39:32 +03:00
// golomb parameter
int Mshift = * str + + + 7 - ' a ' ;
2010-09-11 01:34:05 +04:00
if ( Mshift < 7 | | Mshift > 31 )
return - 2 ;
if ( Mshift > = bpp )
return - 3 ;
2011-01-02 06:39:32 +03:00
// no empty sets for now
2010-09-11 01:34:05 +04:00
if ( * str = = ' \0 ' )
return - 4 ;
* pbpp = bpp ;
* pMshift = Mshift ;
return 0 ;
}
static inline
2011-06-14 00:37:26 +04:00
int decode_set_size ( int len , int Mshift )
2010-09-11 01:34:05 +04:00
{
2011-06-14 00:37:26 +04:00
int bitc = decode_base62_size ( len - 2 ) ;
2010-09-11 01:34:05 +04:00
return decode_golomb_size ( bitc , Mshift ) ;
}
static
set.c: more redesign to avoid extra copying and strlen
This partially reverts what was introduced with the previous commit.
Realize that strlen() must only be called when allocating space
for v[]. There is no reason to call strlen() for every Provides
string, since most of them are decoded via a cache hit.
Note, however, that now I have to use the following trick:
memcmp(str, cur->str, cur->len + 1) == 0
I rely on the fact that this works as expected even when str is shorter than
cur->len. Namely, memcmp must start from lower addresses and stop at
the first difference (i.e. memcmp must not read past the end of str,
possibly except for a few trailing bytes on the same memory page); this
is not specified by the standard, but this is how it must work.
Also, since the cache now stores full decoded values, it is possible to
avoid copying and instead to set the pointer to internal cache memory.
Copying must be performed, however, when the set is to be downsampled.
Note that average Provides set size is around 1024, which corresponds
to base62 string length of about 2K and v[] of 4K. Saving strlen(2K)
and memcpy(4K) on every rpmsetcmp call is indeed an improvement.
callgrind annotations for "apt-cache unmet", 4.0.4-alt100.27
1,900,016,996 PROGRAM TOTALS
694,132,522 decode_base62_golomb
583,376,772 rpmsetcmp
106,136,459 __GI_strcmp
102,581,178 __GI_strlen
80,781,386 msort_with_tmp'2
38,648,490 memcpy
26,936,309 __GI_strcpy
26,918,522 regionSwab.clone.2
21,000,896 _int_malloc
...
callgrind annotations for "apt-cache unmet", this commit (rebuilt in hasher):
1,264,977,497 PROGRAM TOTALS
533,131,492 decode_base62_golomb
230,706,690 rpmsetcmp
80,781,386 msort_with_tmp'2
60,541,804 __GI_strlen
42,518,368 memcpy
39,865,182 bcmp
26,918,522 regionSwab.clone.2
21,841,085 _int_malloc
...
2011-06-15 23:34:31 +04:00
int decode_set ( const char * str , int Mshift , unsigned * v )
2010-09-11 01:34:05 +04:00
{
2011-01-03 02:57:02 +03:00
const char * base62 = str + 2 ;
// separate base62+golomb stages, for reference
if ( 0 ) {
// base62
2011-06-15 23:34:31 +04:00
int len = strlen ( base62 ) ;
2011-06-14 00:37:26 +04:00
char bitv [ decode_base62_size ( len ) ] ;
2011-01-03 02:57:02 +03:00
int bitc = decode_base62 ( base62 , bitv ) ;
if ( bitc < 0 )
return bitc ;
// golomb
int c = decode_golomb ( bitc , bitv , Mshift , v ) ;
if ( c < 0 )
return c ;
// delta
decode_delta ( c , v ) ;
return c ;
}
// combined base62+golomb stage
int c = decode_base62_golomb ( base62 , Mshift , v ) ;
2010-09-11 01:34:05 +04:00
if ( c < 0 )
2011-01-03 02:57:02 +03:00
return c ;
2011-01-02 06:39:32 +03:00
// delta
2010-09-11 01:34:05 +04:00
decode_delta ( c , v ) ;
return c ;
}
2011-01-02 06:39:32 +03:00
// Special decode_set version with LRU caching.
2010-12-04 14:44:07 +03:00
static
2011-06-15 23:34:31 +04:00
int cache_decode_set ( const char * str , int Mshift , const unsigned * * pv )
2010-12-04 14:44:07 +03:00
{
struct cache_ent {
2011-01-03 08:19:07 +03:00
char * str ;
2011-06-14 00:37:26 +04:00
int len ;
2010-12-04 14:44:07 +03:00
int c ;
set.c: removed support for caching short deltas, shrunk cache
Now that decode_base62_golomb is much cheaper, the question is:
is it still worth storing short deltas, as opposed to storing
full values at the expense of shrinking the cache?
callgrind annotations for previous commit:
1,526,256,208 PROGRAM TOTALS
470,195,400 decode_base62_golomb
434,006,244 rpmsetcmp
106,137,949 __GI_strcmp
102,459,314 __GI_strlen
...
callgrind annotations for this commit:
1,427,199,731 PROGRAM TOTALS
533,131,492 decode_base62_golomb
231,592,751 rpmsetcmp
103,476,056 __GI_strlen
102,008,203 __GI_strcmp
...
So, decode_base62_golomb now takes more cycles, but the overall price
goes down. This is because, when caching short deltas, two additional
stages should be performed: 1) short deltas must be copied into unsigned
v[] array; 2) decode_delta must be invoked to recover hash values. Both
stages iterate on a per-value basis and both are seemingly fast. However,
they are not as fast as the bare memcpy that replaces them when full
values are cached, since memcpy uses xmm registers or something like this.
2011-06-10 21:43:29 +04:00
unsigned v [ ] ;
2010-12-04 14:44:07 +03:00
} ;
set.c: increased cache size from 160 to 256 slots, 75% hit ratio
Hit ratio for "apt-shell <<<unmet" command:
160 slots: hit=46813 miss=22862 67.2%
256 slots: hit=52238 miss=17437 75.0%
So, we've increased the cache size by a factor of 256/160=1.6 or by 60%,
and the number of misses has decreased by a factor of 22862/17437=1.31
or by 1-17437/22862=23.7%. This is not so bad, but it looks like we're
paying more for less. The following analysis shows that this is not
quite true, since the real memory usage has increased by a somewhat
smaller factor.
160 slots, callgrind annotations:
2,406,630,571 PROGRAM TOTALS
795,320,289 lib/set.c:decode_base62_golomb
496,682,547 lib/set.c:rpmsetcmp
93,466,677 sysdeps/x86_64/strcmp.S:__GI_strcmp
91,323,900 sysdeps/x86_64/memcmp.S:bcmp
90,314,290 stdlib/msort.c:msort_with_tmp'2
83,003,684 sysdeps/x86_64/strlen.S:__GI_strlen
58,300,129 sysdeps/x86_64/memcpy.S:memcpy
...
inclusive:
1,458,467,003 lib/set.c:rpmsetcmp
256 slots, callgrind annotations:
2,246,961,708 PROGRAM TOTALS
634,410,352 lib/set.c:decode_base62_golomb
492,003,532 lib/set.c:rpmsetcmp
95,643,612 sysdeps/x86_64/memcmp.S:bcmp
93,467,414 sysdeps/x86_64/strcmp.S:__GI_strcmp
90,314,290 stdlib/msort.c:msort_with_tmp'2
79,217,962 sysdeps/x86_64/strlen.S:__GI_strlen
56,509,877 sysdeps/x86_64/memcpy.S:memcpy
...
inclusive:
1,298,977,925 lib/set.c:rpmsetcmp
So the decoding routine now takes about 20% fewer instructions, and
inclusive rpmsetcmp cost is reduced by about 11%. Note, however, that
bcmp is now the third most expensive routine (due to higher hit ratio).
Since recent glibc versions provide optimized memcmp implementations, I
infer that the total/inclusive improvement can be somewhat better than 11%.
As per memory usage, the question "how much the cache takes" cannot be
generally answered with a single number. However, if we simply sum the
size of all malloc'd chunks on each rpmsetcmp invocation, using the
piece of code below (with a few obvious modifications elsewhere), we can obtain
the following statistics.
if (hc == CACHE_SIZE) {
int total = 0;
for (i = 0; i < hc; i++)
total += ev[i]->msize;
printf("total %d\n", total);
}
160 slots, memory usage:
min=1178583
max=2048701
avg=1330104
dev=94747
q25=1266647
q50=1310287
q75=1369005
256 slots, memory usage:
min=1670029
max=2674909
avg=1895076
dev=122062
q25=1828928
q50=1868214
q75=1916025
This indicates that average cache size is increased by about 42% from
1.27M to 1.81M; however, the third quartile is increased by about 40%,
and the maximum size is increased only by about 31% from 1.95M to 2.55M.
From which I conclude that an extra 600K must be affordable even on low-memory
machines like Raspberry Pi (256M RAM).
* * *
What's a good hit ratio?
$ DepNames() { pkglist-query '[%{RequireName}\t%{RequireVersion}\n]' \
/var/lib/apt/lists/_ALT_Sisyphus_x86%5f64_base_pkglist.classic |
fgrep set: |cut -f1; }
$ DepNames |wc -l
34763
$ DepNames |sort -u |wc -l
2429
$ DepNames |sort |uniq -c |sort -n |awk '$1>1{print$1}' |Sum
33924
$ DepNames |sort |uniq -c |sort -n |awk '$1>1{print$1}' |wc -l
1590
$ DepNames |sort |uniq -c |sort -n |tail -256 |Sum
27079
$
We have 34763 set-versioned dependencies, which refer to 2429 sonames;
however, only 33924 dependencies refer to 1590 sonames more than once,
and the first reference is always a miss. Thus the best possible hit
ratio (if we use at least 1590 slots) is (33924-1590)/34763=93.0%.
What happens if we use only 256 slots? Assuming that dependencies are
processed in random order, the best strategy must spend its cache slots
on sonames with the most references. This way we can serve (27079-256)
dependencies via a cache hit, and so the best possible hit ratio for 256
slots is 77.2%.
2012-03-08 22:45:16 +04:00
# define CACHE_SIZE 256
# define PIVOT_SIZE 243
set.c: improved cache_decode_set loop
I am going to consider whether it is worthwhile to increase the cache
size. Thus I have to ensure that the linear search won't be an obstacle
for doing so. Particularly, its loop must be efficient in terms of both
cpu instructions and memory access patterns.
1) As for memory access patterns, this change introduces two
separate arrays: hv[] with hash values and ev[] with actual cache
entries. On x86-64, this saves 4 bytes per entry which have previously
been wasted to align cache_hdr structures. This has some benefits on
i686 as well: for example, ev[] is not accessed on a cache miss.
2) As for instructions, the loop has two branches: the first is for
boundary checking, and the second is for matching hash condition. Since
the boundary checking condition (cur->ent != NULL) relies on a sentinel,
the loop cannot be unrolled; it takes 6 instructions per iteration. If
we replace the condition with explicit boundary check (hp < hv + hc),
the number of iterations becomes known upon entry to the loop, and gcc
will unroll the loop; it takes now 3 instructions per iteration, plus
some (smaller) overhead for boundary checking.
This change also removes __thread specifiers, since gcc is apparently
not very good at optimizing superfluous __tls_get_addr calls. Also, if
we are to consider larger cache sizes, it becomes questionable whether
each thread should possess its own cache only as a means of achieving
thread safety. Anyway, currently I'm not aware of threaded applications
which make concurrent librpm calls.
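A minimal sketch of the split layout described above (the _demo names are illustrative, not the set.c definitions; the real loop additionally verifies the key with memcmp and moves a hit to the front):
#include <stddef.h>
#define CACHE_SIZE_DEMO 256
struct ent_demo { int c; /* ...decoded values, key string, etc... */ };
static int hc_demo;
static unsigned hv_demo[CACHE_SIZE_DEMO];
static struct ent_demo *ev_demo[CACHE_SIZE_DEMO];
static struct ent_demo *lookup_demo(unsigned hash)
{
    unsigned *hp;
    /* explicit bound lets gcc unroll the loop; hv_demo[] is dense, so the
     * scan touches little memory, and ev_demo[] is read only on a match */
    for (hp = hv_demo; hp < hv_demo + hc_demo; hp++)
        if (*hp == hash)
            return ev_demo[hp - hv_demo];
    return NULL;
}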
callgrind annotations for "apt-shell <<<unmet", previous commit:
2,437,446,116 PROGRAM TOTALS
820,835,411 lib/set.c:decode_base62_golomb
510,957,897 lib/set.c:rpmsetcmp
...
23,671,760 for (cur = cache; cur->ent; cur++) {
1,114,800 => /usr/src/debug/glibc-2.11.3-alt7/elf/dl-tls.c:__tls_get_addr (69675x)
11,685,644 if (hash == cur->hash) {
. ent = cur->ent;
callgrind annotations for "apt-shell <<<unmet", this commit:
2,431,849,572 PROGRAM TOTALS
820,835,411 lib/set.c:decode_base62_golomb
496,682,547 lib/set.c:rpmsetcmp
...
10,204,175 for (hp = hv; hp < hv + hc; hp++) {
11,685,644 if (hash == *hp) {
189,344 i = hp - hv;
189,344 ent = ev[i];
Total improvement is not very impressive (6M instead of expected 14M),
mostly due to memmove complications - hv[] cannot be shifted efficiently
using 8-byte words. However, the code now scales better. Also, recent
glibc versions supposedly provide much improved memmove implementation.
2012-03-07 19:19:50 +04:00
static int hc ;
static unsigned hv [ CACHE_SIZE ] ;
static struct cache_ent * ev [ CACHE_SIZE ] ;
2012-02-13 11:04:41 +04:00
// look up in the cache
2012-03-07 19:19:50 +04:00
int i ;
unsigned * hp ;
set.c: use contiguous memory to facilitate linear search
Recently I tried to implement another data structure similar to SVR2
buffer cache [Bach 1986], but the code got too complicated. So I still
maintain that, for small cache sizes, linear search is okay. Dennis
Ritchie famously argued that a linear search of a directory is efficient
because it is bounded by the size of the directory [Ibid., p. 76].
Great minds think alike (and share similar views on a linear search).
What can make the search slow, however, is not the loop per se, but
rather memory loads: on average, about 67% of the entries have to be loaded
(assuming 67% hit ratio), checked for entry->hash, and most probably
followed by entry->next.
With malloc'd cache entries, memory loads can be slow. To facilitate
the search, this change introduces a new structure, "cache_hdr", which
has only 3 members necessary for the search. The structures are
pre-allocated in a contiguous memory block. This should play nicely with
CPU caches, resulting in fewer memory loads and faster searches.
Indeed, based on some measurements of "apt-shell <<<unmet", this change
can demonstrate about a 2% overall improvement in user time. Using a more
sophisticated SVR2-like data structure further improves the result only
by about 0.5%.
2012-02-11 04:13:58 +04:00
struct cache_ent * ent ;
2011-01-03 04:22:06 +03:00
unsigned hash = str [ 0 ] | ( str [ 2 ] < < 8 ) | ( str [ 3 ] < < 16 ) ;
2012-03-07 19:19:50 +04:00
for ( hp = hv ; hp < hv + hc ; hp + + ) {
if ( hash = = * hp ) {
i = hp - hv ;
ent = ev [ i ] ;
2012-02-11 04:13:58 +04:00
if ( memcmp ( str , ent - > str , ent - > len + 1 ) = = 0 ) {
// hit, move to front
2012-03-07 19:19:50 +04:00
if ( i ) {
memmove ( hv + 1 , hv , i * sizeof ( hv [ 0 ] ) ) ;
memmove ( ev + 1 , ev , i * sizeof ( ev [ 0 ] ) ) ;
hv [ 0 ] = hash ;
ev [ 0 ] = ent ;
2012-02-11 04:13:58 +04:00
}
* pv = ent - > v ;
return ent - > c ;
2010-12-04 14:44:07 +03:00
}
}
}
2011-06-15 23:34:31 +04:00
// decode
int len = strlen ( str ) ;
int c = decode_set_size ( len , Mshift ) ;
set.c: implemented 4-byte and 8-byte steppers for rpmsetcmp main loop
Provides versions, on average, are about 34 times longer than Requires
versions. More precisely, if we consider all rpmsetcmp calls for
"apt-shell <<<unmet" command, then sum(c1)/sum(c2)=33.88. This means
that we can save some time and instructions by skipping intermediate
bytes - in other words, by stepping a few bytes at a time. Of course,
after all the bytes are skipped, we must recheck a few final bytes and
possibly step back. Also, this requires more than one sentinel for
proper boundary checking.
This change implements two such "steppers" - 4-byte stepper for c1/c2
ratio below 16 and 8-byte stepper which is used otherwise. When
stepping back, both steppers use bisecting. Note that replacing last
two bisecting steps with a simple loop might be actually more efficient
with respect to branch prediction and CPU's BTB. It is very hard to
measure any user time improvement, though, even in a series of 100 runs.
The improvement is next to none, at least on older AMD CPUs. And so I
choose to keep bisecting.
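A minimal sketch of the stepping idea (illustration only, not the rpmsetcmp code; a 4-wide step with a bisecting step-back, relying on ~0u sentinel padding in the spirit of the SENTINELS scheme below):
#include <assert.h>
/* Return the first entry >= val in a sorted array padded with ~0u sentinels. */
static const unsigned *step4(const unsigned *v, unsigned val)
{
    while (v[3] < val)   /* all four entries of the window are below val */
        v += 4;
    if (v[1] < val)      /* bisect back over the final window */
        v += 2;
    if (v[0] < val)
        v += 1;
    return v;            /* may point at a sentinel if val is past the end */
}
int main(void)
{
    unsigned v[] = { 2, 3, 5, 8, 13, 21, ~0u, ~0u, ~0u, ~0u };
    assert(*step4(v, 6) == 8);
    assert(*step4(v, 21) == 21);
    assert(*step4(v, 100) == ~0u);
    return 0;
}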
callgrind annotations for "apt-shell <<<unmet", previous commit:
2,279,520,414 PROGRAM TOTALS
646,107,201 lib/set.c:decode_base62_golomb
502,438,804 lib/set.c:rpmsetcmp
98,243,148 sysdeps/x86_64/memcmp.S:bcmp
93,038,752 sysdeps/x86_64/strcmp.S:__GI_strcmp
callgrind annotations for "apt-shell <<<unmet", this commit:
2,000,254,692 PROGRAM TOTALS
642,039,009 lib/set.c:decode_base62_golomb
227,036,590 lib/set.c:rpmsetcmp
98,247,798 sysdeps/x86_64/memcmp.S:bcmp
93,047,422 sysdeps/x86_64/strcmp.S:__GI_strcmp
2012-03-09 03:41:03 +04:00
# define SENTINELS 8
ent = malloc ( sizeof ( * ent ) + len + 1 + ( c + SENTINELS ) * sizeof ( unsigned ) ) ;
2012-02-11 04:13:58 +04:00
assert ( ent ) ;
c = ent - > c = decode_set ( str , Mshift , ent - > v ) ;
2011-06-15 23:34:31 +04:00
if ( c < = 0 ) {
2012-02-11 04:13:58 +04:00
free ( ent ) ;
2010-12-04 14:44:07 +03:00
return c ;
2011-06-15 23:34:31 +04:00
}
set.c: implemented 4-byte and 8-byte steppers for rpmsetcmp main loop
Provides versions, on average, are about 34 times longer than Requires
versions. More precisely, if we consider all rpmsetcmp calls for the
"apt-shell <<<unmet" command, then sum(c1)/sum(c2)=33.88. This means
that we can save some time and instructions by skipping intermediate
bytes - in other words, by stepping a few bytes at a time. Of course,
after all the bytes are skipped, we must recheck a few final bytes and
possibly step back. Also, this requires more than one sentinel for
proper boundary checking.
This change implements two such "steppers" - a 4-byte stepper for a c1/c2
ratio below 16, and an 8-byte stepper which is used otherwise. When
stepping back, both steppers use bisection. Note that replacing the last
two bisection steps with a simple loop might actually be more efficient
with respect to branch prediction and the CPU's BTB. It is very hard to
measure any user time improvement, though, even in a series of 100 runs.
The improvement is next to none, at least on older AMD CPUs. And so I
choose to keep bisecting.
callgrind annotations for "apt-shell <<<unmet", previous commit:
2,279,520,414 PROGRAM TOTALS
646,107,201 lib/set.c:decode_base62_golomb
502,438,804 lib/set.c:rpmsetcmp
98,243,148 sysdeps/x86_64/memcmp.S:bcmp
93,038,752 sysdeps/x86_64/strcmp.S:__GI_strcmp
callgrind annotations for "apt-shell <<<unmet", this commit:
2,000,254,692 PROGRAM TOTALS
642,039,009 lib/set.c:decode_base62_golomb
227,036,590 lib/set.c:rpmsetcmp
98,247,798 sysdeps/x86_64/memcmp.S:bcmp
93,047,422 sysdeps/x86_64/strcmp.S:__GI_strcmp
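For a standalone illustration of the stepping idea (galloping forward four unsigned values per step and then bisecting back over the last stride), here is a hedged sketch; step4() is a hypothetical helper, not the macro used later in this file, and it assumes at least 4 sentinel values of ~0u stored after the last real element.

#include <assert.h>
#include <stdio.h>

/* Advance v1 to the first element that is >= val.  Requires at least
 * 4 sentinels (~0u) after the last real element, so the galloping loop
 * always stops inside valid memory. */
static const unsigned *step4(const unsigned *v1, unsigned val)
{
    if (*v1 >= val)
        return v1;              /* already positioned */
    do
        v1 += 4;                /* gallop: skip 4 elements at a time */
    while (*v1 < val);
    /* now *(v1 - 4) < val <= *v1: bisect over the 4 candidates in between */
    v1 -= 2;
    if (*v1 < val)
        v1++;
    else
        v1--;
    if (*v1 < val)
        v1++;
    return v1;
}

int main(void)
{
    /* 8 real values followed by 4 sentinels */
    const unsigned v[] = { 1, 3, 5, 7, 9, 11, 13, 15, ~0u, ~0u, ~0u, ~0u };
    const unsigned *p = step4(v, 10);
    assert(p == v + 5 && *p == 11);
    printf("first value >= 10 is %u\n", *p);
    return 0;
}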
2012-03-09 03:41:03 +04:00
for (i = 0; i < SENTINELS; i++)
ent->v[c + i] = ~0u;
ent->str = (char *)(ent->v + c + SENTINELS);
2012-02-11 04:13:58 +04:00
memcpy(ent->str, str, len + 1);
ent->len = len;
2012-02-13 11:04:41 +04:00
// insert
set.c: improved cache_decode_set loop
I am going to consider whether it is worthwhile to increase the cache
size. Thus I have to ensure that the linear search won't be an obstacle
to doing so. In particular, its loop must be efficient in terms of both
cpu instructions and memory access patterns.
1) Regarding memory access patterns, this change introduces two
separate arrays: hv[] with hash values and ev[] with actual cache
entries. On x86-64, this saves 4 bytes per entry which were previously
wasted on aligning cache_hdr structures. This has some benefits on
i686 as well: for example, ev[] is not accessed on a cache miss.
2) Regarding instructions, the loop has two branches: the first is for
boundary checking, and the second is for matching the hash condition.
Since the boundary checking condition (cur->ent != NULL) relies on a
sentinel, the loop cannot be unrolled; it takes 6 instructions per
iteration. If we replace the condition with an explicit boundary check
(hp < hv + hc), the number of iterations becomes known upon entry to
the loop, and gcc will unroll it; it now takes 3 instructions per
iteration, plus some (smaller) overhead for boundary checking.
This change also removes __thread specifiers, since gcc is apparently
not very good at optimizing superfluous __tls_get_addr calls. Also, if
we are to consider larger cache sizes, it becomes questionable whether
each thread should possess its own cache merely as a means of achieving
thread safety. Anyway, I'm currently not aware of threaded applications
which make concurrent librpm calls.
callgrind annotations for "apt-shell <<<unmet", previous commit:
2,437,446,116 PROGRAM TOTALS
820,835,411 lib/set.c:decode_base62_golomb
510,957,897 lib/set.c:rpmsetcmp
...
23,671,760 for (cur = cache; cur->ent; cur++) {
1,114,800 => /usr/src/debug/glibc-2.11.3-alt7/elf/dl-tls.c:__tls_get_addr (69675x)
11,685,644 if (hash == cur->hash) {
. ent = cur->ent;
callgrind annotations for "apt-shell <<<unmet", this commit:
2,431,849,572 PROGRAM TOTALS
820,835,411 lib/set.c:decode_base62_golomb
496,682,547 lib/set.c:rpmsetcmp
...
10,204,175 for (hp = hv; hp < hv + hc; hp++) {
11,685,644 if (hash == *hp) {
189,344 i = hp - hv;
189,344 ent = ev[i];
The total improvement is not very impressive (6M instead of the expected
14M), mostly due to memmove complications - hv[] cannot be shifted
efficiently using 8-byte words. However, the code now scales better.
Also, recent glibc versions supposedly provide a much improved memmove
implementation.
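A minimal sketch of the split-array search described above (the entry type is opaque here, the cache size is hypothetical, and the helper name is made up): hash values live in hv[], the loop bound is explicit so gcc can unroll the loop, and ev[] is touched only on a hit.

#define CACHE_SIZE 160                      /* hypothetical size */

struct cache_ent;                           /* opaque payload */

static unsigned hv[CACHE_SIZE];             /* hash values, scanned linearly */
static struct cache_ent *ev[CACHE_SIZE];    /* entries, read only on a hit */
static int hc;                              /* number of slots in use */

static struct cache_ent *cache_find(unsigned hash)
{
    const unsigned *hp;
    for (hp = hv; hp < hv + hc; hp++)       /* explicit bound: unrollable */
        if (*hp == hash)
            return ev[hp - hv];
    return NULL;
}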
2012-03-07 19:19:50 +04:00
if (hc < CACHE_SIZE)
i = hc++;
else {
2012-02-13 11:04:41 +04:00
// free last entry
2012-03-07 19:19:50 +04:00
free(ev[CACHE_SIZE - 1]);
2012-02-13 11:04:41 +04:00
// position at midpoint
2012-03-07 19:19:50 +04:00
i = PIVOT_SIZE;
memmove(hv + i + 1, hv + i, (CACHE_SIZE - i - 1) * sizeof(hv[0]));
memmove(ev + i + 1, ev + i, (CACHE_SIZE - i - 1) * sizeof(ev[0]));
2011-01-07 06:06:00 +03:00
}
2012-03-07 19:19:50 +04:00
hv[i] = hash;
ev[i] = ent;
2012-02-11 04:13:58 +04:00
*pv = ent->v;
2010-12-04 14:44:07 +03:00
return c;
}
set.c: reimplemented downsampling using merges
Most of the time, downsampling is needed for Provides versions,
which are expensive, and the values are reduced by only 1 bit, which
can be handled without sorting the values again. Indeed,
only a merge is required. The array v[] can be split into two
parts: the first part v1[] and the second part v2[], the latter
holding the values with the high bit set. After the high bit is stripped,
the v2[] values are still sorted. It suffices to merge v1[] and v2[].
Note, however, that a merge cannot be done in place, and we also have
to support 2 or more downsampling steps. We also want to avoid copying.
This requires careful buffer management - each version needs two
alternate buffers.
callgrind annotations for "apt-cache <<<unmet", previous commit:
2,743,058,808 PROGRAM TOTALS
1,068,102,605 lib/set.c:decode_base62_golomb
509,186,920 lib/set.c:rpmsetcmp
131,678,282 stdlib/msort.c:msort_with_tmp'2
93,496,965 sysdeps/x86_64/strcmp.S:__GI_strcmp
91,066,266 sysdeps/x86_64/memcmp.S:bcmp
83,062,668 sysdeps/x86_64/strlen.S:__GI_strlen
64,584,024 sysdeps/x86_64/memcpy.S:memcpy
callgrind annotations for "apt-cache <<<unmet", this commit:
2,683,295,262 PROGRAM TOTALS
1,068,102,605 lib/set.c:decode_base62_golomb
510,261,969 lib/set.c:rpmsetcmp
93,692,793 sysdeps/x86_64/strcmp.S:__GI_strcmp
91,066,275 sysdeps/x86_64/memcmp.S:bcmp
90,080,205 stdlib/msort.c:msort_with_tmp'2
83,062,524 sysdeps/x86_64/strlen.S:__GI_strlen
58,165,691 sysdeps/x86_64/memcpy.S:memcpy
2012-02-16 17:18:21 +04:00
// Reduce a set of (bpp + 1) values to a set of bpp values.
2010-09-11 01:34:05 +04:00
static
2012-02-16 17:18:21 +04:00
int downsample_set(int c, const unsigned *v, unsigned *w, int bpp)
2010-09-11 01:34:05 +04:00
{
unsigned mask = (1 << bpp) - 1;
2012-02-16 17:18:21 +04:00
// find the first element with high bit set
int l = 0;
int u = c;
while (l < u) {
int i = (l + u) / 2;
if (v[i] <= mask)
l = i + 1;
else
u = i;
}
// initialize parts
const unsigned *w_start = w;
const unsigned *v1 = v + 0, *v1end = v + u;
const unsigned *v2 = v + u, *v2end = v + c;
// merge v1 and v2 into w
if (v1 < v1end && v2 < v2end) {
unsigned v1val = *v1;
unsigned v2val = *v2 & mask;
while (1) {
if (v1val < v2val) {
*w++ = v1val;
v1++;
if (v1 == v1end)
break;
v1val = *v1;
}
else if (v2val < v1val) {
*w++ = v2val;
v2++;
if (v2 == v2end)
break;
v2val = *v2 & mask;
}
else {
*w++ = v1val;
v1++;
v2++;
if (v1 == v1end)
break;
if (v2 == v2end)
break;
v1val = *v1;
v2val = *v2 & mask;
}
}
}
// append what's left
while (v1 < v1end)
*w++ = *v1++;
while (v2 < v2end)
*w++ = *v2++ & mask;
return w - w_start;
2010-09-11 01:34:05 +04:00
}
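As a quick illustration of the merge above, here is a small self-check in the style of the SELF_TEST routines below (test_downsample is not part of the original file): with bpp = 3, stripping the high bit maps 9 to 1 and 13 to 5, and the 5 that then occurs in both halves is emitted only once.

#ifdef SELF_TEST
static
void test_downsample(void)
{
    // 4-bit input set, sorted; downsample to bpp = 3 (mask 0x7)
    const unsigned v[] = { 2, 5, 9, 13 };
    unsigned w[4];
    int c = downsample_set(4, v, w, 3);
    assert(c == 3);
    assert(w[0] == 1 && w[1] == 2 && w[2] == 5);
    fprintf(stderr, "%s: downsample test OK\n", __FILE__);
}
#endif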
# ifdef SELF_TEST
static
2011-01-02 06:39:32 +03:00
void test_set()
2010-09-11 01:34:05 +04:00
{
unsigned rnd_v[] = {
0x020a, 0x07e5, 0x3305, 0x35f5,
0x4980, 0x4c4f, 0x74ef, 0x7739,
0x82ae, 0x8415, 0xa3e7, 0xb07e,
0xb584, 0xb89f, 0xbb40, 0xf39e,
};
2011-01-02 06:39:32 +03:00
int rnd_c = sizeof rnd_v / sizeof *rnd_v;
// encode
int bpp = 16;
char base62[encode_set_size(rnd_c, bpp)];
int len = encode_set(rnd_c, rnd_v, bpp, base62);
2010-09-11 01:34:05 +04:00
assert(len > 0);
fprintf(stderr, "len=%d set=%s\n", len, base62);
2011-01-02 06:39:32 +03:00
// decode
int Mshift = bpp;
int rc = decode_set_init(base62, &bpp, &Mshift);
2010-09-11 01:34:05 +04:00
assert(rc == 0);
assert(bpp == 16);
assert(Mshift < bpp);
2011-06-14 00:37:26 +04:00
int c = decode_set_size(len, Mshift);
2010-09-11 01:34:05 +04:00
assert(c >= rnd_c);
2011-06-15 23:34:31 +04:00
unsigned vbuf[c];
const unsigned *v = vbuf;
c = decode_set(base62, Mshift, vbuf);
2011-01-02 06:39:32 +03:00
// Decoded values must match.
2010-09-11 01:34:05 +04:00
assert(c == rnd_c);
2011-01-02 06:39:32 +03:00
int i;
2011-01-03 08:19:07 +03:00
for (i = 0; i < c; i++)
assert(v[i] == rnd_v[i]);
// Cached version.
2011-06-15 23:34:31 +04:00
c = cache_decode_set(base62, Mshift, &v);
2011-01-03 08:19:07 +03:00
assert(c == rnd_c);
2010-09-11 01:34:05 +04:00
for (i = 0; i < c; i++)
assert(v[i] == rnd_v[i]);
fprintf(stderr, "%s: set test OK\n", __FILE__);
}
# endif
/*
* API routines start here.
*/
# include "set.h"
2011-01-02 06:39:32 +03:00
// main API routine
2010-09-11 01:34:05 +04:00
int rpmsetcmp(const char *str1, const char *str2)
{
2011-06-10 10:50:05 +04:00
if (strncmp(str1, "set:", 4) == 0)
str1 += 4;
if (strncmp(str2, "set:", 4) == 0)
str2 += 4;
// initialize decoding
int bpp1, Mshift1;
int bpp2, Mshift2;
if (decode_set_init(str1, &bpp1, &Mshift1) < 0)
return -3;
if (decode_set_init(str2, &bpp2, &Mshift2) < 0)
return -4;
2011-06-15 23:34:31 +04:00
// decode set1 (comes on behalf of provides)
const unsigned *v1 = NULL;
int c1 = cache_decode_set(str1, Mshift1, &v1);
2011-06-10 10:50:05 +04:00
if (c1 < 0)
return -3;
2012-12-24 16:24:15 +04:00
unsigned v1bufA[c1 + SENTINELS];
unsigned v1bufB[c1 + SENTINELS];
2011-06-15 23:34:31 +04:00
// decode set2 (on the stack)
int len2 = strlen(str2);
2012-02-16 17:18:21 +04:00
int c2 = decode_set_size(len2, Mshift2);
unsigned v2bufA[c2];
unsigned v2bufB[c2];
const unsigned *v2 = v2bufA;
c2 = decode_set(str2, Mshift2, v2bufA);
2011-06-10 10:50:05 +04:00
if (c2 < 0)
return -4;
// adjust for comparison
2012-03-09 03:41:03 +04:00
int i;
2012-02-16 17:18:21 +04:00
while (bpp1 > bpp2) {
unsigned *v1buf = v1bufA;
if (v1 == v1buf)
v1buf = v1bufB;
bpp1--;
c1 = downsample_set(c1, v1, v1buf, bpp1);
2012-03-09 03:41:03 +04:00
for (i = 0; i < SENTINELS; i++)
v1buf[c1 + i] = ~0u;
2011-06-15 23:34:31 +04:00
v1 = v1buf;
2011-06-10 10:50:05 +04:00
}
2012-02-16 17:18:21 +04:00
while (bpp2 > bpp1) {
unsigned *v2buf = v2bufA;
if (v2 == v2buf)
v2buf = v2bufB;
bpp2--;
c2 = downsample_set(c2, v2, v2buf, bpp2);
v2 = v2buf;
2011-06-10 10:50:05 +04:00
}
// compare
int ge = 1;
int le = 1;
2011-06-15 23:34:31 +04:00
const unsigned *v1end = v1 + c1;
const unsigned *v2end = v2 + c2;
2012-03-09 03:41:03 +04:00
for (i = 0; i < SENTINELS; i++)
assert(v1end[i] == ~0u);
set.c: improved rpmsetcmp main loop performance
The loop is logically impeccable, but its main condition
(v1 < v1end && v2 < v2end) is somewhat redundant: in two
of the three cases, only one pointer gets advanced. To
save instructions, the conditions are now handled within
the cases. The loop is now a while (1) loop, a disguised
form of goto.
Also note that, when comparing Requires against Provides,
the Requires is usually sparse:
P: a b c d e f g h i j k l ...
R: a c h j ...
This means that a nested loop which skips intermediate Provides
elements towards the next Requires element may improve performance.
while (v1 < v1end && *v1 < *v2)
v1++;
However, note that the first condition (v1 < v1end) is also somewhat
redundant. This kind of boundary checking can be partially omitted if
the loop gets unrolled. There is a better technique, however, called
the barrier: *v1end must contain the biggest element possible, so that
the trailing *v1 is never smaller than any *v2. The nested loop
then becomes as simple as
while (*v1 < *v2)
v1++;
callgrind annotations, 4.0.4-alt100.27:
1,899,657,916 PROGRAM TOTALS
694,132,522 decode_base62_golomb
583,376,772 rpmsetcmp
106,225,572 __GI_strcmp
102,459,314 __GI_strlen
...
callgrind annotations, this commit (rebuilt in hasher):
1,526,256,208 PROGRAM TOTALS
470,195,400 decode_base62_golomb
434,006,244 rpmsetcmp
106,137,949 __GI_strcmp
102,459,314 __GI_strlen
...
Note that rpmsetcmp also absorbs cache_decode_set and decode_delta;
the loop is now about twice as fast.
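The barrier idea above in isolation (a minimal sketch with made-up names, not the loop used below): one ~0u value stored right past the last element lets the skip loop run without a bounds check.

#include <assert.h>

/* Return the first element of v1 that is >= val.  v1[c1] must hold the
 * barrier ~0u, so no explicit bounds check is needed. */
static const unsigned *skip_below(const unsigned *v1, unsigned val)
{
    while (*v1 < val)
        v1++;
    return v1;
}

int main(void)
{
    unsigned v1[4 + 1] = { 10, 20, 30, 40, ~0u };   /* last slot is the barrier */
    assert(skip_below(v1, 25) == v1 + 2);           /* stops at 30 */
    assert(skip_below(v1, 99) == v1 + 4);           /* stops at the barrier */
    return 0;
}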
2011-06-10 11:03:45 +04:00
unsigned v2val = *v2;
2012-03-09 03:41:03 +04:00
// loop pieces
# define IFLT4 \
if (*v1 < v2val) { \
le = 0; \
v1 += 4; \
while (*v1 < v2val) \
v1 += 4; \
v1 -= 2; \
if (*v1 < v2val) \
v1++; \
else \
v1--; \
if (*v1 < v2val) \
v1++; \
if (v1 == v1end) \
break; \
}
# define IFLT8 \
if (*v1 < v2val) { \
le = 0; \
v1 += 8; \
while (*v1 < v2val) \
v1 += 8; \
v1 -= 4; \
if (*v1 < v2val) \
v1 += 2; \
else \
v1 -= 2; \
if (*v1 < v2val) \
v1++; \
else \
v1--; \
if (*v1 < v2val) \
v1++; \
if (v1 == v1end) \
break; \
}
# define IFGE \
if (*v1 == v2val) { \
v1++; \
v2++; \
if (v1 == v1end) \
break; \
if (v2 == v2end) \
break; \
v2val = *v2; \
} \
else { \
ge = 0; \
v2++; \
if (v2 == v2end) \
break; \
v2val = *v2; \
}
// choose the right stepper
if (c1 >= 16 * c2) {
while (1) {
IFLT8;
IFGE;
2010-09-11 01:34:05 +04:00
}
2012-03-09 03:41:03 +04:00
}
else {
while (1) {
IFLT4;
IFGE;
set.c: optimize array access in rpmsetcmp
callgrind results for "apt-cache unmet", 4.0.4-alt100.6:
2,198,298,537 PROGRAM TOTALS
1,115,738,267 lib/set.c:decode_set
484,035,006 lib/set.c:rpmsetcmp
143,078,002 ???:strcmp
79,477,321 ???:strlen
61,780,572 ???:0x0000000000033080'2
54,466,947 ???:memcpy
31,161,399 ???:strcpy
24,438,336 ???:pkgCache::DepIterator::AllTargets()
callgrind results for "apt-cache unmet", this commit:
1,755,431,664 PROGRAM TOTALS
764,189,271 lib/set.c:decode_base62_golomb
404,493,494 lib/set.c:rpmsetcmp
143,076,968 ???:strcmp
70,833,953 ???:strlen
61,780,572 ???:0x0000000000033080'2
54,466,947 ???:memcpy
31,161,399 ???:strcpy
24,438,336 ???:pkgCache::DepIterator::AllTargets()
2011-01-03 03:46:15 +03:00
}
2011-06-10 10:50:05 +04:00
}
// return
if (v1 < v1end)
le = 0;
if (v2 < v2end)
ge = 0;
if (le && ge)
return 0;
if (ge)
return 1;
if (le)
return -1;
return -2;
2010-09-11 01:34:05 +04:00
}
/*
* Simple API for creating set-versions.
*/
# include "system.h"
# include "rpmlib.h"
2011-01-02 06:39:32 +03:00
// Internally, "struct set" is just a bag of strings and their hash values.
2010-09-11 01:34:05 +04:00
struct set {
int c;
struct sv {
const char *s;
unsigned v;
} *sv;
};
struct set *set_new()
{
2011-01-02 06:39:32 +03:00
struct set *set = xmalloc(sizeof *set);
2010-09-11 01:34:05 +04:00
set->c = 0;
set->sv = NULL;
return set;
}
void set_add(struct set *set, const char *sym)
{
const int delta = 1024;
if ((set->c & (delta - 1)) == 0)
set->sv = xrealloc(set->sv, sizeof(*set->sv) * (set->c + delta));
set->sv[set->c].s = xstrdup(sym);
set->sv[set->c].v = 0;
set->c++;
}
2010-10-05 13:00:31 +04:00
struct set *set_free(struct set *set)
2010-09-11 01:34:05 +04:00
{
2011-01-02 06:39:32 +03:00
if (set) {
int i;
for (i = 0; i < set->c; i++)
set->sv[i].s = _free(set->sv[i].s);
set->sv = _free(set->sv);
2010-11-16 16:19:26 +03:00
}
2011-01-02 06:39:32 +03:00
return NULL;
2010-11-16 16:19:26 +03:00
}
2015-05-21 17:59:09 +03:00
static
int cmp(const void *arg1, const void *arg2)
{
const struct sv *sv1 = arg1;
const struct sv *sv2 = arg2;
if (sv1->v > sv2->v)
return 1;
if (sv2->v > sv1->v)
return -1;
return 0;
}
// Jenkins' one-at-a-time hash
static
unsigned int hash(const char *str)
{
unsigned int hash = 0x9e3779b9;
const unsigned char *p = (const unsigned char *)str;
while (*p) {
hash += *p++;
hash += (hash << 10);
hash ^= (hash >> 6);
}
hash += (hash << 3);
hash ^= (hash >> 11);
hash += (hash << 15);
return hash;
}
static
int uniqv(int c, unsigned *v)
{
int i, j;
for (i = 0, j = 0; i < c; i++) {
while (i + 1 < c && v[i] == v[i + 1])
i++;
v[j++] = v[i];
}
return j;
}
2011-01-02 06:39:32 +03:00
// This routine does the whole job.
2010-09-11 01:34:05 +04:00
const char *set_fini(struct set *set, int bpp)
{
if (set->c < 1)
return NULL;
if (bpp < 10)
return NULL;
if (bpp > 32)
return NULL;
2011-01-02 06:39:32 +03:00
unsigned mask = (bpp < 32) ? (1u << bpp) - 1 : ~0u;
// hash sv strings
int i;
2010-09-11 01:34:05 +04:00
for (i = 0; i < set->c; i++)
2011-01-02 06:39:32 +03:00
set->sv[i].v = hash(set->sv[i].s) & mask;
// sort by hash value
qsort(set->sv, set->c, sizeof *set->sv, cmp);
// warn on hash collisions
2010-09-11 01:34:05 +04:00
for (i = 0; i < set->c - 1; i++) {
if (set->sv[i].v != set->sv[i + 1].v)
continue;
2010-09-17 11:41:04 +04:00
if (strcmp(set->sv[i].s, set->sv[i + 1].s) == 0)
2010-09-11 01:34:05 +04:00
continue;
fprintf(stderr, "warning: hash collision: %s %s\n",
set->sv[i].s, set->sv[i + 1].s);
}
2011-01-02 06:39:32 +03:00
// encode
unsigned v[set->c];
2010-09-11 01:34:05 +04:00
for (i = 0; i < set->c; i++)
v[i] = set->sv[i].v;
2011-01-02 06:39:32 +03:00
int c = uniqv(set->c, v);
char base62[encode_set_size(c, bpp)];
int len = encode_set(c, v, bpp, base62);
2010-09-11 01:34:05 +04:00
if (len < 0)
return NULL;
return xstrdup(base62);
}
# ifdef SELF_TEST
static
2011-01-02 06:39:32 +03:00
void test_api()
2010-09-11 01:34:05 +04:00
{
2011-01-02 06:39:32 +03:00
struct set *set1 = set_new();
2010-09-11 01:34:05 +04:00
set_add(set1, "mama");
set_add(set1, "myla");
set_add(set1, "ramu");
2011-01-02 06:39:32 +03:00
const char *str10 = set_fini(set1, 16);
2010-09-11 01:34:05 +04:00
fprintf(stderr, "set10=%s\n", str10);
2011-01-02 06:39:32 +03:00
int cmp;
struct set *set2 = set_new();
2010-09-11 01:34:05 +04:00
set_add(set2, "myla");
set_add(set2, "mama");
2011-01-02 06:39:32 +03:00
const char *str20 = set_fini(set2, 16);
2010-09-11 01:34:05 +04:00
fprintf(stderr, "set20=%s\n", str20);
cmp = rpmsetcmp(str10, str20);
assert(cmp == 1);
set_add(set2, "ramu");
2011-01-02 06:39:32 +03:00
const char *str21 = set_fini(set2, 16);
2010-09-11 01:34:05 +04:00
fprintf(stderr, "set21=%s\n", str21);
cmp = rpmsetcmp(str10, str21);
assert(cmp == 0);
set_add(set2, "baba");
2011-01-02 06:39:32 +03:00
const char *str22 = set_fini(set2, 16);
2010-09-11 01:34:05 +04:00
cmp = rpmsetcmp(str10, str22);
assert(cmp == -1);
set_add(set1, "deda");
2011-01-02 06:39:32 +03:00
const char *str11 = set_fini(set1, 16);
2010-09-11 01:34:05 +04:00
cmp = rpmsetcmp(str11, str22);
assert(cmp == -2);
set1 = set_free(set1);
set2 = set_free(set2);
str10 = _free(str10);
str11 = _free(str11);
str20 = _free(str20);
str21 = _free(str21);
str22 = _free(str22);
fprintf(stderr, "%s: api test OK\n", __FILE__);
}
# endif
# ifdef SELF_TEST
2011-01-02 06:39:32 +03:00
int main()
2010-09-11 01:34:05 +04:00
{
test_base62();
test_golomb();
set.c: implemented two-bytes-at-a-time base62 decoding
callgrind annotations, 4.0.4-alt100.27:
1,899,576,194 PROGRAM TOTALS
694,132,522 decode_base62_golomb
583,376,772 rpmsetcmp
106,136,459 __GI_strcmp
102,459,362 __GI_strlen
...
callgrind annotations, this commit (built in hasher):
1,691,904,239 PROGRAM TOTALS
583,395,352 rpmsetcmp
486,433,168 decode_base62_golomb
106,122,657 __GI_strcmp
102,458,654 __GI_strlen
2011-05-27 06:36:14 +04:00
test_word_table();
2011-01-03 02:57:02 +03:00
test_base62_golomb();
2010-09-11 01:34:05 +04:00
test_delta();
test_set();
test_api();
return 0;
}
# endif
2011-01-02 06:39:32 +03:00
// ex: set ts=8 sts=4 sw=4 noet: