2024-03-22 19:26:33 -04:00
// SPDX-License-Identifier: GPL-2.0
# include "eytzinger.h"
/**
 * is_aligned - is this pointer & size okay for word-wide copying?
 * @base: pointer to data
 * @size: size of each element
 * @align: required alignment (typically 4 or 8)
 *
 * Returns true if elements can be copied using word loads and stores.
 * The size must be a multiple of the alignment, and the base address must
 * be aligned as well if we do not have CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS.
 *
 * Only the low byte of @size and @base is inspected, which is enough for
 * the small power-of-two alignments (4 or 8) this file passes in.
 *
 * For some reason, gcc doesn't know to optimize "if (a & mask || b & mask)"
 * to "if ((a | b) & mask)", so we do that by hand.
 */
__attribute_const__ __always_inline
static bool is_aligned(const void *base, size_t size, unsigned char align)
{
	unsigned char lsbits = (unsigned char)size;

	/* Keeps @base "used" when the #ifndef below compiles out. */
	(void)base;
#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
	lsbits |= (unsigned char)(uintptr_t)base;
#endif
	return (lsbits & (align - 1)) == 0;
}
/**
 * swap_words_32 - swap two elements in 32-bit chunks
 * @a: pointer to the first element to swap
 * @b: pointer to the second element to swap
 * @n: element size (must be a non-zero multiple of 4)
 *
 * Exchange the two objects in memory.  This exploits base+index addressing,
 * which basically all CPUs have, to minimize loop overhead computations.
 *
 * The loop is a do/while that decrements @n before testing it, so calling
 * this with @n == 0 would wrap the counter; callers must not do that.
 *
 * For some reason, on x86 gcc 7.3.0 adds a redundant test of n at the
 * bottom of the loop, even though the zero flag is still valid from the
 * subtract (since the intervening mov instructions don't alter the flags).
 * Gcc 8.1.0 doesn't have that problem.
 */
static void swap_words_32(void *a, void *b, size_t n)
{
	do {
		/* Walk from the last word down to offset 0. */
		u32 t = *(u32 *)(a + (n -= 4));
		*(u32 *)(a + n) = *(u32 *)(b + n);
		*(u32 *)(b + n) = t;
	} while (n);
}
/**
 * swap_words_64 - swap two elements in 64-bit chunks
 * @a: pointer to the first element to swap
 * @b: pointer to the second element to swap
 * @n: element size (must be a non-zero multiple of 8)
 *
 * Exchange the two objects in memory.  This exploits base+index
 * addressing, which basically all CPUs have, to minimize loop overhead
 * computations.
 *
 * We'd like to use 64-bit loads if possible.  If they're not available,
 * emulating one requires base+index+4 addressing which x86 has but most
 * other processors do not.  If CONFIG_64BIT, we definitely have 64-bit
 * loads, but it's possible to have 64-bit loads without 64-bit pointers
 * (e.g. x32 ABI).  Are there any cases the kernel needs to worry about?
 *
 * As with the other swap helpers, @n is decremented before it is tested,
 * so @n == 0 must never be passed in.
 */
static void swap_words_64(void *a, void *b, size_t n)
{
	do {
#ifdef CONFIG_64BIT
		u64 t = *(u64 *)(a + (n -= 8));
		*(u64 *)(a + n) = *(u64 *)(b + n);
		*(u64 *)(b + n) = t;
#else
		/* Use two 32-bit transfers to avoid base+index+4 addressing */
		u32 t = *(u32 *)(a + (n -= 4));
		*(u32 *)(a + n) = *(u32 *)(b + n);
		*(u32 *)(b + n) = t;

		t = *(u32 *)(a + (n -= 4));
		*(u32 *)(a + n) = *(u32 *)(b + n);
		*(u32 *)(b + n) = t;
#endif
	} while (n);
}
/**
 * swap_bytes - swap two elements a byte at a time
 * @a: pointer to the first element to swap
 * @b: pointer to the second element to swap
 * @n: element size (must be non-zero)
 *
 * This is the fallback if alignment doesn't allow using larger chunks.
 *
 * The index is pre-decremented, so bytes are exchanged from offset @n - 1
 * down to offset 0.  Passing @n == 0 would wrap the counter.
 */
static void swap_bytes(void *a, void *b, size_t n)
{
	do {
		char t = ((char *)a)[--n];
		((char *)a)[n] = ((char *)b)[n];
		((char *)b)[n] = t;
	} while (n);
}
/*
 * Sentinel values stored in a swap_r_func_t to select one of the built-in
 * swap helpers instead of calling through a real function pointer.
 *
 * The values are arbitrary as long as they can't be confused with
 * a pointer, but small integers make for the smallest compare
 * instructions.
 *
 * NOTE(review): SWAP_WORDS_64 is (swap_r_func_t)0, so it compares equal
 * to a NULL swap_func, which eytzinger0_sort_r() treats as "pick a
 * default" - confirm that overlap is intended.
 */
# define SWAP_WORDS_64 (swap_r_func_t)0
# define SWAP_WORDS_32 (swap_r_func_t)1
# define SWAP_BYTES (swap_r_func_t)2
# define SWAP_WRAPPER (swap_r_func_t)3
struct wrapper {
cmp_func_t cmp ;
2024-04-06 16:19:20 +02:00
swap_func_t swap_func ;
2024-03-22 19:26:33 -04:00
} ;
/*
 * do_swap - exchange two elements using the selected swap strategy
 * @a, @b: the two elements
 * @size: element size in bytes
 * @swap_func: a real swap callback, or one of the SWAP_* sentinels
 * @priv: caller context; a struct wrapper when @swap_func is SWAP_WRAPPER
 *
 * SWAP_WRAPPER forwards to the priv-less callback stored in the wrapper.
 * SWAP_WORDS_64, SWAP_WORDS_32 and SWAP_BYTES select the built-in helpers.
 * Any other value is called directly as a swap_r_func_t.  Callbacks take
 * the size as an int, hence the casts.
 *
 * The function pointer comes after the data arguments to make tail calls
 * most efficient if the compiler decides not to inline this function.
 */
static void do_swap(void *a, void *b, size_t size, swap_r_func_t swap_func, const void *priv)
{
	if (swap_func == SWAP_WRAPPER) {
		((const struct wrapper *)priv)->swap_func(a, b, (int)size);
		return;
	}

	if (swap_func == SWAP_WORDS_64)
		swap_words_64(a, b, size);
	else if (swap_func == SWAP_WORDS_32)
		swap_words_32(a, b, size);
	else if (swap_func == SWAP_BYTES)
		swap_bytes(a, b, size);
	else
		swap_func(a, b, (int)size, priv);
}
/* Sentinel: the real, priv-less comparison lives in a struct wrapper at @priv. */
#define _CMP_WRAPPER ((cmp_r_func_t)0L)

/*
 * do_cmp - compare two elements
 * @a, @b: the two elements
 * @cmp: comparison callback, or _CMP_WRAPPER
 * @priv: caller context; a struct wrapper when @cmp is _CMP_WRAPPER
 *
 * Returns whatever the underlying comparison callback returns.
 */
static int do_cmp(const void *a, const void *b, cmp_r_func_t cmp, const void *priv)
{
	if (cmp == _CMP_WRAPPER)
		return ((const struct wrapper *)priv)->cmp(a, b);
	return cmp(a, b, priv);
}
/*
 * Compare the elements with heap indices @l and @r.  Each index is run
 * through inorder_to_eytzinger0() to find the array slot that holds it.
 */
static inline int eytzinger0_do_cmp(void *base, size_t n, size_t size,
				    cmp_r_func_t cmp_func, const void *priv,
				    size_t l, size_t r)
{
	void *lhs = base + inorder_to_eytzinger0(l, n) * size;
	void *rhs = base + inorder_to_eytzinger0(r, n) * size;

	return do_cmp(lhs, rhs, cmp_func, priv);
}
/*
 * Exchange the elements with heap indices @l and @r.  Each index is run
 * through inorder_to_eytzinger0() to find the array slot that holds it.
 */
static inline void eytzinger0_do_swap(void *base, size_t n, size_t size,
				      swap_r_func_t swap_func, const void *priv,
				      size_t l, size_t r)
{
	void *lhs = base + inorder_to_eytzinger0(l, n) * size;
	void *rhs = base + inorder_to_eytzinger0(r, n) * size;

	do_swap(lhs, rhs, size, swap_func, priv);
}
/*
 * Bottom-up sift-down of the element at heap index @root within the heap
 * formed by heap indices [0, @heap_len).  Child selection keeps the larger
 * child on the path, so this maintains a max-heap.
 *
 * @n is always the full element count: it is what inorder_to_eytzinger0()
 * needs to map a heap index to an array slot, and it does not shrink as
 * the sorted tail grows.
 */
static void eytzinger0_sift_down(void *base, size_t n, size_t size,
				 cmp_r_func_t cmp_func,
				 swap_r_func_t swap_func,
				 const void *priv,
				 size_t root, size_t heap_len)
{
	size_t j = root, k;

	/* Find the sift-down path all the way to the leaves. */
	while (k = j * 2 + 1, k + 1 < heap_len)
		j = eytzinger0_do_cmp(base, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1;

	/* Special case for the last leaf with no sibling. */
	if (j * 2 + 2 == heap_len)
		j = j * 2 + 1;

	/* Backtrack to the correct location. */
	while (j != root && eytzinger0_do_cmp(base, n, size, cmp_func, priv, root, j) >= 0)
		j = (j - 1) / 2;

	/* Shift the element into its correct place. */
	for (k = j; j != root;) {
		j = (j - 1) / 2;
		eytzinger0_do_swap(base, n, size, swap_func, priv, j, k);
	}
}

/**
 * eytzinger0_sort_r - sort an array into eytzinger0 order
 * @base: pointer to the first element
 * @n: number of elements
 * @size: size of each element in bytes
 * @cmp_func: comparison callback (or _CMP_WRAPPER, see eytzinger0_sort())
 * @swap_func: swap callback, NULL to pick a built-in one, or SWAP_WRAPPER
 * @priv: opaque pointer handed to the callbacks
 *
 * Heapsort in which heap index i lives at array slot
 * inorder_to_eytzinger0(i, n).
 *
 * This is the bottom-up heapsort variant: while sifting down, only the two
 * children are compared at each level until a leaf is reached, then the
 * path is walked back up to find the final position.  That reduces the
 * average number of comparisons from 2*n*log2(n) - 3*n + o(n) to
 * n*log2(n) + 0.37*n + o(n), i.e. roughly 50% fewer for large n.
 *
 * Reference: Ingo Wegener, "BOTTOM-UP-HEAPSORT, a new variant of HEAPSORT
 * beating, on an average, QUICKSORT (if n is not very small)", Theoretical
 * Computer Science 118(1), pages 81-98, 1993.
 * https://doi.org/10.1016/0304-3975(93)90364-Y
 */
void eytzinger0_sort_r(void *base, size_t n, size_t size,
		       cmp_r_func_t cmp_func,
		       swap_r_func_t swap_func,
		       const void *priv)
{
	size_t i;

	/*
	 * Fewer than two elements need no work.  A zero element size must
	 * never reach the built-in swap helpers, which decrement the length
	 * before testing it and would wrap.  Returning here also keeps the
	 * unsigned loop counters below from underflowing.
	 */
	if (n < 2 || !size)
		return;

	/* called from 'sort' without swap function, let's pick the default */
	if (swap_func == SWAP_WRAPPER && !((const struct wrapper *)priv)->swap_func)
		swap_func = NULL;

	if (!swap_func) {
		if (is_aligned(base, size, 8))
			swap_func = SWAP_WORDS_64;
		else if (is_aligned(base, size, 4))
			swap_func = SWAP_WORDS_32;
		else
			swap_func = SWAP_BYTES;
	}

	/* heapify: sift down every internal node, last one first */
	for (i = n / 2; i-- > 0;)
		eytzinger0_sift_down(base, n, size, cmp_func, swap_func, priv, i, n);

	/* sort: move the current maximum behind the heap, then repair the heap */
	for (i = n - 1; i > 0; --i) {
		eytzinger0_do_swap(base, n, size, swap_func, priv, 0, i);
		eytzinger0_sift_down(base, n, size, cmp_func, swap_func, priv, 0, i);
	}
}
/**
 * eytzinger0_sort - sort an array into eytzinger0 order, priv-less callbacks
 * @base: pointer to the first element
 * @n: number of elements
 * @size: size of each element in bytes
 * @cmp_func: comparison callback
 * @swap_func: swap callback, or NULL to let eytzinger0_sort_r() pick one
 *
 * Packs the two callbacks into a struct wrapper and hands it to
 * eytzinger0_sort_r() as @priv, using the _CMP_WRAPPER / SWAP_WRAPPER
 * sentinels so the callbacks are invoked without a priv argument.
 */
void eytzinger0_sort(void *base, size_t n, size_t size,
		     cmp_func_t cmp_func,
		     swap_func_t swap_func)
{
	struct wrapper w = {
		.cmp = cmp_func,
		.swap_func = swap_func,
	};

	eytzinger0_sort_r(base, n, size, _CMP_WRAPPER, SWAP_WRAPPER, &w);
}
/*
 * bcachefs: Optimize eytzinger0_sort() with bottom-up heapsort
 *
 * This optimization reduces the average number of comparisons required
 * from 2*n*log2(n) - 3*n + o(n) to n*log2(n) + 0.37*n + o(n). When n is
 * sufficiently large, it results in approximately 50% fewer comparisons.
 *
 * Currently, eytzinger0_sort employs the textbook version of heapsort,
 * where during the heapify process, each level requires two comparisons
 * to determine the maximum among three elements. In contrast, the
 * bottom-up heapsort, during heapify, only compares two children at each
 * level until reaching a leaf node. Then, it backtracks from the leaf
 * node to find the correct position. Since heapify typically continues
 * until very close to the leaf node, the standard heapify requires about
 * 2*log2(n) comparisons, while the bottom-up variant only needs log2(n)
 * comparisons.
 *
 * The experimental data presented below is based on an array generated
 * by get_random_u32().
 *
 * | N     | comparisons(old) | comparisons(new) | time(old) | time(new) |
 * |-------|------------------|------------------|-----------|-----------|
 * | 10000 | 235381           | 136615           | 25545 us  | 20366 us  |
 * | 20000 | 510694           | 293425           | 31336 us  | 18312 us  |
 * | 30000 | 800384           | 457412           | 35042 us  | 27386 us  |
 * | 40000 | 1101617          | 626831           | 48779 us  | 38253 us  |
 * | 50000 | 1409762          | 799637           | 62238 us  | 46950 us  |
 * | 60000 | 1721191          | 974521           | 75588 us  | 58367 us  |
 * | 70000 | 2038536          | 1152171          | 90823 us  | 68778 us  |
 * | 80000 | 2362958          | 1333472          | 104165 us | 78625 us  |
 * | 90000 | 2690900          | 1516065          | 116111 us | 89573 us  |
 * | 100000| 3019413          | 1699879          | 133638 us | 100998 us |
 *
 * Refs:
 *   BOTTOM-UP-HEAPSORT, a new variant of HEAPSORT beating, on an average,
 *   QUICKSORT (if n is not very small)
 *   Ingo Wegener
 *   Theoretical Computer Science, 118(1); Pages 81-98, 13 September 1993
 *   https://doi.org/10.1016/0304-3975(93)90364-Y
 *
 * Signed-off-by: Kuan-Wei Chiu <visitorckw@gmail.com>
 * Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
 * 2024-04-07 11:39:04 +08:00
 */
#if 0
# include <linux/slab.h>
# include <linux/random.h>
# include <linux/ktime.h>
static u64 cmp_count ;
/*
 * Three-way comparison of two u32 values for the benchmark below.
 * Every call is tallied in cmp_count.
 */
static int mycmp(const void *a, const void *b)
{
	u32 _a = *(const u32 *)a;
	u32 _b = *(const u32 *)b;

	cmp_count++;

	if (_a < _b)
		return -1;
	else if (_a > _b)
		return 1;
	else
		return 0;
}
/*
 * Benchmark and self-check for eytzinger0_sort().
 *
 * For N = 10000, 20000, ..., 100000: fill an array with random u32 values,
 * sort it, log the elapsed microseconds and the number of comparisons, then
 * walk it with eytzinger0_for_each() and verify the values never decrease.
 *
 * Returns 0 on success, -1 if an allocation fails or an out-of-order pair
 * is found.
 */
static int test(void)
{
	size_t N, i;
	ktime_t start, end;
	s64 delta;
	u32 *arr;

	for (N = 10000; N <= 100000; N += 10000) {
		arr = kmalloc_array(N, sizeof(*arr), GFP_KERNEL);
		if (!arr)
			return -1;

		cmp_count = 0;

		for (i = 0; i < N; i++)
			arr[i] = get_random_u32();

		start = ktime_get();
		eytzinger0_sort(arr, N, sizeof(*arr), mycmp, NULL);
		end = ktime_get();

		delta = ktime_us_delta(end, start);
		printk(KERN_INFO "time: %lld\n", delta);
		/* cmp_count is a u64, so it needs the unsigned conversion. */
		printk(KERN_INFO "comparisons: %llu\n", cmp_count);

		u32 prev = 0;

		eytzinger0_for_each(i, N) {
			if (prev > arr[i])
				goto err;
			prev = arr[i];
		}

		kfree(arr);
	}
	return 0;

err:
	kfree(arr);
	return -1;
}
# endif