powerpc: optimise csum_partial() call when len is constant

csum_partial is often called for small fixed length packets
for which it is suboptimal to use the generic csum_partial()
function.

For instance, in my configuration, I got:
* One place calling it with constant len 4
* Seven places calling it with constant len 8
* Three places calling it with constant len 14
* One place calling it with constant len 20
* One place calling it with constant len 24
* One place calling it with constant len 32

This patch renames csum_partial() to __csum_partial() and
implements csum_partial() as a wrapper inline function which
* uses csum_add() for small constant lengths that are multiples of 2 bytes
* uses ip_fast_csum() for other constant lengths that are multiples of 4 bytes
* uses __csum_partial() in all other cases

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Scott Wood <oss@buserror.net>
This commit is contained in:
Christophe Leroy 2016-03-07 18:44:37 +01:00 committed by Scott Wood
parent ac6082dd32
commit 7e393220b6
4 changed files with 61 additions and 28 deletions

View File

@ -12,20 +12,6 @@
#ifdef CONFIG_GENERIC_CSUM #ifdef CONFIG_GENERIC_CSUM
#include <asm-generic/checksum.h> #include <asm-generic/checksum.h>
#else #else
/*
* computes the checksum of a memory block at buff, length len,
* and adds in "sum" (32-bit)
*
* returns a 32-bit number suitable for feeding into itself
* or csum_tcpudp_magic
*
* this function must be called with even lengths, except
* for the last fragment, which may be odd
*
* it's best to have buff aligned on a 32-bit boundary
*/
extern __wsum csum_partial(const void *buff, int len, __wsum sum);
/* /*
* Computes the checksum of a memory block at src, length len, * Computes the checksum of a memory block at src, length len,
* and adds in "sum" (32-bit), while copying the block to dst. * and adds in "sum" (32-bit), while copying the block to dst.
@ -67,15 +53,6 @@ static inline __sum16 csum_fold(__wsum sum)
return (__force __sum16)(~((__force u32)sum + tmp) >> 16); return (__force __sum16)(~((__force u32)sum + tmp) >> 16);
} }
/*
* this routine is used for miscellaneous IP-like checksums, mainly
* in icmp.c
*/
static inline __sum16 ip_compute_csum(const void *buff, int len)
{
/* Checksum the whole buffer, then fold the 32-bit running sum to 16 bits. */
return csum_fold(csum_partial(buff, len, 0));
}
static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr, static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
unsigned short len, unsigned short len,
unsigned short proto, unsigned short proto,
@ -174,6 +151,62 @@ static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
return csum_fold(ip_fast_csum_nofold(iph, ihl)); return csum_fold(ip_fast_csum_nofold(iph, ihl));
} }
/*
* computes the checksum of a memory block at buff, length len,
* and adds in "sum" (32-bit)
*
* returns a 32-bit number suitable for feeding into itself
* or csum_tcpudp_magic
*
* this function must be called with even lengths, except
* for the last fragment, which may be odd
*
* it's best to have buff aligned on a 32-bit boundary
*/
__wsum __csum_partial(const void *buff, int len, __wsum sum);
/*
 * Inline wrapper around __csum_partial() that lets the compiler
 * optimise calls where 'len' is a compile-time constant:
 *  - even constant lengths up to 16 bytes become an unrolled sequence
 *    of csum_add() on 32-bit (and one optional trailing 16-bit) loads
 *  - other constant lengths that are multiples of 4 reuse
 *    ip_fast_csum_nofold()
 *  - everything else falls back to the out-of-line __csum_partial()
 */
static inline __wsum csum_partial(const void *buff, int len, __wsum sum)
{
if (__builtin_constant_p(len) && len <= 16 && (len & 1) == 0) {
/*
 * len is one of 2,4,...,16, so every 'if' below is resolved at
 * compile time: one 32-bit add per full word at offsets 0/4/8/12,
 * plus a 16-bit add when len is an odd multiple of 2.
 * NOTE(review): the casts assume buff is adequately aligned for
 * u32/u16 loads — TODO confirm this holds for all callers.
 */
if (len == 2)
sum = csum_add(sum, (__force __wsum)*(const u16 *)buff);
if (len >= 4)
sum = csum_add(sum, (__force __wsum)*(const u32 *)buff);
if (len == 6)
sum = csum_add(sum, (__force __wsum)
*(const u16 *)(buff + 4));
if (len >= 8)
sum = csum_add(sum, (__force __wsum)
*(const u32 *)(buff + 4));
if (len == 10)
sum = csum_add(sum, (__force __wsum)
*(const u16 *)(buff + 8));
if (len >= 12)
sum = csum_add(sum, (__force __wsum)
*(const u32 *)(buff + 8));
if (len == 14)
sum = csum_add(sum, (__force __wsum)
*(const u16 *)(buff + 12));
if (len >= 16)
sum = csum_add(sum, (__force __wsum)
*(const u32 *)(buff + 12));
} else if (__builtin_constant_p(len) && (len & 3) == 0) {
/* Constant multiple of 4: checksum it as len/4 32-bit words. */
sum = csum_add(sum, ip_fast_csum_nofold(buff, len >> 2));
} else {
/* Non-constant (or odd/unhandled) length: generic routine. */
sum = __csum_partial(buff, len, sum);
}
return sum;
}
/*
* this routine is used for miscellaneous IP-like checksums, mainly
* in icmp.c
*/
static inline __sum16 ip_compute_csum(const void *buff, int len)
{
return csum_fold(csum_partial(buff, len, 0));
}
#endif #endif
#endif /* __KERNEL__ */ #endif /* __KERNEL__ */
#endif #endif

View File

@ -24,9 +24,9 @@
* computes the checksum of a memory block at buff, length len, * computes the checksum of a memory block at buff, length len,
* and adds in "sum" (32-bit) * and adds in "sum" (32-bit)
* *
* csum_partial(buff, len, sum) * __csum_partial(buff, len, sum)
*/ */
_GLOBAL(csum_partial) _GLOBAL(__csum_partial)
subi r3,r3,4 subi r3,r3,4
srawi. r6,r4,2 /* Divide len by 4 and also clear carry */ srawi. r6,r4,2 /* Divide len by 4 and also clear carry */
beq 3f /* if we're doing < 4 bytes */ beq 3f /* if we're doing < 4 bytes */

View File

@ -21,9 +21,9 @@
* Computes the checksum of a memory block at buff, length len, * Computes the checksum of a memory block at buff, length len,
* and adds in "sum" (32-bit). * and adds in "sum" (32-bit).
* *
* csum_partial(r3=buff, r4=len, r5=sum) * __csum_partial(r3=buff, r4=len, r5=sum)
*/ */
_GLOBAL(csum_partial) _GLOBAL(__csum_partial)
addic r0,r5,0 /* clear carry */ addic r0,r5,0 /* clear carry */
srdi. r6,r4,3 /* less than 8 bytes? */ srdi. r6,r4,3 /* less than 8 bytes? */

View File

@ -17,7 +17,7 @@ EXPORT_SYMBOL(strcmp);
EXPORT_SYMBOL(strncmp); EXPORT_SYMBOL(strncmp);
#ifndef CONFIG_GENERIC_CSUM #ifndef CONFIG_GENERIC_CSUM
EXPORT_SYMBOL(csum_partial); EXPORT_SYMBOL(__csum_partial);
EXPORT_SYMBOL(csum_partial_copy_generic); EXPORT_SYMBOL(csum_partial_copy_generic);
#endif #endif