2005-04-16 15:20:36 -07:00
/*
* Optimized memory copy routines .
*
* Copyright ( C ) 2004 Randolph Chung < tausq @ debian . org >
*
* This program is free software ; you can redistribute it and / or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation ; either version 2 , or ( at your option )
* any later version .
*
* This program is distributed in the hope that it will be useful ,
* but WITHOUT ANY WARRANTY ; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the
* GNU General Public License for more details .
*
* You should have received a copy of the GNU General Public License
* along with this program ; if not , write to the Free Software
* Foundation , Inc . , 675 Mass Ave , Cambridge , MA 0213 9 , USA .
*
* Portions derived from the GNU C Library
* Copyright ( C ) 1991 , 1997 , 2003 Free Software Foundation , Inc .
*
* Several strategies are tried to try to get the best performance for various
* conditions . In the optimal case , we copy 64 - bytes in an unrolled loop using
* fp regs . This is followed by loops that copy 32 - or 16 - bytes at a time using
* general registers . Unaligned copies are handled either by aligning the
* destination and then using shift - and - write method , or in a few cases by
* falling back to a byte - at - a - time copy .
*
* I chose to implement this in C because it is easier to maintain and debug ,
* and in my experiments it appears that the C code generated by gcc ( 3.3 / 3.4
* at the time of writing ) is fairly optimal . Unfortunately some of the
* semantics of the copy routine ( exception handling ) is difficult to express
* in C , so we have to play some tricks to get it to work .
*
* All the loads and stores are done via explicit asm ( ) code in order to use
* the right space registers .
*
* Testing with various alignments and buffer sizes shows that this code is
* often > 10 x faster than a simple byte - at - a - time copy , even for strangely
* aligned operands . It is interesting to note that the glibc version
* of memcpy ( written in C ) is actually quite fast already . This routine is
* able to beat it by 30 - 40 % for aligned copies because of the loop unrolling ,
* but in some cases the glibc version is still slightly faster . This lends
* more credibility that gcc can generate very good code as long as we are
* careful .
*
* TODO :
* - cache prefetching needs more experimentation to get optimal settings
* - try not to use the post - increment address modifiers ; they create additional
* interlocks
* - replace byte - copy loops with stybs sequences
*/
# ifdef __KERNEL__
# include <linux/module.h>
# include <linux/compiler.h>
# include <asm/uaccess.h>
# define s_space "%%sr1"
# define d_space "%%sr2"
# else
# include "memcpy.h"
# define s_space "%%sr0"
# define d_space "%%sr0"
# define pa_memcpy new2_copy
# endif
DECLARE_PER_CPU ( struct exception_data , exception_data ) ;
# define preserve_branch(label) do { \
volatile int dummy ; \
/* The following branch is never taken, it's just here to */ \
/* prevent gcc from optimizing away our exception code. */ \
if ( unlikely ( dummy ! = dummy ) ) \
goto label ; \
} while ( 0 )
# define get_user_space() (segment_eq(get_fs(), KERNEL_DS) ? 0 : mfsp(3))
# define get_kernel_space() (0)
# define MERGE(w0, sh_1, w1, sh_2) ({ \
unsigned int _r ; \
asm volatile ( \
" mtsar %3 \n " \
" shrpw %1, %2, %%sar, %0 \n " \
: " =r " ( _r ) \
: " r " ( w0 ) , " r " ( w1 ) , " r " ( sh_2 ) \
) ; \
_r ; \
} )
# define THRESHOLD 16
# ifdef DEBUG_MEMCPY
# define DPRINTF(fmt, args...) do { printk(KERN_DEBUG "%s:%d:%s ", __FILE__, __LINE__, __FUNCTION__ ); printk(KERN_DEBUG fmt, ##args ); } while (0)
# else
# define DPRINTF(fmt, args...)
# endif
# ifndef __LP64__
# define EXC_WORD ".word"
# else
# define EXC_WORD ".dword"
# endif
# define def_load_ai_insn(_insn,_sz,_tt,_s,_a,_t,_e) \
__asm__ __volatile__ ( \
" 1: \t " # _insn " ,ma " # _sz " ( " _s " ,%1), %0 \n " \
" \t .section __ex_table, \" aw \" \n " \
" \t " EXC_WORD " \t 1b \n " \
" \t " EXC_WORD " \t " # _e " \n " \
" \t .previous \n " \
: _tt ( _t ) , " +r " ( _a ) \
: \
: " r8 " )
# define def_store_ai_insn(_insn,_sz,_tt,_s,_a,_t,_e) \
__asm__ __volatile__ ( \
" 1: \t " # _insn " ,ma %1, " # _sz " ( " _s " ,%0) \n " \
" \t .section __ex_table, \" aw \" \n " \
" \t " EXC_WORD " \t 1b \n " \
" \t " EXC_WORD " \t " # _e " \n " \
" \t .previous \n " \
: " +r " ( _a ) \
: _tt ( _t ) \
: " r8 " )
# define ldbma(_s, _a, _t, _e) def_load_ai_insn(ldbs,1,"=r",_s,_a,_t,_e)
# define stbma(_s, _t, _a, _e) def_store_ai_insn(stbs,1,"r",_s,_a,_t,_e)
# define ldwma(_s, _a, _t, _e) def_load_ai_insn(ldw,4,"=r",_s,_a,_t,_e)
# define stwma(_s, _t, _a, _e) def_store_ai_insn(stw,4,"r",_s,_a,_t,_e)
# define flddma(_s, _a, _t, _e) def_load_ai_insn(fldd,8,"=f",_s,_a,_t,_e)
# define fstdma(_s, _t, _a, _e) def_store_ai_insn(fstd,8,"f",_s,_a,_t,_e)
# define def_load_insn(_insn,_tt,_s,_o,_a,_t,_e) \
__asm__ __volatile__ ( \
" 1: \t " # _insn " " # _o " ( " _s " ,%1), %0 \n " \
" \t .section __ex_table, \" aw \" \n " \
" \t " EXC_WORD " \t 1b \n " \
" \t " EXC_WORD " \t " # _e " \n " \
" \t .previous \n " \
: _tt ( _t ) \
: " r " ( _a ) \
: " r8 " )
# define def_store_insn(_insn,_tt,_s,_t,_o,_a,_e) \
__asm__ __volatile__ ( \
" 1: \t " # _insn " %0, " # _o " ( " _s " ,%1) \n " \
" \t .section __ex_table, \" aw \" \n " \
" \t " EXC_WORD " \t 1b \n " \
" \t " EXC_WORD " \t " # _e " \n " \
" \t .previous \n " \
: \
: _tt ( _t ) , " r " ( _a ) \
: " r8 " )
# define ldw(_s,_o,_a,_t,_e) def_load_insn(ldw,"=r",_s,_o,_a,_t,_e)
# define stw(_s,_t,_o,_a,_e) def_store_insn(stw,"r",_s,_t,_o,_a,_e)
# ifdef CONFIG_PREFETCH
extern inline void prefetch_src ( const void * addr )
{
__asm__ ( " ldw 0( " s_space " ,%0), %%r0 " : : " r " ( addr ) ) ;
}
extern inline void prefetch_dst ( const void * addr )
{
__asm__ ( " ldd 0( " d_space " ,%0), %%r0 " : : " r " ( addr ) ) ;
}
# else
# define prefetch_src(addr)
# define prefetch_dst(addr)
# endif
/* Copy from a not-aligned src to an aligned dst, using shifts. Handles 4 words
* per loop . This code is derived from glibc .
*/
static inline unsigned long copy_dstaligned ( unsigned long dst , unsigned long src , unsigned long len , unsigned long o_dst , unsigned long o_src , unsigned long o_len )
{
/* gcc complains that a2 and a3 may be uninitialized, but actually
* they cannot be . Initialize a2 / a3 to shut gcc up .
*/
register unsigned int a0 , a1 , a2 = 0 , a3 = 0 ;
int sh_1 , sh_2 ;
struct exception_data * d ;
/* prefetch_src((const void *)src); */
/* Calculate how to shift a word read at the memory operation
aligned srcp to make it aligned for copy . */
sh_1 = 8 * ( src % sizeof ( unsigned int ) ) ;
sh_2 = 8 * sizeof ( unsigned int ) - sh_1 ;
/* Make src aligned by rounding it down. */
src & = - sizeof ( unsigned int ) ;
switch ( len % 4 )
{
case 2 :
/* a1 = ((unsigned int *) src)[0];
a2 = ( ( unsigned int * ) src ) [ 1 ] ; */
ldw ( s_space , 0 , src , a1 , cda_ldw_exc ) ;
ldw ( s_space , 4 , src , a2 , cda_ldw_exc ) ;
src - = 1 * sizeof ( unsigned int ) ;
dst - = 3 * sizeof ( unsigned int ) ;
len + = 2 ;
goto do1 ;
case 3 :
/* a0 = ((unsigned int *) src)[0];
a1 = ( ( unsigned int * ) src ) [ 1 ] ; */
ldw ( s_space , 0 , src , a0 , cda_ldw_exc ) ;
ldw ( s_space , 4 , src , a1 , cda_ldw_exc ) ;
src - = 0 * sizeof ( unsigned int ) ;
dst - = 2 * sizeof ( unsigned int ) ;
len + = 1 ;
goto do2 ;
case 0 :
if ( len = = 0 )
return 0 ;
/* a3 = ((unsigned int *) src)[0];
a0 = ( ( unsigned int * ) src ) [ 1 ] ; */
ldw ( s_space , 0 , src , a3 , cda_ldw_exc ) ;
ldw ( s_space , 4 , src , a0 , cda_ldw_exc ) ;
src - = - 1 * sizeof ( unsigned int ) ;
dst - = 1 * sizeof ( unsigned int ) ;
len + = 0 ;
goto do3 ;
case 1 :
/* a2 = ((unsigned int *) src)[0];
a3 = ( ( unsigned int * ) src ) [ 1 ] ; */
ldw ( s_space , 0 , src , a2 , cda_ldw_exc ) ;
ldw ( s_space , 4 , src , a3 , cda_ldw_exc ) ;
src - = - 2 * sizeof ( unsigned int ) ;
dst - = 0 * sizeof ( unsigned int ) ;
len - = 1 ;
if ( len = = 0 )
goto do0 ;
goto do4 ; /* No-op. */
}
do
{
/* prefetch_src((const void *)(src + 4 * sizeof(unsigned int))); */
do4 :
/* a0 = ((unsigned int *) src)[0]; */
ldw ( s_space , 0 , src , a0 , cda_ldw_exc ) ;
/* ((unsigned int *) dst)[0] = MERGE (a2, sh_1, a3, sh_2); */
stw ( d_space , MERGE ( a2 , sh_1 , a3 , sh_2 ) , 0 , dst , cda_stw_exc ) ;
do3 :
/* a1 = ((unsigned int *) src)[1]; */
ldw ( s_space , 4 , src , a1 , cda_ldw_exc ) ;
/* ((unsigned int *) dst)[1] = MERGE (a3, sh_1, a0, sh_2); */
stw ( d_space , MERGE ( a3 , sh_1 , a0 , sh_2 ) , 4 , dst , cda_stw_exc ) ;
do2 :
/* a2 = ((unsigned int *) src)[2]; */
ldw ( s_space , 8 , src , a2 , cda_ldw_exc ) ;
/* ((unsigned int *) dst)[2] = MERGE (a0, sh_1, a1, sh_2); */
stw ( d_space , MERGE ( a0 , sh_1 , a1 , sh_2 ) , 8 , dst , cda_stw_exc ) ;
do1 :
/* a3 = ((unsigned int *) src)[3]; */
ldw ( s_space , 12 , src , a3 , cda_ldw_exc ) ;
/* ((unsigned int *) dst)[3] = MERGE (a1, sh_1, a2, sh_2); */
stw ( d_space , MERGE ( a1 , sh_1 , a2 , sh_2 ) , 12 , dst , cda_stw_exc ) ;
src + = 4 * sizeof ( unsigned int ) ;
dst + = 4 * sizeof ( unsigned int ) ;
len - = 4 ;
}
while ( len ! = 0 ) ;
do0 :
/* ((unsigned int *) dst)[0] = MERGE (a2, sh_1, a3, sh_2); */
stw ( d_space , MERGE ( a2 , sh_1 , a3 , sh_2 ) , 0 , dst , cda_stw_exc ) ;
preserve_branch ( handle_load_error ) ;
preserve_branch ( handle_store_error ) ;
return 0 ;
handle_load_error :
__asm__ __volatile__ ( " cda_ldw_exc: \n " ) ;
d = & __get_cpu_var ( exception_data ) ;
DPRINTF ( " cda_ldw_exc: o_len=%lu fault_addr=%lu o_src=%lu ret=%lu \n " ,
o_len , d - > fault_addr , o_src , o_len - d - > fault_addr + o_src ) ;
return o_len * 4 - d - > fault_addr + o_src ;
handle_store_error :
__asm__ __volatile__ ( " cda_stw_exc: \n " ) ;
d = & __get_cpu_var ( exception_data ) ;
DPRINTF ( " cda_stw_exc: o_len=%lu fault_addr=%lu o_dst=%lu ret=%lu \n " ,
o_len , d - > fault_addr , o_dst , o_len - d - > fault_addr + o_dst ) ;
return o_len * 4 - d - > fault_addr + o_dst ;
}
/* Returns 0 for success, otherwise, returns number of bytes not transferred. */
unsigned long pa_memcpy ( void * dstp , const void * srcp , unsigned long len )
{
register unsigned long src , dst , t1 , t2 , t3 ;
register unsigned char * pcs , * pcd ;
register unsigned int * pws , * pwd ;
register double * pds , * pdd ;
unsigned long ret = 0 ;
unsigned long o_dst , o_src , o_len ;
struct exception_data * d ;
src = ( unsigned long ) srcp ;
dst = ( unsigned long ) dstp ;
pcs = ( unsigned char * ) srcp ;
pcd = ( unsigned char * ) dstp ;
o_dst = dst ; o_src = src ; o_len = len ;
/* prefetch_src((const void *)srcp); */
if ( len < THRESHOLD )
goto byte_copy ;
/* Check alignment */
t1 = ( src ^ dst ) ;
if ( unlikely ( t1 & ( sizeof ( double ) - 1 ) ) )
goto unaligned_copy ;
/* src and dst have same alignment. */
/* Copy bytes till we are double-aligned. */
t2 = src & ( sizeof ( double ) - 1 ) ;
if ( unlikely ( t2 ! = 0 ) ) {
t2 = sizeof ( double ) - t2 ;
while ( t2 & & len ) {
/* *pcd++ = *pcs++; */
ldbma ( s_space , pcs , t3 , pmc_load_exc ) ;
len - - ;
stbma ( d_space , t3 , pcd , pmc_store_exc ) ;
t2 - - ;
}
}
pds = ( double * ) pcs ;
pdd = ( double * ) pcd ;
2005-10-21 22:48:34 -04:00
#if 0
2005-04-16 15:20:36 -07:00
/* Copy 8 doubles at a time */
while ( len > = 8 * sizeof ( double ) ) {
register double r1 , r2 , r3 , r4 , r5 , r6 , r7 , r8 ;
/* prefetch_src((char *)pds + L1_CACHE_BYTES); */
flddma ( s_space , pds , r1 , pmc_load_exc ) ;
flddma ( s_space , pds , r2 , pmc_load_exc ) ;
flddma ( s_space , pds , r3 , pmc_load_exc ) ;
flddma ( s_space , pds , r4 , pmc_load_exc ) ;
fstdma ( d_space , r1 , pdd , pmc_store_exc ) ;
fstdma ( d_space , r2 , pdd , pmc_store_exc ) ;
fstdma ( d_space , r3 , pdd , pmc_store_exc ) ;
fstdma ( d_space , r4 , pdd , pmc_store_exc ) ;
#if 0
if ( L1_CACHE_BYTES < = 32 )
prefetch_src ( ( char * ) pds + L1_CACHE_BYTES ) ;
# endif
flddma ( s_space , pds , r5 , pmc_load_exc ) ;
flddma ( s_space , pds , r6 , pmc_load_exc ) ;
flddma ( s_space , pds , r7 , pmc_load_exc ) ;
flddma ( s_space , pds , r8 , pmc_load_exc ) ;
fstdma ( d_space , r5 , pdd , pmc_store_exc ) ;
fstdma ( d_space , r6 , pdd , pmc_store_exc ) ;
fstdma ( d_space , r7 , pdd , pmc_store_exc ) ;
fstdma ( d_space , r8 , pdd , pmc_store_exc ) ;
len - = 8 * sizeof ( double ) ;
}
2005-10-21 22:48:34 -04:00
# endif
2005-04-16 15:20:36 -07:00
pws = ( unsigned int * ) pds ;
pwd = ( unsigned int * ) pdd ;
word_copy :
while ( len > = 8 * sizeof ( unsigned int ) ) {
register unsigned int r1 , r2 , r3 , r4 , r5 , r6 , r7 , r8 ;
/* prefetch_src((char *)pws + L1_CACHE_BYTES); */
ldwma ( s_space , pws , r1 , pmc_load_exc ) ;
ldwma ( s_space , pws , r2 , pmc_load_exc ) ;
ldwma ( s_space , pws , r3 , pmc_load_exc ) ;
ldwma ( s_space , pws , r4 , pmc_load_exc ) ;
stwma ( d_space , r1 , pwd , pmc_store_exc ) ;
stwma ( d_space , r2 , pwd , pmc_store_exc ) ;
stwma ( d_space , r3 , pwd , pmc_store_exc ) ;
stwma ( d_space , r4 , pwd , pmc_store_exc ) ;
ldwma ( s_space , pws , r5 , pmc_load_exc ) ;
ldwma ( s_space , pws , r6 , pmc_load_exc ) ;
ldwma ( s_space , pws , r7 , pmc_load_exc ) ;
ldwma ( s_space , pws , r8 , pmc_load_exc ) ;
stwma ( d_space , r5 , pwd , pmc_store_exc ) ;
stwma ( d_space , r6 , pwd , pmc_store_exc ) ;
stwma ( d_space , r7 , pwd , pmc_store_exc ) ;
stwma ( d_space , r8 , pwd , pmc_store_exc ) ;
len - = 8 * sizeof ( unsigned int ) ;
}
while ( len > = 4 * sizeof ( unsigned int ) ) {
register unsigned int r1 , r2 , r3 , r4 ;
ldwma ( s_space , pws , r1 , pmc_load_exc ) ;
ldwma ( s_space , pws , r2 , pmc_load_exc ) ;
ldwma ( s_space , pws , r3 , pmc_load_exc ) ;
ldwma ( s_space , pws , r4 , pmc_load_exc ) ;
stwma ( d_space , r1 , pwd , pmc_store_exc ) ;
stwma ( d_space , r2 , pwd , pmc_store_exc ) ;
stwma ( d_space , r3 , pwd , pmc_store_exc ) ;
stwma ( d_space , r4 , pwd , pmc_store_exc ) ;
len - = 4 * sizeof ( unsigned int ) ;
}
pcs = ( unsigned char * ) pws ;
pcd = ( unsigned char * ) pwd ;
byte_copy :
while ( len ) {
/* *pcd++ = *pcs++; */
ldbma ( s_space , pcs , t3 , pmc_load_exc ) ;
stbma ( d_space , t3 , pcd , pmc_store_exc ) ;
len - - ;
}
return 0 ;
unaligned_copy :
/* possibly we are aligned on a word, but not on a double... */
if ( likely ( t1 & ( sizeof ( unsigned int ) - 1 ) ) = = 0 ) {
t2 = src & ( sizeof ( unsigned int ) - 1 ) ;
if ( unlikely ( t2 ! = 0 ) ) {
t2 = sizeof ( unsigned int ) - t2 ;
while ( t2 ) {
/* *pcd++ = *pcs++; */
ldbma ( s_space , pcs , t3 , pmc_load_exc ) ;
stbma ( d_space , t3 , pcd , pmc_store_exc ) ;
len - - ;
t2 - - ;
}
}
pws = ( unsigned int * ) pcs ;
pwd = ( unsigned int * ) pcd ;
goto word_copy ;
}
/* Align the destination. */
if ( unlikely ( ( dst & ( sizeof ( unsigned int ) - 1 ) ) ! = 0 ) ) {
t2 = sizeof ( unsigned int ) - ( dst & ( sizeof ( unsigned int ) - 1 ) ) ;
while ( t2 ) {
/* *pcd++ = *pcs++; */
ldbma ( s_space , pcs , t3 , pmc_load_exc ) ;
stbma ( d_space , t3 , pcd , pmc_store_exc ) ;
len - - ;
t2 - - ;
}
dst = ( unsigned long ) pcd ;
src = ( unsigned long ) pcs ;
}
ret = copy_dstaligned ( dst , src , len / sizeof ( unsigned int ) ,
o_dst , o_src , o_len ) ;
if ( ret )
return ret ;
pcs + = ( len & - sizeof ( unsigned int ) ) ;
pcd + = ( len & - sizeof ( unsigned int ) ) ;
len % = sizeof ( unsigned int ) ;
preserve_branch ( handle_load_error ) ;
preserve_branch ( handle_store_error ) ;
goto byte_copy ;
handle_load_error :
__asm__ __volatile__ ( " pmc_load_exc: \n " ) ;
d = & __get_cpu_var ( exception_data ) ;
DPRINTF ( " pmc_load_exc: o_len=%lu fault_addr=%lu o_src=%lu ret=%lu \n " ,
o_len , d - > fault_addr , o_src , o_len - d - > fault_addr + o_src ) ;
return o_len - d - > fault_addr + o_src ;
handle_store_error :
__asm__ __volatile__ ( " pmc_store_exc: \n " ) ;
d = & __get_cpu_var ( exception_data ) ;
DPRINTF ( " pmc_store_exc: o_len=%lu fault_addr=%lu o_dst=%lu ret=%lu \n " ,
o_len , d - > fault_addr , o_dst , o_len - d - > fault_addr + o_dst ) ;
return o_len - d - > fault_addr + o_dst ;
}
# ifdef __KERNEL__
unsigned long copy_to_user ( void __user * dst , const void * src , unsigned long len )
{
mtsp ( get_kernel_space ( ) , 1 ) ;
mtsp ( get_user_space ( ) , 2 ) ;
return pa_memcpy ( ( void __force * ) dst , src , len ) ;
}
unsigned long copy_from_user ( void * dst , const void __user * src , unsigned long len )
{
mtsp ( get_user_space ( ) , 1 ) ;
mtsp ( get_kernel_space ( ) , 2 ) ;
return pa_memcpy ( dst , ( void __force * ) src , len ) ;
}
unsigned long copy_in_user ( void __user * dst , const void __user * src , unsigned long len )
{
mtsp ( get_user_space ( ) , 1 ) ;
mtsp ( get_user_space ( ) , 2 ) ;
return pa_memcpy ( ( void __force * ) dst , ( void __force * ) src , len ) ;
}
void * memcpy ( void * dst , const void * src , size_t count )
{
mtsp ( get_kernel_space ( ) , 1 ) ;
mtsp ( get_kernel_space ( ) , 2 ) ;
pa_memcpy ( dst , src , count ) ;
return dst ;
}
EXPORT_SYMBOL ( copy_to_user ) ;
EXPORT_SYMBOL ( copy_from_user ) ;
EXPORT_SYMBOL ( copy_in_user ) ;
EXPORT_SYMBOL ( memcpy ) ;
# endif