/*
 *	MMX 3DNow! library helper functions
 *
 *	To do:
 *	We can use MMX just for prefetch in IRQ's. This may be a win.
 *		(reported so on K6-III)
 *	We should use a better code neutral filler for the short jump
 *		leal ebx. [ebx] is apparently best for K6-2, but Cyrix??
 *	We also want to clobber the filler register so we don't get any
 *		register forwarding stalls on the filler.
 *
 *	Add *user handling. Checksums are not a win with MMX on any CPU
 *	tested so far for any MMX solution figured.
 *
 *	22/09/2000 - Arjan van de Ven
 *		Improved for non-engineering-sample Athlons
 *
 */
#include <linux/hardirq.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/types.h>

#include <asm/i387.h>
#include <asm/asm.h>

void *_mmx_memcpy(void *to, const void *from, size_t len)
{
	void *p;
	int i;

	if (unlikely(in_interrupt()))
		return __memcpy(to, from, len);

	p = to;
	i = len >> 6;		/* len/64 */

	kernel_fpu_begin();

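	/*
	 * Warm the cache with 3DNow! prefetches before copying.  If the
	 * prefetch instruction faults (e.g. on a CPU that traps on it),
	 * the fixup at label 3 patches label 1 with a two-byte short jump
	 * (0xEB 0x1A) over the remaining 26 bytes of prefetches, and the
	 * _ASM_EXTABLE entry routes the fault to that fixup.
	 */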
	__asm__ __volatile__ (
		"1: prefetch (%0)\n"		/* This set is 28 bytes */
		"   prefetch 64(%0)\n"
		"   prefetch 128(%0)\n"
		"   prefetch 192(%0)\n"
		"   prefetch 256(%0)\n"
		"2:\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x1AEB, 1b\n"		/* jmp on 26 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b)
			: : "r" (from));

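	/*
	 * Main loop: each iteration moves one 64-byte block through the
	 * MMX registers while prefetching 320 bytes (five blocks) ahead.
	 * Stopping at i > 5 keeps that prefetch window inside the
	 * remaining source data; the prefetch has the same style of fault
	 * fixup as above (a short jump over its 5 bytes).
	 */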
	for ( ; i > 5; i--) {
		__asm__ __volatile__ (
		"1: prefetch 320(%0)\n"
		"2: movq (%0), %%mm0\n"
		"   movq 8(%0), %%mm1\n"
		"   movq 16(%0), %%mm2\n"
		"   movq 24(%0), %%mm3\n"
		"   movq %%mm0, (%1)\n"
		"   movq %%mm1, 8(%1)\n"
		"   movq %%mm2, 16(%1)\n"
		"   movq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm0\n"
		"   movq 40(%0), %%mm1\n"
		"   movq 48(%0), %%mm2\n"
		"   movq 56(%0), %%mm3\n"
		"   movq %%mm0, 32(%1)\n"
		"   movq %%mm1, 40(%1)\n"
		"   movq %%mm2, 48(%1)\n"
		"   movq %%mm3, 56(%1)\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x05EB, 1b\n"		/* jmp on 5 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b)
			: : "r" (from), "r" (to) : "memory");

		from += 64;
		to += 64;
	}

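	/*
	 * At most five 64-byte blocks remain; copy them the same way,
	 * but without prefetching.
	 */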
	for ( ; i > 0; i--) {
		__asm__ __volatile__ (
		"   movq (%0), %%mm0\n"
		"   movq 8(%0), %%mm1\n"
		"   movq 16(%0), %%mm2\n"
		"   movq 24(%0), %%mm3\n"
		"   movq %%mm0, (%1)\n"
		"   movq %%mm1, 8(%1)\n"
		"   movq %%mm2, 16(%1)\n"
		"   movq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm0\n"
		"   movq 40(%0), %%mm1\n"
		"   movq 48(%0), %%mm2\n"
		"   movq 56(%0), %%mm3\n"
		"   movq %%mm0, 32(%1)\n"
		"   movq %%mm1, 40(%1)\n"
		"   movq %%mm2, 48(%1)\n"
		"   movq %%mm3, 56(%1)\n"
			: : "r" (from), "r" (to) : "memory");

		from += 64;
		to += 64;
	}

	/*
	 * Now do the tail of the block:
	 */
	__memcpy(to, from, len & 63);
	kernel_fpu_end();

	return p;
}
EXPORT_SYMBOL(_mmx_memcpy);

#ifdef CONFIG_MK7

/*
 *	The K7 has streaming cache bypass load/store. The Cyrix III, K6 and
 *	other MMX using processors do not.
 */
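/*
 * "movntq" below is such a streaming store: it bypasses the cache, and
 * because non-temporal stores are only weakly ordered, each routine here
 * ends with an "sfence" before returning (see the comments at the fences).
 */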
static void fast_clear_page(void *page)
{
	int i;

	kernel_fpu_begin();

	__asm__ __volatile__ (
		"  pxor %%mm0, %%mm0\n" : :
	);

	for (i = 0; i < 4096/64; i++) {
		__asm__ __volatile__ (
		"  movntq %%mm0, (%0)\n"
		"  movntq %%mm0, 8(%0)\n"
		"  movntq %%mm0, 16(%0)\n"
		"  movntq %%mm0, 24(%0)\n"
		"  movntq %%mm0, 32(%0)\n"
		"  movntq %%mm0, 40(%0)\n"
		"  movntq %%mm0, 48(%0)\n"
		"  movntq %%mm0, 56(%0)\n"
			: : "r" (page) : "memory");
		page += 64;
	}

	/*
	 * Since movntq is weakly-ordered, a "sfence" is needed to become
	 * ordered again:
	 */
	__asm__ __volatile__("sfence\n" : :);

	kernel_fpu_end();
}

static void fast_copy_page(void *to, void *from)
{
	int i;

	kernel_fpu_begin();

	/*
	 * maybe the prefetch stuff can go before the expensive fnsave...
	 * but that is for later. -AV
	 */
	__asm__ __volatile__ (
		"1: prefetch (%0)\n"
		"   prefetch 64(%0)\n"
		"   prefetch 128(%0)\n"
		"   prefetch 192(%0)\n"
		"   prefetch 256(%0)\n"
		"2:\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x1AEB, 1b\n"		/* jmp on 26 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b) : : "r" (from));

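	/*
	 * Copy the page in 64-byte chunks with streaming movntq stores,
	 * prefetching 320 bytes ahead.  This loop covers only the first
	 * 4096 - 320 bytes; the final 320 bytes are copied by the loop
	 * below without prefetch, so nothing past the end of the source
	 * page is prefetched.
	 */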
	for (i = 0; i < (4096-320)/64; i++) {
		__asm__ __volatile__ (
		"1: prefetch 320(%0)\n"
		"2: movq (%0), %%mm0\n"
		"   movntq %%mm0, (%1)\n"
		"   movq 8(%0), %%mm1\n"
		"   movntq %%mm1, 8(%1)\n"
		"   movq 16(%0), %%mm2\n"
		"   movntq %%mm2, 16(%1)\n"
		"   movq 24(%0), %%mm3\n"
		"   movntq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm4\n"
		"   movntq %%mm4, 32(%1)\n"
		"   movq 40(%0), %%mm5\n"
		"   movntq %%mm5, 40(%1)\n"
		"   movq 48(%0), %%mm6\n"
		"   movntq %%mm6, 48(%1)\n"
		"   movq 56(%0), %%mm7\n"
		"   movntq %%mm7, 56(%1)\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x05EB, 1b\n"		/* jmp on 5 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b) : : "r" (from), "r" (to) : "memory");

		from += 64;
		to += 64;
	}

	for (i = (4096-320)/64; i < 4096/64; i++) {
		__asm__ __volatile__ (
		"2: movq (%0), %%mm0\n"
		"   movntq %%mm0, (%1)\n"
		"   movq 8(%0), %%mm1\n"
		"   movntq %%mm1, 8(%1)\n"
		"   movq 16(%0), %%mm2\n"
		"   movntq %%mm2, 16(%1)\n"
		"   movq 24(%0), %%mm3\n"
		"   movntq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm4\n"
		"   movntq %%mm4, 32(%1)\n"
		"   movq 40(%0), %%mm5\n"
		"   movntq %%mm5, 40(%1)\n"
		"   movq 48(%0), %%mm6\n"
		"   movntq %%mm6, 48(%1)\n"
		"   movq 56(%0), %%mm7\n"
		"   movntq %%mm7, 56(%1)\n"
			: : "r" (from), "r" (to) : "memory");

		from += 64;
		to += 64;
	}

	/*
	 * Since movntq is weakly-ordered, a "sfence" is needed to become
	 * ordered again:
	 */
	__asm__ __volatile__("sfence\n" : :);

	kernel_fpu_end();
}

#else /* CONFIG_MK7 */

/*
 *	Generic MMX implementation without K7 specific streaming
 */
static void fast_clear_page(void *page)
{
	int i;

	kernel_fpu_begin();

	__asm__ __volatile__ (
		"  pxor %%mm0, %%mm0\n" : :
	);

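	/*
	 * Zero the page with ordinary (cached) movq stores, 128 bytes per
	 * iteration; there is no movntq here, so no sfence is needed
	 * either.
	 */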
	for (i = 0; i < 4096/128; i++) {
		__asm__ __volatile__ (
		"  movq %%mm0, (%0)\n"
		"  movq %%mm0, 8(%0)\n"
		"  movq %%mm0, 16(%0)\n"
		"  movq %%mm0, 24(%0)\n"
		"  movq %%mm0, 32(%0)\n"
		"  movq %%mm0, 40(%0)\n"
		"  movq %%mm0, 48(%0)\n"
		"  movq %%mm0, 56(%0)\n"
		"  movq %%mm0, 64(%0)\n"
		"  movq %%mm0, 72(%0)\n"
		"  movq %%mm0, 80(%0)\n"
		"  movq %%mm0, 88(%0)\n"
		"  movq %%mm0, 96(%0)\n"
		"  movq %%mm0, 104(%0)\n"
		"  movq %%mm0, 112(%0)\n"
		"  movq %%mm0, 120(%0)\n"
			: : "r" (page) : "memory");
		page += 128;
	}

	kernel_fpu_end();
}

static void fast_copy_page(void *to, void *from)
{
	int i;

	kernel_fpu_begin();

	__asm__ __volatile__ (
		"1: prefetch (%0)\n"
		"   prefetch 64(%0)\n"
		"   prefetch 128(%0)\n"
		"   prefetch 192(%0)\n"
		"   prefetch 256(%0)\n"
		"2:\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x1AEB, 1b\n"		/* jmp on 26 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b) : : "r" (from));

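	/*
	 * Same 64-bytes-per-iteration MMX copy loop as in _mmx_memcpy(),
	 * applied to a whole 4096-byte page; plain movq stores are used
	 * here rather than the K7 movntq streaming stores.
	 */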
	for (i = 0; i < 4096/64; i++) {
		__asm__ __volatile__ (
		"1: prefetch 320(%0)\n"
		"2: movq (%0), %%mm0\n"
		"   movq 8(%0), %%mm1\n"
		"   movq 16(%0), %%mm2\n"
		"   movq 24(%0), %%mm3\n"
		"   movq %%mm0, (%1)\n"
		"   movq %%mm1, 8(%1)\n"
		"   movq %%mm2, 16(%1)\n"
		"   movq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm0\n"
		"   movq 40(%0), %%mm1\n"
		"   movq 48(%0), %%mm2\n"
		"   movq 56(%0), %%mm3\n"
		"   movq %%mm0, 32(%1)\n"
		"   movq %%mm1, 40(%1)\n"
		"   movq %%mm2, 48(%1)\n"
		"   movq %%mm3, 56(%1)\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x05EB, 1b\n"		/* jmp on 5 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b)
			: : "r" (from), "r" (to) : "memory");

		from += 64;
		to += 64;
	}

	kernel_fpu_end();
}

#endif /* !CONFIG_MK7 */

/*
 * Favour MMX for page clear and copy:
 */
static void slow_zero_page(void *page)
{
	int d0, d1;

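	/* "rep ; stosl" with %eax == 0 and %ecx == 1024 zeroes one 4096-byte page */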
	__asm__ __volatile__(
		"cld\n\t"
		"rep ; stosl"
			: "=&c" (d0), "=&D" (d1)
			: "a" (0), "1" (page), "0" (1024)
			: "memory");
}

void mmx_clear_page(void *page)
{
	if (unlikely(in_interrupt()))
		slow_zero_page(page);
	else
		fast_clear_page(page);
}
EXPORT_SYMBOL(mmx_clear_page);

static void slow_copy_page(void *to, void *from)
{
	int d0, d1, d2;

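	/* "rep ; movsl" copies 1024 longs, i.e. one 4096-byte page */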
	__asm__ __volatile__(
		"cld\n\t"
		"rep ; movsl"
			: "=&c" (d0), "=&D" (d1), "=&S" (d2)
			: "0" (1024), "1" ((long) to), "2" ((long) from)
			: "memory");
}

void mmx_copy_page(void *to, void *from)
{
	if (unlikely(in_interrupt()))
		slow_copy_page(to, from);
	else
		fast_copy_page(to, from);
}
EXPORT_SYMBOL(mmx_copy_page);