/*
 *  linux/arch/x86_64/entry.S
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *  Copyright (C) 2000, 2001, 2002  Andi Kleen SuSE Labs
 *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
 */

/*
 * entry.S contains the system-call and fault low-level handling routines.
 *
 * NOTE: This code handles signal recognition, which happens every time
 * after an interrupt and after each system call.
 *
 * Normal syscalls and interrupts don't save a full stack frame, this is
 * only done for syscall tracing, signals or fork/exec et al.
 *
 * A note on terminology:
 * - top of stack: Architecture defined interrupt frame from SS to RIP
 * at the top of the kernel process stack.
 * - partial stack frame: partially saved registers up to R11.
 * - full stack frame: Like partial stack frame, but all registers saved.
 *
 * Some macro usage:
 * - CFI macros are used to generate dwarf2 unwind information for better
 * backtraces. They don't change any code.
 * - SAVE_ALL/RESTORE_ALL - Save/restore all registers
 * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify.
 * There are unfortunately lots of special cases where some registers are
 * not touched. The macro is a big mess that should be cleaned up.
 * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS.
 * Gives a full stack frame.
 * - ENTRY/END - Define functions in the symbol table.
 * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack
 * frame that is otherwise undefined after a SYSCALL.
 * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging.
 * - errorentry/paranoidentry/zeroentry - Define exception entry points.
 */
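
/*
 * For orientation only: an exception entry point defined by zeroentry
 * (e.g. the overflow vector) is expected to expand to roughly the shape
 * below - push a fake ORIG_RAX, make room for the register area, let
 * error_entry save the registers, then hand a pt_regs pointer and error
 * code to the C handler. This is a sketch of the generated stub, not
 * code assembled here; exact offsets and labels come from the macro
 * definitions, and paravirt kernels add an indirect call up front.
 *
 *	pushq $-1			# ORIG_RAX: no syscall to restart
 *	subq  $15*8, %rsp		# space for registers
 *	call  error_entry
 *	movq  %rsp, %rdi		# pt_regs pointer
 *	xorl  %esi, %esi		# no error code
 *	call  do_overflow
 *	jmp   error_exit
 */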

#include <linux/linkage.h>
#include <asm/segment.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include <asm/dwarf2.h>
#include <asm/calling.h>
#include <asm/asm-offsets.h>
#include <asm/msr.h>
#include <asm/unistd.h>
#include <asm/thread_info.h>
#include <asm/hw_irq.h>
#include <asm/page.h>
#include <asm/irqflags.h>
#include <asm/paravirt.h>
#include <asm/ftrace.h>

/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
#include <linux/elf-em.h>
#define AUDIT_ARCH_X86_64	(EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
#define __AUDIT_ARCH_64BIT	0x80000000
#define __AUDIT_ARCH_LE		0x40000000

	.code64

#ifdef CONFIG_FUNCTION_TRACER
#ifdef CONFIG_DYNAMIC_FTRACE
ENTRY(mcount)
	retq
END(mcount)

ENTRY(ftrace_caller)
	cmpl $0, function_trace_stop
	jne  ftrace_stub

	MCOUNT_SAVE_FRAME

	movq 0x38(%rsp), %rdi
	movq 8(%rbp), %rsi
	subq $MCOUNT_INSN_SIZE, %rdi

.globl ftrace_call
ftrace_call:
	call ftrace_stub

	MCOUNT_RESTORE_FRAME

#ifdef CONFIG_FUNCTION_GRAPH_TRACER
.globl ftrace_graph_call
ftrace_graph_call:
	jmp ftrace_stub
#endif

.globl ftrace_stub
ftrace_stub:
	retq
END(ftrace_caller)

#else /* ! CONFIG_DYNAMIC_FTRACE */
ENTRY(mcount)
	cmpl $0, function_trace_stop
	jne  ftrace_stub

	cmpq $ftrace_stub, ftrace_trace_function
	jnz trace

#ifdef CONFIG_FUNCTION_GRAPH_TRACER
	cmpq $ftrace_stub, ftrace_graph_return
	jnz ftrace_graph_caller

	cmpq $ftrace_graph_entry_stub, ftrace_graph_entry
	jnz ftrace_graph_caller
#endif

.globl ftrace_stub
ftrace_stub:
	retq

trace:
	MCOUNT_SAVE_FRAME

	movq 0x38(%rsp), %rdi
	movq 8(%rbp), %rsi
	subq $MCOUNT_INSN_SIZE, %rdi

	call   *ftrace_trace_function

	MCOUNT_RESTORE_FRAME

	jmp ftrace_stub
END(mcount)
#endif /* CONFIG_DYNAMIC_FTRACE */
#endif /* CONFIG_FUNCTION_TRACER */

#ifdef CONFIG_FUNCTION_GRAPH_TRACER
ENTRY(ftrace_graph_caller)
	cmpl $0, function_trace_stop
	jne ftrace_stub

	MCOUNT_SAVE_FRAME

	leaq 8(%rbp), %rdi
	movq 0x38(%rsp), %rsi
	subq $MCOUNT_INSN_SIZE, %rsi

	call	prepare_ftrace_return

	MCOUNT_RESTORE_FRAME

	retq
END(ftrace_graph_caller)

.globl return_to_handler
return_to_handler:
	subq  $80, %rsp

	movq %rax, (%rsp)
	movq %rcx, 8(%rsp)
	movq %rdx, 16(%rsp)
	movq %rsi, 24(%rsp)
	movq %rdi, 32(%rsp)
	movq %r8, 40(%rsp)
	movq %r9, 48(%rsp)
	movq %r10, 56(%rsp)
	movq %r11, 64(%rsp)

	call ftrace_return_to_handler

	movq %rax, 72(%rsp)
	movq 64(%rsp), %r11
	movq 56(%rsp), %r10
	movq 48(%rsp), %r9
	movq 40(%rsp), %r8
	movq 32(%rsp), %rdi
	movq 24(%rsp), %rsi
	movq 16(%rsp), %rdx
	movq 8(%rsp), %rcx
	movq (%rsp), %rax
	addq $72, %rsp
	retq
#endif

#ifndef CONFIG_PREEMPT
#define retint_kernel retint_restore_args
#endif

#ifdef CONFIG_PARAVIRT
ENTRY(native_usergs_sysret64)
	swapgs
	sysretq
#endif /* CONFIG_PARAVIRT */


.macro TRACE_IRQS_IRETQ offset=ARGOFFSET
#ifdef CONFIG_TRACE_IRQFLAGS
	bt   $9,EFLAGS-\offset(%rsp)	/* interrupts off? */
	jnc  1f
	TRACE_IRQS_ON
1:
#endif
.endm

/*
 * C code is not supposed to know about undefined top of stack. Every time
 * a C function with a pt_regs argument is called from the SYSCALL based
 * fast path FIXUP_TOP_OF_STACK is needed.
 * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs
 * manipulation.
 */

	/* %rsp:at FRAMEEND */
	.macro FIXUP_TOP_OF_STACK tmp offset=0
	movq %gs:pda_oldrsp,\tmp
	movq \tmp,RSP+\offset(%rsp)
	movq $__USER_DS,SS+\offset(%rsp)
	movq $__USER_CS,CS+\offset(%rsp)
	movq $-1,RCX+\offset(%rsp)
	movq R11+\offset(%rsp),\tmp  /* get eflags */
	movq \tmp,EFLAGS+\offset(%rsp)
	.endm

	.macro RESTORE_TOP_OF_STACK tmp offset=0
	movq RSP+\offset(%rsp),\tmp
	movq \tmp,%gs:pda_oldrsp
	movq EFLAGS+\offset(%rsp),\tmp
	movq \tmp,R11+\offset(%rsp)
	.endm
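
/*
 * Typical pairing, mirroring the do_notify_resume call on the syscall
 * signal path further down in this file (illustration only, not code
 * assembled here; some_c_function is a placeholder for any C function
 * that takes a pt_regs pointer from the SYSCALL fast path):
 *
 *	SAVE_REST
 *	FIXUP_TOP_OF_STACK %r11
 *	movq %rsp,%rdi			# pt_regs pointer
 *	call some_c_function
 *	RESTORE_TOP_OF_STACK %r11
 *	RESTORE_REST
 */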

	.macro FAKE_STACK_FRAME child_rip
	/* push in order ss, rsp, eflags, cs, rip */
	xorl %eax, %eax
	pushq $__KERNEL_DS /* ss */
	CFI_ADJUST_CFA_OFFSET	8
	/*CFI_REL_OFFSET	ss,0*/
	pushq %rax /* rsp */
	CFI_ADJUST_CFA_OFFSET	8
	CFI_REL_OFFSET	rsp,0
	pushq $X86_EFLAGS_IF /* eflags - interrupts on */
	CFI_ADJUST_CFA_OFFSET	8
	/*CFI_REL_OFFSET	rflags,0*/
	pushq $__KERNEL_CS /* cs */
	CFI_ADJUST_CFA_OFFSET	8
	/*CFI_REL_OFFSET	cs,0*/
	pushq \child_rip /* rip */
	CFI_ADJUST_CFA_OFFSET	8
	CFI_REL_OFFSET	rip,0
	pushq %rax /* orig rax */
	CFI_ADJUST_CFA_OFFSET	8
	.endm

	.macro UNFAKE_STACK_FRAME
	addq $8*6, %rsp
	CFI_ADJUST_CFA_OFFSET	-(6*8)
	.endm

/*
 * initial frame state for interrupts (and exceptions without error code)
 */
	.macro EMPTY_FRAME start=1 offset=0
	.if \start
	CFI_STARTPROC simple
	CFI_SIGNAL_FRAME
	CFI_DEF_CFA rsp,8+\offset
	.else
	CFI_DEF_CFA_OFFSET 8+\offset
	.endif
	.endm

/*
 * initial frame state for interrupts (and exceptions without error code)
 */
	.macro INTR_FRAME start=1 offset=0
	EMPTY_FRAME \start, SS+8+\offset-RIP
	/*CFI_REL_OFFSET ss, SS+\offset-RIP*/
	CFI_REL_OFFSET rsp, RSP+\offset-RIP
	/*CFI_REL_OFFSET rflags, EFLAGS+\offset-RIP*/
	/*CFI_REL_OFFSET cs, CS+\offset-RIP*/
	CFI_REL_OFFSET rip, RIP+\offset-RIP
.endm

/*
 * initial frame state for exceptions with error code (and interrupts
 * with vector already pushed)
 */
	.macro XCPT_FRAME start=1 offset=0
	INTR_FRAME \start, RIP+\offset-ORIG_RAX
	/*CFI_REL_OFFSET orig_rax, ORIG_RAX-ORIG_RAX*/
	.endm

/*
 * frame that enables calling into C.
 */
	.macro PARTIAL_FRAME start=1 offset=0
	XCPT_FRAME \start, ORIG_RAX+\offset-ARGOFFSET
	CFI_REL_OFFSET rdi, RDI+\offset-ARGOFFSET
	CFI_REL_OFFSET rsi, RSI+\offset-ARGOFFSET
	CFI_REL_OFFSET rdx, RDX+\offset-ARGOFFSET
	CFI_REL_OFFSET rcx, RCX+\offset-ARGOFFSET
	CFI_REL_OFFSET rax, RAX+\offset-ARGOFFSET
	CFI_REL_OFFSET r8, R8+\offset-ARGOFFSET
	CFI_REL_OFFSET r9, R9+\offset-ARGOFFSET
	CFI_REL_OFFSET r10, R10+\offset-ARGOFFSET
	CFI_REL_OFFSET r11, R11+\offset-ARGOFFSET
	.endm

/*
 * frame that enables passing a complete pt_regs to a C function.
 */
	.macro DEFAULT_FRAME start=1 offset=0
	PARTIAL_FRAME \start, R11+\offset-R15
	CFI_REL_OFFSET rbx, RBX+\offset
	CFI_REL_OFFSET rbp, RBP+\offset
	CFI_REL_OFFSET r12, R12+\offset
	CFI_REL_OFFSET r13, R13+\offset
	CFI_REL_OFFSET r14, R14+\offset
	CFI_REL_OFFSET r15, R15+\offset
	.endm

/* save partial stack frame */
ENTRY(save_args)
	XCPT_FRAME
	cld
	movq_cfi rdi, RDI+16-ARGOFFSET
	movq_cfi rsi, RSI+16-ARGOFFSET
	movq_cfi rdx, RDX+16-ARGOFFSET
	movq_cfi rcx, RCX+16-ARGOFFSET
	movq_cfi rax, RAX+16-ARGOFFSET
	movq_cfi  r8,  R8+16-ARGOFFSET
	movq_cfi  r9,  R9+16-ARGOFFSET
	movq_cfi r10, R10+16-ARGOFFSET
	movq_cfi r11, R11+16-ARGOFFSET

	leaq -ARGOFFSET+16(%rsp),%rdi	/* arg1 for handler */
	movq_cfi rbp, 8		/* push %rbp */
	leaq 8(%rsp), %rbp		/* mov %rsp, %ebp */
	testl $3, CS(%rdi)
	je 1f
	SWAPGS
	/*
	 * irqcount is used to check if a CPU is already on an interrupt stack
	 * or not. While this is essentially redundant with preempt_count it is
	 * a little cheaper to use a separate counter in the PDA (short of
	 * moving irq_enter into assembly, which would be too much work)
	 */
1:	incl %gs:pda_irqcount
	jne 2f
	popq_cfi %rax			/* move return address... */
	mov %gs:pda_irqstackptr,%rsp
	EMPTY_FRAME 0
	pushq_cfi %rax			/* ... to the new stack */
	/*
	 * We entered an interrupt context - irqs are off:
	 */
2:	TRACE_IRQS_OFF
	ret
	CFI_ENDPROC
END(save_args)
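
/*
 * Callers are the interrupt and APIC stubs generated elsewhere in this
 * file. Roughly (illustration only; the exact offsets and vector
 * encoding come from the interrupt macros):
 *
 *	pushq $~(vector)		# common_interrupt instead does addq $-0x80,(%rsp)
 *	subq  $0x50, %rsp		# space for the registers saved above
 *	call  save_args
 *	call  do_IRQ			# %rdi already points at pt_regs
 *	jmp   ret_from_intr
 */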

ENTRY(save_rest)
	PARTIAL_FRAME 1 REST_SKIP+8
	movq 5*8+16(%rsp), %r11	/* save return address */
	movq_cfi rbx, RBX+16
	movq_cfi rbp, RBP+16
	movq_cfi r12, R12+16
	movq_cfi r13, R13+16
	movq_cfi r14, R14+16
	movq_cfi r15, R15+16
	movq %r11, 8(%rsp)	/* return address */
	FIXUP_TOP_OF_STACK %r11, 16
	ret
	CFI_ENDPROC
END(save_rest)

/* save complete stack frame */
ENTRY(save_paranoid)
	XCPT_FRAME 1 RDI+8
	cld
	movq_cfi rdi, RDI+8
	movq_cfi rsi, RSI+8
	movq_cfi rdx, RDX+8
	movq_cfi rcx, RCX+8
	movq_cfi rax, RAX+8
	movq_cfi r8, R8+8
	movq_cfi r9, R9+8
	movq_cfi r10, R10+8
	movq_cfi r11, R11+8
	movq_cfi rbx, RBX+8
	movq_cfi rbp, RBP+8
	movq_cfi r12, R12+8
	movq_cfi r13, R13+8
	movq_cfi r14, R14+8
	movq_cfi r15, R15+8
	movl $1,%ebx
	movl $MSR_GS_BASE,%ecx
	rdmsr
	testl %edx,%edx
	js 1f	/* negative -> in kernel */
	SWAPGS
	xorl %ebx,%ebx
1:	ret
	CFI_ENDPROC
END(save_paranoid)
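
/*
 * Note on the %ebx convention above: save_paranoid leaves %ebx == 1 when
 * the CPU was already on the kernel gs base (MSR_GS_BASE negative) and
 * %ebx == 0 when it had to SWAPGS. The matching paranoid exit path (not
 * part of this excerpt) is expected to test %ebx to decide whether a
 * SWAPGS is needed on the way out.
 */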

/*
 * A newly forked process directly context switches into this address.
 *
 * rdi: prev task we switched from
 */
ENTRY(ret_from_fork)
	DEFAULT_FRAME

	push kernel_eflags(%rip)
	CFI_ADJUST_CFA_OFFSET 8
	popf					# reset kernel eflags
	CFI_ADJUST_CFA_OFFSET -8

	call schedule_tail			# rdi: 'prev' task parameter

	GET_THREAD_INFO(%rcx)

	CFI_REMEMBER_STATE
	RESTORE_REST

	testl $3, CS-ARGOFFSET(%rsp)		# from kernel_thread?
	je   int_ret_from_sys_call

	testl $_TIF_IA32, TI_flags(%rcx)	# 32-bit compat task needs IRET
	jnz  int_ret_from_sys_call

	RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET
	jmp ret_from_sys_call			# go to the SYSRET fastpath

	CFI_RESTORE_STATE
	CFI_ENDPROC
END(ret_from_fork)

/*
 * System call entry. Up to 6 arguments in registers are supported.
 *
 * SYSCALL does not save anything on the stack and does not change the
 * stack pointer.
 */

/*
 * Register setup:
 * rax  system call number
 * rdi  arg0
 * rcx  return address for syscall/sysret, C arg3
 * rsi  arg1
 * rdx  arg2
 * r10  arg3	(--> moved to rcx for C)
 * r8   arg4
 * r9   arg5
 * r11  eflags for syscall/sysret, temporary for C
 * r12-r15, rbp, rbx saved by C code, not touched.
 *
 * Interrupts are off on entry.
 * Only called from user space.
 *
 * XXX	if we had a free scratch register we could save the RSP into the
 *	stack frame and report it properly in ps. Unfortunately we haven't.
 *
 * When the user can change the frame, always force IRET. That is because
 * it deals with uncanonical addresses better. SYSRET has trouble
 * with them due to bugs in both AMD and Intel CPUs.
 */
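
/*
 * For reference, a user-space caller following this convention (a sketch
 * only, not part of the entry code; the write(2) example assumes the
 * usual x86-64 Linux ABI where __NR_write == 1, and buf/len are
 * placeholders):
 *
 *	movl $1, %eax			# system call number (__NR_write)
 *	movl $1, %edi			# arg0: fd
 *	movq $buf, %rsi			# arg1: buffer
 *	movq $len, %rdx			# arg2: count
 *	syscall				# rcx/r11 clobbered by the CPU
 *
 * The return value comes back in %rax; %rcx and %r11 hold the saved RIP
 * and RFLAGS, which is why this code treats them specially.
 */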
ENTRY(system_call)
	CFI_STARTPROC	simple
	CFI_SIGNAL_FRAME
	CFI_DEF_CFA	rsp,PDA_STACKOFFSET
	CFI_REGISTER	rip,rcx
	/*CFI_REGISTER	rflags,r11*/
	SWAPGS_UNSAFE_STACK
	/*
	 * A hypervisor implementation might want to use a label
	 * after the swapgs, so that it can do the swapgs
	 * for the guest and jump here on syscall.
	 */
ENTRY(system_call_after_swapgs)

	movq	%rsp,%gs:pda_oldrsp
	movq	%gs:pda_kernelstack,%rsp
	/*
	 * No need to follow this irqs off/on section - it's straight
	 * and short:
	 */
	ENABLE_INTERRUPTS(CLBR_NONE)
	SAVE_ARGS 8,1
	movq  %rax,ORIG_RAX-ARGOFFSET(%rsp)
	movq  %rcx,RIP-ARGOFFSET(%rsp)
	CFI_REL_OFFSET rip,RIP-ARGOFFSET
	GET_THREAD_INFO(%rcx)
	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx)
	jnz tracesys
system_call_fastpath:
	cmpq $__NR_syscall_max,%rax
	ja badsys
	movq %r10,%rcx
	call *sys_call_table(,%rax,8)  # XXX:	 rip relative
	movq %rax,RAX-ARGOFFSET(%rsp)
/*
 * Syscall return path ending with SYSRET (fast path)
 * Has incomplete stack frame and undefined top of stack.
 */
ret_from_sys_call:
	movl $_TIF_ALLWORK_MASK,%edi
	/* edi:	flagmask */
sysret_check:
	LOCKDEP_SYS_EXIT
	GET_THREAD_INFO(%rcx)
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	movl TI_flags(%rcx),%edx
	andl %edi,%edx
	jnz  sysret_careful
	CFI_REMEMBER_STATE
	/*
	 * sysretq will re-enable interrupts:
	 */
	TRACE_IRQS_ON
	movq RIP-ARGOFFSET(%rsp),%rcx
	CFI_REGISTER rip,rcx
	RESTORE_ARGS 0,-ARG_SKIP,1
	/*CFI_REGISTER rflags,r11*/
	movq	%gs:pda_oldrsp, %rsp
	USERGS_SYSRET64

	CFI_RESTORE_STATE
	/* Handle reschedules */
	/* edx:	work, edi: workmask */
sysret_careful:
	bt $TIF_NEED_RESCHED,%edx
	jnc sysret_signal
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)
	pushq %rdi
	CFI_ADJUST_CFA_OFFSET 8
	call schedule
	popq  %rdi
	CFI_ADJUST_CFA_OFFSET -8
	jmp sysret_check

	/* Handle a signal */
sysret_signal:
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)
#ifdef CONFIG_AUDITSYSCALL
	bt $TIF_SYSCALL_AUDIT,%edx
	jc sysret_audit
#endif
	/* edx:	work flags (arg3) */
	leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
	xorl %esi,%esi # oldset -> arg2
	SAVE_REST
	FIXUP_TOP_OF_STACK %r11
	call do_notify_resume
	RESTORE_TOP_OF_STACK %r11
	RESTORE_REST
	movl $_TIF_WORK_MASK,%edi
	/* Use IRET because user could have changed frame. This
	   works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	jmp int_with_check

badsys:
	movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
	jmp ret_from_sys_call

#ifdef CONFIG_AUDITSYSCALL
	/*
	 * Fast path for syscall audit without full syscall trace.
	 * We just call audit_syscall_entry() directly, and then
	 * jump back to the normal fast path.
	 */
auditsys:
	movq %r10,%r9			/* 6th arg: 4th syscall arg */
	movq %rdx,%r8			/* 5th arg: 3rd syscall arg */
	movq %rsi,%rcx			/* 4th arg: 2nd syscall arg */
	movq %rdi,%rdx			/* 3rd arg: 1st syscall arg */
	movq %rax,%rsi			/* 2nd arg: syscall number */
	movl $AUDIT_ARCH_X86_64,%edi	/* 1st arg: audit arch */
	call audit_syscall_entry
	LOAD_ARGS 0		/* reload call-clobbered registers */
	jmp system_call_fastpath

	/*
	 * Return fast path for syscall audit.  Call audit_syscall_exit()
	 * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
	 * masked off.
	 */
sysret_audit:
	movq %rax,%rsi		/* second arg, syscall return value */
	cmpq $0,%rax		/* is it < 0? */
	setl %al		/* 1 if so, 0 if not */
	movzbl %al,%edi		/* zero-extend that into %edi */
	inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
	call audit_syscall_exit
	movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
	jmp sysret_check
#endif	/* CONFIG_AUDITSYSCALL */
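
/*
 * Worked example of the sysret_audit mapping above (for illustration):
 * a syscall that returned -EFAULT has %rax < 0, so setl gives %al = 1 and
 * the inc turns %edi into 2 (AUDITSC_FAILURE); a syscall that returned 3
 * gives %al = 0 and %edi = 1 (AUDITSC_SUCCESS).
 */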

	/* Do syscall tracing */
tracesys:
#ifdef CONFIG_AUDITSYSCALL
	testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
	jz auditsys
#endif
	SAVE_REST
	movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
	FIXUP_TOP_OF_STACK %rdi
	movq %rsp,%rdi
	call syscall_trace_enter
	/*
	 * Reload arg registers from stack in case ptrace changed them.
	 * We don't reload %rax because syscall_trace_enter() returned
	 * the value it wants us to use in the table lookup.
	 */
	LOAD_ARGS ARGOFFSET, 1
	RESTORE_REST
	cmpq $__NR_syscall_max,%rax
	ja   int_ret_from_sys_call	/* RAX(%rsp) set to -ENOSYS above */
	movq %r10,%rcx	/* fixup for C */
	call *sys_call_table(,%rax,8)
	movq %rax,RAX-ARGOFFSET(%rsp)
	/* Use IRET because user could have changed frame */

/*
 * Syscall return path ending with IRET.
 * Has correct top of stack, but partial stack frame.
 */
	.globl int_ret_from_sys_call
	.globl int_with_check
int_ret_from_sys_call:
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	testl $3,CS-ARGOFFSET(%rsp)
	je retint_restore_args
	movl $_TIF_ALLWORK_MASK,%edi
	/* edi:	mask to check */
int_with_check:
	LOCKDEP_SYS_EXIT_IRQ
	GET_THREAD_INFO(%rcx)
	movl TI_flags(%rcx),%edx
	andl %edi,%edx
	jnz   int_careful
	andl    $~TS_COMPAT,TI_status(%rcx)
	jmp   retint_swapgs

	/* Either reschedule or signal or syscall exit tracking needed. */
	/* First do a reschedule test. */
	/* edx:	work, edi: workmask */
int_careful:
	bt $TIF_NEED_RESCHED,%edx
	jnc  int_very_careful
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)
	pushq %rdi
	CFI_ADJUST_CFA_OFFSET 8
	call schedule
	popq %rdi
	CFI_ADJUST_CFA_OFFSET -8
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	jmp int_with_check
/* handle signals and tracing -- both require a full stack frame */
int_very_careful :
2006-07-03 00:24:45 -07:00
TRACE_ I R Q S _ O N
2008-01-30 13:32:08 +01:00
ENABLE_ I N T E R R U P T S ( C L B R _ N O N E )
2005-04-16 15:20:36 -07:00
SAVE_ R E S T
2008-11-16 15:29:00 +01:00
/* Check for syscall exit trace */
2008-07-09 02:38:07 -07:00
testl $ _ T I F _ W O R K _ S Y S C A L L _ E X I T ,% e d x
2005-04-16 15:20:36 -07:00
jz i n t _ s i g n a l
pushq % r d i
2005-09-12 18:49:24 +02:00
CFI_ A D J U S T _ C F A _ O F F S E T 8
2008-11-16 15:29:00 +01:00
leaq 8 ( % r s p ) ,% r d i # & p t r e g s - > a r g 1
2005-04-16 15:20:36 -07:00
call s y s c a l l _ t r a c e _ l e a v e
popq % r d i
2005-09-12 18:49:24 +02:00
CFI_ A D J U S T _ C F A _ O F F S E T - 8
2008-07-09 02:38:07 -07:00
andl $ ~ ( _ T I F _ W O R K _ S Y S C A L L _ E X I T | _ T I F _ S Y S C A L L _ E M U ) ,% e d i
2005-04-16 15:20:36 -07:00
jmp i n t _ r e s t o r e _ r e s t
2008-11-16 15:29:00 +01:00
2005-04-16 15:20:36 -07:00
int_signal :
2008-01-25 21:08:29 +01:00
testl $ _ T I F _ D O _ N O T I F Y _ M A S K ,% e d x
2005-04-16 15:20:36 -07:00
jz 1 f
movq % r s p ,% r d i # & p t r e g s - > a r g 1
xorl % e s i ,% e s i # o l d s e t - > a r g 2
call d o _ n o t i f y _ r e s u m e
1:	movl $_TIF_WORK_MASK,%edi
int_restore_rest:
	RESTORE_REST
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	jmp int_with_check
	CFI_ENDPROC
END(system_call)

/*
 * Certain special system calls need to save a complete, full stack frame.
 */
	.macro PTREGSCALL label,func,arg
ENTRY(\label)
	PARTIAL_FRAME 1 8		/* offset 8: return address */
	subq $REST_SKIP, %rsp
	CFI_ADJUST_CFA_OFFSET REST_SKIP
	call save_rest
	DEFAULT_FRAME 0 8		/* offset 8: return address */
	leaq 8(%rsp), \arg	/* pt_regs pointer */
	call \func
	jmp ptregscall_common
	CFI_ENDPROC
END(\label)
	.endm

PTREGSCALL stub_clone, sys_clone, %r8
PTREGSCALL stub_fork, sys_fork, %rdi
PTREGSCALL stub_vfork, sys_vfork, %rdi
PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx
PTREGSCALL stub_iopl, sys_iopl, %rsi
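
/*
 * For reference, a hand-derived sketch of what the first invocation above,
 * "PTREGSCALL stub_clone, sys_clone, %r8", expands to (the assembler
 * output is authoritative; inner comments shown as plain text):
 *
 *	ENTRY(stub_clone)
 *		PARTIAL_FRAME 1 8		offset 8: return address
 *		subq $REST_SKIP, %rsp
 *		CFI_ADJUST_CFA_OFFSET REST_SKIP
 *		call save_rest
 *		DEFAULT_FRAME 0 8		offset 8: return address
 *		leaq 8(%rsp), %r8		pt_regs pointer
 *		call sys_clone
 *		jmp ptregscall_common
 *		CFI_ENDPROC
 *	END(stub_clone)
 */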

ENTRY(ptregscall_common)
	DEFAULT_FRAME 1 8	/* offset 8: return address */
	RESTORE_TOP_OF_STACK %r11, 8
	movq_cfi_restore R15+8, r15
	movq_cfi_restore R14+8, r14
	movq_cfi_restore R13+8, r13
	movq_cfi_restore R12+8, r12
	movq_cfi_restore RBP+8, rbp
	movq_cfi_restore RBX+8, rbx
	ret $REST_SKIP		/* pop extended registers */
	CFI_ENDPROC
END(ptregscall_common)

ENTRY(stub_execve)
	CFI_STARTPROC
	popq %r11
	CFI_ADJUST_CFA_OFFSET -8
	CFI_REGISTER rip, r11
	SAVE_REST
	FIXUP_TOP_OF_STACK %r11
	movq %rsp, %rcx
	call sys_execve
	RESTORE_TOP_OF_STACK %r11
	movq %rax,RAX(%rsp)
	RESTORE_REST
	jmp int_ret_from_sys_call
	CFI_ENDPROC
END(stub_execve)

/*
 * sigreturn is special because it needs to restore all registers on return.
 * This cannot be done with SYSRET, so use the IRET return path instead.
 */
ENTRY(stub_rt_sigreturn)
	CFI_STARTPROC
	addq $8, %rsp
	CFI_ADJUST_CFA_OFFSET	-8
	SAVE_REST
	movq %rsp,%rdi
	FIXUP_TOP_OF_STACK %r11
	call sys_rt_sigreturn
	movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
	RESTORE_REST
	jmp int_ret_from_sys_call
	CFI_ENDPROC
END(stub_rt_sigreturn)

/*
 * Build the entry stubs and pointer table with some assembler magic.
 * We pack 7 stubs into a single 32-byte chunk, which will fit in a
 * single cache line on all modern x86 implementations.
 */
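/*
 * Size sketch (illustrative; exact encodings depend on the assembler):
 * each stub below is a two-byte "pushq $imm8" plus, for the first six
 * stubs of a chunk, a two-byte short "jmp 2f"; the seventh stub falls
 * through to the shared "2: jmp common_interrupt" (about five bytes),
 * so a full chunk is roughly 6*4 + 2 + 5 = 31 bytes and fits the
 * 32-byte alignment.
 */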
	.section .init.rodata,"a"
ENTRY(interrupt)
	.text
	.p2align 5
	.p2align CONFIG_X86_L1_CACHE_SHIFT
ENTRY(irq_entries_start)
	INTR_FRAME
vector=FIRST_EXTERNAL_VECTOR
.rept (NR_VECTORS-FIRST_EXTERNAL_VECTOR+6)/7
	.balign 32
  .rept	7
    .if vector < NR_VECTORS
      .if vector <> FIRST_EXTERNAL_VECTOR
	CFI_ADJUST_CFA_OFFSET -8
      .endif
1:	pushq $(~vector+0x80)	/* Note: always in signed byte range */
	CFI_ADJUST_CFA_OFFSET 8
      .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
	jmp 2f
      .endif
      .previous
	.quad 1b
      .text
vector=vector+1
    .endif
  .endr
2:	jmp common_interrupt
.endr
	CFI_ENDPROC
END(irq_entries_start)
.previous
END(interrupt)
.previous

/*
 * Interrupt entry/exit.
 *
 * Interrupt entry points save only callee clobbered registers in fast path.
 *
 * Entry runs with interrupts off.
 */

/* 0(%rsp): ~(interrupt number) */
	.macro interrupt func
	subq $10*8, %rsp
	CFI_ADJUST_CFA_OFFSET 10*8
	call save_args
	PARTIAL_FRAME 0
	call \func
	.endm

/*
 * The interrupt stubs push (~vector+0x80) onto the stack and
 * then jump to common_interrupt.
 */
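/*
 * Worked example (illustrative only): for vector 0x31 the stub pushes
 * ~0x31 + 0x80 = -0x32 + 0x80 = 0x4e, which fits in a signed byte, so
 * the push encodes in two bytes.  common_interrupt below then undoes
 * the bias with "addq $-0x80,(%rsp)", leaving 0x4e - 0x80 = -0x32 =
 * ~0x31 on the stack, i.e. the complemented vector in the [-256,-1]
 * range that the C handler expects.
 */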
	.p2align CONFIG_X86_L1_CACHE_SHIFT
common_interrupt:
	XCPT_FRAME
	addq $-0x80,(%rsp)		/* Adjust vector to [-256,-1] range */
	interrupt do_IRQ
	/* 0(%rsp): oldrsp-ARGOFFSET */
ret_from_intr:
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	decl %gs:pda_irqcount
	leaveq
	CFI_DEF_CFA_REGISTER	rsp
	CFI_ADJUST_CFA_OFFSET	-8
exit_intr:
	GET_THREAD_INFO(%rcx)
	testl $3,CS-ARGOFFSET(%rsp)
	je retint_kernel

	/* Interrupt came from user space */
	/*
	 * Has a correct top of stack, but a partial stack frame
	 * %rcx: thread info. Interrupts off.
	 */
retint_with_reschedule:
	movl $_TIF_WORK_MASK,%edi
retint_check:
	LOCKDEP_SYS_EXIT_IRQ
	movl TI_flags(%rcx),%edx
	andl %edi,%edx
	CFI_REMEMBER_STATE
	jnz  retint_careful
retint_swapgs:		/* return to user-space */
	/*
	 * The iretq could re-enable interrupts:
	 */
	DISABLE_INTERRUPTS(CLBR_ANY)
	TRACE_IRQS_IRETQ
	SWAPGS
	jmp restore_args

retint_restore_args:	/* return to kernel space */
	DISABLE_INTERRUPTS(CLBR_ANY)
	/*
	 * The iretq could re-enable interrupts:
	 */
	TRACE_IRQS_IRETQ
restore_args:
	RESTORE_ARGS 0,8,0

irq_return:
	INTERRUPT_RETURN
	.section __ex_table, "a"
	.quad irq_return, bad_iret
	.previous

#ifdef CONFIG_PARAVIRT
ENTRY(native_iret)
	iretq
	.section __ex_table,"a"
	.quad native_iret, bad_iret
	.previous
#endif

	.section .fixup,"ax"
bad_iret:
	/*
	 * The iret traps when the %cs or %ss being restored is bogus.
	 * We've lost the original trap vector and error code.
	 * #GPF is the most likely one to get for an invalid selector.
	 * So pretend we completed the iret and took the #GPF in user mode.
	 *
	 * We are now running with the kernel GS after exception recovery.
	 * But error_entry expects us to have user GS to match the user %cs,
	 * so swap back.
	 */
	pushq $0
	SWAPGS
	jmp general_protection
	.previous

	/* edi: workmask, edx: work */
retint_careful:
	CFI_RESTORE_STATE
	bt    $TIF_NEED_RESCHED,%edx
	jnc   retint_signal
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)
	pushq %rdi
	CFI_ADJUST_CFA_OFFSET	8
	call  schedule
	popq %rdi
	CFI_ADJUST_CFA_OFFSET	-8
	GET_THREAD_INFO(%rcx)
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	jmp retint_check

retint_signal:
	testl $_TIF_DO_NOTIFY_MASK,%edx
	jz    retint_swapgs
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)
	SAVE_REST
	movq $-1,ORIG_RAX(%rsp)
	xorl %esi,%esi		# oldset
	movq %rsp,%rdi		# &pt_regs
	call do_notify_resume
	RESTORE_REST
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	GET_THREAD_INFO(%rcx)
	jmp retint_with_reschedule

#ifdef CONFIG_PREEMPT
	/* Returning to kernel space. Check if we need preemption */
	/* rcx:	 threadinfo. interrupts off. */
ENTRY(retint_kernel)
	cmpl $0,TI_preempt_count(%rcx)
	jnz  retint_restore_args
	bt   $TIF_NEED_RESCHED,TI_flags(%rcx)
	jnc  retint_restore_args
	bt   $9,EFLAGS-ARGOFFSET(%rsp)	/* interrupts off? */
	jnc  retint_restore_args
	call preempt_schedule_irq
	jmp exit_intr
#endif

	CFI_ENDPROC
END(common_interrupt)

/*
 * APIC interrupts.
 */
.macro apicinterrupt num sym do_sym
ENTRY(\sym)
	INTR_FRAME
	pushq $~(\num)
	CFI_ADJUST_CFA_OFFSET 8
	interrupt \do_sym
	jmp ret_from_intr
	CFI_ENDPROC
END(\sym)
.endm
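
/*
 * For reference, a hand-derived sketch of what an invocation such as
 * "apicinterrupt THERMAL_APIC_VECTOR thermal_interrupt smp_thermal_interrupt"
 * (used below) expands to:
 *
 *	ENTRY(thermal_interrupt)
 *		INTR_FRAME
 *		pushq $~(THERMAL_APIC_VECTOR)
 *		CFI_ADJUST_CFA_OFFSET 8
 *		interrupt smp_thermal_interrupt
 *		jmp ret_from_intr
 *		CFI_ENDPROC
 *	END(thermal_interrupt)
 */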

#ifdef CONFIG_SMP
apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \
	irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt
#endif

apicinterrupt UV_BAU_MESSAGE \
	uv_bau_message_intr1 uv_bau_message_interrupt
apicinterrupt LOCAL_TIMER_VECTOR \
	apic_timer_interrupt smp_apic_timer_interrupt

#ifdef CONFIG_SMP
apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \
	invalidate_interrupt0 smp_invalidate_interrupt
apicinterrupt INVALIDATE_TLB_VECTOR_START+1 \
	invalidate_interrupt1 smp_invalidate_interrupt
apicinterrupt INVALIDATE_TLB_VECTOR_START+2 \
	invalidate_interrupt2 smp_invalidate_interrupt
apicinterrupt INVALIDATE_TLB_VECTOR_START+3 \
	invalidate_interrupt3 smp_invalidate_interrupt
apicinterrupt INVALIDATE_TLB_VECTOR_START+4 \
	invalidate_interrupt4 smp_invalidate_interrupt
apicinterrupt INVALIDATE_TLB_VECTOR_START+5 \
	invalidate_interrupt5 smp_invalidate_interrupt
apicinterrupt INVALIDATE_TLB_VECTOR_START+6 \
	invalidate_interrupt6 smp_invalidate_interrupt
apicinterrupt INVALIDATE_TLB_VECTOR_START+7 \
	invalidate_interrupt7 smp_invalidate_interrupt
#endif

apicinterrupt THRESHOLD_APIC_VECTOR \
	threshold_interrupt mce_threshold_interrupt
apicinterrupt THERMAL_APIC_VECTOR \
	thermal_interrupt smp_thermal_interrupt

#ifdef CONFIG_SMP
apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \
	call_function_single_interrupt smp_call_function_single_interrupt
apicinterrupt CALL_FUNCTION_VECTOR \
	call_function_interrupt smp_call_function_interrupt
apicinterrupt RESCHEDULE_VECTOR \
	reschedule_interrupt smp_reschedule_interrupt
#endif

apicinterrupt ERROR_APIC_VECTOR \
	error_interrupt smp_error_interrupt
apicinterrupt SPURIOUS_APIC_VECTOR \
	spurious_interrupt smp_spurious_interrupt

/*
 * Exception entry points.
 */
.macro zeroentry sym do_sym
ENTRY(\sym)
	INTR_FRAME
	PARAVIRT_ADJUST_EXCEPTION_FRAME
	pushq_cfi $-1		/* ORIG_RAX: no syscall to restart */
	subq $15*8,%rsp
	CFI_ADJUST_CFA_OFFSET 15*8
	call error_entry
	DEFAULT_FRAME 0
	movq %rsp,%rdi		/* pt_regs pointer */
	xorl %esi,%esi		/* no error code */
	call \do_sym
	jmp error_exit		/* %ebx: no swapgs flag */
	CFI_ENDPROC
END(\sym)
.endm
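
/*
 * For reference, a hand-derived sketch of what "zeroentry overflow
 * do_overflow" (used further down) expands to (inner comments shown
 * as plain text):
 *
 *	ENTRY(overflow)
 *		INTR_FRAME
 *		PARAVIRT_ADJUST_EXCEPTION_FRAME
 *		pushq_cfi $-1			ORIG_RAX: no syscall to restart
 *		subq $15*8,%rsp
 *		CFI_ADJUST_CFA_OFFSET 15*8
 *		call error_entry
 *		DEFAULT_FRAME 0
 *		movq %rsp,%rdi			pt_regs pointer
 *		xorl %esi,%esi			no error code
 *		call do_overflow
 *		jmp error_exit
 *		CFI_ENDPROC
 *	END(overflow)
 */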

.macro paranoidzeroentry sym do_sym
ENTRY(\sym)
	INTR_FRAME
	PARAVIRT_ADJUST_EXCEPTION_FRAME
	pushq $-1		/* ORIG_RAX: no syscall to restart */
	CFI_ADJUST_CFA_OFFSET 8
	subq $15*8, %rsp
	call save_paranoid
	TRACE_IRQS_OFF
	movq %rsp,%rdi		/* pt_regs pointer */
	xorl %esi,%esi		/* no error code */
	call \do_sym
	jmp paranoid_exit	/* %ebx: no swapgs flag */
	CFI_ENDPROC
END(\sym)
.endm

.macro paranoidzeroentry_ist sym do_sym ist
ENTRY(\sym)
	INTR_FRAME
	PARAVIRT_ADJUST_EXCEPTION_FRAME
	pushq $-1		/* ORIG_RAX: no syscall to restart */
	CFI_ADJUST_CFA_OFFSET 8
	subq $15*8, %rsp
	call save_paranoid
	TRACE_IRQS_OFF
	movq %rsp,%rdi		/* pt_regs pointer */
	xorl %esi,%esi		/* no error code */
	movq %gs:pda_data_offset, %rbp
	subq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
	call \do_sym
	addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
	jmp paranoid_exit	/* %ebx: no swapgs flag */
	CFI_ENDPROC
END(\sym)
.endm
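
/*
 * The subq/addq pair around "call \do_sym" in paranoidzeroentry_ist moves
 * the TSS IST entry for this vector down by EXCEPTION_STKSZ while the
 * handler runs, so that a nested exception on the same IST vector (a
 * hypothetical "paranoidzeroentry_ist debug do_debug DEBUG_STACK" user,
 * for example) starts below the frame we are still using instead of
 * being reset on top of it.
 */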

.macro errorentry sym do_sym
ENTRY(\sym)
	XCPT_FRAME
	PARAVIRT_ADJUST_EXCEPTION_FRAME
	subq $15*8,%rsp
	CFI_ADJUST_CFA_OFFSET 15*8
	call error_entry
	DEFAULT_FRAME 0
	movq %rsp,%rdi			/* pt_regs pointer */
	movq ORIG_RAX(%rsp),%rsi	/* get error code */
	movq $-1,ORIG_RAX(%rsp)		/* no syscall to restart */
	call \do_sym
	jmp error_exit			/* %ebx: no swapgs flag */
	CFI_ENDPROC
END(\sym)
.endm
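
/*
 * For reference, a hand-derived sketch of what "errorentry segment_not_present
 * do_segment_not_present" (used further down) expands to; unlike zeroentry it
 * passes the hardware error code to the handler and then overwrites the slot
 * with -1 (inner comments shown as plain text):
 *
 *	ENTRY(segment_not_present)
 *		XCPT_FRAME
 *		PARAVIRT_ADJUST_EXCEPTION_FRAME
 *		subq $15*8,%rsp
 *		CFI_ADJUST_CFA_OFFSET 15*8
 *		call error_entry
 *		DEFAULT_FRAME 0
 *		movq %rsp,%rdi			pt_regs pointer
 *		movq ORIG_RAX(%rsp),%rsi	get error code
 *		movq $-1,ORIG_RAX(%rsp)		no syscall to restart
 *		call do_segment_not_present
 *		jmp error_exit
 *		CFI_ENDPROC
 *	END(segment_not_present)
 */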

	/* error code is on the stack already */
.macro paranoiderrorentry sym do_sym
ENTRY(\sym)
	XCPT_FRAME
	PARAVIRT_ADJUST_EXCEPTION_FRAME
	subq $15*8,%rsp
	CFI_ADJUST_CFA_OFFSET 15*8
	call save_paranoid
	DEFAULT_FRAME 0
	TRACE_IRQS_OFF
	movq %rsp,%rdi			/* pt_regs pointer */
	movq ORIG_RAX(%rsp),%rsi	/* get error code */
	movq $-1,ORIG_RAX(%rsp)		/* no syscall to restart */
	call \do_sym
	jmp paranoid_exit		/* %ebx: no swapgs flag */
	CFI_ENDPROC
END(\sym)
.endm

zeroentry divide_error do_divide_error
zeroentry overflow do_overflow
zeroentry bounds do_bounds
zeroentry invalid_op do_invalid_op
zeroentry device_not_available do_device_not_available
paranoiderrorentry double_fault do_double_fault
zeroentry coprocessor_segment_overrun do_coprocessor_segment_overrun
errorentry invalid_TSS do_invalid_TSS
errorentry segment_not_present do_segment_not_present
zeroentry spurious_interrupt_bug do_spurious_interrupt_bug
zeroentry coprocessor_error do_coprocessor_error
errorentry alignment_check do_alignment_check
zeroentry simd_coprocessor_error do_simd_coprocessor_error
2006-07-03 00:24:45 -07:00
2008-11-27 21:10:08 +03:00
/* Reload gs selector with exception handling */
/* edi: new selector */
2008-06-25 00:19:32 -04:00
ENTRY(native_load_gs_index)
2005-09-12 18:49:24 +02:00
CFI_STARTPROC
2005-04-16 15:20:36 -07:00
pushf
2005-09-12 18:49:24 +02:00
CFI_ADJUST_CFA_OFFSET 8
2008-01-30 13:32:08 +01:00
DISABLE_INTERRUPTS(CLBR_ANY | ~(CLBR_RDI))
2008-11-27 21:10:08 +03:00
SWAPGS
2008-11-16 15:29:00 +01:00
gs_change:
2008-11-27 21:10:08 +03:00
movl %edi,%gs
2005-04-16 15:20:36 -07:00
2: mfence /* workaround */
2008-01-30 13:32:08 +01:00
SWAPGS
2008-11-27 21:10:08 +03:00
popf
2005-09-12 18:49:24 +02:00
CFI_ADJUST_CFA_OFFSET -8
2008-11-27 21:10:08 +03:00
ret
2005-09-12 18:49:24 +02:00
CFI_ENDPROC
2008-11-23 10:15:32 +01:00
END(native_load_gs_index)
2008-11-16 15:29:00 +01:00
2008-11-27 21:10:08 +03:00
.section __ex_table,"a"
.align 8
.quad gs_change,bad_gs
.previous
.section .fixup,"ax"
2005-04-16 15:20:36 -07:00
/* running with kernelgs */
2008-11-16 15:29:00 +01:00
bad_gs:
2008-01-30 13:32:08 +01:00
SWAPGS /* switch back to user gs */
2005-04-16 15:20:36 -07:00
xorl %eax,%eax
2008-11-27 21:10:08 +03:00
movl %eax,%gs
jmp 2b
.previous
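A hedged sketch (kernel context assumed, not taken from this file) of how C code reaches the helper above: load_gs_index() resolves to native_load_gs_index() on non-paravirt kernels, and if the new selector faults, the __ex_table entry for gs_change lands in bad_gs, which zeroes %gs and resumes at label 2. The wrapper name below is hypothetical and used only for illustration:

static inline void set_task_user_gs(unsigned int gsindex)
{
	/* e.g. when switching to a task's saved gs selector */
	load_gs_index(gsindex);
}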
2008-11-16 15:29:00 +01:00
2005-04-16 15:20:36 -07:00
/*
 * Create a kernel thread.
 *
 * C extern interface:
 * extern long kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
 *
 * asm input arguments:
 * rdi: fn, rsi: arg, rdx: flags
 */
ENTRY(kernel_thread)
CFI_STARTPROC
FAKE_STACK_FRAME $child_rip
SAVE_ALL
# rdi: flags, rsi: usp, rdx: will be &pt_regs
movq %rdx,%rdi
orq kernel_thread_flags(%rip),%rdi
movq $-1,%rsi
movq %rsp,%rdx
xorl %r8d,%r8d
xorl %r9d,%r9d
2008-11-16 15:29:00 +01:00
2005-04-16 15:20:36 -07:00
# clone now
call do_fork
movq %rax,RAX(%rsp)
xorl %edi,%edi
/*
 * It isn't worth checking for a reschedule here,
 * so internally to the x86_64 port you can rely on kernel_thread()
 * not to reschedule the child before returning; this avoids the need
 * for hacks, for example to fork off the per-CPU idle tasks.
2008-11-27 21:10:08 +03:00
 * [Hopefully no generic code relies on the reschedule -AK]
2005-04-16 15:20:36 -07:00
 */
RESTORE_ALL
UNFAKE_STACK_FRAME
ret
CFI_ENDPROC
2008-11-23 10:15:32 +01:00
END(kernel_thread)
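A hedged C-side sketch (kernel context assumed) of how the kernel_thread() interface documented above is typically used; the worker function, wrapper name and clone flags are illustrative only:

/* Hypothetical caller of kernel_thread(), for illustration. */
static int my_worker(void *arg)
{
	/* runs in a new kernel thread; the return value becomes its exit code */
	return 0;
}

static void start_my_worker(void)
{
	/* the flags passed here are ORed with kernel_thread_flags by the stub above */
	kernel_thread(my_worker, NULL, CLONE_FS | CLONE_FILES);
}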
2008-11-16 15:29:00 +01:00
2008-11-26 22:17:00 +03:00
ENTRY(child_rip)
2006-08-30 19:37:08 +02:00
pushq $0 # fake return address
CFI_STARTPROC
2005-04-16 15:20:36 -07:00
/*
 * Here we are in the child and the registers are set as they were
 * at kernel_thread() invocation in the parent.
 */
movq %rdi, %rax
movq %rsi, %rdi
call *%rax
# exit
2007-10-17 18:04:33 +02:00
mov %eax, %edi
2005-04-16 15:20:36 -07:00
call do_exit
2008-11-23 22:47:10 +08:00
ud2 # padding for call trace
2006-08-30 19:37:08 +02:00
CFI_ENDPROC
2008-11-23 10:15:32 +01:00
END(child_rip)
2005-04-16 15:20:36 -07:00
/*
 * execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
 *
 * C extern interface:
 * extern long execve(char *name, char **argv, char **envp)
 *
 * asm input arguments:
 * rdi: name, rsi: argv, rdx: envp
 *
 * We want to fall back into:
2008-02-26 12:55:57 +01:00
 * extern long sys_execve(char *name, char **argv, char **envp, struct pt_regs *regs)
2005-04-16 15:20:36 -07:00
 *
 * do_sys_execve asm fallback arguments:
2008-02-26 12:55:57 +01:00
 * rdi: name, rsi: argv, rdx: envp, rcx: fake frame on the stack
2005-04-16 15:20:36 -07:00
 */
2006-10-02 02:18:31 -07:00
ENTRY(kernel_execve)
2005-04-16 15:20:36 -07:00
CFI_STARTPROC
FAKE_STACK_FRAME $0
2008-11-16 15:29:00 +01:00
SAVE_ALL
2008-02-26 12:55:57 +01:00
movq %rsp,%rcx
2005-04-16 15:20:36 -07:00
call sys_execve
2008-11-16 15:29:00 +01:00
movq %rax, RAX(%rsp)
2005-04-16 15:20:36 -07:00
RESTORE_REST
testq %rax,%rax
je int_ret_from_sys_call
RESTORE_ARGS
UNFAKE_STACK_FRAME
ret
CFI_ENDPROC
2008-11-23 10:15:32 +01:00
END(kernel_execve)
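A hedged sketch (kernel context assumed) of the classic kind of kernel_execve() caller, modelled on the early init code that execs the first userspace program; the wrapper name and argument strings are hypothetical. On success kernel_execve() does not return to its caller (it leaves through int_ret_from_sys_call), so only the failure value is seen here:

static int run_first_process(const char *filename)
{
	static char *argv[] = { (char *)"init", NULL };
	static char *envp[] = { (char *)"HOME=/", (char *)"TERM=linux", NULL };

	/* returns only on failure; on success we enter userspace via IRET */
	return kernel_execve(filename, argv, envp);
}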
2005-04-16 15:20:36 -07:00
2006-08-02 22:37:28 +02:00
/* Call softirq on interrupt stack. Interrupts are off. */
2005-07-28 21:15:49 -07:00
ENTRY(call_softirq)
2005-09-12 18:49:24 +02:00
CFI_STARTPROC
2006-08-02 22:37:28 +02:00
push %rbp
CFI_ADJUST_CFA_OFFSET 8
CFI_REL_OFFSET rbp,0
mov %rsp,%rbp
CFI_DEF_CFA_REGISTER rbp
2005-07-28 21:15:49 -07:00
incl %gs:pda_irqcount
2006-08-02 22:37:28 +02:00
cmove %gs:pda_irqstackptr,%rsp
push %rbp # backlink for old unwinder
2005-07-28 21:15:49 -07:00
call __do_softirq
2006-08-02 22:37:28 +02:00
leaveq
2005-09-12 18:49:24 +02:00
CFI_DEF_CFA_REGISTER rsp
2006-08-02 22:37:28 +02:00
CFI_ADJUST_CFA_OFFSET -8
2005-07-28 21:15:49 -07:00
decl %gs:pda_irqcount
ret
2005-09-12 18:49:24 +02:00
CFI_ENDPROC
2008-11-23 10:15:32 +01:00
END(call_softirq)
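A hedged sketch of the C side that invokes the helper above, loosely modelled on this era's arch irq code rather than taken from this file: do_softirq() disables interrupts and then calls call_softirq() so that __do_softirq() runs on the per-cpu irq stack:

asmlinkage void call_softirq(void);

asmlinkage void do_softirq(void)
{
	unsigned long flags;

	if (in_interrupt())
		return;

	local_irq_save(flags);
	if (local_softirq_pending())
		call_softirq();	/* switches to the irq stack, runs __do_softirq() */
	local_irq_restore(flags);
}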
2007-06-23 02:29:25 +02:00
2008-07-08 15:06:49 -07:00
#ifdef CONFIG_XEN
2008-11-23 10:08:28 +01:00
zeroentry xen_hypervisor_callback xen_do_hypervisor_callback
2008-07-08 15:06:49 -07:00
/*
2008-11-27 21:10:08 +03:00
 * A note on the "critical region" in our callback handler.
 * We want to avoid stacking callback handlers due to events occurring
 * during handling of the last event. To do this, we keep events disabled
 * until we've done all processing. HOWEVER, we must enable events before
 * popping the stack frame (can't be done atomically) and so it would still
 * be possible to get enough handler activations to overflow the stack.
 * Although unlikely, bugs of that kind are hard to track down, so we'd
 * like to avoid the possibility.
 * So, on entry to the handler we detect whether we interrupted an
 * existing activation in its critical region -- if so, we pop the current
 * activation and restart the handler using the previous one.
 */
2008-07-08 15:06:49 -07:00
ENTRY(xen_do_hypervisor_callback)   # do_hypervisor_callback(struct *pt_regs)
CFI_STARTPROC
2008-11-27 21:10:08 +03:00
/*
 * Since we don't modify %rdi, xen_evtchn_do_upcall(struct *pt_regs) will
 * see the correct pointer to the pt_regs
 */
2008-07-08 15:06:49 -07:00
movq %rdi, %rsp # we don't return, adjust the stack frame
CFI_ENDPROC
2008-11-20 14:40:11 +01:00
DEFAULT_FRAME
2008-07-08 15:06:49 -07:00
11: incl %gs:pda_irqcount
movq %rsp,%rbp
CFI_DEF_CFA_REGISTER rbp
cmovzq %gs:pda_irqstackptr,%rsp
pushq %rbp # backlink for old unwinder
call xen_evtchn_do_upcall
popq %rsp
CFI_DEF_CFA_REGISTER rsp
decl %gs:pda_irqcount
jmp error_exit
CFI_ENDPROC
END(do_hypervisor_callback)
/*
2008-11-27 21:10:08 +03:00
 * Hypervisor uses this for application faults while it executes.
 * We get here for two reasons:
 *  1. Fault while reloading DS, ES, FS or GS
 *  2. Fault while executing IRET
 * Category 1 we do not need to fix up as Xen has already reloaded all segment
 * registers that could be reloaded and zeroed the others.
 * Category 2 we fix up by killing the current process. We cannot use the
 * normal Linux return path in this case because if we use the IRET hypercall
 * to pop the stack frame we end up in an infinite loop of failsafe callbacks.
 * We distinguish between categories by comparing each saved segment register
 * with its current contents: any discrepancy means we are in category 1.
 */
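The comparison performed by xen_failsafe_callback below can be summarized in a small, self-contained C sketch; the frame layout and the stand-in variables for the live segment registers are hypothetical and used only for illustration:

/* Hypothetical frame layout and stand-ins, for illustration only. */
struct failsafe_frame {
	unsigned short saved_ds, saved_es, saved_fs, saved_gs;
};

static unsigned short cur_ds, cur_es, cur_fs, cur_gs;	/* stand-ins for the live registers */

static int is_bad_segment_fault(const struct failsafe_frame *f)
{
	/*
	 * Any mismatch between a saved selector and the live one means Xen
	 * reloaded/zeroed a segment under us => category 1 (bad segment),
	 * so the IRET is simply retried.  If everything matches, the IRET
	 * itself faulted => category 2, and we forward to general_protection.
	 */
	return f->saved_ds != cur_ds || f->saved_es != cur_es ||
	       f->saved_fs != cur_fs || f->saved_gs != cur_gs;
}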
2008-07-08 15:06:49 -07:00
ENTRY(xen_failsafe_callback)
2008-11-20 14:40:11 +01:00
INTR_FRAME 1 (6*8)
/*CFI_REL_OFFSET gs,GS*/
/*CFI_REL_OFFSET fs,FS*/
/*CFI_REL_OFFSET es,ES*/
/*CFI_REL_OFFSET ds,DS*/
CFI_REL_OFFSET r11,8
CFI_REL_OFFSET rcx,0
2008-07-08 15:06:49 -07:00
movw %ds,%cx
cmpw %cx,0x10(%rsp)
CFI_REMEMBER_STATE
jne 1f
movw %es,%cx
cmpw %cx,0x18(%rsp)
jne 1f
movw %fs,%cx
cmpw %cx,0x20(%rsp)
jne 1f
movw %gs,%cx
cmpw %cx,0x28(%rsp)
jne 1f
/* All segments match their saved values => Category 2 (Bad IRET). */
movq (%rsp),%rcx
CFI_RESTORE rcx
movq 8(%rsp),%r11
CFI_RESTORE r11
addq $0x30,%rsp
CFI_ADJUST_CFA_OFFSET -0x30
2008-11-21 15:20:47 +01:00
pushq_cfi $0 /* RIP */
pushq_cfi %r11
pushq_cfi %rcx
2008-07-08 15:07:09 -07:00
jmp general_protection
2008-07-08 15:06:49 -07:00
CFI_RESTORE_STATE
1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
movq (%rsp),%rcx
CFI_RESTORE rcx
movq 8(%rsp),%r11
CFI_RESTORE r11
addq $0x30,%rsp
CFI_ADJUST_CFA_OFFSET -0x30
2008-11-21 15:20:47 +01:00
pushq_cfi $0
2008-07-08 15:06:49 -07:00
SAVE_ALL
jmp error_exit
CFI_ENDPROC
END(xen_failsafe_callback)
#endif /* CONFIG_XEN */
2008-11-24 13:24:28 +01:00
/*
 * Some functions should be protected against kprobes
 */
.pushsection .kprobes.text, "ax"
paranoidzeroentry_ist debug do_debug DEBUG_STACK
paranoidzeroentry_ist int3 do_int3 DEBUG_STACK
paranoiderrorentry stack_segment do_stack_segment
errorentry general_protection do_general_protection
errorentry page_fault do_page_fault
#ifdef CONFIG_X86_MCE
paranoidzeroentry machine_check do_machine_check
#endif
/*
2008-11-27 21:10:08 +03:00
 * "Paranoid" exit path from exception stack.
 * Paranoid because this is used by NMIs and cannot take
2008-11-24 13:24:28 +01:00
 * any kernel state for granted.
 * We don't do kernel preemption checks here, because only
 * NMI should be common and it does not enable IRQs and
 * cannot get reschedule ticks.
 *
 * "trace" is 0 for the NMI handler only, because irq-tracing
 * is fundamentally NMI-unsafe. (we cannot change the soft and
 * hard flags at once, atomically)
 */
/* ebx: no swapgs flag */
ENTRY(paranoid_exit)
INTR_FRAME
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
testl %ebx,%ebx /* swapgs needed? */
jnz paranoid_restore
testl $3,CS(%rsp)
jnz paranoid_userspace
paranoid_swapgs:
TRACE_IRQS_IRETQ 0
SWAPGS_UNSAFE_STACK
paranoid_restore:
RESTORE_ALL 8
jmp irq_return
paranoid_userspace:
GET_THREAD_INFO(%rcx)
movl TI_flags(%rcx),%ebx
andl $_TIF_WORK_MASK,%ebx
jz paranoid_swapgs
movq %rsp,%rdi /* &pt_regs */
call sync_regs
movq %rax,%rsp /* switch stack for scheduling */
testl $_TIF_NEED_RESCHED,%ebx
jnz paranoid_schedule
movl %ebx,%edx /* arg3: thread flags */
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
xorl %esi,%esi /* arg2: oldset */
movq %rsp,%rdi /* arg1: &pt_regs */
call do_notify_resume
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
jmp paranoid_userspace
paranoid_schedule:
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_ANY)
call schedule
DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF
jmp paranoid_userspace
CFI_ENDPROC
END(paranoid_exit)
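A hedged, C-like rendering (kernel context assumed, sync_regs stack switch and irq tracing omitted) of the work loop formed by paranoid_userspace/paranoid_schedule above; the function name is hypothetical:

static void paranoid_user_work(struct pt_regs *regs)
{
	for (;;) {
		u32 work = current_thread_info()->flags & _TIF_WORK_MASK;

		if (!work)
			return;		/* nothing pending: swapgs + iret path */

		local_irq_enable();
		if (work & _TIF_NEED_RESCHED)
			schedule();
		else
			do_notify_resume(regs, NULL, work);	/* signals etc. */
		local_irq_disable();
	}
}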
/*
 * Exception entry point. This expects an error code/orig_rax on the stack.
 * It returns the "no swapgs flag" in %ebx.
 */
ENTRY(error_entry)
XCPT_FRAME
CFI_ADJUST_CFA_OFFSET 15*8
/* oldrax contains error code */
cld
movq_cfi rdi, RDI+8
movq_cfi rsi, RSI+8
movq_cfi rdx, RDX+8
movq_cfi rcx, RCX+8
movq_cfi rax, RAX+8
movq_cfi r8, R8+8
movq_cfi r9, R9+8
movq_cfi r10, R10+8
movq_cfi r11, R11+8
movq_cfi rbx, RBX+8
movq_cfi rbp, RBP+8
movq_cfi r12, R12+8
movq_cfi r13, R13+8
movq_cfi r14, R14+8
movq_cfi r15, R15+8
xorl %ebx,%ebx
testl $3,CS+8(%rsp)
je error_kernelspace
error_swapgs:
SWAPGS
error_sti:
TRACE_IRQS_OFF
ret
CFI_ENDPROC
/*
 * There are two places in the kernel that can potentially fault with
 * usergs. Handle them here. The exception handlers after iret run with
 * kernel gs again, so don't set the user space flag. B stepping K8s
 * sometimes report a truncated RIP for IRET exceptions returning to
 * compat mode. Check for these here too.
 */
error_kernelspace:
incl %ebx
leaq irq_return(%rip),%rcx
cmpq %rcx,RIP+8(%rsp)
je error_swapgs
movl %ecx,%ecx /* zero extend */
cmpq %rcx,RIP+8(%rsp)
je error_swapgs
cmpq $gs_change,RIP+8(%rsp)
2008-11-27 21:10:08 +03:00
je error_swapgs
2008-11-24 13:24:28 +01:00
jmp error_sti
END(error_entry)
/* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */
ENTRY(error_exit)
DEFAULT_FRAME
movl %ebx,%eax
RESTORE_REST
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
GET_THREAD_INFO(%rcx)
testl %eax,%eax
jne retint_kernel
LOCKDEP_SYS_EXIT_IRQ
movl TI_flags(%rcx),%edx
movl $_TIF_WORK_MASK,%edi
andl %edi,%edx
jnz retint_careful
jmp retint_swapgs
CFI_ENDPROC
END(error_exit)
/* runs on exception stack */
ENTRY(nmi)
INTR_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
pushq_cfi $-1
subq $15*8, %rsp
CFI_ADJUST_CFA_OFFSET 15*8
call save_paranoid
DEFAULT_FRAME 0
/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
movq %rsp,%rdi
movq $-1,%rsi
call do_nmi
#ifdef CONFIG_TRACE_IRQFLAGS
/* paranoidexit; without TRACE_IRQS_OFF */
/* ebx: no swapgs flag */
DISABLE_INTERRUPTS(CLBR_NONE)
testl %ebx,%ebx /* swapgs needed? */
jnz nmi_restore
testl $3,CS(%rsp)
jnz nmi_userspace
nmi_swapgs:
SWAPGS_UNSAFE_STACK
nmi_restore:
RESTORE_ALL 8
jmp irq_return
nmi_userspace:
GET_THREAD_INFO(%rcx)
movl TI_flags(%rcx),%ebx
andl $_TIF_WORK_MASK,%ebx
jz nmi_swapgs
movq %rsp,%rdi /* &pt_regs */
call sync_regs
movq %rax,%rsp /* switch stack for scheduling */
testl $_TIF_NEED_RESCHED,%ebx
jnz nmi_schedule
movl %ebx,%edx /* arg3: thread flags */
ENABLE_INTERRUPTS(CLBR_NONE)
xorl %esi,%esi /* arg2: oldset */
movq %rsp,%rdi /* arg1: &pt_regs */
call do_notify_resume
DISABLE_INTERRUPTS(CLBR_NONE)
jmp nmi_userspace
nmi_schedule:
ENABLE_INTERRUPTS(CLBR_ANY)
call schedule
DISABLE_INTERRUPTS(CLBR_ANY)
jmp nmi_userspace
CFI_ENDPROC
#else
jmp paranoid_exit
2008-11-27 21:10:08 +03:00
CFI_ENDPROC
2008-11-24 13:24:28 +01:00
#endif
END(nmi)
ENTRY(ignore_sysret)
CFI_STARTPROC
mov $-ENOSYS,%eax
sysret
CFI_ENDPROC
END(ignore_sysret)
/*
 * End of kprobes section
 */
.popsection