2005-04-16 15:20:36 -07:00
/*
* Copyright ( C ) 1995 Linus Torvalds
*
* Pentium III FXSR , SSE support
* Gareth Hughes < gareth @ valinux . com > , May 2000
*/
/*
* This file handles the architecture - dependent parts of process handling . .
*/
# include <stdarg.h>
2005-06-25 14:54:50 -07:00
# include <linux/cpu.h>
2005-04-16 15:20:36 -07:00
# include <linux/errno.h>
# include <linux/sched.h>
# include <linux/fs.h>
# include <linux/kernel.h>
# include <linux/mm.h>
# include <linux/elfcore.h>
# include <linux/smp.h>
# include <linux/stddef.h>
# include <linux/slab.h>
# include <linux/vmalloc.h>
# include <linux/user.h>
# include <linux/interrupt.h>
# include <linux/utsname.h>
# include <linux/delay.h>
# include <linux/reboot.h>
# include <linux/init.h>
# include <linux/mc146818rtc.h>
# include <linux/module.h>
# include <linux/kallsyms.h>
# include <linux/ptrace.h>
# include <linux/random.h>
2006-09-26 10:52:28 +02:00
# include <linux/personality.h>
2007-02-16 01:28:07 -08:00
# include <linux/tick.h>
2007-05-02 19:27:16 +02:00
# include <linux/percpu.h>
2008-04-14 00:24:18 +02:00
# include <linux/prctl.h>
2005-04-16 15:20:36 -07:00
# include <asm/uaccess.h>
# include <asm/pgtable.h>
# include <asm/system.h>
# include <asm/io.h>
# include <asm/ldt.h>
# include <asm/processor.h>
# include <asm/i387.h>
# include <asm/desc.h>
# ifdef CONFIG_MATH_EMULATION
# include <asm/math_emu.h>
# endif
# include <linux/err.h>
2005-06-25 14:54:50 -07:00
# include <asm/tlbflush.h>
# include <asm/cpu.h>
2008-01-30 13:30:17 +01:00
# include <asm/kdebug.h>
2005-06-25 14:54:50 -07:00
2005-04-16 15:20:36 -07:00
/*
 * Entry point whose address copy_thread() stores in a new task's
 * thread.ip. The __asm__ label pins the assembler-level symbol name.
 * NOTE(review): the blanks inside the label string look like an
 * extraction artifact - confirm against the pristine source.
 */
asmlinkage void ret_from_fork ( void ) __asm__ ( " ret_from_fork " ) ;
/*
 * Incremented by disable_hlt() and decremented by enable_hlt();
 * default_idle() only uses the halt path while this is zero.
 */
static int hlt_counter ;
/*
 * Exported flag, zero by default. It is not read anywhere in this file;
 * presumably it records an idle= boot option - TODO confirm with users.
 */
unsigned long boot_option_idle_override = 0 ;
EXPORT_SYMBOL ( boot_option_idle_override ) ;
2007-05-02 19:27:16 +02:00
/*
 * Per-CPU pointer to the task running on that CPU. Starts out as
 * &init_task and is rewritten by __switch_to() on every context switch.
 */
DEFINE_PER_CPU ( struct task_struct * , current_task ) = & init_task ;
EXPORT_PER_CPU_SYMBOL ( current_task ) ;
/*
 * Per-CPU integer; not written in this file.
 * Presumably holds the CPU's own number - TODO confirm where it is set.
 */
DEFINE_PER_CPU ( int , cpu_number ) ;
EXPORT_PER_CPU_SYMBOL ( cpu_number ) ;
2005-04-16 15:20:36 -07:00
/*
* Return saved PC of a blocked thread .
*/
unsigned long thread_saved_pc ( struct task_struct * tsk )
{
2008-01-30 13:31:02 +01:00
return ( ( unsigned long * ) tsk - > thread . sp ) [ 3 ] ;
2005-04-16 15:20:36 -07:00
}
/*
* Optional power-management idle hook . cpu_idle ( ) calls it when it is
* non - NULL and falls back to default_idle ( ) otherwise .
*/
void ( * pm_idle ) ( void ) ;
2005-06-23 00:08:33 -07:00
EXPORT_SYMBOL ( pm_idle ) ;
2005-04-16 15:20:36 -07:00
/*
 * Take one "do not halt" reference. While the counter is non-zero,
 * default_idle() skips the halt path and only relaxes the CPU.
 * Must be paired with enable_hlt().
 */
void disable_hlt(void)
{
	hlt_counter += 1;
}
EXPORT_SYMBOL(disable_hlt);
/*
 * Drop one "do not halt" reference taken by disable_hlt().
 */
void enable_hlt(void)
{
	hlt_counter -= 1;
}
EXPORT_SYMBOL(enable_hlt);
/*
* We use this if we don ' t have any better
* idle routine . .
*/
void default_idle ( void )
{
if ( ! hlt_counter & & boot_cpu_data . hlt_works_ok ) {
2006-06-26 13:59:11 +02:00
current_thread_info ( ) - > status & = ~ TS_POLLING ;
[PATCH] sched: fix bad missed wakeups in the i386, x86_64, ia64, ACPI and APM idle code
Fernando Lopez-Lezcano reported frequent scheduling latencies and audio
xruns starting at the 2.6.18-rt kernel, and those problems persisted all
until current -rt kernels. The latencies were serious and unjustified by
system load, often in the milliseconds range.
After a patient and heroic multi-month effort of Fernando, where he
tested dozens of kernels, tried various configs, boot options,
test-patches of mine and provided latency traces of those incidents, the
following 'smoking gun' trace was captured by him:
_------=> CPU#
/ _-----=> irqs-off
| / _----=> need-resched
|| / _---=> hardirq/softirq
||| / _--=> preempt-depth
|||| /
||||| delay
cmd pid ||||| time | caller
\ / ||||| \ | /
IRQ_19-1479 1D..1 0us : __trace_start_sched_wakeup (try_to_wake_up)
IRQ_19-1479 1D..1 0us : __trace_start_sched_wakeup <<...>-5856> (37 0)
IRQ_19-1479 1D..1 0us : __trace_start_sched_wakeup (c01262ba 0 0)
IRQ_19-1479 1D..1 0us : resched_task (try_to_wake_up)
IRQ_19-1479 1D..1 0us : __spin_unlock_irqrestore (try_to_wake_up)
...
<idle>-0 1...1 11us!: default_idle (cpu_idle)
...
<idle>-0 0Dn.1 602us : smp_apic_timer_interrupt (c0103baf 1 0)
...
<...>-5856 0D..2 618us : __switch_to (__schedule)
<...>-5856 0D..2 618us : __schedule <<idle>-0> (20 162)
<...>-5856 0D..2 619us : __spin_unlock_irq (__schedule)
<...>-5856 0...1 619us : trace_stop_sched_switched (__schedule)
<...>-5856 0D..1 619us : trace_stop_sched_switched <<...>-5856> (37 0)
what is visible in this trace is that CPU#1 ran try_to_wake_up() for
PID:5856, it placed PID:5856 on CPU#0's runqueue and ran resched_task()
for CPU#0. But it decided to not send an IPI that no CPU - due to
TS_POLLING. But CPU#0 never woke up after its NEED_RESCHED bit was set,
and only rescheduled to PID:5856 upon the next lapic timer IRQ. The
result was a 600+ usecs latency and a missed wakeup!
the bug turned out to be an idle-wakeup bug introduced into the mainline
kernel this summer via an optimization in the x86_64 tree:
commit 495ab9c045e1b0e5c82951b762257fe1c9d81564
Author: Andi Kleen <ak@suse.de>
Date: Mon Jun 26 13:59:11 2006 +0200
[PATCH] i386/x86-64/ia64: Move polling flag into thread_info_status
During some profiling I noticed that default_idle causes a lot of
memory traffic. I think that is caused by the atomic operations
to clear/set the polling flag in thread_info. There is actually
no reason to make this atomic - only the idle thread does it
to itself, other CPUs only read it. So I moved it into ti->status.
the problem is this type of change:
if (!hlt_counter && boot_cpu_data.hlt_works_ok) {
- clear_thread_flag(TIF_POLLING_NRFLAG);
+ current_thread_info()->status &= ~TS_POLLING;
smp_mb__after_clear_bit();
while (!need_resched()) {
local_irq_disable();
this changes clear_thread_flag() to an explicit clearing of TS_POLLING.
clear_thread_flag() is defined as:
clear_bit(flag, &ti->flags);
and clear_bit() is a LOCK-ed atomic instruction on all x86 platforms:
static inline void clear_bit(int nr, volatile unsigned long * addr)
{
__asm__ __volatile__( LOCK_PREFIX
"btrl %1,%0"
hence smp_mb__after_clear_bit() is defined as a simple compile barrier:
#define smp_mb__after_clear_bit() barrier()
but the explicit TS_POLLING clearing introduced by the patch:
+ current_thread_info()->status &= ~TS_POLLING;
is not an atomic op! So the clearing of the TS_POLLING bit is freely
reorderable with the reading of the NEED_RESCHED bit - and both now
reside in different memory addresses.
CPU idle wakeup very much depends on ordered memory ops, the clearing of
the TS_POLLING flag must always be done before we test need_resched()
and hit the idle instruction(s). [Symmetrically, the wakeup code needs
to set NEED_RESCHED before it tests the TS_POLLING flag, so memory
ordering is paramount.]
Fernando's dual-core Athlon64 system has a sufficiently advanced memory
ordering model so that it triggered this scenario very often.
( And it also turned out that the reason why these latencies never
triggered on my testsystems is that i routinely use idle=poll, which
was the only idle variant not affected by this bug. )
The fix is to change the smp_mb__after_clear_bit() to an smp_mb(), to
act as an absolute barrier between the TS_POLLING write and the
NEED_RESCHED read. This affects almost all idling methods (default,
ACPI, APM), on all 3 x86 architectures: i386, x86_64, ia64.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Tested-by: Fernando Lopez-Lezcano <nando@ccrma.Stanford.EDU>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-22 01:11:56 -08:00
/*
* TS_POLLING - cleared state must be visible before we
* test NEED_RESCHED :
*/
smp_mb ( ) ;
2008-04-25 17:39:01 +02:00
if ( ! need_resched ( ) )
2006-12-07 02:14:03 +01:00
safe_halt ( ) ; /* enables interrupts racelessly */
2008-04-25 17:39:01 +02:00
else
local_irq_enable ( ) ;
2006-06-26 13:59:11 +02:00
current_thread_info ( ) - > status | = TS_POLLING ;
2005-04-16 15:20:36 -07:00
} else {
2008-03-19 14:25:06 -03:00
local_irq_enable ( ) ;
2006-12-07 02:14:03 +01:00
/* loop is done by the caller */
cpu_relax ( ) ;
2005-04-16 15:20:36 -07:00
}
}
2005-06-23 00:08:33 -07:00
# ifdef CONFIG_APM_MODULE
EXPORT_SYMBOL ( default_idle ) ;
# endif
2005-04-16 15:20:36 -07:00
2005-06-25 14:54:50 -07:00
#ifdef CONFIG_HOTPLUG_CPU
#include <asm/nmi.h>
/*
 * Park an offlined CPU. The CPU is not actually taken down: it
 * acknowledges the request and then halts forever with interrupts off.
 */
static inline void play_dead(void)
{
	/* This must be done before the dead-CPU acknowledgement below. */
	cpu_exit_clear();
	wbinvd();
	mb();

	/* Acknowledge that this CPU is gone. */
	__get_cpu_var(cpu_state) = CPU_DEAD;

	/* With physical CPU hotplug, we should halt the cpu. */
	local_irq_disable();
	for (;;)
		halt();
}
#else
/* Without CPU hotplug support, reaching this point is a bug. */
static inline void play_dead(void)
{
	BUG();
}
#endif /* CONFIG_HOTPLUG_CPU */
2005-04-16 15:20:36 -07:00
/*
* The idle thread . There ' s no useful work to be
* done , so just try to conserve power and have a
* low exit latency ( ie sit in a loop waiting for
* somebody to say that they ' d like to reschedule )
*/
2005-06-25 14:54:50 -07:00
void cpu_idle ( void )
2005-04-16 15:20:36 -07:00
{
2005-11-08 21:39:01 -08:00
int cpu = smp_processor_id ( ) ;
2005-06-25 14:54:50 -07:00
2006-06-26 13:59:11 +02:00
current_thread_info ( ) - > status | = TS_POLLING ;
[PATCH] sched: resched and cpu_idle rework
Make some changes to the NEED_RESCHED and POLLING_NRFLAG to reduce
confusion, and make their semantics rigid. Improves efficiency of
resched_task and some cpu_idle routines.
* In resched_task:
- TIF_NEED_RESCHED is only cleared with the task's runqueue lock held,
and as we hold it during resched_task, then there is no need for an
atomic test and set there. The only other time this should be set is
when the task's quantum expires, in the timer interrupt - this is
protected against because the rq lock is irq-safe.
- If TIF_NEED_RESCHED is set, then we don't need to do anything. It
won't get unset until the task get's schedule()d off.
- If we are running on the same CPU as the task we resched, then set
TIF_NEED_RESCHED and no further action is required.
- If we are running on another CPU, and TIF_POLLING_NRFLAG is *not* set
after TIF_NEED_RESCHED has been set, then we need to send an IPI.
Using these rules, we are able to remove the test and set operation in
resched_task, and make clear the previously vague semantics of
POLLING_NRFLAG.
* In idle routines:
- Enter cpu_idle with preempt disabled. When the need_resched() condition
becomes true, explicitly call schedule(). This makes things a bit clearer
(IMO), but haven't updated all architectures yet.
- Many do a test and clear of TIF_NEED_RESCHED for some reason. According
to the resched_task rules, this isn't needed (and actually breaks the
assumption that TIF_NEED_RESCHED is only cleared with the runqueue lock
held). So remove that. Generally one less locked memory op when switching
to the idle thread.
- Many idle routines clear TIF_POLLING_NRFLAG, and only set it in the inner
most polling idle loops. The above resched_task semantics allow it to be
set until before the last time need_resched() is checked before going into
a halt requiring interrupt wakeup.
Many idle routines simply never enter such a halt, and so POLLING_NRFLAG
can be always left set, completely eliminating resched IPIs when rescheduling
the idle task.
POLLING_NRFLAG width can be increased, to reduce the chance of resched IPIs.
Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Con Kolivas <kernel@kolivas.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-11-08 21:39:04 -08:00
2005-04-16 15:20:36 -07:00
/* endless idle loop with no priority at all */
while ( 1 ) {
2007-02-16 01:28:07 -08:00
tick_nohz_stop_sched_tick ( ) ;
2005-04-16 15:20:36 -07:00
while ( ! need_resched ( ) ) {
void ( * idle ) ( void ) ;
2007-05-12 11:15:24 -07:00
check_pgt_cache ( ) ;
2005-04-16 15:20:36 -07:00
rmb ( ) ;
idle = pm_idle ;
2008-01-30 13:33:13 +01:00
if ( rcu_pending ( cpu ) )
rcu_check_callbacks ( cpu , 0 ) ;
2005-04-16 15:20:36 -07:00
if ( ! idle )
idle = default_idle ;
2005-06-25 14:54:50 -07:00
if ( cpu_is_offline ( cpu ) )
play_dead ( ) ;
2008-04-25 17:39:01 +02:00
local_irq_disable ( ) ;
2005-04-16 15:20:36 -07:00
__get_cpu_var ( irq_stat ) . idle_timestamp = jiffies ;
idle ( ) ;
}
2007-02-16 01:28:07 -08:00
tick_nohz_restart_sched_tick ( ) ;
2005-11-08 21:39:01 -08:00
preempt_enable_no_resched ( ) ;
2005-04-16 15:20:36 -07:00
schedule ( ) ;
2005-11-08 21:39:01 -08:00
preempt_disable ( ) ;
2005-04-16 15:20:36 -07:00
}
}
2007-10-19 20:35:03 +02:00
/*
 * Dump the register state described by regs through printk ( ) .
 *
 * Always printed : pid , comm , taint string , kernel release and a prefix
 * of the version string , then EIP / EFLAGS / CPU , the general - purpose
 * registers and the segment registers .
 * When all is non - zero , CR0 , CR2 , CR3 , CR4 and DR0 - DR3 , DR6 , DR7
 * are read from the running CPU and printed as well .
 *
 * NOTE ( review ) : the string literals below begin and end with a blank ,
 * which looks like an extraction artifact - confirm against the
 * pristine source before relying on the exact output format .
 */
void __show_registers ( struct pt_regs * regs , int all )
2005-04-16 15:20:36 -07:00
{
unsigned long cr0 = 0L , cr2 = 0L , cr3 = 0L , cr4 = 0L ;
2007-07-21 17:10:42 +02:00
unsigned long d0 , d1 , d2 , d3 , d6 , d7 ;
2008-01-30 13:30:56 +01:00
unsigned long sp ;
2007-10-19 20:35:03 +02:00
unsigned short ss , gs ;
/*
 * For user - mode frames sp and ss come from the saved frame . Otherwise
 * sp is the address of the frame ' s sp slot and ss is read from the
 * live segment register . gs is read from the live register either way .
 */
if ( user_mode_vm ( regs ) ) {
2008-01-30 13:30:56 +01:00
sp = regs - > sp ;
ss = regs - > ss & 0xffff ;
2007-10-19 20:35:03 +02:00
savesegment ( gs , gs ) ;
} else {
2008-01-30 13:30:56 +01:00
sp = ( unsigned long ) ( & regs - > sp ) ;
2007-10-19 20:35:03 +02:00
savesegment ( ss , ss ) ;
savesegment ( gs , gs ) ;
}
2005-04-16 15:20:36 -07:00
printk ( " \n " ) ;
2007-10-19 15:06:00 -07:00
/* The version string is cut at the first character from the strcspn ( ) reject set . */
printk ( " Pid: %d, comm: %s %s (%s %.*s) \n " ,
task_pid_nr ( current ) , current - > comm ,
2007-10-19 20:35:03 +02:00
print_tainted ( ) , init_utsname ( ) - > release ,
( int ) strcspn ( init_utsname ( ) - > version , " " ) ,
init_utsname ( ) - > version ) ;
printk ( " EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d \n " ,
2008-02-08 12:09:56 -08:00
( u16 ) regs - > cs , regs - > ip , regs - > flags ,
2007-10-19 20:35:03 +02:00
smp_processor_id ( ) ) ;
2008-01-30 13:30:56 +01:00
print_symbol ( " EIP is at %s \n " , regs - > ip ) ;
2005-04-16 15:20:36 -07:00
printk ( " EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx \n " ,
2008-01-30 13:30:56 +01:00
regs - > ax , regs - > bx , regs - > cx , regs - > dx ) ;
2007-10-19 20:35:03 +02:00
printk ( " ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx \n " ,
2008-01-30 13:30:56 +01:00
regs - > si , regs - > di , regs - > bp , sp ) ;
2007-10-19 20:35:03 +02:00
printk ( " DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x \n " ,
2008-02-08 12:09:56 -08:00
( u16 ) regs - > ds , ( u16 ) regs - > es , ( u16 ) regs - > fs , gs , ss ) ;
2007-10-19 20:35:03 +02:00
/* Short form requested : skip the control and debug registers . */
if ( ! all )
return ;
2005-04-16 15:20:36 -07:00
2005-09-03 15:56:36 -07:00
cr0 = read_cr0 ( ) ;
cr2 = read_cr2 ( ) ;
cr3 = read_cr3 ( ) ;
2006-01-06 00:11:50 -08:00
cr4 = read_cr4_safe ( ) ;
2007-10-19 20:35:03 +02:00
printk ( " CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx \n " ,
cr0 , cr2 , cr3 , cr4 ) ;
2007-07-21 17:10:42 +02:00
get_debugreg ( d0 , 0 ) ;
get_debugreg ( d1 , 1 ) ;
get_debugreg ( d2 , 2 ) ;
get_debugreg ( d3 , 3 ) ;
printk ( " DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx \n " ,
d0 , d1 , d2 , d3 ) ;
2007-10-19 20:35:03 +02:00
2007-07-21 17:10:42 +02:00
/* Debug registers 4 and 5 are not read here . */
get_debugreg ( d6 , 6 ) ;
get_debugreg ( d7 , 7 ) ;
2007-10-19 20:35:03 +02:00
printk ( " DR6: %08lx DR7: %08lx \n " ,
d6 , d7 ) ;
}
2007-07-21 17:10:42 +02:00
2007-10-19 20:35:03 +02:00
void show_regs ( struct pt_regs * regs )
{
__show_registers ( regs , 1 ) ;
2008-01-30 13:33:07 +01:00
show_trace ( NULL , regs , & regs - > sp , regs - > bp ) ;
2005-04-16 15:20:36 -07:00
}
/*
2008-01-30 13:30:56 +01:00
* This gets run with % bx containing the
* function to call , and % dx containing
2005-04-16 15:20:36 -07:00
* the " args " .
*/
extern void kernel_thread_helper ( void ) ;
/*
* Create a kernel thread
*/
int kernel_thread ( int ( * fn ) ( void * ) , void * arg , unsigned long flags )
{
struct pt_regs regs ;
memset ( & regs , 0 , sizeof ( regs ) ) ;
2008-01-30 13:30:56 +01:00
regs . bx = ( unsigned long ) fn ;
regs . dx = ( unsigned long ) arg ;
2005-04-16 15:20:36 -07:00
2008-01-30 13:30:56 +01:00
regs . ds = __USER_DS ;
regs . es = __USER_DS ;
regs . fs = __KERNEL_PERCPU ;
regs . orig_ax = - 1 ;
regs . ip = ( unsigned long ) kernel_thread_helper ;
regs . cs = __KERNEL_CS | get_kernel_rpl ( ) ;
regs . flags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2 ;
2005-04-16 15:20:36 -07:00
/* Ok, create the new process.. */
2006-10-21 18:37:02 +02:00
return do_fork ( flags | CLONE_VM | CLONE_UNTRACED , 0 , & regs , 0 , NULL , NULL ) ;
2005-04-16 15:20:36 -07:00
}
2005-06-23 00:08:33 -07:00
EXPORT_SYMBOL ( kernel_thread ) ;
2005-04-16 15:20:36 -07:00
/*
* Free current thread data structures etc . .
*/
void exit_thread ( void )
{
/* The process may have allocated an io port bitmap... nuke it. */
2006-07-09 21:12:39 -04:00
if ( unlikely ( test_thread_flag ( TIF_IO_BITMAP ) ) ) {
struct task_struct * tsk = current ;
struct thread_struct * t = & tsk - > thread ;
2005-04-16 15:20:36 -07:00
int cpu = get_cpu ( ) ;
struct tss_struct * tss = & per_cpu ( init_tss , cpu ) ;
kfree ( t - > io_bitmap_ptr ) ;
t - > io_bitmap_ptr = NULL ;
2006-07-09 21:12:39 -04:00
clear_thread_flag ( TIF_IO_BITMAP ) ;
2005-04-16 15:20:36 -07:00
/*
* Careful , clear this in the TSS too :
*/
memset ( tss - > io_bitmap , 0xff , tss - > io_bitmap_max ) ;
t - > io_bitmap_max = 0 ;
tss - > io_bitmap_owner = NULL ;
tss - > io_bitmap_max = 0 ;
2007-05-02 19:27:13 +02:00
tss - > x86_tss . io_bitmap_base = INVALID_IO_BITMAP_OFFSET ;
2005-04-16 15:20:36 -07:00
put_cpu ( ) ;
}
}
void flush_thread ( void )
{
struct task_struct * tsk = current ;
2008-01-30 13:30:59 +01:00
tsk - > thread . debugreg0 = 0 ;
tsk - > thread . debugreg1 = 0 ;
tsk - > thread . debugreg2 = 0 ;
tsk - > thread . debugreg3 = 0 ;
tsk - > thread . debugreg6 = 0 ;
tsk - > thread . debugreg7 = 0 ;
2005-04-16 15:20:36 -07:00
memset ( tsk - > thread . tls_array , 0 , sizeof ( tsk - > thread . tls_array ) ) ;
2006-07-09 21:12:39 -04:00
clear_tsk_thread_flag ( tsk , TIF_DEBUG ) ;
2005-04-16 15:20:36 -07:00
/*
* Forget coprocessor state . .
*/
clear_fpu ( tsk ) ;
clear_used_math ( ) ;
}
void release_thread ( struct task_struct * dead_task )
{
2006-01-06 00:11:59 -08:00
BUG_ON ( dead_task - > mm ) ;
2005-04-16 15:20:36 -07:00
release_vm86_irqs ( dead_task ) ;
}
/*
 * Called before a new thread is allocated and the current task is
 * copied into it. Calls unlazy_fpu() on the task first.
 * NOTE(review): presumably this makes the copy see current FPU
 * contents - confirm against unlazy_fpu().
 */
void prepare_to_copy(struct task_struct *tsk)
{
	unlazy_fpu(tsk);
}
2008-01-30 13:30:56 +01:00
int copy_thread ( int nr , unsigned long clone_flags , unsigned long sp ,
2005-04-16 15:20:36 -07:00
unsigned long unused ,
struct task_struct * p , struct pt_regs * regs )
{
struct pt_regs * childregs ;
struct task_struct * tsk ;
int err ;
2006-01-12 01:05:41 -08:00
childregs = task_pt_regs ( p ) ;
2005-05-05 16:15:03 -07:00
* childregs = * regs ;
2008-01-30 13:30:56 +01:00
childregs - > ax = 0 ;
childregs - > sp = sp ;
2005-05-05 16:15:03 -07:00
2008-01-30 13:31:02 +01:00
p - > thread . sp = ( unsigned long ) childregs ;
p - > thread . sp0 = ( unsigned long ) ( childregs + 1 ) ;
2005-04-16 15:20:36 -07:00
2008-01-30 13:31:02 +01:00
p - > thread . ip = ( unsigned long ) ret_from_fork ;
2005-04-16 15:20:36 -07:00
2008-01-30 13:31:03 +01:00
savesegment ( gs , p - > thread . gs ) ;
2005-04-16 15:20:36 -07:00
tsk = current ;
2006-07-09 21:12:39 -04:00
if ( unlikely ( test_tsk_thread_flag ( tsk , TIF_IO_BITMAP ) ) ) {
2006-09-30 23:27:21 -07:00
p - > thread . io_bitmap_ptr = kmemdup ( tsk - > thread . io_bitmap_ptr ,
IO_BITMAP_BYTES , GFP_KERNEL ) ;
2005-04-16 15:20:36 -07:00
if ( ! p - > thread . io_bitmap_ptr ) {
p - > thread . io_bitmap_max = 0 ;
return - ENOMEM ;
}
2006-07-09 21:12:39 -04:00
set_tsk_thread_flag ( p , TIF_IO_BITMAP ) ;
2005-04-16 15:20:36 -07:00
}
2008-01-30 13:30:46 +01:00
err = 0 ;
2005-04-16 15:20:36 -07:00
/*
* Set a new TLS for the child thread ?
*/
2008-01-30 13:30:46 +01:00
if ( clone_flags & CLONE_SETTLS )
err = do_set_thread_area ( p , - 1 ,
2008-01-30 13:30:56 +01:00
( struct user_desc __user * ) childregs - > si , 0 ) ;
2005-04-16 15:20:36 -07:00
if ( err & & p - > thread . io_bitmap_ptr ) {
kfree ( p - > thread . io_bitmap_ptr ) ;
p - > thread . io_bitmap_max = 0 ;
}
return err ;
}
2008-02-21 05:18:40 +01:00
/*
 * Rewrite the register frame so that the current task continues in user
 * mode at new_ip with stack pointer new_sp .
 *
 * The live % gs register and the saved fs are zeroed , the address limit
 * is set to USER_DS , the saved ds / es / ss and cs are set to the user
 * data and code selectors , and the task ' s old extended ( FP ) state is
 * freed .
 *
 * NOTE ( review ) : the blanks inside the asm template and constraint
 * strings look like an extraction artifact - confirm against the
 * pristine source .
 */
void
start_thread ( struct pt_regs * regs , unsigned long new_ip , unsigned long new_sp )
{
/* Load a null selector into the live % gs register . */
__asm__ ( " movl %0, %%gs " : : " r " ( 0 ) ) ;
regs - > fs = 0 ;
set_fs ( USER_DS ) ;
regs - > ds = __USER_DS ;
regs - > es = __USER_DS ;
regs - > ss = __USER_DS ;
regs - > cs = __USER_CS ;
regs - > ip = new_ip ;
regs - > sp = new_sp ;
2008-03-10 15:28:05 -07:00
/*
* Free the old FP and other extended state
*/
free_thread_xstate ( current ) ;
2008-02-21 05:18:40 +01:00
}
EXPORT_SYMBOL_GPL ( start_thread ) ;
2008-01-30 13:31:21 +01:00
static void hard_disable_TSC ( void )
2007-07-15 23:41:33 -07:00
{
write_cr4 ( read_cr4 ( ) | X86_CR4_TSD ) ;
}
2008-04-14 00:24:18 +02:00
2007-07-15 23:41:33 -07:00
/*
 * Mark the current task TIF_NOTSC and, if it was not marked already,
 * set CR4.TSD on this CPU. Preemption stays disabled across both steps
 * so the flag and the CPU state change in the same running context.
 */
void disable_TSC(void)
{
	int was_set;

	preempt_disable();
	was_set = test_and_set_thread_flag(TIF_NOTSC);
	if (!was_set) {
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOTSC in the current running context.
		 */
		hard_disable_TSC();
	}
	preempt_enable();
}
2008-04-14 00:24:18 +02:00
2008-01-30 13:31:21 +01:00
static void hard_enable_TSC ( void )
2007-07-15 23:41:33 -07:00
{
write_cr4 ( read_cr4 ( ) & ~ X86_CR4_TSD ) ;
}
2008-04-14 00:24:18 +02:00
2008-04-23 13:20:56 +02:00
static void enable_TSC ( void )
2008-04-14 00:24:18 +02:00
{
preempt_disable ( ) ;
if ( test_and_clear_thread_flag ( TIF_NOTSC ) )
/*
* Must flip the CPU state synchronously with
* TIF_NOTSC in the current running context .
*/
hard_enable_TSC ( ) ;
preempt_enable ( ) ;
}
int get_tsc_mode ( unsigned long adr )
{
unsigned int val ;
if ( test_thread_flag ( TIF_NOTSC ) )
val = PR_TSC_SIGSEGV ;
else
val = PR_TSC_ENABLE ;
return put_user ( val , ( unsigned int __user * ) adr ) ;
}
/*
 * Select the current task's TSC mode.
 *
 * PR_TSC_SIGSEGV disables the TSC for the task, PR_TSC_ENABLE enables
 * it again. Returns 0 on success or -EINVAL for any other value.
 */
int set_tsc_mode(unsigned int val)
{
	switch (val) {
	case PR_TSC_SIGSEGV:
		disable_TSC();
		break;
	case PR_TSC_ENABLE:
		enable_TSC();
		break;
	default:
		return -EINVAL;
	}

	return 0;
}
2007-07-15 23:41:33 -07:00
static noinline void
__switch_to_xtra ( struct task_struct * prev_p , struct task_struct * next_p ,
struct tss_struct * tss )
2005-04-16 15:20:36 -07:00
{
2008-01-30 13:30:54 +01:00
struct thread_struct * prev , * next ;
2008-01-30 13:31:09 +01:00
unsigned long debugctl ;
2006-07-09 21:12:39 -04:00
2008-01-30 13:30:54 +01:00
prev = & prev_p - > thread ;
2006-07-09 21:12:39 -04:00
next = & next_p - > thread ;
2008-01-30 13:31:09 +01:00
debugctl = prev - > debugctlmsr ;
if ( next - > ds_area_msr ! = prev - > ds_area_msr ) {
/* we clear debugctl to make sure DS
* is not in use when we change it */
debugctl = 0 ;
2008-03-10 13:11:17 +00:00
update_debugctlmsr ( 0 ) ;
2008-01-30 13:31:09 +01:00
wrmsr ( MSR_IA32_DS_AREA , next - > ds_area_msr , 0 ) ;
}
if ( next - > debugctlmsr ! = debugctl )
2008-03-10 13:11:17 +00:00
update_debugctlmsr ( next - > debugctlmsr ) ;
2008-01-30 13:30:54 +01:00
2006-07-09 21:12:39 -04:00
if ( test_tsk_thread_flag ( next_p , TIF_DEBUG ) ) {
2008-01-30 13:30:59 +01:00
set_debugreg ( next - > debugreg0 , 0 ) ;
set_debugreg ( next - > debugreg1 , 1 ) ;
set_debugreg ( next - > debugreg2 , 2 ) ;
set_debugreg ( next - > debugreg3 , 3 ) ;
2006-07-09 21:12:39 -04:00
/* no 4 and 5 */
2008-01-30 13:30:59 +01:00
set_debugreg ( next - > debugreg6 , 6 ) ;
set_debugreg ( next - > debugreg7 , 7 ) ;
2006-07-09 21:12:39 -04:00
}
2007-07-15 23:41:33 -07:00
if ( test_tsk_thread_flag ( prev_p , TIF_NOTSC ) ^
test_tsk_thread_flag ( next_p , TIF_NOTSC ) ) {
/* prev and next are different */
if ( test_tsk_thread_flag ( next_p , TIF_NOTSC ) )
hard_disable_TSC ( ) ;
else
hard_enable_TSC ( ) ;
}
2008-02-26 09:40:27 +01:00
# ifdef X86_BTS
2008-01-30 13:31:09 +01:00
if ( test_tsk_thread_flag ( prev_p , TIF_BTS_TRACE_TS ) )
ptrace_bts_take_timestamp ( prev_p , BTS_TASK_DEPARTS ) ;
if ( test_tsk_thread_flag ( next_p , TIF_BTS_TRACE_TS ) )
ptrace_bts_take_timestamp ( next_p , BTS_TASK_ARRIVES ) ;
2008-02-26 09:40:27 +01:00
# endif
2008-01-30 13:31:09 +01:00
2006-07-09 21:12:39 -04:00
if ( ! test_tsk_thread_flag ( next_p , TIF_IO_BITMAP ) ) {
2005-04-16 15:20:36 -07:00
/*
* Disable the bitmap via an invalid offset . We still cache
* the previous bitmap owner and the IO bitmap contents :
*/
2007-05-02 19:27:13 +02:00
tss - > x86_tss . io_bitmap_base = INVALID_IO_BITMAP_OFFSET ;
2005-04-16 15:20:36 -07:00
return ;
}
2006-07-09 21:12:39 -04:00
2005-04-16 15:20:36 -07:00
if ( likely ( next = = tss - > io_bitmap_owner ) ) {
/*
* Previous owner of the bitmap ( hence the bitmap content )
* matches the next task , we dont have to do anything but
* to set a valid offset in the TSS :
*/
2007-05-02 19:27:13 +02:00
tss - > x86_tss . io_bitmap_base = IO_BITMAP_OFFSET ;
2005-04-16 15:20:36 -07:00
return ;
}
/*
* Lazy TSS ' s I / O bitmap copy . We set an invalid offset here
* and we let the task to get a GPF in case an I / O instruction
* is performed . The handler of the GPF will verify that the
* faulting task has a valid I / O bitmap and , it true , does the
* real copy and restart the instruction . This will save us
* redundant copies when the currently switched task does not
* perform any I / O during its timeslice .
*/
2007-05-02 19:27:13 +02:00
tss - > x86_tss . io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY ;
2005-04-16 15:20:36 -07:00
}
/*
* switch_to ( x , yn ) should switch tasks from x to y .
*
* We fsave / fwait so that an exception goes off at the right time
* ( as a call from the fsave or fwait in effect ) rather than to
* the wrong process . Lazy FP saving no longer makes any sense
* with modern CPU ' s , and this simplifies a lot of things ( SMP
* and UP become the same ) .
*
* NOTE ! We used to use the x86 hardware context switching . The
* reason for not using it any more becomes apparent when you
* try to recover gracefully from saved state that is no longer
* valid ( stale segment register values in particular ) . With the
* hardware task - switch , there is no way to fix up bad state in
* a reasonable manner .
*
* The fact that Intel documents the hardware task - switching to
* be slow is a fairly red herring - this code is not noticeably
* faster . However , there _is_ some room for improvement here ,
* so the performance issues may eventually be a valid point .
* More important , however , is the fact that this allows us much
* more flexibility .
*
2008-01-30 13:30:56 +01:00
* The return value ( in % ax ) will be the " prev " task after
2005-04-16 15:20:36 -07:00
* the task - switch , and shows up in ret_from_fork in entry . S ,
* for example .
*/
2008-01-30 13:31:17 +01:00
struct task_struct * __switch_to ( struct task_struct * prev_p , struct task_struct * next_p )
2005-04-16 15:20:36 -07:00
{
struct thread_struct * prev = & prev_p - > thread ,
* next = & next_p - > thread ;
int cpu = smp_processor_id ( ) ;
struct tss_struct * tss = & per_cpu ( init_tss , cpu ) ;
/* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
__unlazy_fpu ( prev_p ) ;
2006-12-07 02:14:01 +01:00
/* we're going to use this soon, after a few expensive things */
if ( next_p - > fpu_counter > 5 )
2008-03-10 15:28:04 -07:00
prefetch ( next - > xstate ) ;
2006-12-07 02:14:01 +01:00
2005-04-16 15:20:36 -07:00
/*
2005-09-03 15:56:39 -07:00
* Reload esp0 .
2005-04-16 15:20:36 -07:00
*/
2008-01-30 13:31:02 +01:00
load_sp0 ( tss , next ) ;
2005-04-16 15:20:36 -07:00
/*
2007-02-13 13:26:20 +01:00
* Save away % gs . No need to save % fs , as it was saved on the
[PATCH] i386: Use %gs as the PDA base-segment in the kernel
This patch is the meat of the PDA change. This patch makes several related
changes:
1: Most significantly, %gs is now used in the kernel. This means that on
entry, the old value of %gs is saved away, and it is reloaded with
__KERNEL_PDA.
2: entry.S constructs the stack in the shape of struct pt_regs, and this
is passed around the kernel so that the process's saved register
state can be accessed.
Unfortunately struct pt_regs doesn't currently have space for %gs
(or %fs). This patch extends pt_regs to add space for gs (no space
is allocated for %fs, since it won't be used, and it would just
complicate the code in entry.S to work around the space).
3: Because %gs is now saved on the stack like %ds, %es and the integer
registers, there are a number of places where it no longer needs to
be handled specially; namely context switch, and saving/restoring the
register state in a signal context.
4: And since kernel threads run in kernel space and call normal kernel
code, they need to be created with their %gs == __KERNEL_PDA.
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Chuck Ebbert <76306.1226@compuserve.com>
Cc: Zachary Amsden <zach@vmware.com>
Cc: Jan Beulich <jbeulich@novell.com>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
2006-12-07 02:14:02 +01:00
* stack on entry . No need to save % es and % ds , as those are
* always kernel segments while inside the kernel . Doing this
* before setting the new TLS descriptors avoids the situation
* where we temporarily have non - reloadable segments in % fs
* and % gs . This could be an issue if the NMI handler ever
* used % fs or % gs ( it does not today ) , or if the kernel is
* running inside of a hypervisor layer .
2005-04-16 15:20:36 -07:00
*/
2007-02-13 13:26:20 +01:00
savesegment ( gs , prev - > gs ) ;
2005-04-16 15:20:36 -07:00
/*
2005-09-03 15:56:39 -07:00
* Load the per - thread Thread - Local Storage descriptor .
2005-04-16 15:20:36 -07:00
*/
2005-09-03 15:56:39 -07:00
load_TLS ( next , cpu ) ;
2005-04-16 15:20:36 -07:00
2007-02-13 13:26:21 +01:00
/*
* Restore IOPL if needed . In normal use , the flags restore
* in the switch assembly will handle this . But if the kernel
* is running virtualized at a non - zero CPL , the popf will
* not restore flags , so it must be done in a separate step .
*/
if ( get_kernel_rpl ( ) & & unlikely ( prev - > iopl ! = next - > iopl ) )
set_iopl_mask ( next - > iopl ) ;
2005-04-16 15:20:36 -07:00
/*
2006-07-09 21:12:39 -04:00
* Now maybe handle debug registers and / or IO bitmaps
2005-04-16 15:20:36 -07:00
*/
2007-07-15 23:41:33 -07:00
if ( unlikely ( task_thread_info ( prev_p ) - > flags & _TIF_WORK_CTXSW_PREV | |
task_thread_info ( next_p ) - > flags & _TIF_WORK_CTXSW_NEXT ) )
__switch_to_xtra ( prev_p , next_p , tss ) ;
2005-06-27 14:36:36 -07:00
2007-02-13 13:26:21 +01:00
/*
* Leave lazy mode , flushing any hypercalls made here .
* This must be done before restoring TLS segments so
* the GDT and LDT are properly updated , and must be
* done before math_state_restore , so the TS bit is up
* to date .
*/
arch_leave_lazy_cpu_mode ( ) ;
2006-12-07 02:14:01 +01:00
/* If the task has used fpu the last 5 timeslices, just do a full
* restore of the math state immediately to avoid the trap ; the
* chances of needing FPU soon are obviously high now
*/
if ( next_p - > fpu_counter > 5 )
math_state_restore ( ) ;
2007-02-13 13:26:21 +01:00
/*
* Restore % gs if needed ( which is common )
*/
if ( prev - > gs | next - > gs )
loadsegment ( gs , next - > gs ) ;
2007-05-02 19:27:16 +02:00
x86_write_percpu ( current_task , next_p ) ;
2007-02-13 13:26:21 +01:00
2005-04-16 15:20:36 -07:00
return prev_p ;
}
asmlinkage int sys_fork ( struct pt_regs regs )
{
2008-01-30 13:30:56 +01:00
return do_fork ( SIGCHLD , regs . sp , & regs , 0 , NULL , NULL ) ;
2005-04-16 15:20:36 -07:00
}
/*
 * clone() system call. Arguments arrive in registers:
 *   bx - clone flags
 *   cx - new stack pointer (0 means keep the caller's)
 *   dx - user pointer for the parent tid
 *   di - user pointer for the child tid
 */
asmlinkage int sys_clone(struct pt_regs regs)
{
	unsigned long clone_flags = regs.bx;
	unsigned long newsp = regs.cx ? regs.cx : regs.sp;
	int __user *parent_tidptr = (int __user *)regs.dx;
	int __user *child_tidptr = (int __user *)regs.di;

	return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr);
}
/*
* This is trivial , and on the face of it looks like it
* could equally well be done in user mode .
*
* Not so , for quite unobvious reasons - register pressure .
* In user mode vfork ( ) cannot have a stack frame , and if
* done by calling the " clone() " system call directly , you
* do not have enough call - clobbered registers to hold all
* the information you need .
*/
asmlinkage int sys_vfork ( struct pt_regs regs )
{
2008-01-30 13:30:56 +01:00
return do_fork ( CLONE_VFORK | CLONE_VM | SIGCHLD , regs . sp , & regs , 0 , NULL , NULL ) ;
2005-04-16 15:20:36 -07:00
}
/*
* sys_execve ( ) executes a new program .
*/
asmlinkage int sys_execve ( struct pt_regs regs )
{
int error ;
char * filename ;
2008-01-30 13:30:56 +01:00
filename = getname ( ( char __user * ) regs . bx ) ;
2005-04-16 15:20:36 -07:00
error = PTR_ERR ( filename ) ;
if ( IS_ERR ( filename ) )
goto out ;
error = do_execve ( filename ,
2008-01-30 13:30:56 +01:00
( char __user * __user * ) regs . cx ,
( char __user * __user * ) regs . dx ,
2005-04-16 15:20:36 -07:00
& regs ) ;
if ( error = = 0 ) {
/* Make sure we don't return using sysenter.. */
set_thread_flag ( TIF_IRET ) ;
}
putname ( filename ) ;
out :
return error ;
}
# define top_esp (THREAD_SIZE - sizeof(unsigned long))
# define top_ebp (THREAD_SIZE - 2*sizeof(unsigned long))
unsigned long get_wchan ( struct task_struct * p )
{
2008-01-30 13:30:56 +01:00
unsigned long bp , sp , ip ;
2005-04-16 15:20:36 -07:00
unsigned long stack_page ;
int count = 0 ;
if ( ! p | | p = = current | | p - > state = = TASK_RUNNING )
return 0 ;
2006-01-12 01:05:41 -08:00
stack_page = ( unsigned long ) task_stack_page ( p ) ;
2008-01-30 13:31:02 +01:00
sp = p - > thread . sp ;
2008-01-30 13:30:56 +01:00
if ( ! stack_page | | sp < stack_page | | sp > top_esp + stack_page )
2005-04-16 15:20:36 -07:00
return 0 ;
2008-01-30 13:30:56 +01:00
/* include/asm-i386/system.h:switch_to() pushes bp last. */
bp = * ( unsigned long * ) sp ;
2005-04-16 15:20:36 -07:00
do {
2008-01-30 13:30:56 +01:00
if ( bp < stack_page | | bp > top_ebp + stack_page )
2005-04-16 15:20:36 -07:00
return 0 ;
2008-01-30 13:30:56 +01:00
ip = * ( unsigned long * ) ( bp + 4 ) ;
if ( ! in_sched_functions ( ip ) )
return ip ;
bp = * ( unsigned long * ) bp ;
2005-04-16 15:20:36 -07:00
} while ( count + + < 16 ) ;
return 0 ;
}
unsigned long arch_align_stack ( unsigned long sp )
{
2006-09-26 10:52:28 +02:00
if ( ! ( current - > personality & ADDR_NO_RANDOMIZE ) & & randomize_va_space )
2005-04-16 15:20:36 -07:00
sp - = get_random_int ( ) % 8192 ;
return sp & ~ 0xf ;
}
2008-01-30 13:30:40 +01:00
unsigned long arch_randomize_brk ( struct mm_struct * mm )
{
unsigned long range_end = mm - > brk + 0x02000000 ;
return randomize_range ( mm - > brk , range_end , 0 ) ? : mm - > brk ;
}