2005-04-16 15:20:36 -07:00
/*
* x86 SMP booting functions
*
* ( c ) 1995 Alan Cox , Building # 3 < alan @ redhat . com >
* ( c ) 1998 , 1999 , 2000 Ingo Molnar < mingo @ redhat . com >
*
* Much of the core SMP work is based on previous work by Thomas Radke , to
* whom a great many thanks are extended .
*
* Thanks to Intel for making available several different Pentium ,
* Pentium Pro and Pentium - II / Xeon MP machines .
* Original development of Linux SMP code supported by Caldera .
*
* This code is released under the GNU General Public License version 2 or
* later .
*
* Fixes
* Felix Koop : NR_CPUS used properly
* Jose Renau : Handle single CPU case .
* Alan Cox : By repeated request 8 ) - Total BogoMIPS report .
* Greg Wright : Fix for kernel stacks panic .
* Erich Boleyn : MP v1 .4 and additional changes .
* Matthias Sattler : Changes for 2.1 kernel map .
* Michel Lespinasse : Changes for 2.1 kernel map .
* Michael Chastain : Change trampoline . S to gnu as .
* Alan Cox : Dumb bug : ' B ' step PPro ' s are fine
* Ingo Molnar : Added APIC timers , based on code
* from Jose Renau
* Ingo Molnar : various cleanups and rewrites
* Tigran Aivazian : fixed " 0.00 in /proc/uptime on SMP " bug .
* Maciej W . Rozycki : Bits for genuine 82489 DX APICs
* Martin J . Bligh : Added support for multi - quad systems
* Dave Jones : Report invalid combinations of Athlon CPUs .
* Rusty Russell : Hacked into shape for new " hotplug " boot process . */
# include <linux/module.h>
# include <linux/config.h>
# include <linux/init.h>
# include <linux/kernel.h>
# include <linux/mm.h>
# include <linux/sched.h>
# include <linux/kernel_stat.h>
# include <linux/smp_lock.h>
# include <linux/bootmem.h>
2005-06-25 14:54:50 -07:00
# include <linux/notifier.h>
# include <linux/cpu.h>
# include <linux/percpu.h>
2005-04-16 15:20:36 -07:00
# include <linux/delay.h>
# include <linux/mc146818rtc.h>
# include <asm/tlbflush.h>
# include <asm/desc.h>
# include <asm/arch_hooks.h>
2006-06-26 13:57:01 +02:00
# include <asm/nmi.h>
2005-04-16 15:20:36 -07:00
# include <mach_apic.h>
# include <mach_wakecpu.h>
# include <smpboot_hooks.h>
/* Set if we find a B stepping CPU */
2005-06-25 14:54:55 -07:00
static int __devinitdata smp_b_stepping ;
2005-04-16 15:20:36 -07:00
/* Number of siblings per CPU package */
int smp_num_siblings = 1 ;
2005-06-23 00:08:33 -07:00
# ifdef CONFIG_X86_HT
EXPORT_SYMBOL ( smp_num_siblings ) ;
# endif
2005-06-25 14:54:54 -07:00
/* Package ID of each logical CPU */
2005-07-07 17:56:59 -07:00
int phys_proc_id [ NR_CPUS ] __read_mostly = { [ 0 . . . NR_CPUS - 1 ] = BAD_APICID } ;
2005-06-25 14:54:54 -07:00
/* Core ID of each logical CPU */
2005-07-07 17:56:59 -07:00
int cpu_core_id [ NR_CPUS ] __read_mostly = { [ 0 . . . NR_CPUS - 1 ] = BAD_APICID } ;
2005-04-16 15:20:36 -07:00
2006-03-27 01:15:22 -08:00
/* Last level cache ID of each logical CPU */
int cpu_llc_id [ NR_CPUS ] __cpuinitdata = { [ 0 . . . NR_CPUS - 1 ] = BAD_APICID } ;
2005-11-05 17:25:54 +01:00
/* representing HT siblings of each logical CPU */
2005-07-07 17:56:59 -07:00
cpumask_t cpu_sibling_map [ NR_CPUS ] __read_mostly ;
2005-06-25 14:54:54 -07:00
EXPORT_SYMBOL ( cpu_sibling_map ) ;
2005-11-05 17:25:54 +01:00
/* representing HT and core siblings of each logical CPU */
2005-07-07 17:56:59 -07:00
cpumask_t cpu_core_map [ NR_CPUS ] __read_mostly ;
2005-06-25 14:54:54 -07:00
EXPORT_SYMBOL ( cpu_core_map ) ;
2005-04-16 15:20:36 -07:00
/* bitmap of online cpus */
2005-07-07 17:56:59 -07:00
cpumask_t cpu_online_map __read_mostly ;
2005-06-23 00:08:33 -07:00
EXPORT_SYMBOL ( cpu_online_map ) ;
2005-04-16 15:20:36 -07:00
/*
 * Boot handshake masks.
 *
 * cpu_callin_map:  an AP sets its own bit at the end of smp_callin();
 *                  do_boot_cpu() polls that bit to learn the AP is alive.
 * cpu_callout_map: do_boot_cpu() sets the AP's bit once the wakeup IPIs
 *                  were sent without error; smp_callin() polls it before
 *                  touching the local APIC.
 */
cpumask_t cpu_callin_map ;
cpumask_t cpu_callout_map ;
2005-06-23 00:08:33 -07:00
EXPORT_SYMBOL ( cpu_callout_map ) ;
2005-09-03 15:56:51 -07:00
/*
 * NOTE(review): not referenced in the visible part of this file;
 * presumably the set of CPUs that may ever be brought up - confirm.
 */
cpumask_t cpu_possible_map ;
EXPORT_SYMBOL ( cpu_possible_map ) ;
2005-04-16 15:20:36 -07:00
/*
 * start_secondary() spins until its own bit shows up here before it
 * finishes coming online; cpu_exit_clear() clears the bit again.
 */
static cpumask_t smp_commenced_mask ;
2005-06-25 14:54:56 -07:00
/* TSC's upper 32 bits can't be written on earlier CPUs (before Prescott), so
 * there is no way to resync one AP against the BP. TBD: for Prescott and
 * above, we should use IA64's algorithm.
 */
static int __devinitdata tsc_sync_disabled ;
2005-04-16 15:20:36 -07:00
/* Per CPU bogomips and other parameters */
struct cpuinfo_x86 cpu_data [ NR_CPUS ] __cacheline_aligned ;
2005-06-23 00:08:33 -07:00
EXPORT_SYMBOL ( cpu_data ) ;
2005-04-16 15:20:36 -07:00
2005-07-07 17:56:59 -07:00
u8 x86_cpu_to_apicid [ NR_CPUS ] __read_mostly =
2005-04-16 15:20:36 -07:00
{ [ 0 . . . NR_CPUS - 1 ] = 0xff } ;
EXPORT_SYMBOL ( x86_cpu_to_apicid ) ;
/*
* Trampoline 80 x86 program as an array .
*/
extern unsigned char trampoline_data [ ] ;
extern unsigned char trampoline_end [ ] ;
static unsigned char * trampoline_base ;
static int trampoline_exec ;
static void map_cpu_to_logical_apicid ( void ) ;
2005-06-25 14:54:50 -07:00
/* State of each CPU. */
DEFINE_PER_CPU ( int , cpu_state ) = { 0 } ;
2005-04-16 15:20:36 -07:00
/*
* Currently trivial . Write the real - > protected mode
* bootstrap into the page concerned . The caller
* has made sure it ' s suitably aligned .
*/
2005-06-25 14:54:55 -07:00
static unsigned long __devinit setup_trampoline ( void )
2005-04-16 15:20:36 -07:00
{
memcpy ( trampoline_base , trampoline_data , trampoline_end - trampoline_data ) ;
return virt_to_phys ( trampoline_base ) ;
}
/*
 * Called very early during boot to reserve the low-memory page that
 * holds the SMP bootup trampoline.
 */
void __init smp_alloc_memory(void)
{
	unsigned long trampoline_vaddr;

	trampoline_base = (void *) alloc_bootmem_low_pages(PAGE_SIZE);

	/*
	 * The APs run this page in real mode, so it has to live in very
	 * low memory.
	 */
	if (__pa(trampoline_base) >= 0x9F000)
		BUG();

	/* Make the SMP trampoline executable and remember the old state. */
	trampoline_vaddr = (unsigned long) trampoline_base;
	trampoline_exec = set_kernel_exec(trampoline_vaddr, 1);
}
/*
* The bootstrap kernel entry code has set these up . Save them for
* a given CPU
*/
2005-06-25 14:54:55 -07:00
static void __devinit smp_store_cpu_info ( int id )
2005-04-16 15:20:36 -07:00
{
struct cpuinfo_x86 * c = cpu_data + id ;
* c = boot_cpu_data ;
if ( id ! = 0 )
identify_cpu ( c ) ;
/*
* Mask B , Pentium , but not Pentium MMX
*/
if ( c - > x86_vendor = = X86_VENDOR_INTEL & &
c - > x86 = = 5 & &
c - > x86_mask > = 1 & & c - > x86_mask < = 4 & &
c - > x86_model < = 3 )
/*
* Remember we have B step Pentia with bugs
*/
smp_b_stepping = 1 ;
/*
* Certain Athlons might work ( for various values of ' work ' ) in SMP
* but they are not certified as MP capable .
*/
if ( ( c - > x86_vendor = = X86_VENDOR_AMD ) & & ( c - > x86 = = 6 ) ) {
/* Athlon 660/661 is valid. */
if ( ( c - > x86_model = = 6 ) & & ( ( c - > x86_mask = = 0 ) | | ( c - > x86_mask = = 1 ) ) )
goto valid_k7 ;
/* Duron 670 is valid */
if ( ( c - > x86_model = = 7 ) & & ( c - > x86_mask = = 0 ) )
goto valid_k7 ;
/*
* Athlon 662 , Duron 671 , and Athlon > model 7 have capability bit .
* It ' s worth noting that the A5 stepping ( 662 ) of some Athlon XP ' s
* have the MP bit set .
* See http : //www.heise.de/newsticker/data/jow-18.10.01-000 for more.
*/
if ( ( ( c - > x86_model = = 6 ) & & ( c - > x86_mask > = 2 ) ) | |
( ( c - > x86_model = = 7 ) & & ( c - > x86_mask > = 1 ) ) | |
( c - > x86_model > 7 ) )
if ( cpu_has_mp )
goto valid_k7 ;
/* If we get here, it's not a certified SMP capable AMD system. */
2005-09-13 01:25:16 -07:00
add_taint ( TAINT_UNSAFE_SMP ) ;
2005-04-16 15:20:36 -07:00
}
valid_k7 :
;
}
/*
* TSC synchronization .
*
* We first check whether all CPUs have their TSC ' s synchronized ,
* then we print a warning if not , and always resync .
*/
/* Set to 1 by synchronize_tsc_bp(); APs spin on it in synchronize_tsc_ap(). */
static atomic_t tsc_start_flag = ATOMIC_INIT ( 0 ) ;
/*
 * Entry rendezvous counter for one pass: every AP increments it, the BP
 * increments it last to release them, and the BP resets it to 0 at the
 * end of the pass.
 */
static atomic_t tsc_count_start = ATOMIC_INIT ( 0 ) ;
/* Exit rendezvous counter for one pass, used the same way as the above. */
static atomic_t tsc_count_stop = ATOMIC_INIT ( 0 ) ;
/* TSC value each CPU read during a pass, indexed by smp_processor_id(). */
static unsigned long long tsc_values [ NR_CPUS ] ;
/* Number of rendezvous passes; the TSC is zeroed only on the last one. */
# define NR_LOOPS 5
/*
 * Boot-processor side of the TSC rendezvous.
 *
 * Releases the APs waiting in synchronize_tsc_ap(), runs NR_LOOPS
 * lock-step passes with them (each CPU samples its TSC in every pass and
 * writes it to zero in the last one), then compares the sampled values
 * against their average and reports CPUs that were off by more than two
 * microseconds.
 */
static void __init synchronize_tsc_bp(void)
{
	int i;
	unsigned long long sum, avg;
	long long delta;
	unsigned int one_usec;
	int buggy = 0;

	printk(KERN_INFO " checking TSC synchronization across %u CPUs: ", num_booting_cpus());

	/* convert from kcyc/sec to cyc/usec */
	one_usec = cpu_khz / 1000;

	atomic_set(&tsc_start_flag, 1);
	wmb();

	/*
	 * We loop a few times to get a primed instruction cache,
	 * then the last pass is more or less synchronized and
	 * the BP and APs set their cycle counters to zero all at
	 * once. This reduces the chance of having random offsets
	 * between the processors, and guarantees that the maximum
	 * delay between the cycle counters is never bigger than
	 * the latency of information-passing (cachelines) between
	 * two CPUs.
	 */
	for (i = 0; i < NR_LOOPS; i++) {
		/*
		 * all APs synchronize but they loop on '== num_cpus'
		 */
		while (atomic_read(&tsc_count_start) != num_booting_cpus() - 1)
			cpu_relax();
		atomic_set(&tsc_count_stop, 0);
		wmb();
		/*
		 * this lets the APs save their current TSC:
		 */
		atomic_inc(&tsc_count_start);

		rdtscll(tsc_values[smp_processor_id()]);
		/*
		 * We clear the TSC in the last loop:
		 */
		if (i == NR_LOOPS - 1)
			write_tsc(0, 0);

		/*
		 * Wait for all APs to leave the synchronization point:
		 */
		while (atomic_read(&tsc_count_stop) != num_booting_cpus() - 1)
			cpu_relax();
		atomic_set(&tsc_count_start, 0);
		wmb();
		atomic_inc(&tsc_count_stop);
	}

	/* Average the samples of every CPU that was called out. */
	sum = 0;
	for (i = 0; i < NR_CPUS; i++) {
		if (cpu_isset(i, cpu_callout_map))
			sum += tsc_values[i];
	}
	avg = sum;
	do_div(avg, num_booting_cpus());

	for (i = 0; i < NR_CPUS; i++) {
		if (!cpu_isset(i, cpu_callout_map))
			continue;
		delta = tsc_values[i] - avg;
		if (delta < 0)
			delta = -delta;
		/*
		 * We report bigger than 2 microseconds clock differences.
		 */
		if (delta > 2 * one_usec) {
			unsigned long long usecs;
			long realdelta;

			if (!buggy) {
				buggy = 1;
				printk(" \n ");
			}
			/*
			 * do_div() needs a 64-bit unsigned dividend, so
			 * divide in a 64-bit temporary and narrow only
			 * the quotient.
			 */
			usecs = delta;
			do_div(usecs, one_usec);
			realdelta = usecs;
			if (tsc_values[i] < avg)
				realdelta = -realdelta;

			if (realdelta > 0)
				printk(KERN_INFO " CPU#%d had %ld usecs TSC "
					" skew, fixed it up. \n ", i, realdelta);
		}
	}
	if (!buggy)
		printk(" passed. \n ");
}
/*
 * Application-processor side of the TSC rendezvous driven by
 * synchronize_tsc_bp(): take part in NR_LOOPS lock-step passes, sampling
 * our TSC in each and writing it to zero in the final one.
 */
static void __init synchronize_tsc_ap(void)
{
	int pass;

	/*
	 * Not every cpu is online at the time this gets called, so we
	 * first wait for the BP to finish SMP initialization:
	 */
	while (!atomic_read(&tsc_start_flag))
		cpu_relax();

	for (pass = 0; pass < NR_LOOPS; pass++) {
		/* Check in, then wait until the BP has checked in as well. */
		atomic_inc(&tsc_count_start);
		while (atomic_read(&tsc_count_start) != num_booting_cpus())
			cpu_relax();

		rdtscll(tsc_values[smp_processor_id()]);
		if (pass == NR_LOOPS - 1)
			write_tsc(0, 0);

		/* Same handshake on the way out of the pass. */
		atomic_inc(&tsc_count_stop);
		while (atomic_read(&tsc_count_stop) != num_booting_cpus())
			cpu_relax();
	}
}
# undef NR_LOOPS
/* Defined outside this file; smp_callin() calls it to get the AP's bogomips. */
extern void calibrate_delay ( void ) ;
/*
 * INIT de-assert handshake: do_boot_cpu() resets it to 0 before waking an
 * AP, the INIT flavour of wakeup_secondary_cpu() sets it to 1 after the
 * de-assert IPI was sent, and smp_callin() hands it to
 * wait_for_init_deassert() before touching the local APIC.
 */
static atomic_t init_deasserted ;
2005-06-25 14:54:55 -07:00
/*
 * Run by a freshly started AP: wait for the boot CPU's callout, set up
 * the local APIC, calibrate the delay loop, store this CPU's parameters
 * and finally report in through cpu_callin_map.  If TSC synchronization
 * is enabled the AP then joins the rendezvous in synchronize_tsc_ap().
 *
 * BUG()s if this CPU already called in, or if no callout arrives within
 * the two second window.
 */
static void __devinit smp_callin(void)
{
	int cpuid, phys_id;
	unsigned long timeout;

	/*
	 * If waken up by an INIT in an 82489DX configuration
	 * we may get here before an INIT-deassert IPI reaches
	 * our local APIC.  We have to wait for the IPI or we'll
	 * lock up on an APIC access.
	 */
	wait_for_init_deassert(&init_deasserted);

	/*
	 * (This works even if the APIC is not enabled.)
	 */
	phys_id = GET_APIC_ID(apic_read(APIC_ID));
	cpuid = smp_processor_id();
	if (cpu_isset(cpuid, cpu_callin_map)) {
		printk(" huh, phys CPU#%d, CPU#%d already present?? \n ",
			phys_id, cpuid);
		BUG();
	}
	Dprintk(" CPU#%d (phys ID: %d) waiting for CALLOUT \n ", cpuid, phys_id);

	/*
	 * STARTUP IPIs are fragile beasts as they might sometimes
	 * trigger some glue motherboard logic. Complete APIC bus
	 * silence for 1 second, this overestimates the time the
	 * boot CPU is spending to send the up to 2 STARTUP IPIs
	 * by a factor of two. This should be enough.
	 */

	/*
	 * Waiting 2s total for startup (udelay is not yet working)
	 */
	timeout = jiffies + 2 * HZ;
	while (time_before(jiffies, timeout)) {
		/*
		 * Has the boot CPU finished its STARTUP sequence?
		 */
		if (cpu_isset(cpuid, cpu_callout_map))
			break;
		rep_nop();
	}

	/*
	 * Decide on the callout bit itself rather than re-reading the
	 * clock: jiffies may advance past the deadline between a
	 * successful break above and this check, which would turn a
	 * good callout into a spurious BUG().
	 */
	if (!cpu_isset(cpuid, cpu_callout_map)) {
		printk(" BUG: CPU%d started up but did not get a callout! \n ",
			cpuid);
		BUG();
	}

	/*
	 * the boot CPU has finished the init stage and is spinning
	 * on callin_map until we finish. We are free to set up this
	 * CPU, first the APIC. (this is probably redundant on most
	 * boards)
	 */
	Dprintk(" CALLIN, before setup_local_APIC(). \n ");
	smp_callin_clear_local_apic();
	setup_local_APIC();
	map_cpu_to_logical_apicid();

	/*
	 * Get our bogomips.
	 */
	calibrate_delay();
	Dprintk(" Stack at about %p \n ", &cpuid);

	/*
	 * Save our processor parameters
	 */
	smp_store_cpu_info(cpuid);

	disable_APIC_timer();

	/*
	 * Allow the master to continue.
	 */
	cpu_set(cpuid, cpu_callin_map);

	/*
	 * Synchronize the TSC with the BP
	 */
	if (cpu_has_tsc && cpu_khz && !tsc_sync_disabled)
		synchronize_tsc_ap();
}
/*
 * Incremented at the top of do_boot_cpu(); decremented again when that
 * boot attempt fails and by cpu_exit_clear().
 */
static int cpucount ;
2006-03-27 01:15:22 -08:00
/*
 * Maps the cpu to the sched domain representing multi-core.
 *
 * For perf, we return the last level cache shared map.
 * TBD: when a power saving sched policy is added, we will return
 * cpu_core_map when the power saving policy is enabled.
 */
cpumask_t cpu_coregroup_map(int cpu)
{
	return cpu_data[cpu].llc_shared_map;
}
2005-11-05 17:25:54 +01:00
/* representing cpus for which sibling maps can be computed */
static cpumask_t cpu_sibling_setup_map ;
2005-06-25 14:54:54 -07:00
static inline void
set_cpu_sibling_map ( int cpu )
{
int i ;
2005-11-05 17:25:54 +01:00
struct cpuinfo_x86 * c = cpu_data ;
cpu_set ( cpu , cpu_sibling_setup_map ) ;
2005-06-25 14:54:54 -07:00
if ( smp_num_siblings > 1 ) {
2005-11-05 17:25:54 +01:00
for_each_cpu_mask ( i , cpu_sibling_setup_map ) {
if ( phys_proc_id [ cpu ] = = phys_proc_id [ i ] & &
cpu_core_id [ cpu ] = = cpu_core_id [ i ] ) {
2005-06-25 14:54:54 -07:00
cpu_set ( i , cpu_sibling_map [ cpu ] ) ;
cpu_set ( cpu , cpu_sibling_map [ i ] ) ;
2005-11-05 17:25:54 +01:00
cpu_set ( i , cpu_core_map [ cpu ] ) ;
cpu_set ( cpu , cpu_core_map [ i ] ) ;
2006-03-27 01:15:22 -08:00
cpu_set ( i , c [ cpu ] . llc_shared_map ) ;
cpu_set ( cpu , c [ i ] . llc_shared_map ) ;
2005-06-25 14:54:54 -07:00
}
}
} else {
cpu_set ( cpu , cpu_sibling_map [ cpu ] ) ;
}
2006-03-27 01:15:22 -08:00
cpu_set ( cpu , c [ cpu ] . llc_shared_map ) ;
2005-11-05 17:25:54 +01:00
if ( current_cpu_data . x86_max_cores = = 1 ) {
2005-06-25 14:54:54 -07:00
cpu_core_map [ cpu ] = cpu_sibling_map [ cpu ] ;
2005-11-05 17:25:54 +01:00
c [ cpu ] . booted_cores = 1 ;
return ;
}
for_each_cpu_mask ( i , cpu_sibling_setup_map ) {
2006-03-27 01:15:22 -08:00
if ( cpu_llc_id [ cpu ] ! = BAD_APICID & &
cpu_llc_id [ cpu ] = = cpu_llc_id [ i ] ) {
cpu_set ( i , c [ cpu ] . llc_shared_map ) ;
cpu_set ( cpu , c [ i ] . llc_shared_map ) ;
}
2005-11-05 17:25:54 +01:00
if ( phys_proc_id [ cpu ] = = phys_proc_id [ i ] ) {
cpu_set ( i , cpu_core_map [ cpu ] ) ;
cpu_set ( cpu , cpu_core_map [ i ] ) ;
/*
* Does this new cpu bringup a new core ?
*/
if ( cpus_weight ( cpu_sibling_map [ cpu ] ) = = 1 ) {
/*
* for each core in package , increment
* the booted_cores for this new cpu
*/
if ( first_cpu ( cpu_sibling_map [ i ] ) = = i )
c [ cpu ] . booted_cores + + ;
/*
* increment the core count for all
* the other cpus in this package
*/
if ( i ! = cpu )
c [ i ] . booted_cores + + ;
} else if ( i ! = cpu & & ! c [ cpu ] . booted_cores )
c [ cpu ] . booted_cores = c [ i ] . booted_cores ;
}
2005-06-25 14:54:54 -07:00
}
}
2005-04-16 15:20:36 -07:00
/*
* Activate a secondary processor .
*/
2005-06-25 14:54:55 -07:00
static void __devinit start_secondary ( void * unused )
2005-04-16 15:20:36 -07:00
{
/*
* Dont put anything before smp_callin ( ) , SMP
* booting is too fragile that we want to limit the
* things done here to the most necessary things .
*/
cpu_init ( ) ;
2005-11-08 21:39:01 -08:00
preempt_disable ( ) ;
2005-04-16 15:20:36 -07:00
smp_callin ( ) ;
while ( ! cpu_isset ( smp_processor_id ( ) , smp_commenced_mask ) )
rep_nop ( ) ;
setup_secondary_APIC_clock ( ) ;
if ( nmi_watchdog = = NMI_IO_APIC ) {
disable_8259A_irq ( 0 ) ;
enable_NMI_through_LVT0 ( NULL ) ;
enable_8259A_irq ( 0 ) ;
}
enable_APIC_timer ( ) ;
/*
* low - memory mappings have been cleared , flush them from
* the local TLBs too .
*/
local_flush_tlb ( ) ;
2005-06-25 14:54:53 -07:00
2005-06-25 14:54:54 -07:00
/* This must be done before setting cpu_online_map */
set_cpu_sibling_map ( raw_smp_processor_id ( ) ) ;
wmb ( ) ;
2005-06-25 14:54:53 -07:00
/*
* We need to hold call_lock , so there is no inconsistency
* between the time smp_call_function ( ) determines number of
* IPI receipients , and the time when the determination is made
* for which cpus receive the IPI . Holding this
* lock helps us to not include this cpu in a currently in progress
* smp_call_function ( ) .
*/
lock_ipi_call_lock ( ) ;
2005-04-16 15:20:36 -07:00
cpu_set ( smp_processor_id ( ) , cpu_online_map ) ;
2005-06-25 14:54:53 -07:00
unlock_ipi_call_lock ( ) ;
2005-06-25 14:54:56 -07:00
per_cpu ( cpu_state , smp_processor_id ( ) ) = CPU_ONLINE ;
2005-04-16 15:20:36 -07:00
/* We can take interrupts now: we're officially "up". */
local_irq_enable ( ) ;
wmb ( ) ;
cpu_idle ( ) ;
}
/*
* Everything has been set up for the secondary
* CPUs - they just need to reload everything
* from the task structure
* This function must not return .
*/
2005-06-25 14:54:55 -07:00
void __devinit initialize_secondary ( void )
2005-04-16 15:20:36 -07:00
{
/*
* We don ' t actually need to load the full TSS ,
* basically just the stack pointer and the eip .
*/
asm volatile (
" movl %0,%%esp \n \t "
" jmp *%1 "
:
: " r " ( current - > thread . esp ) , " r " ( current - > thread . eip ) ) ;
}
/*
 * Initial stack descriptor, defined outside this file.  do_boot_cpu()
 * points .esp at the new idle task's saved stack pointer before waking
 * an AP.
 * NOTE(review): presumably read by the early startup code of the AP -
 * confirm against the assembly entry path.
 */
extern struct {
void * esp ;
unsigned short ss ;
} stack_start ;
# ifdef CONFIG_NUMA
/* which logical CPUs are on which nodes */
2005-07-07 17:56:59 -07:00
cpumask_t node_2_cpu_mask [ MAX_NUMNODES ] __read_mostly =
2005-04-16 15:20:36 -07:00
{ [ 0 . . . MAX_NUMNODES - 1 ] = CPU_MASK_NONE } ;
/* which node each logical CPU is on */
2005-07-07 17:56:59 -07:00
int cpu_2_node [ NR_CPUS ] __read_mostly = { [ 0 . . . NR_CPUS - 1 ] = 0 } ;
2005-04-16 15:20:36 -07:00
EXPORT_SYMBOL ( cpu_2_node ) ;
/*
 * Record that 'cpu' lives on 'node', updating both the node->cpus mask
 * and the cpu->node table.
 */
static inline void map_cpu_to_node(int cpu, int node)
{
	printk(" Mapping cpu %d to node %d \n ", cpu, node);

	cpu_2_node[cpu] = node;
	cpu_set(cpu, node_2_cpu_mask[node]);
}
/*
 * Undo a mapping between cpu and node: drop 'cpu' from every node's mask
 * and reset its cpu->node entry to node 0.
 */
static inline void unmap_cpu_to_node(int cpu)
{
	int nid = 0;

	printk(" Unmapping cpu %d from all nodes \n ", cpu);

	while (nid < MAX_NUMNODES) {
		cpu_clear(cpu, node_2_cpu_mask[nid]);
		nid++;
	}
	cpu_2_node[cpu] = 0;
}
# else /* !CONFIG_NUMA */
# define map_cpu_to_node(cpu, node) ({})
# define unmap_cpu_to_node(cpu) ({})
# endif /* CONFIG_NUMA */
2005-07-07 17:56:59 -07:00
u8 cpu_2_logical_apicid [ NR_CPUS ] __read_mostly = { [ 0 . . . NR_CPUS - 1 ] = BAD_APICID } ;
2005-04-16 15:20:36 -07:00
/*
 * Record the running CPU's logical APIC ID in cpu_2_logical_apicid[] and
 * map the CPU to the node that APIC ID belongs to.
 */
static void map_cpu_to_logical_apicid(void)
{
	int this_cpu = smp_processor_id();
	int logical_id = logical_smp_processor_id();

	cpu_2_logical_apicid[this_cpu] = logical_id;
	map_cpu_to_node(this_cpu, apicid_to_node(logical_id));
}
/*
 * Forget the logical APIC ID of 'cpu' and remove its node mapping.
 */
static void unmap_cpu_to_logical_apicid(int cpu)
{
	/* BAD_APICID marks the slot as unused again. */
	cpu_2_logical_apicid[cpu] = BAD_APICID;

	unmap_cpu_to_node(cpu);
}
# if APIC_DEBUG
static inline void __inquire_remote_apic ( int apicid )
{
int i , regs [ ] = { APIC_ID > > 4 , APIC_LVR > > 4 , APIC_SPIV > > 4 } ;
char * names [ ] = { " ID " , " VERSION " , " SPIV " } ;
int timeout , status ;
printk ( " Inquiring remote APIC #%d... \n " , apicid ) ;
2005-11-07 00:58:31 -08:00
for ( i = 0 ; i < ARRAY_SIZE ( regs ) ; i + + ) {
2005-04-16 15:20:36 -07:00
printk ( " ... APIC #%d %s: " , apicid , names [ i ] ) ;
/*
* Wait for idle .
*/
apic_wait_icr_idle ( ) ;
apic_write_around ( APIC_ICR2 , SET_APIC_DEST_FIELD ( apicid ) ) ;
apic_write_around ( APIC_ICR , APIC_DM_REMRD | regs [ i ] ) ;
timeout = 0 ;
do {
udelay ( 100 ) ;
status = apic_read ( APIC_ICR ) & APIC_ICR_RR_MASK ;
} while ( status = = APIC_ICR_RR_INPROG & & timeout + + < 1000 ) ;
switch ( status ) {
case APIC_ICR_RR_VALID :
status = apic_read ( APIC_RRR ) ;
printk ( " %08x \n " , status ) ;
break ;
default :
printk ( " failed \n " ) ;
}
}
}
# endif
# ifdef WAKE_SECONDARY_VIA_NMI
/*
* Poke the other CPU in the eye via NMI to wake it up . Remember that the normal
* INIT , INIT , STARTUP sequence will reset the chip hard for us , and this
* won ' t . . . remember to clear down the APIC , etc later .
*/
2005-06-25 14:54:55 -07:00
static int __devinit
2005-04-16 15:20:36 -07:00
wakeup_secondary_cpu ( int logical_apicid , unsigned long start_eip )
{
unsigned long send_status = 0 , accept_status = 0 ;
int timeout , maxlvt ;
/* Target chip */
apic_write_around ( APIC_ICR2 , SET_APIC_DEST_FIELD ( logical_apicid ) ) ;
/* Boot on the stack */
/* Kick the second */
apic_write_around ( APIC_ICR , APIC_DM_NMI | APIC_DEST_LOGICAL ) ;
Dprintk ( " Waiting for send to finish... \n " ) ;
timeout = 0 ;
do {
Dprintk ( " + " ) ;
udelay ( 100 ) ;
send_status = apic_read ( APIC_ICR ) & APIC_ICR_BUSY ;
} while ( send_status & & ( timeout + + < 1000 ) ) ;
/*
* Give the other CPU some time to accept the IPI .
*/
udelay ( 200 ) ;
/*
* Due to the Pentium erratum 3 AP .
*/
maxlvt = get_maxlvt ( ) ;
if ( maxlvt > 3 ) {
apic_read_around ( APIC_SPIV ) ;
apic_write ( APIC_ESR , 0 ) ;
}
accept_status = ( apic_read ( APIC_ESR ) & 0xEF ) ;
Dprintk ( " NMI sent. \n " ) ;
if ( send_status )
printk ( " APIC never delivered??? \n " ) ;
if ( accept_status )
printk ( " APIC delivery error (%lx). \n " , accept_status ) ;
return ( send_status | accept_status ) ;
}
# endif /* WAKE_SECONDARY_VIA_NMI */
# ifdef WAKE_SECONDARY_VIA_INIT
2005-06-25 14:54:55 -07:00
static int __devinit
2005-04-16 15:20:36 -07:00
wakeup_secondary_cpu ( int phys_apicid , unsigned long start_eip )
{
unsigned long send_status = 0 , accept_status = 0 ;
int maxlvt , timeout , num_starts , j ;
/*
* Be paranoid about clearing APIC errors .
*/
if ( APIC_INTEGRATED ( apic_version [ phys_apicid ] ) ) {
apic_read_around ( APIC_SPIV ) ;
apic_write ( APIC_ESR , 0 ) ;
apic_read ( APIC_ESR ) ;
}
Dprintk ( " Asserting INIT. \n " ) ;
/*
* Turn INIT on target chip
*/
apic_write_around ( APIC_ICR2 , SET_APIC_DEST_FIELD ( phys_apicid ) ) ;
/*
* Send IPI
*/
apic_write_around ( APIC_ICR , APIC_INT_LEVELTRIG | APIC_INT_ASSERT
| APIC_DM_INIT ) ;
Dprintk ( " Waiting for send to finish... \n " ) ;
timeout = 0 ;
do {
Dprintk ( " + " ) ;
udelay ( 100 ) ;
send_status = apic_read ( APIC_ICR ) & APIC_ICR_BUSY ;
} while ( send_status & & ( timeout + + < 1000 ) ) ;
mdelay ( 10 ) ;
Dprintk ( " Deasserting INIT. \n " ) ;
/* Target chip */
apic_write_around ( APIC_ICR2 , SET_APIC_DEST_FIELD ( phys_apicid ) ) ;
/* Send IPI */
apic_write_around ( APIC_ICR , APIC_INT_LEVELTRIG | APIC_DM_INIT ) ;
Dprintk ( " Waiting for send to finish... \n " ) ;
timeout = 0 ;
do {
Dprintk ( " + " ) ;
udelay ( 100 ) ;
send_status = apic_read ( APIC_ICR ) & APIC_ICR_BUSY ;
} while ( send_status & & ( timeout + + < 1000 ) ) ;
atomic_set ( & init_deasserted , 1 ) ;
/*
* Should we send STARTUP IPIs ?
*
* Determine this based on the APIC version .
* If we don ' t have an integrated APIC , don ' t send the STARTUP IPIs .
*/
if ( APIC_INTEGRATED ( apic_version [ phys_apicid ] ) )
num_starts = 2 ;
else
num_starts = 0 ;
/*
* Run STARTUP IPI loop .
*/
Dprintk ( " #startup loops: %d. \n " , num_starts ) ;
maxlvt = get_maxlvt ( ) ;
for ( j = 1 ; j < = num_starts ; j + + ) {
Dprintk ( " Sending STARTUP #%d. \n " , j ) ;
apic_read_around ( APIC_SPIV ) ;
apic_write ( APIC_ESR , 0 ) ;
apic_read ( APIC_ESR ) ;
Dprintk ( " After apic_write. \n " ) ;
/*
* STARTUP IPI
*/
/* Target chip */
apic_write_around ( APIC_ICR2 , SET_APIC_DEST_FIELD ( phys_apicid ) ) ;
/* Boot on the stack */
/* Kick the second */
apic_write_around ( APIC_ICR , APIC_DM_STARTUP
| ( start_eip > > 12 ) ) ;
/*
* Give the other CPU some time to accept the IPI .
*/
udelay ( 300 ) ;
Dprintk ( " Startup point 1. \n " ) ;
Dprintk ( " Waiting for send to finish... \n " ) ;
timeout = 0 ;
do {
Dprintk ( " + " ) ;
udelay ( 100 ) ;
send_status = apic_read ( APIC_ICR ) & APIC_ICR_BUSY ;
} while ( send_status & & ( timeout + + < 1000 ) ) ;
/*
* Give the other CPU some time to accept the IPI .
*/
udelay ( 200 ) ;
/*
* Due to the Pentium erratum 3 AP .
*/
if ( maxlvt > 3 ) {
apic_read_around ( APIC_SPIV ) ;
apic_write ( APIC_ESR , 0 ) ;
}
accept_status = ( apic_read ( APIC_ESR ) & 0xEF ) ;
if ( send_status | | accept_status )
break ;
}
Dprintk ( " After Startup. \n " ) ;
if ( send_status )
printk ( " APIC never delivered??? \n " ) ;
if ( accept_status )
printk ( " APIC delivery error (%lx). \n " , accept_status ) ;
return ( send_status | accept_status ) ;
}
# endif /* WAKE_SECONDARY_VIA_INIT */
extern cpumask_t cpu_initialized ;
2005-06-25 14:54:56 -07:00
/*
 * Pick the lowest-numbered CPU id that is not set in cpu_present_map.
 * Returns that id, or -ENODEV when every id below NR_CPUS is taken.
 */
static inline int alloc_cpu_id(void)
{
	cpumask_t absent;
	int candidate;

	cpus_complement(absent, cpu_present_map);
	candidate = first_cpu(absent);

	return (candidate >= NR_CPUS) ? -ENODEV : candidate;
}
# ifdef CONFIG_HOTPLUG_CPU
static struct task_struct * __devinitdata cpu_idle_tasks [ NR_CPUS ] ;
static inline struct task_struct * alloc_idle_task ( int cpu )
{
struct task_struct * idle ;
if ( ( idle = cpu_idle_tasks [ cpu ] ) ! = NULL ) {
/* initialize thread_struct. we really want to avoid destroy
* idle tread
*/
2006-01-12 01:05:41 -08:00
idle - > thread . esp = ( unsigned long ) task_pt_regs ( idle ) ;
2005-06-25 14:54:56 -07:00
init_idle ( idle , cpu ) ;
return idle ;
}
idle = fork_idle ( cpu ) ;
if ( ! IS_ERR ( idle ) )
cpu_idle_tasks [ cpu ] = idle ;
return idle ;
}
# else
# define alloc_idle_task(cpu) fork_idle(cpu)
# endif
2005-04-16 15:20:36 -07:00
2005-06-25 14:54:56 -07:00
/*
 * Wake one secondary processor and wait for it to call in.
 *
 * NOTE - on most systems 'apicid' is a PHYSICAL apic ID, but on multiquad
 * (ie clustered apic addressing mode), this is a LOGICAL apic ID.
 *
 * Returns zero if the CPU booted OK; otherwise the error code from
 * wakeup_secondary_cpu(), or 1 when the IPIs went out but the CPU never
 * set its bit in cpu_callin_map.
 */
static int __devinit do_boot_cpu(int apicid, int cpu)
{
	struct task_struct *idle;
	unsigned long boot_error;
	unsigned long start_eip;
	unsigned short nmi_high = 0, nmi_low = 0;
	int waited;

	++cpucount;
	alternatives_smp_switch(1);

	/*
	 * We can't use kernel_thread since we must avoid to
	 * reschedule the child.
	 */
	idle = alloc_idle_task(cpu);
	if (IS_ERR(idle))
		panic(" failed fork for CPU %d ", cpu);
	idle->thread.eip = (unsigned long) start_secondary;

	/* start_eip had better be page-aligned! */
	start_eip = setup_trampoline();

	/* So we see what's up */
	printk(" Booting processor %d/%d eip %lx \n ", cpu, apicid, start_eip);

	/* Stack for startup_32 can be just as for start_secondary onwards */
	stack_start.esp = (void *) idle->thread.esp;

	irq_ctx_init(cpu);

	/*
	 * This grunge runs the startup process for
	 * the targeted processor.
	 */
	atomic_set(&init_deasserted, 0);

	Dprintk(" Setting warm reset code and vector. \n ");

	store_NMI_vector(&nmi_high, &nmi_low);

	smpboot_setup_warm_reset_vector(start_eip);

	/*
	 * Starting actual IPI sequence...
	 */
	boot_error = wakeup_secondary_cpu(apicid, start_eip);

	if (!boot_error) {
		/*
		 * allow APs to start initializing.
		 */
		Dprintk(" Before Callout %d. \n ", cpu);
		cpu_set(cpu, cpu_callout_map);
		Dprintk(" After Callout %d. \n ", cpu);

		/*
		 * Wait 5s total for a response
		 */
		waited = 0;
		while (waited < 50000 && !cpu_isset(cpu, cpu_callin_map)) {
			udelay(100);
			waited++;
		}

		if (cpu_isset(cpu, cpu_callin_map)) {
			/* number CPUs logically, starting from 1 (BSP is 0) */
			Dprintk(" OK. \n ");
			printk(" CPU%d: ", cpu);
			print_cpu_info(&cpu_data[cpu]);
			Dprintk(" CPU has booted. \n ");
		} else {
			boot_error = 1;
			if (*((volatile unsigned char *) trampoline_base)
					== 0xA5)
				/* trampoline started but...? */
				printk(" Stuck ?? \n ");
			else
				/* trampoline code not run */
				printk(" Not responding. \n ");
			inquire_remote_apic(apicid);
		}
	}

	if (boot_error) {
		/* Try to put things back the way they were before ... */
		unmap_cpu_to_logical_apicid(cpu);
		cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */
		cpu_clear(cpu, cpu_initialized); /* was set by cpu_init() */
		cpucount--;
	} else {
		x86_cpu_to_apicid[cpu] = apicid;
		cpu_set(cpu, cpu_present_map);
	}

	/* mark "stuck" area as not stuck */
	*((volatile unsigned long *) trampoline_base) = 0;

	return boot_error;
}
2005-06-25 14:54:56 -07:00
# ifdef CONFIG_HOTPLUG_CPU
/*
 * Tear down the boot-time bookkeeping of the CPU this runs on: release
 * its idle task reference, drop it from cpucount, undo cpu_init() and
 * irq_ctx_init(), and clear it from the callout, callin and commenced
 * masks as well as the logical APIC ID / node tables.
 */
void cpu_exit_clear(void)
{
	int this_cpu = raw_smp_processor_id();

	idle_task_exit();

	--cpucount;
	cpu_uninit();
	irq_ctx_exit(this_cpu);

	cpu_clear(this_cpu, cpu_callout_map);
	cpu_clear(this_cpu, cpu_callin_map);

	cpu_clear(this_cpu, smp_commenced_mask);
	unmap_cpu_to_logical_apicid(this_cpu);
}
/*
 * Argument bundle that __smp_prepare_cpu() passes to do_warm_boot_cpu()
 * through the work item.
 */
struct warm_boot_cpu_info {
/* completed by do_warm_boot_cpu() once do_boot_cpu() has returned */
struct completion * complete ;
/* APIC ID handed to do_boot_cpu() */
int apicid ;
/* logical CPU number handed to do_boot_cpu() */
int cpu ;
} ;
2006-03-25 03:08:18 -08:00
/*
 * Work handler: boot the CPU described by the struct warm_boot_cpu_info
 * in 'p', then signal the waiter.  The result of do_boot_cpu() is not
 * propagated.
 */
static void __cpuinit do_warm_boot_cpu(void *p)
{
	struct warm_boot_cpu_info *req = p;

	do_boot_cpu(req->apicid, req->cpu);

	complete(req->complete);
}
2006-03-25 03:08:18 -08:00
/*
 * Re-boot a CPU whose APIC ID was recorded earlier: restore the low
 * memory mappings, run do_boot_cpu() from a work item, wait for it to
 * finish, then drop the low mappings again.  TSC synchronization is
 * disabled for the duration.
 *
 * Returns -ENODEV when no APIC ID is known for 'cpu', 0 otherwise (the
 * outcome of do_boot_cpu() itself is not reflected in the return value).
 */
static int __cpuinit __smp_prepare_cpu(int cpu)
{
	DECLARE_COMPLETION(done);
	struct warm_boot_cpu_info info;
	struct work_struct task;
	int apicid = x86_cpu_to_apicid[cpu];

	if (apicid == BAD_APICID)
		return -ENODEV;

	info.complete = &done;
	info.apicid = apicid;
	info.cpu = cpu;
	INIT_WORK(&task, do_warm_boot_cpu, &info);

	tsc_sync_disabled = 1;

	/* init low mem mapping */
	clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
			KERNEL_PGD_PTRS);
	flush_tlb_all();

	/* 'task' and 'info' live on this stack, so wait for the work item. */
	schedule_work(&task);
	wait_for_completion(&done);

	tsc_sync_disabled = 0;
	zap_low_mappings();

	return 0;
}
# endif
2005-04-16 15:20:36 -07:00
static void smp_tune_scheduling ( void )
{
unsigned long cachesize ; /* kB */
unsigned long bandwidth = 350 ; /* MB/s */
/*
* Rough estimation for SMP scheduling , this is the number of
* cycles it takes for a fully memory - limited process to flush
* the SMP - local cache .
*
* ( For a P5 this pretty much means we will choose another idle
* CPU almost always at wakeup time ( this is due to the small
* L1 cache ) , on PIIs it ' s around 50 - 100 usecs , depending on
* the cache size )
*/
if ( ! cpu_khz ) {
/*
* this basically disables processor - affinity
* scheduling on SMP without a TSC .
*/
return ;
} else {
cachesize = boot_cpu_data . x86_cache_size ;
if ( cachesize = = - 1 ) {
cachesize = 16 ; /* Pentiums, 2x8kB cache */
bandwidth = 100 ;
}
[PATCH] scheduler cache-hot-autodetect
)
From: Ingo Molnar <mingo@elte.hu>
This is the latest version of the scheduler cache-hot-auto-tune patch.
The first problem was that detection time scaled with O(N^2), which is
unacceptable on larger SMP and NUMA systems. To solve this:
- I've added a 'domain distance' function, which is used to cache
measurement results. Each distance is only measured once. This means
that e.g. on NUMA distances of 0, 1 and 2 might be measured, on HT
distances 0 and 1, and on SMP distance 0 is measured. The code walks
the domain tree to determine the distance, so it automatically follows
whatever hierarchy an architecture sets up. This cuts down on the boot
time significantly and removes the O(N^2) limit. The only assumption
is that migration costs can be expressed as a function of domain
distance - this covers the overwhelming majority of existing systems,
and is a good guess even for more assymetric systems.
[ People hacking systems that have assymetries that break this
assumption (e.g. different CPU speeds) should experiment a bit with
the cpu_distance() function. Adding a ->migration_distance factor to
the domain structure would be one possible solution - but lets first
see the problem systems, if they exist at all. Lets not overdesign. ]
Another problem was that only a single cache-size was used for measuring
the cost of migration, and most architectures didnt set that variable
up. Furthermore, a single cache-size does not fit NUMA hierarchies with
L3 caches and does not fit HT setups, where different CPUs will often
have different 'effective cache sizes'. To solve this problem:
- Instead of relying on a single cache-size provided by the platform and
sticking to it, the code now auto-detects the 'effective migration
cost' between two measured CPUs, via iterating through a wide range of
cachesizes. The code searches for the maximum migration cost, which
occurs when the working set of the test-workload falls just below the
'effective cache size'. I.e. real-life optimized search is done for
the maximum migration cost, between two real CPUs.
This, amongst other things, has the positive effect hat if e.g. two
CPUs share a L2/L3 cache, a different (and accurate) migration cost
will be found than between two CPUs on the same system that dont share
any caches.
(The reliable measurement of migration costs is tricky - see the source
for details.)
Furthermore i've added various boot-time options to override/tune
migration behavior.
Firstly, there's a blanket override for autodetection:
migration_cost=1000,2000,3000
will override the depth 0/1/2 values with 1msec/2msec/3msec values.
Secondly, there's a global factor that can be used to increase (or
decrease) the autodetected values:
migration_factor=120
will increase the autodetected values by 20%. This option is useful to
tune things in a workload-dependent way - e.g. if a workload is
cache-insensitive then CPU utilization can be maximized by specifying
migration_factor=0.
I've tested the autodetection code quite extensively on x86, on 3
P3/Xeon/2MB, and the autodetected values look pretty good:
Dual Celeron (128K L2 cache):
---------------------
migration cost matrix (max_cache_size: 131072, cpu: 467 MHz):
---------------------
[00] [01]
[00]: - 1.7(1)
[01]: 1.7(1) -
---------------------
cacheflush times [2]: 0.0 (0) 1.7 (1784008)
---------------------
Here the slow memory subsystem dominates system performance, and even
though caches are small, the migration cost is 1.7 msecs.
Dual HT P4 (512K L2 cache):
---------------------
migration cost matrix (max_cache_size: 524288, cpu: 2379 MHz):
---------------------
[00] [01] [02] [03]
[00]: - 0.4(1) 0.0(0) 0.4(1)
[01]: 0.4(1) - 0.4(1) 0.0(0)
[02]: 0.0(0) 0.4(1) - 0.4(1)
[03]: 0.4(1) 0.0(0) 0.4(1) -
---------------------
cacheflush times [2]: 0.0 (33900) 0.4 (448514)
---------------------
Here it can be seen that there is no migration cost between two HT
siblings (CPU#0/2 and CPU#1/3 are separate physical CPUs). A fast memory
system makes inter-physical-CPU migration pretty cheap: 0.4 msecs.
8-way P3/Xeon [2MB L2 cache]:
---------------------
migration cost matrix (max_cache_size: 2097152, cpu: 700 MHz):
---------------------
[00] [01] [02] [03] [04] [05] [06] [07]
[00]: - 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1)
[01]: 19.2(1) - 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1)
[02]: 19.2(1) 19.2(1) - 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1)
[03]: 19.2(1) 19.2(1) 19.2(1) - 19.2(1) 19.2(1) 19.2(1) 19.2(1)
[04]: 19.2(1) 19.2(1) 19.2(1) 19.2(1) - 19.2(1) 19.2(1) 19.2(1)
[05]: 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) - 19.2(1) 19.2(1)
[06]: 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) - 19.2(1)
[07]: 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) -
---------------------
cacheflush times [2]: 0.0 (0) 19.2 (19281756)
---------------------
This one has huge caches and a relatively slow memory subsystem - so the
migration cost is 19 msecs.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Ashok Raj <ashok.raj@intel.com>
Signed-off-by: Ken Chen <kenneth.w.chen@intel.com>
Cc: <wilder@us.ibm.com>
Signed-off-by: John Hawkes <hawkes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-12 01:05:30 -08:00
max_cache_size = cachesize * 1024 ;
2005-04-16 15:20:36 -07:00
}
}
/*
* Cycle through the processors sending APIC IPIs to boot each .
*/
/* Logical APIC id of the boot CPU; assigned in smp_boot_cpus ( ) from logical_smp_processor_id ( ) . */
static int boot_cpu_logical_apicid ;
/* Where the IO area was mapped on multiquad, always 0 otherwise */
void * xquad_portio ;
2005-06-23 00:08:33 -07:00
/* xquad_portio is exported only when CONFIG_X86_NUMAQ is set. */
# ifdef CONFIG_X86_NUMAQ
EXPORT_SYMBOL ( xquad_portio ) ;
# endif
2005-04-16 15:20:36 -07:00
/*
 * smp_boot_cpus - bring up the secondary processors during boot.
 * @max_cpus: upper bound on the number of CPUs to activate; 0 turns SMP off.
 *
 * Records the boot CPU's data and APIC ids, then falls back to a single-CPU
 * setup and returns early when no SMP configuration is found, when the local
 * APIC is missing, or when @max_cpus is 0. Otherwise it walks the APIC id
 * space and calls do_boot_cpu() for every present, non-boot CPU. Afterwards
 * it prints the BogoMIPS total, resets the sibling/core maps so that they
 * contain only CPU 0, sets up the IO-APIC and the boot APIC clock and, when
 * secondary CPUs were started and a TSC is usable, synchronizes the TSCs.
 */
static void __init smp_boot_cpus ( unsigned int max_cpus )
{
int apicid , cpu , bit , kicked ;
unsigned long bogosum = 0 ;
/*
 * Setup boot CPU information
 */
smp_store_cpu_info ( 0 ) ; /* Final full version of the data */
printk ( " CPU%d: " , 0 ) ;
print_cpu_info ( & cpu_data [ 0 ] ) ;
2005-10-31 19:16:17 -08:00
/* Remember the physical and logical APIC ids of the CPU we are running on. */
boot_cpu_physical_apicid = GET_APIC_ID ( apic_read ( APIC_ID ) ) ;
2005-04-16 15:20:36 -07:00
boot_cpu_logical_apicid = logical_smp_processor_id ( ) ;
x86_cpu_to_apicid [ 0 ] = boot_cpu_physical_apicid ;
current_thread_info ( ) - > cpu = 0 ;
smp_tune_scheduling ( ) ;
2005-11-05 17:25:54 +01:00
set_cpu_sibling_map ( 0 ) ;
2005-04-16 15:25:15 -07:00
2005-04-16 15:20:36 -07:00
/*
 * If we couldn't find an SMP configuration at boot time,
 * get out of here now!
 */
if ( ! smp_found_config & & ! acpi_lapic ) {
printk ( KERN_NOTICE " SMP motherboard not detected. \n " ) ;
2005-10-31 19:16:17 -08:00
smpboot_clear_io_apic_irqs ( ) ;
phys_cpu_present_map = physid_mask_of_physid ( 0 ) ;
/* A non-zero result from APIC_init_uniprocessor() is reported as "no local APIC". */
if ( APIC_init_uniprocessor ( ) )
printk ( KERN_NOTICE " Local APIC not detected. "
" Using dummy APIC emulation. \n " ) ;
map_cpu_to_logical_apicid ( ) ;
/* Single-CPU fallback: CPU 0 is its own only sibling and core. */
cpu_set ( 0 , cpu_sibling_map [ 0 ] ) ;
cpu_set ( 0 , cpu_core_map [ 0 ] ) ;
return ;
}
/*
 * Should not be necessary because the MP table should list the boot
 * CPU too, but we do it for the sake of robustness anyway.
 * Makes no sense to do this check in clustered apic mode, so skip it
 */
if ( ! check_phys_apicid_present ( boot_cpu_physical_apicid ) ) {
printk ( " weird, boot CPU (#%d) not listed by the BIOS. \n " ,
boot_cpu_physical_apicid ) ;
physid_set ( hard_smp_processor_id ( ) , phys_cpu_present_map ) ;
}
/*
 * If we couldn't find a local APIC, then get out of here now!
 */
if ( APIC_INTEGRATED ( apic_version [ boot_cpu_physical_apicid ] ) & & ! cpu_has_apic ) {
printk ( KERN_ERR " BIOS bug, local APIC #%d not detected!... \n " ,
boot_cpu_physical_apicid ) ;
printk ( KERN_ERR " ... forcing use of dummy APIC emulation. (tell your hw vendor) \n " ) ;
smpboot_clear_io_apic_irqs ( ) ;
phys_cpu_present_map = physid_mask_of_physid ( 0 ) ;
cpu_set ( 0 , cpu_sibling_map [ 0 ] ) ;
cpu_set ( 0 , cpu_core_map [ 0 ] ) ;
2005-04-16 15:20:36 -07:00
return ;
}
2005-10-31 19:16:17 -08:00
verify_local_APIC ( ) ;
2005-04-16 15:20:36 -07:00
/*
 * If SMP should be disabled, then really disable it!
 */
2005-10-31 19:16:17 -08:00
if ( ! max_cpus ) {
smp_found_config = 0 ;
printk ( KERN_INFO " SMP mode deactivated, forcing use of dummy APIC emulation. \n " ) ;
smpboot_clear_io_apic_irqs ( ) ;
phys_cpu_present_map = physid_mask_of_physid ( 0 ) ;
cpu_set ( 0 , cpu_sibling_map [ 0 ] ) ;
cpu_set ( 0 , cpu_core_map [ 0 ] ) ;
2005-04-16 15:20:36 -07:00
return ;
}
2005-10-31 19:16:17 -08:00
/* From here on SMP is in use: set up the boot CPU's local APIC. */
connect_bsp_APIC ( ) ;
setup_local_APIC ( ) ;
map_cpu_to_logical_apicid ( ) ;
2005-04-16 15:20:36 -07:00
setup_portio_remap ( ) ;
/*
 * Scan the CPU present map and fire up the other CPUs via do_boot_cpu
 *
 * In clustered apic mode, phys_cpu_present_map is a constructed thus:
 * bits 0-3 are quad0, 4-7 are quad1, etc. A perverse twist on the
 * clustered apic ID.
 */
Dprintk ( " CPU present map: %lx \n " , physids_coerce ( phys_cpu_present_map ) ) ;
/* kicked starts at 1 -- presumably counting the boot CPU; it caps the loop at NR_CPUS. */
kicked = 1 ;
for ( bit = 0 ; kicked < NR_CPUS & & bit < MAX_APICS ; bit + + ) {
apicid = cpu_present_to_apicid ( bit ) ;
/*
 * Don't even attempt to start the boot CPU!
 */
if ( ( apicid = = boot_cpu_apicid ) | | ( apicid = = BAD_APICID ) )
continue ;
if ( ! check_apicid_present ( bit ) )
continue ;
/* cpucount + 1 is the total reported below; stop starting CPUs once it reaches max_cpus. */
if ( max_cpus < = cpucount + 1 )
continue ;
2005-06-25 14:54:56 -07:00
/* A CPU id <= 0 from alloc_cpu_id() or a non-zero do_boot_cpu() result counts as failure. */
if ( ( ( cpu = alloc_cpu_id ( ) ) < = 0 ) | | do_boot_cpu ( apicid , cpu ) )
2005-04-16 15:20:36 -07:00
printk ( " CPU #%d not responding - cannot use it. \n " ,
apicid ) ;
else
+ + kicked ;
}
/*
 * Cleanup possible dangling ends...
 */
smpboot_restore_warm_reset_vector ( ) ;
/*
 * Allow the user to impress friends.
 */
Dprintk ( " Before bogomips. \n " ) ;
/* Sum loops_per_jiffy over every CPU present in cpu_callout_map. */
for ( cpu = 0 ; cpu < NR_CPUS ; cpu + + )
if ( cpu_isset ( cpu , cpu_callout_map ) )
bogosum + = cpu_data [ cpu ] . loops_per_jiffy ;
printk ( KERN_INFO
" Total of %d processors activated (%lu.%02lu BogoMIPS). \n " ,
cpucount + 1 ,
bogosum / ( 500000 / HZ ) ,
( bogosum / ( 5000 / HZ ) ) % 100 ) ;
Dprintk ( " Before bogocount - setting activated=1. \n " ) ;
if ( smp_b_stepping )
printk ( KERN_WARNING " WARNING: SMP operation may be unreliable with B stepping processors. \n " ) ;
/*
 * Don't taint if we are running SMP kernel on a single non-MP
 * approved Athlon
 */
if ( tainted & TAINT_UNSAFE_SMP ) {
if ( cpucount )
printk ( KERN_INFO " WARNING: This combination of AMD processors is not suitable for SMP. \n " ) ;
else
tainted & = ~ TAINT_UNSAFE_SMP ;
}
Dprintk ( " Boot done. \n " ) ;
/*
 * construct cpu_sibling_map[], so that we can tell sibling CPUs
 * efficiently.
 *
 * NOTE(review): the code visible here only clears every CPU's sibling
 * and core map and then marks CPU 0 in its own maps; any further
 * construction must happen elsewhere -- confirm.
 */
2005-04-16 15:25:15 -07:00
for ( cpu = 0 ; cpu < NR_CPUS ; cpu + + ) {
2005-04-16 15:20:36 -07:00
cpus_clear ( cpu_sibling_map [ cpu ] ) ;
2005-04-16 15:25:15 -07:00
cpus_clear ( cpu_core_map [ cpu ] ) ;
}
2005-04-16 15:20:36 -07:00
2005-06-25 14:54:54 -07:00
cpu_set ( 0 , cpu_sibling_map [ 0 ] ) ;
cpu_set ( 0 , cpu_core_map [ 0 ] ) ;
2005-04-16 15:20:36 -07:00
2005-10-31 19:16:17 -08:00
smpboot_setup_io_apic ( ) ;
setup_boot_APIC_clock ( ) ;
2005-04-16 15:20:36 -07:00
/*
 * Synchronize the TSC with the AP
 * (only when a TSC exists, at least one secondary CPU was started and
 * cpu_khz is non-zero)
 */
if ( cpu_has_tsc & & cpucount & & cpu_khz )
synchronize_tsc_bp ( ) ;
}
/* These are wrappers to interface to the new boot process. Someone
who understands all this stuff should rewrite it properly. --RR 15/Jul/02 */
/*
 * smp_prepare_cpus - boot-process entry point wrapping smp_boot_cpus().
 * @max_cpus: passed through unchanged to smp_boot_cpus().
 *
 * Initializes smp_commenced_mask and cpu_callin_map to contain only CPU 0,
 * issues a memory barrier, and then starts the secondary CPUs.
 */
void __init smp_prepare_cpus(unsigned int max_cpus)
{
	smp_commenced_mask = cpumask_of_cpu(0);
	cpu_callin_map = cpumask_of_cpu(0);
	mb();	/* barrier between the mask initialization and the CPU bring-up */

	smp_boot_cpus(max_cpus);
}
/*
 * smp_prepare_boot_cpu - register the CPU running this code as up.
 *
 * Adds the current CPU to the online, callout, present and possible maps
 * and records its per-CPU state as CPU_ONLINE.
 */
void __devinit smp_prepare_boot_cpu(void)
{
	int me = smp_processor_id();

	cpu_set(me, cpu_online_map);
	cpu_set(me, cpu_callout_map);
	cpu_set(me, cpu_present_map);
	cpu_set(me, cpu_possible_map);

	per_cpu(cpu_state, me) = CPU_ONLINE;
}
2005-06-25 14:54:50 -07:00
# ifdef CONFIG_HOTPLUG_CPU
2005-06-25 14:54:56 -07:00
static void
remove_siblinginfo ( int cpu )
2005-04-16 15:20:36 -07:00
{
2005-06-25 14:54:56 -07:00
int sibling ;
2005-11-05 17:25:54 +01:00
struct cpuinfo_x86 * c = cpu_data ;
2005-06-25 14:54:56 -07:00
2005-11-05 17:25:54 +01:00
for_each_cpu_mask ( sibling , cpu_core_map [ cpu ] ) {
cpu_clear ( cpu , cpu_core_map [ sibling ] ) ;
/*
* last thread sibling in this cpu core going down
*/
if ( cpus_weight ( cpu_sibling_map [ cpu ] ) = = 1 )
c [ sibling ] . booted_cores - - ;
}
2005-06-25 14:54:56 -07:00
for_each_cpu_mask ( sibling , cpu_sibling_map [ cpu ] )
cpu_clear ( cpu , cpu_sibling_map [ sibling ] ) ;
cpus_clear ( cpu_sibling_map [ cpu ] ) ;
cpus_clear ( cpu_core_map [ cpu ] ) ;
phys_proc_id [ cpu ] = BAD_APICID ;
cpu_core_id [ cpu ] = BAD_APICID ;
2005-11-05 17:25:54 +01:00
cpu_clear ( cpu , cpu_sibling_setup_map ) ;
2005-06-25 14:54:50 -07:00
}
/*
 * __cpu_disable - take the CPU running this code out of service.
 *
 * Returns -EBUSY for CPU 0, otherwise 0 after the CPU has been removed from
 * the topology maps, interrupts have been re-targeted via fixup_irqs() and
 * the CPU has been cleared from cpu_online_map.
 */
int __cpu_disable(void)
{
	cpumask_t remaining = cpu_online_map;
	int cpu = smp_processor_id();

	/*
	 * Perhaps use cpufreq to drop frequency, but that could go
	 * into generic code.
	 *
	 * We won't take down the boot processor on i386 due to some
	 * interrupts only being able to be serviced by the BSP.
	 * Especially so if we're not using an IOAPIC	-zwane
	 */
	if (cpu == 0)
		return -EBUSY;

	clear_local_APIC();

	/* Allow any queued timer interrupts to get serviced */
	local_irq_enable();
	mdelay(1);
	local_irq_disable();

	remove_siblinginfo(cpu);

	/* Hand fixup_irqs() the online snapshot minus this CPU. */
	cpu_clear(cpu, remaining);
	fixup_irqs(remaining);

	/* It's now safe to remove this processor from the online map */
	cpu_clear(cpu, cpu_online_map);

	return 0;
}
void __cpu_die ( unsigned int cpu )
{
/* We don't do anything here: idle task is faking death itself. */
unsigned int i ;
for ( i = 0 ; i < 10 ; i + + ) {
/* They ack this in play_dead by setting CPU_DEAD */
2005-06-25 14:54:56 -07:00
if ( per_cpu ( cpu_state , cpu ) = = CPU_DEAD ) {
printk ( " CPU %d is now offline \n " , cpu ) ;
2006-03-23 02:59:32 -08:00
if ( 1 = = num_online_cpus ( ) )
alternatives_smp_switch ( 0 ) ;
2005-06-25 14:54:50 -07:00
return ;
2005-06-25 14:54:56 -07:00
}
2005-09-10 00:26:50 -07:00
msleep ( 100 ) ;
2005-04-16 15:20:36 -07:00
}
2005-06-25 14:54:50 -07:00
printk ( KERN_ERR " CPU %u didn't die... \n " , cpu ) ;
}
# else /* ... !CONFIG_HOTPLUG_CPU */
/*
 * __cpu_disable - variant built when CONFIG_HOTPLUG_CPU is not set.
 *
 * Always refuses the request by returning -ENOSYS.
 */
int __cpu_disable(void)
{
	return -(ENOSYS);
}
2005-04-16 15:20:36 -07:00
2005-06-25 14:54:50 -07:00
/*
 * __cpu_die - variant built when CONFIG_HOTPLUG_CPU is not set.
 * @cpu: unused.
 *
 * __cpu_disable() always fails in this configuration, so reaching this
 * function is treated as a bug.
 */
void __cpu_die(unsigned int cpu)
{
	/* We said "no" in __cpu_disable */
	BUG();
}
# endif /* CONFIG_HOTPLUG_CPU */
/*
 * __cpu_up - release @cpu and wait until it shows up in cpu_online_map.
 * @cpu: logical CPU number to bring online.
 *
 * Returns 0 once @cpu is set in cpu_online_map, or -EIO when the CPU is not
 * in cpu_callin_map (with CONFIG_HOTPLUG_CPU: also when the warm boot via
 * __smp_prepare_cpu() reports an error).
 */
int __devinit __cpu_up(unsigned int cpu)
{
#ifdef CONFIG_HOTPLUG_CPU
	/*
	 * We do warm boot only on cpus that had booted earlier
	 * Otherwise cold boot is all handled from smp_boot_cpus().
	 * cpu_callin_map is set during AP kickstart process. Its reset
	 * when a cpu is taken offline from cpu_exit_clear().
	 */
	if (!cpu_isset(cpu, cpu_callin_map) && __smp_prepare_cpu(cpu))
		return -EIO;
#endif

	/* In case one didn't come up */
	if (!cpu_isset(cpu, cpu_callin_map)) {
		printk(KERN_DEBUG " skipping cpu%d, didn't come online \n ", cpu);
		local_irq_enable();
		return -EIO;
	}

	local_irq_enable();
	per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;

	/* Unleash the CPU! */
	cpu_set(cpu, smp_commenced_mask);

	/* Busy-wait until the CPU has marked itself online. */
	while (!cpu_isset(cpu, cpu_online_map))
		cpu_relax();

	return 0;
}
/*
 * smp_cpus_done - final boot-time SMP cleanup.
 * @max_cpus: unused.
 *
 * Sets the IO-APIC interrupt destinations (when CONFIG_X86_IO_APIC is set),
 * removes the low memory mappings and, unless CPU hotplug is configured,
 * applies the saved exec setting to the SMP trampoline page.
 */
void __init smp_cpus_done(unsigned int max_cpus)
{
#ifdef CONFIG_X86_IO_APIC
	setup_ioapic_dest();
#endif

	zap_low_mappings();

#ifndef CONFIG_HOTPLUG_CPU
	/*
	 * Disable executability of the SMP trampoline:
	 */
	set_kernel_exec((unsigned long) trampoline_base, trampoline_exec);
#endif
}
/*
 * smp_intr_init - install the interrupt gates used for IRQ0 and the SMP IPIs.
 */
void __init smp_intr_init(void)
{
	/*
	 * IRQ0 must be given a fixed assignment and initialized,
	 * because it's used before the IO-APIC is set up.
	 */
	set_intr_gate(FIRST_DEVICE_VECTOR, interrupt[0]);

	/*
	 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
	 * IPI, driven by wakeup.
	 */
	set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);

	/* IPI for invalidation */
	set_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);

	/* IPI for generic function call */
	set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
}