2009-12-09 21:45:34 +03:00
# define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
2008-03-19 20:25:23 +03:00
# include <linux/kernel.h>
# include <linux/module.h>
# include <linux/init.h>
# include <linux/bootmem.h>
# include <linux/percpu.h>
2008-06-20 17:38:22 +04:00
# include <linux/kexec.h>
2008-06-22 08:02:20 +04:00
# include <linux/crash_dump.h>
2009-01-04 14:34:26 +03:00
# include <linux/smp.h>
# include <linux/topology.h>
2009-02-24 05:57:21 +03:00
# include <linux/pfn.h>
2008-03-19 20:25:23 +03:00
# include <asm/sections.h>
# include <asm/processor.h>
# include <asm/setup.h>
2008-04-04 23:40:48 +04:00
# include <asm/mpspec.h>
2008-04-04 23:40:41 +04:00
# include <asm/apicdef.h>
2008-06-20 17:38:22 +04:00
# include <asm/highmem.h>
2009-01-13 14:41:35 +03:00
# include <asm/proto.h>
2009-01-10 09:47:37 +03:00
# include <asm/cpumask.h>
2009-01-27 06:56:48 +03:00
# include <asm/cpu.h>
2009-02-09 16:17:40 +03:00
# include <asm/stackprotector.h>
2008-04-04 23:40:41 +04:00
x86: Add read_mostly declaration/definition to variables from smp.h
Add "read-mostly" qualifier to the following variables in
smp.h:
- cpu_sibling_map
- cpu_core_map
- cpu_llc_shared_map
- cpu_llc_id
- cpu_number
- x86_cpu_to_apicid
- x86_bios_cpu_apicid
- x86_cpu_to_logical_apicid
Since all the variables above are only written during
initialization, this change is meant to prevent false
sharing. More specifically, on the vSMP Foundation platform
x86_cpu_to_apicid shared the same internode_cache_line with
the frequently written lapic_events.
An analysis of the first 33 per_cpu variables out of 219
(of the memory they describe, to be more specific) shows that 8 are
read-mostly in nature (tlb_vector_offset, cpu_loops_per_jiffy,
xen_debug_irq, etc.) and 25 are frequently written (irq_stack_union,
gdt_page, exception_stacks, idt_desc, etc.).
Assuming that the spread of the rest of the per_cpu variables is
similar, identifying the read-mostly memory makes more sense
in terms of long-term code maintenance than identifying the
frequently written memory.
Signed-off-by: Vlad Zolotarov <vlad@scalemp.com>
Acked-by: Shai Fultheim <shai@scalemp.com>
Cc: Shai Fultheim (Shai@ScaleMP.com) <Shai@scalemp.com>
Cc: ido@wizery.com
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1719258.EYKzE4Zbq5@vlad
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2012-06-11 13:56:52 +04:00
/*
 * Per-CPU slot holding each CPU's own number.  It is filled in once per
 * possible CPU by setup_per_cpu_areas() and is therefore defined read-mostly.
 */
DEFINE_PER_CPU_READ_MOSTLY(int, cpu_number);
EXPORT_PER_CPU_SYMBOL(cpu_number);
2009-01-27 06:56:48 +03:00
/*
 * Per-CPU offset in effect before setup_per_cpu_areas() computes the real
 * ones: the address of __per_cpu_load on 64-bit builds, zero otherwise.
 */
#if defined(CONFIG_X86_64)
#define BOOT_PERCPU_OFFSET ((unsigned long)__per_cpu_load)
#else
#define BOOT_PERCPU_OFFSET 0
#endif

/*
 * Per-CPU copy of the CPU's own per-CPU offset.  Starts at
 * BOOT_PERCPU_OFFSET and is rewritten for every possible CPU in
 * setup_per_cpu_areas().
 */
DEFINE_PER_CPU(unsigned long, this_cpu_off) = BOOT_PERCPU_OFFSET;
EXPORT_PER_CPU_SYMBOL(this_cpu_off);
2009-01-13 14:41:35 +03:00
/*
 * Table with one per-CPU offset per CPU slot (NR_CPUS entries).  Every entry
 * starts out as BOOT_PERCPU_OFFSET.
 * NOTE(review): presumably this is what per_cpu_offset() indexes - confirm
 * against the percpu headers.
 */
unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = {
	[0 ... NR_CPUS - 1] = BOOT_PERCPU_OFFSET,
};
EXPORT_SYMBOL(__per_cpu_offset);
2008-03-19 20:25:23 +03:00
2009-03-06 08:33:59 +03:00
/*
 * Size of the reserved region in the first percpu chunk.
 *
 * On x86_64, symbols referenced from code must be reachable with 32-bit
 * relocations, so space for the static percpu variables of modules is
 * reserved in the first chunk, which sits at the percpu segment base.
 * On x86_32 anything can address anywhere, so nothing is reserved.
 */
#if defined(CONFIG_X86_64)
#define PERCPU_FIRST_CHUNK_RESERVE	PERCPU_MODULE_RESERVE
#else
#define PERCPU_FIRST_CHUNK_RESERVE	0
#endif
2009-08-14 10:00:52 +04:00
#ifdef CONFIG_X86_32
/**
 * pcpu_need_numa - tell whether percpu allocation has to care about NUMA
 *
 * Walks all possible CPUs and compares the node data of each CPU's early
 * node with that of the previously visited CPU.  As soon as an online node
 * with node data differs from the previous CPU's (non-NULL) node data, NUMA
 * matters.  Without CONFIG_NEED_MULTIPLE_NODES the answer is always "no".
 *
 * RETURNS:
 * true if NUMA should be considered; otherwise, false.
 */
static bool __init pcpu_need_numa(void)
{
#ifdef CONFIG_NEED_MULTIPLE_NODES
	pg_data_t *prev = NULL;
	unsigned int cpu;

	for_each_possible_cpu(cpu) {
		int nid = early_cpu_to_node(cpu);
		pg_data_t *cur = NODE_DATA(nid);

		/* two CPUs backed by different node data => NUMA matters */
		if (node_online(nid) && cur && prev && prev != cur)
			return true;

		/* remembered unconditionally, even when the node is offline */
		prev = cur;
	}
#endif
	return false;
}
#endif
2009-02-24 05:57:21 +03:00
2009-02-24 05:57:21 +03:00
/**
 * pcpu_alloc_bootmem - NUMA-aware bootmem allocation for percpu areas
 * @cpu: cpu the memory is allocated for
 * @size: number of bytes to allocate
 * @align: required alignment
 *
 * Allocates @size bytes aligned to @align with the non-panicking bootmem
 * allocators, using __pa(MAX_DMA_ADDRESS) as the allocation goal.  With
 * CONFIG_NEED_MULTIPLE_NODES the memory is taken from @cpu's early node when
 * that node is online and has node data; otherwise (and on builds without
 * multiple nodes) the plain, node-agnostic allocator is used.
 *
 * RETURNS:
 * Pointer to the allocated area on success, NULL on failure.
 */
static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
					unsigned long align)
{
	const unsigned long goal = __pa(MAX_DMA_ADDRESS);
#ifndef CONFIG_NEED_MULTIPLE_NODES
	return __alloc_bootmem_nopanic(size, align, goal);
#else
	int node = early_cpu_to_node(cpu);
	void *area;

	if (node_online(node) && NODE_DATA(node)) {
		/* preferred case: allocate from the cpu's own node */
		area = __alloc_bootmem_node_nopanic(NODE_DATA(node),
						    size, align, goal);
		pr_debug(" per cpu data for cpu%d %lu bytes on node%d at %016lx \n ",
			 cpu, size, node, __pa(area));
		return area;
	}

	/* node is offline or has no node data: fall back to any memory */
	area = __alloc_bootmem_nopanic(size, align, goal);
	pr_info(" cpu %d has no node %d or node-local memory \n ",
		cpu, node);
	pr_debug(" per cpu data for cpu%d %lu bytes at %016lx \n ",
		 cpu, size, __pa(area));
	return area;
#endif
}
2009-07-04 03:10:59 +04:00
/*
* Helpers for first chunk memory allocation
*/
2009-08-14 10:00:50 +04:00
static void * __init pcpu_fc_alloc ( unsigned int cpu , size_t size , size_t align )
2009-07-04 03:10:59 +04:00
{
2009-08-14 10:00:50 +04:00
return pcpu_alloc_bootmem ( cpu , size , align ) ;
2009-07-04 03:10:59 +04:00
}
/*
 * Free callback: hand the @size bytes at @ptr back to bootmem.  The pointer
 * is converted with __pa() before being passed to free_bootmem().
 */
static void __init pcpu_fc_free(void *ptr, size_t size)
{
	free_bootmem(__pa(ptr), size);
}
2009-08-14 10:00:52 +04:00
/*
 * Distance callback: report REMOTE_DISTANCE when @from and @to sit on
 * different early nodes (only possible with CONFIG_NEED_MULTIPLE_NODES),
 * LOCAL_DISTANCE in every other case.
 */
static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
{
#ifdef CONFIG_NEED_MULTIPLE_NODES
	if (early_cpu_to_node(from) != early_cpu_to_node(to))
		return REMOTE_DISTANCE;
#endif
	return LOCAL_DISTANCE;
}
2009-08-14 10:00:49 +04:00
/*
 * PTE-population callback handed to the page-based first chunk allocator;
 * simply forwards @addr to populate_extra_pte().
 */
static void __init pcpup_populate_pte(unsigned long addr)
{
	populate_extra_pte(addr);
}
2009-01-27 06:56:48 +03:00
/*
 * On 32-bit builds, build a descriptor from per_cpu_offset(cpu) and write it
 * into the GDT_ENTRY_PERCPU slot of @cpu's GDT.  Does nothing on other
 * builds.
 * NOTE(review): per_cpu_offset(cpu) and 0xFFFFF are presumably the segment
 * base and limit - confirm against pack_descriptor()'s signature.
 */
static inline void setup_percpu_segment(int cpu)
{
#ifdef CONFIG_X86_32
	struct desc_struct seg;

	pack_descriptor(&seg, per_cpu_offset(cpu), 0xFFFFF,
			0x2 | DESCTYPE_S, 0x8);
	seg.s = 1;
	write_gdt_entry(get_cpu_gdt_table(cpu),
			GDT_ENTRY_PERCPU, &seg, DESCTYPE_S);
#endif
}
2008-03-19 20:25:23 +03:00
/**
 * setup_per_cpu_areas - create the first percpu chunk and per-CPU bookkeeping
 *
 * Sets up the first percpu chunk (embedding allocator unless the page
 * allocator was chosen, page allocator as the fallback) and panics when no
 * allocator succeeds.  Afterwards every possible CPU gets its per-CPU
 * offset, its cpu number and its segments set up, the data collected in the
 * early per-cpu maps is copied into the real per-CPU areas, and the early
 * map pointers are cleared.  Finally the node-to-cpumask map and the cpu
 * local masks are set up.
 */
void __init setup_per_cpu_areas(void)
{
	unsigned long base_delta;
	unsigned int cpu;
	int err = -EINVAL;

	pr_info(" NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d \n ",
		NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);

	/*
	 * The embedding allocator is preferred.  On NUMA configurations,
	 * however, it can produce a very sparse unit mapping, and the
	 * vmalloc area is not spacious enough for that on 32 bit, so the
	 * page allocator is selected there instead.
	 */
#ifdef CONFIG_X86_32
	if (pcpu_chosen_fc == PCPU_FC_AUTO && pcpu_need_numa())
		pcpu_chosen_fc = PCPU_FC_PAGE;
#endif

	if (pcpu_chosen_fc != PCPU_FC_PAGE) {
		/*
		 * 64 bit uses PMD_SIZE as the atom size so that embedded
		 * percpu areas are PMD aligned, which may later allow PMD
		 * mappings in the vmalloc area.  32 bit sticks to PAGE_SIZE
		 * because vmalloc space is highly contended there and large
		 * vmalloc area allocations can easily fail.
		 */
#ifdef CONFIG_X86_64
		const size_t atom_size = PMD_SIZE;
#else
		const size_t atom_size = PAGE_SIZE;
#endif
		const size_t dyn_size = PERCPU_MODULE_RESERVE +
			PERCPU_DYNAMIC_RESERVE - PERCPU_FIRST_CHUNK_RESERVE;

		err = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
					     dyn_size, atom_size,
					     pcpu_cpu_distance,
					     pcpu_fc_alloc, pcpu_fc_free);
		if (err < 0)
			pr_warning(" %s allocator failed (%d), falling back to page size \n ",
				   pcpu_fc_names[pcpu_chosen_fc], err);
	}

	/* page allocator: either chosen up front or the fallback path */
	if (err < 0) {
		err = pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
					    pcpu_fc_alloc, pcpu_fc_free,
					    pcpup_populate_pte);
		if (err < 0)
			panic(" cannot initialize percpu area (err=%d) ", err);
	}

	/* the percpu areas exist now; publish each CPU's offset and state */
	base_delta = (unsigned long)pcpu_base_addr -
		     (unsigned long)__per_cpu_start;
	for_each_possible_cpu(cpu) {
		per_cpu_offset(cpu) = base_delta + pcpu_unit_offsets[cpu];
		per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
		per_cpu(cpu_number, cpu) = cpu;
		setup_percpu_segment(cpu);
		setup_stack_canary_segment(cpu);

		/*
		 * Move the data used by early init routines from the initial
		 * arrays into the per cpu areas.  The arrays become
		 * expendable afterwards, and the *_early_ptr's are cleared
		 * below to signal that the static arrays are gone.
		 */
#ifdef CONFIG_X86_LOCAL_APIC
		per_cpu(x86_cpu_to_apicid, cpu) =
			early_per_cpu_map(x86_cpu_to_apicid, cpu);
		per_cpu(x86_bios_cpu_apicid, cpu) =
			early_per_cpu_map(x86_bios_cpu_apicid, cpu);
#endif
#ifdef CONFIG_X86_32
		per_cpu(x86_cpu_to_logical_apicid, cpu) =
			early_per_cpu_map(x86_cpu_to_logical_apicid, cpu);
#endif
#ifdef CONFIG_X86_64
		per_cpu(irq_stack_ptr, cpu) =
			per_cpu(irq_stack_union.irq_stack, cpu) +
			IRQ_STACK_SIZE - 64;
#endif
#ifdef CONFIG_NUMA
		per_cpu(x86_cpu_to_node_map, cpu) =
			early_per_cpu_map(x86_cpu_to_node_map, cpu);
		/*
		 * Make sure the boot cpu's numa_node is right even when the
		 * boot cpu sits on a node without installed memory.  With
		 * MEMORY_HOTPLUG, cpu_up() also calls cpu_to_node() for APs
		 * before per_cpu(numa_node) is set up later by c_init (aka
		 * intel_init/amd_init), so it is set here for the boot cpu
		 * and all APs alike.
		 */
		set_cpu_numa_node(cpu, early_cpu_to_node(cpu));
#endif
		/*
		 * Until now the boot CPU has been running off the .init.data
		 * area; reload any changed state for it.
		 */
		if (cpu == 0)
			switch_to_new_gdt(cpu);
	}

	/* signal that the early static arrays will soon be gone */
#ifdef CONFIG_X86_LOCAL_APIC
	early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
	early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
#endif
#ifdef CONFIG_X86_32
	early_per_cpu_ptr(x86_cpu_to_logical_apicid) = NULL;
#endif
#ifdef CONFIG_NUMA
	early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
#endif

	/* set up the node to cpumask map */
	setup_node_to_cpumask_map();

	/* set up the cpu initialized, callin and callout masks */
	setup_cpu_local_masks();
}