2005-06-26 01:58:02 +04:00
/*
2007-10-13 05:10:53 +04:00
* handle transition of Linux booting another kernel
2005-06-26 01:58:02 +04:00
* Copyright ( C ) 2002 - 2005 Eric Biederman < ebiederm @ xmission . com >
*
* This source code is licensed under the GNU General Public License ,
* Version 2. See the file COPYING for more details .
*/
# include <linux/mm.h>
# include <linux/kexec.h>
# include <linux/string.h>
# include <linux/reboot.h>
2007-10-17 10:27:27 +04:00
# include <linux/numa.h>
2008-05-12 23:20:43 +04:00
# include <linux/ftrace.h>
2009-03-10 05:56:57 +03:00
# include <linux/io.h>
2009-03-10 05:57:16 +03:00
# include <linux/suspend.h>
2008-05-12 23:20:43 +04:00
2005-06-26 01:58:02 +04:00
# include <asm/pgtable.h>
# include <asm/tlbflush.h>
# include <asm/mmu_context.h>
2009-06-01 22:16:03 +04:00
# include <asm/debugreg.h>
2005-07-29 23:25:28 +04:00
2009-03-10 05:57:04 +03:00
/*
 * Ensure the PMD-sized large page that contains @addr is identity mapped
 * in the page table rooted at @pgd.
 *
 * Missing PUD/PMD tables are taken from the image's control pages and
 * zeroed before being linked in.  Entries that are already present are
 * left untouched.
 *
 * Returns 0 on success, -ENOMEM if a control page could not be allocated.
 */
static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
				unsigned long addr)
{
	struct page *table_page;
	pud_t *pud_entry;
	pmd_t *pmd_entry;

	/* Round down to the start of the PMD-sized region. */
	addr &= PMD_MASK;

	pgd += pgd_index(addr);
	if (!pgd_present(*pgd)) {
		table_page = kimage_alloc_control_pages(image, 0);
		if (!table_page)
			return -ENOMEM;
		pud_entry = (pud_t *)page_address(table_page);
		memset(pud_entry, 0, PAGE_SIZE);
		set_pgd(pgd, __pgd(__pa(pud_entry) | _KERNPG_TABLE));
	}

	pud_entry = pud_offset(pgd, addr);
	if (!pud_present(*pud_entry)) {
		table_page = kimage_alloc_control_pages(image, 0);
		if (!table_page)
			return -ENOMEM;
		pmd_entry = (pmd_t *)page_address(table_page);
		memset(pmd_entry, 0, PAGE_SIZE);
		set_pud(pud_entry, __pud(__pa(pmd_entry) | _KERNPG_TABLE));
	}

	/* The entry value is the address itself: virtual == physical. */
	pmd_entry = pmd_offset(pud_entry, addr);
	if (!pmd_present(*pmd_entry))
		set_pmd(pmd_entry, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));

	return 0;
}
2005-07-29 23:25:28 +04:00
/*
 * Fill the PMD table at @level2p with executable large-page entries that
 * identity map PUD_SIZE bytes starting at the page-aligned @addr, one
 * entry per PMD_SIZE step.
 */
static void init_level2_page(pmd_t *level2p, unsigned long addr)
{
	unsigned long limit;

	addr &= PAGE_MASK;
	for (limit = addr + PUD_SIZE; addr < limit; addr += PMD_SIZE)
		set_pmd(level2p++, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
}
2005-07-29 23:25:28 +04:00
/*
 * Populate the PUD table at @level3p so that it covers PGDIR_SIZE bytes
 * starting at the page-aligned @addr.
 *
 * For every PUD_SIZE step below @last_addr a fresh PMD table is taken
 * from the image's control pages, filled by init_level2_page() and linked
 * in.  Entries from @last_addr up to the end of the covered range are
 * cleared.
 *
 * Returns 0 on success, -ENOMEM if a control page could not be allocated
 * (the remaining entries are then left as they were).
 */
static int init_level3_page(struct kimage *image, pud_t *level3p,
			    unsigned long addr, unsigned long last_addr)
{
	unsigned long limit;

	addr &= PAGE_MASK;
	limit = addr + PGDIR_SIZE;

	for (; addr < last_addr && addr < limit; addr += PUD_SIZE) {
		struct page *table_page;
		pmd_t *level2p;

		table_page = kimage_alloc_control_pages(image, 0);
		if (!table_page)
			return -ENOMEM;

		level2p = (pmd_t *)page_address(table_page);
		init_level2_page(level2p, addr);
		set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE));
	}

	/* clear the unused entries */
	for (; addr < limit; addr += PUD_SIZE)
		pud_clear(level3p++);

	return 0;
}
2005-07-29 23:25:28 +04:00
/*
 * Populate the top-level table at @level4p, starting from its first
 * entry, so that it covers PTRS_PER_PGD * PGDIR_SIZE bytes beginning at
 * the page-aligned @addr.
 *
 * For every PGDIR_SIZE step below @last_addr a fresh PUD table is taken
 * from the image's control pages, filled by init_level3_page() and linked
 * in.  The remaining top-level entries are cleared.
 *
 * Returns 0 on success, or the error from the allocation or from
 * init_level3_page() (the remaining entries are then left as they were).
 */
static int init_level4_page(struct kimage *image, pgd_t *level4p,
			    unsigned long addr, unsigned long last_addr)
{
	unsigned long limit;
	int err;

	addr &= PAGE_MASK;
	limit = addr + (PTRS_PER_PGD * PGDIR_SIZE);

	for (; addr < last_addr && addr < limit; addr += PGDIR_SIZE) {
		struct page *table_page;
		pud_t *level3p;

		table_page = kimage_alloc_control_pages(image, 0);
		if (!table_page)
			return -ENOMEM;

		level3p = (pud_t *)page_address(table_page);
		err = init_level3_page(image, level3p, addr, last_addr);
		if (err)
			return err;

		set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE));
	}

	/* clear the unused entries */
	for (; addr < limit; addr += PGDIR_SIZE)
		pgd_clear(level4p++);

	return 0;
}
2009-02-03 09:22:48 +03:00
static void free_transition_pgtable ( struct kimage * image )
{
free_page ( ( unsigned long ) image - > arch . pud ) ;
free_page ( ( unsigned long ) image - > arch . pmd ) ;
free_page ( ( unsigned long ) image - > arch . pte ) ;
}
/*
 * Map the kernel virtual address of relocate_kernel onto the second page
 * of the image's control code area, in the page table rooted at @pgd.
 *
 * Any PUD/PMD/PTE table missing on the way is allocated with
 * get_zeroed_page() and remembered in image->arch so that
 * free_transition_pgtable() can release it later.
 *
 * Returns 0 on success.  On allocation failure the tables recorded so far
 * are freed and -ENOMEM is returned.
 */
static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
{
	unsigned long virt, phys;
	pud_t *pud_entry;
	pmd_t *pmd_entry;
	pte_t *pte_entry;

	virt = (unsigned long)relocate_kernel;
	phys = __pa(page_address(image->control_code_page) + PAGE_SIZE);

	pgd += pgd_index(virt);
	if (!pgd_present(*pgd)) {
		pud_entry = (pud_t *)get_zeroed_page(GFP_KERNEL);
		if (!pud_entry)
			goto err;
		image->arch.pud = pud_entry;
		set_pgd(pgd, __pgd(__pa(pud_entry) | _KERNPG_TABLE));
	}

	pud_entry = pud_offset(pgd, virt);
	if (!pud_present(*pud_entry)) {
		pmd_entry = (pmd_t *)get_zeroed_page(GFP_KERNEL);
		if (!pmd_entry)
			goto err;
		image->arch.pmd = pmd_entry;
		set_pud(pud_entry, __pud(__pa(pmd_entry) | _KERNPG_TABLE));
	}

	pmd_entry = pmd_offset(pud_entry, virt);
	if (!pmd_present(*pmd_entry)) {
		pte_entry = (pte_t *)get_zeroed_page(GFP_KERNEL);
		if (!pte_entry)
			goto err;
		image->arch.pte = pte_entry;
		set_pmd(pmd_entry, __pmd(__pa(pte_entry) | _KERNPG_TABLE));
	}

	pte_entry = pte_offset_kernel(pmd_entry, virt);
	set_pte(pte_entry, pfn_pte(phys >> PAGE_SHIFT, PAGE_KERNEL_EXEC));
	return 0;

err:
	free_transition_pgtable(image);
	return -ENOMEM;
}
2005-06-26 01:58:02 +04:00
static int init_pgtable ( struct kimage * image , unsigned long start_pgtable )
{
2005-07-29 23:25:28 +04:00
pgd_t * level4p ;
2009-02-03 09:22:48 +03:00
int result ;
2005-07-29 23:25:28 +04:00
level4p = ( pgd_t * ) __va ( start_pgtable ) ;
2009-02-03 09:22:48 +03:00
result = init_level4_page ( image , level4p , 0 , max_pfn < < PAGE_SHIFT ) ;
2009-03-10 05:57:04 +03:00
if ( result )
return result ;
/*
* image - > start may be outside 0 ~ max_pfn , for example when
* jump back to original kernel from kexeced kernel
*/
result = init_one_level2_page ( image , level4p , image - > start ) ;
2009-02-03 09:22:48 +03:00
if ( result )
return result ;
return init_transition_pgtable ( image , level4p ) ;
2005-06-26 01:58:02 +04:00
}
/*
 * Load the interrupt descriptor table register.
 *
 * @newidt: address to install as the IDT base
 * @limit:  value to install as the IDT limit
 */
static void set_idt(void *newidt, u16 limit)
{
	struct desc_ptr curidt;

	/* x86-64 supports unaligned loads & stores */
	curidt.size = limit;
	curidt.address = (unsigned long)newidt;

	__asm__ __volatile__ (
		" lidtq %0 \n "
		: : " m " (curidt)
		);
}
/*
 * Load the global descriptor table register.
 *
 * @newgdt: address to install as the GDT base
 * @limit:  value to install as the GDT limit
 */
static void set_gdt(void *newgdt, u16 limit)
{
	struct desc_ptr curgdt;

	/* x86-64 supports unaligned loads & stores */
	curgdt.size = limit;
	curgdt.address = (unsigned long)newgdt;

	__asm__ __volatile__ (
		" lgdtq %0 \n "
		: : " m " (curgdt)
		);
}
static void load_segments ( void )
{
__asm__ __volatile__ (
2005-07-29 23:02:09 +04:00
" \t movl %0,%%ds \n "
" \t movl %0,%%es \n "
" \t movl %0,%%ss \n "
" \t movl %0,%%fs \n "
" \t movl %0,%%gs \n "
2006-03-08 08:55:48 +03:00
: : " a " ( __KERNEL_DS ) : " memory "
2005-06-26 01:58:02 +04:00
) ;
}
int machine_kexec_prepare ( struct kimage * image )
{
2006-09-26 12:52:38 +04:00
unsigned long start_pgtable ;
2005-06-26 01:58:02 +04:00
int result ;
/* Calculate the offsets */
2005-06-26 01:58:28 +04:00
start_pgtable = page_to_pfn ( image - > control_code_page ) < < PAGE_SHIFT ;
2005-06-26 01:58:02 +04:00
/* Setup the identity mapped 64bit page table */
result = init_pgtable ( image , start_pgtable ) ;
2005-06-26 01:58:28 +04:00
if ( result )
2005-06-26 01:58:02 +04:00
return result ;
return 0 ;
}
/*
 * Undo machine_kexec_prepare(): release the transition page-table pages
 * recorded in image->arch.
 */
void machine_kexec_cleanup(struct kimage *image)
{
	free_transition_pgtable(image);
}
/*
* Do not allocate memory ( or fail in any way ) in machine_kexec ( ) .
* We are past the point of no return , committed to rebooting now .
*/
2008-07-26 06:45:07 +04:00
void machine_kexec ( struct kimage * image )
2005-06-26 01:58:02 +04:00
{
2006-09-26 12:52:38 +04:00
unsigned long page_list [ PAGES_NR ] ;
void * control_page ;
2009-03-10 05:57:16 +03:00
int save_ftrace_enabled ;
2005-06-26 01:58:02 +04:00
2009-03-10 05:57:16 +03:00
# ifdef CONFIG_KEXEC_JUMP
2009-05-08 06:51:41 +04:00
if ( image - > preserve_context )
2009-03-10 05:57:16 +03:00
save_processor_state ( ) ;
# endif
save_ftrace_enabled = __ftrace_enabled_save ( ) ;
2008-05-12 23:20:43 +04:00
2005-06-26 01:58:02 +04:00
/* Interrupts aren't acceptable while we reboot */
local_irq_disable ( ) ;
2009-06-01 22:16:03 +04:00
hw_breakpoint_disable ( ) ;
2005-06-26 01:58:02 +04:00
2009-03-10 05:57:16 +03:00
if ( image - > preserve_context ) {
# ifdef CONFIG_X86_IO_APIC
/*
* We need to put APICs in legacy mode so that we can
* get timer interrupts in second kernel . kexec / kdump
* paths already have calls to disable_IO_APIC ( ) in
* one form or other . kexec jump path also need
* one .
*/
disable_IO_APIC ( ) ;
# endif
}
2006-09-26 12:52:38 +04:00
control_page = page_address ( image - > control_code_page ) + PAGE_SIZE ;
2009-03-10 05:57:16 +03:00
memcpy ( control_page , relocate_kernel , KEXEC_CONTROL_CODE_MAX_SIZE ) ;
2006-09-26 12:52:38 +04:00
Revert "[PATCH] x86: __pa and __pa_symbol address space separation"
This was broken. It adds complexity, for no good reason. Rather than
separate __pa() and __pa_symbol(), we should deprecate __pa_symbol(),
and preferably __pa() too - and just use "virt_to_phys()" instead, which
is more readable and has nicer semantics.
However, right now, just undo the separation, and make __pa_symbol() be
the exact same as __pa(). That fixes the bugs this patch introduced,
and we can do the fairly obvious cleanups later.
Do the new __phys_addr() function (which is now the actual workhorse for
the unified __pa()/__pa_symbol()) as a real external function, that way
all the potential issues with compile/link-time optimizations of
constant symbol addresses go away, and we can also, if we choose to, add
more sanity-checking of the argument.
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Vivek Goyal <vgoyal@in.ibm.com>
Cc: Andi Kleen <ak@suse.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-05-07 19:44:24 +04:00
page_list [ PA_CONTROL_PAGE ] = virt_to_phys ( control_page ) ;
2009-03-10 05:57:16 +03:00
page_list [ VA_CONTROL_PAGE ] = ( unsigned long ) control_page ;
2006-09-26 12:52:38 +04:00
page_list [ PA_TABLE_PAGE ] =
( unsigned long ) __pa ( page_address ( image - > control_code_page ) ) ;
2005-06-26 01:58:02 +04:00
2009-03-10 05:57:16 +03:00
if ( image - > type = = KEXEC_TYPE_DEFAULT )
page_list [ PA_SWAP_PAGE ] = ( page_to_pfn ( image - > swap_page )
< < PAGE_SHIFT ) ;
2009-03-10 05:56:57 +03:00
/*
* The segment registers are funny things , they have both a
2006-07-30 14:03:20 +04:00
* visible and an invisible part . Whenever the visible part is
* set to a specific selector , the invisible part is loaded
* with from a table in memory . At no other time is the
* descriptor table in memory accessed .
2005-06-26 01:58:02 +04:00
*
* I take advantage of this here by force loading the
* segments , before I zap the gdt with an invalid value .
*/
load_segments ( ) ;
2009-03-10 05:56:57 +03:00
/*
* The gdt & idt are now invalid .
2005-06-26 01:58:02 +04:00
* If you want to load them you must set up your own idt & gdt .
*/
2009-03-10 05:56:57 +03:00
set_gdt ( phys_to_virt ( 0 ) , 0 ) ;
set_idt ( phys_to_virt ( 0 ) , 0 ) ;
2006-09-26 12:52:38 +04:00
2005-06-26 01:58:02 +04:00
/* now call it */
2009-03-10 05:57:16 +03:00
image - > start = relocate_kernel ( ( unsigned long ) image - > head ,
( unsigned long ) page_list ,
image - > start ,
image - > preserve_context ) ;
# ifdef CONFIG_KEXEC_JUMP
2009-05-08 06:51:41 +04:00
if ( image - > preserve_context )
2009-03-10 05:57:16 +03:00
restore_processor_state ( ) ;
# endif
__ftrace_enabled_restore ( save_ftrace_enabled ) ;
2005-06-26 01:58:02 +04:00
}
2006-09-26 12:52:32 +04:00
2007-10-17 10:27:27 +04:00
void arch_crash_save_vmcoreinfo ( void )
{
2008-04-03 00:04:50 +04:00
VMCOREINFO_SYMBOL ( phys_base ) ;
2007-10-26 09:19:26 +04:00
VMCOREINFO_SYMBOL ( init_level4_pgt ) ;
2008-02-07 11:15:23 +03:00
# ifdef CONFIG_NUMA
VMCOREINFO_SYMBOL ( node_data ) ;
VMCOREINFO_LENGTH ( node_data , MAX_NUMNODES ) ;
# endif
2007-10-17 10:27:27 +04:00
}