2005-04-16 15:20:36 -07:00
/*
* Copyright ( C ) 1995 Linus Torvalds
* Copyright ( C ) 2001 , 2002 Andi Kleen , SuSE Labs .
*/
# include <linux/signal.h>
# include <linux/sched.h>
# include <linux/kernel.h>
# include <linux/errno.h>
# include <linux/string.h>
# include <linux/types.h>
# include <linux/ptrace.h>
# include <linux/mman.h>
# include <linux/mm.h>
# include <linux/smp.h>
# include <linux/interrupt.h>
# include <linux/init.h>
# include <linux/tty.h>
# include <linux/vt_kern.h> /* For unblank_screen() */
# include <linux/compiler.h>
2008-01-30 13:34:11 +01:00
# include <linux/highmem.h>
# include <linux/bootmem.h> /* for max_low_pfn */
2007-05-08 00:27:03 -07:00
# include <linux/vmalloc.h>
2005-04-16 15:20:36 -07:00
# include <linux/module.h>
2005-09-06 15:19:28 -07:00
# include <linux/kprobes.h>
2006-12-07 02:14:06 +01:00
# include <linux/uaccess.h>
2007-05-08 00:27:03 -07:00
# include <linux/kdebug.h>
2005-04-16 15:20:36 -07:00
# include <asm/system.h>
2008-01-30 13:34:11 +01:00
# include <asm/desc.h>
# include <asm/segment.h>
2005-04-16 15:20:36 -07:00
# include <asm/pgalloc.h>
# include <asm/smp.h>
# include <asm/tlbflush.h>
# include <asm/proto.h>
# include <asm-generic/sections.h>
2008-01-30 13:32:19 +01:00
/*
 * Page fault error code bits
 *	bit 0 == 0 means no page found, 1 means protection fault
 *	bit 1 == 0 means read, 1 means write
 *	bit 2 == 0 means kernel, 1 means user-mode
 *	bit 3 == 1 means use of reserved bit detected
 *	bit 4 == 1 means fault was an instruction fetch
 */
2008-01-30 13:32:53 +01:00
/* Masks for the individual bits of the page fault error code. */
# define PF_PROT (1<<0) /* bit 0: clear = no page found, set = protection fault */
2006-01-11 22:44:09 +01:00
# define PF_WRITE (1<<1) /* bit 1: clear = read access, set = write access */
2008-01-30 13:32:53 +01:00
# define PF_USER (1<<2) /* bit 2: clear = kernel mode, set = user mode */
# define PF_RSVD (1<<3) /* bit 3: set = use of a reserved bit was detected */
2006-01-11 22:44:09 +01:00
# define PF_INSTR (1<<4) /* bit 4: set = fault was an instruction fetch */
2007-10-16 01:24:07 -07:00
/*
 * Offer a kernel-mode page fault to a running kprobe.
 *
 * Returns 1 when a kprobe is running on this CPU and its fault handler
 * accepted the fault (trap number 14), otherwise 0.  Faults taken in
 * user mode are never offered, and without CONFIG_KPROBES the function
 * always returns 0.
 */
static inline int notify_page_fault(struct pt_regs *regs)
{
#ifdef CONFIG_KPROBES
	int handled = 0;
	int from_user;

#ifdef CONFIG_X86_32
	from_user = user_mode_vm(regs);
#else
	from_user = user_mode(regs);
#endif
	if (from_user)
		return 0;

	/* kprobe_running() needs smp_processor_id() */
	preempt_disable();
	if (kprobe_running() && kprobe_fault_handler(regs, 14))
		handled = 1;
	preempt_enable();

	return handled;
#else
	return 0;
#endif
}
2006-06-26 00:25:25 -07:00
2008-01-30 13:32:35 +01:00
/*
* X86_32
* Sometimes AMD Athlon / Opteron CPUs report invalid exceptions on prefetch .
* Check that here and ignore it .
*
* X86_64
* Sometimes the CPU reports invalid exceptions on prefetch .
* Check that here and ignore it .
*
* Opcode checker based on code by Richard Brunner
*/
static int is_prefetch ( struct pt_regs * regs , unsigned long addr ,
unsigned long error_code )
2008-01-30 13:32:19 +01:00
{
2006-12-07 02:14:06 +01:00
unsigned char * instr ;
2005-04-16 15:20:36 -07:00
int scan_more = 1 ;
2008-01-30 13:32:19 +01:00
int prefetch = 0 ;
2005-04-16 15:24:59 -07:00
unsigned char * max_instr ;
2005-04-16 15:20:36 -07:00
2008-01-30 13:32:35 +01:00
# ifdef CONFIG_X86_32
2008-01-30 13:34:11 +01:00
if ( ! ( __supported_pte_mask & _PAGE_NX ) )
2008-01-30 13:32:35 +01:00
return 0 ;
2008-01-30 13:34:11 +01:00
# endif
2008-01-30 13:34:11 +01:00
/* If it was a exec fault on NX page, ignore */
2006-01-11 22:44:09 +01:00
if ( error_code & PF_INSTR )
2005-04-16 15:20:36 -07:00
return 0 ;
2008-01-30 13:32:35 +01:00
2008-01-30 13:33:12 +01:00
instr = ( unsigned char * ) convert_ip_to_linear ( current , regs ) ;
2005-04-16 15:24:59 -07:00
max_instr = instr + 15 ;
2005-04-16 15:20:36 -07:00
2005-06-23 00:08:46 -07:00
if ( user_mode ( regs ) & & instr > = ( unsigned char * ) TASK_SIZE )
2005-04-16 15:20:36 -07:00
return 0 ;
2008-01-30 13:32:19 +01:00
while ( scan_more & & instr < max_instr ) {
2005-04-16 15:20:36 -07:00
unsigned char opcode ;
unsigned char instr_hi ;
unsigned char instr_lo ;
2006-12-07 02:14:06 +01:00
if ( probe_kernel_address ( instr , opcode ) )
2008-01-30 13:32:19 +01:00
break ;
2005-04-16 15:20:36 -07:00
2008-01-30 13:32:19 +01:00
instr_hi = opcode & 0xf0 ;
instr_lo = opcode & 0x0f ;
2005-04-16 15:20:36 -07:00
instr + + ;
2008-01-30 13:32:19 +01:00
switch ( instr_hi ) {
2005-04-16 15:20:36 -07:00
case 0x20 :
case 0x30 :
2008-01-30 13:32:19 +01:00
/*
* Values 0x26 , 0x2E , 0x36 , 0x3E are valid x86 prefixes .
* In X86_64 long mode , the CPU will signal invalid
* opcode if some of these prefixes are present so
* X86_64 will never get here anyway
*/
2005-04-16 15:20:36 -07:00
scan_more = ( ( instr_lo & 7 ) = = 0x6 ) ;
break ;
2008-01-30 13:32:19 +01:00
# ifdef CONFIG_X86_64
2005-04-16 15:20:36 -07:00
case 0x40 :
2008-01-30 13:32:19 +01:00
/*
* In AMD64 long mode 0x40 . .0 x4F are valid REX prefixes
* Need to figure out under what instruction mode the
* instruction was issued . Could check the LDT for lm ,
* but for now it ' s good enough to assume that long
* mode only uses well known segments or kernel .
*/
2005-06-23 00:08:46 -07:00
scan_more = ( ! user_mode ( regs ) ) | | ( regs - > cs = = __USER_CS ) ;
2005-04-16 15:20:36 -07:00
break ;
2008-01-30 13:32:19 +01:00
# endif
2005-04-16 15:20:36 -07:00
case 0x60 :
/* 0x64 thru 0x67 are valid prefixes in all modes. */
scan_more = ( instr_lo & 0xC ) = = 0x4 ;
2008-01-30 13:32:19 +01:00
break ;
2005-04-16 15:20:36 -07:00
case 0xF0 :
2008-01-30 13:32:35 +01:00
/* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
2005-04-16 15:20:36 -07:00
scan_more = ! instr_lo | | ( instr_lo > > 1 ) = = 1 ;
2008-01-30 13:32:19 +01:00
break ;
2005-04-16 15:20:36 -07:00
case 0x00 :
/* Prefetch instruction is 0x0F0D or 0x0F18 */
scan_more = 0 ;
2008-01-30 13:33:12 +01:00
2006-12-07 02:14:06 +01:00
if ( probe_kernel_address ( instr , opcode ) )
2005-04-16 15:20:36 -07:00
break ;
prefetch = ( instr_lo = = 0xF ) & &
( opcode = = 0x0D | | opcode = = 0x18 ) ;
2008-01-30 13:32:19 +01:00
break ;
2005-04-16 15:20:36 -07:00
default :
scan_more = 0 ;
break ;
2008-01-30 13:32:19 +01:00
}
2005-04-16 15:20:36 -07:00
}
return prefetch ;
}
2008-01-30 13:32:35 +01:00
/*
 * Fill in a siginfo describing a memory fault and force it onto @tsk.
 *
 * @si_signo: signal number to deliver
 * @si_code:  signal code stored in the siginfo
 * @address:  faulting address, reported through si_addr
 * @tsk:      task that receives the signal
 */
static void force_sig_info_fault(int si_signo, int si_code,
				 unsigned long address, struct task_struct *tsk)
{
	siginfo_t fault_info;

	fault_info.si_addr = (void __user *)address;
	fault_info.si_code = si_code;
	fault_info.si_errno = 0;
	fault_info.si_signo = si_signo;

	force_sig_info(si_signo, &fault_info, tsk);
}
2008-01-30 13:34:10 +01:00
#ifdef CONFIG_X86_64
/*
 * Probe one unsigned long at @p and hand back the result of
 * probe_kernel_address(); callers in this file treat a non-zero
 * return as "this address must not be dereferenced".
 */
static int bad_address(void *p)
{
	unsigned long *slot = (unsigned long *)p;
	unsigned long scratch;

	return probe_kernel_address(slot, scratch);
}
#endif
2005-04-16 15:20:36 -07:00
/*
 * Print the page-table entries that translate @address, starting from
 * the table that CR3 currently points to.
 *
 * X86_32: prints the pdpt/pde (PAE) or pde entry, then the pte when the
 * table holding it lies below max_low_pfn, is present and is not a PSE
 * (large page) entry.
 *
 * X86_64: prints PGD, PUD, PMD and PTE in turn.  Each table pointer is
 * probed with bad_address() before it is dereferenced; "BAD" is printed
 * and the walk stops if the probe fails.  The walk also stops at the
 * first non-present entry or at a large pmd.
 */
void dump_pagetable ( unsigned long address )
{
2008-01-30 13:34:10 +01:00
# ifdef CONFIG_X86_32
/* 'page' has the width of a pte value, so one variable fits PAE and non-PAE. */
__typeof__ ( pte_val ( __pte ( 0 ) ) ) page ;
page = read_cr3 ( ) ;
/* Top-level entry for this address. */
page = ( ( __typeof__ ( page ) * ) __va ( page ) ) [ address > > PGDIR_SHIFT ] ;
# ifdef CONFIG_X86_PAE
printk ( " *pdpt = %016Lx " , page ) ;
/* Only follow the entry if it is present and reachable through __va(). */
if ( ( page > > PAGE_SHIFT ) < max_low_pfn
& & page & _PAGE_PRESENT ) {
page & = PAGE_MASK ;
page = ( ( __typeof__ ( page ) * ) __va ( page ) ) [ ( address > > PMD_SHIFT )
& ( PTRS_PER_PMD - 1 ) ] ;
printk ( KERN_CONT " *pde = %016Lx " , page ) ;
/* Drop the NX bit so it does not disturb the pfn comparison below. */
page & = ~ _PAGE_NX ;
}
# else
printk ( " *pde = %08lx " , page ) ;
# endif
/*
 * We must not directly access the pte in the highpte
 * case if the page table is located in highmem.
 * And let's rather not kmap-atomic the pte, just in case
 * it's allocated already.
 */
if ( ( page > > PAGE_SHIFT ) < max_low_pfn
& & ( page & _PAGE_PRESENT )
& & ! ( page & _PAGE_PSE ) ) {
page & = PAGE_MASK ;
page = ( ( __typeof__ ( page ) * ) __va ( page ) ) [ ( address > > PAGE_SHIFT )
& ( PTRS_PER_PTE - 1 ) ] ;
/* Field width is two hex digits per byte of the entry. */
printk ( " *pte = %0*Lx " , sizeof ( page ) * 2 , ( u64 ) page ) ;
}
printk ( " \n " ) ;
# else /* CONFIG_X86_64 */
2005-04-16 15:20:36 -07:00
pgd_t * pgd ;
pud_t * pud ;
pmd_t * pmd ;
pte_t * pte ;
2007-07-22 11:12:29 +02:00
pgd = ( pgd_t * ) read_cr3 ( ) ;
2005-04-16 15:20:36 -07:00
2008-01-30 13:32:19 +01:00
/* CR3 holds a physical address; convert it before indexing. */
pgd = __va ( ( unsigned long ) pgd & PHYSICAL_PAGE_MASK ) ;
2005-04-16 15:20:36 -07:00
pgd + = pgd_index ( address ) ;
if ( bad_address ( pgd ) ) goto bad ;
2006-02-03 21:51:47 +01:00
printk ( " PGD %lx " , pgd_val ( * pgd ) ) ;
2008-01-30 13:32:19 +01:00
if ( ! pgd_present ( * pgd ) ) goto ret ;
2005-04-16 15:20:36 -07:00
2006-06-26 13:57:56 +02:00
pud = pud_offset ( pgd , address ) ;
2005-04-16 15:20:36 -07:00
if ( bad_address ( pud ) ) goto bad ;
printk ( " PUD %lx " , pud_val ( * pud ) ) ;
if ( ! pud_present ( * pud ) ) goto ret ;
pmd = pmd_offset ( pud , address ) ;
if ( bad_address ( pmd ) ) goto bad ;
printk ( " PMD %lx " , pmd_val ( * pmd ) ) ;
2007-10-19 20:35:03 +02:00
/* A large pmd maps the page directly; there is no pte level below it. */
if ( ! pmd_present ( * pmd ) | | pmd_large ( * pmd ) ) goto ret ;
2005-04-16 15:20:36 -07:00
pte = pte_offset_kernel ( pmd , address ) ;
if ( bad_address ( pte ) ) goto bad ;
2008-01-30 13:32:19 +01:00
printk ( " PTE %lx " , pte_val ( * pte ) ) ;
2005-04-16 15:20:36 -07:00
ret :
printk ( " \n " ) ;
return ;
bad :
printk ( " BAD \n " ) ;
2008-01-30 13:34:10 +01:00
# endif
}
#ifdef CONFIG_X86_32
/*
 * Bring one pmd entry of the page table rooted at @pgd in line with the
 * reference page table (init_mm.pgd) for @address.
 *
 * If the reference pmd is present but the pmd under @pgd is not, the
 * reference entry is copied.  If both are present they must refer to
 * the same page (BUG otherwise).
 *
 * Returns the reference pmd entry, or NULL when the reference table has
 * no present pgd, pud or pmd entry for @address.
 */
static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
{
	unsigned index = pgd_index(address);
	pgd_t *ref_pgd = init_mm.pgd + index;
	pud_t *pud, *ref_pud;
	pmd_t *pmd, *ref_pmd;

	pgd += index;

	if (!pgd_present(*ref_pgd))
		return NULL;

	/*
	 * set_pgd(pgd, *pgd_k); here would be useless on PAE
	 * and redundant with the set_pmd() on non-PAE. As would
	 * set_pud.
	 */
	pud = pud_offset(pgd, address);
	ref_pud = pud_offset(ref_pgd, address);
	if (!pud_present(*ref_pud))
		return NULL;

	pmd = pmd_offset(pud, address);
	ref_pmd = pmd_offset(ref_pud, address);
	if (!pmd_present(*ref_pmd))
		return NULL;

	if (pmd_present(*pmd)) {
		BUG_ON(pmd_page(*pmd) != pmd_page(*ref_pmd));
	} else {
		set_pmd(pmd, *ref_pmd);
		arch_flush_lazy_mmu_mode();
	}

	return ref_pmd;
}
#endif
2005-04-16 15:20:36 -07:00
2008-01-30 13:32:35 +01:00
# ifdef CONFIG_X86_64
2008-01-30 13:32:19 +01:00
/*
 * Message printed by is_errata93() the first time the K8 erratum #93
 * workaround is applied.  Each line carries its own KERN_ERR prefix.
 */
static const char errata93_warning [ ] =
2005-04-16 15:20:36 -07:00
KERN_ERR " ******* Your BIOS seems to not contain a fix for K8 errata #93 \n "
KERN_ERR " ******* Working around it, but it may cause SEGVs or burn power. \n "
KERN_ERR " ******* Please consider a BIOS update. \n "
KERN_ERR " ******* Disabling USB legacy in the BIOS may also help. \n " ;
2008-01-30 13:33:13 +01:00
# endif
2005-04-16 15:20:36 -07:00
/*
 * Workaround for K8 erratum #93 & buggy BIOS.
 *
 * BIOS SMM functions are required to use a specific workaround
 * to avoid corruption of the 64bit RIP register on C stepping K8.
 * A lot of BIOS that didn't get tested properly miss this.
 * The OS sees this as a page fault with the upper 32bits of RIP cleared.
 * Try to work around it here.
 *
 * Note we only handle faults in kernel here.
 * Does nothing for X86_32.
 *
 * Returns 1 after restoring the upper half of regs->ip (the fault can
 * then simply be retried), 0 when the fault does not match the erratum.
 */
static int is_errata93(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
	static int warned;
	unsigned long fixed_ip;
	int in_kernel_text;
	int in_modules;

	/* The fault must be on the IP itself, with its upper 32 bits clear. */
	if (address != regs->ip || (address >> 32) != 0)
		return 0;

	/* Candidate for the real IP: the same low half, upper half all ones. */
	fixed_ip = address | (0xffffffffUL << 32);

	in_kernel_text = fixed_ip >= (u64)_stext && fixed_ip <= (u64)_etext;
	in_modules = fixed_ip >= MODULES_VADDR && fixed_ip <= MODULES_END;
	if (!in_kernel_text && !in_modules)
		return 0;

	if (!warned) {
		printk(errata93_warning);
		warned = 1;
	}
	regs->ip = fixed_ip;
	return 1;
#else
	return 0;
#endif
}
2005-04-16 15:20:36 -07:00
2008-01-30 13:34:09 +01:00
/*
 * Work around K8 erratum #100: K8 in compat mode occasionally jumps to
 * illegal addresses > 4GB.  We catch this in the page fault handler
 * because these addresses are not reachable.  Just detect this case and
 * return.  Any code segment in LDT is compatibility mode.
 *
 * Returns 1 when the fault matches the erratum, 0 otherwise (always 0
 * without CONFIG_X86_64).
 */
static int is_errata100(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
	/* __USER32_CS, or any selector with bit 2 (the LDT bit) set. */
	int compat_cs = regs->cs == __USER32_CS || (regs->cs & (1 << 2));

	if (compat_cs && (address >> 32))
		return 1;
#endif
	return 0;
}
2008-01-30 13:34:09 +01:00
void do_invalid_op ( struct pt_regs * , unsigned long ) ;
/*
 * Pentium F0 0F C7 C8 bug workaround.
 *
 * On CPUs flagged with f00f_bug, a fault whose address falls into the
 * 8-byte slot number 6 counted from idt_descr.address is handed to
 * do_invalid_op().
 *
 * Returns 1 when the fault was redirected that way, 0 otherwise
 * (always 0 without CONFIG_X86_F00F_BUG).
 */
static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_F00F_BUG
	unsigned long idt_slot;

	if (!boot_cpu_data.f00f_bug)
		return 0;

	idt_slot = (address - idt_descr.address) >> 3;
	if (idt_slot == 6) {
		do_invalid_op(regs, 0);
		return 1;
	}
#endif
	return 0;
}
2008-01-30 13:34:10 +01:00
/*
 * Print the opening lines of a kernel page-fault oops.
 *
 * In order: (PAE only) a warning if the fault was an instruction fetch
 * from a present page that is not executable, then the
 * "unable to handle kernel ..." line with the faulting address, the
 * faulting instruction pointer, and finally the page-table dump for
 * @address.  On X86_32 nothing is printed when oops_may_print() says no.
 *
 * @regs:       register state at the time of the fault
 * @error_code: page fault error code (PF_* bits)
 * @address:    faulting address
 */
static void show_fault_oops ( struct pt_regs * regs , unsigned long error_code ,
unsigned long address )
{
2008-01-30 13:34:10 +01:00
# ifdef CONFIG_X86_32
if ( ! oops_may_print ( ) )
return ;
2008-01-30 13:34:11 +01:00
# endif
2008-01-30 13:34:10 +01:00
# ifdef CONFIG_X86_PAE
if ( error_code & PF_INSTR ) {
int level ;
pte_t * pte = lookup_address ( address , & level ) ;
/* Present but not executable: the kernel tried to run code from a data page. */
if ( pte & & pte_present ( * pte ) & & ! pte_exec ( * pte ) )
printk ( KERN_CRIT " kernel tried to execute "
" NX-protected page - exploit attempt? "
" (uid: %d) \n " , current - > uid ) ;
}
# endif
2008-01-30 13:34:10 +01:00
printk ( KERN_ALERT " BUG: unable to handle kernel " ) ;
2008-01-30 13:34:10 +01:00
/* Addresses inside the first page are reported as NULL dereferences. */
if ( address < PAGE_SIZE )
2008-01-30 13:34:10 +01:00
printk ( KERN_CONT " NULL pointer dereference " ) ;
2008-01-30 13:34:10 +01:00
else
2008-01-30 13:34:10 +01:00
printk ( KERN_CONT " paging request " ) ;
2008-01-30 13:34:11 +01:00
# ifdef CONFIG_X86_32
printk ( KERN_CONT " at %08lx \n " , address ) ;
# else
2008-01-30 13:34:10 +01:00
printk ( KERN_CONT " at %016lx \n " , address ) ;
2008-01-30 13:34:11 +01:00
# endif
2008-01-30 13:34:10 +01:00
printk ( KERN_ALERT " IP: " ) ;
2008-01-30 13:34:10 +01:00
printk_address ( regs - > ip , 1 ) ;
dump_pagetable ( address ) ;
}
2008-01-30 13:34:10 +01:00
#ifdef CONFIG_X86_64
/*
 * Report a corrupted page table and end the oops with SIGKILL.
 *
 * Prints the task name and the faulting address, dumps the page-table
 * entries for @address, records the fault (cr2, trap 14, error code) in
 * the current task's thread struct and runs the __die()/oops_end()
 * sequence.
 */
static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
				 unsigned long error_code)
{
	unsigned long flags = oops_begin();
	struct task_struct *tsk = current;

	printk(KERN_ALERT " %s: Corrupted page table at address %lx \n ",
	       tsk->comm, address);
	dump_pagetable(address);

	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;

	/* When __die() returns non-zero, oops_end() is called without regs. */
	if (__die(" Bad pagetable ", regs, error_code))
		regs = NULL;
	oops_end(flags, regs, SIGKILL);
}
#endif
2005-04-16 15:20:36 -07:00
2008-01-30 13:34:11 +01:00
/*
* Handle a spurious fault caused by a stale TLB entry . This allows
* us to lazily refresh the TLB when increasing the permissions of a
* kernel page ( RO - > RW or NX - > X ) . Doing it eagerly is very
* expensive since that implies doing a full cross - processor TLB
* flush , even if no stale TLB entries exist on other processors .
* There are no security implications to leaving a stale TLB when
* increasing the permissions on a page .
*/
static int spurious_fault ( unsigned long address ,
unsigned long error_code )
{
pgd_t * pgd ;
pud_t * pud ;
pmd_t * pmd ;
pte_t * pte ;
/* Reserved-bit violation or user access to kernel space? */
if ( error_code & ( PF_USER | PF_RSVD ) )
return 0 ;
pgd = init_mm . pgd + pgd_index ( address ) ;
if ( ! pgd_present ( * pgd ) )
return 0 ;
pud = pud_offset ( pgd , address ) ;
if ( ! pud_present ( * pud ) )
return 0 ;
pmd = pmd_offset ( pud , address ) ;
if ( ! pmd_present ( * pmd ) )
return 0 ;
pte = pte_offset_kernel ( pmd , address ) ;
if ( ! pte_present ( * pte ) )
return 0 ;
if ( ( error_code & PF_WRITE ) & & ! pte_write ( * pte ) )
return 0 ;
if ( ( error_code & PF_INSTR ) & & ! pte_exec ( * pte ) )
return 0 ;
return 1 ;
}
2005-04-16 15:20:36 -07:00
/*
2008-01-30 13:34:10 +01:00
* X86_32
* Handle a fault on the vmalloc or module mapping area
*
* X86_64
2006-01-11 22:44:00 +01:00
* Handle a fault on the vmalloc area
2005-05-16 21:53:31 -07:00
*
* This assumes no large pages in there .
2005-04-16 15:20:36 -07:00
*/
static int vmalloc_fault ( unsigned long address )
{
2008-01-30 13:33:13 +01:00
# ifdef CONFIG_X86_32
unsigned long pgd_paddr ;
pmd_t * pmd_k ;
pte_t * pte_k ;
/*
* Synchronize this task ' s top level page - table
* with the ' reference ' page table .
*
* Do _not_ use " current " here . We might be inside
* an interrupt in the middle of a task switch . .
*/
pgd_paddr = read_cr3 ( ) ;
pmd_k = vmalloc_sync_one ( __va ( pgd_paddr ) , address ) ;
if ( ! pmd_k )
return - 1 ;
pte_k = pte_offset_kernel ( pmd_k , address ) ;
if ( ! pte_present ( * pte_k ) )
return - 1 ;
return 0 ;
# else
2005-04-16 15:20:36 -07:00
pgd_t * pgd , * pgd_ref ;
pud_t * pud , * pud_ref ;
pmd_t * pmd , * pmd_ref ;
pte_t * pte , * pte_ref ;
/* Copy kernel mappings over when needed. This can also
happen within a race in page table update . In the later
case just flush . */
pgd = pgd_offset ( current - > mm ? : & init_mm , address ) ;
pgd_ref = pgd_offset_k ( address ) ;
if ( pgd_none ( * pgd_ref ) )
return - 1 ;
if ( pgd_none ( * pgd ) )
set_pgd ( pgd , * pgd_ref ) ;
2006-03-25 16:29:40 +01:00
else
2006-09-25 23:31:48 -07:00
BUG_ON ( pgd_page_vaddr ( * pgd ) ! = pgd_page_vaddr ( * pgd_ref ) ) ;
2005-04-16 15:20:36 -07:00
/* Below here mismatches are bugs because these lower tables
are shared */
pud = pud_offset ( pgd , address ) ;
pud_ref = pud_offset ( pgd_ref , address ) ;
if ( pud_none ( * pud_ref ) )
return - 1 ;
2006-09-25 23:31:48 -07:00
if ( pud_none ( * pud ) | | pud_page_vaddr ( * pud ) ! = pud_page_vaddr ( * pud_ref ) )
2005-04-16 15:20:36 -07:00
BUG ( ) ;
pmd = pmd_offset ( pud , address ) ;
pmd_ref = pmd_offset ( pud_ref , address ) ;
if ( pmd_none ( * pmd_ref ) )
return - 1 ;
if ( pmd_none ( * pmd ) | | pmd_page ( * pmd ) ! = pmd_page ( * pmd_ref ) )
BUG ( ) ;
pte_ref = pte_offset_kernel ( pmd_ref , address ) ;
if ( ! pte_present ( * pte_ref ) )
return - 1 ;
pte = pte_offset_kernel ( pmd , address ) ;
2005-05-16 21:53:31 -07:00
/* Don't use pte_page here, because the mappings can point
outside mem_map , and the NUMA hash lookup cannot handle
that . */
if ( ! pte_present ( * pte ) | | pte_pfn ( * pte ) ! = pte_pfn ( * pte_ref ) )
2005-04-16 15:20:36 -07:00
BUG ( ) ;
return 0 ;
2008-01-30 13:33:13 +01:00
# endif
2005-04-16 15:20:36 -07:00
}
2007-07-22 11:12:28 +02:00
/*
 * When non-zero, do_page_fault() logs a rate-limited "segfault at ..."
 * line for user-mode faults that raise an unhandled SIGSEGV.
 */
int show_unhandled_signals = 1 ;
2005-04-16 15:20:36 -07:00
/*
 * This routine handles page faults.  It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 *
 * @regs:       register state at the time of the fault
 * @error_code: page fault error code (PF_* bits)
 *
 * Overall flow:
 *  - faults at kernel addresses are resolved from the reference page
 *    table (vmalloc_fault) or recognised as stale-TLB faults
 *    (spurious_fault), without taking any lock;
 *  - other faults are validated against the task's vmas under mmap_sem
 *    and passed to handle_mm_fault();
 *  - anything that cannot be resolved ends in a signal to the task
 *    (user mode), an exception-table fixup, or an oops (kernel mode).
 */
2008-01-30 13:34:10 +01:00
# ifdef CONFIG_X86_64
asmlinkage
# endif
void __kprobes do_page_fault ( struct pt_regs * regs , unsigned long error_code )
2005-04-16 15:20:36 -07:00
{
struct task_struct * tsk ;
struct mm_struct * mm ;
2008-01-30 13:32:19 +01:00
struct vm_area_struct * vma ;
2005-04-16 15:20:36 -07:00
unsigned long address ;
2008-01-30 13:34:10 +01:00
int write , si_code ;
int fault ;
# ifdef CONFIG_X86_64
2005-09-12 18:49:24 +02:00
unsigned long flags ;
2008-01-30 13:34:10 +01:00
# endif
2005-04-16 15:20:36 -07:00
2007-10-25 14:01:10 +02:00
/*
 * We can fault from pretty much anywhere, with unknown IRQ state.
 */
trace_hardirqs_fixup ( ) ;
2006-03-25 16:30:10 +01:00
tsk = current ;
mm = tsk - > mm ;
prefetchw ( & mm - > mmap_sem ) ;
2005-04-16 15:20:36 -07:00
/* get the address */
2007-07-22 11:12:29 +02:00
address = read_cr2 ( ) ;
2005-04-16 15:20:36 -07:00
2008-01-30 13:32:35 +01:00
/* Default signal code; switched to SEGV_ACCERR once a vma is found. */
si_code = SEGV_MAPERR ;
2005-04-16 15:20:36 -07:00
2008-01-30 13:33:12 +01:00
if ( notify_page_fault ( regs ) )
return ;
2005-04-16 15:20:36 -07:00
/*
 * We fault-in kernel-space virtual memory on-demand. The
 * 'reference' page table is init_mm.pgd.
 *
 * NOTE! We MUST NOT take any locks for this case. We may
 * be in an interrupt or a critical region, and should
 * only copy the information from the master page table,
 * nothing more.
 *
 * This verifies that the fault happens in kernel space
 * (error_code & 4) == 0, and that the fault was not a
 * protection error (error_code & 9) == 0.
 */
2008-01-30 13:34:10 +01:00
# ifdef CONFIG_X86_32
if ( unlikely ( address > = TASK_SIZE ) ) {
if ( ! ( error_code & ( PF_RSVD | PF_USER | PF_PROT ) ) & &
vmalloc_fault ( address ) > = 0 )
return ;
2008-01-30 13:34:11 +01:00
/* Can handle a stale RO->RW TLB */
if ( spurious_fault ( address , error_code ) )
return ;
2008-01-30 13:34:10 +01:00
/*
 * Don't take the mm semaphore here. If we fixup a prefetch
 * fault we could otherwise deadlock.
 */
goto bad_area_nosemaphore ;
}
/*
 * It's safe to allow irq's after cr2 has been saved and the vmalloc
 * fault has been handled.
 */
if ( regs - > flags & ( X86_EFLAGS_IF | VM_MASK ) )
local_irq_enable ( ) ;
/*
 * If we're in an interrupt, have no user context or are running in an
 * atomic region then we must not take the fault.
 */
if ( in_atomic ( ) | | ! mm )
goto bad_area_nosemaphore ;
# else /* CONFIG_X86_64 */
[PATCH] x86_64: TASK_SIZE fixes for compatibility mode processes
Appended patch will setup compatibility mode TASK_SIZE properly. This will
fix atleast three known bugs that can be encountered while running
compatibility mode apps.
a) A malicious 32bit app can have an elf section at 0xffffe000. During
exec of this app, we will have a memory leak as insert_vm_struct() is
not checking for return value in syscall32_setup_pages() and thus not
freeing the vma allocated for the vsyscall page. And instead of exec
failing (as it has addresses > TASK_SIZE), we were allowing it to
succeed previously.
b) With a 32bit app, hugetlb_get_unmapped_area/arch_get_unmapped_area
may return addresses beyond 32bits, ultimately causing corruption
because of wrap-around and resulting in SEGFAULT, instead of returning
ENOMEM.
c) 32bit app doing this below mmap will now fail.
mmap((void *)(0xFFFFE000UL), 0x10000UL, PROT_READ|PROT_WRITE,
MAP_FIXED|MAP_PRIVATE|MAP_ANON, 0, 0);
Signed-off-by: Zou Nan hai <nanhai.zou@intel.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-21 17:14:32 -07:00
if ( unlikely ( address > = TASK_SIZE64 ) ) {
2006-01-11 22:44:00 +01:00
/*
 * Don't check for the module range here: its PML4
 * is always initialized because it's shared with the main
 * kernel text. Only vmalloc may need PML4 syncups.
 */
2006-01-11 22:44:09 +01:00
if ( ! ( error_code & ( PF_RSVD | PF_USER | PF_PROT ) ) & &
2006-01-11 22:44:00 +01:00
( ( address > = VMALLOC_START & & address < VMALLOC_END ) ) ) {
2006-03-25 16:29:40 +01:00
if ( vmalloc_fault ( address ) > = 0 )
return ;
2005-04-16 15:20:36 -07:00
}
2008-01-30 13:34:11 +01:00
/* Can handle a stale RO->RW TLB */
if ( spurious_fault ( address , error_code ) )
return ;
2005-04-16 15:20:36 -07:00
/*
 * Don't take the mm semaphore here. If we fixup a prefetch
 * fault we could otherwise deadlock.
 */
goto bad_area_nosemaphore ;
}
2008-01-30 13:30:56 +01:00
if ( likely ( regs - > flags & X86_EFLAGS_IF ) )
2006-03-25 16:29:40 +01:00
local_irq_enable ( ) ;
2006-01-11 22:44:09 +01:00
/* A reserved bit set in a page-table entry means the table is corrupted. */
if ( unlikely ( error_code & PF_RSVD ) )
2005-04-16 15:20:36 -07:00
pgtable_bad ( address , regs , error_code ) ;
/*
 * If we're in an interrupt, have no user context or are running in an
 * atomic region then we must not take the fault.
 */
if ( unlikely ( in_atomic ( ) | | ! mm ) )
goto bad_area_nosemaphore ;
2007-09-19 11:37:14 -07:00
/*
 * User-mode registers count as a user access even for any
 * potential system fault or CPU buglet.
 */
if ( user_mode_vm ( regs ) )
error_code | = PF_USER ;
2008-01-30 13:34:10 +01:00
again :
# endif
2005-04-16 15:20:36 -07:00
/*
 * When running in the kernel we expect faults to occur only to
 * addresses in user space.  All other faults represent errors in the
 * kernel and should generate an OOPS.  Unfortunately, in the case of an
 * erroneous fault occurring in a code path which already holds mmap_sem
 * we will deadlock attempting to validate the fault against the
 * address space.  Luckily the kernel only validly references user
 * space from well defined areas of code, which are listed in the
 * exceptions table.
 *
 * As the vast majority of faults will be valid we will only perform
 * the source reference check when there is a possibility of a deadlock.
 * Attempt to lock the address space, if we cannot we then validate the
 * source.  If this is invalid we can skip the address space check,
 * thus avoiding the deadlock.
 */
if ( ! down_read_trylock ( & mm - > mmap_sem ) ) {
2006-01-11 22:44:09 +01:00
if ( ( error_code & PF_USER ) = = 0 & &
2008-01-30 13:30:56 +01:00
! search_exception_tables ( regs - > ip ) )
2005-04-16 15:20:36 -07:00
goto bad_area_nosemaphore ;
down_read ( & mm - > mmap_sem ) ;
}
vma = find_vma ( mm , address ) ;
if ( ! vma )
goto bad_area ;
2008-01-30 13:34:10 +01:00
if ( vma - > vm_start < = address )
2005-04-16 15:20:36 -07:00
goto good_area ;
/* The address lies below the vma: only a grows-down vma may be extended. */
if ( ! ( vma - > vm_flags & VM_GROWSDOWN ) )
goto bad_area ;
2008-01-30 13:32:19 +01:00
if ( error_code & PF_USER ) {
2008-01-30 13:33:13 +01:00
/*
 * Accessing the stack below %sp is always a bug.
 * The large cushion allows instructions like enter
 * and pusha to work.  ("enter $65535,$31" pushes
 * 32 pointers and then decrements %sp by 65535.)
 */
2008-01-30 13:30:56 +01:00
if ( address + 65536 + 32 * sizeof ( unsigned long ) < regs - > sp )
2005-04-16 15:20:36 -07:00
goto bad_area ;
}
if ( expand_stack ( vma , address ) )
goto bad_area ;
/*
 * Ok, we have a good vm_area for this memory access, so
 * we can handle it..
 */
good_area :
2008-01-30 13:32:35 +01:00
/* From here on a failure is a permission problem, not a missing mapping. */
si_code = SEGV_ACCERR ;
2005-04-16 15:20:36 -07:00
write = 0 ;
2006-01-11 22:44:09 +01:00
switch ( error_code & ( PF_PROT | PF_WRITE ) ) {
2008-01-30 13:32:19 +01:00
default : /* 3: write, present */
/* fall through */
case PF_WRITE : /* write, not present */
if ( ! ( vma - > vm_flags & VM_WRITE ) )
goto bad_area ;
write + + ;
break ;
case PF_PROT : /* read, present */
goto bad_area ;
case 0 : /* read, not present */
if ( ! ( vma - > vm_flags & ( VM_READ | VM_EXEC | VM_WRITE ) ) )
2005-04-16 15:20:36 -07:00
goto bad_area ;
}
2008-01-30 13:34:10 +01:00
# ifdef CONFIG_X86_32
survive :
# endif
2005-04-16 15:20:36 -07:00
/*
 * If for any reason at all we couldn't handle the fault,
 * make sure we exit gracefully rather than endlessly redo
 * the fault.
 */
2007-07-19 01:47:05 -07:00
fault = handle_mm_fault ( mm , vma , address , write ) ;
if ( unlikely ( fault & VM_FAULT_ERROR ) ) {
if ( fault & VM_FAULT_OOM )
goto out_of_memory ;
else if ( fault & VM_FAULT_SIGBUS )
goto do_sigbus ;
BUG ( ) ;
2005-04-16 15:20:36 -07:00
}
2007-07-19 01:47:05 -07:00
/* Per-task fault accounting. */
if ( fault & VM_FAULT_MAJOR )
tsk - > maj_flt + + ;
else
tsk - > min_flt + + ;
2008-01-30 13:33:23 +01:00
# ifdef CONFIG_X86_32
/*
 * Did it hit the DOS screen memory VA from vm86 mode?
 */
if ( v8086_mode ( regs ) ) {
unsigned long bit = ( address - 0xA0000 ) > > PAGE_SHIFT ;
if ( bit < 32 )
tsk - > thread . screen_bitmap | = 1 < < bit ;
}
# endif
2005-04-16 15:20:36 -07:00
up_read ( & mm - > mmap_sem ) ;
return ;
/*
 * Something tried to access memory that isn't in our memory map..
 * Fix it, but check if it's kernel or user first..
 */
bad_area :
up_read ( & mm - > mmap_sem ) ;
bad_area_nosemaphore :
/* User mode accesses just cause a SIGSEGV */
2006-01-11 22:44:09 +01:00
if ( error_code & PF_USER ) {
2007-06-06 23:34:04 -04:00
/*
 * It's possible to have interrupts off here.
 */
local_irq_enable ( ) ;
2008-01-30 13:34:10 +01:00
/*
 * Valid to do another page fault here because this one came
 * from user space.
 */
2005-04-16 15:20:36 -07:00
if ( is_prefetch ( regs , address , error_code ) )
return ;
2008-01-30 13:34:09 +01:00
if ( is_errata100 ( regs , address ) )
2005-04-16 15:20:36 -07:00
return ;
2007-07-22 11:12:28 +02:00
if ( show_unhandled_signals & & unhandled_signal ( tsk , SIGSEGV ) & &
printk_ratelimit ( ) ) {
2005-04-16 15:20:36 -07:00
printk (
2008-01-30 13:33:13 +01:00
# ifdef CONFIG_X86_32
2008-01-30 13:33:16 +01:00
" %s%s[%d]: segfault at %lx ip %08lx sp %08lx error %lx " ,
2008-01-30 13:33:13 +01:00
# else
2008-01-30 13:33:18 +01:00
" %s%s[%d]: segfault at %lx ip %lx sp %lx error %lx " ,
2008-01-30 13:33:13 +01:00
# endif
task_pid_nr ( tsk ) > 1 ? KERN_INFO : KERN_EMERG ,
tsk - > comm , task_pid_nr ( tsk ) , address , regs - > ip ,
regs - > sp , error_code ) ;
2008-01-30 13:33:18 +01:00
print_vma_addr ( " in " , regs - > ip ) ;
printk ( " \n " ) ;
2005-04-16 15:20:36 -07:00
}
2008-01-30 13:32:19 +01:00
2005-04-16 15:20:36 -07:00
tsk - > thread . cr2 = address ;
/* Kernel addresses are always protection faults */
tsk - > thread . error_code = error_code | ( address > = TASK_SIZE ) ;
tsk - > thread . trap_no = 14 ;
2008-01-30 13:32:35 +01:00
force_sig_info_fault ( SIGSEGV , si_code , address , tsk ) ;
2005-04-16 15:20:36 -07:00
return ;
}
2008-01-30 13:34:09 +01:00
if ( is_f00f_bug ( regs , address ) )
return ;
2005-04-16 15:20:36 -07:00
no_context :
/* Are we prepared to handle this kernel fault? */
2008-01-30 13:32:19 +01:00
if ( fixup_exception ( regs ) )
2005-04-16 15:20:36 -07:00
return ;
2008-01-30 13:32:19 +01:00
/*
 * X86_32
 * Valid to do another page fault here, because if this fault
 * had been triggered by is_prefetch fixup_exception would have
 * handled it.
 *
 * X86_64
 * Hall of shame of CPU/BIOS bugs.
 */
2008-01-30 13:32:19 +01:00
if ( is_prefetch ( regs , address , error_code ) )
return ;
2005-04-16 15:20:36 -07:00
if ( is_errata93 ( regs , address ) )
2008-01-30 13:32:19 +01:00
return ;
2005-04-16 15:20:36 -07:00
/*
 * Oops. The kernel tried to access some bad page. We'll have to
 * terminate things with extreme prejudice.
 */
2008-01-30 13:34:10 +01:00
# ifdef CONFIG_X86_32
bust_spinlocks ( 1 ) ;
2008-01-30 13:34:11 +01:00
# else
flags = oops_begin ( ) ;
# endif
2008-01-30 13:34:10 +01:00
show_fault_oops ( regs , error_code , address ) ;
2005-04-16 15:20:36 -07:00
2008-01-30 13:34:10 +01:00
tsk - > thread . cr2 = address ;
tsk - > thread . trap_no = 14 ;
tsk - > thread . error_code = error_code ;
2008-01-30 13:34:11 +01:00
# ifdef CONFIG_X86_32
2008-01-30 13:34:10 +01:00
die ( " Oops " , regs , error_code ) ;
bust_spinlocks ( 0 ) ;
do_exit ( SIGKILL ) ;
2008-01-30 13:34:11 +01:00
# else
2008-01-30 13:31:23 +01:00
if ( __die ( " Oops " , regs , error_code ) )
regs = NULL ;
2005-04-16 15:20:36 -07:00
/* Executive summary in case the body of the oops scrolled away */
printk ( KERN_EMERG " CR2: %016lx \n " , address ) ;
2008-01-30 13:31:23 +01:00
oops_end ( flags , regs , SIGKILL ) ;
2008-01-30 13:34:10 +01:00
# endif
2005-04-16 15:20:36 -07:00
/*
 * We ran out of memory, or some other thing happened to us that made
 * us unable to handle the page fault gracefully.
 */
out_of_memory :
up_read ( & mm - > mmap_sem ) ;
2008-01-30 13:34:10 +01:00
/* The global init task is not killed: it yields and retries the fault. */
if ( is_global_init ( tsk ) ) {
yield ( ) ;
2008-01-30 13:34:11 +01:00
# ifdef CONFIG_X86_32
2008-01-30 13:34:10 +01:00
down_read ( & mm - > mmap_sem ) ;
goto survive ;
# else
2005-04-16 15:20:36 -07:00
goto again ;
2008-01-30 13:34:10 +01:00
# endif
2008-01-30 13:34:11 +01:00
}
2005-04-16 15:20:36 -07:00
printk ( " VM: killing process %s \n " , tsk - > comm ) ;
2008-01-30 13:32:59 +01:00
if ( error_code & PF_USER )
2007-07-21 17:11:17 +02:00
do_group_exit ( SIGKILL ) ;
2005-04-16 15:20:36 -07:00
goto no_context ;
do_sigbus :
up_read ( & mm - > mmap_sem ) ;
/* Kernel mode? Handle exceptions or die */
2006-01-11 22:44:09 +01:00
if ( ! ( error_code & PF_USER ) )
2005-04-16 15:20:36 -07:00
goto no_context ;
2008-01-30 13:34:10 +01:00
# ifdef CONFIG_X86_32
/* User space => ok to do another page fault */
if ( is_prefetch ( regs , address , error_code ) )
return ;
# endif
2005-04-16 15:20:36 -07:00
tsk - > thread . cr2 = address ;
tsk - > thread . error_code = error_code ;
tsk - > thread . trap_no = 14 ;
2008-01-30 13:32:35 +01:00
force_sig_info_fault ( SIGBUS , BUS_ADRERR , address , tsk ) ;
2005-04-16 15:20:36 -07:00
}
2005-11-05 17:25:54 +01:00
2008-01-30 13:34:10 +01:00
# ifdef CONFIG_X86_64
2006-03-25 16:29:40 +01:00
/* Held by vmalloc_sync_all() while it walks pgd_list. */
DEFINE_SPINLOCK ( pgd_lock ) ;
2007-05-02 19:27:10 +02:00
/* List of the pages holding pgds; entries are linked through page->lru. */
LIST_HEAD ( pgd_list ) ;
2008-01-30 13:34:10 +01:00
# endif
2006-03-25 16:29:40 +01:00
void vmalloc_sync_all ( void )
{
2008-01-30 13:34:10 +01:00
# ifdef CONFIG_X86_32
/*
* Note that races in the updates of insync and start aren ' t
* problematic : insync can only get set bits added , and updates to
* start are only improving performance ( without affecting correctness
* if undone ) .
*/
static DECLARE_BITMAP ( insync , PTRS_PER_PGD ) ;
static unsigned long start = TASK_SIZE ;
unsigned long address ;
if ( SHARED_KERNEL_PMD )
return ;
BUILD_BUG_ON ( TASK_SIZE & ~ PGDIR_MASK ) ;
for ( address = start ; address > = TASK_SIZE ; address + = PGDIR_SIZE ) {
if ( ! test_bit ( pgd_index ( address ) , insync ) ) {
unsigned long flags ;
struct page * page ;
spin_lock_irqsave ( & pgd_lock , flags ) ;
for ( page = pgd_list ; page ; page =
( struct page * ) page - > index )
if ( ! vmalloc_sync_one ( page_address ( page ) ,
address ) ) {
BUG_ON ( page ! = pgd_list ) ;
break ;
}
spin_unlock_irqrestore ( & pgd_lock , flags ) ;
if ( ! page )
set_bit ( pgd_index ( address ) , insync ) ;
}
if ( address = = start & & test_bit ( pgd_index ( address ) , insync ) )
start = address + PGDIR_SIZE ;
}
# else /* CONFIG_X86_64 */
2008-01-30 13:33:13 +01:00
/*
* Note that races in the updates of insync and start aren ' t
* problematic : insync can only get set bits added , and updates to
* start are only improving performance ( without affecting correctness
* if undone ) .
*/
2006-03-25 16:29:40 +01:00
static DECLARE_BITMAP ( insync , PTRS_PER_PGD ) ;
static unsigned long start = VMALLOC_START & PGDIR_MASK ;
unsigned long address ;
for ( address = start ; address < = VMALLOC_END ; address + = PGDIR_SIZE ) {
if ( ! test_bit ( pgd_index ( address ) , insync ) ) {
const pgd_t * pgd_ref = pgd_offset_k ( address ) ;
struct page * page ;
if ( pgd_none ( * pgd_ref ) )
continue ;
spin_lock ( & pgd_lock ) ;
2007-05-02 19:27:10 +02:00
list_for_each_entry ( page , & pgd_list , lru ) {
2006-03-25 16:29:40 +01:00
pgd_t * pgd ;
pgd = ( pgd_t * ) page_address ( page ) + pgd_index ( address ) ;
if ( pgd_none ( * pgd ) )
set_pgd ( pgd , * pgd_ref ) ;
else
2006-09-25 23:31:48 -07:00
BUG_ON ( pgd_page_vaddr ( * pgd ) ! = pgd_page_vaddr ( * pgd_ref ) ) ;
2006-03-25 16:29:40 +01:00
}
spin_unlock ( & pgd_lock ) ;
set_bit ( pgd_index ( address ) , insync ) ;
}
if ( address = = start )
start = address + PGDIR_SIZE ;
}
/* Check that there is no need to do the same for the modules area. */
BUILD_BUG_ON ( ! ( MODULES_VADDR > __START_KERNEL ) ) ;
2008-01-30 13:32:19 +01:00
BUILD_BUG_ON ( ! ( ( ( MODULES_END - 1 ) & PGDIR_MASK ) = =
2006-03-25 16:29:40 +01:00
( __START_KERNEL & PGDIR_MASK ) ) ) ;
2008-01-30 13:34:10 +01:00
# endif
2006-03-25 16:29:40 +01:00
}