/*
 *  linux/arch/i386/mm/fault.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/smp_lock.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/tty.h>
#include <linux/vt_kern.h>		/* For unblank_screen() */
#include <linux/highmem.h>
#include <linux/module.h>
#include <linux/kprobes.h>

#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/desc.h>
#include <asm/kdebug.h>

extern void die(const char *, struct pt_regs *, long);

/*
 * Unlock any spinlocks which will prevent us from getting the
 * message out
 */
void bust_spinlocks(int yes)
{
	int loglevel_save = console_loglevel;

	if (yes) {
		oops_in_progress = 1;
		return;
	}
#ifdef CONFIG_VT
	unblank_screen();
#endif
	oops_in_progress = 0;
	/*
	 * OK, the message is on the console.  Now we call printk()
	 * without oops_in_progress set so that printk will give klogd
	 * a poke.  Hold onto your hats...
	 */
	console_loglevel = 15;		/* NMI oopser may have shut the console up */
	printk(" ");
	console_loglevel = loglevel_save;
}
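
/*
 * Usage sketch: the oops path later in this file brackets its console
 * output with this function --
 *
 *	bust_spinlocks(1);
 *	... printk() the oops details ...
 *	die("Oops", regs, error_code);
 *	bust_spinlocks(0);
 *
 * i.e. "1" only marks an oops in progress, and "0" does the unblanking
 * and the klogd poke once the message is out.
 */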

/*
 * Return EIP plus the CS segment base.  The segment limit is also
 * adjusted, clamped to the kernel/user address space (whichever is
 * appropriate), and returned in *eip_limit.
 *
 * The segment is checked, because it might have been changed by another
 * task between the original faulting instruction and here.
 *
 * If CS is no longer a valid code segment, or if EIP is beyond the
 * limit, or if it is a kernel address when CS is not a kernel segment,
 * then the returned value will be greater than *eip_limit.
 *
 * This is slow, but is very rarely executed.
 */
static inline unsigned long get_segment_eip(struct pt_regs *regs,
					    unsigned long *eip_limit)
{
	unsigned long eip = regs->eip;
	unsigned seg = regs->xcs & 0xffff;
	u32 seg_ar, seg_limit, base, *desc;

	/* The standard kernel/user address space limit. */
	*eip_limit = (seg & 3) ? USER_DS.seg : KERNEL_DS.seg;

	/* Unlikely, but must come before segment checks. */
	if (unlikely((regs->eflags & VM_MASK) != 0))
		return eip + (seg << 4);

	/* By far the most common cases. */
	if (likely(seg == __USER_CS || seg == __KERNEL_CS))
		return eip;

	/* Check the segment exists, is within the current LDT/GDT size,
	   that kernel/user (ring 0..3) has the appropriate privilege,
	   that it's a code segment, and get the limit. */
	__asm__ ("larl %3,%0; lsll %3,%1"
		 : "=&r" (seg_ar), "=r" (seg_limit) : "0" (0), "rm" (seg));
	if ((~seg_ar & 0x9800) || eip > seg_limit) {
		*eip_limit = 0;
		return 1;	 /* So that returned eip > *eip_limit. */
	}

	/* Get the GDT/LDT descriptor base.
	   When you look for races in this code remember that
	   LDT and other horrors are only used in user space. */
	if (seg & (1<<2)) {
		/* Must lock the LDT while reading it. */
		down(&current->mm->context.sem);
		desc = current->mm->context.ldt;
		desc = (void *)desc + (seg & ~7);
	} else {
		/* Must disable preemption while reading the GDT. */
		desc = (u32 *)get_cpu_gdt_table(get_cpu());
		desc = (void *)desc + (seg & ~7);
	}

	/* Decode the code segment base from the descriptor */
	base = get_desc_base((unsigned long *)desc);

	if (seg & (1<<2)) {
		up(&current->mm->context.sem);
	} else
		put_cpu();

	/* Adjust EIP and segment limit, and clamp at the kernel limit.
	   It's legitimate for segments to wrap at 0xffffffff. */
	seg_limit += base;
	if (seg_limit < *eip_limit && seg_limit >= base)
		*eip_limit = seg_limit;
	return eip + base;
}
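
/*
 * Illustrative note on the selector arithmetic above, following the x86
 * segment selector layout:
 *
 *	bits 1..0   RPL   - requested privilege level ("seg & 3")
 *	bit  2      TI    - table indicator, 0 = GDT, 1 = LDT ("seg & (1<<2)")
 *	bits 15..3  index - so "seg & ~7" is the byte offset of the 8-byte
 *	                    descriptor within the GDT or LDT
 *
 * In vm86 mode there is no descriptor at all, which is why the early
 * return above simply forms the real-mode address eip + (seg << 4).
 */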

/*
 * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
 * Check that here and ignore it.
 */
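
/*
 * Worked example (illustrative byte sequence): "prefetchnta %cs:(%esi)"
 * encodes as 2e 0f 18 06.  The scan below sees the 0x2e segment-override
 * prefix (instr_hi 0x20, instr_lo 0xe, so scanning continues), then 0x0f
 * followed by 0x18, at which point "prefetch" is set and the spurious
 * fault is ignored.  The loop is bounded at 15 bytes, the maximum length
 * of an x86 instruction.
 */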
static int __is_prefetch(struct pt_regs *regs, unsigned long addr)
{
	unsigned long limit;
	unsigned long instr = get_segment_eip(regs, &limit);
	int scan_more = 1;
	int prefetch = 0;
	int i;

	for (i = 0; scan_more && i < 15; i++) {
		unsigned char opcode;
		unsigned char instr_hi;
		unsigned char instr_lo;

		if (instr > limit)
			break;
		if (__get_user(opcode, (unsigned char __user *) instr))
			break;

		instr_hi = opcode & 0xf0;
		instr_lo = opcode & 0x0f;
		instr++;

		switch (instr_hi) {
		case 0x20:
		case 0x30:
			/* Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. */
			scan_more = ((instr_lo & 7) == 0x6);
			break;

		case 0x60:
			/* 0x64 thru 0x67 are valid prefixes in all modes. */
			scan_more = (instr_lo & 0xC) == 0x4;
			break;
		case 0xF0:
			/* 0xF0, 0xF2, and 0xF3 are valid prefixes */
			scan_more = !instr_lo || (instr_lo>>1) == 1;
			break;
		case 0x00:
			/* Prefetch instruction is 0x0F0D or 0x0F18 */
			scan_more = 0;
			if (instr > limit)
				break;
			if (__get_user(opcode, (unsigned char __user *) instr))
				break;
			prefetch = (instr_lo == 0xF) &&
				(opcode == 0x0D || opcode == 0x18);
			break;
		default:
			scan_more = 0;
			break;
		}
	}
	return prefetch;
}

static inline int is_prefetch(struct pt_regs *regs, unsigned long addr,
			      unsigned long error_code)
{
	if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
		     boot_cpu_data.x86 >= 6)) {
		/* Catch an obscure case of prefetch inside an NX page. */
		if (nx_enabled && (error_code & 16))
			return 0;
		return __is_prefetch(regs, addr);
	}
	return 0;
}

static noinline void force_sig_info_fault(int si_signo, int si_code,
					  unsigned long address, struct task_struct *tsk)
{
	siginfo_t info;

	info.si_signo = si_signo;
	info.si_errno = 0;
	info.si_code = si_code;
	info.si_addr = (void __user *)address;
	force_sig_info(si_signo, &info, tsk);
}

fastcall void do_invalid_op(struct pt_regs *, unsigned long);

static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
{
	unsigned index = pgd_index(address);
	pgd_t *pgd_k;
	pud_t *pud, *pud_k;
	pmd_t *pmd, *pmd_k;

	pgd += index;
	pgd_k = init_mm.pgd + index;

	if (!pgd_present(*pgd_k))
		return NULL;

	/*
	 * set_pgd(pgd, *pgd_k); here would be useless on PAE
	 * and redundant with the set_pmd() on non-PAE. As would
	 * set_pud.
	 */

	pud = pud_offset(pgd, address);
	pud_k = pud_offset(pgd_k, address);
	if (!pud_present(*pud_k))
		return NULL;

	pmd = pmd_offset(pud, address);
	pmd_k = pmd_offset(pud_k, address);
	if (!pmd_present(*pmd_k))
		return NULL;
	if (!pmd_present(*pmd))
		set_pmd(pmd, *pmd_k);
	else
		BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
	return pmd_k;
}
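
/*
 * Note on the copy above: on the two-level (non-PAE) i386 page table the
 * pud and pmd levels are folded into the pgd, so pud_offset()/pmd_offset()
 * return the pgd entry itself and the set_pmd() effectively copies one
 * top-level entry -- PGDIR_SIZE worth of kernel virtual address space --
 * from the init_mm reference page table into the faulting page table.
 */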

/*
 * Handle a fault on the vmalloc or module mapping area
 *
 * This assumes no large pages in there.
 */
static inline int vmalloc_fault(unsigned long address)
{
	unsigned long pgd_paddr;
	pmd_t *pmd_k;
	pte_t *pte_k;
	/*
	 * Synchronize this task's top level page-table
	 * with the 'reference' page table.
	 *
	 * Do _not_ use "current" here. We might be inside
	 * an interrupt in the middle of a task switch..
	 */
	pgd_paddr = read_cr3();
	pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
	if (!pmd_k)
		return -1;
	pte_k = pte_offset_kernel(pmd_k, address);
	if (!pte_present(*pte_k))
		return -1;
	return 0;
}
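
/*
 * Usage sketch: do_page_fault() below only tries this fast path for
 * kernel addresses, and only when the fault was neither a protection
 * fault, a user-mode fault, nor a reserved-bit fault:
 *
 *	if (!(error_code & 0x0000000d) && vmalloc_fault(address) >= 0)
 *		return;
 *
 * A return of -1 just means "not a lazily-mapped vmalloc page" and the
 * fault falls through to the normal handling.
 */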

/*
 * This routine handles page faults.  It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 *
 * error_code:
 *	bit 0 == 0 means no page found, 1 means protection fault
 *	bit 1 == 0 means read, 1 means write
 *	bit 2 == 0 means kernel, 1 means user-mode
 *	bit 3 == 1 means use of reserved bit detected
 *	bit 4 == 1 means fault was an instruction fetch
 */
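
/*
 * Illustrative decoding: the handler tests these bits as plain masks,
 *
 *	error_code & 1		protection fault (page was present)
 *	error_code & 2		write access
 *	error_code & 4		fault raised while in user mode
 *	error_code & 8		reserved bit set in a page-table entry
 *	error_code & 16		instruction fetch (meaningful with NX)
 *
 * so, for example, a user-mode write to a present but read-only page
 * arrives with error_code == 7.
 */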
fastcall void __kprobes do_page_fault(struct pt_regs *regs,
				      unsigned long error_code)
{
	struct task_struct *tsk;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	unsigned long address;
	unsigned long page;
	int write, si_code;

	/* get the address */
	address = read_cr2();

	tsk = current;

	si_code = SEGV_MAPERR;

	/*
	 * We fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 *
	 * This verifies that the fault happens in kernel space
	 * (error_code & 4) == 0, and that the fault was not a
	 * protection error (error_code & 9) == 0.
	 */
	if (unlikely(address >= TASK_SIZE)) {
		if (!(error_code & 0x0000000d) && vmalloc_fault(address) >= 0)
			return;
		if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
						SIGSEGV) == NOTIFY_STOP)
			return;
		/*
		 * Don't take the mm semaphore here. If we fixup a prefetch
		 * fault we could otherwise deadlock.
		 */
		goto bad_area_nosemaphore;
	}

	if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
					SIGSEGV) == NOTIFY_STOP)
		return;

	/* It's safe to allow irq's after cr2 has been saved and the vmalloc
	   fault has been handled. */
	if (regs->eflags & (X86_EFLAGS_IF|VM_MASK))
		local_irq_enable();

	mm = tsk->mm;

	/*
	 * If we're in an interrupt, have no user context or are running in an
	 * atomic region then we must not take the fault..
	 */
	if (in_atomic() || !mm)
		goto bad_area_nosemaphore;

	/* When running in the kernel we expect faults to occur only to
	 * addresses in user space.  All other faults represent errors in the
	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
	 * erroneous fault occurring in a code path which already holds mmap_sem
	 * we will deadlock attempting to validate the fault against the
	 * address space.  Luckily the kernel only validly references user
	 * space from well defined areas of code, which are listed in the
	 * exceptions table.
	 *
	 * As the vast majority of faults will be valid we will only perform
	 * the source reference check when there is a possibility of a deadlock.
	 * Attempt to lock the address space, if we cannot we then validate the
	 * source.  If this is invalid we can skip the address space check,
	 * thus avoiding the deadlock.
	 */
	if (!down_read_trylock(&mm->mmap_sem)) {
		if ((error_code & 4) == 0 &&
		    !search_exception_tables(regs->eip))
			goto bad_area_nosemaphore;
		down_read(&mm->mmap_sem);
	}

	vma = find_vma(mm, address);
	if (!vma)
		goto bad_area;
	if (vma->vm_start <= address)
		goto good_area;
	if (!(vma->vm_flags & VM_GROWSDOWN))
		goto bad_area;
	if (error_code & 4) {
		/*
		 * Accessing the stack below %esp is always a bug.
		 * The large cushion allows instructions like enter
		 * and pusha to work.  ("enter $65535,$31" pushes
		 * 32 pointers and then decrements %esp by 65535.)
		 */
		if (address + 65536 + 32 * sizeof(unsigned long) < regs->esp)
			goto bad_area;
	}
	if (expand_stack(vma, address))
		goto bad_area;
/*
 * Ok, we have a good vm_area for this memory access, so
 * we can handle it..
 */
good_area:
	si_code = SEGV_ACCERR;
	write = 0;
	switch (error_code & 3) {
		default:	/* 3: write, present */
#ifdef TEST_VERIFY_AREA
			if (regs->cs == KERNEL_CS)
				printk("WP fault at %08lx\n", regs->eip);
#endif
			/* fall through */
		case 2:		/* write, not present */
			if (!(vma->vm_flags & VM_WRITE))
				goto bad_area;
			write++;
			break;
		case 1:		/* read, present */
			goto bad_area;
		case 0:		/* read, not present */
			if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
				goto bad_area;
	}
 survive:
	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	switch (handle_mm_fault(mm, vma, address, write)) {
		case VM_FAULT_MINOR:
			tsk->min_flt++;
			break;
		case VM_FAULT_MAJOR:
			tsk->maj_flt++;
			break;
		case VM_FAULT_SIGBUS:
			goto do_sigbus;
		case VM_FAULT_OOM:
			goto out_of_memory;
		default:
			BUG();
	}

	/*
	 * Did it hit the DOS screen memory VA from vm86 mode?
	 */
	if (regs->eflags & VM_MASK) {
		unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
		if (bit < 32)
			tsk->thread.screen_bitmap |= 1 << bit;
	}
	up_read(&mm->mmap_sem);
	return;

/*
 * Something tried to access memory that isn't in our memory map..
 * Fix it, but check if it's kernel or user first..
 */
bad_area:
	up_read(&mm->mmap_sem);

bad_area_nosemaphore:
	/* User mode accesses just cause a SIGSEGV */
	if (error_code & 4) {
		/*
		 * Valid to do another page fault here because this one came
		 * from user space.
		 */
		if (is_prefetch(regs, address, error_code))
			return;

		tsk->thread.cr2 = address;
		/* Kernel addresses are always protection faults */
		tsk->thread.error_code = error_code | (address >= TASK_SIZE);
		tsk->thread.trap_no = 14;
		force_sig_info_fault(SIGSEGV, si_code, address, tsk);
		return;
	}

#ifdef CONFIG_X86_F00F_BUG
	/*
	 * Pentium F0 0F C7 C8 bug workaround.
	 */
	if (boot_cpu_data.f00f_bug) {
		unsigned long nr;

		nr = (address - idt_descr.address) >> 3;

		if (nr == 6) {
			do_invalid_op(regs, 0);
			return;
		}
	}
#endif

no_context:
	/* Are we prepared to handle this kernel fault?  */
	if (fixup_exception(regs))
		return;

	/*
	 * Valid to do another page fault here, because if this fault
	 * had been triggered by is_prefetch fixup_exception would have
	 * handled it.
	 */
	if (is_prefetch(regs, address, error_code))
		return;

/*
 * Oops. The kernel tried to access some bad page. We'll have to
 * terminate things with extreme prejudice.
 */

	bust_spinlocks(1);

	if (oops_may_print()) {
#ifdef CONFIG_X86_PAE
		if (error_code & 16) {
			pte_t *pte = lookup_address(address);

			if (pte && pte_present(*pte) && !pte_exec_kernel(*pte))
				printk(KERN_CRIT "kernel tried to execute "
					"NX-protected page - exploit attempt? "
					"(uid: %d)\n", current->uid);
		}
#endif
		if (address < PAGE_SIZE)
			printk(KERN_ALERT "BUG: unable to handle kernel NULL "
					"pointer dereference");
		else
			printk(KERN_ALERT "BUG: unable to handle kernel paging"
					" request");
		printk(" at virtual address %08lx\n", address);
		printk(KERN_ALERT "printing eip:\n");
		printk("%08lx\n", regs->eip);
	}
	page = read_cr3();
	page = ((unsigned long *) __va(page))[address >> 22];
	if (oops_may_print())
		printk(KERN_ALERT "*pde = %08lx\n", page);
	/*
	 * We must not directly access the pte in the highpte
	 * case, the page table might be allocated in highmem.
	 * And lets rather not kmap-atomic the pte, just in case
	 * it's allocated already.
	 */
#ifndef CONFIG_HIGHPTE
	if ((page & 1) && oops_may_print()) {
		page &= PAGE_MASK;
		address &= 0x003ff000;
		page = ((unsigned long *) __va(page))[address >> PAGE_SHIFT];
		printk(KERN_ALERT "*pte = %08lx\n", page);
	}
#endif
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	die("Oops", regs, error_code);
	bust_spinlocks(0);
	do_exit(SIGKILL);

/*
 * We ran out of memory, or some other thing happened to us that made
 * us unable to handle the page fault gracefully.
 */
out_of_memory:
	up_read(&mm->mmap_sem);
	if (tsk->pid == 1) {
		yield();
		down_read(&mm->mmap_sem);
		goto survive;
	}
	printk("VM: killing process %s\n", tsk->comm);
	if (error_code & 4)
		do_exit(SIGKILL);
	goto no_context;

do_sigbus:
	up_read(&mm->mmap_sem);

	/* Kernel mode? Handle exceptions or die */
	if (!(error_code & 4))
		goto no_context;

	/* User space => ok to do another page fault */
	if (is_prefetch(regs, address, error_code))
		return;

	tsk->thread.cr2 = address;
	tsk->thread.error_code = error_code;
	tsk->thread.trap_no = 14;
	force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
}

#ifndef CONFIG_X86_PAE
void vmalloc_sync_all(void)
{
	/*
	 * Note that races in the updates of insync and start aren't
	 * problematic: insync can only get set bits added, and updates to
	 * start are only improving performance (without affecting correctness
	 * if undone).
	 */
	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
	static unsigned long start = TASK_SIZE;
	unsigned long address;

	BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
	for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
		if (!test_bit(pgd_index(address), insync)) {
			unsigned long flags;
			struct page *page;

			spin_lock_irqsave(&pgd_lock, flags);
			for (page = pgd_list; page; page =
					(struct page *)page->index)
				if (!vmalloc_sync_one(page_address(page),
								address)) {
					BUG_ON(page != pgd_list);
					break;
				}
			spin_unlock_irqrestore(&pgd_lock, flags);
			if (!page)
				set_bit(pgd_index(address), insync);
		}
		if (address == start && test_bit(pgd_index(address), insync))
			start = address + PGDIR_SIZE;
	}
}
#endif