/*
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/tty.h>
#include <linux/vt_kern.h>		/* For unblank_screen() */
#include <linux/compiler.h>
#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>

#include <asm/system.h>
#include <asm/pgalloc.h>
#include <asm/smp.h>
#include <asm/tlbflush.h>
#include <asm/proto.h>
#include <asm-generic/sections.h>
/*
 * Page fault error code bits
 *	bit 0 == 0 means no page found, 1 means protection fault
 *	bit 1 == 0 means read, 1 means write
 *	bit 2 == 0 means kernel, 1 means user-mode
 *	bit 3 == 1 means use of reserved bit detected
 *	bit 4 == 1 means fault was an instruction fetch
 */
#define PF_PROT		(1<<0)
#define PF_WRITE	(1<<1)
#define PF_USER		(1<<2)
#define PF_RSVD		(1<<3)
#define PF_INSTR	(1<<4)
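
/*
 * Example (added commentary): a user-mode write that reaches a present
 * page but fails the protection check faults with
 * error_code == (PF_PROT|PF_WRITE|PF_USER) == 0x7, while a user-mode
 * read of an unmapped address faults with error_code == PF_USER == 0x4.
 */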

static inline int notify_page_fault(struct pt_regs *regs)
{
#ifdef CONFIG_KPROBES
	int ret = 0;

	/* kprobe_running() needs smp_processor_id() */
	if (!user_mode(regs)) {
		preempt_disable();
		if (kprobe_running() && kprobe_fault_handler(regs, 14))
			ret = 1;
		preempt_enable();
	}

	return ret;
#else
	return 0;
#endif
}
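
/*
 * Added note: the "14" passed to kprobe_fault_handler() above is the
 * x86 page-fault exception vector; the same value is stored in
 * tsk->thread.trap_no throughout this file.
 */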

#ifdef CONFIG_X86_32
/*
 * Return EIP plus the CS segment base.  The segment limit is also
 * adjusted, clamped to the kernel/user address space (whichever is
 * appropriate), and returned in *eip_limit.
 *
 * The segment is checked, because it might have been changed by another
 * task between the original faulting instruction and here.
 *
 * If CS is no longer a valid code segment, or if EIP is beyond the
 * limit, or if it is a kernel address when CS is not a kernel segment,
 * then the returned value will be greater than *eip_limit.
 *
 * This is slow, but is very rarely executed.
 */
static inline unsigned long get_segment_eip(struct pt_regs *regs,
					    unsigned long *eip_limit)
{
	unsigned long ip = regs->ip;
	unsigned seg = regs->cs & 0xffff;
	u32 seg_ar, seg_limit, base, *desc;

	/* Unlikely, but must come before segment checks. */
	if (unlikely(regs->flags & VM_MASK)) {
		base = seg << 4;
		*eip_limit = base + 0xffff;
		return base + (ip & 0xffff);
	}

	/* The standard kernel/user address space limit. */
	*eip_limit = user_mode(regs) ? USER_DS.seg : KERNEL_DS.seg;

	/* By far the most common cases. */
	if (likely(SEGMENT_IS_FLAT_CODE(seg)))
		return ip;

	/* Check the segment exists, is within the current LDT/GDT size,
	   that kernel/user (ring 0..3) has the appropriate privilege,
	   that it's a code segment, and get the limit. */
	__asm__("larl %3,%0; lsll %3,%1"
		: "=&r" (seg_ar), "=r" (seg_limit) : "0" (0), "rm" (seg));
	if ((~seg_ar & 0x9800) || ip > seg_limit) {
		*eip_limit = 0;
		return 1;	 /* So that returned ip > *eip_limit. */
	}

	/* Get the GDT/LDT descriptor base.
	   When you look for races in this code remember that
	   LDT and other horrors are only used in user space. */
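	/*
	 * Added note: bit 2 of the selector is the table indicator
	 * (1 = LDT, 0 = GDT), the low two bits are the RPL, and
	 * "seg & ~7" is therefore the byte offset of the 8-byte
	 * descriptor within the chosen table.
	 */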
	if (seg & (1<<2)) {
		/* Must lock the LDT while reading it. */
		mutex_lock(&current->mm->context.lock);
		desc = current->mm->context.ldt;
		desc = (void *)desc + (seg & ~7);
	} else {
		/* Must disable preemption while reading the GDT. */
		desc = (u32 *)get_cpu_gdt_table(get_cpu());
		desc = (void *)desc + (seg & ~7);
	}

	/* Decode the code segment base from the descriptor */
	base = get_desc_base((struct desc_struct *)desc);

	if (seg & (1<<2))
		mutex_unlock(&current->mm->context.lock);
	else
		put_cpu();

	/* Adjust EIP and segment limit, and clamp at the kernel limit.
	   It's legitimate for segments to wrap at 0xffffffff. */
	seg_limit += base;
	if (seg_limit < *eip_limit && seg_limit >= base)
		*eip_limit = seg_limit;
	return ip + base;
}
#endif

/*
 * X86_32
 * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
 * Check that here and ignore it.
 *
 * X86_64
 * Sometimes the CPU reports invalid exceptions on prefetch.
 * Check that here and ignore it.
 *
 * Opcode checker based on code by Richard Brunner
 */
static int is_prefetch(struct pt_regs *regs, unsigned long addr,
		       unsigned long error_code)
{
	unsigned char *instr;
	int scan_more = 1;
	int prefetch = 0;
	unsigned char *max_instr;

#ifdef CONFIG_X86_32
	unsigned long limit;
	if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
		     boot_cpu_data.x86 >= 6)) {
		/* Catch an obscure case of prefetch inside an NX page. */
		if (nx_enabled && (error_code & PF_INSTR))
			return 0;
	} else {
		return 0;
	}
	instr = (unsigned char *)get_segment_eip(regs, &limit);
#else
	/* If it was an exec fault, ignore it. */
	if (error_code & PF_INSTR)
		return 0;
	instr = (unsigned char __user *)convert_rip_to_linear(current, regs);
#endif

	max_instr = instr + 15;

#ifdef CONFIG_X86_64
	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
		return 0;
#endif

	while (scan_more && instr < max_instr) {
		unsigned char opcode;
		unsigned char instr_hi;
		unsigned char instr_lo;

#ifdef CONFIG_X86_32
		if (instr > (unsigned char *)limit)
			break;
#endif
		if (probe_kernel_address(instr, opcode))
			break;

		instr_hi = opcode & 0xf0;
		instr_lo = opcode & 0x0f;
		instr++;

		switch (instr_hi) {
		case 0x20:
		case 0x30:
			/*
			 * Values 0x26, 0x2E, 0x36, 0x3E are valid x86 prefixes.
			 * In X86_64 long mode, the CPU will signal invalid
			 * opcode if some of these prefixes are present so
			 * X86_64 will never get here anyway
			 */
			scan_more = ((instr_lo & 7) == 0x6);
			break;
#ifdef CONFIG_X86_64
		case 0x40:
			/*
			 * In AMD64 long mode 0x40..0x4F are valid REX prefixes
			 * Need to figure out under what instruction mode the
			 * instruction was issued. Could check the LDT for lm,
			 * but for now it's good enough to assume that long
			 * mode only uses well known segments or kernel.
			 */
			scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
			break;
#endif
		case 0x60:
			/* 0x64 thru 0x67 are valid prefixes in all modes. */
			scan_more = (instr_lo & 0xC) == 0x4;
			break;
		case 0xF0:
			/* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
			scan_more = !instr_lo || (instr_lo>>1) == 1;
			break;
		case 0x00:
			/* Prefetch instruction is 0x0F0D or 0x0F18 */
			scan_more = 0;
#ifdef CONFIG_X86_32
			if (instr > (unsigned char *)limit)
				break;
#endif
			if (probe_kernel_address(instr, opcode))
				break;
			prefetch = (instr_lo == 0xF) &&
				(opcode == 0x0D || opcode == 0x18);
			break;
		default:
			scan_more = 0;
			break;
		}
	}
	return prefetch;
}
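
/*
 * Illustrative example (added commentary, not in the original source):
 * "prefetchnta (%rax)" encodes as 0F 18 00.  The scan above sees the
 * first opcode byte 0x0F (instr_hi == 0x00, instr_lo == 0xF), fetches
 * the second byte 0x18 and reports the fault as a prefetch, so the
 * spurious exception is ignored.
 */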

static void force_sig_info_fault(int si_signo, int si_code,
				 unsigned long address, struct task_struct *tsk)
{
	siginfo_t info;

	info.si_signo = si_signo;
	info.si_errno = 0;
	info.si_code = si_code;
	info.si_addr = (void __user *)address;
	force_sig_info(si_signo, &info, tsk);
}
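
/*
 * Added note: the fault paths below use this helper as follows -
 * unmapped user addresses get SIGSEGV/SEGV_MAPERR, permission failures
 * get SIGSEGV/SEGV_ACCERR, and VM_FAULT_SIGBUS results get
 * SIGBUS/BUS_ADRERR via do_sigbus.
 */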

static int bad_address(void *p)
{
	unsigned long dummy;
	return probe_kernel_address((unsigned long *)p, dummy);
}

void dump_pagetable(unsigned long address)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = (pgd_t *)read_cr3();

	pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
	pgd += pgd_index(address);
	if (bad_address(pgd)) goto bad;
	printk("PGD %lx ", pgd_val(*pgd));
	if (!pgd_present(*pgd)) goto ret;

	pud = pud_offset(pgd, address);
	if (bad_address(pud)) goto bad;
	printk("PUD %lx ", pud_val(*pud));
	if (!pud_present(*pud)) goto ret;

	pmd = pmd_offset(pud, address);
	if (bad_address(pmd)) goto bad;
	printk("PMD %lx ", pmd_val(*pmd));
	if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;

	pte = pte_offset_kernel(pmd, address);
	if (bad_address(pte)) goto bad;
	printk("PTE %lx", pte_val(*pte));
ret:
	printk("\n");
	return;
bad:
	printk("BAD\n");
}
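
/*
 * Example output (illustrative, added commentary): for an address whose
 * PMD is not present this prints something like
 * "PGD 203067 PUD 1e5067 PMD 0", and "BAD" if a table entry itself
 * cannot be read.
 */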

#ifdef CONFIG_X86_64
static const char errata93_warning[] =
KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
KERN_ERR "******* Please consider a BIOS update.\n"
KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";

/* Workaround for K8 erratum #93 & buggy BIOS.
   BIOS SMM functions are required to use a specific workaround
   to avoid corruption of the 64bit RIP register on C stepping K8.
   A lot of BIOSes that didn't get tested properly miss this.
   The OS sees this as a page fault with the upper 32bits of RIP cleared.
   Try to work around it here.
   Note we only handle faults in kernel here. */
static int is_errata93(struct pt_regs *regs, unsigned long address)
{
	static int warned;
	if (address != regs->ip)
		return 0;
	if ((address >> 32) != 0)
		return 0;
	address |= 0xffffffffUL << 32;
	if ((address >= (u64)_stext && address <= (u64)_etext) ||
	    (address >= MODULES_VADDR && address <= MODULES_END)) {
		if (!warned) {
			printk(errata93_warning);
			warned = 1;
		}
		regs->ip = address;
		return 1;
	}
	return 0;
}
#endif
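
/*
 * Illustrative example (added commentary, not in the original source):
 * if the kernel text contains 0xffffffff80234567 and the CPU faults
 * with RIP == CR2 == 0x0000000080234567, filling in the cleared upper
 * 32 bits yields an address inside the text, so is_errata93() restores
 * the full RIP and lets execution continue.
 */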

static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
				 unsigned long error_code)
{
	unsigned long flags = oops_begin();
	struct task_struct *tsk;

	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
	       current->comm, address);
	dump_pagetable(address);
	tsk = current;
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	if (__die("Bad pagetable", regs, error_code))
		regs = NULL;
	oops_end(flags, regs, SIGKILL);
}

/*
 * Handle a fault on the vmalloc area
 *
 * This assumes no large pages in there.
 */
static int vmalloc_fault(unsigned long address)
{
	pgd_t *pgd, *pgd_ref;
	pud_t *pud, *pud_ref;
	pmd_t *pmd, *pmd_ref;
	pte_t *pte, *pte_ref;

	/* Copy kernel mappings over when needed. This can also
	   happen within a race in page table update. In the latter
	   case just flush. */
	pgd = pgd_offset(current->mm ?: &init_mm, address);
	pgd_ref = pgd_offset_k(address);
	if (pgd_none(*pgd_ref))
		return -1;
	if (pgd_none(*pgd))
		set_pgd(pgd, *pgd_ref);
	else
		BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));

	/* Below here mismatches are bugs because these lower tables
	   are shared */
	pud = pud_offset(pgd, address);
	pud_ref = pud_offset(pgd_ref, address);
	if (pud_none(*pud_ref))
		return -1;
	if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
		BUG();
	pmd = pmd_offset(pud, address);
	pmd_ref = pmd_offset(pud_ref, address);
	if (pmd_none(*pmd_ref))
		return -1;
	if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
		BUG();
	pte_ref = pte_offset_kernel(pmd_ref, address);
	if (!pte_present(*pte_ref))
		return -1;
	pte = pte_offset_kernel(pmd, address);

	/* Don't use pte_page here, because the mappings can point
	   outside mem_map, and the NUMA hash lookup cannot handle
	   that. */
	if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
		BUG();
	return 0;
}
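
/*
 * Illustrative scenario (added commentary): if a driver vmalloc()s a
 * buffer after this task's page tables were set up, the task's PGD may
 * lack the corresponding entry.  The first touch faults, the missing
 * entry is copied from the init_mm reference tables above, and the
 * access is simply retried.
 */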

int show_unhandled_signals = 1;

/*
 * This routine handles page faults.  It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 */
asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
					unsigned long error_code)
{
	struct task_struct *tsk;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	unsigned long address;
	int write, fault;
	unsigned long flags;
	int si_code;

	/*
	 * We can fault from pretty much anywhere, with unknown IRQ state.
	 */
	trace_hardirqs_fixup();

	tsk = current;
	mm = tsk->mm;
	prefetchw(&mm->mmap_sem);

	/* get the address */
	address = read_cr2();

	si_code = SEGV_MAPERR;

	/*
	 * We fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 *
	 * This verifies that the fault happens in kernel space
	 * (error_code & PF_USER) == 0, and that the fault was not a
	 * protection error, (error_code & (PF_RSVD|PF_PROT)) == 0.
	 */
	if (unlikely(address >= TASK_SIZE64)) {
		/*
		 * Don't check for the module range here: its PML4
		 * is always initialized because it's shared with the main
		 * kernel text. Only vmalloc may need PML4 syncups.
		 */
		if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
		    ((address >= VMALLOC_START && address < VMALLOC_END))) {
			if (vmalloc_fault(address) >= 0)
				return;
		}
		if (notify_page_fault(regs))
			return;
		/*
		 * Don't take the mm semaphore here. If we fixup a prefetch
		 * fault we could otherwise deadlock.
		 */
		goto bad_area_nosemaphore;
	}

	if (notify_page_fault(regs))
		return;

	if (likely(regs->flags & X86_EFLAGS_IF))
		local_irq_enable();

	if (unlikely(error_code & PF_RSVD))
		pgtable_bad(address, regs, error_code);

	/*
	 * If we're in an interrupt, have no user context or are running in an
	 * atomic region then we must not take the fault.
	 */
	if (unlikely(in_atomic() || !mm))
		goto bad_area_nosemaphore;

	/*
	 * User-mode registers count as a user access even for any
	 * potential system fault or CPU buglet.
	 */
	if (user_mode_vm(regs))
		error_code |= PF_USER;
again:
	/* When running in the kernel we expect faults to occur only to
	 * addresses in user space.  All other faults represent errors in the
	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
	 * erroneous fault occurring in a code path which already holds mmap_sem
	 * we will deadlock attempting to validate the fault against the
	 * address space.  Luckily the kernel only validly references user
	 * space from well defined areas of code, which are listed in the
	 * exceptions table.
	 *
	 * As the vast majority of faults will be valid we will only perform
	 * the source reference check when there is a possibility of a deadlock.
	 * Attempt to lock the address space, if we cannot we then validate the
	 * source.  If this is invalid we can skip the address space check,
	 * thus avoiding the deadlock.
	 */
	if (!down_read_trylock(&mm->mmap_sem)) {
		if ((error_code & PF_USER) == 0 &&
		    !search_exception_tables(regs->ip))
			goto bad_area_nosemaphore;
		down_read(&mm->mmap_sem);
	}

	vma = find_vma(mm, address);
	if (!vma)
		goto bad_area;
	if (likely(vma->vm_start <= address))
		goto good_area;
	if (!(vma->vm_flags & VM_GROWSDOWN))
		goto bad_area;
	if (error_code & PF_USER) {
		/* Allow userspace just enough access below the stack pointer
		 * to let the 'enter' instruction work.
		 */
		if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp)
			goto bad_area;
	}
	if (expand_stack(vma, address))
		goto bad_area;
/*
 * Ok, we have a good vm_area for this memory access, so
 * we can handle it..
 */
good_area:
	si_code = SEGV_ACCERR;
	write = 0;
	switch (error_code & (PF_PROT|PF_WRITE)) {
	default:	/* 3: write, present */
		/* fall through */
	case PF_WRITE:		/* write, not present */
		if (!(vma->vm_flags & VM_WRITE))
			goto bad_area;
		write++;
		break;
	case PF_PROT:		/* read, present */
		goto bad_area;
	case 0:			/* read, not present */
		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
			goto bad_area;
	}

	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	fault = handle_mm_fault(mm, vma, address, write);
	if (unlikely(fault & VM_FAULT_ERROR)) {
		if (fault & VM_FAULT_OOM)
			goto out_of_memory;
		else if (fault & VM_FAULT_SIGBUS)
			goto do_sigbus;
		BUG();
	}
	if (fault & VM_FAULT_MAJOR)
		tsk->maj_flt++;
	else
		tsk->min_flt++;
	up_read(&mm->mmap_sem);
	return;

/*
 * Something tried to access memory that isn't in our memory map..
 * Fix it, but check if it's kernel or user first..
 */
bad_area:
	up_read(&mm->mmap_sem);

bad_area_nosemaphore:
	/* User mode accesses just cause a SIGSEGV */
	if (error_code & PF_USER) {
		/*
		 * It's possible to have interrupts off here.
		 */
		local_irq_enable();

		if (is_prefetch(regs, address, error_code))
			return;

		/* Work around K8 erratum #100: K8 in compat mode
		   occasionally jumps to illegal addresses >4GB.  We
		   catch this here in the page fault handler because
		   these addresses are not reachable. Just detect this
		   case and return.  Any code segment in LDT is
		   compatibility mode. */
		if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
		    (address >> 32))
			return;

		if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
		    printk_ratelimit()) {
			printk(
		       "%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx\n",
					tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
					tsk->comm, tsk->pid, address, regs->ip,
					regs->sp, error_code);
		}

		tsk->thread.cr2 = address;
		/* Kernel addresses are always protection faults */
		tsk->thread.error_code = error_code | (address >= TASK_SIZE);
		tsk->thread.trap_no = 14;
		force_sig_info_fault(SIGSEGV, si_code, address, tsk);
		return;
	}

no_context:
	/* Are we prepared to handle this kernel fault?  */
	if (fixup_exception(regs))
		return;

	/*
	 * Hall of shame of CPU/BIOS bugs.
	 */
	if (is_prefetch(regs, address, error_code))
		return;

	if (is_errata93(regs, address))
		return;

	/*
	 * Oops. The kernel tried to access some bad page. We'll have to
	 * terminate things with extreme prejudice.
	 */
	flags = oops_begin();

	if (address < PAGE_SIZE)
		printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
	else
		printk(KERN_ALERT "Unable to handle kernel paging request");
	printk(" at %016lx RIP: \n" KERN_ALERT, address);
	printk_address(regs->ip);
	dump_pagetable(address);
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	if (__die("Oops", regs, error_code))
		regs = NULL;
	/* Executive summary in case the body of the oops scrolled away */
	printk(KERN_EMERG "CR2: %016lx\n", address);
	oops_end(flags, regs, SIGKILL);

/*
 * We ran out of memory, or some other thing happened to us that made
 * us unable to handle the page fault gracefully.
 */
out_of_memory:
	up_read(&mm->mmap_sem);
	if (is_global_init(current)) {
		yield();
		goto again;
	}
	printk("VM: killing process %s\n", tsk->comm);
	if (error_code & PF_USER)
		do_group_exit(SIGKILL);
	goto no_context;

do_sigbus:
	up_read(&mm->mmap_sem);

	/* Kernel mode? Handle exceptions or die */
	if (!(error_code & PF_USER))
		goto no_context;

	tsk->thread.cr2 = address;
	tsk->thread.error_code = error_code;
	tsk->thread.trap_no = 14;
	force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
	return;
}

DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);

void vmalloc_sync_all(void)
{
	/* Note that races in the updates of insync and start aren't
	   problematic:
	   insync can only get set bits added, and updates to start are only
	   improving performance (without affecting correctness if undone). */
	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
	static unsigned long start = VMALLOC_START & PGDIR_MASK;
	unsigned long address;

	for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
		if (!test_bit(pgd_index(address), insync)) {
			const pgd_t *pgd_ref = pgd_offset_k(address);
			struct page *page;

			if (pgd_none(*pgd_ref))
				continue;
			spin_lock(&pgd_lock);
			list_for_each_entry(page, &pgd_list, lru) {
				pgd_t *pgd;
				pgd = (pgd_t *)page_address(page) + pgd_index(address);
				if (pgd_none(*pgd))
					set_pgd(pgd, *pgd_ref);
				else
					BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
			}
			spin_unlock(&pgd_lock);
			set_bit(pgd_index(address), insync);
		}
		if (address == start)
			start = address + PGDIR_SIZE;
	}
	/* Check that there is no need to do the same for the modules area. */
	BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
	BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
				(__START_KERNEL & PGDIR_MASK)));
}