2014-10-10 02:28:37 +04:00
/*
 * mm/debug.c
 *
 * mm specific debug routines.
 *
 */
2014-10-10 02:28:34 +04:00
# include <linux/kernel.h>
# include <linux/mm.h>
# include <linux/ftrace_event.h>
# include <linux/memcontrol.h>
static const struct trace_print_flags pageflag_names [ ] = {
{ 1UL < < PG_locked , " locked " } ,
{ 1UL < < PG_error , " error " } ,
{ 1UL < < PG_referenced , " referenced " } ,
{ 1UL < < PG_uptodate , " uptodate " } ,
{ 1UL < < PG_dirty , " dirty " } ,
{ 1UL < < PG_lru , " lru " } ,
{ 1UL < < PG_active , " active " } ,
{ 1UL < < PG_slab , " slab " } ,
{ 1UL < < PG_owner_priv_1 , " owner_priv_1 " } ,
{ 1UL < < PG_arch_1 , " arch_1 " } ,
{ 1UL < < PG_reserved , " reserved " } ,
{ 1UL < < PG_private , " private " } ,
{ 1UL < < PG_private_2 , " private_2 " } ,
{ 1UL < < PG_writeback , " writeback " } ,
# ifdef CONFIG_PAGEFLAGS_EXTENDED
{ 1UL < < PG_head , " head " } ,
{ 1UL < < PG_tail , " tail " } ,
# else
{ 1UL < < PG_compound , " compound " } ,
# endif
{ 1UL < < PG_swapcache , " swapcache " } ,
{ 1UL < < PG_mappedtodisk , " mappedtodisk " } ,
{ 1UL < < PG_reclaim , " reclaim " } ,
{ 1UL < < PG_swapbacked , " swapbacked " } ,
{ 1UL < < PG_unevictable , " unevictable " } ,
# ifdef CONFIG_MMU
{ 1UL < < PG_mlocked , " mlocked " } ,
# endif
# ifdef CONFIG_ARCH_USES_PG_UNCACHED
{ 1UL < < PG_uncached , " uncached " } ,
# endif
# ifdef CONFIG_MEMORY_FAILURE
{ 1UL < < PG_hwpoison , " hwpoison " } ,
# endif
# ifdef CONFIG_TRANSPARENT_HUGEPAGE
{ 1UL < < PG_compound_lock , " compound_lock " } ,
# endif
} ;
static void dump_flags ( unsigned long flags ,
const struct trace_print_flags * names , int count )
{
const char * delim = " " ;
unsigned long mask ;
int i ;
2014-10-10 02:28:41 +04:00
pr_emerg ( " flags: %#lx( " , flags ) ;
2014-10-10 02:28:34 +04:00
/* remove zone id */
flags & = ( 1UL < < NR_PAGEFLAGS ) - 1 ;
for ( i = 0 ; i < count & & flags ; i + + ) {
mask = names [ i ] . mask ;
if ( ( flags & mask ) ! = mask )
continue ;
flags & = ~ mask ;
2014-10-10 02:28:41 +04:00
pr_cont ( " %s%s " , delim , names [ i ] . name ) ;
2014-10-10 02:28:34 +04:00
delim = " | " ;
}
/* check for left over flags */
if ( flags )
2014-10-10 02:28:41 +04:00
pr_cont ( " %s%#lx " , delim , flags ) ;
2014-10-10 02:28:34 +04:00
2014-10-10 02:28:41 +04:00
pr_cont ( " ) \n " ) ;
2014-10-10 02:28:34 +04:00
}
void dump_page_badflags ( struct page * page , const char * reason ,
unsigned long badflags )
{
2014-10-10 02:28:41 +04:00
pr_emerg ( " page:%p count:%d mapcount:%d mapping:%p index:%#lx \n " ,
page , atomic_read ( & page - > _count ) , page_mapcount ( page ) ,
page - > mapping , page - > index ) ;
2014-10-10 02:28:34 +04:00
BUILD_BUG_ON ( ARRAY_SIZE ( pageflag_names ) ! = __NR_PAGEFLAGS ) ;
dump_flags ( page - > flags , pageflag_names , ARRAY_SIZE ( pageflag_names ) ) ;
if ( reason )
pr_alert ( " page dumped because: %s \n " , reason ) ;
if ( page - > flags & badflags ) {
pr_alert ( " bad because of flags: \n " ) ;
dump_flags ( page - > flags & badflags ,
pageflag_names , ARRAY_SIZE ( pageflag_names ) ) ;
}
2014-12-11 02:44:58 +03:00
# ifdef CONFIG_MEMCG
if ( page - > mem_cgroup )
pr_alert ( " page->mem_cgroup:%p \n " , page - > mem_cgroup ) ;
# endif
2014-10-10 02:28:34 +04:00
}
/*
 * dump_page - print a description of @page to the kernel log.
 * @page:   page to describe
 * @reason: optional text explaining why the page is dumped
 *
 * Same as dump_page_badflags() with an empty bad-flags mask.
 */
void dump_page(struct page *page, const char *reason)
{
	const unsigned long no_badflags = 0;

	dump_page_badflags(page, reason, no_badflags);
}
EXPORT_SYMBOL(dump_page);
# ifdef CONFIG_DEBUG_VM
/*
 * Printable names for VM_* flag bits, passed to dump_flags() by dump_vma()
 * and dump_mm().
 *
 * The #if/#elif chain contributes exactly one entry, chosen by architecture
 * or MMU configuration, with VM_ARCH_1 as the fallback.
 * NOTE(review): presumably the other branches name the same underlying bit
 * as VM_ARCH_1 -- confirm against the VM_* definitions.
 */
static const struct trace_print_flags vmaflags_names[] = {
	{ .mask = VM_READ,		.name = " read " },
	{ .mask = VM_WRITE,		.name = " write " },
	{ .mask = VM_EXEC,		.name = " exec " },
	{ .mask = VM_SHARED,		.name = " shared " },
	{ .mask = VM_MAYREAD,		.name = " mayread " },
	{ .mask = VM_MAYWRITE,		.name = " maywrite " },
	{ .mask = VM_MAYEXEC,		.name = " mayexec " },
	{ .mask = VM_MAYSHARE,		.name = " mayshare " },
	{ .mask = VM_GROWSDOWN,		.name = " growsdown " },
	{ .mask = VM_PFNMAP,		.name = " pfnmap " },
	{ .mask = VM_DENYWRITE,		.name = " denywrite " },
	{ .mask = VM_LOCKED,		.name = " locked " },
	{ .mask = VM_IO,		.name = " io " },
	{ .mask = VM_SEQ_READ,		.name = " seqread " },
	{ .mask = VM_RAND_READ,		.name = " randread " },
	{ .mask = VM_DONTCOPY,		.name = " dontcopy " },
	{ .mask = VM_DONTEXPAND,	.name = " dontexpand " },
	{ .mask = VM_ACCOUNT,		.name = " account " },
	{ .mask = VM_NORESERVE,		.name = " noreserve " },
	{ .mask = VM_HUGETLB,		.name = " hugetlb " },
#if defined(CONFIG_X86)
	{ .mask = VM_PAT,		.name = " pat " },
#elif defined(CONFIG_PPC)
	{ .mask = VM_SAO,		.name = " sao " },
#elif defined(CONFIG_PARISC) || defined(CONFIG_METAG) || defined(CONFIG_IA64)
	{ .mask = VM_GROWSUP,		.name = " growsup " },
#elif !defined(CONFIG_MMU)
	{ .mask = VM_MAPPED_COPY,	.name = " mappedcopy " },
#else
	{ .mask = VM_ARCH_1,		.name = " arch_1 " },
#endif
	{ .mask = VM_DONTDUMP,		.name = " dontdump " },
#ifdef CONFIG_MEM_SOFT_DIRTY
	{ .mask = VM_SOFTDIRTY,		.name = " softdirty " },
#endif
	{ .mask = VM_MIXEDMAP,		.name = " mixedmap " },
	{ .mask = VM_HUGEPAGE,		.name = " hugepage " },
	{ .mask = VM_NOHUGEPAGE,	.name = " nohugepage " },
	{ .mask = VM_MERGEABLE,		.name = " mergeable " },
};
void dump_vma ( const struct vm_area_struct * vma )
{
2014-10-10 02:28:41 +04:00
pr_emerg ( " vma %p start %p end %p \n "
2014-10-10 02:28:34 +04:00
" next %p prev %p mm %p \n "
" prot %lx anon_vma %p vm_ops %p \n "
" pgoff %lx file %p private_data %p \n " ,
vma , ( void * ) vma - > vm_start , ( void * ) vma - > vm_end , vma - > vm_next ,
vma - > vm_prev , vma - > vm_mm ,
( unsigned long ) pgprot_val ( vma - > vm_page_prot ) ,
vma - > anon_vma , vma - > vm_ops , vma - > vm_pgoff ,
vma - > vm_file , vma - > vm_private_data ) ;
dump_flags ( vma - > vm_flags , vmaflags_names , ARRAY_SIZE ( vmaflags_names ) ) ;
}
EXPORT_SYMBOL ( dump_vma ) ;
2014-10-10 02:28:37 +04:00
void dump_mm ( const struct mm_struct * mm )
{
2014-10-10 02:28:41 +04:00
pr_emerg ( " mm %p mmap %p seqnum %d task_size %lu \n "
2014-10-10 02:28:37 +04:00
# ifdef CONFIG_MMU
" get_unmapped_area %p \n "
# endif
" mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu \n "
mm: account pmd page tables to the process
Dave noticed that unprivileged process can allocate significant amount of
memory -- >500 MiB on x86_64 -- and stay unnoticed by oom-killer and
memory cgroup. The trick is to allocate a lot of PMD page tables. Linux
kernel doesn't account PMD tables to the process, only PTE.
The use-cases below use few tricks to allocate a lot of PMD page tables
while keeping VmRSS and VmPTE low. oom_score for the process will be 0.
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/prctl.h>
#define PUD_SIZE (1UL << 30)
#define PMD_SIZE (1UL << 21)
#define NR_PUD 130000
int main(void)
{
char *addr = NULL;
unsigned long i;
prctl(PR_SET_THP_DISABLE);
for (i = 0; i < NR_PUD ; i++) {
addr = mmap(addr + PUD_SIZE, PUD_SIZE, PROT_WRITE|PROT_READ,
MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
if (addr == MAP_FAILED) {
perror("mmap");
break;
}
*addr = 'x';
munmap(addr, PMD_SIZE);
mmap(addr, PMD_SIZE, PROT_WRITE|PROT_READ,
MAP_ANONYMOUS|MAP_PRIVATE|MAP_FIXED, -1, 0);
if (addr == MAP_FAILED)
perror("re-mmap"), exit(1);
}
printf("PID %d consumed %lu KiB in PMD page tables\n",
getpid(), i * 4096 >> 10);
return pause();
}
The patch addresses the issue by account PMD tables to the process the
same way we account PTE.
The main place where PMD tables is accounted is __pmd_alloc() and
free_pmd_range(). But there're few corner cases:
- HugeTLB can share PMD page tables. The patch handles by accounting
the table to all processes who share it.
- x86 PAE pre-allocates few PMD tables on fork.
- Architectures with FIRST_USER_ADDRESS > 0. We need to adjust sanity
check on exit(2).
Accounting only happens on configuration where PMD page table's level is
present (PMD is not folded). As with nr_ptes we use per-mm counter. The
counter value is used to calculate baseline for badness score by
oom-killer.
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Reported-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Hugh Dickins <hughd@google.com>
Reviewed-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: David Rientjes <rientjes@google.com>
Tested-by: Sedat Dilek <sedat.dilek@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-12 02:26:50 +03:00
" pgd %p mm_users %d mm_count %d nr_ptes %lu nr_pmds %lu map_count %d \n "
2014-10-10 02:28:37 +04:00
" hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx \n "
" pinned_vm %lx shared_vm %lx exec_vm %lx stack_vm %lx \n "
" start_code %lx end_code %lx start_data %lx end_data %lx \n "
" start_brk %lx brk %lx start_stack %lx \n "
" arg_start %lx arg_end %lx env_start %lx env_end %lx \n "
" binfmt %p flags %lx core_state %p \n "
# ifdef CONFIG_AIO
" ioctx_table %p \n "
# endif
# ifdef CONFIG_MEMCG
" owner %p "
# endif
" exe_file %p \n "
# ifdef CONFIG_MMU_NOTIFIER
" mmu_notifier_mm %p \n "
# endif
# ifdef CONFIG_NUMA_BALANCING
" numa_next_scan %lu numa_scan_offset %lu numa_scan_seq %d \n "
# endif
# if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
" tlb_flush_pending %d \n "
# endif
" %s " , /* This is here to hold the comma */
mm , mm - > mmap , mm - > vmacache_seqnum , mm - > task_size ,
# ifdef CONFIG_MMU
mm - > get_unmapped_area ,
# endif
mm - > mmap_base , mm - > mmap_legacy_base , mm - > highest_vm_end ,
mm - > pgd , atomic_read ( & mm - > mm_users ) ,
atomic_read ( & mm - > mm_count ) ,
atomic_long_read ( ( atomic_long_t * ) & mm - > nr_ptes ) ,
mm: account pmd page tables to the process
Dave noticed that unprivileged process can allocate significant amount of
memory -- >500 MiB on x86_64 -- and stay unnoticed by oom-killer and
memory cgroup. The trick is to allocate a lot of PMD page tables. Linux
kernel doesn't account PMD tables to the process, only PTE.
The use-cases below use few tricks to allocate a lot of PMD page tables
while keeping VmRSS and VmPTE low. oom_score for the process will be 0.
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/prctl.h>
#define PUD_SIZE (1UL << 30)
#define PMD_SIZE (1UL << 21)
#define NR_PUD 130000
int main(void)
{
char *addr = NULL;
unsigned long i;
prctl(PR_SET_THP_DISABLE);
for (i = 0; i < NR_PUD ; i++) {
addr = mmap(addr + PUD_SIZE, PUD_SIZE, PROT_WRITE|PROT_READ,
MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
if (addr == MAP_FAILED) {
perror("mmap");
break;
}
*addr = 'x';
munmap(addr, PMD_SIZE);
mmap(addr, PMD_SIZE, PROT_WRITE|PROT_READ,
MAP_ANONYMOUS|MAP_PRIVATE|MAP_FIXED, -1, 0);
if (addr == MAP_FAILED)
perror("re-mmap"), exit(1);
}
printf("PID %d consumed %lu KiB in PMD page tables\n",
getpid(), i * 4096 >> 10);
return pause();
}
The patch addresses the issue by account PMD tables to the process the
same way we account PTE.
The main place where PMD tables is accounted is __pmd_alloc() and
free_pmd_range(). But there're few corner cases:
- HugeTLB can share PMD page tables. The patch handles by accounting
the table to all processes who share it.
- x86 PAE pre-allocates few PMD tables on fork.
- Architectures with FIRST_USER_ADDRESS > 0. We need to adjust sanity
check on exit(2).
Accounting only happens on configuration where PMD page table's level is
present (PMD is not folded). As with nr_ptes we use per-mm counter. The
counter value is used to calculate baseline for badness score by
oom-killer.
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Reported-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Hugh Dickins <hughd@google.com>
Reviewed-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: David Rientjes <rientjes@google.com>
Tested-by: Sedat Dilek <sedat.dilek@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-12 02:26:50 +03:00
mm_nr_pmds ( ( struct mm_struct * ) mm ) ,
2014-10-10 02:28:37 +04:00
mm - > map_count ,
mm - > hiwater_rss , mm - > hiwater_vm , mm - > total_vm , mm - > locked_vm ,
mm - > pinned_vm , mm - > shared_vm , mm - > exec_vm , mm - > stack_vm ,
mm - > start_code , mm - > end_code , mm - > start_data , mm - > end_data ,
mm - > start_brk , mm - > brk , mm - > start_stack ,
mm - > arg_start , mm - > arg_end , mm - > env_start , mm - > env_end ,
mm - > binfmt , mm - > flags , mm - > core_state ,
# ifdef CONFIG_AIO
mm - > ioctx_table ,
# endif
# ifdef CONFIG_MEMCG
mm - > owner ,
# endif
mm - > exe_file ,
# ifdef CONFIG_MMU_NOTIFIER
mm - > mmu_notifier_mm ,
# endif
# ifdef CONFIG_NUMA_BALANCING
mm - > numa_next_scan , mm - > numa_scan_offset , mm - > numa_scan_seq ,
# endif
# if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
mm - > tlb_flush_pending ,
# endif
" " /* This is here to not have a comma! */
) ;
dump_flags ( mm - > def_flags , vmaflags_names ,
ARRAY_SIZE ( vmaflags_names ) ) ;
}
2014-10-10 02:28:34 +04:00
# endif /* CONFIG_DEBUG_VM */