mlock: do not hold mmap_sem for extended periods of time

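__get_user_pages gets a new 'nonblocking' parameter to signal that the
caller is prepared to re-acquire mmap_sem and retry the operation if
needed.  This is used to split off long operations when a page fault is
going to block on a disk transfer, or when we detect contention on the
mmap_sem.  When the fault handler returns VM_FAULT_RETRY,
__get_user_pages sets *nonblocking to 0 and returns the number of pages
handled so far; do_mlock_pages then takes mmap_sem again, looks the vma
up again and resumes from the first page that was not yet processed.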

[akpm@linux-foundation.org: remove ref to rwsem_is_contended()]
Signed-off-by: Michel Lespinasse <walken@google.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Nick Piggin <npiggin@kernel.dk>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
Michel Lespinasse 2011-01-13 15:46:14 -08:00 committed by Linus Torvalds
parent 5fdb200213
commit 53a7706d5e
4 changed files with 47 additions and 25 deletions

View File

@ -243,7 +243,8 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, int len, unsigned int foll_flags, unsigned long start, int len, unsigned int foll_flags,
struct page **pages, struct vm_area_struct **vmas); struct page **pages, struct vm_area_struct **vmas,
int *nonblocking);
#define ZONE_RECLAIM_NOSCAN -2 #define ZONE_RECLAIM_NOSCAN -2
#define ZONE_RECLAIM_FULL -1 #define ZONE_RECLAIM_FULL -1

View File

@ -1363,7 +1363,8 @@ no_page_table:
int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, int nr_pages, unsigned int gup_flags, unsigned long start, int nr_pages, unsigned int gup_flags,
struct page **pages, struct vm_area_struct **vmas) struct page **pages, struct vm_area_struct **vmas,
int *nonblocking)
{ {
int i; int i;
unsigned long vm_flags; unsigned long vm_flags;
@ -1463,10 +1464,15 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
cond_resched(); cond_resched();
while (!(page = follow_page(vma, start, foll_flags))) { while (!(page = follow_page(vma, start, foll_flags))) {
int ret; int ret;
unsigned int fault_flags = 0;
if (foll_flags & FOLL_WRITE)
fault_flags |= FAULT_FLAG_WRITE;
if (nonblocking)
fault_flags |= FAULT_FLAG_ALLOW_RETRY;
ret = handle_mm_fault(mm, vma, start, ret = handle_mm_fault(mm, vma, start,
(foll_flags & FOLL_WRITE) ? fault_flags);
FAULT_FLAG_WRITE : 0);
if (ret & VM_FAULT_ERROR) { if (ret & VM_FAULT_ERROR) {
if (ret & VM_FAULT_OOM) if (ret & VM_FAULT_OOM)
@ -1482,6 +1488,11 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
else else
tsk->min_flt++; tsk->min_flt++;
if (ret & VM_FAULT_RETRY) {
*nonblocking = 0;
return i;
}
/* /*
* The VM_FAULT_WRITE bit tells us that * The VM_FAULT_WRITE bit tells us that
* do_wp_page has broken COW when necessary, * do_wp_page has broken COW when necessary,
@ -1581,7 +1592,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
if (force) if (force)
flags |= FOLL_FORCE; flags |= FOLL_FORCE;
return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas); return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
NULL);
} }
EXPORT_SYMBOL(get_user_pages); EXPORT_SYMBOL(get_user_pages);
@ -1606,7 +1618,8 @@ struct page *get_dump_page(unsigned long addr)
struct page *page; struct page *page;
if (__get_user_pages(current, current->mm, addr, 1, if (__get_user_pages(current, current->mm, addr, 1,
FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma) < 1) FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma,
NULL) < 1)
return NULL; return NULL;
flush_cache_page(vma, addr, page_to_pfn(page)); flush_cache_page(vma, addr, page_to_pfn(page));
return page; return page;

View File

@ -155,13 +155,13 @@ static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long add
* vma->vm_mm->mmap_sem must be held for at least read. * vma->vm_mm->mmap_sem must be held for at least read.
*/ */
static long __mlock_vma_pages_range(struct vm_area_struct *vma, static long __mlock_vma_pages_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end) unsigned long start, unsigned long end,
int *nonblocking)
{ {
struct mm_struct *mm = vma->vm_mm; struct mm_struct *mm = vma->vm_mm;
unsigned long addr = start; unsigned long addr = start;
int nr_pages = (end - start) / PAGE_SIZE; int nr_pages = (end - start) / PAGE_SIZE;
int gup_flags; int gup_flags;
int ret;
VM_BUG_ON(start & ~PAGE_MASK); VM_BUG_ON(start & ~PAGE_MASK);
VM_BUG_ON(end & ~PAGE_MASK); VM_BUG_ON(end & ~PAGE_MASK);
@ -187,9 +187,8 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
nr_pages--; nr_pages--;
} }
ret = __get_user_pages(current, mm, addr, nr_pages, gup_flags, return __get_user_pages(current, mm, addr, nr_pages, gup_flags,
NULL, NULL); NULL, NULL, nonblocking);
return max(ret, 0); /* 0 or negative error code */
} }
/* /*
@ -233,7 +232,7 @@ long mlock_vma_pages_range(struct vm_area_struct *vma,
is_vm_hugetlb_page(vma) || is_vm_hugetlb_page(vma) ||
vma == get_gate_vma(current))) { vma == get_gate_vma(current))) {
__mlock_vma_pages_range(vma, start, end); __mlock_vma_pages_range(vma, start, end, NULL);
/* Hide errors from mmap() and other callers */ /* Hide errors from mmap() and other callers */
return 0; return 0;
@ -429,21 +428,23 @@ static int do_mlock_pages(unsigned long start, size_t len, int ignore_errors)
struct mm_struct *mm = current->mm; struct mm_struct *mm = current->mm;
unsigned long end, nstart, nend; unsigned long end, nstart, nend;
struct vm_area_struct *vma = NULL; struct vm_area_struct *vma = NULL;
int locked = 0;
int ret = 0; int ret = 0;
VM_BUG_ON(start & ~PAGE_MASK); VM_BUG_ON(start & ~PAGE_MASK);
VM_BUG_ON(len != PAGE_ALIGN(len)); VM_BUG_ON(len != PAGE_ALIGN(len));
end = start + len; end = start + len;
down_read(&mm->mmap_sem);
for (nstart = start; nstart < end; nstart = nend) { for (nstart = start; nstart < end; nstart = nend) {
/* /*
* We want to fault in pages for [nstart; end) address range. * We want to fault in pages for [nstart; end) address range.
* Find first corresponding VMA. * Find first corresponding VMA.
*/ */
if (!vma) if (!locked) {
locked = 1;
down_read(&mm->mmap_sem);
vma = find_vma(mm, nstart); vma = find_vma(mm, nstart);
else } else if (nstart >= vma->vm_end)
vma = vma->vm_next; vma = vma->vm_next;
if (!vma || vma->vm_start >= end) if (!vma || vma->vm_start >= end)
break; break;
@ -457,19 +458,24 @@ static int do_mlock_pages(unsigned long start, size_t len, int ignore_errors)
if (nstart < vma->vm_start) if (nstart < vma->vm_start)
nstart = vma->vm_start; nstart = vma->vm_start;
/* /*
* Now fault in a range of pages within the first VMA. * Now fault in a range of pages. __mlock_vma_pages_range()
* double checks the vma flags, so that it won't mlock pages
* if the vma was already munlocked.
*/ */
ret = __mlock_vma_pages_range(vma, nstart, nend); ret = __mlock_vma_pages_range(vma, nstart, nend, &locked);
if (ret < 0 && ignore_errors) { if (ret < 0) {
ret = 0; if (ignore_errors) {
continue; /* continue at next VMA */ ret = 0;
} continue; /* continue at next VMA */
if (ret) { }
ret = __mlock_posix_error_return(ret); ret = __mlock_posix_error_return(ret);
break; break;
} }
nend = nstart + ret * PAGE_SIZE;
ret = 0;
} }
up_read(&mm->mmap_sem); if (locked)
up_read(&mm->mmap_sem);
return ret; /* 0 or negative error code */ return ret; /* 0 or negative error code */
} }

View File

@ -127,7 +127,8 @@ unsigned int kobjsize(const void *objp)
int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, int nr_pages, unsigned int foll_flags, unsigned long start, int nr_pages, unsigned int foll_flags,
struct page **pages, struct vm_area_struct **vmas) struct page **pages, struct vm_area_struct **vmas,
int *retry)
{ {
struct vm_area_struct *vma; struct vm_area_struct *vma;
unsigned long vm_flags; unsigned long vm_flags;
@ -185,7 +186,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
if (force) if (force)
flags |= FOLL_FORCE; flags |= FOLL_FORCE;
return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas); return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
NULL);
} }
EXPORT_SYMBOL(get_user_pages); EXPORT_SYMBOL(get_user_pages);