mm, madvise: fail with ENOMEM when splitting vma will hit max_map_count
If madvise(2) advice will result in the underlying vma being split and the
number of areas mapped by the process will exceed /proc/sys/vm/max_map_count
as a result, return ENOMEM instead of EAGAIN.

EAGAIN is returned by madvise(2) when a kernel resource, such as slab, is
temporarily unavailable. It indicates that userspace should retry the advice
in the near future. This is important for advice such as MADV_DONTNEED which
is often used by malloc implementations to free memory back to the system: we
really do want to free memory back when madvise(2) returns EAGAIN because
slab allocations (for vmas, anon_vmas, or mempolicies) cannot be allocated.

Encountering /proc/sys/vm/max_map_count is not a temporary failure, however,
so return ENOMEM to indicate this is a more serious issue. A followup patch
to the man page will specify this behavior.

Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1701241431120.42507@chino.kir.corp.google.com
Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jerome Marchand <jmarchan@redhat.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Michael Kerrisk <mtk.manpages@googlemail.com>
Cc: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

commit def5efe037 (parent 712c604dcd), committed by Linus Torvalds
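
Not part of the commit: a minimal userspace sketch of the error handling the
message above argues for, assuming a malloc-style allocator that releases
memory with MADV_DONTNEED. The helper name release_range() is invented for
illustration; it retries on EAGAIN (a transient kernel resource shortage) but
gives up on ENOMEM, which with this patch means the split would exceed
vm.max_map_count.

/*
 * Hypothetical sketch only: retry madvise(MADV_DONTNEED) on EAGAIN,
 * treat ENOMEM (and anything else) as a hard failure.
 */
#define _DEFAULT_SOURCE
#include <errno.h>
#include <stdio.h>
#include <sys/mman.h>

static int release_range(void *addr, size_t len)
{
        int tries = 3;

        while (madvise(addr, len, MADV_DONTNEED) != 0) {
                if (errno == EAGAIN && --tries > 0)
                        continue;       /* transient; retry a few times */
                return -errno;          /* ENOMEM or other: do not retry */
        }
        return 0;
}

int main(void)
{
        size_t len = 1 << 20;
        void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED)
                return 1;
        printf("release_range() returned %d\n", release_range(p, len));
        return 0;
}
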
Documentation/sysctl/vm.txt
@@ -376,8 +376,8 @@ max_map_count:
 
 This file contains the maximum number of memory map areas a process
 may have. Memory map areas are used as a side-effect of calling
-malloc, directly by mmap and mprotect, and also when loading shared
-libraries.
+malloc, directly by mmap, mprotect, and madvise, and also when loading
+shared libraries.
 
 While most applications need less than a thousand maps, certain
 programs, particularly malloc debuggers, may consume lots of them,
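
For orientation only (not from the patch): the limit documented above and a
process's current number of map areas can both be read from procfs. A rough
sketch:

/*
 * Illustrative only: print vm.max_map_count and the number of map areas
 * this process currently uses (one per complete line in /proc/self/maps).
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
        FILE *f;
        char buf[4096];
        long limit = -1, maps = 0;

        f = fopen("/proc/sys/vm/max_map_count", "r");
        if (f) {
                if (fscanf(f, "%ld", &limit) != 1)
                        limit = -1;
                fclose(f);
        }

        f = fopen("/proc/self/maps", "r");
        if (f) {
                while (fgets(buf, sizeof(buf), f))
                        if (strchr(buf, '\n'))  /* count complete lines only */
                                maps++;
                fclose(f);
        }

        printf("vm.max_map_count = %ld, current map areas = %ld\n",
               limit, maps);
        return 0;
}
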
Documentation/vm/ksm.txt
@@ -38,6 +38,10 @@ the range for whenever the KSM daemon is started; even if the range
 cannot contain any pages which KSM could actually merge; even if
 MADV_UNMERGEABLE is applied to a range which was never MADV_MERGEABLE.
 
+If a region of memory must be split into at least one new MADV_MERGEABLE
+or MADV_UNMERGEABLE region, the madvise may return ENOMEM if the process
+will exceed vm.max_map_count (see Documentation/sysctl/vm.txt).
+
 Like other madvise calls, they are intended for use on mapped areas of
 the user address space: they will report ENOMEM if the specified range
 includes unmapped gaps (though working on the intervening mapped areas),
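
A hedged illustration of the paragraph added to ksm.txt above (not from the
patch): advising only an interior sub-range forces the VMA to be split at
both ends, which is where a process already near vm.max_map_count can now see
ENOMEM. On kernels without CONFIG_KSM the call fails with EINVAL instead.

/*
 * Sketch only: MADV_MERGEABLE on pages [4, 12) of a 16-page mapping
 * splits the VMA; near vm.max_map_count this may now return ENOMEM.
 */
#define _DEFAULT_SOURCE
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        long page = sysconf(_SC_PAGESIZE);
        size_t len = 16 * page;
        char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED)
                return 1;

        /* Advise only the interior of the mapping: both ends get split. */
        if (madvise(p + 4 * page, 8 * page, MADV_MERGEABLE) != 0)
                fprintf(stderr, "madvise(MADV_MERGEABLE): %s\n",
                        strerror(errno));
        return 0;
}
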
include/linux/mm.h
@@ -2041,8 +2041,10 @@ extern struct vm_area_struct *vma_merge(struct mm_struct *,
         unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t,
         struct mempolicy *, struct vm_userfaultfd_ctx);
 extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *);
-extern int split_vma(struct mm_struct *,
-        struct vm_area_struct *, unsigned long addr, int new_below);
+extern int __split_vma(struct mm_struct *, struct vm_area_struct *,
+        unsigned long addr, int new_below);
+extern int split_vma(struct mm_struct *, struct vm_area_struct *,
+        unsigned long addr, int new_below);
 extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
 extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *,
         struct rb_node **, struct rb_node *);

mm/madvise.c (51 lines changed)
@@ -92,14 +92,28 @@ static long madvise_behavior(struct vm_area_struct *vma,
         case MADV_MERGEABLE:
         case MADV_UNMERGEABLE:
                 error = ksm_madvise(vma, start, end, behavior, &new_flags);
-                if (error)
+                if (error) {
+                        /*
+                         * madvise() returns EAGAIN if kernel resources, such as
+                         * slab, are temporarily unavailable.
+                         */
+                        if (error == -ENOMEM)
+                                error = -EAGAIN;
                         goto out;
+                }
                 break;
         case MADV_HUGEPAGE:
         case MADV_NOHUGEPAGE:
                 error = hugepage_madvise(vma, &new_flags, behavior);
-                if (error)
+                if (error) {
+                        /*
+                         * madvise() returns EAGAIN if kernel resources, such as
+                         * slab, are temporarily unavailable.
+                         */
+                        if (error == -ENOMEM)
+                                error = -EAGAIN;
                         goto out;
+                }
                 break;
         }
 
@@ -120,15 +134,37 @@ static long madvise_behavior(struct vm_area_struct *vma,
         *prev = vma;
 
         if (start != vma->vm_start) {
-                error = split_vma(mm, vma, start, 1);
-                if (error)
+                if (unlikely(mm->map_count >= sysctl_max_map_count)) {
+                        error = -ENOMEM;
                         goto out;
+                }
+                error = __split_vma(mm, vma, start, 1);
+                if (error) {
+                        /*
+                         * madvise() returns EAGAIN if kernel resources, such as
+                         * slab, are temporarily unavailable.
+                         */
+                        if (error == -ENOMEM)
+                                error = -EAGAIN;
+                        goto out;
+                }
         }
 
         if (end != vma->vm_end) {
-                error = split_vma(mm, vma, end, 0);
-                if (error)
+                if (unlikely(mm->map_count >= sysctl_max_map_count)) {
+                        error = -ENOMEM;
                         goto out;
+                }
+                error = __split_vma(mm, vma, end, 0);
+                if (error) {
+                        /*
+                         * madvise() returns EAGAIN if kernel resources, such as
+                         * slab, are temporarily unavailable.
+                         */
+                        if (error == -ENOMEM)
+                                error = -EAGAIN;
+                        goto out;
+                }
         }
 
 success:
@@ -136,10 +172,7 @@ success:
          * vm_flags is protected by the mmap_sem held in write mode.
          */
         vma->vm_flags = new_flags;
 
 out:
-        if (error == -ENOMEM)
-                error = -EAGAIN;
         return error;
 }
 

mm/mmap.c
@@ -2499,11 +2499,11 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
 }
 
 /*
- * __split_vma() bypasses sysctl_max_map_count checking. We use this on the
- * munmap path where it doesn't make sense to fail.
+ * __split_vma() bypasses sysctl_max_map_count checking. We use this where it
+ * has already been checked or doesn't make sense to fail.
  */
-static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
+int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
                unsigned long addr, int new_below)
 {
         struct vm_area_struct *new;
         int err;