2005-04-17 02:20:36 +04:00
/*
* hugetlbpage - backed filesystem . Based on ramfs .
*
2012-12-06 13:39:54 +04:00
* Nadia Yvette Chambers , 2002
2005-04-17 02:20:36 +04:00
*
* Copyright ( C ) 2002 Linus Torvalds .
2016-01-15 02:21:52 +03:00
* License : GPL
2005-04-17 02:20:36 +04:00
*/
2014-06-05 03:07:21 +04:00
# define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
2005-04-17 02:20:36 +04:00
# include <linux/thread_info.h>
# include <asm/current.h>
2015-09-09 01:01:54 +03:00
# include <linux/falloc.h>
2005-04-17 02:20:36 +04:00
# include <linux/fs.h>
# include <linux/mount.h>
# include <linux/file.h>
2007-07-16 10:40:52 +04:00
# include <linux/kernel.h>
2005-04-17 02:20:36 +04:00
# include <linux/writeback.h>
# include <linux/pagemap.h>
# include <linux/highmem.h>
# include <linux/init.h>
# include <linux/string.h>
2006-01-11 23:17:46 +03:00
# include <linux/capability.h>
2007-07-16 10:40:52 +04:00
# include <linux/ctype.h>
2005-04-17 02:20:36 +04:00
# include <linux/backing-dev.h>
# include <linux/hugetlb.h>
# include <linux/pagevec.h>
2018-11-02 02:07:26 +03:00
# include <linux/fs_parser.h>
2007-05-07 01:50:12 +04:00
# include <linux/mman.h>
2005-04-17 02:20:36 +04:00
# include <linux/slab.h>
# include <linux/dnotify.h>
# include <linux/statfs.h>
# include <linux/security.h>
2009-09-23 03:43:33 +04:00
# include <linux/magic.h>
2010-09-08 05:19:35 +04:00
# include <linux/migrate.h>
2015-04-03 18:31:35 +03:00
# include <linux/uio.h>
2005-04-17 02:20:36 +04:00
2016-12-24 22:46:01 +03:00
# include <linux/uaccess.h>
hugetlbfs: get unmapped area below TASK_UNMAPPED_BASE for hugetlbfs
In a 32-bit program, running on arm64 architecture. When the address
space below mmap base is completely exhausted, shmat() for huge pages will
return ENOMEM, but shmat() for normal pages can still success on no-legacy
mode. This seems not fair.
For normal pages, the calling trace of get_unmapped_area() is:
=> mm->get_unmapped_area()
if on legacy mode,
=> arch_get_unmapped_area()
=> vm_unmapped_area()
if on no-legacy mode,
=> arch_get_unmapped_area_topdown()
=> vm_unmapped_area()
For huge pages, the calling trace of get_unmapped_area() is:
=> file->f_op->get_unmapped_area()
=> hugetlb_get_unmapped_area()
=> vm_unmapped_area()
To solve this issue, we only need to make hugetlb_get_unmapped_area() take
the same way as mm->get_unmapped_area(). Add *bottomup() and *topdown()
for hugetlbfs, and check current mm->get_unmapped_area() to decide which
one to use. If mm->get_unmapped_area is equal to
arch_get_unmapped_area_topdown(), hugetlb_get_unmapped_area() calls
topdown routine, otherwise calls bottomup routine.
Reported-by: kbuild test robot <lkp@intel.com>
Signed-off-by: Shijie Hu <hushijie3@huawei.com>
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Will Deacon <will@kernel.org>
Cc: Xiaoming Ni <nixiaoming@huawei.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: yangerkun <yangerkun@huawei.com>
Cc: ChenGang <cg.chen@huawei.com>
Cc: Chen Jie <chenjie6@huawei.com>
Link: http://lkml.kernel.org/r/20200518065338.113664-1-hushijie3@huawei.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-06-04 02:03:34 +03:00
# include <linux/sched/mm.h>
2005-04-17 02:20:36 +04:00
2006-06-28 15:26:44 +04:00
static const struct address_space_operations hugetlbfs_aops ;
2006-03-28 13:56:42 +04:00
const struct file_operations hugetlbfs_file_operations ;
2007-02-12 11:55:39 +03:00
static const struct inode_operations hugetlbfs_dir_inode_operations ;
static const struct inode_operations hugetlbfs_inode_operations ;
2005-04-17 02:20:36 +04:00
2018-11-02 02:07:26 +03:00
enum hugetlbfs_size_type { NO_SIZE , SIZE_STD , SIZE_PERCENT } ;
struct hugetlbfs_fs_context {
2017-07-05 18:24:18 +03:00
struct hstate * hstate ;
2018-11-02 02:07:26 +03:00
unsigned long long max_size_opt ;
unsigned long long min_size_opt ;
2017-07-05 18:24:18 +03:00
long max_hpages ;
long nr_inodes ;
long min_hpages ;
2018-11-02 02:07:26 +03:00
enum hugetlbfs_size_type max_val_type ;
enum hugetlbfs_size_type min_val_type ;
2017-07-05 18:24:18 +03:00
kuid_t uid ;
kgid_t gid ;
umode_t mode ;
2012-03-22 03:34:12 +04:00
} ;
2005-04-17 02:20:36 +04:00
int sysctl_hugetlb_shm_group ;
2018-11-02 02:07:26 +03:00
enum hugetlb_param {
Opt_gid ,
Opt_min_size ,
Opt_mode ,
Opt_nr_inodes ,
Opt_pagesize ,
Opt_size ,
Opt_uid ,
2007-07-16 10:40:52 +04:00
} ;
2019-09-07 14:23:15 +03:00
static const struct fs_parameter_spec hugetlb_fs_parameters [ ] = {
2018-11-02 02:07:26 +03:00
fsparam_u32 ( " gid " , Opt_gid ) ,
fsparam_string ( " min_size " , Opt_min_size ) ,
2021-07-24 01:50:44 +03:00
fsparam_u32oct ( " mode " , Opt_mode ) ,
2018-11-02 02:07:26 +03:00
fsparam_string ( " nr_inodes " , Opt_nr_inodes ) ,
fsparam_string ( " pagesize " , Opt_pagesize ) ,
fsparam_string ( " size " , Opt_size ) ,
fsparam_u32 ( " uid " , Opt_uid ) ,
{ }
} ;
hugetlbfs: check for pgoff value overflow
A vma with vm_pgoff large enough to overflow a loff_t type when
converted to a byte offset can be passed via the remap_file_pages system
call. The hugetlbfs mmap routine uses the byte offset to calculate
reservations and file size.
A sequence such as:
mmap(0x20a00000, 0x600000, 0, 0x66033, -1, 0);
remap_file_pages(0x20a00000, 0x600000, 0, 0x20000000000000, 0);
will result in the following when task exits/file closed,
kernel BUG at mm/hugetlb.c:749!
Call Trace:
hugetlbfs_evict_inode+0x2f/0x40
evict+0xcb/0x190
__dentry_kill+0xcb/0x150
__fput+0x164/0x1e0
task_work_run+0x84/0xa0
exit_to_usermode_loop+0x7d/0x80
do_syscall_64+0x18b/0x190
entry_SYSCALL_64_after_hwframe+0x3d/0xa2
The overflowed pgoff value causes hugetlbfs to try to set up a mapping
with a negative range (end < start) that leaves invalid state which
causes the BUG.
The previous overflow fix to this code was incomplete and did not take
the remap_file_pages system call into account.
[mike.kravetz@oracle.com: v3]
Link: http://lkml.kernel.org/r/20180309002726.7248-1-mike.kravetz@oracle.com
[akpm@linux-foundation.org: include mmdebug.h]
[akpm@linux-foundation.org: fix -ve left shift count on sh]
Link: http://lkml.kernel.org/r/20180308210502.15952-1-mike.kravetz@oracle.com
Fixes: 045c7a3f53d9 ("hugetlbfs: fix offset overflow in hugetlbfs mmap")
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Reported-by: Nic Losby <blurbdust@gmail.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Yisheng Xie <xieyisheng1@huawei.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-03-23 02:17:13 +03:00
/*
* Mask used when checking the page offset value passed in via system
* calls . This value will be converted to a loff_t which is signed .
* Therefore , we want to check the upper PAGE_SHIFT + 1 bits of the
* value . The extra bit ( - 1 in the shift value ) is to take the sign
* bit into account .
*/
# define PGOFF_LOFFT_MAX \
( ( ( 1UL < < ( PAGE_SHIFT + 1 ) ) - 1 ) < < ( BITS_PER_LONG - ( PAGE_SHIFT + 1 ) ) )
2005-04-17 02:20:36 +04:00
static int hugetlbfs_file_mmap ( struct file * file , struct vm_area_struct * vma )
{
2013-01-24 02:07:38 +04:00
struct inode * inode = file_inode ( file ) ;
2021-05-15 03:27:04 +03:00
struct hugetlbfs_inode_info * info = HUGETLBFS_I ( inode ) ;
2005-04-17 02:20:36 +04:00
loff_t len , vma_len ;
int ret ;
2008-07-24 08:27:41 +04:00
struct hstate * h = hstate_file ( file ) ;
2005-04-17 02:20:36 +04:00
[PATCH] hugetlb: prepare_hugepage_range check offset too
(David:)
If hugetlbfs_file_mmap() returns a failure to do_mmap_pgoff() - for example,
because the given file offset is not hugepage aligned - then do_mmap_pgoff
will go to the unmap_and_free_vma backout path.
But at this stage the vma hasn't been marked as hugepage, and the backout path
will call unmap_region() on it. That will eventually call down to the
non-hugepage version of unmap_page_range(). On ppc64, at least, that will
cause serious problems if there are any existing hugepage pagetable entries in
the vicinity - for example if there are any other hugepage mappings under the
same PUD. unmap_page_range() will trigger a bad_pud() on the hugepage pud
entries. I suspect this will also cause bad problems on ia64, though I don't
have a machine to test it on.
(Hugh:)
prepare_hugepage_range() should check file offset alignment when it checks
virtual address and length, to stop MAP_FIXED with a bad huge offset from
unmapping before it fails further down. PowerPC should apply the same
prepare_hugepage_range alignment checks as ia64 and all the others do.
Then none of the alignment checks in hugetlbfs_file_mmap are required (nor
is the check for too small a mapping); but even so, move up setting of
VM_HUGETLB and add a comment to warn of what David Gibson discovered - if
hugetlbfs_file_mmap fails before setting it, do_mmap_pgoff's unmap_region
when unwinding from error will go the non-huge way, which may cause bad
behaviour on architectures (powerpc and ia64) which segregate their huge
mappings into a separate region of the address space.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: "David S. Miller" <davem@davemloft.net>
Acked-by: Adam Litke <agl@us.ibm.com>
Acked-by: David Gibson <david@gibson.dropbear.id.au>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-11-14 13:03:32 +03:00
/*
2007-08-31 10:56:40 +04:00
* vma address alignment ( but not the pgoff alignment ) has
* already been checked by prepare_hugepage_range . If you add
* any error returns here , do so after setting VM_HUGETLB , so
* is_vm_hugetlb_page tests below unmap_region go the right
2020-08-07 09:23:37 +03:00
* way when do_mmap unwinds ( may be important on powerpc
2007-08-31 10:56:40 +04:00
* and ia64 ) .
[PATCH] hugetlb: prepare_hugepage_range check offset too
(David:)
If hugetlbfs_file_mmap() returns a failure to do_mmap_pgoff() - for example,
because the given file offset is not hugepage aligned - then do_mmap_pgoff
will go to the unmap_and_free_vma backout path.
But at this stage the vma hasn't been marked as hugepage, and the backout path
will call unmap_region() on it. That will eventually call down to the
non-hugepage version of unmap_page_range(). On ppc64, at least, that will
cause serious problems if there are any existing hugepage pagetable entries in
the vicinity - for example if there are any other hugepage mappings under the
same PUD. unmap_page_range() will trigger a bad_pud() on the hugepage pud
entries. I suspect this will also cause bad problems on ia64, though I don't
have a machine to test it on.
(Hugh:)
prepare_hugepage_range() should check file offset alignment when it checks
virtual address and length, to stop MAP_FIXED with a bad huge offset from
unmapping before it fails further down. PowerPC should apply the same
prepare_hugepage_range alignment checks as ia64 and all the others do.
Then none of the alignment checks in hugetlbfs_file_mmap are required (nor
is the check for too small a mapping); but even so, move up setting of
VM_HUGETLB and add a comment to warn of what David Gibson discovered - if
hugetlbfs_file_mmap fails before setting it, do_mmap_pgoff's unmap_region
when unwinding from error will go the non-huge way, which may cause bad
behaviour on architectures (powerpc and ia64) which segregate their huge
mappings into a separate region of the address space.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: "David S. Miller" <davem@davemloft.net>
Acked-by: Adam Litke <agl@us.ibm.com>
Acked-by: David Gibson <david@gibson.dropbear.id.au>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-11-14 13:03:32 +03:00
*/
2023-01-26 22:37:49 +03:00
vm_flags_set ( vma , VM_HUGETLB | VM_DONTEXPAND ) ;
[PATCH] hugetlb: prepare_hugepage_range check offset too
(David:)
If hugetlbfs_file_mmap() returns a failure to do_mmap_pgoff() - for example,
because the given file offset is not hugepage aligned - then do_mmap_pgoff
will go to the unmap_and_free_vma backout path.
But at this stage the vma hasn't been marked as hugepage, and the backout path
will call unmap_region() on it. That will eventually call down to the
non-hugepage version of unmap_page_range(). On ppc64, at least, that will
cause serious problems if there are any existing hugepage pagetable entries in
the vicinity - for example if there are any other hugepage mappings under the
same PUD. unmap_page_range() will trigger a bad_pud() on the hugepage pud
entries. I suspect this will also cause bad problems on ia64, though I don't
have a machine to test it on.
(Hugh:)
prepare_hugepage_range() should check file offset alignment when it checks
virtual address and length, to stop MAP_FIXED with a bad huge offset from
unmapping before it fails further down. PowerPC should apply the same
prepare_hugepage_range alignment checks as ia64 and all the others do.
Then none of the alignment checks in hugetlbfs_file_mmap are required (nor
is the check for too small a mapping); but even so, move up setting of
VM_HUGETLB and add a comment to warn of what David Gibson discovered - if
hugetlbfs_file_mmap fails before setting it, do_mmap_pgoff's unmap_region
when unwinding from error will go the non-huge way, which may cause bad
behaviour on architectures (powerpc and ia64) which segregate their huge
mappings into a separate region of the address space.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: "David S. Miller" <davem@davemloft.net>
Acked-by: Adam Litke <agl@us.ibm.com>
Acked-by: David Gibson <david@gibson.dropbear.id.au>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-11-14 13:03:32 +03:00
vma - > vm_ops = & hugetlb_vm_ops ;
2005-04-17 02:20:36 +04:00
2023-10-12 20:04:29 +03:00
ret = seal_check_write ( info - > seals , vma ) ;
2021-05-15 03:27:04 +03:00
if ( ret )
return ret ;
2017-04-14 00:56:32 +03:00
/*
hugetlbfs: check for pgoff value overflow
A vma with vm_pgoff large enough to overflow a loff_t type when
converted to a byte offset can be passed via the remap_file_pages system
call. The hugetlbfs mmap routine uses the byte offset to calculate
reservations and file size.
A sequence such as:
mmap(0x20a00000, 0x600000, 0, 0x66033, -1, 0);
remap_file_pages(0x20a00000, 0x600000, 0, 0x20000000000000, 0);
will result in the following when task exits/file closed,
kernel BUG at mm/hugetlb.c:749!
Call Trace:
hugetlbfs_evict_inode+0x2f/0x40
evict+0xcb/0x190
__dentry_kill+0xcb/0x150
__fput+0x164/0x1e0
task_work_run+0x84/0xa0
exit_to_usermode_loop+0x7d/0x80
do_syscall_64+0x18b/0x190
entry_SYSCALL_64_after_hwframe+0x3d/0xa2
The overflowed pgoff value causes hugetlbfs to try to set up a mapping
with a negative range (end < start) that leaves invalid state which
causes the BUG.
The previous overflow fix to this code was incomplete and did not take
the remap_file_pages system call into account.
[mike.kravetz@oracle.com: v3]
Link: http://lkml.kernel.org/r/20180309002726.7248-1-mike.kravetz@oracle.com
[akpm@linux-foundation.org: include mmdebug.h]
[akpm@linux-foundation.org: fix -ve left shift count on sh]
Link: http://lkml.kernel.org/r/20180308210502.15952-1-mike.kravetz@oracle.com
Fixes: 045c7a3f53d9 ("hugetlbfs: fix offset overflow in hugetlbfs mmap")
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Reported-by: Nic Losby <blurbdust@gmail.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Yisheng Xie <xieyisheng1@huawei.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-03-23 02:17:13 +03:00
* page based offset in vm_pgoff could be sufficiently large to
2018-04-06 02:18:21 +03:00
* overflow a loff_t when converted to byte offset . This can
* only happen on architectures where sizeof ( loff_t ) = =
* sizeof ( unsigned long ) . So , only check in those instances .
2017-04-14 00:56:32 +03:00
*/
2018-04-06 02:18:21 +03:00
if ( sizeof ( unsigned long ) = = sizeof ( loff_t ) ) {
if ( vma - > vm_pgoff & PGOFF_LOFFT_MAX )
return - EINVAL ;
}
2017-04-14 00:56:32 +03:00
hugetlbfs: check for pgoff value overflow
A vma with vm_pgoff large enough to overflow a loff_t type when
converted to a byte offset can be passed via the remap_file_pages system
call. The hugetlbfs mmap routine uses the byte offset to calculate
reservations and file size.
A sequence such as:
mmap(0x20a00000, 0x600000, 0, 0x66033, -1, 0);
remap_file_pages(0x20a00000, 0x600000, 0, 0x20000000000000, 0);
will result in the following when task exits/file closed,
kernel BUG at mm/hugetlb.c:749!
Call Trace:
hugetlbfs_evict_inode+0x2f/0x40
evict+0xcb/0x190
__dentry_kill+0xcb/0x150
__fput+0x164/0x1e0
task_work_run+0x84/0xa0
exit_to_usermode_loop+0x7d/0x80
do_syscall_64+0x18b/0x190
entry_SYSCALL_64_after_hwframe+0x3d/0xa2
The overflowed pgoff value causes hugetlbfs to try to set up a mapping
with a negative range (end < start) that leaves invalid state which
causes the BUG.
The previous overflow fix to this code was incomplete and did not take
the remap_file_pages system call into account.
[mike.kravetz@oracle.com: v3]
Link: http://lkml.kernel.org/r/20180309002726.7248-1-mike.kravetz@oracle.com
[akpm@linux-foundation.org: include mmdebug.h]
[akpm@linux-foundation.org: fix -ve left shift count on sh]
Link: http://lkml.kernel.org/r/20180308210502.15952-1-mike.kravetz@oracle.com
Fixes: 045c7a3f53d9 ("hugetlbfs: fix offset overflow in hugetlbfs mmap")
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Reported-by: Nic Losby <blurbdust@gmail.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Yisheng Xie <xieyisheng1@huawei.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-03-23 02:17:13 +03:00
/* must be huge page aligned */
2011-07-26 04:11:49 +04:00
if ( vma - > vm_pgoff & ( ~ huge_page_mask ( h ) > > PAGE_SHIFT ) )
2007-08-31 10:56:40 +04:00
return - EINVAL ;
2005-04-17 02:20:36 +04:00
vma_len = ( loff_t ) ( vma - > vm_end - vma - > vm_start ) ;
2017-04-14 00:56:32 +03:00
len = vma_len + ( ( loff_t ) vma - > vm_pgoff < < PAGE_SHIFT ) ;
/* check for overflow */
if ( len < vma_len )
return - EINVAL ;
2005-04-17 02:20:36 +04:00
2016-01-22 23:40:57 +03:00
inode_lock ( inode ) ;
2005-04-17 02:20:36 +04:00
file_accessed ( file ) ;
ret = - ENOMEM ;
2021-02-24 23:09:54 +03:00
if ( ! hugetlb_reserve_pages ( inode ,
2008-07-24 08:27:41 +04:00
vma - > vm_pgoff > > huge_page_order ( h ) ,
2009-02-10 17:02:27 +03:00
len > > huge_page_shift ( h ) , vma ,
vma - > vm_flags ) )
2006-06-23 13:03:15 +04:00
goto out ;
[PATCH] hugepage: Strict page reservation for hugepage inodes
These days, hugepages are demand-allocated at first fault time. There's a
somewhat dubious (and racy) heuristic when making a new mmap() to check if
there are enough available hugepages to fully satisfy that mapping.
A particularly obvious case where the heuristic breaks down is where a
process maps its hugepages not as a single chunk, but as a bunch of
individually mmap()ed (or shmat()ed) blocks without touching and
instantiating the pages in between allocations. In this case the size of
each block is compared against the total number of available hugepages.
It's thus easy for the process to become overcommitted, because each block
mapping will succeed, although the total number of hugepages required by
all blocks exceeds the number available. In particular, this defeats such
a program which will detect a mapping failure and adjust its hugepage usage
downward accordingly.
The patch below addresses this problem, by strictly reserving a number of
physical hugepages for hugepage inodes which have been mapped, but not
instatiated. MAP_SHARED mappings are thus "safe" - they will fail on
mmap(), not later with an OOM SIGKILL. MAP_PRIVATE mappings can still
trigger an OOM. (Actually SHARED mappings can technically still OOM, but
only if the sysadmin explicitly reduces the hugepage pool between mapping
and instantiation)
This patch appears to address the problem at hand - it allows DB2 to start
correctly, for instance, which previously suffered the failure described
above.
This patch causes no regressions on the libhugetblfs testsuite, and makes a
test (designed to catch this problem) pass which previously failed (ppc64,
POWER5).
Signed-off-by: David Gibson <dwg@au1.ibm.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-22 11:08:55 +03:00
2005-10-30 04:16:46 +03:00
ret = 0 ;
[PATCH] mmap zero-length hugetlb file with PROT_NONE to protect a hugetlb virtual area
Sometimes, applications need below call to be successful although
"/mnt/hugepages/file1" doesn't exist.
fd = open("/mnt/hugepages/file1", O_CREAT|O_RDWR, 0755);
*addr = mmap(NULL, 0x1024*1024*256, PROT_NONE, 0, fd, 0);
As for regular pages (or files), above call does work, but as for huge
pages, above call would fail because hugetlbfs_file_mmap would fail if
(!(vma->vm_flags & VM_WRITE) && len > inode->i_size).
This capability on huge page is useful on ia64 when the process wants to
protect one area on region 4, so other threads couldn't read/write this
area. A famous JVM (Java Virtual Machine) implementation on IA64 needs the
capability.
Signed-off-by: Zhang Yanmin <yanmin.zhang@intel.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Cc: Hugh Dickins <hugh@veritas.com>
[ Expand-on-mmap semantics again... this time matching normal fs's. wli ]
Acked-by: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-07-10 15:44:49 +04:00
if ( vma - > vm_flags & VM_WRITE & & inode - > i_size < len )
2017-04-14 00:56:32 +03:00
i_size_write ( inode , len ) ;
2005-04-17 02:20:36 +04:00
out :
2016-01-22 23:40:57 +03:00
inode_unlock ( inode ) ;
2005-04-17 02:20:36 +04:00
return ret ;
}
/*
2020-06-09 07:33:51 +03:00
* Called under mmap_write_lock ( mm ) .
2005-04-17 02:20:36 +04:00
*/
hugetlbfs: get unmapped area below TASK_UNMAPPED_BASE for hugetlbfs
In a 32-bit program, running on arm64 architecture. When the address
space below mmap base is completely exhausted, shmat() for huge pages will
return ENOMEM, but shmat() for normal pages can still success on no-legacy
mode. This seems not fair.
For normal pages, the calling trace of get_unmapped_area() is:
=> mm->get_unmapped_area()
if on legacy mode,
=> arch_get_unmapped_area()
=> vm_unmapped_area()
if on no-legacy mode,
=> arch_get_unmapped_area_topdown()
=> vm_unmapped_area()
For huge pages, the calling trace of get_unmapped_area() is:
=> file->f_op->get_unmapped_area()
=> hugetlb_get_unmapped_area()
=> vm_unmapped_area()
To solve this issue, we only need to make hugetlb_get_unmapped_area() take
the same way as mm->get_unmapped_area(). Add *bottomup() and *topdown()
for hugetlbfs, and check current mm->get_unmapped_area() to decide which
one to use. If mm->get_unmapped_area is equal to
arch_get_unmapped_area_topdown(), hugetlb_get_unmapped_area() calls
topdown routine, otherwise calls bottomup routine.
Reported-by: kbuild test robot <lkp@intel.com>
Signed-off-by: Shijie Hu <hushijie3@huawei.com>
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Will Deacon <will@kernel.org>
Cc: Xiaoming Ni <nixiaoming@huawei.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: yangerkun <yangerkun@huawei.com>
Cc: ChenGang <cg.chen@huawei.com>
Cc: Chen Jie <chenjie6@huawei.com>
Link: http://lkml.kernel.org/r/20200518065338.113664-1-hushijie3@huawei.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-06-04 02:03:34 +03:00
static unsigned long
hugetlb_get_unmapped_area_bottomup ( struct file * file , unsigned long addr ,
unsigned long len , unsigned long pgoff , unsigned long flags )
{
struct hstate * h = hstate_file ( file ) ;
struct vm_unmapped_area_info info ;
info . flags = 0 ;
info . length = len ;
info . low_limit = current - > mm - > mmap_base ;
2022-04-09 20:17:28 +03:00
info . high_limit = arch_get_mmap_end ( addr , len , flags ) ;
hugetlbfs: get unmapped area below TASK_UNMAPPED_BASE for hugetlbfs
In a 32-bit program, running on arm64 architecture. When the address
space below mmap base is completely exhausted, shmat() for huge pages will
return ENOMEM, but shmat() for normal pages can still success on no-legacy
mode. This seems not fair.
For normal pages, the calling trace of get_unmapped_area() is:
=> mm->get_unmapped_area()
if on legacy mode,
=> arch_get_unmapped_area()
=> vm_unmapped_area()
if on no-legacy mode,
=> arch_get_unmapped_area_topdown()
=> vm_unmapped_area()
For huge pages, the calling trace of get_unmapped_area() is:
=> file->f_op->get_unmapped_area()
=> hugetlb_get_unmapped_area()
=> vm_unmapped_area()
To solve this issue, we only need to make hugetlb_get_unmapped_area() take
the same way as mm->get_unmapped_area(). Add *bottomup() and *topdown()
for hugetlbfs, and check current mm->get_unmapped_area() to decide which
one to use. If mm->get_unmapped_area is equal to
arch_get_unmapped_area_topdown(), hugetlb_get_unmapped_area() calls
topdown routine, otherwise calls bottomup routine.
Reported-by: kbuild test robot <lkp@intel.com>
Signed-off-by: Shijie Hu <hushijie3@huawei.com>
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Will Deacon <will@kernel.org>
Cc: Xiaoming Ni <nixiaoming@huawei.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: yangerkun <yangerkun@huawei.com>
Cc: ChenGang <cg.chen@huawei.com>
Cc: Chen Jie <chenjie6@huawei.com>
Link: http://lkml.kernel.org/r/20200518065338.113664-1-hushijie3@huawei.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-06-04 02:03:34 +03:00
info . align_mask = PAGE_MASK & ~ huge_page_mask ( h ) ;
info . align_offset = 0 ;
return vm_unmapped_area ( & info ) ;
}
static unsigned long
hugetlb_get_unmapped_area_topdown ( struct file * file , unsigned long addr ,
unsigned long len , unsigned long pgoff , unsigned long flags )
{
struct hstate * h = hstate_file ( file ) ;
struct vm_unmapped_area_info info ;
info . flags = VM_UNMAPPED_AREA_TOPDOWN ;
info . length = len ;
2023-04-19 00:40:09 +03:00
info . low_limit = PAGE_SIZE ;
2022-04-22 02:35:46 +03:00
info . high_limit = arch_get_mmap_base ( addr , current - > mm - > mmap_base ) ;
hugetlbfs: get unmapped area below TASK_UNMAPPED_BASE for hugetlbfs
In a 32-bit program, running on arm64 architecture. When the address
space below mmap base is completely exhausted, shmat() for huge pages will
return ENOMEM, but shmat() for normal pages can still success on no-legacy
mode. This seems not fair.
For normal pages, the calling trace of get_unmapped_area() is:
=> mm->get_unmapped_area()
if on legacy mode,
=> arch_get_unmapped_area()
=> vm_unmapped_area()
if on no-legacy mode,
=> arch_get_unmapped_area_topdown()
=> vm_unmapped_area()
For huge pages, the calling trace of get_unmapped_area() is:
=> file->f_op->get_unmapped_area()
=> hugetlb_get_unmapped_area()
=> vm_unmapped_area()
To solve this issue, we only need to make hugetlb_get_unmapped_area() take
the same way as mm->get_unmapped_area(). Add *bottomup() and *topdown()
for hugetlbfs, and check current mm->get_unmapped_area() to decide which
one to use. If mm->get_unmapped_area is equal to
arch_get_unmapped_area_topdown(), hugetlb_get_unmapped_area() calls
topdown routine, otherwise calls bottomup routine.
Reported-by: kbuild test robot <lkp@intel.com>
Signed-off-by: Shijie Hu <hushijie3@huawei.com>
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Will Deacon <will@kernel.org>
Cc: Xiaoming Ni <nixiaoming@huawei.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: yangerkun <yangerkun@huawei.com>
Cc: ChenGang <cg.chen@huawei.com>
Cc: Chen Jie <chenjie6@huawei.com>
Link: http://lkml.kernel.org/r/20200518065338.113664-1-hushijie3@huawei.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-06-04 02:03:34 +03:00
info . align_mask = PAGE_MASK & ~ huge_page_mask ( h ) ;
info . align_offset = 0 ;
addr = vm_unmapped_area ( & info ) ;
/*
* A failed mmap ( ) very likely causes application failure ,
* so fall back to the bottom - up function here . This scenario
* can happen with large stack limits and large mmap ( )
* allocations .
*/
if ( unlikely ( offset_in_page ( addr ) ) ) {
VM_BUG_ON ( addr ! = - ENOMEM ) ;
info . flags = 0 ;
info . low_limit = current - > mm - > mmap_base ;
2022-04-09 20:17:28 +03:00
info . high_limit = arch_get_mmap_end ( addr , len , flags ) ;
hugetlbfs: get unmapped area below TASK_UNMAPPED_BASE for hugetlbfs
In a 32-bit program, running on arm64 architecture. When the address
space below mmap base is completely exhausted, shmat() for huge pages will
return ENOMEM, but shmat() for normal pages can still success on no-legacy
mode. This seems not fair.
For normal pages, the calling trace of get_unmapped_area() is:
=> mm->get_unmapped_area()
if on legacy mode,
=> arch_get_unmapped_area()
=> vm_unmapped_area()
if on no-legacy mode,
=> arch_get_unmapped_area_topdown()
=> vm_unmapped_area()
For huge pages, the calling trace of get_unmapped_area() is:
=> file->f_op->get_unmapped_area()
=> hugetlb_get_unmapped_area()
=> vm_unmapped_area()
To solve this issue, we only need to make hugetlb_get_unmapped_area() take
the same way as mm->get_unmapped_area(). Add *bottomup() and *topdown()
for hugetlbfs, and check current mm->get_unmapped_area() to decide which
one to use. If mm->get_unmapped_area is equal to
arch_get_unmapped_area_topdown(), hugetlb_get_unmapped_area() calls
topdown routine, otherwise calls bottomup routine.
Reported-by: kbuild test robot <lkp@intel.com>
Signed-off-by: Shijie Hu <hushijie3@huawei.com>
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Will Deacon <will@kernel.org>
Cc: Xiaoming Ni <nixiaoming@huawei.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: yangerkun <yangerkun@huawei.com>
Cc: ChenGang <cg.chen@huawei.com>
Cc: Chen Jie <chenjie6@huawei.com>
Link: http://lkml.kernel.org/r/20200518065338.113664-1-hushijie3@huawei.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-06-04 02:03:34 +03:00
addr = vm_unmapped_area ( & info ) ;
}
return addr ;
}
2022-04-09 20:17:27 +03:00
unsigned long
generic_hugetlb_get_unmapped_area ( struct file * file , unsigned long addr ,
unsigned long len , unsigned long pgoff ,
unsigned long flags )
2005-04-17 02:20:36 +04:00
{
struct mm_struct * mm = current - > mm ;
struct vm_area_struct * vma ;
2008-07-24 08:27:41 +04:00
struct hstate * h = hstate_file ( file ) ;
2022-04-09 20:17:28 +03:00
const unsigned long mmap_end = arch_get_mmap_end ( addr , len , flags ) ;
2005-04-17 02:20:36 +04:00
2008-07-24 08:27:41 +04:00
if ( len & ~ huge_page_mask ( h ) )
2005-04-17 02:20:36 +04:00
return - EINVAL ;
if ( len > TASK_SIZE )
return - ENOMEM ;
2007-05-07 01:50:12 +04:00
if ( flags & MAP_FIXED ) {
2008-07-24 08:27:41 +04:00
if ( prepare_hugepage_range ( file , addr , len ) )
2007-05-07 01:50:12 +04:00
return - EINVAL ;
return addr ;
}
2005-04-17 02:20:36 +04:00
if ( addr ) {
2008-07-24 08:27:41 +04:00
addr = ALIGN ( addr , huge_page_size ( h ) ) ;
2005-04-17 02:20:36 +04:00
vma = find_vma ( mm , addr ) ;
2022-04-22 02:35:46 +03:00
if ( mmap_end - len > = addr & &
mm: larger stack guard gap, between vmas
Stack guard page is a useful feature to reduce a risk of stack smashing
into a different mapping. We have been using a single page gap which
is sufficient to prevent having stack adjacent to a different mapping.
But this seems to be insufficient in the light of the stack usage in
userspace. E.g. glibc uses as large as 64kB alloca() in many commonly
used functions. Others use constructs liks gid_t buffer[NGROUPS_MAX]
which is 256kB or stack strings with MAX_ARG_STRLEN.
This will become especially dangerous for suid binaries and the default
no limit for the stack size limit because those applications can be
tricked to consume a large portion of the stack and a single glibc call
could jump over the guard page. These attacks are not theoretical,
unfortunatelly.
Make those attacks less probable by increasing the stack guard gap
to 1MB (on systems with 4k pages; but make it depend on the page size
because systems with larger base pages might cap stack allocations in
the PAGE_SIZE units) which should cover larger alloca() and VLA stack
allocations. It is obviously not a full fix because the problem is
somehow inherent, but it should reduce attack space a lot.
One could argue that the gap size should be configurable from userspace,
but that can be done later when somebody finds that the new 1MB is wrong
for some special case applications. For now, add a kernel command line
option (stack_guard_gap) to specify the stack gap size (in page units).
Implementation wise, first delete all the old code for stack guard page:
because although we could get away with accounting one extra page in a
stack vma, accounting a larger gap can break userspace - case in point,
a program run with "ulimit -S -v 20000" failed when the 1MB gap was
counted for RLIMIT_AS; similar problems could come with RLIMIT_MLOCK
and strict non-overcommit mode.
Instead of keeping gap inside the stack vma, maintain the stack guard
gap as a gap between vmas: using vm_start_gap() in place of vm_start
(or vm_end_gap() in place of vm_end if VM_GROWSUP) in just those few
places which need to respect the gap - mainly arch_get_unmapped_area(),
and and the vma tree's subtree_gap support for that.
Original-patch-by: Oleg Nesterov <oleg@redhat.com>
Original-patch-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Tested-by: Helge Deller <deller@gmx.de> # parisc
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-06-19 14:03:24 +03:00
( ! vma | | addr + len < = vm_start_gap ( vma ) ) )
2005-04-17 02:20:36 +04:00
return addr ;
}
hugetlbfs: get unmapped area below TASK_UNMAPPED_BASE for hugetlbfs
In a 32-bit program, running on arm64 architecture. When the address
space below mmap base is completely exhausted, shmat() for huge pages will
return ENOMEM, but shmat() for normal pages can still success on no-legacy
mode. This seems not fair.
For normal pages, the calling trace of get_unmapped_area() is:
=> mm->get_unmapped_area()
if on legacy mode,
=> arch_get_unmapped_area()
=> vm_unmapped_area()
if on no-legacy mode,
=> arch_get_unmapped_area_topdown()
=> vm_unmapped_area()
For huge pages, the calling trace of get_unmapped_area() is:
=> file->f_op->get_unmapped_area()
=> hugetlb_get_unmapped_area()
=> vm_unmapped_area()
To solve this issue, we only need to make hugetlb_get_unmapped_area() take
the same way as mm->get_unmapped_area(). Add *bottomup() and *topdown()
for hugetlbfs, and check current mm->get_unmapped_area() to decide which
one to use. If mm->get_unmapped_area is equal to
arch_get_unmapped_area_topdown(), hugetlb_get_unmapped_area() calls
topdown routine, otherwise calls bottomup routine.
Reported-by: kbuild test robot <lkp@intel.com>
Signed-off-by: Shijie Hu <hushijie3@huawei.com>
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Will Deacon <will@kernel.org>
Cc: Xiaoming Ni <nixiaoming@huawei.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: yangerkun <yangerkun@huawei.com>
Cc: ChenGang <cg.chen@huawei.com>
Cc: Chen Jie <chenjie6@huawei.com>
Link: http://lkml.kernel.org/r/20200518065338.113664-1-hushijie3@huawei.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-06-04 02:03:34 +03:00
/*
* Use mm - > get_unmapped_area value as a hint to use topdown routine .
* If architectures have special needs , they should define their own
* version of hugetlb_get_unmapped_area .
*/
if ( mm - > get_unmapped_area = = arch_get_unmapped_area_topdown )
return hugetlb_get_unmapped_area_topdown ( file , addr , len ,
pgoff , flags ) ;
return hugetlb_get_unmapped_area_bottomup ( file , addr , len ,
pgoff , flags ) ;
2005-04-17 02:20:36 +04:00
}
2022-04-09 20:17:27 +03:00
# ifndef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
static unsigned long
hugetlb_get_unmapped_area ( struct file * file , unsigned long addr ,
unsigned long len , unsigned long pgoff ,
unsigned long flags )
{
return generic_hugetlb_get_unmapped_area ( file , addr , len , pgoff , flags ) ;
}
2005-04-17 02:20:36 +04:00
# endif
2023-07-13 03:18:32 +03:00
/*
* Someone wants to read @ bytes from a HWPOISON hugetlb @ page from @ offset .
* Returns the maximum number of bytes one can read without touching the 1 st raw
* HWPOISON subpage .
*
* The implementation borrows the iteration logic from copy_page_to_iter * .
*/
static size_t adjust_range_hwpoison ( struct page * page , size_t offset , size_t bytes )
{
size_t n = 0 ;
size_t res = 0 ;
/* First subpage to start the loop. */
2023-09-13 23:12:47 +03:00
page = nth_page ( page , offset / PAGE_SIZE ) ;
2023-07-13 03:18:32 +03:00
offset % = PAGE_SIZE ;
while ( 1 ) {
if ( is_raw_hwpoison_page_in_hugepage ( page ) )
break ;
/* Safe to read n bytes without touching HWPOISON subpage. */
n = min ( bytes , ( size_t ) PAGE_SIZE - offset ) ;
res + = n ;
bytes - = n ;
if ( ! bytes | | ! n )
break ;
offset + = n ;
if ( offset = = PAGE_SIZE ) {
2023-09-13 23:12:47 +03:00
page = nth_page ( page , 1 ) ;
2023-07-13 03:18:32 +03:00
offset = 0 ;
}
}
return res ;
}
2007-10-16 12:26:22 +04:00
/*
* Support for read ( ) - Find the page attached to f_mapping and copy out the
2022-07-26 17:29:17 +03:00
* data . This provides functionality similar to filemap_read ( ) .
2007-10-16 12:26:22 +04:00
*/
2015-04-03 18:31:35 +03:00
static ssize_t hugetlbfs_read_iter ( struct kiocb * iocb , struct iov_iter * to )
2007-10-16 12:26:22 +04:00
{
2015-04-03 18:31:35 +03:00
struct file * file = iocb - > ki_filp ;
struct hstate * h = hstate_file ( file ) ;
struct address_space * mapping = file - > f_mapping ;
2007-10-16 12:26:22 +04:00
struct inode * inode = mapping - > host ;
2015-04-03 18:31:35 +03:00
unsigned long index = iocb - > ki_pos > > huge_page_shift ( h ) ;
unsigned long offset = iocb - > ki_pos & ~ huge_page_mask ( h ) ;
2007-10-16 12:26:22 +04:00
unsigned long end_index ;
loff_t isize ;
ssize_t retval = 0 ;
2015-04-03 18:31:35 +03:00
while ( iov_iter_count ( to ) ) {
2023-09-26 22:20:17 +03:00
struct folio * folio ;
2023-07-13 03:18:32 +03:00
size_t nr , copied , want ;
2007-10-16 12:26:22 +04:00
/* nr is the maximum number of bytes to copy from this page */
2008-07-24 08:27:41 +04:00
nr = huge_page_size ( h ) ;
2012-03-22 03:34:08 +04:00
isize = i_size_read ( inode ) ;
if ( ! isize )
2015-04-03 18:31:35 +03:00
break ;
2012-03-22 03:34:08 +04:00
end_index = ( isize - 1 ) > > huge_page_shift ( h ) ;
2015-04-03 18:31:35 +03:00
if ( index > end_index )
break ;
if ( index = = end_index ) {
2008-07-24 08:27:41 +04:00
nr = ( ( isize - 1 ) & ~ huge_page_mask ( h ) ) + 1 ;
2012-03-22 03:34:08 +04:00
if ( nr < = offset )
2015-04-03 18:31:35 +03:00
break ;
2007-10-16 12:26:22 +04:00
}
nr = nr - offset ;
2023-09-26 22:20:17 +03:00
/* Find the folio */
folio = filemap_lock_hugetlb_folio ( h , mapping , index ) ;
if ( IS_ERR ( folio ) ) {
2007-10-16 12:26:22 +04:00
/*
* We have a HOLE , zero out the user - buffer for the
* length of the hole or request .
*/
2015-04-03 18:31:35 +03:00
copied = iov_iter_zero ( nr , to ) ;
2007-10-16 12:26:22 +04:00
} else {
2023-09-26 22:20:17 +03:00
folio_unlock ( folio ) ;
2012-03-22 03:34:08 +04:00
2023-09-26 22:20:17 +03:00
if ( ! folio_test_has_hwpoisoned ( folio ) )
2023-07-13 03:18:32 +03:00
want = nr ;
else {
/*
* Adjust how many bytes safe to read without
* touching the 1 st raw HWPOISON subpage after
* offset .
*/
2023-09-26 22:20:17 +03:00
want = adjust_range_hwpoison ( & folio - > page , offset , nr ) ;
2023-07-13 03:18:32 +03:00
if ( want = = 0 ) {
2023-09-26 22:20:17 +03:00
folio_put ( folio ) ;
2023-07-13 03:18:32 +03:00
retval = - EIO ;
break ;
}
2022-10-18 23:01:25 +03:00
}
2007-10-16 12:26:22 +04:00
/*
2023-09-26 22:20:17 +03:00
* We have the folio , copy it to user space buffer .
2007-10-16 12:26:22 +04:00
*/
2023-09-26 22:20:17 +03:00
copied = copy_folio_to_iter ( folio , offset , want , to ) ;
folio_put ( folio ) ;
2007-10-16 12:26:22 +04:00
}
2015-04-03 18:31:35 +03:00
offset + = copied ;
retval + = copied ;
if ( copied ! = nr & & iov_iter_count ( to ) ) {
if ( ! retval )
retval = - EFAULT ;
break ;
2007-10-16 12:26:22 +04:00
}
2008-07-24 08:27:41 +04:00
index + = offset > > huge_page_shift ( h ) ;
offset & = ~ huge_page_mask ( h ) ;
2007-10-16 12:26:22 +04:00
}
2015-04-03 18:31:35 +03:00
iocb - > ki_pos = ( ( loff_t ) index < < huge_page_shift ( h ) ) + offset ;
2007-10-16 12:26:22 +04:00
return retval ;
}
2007-10-16 12:25:03 +04:00
static int hugetlbfs_write_begin ( struct file * file ,
struct address_space * mapping ,
2022-02-22 22:31:43 +03:00
loff_t pos , unsigned len ,
2007-10-16 12:25:03 +04:00
struct page * * pagep , void * * fsdata )
2005-04-17 02:20:36 +04:00
{
return - EINVAL ;
}
2007-10-16 12:25:03 +04:00
static int hugetlbfs_write_end ( struct file * file , struct address_space * mapping ,
loff_t pos , unsigned len , unsigned copied ,
struct page * page , void * fsdata )
2005-04-17 02:20:36 +04:00
{
2007-10-16 12:25:03 +04:00
BUG ( ) ;
2005-04-17 02:20:36 +04:00
return - EINVAL ;
}
2022-09-22 18:42:06 +03:00
static void hugetlb_delete_from_page_cache ( struct folio * folio )
2005-04-17 02:20:36 +04:00
{
2022-09-22 18:42:06 +03:00
folio_clear_dirty ( folio ) ;
folio_clear_uptodate ( folio ) ;
filemap_remove_folio ( folio ) ;
2005-04-17 02:20:36 +04:00
}
2022-09-15 01:18:08 +03:00
/*
* Called with i_mmap_rwsem held for inode based vma maps . This makes
* sure vma ( and vm_mm ) will not go away . We also hold the hugetlb fault
* mutex for the page in the mapping . So , we can not race with page being
* faulted into the vma .
*/
static bool hugetlb_vma_maps_page ( struct vm_area_struct * vma ,
unsigned long addr , struct page * page )
{
pte_t * ptep , pte ;
2022-12-16 18:52:29 +03:00
ptep = hugetlb_walk ( vma , addr , huge_page_size ( hstate_vma ( vma ) ) ) ;
2022-09-15 01:18:08 +03:00
if ( ! ptep )
return false ;
pte = huge_ptep_get ( ptep ) ;
if ( huge_pte_none ( pte ) | | ! pte_present ( pte ) )
return false ;
if ( pte_page ( pte ) = = page )
return true ;
return false ;
}
/*
* Can vma_offset_start / vma_offset_end overflow on 32 - bit arches ?
* No , because the interval tree returns us only those vmas
* which overlap the truncated area starting at pgoff ,
* and no vma on a 32 - bit arch can span beyond the 4 GB .
*/
static unsigned long vma_offset_start ( struct vm_area_struct * vma , pgoff_t start )
{
mm/hugetlb: let vma_offset_start() to return start
Patch series "mm/hugetlb: Make huge_pte_offset() thread-safe for pmd
unshare", v4.
Problem
=======
huge_pte_offset() is a major helper used by hugetlb code paths to walk a
hugetlb pgtable. It's used mostly everywhere since that's needed even
before taking the pgtable lock.
huge_pte_offset() is always called with mmap lock held with either read or
write. It was assumed to be safe but it's actually not. One race
condition can easily trigger by: (1) firstly trigger pmd share on a memory
range, (2) do huge_pte_offset() on the range, then at the meantime, (3)
another thread unshare the pmd range, and the pgtable page is prone to lost
if the other shared process wants to free it completely (by either munmap
or exit mm).
The recent work from Mike on vma lock can resolve most of this already.
It's achieved by forbidden pmd unsharing during the lock being taken, so no
further risk of the pgtable page being freed. It means if we can take the
vma lock around all huge_pte_offset() callers it'll be safe.
There're already a bunch of them that we did as per the latest mm-unstable,
but also quite a few others that we didn't for various reasons especially
on huge_pte_offset() usage.
One more thing to mention is that besides the vma lock, i_mmap_rwsem can
also be used to protect the pgtable page (along with its pgtable lock) from
being freed from under us. IOW, huge_pte_offset() callers need to either
hold the vma lock or i_mmap_rwsem to safely walk the pgtables.
A reproducer of such problem, based on hugetlb GUP (NOTE: since the race is
very hard to trigger, one needs to apply another kernel delay patch too,
see below):
======8<=======
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <unistd.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <linux/memfd.h>
#include <assert.h>
#include <pthread.h>
#define MSIZE (1UL << 30) /* 1GB */
#define PSIZE (2UL << 20) /* 2MB */
#define HOLD_SEC (1)
int pipefd[2];
void *buf;
void *do_map(int fd)
{
unsigned char *tmpbuf, *p;
int ret;
ret = posix_memalign((void **)&tmpbuf, MSIZE, MSIZE);
if (ret) {
perror("posix_memalign() failed");
return NULL;
}
tmpbuf = mmap(tmpbuf, MSIZE, PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_FIXED, fd, 0);
if (tmpbuf == MAP_FAILED) {
perror("mmap() failed");
return NULL;
}
printf("mmap() -> %p\n", tmpbuf);
for (p = tmpbuf; p < tmpbuf + MSIZE; p += PSIZE) {
*p = 1;
}
return tmpbuf;
}
void do_unmap(void *buf)
{
munmap(buf, MSIZE);
}
void proc2(int fd)
{
unsigned char c;
buf = do_map(fd);
if (!buf)
return;
read(pipefd[0], &c, 1);
/*
* This frees the shared pgtable page, causing use-after-free in
* proc1_thread1 when soft walking hugetlb pgtable.
*/
do_unmap(buf);
printf("Proc2 quitting\n");
}
void *proc1_thread1(void *data)
{
/*
* Trigger follow-page on 1st 2m page. Kernel hack patch needed to
* withhold this procedure for easier reproduce.
*/
madvise(buf, PSIZE, MADV_POPULATE_WRITE);
printf("Proc1-thread1 quitting\n");
return NULL;
}
void *proc1_thread2(void *data)
{
unsigned char c;
/* Wait a while until proc1_thread1() start to wait */
sleep(0.5);
/* Trigger pmd unshare */
madvise(buf, PSIZE, MADV_DONTNEED);
/* Kick off proc2 to release the pgtable */
write(pipefd[1], &c, 1);
printf("Proc1-thread2 quitting\n");
return NULL;
}
void proc1(int fd)
{
pthread_t tid1, tid2;
int ret;
buf = do_map(fd);
if (!buf)
return;
ret = pthread_create(&tid1, NULL, proc1_thread1, NULL);
assert(ret == 0);
ret = pthread_create(&tid2, NULL, proc1_thread2, NULL);
assert(ret == 0);
/* Kick the child to share the PUD entry */
pthread_join(tid1, NULL);
pthread_join(tid2, NULL);
do_unmap(buf);
}
int main(void)
{
int fd, ret;
fd = memfd_create("test-huge", MFD_HUGETLB | MFD_HUGE_2MB);
if (fd < 0) {
perror("open failed");
return -1;
}
ret = ftruncate(fd, MSIZE);
if (ret) {
perror("ftruncate() failed");
return -1;
}
ret = pipe(pipefd);
if (ret) {
perror("pipe() failed");
return -1;
}
if (fork()) {
proc1(fd);
} else {
proc2(fd);
}
close(pipefd[0]);
close(pipefd[1]);
close(fd);
return 0;
}
======8<=======
The kernel patch needed to present such a race so it'll trigger 100%:
======8<=======
: diff --git a/mm/hugetlb.c b/mm/hugetlb.c
: index 9d97c9a2a15d..f8d99dad5004 100644
: --- a/mm/hugetlb.c
: +++ b/mm/hugetlb.c
: @@ -38,6 +38,7 @@
: #include <asm/page.h>
: #include <asm/pgalloc.h>
: #include <asm/tlb.h>
: +#include <asm/delay.h>
:
: #include <linux/io.h>
: #include <linux/hugetlb.h>
: @@ -6290,6 +6291,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
: bool unshare = false;
: int absent;
: struct page *page;
: + unsigned long c = 0;
:
: /*
: * If we have a pending SIGKILL, don't keep faulting pages and
: @@ -6309,6 +6311,13 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
: */
: pte = huge_pte_offset(mm, vaddr & huge_page_mask(h),
: huge_page_size(h));
: +
: + pr_info("%s: withhold 1 sec...\n", __func__);
: + for (c = 0; c < 100; c++) {
: + udelay(10000);
: + }
: + pr_info("%s: withhold 1 sec...done\n", __func__);
: +
: if (pte)
: ptl = huge_pte_lock(h, mm, pte);
: absent = !pte || huge_pte_none(huge_ptep_get(pte));
: ======8<=======
It'll trigger use-after-free of the pgtable spinlock:
======8<=======
[ 16.959907] follow_hugetlb_page: withhold 1 sec...
[ 17.960315] follow_hugetlb_page: withhold 1 sec...done
[ 17.960550] ------------[ cut here ]------------
[ 17.960742] DEBUG_LOCKS_WARN_ON(1)
[ 17.960756] WARNING: CPU: 3 PID: 542 at kernel/locking/lockdep.c:231 __lock_acquire+0x955/0x1fa0
[ 17.961264] Modules linked in:
[ 17.961394] CPU: 3 PID: 542 Comm: hugetlb-pmd-sha Not tainted 6.1.0-rc4-peterx+ #46
[ 17.961704] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014
[ 17.962266] RIP: 0010:__lock_acquire+0x955/0x1fa0
[ 17.962516] Code: c0 0f 84 5f fe ff ff 44 8b 1d 0f 9a 29 02 45 85 db 0f 85 4f fe ff ff 48 c7 c6 75 50 83 82 48 c7 c7 1b 4b 7d 82 e8 d3 22 d8 00 <0f> 0b 31 c0 4c 8b 54 24 08 4c 8b 04 24 e9
[ 17.963494] RSP: 0018:ffffc90000e4fba8 EFLAGS: 00010096
[ 17.963704] RAX: 0000000000000016 RBX: fffffffffd3925a8 RCX: 0000000000000000
[ 17.963989] RDX: 0000000000000002 RSI: ffffffff82863ccf RDI: 00000000ffffffff
[ 17.964276] RBP: 0000000000000000 R08: 0000000000000000 R09: ffffc90000e4fa58
[ 17.964557] R10: 0000000000000003 R11: ffffffff83162688 R12: 0000000000000000
[ 17.964839] R13: 0000000000000001 R14: ffff888105eac748 R15: 0000000000000001
[ 17.965123] FS: 00007f17c0a00640(0000) GS:ffff888277cc0000(0000) knlGS:0000000000000000
[ 17.965443] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 17.965672] CR2: 00007f17c09ffef8 CR3: 000000010c87a005 CR4: 0000000000770ee0
[ 17.965956] PKRU: 55555554
[ 17.966068] Call Trace:
[ 17.966172] <TASK>
[ 17.966268] ? tick_nohz_tick_stopped+0x12/0x30
[ 17.966455] lock_acquire+0xbf/0x2b0
[ 17.966603] ? follow_hugetlb_page.cold+0x75/0x5c4
[ 17.966799] ? _printk+0x48/0x4e
[ 17.966934] _raw_spin_lock+0x2f/0x40
[ 17.967087] ? follow_hugetlb_page.cold+0x75/0x5c4
[ 17.967285] follow_hugetlb_page.cold+0x75/0x5c4
[ 17.967473] __get_user_pages+0xbb/0x620
[ 17.967635] faultin_vma_page_range+0x9a/0x100
[ 17.967817] madvise_vma_behavior+0x3c0/0xbd0
[ 17.967998] ? mas_prev+0x11/0x290
[ 17.968141] ? find_vma_prev+0x5e/0xa0
[ 17.968304] ? madvise_vma_anon_name+0x70/0x70
[ 17.968486] madvise_walk_vmas+0xa9/0x120
[ 17.968650] do_madvise.part.0+0xfa/0x270
[ 17.968813] __x64_sys_madvise+0x5a/0x70
[ 17.968974] do_syscall_64+0x37/0x90
[ 17.969123] entry_SYSCALL_64_after_hwframe+0x63/0xcd
[ 17.969329] RIP: 0033:0x7f1840f0efdb
[ 17.969477] Code: c3 66 0f 1f 44 00 00 48 8b 15 39 6e 0e 00 f7 d8 64 89 02 b8 ff ff ff ff eb bc 0f 1f 44 00 00 f3 0f 1e fa b8 1c 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 0d 68
[ 17.970205] RSP: 002b:00007f17c09ffe38 EFLAGS: 00000202 ORIG_RAX: 000000000000001c
[ 17.970504] RAX: ffffffffffffffda RBX: 00007f17c0a00640 RCX: 00007f1840f0efdb
[ 17.970786] RDX: 0000000000000017 RSI: 0000000000200000 RDI: 00007f1800000000
[ 17.971068] RBP: 00007f17c09ffe50 R08: 0000000000000000 R09: 00007ffd3954164f
[ 17.971353] R10: 00007f1840e10348 R11: 0000000000000202 R12: ffffffffffffff80
[ 17.971709] R13: 0000000000000000 R14: 00007ffd39541550 R15: 00007f17c0200000
[ 17.972083] </TASK>
[ 17.972199] irq event stamp: 2353
[ 17.972372] hardirqs last enabled at (2353): [<ffffffff8117fe4e>] __up_console_sem+0x5e/0x70
[ 17.972869] hardirqs last disabled at (2352): [<ffffffff8117fe33>] __up_console_sem+0x43/0x70
[ 17.973365] softirqs last enabled at (2330): [<ffffffff810f763d>] __irq_exit_rcu+0xed/0x160
[ 17.973857] softirqs last disabled at (2323): [<ffffffff810f763d>] __irq_exit_rcu+0xed/0x160
[ 17.974341] ---[ end trace 0000000000000000 ]---
[ 17.974614] BUG: kernel NULL pointer dereference, address: 00000000000000b8
[ 17.975012] #PF: supervisor read access in kernel mode
[ 17.975314] #PF: error_code(0x0000) - not-present page
[ 17.975615] PGD 103f7b067 P4D 103f7b067 PUD 106cd7067 PMD 0
[ 17.975943] Oops: 0000 [#1] PREEMPT SMP NOPTI
[ 17.976197] CPU: 3 PID: 542 Comm: hugetlb-pmd-sha Tainted: G W 6.1.0-rc4-peterx+ #46
[ 17.976712] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014
[ 17.977370] RIP: 0010:__lock_acquire+0x190/0x1fa0
[ 17.977655] Code: 98 00 00 00 41 89 46 24 81 e2 ff 1f 00 00 48 0f a3 15 e4 ba dd 02 0f 83 ff 05 00 00 48 8d 04 52 48 c1 e0 06 48 05 c0 d2 f4 83 <44> 0f b6 a0 b8 00 00 00 41 0f b7 46 20 6f
[ 17.979170] RSP: 0018:ffffc90000e4fba8 EFLAGS: 00010046
[ 17.979787] RAX: 0000000000000000 RBX: fffffffffd3925a8 RCX: 0000000000000000
[ 17.980838] RDX: 0000000000000002 RSI: ffffffff82863ccf RDI: 00000000ffffffff
[ 17.982048] RBP: 0000000000000000 R08: ffff888105eac720 R09: ffffc90000e4fa58
[ 17.982892] R10: ffff888105eab900 R11: ffffffff83162688 R12: 0000000000000000
[ 17.983771] R13: 0000000000000001 R14: ffff888105eac748 R15: 0000000000000001
[ 17.984815] FS: 00007f17c0a00640(0000) GS:ffff888277cc0000(0000) knlGS:0000000000000000
[ 17.985924] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 17.986265] CR2: 00000000000000b8 CR3: 000000010c87a005 CR4: 0000000000770ee0
[ 17.986674] PKRU: 55555554
[ 17.986832] Call Trace:
[ 17.987012] <TASK>
[ 17.987266] ? tick_nohz_tick_stopped+0x12/0x30
[ 17.987770] lock_acquire+0xbf/0x2b0
[ 17.988118] ? follow_hugetlb_page.cold+0x75/0x5c4
[ 17.988575] ? _printk+0x48/0x4e
[ 17.988889] _raw_spin_lock+0x2f/0x40
[ 17.989243] ? follow_hugetlb_page.cold+0x75/0x5c4
[ 17.989687] follow_hugetlb_page.cold+0x75/0x5c4
[ 17.990119] __get_user_pages+0xbb/0x620
[ 17.990500] faultin_vma_page_range+0x9a/0x100
[ 17.990928] madvise_vma_behavior+0x3c0/0xbd0
[ 17.991354] ? mas_prev+0x11/0x290
[ 17.991678] ? find_vma_prev+0x5e/0xa0
[ 17.992024] ? madvise_vma_anon_name+0x70/0x70
[ 17.992421] madvise_walk_vmas+0xa9/0x120
[ 17.992793] do_madvise.part.0+0xfa/0x270
[ 17.993166] __x64_sys_madvise+0x5a/0x70
[ 17.993539] do_syscall_64+0x37/0x90
[ 17.993879] entry_SYSCALL_64_after_hwframe+0x63/0xcd
======8<=======
Resolution
==========
This patchset protects all the huge_pte_offset() callers to also take the
vma lock properly.
Patch Layout
============
Patch 1-2: cleanup, or dependency of the follow up patches
Patch 3: before fixing, document huge_pte_offset() on lock required
Patch 4-8: each patch resolves one possible race condition
Patch 9: introduce hugetlb_walk() to replace huge_pte_offset()
Tests
=====
The series is verified with the above reproducer so the race cannot
trigger anymore. It also passes all hugetlb kselftests.
This patch (of 9):
Even though vma_offset_start() is named like that, it's not returning "the
start address of the range" but rather the offset we should use to offset
the vma->vm_start address.
Make it return the real value of the start vaddr, and it also helps for
all the callers because whenever the retval is used, it'll be ultimately
added into the vma->vm_start anyway, so it's better.
Link: https://lkml.kernel.org/r/20221216155100.2043537-1-peterx@redhat.com
Link: https://lkml.kernel.org/r/20221216155100.2043537-2-peterx@redhat.com
Signed-off-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: John Hubbard <jhubbard@nvidia.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: James Houghton <jthoughton@google.com>
Cc: Jann Horn <jannh@google.com>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Rik van Riel <riel@surriel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2022-12-16 18:50:52 +03:00
unsigned long offset = 0 ;
2022-09-15 01:18:08 +03:00
if ( vma - > vm_pgoff < start )
mm/hugetlb: let vma_offset_start() to return start
Patch series "mm/hugetlb: Make huge_pte_offset() thread-safe for pmd
unshare", v4.
Problem
=======
huge_pte_offset() is a major helper used by hugetlb code paths to walk a
hugetlb pgtable. It's used mostly everywhere since that's needed even
before taking the pgtable lock.
huge_pte_offset() is always called with mmap lock held with either read or
write. It was assumed to be safe but it's actually not. One race
condition can easily trigger by: (1) firstly trigger pmd share on a memory
range, (2) do huge_pte_offset() on the range, then at the meantime, (3)
another thread unshare the pmd range, and the pgtable page is prone to lost
if the other shared process wants to free it completely (by either munmap
or exit mm).
The recent work from Mike on vma lock can resolve most of this already.
It's achieved by forbidden pmd unsharing during the lock being taken, so no
further risk of the pgtable page being freed. It means if we can take the
vma lock around all huge_pte_offset() callers it'll be safe.
There're already a bunch of them that we did as per the latest mm-unstable,
but also quite a few others that we didn't for various reasons especially
on huge_pte_offset() usage.
One more thing to mention is that besides the vma lock, i_mmap_rwsem can
also be used to protect the pgtable page (along with its pgtable lock) from
being freed from under us. IOW, huge_pte_offset() callers need to either
hold the vma lock or i_mmap_rwsem to safely walk the pgtables.
A reproducer of such problem, based on hugetlb GUP (NOTE: since the race is
very hard to trigger, one needs to apply another kernel delay patch too,
see below):
======8<=======
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <unistd.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <linux/memfd.h>
#include <assert.h>
#include <pthread.h>
#define MSIZE (1UL << 30) /* 1GB */
#define PSIZE (2UL << 20) /* 2MB */
#define HOLD_SEC (1)
int pipefd[2];
void *buf;
void *do_map(int fd)
{
unsigned char *tmpbuf, *p;
int ret;
ret = posix_memalign((void **)&tmpbuf, MSIZE, MSIZE);
if (ret) {
perror("posix_memalign() failed");
return NULL;
}
tmpbuf = mmap(tmpbuf, MSIZE, PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_FIXED, fd, 0);
if (tmpbuf == MAP_FAILED) {
perror("mmap() failed");
return NULL;
}
printf("mmap() -> %p\n", tmpbuf);
for (p = tmpbuf; p < tmpbuf + MSIZE; p += PSIZE) {
*p = 1;
}
return tmpbuf;
}
void do_unmap(void *buf)
{
munmap(buf, MSIZE);
}
void proc2(int fd)
{
unsigned char c;
buf = do_map(fd);
if (!buf)
return;
read(pipefd[0], &c, 1);
/*
* This frees the shared pgtable page, causing use-after-free in
* proc1_thread1 when soft walking hugetlb pgtable.
*/
do_unmap(buf);
printf("Proc2 quitting\n");
}
void *proc1_thread1(void *data)
{
/*
* Trigger follow-page on 1st 2m page. Kernel hack patch needed to
* withhold this procedure for easier reproduce.
*/
madvise(buf, PSIZE, MADV_POPULATE_WRITE);
printf("Proc1-thread1 quitting\n");
return NULL;
}
void *proc1_thread2(void *data)
{
unsigned char c;
/* Wait a while until proc1_thread1() start to wait */
sleep(0.5);
/* Trigger pmd unshare */
madvise(buf, PSIZE, MADV_DONTNEED);
/* Kick off proc2 to release the pgtable */
write(pipefd[1], &c, 1);
printf("Proc1-thread2 quitting\n");
return NULL;
}
void proc1(int fd)
{
pthread_t tid1, tid2;
int ret;
buf = do_map(fd);
if (!buf)
return;
ret = pthread_create(&tid1, NULL, proc1_thread1, NULL);
assert(ret == 0);
ret = pthread_create(&tid2, NULL, proc1_thread2, NULL);
assert(ret == 0);
/* Kick the child to share the PUD entry */
pthread_join(tid1, NULL);
pthread_join(tid2, NULL);
do_unmap(buf);
}
int main(void)
{
int fd, ret;
fd = memfd_create("test-huge", MFD_HUGETLB | MFD_HUGE_2MB);
if (fd < 0) {
perror("open failed");
return -1;
}
ret = ftruncate(fd, MSIZE);
if (ret) {
perror("ftruncate() failed");
return -1;
}
ret = pipe(pipefd);
if (ret) {
perror("pipe() failed");
return -1;
}
if (fork()) {
proc1(fd);
} else {
proc2(fd);
}
close(pipefd[0]);
close(pipefd[1]);
close(fd);
return 0;
}
======8<=======
The kernel patch needed to present such a race so it'll trigger 100%:
======8<=======
: diff --git a/mm/hugetlb.c b/mm/hugetlb.c
: index 9d97c9a2a15d..f8d99dad5004 100644
: --- a/mm/hugetlb.c
: +++ b/mm/hugetlb.c
: @@ -38,6 +38,7 @@
: #include <asm/page.h>
: #include <asm/pgalloc.h>
: #include <asm/tlb.h>
: +#include <asm/delay.h>
:
: #include <linux/io.h>
: #include <linux/hugetlb.h>
: @@ -6290,6 +6291,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
: bool unshare = false;
: int absent;
: struct page *page;
: + unsigned long c = 0;
:
: /*
: * If we have a pending SIGKILL, don't keep faulting pages and
: @@ -6309,6 +6311,13 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
: */
: pte = huge_pte_offset(mm, vaddr & huge_page_mask(h),
: huge_page_size(h));
: +
: + pr_info("%s: withhold 1 sec...\n", __func__);
: + for (c = 0; c < 100; c++) {
: + udelay(10000);
: + }
: + pr_info("%s: withhold 1 sec...done\n", __func__);
: +
: if (pte)
: ptl = huge_pte_lock(h, mm, pte);
: absent = !pte || huge_pte_none(huge_ptep_get(pte));
: ======8<=======
It'll trigger use-after-free of the pgtable spinlock:
======8<=======
[ 16.959907] follow_hugetlb_page: withhold 1 sec...
[ 17.960315] follow_hugetlb_page: withhold 1 sec...done
[ 17.960550] ------------[ cut here ]------------
[ 17.960742] DEBUG_LOCKS_WARN_ON(1)
[ 17.960756] WARNING: CPU: 3 PID: 542 at kernel/locking/lockdep.c:231 __lock_acquire+0x955/0x1fa0
[ 17.961264] Modules linked in:
[ 17.961394] CPU: 3 PID: 542 Comm: hugetlb-pmd-sha Not tainted 6.1.0-rc4-peterx+ #46
[ 17.961704] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014
[ 17.962266] RIP: 0010:__lock_acquire+0x955/0x1fa0
[ 17.962516] Code: c0 0f 84 5f fe ff ff 44 8b 1d 0f 9a 29 02 45 85 db 0f 85 4f fe ff ff 48 c7 c6 75 50 83 82 48 c7 c7 1b 4b 7d 82 e8 d3 22 d8 00 <0f> 0b 31 c0 4c 8b 54 24 08 4c 8b 04 24 e9
[ 17.963494] RSP: 0018:ffffc90000e4fba8 EFLAGS: 00010096
[ 17.963704] RAX: 0000000000000016 RBX: fffffffffd3925a8 RCX: 0000000000000000
[ 17.963989] RDX: 0000000000000002 RSI: ffffffff82863ccf RDI: 00000000ffffffff
[ 17.964276] RBP: 0000000000000000 R08: 0000000000000000 R09: ffffc90000e4fa58
[ 17.964557] R10: 0000000000000003 R11: ffffffff83162688 R12: 0000000000000000
[ 17.964839] R13: 0000000000000001 R14: ffff888105eac748 R15: 0000000000000001
[ 17.965123] FS: 00007f17c0a00640(0000) GS:ffff888277cc0000(0000) knlGS:0000000000000000
[ 17.965443] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 17.965672] CR2: 00007f17c09ffef8 CR3: 000000010c87a005 CR4: 0000000000770ee0
[ 17.965956] PKRU: 55555554
[ 17.966068] Call Trace:
[ 17.966172] <TASK>
[ 17.966268] ? tick_nohz_tick_stopped+0x12/0x30
[ 17.966455] lock_acquire+0xbf/0x2b0
[ 17.966603] ? follow_hugetlb_page.cold+0x75/0x5c4
[ 17.966799] ? _printk+0x48/0x4e
[ 17.966934] _raw_spin_lock+0x2f/0x40
[ 17.967087] ? follow_hugetlb_page.cold+0x75/0x5c4
[ 17.967285] follow_hugetlb_page.cold+0x75/0x5c4
[ 17.967473] __get_user_pages+0xbb/0x620
[ 17.967635] faultin_vma_page_range+0x9a/0x100
[ 17.967817] madvise_vma_behavior+0x3c0/0xbd0
[ 17.967998] ? mas_prev+0x11/0x290
[ 17.968141] ? find_vma_prev+0x5e/0xa0
[ 17.968304] ? madvise_vma_anon_name+0x70/0x70
[ 17.968486] madvise_walk_vmas+0xa9/0x120
[ 17.968650] do_madvise.part.0+0xfa/0x270
[ 17.968813] __x64_sys_madvise+0x5a/0x70
[ 17.968974] do_syscall_64+0x37/0x90
[ 17.969123] entry_SYSCALL_64_after_hwframe+0x63/0xcd
[ 17.969329] RIP: 0033:0x7f1840f0efdb
[ 17.969477] Code: c3 66 0f 1f 44 00 00 48 8b 15 39 6e 0e 00 f7 d8 64 89 02 b8 ff ff ff ff eb bc 0f 1f 44 00 00 f3 0f 1e fa b8 1c 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 0d 68
[ 17.970205] RSP: 002b:00007f17c09ffe38 EFLAGS: 00000202 ORIG_RAX: 000000000000001c
[ 17.970504] RAX: ffffffffffffffda RBX: 00007f17c0a00640 RCX: 00007f1840f0efdb
[ 17.970786] RDX: 0000000000000017 RSI: 0000000000200000 RDI: 00007f1800000000
[ 17.971068] RBP: 00007f17c09ffe50 R08: 0000000000000000 R09: 00007ffd3954164f
[ 17.971353] R10: 00007f1840e10348 R11: 0000000000000202 R12: ffffffffffffff80
[ 17.971709] R13: 0000000000000000 R14: 00007ffd39541550 R15: 00007f17c0200000
[ 17.972083] </TASK>
[ 17.972199] irq event stamp: 2353
[ 17.972372] hardirqs last enabled at (2353): [<ffffffff8117fe4e>] __up_console_sem+0x5e/0x70
[ 17.972869] hardirqs last disabled at (2352): [<ffffffff8117fe33>] __up_console_sem+0x43/0x70
[ 17.973365] softirqs last enabled at (2330): [<ffffffff810f763d>] __irq_exit_rcu+0xed/0x160
[ 17.973857] softirqs last disabled at (2323): [<ffffffff810f763d>] __irq_exit_rcu+0xed/0x160
[ 17.974341] ---[ end trace 0000000000000000 ]---
[ 17.974614] BUG: kernel NULL pointer dereference, address: 00000000000000b8
[ 17.975012] #PF: supervisor read access in kernel mode
[ 17.975314] #PF: error_code(0x0000) - not-present page
[ 17.975615] PGD 103f7b067 P4D 103f7b067 PUD 106cd7067 PMD 0
[ 17.975943] Oops: 0000 [#1] PREEMPT SMP NOPTI
[ 17.976197] CPU: 3 PID: 542 Comm: hugetlb-pmd-sha Tainted: G W 6.1.0-rc4-peterx+ #46
[ 17.976712] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014
[ 17.977370] RIP: 0010:__lock_acquire+0x190/0x1fa0
[ 17.977655] Code: 98 00 00 00 41 89 46 24 81 e2 ff 1f 00 00 48 0f a3 15 e4 ba dd 02 0f 83 ff 05 00 00 48 8d 04 52 48 c1 e0 06 48 05 c0 d2 f4 83 <44> 0f b6 a0 b8 00 00 00 41 0f b7 46 20 6f
[ 17.979170] RSP: 0018:ffffc90000e4fba8 EFLAGS: 00010046
[ 17.979787] RAX: 0000000000000000 RBX: fffffffffd3925a8 RCX: 0000000000000000
[ 17.980838] RDX: 0000000000000002 RSI: ffffffff82863ccf RDI: 00000000ffffffff
[ 17.982048] RBP: 0000000000000000 R08: ffff888105eac720 R09: ffffc90000e4fa58
[ 17.982892] R10: ffff888105eab900 R11: ffffffff83162688 R12: 0000000000000000
[ 17.983771] R13: 0000000000000001 R14: ffff888105eac748 R15: 0000000000000001
[ 17.984815] FS: 00007f17c0a00640(0000) GS:ffff888277cc0000(0000) knlGS:0000000000000000
[ 17.985924] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 17.986265] CR2: 00000000000000b8 CR3: 000000010c87a005 CR4: 0000000000770ee0
[ 17.986674] PKRU: 55555554
[ 17.986832] Call Trace:
[ 17.987012] <TASK>
[ 17.987266] ? tick_nohz_tick_stopped+0x12/0x30
[ 17.987770] lock_acquire+0xbf/0x2b0
[ 17.988118] ? follow_hugetlb_page.cold+0x75/0x5c4
[ 17.988575] ? _printk+0x48/0x4e
[ 17.988889] _raw_spin_lock+0x2f/0x40
[ 17.989243] ? follow_hugetlb_page.cold+0x75/0x5c4
[ 17.989687] follow_hugetlb_page.cold+0x75/0x5c4
[ 17.990119] __get_user_pages+0xbb/0x620
[ 17.990500] faultin_vma_page_range+0x9a/0x100
[ 17.990928] madvise_vma_behavior+0x3c0/0xbd0
[ 17.991354] ? mas_prev+0x11/0x290
[ 17.991678] ? find_vma_prev+0x5e/0xa0
[ 17.992024] ? madvise_vma_anon_name+0x70/0x70
[ 17.992421] madvise_walk_vmas+0xa9/0x120
[ 17.992793] do_madvise.part.0+0xfa/0x270
[ 17.993166] __x64_sys_madvise+0x5a/0x70
[ 17.993539] do_syscall_64+0x37/0x90
[ 17.993879] entry_SYSCALL_64_after_hwframe+0x63/0xcd
======8<=======
Resolution
==========
This patchset protects all the huge_pte_offset() callers to also take the
vma lock properly.
Patch Layout
============
Patch 1-2: cleanup, or dependency of the follow up patches
Patch 3: before fixing, document huge_pte_offset() on lock required
Patch 4-8: each patch resolves one possible race condition
Patch 9: introduce hugetlb_walk() to replace huge_pte_offset()
Tests
=====
The series is verified with the above reproducer so the race cannot
trigger anymore. It also passes all hugetlb kselftests.
This patch (of 9):
Even though vma_offset_start() is named like that, it's not returning "the
start address of the range" but rather the offset we should use to offset
the vma->vm_start address.
Make it return the real value of the start vaddr, and it also helps for
all the callers because whenever the retval is used, it'll be ultimately
added into the vma->vm_start anyway, so it's better.
Link: https://lkml.kernel.org/r/20221216155100.2043537-1-peterx@redhat.com
Link: https://lkml.kernel.org/r/20221216155100.2043537-2-peterx@redhat.com
Signed-off-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: John Hubbard <jhubbard@nvidia.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: James Houghton <jthoughton@google.com>
Cc: Jann Horn <jannh@google.com>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Rik van Riel <riel@surriel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2022-12-16 18:50:52 +03:00
offset = ( start - vma - > vm_pgoff ) < < PAGE_SHIFT ;
return vma - > vm_start + offset ;
2022-09-15 01:18:08 +03:00
}
static unsigned long vma_offset_end ( struct vm_area_struct * vma , pgoff_t end )
{
unsigned long t_end ;
if ( ! end )
return vma - > vm_end ;
t_end = ( ( end - vma - > vm_pgoff ) < < PAGE_SHIFT ) + vma - > vm_start ;
if ( t_end > vma - > vm_end )
t_end = vma - > vm_end ;
return t_end ;
}
/*
* Called with hugetlb fault mutex held . Therefore , no more mappings to
* this folio can be created while executing the routine .
*/
static void hugetlb_unmap_file_folio ( struct hstate * h ,
struct address_space * mapping ,
struct folio * folio , pgoff_t index )
{
struct rb_root_cached * root = & mapping - > i_mmap ;
2022-09-15 01:18:09 +03:00
struct hugetlb_vma_lock * vma_lock ;
2022-09-15 01:18:08 +03:00
struct page * page = & folio - > page ;
struct vm_area_struct * vma ;
unsigned long v_start ;
unsigned long v_end ;
pgoff_t start , end ;
start = index * pages_per_huge_page ( h ) ;
end = ( index + 1 ) * pages_per_huge_page ( h ) ;
i_mmap_lock_write ( mapping ) ;
2022-09-15 01:18:09 +03:00
retry :
vma_lock = NULL ;
2022-09-15 01:18:08 +03:00
vma_interval_tree_foreach ( vma , root , start , end - 1 ) {
v_start = vma_offset_start ( vma , start ) ;
v_end = vma_offset_end ( vma , end ) ;
mm/hugetlb: let vma_offset_start() to return start
Patch series "mm/hugetlb: Make huge_pte_offset() thread-safe for pmd
unshare", v4.
Problem
=======
huge_pte_offset() is a major helper used by hugetlb code paths to walk a
hugetlb pgtable. It's used mostly everywhere since that's needed even
before taking the pgtable lock.
huge_pte_offset() is always called with mmap lock held with either read or
write. It was assumed to be safe but it's actually not. One race
condition can easily trigger by: (1) firstly trigger pmd share on a memory
range, (2) do huge_pte_offset() on the range, then at the meantime, (3)
another thread unshare the pmd range, and the pgtable page is prone to lost
if the other shared process wants to free it completely (by either munmap
or exit mm).
The recent work from Mike on vma lock can resolve most of this already.
It's achieved by forbidden pmd unsharing during the lock being taken, so no
further risk of the pgtable page being freed. It means if we can take the
vma lock around all huge_pte_offset() callers it'll be safe.
There're already a bunch of them that we did as per the latest mm-unstable,
but also quite a few others that we didn't for various reasons especially
on huge_pte_offset() usage.
One more thing to mention is that besides the vma lock, i_mmap_rwsem can
also be used to protect the pgtable page (along with its pgtable lock) from
being freed from under us. IOW, huge_pte_offset() callers need to either
hold the vma lock or i_mmap_rwsem to safely walk the pgtables.
A reproducer of such problem, based on hugetlb GUP (NOTE: since the race is
very hard to trigger, one needs to apply another kernel delay patch too,
see below):
======8<=======
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <unistd.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <linux/memfd.h>
#include <assert.h>
#include <pthread.h>
#define MSIZE (1UL << 30) /* 1GB */
#define PSIZE (2UL << 20) /* 2MB */
#define HOLD_SEC (1)
int pipefd[2];
void *buf;
void *do_map(int fd)
{
unsigned char *tmpbuf, *p;
int ret;
ret = posix_memalign((void **)&tmpbuf, MSIZE, MSIZE);
if (ret) {
perror("posix_memalign() failed");
return NULL;
}
tmpbuf = mmap(tmpbuf, MSIZE, PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_FIXED, fd, 0);
if (tmpbuf == MAP_FAILED) {
perror("mmap() failed");
return NULL;
}
printf("mmap() -> %p\n", tmpbuf);
for (p = tmpbuf; p < tmpbuf + MSIZE; p += PSIZE) {
*p = 1;
}
return tmpbuf;
}
void do_unmap(void *buf)
{
munmap(buf, MSIZE);
}
void proc2(int fd)
{
unsigned char c;
buf = do_map(fd);
if (!buf)
return;
read(pipefd[0], &c, 1);
/*
* This frees the shared pgtable page, causing use-after-free in
* proc1_thread1 when soft walking hugetlb pgtable.
*/
do_unmap(buf);
printf("Proc2 quitting\n");
}
void *proc1_thread1(void *data)
{
/*
* Trigger follow-page on 1st 2m page. Kernel hack patch needed to
* withhold this procedure for easier reproduce.
*/
madvise(buf, PSIZE, MADV_POPULATE_WRITE);
printf("Proc1-thread1 quitting\n");
return NULL;
}
void *proc1_thread2(void *data)
{
unsigned char c;
/* Wait a while until proc1_thread1() start to wait */
sleep(0.5);
/* Trigger pmd unshare */
madvise(buf, PSIZE, MADV_DONTNEED);
/* Kick off proc2 to release the pgtable */
write(pipefd[1], &c, 1);
printf("Proc1-thread2 quitting\n");
return NULL;
}
void proc1(int fd)
{
pthread_t tid1, tid2;
int ret;
buf = do_map(fd);
if (!buf)
return;
ret = pthread_create(&tid1, NULL, proc1_thread1, NULL);
assert(ret == 0);
ret = pthread_create(&tid2, NULL, proc1_thread2, NULL);
assert(ret == 0);
/* Kick the child to share the PUD entry */
pthread_join(tid1, NULL);
pthread_join(tid2, NULL);
do_unmap(buf);
}
int main(void)
{
int fd, ret;
fd = memfd_create("test-huge", MFD_HUGETLB | MFD_HUGE_2MB);
if (fd < 0) {
perror("open failed");
return -1;
}
ret = ftruncate(fd, MSIZE);
if (ret) {
perror("ftruncate() failed");
return -1;
}
ret = pipe(pipefd);
if (ret) {
perror("pipe() failed");
return -1;
}
if (fork()) {
proc1(fd);
} else {
proc2(fd);
}
close(pipefd[0]);
close(pipefd[1]);
close(fd);
return 0;
}
======8<=======
The kernel patch needed to present such a race so it'll trigger 100%:
======8<=======
: diff --git a/mm/hugetlb.c b/mm/hugetlb.c
: index 9d97c9a2a15d..f8d99dad5004 100644
: --- a/mm/hugetlb.c
: +++ b/mm/hugetlb.c
: @@ -38,6 +38,7 @@
: #include <asm/page.h>
: #include <asm/pgalloc.h>
: #include <asm/tlb.h>
: +#include <asm/delay.h>
:
: #include <linux/io.h>
: #include <linux/hugetlb.h>
: @@ -6290,6 +6291,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
: bool unshare = false;
: int absent;
: struct page *page;
: + unsigned long c = 0;
:
: /*
: * If we have a pending SIGKILL, don't keep faulting pages and
: @@ -6309,6 +6311,13 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
: */
: pte = huge_pte_offset(mm, vaddr & huge_page_mask(h),
: huge_page_size(h));
: +
: + pr_info("%s: withhold 1 sec...\n", __func__);
: + for (c = 0; c < 100; c++) {
: + udelay(10000);
: + }
: + pr_info("%s: withhold 1 sec...done\n", __func__);
: +
: if (pte)
: ptl = huge_pte_lock(h, mm, pte);
: absent = !pte || huge_pte_none(huge_ptep_get(pte));
: ======8<=======
It'll trigger use-after-free of the pgtable spinlock:
======8<=======
[ 16.959907] follow_hugetlb_page: withhold 1 sec...
[ 17.960315] follow_hugetlb_page: withhold 1 sec...done
[ 17.960550] ------------[ cut here ]------------
[ 17.960742] DEBUG_LOCKS_WARN_ON(1)
[ 17.960756] WARNING: CPU: 3 PID: 542 at kernel/locking/lockdep.c:231 __lock_acquire+0x955/0x1fa0
[ 17.961264] Modules linked in:
[ 17.961394] CPU: 3 PID: 542 Comm: hugetlb-pmd-sha Not tainted 6.1.0-rc4-peterx+ #46
[ 17.961704] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014
[ 17.962266] RIP: 0010:__lock_acquire+0x955/0x1fa0
[ 17.962516] Code: c0 0f 84 5f fe ff ff 44 8b 1d 0f 9a 29 02 45 85 db 0f 85 4f fe ff ff 48 c7 c6 75 50 83 82 48 c7 c7 1b 4b 7d 82 e8 d3 22 d8 00 <0f> 0b 31 c0 4c 8b 54 24 08 4c 8b 04 24 e9
[ 17.963494] RSP: 0018:ffffc90000e4fba8 EFLAGS: 00010096
[ 17.963704] RAX: 0000000000000016 RBX: fffffffffd3925a8 RCX: 0000000000000000
[ 17.963989] RDX: 0000000000000002 RSI: ffffffff82863ccf RDI: 00000000ffffffff
[ 17.964276] RBP: 0000000000000000 R08: 0000000000000000 R09: ffffc90000e4fa58
[ 17.964557] R10: 0000000000000003 R11: ffffffff83162688 R12: 0000000000000000
[ 17.964839] R13: 0000000000000001 R14: ffff888105eac748 R15: 0000000000000001
[ 17.965123] FS: 00007f17c0a00640(0000) GS:ffff888277cc0000(0000) knlGS:0000000000000000
[ 17.965443] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 17.965672] CR2: 00007f17c09ffef8 CR3: 000000010c87a005 CR4: 0000000000770ee0
[ 17.965956] PKRU: 55555554
[ 17.966068] Call Trace:
[ 17.966172] <TASK>
[ 17.966268] ? tick_nohz_tick_stopped+0x12/0x30
[ 17.966455] lock_acquire+0xbf/0x2b0
[ 17.966603] ? follow_hugetlb_page.cold+0x75/0x5c4
[ 17.966799] ? _printk+0x48/0x4e
[ 17.966934] _raw_spin_lock+0x2f/0x40
[ 17.967087] ? follow_hugetlb_page.cold+0x75/0x5c4
[ 17.967285] follow_hugetlb_page.cold+0x75/0x5c4
[ 17.967473] __get_user_pages+0xbb/0x620
[ 17.967635] faultin_vma_page_range+0x9a/0x100
[ 17.967817] madvise_vma_behavior+0x3c0/0xbd0
[ 17.967998] ? mas_prev+0x11/0x290
[ 17.968141] ? find_vma_prev+0x5e/0xa0
[ 17.968304] ? madvise_vma_anon_name+0x70/0x70
[ 17.968486] madvise_walk_vmas+0xa9/0x120
[ 17.968650] do_madvise.part.0+0xfa/0x270
[ 17.968813] __x64_sys_madvise+0x5a/0x70
[ 17.968974] do_syscall_64+0x37/0x90
[ 17.969123] entry_SYSCALL_64_after_hwframe+0x63/0xcd
[ 17.969329] RIP: 0033:0x7f1840f0efdb
[ 17.969477] Code: c3 66 0f 1f 44 00 00 48 8b 15 39 6e 0e 00 f7 d8 64 89 02 b8 ff ff ff ff eb bc 0f 1f 44 00 00 f3 0f 1e fa b8 1c 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 0d 68
[ 17.970205] RSP: 002b:00007f17c09ffe38 EFLAGS: 00000202 ORIG_RAX: 000000000000001c
[ 17.970504] RAX: ffffffffffffffda RBX: 00007f17c0a00640 RCX: 00007f1840f0efdb
[ 17.970786] RDX: 0000000000000017 RSI: 0000000000200000 RDI: 00007f1800000000
[ 17.971068] RBP: 00007f17c09ffe50 R08: 0000000000000000 R09: 00007ffd3954164f
[ 17.971353] R10: 00007f1840e10348 R11: 0000000000000202 R12: ffffffffffffff80
[ 17.971709] R13: 0000000000000000 R14: 00007ffd39541550 R15: 00007f17c0200000
[ 17.972083] </TASK>
[ 17.972199] irq event stamp: 2353
[ 17.972372] hardirqs last enabled at (2353): [<ffffffff8117fe4e>] __up_console_sem+0x5e/0x70
[ 17.972869] hardirqs last disabled at (2352): [<ffffffff8117fe33>] __up_console_sem+0x43/0x70
[ 17.973365] softirqs last enabled at (2330): [<ffffffff810f763d>] __irq_exit_rcu+0xed/0x160
[ 17.973857] softirqs last disabled at (2323): [<ffffffff810f763d>] __irq_exit_rcu+0xed/0x160
[ 17.974341] ---[ end trace 0000000000000000 ]---
[ 17.974614] BUG: kernel NULL pointer dereference, address: 00000000000000b8
[ 17.975012] #PF: supervisor read access in kernel mode
[ 17.975314] #PF: error_code(0x0000) - not-present page
[ 17.975615] PGD 103f7b067 P4D 103f7b067 PUD 106cd7067 PMD 0
[ 17.975943] Oops: 0000 [#1] PREEMPT SMP NOPTI
[ 17.976197] CPU: 3 PID: 542 Comm: hugetlb-pmd-sha Tainted: G W 6.1.0-rc4-peterx+ #46
[ 17.976712] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014
[ 17.977370] RIP: 0010:__lock_acquire+0x190/0x1fa0
[ 17.977655] Code: 98 00 00 00 41 89 46 24 81 e2 ff 1f 00 00 48 0f a3 15 e4 ba dd 02 0f 83 ff 05 00 00 48 8d 04 52 48 c1 e0 06 48 05 c0 d2 f4 83 <44> 0f b6 a0 b8 00 00 00 41 0f b7 46 20 6f
[ 17.979170] RSP: 0018:ffffc90000e4fba8 EFLAGS: 00010046
[ 17.979787] RAX: 0000000000000000 RBX: fffffffffd3925a8 RCX: 0000000000000000
[ 17.980838] RDX: 0000000000000002 RSI: ffffffff82863ccf RDI: 00000000ffffffff
[ 17.982048] RBP: 0000000000000000 R08: ffff888105eac720 R09: ffffc90000e4fa58
[ 17.982892] R10: ffff888105eab900 R11: ffffffff83162688 R12: 0000000000000000
[ 17.983771] R13: 0000000000000001 R14: ffff888105eac748 R15: 0000000000000001
[ 17.984815] FS: 00007f17c0a00640(0000) GS:ffff888277cc0000(0000) knlGS:0000000000000000
[ 17.985924] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 17.986265] CR2: 00000000000000b8 CR3: 000000010c87a005 CR4: 0000000000770ee0
[ 17.986674] PKRU: 55555554
[ 17.986832] Call Trace:
[ 17.987012] <TASK>
[ 17.987266] ? tick_nohz_tick_stopped+0x12/0x30
[ 17.987770] lock_acquire+0xbf/0x2b0
[ 17.988118] ? follow_hugetlb_page.cold+0x75/0x5c4
[ 17.988575] ? _printk+0x48/0x4e
[ 17.988889] _raw_spin_lock+0x2f/0x40
[ 17.989243] ? follow_hugetlb_page.cold+0x75/0x5c4
[ 17.989687] follow_hugetlb_page.cold+0x75/0x5c4
[ 17.990119] __get_user_pages+0xbb/0x620
[ 17.990500] faultin_vma_page_range+0x9a/0x100
[ 17.990928] madvise_vma_behavior+0x3c0/0xbd0
[ 17.991354] ? mas_prev+0x11/0x290
[ 17.991678] ? find_vma_prev+0x5e/0xa0
[ 17.992024] ? madvise_vma_anon_name+0x70/0x70
[ 17.992421] madvise_walk_vmas+0xa9/0x120
[ 17.992793] do_madvise.part.0+0xfa/0x270
[ 17.993166] __x64_sys_madvise+0x5a/0x70
[ 17.993539] do_syscall_64+0x37/0x90
[ 17.993879] entry_SYSCALL_64_after_hwframe+0x63/0xcd
======8<=======
Resolution
==========
This patchset protects all the huge_pte_offset() callers to also take the
vma lock properly.
Patch Layout
============
Patch 1-2: cleanup, or dependency of the follow up patches
Patch 3: before fixing, document huge_pte_offset() on lock required
Patch 4-8: each patch resolves one possible race condition
Patch 9: introduce hugetlb_walk() to replace huge_pte_offset()
Tests
=====
The series is verified with the above reproducer so the race cannot
trigger anymore. It also passes all hugetlb kselftests.
This patch (of 9):
Even though vma_offset_start() is named like that, it's not returning "the
start address of the range" but rather the offset we should use to offset
the vma->vm_start address.
Make it return the real value of the start vaddr, and it also helps for
all the callers because whenever the retval is used, it'll be ultimately
added into the vma->vm_start anyway, so it's better.
Link: https://lkml.kernel.org/r/20221216155100.2043537-1-peterx@redhat.com
Link: https://lkml.kernel.org/r/20221216155100.2043537-2-peterx@redhat.com
Signed-off-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: John Hubbard <jhubbard@nvidia.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: James Houghton <jthoughton@google.com>
Cc: Jann Horn <jannh@google.com>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Rik van Riel <riel@surriel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2022-12-16 18:50:52 +03:00
if ( ! hugetlb_vma_maps_page ( vma , v_start , page ) )
2022-09-15 01:18:08 +03:00
continue ;
2022-09-15 01:18:09 +03:00
if ( ! hugetlb_vma_trylock_write ( vma ) ) {
vma_lock = vma - > vm_private_data ;
/*
* If we can not get vma lock , we need to drop
* immap_sema and take locks in order . First ,
* take a ref on the vma_lock structure so that
* we can be guaranteed it will not go away when
* dropping immap_sema .
*/
kref_get ( & vma_lock - > refs ) ;
break ;
}
mm/hugetlb: let vma_offset_start() to return start
Patch series "mm/hugetlb: Make huge_pte_offset() thread-safe for pmd
unshare", v4.
Problem
=======
huge_pte_offset() is a major helper used by hugetlb code paths to walk a
hugetlb pgtable. It's used mostly everywhere since that's needed even
before taking the pgtable lock.
huge_pte_offset() is always called with mmap lock held with either read or
write. It was assumed to be safe but it's actually not. One race
condition can easily trigger by: (1) firstly trigger pmd share on a memory
range, (2) do huge_pte_offset() on the range, then at the meantime, (3)
another thread unshare the pmd range, and the pgtable page is prone to lost
if the other shared process wants to free it completely (by either munmap
or exit mm).
The recent work from Mike on vma lock can resolve most of this already.
It's achieved by forbidden pmd unsharing during the lock being taken, so no
further risk of the pgtable page being freed. It means if we can take the
vma lock around all huge_pte_offset() callers it'll be safe.
There're already a bunch of them that we did as per the latest mm-unstable,
but also quite a few others that we didn't for various reasons especially
on huge_pte_offset() usage.
One more thing to mention is that besides the vma lock, i_mmap_rwsem can
also be used to protect the pgtable page (along with its pgtable lock) from
being freed from under us. IOW, huge_pte_offset() callers need to either
hold the vma lock or i_mmap_rwsem to safely walk the pgtables.
A reproducer of such problem, based on hugetlb GUP (NOTE: since the race is
very hard to trigger, one needs to apply another kernel delay patch too,
see below):
======8<=======
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <unistd.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <linux/memfd.h>
#include <assert.h>
#include <pthread.h>
#define MSIZE (1UL << 30) /* 1GB */
#define PSIZE (2UL << 20) /* 2MB */
#define HOLD_SEC (1)
int pipefd[2];
void *buf;
void *do_map(int fd)
{
unsigned char *tmpbuf, *p;
int ret;
ret = posix_memalign((void **)&tmpbuf, MSIZE, MSIZE);
if (ret) {
perror("posix_memalign() failed");
return NULL;
}
tmpbuf = mmap(tmpbuf, MSIZE, PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_FIXED, fd, 0);
if (tmpbuf == MAP_FAILED) {
perror("mmap() failed");
return NULL;
}
printf("mmap() -> %p\n", tmpbuf);
for (p = tmpbuf; p < tmpbuf + MSIZE; p += PSIZE) {
*p = 1;
}
return tmpbuf;
}
void do_unmap(void *buf)
{
munmap(buf, MSIZE);
}
void proc2(int fd)
{
unsigned char c;
buf = do_map(fd);
if (!buf)
return;
read(pipefd[0], &c, 1);
/*
* This frees the shared pgtable page, causing use-after-free in
* proc1_thread1 when soft walking hugetlb pgtable.
*/
do_unmap(buf);
printf("Proc2 quitting\n");
}
void *proc1_thread1(void *data)
{
/*
* Trigger follow-page on 1st 2m page. Kernel hack patch needed to
* withhold this procedure for easier reproduce.
*/
madvise(buf, PSIZE, MADV_POPULATE_WRITE);
printf("Proc1-thread1 quitting\n");
return NULL;
}
void *proc1_thread2(void *data)
{
unsigned char c;
/* Wait a while until proc1_thread1() start to wait */
sleep(0.5);
/* Trigger pmd unshare */
madvise(buf, PSIZE, MADV_DONTNEED);
/* Kick off proc2 to release the pgtable */
write(pipefd[1], &c, 1);
printf("Proc1-thread2 quitting\n");
return NULL;
}
void proc1(int fd)
{
pthread_t tid1, tid2;
int ret;
buf = do_map(fd);
if (!buf)
return;
ret = pthread_create(&tid1, NULL, proc1_thread1, NULL);
assert(ret == 0);
ret = pthread_create(&tid2, NULL, proc1_thread2, NULL);
assert(ret == 0);
/* Kick the child to share the PUD entry */
pthread_join(tid1, NULL);
pthread_join(tid2, NULL);
do_unmap(buf);
}
int main(void)
{
int fd, ret;
fd = memfd_create("test-huge", MFD_HUGETLB | MFD_HUGE_2MB);
if (fd < 0) {
perror("open failed");
return -1;
}
ret = ftruncate(fd, MSIZE);
if (ret) {
perror("ftruncate() failed");
return -1;
}
ret = pipe(pipefd);
if (ret) {
perror("pipe() failed");
return -1;
}
if (fork()) {
proc1(fd);
} else {
proc2(fd);
}
close(pipefd[0]);
close(pipefd[1]);
close(fd);
return 0;
}
======8<=======
The kernel patch needed to present such a race so it'll trigger 100%:
======8<=======
: diff --git a/mm/hugetlb.c b/mm/hugetlb.c
: index 9d97c9a2a15d..f8d99dad5004 100644
: --- a/mm/hugetlb.c
: +++ b/mm/hugetlb.c
: @@ -38,6 +38,7 @@
: #include <asm/page.h>
: #include <asm/pgalloc.h>
: #include <asm/tlb.h>
: +#include <asm/delay.h>
:
: #include <linux/io.h>
: #include <linux/hugetlb.h>
: @@ -6290,6 +6291,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
: bool unshare = false;
: int absent;
: struct page *page;
: + unsigned long c = 0;
:
: /*
: * If we have a pending SIGKILL, don't keep faulting pages and
: @@ -6309,6 +6311,13 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
: */
: pte = huge_pte_offset(mm, vaddr & huge_page_mask(h),
: huge_page_size(h));
: +
: + pr_info("%s: withhold 1 sec...\n", __func__);
: + for (c = 0; c < 100; c++) {
: + udelay(10000);
: + }
: + pr_info("%s: withhold 1 sec...done\n", __func__);
: +
: if (pte)
: ptl = huge_pte_lock(h, mm, pte);
: absent = !pte || huge_pte_none(huge_ptep_get(pte));
: ======8<=======
It'll trigger use-after-free of the pgtable spinlock:
======8<=======
[ 16.959907] follow_hugetlb_page: withhold 1 sec...
[ 17.960315] follow_hugetlb_page: withhold 1 sec...done
[ 17.960550] ------------[ cut here ]------------
[ 17.960742] DEBUG_LOCKS_WARN_ON(1)
[ 17.960756] WARNING: CPU: 3 PID: 542 at kernel/locking/lockdep.c:231 __lock_acquire+0x955/0x1fa0
[ 17.961264] Modules linked in:
[ 17.961394] CPU: 3 PID: 542 Comm: hugetlb-pmd-sha Not tainted 6.1.0-rc4-peterx+ #46
[ 17.961704] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014
[ 17.962266] RIP: 0010:__lock_acquire+0x955/0x1fa0
[ 17.962516] Code: c0 0f 84 5f fe ff ff 44 8b 1d 0f 9a 29 02 45 85 db 0f 85 4f fe ff ff 48 c7 c6 75 50 83 82 48 c7 c7 1b 4b 7d 82 e8 d3 22 d8 00 <0f> 0b 31 c0 4c 8b 54 24 08 4c 8b 04 24 e9
[ 17.963494] RSP: 0018:ffffc90000e4fba8 EFLAGS: 00010096
[ 17.963704] RAX: 0000000000000016 RBX: fffffffffd3925a8 RCX: 0000000000000000
[ 17.963989] RDX: 0000000000000002 RSI: ffffffff82863ccf RDI: 00000000ffffffff
[ 17.964276] RBP: 0000000000000000 R08: 0000000000000000 R09: ffffc90000e4fa58
[ 17.964557] R10: 0000000000000003 R11: ffffffff83162688 R12: 0000000000000000
[ 17.964839] R13: 0000000000000001 R14: ffff888105eac748 R15: 0000000000000001
[ 17.965123] FS: 00007f17c0a00640(0000) GS:ffff888277cc0000(0000) knlGS:0000000000000000
[ 17.965443] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 17.965672] CR2: 00007f17c09ffef8 CR3: 000000010c87a005 CR4: 0000000000770ee0
[ 17.965956] PKRU: 55555554
[ 17.966068] Call Trace:
[ 17.966172] <TASK>
[ 17.966268] ? tick_nohz_tick_stopped+0x12/0x30
[ 17.966455] lock_acquire+0xbf/0x2b0
[ 17.966603] ? follow_hugetlb_page.cold+0x75/0x5c4
[ 17.966799] ? _printk+0x48/0x4e
[ 17.966934] _raw_spin_lock+0x2f/0x40
[ 17.967087] ? follow_hugetlb_page.cold+0x75/0x5c4
[ 17.967285] follow_hugetlb_page.cold+0x75/0x5c4
[ 17.967473] __get_user_pages+0xbb/0x620
[ 17.967635] faultin_vma_page_range+0x9a/0x100
[ 17.967817] madvise_vma_behavior+0x3c0/0xbd0
[ 17.967998] ? mas_prev+0x11/0x290
[ 17.968141] ? find_vma_prev+0x5e/0xa0
[ 17.968304] ? madvise_vma_anon_name+0x70/0x70
[ 17.968486] madvise_walk_vmas+0xa9/0x120
[ 17.968650] do_madvise.part.0+0xfa/0x270
[ 17.968813] __x64_sys_madvise+0x5a/0x70
[ 17.968974] do_syscall_64+0x37/0x90
[ 17.969123] entry_SYSCALL_64_after_hwframe+0x63/0xcd
[ 17.969329] RIP: 0033:0x7f1840f0efdb
[ 17.969477] Code: c3 66 0f 1f 44 00 00 48 8b 15 39 6e 0e 00 f7 d8 64 89 02 b8 ff ff ff ff eb bc 0f 1f 44 00 00 f3 0f 1e fa b8 1c 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 0d 68
[ 17.970205] RSP: 002b:00007f17c09ffe38 EFLAGS: 00000202 ORIG_RAX: 000000000000001c
[ 17.970504] RAX: ffffffffffffffda RBX: 00007f17c0a00640 RCX: 00007f1840f0efdb
[ 17.970786] RDX: 0000000000000017 RSI: 0000000000200000 RDI: 00007f1800000000
[ 17.971068] RBP: 00007f17c09ffe50 R08: 0000000000000000 R09: 00007ffd3954164f
[ 17.971353] R10: 00007f1840e10348 R11: 0000000000000202 R12: ffffffffffffff80
[ 17.971709] R13: 0000000000000000 R14: 00007ffd39541550 R15: 00007f17c0200000
[ 17.972083] </TASK>
[ 17.972199] irq event stamp: 2353
[ 17.972372] hardirqs last enabled at (2353): [<ffffffff8117fe4e>] __up_console_sem+0x5e/0x70
[ 17.972869] hardirqs last disabled at (2352): [<ffffffff8117fe33>] __up_console_sem+0x43/0x70
[ 17.973365] softirqs last enabled at (2330): [<ffffffff810f763d>] __irq_exit_rcu+0xed/0x160
[ 17.973857] softirqs last disabled at (2323): [<ffffffff810f763d>] __irq_exit_rcu+0xed/0x160
[ 17.974341] ---[ end trace 0000000000000000 ]---
[ 17.974614] BUG: kernel NULL pointer dereference, address: 00000000000000b8
[ 17.975012] #PF: supervisor read access in kernel mode
[ 17.975314] #PF: error_code(0x0000) - not-present page
[ 17.975615] PGD 103f7b067 P4D 103f7b067 PUD 106cd7067 PMD 0
[ 17.975943] Oops: 0000 [#1] PREEMPT SMP NOPTI
[ 17.976197] CPU: 3 PID: 542 Comm: hugetlb-pmd-sha Tainted: G W 6.1.0-rc4-peterx+ #46
[ 17.976712] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014
[ 17.977370] RIP: 0010:__lock_acquire+0x190/0x1fa0
[ 17.977655] Code: 98 00 00 00 41 89 46 24 81 e2 ff 1f 00 00 48 0f a3 15 e4 ba dd 02 0f 83 ff 05 00 00 48 8d 04 52 48 c1 e0 06 48 05 c0 d2 f4 83 <44> 0f b6 a0 b8 00 00 00 41 0f b7 46 20 6f
[ 17.979170] RSP: 0018:ffffc90000e4fba8 EFLAGS: 00010046
[ 17.979787] RAX: 0000000000000000 RBX: fffffffffd3925a8 RCX: 0000000000000000
[ 17.980838] RDX: 0000000000000002 RSI: ffffffff82863ccf RDI: 00000000ffffffff
[ 17.982048] RBP: 0000000000000000 R08: ffff888105eac720 R09: ffffc90000e4fa58
[ 17.982892] R10: ffff888105eab900 R11: ffffffff83162688 R12: 0000000000000000
[ 17.983771] R13: 0000000000000001 R14: ffff888105eac748 R15: 0000000000000001
[ 17.984815] FS: 00007f17c0a00640(0000) GS:ffff888277cc0000(0000) knlGS:0000000000000000
[ 17.985924] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 17.986265] CR2: 00000000000000b8 CR3: 000000010c87a005 CR4: 0000000000770ee0
[ 17.986674] PKRU: 55555554
[ 17.986832] Call Trace:
[ 17.987012] <TASK>
[ 17.987266] ? tick_nohz_tick_stopped+0x12/0x30
[ 17.987770] lock_acquire+0xbf/0x2b0
[ 17.988118] ? follow_hugetlb_page.cold+0x75/0x5c4
[ 17.988575] ? _printk+0x48/0x4e
[ 17.988889] _raw_spin_lock+0x2f/0x40
[ 17.989243] ? follow_hugetlb_page.cold+0x75/0x5c4
[ 17.989687] follow_hugetlb_page.cold+0x75/0x5c4
[ 17.990119] __get_user_pages+0xbb/0x620
[ 17.990500] faultin_vma_page_range+0x9a/0x100
[ 17.990928] madvise_vma_behavior+0x3c0/0xbd0
[ 17.991354] ? mas_prev+0x11/0x290
[ 17.991678] ? find_vma_prev+0x5e/0xa0
[ 17.992024] ? madvise_vma_anon_name+0x70/0x70
[ 17.992421] madvise_walk_vmas+0xa9/0x120
[ 17.992793] do_madvise.part.0+0xfa/0x270
[ 17.993166] __x64_sys_madvise+0x5a/0x70
[ 17.993539] do_syscall_64+0x37/0x90
[ 17.993879] entry_SYSCALL_64_after_hwframe+0x63/0xcd
======8<=======
Resolution
==========
This patchset protects all the huge_pte_offset() callers to also take the
vma lock properly.
Patch Layout
============
Patch 1-2: cleanup, or dependency of the follow up patches
Patch 3: before fixing, document huge_pte_offset() on lock required
Patch 4-8: each patch resolves one possible race condition
Patch 9: introduce hugetlb_walk() to replace huge_pte_offset()
Tests
=====
The series is verified with the above reproducer so the race cannot
trigger anymore. It also passes all hugetlb kselftests.
This patch (of 9):
Even though vma_offset_start() is named like that, it's not returning "the
start address of the range" but rather the offset we should use to offset
the vma->vm_start address.
Make it return the real value of the start vaddr, and it also helps for
all the callers because whenever the retval is used, it'll be ultimately
added into the vma->vm_start anyway, so it's better.
Link: https://lkml.kernel.org/r/20221216155100.2043537-1-peterx@redhat.com
Link: https://lkml.kernel.org/r/20221216155100.2043537-2-peterx@redhat.com
Signed-off-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: John Hubbard <jhubbard@nvidia.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: James Houghton <jthoughton@google.com>
Cc: Jann Horn <jannh@google.com>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Rik van Riel <riel@surriel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2022-12-16 18:50:52 +03:00
unmap_hugepage_range ( vma , v_start , v_end , NULL ,
ZAP_FLAG_DROP_MARKER ) ;
2022-09-15 01:18:09 +03:00
hugetlb_vma_unlock_write ( vma ) ;
2022-09-15 01:18:08 +03:00
}
i_mmap_unlock_write ( mapping ) ;
2022-09-15 01:18:09 +03:00
if ( vma_lock ) {
/*
* Wait on vma_lock . We know it is still valid as we have
* a reference . We must ' open code ' vma locking as we do
* not know if vma_lock is still attached to vma .
*/
down_write ( & vma_lock - > rw_sema ) ;
i_mmap_lock_write ( mapping ) ;
vma = vma_lock - > vma ;
if ( ! vma ) {
/*
* If lock is no longer attached to vma , then just
* unlock , drop our reference and retry looking for
* other vmas .
*/
up_write ( & vma_lock - > rw_sema ) ;
kref_put ( & vma_lock - > refs , hugetlb_vma_lock_release ) ;
goto retry ;
}
/*
* vma_lock is still attached to vma . Check to see if vma
* still maps page and if so , unmap .
*/
v_start = vma_offset_start ( vma , start ) ;
v_end = vma_offset_end ( vma , end ) ;
mm/hugetlb: let vma_offset_start() to return start
Patch series "mm/hugetlb: Make huge_pte_offset() thread-safe for pmd
unshare", v4.
Problem
=======
huge_pte_offset() is a major helper used by hugetlb code paths to walk a
hugetlb pgtable. It's used mostly everywhere since that's needed even
before taking the pgtable lock.
huge_pte_offset() is always called with mmap lock held with either read or
write. It was assumed to be safe but it's actually not. One race
condition can easily trigger by: (1) firstly trigger pmd share on a memory
range, (2) do huge_pte_offset() on the range, then at the meantime, (3)
another thread unshare the pmd range, and the pgtable page is prone to lost
if the other shared process wants to free it completely (by either munmap
or exit mm).
The recent work from Mike on vma lock can resolve most of this already.
It's achieved by forbidden pmd unsharing during the lock being taken, so no
further risk of the pgtable page being freed. It means if we can take the
vma lock around all huge_pte_offset() callers it'll be safe.
There're already a bunch of them that we did as per the latest mm-unstable,
but also quite a few others that we didn't for various reasons especially
on huge_pte_offset() usage.
One more thing to mention is that besides the vma lock, i_mmap_rwsem can
also be used to protect the pgtable page (along with its pgtable lock) from
being freed from under us. IOW, huge_pte_offset() callers need to either
hold the vma lock or i_mmap_rwsem to safely walk the pgtables.
A reproducer of such problem, based on hugetlb GUP (NOTE: since the race is
very hard to trigger, one needs to apply another kernel delay patch too,
see below):
======8<=======
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <unistd.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <linux/memfd.h>
#include <assert.h>
#include <pthread.h>
#define MSIZE (1UL << 30) /* 1GB */
#define PSIZE (2UL << 20) /* 2MB */
#define HOLD_SEC (1)
int pipefd[2];
void *buf;
void *do_map(int fd)
{
unsigned char *tmpbuf, *p;
int ret;
ret = posix_memalign((void **)&tmpbuf, MSIZE, MSIZE);
if (ret) {
perror("posix_memalign() failed");
return NULL;
}
tmpbuf = mmap(tmpbuf, MSIZE, PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_FIXED, fd, 0);
if (tmpbuf == MAP_FAILED) {
perror("mmap() failed");
return NULL;
}
printf("mmap() -> %p\n", tmpbuf);
for (p = tmpbuf; p < tmpbuf + MSIZE; p += PSIZE) {
*p = 1;
}
return tmpbuf;
}
void do_unmap(void *buf)
{
munmap(buf, MSIZE);
}
void proc2(int fd)
{
unsigned char c;
buf = do_map(fd);
if (!buf)
return;
read(pipefd[0], &c, 1);
/*
* This frees the shared pgtable page, causing use-after-free in
* proc1_thread1 when soft walking hugetlb pgtable.
*/
do_unmap(buf);
printf("Proc2 quitting\n");
}
void *proc1_thread1(void *data)
{
/*
* Trigger follow-page on 1st 2m page. Kernel hack patch needed to
* withhold this procedure for easier reproduce.
*/
madvise(buf, PSIZE, MADV_POPULATE_WRITE);
printf("Proc1-thread1 quitting\n");
return NULL;
}
void *proc1_thread2(void *data)
{
unsigned char c;
/* Wait a while until proc1_thread1() start to wait */
sleep(0.5);
/* Trigger pmd unshare */
madvise(buf, PSIZE, MADV_DONTNEED);
/* Kick off proc2 to release the pgtable */
write(pipefd[1], &c, 1);
printf("Proc1-thread2 quitting\n");
return NULL;
}
void proc1(int fd)
{
pthread_t tid1, tid2;
int ret;
buf = do_map(fd);
if (!buf)
return;
ret = pthread_create(&tid1, NULL, proc1_thread1, NULL);
assert(ret == 0);
ret = pthread_create(&tid2, NULL, proc1_thread2, NULL);
assert(ret == 0);
/* Kick the child to share the PUD entry */
pthread_join(tid1, NULL);
pthread_join(tid2, NULL);
do_unmap(buf);
}
int main(void)
{
int fd, ret;
fd = memfd_create("test-huge", MFD_HUGETLB | MFD_HUGE_2MB);
if (fd < 0) {
perror("open failed");
return -1;
}
ret = ftruncate(fd, MSIZE);
if (ret) {
perror("ftruncate() failed");
return -1;
}
ret = pipe(pipefd);
if (ret) {
perror("pipe() failed");
return -1;
}
if (fork()) {
proc1(fd);
} else {
proc2(fd);
}
close(pipefd[0]);
close(pipefd[1]);
close(fd);
return 0;
}
======8<=======
The kernel patch needed to present such a race so it'll trigger 100%:
======8<=======
: diff --git a/mm/hugetlb.c b/mm/hugetlb.c
: index 9d97c9a2a15d..f8d99dad5004 100644
: --- a/mm/hugetlb.c
: +++ b/mm/hugetlb.c
: @@ -38,6 +38,7 @@
: #include <asm/page.h>
: #include <asm/pgalloc.h>
: #include <asm/tlb.h>
: +#include <asm/delay.h>
:
: #include <linux/io.h>
: #include <linux/hugetlb.h>
: @@ -6290,6 +6291,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
: bool unshare = false;
: int absent;
: struct page *page;
: + unsigned long c = 0;
:
: /*
: * If we have a pending SIGKILL, don't keep faulting pages and
: @@ -6309,6 +6311,13 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
: */
: pte = huge_pte_offset(mm, vaddr & huge_page_mask(h),
: huge_page_size(h));
: +
: + pr_info("%s: withhold 1 sec...\n", __func__);
: + for (c = 0; c < 100; c++) {
: + udelay(10000);
: + }
: + pr_info("%s: withhold 1 sec...done\n", __func__);
: +
: if (pte)
: ptl = huge_pte_lock(h, mm, pte);
: absent = !pte || huge_pte_none(huge_ptep_get(pte));
: ======8<=======
It'll trigger use-after-free of the pgtable spinlock:
======8<=======
[ 16.959907] follow_hugetlb_page: withhold 1 sec...
[ 17.960315] follow_hugetlb_page: withhold 1 sec...done
[ 17.960550] ------------[ cut here ]------------
[ 17.960742] DEBUG_LOCKS_WARN_ON(1)
[ 17.960756] WARNING: CPU: 3 PID: 542 at kernel/locking/lockdep.c:231 __lock_acquire+0x955/0x1fa0
[ 17.961264] Modules linked in:
[ 17.961394] CPU: 3 PID: 542 Comm: hugetlb-pmd-sha Not tainted 6.1.0-rc4-peterx+ #46
[ 17.961704] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014
[ 17.962266] RIP: 0010:__lock_acquire+0x955/0x1fa0
[ 17.962516] Code: c0 0f 84 5f fe ff ff 44 8b 1d 0f 9a 29 02 45 85 db 0f 85 4f fe ff ff 48 c7 c6 75 50 83 82 48 c7 c7 1b 4b 7d 82 e8 d3 22 d8 00 <0f> 0b 31 c0 4c 8b 54 24 08 4c 8b 04 24 e9
[ 17.963494] RSP: 0018:ffffc90000e4fba8 EFLAGS: 00010096
[ 17.963704] RAX: 0000000000000016 RBX: fffffffffd3925a8 RCX: 0000000000000000
[ 17.963989] RDX: 0000000000000002 RSI: ffffffff82863ccf RDI: 00000000ffffffff
[ 17.964276] RBP: 0000000000000000 R08: 0000000000000000 R09: ffffc90000e4fa58
[ 17.964557] R10: 0000000000000003 R11: ffffffff83162688 R12: 0000000000000000
[ 17.964839] R13: 0000000000000001 R14: ffff888105eac748 R15: 0000000000000001
[ 17.965123] FS: 00007f17c0a00640(0000) GS:ffff888277cc0000(0000) knlGS:0000000000000000
[ 17.965443] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 17.965672] CR2: 00007f17c09ffef8 CR3: 000000010c87a005 CR4: 0000000000770ee0
[ 17.965956] PKRU: 55555554
[ 17.966068] Call Trace:
[ 17.966172] <TASK>
[ 17.966268] ? tick_nohz_tick_stopped+0x12/0x30
[ 17.966455] lock_acquire+0xbf/0x2b0
[ 17.966603] ? follow_hugetlb_page.cold+0x75/0x5c4
[ 17.966799] ? _printk+0x48/0x4e
[ 17.966934] _raw_spin_lock+0x2f/0x40
[ 17.967087] ? follow_hugetlb_page.cold+0x75/0x5c4
[ 17.967285] follow_hugetlb_page.cold+0x75/0x5c4
[ 17.967473] __get_user_pages+0xbb/0x620
[ 17.967635] faultin_vma_page_range+0x9a/0x100
[ 17.967817] madvise_vma_behavior+0x3c0/0xbd0
[ 17.967998] ? mas_prev+0x11/0x290
[ 17.968141] ? find_vma_prev+0x5e/0xa0
[ 17.968304] ? madvise_vma_anon_name+0x70/0x70
[ 17.968486] madvise_walk_vmas+0xa9/0x120
[ 17.968650] do_madvise.part.0+0xfa/0x270
[ 17.968813] __x64_sys_madvise+0x5a/0x70
[ 17.968974] do_syscall_64+0x37/0x90
[ 17.969123] entry_SYSCALL_64_after_hwframe+0x63/0xcd
[ 17.969329] RIP: 0033:0x7f1840f0efdb
[ 17.969477] Code: c3 66 0f 1f 44 00 00 48 8b 15 39 6e 0e 00 f7 d8 64 89 02 b8 ff ff ff ff eb bc 0f 1f 44 00 00 f3 0f 1e fa b8 1c 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 0d 68
[ 17.970205] RSP: 002b:00007f17c09ffe38 EFLAGS: 00000202 ORIG_RAX: 000000000000001c
[ 17.970504] RAX: ffffffffffffffda RBX: 00007f17c0a00640 RCX: 00007f1840f0efdb
[ 17.970786] RDX: 0000000000000017 RSI: 0000000000200000 RDI: 00007f1800000000
[ 17.971068] RBP: 00007f17c09ffe50 R08: 0000000000000000 R09: 00007ffd3954164f
[ 17.971353] R10: 00007f1840e10348 R11: 0000000000000202 R12: ffffffffffffff80
[ 17.971709] R13: 0000000000000000 R14: 00007ffd39541550 R15: 00007f17c0200000
[ 17.972083] </TASK>
[ 17.972199] irq event stamp: 2353
[ 17.972372] hardirqs last enabled at (2353): [<ffffffff8117fe4e>] __up_console_sem+0x5e/0x70
[ 17.972869] hardirqs last disabled at (2352): [<ffffffff8117fe33>] __up_console_sem+0x43/0x70
[ 17.973365] softirqs last enabled at (2330): [<ffffffff810f763d>] __irq_exit_rcu+0xed/0x160
[ 17.973857] softirqs last disabled at (2323): [<ffffffff810f763d>] __irq_exit_rcu+0xed/0x160
[ 17.974341] ---[ end trace 0000000000000000 ]---
[ 17.974614] BUG: kernel NULL pointer dereference, address: 00000000000000b8
[ 17.975012] #PF: supervisor read access in kernel mode
[ 17.975314] #PF: error_code(0x0000) - not-present page
[ 17.975615] PGD 103f7b067 P4D 103f7b067 PUD 106cd7067 PMD 0
[ 17.975943] Oops: 0000 [#1] PREEMPT SMP NOPTI
[ 17.976197] CPU: 3 PID: 542 Comm: hugetlb-pmd-sha Tainted: G W 6.1.0-rc4-peterx+ #46
[ 17.976712] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014
[ 17.977370] RIP: 0010:__lock_acquire+0x190/0x1fa0
[ 17.977655] Code: 98 00 00 00 41 89 46 24 81 e2 ff 1f 00 00 48 0f a3 15 e4 ba dd 02 0f 83 ff 05 00 00 48 8d 04 52 48 c1 e0 06 48 05 c0 d2 f4 83 <44> 0f b6 a0 b8 00 00 00 41 0f b7 46 20 6f
[ 17.979170] RSP: 0018:ffffc90000e4fba8 EFLAGS: 00010046
[ 17.979787] RAX: 0000000000000000 RBX: fffffffffd3925a8 RCX: 0000000000000000
[ 17.980838] RDX: 0000000000000002 RSI: ffffffff82863ccf RDI: 00000000ffffffff
[ 17.982048] RBP: 0000000000000000 R08: ffff888105eac720 R09: ffffc90000e4fa58
[ 17.982892] R10: ffff888105eab900 R11: ffffffff83162688 R12: 0000000000000000
[ 17.983771] R13: 0000000000000001 R14: ffff888105eac748 R15: 0000000000000001
[ 17.984815] FS: 00007f17c0a00640(0000) GS:ffff888277cc0000(0000) knlGS:0000000000000000
[ 17.985924] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 17.986265] CR2: 00000000000000b8 CR3: 000000010c87a005 CR4: 0000000000770ee0
[ 17.986674] PKRU: 55555554
[ 17.986832] Call Trace:
[ 17.987012] <TASK>
[ 17.987266] ? tick_nohz_tick_stopped+0x12/0x30
[ 17.987770] lock_acquire+0xbf/0x2b0
[ 17.988118] ? follow_hugetlb_page.cold+0x75/0x5c4
[ 17.988575] ? _printk+0x48/0x4e
[ 17.988889] _raw_spin_lock+0x2f/0x40
[ 17.989243] ? follow_hugetlb_page.cold+0x75/0x5c4
[ 17.989687] follow_hugetlb_page.cold+0x75/0x5c4
[ 17.990119] __get_user_pages+0xbb/0x620
[ 17.990500] faultin_vma_page_range+0x9a/0x100
[ 17.990928] madvise_vma_behavior+0x3c0/0xbd0
[ 17.991354] ? mas_prev+0x11/0x290
[ 17.991678] ? find_vma_prev+0x5e/0xa0
[ 17.992024] ? madvise_vma_anon_name+0x70/0x70
[ 17.992421] madvise_walk_vmas+0xa9/0x120
[ 17.992793] do_madvise.part.0+0xfa/0x270
[ 17.993166] __x64_sys_madvise+0x5a/0x70
[ 17.993539] do_syscall_64+0x37/0x90
[ 17.993879] entry_SYSCALL_64_after_hwframe+0x63/0xcd
======8<=======
Resolution
==========
This patchset protects all the huge_pte_offset() callers to also take the
vma lock properly.
Patch Layout
============
Patch 1-2: cleanup, or dependency of the follow up patches
Patch 3: before fixing, document huge_pte_offset() on lock required
Patch 4-8: each patch resolves one possible race condition
Patch 9: introduce hugetlb_walk() to replace huge_pte_offset()
Tests
=====
The series is verified with the above reproducer so the race cannot
trigger anymore. It also passes all hugetlb kselftests.
This patch (of 9):
Even though vma_offset_start() is named like that, it's not returning "the
start address of the range" but rather the offset we should use to offset
the vma->vm_start address.
Make it return the real value of the start vaddr, and it also helps for
all the callers because whenever the retval is used, it'll be ultimately
added into the vma->vm_start anyway, so it's better.
Link: https://lkml.kernel.org/r/20221216155100.2043537-1-peterx@redhat.com
Link: https://lkml.kernel.org/r/20221216155100.2043537-2-peterx@redhat.com
Signed-off-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: John Hubbard <jhubbard@nvidia.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: James Houghton <jthoughton@google.com>
Cc: Jann Horn <jannh@google.com>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Rik van Riel <riel@surriel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2022-12-16 18:50:52 +03:00
if ( hugetlb_vma_maps_page ( vma , v_start , page ) )
unmap_hugepage_range ( vma , v_start , v_end , NULL ,
ZAP_FLAG_DROP_MARKER ) ;
2022-09-15 01:18:09 +03:00
kref_put ( & vma_lock - > refs , hugetlb_vma_lock_release ) ;
hugetlb_vma_unlock_write ( vma ) ;
goto retry ;
}
2022-09-15 01:18:08 +03:00
}
2016-01-16 03:57:40 +03:00
static void
2022-05-13 06:22:55 +03:00
hugetlb_vmdelete_list ( struct rb_root_cached * root , pgoff_t start , pgoff_t end ,
zap_flags_t zap_flags )
2016-01-16 03:57:40 +03:00
{
struct vm_area_struct * vma ;
/*
2022-01-15 01:08:30 +03:00
* end = = 0 indicates that the entire range after start should be
* unmapped . Note , end is exclusive , whereas the interval tree takes
* an inclusive " last " .
2016-01-16 03:57:40 +03:00
*/
2022-01-15 01:08:30 +03:00
vma_interval_tree_foreach ( vma , root , start , end ? end - 1 : ULONG_MAX ) {
2022-09-15 01:18:08 +03:00
unsigned long v_start ;
2016-01-16 03:57:40 +03:00
unsigned long v_end ;
2022-09-15 01:18:09 +03:00
if ( ! hugetlb_vma_trylock_write ( vma ) )
continue ;
2022-09-15 01:18:08 +03:00
v_start = vma_offset_start ( vma , start ) ;
v_end = vma_offset_end ( vma , end ) ;
2016-01-16 03:57:40 +03:00
mm/hugetlb: let vma_offset_start() to return start
Patch series "mm/hugetlb: Make huge_pte_offset() thread-safe for pmd
unshare", v4.
Problem
=======
huge_pte_offset() is a major helper used by hugetlb code paths to walk a
hugetlb pgtable. It's used mostly everywhere since that's needed even
before taking the pgtable lock.
huge_pte_offset() is always called with mmap lock held with either read or
write. It was assumed to be safe but it's actually not. One race
condition can easily trigger by: (1) firstly trigger pmd share on a memory
range, (2) do huge_pte_offset() on the range, then at the meantime, (3)
another thread unshare the pmd range, and the pgtable page is prone to lost
if the other shared process wants to free it completely (by either munmap
or exit mm).
The recent work from Mike on vma lock can resolve most of this already.
It's achieved by forbidden pmd unsharing during the lock being taken, so no
further risk of the pgtable page being freed. It means if we can take the
vma lock around all huge_pte_offset() callers it'll be safe.
There're already a bunch of them that we did as per the latest mm-unstable,
but also quite a few others that we didn't for various reasons especially
on huge_pte_offset() usage.
One more thing to mention is that besides the vma lock, i_mmap_rwsem can
also be used to protect the pgtable page (along with its pgtable lock) from
being freed from under us. IOW, huge_pte_offset() callers need to either
hold the vma lock or i_mmap_rwsem to safely walk the pgtables.
A reproducer of such problem, based on hugetlb GUP (NOTE: since the race is
very hard to trigger, one needs to apply another kernel delay patch too,
see below):
======8<=======
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <unistd.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <linux/memfd.h>
#include <assert.h>
#include <pthread.h>
#define MSIZE (1UL << 30) /* 1GB */
#define PSIZE (2UL << 20) /* 2MB */
#define HOLD_SEC (1)
int pipefd[2];
void *buf;
void *do_map(int fd)
{
unsigned char *tmpbuf, *p;
int ret;
ret = posix_memalign((void **)&tmpbuf, MSIZE, MSIZE);
if (ret) {
perror("posix_memalign() failed");
return NULL;
}
tmpbuf = mmap(tmpbuf, MSIZE, PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_FIXED, fd, 0);
if (tmpbuf == MAP_FAILED) {
perror("mmap() failed");
return NULL;
}
printf("mmap() -> %p\n", tmpbuf);
for (p = tmpbuf; p < tmpbuf + MSIZE; p += PSIZE) {
*p = 1;
}
return tmpbuf;
}
void do_unmap(void *buf)
{
munmap(buf, MSIZE);
}
void proc2(int fd)
{
unsigned char c;
buf = do_map(fd);
if (!buf)
return;
read(pipefd[0], &c, 1);
/*
* This frees the shared pgtable page, causing use-after-free in
* proc1_thread1 when soft walking hugetlb pgtable.
*/
do_unmap(buf);
printf("Proc2 quitting\n");
}
void *proc1_thread1(void *data)
{
/*
* Trigger follow-page on 1st 2m page. Kernel hack patch needed to
* withhold this procedure for easier reproduce.
*/
madvise(buf, PSIZE, MADV_POPULATE_WRITE);
printf("Proc1-thread1 quitting\n");
return NULL;
}
void *proc1_thread2(void *data)
{
unsigned char c;
/* Wait a while until proc1_thread1() start to wait */
sleep(0.5);
/* Trigger pmd unshare */
madvise(buf, PSIZE, MADV_DONTNEED);
/* Kick off proc2 to release the pgtable */
write(pipefd[1], &c, 1);
printf("Proc1-thread2 quitting\n");
return NULL;
}
void proc1(int fd)
{
pthread_t tid1, tid2;
int ret;
buf = do_map(fd);
if (!buf)
return;
ret = pthread_create(&tid1, NULL, proc1_thread1, NULL);
assert(ret == 0);
ret = pthread_create(&tid2, NULL, proc1_thread2, NULL);
assert(ret == 0);
/* Kick the child to share the PUD entry */
pthread_join(tid1, NULL);
pthread_join(tid2, NULL);
do_unmap(buf);
}
int main(void)
{
int fd, ret;
fd = memfd_create("test-huge", MFD_HUGETLB | MFD_HUGE_2MB);
if (fd < 0) {
perror("open failed");
return -1;
}
ret = ftruncate(fd, MSIZE);
if (ret) {
perror("ftruncate() failed");
return -1;
}
ret = pipe(pipefd);
if (ret) {
perror("pipe() failed");
return -1;
}
if (fork()) {
proc1(fd);
} else {
proc2(fd);
}
close(pipefd[0]);
close(pipefd[1]);
close(fd);
return 0;
}
======8<=======
The kernel patch needed to present such a race so it'll trigger 100%:
======8<=======
: diff --git a/mm/hugetlb.c b/mm/hugetlb.c
: index 9d97c9a2a15d..f8d99dad5004 100644
: --- a/mm/hugetlb.c
: +++ b/mm/hugetlb.c
: @@ -38,6 +38,7 @@
: #include <asm/page.h>
: #include <asm/pgalloc.h>
: #include <asm/tlb.h>
: +#include <asm/delay.h>
:
: #include <linux/io.h>
: #include <linux/hugetlb.h>
: @@ -6290,6 +6291,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
: bool unshare = false;
: int absent;
: struct page *page;
: + unsigned long c = 0;
:
: /*
: * If we have a pending SIGKILL, don't keep faulting pages and
: @@ -6309,6 +6311,13 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
: */
: pte = huge_pte_offset(mm, vaddr & huge_page_mask(h),
: huge_page_size(h));
: +
: + pr_info("%s: withhold 1 sec...\n", __func__);
: + for (c = 0; c < 100; c++) {
: + udelay(10000);
: + }
: + pr_info("%s: withhold 1 sec...done\n", __func__);
: +
: if (pte)
: ptl = huge_pte_lock(h, mm, pte);
: absent = !pte || huge_pte_none(huge_ptep_get(pte));
: ======8<=======
It'll trigger use-after-free of the pgtable spinlock:
======8<=======
[ 16.959907] follow_hugetlb_page: withhold 1 sec...
[ 17.960315] follow_hugetlb_page: withhold 1 sec...done
[ 17.960550] ------------[ cut here ]------------
[ 17.960742] DEBUG_LOCKS_WARN_ON(1)
[ 17.960756] WARNING: CPU: 3 PID: 542 at kernel/locking/lockdep.c:231 __lock_acquire+0x955/0x1fa0
[ 17.961264] Modules linked in:
[ 17.961394] CPU: 3 PID: 542 Comm: hugetlb-pmd-sha Not tainted 6.1.0-rc4-peterx+ #46
[ 17.961704] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014
[ 17.962266] RIP: 0010:__lock_acquire+0x955/0x1fa0
[ 17.962516] Code: c0 0f 84 5f fe ff ff 44 8b 1d 0f 9a 29 02 45 85 db 0f 85 4f fe ff ff 48 c7 c6 75 50 83 82 48 c7 c7 1b 4b 7d 82 e8 d3 22 d8 00 <0f> 0b 31 c0 4c 8b 54 24 08 4c 8b 04 24 e9
[ 17.963494] RSP: 0018:ffffc90000e4fba8 EFLAGS: 00010096
[ 17.963704] RAX: 0000000000000016 RBX: fffffffffd3925a8 RCX: 0000000000000000
[ 17.963989] RDX: 0000000000000002 RSI: ffffffff82863ccf RDI: 00000000ffffffff
[ 17.964276] RBP: 0000000000000000 R08: 0000000000000000 R09: ffffc90000e4fa58
[ 17.964557] R10: 0000000000000003 R11: ffffffff83162688 R12: 0000000000000000
[ 17.964839] R13: 0000000000000001 R14: ffff888105eac748 R15: 0000000000000001
[ 17.965123] FS: 00007f17c0a00640(0000) GS:ffff888277cc0000(0000) knlGS:0000000000000000
[ 17.965443] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 17.965672] CR2: 00007f17c09ffef8 CR3: 000000010c87a005 CR4: 0000000000770ee0
[ 17.965956] PKRU: 55555554
[ 17.966068] Call Trace:
[ 17.966172] <TASK>
[ 17.966268] ? tick_nohz_tick_stopped+0x12/0x30
[ 17.966455] lock_acquire+0xbf/0x2b0
[ 17.966603] ? follow_hugetlb_page.cold+0x75/0x5c4
[ 17.966799] ? _printk+0x48/0x4e
[ 17.966934] _raw_spin_lock+0x2f/0x40
[ 17.967087] ? follow_hugetlb_page.cold+0x75/0x5c4
[ 17.967285] follow_hugetlb_page.cold+0x75/0x5c4
[ 17.967473] __get_user_pages+0xbb/0x620
[ 17.967635] faultin_vma_page_range+0x9a/0x100
[ 17.967817] madvise_vma_behavior+0x3c0/0xbd0
[ 17.967998] ? mas_prev+0x11/0x290
[ 17.968141] ? find_vma_prev+0x5e/0xa0
[ 17.968304] ? madvise_vma_anon_name+0x70/0x70
[ 17.968486] madvise_walk_vmas+0xa9/0x120
[ 17.968650] do_madvise.part.0+0xfa/0x270
[ 17.968813] __x64_sys_madvise+0x5a/0x70
[ 17.968974] do_syscall_64+0x37/0x90
[ 17.969123] entry_SYSCALL_64_after_hwframe+0x63/0xcd
[ 17.969329] RIP: 0033:0x7f1840f0efdb
[ 17.969477] Code: c3 66 0f 1f 44 00 00 48 8b 15 39 6e 0e 00 f7 d8 64 89 02 b8 ff ff ff ff eb bc 0f 1f 44 00 00 f3 0f 1e fa b8 1c 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 0d 68
[ 17.970205] RSP: 002b:00007f17c09ffe38 EFLAGS: 00000202 ORIG_RAX: 000000000000001c
[ 17.970504] RAX: ffffffffffffffda RBX: 00007f17c0a00640 RCX: 00007f1840f0efdb
[ 17.970786] RDX: 0000000000000017 RSI: 0000000000200000 RDI: 00007f1800000000
[ 17.971068] RBP: 00007f17c09ffe50 R08: 0000000000000000 R09: 00007ffd3954164f
[ 17.971353] R10: 00007f1840e10348 R11: 0000000000000202 R12: ffffffffffffff80
[ 17.971709] R13: 0000000000000000 R14: 00007ffd39541550 R15: 00007f17c0200000
[ 17.972083] </TASK>
[ 17.972199] irq event stamp: 2353
[ 17.972372] hardirqs last enabled at (2353): [<ffffffff8117fe4e>] __up_console_sem+0x5e/0x70
[ 17.972869] hardirqs last disabled at (2352): [<ffffffff8117fe33>] __up_console_sem+0x43/0x70
[ 17.973365] softirqs last enabled at (2330): [<ffffffff810f763d>] __irq_exit_rcu+0xed/0x160
[ 17.973857] softirqs last disabled at (2323): [<ffffffff810f763d>] __irq_exit_rcu+0xed/0x160
[ 17.974341] ---[ end trace 0000000000000000 ]---
[ 17.974614] BUG: kernel NULL pointer dereference, address: 00000000000000b8
[ 17.975012] #PF: supervisor read access in kernel mode
[ 17.975314] #PF: error_code(0x0000) - not-present page
[ 17.975615] PGD 103f7b067 P4D 103f7b067 PUD 106cd7067 PMD 0
[ 17.975943] Oops: 0000 [#1] PREEMPT SMP NOPTI
[ 17.976197] CPU: 3 PID: 542 Comm: hugetlb-pmd-sha Tainted: G W 6.1.0-rc4-peterx+ #46
[ 17.976712] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014
[ 17.977370] RIP: 0010:__lock_acquire+0x190/0x1fa0
[ 17.977655] Code: 98 00 00 00 41 89 46 24 81 e2 ff 1f 00 00 48 0f a3 15 e4 ba dd 02 0f 83 ff 05 00 00 48 8d 04 52 48 c1 e0 06 48 05 c0 d2 f4 83 <44> 0f b6 a0 b8 00 00 00 41 0f b7 46 20 6f
[ 17.979170] RSP: 0018:ffffc90000e4fba8 EFLAGS: 00010046
[ 17.979787] RAX: 0000000000000000 RBX: fffffffffd3925a8 RCX: 0000000000000000
[ 17.980838] RDX: 0000000000000002 RSI: ffffffff82863ccf RDI: 00000000ffffffff
[ 17.982048] RBP: 0000000000000000 R08: ffff888105eac720 R09: ffffc90000e4fa58
[ 17.982892] R10: ffff888105eab900 R11: ffffffff83162688 R12: 0000000000000000
[ 17.983771] R13: 0000000000000001 R14: ffff888105eac748 R15: 0000000000000001
[ 17.984815] FS: 00007f17c0a00640(0000) GS:ffff888277cc0000(0000) knlGS:0000000000000000
[ 17.985924] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 17.986265] CR2: 00000000000000b8 CR3: 000000010c87a005 CR4: 0000000000770ee0
[ 17.986674] PKRU: 55555554
[ 17.986832] Call Trace:
[ 17.987012] <TASK>
[ 17.987266] ? tick_nohz_tick_stopped+0x12/0x30
[ 17.987770] lock_acquire+0xbf/0x2b0
[ 17.988118] ? follow_hugetlb_page.cold+0x75/0x5c4
[ 17.988575] ? _printk+0x48/0x4e
[ 17.988889] _raw_spin_lock+0x2f/0x40
[ 17.989243] ? follow_hugetlb_page.cold+0x75/0x5c4
[ 17.989687] follow_hugetlb_page.cold+0x75/0x5c4
[ 17.990119] __get_user_pages+0xbb/0x620
[ 17.990500] faultin_vma_page_range+0x9a/0x100
[ 17.990928] madvise_vma_behavior+0x3c0/0xbd0
[ 17.991354] ? mas_prev+0x11/0x290
[ 17.991678] ? find_vma_prev+0x5e/0xa0
[ 17.992024] ? madvise_vma_anon_name+0x70/0x70
[ 17.992421] madvise_walk_vmas+0xa9/0x120
[ 17.992793] do_madvise.part.0+0xfa/0x270
[ 17.993166] __x64_sys_madvise+0x5a/0x70
[ 17.993539] do_syscall_64+0x37/0x90
[ 17.993879] entry_SYSCALL_64_after_hwframe+0x63/0xcd
======8<=======
Resolution
==========
This patchset protects all the huge_pte_offset() callers to also take the
vma lock properly.
Patch Layout
============
Patch 1-2: cleanup, or dependency of the follow up patches
Patch 3: before fixing, document huge_pte_offset() on lock required
Patch 4-8: each patch resolves one possible race condition
Patch 9: introduce hugetlb_walk() to replace huge_pte_offset()
Tests
=====
The series is verified with the above reproducer so the race cannot
trigger anymore. It also passes all hugetlb kselftests.
This patch (of 9):
Even though vma_offset_start() is named like that, it's not returning "the
start address of the range" but rather the offset we should use to offset
the vma->vm_start address.
Make it return the real value of the start vaddr, and it also helps for
all the callers because whenever the retval is used, it'll be ultimately
added into the vma->vm_start anyway, so it's better.
Link: https://lkml.kernel.org/r/20221216155100.2043537-1-peterx@redhat.com
Link: https://lkml.kernel.org/r/20221216155100.2043537-2-peterx@redhat.com
Signed-off-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: John Hubbard <jhubbard@nvidia.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: James Houghton <jthoughton@google.com>
Cc: Jann Horn <jannh@google.com>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Rik van Riel <riel@surriel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2022-12-16 18:50:52 +03:00
unmap_hugepage_range ( vma , v_start , v_end , NULL , zap_flags ) ;
2022-09-15 01:18:09 +03:00
/*
* Note that vma lock only exists for shared / non - private
* vmas . Therefore , lock is not held when calling
* unmap_hugepage_range for private vmas .
*/
hugetlb_vma_unlock_write ( vma ) ;
2016-01-16 03:57:40 +03:00
}
}
2015-09-09 01:01:41 +03:00
2022-09-15 01:18:05 +03:00
/*
* Called with hugetlb fault mutex held .
* Returns true if page was actually removed , false otherwise .
*/
static bool remove_inode_single_folio ( struct hstate * h , struct inode * inode ,
struct address_space * mapping ,
struct folio * folio , pgoff_t index ,
bool truncate_op )
{
bool ret = false ;
/*
* If folio is mapped , it was faulted in after being
* unmapped in caller . Unmap ( again ) while holding
* the fault mutex . The mutex will prevent faults
* until we finish removing the folio .
*/
2022-09-15 01:18:08 +03:00
if ( unlikely ( folio_mapped ( folio ) ) )
hugetlb_unmap_file_folio ( h , mapping , folio , index ) ;
2022-09-15 01:18:05 +03:00
folio_lock ( folio ) ;
/*
2022-09-15 01:18:10 +03:00
* We must remove the folio from page cache before removing
* the region / reserve map ( hugetlb_unreserve_pages ) . In
* rare out of memory conditions , removal of the region / reserve
* map could fail . Correspondingly , the subpool and global
* reserve usage count can need to be adjusted .
2022-09-15 01:18:05 +03:00
*/
2022-09-22 18:42:06 +03:00
VM_BUG_ON_FOLIO ( folio_test_hugetlb_restore_reserve ( folio ) , folio ) ;
hugetlb_delete_from_page_cache ( folio ) ;
2022-09-15 01:18:10 +03:00
ret = true ;
if ( ! truncate_op ) {
if ( unlikely ( hugetlb_unreserve_pages ( inode , index ,
index + 1 , 1 ) ) )
hugetlb_fix_reserve_counts ( inode ) ;
2022-09-15 01:18:05 +03:00
}
folio_unlock ( folio ) ;
return ret ;
}
2015-09-09 01:01:41 +03:00
/*
* remove_inode_hugepages handles two distinct cases : truncation and hole
* punch . There are subtle differences in operation for each case .
2016-01-16 03:57:40 +03:00
*
2015-09-09 01:01:41 +03:00
* truncation is indicated by end of range being LLONG_MAX
* In this case , we first scan the range and release found pages .
2021-02-24 23:10:21 +03:00
* After releasing pages , hugetlb_unreserve_pages cleans up region / reserve
2022-09-15 01:18:05 +03:00
* maps and global counts . Page faults can race with truncation .
* During faults , hugetlb_no_page ( ) checks i_size before page allocation ,
* and again after obtaining page table lock . It will ' back out '
* allocations in the truncated range .
2015-09-09 01:01:41 +03:00
* hole punch is indicated if end is not LLONG_MAX
* In the hole punch case we scan the range and release found pages .
2021-02-24 23:10:21 +03:00
* Only when releasing a page is the associated region / reserve map
* deleted . The region / reserve map for ranges without associated
2019-01-09 02:23:32 +03:00
* pages are not modified . Page faults can race with hole punch .
* This is indicated if we find a mapped page .
2015-09-09 01:01:41 +03:00
* Note : If the passed end of range value is beyond the end of file , but
* not LLONG_MAX this routine still performs a hole punch operation .
*/
static void remove_inode_hugepages ( struct inode * inode , loff_t lstart ,
loff_t lend )
2005-04-17 02:20:36 +04:00
{
2008-07-24 08:27:41 +04:00
struct hstate * h = hstate_inode ( inode ) ;
[PATCH] hugepage: Strict page reservation for hugepage inodes
These days, hugepages are demand-allocated at first fault time. There's a
somewhat dubious (and racy) heuristic when making a new mmap() to check if
there are enough available hugepages to fully satisfy that mapping.
A particularly obvious case where the heuristic breaks down is where a
process maps its hugepages not as a single chunk, but as a bunch of
individually mmap()ed (or shmat()ed) blocks without touching and
instantiating the pages in between allocations. In this case the size of
each block is compared against the total number of available hugepages.
It's thus easy for the process to become overcommitted, because each block
mapping will succeed, although the total number of hugepages required by
all blocks exceeds the number available. In particular, this defeats such
a program which will detect a mapping failure and adjust its hugepage usage
downward accordingly.
The patch below addresses this problem, by strictly reserving a number of
physical hugepages for hugepage inodes which have been mapped, but not
instatiated. MAP_SHARED mappings are thus "safe" - they will fail on
mmap(), not later with an OOM SIGKILL. MAP_PRIVATE mappings can still
trigger an OOM. (Actually SHARED mappings can technically still OOM, but
only if the sysadmin explicitly reduces the hugepage pool between mapping
and instantiation)
This patch appears to address the problem at hand - it allows DB2 to start
correctly, for instance, which previously suffered the failure described
above.
This patch causes no regressions on the libhugetblfs testsuite, and makes a
test (designed to catch this problem) pass which previously failed (ppc64,
POWER5).
Signed-off-by: David Gibson <dwg@au1.ibm.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-22 11:08:55 +03:00
struct address_space * mapping = & inode - > i_data ;
2023-09-26 22:20:17 +03:00
const pgoff_t end = lend > > PAGE_SHIFT ;
2022-06-04 23:39:04 +03:00
struct folio_batch fbatch ;
2017-09-07 02:21:18 +03:00
pgoff_t next , index ;
2006-06-23 13:03:15 +04:00
int i , freed = 0 ;
2015-09-09 01:01:41 +03:00
bool truncate_op = ( lend = = LLONG_MAX ) ;
2005-04-17 02:20:36 +04:00
2022-06-04 23:39:04 +03:00
folio_batch_init ( & fbatch ) ;
2023-09-26 22:20:17 +03:00
next = lstart > > PAGE_SHIFT ;
2022-06-04 23:39:04 +03:00
while ( filemap_get_folios ( mapping , & next , end - 1 , & fbatch ) ) {
for ( i = 0 ; i < folio_batch_count ( & fbatch ) ; + + i ) {
struct folio * folio = fbatch . folios [ i ] ;
2021-05-05 04:33:34 +03:00
u32 hash = 0 ;
2015-09-09 01:01:41 +03:00
2023-09-26 22:20:17 +03:00
index = folio - > index > > huge_page_order ( h ) ;
hugetlbfs: revert use i_mmap_rwsem to address page fault/truncate race
Patch series "hugetlb: Use new vma lock for huge pmd sharing
synchronization", v2.
hugetlb fault scalability regressions have recently been reported [1].
This is not the first such report, as regressions were also noted when
commit c0d0381ade79 ("hugetlbfs: use i_mmap_rwsem for more pmd sharing
synchronization") was added [2] in v5.7. At that time, a proposal to
address the regression was suggested [3] but went nowhere.
The regression and benefit of this patch series is not evident when
using the vm_scalability benchmark reported in [2] on a recent kernel.
Results from running,
"./usemem -n 48 --prealloc --prefault -O -U 3448054972"
48 sample Avg
next-20220913 next-20220913 next-20220913
unmodified revert i_mmap_sema locking vma sema locking, this series
-----------------------------------------------------------------------------
498150 KB/s 501934 KB/s 504793 KB/s
The recent regression report [1] notes page fault and fork latency of
shared hugetlb mappings. To measure this, I created two simple programs:
1) map a shared hugetlb area, write fault all pages, unmap area
Do this in a continuous loop to measure faults per second
2) map a shared hugetlb area, write fault a few pages, fork and exit
Do this in a continuous loop to measure forks per second
These programs were run on a 48 CPU VM with 320GB memory. The shared
mapping size was 250GB. For comparison, a single instance of the program
was run. Then, multiple instances were run in parallel to introduce
lock contention. Changing the locking scheme results in a significant
performance benefit.
test instances unmodified revert vma
--------------------------------------------------------------------------
faults per sec 1 393043 395680 389932
faults per sec 24 71405 81191 79048
forks per sec 1 2802 2747 2725
forks per sec 24 439 536 500
Combined faults 24 1621 68070 53662
Combined forks 24 358 67 142
Combined test is when running both faulting program and forking program
simultaneously.
Patches 1 and 2 of this series revert c0d0381ade79 and 87bf91d39bb5 which
depends on c0d0381ade79. Acquisition of i_mmap_rwsem is still required in
the fault path to establish pmd sharing, so this is moved back to
huge_pmd_share. With c0d0381ade79 reverted, this race is exposed:
Faulting thread Unsharing thread
... ...
ptep = huge_pte_offset()
or
ptep = huge_pte_alloc()
...
i_mmap_lock_write
lock page table
ptep invalid <------------------------ huge_pmd_unshare()
Could be in a previously unlock_page_table
sharing process or worse i_mmap_unlock_write
...
ptl = huge_pte_lock(ptep)
get/update pte
set_pte_at(pte, ptep)
Reverting 87bf91d39bb5 exposes races in page fault/file truncation. When
the new vma lock is put to use in patch 8, this will handle the fault/file
truncation races. This is explained in patch 9 where code associated with
these races is cleaned up.
Patches 3 - 5 restructure existing code in preparation for using the new
vma lock (rw semaphore) for pmd sharing synchronization. The idea is that
this semaphore will be held in read mode for the duration of fault
processing, and held in write mode for unmap operations which may call
huge_pmd_unshare. Acquiring i_mmap_rwsem is also still required to
synchronize huge pmd sharing. However it is only required in the fault
path when setting up sharing, and will be acquired in huge_pmd_share().
Patch 6 adds the new vma lock and all supporting routines, but does not
actually change code to use the new lock.
Patch 7 refactors code in preparation for using the new lock. And, patch
8 finally adds code to make use of this new vma lock. Unfortunately, the
fault code and truncate/hole punch code would naturally take locks in the
opposite order which could lead to deadlock. Since the performance of
page faults is more important, the truncation/hole punch code is modified
to back out and take locks in the correct order if necessary.
[1] https://lore.kernel.org/linux-mm/43faf292-245b-5db5-cce9-369d8fb6bd21@infradead.org/
[2] https://lore.kernel.org/lkml/20200622005551.GK5535@shao2-debian/
[3] https://lore.kernel.org/linux-mm/20200706202615.32111-1-mike.kravetz@oracle.com/
This patch (of 9):
Commit c0d0381ade79 ("hugetlbfs: use i_mmap_rwsem for more pmd sharing
synchronization") added code to take i_mmap_rwsem in read mode for the
duration of fault processing. The use of i_mmap_rwsem to prevent
fault/truncate races depends on this. However, this has been shown to
cause performance/scaling issues. As a result, that code will be
reverted. Since the use i_mmap_rwsem to address page fault/truncate races
depends on this, it must also be reverted.
In a subsequent patch, code will be added to detect the fault/truncate
race and back out operations as required.
Link: https://lkml.kernel.org/r/20220914221810.95771-1-mike.kravetz@oracle.com
Link: https://lkml.kernel.org/r/20220914221810.95771-2-mike.kravetz@oracle.com
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: James Houghton <jthoughton@google.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mina Almasry <almasrymina@google.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Naoya Horiguchi <naoya.horiguchi@linux.dev>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Prakash Sangappa <prakash.sangappa@oracle.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2022-09-15 01:18:02 +03:00
hash = hugetlb_fault_mutex_hash ( mapping , index ) ;
mutex_lock ( & hugetlb_fault_mutex_table [ hash ] ) ;
2019-01-09 02:23:32 +03:00
2016-01-16 03:57:40 +03:00
/*
2022-09-15 01:18:05 +03:00
* Remove folio that was part of folio_batch .
2016-01-16 03:57:40 +03:00
*/
2022-09-15 01:18:05 +03:00
if ( remove_inode_single_folio ( h , inode , mapping , folio ,
index , truncate_op ) )
freed + + ;
hugetlbfs: revert use i_mmap_rwsem to address page fault/truncate race
Patch series "hugetlb: Use new vma lock for huge pmd sharing
synchronization", v2.
hugetlb fault scalability regressions have recently been reported [1].
This is not the first such report, as regressions were also noted when
commit c0d0381ade79 ("hugetlbfs: use i_mmap_rwsem for more pmd sharing
synchronization") was added [2] in v5.7. At that time, a proposal to
address the regression was suggested [3] but went nowhere.
The regression and benefit of this patch series is not evident when
using the vm_scalability benchmark reported in [2] on a recent kernel.
Results from running,
"./usemem -n 48 --prealloc --prefault -O -U 3448054972"
48 sample Avg
next-20220913 next-20220913 next-20220913
unmodified revert i_mmap_sema locking vma sema locking, this series
-----------------------------------------------------------------------------
498150 KB/s 501934 KB/s 504793 KB/s
The recent regression report [1] notes page fault and fork latency of
shared hugetlb mappings. To measure this, I created two simple programs:
1) map a shared hugetlb area, write fault all pages, unmap area
Do this in a continuous loop to measure faults per second
2) map a shared hugetlb area, write fault a few pages, fork and exit
Do this in a continuous loop to measure forks per second
These programs were run on a 48 CPU VM with 320GB memory. The shared
mapping size was 250GB. For comparison, a single instance of the program
was run. Then, multiple instances were run in parallel to introduce
lock contention. Changing the locking scheme results in a significant
performance benefit.
test instances unmodified revert vma
--------------------------------------------------------------------------
faults per sec 1 393043 395680 389932
faults per sec 24 71405 81191 79048
forks per sec 1 2802 2747 2725
forks per sec 24 439 536 500
Combined faults 24 1621 68070 53662
Combined forks 24 358 67 142
Combined test is when running both faulting program and forking program
simultaneously.
Patches 1 and 2 of this series revert c0d0381ade79 and 87bf91d39bb5 which
depends on c0d0381ade79. Acquisition of i_mmap_rwsem is still required in
the fault path to establish pmd sharing, so this is moved back to
huge_pmd_share. With c0d0381ade79 reverted, this race is exposed:
Faulting thread Unsharing thread
... ...
ptep = huge_pte_offset()
or
ptep = huge_pte_alloc()
...
i_mmap_lock_write
lock page table
ptep invalid <------------------------ huge_pmd_unshare()
Could be in a previously unlock_page_table
sharing process or worse i_mmap_unlock_write
...
ptl = huge_pte_lock(ptep)
get/update pte
set_pte_at(pte, ptep)
Reverting 87bf91d39bb5 exposes races in page fault/file truncation. When
the new vma lock is put to use in patch 8, this will handle the fault/file
truncation races. This is explained in patch 9 where code associated with
these races is cleaned up.
Patches 3 - 5 restructure existing code in preparation for using the new
vma lock (rw semaphore) for pmd sharing synchronization. The idea is that
this semaphore will be held in read mode for the duration of fault
processing, and held in write mode for unmap operations which may call
huge_pmd_unshare. Acquiring i_mmap_rwsem is also still required to
synchronize huge pmd sharing. However it is only required in the fault
path when setting up sharing, and will be acquired in huge_pmd_share().
Patch 6 adds the new vma lock and all supporting routines, but does not
actually change code to use the new lock.
Patch 7 refactors code in preparation for using the new lock. And, patch
8 finally adds code to make use of this new vma lock. Unfortunately, the
fault code and truncate/hole punch code would naturally take locks in the
opposite order which could lead to deadlock. Since the performance of
page faults is more important, the truncation/hole punch code is modified
to back out and take locks in the correct order if necessary.
[1] https://lore.kernel.org/linux-mm/43faf292-245b-5db5-cce9-369d8fb6bd21@infradead.org/
[2] https://lore.kernel.org/lkml/20200622005551.GK5535@shao2-debian/
[3] https://lore.kernel.org/linux-mm/20200706202615.32111-1-mike.kravetz@oracle.com/
This patch (of 9):
Commit c0d0381ade79 ("hugetlbfs: use i_mmap_rwsem for more pmd sharing
synchronization") added code to take i_mmap_rwsem in read mode for the
duration of fault processing. The use of i_mmap_rwsem to prevent
fault/truncate races depends on this. However, this has been shown to
cause performance/scaling issues. As a result, that code will be
reverted. Since the use i_mmap_rwsem to address page fault/truncate races
depends on this, it must also be reverted.
In a subsequent patch, code will be added to detect the fault/truncate
race and back out operations as required.
Link: https://lkml.kernel.org/r/20220914221810.95771-1-mike.kravetz@oracle.com
Link: https://lkml.kernel.org/r/20220914221810.95771-2-mike.kravetz@oracle.com
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: James Houghton <jthoughton@google.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mina Almasry <almasrymina@google.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Naoya Horiguchi <naoya.horiguchi@linux.dev>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Prakash Sangappa <prakash.sangappa@oracle.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2022-09-15 01:18:02 +03:00
mutex_unlock ( & hugetlb_fault_mutex_table [ hash ] ) ;
2005-04-17 02:20:36 +04:00
}
2022-06-04 23:39:04 +03:00
folio_batch_release ( & fbatch ) ;
2015-11-21 02:57:13 +03:00
cond_resched ( ) ;
2005-04-17 02:20:36 +04:00
}
2015-09-09 01:01:41 +03:00
if ( truncate_op )
2023-09-26 22:20:17 +03:00
( void ) hugetlb_unreserve_pages ( inode ,
lstart > > huge_page_shift ( h ) ,
LONG_MAX , freed ) ;
2005-04-17 02:20:36 +04:00
}
2010-06-05 03:52:12 +04:00
static void hugetlbfs_evict_inode ( struct inode * inode )
2005-04-17 02:20:36 +04:00
{
mm, hugetlb: unify region structure handling
Currently, to track reserved and allocated regions, we use two different
ways, depending on the mapping. For MAP_SHARED, we use
address_mapping's private_list and, while for MAP_PRIVATE, we use a
resv_map.
Now, we are preparing to change a coarse grained lock which protect a
region structure to fine grained lock, and this difference hinder it.
So, before changing it, unify region structure handling, consistently
using a resv_map regardless of the kind of mapping.
Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Davidlohr Bueso <davidlohr@hp.com>
Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-04-04 01:47:25 +04:00
struct resv_map * resv_map ;
2015-09-09 01:01:41 +03:00
remove_inode_hugepages ( inode , 0 , LLONG_MAX ) ;
2019-05-14 03:22:55 +03:00
/*
* Get the resv_map from the address space embedded in the inode .
* This is the address space which points to any resv_map allocated
* at inode creation time . If this is a device special inode ,
* i_mapping may not point to the original address space .
*/
resv_map = ( struct resv_map * ) ( & inode - > i_data ) - > private_data ;
/* Only regular and link inodes have associated reserve maps */
mm, hugetlb: unify region structure handling
Currently, to track reserved and allocated regions, we use two different
ways, depending on the mapping. For MAP_SHARED, we use
address_mapping's private_list and, while for MAP_PRIVATE, we use a
resv_map.
Now, we are preparing to change a coarse grained lock which protect a
region structure to fine grained lock, and this difference hinder it.
So, before changing it, unify region structure handling, consistently
using a resv_map regardless of the kind of mapping.
Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Davidlohr Bueso <davidlohr@hp.com>
Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-04-04 01:47:25 +04:00
if ( resv_map )
resv_map_release ( & resv_map - > refs ) ;
2012-05-03 16:48:02 +04:00
clear_inode ( inode ) ;
2005-10-30 04:16:43 +03:00
}
2021-02-24 23:10:25 +03:00
static void hugetlb_vmtruncate ( struct inode * inode , loff_t offset )
2005-04-17 02:20:36 +04:00
{
2006-10-28 21:38:43 +04:00
pgoff_t pgoff ;
2005-04-17 02:20:36 +04:00
struct address_space * mapping = inode - > i_mapping ;
2008-07-24 08:27:41 +04:00
struct hstate * h = hstate_inode ( inode ) ;
2005-04-17 02:20:36 +04:00
2008-07-24 08:27:41 +04:00
BUG_ON ( offset & ~ huge_page_mask ( h ) ) ;
2006-10-28 21:38:43 +04:00
pgoff = offset > > PAGE_SHIFT ;
2005-04-17 02:20:36 +04:00
2020-04-02 07:11:08 +03:00
i_size_write ( inode , offset ) ;
hugetlbfs: revert use i_mmap_rwsem to address page fault/truncate race
Patch series "hugetlb: Use new vma lock for huge pmd sharing
synchronization", v2.
hugetlb fault scalability regressions have recently been reported [1].
This is not the first such report, as regressions were also noted when
commit c0d0381ade79 ("hugetlbfs: use i_mmap_rwsem for more pmd sharing
synchronization") was added [2] in v5.7. At that time, a proposal to
address the regression was suggested [3] but went nowhere.
The regression and benefit of this patch series is not evident when
using the vm_scalability benchmark reported in [2] on a recent kernel.
Results from running,
"./usemem -n 48 --prealloc --prefault -O -U 3448054972"
48 sample Avg
next-20220913 next-20220913 next-20220913
unmodified revert i_mmap_sema locking vma sema locking, this series
-----------------------------------------------------------------------------
498150 KB/s 501934 KB/s 504793 KB/s
The recent regression report [1] notes page fault and fork latency of
shared hugetlb mappings. To measure this, I created two simple programs:
1) map a shared hugetlb area, write fault all pages, unmap area
Do this in a continuous loop to measure faults per second
2) map a shared hugetlb area, write fault a few pages, fork and exit
Do this in a continuous loop to measure forks per second
These programs were run on a 48 CPU VM with 320GB memory. The shared
mapping size was 250GB. For comparison, a single instance of the program
was run. Then, multiple instances were run in parallel to introduce
lock contention. Changing the locking scheme results in a significant
performance benefit.
test instances unmodified revert vma
--------------------------------------------------------------------------
faults per sec 1 393043 395680 389932
faults per sec 24 71405 81191 79048
forks per sec 1 2802 2747 2725
forks per sec 24 439 536 500
Combined faults 24 1621 68070 53662
Combined forks 24 358 67 142
Combined test is when running both faulting program and forking program
simultaneously.
Patches 1 and 2 of this series revert c0d0381ade79 and 87bf91d39bb5 which
depends on c0d0381ade79. Acquisition of i_mmap_rwsem is still required in
the fault path to establish pmd sharing, so this is moved back to
huge_pmd_share. With c0d0381ade79 reverted, this race is exposed:
Faulting thread Unsharing thread
... ...
ptep = huge_pte_offset()
or
ptep = huge_pte_alloc()
...
i_mmap_lock_write
lock page table
ptep invalid <------------------------ huge_pmd_unshare()
Could be in a previously unlock_page_table
sharing process or worse i_mmap_unlock_write
...
ptl = huge_pte_lock(ptep)
get/update pte
set_pte_at(pte, ptep)
Reverting 87bf91d39bb5 exposes races in page fault/file truncation. When
the new vma lock is put to use in patch 8, this will handle the fault/file
truncation races. This is explained in patch 9 where code associated with
these races is cleaned up.
Patches 3 - 5 restructure existing code in preparation for using the new
vma lock (rw semaphore) for pmd sharing synchronization. The idea is that
this semaphore will be held in read mode for the duration of fault
processing, and held in write mode for unmap operations which may call
huge_pmd_unshare. Acquiring i_mmap_rwsem is also still required to
synchronize huge pmd sharing. However it is only required in the fault
path when setting up sharing, and will be acquired in huge_pmd_share().
Patch 6 adds the new vma lock and all supporting routines, but does not
actually change code to use the new lock.
Patch 7 refactors code in preparation for using the new lock. And, patch
8 finally adds code to make use of this new vma lock. Unfortunately, the
fault code and truncate/hole punch code would naturally take locks in the
opposite order which could lead to deadlock. Since the performance of
page faults is more important, the truncation/hole punch code is modified
to back out and take locks in the correct order if necessary.
[1] https://lore.kernel.org/linux-mm/43faf292-245b-5db5-cce9-369d8fb6bd21@infradead.org/
[2] https://lore.kernel.org/lkml/20200622005551.GK5535@shao2-debian/
[3] https://lore.kernel.org/linux-mm/20200706202615.32111-1-mike.kravetz@oracle.com/
This patch (of 9):
Commit c0d0381ade79 ("hugetlbfs: use i_mmap_rwsem for more pmd sharing
synchronization") added code to take i_mmap_rwsem in read mode for the
duration of fault processing. The use of i_mmap_rwsem to prevent
fault/truncate races depends on this. However, this has been shown to
cause performance/scaling issues. As a result, that code will be
reverted. Since the use i_mmap_rwsem to address page fault/truncate races
depends on this, it must also be reverted.
In a subsequent patch, code will be added to detect the fault/truncate
race and back out operations as required.
Link: https://lkml.kernel.org/r/20220914221810.95771-1-mike.kravetz@oracle.com
Link: https://lkml.kernel.org/r/20220914221810.95771-2-mike.kravetz@oracle.com
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: James Houghton <jthoughton@google.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mina Almasry <almasrymina@google.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Naoya Horiguchi <naoya.horiguchi@linux.dev>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Prakash Sangappa <prakash.sangappa@oracle.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2022-09-15 01:18:02 +03:00
i_mmap_lock_write ( mapping ) ;
2017-09-09 02:15:08 +03:00
if ( ! RB_EMPTY_ROOT ( & mapping - > i_mmap . rb_root ) )
2022-05-13 06:22:55 +03:00
hugetlb_vmdelete_list ( & mapping - > i_mmap , pgoff , 0 ,
ZAP_FLAG_DROP_MARKER ) ;
2018-12-28 11:39:42 +03:00
i_mmap_unlock_write ( mapping ) ;
2019-01-09 02:23:32 +03:00
remove_inode_hugepages ( inode , offset , LLONG_MAX ) ;
2005-04-17 02:20:36 +04:00
}
2022-06-13 23:36:48 +03:00
static void hugetlbfs_zero_partial_page ( struct hstate * h ,
struct address_space * mapping ,
loff_t start ,
loff_t end )
{
pgoff_t idx = start > > huge_page_shift ( h ) ;
struct folio * folio ;
2023-09-26 22:20:17 +03:00
folio = filemap_lock_hugetlb_folio ( h , mapping , idx ) ;
2023-03-07 17:34:10 +03:00
if ( IS_ERR ( folio ) )
2022-06-13 23:36:48 +03:00
return ;
start = start & ~ huge_page_mask ( h ) ;
end = end & ~ huge_page_mask ( h ) ;
if ( ! end )
end = huge_page_size ( h ) ;
folio_zero_segment ( folio , ( size_t ) start , ( size_t ) end ) ;
folio_unlock ( folio ) ;
folio_put ( folio ) ;
}
2015-09-09 01:01:54 +03:00
static long hugetlbfs_punch_hole ( struct inode * inode , loff_t offset , loff_t len )
{
2022-06-13 23:36:48 +03:00
struct hugetlbfs_inode_info * info = HUGETLBFS_I ( inode ) ;
struct address_space * mapping = inode - > i_mapping ;
2015-09-09 01:01:54 +03:00
struct hstate * h = hstate_inode ( inode ) ;
loff_t hpage_size = huge_page_size ( h ) ;
loff_t hole_start , hole_end ;
/*
2022-06-13 23:36:48 +03:00
* hole_start and hole_end indicate the full pages within the hole .
2015-09-09 01:01:54 +03:00
*/
hole_start = round_up ( offset , hpage_size ) ;
hole_end = round_down ( offset + len , hpage_size ) ;
2022-06-13 23:36:48 +03:00
inode_lock ( inode ) ;
/* protected by i_rwsem */
if ( info - > seals & ( F_SEAL_WRITE | F_SEAL_FUTURE_WRITE ) ) {
inode_unlock ( inode ) ;
return - EPERM ;
}
2015-09-09 01:01:54 +03:00
2022-06-13 23:36:48 +03:00
i_mmap_lock_write ( mapping ) ;
2018-02-01 03:19:25 +03:00
2022-06-13 23:36:48 +03:00
/* If range starts before first full page, zero partial page. */
if ( offset < hole_start )
hugetlbfs_zero_partial_page ( h , mapping ,
offset , min ( offset + len , hole_start ) ) ;
2018-02-01 03:19:25 +03:00
2022-06-13 23:36:48 +03:00
/* Unmap users of full pages in the hole. */
if ( hole_end > hole_start ) {
2017-09-09 02:15:08 +03:00
if ( ! RB_EMPTY_ROOT ( & mapping - > i_mmap . rb_root ) )
2015-09-09 01:01:54 +03:00
hugetlb_vmdelete_list ( & mapping - > i_mmap ,
2022-05-13 06:22:55 +03:00
hole_start > > PAGE_SHIFT ,
hole_end > > PAGE_SHIFT , 0 ) ;
2015-09-09 01:01:54 +03:00
}
2022-06-13 23:36:48 +03:00
/* If range extends beyond last full page, zero partial page. */
if ( ( offset + len ) > hole_end & & ( offset + len ) > hole_start )
hugetlbfs_zero_partial_page ( h , mapping ,
hole_end , offset + len ) ;
i_mmap_unlock_write ( mapping ) ;
/* Remove full pages from the file. */
if ( hole_end > hole_start )
remove_inode_hugepages ( inode , hole_start , hole_end ) ;
inode_unlock ( inode ) ;
2015-09-09 01:01:54 +03:00
return 0 ;
}
static long hugetlbfs_fallocate ( struct file * file , int mode , loff_t offset ,
loff_t len )
{
struct inode * inode = file_inode ( file ) ;
2018-02-01 03:19:25 +03:00
struct hugetlbfs_inode_info * info = HUGETLBFS_I ( inode ) ;
2015-09-09 01:01:54 +03:00
struct address_space * mapping = inode - > i_mapping ;
struct hstate * h = hstate_inode ( inode ) ;
struct vm_area_struct pseudo_vma ;
struct mm_struct * mm = current - > mm ;
loff_t hpage_size = huge_page_size ( h ) ;
unsigned long hpage_shift = huge_page_shift ( h ) ;
pgoff_t start , index , end ;
int error ;
u32 hash ;
if ( mode & ~ ( FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE ) )
return - EOPNOTSUPP ;
if ( mode & FALLOC_FL_PUNCH_HOLE )
return hugetlbfs_punch_hole ( inode , offset , len ) ;
/*
* Default preallocate case .
* For this range , start is rounded down and end is rounded up
* as well as being converted to page offsets .
*/
start = offset > > hpage_shift ;
end = ( offset + len + hpage_size - 1 ) > > hpage_shift ;
2016-01-22 23:40:57 +03:00
inode_lock ( inode ) ;
2015-09-09 01:01:54 +03:00
/* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
error = inode_newsize_ok ( inode , offset + len ) ;
if ( error )
goto out ;
2018-02-01 03:19:25 +03:00
if ( ( info - > seals & F_SEAL_GROW ) & & offset + len > inode - > i_size ) {
error = - EPERM ;
goto out ;
}
2015-09-09 01:01:54 +03:00
/*
* Initialize a pseudo vma as this is required by the huge page
hugetlbfs: drop shared NUMA mempolicy pretence
Patch series "mempolicy: cleanups leading to NUMA mpol without vma", v2.
Mostly cleanups in mm/mempolicy.c, but finally removing the pseudo-vma
from shmem folio allocation, and removing the mmap_lock around folio
migration for mbind and migrate_pages syscalls.
This patch (of 12):
hugetlbfs_fallocate() goes through the motions of pasting a shared NUMA
mempolicy onto its pseudo-vma, but how could there ever be a shared NUMA
mempolicy for this file? hugetlb_vm_ops has never offered a set_policy
method, and hugetlbfs_parse_param() has never supported any mpol options
for a mount-wide default policy.
It's just an illusion: clean it away so as not to confuse others, giving
us more freedom to adjust shmem's set_policy/get_policy implementation.
But hugetlbfs_inode_info is still required, just to accommodate seals.
Yes, shared NUMA mempolicy support could be added to hugetlbfs, with a
set_policy method and/or mpol mount option (Andi's first posting did
include an admitted-unsatisfactory hugetlb_set_policy()); but it seems
that nobody has bothered to add that in the nineteen years since v2.6.7
made it possible, and there is at least one company that has invested
enough into hugetlbfs, that I guess they have learnt well enough how to
manage its NUMA, without needing shared mempolicy.
Remove linux/mempolicy.h from linux/hugetlb.h: include linux/pagemap.h in
its place, because hugetlb.h's recently added use of filemap_lock_folio()
requires that (although most .configs and .c's get it in some other way).
Link: https://lkml.kernel.org/r/ebc0987e-beff-8bfb-9283-234c2cbd17c5@google.com
Link: https://lkml.kernel.org/r/cae82d4b-904a-faaf-282a-34fcc188c81f@google.com
Signed-off-by: Hugh Dickins <hughd@google.com>
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Tejun heo <tj@kernel.org>
Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 12:15:09 +03:00
* allocation routines .
2015-09-09 01:01:54 +03:00
*/
2018-07-27 02:37:30 +03:00
vma_init ( & pseudo_vma , mm ) ;
2023-01-26 22:37:49 +03:00
vm_flags_init ( & pseudo_vma , VM_HUGETLB | VM_MAYSHARE | VM_SHARED ) ;
2015-09-09 01:01:54 +03:00
pseudo_vma . vm_file = file ;
for ( index = start ; index < end ; index + + ) {
/*
* This is supposed to be the vaddr where the page is being
* faulted in , but we have no vaddr here .
*/
2023-01-25 20:05:33 +03:00
struct folio * folio ;
2015-09-09 01:01:54 +03:00
unsigned long addr ;
cond_resched ( ) ;
/*
* fallocate ( 2 ) manpage permits EINTR ; we may have been
* interrupted because we are using up too much memory .
*/
if ( signal_pending ( current ) ) {
error = - EINTR ;
break ;
}
/* addr is the offset within the file (zero based) */
addr = index * hpage_size ;
hugetlbfs: revert use i_mmap_rwsem to address page fault/truncate race
Patch series "hugetlb: Use new vma lock for huge pmd sharing
synchronization", v2.
hugetlb fault scalability regressions have recently been reported [1].
This is not the first such report, as regressions were also noted when
commit c0d0381ade79 ("hugetlbfs: use i_mmap_rwsem for more pmd sharing
synchronization") was added [2] in v5.7. At that time, a proposal to
address the regression was suggested [3] but went nowhere.
The regression and benefit of this patch series is not evident when
using the vm_scalability benchmark reported in [2] on a recent kernel.
Results from running,
"./usemem -n 48 --prealloc --prefault -O -U 3448054972"
48 sample Avg
next-20220913 next-20220913 next-20220913
unmodified revert i_mmap_sema locking vma sema locking, this series
-----------------------------------------------------------------------------
498150 KB/s 501934 KB/s 504793 KB/s
The recent regression report [1] notes page fault and fork latency of
shared hugetlb mappings. To measure this, I created two simple programs:
1) map a shared hugetlb area, write fault all pages, unmap area
Do this in a continuous loop to measure faults per second
2) map a shared hugetlb area, write fault a few pages, fork and exit
Do this in a continuous loop to measure forks per second
These programs were run on a 48 CPU VM with 320GB memory. The shared
mapping size was 250GB. For comparison, a single instance of the program
was run. Then, multiple instances were run in parallel to introduce
lock contention. Changing the locking scheme results in a significant
performance benefit.
test instances unmodified revert vma
--------------------------------------------------------------------------
faults per sec 1 393043 395680 389932
faults per sec 24 71405 81191 79048
forks per sec 1 2802 2747 2725
forks per sec 24 439 536 500
Combined faults 24 1621 68070 53662
Combined forks 24 358 67 142
Combined test is when running both faulting program and forking program
simultaneously.
Patches 1 and 2 of this series revert c0d0381ade79 and 87bf91d39bb5 which
depends on c0d0381ade79. Acquisition of i_mmap_rwsem is still required in
the fault path to establish pmd sharing, so this is moved back to
huge_pmd_share. With c0d0381ade79 reverted, this race is exposed:
Faulting thread Unsharing thread
... ...
ptep = huge_pte_offset()
or
ptep = huge_pte_alloc()
...
i_mmap_lock_write
lock page table
ptep invalid <------------------------ huge_pmd_unshare()
Could be in a previously unlock_page_table
sharing process or worse i_mmap_unlock_write
...
ptl = huge_pte_lock(ptep)
get/update pte
set_pte_at(pte, ptep)
Reverting 87bf91d39bb5 exposes races in page fault/file truncation. When
the new vma lock is put to use in patch 8, this will handle the fault/file
truncation races. This is explained in patch 9 where code associated with
these races is cleaned up.
Patches 3 - 5 restructure existing code in preparation for using the new
vma lock (rw semaphore) for pmd sharing synchronization. The idea is that
this semaphore will be held in read mode for the duration of fault
processing, and held in write mode for unmap operations which may call
huge_pmd_unshare. Acquiring i_mmap_rwsem is also still required to
synchronize huge pmd sharing. However it is only required in the fault
path when setting up sharing, and will be acquired in huge_pmd_share().
Patch 6 adds the new vma lock and all supporting routines, but does not
actually change code to use the new lock.
Patch 7 refactors code in preparation for using the new lock. And, patch
8 finally adds code to make use of this new vma lock. Unfortunately, the
fault code and truncate/hole punch code would naturally take locks in the
opposite order which could lead to deadlock. Since the performance of
page faults is more important, the truncation/hole punch code is modified
to back out and take locks in the correct order if necessary.
[1] https://lore.kernel.org/linux-mm/43faf292-245b-5db5-cce9-369d8fb6bd21@infradead.org/
[2] https://lore.kernel.org/lkml/20200622005551.GK5535@shao2-debian/
[3] https://lore.kernel.org/linux-mm/20200706202615.32111-1-mike.kravetz@oracle.com/
This patch (of 9):
Commit c0d0381ade79 ("hugetlbfs: use i_mmap_rwsem for more pmd sharing
synchronization") added code to take i_mmap_rwsem in read mode for the
duration of fault processing. The use of i_mmap_rwsem to prevent
fault/truncate races depends on this. However, this has been shown to
cause performance/scaling issues. As a result, that code will be
reverted. Since the use i_mmap_rwsem to address page fault/truncate races
depends on this, it must also be reverted.
In a subsequent patch, code will be added to detect the fault/truncate
race and back out operations as required.
Link: https://lkml.kernel.org/r/20220914221810.95771-1-mike.kravetz@oracle.com
Link: https://lkml.kernel.org/r/20220914221810.95771-2-mike.kravetz@oracle.com
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: James Houghton <jthoughton@google.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mina Almasry <almasrymina@google.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Naoya Horiguchi <naoya.horiguchi@linux.dev>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Prakash Sangappa <prakash.sangappa@oracle.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2022-09-15 01:18:02 +03:00
/* mutex taken here, fault path and hole punch */
2019-12-01 04:57:02 +03:00
hash = hugetlb_fault_mutex_hash ( mapping , index ) ;
2015-09-09 01:01:54 +03:00
mutex_lock ( & hugetlb_fault_mutex_table [ hash ] ) ;
/* See if already present in mapping to avoid alloc/free */
2023-09-26 22:20:17 +03:00
folio = filemap_get_folio ( mapping , index < < huge_page_order ( h ) ) ;
2023-06-22 00:24:03 +03:00
if ( ! IS_ERR ( folio ) ) {
folio_put ( folio ) ;
2015-09-09 01:01:54 +03:00
mutex_unlock ( & hugetlb_fault_mutex_table [ hash ] ) ;
continue ;
}
2021-02-24 23:10:11 +03:00
/*
2023-01-25 20:05:33 +03:00
* Allocate folio without setting the avoid_reserve argument .
2021-02-24 23:10:11 +03:00
* There certainly are no reserves associated with the
* pseudo_vma . However , there could be shared mappings with
* reserves for the file at the inode level . If we fallocate
2023-01-25 20:05:33 +03:00
* folios in these areas , we need to consume the reserves
2021-02-24 23:10:11 +03:00
* to keep reservation accounting consistent .
*/
2023-01-25 20:05:33 +03:00
folio = alloc_hugetlb_folio ( & pseudo_vma , addr , 0 ) ;
if ( IS_ERR ( folio ) ) {
2015-09-09 01:01:54 +03:00
mutex_unlock ( & hugetlb_fault_mutex_table [ hash ] ) ;
2023-01-25 20:05:33 +03:00
error = PTR_ERR ( folio ) ;
2015-09-09 01:01:54 +03:00
goto out ;
}
2023-01-25 20:05:33 +03:00
clear_huge_page ( & folio - > page , addr , pages_per_huge_page ( h ) ) ;
__folio_mark_uptodate ( folio ) ;
2023-01-25 20:05:35 +03:00
error = hugetlb_add_to_page_cache ( folio , mapping , index ) ;
2015-09-09 01:01:54 +03:00
if ( unlikely ( error ) ) {
2023-01-25 20:05:34 +03:00
restore_reserve_on_error ( h , & pseudo_vma , addr , folio ) ;
2023-01-25 20:05:33 +03:00
folio_put ( folio ) ;
2015-09-09 01:01:54 +03:00
mutex_unlock ( & hugetlb_fault_mutex_table [ hash ] ) ;
goto out ;
}
mutex_unlock ( & hugetlb_fault_mutex_table [ hash ] ) ;
2023-01-25 20:05:33 +03:00
folio_set_hugetlb_migratable ( folio ) ;
2015-09-09 01:01:54 +03:00
/*
2023-01-25 20:05:33 +03:00
* folio_unlock because locked by hugetlb_add_to_page_cache ( )
* folio_put ( ) due to reference from alloc_hugetlb_folio ( )
2015-09-09 01:01:54 +03:00
*/
2023-01-25 20:05:33 +03:00
folio_unlock ( folio ) ;
folio_put ( folio ) ;
2015-09-09 01:01:54 +03:00
}
if ( ! ( mode & FALLOC_FL_KEEP_SIZE ) & & offset + len > inode - > i_size )
i_size_write ( inode , offset + len ) ;
2023-07-05 22:01:17 +03:00
inode_set_ctime_current ( inode ) ;
2015-09-09 01:01:54 +03:00
out :
2016-01-22 23:40:57 +03:00
inode_unlock ( inode ) ;
2015-09-09 01:01:54 +03:00
return error ;
}
2023-01-13 14:49:11 +03:00
static int hugetlbfs_setattr ( struct mnt_idmap * idmap ,
2021-01-21 16:19:43 +03:00
struct dentry * dentry , struct iattr * attr )
2005-04-17 02:20:36 +04:00
{
2015-03-18 01:25:59 +03:00
struct inode * inode = d_inode ( dentry ) ;
2008-07-24 08:27:41 +04:00
struct hstate * h = hstate_inode ( inode ) ;
2005-04-17 02:20:36 +04:00
int error ;
unsigned int ia_valid = attr - > ia_valid ;
2018-02-01 03:19:25 +03:00
struct hugetlbfs_inode_info * info = HUGETLBFS_I ( inode ) ;
2005-04-17 02:20:36 +04:00
2023-01-13 14:49:11 +03:00
error = setattr_prepare ( & nop_mnt_idmap , dentry , attr ) ;
2005-04-17 02:20:36 +04:00
if ( error )
2010-06-04 13:30:02 +04:00
return error ;
2005-04-17 02:20:36 +04:00
if ( ia_valid & ATTR_SIZE ) {
2018-02-01 03:19:25 +03:00
loff_t oldsize = inode - > i_size ;
loff_t newsize = attr - > ia_size ;
if ( newsize & ~ huge_page_mask ( h ) )
2010-06-04 13:30:02 +04:00
return - EINVAL ;
2021-02-24 23:10:18 +03:00
/* protected by i_rwsem */
2018-02-01 03:19:25 +03:00
if ( ( newsize < oldsize & & ( info - > seals & F_SEAL_SHRINK ) ) | |
( newsize > oldsize & & ( info - > seals & F_SEAL_GROW ) ) )
return - EPERM ;
2021-02-24 23:10:25 +03:00
hugetlb_vmtruncate ( inode , newsize ) ;
2005-04-17 02:20:36 +04:00
}
2010-06-04 13:30:02 +04:00
2023-01-13 14:49:11 +03:00
setattr_copy ( & nop_mnt_idmap , inode , attr ) ;
2010-06-04 13:30:02 +04:00
mark_inode_dirty ( inode ) ;
return 0 ;
2005-04-17 02:20:36 +04:00
}
2011-07-25 04:20:48 +04:00
static struct inode * hugetlbfs_get_root ( struct super_block * sb ,
2018-11-02 02:07:26 +03:00
struct hugetlbfs_fs_context * ctx )
2005-04-17 02:20:36 +04:00
{
struct inode * inode ;
inode = new_inode ( sb ) ;
if ( inode ) {
2010-10-23 19:19:54 +04:00
inode - > i_ino = get_next_ino ( ) ;
2018-11-02 02:07:26 +03:00
inode - > i_mode = S_IFDIR | ctx - > mode ;
inode - > i_uid = ctx - > uid ;
inode - > i_gid = ctx - > gid ;
2023-10-04 21:52:30 +03:00
simple_inode_init_ts ( inode ) ;
2011-07-25 04:20:48 +04:00
inode - > i_op = & hugetlbfs_dir_inode_operations ;
inode - > i_fop = & simple_dir_operations ;
/* directory inodes start off with i_nlink == 2 (for "." entry) */
inc_nlink ( inode ) ;
2012-04-26 03:01:50 +04:00
lockdep_annotate_inode_mutex_key ( inode ) ;
2011-07-25 04:20:48 +04:00
}
return inode ;
}
2013-08-14 03:00:55 +04:00
/*
2014-12-13 03:54:24 +03:00
* Hugetlbfs is not reclaimable ; therefore its i_mmap_rwsem will never
2013-08-14 03:00:55 +04:00
* be taken from reclaim - - unlike regular filesystems . This needs an
2016-01-16 03:57:31 +03:00
* annotation because huge_pmd_share ( ) does an allocation under hugetlb ' s
2014-12-13 03:54:24 +03:00
* i_mmap_rwsem .
2013-08-14 03:00:55 +04:00
*/
2014-12-13 03:54:24 +03:00
static struct lock_class_key hugetlbfs_i_mmap_rwsem_key ;
2013-08-14 03:00:55 +04:00
2011-07-25 04:20:48 +04:00
static struct inode * hugetlbfs_get_inode ( struct super_block * sb ,
struct inode * dir ,
2011-07-25 07:17:40 +04:00
umode_t mode , dev_t dev )
2011-07-25 04:20:48 +04:00
{
struct inode * inode ;
2019-04-06 04:39:06 +03:00
struct resv_map * resv_map = NULL ;
mm, hugetlb: unify region structure handling
Currently, to track reserved and allocated regions, we use two different
ways, depending on the mapping. For MAP_SHARED, we use
address_mapping's private_list and, while for MAP_PRIVATE, we use a
resv_map.
Now, we are preparing to change a coarse grained lock which protect a
region structure to fine grained lock, and this difference hinder it.
So, before changing it, unify region structure handling, consistently
using a resv_map regardless of the kind of mapping.
Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Davidlohr Bueso <davidlohr@hp.com>
Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-04-04 01:47:25 +04:00
2019-04-06 04:39:06 +03:00
/*
* Reserve maps are only needed for inodes that can have associated
* page allocations .
*/
if ( S_ISREG ( mode ) | | S_ISLNK ( mode ) ) {
resv_map = resv_map_alloc ( ) ;
if ( ! resv_map )
return NULL ;
}
2011-07-25 04:20:48 +04:00
inode = new_inode ( sb ) ;
if ( inode ) {
2018-02-01 03:19:25 +03:00
struct hugetlbfs_inode_info * info = HUGETLBFS_I ( inode ) ;
2011-07-25 04:20:48 +04:00
inode - > i_ino = get_next_ino ( ) ;
2023-01-13 14:49:25 +03:00
inode_init_owner ( & nop_mnt_idmap , inode , dir , mode ) ;
2014-12-13 03:54:24 +03:00
lockdep_set_class ( & inode - > i_mapping - > i_mmap_rwsem ,
& hugetlbfs_i_mmap_rwsem_key ) ;
2005-04-17 02:20:36 +04:00
inode - > i_mapping - > a_ops = & hugetlbfs_aops ;
2023-10-04 21:52:30 +03:00
simple_inode_init_ts ( inode ) ;
mm, hugetlb: unify region structure handling
Currently, to track reserved and allocated regions, we use two different
ways, depending on the mapping. For MAP_SHARED, we use
address_mapping's private_list and, while for MAP_PRIVATE, we use a
resv_map.
Now, we are preparing to change a coarse grained lock which protect a
region structure to fine grained lock, and this difference hinder it.
So, before changing it, unify region structure handling, consistently
using a resv_map regardless of the kind of mapping.
Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Davidlohr Bueso <davidlohr@hp.com>
Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-04-04 01:47:25 +04:00
inode - > i_mapping - > private_data = resv_map ;
2018-02-01 03:19:25 +03:00
info - > seals = F_SEAL_SEAL ;
2005-04-17 02:20:36 +04:00
switch ( mode & S_IFMT ) {
default :
init_special_inode ( inode , mode , dev ) ;
break ;
case S_IFREG :
inode - > i_op = & hugetlbfs_inode_operations ;
inode - > i_fop = & hugetlbfs_file_operations ;
break ;
case S_IFDIR :
inode - > i_op = & hugetlbfs_dir_inode_operations ;
inode - > i_fop = & simple_dir_operations ;
/* directory inodes start off with i_nlink == 2 (for "." entry) */
2006-10-01 10:29:04 +04:00
inc_nlink ( inode ) ;
2005-04-17 02:20:36 +04:00
break ;
case S_IFLNK :
inode - > i_op = & page_symlink_inode_operations ;
2015-11-17 09:07:57 +03:00
inode_nohighmem ( inode ) ;
2005-04-17 02:20:36 +04:00
break ;
}
lockdep: Add helper function for dir vs file i_mutex annotation
Purely in-memory filesystems do not use the inode hash as the dcache
tells us if an entry already exists. As a result, they do not call
unlock_new_inode, and thus directory inodes do not get put into a
different lockdep class for i_sem.
We need the different lockdep classes, because the locking order for
i_mutex is different for directory inodes and regular inodes. Directory
inodes can do "readdir()", which takes i_mutex *before* possibly taking
mm->mmap_sem (due to a page fault while copying the directory entry to
user space).
In contrast, regular inodes can be mmap'ed, which takes mm->mmap_sem
before accessing i_mutex.
The two cases can never happen for the same inode, so no real deadlock
can occur, but without the different lockdep classes, lockdep cannot
understand that. As a result, if CONFIG_DEBUG_LOCK_ALLOC is set, this
can lead to false positives from lockdep like below:
find/645 is trying to acquire lock:
(&mm->mmap_sem){++++++}, at: [<ffffffff81109514>] might_fault+0x5c/0xac
but task is already holding lock:
(&sb->s_type->i_mutex_key#15){+.+.+.}, at: [<ffffffff81149f34>]
vfs_readdir+0x5b/0xb4
which lock already depends on the new lock.
the existing dependency chain (in reverse order) is:
-> #1 (&sb->s_type->i_mutex_key#15){+.+.+.}:
[<ffffffff8108ac26>] lock_acquire+0xbf/0x103
[<ffffffff814db822>] __mutex_lock_common+0x4c/0x361
[<ffffffff814dbc46>] mutex_lock_nested+0x40/0x45
[<ffffffff811daa87>] hugetlbfs_file_mmap+0x82/0x110
[<ffffffff81111557>] mmap_region+0x258/0x432
[<ffffffff811119dd>] do_mmap_pgoff+0x2ac/0x306
[<ffffffff81111b4f>] sys_mmap_pgoff+0x118/0x16a
[<ffffffff8100c858>] sys_mmap+0x22/0x24
[<ffffffff814e3ec2>] system_call_fastpath+0x16/0x1b
-> #0 (&mm->mmap_sem){++++++}:
[<ffffffff8108a4bc>] __lock_acquire+0xa1a/0xcf7
[<ffffffff8108ac26>] lock_acquire+0xbf/0x103
[<ffffffff81109541>] might_fault+0x89/0xac
[<ffffffff81149cff>] filldir+0x6f/0xc7
[<ffffffff811586ea>] dcache_readdir+0x67/0x205
[<ffffffff81149f54>] vfs_readdir+0x7b/0xb4
[<ffffffff8114a073>] sys_getdents+0x7e/0xd1
[<ffffffff814e3ec2>] system_call_fastpath+0x16/0x1b
This patch moves the directory vs file lockdep annotation into a helper
function that can be called by in-memory filesystems and has hugetlbfs
call it.
Signed-off-by: Josh Boyer <jwboyer@redhat.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2011-08-25 15:48:12 +04:00
lockdep_annotate_inode_mutex_key ( inode ) ;
2019-04-06 04:39:06 +03:00
} else {
if ( resv_map )
kref_put ( & resv_map - > refs , resv_map_release ) ;
}
mm, hugetlb: unify region structure handling
Currently, to track reserved and allocated regions, we use two different
ways, depending on the mapping. For MAP_SHARED, we use
address_mapping's private_list and, while for MAP_PRIVATE, we use a
resv_map.
Now, we are preparing to change a coarse grained lock which protect a
region structure to fine grained lock, and this difference hinder it.
So, before changing it, unify region structure handling, consistently
using a resv_map regardless of the kind of mapping.
Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Davidlohr Bueso <davidlohr@hp.com>
Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-04-04 01:47:25 +04:00
2005-04-17 02:20:36 +04:00
return inode ;
}
/*
* File creation . Allocate an inode , and we ' re done . .
*/
2023-01-13 14:49:16 +03:00
static int hugetlbfs_mknod ( struct mnt_idmap * idmap , struct inode * dir ,
2022-09-24 07:59:59 +03:00
struct dentry * dentry , umode_t mode , dev_t dev )
2005-04-17 02:20:36 +04:00
{
struct inode * inode ;
2011-07-25 04:20:48 +04:00
inode = hugetlbfs_get_inode ( dir - > i_sb , dir , mode , dev ) ;
2022-09-24 07:59:59 +03:00
if ( ! inode )
return - ENOSPC ;
2023-10-04 21:52:30 +03:00
inode_set_mtime_to_ts ( dir , inode_set_ctime_current ( dir ) ) ;
2022-09-24 07:59:59 +03:00
d_instantiate ( dentry , inode ) ;
dget ( dentry ) ; /* Extra count - pin the dentry in core */
return 0 ;
2019-12-01 04:56:43 +03:00
}
2023-01-13 14:49:15 +03:00
static int hugetlbfs_mkdir ( struct mnt_idmap * idmap , struct inode * dir ,
2021-01-21 16:19:43 +03:00
struct dentry * dentry , umode_t mode )
2005-04-17 02:20:36 +04:00
{
2023-01-13 14:49:16 +03:00
int retval = hugetlbfs_mknod ( & nop_mnt_idmap , dir , dentry ,
2021-01-21 16:19:43 +03:00
mode | S_IFDIR , 0 ) ;
2005-04-17 02:20:36 +04:00
if ( ! retval )
2006-10-01 10:29:04 +04:00
inc_nlink ( dir ) ;
2005-04-17 02:20:36 +04:00
return retval ;
}
2023-01-13 14:49:13 +03:00
static int hugetlbfs_create ( struct mnt_idmap * idmap ,
2021-01-21 16:19:43 +03:00
struct inode * dir , struct dentry * dentry ,
umode_t mode , bool excl )
2005-04-17 02:20:36 +04:00
{
2023-01-13 14:49:16 +03:00
return hugetlbfs_mknod ( & nop_mnt_idmap , dir , dentry , mode | S_IFREG , 0 ) ;
2005-04-17 02:20:36 +04:00
}
2023-01-13 14:49:18 +03:00
static int hugetlbfs_tmpfile ( struct mnt_idmap * idmap ,
2022-09-24 08:00:00 +03:00
struct inode * dir , struct file * file ,
2021-01-21 16:19:43 +03:00
umode_t mode )
2019-12-01 04:56:43 +03:00
{
2022-09-24 07:59:59 +03:00
struct inode * inode ;
inode = hugetlbfs_get_inode ( dir - > i_sb , dir , mode | S_IFREG , 0 ) ;
if ( ! inode )
return - ENOSPC ;
2023-10-04 21:52:30 +03:00
inode_set_mtime_to_ts ( dir , inode_set_ctime_current ( dir ) ) ;
2022-09-24 08:00:00 +03:00
d_tmpfile ( file , inode ) ;
return finish_open_simple ( file , 0 ) ;
2019-12-01 04:56:43 +03:00
}
2023-01-13 14:49:14 +03:00
static int hugetlbfs_symlink ( struct mnt_idmap * idmap ,
2021-01-21 16:19:43 +03:00
struct inode * dir , struct dentry * dentry ,
const char * symname )
2005-04-17 02:20:36 +04:00
{
struct inode * inode ;
int error = - ENOSPC ;
2011-07-25 04:20:48 +04:00
inode = hugetlbfs_get_inode ( dir - > i_sb , dir , S_IFLNK | S_IRWXUGO , 0 ) ;
2005-04-17 02:20:36 +04:00
if ( inode ) {
int l = strlen ( symname ) + 1 ;
error = page_symlink ( inode , symname , l ) ;
if ( ! error ) {
d_instantiate ( dentry , inode ) ;
dget ( dentry ) ;
} else
iput ( inode ) ;
}
2023-10-04 21:52:30 +03:00
inode_set_mtime_to_ts ( dir , inode_set_ctime_current ( dir ) ) ;
2005-04-17 02:20:36 +04:00
return error ;
}
2022-06-06 17:47:21 +03:00
# ifdef CONFIG_MIGRATION
static int hugetlbfs_migrate_folio ( struct address_space * mapping ,
struct folio * dst , struct folio * src ,
2012-01-13 05:19:43 +04:00
enum migrate_mode mode )
2010-09-08 05:19:35 +04:00
{
int rc ;
2022-06-06 17:47:21 +03:00
rc = migrate_huge_page_move_mapping ( mapping , dst , src ) ;
2012-12-12 04:02:31 +04:00
if ( rc ! = MIGRATEPAGE_SUCCESS )
2010-09-08 05:19:35 +04:00
return rc ;
hugetlbfs: fix races and page leaks during migration
hugetlb pages should only be migrated if they are 'active'. The
routines set/clear_page_huge_active() modify the active state of hugetlb
pages.
When a new hugetlb page is allocated at fault time, set_page_huge_active
is called before the page is locked. Therefore, another thread could
race and migrate the page while it is being added to page table by the
fault code. This race is somewhat hard to trigger, but can be seen by
strategically adding udelay to simulate worst case scheduling behavior.
Depending on 'how' the code races, various BUG()s could be triggered.
To address this issue, simply delay the set_page_huge_active call until
after the page is successfully added to the page table.
Hugetlb pages can also be leaked at migration time if the pages are
associated with a file in an explicitly mounted hugetlbfs filesystem.
For example, consider a two node system with 4GB worth of huge pages
available. A program mmaps a 2G file in a hugetlbfs filesystem. It
then migrates the pages associated with the file from one node to
another. When the program exits, huge page counts are as follows:
node0
1024 free_hugepages
1024 nr_hugepages
node1
0 free_hugepages
1024 nr_hugepages
Filesystem Size Used Avail Use% Mounted on
nodev 4.0G 2.0G 2.0G 50% /var/opt/hugepool
That is as expected. 2G of huge pages are taken from the free_hugepages
counts, and 2G is the size of the file in the explicitly mounted
filesystem. If the file is then removed, the counts become:
node0
1024 free_hugepages
1024 nr_hugepages
node1
1024 free_hugepages
1024 nr_hugepages
Filesystem Size Used Avail Use% Mounted on
nodev 4.0G 2.0G 2.0G 50% /var/opt/hugepool
Note that the filesystem still shows 2G of pages used, while there
actually are no huge pages in use. The only way to 'fix' the filesystem
accounting is to unmount the filesystem
If a hugetlb page is associated with an explicitly mounted filesystem,
this information in contained in the page_private field. At migration
time, this information is not preserved. To fix, simply transfer
page_private from old to new page at migration time if necessary.
There is a related race with removing a huge page from a file and
migration. When a huge page is removed from the pagecache, the
page_mapping() field is cleared, yet page_private remains set until the
page is actually freed by free_huge_page(). A page could be migrated
while in this state. However, since page_mapping() is not set the
hugetlbfs specific routine to transfer page_private is not called and we
leak the page count in the filesystem.
To fix that, check for this condition before migrating a huge page. If
the condition is detected, return EBUSY for the page.
Link: http://lkml.kernel.org/r/74510272-7319-7372-9ea6-ec914734c179@oracle.com
Link: http://lkml.kernel.org/r/20190212221400.3512-1-mike.kravetz@oracle.com
Fixes: bcc54222309c ("mm: hugetlb: introduce page_huge_active")
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: <stable@vger.kernel.org>
[mike.kravetz@oracle.com: v2]
Link: http://lkml.kernel.org/r/7534d322-d782-8ac6-1c8d-a8dc380eb3ab@oracle.com
[mike.kravetz@oracle.com: update comment and changelog]
Link: http://lkml.kernel.org/r/420bcfd6-158b-38e4-98da-26d0cd85bd01@oracle.com
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-03-01 03:22:02 +03:00
2022-09-22 18:42:05 +03:00
if ( hugetlb_folio_subpool ( src ) ) {
hugetlb_set_folio_subpool ( dst ,
hugetlb_folio_subpool ( src ) ) ;
hugetlb_set_folio_subpool ( src , NULL ) ;
hugetlbfs: fix races and page leaks during migration
hugetlb pages should only be migrated if they are 'active'. The
routines set/clear_page_huge_active() modify the active state of hugetlb
pages.
When a new hugetlb page is allocated at fault time, set_page_huge_active
is called before the page is locked. Therefore, another thread could
race and migrate the page while it is being added to page table by the
fault code. This race is somewhat hard to trigger, but can be seen by
strategically adding udelay to simulate worst case scheduling behavior.
Depending on 'how' the code races, various BUG()s could be triggered.
To address this issue, simply delay the set_page_huge_active call until
after the page is successfully added to the page table.
Hugetlb pages can also be leaked at migration time if the pages are
associated with a file in an explicitly mounted hugetlbfs filesystem.
For example, consider a two node system with 4GB worth of huge pages
available. A program mmaps a 2G file in a hugetlbfs filesystem. It
then migrates the pages associated with the file from one node to
another. When the program exits, huge page counts are as follows:
node0
1024 free_hugepages
1024 nr_hugepages
node1
0 free_hugepages
1024 nr_hugepages
Filesystem Size Used Avail Use% Mounted on
nodev 4.0G 2.0G 2.0G 50% /var/opt/hugepool
That is as expected. 2G of huge pages are taken from the free_hugepages
counts, and 2G is the size of the file in the explicitly mounted
filesystem. If the file is then removed, the counts become:
node0
1024 free_hugepages
1024 nr_hugepages
node1
1024 free_hugepages
1024 nr_hugepages
Filesystem Size Used Avail Use% Mounted on
nodev 4.0G 2.0G 2.0G 50% /var/opt/hugepool
Note that the filesystem still shows 2G of pages used, while there
actually are no huge pages in use. The only way to 'fix' the filesystem
accounting is to unmount the filesystem
If a hugetlb page is associated with an explicitly mounted filesystem,
this information in contained in the page_private field. At migration
time, this information is not preserved. To fix, simply transfer
page_private from old to new page at migration time if necessary.
There is a related race with removing a huge page from a file and
migration. When a huge page is removed from the pagecache, the
page_mapping() field is cleared, yet page_private remains set until the
page is actually freed by free_huge_page(). A page could be migrated
while in this state. However, since page_mapping() is not set the
hugetlbfs specific routine to transfer page_private is not called and we
leak the page count in the filesystem.
To fix that, check for this condition before migrating a huge page. If
the condition is detected, return EBUSY for the page.
Link: http://lkml.kernel.org/r/74510272-7319-7372-9ea6-ec914734c179@oracle.com
Link: http://lkml.kernel.org/r/20190212221400.3512-1-mike.kravetz@oracle.com
Fixes: bcc54222309c ("mm: hugetlb: introduce page_huge_active")
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: <stable@vger.kernel.org>
[mike.kravetz@oracle.com: v2]
Link: http://lkml.kernel.org/r/7534d322-d782-8ac6-1c8d-a8dc380eb3ab@oracle.com
[mike.kravetz@oracle.com: update comment and changelog]
Link: http://lkml.kernel.org/r/420bcfd6-158b-38e4-98da-26d0cd85bd01@oracle.com
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-03-01 03:22:02 +03:00
}
2017-09-09 02:12:06 +03:00
if ( mode ! = MIGRATE_SYNC_NO_COPY )
2022-06-06 17:47:21 +03:00
folio_migrate_copy ( dst , src ) ;
2017-09-09 02:12:06 +03:00
else
2022-06-06 17:47:21 +03:00
folio_migrate_flags ( dst , src ) ;
2010-09-08 05:19:35 +04:00
2012-12-12 04:02:31 +04:00
return MIGRATEPAGE_SUCCESS ;
2010-09-08 05:19:35 +04:00
}
2022-06-06 17:47:21 +03:00
# else
# define hugetlbfs_migrate_folio NULL
# endif
2010-09-08 05:19:35 +04:00
2017-07-11 01:47:50 +03:00
static int hugetlbfs_error_remove_page ( struct address_space * mapping ,
struct page * page )
{
return 0 ;
}
2017-07-05 18:24:18 +03:00
/*
* Display the mount options in / proc / mounts .
*/
static int hugetlbfs_show_options ( struct seq_file * m , struct dentry * root )
{
struct hugetlbfs_sb_info * sbinfo = HUGETLBFS_SB ( root - > d_sb ) ;
struct hugepage_subpool * spool = sbinfo - > spool ;
unsigned long hpage_size = huge_page_size ( sbinfo - > hstate ) ;
unsigned hpage_shift = huge_page_shift ( sbinfo - > hstate ) ;
char mod ;
if ( ! uid_eq ( sbinfo - > uid , GLOBAL_ROOT_UID ) )
seq_printf ( m , " ,uid=%u " ,
from_kuid_munged ( & init_user_ns , sbinfo - > uid ) ) ;
if ( ! gid_eq ( sbinfo - > gid , GLOBAL_ROOT_GID ) )
seq_printf ( m , " ,gid=%u " ,
from_kgid_munged ( & init_user_ns , sbinfo - > gid ) ) ;
if ( sbinfo - > mode ! = 0755 )
seq_printf ( m , " ,mode=%o " , sbinfo - > mode ) ;
if ( sbinfo - > max_inodes ! = - 1 )
seq_printf ( m , " ,nr_inodes=%lu " , sbinfo - > max_inodes ) ;
hpage_size / = 1024 ;
mod = ' K ' ;
if ( hpage_size > = 1024 ) {
hpage_size / = 1024 ;
mod = ' M ' ;
}
seq_printf ( m , " ,pagesize=%lu%c " , hpage_size , mod ) ;
if ( spool ) {
if ( spool - > max_hpages ! = - 1 )
seq_printf ( m , " ,size=%llu " ,
( unsigned long long ) spool - > max_hpages < < hpage_shift ) ;
if ( spool - > min_hpages ! = - 1 )
seq_printf ( m , " ,min_size=%llu " ,
( unsigned long long ) spool - > min_hpages < < hpage_shift ) ;
}
return 0 ;
}
2006-06-23 13:02:58 +04:00
static int hugetlbfs_statfs ( struct dentry * dentry , struct kstatfs * buf )
2005-04-17 02:20:36 +04:00
{
2006-06-23 13:02:58 +04:00
struct hugetlbfs_sb_info * sbinfo = HUGETLBFS_SB ( dentry - > d_sb ) ;
2015-03-18 01:25:59 +03:00
struct hstate * h = hstate_inode ( d_inode ( dentry ) ) ;
fs: report f_fsid from s_dev for "simple" filesystems
There are many "simple" filesystems (*) that report null f_fsid in
statfs(2). Those "simple" filesystems report sb->s_dev as the st_dev
field of the stat syscalls for all inodes of the filesystem (**).
In order to enable fanotify reporting of events with fsid on those
"simple" filesystems, report the sb->s_dev number in f_fsid field of
statfs(2).
(*) For most of the "simple" filesystem refered to in this commit, the
->statfs() operation is simple_statfs(). Some of those fs assign the
simple_statfs() method directly in their ->s_op struct and some assign it
indirectly via a call to simple_fill_super() or to pseudo_fs_fill_super()
with either custom or "simple" s_op.
We also make the same change to efivarfs and hugetlbfs, although they do
not use simple_statfs(), because they use the simple_* inode opreations
(e.g. simple_lookup()).
(**) For most of the "simple" filesystems, the ->getattr() method is not
assigned, so stat() is implemented by generic_fillattr(). A few "simple"
filesystem use the simple_getattr() method which also calls
generic_fillattr() to fill most of the stat struct.
The two exceptions are procfs and 9p. procfs implements several different
->getattr() methods, but they all end up calling generic_fillattr() to
fill the st_dev field from sb->s_dev.
9p has more complicated ->getattr() methods, but they too, end up calling
generic_fillattr() to fill the st_dev field from sb->s_dev.
Note that 9p and kernfs also call simple_statfs() from custom ->statfs()
methods which already fill the f_fsid field, but v9fs_statfs() calls
simple_statfs() only in case f_fsid was not filled and kenrfs_statfs()
overwrites f_fsid after calling simple_statfs().
Link: https://lore.kernel.org/r/20230919094820.g5bwharbmy2dq46w@quack3/
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Link: https://lore.kernel.org/r/20231023143049.2944970-1-amir73il@gmail.com
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2023-10-23 17:30:49 +03:00
u64 id = huge_encode_dev ( dentry - > d_sb - > s_dev ) ;
2005-04-17 02:20:36 +04:00
fs: report f_fsid from s_dev for "simple" filesystems
There are many "simple" filesystems (*) that report null f_fsid in
statfs(2). Those "simple" filesystems report sb->s_dev as the st_dev
field of the stat syscalls for all inodes of the filesystem (**).
In order to enable fanotify reporting of events with fsid on those
"simple" filesystems, report the sb->s_dev number in f_fsid field of
statfs(2).
(*) For most of the "simple" filesystem refered to in this commit, the
->statfs() operation is simple_statfs(). Some of those fs assign the
simple_statfs() method directly in their ->s_op struct and some assign it
indirectly via a call to simple_fill_super() or to pseudo_fs_fill_super()
with either custom or "simple" s_op.
We also make the same change to efivarfs and hugetlbfs, although they do
not use simple_statfs(), because they use the simple_* inode opreations
(e.g. simple_lookup()).
(**) For most of the "simple" filesystems, the ->getattr() method is not
assigned, so stat() is implemented by generic_fillattr(). A few "simple"
filesystem use the simple_getattr() method which also calls
generic_fillattr() to fill most of the stat struct.
The two exceptions are procfs and 9p. procfs implements several different
->getattr() methods, but they all end up calling generic_fillattr() to
fill the st_dev field from sb->s_dev.
9p has more complicated ->getattr() methods, but they too, end up calling
generic_fillattr() to fill the st_dev field from sb->s_dev.
Note that 9p and kernfs also call simple_statfs() from custom ->statfs()
methods which already fill the f_fsid field, but v9fs_statfs() calls
simple_statfs() only in case f_fsid was not filled and kenrfs_statfs()
overwrites f_fsid after calling simple_statfs().
Link: https://lore.kernel.org/r/20230919094820.g5bwharbmy2dq46w@quack3/
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Link: https://lore.kernel.org/r/20231023143049.2944970-1-amir73il@gmail.com
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2023-10-23 17:30:49 +03:00
buf - > f_fsid = u64_to_fsid ( id ) ;
2005-04-17 02:20:36 +04:00
buf - > f_type = HUGETLBFS_MAGIC ;
2008-07-24 08:27:41 +04:00
buf - > f_bsize = huge_page_size ( h ) ;
2005-04-17 02:20:36 +04:00
if ( sbinfo ) {
spin_lock ( & sbinfo - > stat_lock ) ;
2022-07-26 17:29:18 +03:00
/* If no limits set, just report 0 or -1 for max/free/used
2005-11-22 08:32:24 +03:00
* blocks , like simple_statfs ( ) */
hugepages: fix use after free bug in "quota" handling
hugetlbfs_{get,put}_quota() are badly named. They don't interact with the
general quota handling code, and they don't much resemble its behaviour.
Rather than being about maintaining limits on on-disk block usage by
particular users, they are instead about maintaining limits on in-memory
page usage (including anonymous MAP_PRIVATE copied-on-write pages)
associated with a particular hugetlbfs filesystem instance.
Worse, they work by having callbacks to the hugetlbfs filesystem code from
the low-level page handling code, in particular from free_huge_page().
This is a layering violation of itself, but more importantly, if the
kernel does a get_user_pages() on hugepages (which can happen from KVM
amongst others), then the free_huge_page() can be delayed until after the
associated inode has already been freed. If an unmount occurs at the
wrong time, even the hugetlbfs superblock where the "quota" limits are
stored may have been freed.
Andrew Barry proposed a patch to fix this by having hugepages, instead of
storing a pointer to their address_space and reaching the superblock from
there, had the hugepages store pointers directly to the superblock,
bumping the reference count as appropriate to avoid it being freed.
Andrew Morton rejected that version, however, on the grounds that it made
the existing layering violation worse.
This is a reworked version of Andrew's patch, which removes the extra, and
some of the existing, layering violation. It works by introducing the
concept of a hugepage "subpool" at the lower hugepage mm layer - that is a
finite logical pool of hugepages to allocate from. hugetlbfs now creates
a subpool for each filesystem instance with a page limit set, and a
pointer to the subpool gets added to each allocated hugepage, instead of
the address_space pointer used now. The subpool has its own lifetime and
is only freed once all pages in it _and_ all other references to it (i.e.
superblocks) are gone.
subpools are optional - a NULL subpool pointer is taken by the code to
mean that no subpool limits are in effect.
Previous discussion of this bug found in: "Fix refcounting in hugetlbfs
quota handling.". See: https://lkml.org/lkml/2011/8/11/28 or
http://marc.info/?l=linux-mm&m=126928970510627&w=1
v2: Fixed a bug spotted by Hillf Danton, and removed the extra parameter to
alloc_huge_page() - since it already takes the vma, it is not necessary.
Signed-off-by: Andrew Barry <abarry@cray.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Cc: Hugh Dickins <hughd@google.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Hillf Danton <dhillf@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-22 03:34:12 +04:00
if ( sbinfo - > spool ) {
long free_pages ;
2022-05-10 04:20:50 +03:00
spin_lock_irq ( & sbinfo - > spool - > lock ) ;
hugepages: fix use after free bug in "quota" handling
hugetlbfs_{get,put}_quota() are badly named. They don't interact with the
general quota handling code, and they don't much resemble its behaviour.
Rather than being about maintaining limits on on-disk block usage by
particular users, they are instead about maintaining limits on in-memory
page usage (including anonymous MAP_PRIVATE copied-on-write pages)
associated with a particular hugetlbfs filesystem instance.
Worse, they work by having callbacks to the hugetlbfs filesystem code from
the low-level page handling code, in particular from free_huge_page().
This is a layering violation of itself, but more importantly, if the
kernel does a get_user_pages() on hugepages (which can happen from KVM
amongst others), then the free_huge_page() can be delayed until after the
associated inode has already been freed. If an unmount occurs at the
wrong time, even the hugetlbfs superblock where the "quota" limits are
stored may have been freed.
Andrew Barry proposed a patch to fix this by having hugepages, instead of
storing a pointer to their address_space and reaching the superblock from
there, had the hugepages store pointers directly to the superblock,
bumping the reference count as appropriate to avoid it being freed.
Andrew Morton rejected that version, however, on the grounds that it made
the existing layering violation worse.
This is a reworked version of Andrew's patch, which removes the extra, and
some of the existing, layering violation. It works by introducing the
concept of a hugepage "subpool" at the lower hugepage mm layer - that is a
finite logical pool of hugepages to allocate from. hugetlbfs now creates
a subpool for each filesystem instance with a page limit set, and a
pointer to the subpool gets added to each allocated hugepage, instead of
the address_space pointer used now. The subpool has its own lifetime and
is only freed once all pages in it _and_ all other references to it (i.e.
superblocks) are gone.
subpools are optional - a NULL subpool pointer is taken by the code to
mean that no subpool limits are in effect.
Previous discussion of this bug found in: "Fix refcounting in hugetlbfs
quota handling.". See: https://lkml.org/lkml/2011/8/11/28 or
http://marc.info/?l=linux-mm&m=126928970510627&w=1
v2: Fixed a bug spotted by Hillf Danton, and removed the extra parameter to
alloc_huge_page() - since it already takes the vma, it is not necessary.
Signed-off-by: Andrew Barry <abarry@cray.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Cc: Hugh Dickins <hughd@google.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Hillf Danton <dhillf@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-22 03:34:12 +04:00
buf - > f_blocks = sbinfo - > spool - > max_hpages ;
free_pages = sbinfo - > spool - > max_hpages
- sbinfo - > spool - > used_hpages ;
buf - > f_bavail = buf - > f_bfree = free_pages ;
2022-05-10 04:20:50 +03:00
spin_unlock_irq ( & sbinfo - > spool - > lock ) ;
2005-11-22 08:32:24 +03:00
buf - > f_files = sbinfo - > max_inodes ;
buf - > f_ffree = sbinfo - > free_inodes ;
}
2005-04-17 02:20:36 +04:00
spin_unlock ( & sbinfo - > stat_lock ) ;
}
buf - > f_namelen = NAME_MAX ;
return 0 ;
}
static void hugetlbfs_put_super ( struct super_block * sb )
{
struct hugetlbfs_sb_info * sbi = HUGETLBFS_SB ( sb ) ;
if ( sbi ) {
sb - > s_fs_info = NULL ;
hugepages: fix use after free bug in "quota" handling
hugetlbfs_{get,put}_quota() are badly named. They don't interact with the
general quota handling code, and they don't much resemble its behaviour.
Rather than being about maintaining limits on on-disk block usage by
particular users, they are instead about maintaining limits on in-memory
page usage (including anonymous MAP_PRIVATE copied-on-write pages)
associated with a particular hugetlbfs filesystem instance.
Worse, they work by having callbacks to the hugetlbfs filesystem code from
the low-level page handling code, in particular from free_huge_page().
This is a layering violation of itself, but more importantly, if the
kernel does a get_user_pages() on hugepages (which can happen from KVM
amongst others), then the free_huge_page() can be delayed until after the
associated inode has already been freed. If an unmount occurs at the
wrong time, even the hugetlbfs superblock where the "quota" limits are
stored may have been freed.
Andrew Barry proposed a patch to fix this by having hugepages, instead of
storing a pointer to their address_space and reaching the superblock from
there, had the hugepages store pointers directly to the superblock,
bumping the reference count as appropriate to avoid it being freed.
Andrew Morton rejected that version, however, on the grounds that it made
the existing layering violation worse.
This is a reworked version of Andrew's patch, which removes the extra, and
some of the existing, layering violation. It works by introducing the
concept of a hugepage "subpool" at the lower hugepage mm layer - that is a
finite logical pool of hugepages to allocate from. hugetlbfs now creates
a subpool for each filesystem instance with a page limit set, and a
pointer to the subpool gets added to each allocated hugepage, instead of
the address_space pointer used now. The subpool has its own lifetime and
is only freed once all pages in it _and_ all other references to it (i.e.
superblocks) are gone.
subpools are optional - a NULL subpool pointer is taken by the code to
mean that no subpool limits are in effect.
Previous discussion of this bug found in: "Fix refcounting in hugetlbfs
quota handling.". See: https://lkml.org/lkml/2011/8/11/28 or
http://marc.info/?l=linux-mm&m=126928970510627&w=1
v2: Fixed a bug spotted by Hillf Danton, and removed the extra parameter to
alloc_huge_page() - since it already takes the vma, it is not necessary.
Signed-off-by: Andrew Barry <abarry@cray.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Cc: Hugh Dickins <hughd@google.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Hillf Danton <dhillf@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-22 03:34:12 +04:00
if ( sbi - > spool )
hugepage_put_subpool ( sbi - > spool ) ;
2005-04-17 02:20:36 +04:00
kfree ( sbi ) ;
}
}
2005-10-30 04:16:42 +03:00
static inline int hugetlbfs_dec_free_inodes ( struct hugetlbfs_sb_info * sbinfo )
{
if ( sbinfo - > free_inodes > = 0 ) {
spin_lock ( & sbinfo - > stat_lock ) ;
if ( unlikely ( ! sbinfo - > free_inodes ) ) {
spin_unlock ( & sbinfo - > stat_lock ) ;
return 0 ;
}
sbinfo - > free_inodes - - ;
spin_unlock ( & sbinfo - > stat_lock ) ;
}
return 1 ;
}
static void hugetlbfs_inc_free_inodes ( struct hugetlbfs_sb_info * sbinfo )
{
if ( sbinfo - > free_inodes > = 0 ) {
spin_lock ( & sbinfo - > stat_lock ) ;
sbinfo - > free_inodes + + ;
spin_unlock ( & sbinfo - > stat_lock ) ;
}
}
2006-12-07 07:33:20 +03:00
static struct kmem_cache * hugetlbfs_inode_cachep ;
2005-04-17 02:20:36 +04:00
static struct inode * hugetlbfs_alloc_inode ( struct super_block * sb )
{
2005-10-30 04:16:42 +03:00
struct hugetlbfs_sb_info * sbinfo = HUGETLBFS_SB ( sb ) ;
2005-04-17 02:20:36 +04:00
struct hugetlbfs_inode_info * p ;
2005-10-30 04:16:42 +03:00
if ( unlikely ( ! hugetlbfs_dec_free_inodes ( sbinfo ) ) )
return NULL ;
2022-03-23 00:41:03 +03:00
p = alloc_inode_sb ( sb , hugetlbfs_inode_cachep , GFP_KERNEL ) ;
2005-10-30 04:16:42 +03:00
if ( unlikely ( ! p ) ) {
hugetlbfs_inc_free_inodes ( sbinfo ) ;
2005-04-17 02:20:36 +04:00
return NULL ;
2005-10-30 04:16:42 +03:00
}
2005-04-17 02:20:36 +04:00
return & p - > vfs_inode ;
}
2019-04-16 06:16:38 +03:00
static void hugetlbfs_free_inode ( struct inode * inode )
2011-01-07 09:49:49 +03:00
{
kmem_cache_free ( hugetlbfs_inode_cachep , HUGETLBFS_I ( inode ) ) ;
}
2005-04-17 02:20:36 +04:00
static void hugetlbfs_destroy_inode ( struct inode * inode )
{
2005-10-30 04:16:42 +03:00
hugetlbfs_inc_free_inodes ( HUGETLBFS_SB ( inode - > i_sb ) ) ;
2005-04-17 02:20:36 +04:00
}
2006-06-28 15:26:44 +04:00
static const struct address_space_operations hugetlbfs_aops = {
2007-10-16 12:25:03 +04:00
. write_begin = hugetlbfs_write_begin ,
. write_end = hugetlbfs_write_end ,
2022-02-09 23:22:13 +03:00
. dirty_folio = noop_dirty_folio ,
2022-06-06 17:47:21 +03:00
. migrate_folio = hugetlbfs_migrate_folio ,
2017-07-11 01:47:50 +03:00
. error_remove_page = hugetlbfs_error_remove_page ,
2005-04-17 02:20:36 +04:00
} ;
2005-10-30 04:16:42 +03:00
2008-07-26 06:45:34 +04:00
static void init_once ( void * foo )
2005-10-30 04:16:42 +03:00
{
2022-11-07 04:56:59 +03:00
struct hugetlbfs_inode_info * ei = foo ;
2005-10-30 04:16:42 +03:00
2007-05-17 09:10:57 +04:00
inode_init_once ( & ei - > vfs_inode ) ;
2005-10-30 04:16:42 +03:00
}
2006-03-28 13:56:42 +04:00
const struct file_operations hugetlbfs_file_operations = {
2015-04-03 18:31:35 +03:00
. read_iter = hugetlbfs_read_iter ,
2005-04-17 02:20:36 +04:00
. mmap = hugetlbfs_file_mmap ,
2010-05-26 19:53:41 +04:00
. fsync = noop_fsync ,
2005-04-17 02:20:36 +04:00
. get_unmapped_area = hugetlb_get_unmapped_area ,
2015-09-09 01:01:54 +03:00
. llseek = default_llseek ,
. fallocate = hugetlbfs_fallocate ,
2005-04-17 02:20:36 +04:00
} ;
2007-02-12 11:55:39 +03:00
static const struct inode_operations hugetlbfs_dir_inode_operations = {
2005-04-17 02:20:36 +04:00
. create = hugetlbfs_create ,
. lookup = simple_lookup ,
. link = simple_link ,
. unlink = simple_unlink ,
. symlink = hugetlbfs_symlink ,
. mkdir = hugetlbfs_mkdir ,
. rmdir = simple_rmdir ,
. mknod = hugetlbfs_mknod ,
. rename = simple_rename ,
. setattr = hugetlbfs_setattr ,
2019-12-01 04:56:43 +03:00
. tmpfile = hugetlbfs_tmpfile ,
2005-04-17 02:20:36 +04:00
} ;
2007-02-12 11:55:39 +03:00
static const struct inode_operations hugetlbfs_inode_operations = {
2005-04-17 02:20:36 +04:00
. setattr = hugetlbfs_setattr ,
} ;
2007-02-12 11:55:41 +03:00
static const struct super_operations hugetlbfs_ops = {
2005-04-17 02:20:36 +04:00
. alloc_inode = hugetlbfs_alloc_inode ,
2019-04-16 06:16:38 +03:00
. free_inode = hugetlbfs_free_inode ,
2005-04-17 02:20:36 +04:00
. destroy_inode = hugetlbfs_destroy_inode ,
2010-06-05 03:52:12 +04:00
. evict_inode = hugetlbfs_evict_inode ,
2005-04-17 02:20:36 +04:00
. statfs = hugetlbfs_statfs ,
. put_super = hugetlbfs_put_super ,
2017-07-05 18:24:18 +03:00
. show_options = hugetlbfs_show_options ,
2005-04-17 02:20:36 +04:00
} ;
2015-04-16 02:13:42 +03:00
/*
* Convert size option passed from command line to number of huge pages
* in the pool specified by hstate . Size option could be in bytes
* ( val_type = = SIZE_STD ) or percentage of the pool ( val_type = = SIZE_PERCENT ) .
*/
2017-07-05 18:24:18 +03:00
static long
2015-04-16 02:13:42 +03:00
hugetlbfs_size_to_hpages ( struct hstate * h , unsigned long long size_opt ,
2017-07-05 18:24:18 +03:00
enum hugetlbfs_size_type val_type )
2015-04-16 02:13:42 +03:00
{
if ( val_type = = NO_SIZE )
return - 1 ;
if ( val_type = = SIZE_PERCENT ) {
size_opt < < = huge_page_shift ( h ) ;
size_opt * = h - > max_huge_pages ;
do_div ( size_opt , 100 ) ;
}
size_opt > > = huge_page_shift ( h ) ;
return size_opt ;
}
2018-11-02 02:07:26 +03:00
/*
* Parse one mount parameter .
*/
static int hugetlbfs_parse_param ( struct fs_context * fc , struct fs_parameter * param )
2005-04-17 02:20:36 +04:00
{
2018-11-02 02:07:26 +03:00
struct hugetlbfs_fs_context * ctx = fc - > fs_private ;
struct fs_parse_result result ;
char * rest ;
unsigned long ps ;
int opt ;
2019-09-07 14:23:15 +03:00
opt = fs_parse ( fc , hugetlb_fs_parameters , param , & result ) ;
2018-11-02 02:07:26 +03:00
if ( opt < 0 )
return opt ;
switch ( opt ) {
case Opt_uid :
ctx - > uid = make_kuid ( current_user_ns ( ) , result . uint_32 ) ;
if ( ! uid_valid ( ctx - > uid ) )
goto bad_val ;
2005-04-17 02:20:36 +04:00
return 0 ;
2018-11-02 02:07:26 +03:00
case Opt_gid :
ctx - > gid = make_kgid ( current_user_ns ( ) , result . uint_32 ) ;
if ( ! gid_valid ( ctx - > gid ) )
goto bad_val ;
return 0 ;
2007-07-16 10:40:52 +04:00
2018-11-02 02:07:26 +03:00
case Opt_mode :
ctx - > mode = result . uint_32 & 01777U ;
return 0 ;
2007-07-16 10:40:52 +04:00
2018-11-02 02:07:26 +03:00
case Opt_size :
/* memparse() will accept a K/M/G without a digit */
2022-10-21 02:16:08 +03:00
if ( ! param - > string | | ! isdigit ( param - > string [ 0 ] ) )
2018-11-02 02:07:26 +03:00
goto bad_val ;
ctx - > max_size_opt = memparse ( param - > string , & rest ) ;
ctx - > max_val_type = SIZE_STD ;
if ( * rest = = ' % ' )
ctx - > max_val_type = SIZE_PERCENT ;
return 0 ;
2007-07-16 10:40:52 +04:00
2018-11-02 02:07:26 +03:00
case Opt_nr_inodes :
/* memparse() will accept a K/M/G without a digit */
2022-10-21 02:16:08 +03:00
if ( ! param - > string | | ! isdigit ( param - > string [ 0 ] ) )
2018-11-02 02:07:26 +03:00
goto bad_val ;
ctx - > nr_inodes = memparse ( param - > string , & rest ) ;
return 0 ;
2007-07-16 10:40:52 +04:00
2018-11-02 02:07:26 +03:00
case Opt_pagesize :
ps = memparse ( param - > string , & rest ) ;
ctx - > hstate = size_to_hstate ( ps ) ;
if ( ! ctx - > hstate ) {
2022-07-26 17:29:14 +03:00
pr_err ( " Unsupported page size %lu MB \n " , ps / SZ_1M ) ;
2018-11-02 02:07:26 +03:00
return - EINVAL ;
2007-07-16 10:40:52 +04:00
}
2018-11-02 02:07:26 +03:00
return 0 ;
2005-04-17 02:20:36 +04:00
2018-11-02 02:07:26 +03:00
case Opt_min_size :
/* memparse() will accept a K/M/G without a digit */
2022-10-21 02:16:08 +03:00
if ( ! param - > string | | ! isdigit ( param - > string [ 0 ] ) )
2018-11-02 02:07:26 +03:00
goto bad_val ;
ctx - > min_size_opt = memparse ( param - > string , & rest ) ;
ctx - > min_val_type = SIZE_STD ;
if ( * rest = = ' % ' )
ctx - > min_val_type = SIZE_PERCENT ;
return 0 ;
2007-07-16 10:40:52 +04:00
2018-11-02 02:07:26 +03:00
default :
return - EINVAL ;
}
2008-07-24 08:27:43 +04:00
2018-11-02 02:07:26 +03:00
bad_val :
2019-12-22 05:34:06 +03:00
return invalfc ( fc , " Bad value '%s' for mount option '%s' \n " ,
2018-11-02 02:07:26 +03:00
param - > string , param - > key ) ;
}
2015-04-16 02:13:42 +03:00
2018-11-02 02:07:26 +03:00
/*
* Validate the parsed options .
*/
static int hugetlbfs_validate ( struct fs_context * fc )
{
struct hugetlbfs_fs_context * ctx = fc - > fs_private ;
2008-07-24 08:27:43 +04:00
2015-04-16 02:13:42 +03:00
/*
* Use huge page pool size ( in hstate ) to convert the size
* options to number of huge pages . If NO_SIZE , - 1 is returned .
*/
2018-11-02 02:07:26 +03:00
ctx - > max_hpages = hugetlbfs_size_to_hpages ( ctx - > hstate ,
ctx - > max_size_opt ,
ctx - > max_val_type ) ;
ctx - > min_hpages = hugetlbfs_size_to_hpages ( ctx - > hstate ,
ctx - > min_size_opt ,
ctx - > min_val_type ) ;
2015-04-16 02:13:42 +03:00
/*
* If max_size was specified , then min_size must be smaller
*/
2018-11-02 02:07:26 +03:00
if ( ctx - > max_val_type > NO_SIZE & &
ctx - > min_hpages > ctx - > max_hpages ) {
pr_err ( " Minimum size can not be greater than maximum size \n " ) ;
2015-04-16 02:13:42 +03:00
return - EINVAL ;
2008-07-24 08:27:43 +04:00
}
2005-04-17 02:20:36 +04:00
return 0 ;
}
static int
2018-11-02 02:07:26 +03:00
hugetlbfs_fill_super ( struct super_block * sb , struct fs_context * fc )
2005-04-17 02:20:36 +04:00
{
2018-11-02 02:07:26 +03:00
struct hugetlbfs_fs_context * ctx = fc - > fs_private ;
2005-04-17 02:20:36 +04:00
struct hugetlbfs_sb_info * sbinfo ;
sbinfo = kmalloc ( sizeof ( struct hugetlbfs_sb_info ) , GFP_KERNEL ) ;
if ( ! sbinfo )
return - ENOMEM ;
sb - > s_fs_info = sbinfo ;
spin_lock_init ( & sbinfo - > stat_lock ) ;
2018-11-02 02:07:26 +03:00
sbinfo - > hstate = ctx - > hstate ;
sbinfo - > max_inodes = ctx - > nr_inodes ;
sbinfo - > free_inodes = ctx - > nr_inodes ;
sbinfo - > spool = NULL ;
sbinfo - > uid = ctx - > uid ;
sbinfo - > gid = ctx - > gid ;
sbinfo - > mode = ctx - > mode ;
2017-07-05 18:24:18 +03:00
2015-04-16 02:13:42 +03:00
/*
* Allocate and initialize subpool if maximum or minimum size is
2021-02-24 23:10:21 +03:00
* specified . Any needed reservations ( for minimum size ) are taken
2022-07-26 17:29:17 +03:00
* when the subpool is created .
2015-04-16 02:13:42 +03:00
*/
2018-11-02 02:07:26 +03:00
if ( ctx - > max_hpages ! = - 1 | | ctx - > min_hpages ! = - 1 ) {
sbinfo - > spool = hugepage_new_subpool ( ctx - > hstate ,
ctx - > max_hpages ,
ctx - > min_hpages ) ;
hugepages: fix use after free bug in "quota" handling
hugetlbfs_{get,put}_quota() are badly named. They don't interact with the
general quota handling code, and they don't much resemble its behaviour.
Rather than being about maintaining limits on on-disk block usage by
particular users, they are instead about maintaining limits on in-memory
page usage (including anonymous MAP_PRIVATE copied-on-write pages)
associated with a particular hugetlbfs filesystem instance.
Worse, they work by having callbacks to the hugetlbfs filesystem code from
the low-level page handling code, in particular from free_huge_page().
This is a layering violation of itself, but more importantly, if the
kernel does a get_user_pages() on hugepages (which can happen from KVM
amongst others), then the free_huge_page() can be delayed until after the
associated inode has already been freed. If an unmount occurs at the
wrong time, even the hugetlbfs superblock where the "quota" limits are
stored may have been freed.
Andrew Barry proposed a patch to fix this by having hugepages, instead of
storing a pointer to their address_space and reaching the superblock from
there, had the hugepages store pointers directly to the superblock,
bumping the reference count as appropriate to avoid it being freed.
Andrew Morton rejected that version, however, on the grounds that it made
the existing layering violation worse.
This is a reworked version of Andrew's patch, which removes the extra, and
some of the existing, layering violation. It works by introducing the
concept of a hugepage "subpool" at the lower hugepage mm layer - that is a
finite logical pool of hugepages to allocate from. hugetlbfs now creates
a subpool for each filesystem instance with a page limit set, and a
pointer to the subpool gets added to each allocated hugepage, instead of
the address_space pointer used now. The subpool has its own lifetime and
is only freed once all pages in it _and_ all other references to it (i.e.
superblocks) are gone.
subpools are optional - a NULL subpool pointer is taken by the code to
mean that no subpool limits are in effect.
Previous discussion of this bug found in: "Fix refcounting in hugetlbfs
quota handling.". See: https://lkml.org/lkml/2011/8/11/28 or
http://marc.info/?l=linux-mm&m=126928970510627&w=1
v2: Fixed a bug spotted by Hillf Danton, and removed the extra parameter to
alloc_huge_page() - since it already takes the vma, it is not necessary.
Signed-off-by: Andrew Barry <abarry@cray.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Cc: Hugh Dickins <hughd@google.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Hillf Danton <dhillf@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-22 03:34:12 +04:00
if ( ! sbinfo - > spool )
goto out_free ;
}
2005-04-17 02:20:36 +04:00
sb - > s_maxbytes = MAX_LFS_FILESIZE ;
2018-11-02 02:07:26 +03:00
sb - > s_blocksize = huge_page_size ( ctx - > hstate ) ;
sb - > s_blocksize_bits = huge_page_shift ( ctx - > hstate ) ;
2005-04-17 02:20:36 +04:00
sb - > s_magic = HUGETLBFS_MAGIC ;
sb - > s_op = & hugetlbfs_ops ;
sb - > s_time_gran = 1 ;
2020-08-12 04:31:35 +03:00
/*
* Due to the special and limited functionality of hugetlbfs , it does
* not work well as a stacking filesystem .
*/
sb - > s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH ;
2018-11-02 02:07:26 +03:00
sb - > s_root = d_make_root ( hugetlbfs_get_root ( sb , ctx ) ) ;
2012-01-09 07:15:13 +04:00
if ( ! sb - > s_root )
2005-04-17 02:20:36 +04:00
goto out_free ;
return 0 ;
out_free :
2014-06-05 03:10:40 +04:00
kfree ( sbinfo - > spool ) ;
2005-04-17 02:20:36 +04:00
kfree ( sbinfo ) ;
return - ENOMEM ;
}
2018-11-02 02:07:26 +03:00
static int hugetlbfs_get_tree ( struct fs_context * fc )
{
int err = hugetlbfs_validate ( fc ) ;
if ( err )
return err ;
2019-06-02 03:48:55 +03:00
return get_tree_nodev ( fc , hugetlbfs_fill_super ) ;
2018-11-02 02:07:26 +03:00
}
static void hugetlbfs_fs_context_free ( struct fs_context * fc )
{
kfree ( fc - > fs_private ) ;
}
static const struct fs_context_operations hugetlbfs_fs_context_ops = {
. free = hugetlbfs_fs_context_free ,
. parse_param = hugetlbfs_parse_param ,
. get_tree = hugetlbfs_get_tree ,
} ;
static int hugetlbfs_init_fs_context ( struct fs_context * fc )
2005-04-17 02:20:36 +04:00
{
2018-11-02 02:07:26 +03:00
struct hugetlbfs_fs_context * ctx ;
ctx = kzalloc ( sizeof ( struct hugetlbfs_fs_context ) , GFP_KERNEL ) ;
if ( ! ctx )
return - ENOMEM ;
ctx - > max_hpages = - 1 ; /* No limit on size by default */
ctx - > nr_inodes = - 1 ; /* No limit on number of inodes by default */
ctx - > uid = current_fsuid ( ) ;
ctx - > gid = current_fsgid ( ) ;
ctx - > mode = 0755 ;
ctx - > hstate = & default_hstate ;
ctx - > min_hpages = - 1 ; /* No default minimum size */
ctx - > max_val_type = NO_SIZE ;
ctx - > min_val_type = NO_SIZE ;
fc - > fs_private = ctx ;
fc - > ops = & hugetlbfs_fs_context_ops ;
return 0 ;
2005-04-17 02:20:36 +04:00
}
static struct file_system_type hugetlbfs_fs_type = {
2018-11-02 02:07:26 +03:00
. name = " hugetlbfs " ,
. init_fs_context = hugetlbfs_init_fs_context ,
2019-09-07 14:23:15 +03:00
. parameters = hugetlb_fs_parameters ,
2018-11-02 02:07:26 +03:00
. kill_sb = kill_litter_super ,
2005-04-17 02:20:36 +04:00
} ;
2012-12-12 04:01:34 +04:00
static struct vfsmount * hugetlbfs_vfsmount [ HUGE_MAX_HSTATE ] ;
2005-04-17 02:20:36 +04:00
2009-09-24 02:56:05 +04:00
static int can_do_hugetlb_shm ( void )
2005-04-17 02:20:36 +04:00
{
2012-02-08 04:19:25 +04:00
kgid_t shm_group ;
shm_group = make_kgid ( & init_user_ns , sysctl_hugetlb_shm_group ) ;
return capable ( CAP_IPC_LOCK ) | | in_group_p ( shm_group ) ;
2005-04-17 02:20:36 +04:00
}
2012-12-12 04:01:34 +04:00
static int get_hstate_idx ( int page_size_log )
{
2013-05-08 03:18:13 +04:00
struct hstate * h = hstate_sizelog ( page_size_log ) ;
2012-12-12 04:01:34 +04:00
if ( ! h )
return - 1 ;
2021-05-05 04:33:22 +03:00
return hstate_index ( h ) ;
2012-12-12 04:01:34 +04:00
}
2013-05-08 03:18:13 +04:00
/*
* Note that size should be aligned to proper hugepage size in caller side ,
* otherwise hugetlb_reserve_pages reserves one less hugepages than intended .
*/
struct file * hugetlb_file_setup ( const char * name , size_t size ,
2021-11-09 05:31:27 +03:00
vm_flags_t acctflag , int creat_flags ,
int page_size_log )
2005-04-17 02:20:36 +04:00
{
struct inode * inode ;
2018-06-09 16:50:46 +03:00
struct vfsmount * mnt ;
2012-12-12 04:01:34 +04:00
int hstate_idx ;
2018-06-09 16:50:46 +03:00
struct file * file ;
2012-12-12 04:01:34 +04:00
hstate_idx = get_hstate_idx ( page_size_log ) ;
if ( hstate_idx < 0 )
return ERR_PTR ( - ENODEV ) ;
2005-04-17 02:20:36 +04:00
2018-06-09 16:50:46 +03:00
mnt = hugetlbfs_vfsmount [ hstate_idx ] ;
if ( ! mnt )
2007-05-07 01:50:18 +04:00
return ERR_PTR ( - ENOENT ) ;
2009-09-24 02:56:05 +04:00
if ( creat_flags = = HUGETLB_SHMFS_INODE & & ! can_do_hugetlb_shm ( ) ) {
2021-11-09 05:31:27 +03:00
struct ucounts * ucounts = current_ucounts ( ) ;
if ( user_shm_lock ( size , ucounts ) ) {
pr_warn_once ( " %s (%d): Using mlock ulimits for SHM_HUGETLB is obsolete \n " ,
2012-03-22 03:34:13 +04:00
current - > comm , current - > pid ) ;
2021-11-09 05:31:27 +03:00
user_shm_unlock ( size , ucounts ) ;
2009-08-24 19:30:28 +04:00
}
2021-11-09 05:31:27 +03:00
return ERR_PTR ( - EPERM ) ;
2009-04-01 02:21:26 +04:00
}
2005-04-17 02:20:36 +04:00
2012-09-13 07:11:55 +04:00
file = ERR_PTR ( - ENOSPC ) ;
2018-06-09 16:50:46 +03:00
inode = hugetlbfs_get_inode ( mnt - > mnt_sb , NULL , S_IFREG | S_IRWXUGO , 0 ) ;
2005-04-17 02:20:36 +04:00
if ( ! inode )
2018-06-09 16:50:46 +03:00
goto out ;
2015-08-07 01:46:55 +03:00
if ( creat_flags = = HUGETLB_SHMFS_INODE )
inode - > i_flags | = S_PRIVATE ;
2005-04-17 02:20:36 +04:00
inode - > i_size = size ;
2011-10-28 16:13:28 +04:00
clear_nlink ( inode ) ;
2007-10-17 10:31:13 +04:00
2021-02-24 23:09:54 +03:00
if ( ! hugetlb_reserve_pages ( inode , 0 ,
2018-06-09 16:50:46 +03:00
size > > huge_page_shift ( hstate_inode ( inode ) ) , NULL ,
acctflag ) )
file = ERR_PTR ( - ENOMEM ) ;
else
file = alloc_file_pseudo ( inode , mnt , name , O_RDWR ,
& hugetlbfs_file_operations ) ;
if ( ! IS_ERR ( file ) )
return file ;
2005-04-17 02:20:36 +04:00
[PATCH] hugepage: Strict page reservation for hugepage inodes
These days, hugepages are demand-allocated at first fault time. There's a
somewhat dubious (and racy) heuristic when making a new mmap() to check if
there are enough available hugepages to fully satisfy that mapping.
A particularly obvious case where the heuristic breaks down is where a
process maps its hugepages not as a single chunk, but as a bunch of
individually mmap()ed (or shmat()ed) blocks without touching and
instantiating the pages in between allocations. In this case the size of
each block is compared against the total number of available hugepages.
It's thus easy for the process to become overcommitted, because each block
mapping will succeed, although the total number of hugepages required by
all blocks exceeds the number available. In particular, this defeats such
a program which will detect a mapping failure and adjust its hugepage usage
downward accordingly.
The patch below addresses this problem, by strictly reserving a number of
physical hugepages for hugepage inodes which have been mapped, but not
instatiated. MAP_SHARED mappings are thus "safe" - they will fail on
mmap(), not later with an OOM SIGKILL. MAP_PRIVATE mappings can still
trigger an OOM. (Actually SHARED mappings can technically still OOM, but
only if the sysadmin explicitly reduces the hugepage pool between mapping
and instantiation)
This patch appears to address the problem at hand - it allows DB2 to start
correctly, for instance, which previously suffered the failure described
above.
This patch causes no regressions on the libhugetblfs testsuite, and makes a
test (designed to catch this problem) pass which previously failed (ppc64,
POWER5).
Signed-off-by: David Gibson <dwg@au1.ibm.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-22 11:08:55 +03:00
iput ( inode ) ;
2018-06-09 16:50:46 +03:00
out :
2012-09-13 07:11:55 +04:00
return file ;
2005-04-17 02:20:36 +04:00
}
2018-11-02 02:07:26 +03:00
static struct vfsmount * __init mount_one_hugetlbfs ( struct hstate * h )
{
struct fs_context * fc ;
struct vfsmount * mnt ;
fc = fs_context_for_mount ( & hugetlbfs_fs_type , SB_KERNMOUNT ) ;
if ( IS_ERR ( fc ) ) {
mnt = ERR_CAST ( fc ) ;
} else {
struct hugetlbfs_fs_context * ctx = fc - > fs_private ;
ctx - > hstate = h ;
mnt = fc_mount ( fc ) ;
put_fs_context ( fc ) ;
}
if ( IS_ERR ( mnt ) )
2021-02-24 23:10:14 +03:00
pr_err ( " Cannot mount internal hugetlbfs for page size %luK " ,
2022-07-26 17:29:14 +03:00
huge_page_size ( h ) / SZ_1K ) ;
2018-11-02 02:07:26 +03:00
return mnt ;
}
2005-04-17 02:20:36 +04:00
static int __init init_hugetlbfs_fs ( void )
{
2018-11-02 02:07:26 +03:00
struct vfsmount * mnt ;
2012-12-12 04:01:34 +04:00
struct hstate * h ;
2005-04-17 02:20:36 +04:00
int error ;
2012-12-12 04:01:34 +04:00
int i ;
2005-04-17 02:20:36 +04:00
hugetlb: ensure hugepage access is denied if hugepages are not supported
Currently, I am seeing the following when I `mount -t hugetlbfs /none
/dev/hugetlbfs`, and then simply do a `ls /dev/hugetlbfs`. I think it's
related to the fact that hugetlbfs is properly not correctly setting
itself up in this state?:
Unable to handle kernel paging request for data at address 0x00000031
Faulting instruction address: 0xc000000000245710
Oops: Kernel access of bad area, sig: 11 [#1]
SMP NR_CPUS=2048 NUMA pSeries
....
In KVM guests on Power, in a guest not backed by hugepages, we see the
following:
AnonHugePages: 0 kB
HugePages_Total: 0
HugePages_Free: 0
HugePages_Rsvd: 0
HugePages_Surp: 0
Hugepagesize: 64 kB
HPAGE_SHIFT == 0 in this configuration, which indicates that hugepages
are not supported at boot-time, but this is only checked in
hugetlb_init(). Extract the check to a helper function, and use it in a
few relevant places.
This does make hugetlbfs not supported (not registered at all) in this
environment. I believe this is fine, as there are no valid hugepages
and that won't change at runtime.
[akpm@linux-foundation.org: use pr_info(), per Mel]
[akpm@linux-foundation.org: fix build when HPAGE_SHIFT is undefined]
Signed-off-by: Nishanth Aravamudan <nacc@linux.vnet.ibm.com>
Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Cc: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-05-06 23:50:00 +04:00
if ( ! hugepages_supported ( ) ) {
2014-06-05 03:07:21 +04:00
pr_info ( " disabling because there are no supported hugepage sizes \n " ) ;
hugetlb: ensure hugepage access is denied if hugepages are not supported
Currently, I am seeing the following when I `mount -t hugetlbfs /none
/dev/hugetlbfs`, and then simply do a `ls /dev/hugetlbfs`. I think it's
related to the fact that hugetlbfs is properly not correctly setting
itself up in this state?:
Unable to handle kernel paging request for data at address 0x00000031
Faulting instruction address: 0xc000000000245710
Oops: Kernel access of bad area, sig: 11 [#1]
SMP NR_CPUS=2048 NUMA pSeries
....
In KVM guests on Power, in a guest not backed by hugepages, we see the
following:
AnonHugePages: 0 kB
HugePages_Total: 0
HugePages_Free: 0
HugePages_Rsvd: 0
HugePages_Surp: 0
Hugepagesize: 64 kB
HPAGE_SHIFT == 0 in this configuration, which indicates that hugepages
are not supported at boot-time, but this is only checked in
hugetlb_init(). Extract the check to a helper function, and use it in a
few relevant places.
This does make hugetlbfs not supported (not registered at all) in this
environment. I believe this is fine, as there are no valid hugepages
and that won't change at runtime.
[akpm@linux-foundation.org: use pr_info(), per Mel]
[akpm@linux-foundation.org: fix build when HPAGE_SHIFT is undefined]
Signed-off-by: Nishanth Aravamudan <nacc@linux.vnet.ibm.com>
Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Cc: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-05-06 23:50:00 +04:00
return - ENOTSUPP ;
}
2012-03-22 03:34:15 +04:00
error = - ENOMEM ;
2005-04-17 02:20:36 +04:00
hugetlbfs_inode_cachep = kmem_cache_create ( " hugetlbfs_inode_cache " ,
sizeof ( struct hugetlbfs_inode_info ) ,
2016-01-15 02:18:21 +03:00
0 , SLAB_ACCOUNT , init_once ) ;
2005-04-17 02:20:36 +04:00
if ( hugetlbfs_inode_cachep = = NULL )
2019-12-01 04:56:34 +03:00
goto out ;
2005-04-17 02:20:36 +04:00
error = register_filesystem ( & hugetlbfs_fs_type ) ;
if ( error )
2019-12-01 04:56:34 +03:00
goto out_free ;
2005-04-17 02:20:36 +04:00
2019-12-01 04:56:34 +03:00
/* default hstate mount is required */
2021-02-24 23:10:04 +03:00
mnt = mount_one_hugetlbfs ( & default_hstate ) ;
2019-12-01 04:56:34 +03:00
if ( IS_ERR ( mnt ) ) {
error = PTR_ERR ( mnt ) ;
goto out_unreg ;
}
hugetlbfs_vfsmount [ default_hstate_idx ] = mnt ;
/* other hstates are optional */
2012-12-12 04:01:34 +04:00
i = 0 ;
for_each_hstate ( h ) {
2020-01-03 20:37:18 +03:00
if ( i = = default_hstate_idx ) {
i + + ;
2019-12-01 04:56:34 +03:00
continue ;
2020-01-03 20:37:18 +03:00
}
2019-12-01 04:56:34 +03:00
2018-11-02 02:07:26 +03:00
mnt = mount_one_hugetlbfs ( h ) ;
2019-12-01 04:56:34 +03:00
if ( IS_ERR ( mnt ) )
hugetlbfs_vfsmount [ i ] = NULL ;
else
hugetlbfs_vfsmount [ i ] = mnt ;
2012-12-12 04:01:34 +04:00
i + + ;
}
2018-11-02 02:07:26 +03:00
return 0 ;
2005-04-17 02:20:36 +04:00
2019-12-01 04:56:34 +03:00
out_unreg :
( void ) unregister_filesystem ( & hugetlbfs_fs_type ) ;
out_free :
2012-03-22 03:34:15 +04:00
kmem_cache_destroy ( hugetlbfs_inode_cachep ) ;
2019-12-01 04:56:34 +03:00
out :
2005-04-17 02:20:36 +04:00
return error ;
}
2016-01-15 02:21:52 +03:00
fs_initcall ( init_hugetlbfs_fs )