linux/fs/hugetlbfs/inode.c
Rafael Aquini 78bd52097d mm: adjust address_space_operations.migratepage() return code
Memory fragmentation introduced by ballooning might reduce significantly
the number of 2MB contiguous memory blocks that can be used within a
guest, thus imposing performance penalties associated with the reduced
number of transparent huge pages that could be used by the guest workload.

This patch-set follows the main idea discussed at 2012 LSFMMS session:
"Ballooning for transparent huge pages" -- http://lwn.net/Articles/490114/
to introduce the required changes to the virtio_balloon driver, as well as
the changes to the core compaction & migration bits, in order to make
those subsystems aware of ballooned pages and allow memory balloon pages
become movable within a guest, thus avoiding the aforementioned
fragmentation issue

Following are numbers that prove this patch benefits on allowing
compaction to be more effective at memory ballooned guests.

Results for STRESS-HIGHALLOC benchmark, from Mel Gorman's mmtests suite,
running on a 4gB RAM KVM guest which was ballooning 512mB RAM in 64mB
chunks, at every minute (inflating/deflating), while test was running:

===BEGIN stress-highalloc

STRESS-HIGHALLOC
                 highalloc-3.7     highalloc-3.7
                     rc4-clean         rc4-patch
Pass 1          55.00 ( 0.00%)    62.00 ( 7.00%)
Pass 2          54.00 ( 0.00%)    62.00 ( 8.00%)
while Rested    75.00 ( 0.00%)    80.00 ( 5.00%)

MMTests Statistics: duration
                 3.7         3.7
           rc4-clean   rc4-patch
User         1207.59     1207.46
System       1300.55     1299.61
Elapsed      2273.72     2157.06

MMTests Statistics: vmstat
                                3.7         3.7
                          rc4-clean   rc4-patch
Page Ins                    3581516     2374368
Page Outs                  11148692    10410332
Swap Ins                         80          47
Swap Outs                      3641         476
Direct pages scanned          37978       33826
Kswapd pages scanned        1828245     1342869
Kswapd pages reclaimed      1710236     1304099
Direct pages reclaimed        32207       31005
Kswapd efficiency               93%         97%
Kswapd velocity             804.077     622.546
Direct efficiency               84%         91%
Direct velocity              16.703      15.682
Percentage direct scans          2%          2%
Page writes by reclaim        79252        9704
Page writes file              75611        9228
Page writes anon               3641         476
Page reclaim immediate        16764       11014
Page rescued immediate            0           0
Slabs scanned               2171904     2152448
Direct inode steals             385        2261
Kswapd inode steals          659137      609670
Kswapd skipped wait               1          69
THP fault alloc                 546         631
THP collapse alloc              361         339
THP splits                      259         263
THP fault fallback               98          50
THP collapse fail                20          17
Compaction stalls               747         499
Compaction success              244         145
Compaction failures             503         354
Compaction pages moved       370888      474837
Compaction move failure       77378       65259

===END stress-highalloc

This patch:

Introduce MIGRATEPAGE_SUCCESS as the default return code for
address_space_operations.migratepage() method and documents the expected
return code for the same method in failure cases.

Signed-off-by: Rafael Aquini <aquini@redhat.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Minchan Kim <minchan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-12-11 17:22:26 -08:00

1076 lines
25 KiB
C

/*
* hugetlbpage-backed filesystem. Based on ramfs.
*
* William Irwin, 2002
*
* Copyright (C) 2002 Linus Torvalds.
*/
#include <linux/module.h>
#include <linux/thread_info.h>
#include <asm/current.h>
#include <linux/sched.h> /* remove ASAP */
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/file.h>
#include <linux/kernel.h>
#include <linux/writeback.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/capability.h>
#include <linux/ctype.h>
#include <linux/backing-dev.h>
#include <linux/hugetlb.h>
#include <linux/pagevec.h>
#include <linux/parser.h>
#include <linux/mman.h>
#include <linux/slab.h>
#include <linux/dnotify.h>
#include <linux/statfs.h>
#include <linux/security.h>
#include <linux/magic.h>
#include <linux/migrate.h>
#include <asm/uaccess.h>
static const struct super_operations hugetlbfs_ops;
static const struct address_space_operations hugetlbfs_aops;
const struct file_operations hugetlbfs_file_operations;
static const struct inode_operations hugetlbfs_dir_inode_operations;
static const struct inode_operations hugetlbfs_inode_operations;
struct hugetlbfs_config {
kuid_t uid;
kgid_t gid;
umode_t mode;
long nr_blocks;
long nr_inodes;
struct hstate *hstate;
};
struct hugetlbfs_inode_info {
struct shared_policy policy;
struct inode vfs_inode;
};
static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode)
{
return container_of(inode, struct hugetlbfs_inode_info, vfs_inode);
}
static struct backing_dev_info hugetlbfs_backing_dev_info = {
.name = "hugetlbfs",
.ra_pages = 0, /* No readahead */
.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
};
int sysctl_hugetlb_shm_group;
enum {
Opt_size, Opt_nr_inodes,
Opt_mode, Opt_uid, Opt_gid,
Opt_pagesize,
Opt_err,
};
static const match_table_t tokens = {
{Opt_size, "size=%s"},
{Opt_nr_inodes, "nr_inodes=%s"},
{Opt_mode, "mode=%o"},
{Opt_uid, "uid=%u"},
{Opt_gid, "gid=%u"},
{Opt_pagesize, "pagesize=%s"},
{Opt_err, NULL},
};
static void huge_pagevec_release(struct pagevec *pvec)
{
int i;
for (i = 0; i < pagevec_count(pvec); ++i)
put_page(pvec->pages[i]);
pagevec_reinit(pvec);
}
static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
struct inode *inode = file->f_path.dentry->d_inode;
loff_t len, vma_len;
int ret;
struct hstate *h = hstate_file(file);
/*
* vma address alignment (but not the pgoff alignment) has
* already been checked by prepare_hugepage_range. If you add
* any error returns here, do so after setting VM_HUGETLB, so
* is_vm_hugetlb_page tests below unmap_region go the right
* way when do_mmap_pgoff unwinds (may be important on powerpc
* and ia64).
*/
vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND | VM_DONTDUMP;
vma->vm_ops = &hugetlb_vm_ops;
if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT))
return -EINVAL;
vma_len = (loff_t)(vma->vm_end - vma->vm_start);
mutex_lock(&inode->i_mutex);
file_accessed(file);
ret = -ENOMEM;
len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
if (hugetlb_reserve_pages(inode,
vma->vm_pgoff >> huge_page_order(h),
len >> huge_page_shift(h), vma,
vma->vm_flags))
goto out;
ret = 0;
hugetlb_prefault_arch_hook(vma->vm_mm);
if (vma->vm_flags & VM_WRITE && inode->i_size < len)
inode->i_size = len;
out:
mutex_unlock(&inode->i_mutex);
return ret;
}
/*
* Called under down_write(mmap_sem).
*/
#ifndef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
static unsigned long
hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
unsigned long len, unsigned long pgoff, unsigned long flags)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
struct hstate *h = hstate_file(file);
struct vm_unmapped_area_info info;
if (len & ~huge_page_mask(h))
return -EINVAL;
if (len > TASK_SIZE)
return -ENOMEM;
if (flags & MAP_FIXED) {
if (prepare_hugepage_range(file, addr, len))
return -EINVAL;
return addr;
}
if (addr) {
addr = ALIGN(addr, huge_page_size(h));
vma = find_vma(mm, addr);
if (TASK_SIZE - len >= addr &&
(!vma || addr + len <= vma->vm_start))
return addr;
}
info.flags = 0;
info.length = len;
info.low_limit = TASK_UNMAPPED_BASE;
info.high_limit = TASK_SIZE;
info.align_mask = PAGE_MASK & ~huge_page_mask(h);
info.align_offset = 0;
return vm_unmapped_area(&info);
}
#endif
static int
hugetlbfs_read_actor(struct page *page, unsigned long offset,
char __user *buf, unsigned long count,
unsigned long size)
{
char *kaddr;
unsigned long left, copied = 0;
int i, chunksize;
if (size > count)
size = count;
/* Find which 4k chunk and offset with in that chunk */
i = offset >> PAGE_CACHE_SHIFT;
offset = offset & ~PAGE_CACHE_MASK;
while (size) {
chunksize = PAGE_CACHE_SIZE;
if (offset)
chunksize -= offset;
if (chunksize > size)
chunksize = size;
kaddr = kmap(&page[i]);
left = __copy_to_user(buf, kaddr + offset, chunksize);
kunmap(&page[i]);
if (left) {
copied += (chunksize - left);
break;
}
offset = 0;
size -= chunksize;
buf += chunksize;
copied += chunksize;
i++;
}
return copied ? copied : -EFAULT;
}
/*
* Support for read() - Find the page attached to f_mapping and copy out the
* data. Its *very* similar to do_generic_mapping_read(), we can't use that
* since it has PAGE_CACHE_SIZE assumptions.
*/
static ssize_t hugetlbfs_read(struct file *filp, char __user *buf,
size_t len, loff_t *ppos)
{
struct hstate *h = hstate_file(filp);
struct address_space *mapping = filp->f_mapping;
struct inode *inode = mapping->host;
unsigned long index = *ppos >> huge_page_shift(h);
unsigned long offset = *ppos & ~huge_page_mask(h);
unsigned long end_index;
loff_t isize;
ssize_t retval = 0;
/* validate length */
if (len == 0)
goto out;
for (;;) {
struct page *page;
unsigned long nr, ret;
int ra;
/* nr is the maximum number of bytes to copy from this page */
nr = huge_page_size(h);
isize = i_size_read(inode);
if (!isize)
goto out;
end_index = (isize - 1) >> huge_page_shift(h);
if (index >= end_index) {
if (index > end_index)
goto out;
nr = ((isize - 1) & ~huge_page_mask(h)) + 1;
if (nr <= offset)
goto out;
}
nr = nr - offset;
/* Find the page */
page = find_lock_page(mapping, index);
if (unlikely(page == NULL)) {
/*
* We have a HOLE, zero out the user-buffer for the
* length of the hole or request.
*/
ret = len < nr ? len : nr;
if (clear_user(buf, ret))
ra = -EFAULT;
else
ra = 0;
} else {
unlock_page(page);
/*
* We have the page, copy it to user space buffer.
*/
ra = hugetlbfs_read_actor(page, offset, buf, len, nr);
ret = ra;
page_cache_release(page);
}
if (ra < 0) {
if (retval == 0)
retval = ra;
goto out;
}
offset += ret;
retval += ret;
len -= ret;
index += offset >> huge_page_shift(h);
offset &= ~huge_page_mask(h);
/* short read or no more work */
if ((ret != nr) || (len == 0))
break;
}
out:
*ppos = ((loff_t)index << huge_page_shift(h)) + offset;
return retval;
}
static int hugetlbfs_write_begin(struct file *file,
struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
{
return -EINVAL;
}
static int hugetlbfs_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata)
{
BUG();
return -EINVAL;
}
static void truncate_huge_page(struct page *page)
{
cancel_dirty_page(page, /* No IO accounting for huge pages? */0);
ClearPageUptodate(page);
delete_from_page_cache(page);
}
static void truncate_hugepages(struct inode *inode, loff_t lstart)
{
struct hstate *h = hstate_inode(inode);
struct address_space *mapping = &inode->i_data;
const pgoff_t start = lstart >> huge_page_shift(h);
struct pagevec pvec;
pgoff_t next;
int i, freed = 0;
pagevec_init(&pvec, 0);
next = start;
while (1) {
if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
if (next == start)
break;
next = start;
continue;
}
for (i = 0; i < pagevec_count(&pvec); ++i) {
struct page *page = pvec.pages[i];
lock_page(page);
if (page->index > next)
next = page->index;
++next;
truncate_huge_page(page);
unlock_page(page);
freed++;
}
huge_pagevec_release(&pvec);
}
BUG_ON(!lstart && mapping->nrpages);
hugetlb_unreserve_pages(inode, start, freed);
}
static void hugetlbfs_evict_inode(struct inode *inode)
{
truncate_hugepages(inode, 0);
clear_inode(inode);
}
static inline void
hugetlb_vmtruncate_list(struct rb_root *root, pgoff_t pgoff)
{
struct vm_area_struct *vma;
vma_interval_tree_foreach(vma, root, pgoff, ULONG_MAX) {
unsigned long v_offset;
/*
* Can the expression below overflow on 32-bit arches?
* No, because the interval tree returns us only those vmas
* which overlap the truncated area starting at pgoff,
* and no vma on a 32-bit arch can span beyond the 4GB.
*/
if (vma->vm_pgoff < pgoff)
v_offset = (pgoff - vma->vm_pgoff) << PAGE_SHIFT;
else
v_offset = 0;
unmap_hugepage_range(vma, vma->vm_start + v_offset,
vma->vm_end, NULL);
}
}
static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
{
pgoff_t pgoff;
struct address_space *mapping = inode->i_mapping;
struct hstate *h = hstate_inode(inode);
BUG_ON(offset & ~huge_page_mask(h));
pgoff = offset >> PAGE_SHIFT;
i_size_write(inode, offset);
mutex_lock(&mapping->i_mmap_mutex);
if (!RB_EMPTY_ROOT(&mapping->i_mmap))
hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff);
mutex_unlock(&mapping->i_mmap_mutex);
truncate_hugepages(inode, offset);
return 0;
}
static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = dentry->d_inode;
struct hstate *h = hstate_inode(inode);
int error;
unsigned int ia_valid = attr->ia_valid;
BUG_ON(!inode);
error = inode_change_ok(inode, attr);
if (error)
return error;
if (ia_valid & ATTR_SIZE) {
error = -EINVAL;
if (attr->ia_size & ~huge_page_mask(h))
return -EINVAL;
error = hugetlb_vmtruncate(inode, attr->ia_size);
if (error)
return error;
}
setattr_copy(inode, attr);
mark_inode_dirty(inode);
return 0;
}
static struct inode *hugetlbfs_get_root(struct super_block *sb,
struct hugetlbfs_config *config)
{
struct inode *inode;
inode = new_inode(sb);
if (inode) {
struct hugetlbfs_inode_info *info;
inode->i_ino = get_next_ino();
inode->i_mode = S_IFDIR | config->mode;
inode->i_uid = config->uid;
inode->i_gid = config->gid;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
info = HUGETLBFS_I(inode);
mpol_shared_policy_init(&info->policy, NULL);
inode->i_op = &hugetlbfs_dir_inode_operations;
inode->i_fop = &simple_dir_operations;
/* directory inodes start off with i_nlink == 2 (for "." entry) */
inc_nlink(inode);
lockdep_annotate_inode_mutex_key(inode);
}
return inode;
}
static struct inode *hugetlbfs_get_inode(struct super_block *sb,
struct inode *dir,
umode_t mode, dev_t dev)
{
struct inode *inode;
inode = new_inode(sb);
if (inode) {
struct hugetlbfs_inode_info *info;
inode->i_ino = get_next_ino();
inode_init_owner(inode, dir, mode);
inode->i_mapping->a_ops = &hugetlbfs_aops;
inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
INIT_LIST_HEAD(&inode->i_mapping->private_list);
info = HUGETLBFS_I(inode);
/*
* The policy is initialized here even if we are creating a
* private inode because initialization simply creates an
* an empty rb tree and calls spin_lock_init(), later when we
* call mpol_free_shared_policy() it will just return because
* the rb tree will still be empty.
*/
mpol_shared_policy_init(&info->policy, NULL);
switch (mode & S_IFMT) {
default:
init_special_inode(inode, mode, dev);
break;
case S_IFREG:
inode->i_op = &hugetlbfs_inode_operations;
inode->i_fop = &hugetlbfs_file_operations;
break;
case S_IFDIR:
inode->i_op = &hugetlbfs_dir_inode_operations;
inode->i_fop = &simple_dir_operations;
/* directory inodes start off with i_nlink == 2 (for "." entry) */
inc_nlink(inode);
break;
case S_IFLNK:
inode->i_op = &page_symlink_inode_operations;
break;
}
lockdep_annotate_inode_mutex_key(inode);
}
return inode;
}
/*
* File creation. Allocate an inode, and we're done..
*/
static int hugetlbfs_mknod(struct inode *dir,
struct dentry *dentry, umode_t mode, dev_t dev)
{
struct inode *inode;
int error = -ENOSPC;
inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev);
if (inode) {
dir->i_ctime = dir->i_mtime = CURRENT_TIME;
d_instantiate(dentry, inode);
dget(dentry); /* Extra count - pin the dentry in core */
error = 0;
}
return error;
}
static int hugetlbfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
int retval = hugetlbfs_mknod(dir, dentry, mode | S_IFDIR, 0);
if (!retval)
inc_nlink(dir);
return retval;
}
static int hugetlbfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl)
{
return hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0);
}
static int hugetlbfs_symlink(struct inode *dir,
struct dentry *dentry, const char *symname)
{
struct inode *inode;
int error = -ENOSPC;
inode = hugetlbfs_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0);
if (inode) {
int l = strlen(symname)+1;
error = page_symlink(inode, symname, l);
if (!error) {
d_instantiate(dentry, inode);
dget(dentry);
} else
iput(inode);
}
dir->i_ctime = dir->i_mtime = CURRENT_TIME;
return error;
}
/*
* mark the head page dirty
*/
static int hugetlbfs_set_page_dirty(struct page *page)
{
struct page *head = compound_head(page);
SetPageDirty(head);
return 0;
}
static int hugetlbfs_migrate_page(struct address_space *mapping,
struct page *newpage, struct page *page,
enum migrate_mode mode)
{
int rc;
rc = migrate_huge_page_move_mapping(mapping, newpage, page);
if (rc != MIGRATEPAGE_SUCCESS)
return rc;
migrate_page_copy(newpage, page);
return MIGRATEPAGE_SUCCESS;
}
static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb);
struct hstate *h = hstate_inode(dentry->d_inode);
buf->f_type = HUGETLBFS_MAGIC;
buf->f_bsize = huge_page_size(h);
if (sbinfo) {
spin_lock(&sbinfo->stat_lock);
/* If no limits set, just report 0 for max/free/used
* blocks, like simple_statfs() */
if (sbinfo->spool) {
long free_pages;
spin_lock(&sbinfo->spool->lock);
buf->f_blocks = sbinfo->spool->max_hpages;
free_pages = sbinfo->spool->max_hpages
- sbinfo->spool->used_hpages;
buf->f_bavail = buf->f_bfree = free_pages;
spin_unlock(&sbinfo->spool->lock);
buf->f_files = sbinfo->max_inodes;
buf->f_ffree = sbinfo->free_inodes;
}
spin_unlock(&sbinfo->stat_lock);
}
buf->f_namelen = NAME_MAX;
return 0;
}
static void hugetlbfs_put_super(struct super_block *sb)
{
struct hugetlbfs_sb_info *sbi = HUGETLBFS_SB(sb);
if (sbi) {
sb->s_fs_info = NULL;
if (sbi->spool)
hugepage_put_subpool(sbi->spool);
kfree(sbi);
}
}
static inline int hugetlbfs_dec_free_inodes(struct hugetlbfs_sb_info *sbinfo)
{
if (sbinfo->free_inodes >= 0) {
spin_lock(&sbinfo->stat_lock);
if (unlikely(!sbinfo->free_inodes)) {
spin_unlock(&sbinfo->stat_lock);
return 0;
}
sbinfo->free_inodes--;
spin_unlock(&sbinfo->stat_lock);
}
return 1;
}
static void hugetlbfs_inc_free_inodes(struct hugetlbfs_sb_info *sbinfo)
{
if (sbinfo->free_inodes >= 0) {
spin_lock(&sbinfo->stat_lock);
sbinfo->free_inodes++;
spin_unlock(&sbinfo->stat_lock);
}
}
static struct kmem_cache *hugetlbfs_inode_cachep;
static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
{
struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb);
struct hugetlbfs_inode_info *p;
if (unlikely(!hugetlbfs_dec_free_inodes(sbinfo)))
return NULL;
p = kmem_cache_alloc(hugetlbfs_inode_cachep, GFP_KERNEL);
if (unlikely(!p)) {
hugetlbfs_inc_free_inodes(sbinfo);
return NULL;
}
return &p->vfs_inode;
}
static void hugetlbfs_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode));
}
static void hugetlbfs_destroy_inode(struct inode *inode)
{
hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb));
mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy);
call_rcu(&inode->i_rcu, hugetlbfs_i_callback);
}
static const struct address_space_operations hugetlbfs_aops = {
.write_begin = hugetlbfs_write_begin,
.write_end = hugetlbfs_write_end,
.set_page_dirty = hugetlbfs_set_page_dirty,
.migratepage = hugetlbfs_migrate_page,
};
static void init_once(void *foo)
{
struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo;
inode_init_once(&ei->vfs_inode);
}
const struct file_operations hugetlbfs_file_operations = {
.read = hugetlbfs_read,
.mmap = hugetlbfs_file_mmap,
.fsync = noop_fsync,
.get_unmapped_area = hugetlb_get_unmapped_area,
.llseek = default_llseek,
};
static const struct inode_operations hugetlbfs_dir_inode_operations = {
.create = hugetlbfs_create,
.lookup = simple_lookup,
.link = simple_link,
.unlink = simple_unlink,
.symlink = hugetlbfs_symlink,
.mkdir = hugetlbfs_mkdir,
.rmdir = simple_rmdir,
.mknod = hugetlbfs_mknod,
.rename = simple_rename,
.setattr = hugetlbfs_setattr,
};
static const struct inode_operations hugetlbfs_inode_operations = {
.setattr = hugetlbfs_setattr,
};
static const struct super_operations hugetlbfs_ops = {
.alloc_inode = hugetlbfs_alloc_inode,
.destroy_inode = hugetlbfs_destroy_inode,
.evict_inode = hugetlbfs_evict_inode,
.statfs = hugetlbfs_statfs,
.put_super = hugetlbfs_put_super,
.show_options = generic_show_options,
};
static int
hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
{
char *p, *rest;
substring_t args[MAX_OPT_ARGS];
int option;
unsigned long long size = 0;
enum { NO_SIZE, SIZE_STD, SIZE_PERCENT } setsize = NO_SIZE;
if (!options)
return 0;
while ((p = strsep(&options, ",")) != NULL) {
int token;
if (!*p)
continue;
token = match_token(p, tokens, args);
switch (token) {
case Opt_uid:
if (match_int(&args[0], &option))
goto bad_val;
pconfig->uid = make_kuid(current_user_ns(), option);
if (!uid_valid(pconfig->uid))
goto bad_val;
break;
case Opt_gid:
if (match_int(&args[0], &option))
goto bad_val;
pconfig->gid = make_kgid(current_user_ns(), option);
if (!gid_valid(pconfig->gid))
goto bad_val;
break;
case Opt_mode:
if (match_octal(&args[0], &option))
goto bad_val;
pconfig->mode = option & 01777U;
break;
case Opt_size: {
/* memparse() will accept a K/M/G without a digit */
if (!isdigit(*args[0].from))
goto bad_val;
size = memparse(args[0].from, &rest);
setsize = SIZE_STD;
if (*rest == '%')
setsize = SIZE_PERCENT;
break;
}
case Opt_nr_inodes:
/* memparse() will accept a K/M/G without a digit */
if (!isdigit(*args[0].from))
goto bad_val;
pconfig->nr_inodes = memparse(args[0].from, &rest);
break;
case Opt_pagesize: {
unsigned long ps;
ps = memparse(args[0].from, &rest);
pconfig->hstate = size_to_hstate(ps);
if (!pconfig->hstate) {
printk(KERN_ERR
"hugetlbfs: Unsupported page size %lu MB\n",
ps >> 20);
return -EINVAL;
}
break;
}
default:
printk(KERN_ERR "hugetlbfs: Bad mount option: \"%s\"\n",
p);
return -EINVAL;
break;
}
}
/* Do size after hstate is set up */
if (setsize > NO_SIZE) {
struct hstate *h = pconfig->hstate;
if (setsize == SIZE_PERCENT) {
size <<= huge_page_shift(h);
size *= h->max_huge_pages;
do_div(size, 100);
}
pconfig->nr_blocks = (size >> huge_page_shift(h));
}
return 0;
bad_val:
printk(KERN_ERR "hugetlbfs: Bad value '%s' for mount option '%s'\n",
args[0].from, p);
return -EINVAL;
}
static int
hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
{
int ret;
struct hugetlbfs_config config;
struct hugetlbfs_sb_info *sbinfo;
save_mount_options(sb, data);
config.nr_blocks = -1; /* No limit on size by default */
config.nr_inodes = -1; /* No limit on number of inodes by default */
config.uid = current_fsuid();
config.gid = current_fsgid();
config.mode = 0755;
config.hstate = &default_hstate;
ret = hugetlbfs_parse_options(data, &config);
if (ret)
return ret;
sbinfo = kmalloc(sizeof(struct hugetlbfs_sb_info), GFP_KERNEL);
if (!sbinfo)
return -ENOMEM;
sb->s_fs_info = sbinfo;
sbinfo->hstate = config.hstate;
spin_lock_init(&sbinfo->stat_lock);
sbinfo->max_inodes = config.nr_inodes;
sbinfo->free_inodes = config.nr_inodes;
sbinfo->spool = NULL;
if (config.nr_blocks != -1) {
sbinfo->spool = hugepage_new_subpool(config.nr_blocks);
if (!sbinfo->spool)
goto out_free;
}
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_blocksize = huge_page_size(config.hstate);
sb->s_blocksize_bits = huge_page_shift(config.hstate);
sb->s_magic = HUGETLBFS_MAGIC;
sb->s_op = &hugetlbfs_ops;
sb->s_time_gran = 1;
sb->s_root = d_make_root(hugetlbfs_get_root(sb, &config));
if (!sb->s_root)
goto out_free;
return 0;
out_free:
if (sbinfo->spool)
kfree(sbinfo->spool);
kfree(sbinfo);
return -ENOMEM;
}
static struct dentry *hugetlbfs_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
return mount_nodev(fs_type, flags, data, hugetlbfs_fill_super);
}
static struct file_system_type hugetlbfs_fs_type = {
.name = "hugetlbfs",
.mount = hugetlbfs_mount,
.kill_sb = kill_litter_super,
};
static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE];
static int can_do_hugetlb_shm(void)
{
kgid_t shm_group;
shm_group = make_kgid(&init_user_ns, sysctl_hugetlb_shm_group);
return capable(CAP_IPC_LOCK) || in_group_p(shm_group);
}
static int get_hstate_idx(int page_size_log)
{
struct hstate *h;
if (!page_size_log)
return default_hstate_idx;
h = size_to_hstate(1 << page_size_log);
if (!h)
return -1;
return h - hstates;
}
struct file *hugetlb_file_setup(const char *name, unsigned long addr,
size_t size, vm_flags_t acctflag,
struct user_struct **user,
int creat_flags, int page_size_log)
{
int error = -ENOMEM;
struct file *file;
struct inode *inode;
struct path path;
struct dentry *root;
struct qstr quick_string;
struct hstate *hstate;
unsigned long num_pages;
int hstate_idx;
hstate_idx = get_hstate_idx(page_size_log);
if (hstate_idx < 0)
return ERR_PTR(-ENODEV);
*user = NULL;
if (!hugetlbfs_vfsmount[hstate_idx])
return ERR_PTR(-ENOENT);
if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
*user = current_user();
if (user_shm_lock(size, *user)) {
task_lock(current);
printk_once(KERN_WARNING
"%s (%d): Using mlock ulimits for SHM_HUGETLB is deprecated\n",
current->comm, current->pid);
task_unlock(current);
} else {
*user = NULL;
return ERR_PTR(-EPERM);
}
}
root = hugetlbfs_vfsmount[hstate_idx]->mnt_root;
quick_string.name = name;
quick_string.len = strlen(quick_string.name);
quick_string.hash = 0;
path.dentry = d_alloc(root, &quick_string);
if (!path.dentry)
goto out_shm_unlock;
path.mnt = mntget(hugetlbfs_vfsmount[hstate_idx]);
error = -ENOSPC;
inode = hugetlbfs_get_inode(root->d_sb, NULL, S_IFREG | S_IRWXUGO, 0);
if (!inode)
goto out_dentry;
hstate = hstate_inode(inode);
size += addr & ~huge_page_mask(hstate);
num_pages = ALIGN(size, huge_page_size(hstate)) >>
huge_page_shift(hstate);
error = -ENOMEM;
if (hugetlb_reserve_pages(inode, 0, num_pages, NULL, acctflag))
goto out_inode;
d_instantiate(path.dentry, inode);
inode->i_size = size;
clear_nlink(inode);
error = -ENFILE;
file = alloc_file(&path, FMODE_WRITE | FMODE_READ,
&hugetlbfs_file_operations);
if (!file)
goto out_dentry; /* inode is already attached */
return file;
out_inode:
iput(inode);
out_dentry:
path_put(&path);
out_shm_unlock:
if (*user) {
user_shm_unlock(size, *user);
*user = NULL;
}
return ERR_PTR(error);
}
static int __init init_hugetlbfs_fs(void)
{
struct hstate *h;
int error;
int i;
error = bdi_init(&hugetlbfs_backing_dev_info);
if (error)
return error;
error = -ENOMEM;
hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache",
sizeof(struct hugetlbfs_inode_info),
0, 0, init_once);
if (hugetlbfs_inode_cachep == NULL)
goto out2;
error = register_filesystem(&hugetlbfs_fs_type);
if (error)
goto out;
i = 0;
for_each_hstate(h) {
char buf[50];
unsigned ps_kb = 1U << (h->order + PAGE_SHIFT - 10);
snprintf(buf, sizeof(buf), "pagesize=%uK", ps_kb);
hugetlbfs_vfsmount[i] = kern_mount_data(&hugetlbfs_fs_type,
buf);
if (IS_ERR(hugetlbfs_vfsmount[i])) {
pr_err("hugetlb: Cannot mount internal hugetlbfs for "
"page size %uK", ps_kb);
error = PTR_ERR(hugetlbfs_vfsmount[i]);
hugetlbfs_vfsmount[i] = NULL;
}
i++;
}
/* Non default hstates are optional */
if (!IS_ERR_OR_NULL(hugetlbfs_vfsmount[default_hstate_idx]))
return 0;
out:
kmem_cache_destroy(hugetlbfs_inode_cachep);
out2:
bdi_destroy(&hugetlbfs_backing_dev_info);
return error;
}
static void __exit exit_hugetlbfs_fs(void)
{
struct hstate *h;
int i;
/*
* Make sure all delayed rcu free inodes are flushed before we
* destroy cache.
*/
rcu_barrier();
kmem_cache_destroy(hugetlbfs_inode_cachep);
i = 0;
for_each_hstate(h)
kern_unmount(hugetlbfs_vfsmount[i++]);
unregister_filesystem(&hugetlbfs_fs_type);
bdi_destroy(&hugetlbfs_backing_dev_info);
}
module_init(init_hugetlbfs_fs)
module_exit(exit_hugetlbfs_fs)
MODULE_LICENSE("GPL");