dax: rip out get_block based IO support
No one uses functions using the get_block callback anymore. Rip them out
and update documentation.

Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
parent 00697eed38
commit dd936e4313
Documentation/filesystems/dax.txt
@@ -58,22 +58,22 @@ Implementation Tips for Filesystem Writers
 Filesystem support consists of
 - adding support to mark inodes as being DAX by setting the S_DAX flag in
   i_flags
-- implementing the direct_IO address space operation, and calling
-  dax_do_io() instead of blockdev_direct_IO() if S_DAX is set
+- implementing ->read_iter and ->write_iter operations which use dax_iomap_rw()
+  when inode has S_DAX flag set
 - implementing an mmap file operation for DAX files which sets the
   VM_MIXEDMAP and VM_HUGEPAGE flags on the VMA, and setting the vm_ops to
-  include handlers for fault, pmd_fault and page_mkwrite (which should
-  probably call dax_fault(), dax_pmd_fault() and dax_mkwrite(), passing the
-  appropriate get_block() callback)
-- calling dax_truncate_page() instead of block_truncate_page() for DAX files
-- calling dax_zero_page_range() instead of zero_user() for DAX files
+  include handlers for fault, pmd_fault, page_mkwrite, pfn_mkwrite. These
+  handlers should probably call dax_iomap_fault() (for fault and page_mkwrite
+  handlers), dax_iomap_pmd_fault(), dax_pfn_mkwrite() passing the appropriate
+  iomap operations.
+- calling iomap_zero_range() passing appropriate iomap operations instead of
+  block_truncate_page() for DAX files
 - ensuring that there is sufficient locking between reads, writes,
   truncates and page faults
 
-The get_block() callback passed to the DAX functions may return
-uninitialised extents. If it does, it must ensure that simultaneous
-calls to get_block() (for example by a page-fault racing with a read()
-or a write()) work correctly.
+The iomap handlers for allocating blocks must make sure that allocated blocks
+are zeroed out and converted to written extents before being returned to avoid
+exposure of uninitialized data through mmap.
 
 These filesystems may be used for inspiration:
 - ext2: see Documentation/filesystems/ext2.txt
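Note: as an illustration of the ->read_iter wiring the updated documentation
describes, here is a minimal sketch only, loosely following what ext2/ext4 do
after this conversion. dax_iomap_rw() is the real helper; the myfs_* names and
myfs_iomap_ops are hypothetical:

static ssize_t myfs_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
    struct inode *inode = file_inode(iocb->ki_filp);
    ssize_t ret;

    if (!iov_iter_count(to))
        return 0;    /* skip atime update for zero-length reads */

    inode_lock_shared(inode);
    /* dax_iomap_rw() maps file offsets via the iomap ops, not get_block */
    ret = dax_iomap_rw(iocb, to, &myfs_iomap_ops);
    inode_unlock_shared(inode);

    file_accessed(iocb->ki_filp);
    return ret;
}

The write side looks the same except that it takes inode_lock() exclusively
before calling dax_iomap_rw() with the write-direction iterator.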
fs/dax.c (315 deletions)
@@ -116,168 +116,6 @@ struct page *read_dax_sector(struct block_device *bdev, sector_t n)
 	return page;
 }
 
-static bool buffer_written(struct buffer_head *bh)
-{
-    return buffer_mapped(bh) && !buffer_unwritten(bh);
-}
-
-static sector_t to_sector(const struct buffer_head *bh,
-        const struct inode *inode)
-{
-    sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
-
-    return sector;
-}
-
-static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
-              loff_t start, loff_t end, get_block_t get_block,
-              struct buffer_head *bh)
-{
-    loff_t pos = start, max = start, bh_max = start;
-    bool hole = false;
-    struct block_device *bdev = NULL;
-    int rw = iov_iter_rw(iter), rc;
-    long map_len = 0;
-    struct blk_dax_ctl dax = {
-        .addr = ERR_PTR(-EIO),
-    };
-    unsigned blkbits = inode->i_blkbits;
-    sector_t file_blks = (i_size_read(inode) + (1 << blkbits) - 1)
-                                >> blkbits;
-
-    if (rw == READ)
-        end = min(end, i_size_read(inode));
-
-    while (pos < end) {
-        size_t len;
-        if (pos == max) {
-            long page = pos >> PAGE_SHIFT;
-            sector_t block = page << (PAGE_SHIFT - blkbits);
-            unsigned first = pos - (block << blkbits);
-            long size;
-
-            if (pos == bh_max) {
-                bh->b_size = PAGE_ALIGN(end - pos);
-                bh->b_state = 0;
-                rc = get_block(inode, block, bh, rw == WRITE);
-                if (rc)
-                    break;
-                bh_max = pos - first + bh->b_size;
-                bdev = bh->b_bdev;
-                /*
-                 * We allow uninitialized buffers for writes
-                 * beyond EOF as those cannot race with faults
-                 */
-                WARN_ON_ONCE(
-                    (buffer_new(bh) && block < file_blks) ||
-                    (rw == WRITE && buffer_unwritten(bh)));
-            } else {
-                unsigned done = bh->b_size -
-                        (bh_max - (pos - first));
-                bh->b_blocknr += done >> blkbits;
-                bh->b_size -= done;
-            }
-
-            hole = rw == READ && !buffer_written(bh);
-            if (hole) {
-                size = bh->b_size - first;
-            } else {
-                dax_unmap_atomic(bdev, &dax);
-                dax.sector = to_sector(bh, inode);
-                dax.size = bh->b_size;
-                map_len = dax_map_atomic(bdev, &dax);
-                if (map_len < 0) {
-                    rc = map_len;
-                    break;
-                }
-                dax.addr += first;
-                size = map_len - first;
-            }
-            /*
-             * pos + size is one past the last offset for IO,
-             * so pos + size can overflow loff_t at extreme offsets.
-             * Cast to u64 to catch this and get the true minimum.
-             */
-            max = min_t(u64, pos + size, end);
-        }
-
-        if (iov_iter_rw(iter) == WRITE) {
-            len = copy_from_iter_pmem(dax.addr, max - pos, iter);
-        } else if (!hole)
-            len = copy_to_iter((void __force *) dax.addr, max - pos,
-                    iter);
-        else
-            len = iov_iter_zero(max - pos, iter);
-
-        if (!len) {
-            rc = -EFAULT;
-            break;
-        }
-
-        pos += len;
-        if (!IS_ERR(dax.addr))
-            dax.addr += len;
-    }
-
-    dax_unmap_atomic(bdev, &dax);
-
-    return (pos == start) ? rc : pos - start;
-}
-
-/**
- * dax_do_io - Perform I/O to a DAX file
- * @iocb: The control block for this I/O
- * @inode: The file which the I/O is directed at
- * @iter: The addresses to do I/O from or to
- * @get_block: The filesystem method used to translate file offsets to blocks
- * @end_io: A filesystem callback for I/O completion
- * @flags: See below
- *
- * This function uses the same locking scheme as do_blockdev_direct_IO:
- * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the
- * caller for writes. For reads, we take and release the i_mutex ourselves.
- * If DIO_LOCKING is not set, the filesystem takes care of its own locking.
- * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O
- * is in progress.
- */
-ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
-          struct iov_iter *iter, get_block_t get_block,
-          dio_iodone_t end_io, int flags)
-{
-    struct buffer_head bh;
-    ssize_t retval = -EINVAL;
-    loff_t pos = iocb->ki_pos;
-    loff_t end = pos + iov_iter_count(iter);
-
-    memset(&bh, 0, sizeof(bh));
-    bh.b_bdev = inode->i_sb->s_bdev;
-
-    if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
-        inode_lock(inode);
-
-    /* Protects against truncate */
-    if (!(flags & DIO_SKIP_DIO_COUNT))
-        inode_dio_begin(inode);
-
-    retval = dax_io(inode, iter, pos, end, get_block, &bh);
-
-    if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
-        inode_unlock(inode);
-
-    if (end_io) {
-        int err;
-
-        err = end_io(iocb, pos, retval, bh.b_private);
-        if (err)
-            retval = err;
-    }
-
-    if (!(flags & DIO_SKIP_DIO_COUNT))
-        inode_dio_end(inode);
-    return retval;
-}
-EXPORT_SYMBOL_GPL(dax_do_io);
-
 /*
  * DAX radix tree locking
  */
@@ -919,105 +757,6 @@ static int dax_insert_mapping(struct address_space *mapping,
 	return vm_insert_mixed(vma, vaddr, dax.pfn);
 }
 
-/**
- * dax_fault - handle a page fault on a DAX file
- * @vma: The virtual memory area where the fault occurred
- * @vmf: The description of the fault
- * @get_block: The filesystem method used to translate file offsets to blocks
- *
- * When a page fault occurs, filesystems may call this helper in their
- * fault handler for DAX files. dax_fault() assumes the caller has done all
- * the necessary locking for the page fault to proceed successfully.
- */
-int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
-            get_block_t get_block)
-{
-    struct file *file = vma->vm_file;
-    struct address_space *mapping = file->f_mapping;
-    struct inode *inode = mapping->host;
-    void *entry;
-    struct buffer_head bh;
-    unsigned long vaddr = (unsigned long)vmf->virtual_address;
-    unsigned blkbits = inode->i_blkbits;
-    sector_t block;
-    pgoff_t size;
-    int error;
-    int major = 0;
-
-    /*
-     * Check whether offset isn't beyond end of file now. Caller is supposed
-     * to hold locks serializing us with truncate / punch hole so this is
-     * a reliable test.
-     */
-    size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
-    if (vmf->pgoff >= size)
-        return VM_FAULT_SIGBUS;
-
-    memset(&bh, 0, sizeof(bh));
-    block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
-    bh.b_bdev = inode->i_sb->s_bdev;
-    bh.b_size = PAGE_SIZE;
-
-    entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
-    if (IS_ERR(entry)) {
-        error = PTR_ERR(entry);
-        goto out;
-    }
-
-    error = get_block(inode, block, &bh, 0);
-    if (!error && (bh.b_size < PAGE_SIZE))
-        error = -EIO;        /* fs corruption? */
-    if (error)
-        goto unlock_entry;
-
-    if (vmf->cow_page) {
-        struct page *new_page = vmf->cow_page;
-        if (buffer_written(&bh))
-            error = copy_user_dax(bh.b_bdev, to_sector(&bh, inode),
-                    bh.b_size, new_page, vaddr);
-        else
-            clear_user_highpage(new_page, vaddr);
-        if (error)
-            goto unlock_entry;
-        if (!radix_tree_exceptional_entry(entry)) {
-            vmf->page = entry;
-            return VM_FAULT_LOCKED;
-        }
-        vmf->entry = entry;
-        return VM_FAULT_DAX_LOCKED;
-    }
-
-    if (!buffer_mapped(&bh)) {
-        if (vmf->flags & FAULT_FLAG_WRITE) {
-            error = get_block(inode, block, &bh, 1);
-            count_vm_event(PGMAJFAULT);
-            mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
-            major = VM_FAULT_MAJOR;
-            if (!error && (bh.b_size < PAGE_SIZE))
-                error = -EIO;
-            if (error)
-                goto unlock_entry;
-        } else {
-            return dax_load_hole(mapping, entry, vmf);
-        }
-    }
-
-    /* Filesystem should not return unwritten buffers to us! */
-    WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
-    error = dax_insert_mapping(mapping, bh.b_bdev, to_sector(&bh, inode),
-            bh.b_size, &entry, vma, vmf);
- unlock_entry:
-    put_locked_mapping_entry(mapping, vmf->pgoff, entry);
- out:
-    if (error == -ENOMEM)
-        return VM_FAULT_OOM | major;
-    /* -EBUSY is fine, somebody else faulted on the same PTE */
-    if ((error < 0) && (error != -EBUSY))
-        return VM_FAULT_SIGBUS | major;
-    return VM_FAULT_NOPAGE | major;
-}
-EXPORT_SYMBOL_GPL(dax_fault);
-
 /**
  * dax_pfn_mkwrite - handle first write to DAX page
  * @vma: The virtual memory area where the fault occurred
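Note: the removed dax_fault() above is replaced by dax_iomap_fault(). A hedged
sketch of a fault handler built on it, modeled on ext2's handler in this
series; MYFS_I(), the dax_sem fault-vs-truncate rwsem and myfs_iomap_ops are
hypothetical:

static int myfs_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
    struct inode *inode = file_inode(vma->vm_file);
    int ret;

    if (vmf->flags & FAULT_FLAG_WRITE) {
        sb_start_pagefault(inode->i_sb);
        file_update_time(vma->vm_file);
    }
    /* Hypothetical rwsem serializing faults against truncate/punch hole */
    down_read(&MYFS_I(inode)->dax_sem);

    /* The iomap ops replace the get_block callback the old API took */
    ret = dax_iomap_fault(vma, vmf, &myfs_iomap_ops);

    up_read(&MYFS_I(inode)->dax_sem);
    if (vmf->flags & FAULT_FLAG_WRITE)
        sb_end_pagefault(inode->i_sb);
    return ret;
}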
@@ -1078,60 +817,6 @@ int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
 }
 EXPORT_SYMBOL_GPL(__dax_zero_page_range);
 
-/**
- * dax_zero_page_range - zero a range within a page of a DAX file
- * @inode: The file being truncated
- * @from: The file offset that is being truncated to
- * @length: The number of bytes to zero
- * @get_block: The filesystem method used to translate file offsets to blocks
- *
- * This function can be called by a filesystem when it is zeroing part of a
- * page in a DAX file. This is intended for hole-punch operations. If
- * you are truncating a file, the helper function dax_truncate_page() may be
- * more convenient.
- */
-int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
-                            get_block_t get_block)
-{
-    struct buffer_head bh;
-    pgoff_t index = from >> PAGE_SHIFT;
-    unsigned offset = from & (PAGE_SIZE-1);
-    int err;
-
-    /* Block boundary? Nothing to do */
-    if (!length)
-        return 0;
-    if (WARN_ON_ONCE((offset + length) > PAGE_SIZE))
-        return -EINVAL;
-
-    memset(&bh, 0, sizeof(bh));
-    bh.b_bdev = inode->i_sb->s_bdev;
-    bh.b_size = PAGE_SIZE;
-    err = get_block(inode, index, &bh, 0);
-    if (err < 0 || !buffer_written(&bh))
-        return err;
-
-    return __dax_zero_page_range(bh.b_bdev, to_sector(&bh, inode),
-            offset, length);
-}
-EXPORT_SYMBOL_GPL(dax_zero_page_range);
-
-/**
- * dax_truncate_page - handle a partial page being truncated in a DAX file
- * @inode: The file being truncated
- * @from: The file offset that is being truncated to
- * @get_block: The filesystem method used to translate file offsets to blocks
- *
- * Similar to block_truncate_page(), this function can be called by a
- * filesystem when it is truncating a DAX file to handle the partial page.
- */
-int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
-{
-    unsigned length = PAGE_ALIGN(from) - from;
-    return dax_zero_page_range(inode, from, length, get_block);
-}
-EXPORT_SYMBOL_GPL(dax_truncate_page);
-
 #ifdef CONFIG_FS_IOMAP
 static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
 {
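Note: with dax_zero_page_range()/dax_truncate_page() gone, zeroing the partial
page at the new EOF goes through iomap_zero_range(), as the updated
documentation says. A minimal sketch under the same hypothetical
myfs_iomap_ops; the helper name is made up:

static int myfs_dax_truncate_page(struct inode *inode, loff_t from)
{
    unsigned int offset = from & (PAGE_SIZE - 1);

    /* Nothing to zero when the new size is page aligned */
    if (!offset)
        return 0;

    /*
     * Zero [from, end of page). Passing NULL for did_zero means we do
     * not care whether anything was actually zeroed. The iomap ops must
     * return written, pre-zeroed extents (see the documentation change).
     */
    return iomap_zero_range(inode, from, PAGE_SIZE - offset, NULL,
                &myfs_iomap_ops);
}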
include/linux/dax.h
@@ -38,13 +38,8 @@ static inline void *dax_radix_locked_entry(sector_t sector, unsigned long flags)
 
 ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
 		struct iomap_ops *ops);
-ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *,
-		get_block_t, dio_iodone_t, int flags);
-int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
-int dax_truncate_page(struct inode *, loff_t from, get_block_t);
 int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		struct iomap_ops *ops);
-int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
 int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
 void dax_wake_mapping_entry_waiter(struct address_space *mapping,
 		pgoff_t index, void *entry, bool wake_all);
@@ -73,12 +68,6 @@ static inline int __dax_zero_page_range(struct block_device *bdev,
 }
 #endif
 
-static inline int dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
-				pmd_t *pmd, unsigned int flags, get_block_t gb)
-{
-	return VM_FAULT_FALLBACK;
-}
-
 #ifdef CONFIG_FS_DAX_PMD
 static inline unsigned int dax_radix_order(void *entry)
 {
@@ -101,7 +90,6 @@ static inline int dax_iomap_pmd_fault(struct vm_area_struct *vma,
 }
 #endif
 int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *);
-#define dax_mkwrite(vma, vmf, gb)	dax_fault(vma, vmf, gb)
 
 static inline bool vma_is_dax(struct vm_area_struct *vma)
 {