// SPDX-License-Identifier: GPL-2.0
/*
 * Simple file system for zoned block devices exposing zones as files.
 *
 * Copyright (C) 2022 Western Digital Corporation or its affiliates.
 */
# include <linux/module.h>
# include <linux/pagemap.h>
# include <linux/iomap.h>
# include <linux/init.h>
# include <linux/slab.h>
# include <linux/blkdev.h>
# include <linux/statfs.h>
# include <linux/writeback.h>
# include <linux/quotaops.h>
# include <linux/seq_file.h>
# include <linux/parser.h>
# include <linux/uio.h>
# include <linux/mman.h>
# include <linux/sched/mm.h>
# include <linux/task_io_accounting_ops.h>
# include "zonefs.h"
# include "trace.h"
/*
 * iomap_begin callback used for reads: describe the extent containing @offset.
 *
 * The mapping starts at @offset rounded down to the super block block size.
 * If that start is below the current inode size, the extent is reported as
 * mapped up to the inode size, with a device address computed from the zone
 * start sector. Otherwise a hole of @length bytes is reported. The inode size
 * is sampled with the inode truncate mutex held.
 *
 * @flags and @srcmap are not used. Always returns 0.
 */
static int zonefs_read_iomap_begin(struct inode *inode, loff_t offset,
				   loff_t length, unsigned int flags,
				   struct iomap *iomap, struct iomap *srcmap)
{
	struct zonefs_inode_info *zi = ZONEFS_I(inode);
	struct zonefs_zone *z = zonefs_inode_zone(inode);
	struct super_block *sb = inode->i_sb;
	loff_t isize;

	/*
	 * All blocks are always mapped below EOF. If reading past EOF,
	 * act as if there is a hole up to the file maximum size.
	 */
	mutex_lock(&zi->i_truncate_mutex);
	iomap->bdev = inode->i_sb->s_bdev;
	iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize);
	isize = i_size_read(inode);
	if (iomap->offset >= isize) {
		iomap->type = IOMAP_HOLE;
		iomap->addr = IOMAP_NULL_ADDR;
		iomap->length = length;
	} else {
		iomap->type = IOMAP_MAPPED;
		/* Byte address on the device: zone start plus file offset */
		iomap->addr = (z->z_sector << SECTOR_SHIFT) + iomap->offset;
		iomap->length = isize - iomap->offset;
	}
	mutex_unlock(&zi->i_truncate_mutex);

	trace_zonefs_iomap_begin(inode, iomap);

	return 0;
}
/* iomap operations used by the read paths of this file (only iomap_begin). */
static const struct iomap_ops zonefs_read_iomap_ops = {
	.iomap_begin	= zonefs_read_iomap_begin,
};
/*
 * iomap_begin callback used for writes: describe the extent containing
 * @offset.
 *
 * Returns -EIO (with a one-time warning) if the range [@offset,
 * @offset + @length) goes beyond the zone capacity, or if a non-direct write
 * (IOMAP_DIRECT not set in @flags) targets a sequential zone. Otherwise fills
 * @iomap and returns 0: the mapping starts at @offset rounded down to the
 * block size and is reported as mapped up to the inode size, or as unwritten
 * up to the zone capacity when it starts at or beyond the inode size.
 *
 * @srcmap is not used.
 */
static int zonefs_write_iomap_begin(struct inode *inode, loff_t offset,
				    loff_t length, unsigned int flags,
				    struct iomap *iomap, struct iomap *srcmap)
{
	struct zonefs_inode_info *zi = ZONEFS_I(inode);
	struct zonefs_zone *z = zonefs_inode_zone(inode);
	struct super_block *sb = inode->i_sb;
	loff_t isize;

	/* All write I/Os should always be within the file maximum size */
	if (WARN_ON_ONCE(offset + length > z->z_capacity))
		return -EIO;

	/*
	 * Sequential zones can only accept direct writes. This is already
	 * checked when writes are issued, so warn if we see a page writeback
	 * operation.
	 */
	if (WARN_ON_ONCE(zonefs_zone_is_seq(z) && !(flags & IOMAP_DIRECT)))
		return -EIO;

	/*
	 * For conventional zones, all blocks are always mapped. For sequential
	 * zones, all blocks are always mapped below the inode size (zone
	 * write pointer) and unwritten beyond.
	 */
	mutex_lock(&zi->i_truncate_mutex);
	iomap->bdev = inode->i_sb->s_bdev;
	iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize);
	/* Byte address on the device: zone start plus file offset */
	iomap->addr = (z->z_sector << SECTOR_SHIFT) + iomap->offset;
	isize = i_size_read(inode);
	if (iomap->offset >= isize) {
		iomap->type = IOMAP_UNWRITTEN;
		iomap->length = z->z_capacity - iomap->offset;
	} else {
		iomap->type = IOMAP_MAPPED;
		iomap->length = isize - iomap->offset;
	}
	mutex_unlock(&zi->i_truncate_mutex);

	trace_zonefs_iomap_begin(inode, iomap);

	return 0;
}
/* iomap operations used by the write paths of this file (only iomap_begin). */
static const struct iomap_ops zonefs_write_iomap_ops = {
	.iomap_begin	= zonefs_write_iomap_begin,
};
/*
 * read_folio address space operation: read @folio through iomap using the
 * read mapping operations. The file argument is not used.
 */
static int zonefs_read_folio(struct file *unused, struct folio *folio)
{
	const struct iomap_ops *ops = &zonefs_read_iomap_ops;

	return iomap_read_folio(folio, ops);
}
/*
 * readahead address space operation: hand the readahead request @rac to iomap
 * together with the read mapping operations.
 */
static void zonefs_readahead(struct readahead_control *rac)
{
	const struct iomap_ops *ops = &zonefs_read_iomap_ops;

	iomap_readahead(rac, ops);
}
/*
 * Map blocks for page writeback. This is used only on conventional zone files,
 * which implies that the page range can only be within the fixed inode size.
 *
 * Returns -EIO (with a one-time warning) if the inode zone is sequential or
 * if @offset is at or beyond the inode size. Returns 0 without remapping when
 * @offset already falls inside the mapping cached in @wpc. Otherwise the
 * cached mapping is refreshed with zonefs_write_iomap_begin() for the range
 * from @offset to the zone capacity and its result is returned. @len is not
 * used.
 */
static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc,
				   struct inode *inode, loff_t offset,
				   unsigned int len)
{
	struct zonefs_zone *z = zonefs_inode_zone(inode);

	if (WARN_ON_ONCE(zonefs_zone_is_seq(z)))
		return -EIO;
	if (WARN_ON_ONCE(offset >= i_size_read(inode)))
		return -EIO;

	/* If the mapping is already OK, nothing needs to be done */
	if (offset >= wpc->iomap.offset &&
	    offset < wpc->iomap.offset + wpc->iomap.length)
		return 0;

	return zonefs_write_iomap_begin(inode, offset,
					z->z_capacity - offset,
					IOMAP_WRITE, &wpc->iomap, NULL);
}
/* iomap writeback operations: only the block mapping callback is provided. */
static const struct iomap_writeback_ops zonefs_writeback_ops = {
	.map_blocks	= zonefs_write_map_blocks,
};
static int zonefs_writepages ( struct address_space * mapping ,
struct writeback_control * wbc )
{
struct iomap_writepage_ctx wpc = { } ;
return iomap_writepages ( mapping , wbc , & wpc , & zonefs_writeback_ops ) ;
}
/*
 * swap_activate address space operation.
 *
 * Swap files are refused on sequential zone files: an error is logged and
 * -EINVAL is returned. For other files, activation is delegated to
 * iomap_swapfile_activate() using the read mapping operations and its result
 * is returned.
 */
static int zonefs_swap_activate(struct swap_info_struct *sis,
				struct file *swap_file, sector_t *span)
{
	struct inode *inode = file_inode(swap_file);

	if (zonefs_inode_is_seq(inode)) {
		zonefs_err(inode->i_sb,
			   "swap file: not a conventional zone file\n");
		return -EINVAL;
	}

	return iomap_swapfile_activate(sis, swap_file, span,
				       &zonefs_read_iomap_ops);
}
const struct address_space_operations zonefs_file_aops = {
. read_folio = zonefs_read_folio ,
. readahead = zonefs_readahead ,
. writepages = zonefs_writepages ,
2023-07-11 00:12:43 +03:00
. dirty_folio = iomap_dirty_folio ,
2022-11-25 03:39:33 +03:00
. release_folio = iomap_release_folio ,
. invalidate_folio = iomap_invalidate_folio ,
. migrate_folio = filemap_migrate_folio ,
. is_partially_uptodate = iomap_is_partially_uptodate ,
2023-11-17 19:14:47 +03:00
. error_remove_folio = generic_error_remove_folio ,
2022-11-25 03:39:33 +03:00
. swap_activate = zonefs_swap_activate ,
} ;
/*
 * Truncate a zone file to @isize.
 *
 * Only sequential zone files can be truncated, and only to 0 (executed as a
 * zone reset) or to the zone capacity (executed as a zone finish); any other
 * request returns -EPERM. The function waits for in-flight direct IOs, then
 * takes the mapping invalidate lock and the inode truncate mutex before
 * issuing the zone management operation and updating the inode size and the
 * zone write pointer offset.
 *
 * Returns 0 if the file already has the requested size or on success, or the
 * error returned by the zone management operation. Note that a failure to
 * re-open an explicitly opened zone after a reset is returned to the caller
 * after the size update has been applied.
 */
int zonefs_file_truncate(struct inode *inode, loff_t isize)
{
	struct zonefs_inode_info *zi = ZONEFS_I(inode);
	struct zonefs_zone *z = zonefs_inode_zone(inode);
	loff_t old_isize;
	enum req_op op;
	int ret = 0;

	/*
	 * Only sequential zone files can be truncated and truncation is allowed
	 * only down to a 0 size, which is equivalent to a zone reset, and to
	 * the maximum file size, which is equivalent to a zone finish.
	 */
	if (!zonefs_zone_is_seq(z))
		return -EPERM;

	if (!isize)
		op = REQ_OP_ZONE_RESET;
	else if (isize == z->z_capacity)
		op = REQ_OP_ZONE_FINISH;
	else
		return -EPERM;

	inode_dio_wait(inode);

	/* Serialize against page faults */
	filemap_invalidate_lock(inode->i_mapping);

	/* Serialize against zonefs_iomap_begin() */
	mutex_lock(&zi->i_truncate_mutex);

	old_isize = i_size_read(inode);
	if (isize == old_isize)
		goto unlock;

	ret = zonefs_inode_zone_mgmt(inode, op);
	if (ret)
		goto unlock;

	/*
	 * If the mount option ZONEFS_MNTOPT_EXPLICIT_OPEN is set,
	 * take care of open zones.
	 */
	if (z->z_flags & ZONEFS_ZONE_OPEN) {
		/*
		 * Truncating a zone to EMPTY or FULL is the equivalent of
		 * closing the zone. For a truncation to 0, we need to
		 * re-open the zone to ensure new writes can be processed.
		 * For a truncation to the maximum file size, the zone is
		 * closed and writes cannot be accepted anymore, so clear
		 * the open flag.
		 */
		if (!isize)
			ret = zonefs_inode_zone_mgmt(inode, REQ_OP_ZONE_OPEN);
		else
			z->z_flags &= ~ZONEFS_ZONE_OPEN;
	}

	zonefs_update_stats(inode, isize);
	truncate_setsize(inode, isize);
	z->z_wpoffset = isize;
	zonefs_inode_account_active(inode);

unlock:
	mutex_unlock(&zi->i_truncate_mutex);
	filemap_invalidate_unlock(inode->i_mapping);

	return ret;
}
/*
 * fsync file operation.
 *
 * Returns -EPERM for an immutable inode. For conventional zone files, the
 * page cache range [@start, @end] is written back and waited on first. If
 * that succeeded (or was not needed), a flush is issued to the block device.
 * Any error triggers zonefs_io_error() for the inode before being returned.
 * @datasync is not used.
 */
static int zonefs_file_fsync(struct file *file, loff_t start, loff_t end,
			     int datasync)
{
	struct inode *inode = file_inode(file);
	int ret = 0;

	if (unlikely(IS_IMMUTABLE(inode)))
		return -EPERM;

	/*
	 * Since only direct writes are allowed in sequential files, page cache
	 * flush is needed only for conventional zone files.
	 */
	if (zonefs_inode_is_cnv(inode))
		ret = file_write_and_wait_range(file, start, end);
	if (!ret)
		ret = blkdev_issue_flush(inode->i_sb->s_bdev);

	if (ret)
		zonefs_io_error(inode, true);

	return ret;
}
static vm_fault_t zonefs_filemap_page_mkwrite ( struct vm_fault * vmf )
{
struct inode * inode = file_inode ( vmf - > vma - > vm_file ) ;
vm_fault_t ret ;
if ( unlikely ( IS_IMMUTABLE ( inode ) ) )
return VM_FAULT_SIGBUS ;
/*
* Sanity check : only conventional zone files can have shared
* writeable mappings .
*/
2022-11-24 13:43:30 +03:00
if ( zonefs_inode_is_seq ( inode ) )
2022-11-25 03:39:33 +03:00
return VM_FAULT_NOPAGE ;
sb_start_pagefault ( inode - > i_sb ) ;
file_update_time ( vmf - > vma - > vm_file ) ;
/* Serialize against truncates */
filemap_invalidate_lock_shared ( inode - > i_mapping ) ;
ret = iomap_page_mkwrite ( vmf , & zonefs_write_iomap_ops ) ;
filemap_invalidate_unlock_shared ( inode - > i_mapping ) ;
sb_end_pagefault ( inode - > i_sb ) ;
return ret ;
}
/*
 * VM operations installed on zone file mappings: generic filemap handlers for
 * faults and page mapping, and the zonefs handler for write faults.
 */
static const struct vm_operations_struct zonefs_file_vm_ops = {
	.fault		= filemap_fault,
	.map_pages	= filemap_map_pages,
	.page_mkwrite	= zonefs_filemap_page_mkwrite,
};
static int zonefs_file_mmap ( struct file * file , struct vm_area_struct * vma )
{
/*
* Conventional zones accept random writes , so their files can support
* shared writable mappings . For sequential zone files , only read
* mappings are possible since there are no guarantees for write
* ordering between msync ( ) and page cache writeback .
*/
2022-11-24 13:43:30 +03:00
if ( zonefs_inode_is_seq ( file_inode ( file ) ) & &
2022-11-25 03:39:33 +03:00
( vma - > vm_flags & VM_SHARED ) & & ( vma - > vm_flags & VM_MAYWRITE ) )
return - EINVAL ;
file_accessed ( file ) ;
vma - > vm_ops = & zonefs_file_vm_ops ;
return 0 ;
}
/*
 * llseek file operation: seek with the current inode size used both as the
 * maximum offset and as the end-of-file position.
 */
static loff_t zonefs_file_llseek(struct file *file, loff_t offset, int whence)
{
	struct inode *inode = file_inode(file);
	const loff_t limit = i_size_read(inode);

	/*
	 * The inode size is the zone size for conventional zone files and the
	 * zone write pointer position for sequential zone files, so a single
	 * limit bounds seeks in both cases.
	 */
	return generic_file_llseek_size(file, offset, whence, limit, limit);
}
/*
 * Direct write completion callback.
 *
 * On error, error recovery (zonefs_io_error()) is run here only for
 * asynchronous IOs and @error is returned. On success of a non-empty write to
 * a sequential zone file, the inode size (and the statistics updated by
 * zonefs_update_stats()) is raised to the end of the write if it is currently
 * smaller, with the inode truncate mutex held. @flags is not used.
 *
 * Returns @error on failure and 0 otherwise.
 */
static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size,
					int error, unsigned int flags)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	struct zonefs_inode_info *zi = ZONEFS_I(inode);

	if (error) {
		/*
		 * For Sync IOs, error recovery is called from
		 * zonefs_file_dio_write().
		 */
		if (!is_sync_kiocb(iocb))
			zonefs_io_error(inode, true);
		return error;
	}

	if (size && zonefs_inode_is_seq(inode)) {
		/*
		 * Note that we may be seeing completions out of order,
		 * but that is not a problem since a write completed
		 * successfully necessarily means that all preceding writes
		 * were also successful. So we can safely increase the inode
		 * size to the write end location.
		 */
		mutex_lock(&zi->i_truncate_mutex);
		if (i_size_read(inode) < iocb->ki_pos + size) {
			zonefs_update_stats(inode, iocb->ki_pos + size);
			zonefs_i_size_write(inode, iocb->ki_pos + size);
		}
		mutex_unlock(&zi->i_truncate_mutex);
	}

	return 0;
}
2023-06-01 11:15:41 +03:00
static const struct iomap_dio_ops zonefs_write_dio_ops = {
. end_io = zonefs_file_write_dio_end_io ,
} ;
2022-11-25 03:39:33 +03:00
/*
* Do not exceed the LFS limits nor the file zone size . If pos is under the
* limit it becomes a short access . If it exceeds the limit , return - EFBIG .
*/
static loff_t zonefs_write_check_limits ( struct file * file , loff_t pos ,
loff_t count )
{
struct inode * inode = file_inode ( file ) ;
2022-11-16 12:15:40 +03:00
struct zonefs_zone * z = zonefs_inode_zone ( inode ) ;
2022-11-25 03:39:33 +03:00
loff_t limit = rlimit ( RLIMIT_FSIZE ) ;
2022-11-16 12:15:40 +03:00
loff_t max_size = z - > z_capacity ;
2022-11-25 03:39:33 +03:00
if ( limit ! = RLIM_INFINITY ) {
if ( pos > = limit ) {
send_sig ( SIGXFSZ , current , 0 ) ;
return - EFBIG ;
}
count = min ( count , limit - pos ) ;
}
if ( ! ( file - > f_flags & O_LARGEFILE ) )
max_size = min_t ( loff_t , MAX_NON_LFS , max_size ) ;
if ( unlikely ( pos > = max_size ) )
return - EFBIG ;
return min ( count , max_size - pos ) ;
}
static ssize_t zonefs_write_checks ( struct kiocb * iocb , struct iov_iter * from )
{
struct file * file = iocb - > ki_filp ;
struct inode * inode = file_inode ( file ) ;
struct zonefs_inode_info * zi = ZONEFS_I ( inode ) ;
2022-11-16 12:15:40 +03:00
struct zonefs_zone * z = zonefs_inode_zone ( inode ) ;
2022-11-25 03:39:33 +03:00
loff_t count ;
if ( IS_SWAPFILE ( inode ) )
return - ETXTBSY ;
if ( ! iov_iter_count ( from ) )
return 0 ;
if ( ( iocb - > ki_flags & IOCB_NOWAIT ) & & ! ( iocb - > ki_flags & IOCB_DIRECT ) )
return - EINVAL ;
if ( iocb - > ki_flags & IOCB_APPEND ) {
2022-11-16 12:15:40 +03:00
if ( zonefs_zone_is_cnv ( z ) )
2022-11-25 03:39:33 +03:00
return - EINVAL ;
mutex_lock ( & zi - > i_truncate_mutex ) ;
2022-11-16 12:15:40 +03:00
iocb - > ki_pos = z - > z_wpoffset ;
2022-11-25 03:39:33 +03:00
mutex_unlock ( & zi - > i_truncate_mutex ) ;
}
count = zonefs_write_check_limits ( file , iocb - > ki_pos ,
iov_iter_count ( from ) ) ;
if ( count < 0 )
return count ;
iov_iter_truncate ( from , count ) ;
return iov_iter_count ( from ) ;
}
/*
* Handle direct writes . For sequential zone files , this is the only possible
* write path . For these files , check that the user is issuing writes
* sequentially from the end of the file . This code assumes that the block layer
* delivers write requests to the device in sequential order . This is always the
* case if a block IO scheduler implementing the ELEVATOR_F_ZBD_SEQ_WRITE
* elevator feature is being used ( e . g . mq - deadline ) . The block layer always
* automatically select such an elevator for zoned block devices during the
* device initialization .
*/
static ssize_t zonefs_file_dio_write ( struct kiocb * iocb , struct iov_iter * from )
{
struct inode * inode = file_inode ( iocb - > ki_filp ) ;
struct zonefs_inode_info * zi = ZONEFS_I ( inode ) ;
2022-11-16 12:15:40 +03:00
struct zonefs_zone * z = zonefs_inode_zone ( inode ) ;
2022-11-25 03:39:33 +03:00
struct super_block * sb = inode - > i_sb ;
ssize_t ret , count ;
/*
* For async direct IOs to sequential zone files , refuse IOCB_NOWAIT
* as this can cause write reordering ( e . g . the first aio gets EAGAIN
* on the inode lock but the second goes through but is now unaligned ) .
*/
2023-08-07 07:11:48 +03:00
if ( zonefs_zone_is_seq ( z ) & & ! is_sync_kiocb ( iocb ) & &
( iocb - > ki_flags & IOCB_NOWAIT ) )
2022-11-25 03:39:33 +03:00
return - EOPNOTSUPP ;
if ( iocb - > ki_flags & IOCB_NOWAIT ) {
if ( ! inode_trylock ( inode ) )
return - EAGAIN ;
} else {
inode_lock ( inode ) ;
}
count = zonefs_write_checks ( iocb , from ) ;
if ( count < = 0 ) {
ret = count ;
goto inode_unlock ;
}
if ( ( iocb - > ki_pos | count ) & ( sb - > s_blocksize - 1 ) ) {
ret = - EINVAL ;
goto inode_unlock ;
}
/* Enforce sequential writes (append only) in sequential zones */
2022-11-16 12:15:40 +03:00
if ( zonefs_zone_is_seq ( z ) ) {
2022-11-25 03:39:33 +03:00
mutex_lock ( & zi - > i_truncate_mutex ) ;
2022-11-16 12:15:40 +03:00
if ( iocb - > ki_pos ! = z - > z_wpoffset ) {
2022-11-25 03:39:33 +03:00
mutex_unlock ( & zi - > i_truncate_mutex ) ;
ret = - EINVAL ;
goto inode_unlock ;
}
zonefs: Improve error handling
Write error handling is racy and can sometime lead to the error recovery
path wrongly changing the inode size of a sequential zone file to an
incorrect value which results in garbage data being readable at the end
of a file. There are 2 problems:
1) zonefs_file_dio_write() updates a zone file write pointer offset
after issuing a direct IO with iomap_dio_rw(). This update is done
only if the IO succeed for synchronous direct writes. However, for
asynchronous direct writes, the update is done without waiting for
the IO completion so that the next asynchronous IO can be
immediately issued. However, if an asynchronous IO completes with a
failure right before the i_truncate_mutex lock protecting the update,
the update may change the value of the inode write pointer offset
that was corrected by the error path (zonefs_io_error() function).
2) zonefs_io_error() is called when a read or write error occurs. This
function executes a report zone operation using the callback function
zonefs_io_error_cb(), which does all the error recovery handling
based on the current zone condition, write pointer position and
according to the mount options being used. However, depending on the
zoned device being used, a report zone callback may be executed in a
context that is different from the context of __zonefs_io_error(). As
a result, zonefs_io_error_cb() may be executed without the inode
truncate mutex lock held, which can lead to invalid error processing.
Fix both problems as follows:
- Problem 1: Perform the inode write pointer offset update before a
direct write is issued with iomap_dio_rw(). This is safe to do as
partial direct writes are not supported (IOMAP_DIO_PARTIAL is not
set) and any failed IO will trigger the execution of zonefs_io_error()
which will correct the inode write pointer offset to reflect the
current state of the one on the device.
- Problem 2: Change zonefs_io_error_cb() into zonefs_handle_io_error()
and call this function directly from __zonefs_io_error() after
obtaining the zone information using blkdev_report_zones() with a
simple callback function that copies to a local stack variable the
struct blk_zone obtained from the device. This ensures that error
handling is performed holding the inode truncate mutex.
This change also simplifies error handling for conventional zone files
by bypassing the execution of report zones entirely. This is safe to
do because the condition of conventional zones cannot be read-only or
offline and conventional zone files are always fully mapped with a
constant file size.
Reported-by: Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com>
Fixes: 8dcc1a9d90c1 ("fs: New zonefs file system")
Cc: stable@vger.kernel.org
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Tested-by: Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Himanshu Madhani <himanshu.madhani@oracle.com>
2024-02-08 11:26:59 +03:00
/*
* Advance the zone write pointer offset . This assumes that the
* IO will succeed , which is OK to do because we do not allow
* partial writes ( IOMAP_DIO_PARTIAL is not set ) and if the IO
* fails , the error path will correct the write pointer offset .
*/
z - > z_wpoffset + = count ;
zonefs_inode_account_active ( inode ) ;
2022-11-25 03:39:33 +03:00
mutex_unlock ( & zi - > i_truncate_mutex ) ;
2023-03-30 03:47:58 +03:00
}
2023-06-01 11:15:41 +03:00
/*
* iomap_dio_rw ( ) may return ENOTBLK if there was an issue with
* page invalidation . Overwrite that error code with EBUSY so that
* the user can make sense of the error .
*/
ret = iomap_dio_rw ( iocb , from , & zonefs_write_iomap_ops ,
2023-08-07 07:11:48 +03:00
& zonefs_write_dio_ops , 0 , NULL , 0 ) ;
2023-06-01 11:15:41 +03:00
if ( ret = = - ENOTBLK )
ret = - EBUSY ;
zonefs: Improve error handling
Write error handling is racy and can sometime lead to the error recovery
path wrongly changing the inode size of a sequential zone file to an
incorrect value which results in garbage data being readable at the end
of a file. There are 2 problems:
1) zonefs_file_dio_write() updates a zone file write pointer offset
after issuing a direct IO with iomap_dio_rw(). This update is done
only if the IO succeed for synchronous direct writes. However, for
asynchronous direct writes, the update is done without waiting for
the IO completion so that the next asynchronous IO can be
immediately issued. However, if an asynchronous IO completes with a
failure right before the i_truncate_mutex lock protecting the update,
the update may change the value of the inode write pointer offset
that was corrected by the error path (zonefs_io_error() function).
2) zonefs_io_error() is called when a read or write error occurs. This
function executes a report zone operation using the callback function
zonefs_io_error_cb(), which does all the error recovery handling
based on the current zone condition, write pointer position and
according to the mount options being used. However, depending on the
zoned device being used, a report zone callback may be executed in a
context that is different from the context of __zonefs_io_error(). As
a result, zonefs_io_error_cb() may be executed without the inode
truncate mutex lock held, which can lead to invalid error processing.
Fix both problems as follows:
- Problem 1: Perform the inode write pointer offset update before a
direct write is issued with iomap_dio_rw(). This is safe to do as
partial direct writes are not supported (IOMAP_DIO_PARTIAL is not
set) and any failed IO will trigger the execution of zonefs_io_error()
which will correct the inode write pointer offset to reflect the
current state of the one on the device.
- Problem 2: Change zonefs_io_error_cb() into zonefs_handle_io_error()
and call this function directly from __zonefs_io_error() after
obtaining the zone information using blkdev_report_zones() with a
simple callback function that copies to a local stack variable the
struct blk_zone obtained from the device. This ensures that error
handling is performed holding the inode truncate mutex.
This change also simplifies error handling for conventional zone files
by bypassing the execution of report zones entirely. This is safe to
do because the condition of conventional zones cannot be read-only or
offline and conventional zone files are always fully mapped with a
constant file size.
Reported-by: Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com>
Fixes: 8dcc1a9d90c1 ("fs: New zonefs file system")
Cc: stable@vger.kernel.org
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Tested-by: Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Himanshu Madhani <himanshu.madhani@oracle.com>
2024-02-08 11:26:59 +03:00
/*
* For a failed IO or partial completion , trigger error recovery
* to update the zone write pointer offset to a correct value .
* For asynchronous IOs , zonefs_file_write_dio_end_io ( ) may already
* have executed error recovery if the IO already completed when we
* reach here . However , we cannot know that and execute error recovery
* again ( that will not change anything ) .
*/
if ( zonefs_zone_is_seq ( z ) ) {
if ( ret > 0 & & ret ! = count )
ret = - EIO ;
if ( ret < 0 & & ret ! = - EIOCBQUEUED )
zonefs_io_error ( inode , true ) ;
2022-11-25 03:39:33 +03:00
}
inode_unlock :
inode_unlock ( inode ) ;
return ret ;
}
static ssize_t zonefs_file_buffered_write ( struct kiocb * iocb ,
struct iov_iter * from )
{
struct inode * inode = file_inode ( iocb - > ki_filp ) ;
ssize_t ret ;
/*
* Direct IO writes are mandatory for sequential zone files so that the
* write IO issuing order is preserved .
*/
2022-11-24 13:43:30 +03:00
if ( zonefs_inode_is_seq ( inode ) )
2022-11-25 03:39:33 +03:00
return - EIO ;
if ( iocb - > ki_flags & IOCB_NOWAIT ) {
if ( ! inode_trylock ( inode ) )
return - EAGAIN ;
} else {
inode_lock ( inode ) ;
}
ret = zonefs_write_checks ( iocb , from ) ;
if ( ret < = 0 )
goto inode_unlock ;
ret = iomap_file_buffered_write ( iocb , from , & zonefs_write_iomap_ops ) ;
2023-06-01 17:58:59 +03:00
if ( ret = = - EIO )
2022-11-25 03:39:33 +03:00
zonefs_io_error ( inode , true ) ;
inode_unlock :
inode_unlock ( inode ) ;
if ( ret > 0 )
ret = generic_write_sync ( iocb , ret ) ;
return ret ;
}
static ssize_t zonefs_file_write_iter ( struct kiocb * iocb , struct iov_iter * from )
{
struct inode * inode = file_inode ( iocb - > ki_filp ) ;
2022-11-16 12:15:40 +03:00
struct zonefs_zone * z = zonefs_inode_zone ( inode ) ;
2022-11-25 03:39:33 +03:00
if ( unlikely ( IS_IMMUTABLE ( inode ) ) )
return - EPERM ;
if ( sb_rdonly ( inode - > i_sb ) )
return - EROFS ;
2022-11-16 12:15:40 +03:00
/* Write operations beyond the zone capacity are not allowed */
if ( iocb - > ki_pos > = z - > z_capacity )
2022-11-25 03:39:33 +03:00
return - EFBIG ;
if ( iocb - > ki_flags & IOCB_DIRECT ) {
ssize_t ret = zonefs_file_dio_write ( iocb , from ) ;
if ( ret ! = - ENOTBLK )
return ret ;
}
return zonefs_file_buffered_write ( iocb , from ) ;
}
/*
 * Direct read completion callback: on error, run zonefs_io_error() for the
 * inode (with its second argument false) and return @error; otherwise return
 * 0. @size and @flags are not used.
 */
static int zonefs_file_read_dio_end_io(struct kiocb *iocb, ssize_t size,
				       int error, unsigned int flags)
{
	if (error) {
		zonefs_io_error(file_inode(iocb->ki_filp), false);
		return error;
	}

	return 0;
}
/* Direct IO operations for reads: only the completion callback is set. */
static const struct iomap_dio_ops zonefs_read_dio_ops = {
	.end_io		= zonefs_file_read_dio_end_io,
};
static ssize_t zonefs_file_read_iter ( struct kiocb * iocb , struct iov_iter * to )
{
struct inode * inode = file_inode ( iocb - > ki_filp ) ;
struct zonefs_inode_info * zi = ZONEFS_I ( inode ) ;
2022-11-16 12:15:40 +03:00
struct zonefs_zone * z = zonefs_inode_zone ( inode ) ;
2022-11-25 03:39:33 +03:00
struct super_block * sb = inode - > i_sb ;
loff_t isize ;
ssize_t ret ;
/* Offline zones cannot be read */
if ( unlikely ( IS_IMMUTABLE ( inode ) & & ! ( inode - > i_mode & 0777 ) ) )
return - EPERM ;
2022-11-16 12:15:40 +03:00
if ( iocb - > ki_pos > = z - > z_capacity )
2022-11-25 03:39:33 +03:00
return 0 ;
if ( iocb - > ki_flags & IOCB_NOWAIT ) {
if ( ! inode_trylock_shared ( inode ) )
return - EAGAIN ;
} else {
inode_lock_shared ( inode ) ;
}
/* Limit read operations to written data */
mutex_lock ( & zi - > i_truncate_mutex ) ;
isize = i_size_read ( inode ) ;
if ( iocb - > ki_pos > = isize ) {
mutex_unlock ( & zi - > i_truncate_mutex ) ;
ret = 0 ;
goto inode_unlock ;
}
iov_iter_truncate ( to , isize - iocb - > ki_pos ) ;
mutex_unlock ( & zi - > i_truncate_mutex ) ;
if ( iocb - > ki_flags & IOCB_DIRECT ) {
size_t count = iov_iter_count ( to ) ;
if ( ( iocb - > ki_pos | count ) & ( sb - > s_blocksize - 1 ) ) {
ret = - EINVAL ;
goto inode_unlock ;
}
file_accessed ( iocb - > ki_filp ) ;
ret = iomap_dio_rw ( iocb , to , & zonefs_read_iomap_ops ,
& zonefs_read_dio_ops , 0 , NULL , 0 ) ;
} else {
ret = generic_file_read_iter ( iocb , to ) ;
if ( ret = = - EIO )
zonefs_io_error ( inode , false ) ;
}
inode_unlock :
inode_unlock_shared ( inode ) ;
return ret ;
}
2023-05-22 16:50:12 +03:00
static ssize_t zonefs_file_splice_read ( struct file * in , loff_t * ppos ,
struct pipe_inode_info * pipe ,
size_t len , unsigned int flags )
{
struct inode * inode = file_inode ( in ) ;
struct zonefs_inode_info * zi = ZONEFS_I ( inode ) ;
struct zonefs_zone * z = zonefs_inode_zone ( inode ) ;
loff_t isize ;
ssize_t ret = 0 ;
/* Offline zones cannot be read */
if ( unlikely ( IS_IMMUTABLE ( inode ) & & ! ( inode - > i_mode & 0777 ) ) )
return - EPERM ;
if ( * ppos > = z - > z_capacity )
return 0 ;
inode_lock_shared ( inode ) ;
/* Limit read operations to written data */
mutex_lock ( & zi - > i_truncate_mutex ) ;
isize = i_size_read ( inode ) ;
if ( * ppos > = isize )
len = 0 ;
else
len = min_t ( loff_t , len , isize - * ppos ) ;
mutex_unlock ( & zi - > i_truncate_mutex ) ;
if ( len > 0 ) {
ret = filemap_splice_read ( in , ppos , pipe , len , flags ) ;
if ( ret = = - EIO )
zonefs_io_error ( inode , false ) ;
}
inode_unlock_shared ( inode ) ;
return ret ;
}
2022-11-25 03:39:33 +03:00
/*
* Write open accounting is done only for sequential files .
*/
static inline bool zonefs_seq_file_need_wro ( struct inode * inode ,
struct file * file )
{
2022-11-24 13:43:30 +03:00
if ( zonefs_inode_is_cnv ( inode ) )
2022-11-25 03:39:33 +03:00
return false ;
if ( ! ( file - > f_mode & FMODE_WRITE ) )
return false ;
return true ;
}
static int zonefs_seq_file_write_open ( struct inode * inode )
{
struct zonefs_inode_info * zi = ZONEFS_I ( inode ) ;
2022-11-16 12:15:40 +03:00
struct zonefs_zone * z = zonefs_inode_zone ( inode ) ;
2022-11-25 03:39:33 +03:00
int ret = 0 ;
mutex_lock ( & zi - > i_truncate_mutex ) ;
if ( ! zi - > i_wr_refcnt ) {
struct zonefs_sb_info * sbi = ZONEFS_SB ( inode - > i_sb ) ;
unsigned int wro = atomic_inc_return ( & sbi - > s_wro_seq_files ) ;
if ( sbi - > s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN ) {
if ( sbi - > s_max_wro_seq_files
& & wro > sbi - > s_max_wro_seq_files ) {
atomic_dec ( & sbi - > s_wro_seq_files ) ;
ret = - EBUSY ;
goto unlock ;
}
2022-11-16 12:15:40 +03:00
if ( i_size_read ( inode ) < z - > z_capacity ) {
ret = zonefs_inode_zone_mgmt ( inode ,
REQ_OP_ZONE_OPEN ) ;
2022-11-25 03:39:33 +03:00
if ( ret ) {
atomic_dec ( & sbi - > s_wro_seq_files ) ;
goto unlock ;
}
2022-11-16 12:15:40 +03:00
z - > z_flags | = ZONEFS_ZONE_OPEN ;
zonefs_inode_account_active ( inode ) ;
2022-11-25 03:39:33 +03:00
}
}
}
zi - > i_wr_refcnt + + ;
unlock :
mutex_unlock ( & zi - > i_truncate_mutex ) ;
return ret ;
}
static int zonefs_file_open ( struct inode * inode , struct file * file )
{
int ret ;
2023-06-12 08:35:15 +03:00
file - > f_mode | = FMODE_CAN_ODIRECT ;
2022-11-25 03:39:33 +03:00
ret = generic_file_open ( inode , file ) ;
if ( ret )
return ret ;
if ( zonefs_seq_file_need_wro ( inode , file ) )
return zonefs_seq_file_write_open ( inode ) ;
return 0 ;
}
static void zonefs_seq_file_write_close ( struct inode * inode )
{
struct zonefs_inode_info * zi = ZONEFS_I ( inode ) ;
2022-11-16 12:15:40 +03:00
struct zonefs_zone * z = zonefs_inode_zone ( inode ) ;
2022-11-25 03:39:33 +03:00
struct super_block * sb = inode - > i_sb ;
struct zonefs_sb_info * sbi = ZONEFS_SB ( sb ) ;
int ret = 0 ;
mutex_lock ( & zi - > i_truncate_mutex ) ;
zi - > i_wr_refcnt - - ;
if ( zi - > i_wr_refcnt )
goto unlock ;
/*
* The file zone may not be open anymore ( e . g . the file was truncated to
* its maximum size or it was fully written ) . For this case , we only
* need to decrement the write open count .
*/
2022-11-16 12:15:40 +03:00
if ( z - > z_flags & ZONEFS_ZONE_OPEN ) {
ret = zonefs_inode_zone_mgmt ( inode , REQ_OP_ZONE_CLOSE ) ;
2022-11-25 03:39:33 +03:00
if ( ret ) {
__zonefs_io_error ( inode , false ) ;
/*
* Leaving zones explicitly open may lead to a state
* where most zones cannot be written ( zone resources
* exhausted ) . So take preventive action by remounting
* read - only .
*/
2022-11-16 12:15:40 +03:00
if ( z - > z_flags & ZONEFS_ZONE_OPEN & &
2022-11-25 03:39:33 +03:00
! ( sb - > s_flags & SB_RDONLY ) ) {
zonefs_warn ( sb ,
" closing zone at %llu failed %d \n " ,
2022-11-16 12:15:40 +03:00
z - > z_sector , ret ) ;
2022-11-25 03:39:33 +03:00
zonefs_warn ( sb ,
" remounting filesystem read-only \n " ) ;
sb - > s_flags | = SB_RDONLY ;
}
goto unlock ;
}
2022-11-16 12:15:40 +03:00
z - > z_flags & = ~ ZONEFS_ZONE_OPEN ;
zonefs_inode_account_active ( inode ) ;
2022-11-25 03:39:33 +03:00
}
atomic_dec ( & sbi - > s_wro_seq_files ) ;
unlock :
mutex_unlock ( & zi - > i_truncate_mutex ) ;
}
static int zonefs_file_release ( struct inode * inode , struct file * file )
{
/*
* If we explicitly open a zone we must close it again as well , but the
* zone management operation can fail ( either due to an IO error or as
* the zone has gone offline or read - only ) . Make sure we don ' t fail the
* close ( 2 ) for user - space .
*/
if ( zonefs_seq_file_need_wro ( inode , file ) )
zonefs_seq_file_write_close ( inode ) ;
return 0 ;
}
const struct file_operations zonefs_file_operations = {
. open = zonefs_file_open ,
. release = zonefs_file_release ,
. fsync = zonefs_file_fsync ,
. mmap = zonefs_file_mmap ,
. llseek = zonefs_file_llseek ,
. read_iter = zonefs_file_read_iter ,
. write_iter = zonefs_file_write_iter ,
2023-05-22 16:50:12 +03:00
. splice_read = zonefs_file_splice_read ,
2022-11-25 03:39:33 +03:00
. splice_write = iter_file_splice_write ,
. iopoll = iocb_bio_iopoll ,
} ;