2019-05-19 15:08:55 +03:00
// SPDX-License-Identifier: GPL-2.0-only
2005-04-17 02:20:36 +04:00
/*
* Copyright ( C ) 1991 , 1992 Linus Torvalds
* Copyright ( C ) 2001 Andrea Arcangeli < andrea @ suse . de > SuSE
2020-12-10 10:55:44 +03:00
* Copyright ( C ) 2016 - 2020 Christoph Hellwig
2005-04-17 02:20:36 +04:00
*/
# include <linux/init.h>
# include <linux/mm.h>
# include <linux/slab.h>
# include <linux/kmod.h>
# include <linux/major.h>
2008-06-06 09:46:27 +04:00
# include <linux/device_cgroup.h>
2005-04-17 02:20:36 +04:00
# include <linux/blkdev.h>
2021-09-20 15:33:27 +03:00
# include <linux/blk-integrity.h>
2015-05-23 00:13:32 +03:00
# include <linux/backing-dev.h>
2005-04-17 02:20:36 +04:00
# include <linux/module.h>
# include <linux/blkpg.h>
2012-03-24 02:01:50 +04:00
# include <linux/magic.h>
2005-04-17 02:20:36 +04:00
# include <linux/buffer_head.h>
2011-09-16 10:31:11 +04:00
# include <linux/swap.h>
2006-08-29 22:06:09 +04:00
# include <linux/writeback.h>
2005-04-17 02:20:36 +04:00
# include <linux/mount.h>
2019-03-25 19:38:23 +03:00
# include <linux/pseudo_fs.h>
2005-04-17 02:20:36 +04:00
# include <linux/uio.h>
# include <linux/namei.h>
2011-09-16 10:31:11 +04:00
# include <linux/cleancache.h>
2020-11-24 11:34:00 +03:00
# include <linux/part_stat.h>
2016-12-24 22:46:01 +03:00
# include <linux/uaccess.h>
2021-09-07 17:13:03 +03:00
# include "../fs/internal.h"
# include "blk.h"
2005-04-17 02:20:36 +04:00
/*
 * Pairs a block_device with the VFS inode that represents it.  Both live in
 * one allocation; BDEV_I() and I_BDEV() convert between the two members.
 */
struct bdev_inode {
struct block_device bdev ;
struct inode vfs_inode ;
} ;
/*
 * Recover the enclosing bdev_inode from its embedded VFS inode.
 * @inode must be the vfs_inode member of a struct bdev_inode.
 */
static inline struct bdev_inode *BDEV_I(struct inode *inode)
{
	struct bdev_inode *bi = container_of(inode, struct bdev_inode, vfs_inode);

	return bi;
}
2015-06-26 14:58:32 +03:00
struct block_device * I_BDEV ( struct inode * inode )
2005-04-17 02:20:36 +04:00
{
return & BDEV_I ( inode ) - > bdev ;
}
EXPORT_SYMBOL ( I_BDEV ) ;
2015-11-09 19:23:40 +03:00
static void bdev_write_inode ( struct block_device * bdev )
2015-01-14 12:42:33 +03:00
{
2015-11-09 19:23:40 +03:00
struct inode * inode = bdev - > bd_inode ;
int ret ;
2015-01-14 12:42:33 +03:00
spin_lock ( & inode - > i_lock ) ;
while ( inode - > i_state & I_DIRTY ) {
spin_unlock ( & inode - > i_lock ) ;
2015-11-09 19:23:40 +03:00
ret = write_inode_now ( inode , true ) ;
if ( ret ) {
char name [ BDEVNAME_SIZE ] ;
pr_warn_ratelimited ( " VFS: Dirty inode writeback failed "
" for block device %s (err=%d). \n " ,
bdevname ( bdev , name ) , ret ) ;
}
2015-01-14 12:42:33 +03:00
spin_lock ( & inode - > i_lock ) ;
}
spin_unlock ( & inode - > i_lock ) ;
}
2007-05-07 01:49:55 +04:00
/* Kill _all_ buffers and pagecache , dirty or not.. */
2020-06-18 07:21:38 +03:00
static void kill_bdev ( struct block_device * bdev )
2005-04-17 02:20:36 +04:00
{
2011-09-16 10:31:11 +04:00
struct address_space * mapping = bdev - > bd_inode - > i_mapping ;
2021-05-05 04:32:45 +03:00
if ( mapping_empty ( mapping ) )
2007-05-07 01:49:55 +04:00
return ;
2011-09-16 10:31:11 +04:00
2007-05-07 01:49:55 +04:00
invalidate_bh_lrus ( ) ;
2011-09-16 10:31:11 +04:00
truncate_inode_pages ( mapping , 0 ) ;
2020-06-18 07:21:38 +03:00
}
2011-09-16 10:31:11 +04:00
/* Invalidate clean unused buffers and pagecache. */
void invalidate_bdev ( struct block_device * bdev )
{
struct address_space * mapping = bdev - > bd_inode - > i_mapping ;
2017-05-04 00:56:02 +03:00
if ( mapping - > nrpages ) {
invalidate_bh_lrus ( ) ;
lru_add_drain_all ( ) ; /* make sure all lru add caches are flushed */
invalidate_mapping_pages ( mapping , 0 , - 1 ) ;
}
2011-09-16 10:31:11 +04:00
/* 99% of the time, we don't need to flush the cleancache on the bdev.
* But , for the strange corners , lets be cautious
*/
2011-09-21 19:56:28 +04:00
cleancache_invalidate_inode ( mapping ) ;
2011-09-16 10:31:11 +04:00
}
EXPORT_SYMBOL ( invalidate_bdev ) ;
2005-04-17 02:20:36 +04:00
2020-09-04 11:58:52 +03:00
/*
 * Drop all buffers & page cache for the byte range [@lstart, @lend] of @bdev.
 *
 * If the caller does not already hold the device exclusively (@mode lacks
 * FMODE_EXCL), a temporary claim is taken for the duration of the truncate
 * so that buffers are not discarded under a live filesystem.  When that
 * claim cannot be obtained, the range is invalidated instead and the result
 * of invalidate_inode_pages2_range() is returned.
 *
 * Returns 0 after a truncate, or the invalidation result on the fallback
 * path.
 */
int truncate_bdev_range(struct block_device *bdev, fmode_t mode,
			loff_t lstart, loff_t lend)
{
	const bool need_claim = !(mode & FMODE_EXCL);

	if (need_claim && bd_prepare_to_claim(bdev, truncate_bdev_range)) {
		/*
		 * Someone else has the handle exclusively open.  Try
		 * invalidating instead.  The 'end' argument is inclusive so
		 * the rounding is safe.
		 */
		return invalidate_inode_pages2_range(bdev->bd_inode->i_mapping,
						     lstart >> PAGE_SHIFT,
						     lend >> PAGE_SHIFT);
	}

	truncate_inode_pages_range(bdev->bd_inode->i_mapping, lstart, lend);

	/* Release the temporary claim taken above, if any. */
	if (need_claim)
		bd_abort_claiming(bdev, truncate_bdev_range);
	return 0;
}
2019-01-14 11:48:10 +03:00
static void set_init_blocksize ( struct block_device * bdev )
{
2021-01-26 22:59:07 +03:00
unsigned int bsize = bdev_logical_block_size ( bdev ) ;
loff_t size = i_size_read ( bdev - > bd_inode ) ;
while ( bsize < PAGE_SIZE ) {
if ( size & bsize )
break ;
bsize < < = 1 ;
}
bdev - > bd_inode - > i_blkbits = blksize_bits ( bsize ) ;
2019-01-14 11:48:10 +03:00
}
2005-04-17 02:20:36 +04:00
int set_blocksize ( struct block_device * bdev , int size )
{
/* Size must be a power of two, and between 512 and PAGE_SIZE */
2007-05-08 11:24:32 +04:00
if ( size > PAGE_SIZE | | size < 512 | | ! is_power_of_2 ( size ) )
2005-04-17 02:20:36 +04:00
return - EINVAL ;
/* Size cannot be smaller than the size supported by the device */
2009-05-23 01:17:49 +04:00
if ( size < bdev_logical_block_size ( bdev ) )
2005-04-17 02:20:36 +04:00
return - EINVAL ;
/* Don't change the size if it is same as current */
2020-06-26 11:01:55 +03:00
if ( bdev - > bd_inode - > i_blkbits ! = blksize_bits ( size ) ) {
2005-04-17 02:20:36 +04:00
sync_blockdev ( bdev ) ;
bdev - > bd_inode - > i_blkbits = blksize_bits ( size ) ;
kill_bdev ( bdev ) ;
}
return 0 ;
}
EXPORT_SYMBOL ( set_blocksize ) ;
int sb_set_blocksize ( struct super_block * sb , int size )
{
if ( set_blocksize ( sb - > s_bdev , size ) )
return 0 ;
/* If we get here, we know size is power of two
* and it ' s value is between 512 and PAGE_SIZE */
sb - > s_blocksize = size ;
2006-03-24 14:18:05 +03:00
sb - > s_blocksize_bits = blksize_bits ( size ) ;
2005-04-17 02:20:36 +04:00
return sb - > s_blocksize ;
}
EXPORT_SYMBOL ( sb_set_blocksize ) ;
int sb_min_blocksize ( struct super_block * sb , int size )
{
2009-05-23 01:17:49 +04:00
int minsize = bdev_logical_block_size ( sb - > s_bdev ) ;
2005-04-17 02:20:36 +04:00
if ( size < minsize )
size = minsize ;
return sb_set_blocksize ( sb , size ) ;
}
EXPORT_SYMBOL ( sb_min_blocksize ) ;
2021-10-19 09:25:25 +03:00
int sync_blockdev_nowait ( struct block_device * bdev )
2009-04-27 18:43:51 +04:00
{
if ( ! bdev )
return 0 ;
2021-10-19 09:25:25 +03:00
return filemap_flush ( bdev - > bd_inode - > i_mapping ) ;
2009-04-27 18:43:51 +04:00
}
2021-10-19 09:25:25 +03:00
EXPORT_SYMBOL_GPL ( sync_blockdev_nowait ) ;
2009-04-27 18:43:51 +04:00
2009-02-25 12:44:19 +03:00
/*
* Write out and wait upon all the dirty data associated with a block
* device via its mapping . Does not take the superblock lock .
*/
int sync_blockdev ( struct block_device * bdev )
{
2021-10-19 09:25:25 +03:00
if ( ! bdev )
return 0 ;
return filemap_write_and_wait ( bdev - > bd_inode - > i_mapping ) ;
2009-02-25 12:44:19 +03:00
}
EXPORT_SYMBOL ( sync_blockdev ) ;
/*
 * Write out and wait upon all dirty data associated with this
 * device.  Filesystem data as well as the underlying block
 * device.  Takes the superblock lock.
 *
 * When get_super() finds no superblock for @bdev, only the block device
 * itself is synced.
 */
int fsync_bdev(struct block_device *bdev)
{
	struct super_block *sb = get_super(bdev);
	int res;

	if (!sb)
		return sync_blockdev(bdev);

	res = sync_filesystem(sb);
	drop_super(sb);
	return res;
}
EXPORT_SYMBOL(fsync_bdev);
2009-02-25 12:44:19 +03:00
/**
* freeze_bdev - - lock a filesystem and force it into a consistent state
* @ bdev : blockdevice to lock
*
* If a superblock is found on this device , we take the s_umount semaphore
* on it to make sure nobody unmounts until the snapshot creation is done .
* The reference counter ( bd_fsfreeze_count ) guarantees that only the last
* unfreeze process can unfreeze the frozen filesystem actually when multiple
* freeze requests arrive simultaneously . It counts up in freeze_bdev ( ) and
* count down in thaw_bdev ( ) . When it becomes 0 , thaw_bdev ( ) will unfreeze
* actually .
*/
2020-11-24 13:54:06 +03:00
int freeze_bdev ( struct block_device * bdev )
2009-02-25 12:44:19 +03:00
{
struct super_block * sb ;
int error = 0 ;
mutex_lock ( & bdev - > bd_fsfreeze_mutex ) ;
2020-11-24 13:54:06 +03:00
if ( + + bdev - > bd_fsfreeze_count > 1 )
goto done ;
2009-08-04 01:28:35 +04:00
sb = get_active_super ( bdev ) ;
if ( ! sb )
2020-11-24 13:54:06 +03:00
goto sync ;
fs: add freeze_super/thaw_super fs hooks
Currently, freezing a filesystem involves calling freeze_super, which locks
sb->s_umount and then calls the fs-specific freeze_fs hook. This makes it
hard for gfs2 (and potentially other cluster filesystems) to use the vfs
freezing code to do freezes on all the cluster nodes.
In order to communicate that a freeze has been requested, and to make sure
that only one node is trying to freeze at a time, gfs2 uses a glock
(sd_freeze_gl). The problem is that there is no hook for gfs2 to acquire
this lock before calling freeze_super. This means that two nodes can
attempt to freeze the filesystem by both calling freeze_super, acquiring
the sb->s_umount lock, and then attempting to grab the cluster glock
sd_freeze_gl. Only one will succeed, and the other will be stuck in
freeze_super, making it impossible to finish freezing the node.
To solve this problem, this patch adds the freeze_super and thaw_super
hooks. If a filesystem implements these hooks, they are called instead of
the vfs freeze_super and thaw_super functions. This means that every
filesystem that implements these hooks must call the vfs freeze_super and
thaw_super functions itself within the hook function to make use of the vfs
freezing code.
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Benjamin Marzinski <bmarzins@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
2014-11-14 05:42:03 +03:00
if ( sb - > s_op - > freeze_super )
error = sb - > s_op - > freeze_super ( sb ) ;
else
error = freeze_super ( sb ) ;
2020-11-24 13:54:06 +03:00
deactivate_super ( sb ) ;
2010-03-23 17:34:56 +03:00
if ( error ) {
bdev - > bd_fsfreeze_count - - ;
2020-11-24 13:54:06 +03:00
goto done ;
2009-02-25 12:44:19 +03:00
}
2020-11-24 13:54:06 +03:00
bdev - > bd_fsfreeze_sb = sb ;
sync :
2009-02-25 12:44:19 +03:00
sync_blockdev ( bdev ) ;
2020-11-24 13:54:06 +03:00
done :
2009-02-25 12:44:19 +03:00
mutex_unlock ( & bdev - > bd_fsfreeze_mutex ) ;
2020-11-24 13:54:06 +03:00
return error ;
2009-02-25 12:44:19 +03:00
}
EXPORT_SYMBOL ( freeze_bdev ) ;
/**
* thaw_bdev - - unlock filesystem
* @ bdev : blockdevice to unlock
*
* Unlocks the filesystem and marks it writeable again after freeze_bdev ( ) .
*/
2020-11-24 13:54:06 +03:00
int thaw_bdev ( struct block_device * bdev )
2009-02-25 12:44:19 +03:00
{
2020-11-24 13:54:06 +03:00
struct super_block * sb ;
2009-08-04 01:28:35 +04:00
int error = - EINVAL ;
2009-02-25 12:44:19 +03:00
mutex_lock ( & bdev - > bd_fsfreeze_mutex ) ;
2009-08-04 01:28:35 +04:00
if ( ! bdev - > bd_fsfreeze_count )
2010-03-23 17:34:56 +03:00
goto out ;
2009-08-04 01:28:35 +04:00
error = 0 ;
if ( - - bdev - > bd_fsfreeze_count > 0 )
2010-03-23 17:34:56 +03:00
goto out ;
2009-08-04 01:28:35 +04:00
2020-11-24 13:54:06 +03:00
sb = bdev - > bd_fsfreeze_sb ;
2009-08-04 01:28:35 +04:00
if ( ! sb )
2010-03-23 17:34:56 +03:00
goto out ;
2009-08-04 01:28:35 +04:00
fs: add freeze_super/thaw_super fs hooks
Currently, freezing a filesystem involves calling freeze_super, which locks
sb->s_umount and then calls the fs-specific freeze_fs hook. This makes it
hard for gfs2 (and potentially other cluster filesystems) to use the vfs
freezing code to do freezes on all the cluster nodes.
In order to communicate that a freeze has been requested, and to make sure
that only one node is trying to freeze at a time, gfs2 uses a glock
(sd_freeze_gl). The problem is that there is no hook for gfs2 to acquire
this lock before calling freeze_super. This means that two nodes can
attempt to freeze the filesystem by both calling freeze_super, acquiring
the sb->s_umount lock, and then attempting to grab the cluster glock
sd_freeze_gl. Only one will succeed, and the other will be stuck in
freeze_super, making it impossible to finish freezing the node.
To solve this problem, this patch adds the freeze_super and thaw_super
hooks. If a filesystem implements these hooks, they are called instead of
the vfs freeze_super and thaw_super functions. This means that every
filesystem that implements these hooks must call the vfs freeze_super and
thaw_super functions itself within the hook function to make use of the vfs
freezing code.
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Benjamin Marzinski <bmarzins@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
2014-11-14 05:42:03 +03:00
if ( sb - > s_op - > thaw_super )
error = sb - > s_op - > thaw_super ( sb ) ;
else
error = thaw_super ( sb ) ;
2016-10-04 11:53:40 +03:00
if ( error )
2010-03-23 17:34:56 +03:00
bdev - > bd_fsfreeze_count + + ;
2020-12-24 07:49:54 +03:00
else
bdev - > bd_fsfreeze_sb = NULL ;
2010-03-23 17:34:56 +03:00
out :
2009-02-25 12:44:19 +03:00
mutex_unlock ( & bdev - > bd_fsfreeze_mutex ) ;
2016-10-04 11:53:40 +03:00
return error ;
2009-02-25 12:44:19 +03:00
}
EXPORT_SYMBOL ( thaw_bdev ) ;
2014-06-05 03:07:46 +04:00
/**
 * bdev_read_page() - Start reading a page from a block device
 * @bdev: The device to read the page from
 * @sector: The offset on the device to read the page from (need not be aligned)
 * @page: The page to read
 *
 * On entry, the page should be locked.  It will be unlocked when the page
 * has been read.  If the block driver implements rw_page synchronously,
 * that will be true on exit from this function, but it need not be.
 *
 * Errors returned by this function are usually "soft", eg out of memory, or
 * queue full; callers should try a different route to read this page rather
 * than propagate an error back up the stack.
 *
 * Return: negative errno if an error occurs, 0 if submission was successful.
 * -EOPNOTSUPP is returned when the driver has no rw_page method or the
 * device has an integrity profile.
 */
int bdev_read_page(struct block_device *bdev, sector_t sector,
		   struct page *page)
{
	const struct block_device_operations *ops = bdev->bd_disk->fops;
	int ret;

	if (!ops->rw_page || bdev_get_integrity(bdev))
		return -EOPNOTSUPP;

	ret = blk_queue_enter(bdev_get_queue(bdev), 0);
	if (ret)
		return ret;

	/* @sector is relative to @bdev; add the start sector of @bdev. */
	ret = ops->rw_page(bdev, sector + get_start_sect(bdev), page,
			   REQ_OP_READ);
	blk_queue_exit(bdev_get_queue(bdev));
	return ret;
}
/**
 * bdev_write_page() - Start writing a page to a block device
 * @bdev: The device to write the page to
 * @sector: The offset on the device to write the page to (need not be aligned)
 * @page: The page to write
 * @wbc: The writeback_control for the write (not referenced by this function)
 *
 * On entry, the page should be locked and not currently under writeback.
 * On exit, if the write started successfully, the page will be unlocked and
 * under writeback.  If the write failed already (eg the driver failed to
 * queue the page to the device), the page will still be locked.  If the
 * caller is a ->writepage implementation, it will need to unlock the page.
 *
 * Errors returned by this function are usually "soft", eg out of memory, or
 * queue full; callers should try a different route to write this page rather
 * than propagate an error back up the stack.
 *
 * Return: negative errno if an error occurs, 0 if submission was successful.
 * -EOPNOTSUPP is returned when the driver has no rw_page method or the
 * device has an integrity profile.
 */
int bdev_write_page(struct block_device *bdev, sector_t sector,
		    struct page *page, struct writeback_control *wbc)
{
	const struct block_device_operations *ops = bdev->bd_disk->fops;
	int ret;

	if (!ops->rw_page || bdev_get_integrity(bdev))
		return -EOPNOTSUPP;

	ret = blk_queue_enter(bdev_get_queue(bdev), 0);
	if (ret)
		return ret;

	set_page_writeback(page);
	ret = ops->rw_page(bdev, sector + get_start_sect(bdev), page,
			   REQ_OP_WRITE);
	if (!ret) {
		clean_page_buffers(page);
		unlock_page(page);
	} else {
		/* Submission failed: undo the writeback marking, stay locked. */
		end_page_writeback(page);
	}

	blk_queue_exit(bdev_get_queue(bdev));
	return ret;
}
2005-04-17 02:20:36 +04:00
/*
* pseudo - fs
*/
static __cacheline_aligned_in_smp DEFINE_SPINLOCK ( bdev_lock ) ;
2006-12-07 07:33:20 +03:00
static struct kmem_cache * bdev_cachep __read_mostly ;
2005-04-17 02:20:36 +04:00
static struct inode * bdev_alloc_inode ( struct super_block * sb )
{
2006-12-07 07:33:17 +03:00
struct bdev_inode * ei = kmem_cache_alloc ( bdev_cachep , GFP_KERNEL ) ;
2021-01-07 21:36:40 +03:00
2005-04-17 02:20:36 +04:00
if ( ! ei )
return NULL ;
2021-01-07 21:36:40 +03:00
memset ( & ei - > bdev , 0 , sizeof ( ei - > bdev ) ) ;
2005-04-17 02:20:36 +04:00
return & ei - > vfs_inode ;
}
2019-04-10 22:12:38 +03:00
static void bdev_free_inode ( struct inode * inode )
2005-04-17 02:20:36 +04:00
{
2020-11-24 11:34:00 +03:00
struct block_device * bdev = I_BDEV ( inode ) ;
free_percpu ( bdev - > bd_stats ) ;
2020-11-24 14:01:45 +03:00
kfree ( bdev - > bd_meta_info ) ;
2020-11-24 11:34:00 +03:00
2021-08-16 15:26:14 +03:00
if ( ! bdev_is_partition ( bdev ) ) {
if ( bdev - > bd_disk & & bdev - > bd_disk - > bdi )
bdi_put ( bdev - > bd_disk - > bdi ) ;
2021-07-22 10:53:54 +03:00
kfree ( bdev - > bd_disk ) ;
2021-08-16 15:26:14 +03:00
}
2021-08-16 15:26:13 +03:00
if ( MAJOR ( bdev - > bd_dev ) = = BLOCK_EXT_MAJOR )
blk_free_ext_minor ( MINOR ( bdev - > bd_dev ) ) ;
2019-04-10 22:12:38 +03:00
kmem_cache_free ( bdev_cachep , BDEV_I ( inode ) ) ;
2011-01-07 09:49:49 +03:00
}
2020-11-23 17:41:40 +03:00
static void init_once ( void * data )
2005-04-17 02:20:36 +04:00
{
2020-11-23 17:41:40 +03:00
struct bdev_inode * ei = data ;
2005-04-17 02:20:36 +04:00
2007-05-17 09:10:57 +04:00
inode_init_once ( & ei - > vfs_inode ) ;
2005-04-17 02:20:36 +04:00
}
2010-06-07 22:34:48 +04:00
static void bdev_evict_inode ( struct inode * inode )
2005-04-17 02:20:36 +04:00
{
2014-04-04 01:47:49 +04:00
truncate_inode_pages_final ( & inode - > i_data ) ;
2010-06-07 22:34:48 +04:00
invalidate_inode_buffers ( inode ) ; /* is it needed here? */
2012-05-03 16:48:02 +04:00
clear_inode ( inode ) ;
2005-04-17 02:20:36 +04:00
}
2007-02-12 11:55:41 +03:00
static const struct super_operations bdev_sops = {
2005-04-17 02:20:36 +04:00
. statfs = simple_statfs ,
. alloc_inode = bdev_alloc_inode ,
2019-04-10 22:12:38 +03:00
. free_inode = bdev_free_inode ,
2005-04-17 02:20:36 +04:00
. drop_inode = generic_delete_inode ,
2010-06-07 22:34:48 +04:00
. evict_inode = bdev_evict_inode ,
2005-04-17 02:20:36 +04:00
} ;
2019-03-25 19:38:23 +03:00
static int bd_init_fs_context ( struct fs_context * fc )
2005-04-17 02:20:36 +04:00
{
2019-03-25 19:38:23 +03:00
struct pseudo_fs_context * ctx = init_pseudo ( fc , BDEVFS_MAGIC ) ;
if ( ! ctx )
return - ENOMEM ;
fc - > s_iflags | = SB_I_CGROUPWB ;
ctx - > ops = & bdev_sops ;
return 0 ;
2005-04-17 02:20:36 +04:00
}
static struct file_system_type bd_type = {
. name = " bdev " ,
2019-03-25 19:38:23 +03:00
. init_fs_context = bd_init_fs_context ,
2005-04-17 02:20:36 +04:00
. kill_sb = kill_anon_super ,
} ;
2015-05-23 00:13:33 +03:00
/*
 * Superblock of the bdev pseudo-fs.  Assigned once in bdev_cache_init();
 * block device inodes are allocated from it.
 */
struct super_block * blockdev_superblock __read_mostly ;
EXPORT_SYMBOL_GPL ( blockdev_superblock ) ;
2005-04-17 02:20:36 +04:00
/*
 * Boot-time setup of the block device layer's pseudo filesystem: create the
 * bdev_inode slab cache, register and mount the pseudo-fs, and publish its
 * superblock in blockdev_superblock.  Registration or mount failure panics.
 */
void __init bdev_cache_init(void)
{
	static struct vfsmount *bd_mnt;

	bdev_cachep = kmem_cache_create(" bdev_cache ",
					sizeof(struct bdev_inode), 0,
					(SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT |
					 SLAB_MEM_SPREAD | SLAB_ACCOUNT |
					 SLAB_PANIC),
					init_once);

	if (register_filesystem(&bd_type) != 0)
		panic(" Cannot register bdev pseudo-fs ");

	bd_mnt = kern_mount(&bd_type);
	if (IS_ERR(bd_mnt))
		panic(" Cannot create bdev pseudo-fs ");

	/* For writeback */
	blockdev_superblock = bd_mnt->mnt_sb;
}
2020-11-26 11:23:26 +03:00
/*
 * Allocate and initialise a block_device (and its inode on the bdev
 * pseudo-fs) for partition number @partno of @disk.
 *
 * Returns the new block_device, or NULL if either the inode or the per-cpu
 * statistics could not be allocated.
 */
struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
{
	struct inode *inode = new_inode(blockdev_superblock);
	struct block_device *bdev;

	if (inode == NULL)
		return NULL;

	inode->i_mode = S_IFBLK;
	inode->i_rdev = 0;
	inode->i_data.a_ops = &def_blk_aops;
	mapping_set_gfp_mask(&inode->i_data, GFP_USER);

	bdev = I_BDEV(inode);
	mutex_init(&bdev->bd_fsfreeze_mutex);
	spin_lock_init(&bdev->bd_size_lock);
	bdev->bd_partno = partno;
	bdev->bd_inode = inode;
	bdev->bd_queue = disk->queue;

	bdev->bd_stats = alloc_percpu(struct disk_stats);
	if (bdev->bd_stats) {
		/*
		 * bd_disk is assigned only once nothing can fail any more.
		 * bdev_free_inode() kfree()s bd_disk for whole devices, so
		 * leaving it NULL on the error path presumably keeps iput()
		 * from freeing the caller's disk -- TODO confirm.
		 */
		bdev->bd_disk = disk;
		return bdev;
	}

	iput(inode);
	return NULL;
}
2005-04-17 02:20:36 +04:00
2020-11-26 11:23:26 +03:00
/*
 * Give @bdev its device number @dev, mirror it into the inode's i_rdev and
 * i_ino, and insert the inode into the inode hash.
 */
void bdev_add(struct block_device *bdev, dev_t dev)
{
	struct inode *inode = bdev->bd_inode;

	bdev->bd_dev = dev;
	inode->i_rdev = dev;
	inode->i_ino = dev;
	insert_inode_hash(inode);
}
2005-04-17 02:20:36 +04:00
long nr_blockdev_pages ( void )
{
2020-06-26 11:01:58 +03:00
struct inode * inode ;
2005-04-17 02:20:36 +04:00
long ret = 0 ;
2020-06-26 11:01:58 +03:00
spin_lock ( & blockdev_superblock - > s_inode_list_lock ) ;
list_for_each_entry ( inode , & blockdev_superblock - > s_inodes , i_sb_list )
ret + = inode - > i_mapping - > nrpages ;
spin_unlock ( & blockdev_superblock - > s_inode_list_lock ) ;
2005-04-17 02:20:36 +04:00
return ret ;
}
2010-04-07 13:52:29 +04:00
/**
* bd_may_claim - test whether a block device can be claimed
* @ bdev : block device of interest
* @ whole : whole block device containing @ bdev , may equal @ bdev
* @ holder : holder trying to claim @ bdev
*
2011-03-31 05:57:33 +04:00
* Test whether @ bdev can be claimed by @ holder .
2010-04-07 13:52:29 +04:00
*
* CONTEXT :
* spin_lock ( & bdev_lock ) .
*
* RETURNS :
* % true if @ bdev can be claimed , % false otherwise .
*/
static bool bd_may_claim ( struct block_device * bdev , struct block_device * whole ,
void * holder )
2005-04-17 02:20:36 +04:00
{
if ( bdev - > bd_holder = = holder )
2010-04-07 13:52:29 +04:00
return true ; /* already a holder */
2005-04-17 02:20:36 +04:00
else if ( bdev - > bd_holder ! = NULL )
2010-04-07 13:52:29 +04:00
return false ; /* held by someone else */
block_dev: don't test bdev->bd_contains when it is not stable
bdev->bd_contains is not stable before calling __blkdev_get().
When __blkdev_get() is called on a parition with ->bd_openers == 0
it sets
bdev->bd_contains = bdev;
which is not correct for a partition.
After a call to __blkdev_get() succeeds, ->bd_openers will be > 0
and then ->bd_contains is stable.
When FMODE_EXCL is used, blkdev_get() calls
bd_start_claiming() -> bd_prepare_to_claim() -> bd_may_claim()
This call happens before __blkdev_get() is called, so ->bd_contains
is not stable. So bd_may_claim() cannot safely use ->bd_contains.
It currently tries to use it, and this can lead to a BUG_ON().
This happens when a whole device is already open with a bd_holder (in
use by dm in my particular example) and two threads race to open a
partition of that device for the first time, one opening with O_EXCL and
one without.
The thread that doesn't use O_EXCL gets through blkdev_get() to
__blkdev_get(), gains the ->bd_mutex, and sets bdev->bd_contains = bdev;
Immediately thereafter the other thread, using FMODE_EXCL, calls
bd_start_claiming() from blkdev_get(). This should fail because the
whole device has a holder, but because bdev->bd_contains == bdev
bd_may_claim() incorrectly reports success.
This thread continues and blocks on bd_mutex.
The first thread then sets bdev->bd_contains correctly and drops the mutex.
The thread using FMODE_EXCL then continues and when it calls bd_may_claim()
again in:
BUG_ON(!bd_may_claim(bdev, whole, holder));
The BUG_ON fires.
Fix this by removing the dependency on ->bd_contains in
bd_may_claim(). As bd_may_claim() has direct access to the whole
device, it can simply test if the target bdev is the whole device.
Fixes: 6b4517a7913a ("block: implement bd_claiming and claiming block")
Cc: stable@vger.kernel.org (v2.6.35+)
Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
2016-12-12 18:21:51 +03:00
else if ( whole = = bdev )
2010-04-07 13:52:29 +04:00
return true ; /* is a whole device which isn't held */
2005-04-17 02:20:36 +04:00
block: make blkdev_get/put() handle exclusive access
Over time, block layer has accumulated a set of APIs dealing with bdev
open, close, claim and release.
* blkdev_get/put() are the primary open and close functions.
* bd_claim/release() deal with exclusive open.
* open/close_bdev_exclusive() are combination of open and claim and
the other way around, respectively.
* bd_link/unlink_disk_holder() to create and remove holder/slave
symlinks.
* open_by_devnum() wraps bdget() + blkdev_get().
The interface is a bit confusing and the decoupling of open and claim
makes it impossible to properly guarantee exclusive access as
in-kernel open + claim sequence can disturb the existing exclusive
open even before the block layer knows the current open if for another
exclusive access. Reorganize the interface such that,
* blkdev_get() is extended to include exclusive access management.
@holder argument is added and, if is @FMODE_EXCL specified, it will
gain exclusive access atomically w.r.t. other exclusive accesses.
* blkdev_put() is similarly extended. It now takes @mode argument and
if @FMODE_EXCL is set, it releases an exclusive access. Also, when
the last exclusive claim is released, the holder/slave symlinks are
removed automatically.
* bd_claim/release() and close_bdev_exclusive() are no longer
necessary and either made static or removed.
* bd_link_disk_holder() remains the same but bd_unlink_disk_holder()
is no longer necessary and removed.
* open_bdev_exclusive() becomes a simple wrapper around lookup_bdev()
and blkdev_get(). It also has an unexpected extra bdev_read_only()
test which probably should be moved into blkdev_get().
* open_by_devnum() is modified to take @holder argument and pass it to
blkdev_get().
Most of bdev open/close operations are unified into blkdev_get/put()
and most exclusive accesses are tested atomically at the open time (as
it should). This cleans up code and removes some, both valid and
invalid, but unnecessary all the same, corner cases.
open_bdev_exclusive() and open_by_devnum() can use further cleanup -
rename to blkdev_get_by_path() and blkdev_get_by_devt() and drop
special features. Well, let's leave them for another day.
Most conversions are straight-forward. drbd conversion is a bit more
involved as there was some reordering, but the logic should stay the
same.
Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Neil Brown <neilb@suse.de>
Acked-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Acked-by: Mike Snitzer <snitzer@redhat.com>
Acked-by: Philipp Reisner <philipp.reisner@linbit.com>
Cc: Peter Osterlund <petero2@telia.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Mark Fasheh <mfasheh@suse.com>
Cc: Joel Becker <joel.becker@oracle.com>
Cc: Alex Elder <aelder@sgi.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: dm-devel@redhat.com
Cc: drbd-dev@lists.linbit.com
Cc: Leo Chen <leochen@broadcom.com>
Cc: Scott Branden <sbranden@broadcom.com>
Cc: Chris Mason <chris.mason@oracle.com>
Cc: Steven Whitehouse <swhiteho@redhat.com>
Cc: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
Cc: Joern Engel <joern@logfs.org>
Cc: reiserfs-devel@vger.kernel.org
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
2010-11-13 13:55:17 +03:00
else if ( whole - > bd_holder = = bd_may_claim )
2010-04-07 13:52:29 +04:00
return true ; /* is a partition of a device that is being partitioned */
else if ( whole - > bd_holder ! = NULL )
return false ; /* is a partition of a held device */
2005-04-17 02:20:36 +04:00
else
2010-04-07 13:52:29 +04:00
return true ; /* is a partition of an un-held device */
}
2010-04-07 13:53:59 +04:00
/**
2020-07-16 17:33:08 +03:00
* bd_prepare_to_claim - claim a block device
2010-04-07 13:53:59 +04:00
* @ bdev : block device of interest
* @ holder : holder trying to claim @ bdev
*
2020-07-16 17:33:08 +03:00
* Claim @ bdev . This function fails if @ bdev is already claimed by another
* holder and waits if another claiming is in progress . return , the caller
* has ownership of bd_claiming and bd_holder [ s ] .
2010-04-07 13:53:59 +04:00
*
* RETURNS :
* 0 if @ bdev can be claimed , - EBUSY otherwise .
*/
2020-11-25 23:20:08 +03:00
int bd_prepare_to_claim ( struct block_device * bdev , void * holder )
2010-04-07 13:53:59 +04:00
{
2020-11-25 23:20:08 +03:00
struct block_device * whole = bdev_whole ( bdev ) ;
if ( WARN_ON_ONCE ( ! holder ) )
return - EINVAL ;
2010-04-07 13:53:59 +04:00
retry :
2020-07-16 17:33:08 +03:00
spin_lock ( & bdev_lock ) ;
2010-04-07 13:53:59 +04:00
/* if someone else claimed, fail */
2020-07-16 17:33:08 +03:00
if ( ! bd_may_claim ( bdev , whole , holder ) ) {
spin_unlock ( & bdev_lock ) ;
2010-04-07 13:53:59 +04:00
return - EBUSY ;
2020-07-16 17:33:08 +03:00
}
2010-04-07 13:53:59 +04:00
2010-08-04 19:59:39 +04:00
/* if claiming is already in progress, wait for it to finish */
if ( whole - > bd_claiming ) {
2010-04-07 13:53:59 +04:00
wait_queue_head_t * wq = bit_waitqueue ( & whole - > bd_claiming , 0 ) ;
DEFINE_WAIT ( wait ) ;
prepare_to_wait ( wq , & wait , TASK_UNINTERRUPTIBLE ) ;
spin_unlock ( & bdev_lock ) ;
schedule ( ) ;
finish_wait ( wq , & wait ) ;
goto retry ;
}
/* yay, all mine */
2020-07-16 17:33:08 +03:00
whole - > bd_claiming = holder ;
spin_unlock ( & bdev_lock ) ;
2010-04-07 13:53:59 +04:00
return 0 ;
}
2020-07-16 17:33:09 +03:00
EXPORT_SYMBOL_GPL ( bd_prepare_to_claim ) ; /* only for the loop driver */
2010-04-07 13:53:59 +04:00
2019-07-30 14:10:14 +03:00
static void bd_clear_claiming ( struct block_device * whole , void * holder )
{
lockdep_assert_held ( & bdev_lock ) ;
/* tell others that we're done */
BUG_ON ( whole - > bd_claiming ! = holder ) ;
whole - > bd_claiming = NULL ;
wake_up_bit ( & whole - > bd_claiming , 0 ) ;
}
/**
* bd_finish_claiming - finish claiming of a block device
* @ bdev : block device of interest
* @ holder : holder that has claimed @ bdev
*
* Finish exclusive open of a block device . Mark the device as exlusively
* open by the holder and wake up all waiters for exclusive open to finish .
*/
2020-11-25 23:20:08 +03:00
static void bd_finish_claiming ( struct block_device * bdev , void * holder )
2019-07-30 14:10:14 +03:00
{
2020-11-25 23:20:08 +03:00
struct block_device * whole = bdev_whole ( bdev ) ;
2019-07-30 14:10:14 +03:00
spin_lock ( & bdev_lock ) ;
BUG_ON ( ! bd_may_claim ( bdev , whole , holder ) ) ;
/*
* Note that for a whole device bd_holders will be incremented twice ,
* and bd_holder will be set to bd_may_claim before being set to holder
*/
whole - > bd_holders + + ;
whole - > bd_holder = bd_may_claim ;
bdev - > bd_holders + + ;
bdev - > bd_holder = holder ;
bd_clear_claiming ( whole , holder ) ;
spin_unlock ( & bdev_lock ) ;
}
/**
* bd_abort_claiming - abort claiming of a block device
* @ bdev : block device of interest
* @ holder : holder that has claimed @ bdev
*
* Abort claiming of a block device when the exclusive open failed . This can be
* also used when exclusive open is not actually desired and we just needed
* to block other exclusive openers for a while .
*/
2020-11-25 23:20:08 +03:00
void bd_abort_claiming ( struct block_device * bdev , void * holder )
2019-07-30 14:10:14 +03:00
{
spin_lock ( & bdev_lock ) ;
2020-11-25 23:20:08 +03:00
bd_clear_claiming ( bdev_whole ( bdev ) , holder ) ;
2019-07-30 14:10:14 +03:00
spin_unlock ( & bdev_lock ) ;
}
EXPORT_SYMBOL ( bd_abort_claiming ) ;
2010-04-07 13:53:59 +04:00
2021-05-25 09:12:58 +03:00
static void blkdev_flush_mapping ( struct block_device * bdev )
{
WARN_ON_ONCE ( bdev - > bd_holders ) ;
sync_blockdev ( bdev ) ;
kill_bdev ( bdev ) ;
bdev_write_inode ( bdev ) ;
}
[PATCH] lockdep: simplify some aspects of bd_mutex nesting
When we open (actually blkdev_get) a partition we need to also open (get) the
whole device that holds the partition. This involves some limited recursion.
This patch tries to simplify some aspects of this.
As well as opening the whole device, we need to increment ->bd_part_count when
a partition is opened (this is used by rescan_partitions to avoid a rescan if
any partition is active, as that would be confusing).
The main change this patch makes is to move the inc/dec of bd_part_count into
blkdev_{get,put} for the whole rather than doing it in blkdev_{get,put} for
the partition.
More specifically, we introduce __blkdev_get and __blkdev_put which do exactly
what blkdev_{get,put} did, only with an extra "for_part" argument
(blkget_{get,put} then call the __ version with a '0' for the extra argument).
If for_part is 1, then the blkdev is being get(put) because a partition is
being opened(closed) for the first(last) time, and so bd_part_count should be
updated (on success). The particular advantage of pushing this function down
is that the bd_mutex lock (which is needed to update bd_part_count) is already
held at the lower level.
Note that this slightly changes the semantics of bd_part_count. Instead of
updating it whenever a partition is opened or released, it is now only updated
on the first open or last release. This is an adequate semantic as it is only
ever tested for "== 0".
Having introduced these functions we remove the current bd_part_count updates
from do_open (which is really the body of blkdev_get) and call
__blkdev_get(... 1). Similarly in blkget_put we remove the old bd_part_count
updates and call __blkget_put(..., 1). This call is moved to the end of
__blkdev_put to avoid nested locks of bd_mutex.
Finally the mutex_lock on whole->bd_mutex in do_open can be removed. It was
only really needed to protect bd_part_count, and that is now managed (and
protected) within the recursive call.
The observation that bd_part_count is central to the locking issues, and the
modifications to create __blkdev_put are from Peter Zijlstra.
Cc: Ingo Molnar <mingo@elte.hu>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-08 13:36:16 +03:00
2021-05-25 09:12:54 +03:00
/*
 * Open the whole-disk device @bdev with @mode.
 *
 * Calls the driver's ->open() method if there is one.  If that fails, the
 * error is returned; for -ENOMEDIUM with a pending partition scan the
 * partitions are invalidated first.  On success the initial block size is
 * set for the first opener, a pending partition scan is run, and bd_openers
 * is incremented.
 *
 * Returns 0 on success or the error code from ->open().
 */
static int blkdev_get_whole(struct block_device *bdev, fmode_t mode)
{
	struct gendisk *disk = bdev->bd_disk;

	if (disk->fops->open) {
		int ret = disk->fops->open(bdev, mode);

		if (ret) {
			/* avoid ghost partitions on a removed medium */
			if (ret == -ENOMEDIUM &&
			    test_bit(GD_NEED_PART_SCAN, &disk->state))
				bdev_disk_changed(disk, true);
			return ret;
		}
	}

	/* only the first opener initializes the block size */
	if (!bdev->bd_openers)
		set_init_blocksize(bdev);
	if (test_bit(GD_NEED_PART_SCAN, &disk->state))
		bdev_disk_changed(disk, false);
	bdev->bd_openers++;
	return 0;
}
2019-11-14 17:34:34 +03:00
2021-05-25 09:12:58 +03:00
/*
 * Drop one opener of the whole-disk device @bdev.  The last opener flushes
 * the mapping; the driver's ->release() method, if any, is called on every
 * put.
 */
static void blkdev_put_whole(struct block_device *bdev, fmode_t mode)
{
	struct gendisk *disk = bdev->bd_disk;

	bdev->bd_openers--;
	if (!bdev->bd_openers)
		blkdev_flush_mapping(bdev);

	if (disk->fops->release)
		disk->fops->release(disk, mode);
}
2021-05-25 09:12:54 +03:00
/*
 * Open the partition @part with @mode.
 *
 * The first opener of a partition also opens the whole device, bumps the
 * disk's open_partitions count and sets the initial block size.  Later
 * openers only increment bd_openers.
 *
 * Returns 0 on success, the error from blkdev_get_whole(), or -ENXIO if the
 * partition has no sectors.
 */
static int blkdev_get_part(struct block_device *part, fmode_t mode)
{
	struct block_device *whole = bdev_whole(part);
	struct gendisk *disk = part->bd_disk;
	int ret;

	if (!part->bd_openers) {
		ret = blkdev_get_whole(whole, mode);
		if (ret)
			return ret;

		/* an empty partition cannot be opened: undo the whole open */
		if (!bdev_nr_sectors(part)) {
			blkdev_put_whole(whole, mode);
			return -ENXIO;
		}

		disk->open_partitions++;
		set_init_blocksize(part);
	}

	part->bd_openers++;
	return 0;
}
2020-11-23 12:19:22 +03:00
2021-05-25 09:12:58 +03:00
/*
 * Drop one opener of the partition @part.  When the last opener goes away,
 * flush the partition's mapping, decrement the disk's open_partitions count
 * and put the whole device that blkdev_get_part() opened.
 */
static void blkdev_put_part(struct block_device *part, fmode_t mode)
{
	struct block_device *whole = bdev_whole(part);

	part->bd_openers--;
	if (part->bd_openers)
		return;

	blkdev_flush_mapping(part);
	whole->bd_disk->open_partitions--;
	blkdev_put_whole(whole, mode);
}
2020-11-26 11:23:26 +03:00
/*
 * Look up the block device for @dev and take a device reference and a
 * reference on the driver module, without opening the device.
 *
 * Returns the block device, or NULL if no inode is found for @dev (even
 * after calling blk_request_module()), if the device reference count has
 * already dropped to zero, if the disk is marked GENHD_FL_HIDDEN, or if the
 * module reference cannot be taken.  Release with blkdev_put_no_open().
 */
struct block_device *blkdev_get_no_open(dev_t dev)
{
	struct block_device *bdev;
	struct inode *inode;
	bool got_device;

	inode = ilookup(blockdev_superblock, dev);
	if (!inode) {
		/* nothing found: call blk_request_module() and look again */
		blk_request_module(dev);
		inode = ilookup(blockdev_superblock, dev);
		if (!inode)
			return NULL;
	}

	/* switch from the inode reference to a device model one: */
	bdev = &BDEV_I(inode)->bdev;
	got_device = kobject_get_unless_zero(&bdev->bd_device.kobj) != NULL;
	iput(inode);
	if (!got_device)
		return NULL;

	if (bdev->bd_disk->flags & GENHD_FL_HIDDEN)
		goto put_device;
	if (!try_module_get(bdev->bd_disk->fops->owner))
		goto put_device;
	return bdev;

put_device:
	put_device(&bdev->bd_device);
	return NULL;
}
void blkdev_put_no_open ( struct block_device * bdev )
{
module_put ( bdev - > bd_disk - > fops - > owner ) ;
2021-07-22 10:53:58 +03:00
put_device ( & bdev - > bd_device ) ;
2020-11-26 11:23:26 +03:00
}
2010-11-13 13:55:18 +03:00
/**
2020-11-23 15:38:40 +03:00
* blkdev_get_by_dev - open a block device by device number
* @ dev : device number of block device to open
2010-11-13 13:55:18 +03:00
* @ mode : FMODE_ * mask
* @ holder : exclusive holder identifier
*
2020-11-23 15:38:40 +03:00
* Open the block device described by device number @ dev . If @ mode includes
* % FMODE_EXCL , the block device is opened with exclusive access . Specifying
* % FMODE_EXCL with a % NULL @ holder is invalid . Exclusive opens may nest for
* the same @ holder .
2010-11-13 13:55:18 +03:00
*
2020-11-23 15:38:40 +03:00
* Use this interface ONLY if you really do not have anything better - i . e . when
* you are behind a truly sucky interface and all you are given is a device
* number . Everything else should use blkdev_get_by_path ( ) .
2010-11-13 13:55:18 +03:00
*
* CONTEXT :
* Might sleep .
*
* RETURNS :
2020-11-23 15:38:40 +03:00
* Reference to the block_device on success , ERR_PTR ( - errno ) on failure .
2010-11-13 13:55:18 +03:00
*/
2020-11-23 15:38:40 +03:00
struct block_device * blkdev_get_by_dev ( dev_t dev , fmode_t mode , void * holder )
2005-04-17 02:20:36 +04:00
{
2020-11-23 12:19:22 +03:00
/* cleared only when this open must leave disk events blocked on return */
bool unblock_events = true ;
2020-11-23 15:38:40 +03:00
struct block_device * bdev ;
2020-11-23 12:19:22 +03:00
struct gendisk * disk ;
int ret ;
block: make blkdev_get/put() handle exclusive access
Over time, block layer has accumulated a set of APIs dealing with bdev
open, close, claim and release.
* blkdev_get/put() are the primary open and close functions.
* bd_claim/release() deal with exclusive open.
* open/close_bdev_exclusive() are combination of open and claim and
the other way around, respectively.
* bd_link/unlink_disk_holder() to create and remove holder/slave
symlinks.
* open_by_devnum() wraps bdget() + blkdev_get().
The interface is a bit confusing and the decoupling of open and claim
makes it impossible to properly guarantee exclusive access as
in-kernel open + claim sequence can disturb the existing exclusive
open even before the block layer knows the current open if for another
exclusive access. Reorganize the interface such that,
* blkdev_get() is extended to include exclusive access management.
@holder argument is added and, if is @FMODE_EXCL specified, it will
gain exclusive access atomically w.r.t. other exclusive accesses.
* blkdev_put() is similarly extended. It now takes @mode argument and
if @FMODE_EXCL is set, it releases an exclusive access. Also, when
the last exclusive claim is released, the holder/slave symlinks are
removed automatically.
* bd_claim/release() and close_bdev_exclusive() are no longer
necessary and either made static or removed.
* bd_link_disk_holder() remains the same but bd_unlink_disk_holder()
is no longer necessary and removed.
* open_bdev_exclusive() becomes a simple wrapper around lookup_bdev()
and blkdev_get(). It also has an unexpected extra bdev_read_only()
test which probably should be moved into blkdev_get().
* open_by_devnum() is modified to take @holder argument and pass it to
blkdev_get().
Most of bdev open/close operations are unified into blkdev_get/put()
and most exclusive accesses are tested atomically at the open time (as
it should). This cleans up code and removes some, both valid and
invalid, but unnecessary all the same, corner cases.
open_bdev_exclusive() and open_by_devnum() can use further cleanup -
rename to blkdev_get_by_path() and blkdev_get_by_devt() and drop
special features. Well, let's leave them for another day.
Most conversions are straight-forward. drbd conversion is a bit more
involved as there was some reordering, but the logic should stay the
same.
Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Neil Brown <neilb@suse.de>
Acked-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Acked-by: Mike Snitzer <snitzer@redhat.com>
Acked-by: Philipp Reisner <philipp.reisner@linbit.com>
Cc: Peter Osterlund <petero2@telia.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Mark Fasheh <mfasheh@suse.com>
Cc: Joel Becker <joel.becker@oracle.com>
Cc: Alex Elder <aelder@sgi.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: dm-devel@redhat.com
Cc: drbd-dev@lists.linbit.com
Cc: Leo Chen <leochen@broadcom.com>
Cc: Scott Branden <sbranden@broadcom.com>
Cc: Chris Mason <chris.mason@oracle.com>
Cc: Steven Whitehouse <swhiteho@redhat.com>
Cc: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
Cc: Joern Engel <joern@logfs.org>
Cc: reiserfs-devel@vger.kernel.org
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
2010-11-13 13:55:17 +03:00
2020-11-23 15:44:44 +03:00
/*
 * Device cgroup permission check for the read/write access requested in
 * @mode ; a non-zero result is handed back to the caller as the error .
 */
ret = devcgroup_check_permission ( DEVCG_DEV_BLOCK ,
2020-11-23 15:38:40 +03:00
MAJOR ( dev ) , MINOR ( dev ) ,
2020-11-23 15:44:44 +03:00
( ( mode & FMODE_READ ) ? DEVCG_ACC_READ : 0 ) |
( ( mode & FMODE_WRITE ) ? DEVCG_ACC_WRITE : 0 ) ) ;
2020-08-31 21:02:36 +03:00
if ( ret )
2020-11-23 15:38:40 +03:00
return ERR_PTR ( ret ) ;
2020-11-26 11:23:26 +03:00
/* takes the references that the put_blkdev label drops again on failure */
bdev = blkdev_get_no_open ( dev ) ;
if ( ! bdev )
return ERR_PTR ( - ENXIO ) ;
disk = bdev - > bd_disk ;
2020-08-31 21:02:36 +03:00
2020-11-23 12:19:22 +03:00
/*
 * Exclusive open : start the claim before opening . It is completed by
 * bd_finish_claiming ( ) on success or undone at abort_claiming .
 */
if ( mode & FMODE_EXCL ) {
2020-11-25 23:20:08 +03:00
ret = bd_prepare_to_claim ( bdev , holder ) ;
2020-11-23 12:19:22 +03:00
if ( ret )
2020-11-25 23:20:08 +03:00
goto put_blkdev ;
2020-11-23 12:19:22 +03:00
}
/* disk events stay blocked while the device is being opened */
disk_block_events ( disk ) ;
2021-05-25 09:12:56 +03:00
mutex_lock ( & disk - > open_mutex ) ;
2021-05-25 09:12:54 +03:00
/* -ENXIO is the result if disk_live ( ) reports false below */
ret = - ENXIO ;
2021-08-09 09:40:28 +03:00
if ( ! disk_live ( disk ) )
2021-05-25 09:12:54 +03:00
goto abort_claiming ;
if ( bdev_is_partition ( bdev ) )
ret = blkdev_get_part ( bdev , mode ) ;
else
ret = blkdev_get_whole ( bdev , mode ) ;
2020-11-26 11:23:26 +03:00
if ( ret )
goto abort_claiming ;
if ( mode & FMODE_EXCL ) {
2020-11-25 23:20:08 +03:00
bd_finish_claiming ( bdev , holder ) ;
2020-11-23 12:19:22 +03:00
/*
* Block event polling for write claims if requested . Any write
* holder makes the write_holder state stick until all are
* released . This is good enough and tracking individual
* writeable reference is too fragile given the way @ mode is
* used in blkdev_get / put ( ) .
*/
if ( ( mode & FMODE_WRITE ) & & ! bdev - > bd_write_holder & &
( disk - > flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE ) ) {
bdev - > bd_write_holder = true ;
unblock_events = false ;
}
}
2021-05-25 09:12:56 +03:00
mutex_unlock ( & disk - > open_mutex ) ;
2020-11-23 12:19:22 +03:00
if ( unblock_events )
disk_unblock_events ( disk ) ;
2020-11-26 11:23:26 +03:00
return bdev ;
2020-11-23 12:19:22 +03:00
2020-11-26 11:23:26 +03:00
/*
 * Error unwind : drop the pending claim ( exclusive opens only ) , the
 * open_mutex and the event block , then fall through to release the
 * references taken by blkdev_get_no_open ( ) .
 */
abort_claiming :
if ( mode & FMODE_EXCL )
2020-11-25 23:20:08 +03:00
bd_abort_claiming ( bdev , holder ) ;
2021-05-25 09:12:56 +03:00
mutex_unlock ( & disk - > open_mutex ) ;
2020-11-26 11:23:26 +03:00
disk_unblock_events ( disk ) ;
put_blkdev :
blkdev_put_no_open ( bdev ) ;
return ERR_PTR ( ret ) ;
[PATCH] lockdep: simplify some aspects of bd_mutex nesting
When we open (actually blkdev_get) a partition we need to also open (get) the
whole device that holds the partition. The involves some limited recursion.
This patch tries to simplify some aspects of this.
As well as opening the whole device, we need to increment ->bd_part_count when
a partition is opened (this is used by rescan_partitions to avoid a rescan if
any partition is active, as that would be confusing).
The main change this patch makes is to move the inc/dec of bd_part_count into
blkdev_{get,put} for the whole rather than doing it in blkdev_{get,put} for
the partition.
More specifically, we introduce __blkdev_get and __blkdev_put which do exactly
what blkdev_{get,put} did, only with an extra "for_part" argument
(blkget_{get,put} then call the __ version with a '0' for the extra argument).
If for_part is 1, then the blkdev is being get(put) because a partition is
being opened(closed) for the first(last) time, and so bd_part_count should be
updated (on success). The particular advantage of pushing this function down
is that the bd_mutex lock (which is needed to update bd_part_count) is already
held at the lower level.
Note that this slightly changes the semantics of bd_part_count. Instead of
updating it whenever a partition is opened or released, it is now only updated
on the first open or last release. This is an adequate semantic as it is only
ever tested for "== 0".
Having introduced these functions we remove the current bd_part_count updates
from do_open (which is really the body of blkdev_get) and call
__blkdev_get(... 1). Similarly in blkget_put we remove the old bd_part_count
updates and call __blkget_put(..., 1). This call is moved to the end of
__blkdev_put to avoid nested locks of bd_mutex.
Finally the mutex_lock on whole->bd_mutex in do_open can be removed. It was
only really needed to protect bd_part_count, and that is now managed (and
protected) within the recursive call.
The observation that bd_part_count is central to the locking issues, and the
modifications to create __blkdev_put are from Peter Zijlstra.
Cc: Ingo Molnar <mingo@elte.hu>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-08 13:36:16 +03:00
}
2020-11-23 15:38:40 +03:00
EXPORT_SYMBOL ( blkdev_get_by_dev ) ;
2005-04-17 02:20:36 +04:00
2010-11-13 13:55:18 +03:00
/**
 * blkdev_get_by_path - open a block device by name
 * @path: path to the block device to open
 * @mode: FMODE_* mask
 * @holder: exclusive holder identifier
 *
 * Open the block device described by the device file at @path.  If @mode
 * includes %FMODE_EXCL, the block device is opened with exclusive access.
 * Specifying %FMODE_EXCL with a %NULL @holder is invalid.  Exclusive opens may
 * nest for the same @holder.
 *
 * If @mode includes %FMODE_WRITE and the device turns out to be read-only,
 * the device is put again and ERR_PTR(-EACCES) is returned.
 *
 * CONTEXT:
 * Might sleep.
 *
 * RETURNS:
 * Reference to the block_device on success, ERR_PTR(-errno) on failure.
 */
struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
					void *holder)
{
	struct block_device *bdev;
	dev_t dev;
	int error;

	error = lookup_bdev(path, &dev);
	if (error)
		return ERR_PTR(error);

	bdev = blkdev_get_by_dev(dev, mode, holder);
	if (IS_ERR(bdev))
		return bdev;

	/* no write access to a read-only device: undo the open */
	if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) {
		blkdev_put(bdev, mode);
		return ERR_PTR(-EACCES);
	}

	return bdev;
}
EXPORT_SYMBOL ( blkdev_get_by_path ) ;
2013-05-06 06:11:03 +04:00
void blkdev_put ( struct block_device * bdev , fmode_t mode )
2006-12-08 13:36:13 +03:00
{
struct gendisk * disk = bdev - > bd_disk ;
2020-03-25 00:48:27 +03:00
/*
* Sync early if it looks like we ' re the last one . If someone else
* opens the block device between now and the decrement of bd_openers
* then we did a sync that we didn ' t need to , but that ' s not the end
* of the world and we want to avoid long ( could be several minute )
* syncs while holding the mutex .
*/
if ( bdev - > bd_openers = = 1 )
sync_blockdev ( bdev ) ;
2021-05-25 09:12:56 +03:00
mutex_lock ( & disk - > open_mutex ) ;
block: make blkdev_get/put() handle exclusive access
Over time, block layer has accumulated a set of APIs dealing with bdev
open, close, claim and release.
* blkdev_get/put() are the primary open and close functions.
* bd_claim/release() deal with exclusive open.
* open/close_bdev_exclusive() are combination of open and claim and
the other way around, respectively.
* bd_link/unlink_disk_holder() to create and remove holder/slave
symlinks.
* open_by_devnum() wraps bdget() + blkdev_get().
The interface is a bit confusing and the decoupling of open and claim
makes it impossible to properly guarantee exclusive access as
in-kernel open + claim sequence can disturb the existing exclusive
open even before the block layer knows the current open is for another
exclusive access. Reorganize the interface such that,
* blkdev_get() is extended to include exclusive access management.
@holder argument is added and, if @FMODE_EXCL is specified, it will
gain exclusive access atomically w.r.t. other exclusive accesses.
* blkdev_put() is similarly extended. It now takes @mode argument and
if @FMODE_EXCL is set, it releases an exclusive access. Also, when
the last exclusive claim is released, the holder/slave symlinks are
removed automatically.
* bd_claim/release() and close_bdev_exclusive() are no longer
necessary and either made static or removed.
* bd_link_disk_holder() remains the same but bd_unlink_disk_holder()
is no longer necessary and removed.
* open_bdev_exclusive() becomes a simple wrapper around lookup_bdev()
and blkdev_get(). It also has an unexpected extra bdev_read_only()
test which probably should be moved into blkdev_get().
* open_by_devnum() is modified to take @holder argument and pass it to
blkdev_get().
Most of bdev open/close operations are unified into blkdev_get/put()
and most exclusive accesses are tested atomically at the open time (as
it should). This cleans up code and removes some, both valid and
invalid, but unnecessary all the same, corner cases.
open_bdev_exclusive() and open_by_devnum() can use further cleanup -
rename to blkdev_get_by_path() and blkdev_get_by_devt() and drop
special features. Well, let's leave them for another day.
Most conversions are straight-forward. drbd conversion is a bit more
involved as there was some reordering, but the logic should stay the
same.
Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Neil Brown <neilb@suse.de>
Acked-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Acked-by: Mike Snitzer <snitzer@redhat.com>
Acked-by: Philipp Reisner <philipp.reisner@linbit.com>
Cc: Peter Osterlund <petero2@telia.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Mark Fasheh <mfasheh@suse.com>
Cc: Joel Becker <joel.becker@oracle.com>
Cc: Alex Elder <aelder@sgi.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: dm-devel@redhat.com
Cc: drbd-dev@lists.linbit.com
Cc: Leo Chen <leochen@broadcom.com>
Cc: Scott Branden <sbranden@broadcom.com>
Cc: Chris Mason <chris.mason@oracle.com>
Cc: Steven Whitehouse <swhiteho@redhat.com>
Cc: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
Cc: Joern Engel <joern@logfs.org>
Cc: reiserfs-devel@vger.kernel.org
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
2010-11-13 13:55:17 +03:00
if ( mode & FMODE_EXCL ) {
2020-11-23 15:29:55 +03:00
struct block_device * whole = bdev_whole ( bdev ) ;
2010-11-13 13:55:17 +03:00
bool bdev_free ;
/*
* Release a claim on the device . The holder fields
2021-05-25 09:12:56 +03:00
* are protected with bdev_lock . open_mutex is to
2010-11-13 13:55:17 +03:00
* synchronize disk_holder unlinking .
*/
spin_lock ( & bdev_lock ) ;
WARN_ON_ONCE ( - - bdev - > bd_holders < 0 ) ;
2020-11-23 15:29:55 +03:00
WARN_ON_ONCE ( - - whole - > bd_holders < 0 ) ;
2010-11-13 13:55:17 +03:00
if ( ( bdev_free = ! bdev - > bd_holders ) )
bdev - > bd_holder = NULL ;
2020-11-23 15:29:55 +03:00
if ( ! whole - > bd_holders )
whole - > bd_holder = NULL ;
2010-11-13 13:55:17 +03:00
spin_unlock ( & bdev_lock ) ;
implement in-kernel gendisk events handling
Currently, media presence polling for removeable block devices is done
from userland. There are several issues with this.
* Polling is done by periodically opening the device. For SCSI
devices, the command sequence generated by such action involves a
few different commands including TEST_UNIT_READY. This behavior,
while perfectly legal, is different from Windows which only issues
single command, GET_EVENT_STATUS_NOTIFICATION. Unfortunately, some
ATAPI devices lock up after being periodically queried such command
sequences.
* There is no reliable and unintrusive way for a userland program to
tell whether the target device is safe for media presence polling.
For example, polling for media presence during an on-going burning
session can make it fail. The polling program can avoid this by
opening the device with O_EXCL but then it risks making a valid
exclusive user of the device fail w/ -EBUSY.
* Userland polling is unnecessarily heavy and in-kernel implementation
is lighter and better coordinated (workqueue, timer slack).
This patch implements framework for in-kernel disk event handling,
which includes media presence polling.
* bdops->check_events() is added, which supercedes ->media_changed().
It should check whether there's any pending event and return if so.
Currently, two events are defined - DISK_EVENT_MEDIA_CHANGE and
DISK_EVENT_EJECT_REQUEST. ->check_events() is guaranteed not to be
called parallelly.
* gendisk->events and ->async_events are added. These should be
initialized by block driver before passing the device to add_disk().
The former contains the mask of all supported events and the latter
the mask of all events which the device can report without polling.
/sys/block/*/events[_async] export these to userland.
* Kernel parameter block.events_dfl_poll_msecs controls the system
polling interval (default is 0 which means disable) and
/sys/block/*/events_poll_msecs control polling intervals for
individual devices (default is -1 meaning use system setting). Note
that if a device can report all supported events asynchronously and
its polling interval isn't explicitly set, the device won't be
polled regardless of the system polling interval.
* If a device is opened exclusively with write access, event checking
is automatically disabled until all write exclusive accesses are
released.
* There are event 'clearing' events. For example, both of currently
defined events are cleared after the device has been successfully
opened. This information is passed to ->check_events() callback
using @clearing argument as a hint.
* Event checking is always performed from system_nrt_wq and timer
slack is set to 25% for polling.
* Nothing changes for drivers which implement ->media_changed() but
not ->check_events(). Going forward, all drivers will be converted
to ->check_events() and ->media_change() will be dropped.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Kay Sievers <kay.sievers@vrfy.org>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
2010-12-08 22:57:37 +03:00
/*
* If this was the last claim , remove holder link and
* unblock evpoll if it was a write holder .
*/
2011-07-01 18:17:47 +04:00
if ( bdev_free & & bdev - > bd_write_holder ) {
2020-11-23 12:19:22 +03:00
disk_unblock_events ( disk ) ;
2011-07-01 18:17:47 +04:00
bdev - > bd_write_holder = false ;
implement in-kernel gendisk events handling
Currently, media presence polling for removeable block devices is done
from userland. There are several issues with this.
* Polling is done by periodically opening the device. For SCSI
devices, the command sequence generated by such action involves a
few different commands including TEST_UNIT_READY. This behavior,
while perfectly legal, is different from Windows which only issues
single command, GET_EVENT_STATUS_NOTIFICATION. Unfortunately, some
ATAPI devices lock up after being periodically queried such command
sequences.
* There is no reliable and unintrusive way for a userland program to
tell whether the target device is safe for media presence polling.
For example, polling for media presence during an on-going burning
session can make it fail. The polling program can avoid this by
opening the device with O_EXCL but then it risks making a valid
exclusive user of the device fail w/ -EBUSY.
* Userland polling is unnecessarily heavy and in-kernel implementation
is lighter and better coordinated (workqueue, timer slack).
This patch implements framework for in-kernel disk event handling,
which includes media presence polling.
* bdops->check_events() is added, which supercedes ->media_changed().
It should check whether there's any pending event and return if so.
Currently, two events are defined - DISK_EVENT_MEDIA_CHANGE and
DISK_EVENT_EJECT_REQUEST. ->check_events() is guaranteed not to be
called parallelly.
* gendisk->events and ->async_events are added. These should be
initialized by block driver before passing the device to add_disk().
The former contains the mask of all supported events and the latter
the mask of all events which the device can report without polling.
/sys/block/*/events[_async] export these to userland.
* Kernel parameter block.events_dfl_poll_msecs controls the system
polling interval (default is 0 which means disable) and
/sys/block/*/events_poll_msecs control polling intervals for
individual devices (default is -1 meaning use system setting). Note
that if a device can report all supported events asynchronously and
its polling interval isn't explicitly set, the device won't be
polled regardless of the system polling interval.
* If a device is opened exclusively with write access, event checking
is automatically disabled until all write exclusive accesses are
released.
* There are event 'clearing' events. For example, both of currently
defined events are cleared after the device has been successfully
opened. This information is passed to ->check_events() callback
using @clearing argument as a hint.
* Event checking is always performed from system_nrt_wq and timer
slack is set to 25% for polling.
* Nothing changes for drivers which implement ->media_changed() but
not ->check_events(). Going forward, all drivers will be converted
to ->check_events() and ->media_change() will be dropped.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Kay Sievers <kay.sievers@vrfy.org>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
2010-12-08 22:57:37 +03:00
}
2011-03-09 21:54:27 +03:00
}
implement in-kernel gendisk events handling
Currently, media presence polling for removeable block devices is done
from userland. There are several issues with this.
* Polling is done by periodically opening the device. For SCSI
devices, the command sequence generated by such action involves a
few different commands including TEST_UNIT_READY. This behavior,
while perfectly legal, is different from Windows which only issues a
single command, GET_EVENT_STATUS_NOTIFICATION. Unfortunately, some
ATAPI devices lock up after being periodically queried with such command
sequences.
* There is no reliable and unintrusive way for a userland program to
tell whether the target device is safe for media presence polling.
For example, polling for media presence during an on-going burning
session can make it fail. The polling program can avoid this by
opening the device with O_EXCL but then it risks making a valid
exclusive user of the device fail w/ -EBUSY.
* Userland polling is unnecessarily heavy and in-kernel implementation
is lighter and better coordinated (workqueue, timer slack).
This patch implements framework for in-kernel disk event handling,
which includes media presence polling.
* bdops->check_events() is added, which supersedes ->media_changed().
It should check whether there's any pending event and return if so.
Currently, two events are defined - DISK_EVENT_MEDIA_CHANGE and
DISK_EVENT_EJECT_REQUEST. ->check_events() is guaranteed not to be
called in parallel.
* gendisk->events and ->async_events are added. These should be
initialized by block driver before passing the device to add_disk().
The former contains the mask of all supported events and the latter
the mask of all events which the device can report without polling.
/sys/block/*/events[_async] export these to userland.
* Kernel parameter block.events_dfl_poll_msecs controls the system
polling interval (default is 0 which means disable) and
/sys/block/*/events_poll_msecs control polling intervals for
individual devices (default is -1 meaning use system setting). Note
that if a device can report all supported events asynchronously and
its polling interval isn't explicitly set, the device won't be
polled regardless of the system polling interval.
* If a device is opened exclusively with write access, event checking
is automatically disabled until all write exclusive accesses are
released.
* There are event 'clearing' events. For example, both of currently
defined events are cleared after the device has been successfully
opened. This information is passed to ->check_events() callback
using @clearing argument as a hint.
* Event checking is always performed from system_nrt_wq and timer
slack is set to 25% for polling.
* Nothing changes for drivers which implement ->media_changed() but
not ->check_events(). Going forward, all drivers will be converted
to ->check_events() and ->media_changed() will be dropped.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Kay Sievers <kay.sievers@vrfy.org>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
2010-12-08 22:57:37 +03:00
2011-07-01 18:17:47 +04:00
/*
* Trigger event checking and tell drivers to flush MEDIA_CHANGE
* event . This is to ensure detection of media removal commanded
* from userland - e . g . eject ( 1 ) .
*/
2020-11-23 12:19:22 +03:00
disk_flush_events ( disk , DISK_EVENT_MEDIA_CHANGE ) ;
2011-07-01 18:17:47 +04:00
2021-05-25 09:12:58 +03:00
if ( bdev_is_partition ( bdev ) )
blkdev_put_part ( bdev , mode ) ;
else
blkdev_put_whole ( bdev , mode ) ;
2021-05-25 09:12:56 +03:00
mutex_unlock ( & disk - > open_mutex ) ;
2020-11-26 11:23:26 +03:00
blkdev_put_no_open ( bdev ) ;
[PATCH] lockdep: simplify some aspects of bd_mutex nesting
When we open (actually blkdev_get) a partition we need to also open (get) the
whole device that holds the partition. This involves some limited recursion.
This patch tries to simplify some aspects of this.
As well as opening the whole device, we need to increment ->bd_part_count when
a partition is opened (this is used by rescan_partitions to avoid a rescan if
any partition is active, as that would be confusing).
The main change this patch makes is to move the inc/dec of bd_part_count into
blkdev_{get,put} for the whole rather than doing it in blkdev_{get,put} for
the partition.
More specifically, we introduce __blkdev_get and __blkdev_put which do exactly
what blkdev_{get,put} did, only with an extra "for_part" argument
(blkdev_{get,put} then call the __ version with a '0' for the extra argument).
If for_part is 1, then the blkdev is being get(put) because a partition is
being opened(closed) for the first(last) time, and so bd_part_count should be
updated (on success). The particular advantage of pushing this function down
is that the bd_mutex lock (which is needed to update bd_part_count) is already
held at the lower level.
Note that this slightly changes the semantics of bd_part_count. Instead of
updating it whenever a partition is opened or released, it is now only updated
on the first open or last release. This is an adequate semantic as it is only
ever tested for "== 0".
Having introduced these functions we remove the current bd_part_count updates
from do_open (which is really the body of blkdev_get) and call
__blkdev_get(... 1). Similarly in blkdev_put we remove the old bd_part_count
updates and call __blkdev_put(..., 1). This call is moved to the end of
__blkdev_put to avoid nested locks of bd_mutex.
Finally the mutex_lock on whole->bd_mutex in do_open can be removed. It was
only really needed to protect bd_part_count, and that is now managed (and
protected) within the recursive call.
The observation that bd_part_count is central to the locking issues, and the
modifications to create __blkdev_put are from Peter Zijlstra.
Cc: Ingo Molnar <mingo@elte.hu>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-08 13:36:16 +03:00
}
2006-12-08 13:36:13 +03:00
EXPORT_SYMBOL ( blkdev_put ) ;
2005-04-17 02:20:36 +04:00
/**
* lookup_bdev - lookup a struct block_device by name
2009-01-07 01:41:15 +03:00
* @ pathname : special file representing the block device
2020-12-29 06:47:06 +03:00
* @ dev : return value of the block device ' s dev_t
2005-04-17 02:20:36 +04:00
*
2021-10-21 10:13:43 +03:00
* Lookup the block device ' s dev_t at @ pathname in the current
* namespace if possible and return it by @ dev .
*
* RETURNS :
* 0 if succeeded , errno otherwise .
2005-04-17 02:20:36 +04:00
*/
2020-11-23 15:38:40 +03:00
int lookup_bdev ( const char * pathname , dev_t * dev )
2005-04-17 02:20:36 +04:00
{
struct inode * inode ;
2008-08-02 09:04:36 +04:00
struct path path ;
2005-04-17 02:20:36 +04:00
int error ;
2008-08-02 09:04:36 +04:00
if ( ! pathname | | ! * pathname )
2020-11-23 15:38:40 +03:00
return - EINVAL ;
2005-04-17 02:20:36 +04:00
2008-08-02 09:04:36 +04:00
error = kern_path ( pathname , LOOKUP_FOLLOW , & path ) ;
2005-04-17 02:20:36 +04:00
if ( error )
2020-11-23 15:38:40 +03:00
return error ;
2005-04-17 02:20:36 +04:00
2015-03-18 01:26:21 +03:00
inode = d_backing_inode ( path . dentry ) ;
2005-04-17 02:20:36 +04:00
error = - ENOTBLK ;
if ( ! S_ISBLK ( inode - > i_mode ) )
2020-11-23 15:38:40 +03:00
goto out_path_put ;
2005-04-17 02:20:36 +04:00
error = - EACCES ;
2016-06-09 23:34:02 +03:00
if ( ! may_open_dev ( & path ) )
2020-11-23 15:38:40 +03:00
goto out_path_put ;
* dev = inode - > i_rdev ;
error = 0 ;
out_path_put :
2008-08-02 09:04:36 +04:00
path_put ( & path ) ;
2020-11-23 15:38:40 +03:00
return error ;
2005-04-17 02:20:36 +04:00
}
2008-08-01 13:00:11 +04:00
EXPORT_SYMBOL ( lookup_bdev ) ;
2005-04-17 02:20:36 +04:00
2011-02-24 09:25:47 +03:00
/*
 * Invalidate cached state associated with @bdev.
 *
 * If get_super() finds a superblock for @bdev, its dcache is shrunk and
 * its inodes are invalidated, with @kill_dirty passed straight through to
 * invalidate_inodes().  invalidate_bdev() is then called on @bdev whether
 * or not a superblock was found.
 *
 * Returns the result of invalidate_inodes(), or 0 if there was no
 * superblock.
 */
int __invalidate_device(struct block_device *bdev, bool kill_dirty)
{
	struct super_block *sb;
	int res = 0;

	sb = get_super(bdev);
	if (!sb)
		goto invalidate;

	/*
	 * No need to lock the super: get_super holds the read mutex, so
	 * the filesystem cannot go away under us (->put_super runs with
	 * the write lock held).
	 */
	shrink_dcache_sb(sb);
	res = invalidate_inodes(sb, kill_dirty);
	drop_super(sb);

invalidate:
	invalidate_bdev(bdev);
	return res;
}
EXPORT_SYMBOL(__invalidate_device);
2012-07-03 18:45:31 +04:00
2021-10-19 09:25:30 +03:00
void sync_bdevs ( bool wait )
2012-07-03 18:45:31 +04:00
{
struct inode * inode , * old_inode = NULL ;
2015-03-04 20:37:22 +03:00
spin_lock ( & blockdev_superblock - > s_inode_list_lock ) ;
2012-07-03 18:45:31 +04:00
list_for_each_entry ( inode , & blockdev_superblock - > s_inodes , i_sb_list ) {
struct address_space * mapping = inode - > i_mapping ;
2016-12-01 11:18:28 +03:00
struct block_device * bdev ;
2012-07-03 18:45:31 +04:00
spin_lock ( & inode - > i_lock ) ;
if ( inode - > i_state & ( I_FREEING | I_WILL_FREE | I_NEW ) | |
mapping - > nrpages = = 0 ) {
spin_unlock ( & inode - > i_lock ) ;
continue ;
}
__iget ( inode ) ;
spin_unlock ( & inode - > i_lock ) ;
2015-03-04 20:37:22 +03:00
spin_unlock ( & blockdev_superblock - > s_inode_list_lock ) ;
2012-07-03 18:45:31 +04:00
/*
* We hold a reference to ' inode ' so it couldn ' t have been
* removed from s_inodes list while we dropped the
2015-03-04 20:37:22 +03:00
* s_inode_list_lock We cannot iput the inode now as we can
2012-07-03 18:45:31 +04:00
* be holding the last reference and we cannot iput it under
2015-03-04 20:37:22 +03:00
* s_inode_list_lock . So we keep the reference and iput it
2012-07-03 18:45:31 +04:00
* later .
*/
iput ( old_inode ) ;
old_inode = inode ;
2016-12-01 11:18:28 +03:00
bdev = I_BDEV ( inode ) ;
2012-07-03 18:45:31 +04:00
2021-05-25 09:12:56 +03:00
mutex_lock ( & bdev - > bd_disk - > open_mutex ) ;
2021-10-19 09:25:30 +03:00
if ( ! bdev - > bd_openers ) {
; /* skip */
} else if ( wait ) {
/*
* We keep the error status of individual mapping so
* that applications can catch the writeback error using
* fsync ( 2 ) . See filemap_fdatawait_keep_errors ( ) for
* details .
*/
filemap_fdatawait_keep_errors ( inode - > i_mapping ) ;
} else {
filemap_fdatawrite ( inode - > i_mapping ) ;
}
2021-05-25 09:12:56 +03:00
mutex_unlock ( & bdev - > bd_disk - > open_mutex ) ;
2012-07-03 18:45:31 +04:00
2015-03-04 20:37:22 +03:00
spin_lock ( & blockdev_superblock - > s_inode_list_lock ) ;
2012-07-03 18:45:31 +04:00
}
2015-03-04 20:37:22 +03:00
spin_unlock ( & blockdev_superblock - > s_inode_list_lock ) ;
2012-07-03 18:45:31 +04:00
iput ( old_inode ) ;
}