// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2017-2018 Christoph Hellwig.
 */

#include <linux/backing-dev.h>
#include <linux/moduleparam.h>
#include <linux/vmalloc.h>
#include <trace/events/block.h>
#include "nvme.h"

bool multipath = true;
module_param(multipath, bool, 0444);
MODULE_PARM_DESC(multipath,
	"turn on native support for multiple controllers per subsystem");

static const char *nvme_iopolicy_names[] = {
	[NVME_IOPOLICY_NUMA]	= "numa",
	[NVME_IOPOLICY_RR]	= "round-robin",
};

static int iopolicy = NVME_IOPOLICY_NUMA;

static int nvme_set_iopolicy(const char *val, const struct kernel_param *kp)
{
	if (!val)
		return -EINVAL;
	if (!strncmp(val, "numa", 4))
		iopolicy = NVME_IOPOLICY_NUMA;
	else if (!strncmp(val, "round-robin", 11))
		iopolicy = NVME_IOPOLICY_RR;
	else
		return -EINVAL;
	return 0;
}

static int nvme_get_iopolicy(char *buf, const struct kernel_param *kp)
{
	return sprintf(buf, "%s\n", nvme_iopolicy_names[iopolicy]);
}

module_param_call(iopolicy, nvme_set_iopolicy, nvme_get_iopolicy,
	&iopolicy, 0644);
MODULE_PARM_DESC(iopolicy,
	"Default multipath I/O policy; 'numa' (default) or 'round-robin'");

void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys)
{
	subsys->iopolicy = iopolicy;
}

void nvme_mpath_unfreeze(struct nvme_subsystem *subsys)
{
	struct nvme_ns_head *h;

	lockdep_assert_held(&subsys->lock);
	list_for_each_entry(h, &subsys->nsheads, entry)
		if (h->disk)
			blk_mq_unfreeze_queue(h->disk->queue);
}

void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys)
{
	struct nvme_ns_head *h;

	lockdep_assert_held(&subsys->lock);
	list_for_each_entry(h, &subsys->nsheads, entry)
		if (h->disk)
			blk_mq_freeze_queue_wait(h->disk->queue);
}

void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
{
	struct nvme_ns_head *h;

	lockdep_assert_held(&subsys->lock);
	list_for_each_entry(h, &subsys->nsheads, entry)
		if (h->disk)
			blk_freeze_queue_start(h->disk->queue);
}

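/*
 * Fail over a request on the multipath node: clear the current path, kick off
 * an ANA log re-read if needed, and move the bios back to the ns_head requeue
 * list so they can be resubmitted on another path.
 */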
void nvme_failover_req(struct request *req)
{
	struct nvme_ns *ns = req->q->queuedata;
	u16 status = nvme_req(req)->status & 0x7ff;
	unsigned long flags;
	struct bio *bio;

	nvme_mpath_clear_current_path(ns);

	/*
	 * If we got back an ANA error, we know the controller is alive but not
	 * ready to serve this namespace.  Kick off a re-read of the ANA
	 * information page, and just try any other available path for now.
	 */
	if (nvme_is_ana_error(status) && ns->ctrl->ana_log_buf) {
		set_bit(NVME_NS_ANA_PENDING, &ns->flags);
		queue_work(nvme_wq, &ns->ctrl->ana_work);
	}

	spin_lock_irqsave(&ns->head->requeue_lock, flags);
	for (bio = req->bio; bio; bio = bio->bi_next) {
		bio_set_dev(bio, ns->head->disk->part0);
		if (bio->bi_opf & REQ_POLLED) {
			bio->bi_opf &= ~REQ_POLLED;
			bio->bi_cookie = BLK_QC_T_NONE;
		}
	}
	blk_steal_bios(&ns->head->requeue_list, req);
	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);

	blk_mq_end_request(req, 0);
	kblockd_schedule_work(&ns->head->requeue_work);
}

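/*
 * Account I/O statistics against the multipath node so that iostat on the
 * nvmeXnY device reflects requests issued on any of its paths.
 */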
void nvme_mpath_start_request(struct request *rq)
{
	struct nvme_ns *ns = rq->q->queuedata;
	struct gendisk *disk = ns->head->disk;

	if (!blk_queue_io_stat(disk->queue) || blk_rq_is_passthrough(rq))
		return;

	nvme_req(rq)->flags |= NVME_MPATH_IO_STATS;
	nvme_req(rq)->start_time = bdev_start_io_acct(disk->part0, req_op(rq),
						      jiffies);
}
EXPORT_SYMBOL_GPL(nvme_mpath_start_request);

void nvme_mpath_end_request(struct request *rq)
{
	struct nvme_ns *ns = rq->q->queuedata;

	if (!(nvme_req(rq)->flags & NVME_MPATH_IO_STATS))
		return;
	bdev_end_io_acct(ns->head->disk->part0, req_op(rq),
			 blk_rq_bytes(rq) >> SECTOR_SHIFT,
			 nvme_req(rq)->start_time);
}

void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;

	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list) {
		if (!ns->head->disk)
			continue;
		kblockd_schedule_work(&ns->head->requeue_work);
		if (ctrl->state == NVME_CTRL_LIVE)
			disk_uevent(ns->head->disk, KOBJ_CHANGE);
	}
	up_read(&ctrl->namespaces_rwsem);
}

static const char *nvme_ana_state_names[] = {
	[0]				= "invalid state",
	[NVME_ANA_OPTIMIZED]		= "optimized",
	[NVME_ANA_NONOPTIMIZED]		= "non-optimized",
	[NVME_ANA_INACCESSIBLE]		= "inaccessible",
	[NVME_ANA_PERSISTENT_LOSS]	= "persistent-loss",
	[NVME_ANA_CHANGE]		= "change",
};

bool nvme_mpath_clear_current_path(struct nvme_ns *ns)
{
	struct nvme_ns_head *head = ns->head;
	bool changed = false;
	int node;

	if (!head)
		goto out;

	for_each_node(node) {
		if (ns == rcu_access_pointer(head->current_path[node])) {
			rcu_assign_pointer(head->current_path[node], NULL);
			changed = true;
		}
	}
out:
	return changed;
}

void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;

	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list) {
		nvme_mpath_clear_current_path(ns);
		kblockd_schedule_work(&ns->head->requeue_work);
	}
	up_read(&ctrl->namespaces_rwsem);
}

void nvme_mpath_revalidate_paths(struct nvme_ns *ns)
{
	struct nvme_ns_head *head = ns->head;
	sector_t capacity = get_capacity(head->disk);
	int node;
	int srcu_idx;

	srcu_idx = srcu_read_lock(&head->srcu);
	list_for_each_entry_rcu(ns, &head->list, siblings) {
		if (capacity != get_capacity(ns->disk))
			clear_bit(NVME_NS_READY, &ns->flags);
	}
	srcu_read_unlock(&head->srcu, srcu_idx);

	for_each_node(node)
		rcu_assign_pointer(head->current_path[node], NULL);
	kblockd_schedule_work(&head->requeue_work);
}

static bool nvme_path_is_disabled(struct nvme_ns *ns)
{
	/*
	 * We don't treat NVME_CTRL_DELETING as a disabled path as I/O should
	 * still be able to complete assuming that the controller is connected.
	 * Otherwise it will fail immediately and return to the requeue list.
	 */
	if (ns->ctrl->state != NVME_CTRL_LIVE &&
	    ns->ctrl->state != NVME_CTRL_DELETING)
		return true;
	if (test_bit(NVME_NS_ANA_PENDING, &ns->flags) ||
	    !test_bit(NVME_NS_READY, &ns->flags))
		return true;
	return false;
}

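/*
 * Select the path to use for the given NUMA node: prefer an optimized path
 * (the closest one by NUMA distance when the numa iopolicy is active), fall
 * back to a non-optimized path, and cache the result in
 * head->current_path[node].
 */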
static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
{
	int found_distance = INT_MAX, fallback_distance = INT_MAX, distance;
	struct nvme_ns *found = NULL, *fallback = NULL, *ns;

	list_for_each_entry_rcu(ns, &head->list, siblings) {
		if (nvme_path_is_disabled(ns))
			continue;

		if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA)
			distance = node_distance(node, ns->ctrl->numa_node);
		else
			distance = LOCAL_DISTANCE;

		switch (ns->ana_state) {
		case NVME_ANA_OPTIMIZED:
			if (distance < found_distance) {
				found_distance = distance;
				found = ns;
			}
			break;
		case NVME_ANA_NONOPTIMIZED:
			if (distance < fallback_distance) {
				fallback_distance = distance;
				fallback = ns;
			}
			break;
		default:
			break;
		}
	}

	if (!found)
		found = fallback;
	if (found)
		rcu_assign_pointer(head->current_path[node], found);
	return found;
}

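/*
 * Round-robin selection: starting from the most recently used path, pick the
 * next usable path, preferring optimized paths over non-optimized ones.
 */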
static struct nvme_ns *nvme_next_ns(struct nvme_ns_head *head,
		struct nvme_ns *ns)
{
	ns = list_next_or_null_rcu(&head->list, &ns->siblings, struct nvme_ns,
			siblings);
	if (ns)
		return ns;
	return list_first_or_null_rcu(&head->list, struct nvme_ns, siblings);
}

static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head,
		int node, struct nvme_ns *old)
{
	struct nvme_ns *ns, *found = NULL;

	if (list_is_singular(&head->list)) {
		if (nvme_path_is_disabled(old))
			return NULL;
		return old;
	}

	for (ns = nvme_next_ns(head, old);
	     ns && ns != old;
	     ns = nvme_next_ns(head, ns)) {
		if (nvme_path_is_disabled(ns))
			continue;

		if (ns->ana_state == NVME_ANA_OPTIMIZED) {
			found = ns;
			goto out;
		}
		if (ns->ana_state == NVME_ANA_NONOPTIMIZED)
			found = ns;
	}

	/*
	 * The loop above skips the current path for round-robin semantics.
	 * Fall back to the current path if either:
	 *  - no other optimized path found and current is optimized,
	 *  - no other usable path found and current is usable.
	 */
	if (!nvme_path_is_disabled(old) &&
	    (old->ana_state == NVME_ANA_OPTIMIZED ||
	     (!found && old->ana_state == NVME_ANA_NONOPTIMIZED)))
		return old;

	if (!found)
		return NULL;
out:
	rcu_assign_pointer(head->current_path[node], found);
	return found;
}

static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
{
	return ns->ctrl->state == NVME_CTRL_LIVE &&
		ns->ana_state == NVME_ANA_OPTIMIZED;
}

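/*
 * Look up the path to submit on: use the cached per-node path while it is
 * still optimized, otherwise (or for the round-robin policy) select a new
 * one.  Callers must hold head->srcu.
 */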
inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
{
	int node = numa_node_id();
	struct nvme_ns *ns;

	ns = srcu_dereference(head->current_path[node], &head->srcu);
	if (unlikely(!ns))
		return __nvme_find_path(head, node);

	if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR)
		return nvme_round_robin_path(head, node, ns);
	if (unlikely(!nvme_path_is_optimized(ns)))
		return __nvme_find_path(head, node);
	return ns;
}

static bool nvme_available_path(struct nvme_ns_head *head)
{
	struct nvme_ns *ns;

	list_for_each_entry_rcu(ns, &head->list, siblings) {
		if (test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ns->ctrl->flags))
			continue;
		switch (ns->ctrl->state) {
		case NVME_CTRL_LIVE:
		case NVME_CTRL_RESETTING:
		case NVME_CTRL_CONNECTING:
			/* fallthru */
			return true;
		default:
			break;
		}
	}
	return false;
}

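/*
 * submit_bio handler for the multipath node: route each bio to the currently
 * selected path, or park it on the requeue list when no path is usable right
 * now but one may still become available.
 */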
static void nvme_ns_head_submit_bio(struct bio *bio)
{
	struct nvme_ns_head *head = bio->bi_bdev->bd_disk->private_data;
	struct device *dev = disk_to_dev(head->disk);
	struct nvme_ns *ns;
	int srcu_idx;

	/*
	 * The namespace might be going away and the bio might be moved to a
	 * different queue via blk_steal_bios(), so we need to use the bio_split
	 * pool from the original queue to allocate the bvecs from.
	 */
	bio = bio_split_to_limits(bio);
	if (!bio)
		return;

	srcu_idx = srcu_read_lock(&head->srcu);
	ns = nvme_find_path(head);
	if (likely(ns)) {
		bio_set_dev(bio, ns->disk->part0);
		bio->bi_opf |= REQ_NVME_MPATH;
		trace_block_bio_remap(bio, disk_devt(ns->head->disk),
				      bio->bi_iter.bi_sector);
		submit_bio_noacct(bio);
	} else if (nvme_available_path(head)) {
		dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n");

		spin_lock_irq(&head->requeue_lock);
		bio_list_add(&head->requeue_list, bio);
		spin_unlock_irq(&head->requeue_lock);
	} else {
		dev_warn_ratelimited(dev, "no available path - failing I/O\n");

		bio_io_error(bio);
	}

	srcu_read_unlock(&head->srcu, srcu_idx);
}

static int nvme_ns_head_open(struct block_device *bdev, fmode_t mode)
{
	if (!nvme_tryget_ns_head(bdev->bd_disk->private_data))
		return -ENXIO;
	return 0;
}

static void nvme_ns_head_release(struct gendisk *disk, fmode_t mode)
{
	nvme_put_ns_head(disk->private_data);
}

#ifdef CONFIG_BLK_DEV_ZONED
static int nvme_ns_head_report_zones(struct gendisk *disk, sector_t sector,
		unsigned int nr_zones, report_zones_cb cb, void *data)
{
	struct nvme_ns_head *head = disk->private_data;
	struct nvme_ns *ns;
	int srcu_idx, ret = -EWOULDBLOCK;

	srcu_idx = srcu_read_lock(&head->srcu);
	ns = nvme_find_path(head);
	if (ns)
		ret = nvme_ns_report_zones(ns, sector, nr_zones, cb, data);
	srcu_read_unlock(&head->srcu, srcu_idx);
	return ret;
}
#else
#define nvme_ns_head_report_zones	NULL
#endif /* CONFIG_BLK_DEV_ZONED */

const struct block_device_operations nvme_ns_head_ops = {
	.owner		= THIS_MODULE,
	.submit_bio	= nvme_ns_head_submit_bio,
	.open		= nvme_ns_head_open,
	.release	= nvme_ns_head_release,
	.ioctl		= nvme_ns_head_ioctl,
	.compat_ioctl	= blkdev_compat_ptr_ioctl,
	.getgeo		= nvme_getgeo,
	.report_zones	= nvme_ns_head_report_zones,
	.pr_ops		= &nvme_pr_ops,
};

static inline struct nvme_ns_head *cdev_to_ns_head(struct cdev *cdev)
{
	return container_of(cdev, struct nvme_ns_head, cdev);
}

static int nvme_ns_head_chr_open(struct inode *inode, struct file *file)
{
	if (!nvme_tryget_ns_head(cdev_to_ns_head(inode->i_cdev)))
		return -ENXIO;
	return 0;
}

static int nvme_ns_head_chr_release(struct inode *inode, struct file *file)
{
	nvme_put_ns_head(cdev_to_ns_head(inode->i_cdev));
	return 0;
}

static const struct file_operations nvme_ns_head_chr_fops = {
	.owner		= THIS_MODULE,
	.open		= nvme_ns_head_chr_open,
	.release	= nvme_ns_head_chr_release,
	.unlocked_ioctl	= nvme_ns_head_chr_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.uring_cmd	= nvme_ns_head_chr_uring_cmd,
	.uring_cmd_iopoll = nvme_ns_head_chr_uring_cmd_iopoll,
};

static int nvme_add_ns_head_cdev(struct nvme_ns_head *head)
{
	int ret;

	head->cdev_device.parent = &head->subsys->dev;
	ret = dev_set_name(&head->cdev_device, "ng%dn%d",
			   head->subsys->instance, head->instance);
	if (ret)
		return ret;
	ret = nvme_cdev_add(&head->cdev, &head->cdev_device,
			    &nvme_ns_head_chr_fops, THIS_MODULE);
	return ret;
}

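/* Resubmit bios that were queued while no usable path was available. */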
static void nvme_requeue_work(struct work_struct *work)
{
	struct nvme_ns_head *head =
		container_of(work, struct nvme_ns_head, requeue_work);
	struct bio *bio, *next;

	spin_lock_irq(&head->requeue_lock);
	next = bio_list_get(&head->requeue_list);
	spin_unlock_irq(&head->requeue_lock);

	while ((bio = next) != NULL) {
		next = bio->bi_next;
		bio->bi_next = NULL;

		submit_bio_noacct(bio);
	}
}

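/*
 * Set up the requeue machinery for a namespace head and, when native
 * multipath applies, allocate the shared gendisk for it.
 */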
int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
{
	bool vwc = false;

	mutex_init(&head->lock);
	bio_list_init(&head->requeue_list);
	spin_lock_init(&head->requeue_lock);
	INIT_WORK(&head->requeue_work, nvme_requeue_work);

	/*
	 * Add a multipath node if the subsystem supports multiple controllers.
	 * We also do this for private namespaces as the namespace sharing flag
	 * could change after a rescan.
	 */
	if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) ||
	    !nvme_is_unique_nsid(ctrl, head) || !multipath)
		return 0;

	head->disk = blk_alloc_disk(ctrl->numa_node);
	if (!head->disk)
		return -ENOMEM;
	head->disk->fops = &nvme_ns_head_ops;
	head->disk->private_data = head;
	sprintf(head->disk->disk_name, "nvme%dn%d",
			ctrl->subsys->instance, head->instance);

	blk_queue_flag_set(QUEUE_FLAG_NONROT, head->disk->queue);
	blk_queue_flag_set(QUEUE_FLAG_NOWAIT, head->disk->queue);
	blk_queue_flag_set(QUEUE_FLAG_IO_STAT, head->disk->queue);
	/*
	 * This assumes all controllers that refer to a namespace either
	 * support poll queues or not.  That is not a strict guarantee,
	 * but if the assumption is wrong the effect is only suboptimal
	 * performance but not correctness problem.
	 */
	if (ctrl->tagset->nr_maps > HCTX_TYPE_POLL &&
	    ctrl->tagset->map[HCTX_TYPE_POLL].nr_queues)
		blk_queue_flag_set(QUEUE_FLAG_POLL, head->disk->queue);

	/* set to a default value of 512 until the disk is validated */
	blk_queue_logical_block_size(head->disk->queue, 512);
	blk_set_stacking_limits(&head->disk->queue->limits);
	blk_queue_dma_alignment(head->disk->queue, 3);

	/* we need to propagate up the VWC settings */
	if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
		vwc = true;
	blk_queue_write_cache(head->disk->queue, vwc, vwc);
	return 0;
}

static void nvme_mpath_set_live(struct nvme_ns *ns)
{
	struct nvme_ns_head *head = ns->head;
	int rc;

	if (!head->disk)
		return;

	/*
	 * test_and_set_bit() is used because it is protecting against two nvme
	 * paths simultaneously calling device_add_disk() on the same namespace
	 * head.
	 */
	if (!test_and_set_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
		rc = device_add_disk(&head->subsys->dev, head->disk,
				     nvme_ns_id_attr_groups);
		if (rc) {
			clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags);
			return;
		}
		nvme_add_ns_head_cdev(head);
	}

	mutex_lock(&head->lock);
	if (nvme_path_is_optimized(ns)) {
		int node, srcu_idx;

		srcu_idx = srcu_read_lock(&head->srcu);
		for_each_node(node)
			__nvme_find_path(head, node);
		srcu_read_unlock(&head->srcu, srcu_idx);
	}
	mutex_unlock(&head->lock);

	synchronize_srcu(&head->srcu);
	kblockd_schedule_work(&head->requeue_work);
}

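/*
 * Walk all group descriptors in the ANA log buffer, sanity checking each one
 * and invoking the callback until it returns non-zero or the log is exhausted.
 */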
static int nvme_parse_ana_log(struct nvme_ctrl *ctrl, void *data,
		int (*cb)(struct nvme_ctrl *ctrl, struct nvme_ana_group_desc *,
			void *))
{
	void *base = ctrl->ana_log_buf;
	size_t offset = sizeof(struct nvme_ana_rsp_hdr);
	int error, i;

	lockdep_assert_held(&ctrl->ana_lock);

	for (i = 0; i < le16_to_cpu(ctrl->ana_log_buf->ngrps); i++) {
		struct nvme_ana_group_desc *desc = base + offset;
		u32 nr_nsids;
		size_t nsid_buf_size;

		if (WARN_ON_ONCE(offset > ctrl->ana_log_size - sizeof(*desc)))
			return -EINVAL;

		nr_nsids = le32_to_cpu(desc->nnsids);
		nsid_buf_size = flex_array_size(desc, nsids, nr_nsids);

		if (WARN_ON_ONCE(desc->grpid == 0))
			return -EINVAL;
		if (WARN_ON_ONCE(le32_to_cpu(desc->grpid) > ctrl->anagrpmax))
			return -EINVAL;
		if (WARN_ON_ONCE(desc->state == 0))
			return -EINVAL;
		if (WARN_ON_ONCE(desc->state > NVME_ANA_CHANGE))
			return -EINVAL;

		offset += sizeof(*desc);
		if (WARN_ON_ONCE(offset > ctrl->ana_log_size - nsid_buf_size))
			return -EINVAL;

		error = cb(ctrl, desc, data);
		if (error)
			return error;

		offset += nsid_buf_size;
	}
	return 0;
}

static inline bool nvme_state_is_live(enum nvme_ana_state state)
{
	return state == NVME_ANA_OPTIMIZED || state == NVME_ANA_NONOPTIMIZED;
}

static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc,
		struct nvme_ns *ns)
{
	ns->ana_grpid = le32_to_cpu(desc->grpid);
	ns->ana_state = desc->state;
	clear_bit(NVME_NS_ANA_PENDING, &ns->flags);

	/*
	 * nvme_mpath_set_live() will trigger I/O to the multipath path device
	 * and in turn to this path device.  However we cannot accept this I/O
	 * if the controller is not live.  This may deadlock if called from
	 * nvme_mpath_init_identify() and the ctrl will never complete
	 * initialization, preventing I/O from completing.  For this case we
	 * will reprocess the ANA log page in nvme_mpath_update() once the
	 * controller is ready.
	 */
	if (nvme_state_is_live(ns->ana_state) &&
	    ns->ctrl->state == NVME_CTRL_LIVE)
		nvme_mpath_set_live(ns);
}

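/*
 * Both ctrl->namespaces and desc->nsids[] are sorted by nsid, so a single walk
 * over the namespace list is enough.  The log may contain nsids that are not
 * (yet) known to the host, and the host may have namespaces the descriptor
 * does not mention; skip forward on either side until the nsids match.
 */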
static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
		struct nvme_ana_group_desc *desc, void *data)
{
	u32 nr_nsids = le32_to_cpu(desc->nnsids), n = 0;
	unsigned *nr_change_groups = data;
	struct nvme_ns *ns;

	dev_dbg(ctrl->device, "ANA group %d: %s.\n",
			le32_to_cpu(desc->grpid),
			nvme_ana_state_names[desc->state]);

	if (desc->state == NVME_ANA_CHANGE)
		(*nr_change_groups)++;

	if (!nr_nsids)
		return 0;

	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list) {
		unsigned nsid;
again:
		nsid = le32_to_cpu(desc->nsids[n]);
		if (ns->head->ns_id < nsid)
			continue;
		if (ns->head->ns_id == nsid)
			nvme_update_ns_ana_state(desc, ns);
		if (++n == nr_nsids)
			break;
		if (ns->head->ns_id > nsid)
			goto again;
	}
	up_read(&ctrl->namespaces_rwsem);
	return 0;
}

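/*
 * Read the ANA log page from the controller and update the ANA state of all
 * namespaces, arming the ANATT timer while any group is still in change state.
 */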
static int nvme_read_ana_log(struct nvme_ctrl *ctrl)
{
	u32 nr_change_groups = 0;
	int error;

	mutex_lock(&ctrl->ana_lock);
	error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA, 0, NVME_CSI_NVM,
			ctrl->ana_log_buf, ctrl->ana_log_size, 0);
	if (error) {
		dev_warn(ctrl->device, "Failed to get ANA log: %d\n", error);
		goto out_unlock;
	}

	error = nvme_parse_ana_log(ctrl, &nr_change_groups,
			nvme_update_ana_state);
	if (error)
		goto out_unlock;

	/*
	 * In theory we should have an ANATT timer per group as they might enter
	 * the change state at different times.  But that is a lot of overhead
	 * just to protect against a target that keeps entering new change
	 * states while never finishing previous ones.  But we'll still
	 * eventually time out once all groups are in change state, so this
	 * isn't a big deal.
	 *
	 * We also double the ANATT value to provide some slack for transports
	 * or AEN processing overhead.
	 */
	if (nr_change_groups)
		mod_timer(&ctrl->anatt_timer, ctrl->anatt * HZ * 2 + jiffies);
	else
		del_timer_sync(&ctrl->anatt_timer);
out_unlock:
	mutex_unlock(&ctrl->ana_lock);
	return error;
}

static void nvme_ana_work(struct work_struct *work)
{
	struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, ana_work);

	if (ctrl->state != NVME_CTRL_LIVE)
		return;

	nvme_read_ana_log(ctrl);
}

void nvme_mpath_update(struct nvme_ctrl *ctrl)
{
	u32 nr_change_groups = 0;

	if (!ctrl->ana_log_buf)
		return;

	mutex_lock(&ctrl->ana_lock);
	nvme_parse_ana_log(ctrl, &nr_change_groups, nvme_update_ana_state);
	mutex_unlock(&ctrl->ana_lock);
}

static void nvme_anatt_timeout(struct timer_list *t)
{
	struct nvme_ctrl *ctrl = from_timer(ctrl, t, anatt_timer);

	dev_info(ctrl->device, "ANATT timeout, resetting controller.\n");
	nvme_reset_ctrl(ctrl);
}

void nvme_mpath_stop(struct nvme_ctrl *ctrl)
{
	if (!nvme_ctrl_use_ana(ctrl))
		return;
	del_timer_sync(&ctrl->anatt_timer);
	cancel_work_sync(&ctrl->ana_work);
}

#define SUBSYS_ATTR_RW(_name, _mode, _show, _store)  \
	struct device_attribute subsys_attr_##_name =	\
		__ATTR(_name, _mode, _show, _store)

static ssize_t nvme_subsys_iopolicy_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct nvme_subsystem *subsys =
		container_of(dev, struct nvme_subsystem, dev);

	return sysfs_emit(buf, "%s\n",
			  nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]);
}

static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct nvme_subsystem *subsys =
		container_of(dev, struct nvme_subsystem, dev);
	int i;

	for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) {
		if (sysfs_streq(buf, nvme_iopolicy_names[i])) {
			WRITE_ONCE(subsys->iopolicy, i);
			return count;
		}
	}

	return -EINVAL;
}
SUBSYS_ATTR_RW(iopolicy, S_IRUGO | S_IWUSR,
		      nvme_subsys_iopolicy_show, nvme_subsys_iopolicy_store);

static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr,
		char *buf)
{
	return sysfs_emit(buf, "%d\n", nvme_get_ns_from_dev(dev)->ana_grpid);
}
DEVICE_ATTR_RO(ana_grpid);

static ssize_t ana_state_show(struct device *dev, struct device_attribute *attr,
		char *buf)
{
	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);

	return sysfs_emit(buf, "%s\n", nvme_ana_state_names[ns->ana_state]);
}
DEVICE_ATTR_RO(ana_state);

static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl,
		struct nvme_ana_group_desc *desc, void *data)
{
	struct nvme_ana_group_desc *dst = data;

	if (desc->grpid != dst->grpid)
		return 0;

	*dst = *desc;
	return -ENXIO; /* just break out of the loop */
}

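/*
 * Hook a new namespace path up to its multipath node: resolve the ANA state
 * for its group from the cached ANA log, or trigger a re-read if the group is
 * not in the log yet.
 */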
void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid)
{
	if (nvme_ctrl_use_ana(ns->ctrl)) {
		struct nvme_ana_group_desc desc = {
			.grpid = anagrpid,
			.state = 0,
		};

		mutex_lock(&ns->ctrl->ana_lock);
		ns->ana_grpid = le32_to_cpu(anagrpid);
		nvme_parse_ana_log(ns->ctrl, &desc, nvme_lookup_ana_group_desc);
		mutex_unlock(&ns->ctrl->ana_lock);
		if (desc.state) {
			/* found the group desc: update */
			nvme_update_ns_ana_state(&desc, ns);
		} else {
			/* group desc not found: trigger a re-read */
			set_bit(NVME_NS_ANA_PENDING, &ns->flags);
			queue_work(nvme_wq, &ns->ctrl->ana_work);
		}
	} else {
		ns->ana_state = NVME_ANA_OPTIMIZED;
		nvme_mpath_set_live(ns);
	}

	if (blk_queue_stable_writes(ns->queue) && ns->head->disk)
		blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES,
				   ns->head->disk->queue);
#ifdef CONFIG_BLK_DEV_ZONED
	if (blk_queue_is_zoned(ns->queue) && ns->head->disk)
		ns->head->disk->nr_zones = ns->disk->nr_zones;
#endif
}

void nvme_mpath_shutdown_disk(struct nvme_ns_head *head)
{
	if (!head->disk)
		return;
	kblockd_schedule_work(&head->requeue_work);
	if (test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
		nvme_cdev_del(&head->cdev, &head->cdev_device);
		del_gendisk(head->disk);
	}
}

void nvme_mpath_remove_disk(struct nvme_ns_head *head)
{
	if (!head->disk)
		return;
	blk_mark_disk_dead(head->disk);
	/* make sure all pending bios are cleaned up */
	kblockd_schedule_work(&head->requeue_work);
	flush_work(&head->requeue_work);
	put_disk(head->disk);
}

void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl)
{
	mutex_init(&ctrl->ana_lock);
	timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0);
	INIT_WORK(&ctrl->ana_work, nvme_ana_work);
}

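/*
 * Validate the ANA related fields from Identify Controller and (re)allocate
 * the ANA log buffer before performing the initial ANA log read.
 */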
int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
{
	size_t max_transfer_size = ctrl->max_hw_sectors << SECTOR_SHIFT;
	size_t ana_log_size;
	int error = 0;

	/* check if multipath is enabled and we have the capability */
	if (!multipath || !ctrl->subsys ||
	    !(ctrl->subsys->cmic & NVME_CTRL_CMIC_ANA))
		return 0;

	if (!ctrl->max_namespaces ||
	    ctrl->max_namespaces > le32_to_cpu(id->nn)) {
		dev_err(ctrl->device,
			"Invalid MNAN value %u\n", ctrl->max_namespaces);
		return -EINVAL;
	}

	ctrl->anacap = id->anacap;
	ctrl->anatt = id->anatt;
	ctrl->nanagrpid = le32_to_cpu(id->nanagrpid);
	ctrl->anagrpmax = le32_to_cpu(id->anagrpmax);

	ana_log_size = sizeof(struct nvme_ana_rsp_hdr) +
		ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc) +
		ctrl->max_namespaces * sizeof(__le32);
	if (ana_log_size > max_transfer_size) {
		dev_err(ctrl->device,
			"ANA log page size (%zd) larger than MDTS (%zd).\n",
			ana_log_size, max_transfer_size);
		dev_err(ctrl->device, "disabling ANA support.\n");
		goto out_uninit;
	}
	if (ana_log_size > ctrl->ana_log_size) {
		nvme_mpath_stop(ctrl);
		nvme_mpath_uninit(ctrl);
		ctrl->ana_log_buf = kvmalloc(ana_log_size, GFP_KERNEL);
		if (!ctrl->ana_log_buf)
			return -ENOMEM;
	}
	ctrl->ana_log_size = ana_log_size;
	error = nvme_read_ana_log(ctrl);
	if (error)
		goto out_uninit;
	return 0;

out_uninit:
	nvme_mpath_uninit(ctrl);
	return error;
}

void nvme_mpath_uninit(struct nvme_ctrl *ctrl)
{
	kvfree(ctrl->ana_log_buf);
	ctrl->ana_log_buf = NULL;
	ctrl->ana_log_size = 0;
}