2019-02-18 09:36:29 +01:00
// SPDX-License-Identifier: GPL-2.0
2017-11-02 12:59:30 +01:00
/*
2018-05-14 08:48:54 +02:00
* Copyright ( c ) 2017 - 2018 Christoph Hellwig .
2017-11-02 12:59:30 +01:00
*/
2020-04-09 09:09:04 -07:00
# include <linux/backing-dev.h>
2017-11-02 12:59:30 +01:00
# include <linux/moduleparam.h>
2018-06-07 10:38:47 +02:00
# include <trace/events/block.h>
2017-11-02 12:59:30 +01:00
# include "nvme.h"
/*
 * Module parameter controlling native multipath support; it defaults to
 * true and is registered with permissions 0444.
 */
static bool multipath = true ;
2018-04-26 14:24:29 -06:00
module_param ( multipath , bool , 0444 ) ;
2017-11-02 12:59:30 +01:00
MODULE_PARM_DESC ( multipath ,
" turn on native support for multiple controllers per subsystem " ) ;
2019-07-31 11:00:26 -07:00
void nvme_mpath_unfreeze ( struct nvme_subsystem * subsys )
{
struct nvme_ns_head * h ;
lockdep_assert_held ( & subsys - > lock ) ;
list_for_each_entry ( h , & subsys - > nsheads , entry )
if ( h - > disk )
blk_mq_unfreeze_queue ( h - > disk - > queue ) ;
}
void nvme_mpath_wait_freeze ( struct nvme_subsystem * subsys )
{
struct nvme_ns_head * h ;
lockdep_assert_held ( & subsys - > lock ) ;
list_for_each_entry ( h , & subsys - > nsheads , entry )
if ( h - > disk )
blk_mq_freeze_queue_wait ( h - > disk - > queue ) ;
}
void nvme_mpath_start_freeze ( struct nvme_subsystem * subsys )
{
struct nvme_ns_head * h ;
lockdep_assert_held ( & subsys - > lock ) ;
list_for_each_entry ( h , & subsys - > nsheads , entry )
if ( h - > disk )
blk_freeze_queue_start ( h - > disk - > queue ) ;
}
2018-04-26 14:22:41 -06:00
/*
* If multipathing is enabled we need to always use the subsystem instance
* number for numbering our devices to avoid conflicts between subsystems that
* have multiple controllers and thus use the multipath - aware subsystem node
* and those that have a single controller and use the controller node
* directly .
*/
void nvme_set_disk_name ( char * disk_name , struct nvme_ns * ns ,
struct nvme_ctrl * ctrl , int * flags )
{
if ( ! multipath ) {
sprintf ( disk_name , " nvme%dn%d " , ctrl - > instance , ns - > head - > instance ) ;
} else if ( ns - > head - > disk ) {
sprintf ( disk_name , " nvme%dc%dn%d " , ctrl - > subsys - > instance ,
2019-05-03 15:37:35 +02:00
ctrl - > instance , ns - > head - > instance ) ;
2018-04-26 14:22:41 -06:00
* flags = GENHD_FL_HIDDEN ;
} else {
sprintf ( disk_name , " nvme%dn%d " , ctrl - > subsys - > instance ,
ns - > head - > instance ) ;
}
}
2020-08-18 09:11:30 +02:00
void nvme_failover_req ( struct request * req )
2017-11-02 12:59:30 +01:00
{
struct nvme_ns * ns = req - > q - > queuedata ;
2020-08-18 09:11:30 +02:00
u16 status = nvme_req ( req ) - > status & 0x7ff ;
2017-11-02 12:59:30 +01:00
unsigned long flags ;
2020-08-18 09:11:30 +02:00
nvme_mpath_clear_current_path ( ns ) ;
/*
* If we got back an ANA error , we know the controller is alive but not
* ready to serve this namespace . Kick of a re - read of the ANA
* information page , and just try any other available path for now .
*/
if ( nvme_is_ana_error ( status ) & & ns - > ctrl - > ana_log_buf ) {
set_bit ( NVME_NS_ANA_PENDING , & ns - > flags ) ;
queue_work ( nvme_wq , & ns - > ctrl - > ana_work ) ;
2018-05-14 08:48:54 +02:00
}
2020-02-20 10:05:38 +09:00
spin_lock_irqsave ( & ns - > head - > requeue_lock , flags ) ;
blk_steal_bios ( & ns - > head - > requeue_list , req ) ;
spin_unlock_irqrestore ( & ns - > head - > requeue_lock , flags ) ;
2020-08-18 09:11:30 +02:00
blk_mq_end_request ( req , 0 ) ;
2017-11-02 12:59:30 +01:00
kblockd_schedule_work ( & ns - > head - > requeue_work ) ;
}
void nvme_kick_requeue_lists ( struct nvme_ctrl * ctrl )
{
struct nvme_ns * ns ;
2018-02-12 20:54:46 +08:00
down_read ( & ctrl - > namespaces_rwsem ) ;
2017-11-02 12:59:30 +01:00
list_for_each_entry ( ns , & ctrl - > namespaces , list ) {
if ( ns - > head - > disk )
kblockd_schedule_work ( & ns - > head - > requeue_work ) ;
}
2018-02-12 20:54:46 +08:00
up_read ( & ctrl - > namespaces_rwsem ) ;
2017-11-02 12:59:30 +01:00
}
2018-05-14 08:48:54 +02:00
/*
 * Printable names for the ANA states, indexed by state value.  Index 0 is not
 * a valid state.  Both the strings and the pointer table are read-only.
 */
static const char * const nvme_ana_state_names[] = {
	[0]				= " invalid state ",
	[NVME_ANA_OPTIMIZED]		= " optimized ",
	[NVME_ANA_NONOPTIMIZED]		= " non-optimized ",
	[NVME_ANA_INACCESSIBLE]		= " inaccessible ",
	[NVME_ANA_PERSISTENT_LOSS]	= " persistent-loss ",
	[NVME_ANA_CHANGE]		= " change ",
};
2019-07-25 11:56:57 -07:00
bool nvme_mpath_clear_current_path ( struct nvme_ns * ns )
2017-11-02 12:59:30 +01:00
{
2018-09-11 09:51:29 +02:00
struct nvme_ns_head * head = ns - > head ;
2019-07-25 11:56:57 -07:00
bool changed = false ;
2018-09-11 09:51:29 +02:00
int node ;
if ( ! head )
2019-07-25 11:56:57 -07:00
goto out ;
2018-09-11 09:51:29 +02:00
for_each_node ( node ) {
2019-07-25 11:56:57 -07:00
if ( ns = = rcu_access_pointer ( head - > current_path [ node ] ) ) {
2018-09-11 09:51:29 +02:00
rcu_assign_pointer ( head - > current_path [ node ] , NULL ) ;
2019-07-25 11:56:57 -07:00
changed = true ;
}
2018-09-11 09:51:29 +02:00
}
2019-07-25 11:56:57 -07:00
out :
return changed ;
}
void nvme_mpath_clear_ctrl_paths ( struct nvme_ctrl * ctrl )
{
struct nvme_ns * ns ;
mutex_lock ( & ctrl - > scan_lock ) ;
2019-11-01 17:27:55 -07:00
down_read ( & ctrl - > namespaces_rwsem ) ;
2019-07-25 11:56:57 -07:00
list_for_each_entry ( ns , & ctrl - > namespaces , list )
if ( nvme_mpath_clear_current_path ( ns ) )
kblockd_schedule_work ( & ns - > head - > requeue_work ) ;
2019-11-01 17:27:55 -07:00
up_read ( & ctrl - > namespaces_rwsem ) ;
2019-07-25 11:56:57 -07:00
mutex_unlock ( & ctrl - > scan_lock ) ;
2018-09-11 09:51:29 +02:00
}
2019-07-04 08:10:46 +02:00
static bool nvme_path_is_disabled ( struct nvme_ns * ns )
{
2020-07-22 16:32:19 -07:00
/*
* We don ' t treat NVME_CTRL_DELETING as a disabled path as I / O should
* still be able to complete assuming that the controller is connected .
* Otherwise it will fail immediately and return to the requeue list .
*/
if ( ns - > ctrl - > state ! = NVME_CTRL_LIVE & &
ns - > ctrl - > state ! = NVME_CTRL_DELETING )
return true ;
if ( test_bit ( NVME_NS_ANA_PENDING , & ns - > flags ) | |
test_bit ( NVME_NS_REMOVING , & ns - > flags ) )
return true ;
return false ;
2019-07-04 08:10:46 +02:00
}
2018-09-11 09:51:29 +02:00
static struct nvme_ns * __nvme_find_path ( struct nvme_ns_head * head , int node )
{
int found_distance = INT_MAX , fallback_distance = INT_MAX , distance ;
struct nvme_ns * found = NULL , * fallback = NULL , * ns ;
2017-11-02 12:59:30 +01:00
list_for_each_entry_rcu ( ns , & head - > list , siblings ) {
2019-07-04 08:10:46 +02:00
if ( nvme_path_is_disabled ( ns ) )
2018-05-14 08:48:54 +02:00
continue ;
2018-09-11 09:51:29 +02:00
2019-02-18 11:43:26 +01:00
if ( READ_ONCE ( head - > subsys - > iopolicy ) = = NVME_IOPOLICY_NUMA )
distance = node_distance ( node , ns - > ctrl - > numa_node ) ;
else
distance = LOCAL_DISTANCE ;
2018-09-11 09:51:29 +02:00
2018-05-14 08:48:54 +02:00
switch ( ns - > ana_state ) {
case NVME_ANA_OPTIMIZED :
2018-09-11 09:51:29 +02:00
if ( distance < found_distance ) {
found_distance = distance ;
found = ns ;
}
break ;
2018-05-14 08:48:54 +02:00
case NVME_ANA_NONOPTIMIZED :
2018-09-11 09:51:29 +02:00
if ( distance < fallback_distance ) {
fallback_distance = distance ;
fallback = ns ;
}
2018-05-14 08:48:54 +02:00
break ;
default :
break ;
2017-11-02 12:59:30 +01:00
}
}
2018-09-11 09:51:29 +02:00
if ( ! found )
found = fallback ;
if ( found )
rcu_assign_pointer ( head - > current_path [ node ] , found ) ;
return found ;
2018-05-14 08:48:54 +02:00
}
2019-02-18 11:43:26 +01:00
static struct nvme_ns * nvme_next_ns ( struct nvme_ns_head * head ,
struct nvme_ns * ns )
{
ns = list_next_or_null_rcu ( & head - > list , & ns - > siblings , struct nvme_ns ,
siblings ) ;
if ( ns )
return ns ;
return list_first_or_null_rcu ( & head - > list , struct nvme_ns , siblings ) ;
}
static struct nvme_ns * nvme_round_robin_path ( struct nvme_ns_head * head ,
int node , struct nvme_ns * old )
{
2020-08-06 15:19:32 +02:00
struct nvme_ns * ns , * found = NULL ;
2019-02-18 11:43:26 +01:00
2019-07-04 08:10:46 +02:00
if ( list_is_singular ( & head - > list ) ) {
if ( nvme_path_is_disabled ( old ) )
return NULL ;
2019-02-18 11:43:26 +01:00
return old ;
2019-07-04 08:10:46 +02:00
}
2019-02-18 11:43:26 +01:00
for ( ns = nvme_next_ns ( head , old ) ;
ns ! = old ;
ns = nvme_next_ns ( head , ns ) ) {
2019-07-04 08:10:46 +02:00
if ( nvme_path_is_disabled ( ns ) )
2019-02-18 11:43:26 +01:00
continue ;
if ( ns - > ana_state = = NVME_ANA_OPTIMIZED ) {
found = ns ;
goto out ;
}
if ( ns - > ana_state = = NVME_ANA_NONOPTIMIZED )
2020-08-06 15:19:32 +02:00
found = ns ;
2019-02-18 11:43:26 +01:00
}
2020-08-06 15:19:31 +02:00
/*
* The loop above skips the current path for round - robin semantics .
* Fall back to the current path if either :
* - no other optimized path found and current is optimized ,
* - no other usable path found and current is usable .
*/
2020-07-27 18:08:02 +02:00
if ( ! nvme_path_is_disabled ( old ) & &
2020-08-06 15:19:31 +02:00
( old - > ana_state = = NVME_ANA_OPTIMIZED | |
2020-08-06 15:19:32 +02:00
( ! found & & old - > ana_state = = NVME_ANA_NONOPTIMIZED ) ) )
2020-08-06 15:19:31 +02:00
return old ;
2020-08-06 15:19:32 +02:00
if ( ! found )
2019-02-18 11:43:26 +01:00
return NULL ;
out :
rcu_assign_pointer ( head - > current_path [ node ] , found ) ;
return found ;
}
2018-05-14 08:48:54 +02:00
static inline bool nvme_path_is_optimized ( struct nvme_ns * ns )
{
return ns - > ctrl - > state = = NVME_CTRL_LIVE & &
ns - > ana_state = = NVME_ANA_OPTIMIZED ;
2017-11-02 12:59:30 +01:00
}
inline struct nvme_ns * nvme_find_path ( struct nvme_ns_head * head )
{
2018-09-11 09:51:29 +02:00
int node = numa_node_id ( ) ;
struct nvme_ns * ns ;
2017-11-02 12:59:30 +01:00
2018-09-11 09:51:29 +02:00
ns = srcu_dereference ( head - > current_path [ node ] , & head - > srcu ) ;
2020-07-27 18:08:03 +02:00
if ( unlikely ( ! ns ) )
return __nvme_find_path ( head , node ) ;
if ( READ_ONCE ( head - > subsys - > iopolicy ) = = NVME_IOPOLICY_RR )
return nvme_round_robin_path ( head , node , ns ) ;
if ( unlikely ( ! nvme_path_is_optimized ( ns ) ) )
return __nvme_find_path ( head , node ) ;
2017-11-02 12:59:30 +01:00
return ns ;
}
2019-07-25 11:56:57 -07:00
static bool nvme_available_path ( struct nvme_ns_head * head )
{
struct nvme_ns * ns ;
list_for_each_entry_rcu ( ns , & head - > list , siblings ) {
switch ( ns - > ctrl - > state ) {
case NVME_CTRL_LIVE :
case NVME_CTRL_RESETTING :
case NVME_CTRL_CONNECTING :
/* fallthru */
return true ;
default :
break ;
}
}
return false ;
}
2020-07-01 10:59:43 +02:00
blk_qc_t nvme_ns_head_submit_bio ( struct bio * bio )
2017-11-02 12:59:30 +01:00
{
2020-03-29 19:41:38 +02:00
struct nvme_ns_head * head = bio - > bi_disk - > private_data ;
2017-11-02 12:59:30 +01:00
struct device * dev = disk_to_dev ( head - > disk ) ;
struct nvme_ns * ns ;
blk_qc_t ret = BLK_QC_T_NONE ;
int srcu_idx ;
2019-04-30 18:57:09 +02:00
/*
2020-07-01 10:59:39 +02:00
* The namespace might be going away and the bio might be moved to a
* different queue via blk_steal_bios ( ) , so we need to use the bio_split
* pool from the original queue to allocate the bvecs from .
2019-04-30 18:57:09 +02:00
*/
2020-07-01 10:59:39 +02:00
blk_queue_split ( & bio ) ;
2019-04-30 18:57:09 +02:00
2017-11-02 12:59:30 +01:00
srcu_idx = srcu_read_lock ( & head - > srcu ) ;
ns = nvme_find_path ( head ) ;
if ( likely ( ns ) ) {
bio - > bi_disk = ns - > disk ;
bio - > bi_opf | = REQ_NVME_MPATH ;
2018-06-07 10:38:47 +02:00
trace_block_bio_remap ( bio - > bi_disk - > queue , bio ,
disk_devt ( ns - > head - > disk ) ,
bio - > bi_iter . bi_sector ) ;
2020-07-01 10:59:47 +02:00
ret = submit_bio_noacct ( bio ) ;
2019-07-25 11:56:57 -07:00
} else if ( nvme_available_path ( head ) ) {
dev_warn_ratelimited ( dev , " no usable path - requeuing I/O \n " ) ;
2017-11-02 12:59:30 +01:00
spin_lock_irq ( & head - > requeue_lock ) ;
bio_list_add ( & head - > requeue_list , bio ) ;
spin_unlock_irq ( & head - > requeue_lock ) ;
} else {
2019-07-25 11:56:57 -07:00
dev_warn_ratelimited ( dev , " no available path - failing I/O \n " ) ;
2017-11-02 12:59:30 +01:00
bio - > bi_status = BLK_STS_IOERR ;
bio_endio ( bio ) ;
}
srcu_read_unlock ( & head - > srcu , srcu_idx ) ;
return ret ;
}
static void nvme_requeue_work ( struct work_struct * work )
{
struct nvme_ns_head * head =
container_of ( work , struct nvme_ns_head , requeue_work ) ;
struct bio * bio , * next ;
spin_lock_irq ( & head - > requeue_lock ) ;
next = bio_list_get ( & head - > requeue_list ) ;
spin_unlock_irq ( & head - > requeue_lock ) ;
while ( ( bio = next ) ! = NULL ) {
next = bio - > bi_next ;
bio - > bi_next = NULL ;
/*
* Reset disk to the mpath node and resubmit to select a new
* path .
*/
bio - > bi_disk = head - > disk ;
2020-07-01 10:59:44 +02:00
submit_bio_noacct ( bio ) ;
2017-11-02 12:59:30 +01:00
}
}
int nvme_mpath_alloc_disk ( struct nvme_ctrl * ctrl , struct nvme_ns_head * head )
{
struct request_queue * q ;
bool vwc = false ;
2018-05-14 08:48:54 +02:00
mutex_init ( & head - > lock ) ;
2017-11-02 12:59:30 +01:00
bio_list_init ( & head - > requeue_list ) ;
spin_lock_init ( & head - > requeue_lock ) ;
INIT_WORK ( & head - > requeue_work , nvme_requeue_work ) ;
/*
* Add a multipath node if the subsystems supports multiple controllers .
* We also do this for private namespaces as the namespace sharing data could
* change after a rescan .
*/
2020-04-03 10:53:46 -07:00
if ( ! ( ctrl - > subsys - > cmic & NVME_CTRL_CMIC_MULTI_CTRL ) | | ! multipath )
2017-11-02 12:59:30 +01:00
return 0 ;
2020-07-01 10:59:43 +02:00
q = blk_alloc_queue ( ctrl - > numa_node ) ;
2017-11-02 12:59:30 +01:00
if ( ! q )
goto out ;
2018-03-07 17:10:10 -08:00
blk_queue_flag_set ( QUEUE_FLAG_NONROT , q ) ;
2017-11-02 12:59:30 +01:00
/* set to a default value for 512 until disk is validated */
blk_queue_logical_block_size ( q , 512 ) ;
2018-11-02 11:22:13 -07:00
blk_set_stacking_limits ( & q - > limits ) ;
2017-11-02 12:59:30 +01:00
/* we need to propagate up the VMC settings */
if ( ctrl - > vwc & NVME_CTRL_VWC_PRESENT )
vwc = true ;
blk_queue_write_cache ( q , vwc , vwc ) ;
head - > disk = alloc_disk ( 0 ) ;
if ( ! head - > disk )
goto out_cleanup_queue ;
head - > disk - > fops = & nvme_ns_head_ops ;
head - > disk - > private_data = head ;
head - > disk - > queue = q ;
head - > disk - > flags = GENHD_FL_EXT_DEVT ;
sprintf ( head - > disk - > disk_name , " nvme%dn%d " ,
ctrl - > subsys - > instance , head - > instance ) ;
return 0 ;
out_cleanup_queue :
blk_cleanup_queue ( q ) ;
out :
return - ENOMEM ;
}
2018-05-14 08:48:54 +02:00
static void nvme_mpath_set_live ( struct nvme_ns * ns )
2017-11-02 12:59:30 +01:00
{
2018-05-14 08:48:54 +02:00
struct nvme_ns_head * head = ns - > head ;
2017-11-02 12:59:30 +01:00
if ( ! head - > disk )
return ;
2018-02-28 16:06:04 +09:00
2020-06-24 01:53:11 -07:00
if ( ! test_and_set_bit ( NVME_NSHEAD_DISK_LIVE , & head - > flags ) )
2018-09-28 08:17:20 +02:00
device_add_disk ( & head - > subsys - > dev , head - > disk ,
nvme_ns_id_attr_groups ) ;
2018-05-14 08:48:54 +02:00
2020-06-24 01:53:11 -07:00
mutex_lock ( & head - > lock ) ;
2018-10-05 09:49:37 -06:00
if ( nvme_path_is_optimized ( ns ) ) {
int node , srcu_idx ;
srcu_idx = srcu_read_lock ( & head - > srcu ) ;
for_each_node ( node )
__nvme_find_path ( head , node ) ;
srcu_read_unlock ( & head - > srcu , srcu_idx ) ;
}
2020-06-24 01:53:10 -07:00
mutex_unlock ( & head - > lock ) ;
2018-10-05 09:49:37 -06:00
2020-06-24 01:53:10 -07:00
synchronize_srcu ( & head - > srcu ) ;
kblockd_schedule_work ( & head - > requeue_work ) ;
2018-05-14 08:48:54 +02:00
}
static int nvme_parse_ana_log ( struct nvme_ctrl * ctrl , void * data ,
int ( * cb ) ( struct nvme_ctrl * ctrl , struct nvme_ana_group_desc * ,
void * ) )
{
void * base = ctrl - > ana_log_buf ;
size_t offset = sizeof ( struct nvme_ana_rsp_hdr ) ;
int error , i ;
lockdep_assert_held ( & ctrl - > ana_lock ) ;
for ( i = 0 ; i < le16_to_cpu ( ctrl - > ana_log_buf - > ngrps ) ; i + + ) {
struct nvme_ana_group_desc * desc = base + offset ;
2019-10-28 16:56:48 -06:00
u32 nr_nsids ;
size_t nsid_buf_size ;
if ( WARN_ON_ONCE ( offset > ctrl - > ana_log_size - sizeof ( * desc ) ) )
return - EINVAL ;
nr_nsids = le32_to_cpu ( desc - > nnsids ) ;
nsid_buf_size = nr_nsids * sizeof ( __le32 ) ;
2018-05-14 08:48:54 +02:00
if ( WARN_ON_ONCE ( desc - > grpid = = 0 ) )
return - EINVAL ;
if ( WARN_ON_ONCE ( le32_to_cpu ( desc - > grpid ) > ctrl - > anagrpmax ) )
return - EINVAL ;
if ( WARN_ON_ONCE ( desc - > state = = 0 ) )
return - EINVAL ;
if ( WARN_ON_ONCE ( desc - > state > NVME_ANA_CHANGE ) )
return - EINVAL ;
offset + = sizeof ( * desc ) ;
if ( WARN_ON_ONCE ( offset > ctrl - > ana_log_size - nsid_buf_size ) )
return - EINVAL ;
error = cb ( ctrl , desc , data ) ;
if ( error )
return error ;
offset + = nsid_buf_size ;
}
return 0 ;
}
static inline bool nvme_state_is_live ( enum nvme_ana_state state )
{
return state = = NVME_ANA_OPTIMIZED | | state = = NVME_ANA_NONOPTIMIZED ;
}
static void nvme_update_ns_ana_state ( struct nvme_ana_group_desc * desc ,
struct nvme_ns * ns )
{
ns - > ana_grpid = le32_to_cpu ( desc - > grpid ) ;
ns - > ana_state = desc - > state ;
clear_bit ( NVME_NS_ANA_PENDING , & ns - > flags ) ;
2019-03-27 09:52:56 +01:00
if ( nvme_state_is_live ( ns - > ana_state ) )
2018-05-14 08:48:54 +02:00
nvme_mpath_set_live ( ns ) ;
}
static int nvme_update_ana_state ( struct nvme_ctrl * ctrl ,
struct nvme_ana_group_desc * desc , void * data )
{
u32 nr_nsids = le32_to_cpu ( desc - > nnsids ) , n = 0 ;
unsigned * nr_change_groups = data ;
struct nvme_ns * ns ;
2019-04-28 20:24:42 -07:00
dev_dbg ( ctrl - > device , " ANA group %d: %s. \n " ,
2018-05-14 08:48:54 +02:00
le32_to_cpu ( desc - > grpid ) ,
nvme_ana_state_names [ desc - > state ] ) ;
if ( desc - > state = = NVME_ANA_CHANGE )
( * nr_change_groups ) + + ;
if ( ! nr_nsids )
return 0 ;
2020-04-02 09:34:54 -07:00
down_read ( & ctrl - > namespaces_rwsem ) ;
2018-05-14 08:48:54 +02:00
list_for_each_entry ( ns , & ctrl - > namespaces , list ) {
nvme-multipath: fix ana log nsid lookup when nsid is not found
ANA log parsing invokes nvme_update_ana_state() per ANA group desc.
This updates the state of namespaces with nsids in desc->nsids[].
Both ctrl->namespaces list and desc->nsids[] array are sorted by nsid.
Hence nvme_update_ana_state() performs a single walk over ctrl->namespaces:
- if current namespace matches the current desc->nsids[n],
this namespace is updated, and n is incremented.
- the process stops when it encounters the end of either
ctrl->namespaces end or desc->nsids[]
In case desc->nsids[n] does not match any of ctrl->namespaces,
the remaining nsids following desc->nsids[n] will not be updated.
Such situation was considered abnormal and generated WARN_ON_ONCE.
However ANA log MAY contain nsids not (yet) found in ctrl->namespaces.
For example, lets consider the following scenario:
- nvme0 exposes namespaces with nsids = [2, 3] to the host
- a new namespace nsid = 1 is added dynamically
- also, a ANA topology change is triggered
- NS_CHANGED aen is generated and triggers scan_work
- before scan_work discovers nsid=1 and creates a namespace, a NOTICE_ANA
aen was issues and ana_work receives ANA log with nsids=[1, 2, 3]
Result: ana_work fails to update ANA state on existing namespaces [2, 3]
Solution:
Change the way nvme_update_ana_state() namespace list walk
checks the current namespace against desc->nsids[n] as follows:
a) ns->head->ns_id < desc->nsids[n]: keep walking ctrl->namespaces.
b) ns->head->ns_id == desc->nsids[n]: match, update the namespace
c) ns->head->ns_id >= desc->nsids[n]: skip to desc->nsids[n+1]
This enables correct operation in the scenario described above.
This also allows ANA log to contain nsids currently invisible
to the host, i.e. inactive nsids.
Signed-off-by: Anton Eidelman <anton@lightbitslabs.com>
Reviewed-by: James Smart <james.smart@broadcom.com>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
2019-08-16 13:00:10 -07:00
unsigned nsid = le32_to_cpu ( desc - > nsids [ n ] ) ;
if ( ns - > head - > ns_id < nsid )
2018-05-14 08:48:54 +02:00
continue ;
nvme-multipath: fix ana log nsid lookup when nsid is not found
ANA log parsing invokes nvme_update_ana_state() per ANA group desc.
This updates the state of namespaces with nsids in desc->nsids[].
Both ctrl->namespaces list and desc->nsids[] array are sorted by nsid.
Hence nvme_update_ana_state() performs a single walk over ctrl->namespaces:
- if current namespace matches the current desc->nsids[n],
this namespace is updated, and n is incremented.
- the process stops when it encounters the end of either
ctrl->namespaces end or desc->nsids[]
In case desc->nsids[n] does not match any of ctrl->namespaces,
the remaining nsids following desc->nsids[n] will not be updated.
Such situation was considered abnormal and generated WARN_ON_ONCE.
However ANA log MAY contain nsids not (yet) found in ctrl->namespaces.
For example, lets consider the following scenario:
- nvme0 exposes namespaces with nsids = [2, 3] to the host
- a new namespace nsid = 1 is added dynamically
- also, a ANA topology change is triggered
- NS_CHANGED aen is generated and triggers scan_work
- before scan_work discovers nsid=1 and creates a namespace, a NOTICE_ANA
aen was issues and ana_work receives ANA log with nsids=[1, 2, 3]
Result: ana_work fails to update ANA state on existing namespaces [2, 3]
Solution:
Change the way nvme_update_ana_state() namespace list walk
checks the current namespace against desc->nsids[n] as follows:
a) ns->head->ns_id < desc->nsids[n]: keep walking ctrl->namespaces.
b) ns->head->ns_id == desc->nsids[n]: match, update the namespace
c) ns->head->ns_id >= desc->nsids[n]: skip to desc->nsids[n+1]
This enables correct operation in the scenario described above.
This also allows ANA log to contain nsids currently invisible
to the host, i.e. inactive nsids.
Signed-off-by: Anton Eidelman <anton@lightbitslabs.com>
Reviewed-by: James Smart <james.smart@broadcom.com>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
2019-08-16 13:00:10 -07:00
if ( ns - > head - > ns_id = = nsid )
nvme_update_ns_ana_state ( desc , ns ) ;
2018-05-14 08:48:54 +02:00
if ( + + n = = nr_nsids )
break ;
}
2020-04-02 09:34:54 -07:00
up_read ( & ctrl - > namespaces_rwsem ) ;
2018-05-14 08:48:54 +02:00
return 0 ;
}
2019-10-18 11:32:51 -07:00
static int nvme_read_ana_log ( struct nvme_ctrl * ctrl )
2018-05-14 08:48:54 +02:00
{
u32 nr_change_groups = 0 ;
int error ;
mutex_lock ( & ctrl - > ana_lock ) ;
2020-06-29 12:06:40 -07:00
error = nvme_get_log ( ctrl , NVME_NSID_ALL , NVME_LOG_ANA , 0 , NVME_CSI_NVM ,
2018-05-14 08:48:54 +02:00
ctrl - > ana_log_buf , ctrl - > ana_log_size , 0 ) ;
if ( error ) {
dev_warn ( ctrl - > device , " Failed to get ANA log: %d \n " , error ) ;
goto out_unlock ;
}
error = nvme_parse_ana_log ( ctrl , & nr_change_groups ,
nvme_update_ana_state ) ;
if ( error )
goto out_unlock ;
/*
* In theory we should have an ANATT timer per group as they might enter
* the change state at different times . But that is a lot of overhead
* just to protect against a target that keeps entering new changes
* states while never finishing previous ones . But we ' ll still
* eventually time out once all groups are in change state , so this
* isn ' t a big deal .
*
* We also double the ANATT value to provide some slack for transports
* or AEN processing overhead .
*/
if ( nr_change_groups )
mod_timer ( & ctrl - > anatt_timer , ctrl - > anatt * HZ * 2 + jiffies ) ;
else
del_timer_sync ( & ctrl - > anatt_timer ) ;
out_unlock :
mutex_unlock ( & ctrl - > ana_lock ) ;
return error ;
}
static void nvme_ana_work ( struct work_struct * work )
{
struct nvme_ctrl * ctrl = container_of ( work , struct nvme_ctrl , ana_work ) ;
2020-07-22 16:32:19 -07:00
if ( ctrl - > state ! = NVME_CTRL_LIVE )
return ;
2019-10-18 11:32:51 -07:00
nvme_read_ana_log ( ctrl ) ;
2018-05-14 08:48:54 +02:00
}
static void nvme_anatt_timeout ( struct timer_list * t )
{
struct nvme_ctrl * ctrl = from_timer ( ctrl , t , anatt_timer ) ;
dev_info ( ctrl - > device , " ANATT timeout, resetting controller. \n " ) ;
nvme_reset_ctrl ( ctrl ) ;
}
void nvme_mpath_stop ( struct nvme_ctrl * ctrl )
{
if ( ! nvme_ctrl_use_ana ( ctrl ) )
return ;
del_timer_sync ( & ctrl - > anatt_timer ) ;
cancel_work_sync ( & ctrl - > ana_work ) ;
}
2019-02-18 11:43:26 +01:00
/*
 * Define a struct device_attribute named subsys_attr_<_name>, initialized
 * through __ATTR() with the given mode and show/store callbacks.
 */
# define SUBSYS_ATTR_RW(_name, _mode, _show, _store) \
struct device_attribute subsys_attr_ # # _name = \
__ATTR ( _name , _mode , _show , _store )
/*
 * Names of the I/O policies, indexed by policy value.  Both the strings and
 * the pointer table are read-only.
 */
static const char * const nvme_iopolicy_names[] = {
	[NVME_IOPOLICY_NUMA]	= " numa ",
	[NVME_IOPOLICY_RR]	= " round-robin ",
};
static ssize_t nvme_subsys_iopolicy_show ( struct device * dev ,
struct device_attribute * attr , char * buf )
{
struct nvme_subsystem * subsys =
container_of ( dev , struct nvme_subsystem , dev ) ;
return sprintf ( buf , " %s \n " ,
nvme_iopolicy_names [ READ_ONCE ( subsys - > iopolicy ) ] ) ;
}
/*
 * sysfs store handler: select the I/O policy whose name matches @buf.
 *
 * Returns @count on success, or -EINVAL if @buf names no known policy.
 */
static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct nvme_subsystem *subsys;
	int policy;

	subsys = container_of(dev, struct nvme_subsystem, dev);

	for (policy = 0; policy < ARRAY_SIZE(nvme_iopolicy_names); policy++) {
		if (!sysfs_streq(buf, nvme_iopolicy_names[policy]))
			continue;
		WRITE_ONCE(subsys->iopolicy, policy);
		return count;
	}

	return -EINVAL;
}
/*
 * Define subsys_attr_iopolicy with mode S_IRUGO | S_IWUSR, backed by
 * nvme_subsys_iopolicy_show() and nvme_subsys_iopolicy_store().
 */
SUBSYS_ATTR_RW ( iopolicy , S_IRUGO | S_IWUSR ,
nvme_subsys_iopolicy_show , nvme_subsys_iopolicy_store ) ;
2018-05-14 08:48:54 +02:00
static ssize_t ana_grpid_show ( struct device * dev , struct device_attribute * attr ,
char * buf )
{
return sprintf ( buf , " %d \n " , nvme_get_ns_from_dev ( dev ) - > ana_grpid ) ;
}
DEVICE_ATTR_RO ( ana_grpid ) ;
static ssize_t ana_state_show ( struct device * dev , struct device_attribute * attr ,
char * buf )
{
struct nvme_ns * ns = nvme_get_ns_from_dev ( dev ) ;
return sprintf ( buf , " %s \n " , nvme_ana_state_names [ ns - > ana_state ] ) ;
}
DEVICE_ATTR_RO ( ana_state ) ;
2020-06-24 01:53:09 -07:00
static int nvme_lookup_ana_group_desc ( struct nvme_ctrl * ctrl ,
2018-05-14 08:48:54 +02:00
struct nvme_ana_group_desc * desc , void * data )
{
2020-06-24 01:53:09 -07:00
struct nvme_ana_group_desc * dst = data ;
2018-05-14 08:48:54 +02:00
2020-06-24 01:53:09 -07:00
if ( desc - > grpid ! = dst - > grpid )
return 0 ;
2018-05-14 08:48:54 +02:00
2020-06-24 01:53:09 -07:00
* dst = * desc ;
return - ENXIO ; /* just break out of the loop */
2018-05-14 08:48:54 +02:00
}
void nvme_mpath_add_disk ( struct nvme_ns * ns , struct nvme_id_ns * id )
{
if ( nvme_ctrl_use_ana ( ns - > ctrl ) ) {
2020-06-24 01:53:09 -07:00
struct nvme_ana_group_desc desc = {
. grpid = id - > anagrpid ,
. state = 0 ,
} ;
2018-05-14 08:48:54 +02:00
mutex_lock ( & ns - > ctrl - > ana_lock ) ;
ns - > ana_grpid = le32_to_cpu ( id - > anagrpid ) ;
2020-06-24 01:53:09 -07:00
nvme_parse_ana_log ( ns - > ctrl , & desc , nvme_lookup_ana_group_desc ) ;
2018-05-14 08:48:54 +02:00
mutex_unlock ( & ns - > ctrl - > ana_lock ) ;
2020-06-24 01:53:09 -07:00
if ( desc . state ) {
/* found the group desc: update */
nvme_update_ns_ana_state ( & desc , ns ) ;
}
2018-05-14 08:48:54 +02:00
} else {
ns - > ana_state = NVME_ANA_OPTIMIZED ;
nvme_mpath_set_live ( ns ) ;
2018-02-28 16:06:04 +09:00
}
2020-04-09 09:09:04 -07:00
if ( bdi_cap_stable_pages_required ( ns - > queue - > backing_dev_info ) ) {
2020-06-29 16:30:19 +02:00
struct gendisk * disk = ns - > head - > disk ;
2020-04-09 09:09:04 -07:00
2020-06-29 16:30:19 +02:00
if ( disk )
disk - > queue - > backing_dev_info - > capabilities | =
BDI_CAP_STABLE_WRITES ;
2020-04-09 09:09:04 -07:00
}
2017-11-02 12:59:30 +01:00
}
void nvme_mpath_remove_disk ( struct nvme_ns_head * head )
{
if ( ! head - > disk )
return ;
2018-09-28 08:17:20 +02:00
if ( head - > disk - > flags & GENHD_FL_UP )
2018-05-14 08:48:54 +02:00
del_gendisk ( head - > disk ) ;
2017-11-02 12:59:30 +01:00
blk_set_queue_dying ( head - > disk - > queue ) ;
/* make sure all pending bios are cleaned up */
kblockd_schedule_work ( & head - > requeue_work ) ;
flush_work ( & head - > requeue_work ) ;
blk_cleanup_queue ( head - > disk - > queue ) ;
2020-06-24 01:53:12 -07:00
if ( ! test_bit ( NVME_NSHEAD_DISK_LIVE , & head - > flags ) ) {
/*
* if device_add_disk wasn ' t called , prevent
* disk release to put a bogus reference on the
* request queue
*/
head - > disk - > queue = NULL ;
}
2017-11-02 12:59:30 +01:00
put_disk ( head - > disk ) ;
}
2018-05-14 08:48:54 +02:00
int nvme_mpath_init ( struct nvme_ctrl * ctrl , struct nvme_id_ctrl * id )
{
int error ;
2019-07-23 07:41:20 +02:00
/* check if multipath is enabled and we have the capability */
2020-04-03 10:53:46 -07:00
if ( ! multipath | | ! ctrl - > subsys | |
! ( ctrl - > subsys - > cmic & NVME_CTRL_CMIC_ANA ) )
2018-05-14 08:48:54 +02:00
return 0 ;
ctrl - > anacap = id - > anacap ;
ctrl - > anatt = id - > anatt ;
ctrl - > nanagrpid = le32_to_cpu ( id - > nanagrpid ) ;
ctrl - > anagrpmax = le32_to_cpu ( id - > anagrpmax ) ;
mutex_init ( & ctrl - > ana_lock ) ;
timer_setup ( & ctrl - > anatt_timer , nvme_anatt_timeout , 0 ) ;
ctrl - > ana_log_size = sizeof ( struct nvme_ana_rsp_hdr ) +
ctrl - > nanagrpid * sizeof ( struct nvme_ana_group_desc ) ;
2019-01-09 09:45:15 +01:00
ctrl - > ana_log_size + = ctrl - > max_namespaces * sizeof ( __le32 ) ;
2018-05-14 08:48:54 +02:00
if ( ctrl - > ana_log_size > ctrl - > max_hw_sectors < < SECTOR_SHIFT ) {
dev_err ( ctrl - > device ,
" ANA log page size (%zd) larger than MDTS (%d). \n " ,
ctrl - > ana_log_size ,
ctrl - > max_hw_sectors < < SECTOR_SHIFT ) ;
dev_err ( ctrl - > device , " disabling ANA support. \n " ) ;
return 0 ;
}
INIT_WORK ( & ctrl - > ana_work , nvme_ana_work ) ;
2020-02-20 13:29:53 -07:00
kfree ( ctrl - > ana_log_buf ) ;
2018-05-14 08:48:54 +02:00
ctrl - > ana_log_buf = kmalloc ( ctrl - > ana_log_size , GFP_KERNEL ) ;
2018-09-25 12:29:15 -07:00
if ( ! ctrl - > ana_log_buf ) {
error = - ENOMEM ;
2018-05-14 08:48:54 +02:00
goto out ;
2018-09-25 12:29:15 -07:00
}
2018-05-14 08:48:54 +02:00
2019-10-18 11:32:51 -07:00
error = nvme_read_ana_log ( ctrl ) ;
2018-05-14 08:48:54 +02:00
if ( error )
goto out_free_ana_log_buf ;
return 0 ;
out_free_ana_log_buf :
kfree ( ctrl - > ana_log_buf ) ;
2019-01-08 12:46:58 +01:00
ctrl - > ana_log_buf = NULL ;
2018-05-14 08:48:54 +02:00
out :
2018-09-25 12:29:15 -07:00
return error ;
2018-05-14 08:48:54 +02:00
}
void nvme_mpath_uninit ( struct nvme_ctrl * ctrl )
{
kfree ( ctrl - > ana_log_buf ) ;
2019-01-08 12:46:58 +01:00
ctrl - > ana_log_buf = NULL ;
2018-05-14 08:48:54 +02:00
}