2019-02-18 11:36:08 +03:00
// SPDX-License-Identifier: GPL-2.0
2011-01-20 20:50:14 +03:00
/*
* NVM Express device driver
2014-03-24 18:11:22 +04:00
* Copyright ( c ) 2011 - 2014 , Intel Corporation .
2011-01-20 20:50:14 +03:00
*/
2015-12-08 01:30:31 +03:00
# include <linux/aer.h>
2018-04-27 22:42:52 +03:00
# include <linux/async.h>
2011-01-20 20:50:14 +03:00
# include <linux/blkdev.h>
2014-11-04 18:20:14 +03:00
# include <linux/blk-mq.h>
2016-09-14 17:18:57 +03:00
# include <linux/blk-mq-pci.h>
2017-04-20 23:37:55 +03:00
# include <linux/dmi.h>
2011-01-20 20:50:14 +03:00
# include <linux/init.h>
# include <linux/interrupt.h>
# include <linux/io.h>
# include <linux/mm.h>
# include <linux/module.h>
2015-11-26 14:21:29 +03:00
# include <linux/mutex.h>
2017-09-15 20:05:38 +03:00
# include <linux/once.h>
2011-01-20 20:50:14 +03:00
# include <linux/pci.h>
2019-05-23 18:27:35 +03:00
# include <linux/suspend.h>
NVMe: Metadata format support
Adds support for NVMe metadata formats and exposes block devices for
all namespaces regardless of their format. Namespace formats that are
unusable will have disk capacity set to 0, but a handle to the block
device is created to simplify device management. A namespace is not
usable when the format requires the host to interleave block data and
metadata in a single buffer, has no provisioned storage, or has metadata
but failed to register with blk integrity.
The namespace has to be scanned in two phases to support separate
metadata formats. The first establishes the sector size and capacity
prior to invoking add_disk. If metadata is required, the capacity will
be temporarily set to 0 until it can be revalidated and registered with
the integrity extensions after add_disk completes.
The driver relies on the integrity extensions to provide the metadata
buffer. NVMe requires this be a single physically contiguous region,
so only one integrity segment is allowed per command. If the metadata
is used for T10 PI, the driver provides mappings to save and restore
the reftag physical block translation. The driver provides no-op
functions for generate and verify if metadata is not used for protection
information. This way the setup is always provided by the block layer.
If a request does not supply a required metadata buffer, the command
is failed with bad address. This could only happen if a user manually
disables verify/generate on such a disk. The only exception to where
this is okay is if the controller is capable of stripping/generating
the metadata, which is possible on some types of formats.
The metadata scatter gather list now occupies the spot in the nvme_iod
that used to be used to link retryable IOD's, but we don't do that
anymore, so the field was unused.
Signed-off-by: Keith Busch <keith.busch@intel.com>
2015-02-19 23:39:03 +03:00
# include <linux/t10-pi.h>
2011-01-20 20:50:14 +03:00
# include <linux/types.h>
2015-08-28 10:27:14 +03:00
# include <linux/io-64-nonatomic-lo-hi.h>
2017-02-03 22:50:32 +03:00
# include <linux/sed-opal.h>
2018-10-05 00:27:43 +03:00
# include <linux/pci-p2pdma.h>
2012-02-07 06:45:33 +04:00
2018-12-18 19:59:53 +03:00
# include "trace.h"
2015-10-03 16:46:41 +03:00
# include "nvme.h"
2011-01-20 20:50:14 +03:00
/*
 * Size in bytes of a submission / completion queue holding @depth entries.
 * The argument is parenthesized so that an expression argument such as
 * SQ_SIZE(a + b) scales the whole expression rather than only its last term.
 */
#define SQ_SIZE(depth)		((depth) * sizeof(struct nvme_command))
#define CQ_SIZE(depth)		((depth) * sizeof(struct nvme_completion))
2016-12-16 21:54:50 +03:00
2017-10-17 04:24:20 +03:00
# define SGES_PER_PAGE (PAGE_SIZE / sizeof(struct nvme_sgl_desc))
2014-05-13 21:42:02 +04:00
2018-06-21 18:49:37 +03:00
/*
* These can be higher, but we need to ensure that any command doesn't
* require an sg allocation that needs more than a page of data.
*/
# define NVME_MAX_KB_SZ 4096
# define NVME_MAX_SEGS 127
2011-02-06 15:28:06 +03:00
/* Integer module parameter registered with permission mask 0. */
static int use_threaded_interrupts ;
module_param ( use_threaded_interrupts , int , 0 ) ;
2015-07-20 19:14:09 +03:00
/* Read-only (0444) boolean, enabled by default; see its description below. */
static bool use_cmb_sqes = true ;
2018-06-06 17:13:09 +03:00
module_param ( use_cmb_sqes , bool , 0444 ) ;
2015-07-20 19:14:09 +03:00
MODULE_PARM_DESC ( use_cmb_sqes , " use controller's memory buffer for I/O SQes " ) ;
2017-05-12 18:02:58 +03:00
/* Read-only (0444) limit, in MiB, defaulting to 128. */
static unsigned int max_host_mem_size_mb = 128 ;
module_param ( max_host_mem_size_mb , uint , 0444 ) ;
MODULE_PARM_DESC ( max_host_mem_size_mb ,
" Maximum Host Memory Buffer (HMB) size per controller (in MiB) " ) ;
2011-03-03 02:37:18 +03:00
2017-10-17 04:24:20 +03:00
/*
 * Writable (0644) threshold that nvme_pci_use_sgls() compares against the
 * average segment size of a request; a value of 0 makes it never pick SGLs.
 */
static unsigned int sgl_threshold = SZ_32K ;
module_param ( sgl_threshold , uint , 0644 ) ;
MODULE_PARM_DESC ( sgl_threshold ,
" Use SGLs when average request segment size is larger or equal to "
" this size. Use 0 to disable SGLs. " ) ;
2017-07-10 11:46:59 +03:00
/*
 * io_queue_depth uses a custom setter (io_queue_depth_set) that rejects
 * values below 2; reads go through the stock param_get_int.
 */
static int io_queue_depth_set ( const char * val , const struct kernel_param * kp ) ;
static const struct kernel_param_ops io_queue_depth_ops = {
. set = io_queue_depth_set ,
. get = param_get_int ,
} ;
static int io_queue_depth = 1024 ;
module_param_cb ( io_queue_depth , & io_queue_depth_ops , & io_queue_depth , 0644 ) ;
MODULE_PARM_DESC ( io_queue_depth , " set io queue depth, should >= 2 " ) ;
2018-10-31 17:36:31 +03:00
/* Added on top of num_possible_cpus() by max_io_queues(). */
static int write_queues ;
2019-06-08 21:02:18 +03:00
module_param ( write_queues , int , 0644 ) ;
2018-10-31 17:36:31 +03:00
MODULE_PARM_DESC ( write_queues ,
" Number of queues to use for writes. If not set, reads and writes "
" will share a queue set. " ) ;
2019-06-08 21:02:17 +03:00
/* Also added on top of num_possible_cpus() by max_io_queues(). */
static int poll_queues ;
2019-06-08 21:02:18 +03:00
module_param ( poll_queues , int , 0644 ) ;
2018-11-05 22:44:33 +03:00
MODULE_PARM_DESC ( poll_queues , " Number of queues to use for polled IO. " ) ;
2015-11-26 12:06:56 +03:00
struct nvme_dev ;
struct nvme_queue ;
2015-02-03 21:21:42 +03:00
2016-01-13 00:41:18 +03:00
static void nvme_dev_disable ( struct nvme_dev * dev , bool shutdown ) ;
2019-01-05 01:04:33 +03:00
static bool __nvme_disable_io_queues ( struct nvme_dev * dev , u8 opcode ) ;
2013-12-11 00:10:37 +04:00
2015-11-26 12:06:56 +03:00
/*
* Represents an NVM Express device . Each nvme_dev is a PCI function .
*/
struct nvme_dev {
2018-01-14 13:39:01 +03:00
/* Queue array: [0] is the admin queue, I/O hctx N uses entry [N + 1]. */
struct nvme_queue * queues ;
2015-11-26 12:06:56 +03:00
/* blk-mq tag set for the I/O queues. */
struct blk_mq_tag_set tagset ;
/* blk-mq tag set for the admin queue. */
struct blk_mq_tag_set admin_tagset ;
u32 __iomem * dbs ;
struct device * dev ;
/* DMA pools that back the PRP lists and SGL segment lists. */
struct dma_pool * prp_page_pool ;
struct dma_pool * prp_small_pool ;
unsigned online_queues ;
unsigned max_qid ;
2018-12-02 19:46:16 +03:00
/* Queue count per blk-mq map type, consumed by nvme_pci_map_queues(). */
unsigned io_queues [ HCTX_MAX_TYPES ] ;
2018-04-12 18:16:10 +03:00
/* More than one vector means I/O queue vectors start at offset 1. */
unsigned int num_vecs ;
2015-11-26 12:06:56 +03:00
int q_depth ;
/* Multiplier applied to doorbell indexes by sq_idx() / cq_idx(). */
u32 db_stride ;
void __iomem * bar ;
2017-05-24 11:39:55 +03:00
unsigned long bar_mapped_size ;
2015-11-26 14:35:49 +03:00
struct work_struct remove_work ;
2015-11-26 14:21:29 +03:00
struct mutex shutdown_lock ;
2015-11-26 12:06:56 +03:00
bool subsystem ;
u64 cmb_size ;
2018-10-05 00:27:43 +03:00
bool cmb_use_sqes ;
2015-11-26 12:06:56 +03:00
u32 cmbsz ;
2016-10-06 05:01:12 +03:00
u32 cmbloc ;
2015-11-26 12:06:56 +03:00
/* Embedded generic controller; to_nvme_dev() maps it back to this struct. */
struct nvme_ctrl ctrl ;
2019-05-23 18:27:35 +03:00
u32 last_ps ;
2017-05-12 18:02:58 +03:00
2018-06-21 18:49:37 +03:00
/* Source of the per-request scatterlist (iod->sg) in nvme_map_data(). */
mempool_t * iod_mempool ;
2017-05-12 18:02:58 +03:00
/* shadow doorbell buffer support: */
2017-04-10 18:51:07 +03:00
/* Doorbell and event-index buffers plus their DMA addresses. */
u32 * dbbuf_dbs ;
dma_addr_t dbbuf_dbs_dma_addr ;
u32 * dbbuf_eis ;
dma_addr_t dbbuf_eis_dma_addr ;
2017-05-12 18:02:58 +03:00
/* host memory buffer support: */
u64 host_mem_size ;
u32 nr_host_mem_descs ;
2017-08-28 11:47:18 +03:00
dma_addr_t host_mem_descs_dma ;
2017-05-12 18:02:58 +03:00
struct nvme_host_mem_buf_desc * host_mem_descs ;
void * * host_mem_desc_bufs ;
2013-12-11 00:10:40 +04:00
} ;
2011-03-03 02:37:18 +03:00
2017-07-10 11:46:59 +03:00
/*
 * Setter for the io_queue_depth module parameter.
 *
 * @val is parsed as a base-10 integer; input that does not parse, or that is
 * smaller than 2, is refused with -EINVAL.  Accepted input is handed to
 * param_set_int() to be stored, and its result is returned.
 */
static int io_queue_depth_set(const char *val, const struct kernel_param *kp)
{
	int depth = 0;
	int err = kstrtoint(val, 10, &depth);

	if (err != 0)
		return -EINVAL;
	if (depth < 2)
		return -EINVAL;

	return param_set_int(val, kp);
}
2017-04-10 18:51:07 +03:00
/* Submission-queue slot of queue @qid: the even slot 2 * qid, scaled by @stride. */
static inline unsigned int sq_idx(unsigned int qid, u32 stride)
{
	unsigned int slot = qid * 2;

	return slot * stride;
}
/* Completion-queue slot of queue @qid: the odd slot 2 * qid + 1, scaled by @stride. */
static inline unsigned int cq_idx(unsigned int qid, u32 stride)
{
	unsigned int slot = qid * 2 + 1;

	return slot * stride;
}
2015-11-26 12:06:56 +03:00
/* Map an embedded struct nvme_ctrl back to the nvme_dev that contains it. */
static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl)
{
	struct nvme_dev *dev = container_of(ctrl, struct nvme_dev, ctrl);

	return dev;
}
2011-01-20 20:50:14 +03:00
/*
* An NVM Express queue . Each device has at least two ( one for admin
* commands and one for I / O commands ) .
*/
struct nvme_queue {
2011-02-10 17:56:01 +03:00
struct nvme_dev * dev ;
2018-05-17 19:31:51 +03:00
/* Taken around the sq_cmds copy, the sq_tail update and the SQ doorbell. */
spinlock_t sq_lock ;
2011-01-20 20:50:14 +03:00
/* Submission queue entries; new commands are copied in at sq_tail. */
struct nvme_command * sq_cmds ;
2018-12-02 19:46:23 +03:00
/* only used for poll queues: */
spinlock_t cq_poll_lock ____cacheline_aligned_in_smp ;
2011-01-20 20:50:14 +03:00
volatile struct nvme_completion * cqes ;
2015-06-01 18:29:54 +03:00
/* Points at the tag set slot for this queue; set by the init_hctx hooks. */
struct blk_mq_tags * * tags ;
2011-01-20 20:50:14 +03:00
dma_addr_t sq_dma_addr ;
dma_addr_t cq_dma_addr ;
/* Doorbell register that nvme_write_sq_db() writes sq_tail to. */
u32 __iomem * q_db ;
u16 q_depth ;
2019-03-08 20:43:06 +03:00
u16 cq_vector ;
2011-01-20 20:50:14 +03:00
/* Next free submission slot; wraps to 0 at q_depth. */
u16 sq_tail ;
2018-11-29 20:02:29 +03:00
/* Value of sq_tail at the last doorbell update. */
u16 last_sq_tail ;
2011-01-20 20:50:14 +03:00
u16 cq_head ;
2018-05-21 17:41:52 +03:00
u16 last_cq_head ;
2013-12-11 00:10:38 +04:00
/* Queue id; 0 is the admin queue. */
u16 qid ;
2013-06-24 19:47:34 +04:00
u8 cq_phase ;
2018-12-02 19:46:17 +03:00
/* NOTE(review): the NVMEQ_* values below look like bit numbers in flags - confirm. */
unsigned long flags ;
# define NVMEQ_ENABLED 0
2018-12-02 19:46:18 +03:00
# define NVMEQ_SQ_CMB 1
2018-12-02 19:46:22 +03:00
# define NVMEQ_DELETE_ERROR 2
2019-03-08 20:43:06 +03:00
# define NVMEQ_POLLED 3
2017-04-10 18:51:07 +03:00
/* Shadow doorbell / event-index slots, pointed into dev->dbbuf_{dbs,eis}. */
u32 * dbbuf_sq_db ;
u32 * dbbuf_cq_db ;
u32 * dbbuf_sq_ei ;
u32 * dbbuf_cq_ei ;
2018-12-02 19:46:22 +03:00
struct completion delete_done ;
2011-01-20 20:50:14 +03:00
} ;
2015-10-16 08:58:32 +03:00
/*
2019-03-03 18:04:01 +03:00
* The nvme_iod describes the data in an I / O .
*
* The sg pointer contains the list of PRP / SGL chunk allocations in addition
* to the actual struct scatterlist .
2015-10-16 08:58:32 +03:00
*/
struct nvme_iod {
2016-11-10 18:32:33 +03:00
struct nvme_request req ;
2015-11-28 17:43:10 +03:00
/* Queue this request belongs to; assigned in nvme_init_request(). */
struct nvme_queue * nvmeq ;
2017-10-17 04:24:20 +03:00
/* True when the data pointer was built as an SGL rather than PRPs. */
bool use_sgl ;
2015-11-28 17:43:10 +03:00
int aborted ;
2015-10-16 08:58:32 +03:00
/* Set to -1 by the PRP/SGL setup code when the first list allocation fails. */
int npages ; /* In the PRP list. 0 means small pool in use */
int nents ; /* Used in scatterlist */
/* Single mapping address, second PRP, or DMA address of the first list page. */
dma_addr_t first_dma ;
2019-03-05 15:49:34 +03:00
/* Non-zero only on the single-bvec mapping path; selects dma_unmap_page(). */
unsigned int dma_len ; /* length of single DMA segment mapping */
2019-03-03 18:19:18 +03:00
dma_addr_t meta_dma ;
2015-11-28 17:43:10 +03:00
/* Scatterlist from dev->iod_mempool; the list-page pointers follow it. */
struct scatterlist * sg ;
2011-01-20 20:50:14 +03:00
} ;
2018-10-31 17:36:31 +03:00
static unsigned int max_io_queues ( void )
{
2018-11-05 22:44:33 +03:00
return num_possible_cpus ( ) + write_queues + poll_queues ;
2018-10-31 17:36:31 +03:00
}
/* Upper bound on the total queue count: every I/O queue plus the admin queue. */
static unsigned int max_queue_count(void)
{
	unsigned int io_queues = max_io_queues();

	return io_queues + 1;
}
2017-04-10 18:51:07 +03:00
/*
 * Size of one shadow doorbell buffer: 8 * @stride for each of the
 * max_queue_count() queues.
 */
static inline unsigned int nvme_dbbuf_size(u32 stride)
{
	unsigned int nr_queues = max_queue_count();

	return nr_queues * 8 * stride;
}
static int nvme_dbbuf_dma_alloc ( struct nvme_dev * dev )
{
unsigned int mem_size = nvme_dbbuf_size ( dev - > db_stride ) ;
if ( dev - > dbbuf_dbs )
return 0 ;
dev - > dbbuf_dbs = dma_alloc_coherent ( dev - > dev , mem_size ,
& dev - > dbbuf_dbs_dma_addr ,
GFP_KERNEL ) ;
if ( ! dev - > dbbuf_dbs )
return - ENOMEM ;
dev - > dbbuf_eis = dma_alloc_coherent ( dev - > dev , mem_size ,
& dev - > dbbuf_eis_dma_addr ,
GFP_KERNEL ) ;
if ( ! dev - > dbbuf_eis ) {
dma_free_coherent ( dev - > dev , mem_size ,
dev - > dbbuf_dbs , dev - > dbbuf_dbs_dma_addr ) ;
dev - > dbbuf_dbs = NULL ;
return - ENOMEM ;
}
return 0 ;
}
static void nvme_dbbuf_dma_free ( struct nvme_dev * dev )
{
unsigned int mem_size = nvme_dbbuf_size ( dev - > db_stride ) ;
if ( dev - > dbbuf_dbs ) {
dma_free_coherent ( dev - > dev , mem_size ,
dev - > dbbuf_dbs , dev - > dbbuf_dbs_dma_addr ) ;
dev - > dbbuf_dbs = NULL ;
}
if ( dev - > dbbuf_eis ) {
dma_free_coherent ( dev - > dev , mem_size ,
dev - > dbbuf_eis , dev - > dbbuf_eis_dma_addr ) ;
dev - > dbbuf_eis = NULL ;
}
}
static void nvme_dbbuf_init ( struct nvme_dev * dev ,
struct nvme_queue * nvmeq , int qid )
{
if ( ! dev - > dbbuf_dbs | | ! qid )
return ;
nvmeq - > dbbuf_sq_db = & dev - > dbbuf_dbs [ sq_idx ( qid , dev - > db_stride ) ] ;
nvmeq - > dbbuf_cq_db = & dev - > dbbuf_dbs [ cq_idx ( qid , dev - > db_stride ) ] ;
nvmeq - > dbbuf_sq_ei = & dev - > dbbuf_eis [ sq_idx ( qid , dev - > db_stride ) ] ;
nvmeq - > dbbuf_cq_ei = & dev - > dbbuf_eis [ cq_idx ( qid , dev - > db_stride ) ] ;
}
static void nvme_dbbuf_set ( struct nvme_dev * dev )
{
struct nvme_command c ;
if ( ! dev - > dbbuf_dbs )
return ;
memset ( & c , 0 , sizeof ( c ) ) ;
c . dbbuf . opcode = nvme_admin_dbbuf ;
c . dbbuf . prp1 = cpu_to_le64 ( dev - > dbbuf_dbs_dma_addr ) ;
c . dbbuf . prp2 = cpu_to_le64 ( dev - > dbbuf_eis_dma_addr ) ;
if ( nvme_submit_sync_cmd ( dev - > ctrl . admin_q , & c , NULL , 0 ) ) {
2017-05-20 16:14:43 +03:00
dev_warn ( dev - > ctrl . device , " unable to set dbbuf \n " ) ;
2017-04-10 18:51:07 +03:00
/* Free memory and continue on */
nvme_dbbuf_dma_free ( dev ) ;
}
}
/*
 * Return non-zero when @event_idx lies in the window the index just moved
 * across, i.e. in [@old, @new_idx), using wrapping 16-bit arithmetic.
 */
static inline int nvme_dbbuf_need_event(u16 event_idx, u16 new_idx, u16 old)
{
	u16 past_event = new_idx - event_idx - 1;
	u16 advanced = new_idx - old;

	return past_event < advanced;
}
/*
 * Update dbbuf and return true if an MMIO is required.
 *
 * When @dbbuf_db is NULL there is no shadow doorbell and true is always
 * returned.  Otherwise @value is stored in the shadow doorbell and the
 * event index at @dbbuf_ei decides, via nvme_dbbuf_need_event(), whether
 * the caller still has to write the real doorbell.
 */
static bool nvme_dbbuf_update_and_check_event ( u16 value , u32 * dbbuf_db ,
volatile u32 * dbbuf_ei )
{
if ( dbbuf_db ) {
u16 old_value ;
/*
* Ensure that the queue is written before updating
* the doorbell in memory
*/
wmb ( ) ;
/* Remember the previous doorbell value for the window check below. */
old_value = * dbbuf_db ;
* dbbuf_db = value ;
2018-08-16 01:51:57 +03:00
/*
* Ensure that the doorbell is updated before reading the event
* index from memory . The controller needs to provide similar
* ordering to ensure the event index is updated before reading
* the doorbell .
*/
mb ( ) ;
2017-04-10 18:51:07 +03:00
if ( ! nvme_dbbuf_need_event ( * dbbuf_ei , value , old_value ) )
return false ;
}
return true ;
2011-01-20 20:50:14 +03:00
}
2015-01-22 22:07:58 +03:00
/*
* Will slightly overestimate the number of pages needed . This is OK
* as it only leads to a small amount of wasted memory for the lifetime of
* the I / O .
*/
static int nvme_npages ( unsigned size , struct nvme_dev * dev )
{
2015-11-28 17:03:49 +03:00
unsigned nprps = DIV_ROUND_UP ( size + dev - > ctrl . page_size ,
dev - > ctrl . page_size ) ;
2015-01-22 22:07:58 +03:00
return DIV_ROUND_UP ( 8 * nprps , PAGE_SIZE - 8 ) ;
}
2017-10-17 04:24:20 +03:00
/*
* Calculates the number of pages needed for the SGL segments . For example a 4 k
* page can accommodate 256 SGL descriptors .
*/
static int nvme_pci_npages_sgl ( unsigned int num_seg )
2015-01-22 22:07:58 +03:00
{
2017-10-17 04:24:20 +03:00
return DIV_ROUND_UP ( num_seg * sizeof ( struct nvme_sgl_desc ) , PAGE_SIZE ) ;
2015-11-28 17:43:10 +03:00
}
2015-01-22 22:07:58 +03:00
2017-10-17 04:24:20 +03:00
/*
 * Bytes needed per request for the scatterlist of @nseg entries plus one
 * list-page pointer for every PRP or SGL page a transfer of @size bytes
 * may need.
 */
static unsigned int nvme_pci_iod_alloc_size(struct nvme_dev *dev,
		unsigned int size, unsigned int nseg, bool use_sgl)
{
	int npages = use_sgl ? nvme_pci_npages_sgl(nseg) :
			       nvme_npages(size, dev);
	size_t list_bytes = sizeof(__le64 *) * npages;

	return list_bytes + sizeof(struct scatterlist) * nseg;
}
2015-01-22 22:07:58 +03:00
2014-11-04 18:20:14 +03:00
static int nvme_admin_init_hctx ( struct blk_mq_hw_ctx * hctx , void * data ,
unsigned int hctx_idx )
2011-02-07 02:30:16 +03:00
{
2014-11-04 18:20:14 +03:00
struct nvme_dev * dev = data ;
2018-01-14 13:39:01 +03:00
struct nvme_queue * nvmeq = & dev - > queues [ 0 ] ;
2014-11-04 18:20:14 +03:00
2015-06-01 18:29:54 +03:00
WARN_ON ( hctx_idx ! = 0 ) ;
WARN_ON ( dev - > admin_tagset . tags [ 0 ] ! = hctx - > tags ) ;
WARN_ON ( nvmeq - > tags ) ;
2014-11-04 18:20:14 +03:00
hctx - > driver_data = nvmeq ;
2015-06-01 18:29:54 +03:00
nvmeq - > tags = & dev - > admin_tagset . tags [ 0 ] ;
2014-11-04 18:20:14 +03:00
return 0 ;
2011-02-07 02:30:16 +03:00
}
2015-06-08 19:08:13 +03:00
static void nvme_admin_exit_hctx ( struct blk_mq_hw_ctx * hctx , unsigned int hctx_idx )
{
struct nvme_queue * nvmeq = hctx - > driver_data ;
nvmeq - > tags = NULL ;
}
2014-11-04 18:20:14 +03:00
static int nvme_init_hctx ( struct blk_mq_hw_ctx * hctx , void * data ,
unsigned int hctx_idx )
2011-01-20 20:50:14 +03:00
{
2014-11-04 18:20:14 +03:00
struct nvme_dev * dev = data ;
2018-01-14 13:39:01 +03:00
struct nvme_queue * nvmeq = & dev - > queues [ hctx_idx + 1 ] ;
2014-11-04 18:20:14 +03:00
2015-06-01 18:29:54 +03:00
if ( ! nvmeq - > tags )
nvmeq - > tags = & dev - > tagset . tags [ hctx_idx ] ;
2011-01-20 20:50:14 +03:00
2015-06-01 18:29:54 +03:00
WARN_ON ( dev - > tagset . tags [ hctx_idx ] ! = hctx - > tags ) ;
2014-11-04 18:20:14 +03:00
hctx - > driver_data = nvmeq ;
return 0 ;
2011-01-20 20:50:14 +03:00
}
2017-05-01 19:19:08 +03:00
static int nvme_init_request ( struct blk_mq_tag_set * set , struct request * req ,
unsigned int hctx_idx , unsigned int numa_node )
2011-01-20 20:50:14 +03:00
{
2017-05-01 19:19:08 +03:00
struct nvme_dev * dev = set - > driver_data ;
2015-11-28 17:43:10 +03:00
struct nvme_iod * iod = blk_mq_rq_to_pdu ( req ) ;
2017-06-13 10:15:18 +03:00
int queue_idx = ( set = = & dev - > tagset ) ? hctx_idx + 1 : 0 ;
2018-01-14 13:39:01 +03:00
struct nvme_queue * nvmeq = & dev - > queues [ queue_idx ] ;
2014-11-04 18:20:14 +03:00
BUG_ON ( ! nvmeq ) ;
2015-11-28 17:43:10 +03:00
iod - > nvmeq = nvmeq ;
2018-06-30 01:50:00 +03:00
nvme_req ( req ) - > ctrl = & dev - > ctrl ;
2014-11-04 18:20:14 +03:00
return 0 ;
}
2018-10-31 17:36:31 +03:00
static int queue_irq_offset ( struct nvme_dev * dev )
{
/* if we have more than 1 vec, admin queue offsets us by 1 */
if ( dev - > num_vecs > 1 )
return 1 ;
return 0 ;
}
2016-09-14 17:18:57 +03:00
static int nvme_pci_map_queues ( struct blk_mq_tag_set * set )
{
struct nvme_dev * dev = set - > driver_data ;
2018-10-31 17:36:31 +03:00
int i , qoff , offset ;
offset = queue_irq_offset ( dev ) ;
for ( i = 0 , qoff = 0 ; i < set - > nr_maps ; i + + ) {
struct blk_mq_queue_map * map = & set - > map [ i ] ;
map - > nr_queues = dev - > io_queues [ i ] ;
if ( ! map - > nr_queues ) {
2018-12-02 19:46:16 +03:00
BUG_ON ( i = = HCTX_TYPE_DEFAULT ) ;
2018-12-17 14:16:27 +03:00
continue ;
2018-10-31 17:36:31 +03:00
}
2018-11-05 22:44:33 +03:00
/*
* The poll queue ( s ) doesn ' t have an IRQ ( and hence IRQ
* affinity ) , so use the regular blk - mq cpu mapping
*/
2018-10-31 17:36:31 +03:00
map - > queue_offset = qoff ;
2019-05-21 19:56:43 +03:00
if ( i ! = HCTX_TYPE_POLL & & offset )
2018-11-05 22:44:33 +03:00
blk_mq_pci_map_queues ( map , to_pci_dev ( dev - > dev ) , offset ) ;
else
blk_mq_map_queues ( map ) ;
2018-10-31 17:36:31 +03:00
qoff + = map - > nr_queues ;
offset + = map - > nr_queues ;
}
return 0 ;
2016-09-14 17:18:57 +03:00
}
2018-11-29 20:02:29 +03:00
/*
* Write sq tail if we are asked to , or if the next command would wrap .
*/
static inline void nvme_write_sq_db(struct nvme_queue *nvmeq, bool write_sq)
{
	bool need_mmio;

	if (!write_sq) {
		u16 upcoming = nvmeq->sq_tail + 1;

		if (upcoming == nvmeq->q_depth)
			upcoming = 0;
		/* The next command would not wrap onto the last written tail. */
		if (upcoming != nvmeq->last_sq_tail)
			return;
	}

	need_mmio = nvme_dbbuf_update_and_check_event(nvmeq->sq_tail,
						      nvmeq->dbbuf_sq_db,
						      nvmeq->dbbuf_sq_ei);
	if (need_mmio)
		writel(nvmeq->sq_tail, nvmeq->q_db);

	nvmeq->last_sq_tail = nvmeq->sq_tail;
}
2011-01-20 20:50:14 +03:00
/**
2018-05-26 14:45:55 +03:00
* nvme_submit_cmd ( ) - Copy a command into a queue and ring the doorbell
2011-01-20 20:50:14 +03:00
* @ nvmeq : The queue to use
* @ cmd : The command to send
2018-11-29 20:02:29 +03:00
* @ write_sq : whether to write to the SQ doorbell
2011-01-20 20:50:14 +03:00
*/
2018-11-29 20:02:29 +03:00
static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd,
			    bool write_sq)
{
	spin_lock(&nvmeq->sq_lock);

	/* Copy the command into the slot at the current tail. */
	memcpy(nvmeq->sq_cmds + nvmeq->sq_tail, cmd, sizeof(*cmd));

	/* Advance the tail, wrapping at the queue depth. */
	nvmeq->sq_tail++;
	if (nvmeq->sq_tail == nvmeq->q_depth)
		nvmeq->sq_tail = 0;

	nvme_write_sq_db(nvmeq, write_sq);

	spin_unlock(&nvmeq->sq_lock);
}
static void nvme_commit_rqs ( struct blk_mq_hw_ctx * hctx )
{
struct nvme_queue * nvmeq = hctx - > driver_data ;
spin_lock ( & nvmeq - > sq_lock ) ;
if ( nvmeq - > sq_tail ! = nvmeq - > last_sq_tail )
nvme_write_sq_db ( nvmeq , true ) ;
2018-05-26 14:45:55 +03:00
spin_unlock ( & nvmeq - > sq_lock ) ;
2011-01-20 20:50:14 +03:00
}
2017-10-17 04:24:20 +03:00
static void * * nvme_pci_iod_list ( struct request * req )
2011-01-20 20:50:14 +03:00
{
2015-11-28 17:43:10 +03:00
struct nvme_iod * iod = blk_mq_rq_to_pdu ( req ) ;
2017-10-17 04:24:20 +03:00
return ( void * * ) ( iod - > sg + blk_rq_nr_phys_segments ( req ) ) ;
2011-01-20 20:50:14 +03:00
}
2017-12-20 10:30:50 +03:00
static inline bool nvme_pci_use_sgls ( struct nvme_dev * dev , struct request * req )
{
struct nvme_iod * iod = blk_mq_rq_to_pdu ( req ) ;
2018-01-18 00:04:37 +03:00
int nseg = blk_rq_nr_phys_segments ( req ) ;
2017-12-20 10:30:50 +03:00
unsigned int avg_seg_size ;
2018-01-18 00:04:37 +03:00
if ( nseg = = 0 )
return false ;
avg_seg_size = DIV_ROUND_UP ( blk_rq_payload_bytes ( req ) , nseg ) ;
2017-12-20 10:30:50 +03:00
if ( ! ( dev - > ctrl . sgls & ( ( 1 < < 0 ) | ( 1 < < 1 ) ) ) )
return false ;
if ( ! iod - > nvmeq - > qid )
return false ;
if ( ! sgl_threshold | | avg_seg_size < sgl_threshold )
return false ;
return true ;
}
2019-03-03 18:15:19 +03:00
static void nvme_unmap_data ( struct nvme_dev * dev , struct request * req )
2011-01-20 20:50:14 +03:00
{
2015-11-28 17:43:10 +03:00
struct nvme_iod * iod = blk_mq_rq_to_pdu ( req ) ;
2019-03-03 18:15:19 +03:00
enum dma_data_direction dma_dir = rq_data_dir ( req ) ?
DMA_TO_DEVICE : DMA_FROM_DEVICE ;
2017-10-17 04:24:20 +03:00
const int last_prp = dev - > ctrl . page_size / sizeof ( __le64 ) - 1 ;
dma_addr_t dma_addr = iod - > first_dma , next_dma_addr ;
2011-12-20 22:34:52 +04:00
int i ;
2019-03-05 15:49:34 +03:00
if ( iod - > dma_len ) {
dma_unmap_page ( dev - > dev , dma_addr , iod - > dma_len , dma_dir ) ;
return ;
2019-03-03 18:15:19 +03:00
}
2019-03-05 15:49:34 +03:00
WARN_ON_ONCE ( ! iod - > nents ) ;
/* P2PDMA requests do not need to be unmapped */
if ( ! is_pci_p2pdma_page ( sg_page ( iod - > sg ) ) )
dma_unmap_sg ( dev - > dev , iod - > sg , iod - > nents , rq_dma_dir ( req ) ) ;
2011-12-20 22:34:52 +04:00
if ( iod - > npages = = 0 )
2017-10-17 04:24:20 +03:00
dma_pool_free ( dev - > prp_small_pool , nvme_pci_iod_list ( req ) [ 0 ] ,
dma_addr ) ;
2011-12-20 22:34:52 +04:00
for ( i = 0 ; i < iod - > npages ; i + + ) {
2017-10-17 04:24:20 +03:00
void * addr = nvme_pci_iod_list ( req ) [ i ] ;
if ( iod - > use_sgl ) {
struct nvme_sgl_desc * sg_list = addr ;
next_dma_addr =
le64_to_cpu ( ( sg_list [ SGES_PER_PAGE - 1 ] ) . addr ) ;
} else {
__le64 * prp_list = addr ;
next_dma_addr = le64_to_cpu ( prp_list [ last_prp ] ) ;
}
dma_pool_free ( dev - > prp_page_pool , addr , dma_addr ) ;
dma_addr = next_dma_addr ;
2011-12-20 22:34:52 +04:00
}
2015-01-22 22:07:58 +03:00
2019-03-05 15:46:58 +03:00
mempool_free ( iod - > sg , dev - > iod_mempool ) ;
2014-08-29 19:06:12 +04:00
}
2017-09-15 20:05:38 +03:00
/* Dump every entry of a scatterlist with pr_warn(), one line per entry. */
static void nvme_print_sgl(struct scatterlist *sgl, int nents)
{
	struct scatterlist *entry;
	int idx;

	for_each_sg(sgl, entry, nents, idx) {
		dma_addr_t phys = sg_phys(entry);

		pr_warn(" sg[%d] phys_addr:%pad offset:%d length:%d "
			" dma_address:%pad dma_length:%d \n ",
			idx, &phys, entry->offset, entry->length,
			&sg_dma_address(entry), sg_dma_len(entry));
	}
}
2017-10-17 04:24:20 +03:00
/*
 * Fill in the PRP data pointer of @cmnd for the already DMA-mapped
 * scatterlist in the request's iod.
 *
 * prp1 is always the DMA address of the first scatterlist entry.  prp2 is
 * iod->first_dma, which ends up as 0 (data fits in the first page), the
 * address of the second page (data fits in two pages), or the DMA address
 * of the first PRP list page.
 *
 * Returns BLK_STS_OK on success, BLK_STS_RESOURCE when a PRP list page
 * cannot be allocated, and BLK_STS_IOERR when a scatterlist entry does not
 * end on a page_size boundary while more data remains.
 */
static blk_status_t nvme_pci_setup_prps ( struct nvme_dev * dev ,
struct request * req , struct nvme_rw_command * cmnd )
2011-01-26 18:02:29 +03:00
{
2015-11-28 17:43:10 +03:00
struct nvme_iod * iod = blk_mq_rq_to_pdu ( req ) ;
2011-02-10 18:30:34 +03:00
struct dma_pool * pool ;
2017-01-13 14:29:12 +03:00
int length = blk_rq_payload_bytes ( req ) ;
2011-12-20 22:34:52 +04:00
struct scatterlist * sg = iod - > sg ;
2011-01-26 18:02:29 +03:00
int dma_len = sg_dma_len ( sg ) ;
u64 dma_addr = sg_dma_address ( sg ) ;
2015-11-28 17:03:49 +03:00
u32 page_size = dev - > ctrl . page_size ;
2015-03-26 19:07:51 +03:00
/* Offset of the first byte within its controller page. */
int offset = dma_addr & ( page_size - 1 ) ;
2011-02-10 16:51:24 +03:00
__le64 * prp_list ;
2017-10-17 04:24:20 +03:00
void * * list = nvme_pci_iod_list ( req ) ;
2011-02-10 16:51:24 +03:00
dma_addr_t prp_dma ;
2011-12-20 22:34:52 +04:00
int nprps , i ;
2011-01-26 18:02:29 +03:00
2014-06-23 21:34:01 +04:00
/* Account for the part of the first page that prp1 covers. */
length - = ( page_size - offset ) ;
2017-08-27 16:56:37 +03:00
if ( length < = 0 ) {
/* Everything fits in the first page: no second pointer needed. */
iod - > first_dma = 0 ;
2017-10-17 04:24:20 +03:00
goto done ;
2017-08-27 16:56:37 +03:00
}
2011-01-26 18:02:29 +03:00
2014-06-23 21:34:01 +04:00
/* Step past the first page, moving to the next sg entry if this one is used up. */
dma_len - = ( page_size - offset ) ;
2011-01-26 18:02:29 +03:00
if ( dma_len ) {
2014-06-23 21:34:01 +04:00
dma_addr + = ( page_size - offset ) ;
2011-01-26 18:02:29 +03:00
} else {
sg = sg_next ( sg ) ;
dma_addr = sg_dma_address ( sg ) ;
dma_len = sg_dma_len ( sg ) ;
}
2014-06-23 21:34:01 +04:00
if ( length < = page_size ) {
/* The remainder fits in one more page: prp2 points straight at it. */
2014-04-04 02:45:23 +04:00
iod - > first_dma = dma_addr ;
2017-10-17 04:24:20 +03:00
goto done ;
2011-02-10 16:51:24 +03:00
}
2014-06-23 21:34:01 +04:00
nprps = DIV_ROUND_UP ( length , page_size ) ;
2011-02-10 18:30:34 +03:00
/*
 * Up to 256 / 8 = 32 eight-byte entries use the small pool.
 * NOTE(review): presumably the small pool's element size is 256 bytes -
 * confirm against where the pool is created.
 */
if ( nprps < = ( 256 / 8 ) ) {
pool = dev - > prp_small_pool ;
2011-12-20 22:34:52 +04:00
iod - > npages = 0 ;
2011-02-10 18:30:34 +03:00
} else {
pool = dev - > prp_page_pool ;
2011-12-20 22:34:52 +04:00
iod - > npages = 1 ;
2011-02-10 18:30:34 +03:00
}
2015-10-16 08:58:37 +03:00
prp_list = dma_pool_alloc ( pool , GFP_ATOMIC , & prp_dma ) ;
2011-05-12 21:51:41 +04:00
if ( ! prp_list ) {
2014-04-04 02:45:23 +04:00
iod - > first_dma = dma_addr ;
2011-12-20 22:34:52 +04:00
/* npages == -1 tells nvme_unmap_data() that no list page exists. */
iod - > npages = - 1 ;
2017-07-12 22:59:07 +03:00
return BLK_STS_RESOURCE ;
2011-05-12 21:51:41 +04:00
}
2011-12-20 22:34:52 +04:00
list [ 0 ] = prp_list ;
iod - > first_dma = prp_dma ;
2011-02-10 16:51:24 +03:00
i = 0 ;
for ( ; ; ) {
2014-06-23 21:34:01 +04:00
/*
 * Current list page is full: allocate another one, move the last
 * data entry into its first slot and turn the old last slot into
 * a pointer to the new page.
 */
if ( i = = page_size > > 3 ) {
2011-02-10 16:51:24 +03:00
__le64 * old_prp_list = prp_list ;
2015-10-16 08:58:37 +03:00
prp_list = dma_pool_alloc ( pool , GFP_ATOMIC , & prp_dma ) ;
2011-12-20 22:34:52 +04:00
if ( ! prp_list )
2017-07-12 22:59:07 +03:00
return BLK_STS_RESOURCE ;
2011-12-20 22:34:52 +04:00
list [ iod - > npages + + ] = prp_list ;
2011-03-16 23:43:40 +03:00
prp_list [ 0 ] = old_prp_list [ i - 1 ] ;
old_prp_list [ i - 1 ] = cpu_to_le64 ( prp_dma ) ;
i = 1 ;
2011-02-10 16:51:24 +03:00
}
/* One PRP entry per controller page. */
prp_list [ i + + ] = cpu_to_le64 ( dma_addr ) ;
2014-06-23 21:34:01 +04:00
dma_len - = page_size ;
dma_addr + = page_size ;
length - = page_size ;
2011-02-10 16:51:24 +03:00
if ( length < = 0 )
break ;
if ( dma_len > 0 )
continue ;
2017-07-12 22:59:07 +03:00
/* Entry ended mid-page although more data follows: not expressible as PRPs. */
if ( unlikely ( dma_len < 0 ) )
goto bad_sgl ;
2011-02-10 16:51:24 +03:00
sg = sg_next ( sg ) ;
dma_addr = sg_dma_address ( sg ) ;
dma_len = sg_dma_len ( sg ) ;
2011-01-26 18:02:29 +03:00
}
2017-10-17 04:24:20 +03:00
done :
cmnd - > dptr . prp1 = cpu_to_le64 ( sg_dma_address ( iod - > sg ) ) ;
cmnd - > dptr . prp2 = cpu_to_le64 ( iod - > first_dma ) ;
2017-07-12 22:59:07 +03:00
return BLK_STS_OK ;
bad_sgl :
2017-09-15 20:05:38 +03:00
/* The scatterlist itself is dumped only once (DO_ONCE). */
WARN ( DO_ONCE ( nvme_print_sgl , iod - > sg , iod - > nents ) ,
" Invalid SGL for payload:%d nents:%d \n " ,
blk_rq_payload_bytes ( req ) , iod - > nents ) ;
2017-07-12 22:59:07 +03:00
return BLK_STS_IOERR ;
2011-01-26 18:02:29 +03:00
}
2017-10-17 04:24:20 +03:00
static void nvme_pci_sgl_set_data ( struct nvme_sgl_desc * sge ,
struct scatterlist * sg )
{
sge - > addr = cpu_to_le64 ( sg_dma_address ( sg ) ) ;
sge - > length = cpu_to_le32 ( sg_dma_len ( sg ) ) ;
sge - > type = NVME_SGL_FMT_DATA_DESC < < 4 ;
}
/*
 * Fill @sge as a segment descriptor pointing at the descriptor page at
 * @dma_addr.  With fewer than SGES_PER_PAGE entries left it is a "last
 * segment" sized to exactly those entries; otherwise it describes a full
 * page and more segments follow.
 */
static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge,
				 dma_addr_t dma_addr, int entries)
{
	sge->addr = cpu_to_le64(dma_addr);

	if (entries >= SGES_PER_PAGE) {
		sge->length = cpu_to_le32(PAGE_SIZE);
		sge->type = NVME_SGL_FMT_SEG_DESC << 4;
		return;
	}

	sge->length = cpu_to_le32(entries * sizeof(*sge));
	sge->type = NVME_SGL_FMT_LAST_SEG_DESC << 4;
}
/*
 * Fill in the SGL data pointer of @cmd for the @entries DMA-mapped
 * scatterlist entries of the request.
 *
 * A single entry is described inline in the command.  Otherwise the
 * descriptors are written to pool memory and the command carries a segment
 * descriptor pointing at the first descriptor page.
 *
 * Returns BLK_STS_OK on success and BLK_STS_RESOURCE when a descriptor
 * page cannot be allocated.
 */
static blk_status_t nvme_pci_setup_sgls ( struct nvme_dev * dev ,
2018-01-18 00:04:38 +03:00
struct request * req , struct nvme_rw_command * cmd , int entries )
2017-10-17 04:24:20 +03:00
{
struct nvme_iod * iod = blk_mq_rq_to_pdu ( req ) ;
struct dma_pool * pool ;
struct nvme_sgl_desc * sg_list ;
struct scatterlist * sg = iod - > sg ;
dma_addr_t sgl_dma ;
2018-01-18 00:04:38 +03:00
int i = 0 ;
2017-10-17 04:24:20 +03:00
/* setting the transfer type as SGL */
cmd - > flags = NVME_CMD_SGL_METABUF ;
2018-01-18 00:04:38 +03:00
if ( entries = = 1 ) {
2017-10-17 04:24:20 +03:00
/* One mapped segment: the data descriptor goes directly in the command. */
nvme_pci_sgl_set_data ( & cmd - > dptr . sgl , sg ) ;
return BLK_STS_OK ;
}
/*
 * Small descriptor counts use the small pool.
 * NOTE(review): presumably the small pool's element size is 256 bytes -
 * confirm against where the pool is created.
 */
if ( entries < = ( 256 / sizeof ( struct nvme_sgl_desc ) ) ) {
pool = dev - > prp_small_pool ;
iod - > npages = 0 ;
} else {
pool = dev - > prp_page_pool ;
iod - > npages = 1 ;
}
sg_list = dma_pool_alloc ( pool , GFP_ATOMIC , & sgl_dma ) ;
if ( ! sg_list ) {
/* npages == -1 tells nvme_unmap_data() that no list page exists. */
iod - > npages = - 1 ;
return BLK_STS_RESOURCE ;
}
nvme_pci_iod_list ( req ) [ 0 ] = sg_list ;
iod - > first_dma = sgl_dma ;
nvme_pci_sgl_set_seg ( & cmd - > dptr . sgl , sgl_dma , entries ) ;
do {
/*
 * Current descriptor page is full: allocate another one, move the
 * last data descriptor into its first slot and turn the old last
 * slot into a segment descriptor pointing at the new page.
 */
if ( i = = SGES_PER_PAGE ) {
struct nvme_sgl_desc * old_sg_desc = sg_list ;
struct nvme_sgl_desc * link = & old_sg_desc [ i - 1 ] ;
sg_list = dma_pool_alloc ( pool , GFP_ATOMIC , & sgl_dma ) ;
if ( ! sg_list )
return BLK_STS_RESOURCE ;
i = 0 ;
nvme_pci_iod_list ( req ) [ iod - > npages + + ] = sg_list ;
sg_list [ i + + ] = * link ;
nvme_pci_sgl_set_seg ( link , sgl_dma , entries ) ;
}
/* One data descriptor per mapped scatterlist entry. */
nvme_pci_sgl_set_data ( & sg_list [ i + + ] , sg ) ;
sg = sg_next ( sg ) ;
2018-01-18 00:04:38 +03:00
} while ( - - entries > 0 ) ;
2017-10-17 04:24:20 +03:00
return BLK_STS_OK ;
}
2019-03-05 15:49:34 +03:00
static blk_status_t nvme_setup_prp_simple ( struct nvme_dev * dev ,
struct request * req , struct nvme_rw_command * cmnd ,
struct bio_vec * bv )
{
struct nvme_iod * iod = blk_mq_rq_to_pdu ( req ) ;
unsigned int first_prp_len = dev - > ctrl . page_size - bv - > bv_offset ;
iod - > first_dma = dma_map_bvec ( dev - > dev , bv , rq_dma_dir ( req ) , 0 ) ;
if ( dma_mapping_error ( dev - > dev , iod - > first_dma ) )
return BLK_STS_RESOURCE ;
iod - > dma_len = bv - > bv_len ;
cmnd - > dptr . prp1 = cpu_to_le64 ( iod - > first_dma ) ;
if ( bv - > bv_len > first_prp_len )
cmnd - > dptr . prp2 = cpu_to_le64 ( iod - > first_dma + first_prp_len ) ;
return 0 ;
}
2019-03-05 15:54:18 +03:00
static blk_status_t nvme_setup_sgl_simple ( struct nvme_dev * dev ,
struct request * req , struct nvme_rw_command * cmnd ,
struct bio_vec * bv )
{
struct nvme_iod * iod = blk_mq_rq_to_pdu ( req ) ;
iod - > first_dma = dma_map_bvec ( dev - > dev , bv , rq_dma_dir ( req ) , 0 ) ;
if ( dma_mapping_error ( dev - > dev , iod - > first_dma ) )
return BLK_STS_RESOURCE ;
iod - > dma_len = bv - > bv_len ;
2019-04-30 19:53:29 +03:00
cmnd - > flags = NVME_CMD_SGL_METABUF ;
2019-03-05 15:54:18 +03:00
cmnd - > dptr . sgl . addr = cpu_to_le64 ( iod - > first_dma ) ;
cmnd - > dptr . sgl . length = cpu_to_le32 ( iod - > dma_len ) ;
cmnd - > dptr . sgl . type = NVME_SGL_FMT_DATA_DESC < < 4 ;
return 0 ;
}
2017-06-03 10:38:05 +03:00
static blk_status_t nvme_map_data ( struct nvme_dev * dev , struct request * req ,
2017-01-13 14:29:12 +03:00
struct nvme_command * cmnd )
2015-05-22 12:12:46 +03:00
{
2015-11-28 17:43:10 +03:00
struct nvme_iod * iod = blk_mq_rq_to_pdu ( req ) ;
2019-03-05 15:59:02 +03:00
blk_status_t ret = BLK_STS_RESOURCE ;
2018-01-18 00:04:38 +03:00
int nr_mapped ;
2015-05-22 12:12:46 +03:00
2019-03-05 15:49:34 +03:00
if ( blk_rq_nr_phys_segments ( req ) = = 1 ) {
struct bio_vec bv = req_bvec ( req ) ;
if ( ! is_pci_p2pdma_page ( bv . bv_page ) ) {
if ( bv . bv_offset + bv . bv_len < = dev - > ctrl . page_size * 2 )
return nvme_setup_prp_simple ( dev , req ,
& cmnd - > rw , & bv ) ;
2019-03-05 15:54:18 +03:00
if ( iod - > nvmeq - > qid & &
dev - > ctrl . sgls & ( ( 1 < < 0 ) | ( 1 < < 1 ) ) )
return nvme_setup_sgl_simple ( dev , req ,
& cmnd - > rw , & bv ) ;
2019-03-05 15:49:34 +03:00
}
}
iod - > dma_len = 0 ;
2019-03-05 15:46:58 +03:00
iod - > sg = mempool_alloc ( dev - > iod_mempool , GFP_ATOMIC ) ;
if ( ! iod - > sg )
return BLK_STS_RESOURCE ;
2016-12-09 01:20:32 +03:00
sg_init_table ( iod - > sg , blk_rq_nr_phys_segments ( req ) ) ;
2019-03-05 15:59:02 +03:00
iod - > nents = blk_rq_map_sg ( req - > q , req , iod - > sg ) ;
2015-10-16 08:58:38 +03:00
if ( ! iod - > nents )
goto out ;
2015-05-22 12:12:46 +03:00
2018-10-05 00:27:44 +03:00
if ( is_pci_p2pdma_page ( sg_page ( iod - > sg ) ) )
nr_mapped = pci_p2pdma_map_sg ( dev - > dev , iod - > sg , iod - > nents ,
2019-03-05 15:59:02 +03:00
rq_dma_dir ( req ) ) ;
2018-10-05 00:27:44 +03:00
else
nr_mapped = dma_map_sg_attrs ( dev - > dev , iod - > sg , iod - > nents ,
2019-03-05 15:59:02 +03:00
rq_dma_dir ( req ) , DMA_ATTR_NO_WARN ) ;
2018-01-18 00:04:38 +03:00
if ( ! nr_mapped )
2015-10-16 08:58:38 +03:00
goto out ;
2015-05-22 12:12:46 +03:00
2019-03-05 15:59:02 +03:00
iod - > use_sgl = nvme_pci_use_sgls ( dev , req ) ;
2017-12-20 10:30:50 +03:00
if ( iod - > use_sgl )
2018-01-18 00:04:38 +03:00
ret = nvme_pci_setup_sgls ( dev , req , & cmnd - > rw , nr_mapped ) ;
2017-10-17 04:24:20 +03:00
else
ret = nvme_pci_setup_prps ( dev , req , & cmnd - > rw ) ;
2019-03-03 19:46:28 +03:00
out :
2017-07-12 22:59:07 +03:00
if ( ret ! = BLK_STS_OK )
2019-03-03 19:46:28 +03:00
nvme_unmap_data ( dev , req ) ;
return ret ;
}
2018-10-17 21:34:15 +03:00
2019-03-03 19:46:28 +03:00
static blk_status_t nvme_map_metadata ( struct nvme_dev * dev , struct request * req ,
struct nvme_command * cmnd )
{
struct nvme_iod * iod = blk_mq_rq_to_pdu ( req ) ;
2011-02-22 22:18:30 +03:00
2019-03-03 19:46:28 +03:00
iod - > meta_dma = dma_map_bvec ( dev - > dev , rq_integrity_vec ( req ) ,
rq_dma_dir ( req ) , 0 ) ;
if ( dma_mapping_error ( dev - > dev , iod - > meta_dma ) )
return BLK_STS_IOERR ;
cmnd - > rw . metadata = cpu_to_le64 ( iod - > meta_dma ) ;
return 0 ;
2011-02-22 22:18:30 +03:00
}
2015-05-22 12:12:46 +03:00
/*
* NOTE : ns is NULL when called on the admin queue .
*/
2017-06-03 10:38:05 +03:00
static blk_status_t nvme_queue_rq ( struct blk_mq_hw_ctx * hctx ,
2014-11-04 18:20:14 +03:00
const struct blk_mq_queue_data * bd )
2014-04-04 02:45:23 +04:00
{
2014-11-04 18:20:14 +03:00
struct nvme_ns * ns = hctx - > queue - > queuedata ;
struct nvme_queue * nvmeq = hctx - > driver_data ;
2015-05-22 12:12:46 +03:00
struct nvme_dev * dev = nvmeq - > dev ;
2014-11-04 18:20:14 +03:00
struct request * req = bd - > rq ;
2019-03-03 18:04:01 +03:00
struct nvme_iod * iod = blk_mq_rq_to_pdu ( req ) ;
2015-10-16 08:58:38 +03:00
struct nvme_command cmnd ;
2017-06-12 19:36:32 +03:00
blk_status_t ret ;
NVMe: Metadata format support
Adds support for NVMe metadata formats and exposes block devices for
all namespaces regardless of their format. Namespace formats that are
unusable will have disk capacity set to 0, but a handle to the block
device is created to simplify device management. A namespace is not
usable when the format requires host interleave block and metadata in
single buffer, has no provisioned storage, or has better data but failed
to register with blk integrity.
The namespace has to be scanned in two phases to support separate
metadata formats. The first establishes the sector size and capacity
prior to invoking add_disk. If metadata is required, the capacity will
be temporarilly set to 0 until it can be revalidated and registered with
the integrity extenstions after add_disk completes.
The driver relies on the integrity extensions to provide the metadata
buffer. NVMe requires this be a single physically contiguous region,
so only one integrity segment is allowed per command. If the metadata
is used for T10 PI, the driver provides mappings to save and restore
the reftag physical block translation. The driver provides no-op
functions for generate and verify if metadata is not used for protection
information. This way the setup is always provided by the block layer.
If a request does not supply a required metadata buffer, the command
is failed with bad address. This could only happen if a user manually
disables verify/generate on such a disk. The only exception to where
this is okay is if the controller is capable of stripping/generating
the metadata, which is possible on some types of formats.
The metadata scatter gather list now occupies the spot in the nvme_iod
that used to be used to link retryable IOD's, but we don't do that
anymore, so the field was unused.
Signed-off-by: Keith Busch <keith.busch@intel.com>
2015-02-19 23:39:03 +03:00
2019-03-03 18:04:01 +03:00
iod - > aborted = 0 ;
iod - > npages = - 1 ;
iod - > nents = 0 ;
2018-05-17 19:31:49 +03:00
/*
* We should not need to do this , but we ' re still using this to
* ensure we can drain requests on a dying queue .
*/
2018-12-02 19:46:17 +03:00
if ( unlikely ( ! test_bit ( NVMEQ_ENABLED , & nvmeq - > flags ) ) )
2018-05-17 19:31:49 +03:00
return BLK_STS_IOERR ;
2016-12-09 01:20:32 +03:00
ret = nvme_setup_cmd ( ns , req , & cmnd ) ;
2017-06-03 10:38:05 +03:00
if ( ret )
2015-11-28 17:43:10 +03:00
return ret ;
2014-11-04 18:20:14 +03:00
2017-06-03 10:38:05 +03:00
if ( blk_rq_nr_phys_segments ( req ) ) {
2017-01-13 14:29:12 +03:00
ret = nvme_map_data ( dev , req , & cmnd ) ;
2017-06-03 10:38:05 +03:00
if ( ret )
2019-03-03 18:04:01 +03:00
goto out_free_cmd ;
2017-06-03 10:38:05 +03:00
}
2014-11-04 18:20:14 +03:00
2019-03-03 19:46:28 +03:00
if ( blk_integrity_rq ( req ) ) {
ret = nvme_map_metadata ( dev , req , & cmnd ) ;
if ( ret )
goto out_unmap_data ;
}
2015-11-26 14:59:50 +03:00
blk_mq_start_request ( req ) ;
2018-11-29 20:02:29 +03:00
nvme_submit_cmd ( nvmeq , & cmnd , bd - > last ) ;
2017-06-03 10:38:05 +03:00
return BLK_STS_OK ;
2019-03-03 19:46:28 +03:00
out_unmap_data :
nvme_unmap_data ( dev , req ) ;
2016-12-09 01:20:32 +03:00
out_free_cmd :
nvme_cleanup_cmd ( req ) ;
2015-10-16 08:58:38 +03:00
return ret ;
2011-01-20 20:50:14 +03:00
}
NVMe: Metadata format support
Adds support for NVMe metadata formats and exposes block devices for
all namespaces regardless of their format. Namespace formats that are
unusable will have disk capacity set to 0, but a handle to the block
device is created to simplify device management. A namespace is not
usable when the format requires the host to interleave block data and
metadata in a single buffer, has no provisioned storage, or has metadata
but failed to register with blk integrity.
The namespace has to be scanned in two phases to support separate
metadata formats. The first establishes the sector size and capacity
prior to invoking add_disk. If metadata is required, the capacity will
be temporarily set to 0 until it can be revalidated and registered with
the integrity extensions after add_disk completes.
The driver relies on the integrity extensions to provide the metadata
buffer. NVMe requires this be a single physically contiguous region,
so only one integrity segment is allowed per command. If the metadata
is used for T10 PI, the driver provides mappings to save and restore
the reftag physical block translation. The driver provides no-op
functions for generate and verify if metadata is not used for protection
information. This way the setup is always provided by the block layer.
If a request does not supply a required metadata buffer, the command
is failed with bad address. This could only happen if a user manually
disables verify/generate on such a disk. The only exception to where
this is okay is if the controller is capable of stripping/generating
the metadata, which is possible on some types of formats.
The metadata scatter gather list now occupies the spot in the nvme_iod
that used to be used to link retryable IOD's, but we don't do that
anymore, so the field was unused.
Signed-off-by: Keith Busch <keith.busch@intel.com>
2015-02-19 23:39:03 +03:00
2017-03-30 14:41:32 +03:00
static void nvme_pci_complete_rq ( struct request * req )
2015-11-26 15:03:13 +03:00
{
2015-11-28 17:43:10 +03:00
struct nvme_iod * iod = blk_mq_rq_to_pdu ( req ) ;
2019-03-03 19:46:28 +03:00
struct nvme_dev * dev = iod - > nvmeq - > dev ;
2014-11-04 18:20:14 +03:00
2019-03-03 18:13:03 +03:00
nvme_cleanup_cmd ( req ) ;
2019-03-03 19:46:28 +03:00
if ( blk_integrity_rq ( req ) )
dma_unmap_page ( dev - > dev , iod - > meta_dma ,
rq_integrity_vec ( req ) - > bv_len , rq_data_dir ( req ) ) ;
2019-03-03 18:52:21 +03:00
if ( blk_rq_nr_phys_segments ( req ) )
2019-03-03 19:46:28 +03:00
nvme_unmap_data ( dev , req ) ;
2017-03-30 14:41:32 +03:00
nvme_complete_rq ( req ) ;
2011-01-20 20:50:14 +03:00
}
2016-03-22 18:02:06 +03:00
/* We read the CQE phase first to check if the rest of the entry is valid */
2018-05-18 17:37:04 +03:00
static inline bool nvme_cqe_pending ( struct nvme_queue * nvmeq )
2016-03-22 18:02:06 +03:00
{
2018-05-18 17:37:04 +03:00
return ( le16_to_cpu ( nvmeq - > cqes [ nvmeq - > cq_head ] . status ) & 1 ) = =
nvmeq - > cq_phase ;
2016-03-22 18:02:06 +03:00
}
2017-06-18 17:28:07 +03:00
static inline void nvme_ring_cq_doorbell ( struct nvme_queue * nvmeq )
2011-01-20 20:50:14 +03:00
{
2017-06-18 17:28:07 +03:00
u16 head = nvmeq - > cq_head ;
2015-11-28 17:42:28 +03:00
2018-06-06 17:13:05 +03:00
if ( nvme_dbbuf_update_and_check_event ( head , nvmeq - > dbbuf_cq_db ,
nvmeq - > dbbuf_cq_ei ) )
writel ( head , nvmeq - > q_db + nvmeq - > dev - > db_stride ) ;
2017-06-18 17:28:07 +03:00
}
2015-11-26 14:59:50 +03:00
2018-05-17 19:31:50 +03:00
/*
 * Process the completion queue entry at index @idx: validate the command
 * id, special-case async event completions on the admin queue, and
 * otherwise end the matching block-layer request.
 */
static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
{
	volatile struct nvme_completion *cqe = &nvmeq->cqes[idx];
	struct request *req;

	/* A command id outside the queue depth cannot belong to a request. */
	if (unlikely(cqe->command_id >= nvmeq->q_depth)) {
		dev_warn(nvmeq->dev->ctrl.device,
			 " invalid id %d completed on queue %d \n ",
			 cqe->command_id, le16_to_cpu(cqe->sq_id));
		return;
	}

	/*
	 * AEN requests are special as they don't time out and can
	 * survive any kind of queue freeze and often don't respond to
	 * aborts.  We don't even bother to allocate a struct request
	 * for them but rather special case them here.
	 */
	if (unlikely(nvmeq->qid == 0 &&
		     cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH)) {
		nvme_complete_async_event(&nvmeq->dev->ctrl,
					  cqe->status, &cqe->result);
		return;
	}

	/* Regular completion: the command id is the blk-mq tag. */
	req = blk_mq_tag_to_rq(*nvmeq->tags, cqe->command_id);
	trace_nvme_sq(req, cqe->sq_head, nvmeq->sq_tail);
	nvme_end_request(req, cqe->status, cqe->result);
}
2011-01-20 20:50:14 +03:00
2018-05-17 19:31:50 +03:00
/*
 * Handle every completion entry in the half-open ring range [@start, @end),
 * wrapping to index 0 when the queue depth is reached.
 */
static void nvme_complete_cqes(struct nvme_queue *nvmeq, u16 start, u16 end)
{
	u16 idx;

	for (idx = start; idx != end; ) {
		nvme_handle_cqe(nvmeq, idx);
		idx = (idx + 1 == nvmeq->q_depth) ? 0 : idx + 1;
	}
}
2015-11-28 17:42:28 +03:00
2018-05-17 19:31:50 +03:00
static inline void nvme_update_cq_head ( struct nvme_queue * nvmeq )
{
2019-01-07 05:22:07 +03:00
if ( nvmeq - > cq_head = = nvmeq - > q_depth - 1 ) {
2018-05-17 19:31:50 +03:00
nvmeq - > cq_head = 0 ;
nvmeq - > cq_phase = ! nvmeq - > cq_phase ;
2019-01-07 05:22:07 +03:00
} else {
nvmeq - > cq_head + + ;
2011-01-20 20:50:14 +03:00
}
2015-11-04 06:37:26 +03:00
}
2018-11-26 18:21:49 +03:00
/*
 * Consume all pending completion entries, advancing the CQ head past them.
 *
 * @start and @end receive the head position before and after the scan, so
 * the caller can complete the entries in [*start, *end) later.  The doorbell
 * is rung only if at least one entry was consumed.
 *
 * Returns the number of consumed entries whose command id equals @tag, or
 * the number of all consumed entries when @tag is -1U.
 */
static inline int nvme_process_cq(struct nvme_queue *nvmeq, u16 *start,
				  u16 *end, unsigned int tag)
{
	int found = 0;

	*start = nvmeq->cq_head;
	for (; nvme_cqe_pending(nvmeq); nvme_update_cq_head(nvmeq)) {
		if (tag == -1U ||
		    nvmeq->cqes[nvmeq->cq_head].command_id == tag)
			found++;
	}
	*end = nvmeq->cq_head;

	if (*start != *end)
		nvme_ring_cq_doorbell(nvmeq);

	return found;
}
static irqreturn_t nvme_irq ( int irq , void * data )
2011-02-06 15:28:06 +03:00
{
struct nvme_queue * nvmeq = data ;
2018-05-21 17:41:52 +03:00
irqreturn_t ret = IRQ_NONE ;
2018-05-17 19:31:50 +03:00
u16 start , end ;
2018-12-02 19:46:23 +03:00
/*
* The rmb / wmb pair ensures we see all updates from a previous run of
* the irq handler , even if that was on another CPU .
*/
rmb ( ) ;
2018-05-21 17:41:52 +03:00
if ( nvmeq - > cq_head ! = nvmeq - > last_cq_head )
ret = IRQ_HANDLED ;
2018-05-17 19:31:50 +03:00
nvme_process_cq ( nvmeq , & start , & end , - 1 ) ;
2018-05-21 17:41:52 +03:00
nvmeq - > last_cq_head = nvmeq - > cq_head ;
2018-12-02 19:46:23 +03:00
wmb ( ) ;
2018-05-17 19:31:50 +03:00
2018-05-21 17:41:52 +03:00
if ( start ! = end ) {
nvme_complete_cqes ( nvmeq , start , end ) ;
return IRQ_HANDLED ;
}
return ret ;
2011-02-06 15:28:06 +03:00
}
static irqreturn_t nvme_irq_check ( int irq , void * data )
{
struct nvme_queue * nvmeq = data ;
2018-05-18 17:37:04 +03:00
if ( nvme_cqe_pending ( nvmeq ) )
2016-03-22 18:02:06 +03:00
return IRQ_WAKE_THREAD ;
return IRQ_NONE ;
2011-02-06 15:28:06 +03:00
}
2018-12-02 19:46:20 +03:00
/*
* Poll for completions any queue , including those not dedicated to polling .
* Can be called from any context .
*/
static int nvme_poll_irqdisable ( struct nvme_queue * nvmeq , unsigned int tag )
2015-11-04 06:37:26 +03:00
{
2018-12-02 19:46:23 +03:00
struct pci_dev * pdev = to_pci_dev ( nvmeq - > dev - > dev ) ;
2018-05-17 19:31:50 +03:00
u16 start , end ;
2018-11-26 18:21:49 +03:00
int found ;
2015-11-04 06:37:26 +03:00
2018-12-02 19:46:23 +03:00
/*
* For a poll queue we need to protect against the polling thread
* using the CQ lock . For normal interrupt driven threads we have
* to disable the interrupt to avoid racing with it .
*/
2019-03-08 20:43:06 +03:00
if ( test_bit ( NVMEQ_POLLED , & nvmeq - > flags ) ) {
2018-12-02 19:46:23 +03:00
spin_lock ( & nvmeq - > cq_poll_lock ) ;
2018-12-13 11:48:00 +03:00
found = nvme_process_cq ( nvmeq , & start , & end , tag ) ;
2018-12-02 19:46:23 +03:00
spin_unlock ( & nvmeq - > cq_poll_lock ) ;
2018-12-13 11:48:00 +03:00
} else {
disable_irq ( pci_irq_vector ( pdev , nvmeq - > cq_vector ) ) ;
found = nvme_process_cq ( nvmeq , & start , & end , tag ) ;
2018-12-02 19:46:23 +03:00
enable_irq ( pci_irq_vector ( pdev , nvmeq - > cq_vector ) ) ;
2018-12-13 11:48:00 +03:00
}
2017-06-18 17:28:10 +03:00
2018-05-17 19:31:50 +03:00
nvme_complete_cqes ( nvmeq , start , end ) ;
2017-06-18 17:28:10 +03:00
return found ;
2015-11-04 06:37:26 +03:00
}
2018-11-16 19:48:21 +03:00
static int nvme_poll ( struct blk_mq_hw_ctx * hctx )
2018-11-14 19:38:28 +03:00
{
struct nvme_queue * nvmeq = hctx - > driver_data ;
u16 start , end ;
bool found ;
if ( ! nvme_cqe_pending ( nvmeq ) )
return 0 ;
2018-12-02 19:46:23 +03:00
spin_lock ( & nvmeq - > cq_poll_lock ) ;
2018-11-16 19:48:21 +03:00
found = nvme_process_cq ( nvmeq , & start , & end , - 1 ) ;
2018-12-02 19:46:23 +03:00
spin_unlock ( & nvmeq - > cq_poll_lock ) ;
2018-11-14 19:38:28 +03:00
nvme_complete_cqes ( nvmeq , start , end ) ;
return found ;
}
2017-11-08 01:13:12 +03:00
static void nvme_pci_submit_async_event ( struct nvme_ctrl * ctrl )
2011-01-20 20:50:14 +03:00
{
2016-04-26 14:52:00 +03:00
struct nvme_dev * dev = to_nvme_dev ( ctrl ) ;
2018-01-14 13:39:01 +03:00
struct nvme_queue * nvmeq = & dev - > queues [ 0 ] ;
2014-11-04 18:20:14 +03:00
struct nvme_command c ;
2011-01-20 20:50:14 +03:00
2014-11-04 18:20:14 +03:00
memset ( & c , 0 , sizeof ( c ) ) ;
c . common . opcode = nvme_admin_async_event ;
2017-11-08 01:13:12 +03:00
c . common . command_id = NVME_AQ_BLK_MQ_DEPTH ;
2018-11-29 20:02:29 +03:00
nvme_submit_cmd ( nvmeq , & c , true ) ;
2015-05-22 12:12:38 +03:00
}
2011-01-20 20:50:14 +03:00
/*
 * Issue a synchronous queue-deletion admin command with the given @opcode
 * for queue @id.  Returns the result of nvme_submit_sync_cmd().
 */
static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
{
	struct nvme_command cmd;

	memset(&cmd, 0, sizeof(cmd));
	cmd.delete_queue.opcode = opcode;
	cmd.delete_queue.qid = cpu_to_le16(id);

	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &cmd, NULL, 0);
}
/*
 * Issue a synchronous Create I/O Completion Queue admin command for @qid,
 * backed by @nvmeq's CQ memory and bound to interrupt @vector.  Interrupts
 * are enabled on the queue unless it is flagged as polled.
 *
 * Returns the result of nvme_submit_sync_cmd().
 */
static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
		struct nvme_queue *nvmeq, s16 vector)
{
	struct nvme_command cmd;
	int cq_flags = NVME_QUEUE_PHYS_CONTIG;

	if (!test_bit(NVMEQ_POLLED, &nvmeq->flags))
		cq_flags |= NVME_CQ_IRQ_ENABLED;

	/*
	 * Note: we (ab)use the fact that the prp fields survive if no data
	 * is attached to the request.
	 */
	memset(&cmd, 0, sizeof(cmd));
	cmd.create_cq.opcode = nvme_admin_create_cq;
	cmd.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr);
	cmd.create_cq.cqid = cpu_to_le16(qid);
	/* The qsize field is written as depth minus one. */
	cmd.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
	cmd.create_cq.cq_flags = cpu_to_le16(cq_flags);
	cmd.create_cq.irq_vector = cpu_to_le16(vector);

	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &cmd, NULL, 0);
}
/*
 * Issue a synchronous Create I/O Submission Queue admin command for @qid,
 * backed by @nvmeq's SQ memory and paired with the completion queue that
 * has the same id.
 *
 * Returns the result of nvme_submit_sync_cmd().
 */
static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
		struct nvme_queue *nvmeq)
{
	struct nvme_ctrl *ctrl = &dev->ctrl;
	struct nvme_command cmd;
	int sq_flags = NVME_QUEUE_PHYS_CONTIG;

	/*
	 * Some drives have a bug that auto-enables WRRU if MEDIUM isn't
	 * set.  Since URGENT priority is zeroes, it makes all queues
	 * URGENT.
	 */
	if (ctrl->quirks & NVME_QUIRK_MEDIUM_PRIO_SQ)
		sq_flags |= NVME_SQ_PRIO_MEDIUM;

	/*
	 * Note: we (ab)use the fact that the prp fields survive if no data
	 * is attached to the request.
	 */
	memset(&cmd, 0, sizeof(cmd));
	cmd.create_sq.opcode = nvme_admin_create_sq;
	cmd.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr);
	cmd.create_sq.sqid = cpu_to_le16(qid);
	cmd.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
	cmd.create_sq.sq_flags = cpu_to_le16(sq_flags);
	cmd.create_sq.cqid = cpu_to_le16(qid);

	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &cmd, NULL, 0);
}
/* Delete completion queue @cqid via the delete-CQ admin opcode. */
static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid)
{
	const u8 opcode = nvme_admin_delete_cq;

	return adapter_delete_queue(dev, opcode, cqid);
}
/* Delete submission queue @sqid via the delete-SQ admin opcode. */
static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid)
{
	const u8 opcode = nvme_admin_delete_sq;

	return adapter_delete_queue(dev, opcode, sqid);
}
2017-06-03 10:38:04 +03:00
static void abort_endio ( struct request * req , blk_status_t error )
2011-09-20 01:08:14 +04:00
{
2015-11-28 17:43:10 +03:00
struct nvme_iod * iod = blk_mq_rq_to_pdu ( req ) ;
struct nvme_queue * nvmeq = iod - > nvmeq ;
2015-06-27 21:20:34 +03:00
2017-04-20 17:02:57 +03:00
dev_warn ( nvmeq - > dev - > ctrl . device ,
" Abort status: 0x%x " , nvme_req ( req ) - > status ) ;
2015-11-16 12:39:48 +03:00
atomic_inc ( & nvmeq - > dev - > ctrl . abort_limit ) ;
blk_mq_free_request ( req ) ;
2011-09-20 01:08:14 +04:00
}
2017-06-07 21:32:50 +03:00
/*
 * Decide from the controller status register value @csts whether a
 * controller reset is warranted.
 */
static bool nvme_should_reset(struct nvme_dev *dev, u32 csts)
{
	/*
	 * If true, indicates loss of adapter communication, possibly by a
	 * NVMe Subsystem reset.
	 */
	bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO);

	/* If there is a reset/reinit ongoing, we shouldn't reset again. */
	switch (dev->ctrl.state) {
	case NVME_CTRL_RESETTING:
	case NVME_CTRL_CONNECTING:
		return false;
	default:
		break;
	}

	/*
	 * We shouldn't reset unless the controller is on fatal error state
	 * _or_ if we lost the communication with it.
	 */
	return (csts & NVME_CSTS_CFS) || nssro;
}
/*
 * Log that the controller is down and about to be reset, including @csts
 * and, when it can be read, the PCI status register.
 */
static void nvme_warn_reset(struct nvme_dev *dev, u32 csts)
{
	/* Read a config register to help see what died. */
	u16 pci_status;
	int result = pci_read_config_word(to_pci_dev(dev->dev), PCI_STATUS,
					  &pci_status);

	if (result != PCIBIOS_SUCCESSFUL) {
		dev_warn(dev->ctrl.device,
			 " controller is down; will reset: CSTS=0x%x, PCI_STATUS read failed (%d) \n ",
			 csts, result);
		return;
	}

	dev_warn(dev->ctrl.device,
		 " controller is down; will reset: CSTS=0x%x, PCI_STATUS=0x%hx \n ",
		 csts, pci_status);
}
2015-10-22 15:03:35 +03:00
/*
 * blk-mq timeout handler.  In order, it:
 *   - backs off while the PCI channel is offline,
 *   - resets the controller if nvme_should_reset() says so,
 *   - checks whether the completion is already in the CQ,
 *   - disables the controller if it is connecting or being deleted,
 *   - resets the controller for admin-queue or already-aborted commands,
 *   - otherwise sends an Abort admin command for the request.
 *
 * Returns BLK_EH_DONE when the request has been (or will be) completed by
 * the driver, BLK_EH_RESET_TIMER to re-arm the timeout.
 */
static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct nvme_queue *nvmeq = iod->nvmeq;
	struct nvme_dev *dev = nvmeq->dev;
	struct request *abort_rq;
	struct nvme_command abort_cmd;
	u32 csts = readl(dev->bar + NVME_REG_CSTS);

	/*
	 * If PCI error recovery process is happening, we cannot reset or
	 * the recovery mechanism will surely fail.
	 */
	mb();
	if (pci_channel_offline(to_pci_dev(dev->dev)))
		return BLK_EH_RESET_TIMER;

	/*
	 * Reset immediately if the controller is failed
	 */
	if (nvme_should_reset(dev, csts)) {
		nvme_warn_reset(dev, csts);
		nvme_dev_disable(dev, false);
		nvme_reset_ctrl(&dev->ctrl);
		return BLK_EH_DONE;
	}

	/*
	 * Did we miss an interrupt?
	 */
	if (nvme_poll_irqdisable(nvmeq, req->tag)) {
		dev_warn(dev->ctrl.device,
			 " I/O %d QID %d timeout, completion polled \n ",
			 req->tag, nvmeq->qid);
		return BLK_EH_DONE;
	}

	/*
	 * Shutdown immediately if controller times out while starting.  The
	 * reset work will see the pci device disabled when it gets the forced
	 * cancellation error.  All outstanding requests are completed on
	 * shutdown, so we return BLK_EH_DONE.
	 */
	switch (dev->ctrl.state) {
	case NVME_CTRL_CONNECTING:
		nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
		/* fall through */
	case NVME_CTRL_DELETING:
		dev_warn_ratelimited(dev->ctrl.device,
			 " I/O %d QID %d timeout, disable controller \n ",
			 req->tag, nvmeq->qid);
		nvme_dev_disable(dev, true);
		nvme_req(req)->flags |= NVME_REQ_CANCELLED;
		return BLK_EH_DONE;
	case NVME_CTRL_RESETTING:
		return BLK_EH_RESET_TIMER;
	default:
		break;
	}

	/*
	 * Shutdown the controller immediately and schedule a reset if the
	 * command was already aborted once before and still hasn't been
	 * returned to the driver, or if this is the admin queue.
	 */
	if (!nvmeq->qid || iod->aborted) {
		dev_warn(dev->ctrl.device,
			 " I/O %d QID %d timeout, reset controller \n ",
			 req->tag, nvmeq->qid);
		nvme_dev_disable(dev, false);
		nvme_reset_ctrl(&dev->ctrl);

		nvme_req(req)->flags |= NVME_REQ_CANCELLED;
		return BLK_EH_DONE;
	}

	/* Take an abort credit; if none is left, put it back and wait. */
	if (atomic_dec_return(&dev->ctrl.abort_limit) < 0) {
		atomic_inc(&dev->ctrl.abort_limit);
		return BLK_EH_RESET_TIMER;
	}
	iod->aborted = 1;

	memset(&abort_cmd, 0, sizeof(abort_cmd));
	abort_cmd.abort.opcode = nvme_admin_abort_cmd;
	abort_cmd.abort.cid = req->tag;
	abort_cmd.abort.sqid = cpu_to_le16(nvmeq->qid);

	dev_warn(nvmeq->dev->ctrl.device,
		 " I/O %d QID %d timeout, aborting \n ",
		 req->tag, nvmeq->qid);

	abort_rq = nvme_alloc_request(dev->ctrl.admin_q, &abort_cmd,
				      BLK_MQ_REQ_NOWAIT, NVME_QID_ANY);
	if (IS_ERR(abort_rq)) {
		/* Could not issue the abort: return the credit. */
		atomic_inc(&dev->ctrl.abort_limit);
		return BLK_EH_RESET_TIMER;
	}

	abort_rq->timeout = ADMIN_TIMEOUT;
	abort_rq->end_io_data = NULL;
	blk_execute_rq_nowait(abort_rq->q, NULL, abort_rq, 0, abort_endio);

	/*
	 * The aborted req will be completed on receiving the abort req.
	 * We enable the timer again.  If hit twice, it'll cause a device
	 * reset, as the device then is in a faulty state.
	 */
	return BLK_EH_RESET_TIMER;
}
2014-11-04 18:20:14 +03:00
static void nvme_free_queue ( struct nvme_queue * nvmeq )
{
2019-03-08 20:43:11 +03:00
dma_free_coherent ( nvmeq - > dev - > dev , CQ_SIZE ( nvmeq - > q_depth ) ,
2012-08-03 21:55:56 +04:00
( void * ) nvmeq - > cqes , nvmeq - > cq_dma_addr ) ;
2018-12-02 19:46:18 +03:00
if ( ! nvmeq - > sq_cmds )
return ;
2018-10-05 00:27:43 +03:00
2018-12-02 19:46:18 +03:00
if ( test_and_clear_bit ( NVMEQ_SQ_CMB , & nvmeq - > flags ) ) {
2019-03-08 20:43:11 +03:00
pci_free_p2pmem ( to_pci_dev ( nvmeq - > dev - > dev ) ,
2018-12-02 19:46:18 +03:00
nvmeq - > sq_cmds , SQ_SIZE ( nvmeq - > q_depth ) ) ;
} else {
2019-03-08 20:43:11 +03:00
dma_free_coherent ( nvmeq - > dev - > dev , SQ_SIZE ( nvmeq - > q_depth ) ,
2018-12-02 19:46:18 +03:00
nvmeq - > sq_cmds , nvmeq - > sq_dma_addr ) ;
2018-10-05 00:27:43 +03:00
}
2012-08-03 21:55:56 +04:00
}
2013-12-16 22:50:00 +04:00
static void nvme_free_queues ( struct nvme_dev * dev , int lowest )
2013-07-16 01:02:20 +04:00
{
int i ;
2017-04-24 10:58:29 +03:00
for ( i = dev - > ctrl . queue_count - 1 ; i > = lowest ; i - - ) {
dev - > ctrl . queue_count - - ;
2018-01-14 13:39:01 +03:00
nvme_free_queue ( & dev - > queues [ i ] ) ;
2015-01-15 07:01:58 +03:00
}
2013-07-16 01:02:20 +04:00
}
2013-12-11 00:10:40 +04:00
/**
* nvme_suspend_queue - put queue into suspended state
2018-10-09 00:28:43 +03:00
* @ nvmeq : queue to suspend
2013-12-11 00:10:40 +04:00
*/
static int nvme_suspend_queue ( struct nvme_queue * nvmeq )
2011-01-20 20:50:14 +03:00
{
2018-12-02 19:46:17 +03:00
if ( ! test_and_clear_bit ( NVMEQ_ENABLED , & nvmeq - > flags ) )
2014-12-22 22:59:04 +03:00
return 1 ;
2012-08-07 23:56:23 +04:00
2018-12-02 19:46:17 +03:00
/* ensure that nvme_queue_rq() sees NVMEQ_ENABLED cleared */
2018-05-17 19:31:49 +03:00
mb ( ) ;
2012-08-07 23:56:23 +04:00
2018-12-02 19:46:17 +03:00
nvmeq - > dev - > online_queues - - ;
2015-11-26 12:06:56 +03:00
if ( ! nvmeq - > qid & & nvmeq - > dev - > ctrl . admin_q )
2017-07-02 15:53:27 +03:00
blk_mq_quiesce_queue ( nvmeq - > dev - > ctrl . admin_q ) ;
2019-03-08 20:43:06 +03:00
if ( ! test_and_clear_bit ( NVMEQ_POLLED , & nvmeq - > flags ) )
pci_free_irq ( to_pci_dev ( nvmeq - > dev - > dev ) , nvmeq - > cq_vector , nvmeq ) ;
2013-12-11 00:10:40 +04:00
return 0 ;
}
2011-01-20 20:50:14 +03:00
2019-01-05 01:04:33 +03:00
static void nvme_suspend_io_queues ( struct nvme_dev * dev )
{
int i ;
for ( i = dev - > ctrl . queue_count - 1 ; i > 0 ; i - - )
nvme_suspend_queue ( & dev - > queues [ i ] ) ;
}
2016-01-13 00:41:18 +03:00
/*
 * Shut down (@shutdown true) or disable (@shutdown false) the controller,
 * then reap whatever completions are left on queue 0.
 */
static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown)
{
	struct nvme_queue *adminq = &dev->queues[0];

	if (!shutdown)
		nvme_disable_ctrl(&dev->ctrl, dev->ctrl.cap);
	else
		nvme_shutdown_ctrl(&dev->ctrl);

	/* A tag of -1 matches every completion. */
	nvme_poll_irqdisable(adminq, -1);
}
2015-07-20 19:14:09 +03:00
static int nvme_cmb_qdepth ( struct nvme_dev * dev , int nr_io_queues ,
int entry_size )
{
int q_depth = dev - > q_depth ;
2015-11-28 17:03:49 +03:00
unsigned q_size_aligned = roundup ( q_depth * entry_size ,
dev - > ctrl . page_size ) ;
2015-07-20 19:14:09 +03:00
if ( q_size_aligned * nr_io_queues > dev - > cmb_size ) {
2015-07-22 00:08:13 +03:00
u64 mem_per_q = div_u64 ( dev - > cmb_size , nr_io_queues ) ;
2015-11-28 17:03:49 +03:00
mem_per_q = round_down ( mem_per_q , dev - > ctrl . page_size ) ;
2015-07-22 00:08:13 +03:00
q_depth = div_u64 ( mem_per_q , entry_size ) ;
2015-07-20 19:14:09 +03:00
/*
* Ensure the reduced q_depth is above some threshold where it
* would be better to map queues in system memory with the
* original depth
*/
if ( q_depth < 64 )
return - ENOMEM ;
}
return q_depth ;
}
static int nvme_alloc_sq_cmds ( struct nvme_dev * dev , struct nvme_queue * nvmeq ,
int qid , int depth )
{
2018-10-05 00:27:43 +03:00
struct pci_dev * pdev = to_pci_dev ( dev - > dev ) ;
if ( qid & & dev - > cmb_use_sqes & & ( dev - > cmbsz & NVME_CMBSZ_SQS ) ) {
nvmeq - > sq_cmds = pci_alloc_p2pmem ( pdev , SQ_SIZE ( depth ) ) ;
2019-07-08 20:05:11 +03:00
if ( nvmeq - > sq_cmds ) {
nvmeq - > sq_dma_addr = pci_p2pmem_virt_to_bus ( pdev ,
nvmeq - > sq_cmds ) ;
if ( nvmeq - > sq_dma_addr ) {
set_bit ( NVMEQ_SQ_CMB , & nvmeq - > flags ) ;
return 0 ;
}
pci_free_p2pmem ( pdev , nvmeq - > sq_cmds , SQ_SIZE ( depth ) ) ;
2018-12-02 19:46:18 +03:00
}
2018-10-05 00:27:43 +03:00
}
2015-07-20 19:14:09 +03:00
2018-12-02 19:46:18 +03:00
nvmeq - > sq_cmds = dma_alloc_coherent ( dev - > dev , SQ_SIZE ( depth ) ,
& nvmeq - > sq_dma_addr , GFP_KERNEL ) ;
2018-02-13 15:44:44 +03:00
if ( ! nvmeq - > sq_cmds )
return - ENOMEM ;
2015-07-20 19:14:09 +03:00
return 0 ;
}
2018-04-12 18:16:09 +03:00
static int nvme_alloc_queue ( struct nvme_dev * dev , int qid , int depth )
2011-01-20 20:50:14 +03:00
{
2018-01-14 13:39:01 +03:00
struct nvme_queue * nvmeq = & dev - > queues [ qid ] ;
2011-01-20 20:50:14 +03:00
2018-01-23 19:16:19 +03:00
if ( dev - > ctrl . queue_count > qid )
return 0 ;
2011-01-20 20:50:14 +03:00
cross-tree: phase out dma_zalloc_coherent()
We already need to zero out memory for dma_alloc_coherent(), as such
using dma_zalloc_coherent() is superflous. Phase it out.
This change was generated with the following Coccinelle SmPL patch:
@ replace_dma_zalloc_coherent @
expression dev, size, data, handle, flags;
@@
-dma_zalloc_coherent(dev, size, handle, flags)
+dma_alloc_coherent(dev, size, handle, flags)
Suggested-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
[hch: re-ran the script on the latest tree]
Signed-off-by: Christoph Hellwig <hch@lst.de>
2019-01-04 11:23:09 +03:00
nvmeq - > cqes = dma_alloc_coherent ( dev - > dev , CQ_SIZE ( depth ) ,
& nvmeq - > cq_dma_addr , GFP_KERNEL ) ;
2011-01-20 20:50:14 +03:00
if ( ! nvmeq - > cqes )
goto free_nvmeq ;
2015-07-20 19:14:09 +03:00
if ( nvme_alloc_sq_cmds ( dev , nvmeq , qid , depth ) )
2011-01-20 20:50:14 +03:00
goto free_cqdma ;
2011-02-10 17:56:01 +03:00
nvmeq - > dev = dev ;
2018-05-17 19:31:51 +03:00
spin_lock_init ( & nvmeq - > sq_lock ) ;
2018-12-02 19:46:23 +03:00
spin_lock_init ( & nvmeq - > cq_poll_lock ) ;
2011-01-20 20:50:14 +03:00
nvmeq - > cq_head = 0 ;
2011-01-20 21:24:06 +03:00
nvmeq - > cq_phase = 1 ;
2013-09-10 07:25:37 +04:00
nvmeq - > q_db = & dev - > dbs [ qid * 2 * dev - > db_stride ] ;
2011-01-20 20:50:14 +03:00
nvmeq - > q_depth = depth ;
2013-12-11 00:10:38 +04:00
nvmeq - > qid = qid ;
2017-04-24 10:58:29 +03:00
dev - > ctrl . queue_count + + ;
2015-05-27 21:26:23 +03:00
2018-01-14 13:39:01 +03:00
return 0 ;
2011-01-20 20:50:14 +03:00
free_cqdma :
2015-05-22 12:12:39 +03:00
dma_free_coherent ( dev - > dev , CQ_SIZE ( depth ) , ( void * ) nvmeq - > cqes ,
2011-01-20 20:50:14 +03:00
nvmeq - > cq_dma_addr ) ;
free_nvmeq :
2018-01-14 13:39:01 +03:00
return - ENOMEM ;
2011-01-20 20:50:14 +03:00
}
2016-09-14 17:18:57 +03:00
static int queue_request_irq ( struct nvme_queue * nvmeq )
2011-01-20 17:10:15 +03:00
{
2017-04-13 10:06:43 +03:00
struct pci_dev * pdev = to_pci_dev ( nvmeq - > dev - > dev ) ;
int nr = nvmeq - > dev - > ctrl . instance ;
if ( use_threaded_interrupts ) {
return pci_request_irq ( pdev , nvmeq - > cq_vector , nvme_irq_check ,
nvme_irq , nvmeq , " nvme%dq%d " , nr , nvmeq - > qid ) ;
} else {
return pci_request_irq ( pdev , nvmeq - > cq_vector , nvme_irq ,
NULL , nvmeq , " nvme%dq%d " , nr , nvmeq - > qid ) ;
}
2011-01-20 17:10:15 +03:00
}
2013-07-16 01:02:20 +04:00
/*
 * Reset the software state of @nvmeq (ring positions, phase, doorbell
 * pointer, zeroed completion ring), initialise its dbbuf state and count
 * the queue as online.
 */
static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
{
	struct nvme_dev *dev = nvmeq->dev;

	/* Submission side starts empty. */
	nvmeq->last_sq_tail = 0;
	nvmeq->sq_tail = 0;

	/* Completion side: head at slot 0, phase 1, ring cleared. */
	nvmeq->cq_phase = 1;
	nvmeq->cq_head = 0;
	memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth));

	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];

	nvme_dbbuf_init(dev, nvmeq, qid);
	dev->online_queues++;
	wmb(); /* ensure the first interrupt sees the initialization */
}
2018-11-05 22:44:33 +03:00
/*
 * Create I/O queue @qid on the controller and bring it online.
 *
 * The completion queue is created first, then the submission queue.  For a
 * non-polled queue an interrupt is requested as well; a polled queue is
 * flagged NVMEQ_POLLED and keeps vector 0.  On success the queue is marked
 * NVMEQ_ENABLED.
 *
 * Returns 0 on success.  A non-zero result from adapter_alloc_cq(), or a
 * negative result from adapter_alloc_sq(), is returned as-is without any
 * unwinding.  A positive result from adapter_alloc_sq() deletes the
 * completion queue again; a failed interrupt request deletes both queues.
 */
static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
{
	struct nvme_dev *dev = nvmeq->dev;
	int result;
	u16 vector = 0;

	clear_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags);

	/*
	 * A queue's vector matches the queue identifier unless the controller
	 * has only one vector available.
	 */
	if (!polled)
		vector = dev->num_vecs == 1 ? 0 : qid;
	else
		set_bit(NVMEQ_POLLED, &nvmeq->flags);

	result = adapter_alloc_cq(dev, qid, nvmeq, vector);
	if (result)
		return result;

	result = adapter_alloc_sq(dev, qid, nvmeq);
	if (result < 0)
		return result;
	if (result)
		goto release_cq;

	/*
	 * Record the vector once, before the queue is initialised and before
	 * queue_request_irq() reads it; nvme_init_queue() does not modify it.
	 */
	nvmeq->cq_vector = vector;
	nvme_init_queue(nvmeq, qid);

	if (!polled) {
		result = queue_request_irq(nvmeq);
		if (result < 0)
			goto release_sq;
	}

	set_bit(NVMEQ_ENABLED, &nvmeq->flags);
	return result;

release_sq:
	/* nvme_init_queue() already counted this queue as online. */
	dev->online_queues--;
	adapter_delete_sq(dev, qid);
release_cq:
	adapter_delete_cq(dev, qid);
	return result;
}
2017-03-30 23:39:16 +03:00
static const struct blk_mq_ops nvme_mq_admin_ops = {
2015-05-22 12:12:46 +03:00
. queue_rq = nvme_queue_rq ,
2017-03-30 14:41:32 +03:00
. complete = nvme_pci_complete_rq ,
2014-11-04 18:20:14 +03:00
. init_hctx = nvme_admin_init_hctx ,
2015-06-08 19:08:13 +03:00
. exit_hctx = nvme_admin_exit_hctx ,
2017-06-13 10:15:18 +03:00
. init_request = nvme_init_request ,
2014-11-04 18:20:14 +03:00
. timeout = nvme_timeout ,
} ;
2017-03-30 23:39:16 +03:00
static const struct blk_mq_ops nvme_mq_ops = {
2018-12-02 19:46:27 +03:00
. queue_rq = nvme_queue_rq ,
. complete = nvme_pci_complete_rq ,
. commit_rqs = nvme_commit_rqs ,
. init_hctx = nvme_init_hctx ,
. init_request = nvme_init_request ,
. map_queues = nvme_pci_map_queues ,
. timeout = nvme_timeout ,
. poll = nvme_poll ,
2018-11-14 19:38:28 +03:00
} ;
2015-01-08 04:55:49 +03:00
static void nvme_dev_remove_admin ( struct nvme_dev * dev )
{
2015-11-26 12:06:56 +03:00
if ( dev - > ctrl . admin_q & & ! blk_queue_dying ( dev - > ctrl . admin_q ) ) {
2016-02-24 19:15:56 +03:00
/*
* If the controller was reset during removal , it ' s possible
* user requests may be waiting on a stopped queue . Start the
* queue to flush these to completion .
*/
2017-07-02 15:53:27 +03:00
blk_mq_unquiesce_queue ( dev - > ctrl . admin_q ) ;
2015-11-26 12:06:56 +03:00
blk_cleanup_queue ( dev - > ctrl . admin_q ) ;
2015-01-08 04:55:49 +03:00
blk_mq_free_tag_set ( & dev - > admin_tagset ) ;
}
}
2014-11-04 18:20:14 +03:00
static int nvme_alloc_admin_tags ( struct nvme_dev * dev )
{
2015-11-26 12:06:56 +03:00
if ( ! dev - > ctrl . admin_q ) {
2014-11-04 18:20:14 +03:00
dev - > admin_tagset . ops = & nvme_mq_admin_ops ;
dev - > admin_tagset . nr_hw_queues = 1 ;
2016-01-04 19:10:55 +03:00
2017-11-08 01:13:10 +03:00
dev - > admin_tagset . queue_depth = NVME_AQ_MQ_TAG_DEPTH ;
2014-11-04 18:20:14 +03:00
dev - > admin_tagset . timeout = ADMIN_TIMEOUT ;
2015-05-22 12:12:39 +03:00
dev - > admin_tagset . numa_node = dev_to_node ( dev - > dev ) ;
2019-03-05 15:46:58 +03:00
dev - > admin_tagset . cmd_size = sizeof ( struct nvme_iod ) ;
2017-01-14 00:43:58 +03:00
dev - > admin_tagset . flags = BLK_MQ_F_NO_SCHED ;
2014-11-04 18:20:14 +03:00
dev - > admin_tagset . driver_data = dev ;
if ( blk_mq_alloc_tag_set ( & dev - > admin_tagset ) )
return - ENOMEM ;
2017-07-10 09:22:29 +03:00
dev - > ctrl . admin_tagset = & dev - > admin_tagset ;
2014-11-04 18:20:14 +03:00
2015-11-26 12:06:56 +03:00
dev - > ctrl . admin_q = blk_mq_init_queue ( & dev - > admin_tagset ) ;
if ( IS_ERR ( dev - > ctrl . admin_q ) ) {
2014-11-04 18:20:14 +03:00
blk_mq_free_tag_set ( & dev - > admin_tagset ) ;
return - ENOMEM ;
}
2015-11-26 12:06:56 +03:00
if ( ! blk_get_queue ( dev - > ctrl . admin_q ) ) {
2015-01-08 04:55:49 +03:00
nvme_dev_remove_admin ( dev ) ;
2015-11-26 12:06:56 +03:00
dev - > ctrl . admin_q = NULL ;
2015-01-08 04:55:49 +03:00
return - ENODEV ;
}
2015-01-08 04:55:50 +03:00
} else
2017-07-02 15:53:27 +03:00
blk_mq_unquiesce_queue ( dev - > ctrl . admin_q ) ;
2014-11-04 18:20:14 +03:00
return 0 ;
}
2017-05-24 11:39:55 +03:00
static unsigned long db_bar_size ( struct nvme_dev * dev , unsigned nr_io_queues )
{
return NVME_REG_DBS + ( ( nr_io_queues + 1 ) * 8 * dev - > db_stride ) ;
}
static int nvme_remap_bar ( struct nvme_dev * dev , unsigned long size )
{
struct pci_dev * pdev = to_pci_dev ( dev - > dev ) ;
if ( size < = dev - > bar_mapped_size )
return 0 ;
if ( size > pci_resource_len ( pdev , 0 ) )
return - ENOMEM ;
if ( dev - > bar )
iounmap ( dev - > bar ) ;
dev - > bar = ioremap ( pci_resource_start ( pdev , 0 ) , size ) ;
if ( ! dev - > bar ) {
dev - > bar_mapped_size = 0 ;
return - ENOMEM ;
}
dev - > bar_mapped_size = size ;
dev - > dbs = dev - > bar + NVME_REG_DBS ;
return 0 ;
}
2017-05-01 00:27:17 +03:00
static int nvme_pci_configure_admin_queue ( struct nvme_dev * dev )
2011-01-20 20:50:14 +03:00
{
2013-05-04 14:43:16 +04:00
int result ;
2011-01-20 20:50:14 +03:00
u32 aqa ;
struct nvme_queue * nvmeq ;
2017-05-24 11:39:55 +03:00
result = nvme_remap_bar ( dev , db_bar_size ( dev , 0 ) ) ;
if ( result < 0 )
return result ;
2016-10-19 18:51:05 +03:00
dev - > subsystem = readl ( dev - > bar + NVME_REG_VS ) > = NVME_VS ( 1 , 1 , 0 ) ?
2017-06-27 22:16:38 +03:00
NVME_CAP_NSSRC ( dev - > ctrl . cap ) : 0 ;
2015-08-11 00:20:40 +03:00
2015-11-20 10:58:10 +03:00
if ( dev - > subsystem & &
( readl ( dev - > bar + NVME_REG_CSTS ) & NVME_CSTS_NSSRO ) )
writel ( NVME_CSTS_NSSRO , dev - > bar + NVME_REG_CSTS ) ;
2015-08-11 00:20:40 +03:00
2017-06-27 22:16:38 +03:00
result = nvme_disable_ctrl ( & dev - > ctrl , dev - > ctrl . cap ) ;
2013-05-04 14:43:16 +04:00
if ( result < 0 )
return result ;
2011-01-20 20:50:14 +03:00
2018-04-12 18:16:09 +03:00
result = nvme_alloc_queue ( dev , 0 , NVME_AQ_DEPTH ) ;
2018-01-14 13:39:01 +03:00
if ( result )
return result ;
2011-01-20 20:50:14 +03:00
2018-01-14 13:39:01 +03:00
nvmeq = & dev - > queues [ 0 ] ;
2011-01-20 20:50:14 +03:00
aqa = nvmeq - > q_depth - 1 ;
aqa | = aqa < < 16 ;
2015-11-20 10:58:10 +03:00
writel ( aqa , dev - > bar + NVME_REG_AQA ) ;
lo_hi_writeq ( nvmeq - > sq_dma_addr , dev - > bar + NVME_REG_ASQ ) ;
lo_hi_writeq ( nvmeq - > cq_dma_addr , dev - > bar + NVME_REG_ACQ ) ;
2011-01-20 20:50:14 +03:00
2017-06-27 22:16:38 +03:00
result = nvme_enable_ctrl ( & dev - > ctrl , dev - > ctrl . cap ) ;
2013-05-01 23:07:51 +04:00
if ( result )
2016-11-15 23:56:26 +03:00
return result ;
2014-11-04 18:20:14 +03:00
2014-12-22 22:59:04 +03:00
nvmeq - > cq_vector = 0 ;
2017-09-14 20:54:39 +03:00
nvme_init_queue ( nvmeq , 0 ) ;
2016-09-14 17:18:57 +03:00
result = queue_request_irq ( nvmeq ) ;
2015-06-30 20:22:52 +03:00
if ( result ) {
2019-03-08 20:43:06 +03:00
dev - > online_queues - - ;
2016-11-15 23:56:26 +03:00
return result ;
2015-06-30 20:22:52 +03:00
}
2013-05-01 23:07:51 +04:00
2018-12-02 19:46:17 +03:00
set_bit ( NVMEQ_ENABLED , & nvmeq - > flags ) ;
2011-01-20 20:50:14 +03:00
return result ;
}
2015-11-26 13:46:39 +03:00
static int nvme_create_io_queues ( struct nvme_dev * dev )
2014-03-24 20:46:25 +04:00
{
2018-11-05 22:44:33 +03:00
unsigned i , max , rw_queues ;
2015-11-26 13:46:39 +03:00
int ret = 0 ;
2014-03-24 20:46:25 +04:00
2017-04-24 10:58:29 +03:00
for ( i = dev - > ctrl . queue_count ; i < = dev - > max_qid ; i + + ) {
2018-04-12 18:16:09 +03:00
if ( nvme_alloc_queue ( dev , i , dev - > q_depth ) ) {
2015-11-26 13:46:39 +03:00
ret = - ENOMEM ;
2014-03-24 20:46:25 +04:00
break ;
2015-11-26 13:46:39 +03:00
}
}
2014-03-24 20:46:25 +04:00
2017-04-24 10:58:29 +03:00
max = min ( dev - > max_qid , dev - > ctrl . queue_count - 1 ) ;
2018-12-02 19:46:16 +03:00
if ( max ! = 1 & & dev - > io_queues [ HCTX_TYPE_POLL ] ) {
rw_queues = dev - > io_queues [ HCTX_TYPE_DEFAULT ] +
dev - > io_queues [ HCTX_TYPE_READ ] ;
2018-11-05 22:44:33 +03:00
} else {
rw_queues = max ;
}
2015-12-18 03:08:15 +03:00
for ( i = dev - > online_queues ; i < = max ; i + + ) {
2018-11-05 22:44:33 +03:00
bool polled = i > rw_queues ;
ret = nvme_create_queue ( & dev - > queues [ i ] , i , polled ) ;
2016-11-15 23:56:26 +03:00
if ( ret )
2014-03-24 20:46:25 +04:00
break ;
2014-04-11 19:58:45 +04:00
}
2015-11-26 13:46:39 +03:00
/*
* Ignore failing Create SQ / CQ commands , we can continue with less
2018-01-14 10:14:27 +03:00
* than the desired amount of queues , and even a controller without
* I / O queues can still be used to issue admin commands . This might
2015-11-26 13:46:39 +03:00
* be useful to upgrade a buggy firmware for example .
*/
return ret > = 0 ? 0 : ret ;
2011-01-20 20:50:14 +03:00
}
2016-10-06 05:01:12 +03:00
static ssize_t nvme_cmb_show ( struct device * dev ,
struct device_attribute * attr ,
char * buf )
{
struct nvme_dev * ndev = to_nvme_dev ( dev_get_drvdata ( dev ) ) ;
2016-12-16 21:54:50 +03:00
return scnprintf ( buf , PAGE_SIZE , " cmbloc : x%08x \n cmbsz : x%08x \n " ,
2016-10-06 05:01:12 +03:00
ndev - > cmbloc , ndev - > cmbsz ) ;
}
static DEVICE_ATTR ( cmb , S_IRUGO , nvme_cmb_show , NULL ) ;
2017-12-20 16:50:00 +03:00
static u64 nvme_cmb_size_unit ( struct nvme_dev * dev )
2015-07-20 19:14:09 +03:00
{
2017-12-20 16:50:00 +03:00
u8 szu = ( dev - > cmbsz > > NVME_CMBSZ_SZU_SHIFT ) & NVME_CMBSZ_SZU_MASK ;
return 1ULL < < ( 12 + 4 * szu ) ;
}
static u32 nvme_cmb_size ( struct nvme_dev * dev )
{
return ( dev - > cmbsz > > NVME_CMBSZ_SZ_SHIFT ) & NVME_CMBSZ_SZ_MASK ;
}
2017-12-20 16:25:11 +03:00
static void nvme_map_cmb ( struct nvme_dev * dev )
2015-07-20 19:14:09 +03:00
{
2017-12-20 16:50:00 +03:00
u64 size , offset ;
2015-07-20 19:14:09 +03:00
resource_size_t bar_size ;
struct pci_dev * pdev = to_pci_dev ( dev - > dev ) ;
2017-10-01 10:37:35 +03:00
int bar ;
2015-07-20 19:14:09 +03:00
2018-10-31 22:15:29 +03:00
if ( dev - > cmb_size )
return ;
2015-11-20 10:58:10 +03:00
dev - > cmbsz = readl ( dev - > bar + NVME_REG_CMBSZ ) ;
2017-12-20 16:25:11 +03:00
if ( ! dev - > cmbsz )
return ;
2016-10-06 05:01:12 +03:00
dev - > cmbloc = readl ( dev - > bar + NVME_REG_CMBLOC ) ;
2015-07-20 19:14:09 +03:00
2017-12-20 16:50:00 +03:00
size = nvme_cmb_size_unit ( dev ) * nvme_cmb_size ( dev ) ;
offset = nvme_cmb_size_unit ( dev ) * NVME_CMB_OFST ( dev - > cmbloc ) ;
2017-10-01 10:37:35 +03:00
bar = NVME_CMB_BIR ( dev - > cmbloc ) ;
bar_size = pci_resource_len ( pdev , bar ) ;
2015-07-20 19:14:09 +03:00
if ( offset > bar_size )
2017-12-20 16:25:11 +03:00
return ;
2015-07-20 19:14:09 +03:00
/*
* Controllers may support a CMB size larger than their BAR ,
* for example , due to being behind a bridge . Reduce the CMB to
* the reported size of the BAR
*/
if ( size > bar_size - offset )
size = bar_size - offset ;
2018-10-05 00:27:43 +03:00
if ( pci_p2pdma_add_resource ( pdev , bar , size , offset ) ) {
dev_warn ( dev - > ctrl . device ,
" failed to register the CMB \n " ) ;
2017-12-20 16:25:11 +03:00
return ;
2018-10-05 00:27:43 +03:00
}
2015-07-20 19:14:09 +03:00
dev - > cmb_size = size ;
2018-10-05 00:27:43 +03:00
dev - > cmb_use_sqes = use_cmb_sqes & & ( dev - > cmbsz & NVME_CMBSZ_SQS ) ;
if ( ( dev - > cmbsz & ( NVME_CMBSZ_WDS | NVME_CMBSZ_RDS ) ) = =
( NVME_CMBSZ_WDS | NVME_CMBSZ_RDS ) )
pci_p2pmem_publish ( pdev , true ) ;
2017-12-20 16:25:11 +03:00
if ( sysfs_add_file_to_group ( & dev - > ctrl . device - > kobj ,
& dev_attr_cmb . attr , NULL ) )
dev_warn ( dev - > ctrl . device ,
" failed to add sysfs attribute for CMB \n " ) ;
2015-07-20 19:14:09 +03:00
}
static inline void nvme_release_cmb ( struct nvme_dev * dev )
{
2018-10-05 00:27:43 +03:00
if ( dev - > cmb_size ) {
2017-07-30 01:45:08 +03:00
sysfs_remove_file_from_group ( & dev - > ctrl . device - > kobj ,
& dev_attr_cmb . attr , NULL ) ;
2018-10-05 00:27:43 +03:00
dev - > cmb_size = 0 ;
2015-07-20 19:14:09 +03:00
}
}
2017-05-12 18:02:58 +03:00
/*
 * Issue a Set Features (NVME_FEAT_HOST_MEM_BUF) admin command describing
 * the current host memory buffer.
 *
 * @bits is passed through in dword11.  dword12 is the buffer size divided
 * by the controller page size, dword13/14 the low/high halves of the
 * descriptor list DMA address, and dword15 the descriptor count.
 *
 * Returns the result of nvme_submit_sync_cmd(); a non-zero result is also
 * logged.
 */
static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits)
{
	u64 descs_addr = dev->host_mem_descs_dma;
	struct nvme_command cmd;
	int status;

	memset(&cmd, 0, sizeof(cmd));
	cmd.features.opcode = nvme_admin_set_features;
	cmd.features.fid = cpu_to_le32(NVME_FEAT_HOST_MEM_BUF);
	cmd.features.dword11 = cpu_to_le32(bits);
	cmd.features.dword12 = cpu_to_le32(dev->host_mem_size >>
					   ilog2(dev->ctrl.page_size));
	cmd.features.dword13 = cpu_to_le32(lower_32_bits(descs_addr));
	cmd.features.dword14 = cpu_to_le32(upper_32_bits(descs_addr));
	cmd.features.dword15 = cpu_to_le32(dev->nr_host_mem_descs);

	status = nvme_submit_sync_cmd(dev->ctrl.admin_q, &cmd, NULL, 0);
	if (!status)
		return 0;

	dev_warn(dev->ctrl.device,
		 "failed to set host mem (err %d, flags %#x).\n",
		 status, bits);
	return status;
}
static void nvme_free_host_mem ( struct nvme_dev * dev )
{
int i ;
for ( i = 0 ; i < dev - > nr_host_mem_descs ; i + + ) {
struct nvme_host_mem_buf_desc * desc = & dev - > host_mem_descs [ i ] ;
size_t size = le32_to_cpu ( desc - > size ) * dev - > ctrl . page_size ;
2018-12-29 20:23:43 +03:00
dma_free_attrs ( dev - > dev , size , dev - > host_mem_desc_bufs [ i ] ,
le64_to_cpu ( desc - > addr ) ,
DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN ) ;
2017-05-12 18:02:58 +03:00
}
kfree ( dev - > host_mem_desc_bufs ) ;
dev - > host_mem_desc_bufs = NULL ;
2017-08-28 11:47:18 +03:00
dma_free_coherent ( dev - > dev ,
dev - > nr_host_mem_descs * sizeof ( * dev - > host_mem_descs ) ,
dev - > host_mem_descs , dev - > host_mem_descs_dma ) ;
2017-05-12 18:02:58 +03:00
dev - > host_mem_descs = NULL ;
2017-11-24 21:03:00 +03:00
dev - > nr_host_mem_descs = 0 ;
2017-05-12 18:02:58 +03:00
}
2017-09-11 19:08:43 +03:00
/*
 * Try to build a host memory buffer of up to @preferred bytes out of
 * chunks of at most @chunk_size bytes.
 *
 * The descriptor table is sized for ceil(@preferred / @chunk_size) entries,
 * limited to dev->ctrl.hmmaxd when that is non-zero.  Chunks are allocated
 * until @preferred is reached, the table is full, or an allocation fails.
 * Any non-zero amount counts as success and is recorded in @dev.
 *
 * Returns 0 on success, or -ENOMEM (with dev->host_mem_descs set to NULL)
 * when nothing could be allocated.
 */
static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred,
		u32 chunk_size)
{
	const unsigned long attrs =
		DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN;
	struct nvme_host_mem_buf_desc *descs;
	dma_addr_t descs_dma;
	u32 max_entries, len;
	u64 size, tmp;
	void **bufs;
	int i = 0;

	/* max_entries = ceil(preferred / chunk_size) */
	tmp = (preferred + chunk_size - 1);
	do_div(tmp, chunk_size);
	max_entries = tmp;

	if (dev->ctrl.hmmaxd && dev->ctrl.hmmaxd < max_entries)
		max_entries = dev->ctrl.hmmaxd;

	descs = dma_alloc_coherent(dev->dev, max_entries * sizeof(*descs),
				   &descs_dma, GFP_KERNEL);
	if (!descs)
		goto out;

	bufs = kcalloc(max_entries, sizeof(*bufs), GFP_KERNEL);
	if (!bufs)
		goto out_free_descs;

	size = 0;
	while (size < preferred && i < max_entries) {
		dma_addr_t dma_addr;

		len = min_t(u64, chunk_size, preferred - size);
		bufs[i] = dma_alloc_attrs(dev->dev, len, &dma_addr, GFP_KERNEL,
					  attrs);
		if (!bufs[i])
			break;

		descs[i].addr = cpu_to_le64(dma_addr);
		/* Descriptor sizes are in units of the controller page. */
		descs[i].size = cpu_to_le32(len / dev->ctrl.page_size);
		i++;
		size += len;
	}

	if (!size)
		goto out_free_bufs;

	dev->nr_host_mem_descs = i;
	dev->host_mem_size = size;
	dev->host_mem_descs = descs;
	dev->host_mem_descs_dma = descs_dma;
	dev->host_mem_desc_bufs = bufs;
	return 0;

out_free_bufs:
	for (i--; i >= 0; i--) {
		size_t size = le32_to_cpu(descs[i].size) * dev->ctrl.page_size;

		dma_free_attrs(dev->dev, size, bufs[i],
			       le64_to_cpu(descs[i].addr), attrs);
	}

	kfree(bufs);
out_free_descs:
	dma_free_coherent(dev->dev, max_entries * sizeof(*descs), descs,
			  descs_dma);
out:
	dev->host_mem_descs = NULL;
	return -ENOMEM;
}
2017-09-11 19:08:43 +03:00
/*
 * Allocate a host memory buffer of at least @min bytes (when @min is
 * non-zero), aiming for @preferred bytes.
 *
 * The chunk size starts at the smaller of @preferred and
 * PAGE_SIZE * MAX_ORDER_NR_PAGES and is halved after every attempt that
 * does not yield enough memory, down to the larger of
 * dev->ctrl.hmminds * 4096 and two pages.
 *
 * Returns 0 on success or -ENOMEM when no chunk size works.
 */
static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred)
{
	/* start big and work our way down */
	u32 chunk_size = min_t(u64, preferred, PAGE_SIZE * MAX_ORDER_NR_PAGES);
	const u32 smallest = max_t(u32, dev->ctrl.hmminds * 4096,
				   PAGE_SIZE * 2);

	while (chunk_size >= smallest) {
		if (!__nvme_alloc_host_mem(dev, preferred, chunk_size)) {
			if (!min || dev->host_mem_size >= min)
				return 0;
			/* Got something, but not enough: release and retry. */
			nvme_free_host_mem(dev);
		}
		chunk_size /= 2;
	}

	return -ENOMEM;
}
2017-09-06 13:19:57 +03:00
static int nvme_setup_host_mem ( struct nvme_dev * dev )
2017-05-12 18:02:58 +03:00
{
u64 max = ( u64 ) max_host_mem_size_mb * SZ_1M ;
u64 preferred = ( u64 ) dev - > ctrl . hmpre * 4096 ;
u64 min = ( u64 ) dev - > ctrl . hmmin * 4096 ;
u32 enable_bits = NVME_HOST_MEM_ENABLE ;
2017-12-04 23:23:54 +03:00
int ret ;
2017-05-12 18:02:58 +03:00
preferred = min ( preferred , max ) ;
if ( min > max ) {
dev_warn ( dev - > ctrl . device ,
" min host memory (%lld MiB) above limit (%d MiB). \n " ,
min > > ilog2 ( SZ_1M ) , max_host_mem_size_mb ) ;
nvme_free_host_mem ( dev ) ;
2017-09-06 13:19:57 +03:00
return 0 ;
2017-05-12 18:02:58 +03:00
}
/*
* If we already have a buffer allocated check if we can reuse it .
*/
if ( dev - > host_mem_descs ) {
if ( dev - > host_mem_size > = min )
enable_bits | = NVME_HOST_MEM_RETURN ;
else
nvme_free_host_mem ( dev ) ;
}
if ( ! dev - > host_mem_descs ) {
2017-09-11 19:08:43 +03:00
if ( nvme_alloc_host_mem ( dev , min , preferred ) ) {
dev_warn ( dev - > ctrl . device ,
" failed to allocate host memory buffer. \n " ) ;
2017-09-06 13:19:57 +03:00
return 0 ; /* controller must work without HMB */
2017-09-11 19:08:43 +03:00
}
dev_info ( dev - > ctrl . device ,
" allocated %lld MiB host memory buffer. \n " ,
dev - > host_mem_size > > ilog2 ( SZ_1M ) ) ;
2017-05-12 18:02:58 +03:00
}
2017-09-06 13:19:57 +03:00
ret = nvme_set_host_mem ( dev , enable_bits ) ;
if ( ret )
2017-05-12 18:02:58 +03:00
nvme_free_host_mem ( dev ) ;
2017-09-06 13:19:57 +03:00
return ret ;
2013-07-16 01:02:24 +04:00
}
2019-02-16 20:13:10 +03:00
/*
* nrirqs is the number of interrupts available for write and read
* queues . The core already reserved an interrupt for the admin queue .
*/
static void nvme_calc_irq_sets ( struct irq_affinity * affd , unsigned int nrirqs )
2018-10-31 17:36:31 +03:00
{
2019-02-16 20:13:10 +03:00
struct nvme_dev * dev = affd - > priv ;
unsigned int nr_read_queues ;
2018-10-31 17:36:31 +03:00
/*
2019-02-16 20:13:10 +03:00
* If there is no interupt available for queues , ensure that
* the default queue is set to 1. The affinity set size is
* also set to one , but the irq core ignores it for this case .
*
* If only one interrupt is available or ' write_queue ' = = 0 , combine
* write and read queues .
*
* If ' write_queues ' > 0 , ensure it leaves room for at least one read
* queue .
2018-10-31 17:36:31 +03:00
*/
2019-02-16 20:13:10 +03:00
if ( ! nrirqs ) {
nrirqs = 1 ;
nr_read_queues = 0 ;
} else if ( nrirqs = = 1 | | ! write_queues ) {
nr_read_queues = 0 ;
} else if ( write_queues > = nrirqs ) {
nr_read_queues = 1 ;
2018-10-31 17:36:31 +03:00
} else {
2019-02-16 20:13:10 +03:00
nr_read_queues = nrirqs - write_queues ;
2018-10-31 17:36:31 +03:00
}
2019-02-16 20:13:10 +03:00
dev - > io_queues [ HCTX_TYPE_DEFAULT ] = nrirqs - nr_read_queues ;
affd - > set_size [ HCTX_TYPE_DEFAULT ] = nrirqs - nr_read_queues ;
dev - > io_queues [ HCTX_TYPE_READ ] = nr_read_queues ;
affd - > set_size [ HCTX_TYPE_READ ] = nr_read_queues ;
affd - > nr_sets = nr_read_queues ? 2 : 1 ;
2018-10-31 17:36:31 +03:00
}
2018-12-09 21:21:45 +03:00
static int nvme_setup_irqs ( struct nvme_dev * dev , unsigned int nr_io_queues )
2018-10-31 17:36:31 +03:00
{
struct pci_dev * pdev = to_pci_dev ( dev - > dev ) ;
struct irq_affinity affd = {
genirq/affinity: Store interrupt sets size in struct irq_affinity
The interrupt affinity spreading mechanism supports to spread out
affinities for one or more interrupt sets. A interrupt set contains one
or more interrupts. Each set is mapped to a specific functionality of a
device, e.g. general I/O queues and read I/O queus of multiqueue block
devices.
The number of interrupts per set is defined by the driver. It depends on
the total number of available interrupts for the device, which is
determined by the PCI capabilites and the availability of underlying CPU
resources, and the number of queues which the device provides and the
driver wants to instantiate.
The driver passes initial configuration for the interrupt allocation via
a pointer to struct irq_affinity.
Right now the allocation mechanism is complex as it requires to have a
loop in the driver to determine the maximum number of interrupts which
are provided by the PCI capabilities and the underlying CPU resources.
This loop would have to be replicated in every driver which wants to
utilize this mechanism. That's unwanted code duplication and error
prone.
In order to move this into generic facilities it is required to have a
mechanism, which allows the recalculation of the interrupt sets and
their size, in the core code. As the core code does not have any
knowledge about the underlying device, a driver specific callback will
be added to struct affinity_desc, which will be invoked by the core
code. The callback will get the number of available interupts as an
argument, so the driver can calculate the corresponding number and size
of interrupt sets.
To support this, two modifications for the handling of struct irq_affinity
are required:
1) The (optional) interrupt sets size information is contained in a
separate array of integers and struct irq_affinity contains a
pointer to it.
This is cumbersome and as the maximum number of interrupt sets is small,
there is no reason to have separate storage. Moving the size array into
struct affinity_desc avoids indirections and makes the code simpler.
2) At the moment the struct irq_affinity pointer which is handed in from
the driver and passed through to several core functions is marked
'const'.
With the upcoming callback to recalculate the number and size of
interrupt sets, it's necessary to remove the 'const'
qualifier. Otherwise the callback would not be able to update the data.
Implement #1 and store the interrupt sets size in 'struct irq_affinity'.
No functional change.
[ tglx: Fixed the memcpy() size so it won't copy beyond the size of the
source. Fixed the kernel doc comments for struct irq_affinity and
de-'This patch'-ed the changelog ]
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Marc Zyngier <marc.zyngier@arm.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Bjorn Helgaas <helgaas@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: linux-block@vger.kernel.org
Cc: Sagi Grimberg <sagi@grimberg.me>
Cc: linux-nvme@lists.infradead.org
Cc: linux-pci@vger.kernel.org
Cc: Keith Busch <keith.busch@intel.com>
Cc: Sumit Saxena <sumit.saxena@broadcom.com>
Cc: Kashyap Desai <kashyap.desai@broadcom.com>
Cc: Shivasharan Srikanteshwara <shivasharan.srikanteshwara@broadcom.com>
Link: https://lkml.kernel.org/r/20190216172228.423723127@linutronix.de
2019-02-16 20:13:08 +03:00
. pre_vectors = 1 ,
2019-02-16 20:13:10 +03:00
. calc_sets = nvme_calc_irq_sets ,
. priv = dev ,
2018-10-31 17:36:31 +03:00
} ;
2018-12-09 21:21:45 +03:00
unsigned int irq_queues , this_p_queues ;
2019-06-08 21:02:19 +03:00
unsigned int nr_cpus = num_possible_cpus ( ) ;
2018-12-09 21:21:45 +03:00
/*
* Poll queues don ' t need interrupts , but we need at least one IO
* queue left over for non - polled IO .
*/
this_p_queues = poll_queues ;
if ( this_p_queues > = nr_io_queues ) {
this_p_queues = nr_io_queues - 1 ;
irq_queues = 1 ;
} else {
2019-06-08 21:02:19 +03:00
if ( nr_cpus < nr_io_queues - this_p_queues )
irq_queues = nr_cpus + 1 ;
else
irq_queues = nr_io_queues - this_p_queues + 1 ;
2018-12-09 21:21:45 +03:00
}
dev - > io_queues [ HCTX_TYPE_POLL ] = this_p_queues ;
2018-10-31 17:36:31 +03:00
2019-02-16 20:13:10 +03:00
/* Initialize for the single interrupt case */
dev - > io_queues [ HCTX_TYPE_DEFAULT ] = 1 ;
dev - > io_queues [ HCTX_TYPE_READ ] = 0 ;
2018-10-31 17:36:31 +03:00
2019-02-16 20:13:10 +03:00
return pci_alloc_irq_vectors_affinity ( pdev , 1 , irq_queues ,
PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY , & affd ) ;
2018-10-31 17:36:31 +03:00
}
2019-01-05 01:04:33 +03:00
/*
 * Delete the I/O submission queues and, only if that step reports success,
 * the I/O completion queues as well.
 */
static void nvme_disable_io_queues(struct nvme_dev *dev)
{
	if (!__nvme_disable_io_queues(dev, nvme_admin_delete_sq))
		return;

	__nvme_disable_io_queues(dev, nvme_admin_delete_cq);
}
2012-12-22 03:13:49 +04:00
static int nvme_setup_io_queues ( struct nvme_dev * dev )
2011-01-20 20:50:14 +03:00
{
2018-01-14 13:39:01 +03:00
struct nvme_queue * adminq = & dev - > queues [ 0 ] ;
2015-05-22 12:12:39 +03:00
struct pci_dev * pdev = to_pci_dev ( dev - > dev ) ;
2017-05-24 11:39:55 +03:00
int result , nr_io_queues ;
unsigned long size ;
2011-01-20 20:50:14 +03:00
2018-10-31 17:36:31 +03:00
nr_io_queues = max_io_queues ( ) ;
2015-11-26 13:09:06 +03:00
result = nvme_set_queue_count ( & dev - > ctrl , & nr_io_queues ) ;
if ( result < 0 )
2011-01-20 21:01:49 +03:00
return result ;
2015-11-26 13:09:06 +03:00
2016-06-07 00:20:50 +03:00
if ( nr_io_queues = = 0 )
2016-04-09 01:09:10 +03:00
return 0 ;
2018-12-02 19:46:17 +03:00
clear_bit ( NVMEQ_ENABLED , & adminq - > flags ) ;
2011-01-20 20:50:14 +03:00
2018-10-05 00:27:43 +03:00
if ( dev - > cmb_use_sqes ) {
2015-07-20 19:14:09 +03:00
result = nvme_cmb_qdepth ( dev , nr_io_queues ,
sizeof ( struct nvme_command ) ) ;
if ( result > 0 )
dev - > q_depth = result ;
else
2018-10-05 00:27:43 +03:00
dev - > cmb_use_sqes = false ;
2015-07-20 19:14:09 +03:00
}
2017-05-24 11:39:55 +03:00
do {
size = db_bar_size ( dev , nr_io_queues ) ;
result = nvme_remap_bar ( dev , size ) ;
if ( ! result )
break ;
if ( ! - - nr_io_queues )
return - ENOMEM ;
} while ( 1 ) ;
adminq - > q_db = dev - > dbs ;
2011-10-21 01:00:41 +04:00
2019-01-05 01:04:33 +03:00
retry :
2013-07-16 01:02:24 +04:00
/* Deregister the admin queue's interrupt */
2017-04-13 10:06:43 +03:00
pci_free_irq ( pdev , 0 , adminq ) ;
2013-07-16 01:02:24 +04:00
2014-11-14 19:49:26 +03:00
/*
* If we enable msix early due to not intx , disable it again before
* setting up the full range we need .
*/
2016-09-14 17:18:57 +03:00
pci_free_irq_vectors ( pdev ) ;
2018-10-31 17:36:31 +03:00
result = nvme_setup_irqs ( dev , nr_io_queues ) ;
2018-04-12 18:16:10 +03:00
if ( result < = 0 )
2016-09-14 17:18:57 +03:00
return - EIO ;
2018-10-31 17:36:31 +03:00
2018-04-12 18:16:10 +03:00
dev - > num_vecs = result ;
2018-11-05 22:44:33 +03:00
result = max ( result - 1 , 1 ) ;
2018-12-02 19:46:16 +03:00
dev - > max_qid = result + dev - > io_queues [ HCTX_TYPE_POLL ] ;
2013-05-12 02:19:31 +04:00
2013-06-20 18:53:48 +04:00
/*
* Should investigate if there ' s a performance win from allocating
* more queues than interrupt vectors ; it might allow the submission
* path to scale better , even if the receive path is limited by the
* number of interrupts .
*/
2016-09-14 17:18:57 +03:00
result = queue_request_irq ( adminq ) ;
2019-03-08 20:43:06 +03:00
if ( result )
2016-11-15 23:56:26 +03:00
return result ;
2018-12-02 19:46:17 +03:00
set_bit ( NVMEQ_ENABLED , & adminq - > flags ) ;
2019-01-05 01:04:33 +03:00
result = nvme_create_io_queues ( dev ) ;
if ( result | | dev - > online_queues < 2 )
return result ;
if ( dev - > online_queues - 1 < dev - > max_qid ) {
nr_io_queues = dev - > online_queues - 1 ;
nvme_disable_io_queues ( dev ) ;
nvme_suspend_io_queues ( dev ) ;
goto retry ;
}
dev_info ( dev - > ctrl . device , " %d/%d/%d default/read/poll queues \n " ,
dev - > io_queues [ HCTX_TYPE_DEFAULT ] ,
dev - > io_queues [ HCTX_TYPE_READ ] ,
dev - > io_queues [ HCTX_TYPE_POLL ] ) ;
return 0 ;
2011-01-20 20:50:14 +03:00
}
2017-06-03 10:38:04 +03:00
static void nvme_del_queue_end ( struct request * req , blk_status_t error )
2015-06-01 23:28:14 +03:00
{
2016-01-13 00:41:17 +03:00
struct nvme_queue * nvmeq = req - > end_io_data ;
2015-12-11 23:14:28 +03:00
2016-01-13 00:41:17 +03:00
blk_mq_free_request ( req ) ;
2018-12-02 19:46:22 +03:00
complete ( & nvmeq - > delete_done ) ;
2015-06-01 23:28:14 +03:00
}
2017-06-03 10:38:04 +03:00
static void nvme_del_cq_end ( struct request * req , blk_status_t error )
2015-06-01 23:28:14 +03:00
{
2016-01-13 00:41:17 +03:00
struct nvme_queue * nvmeq = req - > end_io_data ;
2015-06-01 23:28:14 +03:00
2018-12-02 19:46:22 +03:00
if ( error )
set_bit ( NVMEQ_DELETE_ERROR , & nvmeq - > flags ) ;
2016-01-13 00:41:17 +03:00
nvme_del_queue_end ( req , error ) ;
2015-06-01 23:28:14 +03:00
}
2016-01-13 00:41:17 +03:00
/*
 * Submit an asynchronous delete-queue admin command for @nvmeq.
 *
 * @opcode selects the command; nvme_admin_delete_cq gets the CQ-specific
 * completion handler, anything else the generic one.
 *
 * Returns 0 once the request has been handed to the admin queue, or the
 * error from nvme_alloc_request(). Completion is reported through
 * nvmeq->delete_done.
 */
static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode)
{
	struct request_queue *admin_q = nvmeq->dev->ctrl.admin_q;
	struct nvme_command cmd;
	struct request *rq;

	memset(&cmd, 0, sizeof(cmd));
	cmd.delete_queue.qid = cpu_to_le16(nvmeq->qid);
	cmd.delete_queue.opcode = opcode;

	rq = nvme_alloc_request(admin_q, &cmd, BLK_MQ_REQ_NOWAIT, NVME_QID_ANY);
	if (IS_ERR(rq))
		return PTR_ERR(rq);

	rq->end_io_data = nvmeq;
	rq->timeout = ADMIN_TIMEOUT;

	/* Arm the completion before the command can possibly finish. */
	init_completion(&nvmeq->delete_done);

	if (opcode == nvme_admin_delete_cq)
		blk_execute_rq_nowait(admin_q, NULL, rq, false,
				      nvme_del_cq_end);
	else
		blk_execute_rq_nowait(admin_q, NULL, rq, false,
				      nvme_del_queue_end);

	return 0;
}
2019-01-05 01:04:33 +03:00
/*
 * Issue @opcode (a delete SQ/CQ admin command) for every online I/O queue,
 * highest queue id first, and wait for the commands to complete.
 *
 * Submission stops as soon as nvme_delete_queue() fails; after each reaped
 * completion, submission is retried if queues are still left to delete.
 *
 * Returns false if a wait timed out, true otherwise.
 */
static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode)
{
	int remaining = dev->online_queues - 1;
	int pending = 0;
	unsigned long timeout;

resubmit:
	timeout = ADMIN_TIMEOUT;

	/* Queue as many deletions as the admin queue will accept right now. */
	for (; remaining > 0; remaining--, pending++) {
		if (nvme_delete_queue(&dev->queues[remaining], opcode))
			break;
	}

	while (pending) {
		/* Oldest outstanding deletion is the highest pending qid. */
		struct nvme_queue *nvmeq = &dev->queues[remaining + pending];

		timeout = wait_for_completion_io_timeout(&nvmeq->delete_done,
							 timeout);
		if (!timeout)
			return false;

		/* handle any remaining CQEs */
		if (opcode == nvme_admin_delete_cq &&
		    !test_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags))
			nvme_poll_irqdisable(nvmeq, -1);

		pending--;

		/* Still queues left to delete: go try submitting them again. */
		if (remaining)
			goto resubmit;
	}

	return true;
}
2013-04-16 19:22:36 +04:00
/*
2018-01-06 03:01:58 +03:00
* return error value only when tagset allocation failed
2013-04-16 19:22:36 +04:00
*/
2012-12-22 03:13:49 +04:00
static int nvme_dev_add ( struct nvme_dev * dev )
2011-01-20 20:50:14 +03:00
{
2018-01-06 03:01:58 +03:00
int ret ;
2015-11-28 17:39:07 +03:00
if ( ! dev - > ctrl . tagset ) {
2018-12-02 19:46:27 +03:00
dev - > tagset . ops = & nvme_mq_ops ;
2015-06-08 19:08:15 +03:00
dev - > tagset . nr_hw_queues = dev - > online_queues - 1 ;
2019-07-23 06:23:13 +03:00
dev - > tagset . nr_maps = 2 ; /* default + read */
2018-12-14 16:06:59 +03:00
if ( dev - > io_queues [ HCTX_TYPE_POLL ] )
dev - > tagset . nr_maps + + ;
2015-06-08 19:08:15 +03:00
dev - > tagset . timeout = NVME_IO_TIMEOUT ;
dev - > tagset . numa_node = dev_to_node ( dev - > dev ) ;
dev - > tagset . queue_depth =
2014-11-04 18:20:14 +03:00
min_t ( int , dev - > q_depth , BLK_MQ_MAX_DEPTH ) - 1 ;
2019-03-05 15:46:58 +03:00
dev - > tagset . cmd_size = sizeof ( struct nvme_iod ) ;
2015-06-08 19:08:15 +03:00
dev - > tagset . flags = BLK_MQ_F_SHOULD_MERGE ;
dev - > tagset . driver_data = dev ;
2011-01-20 20:50:14 +03:00
2018-01-06 03:01:58 +03:00
ret = blk_mq_alloc_tag_set ( & dev - > tagset ) ;
if ( ret ) {
dev_warn ( dev - > ctrl . device ,
" IO queues tagset allocation failed %d \n " , ret ) ;
return ret ;
}
2015-11-28 17:39:07 +03:00
dev - > ctrl . tagset = & dev - > tagset ;
2015-12-18 03:08:15 +03:00
} else {
blk_mq_update_nr_hw_queues ( & dev - > tagset , dev - > online_queues - 1 ) ;
/* Free previously allocated queues that are no longer usable */
nvme_free_queues ( dev , dev - > online_queues ) ;
2015-06-08 19:08:15 +03:00
}
2015-12-18 03:08:15 +03:00
2019-05-02 14:31:33 +03:00
nvme_dbbuf_set ( dev ) ;
NVMe: Metadata format support
Adds support for NVMe metadata formats and exposes block devices for
all namespaces regardless of their format. Namespace formats that are
unusable will have disk capacity set to 0, but a handle to the block
device is created to simplify device management. A namespace is not
usable when the format requires host interleave block and metadata in
single buffer, has no provisioned storage, or has better data but failed
to register with blk integrity.
The namespace has to be scanned in two phases to support separate
metadata formats. The first establishes the sector size and capacity
prior to invoking add_disk. If metadata is required, the capacity will
be temporarilly set to 0 until it can be revalidated and registered with
the integrity extenstions after add_disk completes.
The driver relies on the integrity extensions to provide the metadata
buffer. NVMe requires this be a single physically contiguous region,
so only one integrity segment is allowed per command. If the metadata
is used for T10 PI, the driver provides mappings to save and restore
the reftag physical block translation. The driver provides no-op
functions for generate and verify if metadata is not used for protection
information. This way the setup is always provided by the block layer.
If a request does not supply a required metadata buffer, the command
is failed with bad address. This could only happen if a user manually
disables verify/generate on such a disk. The only exception to where
this is okay is if the controller is capable of stripping/generating
the metadata, which is possible on some types of formats.
The metadata scatter gather list now occupies the spot in the nvme_iod
that used to be used to link retryable IOD's, but we don't do that
anymore, so the field was unused.
Signed-off-by: Keith Busch <keith.busch@intel.com>
2015-02-19 23:39:03 +03:00
return 0 ;
2011-01-20 20:50:14 +03:00
}
2016-02-24 19:15:52 +03:00
static int nvme_pci_enable ( struct nvme_dev * dev )
2013-07-16 01:02:19 +04:00
{
2016-02-24 19:15:52 +03:00
int result = - ENOMEM ;
2015-05-22 12:12:39 +03:00
struct pci_dev * pdev = to_pci_dev ( dev - > dev ) ;
2013-07-16 01:02:19 +04:00
if ( pci_enable_device_mem ( pdev ) )
return result ;
pci_set_master ( pdev ) ;
2019-06-28 10:17:48 +03:00
if ( dma_set_mask_and_coherent ( dev - > dev , DMA_BIT_MASK ( 64 ) ) )
2013-06-27 02:49:11 +04:00
goto disable ;
2013-07-16 01:02:19 +04:00
2015-11-20 10:58:10 +03:00
if ( readl ( dev - > bar + NVME_REG_CSTS ) = = - 1 ) {
2013-12-11 00:10:39 +04:00
result = - ENODEV ;
2016-02-24 19:15:52 +03:00
goto disable ;
2013-12-11 00:10:39 +04:00
}
2014-11-14 19:49:26 +03:00
/*
2016-04-09 01:09:10 +03:00
* Some devices and / or platforms don ' t advertise or work with INTx
* interrupts . Pre - enable a single MSIX or MSI vec for setup . We ' ll
* adjust this later .
2014-11-14 19:49:26 +03:00
*/
2016-09-14 17:18:57 +03:00
result = pci_alloc_irq_vectors ( pdev , 1 , 1 , PCI_IRQ_ALL_TYPES ) ;
if ( result < 0 )
return result ;
2014-11-14 19:49:26 +03:00
2017-06-27 22:16:38 +03:00
dev - > ctrl . cap = lo_hi_readq ( dev - > bar + NVME_REG_CAP ) ;
2015-11-20 10:58:10 +03:00
2017-06-27 22:16:38 +03:00
dev - > q_depth = min_t ( int , NVME_CAP_MQES ( dev - > ctrl . cap ) + 1 ,
2017-07-10 11:46:59 +03:00
io_queue_depth ) ;
2017-06-27 22:16:38 +03:00
dev - > db_stride = 1 < < NVME_CAP_STRIDE ( dev - > ctrl . cap ) ;
2015-11-20 10:58:10 +03:00
dev - > dbs = dev - > bar + 4096 ;
2015-12-01 23:23:22 +03:00
/*
* Temporary fix for the Apple controller found in the MacBook8 , 1 and
* some MacBook7 , 1 to avoid controller resets and data loss .
*/
if ( pdev - > vendor = = PCI_VENDOR_ID_APPLE & & pdev - > device = = 0x2001 ) {
dev - > q_depth = 2 ;
2017-05-20 16:14:43 +03:00
dev_warn ( dev - > ctrl . device , " detected Apple NVMe controller, "
" set queue depth=%u to work around controller resets \n " ,
2015-12-01 23:23:22 +03:00
dev - > q_depth ) ;
2017-06-28 05:27:57 +03:00
} else if ( pdev - > vendor = = PCI_VENDOR_ID_SAMSUNG & &
( pdev - > device = = 0xa821 | | pdev - > device = = 0xa822 ) & &
2017-06-27 22:16:38 +03:00
NVME_CAP_MQES ( dev - > ctrl . cap ) = = 0 ) {
2017-06-28 05:27:57 +03:00
dev - > q_depth = 64 ;
dev_err ( dev - > ctrl . device , " detected PM1725 NVMe controller, "
" set queue depth=%u \n " , dev - > q_depth ) ;
2015-12-01 23:23:22 +03:00
}
2017-12-20 16:25:11 +03:00
nvme_map_cmb ( dev ) ;
2016-10-06 05:01:12 +03:00
2015-12-08 01:30:31 +03:00
pci_enable_pcie_error_reporting ( pdev ) ;
pci_save_state ( pdev ) ;
2013-07-16 01:02:19 +04:00
return 0 ;
disable :
pci_disable_device ( pdev ) ;
return result ;
}
static void nvme_dev_unmap ( struct nvme_dev * dev )
2016-02-24 19:15:52 +03:00
{
if ( dev - > bar )
iounmap ( dev - > bar ) ;
2016-06-07 10:44:02 +03:00
pci_release_mem_regions ( to_pci_dev ( dev - > dev ) ) ;
2016-02-24 19:15:52 +03:00
}
static void nvme_pci_disable ( struct nvme_dev * dev )
2013-07-16 01:02:19 +04:00
{
2015-05-22 12:12:39 +03:00
struct pci_dev * pdev = to_pci_dev ( dev - > dev ) ;
2016-09-14 17:18:57 +03:00
pci_free_irq_vectors ( pdev ) ;
2013-07-16 01:02:19 +04:00
2015-12-08 01:30:31 +03:00
if ( pci_is_enabled ( pdev ) ) {
pci_disable_pcie_error_reporting ( pdev ) ;
2015-05-22 12:12:39 +03:00
pci_disable_device ( pdev ) ;
2013-12-11 00:10:40 +04:00
}
}
2016-01-13 00:41:18 +03:00
/*
 * Quiesce and tear down the controller's queues and PCI resources.
 *
 * @shutdown: true when the controller is going away for good; in that case
 *	      outstanding requests get a chance to complete first and the
 *	      queues are restarted afterwards so that entered requests fail
 *	      instead of hanging.
 *
 * The whole sequence runs under dev->shutdown_lock.
 */
static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
{
	struct pci_dev *pdev = to_pci_dev(dev->dev);
	bool ctrl_dead = true;
	bool frozen = false;

	mutex_lock(&dev->shutdown_lock);

	/* The controller state can only be inspected while the device is enabled. */
	if (pci_is_enabled(pdev)) {
		u32 csts = readl(dev->bar + NVME_REG_CSTS);

		if (dev->ctrl.state == NVME_CTRL_LIVE ||
		    dev->ctrl.state == NVME_CTRL_RESETTING) {
			frozen = true;
			nvme_start_freeze(&dev->ctrl);
		}

		/* Fatal status, not ready, or a broken PCI channel all mean "dead". */
		ctrl_dead = (csts & NVME_CSTS_CFS) ||
			    !(csts & NVME_CSTS_RDY) ||
			    pdev->error_state != pci_channel_io_normal;
	}

	/*
	 * Give the controller a chance to complete all entered requests if
	 * doing a safe shutdown.
	 */
	if (frozen && shutdown && !ctrl_dead)
		nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT);

	nvme_stop_queues(&dev->ctrl);

	/* Only a responsive controller can be asked to delete its queues. */
	if (!ctrl_dead && dev->ctrl.queue_count > 0) {
		nvme_disable_io_queues(dev);
		nvme_disable_admin_queue(dev, shutdown);
	}
	nvme_suspend_io_queues(dev);
	nvme_suspend_queue(&dev->queues[0]);
	nvme_pci_disable(dev);

	/* Cancel whatever is still outstanding on the I/O and admin tag sets. */
	blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl);
	blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_request,
				&dev->ctrl);

	/*
	 * The driver will not be starting up queues again if shutting down so
	 * must flush all entered requests to their failed completion to avoid
	 * deadlocking blk-mq hot-cpu notifier.
	 */
	if (shutdown) {
		nvme_start_queues(&dev->ctrl);
		if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q))
			blk_mq_unquiesce_queue(dev->ctrl.admin_q);
	}

	mutex_unlock(&dev->shutdown_lock);
}
2011-02-10 17:56:01 +03:00
static int nvme_setup_prp_pools ( struct nvme_dev * dev )
{
2015-05-22 12:12:39 +03:00
dev - > prp_page_pool = dma_pool_create ( " prp list page " , dev - > dev ,
2011-02-10 17:56:01 +03:00
PAGE_SIZE , PAGE_SIZE , 0 ) ;
if ( ! dev - > prp_page_pool )
return - ENOMEM ;
2011-02-10 18:30:34 +03:00
/* Optimisation for I/Os between 4k and 128k */
2015-05-22 12:12:39 +03:00
dev - > prp_small_pool = dma_pool_create ( " prp list 256 " , dev - > dev ,
2011-02-10 18:30:34 +03:00
256 , 256 , 0 ) ;
if ( ! dev - > prp_small_pool ) {
dma_pool_destroy ( dev - > prp_page_pool ) ;
return - ENOMEM ;
}
2011-02-10 17:56:01 +03:00
return 0 ;
}
static void nvme_release_prp_pools ( struct nvme_dev * dev )
{
dma_pool_destroy ( dev - > prp_page_pool ) ;
2011-02-10 18:30:34 +03:00
dma_pool_destroy ( dev - > prp_small_pool ) ;
2011-02-10 17:56:01 +03:00
}
2015-11-26 12:54:19 +03:00
static void nvme_pci_free_ctrl ( struct nvme_ctrl * ctrl )
2013-02-19 21:17:58 +04:00
{
2015-11-26 12:54:19 +03:00
struct nvme_dev * dev = to_nvme_dev ( ctrl ) ;
2014-02-01 03:53:39 +04:00
2017-04-10 18:51:07 +03:00
nvme_dbbuf_dma_free ( dev ) ;
2015-05-22 12:12:39 +03:00
put_device ( dev - > dev ) ;
2015-06-08 19:08:13 +03:00
if ( dev - > tagset . tags )
blk_mq_free_tag_set ( & dev - > tagset ) ;
2015-11-26 12:06:56 +03:00
if ( dev - > ctrl . admin_q )
blk_put_queue ( dev - > ctrl . admin_q ) ;
2013-02-19 21:17:58 +04:00
kfree ( dev - > queues ) ;
2017-02-22 20:15:07 +03:00
free_opal_dev ( dev - > ctrl . opal_dev ) ;
2018-06-21 18:49:37 +03:00
mempool_destroy ( dev - > iod_mempool ) ;
2013-02-19 21:17:58 +04:00
kfree ( dev ) ;
}
2019-06-08 23:16:32 +03:00
static void nvme_remove_dead_ctrl ( struct nvme_dev * dev )
2016-02-24 19:15:55 +03:00
{
2017-10-18 14:25:42 +03:00
nvme_get_ctrl ( & dev - > ctrl ) ;
2016-02-24 19:15:56 +03:00
nvme_dev_disable ( dev , false ) ;
2018-06-20 08:42:22 +03:00
nvme_kill_queues ( & dev - > ctrl ) ;
2017-11-09 14:32:07 +03:00
if ( ! queue_work ( nvme_wq , & dev - > remove_work ) )
2016-02-24 19:15:55 +03:00
nvme_put_ctrl ( & dev - > ctrl ) ;
}
2015-11-26 14:42:26 +03:00
static void nvme_reset_work ( struct work_struct * work )
2013-02-19 21:17:58 +04:00
{
2017-06-15 16:41:08 +03:00
struct nvme_dev * dev =
container_of ( work , struct nvme_dev , ctrl . reset_work ) ;
2017-02-03 22:50:32 +03:00
bool was_suspend = ! ! ( dev - > ctrl . ctrl_config & NVME_CC_SHN_NORMAL ) ;
2019-06-08 23:01:02 +03:00
int result ;
2018-01-06 03:01:58 +03:00
enum nvme_ctrl_state new_state = NVME_CTRL_LIVE ;
2013-02-19 21:17:58 +04:00
2019-06-08 23:01:02 +03:00
if ( WARN_ON ( dev - > ctrl . state ! = NVME_CTRL_RESETTING ) ) {
result = - ENODEV ;
2015-11-26 14:42:26 +03:00
goto out ;
2019-06-08 23:01:02 +03:00
}
2013-02-19 21:17:58 +04:00
2015-11-26 14:42:26 +03:00
/*
* If we ' re called to reset a live controller first shut it down before
* moving on .
*/
2016-02-24 19:15:52 +03:00
if ( dev - > ctrl . ctrl_config & NVME_CC_ENABLE )
2016-01-13 00:41:18 +03:00
nvme_dev_disable ( dev , false ) ;
2019-05-14 23:46:09 +03:00
nvme_sync_queues ( & dev - > ctrl ) ;
2013-02-19 21:17:58 +04:00
2019-01-24 04:46:11 +03:00
mutex_lock ( & dev - > shutdown_lock ) ;
2016-02-24 19:15:52 +03:00
result = nvme_pci_enable ( dev ) ;
2013-07-16 01:02:21 +04:00
if ( result )
2019-02-11 19:23:50 +03:00
goto out_unlock ;
2013-07-16 01:02:21 +04:00
2017-05-01 00:27:17 +03:00
result = nvme_pci_configure_admin_queue ( dev ) ;
2013-07-16 01:02:21 +04:00
if ( result )
2019-02-11 19:23:50 +03:00
goto out_unlock ;
2013-07-16 01:02:21 +04:00
2015-01-08 04:55:50 +03:00
result = nvme_alloc_admin_tags ( dev ) ;
if ( result )
2019-02-11 19:23:50 +03:00
goto out_unlock ;
2014-04-08 03:10:11 +04:00
2018-06-21 18:49:37 +03:00
/*
* Limit the max command size to prevent iod - > sg allocations going
* over a single page .
*/
2019-07-03 19:54:44 +03:00
dev - > ctrl . max_hw_sectors = min_t ( u32 ,
NVME_MAX_KB_SZ < < 1 , dma_max_mapping_size ( dev - > dev ) > > 9 ) ;
2018-06-21 18:49:37 +03:00
dev - > ctrl . max_segments = NVME_MAX_SEGS ;
2019-06-05 22:08:24 +03:00
/*
* Don ' t limit the IOMMU merged segment size .
*/
dma_set_max_seg_size ( dev - > dev , 0xffffffff ) ;
2019-01-24 04:46:11 +03:00
mutex_unlock ( & dev - > shutdown_lock ) ;
/*
* Introduce CONNECTING state from nvme - fc / rdma transports to mark the
* initializing procedure here .
*/
if ( ! nvme_change_ctrl_state ( & dev - > ctrl , NVME_CTRL_CONNECTING ) ) {
dev_warn ( dev - > ctrl . device ,
" failed to mark controller CONNECTING \n " ) ;
2019-06-08 21:35:20 +03:00
result = - EBUSY ;
2019-01-24 04:46:11 +03:00
goto out ;
}
2018-06-21 18:49:37 +03:00
2015-10-16 08:58:46 +03:00
result = nvme_init_identify ( & dev - > ctrl ) ;
if ( result )
2016-02-24 19:15:55 +03:00
goto out ;
2015-10-16 08:58:46 +03:00
2017-02-22 20:15:07 +03:00
if ( dev - > ctrl . oacs & NVME_CTRL_OACS_SEC_SUPP ) {
if ( ! dev - > ctrl . opal_dev )
dev - > ctrl . opal_dev =
init_opal_dev ( & dev - > ctrl , & nvme_sec_submit ) ;
else if ( was_suspend )
opal_unlock_from_suspend ( dev - > ctrl . opal_dev ) ;
} else {
free_opal_dev ( dev - > ctrl . opal_dev ) ;
dev - > ctrl . opal_dev = NULL ;
2017-02-17 15:59:39 +03:00
}
2017-02-03 22:50:32 +03:00
2017-04-10 18:51:07 +03:00
if ( dev - > ctrl . oacs & NVME_CTRL_OACS_DBBUF_SUPP ) {
result = nvme_dbbuf_dma_alloc ( dev ) ;
if ( result )
dev_warn ( dev - > dev ,
" unable to allocate dma for dbbuf \n " ) ;
}
2017-09-06 13:19:57 +03:00
if ( dev - > ctrl . hmpre ) {
result = nvme_setup_host_mem ( dev ) ;
if ( result < 0 )
goto out ;
}
2017-05-12 18:02:58 +03:00
2013-07-16 01:02:21 +04:00
result = nvme_setup_io_queues ( dev ) ;
2014-06-24 00:25:35 +04:00
if ( result )
2016-02-24 19:15:55 +03:00
goto out ;
2013-07-16 01:02:21 +04:00
2015-10-02 19:51:31 +03:00
/*
* Keep the controller around but remove all namespaces if we don ' t have
* any working I / O queue .
*/
2015-10-03 10:49:23 +03:00
if ( dev - > online_queues < 2 ) {
2016-02-10 18:51:15 +03:00
dev_warn ( dev - > ctrl . device , " IO queues not created \n " ) ;
2016-04-28 00:51:18 +03:00
nvme_kill_queues ( & dev - > ctrl ) ;
2015-11-28 17:39:07 +03:00
nvme_remove_namespaces ( & dev - > ctrl ) ;
2018-01-06 03:01:58 +03:00
new_state = NVME_CTRL_ADMIN_ONLY ;
2015-10-03 10:49:23 +03:00
} else {
2016-01-04 19:10:57 +03:00
nvme_start_queues ( & dev - > ctrl ) ;
2017-03-01 22:22:12 +03:00
nvme_wait_freeze ( & dev - > ctrl ) ;
2018-01-06 03:01:58 +03:00
/* hit this only when allocate tagset fails */
if ( nvme_dev_add ( dev ) )
new_state = NVME_CTRL_ADMIN_ONLY ;
2017-03-01 22:22:12 +03:00
nvme_unfreeze ( & dev - > ctrl ) ;
2015-10-03 10:49:23 +03:00
}
2018-01-06 03:01:58 +03:00
/*
* If only admin queue live , keep it to do further investigation or
* recovery .
*/
if ( ! nvme_change_ctrl_state ( & dev - > ctrl , new_state ) ) {
dev_warn ( dev - > ctrl . device ,
" failed to mark controller state %d \n " , new_state ) ;
2019-06-08 23:01:02 +03:00
result = - ENODEV ;
2016-04-26 14:51:57 +03:00
goto out ;
}
2016-04-26 14:51:58 +03:00
2017-07-02 10:56:43 +03:00
nvme_start_ctrl ( & dev - > ctrl ) ;
2015-10-03 10:49:23 +03:00
return ;
2013-07-16 01:02:21 +04:00
2019-02-11 19:23:50 +03:00
out_unlock :
mutex_unlock ( & dev - > shutdown_lock ) ;
2015-10-03 10:49:23 +03:00
out :
2019-06-08 23:16:32 +03:00
if ( result )
dev_warn ( dev - > ctrl . device ,
" Removing after probe failure status: %d \n " , result ) ;
nvme_remove_dead_ctrl ( dev ) ;
2013-07-16 01:02:21 +04:00
}
2015-11-26 14:35:49 +03:00
static void nvme_remove_dead_ctrl_work ( struct work_struct * work )
2013-12-11 00:10:36 +04:00
{
2015-11-26 14:35:49 +03:00
struct nvme_dev * dev = container_of ( work , struct nvme_dev , remove_work ) ;
2015-05-22 12:12:39 +03:00
struct pci_dev * pdev = to_pci_dev ( dev - > dev ) ;
2013-12-11 00:10:36 +04:00
if ( pci_get_drvdata ( pdev ) )
2016-03-29 01:03:21 +03:00
device_release_driver ( & pdev - > dev ) ;
2015-11-26 12:54:19 +03:00
nvme_put_ctrl ( & dev - > ctrl ) ;
2013-12-11 00:10:36 +04:00
}
2015-11-26 12:06:56 +03:00
/*
 * ->reg_read32 callback: read the 32-bit controller register at byte offset
 * @off of the mapped BAR into *@val. Always returns 0.
 */
static int nvme_pci_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val)
{
	struct nvme_dev *dev = to_nvme_dev(ctrl);

	*val = readl(dev->bar + off);
	return 0;
}
2015-11-28 17:03:49 +03:00
/*
 * ->reg_write32 callback: write @val to the 32-bit controller register at
 * byte offset @off of the mapped BAR. Always returns 0.
 */
static int nvme_pci_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val)
{
	struct nvme_dev *dev = to_nvme_dev(ctrl);

	writel(val, dev->bar + off);
	return 0;
}
2015-06-05 19:30:08 +03:00
2015-11-28 17:37:52 +03:00
/*
 * ->reg_read64 callback: read the 64-bit controller register at byte offset
 * @off of the mapped BAR into *@val. Always returns 0.
 *
 * The register is read as two 32-bit accesses, low word first, using the
 * same lo_hi_readq() accessor that nvme_pci_enable() uses for the CAP
 * register, so every 64-bit register read in this driver behaves the same
 * way regardless of whether the platform has a native 64-bit MMIO read.
 */
static int nvme_pci_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val)
{
	*val = lo_hi_readq(to_nvme_dev(ctrl)->bar + off);
	return 0;
}
2018-03-09 00:50:32 +03:00
static int nvme_pci_get_address ( struct nvme_ctrl * ctrl , char * buf , int size )
{
struct pci_dev * pdev = to_pci_dev ( to_nvme_dev ( ctrl ) - > dev ) ;
return snprintf ( buf , size , " %s " , dev_name ( & pdev - > dev ) ) ;
}
2015-11-26 12:06:56 +03:00
static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
2016-06-13 17:45:24 +03:00
. name = " pcie " ,
2016-02-10 21:03:29 +03:00
. module = THIS_MODULE ,
2018-10-05 00:27:44 +03:00
. flags = NVME_F_METADATA_SUPPORTED |
NVME_F_PCI_P2PDMA ,
2015-11-26 12:06:56 +03:00
. reg_read32 = nvme_pci_reg_read32 ,
2015-11-28 17:03:49 +03:00
. reg_write32 = nvme_pci_reg_write32 ,
2015-11-28 17:37:52 +03:00
. reg_read64 = nvme_pci_reg_read64 ,
2015-11-26 12:54:19 +03:00
. free_ctrl = nvme_pci_free_ctrl ,
2016-04-26 14:52:00 +03:00
. submit_async_event = nvme_pci_submit_async_event ,
2018-03-09 00:50:32 +03:00
. get_address = nvme_pci_get_address ,
2015-11-26 12:06:56 +03:00
} ;
2015-06-05 19:30:08 +03:00
2016-02-24 19:15:52 +03:00
static int nvme_dev_map ( struct nvme_dev * dev )
{
struct pci_dev * pdev = to_pci_dev ( dev - > dev ) ;
2016-06-07 10:44:02 +03:00
if ( pci_request_mem_regions ( pdev , " nvme " ) )
2016-02-24 19:15:52 +03:00
return - ENODEV ;
2017-05-24 11:39:55 +03:00
if ( nvme_remap_bar ( dev , NVME_REG_DBS + 4096 ) )
2016-02-24 19:15:52 +03:00
goto release ;
2016-12-19 17:18:24 +03:00
return 0 ;
2016-02-24 19:15:52 +03:00
release :
2016-12-19 17:18:24 +03:00
pci_release_mem_regions ( pdev ) ;
return - ENODEV ;
2016-02-24 19:15:52 +03:00
}
2017-11-09 09:12:03 +03:00
static unsigned long check_vendor_combination_bug ( struct pci_dev * pdev )
2017-04-20 23:37:55 +03:00
{
if ( pdev - > vendor = = 0x144d & & pdev - > device = = 0xa802 ) {
/*
* Several Samsung devices seem to drop off the PCIe bus
* randomly when APST is on and uses the deepest sleep state .
* This has been observed on a Samsung " SM951 NVMe SAMSUNG
* 256 GB " , a " PM951 NVMe SAMSUNG 512 GB " , and a " Samsung SSD
* 950 PRO 256 GB " , but it seems to be restricted to two Dell
* laptops .
*/
if ( dmi_match ( DMI_SYS_VENDOR , " Dell Inc. " ) & &
( dmi_match ( DMI_PRODUCT_NAME , " XPS 15 9550 " ) | |
dmi_match ( DMI_PRODUCT_NAME , " Precision 5510 " ) ) )
return NVME_QUIRK_NO_DEEPEST_PS ;
2017-11-09 09:12:03 +03:00
} else if ( pdev - > vendor = = 0x144d & & pdev - > device = = 0xa804 ) {
/*
* Samsung SSD 960 EVO drops off the PCIe bus after system
2018-03-11 21:51:56 +03:00
* suspend on a Ryzen board , ASUS PRIME B350M - A , as well as
* within few minutes after bootup on a Coffee Lake board -
* ASUS PRIME Z370 - A
2017-11-09 09:12:03 +03:00
*/
if ( dmi_match ( DMI_BOARD_VENDOR , " ASUSTeK COMPUTER INC. " ) & &
2018-03-11 21:51:56 +03:00
( dmi_match ( DMI_BOARD_NAME , " PRIME B350M-A " ) | |
dmi_match ( DMI_BOARD_NAME , " PRIME Z370-A " ) ) )
2017-11-09 09:12:03 +03:00
return NVME_QUIRK_NO_APST ;
2017-04-20 23:37:55 +03:00
}
return 0 ;
}
2018-04-27 22:42:52 +03:00
/*
 * Async half of probing: wait for the reset work and then the scan work
 * queued for this controller to finish, then drop the controller reference
 * nvme_probe() took before scheduling this function.
 */
static void nvme_async_probe(void *data, async_cookie_t cookie)
{
	struct nvme_dev *ndev = data;
	struct nvme_ctrl *ctrl = &ndev->ctrl;

	flush_work(&ctrl->reset_work);
	flush_work(&ctrl->scan_work);
	nvme_put_ctrl(ctrl);
}
2012-12-22 03:13:49 +04:00
static int nvme_probe ( struct pci_dev * pdev , const struct pci_device_id * id )
2011-01-20 20:50:14 +03:00
{
2014-11-04 18:20:14 +03:00
int node , result = - ENOMEM ;
2011-01-20 20:50:14 +03:00
struct nvme_dev * dev ;
2017-04-20 23:37:55 +03:00
unsigned long quirks = id - > driver_data ;
2018-06-21 18:49:37 +03:00
size_t alloc_size ;
2011-01-20 20:50:14 +03:00
2014-11-04 18:20:14 +03:00
node = dev_to_node ( & pdev - > dev ) ;
if ( node = = NUMA_NO_NODE )
2016-06-20 03:33:17 +03:00
set_dev_node ( & pdev - > dev , first_memory_node ) ;
2014-11-04 18:20:14 +03:00
dev = kzalloc_node ( sizeof ( * dev ) , GFP_KERNEL , node ) ;
2011-01-20 20:50:14 +03:00
if ( ! dev )
return - ENOMEM ;
2018-01-14 13:39:01 +03:00
2018-10-31 17:36:31 +03:00
dev - > queues = kcalloc_node ( max_queue_count ( ) , sizeof ( struct nvme_queue ) ,
GFP_KERNEL , node ) ;
2011-01-20 20:50:14 +03:00
if ( ! dev - > queues )
goto free ;
2015-05-22 12:12:39 +03:00
dev - > dev = get_device ( & pdev - > dev ) ;
2013-12-11 00:10:36 +04:00
pci_set_drvdata ( pdev , dev ) ;
2015-11-26 12:06:56 +03:00
2016-02-24 19:15:52 +03:00
result = nvme_dev_map ( dev ) ;
if ( result )
2017-07-16 11:39:03 +03:00
goto put_pci ;
2016-02-24 19:15:52 +03:00
2017-06-15 16:41:08 +03:00
INIT_WORK ( & dev - > ctrl . reset_work , nvme_reset_work ) ;
2015-11-26 14:35:49 +03:00
INIT_WORK ( & dev - > remove_work , nvme_remove_dead_ctrl_work ) ;
2015-11-26 14:21:29 +03:00
mutex_init ( & dev - > shutdown_lock ) ;
2011-01-20 20:50:14 +03:00
2011-02-10 17:56:01 +03:00
result = nvme_setup_prp_pools ( dev ) ;
if ( result )
2017-07-16 11:39:03 +03:00
goto unmap ;
2015-06-05 19:30:08 +03:00
2017-11-09 09:12:03 +03:00
quirks | = check_vendor_combination_bug ( pdev ) ;
2017-04-20 23:37:55 +03:00
2018-06-21 18:49:37 +03:00
/*
* Double check that our mempool alloc size will cover the biggest
* command we support .
*/
alloc_size = nvme_pci_iod_alloc_size ( dev , NVME_MAX_KB_SZ ,
NVME_MAX_SEGS , true ) ;
WARN_ON_ONCE ( alloc_size > PAGE_SIZE ) ;
dev - > iod_mempool = mempool_create_node ( 1 , mempool_kmalloc ,
mempool_kfree ,
( void * ) alloc_size ,
GFP_KERNEL , node ) ;
if ( ! dev - > iod_mempool ) {
result = - ENOMEM ;
goto release_pools ;
}
2018-07-12 01:44:44 +03:00
result = nvme_init_ctrl ( & dev - > ctrl , & pdev - > dev , & nvme_pci_ctrl_ops ,
quirks ) ;
if ( result )
goto release_mempool ;
2016-02-10 18:51:15 +03:00
dev_info ( dev - > ctrl . device , " pci function %s \n " , dev_name ( & pdev - > dev ) ) ;
2019-07-30 01:34:52 +03:00
nvme_reset_ctrl ( & dev - > ctrl ) ;
2018-05-07 17:30:24 +03:00
nvme_get_ctrl ( & dev - > ctrl ) ;
2018-04-27 22:42:52 +03:00
async_schedule ( nvme_async_probe , dev ) ;
2017-12-31 15:01:19 +03:00
2011-01-20 20:50:14 +03:00
return 0 ;
2018-07-12 01:44:44 +03:00
release_mempool :
mempool_destroy ( dev - > iod_mempool ) ;
2013-07-16 01:02:19 +04:00
release_pools :
2011-02-10 17:56:01 +03:00
nvme_release_prp_pools ( dev ) ;
2017-07-16 11:39:03 +03:00
unmap :
nvme_dev_unmap ( dev ) ;
2014-08-20 05:15:59 +04:00
put_pci :
2015-05-22 12:12:39 +03:00
put_device ( dev - > dev ) ;
2011-01-20 20:50:14 +03:00
free :
kfree ( dev - > queues ) ;
kfree ( dev ) ;
return result ;
}
2017-06-01 14:10:38 +03:00
static void nvme_reset_prepare ( struct pci_dev * pdev )
2014-05-02 20:40:43 +04:00
{
2014-06-24 02:03:21 +04:00
struct nvme_dev * dev = pci_get_drvdata ( pdev ) ;
2017-07-09 01:51:57 +03:00
nvme_dev_disable ( dev , false ) ;
2017-06-01 14:10:38 +03:00
}
2014-05-02 20:40:43 +04:00
2017-06-01 14:10:38 +03:00
static void nvme_reset_done ( struct pci_dev * pdev )
{
2017-07-09 01:51:57 +03:00
struct nvme_dev * dev = pci_get_drvdata ( pdev ) ;
2018-01-14 13:39:00 +03:00
nvme_reset_ctrl_sync ( & dev - > ctrl ) ;
2014-05-02 20:40:43 +04:00
}
2014-01-27 20:29:40 +04:00
static void nvme_shutdown ( struct pci_dev * pdev )
{
struct nvme_dev * dev = pci_get_drvdata ( pdev ) ;
2016-01-13 00:41:18 +03:00
nvme_dev_disable ( dev , true ) ;
2014-01-27 20:29:40 +04:00
}
2016-02-24 19:15:55 +03:00
/*
* The driver ' s remove may be called on a device in a partially initialized
* state . This function must not have any dependencies on the device state in
* order to proceed .
*/
2012-12-22 03:13:49 +04:00
static void nvme_remove ( struct pci_dev * pdev )
2011-01-20 20:50:14 +03:00
{
struct nvme_dev * dev = pci_get_drvdata ( pdev ) ;
2013-12-11 00:10:36 +04:00
2016-04-26 14:51:57 +03:00
nvme_change_ctrl_state ( & dev - > ctrl , NVME_CTRL_DELETING ) ;
2013-12-11 00:10:36 +04:00
pci_set_drvdata ( pdev , NULL ) ;
2016-05-12 17:37:14 +03:00
2017-02-11 02:15:49 +03:00
if ( ! pci_device_is_present ( pdev ) ) {
2016-05-12 17:37:14 +03:00
nvme_change_ctrl_state ( & dev - > ctrl , NVME_CTRL_DEAD ) ;
2018-06-06 17:13:08 +03:00
nvme_dev_disable ( dev , true ) ;
2018-10-15 19:19:06 +03:00
nvme_dev_remove_admin ( dev ) ;
2017-02-11 02:15:49 +03:00
}
2016-05-12 17:37:14 +03:00
2017-06-15 16:41:08 +03:00
flush_work ( & dev - > ctrl . reset_work ) ;
2017-07-02 10:56:43 +03:00
nvme_stop_ctrl ( & dev - > ctrl ) ;
nvme_remove_namespaces ( & dev - > ctrl ) ;
2016-01-13 00:41:18 +03:00
nvme_dev_disable ( dev , true ) ;
2018-10-31 22:15:29 +03:00
nvme_release_cmb ( dev ) ;
2017-05-12 18:02:58 +03:00
nvme_free_host_mem ( dev ) ;
2014-11-04 18:20:14 +03:00
nvme_dev_remove_admin ( dev ) ;
2013-12-16 22:50:00 +04:00
nvme_free_queues ( dev , 0 ) ;
2017-07-02 10:56:43 +03:00
nvme_uninit_ctrl ( & dev - > ctrl ) ;
2013-12-11 00:10:36 +04:00
nvme_release_prp_pools ( dev ) ;
2016-02-24 19:15:52 +03:00
nvme_dev_unmap ( dev ) ;
2015-11-26 12:54:19 +03:00
nvme_put_ctrl ( & dev - > ctrl ) ;
2011-01-20 20:50:14 +03:00
}
2014-02-13 06:19:14 +04:00
# ifdef CONFIG_PM_SLEEP
2019-05-23 18:27:35 +03:00
/*
 * Query the Power Management feature of @ctrl; the feature result is stored
 * in *@ps. Returns whatever nvme_get_features() returns.
 */
static int nvme_get_power_state(struct nvme_ctrl *ctrl, u32 *ps)
{
	int ret;

	ret = nvme_get_features(ctrl, NVME_FEAT_POWER_MGMT, 0, NULL, 0, ps);
	return ret;
}
/*
 * Set the Power Management feature of @ctrl to @ps.
 * Returns whatever nvme_set_features() returns.
 */
static int nvme_set_power_state(struct nvme_ctrl *ctrl, u32 ps)
{
	int ret;

	ret = nvme_set_features(ctrl, NVME_FEAT_POWER_MGMT, ps, NULL, 0, NULL);
	return ret;
}
static int nvme_resume ( struct device * dev )
{
struct nvme_dev * ndev = pci_get_drvdata ( to_pci_dev ( dev ) ) ;
struct nvme_ctrl * ctrl = & ndev - > ctrl ;
nvme-pci: Allow PCI bus-level PM to be used if ASPM is disabled
One of the modifications made by commit d916b1be94b6 ("nvme-pci: use
host managed power state for suspend") was adding a pci_save_state()
call to nvme_suspend() so as to instruct the PCI bus type to leave
devices handled by the nvme driver in D0 during suspend-to-idle.
That was done with the assumption that ASPM would transition the
device's PCIe link into a low-power state when the device became
inactive. However, if ASPM is disabled for the device, its PCIe
link will stay in L0 and in that case commit d916b1be94b6 is likely
to cause the energy used by the system while suspended to increase.
Namely, if the device in question works in accordance with the PCIe
specification, putting it into D3hot causes its PCIe link to go to
L1 or L2/L3 Ready, which is lower-power than L0. Since the energy
used by the system while suspended depends on the state of its PCIe
link (as a general rule, the lower-power the state of the link, the
less energy the system will use), putting the device into D3hot
during suspend-to-idle should be more energy-efficient that leaving
it in D0 with disabled ASPM.
For this reason, avoid leaving NVMe devices with disabled ASPM in D0
during suspend-to-idle. Instead, shut them down entirely and let
the PCI bus type put them into D3.
Fixes: d916b1be94b6 ("nvme-pci: use host managed power state for suspend")
Link: https://lore.kernel.org/linux-pm/2763495.NmdaWeg79L@kreacher/T/#t
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Keith Busch <keith.busch@intel.com>
2019-08-09 00:58:38 +03:00
if ( ndev - > last_ps = = U32_MAX | |
2019-05-23 18:27:35 +03:00
nvme_set_power_state ( ctrl , ndev - > last_ps ) ! = 0 )
nvme_reset_ctrl ( ctrl ) ;
return 0 ;
}
2013-07-16 01:02:23 +04:00
static int nvme_suspend ( struct device * dev )
{
struct pci_dev * pdev = to_pci_dev ( dev ) ;
struct nvme_dev * ndev = pci_get_drvdata ( pdev ) ;
2019-05-23 18:27:35 +03:00
struct nvme_ctrl * ctrl = & ndev - > ctrl ;
int ret = - EBUSY ;
nvme-pci: Allow PCI bus-level PM to be used if ASPM is disabled
One of the modifications made by commit d916b1be94b6 ("nvme-pci: use
host managed power state for suspend") was adding a pci_save_state()
call to nvme_suspend() so as to instruct the PCI bus type to leave
devices handled by the nvme driver in D0 during suspend-to-idle.
That was done with the assumption that ASPM would transition the
device's PCIe link into a low-power state when the device became
inactive. However, if ASPM is disabled for the device, its PCIe
link will stay in L0 and in that case commit d916b1be94b6 is likely
to cause the energy used by the system while suspended to increase.
Namely, if the device in question works in accordance with the PCIe
specification, putting it into D3hot causes its PCIe link to go to
L1 or L2/L3 Ready, which is lower-power than L0. Since the energy
used by the system while suspended depends on the state of its PCIe
link (as a general rule, the lower-power the state of the link, the
less energy the system will use), putting the device into D3hot
during suspend-to-idle should be more energy-efficient that leaving
it in D0 with disabled ASPM.
For this reason, avoid leaving NVMe devices with disabled ASPM in D0
during suspend-to-idle. Instead, shut them down entirely and let
the PCI bus type put them into D3.
Fixes: d916b1be94b6 ("nvme-pci: use host managed power state for suspend")
Link: https://lore.kernel.org/linux-pm/2763495.NmdaWeg79L@kreacher/T/#t
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Keith Busch <keith.busch@intel.com>
2019-08-09 00:58:38 +03:00
ndev - > last_ps = U32_MAX ;
2019-05-23 18:27:35 +03:00
/*
* The platform does not remove power for a kernel managed suspend so
* use host managed nvme power settings for lowest idle power if
* possible . This should have quicker resume latency than a full device
* shutdown . But if the firmware is involved after the suspend or the
* device does not support any non - default power states , shut down the
* device fully .
nvme-pci: Allow PCI bus-level PM to be used if ASPM is disabled
One of the modifications made by commit d916b1be94b6 ("nvme-pci: use
host managed power state for suspend") was adding a pci_save_state()
call to nvme_suspend() so as to instruct the PCI bus type to leave
devices handled by the nvme driver in D0 during suspend-to-idle.
That was done with the assumption that ASPM would transition the
device's PCIe link into a low-power state when the device became
inactive. However, if ASPM is disabled for the device, its PCIe
link will stay in L0 and in that case commit d916b1be94b6 is likely
to cause the energy used by the system while suspended to increase.
Namely, if the device in question works in accordance with the PCIe
specification, putting it into D3hot causes its PCIe link to go to
L1 or L2/L3 Ready, which is lower-power than L0. Since the energy
used by the system while suspended depends on the state of its PCIe
link (as a general rule, the lower-power the state of the link, the
less energy the system will use), putting the device into D3hot
during suspend-to-idle should be more energy-efficient that leaving
it in D0 with disabled ASPM.
For this reason, avoid leaving NVMe devices with disabled ASPM in D0
during suspend-to-idle. Instead, shut them down entirely and let
the PCI bus type put them into D3.
Fixes: d916b1be94b6 ("nvme-pci: use host managed power state for suspend")
Link: https://lore.kernel.org/linux-pm/2763495.NmdaWeg79L@kreacher/T/#t
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Keith Busch <keith.busch@intel.com>
2019-08-09 00:58:38 +03:00
*
* If ASPM is not enabled for the device , shut down the device and allow
* the PCI bus layer to put it into D3 in order to take the PCIe link
* down , so as to allow the platform to achieve its minimum low - power
* state ( which may not be possible if the link is up ) .
2019-05-23 18:27:35 +03:00
*/
nvme-pci: Allow PCI bus-level PM to be used if ASPM is disabled
One of the modifications made by commit d916b1be94b6 ("nvme-pci: use
host managed power state for suspend") was adding a pci_save_state()
call to nvme_suspend() so as to instruct the PCI bus type to leave
devices handled by the nvme driver in D0 during suspend-to-idle.
That was done with the assumption that ASPM would transition the
device's PCIe link into a low-power state when the device became
inactive. However, if ASPM is disabled for the device, its PCIe
link will stay in L0 and in that case commit d916b1be94b6 is likely
to cause the energy used by the system while suspended to increase.
Namely, if the device in question works in accordance with the PCIe
specification, putting it into D3hot causes its PCIe link to go to
L1 or L2/L3 Ready, which is lower-power than L0. Since the energy
used by the system while suspended depends on the state of its PCIe
link (as a general rule, the lower-power the state of the link, the
less energy the system will use), putting the device into D3hot
during suspend-to-idle should be more energy-efficient that leaving
it in D0 with disabled ASPM.
For this reason, avoid leaving NVMe devices with disabled ASPM in D0
during suspend-to-idle. Instead, shut them down entirely and let
the PCI bus type put them into D3.
Fixes: d916b1be94b6 ("nvme-pci: use host managed power state for suspend")
Link: https://lore.kernel.org/linux-pm/2763495.NmdaWeg79L@kreacher/T/#t
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Keith Busch <keith.busch@intel.com>
2019-08-09 00:58:38 +03:00
if ( pm_suspend_via_firmware ( ) | | ! ctrl - > npss | |
2019-08-16 23:16:19 +03:00
! pcie_aspm_enabled ( pdev ) | |
( ndev - > ctrl . quirks & NVME_QUIRK_SIMPLE_SUSPEND ) ) {
2019-05-23 18:27:35 +03:00
nvme_dev_disable ( ndev , true ) ;
return 0 ;
}
nvme_start_freeze ( ctrl ) ;
nvme_wait_freeze ( ctrl ) ;
nvme_sync_queues ( ctrl ) ;
if ( ctrl - > state ! = NVME_CTRL_LIVE & &
ctrl - > state ! = NVME_CTRL_ADMIN_ONLY )
goto unfreeze ;
ret = nvme_get_power_state ( ctrl , & ndev - > last_ps ) ;
if ( ret < 0 )
goto unfreeze ;
ret = nvme_set_power_state ( ctrl , ctrl - > npss ) ;
if ( ret < 0 )
goto unfreeze ;
if ( ret ) {
/*
* Clearing npss forces a controller reset on resume . The
* correct value will be resdicovered then .
*/
nvme_dev_disable ( ndev , true ) ;
ctrl - > npss = 0 ;
ret = 0 ;
goto unfreeze ;
}
/*
* A saved state prevents pci pm from generically controlling the
* device ' s power . If we ' re using protocol specific settings , we don ' t
* want pci interfering .
*/
pci_save_state ( pdev ) ;
unfreeze :
nvme_unfreeze ( ctrl ) ;
return ret ;
}
static int nvme_simple_suspend ( struct device * dev )
{
struct nvme_dev * ndev = pci_get_drvdata ( to_pci_dev ( dev ) ) ;
2013-07-16 01:02:23 +04:00
2016-01-13 00:41:18 +03:00
nvme_dev_disable ( ndev , true ) ;
2013-07-16 01:02:23 +04:00
return 0 ;
}
2019-05-23 18:27:35 +03:00
static int nvme_simple_resume ( struct device * dev )
2013-07-16 01:02:23 +04:00
{
struct pci_dev * pdev = to_pci_dev ( dev ) ;
struct nvme_dev * ndev = pci_get_drvdata ( pdev ) ;
2017-06-15 16:41:08 +03:00
nvme_reset_ctrl ( & ndev - > ctrl ) ;
2013-12-11 00:10:36 +04:00
return 0 ;
2013-07-16 01:02:23 +04:00
}
2019-06-26 05:09:02 +03:00
static const struct dev_pm_ops nvme_dev_pm_ops = {
2019-05-23 18:27:35 +03:00
. suspend = nvme_suspend ,
. resume = nvme_resume ,
. freeze = nvme_simple_suspend ,
. thaw = nvme_simple_resume ,
. poweroff = nvme_simple_suspend ,
. restore = nvme_simple_resume ,
} ;
# endif /* CONFIG_PM_SLEEP */
2011-01-20 20:50:14 +03:00
2015-12-08 01:30:31 +03:00
/*
 * nvme_error_detected() - PCI error handler entry point.
 *
 * @pdev:  PCI device the error was reported for.
 * @state: channel state reported with the error.
 *
 * A frozen channel requires a reset: the controller is disabled here to
 * quiesce it and is restarted after the slot reset through the driver's
 * slot_reset callback.
 *
 * Return: PCI_ERS_RESULT_CAN_RECOVER for a normal channel,
 * PCI_ERS_RESULT_DISCONNECT for a permanent failure, and
 * PCI_ERS_RESULT_NEED_RESET for a frozen channel or any other state.
 */
static pci_ers_result_t nvme_error_detected(struct pci_dev *pdev,
					    pci_channel_state_t state)
{
	struct nvme_dev *ndev = pci_get_drvdata(pdev);

	if (state == pci_channel_io_normal)
		return PCI_ERS_RESULT_CAN_RECOVER;

	if (state == pci_channel_io_perm_failure) {
		dev_warn(ndev->ctrl.device,
			 " failure state error detected, request disconnect \n ");
		return PCI_ERS_RESULT_DISCONNECT;
	}

	if (state == pci_channel_io_frozen) {
		dev_warn(ndev->ctrl.device,
			 " frozen state error detected, reset controller \n ");
		nvme_dev_disable(ndev, false);
	}

	/* Frozen channel, or a state not listed above. */
	return PCI_ERS_RESULT_NEED_RESET;
}
static pci_ers_result_t nvme_slot_reset ( struct pci_dev * pdev )
{
struct nvme_dev * dev = pci_get_drvdata ( pdev ) ;
2016-02-10 18:51:15 +03:00
dev_info ( dev - > ctrl . device , " restart after slot reset \n " ) ;
2015-12-08 01:30:31 +03:00
pci_restore_state ( pdev ) ;
2017-06-15 16:41:08 +03:00
nvme_reset_ctrl ( & dev - > ctrl ) ;
2015-12-08 01:30:31 +03:00
return PCI_ERS_RESULT_RECOVERED ;
}
static void nvme_error_resume ( struct pci_dev * pdev )
{
2018-05-25 01:16:04 +03:00
struct nvme_dev * dev = pci_get_drvdata ( pdev ) ;
flush_work ( & dev - > ctrl . reset_work ) ;
2015-12-08 01:30:31 +03:00
}
2012-09-07 20:33:17 +04:00
static const struct pci_error_handlers nvme_err_handler = {
2011-01-20 20:50:14 +03:00
. error_detected = nvme_error_detected ,
. slot_reset = nvme_slot_reset ,
. resume = nvme_error_resume ,
2017-06-01 14:10:38 +03:00
. reset_prepare = nvme_reset_prepare ,
. reset_done = nvme_reset_done ,
2011-01-20 20:50:14 +03:00
} ;
2014-03-24 18:11:22 +04:00
static const struct pci_device_id nvme_id_table [ ] = {
2015-11-26 12:07:41 +03:00
{ PCI_VDEVICE ( INTEL , 0x0953 ) ,
2016-03-04 23:15:17 +03:00
. driver_data = NVME_QUIRK_STRIPE_SIZE |
2017-04-05 20:21:13 +03:00
NVME_QUIRK_DEALLOCATE_ZEROES , } ,
2016-05-03 00:14:24 +03:00
{ PCI_VDEVICE ( INTEL , 0x0a53 ) ,
. driver_data = NVME_QUIRK_STRIPE_SIZE |
2017-04-05 20:21:13 +03:00
NVME_QUIRK_DEALLOCATE_ZEROES , } ,
2016-05-03 00:14:24 +03:00
{ PCI_VDEVICE ( INTEL , 0x0a54 ) ,
. driver_data = NVME_QUIRK_STRIPE_SIZE |
2017-04-05 20:21:13 +03:00
NVME_QUIRK_DEALLOCATE_ZEROES , } ,
2017-07-10 21:39:59 +03:00
{ PCI_VDEVICE ( INTEL , 0x0a55 ) ,
. driver_data = NVME_QUIRK_STRIPE_SIZE |
NVME_QUIRK_DEALLOCATE_ZEROES , } ,
2017-05-25 01:06:31 +03:00
{ PCI_VDEVICE ( INTEL , 0xf1a5 ) , /* Intel 600P/P3100 */
2018-05-08 19:25:15 +03:00
. driver_data = NVME_QUIRK_NO_DEEPEST_PS |
NVME_QUIRK_MEDIUM_PRIO_SQ } ,
2019-01-08 20:20:51 +03:00
{ PCI_VDEVICE ( INTEL , 0xf1a6 ) , /* Intel 760p/Pro 7600p */
. driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN , } ,
2015-10-23 00:45:06 +03:00
{ PCI_VDEVICE ( INTEL , 0x5845 ) , /* Qemu emulated controller */
2019-03-13 20:55:05 +03:00
. driver_data = NVME_QUIRK_IDENTIFY_CNS |
NVME_QUIRK_DISABLE_WRITE_ZEROES , } ,
2018-04-12 22:25:25 +03:00
{ PCI_DEVICE ( 0x1bb1 , 0x0100 ) , /* Seagate Nytro Flash Storage */
. driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY , } ,
2016-06-15 00:22:41 +03:00
{ PCI_DEVICE ( 0x1c58 , 0x0003 ) , /* HGST adapter */
. driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY , } ,
2017-11-21 19:44:37 +03:00
{ PCI_DEVICE ( 0x1c58 , 0x0023 ) , /* WDC SN200 adapter */
. driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY , } ,
2016-09-08 19:12:11 +03:00
{ PCI_DEVICE ( 0x1c5f , 0x0540 ) , /* Memblaze Pblaze4 adapter */
. driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY , } ,
2017-06-28 05:27:57 +03:00
{ PCI_DEVICE ( 0x144d , 0xa821 ) , /* Samsung PM1725 */
. driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY , } ,
{ PCI_DEVICE ( 0x144d , 0xa822 ) , /* Samsung PM1725a */
. driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY , } ,
2017-09-06 12:45:24 +03:00
{ PCI_DEVICE ( 0x1d1d , 0x1f1f ) , /* LighNVM qemu device */
. driver_data = NVME_QUIRK_LIGHTNVM , } ,
{ PCI_DEVICE ( 0x1d1d , 0x2807 ) , /* CNEX WL */
. driver_data = NVME_QUIRK_LIGHTNVM , } ,
2018-04-26 23:59:19 +03:00
{ PCI_DEVICE ( 0x1d1d , 0x2601 ) , /* CNEX Granby */
. driver_data = NVME_QUIRK_LIGHTNVM , } ,
2019-07-15 10:11:49 +03:00
{ PCI_DEVICE ( 0x10ec , 0x5762 ) , /* ADATA SX6000LNP */
. driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN , } ,
2011-01-20 20:50:14 +03:00
{ PCI_DEVICE_CLASS ( PCI_CLASS_STORAGE_EXPRESS , 0xffffff ) } ,
2015-11-04 02:49:45 +03:00
{ PCI_DEVICE ( PCI_VENDOR_ID_APPLE , 0x2001 ) } ,
2017-02-23 01:17:29 +03:00
{ PCI_DEVICE ( PCI_VENDOR_ID_APPLE , 0x2003 ) } ,
2011-01-20 20:50:14 +03:00
{ 0 , }
} ;
MODULE_DEVICE_TABLE ( pci , nvme_id_table ) ;
static struct pci_driver nvme_driver = {
. name = " nvme " ,
. id_table = nvme_id_table ,
. probe = nvme_probe ,
2012-12-22 03:13:49 +04:00
. remove = nvme_remove ,
2014-01-27 20:29:40 +04:00
. shutdown = nvme_shutdown ,
2019-05-23 18:27:35 +03:00
# ifdef CONFIG_PM_SLEEP
2013-07-16 01:02:23 +04:00
. driver = {
. pm = & nvme_dev_pm_ops ,
} ,
2019-05-23 18:27:35 +03:00
# endif
2018-04-25 00:47:27 +03:00
. sriov_configure = pci_sriov_configure_simple ,
2011-01-20 20:50:14 +03:00
. err_handler = & nvme_err_handler ,
} ;
/*
 * nvme_init() - module entry point.
 *
 * Checks a few layout assumptions at build time and registers the PCI
 * driver.
 *
 * Return: the value returned by pci_register_driver().
 */
static int __init nvme_init(void)
{
	/* At least two IRQ affinity sets must be available. */
	BUILD_BUG_ON(IRQ_AFFINITY_MAX_SETS < 2);

	/* Queue management commands must each be exactly 64 bytes. */
	BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64);

	return pci_register_driver(&nvme_driver);
}
/*
 * nvme_exit() - module exit point.
 *
 * Unregisters the PCI driver first, then flushes nvme_wq so that work
 * still queued there has run before the function returns.
 */
static void __exit nvme_exit(void)
{
	pci_unregister_driver(&nvme_driver);

	/* Must follow the unregister call; do not reorder. */
	flush_workqueue(nvme_wq);
}
/* Module entry and exit points. */
module_init(nvme_init);
module_exit(nvme_exit);

/*
 * Module metadata.
 *
 * NOTE(review): the string literals below are reproduced exactly as they
 * appear in this file, including the blanks inside the quotes; confirm
 * that the padding is intended (a padded license string may not be
 * recognised as "GPL").
 */
MODULE_LICENSE(" GPL ");
MODULE_VERSION(" 1.0 ");
MODULE_AUTHOR(" Matthew Wilcox <willy@linux.intel.com> ");