2011-01-20 12:50:14 -05:00
/*
* NVM Express device driver
2014-03-24 10:11:22 -04:00
* Copyright ( c ) 2011 - 2014 , Intel Corporation .
2011-01-20 12:50:14 -05:00
*
* This program is free software ; you can redistribute it and / or modify it
* under the terms and conditions of the GNU General Public License ,
* version 2 , as published by the Free Software Foundation .
*
* This program is distributed in the hope it will be useful , but WITHOUT
* ANY WARRANTY ; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE . See the GNU General Public License for
* more details .
*/
2011-05-12 13:50:28 -04:00
# include <linux/bitops.h>
2011-01-20 12:50:14 -05:00
# include <linux/blkdev.h>
2014-11-04 08:20:14 -07:00
# include <linux/blk-mq.h>
2014-03-24 10:46:25 -06:00
# include <linux/cpu.h>
2011-05-06 08:37:54 -04:00
# include <linux/delay.h>
2011-01-20 12:50:14 -05:00
# include <linux/errno.h>
# include <linux/fs.h>
# include <linux/genhd.h>
2014-04-02 15:45:37 -06:00
# include <linux/hdreg.h>
2011-05-06 08:45:47 -04:00
# include <linux/idr.h>
2011-01-20 12:50:14 -05:00
# include <linux/init.h>
# include <linux/interrupt.h>
# include <linux/io.h>
# include <linux/kdev_t.h>
2011-03-02 18:37:18 -05:00
# include <linux/kthread.h>
2011-01-20 12:50:14 -05:00
# include <linux/kernel.h>
# include <linux/mm.h>
# include <linux/module.h>
# include <linux/moduleparam.h>
2015-11-26 12:21:29 +01:00
# include <linux/mutex.h>
2011-01-20 12:50:14 -05:00
# include <linux/pci.h>
2011-02-06 07:53:23 -05:00
# include <linux/poison.h>
2013-07-08 17:26:25 -04:00
# include <linux/ptrace.h>
2011-01-20 12:50:14 -05:00
# include <linux/sched.h>
# include <linux/slab.h>
NVMe: Metadata format support
Adds support for NVMe metadata formats and exposes block devices for
all namespaces regardless of their format. Namespace formats that are
unusable will have disk capacity set to 0, but a handle to the block
device is created to simplify device management. A namespace is not
usable when the format requires the host to interleave block data and
metadata in a single buffer, has no provisioned storage, or has metadata but
failed to register with blk integrity.
The namespace has to be scanned in two phases to support separate
metadata formats. The first establishes the sector size and capacity
prior to invoking add_disk. If metadata is required, the capacity will
be temporarily set to 0 until it can be revalidated and registered with
the integrity extensions after add_disk completes.
The driver relies on the integrity extensions to provide the metadata
buffer. NVMe requires this be a single physically contiguous region,
so only one integrity segment is allowed per command. If the metadata
is used for T10 PI, the driver provides mappings to save and restore
the reftag physical block translation. The driver provides no-op
functions for generate and verify if metadata is not used for protection
information. This way the setup is always provided by the block layer.
If a request does not supply a required metadata buffer, the command
is failed with bad address. This could only happen if a user manually
disables verify/generate on such a disk. The only exception to where
this is okay is if the controller is capable of stripping/generating
the metadata, which is possible on some types of formats.
The metadata scatter gather list now occupies the spot in the nvme_iod
that used to be used to link retryable IOD's, but we don't do that
anymore, so the field was unused.
Signed-off-by: Keith Busch <keith.busch@intel.com>
2015-02-19 13:39:03 -07:00
# include <linux/t10-pi.h>
2011-01-20 12:50:14 -05:00
# include <linux/types.h>
2015-08-28 09:27:14 +02:00
# include <linux/io-64-nonatomic-lo-hi.h>
2015-10-15 14:10:52 +02:00
# include <asm/unaligned.h>
2012-02-07 11:45:33 +09:00
2015-10-03 15:46:41 +02:00
# include "nvme.h"
2014-05-13 11:42:02 -06:00
/* Queue depth constants (I/O and admin queue respectively, going by name). */
#define NVME_Q_DEPTH		1024
#define NVME_AQ_DEPTH		256

/*
 * Size in bytes of a submission / completion queue holding @depth entries.
 * The argument is parenthesized so that an expression such as "n + 1" is
 * multiplied as a whole instead of being split by operator precedence.
 */
#define SQ_SIZE(depth)		((depth) * sizeof(struct nvme_command))
#define CQ_SIZE(depth)		((depth) * sizeof(struct nvme_completion))
2014-05-13 11:42:02 -06:00
2015-11-26 09:08:36 +01:00
unsigned char admin_timeout = 60 ;
2014-05-13 11:42:02 -06:00
module_param ( admin_timeout , byte , 0644 ) ;
MODULE_PARM_DESC ( admin_timeout , " timeout in seconds for admin commands " ) ;
2011-01-20 12:50:14 -05:00
2014-06-03 23:04:30 -04:00
unsigned char nvme_io_timeout = 30 ;
module_param_named ( io_timeout , nvme_io_timeout , byte , 0644 ) ;
2014-04-04 11:43:36 -06:00
MODULE_PARM_DESC ( io_timeout , " timeout in seconds for I/O " ) ;
2011-01-20 12:50:14 -05:00
2015-11-28 15:03:49 +01:00
unsigned char shutdown_timeout = 5 ;
2014-07-01 09:33:32 -06:00
module_param ( shutdown_timeout , byte , 0644 ) ;
MODULE_PARM_DESC ( shutdown_timeout , " timeout in seconds for controller shutdown " ) ;
2011-02-06 07:28:06 -05:00
static int use_threaded_interrupts ;
module_param ( use_threaded_interrupts , int , 0 ) ;
2015-07-20 10:14:09 -06:00
static bool use_cmb_sqes = true ;
module_param ( use_cmb_sqes , bool , 0644 ) ;
MODULE_PARM_DESC ( use_cmb_sqes , " use controller's memory buffer for I/O SQes " ) ;
2011-03-02 18:37:18 -05:00
static LIST_HEAD ( dev_list ) ;
static struct task_struct * nvme_thread ;
2013-12-10 13:10:36 -07:00
static struct workqueue_struct * nvme_workq ;
2014-04-07 17:10:11 -06:00
static wait_queue_head_t nvme_kthread_wait ;
2011-03-02 18:37:18 -05:00
2015-11-26 10:06:56 +01:00
struct nvme_dev ;
struct nvme_queue ;
2015-11-26 10:51:23 +01:00
struct nvme_iod ;
2015-11-26 10:06:56 +01:00
2015-10-02 18:49:23 +02:00
static int __nvme_reset ( struct nvme_dev * dev ) ;
2015-06-05 10:30:08 -06:00
static int nvme_reset ( struct nvme_dev * dev ) ;
2015-11-03 20:37:26 -07:00
static void nvme_process_cq ( struct nvme_queue * nvmeq ) ;
2015-11-26 10:51:23 +01:00
static void nvme_unmap_data ( struct nvme_dev * dev , struct nvme_iod * iod ) ;
2015-10-03 09:49:23 +02:00
static void nvme_dead_ctrl ( struct nvme_dev * dev ) ;
2013-12-10 13:10:37 -07:00
2013-12-10 13:10:40 -07:00
struct async_cmd_info {
struct kthread_work work ;
struct kthread_worker * worker ;
2014-11-04 08:20:14 -07:00
struct request * req ;
2013-12-10 13:10:40 -07:00
u32 result ;
int status ;
void * ctx ;
} ;
2011-03-02 18:37:18 -05:00
2015-11-26 10:06:56 +01:00
/*
 * One NVM Express device; every nvme_dev corresponds to a PCI function.
 */
struct nvme_dev {
	struct list_head	node;
	/* queues[0] backs the admin hctx, queues[i + 1] backs I/O hctx i */
	struct nvme_queue	**queues;
	struct blk_mq_tag_set	tagset;		/* I/O queues */
	struct blk_mq_tag_set	admin_tagset;	/* admin queue */
	u32 __iomem		*dbs;
	struct device		*dev;
	struct dma_pool		*prp_page_pool;
	struct dma_pool		*prp_small_pool;
	unsigned		queue_count;
	unsigned		online_queues;
	unsigned		max_qid;
	int			q_depth;
	u32			db_stride;
	struct msix_entry	*entry;
	void __iomem		*bar;
	struct work_struct	reset_work;
	struct work_struct	probe_work;
	struct work_struct	scan_work;
	struct mutex		shutdown_lock;
	bool			subsystem;
	void __iomem		*cmb;
	dma_addr_t		cmb_dma_addr;
	u64			cmb_size;
	u32			cmbsz;
	/* embedded controller; to_nvme_dev() maps it back to this struct */
	struct nvme_ctrl	ctrl;
};
/* Recover the nvme_dev that embeds @ctrl as its "ctrl" member. */
static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl)
{
	struct nvme_dev *owner = container_of(ctrl, struct nvme_dev, ctrl);

	return owner;
}
2011-01-20 12:50:14 -05:00
/*
* An NVM Express queue . Each device has at least two ( one for admin
* commands and one for I / O commands ) .
*/
struct nvme_queue {
struct device * q_dmadev ;
2011-02-10 09:56:01 -05:00
struct nvme_dev * dev ;
2014-01-27 15:57:22 -05:00
char irqname [ 24 ] ; /* nvme4294967295-65535\0 */
2011-01-20 12:50:14 -05:00
spinlock_t q_lock ;
struct nvme_command * sq_cmds ;
2015-07-20 10:14:09 -06:00
struct nvme_command __iomem * sq_cmds_io ;
2011-01-20 12:50:14 -05:00
volatile struct nvme_completion * cqes ;
2015-06-01 09:29:54 -06:00
struct blk_mq_tags * * tags ;
2011-01-20 12:50:14 -05:00
dma_addr_t sq_dma_addr ;
dma_addr_t cq_dma_addr ;
u32 __iomem * q_db ;
u16 q_depth ;
2015-01-15 15:19:10 -07:00
s16 cq_vector ;
2011-01-20 12:50:14 -05:00
u16 sq_head ;
u16 sq_tail ;
u16 cq_head ;
2013-12-10 13:10:38 -07:00
u16 qid ;
2013-06-24 11:47:34 -04:00
u8 cq_phase ;
u8 cqe_seen ;
2013-12-10 13:10:40 -07:00
struct async_cmd_info cmdinfo ;
2011-01-20 12:50:14 -05:00
} ;
2015-10-16 07:58:32 +02:00
/*
 * Describes the data of one I/O.  The PRP list pointers are stored behind
 * the scatterlist entries at byte offset @offset (see iod_list()); that
 * trailing area is not visible in the declaration, so allocate through
 * nvme_alloc_iod() to get an object that is large enough.
 */
struct nvme_iod {
	unsigned long	private;	/* For the submitter of the I/O; bit 0 flags an embedded iod */
	int		npages;		/* In the PRP list. 0 means small pool in use */
	int		offset;		/* Of PRP list */
	int		nents;		/* Used in scatterlist */
	int		length;		/* Of data, in bytes */
	dma_addr_t	first_dma;
	struct scatterlist meta_sg[1];	/* metadata requires single contiguous buffer */
	struct scatterlist sg[0];
};
2011-01-20 12:50:14 -05:00
/*
* Check we didin ' t inadvertently grow the command struct
*/
static inline void _nvme_check_size ( void )
{
BUILD_BUG_ON ( sizeof ( struct nvme_rw_command ) ! = 64 ) ;
BUILD_BUG_ON ( sizeof ( struct nvme_create_cq ) ! = 64 ) ;
BUILD_BUG_ON ( sizeof ( struct nvme_create_sq ) ! = 64 ) ;
BUILD_BUG_ON ( sizeof ( struct nvme_delete_queue ) ! = 64 ) ;
BUILD_BUG_ON ( sizeof ( struct nvme_features ) ! = 64 ) ;
2013-03-27 07:13:41 -04:00
BUILD_BUG_ON ( sizeof ( struct nvme_format_cmd ) ! = 64 ) ;
2013-12-10 13:10:38 -07:00
BUILD_BUG_ON ( sizeof ( struct nvme_abort_cmd ) ! = 64 ) ;
2011-01-20 12:50:14 -05:00
BUILD_BUG_ON ( sizeof ( struct nvme_command ) ! = 64 ) ;
BUILD_BUG_ON ( sizeof ( struct nvme_id_ctrl ) ! = 4096 ) ;
BUILD_BUG_ON ( sizeof ( struct nvme_id_ns ) ! = 4096 ) ;
BUILD_BUG_ON ( sizeof ( struct nvme_lba_range_type ) ! = 64 ) ;
2012-09-26 12:49:27 -06:00
BUILD_BUG_ON ( sizeof ( struct nvme_smart_log ) ! = 512 ) ;
2011-01-20 12:50:14 -05:00
}
2014-04-03 16:45:23 -06:00
/*
 * Completion callback: receives the queue, the context pointer stored with
 * the command and the completion queue entry.
 */
typedef void (*nvme_completion_fn)(struct nvme_queue *, void *,
				   struct nvme_completion *);
2011-02-06 18:30:16 -05:00
struct nvme_cmd_info {
2011-10-15 07:33:46 -04:00
nvme_completion_fn fn ;
void * ctx ;
2013-12-10 13:10:38 -07:00
int aborted ;
2014-11-04 08:20:14 -07:00
struct nvme_queue * nvmeq ;
2015-01-22 12:07:58 -07:00
struct nvme_iod iod [ 0 ] ;
2011-02-06 18:30:16 -05:00
} ;
2015-01-22 12:07:58 -07:00
/*
 * Upper bound for an iod that is embedded in the request payload, expressed
 * in controller pages and in bytes.  NVME_INT_MASK is the bit that marks
 * such an embedded iod in nvme_iod.private.
 */
#define NVME_INT_PAGES		2
#define NVME_INT_BYTES(dev)	(NVME_INT_PAGES * (dev)->ctrl.page_size)
#define NVME_INT_MASK		0x01
2015-01-22 12:07:58 -07:00
/*
* Will slightly overestimate the number of pages needed . This is OK
* as it only leads to a small amount of wasted memory for the lifetime of
* the I / O .
*/
static int nvme_npages ( unsigned size , struct nvme_dev * dev )
{
2015-11-28 15:03:49 +01:00
unsigned nprps = DIV_ROUND_UP ( size + dev - > ctrl . page_size ,
dev - > ctrl . page_size ) ;
2015-01-22 12:07:58 -07:00
return DIV_ROUND_UP ( 8 * nprps , PAGE_SIZE - 8 ) ;
}
static unsigned int nvme_cmd_size ( struct nvme_dev * dev )
{
unsigned int ret = sizeof ( struct nvme_cmd_info ) ;
ret + = sizeof ( struct nvme_iod ) ;
ret + = sizeof ( __le64 * ) * nvme_npages ( NVME_INT_BYTES ( dev ) , dev ) ;
ret + = sizeof ( struct scatterlist ) * NVME_INT_PAGES ;
return ret ;
}
2014-11-04 08:20:14 -07:00
static int nvme_admin_init_hctx ( struct blk_mq_hw_ctx * hctx , void * data ,
unsigned int hctx_idx )
2011-02-06 18:30:16 -05:00
{
2014-11-04 08:20:14 -07:00
struct nvme_dev * dev = data ;
struct nvme_queue * nvmeq = dev - > queues [ 0 ] ;
2015-06-01 09:29:54 -06:00
WARN_ON ( hctx_idx ! = 0 ) ;
WARN_ON ( dev - > admin_tagset . tags [ 0 ] ! = hctx - > tags ) ;
WARN_ON ( nvmeq - > tags ) ;
2014-11-04 08:20:14 -07:00
hctx - > driver_data = nvmeq ;
2015-06-01 09:29:54 -06:00
nvmeq - > tags = & dev - > admin_tagset . tags [ 0 ] ;
2014-11-04 08:20:14 -07:00
return 0 ;
2011-02-06 18:30:16 -05:00
}
2015-06-08 10:08:13 -06:00
static void nvme_admin_exit_hctx ( struct blk_mq_hw_ctx * hctx , unsigned int hctx_idx )
{
struct nvme_queue * nvmeq = hctx - > driver_data ;
nvmeq - > tags = NULL ;
}
2014-11-04 08:20:14 -07:00
static int nvme_admin_init_request ( void * data , struct request * req ,
unsigned int hctx_idx , unsigned int rq_idx ,
unsigned int numa_node )
2013-07-15 15:02:20 -06:00
{
2014-11-04 08:20:14 -07:00
struct nvme_dev * dev = data ;
struct nvme_cmd_info * cmd = blk_mq_rq_to_pdu ( req ) ;
struct nvme_queue * nvmeq = dev - > queues [ 0 ] ;
BUG_ON ( ! nvmeq ) ;
cmd - > nvmeq = nvmeq ;
return 0 ;
2013-07-15 15:02:20 -06:00
}
2014-11-04 08:20:14 -07:00
static int nvme_init_hctx ( struct blk_mq_hw_ctx * hctx , void * data ,
unsigned int hctx_idx )
2011-01-20 12:50:14 -05:00
{
2014-11-04 08:20:14 -07:00
struct nvme_dev * dev = data ;
2015-06-01 09:29:54 -06:00
struct nvme_queue * nvmeq = dev - > queues [ hctx_idx + 1 ] ;
2014-11-04 08:20:14 -07:00
2015-06-01 09:29:54 -06:00
if ( ! nvmeq - > tags )
nvmeq - > tags = & dev - > tagset . tags [ hctx_idx ] ;
2011-01-20 12:50:14 -05:00
2015-06-01 09:29:54 -06:00
WARN_ON ( dev - > tagset . tags [ hctx_idx ] ! = hctx - > tags ) ;
2014-11-04 08:20:14 -07:00
hctx - > driver_data = nvmeq ;
return 0 ;
2011-01-20 12:50:14 -05:00
}
2014-11-04 08:20:14 -07:00
static int nvme_init_request ( void * data , struct request * req ,
unsigned int hctx_idx , unsigned int rq_idx ,
unsigned int numa_node )
2011-01-20 12:50:14 -05:00
{
2014-11-04 08:20:14 -07:00
struct nvme_dev * dev = data ;
struct nvme_cmd_info * cmd = blk_mq_rq_to_pdu ( req ) ;
struct nvme_queue * nvmeq = dev - > queues [ hctx_idx + 1 ] ;
BUG_ON ( ! nvmeq ) ;
cmd - > nvmeq = nvmeq ;
return 0 ;
}
/*
 * Record the completion handler and its context for a command, clear the
 * aborted flag, and mark the owning request as started.
 */
static void nvme_set_info(struct nvme_cmd_info *cmd, void *ctx,
			  nvme_completion_fn handler)
{
	struct request *rq;

	cmd->fn = handler;
	cmd->ctx = ctx;
	cmd->aborted = 0;

	rq = blk_mq_rq_from_pdu(cmd);
	blk_mq_start_request(rq);
}
2015-01-22 12:07:58 -07:00
static void * iod_get_private ( struct nvme_iod * iod )
{
return ( void * ) ( iod - > private & ~ 0x1UL ) ;
}
/*
* If bit 0 is set , the iod is embedded in the request payload .
*/
static bool iod_should_kfree ( struct nvme_iod * iod )
{
2015-03-27 09:21:32 +08:00
return ( iod - > private & NVME_INT_MASK ) = = 0 ;
2015-01-22 12:07:58 -07:00
}
2011-10-15 07:33:46 -04:00
/*
 * Sentinel context values that mark a command as cancelled, already
 * completed or invalid.  The offsets added to the base must stay below
 * 0x1000.
 */
#define CMD_CTX_BASE		((void *)POISON_POINTER_DELTA)
#define CMD_CTX_CANCELLED	(0x30C + CMD_CTX_BASE)
#define CMD_CTX_COMPLETED	(0x310 + CMD_CTX_BASE)
#define CMD_CTX_INVALID		(0x314 + CMD_CTX_BASE)
2011-02-06 07:53:23 -05:00
2014-04-03 16:45:23 -06:00
static void special_completion ( struct nvme_queue * nvmeq , void * ctx ,
2011-10-15 07:33:46 -04:00
struct nvme_completion * cqe )
{
if ( ctx = = CMD_CTX_CANCELLED )
return ;
if ( ctx = = CMD_CTX_COMPLETED ) {
2014-04-03 16:45:23 -06:00
dev_warn ( nvmeq - > q_dmadev ,
2011-10-15 07:33:46 -04:00
" completed id %d twice on queue %d \n " ,
cqe - > command_id , le16_to_cpup ( & cqe - > sq_id ) ) ;
return ;
}
if ( ctx = = CMD_CTX_INVALID ) {
2014-04-03 16:45:23 -06:00
dev_warn ( nvmeq - > q_dmadev ,
2011-10-15 07:33:46 -04:00
" invalid id %d completed on queue %d \n " ,
cqe - > command_id , le16_to_cpup ( & cqe - > sq_id ) ) ;
return ;
}
2014-04-03 16:45:23 -06:00
dev_warn ( nvmeq - > q_dmadev , " Unknown special completion %p \n " , ctx ) ;
2011-10-15 07:33:46 -04:00
}
2014-11-04 08:20:14 -07:00
/*
 * Mark a command as cancelled.
 * @cmd: command info to cancel
 * @fn:  optional; if non-NULL it receives the handler that was installed
 *
 * Returns the context that was stored before the command was switched to
 * special_completion() / CMD_CTX_CANCELLED.
 */
static void *cancel_cmd_info(struct nvme_cmd_info *cmd, nvme_completion_fn *fn)
{
	void *prev_ctx = cmd->ctx;

	if (fn != NULL)
		*fn = cmd->fn;

	cmd->fn = special_completion;
	cmd->ctx = CMD_CTX_CANCELLED;

	return prev_ctx;
}
2014-11-04 08:20:14 -07:00
static void async_req_completion ( struct nvme_queue * nvmeq , void * ctx ,
struct nvme_completion * cqe )
2011-02-04 16:03:56 -05:00
{
2014-11-04 08:20:14 -07:00
u32 result = le32_to_cpup ( & cqe - > result ) ;
u16 status = le16_to_cpup ( & cqe - > status ) > > 1 ;
if ( status = = NVME_SC_SUCCESS | | status = = NVME_SC_ABORT_REQ )
2015-11-26 10:06:56 +01:00
+ + nvmeq - > dev - > ctrl . event_limit ;
2015-06-01 14:28:14 -06:00
if ( status ! = NVME_SC_SUCCESS )
return ;
switch ( result & 0xff07 ) {
case NVME_AER_NOTICE_NS_CHANGED :
dev_info ( nvmeq - > q_dmadev , " rescanning \n " ) ;
schedule_work ( & nvmeq - > dev - > scan_work ) ;
default :
dev_warn ( nvmeq - > q_dmadev , " async event result %08x \n " , result ) ;
}
2011-01-20 12:50:14 -05:00
}
2014-11-04 08:20:14 -07:00
static void abort_completion ( struct nvme_queue * nvmeq , void * ctx ,
struct nvme_completion * cqe )
2014-02-21 14:13:44 -07:00
{
2014-11-04 08:20:14 -07:00
struct request * req = ctx ;
u16 status = le16_to_cpup ( & cqe - > status ) > > 1 ;
u32 result = le32_to_cpup ( & cqe - > result ) ;
2014-05-13 10:32:46 -06:00
2015-06-01 09:29:54 -06:00
blk_mq_free_request ( req ) ;
2014-05-13 10:32:46 -06:00
2014-11-04 08:20:14 -07:00
dev_warn ( nvmeq - > q_dmadev , " Abort status:%x result:%x " , status , result ) ;
2015-11-26 10:06:56 +01:00
+ + nvmeq - > dev - > ctrl . abort_limit ;
2014-02-21 14:13:44 -07:00
}
2014-11-04 08:20:14 -07:00
static void async_completion ( struct nvme_queue * nvmeq , void * ctx ,
struct nvme_completion * cqe )
2011-01-20 12:50:14 -05:00
{
2014-11-04 08:20:14 -07:00
struct async_cmd_info * cmdinfo = ctx ;
cmdinfo - > result = le32_to_cpup ( & cqe - > result ) ;
cmdinfo - > status = le16_to_cpup ( & cqe - > status ) > > 1 ;
queue_kthread_work ( cmdinfo - > worker , & cmdinfo - > work ) ;
2015-06-01 09:29:54 -06:00
blk_mq_free_request ( cmdinfo - > req ) ;
2011-01-20 12:50:14 -05:00
}
2014-11-04 08:20:14 -07:00
static inline struct nvme_cmd_info * get_cmd_from_tag ( struct nvme_queue * nvmeq ,
unsigned int tag )
2011-01-20 12:50:14 -05:00
{
2015-06-01 09:29:54 -06:00
struct request * req = blk_mq_tag_to_rq ( * nvmeq - > tags , tag ) ;
2014-05-13 10:32:46 -06:00
2014-11-04 08:20:14 -07:00
return blk_mq_rq_to_pdu ( req ) ;
2014-03-03 16:39:13 -07:00
}
2014-11-04 08:20:14 -07:00
/*
 * Retire the command identified by @tag.
 * @nvmeq: queue the completion arrived on
 * @tag:   command id taken from the completion entry
 * @fn:    optional; if non-NULL it receives the handler to invoke
 *
 * Returns the context stored with the command and re-arms the command with
 * special_completion() / CMD_CTX_COMPLETED so that a second completion of
 * the same id is detected.  A tag outside the queue depth yields
 * special_completion() / CMD_CTX_INVALID.
 *
 * Called with local interrupts disabled and the q_lock held.  May not sleep.
 */
static void *nvme_finish_cmd(struct nvme_queue *nvmeq, int tag,
			     nvme_completion_fn *fn)
{
	struct nvme_cmd_info *cmd;
	void *ctx;

	/*
	 * Validate the tag before it is used for the lookup; an out-of-range
	 * value must never be used to index the tag set.
	 */
	if (tag < 0 || tag >= nvmeq->q_depth) {
		if (fn)
			*fn = special_completion;
		return CMD_CTX_INVALID;
	}

	cmd = get_cmd_from_tag(nvmeq, tag);
	if (fn)
		*fn = cmd->fn;
	ctx = cmd->ctx;
	cmd->fn = special_completion;
	cmd->ctx = CMD_CTX_COMPLETED;
	return ctx;
}
/**
2011-03-16 16:28:24 -04:00
* nvme_submit_cmd ( ) - Copy a command into a queue and ring the doorbell
2011-01-20 12:50:14 -05:00
* @ nvmeq : The queue to use
* @ cmd : The command to send
*
* Safe to use from interrupt context
*/
2015-07-31 18:56:58 +05:30
static void __nvme_submit_cmd ( struct nvme_queue * nvmeq ,
struct nvme_command * cmd )
2011-01-20 12:50:14 -05:00
{
2014-11-04 08:20:14 -07:00
u16 tail = nvmeq - > sq_tail ;
2015-07-20 10:14:09 -06:00
if ( nvmeq - > sq_cmds_io )
memcpy_toio ( & nvmeq - > sq_cmds_io [ tail ] , cmd , sizeof ( * cmd ) ) ;
else
memcpy ( & nvmeq - > sq_cmds [ tail ] , cmd , sizeof ( * cmd ) ) ;
2011-01-20 12:50:14 -05:00
if ( + + tail = = nvmeq - > q_depth )
tail = 0 ;
2011-02-16 09:59:59 -05:00
writel ( tail , nvmeq - > q_db ) ;
2011-01-20 12:50:14 -05:00
nvmeq - > sq_tail = tail ;
}
2015-07-31 18:56:58 +05:30
static void nvme_submit_cmd ( struct nvme_queue * nvmeq , struct nvme_command * cmd )
2014-11-04 08:20:14 -07:00
{
unsigned long flags ;
spin_lock_irqsave ( & nvmeq - > q_lock , flags ) ;
2015-07-31 18:56:58 +05:30
__nvme_submit_cmd ( nvmeq , cmd ) ;
2014-11-04 08:20:14 -07:00
spin_unlock_irqrestore ( & nvmeq - > q_lock , flags ) ;
}
2011-12-20 13:34:52 -05:00
static __le64 * * iod_list ( struct nvme_iod * iod )
2011-02-10 08:51:24 -05:00
{
2011-12-20 13:34:52 -05:00
return ( ( void * ) iod ) + iod - > offset ;
2011-02-10 08:51:24 -05:00
}
2015-01-22 12:07:58 -07:00
static inline void iod_init ( struct nvme_iod * iod , unsigned nbytes ,
unsigned nseg , unsigned long private )
2011-12-20 13:34:52 -05:00
{
2015-01-22 12:07:58 -07:00
iod - > private = private ;
iod - > offset = offsetof ( struct nvme_iod , sg [ nseg ] ) ;
iod - > npages = - 1 ;
iod - > length = nbytes ;
iod - > nents = 0 ;
2011-12-20 13:34:52 -05:00
}
2011-01-20 12:50:14 -05:00
2011-12-20 13:34:52 -05:00
static struct nvme_iod *
2015-01-22 12:07:58 -07:00
__nvme_alloc_iod ( unsigned nseg , unsigned bytes , struct nvme_dev * dev ,
unsigned long priv , gfp_t gfp )
2011-01-20 12:50:14 -05:00
{
2011-12-20 13:34:52 -05:00
struct nvme_iod * iod = kmalloc ( sizeof ( struct nvme_iod ) +
2015-01-22 12:07:58 -07:00
sizeof ( __le64 * ) * nvme_npages ( bytes , dev ) +
2011-12-20 13:34:52 -05:00
sizeof ( struct scatterlist ) * nseg , gfp ) ;
2015-01-22 12:07:58 -07:00
if ( iod )
iod_init ( iod , bytes , nseg , priv ) ;
2011-12-20 13:34:52 -05:00
return iod ;
2011-01-20 12:50:14 -05:00
}
2015-01-22 12:07:58 -07:00
/*
 * Get an iod for @rq.  A request that fits within NVME_INT_PAGES segments
 * and NVME_INT_BYTES(dev) bytes uses the iod embedded in its payload (bit
 * NVME_INT_MASK set in the private word); anything larger is allocated.
 * A discard is sized as one struct nvme_dsm_range.
 *
 * Returns the iod, or NULL if an allocation was needed and failed.
 */
static struct nvme_iod *nvme_alloc_iod(struct request *rq, struct nvme_dev *dev,
				       gfp_t gfp)
{
	struct nvme_cmd_info *cmd;
	struct nvme_iod *iod;
	unsigned size;

	if (rq->cmd_flags & REQ_DISCARD)
		size = sizeof(struct nvme_dsm_range);
	else
		size = blk_rq_bytes(rq);

	if (rq->nr_phys_segments > NVME_INT_PAGES ||
	    size > NVME_INT_BYTES(dev))
		return __nvme_alloc_iod(rq->nr_phys_segments, size, dev,
					(unsigned long) rq, gfp);

	cmd = blk_mq_rq_to_pdu(rq);
	iod = cmd->iod;
	iod_init(iod, size, rq->nr_phys_segments,
		 (unsigned long) rq | NVME_INT_MASK);
	return iod;
}
2015-05-22 11:12:46 +02:00
static void nvme_free_iod ( struct nvme_dev * dev , struct nvme_iod * iod )
2011-01-20 12:50:14 -05:00
{
2015-11-28 15:03:49 +01:00
const int last_prp = dev - > ctrl . page_size / 8 - 1 ;
2011-12-20 13:34:52 -05:00
int i ;
__le64 * * list = iod_list ( iod ) ;
dma_addr_t prp_dma = iod - > first_dma ;
if ( iod - > npages = = 0 )
dma_pool_free ( dev - > prp_small_pool , list [ 0 ] , prp_dma ) ;
for ( i = 0 ; i < iod - > npages ; i + + ) {
__le64 * prp_list = list [ i ] ;
dma_addr_t next_prp_dma = le64_to_cpu ( prp_list [ last_prp ] ) ;
dma_pool_free ( dev - > prp_page_pool , prp_list , prp_dma ) ;
prp_dma = next_prp_dma ;
}
2015-01-22 12:07:58 -07:00
if ( iod_should_kfree ( iod ) )
kfree ( iod ) ;
2011-01-20 12:50:14 -05:00
}
2015-02-23 09:16:21 -07:00
# ifdef CONFIG_BLK_DEV_INTEGRITY
NVMe: Metadata format support
Adds support for NVMe metadata formats and exposes block devices for
all namespaces regardless of their format. Namespace formats that are
unusable will have disk capacity set to 0, but a handle to the block
device is created to simplify device management. A namespace is not
usable when the format requires the host to interleave block data and
metadata in a single buffer, has no provisioned storage, or has metadata but
failed to register with blk integrity.
The namespace has to be scanned in two phases to support separate
metadata formats. The first establishes the sector size and capacity
prior to invoking add_disk. If metadata is required, the capacity will
be temporarily set to 0 until it can be revalidated and registered with
the integrity extensions after add_disk completes.
The driver relies on the integrity extensions to provide the metadata
buffer. NVMe requires this be a single physically contiguous region,
so only one integrity segment is allowed per command. If the metadata
is used for T10 PI, the driver provides mappings to save and restore
the reftag physical block translation. The driver provides no-op
functions for generate and verify if metadata is not used for protection
information. This way the setup is always provided by the block layer.
If a request does not supply a required metadata buffer, the command
is failed with bad address. This could only happen if a user manually
disables verify/generate on such a disk. The only exception to where
this is okay is if the controller is capable of stripping/generating
the metadata, which is possible on some types of formats.
The metadata scatter gather list now occupies the spot in the nvme_iod
that used to be used to link retryable IOD's, but we don't do that
anymore, so the field was unused.
Signed-off-by: Keith Busch <keith.busch@intel.com>
2015-02-19 13:39:03 -07:00
/* If the tuple's ref tag equals the virtual value @v, rewrite it to @p. */
static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi)
{
	if (be32_to_cpu(pi->ref_tag) != v)
		return;

	pi->ref_tag = cpu_to_be32(p);
}
/* If the tuple's ref tag equals the physical value @p, rewrite it to @v. */
static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi)
{
	if (be32_to_cpu(pi->ref_tag) != p)
		return;

	pi->ref_tag = cpu_to_be32(v);
}
/**
 * nvme_dif_remap - remaps ref tags to bip seed and physical lba
 * @req: request whose integrity payload is walked
 * @dif_swap: per-tuple callback, called as dif_swap(phys, virt, tuple)
 *
 * The virtual start sector is the one the block layer originally submitted.
 * Partitioning, MD/DM cloning and the like can make the physical start
 * sector differ from it, so protection information is remapped to the
 * physical LBA on writes and back to the original seed on reads.
 *
 * Type 0 and 3 do not have a ref tag, so no remapping required.
 */
static void nvme_dif_remap(struct request *req,
			void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi))
{
	struct nvme_ns *ns = req->rq_disk->private_data;
	struct bio_integrity_payload *bip;
	void *mapped, *cursor;
	u32 remaining, tuple_sz, phys, virt;

	if (!ns->pi_type || ns->pi_type == NVME_NS_DPS_PI_TYPE3)
		return;

	bip = bio_integrity(req->bio);
	if (!bip)
		return;

	mapped = kmap_atomic(bip->bip_vec->bv_page) + bip->bip_vec->bv_offset;
	cursor = mapped;

	virt = bip_get_seed(bip);
	phys = nvme_block_nr(ns, blk_rq_pos(req));
	remaining = (blk_rq_bytes(req) >> ns->lba_shift);
	tuple_sz = ns->disk->queue->integrity.tuple_size;

	/* one tuple per logical block; both tags advance in lock step */
	while (remaining) {
		dif_swap(phys, virt, (struct t10_pi_tuple *)cursor);
		cursor += tuple_sz;
		virt++;
		phys++;
		remaining--;
	}

	kunmap_atomic(mapped);
}
2015-02-23 09:16:21 -07:00
# else /* CONFIG_BLK_DEV_INTEGRITY */
/*
 * No-op versions of the three helpers, used when CONFIG_BLK_DEV_INTEGRITY
 * is not set.
 */
static void nvme_dif_remap(struct request *req,
			void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi))
{
	/* intentionally empty */
}

static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi)
{
	/* intentionally empty */
}

static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi)
{
	/* intentionally empty */
}
# endif
2014-11-04 08:20:14 -07:00
static void req_completion ( struct nvme_queue * nvmeq , void * ctx ,
2011-01-20 12:50:14 -05:00
struct nvme_completion * cqe )
{
2011-12-20 13:34:52 -05:00
struct nvme_iod * iod = ctx ;
2015-01-22 12:07:58 -07:00
struct request * req = iod_get_private ( iod ) ;
2014-11-04 08:20:14 -07:00
struct nvme_cmd_info * cmd_rq = blk_mq_rq_to_pdu ( req ) ;
2011-01-20 12:50:14 -05:00
u16 status = le16_to_cpup ( & cqe - > status ) > > 1 ;
2015-10-12 21:23:39 +02:00
int error = 0 ;
2011-01-20 12:50:14 -05:00
2014-04-03 16:45:23 -06:00
if ( unlikely ( status ) ) {
2014-11-04 08:20:14 -07:00
if ( ! ( status & NVME_SC_DNR | | blk_noretry_request ( req ) )
& & ( jiffies - req - > start_time ) < req - > timeout ) {
2015-01-07 18:55:52 -07:00
unsigned long flags ;
2015-11-26 10:51:23 +01:00
nvme_unmap_data ( nvmeq - > dev , iod ) ;
2014-11-04 08:20:14 -07:00
blk_mq_requeue_request ( req ) ;
2015-01-07 18:55:52 -07:00
spin_lock_irqsave ( req - > q - > queue_lock , flags ) ;
if ( ! blk_queue_stopped ( req - > q ) )
blk_mq_kick_requeue_list ( req - > q ) ;
spin_unlock_irqrestore ( req - > q - > queue_lock , flags ) ;
2015-11-26 10:51:23 +01:00
return ;
2014-04-03 16:45:23 -06:00
}
2015-09-27 21:01:50 +02:00
2015-05-22 11:12:46 +02:00
if ( req - > cmd_type = = REQ_TYPE_DRV_PRIV ) {
2015-06-08 10:08:14 -06:00
if ( cmd_rq - > ctx = = CMD_CTX_CANCELLED )
2015-10-12 21:23:39 +02:00
error = - EINTR ;
else
error = status ;
2015-05-22 11:12:46 +02:00
} else {
2015-10-12 21:23:39 +02:00
error = nvme_error_status ( status ) ;
2015-05-22 11:12:46 +02:00
}
2015-09-27 21:01:50 +02:00
}
2015-05-22 12:28:31 -06:00
if ( req - > cmd_type = = REQ_TYPE_DRV_PRIV ) {
u32 result = le32_to_cpup ( & cqe - > result ) ;
req - > special = ( void * ) ( uintptr_t ) result ;
}
2014-11-04 08:20:14 -07:00
if ( cmd_rq - > aborted )
2015-05-22 11:12:39 +02:00
dev_warn ( nvmeq - > dev - > dev ,
2014-11-04 08:20:14 -07:00
" completing aborted command with status:%04x \n " ,
2015-10-12 21:23:39 +02:00
error ) ;
2014-11-04 08:20:14 -07:00
2015-11-26 10:51:23 +01:00
nvme_unmap_data ( nvmeq - > dev , iod ) ;
blk_mq_complete_request ( req , error ) ;
2011-01-20 12:50:14 -05:00
}
2015-10-16 07:58:37 +02:00
static bool nvme_setup_prps ( struct nvme_dev * dev , struct nvme_iod * iod ,
int total_len )
2011-01-26 10:02:29 -05:00
{
2011-02-10 10:30:34 -05:00
struct dma_pool * pool ;
2011-12-20 13:34:52 -05:00
int length = total_len ;
struct scatterlist * sg = iod - > sg ;
2011-01-26 10:02:29 -05:00
int dma_len = sg_dma_len ( sg ) ;
u64 dma_addr = sg_dma_address ( sg ) ;
2015-11-28 15:03:49 +01:00
u32 page_size = dev - > ctrl . page_size ;
2015-03-26 11:07:51 -05:00
int offset = dma_addr & ( page_size - 1 ) ;
2011-02-10 08:51:24 -05:00
__le64 * prp_list ;
2011-12-20 13:34:52 -05:00
__le64 * * list = iod_list ( iod ) ;
2011-02-10 08:51:24 -05:00
dma_addr_t prp_dma ;
2011-12-20 13:34:52 -05:00
int nprps , i ;
2011-01-26 10:02:29 -05:00
2014-06-23 11:34:01 -06:00
length - = ( page_size - offset ) ;
2011-01-26 10:02:29 -05:00
if ( length < = 0 )
2015-10-16 07:58:37 +02:00
return true ;
2011-01-26 10:02:29 -05:00
2014-06-23 11:34:01 -06:00
dma_len - = ( page_size - offset ) ;
2011-01-26 10:02:29 -05:00
if ( dma_len ) {
2014-06-23 11:34:01 -06:00
dma_addr + = ( page_size - offset ) ;
2011-01-26 10:02:29 -05:00
} else {
sg = sg_next ( sg ) ;
dma_addr = sg_dma_address ( sg ) ;
dma_len = sg_dma_len ( sg ) ;
}
2014-06-23 11:34:01 -06:00
if ( length < = page_size ) {
2014-04-03 16:45:23 -06:00
iod - > first_dma = dma_addr ;
2015-10-16 07:58:37 +02:00
return true ;
2011-02-10 08:51:24 -05:00
}
2014-06-23 11:34:01 -06:00
nprps = DIV_ROUND_UP ( length , page_size ) ;
2011-02-10 10:30:34 -05:00
if ( nprps < = ( 256 / 8 ) ) {
pool = dev - > prp_small_pool ;
2011-12-20 13:34:52 -05:00
iod - > npages = 0 ;
2011-02-10 10:30:34 -05:00
} else {
pool = dev - > prp_page_pool ;
2011-12-20 13:34:52 -05:00
iod - > npages = 1 ;
2011-02-10 10:30:34 -05:00
}
2015-10-16 07:58:37 +02:00
prp_list = dma_pool_alloc ( pool , GFP_ATOMIC , & prp_dma ) ;
2011-05-12 13:51:41 -04:00
if ( ! prp_list ) {
2014-04-03 16:45:23 -06:00
iod - > first_dma = dma_addr ;
2011-12-20 13:34:52 -05:00
iod - > npages = - 1 ;
2015-10-16 07:58:37 +02:00
return false ;
2011-05-12 13:51:41 -04:00
}
2011-12-20 13:34:52 -05:00
list [ 0 ] = prp_list ;
iod - > first_dma = prp_dma ;
2011-02-10 08:51:24 -05:00
i = 0 ;
for ( ; ; ) {
2014-06-23 11:34:01 -06:00
if ( i = = page_size > > 3 ) {
2011-02-10 08:51:24 -05:00
__le64 * old_prp_list = prp_list ;
2015-10-16 07:58:37 +02:00
prp_list = dma_pool_alloc ( pool , GFP_ATOMIC , & prp_dma ) ;
2011-12-20 13:34:52 -05:00
if ( ! prp_list )
2015-10-16 07:58:37 +02:00
return false ;
2011-12-20 13:34:52 -05:00
list [ iod - > npages + + ] = prp_list ;
2011-03-16 16:43:40 -04:00
prp_list [ 0 ] = old_prp_list [ i - 1 ] ;
old_prp_list [ i - 1 ] = cpu_to_le64 ( prp_dma ) ;
i = 1 ;
2011-02-10 08:51:24 -05:00
}
prp_list [ i + + ] = cpu_to_le64 ( dma_addr ) ;
2014-06-23 11:34:01 -06:00
dma_len - = page_size ;
dma_addr + = page_size ;
length - = page_size ;
2011-02-10 08:51:24 -05:00
if ( length < = 0 )
break ;
if ( dma_len > 0 )
continue ;
BUG_ON ( dma_len < 0 ) ;
sg = sg_next ( sg ) ;
dma_addr = sg_dma_address ( sg ) ;
dma_len = sg_dma_len ( sg ) ;
2011-01-26 10:02:29 -05:00
}
2015-10-16 07:58:37 +02:00
return true ;
2011-01-26 10:02:29 -05:00
}
2015-10-16 07:58:38 +02:00
static int nvme_map_data ( struct nvme_dev * dev , struct nvme_iod * iod ,
struct nvme_command * cmnd )
2015-05-22 11:12:46 +02:00
{
2015-10-16 07:58:38 +02:00
struct request * req = iod_get_private ( iod ) ;
struct request_queue * q = req - > q ;
enum dma_data_direction dma_dir = rq_data_dir ( req ) ?
DMA_TO_DEVICE : DMA_FROM_DEVICE ;
int ret = BLK_MQ_RQ_QUEUE_ERROR ;
sg_init_table ( iod - > sg , req - > nr_phys_segments ) ;
iod - > nents = blk_rq_map_sg ( q , req , iod - > sg ) ;
if ( ! iod - > nents )
goto out ;
ret = BLK_MQ_RQ_QUEUE_BUSY ;
if ( ! dma_map_sg ( dev - > dev , iod - > sg , iod - > nents , dma_dir ) )
goto out ;
if ( ! nvme_setup_prps ( dev , iod , blk_rq_bytes ( req ) ) )
goto out_unmap ;
ret = BLK_MQ_RQ_QUEUE_ERROR ;
if ( blk_integrity_rq ( req ) ) {
if ( blk_rq_count_integrity_sg ( q , req - > bio ) ! = 1 )
goto out_unmap ;
sg_init_table ( iod - > meta_sg , 1 ) ;
if ( blk_rq_map_integrity_sg ( q , req - > bio , iod - > meta_sg ) ! = 1 )
goto out_unmap ;
2015-05-22 11:12:46 +02:00
2015-10-16 07:58:38 +02:00
if ( rq_data_dir ( req ) )
nvme_dif_remap ( req , nvme_dif_prep ) ;
if ( ! dma_map_sg ( dev - > dev , iod - > meta_sg , 1 , dma_dir ) )
goto out_unmap ;
2015-05-22 11:12:46 +02:00
}
2015-10-16 07:58:38 +02:00
cmnd - > rw . prp1 = cpu_to_le64 ( sg_dma_address ( iod - > sg ) ) ;
cmnd - > rw . prp2 = cpu_to_le64 ( iod - > first_dma ) ;
if ( blk_integrity_rq ( req ) )
cmnd - > rw . metadata = cpu_to_le64 ( sg_dma_address ( iod - > meta_sg ) ) ;
return BLK_MQ_RQ_QUEUE_OK ;
out_unmap :
dma_unmap_sg ( dev - > dev , iod - > sg , iod - > nents , dma_dir ) ;
out :
return ret ;
2015-05-22 11:12:46 +02:00
}
2015-11-26 10:51:23 +01:00
static void nvme_unmap_data ( struct nvme_dev * dev , struct nvme_iod * iod )
{
struct request * req = iod_get_private ( iod ) ;
enum dma_data_direction dma_dir = rq_data_dir ( req ) ?
DMA_TO_DEVICE : DMA_FROM_DEVICE ;
if ( iod - > nents ) {
dma_unmap_sg ( dev - > dev , iod - > sg , iod - > nents , dma_dir ) ;
if ( blk_integrity_rq ( req ) ) {
if ( ! rq_data_dir ( req ) )
nvme_dif_remap ( req , nvme_dif_complete ) ;
dma_unmap_sg ( dev - > dev , iod - > meta_sg , 1 , dma_dir ) ;
}
}
nvme_free_iod ( dev , iod ) ;
}
2014-11-04 08:20:14 -07:00
/*
* We reuse the small pool to allocate the 16 - byte range here as it is not
* worth having a special pool for these or additional cases to handle freeing
* the iod .
*/
2015-10-16 07:58:38 +02:00
static int nvme_setup_discard ( struct nvme_queue * nvmeq , struct nvme_ns * ns ,
struct nvme_iod * iod , struct nvme_command * cmnd )
2012-11-09 16:33:05 -07:00
{
2015-10-16 07:58:38 +02:00
struct request * req = iod_get_private ( iod ) ;
struct nvme_dsm_range * range ;
range = dma_pool_alloc ( nvmeq - > dev - > prp_small_pool , GFP_ATOMIC ,
& iod - > first_dma ) ;
if ( ! range )
return BLK_MQ_RQ_QUEUE_BUSY ;
iod_list ( iod ) [ 0 ] = ( __le64 * ) range ;
iod - > npages = 0 ;
2012-11-09 16:33:05 -07:00
range - > cattr = cpu_to_le32 ( 0 ) ;
2014-11-04 08:20:14 -07:00
range - > nlb = cpu_to_le32 ( blk_rq_bytes ( req ) > > ns - > lba_shift ) ;
range - > slba = cpu_to_le64 ( nvme_block_nr ( ns , blk_rq_pos ( req ) ) ) ;
2012-11-09 16:33:05 -07:00
2015-10-16 07:58:38 +02:00
memset ( cmnd , 0 , sizeof ( * cmnd ) ) ;
cmnd - > dsm . opcode = nvme_cmd_dsm ;
cmnd - > dsm . nsid = cpu_to_le32 ( ns - > ns_id ) ;
cmnd - > dsm . prp1 = cpu_to_le64 ( iod - > first_dma ) ;
cmnd - > dsm . nr = 0 ;
cmnd - > dsm . attributes = cpu_to_le32 ( NVME_DSMGMT_AD ) ;
return BLK_MQ_RQ_QUEUE_OK ;
2012-11-09 16:33:05 -07:00
}
2015-05-22 11:12:46 +02:00
/*
* NOTE : ns is NULL when called on the admin queue .
*/
2014-11-04 08:20:14 -07:00
static int nvme_queue_rq ( struct blk_mq_hw_ctx * hctx ,
const struct blk_mq_queue_data * bd )
2014-04-03 16:45:23 -06:00
{
2014-11-04 08:20:14 -07:00
struct nvme_ns * ns = hctx - > queue - > queuedata ;
struct nvme_queue * nvmeq = hctx - > driver_data ;
2015-05-22 11:12:46 +02:00
struct nvme_dev * dev = nvmeq - > dev ;
2014-11-04 08:20:14 -07:00
struct request * req = bd - > rq ;
struct nvme_cmd_info * cmd = blk_mq_rq_to_pdu ( req ) ;
2014-04-03 16:45:23 -06:00
struct nvme_iod * iod ;
2015-10-16 07:58:38 +02:00
struct nvme_command cmnd ;
int ret = BLK_MQ_RQ_QUEUE_OK ;
2014-04-03 16:45:23 -06:00
NVMe: Metadata format support
Adds support for NVMe metadata formats and exposes block devices for
all namespaces regardless of their format. Namespace formats that are
unusable will have disk capacity set to 0, but a handle to the block
device is created to simplify device management. A namespace is not
usable when the format requires host interleave block and metadata in
single buffer, has no provisioned storage, or has better data but failed
to register with blk integrity.
The namespace has to be scanned in two phases to support separate
metadata formats. The first establishes the sector size and capacity
prior to invoking add_disk. If metadata is required, the capacity will
be temporarilly set to 0 until it can be revalidated and registered with
the integrity extenstions after add_disk completes.
The driver relies on the integrity extensions to provide the metadata
buffer. NVMe requires this be a single physically contiguous region,
so only one integrity segment is allowed per command. If the metadata
is used for T10 PI, the driver provides mappings to save and restore
the reftag physical block translation. The driver provides no-op
functions for generate and verify if metadata is not used for protection
information. This way the setup is always provided by the block layer.
If a request does not supply a required metadata buffer, the command
is failed with bad address. This could only happen if a user manually
disables verify/generate on such a disk. The only exception to where
this is okay is if the controller is capable of stripping/generating
the metadata, which is possible on some types of formats.
The metadata scatter gather list now occupies the spot in the nvme_iod
that used to be used to link retryable IOD's, but we don't do that
anymore, so the field was unused.
Signed-off-by: Keith Busch <keith.busch@intel.com>
2015-02-19 13:39:03 -07:00
/*
* If formated with metadata , require the block layer provide a buffer
* unless this namespace is formated such that the metadata can be
* stripped / generated by the controller with PRACT = 1.
*/
2015-05-22 11:12:46 +02:00
if ( ns & & ns - > ms & & ! blk_integrity_rq ( req ) ) {
2015-06-19 11:07:30 -06:00
if ( ! ( ns - > pi_type & & ns - > ms = = 8 ) & &
req - > cmd_type ! = REQ_TYPE_DRV_PRIV ) {
2015-09-27 21:01:50 +02:00
blk_mq_complete_request ( req , - EFAULT ) ;
NVMe: Metadata format support
Adds support for NVMe metadata formats and exposes block devices for
all namespaces regardless of their format. Namespace formats that are
unusable will have disk capacity set to 0, but a handle to the block
device is created to simplify device management. A namespace is not
usable when the format requires host interleave block and metadata in
single buffer, has no provisioned storage, or has better data but failed
to register with blk integrity.
The namespace has to be scanned in two phases to support separate
metadata formats. The first establishes the sector size and capacity
prior to invoking add_disk. If metadata is required, the capacity will
be temporarilly set to 0 until it can be revalidated and registered with
the integrity extenstions after add_disk completes.
The driver relies on the integrity extensions to provide the metadata
buffer. NVMe requires this be a single physically contiguous region,
so only one integrity segment is allowed per command. If the metadata
is used for T10 PI, the driver provides mappings to save and restore
the reftag physical block translation. The driver provides no-op
functions for generate and verify if metadata is not used for protection
information. This way the setup is always provided by the block layer.
If a request does not supply a required metadata buffer, the command
is failed with bad address. This could only happen if a user manually
disables verify/generate on such a disk. The only exception to where
this is okay is if the controller is capable of stripping/generating
the metadata, which is possible on some types of formats.
The metadata scatter gather list now occupies the spot in the nvme_iod
that used to be used to link retryable IOD's, but we don't do that
anymore, so the field was unused.
Signed-off-by: Keith Busch <keith.busch@intel.com>
2015-02-19 13:39:03 -07:00
return BLK_MQ_RQ_QUEUE_OK ;
}
}
2015-05-22 11:12:46 +02:00
iod = nvme_alloc_iod ( req , dev , GFP_ATOMIC ) ;
2014-04-03 16:45:23 -06:00
if ( ! iod )
2014-12-11 13:58:39 -07:00
return BLK_MQ_RQ_QUEUE_BUSY ;
2014-11-04 08:20:14 -07:00
if ( req - > cmd_flags & REQ_DISCARD ) {
2015-10-16 07:58:38 +02:00
ret = nvme_setup_discard ( nvmeq , ns , iod , & cmnd ) ;
} else {
if ( req - > cmd_type = = REQ_TYPE_DRV_PRIV )
memcpy ( & cmnd , req - > cmd , sizeof ( cmnd ) ) ;
else if ( req - > cmd_flags & REQ_FLUSH )
nvme_setup_flush ( ns , & cmnd ) ;
else
nvme_setup_rw ( ns , req , & cmnd ) ;
2014-11-04 08:20:14 -07:00
2015-10-16 07:58:38 +02:00
if ( req - > nr_phys_segments )
ret = nvme_map_data ( dev , iod , & cmnd ) ;
2014-04-03 16:45:23 -06:00
}
2011-02-10 12:01:09 -05:00
2015-10-16 07:58:38 +02:00
if ( ret )
goto out ;
cmnd . common . command_id = req - > tag ;
2014-12-03 17:07:13 -07:00
nvme_set_info ( cmd , iod , req_completion ) ;
2014-11-04 08:20:14 -07:00
2015-10-16 07:58:38 +02:00
spin_lock_irq ( & nvmeq - > q_lock ) ;
__nvme_submit_cmd ( nvmeq , & cmnd ) ;
2014-11-04 08:20:14 -07:00
nvme_process_cq ( nvmeq ) ;
spin_unlock_irq ( & nvmeq - > q_lock ) ;
return BLK_MQ_RQ_QUEUE_OK ;
2015-10-16 07:58:38 +02:00
out :
2015-05-22 11:12:46 +02:00
nvme_free_iod ( dev , iod ) ;
2015-10-16 07:58:38 +02:00
return ret ;
2011-01-20 12:50:14 -05:00
}
2015-11-03 20:37:26 -07:00
static void __nvme_process_cq ( struct nvme_queue * nvmeq , unsigned int * tag )
2011-01-20 12:50:14 -05:00
{
2011-01-20 13:24:06 -05:00
u16 head , phase ;
2011-01-20 12:50:14 -05:00
head = nvmeq - > cq_head ;
2011-01-20 13:24:06 -05:00
phase = nvmeq - > cq_phase ;
2011-01-20 12:50:14 -05:00
for ( ; ; ) {
2011-10-15 07:33:46 -04:00
void * ctx ;
nvme_completion_fn fn ;
2011-01-20 12:50:14 -05:00
struct nvme_completion cqe = nvmeq - > cqes [ head ] ;
2011-01-20 13:24:06 -05:00
if ( ( le16_to_cpu ( cqe . status ) & 1 ) ! = phase )
2011-01-20 12:50:14 -05:00
break ;
nvmeq - > sq_head = le16_to_cpu ( cqe . sq_head ) ;
if ( + + head = = nvmeq - > q_depth ) {
head = 0 ;
2011-01-20 13:24:06 -05:00
phase = ! phase ;
2011-01-20 12:50:14 -05:00
}
2015-11-03 20:37:26 -07:00
if ( tag & & * tag = = cqe . command_id )
* tag = - 1 ;
2014-11-04 08:20:14 -07:00
ctx = nvme_finish_cmd ( nvmeq , cqe . command_id , & fn ) ;
2014-04-03 16:45:23 -06:00
fn ( nvmeq , ctx , & cqe ) ;
2011-01-20 12:50:14 -05:00
}
/* If the controller ignores the cq head doorbell and continuously
* writes to the queue , it is theoretically possible to wrap around
* the queue twice and mistakenly return IRQ_NONE . Linux only
* requires that 0.1 % of your interrupts are handled , so this isn ' t
* a big problem .
*/
2011-01-20 13:24:06 -05:00
if ( head = = nvmeq - > cq_head & & phase = = nvmeq - > cq_phase )
2015-11-03 20:37:26 -07:00
return ;
2011-01-20 12:50:14 -05:00
2015-11-20 08:38:13 -07:00
if ( likely ( nvmeq - > cq_vector > = 0 ) )
writel ( head , nvmeq - > q_db + nvmeq - > dev - > db_stride ) ;
2011-01-20 12:50:14 -05:00
nvmeq - > cq_head = head ;
2011-01-20 13:24:06 -05:00
nvmeq - > cq_phase = phase ;
2011-01-20 12:50:14 -05:00
2013-06-24 11:47:34 -04:00
nvmeq - > cqe_seen = 1 ;
2015-11-03 20:37:26 -07:00
}
/* Process all pending completions without tracking a particular tag. */
static void nvme_process_cq(struct nvme_queue *nvmeq)
{
	unsigned int *no_tag = NULL;

	__nvme_process_cq(nvmeq, no_tag);
}
static irqreturn_t nvme_irq ( int irq , void * data )
2011-02-06 07:28:06 -05:00
{
irqreturn_t result ;
struct nvme_queue * nvmeq = data ;
spin_lock ( & nvmeq - > q_lock ) ;
2013-06-24 11:47:34 -04:00
nvme_process_cq ( nvmeq ) ;
result = nvmeq - > cqe_seen ? IRQ_HANDLED : IRQ_NONE ;
nvmeq - > cqe_seen = 0 ;
2011-02-06 07:28:06 -05:00
spin_unlock ( & nvmeq - > q_lock ) ;
return result ;
}
static irqreturn_t nvme_irq_check ( int irq , void * data )
{
struct nvme_queue * nvmeq = data ;
struct nvme_completion cqe = nvmeq - > cqes [ nvmeq - > cq_head ] ;
if ( ( le16_to_cpu ( cqe . status ) & 1 ) ! = nvmeq - > cq_phase )
return IRQ_NONE ;
return IRQ_WAKE_THREAD ;
}
2015-11-03 20:37:26 -07:00
static int nvme_poll ( struct blk_mq_hw_ctx * hctx , unsigned int tag )
{
struct nvme_queue * nvmeq = hctx - > driver_data ;
if ( ( le16_to_cpu ( nvmeq - > cqes [ nvmeq - > cq_head ] . status ) & 1 ) = =
nvmeq - > cq_phase ) {
spin_lock_irq ( & nvmeq - > q_lock ) ;
__nvme_process_cq ( nvmeq , & tag ) ;
spin_unlock_irq ( & nvmeq - > q_lock ) ;
if ( tag = = - 1 )
return 1 ;
}
return 0 ;
}
2014-11-04 08:20:14 -07:00
static int nvme_submit_async_admin_req ( struct nvme_dev * dev )
{
struct nvme_queue * nvmeq = dev - > queues [ 0 ] ;
struct nvme_command c ;
struct nvme_cmd_info * cmd_info ;
struct request * req ;
2015-11-26 10:06:56 +01:00
req = blk_mq_alloc_request ( dev - > ctrl . admin_q , WRITE ,
2015-11-26 09:13:05 +01:00
BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_RESERVED ) ;
2014-11-05 23:39:09 +03:00
if ( IS_ERR ( req ) )
return PTR_ERR ( req ) ;
2014-11-04 08:20:14 -07:00
2015-01-07 18:55:48 -07:00
req - > cmd_flags | = REQ_NO_TIMEOUT ;
2014-11-04 08:20:14 -07:00
cmd_info = blk_mq_rq_to_pdu ( req ) ;
2015-03-31 10:37:17 -06:00
nvme_set_info ( cmd_info , NULL , async_req_completion ) ;
2014-11-04 08:20:14 -07:00
memset ( & c , 0 , sizeof ( c ) ) ;
c . common . opcode = nvme_admin_async_event ;
c . common . command_id = req - > tag ;
2015-06-01 09:29:54 -06:00
blk_mq_free_request ( req ) ;
2015-07-31 18:56:58 +05:30
__nvme_submit_cmd ( nvmeq , & c ) ;
return 0 ;
2014-11-04 08:20:14 -07:00
}
static int nvme_submit_admin_async_cmd ( struct nvme_dev * dev ,
2013-12-10 13:10:40 -07:00
struct nvme_command * cmd ,
struct async_cmd_info * cmdinfo , unsigned timeout )
{
2014-11-04 08:20:14 -07:00
struct nvme_queue * nvmeq = dev - > queues [ 0 ] ;
struct request * req ;
struct nvme_cmd_info * cmd_rq ;
2013-12-10 13:10:40 -07:00
2015-11-26 10:06:56 +01:00
req = blk_mq_alloc_request ( dev - > ctrl . admin_q , WRITE , 0 ) ;
2014-11-05 23:39:09 +03:00
if ( IS_ERR ( req ) )
return PTR_ERR ( req ) ;
2014-11-04 08:20:14 -07:00
req - > timeout = timeout ;
cmd_rq = blk_mq_rq_to_pdu ( req ) ;
cmdinfo - > req = req ;
nvme_set_info ( cmd_rq , cmdinfo , async_completion ) ;
2013-12-10 13:10:40 -07:00
cmdinfo - > status = - EINTR ;
2014-11-04 08:20:14 -07:00
cmd - > common . command_id = req - > tag ;
2015-07-31 18:56:58 +05:30
nvme_submit_cmd ( nvmeq , cmd ) ;
return 0 ;
2013-12-10 13:10:40 -07:00
}
2011-01-20 12:50:14 -05:00
/*
 * Issue a synchronous delete-queue admin command with the given @opcode
 * for queue @id and return the result of the submission.
 */
static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
{
	struct nvme_command cmd;

	memset(&cmd, 0, sizeof(cmd));
	cmd.delete_queue.opcode = opcode;
	cmd.delete_queue.qid = cpu_to_le16(id);

	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &cmd, NULL, 0);
}
/*
 * Issue a synchronous create-completion-queue admin command for queue
 * @qid, using the DMA address, depth and interrupt vector stored in
 * @nvmeq.  Returns the result of the submission.
 */
static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
						struct nvme_queue *nvmeq)
{
	struct nvme_command cmd;

	/*
	 * Note: we (ab)use the fact that the prp fields survive if no data
	 * is attached to the request.
	 */
	memset(&cmd, 0, sizeof(cmd));
	cmd.create_cq.opcode = nvme_admin_create_cq;
	cmd.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr);
	cmd.create_cq.cqid = cpu_to_le16(qid);
	/* qsize is programmed as depth - 1. */
	cmd.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
	cmd.create_cq.cq_flags = cpu_to_le16(NVME_QUEUE_PHYS_CONTIG |
					     NVME_CQ_IRQ_ENABLED);
	cmd.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector);

	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &cmd, NULL, 0);
}
/*
 * Issue a synchronous create-submission-queue admin command for queue
 * @qid, using the DMA address and depth stored in @nvmeq and pairing it
 * with the completion queue of the same id.  Returns the result of the
 * submission.
 */
static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
						struct nvme_queue *nvmeq)
{
	struct nvme_command cmd;

	/*
	 * Note: we (ab)use the fact that the prp fields survive if no data
	 * is attached to the request.
	 */
	memset(&cmd, 0, sizeof(cmd));
	cmd.create_sq.opcode = nvme_admin_create_sq;
	cmd.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr);
	cmd.create_sq.sqid = cpu_to_le16(qid);
	/* qsize is programmed as depth - 1. */
	cmd.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
	cmd.create_sq.sq_flags = cpu_to_le16(NVME_QUEUE_PHYS_CONTIG |
					     NVME_SQ_PRIO_MEDIUM);
	cmd.create_sq.cqid = cpu_to_le16(qid);

	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &cmd, NULL, 0);
}
/* Ask the controller to delete completion queue @cqid. */
static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid)
{
	int status = adapter_delete_queue(dev, nvme_admin_delete_cq, cqid);

	return status;
}
/* Ask the controller to delete submission queue @sqid. */
static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid)
{
	int status = adapter_delete_queue(dev, nvme_admin_delete_sq, sqid);

	return status;
}
2013-12-10 13:10:38 -07:00
/**
2014-11-04 08:20:14 -07:00
* nvme_abort_req - Attempt aborting a request
2013-12-10 13:10:38 -07:00
*
* Schedule controller reset if the command was already aborted once before and
* still hasn ' t been returned to the driver , or if this is the admin queue .
*/
2014-11-04 08:20:14 -07:00
static void nvme_abort_req ( struct request * req )
2013-12-10 13:10:38 -07:00
{
2014-11-04 08:20:14 -07:00
struct nvme_cmd_info * cmd_rq = blk_mq_rq_to_pdu ( req ) ;
struct nvme_queue * nvmeq = cmd_rq - > nvmeq ;
2013-12-10 13:10:38 -07:00
struct nvme_dev * dev = nvmeq - > dev ;
2014-11-04 08:20:14 -07:00
struct request * abort_req ;
struct nvme_cmd_info * abort_cmd ;
struct nvme_command cmd ;
2013-12-10 13:10:38 -07:00
2014-11-04 08:20:14 -07:00
if ( ! nvmeq - > qid | | cmd_rq - > aborted ) {
2015-10-02 18:49:23 +02:00
spin_lock ( & dev_list_lock ) ;
if ( ! __nvme_reset ( dev ) ) {
dev_warn ( dev - > dev ,
" I/O %d QID %d timeout, reset controller \n " ,
req - > tag , nvmeq - > qid ) ;
}
spin_unlock ( & dev_list_lock ) ;
2013-12-10 13:10:38 -07:00
return ;
}
2015-11-26 10:06:56 +01:00
if ( ! dev - > ctrl . abort_limit )
2013-12-10 13:10:38 -07:00
return ;
2015-11-26 10:06:56 +01:00
abort_req = blk_mq_alloc_request ( dev - > ctrl . admin_q , WRITE ,
2015-11-26 09:13:05 +01:00
BLK_MQ_REQ_NOWAIT ) ;
2014-11-05 23:39:09 +03:00
if ( IS_ERR ( abort_req ) )
2013-12-10 13:10:38 -07:00
return ;
2014-11-04 08:20:14 -07:00
abort_cmd = blk_mq_rq_to_pdu ( abort_req ) ;
nvme_set_info ( abort_cmd , abort_req , abort_completion ) ;
2013-12-10 13:10:38 -07:00
memset ( & cmd , 0 , sizeof ( cmd ) ) ;
cmd . abort . opcode = nvme_admin_abort_cmd ;
2014-11-04 08:20:14 -07:00
cmd . abort . cid = req - > tag ;
2013-12-10 13:10:38 -07:00
cmd . abort . sqid = cpu_to_le16 ( nvmeq - > qid ) ;
2014-11-04 08:20:14 -07:00
cmd . abort . command_id = abort_req - > tag ;
2013-12-10 13:10:38 -07:00
2015-11-26 10:06:56 +01:00
- - dev - > ctrl . abort_limit ;
2014-11-04 08:20:14 -07:00
cmd_rq - > aborted = 1 ;
2013-12-10 13:10:38 -07:00
2014-11-04 08:20:14 -07:00
dev_warn ( nvmeq - > q_dmadev , " Aborting I/O %d QID %d \n " , req - > tag ,
2013-12-10 13:10:38 -07:00
nvmeq - > qid ) ;
2015-07-31 18:56:58 +05:30
nvme_submit_cmd ( dev - > queues [ 0 ] , & cmd ) ;
2013-12-10 13:10:38 -07:00
}
2015-06-01 09:29:54 -06:00
/*
 * Tag iterator callback: complete a started request that has not already
 * been cancelled by handing its completion callback a synthetic completion
 * entry with an "abort requested" status.
 *
 * @req:      request being visited
 * @data:     the struct nvme_queue the request belongs to
 * @reserved: unused
 *
 * When the request queue is dying the status additionally carries
 * NVME_SC_DNR.
 */
static void nvme_cancel_queue_ios(struct request *req, void *data, bool reserved)
{
	struct nvme_queue *nvmeq = data;
	void *ctx;
	nvme_completion_fn fn;
	struct nvme_cmd_info *cmd;
	struct nvme_completion cqe;

	if (!blk_mq_request_started(req))
		return;

	cmd = blk_mq_rq_to_pdu(req);

	if (cmd->ctx == CMD_CTX_CANCELLED)
		return;

	/*
	 * Only the status is meaningful in this fabricated entry, but the
	 * completion callbacks may read other fields (the request completion
	 * handler copies cqe->result for driver-private requests), so make
	 * sure they do not see uninitialized stack contents.
	 */
	memset(&cqe, 0, sizeof(cqe));

	if (blk_queue_dying(req->q))
		cqe.status = cpu_to_le16((NVME_SC_ABORT_REQ | NVME_SC_DNR) << 1);
	else
		cqe.status = cpu_to_le16(NVME_SC_ABORT_REQ << 1);

	dev_warn(nvmeq->q_dmadev, " Cancelling I/O %d QID %d \n ",
						req->tag, nvmeq->qid);
	ctx = cancel_cmd_info(cmd, &fn);
	fn(nvmeq, ctx, &cqe);
}
2014-11-04 08:20:14 -07:00
/*
 * blk-mq ->timeout handler: log the timeout, try to abort the request (or
 * reset the controller, see nvme_abort_req()) under the queue lock, and
 * always ask the block layer to re-arm the timer.
 */
static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
{
	struct nvme_cmd_info *info = blk_mq_rq_to_pdu(req);
	struct nvme_queue *nvmeq = info->nvmeq;

	dev_warn(nvmeq->q_dmadev, " Timeout I/O %d QID %d \n ", req->tag,
		 nvmeq->qid);

	spin_lock_irq(&nvmeq->q_lock);
	nvme_abort_req(req);
	spin_unlock_irq(&nvmeq->q_lock);

	/*
	 * The aborted req will be completed on receiving the abort req.
	 * We enable the timer again. If hit twice, it'll cause a device
	 * reset, as the device then is in a faulty state.
	 */
	return BLK_EH_RESET_TIMER;
}
2013-07-15 15:02:20 -06:00
2014-11-04 08:20:14 -07:00
static void nvme_free_queue ( struct nvme_queue * nvmeq )
{
2012-08-03 13:55:56 -04:00
dma_free_coherent ( nvmeq - > q_dmadev , CQ_SIZE ( nvmeq - > q_depth ) ,
( void * ) nvmeq - > cqes , nvmeq - > cq_dma_addr ) ;
2015-07-20 10:14:09 -06:00
if ( nvmeq - > sq_cmds )
dma_free_coherent ( nvmeq - > q_dmadev , SQ_SIZE ( nvmeq - > q_depth ) ,
2012-08-03 13:55:56 -04:00
nvmeq - > sq_cmds , nvmeq - > sq_dma_addr ) ;
kfree ( nvmeq ) ;
}
2013-12-16 13:50:00 -05:00
static void nvme_free_queues ( struct nvme_dev * dev , int lowest )
2013-07-15 15:02:20 -06:00
{
int i ;
2013-12-16 13:50:00 -05:00
for ( i = dev - > queue_count - 1 ; i > = lowest ; i - - ) {
2014-11-04 08:20:14 -07:00
struct nvme_queue * nvmeq = dev - > queues [ i ] ;
2013-07-15 15:02:20 -06:00
dev - > queue_count - - ;
2014-11-04 08:20:14 -07:00
dev - > queues [ i ] = NULL ;
2014-07-07 09:14:42 -06:00
nvme_free_queue ( nvmeq ) ;
2015-01-14 21:01:58 -07:00
}
2013-07-15 15:02:20 -06:00
}
2013-12-10 13:10:40 -07:00
/**
* nvme_suspend_queue - put queue into suspended state
* @ nvmeq - queue to suspend
*/
static int nvme_suspend_queue ( struct nvme_queue * nvmeq )
2011-01-20 12:50:14 -05:00
{
2014-12-22 12:59:04 -07:00
int vector ;
2011-01-20 12:50:14 -05:00
2012-08-07 15:56:23 -04:00
spin_lock_irq ( & nvmeq - > q_lock ) ;
2014-12-22 12:59:04 -07:00
if ( nvmeq - > cq_vector = = - 1 ) {
spin_unlock_irq ( & nvmeq - > q_lock ) ;
return 1 ;
}
vector = nvmeq - > dev - > entry [ nvmeq - > cq_vector ] . vector ;
2014-03-24 10:46:25 -06:00
nvmeq - > dev - > online_queues - - ;
2014-12-22 12:59:04 -07:00
nvmeq - > cq_vector = - 1 ;
2012-08-07 15:56:23 -04:00
spin_unlock_irq ( & nvmeq - > q_lock ) ;
2015-11-26 10:06:56 +01:00
if ( ! nvmeq - > qid & & nvmeq - > dev - > ctrl . admin_q )
blk_mq_freeze_queue_start ( nvmeq - > dev - > ctrl . admin_q ) ;
2015-03-26 13:49:33 -06:00
2011-03-27 08:52:06 -04:00
irq_set_affinity_hint ( vector , NULL ) ;
free_irq ( vector , nvmeq ) ;
2011-01-20 12:50:14 -05:00
2013-12-10 13:10:40 -07:00
return 0 ;
}
2011-01-20 12:50:14 -05:00
2013-12-10 13:10:40 -07:00
static void nvme_clear_queue ( struct nvme_queue * nvmeq )
{
2013-07-15 15:02:20 -06:00
spin_lock_irq ( & nvmeq - > q_lock ) ;
2015-06-01 09:29:54 -06:00
if ( nvmeq - > tags & & * nvmeq - > tags )
blk_mq_all_tag_busy_iter ( * nvmeq - > tags , nvme_cancel_queue_ios , nvmeq ) ;
2013-07-15 15:02:20 -06:00
spin_unlock_irq ( & nvmeq - > q_lock ) ;
2011-01-20 12:50:14 -05:00
}
2013-12-10 13:10:40 -07:00
static void nvme_disable_queue ( struct nvme_dev * dev , int qid )
{
2014-11-04 08:20:14 -07:00
struct nvme_queue * nvmeq = dev - > queues [ qid ] ;
2013-12-10 13:10:40 -07:00
if ( ! nvmeq )
return ;
if ( nvme_suspend_queue ( nvmeq ) )
return ;
2013-12-10 13:10:39 -07:00
/* Don't tell the adapter to delete the admin queue.
* Don ' t tell a removed adapter to delete IO queues . */
2015-11-20 08:58:10 +01:00
if ( qid & & readl ( dev - > bar + NVME_REG_CSTS ) ! = - 1 ) {
2011-01-20 12:50:14 -05:00
adapter_delete_sq ( dev , qid ) ;
adapter_delete_cq ( dev , qid ) ;
}
2015-02-19 10:34:48 -07:00
spin_lock_irq ( & nvmeq - > q_lock ) ;
nvme_process_cq ( nvmeq ) ;
spin_unlock_irq ( & nvmeq - > q_lock ) ;
2011-01-20 12:50:14 -05:00
}
2015-07-20 10:14:09 -06:00
static int nvme_cmb_qdepth ( struct nvme_dev * dev , int nr_io_queues ,
int entry_size )
{
int q_depth = dev - > q_depth ;
2015-11-28 15:03:49 +01:00
unsigned q_size_aligned = roundup ( q_depth * entry_size ,
dev - > ctrl . page_size ) ;
2015-07-20 10:14:09 -06:00
if ( q_size_aligned * nr_io_queues > dev - > cmb_size ) {
2015-07-21 15:08:13 -06:00
u64 mem_per_q = div_u64 ( dev - > cmb_size , nr_io_queues ) ;
2015-11-28 15:03:49 +01:00
mem_per_q = round_down ( mem_per_q , dev - > ctrl . page_size ) ;
2015-07-21 15:08:13 -06:00
q_depth = div_u64 ( mem_per_q , entry_size ) ;
2015-07-20 10:14:09 -06:00
/*
* Ensure the reduced q_depth is above some threshold where it
* would be better to map queues in system memory with the
* original depth
*/
if ( q_depth < 64 )
return - ENOMEM ;
}
return q_depth ;
}
static int nvme_alloc_sq_cmds ( struct nvme_dev * dev , struct nvme_queue * nvmeq ,
int qid , int depth )
{
if ( qid & & dev - > cmb & & use_cmb_sqes & & NVME_CMB_SQS ( dev - > cmbsz ) ) {
2015-11-28 15:03:49 +01:00
unsigned offset = ( qid - 1 ) * roundup ( SQ_SIZE ( depth ) ,
dev - > ctrl . page_size ) ;
2015-07-20 10:14:09 -06:00
nvmeq - > sq_dma_addr = dev - > cmb_dma_addr + offset ;
nvmeq - > sq_cmds_io = dev - > cmb + offset ;
} else {
nvmeq - > sq_cmds = dma_alloc_coherent ( dev - > dev , SQ_SIZE ( depth ) ,
& nvmeq - > sq_dma_addr , GFP_KERNEL ) ;
if ( ! nvmeq - > sq_cmds )
return - ENOMEM ;
}
return 0 ;
}
2011-01-20 12:50:14 -05:00
static struct nvme_queue * nvme_alloc_queue ( struct nvme_dev * dev , int qid ,
2014-12-22 12:59:04 -07:00
int depth )
2011-01-20 12:50:14 -05:00
{
2014-11-04 08:20:14 -07:00
struct nvme_queue * nvmeq = kzalloc ( sizeof ( * nvmeq ) , GFP_KERNEL ) ;
2011-01-20 12:50:14 -05:00
if ( ! nvmeq )
return NULL ;
2015-05-22 11:12:39 +02:00
nvmeq - > cqes = dma_zalloc_coherent ( dev - > dev , CQ_SIZE ( depth ) ,
2014-06-15 13:37:33 -07:00
& nvmeq - > cq_dma_addr , GFP_KERNEL ) ;
2011-01-20 12:50:14 -05:00
if ( ! nvmeq - > cqes )
goto free_nvmeq ;
2015-07-20 10:14:09 -06:00
if ( nvme_alloc_sq_cmds ( dev , nvmeq , qid , depth ) )
2011-01-20 12:50:14 -05:00
goto free_cqdma ;
2015-05-22 11:12:39 +02:00
nvmeq - > q_dmadev = dev - > dev ;
2011-02-10 09:56:01 -05:00
nvmeq - > dev = dev ;
2014-01-27 15:57:22 -05:00
snprintf ( nvmeq - > irqname , sizeof ( nvmeq - > irqname ) , " nvme%dq%d " ,
2015-11-26 10:06:56 +01:00
dev - > ctrl . instance , qid ) ;
2011-01-20 12:50:14 -05:00
spin_lock_init ( & nvmeq - > q_lock ) ;
nvmeq - > cq_head = 0 ;
2011-01-20 13:24:06 -05:00
nvmeq - > cq_phase = 1 ;
2013-09-10 11:25:37 +08:00
nvmeq - > q_db = & dev - > dbs [ qid * 2 * dev - > db_stride ] ;
2011-01-20 12:50:14 -05:00
nvmeq - > q_depth = depth ;
2013-12-10 13:10:38 -07:00
nvmeq - > qid = qid ;
2015-06-30 11:22:52 -06:00
nvmeq - > cq_vector = - 1 ;
2014-11-04 08:20:14 -07:00
dev - > queues [ qid ] = nvmeq ;
2011-01-20 12:50:14 -05:00
2015-05-27 12:26:23 -06:00
/* make sure queue descriptor is set before queue count, for kthread */
mb ( ) ;
dev - > queue_count + + ;
2011-01-20 12:50:14 -05:00
return nvmeq ;
free_cqdma :
2015-05-22 11:12:39 +02:00
dma_free_coherent ( dev - > dev , CQ_SIZE ( depth ) , ( void * ) nvmeq - > cqes ,
2011-01-20 12:50:14 -05:00
nvmeq - > cq_dma_addr ) ;
free_nvmeq :
kfree ( nvmeq ) ;
return NULL ;
}
2011-01-20 09:10:15 -05:00
static int queue_request_irq ( struct nvme_dev * dev , struct nvme_queue * nvmeq ,
const char * name )
{
2011-02-06 07:28:06 -05:00
if ( use_threaded_interrupts )
return request_threaded_irq ( dev - > entry [ nvmeq - > cq_vector ] . vector ,
2013-10-12 06:23:29 +02:00
nvme_irq_check , nvme_irq , IRQF_SHARED ,
2011-02-06 07:28:06 -05:00
name , nvmeq ) ;
2011-01-20 09:10:15 -05:00
return request_irq ( dev - > entry [ nvmeq - > cq_vector ] . vector , nvme_irq ,
2013-10-12 06:23:29 +02:00
IRQF_SHARED , name , nvmeq ) ;
2011-01-20 09:10:15 -05:00
}
2013-07-15 15:02:20 -06:00
/*
 * Reset the software state of a queue (tail, head, phase, doorbell
 * pointer), clear its completion queue memory and count it as online.
 * All of this is done under the queue lock.
 */
static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
{
	struct nvme_dev *dev = nvmeq->dev;

	spin_lock_irq(&nvmeq->q_lock);

	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
	nvmeq->sq_tail = 0;
	nvmeq->cq_head = 0;
	/* Zeroed entries never match phase 1, so the queue reads as empty. */
	nvmeq->cq_phase = 1;
	memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth));
	dev->online_queues++;

	spin_unlock_irq(&nvmeq->q_lock);
}
static int nvme_create_queue ( struct nvme_queue * nvmeq , int qid )
{
struct nvme_dev * dev = nvmeq - > dev ;
int result ;
2011-02-01 08:39:04 -05:00
2014-12-22 12:59:04 -07:00
nvmeq - > cq_vector = qid - 1 ;
2011-01-20 12:50:14 -05:00
result = adapter_alloc_cq ( dev , qid , nvmeq ) ;
if ( result < 0 )
2013-07-15 15:02:20 -06:00
return result ;
2011-01-20 12:50:14 -05:00
result = adapter_alloc_sq ( dev , qid , nvmeq ) ;
if ( result < 0 )
goto release_cq ;
2014-01-27 15:57:22 -05:00
result = queue_request_irq ( dev , nvmeq , nvmeq - > irqname ) ;
2011-01-20 12:50:14 -05:00
if ( result < 0 )
goto release_sq ;
2013-07-15 15:02:20 -06:00
nvme_init_queue ( nvmeq , qid ) ;
return result ;
2011-01-20 12:50:14 -05:00
release_sq :
adapter_delete_sq ( dev , qid ) ;
release_cq :
adapter_delete_cq ( dev , qid ) ;
2013-07-15 15:02:20 -06:00
return result ;
2011-01-20 12:50:14 -05:00
}
2014-11-04 08:20:14 -07:00
static struct blk_mq_ops nvme_mq_admin_ops = {
2015-05-22 11:12:46 +02:00
. queue_rq = nvme_queue_rq ,
2014-11-04 08:20:14 -07:00
. map_queue = blk_mq_map_queue ,
. init_hctx = nvme_admin_init_hctx ,
2015-06-08 10:08:13 -06:00
. exit_hctx = nvme_admin_exit_hctx ,
2014-11-04 08:20:14 -07:00
. init_request = nvme_admin_init_request ,
. timeout = nvme_timeout ,
} ;
static struct blk_mq_ops nvme_mq_ops = {
. queue_rq = nvme_queue_rq ,
. map_queue = blk_mq_map_queue ,
. init_hctx = nvme_init_hctx ,
. init_request = nvme_init_request ,
. timeout = nvme_timeout ,
2015-11-03 20:37:26 -07:00
. poll = nvme_poll ,
2014-11-04 08:20:14 -07:00
} ;
2015-01-07 18:55:49 -07:00
static void nvme_dev_remove_admin ( struct nvme_dev * dev )
{
2015-11-26 10:06:56 +01:00
if ( dev - > ctrl . admin_q & & ! blk_queue_dying ( dev - > ctrl . admin_q ) ) {
blk_cleanup_queue ( dev - > ctrl . admin_q ) ;
2015-01-07 18:55:49 -07:00
blk_mq_free_tag_set ( & dev - > admin_tagset ) ;
}
}
2014-11-04 08:20:14 -07:00
static int nvme_alloc_admin_tags ( struct nvme_dev * dev )
{
2015-11-26 10:06:56 +01:00
if ( ! dev - > ctrl . admin_q ) {
2014-11-04 08:20:14 -07:00
dev - > admin_tagset . ops = & nvme_mq_admin_ops ;
dev - > admin_tagset . nr_hw_queues = 1 ;
dev - > admin_tagset . queue_depth = NVME_AQ_DEPTH - 1 ;
2015-03-31 10:37:17 -06:00
dev - > admin_tagset . reserved_tags = 1 ;
2014-11-04 08:20:14 -07:00
dev - > admin_tagset . timeout = ADMIN_TIMEOUT ;
2015-05-22 11:12:39 +02:00
dev - > admin_tagset . numa_node = dev_to_node ( dev - > dev ) ;
2015-01-22 12:07:58 -07:00
dev - > admin_tagset . cmd_size = nvme_cmd_size ( dev ) ;
2014-11-04 08:20:14 -07:00
dev - > admin_tagset . driver_data = dev ;
if ( blk_mq_alloc_tag_set ( & dev - > admin_tagset ) )
return - ENOMEM ;
2015-11-26 10:06:56 +01:00
dev - > ctrl . admin_q = blk_mq_init_queue ( & dev - > admin_tagset ) ;
if ( IS_ERR ( dev - > ctrl . admin_q ) ) {
2014-11-04 08:20:14 -07:00
blk_mq_free_tag_set ( & dev - > admin_tagset ) ;
return - ENOMEM ;
}
2015-11-26 10:06:56 +01:00
if ( ! blk_get_queue ( dev - > ctrl . admin_q ) ) {
2015-01-07 18:55:49 -07:00
nvme_dev_remove_admin ( dev ) ;
2015-11-26 10:06:56 +01:00
dev - > ctrl . admin_q = NULL ;
2015-01-07 18:55:49 -07:00
return - ENODEV ;
}
2015-01-07 18:55:50 -07:00
} else
2015-11-26 10:06:56 +01:00
blk_mq_unfreeze_queue ( dev - > ctrl . admin_q ) ;
2014-11-04 08:20:14 -07:00
return 0 ;
}
2012-12-21 15:13:49 -08:00
static int nvme_configure_admin_queue ( struct nvme_dev * dev )
2011-01-20 12:50:14 -05:00
{
2013-05-04 06:43:16 -04:00
int result ;
2011-01-20 12:50:14 -05:00
u32 aqa ;
2015-11-20 08:58:10 +01:00
u64 cap = lo_hi_readq ( dev - > bar + NVME_REG_CAP ) ;
2011-01-20 12:50:14 -05:00
struct nvme_queue * nvmeq ;
2015-11-20 08:58:10 +01:00
dev - > subsystem = readl ( dev - > bar + NVME_REG_VS ) > = NVME_VS ( 1 , 1 ) ?
2015-08-10 15:20:40 -06:00
NVME_CAP_NSSRC ( cap ) : 0 ;
2015-11-20 08:58:10 +01:00
if ( dev - > subsystem & &
( readl ( dev - > bar + NVME_REG_CSTS ) & NVME_CSTS_NSSRO ) )
writel ( NVME_CSTS_NSSRO , dev - > bar + NVME_REG_CSTS ) ;
2015-08-10 15:20:40 -06:00
2015-11-28 15:03:49 +01:00
result = nvme_disable_ctrl ( & dev - > ctrl , cap ) ;
2013-05-04 06:43:16 -04:00
if ( result < 0 )
return result ;
2011-01-20 12:50:14 -05:00
2014-11-04 08:20:14 -07:00
nvmeq = dev - > queues [ 0 ] ;
2013-07-15 15:02:23 -06:00
if ( ! nvmeq ) {
2014-12-22 12:59:04 -07:00
nvmeq = nvme_alloc_queue ( dev , 0 , NVME_AQ_DEPTH ) ;
2013-07-15 15:02:23 -06:00
if ( ! nvmeq )
return - ENOMEM ;
}
2011-01-20 12:50:14 -05:00
aqa = nvmeq - > q_depth - 1 ;
aqa | = aqa < < 16 ;
2015-11-20 08:58:10 +01:00
writel ( aqa , dev - > bar + NVME_REG_AQA ) ;
lo_hi_writeq ( nvmeq - > sq_dma_addr , dev - > bar + NVME_REG_ASQ ) ;
lo_hi_writeq ( nvmeq - > cq_dma_addr , dev - > bar + NVME_REG_ACQ ) ;
2011-01-20 12:50:14 -05:00
2015-11-28 15:03:49 +01:00
result = nvme_enable_ctrl ( & dev - > ctrl , cap ) ;
2013-05-01 13:07:51 -06:00
if ( result )
2014-11-04 08:20:14 -07:00
goto free_nvmeq ;
2014-12-22 12:59:04 -07:00
nvmeq - > cq_vector = 0 ;
2014-01-27 15:57:22 -05:00
result = queue_request_irq ( dev , nvmeq , nvmeq - > irqname ) ;
2015-06-30 11:22:52 -06:00
if ( result ) {
nvmeq - > cq_vector = - 1 ;
2015-01-07 18:55:50 -07:00
goto free_nvmeq ;
2015-06-30 11:22:52 -06:00
}
2013-05-01 13:07:51 -06:00
2011-01-20 12:50:14 -05:00
return result ;
2014-11-04 08:20:14 -07:00
free_nvmeq :
nvme_free_queues ( dev , 0 ) ;
return result ;
2011-01-20 12:50:14 -05:00
}
2011-03-02 18:37:18 -05:00
static int nvme_kthread ( void * data )
{
2013-12-10 13:10:37 -07:00
struct nvme_dev * dev , * next ;
2011-03-02 18:37:18 -05:00
while ( ! kthread_should_stop ( ) ) {
2013-05-01 16:38:23 -04:00
set_current_state ( TASK_INTERRUPTIBLE ) ;
2011-03-02 18:37:18 -05:00
spin_lock ( & dev_list_lock ) ;
2013-12-10 13:10:37 -07:00
list_for_each_entry_safe ( dev , next , & dev_list , node ) {
2011-03-02 18:37:18 -05:00
int i ;
2015-11-20 08:58:10 +01:00
u32 csts = readl ( dev - > bar + NVME_REG_CSTS ) ;
2015-08-10 15:20:40 -06:00
if ( ( dev - > subsystem & & ( csts & NVME_CSTS_NSSRO ) ) | |
csts & NVME_CSTS_CFS ) {
2015-10-02 18:49:23 +02:00
if ( ! __nvme_reset ( dev ) ) {
dev_warn ( dev - > dev ,
" Failed status: %x, reset controller \n " ,
2015-11-20 08:58:10 +01:00
readl ( dev - > bar + NVME_REG_CSTS ) ) ;
2015-10-02 18:49:23 +02:00
}
2013-12-10 13:10:37 -07:00
continue ;
}
2011-03-02 18:37:18 -05:00
for ( i = 0 ; i < dev - > queue_count ; i + + ) {
2014-11-04 08:20:14 -07:00
struct nvme_queue * nvmeq = dev - > queues [ i ] ;
2011-02-15 16:28:20 -05:00
if ( ! nvmeq )
continue ;
2011-03-02 18:37:18 -05:00
spin_lock_irq ( & nvmeq - > q_lock ) ;
2013-06-24 11:56:42 -04:00
nvme_process_cq ( nvmeq ) ;
2014-06-18 13:58:57 -06:00
2015-11-26 10:06:56 +01:00
while ( i = = 0 & & dev - > ctrl . event_limit > 0 ) {
2014-11-04 08:20:14 -07:00
if ( nvme_submit_async_admin_req ( dev ) )
2014-06-18 13:58:57 -06:00
break ;
2015-11-26 10:06:56 +01:00
dev - > ctrl . event_limit - - ;
2014-06-18 13:58:57 -06:00
}
2011-03-02 18:37:18 -05:00
spin_unlock_irq ( & nvmeq - > q_lock ) ;
}
}
spin_unlock ( & dev_list_lock ) ;
2013-02-04 14:44:33 -08:00
schedule_timeout ( round_jiffies_relative ( HZ ) ) ;
2011-03-02 18:37:18 -05:00
}
return 0 ;
}
2015-11-26 11:46:39 +01:00
static int nvme_create_io_queues ( struct nvme_dev * dev )
2014-03-24 10:46:25 -06:00
{
2014-11-04 08:20:14 -07:00
unsigned i ;
2015-11-26 11:46:39 +01:00
int ret = 0 ;
2014-03-24 10:46:25 -06:00
2015-11-26 11:46:39 +01:00
for ( i = dev - > queue_count ; i < = dev - > max_qid ; i + + ) {
if ( ! nvme_alloc_queue ( dev , i , dev - > q_depth ) ) {
ret = - ENOMEM ;
2014-03-24 10:46:25 -06:00
break ;
2015-11-26 11:46:39 +01:00
}
}
2014-03-24 10:46:25 -06:00
2015-11-26 11:46:39 +01:00
for ( i = dev - > online_queues ; i < = dev - > queue_count - 1 ; i + + ) {
ret = nvme_create_queue ( dev - > queues [ i ] , i ) ;
if ( ret ) {
2015-10-02 18:51:31 +02:00
nvme_free_queues ( dev , i ) ;
2014-03-24 10:46:25 -06:00
break ;
2015-10-02 18:51:31 +02:00
}
2015-11-26 11:46:39 +01:00
}
/*
* Ignore failing Create SQ / CQ commands , we can continue with less
* than the desired aount of queues , and even a controller without
* I / O queues an still be used to issue admin commands . This might
* be useful to upgrade a buggy firmware for example .
*/
return ret > = 0 ? 0 : ret ;
2014-03-24 10:46:25 -06:00
}
2015-07-20 10:14:09 -06:00
/*
 * Map the controller memory buffer (CMB), if enabled and present.
 *
 * Reads CMBSZ and CMBLOC, derives the CMB size and its offset inside the
 * BAR named by CMBLOC, clamps the size to what the BAR can hold and maps
 * the region write-combined.  On success dev->cmb_dma_addr and
 * dev->cmb_size are recorded.
 *
 * Returns the mapping, or NULL when use_cmb_sqes is off, the controller
 * reports a zero CMB size, the offset lies beyond the BAR, or the mapping
 * fails.
 */
static void __iomem *nvme_map_cmb(struct nvme_dev *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev->dev);
	resource_size_t bar_size;
	dma_addr_t dma_addr;
	void __iomem *cmb;
	u64 szu, size, offset;
	u32 cmbloc;

	if (!use_cmb_sqes)
		return NULL;

	dev->cmbsz = readl(dev->bar + NVME_REG_CMBSZ);
	if (NVME_CMB_SZ(dev->cmbsz) == 0)
		return NULL;

	cmbloc = readl(dev->bar + NVME_REG_CMBLOC);

	/* Size and offset are both expressed in units of szu bytes. */
	szu = (u64)1 << (12 + 4 * NVME_CMB_SZU(dev->cmbsz));
	size = szu * NVME_CMB_SZ(dev->cmbsz);
	offset = szu * NVME_CMB_OFST(cmbloc);
	bar_size = pci_resource_len(pdev, NVME_CMB_BIR(cmbloc));

	if (offset > bar_size)
		return NULL;

	/*
	 * Controllers may support a CMB size larger than their BAR,
	 * for example, due to being behind a bridge. Reduce the CMB to
	 * the reported size of the BAR
	 */
	if (size > bar_size - offset)
		size = bar_size - offset;

	dma_addr = pci_resource_start(pdev, NVME_CMB_BIR(cmbloc)) + offset;
	cmb = ioremap_wc(dma_addr, size);
	if (cmb) {
		dev->cmb_dma_addr = dma_addr;
		dev->cmb_size = size;
	}
	return cmb;
}
static inline void nvme_release_cmb ( struct nvme_dev * dev )
{
if ( dev - > cmb ) {
iounmap ( dev - > cmb ) ;
dev - > cmb = NULL ;
}
}
2013-07-15 15:02:24 -06:00
static size_t db_bar_size ( struct nvme_dev * dev , unsigned nr_io_queues )
{
2013-09-10 11:25:37 +08:00
return 4096 + ( ( nr_io_queues + 1 ) * 8 * dev - > db_stride ) ;
2013-07-15 15:02:24 -06:00
}
2012-12-21 15:13:49 -08:00
static int nvme_setup_io_queues ( struct nvme_dev * dev )
2011-01-20 12:50:14 -05:00
{
2014-11-04 08:20:14 -07:00
struct nvme_queue * adminq = dev - > queues [ 0 ] ;
2015-05-22 11:12:39 +02:00
struct pci_dev * pdev = to_pci_dev ( dev - > dev ) ;
2014-03-24 10:46:25 -06:00
int result , i , vecs , nr_io_queues , size ;
2011-01-20 12:50:14 -05:00
2014-03-24 10:46:25 -06:00
nr_io_queues = num_possible_cpus ( ) ;
2015-11-26 11:09:06 +01:00
result = nvme_set_queue_count ( & dev - > ctrl , & nr_io_queues ) ;
if ( result < 0 )
2011-01-20 13:01:49 -05:00
return result ;
2015-11-26 11:09:06 +01:00
/*
* Degraded controllers might return an error when setting the queue
* count . We still want to be able to bring them online and offer
* access to the admin queue , as that might be only way to fix them up .
*/
if ( result > 0 ) {
dev_err ( dev - > dev , " Could not set queue count (%d) \n " , result ) ;
nr_io_queues = 0 ;
result = 0 ;
}
2011-01-20 12:50:14 -05:00
2015-07-20 10:14:09 -06:00
if ( dev - > cmb & & NVME_CMB_SQS ( dev - > cmbsz ) ) {
result = nvme_cmb_qdepth ( dev , nr_io_queues ,
sizeof ( struct nvme_command ) ) ;
if ( result > 0 )
dev - > q_depth = result ;
else
nvme_release_cmb ( dev ) ;
}
2013-07-15 15:02:24 -06:00
size = db_bar_size ( dev , nr_io_queues ) ;
if ( size > 8192 ) {
2011-10-20 17:00:41 -04:00
iounmap ( dev - > bar ) ;
2013-07-15 15:02:24 -06:00
do {
dev - > bar = ioremap ( pci_resource_start ( pdev , 0 ) , size ) ;
if ( dev - > bar )
break ;
if ( ! - - nr_io_queues )
return - ENOMEM ;
size = db_bar_size ( dev , nr_io_queues ) ;
} while ( 1 ) ;
2015-11-20 08:58:10 +01:00
dev - > dbs = dev - > bar + 4096 ;
2014-02-21 14:13:44 -07:00
adminq - > q_db = dev - > dbs ;
2011-10-20 17:00:41 -04:00
}
2013-07-15 15:02:24 -06:00
/* Deregister the admin queue's interrupt */
2014-01-27 15:57:22 -05:00
free_irq ( dev - > entry [ 0 ] . vector , adminq ) ;
2013-07-15 15:02:24 -06:00
2014-11-14 09:49:26 -07:00
/*
* If we enable msix early due to not intx , disable it again before
* setting up the full range we need .
*/
if ( ! pdev - > irq )
pci_disable_msix ( pdev ) ;
2014-03-04 16:22:00 +01:00
for ( i = 0 ; i < nr_io_queues ; i + + )
2011-01-20 13:01:49 -05:00
dev - > entry [ i ] . entry = i ;
2014-03-04 16:22:00 +01:00
vecs = pci_enable_msix_range ( pdev , dev - > entry , 1 , nr_io_queues ) ;
if ( vecs < 0 ) {
vecs = pci_enable_msi_range ( pdev , 1 , min ( nr_io_queues , 32 ) ) ;
if ( vecs < 0 ) {
vecs = 1 ;
} else {
for ( i = 0 ; i < vecs ; i + + )
dev - > entry [ i ] . vector = i + pdev - > irq ;
2013-05-11 15:19:31 -07:00
}
}
2013-06-20 10:53:48 -04:00
/*
* Should investigate if there ' s a performance win from allocating
* more queues than interrupt vectors ; it might allow the submission
* path to scale better , even if the receive path is limited by the
* number of interrupts .
*/
nr_io_queues = vecs ;
2014-03-24 10:46:25 -06:00
dev - > max_qid = nr_io_queues ;
2013-06-20 10:53:48 -04:00
2014-01-27 15:57:22 -05:00
result = queue_request_irq ( dev , adminq , adminq - > irqname ) ;
2015-06-30 11:22:52 -06:00
if ( result ) {
adminq - > cq_vector = - 1 ;
2013-07-15 15:02:20 -06:00
goto free_queues ;
2015-06-30 11:22:52 -06:00
}
2011-01-20 13:01:49 -05:00
2013-07-15 15:02:23 -06:00
/* Free previously allocated queues that are no longer usable */
2014-03-24 10:46:25 -06:00
nvme_free_queues ( dev , nr_io_queues + 1 ) ;
2015-11-26 11:46:39 +01:00
return nvme_create_io_queues ( dev ) ;
2011-01-20 12:50:14 -05:00
2013-07-15 15:02:20 -06:00
free_queues :
2013-12-16 13:50:00 -05:00
nvme_free_queues ( dev , 1 ) ;
2013-07-15 15:02:20 -06:00
return result ;
2011-01-20 12:50:14 -05:00
}
2015-09-03 08:18:17 -06:00
static void nvme_set_irq_hints ( struct nvme_dev * dev )
{
struct nvme_queue * nvmeq ;
int i ;
for ( i = 0 ; i < dev - > online_queues ; i + + ) {
nvmeq = dev - > queues [ i ] ;
if ( ! nvmeq - > tags | | ! ( * nvmeq - > tags ) )
continue ;
irq_set_affinity_hint ( dev - > entry [ nvmeq - > cq_vector ] . vector ,
blk_mq_tags_cpumask ( * nvmeq - > tags ) ) ;
}
}
2015-06-01 14:28:14 -06:00
static void nvme_dev_scan ( struct work_struct * work )
{
struct nvme_dev * dev = container_of ( work , struct nvme_dev , scan_work ) ;
if ( ! dev - > tagset . tags )
return ;
2015-11-28 15:39:07 +01:00
nvme_scan_namespaces ( & dev - > ctrl ) ;
2015-09-03 08:18:17 -06:00
nvme_set_irq_hints ( dev ) ;
2015-06-01 14:28:14 -06:00
}
2013-04-16 11:22:36 -04:00
/*
* Set up the I/O tag set on first use and schedule the namespace scan work.
*
* Return : error value if an error occurred setting up the queues or calling
* Identify Device . 0 if these succeeded , even if adding some of the
* namespaces failed . At the moment , these failures are silent . TBD which
* failures should be reported .
*
* NOTE(review): as written, every path through this function returns 0,
* including the path where blk_mq_alloc_tag_set() fails.
*/
2012-12-21 15:13:49 -08:00
static int nvme_dev_add ( struct nvme_dev * dev )
2011-01-20 12:50:14 -05:00
{
2015-11-28 15:39:07 +01:00
/* Only build the tag set the first time; later calls reuse it. */
if ( ! dev - > ctrl . tagset ) {
2015-06-08 10:08:15 -06:00
dev - > tagset . ops = & nvme_mq_ops ;
/* online_queues counts the admin queue too, hence the - 1. */
dev - > tagset . nr_hw_queues = dev - > online_queues - 1 ;
dev - > tagset . timeout = NVME_IO_TIMEOUT ;
dev - > tagset . numa_node = dev_to_node ( dev - > dev ) ;
/* One less than the queue depth, capped at BLK_MQ_MAX_DEPTH. */
dev - > tagset . queue_depth =
2014-11-04 08:20:14 -07:00
min_t ( int , dev - > q_depth , BLK_MQ_MAX_DEPTH ) - 1 ;
2015-06-08 10:08:15 -06:00
dev - > tagset . cmd_size = nvme_cmd_size ( dev ) ;
dev - > tagset . flags = BLK_MQ_F_SHOULD_MERGE ;
dev - > tagset . driver_data = dev ;
2011-01-20 12:50:14 -05:00
2015-06-08 10:08:15 -06:00
/* Allocation failure is not reported: return 0 without scheduling a scan. */
if ( blk_mq_alloc_tag_set ( & dev - > tagset ) )
return 0 ;
2015-11-28 15:39:07 +01:00
dev - > ctrl . tagset = & dev - > tagset ;
2015-06-08 10:08:15 -06:00
}
2015-06-01 14:28:14 -06:00
/* Namespace scanning happens asynchronously in nvme_dev_scan(). */
schedule_work ( & dev - > scan_work ) ;
/* NOTE(review): the prose lines below are version-control annotation text, not C source. */
NVMe: Metadata format support
Adds support for NVMe metadata formats and exposes block devices for
all namespaces regardless of their format. Namespace formats that are
unusable will have disk capacity set to 0, but a handle to the block
device is created to simplify device management. A namespace is not
usable when the format requires host interleave block and metadata in
single buffer, has no provisioned storage, or has better data but failed
to register with blk integrity.
The namespace has to be scanned in two phases to support separate
metadata formats. The first establishes the sector size and capacity
prior to invoking add_disk. If metadata is required, the capacity will
be temporarilly set to 0 until it can be revalidated and registered with
the integrity extenstions after add_disk completes.
The driver relies on the integrity extensions to provide the metadata
buffer. NVMe requires this be a single physically contiguous region,
so only one integrity segment is allowed per command. If the metadata
is used for T10 PI, the driver provides mappings to save and restore
the reftag physical block translation. The driver provides no-op
functions for generate and verify if metadata is not used for protection
information. This way the setup is always provided by the block layer.
If a request does not supply a required metadata buffer, the command
is failed with bad address. This could only happen if a user manually
disables verify/generate on such a disk. The only exception to where
this is okay is if the controller is capable of stripping/generating
the metadata, which is possible on some types of formats.
The metadata scatter gather list now occupies the spot in the nvme_iod
that used to be used to link retryable IOD's, but we don't do that
anymore, so the field was unused.
Signed-off-by: Keith Busch <keith.busch@intel.com>
2015-02-19 13:39:03 -07:00
return 0 ;
2011-01-20 12:50:14 -05:00
}
2013-07-15 15:02:19 -06:00
static int nvme_dev_map ( struct nvme_dev * dev )
{
2014-03-24 10:46:25 -06:00
u64 cap ;
2013-07-15 15:02:19 -06:00
int bars , result = - ENOMEM ;
2015-05-22 11:12:39 +02:00
struct pci_dev * pdev = to_pci_dev ( dev - > dev ) ;
2013-07-15 15:02:19 -06:00
if ( pci_enable_device_mem ( pdev ) )
return result ;
dev - > entry [ 0 ] . vector = pdev - > irq ;
pci_set_master ( pdev ) ;
bars = pci_select_bars ( pdev , IORESOURCE_MEM ) ;
2014-11-14 09:50:19 -07:00
if ( ! bars )
goto disable_pci ;
2013-07-15 15:02:19 -06:00
if ( pci_request_selected_regions ( pdev , bars , " nvme " ) )
goto disable_pci ;
2015-05-22 11:12:39 +02:00
if ( dma_set_mask_and_coherent ( dev - > dev , DMA_BIT_MASK ( 64 ) ) & &
dma_set_mask_and_coherent ( dev - > dev , DMA_BIT_MASK ( 32 ) ) )
2013-06-26 23:49:11 +01:00
goto disable ;
2013-07-15 15:02:19 -06:00
dev - > bar = ioremap ( pci_resource_start ( pdev , 0 ) , 8192 ) ;
if ( ! dev - > bar )
goto disable ;
2014-11-14 09:49:26 -07:00
2015-11-20 08:58:10 +01:00
if ( readl ( dev - > bar + NVME_REG_CSTS ) = = - 1 ) {
2013-12-10 13:10:39 -07:00
result = - ENODEV ;
goto unmap ;
}
2014-11-14 09:49:26 -07:00
/*
* Some devices don ' t advertse INTx interrupts , pre - enable a single
* MSIX vec for setup . We ' ll adjust this later .
*/
if ( ! pdev - > irq ) {
result = pci_enable_msix ( pdev , dev - > entry , 1 ) ;
if ( result < 0 )
goto unmap ;
}
2015-11-20 08:58:10 +01:00
cap = lo_hi_readq ( dev - > bar + NVME_REG_CAP ) ;
2014-03-24 10:46:25 -06:00
dev - > q_depth = min_t ( int , NVME_CAP_MQES ( cap ) + 1 , NVME_Q_DEPTH ) ;
dev - > db_stride = 1 < < NVME_CAP_STRIDE ( cap ) ;
2015-11-20 08:58:10 +01:00
dev - > dbs = dev - > bar + 4096 ;
if ( readl ( dev - > bar + NVME_REG_VS ) > = NVME_VS ( 1 , 2 ) )
2015-07-20 10:14:09 -06:00
dev - > cmb = nvme_map_cmb ( dev ) ;
2013-07-15 15:02:19 -06:00
return 0 ;
2013-12-10 13:10:39 -07:00
unmap :
iounmap ( dev - > bar ) ;
dev - > bar = NULL ;
2013-07-15 15:02:19 -06:00
disable :
pci_release_regions ( pdev ) ;
disable_pci :
pci_disable_device ( pdev ) ;
return result ;
}
static void nvme_dev_unmap ( struct nvme_dev * dev )
{
2015-05-22 11:12:39 +02:00
struct pci_dev * pdev = to_pci_dev ( dev - > dev ) ;
if ( pdev - > msi_enabled )
pci_disable_msi ( pdev ) ;
else if ( pdev - > msix_enabled )
pci_disable_msix ( pdev ) ;
2013-07-15 15:02:19 -06:00
if ( dev - > bar ) {
iounmap ( dev - > bar ) ;
dev - > bar = NULL ;
2015-05-22 11:12:39 +02:00
pci_release_regions ( pdev ) ;
2013-07-15 15:02:19 -06:00
}
2015-05-22 11:12:39 +02:00
if ( pci_is_enabled ( pdev ) )
pci_disable_device ( pdev ) ;
2013-07-15 15:02:19 -06:00
}
2013-12-10 13:10:40 -07:00
/*
 * Shared state for one batch of asynchronous queue deletions; set up on the
 * stack by nvme_disable_io_queues().
 */
struct nvme_delq_ctx {
/* Task waiting in nvme_wait_dq(); nvme_put_dq() wakes it when non-NULL. */
struct task_struct * waiter ;
/* kthread worker running the deletion work; flushed by nvme_wait_dq() on timeout. */
struct kthread_worker * worker ;
/* Raised by nvme_get_dq(), dropped by nvme_put_dq(); nvme_wait_dq() waits for zero. */
atomic_t refcount ;
} ;
static void nvme_wait_dq ( struct nvme_delq_ctx * dq , struct nvme_dev * dev )
{
dq - > waiter = current ;
mb ( ) ;
for ( ; ; ) {
set_current_state ( TASK_KILLABLE ) ;
if ( ! atomic_read ( & dq - > refcount ) )
break ;
if ( ! schedule_timeout ( ADMIN_TIMEOUT ) | |
fatal_signal_pending ( current ) ) {
2015-01-07 18:55:50 -07:00
/*
* Disable the controller first since we can ' t trust it
* at this point , but leave the admin queue enabled
* until all queue deletion requests are flushed .
* FIXME : This may take a while if there are more h / w
* queues than admin tags .
*/
2013-12-10 13:10:40 -07:00
set_current_state ( TASK_RUNNING ) ;
2015-11-28 15:03:49 +01:00
nvme_disable_ctrl ( & dev - > ctrl ,
2015-11-20 08:58:10 +01:00
lo_hi_readq ( dev - > bar + NVME_REG_CAP ) ) ;
2015-01-07 18:55:50 -07:00
nvme_clear_queue ( dev - > queues [ 0 ] ) ;
2013-12-10 13:10:40 -07:00
flush_kthread_worker ( dq - > worker ) ;
2015-01-07 18:55:50 -07:00
nvme_disable_queue ( dev , 0 ) ;
2013-12-10 13:10:40 -07:00
return ;
}
}
set_current_state ( TASK_RUNNING ) ;
}
static void nvme_put_dq ( struct nvme_delq_ctx * dq )
{
atomic_dec ( & dq - > refcount ) ;
if ( dq - > waiter )
wake_up_process ( dq - > waiter ) ;
}
static struct nvme_delq_ctx * nvme_get_dq ( struct nvme_delq_ctx * dq )
{
atomic_inc ( & dq - > refcount ) ;
return dq ;
}
static void nvme_del_queue_end ( struct nvme_queue * nvmeq )
{
struct nvme_delq_ctx * dq = nvmeq - > cmdinfo . ctx ;
nvme_put_dq ( dq ) ;
2015-11-20 08:38:13 -07:00
spin_lock_irq ( & nvmeq - > q_lock ) ;
nvme_process_cq ( nvmeq ) ;
spin_unlock_irq ( & nvmeq - > q_lock ) ;
2013-12-10 13:10:40 -07:00
}
/*
 * Submit an asynchronous delete-queue admin command with @opcode for
 * @nvmeq.  @fn is installed as the work function of the queue's
 * cmdinfo.work before the command is submitted.
 *
 * Returns the result of nvme_submit_admin_async_cmd().
 */
static int adapter_async_del_queue(struct nvme_queue *nvmeq, u8 opcode,
						kthread_work_func_t fn)
{
	struct nvme_command cmd;

	memset(&cmd, 0, sizeof(cmd));
	cmd.delete_queue.qid = cpu_to_le16(nvmeq->qid);
	cmd.delete_queue.opcode = opcode;

	init_kthread_work(&nvmeq->cmdinfo.work, fn);
	return nvme_submit_admin_async_cmd(nvmeq->dev, &cmd, &nvmeq->cmdinfo,
								ADMIN_TIMEOUT);
}
/* Work function installed by nvme_delete_cq(): ends the deletion of the owning queue. */
static void nvme_del_cq_work_handler(struct kthread_work *work)
{
	nvme_del_queue_end(container_of(work, struct nvme_queue,
							cmdinfo.work));
}
static int nvme_delete_cq ( struct nvme_queue * nvmeq )
{
return adapter_async_del_queue ( nvmeq , nvme_admin_delete_cq ,
nvme_del_cq_work_handler ) ;
}
static void nvme_del_sq_work_handler ( struct kthread_work * work )
{
struct nvme_queue * nvmeq = container_of ( work , struct nvme_queue ,
cmdinfo . work ) ;
int status = nvmeq - > cmdinfo . status ;
if ( ! status )
status = nvme_delete_cq ( nvmeq ) ;
if ( status )
nvme_del_queue_end ( nvmeq ) ;
}
static int nvme_delete_sq ( struct nvme_queue * nvmeq )
{
return adapter_async_del_queue ( nvmeq , nvme_admin_delete_sq ,
nvme_del_sq_work_handler ) ;
}
/*
 * First work item of a queue deletion: submit the delete-SQ command, and
 * end the deletion immediately if the submission fails.
 */
static void nvme_del_queue_start(struct kthread_work *work)
{
	struct nvme_queue *nvmeq = container_of(work, struct nvme_queue,
							cmdinfo.work);

	if (!nvme_delete_sq(nvmeq))
		return;

	nvme_del_queue_end(nvmeq);
}
static void nvme_disable_io_queues ( struct nvme_dev * dev )
{
int i ;
DEFINE_KTHREAD_WORKER_ONSTACK ( worker ) ;
struct nvme_delq_ctx dq ;
struct task_struct * kworker_task = kthread_run ( kthread_worker_fn ,
2015-11-26 10:06:56 +01:00
& worker , " nvme%d " , dev - > ctrl . instance ) ;
2013-12-10 13:10:40 -07:00
if ( IS_ERR ( kworker_task ) ) {
2015-05-22 11:12:39 +02:00
dev_err ( dev - > dev ,
2013-12-10 13:10:40 -07:00
" Failed to create queue del task \n " ) ;
for ( i = dev - > queue_count - 1 ; i > 0 ; i - - )
nvme_disable_queue ( dev , i ) ;
return ;
}
dq . waiter = NULL ;
atomic_set ( & dq . refcount , 0 ) ;
dq . worker = & worker ;
for ( i = dev - > queue_count - 1 ; i > 0 ; i - - ) {
2014-11-04 08:20:14 -07:00
struct nvme_queue * nvmeq = dev - > queues [ i ] ;
2013-12-10 13:10:40 -07:00
if ( nvme_suspend_queue ( nvmeq ) )
continue ;
nvmeq - > cmdinfo . ctx = nvme_get_dq ( & dq ) ;
nvmeq - > cmdinfo . worker = dq . worker ;
init_kthread_work ( & nvmeq - > cmdinfo . work , nvme_del_queue_start ) ;
queue_kthread_work ( dq . worker , & nvmeq - > cmdinfo . work ) ;
}
nvme_wait_dq ( & dq , dev ) ;
kthread_stop ( kworker_task ) ;
}
2015-10-22 14:03:33 +02:00
static int nvme_dev_list_add ( struct nvme_dev * dev )
{
bool start_thread = false ;
spin_lock ( & dev_list_lock ) ;
if ( list_empty ( & dev_list ) & & IS_ERR_OR_NULL ( nvme_thread ) ) {
start_thread = true ;
nvme_thread = NULL ;
}
list_add ( & dev - > node , & dev_list ) ;
spin_unlock ( & dev_list_lock ) ;
if ( start_thread ) {
nvme_thread = kthread_run ( nvme_kthread , NULL , " nvme " ) ;
wake_up_all ( & nvme_kthread_wait ) ;
} else
wait_event_killable ( nvme_kthread_wait , nvme_thread ) ;
if ( IS_ERR_OR_NULL ( nvme_thread ) )
return nvme_thread ? PTR_ERR ( nvme_thread ) : - EINTR ;
return 0 ;
}
2014-04-07 17:10:11 -06:00
/*
* Remove the node from the device list and check
* for whether or not we need to stop the nvme_thread .
*/
static void nvme_dev_list_remove ( struct nvme_dev * dev )
{
struct task_struct * tmp = NULL ;
spin_lock ( & dev_list_lock ) ;
list_del_init ( & dev - > node ) ;
if ( list_empty ( & dev_list ) & & ! IS_ERR_OR_NULL ( nvme_thread ) ) {
tmp = nvme_thread ;
nvme_thread = NULL ;
}
spin_unlock ( & dev_list_lock ) ;
if ( tmp )
kthread_stop ( tmp ) ;
}
2015-01-07 18:55:52 -07:00
static void nvme_freeze_queues ( struct nvme_dev * dev )
{
struct nvme_ns * ns ;
2015-11-28 15:39:07 +01:00
list_for_each_entry ( ns , & dev - > ctrl . namespaces , list ) {
2015-01-07 18:55:52 -07:00
blk_mq_freeze_queue_start ( ns - > queue ) ;
2015-05-07 09:38:14 +02:00
spin_lock_irq ( ns - > queue - > queue_lock ) ;
2015-01-07 18:55:52 -07:00
queue_flag_set ( QUEUE_FLAG_STOPPED , ns - > queue ) ;
2015-05-07 09:38:14 +02:00
spin_unlock_irq ( ns - > queue - > queue_lock ) ;
2015-01-07 18:55:52 -07:00
blk_mq_cancel_requeue_work ( ns - > queue ) ;
blk_mq_stop_hw_queues ( ns - > queue ) ;
}
}
static void nvme_unfreeze_queues ( struct nvme_dev * dev )
{
struct nvme_ns * ns ;
2015-11-28 15:39:07 +01:00
list_for_each_entry ( ns , & dev - > ctrl . namespaces , list ) {
2015-01-07 18:55:52 -07:00
queue_flag_clear_unlocked ( QUEUE_FLAG_STOPPED , ns - > queue ) ;
blk_mq_unfreeze_queue ( ns - > queue ) ;
blk_mq_start_stopped_hw_queues ( ns - > queue , true ) ;
blk_mq_kick_requeue_list ( ns - > queue ) ;
}
}
2013-07-15 15:02:21 -06:00
static void nvme_dev_shutdown ( struct nvme_dev * dev )
2011-01-20 12:50:14 -05:00
{
2013-07-15 15:02:20 -06:00
int i ;
2014-06-25 11:18:12 -06:00
u32 csts = - 1 ;
2013-07-15 15:02:20 -06:00
2014-04-07 17:10:11 -06:00
nvme_dev_list_remove ( dev ) ;
2011-03-02 18:37:18 -05:00
2015-11-26 12:21:29 +01:00
mutex_lock ( & dev - > shutdown_lock ) ;
2015-01-07 18:55:52 -07:00
if ( dev - > bar ) {
nvme_freeze_queues ( dev ) ;
2015-11-20 08:58:10 +01:00
csts = readl ( dev - > bar + NVME_REG_CSTS ) ;
2015-01-07 18:55:52 -07:00
}
2014-06-25 11:18:12 -06:00
if ( csts & NVME_CSTS_CFS | | ! ( csts & NVME_CSTS_RDY ) ) {
2013-12-10 13:10:40 -07:00
for ( i = dev - > queue_count - 1 ; i > = 0 ; i - - ) {
2014-11-04 08:20:14 -07:00
struct nvme_queue * nvmeq = dev - > queues [ i ] ;
2013-12-10 13:10:40 -07:00
nvme_suspend_queue ( nvmeq ) ;
}
} else {
nvme_disable_io_queues ( dev ) ;
2015-11-28 15:03:49 +01:00
nvme_shutdown_ctrl ( & dev - > ctrl ) ;
2013-12-10 13:10:40 -07:00
nvme_disable_queue ( dev , 0 ) ;
}
2013-07-15 15:02:21 -06:00
nvme_dev_unmap ( dev ) ;
2015-02-19 10:34:48 -07:00
for ( i = dev - > queue_count - 1 ; i > = 0 ; i - - )
nvme_clear_queue ( dev - > queues [ i ] ) ;
2015-11-26 12:21:29 +01:00
mutex_unlock ( & dev - > shutdown_lock ) ;
2013-07-15 15:02:21 -06:00
}
2011-02-10 09:56:01 -05:00
static int nvme_setup_prp_pools ( struct nvme_dev * dev )
{
2015-05-22 11:12:39 +02:00
dev - > prp_page_pool = dma_pool_create ( " prp list page " , dev - > dev ,
2011-02-10 09:56:01 -05:00
PAGE_SIZE , PAGE_SIZE , 0 ) ;
if ( ! dev - > prp_page_pool )
return - ENOMEM ;
2011-02-10 10:30:34 -05:00
/* Optimisation for I/Os between 4k and 128k */
2015-05-22 11:12:39 +02:00
dev - > prp_small_pool = dma_pool_create ( " prp list 256 " , dev - > dev ,
2011-02-10 10:30:34 -05:00
256 , 256 , 0 ) ;
if ( ! dev - > prp_small_pool ) {
dma_pool_destroy ( dev - > prp_page_pool ) ;
return - ENOMEM ;
}
2011-02-10 09:56:01 -05:00
return 0 ;
}
static void nvme_release_prp_pools ( struct nvme_dev * dev )
{
dma_pool_destroy ( dev - > prp_page_pool ) ;
2011-02-10 10:30:34 -05:00
dma_pool_destroy ( dev - > prp_small_pool ) ;
2011-02-10 09:56:01 -05:00
}
2015-11-26 10:54:19 +01:00
static void nvme_pci_free_ctrl ( struct nvme_ctrl * ctrl )
2013-02-19 10:17:58 -07:00
{
2015-11-26 10:54:19 +01:00
struct nvme_dev * dev = to_nvme_dev ( ctrl ) ;
2014-01-31 16:53:39 -07:00
2015-05-22 11:12:39 +02:00
put_device ( dev - > dev ) ;
2015-06-08 10:08:13 -06:00
if ( dev - > tagset . tags )
blk_mq_free_tag_set ( & dev - > tagset ) ;
2015-11-26 10:06:56 +01:00
if ( dev - > ctrl . admin_q )
blk_put_queue ( dev - > ctrl . admin_q ) ;
2013-02-19 10:17:58 -07:00
kfree ( dev - > queues ) ;
kfree ( dev - > entry ) ;
kfree ( dev ) ;
}
2015-10-03 09:49:23 +02:00
static void nvme_probe_work ( struct work_struct * work )
2013-07-15 15:02:21 -06:00
{
2015-10-03 09:49:23 +02:00
struct nvme_dev * dev = container_of ( work , struct nvme_dev , probe_work ) ;
int result ;
2013-07-15 15:02:21 -06:00
result = nvme_dev_map ( dev ) ;
if ( result )
2015-10-03 09:49:23 +02:00
goto out ;
2013-07-15 15:02:21 -06:00
result = nvme_configure_admin_queue ( dev ) ;
if ( result )
goto unmap ;
2014-11-04 08:20:14 -07:00
nvme_init_queue ( dev - > queues [ 0 ] , 0 ) ;
2015-01-07 18:55:50 -07:00
result = nvme_alloc_admin_tags ( dev ) ;
if ( result )
goto disable ;
2014-04-07 17:10:11 -06:00
2015-10-16 07:58:46 +02:00
result = nvme_init_identify ( & dev - > ctrl ) ;
if ( result )
goto free_tags ;
2013-07-15 15:02:21 -06:00
result = nvme_setup_io_queues ( dev ) ;
2014-06-23 14:25:35 -06:00
if ( result )
2015-01-07 18:55:50 -07:00
goto free_tags ;
2013-07-15 15:02:21 -06:00
2015-11-26 10:06:56 +01:00
dev - > ctrl . event_limit = 1 ;
2015-10-03 09:49:23 +02:00
2015-10-22 14:03:33 +02:00
result = nvme_dev_list_add ( dev ) ;
if ( result )
goto remove ;
2015-10-02 18:51:31 +02:00
/*
* Keep the controller around but remove all namespaces if we don ' t have
* any working I / O queue .
*/
2015-10-03 09:49:23 +02:00
if ( dev - > online_queues < 2 ) {
dev_warn ( dev - > dev , " IO queues not created \n " ) ;
2015-11-28 15:39:07 +01:00
nvme_remove_namespaces ( & dev - > ctrl ) ;
2015-10-03 09:49:23 +02:00
} else {
nvme_unfreeze_queues ( dev ) ;
nvme_dev_add ( dev ) ;
}
return ;
2013-07-15 15:02:21 -06:00
2015-10-22 14:03:33 +02:00
remove :
nvme_dev_list_remove ( dev ) ;
2015-01-07 18:55:50 -07:00
free_tags :
nvme_dev_remove_admin ( dev ) ;
2015-11-26 10:06:56 +01:00
blk_put_queue ( dev - > ctrl . admin_q ) ;
dev - > ctrl . admin_q = NULL ;
2015-06-08 10:08:13 -06:00
dev - > queues [ 0 ] - > tags = NULL ;
2013-07-15 15:02:21 -06:00
disable :
2013-12-16 13:50:00 -05:00
nvme_disable_queue ( dev , 0 ) ;
2013-07-15 15:02:21 -06:00
unmap :
nvme_dev_unmap ( dev ) ;
2015-10-03 09:49:23 +02:00
out :
if ( ! work_busy ( & dev - > reset_work ) )
nvme_dead_ctrl ( dev ) ;
2013-07-15 15:02:21 -06:00
}
2013-12-10 13:10:36 -07:00
static int nvme_remove_dead_ctrl ( void * arg )
{
struct nvme_dev * dev = ( struct nvme_dev * ) arg ;
2015-05-22 11:12:39 +02:00
struct pci_dev * pdev = to_pci_dev ( dev - > dev ) ;
2013-12-10 13:10:36 -07:00
if ( pci_get_drvdata ( pdev ) )
2014-06-23 15:24:53 -06:00
pci_stop_and_remove_bus_device_locked ( pdev ) ;
2015-11-26 10:54:19 +01:00
nvme_put_ctrl ( & dev - > ctrl ) ;
2013-12-10 13:10:36 -07:00
return 0 ;
}
2015-06-18 13:36:39 -06:00
/*
 * Give up on a controller that could not be brought back: log a warning and
 * start a kernel thread running nvme_remove_dead_ctrl() to remove the device.
 */
static void nvme_dead_ctrl ( struct nvme_dev * dev )
{
dev_warn ( dev - > dev , " Device failed to resume \n " ) ;
2015-11-26 10:54:19 +01:00
/* Reference held on behalf of the removal thread, which drops it when done. */
kref_get ( & dev - > ctrl . kref ) ;
2015-06-18 13:36:39 -06:00
/* If the thread cannot be started, release that reference here instead. */
if ( IS_ERR ( kthread_run ( nvme_remove_dead_ctrl , dev , " nvme%d " ,
2015-11-26 10:06:56 +01:00
dev - > ctrl . instance ) ) ) {
2015-06-18 13:36:39 -06:00
dev_err ( dev - > dev ,
" Failed to start controller remove task \n " ) ;
2015-11-26 10:54:19 +01:00
nvme_put_ctrl ( & dev - > ctrl ) ;
2015-06-18 13:36:39 -06:00
}
}
2015-10-02 17:41:18 +02:00
static void nvme_reset_work ( struct work_struct * ws )
2013-12-10 13:10:36 -07:00
{
2015-10-02 17:41:18 +02:00
struct nvme_dev * dev = container_of ( ws , struct nvme_dev , reset_work ) ;
2015-06-08 10:08:15 -06:00
bool in_probe = work_busy ( & dev - > probe_work ) ;
2013-12-10 13:10:36 -07:00
nvme_dev_shutdown ( dev ) ;
2015-06-08 10:08:15 -06:00
/* Synchronize with device probe so that work will see failure status
* and exit gracefully without trying to schedule another reset */
flush_work ( & dev - > probe_work ) ;
/* Fail this device if reset occured during probe to avoid
* infinite initialization loops . */
if ( in_probe ) {
2015-06-18 13:36:39 -06:00
nvme_dead_ctrl ( dev ) ;
2015-06-08 10:08:15 -06:00
return ;
2013-12-10 13:10:36 -07:00
}
2015-06-08 10:08:15 -06:00
/* Schedule device resume asynchronously so the reset work is available
* to cleanup errors that may occur during reinitialization */
schedule_work ( & dev - > probe_work ) ;
2013-12-10 13:10:36 -07:00
}
2015-10-02 18:49:23 +02:00
static int __nvme_reset ( struct nvme_dev * dev )
2014-03-07 10:24:49 -05:00
{
2015-10-02 18:49:23 +02:00
if ( work_pending ( & dev - > reset_work ) )
return - EBUSY ;
list_del_init ( & dev - > node ) ;
queue_work ( nvme_workq , & dev - > reset_work ) ;
return 0 ;
2014-03-07 10:24:49 -05:00
}
2015-06-05 10:30:08 -06:00
static int nvme_reset ( struct nvme_dev * dev )
{
2015-10-02 18:49:23 +02:00
int ret ;
2015-06-05 10:30:08 -06:00
2015-11-26 10:06:56 +01:00
if ( ! dev - > ctrl . admin_q | | blk_queue_dying ( dev - > ctrl . admin_q ) )
2015-06-05 10:30:08 -06:00
return - ENODEV ;
spin_lock ( & dev_list_lock ) ;
2015-10-02 18:49:23 +02:00
ret = __nvme_reset ( dev ) ;
2015-06-05 10:30:08 -06:00
spin_unlock ( & dev_list_lock ) ;
if ( ! ret ) {
flush_work ( & dev - > reset_work ) ;
2015-06-08 10:08:15 -06:00
flush_work ( & dev - > probe_work ) ;
2015-06-05 10:30:08 -06:00
return 0 ;
}
return ret ;
}
2015-11-26 10:06:56 +01:00
/*
 * reg_read32 controller op: read the 32-bit register at offset @off from the
 * device's mapped BAR and store it in *@val.  Always returns 0.
 */
static int nvme_pci_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val)
{
	struct nvme_dev *dev = to_nvme_dev(ctrl);

	*val = readl(dev->bar + off);

	return 0;
}
2015-11-28 15:03:49 +01:00
/*
 * reg_write32 controller op: write the 32-bit value @val to the register at
 * offset @off in the device's mapped BAR.  Always returns 0.
 */
static int nvme_pci_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val)
{
	struct nvme_dev *dev = to_nvme_dev(ctrl);

	writel(val, dev->bar + off);

	return 0;
}
2015-11-28 15:37:52 +01:00
/*
 * reg_read64 controller op: read the 64-bit register at offset @off from the
 * device's mapped BAR and store it in *@val.  Always returns 0.
 */
static int nvme_pci_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val)
{
	struct nvme_dev *dev = to_nvme_dev(ctrl);

	*val = readq(dev->bar + off);

	return 0;
}
2015-11-28 15:39:07 +01:00
static bool nvme_pci_io_incapable ( struct nvme_ctrl * ctrl )
{
struct nvme_dev * dev = to_nvme_dev ( ctrl ) ;
return ! dev - > bar | | dev - > online_queues < 2 ;
}
2015-11-28 15:40:19 +01:00
/*
 * reset_ctrl controller op: reset the PCI device that owns @ctrl and return
 * whatever nvme_reset() reports.
 */
static int nvme_pci_reset_ctrl(struct nvme_ctrl *ctrl)
{
	struct nvme_dev *ndev = to_nvme_dev(ctrl);

	return nvme_reset(ndev);
}
2015-11-26 10:06:56 +01:00
static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
. reg_read32 = nvme_pci_reg_read32 ,
2015-11-28 15:03:49 +01:00
. reg_write32 = nvme_pci_reg_write32 ,
2015-11-28 15:37:52 +01:00
. reg_read64 = nvme_pci_reg_read64 ,
2015-11-28 15:39:07 +01:00
. io_incapable = nvme_pci_io_incapable ,
2015-11-28 15:40:19 +01:00
. reset_ctrl = nvme_pci_reset_ctrl ,
2015-11-26 10:54:19 +01:00
. free_ctrl = nvme_pci_free_ctrl ,
2015-11-26 10:06:56 +01:00
} ;
2012-12-21 15:13:49 -08:00
static int nvme_probe ( struct pci_dev * pdev , const struct pci_device_id * id )
2011-01-20 12:50:14 -05:00
{
2014-11-04 08:20:14 -07:00
int node , result = - ENOMEM ;
2011-01-20 12:50:14 -05:00
struct nvme_dev * dev ;
2014-11-04 08:20:14 -07:00
node = dev_to_node ( & pdev - > dev ) ;
if ( node = = NUMA_NO_NODE )
set_dev_node ( & pdev - > dev , 0 ) ;
dev = kzalloc_node ( sizeof ( * dev ) , GFP_KERNEL , node ) ;
2011-01-20 12:50:14 -05:00
if ( ! dev )
return - ENOMEM ;
2014-11-04 08:20:14 -07:00
dev - > entry = kzalloc_node ( num_possible_cpus ( ) * sizeof ( * dev - > entry ) ,
GFP_KERNEL , node ) ;
2011-01-20 12:50:14 -05:00
if ( ! dev - > entry )
goto free ;
2014-11-04 08:20:14 -07:00
dev - > queues = kzalloc_node ( ( num_possible_cpus ( ) + 1 ) * sizeof ( void * ) ,
GFP_KERNEL , node ) ;
2011-01-20 12:50:14 -05:00
if ( ! dev - > queues )
goto free ;
2015-05-22 11:12:39 +02:00
dev - > dev = get_device ( & pdev - > dev ) ;
2013-12-10 13:10:36 -07:00
pci_set_drvdata ( pdev , dev ) ;
2015-11-26 10:06:56 +01:00
2015-11-28 15:40:19 +01:00
INIT_LIST_HEAD ( & dev - > node ) ;
INIT_WORK ( & dev - > scan_work , nvme_dev_scan ) ;
INIT_WORK ( & dev - > probe_work , nvme_probe_work ) ;
INIT_WORK ( & dev - > reset_work , nvme_reset_work ) ;
2015-11-26 12:21:29 +01:00
mutex_init ( & dev - > shutdown_lock ) ;
2015-11-26 10:06:56 +01:00
2015-11-28 15:40:19 +01:00
result = nvme_setup_prp_pools ( dev ) ;
2012-02-21 16:50:53 -07:00
if ( result )
2014-08-19 19:15:59 -06:00
goto put_pci ;
2011-01-20 12:50:14 -05:00
2015-11-28 15:40:19 +01:00
result = nvme_init_ctrl ( & dev - > ctrl , & pdev - > dev , & nvme_pci_ctrl_ops ,
id - > driver_data ) ;
2011-02-10 09:56:01 -05:00
if ( result )
2015-02-12 15:33:00 -07:00
goto release_pools ;
2011-02-15 16:28:20 -05:00
2015-02-12 15:33:00 -07:00
schedule_work ( & dev - > probe_work ) ;
2011-01-20 12:50:14 -05:00
return 0 ;
2013-07-15 15:02:19 -06:00
release_pools :
2011-02-10 09:56:01 -05:00
nvme_release_prp_pools ( dev ) ;
2014-08-19 19:15:59 -06:00
put_pci :
2015-05-22 11:12:39 +02:00
put_device ( dev - > dev ) ;
2011-01-20 12:50:14 -05:00
free :
kfree ( dev - > queues ) ;
kfree ( dev - > entry ) ;
kfree ( dev ) ;
return result ;
}
2014-05-02 10:40:43 -06:00
/*
 * reset_notify PCI error-handler callback.
 *
 * @pdev:    PCI device being reset.
 * @prepare: true for the call that precedes the reset, false for the call
 *           that follows it.
 *
 * The device is shut down ahead of the reset and probe work is scheduled
 * afterwards to bring it back up.
 */
static void nvme_reset_notify(struct pci_dev *pdev, bool prepare)
{
	struct nvme_dev *ndev = pci_get_drvdata(pdev);

	if (!prepare) {
		schedule_work(&ndev->probe_work);
		return;
	}

	nvme_dev_shutdown(ndev);
}
2014-01-27 11:29:40 -05:00
/*
 * PCI shutdown callback: shut down the NVMe device stored as @pdev's driver
 * data.
 */
static void nvme_shutdown(struct pci_dev *pdev)
{
	nvme_dev_shutdown(pci_get_drvdata(pdev));
}
2012-12-21 15:13:49 -08:00
static void nvme_remove ( struct pci_dev * pdev )
2011-01-20 12:50:14 -05:00
{
struct nvme_dev * dev = pci_get_drvdata ( pdev ) ;
2013-12-10 13:10:36 -07:00
spin_lock ( & dev_list_lock ) ;
list_del_init ( & dev - > node ) ;
spin_unlock ( & dev_list_lock ) ;
pci_set_drvdata ( pdev , NULL ) ;
2015-02-12 15:33:00 -07:00
flush_work ( & dev - > probe_work ) ;
2013-12-10 13:10:36 -07:00
flush_work ( & dev - > reset_work ) ;
2015-06-01 14:28:14 -06:00
flush_work ( & dev - > scan_work ) ;
2015-11-28 15:39:07 +01:00
nvme_remove_namespaces ( & dev - > ctrl ) ;
2015-06-18 13:36:40 -06:00
nvme_dev_shutdown ( dev ) ;
2014-11-04 08:20:14 -07:00
nvme_dev_remove_admin ( dev ) ;
2013-12-16 13:50:00 -05:00
nvme_free_queues ( dev , 0 ) ;
2015-07-20 10:14:09 -06:00
nvme_release_cmb ( dev ) ;
2013-12-10 13:10:36 -07:00
nvme_release_prp_pools ( dev ) ;
2015-11-26 10:54:19 +01:00
nvme_put_ctrl ( & dev - > ctrl ) ;
2011-01-20 12:50:14 -05:00
}
/*
 * PCI error-recovery callbacks that are yet to be implemented.  Each name
 * expands to NULL, so the matching slot in nvme_err_handler is left unset.
 */
# define nvme_error_detected NULL
# define nvme_dump_registers NULL
# define nvme_link_reset NULL
# define nvme_slot_reset NULL
# define nvme_error_resume NULL
2013-07-15 15:02:23 -06:00
2014-02-13 11:19:14 +09:00
# ifdef CONFIG_PM_SLEEP
2013-07-15 15:02:23 -06:00
/*
 * System-sleep suspend callback (built only with CONFIG_PM_SLEEP): shut the
 * NVMe device down.  Always returns 0.
 */
static int nvme_suspend(struct device *dev)
{
	struct nvme_dev *ndev = pci_get_drvdata(to_pci_dev(dev));

	nvme_dev_shutdown(ndev);

	return 0;
}
static int nvme_resume ( struct device * dev )
{
struct pci_dev * pdev = to_pci_dev ( dev ) ;
struct nvme_dev * ndev = pci_get_drvdata ( pdev ) ;
2015-10-02 10:37:29 -06:00
schedule_work ( & ndev - > probe_work ) ;
2013-12-10 13:10:36 -07:00
return 0 ;
2013-07-15 15:02:23 -06:00
}
2014-02-13 11:19:14 +09:00
# endif
2013-07-15 15:02:23 -06:00
/*
 * Power-management operations referenced by nvme_driver: suspend shuts the
 * device down, resume schedules probe work.
 */
static SIMPLE_DEV_PM_OPS ( nvme_dev_pm_ops , nvme_suspend , nvme_resume ) ;
2011-01-20 12:50:14 -05:00
2012-09-07 09:33:17 -07:00
static const struct pci_error_handlers nvme_err_handler = {
2011-01-20 12:50:14 -05:00
. error_detected = nvme_error_detected ,
. mmio_enabled = nvme_dump_registers ,
. link_reset = nvme_link_reset ,
. slot_reset = nvme_slot_reset ,
. resume = nvme_error_resume ,
2014-05-02 10:40:43 -06:00
. reset_notify = nvme_reset_notify ,
2011-01-20 12:50:14 -05:00
} ;
/*
 * PCI class code used by the class-based entry in nvme_id_table.  Defined
 * locally for now; move to pci_ids.h later.
 */
# define PCI_CLASS_STORAGE_EXPRESS 0x010802
2014-03-24 10:11:22 -04:00
static const struct pci_device_id nvme_id_table [ ] = {
2015-11-26 10:07:41 +01:00
{ PCI_VDEVICE ( INTEL , 0x0953 ) ,
. driver_data = NVME_QUIRK_STRIPE_SIZE , } ,
2011-01-20 12:50:14 -05:00
{ PCI_DEVICE_CLASS ( PCI_CLASS_STORAGE_EXPRESS , 0xffffff ) } ,
2015-11-04 00:49:45 +01:00
{ PCI_DEVICE ( PCI_VENDOR_ID_APPLE , 0x2001 ) } ,
2011-01-20 12:50:14 -05:00
{ 0 , }
} ;
MODULE_DEVICE_TABLE ( pci , nvme_id_table ) ;
/*
 * PCI driver description registered by nvme_init(): ties the device ID table
 * to the probe/remove/shutdown callbacks, the power-management operations
 * and the PCI error handlers.
 */
static struct pci_driver nvme_driver = {
. name = " nvme " ,
. id_table = nvme_id_table ,
. probe = nvme_probe ,
2012-12-21 15:13:49 -08:00
. remove = nvme_remove ,
2014-01-27 11:29:40 -05:00
. shutdown = nvme_shutdown ,
2013-07-15 15:02:23 -06:00
. driver = {
. pm = & nvme_dev_pm_ops ,
} ,
2011-01-20 12:50:14 -05:00
. err_handler = & nvme_err_handler ,
} ;
/*
 * Module initialisation: set up the kthread wait queue, create the driver
 * workqueue, initialise the NVMe core and register the PCI driver.
 *
 * Returns 0 on success, -ENOMEM when the workqueue cannot be created, or the
 * error reported by nvme_core_init() / pci_register_driver().  On failure
 * the steps already completed are undone in reverse order.
 */
static int __init nvme_init ( void )
{
2012-07-31 13:31:15 -04:00
int result ;
2011-03-02 18:37:18 -05:00
2014-04-07 17:10:11 -06:00
init_waitqueue_head ( & nvme_kthread_wait ) ;
2011-01-20 12:50:14 -05:00
2013-12-10 13:10:36 -07:00
nvme_workq = create_singlethread_workqueue ( " nvme " ) ;
if ( ! nvme_workq )
2014-04-07 17:10:11 -06:00
return - ENOMEM ;
2013-12-10 13:10:36 -07:00
2015-11-28 15:39:07 +01:00
result = nvme_core_init ( ) ;
2012-07-25 16:05:18 -06:00
if ( result < 0 )
2013-12-10 13:10:36 -07:00
goto kill_workq ;
2011-01-20 12:50:14 -05:00
2014-06-11 11:51:35 -06:00
result = pci_register_driver ( & nvme_driver ) ;
if ( result )
2015-11-28 15:40:19 +01:00
goto core_exit ;
2011-03-02 18:37:18 -05:00
return 0 ;
2011-01-20 12:50:14 -05:00
2015-11-28 15:40:19 +01:00
/* Error unwinding: each label undoes one completed step and falls through. */
core_exit :
2015-11-28 15:39:07 +01:00
nvme_core_exit ( ) ;
2013-12-10 13:10:36 -07:00
kill_workq :
destroy_workqueue ( nvme_workq ) ;
2011-01-20 12:50:14 -05:00
return result ;
}
/*
 * Module exit: undo nvme_init() in reverse order by unregistering the PCI
 * driver, shutting down the NVMe core and destroying the driver workqueue.
 */
static void __exit nvme_exit(void)
{
	pci_unregister_driver(&nvme_driver);
	nvme_core_exit();
	destroy_workqueue(nvme_workq);

	/* nvme_thread must not hold a valid task pointer by this point. */
	BUG_ON(!IS_ERR_OR_NULL(nvme_thread));

	/* NOTE(review): defined elsewhere; presumably structure-size checks. */
	_nvme_check_size();
}
/*
 * Module metadata and entry points: nvme_init() is the module's init
 * function and nvme_exit() its exit function.
 */
MODULE_AUTHOR ( " Matthew Wilcox <willy@linux.intel.com> " ) ;
MODULE_LICENSE ( " GPL " ) ;
2014-11-21 15:16:32 -07:00
MODULE_VERSION ( " 1.0 " ) ;
2011-01-20 12:50:14 -05:00
module_init ( nvme_init ) ;
module_exit ( nvme_exit ) ;