linux/drivers/nvme/host/core.c

2944 lines
73 KiB
C
Raw Normal View History

/*
* NVM Express device driver
* Copyright (c) 2011-2014, Intel Corporation.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/hdreg.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/list_sort.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/pr.h>
#include <linux/ptrace.h>
#include <linux/nvme_ioctl.h>
#include <linux/t10-pi.h>
nvme: Enable autonomous power state transitions NVMe devices can advertise multiple power states. These states can be either "operational" (the device is fully functional but possibly slow) or "non-operational" (the device is asleep until woken up). Some devices can automatically enter a non-operational state when idle for a specified amount of time and then automatically wake back up when needed. The hardware configuration is a table. For each state, an entry in the table indicates the next deeper non-operational state, if any, to autonomously transition to and the idle time required before transitioning. This patch teaches the driver to program APST so that each successive non-operational state will be entered after an idle time equal to 100% of the total latency (entry plus exit) associated with that state. The maximum acceptable latency is controlled using dev_pm_qos (e.g. power/pm_qos_latency_tolerance_us in sysfs); non-operational states with total latency greater than this value will not be used. As a special case, setting the latency tolerance to 0 will disable APST entirely. On hardware without APST support, the sysfs file will not be exposed. The latency tolerance for newly-probed devices is set by the module parameter nvme_core.default_ps_max_latency_us. In theory, the device can expose "default" APST table, but this doesn't seem to function correctly on my device (Samsung 950), nor does it seem particularly useful. There is also an optional mechanism by which a configuration can be "saved" so it will be automatically loaded on reset. This can be configured from userspace, but it doesn't seem useful to support in the driver. On my laptop, enabling APST seems to save nearly 1W. The hardware tables can be decoded in userspace with nvme-cli. 'nvme id-ctrl /dev/nvmeN' will show the power state table and 'nvme get-feature -f 0x0c -H /dev/nvme0' will show the current APST configuration. This feature is quirked off on a known-buggy Samsung device. Signed-off-by: Andy Lutomirski <luto@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Sagi Grimberg <sagi@grimberg.me> Signed-off-by: Jens Axboe <axboe@fb.com>
2017-02-07 21:08:45 +03:00
#include <linux/pm_qos.h>
#include <asm/unaligned.h>
#include "nvme.h"
#include "fabrics.h"
#define NVME_MINORS (1U << MINORBITS)
unsigned int admin_timeout = 60;
module_param(admin_timeout, uint, 0644);
MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands");
EXPORT_SYMBOL_GPL(admin_timeout);
unsigned int nvme_io_timeout = 30;
module_param_named(io_timeout, nvme_io_timeout, uint, 0644);
MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O");
EXPORT_SYMBOL_GPL(nvme_io_timeout);
static unsigned char shutdown_timeout = 5;
module_param(shutdown_timeout, byte, 0644);
MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown");
static u8 nvme_max_retries = 5;
module_param_named(max_retries, nvme_max_retries, byte, 0644);
MODULE_PARM_DESC(max_retries, "max number of retries a command may have");
static int nvme_char_major;
module_param(nvme_char_major, int, 0);
static unsigned long default_ps_max_latency_us = 100000;
nvme: Enable autonomous power state transitions NVMe devices can advertise multiple power states. These states can be either "operational" (the device is fully functional but possibly slow) or "non-operational" (the device is asleep until woken up). Some devices can automatically enter a non-operational state when idle for a specified amount of time and then automatically wake back up when needed. The hardware configuration is a table. For each state, an entry in the table indicates the next deeper non-operational state, if any, to autonomously transition to and the idle time required before transitioning. This patch teaches the driver to program APST so that each successive non-operational state will be entered after an idle time equal to 100% of the total latency (entry plus exit) associated with that state. The maximum acceptable latency is controlled using dev_pm_qos (e.g. power/pm_qos_latency_tolerance_us in sysfs); non-operational states with total latency greater than this value will not be used. As a special case, setting the latency tolerance to 0 will disable APST entirely. On hardware without APST support, the sysfs file will not be exposed. The latency tolerance for newly-probed devices is set by the module parameter nvme_core.default_ps_max_latency_us. In theory, the device can expose "default" APST table, but this doesn't seem to function correctly on my device (Samsung 950), nor does it seem particularly useful. There is also an optional mechanism by which a configuration can be "saved" so it will be automatically loaded on reset. This can be configured from userspace, but it doesn't seem useful to support in the driver. On my laptop, enabling APST seems to save nearly 1W. The hardware tables can be decoded in userspace with nvme-cli. 'nvme id-ctrl /dev/nvmeN' will show the power state table and 'nvme get-feature -f 0x0c -H /dev/nvme0' will show the current APST configuration. This feature is quirked off on a known-buggy Samsung device. Signed-off-by: Andy Lutomirski <luto@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Sagi Grimberg <sagi@grimberg.me> Signed-off-by: Jens Axboe <axboe@fb.com>
2017-02-07 21:08:45 +03:00
module_param(default_ps_max_latency_us, ulong, 0644);
MODULE_PARM_DESC(default_ps_max_latency_us,
"max power saving latency for new devices; use PM QOS to change per device");
static bool force_apst;
module_param(force_apst, bool, 0644);
MODULE_PARM_DESC(force_apst, "allow APST for newly enumerated devices even if quirked off");
static bool streams;
module_param(streams, bool, 0644);
MODULE_PARM_DESC(streams, "turn on support for Streams write directives");
struct workqueue_struct *nvme_wq;
EXPORT_SYMBOL_GPL(nvme_wq);
static LIST_HEAD(nvme_ctrl_list);
static DEFINE_SPINLOCK(dev_list_lock);
static DEFINE_IDA(nvme_instance_ida);
static struct class *nvme_class;
static __le32 nvme_get_log_dw10(u8 lid, size_t size)
{
return cpu_to_le32((((size / 4) - 1) << 16) | lid);
}
int nvme_reset_ctrl(struct nvme_ctrl *ctrl)
{
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
return -EBUSY;
if (!queue_work(nvme_wq, &ctrl->reset_work))
return -EBUSY;
return 0;
}
EXPORT_SYMBOL_GPL(nvme_reset_ctrl);
static int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl)
{
int ret;
ret = nvme_reset_ctrl(ctrl);
if (!ret)
flush_work(&ctrl->reset_work);
return ret;
}
static blk_status_t nvme_error_status(struct request *req)
{
switch (nvme_req(req)->status & 0x7ff) {
case NVME_SC_SUCCESS:
return BLK_STS_OK;
case NVME_SC_CAP_EXCEEDED:
return BLK_STS_NOSPC;
case NVME_SC_ONCS_NOT_SUPPORTED:
return BLK_STS_NOTSUPP;
case NVME_SC_WRITE_FAULT:
case NVME_SC_READ_ERROR:
case NVME_SC_UNWRITTEN_BLOCK:
case NVME_SC_ACCESS_DENIED:
case NVME_SC_READ_ONLY:
return BLK_STS_MEDIUM;
case NVME_SC_GUARD_CHECK:
case NVME_SC_APPTAG_CHECK:
case NVME_SC_REFTAG_CHECK:
case NVME_SC_INVALID_PI:
return BLK_STS_PROTECTION;
case NVME_SC_RESERVATION_CONFLICT:
return BLK_STS_NEXUS;
default:
return BLK_STS_IOERR;
}
}
static inline bool nvme_req_needs_retry(struct request *req)
{
if (blk_noretry_request(req))
return false;
if (nvme_req(req)->status & NVME_SC_DNR)
return false;
if (nvme_req(req)->retries >= nvme_max_retries)
return false;
return true;
}
void nvme_complete_rq(struct request *req)
{
if (unlikely(nvme_req(req)->status && nvme_req_needs_retry(req))) {
nvme_req(req)->retries++;
blk_mq_requeue_request(req, true);
return;
}
blk_mq_end_request(req, nvme_error_status(req));
}
EXPORT_SYMBOL_GPL(nvme_complete_rq);
void nvme_cancel_request(struct request *req, void *data, bool reserved)
{
int status;
if (!blk_mq_request_started(req))
return;
dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device,
"Cancelling I/O %d", req->tag);
status = NVME_SC_ABORT_REQ;
if (blk_queue_dying(req->q))
status |= NVME_SC_DNR;
nvme_req(req)->status = status;
blk_mq_complete_request(req);
}
EXPORT_SYMBOL_GPL(nvme_cancel_request);
bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
enum nvme_ctrl_state new_state)
{
enum nvme_ctrl_state old_state;
unsigned long flags;
bool changed = false;
spin_lock_irqsave(&ctrl->lock, flags);
old_state = ctrl->state;
switch (new_state) {
case NVME_CTRL_LIVE:
switch (old_state) {
case NVME_CTRL_NEW:
case NVME_CTRL_RESETTING:
case NVME_CTRL_RECONNECTING:
changed = true;
/* FALLTHRU */
default:
break;
}
break;
case NVME_CTRL_RESETTING:
switch (old_state) {
case NVME_CTRL_NEW:
case NVME_CTRL_LIVE:
changed = true;
/* FALLTHRU */
default:
break;
}
break;
case NVME_CTRL_RECONNECTING:
switch (old_state) {
case NVME_CTRL_LIVE:
changed = true;
/* FALLTHRU */
default:
break;
}
break;
case NVME_CTRL_DELETING:
switch (old_state) {
case NVME_CTRL_LIVE:
case NVME_CTRL_RESETTING:
case NVME_CTRL_RECONNECTING:
changed = true;
/* FALLTHRU */
default:
break;
}
break;
case NVME_CTRL_DEAD:
switch (old_state) {
case NVME_CTRL_DELETING:
changed = true;
/* FALLTHRU */
default:
break;
}
break;
default:
break;
}
if (changed)
ctrl->state = new_state;
spin_unlock_irqrestore(&ctrl->lock, flags);
return changed;
}
EXPORT_SYMBOL_GPL(nvme_change_ctrl_state);
static void nvme_free_ns(struct kref *kref)
{
struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref);
if (ns->ndev)
nvme_nvm_unregister(ns);
put_disk(ns->disk);
ida_simple_remove(&ns->ctrl->ns_ida, ns->instance);
nvme_put_ctrl(ns->ctrl);
kfree(ns);
}
static void nvme_put_ns(struct nvme_ns *ns)
{
kref_put(&ns->kref, nvme_free_ns);
}
struct request *nvme_alloc_request(struct request_queue *q,
struct nvme_command *cmd, unsigned int flags, int qid)
{
unsigned op = nvme_is_write(cmd) ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN;
struct request *req;
if (qid == NVME_QID_ANY) {
req = blk_mq_alloc_request(q, op, flags);
} else {
req = blk_mq_alloc_request_hctx(q, op, flags,
qid ? qid - 1 : 0);
}
if (IS_ERR(req))
return req;
req->cmd_flags |= REQ_FAILFAST_DRIVER;
nvme_req(req)->cmd = cmd;
return req;
}
EXPORT_SYMBOL_GPL(nvme_alloc_request);
static int nvme_toggle_streams(struct nvme_ctrl *ctrl, bool enable)
{
struct nvme_command c;
memset(&c, 0, sizeof(c));
c.directive.opcode = nvme_admin_directive_send;
c.directive.nsid = cpu_to_le32(NVME_NSID_ALL);
c.directive.doper = NVME_DIR_SND_ID_OP_ENABLE;
c.directive.dtype = NVME_DIR_IDENTIFY;
c.directive.tdtype = NVME_DIR_STREAMS;
c.directive.endir = enable ? NVME_DIR_ENDIR : 0;
return nvme_submit_sync_cmd(ctrl->admin_q, &c, NULL, 0);
}
static int nvme_disable_streams(struct nvme_ctrl *ctrl)
{
return nvme_toggle_streams(ctrl, false);
}
static int nvme_enable_streams(struct nvme_ctrl *ctrl)
{
return nvme_toggle_streams(ctrl, true);
}
static int nvme_get_stream_params(struct nvme_ctrl *ctrl,
struct streams_directive_params *s, u32 nsid)
{
struct nvme_command c;
memset(&c, 0, sizeof(c));
memset(s, 0, sizeof(*s));
c.directive.opcode = nvme_admin_directive_recv;
c.directive.nsid = cpu_to_le32(nsid);
c.directive.numd = cpu_to_le32((sizeof(*s) >> 2) - 1);
c.directive.doper = NVME_DIR_RCV_ST_OP_PARAM;
c.directive.dtype = NVME_DIR_STREAMS;
return nvme_submit_sync_cmd(ctrl->admin_q, &c, s, sizeof(*s));
}
static int nvme_configure_directives(struct nvme_ctrl *ctrl)
{
struct streams_directive_params s;
int ret;
if (!(ctrl->oacs & NVME_CTRL_OACS_DIRECTIVES))
return 0;
if (!streams)
return 0;
ret = nvme_enable_streams(ctrl);
if (ret)
return ret;
ret = nvme_get_stream_params(ctrl, &s, NVME_NSID_ALL);
if (ret)
return ret;
ctrl->nssa = le16_to_cpu(s.nssa);
if (ctrl->nssa < BLK_MAX_WRITE_HINTS - 1) {
dev_info(ctrl->device, "too few streams (%u) available\n",
ctrl->nssa);
nvme_disable_streams(ctrl);
return 0;
}
ctrl->nr_streams = min_t(unsigned, ctrl->nssa, BLK_MAX_WRITE_HINTS - 1);
dev_info(ctrl->device, "Using %u streams\n", ctrl->nr_streams);
return 0;
}
/*
* Check if 'req' has a write hint associated with it. If it does, assign
* a valid namespace stream to the write.
*/
static void nvme_assign_write_stream(struct nvme_ctrl *ctrl,
struct request *req, u16 *control,
u32 *dsmgmt)
{
enum rw_hint streamid = req->write_hint;
if (streamid == WRITE_LIFE_NOT_SET || streamid == WRITE_LIFE_NONE)
streamid = 0;
else {
streamid--;
if (WARN_ON_ONCE(streamid > ctrl->nr_streams))
return;
*control |= NVME_RW_DTYPE_STREAMS;
*dsmgmt |= streamid << 16;
}
if (streamid < ARRAY_SIZE(req->q->write_hints))
req->q->write_hints[streamid] += blk_rq_bytes(req) >> 9;
}
static inline void nvme_setup_flush(struct nvme_ns *ns,
struct nvme_command *cmnd)
{
memset(cmnd, 0, sizeof(*cmnd));
cmnd->common.opcode = nvme_cmd_flush;
cmnd->common.nsid = cpu_to_le32(ns->ns_id);
}
static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
struct nvme_command *cmnd)
{
unsigned short segments = blk_rq_nr_discard_segments(req), n = 0;
struct nvme_dsm_range *range;
struct bio *bio;
range = kmalloc_array(segments, sizeof(*range), GFP_ATOMIC);
if (!range)
return BLK_STS_RESOURCE;
__rq_for_each_bio(bio, req) {
u64 slba = nvme_block_nr(ns, bio->bi_iter.bi_sector);
u32 nlb = bio->bi_iter.bi_size >> ns->lba_shift;
range[n].cattr = cpu_to_le32(0);
range[n].nlb = cpu_to_le32(nlb);
range[n].slba = cpu_to_le64(slba);
n++;
}
if (WARN_ON_ONCE(n != segments)) {
kfree(range);
return BLK_STS_IOERR;
}
memset(cmnd, 0, sizeof(*cmnd));
cmnd->dsm.opcode = nvme_cmd_dsm;
cmnd->dsm.nsid = cpu_to_le32(ns->ns_id);
cmnd->dsm.nr = cpu_to_le32(segments - 1);
cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
req->special_vec.bv_page = virt_to_page(range);
req->special_vec.bv_offset = offset_in_page(range);
req->special_vec.bv_len = sizeof(*range) * segments;
req->rq_flags |= RQF_SPECIAL_PAYLOAD;
return BLK_STS_OK;
}
static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
struct request *req, struct nvme_command *cmnd)
{
struct nvme_ctrl *ctrl = ns->ctrl;
u16 control = 0;
u32 dsmgmt = 0;
/*
* If formated with metadata, require the block layer provide a buffer
* unless this namespace is formated such that the metadata can be
* stripped/generated by the controller with PRACT=1.
*/
if (ns && ns->ms &&
(!ns->pi_type || ns->ms != sizeof(struct t10_pi_tuple)) &&
!blk_integrity_rq(req) && !blk_rq_is_passthrough(req))
return BLK_STS_NOTSUPP;
if (req->cmd_flags & REQ_FUA)
control |= NVME_RW_FUA;
if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD))
control |= NVME_RW_LR;
if (req->cmd_flags & REQ_RAHEAD)
dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
memset(cmnd, 0, sizeof(*cmnd));
cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read);
cmnd->rw.nsid = cpu_to_le32(ns->ns_id);
cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
if (req_op(req) == REQ_OP_WRITE && ctrl->nr_streams)
nvme_assign_write_stream(ctrl, req, &control, &dsmgmt);
if (ns->ms) {
switch (ns->pi_type) {
case NVME_NS_DPS_PI_TYPE3:
control |= NVME_RW_PRINFO_PRCHK_GUARD;
break;
case NVME_NS_DPS_PI_TYPE1:
case NVME_NS_DPS_PI_TYPE2:
control |= NVME_RW_PRINFO_PRCHK_GUARD |
NVME_RW_PRINFO_PRCHK_REF;
cmnd->rw.reftag = cpu_to_le32(
nvme_block_nr(ns, blk_rq_pos(req)));
break;
}
if (!blk_integrity_rq(req))
control |= NVME_RW_PRINFO_PRACT;
}
cmnd->rw.control = cpu_to_le16(control);
cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);
return 0;
}
blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
struct nvme_command *cmd)
{
blk_status_t ret = BLK_STS_OK;
if (!(req->rq_flags & RQF_DONTPREP)) {
nvme_req(req)->retries = 0;
nvme_req(req)->flags = 0;
req->rq_flags |= RQF_DONTPREP;
}
switch (req_op(req)) {
case REQ_OP_DRV_IN:
case REQ_OP_DRV_OUT:
memcpy(cmd, nvme_req(req)->cmd, sizeof(*cmd));
break;
case REQ_OP_FLUSH:
nvme_setup_flush(ns, cmd);
break;
case REQ_OP_WRITE_ZEROES:
/* currently only aliased to deallocate for a few ctrls: */
case REQ_OP_DISCARD:
ret = nvme_setup_discard(ns, req, cmd);
break;
case REQ_OP_READ:
case REQ_OP_WRITE:
ret = nvme_setup_rw(ns, req, cmd);
break;
default:
WARN_ON_ONCE(1);
return BLK_STS_IOERR;
}
cmd->common.command_id = req->tag;
return ret;
}
EXPORT_SYMBOL_GPL(nvme_setup_cmd);
/*
* Returns 0 on success. If the result is negative, it's a Linux error code;
* if the result is positive, it's an NVM Express status code
*/
int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
union nvme_result *result, void *buffer, unsigned bufflen,
unsigned timeout, int qid, int at_head, int flags)
{
struct request *req;
int ret;
req = nvme_alloc_request(q, cmd, flags, qid);
if (IS_ERR(req))
return PTR_ERR(req);
req->timeout = timeout ? timeout : ADMIN_TIMEOUT;
if (buffer && bufflen) {
ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL);
if (ret)
goto out;
}
blk_execute_rq(req->q, NULL, req, at_head);
if (result)
*result = nvme_req(req)->result;
if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
ret = -EINTR;
else
ret = nvme_req(req)->status;
out:
blk_mq_free_request(req);
return ret;
}
EXPORT_SYMBOL_GPL(__nvme_submit_sync_cmd);
int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
void *buffer, unsigned bufflen)
{
return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen, 0,
NVME_QID_ANY, 0, 0);
}
EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd);
static void *nvme_add_user_metadata(struct bio *bio, void __user *ubuf,
unsigned len, u32 seed, bool write)
{
struct bio_integrity_payload *bip;
int ret = -ENOMEM;
void *buf;
buf = kmalloc(len, GFP_KERNEL);
if (!buf)
goto out;
ret = -EFAULT;
if (write && copy_from_user(buf, ubuf, len))
goto out_free_meta;
bip = bio_integrity_alloc(bio, GFP_KERNEL, 1);
if (IS_ERR(bip)) {
ret = PTR_ERR(bip);
goto out_free_meta;
}
bip->bip_iter.bi_size = len;
bip->bip_iter.bi_sector = seed;
ret = bio_integrity_add_page(bio, virt_to_page(buf), len,
offset_in_page(buf));
if (ret == len)
return buf;
ret = -ENOMEM;
out_free_meta:
kfree(buf);
out:
return ERR_PTR(ret);
}
static int nvme_submit_user_cmd(struct request_queue *q,
struct nvme_command *cmd, void __user *ubuffer,
unsigned bufflen, void __user *meta_buffer, unsigned meta_len,
u32 meta_seed, u32 *result, unsigned timeout)
{
bool write = nvme_is_write(cmd);
struct nvme_ns *ns = q->queuedata;
struct gendisk *disk = ns ? ns->disk : NULL;
struct request *req;
struct bio *bio = NULL;
void *meta = NULL;
int ret;
req = nvme_alloc_request(q, cmd, 0, NVME_QID_ANY);
if (IS_ERR(req))
return PTR_ERR(req);
req->timeout = timeout ? timeout : ADMIN_TIMEOUT;
if (ubuffer && bufflen) {
ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen,
GFP_KERNEL);
if (ret)
goto out;
bio = req->bio;
bio->bi_disk = disk;
if (disk && meta_buffer && meta_len) {
meta = nvme_add_user_metadata(bio, meta_buffer, meta_len,
meta_seed, write);
if (IS_ERR(meta)) {
ret = PTR_ERR(meta);
goto out_unmap;
}
}
}
blk_execute_rq(req->q, disk, req, 0);
if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
ret = -EINTR;
else
ret = nvme_req(req)->status;
if (result)
*result = le32_to_cpu(nvme_req(req)->result.u32);
if (meta && !ret && !write) {
if (copy_to_user(meta_buffer, meta, meta_len))
ret = -EFAULT;
}
kfree(meta);
out_unmap:
if (bio)
blk_rq_unmap_user(bio);
out:
blk_mq_free_request(req);
return ret;
}
static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status)
{
struct nvme_ctrl *ctrl = rq->end_io_data;
blk_mq_free_request(rq);
if (status) {
dev_err(ctrl->device,
"failed nvme_keep_alive_end_io error=%d\n",
status);
return;
}
schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
}
static int nvme_keep_alive(struct nvme_ctrl *ctrl)
{
struct nvme_command c;
struct request *rq;
memset(&c, 0, sizeof(c));
c.common.opcode = nvme_admin_keep_alive;
rq = nvme_alloc_request(ctrl->admin_q, &c, BLK_MQ_REQ_RESERVED,
NVME_QID_ANY);
if (IS_ERR(rq))
return PTR_ERR(rq);
rq->timeout = ctrl->kato * HZ;
rq->end_io_data = ctrl;
blk_execute_rq_nowait(rq->q, NULL, rq, 0, nvme_keep_alive_end_io);
return 0;
}
static void nvme_keep_alive_work(struct work_struct *work)
{
struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
struct nvme_ctrl, ka_work);
if (nvme_keep_alive(ctrl)) {
/* allocation failure, reset the controller */
dev_err(ctrl->device, "keep-alive failed\n");
nvme_reset_ctrl(ctrl);
return;
}
}
void nvme_start_keep_alive(struct nvme_ctrl *ctrl)
{
if (unlikely(ctrl->kato == 0))
return;
INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work);
schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
}
EXPORT_SYMBOL_GPL(nvme_start_keep_alive);
void nvme_stop_keep_alive(struct nvme_ctrl *ctrl)
{
if (unlikely(ctrl->kato == 0))
return;
cancel_delayed_work_sync(&ctrl->ka_work);
}
EXPORT_SYMBOL_GPL(nvme_stop_keep_alive);
static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
{
struct nvme_command c = { };
int error;
/* gcc-4.4.4 (at least) has issues with initializers and anon unions */
c.identify.opcode = nvme_admin_identify;
c.identify.cns = NVME_ID_CNS_CTRL;
*id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL);
if (!*id)
return -ENOMEM;
error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
sizeof(struct nvme_id_ctrl));
if (error)
kfree(*id);
return error;
}
static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
u8 *eui64, u8 *nguid, uuid_t *uuid)
{
struct nvme_command c = { };
int status;
void *data;
int pos;
int len;
c.identify.opcode = nvme_admin_identify;
c.identify.nsid = cpu_to_le32(nsid);
c.identify.cns = NVME_ID_CNS_NS_DESC_LIST;
data = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
if (!data)
return -ENOMEM;
status = nvme_submit_sync_cmd(ctrl->admin_q, &c, data,
NVME_IDENTIFY_DATA_SIZE);
if (status)
goto free_data;
for (pos = 0; pos < NVME_IDENTIFY_DATA_SIZE; pos += len) {
struct nvme_ns_id_desc *cur = data + pos;
if (cur->nidl == 0)
break;
switch (cur->nidt) {
case NVME_NIDT_EUI64:
if (cur->nidl != NVME_NIDT_EUI64_LEN) {
dev_warn(ctrl->device,
"ctrl returned bogus length: %d for NVME_NIDT_EUI64\n",
cur->nidl);
goto free_data;
}
len = NVME_NIDT_EUI64_LEN;
memcpy(eui64, data + pos + sizeof(*cur), len);
break;
case NVME_NIDT_NGUID:
if (cur->nidl != NVME_NIDT_NGUID_LEN) {
dev_warn(ctrl->device,
"ctrl returned bogus length: %d for NVME_NIDT_NGUID\n",
cur->nidl);
goto free_data;
}
len = NVME_NIDT_NGUID_LEN;
memcpy(nguid, data + pos + sizeof(*cur), len);
break;
case NVME_NIDT_UUID:
if (cur->nidl != NVME_NIDT_UUID_LEN) {
dev_warn(ctrl->device,
"ctrl returned bogus length: %d for NVME_NIDT_UUID\n",
cur->nidl);
goto free_data;
}
len = NVME_NIDT_UUID_LEN;
uuid_copy(uuid, data + pos + sizeof(*cur));
break;
default:
/* Skip unnkown types */
len = cur->nidl;
break;
}
len += sizeof(*cur);
}
free_data:
kfree(data);
return status;
}
static int nvme_identify_ns_list(struct nvme_ctrl *dev, unsigned nsid, __le32 *ns_list)
{
struct nvme_command c = { };
c.identify.opcode = nvme_admin_identify;
c.identify.cns = NVME_ID_CNS_NS_ACTIVE_LIST;
c.identify.nsid = cpu_to_le32(nsid);
return nvme_submit_sync_cmd(dev->admin_q, &c, ns_list, 0x1000);
}
static struct nvme_id_ns *nvme_identify_ns(struct nvme_ctrl *ctrl,
unsigned nsid)
{
struct nvme_id_ns *id;
struct nvme_command c = { };
int error;
/* gcc-4.4.4 (at least) has issues with initializers and anon unions */
c.identify.opcode = nvme_admin_identify;
c.identify.nsid = cpu_to_le32(nsid);
c.identify.cns = NVME_ID_CNS_NS;
id = kmalloc(sizeof(*id), GFP_KERNEL);
if (!id)
return NULL;
error = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id));
if (error) {
dev_warn(ctrl->device, "Identify namespace failed\n");
kfree(id);
return NULL;
}
return id;
}
static int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11,
void *buffer, size_t buflen, u32 *result)
{
struct nvme_command c;
union nvme_result res;
int ret;
memset(&c, 0, sizeof(c));
c.features.opcode = nvme_admin_set_features;
c.features.fid = cpu_to_le32(fid);
c.features.dword11 = cpu_to_le32(dword11);
ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &res,
buffer, buflen, 0, NVME_QID_ANY, 0, 0);
if (ret >= 0 && result)
*result = le32_to_cpu(res.u32);
return ret;
}
int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count)
{
u32 q_count = (*count - 1) | ((*count - 1) << 16);
u32 result;
int status, nr_io_queues;
status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, NULL, 0,
&result);
if (status < 0)
return status;
/*
* Degraded controllers might return an error when setting the queue
* count. We still want to be able to bring them online and offer
* access to the admin queue, as that might be only way to fix them up.
*/
if (status > 0) {
dev_err(ctrl->device, "Could not set queue count (%d)\n", status);
*count = 0;
} else {
nr_io_queues = min(result & 0xffff, result >> 16) + 1;
*count = min(*count, nr_io_queues);
}
return 0;
}
EXPORT_SYMBOL_GPL(nvme_set_queue_count);
static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
{
struct nvme_user_io io;
struct nvme_command c;
unsigned length, meta_len;
void __user *metadata;
if (copy_from_user(&io, uio, sizeof(io)))
return -EFAULT;
if (io.flags)
return -EINVAL;
switch (io.opcode) {
case nvme_cmd_write:
case nvme_cmd_read:
case nvme_cmd_compare:
break;
default:
return -EINVAL;
}
length = (io.nblocks + 1) << ns->lba_shift;
meta_len = (io.nblocks + 1) * ns->ms;
metadata = (void __user *)(uintptr_t)io.metadata;
if (ns->ext) {
length += meta_len;
meta_len = 0;
} else if (meta_len) {
if ((io.metadata & 3) || !io.metadata)
return -EINVAL;
}
memset(&c, 0, sizeof(c));
c.rw.opcode = io.opcode;
c.rw.flags = io.flags;
c.rw.nsid = cpu_to_le32(ns->ns_id);
c.rw.slba = cpu_to_le64(io.slba);
c.rw.length = cpu_to_le16(io.nblocks);
c.rw.control = cpu_to_le16(io.control);
c.rw.dsmgmt = cpu_to_le32(io.dsmgmt);
c.rw.reftag = cpu_to_le32(io.reftag);
c.rw.apptag = cpu_to_le16(io.apptag);
c.rw.appmask = cpu_to_le16(io.appmask);
return nvme_submit_user_cmd(ns->queue, &c,
(void __user *)(uintptr_t)io.addr, length,
metadata, meta_len, io.slba, NULL, 0);
}
static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
struct nvme_passthru_cmd __user *ucmd)
{
struct nvme_passthru_cmd cmd;
struct nvme_command c;
unsigned timeout = 0;
int status;
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
return -EFAULT;
if (cmd.flags)
return -EINVAL;
memset(&c, 0, sizeof(c));
c.common.opcode = cmd.opcode;
c.common.flags = cmd.flags;
c.common.nsid = cpu_to_le32(cmd.nsid);
c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
c.common.cdw10[0] = cpu_to_le32(cmd.cdw10);
c.common.cdw10[1] = cpu_to_le32(cmd.cdw11);
c.common.cdw10[2] = cpu_to_le32(cmd.cdw12);
c.common.cdw10[3] = cpu_to_le32(cmd.cdw13);
c.common.cdw10[4] = cpu_to_le32(cmd.cdw14);
c.common.cdw10[5] = cpu_to_le32(cmd.cdw15);
if (cmd.timeout_ms)
timeout = msecs_to_jiffies(cmd.timeout_ms);
status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
(void __user *)(uintptr_t)cmd.addr, cmd.data_len,
(void __user *)(uintptr_t)cmd.metadata, cmd.metadata,
0, &cmd.result, timeout);
if (status >= 0) {
if (put_user(cmd.result, &ucmd->result))
return -EFAULT;
}
return status;
}
static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
unsigned int cmd, unsigned long arg)
{
struct nvme_ns *ns = bdev->bd_disk->private_data;
switch (cmd) {
case NVME_IOCTL_ID:
force_successful_syscall_return();
return ns->ns_id;
case NVME_IOCTL_ADMIN_CMD:
return nvme_user_cmd(ns->ctrl, NULL, (void __user *)arg);
case NVME_IOCTL_IO_CMD:
return nvme_user_cmd(ns->ctrl, ns, (void __user *)arg);
case NVME_IOCTL_SUBMIT_IO:
return nvme_submit_io(ns, (void __user *)arg);
default:
#ifdef CONFIG_NVM
if (ns->ndev)
return nvme_nvm_ioctl(ns, cmd, arg);
#endif
if (is_sed_ioctl(cmd))
return sed_ioctl(ns->ctrl->opal_dev, cmd,
(void __user *) arg);
return -ENOTTY;
}
}
static int nvme_open(struct block_device *bdev, fmode_t mode)
{
struct nvme_ns *ns = bdev->bd_disk->private_data;
if (!kref_get_unless_zero(&ns->kref))
return -ENXIO;
if (!try_module_get(ns->ctrl->ops->module)) {
kref_put(&ns->kref, nvme_free_ns);
return -ENXIO;
}
return 0;
}
static void nvme_release(struct gendisk *disk, fmode_t mode)
{
struct nvme_ns *ns = disk->private_data;
module_put(ns->ctrl->ops->module);
nvme_put_ns(ns);
}
static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
/* some standard values */
geo->heads = 1 << 6;
geo->sectors = 1 << 5;
geo->cylinders = get_capacity(bdev->bd_disk) >> 11;
return 0;
}
#ifdef CONFIG_BLK_DEV_INTEGRITY
static void nvme_prep_integrity(struct gendisk *disk, struct nvme_id_ns *id,
u16 bs)
{
struct nvme_ns *ns = disk->private_data;
u16 old_ms = ns->ms;
u8 pi_type = 0;
ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms);
ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT);
/* PI implementation requires metadata equal t10 pi tuple size */
if (ns->ms == sizeof(struct t10_pi_tuple))
pi_type = id->dps & NVME_NS_DPS_PI_MASK;
if (blk_get_integrity(disk) &&
(ns->pi_type != pi_type || ns->ms != old_ms ||
bs != queue_logical_block_size(disk->queue) ||
(ns->ms && ns->ext)))
blk_integrity_unregister(disk);
ns->pi_type = pi_type;
}
static void nvme_init_integrity(struct nvme_ns *ns)
{
struct blk_integrity integrity;
memset(&integrity, 0, sizeof(integrity));
switch (ns->pi_type) {
case NVME_NS_DPS_PI_TYPE3:
integrity.profile = &t10_pi_type3_crc;
integrity.tag_size = sizeof(u16) + sizeof(u32);
integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
break;
case NVME_NS_DPS_PI_TYPE1:
case NVME_NS_DPS_PI_TYPE2:
integrity.profile = &t10_pi_type1_crc;
integrity.tag_size = sizeof(u16);
integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
break;
default:
integrity.profile = NULL;
break;
}
integrity.tuple_size = ns->ms;
blk_integrity_register(ns->disk, &integrity);
blk_queue_max_integrity_segments(ns->queue, 1);
}
#else
static void nvme_prep_integrity(struct gendisk *disk, struct nvme_id_ns *id,
u16 bs)
{
}
static void nvme_init_integrity(struct nvme_ns *ns)
{
}
#endif /* CONFIG_BLK_DEV_INTEGRITY */
static void nvme_set_chunk_size(struct nvme_ns *ns)
{
u32 chunk_size = (((u32)ns->noiob) << (ns->lba_shift - 9));
blk_queue_chunk_sectors(ns->queue, rounddown_pow_of_two(chunk_size));
}
static void nvme_config_discard(struct nvme_ns *ns)
{
struct nvme_ctrl *ctrl = ns->ctrl;
u32 logical_block_size = queue_logical_block_size(ns->queue);
BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) <
NVME_DSM_MAX_RANGES);
if (ctrl->nr_streams && ns->sws && ns->sgs) {
unsigned int sz = logical_block_size * ns->sws * ns->sgs;
ns->queue->limits.discard_alignment = sz;
ns->queue->limits.discard_granularity = sz;
} else {
ns->queue->limits.discard_alignment = logical_block_size;
ns->queue->limits.discard_granularity = logical_block_size;
}
blk_queue_max_discard_sectors(ns->queue, UINT_MAX);
blk_queue_max_discard_segments(ns->queue, NVME_DSM_MAX_RANGES);
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue);
if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
blk_queue_max_write_zeroes_sectors(ns->queue, UINT_MAX);
}
static void nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid,
struct nvme_id_ns *id, u8 *eui64, u8 *nguid, uuid_t *uuid)
{
if (ctrl->vs >= NVME_VS(1, 1, 0))
memcpy(eui64, id->eui64, sizeof(id->eui64));
if (ctrl->vs >= NVME_VS(1, 2, 0))
memcpy(nguid, id->nguid, sizeof(id->nguid));
if (ctrl->vs >= NVME_VS(1, 3, 0)) {
/* Don't treat error as fatal we potentially
* already have a NGUID or EUI-64
*/
if (nvme_identify_ns_descs(ctrl, nsid, eui64, nguid, uuid))
dev_warn(ctrl->device,
"%s: Identify Descriptors failed\n", __func__);
}
}
static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
{
struct nvme_ns *ns = disk->private_data;
struct nvme_ctrl *ctrl = ns->ctrl;
u16 bs;
/*
* If identify namespace failed, use default 512 byte block size so
* block layer can use before failing read/write for 0 capacity.
*/
ns->lba_shift = id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ds;
if (ns->lba_shift == 0)
ns->lba_shift = 9;
bs = 1 << ns->lba_shift;
ns->noiob = le16_to_cpu(id->noiob);
blk_mq_freeze_queue(disk->queue);
if (ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)
nvme_prep_integrity(disk, id, bs);
blk_queue_logical_block_size(ns->queue, bs);
if (ns->noiob)
nvme_set_chunk_size(ns);
if (ns->ms && !blk_get_integrity(disk) && !ns->ext)
nvme_init_integrity(ns);
if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk))
set_capacity(disk, 0);
else
set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));
if (ctrl->oncs & NVME_CTRL_ONCS_DSM)
nvme_config_discard(ns);
blk_mq_unfreeze_queue(disk->queue);
}
static int nvme_revalidate_disk(struct gendisk *disk)
{
struct nvme_ns *ns = disk->private_data;
struct nvme_ctrl *ctrl = ns->ctrl;
struct nvme_id_ns *id;
u8 eui64[8] = { 0 }, nguid[16] = { 0 };
uuid_t uuid = uuid_null;
int ret = 0;
if (test_bit(NVME_NS_DEAD, &ns->flags)) {
set_capacity(disk, 0);
return -ENODEV;
}
id = nvme_identify_ns(ctrl, ns->ns_id);
if (!id)
return -ENODEV;
if (id->ncap == 0) {
ret = -ENODEV;
goto out;
}
nvme_report_ns_ids(ctrl, ns->ns_id, id, eui64, nguid, &uuid);
if (!uuid_equal(&ns->uuid, &uuid) ||
memcmp(&ns->nguid, &nguid, sizeof(ns->nguid)) ||
memcmp(&ns->eui, &eui64, sizeof(ns->eui))) {
dev_err(ctrl->device,
"identifiers changed for nsid %d\n", ns->ns_id);
ret = -ENODEV;
}
out:
kfree(id);
return ret;
}
static char nvme_pr_type(enum pr_type type)
{
switch (type) {
case PR_WRITE_EXCLUSIVE:
return 1;
case PR_EXCLUSIVE_ACCESS:
return 2;
case PR_WRITE_EXCLUSIVE_REG_ONLY:
return 3;
case PR_EXCLUSIVE_ACCESS_REG_ONLY:
return 4;
case PR_WRITE_EXCLUSIVE_ALL_REGS:
return 5;
case PR_EXCLUSIVE_ACCESS_ALL_REGS:
return 6;
default:
return 0;
}
};
static int nvme_pr_command(struct block_device *bdev, u32 cdw10,
u64 key, u64 sa_key, u8 op)
{
struct nvme_ns *ns = bdev->bd_disk->private_data;
struct nvme_command c;
u8 data[16] = { 0, };
put_unaligned_le64(key, &data[0]);
put_unaligned_le64(sa_key, &data[8]);
memset(&c, 0, sizeof(c));
c.common.opcode = op;
c.common.nsid = cpu_to_le32(ns->ns_id);
c.common.cdw10[0] = cpu_to_le32(cdw10);
return nvme_submit_sync_cmd(ns->queue, &c, data, 16);
}
static int nvme_pr_register(struct block_device *bdev, u64 old,
u64 new, unsigned flags)
{
u32 cdw10;
if (flags & ~PR_FL_IGNORE_KEY)
return -EOPNOTSUPP;
cdw10 = old ? 2 : 0;
cdw10 |= (flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0;
cdw10 |= (1 << 30) | (1 << 31); /* PTPL=1 */
return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_register);
}
static int nvme_pr_reserve(struct block_device *bdev, u64 key,
enum pr_type type, unsigned flags)
{
u32 cdw10;
if (flags & ~PR_FL_IGNORE_KEY)
return -EOPNOTSUPP;
cdw10 = nvme_pr_type(type) << 8;
cdw10 |= ((flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0);
return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_acquire);
}
static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new,
enum pr_type type, bool abort)
{
u32 cdw10 = nvme_pr_type(type) << 8 | abort ? 2 : 1;
return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire);
}
static int nvme_pr_clear(struct block_device *bdev, u64 key)
{
u32 cdw10 = 1 | (key ? 1 << 3 : 0);
return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_register);
}
static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
{
u32 cdw10 = nvme_pr_type(type) << 8 | key ? 1 << 3 : 0;
return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release);
}
static const struct pr_ops nvme_pr_ops = {
.pr_register = nvme_pr_register,
.pr_reserve = nvme_pr_reserve,
.pr_release = nvme_pr_release,
.pr_preempt = nvme_pr_preempt,
.pr_clear = nvme_pr_clear,
};
#ifdef CONFIG_BLK_SED_OPAL
int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
bool send)
{
struct nvme_ctrl *ctrl = data;
struct nvme_command cmd;
memset(&cmd, 0, sizeof(cmd));
if (send)
cmd.common.opcode = nvme_admin_security_send;
else
cmd.common.opcode = nvme_admin_security_recv;
cmd.common.nsid = 0;
cmd.common.cdw10[0] = cpu_to_le32(((u32)secp) << 24 | ((u32)spsp) << 8);
cmd.common.cdw10[1] = cpu_to_le32(len);
return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len,
ADMIN_TIMEOUT, NVME_QID_ANY, 1, 0);
}
EXPORT_SYMBOL_GPL(nvme_sec_submit);
#endif /* CONFIG_BLK_SED_OPAL */
static const struct block_device_operations nvme_fops = {
.owner = THIS_MODULE,
.ioctl = nvme_ioctl,
.compat_ioctl = nvme_ioctl,
.open = nvme_open,
.release = nvme_release,
.getgeo = nvme_getgeo,
.revalidate_disk= nvme_revalidate_disk,
.pr_ops = &nvme_pr_ops,
};
static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled)
{
unsigned long timeout =
((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;
u32 csts, bit = enabled ? NVME_CSTS_RDY : 0;
int ret;
while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
if (csts == ~0)
return -ENODEV;
if ((csts & NVME_CSTS_RDY) == bit)
break;
msleep(100);
if (fatal_signal_pending(current))
return -EINTR;
if (time_after(jiffies, timeout)) {
dev_err(ctrl->device,
"Device not ready; aborting %s\n", enabled ?
"initialisation" : "reset");
return -ENODEV;
}
}
return ret;
}
/*
* If the device has been passed off to us in an enabled state, just clear
* the enabled bit. The spec says we should set the 'shutdown notification
* bits', but doing so may cause the device to complete commands to the
* admin queue ... and we don't know what memory that might be pointing at!
*/
int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
{
int ret;
ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
ctrl->ctrl_config &= ~NVME_CC_ENABLE;
ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
if (ret)
return ret;
nvme: apply DELAY_BEFORE_CHK_RDY quirk at probe time too Commit 54adc01055b7 ("nvme/quirk: Add a delay before checking for adapter readiness") introduced a quirk to adapters that cannot read the bit NVME_CSTS_RDY right after register NVME_REG_CC is set; these adapters need a delay or else the action of reading the bit NVME_CSTS_RDY could somehow corrupt adapter's registers state and it never recovers. When this quirk was added, we checked ctrl->tagset in order to avoid quirking in probe time, supposing we would never require such delay during probe. Well, it was too optimistic; we in fact need this quirk at probe time in some cases, like after a kexec. In some experiments, after abnormal shutdown of machine (aka power cord unplug), we booted into our bootloader in Power, which is a Linux kernel, and kexec'ed into another distro. If this kexec is too quick, we end up reaching the probe of NVMe adapter in that distro when adapter is in bad state (not fully initialized on our bootloader). What happens next is that nvme_wait_ready() is unable to complete, except if the quirk is enabled. So, this patch removes the original ctrl->tagset verification in order to enable the quirk even on probe time. Fixes: 54adc01055b7 ("nvme/quirk: Add a delay before checking for adapter readiness") Reported-by: Andrew Byrne <byrneadw@ie.ibm.com> Reported-by: Jaime A. H. Gomez <jahgomez@mx1.ibm.com> Reported-by: Zachary D. Myers <zdmyers@us.ibm.com> Signed-off-by: Guilherme G. Piccoli <gpiccoli@linux.vnet.ibm.com> Acked-by: Jeffrey Lien <Jeff.Lien@wdc.com> Signed-off-by: Christoph Hellwig <hch@lst.de>
2016-12-29 03:13:15 +03:00
if (ctrl->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY)
msleep(NVME_QUIRK_DELAY_AMOUNT);
return nvme_wait_ready(ctrl, cap, false);
}
EXPORT_SYMBOL_GPL(nvme_disable_ctrl);
int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
{
/*
* Default to a 4K page size, with the intention to update this
* path in the future to accomodate architectures with differing
* kernel and IO page sizes.
*/
unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12, page_shift = 12;
int ret;
if (page_shift < dev_page_min) {
dev_err(ctrl->device,
"Minimum device page size %u too large for host (%u)\n",
1 << dev_page_min, 1 << page_shift);
return -ENODEV;
}
ctrl->page_size = 1 << page_shift;
ctrl->ctrl_config = NVME_CC_CSS_NVM;
ctrl->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT;
ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE;
ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
ctrl->ctrl_config |= NVME_CC_ENABLE;
ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
if (ret)
return ret;
return nvme_wait_ready(ctrl, cap, true);
}
EXPORT_SYMBOL_GPL(nvme_enable_ctrl);
int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl)
{
unsigned long timeout = jiffies + (ctrl->shutdown_timeout * HZ);
u32 csts;
int ret;
ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
ctrl->ctrl_config |= NVME_CC_SHN_NORMAL;
ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
if (ret)
return ret;
while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
if ((csts & NVME_CSTS_SHST_MASK) == NVME_CSTS_SHST_CMPLT)
break;
msleep(100);
if (fatal_signal_pending(current))
return -EINTR;
if (time_after(jiffies, timeout)) {
dev_err(ctrl->device,
"Device shutdown incomplete; abort shutdown\n");
return -ENODEV;
}
}
return ret;
}
EXPORT_SYMBOL_GPL(nvme_shutdown_ctrl);
static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
struct request_queue *q)
{
bool vwc = false;
if (ctrl->max_hw_sectors) {
u32 max_segments =
(ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1;
blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors);
blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX));
}
if (ctrl->quirks & NVME_QUIRK_STRIPE_SIZE)
blk_queue_chunk_sectors(q, ctrl->max_hw_sectors);
blk_queue_virt_boundary(q, ctrl->page_size - 1);
if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
vwc = true;
blk_queue_write_cache(q, vwc, vwc);
}
static int nvme_configure_timestamp(struct nvme_ctrl *ctrl)
{
__le64 ts;
int ret;
if (!(ctrl->oncs & NVME_CTRL_ONCS_TIMESTAMP))
return 0;
ts = cpu_to_le64(ktime_to_ms(ktime_get_real()));
ret = nvme_set_features(ctrl, NVME_FEAT_TIMESTAMP, 0, &ts, sizeof(ts),
NULL);
if (ret)
dev_warn_once(ctrl->device,
"could not set timestamp (%d)\n", ret);
return ret;
}
static int nvme_configure_apst(struct nvme_ctrl *ctrl)
nvme: Enable autonomous power state transitions NVMe devices can advertise multiple power states. These states can be either "operational" (the device is fully functional but possibly slow) or "non-operational" (the device is asleep until woken up). Some devices can automatically enter a non-operational state when idle for a specified amount of time and then automatically wake back up when needed. The hardware configuration is a table. For each state, an entry in the table indicates the next deeper non-operational state, if any, to autonomously transition to and the idle time required before transitioning. This patch teaches the driver to program APST so that each successive non-operational state will be entered after an idle time equal to 100% of the total latency (entry plus exit) associated with that state. The maximum acceptable latency is controlled using dev_pm_qos (e.g. power/pm_qos_latency_tolerance_us in sysfs); non-operational states with total latency greater than this value will not be used. As a special case, setting the latency tolerance to 0 will disable APST entirely. On hardware without APST support, the sysfs file will not be exposed. The latency tolerance for newly-probed devices is set by the module parameter nvme_core.default_ps_max_latency_us. In theory, the device can expose "default" APST table, but this doesn't seem to function correctly on my device (Samsung 950), nor does it seem particularly useful. There is also an optional mechanism by which a configuration can be "saved" so it will be automatically loaded on reset. This can be configured from userspace, but it doesn't seem useful to support in the driver. On my laptop, enabling APST seems to save nearly 1W. The hardware tables can be decoded in userspace with nvme-cli. 'nvme id-ctrl /dev/nvmeN' will show the power state table and 'nvme get-feature -f 0x0c -H /dev/nvme0' will show the current APST configuration. This feature is quirked off on a known-buggy Samsung device. Signed-off-by: Andy Lutomirski <luto@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Sagi Grimberg <sagi@grimberg.me> Signed-off-by: Jens Axboe <axboe@fb.com>
2017-02-07 21:08:45 +03:00
{
/*
* APST (Autonomous Power State Transition) lets us program a
* table of power state transitions that the controller will
* perform automatically. We configure it with a simple
* heuristic: we are willing to spend at most 2% of the time
* transitioning between power states. Therefore, when running
* in any given state, we will enter the next lower-power
* non-operational state after waiting 50 * (enlat + exlat)
* microseconds, as long as that state's exit latency is under
nvme: Enable autonomous power state transitions NVMe devices can advertise multiple power states. These states can be either "operational" (the device is fully functional but possibly slow) or "non-operational" (the device is asleep until woken up). Some devices can automatically enter a non-operational state when idle for a specified amount of time and then automatically wake back up when needed. The hardware configuration is a table. For each state, an entry in the table indicates the next deeper non-operational state, if any, to autonomously transition to and the idle time required before transitioning. This patch teaches the driver to program APST so that each successive non-operational state will be entered after an idle time equal to 100% of the total latency (entry plus exit) associated with that state. The maximum acceptable latency is controlled using dev_pm_qos (e.g. power/pm_qos_latency_tolerance_us in sysfs); non-operational states with total latency greater than this value will not be used. As a special case, setting the latency tolerance to 0 will disable APST entirely. On hardware without APST support, the sysfs file will not be exposed. The latency tolerance for newly-probed devices is set by the module parameter nvme_core.default_ps_max_latency_us. In theory, the device can expose "default" APST table, but this doesn't seem to function correctly on my device (Samsung 950), nor does it seem particularly useful. There is also an optional mechanism by which a configuration can be "saved" so it will be automatically loaded on reset. This can be configured from userspace, but it doesn't seem useful to support in the driver. On my laptop, enabling APST seems to save nearly 1W. The hardware tables can be decoded in userspace with nvme-cli. 'nvme id-ctrl /dev/nvmeN' will show the power state table and 'nvme get-feature -f 0x0c -H /dev/nvme0' will show the current APST configuration. This feature is quirked off on a known-buggy Samsung device. Signed-off-by: Andy Lutomirski <luto@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Sagi Grimberg <sagi@grimberg.me> Signed-off-by: Jens Axboe <axboe@fb.com>
2017-02-07 21:08:45 +03:00
* the requested maximum latency.
*
* We will not autonomously enter any non-operational state for
* which the total latency exceeds ps_max_latency_us. Users
* can set ps_max_latency_us to zero to turn off APST.
*/
unsigned apste;
struct nvme_feat_auto_pst *table;
u64 max_lat_us = 0;
int max_ps = -1;
nvme: Enable autonomous power state transitions NVMe devices can advertise multiple power states. These states can be either "operational" (the device is fully functional but possibly slow) or "non-operational" (the device is asleep until woken up). Some devices can automatically enter a non-operational state when idle for a specified amount of time and then automatically wake back up when needed. The hardware configuration is a table. For each state, an entry in the table indicates the next deeper non-operational state, if any, to autonomously transition to and the idle time required before transitioning. This patch teaches the driver to program APST so that each successive non-operational state will be entered after an idle time equal to 100% of the total latency (entry plus exit) associated with that state. The maximum acceptable latency is controlled using dev_pm_qos (e.g. power/pm_qos_latency_tolerance_us in sysfs); non-operational states with total latency greater than this value will not be used. As a special case, setting the latency tolerance to 0 will disable APST entirely. On hardware without APST support, the sysfs file will not be exposed. The latency tolerance for newly-probed devices is set by the module parameter nvme_core.default_ps_max_latency_us. In theory, the device can expose "default" APST table, but this doesn't seem to function correctly on my device (Samsung 950), nor does it seem particularly useful. There is also an optional mechanism by which a configuration can be "saved" so it will be automatically loaded on reset. This can be configured from userspace, but it doesn't seem useful to support in the driver. On my laptop, enabling APST seems to save nearly 1W. The hardware tables can be decoded in userspace with nvme-cli. 'nvme id-ctrl /dev/nvmeN' will show the power state table and 'nvme get-feature -f 0x0c -H /dev/nvme0' will show the current APST configuration. This feature is quirked off on a known-buggy Samsung device. Signed-off-by: Andy Lutomirski <luto@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Sagi Grimberg <sagi@grimberg.me> Signed-off-by: Jens Axboe <axboe@fb.com>
2017-02-07 21:08:45 +03:00
int ret;
/*
* If APST isn't supported or if we haven't been initialized yet,
* then don't do anything.
*/
if (!ctrl->apsta)
return 0;
nvme: Enable autonomous power state transitions NVMe devices can advertise multiple power states. These states can be either "operational" (the device is fully functional but possibly slow) or "non-operational" (the device is asleep until woken up). Some devices can automatically enter a non-operational state when idle for a specified amount of time and then automatically wake back up when needed. The hardware configuration is a table. For each state, an entry in the table indicates the next deeper non-operational state, if any, to autonomously transition to and the idle time required before transitioning. This patch teaches the driver to program APST so that each successive non-operational state will be entered after an idle time equal to 100% of the total latency (entry plus exit) associated with that state. The maximum acceptable latency is controlled using dev_pm_qos (e.g. power/pm_qos_latency_tolerance_us in sysfs); non-operational states with total latency greater than this value will not be used. As a special case, setting the latency tolerance to 0 will disable APST entirely. On hardware without APST support, the sysfs file will not be exposed. The latency tolerance for newly-probed devices is set by the module parameter nvme_core.default_ps_max_latency_us. In theory, the device can expose "default" APST table, but this doesn't seem to function correctly on my device (Samsung 950), nor does it seem particularly useful. There is also an optional mechanism by which a configuration can be "saved" so it will be automatically loaded on reset. This can be configured from userspace, but it doesn't seem useful to support in the driver. On my laptop, enabling APST seems to save nearly 1W. The hardware tables can be decoded in userspace with nvme-cli. 'nvme id-ctrl /dev/nvmeN' will show the power state table and 'nvme get-feature -f 0x0c -H /dev/nvme0' will show the current APST configuration. This feature is quirked off on a known-buggy Samsung device. Signed-off-by: Andy Lutomirski <luto@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Sagi Grimberg <sagi@grimberg.me> Signed-off-by: Jens Axboe <axboe@fb.com>
2017-02-07 21:08:45 +03:00
if (ctrl->npss > 31) {
dev_warn(ctrl->device, "NPSS is invalid; not using APST\n");
return 0;
nvme: Enable autonomous power state transitions NVMe devices can advertise multiple power states. These states can be either "operational" (the device is fully functional but possibly slow) or "non-operational" (the device is asleep until woken up). Some devices can automatically enter a non-operational state when idle for a specified amount of time and then automatically wake back up when needed. The hardware configuration is a table. For each state, an entry in the table indicates the next deeper non-operational state, if any, to autonomously transition to and the idle time required before transitioning. This patch teaches the driver to program APST so that each successive non-operational state will be entered after an idle time equal to 100% of the total latency (entry plus exit) associated with that state. The maximum acceptable latency is controlled using dev_pm_qos (e.g. power/pm_qos_latency_tolerance_us in sysfs); non-operational states with total latency greater than this value will not be used. As a special case, setting the latency tolerance to 0 will disable APST entirely. On hardware without APST support, the sysfs file will not be exposed. The latency tolerance for newly-probed devices is set by the module parameter nvme_core.default_ps_max_latency_us. In theory, the device can expose "default" APST table, but this doesn't seem to function correctly on my device (Samsung 950), nor does it seem particularly useful. There is also an optional mechanism by which a configuration can be "saved" so it will be automatically loaded on reset. This can be configured from userspace, but it doesn't seem useful to support in the driver. On my laptop, enabling APST seems to save nearly 1W. The hardware tables can be decoded in userspace with nvme-cli. 'nvme id-ctrl /dev/nvmeN' will show the power state table and 'nvme get-feature -f 0x0c -H /dev/nvme0' will show the current APST configuration. This feature is quirked off on a known-buggy Samsung device. Signed-off-by: Andy Lutomirski <luto@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Sagi Grimberg <sagi@grimberg.me> Signed-off-by: Jens Axboe <axboe@fb.com>
2017-02-07 21:08:45 +03:00
}
table = kzalloc(sizeof(*table), GFP_KERNEL);
if (!table)
return 0;
nvme: Enable autonomous power state transitions NVMe devices can advertise multiple power states. These states can be either "operational" (the device is fully functional but possibly slow) or "non-operational" (the device is asleep until woken up). Some devices can automatically enter a non-operational state when idle for a specified amount of time and then automatically wake back up when needed. The hardware configuration is a table. For each state, an entry in the table indicates the next deeper non-operational state, if any, to autonomously transition to and the idle time required before transitioning. This patch teaches the driver to program APST so that each successive non-operational state will be entered after an idle time equal to 100% of the total latency (entry plus exit) associated with that state. The maximum acceptable latency is controlled using dev_pm_qos (e.g. power/pm_qos_latency_tolerance_us in sysfs); non-operational states with total latency greater than this value will not be used. As a special case, setting the latency tolerance to 0 will disable APST entirely. On hardware without APST support, the sysfs file will not be exposed. The latency tolerance for newly-probed devices is set by the module parameter nvme_core.default_ps_max_latency_us. In theory, the device can expose "default" APST table, but this doesn't seem to function correctly on my device (Samsung 950), nor does it seem particularly useful. There is also an optional mechanism by which a configuration can be "saved" so it will be automatically loaded on reset. This can be configured from userspace, but it doesn't seem useful to support in the driver. On my laptop, enabling APST seems to save nearly 1W. The hardware tables can be decoded in userspace with nvme-cli. 'nvme id-ctrl /dev/nvmeN' will show the power state table and 'nvme get-feature -f 0x0c -H /dev/nvme0' will show the current APST configuration. This feature is quirked off on a known-buggy Samsung device. Signed-off-by: Andy Lutomirski <luto@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Sagi Grimberg <sagi@grimberg.me> Signed-off-by: Jens Axboe <axboe@fb.com>
2017-02-07 21:08:45 +03:00
if (!ctrl->apst_enabled || ctrl->ps_max_latency_us == 0) {
nvme: Enable autonomous power state transitions NVMe devices can advertise multiple power states. These states can be either "operational" (the device is fully functional but possibly slow) or "non-operational" (the device is asleep until woken up). Some devices can automatically enter a non-operational state when idle for a specified amount of time and then automatically wake back up when needed. The hardware configuration is a table. For each state, an entry in the table indicates the next deeper non-operational state, if any, to autonomously transition to and the idle time required before transitioning. This patch teaches the driver to program APST so that each successive non-operational state will be entered after an idle time equal to 100% of the total latency (entry plus exit) associated with that state. The maximum acceptable latency is controlled using dev_pm_qos (e.g. power/pm_qos_latency_tolerance_us in sysfs); non-operational states with total latency greater than this value will not be used. As a special case, setting the latency tolerance to 0 will disable APST entirely. On hardware without APST support, the sysfs file will not be exposed. The latency tolerance for newly-probed devices is set by the module parameter nvme_core.default_ps_max_latency_us. In theory, the device can expose "default" APST table, but this doesn't seem to function correctly on my device (Samsung 950), nor does it seem particularly useful. There is also an optional mechanism by which a configuration can be "saved" so it will be automatically loaded on reset. This can be configured from userspace, but it doesn't seem useful to support in the driver. On my laptop, enabling APST seems to save nearly 1W. The hardware tables can be decoded in userspace with nvme-cli. 'nvme id-ctrl /dev/nvmeN' will show the power state table and 'nvme get-feature -f 0x0c -H /dev/nvme0' will show the current APST configuration. This feature is quirked off on a known-buggy Samsung device. Signed-off-by: Andy Lutomirski <luto@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Sagi Grimberg <sagi@grimberg.me> Signed-off-by: Jens Axboe <axboe@fb.com>
2017-02-07 21:08:45 +03:00
/* Turn off APST. */
apste = 0;
dev_dbg(ctrl->device, "APST disabled\n");
nvme: Enable autonomous power state transitions NVMe devices can advertise multiple power states. These states can be either "operational" (the device is fully functional but possibly slow) or "non-operational" (the device is asleep until woken up). Some devices can automatically enter a non-operational state when idle for a specified amount of time and then automatically wake back up when needed. The hardware configuration is a table. For each state, an entry in the table indicates the next deeper non-operational state, if any, to autonomously transition to and the idle time required before transitioning. This patch teaches the driver to program APST so that each successive non-operational state will be entered after an idle time equal to 100% of the total latency (entry plus exit) associated with that state. The maximum acceptable latency is controlled using dev_pm_qos (e.g. power/pm_qos_latency_tolerance_us in sysfs); non-operational states with total latency greater than this value will not be used. As a special case, setting the latency tolerance to 0 will disable APST entirely. On hardware without APST support, the sysfs file will not be exposed. The latency tolerance for newly-probed devices is set by the module parameter nvme_core.default_ps_max_latency_us. In theory, the device can expose "default" APST table, but this doesn't seem to function correctly on my device (Samsung 950), nor does it seem particularly useful. There is also an optional mechanism by which a configuration can be "saved" so it will be automatically loaded on reset. This can be configured from userspace, but it doesn't seem useful to support in the driver. On my laptop, enabling APST seems to save nearly 1W. The hardware tables can be decoded in userspace with nvme-cli. 'nvme id-ctrl /dev/nvmeN' will show the power state table and 'nvme get-feature -f 0x0c -H /dev/nvme0' will show the current APST configuration. This feature is quirked off on a known-buggy Samsung device. Signed-off-by: Andy Lutomirski <luto@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Sagi Grimberg <sagi@grimberg.me> Signed-off-by: Jens Axboe <axboe@fb.com>
2017-02-07 21:08:45 +03:00
} else {
__le64 target = cpu_to_le64(0);
int state;
/*
* Walk through all states from lowest- to highest-power.
* According to the spec, lower-numbered states use more
* power. NPSS, despite the name, is the index of the
* lowest-power state, not the number of states.
*/
for (state = (int)ctrl->npss; state >= 0; state--) {
u64 total_latency_us, exit_latency_us, transition_ms;
nvme: Enable autonomous power state transitions NVMe devices can advertise multiple power states. These states can be either "operational" (the device is fully functional but possibly slow) or "non-operational" (the device is asleep until woken up). Some devices can automatically enter a non-operational state when idle for a specified amount of time and then automatically wake back up when needed. The hardware configuration is a table. For each state, an entry in the table indicates the next deeper non-operational state, if any, to autonomously transition to and the idle time required before transitioning. This patch teaches the driver to program APST so that each successive non-operational state will be entered after an idle time equal to 100% of the total latency (entry plus exit) associated with that state. The maximum acceptable latency is controlled using dev_pm_qos (e.g. power/pm_qos_latency_tolerance_us in sysfs); non-operational states with total latency greater than this value will not be used. As a special case, setting the latency tolerance to 0 will disable APST entirely. On hardware without APST support, the sysfs file will not be exposed. The latency tolerance for newly-probed devices is set by the module parameter nvme_core.default_ps_max_latency_us. In theory, the device can expose "default" APST table, but this doesn't seem to function correctly on my device (Samsung 950), nor does it seem particularly useful. There is also an optional mechanism by which a configuration can be "saved" so it will be automatically loaded on reset. This can be configured from userspace, but it doesn't seem useful to support in the driver. On my laptop, enabling APST seems to save nearly 1W. The hardware tables can be decoded in userspace with nvme-cli. 'nvme id-ctrl /dev/nvmeN' will show the power state table and 'nvme get-feature -f 0x0c -H /dev/nvme0' will show the current APST configuration. This feature is quirked off on a known-buggy Samsung device. Signed-off-by: Andy Lutomirski <luto@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Sagi Grimberg <sagi@grimberg.me> Signed-off-by: Jens Axboe <axboe@fb.com>
2017-02-07 21:08:45 +03:00
if (target)
table->entries[state] = target;
/*
* Don't allow transitions to the deepest state
* if it's quirked off.
*/
if (state == ctrl->npss &&
(ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS))
continue;
nvme: Enable autonomous power state transitions NVMe devices can advertise multiple power states. These states can be either "operational" (the device is fully functional but possibly slow) or "non-operational" (the device is asleep until woken up). Some devices can automatically enter a non-operational state when idle for a specified amount of time and then automatically wake back up when needed. The hardware configuration is a table. For each state, an entry in the table indicates the next deeper non-operational state, if any, to autonomously transition to and the idle time required before transitioning. This patch teaches the driver to program APST so that each successive non-operational state will be entered after an idle time equal to 100% of the total latency (entry plus exit) associated with that state. The maximum acceptable latency is controlled using dev_pm_qos (e.g. power/pm_qos_latency_tolerance_us in sysfs); non-operational states with total latency greater than this value will not be used. As a special case, setting the latency tolerance to 0 will disable APST entirely. On hardware without APST support, the sysfs file will not be exposed. The latency tolerance for newly-probed devices is set by the module parameter nvme_core.default_ps_max_latency_us. In theory, the device can expose "default" APST table, but this doesn't seem to function correctly on my device (Samsung 950), nor does it seem particularly useful. There is also an optional mechanism by which a configuration can be "saved" so it will be automatically loaded on reset. This can be configured from userspace, but it doesn't seem useful to support in the driver. On my laptop, enabling APST seems to save nearly 1W. The hardware tables can be decoded in userspace with nvme-cli. 'nvme id-ctrl /dev/nvmeN' will show the power state table and 'nvme get-feature -f 0x0c -H /dev/nvme0' will show the current APST configuration. This feature is quirked off on a known-buggy Samsung device. Signed-off-by: Andy Lutomirski <luto@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Sagi Grimberg <sagi@grimberg.me> Signed-off-by: Jens Axboe <axboe@fb.com>
2017-02-07 21:08:45 +03:00
/*
* Is this state a useful non-operational state for
* higher-power states to autonomously transition to?
*/
if (!(ctrl->psd[state].flags &
NVME_PS_FLAGS_NON_OP_STATE))
continue;
exit_latency_us =
(u64)le32_to_cpu(ctrl->psd[state].exit_lat);
if (exit_latency_us > ctrl->ps_max_latency_us)
nvme: Enable autonomous power state transitions NVMe devices can advertise multiple power states. These states can be either "operational" (the device is fully functional but possibly slow) or "non-operational" (the device is asleep until woken up). Some devices can automatically enter a non-operational state when idle for a specified amount of time and then automatically wake back up when needed. The hardware configuration is a table. For each state, an entry in the table indicates the next deeper non-operational state, if any, to autonomously transition to and the idle time required before transitioning. This patch teaches the driver to program APST so that each successive non-operational state will be entered after an idle time equal to 100% of the total latency (entry plus exit) associated with that state. The maximum acceptable latency is controlled using dev_pm_qos (e.g. power/pm_qos_latency_tolerance_us in sysfs); non-operational states with total latency greater than this value will not be used. As a special case, setting the latency tolerance to 0 will disable APST entirely. On hardware without APST support, the sysfs file will not be exposed. The latency tolerance for newly-probed devices is set by the module parameter nvme_core.default_ps_max_latency_us. In theory, the device can expose "default" APST table, but this doesn't seem to function correctly on my device (Samsung 950), nor does it seem particularly useful. There is also an optional mechanism by which a configuration can be "saved" so it will be automatically loaded on reset. This can be configured from userspace, but it doesn't seem useful to support in the driver. On my laptop, enabling APST seems to save nearly 1W. The hardware tables can be decoded in userspace with nvme-cli. 'nvme id-ctrl /dev/nvmeN' will show the power state table and 'nvme get-feature -f 0x0c -H /dev/nvme0' will show the current APST configuration. This feature is quirked off on a known-buggy Samsung device. Signed-off-by: Andy Lutomirski <luto@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Sagi Grimberg <sagi@grimberg.me> Signed-off-by: Jens Axboe <axboe@fb.com>
2017-02-07 21:08:45 +03:00
continue;
total_latency_us =
exit_latency_us +
le32_to_cpu(ctrl->psd[state].entry_lat);
nvme: Enable autonomous power state transitions NVMe devices can advertise multiple power states. These states can be either "operational" (the device is fully functional but possibly slow) or "non-operational" (the device is asleep until woken up). Some devices can automatically enter a non-operational state when idle for a specified amount of time and then automatically wake back up when needed. The hardware configuration is a table. For each state, an entry in the table indicates the next deeper non-operational state, if any, to autonomously transition to and the idle time required before transitioning. This patch teaches the driver to program APST so that each successive non-operational state will be entered after an idle time equal to 100% of the total latency (entry plus exit) associated with that state. The maximum acceptable latency is controlled using dev_pm_qos (e.g. power/pm_qos_latency_tolerance_us in sysfs); non-operational states with total latency greater than this value will not be used. As a special case, setting the latency tolerance to 0 will disable APST entirely. On hardware without APST support, the sysfs file will not be exposed. The latency tolerance for newly-probed devices is set by the module parameter nvme_core.default_ps_max_latency_us. In theory, the device can expose "default" APST table, but this doesn't seem to function correctly on my device (Samsung 950), nor does it seem particularly useful. There is also an optional mechanism by which a configuration can be "saved" so it will be automatically loaded on reset. This can be configured from userspace, but it doesn't seem useful to support in the driver. On my laptop, enabling APST seems to save nearly 1W. The hardware tables can be decoded in userspace with nvme-cli. 'nvme id-ctrl /dev/nvmeN' will show the power state table and 'nvme get-feature -f 0x0c -H /dev/nvme0' will show the current APST configuration. This feature is quirked off on a known-buggy Samsung device. Signed-off-by: Andy Lutomirski <luto@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Sagi Grimberg <sagi@grimberg.me> Signed-off-by: Jens Axboe <axboe@fb.com>
2017-02-07 21:08:45 +03:00
/*
* This state is good. Use it as the APST idle
* target for higher power states.
*/
transition_ms = total_latency_us + 19;
do_div(transition_ms, 20);
if (transition_ms > (1 << 24) - 1)
transition_ms = (1 << 24) - 1;
target = cpu_to_le64((state << 3) |
(transition_ms << 8));
if (max_ps == -1)
max_ps = state;
if (total_latency_us > max_lat_us)
max_lat_us = total_latency_us;
nvme: Enable autonomous power state transitions NVMe devices can advertise multiple power states. These states can be either "operational" (the device is fully functional but possibly slow) or "non-operational" (the device is asleep until woken up). Some devices can automatically enter a non-operational state when idle for a specified amount of time and then automatically wake back up when needed. The hardware configuration is a table. For each state, an entry in the table indicates the next deeper non-operational state, if any, to autonomously transition to and the idle time required before transitioning. This patch teaches the driver to program APST so that each successive non-operational state will be entered after an idle time equal to 100% of the total latency (entry plus exit) associated with that state. The maximum acceptable latency is controlled using dev_pm_qos (e.g. power/pm_qos_latency_tolerance_us in sysfs); non-operational states with total latency greater than this value will not be used. As a special case, setting the latency tolerance to 0 will disable APST entirely. On hardware without APST support, the sysfs file will not be exposed. The latency tolerance for newly-probed devices is set by the module parameter nvme_core.default_ps_max_latency_us. In theory, the device can expose "default" APST table, but this doesn't seem to function correctly on my device (Samsung 950), nor does it seem particularly useful. There is also an optional mechanism by which a configuration can be "saved" so it will be automatically loaded on reset. This can be configured from userspace, but it doesn't seem useful to support in the driver. On my laptop, enabling APST seems to save nearly 1W. The hardware tables can be decoded in userspace with nvme-cli. 'nvme id-ctrl /dev/nvmeN' will show the power state table and 'nvme get-feature -f 0x0c -H /dev/nvme0' will show the current APST configuration. This feature is quirked off on a known-buggy Samsung device. Signed-off-by: Andy Lutomirski <luto@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Sagi Grimberg <sagi@grimberg.me> Signed-off-by: Jens Axboe <axboe@fb.com>
2017-02-07 21:08:45 +03:00
}
apste = 1;
if (max_ps == -1) {
dev_dbg(ctrl->device, "APST enabled but no non-operational states are available\n");
} else {
dev_dbg(ctrl->device, "APST enabled: max PS = %d, max round-trip latency = %lluus, table = %*phN\n",
max_ps, max_lat_us, (int)sizeof(*table), table);
}
nvme: Enable autonomous power state transitions NVMe devices can advertise multiple power states. These states can be either "operational" (the device is fully functional but possibly slow) or "non-operational" (the device is asleep until woken up). Some devices can automatically enter a non-operational state when idle for a specified amount of time and then automatically wake back up when needed. The hardware configuration is a table. For each state, an entry in the table indicates the next deeper non-operational state, if any, to autonomously transition to and the idle time required before transitioning. This patch teaches the driver to program APST so that each successive non-operational state will be entered after an idle time equal to 100% of the total latency (entry plus exit) associated with that state. The maximum acceptable latency is controlled using dev_pm_qos (e.g. power/pm_qos_latency_tolerance_us in sysfs); non-operational states with total latency greater than this value will not be used. As a special case, setting the latency tolerance to 0 will disable APST entirely. On hardware without APST support, the sysfs file will not be exposed. The latency tolerance for newly-probed devices is set by the module parameter nvme_core.default_ps_max_latency_us. In theory, the device can expose "default" APST table, but this doesn't seem to function correctly on my device (Samsung 950), nor does it seem particularly useful. There is also an optional mechanism by which a configuration can be "saved" so it will be automatically loaded on reset. This can be configured from userspace, but it doesn't seem useful to support in the driver. On my laptop, enabling APST seems to save nearly 1W. The hardware tables can be decoded in userspace with nvme-cli. 'nvme id-ctrl /dev/nvmeN' will show the power state table and 'nvme get-feature -f 0x0c -H /dev/nvme0' will show the current APST configuration. This feature is quirked off on a known-buggy Samsung device. Signed-off-by: Andy Lutomirski <luto@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Sagi Grimberg <sagi@grimberg.me> Signed-off-by: Jens Axboe <axboe@fb.com>
2017-02-07 21:08:45 +03:00
}
ret = nvme_set_features(ctrl, NVME_FEAT_AUTO_PST, apste,
table, sizeof(*table), NULL);
if (ret)
dev_err(ctrl->device, "failed to set APST feature (%d)\n", ret);
kfree(table);
return ret;
nvme: Enable autonomous power state transitions NVMe devices can advertise multiple power states. These states can be either "operational" (the device is fully functional but possibly slow) or "non-operational" (the device is asleep until woken up). Some devices can automatically enter a non-operational state when idle for a specified amount of time and then automatically wake back up when needed. The hardware configuration is a table. For each state, an entry in the table indicates the next deeper non-operational state, if any, to autonomously transition to and the idle time required before transitioning. This patch teaches the driver to program APST so that each successive non-operational state will be entered after an idle time equal to 100% of the total latency (entry plus exit) associated with that state. The maximum acceptable latency is controlled using dev_pm_qos (e.g. power/pm_qos_latency_tolerance_us in sysfs); non-operational states with total latency greater than this value will not be used. As a special case, setting the latency tolerance to 0 will disable APST entirely. On hardware without APST support, the sysfs file will not be exposed. The latency tolerance for newly-probed devices is set by the module parameter nvme_core.default_ps_max_latency_us. In theory, the device can expose "default" APST table, but this doesn't seem to function correctly on my device (Samsung 950), nor does it seem particularly useful. There is also an optional mechanism by which a configuration can be "saved" so it will be automatically loaded on reset. This can be configured from userspace, but it doesn't seem useful to support in the driver. On my laptop, enabling APST seems to save nearly 1W. The hardware tables can be decoded in userspace with nvme-cli. 'nvme id-ctrl /dev/nvmeN' will show the power state table and 'nvme get-feature -f 0x0c -H /dev/nvme0' will show the current APST configuration. This feature is quirked off on a known-buggy Samsung device. Signed-off-by: Andy Lutomirski <luto@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Sagi Grimberg <sagi@grimberg.me> Signed-off-by: Jens Axboe <axboe@fb.com>
2017-02-07 21:08:45 +03:00
}
static void nvme_set_latency_tolerance(struct device *dev, s32 val)
{
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
u64 latency;
switch (val) {
case PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT:
case PM_QOS_LATENCY_ANY:
latency = U64_MAX;
break;
default:
latency = val;
}
if (ctrl->ps_max_latency_us != latency) {
ctrl->ps_max_latency_us = latency;
nvme_configure_apst(ctrl);
}
}
struct nvme_core_quirk_entry {
/*
* NVMe model and firmware strings are padded with spaces. For
* simplicity, strings in the quirk table are padded with NULLs
* instead.
*/
u16 vid;
const char *mn;
const char *fr;
unsigned long quirks;
};
static const struct nvme_core_quirk_entry core_quirks[] = {
nvme: Enable autonomous power state transitions NVMe devices can advertise multiple power states. These states can be either "operational" (the device is fully functional but possibly slow) or "non-operational" (the device is asleep until woken up). Some devices can automatically enter a non-operational state when idle for a specified amount of time and then automatically wake back up when needed. The hardware configuration is a table. For each state, an entry in the table indicates the next deeper non-operational state, if any, to autonomously transition to and the idle time required before transitioning. This patch teaches the driver to program APST so that each successive non-operational state will be entered after an idle time equal to 100% of the total latency (entry plus exit) associated with that state. The maximum acceptable latency is controlled using dev_pm_qos (e.g. power/pm_qos_latency_tolerance_us in sysfs); non-operational states with total latency greater than this value will not be used. As a special case, setting the latency tolerance to 0 will disable APST entirely. On hardware without APST support, the sysfs file will not be exposed. The latency tolerance for newly-probed devices is set by the module parameter nvme_core.default_ps_max_latency_us. In theory, the device can expose "default" APST table, but this doesn't seem to function correctly on my device (Samsung 950), nor does it seem particularly useful. There is also an optional mechanism by which a configuration can be "saved" so it will be automatically loaded on reset. This can be configured from userspace, but it doesn't seem useful to support in the driver. On my laptop, enabling APST seems to save nearly 1W. The hardware tables can be decoded in userspace with nvme-cli. 'nvme id-ctrl /dev/nvmeN' will show the power state table and 'nvme get-feature -f 0x0c -H /dev/nvme0' will show the current APST configuration. This feature is quirked off on a known-buggy Samsung device. Signed-off-by: Andy Lutomirski <luto@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Sagi Grimberg <sagi@grimberg.me> Signed-off-by: Jens Axboe <axboe@fb.com>
2017-02-07 21:08:45 +03:00
{
/*
* This Toshiba device seems to die using any APST states. See:
* https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1678184/comments/11
*/
.vid = 0x1179,
.mn = "THNSF5256GPUK TOSHIBA",
nvme: Enable autonomous power state transitions NVMe devices can advertise multiple power states. These states can be either "operational" (the device is fully functional but possibly slow) or "non-operational" (the device is asleep until woken up). Some devices can automatically enter a non-operational state when idle for a specified amount of time and then automatically wake back up when needed. The hardware configuration is a table. For each state, an entry in the table indicates the next deeper non-operational state, if any, to autonomously transition to and the idle time required before transitioning. This patch teaches the driver to program APST so that each successive non-operational state will be entered after an idle time equal to 100% of the total latency (entry plus exit) associated with that state. The maximum acceptable latency is controlled using dev_pm_qos (e.g. power/pm_qos_latency_tolerance_us in sysfs); non-operational states with total latency greater than this value will not be used. As a special case, setting the latency tolerance to 0 will disable APST entirely. On hardware without APST support, the sysfs file will not be exposed. The latency tolerance for newly-probed devices is set by the module parameter nvme_core.default_ps_max_latency_us. In theory, the device can expose "default" APST table, but this doesn't seem to function correctly on my device (Samsung 950), nor does it seem particularly useful. There is also an optional mechanism by which a configuration can be "saved" so it will be automatically loaded on reset. This can be configured from userspace, but it doesn't seem useful to support in the driver. On my laptop, enabling APST seems to save nearly 1W. The hardware tables can be decoded in userspace with nvme-cli. 'nvme id-ctrl /dev/nvmeN' will show the power state table and 'nvme get-feature -f 0x0c -H /dev/nvme0' will show the current APST configuration. This feature is quirked off on a known-buggy Samsung device. Signed-off-by: Andy Lutomirski <luto@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Sagi Grimberg <sagi@grimberg.me> Signed-off-by: Jens Axboe <axboe@fb.com>
2017-02-07 21:08:45 +03:00
.quirks = NVME_QUIRK_NO_APST,
}
};
/* match is null-terminated but idstr is space-padded. */
static bool string_matches(const char *idstr, const char *match, size_t len)
{
size_t matchlen;
if (!match)
return true;
matchlen = strlen(match);
WARN_ON_ONCE(matchlen > len);
if (memcmp(idstr, match, matchlen))
return false;
for (; matchlen < len; matchlen++)
if (idstr[matchlen] != ' ')
return false;
return true;
}
static bool quirk_matches(const struct nvme_id_ctrl *id,
const struct nvme_core_quirk_entry *q)
{
return q->vid == le16_to_cpu(id->vid) &&
string_matches(id->mn, q->mn, sizeof(id->mn)) &&
string_matches(id->fr, q->fr, sizeof(id->fr));
}
static void nvme_init_subnqn(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
{
size_t nqnlen;
int off;
nqnlen = strnlen(id->subnqn, NVMF_NQN_SIZE);
if (nqnlen > 0 && nqnlen < NVMF_NQN_SIZE) {
strcpy(ctrl->subnqn, id->subnqn);
return;
}
if (ctrl->vs >= NVME_VS(1, 2, 1))
dev_warn(ctrl->device, "missing or invalid SUBNQN field.\n");
/* Generate a "fake" NQN per Figure 254 in NVMe 1.3 + ECN 001 */
off = snprintf(ctrl->subnqn, NVMF_NQN_SIZE,
"nqn.2014.08.org.nvmexpress:%4x%4x",
le16_to_cpu(id->vid), le16_to_cpu(id->ssvid));
memcpy(ctrl->subnqn + off, id->sn, sizeof(id->sn));
off += sizeof(id->sn);
memcpy(ctrl->subnqn + off, id->mn, sizeof(id->mn));
off += sizeof(id->mn);
memset(ctrl->subnqn + off, 0, sizeof(ctrl->subnqn) - off);
}
/*
* Initialize the cached copies of the Identify data and various controller
* register in our nvme_ctrl structure. This should be called as soon as
* the admin queue is fully up and running.
*/
int nvme_init_identify(struct nvme_ctrl *ctrl)
{
struct nvme_id_ctrl *id;
u64 cap;
int ret, page_shift;
u32 max_hw_sectors;
bool prev_apst_enabled;
ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs);
if (ret) {
dev_err(ctrl->device, "Reading VS failed (%d)\n", ret);
return ret;
}
ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &cap);
if (ret) {
dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret);
return ret;
}
page_shift = NVME_CAP_MPSMIN(cap) + 12;
if (ctrl->vs >= NVME_VS(1, 1, 0))
ctrl->subsystem = NVME_CAP_NSSRC(cap);
ret = nvme_identify_ctrl(ctrl, &id);
if (ret) {
dev_err(ctrl->device, "Identify Controller failed (%d)\n", ret);
return -EIO;
}
nvme_init_subnqn(ctrl, id);
if (!ctrl->identified) {
/*
* Check for quirks. Quirk can depend on firmware version,
* so, in principle, the set of quirks present can change
* across a reset. As a possible future enhancement, we
* could re-scan for quirks every time we reinitialize
* the device, but we'd have to make sure that the driver
* behaves intelligently if the quirks change.
*/
int i;
for (i = 0; i < ARRAY_SIZE(core_quirks); i++) {
if (quirk_matches(id, &core_quirks[i]))
ctrl->quirks |= core_quirks[i].quirks;
}
}
if (force_apst && (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS)) {
dev_warn(ctrl->device, "forcibly allowing all power states due to nvme_core.force_apst -- use at your own risk\n");
ctrl->quirks &= ~NVME_QUIRK_NO_DEEPEST_PS;
}
ctrl->oacs = le16_to_cpu(id->oacs);
ctrl->vid = le16_to_cpu(id->vid);
ctrl->oncs = le16_to_cpup(&id->oncs);
atomic_set(&ctrl->abort_limit, id->acl + 1);
ctrl->vwc = id->vwc;
ctrl->cntlid = le16_to_cpup(&id->cntlid);
memcpy(ctrl->serial, id->sn, sizeof(id->sn));
memcpy(ctrl->model, id->mn, sizeof(id->mn));
memcpy(ctrl->firmware_rev, id->fr, sizeof(id->fr));
if (id->mdts)
max_hw_sectors = 1 << (id->mdts + page_shift - 9);
else
max_hw_sectors = UINT_MAX;
ctrl->max_hw_sectors =
min_not_zero(ctrl->max_hw_sectors, max_hw_sectors);
nvme_set_queue_limits(ctrl, ctrl->admin_q);
ctrl->sgls = le32_to_cpu(id->sgls);
ctrl->kas = le16_to_cpu(id->kas);
if (id->rtd3e) {
/* us -> s */
u32 transition_time = le32_to_cpu(id->rtd3e) / 1000000;
ctrl->shutdown_timeout = clamp_t(unsigned int, transition_time,
shutdown_timeout, 60);
if (ctrl->shutdown_timeout != shutdown_timeout)
dev_warn(ctrl->device,
"Shutdown timeout set to %u seconds\n",
ctrl->shutdown_timeout);
} else
ctrl->shutdown_timeout = shutdown_timeout;
nvme: Enable autonomous power state transitions NVMe devices can advertise multiple power states. These states can be either "operational" (the device is fully functional but possibly slow) or "non-operational" (the device is asleep until woken up). Some devices can automatically enter a non-operational state when idle for a specified amount of time and then automatically wake back up when needed. The hardware configuration is a table. For each state, an entry in the table indicates the next deeper non-operational state, if any, to autonomously transition to and the idle time required before transitioning. This patch teaches the driver to program APST so that each successive non-operational state will be entered after an idle time equal to 100% of the total latency (entry plus exit) associated with that state. The maximum acceptable latency is controlled using dev_pm_qos (e.g. power/pm_qos_latency_tolerance_us in sysfs); non-operational states with total latency greater than this value will not be used. As a special case, setting the latency tolerance to 0 will disable APST entirely. On hardware without APST support, the sysfs file will not be exposed. The latency tolerance for newly-probed devices is set by the module parameter nvme_core.default_ps_max_latency_us. In theory, the device can expose "default" APST table, but this doesn't seem to function correctly on my device (Samsung 950), nor does it seem particularly useful. There is also an optional mechanism by which a configuration can be "saved" so it will be automatically loaded on reset. This can be configured from userspace, but it doesn't seem useful to support in the driver. On my laptop, enabling APST seems to save nearly 1W. The hardware tables can be decoded in userspace with nvme-cli. 'nvme id-ctrl /dev/nvmeN' will show the power state table and 'nvme get-feature -f 0x0c -H /dev/nvme0' will show the current APST configuration. This feature is quirked off on a known-buggy Samsung device. Signed-off-by: Andy Lutomirski <luto@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Sagi Grimberg <sagi@grimberg.me> Signed-off-by: Jens Axboe <axboe@fb.com>
2017-02-07 21:08:45 +03:00
ctrl->npss = id->npss;
ctrl->apsta = id->apsta;
prev_apst_enabled = ctrl->apst_enabled;
if (ctrl->quirks & NVME_QUIRK_NO_APST) {
if (force_apst && id->apsta) {
dev_warn(ctrl->device, "forcibly allowing APST due to nvme_core.force_apst -- use at your own risk\n");
ctrl->apst_enabled = true;
} else {
ctrl->apst_enabled = false;
}
} else {
ctrl->apst_enabled = id->apsta;
}
nvme: Enable autonomous power state transitions NVMe devices can advertise multiple power states. These states can be either "operational" (the device is fully functional but possibly slow) or "non-operational" (the device is asleep until woken up). Some devices can automatically enter a non-operational state when idle for a specified amount of time and then automatically wake back up when needed. The hardware configuration is a table. For each state, an entry in the table indicates the next deeper non-operational state, if any, to autonomously transition to and the idle time required before transitioning. This patch teaches the driver to program APST so that each successive non-operational state will be entered after an idle time equal to 100% of the total latency (entry plus exit) associated with that state. The maximum acceptable latency is controlled using dev_pm_qos (e.g. power/pm_qos_latency_tolerance_us in sysfs); non-operational states with total latency greater than this value will not be used. As a special case, setting the latency tolerance to 0 will disable APST entirely. On hardware without APST support, the sysfs file will not be exposed. The latency tolerance for newly-probed devices is set by the module parameter nvme_core.default_ps_max_latency_us. In theory, the device can expose "default" APST table, but this doesn't seem to function correctly on my device (Samsung 950), nor does it seem particularly useful. There is also an optional mechanism by which a configuration can be "saved" so it will be automatically loaded on reset. This can be configured from userspace, but it doesn't seem useful to support in the driver. On my laptop, enabling APST seems to save nearly 1W. The hardware tables can be decoded in userspace with nvme-cli. 'nvme id-ctrl /dev/nvmeN' will show the power state table and 'nvme get-feature -f 0x0c -H /dev/nvme0' will show the current APST configuration. This feature is quirked off on a known-buggy Samsung device. Signed-off-by: Andy Lutomirski <luto@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Sagi Grimberg <sagi@grimberg.me> Signed-off-by: Jens Axboe <axboe@fb.com>
2017-02-07 21:08:45 +03:00
memcpy(ctrl->psd, id->psd, sizeof(ctrl->psd));
if (ctrl->ops->flags & NVME_F_FABRICS) {
ctrl->icdoff = le16_to_cpu(id->icdoff);
ctrl->ioccsz = le32_to_cpu(id->ioccsz);
ctrl->iorcsz = le32_to_cpu(id->iorcsz);
ctrl->maxcmd = le16_to_cpu(id->maxcmd);
/*
* In fabrics we need to verify the cntlid matches the
* admin connect
*/
if (ctrl->cntlid != le16_to_cpu(id->cntlid)) {
ret = -EINVAL;
goto out_free;
}
if (!ctrl->opts->discovery_nqn && !ctrl->kas) {
dev_err(ctrl->device,
"keep-alive support is mandatory for fabrics\n");
ret = -EINVAL;
goto out_free;
}
} else {
ctrl->cntlid = le16_to_cpu(id->cntlid);
ctrl->hmpre = le32_to_cpu(id->hmpre);
ctrl->hmmin = le32_to_cpu(id->hmmin);
ctrl->hmminds = le32_to_cpu(id->hmminds);
ctrl->hmmaxd = le16_to_cpu(id->hmmaxd);
}
kfree(id);
if (ctrl->apst_enabled && !prev_apst_enabled)
nvme: Enable autonomous power state transitions NVMe devices can advertise multiple power states. These states can be either "operational" (the device is fully functional but possibly slow) or "non-operational" (the device is asleep until woken up). Some devices can automatically enter a non-operational state when idle for a specified amount of time and then automatically wake back up when needed. The hardware configuration is a table. For each state, an entry in the table indicates the next deeper non-operational state, if any, to autonomously transition to and the idle time required before transitioning. This patch teaches the driver to program APST so that each successive non-operational state will be entered after an idle time equal to 100% of the total latency (entry plus exit) associated with that state. The maximum acceptable latency is controlled using dev_pm_qos (e.g. power/pm_qos_latency_tolerance_us in sysfs); non-operational states with total latency greater than this value will not be used. As a special case, setting the latency tolerance to 0 will disable APST entirely. On hardware without APST support, the sysfs file will not be exposed. The latency tolerance for newly-probed devices is set by the module parameter nvme_core.default_ps_max_latency_us. In theory, the device can expose "default" APST table, but this doesn't seem to function correctly on my device (Samsung 950), nor does it seem particularly useful. There is also an optional mechanism by which a configuration can be "saved" so it will be automatically loaded on reset. This can be configured from userspace, but it doesn't seem useful to support in the driver. On my laptop, enabling APST seems to save nearly 1W. The hardware tables can be decoded in userspace with nvme-cli. 'nvme id-ctrl /dev/nvmeN' will show the power state table and 'nvme get-feature -f 0x0c -H /dev/nvme0' will show the current APST configuration. This feature is quirked off on a known-buggy Samsung device. Signed-off-by: Andy Lutomirski <luto@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Sagi Grimberg <sagi@grimberg.me> Signed-off-by: Jens Axboe <axboe@fb.com>
2017-02-07 21:08:45 +03:00
dev_pm_qos_expose_latency_tolerance(ctrl->device);
else if (!ctrl->apst_enabled && prev_apst_enabled)
nvme: Enable autonomous power state transitions NVMe devices can advertise multiple power states. These states can be either "operational" (the device is fully functional but possibly slow) or "non-operational" (the device is asleep until woken up). Some devices can automatically enter a non-operational state when idle for a specified amount of time and then automatically wake back up when needed. The hardware configuration is a table. For each state, an entry in the table indicates the next deeper non-operational state, if any, to autonomously transition to and the idle time required before transitioning. This patch teaches the driver to program APST so that each successive non-operational state will be entered after an idle time equal to 100% of the total latency (entry plus exit) associated with that state. The maximum acceptable latency is controlled using dev_pm_qos (e.g. power/pm_qos_latency_tolerance_us in sysfs); non-operational states with total latency greater than this value will not be used. As a special case, setting the latency tolerance to 0 will disable APST entirely. On hardware without APST support, the sysfs file will not be exposed. The latency tolerance for newly-probed devices is set by the module parameter nvme_core.default_ps_max_latency_us. In theory, the device can expose "default" APST table, but this doesn't seem to function correctly on my device (Samsung 950), nor does it seem particularly useful. There is also an optional mechanism by which a configuration can be "saved" so it will be automatically loaded on reset. This can be configured from userspace, but it doesn't seem useful to support in the driver. On my laptop, enabling APST seems to save nearly 1W. The hardware tables can be decoded in userspace with nvme-cli. 'nvme id-ctrl /dev/nvmeN' will show the power state table and 'nvme get-feature -f 0x0c -H /dev/nvme0' will show the current APST configuration. This feature is quirked off on a known-buggy Samsung device. Signed-off-by: Andy Lutomirski <luto@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Sagi Grimberg <sagi@grimberg.me> Signed-off-by: Jens Axboe <axboe@fb.com>
2017-02-07 21:08:45 +03:00
dev_pm_qos_hide_latency_tolerance(ctrl->device);
ret = nvme_configure_apst(ctrl);
if (ret < 0)
return ret;
ret = nvme_configure_timestamp(ctrl);
if (ret < 0)
return ret;
ret = nvme_configure_directives(ctrl);
if (ret < 0)
return ret;
nvme: Enable autonomous power state transitions NVMe devices can advertise multiple power states. These states can be either "operational" (the device is fully functional but possibly slow) or "non-operational" (the device is asleep until woken up). Some devices can automatically enter a non-operational state when idle for a specified amount of time and then automatically wake back up when needed. The hardware configuration is a table. For each state, an entry in the table indicates the next deeper non-operational state, if any, to autonomously transition to and the idle time required before transitioning. This patch teaches the driver to program APST so that each successive non-operational state will be entered after an idle time equal to 100% of the total latency (entry plus exit) associated with that state. The maximum acceptable latency is controlled using dev_pm_qos (e.g. power/pm_qos_latency_tolerance_us in sysfs); non-operational states with total latency greater than this value will not be used. As a special case, setting the latency tolerance to 0 will disable APST entirely. On hardware without APST support, the sysfs file will not be exposed. The latency tolerance for newly-probed devices is set by the module parameter nvme_core.default_ps_max_latency_us. In theory, the device can expose "default" APST table, but this doesn't seem to function correctly on my device (Samsung 950), nor does it seem particularly useful. There is also an optional mechanism by which a configuration can be "saved" so it will be automatically loaded on reset. This can be configured from userspace, but it doesn't seem useful to support in the driver. On my laptop, enabling APST seems to save nearly 1W. The hardware tables can be decoded in userspace with nvme-cli. 'nvme id-ctrl /dev/nvmeN' will show the power state table and 'nvme get-feature -f 0x0c -H /dev/nvme0' will show the current APST configuration. This feature is quirked off on a known-buggy Samsung device. Signed-off-by: Andy Lutomirski <luto@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Sagi Grimberg <sagi@grimberg.me> Signed-off-by: Jens Axboe <axboe@fb.com>
2017-02-07 21:08:45 +03:00
ctrl->identified = true;
nvme: Enable autonomous power state transitions NVMe devices can advertise multiple power states. These states can be either "operational" (the device is fully functional but possibly slow) or "non-operational" (the device is asleep until woken up). Some devices can automatically enter a non-operational state when idle for a specified amount of time and then automatically wake back up when needed. The hardware configuration is a table. For each state, an entry in the table indicates the next deeper non-operational state, if any, to autonomously transition to and the idle time required before transitioning. This patch teaches the driver to program APST so that each successive non-operational state will be entered after an idle time equal to 100% of the total latency (entry plus exit) associated with that state. The maximum acceptable latency is controlled using dev_pm_qos (e.g. power/pm_qos_latency_tolerance_us in sysfs); non-operational states with total latency greater than this value will not be used. As a special case, setting the latency tolerance to 0 will disable APST entirely. On hardware without APST support, the sysfs file will not be exposed. The latency tolerance for newly-probed devices is set by the module parameter nvme_core.default_ps_max_latency_us. In theory, the device can expose "default" APST table, but this doesn't seem to function correctly on my device (Samsung 950), nor does it seem particularly useful. There is also an optional mechanism by which a configuration can be "saved" so it will be automatically loaded on reset. This can be configured from userspace, but it doesn't seem useful to support in the driver. On my laptop, enabling APST seems to save nearly 1W. The hardware tables can be decoded in userspace with nvme-cli. 'nvme id-ctrl /dev/nvmeN' will show the power state table and 'nvme get-feature -f 0x0c -H /dev/nvme0' will show the current APST configuration. This feature is quirked off on a known-buggy Samsung device. Signed-off-by: Andy Lutomirski <luto@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Sagi Grimberg <sagi@grimberg.me> Signed-off-by: Jens Axboe <axboe@fb.com>
2017-02-07 21:08:45 +03:00
return 0;
out_free:
kfree(id);
return ret;
}
EXPORT_SYMBOL_GPL(nvme_init_identify);
static int nvme_dev_open(struct inode *inode, struct file *file)
{
struct nvme_ctrl *ctrl;
int instance = iminor(inode);
int ret = -ENODEV;
spin_lock(&dev_list_lock);
list_for_each_entry(ctrl, &nvme_ctrl_list, node) {
if (ctrl->instance != instance)
continue;
if (!ctrl->admin_q) {
ret = -EWOULDBLOCK;
break;
}
if (!kref_get_unless_zero(&ctrl->kref))
break;
file->private_data = ctrl;
ret = 0;
break;
}
spin_unlock(&dev_list_lock);
return ret;
}
static int nvme_dev_release(struct inode *inode, struct file *file)
{
nvme_put_ctrl(file->private_data);
return 0;
}
static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp)
{
struct nvme_ns *ns;
int ret;
mutex_lock(&ctrl->namespaces_mutex);
if (list_empty(&ctrl->namespaces)) {
ret = -ENOTTY;
goto out_unlock;
}
ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list);
if (ns != list_last_entry(&ctrl->namespaces, struct nvme_ns, list)) {
dev_warn(ctrl->device,
"NVME_IOCTL_IO_CMD not supported when multiple namespaces present!\n");
ret = -EINVAL;
goto out_unlock;
}
dev_warn(ctrl->device,
"using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n");
kref_get(&ns->kref);
mutex_unlock(&ctrl->namespaces_mutex);
ret = nvme_user_cmd(ctrl, ns, argp);
nvme_put_ns(ns);
return ret;
out_unlock:
mutex_unlock(&ctrl->namespaces_mutex);
return ret;
}
static long nvme_dev_ioctl(struct file *file, unsigned int cmd,
unsigned long arg)
{
struct nvme_ctrl *ctrl = file->private_data;
void __user *argp = (void __user *)arg;
switch (cmd) {
case NVME_IOCTL_ADMIN_CMD:
return nvme_user_cmd(ctrl, NULL, argp);
case NVME_IOCTL_IO_CMD:
return nvme_dev_user_cmd(ctrl, argp);
case NVME_IOCTL_RESET:
dev_warn(ctrl->device, "resetting controller\n");
return nvme_reset_ctrl_sync(ctrl);
case NVME_IOCTL_SUBSYS_RESET:
return nvme_reset_subsystem(ctrl);
case NVME_IOCTL_RESCAN:
nvme_queue_scan(ctrl);
return 0;
default:
return -ENOTTY;
}
}
static const struct file_operations nvme_dev_fops = {
.owner = THIS_MODULE,
.open = nvme_dev_open,
.release = nvme_dev_release,
.unlocked_ioctl = nvme_dev_ioctl,
.compat_ioctl = nvme_dev_ioctl,
};
static ssize_t nvme_sysfs_reset(struct device *dev,
struct device_attribute *attr, const char *buf,
size_t count)
{
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
int ret;
ret = nvme_reset_ctrl_sync(ctrl);
if (ret < 0)
return ret;
return count;
}
static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset);
static ssize_t nvme_sysfs_rescan(struct device *dev,
struct device_attribute *attr, const char *buf,
size_t count)
{
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
nvme_queue_scan(ctrl);
return count;
}
static DEVICE_ATTR(rescan_controller, S_IWUSR, NULL, nvme_sysfs_rescan);
static ssize_t wwid_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
lightnvm: expose device geometry through sysfs For a host to access an Open-Channel SSD, it has to know its geometry, so that it writes and reads at the appropriate device bounds. Currently, the geometry information is kept within the kernel, and not exported to user-space for consumption. This patch exposes the configuration through sysfs and enables user-space libraries, such as liblightnvm, to use the sysfs implementation to get the geometry of an Open-Channel SSD. The sysfs entries are stored within the device hierarchy, and can be found using the "lightnvm" device type. An example configuration looks like this: /sys/class/nvme/ └── nvme0n1 ├── capabilities: 3 ├── device_mode: 1 ├── erase_max: 1000000 ├── erase_typ: 1000000 ├── flash_media_type: 0 ├── media_capabilities: 0x00000001 ├── media_type: 0 ├── multiplane: 0x00010101 ├── num_blocks: 1022 ├── num_channels: 1 ├── num_luns: 4 ├── num_pages: 64 ├── num_planes: 1 ├── page_size: 4096 ├── prog_max: 100000 ├── prog_typ: 100000 ├── read_max: 10000 ├── read_typ: 10000 ├── sector_oob_size: 0 ├── sector_size: 4096 ├── media_manager: gennvm ├── ppa_format: 0x380830082808001010102008 ├── vendor_opcode: 0 ├── max_phys_secs: 64 └── version: 1 Signed-off-by: Simon A. F. Lund <slund@cnexlabs.com> Signed-off-by: Matias Bjørling <m@bjorling.me> Signed-off-by: Jens Axboe <axboe@fb.com>
2016-09-16 15:25:08 +03:00
struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
struct nvme_ctrl *ctrl = ns->ctrl;
int serial_len = sizeof(ctrl->serial);
int model_len = sizeof(ctrl->model);
if (!uuid_is_null(&ns->uuid))
return sprintf(buf, "uuid.%pU\n", &ns->uuid);
if (memchr_inv(ns->nguid, 0, sizeof(ns->nguid)))
return sprintf(buf, "eui.%16phN\n", ns->nguid);
if (memchr_inv(ns->eui, 0, sizeof(ns->eui)))
return sprintf(buf, "eui.%8phN\n", ns->eui);
while (serial_len > 0 && (ctrl->serial[serial_len - 1] == ' ' ||
ctrl->serial[serial_len - 1] == '\0'))
serial_len--;
while (model_len > 0 && (ctrl->model[model_len - 1] == ' ' ||
ctrl->model[model_len - 1] == '\0'))
model_len--;
return sprintf(buf, "nvme.%04x-%*phN-%*phN-%08x\n", ctrl->vid,
serial_len, ctrl->serial, model_len, ctrl->model, ns->ns_id);
}
static DEVICE_ATTR(wwid, S_IRUGO, wwid_show, NULL);
static ssize_t nguid_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
return sprintf(buf, "%pU\n", ns->nguid);
}
static DEVICE_ATTR(nguid, S_IRUGO, nguid_show, NULL);
static ssize_t uuid_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
lightnvm: expose device geometry through sysfs For a host to access an Open-Channel SSD, it has to know its geometry, so that it writes and reads at the appropriate device bounds. Currently, the geometry information is kept within the kernel, and not exported to user-space for consumption. This patch exposes the configuration through sysfs and enables user-space libraries, such as liblightnvm, to use the sysfs implementation to get the geometry of an Open-Channel SSD. The sysfs entries are stored within the device hierarchy, and can be found using the "lightnvm" device type. An example configuration looks like this: /sys/class/nvme/ └── nvme0n1 ├── capabilities: 3 ├── device_mode: 1 ├── erase_max: 1000000 ├── erase_typ: 1000000 ├── flash_media_type: 0 ├── media_capabilities: 0x00000001 ├── media_type: 0 ├── multiplane: 0x00010101 ├── num_blocks: 1022 ├── num_channels: 1 ├── num_luns: 4 ├── num_pages: 64 ├── num_planes: 1 ├── page_size: 4096 ├── prog_max: 100000 ├── prog_typ: 100000 ├── read_max: 10000 ├── read_typ: 10000 ├── sector_oob_size: 0 ├── sector_size: 4096 ├── media_manager: gennvm ├── ppa_format: 0x380830082808001010102008 ├── vendor_opcode: 0 ├── max_phys_secs: 64 └── version: 1 Signed-off-by: Simon A. F. Lund <slund@cnexlabs.com> Signed-off-by: Matias Bjørling <m@bjorling.me> Signed-off-by: Jens Axboe <axboe@fb.com>
2016-09-16 15:25:08 +03:00
struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
/* For backward compatibility expose the NGUID to userspace if
* we have no UUID set
*/
if (uuid_is_null(&ns->uuid)) {
printk_ratelimited(KERN_WARNING
"No UUID available providing old NGUID\n");
return sprintf(buf, "%pU\n", ns->nguid);
}
return sprintf(buf, "%pU\n", &ns->uuid);
}
static DEVICE_ATTR(uuid, S_IRUGO, uuid_show, NULL);
static ssize_t eui_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
lightnvm: expose device geometry through sysfs For a host to access an Open-Channel SSD, it has to know its geometry, so that it writes and reads at the appropriate device bounds. Currently, the geometry information is kept within the kernel, and not exported to user-space for consumption. This patch exposes the configuration through sysfs and enables user-space libraries, such as liblightnvm, to use the sysfs implementation to get the geometry of an Open-Channel SSD. The sysfs entries are stored within the device hierarchy, and can be found using the "lightnvm" device type. An example configuration looks like this: /sys/class/nvme/ └── nvme0n1 ├── capabilities: 3 ├── device_mode: 1 ├── erase_max: 1000000 ├── erase_typ: 1000000 ├── flash_media_type: 0 ├── media_capabilities: 0x00000001 ├── media_type: 0 ├── multiplane: 0x00010101 ├── num_blocks: 1022 ├── num_channels: 1 ├── num_luns: 4 ├── num_pages: 64 ├── num_planes: 1 ├── page_size: 4096 ├── prog_max: 100000 ├── prog_typ: 100000 ├── read_max: 10000 ├── read_typ: 10000 ├── sector_oob_size: 0 ├── sector_size: 4096 ├── media_manager: gennvm ├── ppa_format: 0x380830082808001010102008 ├── vendor_opcode: 0 ├── max_phys_secs: 64 └── version: 1 Signed-off-by: Simon A. F. Lund <slund@cnexlabs.com> Signed-off-by: Matias Bjørling <m@bjorling.me> Signed-off-by: Jens Axboe <axboe@fb.com>
2016-09-16 15:25:08 +03:00
struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
return sprintf(buf, "%8phd\n", ns->eui);
}
static DEVICE_ATTR(eui, S_IRUGO, eui_show, NULL);
static ssize_t nsid_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
lightnvm: expose device geometry through sysfs For a host to access an Open-Channel SSD, it has to know its geometry, so that it writes and reads at the appropriate device bounds. Currently, the geometry information is kept within the kernel, and not exported to user-space for consumption. This patch exposes the configuration through sysfs and enables user-space libraries, such as liblightnvm, to use the sysfs implementation to get the geometry of an Open-Channel SSD. The sysfs entries are stored within the device hierarchy, and can be found using the "lightnvm" device type. An example configuration looks like this: /sys/class/nvme/ └── nvme0n1 ├── capabilities: 3 ├── device_mode: 1 ├── erase_max: 1000000 ├── erase_typ: 1000000 ├── flash_media_type: 0 ├── media_capabilities: 0x00000001 ├── media_type: 0 ├── multiplane: 0x00010101 ├── num_blocks: 1022 ├── num_channels: 1 ├── num_luns: 4 ├── num_pages: 64 ├── num_planes: 1 ├── page_size: 4096 ├── prog_max: 100000 ├── prog_typ: 100000 ├── read_max: 10000 ├── read_typ: 10000 ├── sector_oob_size: 0 ├── sector_size: 4096 ├── media_manager: gennvm ├── ppa_format: 0x380830082808001010102008 ├── vendor_opcode: 0 ├── max_phys_secs: 64 └── version: 1 Signed-off-by: Simon A. F. Lund <slund@cnexlabs.com> Signed-off-by: Matias Bjørling <m@bjorling.me> Signed-off-by: Jens Axboe <axboe@fb.com>
2016-09-16 15:25:08 +03:00
struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
return sprintf(buf, "%d\n", ns->ns_id);
}
static DEVICE_ATTR(nsid, S_IRUGO, nsid_show, NULL);
static struct attribute *nvme_ns_attrs[] = {
&dev_attr_wwid.attr,
&dev_attr_uuid.attr,
&dev_attr_nguid.attr,
&dev_attr_eui.attr,
&dev_attr_nsid.attr,
NULL,
};
static umode_t nvme_ns_attrs_are_visible(struct kobject *kobj,
struct attribute *a, int n)
{
struct device *dev = container_of(kobj, struct device, kobj);
lightnvm: expose device geometry through sysfs For a host to access an Open-Channel SSD, it has to know its geometry, so that it writes and reads at the appropriate device bounds. Currently, the geometry information is kept within the kernel, and not exported to user-space for consumption. This patch exposes the configuration through sysfs and enables user-space libraries, such as liblightnvm, to use the sysfs implementation to get the geometry of an Open-Channel SSD. The sysfs entries are stored within the device hierarchy, and can be found using the "lightnvm" device type. An example configuration looks like this: /sys/class/nvme/ └── nvme0n1 ├── capabilities: 3 ├── device_mode: 1 ├── erase_max: 1000000 ├── erase_typ: 1000000 ├── flash_media_type: 0 ├── media_capabilities: 0x00000001 ├── media_type: 0 ├── multiplane: 0x00010101 ├── num_blocks: 1022 ├── num_channels: 1 ├── num_luns: 4 ├── num_pages: 64 ├── num_planes: 1 ├── page_size: 4096 ├── prog_max: 100000 ├── prog_typ: 100000 ├── read_max: 10000 ├── read_typ: 10000 ├── sector_oob_size: 0 ├── sector_size: 4096 ├── media_manager: gennvm ├── ppa_format: 0x380830082808001010102008 ├── vendor_opcode: 0 ├── max_phys_secs: 64 └── version: 1 Signed-off-by: Simon A. F. Lund <slund@cnexlabs.com> Signed-off-by: Matias Bjørling <m@bjorling.me> Signed-off-by: Jens Axboe <axboe@fb.com>
2016-09-16 15:25:08 +03:00
struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
if (a == &dev_attr_uuid.attr) {
if (uuid_is_null(&ns->uuid) ||
!memchr_inv(ns->nguid, 0, sizeof(ns->nguid)))
return 0;
}
if (a == &dev_attr_nguid.attr) {
if (!memchr_inv(ns->nguid, 0, sizeof(ns->nguid)))
return 0;
}
if (a == &dev_attr_eui.attr) {
if (!memchr_inv(ns->eui, 0, sizeof(ns->eui)))
return 0;
}
return a->mode;
}
static const struct attribute_group nvme_ns_attr_group = {
.attrs = nvme_ns_attrs,
.is_visible = nvme_ns_attrs_are_visible,
};
#define nvme_show_str_function(field) \
static ssize_t field##_show(struct device *dev, \
struct device_attribute *attr, char *buf) \
{ \
struct nvme_ctrl *ctrl = dev_get_drvdata(dev); \
return sprintf(buf, "%.*s\n", (int)sizeof(ctrl->field), ctrl->field); \
} \
static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL);
#define nvme_show_int_function(field) \
static ssize_t field##_show(struct device *dev, \
struct device_attribute *attr, char *buf) \
{ \
struct nvme_ctrl *ctrl = dev_get_drvdata(dev); \
return sprintf(buf, "%d\n", ctrl->field); \
} \
static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL);
nvme_show_str_function(model);
nvme_show_str_function(serial);
nvme_show_str_function(firmware_rev);
nvme_show_int_function(cntlid);
static ssize_t nvme_sysfs_delete(struct device *dev,
struct device_attribute *attr, const char *buf,
size_t count)
{
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
if (device_remove_file_self(dev, attr))
ctrl->ops->delete_ctrl(ctrl);
return count;
}
static DEVICE_ATTR(delete_controller, S_IWUSR, NULL, nvme_sysfs_delete);
static ssize_t nvme_sysfs_show_transport(struct device *dev,
struct device_attribute *attr,
char *buf)
{
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->ops->name);
}
static DEVICE_ATTR(transport, S_IRUGO, nvme_sysfs_show_transport, NULL);
static ssize_t nvme_sysfs_show_state(struct device *dev,
struct device_attribute *attr,
char *buf)
{
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
static const char *const state_name[] = {
[NVME_CTRL_NEW] = "new",
[NVME_CTRL_LIVE] = "live",
[NVME_CTRL_RESETTING] = "resetting",
[NVME_CTRL_RECONNECTING]= "reconnecting",
[NVME_CTRL_DELETING] = "deleting",
[NVME_CTRL_DEAD] = "dead",
};
if ((unsigned)ctrl->state < ARRAY_SIZE(state_name) &&
state_name[ctrl->state])
return sprintf(buf, "%s\n", state_name[ctrl->state]);
return sprintf(buf, "unknown state\n");
}
static DEVICE_ATTR(state, S_IRUGO, nvme_sysfs_show_state, NULL);
static ssize_t nvme_sysfs_show_subsysnqn(struct device *dev,
struct device_attribute *attr,
char *buf)
{
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->subnqn);
}
static DEVICE_ATTR(subsysnqn, S_IRUGO, nvme_sysfs_show_subsysnqn, NULL);
static ssize_t nvme_sysfs_show_address(struct device *dev,
struct device_attribute *attr,
char *buf)
{
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
return ctrl->ops->get_address(ctrl, buf, PAGE_SIZE);
}
static DEVICE_ATTR(address, S_IRUGO, nvme_sysfs_show_address, NULL);
static struct attribute *nvme_dev_attrs[] = {
&dev_attr_reset_controller.attr,
&dev_attr_rescan_controller.attr,
&dev_attr_model.attr,
&dev_attr_serial.attr,
&dev_attr_firmware_rev.attr,
&dev_attr_cntlid.attr,
&dev_attr_delete_controller.attr,
&dev_attr_transport.attr,
&dev_attr_subsysnqn.attr,
&dev_attr_address.attr,
&dev_attr_state.attr,
NULL
};
static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj,
struct attribute *a, int n)
{
struct device *dev = container_of(kobj, struct device, kobj);
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
if (a == &dev_attr_delete_controller.attr && !ctrl->ops->delete_ctrl)
return 0;
if (a == &dev_attr_address.attr && !ctrl->ops->get_address)
return 0;
return a->mode;
}
static struct attribute_group nvme_dev_attrs_group = {
.attrs = nvme_dev_attrs,
.is_visible = nvme_dev_attrs_are_visible,
};
static const struct attribute_group *nvme_dev_attr_groups[] = {
&nvme_dev_attrs_group,
NULL,
};
static int ns_cmp(void *priv, struct list_head *a, struct list_head *b)
{
struct nvme_ns *nsa = container_of(a, struct nvme_ns, list);
struct nvme_ns *nsb = container_of(b, struct nvme_ns, list);
return nsa->ns_id - nsb->ns_id;
}
static struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
{
struct nvme_ns *ns, *ret = NULL;
mutex_lock(&ctrl->namespaces_mutex);
list_for_each_entry(ns, &ctrl->namespaces, list) {
if (ns->ns_id == nsid) {
if (!kref_get_unless_zero(&ns->kref))
continue;
ret = ns;
break;
}
if (ns->ns_id > nsid)
break;
}
mutex_unlock(&ctrl->namespaces_mutex);
return ret;
}
static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns)
{
struct streams_directive_params s;
int ret;
if (!ctrl->nr_streams)
return 0;
ret = nvme_get_stream_params(ctrl, &s, ns->ns_id);
if (ret)
return ret;
ns->sws = le32_to_cpu(s.sws);
ns->sgs = le16_to_cpu(s.sgs);
if (ns->sws) {
unsigned int bs = 1 << ns->lba_shift;
blk_queue_io_min(ns->queue, bs * ns->sws);
if (ns->sgs)
blk_queue_io_opt(ns->queue, bs * ns->sws * ns->sgs);
}
return 0;
}
static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
{
struct nvme_ns *ns;
struct gendisk *disk;
struct nvme_id_ns *id;
char disk_name[DISK_NAME_LEN];
int node = dev_to_node(ctrl->dev);
ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
if (!ns)
return;
ns->instance = ida_simple_get(&ctrl->ns_ida, 1, 0, GFP_KERNEL);
if (ns->instance < 0)
goto out_free_ns;
ns->queue = blk_mq_init_queue(ctrl->tagset);
if (IS_ERR(ns->queue))
goto out_release_instance;
queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue);
ns->queue->queuedata = ns;
ns->ctrl = ctrl;
kref_init(&ns->kref);
ns->ns_id = nsid;
ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */
blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
nvme_set_queue_limits(ctrl, ns->queue);
nvme_setup_streams_ns(ctrl, ns);
sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->instance);
id = nvme_identify_ns(ctrl, nsid);
if (!id)
goto out_free_queue;
if (id->ncap == 0)
goto out_free_id;
nvme_report_ns_ids(ctrl, ns->ns_id, id, ns->eui, ns->nguid, &ns->uuid);
if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) {
if (nvme_nvm_register(ns, disk_name, node)) {
dev_warn(ctrl->device, "LightNVM init failure\n");
goto out_free_id;
}
}
disk = alloc_disk_node(0, node);
if (!disk)
goto out_free_id;
disk->fops = &nvme_fops;
disk->private_data = ns;
disk->queue = ns->queue;
disk->flags = GENHD_FL_EXT_DEVT;
memcpy(disk->disk_name, disk_name, DISK_NAME_LEN);
ns->disk = disk;
__nvme_revalidate_disk(disk, id);
mutex_lock(&ctrl->namespaces_mutex);
list_add_tail(&ns->list, &ctrl->namespaces);
mutex_unlock(&ctrl->namespaces_mutex);
kref_get(&ctrl->kref);
kfree(id);
device_add_disk(ctrl->device, ns->disk);
if (sysfs_create_group(&disk_to_dev(ns->disk)->kobj,
&nvme_ns_attr_group))
pr_warn("%s: failed to create sysfs group for identification\n",
ns->disk->disk_name);
if (ns->ndev && nvme_nvm_register_sysfs(ns))
pr_warn("%s: failed to register lightnvm sysfs group for identification\n",
ns->disk->disk_name);
return;
out_free_id:
kfree(id);
out_free_queue:
blk_cleanup_queue(ns->queue);
out_release_instance:
ida_simple_remove(&ctrl->ns_ida, ns->instance);
out_free_ns:
kfree(ns);
}
static void nvme_ns_remove(struct nvme_ns *ns)
{
if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags))
return;
if (ns->disk && ns->disk->flags & GENHD_FL_UP) {
if (blk_get_integrity(ns->disk))
blk_integrity_unregister(ns->disk);
sysfs_remove_group(&disk_to_dev(ns->disk)->kobj,
&nvme_ns_attr_group);
if (ns->ndev)
nvme_nvm_unregister_sysfs(ns);
del_gendisk(ns->disk);
blk_cleanup_queue(ns->queue);
}
mutex_lock(&ns->ctrl->namespaces_mutex);
list_del_init(&ns->list);
mutex_unlock(&ns->ctrl->namespaces_mutex);
nvme_put_ns(ns);
}
static void nvme_validate_ns(struct nvme_ctrl *ctrl, unsigned nsid)
{
struct nvme_ns *ns;
ns = nvme_find_get_ns(ctrl, nsid);
if (ns) {
if (ns->disk && revalidate_disk(ns->disk))
nvme_ns_remove(ns);
nvme_put_ns(ns);
} else
nvme_alloc_ns(ctrl, nsid);
}
static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
unsigned nsid)
{
struct nvme_ns *ns, *next;
list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
if (ns->ns_id > nsid)
nvme_ns_remove(ns);
}
}
static int nvme_scan_ns_list(struct nvme_ctrl *ctrl, unsigned nn)
{
struct nvme_ns *ns;
__le32 *ns_list;
unsigned i, j, nsid, prev = 0, num_lists = DIV_ROUND_UP(nn, 1024);
int ret = 0;
ns_list = kzalloc(0x1000, GFP_KERNEL);
if (!ns_list)
return -ENOMEM;
for (i = 0; i < num_lists; i++) {
ret = nvme_identify_ns_list(ctrl, prev, ns_list);
if (ret)
goto free;
for (j = 0; j < min(nn, 1024U); j++) {
nsid = le32_to_cpu(ns_list[j]);
if (!nsid)
goto out;
nvme_validate_ns(ctrl, nsid);
while (++prev < nsid) {
ns = nvme_find_get_ns(ctrl, prev);
if (ns) {
nvme_ns_remove(ns);
nvme_put_ns(ns);
}
}
}
nn -= j;
}
out:
nvme_remove_invalid_namespaces(ctrl, prev);
free:
kfree(ns_list);
return ret;
}
static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl, unsigned nn)
{
unsigned i;
for (i = 1; i <= nn; i++)
nvme_validate_ns(ctrl, i);
nvme_remove_invalid_namespaces(ctrl, nn);
}
static void nvme_scan_work(struct work_struct *work)
{
struct nvme_ctrl *ctrl =
container_of(work, struct nvme_ctrl, scan_work);
struct nvme_id_ctrl *id;
unsigned nn;
if (ctrl->state != NVME_CTRL_LIVE)
return;
if (nvme_identify_ctrl(ctrl, &id))
return;
nn = le32_to_cpu(id->nn);
if (ctrl->vs >= NVME_VS(1, 1, 0) &&
!(ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS)) {
if (!nvme_scan_ns_list(ctrl, nn))
goto done;
}
nvme_scan_ns_sequential(ctrl, nn);
done:
mutex_lock(&ctrl->namespaces_mutex);
list_sort(NULL, &ctrl->namespaces, ns_cmp);
mutex_unlock(&ctrl->namespaces_mutex);
kfree(id);
}
void nvme_queue_scan(struct nvme_ctrl *ctrl)
{
/*
* Do not queue new scan work when a controller is reset during
* removal.
*/
if (ctrl->state == NVME_CTRL_LIVE)
queue_work(nvme_wq, &ctrl->scan_work);
}
EXPORT_SYMBOL_GPL(nvme_queue_scan);
/*
* This function iterates the namespace list unlocked to allow recovery from
* controller failure. It is up to the caller to ensure the namespace list is
* not modified by scan work while this function is executing.
*/
void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
{
struct nvme_ns *ns, *next;
/*
* The dead states indicates the controller was not gracefully
* disconnected. In that case, we won't be able to flush any data while
* removing the namespaces' disks; fail all the queues now to avoid
* potentially having to clean up the failed sync later.
*/
if (ctrl->state == NVME_CTRL_DEAD)
nvme_kill_queues(ctrl);
list_for_each_entry_safe(ns, next, &ctrl->namespaces, list)
nvme_ns_remove(ns);
}
EXPORT_SYMBOL_GPL(nvme_remove_namespaces);
static void nvme_async_event_work(struct work_struct *work)
{
struct nvme_ctrl *ctrl =
container_of(work, struct nvme_ctrl, async_event_work);
spin_lock_irq(&ctrl->lock);
nvme: stop aer posting if controller state not live If an nvme async_event command completes, in most cases, a new async event is posted. However, if the controller enters a resetting or reconnecting state, there is nothing to block the scheduled work element from posting the async event again. Nor are there calls from the transport to stop async events when an association dies. In the case of FC, where the association is torn down, the aer must be aborted on the FC link and completes through the normal job completion path. Thus the terminated async event ends up being rescheduled even though the controller isn't in a valid state for the aer, and the reposting gets the transport into a partially torn down data structure. It's possible to hit the scenario on rdma, although much less likely due to an aer completing right as the association is terminated and as the association teardown reclaims the blk requests via nvme_cancel_request() so its immediate, not a link-related action like on FC. Fix by putting controller state checks in both the async event completion routine where it schedules the async event and in the async event work routine before it calls into the transport. It's effectively a "stop_async_events()" behavior. The transport, when it creates a new association with the subsystem will transition the state back to live and is already restarting the async event posting. Signed-off-by: James Smart <james.smart@broadcom.com> [hch: remove taking a lock over reading the controller state] Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Jens Axboe <axboe@kernel.dk>
2017-09-14 21:03:09 +03:00
while (ctrl->state == NVME_CTRL_LIVE && ctrl->event_limit > 0) {
int aer_idx = --ctrl->event_limit;
spin_unlock_irq(&ctrl->lock);
ctrl->ops->submit_async_event(ctrl, aer_idx);
spin_lock_irq(&ctrl->lock);
}
spin_unlock_irq(&ctrl->lock);
}
static bool nvme_ctrl_pp_status(struct nvme_ctrl *ctrl)
{
u32 csts;
if (ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts))
return false;
if (csts == ~0)
return false;
return ((ctrl->ctrl_config & NVME_CC_ENABLE) && (csts & NVME_CSTS_PP));
}
static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl)
{
struct nvme_command c = { };
struct nvme_fw_slot_info_log *log;
log = kmalloc(sizeof(*log), GFP_KERNEL);
if (!log)
return;
c.common.opcode = nvme_admin_get_log_page;
c.common.nsid = cpu_to_le32(NVME_NSID_ALL);
c.common.cdw10[0] = nvme_get_log_dw10(NVME_LOG_FW_SLOT, sizeof(*log));
if (!nvme_submit_sync_cmd(ctrl->admin_q, &c, log, sizeof(*log)))
dev_warn(ctrl->device,
"Get FW SLOT INFO log error\n");
kfree(log);
}
static void nvme_fw_act_work(struct work_struct *work)
{
struct nvme_ctrl *ctrl = container_of(work,
struct nvme_ctrl, fw_act_work);
unsigned long fw_act_timeout;
if (ctrl->mtfa)
fw_act_timeout = jiffies +
msecs_to_jiffies(ctrl->mtfa * 100);
else
fw_act_timeout = jiffies +
msecs_to_jiffies(admin_timeout * 1000);
nvme_stop_queues(ctrl);
while (nvme_ctrl_pp_status(ctrl)) {
if (time_after(jiffies, fw_act_timeout)) {
dev_warn(ctrl->device,
"Fw activation timeout, reset controller\n");
nvme_reset_ctrl(ctrl);
break;
}
msleep(100);
}
if (ctrl->state != NVME_CTRL_LIVE)
return;
nvme_start_queues(ctrl);
/* read FW slot informationi to clear the AER*/
nvme_get_fw_slot_info(ctrl);
}
void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
union nvme_result *res)
{
u32 result = le32_to_cpu(res->u32);
bool done = true;
switch (le16_to_cpu(status) >> 1) {
case NVME_SC_SUCCESS:
done = false;
/*FALLTHRU*/
case NVME_SC_ABORT_REQ:
++ctrl->event_limit;
nvme: stop aer posting if controller state not live If an nvme async_event command completes, in most cases, a new async event is posted. However, if the controller enters a resetting or reconnecting state, there is nothing to block the scheduled work element from posting the async event again. Nor are there calls from the transport to stop async events when an association dies. In the case of FC, where the association is torn down, the aer must be aborted on the FC link and completes through the normal job completion path. Thus the terminated async event ends up being rescheduled even though the controller isn't in a valid state for the aer, and the reposting gets the transport into a partially torn down data structure. It's possible to hit the scenario on rdma, although much less likely due to an aer completing right as the association is terminated and as the association teardown reclaims the blk requests via nvme_cancel_request() so its immediate, not a link-related action like on FC. Fix by putting controller state checks in both the async event completion routine where it schedules the async event and in the async event work routine before it calls into the transport. It's effectively a "stop_async_events()" behavior. The transport, when it creates a new association with the subsystem will transition the state back to live and is already restarting the async event posting. Signed-off-by: James Smart <james.smart@broadcom.com> [hch: remove taking a lock over reading the controller state] Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Jens Axboe <axboe@kernel.dk>
2017-09-14 21:03:09 +03:00
if (ctrl->state == NVME_CTRL_LIVE)
queue_work(nvme_wq, &ctrl->async_event_work);
break;
default:
break;
}
if (done)
return;
switch (result & 0xff07) {
case NVME_AER_NOTICE_NS_CHANGED:
dev_info(ctrl->device, "rescanning\n");
nvme_queue_scan(ctrl);
break;
case NVME_AER_NOTICE_FW_ACT_STARTING:
queue_work(nvme_wq, &ctrl->fw_act_work);
break;
default:
dev_warn(ctrl->device, "async event result %08x\n", result);
}
}
EXPORT_SYMBOL_GPL(nvme_complete_async_event);
void nvme_queue_async_events(struct nvme_ctrl *ctrl)
{
ctrl->event_limit = NVME_NR_AERS;
queue_work(nvme_wq, &ctrl->async_event_work);
}
EXPORT_SYMBOL_GPL(nvme_queue_async_events);
void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
{
nvme_stop_keep_alive(ctrl);
flush_work(&ctrl->async_event_work);
flush_work(&ctrl->scan_work);
cancel_work_sync(&ctrl->fw_act_work);
}
EXPORT_SYMBOL_GPL(nvme_stop_ctrl);
void nvme_start_ctrl(struct nvme_ctrl *ctrl)
{
if (ctrl->kato)
nvme_start_keep_alive(ctrl);
if (ctrl->queue_count > 1) {
nvme_queue_scan(ctrl);
nvme_queue_async_events(ctrl);
nvme_start_queues(ctrl);
}
}
EXPORT_SYMBOL_GPL(nvme_start_ctrl);
void nvme_uninit_ctrl(struct nvme_ctrl *ctrl)
{
device_destroy(nvme_class, MKDEV(nvme_char_major, ctrl->instance));
spin_lock(&dev_list_lock);
list_del(&ctrl->node);
spin_unlock(&dev_list_lock);
}
EXPORT_SYMBOL_GPL(nvme_uninit_ctrl);
static void nvme_free_ctrl(struct kref *kref)
{
struct nvme_ctrl *ctrl = container_of(kref, struct nvme_ctrl, kref);
put_device(ctrl->device);
ida_simple_remove(&nvme_instance_ida, ctrl->instance);
ida_destroy(&ctrl->ns_ida);
ctrl->ops->free_ctrl(ctrl);
}
void nvme_put_ctrl(struct nvme_ctrl *ctrl)
{
kref_put(&ctrl->kref, nvme_free_ctrl);
}
EXPORT_SYMBOL_GPL(nvme_put_ctrl);
/*
* Initialize a NVMe controller structures. This needs to be called during
* earliest initialization so that we have the initialized structured around
* during probing.
*/
int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
const struct nvme_ctrl_ops *ops, unsigned long quirks)
{
int ret;
ctrl->state = NVME_CTRL_NEW;
spin_lock_init(&ctrl->lock);
INIT_LIST_HEAD(&ctrl->namespaces);
mutex_init(&ctrl->namespaces_mutex);
kref_init(&ctrl->kref);
ctrl->dev = dev;
ctrl->ops = ops;
ctrl->quirks = quirks;
INIT_WORK(&ctrl->scan_work, nvme_scan_work);
INIT_WORK(&ctrl->async_event_work, nvme_async_event_work);
INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work);
ret = ida_simple_get(&nvme_instance_ida, 0, 0, GFP_KERNEL);
if (ret < 0)
goto out;
ctrl->instance = ret;
ctrl->device = device_create_with_groups(nvme_class, ctrl->dev,
MKDEV(nvme_char_major, ctrl->instance),
ctrl, nvme_dev_attr_groups,
"nvme%d", ctrl->instance);
if (IS_ERR(ctrl->device)) {
ret = PTR_ERR(ctrl->device);
goto out_release_instance;
}
get_device(ctrl->device);
ida_init(&ctrl->ns_ida);
spin_lock(&dev_list_lock);
list_add_tail(&ctrl->node, &nvme_ctrl_list);
spin_unlock(&dev_list_lock);
nvme: Enable autonomous power state transitions NVMe devices can advertise multiple power states. These states can be either "operational" (the device is fully functional but possibly slow) or "non-operational" (the device is asleep until woken up). Some devices can automatically enter a non-operational state when idle for a specified amount of time and then automatically wake back up when needed. The hardware configuration is a table. For each state, an entry in the table indicates the next deeper non-operational state, if any, to autonomously transition to and the idle time required before transitioning. This patch teaches the driver to program APST so that each successive non-operational state will be entered after an idle time equal to 100% of the total latency (entry plus exit) associated with that state. The maximum acceptable latency is controlled using dev_pm_qos (e.g. power/pm_qos_latency_tolerance_us in sysfs); non-operational states with total latency greater than this value will not be used. As a special case, setting the latency tolerance to 0 will disable APST entirely. On hardware without APST support, the sysfs file will not be exposed. The latency tolerance for newly-probed devices is set by the module parameter nvme_core.default_ps_max_latency_us. In theory, the device can expose "default" APST table, but this doesn't seem to function correctly on my device (Samsung 950), nor does it seem particularly useful. There is also an optional mechanism by which a configuration can be "saved" so it will be automatically loaded on reset. This can be configured from userspace, but it doesn't seem useful to support in the driver. On my laptop, enabling APST seems to save nearly 1W. The hardware tables can be decoded in userspace with nvme-cli. 'nvme id-ctrl /dev/nvmeN' will show the power state table and 'nvme get-feature -f 0x0c -H /dev/nvme0' will show the current APST configuration. This feature is quirked off on a known-buggy Samsung device. Signed-off-by: Andy Lutomirski <luto@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Sagi Grimberg <sagi@grimberg.me> Signed-off-by: Jens Axboe <axboe@fb.com>
2017-02-07 21:08:45 +03:00
/*
* Initialize latency tolerance controls. The sysfs files won't
* be visible to userspace unless the device actually supports APST.
*/
ctrl->device->power.set_latency_tolerance = nvme_set_latency_tolerance;
dev_pm_qos_update_user_latency_tolerance(ctrl->device,
min(default_ps_max_latency_us, (unsigned long)S32_MAX));
return 0;
out_release_instance:
ida_simple_remove(&nvme_instance_ida, ctrl->instance);
out:
return ret;
}
EXPORT_SYMBOL_GPL(nvme_init_ctrl);
/**
* nvme_kill_queues(): Ends all namespace queues
* @ctrl: the dead controller that needs to end
*
* Call this function when the driver determines it is unable to get the
* controller in a state capable of servicing IO.
*/
void nvme_kill_queues(struct nvme_ctrl *ctrl)
{
struct nvme_ns *ns;
mutex_lock(&ctrl->namespaces_mutex);
nvme: fix hang in remove path We need to start admin queues too in nvme_kill_queues() for avoiding hang in remove path[1]. This patch is very similar with 806f026f9b901eaf(nvme: use blk_mq_start_hw_queues() in nvme_kill_queues()). [1] hang stack trace [<ffffffff813c9716>] blk_execute_rq+0x56/0x80 [<ffffffff815cb6e9>] __nvme_submit_sync_cmd+0x89/0xf0 [<ffffffff815ce7be>] nvme_set_features+0x5e/0x90 [<ffffffff815ce9f6>] nvme_configure_apst+0x166/0x200 [<ffffffff815cef45>] nvme_set_latency_tolerance+0x35/0x50 [<ffffffff8157bd11>] apply_constraint+0xb1/0xc0 [<ffffffff8157cbb4>] dev_pm_qos_constraints_destroy+0xf4/0x1f0 [<ffffffff8157b44a>] dpm_sysfs_remove+0x2a/0x60 [<ffffffff8156d951>] device_del+0x101/0x320 [<ffffffff8156db8a>] device_unregister+0x1a/0x60 [<ffffffff8156dc4c>] device_destroy+0x3c/0x50 [<ffffffff815cd295>] nvme_uninit_ctrl+0x45/0xa0 [<ffffffff815d4858>] nvme_remove+0x78/0x110 [<ffffffff81452b69>] pci_device_remove+0x39/0xb0 [<ffffffff81572935>] device_release_driver_internal+0x155/0x210 [<ffffffff81572a02>] device_release_driver+0x12/0x20 [<ffffffff815d36fb>] nvme_remove_dead_ctrl_work+0x6b/0x70 [<ffffffff810bf3bc>] process_one_work+0x18c/0x3a0 [<ffffffff810bf61e>] worker_thread+0x4e/0x3b0 [<ffffffff810c5ac9>] kthread+0x109/0x140 [<ffffffff8185800c>] ret_from_fork+0x2c/0x40 [<ffffffffffffffff>] 0xffffffffffffffff Fixes: c5552fde102fc("nvme: Enable autonomous power state transitions") Reported-by: Rakesh Pandit <rakesh@tuxera.com> Tested-by: Rakesh Pandit <rakesh@tuxera.com> Reviewed-by: Sagi Grimberg <sagi@grimberg.me> Signed-off-by: Ming Lei <ming.lei@redhat.com> Signed-off-by: Christoph Hellwig <hch@lst.de>
2017-06-02 11:32:08 +03:00
/* Forcibly unquiesce queues to avoid blocking dispatch */
nvme: validate admin queue before unquiesce With a misbehaving controller it's possible we'll never enter the live state and create an admin queue. When we fail out of reset work it's possible we failed out early enough without setting up the admin queue. We tear down queues after a failed reset, but needed to do some more sanitization. Fixes 443bd90f2cca: "nvme: host: unquiesce queue in nvme_kill_queues()" [ 189.650995] nvme nvme1: pci function 0000:0b:00.0 [ 317.680055] nvme nvme0: Device not ready; aborting reset [ 317.680183] nvme nvme0: Removing after probe failure status: -19 [ 317.681258] kasan: GPF could be caused by NULL-ptr deref or user memory access [ 317.681397] general protection fault: 0000 [#1] SMP KASAN [ 317.682984] CPU: 3 PID: 477 Comm: kworker/3:2 Not tainted 4.13.0-rc1+ #5 [ 317.683112] Hardware name: Gigabyte Technology Co., Ltd. Z170X-UD5/Z170X-UD5-CF, BIOS F5 03/07/2016 [ 317.683284] Workqueue: events nvme_remove_dead_ctrl_work [nvme] [ 317.683398] task: ffff8803b0990000 task.stack: ffff8803c2ef0000 [ 317.683516] RIP: 0010:blk_mq_unquiesce_queue+0x2b/0xa0 [ 317.683614] RSP: 0018:ffff8803c2ef7d40 EFLAGS: 00010282 [ 317.683716] RAX: dffffc0000000000 RBX: 0000000000000000 RCX: 1ffff1006fbdcde3 [ 317.683847] RDX: 0000000000000038 RSI: 1ffff1006f5a9245 RDI: 0000000000000000 [ 317.683978] RBP: ffff8803c2ef7d58 R08: 1ffff1007bcdc974 R09: 0000000000000000 [ 317.684108] R10: 1ffff1007bcdc975 R11: 0000000000000000 R12: 00000000000001c0 [ 317.684239] R13: ffff88037ad49228 R14: ffff88037ad492d0 R15: ffff88037ad492e0 [ 317.684371] FS: 0000000000000000(0000) GS:ffff8803de6c0000(0000) knlGS:0000000000000000 [ 317.684519] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 317.684627] CR2: 0000002d1860c000 CR3: 000000045b40d000 CR4: 00000000003406e0 [ 317.684758] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 317.684888] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 317.685018] Call Trace: [ 317.685084] nvme_kill_queues+0x4d/0x170 [nvme_core] [ 317.685185] nvme_remove_dead_ctrl_work+0x3a/0x90 [nvme] [ 317.685289] process_one_work+0x771/0x1170 [ 317.685372] worker_thread+0xde/0x11e0 [ 317.685452] ? pci_mmcfg_check_reserved+0x110/0x110 [ 317.685550] kthread+0x2d3/0x3d0 [ 317.685617] ? process_one_work+0x1170/0x1170 [ 317.685704] ? kthread_create_on_node+0xc0/0xc0 [ 317.685785] ret_from_fork+0x25/0x30 [ 317.685798] Code: 0f 1f 44 00 00 55 48 b8 00 00 00 00 00 fc ff df 48 89 e5 41 54 4c 8d a7 c0 01 00 00 53 48 89 fb 4c 89 e2 48 c1 ea 03 48 83 ec 08 <80> 3c 02 00 75 50 48 8b bb c0 01 00 00 e8 33 8a f9 00 0f ba b3 [ 317.685872] RIP: blk_mq_unquiesce_queue+0x2b/0xa0 RSP: ffff8803c2ef7d40 [ 317.685908] ---[ end trace a3f8704150b1e8b4 ]--- Signed-off-by: Scott Bauer <scott.bauer@intel.com> Signed-off-by: Christoph Hellwig <hch@lst.de>
2017-07-25 19:27:06 +03:00
if (ctrl->admin_q)
blk_mq_unquiesce_queue(ctrl->admin_q);
list_for_each_entry(ns, &ctrl->namespaces, list) {
/*
* Revalidating a dead namespace sets capacity to 0. This will
* end buffered writers dirtying pages that can't be synced.
*/
if (!ns->disk || test_and_set_bit(NVME_NS_DEAD, &ns->flags))
continue;
revalidate_disk(ns->disk);
blk_set_queue_dying(ns->queue);
/* Forcibly unquiesce queues to avoid blocking dispatch */
blk_mq_unquiesce_queue(ns->queue);
}
mutex_unlock(&ctrl->namespaces_mutex);
}
Merge branch 'for-4.6/drivers' of git://git.kernel.dk/linux-block Pull block driver updates from Jens Axboe: "This is the block driver pull request for this merge window. It sits on top of for-4.6/core, that was just sent out. This contains: - A set of fixes for lightnvm. One from Alan, fixing an overflow, and the rest from the usual suspects, Javier and Matias. - A set of fixes for nbd from Markus and Dan, and a fixup from Arnd for correct usage of the signed 64-bit divider. - A set of bug fixes for the Micron mtip32xx, from Asai. - A fix for the brd discard handling from Bart. - Update the maintainers entry for cciss, since that hardware has transferred ownership. - Three bug fixes for bcache from Eric Wheeler. - Set of fixes for xen-blk{back,front} from Jan and Konrad. - Removal of the cpqarray driver. It has been disabled in Kconfig since 2013, and we were initially scheduled to remove it in 3.15. - Various updates and fixes for NVMe, with the most important being: - Removal of the per-device NVMe thread, replacing that with a watchdog timer instead. From Christoph. - Exposing the namespace WWID through sysfs, from Keith. - Set of cleanups from Ming Lin. - Logging the controller device name instead of the underlying PCI device name, from Sagi. - And a bunch of fixes and optimizations from the usual suspects in this area" * 'for-4.6/drivers' of git://git.kernel.dk/linux-block: (49 commits) NVMe: Expose ns wwid through single sysfs entry drivers:block: cpqarray clean up brd: Fix discard request processing cpqarray: remove it from the kernel cciss: update MAINTAINERS NVMe: Remove unused sq_head read in completion path bcache: fix cache_set_flush() NULL pointer dereference on OOM bcache: cleaned up error handling around register_cache() bcache: fix race of writeback thread starting before complete initialization NVMe: Create discard zero quirk white list nbd: use correct div_s64 helper mtip32xx: remove unneeded variable in mtip_cmd_timeout() lightnvm: generalize rrpc ppa calculations lightnvm: remove struct nvm_dev->total_blocks lightnvm: rename ->nr_pages to ->nr_sects lightnvm: update closed list outside of intr context xen/blback: Fit the important information of the thread in 17 characters lightnvm: fold get bb tbl when using dual/quad plane mode lightnvm: fix up nonsensical configure overrun checking xen-blkback: advertise indirect segment support earlier ...
2016-03-19 03:13:31 +03:00
EXPORT_SYMBOL_GPL(nvme_kill_queues);
void nvme_unfreeze(struct nvme_ctrl *ctrl)
{
struct nvme_ns *ns;
mutex_lock(&ctrl->namespaces_mutex);
list_for_each_entry(ns, &ctrl->namespaces, list)
blk_mq_unfreeze_queue(ns->queue);
mutex_unlock(&ctrl->namespaces_mutex);
}
EXPORT_SYMBOL_GPL(nvme_unfreeze);
void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout)
{
struct nvme_ns *ns;
mutex_lock(&ctrl->namespaces_mutex);
list_for_each_entry(ns, &ctrl->namespaces, list) {
timeout = blk_mq_freeze_queue_wait_timeout(ns->queue, timeout);
if (timeout <= 0)
break;
}
mutex_unlock(&ctrl->namespaces_mutex);
}
EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout);
void nvme_wait_freeze(struct nvme_ctrl *ctrl)
{
struct nvme_ns *ns;
mutex_lock(&ctrl->namespaces_mutex);
list_for_each_entry(ns, &ctrl->namespaces, list)
blk_mq_freeze_queue_wait(ns->queue);
mutex_unlock(&ctrl->namespaces_mutex);
}
EXPORT_SYMBOL_GPL(nvme_wait_freeze);
void nvme_start_freeze(struct nvme_ctrl *ctrl)
{
struct nvme_ns *ns;
mutex_lock(&ctrl->namespaces_mutex);
list_for_each_entry(ns, &ctrl->namespaces, list)
blk_freeze_queue_start(ns->queue);
mutex_unlock(&ctrl->namespaces_mutex);
}
EXPORT_SYMBOL_GPL(nvme_start_freeze);
void nvme_stop_queues(struct nvme_ctrl *ctrl)
{
struct nvme_ns *ns;
mutex_lock(&ctrl->namespaces_mutex);
list_for_each_entry(ns, &ctrl->namespaces, list)
blk_mq_quiesce_queue(ns->queue);
mutex_unlock(&ctrl->namespaces_mutex);
}
EXPORT_SYMBOL_GPL(nvme_stop_queues);
void nvme_start_queues(struct nvme_ctrl *ctrl)
{
struct nvme_ns *ns;
mutex_lock(&ctrl->namespaces_mutex);
list_for_each_entry(ns, &ctrl->namespaces, list)
blk_mq_unquiesce_queue(ns->queue);
mutex_unlock(&ctrl->namespaces_mutex);
}
EXPORT_SYMBOL_GPL(nvme_start_queues);
int nvme_reinit_tagset(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set)
{
if (!ctrl->ops->reinit_request)
return 0;
return blk_mq_tagset_iter(set, set->driver_data,
ctrl->ops->reinit_request);
}
EXPORT_SYMBOL_GPL(nvme_reinit_tagset);
int __init nvme_core_init(void)
{
int result;
nvme_wq = alloc_workqueue("nvme-wq",
WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
if (!nvme_wq)
return -ENOMEM;
result = __register_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme",
&nvme_dev_fops);
if (result < 0)
goto destroy_wq;
else if (result > 0)
nvme_char_major = result;
nvme_class = class_create(THIS_MODULE, "nvme");
if (IS_ERR(nvme_class)) {
result = PTR_ERR(nvme_class);
goto unregister_chrdev;
}
return 0;
unregister_chrdev:
__unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme");
destroy_wq:
destroy_workqueue(nvme_wq);
return result;
}
void nvme_core_exit(void)
{
class_destroy(nvme_class);
__unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme");
destroy_workqueue(nvme_wq);
}
MODULE_LICENSE("GPL");
MODULE_VERSION("1.0");
module_init(nvme_core_init);
module_exit(nvme_core_exit);