for-6.2/block-2022-12-08
-----BEGIN PGP SIGNATURE-----

iQJEBAABCAAuFiEEwPw5LcreJtl1+l5K99NY+ylx4KYFAmOScsgQHGF4Ym9lQGtl
cm5lbC5kawAKCRD301j7KXHgpi5ID/9pLXFYOq1+uDjU0KO/MdjMjK8Ukr34lCnk
WkajRLheE8JBKOFDE54XJk56sQSZHX9bTWqziar0h1fioh7FlQR/tVvzsERCm2M9
2y9THJNJygC68wgybStyiKlshFjl7TD7Kv5N9Y3xP3mkQygT+D6o8fXZk5xQbYyH
YdFSoq4rJVHxRL03yzQiReGGIYdOUEQQh8l1FiLwLlKa3lXAey1KuxWIzksVN0KK
aZB4QhiBpOiPgDHUVisq2XtyQjpZ2byoCImPzgrcqk9Jo4esvm/e6esrg4xlsvII
LKFFkTmbVqjUZtFjqakFHmfuzVor4nU5f+xb90ZHExuuODYckkxWp5rWhf9QwqqI
0ik6WYgI1/5vnHnX8f2DYzOFQf9qa/rLgg0CshyUODlD6RfHa9vntqYvlIFkmOBd
Q7KblIoK8YTzUS1M+v7X8JQ7gDR2KwygH37Da2KJS+vgvfIb8kJGr1ZORuhJuJJ7
Bl69gaNkHTHrqufp7UI64YXfueeuNu2J9z3zwzGoxeaFaofF/phDn0/2gCQE1fQI
XBhsMw+ETqI6B2SPHMnzYDu2DM1S8ZTOYQlaD4G3uqgWnAM1tG707395uAy5yu4n
D5azU1fVG4UocoNIyPujpaoSRs2zWZycEFEeUQkhyDDww/j4hlHi6H33eOnk0zsr
wxzFGfvHfw==
=k/vv
-----END PGP SIGNATURE-----

Merge tag 'for-6.2/block-2022-12-08' of git://git.kernel.dk/linux

Pull block updates from Jens Axboe:

 - NVMe pull requests via Christoph:
      - Support some passthrough commands without CAP_SYS_ADMIN (Kanchan Joshi)
      - Refactor PCIe probing and reset (Christoph Hellwig)
      - Various fabrics authentication fixes and improvements (Sagi Grimberg)
      - Avoid fallback to sequential scan due to transient issues (Uday Shankar)
      - Implement support for the DEAC bit in Write Zeroes (Christoph Hellwig)
      - Allow overriding the IEEE OUI and firmware revision in configfs for nvmet (Aleksandr Miloserdov)
      - Force reconnect when number of queue changes in nvmet (Daniel Wagner)
      - Minor fixes and improvements (Uros Bizjak, Joel Granados, Sagi Grimberg, Christoph Hellwig, Christophe JAILLET)
      - Fix and cleanup nvme-fc req allocation (Chaitanya Kulkarni)
      - Use the common tagset helpers in nvme-pci driver (Christoph Hellwig)
      - Cleanup the nvme-pci removal path (Christoph Hellwig)
      - Use kstrtobool() instead of strtobool (Christophe JAILLET)
      - Allow unprivileged passthrough of Identify Controller (Joel Granados)
      - Support io stats on the mpath device (Sagi Grimberg)
      - Minor nvmet cleanup (Sagi Grimberg)

 - MD pull requests via Song:
      - Code cleanups (Christoph)
      - Various fixes

 - Floppy pull request from Denis:
      - Fix a memory leak in the init error path (Yuan)

 - Series fixing some batch wakeup issues with sbitmap (Gabriel)

 - Removal of the pktcdvd driver that was deprecated more than 5 years
   ago, and subsequent removal of the devnode callback in struct
   block_device_operations as no users are now left (Greg)

 - Fix for partition read on an exclusively opened bdev (Jan)

 - Series of elevator API cleanups (Jinlong, Christoph)

 - Series of fixes and cleanups for blk-iocost (Kemeng)

 - Series of fixes and cleanups for blk-throttle (Kemeng)

 - Series adding concurrent support for sync queues in BFQ (Yu)

 - Series bringing drbd a bit closer to the out-of-tree maintained version (Christian, Joel, Lars, Philipp)

 - Misc drbd fixes (Wang)

 - blk-wbt fixes and tweaks for enable/disable (Yu)

 - Fixes for mq-deadline for zoned devices (Damien)

 - Add support for read-only and offline zones for null_blk (Shin'ichiro)

 - Series fixing the delayed holder tracking, as used by DM (Yu, Christoph)

 - Series enabling bio alloc caching for IRQ based IO (Pavel)

 - Series enabling userspace peer-to-peer DMA (Logan)

 - BFQ waker fixes (Khazhismel)

 - Series fixing elevator refcount issues (Christoph, Jinlong)

 - Series cleaning up references around queue destruction (Christoph)

 - Series doing quiesce by tagset, enabling cleanups in drivers (Christoph, Chao)

 - Series untangling the queue kobject and queue references (Christoph)

 - Misc fixes and cleanups (Bart, David, Dawei, Jinlong, Kemeng, Ye, Yang, Waiman, Shin'ichiro, Randy, Pankaj, Christoph)

* tag 'for-6.2/block-2022-12-08' of git://git.kernel.dk/linux: (247 commits)
  blktrace: Fix output non-blktrace event when blk_classic option enabled
  block: sed-opal: Don't include <linux/kernel.h>
  sed-opal: allow using IOC_OPAL_SAVE for locking too
  blk-cgroup: Fix typo in comment
  block: remove bio_set_op_attrs
  nvmet: don't open-code NVME_NS_ATTR_RO enumeration
  nvme-pci: use the tagset alloc/free helpers
  nvme: add the Apple shared tag workaround to nvme_alloc_io_tag_set
  nvme: only set reserved_tags in nvme_alloc_io_tag_set for fabrics controllers
  nvme: consolidate setting the tagset flags
  nvme: pass nr_maps explicitly to nvme_alloc_io_tag_set
  block: bio_copy_data_iter
  nvme-pci: split out a nvme_pci_ctrl_is_dead helper
  nvme-pci: return early on ctrl state mismatch in nvme_reset_work
  nvme-pci: rename nvme_disable_io_queues
  nvme-pci: cleanup nvme_suspend_queue
  nvme-pci: remove nvme_pci_disable
  nvme-pci: remove nvme_disable_admin_queue
  nvme: merge nvme_shutdown_ctrl into nvme_disable_ctrl
  nvme: use nvme_wait_ready in nvme_shutdown_ctrl
  ...
commit ce8a79d560
@@ -1,18 +0,0 @@
-What:		/sys/kernel/debug/pktcdvd/pktcdvd[0-7]
-Date:		Oct. 2006
-KernelVersion:	2.6.20
-Contact:	Thomas Maier <balagi@justmail.de>
-Description:
-
-The pktcdvd module (packet writing driver) creates
-these files in debugfs:
-
-/sys/kernel/debug/pktcdvd/pktcdvd[0-7]/
-
-	==== ====== ====================================
-	info 0444   Lots of driver statistics and infos.
-	==== ====== ====================================
-
-Example::
-
-    cat /sys/kernel/debug/pktcdvd/pktcdvd0/info
@@ -407,6 +407,16 @@ Description:
 		file contains a '1' if the memory has been published for
 		use outside the driver that owns the device.
 
+What:		/sys/bus/pci/devices/.../p2pmem/allocate
+Date:		August 2022
+Contact:	Logan Gunthorpe <logang@deltatee.com>
+Description:
+		This file allows mapping p2pmem into userspace. For each
+		mmap() call on this file, the kernel will allocate a chunk
+		of Peer-to-Peer memory for use in Peer-to-Peer transactions.
+		This memory can be used in O_DIRECT calls to NVMe backed
+		files for Peer-to-Peer copies.
+
 What:		/sys/bus/pci/devices/.../link/clkpm
 		/sys/bus/pci/devices/.../link/l0s_aspm
 		/sys/bus/pci/devices/.../link/l1_aspm
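The new p2pmem/allocate entry above is easiest to see from userspace. Below is a minimal sketch of that flow only: mmap() the allocate attribute, then feed the returned buffer to O_DIRECT reads/writes on NVMe-backed files. The PCI device address, file paths and chunk size are made-up placeholders, and error handling is reduced to bare checks::

    #define _GNU_SOURCE		/* for O_DIRECT on glibc */
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/mman.h>
    #include <unistd.h>

    /* Hypothetical p2pmem provider -- adjust to a real device. */
    #define P2PMEM_ALLOC "/sys/bus/pci/devices/0000:03:00.0/p2pmem/allocate"

    int main(void)
    {
        const size_t len = 2 * 1024 * 1024;	/* one 2 MiB chunk */
        int p2p_fd, src_fd, dst_fd;
        void *buf;

        p2p_fd = open(P2PMEM_ALLOC, O_RDWR);
        if (p2p_fd < 0) {
            perror("open allocate");
            return 1;
        }

        /* Each mmap() of the allocate file hands back a chunk of p2pmem. */
        buf = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, p2p_fd, 0);
        if (buf == MAP_FAILED) {
            perror("mmap p2pmem");
            return 1;
        }

        /* O_DIRECT copy between two NVMe-backed files via the p2p buffer. */
        src_fd = open("/mnt/nvme0/input.bin", O_RDONLY | O_DIRECT);
        dst_fd = open("/mnt/nvme1/output.bin", O_WRONLY | O_CREAT | O_DIRECT, 0644);
        if (src_fd < 0 || dst_fd < 0) {
            perror("open data files");
            return 1;
        }

        if (read(src_fd, buf, len) != (ssize_t)len ||
            write(dst_fd, buf, len) != (ssize_t)len)
            perror("copy");

        close(src_fd);
        close(dst_fd);
        munmap(buf, len);
        close(p2p_fd);
        return 0;
    }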
@@ -1,97 +0,0 @@
-sysfs interface
----------------
-The pktcdvd module (packet writing driver) creates the following files in the
-sysfs: (<devid> is in the format major:minor)
-
-What:		/sys/class/pktcdvd/add
-What:		/sys/class/pktcdvd/remove
-What:		/sys/class/pktcdvd/device_map
-Date:		Oct. 2006
-KernelVersion:	2.6.20
-Contact:	Thomas Maier <balagi@justmail.de>
-Description:
-
-		========== ==============================================
-		add        (WO) Write a block device id (major:minor) to
-			   create a new pktcdvd device and map it to the
-			   block device.
-
-		remove     (WO) Write the pktcdvd device id (major:minor)
-			   to remove the pktcdvd device.
-
-		device_map (RO) Shows the device mapping in format:
-			   pktcdvd[0-7] <pktdevid> <blkdevid>
-		========== ==============================================
-
-
-What:		/sys/class/pktcdvd/pktcdvd[0-7]/dev
-What:		/sys/class/pktcdvd/pktcdvd[0-7]/uevent
-Date:		Oct. 2006
-KernelVersion:	2.6.20
-Contact:	Thomas Maier <balagi@justmail.de>
-Description:
-		dev:	(RO) Device id
-
-		uevent:	(WO) To send a uevent
-
-
-What:		/sys/class/pktcdvd/pktcdvd[0-7]/stat/packets_started
-What:		/sys/class/pktcdvd/pktcdvd[0-7]/stat/packets_finished
-What:		/sys/class/pktcdvd/pktcdvd[0-7]/stat/kb_written
-What:		/sys/class/pktcdvd/pktcdvd[0-7]/stat/kb_read
-What:		/sys/class/pktcdvd/pktcdvd[0-7]/stat/kb_read_gather
-What:		/sys/class/pktcdvd/pktcdvd[0-7]/stat/reset
-Date:		Oct. 2006
-KernelVersion:	2.6.20
-Contact:	Thomas Maier <balagi@justmail.de>
-Description:
-		packets_started:	(RO) Number of started packets.
-
-		packets_finished:	(RO) Number of finished packets.
-
-		kb_written:		(RO) kBytes written.
-
-		kb_read:		(RO) kBytes read.
-
-		kb_read_gather:		(RO) kBytes read to fill write packets.
-
-		reset:			(WO) Write any value to it to reset
-					pktcdvd device statistic values, like
-					bytes read/written.
-
-
-What:		/sys/class/pktcdvd/pktcdvd[0-7]/write_queue/size
-What:		/sys/class/pktcdvd/pktcdvd[0-7]/write_queue/congestion_off
-What:		/sys/class/pktcdvd/pktcdvd[0-7]/write_queue/congestion_on
-Date:		Oct. 2006
-KernelVersion:	2.6.20
-Contact:	Thomas Maier <balagi@justmail.de>
-Description:
-		============== ================================================
-		size           (RO) Contains the size of the bio write queue.
-
-		congestion_off (RW) If bio write queue size is below this mark,
-			       accept new bio requests from the block layer.
-
-		congestion_on  (RW) If bio write queue size is higher as this
-			       mark, do no longer accept bio write requests
-			       from the block layer and wait till the pktcdvd
-			       device has processed enough bio's so that bio
-			       write queue size is below congestion off mark.
-			       A value of <= 0 disables congestion control.
-		============== ================================================
-
-
-Example:
---------
-To use the pktcdvd sysfs interface directly, you can do::
-
-    # create a new pktcdvd device mapped to /dev/hdc
-    echo "22:0" >/sys/class/pktcdvd/add
-    cat /sys/class/pktcdvd/device_map
-    # assuming device pktcdvd0 was created, look at stat's
-    cat /sys/class/pktcdvd/pktcdvd0/stat/kb_written
-    # print the device id of the mapped block device
-    fgrep pktcdvd0 /sys/class/pktcdvd/device_map
-    # remove device, using pktcdvd0 device id 253:0
-    echo "253:0" >/sys/class/pktcdvd/remove
@@ -142,7 +142,7 @@ Therefore, we also introduce *blk-crypto-fallback*, which is an implementation
 of inline encryption using the kernel crypto API. blk-crypto-fallback is built
 into the block layer, so it works on any block device without any special setup.
 Essentially, when a bio with an encryption context is submitted to a
-request_queue that doesn't support that encryption context, the block layer will
+block_device that doesn't support that encryption context, the block layer will
 handle en/decryption of the bio using blk-crypto-fallback.
 
 For encryption, the data cannot be encrypted in-place, as callers usually rely
@@ -187,7 +187,7 @@ API presented to users of the block layer
 
 ``blk_crypto_config_supported()`` allows users to check ahead of time whether
 inline encryption with particular crypto settings will work on a particular
-request_queue -- either via hardware or via blk-crypto-fallback. This function
+block_device -- either via hardware or via blk-crypto-fallback. This function
 takes in a ``struct blk_crypto_config`` which is like blk_crypto_key, but omits
 the actual bytes of the key and instead just contains the algorithm, data unit
 size, etc. This function can be useful if blk-crypto-fallback is disabled.
@@ -195,7 +195,7 @@ size, etc. This function can be useful if blk-crypto-fallback is disabled.
 ``blk_crypto_init_key()`` allows users to initialize a blk_crypto_key.
 
 Users must call ``blk_crypto_start_using_key()`` before actually starting to use
-a blk_crypto_key on a request_queue (even if ``blk_crypto_config_supported()``
+a blk_crypto_key on a block_device (even if ``blk_crypto_config_supported()``
 was called earlier). This is needed to initialize blk-crypto-fallback if it
 will be needed. This must not be called from the data path, as this may have to
 allocate resources, which may deadlock in that case.
@@ -207,7 +207,7 @@ for en/decryption. Users don't need to worry about freeing the bio_crypt_ctx
 later, as that happens automatically when the bio is freed or reset.
 
 Finally, when done using inline encryption with a blk_crypto_key on a
-request_queue, users must call ``blk_crypto_evict_key()``. This ensures that
+block_device, users must call ``blk_crypto_evict_key()``. This ensures that
 the key is evicted from all keyslots it may be programmed into and unlinked from
 any kernel data structures it may be linked into.
 
@@ -221,9 +221,9 @@ as follows:
 5. ``blk_crypto_evict_key()`` (after all I/O has completed)
 6. Zeroize the blk_crypto_key (this has no dedicated function)
 
-If a blk_crypto_key is being used on multiple request_queues, then
+If a blk_crypto_key is being used on multiple block_devices, then
 ``blk_crypto_config_supported()`` (if used), ``blk_crypto_start_using_key()``,
-and ``blk_crypto_evict_key()`` must be called on each request_queue.
+and ``blk_crypto_evict_key()`` must be called on each block_device.
 
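The key lifecycle spelled out above maps onto a short call sequence. The sketch below is illustrative only and assumes the 6.2-era prototypes (block_device-first argument order, the AES-256-XTS mode constant, a zeroed DUN and arbitrary DUN/data-unit sizes); treat every signature and constant here as an assumption to check against include/linux/blk-crypto.h, not as the authoritative API::

    #include <linux/bio.h>
    #include <linux/blk-crypto.h>
    #include <linux/blkdev.h>

    /* Ordering sketch: init -> start_using -> per-bio context -> evict. */
    static int example_encrypted_write(struct block_device *bdev,
                                       struct blk_crypto_key *key,
                                       const u8 *raw_key, struct bio *bio)
    {
        u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE] = { 0 };
        int err;

        /* 1-2) Describe the key: algorithm, DUN size, data unit size (assumed). */
        err = blk_crypto_init_key(key, raw_key, BLK_ENCRYPTION_MODE_AES_256_XTS,
                                  8 /* dun_bytes */, 4096 /* data_unit_size */);
        if (err)
            return err;

        /* 3) Not from the data path: may set up blk-crypto-fallback state. */
        err = blk_crypto_start_using_key(bdev, key);
        if (err)
            return err;

        /* 4) Attach the encryption context to the bio and submit it. */
        bio_crypt_set_ctx(bio, key, dun, GFP_NOIO);
        submit_bio(bio);

        /* ... wait for all I/O that uses the key to complete ... */

        /*
         * 5) Evict the key from every keyslot on this block_device; the
         *    caller then zeroizes the raw key material (step 6).
         */
        return blk_crypto_evict_key(bdev, key);
    }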
 API presented to device drivers
 ===============================
 
@@ -16430,13 +16430,6 @@ S:	Supported
 F:	Documentation/devicetree/bindings/input/pine64,pinephone-keyboard.yaml
 F:	drivers/input/keyboard/pinephone-keyboard.c
 
-PKTCDVD DRIVER
-M:	linux-block@vger.kernel.org
-S:	Orphan
-F:	drivers/block/pktcdvd.c
-F:	include/linux/pktcdvd.h
-F:	include/uapi/linux/pktcdvd.h
-
 PLANTOWER PMS7003 AIR POLLUTION SENSOR DRIVER
 M:	Tomasz Duszynski <tduszyns@gmail.com>
 S:	Maintained
@@ -224,7 +224,7 @@ int fsync_bdev(struct block_device *bdev)
 EXPORT_SYMBOL(fsync_bdev);
 
 /**
- * freeze_bdev -- lock a filesystem and force it into a consistent state
+ * freeze_bdev - lock a filesystem and force it into a consistent state
  * @bdev:	blockdevice to lock
  *
  * If a superblock is found on this device, we take the s_umount semaphore
@@ -268,7 +268,7 @@ done:
 EXPORT_SYMBOL(freeze_bdev);
 
 /**
- * thaw_bdev -- unlock filesystem
+ * thaw_bdev - unlock filesystem
  * @bdev:	blockdevice to unlock
  *
  * Unlocks the filesystem and marks it writeable again after freeze_bdev().
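The freeze_bdev()/thaw_bdev() pair documented in the kernel-doc above is always used bracket-style around work that needs a quiesced, consistent filesystem. A minimal sketch, assuming the int-returning, bdev-only prototypes and a hypothetical snapshot callback::

    #include <linux/blkdev.h>

    static int snapshot_with_freeze(struct block_device *bdev,
                                    int (*take_snapshot)(struct block_device *))
    {
        int ret;

        ret = freeze_bdev(bdev);	/* sync and block new writes */
        if (ret)
            return ret;

        ret = take_snapshot(bdev);	/* filesystem is consistent here */

        thaw_bdev(bdev);		/* allow writes again */
        return ret;
    }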
@ -224,7 +224,7 @@ void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq,
|
||||
{
|
||||
blkg_rwstat_add(&bfqg->stats.queued, opf, 1);
|
||||
bfqg_stats_end_empty_time(&bfqg->stats);
|
||||
if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue))
|
||||
if (!(bfqq == bfqg->bfqd->in_service_queue))
|
||||
bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq));
|
||||
}
|
||||
|
||||
@ -552,6 +552,7 @@ static void bfq_pd_init(struct blkg_policy_data *pd)
|
||||
*/
|
||||
bfqg->bfqd = bfqd;
|
||||
bfqg->active_entities = 0;
|
||||
bfqg->num_queues_with_pending_reqs = 0;
|
||||
bfqg->online = true;
|
||||
bfqg->rq_pos_tree = RB_ROOT;
|
||||
}
|
||||
@ -645,6 +646,7 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
|
||||
{
|
||||
struct bfq_entity *entity = &bfqq->entity;
|
||||
struct bfq_group *old_parent = bfqq_group(bfqq);
|
||||
bool has_pending_reqs = false;
|
||||
|
||||
/*
|
||||
* No point to move bfqq to the same group, which can happen when
|
||||
@ -665,6 +667,11 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
|
||||
*/
|
||||
bfqq->ref++;
|
||||
|
||||
if (entity->in_groups_with_pending_reqs) {
|
||||
has_pending_reqs = true;
|
||||
bfq_del_bfqq_in_groups_with_pending_reqs(bfqq);
|
||||
}
|
||||
|
||||
/* If bfqq is empty, then bfq_bfqq_expire also invokes
|
||||
* bfq_del_bfqq_busy, thereby removing bfqq and its entity
|
||||
* from data structures related to current group. Otherwise we
|
||||
@ -692,6 +699,9 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
|
||||
/* pin down bfqg and its associated blkg */
|
||||
bfqg_and_blkg_get(bfqg);
|
||||
|
||||
if (has_pending_reqs)
|
||||
bfq_add_bfqq_in_groups_with_pending_reqs(bfqq);
|
||||
|
||||
if (bfq_bfqq_busy(bfqq)) {
|
||||
if (unlikely(!bfqd->nonrot_with_queueing))
|
||||
bfq_pos_tree_add_move(bfqd, bfqq);
|
||||
|
@ -820,7 +820,7 @@ bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
|
||||
* much easier to maintain the needed state:
|
||||
* 1) all active queues have the same weight,
|
||||
* 2) all active queues belong to the same I/O-priority class,
|
||||
* 3) there are no active groups.
|
||||
* 3) there is at most one active group.
|
||||
* In particular, the last condition is always true if hierarchical
|
||||
* support or the cgroups interface are not enabled, thus no state
|
||||
* needs to be maintained in this case.
|
||||
@ -852,7 +852,7 @@ static bool bfq_asymmetric_scenario(struct bfq_data *bfqd,
|
||||
|
||||
return varied_queue_weights || multiple_classes_busy
|
||||
#ifdef CONFIG_BFQ_GROUP_IOSCHED
|
||||
|| bfqd->num_groups_with_pending_reqs > 0
|
||||
|| bfqd->num_groups_with_pending_reqs > 1
|
||||
#endif
|
||||
;
|
||||
}
|
||||
@ -870,9 +870,9 @@ static bool bfq_asymmetric_scenario(struct bfq_data *bfqd,
|
||||
* In most scenarios, the rate at which nodes are created/destroyed
|
||||
* should be low too.
|
||||
*/
|
||||
void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq,
|
||||
struct rb_root_cached *root)
|
||||
void bfq_weights_tree_add(struct bfq_queue *bfqq)
|
||||
{
|
||||
struct rb_root_cached *root = &bfqq->bfqd->queue_weights_tree;
|
||||
struct bfq_entity *entity = &bfqq->entity;
|
||||
struct rb_node **new = &(root->rb_root.rb_node), *parent = NULL;
|
||||
bool leftmost = true;
|
||||
@ -944,13 +944,14 @@ inc_counter:
|
||||
* See the comments to the function bfq_weights_tree_add() for considerations
|
||||
* about overhead.
|
||||
*/
|
||||
void __bfq_weights_tree_remove(struct bfq_data *bfqd,
|
||||
struct bfq_queue *bfqq,
|
||||
struct rb_root_cached *root)
|
||||
void bfq_weights_tree_remove(struct bfq_queue *bfqq)
|
||||
{
|
||||
struct rb_root_cached *root;
|
||||
|
||||
if (!bfqq->weight_counter)
|
||||
return;
|
||||
|
||||
root = &bfqq->bfqd->queue_weights_tree;
|
||||
bfqq->weight_counter->num_active--;
|
||||
if (bfqq->weight_counter->num_active > 0)
|
||||
goto reset_entity_pointer;
|
||||
@ -963,59 +964,6 @@ reset_entity_pointer:
|
||||
bfq_put_queue(bfqq);
|
||||
}
|
||||
|
||||
/*
|
||||
* Invoke __bfq_weights_tree_remove on bfqq and decrement the number
|
||||
* of active groups for each queue's inactive parent entity.
|
||||
*/
|
||||
void bfq_weights_tree_remove(struct bfq_data *bfqd,
|
||||
struct bfq_queue *bfqq)
|
||||
{
|
||||
struct bfq_entity *entity = bfqq->entity.parent;
|
||||
|
||||
for_each_entity(entity) {
|
||||
struct bfq_sched_data *sd = entity->my_sched_data;
|
||||
|
||||
if (sd->next_in_service || sd->in_service_entity) {
|
||||
/*
|
||||
* entity is still active, because either
|
||||
* next_in_service or in_service_entity is not
|
||||
* NULL (see the comments on the definition of
|
||||
* next_in_service for details on why
|
||||
* in_service_entity must be checked too).
|
||||
*
|
||||
* As a consequence, its parent entities are
|
||||
* active as well, and thus this loop must
|
||||
* stop here.
|
||||
*/
|
||||
break;
|
||||
}
|
||||
|
||||
/*
|
||||
* The decrement of num_groups_with_pending_reqs is
|
||||
* not performed immediately upon the deactivation of
|
||||
* entity, but it is delayed to when it also happens
|
||||
* that the first leaf descendant bfqq of entity gets
|
||||
* all its pending requests completed. The following
|
||||
* instructions perform this delayed decrement, if
|
||||
* needed. See the comments on
|
||||
* num_groups_with_pending_reqs for details.
|
||||
*/
|
||||
if (entity->in_groups_with_pending_reqs) {
|
||||
entity->in_groups_with_pending_reqs = false;
|
||||
bfqd->num_groups_with_pending_reqs--;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Next function is invoked last, because it causes bfqq to be
|
||||
* freed if the following holds: bfqq is not in service and
|
||||
* has no dispatched request. DO NOT use bfqq after the next
|
||||
* function invocation.
|
||||
*/
|
||||
__bfq_weights_tree_remove(bfqd, bfqq,
|
||||
&bfqd->queue_weights_tree);
|
||||
}
|
||||
|
||||
/*
|
||||
* Return expired entry, or NULL to just start from scratch in rbtree.
|
||||
*/
|
||||
@ -2135,7 +2083,9 @@ static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq,
|
||||
if (!bfqd->last_completed_rq_bfqq ||
|
||||
bfqd->last_completed_rq_bfqq == bfqq ||
|
||||
bfq_bfqq_has_short_ttime(bfqq) ||
|
||||
now_ns - bfqd->last_completion >= 4 * NSEC_PER_MSEC)
|
||||
now_ns - bfqd->last_completion >= 4 * NSEC_PER_MSEC ||
|
||||
bfqd->last_completed_rq_bfqq == &bfqd->oom_bfqq ||
|
||||
bfqq == &bfqd->oom_bfqq)
|
||||
return;
|
||||
|
||||
/*
|
||||
@ -2373,22 +2323,6 @@ static sector_t get_sdist(sector_t last_pos, struct request *rq)
|
||||
return 0;
|
||||
}
|
||||
|
||||
#if 0 /* Still not clear if we can do without next two functions */
|
||||
static void bfq_activate_request(struct request_queue *q, struct request *rq)
|
||||
{
|
||||
struct bfq_data *bfqd = q->elevator->elevator_data;
|
||||
|
||||
bfqd->rq_in_driver++;
|
||||
}
|
||||
|
||||
static void bfq_deactivate_request(struct request_queue *q, struct request *rq)
|
||||
{
|
||||
struct bfq_data *bfqd = q->elevator->elevator_data;
|
||||
|
||||
bfqd->rq_in_driver--;
|
||||
}
|
||||
#endif
|
||||
|
||||
static void bfq_remove_request(struct request_queue *q,
|
||||
struct request *rq)
|
||||
{
|
||||
@ -6261,7 +6195,8 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
|
||||
*/
|
||||
bfqq->budget_timeout = jiffies;
|
||||
|
||||
bfq_weights_tree_remove(bfqd, bfqq);
|
||||
bfq_del_bfqq_in_groups_with_pending_reqs(bfqq);
|
||||
bfq_weights_tree_remove(bfqq);
|
||||
}
|
||||
|
||||
now_ns = ktime_get_ns();
|
||||
@ -6784,6 +6719,12 @@ static struct bfq_queue *bfq_init_rq(struct request *rq)
|
||||
bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio,
|
||||
true, is_sync,
|
||||
NULL);
|
||||
if (unlikely(bfqq == &bfqd->oom_bfqq))
|
||||
bfqq_already_existing = true;
|
||||
} else
|
||||
bfqq_already_existing = true;
|
||||
|
||||
if (!bfqq_already_existing) {
|
||||
bfqq->waker_bfqq = old_bfqq->waker_bfqq;
|
||||
bfqq->tentative_waker_bfqq = NULL;
|
||||
|
||||
@ -6797,8 +6738,7 @@ static struct bfq_queue *bfq_init_rq(struct request *rq)
|
||||
if (bfqq->waker_bfqq)
|
||||
hlist_add_head(&bfqq->woken_list_node,
|
||||
&bfqq->waker_bfqq->woken_list);
|
||||
} else
|
||||
bfqq_already_existing = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -7045,6 +6985,7 @@ static void bfq_exit_queue(struct elevator_queue *e)
|
||||
#endif
|
||||
|
||||
blk_stat_disable_accounting(bfqd->queue);
|
||||
clear_bit(ELEVATOR_FLAG_DISABLE_WBT, &e->flags);
|
||||
wbt_enable_default(bfqd->queue);
|
||||
|
||||
kfree(bfqd);
|
||||
@ -7190,6 +7131,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
|
||||
/* We dispatch from request queue wide instead of hw queue */
|
||||
blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q);
|
||||
|
||||
set_bit(ELEVATOR_FLAG_DISABLE_WBT, &eq->flags);
|
||||
wbt_disable_default(q);
|
||||
blk_stat_enable_accounting(q);
|
||||
|
||||
|
@ -492,27 +492,27 @@ struct bfq_data {
|
||||
struct rb_root_cached queue_weights_tree;
|
||||
|
||||
/*
|
||||
* Number of groups with at least one descendant process that
|
||||
* Number of groups with at least one process that
|
||||
* has at least one request waiting for completion. Note that
|
||||
* this accounts for also requests already dispatched, but not
|
||||
* yet completed. Therefore this number of groups may differ
|
||||
* (be larger) than the number of active groups, as a group is
|
||||
* considered active only if its corresponding entity has
|
||||
* descendant queues with at least one request queued. This
|
||||
* queues with at least one request queued. This
|
||||
* number is used to decide whether a scenario is symmetric.
|
||||
* For a detailed explanation see comments on the computation
|
||||
* of the variable asymmetric_scenario in the function
|
||||
* bfq_better_to_idle().
|
||||
*
|
||||
* However, it is hard to compute this number exactly, for
|
||||
* groups with multiple descendant processes. Consider a group
|
||||
* that is inactive, i.e., that has no descendant process with
|
||||
* groups with multiple processes. Consider a group
|
||||
* that is inactive, i.e., that has no process with
|
||||
* pending I/O inside BFQ queues. Then suppose that
|
||||
* num_groups_with_pending_reqs is still accounting for this
|
||||
* group, because the group has descendant processes with some
|
||||
* group, because the group has processes with some
|
||||
* I/O request still in flight. num_groups_with_pending_reqs
|
||||
* should be decremented when the in-flight request of the
|
||||
* last descendant process is finally completed (assuming that
|
||||
* last process is finally completed (assuming that
|
||||
* nothing else has changed for the group in the meantime, in
|
||||
* terms of composition of the group and active/inactive state of child
|
||||
* groups and processes). To accomplish this, an additional
|
||||
@ -521,7 +521,7 @@ struct bfq_data {
|
||||
* we resort to the following tradeoff between simplicity and
|
||||
* accuracy: for an inactive group that is still counted in
|
||||
* num_groups_with_pending_reqs, we decrement
|
||||
* num_groups_with_pending_reqs when the first descendant
|
||||
* num_groups_with_pending_reqs when the first
|
||||
* process of the group remains with no request waiting for
|
||||
* completion.
|
||||
*
|
||||
@ -529,12 +529,12 @@ struct bfq_data {
|
||||
* carefulness: to avoid multiple decrements, we flag a group,
|
||||
* more precisely an entity representing a group, as still
|
||||
* counted in num_groups_with_pending_reqs when it becomes
|
||||
* inactive. Then, when the first descendant queue of the
|
||||
* inactive. Then, when the first queue of the
|
||||
* entity remains with no request waiting for completion,
|
||||
* num_groups_with_pending_reqs is decremented, and this flag
|
||||
* is reset. After this flag is reset for the entity,
|
||||
* num_groups_with_pending_reqs won't be decremented any
|
||||
* longer in case a new descendant queue of the entity remains
|
||||
* longer in case a new queue of the entity remains
|
||||
* with no request waiting for completion.
|
||||
*/
|
||||
unsigned int num_groups_with_pending_reqs;
|
||||
@ -931,7 +931,7 @@ struct bfq_group {
|
||||
struct bfq_entity entity;
|
||||
struct bfq_sched_data sched_data;
|
||||
|
||||
void *bfqd;
|
||||
struct bfq_data *bfqd;
|
||||
|
||||
struct bfq_queue *async_bfqq[2][IOPRIO_NR_LEVELS];
|
||||
struct bfq_queue *async_idle_bfqq;
|
||||
@ -939,6 +939,7 @@ struct bfq_group {
|
||||
struct bfq_entity *my_entity;
|
||||
|
||||
int active_entities;
|
||||
int num_queues_with_pending_reqs;
|
||||
|
||||
struct rb_root rq_pos_tree;
|
||||
|
||||
@ -968,13 +969,8 @@ struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync);
|
||||
void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync);
|
||||
struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic);
|
||||
void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq);
|
||||
void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq,
|
||||
struct rb_root_cached *root);
|
||||
void __bfq_weights_tree_remove(struct bfq_data *bfqd,
|
||||
struct bfq_queue *bfqq,
|
||||
struct rb_root_cached *root);
|
||||
void bfq_weights_tree_remove(struct bfq_data *bfqd,
|
||||
struct bfq_queue *bfqq);
|
||||
void bfq_weights_tree_add(struct bfq_queue *bfqq);
|
||||
void bfq_weights_tree_remove(struct bfq_queue *bfqq);
|
||||
void bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq,
|
||||
bool compensate, enum bfqq_expiration reason);
|
||||
void bfq_put_queue(struct bfq_queue *bfqq);
|
||||
@ -1078,6 +1074,8 @@ void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
|
||||
bool expiration);
|
||||
void bfq_del_bfqq_busy(struct bfq_queue *bfqq, bool expiration);
|
||||
void bfq_add_bfqq_busy(struct bfq_queue *bfqq);
|
||||
void bfq_add_bfqq_in_groups_with_pending_reqs(struct bfq_queue *bfqq);
|
||||
void bfq_del_bfqq_in_groups_with_pending_reqs(struct bfq_queue *bfqq);
|
||||
|
||||
/* --------------- end of interface of B-WF2Q+ ---------------- */
|
||||
|
||||
|
block/bfq-wf2q.c: 157 changed lines
@ -218,6 +218,24 @@ static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
|
||||
return false;
|
||||
}
|
||||
|
||||
static void bfq_inc_active_entities(struct bfq_entity *entity)
|
||||
{
|
||||
struct bfq_sched_data *sd = entity->sched_data;
|
||||
struct bfq_group *bfqg = container_of(sd, struct bfq_group, sched_data);
|
||||
|
||||
if (bfqg != bfqg->bfqd->root_group)
|
||||
bfqg->active_entities++;
|
||||
}
|
||||
|
||||
static void bfq_dec_active_entities(struct bfq_entity *entity)
|
||||
{
|
||||
struct bfq_sched_data *sd = entity->sched_data;
|
||||
struct bfq_group *bfqg = container_of(sd, struct bfq_group, sched_data);
|
||||
|
||||
if (bfqg != bfqg->bfqd->root_group)
|
||||
bfqg->active_entities--;
|
||||
}
|
||||
|
||||
#else /* CONFIG_BFQ_GROUP_IOSCHED */
|
||||
|
||||
static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
|
||||
@ -230,6 +248,14 @@ static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
|
||||
return true;
|
||||
}
|
||||
|
||||
static void bfq_inc_active_entities(struct bfq_entity *entity)
|
||||
{
|
||||
}
|
||||
|
||||
static void bfq_dec_active_entities(struct bfq_entity *entity)
|
||||
{
|
||||
}
|
||||
|
||||
#endif /* CONFIG_BFQ_GROUP_IOSCHED */
|
||||
|
||||
/*
|
||||
@ -456,11 +482,6 @@ static void bfq_active_insert(struct bfq_service_tree *st,
|
||||
{
|
||||
struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
|
||||
struct rb_node *node = &entity->rb_node;
|
||||
#ifdef CONFIG_BFQ_GROUP_IOSCHED
|
||||
struct bfq_sched_data *sd = NULL;
|
||||
struct bfq_group *bfqg = NULL;
|
||||
struct bfq_data *bfqd = NULL;
|
||||
#endif
|
||||
|
||||
bfq_insert(&st->active, entity);
|
||||
|
||||
@ -471,17 +492,10 @@ static void bfq_active_insert(struct bfq_service_tree *st,
|
||||
|
||||
bfq_update_active_tree(node);
|
||||
|
||||
#ifdef CONFIG_BFQ_GROUP_IOSCHED
|
||||
sd = entity->sched_data;
|
||||
bfqg = container_of(sd, struct bfq_group, sched_data);
|
||||
bfqd = (struct bfq_data *)bfqg->bfqd;
|
||||
#endif
|
||||
if (bfqq)
|
||||
list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
|
||||
#ifdef CONFIG_BFQ_GROUP_IOSCHED
|
||||
if (bfqg != bfqd->root_group)
|
||||
bfqg->active_entities++;
|
||||
#endif
|
||||
|
||||
bfq_inc_active_entities(entity);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -558,29 +572,16 @@ static void bfq_active_extract(struct bfq_service_tree *st,
|
||||
{
|
||||
struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
|
||||
struct rb_node *node;
|
||||
#ifdef CONFIG_BFQ_GROUP_IOSCHED
|
||||
struct bfq_sched_data *sd = NULL;
|
||||
struct bfq_group *bfqg = NULL;
|
||||
struct bfq_data *bfqd = NULL;
|
||||
#endif
|
||||
|
||||
node = bfq_find_deepest(&entity->rb_node);
|
||||
bfq_extract(&st->active, entity);
|
||||
|
||||
if (node)
|
||||
bfq_update_active_tree(node);
|
||||
|
||||
#ifdef CONFIG_BFQ_GROUP_IOSCHED
|
||||
sd = entity->sched_data;
|
||||
bfqg = container_of(sd, struct bfq_group, sched_data);
|
||||
bfqd = (struct bfq_data *)bfqg->bfqd;
|
||||
#endif
|
||||
if (bfqq)
|
||||
list_del(&bfqq->bfqq_list);
|
||||
#ifdef CONFIG_BFQ_GROUP_IOSCHED
|
||||
if (bfqg != bfqd->root_group)
|
||||
bfqg->active_entities--;
|
||||
#endif
|
||||
|
||||
bfq_dec_active_entities(entity);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -706,22 +707,6 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
|
||||
if (entity->prio_changed) {
|
||||
struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
|
||||
unsigned int prev_weight, new_weight;
|
||||
struct bfq_data *bfqd = NULL;
|
||||
struct rb_root_cached *root;
|
||||
#ifdef CONFIG_BFQ_GROUP_IOSCHED
|
||||
struct bfq_sched_data *sd;
|
||||
struct bfq_group *bfqg;
|
||||
#endif
|
||||
|
||||
if (bfqq)
|
||||
bfqd = bfqq->bfqd;
|
||||
#ifdef CONFIG_BFQ_GROUP_IOSCHED
|
||||
else {
|
||||
sd = entity->my_sched_data;
|
||||
bfqg = container_of(sd, struct bfq_group, sched_data);
|
||||
bfqd = (struct bfq_data *)bfqg->bfqd;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Matches the smp_wmb() in bfq_group_set_weight. */
|
||||
smp_rmb();
|
||||
@ -770,19 +755,15 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
|
||||
* queue, remove the entity from its old weight counter (if
|
||||
* there is a counter associated with the entity).
|
||||
*/
|
||||
if (prev_weight != new_weight && bfqq) {
|
||||
root = &bfqd->queue_weights_tree;
|
||||
__bfq_weights_tree_remove(bfqd, bfqq, root);
|
||||
}
|
||||
if (prev_weight != new_weight && bfqq)
|
||||
bfq_weights_tree_remove(bfqq);
|
||||
entity->weight = new_weight;
|
||||
/*
|
||||
* Add the entity, if it is not a weight-raised queue,
|
||||
* to the counter associated with its new weight.
|
||||
*/
|
||||
if (prev_weight != new_weight && bfqq && bfqq->wr_coeff == 1) {
|
||||
/* If we get here, root has been initialized. */
|
||||
bfq_weights_tree_add(bfqd, bfqq, root);
|
||||
}
|
||||
if (prev_weight != new_weight && bfqq && bfqq->wr_coeff == 1)
|
||||
bfq_weights_tree_add(bfqq);
|
||||
|
||||
new_st->wsum += entity->weight;
|
||||
|
||||
@ -984,19 +965,6 @@ static void __bfq_activate_entity(struct bfq_entity *entity,
|
||||
entity->on_st_or_in_serv = true;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_BFQ_GROUP_IOSCHED
|
||||
if (!bfq_entity_to_bfqq(entity)) { /* bfq_group */
|
||||
struct bfq_group *bfqg =
|
||||
container_of(entity, struct bfq_group, entity);
|
||||
struct bfq_data *bfqd = bfqg->bfqd;
|
||||
|
||||
if (!entity->in_groups_with_pending_reqs) {
|
||||
entity->in_groups_with_pending_reqs = true;
|
||||
bfqd->num_groups_with_pending_reqs++;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
bfq_update_fin_time_enqueue(entity, st, backshifted);
|
||||
}
|
||||
|
||||
@ -1082,12 +1050,12 @@ static void __bfq_requeue_entity(struct bfq_entity *entity)
|
||||
}
|
||||
|
||||
static void __bfq_activate_requeue_entity(struct bfq_entity *entity,
|
||||
struct bfq_sched_data *sd,
|
||||
bool non_blocking_wait_rq)
|
||||
{
|
||||
struct bfq_service_tree *st = bfq_entity_service_tree(entity);
|
||||
|
||||
if (sd->in_service_entity == entity || entity->tree == &st->active)
|
||||
if (entity->sched_data->in_service_entity == entity ||
|
||||
entity->tree == &st->active)
|
||||
/*
|
||||
* in service or already queued on the active tree,
|
||||
* requeue or reposition
|
||||
@ -1119,14 +1087,10 @@ static void bfq_activate_requeue_entity(struct bfq_entity *entity,
|
||||
bool non_blocking_wait_rq,
|
||||
bool requeue, bool expiration)
|
||||
{
|
||||
struct bfq_sched_data *sd;
|
||||
|
||||
for_each_entity(entity) {
|
||||
sd = entity->sched_data;
|
||||
__bfq_activate_requeue_entity(entity, sd, non_blocking_wait_rq);
|
||||
|
||||
if (!bfq_update_next_in_service(sd, entity, expiration) &&
|
||||
!requeue)
|
||||
__bfq_activate_requeue_entity(entity, non_blocking_wait_rq);
|
||||
if (!bfq_update_next_in_service(entity->sched_data, entity,
|
||||
expiration) && !requeue)
|
||||
break;
|
||||
}
|
||||
}
|
||||
@ -1646,6 +1610,32 @@ void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
|
||||
bfqq == bfqd->in_service_queue, expiration);
|
||||
}
|
||||
|
||||
void bfq_add_bfqq_in_groups_with_pending_reqs(struct bfq_queue *bfqq)
|
||||
{
|
||||
struct bfq_entity *entity = &bfqq->entity;
|
||||
|
||||
if (!entity->in_groups_with_pending_reqs) {
|
||||
entity->in_groups_with_pending_reqs = true;
|
||||
#ifdef CONFIG_BFQ_GROUP_IOSCHED
|
||||
if (!(bfqq_group(bfqq)->num_queues_with_pending_reqs++))
|
||||
bfqq->bfqd->num_groups_with_pending_reqs++;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
void bfq_del_bfqq_in_groups_with_pending_reqs(struct bfq_queue *bfqq)
|
||||
{
|
||||
struct bfq_entity *entity = &bfqq->entity;
|
||||
|
||||
if (entity->in_groups_with_pending_reqs) {
|
||||
entity->in_groups_with_pending_reqs = false;
|
||||
#ifdef CONFIG_BFQ_GROUP_IOSCHED
|
||||
if (!(--bfqq_group(bfqq)->num_queues_with_pending_reqs))
|
||||
bfqq->bfqd->num_groups_with_pending_reqs--;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Called when the bfqq no longer has requests pending, remove it from
|
||||
* the service tree. As a special case, it can be invoked during an
|
||||
@ -1668,8 +1658,14 @@ void bfq_del_bfqq_busy(struct bfq_queue *bfqq, bool expiration)
|
||||
|
||||
bfq_deactivate_bfqq(bfqd, bfqq, true, expiration);
|
||||
|
||||
if (!bfqq->dispatched)
|
||||
bfq_weights_tree_remove(bfqd, bfqq);
|
||||
if (!bfqq->dispatched) {
|
||||
bfq_del_bfqq_in_groups_with_pending_reqs(bfqq);
|
||||
/*
|
||||
* Next function is invoked last, because it causes bfqq to be
|
||||
* freed. DO NOT use bfqq after the next function invocation.
|
||||
*/
|
||||
bfq_weights_tree_remove(bfqq);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
@ -1686,10 +1682,11 @@ void bfq_add_bfqq_busy(struct bfq_queue *bfqq)
|
||||
bfq_mark_bfqq_busy(bfqq);
|
||||
bfqd->busy_queues[bfqq->ioprio_class - 1]++;
|
||||
|
||||
if (!bfqq->dispatched)
|
||||
if (!bfqq->dispatched) {
|
||||
bfq_add_bfqq_in_groups_with_pending_reqs(bfqq);
|
||||
if (bfqq->wr_coeff == 1)
|
||||
bfq_weights_tree_add(bfqd, bfqq,
|
||||
&bfqd->queue_weights_tree);
|
||||
bfq_weights_tree_add(bfqq);
|
||||
}
|
||||
|
||||
if (bfqq->wr_coeff > 1)
|
||||
bfqd->wr_busy_queues++;
|
||||
|
block/bio.c: 146 changed lines
@ -25,9 +25,15 @@
|
||||
#include "blk-rq-qos.h"
|
||||
#include "blk-cgroup.h"
|
||||
|
||||
#define ALLOC_CACHE_THRESHOLD 16
|
||||
#define ALLOC_CACHE_SLACK 64
|
||||
#define ALLOC_CACHE_MAX 256
|
||||
|
||||
struct bio_alloc_cache {
|
||||
struct bio *free_list;
|
||||
struct bio *free_list_irq;
|
||||
unsigned int nr;
|
||||
unsigned int nr_irq;
|
||||
};
|
||||
|
||||
static struct biovec_slab {
|
||||
@ -408,6 +414,22 @@ static void punt_bios_to_rescuer(struct bio_set *bs)
|
||||
queue_work(bs->rescue_workqueue, &bs->rescue_work);
|
||||
}
|
||||
|
||||
static void bio_alloc_irq_cache_splice(struct bio_alloc_cache *cache)
|
||||
{
|
||||
unsigned long flags;
|
||||
|
||||
/* cache->free_list must be empty */
|
||||
if (WARN_ON_ONCE(cache->free_list))
|
||||
return;
|
||||
|
||||
local_irq_save(flags);
|
||||
cache->free_list = cache->free_list_irq;
|
||||
cache->free_list_irq = NULL;
|
||||
cache->nr += cache->nr_irq;
|
||||
cache->nr_irq = 0;
|
||||
local_irq_restore(flags);
|
||||
}
|
||||
|
||||
static struct bio *bio_alloc_percpu_cache(struct block_device *bdev,
|
||||
unsigned short nr_vecs, blk_opf_t opf, gfp_t gfp,
|
||||
struct bio_set *bs)
|
||||
@ -417,8 +439,12 @@ static struct bio *bio_alloc_percpu_cache(struct block_device *bdev,
|
||||
|
||||
cache = per_cpu_ptr(bs->cache, get_cpu());
|
||||
if (!cache->free_list) {
|
||||
put_cpu();
|
||||
return NULL;
|
||||
if (READ_ONCE(cache->nr_irq) >= ALLOC_CACHE_THRESHOLD)
|
||||
bio_alloc_irq_cache_splice(cache);
|
||||
if (!cache->free_list) {
|
||||
put_cpu();
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
bio = cache->free_list;
|
||||
cache->free_list = bio->bi_next;
|
||||
@ -462,9 +488,6 @@ static struct bio *bio_alloc_percpu_cache(struct block_device *bdev,
|
||||
* submit_bio_noacct() should be avoided - instead, use bio_set's front_pad
|
||||
* for per bio allocations.
|
||||
*
|
||||
* If REQ_ALLOC_CACHE is set, the final put of the bio MUST be done from process
|
||||
* context, not hard/soft IRQ.
|
||||
*
|
||||
* Returns: Pointer to new bio on success, NULL on failure.
|
||||
*/
|
||||
struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs,
|
||||
@ -526,6 +549,8 @@ struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs,
|
||||
}
|
||||
if (unlikely(!p))
|
||||
return NULL;
|
||||
if (!mempool_is_saturated(&bs->bio_pool))
|
||||
opf &= ~REQ_ALLOC_CACHE;
|
||||
|
||||
bio = p + bs->front_pad;
|
||||
if (nr_vecs > BIO_INLINE_VECS) {
|
||||
@ -676,11 +701,8 @@ void guard_bio_eod(struct bio *bio)
|
||||
bio_truncate(bio, maxsector << 9);
|
||||
}
|
||||
|
||||
#define ALLOC_CACHE_MAX 512
|
||||
#define ALLOC_CACHE_SLACK 64
|
||||
|
||||
static void bio_alloc_cache_prune(struct bio_alloc_cache *cache,
|
||||
unsigned int nr)
|
||||
static int __bio_alloc_cache_prune(struct bio_alloc_cache *cache,
|
||||
unsigned int nr)
|
||||
{
|
||||
unsigned int i = 0;
|
||||
struct bio *bio;
|
||||
@ -692,6 +714,17 @@ static void bio_alloc_cache_prune(struct bio_alloc_cache *cache,
|
||||
if (++i == nr)
|
||||
break;
|
||||
}
|
||||
return i;
|
||||
}
|
||||
|
||||
static void bio_alloc_cache_prune(struct bio_alloc_cache *cache,
|
||||
unsigned int nr)
|
||||
{
|
||||
nr -= __bio_alloc_cache_prune(cache, nr);
|
||||
if (!READ_ONCE(cache->free_list)) {
|
||||
bio_alloc_irq_cache_splice(cache);
|
||||
__bio_alloc_cache_prune(cache, nr);
|
||||
}
|
||||
}
|
||||
|
||||
static int bio_cpu_dead(unsigned int cpu, struct hlist_node *node)
|
||||
@ -725,6 +758,35 @@ static void bio_alloc_cache_destroy(struct bio_set *bs)
|
||||
bs->cache = NULL;
|
||||
}
|
||||
|
||||
static inline void bio_put_percpu_cache(struct bio *bio)
|
||||
{
|
||||
struct bio_alloc_cache *cache;
|
||||
|
||||
cache = per_cpu_ptr(bio->bi_pool->cache, get_cpu());
|
||||
if (READ_ONCE(cache->nr_irq) + cache->nr > ALLOC_CACHE_MAX) {
|
||||
put_cpu();
|
||||
bio_free(bio);
|
||||
return;
|
||||
}
|
||||
|
||||
bio_uninit(bio);
|
||||
|
||||
if ((bio->bi_opf & REQ_POLLED) && !WARN_ON_ONCE(in_interrupt())) {
|
||||
bio->bi_next = cache->free_list;
|
||||
cache->free_list = bio;
|
||||
cache->nr++;
|
||||
} else {
|
||||
unsigned long flags;
|
||||
|
||||
local_irq_save(flags);
|
||||
bio->bi_next = cache->free_list_irq;
|
||||
cache->free_list_irq = bio;
|
||||
cache->nr_irq++;
|
||||
local_irq_restore(flags);
|
||||
}
|
||||
put_cpu();
|
||||
}
|
||||
|
||||
/**
|
||||
* bio_put - release a reference to a bio
|
||||
* @bio: bio to release reference to
|
||||
@ -740,20 +802,10 @@ void bio_put(struct bio *bio)
|
||||
if (!atomic_dec_and_test(&bio->__bi_cnt))
|
||||
return;
|
||||
}
|
||||
|
||||
if ((bio->bi_opf & REQ_ALLOC_CACHE) && !WARN_ON_ONCE(in_interrupt())) {
|
||||
struct bio_alloc_cache *cache;
|
||||
|
||||
bio_uninit(bio);
|
||||
cache = per_cpu_ptr(bio->bi_pool->cache, get_cpu());
|
||||
bio->bi_next = cache->free_list;
|
||||
cache->free_list = bio;
|
||||
if (++cache->nr > ALLOC_CACHE_MAX + ALLOC_CACHE_SLACK)
|
||||
bio_alloc_cache_prune(cache, ALLOC_CACHE_SLACK);
|
||||
put_cpu();
|
||||
} else {
|
||||
if (bio->bi_opf & REQ_ALLOC_CACHE)
|
||||
bio_put_percpu_cache(bio);
|
||||
else
|
||||
bio_free(bio);
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL(bio_put);
|
||||
|
||||
@ -863,6 +915,8 @@ static inline bool page_is_mergeable(const struct bio_vec *bv,
|
||||
return false;
|
||||
if (xen_domain() && !xen_biovec_phys_mergeable(bv, page))
|
||||
return false;
|
||||
if (!zone_device_pages_have_same_pgmap(bv->bv_page, page))
|
||||
return false;
|
||||
|
||||
*same_page = ((vec_end_addr & PAGE_MASK) == page_addr);
|
||||
if (*same_page)
|
||||
@ -1195,6 +1249,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
|
||||
unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
|
||||
struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
|
||||
struct page **pages = (struct page **)bv;
|
||||
unsigned int gup_flags = 0;
|
||||
ssize_t size, left;
|
||||
unsigned len, i = 0;
|
||||
size_t offset, trim;
|
||||
@ -1208,6 +1263,9 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
|
||||
BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
|
||||
pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);
|
||||
|
||||
if (bio->bi_bdev && blk_queue_pci_p2pdma(bio->bi_bdev->bd_disk->queue))
|
||||
gup_flags |= FOLL_PCI_P2PDMA;
|
||||
|
||||
/*
|
||||
* Each segment in the iov is required to be a block size multiple.
|
||||
* However, we may not be able to get the entire segment if it spans
|
||||
@ -1215,8 +1273,9 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
|
||||
* result to ensure the bio's total size is correct. The remainder of
|
||||
* the iov data will be picked up in the next bio iteration.
|
||||
*/
|
||||
size = iov_iter_get_pages2(iter, pages, UINT_MAX - bio->bi_iter.bi_size,
|
||||
nr_pages, &offset);
|
||||
size = iov_iter_get_pages(iter, pages,
|
||||
UINT_MAX - bio->bi_iter.bi_size,
|
||||
nr_pages, &offset, gup_flags);
|
||||
if (unlikely(size <= 0))
|
||||
return size ? size : -EFAULT;
|
||||
|
||||
@ -1342,27 +1401,6 @@ void __bio_advance(struct bio *bio, unsigned bytes)
|
||||
}
|
||||
EXPORT_SYMBOL(__bio_advance);
|
||||
|
||||
void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
|
||||
struct bio *src, struct bvec_iter *src_iter)
|
||||
{
|
||||
while (src_iter->bi_size && dst_iter->bi_size) {
|
||||
struct bio_vec src_bv = bio_iter_iovec(src, *src_iter);
|
||||
struct bio_vec dst_bv = bio_iter_iovec(dst, *dst_iter);
|
||||
unsigned int bytes = min(src_bv.bv_len, dst_bv.bv_len);
|
||||
void *src_buf = bvec_kmap_local(&src_bv);
|
||||
void *dst_buf = bvec_kmap_local(&dst_bv);
|
||||
|
||||
memcpy(dst_buf, src_buf, bytes);
|
||||
|
||||
kunmap_local(dst_buf);
|
||||
kunmap_local(src_buf);
|
||||
|
||||
bio_advance_iter_single(src, src_iter, bytes);
|
||||
bio_advance_iter_single(dst, dst_iter, bytes);
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL(bio_copy_data_iter);
|
||||
|
||||
/**
|
||||
* bio_copy_data - copy contents of data buffers from one bio to another
|
||||
* @src: source bio
|
||||
@ -1376,7 +1414,21 @@ void bio_copy_data(struct bio *dst, struct bio *src)
|
||||
struct bvec_iter src_iter = src->bi_iter;
|
||||
struct bvec_iter dst_iter = dst->bi_iter;
|
||||
|
||||
bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
|
||||
while (src_iter.bi_size && dst_iter.bi_size) {
|
||||
struct bio_vec src_bv = bio_iter_iovec(src, src_iter);
|
||||
struct bio_vec dst_bv = bio_iter_iovec(dst, dst_iter);
|
||||
unsigned int bytes = min(src_bv.bv_len, dst_bv.bv_len);
|
||||
void *src_buf = bvec_kmap_local(&src_bv);
|
||||
void *dst_buf = bvec_kmap_local(&dst_bv);
|
||||
|
||||
memcpy(dst_buf, src_buf, bytes);
|
||||
|
||||
kunmap_local(dst_buf);
|
||||
kunmap_local(src_buf);
|
||||
|
||||
bio_advance_iter_single(src, &src_iter, bytes);
|
||||
bio_advance_iter_single(dst, &dst_iter, bytes);
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL(bio_copy_data);
|
||||
|
||||
|
@ -59,6 +59,37 @@ static struct workqueue_struct *blkcg_punt_bio_wq;
|
||||
|
||||
#define BLKG_DESTROY_BATCH_SIZE 64
|
||||
|
||||
/*
|
||||
* Lockless lists for tracking IO stats update
|
||||
*
|
||||
* New IO stats are stored in the percpu iostat_cpu within blkcg_gq (blkg).
|
||||
* There are multiple blkg's (one for each block device) attached to each
|
||||
* blkcg. The rstat code keeps track of which cpu has IO stats updated,
|
||||
* but it doesn't know which blkg has the updated stats. If there are many
|
||||
* block devices in a system, the cost of iterating all the blkg's to flush
|
||||
* out the IO stats can be high. To reduce such overhead, a set of percpu
|
||||
* lockless lists (lhead) per blkcg are used to track the set of recently
|
||||
* updated iostat_cpu's since the last flush. An iostat_cpu will be put
|
||||
* onto the lockless list on the update side [blk_cgroup_bio_start()] if
|
||||
* not there yet and then removed when being flushed [blkcg_rstat_flush()].
|
||||
* References to blkg are gotten and then put back in the process to
|
||||
* protect against blkg removal.
|
||||
*
|
||||
* Return: 0 if successful or -ENOMEM if allocation fails.
|
||||
*/
|
||||
static int init_blkcg_llists(struct blkcg *blkcg)
|
||||
{
|
||||
int cpu;
|
||||
|
||||
blkcg->lhead = alloc_percpu_gfp(struct llist_head, GFP_KERNEL);
|
||||
if (!blkcg->lhead)
|
||||
return -ENOMEM;
|
||||
|
||||
for_each_possible_cpu(cpu)
|
||||
init_llist_head(per_cpu_ptr(blkcg->lhead, cpu));
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* blkcg_css - find the current css
|
||||
*
|
||||
@ -236,8 +267,10 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk,
|
||||
blkg->blkcg = blkcg;
|
||||
|
||||
u64_stats_init(&blkg->iostat.sync);
|
||||
for_each_possible_cpu(cpu)
|
||||
for_each_possible_cpu(cpu) {
|
||||
u64_stats_init(&per_cpu_ptr(blkg->iostat_cpu, cpu)->sync);
|
||||
per_cpu_ptr(blkg->iostat_cpu, cpu)->blkg = blkg;
|
||||
}
|
||||
|
||||
for (i = 0; i < BLKCG_MAX_POLS; i++) {
|
||||
struct blkcg_policy *pol = blkcg_policy[i];
|
||||
@ -577,7 +610,7 @@ EXPORT_SYMBOL_GPL(blkcg_print_blkgs);
|
||||
* @pd: policy private data of interest
|
||||
* @v: value to print
|
||||
*
|
||||
* Print @v to @sf for the device assocaited with @pd.
|
||||
* Print @v to @sf for the device associated with @pd.
|
||||
*/
|
||||
u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v)
|
||||
{
|
||||
@ -765,7 +798,7 @@ EXPORT_SYMBOL_GPL(blkg_conf_prep);
|
||||
|
||||
/**
|
||||
* blkg_conf_finish - finish up per-blkg config update
|
||||
* @ctx: blkg_conf_ctx intiailized by blkg_conf_prep()
|
||||
* @ctx: blkg_conf_ctx initialized by blkg_conf_prep()
|
||||
*
|
||||
* Finish up after per-blkg config update. This function must be paired
|
||||
* with blkg_conf_prep().
|
||||
@ -827,7 +860,9 @@ static void blkcg_iostat_update(struct blkcg_gq *blkg, struct blkg_iostat *cur,
|
||||
static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
|
||||
{
|
||||
struct blkcg *blkcg = css_to_blkcg(css);
|
||||
struct blkcg_gq *blkg;
|
||||
struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu);
|
||||
struct llist_node *lnode;
|
||||
struct blkg_iostat_set *bisc, *next_bisc;
|
||||
|
||||
/* Root-level stats are sourced from system-wide IO stats */
|
||||
if (!cgroup_parent(css->cgroup))
|
||||
@ -835,12 +870,21 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
|
||||
|
||||
rcu_read_lock();
|
||||
|
||||
hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
|
||||
lnode = llist_del_all(lhead);
|
||||
if (!lnode)
|
||||
goto out;
|
||||
|
||||
/*
|
||||
* Iterate only the iostat_cpu's queued in the lockless list.
|
||||
*/
|
||||
llist_for_each_entry_safe(bisc, next_bisc, lnode, lnode) {
|
||||
struct blkcg_gq *blkg = bisc->blkg;
|
||||
struct blkcg_gq *parent = blkg->parent;
|
||||
struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu);
|
||||
struct blkg_iostat cur;
|
||||
unsigned int seq;
|
||||
|
||||
WRITE_ONCE(bisc->lqueued, false);
|
||||
|
||||
/* fetch the current per-cpu values */
|
||||
do {
|
||||
seq = u64_stats_fetch_begin(&bisc->sync);
|
||||
@ -853,8 +897,10 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
|
||||
if (parent && parent->parent)
|
||||
blkcg_iostat_update(parent, &blkg->iostat.cur,
|
||||
&blkg->iostat.last);
|
||||
percpu_ref_put(&blkg->refcnt);
|
||||
}
|
||||
|
||||
out:
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
@ -1132,6 +1178,7 @@ static void blkcg_css_free(struct cgroup_subsys_state *css)
|
||||
|
||||
mutex_unlock(&blkcg_pol_mutex);
|
||||
|
||||
free_percpu(blkcg->lhead);
|
||||
kfree(blkcg);
|
||||
}
|
||||
|
||||
@ -1139,7 +1186,6 @@ static struct cgroup_subsys_state *
|
||||
blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
|
||||
{
|
||||
struct blkcg *blkcg;
|
||||
struct cgroup_subsys_state *ret;
|
||||
int i;
|
||||
|
||||
mutex_lock(&blkcg_pol_mutex);
|
||||
@ -1148,12 +1194,13 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
|
||||
blkcg = &blkcg_root;
|
||||
} else {
|
||||
blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
|
||||
if (!blkcg) {
|
||||
ret = ERR_PTR(-ENOMEM);
|
||||
if (!blkcg)
|
||||
goto unlock;
|
||||
}
|
||||
}
|
||||
|
||||
if (init_blkcg_llists(blkcg))
|
||||
goto free_blkcg;
|
||||
|
||||
for (i = 0; i < BLKCG_MAX_POLS ; i++) {
|
||||
struct blkcg_policy *pol = blkcg_policy[i];
|
||||
struct blkcg_policy_data *cpd;
|
||||
@ -1168,10 +1215,9 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
|
||||
continue;
|
||||
|
||||
cpd = pol->cpd_alloc_fn(GFP_KERNEL);
|
||||
if (!cpd) {
|
||||
ret = ERR_PTR(-ENOMEM);
|
||||
if (!cpd)
|
||||
goto free_pd_blkcg;
|
||||
}
|
||||
|
||||
blkcg->cpd[i] = cpd;
|
||||
cpd->blkcg = blkcg;
|
||||
cpd->plid = i;
|
||||
@ -1195,12 +1241,13 @@ free_pd_blkcg:
|
||||
for (i--; i >= 0; i--)
|
||||
if (blkcg->cpd[i])
|
||||
blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);
|
||||
|
||||
free_percpu(blkcg->lhead);
|
||||
free_blkcg:
|
||||
if (blkcg != &blkcg_root)
|
||||
kfree(blkcg);
|
||||
unlock:
|
||||
mutex_unlock(&blkcg_pol_mutex);
|
||||
return ret;
|
||||
return ERR_PTR(-ENOMEM);
|
||||
}
|
||||
|
||||
static int blkcg_css_online(struct cgroup_subsys_state *css)
|
||||
@ -1784,7 +1831,7 @@ out:
|
||||
|
||||
/**
|
||||
* blkcg_schedule_throttle - this task needs to check for throttling
|
||||
* @gendisk: disk to throttle
|
||||
* @disk: disk to throttle
|
||||
* @use_memdelay: do we charge this to memory delay for PSI
|
||||
*
|
||||
* This is called by the IO controller when we know there's delay accumulated
|
||||
@ -1943,6 +1990,7 @@ static int blk_cgroup_io_type(struct bio *bio)
|
||||
|
||||
void blk_cgroup_bio_start(struct bio *bio)
|
||||
{
|
||||
struct blkcg *blkcg = bio->bi_blkg->blkcg;
|
||||
int rwd = blk_cgroup_io_type(bio), cpu;
|
||||
struct blkg_iostat_set *bis;
|
||||
unsigned long flags;
|
||||
@ -1961,9 +2009,21 @@ void blk_cgroup_bio_start(struct bio *bio)
|
||||
}
|
||||
bis->cur.ios[rwd]++;
|
||||
|
||||
/*
|
||||
* If the iostat_cpu isn't in a lockless list, put it into the
|
||||
* list to indicate that a stat update is pending.
|
||||
*/
|
||||
if (!READ_ONCE(bis->lqueued)) {
|
||||
struct llist_head *lhead = this_cpu_ptr(blkcg->lhead);
|
||||
|
||||
llist_add(&bis->lnode, lhead);
|
||||
WRITE_ONCE(bis->lqueued, true);
|
||||
percpu_ref_get(&bis->blkg->refcnt);
|
||||
}
|
||||
|
||||
u64_stats_update_end_irqrestore(&bis->sync, flags);
|
||||
if (cgroup_subsys_on_dfl(io_cgrp_subsys))
|
||||
cgroup_rstat_updated(bio->bi_blkg->blkcg->css.cgroup, cpu);
|
||||
cgroup_rstat_updated(blkcg->css.cgroup, cpu);
|
||||
put_cpu();
|
||||
}
|
||||
|
||||
|
@ -18,6 +18,7 @@
|
||||
#include <linux/cgroup.h>
|
||||
#include <linux/kthread.h>
|
||||
#include <linux/blk-mq.h>
|
||||
#include <linux/llist.h>
|
||||
|
||||
struct blkcg_gq;
|
||||
struct blkg_policy_data;
|
||||
@ -43,6 +44,9 @@ struct blkg_iostat {
|
||||
|
||||
struct blkg_iostat_set {
|
||||
struct u64_stats_sync sync;
|
||||
struct blkcg_gq *blkg;
|
||||
struct llist_node lnode;
|
||||
int lqueued; /* queued in llist */
|
||||
struct blkg_iostat cur;
|
||||
struct blkg_iostat last;
|
||||
};
|
||||
@ -97,6 +101,12 @@ struct blkcg {
|
||||
struct blkcg_policy_data *cpd[BLKCG_MAX_POLS];
|
||||
|
||||
struct list_head all_blkcgs_node;
|
||||
|
||||
/*
|
||||
* List of updated percpu blkg_iostat_set's since the last flush.
|
||||
*/
|
||||
struct llist_head __percpu *lhead;
|
||||
|
||||
#ifdef CONFIG_BLK_CGROUP_FC_APPID
|
||||
char fc_app_id[FC_APPID_LEN];
|
||||
#endif
|
||||
|
@ -59,13 +59,12 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(block_split);
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug);
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_insert);
|
||||
|
||||
DEFINE_IDA(blk_queue_ida);
|
||||
static DEFINE_IDA(blk_queue_ida);
|
||||
|
||||
/*
|
||||
* For queue allocation
|
||||
*/
|
||||
struct kmem_cache *blk_requestq_cachep;
|
||||
struct kmem_cache *blk_requestq_srcu_cachep;
|
||||
static struct kmem_cache *blk_requestq_cachep;
|
||||
|
||||
/*
|
||||
* Controlling structure to kblockd
|
||||
@ -253,19 +252,44 @@ void blk_clear_pm_only(struct request_queue *q)
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(blk_clear_pm_only);
|
||||
|
||||
static void blk_free_queue_rcu(struct rcu_head *rcu_head)
|
||||
{
|
||||
kmem_cache_free(blk_requestq_cachep,
|
||||
container_of(rcu_head, struct request_queue, rcu_head));
|
||||
}
|
||||
|
||||
static void blk_free_queue(struct request_queue *q)
|
||||
{
|
||||
percpu_ref_exit(&q->q_usage_counter);
|
||||
|
||||
if (q->poll_stat)
|
||||
blk_stat_remove_callback(q, q->poll_cb);
|
||||
blk_stat_free_callback(q->poll_cb);
|
||||
|
||||
blk_free_queue_stats(q->stats);
|
||||
kfree(q->poll_stat);
|
||||
|
||||
if (queue_is_mq(q))
|
||||
blk_mq_release(q);
|
||||
|
||||
ida_free(&blk_queue_ida, q->id);
|
||||
call_rcu(&q->rcu_head, blk_free_queue_rcu);
|
||||
}
|
||||
|
||||
/**
|
||||
* blk_put_queue - decrement the request_queue refcount
|
||||
* @q: the request_queue structure to decrement the refcount for
|
||||
*
|
||||
* Decrements the refcount of the request_queue kobject. When this reaches 0
|
||||
* we'll have blk_release_queue() called.
|
||||
* Decrements the refcount of the request_queue and free it when the refcount
|
||||
* reaches 0.
|
||||
*
|
||||
* Context: Any context, but the last reference must not be dropped from
|
||||
* atomic context.
|
||||
* Context: Can sleep.
|
||||
*/
|
||||
void blk_put_queue(struct request_queue *q)
|
||||
{
|
||||
kobject_put(&q->kobj);
|
||||
might_sleep();
|
||||
if (refcount_dec_and_test(&q->refs))
|
||||
blk_free_queue(q);
|
||||
}
|
||||
EXPORT_SYMBOL(blk_put_queue);
|
||||
|
||||
@ -373,26 +397,20 @@ static void blk_timeout_work(struct work_struct *work)
|
||||
{
|
||||
}
|
||||
|
||||
struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu)
|
||||
struct request_queue *blk_alloc_queue(int node_id)
|
||||
{
|
||||
struct request_queue *q;
|
||||
|
||||
q = kmem_cache_alloc_node(blk_get_queue_kmem_cache(alloc_srcu),
|
||||
GFP_KERNEL | __GFP_ZERO, node_id);
|
||||
q = kmem_cache_alloc_node(blk_requestq_cachep, GFP_KERNEL | __GFP_ZERO,
|
||||
node_id);
|
||||
if (!q)
|
||||
return NULL;
|
||||
|
||||
if (alloc_srcu) {
|
||||
blk_queue_flag_set(QUEUE_FLAG_HAS_SRCU, q);
|
||||
if (init_srcu_struct(q->srcu) != 0)
|
||||
goto fail_q;
|
||||
}
|
||||
|
||||
q->last_merge = NULL;
|
||||
|
||||
q->id = ida_alloc(&blk_queue_ida, GFP_KERNEL);
|
||||
if (q->id < 0)
|
||||
goto fail_srcu;
|
||||
goto fail_q;
|
||||
|
||||
q->stats = blk_alloc_queue_stats();
|
||||
if (!q->stats)
|
||||
@ -406,8 +424,7 @@ struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu)
|
||||
INIT_WORK(&q->timeout_work, blk_timeout_work);
|
||||
INIT_LIST_HEAD(&q->icq_list);
|
||||
|
||||
kobject_init(&q->kobj, &blk_queue_ktype);
|
||||
|
||||
refcount_set(&q->refs, 1);
|
||||
mutex_init(&q->debugfs_mutex);
|
||||
mutex_init(&q->sysfs_lock);
|
||||
mutex_init(&q->sysfs_dir_lock);
|
||||
@ -434,11 +451,8 @@ fail_stats:
|
||||
blk_free_queue_stats(q->stats);
|
||||
fail_id:
|
||||
ida_free(&blk_queue_ida, q->id);
|
||||
fail_srcu:
|
||||
if (alloc_srcu)
|
||||
cleanup_srcu_struct(q->srcu);
|
||||
fail_q:
|
||||
kmem_cache_free(blk_get_queue_kmem_cache(alloc_srcu), q);
|
||||
kmem_cache_free(blk_requestq_cachep, q);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@ -454,7 +468,7 @@ bool blk_get_queue(struct request_queue *q)
|
||||
{
|
||||
if (unlikely(blk_queue_dying(q)))
|
||||
return false;
|
||||
kobject_get(&q->kobj);
|
||||
refcount_inc(&q->refs);
|
||||
return true;
|
||||
}
|
||||
EXPORT_SYMBOL(blk_get_queue);
|
||||
@ -944,18 +958,6 @@ unsigned long bdev_start_io_acct(struct block_device *bdev,
|
||||
}
|
||||
EXPORT_SYMBOL(bdev_start_io_acct);
|
||||
|
||||
/**
|
||||
* bio_start_io_acct_time - start I/O accounting for bio based drivers
|
||||
* @bio: bio to start account for
|
||||
* @start_time: start time that should be passed back to bio_end_io_acct().
|
||||
*/
|
||||
void bio_start_io_acct_time(struct bio *bio, unsigned long start_time)
|
||||
{
|
||||
bdev_start_io_acct(bio->bi_bdev, bio_sectors(bio),
|
||||
bio_op(bio), start_time);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(bio_start_io_acct_time);
|
||||
|
||||
/**
|
||||
* bio_start_io_acct - start I/O accounting for bio based drivers
|
||||
* @bio: bio to start account for
|
||||
@ -1183,9 +1185,6 @@ int __init blk_dev_init(void)
|
||||
sizeof_field(struct request, cmd_flags));
|
||||
BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 *
|
||||
sizeof_field(struct bio, bi_opf));
|
||||
BUILD_BUG_ON(ALIGN(offsetof(struct request_queue, srcu),
|
||||
__alignof__(struct request_queue)) !=
|
||||
sizeof(struct request_queue));
|
||||
|
||||
/* used for unplugging and affects IO latency/throughput - HIGHPRI */
|
||||
kblockd_workqueue = alloc_workqueue("kblockd",
|
||||
@ -1196,10 +1195,6 @@ int __init blk_dev_init(void)
|
||||
blk_requestq_cachep = kmem_cache_create("request_queue",
|
||||
sizeof(struct request_queue), 0, SLAB_PANIC, NULL);
|
||||
|
||||
blk_requestq_srcu_cachep = kmem_cache_create("request_queue_srcu",
|
||||
sizeof(struct request_queue) +
|
||||
sizeof(struct srcu_struct), 0, SLAB_PANIC, NULL);
|
||||
|
||||
blk_debugfs_root = debugfs_create_dir("block", NULL);
|
||||
|
||||
return 0;
|
||||
|
@@ -21,9 +21,9 @@ extern const struct blk_crypto_mode blk_crypto_modes[];

#ifdef CONFIG_BLK_INLINE_ENCRYPTION

int blk_crypto_sysfs_register(struct request_queue *q);
int blk_crypto_sysfs_register(struct gendisk *disk);

void blk_crypto_sysfs_unregister(struct request_queue *q);
void blk_crypto_sysfs_unregister(struct gendisk *disk);

void bio_crypt_dun_increment(u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE],
unsigned int inc);
@@ -65,14 +65,28 @@ static inline bool blk_crypto_rq_is_encrypted(struct request *rq)
return rq->crypt_ctx;
}

blk_status_t blk_crypto_get_keyslot(struct blk_crypto_profile *profile,
const struct blk_crypto_key *key,
struct blk_crypto_keyslot **slot_ptr);

void blk_crypto_put_keyslot(struct blk_crypto_keyslot *slot);

int __blk_crypto_evict_key(struct blk_crypto_profile *profile,
const struct blk_crypto_key *key);

bool __blk_crypto_cfg_supported(struct blk_crypto_profile *profile,
const struct blk_crypto_config *cfg);

#else /* CONFIG_BLK_INLINE_ENCRYPTION */

static inline int blk_crypto_sysfs_register(struct request_queue *q)
static inline int blk_crypto_sysfs_register(struct gendisk *disk)
{
return 0;
}

static inline void blk_crypto_sysfs_unregister(struct request_queue *q) { }
static inline void blk_crypto_sysfs_unregister(struct gendisk *disk)
{
}

static inline bool bio_crypt_rq_ctx_compatible(struct request *rq,
struct bio *bio)

@@ -32,6 +32,7 @@
#include <linux/wait.h>
#include <linux/blkdev.h>
#include <linux/blk-integrity.h>
#include "blk-crypto-internal.h"

struct blk_crypto_keyslot {
atomic_t slot_refs;

@@ -126,8 +126,9 @@ static struct kobj_type blk_crypto_ktype = {
* If the request_queue has a blk_crypto_profile, create the "crypto"
* subdirectory in sysfs (/sys/block/$disk/queue/crypto/).
*/
int blk_crypto_sysfs_register(struct request_queue *q)
int blk_crypto_sysfs_register(struct gendisk *disk)
{
struct request_queue *q = disk->queue;
struct blk_crypto_kobj *obj;
int err;

@@ -139,8 +140,8 @@ int blk_crypto_sysfs_register(struct request_queue *q)
return -ENOMEM;
obj->profile = q->crypto_profile;

err = kobject_init_and_add(&obj->kobj, &blk_crypto_ktype, &q->kobj,
"crypto");
err = kobject_init_and_add(&obj->kobj, &blk_crypto_ktype,
&disk->queue_kobj, "crypto");
if (err) {
kobject_put(&obj->kobj);
return err;
@@ -149,9 +150,9 @@ int blk_crypto_sysfs_register(struct request_queue *q)
return 0;
}

void blk_crypto_sysfs_unregister(struct request_queue *q)
void blk_crypto_sysfs_unregister(struct gendisk *disk)
{
kobject_put(q->crypto_kobject);
kobject_put(disk->queue->crypto_kobject);
}

static int __init blk_crypto_sysfs_init(void)

@@ -273,7 +273,6 @@ bool __blk_crypto_bio_prep(struct bio **bio_ptr)
{
struct bio *bio = *bio_ptr;
const struct blk_crypto_key *bc_key = bio->bi_crypt_context->bc_key;
struct blk_crypto_profile *profile;

/* Error if bio has no data. */
if (WARN_ON_ONCE(!bio_has_data(bio))) {
@@ -290,10 +289,9 @@ bool __blk_crypto_bio_prep(struct bio **bio_ptr)
* Success if device supports the encryption context, or if we succeeded
* in falling back to the crypto API.
*/
profile = bdev_get_queue(bio->bi_bdev)->crypto_profile;
if (__blk_crypto_cfg_supported(profile, &bc_key->crypto_cfg))
if (blk_crypto_config_supported_natively(bio->bi_bdev,
&bc_key->crypto_cfg))
return true;

if (blk_crypto_fallback_bio_prep(bio_ptr))
return true;
fail:
@@ -358,22 +356,29 @@ int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key,
return 0;
}

bool blk_crypto_config_supported_natively(struct block_device *bdev,
const struct blk_crypto_config *cfg)
{
return __blk_crypto_cfg_supported(bdev_get_queue(bdev)->crypto_profile,
cfg);
}

/*
* Check if bios with @cfg can be en/decrypted by blk-crypto (i.e. either the
* request queue it's submitted to supports inline crypto, or the
* block_device it's submitted to supports inline crypto, or the
* blk-crypto-fallback is enabled and supports the cfg).
*/
bool blk_crypto_config_supported(struct request_queue *q,
bool blk_crypto_config_supported(struct block_device *bdev,
const struct blk_crypto_config *cfg)
{
return IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) ||
__blk_crypto_cfg_supported(q->crypto_profile, cfg);
blk_crypto_config_supported_natively(bdev, cfg);
}

/**
* blk_crypto_start_using_key() - Start using a blk_crypto_key on a device
* @bdev: block device to operate on
* @key: A key to use on the device
* @q: the request queue for the device
*
* Upper layers must call this function to ensure that either the hardware
* supports the key's crypto settings, or the crypto API fallback has transforms
@@ -385,10 +390,10 @@ bool blk_crypto_config_supported(struct request_queue *q,
* blk-crypto-fallback is either disabled or the needed algorithm
* is disabled in the crypto API; or another -errno code.
*/
int blk_crypto_start_using_key(const struct blk_crypto_key *key,
struct request_queue *q)
int blk_crypto_start_using_key(struct block_device *bdev,
const struct blk_crypto_key *key)
{
if (__blk_crypto_cfg_supported(q->crypto_profile, &key->crypto_cfg))
if (blk_crypto_config_supported_natively(bdev, &key->crypto_cfg))
return 0;
return blk_crypto_fallback_start_using_mode(key->crypto_cfg.crypto_mode);
}
@@ -396,7 +401,7 @@ int blk_crypto_start_using_key(const struct blk_crypto_key *key,
/**
* blk_crypto_evict_key() - Evict a key from any inline encryption hardware
* it may have been programmed into
* @q: The request queue who's associated inline encryption hardware this key
* @bdev: The block_device who's associated inline encryption hardware this key
* might have been programmed into
* @key: The key to evict
*
@@ -406,14 +411,16 @@ int blk_crypto_start_using_key(const struct blk_crypto_key *key,
*
* Return: 0 on success or if the key wasn't in any keyslot; -errno on error.
*/
int blk_crypto_evict_key(struct request_queue *q,
int blk_crypto_evict_key(struct block_device *bdev,
const struct blk_crypto_key *key)
{
if (__blk_crypto_cfg_supported(q->crypto_profile, &key->crypto_cfg))
struct request_queue *q = bdev_get_queue(bdev);

if (blk_crypto_config_supported_natively(bdev, &key->crypto_cfg))
return __blk_crypto_evict_key(q->crypto_profile, key);

/*
* If the request_queue didn't support the key, then blk-crypto-fallback
* If the block_device didn't support the key, then blk-crypto-fallback
* may have been used, so try to evict the key from blk-crypto-fallback.
*/
return blk_crypto_fallback_evict_key(key);

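For callers, the net effect of these blk-crypto hunks is that the inline-encryption entry points now take a block_device instead of a request_queue. Below is a hedged, kernel-style sketch of the intended calling pattern using only the signatures shown above; it is not buildable outside the kernel tree, and the function name and the omitted I/O submission in the middle are invented.

/* Sketch: exercise the bdev-based blk-crypto API from an upper layer. */
static int example_use_inline_key(struct block_device *bdev,
				  const struct blk_crypto_key *key)
{
	int err;

	/* Either a hardware keyslot or the crypto API fallback must work. */
	err = blk_crypto_start_using_key(bdev, key);
	if (err)
		return err;

	/*
	 * ... submit encrypted bios against bdev here ...
	 */

	/* Drop the key from any keyslots it may occupy when finished. */
	return blk_crypto_evict_key(bdev, key);
}
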
@@ -123,7 +123,8 @@ int disk_register_independent_access_ranges(struct gendisk *disk)
*/
WARN_ON(iars->sysfs_registered);
ret = kobject_init_and_add(&iars->kobj, &blk_ia_ranges_ktype,
&q->kobj, "%s", "independent_access_ranges");
&disk->queue_kobj, "%s",
"independent_access_ranges");
if (ret) {
disk->ia_ranges = NULL;
kobject_put(&iars->kobj);

@ -111,7 +111,7 @@
|
||||
* busy signal.
|
||||
*
|
||||
* As devices can have deep queues and be unfair in how the queued commands
|
||||
* are executed, soley depending on rq wait may not result in satisfactory
|
||||
* are executed, solely depending on rq wait may not result in satisfactory
|
||||
* control quality. For a better control quality, completion latency QoS
|
||||
* parameters can be configured so that the device is considered saturated
|
||||
* if N'th percentile completion latency rises above the set point.
|
||||
@ -556,7 +556,6 @@ struct ioc_now {
|
||||
u64 now_ns;
|
||||
u64 now;
|
||||
u64 vnow;
|
||||
u64 vrate;
|
||||
};
|
||||
|
||||
struct iocg_wait {
|
||||
@ -906,8 +905,10 @@ static bool ioc_refresh_params(struct ioc *ioc, bool force)
|
||||
if (idx == ioc->autop_idx && !force)
|
||||
return false;
|
||||
|
||||
if (idx != ioc->autop_idx)
|
||||
if (idx != ioc->autop_idx) {
|
||||
atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
|
||||
ioc->vtime_base_rate = VTIME_PER_USEC;
|
||||
}
|
||||
|
||||
ioc->autop_idx = idx;
|
||||
ioc->autop_too_fast_at = 0;
|
||||
@ -975,7 +976,7 @@ static void ioc_adjust_base_vrate(struct ioc *ioc, u32 rq_wait_pct,
|
||||
|
||||
if (!ioc->busy_level || (ioc->busy_level < 0 && nr_lagging)) {
|
||||
if (ioc->busy_level != prev_busy_level || nr_lagging)
|
||||
trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate),
|
||||
trace_iocost_ioc_vrate_adj(ioc, vrate,
|
||||
missed_ppm, rq_wait_pct,
|
||||
nr_lagging, nr_shortages);
|
||||
|
||||
@ -1018,10 +1019,11 @@ static void ioc_adjust_base_vrate(struct ioc *ioc, u32 rq_wait_pct,
|
||||
static void ioc_now(struct ioc *ioc, struct ioc_now *now)
|
||||
{
|
||||
unsigned seq;
|
||||
u64 vrate;
|
||||
|
||||
now->now_ns = ktime_get();
|
||||
now->now = ktime_to_us(now->now_ns);
|
||||
now->vrate = atomic64_read(&ioc->vtime_rate);
|
||||
vrate = atomic64_read(&ioc->vtime_rate);
|
||||
|
||||
/*
|
||||
* The current vtime is
|
||||
@ -1034,7 +1036,7 @@ static void ioc_now(struct ioc *ioc, struct ioc_now *now)
|
||||
do {
|
||||
seq = read_seqcount_begin(&ioc->period_seqcount);
|
||||
now->vnow = ioc->period_at_vtime +
|
||||
(now->now - ioc->period_at) * now->vrate;
|
||||
(now->now - ioc->period_at) * vrate;
|
||||
} while (read_seqcount_retry(&ioc->period_seqcount, seq));
|
||||
}
|
||||
|
||||
@ -2203,8 +2205,8 @@ static void ioc_timer_fn(struct timer_list *timer)
|
||||
LIST_HEAD(surpluses);
|
||||
int nr_debtors, nr_shortages = 0, nr_lagging = 0;
|
||||
u64 usage_us_sum = 0;
|
||||
u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM];
|
||||
u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
|
||||
u32 ppm_rthr;
|
||||
u32 ppm_wthr;
|
||||
u32 missed_ppm[2], rq_wait_pct;
|
||||
u64 period_vtime;
|
||||
int prev_busy_level;
|
||||
@ -2215,6 +2217,8 @@ static void ioc_timer_fn(struct timer_list *timer)
|
||||
/* take care of active iocgs */
|
||||
spin_lock_irq(&ioc->lock);
|
||||
|
||||
ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM];
|
||||
ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
|
||||
ioc_now(ioc, &now);
|
||||
|
||||
period_vtime = now.vnow - ioc->period_at_vtime;
|
||||
@ -2878,7 +2882,7 @@ static int blk_iocost_init(struct gendisk *disk)
|
||||
spin_unlock_irq(&ioc->lock);
|
||||
|
||||
/*
|
||||
* rqos must be added before activation to allow iocg_pd_init() to
|
||||
* rqos must be added before activation to allow ioc_pd_init() to
|
||||
* lookup the ioc from q. This means that the rqos methods may get
|
||||
* called before policy activation completion, can't assume that the
|
||||
* target bio has an iocg associated and need to test for NULL iocg.
|
||||
@ -3187,11 +3191,13 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
|
||||
ioc = q_to_ioc(disk->queue);
|
||||
}
|
||||
|
||||
blk_mq_freeze_queue(disk->queue);
|
||||
blk_mq_quiesce_queue(disk->queue);
|
||||
|
||||
spin_lock_irq(&ioc->lock);
|
||||
memcpy(qos, ioc->params.qos, sizeof(qos));
|
||||
enable = ioc->enabled;
|
||||
user = ioc->user_qos_params;
|
||||
spin_unlock_irq(&ioc->lock);
|
||||
|
||||
while ((p = strsep(&input, " \t\n"))) {
|
||||
substring_t args[MAX_OPT_ARGS];
|
||||
@ -3258,15 +3264,15 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
|
||||
if (qos[QOS_MIN] > qos[QOS_MAX])
|
||||
goto einval;
|
||||
|
||||
spin_lock_irq(&ioc->lock);
|
||||
|
||||
if (enable) {
|
||||
blk_stat_enable_accounting(disk->queue);
|
||||
blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue);
|
||||
ioc->enabled = true;
|
||||
wbt_disable_default(disk->queue);
|
||||
} else {
|
||||
blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue);
|
||||
ioc->enabled = false;
|
||||
wbt_enable_default(disk->queue);
|
||||
}
|
||||
|
||||
if (user) {
|
||||
@ -3279,9 +3285,17 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
|
||||
ioc_refresh_params(ioc, true);
|
||||
spin_unlock_irq(&ioc->lock);
|
||||
|
||||
blk_mq_unquiesce_queue(disk->queue);
|
||||
blk_mq_unfreeze_queue(disk->queue);
|
||||
|
||||
blkdev_put_no_open(bdev);
|
||||
return nbytes;
|
||||
einval:
|
||||
spin_unlock_irq(&ioc->lock);
|
||||
|
||||
blk_mq_unquiesce_queue(disk->queue);
|
||||
blk_mq_unfreeze_queue(disk->queue);
|
||||
|
||||
ret = -EINVAL;
|
||||
err:
|
||||
blkdev_put_no_open(bdev);
|
||||
@ -3336,6 +3350,7 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
|
||||
size_t nbytes, loff_t off)
|
||||
{
|
||||
struct block_device *bdev;
|
||||
struct request_queue *q;
|
||||
struct ioc *ioc;
|
||||
u64 u[NR_I_LCOEFS];
|
||||
bool user;
|
||||
@ -3346,18 +3361,21 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
|
||||
if (IS_ERR(bdev))
|
||||
return PTR_ERR(bdev);
|
||||
|
||||
ioc = q_to_ioc(bdev_get_queue(bdev));
|
||||
q = bdev_get_queue(bdev);
|
||||
ioc = q_to_ioc(q);
|
||||
if (!ioc) {
|
||||
ret = blk_iocost_init(bdev->bd_disk);
|
||||
if (ret)
|
||||
goto err;
|
||||
ioc = q_to_ioc(bdev_get_queue(bdev));
|
||||
ioc = q_to_ioc(q);
|
||||
}
|
||||
|
||||
blk_mq_freeze_queue(q);
|
||||
blk_mq_quiesce_queue(q);
|
||||
|
||||
spin_lock_irq(&ioc->lock);
|
||||
memcpy(u, ioc->params.i_lcoefs, sizeof(u));
|
||||
user = ioc->user_cost_model;
|
||||
spin_unlock_irq(&ioc->lock);
|
||||
|
||||
while ((p = strsep(&input, " \t\n"))) {
|
||||
substring_t args[MAX_OPT_ARGS];
|
||||
@ -3394,7 +3412,6 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
|
||||
user = true;
|
||||
}
|
||||
|
||||
spin_lock_irq(&ioc->lock);
|
||||
if (user) {
|
||||
memcpy(ioc->params.i_lcoefs, u, sizeof(u));
|
||||
ioc->user_cost_model = true;
|
||||
@ -3404,10 +3421,18 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
|
||||
ioc_refresh_params(ioc, true);
|
||||
spin_unlock_irq(&ioc->lock);
|
||||
|
||||
blk_mq_unquiesce_queue(q);
|
||||
blk_mq_unfreeze_queue(q);
|
||||
|
||||
blkdev_put_no_open(bdev);
|
||||
return nbytes;
|
||||
|
||||
einval:
|
||||
spin_unlock_irq(&ioc->lock);
|
||||
|
||||
blk_mq_unquiesce_queue(q);
|
||||
blk_mq_unfreeze_queue(q);
|
||||
|
||||
ret = -EINVAL;
|
||||
err:
|
||||
blkdev_put_no_open(bdev);
|
||||
|
@ -141,7 +141,7 @@ struct iolatency_grp {
|
||||
struct latency_stat __percpu *stats;
|
||||
struct latency_stat cur_stat;
|
||||
struct blk_iolatency *blkiolat;
|
||||
struct rq_depth rq_depth;
|
||||
unsigned int max_depth;
|
||||
struct rq_wait rq_wait;
|
||||
atomic64_t window_start;
|
||||
atomic_t scale_cookie;
|
||||
@ -280,7 +280,7 @@ static void iolat_cleanup_cb(struct rq_wait *rqw, void *private_data)
|
||||
static bool iolat_acquire_inflight(struct rq_wait *rqw, void *private_data)
|
||||
{
|
||||
struct iolatency_grp *iolat = private_data;
|
||||
return rq_wait_inc_below(rqw, iolat->rq_depth.max_depth);
|
||||
return rq_wait_inc_below(rqw, iolat->max_depth);
|
||||
}
|
||||
|
||||
static void __blkcg_iolatency_throttle(struct rq_qos *rqos,
|
||||
@ -364,15 +364,17 @@ static void scale_cookie_change(struct blk_iolatency *blkiolat,
|
||||
}
|
||||
|
||||
/*
|
||||
* Change the queue depth of the iolatency_grp. We add/subtract 1/16th of the
|
||||
* Change the queue depth of the iolatency_grp. We add 1/16th of the
|
||||
* queue depth at a time so we don't get wild swings and hopefully dial in to
|
||||
* fairer distribution of the overall queue depth.
|
||||
* fairer distribution of the overall queue depth. We halve the queue depth
|
||||
* at a time so we can scale down queue depth quickly from default unlimited
|
||||
* to target.
|
||||
*/
|
||||
static void scale_change(struct iolatency_grp *iolat, bool up)
|
||||
{
|
||||
unsigned long qd = iolat->blkiolat->rqos.q->nr_requests;
|
||||
unsigned long scale = scale_amount(qd, up);
|
||||
unsigned long old = iolat->rq_depth.max_depth;
|
||||
unsigned long old = iolat->max_depth;
|
||||
|
||||
if (old > qd)
|
||||
old = qd;
|
||||
@ -384,12 +386,12 @@ static void scale_change(struct iolatency_grp *iolat, bool up)
|
||||
if (old < qd) {
|
||||
old += scale;
|
||||
old = min(old, qd);
|
||||
iolat->rq_depth.max_depth = old;
|
||||
iolat->max_depth = old;
|
||||
wake_up_all(&iolat->rq_wait.wait);
|
||||
}
|
||||
} else {
|
||||
old >>= 1;
|
||||
iolat->rq_depth.max_depth = max(old, 1UL);
|
||||
iolat->max_depth = max(old, 1UL);
|
||||
}
|
||||
}
|
||||
|
||||
@ -403,9 +405,6 @@ static void check_scale_change(struct iolatency_grp *iolat)
|
||||
u64 scale_lat;
|
||||
int direction = 0;
|
||||
|
||||
if (lat_to_blkg(iolat)->parent == NULL)
|
||||
return;
|
||||
|
||||
parent = blkg_to_lat(lat_to_blkg(iolat)->parent);
|
||||
if (!parent)
|
||||
return;
|
||||
@ -445,7 +444,7 @@ static void check_scale_change(struct iolatency_grp *iolat)
|
||||
}
|
||||
|
||||
/* We're as low as we can go. */
|
||||
if (iolat->rq_depth.max_depth == 1 && direction < 0) {
|
||||
if (iolat->max_depth == 1 && direction < 0) {
|
||||
blkcg_use_delay(lat_to_blkg(iolat));
|
||||
return;
|
||||
}
|
||||
@ -453,7 +452,7 @@ static void check_scale_change(struct iolatency_grp *iolat)
|
||||
/* We're back to the default cookie, unthrottle all the things. */
|
||||
if (cur_cookie == DEFAULT_SCALE_COOKIE) {
|
||||
blkcg_clear_delay(lat_to_blkg(iolat));
|
||||
iolat->rq_depth.max_depth = UINT_MAX;
|
||||
iolat->max_depth = UINT_MAX;
|
||||
wake_up_all(&iolat->rq_wait.wait);
|
||||
return;
|
||||
}
|
||||
@ -508,7 +507,7 @@ static void iolatency_record_time(struct iolatency_grp *iolat,
|
||||
* We don't want to count issue_as_root bio's in the cgroups latency
|
||||
* statistics as it could skew the numbers downwards.
|
||||
*/
|
||||
if (unlikely(issue_as_root && iolat->rq_depth.max_depth != UINT_MAX)) {
|
||||
if (unlikely(issue_as_root && iolat->max_depth != UINT_MAX)) {
|
||||
u64 sub = iolat->min_lat_nsec;
|
||||
if (req_time < sub)
|
||||
blkcg_add_delay(lat_to_blkg(iolat), now, sub - req_time);
|
||||
@ -920,7 +919,7 @@ static void iolatency_ssd_stat(struct iolatency_grp *iolat, struct seq_file *s)
|
||||
}
|
||||
preempt_enable();
|
||||
|
||||
if (iolat->rq_depth.max_depth == UINT_MAX)
|
||||
if (iolat->max_depth == UINT_MAX)
|
||||
seq_printf(s, " missed=%llu total=%llu depth=max",
|
||||
(unsigned long long)stat.ps.missed,
|
||||
(unsigned long long)stat.ps.total);
|
||||
@ -928,7 +927,7 @@ static void iolatency_ssd_stat(struct iolatency_grp *iolat, struct seq_file *s)
|
||||
seq_printf(s, " missed=%llu total=%llu depth=%u",
|
||||
(unsigned long long)stat.ps.missed,
|
||||
(unsigned long long)stat.ps.total,
|
||||
iolat->rq_depth.max_depth);
|
||||
iolat->max_depth);
|
||||
}
|
||||
|
||||
static void iolatency_pd_stat(struct blkg_policy_data *pd, struct seq_file *s)
|
||||
@ -945,12 +944,12 @@ static void iolatency_pd_stat(struct blkg_policy_data *pd, struct seq_file *s)
|
||||
|
||||
avg_lat = div64_u64(iolat->lat_avg, NSEC_PER_USEC);
|
||||
cur_win = div64_u64(iolat->cur_win_nsec, NSEC_PER_MSEC);
|
||||
if (iolat->rq_depth.max_depth == UINT_MAX)
|
||||
if (iolat->max_depth == UINT_MAX)
|
||||
seq_printf(s, " depth=max avg_lat=%llu win=%llu",
|
||||
avg_lat, cur_win);
|
||||
else
|
||||
seq_printf(s, " depth=%u avg_lat=%llu win=%llu",
|
||||
iolat->rq_depth.max_depth, avg_lat, cur_win);
|
||||
iolat->max_depth, avg_lat, cur_win);
|
||||
}
|
||||
|
||||
static struct blkg_policy_data *iolatency_pd_alloc(gfp_t gfp,
|
||||
@ -994,9 +993,7 @@ static void iolatency_pd_init(struct blkg_policy_data *pd)
|
||||
latency_stat_init(iolat, &iolat->cur_stat);
|
||||
rq_wait_init(&iolat->rq_wait);
|
||||
spin_lock_init(&iolat->child_lat.lock);
|
||||
iolat->rq_depth.queue_depth = blkg->q->nr_requests;
|
||||
iolat->rq_depth.max_depth = UINT_MAX;
|
||||
iolat->rq_depth.default_depth = iolat->rq_depth.queue_depth;
|
||||
iolat->max_depth = UINT_MAX;
|
||||
iolat->blkiolat = blkiolat;
|
||||
iolat->cur_win_nsec = 100 * NSEC_PER_MSEC;
|
||||
atomic64_set(&iolat->window_start, now);
|
||||
|
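The iolatency changes move max_depth out of rq_depth but keep the scaling policy the updated comment describes: creep the per-cgroup queue depth up by 1/16th of the device queue depth when latencies look good, and halve it when they do not, so an unlimited default can be pulled down to target quickly. A tiny standalone demonstration of that asymmetric adjustment, with invented numbers:

#include <stdio.h>

/* Additive-increase / halving-decrease, as described for scale_change(). */
static unsigned long scale_step(unsigned long qd, unsigned long depth, int up)
{
	unsigned long scale = qd / 16;

	if (scale == 0)
		scale = 1;
	if (depth > qd)
		depth = qd;

	if (up) {
		depth += scale;		/* gentle ramp up */
		if (depth > qd)
			depth = qd;
	} else {
		depth >>= 1;		/* aggressive ramp down */
		if (depth < 1)
			depth = 1;
	}
	return depth;
}

int main(void)
{
	unsigned long qd = 128, depth = qd;	/* invented device queue depth */
	int i;

	for (i = 0; i < 3; i++)			/* three bad windows: halve */
		printf("down -> %lu\n", depth = scale_step(qd, depth, 0));
	for (i = 0; i < 3; i++)			/* three good windows: creep up */
		printf("up   -> %lu\n", depth = scale_step(qd, depth, 1));
	return 0;
}
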
@@ -267,6 +267,7 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter,
{
unsigned int max_sectors = queue_max_hw_sectors(rq->q);
unsigned int nr_vecs = iov_iter_npages(iter, BIO_MAX_VECS);
unsigned int gup_flags = 0;
struct bio *bio;
int ret;
int j;
@@ -278,6 +279,9 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter,
if (bio == NULL)
return -ENOMEM;

if (blk_queue_pci_p2pdma(rq->q))
gup_flags |= FOLL_PCI_P2PDMA;

while (iov_iter_count(iter)) {
struct page **pages, *stack_pages[UIO_FASTIOV];
ssize_t bytes;
@@ -286,11 +290,11 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter,

if (nr_vecs <= ARRAY_SIZE(stack_pages)) {
pages = stack_pages;
bytes = iov_iter_get_pages2(iter, pages, LONG_MAX,
nr_vecs, &offs);
bytes = iov_iter_get_pages(iter, pages, LONG_MAX,
nr_vecs, &offs, gup_flags);
} else {
bytes = iov_iter_get_pages_alloc2(iter, &pages,
LONG_MAX, &offs);
bytes = iov_iter_get_pages_alloc(iter, &pages,
LONG_MAX, &offs, gup_flags);
}
if (unlikely(bytes <= 0)) {
ret = bytes ? bytes : -EFAULT;
@@ -555,7 +559,7 @@ static int blk_rq_map_user_bvec(struct request *rq, const struct iov_iter *iter)
size_t nr_iter = iov_iter_count(iter);
size_t nr_segs = iter->nr_segs;
struct bio_vec *bvecs, *bvprvp = NULL;
struct queue_limits *lim = &q->limits;
const struct queue_limits *lim = &q->limits;
unsigned int nsegs = 0, bytes = 0;
struct bio *bio;
size_t i;

@@ -100,13 +100,14 @@ static inline bool req_gap_front_merge(struct request *req, struct bio *bio)
* is defined as 'unsigned int', meantime it has to be aligned to with the
* logical block size, which is the minimum accepted unit by hardware.
*/
static unsigned int bio_allowed_max_sectors(struct queue_limits *lim)
static unsigned int bio_allowed_max_sectors(const struct queue_limits *lim)
{
return round_down(UINT_MAX, lim->logical_block_size) >> SECTOR_SHIFT;
}

static struct bio *bio_split_discard(struct bio *bio, struct queue_limits *lim,
unsigned *nsegs, struct bio_set *bs)
static struct bio *bio_split_discard(struct bio *bio,
const struct queue_limits *lim,
unsigned *nsegs, struct bio_set *bs)
{
unsigned int max_discard_sectors, granularity;
sector_t tmp;
@@ -146,7 +147,8 @@ static struct bio *bio_split_discard(struct bio *bio, struct queue_limits *lim,
}

static struct bio *bio_split_write_zeroes(struct bio *bio,
struct queue_limits *lim, unsigned *nsegs, struct bio_set *bs)
const struct queue_limits *lim,
unsigned *nsegs, struct bio_set *bs)
{
*nsegs = 0;
if (!lim->max_write_zeroes_sectors)
@@ -165,7 +167,7 @@ static struct bio *bio_split_write_zeroes(struct bio *bio,
* aligned to a physical block boundary.
*/
static inline unsigned get_max_io_size(struct bio *bio,
struct queue_limits *lim)
const struct queue_limits *lim)
{
unsigned pbs = lim->physical_block_size >> SECTOR_SHIFT;
unsigned lbs = lim->logical_block_size >> SECTOR_SHIFT;
@@ -184,7 +186,15 @@ static inline unsigned get_max_io_size(struct bio *bio,
return max_sectors & ~(lbs - 1);
}

static inline unsigned get_max_segment_size(struct queue_limits *lim,
/**
* get_max_segment_size() - maximum number of bytes to add as a single segment
* @lim: Request queue limits.
* @start_page: See below.
* @offset: Offset from @start_page where to add a segment.
*
* Returns the maximum number of bytes that can be added as a single segment.
*/
static inline unsigned get_max_segment_size(const struct queue_limits *lim,
struct page *start_page, unsigned long offset)
{
unsigned long mask = lim->seg_boundary_mask;
@@ -192,11 +202,10 @@ static inline unsigned get_max_segment_size(struct queue_limits *lim,
offset = mask & (page_to_phys(start_page) + offset);

/*
* overflow may be triggered in case of zero page physical address
* on 32bit arch, use queue's max segment size when that happens.
* Prevent an overflow if mask = ULONG_MAX and offset = 0 by adding 1
* after having calculated the minimum.
*/
return min_not_zero(mask - offset + 1,
(unsigned long)lim->max_segment_size);
return min(mask - offset, (unsigned long)lim->max_segment_size - 1) + 1;
}

/**
@@ -219,9 +228,9 @@ static inline unsigned get_max_segment_size(struct queue_limits *lim,
* *@nsegs segments and *@sectors sectors would make that bio unacceptable for
* the block driver.
*/
static bool bvec_split_segs(struct queue_limits *lim, const struct bio_vec *bv,
unsigned *nsegs, unsigned *bytes, unsigned max_segs,
unsigned max_bytes)
static bool bvec_split_segs(const struct queue_limits *lim,
const struct bio_vec *bv, unsigned *nsegs, unsigned *bytes,
unsigned max_segs, unsigned max_bytes)
{
unsigned max_len = min(max_bytes, UINT_MAX) - *bytes;
unsigned len = min(bv->bv_len, max_len);
@@ -267,7 +276,7 @@ static bool bvec_split_segs(struct queue_limits *lim, const struct bio_vec *bv,
* responsible for ensuring that @bs is only destroyed after processing of the
* split bio has finished.
*/
static struct bio *bio_split_rw(struct bio *bio, struct queue_limits *lim,
static struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim,
unsigned *segs, struct bio_set *bs, unsigned max_bytes)
{
struct bio_vec bv, bvprv, *bvprvp = NULL;
@@ -331,8 +340,9 @@ split:
* The split bio is allocated from @q->bio_split, which is provided by the
* block layer.
*/
struct bio *__bio_split_to_limits(struct bio *bio, struct queue_limits *lim,
unsigned int *nr_segs)
struct bio *__bio_split_to_limits(struct bio *bio,
const struct queue_limits *lim,
unsigned int *nr_segs)
{
struct bio_set *bs = &bio->bi_bdev->bd_disk->bio_split;
struct bio *split;
@@ -377,7 +387,7 @@ struct bio *__bio_split_to_limits(struct bio *bio, struct queue_limits *lim,
*/
struct bio *bio_split_to_limits(struct bio *bio)
{
struct queue_limits *lim = &bdev_get_queue(bio->bi_bdev)->limits;
const struct queue_limits *lim = &bdev_get_queue(bio->bi_bdev)->limits;
unsigned int nr_segs;

if (bio_may_exceed_limits(bio, lim))

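The reworked return statement in get_max_segment_size() is written so the +1 happens only after the min(): with a seg_boundary_mask of ULONG_MAX (no boundary) and offset 0, computing mask - offset + 1 first wraps to 0, which is exactly why the old code needed min_not_zero(). A small standalone check of the two formulations, with an invented max_segment_size:

#include <limits.h>
#include <stdio.h>

static unsigned long naive(unsigned long mask, unsigned long off,
			   unsigned long max_seg)
{
	unsigned long len = mask - off + 1;	/* wraps to 0 here */

	return len < max_seg ? len : max_seg;
}

static unsigned long reworked(unsigned long mask, unsigned long off,
			      unsigned long max_seg)
{
	unsigned long a = mask - off, b = max_seg - 1;

	return (a < b ? a : b) + 1;		/* +1 only after the min() */
}

int main(void)
{
	unsigned long max_seg = 65536;		/* invented limit */

	printf("naive:    %lu\n", naive(ULONG_MAX, 0, max_seg));    /* 0     */
	printf("reworked: %lu\n", reworked(ULONG_MAX, 0, max_seg)); /* 65536 */
	return 0;
}
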
@@ -555,6 +555,7 @@ static int blk_mq_init_sched_shared_tags(struct request_queue *queue)
return 0;
}

/* caller must have a reference to @e, will grab another one if successful */
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
{
unsigned int flags = q->tag_set->flags;
@@ -563,13 +564,6 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
unsigned long i;
int ret;

if (!e) {
blk_queue_flag_clear(QUEUE_FLAG_SQ_SCHED, q);
q->elevator = NULL;
q->nr_requests = q->tag_set->queue_depth;
return 0;
}

/*
* Default to double of smaller one between hw queue_depth and 128,
* since we don't split into sync/async like the old code did.

@@ -185,7 +185,7 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx)
{
struct request_queue *q = hctx->queue;
struct blk_mq_ctx *ctx;
int i, ret;
int i, j, ret;

if (!hctx->nr_ctx)
return 0;
@@ -197,9 +197,16 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx)
hctx_for_each_ctx(hctx, ctx, i) {
ret = kobject_add(&ctx->kobj, &hctx->kobj, "cpu%u", ctx->cpu);
if (ret)
break;
goto out;
}

return 0;
out:
hctx_for_each_ctx(hctx, ctx, j) {
if (j < i)
kobject_del(&ctx->kobj);
}
kobject_del(&hctx->kobj);
return ret;
}

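The blk_mq_register_hctx() fix above switches from silently stopping on the first kobject_add() failure to unwinding every ctx kobject that was already added before returning the error. The same unwind-only-what-succeeded pattern in a self-contained form, with generic "items" standing in for the kobjects and item 2 forced to fail:

#include <stdio.h>

#define NR_ITEMS 4

/* Stand-ins for kobject_add()/kobject_del(); item 2 is made to fail. */
static int add_item(int i)
{
	if (i == 2)
		return -1;
	printf("added %d\n", i);
	return 0;
}

static void del_item(int i)
{
	printf("deleted %d\n", i);
}

static int register_all(void)
{
	int i, j, ret;

	for (i = 0; i < NR_ITEMS; i++) {
		ret = add_item(i);
		if (ret)
			goto out;
	}
	return 0;
out:
	/* unwind only the items that were successfully added */
	for (j = 0; j < i; j++)
		del_item(j);
	return ret;
}

int main(void)
{
	return register_all() ? 1 : 0;
}
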
block/blk-mq.c
@ -254,15 +254,17 @@ EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);
|
||||
|
||||
/**
|
||||
* blk_mq_wait_quiesce_done() - wait until in-progress quiesce is done
|
||||
* @q: request queue.
|
||||
* @set: tag_set to wait on
|
||||
*
|
||||
* Note: it is driver's responsibility for making sure that quiesce has
|
||||
* been started.
|
||||
* been started on or more of the request_queues of the tag_set. This
|
||||
* function only waits for the quiesce on those request_queues that had
|
||||
* the quiesce flag set using blk_mq_quiesce_queue_nowait.
|
||||
*/
|
||||
void blk_mq_wait_quiesce_done(struct request_queue *q)
|
||||
void blk_mq_wait_quiesce_done(struct blk_mq_tag_set *set)
|
||||
{
|
||||
if (blk_queue_has_srcu(q))
|
||||
synchronize_srcu(q->srcu);
|
||||
if (set->flags & BLK_MQ_F_BLOCKING)
|
||||
synchronize_srcu(set->srcu);
|
||||
else
|
||||
synchronize_rcu();
|
||||
}
|
||||
@ -280,7 +282,9 @@ EXPORT_SYMBOL_GPL(blk_mq_wait_quiesce_done);
|
||||
void blk_mq_quiesce_queue(struct request_queue *q)
|
||||
{
|
||||
blk_mq_quiesce_queue_nowait(q);
|
||||
blk_mq_wait_quiesce_done(q);
|
||||
/* nothing to wait for non-mq queues */
|
||||
if (queue_is_mq(q))
|
||||
blk_mq_wait_quiesce_done(q->tag_set);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);
|
||||
|
||||
@ -311,6 +315,33 @@ void blk_mq_unquiesce_queue(struct request_queue *q)
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue);
|
||||
|
||||
void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set)
|
||||
{
|
||||
struct request_queue *q;
|
||||
|
||||
mutex_lock(&set->tag_list_lock);
|
||||
list_for_each_entry(q, &set->tag_list, tag_set_list) {
|
||||
if (!blk_queue_skip_tagset_quiesce(q))
|
||||
blk_mq_quiesce_queue_nowait(q);
|
||||
}
|
||||
blk_mq_wait_quiesce_done(set);
|
||||
mutex_unlock(&set->tag_list_lock);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(blk_mq_quiesce_tagset);
|
||||
|
||||
void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set)
|
||||
{
|
||||
struct request_queue *q;
|
||||
|
||||
mutex_lock(&set->tag_list_lock);
|
||||
list_for_each_entry(q, &set->tag_list, tag_set_list) {
|
||||
if (!blk_queue_skip_tagset_quiesce(q))
|
||||
blk_mq_unquiesce_queue(q);
|
||||
}
|
||||
mutex_unlock(&set->tag_list_lock);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(blk_mq_unquiesce_tagset);
|
||||
|
||||
void blk_mq_wake_waiters(struct request_queue *q)
|
||||
{
|
||||
struct blk_mq_hw_ctx *hctx;
|
||||
@ -544,25 +575,26 @@ static struct request *blk_mq_alloc_cached_request(struct request_queue *q,
|
||||
|
||||
if (!plug)
|
||||
return NULL;
|
||||
|
||||
if (rq_list_empty(plug->cached_rq)) {
|
||||
if (plug->nr_ios == 1)
|
||||
return NULL;
|
||||
rq = blk_mq_rq_cache_fill(q, plug, opf, flags);
|
||||
if (rq)
|
||||
goto got_it;
|
||||
return NULL;
|
||||
if (!rq)
|
||||
return NULL;
|
||||
} else {
|
||||
rq = rq_list_peek(&plug->cached_rq);
|
||||
if (!rq || rq->q != q)
|
||||
return NULL;
|
||||
|
||||
if (blk_mq_get_hctx_type(opf) != rq->mq_hctx->type)
|
||||
return NULL;
|
||||
if (op_is_flush(rq->cmd_flags) != op_is_flush(opf))
|
||||
return NULL;
|
||||
|
||||
plug->cached_rq = rq_list_next(rq);
|
||||
}
|
||||
rq = rq_list_peek(&plug->cached_rq);
|
||||
if (!rq || rq->q != q)
|
||||
return NULL;
|
||||
|
||||
if (blk_mq_get_hctx_type(opf) != rq->mq_hctx->type)
|
||||
return NULL;
|
||||
if (op_is_flush(rq->cmd_flags) != op_is_flush(opf))
|
||||
return NULL;
|
||||
|
||||
plug->cached_rq = rq_list_next(rq);
|
||||
got_it:
|
||||
rq->cmd_flags = opf;
|
||||
INIT_LIST_HEAD(&rq->queuelist);
|
||||
return rq;
|
||||
@ -1529,7 +1561,13 @@ static void blk_mq_rq_timed_out(struct request *req)
|
||||
blk_add_timer(req);
|
||||
}
|
||||
|
||||
static bool blk_mq_req_expired(struct request *rq, unsigned long *next)
|
||||
struct blk_expired_data {
|
||||
bool has_timedout_rq;
|
||||
unsigned long next;
|
||||
unsigned long timeout_start;
|
||||
};
|
||||
|
||||
static bool blk_mq_req_expired(struct request *rq, struct blk_expired_data *expired)
|
||||
{
|
||||
unsigned long deadline;
|
||||
|
||||
@ -1539,13 +1577,13 @@ static bool blk_mq_req_expired(struct request *rq, unsigned long *next)
|
||||
return false;
|
||||
|
||||
deadline = READ_ONCE(rq->deadline);
|
||||
if (time_after_eq(jiffies, deadline))
|
||||
if (time_after_eq(expired->timeout_start, deadline))
|
||||
return true;
|
||||
|
||||
if (*next == 0)
|
||||
*next = deadline;
|
||||
else if (time_after(*next, deadline))
|
||||
*next = deadline;
|
||||
if (expired->next == 0)
|
||||
expired->next = deadline;
|
||||
else if (time_after(expired->next, deadline))
|
||||
expired->next = deadline;
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -1561,7 +1599,7 @@ void blk_mq_put_rq_ref(struct request *rq)
|
||||
|
||||
static bool blk_mq_check_expired(struct request *rq, void *priv)
|
||||
{
|
||||
unsigned long *next = priv;
|
||||
struct blk_expired_data *expired = priv;
|
||||
|
||||
/*
|
||||
* blk_mq_queue_tag_busy_iter() has locked the request, so it cannot
|
||||
@ -1570,7 +1608,18 @@ static bool blk_mq_check_expired(struct request *rq, void *priv)
|
||||
* it was completed and reallocated as a new request after returning
|
||||
* from blk_mq_check_expired().
|
||||
*/
|
||||
if (blk_mq_req_expired(rq, next))
|
||||
if (blk_mq_req_expired(rq, expired)) {
|
||||
expired->has_timedout_rq = true;
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool blk_mq_handle_expired(struct request *rq, void *priv)
|
||||
{
|
||||
struct blk_expired_data *expired = priv;
|
||||
|
||||
if (blk_mq_req_expired(rq, expired))
|
||||
blk_mq_rq_timed_out(rq);
|
||||
return true;
|
||||
}
|
||||
@ -1579,7 +1628,9 @@ static void blk_mq_timeout_work(struct work_struct *work)
|
||||
{
|
||||
struct request_queue *q =
|
||||
container_of(work, struct request_queue, timeout_work);
|
||||
unsigned long next = 0;
|
||||
struct blk_expired_data expired = {
|
||||
.timeout_start = jiffies,
|
||||
};
|
||||
struct blk_mq_hw_ctx *hctx;
|
||||
unsigned long i;
|
||||
|
||||
@ -1599,10 +1650,23 @@ static void blk_mq_timeout_work(struct work_struct *work)
|
||||
if (!percpu_ref_tryget(&q->q_usage_counter))
|
||||
return;
|
||||
|
||||
blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &next);
|
||||
/* check if there is any timed-out request */
|
||||
blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &expired);
|
||||
if (expired.has_timedout_rq) {
|
||||
/*
|
||||
* Before walking tags, we must ensure any submit started
|
||||
* before the current time has finished. Since the submit
|
||||
* uses srcu or rcu, wait for a synchronization point to
|
||||
* ensure all running submits have finished
|
||||
*/
|
||||
blk_mq_wait_quiesce_done(q->tag_set);
|
||||
|
||||
if (next != 0) {
|
||||
mod_timer(&q->timeout, next);
|
||||
expired.next = 0;
|
||||
blk_mq_queue_tag_busy_iter(q, blk_mq_handle_expired, &expired);
|
||||
}
|
||||
|
||||
if (expired.next != 0) {
|
||||
mod_timer(&q->timeout, expired.next);
|
||||
} else {
|
||||
/*
|
||||
* Request timeouts are handled as a forward rolling timer. If
|
||||
@ -3248,21 +3312,22 @@ static struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
|
||||
tags->rqs = kcalloc_node(nr_tags, sizeof(struct request *),
|
||||
GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
|
||||
node);
|
||||
if (!tags->rqs) {
|
||||
blk_mq_free_tags(tags);
|
||||
return NULL;
|
||||
}
|
||||
if (!tags->rqs)
|
||||
goto err_free_tags;
|
||||
|
||||
tags->static_rqs = kcalloc_node(nr_tags, sizeof(struct request *),
|
||||
GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
|
||||
node);
|
||||
if (!tags->static_rqs) {
|
||||
kfree(tags->rqs);
|
||||
blk_mq_free_tags(tags);
|
||||
return NULL;
|
||||
}
|
||||
if (!tags->static_rqs)
|
||||
goto err_free_rqs;
|
||||
|
||||
return tags;
|
||||
|
||||
err_free_rqs:
|
||||
kfree(tags->rqs);
|
||||
err_free_tags:
|
||||
blk_mq_free_tags(tags);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
|
||||
@ -3975,7 +4040,7 @@ static struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
|
||||
struct request_queue *q;
|
||||
int ret;
|
||||
|
||||
q = blk_alloc_queue(set->numa_node, set->flags & BLK_MQ_F_BLOCKING);
|
||||
q = blk_alloc_queue(set->numa_node);
|
||||
if (!q)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
q->queuedata = queuedata;
|
||||
@ -4011,14 +4076,11 @@ void blk_mq_destroy_queue(struct request_queue *q)
|
||||
|
||||
blk_queue_flag_set(QUEUE_FLAG_DYING, q);
|
||||
blk_queue_start_drain(q);
|
||||
blk_freeze_queue(q);
|
||||
blk_mq_freeze_queue_wait(q);
|
||||
|
||||
blk_sync_queue(q);
|
||||
blk_mq_cancel_work_sync(q);
|
||||
blk_mq_exit_queue(q);
|
||||
|
||||
/* @q is and will stay empty, shutdown and put */
|
||||
blk_put_queue(q);
|
||||
}
|
||||
EXPORT_SYMBOL(blk_mq_destroy_queue);
|
||||
|
||||
@ -4035,6 +4097,7 @@ struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata,
|
||||
disk = __alloc_disk_node(q, set->numa_node, lkclass);
|
||||
if (!disk) {
|
||||
blk_mq_destroy_queue(q);
|
||||
blk_put_queue(q);
|
||||
return ERR_PTR(-ENOMEM);
|
||||
}
|
||||
set_bit(GD_OWNS_QUEUE, &disk->state);
|
||||
@ -4147,9 +4210,6 @@ static void blk_mq_update_poll_flag(struct request_queue *q)
|
||||
int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
|
||||
struct request_queue *q)
|
||||
{
|
||||
WARN_ON_ONCE(blk_queue_has_srcu(q) !=
|
||||
!!(set->flags & BLK_MQ_F_BLOCKING));
|
||||
|
||||
/* mark the queue as mq asap */
|
||||
q->mq_ops = set->ops;
|
||||
|
||||
@ -4325,12 +4385,12 @@ static void blk_mq_update_queue_map(struct blk_mq_tag_set *set)
|
||||
}
|
||||
|
||||
static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set,
|
||||
int cur_nr_hw_queues, int new_nr_hw_queues)
|
||||
int new_nr_hw_queues)
|
||||
{
|
||||
struct blk_mq_tags **new_tags;
|
||||
|
||||
if (cur_nr_hw_queues >= new_nr_hw_queues)
|
||||
return 0;
|
||||
if (set->nr_hw_queues >= new_nr_hw_queues)
|
||||
goto done;
|
||||
|
||||
new_tags = kcalloc_node(new_nr_hw_queues, sizeof(struct blk_mq_tags *),
|
||||
GFP_KERNEL, set->numa_node);
|
||||
@ -4338,21 +4398,15 @@ static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set,
|
||||
return -ENOMEM;
|
||||
|
||||
if (set->tags)
|
||||
memcpy(new_tags, set->tags, cur_nr_hw_queues *
|
||||
memcpy(new_tags, set->tags, set->nr_hw_queues *
|
||||
sizeof(*set->tags));
|
||||
kfree(set->tags);
|
||||
set->tags = new_tags;
|
||||
done:
|
||||
set->nr_hw_queues = new_nr_hw_queues;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int blk_mq_alloc_tag_set_tags(struct blk_mq_tag_set *set,
|
||||
int new_nr_hw_queues)
|
||||
{
|
||||
return blk_mq_realloc_tag_set_tags(set, 0, new_nr_hw_queues);
|
||||
}
|
||||
|
||||
/*
|
||||
* Alloc a tag set to be associated with one or more request queues.
|
||||
* May fail with EINVAL for various error conditions. May adjust the
|
||||
@ -4406,10 +4460,22 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
|
||||
if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids)
|
||||
set->nr_hw_queues = nr_cpu_ids;
|
||||
|
||||
if (blk_mq_alloc_tag_set_tags(set, set->nr_hw_queues) < 0)
|
||||
return -ENOMEM;
|
||||
if (set->flags & BLK_MQ_F_BLOCKING) {
|
||||
set->srcu = kmalloc(sizeof(*set->srcu), GFP_KERNEL);
|
||||
if (!set->srcu)
|
||||
return -ENOMEM;
|
||||
ret = init_srcu_struct(set->srcu);
|
||||
if (ret)
|
||||
goto out_free_srcu;
|
||||
}
|
||||
|
||||
ret = -ENOMEM;
|
||||
set->tags = kcalloc_node(set->nr_hw_queues,
|
||||
sizeof(struct blk_mq_tags *), GFP_KERNEL,
|
||||
set->numa_node);
|
||||
if (!set->tags)
|
||||
goto out_cleanup_srcu;
|
||||
|
||||
for (i = 0; i < set->nr_maps; i++) {
|
||||
set->map[i].mq_map = kcalloc_node(nr_cpu_ids,
|
||||
sizeof(set->map[i].mq_map[0]),
|
||||
@ -4437,6 +4503,12 @@ out_free_mq_map:
|
||||
}
|
||||
kfree(set->tags);
|
||||
set->tags = NULL;
|
||||
out_cleanup_srcu:
|
||||
if (set->flags & BLK_MQ_F_BLOCKING)
|
||||
cleanup_srcu_struct(set->srcu);
|
||||
out_free_srcu:
|
||||
if (set->flags & BLK_MQ_F_BLOCKING)
|
||||
kfree(set->srcu);
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL(blk_mq_alloc_tag_set);
|
||||
@ -4476,6 +4548,10 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
|
||||
|
||||
kfree(set->tags);
|
||||
set->tags = NULL;
|
||||
if (set->flags & BLK_MQ_F_BLOCKING) {
|
||||
cleanup_srcu_struct(set->srcu);
|
||||
kfree(set->srcu);
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL(blk_mq_free_tag_set);
|
||||
|
||||
@ -4564,17 +4640,10 @@ static bool blk_mq_elv_switch_none(struct list_head *head,
|
||||
INIT_LIST_HEAD(&qe->node);
|
||||
qe->q = q;
|
||||
qe->type = q->elevator->type;
|
||||
/* keep a reference to the elevator module as we'll switch back */
|
||||
__elevator_get(qe->type);
|
||||
list_add(&qe->node, head);
|
||||
|
||||
/*
|
||||
* After elevator_switch, the previous elevator_queue will be
|
||||
* released by elevator_release. The reference of the io scheduler
|
||||
* module get by elevator_get will also be put. So we need to get
|
||||
* a reference of the io scheduler module here to prevent it to be
|
||||
* removed.
|
||||
*/
|
||||
__module_get(qe->type->elevator_owner);
|
||||
elevator_switch(q, NULL);
|
||||
elevator_disable(q);
|
||||
mutex_unlock(&q->sysfs_lock);
|
||||
|
||||
return true;
|
||||
@ -4607,6 +4676,8 @@ static void blk_mq_elv_switch_back(struct list_head *head,
|
||||
|
||||
mutex_lock(&q->sysfs_lock);
|
||||
elevator_switch(q, t);
|
||||
/* drop the reference acquired in blk_mq_elv_switch_none */
|
||||
elevator_put(t);
|
||||
mutex_unlock(&q->sysfs_lock);
|
||||
}
|
||||
|
||||
@ -4643,11 +4714,9 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
|
||||
}
|
||||
|
||||
prev_nr_hw_queues = set->nr_hw_queues;
|
||||
if (blk_mq_realloc_tag_set_tags(set, set->nr_hw_queues, nr_hw_queues) <
|
||||
0)
|
||||
if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0)
|
||||
goto reregister;
|
||||
|
||||
set->nr_hw_queues = nr_hw_queues;
|
||||
fallback:
|
||||
blk_mq_update_queue_map(set);
|
||||
list_for_each_entry(q, &set->tag_list, tag_set_list) {
|
||||
@ -4867,15 +4936,13 @@ EXPORT_SYMBOL(blk_mq_rq_cpu);
|
||||
|
||||
void blk_mq_cancel_work_sync(struct request_queue *q)
|
||||
{
|
||||
if (queue_is_mq(q)) {
|
||||
struct blk_mq_hw_ctx *hctx;
|
||||
unsigned long i;
|
||||
struct blk_mq_hw_ctx *hctx;
|
||||
unsigned long i;
|
||||
|
||||
cancel_delayed_work_sync(&q->requeue_work);
|
||||
cancel_delayed_work_sync(&q->requeue_work);
|
||||
|
||||
queue_for_each_hw_ctx(q, hctx, i)
|
||||
cancel_delayed_work_sync(&hctx->run_work);
|
||||
}
|
||||
queue_for_each_hw_ctx(q, hctx, i)
|
||||
cancel_delayed_work_sync(&hctx->run_work);
|
||||
}
|
||||
|
||||
static int __init blk_mq_init(void)
|
||||
|
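With the srcu_struct moved from the request_queue into the blk_mq_tag_set, quiesce now operates per tag set, which is what lets a driver pause every queue sharing that set (for example all namespaces of an nvme controller) with a single grace-period wait. A hedged, kernel-style sketch of the intended calling pattern during a controller reset, using only the blk_mq_quiesce_tagset()/blk_mq_unquiesce_tagset() signatures added above; the surrounding driver context is invented and this is not buildable standalone.

/* Sketch: pause and resume all request queues sharing one tag_set. */
static void example_reset_controller(struct blk_mq_tag_set *set)
{
	/* Marks every queue quiesced and waits for one RCU/SRCU grace period. */
	blk_mq_quiesce_tagset(set);

	/*
	 * ... reset the hardware, cancel or requeue outstanding requests ...
	 */

	blk_mq_unquiesce_tagset(set);
}
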
@@ -377,17 +377,17 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
/* run the code block in @dispatch_ops with rcu/srcu read lock held */
#define __blk_mq_run_dispatch_ops(q, check_sleep, dispatch_ops) \
do { \
if (!blk_queue_has_srcu(q)) { \
rcu_read_lock(); \
(dispatch_ops); \
rcu_read_unlock(); \
} else { \
if ((q)->tag_set->flags & BLK_MQ_F_BLOCKING) { \
int srcu_idx; \
\
might_sleep_if(check_sleep); \
srcu_idx = srcu_read_lock((q)->srcu); \
srcu_idx = srcu_read_lock((q)->tag_set->srcu); \
(dispatch_ops); \
srcu_read_unlock((q)->srcu, srcu_idx); \
srcu_read_unlock((q)->tag_set->srcu, srcu_idx); \
} else { \
rcu_read_lock(); \
(dispatch_ops); \
rcu_read_unlock(); \
} \
} while (0)

@@ -481,7 +481,7 @@ void blk_queue_io_opt(struct request_queue *q, unsigned int opt)
}
EXPORT_SYMBOL(blk_queue_io_opt);

static int queue_limit_alignment_offset(struct queue_limits *lim,
static int queue_limit_alignment_offset(const struct queue_limits *lim,
sector_t sector)
{
unsigned int granularity = max(lim->physical_block_size, lim->io_min);
@@ -491,8 +491,8 @@ static int queue_limit_alignment_offset(struct queue_limits *lim,
return (granularity + lim->alignment_offset - alignment) % granularity;
}

static unsigned int queue_limit_discard_alignment(struct queue_limits *lim,
sector_t sector)
static unsigned int queue_limit_discard_alignment(
const struct queue_limits *lim, sector_t sector)
{
unsigned int alignment, granularity, offset;

@ -470,6 +470,9 @@ static ssize_t queue_wb_lat_show(struct request_queue *q, char *page)
|
||||
if (!wbt_rq_qos(q))
|
||||
return -EINVAL;
|
||||
|
||||
if (wbt_disabled(q))
|
||||
return sprintf(page, "0\n");
|
||||
|
||||
return sprintf(page, "%llu\n", div_u64(wbt_get_min_lat(q), 1000));
|
||||
}
|
||||
|
||||
@ -680,8 +683,8 @@ static struct attribute *queue_attrs[] = {
|
||||
static umode_t queue_attr_visible(struct kobject *kobj, struct attribute *attr,
|
||||
int n)
|
||||
{
|
||||
struct request_queue *q =
|
||||
container_of(kobj, struct request_queue, kobj);
|
||||
struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj);
|
||||
struct request_queue *q = disk->queue;
|
||||
|
||||
if (attr == &queue_io_timeout_entry.attr &&
|
||||
(!q->mq_ops || !q->mq_ops->timeout))
|
||||
@ -707,8 +710,8 @@ static ssize_t
|
||||
queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
|
||||
{
|
||||
struct queue_sysfs_entry *entry = to_queue(attr);
|
||||
struct request_queue *q =
|
||||
container_of(kobj, struct request_queue, kobj);
|
||||
struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj);
|
||||
struct request_queue *q = disk->queue;
|
||||
ssize_t res;
|
||||
|
||||
if (!entry->show)
|
||||
@ -724,68 +727,19 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr,
|
||||
const char *page, size_t length)
|
||||
{
|
||||
struct queue_sysfs_entry *entry = to_queue(attr);
|
||||
struct request_queue *q;
|
||||
struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj);
|
||||
struct request_queue *q = disk->queue;
|
||||
ssize_t res;
|
||||
|
||||
if (!entry->store)
|
||||
return -EIO;
|
||||
|
||||
q = container_of(kobj, struct request_queue, kobj);
|
||||
mutex_lock(&q->sysfs_lock);
|
||||
res = entry->store(q, page, length);
|
||||
mutex_unlock(&q->sysfs_lock);
|
||||
return res;
|
||||
}
|
||||
|
||||
static void blk_free_queue_rcu(struct rcu_head *rcu_head)
|
||||
{
|
||||
struct request_queue *q = container_of(rcu_head, struct request_queue,
|
||||
rcu_head);
|
||||
|
||||
kmem_cache_free(blk_get_queue_kmem_cache(blk_queue_has_srcu(q)), q);
|
||||
}
|
||||
|
||||
/**
|
||||
* blk_release_queue - releases all allocated resources of the request_queue
|
||||
* @kobj: pointer to a kobject, whose container is a request_queue
|
||||
*
|
||||
* This function releases all allocated resources of the request queue.
|
||||
*
|
||||
* The struct request_queue refcount is incremented with blk_get_queue() and
|
||||
* decremented with blk_put_queue(). Once the refcount reaches 0 this function
|
||||
* is called.
|
||||
*
|
||||
* Drivers exist which depend on the release of the request_queue to be
|
||||
* synchronous, it should not be deferred.
|
||||
*
|
||||
* Context: can sleep
|
||||
*/
|
||||
static void blk_release_queue(struct kobject *kobj)
|
||||
{
|
||||
struct request_queue *q =
|
||||
container_of(kobj, struct request_queue, kobj);
|
||||
|
||||
might_sleep();
|
||||
|
||||
percpu_ref_exit(&q->q_usage_counter);
|
||||
|
||||
if (q->poll_stat)
|
||||
blk_stat_remove_callback(q, q->poll_cb);
|
||||
blk_stat_free_callback(q->poll_cb);
|
||||
|
||||
blk_free_queue_stats(q->stats);
|
||||
kfree(q->poll_stat);
|
||||
|
||||
if (queue_is_mq(q))
|
||||
blk_mq_release(q);
|
||||
|
||||
if (blk_queue_has_srcu(q))
|
||||
cleanup_srcu_struct(q->srcu);
|
||||
|
||||
ida_free(&blk_queue_ida, q->id);
|
||||
call_rcu(&q->rcu_head, blk_free_queue_rcu);
|
||||
}
|
||||
|
||||
static const struct sysfs_ops queue_sysfs_ops = {
|
||||
.show = queue_attr_show,
|
||||
.store = queue_attr_store,
|
||||
@ -796,12 +750,30 @@ static const struct attribute_group *blk_queue_attr_groups[] = {
|
||||
NULL
|
||||
};
|
||||
|
||||
struct kobj_type blk_queue_ktype = {
|
||||
static void blk_queue_release(struct kobject *kobj)
|
||||
{
|
||||
/* nothing to do here, all data is associated with the parent gendisk */
|
||||
}
|
||||
|
||||
static struct kobj_type blk_queue_ktype = {
|
||||
.default_groups = blk_queue_attr_groups,
|
||||
.sysfs_ops = &queue_sysfs_ops,
|
||||
.release = blk_release_queue,
|
||||
.release = blk_queue_release,
|
||||
};
|
||||
|
||||
static void blk_debugfs_remove(struct gendisk *disk)
|
||||
{
|
||||
struct request_queue *q = disk->queue;
|
||||
|
||||
mutex_lock(&q->debugfs_mutex);
|
||||
blk_trace_shutdown(q);
|
||||
debugfs_remove_recursive(q->debugfs_dir);
|
||||
q->debugfs_dir = NULL;
|
||||
q->sched_debugfs_dir = NULL;
|
||||
q->rqos_debugfs_dir = NULL;
|
||||
mutex_unlock(&q->debugfs_mutex);
|
||||
}
|
||||
|
||||
/**
|
||||
* blk_register_queue - register a block layer queue with sysfs
|
||||
* @disk: Disk of which the request queue should be registered with sysfs.
|
||||
@ -812,47 +784,47 @@ int blk_register_queue(struct gendisk *disk)
|
||||
int ret;
|
||||
|
||||
mutex_lock(&q->sysfs_dir_lock);
|
||||
|
||||
ret = kobject_add(&q->kobj, &disk_to_dev(disk)->kobj, "queue");
|
||||
kobject_init(&disk->queue_kobj, &blk_queue_ktype);
|
||||
ret = kobject_add(&disk->queue_kobj, &disk_to_dev(disk)->kobj, "queue");
|
||||
if (ret < 0)
|
||||
goto unlock;
|
||||
goto out_put_queue_kobj;
|
||||
|
||||
if (queue_is_mq(q))
|
||||
blk_mq_sysfs_register(disk);
|
||||
if (queue_is_mq(q)) {
|
||||
ret = blk_mq_sysfs_register(disk);
|
||||
if (ret)
|
||||
goto out_put_queue_kobj;
|
||||
}
|
||||
mutex_lock(&q->sysfs_lock);
|
||||
|
||||
mutex_lock(&q->debugfs_mutex);
|
||||
q->debugfs_dir = debugfs_create_dir(kobject_name(q->kobj.parent),
|
||||
blk_debugfs_root);
|
||||
q->debugfs_dir = debugfs_create_dir(disk->disk_name, blk_debugfs_root);
|
||||
if (queue_is_mq(q))
|
||||
blk_mq_debugfs_register(q);
|
||||
mutex_unlock(&q->debugfs_mutex);
|
||||
|
||||
ret = disk_register_independent_access_ranges(disk);
|
||||
if (ret)
|
||||
goto put_dev;
|
||||
goto out_debugfs_remove;
|
||||
|
||||
if (q->elevator) {
|
||||
ret = elv_register_queue(q, false);
|
||||
if (ret)
|
||||
goto put_dev;
|
||||
goto out_unregister_ia_ranges;
|
||||
}
|
||||
|
||||
ret = blk_crypto_sysfs_register(q);
|
||||
ret = blk_crypto_sysfs_register(disk);
|
||||
if (ret)
|
||||
goto put_dev;
|
||||
goto out_elv_unregister;
|
||||
|
||||
blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
|
||||
wbt_enable_default(q);
|
||||
blk_throtl_register(disk);
|
||||
|
||||
/* Now everything is ready and send out KOBJ_ADD uevent */
|
||||
kobject_uevent(&q->kobj, KOBJ_ADD);
|
||||
kobject_uevent(&disk->queue_kobj, KOBJ_ADD);
|
||||
if (q->elevator)
|
||||
kobject_uevent(&q->elevator->kobj, KOBJ_ADD);
|
||||
mutex_unlock(&q->sysfs_lock);
|
||||
|
||||
unlock:
|
||||
mutex_unlock(&q->sysfs_dir_lock);
|
||||
|
||||
/*
|
||||
@ -871,13 +843,16 @@ unlock:
|
||||
|
||||
return ret;
|
||||
|
||||
put_dev:
|
||||
out_elv_unregister:
|
||||
elv_unregister_queue(q);
|
||||
out_unregister_ia_ranges:
|
||||
disk_unregister_independent_access_ranges(disk);
|
||||
out_debugfs_remove:
|
||||
blk_debugfs_remove(disk);
|
||||
mutex_unlock(&q->sysfs_lock);
|
||||
out_put_queue_kobj:
|
||||
kobject_put(&disk->queue_kobj);
|
||||
mutex_unlock(&q->sysfs_dir_lock);
|
||||
kobject_del(&q->kobj);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
@ -915,7 +890,7 @@ void blk_unregister_queue(struct gendisk *disk)
|
||||
*/
|
||||
if (queue_is_mq(q))
|
||||
blk_mq_sysfs_unregister(disk);
|
||||
blk_crypto_sysfs_unregister(q);
|
||||
blk_crypto_sysfs_unregister(disk);
|
||||
|
||||
mutex_lock(&q->sysfs_lock);
|
||||
elv_unregister_queue(q);
|
||||
@ -923,15 +898,9 @@ void blk_unregister_queue(struct gendisk *disk)
|
||||
mutex_unlock(&q->sysfs_lock);
|
||||
|
||||
/* Now that we've deleted all child objects, we can delete the queue. */
|
||||
kobject_uevent(&q->kobj, KOBJ_REMOVE);
|
||||
kobject_del(&q->kobj);
|
||||
kobject_uevent(&disk->queue_kobj, KOBJ_REMOVE);
|
||||
kobject_del(&disk->queue_kobj);
|
||||
mutex_unlock(&q->sysfs_dir_lock);
|
||||
|
||||
mutex_lock(&q->debugfs_mutex);
|
||||
blk_trace_shutdown(q);
|
||||
debugfs_remove_recursive(q->debugfs_dir);
|
||||
q->debugfs_dir = NULL;
|
||||
q->sched_debugfs_dir = NULL;
|
||||
q->rqos_debugfs_dir = NULL;
|
||||
mutex_unlock(&q->debugfs_mutex);
|
||||
blk_debugfs_remove(disk);
|
||||
}
|
||||
|
@ -129,7 +129,7 @@ static struct throtl_data *sq_to_td(struct throtl_service_queue *sq)
|
||||
/*
|
||||
* cgroup's limit in LIMIT_MAX is scaled if low limit is set. This scale is to
|
||||
* make the IO dispatch more smooth.
|
||||
* Scale up: linearly scale up according to lapsed time since upgrade. For
|
||||
* Scale up: linearly scale up according to elapsed time since upgrade. For
|
||||
* every throtl_slice, the limit scales up 1/2 .low limit till the
|
||||
* limit hits .max limit
|
||||
* Scale down: exponentially scale down if a cgroup doesn't hit its .low limit
|
||||
@ -395,8 +395,9 @@ static void throtl_pd_init(struct blkg_policy_data *pd)
|
||||
* If on the default hierarchy, we switch to properly hierarchical
|
||||
* behavior where limits on a given throtl_grp are applied to the
|
||||
* whole subtree rather than just the group itself. e.g. If 16M
|
||||
* read_bps limit is set on the root group, the whole system can't
|
||||
* exceed 16M for the device.
|
||||
* read_bps limit is set on a parent group, summary bps of
|
||||
* parent group and its subtree groups can't exceed 16M for the
|
||||
* device.
|
||||
*
|
||||
* If not on the default hierarchy, the broken flat hierarchy
|
||||
* behavior is retained where all throtl_grps are treated as if
|
||||
@ -644,7 +645,7 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg,
|
||||
* that bandwidth. Do try to make use of that bandwidth while giving
|
||||
* credit.
|
||||
*/
|
||||
if (time_after_eq(start, tg->slice_start[rw]))
|
||||
if (time_after(start, tg->slice_start[rw]))
|
||||
tg->slice_start[rw] = start;
|
||||
|
||||
tg->slice_end[rw] = jiffies + tg->td->throtl_slice;
|
||||
@ -821,17 +822,15 @@ static void tg_update_carryover(struct throtl_grp *tg)
|
||||
tg->carryover_ios[READ], tg->carryover_ios[WRITE]);
|
||||
}
|
||||
|
||||
static bool tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio,
|
||||
u32 iops_limit, unsigned long *wait)
|
||||
static unsigned long tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio,
|
||||
u32 iops_limit)
|
||||
{
|
||||
bool rw = bio_data_dir(bio);
|
||||
unsigned int io_allowed;
|
||||
unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
|
||||
|
||||
if (iops_limit == UINT_MAX) {
|
||||
if (wait)
|
||||
*wait = 0;
|
||||
return true;
|
||||
return 0;
|
||||
}
|
||||
|
||||
jiffy_elapsed = jiffies - tg->slice_start[rw];
|
||||
@ -841,21 +840,16 @@ static bool tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio,
|
||||
io_allowed = calculate_io_allowed(iops_limit, jiffy_elapsed_rnd) +
|
||||
tg->carryover_ios[rw];
|
||||
if (tg->io_disp[rw] + 1 <= io_allowed) {
|
||||
if (wait)
|
||||
*wait = 0;
|
||||
return true;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Calc approx time to dispatch */
|
||||
jiffy_wait = jiffy_elapsed_rnd - jiffy_elapsed;
|
||||
|
||||
if (wait)
|
||||
*wait = jiffy_wait;
|
||||
return false;
|
||||
return jiffy_wait;
|
||||
}
|
||||
|
||||
static bool tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio,
|
||||
u64 bps_limit, unsigned long *wait)
|
||||
static unsigned long tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio,
|
||||
u64 bps_limit)
|
||||
{
|
||||
bool rw = bio_data_dir(bio);
|
||||
u64 bytes_allowed, extra_bytes;
|
||||
@ -864,9 +858,7 @@ static bool tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio,
|
||||
|
||||
/* no need to throttle if this bio's bytes have been accounted */
|
||||
if (bps_limit == U64_MAX || bio_flagged(bio, BIO_BPS_THROTTLED)) {
|
||||
if (wait)
|
||||
*wait = 0;
|
||||
return true;
|
||||
return 0;
|
||||
}
|
||||
|
||||
jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
|
||||
@ -879,9 +871,7 @@ static bool tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio,
|
||||
bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed_rnd) +
|
||||
tg->carryover_bytes[rw];
|
||||
if (tg->bytes_disp[rw] + bio_size <= bytes_allowed) {
|
||||
if (wait)
|
||||
*wait = 0;
|
||||
return true;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Calc approx time to dispatch */
|
||||
@ -896,9 +886,7 @@ static bool tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio,
|
||||
* up we did. Add that time also.
|
||||
*/
|
||||
jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed);
|
||||
if (wait)
|
||||
*wait = jiffy_wait;
|
||||
return false;
|
||||
return jiffy_wait;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -946,8 +934,9 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
|
||||
jiffies + tg->td->throtl_slice);
|
||||
}
|
||||
|
||||
if (tg_within_bps_limit(tg, bio, bps_limit, &bps_wait) &&
|
||||
tg_within_iops_limit(tg, bio, iops_limit, &iops_wait)) {
|
||||
bps_wait = tg_within_bps_limit(tg, bio, bps_limit);
|
||||
iops_wait = tg_within_iops_limit(tg, bio, iops_limit);
|
||||
if (bps_wait + iops_wait == 0) {
|
||||
if (wait)
|
||||
*wait = 0;
|
||||
return true;
|
||||
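Illustrative sketch, not part of the patch above: tg_within_bps_limit() and tg_within_iops_limit() now return a wait time in jiffies, with 0 meaning the bio fits within the limit, instead of a bool plus a *wait out-parameter. A standalone C model of that calling convention, with every name and constant below invented for the example:

#include <stdbool.h>

/* hypothetical stand-ins for the real per-limit helpers */
static unsigned long within_bps_limit(unsigned long bytes, unsigned long allowed)
{
	return bytes <= allowed ? 0 : 100;	/* 0 means "no wait needed" */
}

static unsigned long within_iops_limit(unsigned long ios, unsigned long allowed)
{
	return ios <= allowed ? 0 : 50;
}

static bool may_dispatch(unsigned long bytes, unsigned long bytes_allowed,
			 unsigned long ios, unsigned long ios_allowed,
			 unsigned long *wait)
{
	unsigned long bps_wait = within_bps_limit(bytes, bytes_allowed);
	unsigned long iops_wait = within_iops_limit(ios, ios_allowed);

	if (bps_wait + iops_wait == 0) {	/* both limits satisfied */
		if (wait)
			*wait = 0;
		return true;
	}
	if (wait)
		*wait = bps_wait > iops_wait ? bps_wait : iops_wait;
	return false;
}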
@ -1066,7 +1055,6 @@ static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
|
||||
sq->nr_queued[rw]--;
|
||||
|
||||
throtl_charge_bio(tg, bio);
|
||||
bio_set_flag(bio, BIO_BPS_THROTTLED);
|
||||
|
||||
/*
|
||||
* If our parent is another tg, we just need to transfer @bio to
|
||||
@ -1079,6 +1067,7 @@ static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
|
||||
throtl_add_bio_tg(bio, &tg->qnode_on_parent[rw], parent_tg);
|
||||
start_parent_slice_with_credit(tg, parent_tg, rw);
|
||||
} else {
|
||||
bio_set_flag(bio, BIO_BPS_THROTTLED);
|
||||
throtl_qnode_add_bio(bio, &tg->qnode_on_parent[rw],
|
||||
&parent_sq->queued[rw]);
|
||||
BUG_ON(tg->td->nr_queued[rw] <= 0);
|
||||
@ -1737,7 +1726,18 @@ void blk_throtl_cancel_bios(struct gendisk *disk)
|
||||
* Set the flag to make sure throtl_pending_timer_fn() won't
|
||||
* stop until all throttled bios are dispatched.
|
||||
*/
|
||||
blkg_to_tg(blkg)->flags |= THROTL_TG_CANCELING;
|
||||
tg->flags |= THROTL_TG_CANCELING;
|
||||
|
||||
/*
|
||||
* Do not dispatch cgroup without THROTL_TG_PENDING or cgroup
|
||||
* will be inserted to service queue without THROTL_TG_PENDING
|
||||
* set in tg_update_disptime below. Then IO dispatched from
|
||||
* child in tg_dispatch_one_bio will trigger double insertion
|
||||
* and corrupt the tree.
|
||||
*/
|
||||
if (!(tg->flags & THROTL_TG_PENDING))
|
||||
continue;
|
||||
|
||||
/*
|
||||
* Update disptime after setting the above flag to make sure
|
||||
* throtl_select_dispatch() won't exit without dispatching.
|
||||
@ -1762,7 +1762,6 @@ static unsigned long __tg_last_low_overflow_time(struct throtl_grp *tg)
|
||||
return min(rtime, wtime);
|
||||
}
|
||||
|
||||
/* tg should not be an intermediate node */
|
||||
static unsigned long tg_last_low_overflow_time(struct throtl_grp *tg)
|
||||
{
|
||||
struct throtl_service_queue *parent_sq;
|
||||
@ -1816,24 +1815,29 @@ static bool throtl_tg_is_idle(struct throtl_grp *tg)
|
||||
return ret;
|
||||
}
|
||||
|
||||
static bool throtl_tg_can_upgrade(struct throtl_grp *tg)
|
||||
static bool throtl_low_limit_reached(struct throtl_grp *tg, int rw)
|
||||
{
|
||||
struct throtl_service_queue *sq = &tg->service_queue;
|
||||
bool read_limit, write_limit;
|
||||
bool limit = tg->bps[rw][LIMIT_LOW] || tg->iops[rw][LIMIT_LOW];
|
||||
|
||||
/*
|
||||
* if cgroup reaches low limit (if low limit is 0, the cgroup always
|
||||
* reaches), it's ok to upgrade to next limit
|
||||
* if low limit is zero, low limit is always reached.
|
||||
* if low limit is non-zero, we can check if there is any request
|
||||
* is queued to determine if low limit is reached as we throttle
|
||||
* request according to limit.
|
||||
*/
|
||||
read_limit = tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW];
|
||||
write_limit = tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW];
|
||||
if (!read_limit && !write_limit)
|
||||
return true;
|
||||
if (read_limit && sq->nr_queued[READ] &&
|
||||
(!write_limit || sq->nr_queued[WRITE]))
|
||||
return true;
|
||||
if (write_limit && sq->nr_queued[WRITE] &&
|
||||
(!read_limit || sq->nr_queued[READ]))
|
||||
return !limit || sq->nr_queued[rw];
|
||||
}
|
||||
|
||||
static bool throtl_tg_can_upgrade(struct throtl_grp *tg)
|
||||
{
|
||||
/*
|
||||
* cgroup reaches low limit when low limit of READ and WRITE are
|
||||
* both reached, it's ok to upgrade to next limit if cgroup reaches
|
||||
* low limit
|
||||
*/
|
||||
if (throtl_low_limit_reached(tg, READ) &&
|
||||
throtl_low_limit_reached(tg, WRITE))
|
||||
return true;
|
||||
|
||||
if (time_after_eq(jiffies,
|
||||
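Side note, not from the patch: throtl_low_limit_reached() folds the old READ/WRITE special-casing into one per-direction predicate. A hedged standalone sketch of the same idea, using made-up types and field names:

#include <stdbool.h>

struct grp {
	unsigned long low_limit[2];	/* [0] = read, [1] = write; 0 means unset */
	unsigned int nr_queued[2];
};

/* an unset low limit always counts as reached; otherwise queued IO implies it */
static bool low_limit_reached(const struct grp *g, int rw)
{
	return !g->low_limit[rw] || g->nr_queued[rw];
}

static bool can_upgrade(const struct grp *g)
{
	return low_limit_reached(g, 0) && low_limit_reached(g, 1);
}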
@ -1951,8 +1955,7 @@ static bool throtl_tg_can_downgrade(struct throtl_grp *tg)
|
||||
* If cgroup is below low limit, consider downgrade and throttle other
|
||||
* cgroups
|
||||
*/
|
||||
if (time_after_eq(now, td->low_upgrade_time + td->throtl_slice) &&
|
||||
time_after_eq(now, tg_last_low_overflow_time(tg) +
|
||||
if (time_after_eq(now, tg_last_low_overflow_time(tg) +
|
||||
td->throtl_slice) &&
|
||||
(!throtl_tg_is_idle(tg) ||
|
||||
!list_empty(&tg_to_blkg(tg)->blkcg->css.children)))
|
||||
@ -1962,6 +1965,11 @@ static bool throtl_tg_can_downgrade(struct throtl_grp *tg)
|
||||
|
||||
static bool throtl_hierarchy_can_downgrade(struct throtl_grp *tg)
|
||||
{
|
||||
struct throtl_data *td = tg->td;
|
||||
|
||||
if (time_before(jiffies, td->low_upgrade_time + td->throtl_slice))
|
||||
return false;
|
||||
|
||||
while (true) {
|
||||
if (!throtl_tg_can_downgrade(tg))
|
||||
return false;
|
||||
|
@ -27,6 +27,7 @@
|
||||
|
||||
#include "blk-wbt.h"
|
||||
#include "blk-rq-qos.h"
|
||||
#include "elevator.h"
|
||||
|
||||
#define CREATE_TRACE_POINTS
|
||||
#include <trace/events/wbt.h>
|
||||
@ -422,6 +423,14 @@ static void wbt_update_limits(struct rq_wb *rwb)
|
||||
rwb_wake_all(rwb);
|
||||
}
|
||||
|
||||
bool wbt_disabled(struct request_queue *q)
|
||||
{
|
||||
struct rq_qos *rqos = wbt_rq_qos(q);
|
||||
|
||||
return !rqos || RQWB(rqos)->enable_state == WBT_STATE_OFF_DEFAULT ||
|
||||
RQWB(rqos)->enable_state == WBT_STATE_OFF_MANUAL;
|
||||
}
|
||||
|
||||
u64 wbt_get_min_lat(struct request_queue *q)
|
||||
{
|
||||
struct rq_qos *rqos = wbt_rq_qos(q);
|
||||
@ -435,8 +444,13 @@ void wbt_set_min_lat(struct request_queue *q, u64 val)
|
||||
struct rq_qos *rqos = wbt_rq_qos(q);
|
||||
if (!rqos)
|
||||
return;
|
||||
|
||||
RQWB(rqos)->min_lat_nsec = val;
|
||||
RQWB(rqos)->enable_state = WBT_STATE_ON_MANUAL;
|
||||
if (val)
|
||||
RQWB(rqos)->enable_state = WBT_STATE_ON_MANUAL;
|
||||
else
|
||||
RQWB(rqos)->enable_state = WBT_STATE_OFF_MANUAL;
|
||||
|
||||
wbt_update_limits(RQWB(rqos));
|
||||
}
|
||||
|
||||
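For illustration only, with hypothetical names throughout: the change above makes a sysfs write of 0 record an explicit manual "off" state instead of unconditionally marking wbt as manually enabled. A minimal standalone model of that behaviour:

enum wb_state {
	WB_ON_DEFAULT = 1,
	WB_ON_MANUAL = 2,
	WB_OFF_DEFAULT = 3,
	WB_OFF_MANUAL = 4,
};

struct wb {
	unsigned long long min_lat_nsec;
	enum wb_state state;
};

/* writing 0 now records an explicit manual "off", anything else a manual "on" */
static void set_min_lat(struct wb *wb, unsigned long long val)
{
	wb->min_lat_nsec = val;
	wb->state = val ? WB_ON_MANUAL : WB_OFF_MANUAL;
}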
@ -638,11 +652,15 @@ void wbt_set_write_cache(struct request_queue *q, bool write_cache_on)
|
||||
*/
|
||||
void wbt_enable_default(struct request_queue *q)
|
||||
{
|
||||
struct rq_qos *rqos = wbt_rq_qos(q);
|
||||
struct rq_qos *rqos;
|
||||
bool disable_flag = q->elevator &&
|
||||
test_bit(ELEVATOR_FLAG_DISABLE_WBT, &q->elevator->flags);
|
||||
|
||||
/* Throttling already enabled? */
|
||||
rqos = wbt_rq_qos(q);
|
||||
if (rqos) {
|
||||
if (RQWB(rqos)->enable_state == WBT_STATE_OFF_DEFAULT)
|
||||
if (!disable_flag &&
|
||||
RQWB(rqos)->enable_state == WBT_STATE_OFF_DEFAULT)
|
||||
RQWB(rqos)->enable_state = WBT_STATE_ON_DEFAULT;
|
||||
return;
|
||||
}
|
||||
@ -651,7 +669,7 @@ void wbt_enable_default(struct request_queue *q)
|
||||
if (!blk_queue_registered(q))
|
||||
return;
|
||||
|
||||
if (queue_is_mq(q) && IS_ENABLED(CONFIG_BLK_WBT_MQ))
|
||||
if (queue_is_mq(q) && !disable_flag)
|
||||
wbt_init(q);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(wbt_enable_default);
|
||||
|
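Not part of the patch: the new ELEVATOR_FLAG_DISABLE_WBT check means default wbt enablement is now gated on the active scheduler not opting out. A tiny standalone sketch of that gate, with placeholder types:

#include <stdbool.h>

#define SCHED_FLAG_DISABLE_WBT	(1UL << 1)	/* mirrors ELEVATOR_FLAG_DISABLE_WBT */

struct sched { unsigned long flags; };

/* default throttling is skipped when the active scheduler opts out (e.g. BFQ) */
static bool should_enable_wbt_by_default(const struct sched *sched)
{
	return !sched || !(sched->flags & SCHED_FLAG_DISABLE_WBT);
}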
@ -28,13 +28,15 @@ enum {
|
||||
};
|
||||
|
||||
/*
 * Enable states. Either off, or on by default (done at init time),
 * or on through manual setup in sysfs.
 * If current state is WBT_STATE_ON/OFF_DEFAULT, it can be covered to any other
 * state, if current state is WBT_STATE_ON/OFF_MANUAL, it can only be covered
 * to WBT_STATE_OFF/ON_MANUAL.
 */
enum {
	WBT_STATE_ON_DEFAULT	= 1,
	WBT_STATE_ON_MANUAL	= 2,
	WBT_STATE_OFF_DEFAULT
	WBT_STATE_ON_DEFAULT	= 1,	/* on by default */
	WBT_STATE_ON_MANUAL	= 2,	/* on manually by sysfs */
	WBT_STATE_OFF_DEFAULT	= 3,	/* off by default */
	WBT_STATE_OFF_MANUAL	= 4,	/* off manually by sysfs */
};
|
||||
|
||||
struct rq_wb {
|
||||
@ -94,6 +96,7 @@ void wbt_enable_default(struct request_queue *);
|
||||
|
||||
u64 wbt_get_min_lat(struct request_queue *q);
|
||||
void wbt_set_min_lat(struct request_queue *q, u64 val);
|
||||
bool wbt_disabled(struct request_queue *);
|
||||
|
||||
void wbt_set_write_cache(struct request_queue *, bool);
|
||||
|
||||
@ -125,6 +128,10 @@ static inline u64 wbt_default_latency_nsec(struct request_queue *q)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
static inline bool wbt_disabled(struct request_queue *q)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
#endif /* CONFIG_BLK_WBT */
|
||||
|
||||
|
block/blk.h | 27
@ -26,11 +26,6 @@ struct blk_flush_queue {
|
||||
spinlock_t mq_flush_lock;
|
||||
};
|
||||
|
||||
extern struct kmem_cache *blk_requestq_cachep;
|
||||
extern struct kmem_cache *blk_requestq_srcu_cachep;
|
||||
extern struct kobj_type blk_queue_ktype;
|
||||
extern struct ida blk_queue_ida;
|
||||
|
||||
bool is_flush_rq(struct request *req);
|
||||
|
||||
struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size,
|
||||
@ -104,7 +99,7 @@ static inline bool biovec_phys_mergeable(struct request_queue *q,
|
||||
return true;
|
||||
}
|
||||
|
||||
static inline bool __bvec_gap_to_prev(struct queue_limits *lim,
|
||||
static inline bool __bvec_gap_to_prev(const struct queue_limits *lim,
|
||||
struct bio_vec *bprv, unsigned int offset)
|
||||
{
|
||||
return (offset & lim->virt_boundary_mask) ||
|
||||
@ -115,7 +110,7 @@ static inline bool __bvec_gap_to_prev(struct queue_limits *lim,
|
||||
* Check if adding a bio_vec after bprv with offset would create a gap in
|
||||
* the SG list. Most drivers don't care about this, but some do.
|
||||
*/
|
||||
static inline bool bvec_gap_to_prev(struct queue_limits *lim,
|
||||
static inline bool bvec_gap_to_prev(const struct queue_limits *lim,
|
||||
struct bio_vec *bprv, unsigned int offset)
|
||||
{
|
||||
if (!lim->virt_boundary_mask)
|
||||
@ -278,6 +273,7 @@ bool blk_bio_list_merge(struct request_queue *q, struct list_head *list,
|
||||
void blk_insert_flush(struct request *rq);
|
||||
|
||||
int elevator_switch(struct request_queue *q, struct elevator_type *new_e);
|
||||
void elevator_disable(struct request_queue *q);
|
||||
void elevator_exit(struct request_queue *q);
|
||||
int elv_register_queue(struct request_queue *q, bool uevent);
|
||||
void elv_unregister_queue(struct request_queue *q);
|
||||
@ -297,7 +293,7 @@ ssize_t part_timeout_store(struct device *, struct device_attribute *,
|
||||
const char *, size_t);
|
||||
|
||||
static inline bool bio_may_exceed_limits(struct bio *bio,
|
||||
struct queue_limits *lim)
|
||||
const struct queue_limits *lim)
|
||||
{
|
||||
switch (bio_op(bio)) {
|
||||
case REQ_OP_DISCARD:
|
||||
@ -320,8 +316,9 @@ static inline bool bio_may_exceed_limits(struct bio *bio,
|
||||
bio->bi_io_vec->bv_len + bio->bi_io_vec->bv_offset > PAGE_SIZE;
|
||||
}
|
||||
|
||||
struct bio *__bio_split_to_limits(struct bio *bio, struct queue_limits *lim,
|
||||
unsigned int *nr_segs);
|
||||
struct bio *__bio_split_to_limits(struct bio *bio,
|
||||
const struct queue_limits *lim,
|
||||
unsigned int *nr_segs);
|
||||
int ll_back_merge_fn(struct request *req, struct bio *bio,
|
||||
unsigned int nr_segs);
|
||||
bool blk_attempt_req_merge(struct request_queue *q, struct request *rq,
|
||||
@ -428,15 +425,9 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio,
|
||||
struct page *page, unsigned int len, unsigned int offset,
|
||||
unsigned int max_sectors, bool *same_page);
|
||||
|
||||
static inline struct kmem_cache *blk_get_queue_kmem_cache(bool srcu)
|
||||
{
|
||||
if (srcu)
|
||||
return blk_requestq_srcu_cachep;
|
||||
return blk_requestq_cachep;
|
||||
}
|
||||
struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu);
|
||||
struct request_queue *blk_alloc_queue(int node_id);
|
||||
|
||||
int disk_scan_partitions(struct gendisk *disk, fmode_t mode);
|
||||
int disk_scan_partitions(struct gendisk *disk, fmode_t mode, void *owner);
|
||||
|
||||
int disk_alloc_events(struct gendisk *disk);
|
||||
void disk_add_events(struct gendisk *disk);
|
||||
|
@ -325,6 +325,7 @@ void bsg_remove_queue(struct request_queue *q)
|
||||
|
||||
bsg_unregister_queue(bset->bd);
|
||||
blk_mq_destroy_queue(q);
|
||||
blk_put_queue(q);
|
||||
blk_mq_free_tag_set(&bset->tag_set);
|
||||
kfree(bset);
|
||||
}
|
||||
@ -400,6 +401,7 @@ struct request_queue *bsg_setup_queue(struct device *dev, const char *name,
|
||||
return q;
|
||||
out_cleanup_queue:
|
||||
blk_mq_destroy_queue(q);
|
||||
blk_put_queue(q);
|
||||
out_queue:
|
||||
blk_mq_free_tag_set(set);
|
||||
out_tag_set:
|
||||
|
block/bsg.c | 11
@ -175,8 +175,10 @@ static void bsg_device_release(struct device *dev)
|
||||
|
||||
void bsg_unregister_queue(struct bsg_device *bd)
|
||||
{
|
||||
if (bd->queue->kobj.sd)
|
||||
sysfs_remove_link(&bd->queue->kobj, "bsg");
|
||||
struct gendisk *disk = bd->queue->disk;
|
||||
|
||||
if (disk && disk->queue_kobj.sd)
|
||||
sysfs_remove_link(&disk->queue_kobj, "bsg");
|
||||
cdev_device_del(&bd->cdev, &bd->device);
|
||||
put_device(&bd->device);
|
||||
}
|
||||
@ -216,8 +218,9 @@ struct bsg_device *bsg_register_queue(struct request_queue *q,
|
||||
if (ret)
|
||||
goto out_put_device;
|
||||
|
||||
if (q->kobj.sd) {
|
||||
ret = sysfs_create_link(&q->kobj, &bd->device.kobj, "bsg");
|
||||
if (q->disk && q->disk->queue_kobj.sd) {
|
||||
ret = sysfs_create_link(&q->disk->queue_kobj, &bd->device.kobj,
|
||||
"bsg");
|
||||
if (ret)
|
||||
goto out_device_del;
|
||||
}
|
||||
|
block/elevator.c | 256
@ -57,7 +57,7 @@ static LIST_HEAD(elv_list);
|
||||
* Query io scheduler to see if the current process issuing bio may be
|
||||
* merged with rq.
|
||||
*/
|
||||
static int elv_iosched_allow_bio_merge(struct request *rq, struct bio *bio)
|
||||
static bool elv_iosched_allow_bio_merge(struct request *rq, struct bio *bio)
|
||||
{
|
||||
struct request_queue *q = rq->q;
|
||||
struct elevator_queue *e = q->elevator;
|
||||
@ -65,7 +65,7 @@ static int elv_iosched_allow_bio_merge(struct request *rq, struct bio *bio)
|
||||
if (e->type->ops.allow_merge)
|
||||
return e->type->ops.allow_merge(q, rq, bio);
|
||||
|
||||
return 1;
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -83,78 +83,45 @@ bool elv_bio_merge_ok(struct request *rq, struct bio *bio)
|
||||
}
|
||||
EXPORT_SYMBOL(elv_bio_merge_ok);
|
||||
|
||||
static inline bool elv_support_features(unsigned int elv_features,
|
||||
unsigned int required_features)
|
||||
static inline bool elv_support_features(struct request_queue *q,
|
||||
const struct elevator_type *e)
|
||||
{
|
||||
return (required_features & elv_features) == required_features;
|
||||
return (q->required_elevator_features & e->elevator_features) ==
|
||||
q->required_elevator_features;
|
||||
}
|
||||
|
||||
/**
|
||||
* elevator_match - Test an elevator name and features
|
||||
* elevator_match - Check whether @e's name or alias matches @name
|
||||
* @e: Scheduler to test
|
||||
* @name: Elevator name to test
|
||||
* @required_features: Features that the elevator must provide
|
||||
*
|
||||
* Return true if the elevator @e name matches @name and if @e provides all
|
||||
* the features specified by @required_features.
|
||||
* Return true if the elevator @e's name or alias matches @name.
|
||||
*/
|
||||
static bool elevator_match(const struct elevator_type *e, const char *name,
|
||||
unsigned int required_features)
|
||||
static bool elevator_match(const struct elevator_type *e, const char *name)
|
||||
{
|
||||
if (!elv_support_features(e->elevator_features, required_features))
|
||||
return false;
|
||||
if (!strcmp(e->elevator_name, name))
|
||||
return true;
|
||||
if (e->elevator_alias && !strcmp(e->elevator_alias, name))
|
||||
return true;
|
||||
|
||||
return false;
|
||||
return !strcmp(e->elevator_name, name) ||
|
||||
(e->elevator_alias && !strcmp(e->elevator_alias, name));
|
||||
}
|
||||
|
||||
/**
|
||||
* elevator_find - Find an elevator
|
||||
* @name: Name of the elevator to find
|
||||
* @required_features: Features that the elevator must provide
|
||||
*
|
||||
* Return the first registered scheduler with name @name and supporting the
|
||||
* features @required_features and NULL otherwise.
|
||||
*/
|
||||
static struct elevator_type *elevator_find(const char *name,
|
||||
unsigned int required_features)
|
||||
static struct elevator_type *__elevator_find(const char *name)
|
||||
{
|
||||
struct elevator_type *e;
|
||||
|
||||
list_for_each_entry(e, &elv_list, list) {
|
||||
if (elevator_match(e, name, required_features))
|
||||
list_for_each_entry(e, &elv_list, list)
|
||||
if (elevator_match(e, name))
|
||||
return e;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static void elevator_put(struct elevator_type *e)
|
||||
{
|
||||
module_put(e->elevator_owner);
|
||||
}
|
||||
|
||||
static struct elevator_type *elevator_get(struct request_queue *q,
|
||||
const char *name, bool try_loading)
|
||||
static struct elevator_type *elevator_find_get(struct request_queue *q,
|
||||
const char *name)
|
||||
{
|
||||
struct elevator_type *e;
|
||||
|
||||
spin_lock(&elv_list_lock);
|
||||
|
||||
e = elevator_find(name, q->required_elevator_features);
|
||||
if (!e && try_loading) {
|
||||
spin_unlock(&elv_list_lock);
|
||||
request_module("%s-iosched", name);
|
||||
spin_lock(&elv_list_lock);
|
||||
e = elevator_find(name, q->required_elevator_features);
|
||||
}
|
||||
|
||||
if (e && !try_module_get(e->elevator_owner))
|
||||
e = __elevator_find(name);
|
||||
if (e && (!elv_support_features(q, e) || !elevator_tryget(e)))
|
||||
e = NULL;
|
||||
|
||||
spin_unlock(&elv_list_lock);
|
||||
return e;
|
||||
}
|
||||
@ -170,6 +137,7 @@ struct elevator_queue *elevator_alloc(struct request_queue *q,
|
||||
if (unlikely(!eq))
|
||||
return NULL;
|
||||
|
||||
__elevator_get(e);
|
||||
eq->type = e;
|
||||
kobject_init(&eq->kobj, &elv_ktype);
|
||||
mutex_init(&eq->sysfs_lock);
|
||||
@ -499,7 +467,7 @@ int elv_register_queue(struct request_queue *q, bool uevent)
|
||||
|
||||
lockdep_assert_held(&q->sysfs_lock);
|
||||
|
||||
error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched");
|
||||
error = kobject_add(&e->kobj, &q->disk->queue_kobj, "iosched");
|
||||
if (!error) {
|
||||
struct elv_fs_entry *attr = e->type->elevator_attrs;
|
||||
if (attr) {
|
||||
@ -512,7 +480,7 @@ int elv_register_queue(struct request_queue *q, bool uevent)
|
||||
if (uevent)
|
||||
kobject_uevent(&e->kobj, KOBJ_ADD);
|
||||
|
||||
e->registered = 1;
|
||||
set_bit(ELEVATOR_FLAG_REGISTERED, &e->flags);
|
||||
}
|
||||
return error;
|
||||
}
|
||||
@ -523,13 +491,9 @@ void elv_unregister_queue(struct request_queue *q)
|
||||
|
||||
lockdep_assert_held(&q->sysfs_lock);
|
||||
|
||||
if (e && e->registered) {
|
||||
struct elevator_queue *e = q->elevator;
|
||||
|
||||
if (e && test_and_clear_bit(ELEVATOR_FLAG_REGISTERED, &e->flags)) {
|
||||
kobject_uevent(&e->kobj, KOBJ_REMOVE);
|
||||
kobject_del(&e->kobj);
|
||||
|
||||
e->registered = 0;
|
||||
}
|
||||
}
|
||||
|
||||
@ -555,7 +519,7 @@ int elv_register(struct elevator_type *e)
|
||||
|
||||
/* register, don't allow duplicate names */
|
||||
spin_lock(&elv_list_lock);
|
||||
if (elevator_find(e->elevator_name, 0)) {
|
||||
if (__elevator_find(e->elevator_name)) {
|
||||
spin_unlock(&elv_list_lock);
|
||||
kmem_cache_destroy(e->icq_cache);
|
||||
return -EBUSY;
|
||||
@ -588,39 +552,6 @@ void elv_unregister(struct elevator_type *e)
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(elv_unregister);
|
||||
|
||||
static int elevator_switch_mq(struct request_queue *q,
|
||||
struct elevator_type *new_e)
|
||||
{
|
||||
int ret;
|
||||
|
||||
lockdep_assert_held(&q->sysfs_lock);
|
||||
|
||||
if (q->elevator) {
|
||||
elv_unregister_queue(q);
|
||||
elevator_exit(q);
|
||||
}
|
||||
|
||||
ret = blk_mq_init_sched(q, new_e);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
if (new_e) {
|
||||
ret = elv_register_queue(q, true);
|
||||
if (ret) {
|
||||
elevator_exit(q);
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
if (new_e)
|
||||
blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name);
|
||||
else
|
||||
blk_add_trace_msg(q, "elv switch: none");
|
||||
|
||||
out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline bool elv_support_iosched(struct request_queue *q)
|
||||
{
|
||||
if (!queue_is_mq(q) ||
|
||||
@ -642,7 +573,7 @@ static struct elevator_type *elevator_get_default(struct request_queue *q)
|
||||
!blk_mq_is_shared_tags(q->tag_set->flags))
|
||||
return NULL;
|
||||
|
||||
return elevator_get(q, "mq-deadline", false);
|
||||
return elevator_find_get(q, "mq-deadline");
|
||||
}
|
||||
|
||||
/*
|
||||
@ -656,14 +587,13 @@ static struct elevator_type *elevator_get_by_features(struct request_queue *q)
|
||||
spin_lock(&elv_list_lock);
|
||||
|
||||
list_for_each_entry(e, &elv_list, list) {
|
||||
if (elv_support_features(e->elevator_features,
|
||||
q->required_elevator_features)) {
|
||||
if (elv_support_features(q, e)) {
|
||||
found = e;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (found && !try_module_get(found->elevator_owner))
|
||||
if (found && !elevator_tryget(found))
|
||||
found = NULL;
|
||||
|
||||
spin_unlock(&elv_list_lock);
|
||||
@ -713,115 +643,147 @@ void elevator_init_mq(struct request_queue *q)
|
||||
if (err) {
|
||||
pr_warn("\"%s\" elevator initialization failed, "
|
||||
"falling back to \"none\"\n", e->elevator_name);
|
||||
elevator_put(e);
|
||||
}
|
||||
|
||||
elevator_put(e);
|
||||
}
|
||||
|
||||
/*
|
||||
* switch to new_e io scheduler. be careful not to introduce deadlocks -
|
||||
* we don't free the old io scheduler, before we have allocated what we
|
||||
* need for the new one. this way we have a chance of going back to the old
|
||||
* one, if the new one fails init for some reason.
|
||||
* Switch to new_e io scheduler.
|
||||
*
|
||||
* If switching fails, we are most likely running out of memory and not able
|
||||
* to restore the old io scheduler, so leaving the io scheduler being none.
|
||||
*/
|
||||
int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
|
||||
{
|
||||
int err;
|
||||
int ret;
|
||||
|
||||
lockdep_assert_held(&q->sysfs_lock);
|
||||
|
||||
blk_mq_freeze_queue(q);
|
||||
blk_mq_quiesce_queue(q);
|
||||
|
||||
err = elevator_switch_mq(q, new_e);
|
||||
if (q->elevator) {
|
||||
elv_unregister_queue(q);
|
||||
elevator_exit(q);
|
||||
}
|
||||
|
||||
ret = blk_mq_init_sched(q, new_e);
|
||||
if (ret)
|
||||
goto out_unfreeze;
|
||||
|
||||
ret = elv_register_queue(q, true);
|
||||
if (ret) {
|
||||
elevator_exit(q);
|
||||
goto out_unfreeze;
|
||||
}
|
||||
blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name);
|
||||
|
||||
out_unfreeze:
|
||||
blk_mq_unquiesce_queue(q);
|
||||
blk_mq_unfreeze_queue(q);
|
||||
|
||||
return err;
|
||||
if (ret) {
|
||||
pr_warn("elv: switch to \"%s\" failed, falling back to \"none\"\n",
|
||||
new_e->elevator_name);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
void elevator_disable(struct request_queue *q)
|
||||
{
|
||||
lockdep_assert_held(&q->sysfs_lock);
|
||||
|
||||
blk_mq_freeze_queue(q);
|
||||
blk_mq_quiesce_queue(q);
|
||||
|
||||
elv_unregister_queue(q);
|
||||
elevator_exit(q);
|
||||
blk_queue_flag_clear(QUEUE_FLAG_SQ_SCHED, q);
|
||||
q->elevator = NULL;
|
||||
q->nr_requests = q->tag_set->queue_depth;
|
||||
blk_add_trace_msg(q, "elv switch: none");
|
||||
|
||||
blk_mq_unquiesce_queue(q);
|
||||
blk_mq_unfreeze_queue(q);
|
||||
}
|
||||
|
||||
/*
|
||||
* Switch this queue to the given IO scheduler.
|
||||
*/
|
||||
static int __elevator_change(struct request_queue *q, const char *name)
|
||||
static int elevator_change(struct request_queue *q, const char *elevator_name)
|
||||
{
|
||||
char elevator_name[ELV_NAME_MAX];
|
||||
struct elevator_type *e;
|
||||
int ret;
|
||||
|
||||
/* Make sure queue is not in the middle of being removed */
|
||||
if (!blk_queue_registered(q))
|
||||
return -ENOENT;
|
||||
|
||||
/*
|
||||
* Special case for mq, turn off scheduling
|
||||
*/
|
||||
if (!strncmp(name, "none", 4)) {
|
||||
if (!q->elevator)
|
||||
return 0;
|
||||
return elevator_switch(q, NULL);
|
||||
}
|
||||
|
||||
strlcpy(elevator_name, name, sizeof(elevator_name));
|
||||
e = elevator_get(q, strstrip(elevator_name), true);
|
||||
if (!e)
|
||||
return -EINVAL;
|
||||
|
||||
if (q->elevator &&
|
||||
elevator_match(q->elevator->type, elevator_name, 0)) {
|
||||
elevator_put(e);
|
||||
if (!strncmp(elevator_name, "none", 4)) {
|
||||
if (q->elevator)
|
||||
elevator_disable(q);
|
||||
return 0;
|
||||
}
|
||||
|
||||
return elevator_switch(q, e);
|
||||
if (q->elevator && elevator_match(q->elevator->type, elevator_name))
|
||||
return 0;
|
||||
|
||||
e = elevator_find_get(q, elevator_name);
|
||||
if (!e) {
|
||||
request_module("%s-iosched", elevator_name);
|
||||
e = elevator_find_get(q, elevator_name);
|
||||
if (!e)
|
||||
return -EINVAL;
|
||||
}
|
||||
ret = elevator_switch(q, e);
|
||||
elevator_put(e);
|
||||
return ret;
|
||||
}
|
||||
|
||||
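Purely as orientation, not kernel code: the reworked elevator_change() follows a find-then-load-then-retry pattern before switching schedulers. A self-contained sketch of that shape, with all helpers stubbed out and every name below invented:

#include <errno.h>
#include <stddef.h>
#include <string.h>

struct sched { const char *name; };		/* placeholder for struct elevator_type */

static struct sched builtin = { "mq-deadline" };

/* stands in for elevator_find_get(): NULL when the scheduler is not registered */
static struct sched *find_sched(const char *name)
{
	return strcmp(name, builtin.name) == 0 ? &builtin : NULL;
}

/* stands in for request_module("%s-iosched", name); a no-op in this sketch */
static void load_sched_module(const char *name)
{
	(void)name;
}

static int change_sched(const char *name)
{
	struct sched *s = find_sched(name);

	if (!s) {
		load_sched_module(name);
		s = find_sched(name);
		if (!s)
			return -EINVAL;
	}
	return 0;	/* a real implementation would switch the queue to s here */
}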
ssize_t elv_iosched_store(struct request_queue *q, const char *name,
|
||||
ssize_t elv_iosched_store(struct request_queue *q, const char *buf,
|
||||
size_t count)
|
||||
{
|
||||
char elevator_name[ELV_NAME_MAX];
|
||||
int ret;
|
||||
|
||||
if (!elv_support_iosched(q))
|
||||
return count;
|
||||
|
||||
ret = __elevator_change(q, name);
|
||||
strlcpy(elevator_name, buf, sizeof(elevator_name));
|
||||
ret = elevator_change(q, strstrip(elevator_name));
|
||||
if (!ret)
|
||||
return count;
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
ssize_t elv_iosched_show(struct request_queue *q, char *name)
|
||||
{
|
||||
struct elevator_queue *e = q->elevator;
|
||||
struct elevator_type *elv = NULL;
|
||||
struct elevator_type *__e;
|
||||
struct elevator_queue *eq = q->elevator;
|
||||
struct elevator_type *cur = NULL, *e;
|
||||
int len = 0;
|
||||
|
||||
if (!queue_is_mq(q))
|
||||
if (!elv_support_iosched(q))
|
||||
return sprintf(name, "none\n");
|
||||
|
||||
if (!q->elevator)
|
||||
if (!q->elevator) {
|
||||
len += sprintf(name+len, "[none] ");
|
||||
else
|
||||
elv = e->type;
|
||||
} else {
|
||||
len += sprintf(name+len, "none ");
|
||||
cur = eq->type;
|
||||
}
|
||||
|
||||
spin_lock(&elv_list_lock);
|
||||
list_for_each_entry(__e, &elv_list, list) {
|
||||
if (elv && elevator_match(elv, __e->elevator_name, 0)) {
|
||||
len += sprintf(name+len, "[%s] ", elv->elevator_name);
|
||||
continue;
|
||||
}
|
||||
if (elv_support_iosched(q) &&
|
||||
elevator_match(__e, __e->elevator_name,
|
||||
q->required_elevator_features))
|
||||
len += sprintf(name+len, "%s ", __e->elevator_name);
|
||||
list_for_each_entry(e, &elv_list, list) {
|
||||
if (e == cur)
|
||||
len += sprintf(name+len, "[%s] ", e->elevator_name);
|
||||
else if (elv_support_features(q, e))
|
||||
len += sprintf(name+len, "%s ", e->elevator_name);
|
||||
}
|
||||
spin_unlock(&elv_list_lock);
|
||||
|
||||
if (q->elevator)
|
||||
len += sprintf(name+len, "none");
|
||||
|
||||
len += sprintf(len+name, "\n");
|
||||
len += sprintf(name+len, "\n");
|
||||
return len;
|
||||
}
|
||||
|
||||
|
@ -84,6 +84,21 @@ struct elevator_type
|
||||
struct list_head list;
|
||||
};
|
||||
|
||||
static inline bool elevator_tryget(struct elevator_type *e)
|
||||
{
|
||||
return try_module_get(e->elevator_owner);
|
||||
}
|
||||
|
||||
static inline void __elevator_get(struct elevator_type *e)
|
||||
{
|
||||
__module_get(e->elevator_owner);
|
||||
}
|
||||
|
||||
static inline void elevator_put(struct elevator_type *e)
|
||||
{
|
||||
module_put(e->elevator_owner);
|
||||
}
|
||||
|
||||
#define ELV_HASH_BITS 6
|
||||
|
||||
void elv_rqhash_del(struct request_queue *q, struct request *rq);
|
||||
@ -100,10 +115,13 @@ struct elevator_queue
|
||||
void *elevator_data;
|
||||
struct kobject kobj;
|
||||
struct mutex sysfs_lock;
|
||||
unsigned int registered:1;
|
||||
unsigned long flags;
|
||||
DECLARE_HASHTABLE(hash, ELV_HASH_BITS);
|
||||
};
|
||||
|
||||
#define ELEVATOR_FLAG_REGISTERED 0
|
||||
#define ELEVATOR_FLAG_DISABLE_WBT 1
|
||||
|
||||
/*
|
||||
* block elevator interface
|
||||
*/
|
||||
|
@ -405,12 +405,6 @@ static int blkdev_write_end(struct file *file, struct address_space *mapping,
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int blkdev_writepages(struct address_space *mapping,
|
||||
struct writeback_control *wbc)
|
||||
{
|
||||
return generic_writepages(mapping, wbc);
|
||||
}
|
||||
|
||||
const struct address_space_operations def_blk_aops = {
|
||||
.dirty_folio = block_dirty_folio,
|
||||
.invalidate_folio = block_invalidate_folio,
|
||||
@ -419,7 +413,6 @@ const struct address_space_operations def_blk_aops = {
|
||||
.writepage = blkdev_writepage,
|
||||
.write_begin = blkdev_write_begin,
|
||||
.write_end = blkdev_write_end,
|
||||
.writepages = blkdev_writepages,
|
||||
.direct_IO = blkdev_direct_IO,
|
||||
.migrate_folio = buffer_migrate_folio_norefs,
|
||||
.is_dirty_writeback = buffer_check_dirty_writeback,
|
||||
|
@ -356,7 +356,7 @@ void disk_uevent(struct gendisk *disk, enum kobject_action action)
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(disk_uevent);
|
||||
|
||||
int disk_scan_partitions(struct gendisk *disk, fmode_t mode)
|
||||
int disk_scan_partitions(struct gendisk *disk, fmode_t mode, void *owner)
|
||||
{
|
||||
struct block_device *bdev;
|
||||
|
||||
@ -366,6 +366,9 @@ int disk_scan_partitions(struct gendisk *disk, fmode_t mode)
|
||||
return -EINVAL;
|
||||
if (disk->open_partitions)
|
||||
return -EBUSY;
|
||||
/* Someone else has bdev exclusively open? */
|
||||
if (disk->part0->bd_holder && disk->part0->bd_holder != owner)
|
||||
return -EBUSY;
|
||||
|
||||
set_bit(GD_NEED_PART_SCAN, &disk->state);
|
||||
bdev = blkdev_get_by_dev(disk_devt(disk), mode, NULL);
|
||||
@ -479,10 +482,6 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk,
|
||||
goto out_put_holder_dir;
|
||||
}
|
||||
|
||||
ret = bd_register_pending_holders(disk);
|
||||
if (ret < 0)
|
||||
goto out_put_slave_dir;
|
||||
|
||||
ret = blk_register_queue(disk);
|
||||
if (ret)
|
||||
goto out_put_slave_dir;
|
||||
@ -500,7 +499,7 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk,
|
||||
|
||||
bdev_add(disk->part0, ddev->devt);
|
||||
if (get_capacity(disk))
|
||||
disk_scan_partitions(disk, FMODE_READ);
|
||||
disk_scan_partitions(disk, FMODE_READ, NULL);
|
||||
|
||||
/*
|
||||
* Announce the disk and partitions after all partitions are
|
||||
@ -530,6 +529,7 @@ out_unregister_queue:
|
||||
rq_qos_exit(disk->queue);
|
||||
out_put_slave_dir:
|
||||
kobject_put(disk->slave_dir);
|
||||
disk->slave_dir = NULL;
|
||||
out_put_holder_dir:
|
||||
kobject_put(disk->part0->bd_holder_dir);
|
||||
out_del_integrity:
|
||||
@ -560,6 +560,11 @@ void blk_mark_disk_dead(struct gendisk *disk)
|
||||
{
|
||||
set_bit(GD_DEAD, &disk->state);
|
||||
blk_queue_start_drain(disk->queue);
|
||||
|
||||
/*
|
||||
* Stop buffered writers from dirtying pages that can't be written out.
|
||||
*/
|
||||
set_capacity_and_notify(disk, 0);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(blk_mark_disk_dead);
|
||||
|
||||
@ -629,6 +634,7 @@ void del_gendisk(struct gendisk *disk)
|
||||
|
||||
kobject_put(disk->part0->bd_holder_dir);
|
||||
kobject_put(disk->slave_dir);
|
||||
disk->slave_dir = NULL;
|
||||
|
||||
part_stat_set_all(disk->part0, 0);
|
||||
disk->part0->bd_stamp = 0;
|
||||
@ -643,7 +649,9 @@ void del_gendisk(struct gendisk *disk)
|
||||
|
||||
blk_sync_queue(q);
|
||||
blk_flush_integrity();
|
||||
blk_mq_cancel_work_sync(q);
|
||||
|
||||
if (queue_is_mq(q))
|
||||
blk_mq_cancel_work_sync(q);
|
||||
|
||||
blk_mq_quiesce_queue(q);
|
||||
if (q->elevator) {
|
||||
@ -1193,21 +1201,10 @@ struct class block_class = {
|
||||
.dev_uevent = block_uevent,
|
||||
};
|
||||
|
||||
static char *block_devnode(struct device *dev, umode_t *mode,
|
||||
kuid_t *uid, kgid_t *gid)
|
||||
{
|
||||
struct gendisk *disk = dev_to_disk(dev);
|
||||
|
||||
if (disk->fops->devnode)
|
||||
return disk->fops->devnode(disk, mode);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
const struct device_type disk_type = {
|
||||
.name = "disk",
|
||||
.groups = disk_attr_groups,
|
||||
.release = disk_release,
|
||||
.devnode = block_devnode,
|
||||
};
|
||||
|
||||
#ifdef CONFIG_PROC_FS
|
||||
@ -1412,7 +1409,7 @@ struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass)
|
||||
struct request_queue *q;
|
||||
struct gendisk *disk;
|
||||
|
||||
q = blk_alloc_queue(node, false);
|
||||
q = blk_alloc_queue(node);
|
||||
if (!q)
|
||||
return NULL;
|
||||
|
||||
|
block/holder.c | 103
@ -4,7 +4,7 @@
|
||||
|
||||
struct bd_holder_disk {
|
||||
struct list_head list;
|
||||
struct block_device *bdev;
|
||||
struct kobject *holder_dir;
|
||||
int refcnt;
|
||||
};
|
||||
|
||||
@ -14,7 +14,7 @@ static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev,
|
||||
struct bd_holder_disk *holder;
|
||||
|
||||
list_for_each_entry(holder, &disk->slave_bdevs, list)
|
||||
if (holder->bdev == bdev)
|
||||
if (holder->holder_dir == bdev->bd_holder_dir)
|
||||
return holder;
|
||||
return NULL;
|
||||
}
|
||||
@ -29,19 +29,6 @@ static void del_symlink(struct kobject *from, struct kobject *to)
|
||||
sysfs_remove_link(from, kobject_name(to));
|
||||
}
|
||||
|
||||
static int __link_disk_holder(struct block_device *bdev, struct gendisk *disk)
|
||||
{
|
||||
int ret;
|
||||
|
||||
ret = add_symlink(disk->slave_dir, bdev_kobj(bdev));
|
||||
if (ret)
|
||||
return ret;
|
||||
ret = add_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj);
|
||||
if (ret)
|
||||
del_symlink(disk->slave_dir, bdev_kobj(bdev));
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* bd_link_disk_holder - create symlinks between holding disk and slave bdev
|
||||
* @bdev: the claimed slave bdev
|
||||
@ -75,12 +62,30 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
|
||||
struct bd_holder_disk *holder;
|
||||
int ret = 0;
|
||||
|
||||
mutex_lock(&disk->open_mutex);
|
||||
if (WARN_ON_ONCE(!disk->slave_dir))
|
||||
return -EINVAL;
|
||||
|
||||
if (bdev->bd_disk == disk)
|
||||
return -EINVAL;
|
||||
|
||||
/*
|
||||
* del_gendisk drops the initial reference to bd_holder_dir, so we
|
||||
* need to keep our own here to allow for cleanup past that point.
|
||||
*/
|
||||
mutex_lock(&bdev->bd_disk->open_mutex);
|
||||
if (!disk_live(bdev->bd_disk)) {
|
||||
mutex_unlock(&bdev->bd_disk->open_mutex);
|
||||
return -ENODEV;
|
||||
}
|
||||
kobject_get(bdev->bd_holder_dir);
|
||||
mutex_unlock(&bdev->bd_disk->open_mutex);
|
||||
|
||||
mutex_lock(&disk->open_mutex);
|
||||
WARN_ON_ONCE(!bdev->bd_holder);
|
||||
|
||||
holder = bd_find_holder_disk(bdev, disk);
|
||||
if (holder) {
|
||||
kobject_put(bdev->bd_holder_dir);
|
||||
holder->refcnt++;
|
||||
goto out_unlock;
|
||||
}
|
||||
@ -92,36 +97,32 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
|
||||
}
|
||||
|
||||
INIT_LIST_HEAD(&holder->list);
|
||||
holder->bdev = bdev;
|
||||
holder->refcnt = 1;
|
||||
if (disk->slave_dir) {
|
||||
ret = __link_disk_holder(bdev, disk);
|
||||
if (ret) {
|
||||
kfree(holder);
|
||||
goto out_unlock;
|
||||
}
|
||||
}
|
||||
holder->holder_dir = bdev->bd_holder_dir;
|
||||
|
||||
ret = add_symlink(disk->slave_dir, bdev_kobj(bdev));
|
||||
if (ret)
|
||||
goto out_free_holder;
|
||||
ret = add_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj);
|
||||
if (ret)
|
||||
goto out_del_symlink;
|
||||
list_add(&holder->list, &disk->slave_bdevs);
|
||||
/*
|
||||
* del_gendisk drops the initial reference to bd_holder_dir, so we need
|
||||
* to keep our own here to allow for cleanup past that point.
|
||||
*/
|
||||
kobject_get(bdev->bd_holder_dir);
|
||||
|
||||
mutex_unlock(&disk->open_mutex);
|
||||
return 0;
|
||||
|
||||
out_del_symlink:
|
||||
del_symlink(disk->slave_dir, bdev_kobj(bdev));
|
||||
out_free_holder:
|
||||
kfree(holder);
|
||||
out_unlock:
|
||||
mutex_unlock(&disk->open_mutex);
|
||||
if (ret)
|
||||
kobject_put(bdev->bd_holder_dir);
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(bd_link_disk_holder);
|
||||
|
||||
static void __unlink_disk_holder(struct block_device *bdev,
|
||||
struct gendisk *disk)
|
||||
{
|
||||
del_symlink(disk->slave_dir, bdev_kobj(bdev));
|
||||
del_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj);
|
||||
}
|
||||
|
||||
/**
|
||||
* bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder()
|
||||
* @bdev: the calimed slave bdev
|
||||
@ -136,36 +137,18 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)
|
||||
{
|
||||
struct bd_holder_disk *holder;
|
||||
|
||||
if (WARN_ON_ONCE(!disk->slave_dir))
|
||||
return;
|
||||
|
||||
mutex_lock(&disk->open_mutex);
|
||||
holder = bd_find_holder_disk(bdev, disk);
|
||||
if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) {
|
||||
if (disk->slave_dir)
|
||||
__unlink_disk_holder(bdev, disk);
|
||||
kobject_put(bdev->bd_holder_dir);
|
||||
del_symlink(disk->slave_dir, bdev_kobj(bdev));
|
||||
del_symlink(holder->holder_dir, &disk_to_dev(disk)->kobj);
|
||||
kobject_put(holder->holder_dir);
|
||||
list_del_init(&holder->list);
|
||||
kfree(holder);
|
||||
}
|
||||
mutex_unlock(&disk->open_mutex);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);
|
||||
|
||||
int bd_register_pending_holders(struct gendisk *disk)
|
||||
{
|
||||
struct bd_holder_disk *holder;
|
||||
int ret;
|
||||
|
||||
mutex_lock(&disk->open_mutex);
|
||||
list_for_each_entry(holder, &disk->slave_bdevs, list) {
|
||||
ret = __link_disk_holder(holder->bdev, disk);
|
||||
if (ret)
|
||||
goto out_undo;
|
||||
}
|
||||
mutex_unlock(&disk->open_mutex);
|
||||
return 0;
|
||||
|
||||
out_undo:
|
||||
list_for_each_entry_continue_reverse(holder, &disk->slave_bdevs, list)
|
||||
__unlink_disk_holder(holder->bdev, disk);
|
||||
mutex_unlock(&disk->open_mutex);
|
||||
return ret;
|
||||
}
|
||||
|
@ -467,9 +467,10 @@ static int blkdev_bszset(struct block_device *bdev, fmode_t mode,
|
||||
* user space. Note the separate arg/argp parameters that are needed
|
||||
* to deal with the compat_ptr() conversion.
|
||||
*/
|
||||
static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode,
|
||||
unsigned cmd, unsigned long arg, void __user *argp)
|
||||
static int blkdev_common_ioctl(struct file *file, fmode_t mode, unsigned cmd,
|
||||
unsigned long arg, void __user *argp)
|
||||
{
|
||||
struct block_device *bdev = I_BDEV(file->f_mapping->host);
|
||||
unsigned int max_sectors;
|
||||
|
||||
switch (cmd) {
|
||||
@ -527,7 +528,8 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode,
|
||||
return -EACCES;
|
||||
if (bdev_is_partition(bdev))
|
||||
return -EINVAL;
|
||||
return disk_scan_partitions(bdev->bd_disk, mode & ~FMODE_EXCL);
|
||||
return disk_scan_partitions(bdev->bd_disk, mode & ~FMODE_EXCL,
|
||||
file);
|
||||
case BLKTRACESTART:
|
||||
case BLKTRACESTOP:
|
||||
case BLKTRACETEARDOWN:
|
||||
@ -605,7 +607,7 @@ long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
|
||||
break;
|
||||
}
|
||||
|
||||
ret = blkdev_common_ioctl(bdev, mode, cmd, arg, argp);
|
||||
ret = blkdev_common_ioctl(file, mode, cmd, arg, argp);
|
||||
if (ret != -ENOIOCTLCMD)
|
||||
return ret;
|
||||
|
||||
@ -674,7 +676,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
|
||||
break;
|
||||
}
|
||||
|
||||
ret = blkdev_common_ioctl(bdev, mode, cmd, arg, argp);
|
||||
ret = blkdev_common_ioctl(file, mode, cmd, arg, argp);
|
||||
if (ret == -ENOIOCTLCMD && disk->fops->compat_ioctl)
|
||||
ret = disk->fops->compat_ioctl(bdev, mode, cmd, arg);
|
||||
|
||||
|
@ -130,6 +130,20 @@ static u8 dd_rq_ioclass(struct request *rq)
|
||||
return IOPRIO_PRIO_CLASS(req_get_ioprio(rq));
|
||||
}
|
||||
|
||||
/*
|
||||
* get the request before `rq' in sector-sorted order
|
||||
*/
|
||||
static inline struct request *
|
||||
deadline_earlier_request(struct request *rq)
|
||||
{
|
||||
struct rb_node *node = rb_prev(&rq->rb_node);
|
||||
|
||||
if (node)
|
||||
return rb_entry_rq(node);
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* get the request after `rq' in sector-sorted order
|
||||
*/
|
||||
@ -277,6 +291,39 @@ static inline int deadline_check_fifo(struct dd_per_prio *per_prio,
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Check if rq has a sequential request preceding it.
|
||||
*/
|
||||
static bool deadline_is_seq_write(struct deadline_data *dd, struct request *rq)
|
||||
{
|
||||
struct request *prev = deadline_earlier_request(rq);
|
||||
|
||||
if (!prev)
|
||||
return false;
|
||||
|
||||
return blk_rq_pos(prev) + blk_rq_sectors(prev) == blk_rq_pos(rq);
|
||||
}
|
||||
|
||||
/*
|
||||
* Skip all write requests that are sequential from @rq, even if we cross
|
||||
* a zone boundary.
|
||||
*/
|
||||
static struct request *deadline_skip_seq_writes(struct deadline_data *dd,
|
||||
struct request *rq)
|
||||
{
|
||||
sector_t pos = blk_rq_pos(rq);
|
||||
sector_t skipped_sectors = 0;
|
||||
|
||||
while (rq) {
|
||||
if (blk_rq_pos(rq) != pos + skipped_sectors)
|
||||
break;
|
||||
skipped_sectors += blk_rq_sectors(rq);
|
||||
rq = deadline_latter_request(rq);
|
||||
}
|
||||
|
||||
return rq;
|
||||
}
|
||||
|
||||
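Illustrative sketch, not from the patch: deadline_skip_seq_writes() walks past a contiguous run of write requests. The same idea over a plain sorted array (hypothetical struct, no rbtree) looks like:

#include <stddef.h>

struct req {
	unsigned long long pos;		/* start sector */
	unsigned int sectors;		/* length in sectors */
};

/*
 * Return the index of the first request that is not sector-contiguous with
 * reqs[start]; same spirit as deadline_skip_seq_writes(), minus the rbtree.
 */
static size_t skip_seq(const struct req *reqs, size_t nr, size_t start)
{
	unsigned long long next = reqs[start].pos;
	size_t i;

	for (i = start; i < nr && reqs[i].pos == next; i++)
		next += reqs[i].sectors;

	return i;
}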
/*
|
||||
* For the specified data direction, return the next request to
|
||||
* dispatch using arrival ordered lists.
|
||||
@ -297,11 +344,16 @@ deadline_fifo_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
|
||||
|
||||
/*
|
||||
* Look for a write request that can be dispatched, that is one with
|
||||
* an unlocked target zone.
|
||||
* an unlocked target zone. For some HDDs, breaking a sequential
|
||||
* write stream can lead to lower throughput, so make sure to preserve
|
||||
* sequential write streams, even if that stream crosses into the next
|
||||
* zones and these zones are unlocked.
|
||||
*/
|
||||
spin_lock_irqsave(&dd->zone_lock, flags);
|
||||
list_for_each_entry(rq, &per_prio->fifo_list[DD_WRITE], queuelist) {
|
||||
if (blk_req_can_dispatch_to_zone(rq))
|
||||
if (blk_req_can_dispatch_to_zone(rq) &&
|
||||
(blk_queue_nonrot(rq->q) ||
|
||||
!deadline_is_seq_write(dd, rq)))
|
||||
goto out;
|
||||
}
|
||||
rq = NULL;
|
||||
@ -331,13 +383,19 @@ deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
|
||||
|
||||
/*
|
||||
* Look for a write request that can be dispatched, that is one with
|
||||
* an unlocked target zone.
|
||||
* an unlocked target zone. For some HDDs, breaking a sequential
|
||||
* write stream can lead to lower throughput, so make sure to preserve
|
||||
* sequential write streams, even if that stream crosses into the next
|
||||
* zones and these zones are unlocked.
|
||||
*/
|
||||
spin_lock_irqsave(&dd->zone_lock, flags);
|
||||
while (rq) {
|
||||
if (blk_req_can_dispatch_to_zone(rq))
|
||||
break;
|
||||
rq = deadline_latter_request(rq);
|
||||
if (blk_queue_nonrot(rq->q))
|
||||
rq = deadline_latter_request(rq);
|
||||
else
|
||||
rq = deadline_skip_seq_writes(dd, rq);
|
||||
}
|
||||
spin_unlock_irqrestore(&dd->zone_lock, flags);
|
||||
|
||||
@ -789,6 +847,18 @@ static void dd_prepare_request(struct request *rq)
|
||||
rq->elv.priv[0] = NULL;
|
||||
}
|
||||
|
||||
static bool dd_has_write_work(struct blk_mq_hw_ctx *hctx)
|
||||
{
|
||||
struct deadline_data *dd = hctx->queue->elevator->elevator_data;
|
||||
enum dd_prio p;
|
||||
|
||||
for (p = 0; p <= DD_PRIO_MAX; p++)
|
||||
if (!list_empty_careful(&dd->per_prio[p].fifo_list[DD_WRITE]))
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
* Callback from inside blk_mq_free_request().
|
||||
*
|
||||
@ -828,9 +898,10 @@ static void dd_finish_request(struct request *rq)
|
||||
|
||||
spin_lock_irqsave(&dd->zone_lock, flags);
|
||||
blk_req_zone_write_unlock(rq);
|
||||
if (!list_empty(&per_prio->fifo_list[DD_WRITE]))
|
||||
blk_mq_sched_mark_restart_hctx(rq->mq_hctx);
|
||||
spin_unlock_irqrestore(&dd->zone_lock, flags);
|
||||
|
||||
if (dd_has_write_work(rq->mq_hctx))
|
||||
blk_mq_sched_mark_restart_hctx(rq->mq_hctx);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2461,6 +2461,44 @@ static int __opal_set_mbr_done(struct opal_dev *dev, struct opal_key *key)
|
||||
return execute_steps(dev, mbrdone_step, ARRAY_SIZE(mbrdone_step));
|
||||
}
|
||||
|
||||
static void opal_lock_check_for_saved_key(struct opal_dev *dev,
|
||||
struct opal_lock_unlock *lk_unlk)
|
||||
{
|
||||
struct opal_suspend_data *iter;
|
||||
|
||||
if (lk_unlk->l_state != OPAL_LK ||
|
||||
lk_unlk->session.opal_key.key_len > 0)
|
||||
return;
|
||||
|
||||
/*
|
||||
* Usually when closing a crypto device (eg: dm-crypt with LUKS) the
|
||||
* volume key is not required, as it requires root privileges anyway,
|
||||
* and root can deny access to a disk in many ways regardless.
|
||||
* Requiring the volume key to lock the device is a peculiarity of the
|
||||
* OPAL specification. Given we might already have saved the key if
|
||||
* the user requested it via the 'IOC_OPAL_SAVE' ioctl, we can use
|
||||
* that key to lock the device if no key was provided here, the
|
||||
* locking range matches and the appropriate flag was passed with
|
||||
* 'IOC_OPAL_SAVE'.
|
||||
* This allows integrating OPAL with tools and libraries that are used
|
||||
* to the common behaviour and do not ask for the volume key when
|
||||
* closing a device.
|
||||
*/
|
||||
setup_opal_dev(dev);
|
||||
list_for_each_entry(iter, &dev->unlk_lst, node) {
|
||||
if ((iter->unlk.flags & OPAL_SAVE_FOR_LOCK) &&
|
||||
iter->lr == lk_unlk->session.opal_key.lr &&
|
||||
iter->unlk.session.opal_key.key_len > 0) {
|
||||
lk_unlk->session.opal_key.key_len =
|
||||
iter->unlk.session.opal_key.key_len;
|
||||
memcpy(lk_unlk->session.opal_key.key,
|
||||
iter->unlk.session.opal_key.key,
|
||||
iter->unlk.session.opal_key.key_len);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static int opal_lock_unlock(struct opal_dev *dev,
|
||||
struct opal_lock_unlock *lk_unlk)
|
||||
{
|
||||
@ -2470,6 +2508,7 @@ static int opal_lock_unlock(struct opal_dev *dev,
|
||||
return -EINVAL;
|
||||
|
||||
mutex_lock(&dev->dev_lock);
|
||||
opal_lock_check_for_saved_key(dev, lk_unlk);
|
||||
ret = __opal_lock_unlock(dev, lk_unlk);
|
||||
mutex_unlock(&dev->dev_lock);
|
||||
|
||||
|
@ -285,49 +285,6 @@ config BLK_DEV_RAM_SIZE
|
||||
The default value is 4096 kilobytes. Only change this if you know
|
||||
what you are doing.
|
||||
|
||||
config CDROM_PKTCDVD
|
||||
tristate "Packet writing on CD/DVD media (DEPRECATED)"
|
||||
depends on !UML
|
||||
depends on SCSI
|
||||
select CDROM
|
||||
help
|
||||
Note: This driver is deprecated and will be removed from the
|
||||
kernel in the near future!
|
||||
|
||||
If you have a CDROM/DVD drive that supports packet writing, say
|
||||
Y to include support. It should work with any MMC/Mt Fuji
|
||||
compliant ATAPI or SCSI drive, which is just about any newer
|
||||
DVD/CD writer.
|
||||
|
||||
Currently only writing to CD-RW, DVD-RW, DVD+RW and DVDRAM discs
|
||||
is possible.
|
||||
DVD-RW disks must be in restricted overwrite mode.
|
||||
|
||||
See the file <file:Documentation/cdrom/packet-writing.rst>
|
||||
for further information on the use of this driver.
|
||||
|
||||
To compile this driver as a module, choose M here: the
|
||||
module will be called pktcdvd.
|
||||
|
||||
config CDROM_PKTCDVD_BUFFERS
|
||||
int "Free buffers for data gathering"
|
||||
depends on CDROM_PKTCDVD
|
||||
default "8"
|
||||
help
|
||||
This controls the maximum number of active concurrent packets. More
|
||||
concurrent packets can increase write performance, but also require
|
||||
more memory. Each concurrent packet will require approximately 64Kb
|
||||
of non-swappable kernel memory, memory which will be allocated when
|
||||
a disc is opened for writing.
|
||||
|
||||
config CDROM_PKTCDVD_WCACHE
|
||||
bool "Enable write caching"
|
||||
depends on CDROM_PKTCDVD
|
||||
help
|
||||
If enabled, write caching will be set for the CD-R/W device. For now
|
||||
this option is dangerous unless the CD-RW media is known good, as we
|
||||
don't do deferred write error handling yet.
|
||||
|
||||
config ATA_OVER_ETH
|
||||
tristate "ATA over Ethernet support"
|
||||
depends on NET
|
||||
|
@ -20,7 +20,6 @@ obj-$(CONFIG_AMIGA_Z2RAM) += z2ram.o
|
||||
obj-$(CONFIG_N64CART) += n64cart.o
|
||||
obj-$(CONFIG_BLK_DEV_RAM) += brd.o
|
||||
obj-$(CONFIG_BLK_DEV_LOOP) += loop.o
|
||||
obj-$(CONFIG_CDROM_PKTCDVD) += pktcdvd.o
|
||||
obj-$(CONFIG_SUNVDC) += sunvdc.o
|
||||
|
||||
obj-$(CONFIG_BLK_DEV_NBD) += nbd.o
|
||||
|
@@ -1,4 +1,4 @@
# SPDX-License-Identifier: GPL-2.0
# SPDX-License-Identifier: GPL-2.0-only
#
# DRBD device driver configuration
#
|
||||
|
@ -1,4 +1,4 @@
|
||||
# SPDX-License-Identifier: GPL-2.0
|
||||
# SPDX-License-Identifier: GPL-2.0-only
|
||||
drbd-y := drbd_bitmap.o drbd_proc.o
|
||||
drbd-y += drbd_worker.o drbd_receiver.o drbd_req.o drbd_actlog.o
|
||||
drbd-y += drbd_main.o drbd_strings.o drbd_nl.o
|
||||
|
@ -1,4 +1,4 @@
|
||||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
// SPDX-License-Identifier: GPL-2.0-only
|
||||
/*
|
||||
drbd_actlog.c
|
||||
|
||||
@ -868,9 +868,9 @@ int __drbd_change_sync(struct drbd_device *device, sector_t sector, int size,
|
||||
nr_sectors = get_capacity(device->vdisk);
|
||||
esector = sector + (size >> 9) - 1;
|
||||
|
||||
if (!expect(sector < nr_sectors))
|
||||
if (!expect(device, sector < nr_sectors))
|
||||
goto out;
|
||||
if (!expect(esector < nr_sectors))
|
||||
if (!expect(device, esector < nr_sectors))
|
||||
esector = nr_sectors - 1;
|
||||
|
||||
lbnr = BM_SECT_TO_BIT(nr_sectors-1);
|
||||
@ -1143,7 +1143,7 @@ void drbd_rs_complete_io(struct drbd_device *device, sector_t sector)
|
||||
bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
|
||||
if (!bm_ext) {
|
||||
spin_unlock_irqrestore(&device->al_lock, flags);
|
||||
if (__ratelimit(&drbd_ratelimit_state))
|
||||
if (drbd_ratelimit())
|
||||
drbd_err(device, "drbd_rs_complete_io() called, but extent not found\n");
|
||||
return;
|
||||
}
|
||||
|
@ -1,4 +1,4 @@
|
||||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
// SPDX-License-Identifier: GPL-2.0-only
|
||||
/*
|
||||
drbd_bitmap.c
|
||||
|
||||
@ -113,7 +113,7 @@ struct drbd_bitmap {
|
||||
static void __bm_print_lock_info(struct drbd_device *device, const char *func)
|
||||
{
|
||||
struct drbd_bitmap *b = device->bitmap;
|
||||
if (!__ratelimit(&drbd_ratelimit_state))
|
||||
if (!drbd_ratelimit())
|
||||
return;
|
||||
drbd_err(device, "FIXME %s[%d] in %s, bitmap locked for '%s' by %s[%d]\n",
|
||||
current->comm, task_pid_nr(current),
|
||||
@ -448,7 +448,7 @@ int drbd_bm_init(struct drbd_device *device)
|
||||
|
||||
sector_t drbd_bm_capacity(struct drbd_device *device)
|
||||
{
|
||||
if (!expect(device->bitmap))
|
||||
if (!expect(device, device->bitmap))
|
||||
return 0;
|
||||
return device->bitmap->bm_dev_capacity;
|
||||
}
|
||||
@ -457,7 +457,7 @@ sector_t drbd_bm_capacity(struct drbd_device *device)
|
||||
*/
|
||||
void drbd_bm_cleanup(struct drbd_device *device)
|
||||
{
|
||||
if (!expect(device->bitmap))
|
||||
if (!expect(device, device->bitmap))
|
||||
return;
|
||||
bm_free_pages(device->bitmap->bm_pages, device->bitmap->bm_number_of_pages);
|
||||
bm_vk_free(device->bitmap->bm_pages);
|
||||
@ -636,7 +636,7 @@ int drbd_bm_resize(struct drbd_device *device, sector_t capacity, int set_new_bi
|
||||
int err = 0;
|
||||
bool growing;
|
||||
|
||||
if (!expect(b))
|
||||
if (!expect(device, b))
|
||||
return -ENOMEM;
|
||||
|
||||
drbd_bm_lock(device, "resize", BM_LOCKED_MASK);
|
||||
@ -757,9 +757,9 @@ unsigned long _drbd_bm_total_weight(struct drbd_device *device)
|
||||
unsigned long s;
|
||||
unsigned long flags;
|
||||
|
||||
if (!expect(b))
|
||||
if (!expect(device, b))
|
||||
return 0;
|
||||
if (!expect(b->bm_pages))
|
||||
if (!expect(device, b->bm_pages))
|
||||
return 0;
|
||||
|
||||
spin_lock_irqsave(&b->bm_lock, flags);
|
||||
@ -783,9 +783,9 @@ unsigned long drbd_bm_total_weight(struct drbd_device *device)
|
||||
size_t drbd_bm_words(struct drbd_device *device)
|
||||
{
|
||||
struct drbd_bitmap *b = device->bitmap;
|
||||
if (!expect(b))
|
||||
if (!expect(device, b))
|
||||
return 0;
|
||||
if (!expect(b->bm_pages))
|
||||
if (!expect(device, b->bm_pages))
|
||||
return 0;
|
||||
|
||||
return b->bm_words;
|
||||
@ -794,7 +794,7 @@ size_t drbd_bm_words(struct drbd_device *device)
|
||||
unsigned long drbd_bm_bits(struct drbd_device *device)
|
||||
{
|
||||
struct drbd_bitmap *b = device->bitmap;
|
||||
if (!expect(b))
|
||||
if (!expect(device, b))
|
||||
return 0;
|
||||
|
||||
return b->bm_bits;
|
||||
@ -816,9 +816,9 @@ void drbd_bm_merge_lel(struct drbd_device *device, size_t offset, size_t number,
|
||||
|
||||
end = offset + number;
|
||||
|
||||
if (!expect(b))
|
||||
if (!expect(device, b))
|
||||
return;
|
||||
if (!expect(b->bm_pages))
|
||||
if (!expect(device, b->bm_pages))
|
||||
return;
|
||||
if (number == 0)
|
||||
return;
|
||||
@ -863,9 +863,9 @@ void drbd_bm_get_lel(struct drbd_device *device, size_t offset, size_t number,
|
||||
|
||||
end = offset + number;
|
||||
|
||||
if (!expect(b))
|
||||
if (!expect(device, b))
|
||||
return;
|
||||
if (!expect(b->bm_pages))
|
||||
if (!expect(device, b->bm_pages))
|
||||
return;
|
||||
|
||||
spin_lock_irq(&b->bm_lock);
|
||||
@ -894,9 +894,9 @@ void drbd_bm_get_lel(struct drbd_device *device, size_t offset, size_t number,
|
||||
void drbd_bm_set_all(struct drbd_device *device)
|
||||
{
|
||||
struct drbd_bitmap *b = device->bitmap;
|
||||
if (!expect(b))
|
||||
if (!expect(device, b))
|
||||
return;
|
||||
if (!expect(b->bm_pages))
|
||||
if (!expect(device, b->bm_pages))
|
||||
return;
|
||||
|
||||
spin_lock_irq(&b->bm_lock);
|
||||
@ -910,9 +910,9 @@ void drbd_bm_set_all(struct drbd_device *device)
|
||||
void drbd_bm_clear_all(struct drbd_device *device)
|
||||
{
|
||||
struct drbd_bitmap *b = device->bitmap;
|
||||
if (!expect(b))
|
||||
if (!expect(device, b))
|
||||
return;
|
||||
if (!expect(b->bm_pages))
|
||||
if (!expect(device, b->bm_pages))
|
||||
return;
|
||||
|
||||
spin_lock_irq(&b->bm_lock);
|
||||
@ -952,7 +952,7 @@ static void drbd_bm_endio(struct bio *bio)
|
||||
bm_set_page_io_err(b->bm_pages[idx]);
|
||||
/* Not identical to on disk version of it.
|
||||
* Is BM_PAGE_IO_ERROR enough? */
|
||||
if (__ratelimit(&drbd_ratelimit_state))
|
||||
if (drbd_ratelimit())
|
||||
drbd_err(device, "IO ERROR %d on bitmap page idx %u\n",
|
||||
bio->bi_status, idx);
|
||||
} else {
|
||||
@ -1013,7 +1013,7 @@ static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_ho
|
||||
else
|
||||
len = PAGE_SIZE;
|
||||
} else {
|
||||
if (__ratelimit(&drbd_ratelimit_state)) {
|
||||
if (drbd_ratelimit()) {
|
||||
drbd_err(device, "Invalid offset during on-disk bitmap access: "
|
||||
"page idx %u, sector %llu\n", page_nr, on_disk_sector);
|
||||
}
|
||||
@ -1332,9 +1332,9 @@ static unsigned long bm_find_next(struct drbd_device *device,
|
||||
struct drbd_bitmap *b = device->bitmap;
|
||||
unsigned long i = DRBD_END_OF_BITMAP;
|
||||
|
||||
if (!expect(b))
|
||||
if (!expect(device, b))
|
||||
return i;
|
||||
if (!expect(b->bm_pages))
|
||||
if (!expect(device, b->bm_pages))
|
||||
return i;
|
||||
|
||||
spin_lock_irq(&b->bm_lock);
|
||||
@ -1436,9 +1436,9 @@ static int bm_change_bits_to(struct drbd_device *device, const unsigned long s,
|
||||
struct drbd_bitmap *b = device->bitmap;
|
||||
int c = 0;
|
||||
|
||||
if (!expect(b))
|
||||
if (!expect(device, b))
|
||||
return 1;
|
||||
if (!expect(b->bm_pages))
|
||||
if (!expect(device, b->bm_pages))
|
||||
return 0;
|
||||
|
||||
spin_lock_irqsave(&b->bm_lock, flags);
|
||||
@ -1582,9 +1582,9 @@ int drbd_bm_test_bit(struct drbd_device *device, const unsigned long bitnr)
|
||||
unsigned long *p_addr;
|
||||
int i;
|
||||
|
||||
if (!expect(b))
|
||||
if (!expect(device, b))
|
||||
return 0;
|
||||
if (!expect(b->bm_pages))
|
||||
if (!expect(device, b->bm_pages))
|
||||
return 0;
|
||||
|
||||
spin_lock_irqsave(&b->bm_lock, flags);
|
||||
@ -1619,9 +1619,9 @@ int drbd_bm_count_bits(struct drbd_device *device, const unsigned long s, const
|
||||
* robust in case we screwed up elsewhere, in that case pretend there
|
||||
* was one dirty bit in the requested area, so we won't try to do a
|
||||
* local read there (no bitmap probably implies no disk) */
|
||||
if (!expect(b))
|
||||
if (!expect(device, b))
|
||||
return 1;
|
||||
if (!expect(b->bm_pages))
|
||||
if (!expect(device, b->bm_pages))
|
||||
return 1;
|
||||
|
||||
spin_lock_irqsave(&b->bm_lock, flags);
|
||||
@ -1635,7 +1635,7 @@ int drbd_bm_count_bits(struct drbd_device *device, const unsigned long s, const
|
||||
bm_unmap(p_addr);
|
||||
p_addr = bm_map_pidx(b, idx);
|
||||
}
|
||||
if (expect(bitnr < b->bm_bits))
|
||||
if (expect(device, bitnr < b->bm_bits))
|
||||
c += (0 != test_bit_le(bitnr - (page_nr << (PAGE_SHIFT+3)), p_addr));
|
||||
else
|
||||
drbd_err(device, "bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits);
|
||||
@ -1668,9 +1668,9 @@ int drbd_bm_e_weight(struct drbd_device *device, unsigned long enr)
|
||||
unsigned long flags;
|
||||
unsigned long *p_addr, *bm;
|
||||
|
||||
if (!expect(b))
|
||||
if (!expect(device, b))
|
||||
return 0;
|
||||
if (!expect(b->bm_pages))
|
||||
if (!expect(device, b->bm_pages))
|
||||
return 0;
|
||||
|
||||
spin_lock_irqsave(&b->bm_lock, flags);
|
||||
|
@ -1,4 +1,4 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
// SPDX-License-Identifier: GPL-2.0-only
|
||||
#define pr_fmt(fmt) "drbd debugfs: " fmt
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/module.h>
|
||||
|
@ -1,4 +1,4 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/debugfs.h>
|
||||
|
@ -1,4 +1,4 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-or-later */
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
drbd_int.h
|
||||
|
||||
@ -37,6 +37,7 @@
|
||||
#include "drbd_strings.h"
|
||||
#include "drbd_state.h"
|
||||
#include "drbd_protocol.h"
|
||||
#include "drbd_polymorph_printk.h"
|
||||
|
||||
#ifdef __CHECKER__
|
||||
# define __protected_by(x) __attribute__((require_context(x,1,999,"rdwr")))
|
||||
@ -75,71 +76,6 @@ extern int drbd_proc_details;
|
||||
struct drbd_device;
|
||||
struct drbd_connection;
|
||||
|
||||
#define __drbd_printk_device(level, device, fmt, args...) \
|
||||
dev_printk(level, disk_to_dev((device)->vdisk), fmt, ## args)
|
||||
#define __drbd_printk_peer_device(level, peer_device, fmt, args...) \
|
||||
dev_printk(level, disk_to_dev((peer_device)->device->vdisk), fmt, ## args)
|
||||
#define __drbd_printk_resource(level, resource, fmt, args...) \
|
||||
printk(level "drbd %s: " fmt, (resource)->name, ## args)
|
||||
#define __drbd_printk_connection(level, connection, fmt, args...) \
|
||||
printk(level "drbd %s: " fmt, (connection)->resource->name, ## args)
|
||||
|
||||
void drbd_printk_with_wrong_object_type(void);
|
||||
|
||||
#define __drbd_printk_if_same_type(obj, type, func, level, fmt, args...) \
|
||||
(__builtin_types_compatible_p(typeof(obj), type) || \
|
||||
__builtin_types_compatible_p(typeof(obj), const type)), \
|
||||
func(level, (const type)(obj), fmt, ## args)
|
||||
|
||||
#define drbd_printk(level, obj, fmt, args...) \
|
||||
__builtin_choose_expr( \
|
||||
__drbd_printk_if_same_type(obj, struct drbd_device *, \
|
||||
__drbd_printk_device, level, fmt, ## args), \
|
||||
__builtin_choose_expr( \
|
||||
__drbd_printk_if_same_type(obj, struct drbd_resource *, \
|
||||
__drbd_printk_resource, level, fmt, ## args), \
|
||||
__builtin_choose_expr( \
|
||||
__drbd_printk_if_same_type(obj, struct drbd_connection *, \
|
||||
__drbd_printk_connection, level, fmt, ## args), \
|
||||
__builtin_choose_expr( \
|
||||
__drbd_printk_if_same_type(obj, struct drbd_peer_device *, \
|
||||
__drbd_printk_peer_device, level, fmt, ## args), \
|
||||
drbd_printk_with_wrong_object_type()))))
|
||||
|
||||
#define drbd_dbg(obj, fmt, args...) \
|
||||
drbd_printk(KERN_DEBUG, obj, fmt, ## args)
|
||||
#define drbd_alert(obj, fmt, args...) \
|
||||
drbd_printk(KERN_ALERT, obj, fmt, ## args)
|
||||
#define drbd_err(obj, fmt, args...) \
|
||||
drbd_printk(KERN_ERR, obj, fmt, ## args)
|
||||
#define drbd_warn(obj, fmt, args...) \
|
||||
drbd_printk(KERN_WARNING, obj, fmt, ## args)
|
||||
#define drbd_info(obj, fmt, args...) \
|
||||
drbd_printk(KERN_INFO, obj, fmt, ## args)
|
||||
#define drbd_emerg(obj, fmt, args...) \
|
||||
drbd_printk(KERN_EMERG, obj, fmt, ## args)
|
||||
|
||||
#define dynamic_drbd_dbg(device, fmt, args...) \
|
||||
dynamic_dev_dbg(disk_to_dev(device->vdisk), fmt, ## args)
|
||||
|
||||
#define D_ASSERT(device, exp) do { \
|
||||
if (!(exp)) \
|
||||
drbd_err(device, "ASSERT( " #exp " ) in %s:%d\n", __FILE__, __LINE__); \
|
||||
} while (0)
|
||||
|
||||
/**
|
||||
* expect - Make an assertion
|
||||
*
|
||||
* Unlike the assert macro, this macro returns a boolean result.
|
||||
*/
|
||||
#define expect(exp) ({ \
|
||||
bool _bool = (exp); \
|
||||
if (!_bool) \
|
||||
drbd_err(device, "ASSERTION %s FAILED in %s\n", \
|
||||
#exp, __func__); \
|
||||
_bool; \
|
||||
})
|
||||
|
||||
/* Defines to control fault insertion */
|
||||
enum {
|
||||
DRBD_FAULT_MD_WR = 0, /* meta data write */
|
||||
@ -395,6 +331,7 @@ struct drbd_peer_request {
|
||||
struct drbd_peer_device *peer_device;
|
||||
struct drbd_epoch *epoch; /* for writes */
|
||||
struct page *pages;
|
||||
blk_opf_t opf;
|
||||
atomic_t pending_bios;
|
||||
struct drbd_interval i;
|
||||
/* see comments on ee flag bits below */
|
||||
@ -406,6 +343,10 @@ struct drbd_peer_request {
|
||||
};
|
||||
};
|
||||
|
||||
/* Equivalent to bio_op and req_op. */
|
||||
#define peer_req_op(peer_req) \
|
||||
((peer_req)->opf & REQ_OP_MASK)
|
||||
|
||||
/* ee flag bits.
|
||||
* While corresponding bios are in flight, the only modification will be
|
||||
* set_bit WAS_ERROR, which has to be atomic.
|
||||
@ -1545,8 +1486,7 @@ extern void drbd_send_acks_wf(struct work_struct *ws);
|
||||
extern bool drbd_rs_c_min_rate_throttle(struct drbd_device *device);
|
||||
extern bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector,
|
||||
bool throttle_if_app_is_waiting);
|
||||
extern int drbd_submit_peer_request(struct drbd_device *,
|
||||
struct drbd_peer_request *, blk_opf_t, int);
|
||||
extern int drbd_submit_peer_request(struct drbd_peer_request *peer_req);
|
||||
extern int drbd_free_peer_reqs(struct drbd_device *, struct list_head *);
|
||||
extern struct drbd_peer_request *drbd_alloc_peer_req(struct drbd_peer_device *, u64,
|
||||
sector_t, unsigned int,
|
||||
@ -1718,7 +1658,7 @@ static inline void __drbd_chk_io_error_(struct drbd_device *device,
|
||||
switch (ep) {
|
||||
case EP_PASS_ON: /* FIXME would this be better named "Ignore"? */
|
||||
if (df == DRBD_READ_ERROR || df == DRBD_WRITE_ERROR) {
|
||||
if (__ratelimit(&drbd_ratelimit_state))
|
||||
if (drbd_ratelimit())
|
||||
drbd_err(device, "Local IO failed in %s.\n", where);
|
||||
if (device->state.disk > D_INCONSISTENT)
|
||||
_drbd_set_state(_NS(device, disk, D_INCONSISTENT), CS_HARD, NULL);
|
||||
|
@ -1,4 +1,4 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
// SPDX-License-Identifier: GPL-2.0-only
|
||||
#include <asm/bug.h>
|
||||
#include <linux/rbtree_augmented.h>
|
||||
#include "drbd_interval.h"
|
||||
|
@ -1,4 +1,4 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
#ifndef __DRBD_INTERVAL_H
|
||||
#define __DRBD_INTERVAL_H
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
// SPDX-License-Identifier: GPL-2.0-only
|
||||
/*
|
||||
drbd.c
|
||||
|
||||
@ -1259,7 +1259,7 @@ static int _drbd_send_bitmap(struct drbd_device *device)
|
||||
struct bm_xfer_ctx c;
|
||||
int err;
|
||||
|
||||
if (!expect(device->bitmap))
|
||||
if (!expect(device, device->bitmap))
|
||||
return false;
|
||||
|
||||
if (get_ldev(device)) {
|
||||
@ -2217,7 +2217,8 @@ void drbd_destroy_device(struct kref *kref)
|
||||
kref_put(&peer_device->connection->kref, drbd_destroy_connection);
|
||||
kfree(peer_device);
|
||||
}
|
||||
memset(device, 0xfd, sizeof(*device));
|
||||
if (device->submit.wq)
|
||||
destroy_workqueue(device->submit.wq);
|
||||
kfree(device);
|
||||
kref_put(&resource->kref, drbd_destroy_resource);
|
||||
}
|
||||
@ -2249,9 +2250,9 @@ static void do_retry(struct work_struct *ws)
|
||||
bool expected;
|
||||
|
||||
expected =
|
||||
expect(atomic_read(&req->completion_ref) == 0) &&
|
||||
expect(req->rq_state & RQ_POSTPONED) &&
|
||||
expect((req->rq_state & RQ_LOCAL_PENDING) == 0 ||
|
||||
expect(device, atomic_read(&req->completion_ref) == 0) &&
|
||||
expect(device, req->rq_state & RQ_POSTPONED) &&
|
||||
expect(device, (req->rq_state & RQ_LOCAL_PENDING) == 0 ||
|
||||
(req->rq_state & RQ_LOCAL_ABORTED) != 0);
|
||||
|
||||
if (!expected)
|
||||
@ -2309,7 +2310,6 @@ void drbd_destroy_resource(struct kref *kref)
|
||||
idr_destroy(&resource->devices);
|
||||
free_cpumask_var(resource->cpu_mask);
|
||||
kfree(resource->name);
|
||||
memset(resource, 0xf2, sizeof(*resource));
|
||||
kfree(resource);
|
||||
}
|
||||
|
||||
@ -2650,7 +2650,6 @@ void drbd_destroy_connection(struct kref *kref)
|
||||
drbd_free_socket(&connection->data);
|
||||
kfree(connection->int_dig_in);
|
||||
kfree(connection->int_dig_vv);
|
||||
memset(connection, 0xfc, sizeof(*connection));
|
||||
kfree(connection);
|
||||
kref_put(&resource->kref, drbd_destroy_resource);
|
||||
}
|
||||
@ -2774,7 +2773,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig
|
||||
|
||||
err = add_disk(disk);
|
||||
if (err)
|
||||
goto out_idr_remove_from_resource;
|
||||
goto out_destroy_workqueue;
|
||||
|
||||
/* inherit the connection state */
|
||||
device->state.conn = first_connection(resource)->cstate;
|
||||
@ -2788,6 +2787,8 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig
|
||||
drbd_debugfs_device_add(device);
|
||||
return NO_ERROR;
|
||||
|
||||
out_destroy_workqueue:
|
||||
destroy_workqueue(device->submit.wq);
|
||||
out_idr_remove_from_resource:
|
||||
for_each_connection_safe(connection, n, resource) {
|
||||
peer_device = idr_remove(&connection->peer_devices, vnr);
|
||||
@ -3766,7 +3767,7 @@ _drbd_insert_fault(struct drbd_device *device, unsigned int type)
|
||||
if (ret) {
|
||||
drbd_fault_count++;
|
||||
|
||||
if (__ratelimit(&drbd_ratelimit_state))
|
||||
if (drbd_ratelimit())
|
||||
drbd_warn(device, "***Simulating %s failure\n",
|
||||
_drbd_fault_str(type));
|
||||
}
|
||||
|
@ -1,4 +1,4 @@
|
||||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
// SPDX-License-Identifier: GPL-2.0-only
|
||||
/*
|
||||
drbd_nl.c
|
||||
|
||||
@ -1210,6 +1210,7 @@ static void decide_on_discard_support(struct drbd_device *device,
|
||||
struct drbd_connection *connection =
|
||||
first_peer_device(device)->connection;
|
||||
struct request_queue *q = device->rq_queue;
|
||||
unsigned int max_discard_sectors;
|
||||
|
||||
if (bdev && !bdev_max_discard_sectors(bdev->backing_bdev))
|
||||
goto not_supported;
|
||||
@ -1230,15 +1231,14 @@ static void decide_on_discard_support(struct drbd_device *device,
|
||||
* topology on all peers.
|
||||
*/
|
||||
blk_queue_discard_granularity(q, 512);
|
||||
q->limits.max_discard_sectors = drbd_max_discard_sectors(connection);
|
||||
q->limits.max_write_zeroes_sectors =
|
||||
drbd_max_discard_sectors(connection);
|
||||
max_discard_sectors = drbd_max_discard_sectors(connection);
|
||||
blk_queue_max_discard_sectors(q, max_discard_sectors);
|
||||
blk_queue_max_write_zeroes_sectors(q, max_discard_sectors);
|
||||
return;
|
||||
|
||||
not_supported:
|
||||
blk_queue_discard_granularity(q, 0);
|
||||
q->limits.max_discard_sectors = 0;
|
||||
q->limits.max_write_zeroes_sectors = 0;
|
||||
blk_queue_max_discard_sectors(q, 0);
|
||||
}
|
||||
|
||||
static void fixup_write_zeroes(struct drbd_device *device, struct request_queue *q)
|
||||
@ -1256,6 +1256,18 @@ static void fixup_write_zeroes(struct drbd_device *device, struct request_queue
|
||||
q->limits.max_write_zeroes_sectors = 0;
|
||||
}
|
||||
|
||||
static void fixup_discard_support(struct drbd_device *device, struct request_queue *q)
|
||||
{
|
||||
unsigned int max_discard = device->rq_queue->limits.max_discard_sectors;
|
||||
unsigned int discard_granularity =
|
||||
device->rq_queue->limits.discard_granularity >> SECTOR_SHIFT;
|
||||
|
||||
if (discard_granularity > max_discard) {
|
||||
blk_queue_discard_granularity(q, 0);
|
||||
blk_queue_max_discard_sectors(q, 0);
|
||||
}
|
||||
}
|
||||
|
||||
static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backing_dev *bdev,
|
||||
unsigned int max_bio_size, struct o_qlim *o)
|
||||
{
|
||||
@ -1288,6 +1300,7 @@ static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backi
|
||||
disk_update_readahead(device->vdisk);
|
||||
}
|
||||
fixup_write_zeroes(device, q);
|
||||
fixup_discard_support(device, q);
|
||||
}
|
||||
|
||||
void drbd_reconsider_queue_parameters(struct drbd_device *device, struct drbd_backing_dev *bdev, struct o_qlim *o)
|
||||
@ -1530,7 +1543,7 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
|
||||
goto fail_unlock;
|
||||
}
|
||||
|
||||
if (!expect(new_disk_conf->resync_rate >= 1))
|
||||
if (!expect(device, new_disk_conf->resync_rate >= 1))
|
||||
new_disk_conf->resync_rate = 1;
|
||||
|
||||
sanitize_disk_conf(device, new_disk_conf, device->ldev);
|
||||
|
@ -1,4 +1,4 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
// SPDX-License-Identifier: GPL-2.0-only
|
||||
#include <linux/kernel.h>
|
||||
#include <net/netlink.h>
|
||||
#include <linux/drbd_genl_api.h>
|
||||
|
@ -1,4 +1,4 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
#ifndef __DRBD_NLA_H
|
||||
#define __DRBD_NLA_H
|
||||
|
||||
|
drivers/block/drbd/drbd_polymorph_printk.h (new file, 141 lines)
@@ -0,0 +1,141 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
#ifndef DRBD_POLYMORPH_PRINTK_H
|
||||
#define DRBD_POLYMORPH_PRINTK_H
|
||||
|
||||
#if !defined(CONFIG_DYNAMIC_DEBUG)
|
||||
#undef DEFINE_DYNAMIC_DEBUG_METADATA
|
||||
#undef __dynamic_pr_debug
|
||||
#undef DYNAMIC_DEBUG_BRANCH
|
||||
#define DEFINE_DYNAMIC_DEBUG_METADATA(D, F) const char *D = F; ((void)D)
|
||||
#define __dynamic_pr_debug(D, F, args...) do { (void)(D); if (0) printk(F, ## args); } while (0)
|
||||
#define DYNAMIC_DEBUG_BRANCH(D) false
|
||||
#endif
|
||||
|
||||
|
||||
#define __drbd_printk_drbd_device_prep(device) \
|
||||
const struct drbd_device *__d = (device); \
|
||||
const struct drbd_resource *__r = __d->resource
|
||||
#define __drbd_printk_drbd_device_fmt(fmt) "drbd %s/%u drbd%u: " fmt
|
||||
#define __drbd_printk_drbd_device_args() __r->name, __d->vnr, __d->minor
|
||||
#define __drbd_printk_drbd_device_unprep()
|
||||
|
||||
#define __drbd_printk_drbd_peer_device_prep(peer_device) \
|
||||
const struct drbd_device *__d; \
|
||||
const struct drbd_resource *__r; \
|
||||
__d = (peer_device)->device; \
|
||||
__r = __d->resource
|
||||
#define __drbd_printk_drbd_peer_device_fmt(fmt) \
|
||||
"drbd %s/%u drbd%u: " fmt
|
||||
#define __drbd_printk_drbd_peer_device_args() \
|
||||
__r->name, __d->vnr, __d->minor
|
||||
#define __drbd_printk_drbd_peer_device_unprep()
|
||||
|
||||
#define __drbd_printk_drbd_resource_prep(resource) \
|
||||
const struct drbd_resource *__r = resource
|
||||
#define __drbd_printk_drbd_resource_fmt(fmt) "drbd %s: " fmt
|
||||
#define __drbd_printk_drbd_resource_args() __r->name
|
||||
#define __drbd_printk_drbd_resource_unprep(resource)
|
||||
|
||||
#define __drbd_printk_drbd_connection_prep(connection) \
|
||||
const struct drbd_connection *__c = (connection); \
|
||||
const struct drbd_resource *__r = __c->resource
|
||||
#define __drbd_printk_drbd_connection_fmt(fmt) \
|
||||
"drbd %s: " fmt
|
||||
#define __drbd_printk_drbd_connection_args() \
|
||||
__r->name
|
||||
#define __drbd_printk_drbd_connection_unprep()
|
||||
|
||||
void drbd_printk_with_wrong_object_type(void);
|
||||
void drbd_dyn_dbg_with_wrong_object_type(void);
|
||||
|
||||
#define __drbd_printk_choose_cond(obj, struct_name) \
|
||||
(__builtin_types_compatible_p(typeof(obj), struct struct_name *) || \
|
||||
__builtin_types_compatible_p(typeof(obj), const struct struct_name *))
|
||||
#define __drbd_printk_if_same_type(obj, struct_name, level, fmt, args...) \
|
||||
__drbd_printk_choose_cond(obj, struct_name), \
|
||||
({ \
|
||||
__drbd_printk_ ## struct_name ## _prep((const struct struct_name *)(obj)); \
|
||||
printk(level __drbd_printk_ ## struct_name ## _fmt(fmt), \
|
||||
__drbd_printk_ ## struct_name ## _args(), ## args); \
|
||||
__drbd_printk_ ## struct_name ## _unprep(); \
|
||||
})
|
||||
|
||||
#define drbd_printk(level, obj, fmt, args...) \
|
||||
__builtin_choose_expr( \
|
||||
__drbd_printk_if_same_type(obj, drbd_device, level, fmt, ## args), \
|
||||
__builtin_choose_expr( \
|
||||
__drbd_printk_if_same_type(obj, drbd_resource, level, fmt, ## args), \
|
||||
__builtin_choose_expr( \
|
||||
__drbd_printk_if_same_type(obj, drbd_connection, level, fmt, ## args), \
|
||||
__builtin_choose_expr( \
|
||||
__drbd_printk_if_same_type(obj, drbd_peer_device, level, fmt, ## args), \
|
||||
drbd_printk_with_wrong_object_type()))))
|
||||
|
||||
#define __drbd_dyn_dbg_if_same_type(obj, struct_name, fmt, args...) \
|
||||
__drbd_printk_choose_cond(obj, struct_name), \
|
||||
({ \
|
||||
DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \
|
||||
if (DYNAMIC_DEBUG_BRANCH(descriptor)) { \
|
||||
__drbd_printk_ ## struct_name ## _prep((const struct struct_name *)(obj)); \
|
||||
__dynamic_pr_debug(&descriptor, __drbd_printk_ ## struct_name ## _fmt(fmt), \
|
||||
__drbd_printk_ ## struct_name ## _args(), ## args); \
|
||||
__drbd_printk_ ## struct_name ## _unprep(); \
|
||||
} \
|
||||
})
|
||||
|
||||
#define dynamic_drbd_dbg(obj, fmt, args...) \
|
||||
__builtin_choose_expr( \
|
||||
__drbd_dyn_dbg_if_same_type(obj, drbd_device, fmt, ## args), \
|
||||
__builtin_choose_expr( \
|
||||
__drbd_dyn_dbg_if_same_type(obj, drbd_resource, fmt, ## args), \
|
||||
__builtin_choose_expr( \
|
||||
__drbd_dyn_dbg_if_same_type(obj, drbd_connection, fmt, ## args), \
|
||||
__builtin_choose_expr( \
|
||||
__drbd_dyn_dbg_if_same_type(obj, drbd_peer_device, fmt, ## args), \
|
||||
drbd_dyn_dbg_with_wrong_object_type()))))
|
||||
|
||||
#define drbd_emerg(device, fmt, args...) \
|
||||
drbd_printk(KERN_EMERG, device, fmt, ## args)
|
||||
#define drbd_alert(device, fmt, args...) \
|
||||
drbd_printk(KERN_ALERT, device, fmt, ## args)
|
||||
#define drbd_crit(device, fmt, args...) \
|
||||
drbd_printk(KERN_CRIT, device, fmt, ## args)
|
||||
#define drbd_err(device, fmt, args...) \
|
||||
drbd_printk(KERN_ERR, device, fmt, ## args)
|
||||
#define drbd_warn(device, fmt, args...) \
|
||||
drbd_printk(KERN_WARNING, device, fmt, ## args)
|
||||
#define drbd_notice(device, fmt, args...) \
|
||||
drbd_printk(KERN_NOTICE, device, fmt, ## args)
|
||||
#define drbd_info(device, fmt, args...) \
|
||||
drbd_printk(KERN_INFO, device, fmt, ## args)
|
||||
|
||||
|
||||
#define drbd_ratelimit() \
({						\
	static DEFINE_RATELIMIT_STATE(_rs,	\
		DEFAULT_RATELIMIT_INTERVAL,	\
		DEFAULT_RATELIMIT_BURST);	\
	__ratelimit(&_rs);			\
})

#define D_ASSERT(x, exp)							\
	do {									\
		if (!(exp))							\
			drbd_err(x, "ASSERTION %s FAILED in %s\n",		\
				 #exp, __func__);				\
	} while (0)

/**
 * expect - Make an assertion
 *
 * Unlike the assert macro, this macro returns a boolean result.
 */
#define expect(x, exp) ({							\
		bool _bool = (exp);						\
		if (!_bool && drbd_ratelimit())					\
			drbd_err(x, "ASSERTION %s FAILED in %s\n",		\
				#exp, __func__);				\
		_bool;								\
		})

#endif
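
A hypothetical usage sketch follows; it is not part of the patch and only illustrates how the polymorphic drbd_printk()/expect()/drbd_ratelimit() helpers above are meant to be called. The function name and message strings are invented for the example; the printed prefixes follow the format macros defined in this header.

```c
/* Hypothetical example, not from the patch: the same logging and assertion
 * helpers accept any of the four DRBD object types, and the message prefix
 * is derived from the object that is passed in.
 */
static void __maybe_unused example_usage(struct drbd_device *device,
					 struct drbd_connection *connection)
{
	/* Expands to the per-device variant: "drbd <resource>/<vnr> drbd<minor>: ..." */
	if (!expect(device, device->bitmap))
		drbd_warn(device, "no bitmap allocated\n");

	/* Expands to the per-connection variant: "drbd <resource>: ..." */
	if (drbd_ratelimit())
		drbd_info(connection, "example rate-limited message\n");
}
```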
|
@ -1,4 +1,4 @@
|
||||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
// SPDX-License-Identifier: GPL-2.0-only
|
||||
/*
|
||||
drbd_proc.c
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
#ifndef __DRBD_PROTOCOL_H
|
||||
#define __DRBD_PROTOCOL_H
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
// SPDX-License-Identifier: GPL-2.0-only
|
||||
/*
|
||||
drbd_receiver.c
|
||||
|
||||
@ -413,7 +413,7 @@ void __drbd_free_peer_req(struct drbd_device *device, struct drbd_peer_request *
|
||||
drbd_free_pages(device, peer_req->pages, is_net);
|
||||
D_ASSERT(device, atomic_read(&peer_req->pending_bios) == 0);
|
||||
D_ASSERT(device, drbd_interval_empty(&peer_req->i));
|
||||
if (!expect(!(peer_req->flags & EE_CALL_AL_COMPLETE_IO))) {
|
||||
if (!expect(device, !(peer_req->flags & EE_CALL_AL_COMPLETE_IO))) {
|
||||
peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO;
|
||||
drbd_al_complete_io(device, &peer_req->i);
|
||||
}
|
||||
@ -1603,9 +1603,19 @@ static void drbd_issue_peer_discard_or_zero_out(struct drbd_device *device, stru
|
||||
drbd_endio_write_sec_final(peer_req);
|
||||
}
|
||||
|
||||
static int peer_request_fault_type(struct drbd_peer_request *peer_req)
|
||||
{
|
||||
if (peer_req_op(peer_req) == REQ_OP_READ) {
|
||||
return peer_req->flags & EE_APPLICATION ?
|
||||
DRBD_FAULT_DT_RD : DRBD_FAULT_RS_RD;
|
||||
} else {
|
||||
return peer_req->flags & EE_APPLICATION ?
|
||||
DRBD_FAULT_DT_WR : DRBD_FAULT_RS_WR;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* drbd_submit_peer_request()
|
||||
* @device: DRBD device.
|
||||
* @peer_req: peer request
|
||||
*
|
||||
* May spread the pages to multiple bios,
|
||||
@ -1619,10 +1629,9 @@ static void drbd_issue_peer_discard_or_zero_out(struct drbd_device *device, stru
|
||||
* on certain Xen deployments.
|
||||
*/
|
||||
/* TODO allocate from our own bio_set. */
|
||||
int drbd_submit_peer_request(struct drbd_device *device,
|
||||
struct drbd_peer_request *peer_req,
|
||||
const blk_opf_t opf, const int fault_type)
|
||||
int drbd_submit_peer_request(struct drbd_peer_request *peer_req)
|
||||
{
|
||||
struct drbd_device *device = peer_req->peer_device->device;
|
||||
struct bio *bios = NULL;
|
||||
struct bio *bio;
|
||||
struct page *page = peer_req->pages;
|
||||
@ -1667,7 +1676,18 @@ int drbd_submit_peer_request(struct drbd_device *device,
|
||||
* generated bio, but a bio allocated on behalf of the peer.
|
||||
*/
|
||||
next_bio:
|
||||
bio = bio_alloc(device->ldev->backing_bdev, nr_pages, opf, GFP_NOIO);
|
||||
/* _DISCARD, _WRITE_ZEROES handled above.
|
||||
* REQ_OP_FLUSH (empty flush) not expected,
|
||||
* should have been mapped to a "drbd protocol barrier".
|
||||
* REQ_OP_SECURE_ERASE: I don't see how we could ever support that.
|
||||
*/
|
||||
if (!(peer_req_op(peer_req) == REQ_OP_WRITE ||
|
||||
peer_req_op(peer_req) == REQ_OP_READ)) {
|
||||
drbd_err(device, "Invalid bio op received: 0x%x\n", peer_req->opf);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
bio = bio_alloc(device->ldev->backing_bdev, nr_pages, peer_req->opf, GFP_NOIO);
|
||||
/* > peer_req->i.sector, unless this is the first bio */
|
||||
bio->bi_iter.bi_sector = sector;
|
||||
bio->bi_private = peer_req;
|
||||
@ -1697,7 +1717,7 @@ next_bio:
|
||||
bios = bios->bi_next;
|
||||
bio->bi_next = NULL;
|
||||
|
||||
drbd_submit_bio_noacct(device, fault_type, bio);
|
||||
drbd_submit_bio_noacct(device, peer_request_fault_type(peer_req), bio);
|
||||
} while (bios);
|
||||
return 0;
|
||||
}
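
As a condensed illustration of the new calling convention (assembled from the hunks below rather than quoted verbatim): callers now record the request operation in peer_req->opf up front, and drbd_submit_peer_request() derives the fault-insertion type internally via peer_request_fault_type() instead of taking op and fault-type arguments.

```c
/* Condensed sketch of the new calling convention; see recv_resync_read()
 * below for the real thing.
 */
peer_req->w.cb = e_end_resync_block;
peer_req->opf = REQ_OP_WRITE;	/* previously passed to drbd_submit_peer_request() */
peer_req->submit_jif = jiffies;

if (drbd_submit_peer_request(peer_req) == 0)	/* fault type derived from peer_req */
	return 0;
```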
|
||||
@ -1853,21 +1873,21 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
|
||||
/* assume request_size == data_size, but special case trim. */
|
||||
ds = data_size;
|
||||
if (trim) {
|
||||
if (!expect(data_size == 0))
|
||||
if (!expect(peer_device, data_size == 0))
|
||||
return NULL;
|
||||
ds = be32_to_cpu(trim->size);
|
||||
} else if (zeroes) {
|
||||
if (!expect(data_size == 0))
|
||||
if (!expect(peer_device, data_size == 0))
|
||||
return NULL;
|
||||
ds = be32_to_cpu(zeroes->size);
|
||||
}
|
||||
|
||||
if (!expect(IS_ALIGNED(ds, 512)))
|
||||
if (!expect(peer_device, IS_ALIGNED(ds, 512)))
|
||||
return NULL;
|
||||
if (trim || zeroes) {
|
||||
if (!expect(ds <= (DRBD_MAX_BBIO_SECTORS << 9)))
|
||||
if (!expect(peer_device, ds <= (DRBD_MAX_BBIO_SECTORS << 9)))
|
||||
return NULL;
|
||||
} else if (!expect(ds <= DRBD_MAX_BIO_SIZE))
|
||||
} else if (!expect(peer_device, ds <= DRBD_MAX_BIO_SIZE))
|
||||
return NULL;
|
||||
|
||||
/* even though we trust out peer,
|
||||
@ -2051,6 +2071,7 @@ static int recv_resync_read(struct drbd_peer_device *peer_device, sector_t secto
|
||||
* respective _drbd_clear_done_ee */
|
||||
|
||||
peer_req->w.cb = e_end_resync_block;
|
||||
peer_req->opf = REQ_OP_WRITE;
|
||||
peer_req->submit_jif = jiffies;
|
||||
|
||||
spin_lock_irq(&device->resource->req_lock);
|
||||
@ -2058,8 +2079,7 @@ static int recv_resync_read(struct drbd_peer_device *peer_device, sector_t secto
|
||||
spin_unlock_irq(&device->resource->req_lock);
|
||||
|
||||
atomic_add(pi->size >> 9, &device->rs_sect_ev);
|
||||
if (drbd_submit_peer_request(device, peer_req, REQ_OP_WRITE,
|
||||
DRBD_FAULT_RS_WR) == 0)
|
||||
if (drbd_submit_peer_request(peer_req) == 0)
|
||||
return 0;
|
||||
|
||||
/* don't care for the reason here */
|
||||
@ -2145,7 +2165,7 @@ static int receive_RSDataReply(struct drbd_connection *connection, struct packet
|
||||
* or in drbd_peer_request_endio. */
|
||||
err = recv_resync_read(peer_device, sector, pi);
|
||||
} else {
|
||||
if (__ratelimit(&drbd_ratelimit_state))
|
||||
if (drbd_ratelimit())
|
||||
drbd_err(device, "Can not write resync data to local disk.\n");
|
||||
|
||||
err = drbd_drain_block(peer_device, pi->size);
|
||||
@ -2375,16 +2395,6 @@ static int wait_for_and_update_peer_seq(struct drbd_peer_device *peer_device, co
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* see also bio_flags_to_wire()
|
||||
* DRBD_REQ_*, because we need to semantically map the flags to data packet
|
||||
* flags and back. We may replicate to other kernel versions. */
|
||||
static blk_opf_t wire_flags_to_bio_flags(u32 dpf)
|
||||
{
|
||||
return (dpf & DP_RW_SYNC ? REQ_SYNC : 0) |
|
||||
(dpf & DP_FUA ? REQ_FUA : 0) |
|
||||
(dpf & DP_FLUSH ? REQ_PREFLUSH : 0);
|
||||
}
|
||||
|
||||
static enum req_op wire_flags_to_bio_op(u32 dpf)
|
||||
{
|
||||
if (dpf & DP_ZEROES)
|
||||
@ -2395,6 +2405,15 @@ static enum req_op wire_flags_to_bio_op(u32 dpf)
|
||||
return REQ_OP_WRITE;
|
||||
}
|
||||
|
||||
/* see also bio_flags_to_wire() */
|
||||
static blk_opf_t wire_flags_to_bio(struct drbd_connection *connection, u32 dpf)
|
||||
{
|
||||
return wire_flags_to_bio_op(dpf) |
|
||||
(dpf & DP_RW_SYNC ? REQ_SYNC : 0) |
|
||||
(dpf & DP_FUA ? REQ_FUA : 0) |
|
||||
(dpf & DP_FLUSH ? REQ_PREFLUSH : 0);
|
||||
}
|
||||
|
||||
static void fail_postponed_requests(struct drbd_device *device, sector_t sector,
|
||||
unsigned int size)
|
||||
{
|
||||
@ -2538,8 +2557,6 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
|
||||
struct drbd_peer_request *peer_req;
|
||||
struct p_data *p = pi->data;
|
||||
u32 peer_seq = be32_to_cpu(p->seq_num);
|
||||
enum req_op op;
|
||||
blk_opf_t op_flags;
|
||||
u32 dp_flags;
|
||||
int err, tp;
|
||||
|
||||
@ -2578,11 +2595,10 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
|
||||
peer_req->flags |= EE_APPLICATION;
|
||||
|
||||
dp_flags = be32_to_cpu(p->dp_flags);
|
||||
op = wire_flags_to_bio_op(dp_flags);
|
||||
op_flags = wire_flags_to_bio_flags(dp_flags);
|
||||
peer_req->opf = wire_flags_to_bio(connection, dp_flags);
|
||||
if (pi->cmd == P_TRIM) {
|
||||
D_ASSERT(peer_device, peer_req->i.size > 0);
|
||||
D_ASSERT(peer_device, op == REQ_OP_DISCARD);
|
||||
D_ASSERT(peer_device, peer_req_op(peer_req) == REQ_OP_DISCARD);
|
||||
D_ASSERT(peer_device, peer_req->pages == NULL);
|
||||
/* need to play safe: an older DRBD sender
|
||||
* may mean zero-out while sending P_TRIM. */
|
||||
@ -2590,7 +2606,7 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
|
||||
peer_req->flags |= EE_ZEROOUT;
|
||||
} else if (pi->cmd == P_ZEROES) {
|
||||
D_ASSERT(peer_device, peer_req->i.size > 0);
|
||||
D_ASSERT(peer_device, op == REQ_OP_WRITE_ZEROES);
|
||||
D_ASSERT(peer_device, peer_req_op(peer_req) == REQ_OP_WRITE_ZEROES);
|
||||
D_ASSERT(peer_device, peer_req->pages == NULL);
|
||||
/* Do (not) pass down BLKDEV_ZERO_NOUNMAP? */
|
||||
if (dp_flags & DP_DISCARD)
|
||||
@ -2677,8 +2693,7 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
|
||||
peer_req->flags |= EE_CALL_AL_COMPLETE_IO;
|
||||
}
|
||||
|
||||
err = drbd_submit_peer_request(device, peer_req, op | op_flags,
|
||||
DRBD_FAULT_DT_WR);
|
||||
err = drbd_submit_peer_request(peer_req);
|
||||
if (!err)
|
||||
return 0;
|
||||
|
||||
@ -2789,7 +2804,6 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
|
||||
struct drbd_peer_request *peer_req;
|
||||
struct digest_info *di = NULL;
|
||||
int size, verb;
|
||||
unsigned int fault_type;
|
||||
struct p_block_req *p = pi->data;
|
||||
|
||||
peer_device = conn_peer_device(connection, pi->vnr);
|
||||
@ -2832,7 +2846,7 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
if (verb && __ratelimit(&drbd_ratelimit_state))
|
||||
if (verb && drbd_ratelimit())
|
||||
drbd_err(device, "Can not satisfy peer's read request, "
|
||||
"no local data.\n");
|
||||
|
||||
@ -2849,11 +2863,11 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
|
||||
put_ldev(device);
|
||||
return -ENOMEM;
|
||||
}
|
||||
peer_req->opf = REQ_OP_READ;
|
||||
|
||||
switch (pi->cmd) {
|
||||
case P_DATA_REQUEST:
|
||||
peer_req->w.cb = w_e_end_data_req;
|
||||
fault_type = DRBD_FAULT_DT_RD;
|
||||
/* application IO, don't drbd_rs_begin_io */
|
||||
peer_req->flags |= EE_APPLICATION;
|
||||
goto submit;
|
||||
@ -2867,14 +2881,12 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
|
||||
fallthrough;
|
||||
case P_RS_DATA_REQUEST:
|
||||
peer_req->w.cb = w_e_end_rsdata_req;
|
||||
fault_type = DRBD_FAULT_RS_RD;
|
||||
/* used in the sector offset progress display */
|
||||
device->bm_resync_fo = BM_SECT_TO_BIT(sector);
|
||||
break;
|
||||
|
||||
case P_OV_REPLY:
|
||||
case P_CSUM_RS_REQUEST:
|
||||
fault_type = DRBD_FAULT_RS_RD;
|
||||
di = kmalloc(sizeof(*di) + pi->size, GFP_NOIO);
|
||||
if (!di)
|
||||
goto out_free_e;
|
||||
@ -2923,7 +2935,6 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
|
||||
(unsigned long long)sector);
|
||||
}
|
||||
peer_req->w.cb = w_e_end_ov_req;
|
||||
fault_type = DRBD_FAULT_RS_RD;
|
||||
break;
|
||||
|
||||
default:
|
||||
@ -2975,8 +2986,7 @@ submit_for_resync:
|
||||
submit:
|
||||
update_receiver_timing_details(connection, drbd_submit_peer_request);
|
||||
inc_unacked(device);
|
||||
if (drbd_submit_peer_request(device, peer_req, REQ_OP_READ,
|
||||
fault_type) == 0)
|
||||
if (drbd_submit_peer_request(peer_req) == 0)
|
||||
return 0;
|
||||
|
||||
/* don't care for the reason here */
|
||||
@ -4947,7 +4957,6 @@ static int receive_rs_deallocated(struct drbd_connection *connection, struct pac
|
||||
|
||||
if (get_ldev(device)) {
|
||||
struct drbd_peer_request *peer_req;
|
||||
const enum req_op op = REQ_OP_WRITE_ZEROES;
|
||||
|
||||
peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER, sector,
|
||||
size, 0, GFP_NOIO);
|
||||
@ -4957,6 +4966,7 @@ static int receive_rs_deallocated(struct drbd_connection *connection, struct pac
|
||||
}
|
||||
|
||||
peer_req->w.cb = e_end_resync_block;
|
||||
peer_req->opf = REQ_OP_DISCARD;
|
||||
peer_req->submit_jif = jiffies;
|
||||
peer_req->flags |= EE_TRIM;
|
||||
|
||||
@ -4965,8 +4975,7 @@ static int receive_rs_deallocated(struct drbd_connection *connection, struct pac
|
||||
spin_unlock_irq(&device->resource->req_lock);
|
||||
|
||||
atomic_add(pi->size >> 9, &device->rs_sect_ev);
|
||||
err = drbd_submit_peer_request(device, peer_req, op,
|
||||
DRBD_FAULT_RS_WR);
|
||||
err = drbd_submit_peer_request(peer_req);
|
||||
|
||||
if (err) {
|
||||
spin_lock_irq(&device->resource->req_lock);
|
||||
|
@ -1,4 +1,4 @@
|
||||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
// SPDX-License-Identifier: GPL-2.0-only
|
||||
/*
|
||||
drbd_req.c
|
||||
|
||||
@ -144,7 +144,7 @@ void drbd_req_destroy(struct kref *kref)
|
||||
if (get_ldev_if_state(device, D_FAILED)) {
|
||||
drbd_al_complete_io(device, &req->i);
|
||||
put_ldev(device);
|
||||
} else if (__ratelimit(&drbd_ratelimit_state)) {
|
||||
} else if (drbd_ratelimit()) {
|
||||
drbd_warn(device, "Should have called drbd_al_complete_io(, %llu, %u), "
|
||||
"but my Disk seems to have failed :(\n",
|
||||
(unsigned long long) req->i.sector, req->i.size);
|
||||
@ -518,7 +518,7 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m,
|
||||
|
||||
static void drbd_report_io_error(struct drbd_device *device, struct drbd_request *req)
|
||||
{
|
||||
if (!__ratelimit(&drbd_ratelimit_state))
|
||||
if (!drbd_ratelimit())
|
||||
return;
|
||||
|
||||
drbd_warn(device, "local %s IO error sector %llu+%u on %pg\n",
|
||||
@ -1402,7 +1402,7 @@ static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request
|
||||
submit_private_bio = true;
|
||||
} else if (no_remote) {
|
||||
nodata:
|
||||
if (__ratelimit(&drbd_ratelimit_state))
|
||||
if (drbd_ratelimit())
|
||||
drbd_err(device, "IO ERROR: neither local nor remote data, sector %llu+%u\n",
|
||||
(unsigned long long)req->i.sector, req->i.size >> 9);
|
||||
/* A write may have been queued for send_oos, however.
|
||||
|
@ -1,4 +1,4 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-or-later */
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
drbd_req.h
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
// SPDX-License-Identifier: GPL-2.0-only
|
||||
/*
|
||||
drbd_state.c
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
#ifndef DRBD_STATE_H
|
||||
#define DRBD_STATE_H
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
#ifndef DRBD_STATE_CHANGE_H
|
||||
#define DRBD_STATE_CHANGE_H
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
// SPDX-License-Identifier: GPL-2.0-only
|
||||
/*
|
||||
drbd.h
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
#ifndef __DRBD_STRINGS_H
|
||||
#define __DRBD_STRINGS_H
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-or-later */
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
-*- linux-c -*-
|
||||
drbd_receiver.c
|
||||
|
@ -1,4 +1,4 @@
|
||||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
// SPDX-License-Identifier: GPL-2.0-only
|
||||
/*
|
||||
drbd_worker.c
|
||||
|
||||
@ -176,7 +176,7 @@ void drbd_peer_request_endio(struct bio *bio)
|
||||
bool is_discard = bio_op(bio) == REQ_OP_WRITE_ZEROES ||
|
||||
bio_op(bio) == REQ_OP_DISCARD;
|
||||
|
||||
if (bio->bi_status && __ratelimit(&drbd_ratelimit_state))
|
||||
if (bio->bi_status && drbd_ratelimit())
|
||||
drbd_warn(device, "%s: error=%d s=%llus\n",
|
||||
is_write ? (is_discard ? "discard" : "write")
|
||||
: "read", bio->bi_status,
|
||||
@ -240,7 +240,7 @@ void drbd_request_endio(struct bio *bio)
|
||||
* though we still will complain noisily about it.
|
||||
*/
|
||||
if (unlikely(req->rq_state & RQ_LOCAL_ABORTED)) {
|
||||
if (__ratelimit(&drbd_ratelimit_state))
|
||||
if (drbd_ratelimit())
|
||||
drbd_emerg(device, "delayed completion of aborted local request; disk-timeout may be too aggressive\n");
|
||||
|
||||
if (!bio->bi_status)
|
||||
@ -400,13 +400,13 @@ static int read_for_csum(struct drbd_peer_device *peer_device, sector_t sector,
|
||||
goto defer;
|
||||
|
||||
peer_req->w.cb = w_e_send_csum;
|
||||
peer_req->opf = REQ_OP_READ;
|
||||
spin_lock_irq(&device->resource->req_lock);
|
||||
list_add_tail(&peer_req->w.list, &device->read_ee);
|
||||
spin_unlock_irq(&device->resource->req_lock);
|
||||
|
||||
atomic_add(size >> 9, &device->rs_sect_ev);
|
||||
if (drbd_submit_peer_request(device, peer_req, REQ_OP_READ,
|
||||
DRBD_FAULT_RS_RD) == 0)
|
||||
if (drbd_submit_peer_request(peer_req) == 0)
|
||||
return 0;
|
||||
|
||||
/* If it failed because of ENOMEM, retry should help. If it failed
|
||||
@ -1062,7 +1062,7 @@ int w_e_end_data_req(struct drbd_work *w, int cancel)
|
||||
if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
|
||||
err = drbd_send_block(peer_device, P_DATA_REPLY, peer_req);
|
||||
} else {
|
||||
if (__ratelimit(&drbd_ratelimit_state))
|
||||
if (drbd_ratelimit())
|
||||
drbd_err(device, "Sending NegDReply. sector=%llus.\n",
|
||||
(unsigned long long)peer_req->i.sector);
|
||||
|
||||
@ -1135,13 +1135,13 @@ int w_e_end_rsdata_req(struct drbd_work *w, int cancel)
|
||||
else
|
||||
err = drbd_send_block(peer_device, P_RS_DATA_REPLY, peer_req);
|
||||
} else {
|
||||
if (__ratelimit(&drbd_ratelimit_state))
|
||||
if (drbd_ratelimit())
|
||||
drbd_err(device, "Not sending RSDataReply, "
|
||||
"partner DISKLESS!\n");
|
||||
err = 0;
|
||||
}
|
||||
} else {
|
||||
if (__ratelimit(&drbd_ratelimit_state))
|
||||
if (drbd_ratelimit())
|
||||
drbd_err(device, "Sending NegRSDReply. sector %llus.\n",
|
||||
(unsigned long long)peer_req->i.sector);
|
||||
|
||||
@ -1212,7 +1212,7 @@ int w_e_end_csum_rs_req(struct drbd_work *w, int cancel)
|
||||
}
|
||||
} else {
|
||||
err = drbd_send_ack(peer_device, P_NEG_RS_DREPLY, peer_req);
|
||||
if (__ratelimit(&drbd_ratelimit_state))
|
||||
if (drbd_ratelimit())
|
||||
drbd_err(device, "Sending NegDReply. I guess it gets messy.\n");
|
||||
}
|
||||
|
||||
|
@ -4593,8 +4593,10 @@ static int __init do_floppy_init(void)
|
||||
goto out_put_disk;
|
||||
|
||||
err = floppy_alloc_disk(drive, 0);
|
||||
if (err)
|
||||
if (err) {
|
||||
blk_mq_free_tag_set(&tag_sets[drive]);
|
||||
goto out_put_disk;
|
||||
}
|
||||
|
||||
timer_setup(&motor_off_timer[drive], motor_off_callback, 0);
|
||||
}
|
||||
|
@ -523,6 +523,24 @@ out:
|
||||
}
|
||||
CONFIGFS_ATTR(nullb_device_, badblocks);
|
||||
|
||||
static ssize_t nullb_device_zone_readonly_store(struct config_item *item,
|
||||
const char *page, size_t count)
|
||||
{
|
||||
struct nullb_device *dev = to_nullb_device(item);
|
||||
|
||||
return zone_cond_store(dev, page, count, BLK_ZONE_COND_READONLY);
|
||||
}
|
||||
CONFIGFS_ATTR_WO(nullb_device_, zone_readonly);
|
||||
|
||||
static ssize_t nullb_device_zone_offline_store(struct config_item *item,
|
||||
const char *page, size_t count)
|
||||
{
|
||||
struct nullb_device *dev = to_nullb_device(item);
|
||||
|
||||
return zone_cond_store(dev, page, count, BLK_ZONE_COND_OFFLINE);
|
||||
}
|
||||
CONFIGFS_ATTR_WO(nullb_device_, zone_offline);
|
||||
|
||||
static struct configfs_attribute *nullb_device_attrs[] = {
|
||||
&nullb_device_attr_size,
|
||||
&nullb_device_attr_completion_nsec,
|
||||
@ -549,6 +567,8 @@ static struct configfs_attribute *nullb_device_attrs[] = {
|
||||
&nullb_device_attr_zone_nr_conv,
|
||||
&nullb_device_attr_zone_max_open,
|
||||
&nullb_device_attr_zone_max_active,
|
||||
&nullb_device_attr_zone_readonly,
|
||||
&nullb_device_attr_zone_offline,
|
||||
&nullb_device_attr_virt_boundary,
|
||||
&nullb_device_attr_no_sched,
|
||||
&nullb_device_attr_shared_tag_bitmap,
|
||||
@ -614,7 +634,7 @@ static ssize_t memb_group_features_show(struct config_item *item, char *page)
|
||||
"poll_queues,power,queue_mode,shared_tag_bitmap,size,"
|
||||
"submit_queues,use_per_node_hctx,virt_boundary,zoned,"
|
||||
"zone_capacity,zone_max_active,zone_max_open,"
|
||||
"zone_nr_conv,zone_size\n");
|
||||
"zone_nr_conv,zone_offline,zone_readonly,zone_size\n");
|
||||
}
|
||||
|
||||
CONFIGFS_ATTR_RO(memb_group_, features);
|
||||
|
@ -151,6 +151,8 @@ blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, enum req_op op,
|
||||
sector_t sector, sector_t nr_sectors);
|
||||
size_t null_zone_valid_read_len(struct nullb *nullb,
|
||||
sector_t sector, unsigned int len);
|
||||
ssize_t zone_cond_store(struct nullb_device *dev, const char *page,
|
||||
size_t count, enum blk_zone_cond cond);
|
||||
#else
|
||||
static inline int null_init_zoned_dev(struct nullb_device *dev,
|
||||
struct request_queue *q)
|
||||
@ -174,6 +176,12 @@ static inline size_t null_zone_valid_read_len(struct nullb *nullb,
|
||||
{
|
||||
return len;
|
||||
}
|
||||
static inline ssize_t zone_cond_store(struct nullb_device *dev,
|
||||
const char *page, size_t count,
|
||||
enum blk_zone_cond cond)
|
||||
{
|
||||
return -EOPNOTSUPP;
|
||||
}
|
||||
#define null_report_zones NULL
|
||||
#endif /* CONFIG_BLK_DEV_ZONED */
|
||||
#endif /* __NULL_BLK_H */
|
||||
|
@ -384,8 +384,10 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector,
|
||||
|
||||
null_lock_zone(dev, zone);
|
||||
|
||||
if (zone->cond == BLK_ZONE_COND_FULL) {
|
||||
/* Cannot write to a full zone */
|
||||
if (zone->cond == BLK_ZONE_COND_FULL ||
|
||||
zone->cond == BLK_ZONE_COND_READONLY ||
|
||||
zone->cond == BLK_ZONE_COND_OFFLINE) {
|
||||
/* Cannot write to the zone */
|
||||
ret = BLK_STS_IOERR;
|
||||
goto unlock;
|
||||
}
|
||||
@ -613,7 +615,9 @@ static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_op op,
|
||||
for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) {
|
||||
zone = &dev->zones[i];
|
||||
null_lock_zone(dev, zone);
|
||||
if (zone->cond != BLK_ZONE_COND_EMPTY) {
|
||||
if (zone->cond != BLK_ZONE_COND_EMPTY &&
|
||||
zone->cond != BLK_ZONE_COND_READONLY &&
|
||||
zone->cond != BLK_ZONE_COND_OFFLINE) {
|
||||
null_reset_zone(dev, zone);
|
||||
trace_nullb_zone_op(cmd, i, zone->cond);
|
||||
}
|
||||
@ -627,6 +631,12 @@ static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_op op,
|
||||
|
||||
null_lock_zone(dev, zone);
|
||||
|
||||
if (zone->cond == BLK_ZONE_COND_READONLY ||
|
||||
zone->cond == BLK_ZONE_COND_OFFLINE) {
|
||||
ret = BLK_STS_IOERR;
|
||||
goto unlock;
|
||||
}
|
||||
|
||||
switch (op) {
|
||||
case REQ_OP_ZONE_RESET:
|
||||
ret = null_reset_zone(dev, zone);
|
||||
@ -648,6 +658,7 @@ static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_op op,
|
||||
if (ret == BLK_STS_OK)
|
||||
trace_nullb_zone_op(cmd, zone_no, zone->cond);
|
||||
|
||||
unlock:
|
||||
null_unlock_zone(dev, zone);
|
||||
|
||||
return ret;
|
||||
@ -674,6 +685,8 @@ blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, enum req_op op,
|
||||
default:
|
||||
dev = cmd->nq->dev;
|
||||
zone = &dev->zones[null_zone_no(dev, sector)];
|
||||
if (zone->cond == BLK_ZONE_COND_OFFLINE)
|
||||
return BLK_STS_IOERR;
|
||||
|
||||
null_lock_zone(dev, zone);
|
||||
sts = null_process_cmd(cmd, op, sector, nr_sectors);
|
||||
@ -681,3 +694,79 @@ blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, enum req_op op,
|
||||
return sts;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Set a zone in the read-only or offline condition.
|
||||
*/
|
||||
static void null_set_zone_cond(struct nullb_device *dev,
|
||||
struct nullb_zone *zone, enum blk_zone_cond cond)
|
||||
{
|
||||
if (WARN_ON_ONCE(cond != BLK_ZONE_COND_READONLY &&
|
||||
cond != BLK_ZONE_COND_OFFLINE))
|
||||
return;
|
||||
|
||||
null_lock_zone(dev, zone);
|
||||
|
||||
/*
|
||||
* If the read-only condition is requested again to zones already in
|
||||
* read-only condition, restore back normal empty condition. Do the same
|
||||
* if the offline condition is requested for offline zones. Otherwise,
|
||||
* set the specified zone condition to the zones. Finish the zones
|
||||
* beforehand to free up zone resources.
|
||||
*/
|
||||
if (zone->cond == cond) {
|
||||
zone->cond = BLK_ZONE_COND_EMPTY;
|
||||
zone->wp = zone->start;
|
||||
if (dev->memory_backed)
|
||||
null_handle_discard(dev, zone->start, zone->len);
|
||||
} else {
|
||||
if (zone->cond != BLK_ZONE_COND_READONLY &&
|
||||
zone->cond != BLK_ZONE_COND_OFFLINE)
|
||||
null_finish_zone(dev, zone);
|
||||
zone->cond = cond;
|
||||
zone->wp = (sector_t)-1;
|
||||
}
|
||||
|
||||
null_unlock_zone(dev, zone);
|
||||
}
|
||||
|
||||
/*
|
||||
* Identify a zone from the sector written to configfs file. Then set zone
|
||||
* condition to the zone.
|
||||
*/
|
||||
ssize_t zone_cond_store(struct nullb_device *dev, const char *page,
|
||||
size_t count, enum blk_zone_cond cond)
|
||||
{
|
||||
unsigned long long sector;
|
||||
unsigned int zone_no;
|
||||
int ret;
|
||||
|
||||
if (!dev->zoned) {
|
||||
pr_err("null_blk device is not zoned\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if (!dev->zones) {
|
||||
pr_err("null_blk device is not yet powered\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
ret = kstrtoull(page, 0, &sector);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
zone_no = null_zone_no(dev, sector);
|
||||
if (zone_no >= dev->nr_zones) {
|
||||
pr_err("Sector out of range\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if (dev->zones[zone_no].type == BLK_ZONE_TYPE_CONVENTIONAL) {
|
||||
pr_err("Can not change condition of conventional zones\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
null_set_zone_cond(dev, &dev->zones[zone_no], cond);
|
||||
|
||||
return count;
|
||||
}
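
A hypothetical userspace sketch (not part of the patch): it marks the zone containing a given sector of a zoned null_blk device read-only by writing the sector number to the new configfs attribute. The device name nullb0, the configfs mount point and the sector value are assumptions made for the example.

```c
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Assumed path and sector: adjust for the actual nullb device and zone. */
	int fd = open("/sys/kernel/config/nullb/nullb0/zone_readonly", O_WRONLY);

	if (fd < 0) {
		perror("open zone_readonly");
		return 1;
	}
	/* The kernel side parses this with kstrtoull(), so a plain decimal
	 * (or 0x-prefixed) sector number is accepted. */
	if (dprintf(fd, "%llu", 524288ULL) < 0)
		perror("write sector");
	close(fd);
	return 0;
}
```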
|
||||
|
File diff suppressed because it is too large
@ -512,7 +512,7 @@ static void virtblk_free_disk(struct gendisk *disk)
|
||||
{
|
||||
struct virtio_blk *vblk = disk->private_data;
|
||||
|
||||
ida_simple_remove(&vd_index_ida, vblk->index);
|
||||
ida_free(&vd_index_ida, vblk->index);
|
||||
mutex_destroy(&vblk->vdev_mutex);
|
||||
kfree(vblk);
|
||||
}
|
||||
@ -902,8 +902,8 @@ static int virtblk_probe(struct virtio_device *vdev)
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
err = ida_simple_get(&vd_index_ida, 0, minor_to_index(1 << MINORBITS),
|
||||
GFP_KERNEL);
|
||||
err = ida_alloc_range(&vd_index_ida, 0,
|
||||
minor_to_index(1 << MINORBITS) - 1, GFP_KERNEL);
|
||||
if (err < 0)
|
||||
goto out;
|
||||
index = err;
|
||||
@ -1163,7 +1163,7 @@ out_free_vq:
|
||||
out_free_vblk:
|
||||
kfree(vblk);
|
||||
out_free_index:
|
||||
ida_simple_remove(&vd_index_ida, index);
|
||||
ida_free(&vd_index_ida, index);
|
||||
out:
|
||||
return err;
|
||||
}
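
One clarifying note not spelled out in the hunk above: ida_simple_get() historically treated its end argument as exclusive, while ida_alloc_range() treats max as inclusive, which is why the converted call subtracts one. A sketch of the intended equivalence (not a verbatim quote from the driver):

```c
/* Old and new forms are meant to allocate from the same index range. */
index = ida_simple_get(&vd_index_ida, 0, minor_to_index(1 << MINORBITS), GFP_KERNEL);
index = ida_alloc_range(&vd_index_ida, 0, minor_to_index(1 << MINORBITS) - 1, GFP_KERNEL);
```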
|
||||
|
@ -2129,7 +2129,6 @@ static void blkfront_closing(struct blkfront_info *info)
|
||||
if (info->rq && info->gd) {
|
||||
blk_mq_stop_hw_queues(info->rq);
|
||||
blk_mark_disk_dead(info->gd);
|
||||
set_capacity(info->gd, 0);
|
||||
}
|
||||
|
||||
for_each_rinfo(info, rinfo, i) {
|
||||
|
@ -160,7 +160,7 @@ static void read_moving(struct cache_set *c)
|
||||
moving_init(io);
|
||||
bio = &io->bio.bio;
|
||||
|
||||
bio_set_op_attrs(bio, REQ_OP_READ, 0);
|
||||
bio->bi_opf = REQ_OP_READ;
|
||||
bio->bi_end_io = read_moving_endio;
|
||||
|
||||
if (bch_bio_alloc_pages(bio, GFP_KERNEL))
|
||||
|
@ -244,7 +244,7 @@ static void bch_data_insert_start(struct closure *cl)
|
||||
trace_bcache_cache_insert(k);
|
||||
bch_keylist_push(&op->insert_keys);
|
||||
|
||||
bio_set_op_attrs(n, REQ_OP_WRITE, 0);
|
||||
n->bi_opf = REQ_OP_WRITE;
|
||||
bch_submit_bbio(n, op->c, k, 0);
|
||||
} while (n != bio);
|
||||
|
||||
|
@ -434,7 +434,7 @@ static void write_dirty(struct closure *cl)
|
||||
*/
|
||||
if (KEY_DIRTY(&w->key)) {
|
||||
dirty_init(w);
|
||||
bio_set_op_attrs(&io->bio, REQ_OP_WRITE, 0);
|
||||
io->bio.bi_opf = REQ_OP_WRITE;
|
||||
io->bio.bi_iter.bi_sector = KEY_START(&w->key);
|
||||
bio_set_dev(&io->bio, io->dc->bdev);
|
||||
io->bio.bi_end_io = dirty_endio;
|
||||
@ -547,7 +547,7 @@ static void read_dirty(struct cached_dev *dc)
|
||||
io->sequence = sequence++;
|
||||
|
||||
dirty_init(w);
|
||||
bio_set_op_attrs(&io->bio, REQ_OP_READ, 0);
|
||||
io->bio.bi_opf = REQ_OP_READ;
|
||||
io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0);
|
||||
bio_set_dev(&io->bio, dc->disk.c->cache->bdev);
|
||||
io->bio.bi_end_io = read_dirty_endio;
|
||||
|
@ -1215,7 +1215,7 @@ static int dm_keyslot_evict_callback(struct dm_target *ti, struct dm_dev *dev,
|
||||
struct dm_keyslot_evict_args *args = data;
|
||||
int err;
|
||||
|
||||
err = blk_crypto_evict_key(bdev_get_queue(dev->bdev), args->key);
|
||||
err = blk_crypto_evict_key(dev->bdev, args->key);
|
||||
if (!args->err)
|
||||
args->err = err;
|
||||
/* Always try to evict the key from all devices. */
|
||||
|
@ -410,7 +410,7 @@ static void end_discard(struct discard_op *op, int r)
|
||||
* need to wait for the chain to complete.
|
||||
*/
|
||||
bio_chain(op->bio, op->parent_bio);
|
||||
bio_set_op_attrs(op->bio, REQ_OP_DISCARD, 0);
|
||||
op->bio->bi_opf = REQ_OP_DISCARD;
|
||||
submit_bio(op->bio);
|
||||
}
|
||||
|
||||
|
drivers/md/dm.c (140 lines changed)
@ -732,28 +732,48 @@ static char *_dm_claim_ptr = "I belong to device-mapper";
|
||||
/*
|
||||
* Open a table device so we can use it as a map destination.
|
||||
*/
|
||||
static int open_table_device(struct table_device *td, dev_t dev,
|
||||
struct mapped_device *md)
|
||||
static struct table_device *open_table_device(struct mapped_device *md,
|
||||
dev_t dev, fmode_t mode)
|
||||
{
|
||||
struct table_device *td;
|
||||
struct block_device *bdev;
|
||||
u64 part_off;
|
||||
int r;
|
||||
|
||||
BUG_ON(td->dm_dev.bdev);
|
||||
td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id);
|
||||
if (!td)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
refcount_set(&td->count, 1);
|
||||
|
||||
bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _dm_claim_ptr);
|
||||
if (IS_ERR(bdev))
|
||||
return PTR_ERR(bdev);
|
||||
|
||||
r = bd_link_disk_holder(bdev, dm_disk(md));
|
||||
if (r) {
|
||||
blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL);
|
||||
return r;
|
||||
bdev = blkdev_get_by_dev(dev, mode | FMODE_EXCL, _dm_claim_ptr);
|
||||
if (IS_ERR(bdev)) {
|
||||
r = PTR_ERR(bdev);
|
||||
goto out_free_td;
|
||||
}
|
||||
|
||||
/*
|
||||
* We can be called before the dm disk is added. In that case we can't
|
||||
* register the holder relation here. It will be done once add_disk was
|
||||
* called.
|
||||
*/
|
||||
if (md->disk->slave_dir) {
|
||||
r = bd_link_disk_holder(bdev, md->disk);
|
||||
if (r)
|
||||
goto out_blkdev_put;
|
||||
}
|
||||
|
||||
td->dm_dev.mode = mode;
|
||||
td->dm_dev.bdev = bdev;
|
||||
td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev, &part_off, NULL, NULL);
|
||||
return 0;
|
||||
format_dev_t(td->dm_dev.name, dev);
|
||||
list_add(&td->list, &md->table_devices);
|
||||
return td;
|
||||
|
||||
out_blkdev_put:
|
||||
blkdev_put(bdev, mode | FMODE_EXCL);
|
||||
out_free_td:
|
||||
kfree(td);
|
||||
return ERR_PTR(r);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -761,14 +781,12 @@ static int open_table_device(struct table_device *td, dev_t dev,
|
||||
*/
|
||||
static void close_table_device(struct table_device *td, struct mapped_device *md)
|
||||
{
|
||||
if (!td->dm_dev.bdev)
|
||||
return;
|
||||
|
||||
bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md));
|
||||
if (md->disk->slave_dir)
|
||||
bd_unlink_disk_holder(td->dm_dev.bdev, md->disk);
|
||||
blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL);
|
||||
put_dax(td->dm_dev.dax_dev);
|
||||
td->dm_dev.bdev = NULL;
|
||||
td->dm_dev.dax_dev = NULL;
|
||||
list_del(&td->list);
|
||||
kfree(td);
|
||||
}
|
||||
|
||||
static struct table_device *find_table_device(struct list_head *l, dev_t dev,
|
||||
@ -786,31 +804,16 @@ static struct table_device *find_table_device(struct list_head *l, dev_t dev,
|
||||
int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
|
||||
struct dm_dev **result)
|
||||
{
|
||||
int r;
|
||||
struct table_device *td;
|
||||
|
||||
mutex_lock(&md->table_devices_lock);
|
||||
td = find_table_device(&md->table_devices, dev, mode);
|
||||
if (!td) {
|
||||
td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id);
|
||||
if (!td) {
|
||||
td = open_table_device(md, dev, mode);
|
||||
if (IS_ERR(td)) {
|
||||
mutex_unlock(&md->table_devices_lock);
|
||||
return -ENOMEM;
|
||||
return PTR_ERR(td);
|
||||
}
|
||||
|
||||
td->dm_dev.mode = mode;
|
||||
td->dm_dev.bdev = NULL;
|
||||
|
||||
if ((r = open_table_device(td, dev, md))) {
|
||||
mutex_unlock(&md->table_devices_lock);
|
||||
kfree(td);
|
||||
return r;
|
||||
}
|
||||
|
||||
format_dev_t(td->dm_dev.name, dev);
|
||||
|
||||
refcount_set(&td->count, 1);
|
||||
list_add(&td->list, &md->table_devices);
|
||||
} else {
|
||||
refcount_inc(&td->count);
|
||||
}
|
||||
@ -825,27 +828,11 @@ void dm_put_table_device(struct mapped_device *md, struct dm_dev *d)
|
||||
struct table_device *td = container_of(d, struct table_device, dm_dev);
|
||||
|
||||
mutex_lock(&md->table_devices_lock);
|
||||
if (refcount_dec_and_test(&td->count)) {
|
||||
if (refcount_dec_and_test(&td->count))
|
||||
close_table_device(td, md);
|
||||
list_del(&td->list);
|
||||
kfree(td);
|
||||
}
|
||||
mutex_unlock(&md->table_devices_lock);
|
||||
}
|
||||
|
||||
static void free_table_devices(struct list_head *devices)
|
||||
{
|
||||
struct list_head *tmp, *next;
|
||||
|
||||
list_for_each_safe(tmp, next, devices) {
|
||||
struct table_device *td = list_entry(tmp, struct table_device, list);
|
||||
|
||||
DMWARN("dm_destroy: %s still exists with %d references",
|
||||
td->dm_dev.name, refcount_read(&td->count));
|
||||
kfree(td);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Get the geometry associated with a dm device
|
||||
*/
|
||||
@ -1972,8 +1959,21 @@ static void cleanup_mapped_device(struct mapped_device *md)
|
||||
md->disk->private_data = NULL;
|
||||
spin_unlock(&_minor_lock);
|
||||
if (dm_get_md_type(md) != DM_TYPE_NONE) {
|
||||
struct table_device *td;
|
||||
|
||||
dm_sysfs_exit(md);
|
||||
list_for_each_entry(td, &md->table_devices, list) {
|
||||
bd_unlink_disk_holder(td->dm_dev.bdev,
|
||||
md->disk);
|
||||
}
|
||||
|
||||
/*
|
||||
* Hold lock to make sure del_gendisk() won't concurrent
|
||||
* with open/close_table_device().
|
||||
*/
|
||||
mutex_lock(&md->table_devices_lock);
|
||||
del_gendisk(md->disk);
|
||||
mutex_unlock(&md->table_devices_lock);
|
||||
}
|
||||
dm_queue_destroy_crypto_profile(md->queue);
|
||||
put_disk(md->disk);
|
||||
@ -2122,7 +2122,7 @@ static void free_dev(struct mapped_device *md)
|
||||
|
||||
cleanup_mapped_device(md);
|
||||
|
||||
free_table_devices(&md->table_devices);
|
||||
WARN_ON_ONCE(!list_empty(&md->table_devices));
|
||||
dm_stats_cleanup(&md->stats);
|
||||
free_minor(minor);
|
||||
|
||||
@ -2305,6 +2305,7 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
|
||||
{
|
||||
enum dm_queue_mode type = dm_table_get_type(t);
|
||||
struct queue_limits limits;
|
||||
struct table_device *td;
|
||||
int r;
|
||||
|
||||
switch (type) {
|
||||
@ -2333,17 +2334,40 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
|
||||
if (r)
|
||||
return r;
|
||||
|
||||
/*
|
||||
* Hold lock to make sure add_disk() and del_gendisk() won't concurrent
|
||||
* with open_table_device() and close_table_device().
|
||||
*/
|
||||
mutex_lock(&md->table_devices_lock);
|
||||
r = add_disk(md->disk);
|
||||
mutex_unlock(&md->table_devices_lock);
|
||||
if (r)
|
||||
return r;
|
||||
|
||||
r = dm_sysfs_init(md);
|
||||
if (r) {
|
||||
del_gendisk(md->disk);
|
||||
return r;
|
||||
/*
|
||||
* Register the holder relationship for devices added before the disk
|
||||
* was live.
|
||||
*/
|
||||
list_for_each_entry(td, &md->table_devices, list) {
|
||||
r = bd_link_disk_holder(td->dm_dev.bdev, md->disk);
|
||||
if (r)
|
||||
goto out_undo_holders;
|
||||
}
|
||||
|
||||
r = dm_sysfs_init(md);
|
||||
if (r)
|
||||
goto out_undo_holders;
|
||||
|
||||
md->type = type;
|
||||
return 0;
|
||||
|
||||
out_undo_holders:
|
||||
list_for_each_entry_continue_reverse(td, &md->table_devices, list)
|
||||
bd_unlink_disk_holder(td->dm_dev.bdev, md->disk);
|
||||
mutex_lock(&md->table_devices_lock);
|
||||
del_gendisk(md->disk);
|
||||
mutex_unlock(&md->table_devices_lock);
|
||||
return r;
|
||||
}
|
||||
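dm_setup_md_queue() now registers the holder link for every table device opened before the disk went live, and on failure walks back with list_for_each_entry_continue_reverse() so only the links that were actually created get undone. A compact userspace sketch of this forward-register / reverse-unwind pattern over an array; register_one() and unregister_one() are invented for illustration.

#include <stdio.h>

#define N 4

static int register_one(int i)    { return (i == 2) ? -1 : 0; } /* fail on #2 */
static void unregister_one(int i) { printf("undo %d\n", i); }

int main(void)
{
	int i, err = 0;

	for (i = 0; i < N; i++) {
		err = register_one(i);
		if (err)
			break;
	}
	if (err) {
		/* unwind only what succeeded, in reverse order */
		while (--i >= 0)
			unregister_one(i);
	}
	return err ? 1 : 0;
}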
|
||||
struct mapped_device *dm_get_md(dev_t dev)
|
||||
|
@ -486,7 +486,7 @@ void md_bitmap_print_sb(struct bitmap *bitmap)
|
||||
sb = kmap_atomic(bitmap->storage.sb_page);
|
||||
pr_debug("%s: bitmap file superblock:\n", bmname(bitmap));
|
||||
pr_debug(" magic: %08x\n", le32_to_cpu(sb->magic));
|
||||
pr_debug(" version: %d\n", le32_to_cpu(sb->version));
|
||||
pr_debug(" version: %u\n", le32_to_cpu(sb->version));
|
||||
pr_debug(" uuid: %08x.%08x.%08x.%08x\n",
|
||||
le32_to_cpu(*(__le32 *)(sb->uuid+0)),
|
||||
le32_to_cpu(*(__le32 *)(sb->uuid+4)),
|
||||
@ -497,11 +497,11 @@ void md_bitmap_print_sb(struct bitmap *bitmap)
|
||||
pr_debug("events cleared: %llu\n",
|
||||
(unsigned long long) le64_to_cpu(sb->events_cleared));
|
||||
pr_debug(" state: %08x\n", le32_to_cpu(sb->state));
|
||||
pr_debug(" chunksize: %d B\n", le32_to_cpu(sb->chunksize));
|
||||
pr_debug(" daemon sleep: %ds\n", le32_to_cpu(sb->daemon_sleep));
|
||||
pr_debug(" chunksize: %u B\n", le32_to_cpu(sb->chunksize));
|
||||
pr_debug(" daemon sleep: %us\n", le32_to_cpu(sb->daemon_sleep));
|
||||
pr_debug(" sync size: %llu KB\n",
|
||||
(unsigned long long)le64_to_cpu(sb->sync_size)/2);
|
||||
pr_debug("max write behind: %d\n", le32_to_cpu(sb->write_behind));
|
||||
pr_debug("max write behind: %u\n", le32_to_cpu(sb->write_behind));
|
||||
kunmap_atomic(sb);
|
||||
}
|
||||
|
||||
@ -2105,7 +2105,8 @@ int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks,
|
||||
bytes = DIV_ROUND_UP(chunks, 8);
|
||||
if (!bitmap->mddev->bitmap_info.external)
|
||||
bytes += sizeof(bitmap_super_t);
|
||||
} while (bytes > (space << 9));
|
||||
} while (bytes > (space << 9) && (chunkshift + BITMAP_BLOCK_SHIFT) <
|
||||
(BITS_PER_BYTE * sizeof(((bitmap_super_t *)0)->chunksize) - 1));
|
||||
} else
|
||||
chunkshift = ffz(~chunksize) - BITMAP_BLOCK_SHIFT;
|
||||
|
||||
@ -2150,7 +2151,7 @@ int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks,
|
||||
bitmap->counts.missing_pages = pages;
|
||||
bitmap->counts.chunkshift = chunkshift;
|
||||
bitmap->counts.chunks = chunks;
|
||||
bitmap->mddev->bitmap_info.chunksize = 1 << (chunkshift +
|
||||
bitmap->mddev->bitmap_info.chunksize = 1UL << (chunkshift +
|
||||
BITMAP_BLOCK_SHIFT);
|
||||
|
||||
blocks = min(old_counts.chunks << old_counts.chunkshift,
|
||||
@ -2176,8 +2177,8 @@ int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks,
|
||||
bitmap->counts.missing_pages = old_counts.pages;
|
||||
bitmap->counts.chunkshift = old_counts.chunkshift;
|
||||
bitmap->counts.chunks = old_counts.chunks;
|
||||
bitmap->mddev->bitmap_info.chunksize = 1 << (old_counts.chunkshift +
|
||||
BITMAP_BLOCK_SHIFT);
|
||||
bitmap->mddev->bitmap_info.chunksize =
|
||||
1UL << (old_counts.chunkshift + BITMAP_BLOCK_SHIFT);
|
||||
blocks = old_counts.chunks << old_counts.chunkshift;
|
||||
pr_warn("Could not pre-allocate in-memory bitmap for cluster raid\n");
|
||||
break;
|
||||
@ -2195,20 +2196,23 @@ int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks,
|
||||
|
||||
if (set) {
|
||||
bmc_new = md_bitmap_get_counter(&bitmap->counts, block, &new_blocks, 1);
|
||||
if (*bmc_new == 0) {
|
||||
/* need to set on-disk bits too. */
|
||||
sector_t end = block + new_blocks;
|
||||
sector_t start = block >> chunkshift;
|
||||
start <<= chunkshift;
|
||||
while (start < end) {
|
||||
md_bitmap_file_set_bit(bitmap, block);
|
||||
start += 1 << chunkshift;
|
||||
if (bmc_new) {
|
||||
if (*bmc_new == 0) {
|
||||
/* need to set on-disk bits too. */
|
||||
sector_t end = block + new_blocks;
|
||||
sector_t start = block >> chunkshift;
|
||||
|
||||
start <<= chunkshift;
|
||||
while (start < end) {
|
||||
md_bitmap_file_set_bit(bitmap, block);
|
||||
start += 1 << chunkshift;
|
||||
}
|
||||
*bmc_new = 2;
|
||||
md_bitmap_count_page(&bitmap->counts, block, 1);
|
||||
md_bitmap_set_pending(&bitmap->counts, block);
|
||||
}
|
||||
*bmc_new = 2;
|
||||
md_bitmap_count_page(&bitmap->counts, block, 1);
|
||||
md_bitmap_set_pending(&bitmap->counts, block);
|
||||
*bmc_new |= NEEDED_MASK;
|
||||
}
|
||||
*bmc_new |= NEEDED_MASK;
|
||||
if (new_blocks < old_blocks)
|
||||
old_blocks = new_blocks;
|
||||
}
|
||||
@ -2534,6 +2538,9 @@ chunksize_store(struct mddev *mddev, const char *buf, size_t len)
|
||||
if (csize < 512 ||
|
||||
!is_power_of_2(csize))
|
||||
return -EINVAL;
|
||||
if (BITS_PER_LONG > 32 && csize >= (1ULL << (BITS_PER_BYTE *
|
||||
sizeof(((bitmap_super_t *)0)->chunksize))))
|
||||
return -EOVERFLOW;
|
||||
mddev->bitmap_info.chunksize = csize;
|
||||
return len;
|
||||
}
|
||||
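The added chunksize_store() check rejects values that cannot be stored back into the superblock's 32-bit chunksize field: on a 64-bit host anything at or above 2^32 bytes (4 GiB) would be silently truncated on write-out. A tiny standalone check of the bound being enforced:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	unsigned long long limit = 1ULL << (8 * sizeof(uint32_t)); /* 2^32 */
	unsigned long long csize = 1ULL << 32;                     /* 4 GiB */

	printf("limit = %llu\n", limit);
	if (csize >= limit)
		printf("csize %llu would overflow a 32-bit field\n", csize);
	return 0;
}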
|
 drivers/md/md.c | 323
@@ -93,6 +93,18 @@ static int remove_and_add_spares(struct mddev *mddev,
 				  struct md_rdev *this);
 static void mddev_detach(struct mddev *mddev);

+enum md_ro_state {
+	MD_RDWR,
+	MD_RDONLY,
+	MD_AUTO_READ,
+	MD_MAX_STATE
+};
+
+static bool md_is_rdwr(struct mddev *mddev)
+{
+	return (mddev->ro == MD_RDWR);
+}
+
 /*
  * Default number of read corrections we'll attempt on an rdev
  * before ejecting it from the array. We divide the read error
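The remainder of the md changes in this series replace the bare 0/1/2 values of mddev->ro with these named states and test writability through md_is_rdwr(). A minimal standalone illustration of swapping magic numbers for an enum plus a predicate helper; the types and names below are simplified stand-ins.

#include <stdbool.h>
#include <stdio.h>

enum ro_state { RDWR, RDONLY, AUTO_READ };

struct dev { enum ro_state ro; };

static bool is_rdwr(const struct dev *d)
{
	return d->ro == RDWR;
}

int main(void)
{
	struct dev d = { .ro = AUTO_READ };

	/* "if (d.ro == 2)" becomes a named comparison */
	if (!is_rdwr(&d))
		printf("device is not writable\n");
	return 0;
}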
@@ -444,7 +456,7 @@ static void md_submit_bio(struct bio *bio)

 	bio = bio_split_to_limits(bio);

-	if (mddev->ro == 1 && unlikely(rw == WRITE)) {
+	if (mddev->ro == MD_RDONLY && unlikely(rw == WRITE)) {
 		if (bio_sectors(bio) != 0)
 			bio->bi_status = BLK_STS_IOERR;
 		bio_endio(bio);
@@ -509,13 +521,14 @@ static void md_end_flush(struct bio *bio)
 	struct md_rdev *rdev = bio->bi_private;
 	struct mddev *mddev = rdev->mddev;

+	bio_put(bio);
+
 	rdev_dec_pending(rdev, mddev);

 	if (atomic_dec_and_test(&mddev->flush_pending)) {
 		/* The pre-request flush has finished */
 		queue_work(md_wq, &mddev->flush_work);
 	}
-	bio_put(bio);
 }

 static void md_submit_flush_data(struct work_struct *ws);
@@ -913,10 +926,12 @@ static void super_written(struct bio *bio)
 	} else
 		clear_bit(LastDev, &rdev->flags);

+	bio_put(bio);
+
+	rdev_dec_pending(rdev, mddev);
+
 	if (atomic_dec_and_test(&mddev->pending_writes))
 		wake_up(&mddev->sb_wait);
-	rdev_dec_pending(rdev, mddev);
-	bio_put(bio);
 }

 void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
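Both endio hunks above move bio_put() ahead of rdev_dec_pending(); the apparent intent, inferred from the diff rather than stated in it, is to return the bio to its pool while the reference that keeps the pool's owner alive is still held. A small release-ordering sketch with invented types:

#include <stdio.h>
#include <stdlib.h>

struct owner {
	int refs;
	void *pool;	/* stands in for the bio's mempool */
};

static void owner_put(struct owner *o)
{
	if (--o->refs == 0) {	/* last reference tears down the pool */
		free(o->pool);
		free(o);
	}
}

static void release_buffer(struct owner *o, char *buf)
{
	/* returning the buffer needs the pool to still exist */
	printf("returning buffer to pool %p\n", o->pool);
	free(buf);
}

int main(void)
{
	struct owner *o = malloc(sizeof(*o));
	char *buf = malloc(64);

	o->refs = 1;
	o->pool = malloc(64);

	release_buffer(o, buf);	/* first: give the buffer back      */
	owner_put(o);		/* then: drop the owner's reference */
	return 0;
}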
@ -2453,7 +2468,22 @@ static void rdev_delayed_delete(struct work_struct *ws)
|
||||
kobject_put(&rdev->kobj);
|
||||
}
|
||||
|
||||
static void unbind_rdev_from_array(struct md_rdev *rdev)
|
||||
void md_autodetect_dev(dev_t dev);
|
||||
|
||||
static void export_rdev(struct md_rdev *rdev)
|
||||
{
|
||||
pr_debug("md: export_rdev(%pg)\n", rdev->bdev);
|
||||
md_rdev_clear(rdev);
|
||||
#ifndef MODULE
|
||||
if (test_bit(AutoDetected, &rdev->flags))
|
||||
md_autodetect_dev(rdev->bdev->bd_dev);
|
||||
#endif
|
||||
blkdev_put(rdev->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
|
||||
rdev->bdev = NULL;
|
||||
kobject_put(&rdev->kobj);
|
||||
}
|
||||
|
||||
static void md_kick_rdev_from_array(struct md_rdev *rdev)
|
||||
{
|
||||
bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
|
||||
list_del_rcu(&rdev->same_set);
|
||||
@ -2476,56 +2506,8 @@ static void unbind_rdev_from_array(struct md_rdev *rdev)
|
||||
INIT_WORK(&rdev->del_work, rdev_delayed_delete);
|
||||
kobject_get(&rdev->kobj);
|
||||
queue_work(md_rdev_misc_wq, &rdev->del_work);
|
||||
}
|
||||
|
||||
/*
|
||||
* prevent the device from being mounted, repartitioned or
|
||||
* otherwise reused by a RAID array (or any other kernel
|
||||
* subsystem), by bd_claiming the device.
|
||||
*/
|
||||
static int lock_rdev(struct md_rdev *rdev, dev_t dev, int shared)
|
||||
{
|
||||
int err = 0;
|
||||
struct block_device *bdev;
|
||||
|
||||
bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
|
||||
shared ? (struct md_rdev *)lock_rdev : rdev);
|
||||
if (IS_ERR(bdev)) {
|
||||
pr_warn("md: could not open device unknown-block(%u,%u).\n",
|
||||
MAJOR(dev), MINOR(dev));
|
||||
return PTR_ERR(bdev);
|
||||
}
|
||||
rdev->bdev = bdev;
|
||||
return err;
|
||||
}
|
||||
|
||||
static void unlock_rdev(struct md_rdev *rdev)
|
||||
{
|
||||
struct block_device *bdev = rdev->bdev;
|
||||
rdev->bdev = NULL;
|
||||
blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
|
||||
}
|
||||
|
||||
void md_autodetect_dev(dev_t dev);
|
||||
|
||||
static void export_rdev(struct md_rdev *rdev)
|
||||
{
|
||||
pr_debug("md: export_rdev(%pg)\n", rdev->bdev);
|
||||
md_rdev_clear(rdev);
|
||||
#ifndef MODULE
|
||||
if (test_bit(AutoDetected, &rdev->flags))
|
||||
md_autodetect_dev(rdev->bdev->bd_dev);
|
||||
#endif
|
||||
unlock_rdev(rdev);
|
||||
kobject_put(&rdev->kobj);
|
||||
}
|
||||
|
||||
void md_kick_rdev_from_array(struct md_rdev *rdev)
|
||||
{
|
||||
unbind_rdev_from_array(rdev);
|
||||
export_rdev(rdev);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(md_kick_rdev_from_array);
|
||||
|
||||
static void export_array(struct mddev *mddev)
|
||||
{
|
||||
@ -2639,7 +2621,7 @@ void md_update_sb(struct mddev *mddev, int force_change)
|
||||
int any_badblocks_changed = 0;
|
||||
int ret = -1;
|
||||
|
||||
if (mddev->ro) {
|
||||
if (!md_is_rdwr(mddev)) {
|
||||
if (force_change)
|
||||
set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
|
||||
return;
|
||||
@ -3660,9 +3642,10 @@ EXPORT_SYMBOL_GPL(md_rdev_init);
|
||||
*/
|
||||
static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor)
|
||||
{
|
||||
int err;
|
||||
static struct md_rdev *claim_rdev; /* just for claiming the bdev */
|
||||
struct md_rdev *rdev;
|
||||
sector_t size;
|
||||
int err;
|
||||
|
||||
rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
|
||||
if (!rdev)
|
||||
@ -3670,14 +3653,20 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe
|
||||
|
||||
err = md_rdev_init(rdev);
|
||||
if (err)
|
||||
goto abort_free;
|
||||
goto out_free_rdev;
|
||||
err = alloc_disk_sb(rdev);
|
||||
if (err)
|
||||
goto abort_free;
|
||||
goto out_clear_rdev;
|
||||
|
||||
err = lock_rdev(rdev, newdev, super_format == -2);
|
||||
if (err)
|
||||
goto abort_free;
|
||||
rdev->bdev = blkdev_get_by_dev(newdev,
|
||||
FMODE_READ | FMODE_WRITE | FMODE_EXCL,
|
||||
super_format == -2 ? claim_rdev : rdev);
|
||||
if (IS_ERR(rdev->bdev)) {
|
||||
pr_warn("md: could not open device unknown-block(%u,%u).\n",
|
||||
MAJOR(newdev), MINOR(newdev));
|
||||
err = PTR_ERR(rdev->bdev);
|
||||
goto out_clear_rdev;
|
||||
}
|
||||
|
||||
kobject_init(&rdev->kobj, &rdev_ktype);
|
||||
|
||||
@ -3686,7 +3675,7 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe
|
||||
pr_warn("md: %pg has zero or unknown size, marking faulty!\n",
|
||||
rdev->bdev);
|
||||
err = -EINVAL;
|
||||
goto abort_free;
|
||||
goto out_blkdev_put;
|
||||
}
|
||||
|
||||
if (super_format >= 0) {
|
||||
@ -3696,21 +3685,22 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe
|
||||
pr_warn("md: %pg does not have a valid v%d.%d superblock, not importing!\n",
|
||||
rdev->bdev,
|
||||
super_format, super_minor);
|
||||
goto abort_free;
|
||||
goto out_blkdev_put;
|
||||
}
|
||||
if (err < 0) {
|
||||
pr_warn("md: could not read %pg's sb, not importing!\n",
|
||||
rdev->bdev);
|
||||
goto abort_free;
|
||||
goto out_blkdev_put;
|
||||
}
|
||||
}
|
||||
|
||||
return rdev;
|
||||
|
||||
abort_free:
|
||||
if (rdev->bdev)
|
||||
unlock_rdev(rdev);
|
||||
out_blkdev_put:
|
||||
blkdev_put(rdev->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
|
||||
out_clear_rdev:
|
||||
md_rdev_clear(rdev);
|
||||
out_free_rdev:
|
||||
kfree(rdev);
|
||||
return ERR_PTR(err);
|
||||
}
|
||||
@ -3901,7 +3891,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
|
||||
goto out_unlock;
|
||||
}
|
||||
rv = -EROFS;
|
||||
if (mddev->ro)
|
||||
if (!md_is_rdwr(mddev))
|
||||
goto out_unlock;
|
||||
|
||||
/* request to change the personality. Need to ensure:
|
||||
@ -4107,7 +4097,7 @@ layout_store(struct mddev *mddev, const char *buf, size_t len)
|
||||
if (mddev->pers) {
|
||||
if (mddev->pers->check_reshape == NULL)
|
||||
err = -EBUSY;
|
||||
else if (mddev->ro)
|
||||
else if (!md_is_rdwr(mddev))
|
||||
err = -EROFS;
|
||||
else {
|
||||
mddev->new_layout = n;
|
||||
@ -4216,7 +4206,7 @@ chunk_size_store(struct mddev *mddev, const char *buf, size_t len)
|
||||
if (mddev->pers) {
|
||||
if (mddev->pers->check_reshape == NULL)
|
||||
err = -EBUSY;
|
||||
else if (mddev->ro)
|
||||
else if (!md_is_rdwr(mddev))
|
||||
err = -EROFS;
|
||||
else {
|
||||
mddev->new_chunk_sectors = n >> 9;
|
||||
@ -4339,13 +4329,13 @@ array_state_show(struct mddev *mddev, char *page)
|
||||
|
||||
if (mddev->pers && !test_bit(MD_NOT_READY, &mddev->flags)) {
|
||||
switch(mddev->ro) {
|
||||
case 1:
|
||||
case MD_RDONLY:
|
||||
st = readonly;
|
||||
break;
|
||||
case 2:
|
||||
case MD_AUTO_READ:
|
||||
st = read_auto;
|
||||
break;
|
||||
case 0:
|
||||
case MD_RDWR:
|
||||
spin_lock(&mddev->lock);
|
||||
if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
|
||||
st = write_pending;
|
||||
@ -4381,7 +4371,8 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
|
||||
int err = 0;
|
||||
enum array_state st = match_word(buf, array_states);
|
||||
|
||||
if (mddev->pers && (st == active || st == clean) && mddev->ro != 1) {
|
||||
if (mddev->pers && (st == active || st == clean) &&
|
||||
mddev->ro != MD_RDONLY) {
|
||||
/* don't take reconfig_mutex when toggling between
|
||||
* clean and active
|
||||
*/
|
||||
@ -4425,23 +4416,23 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
|
||||
if (mddev->pers)
|
||||
err = md_set_readonly(mddev, NULL);
|
||||
else {
|
||||
mddev->ro = 1;
|
||||
mddev->ro = MD_RDONLY;
|
||||
set_disk_ro(mddev->gendisk, 1);
|
||||
err = do_md_run(mddev);
|
||||
}
|
||||
break;
|
||||
case read_auto:
|
||||
if (mddev->pers) {
|
||||
if (mddev->ro == 0)
|
||||
if (md_is_rdwr(mddev))
|
||||
err = md_set_readonly(mddev, NULL);
|
||||
else if (mddev->ro == 1)
|
||||
else if (mddev->ro == MD_RDONLY)
|
||||
err = restart_array(mddev);
|
||||
if (err == 0) {
|
||||
mddev->ro = 2;
|
||||
mddev->ro = MD_AUTO_READ;
|
||||
set_disk_ro(mddev->gendisk, 0);
|
||||
}
|
||||
} else {
|
||||
mddev->ro = 2;
|
||||
mddev->ro = MD_AUTO_READ;
|
||||
err = do_md_run(mddev);
|
||||
}
|
||||
break;
|
||||
@ -4466,7 +4457,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
|
||||
wake_up(&mddev->sb_wait);
|
||||
err = 0;
|
||||
} else {
|
||||
mddev->ro = 0;
|
||||
mddev->ro = MD_RDWR;
|
||||
set_disk_ro(mddev->gendisk, 0);
|
||||
err = do_md_run(mddev);
|
||||
}
|
||||
@ -4765,7 +4756,7 @@ action_show(struct mddev *mddev, char *page)
|
||||
if (test_bit(MD_RECOVERY_FROZEN, &recovery))
|
||||
type = "frozen";
|
||||
else if (test_bit(MD_RECOVERY_RUNNING, &recovery) ||
|
||||
(!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &recovery))) {
|
||||
(md_is_rdwr(mddev) && test_bit(MD_RECOVERY_NEEDED, &recovery))) {
|
||||
if (test_bit(MD_RECOVERY_RESHAPE, &recovery))
|
||||
type = "reshape";
|
||||
else if (test_bit(MD_RECOVERY_SYNC, &recovery)) {
|
||||
@ -4851,11 +4842,11 @@ action_store(struct mddev *mddev, const char *page, size_t len)
|
||||
set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
|
||||
set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
|
||||
}
|
||||
if (mddev->ro == 2) {
|
||||
if (mddev->ro == MD_AUTO_READ) {
|
||||
/* A write to sync_action is enough to justify
|
||||
* canceling read-auto mode
|
||||
*/
|
||||
mddev->ro = 0;
|
||||
mddev->ro = MD_RDWR;
|
||||
md_wakeup_thread(mddev->sync_thread);
|
||||
}
|
||||
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
|
||||
@ -5083,8 +5074,7 @@ max_sync_store(struct mddev *mddev, const char *buf, size_t len)
|
||||
goto out_unlock;
|
||||
|
||||
err = -EBUSY;
|
||||
if (max < mddev->resync_max &&
|
||||
mddev->ro == 0 &&
|
||||
if (max < mddev->resync_max && md_is_rdwr(mddev) &&
|
||||
test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
|
||||
goto out_unlock;
|
||||
|
||||
@ -5813,8 +5803,8 @@ int md_run(struct mddev *mddev)
|
||||
continue;
|
||||
sync_blockdev(rdev->bdev);
|
||||
invalidate_bdev(rdev->bdev);
|
||||
if (mddev->ro != 1 && rdev_read_only(rdev)) {
|
||||
mddev->ro = 1;
|
||||
if (mddev->ro != MD_RDONLY && rdev_read_only(rdev)) {
|
||||
mddev->ro = MD_RDONLY;
|
||||
if (mddev->gendisk)
|
||||
set_disk_ro(mddev->gendisk, 1);
|
||||
}
|
||||
@ -5917,8 +5907,8 @@ int md_run(struct mddev *mddev)
|
||||
|
||||
mddev->ok_start_degraded = start_dirty_degraded;
|
||||
|
||||
if (start_readonly && mddev->ro == 0)
|
||||
mddev->ro = 2; /* read-only, but switch on first write */
|
||||
if (start_readonly && md_is_rdwr(mddev))
|
||||
mddev->ro = MD_AUTO_READ; /* read-only, but switch on first write */
|
||||
|
||||
err = pers->run(mddev);
|
||||
if (err)
|
||||
@ -5996,8 +5986,8 @@ int md_run(struct mddev *mddev)
|
||||
mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action");
|
||||
mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed");
|
||||
mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded");
|
||||
} else if (mddev->ro == 2) /* auto-readonly not meaningful */
|
||||
mddev->ro = 0;
|
||||
} else if (mddev->ro == MD_AUTO_READ)
|
||||
mddev->ro = MD_RDWR;
|
||||
|
||||
atomic_set(&mddev->max_corr_read_errors,
|
||||
MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
|
||||
@ -6015,7 +6005,7 @@ int md_run(struct mddev *mddev)
|
||||
if (rdev->raid_disk >= 0)
|
||||
sysfs_link_rdev(mddev, rdev); /* failure here is OK */
|
||||
|
||||
if (mddev->degraded && !mddev->ro)
|
||||
if (mddev->degraded && md_is_rdwr(mddev))
|
||||
/* This ensures that recovering status is reported immediately
|
||||
* via sysfs - until a lack of spares is confirmed.
|
||||
*/
|
||||
@ -6105,7 +6095,7 @@ static int restart_array(struct mddev *mddev)
|
||||
return -ENXIO;
|
||||
if (!mddev->pers)
|
||||
return -EINVAL;
|
||||
if (!mddev->ro)
|
||||
if (md_is_rdwr(mddev))
|
||||
return -EBUSY;
|
||||
|
||||
rcu_read_lock();
|
||||
@ -6124,7 +6114,7 @@ static int restart_array(struct mddev *mddev)
|
||||
return -EROFS;
|
||||
|
||||
mddev->safemode = 0;
|
||||
mddev->ro = 0;
|
||||
mddev->ro = MD_RDWR;
|
||||
set_disk_ro(disk, 0);
|
||||
pr_debug("md: %s switched to read-write mode.\n", mdname(mddev));
|
||||
/* Kick recovery or resync if necessary */
|
||||
@ -6151,7 +6141,7 @@ static void md_clean(struct mddev *mddev)
|
||||
mddev->clevel[0] = 0;
|
||||
mddev->flags = 0;
|
||||
mddev->sb_flags = 0;
|
||||
mddev->ro = 0;
|
||||
mddev->ro = MD_RDWR;
|
||||
mddev->metadata_type[0] = 0;
|
||||
mddev->chunk_sectors = 0;
|
||||
mddev->ctime = mddev->utime = 0;
|
||||
@ -6203,7 +6193,7 @@ static void __md_stop_writes(struct mddev *mddev)
|
||||
}
|
||||
md_bitmap_flush(mddev);
|
||||
|
||||
if (mddev->ro == 0 &&
|
||||
if (md_is_rdwr(mddev) &&
|
||||
((!mddev->in_sync && !mddev_is_clustered(mddev)) ||
|
||||
mddev->sb_flags)) {
|
||||
/* mark array as shutdown cleanly */
|
||||
@ -6312,9 +6302,9 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
|
||||
__md_stop_writes(mddev);
|
||||
|
||||
err = -ENXIO;
|
||||
if (mddev->ro==1)
|
||||
if (mddev->ro == MD_RDONLY)
|
||||
goto out;
|
||||
mddev->ro = 1;
|
||||
mddev->ro = MD_RDONLY;
|
||||
set_disk_ro(mddev->gendisk, 1);
|
||||
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
|
||||
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
|
||||
@ -6371,7 +6361,7 @@ static int do_md_stop(struct mddev *mddev, int mode,
|
||||
return -EBUSY;
|
||||
}
|
||||
if (mddev->pers) {
|
||||
if (mddev->ro)
|
||||
if (!md_is_rdwr(mddev))
|
||||
set_disk_ro(disk, 0);
|
||||
|
||||
__md_stop_writes(mddev);
|
||||
@ -6388,8 +6378,8 @@ static int do_md_stop(struct mddev *mddev, int mode,
|
||||
mutex_unlock(&mddev->open_mutex);
|
||||
mddev->changed = 1;
|
||||
|
||||
if (mddev->ro)
|
||||
mddev->ro = 0;
|
||||
if (!md_is_rdwr(mddev))
|
||||
mddev->ro = MD_RDWR;
|
||||
} else
|
||||
mutex_unlock(&mddev->open_mutex);
|
||||
/*
|
||||
@ -7204,7 +7194,7 @@ static int update_size(struct mddev *mddev, sector_t num_sectors)
|
||||
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
|
||||
mddev->sync_thread)
|
||||
return -EBUSY;
|
||||
if (mddev->ro)
|
||||
if (!md_is_rdwr(mddev))
|
||||
return -EROFS;
|
||||
|
||||
rdev_for_each(rdev, mddev) {
|
||||
@ -7234,7 +7224,7 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks)
|
||||
/* change the number of raid disks */
|
||||
if (mddev->pers->check_reshape == NULL)
|
||||
return -EINVAL;
|
||||
if (mddev->ro)
|
||||
if (!md_is_rdwr(mddev))
|
||||
return -EROFS;
|
||||
if (raid_disks <= 0 ||
|
||||
(mddev->max_disks && raid_disks >= mddev->max_disks))
|
||||
@ -7464,6 +7454,40 @@ static inline bool md_ioctl_valid(unsigned int cmd)
|
||||
}
|
||||
}
|
||||
|
||||
static int __md_set_array_info(struct mddev *mddev, void __user *argp)
|
||||
{
|
||||
mdu_array_info_t info;
|
||||
int err;
|
||||
|
||||
if (!argp)
|
||||
memset(&info, 0, sizeof(info));
|
||||
else if (copy_from_user(&info, argp, sizeof(info)))
|
||||
return -EFAULT;
|
||||
|
||||
if (mddev->pers) {
|
||||
err = update_array_info(mddev, &info);
|
||||
if (err)
|
||||
pr_warn("md: couldn't update array info. %d\n", err);
|
||||
return err;
|
||||
}
|
||||
|
||||
if (!list_empty(&mddev->disks)) {
|
||||
pr_warn("md: array %s already has disks!\n", mdname(mddev));
|
||||
return -EBUSY;
|
||||
}
|
||||
|
||||
if (mddev->raid_disks) {
|
||||
pr_warn("md: array %s already initialised!\n", mdname(mddev));
|
||||
return -EBUSY;
|
||||
}
|
||||
|
||||
err = md_set_array_info(mddev, &info);
|
||||
if (err)
|
||||
pr_warn("md: couldn't set array info. %d\n", err);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
static int md_ioctl(struct block_device *bdev, fmode_t mode,
|
||||
unsigned int cmd, unsigned long arg)
|
||||
{
|
||||
@ -7569,36 +7593,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
|
||||
}
|
||||
|
||||
if (cmd == SET_ARRAY_INFO) {
|
||||
mdu_array_info_t info;
|
||||
if (!arg)
|
||||
memset(&info, 0, sizeof(info));
|
||||
else if (copy_from_user(&info, argp, sizeof(info))) {
|
||||
err = -EFAULT;
|
||||
goto unlock;
|
||||
}
|
||||
if (mddev->pers) {
|
||||
err = update_array_info(mddev, &info);
|
||||
if (err) {
|
||||
pr_warn("md: couldn't update array info. %d\n", err);
|
||||
goto unlock;
|
||||
}
|
||||
goto unlock;
|
||||
}
|
||||
if (!list_empty(&mddev->disks)) {
|
||||
pr_warn("md: array %s already has disks!\n", mdname(mddev));
|
||||
err = -EBUSY;
|
||||
goto unlock;
|
||||
}
|
||||
if (mddev->raid_disks) {
|
||||
pr_warn("md: array %s already initialised!\n", mdname(mddev));
|
||||
err = -EBUSY;
|
||||
goto unlock;
|
||||
}
|
||||
err = md_set_array_info(mddev, &info);
|
||||
if (err) {
|
||||
pr_warn("md: couldn't set array info. %d\n", err);
|
||||
goto unlock;
|
||||
}
|
||||
err = __md_set_array_info(mddev, argp);
|
||||
goto unlock;
|
||||
}
|
||||
|
||||
@ -7658,26 +7653,25 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
|
||||
* The remaining ioctls are changing the state of the
|
||||
* superblock, so we do not allow them on read-only arrays.
|
||||
*/
|
||||
if (mddev->ro && mddev->pers) {
|
||||
if (mddev->ro == 2) {
|
||||
mddev->ro = 0;
|
||||
sysfs_notify_dirent_safe(mddev->sysfs_state);
|
||||
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
|
||||
/* mddev_unlock will wake thread */
|
||||
/* If a device failed while we were read-only, we
|
||||
* need to make sure the metadata is updated now.
|
||||
*/
|
||||
if (test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) {
|
||||
mddev_unlock(mddev);
|
||||
wait_event(mddev->sb_wait,
|
||||
!test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) &&
|
||||
!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
|
||||
mddev_lock_nointr(mddev);
|
||||
}
|
||||
} else {
|
||||
if (!md_is_rdwr(mddev) && mddev->pers) {
|
||||
if (mddev->ro != MD_AUTO_READ) {
|
||||
err = -EROFS;
|
||||
goto unlock;
|
||||
}
|
||||
mddev->ro = MD_RDWR;
|
||||
sysfs_notify_dirent_safe(mddev->sysfs_state);
|
||||
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
|
||||
/* mddev_unlock will wake thread */
|
||||
/* If a device failed while we were read-only, we
|
||||
* need to make sure the metadata is updated now.
|
||||
*/
|
||||
if (test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) {
|
||||
mddev_unlock(mddev);
|
||||
wait_event(mddev->sb_wait,
|
||||
!test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) &&
|
||||
!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
|
||||
mddev_lock_nointr(mddev);
|
||||
}
|
||||
}
|
||||
|
||||
switch (cmd) {
|
||||
@ -7763,11 +7757,11 @@ static int md_set_read_only(struct block_device *bdev, bool ro)
|
||||
* Transitioning to read-auto need only happen for arrays that call
|
||||
* md_write_start and which are not ready for writes yet.
|
||||
*/
|
||||
if (!ro && mddev->ro == 1 && mddev->pers) {
|
||||
if (!ro && mddev->ro == MD_RDONLY && mddev->pers) {
|
||||
err = restart_array(mddev);
|
||||
if (err)
|
||||
goto out_unlock;
|
||||
mddev->ro = 2;
|
||||
mddev->ro = MD_AUTO_READ;
|
||||
}
|
||||
|
||||
out_unlock:
|
||||
@ -8241,9 +8235,9 @@ static int md_seq_show(struct seq_file *seq, void *v)
|
||||
seq_printf(seq, "%s : %sactive", mdname(mddev),
|
||||
mddev->pers ? "" : "in");
|
||||
if (mddev->pers) {
|
||||
if (mddev->ro==1)
|
||||
if (mddev->ro == MD_RDONLY)
|
||||
seq_printf(seq, " (read-only)");
|
||||
if (mddev->ro==2)
|
||||
if (mddev->ro == MD_AUTO_READ)
|
||||
seq_printf(seq, " (auto-read-only)");
|
||||
seq_printf(seq, " %s", mddev->pers->name);
|
||||
}
|
||||
@ -8502,10 +8496,10 @@ bool md_write_start(struct mddev *mddev, struct bio *bi)
|
||||
if (bio_data_dir(bi) != WRITE)
|
||||
return true;
|
||||
|
||||
BUG_ON(mddev->ro == 1);
|
||||
if (mddev->ro == 2) {
|
||||
BUG_ON(mddev->ro == MD_RDONLY);
|
||||
if (mddev->ro == MD_AUTO_READ) {
|
||||
/* need to switch to read/write */
|
||||
mddev->ro = 0;
|
||||
mddev->ro = MD_RDWR;
|
||||
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
|
||||
md_wakeup_thread(mddev->thread);
|
||||
md_wakeup_thread(mddev->sync_thread);
|
||||
@ -8556,7 +8550,7 @@ void md_write_inc(struct mddev *mddev, struct bio *bi)
|
||||
{
|
||||
if (bio_data_dir(bi) != WRITE)
|
||||
return;
|
||||
WARN_ON_ONCE(mddev->in_sync || mddev->ro);
|
||||
WARN_ON_ONCE(mddev->in_sync || !md_is_rdwr(mddev));
|
||||
percpu_ref_get(&mddev->writes_pending);
|
||||
}
|
||||
EXPORT_SYMBOL(md_write_inc);
|
||||
@ -8661,7 +8655,7 @@ void md_allow_write(struct mddev *mddev)
|
||||
{
|
||||
if (!mddev->pers)
|
||||
return;
|
||||
if (mddev->ro)
|
||||
if (!md_is_rdwr(mddev))
|
||||
return;
|
||||
if (!mddev->pers->sync_request)
|
||||
return;
|
||||
@ -8709,7 +8703,7 @@ void md_do_sync(struct md_thread *thread)
|
||||
if (test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
|
||||
test_bit(MD_RECOVERY_WAIT, &mddev->recovery))
|
||||
return;
|
||||
if (mddev->ro) {/* never try to sync a read-only array */
|
||||
if (!md_is_rdwr(mddev)) {/* never try to sync a read-only array */
|
||||
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
|
||||
return;
|
||||
}
|
||||
@ -9178,9 +9172,9 @@ static int remove_and_add_spares(struct mddev *mddev,
|
||||
if (test_bit(Faulty, &rdev->flags))
|
||||
continue;
|
||||
if (!test_bit(Journal, &rdev->flags)) {
|
||||
if (mddev->ro &&
|
||||
! (rdev->saved_raid_disk >= 0 &&
|
||||
!test_bit(Bitmap_sync, &rdev->flags)))
|
||||
if (!md_is_rdwr(mddev) &&
|
||||
!(rdev->saved_raid_disk >= 0 &&
|
||||
!test_bit(Bitmap_sync, &rdev->flags)))
|
||||
continue;
|
||||
|
||||
rdev->recovery_offset = 0;
|
||||
@ -9278,7 +9272,8 @@ void md_check_recovery(struct mddev *mddev)
|
||||
flush_signals(current);
|
||||
}
|
||||
|
||||
if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
|
||||
if (!md_is_rdwr(mddev) &&
|
||||
!test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
|
||||
return;
|
||||
if ( ! (
|
||||
(mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING)) ||
|
||||
@ -9297,7 +9292,7 @@ void md_check_recovery(struct mddev *mddev)
|
||||
if (!mddev->external && mddev->safemode == 1)
|
||||
mddev->safemode = 0;
|
||||
|
||||
if (mddev->ro) {
|
||||
if (!md_is_rdwr(mddev)) {
|
||||
struct md_rdev *rdev;
|
||||
if (!mddev->external && mddev->in_sync)
|
||||
/* 'Blocked' flag not needed as failed devices
|
||||
|
@ -782,7 +782,6 @@ extern void mddev_resume(struct mddev *mddev);
|
||||
|
||||
extern void md_reload_sb(struct mddev *mddev, int raid_disk);
|
||||
extern void md_update_sb(struct mddev *mddev, int force);
|
||||
extern void md_kick_rdev_from_array(struct md_rdev * rdev);
|
||||
extern void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
|
||||
bool is_suspend);
|
||||
extern void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
|
||||
|
@ -398,7 +398,6 @@ static int raid0_run(struct mddev *mddev)
|
||||
|
||||
blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
|
||||
blk_queue_max_write_zeroes_sectors(mddev->queue, mddev->chunk_sectors);
|
||||
blk_queue_max_discard_sectors(mddev->queue, UINT_MAX);
|
||||
|
||||
blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
|
||||
blk_queue_io_opt(mddev->queue,
|
||||
|
@ -1321,7 +1321,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
|
||||
read_bio->bi_iter.bi_sector = r1_bio->sector +
|
||||
mirror->rdev->data_offset;
|
||||
read_bio->bi_end_io = raid1_end_read_request;
|
||||
bio_set_op_attrs(read_bio, op, do_sync);
|
||||
read_bio->bi_opf = op | do_sync;
|
||||
if (test_bit(FailFast, &mirror->rdev->flags) &&
|
||||
test_bit(R1BIO_FailFast, &r1_bio->state))
|
||||
read_bio->bi_opf |= MD_FAILFAST;
|
||||
@ -2254,7 +2254,7 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
|
||||
continue;
|
||||
}
|
||||
|
||||
bio_set_op_attrs(wbio, REQ_OP_WRITE, 0);
|
||||
wbio->bi_opf = REQ_OP_WRITE;
|
||||
if (test_bit(FailFast, &conf->mirrors[i].rdev->flags))
|
||||
wbio->bi_opf |= MD_FAILFAST;
|
||||
|
||||
@ -2419,7 +2419,7 @@ static int narrow_write_error(struct r1bio *r1_bio, int i)
|
||||
GFP_NOIO, &mddev->bio_set);
|
||||
}
|
||||
|
||||
bio_set_op_attrs(wbio, REQ_OP_WRITE, 0);
|
||||
wbio->bi_opf = REQ_OP_WRITE;
|
||||
wbio->bi_iter.bi_sector = r1_bio->sector;
|
||||
wbio->bi_iter.bi_size = r1_bio->sectors << 9;
|
||||
|
||||
@ -2770,7 +2770,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
|
||||
if (i < conf->raid_disks)
|
||||
still_degraded = 1;
|
||||
} else if (!test_bit(In_sync, &rdev->flags)) {
|
||||
bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
|
||||
bio->bi_opf = REQ_OP_WRITE;
|
||||
bio->bi_end_io = end_sync_write;
|
||||
write_targets ++;
|
||||
} else {
|
||||
@ -2797,7 +2797,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
|
||||
if (disk < 0)
|
||||
disk = i;
|
||||
}
|
||||
bio_set_op_attrs(bio, REQ_OP_READ, 0);
|
||||
bio->bi_opf = REQ_OP_READ;
|
||||
bio->bi_end_io = end_sync_read;
|
||||
read_targets++;
|
||||
} else if (!test_bit(WriteErrorSeen, &rdev->flags) &&
|
||||
@ -2809,7 +2809,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
|
||||
* if we are doing resync or repair. Otherwise, leave
|
||||
* this device alone for this sync request.
|
||||
*/
|
||||
bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
|
||||
bio->bi_opf = REQ_OP_WRITE;
|
||||
bio->bi_end_io = end_sync_write;
|
||||
write_targets++;
|
||||
}
|
||||
@ -3159,6 +3159,7 @@ static int raid1_run(struct mddev *mddev)
|
||||
* RAID1 needs at least one disk in active
|
||||
*/
|
||||
if (conf->raid_disks - mddev->degraded < 1) {
|
||||
md_unregister_thread(&conf->thread);
|
||||
ret = -EINVAL;
|
||||
goto abort;
|
||||
}
|
||||
|
@ -1254,7 +1254,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
|
||||
read_bio->bi_iter.bi_sector = r10_bio->devs[slot].addr +
|
||||
choose_data_offset(r10_bio, rdev);
|
||||
read_bio->bi_end_io = raid10_end_read_request;
|
||||
bio_set_op_attrs(read_bio, op, do_sync);
|
||||
read_bio->bi_opf = op | do_sync;
|
||||
if (test_bit(FailFast, &rdev->flags) &&
|
||||
test_bit(R10BIO_FailFast, &r10_bio->state))
|
||||
read_bio->bi_opf |= MD_FAILFAST;
|
||||
@ -1301,7 +1301,7 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
|
||||
mbio->bi_iter.bi_sector = (r10_bio->devs[n_copy].addr +
|
||||
choose_data_offset(r10_bio, rdev));
|
||||
mbio->bi_end_io = raid10_end_write_request;
|
||||
bio_set_op_attrs(mbio, op, do_sync | do_fua);
|
||||
mbio->bi_opf = op | do_sync | do_fua;
|
||||
if (!replacement && test_bit(FailFast,
|
||||
&conf->mirrors[devnum].rdev->flags)
|
||||
&& enough(conf, devnum))
|
||||
@ -2933,7 +2933,7 @@ static int narrow_write_error(struct r10bio *r10_bio, int i)
|
||||
wsector = r10_bio->devs[i].addr + (sector - r10_bio->sector);
|
||||
wbio->bi_iter.bi_sector = wsector +
|
||||
choose_data_offset(r10_bio, rdev);
|
||||
bio_set_op_attrs(wbio, REQ_OP_WRITE, 0);
|
||||
wbio->bi_opf = REQ_OP_WRITE;
|
||||
|
||||
if (submit_bio_wait(wbio) < 0)
|
||||
/* Failure! */
|
||||
@ -3542,7 +3542,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
|
||||
bio->bi_next = biolist;
|
||||
biolist = bio;
|
||||
bio->bi_end_io = end_sync_read;
|
||||
bio_set_op_attrs(bio, REQ_OP_READ, 0);
|
||||
bio->bi_opf = REQ_OP_READ;
|
||||
if (test_bit(FailFast, &rdev->flags))
|
||||
bio->bi_opf |= MD_FAILFAST;
|
||||
from_addr = r10_bio->devs[j].addr;
|
||||
@ -3567,7 +3567,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
|
||||
bio->bi_next = biolist;
|
||||
biolist = bio;
|
||||
bio->bi_end_io = end_sync_write;
|
||||
bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
|
||||
bio->bi_opf = REQ_OP_WRITE;
|
||||
bio->bi_iter.bi_sector = to_addr
|
||||
+ mrdev->data_offset;
|
||||
bio_set_dev(bio, mrdev->bdev);
|
||||
@ -3588,7 +3588,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
|
||||
bio->bi_next = biolist;
|
||||
biolist = bio;
|
||||
bio->bi_end_io = end_sync_write;
|
||||
bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
|
||||
bio->bi_opf = REQ_OP_WRITE;
|
||||
bio->bi_iter.bi_sector = to_addr +
|
||||
mreplace->data_offset;
|
||||
bio_set_dev(bio, mreplace->bdev);
|
||||
@ -3742,7 +3742,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
|
||||
bio->bi_next = biolist;
|
||||
biolist = bio;
|
||||
bio->bi_end_io = end_sync_read;
|
||||
bio_set_op_attrs(bio, REQ_OP_READ, 0);
|
||||
bio->bi_opf = REQ_OP_READ;
|
||||
if (test_bit(FailFast, &rdev->flags))
|
||||
bio->bi_opf |= MD_FAILFAST;
|
||||
bio->bi_iter.bi_sector = sector + rdev->data_offset;
|
||||
@ -3764,7 +3764,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
|
||||
bio->bi_next = biolist;
|
||||
biolist = bio;
|
||||
bio->bi_end_io = end_sync_write;
|
||||
bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
|
||||
bio->bi_opf = REQ_OP_WRITE;
|
||||
if (test_bit(FailFast, &rdev->flags))
|
||||
bio->bi_opf |= MD_FAILFAST;
|
||||
bio->bi_iter.bi_sector = sector + rdev->data_offset;
|
||||
@ -4145,8 +4145,6 @@ static int raid10_run(struct mddev *mddev)
|
||||
conf->thread = NULL;
|
||||
|
||||
if (mddev->queue) {
|
||||
blk_queue_max_discard_sectors(mddev->queue,
|
||||
UINT_MAX);
|
||||
blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
|
||||
blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
|
||||
raid10_set_io_opt(conf);
|
||||
@ -4972,7 +4970,7 @@ read_more:
|
||||
b->bi_iter.bi_sector = r10_bio->devs[s/2].addr +
|
||||
rdev2->new_data_offset;
|
||||
b->bi_end_io = end_reshape_write;
|
||||
bio_set_op_attrs(b, REQ_OP_WRITE, 0);
|
||||
b->bi_opf = REQ_OP_WRITE;
|
||||
b->bi_next = blist;
|
||||
blist = b;
|
||||
}
|
||||
|
@@ -1565,11 +1565,12 @@ void r5l_wake_reclaim(struct r5l_log *log, sector_t space)

 	if (!log)
 		return;
+
+	target = READ_ONCE(log->reclaim_target);
 	do {
-		target = log->reclaim_target;
 		if (new < target)
 			return;
-	} while (cmpxchg(&log->reclaim_target, target, new) != target);
+	} while (!try_cmpxchg(&log->reclaim_target, &target, new));
 	md_wakeup_thread(log->reclaim_thread);
 }

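The cmpxchg() loop becomes try_cmpxchg(), which refreshes the expected value in place when the compare fails, so the explicit re-read at the top of the loop goes away. The same shape in portable C11 atomics, as a rough analogue rather than the kernel primitive:

#include <stdatomic.h>
#include <stdio.h>

static _Atomic long reclaim_target;

static void wake_reclaim(long new)
{
	long target = atomic_load(&reclaim_target);

	do {
		if (new < target)
			return;		/* someone already asked for more */
		/* on failure, 'target' is refreshed with the current value */
	} while (!atomic_compare_exchange_weak(&reclaim_target, &target, new));
}

int main(void)
{
	wake_reclaim(128);
	wake_reclaim(64);	/* smaller request: no update */
	printf("target = %ld\n", atomic_load(&reclaim_target));
	return 0;
}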
@@ -3061,7 +3062,6 @@ void r5c_update_on_rdev_error(struct mddev *mddev, struct md_rdev *rdev)

 int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 {
-	struct request_queue *q = bdev_get_queue(rdev->bdev);
 	struct r5l_log *log;
 	int ret;

@@ -3090,9 +3090,7 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 	if (!log)
 		return -ENOMEM;
 	log->rdev = rdev;
-
-	log->need_cache_flush = test_bit(QUEUE_FLAG_WC, &q->queue_flags) != 0;
-
+	log->need_cache_flush = bdev_write_cache(rdev->bdev);
 	log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid,
 				       sizeof(rdev->mddev->uuid));

|
@ -1301,8 +1301,6 @@ static int ppl_validate_rdev(struct md_rdev *rdev)
|
||||
|
||||
static void ppl_init_child_log(struct ppl_log *log, struct md_rdev *rdev)
|
||||
{
|
||||
struct request_queue *q;
|
||||
|
||||
if ((rdev->ppl.size << 9) >= (PPL_SPACE_SIZE +
|
||||
PPL_HEADER_SIZE) * 2) {
|
||||
log->use_multippl = true;
|
||||
@ -1316,8 +1314,7 @@ static void ppl_init_child_log(struct ppl_log *log, struct md_rdev *rdev)
|
||||
}
|
||||
log->next_io_sector = rdev->ppl.sector;
|
||||
|
||||
q = bdev_get_queue(rdev->bdev);
|
||||
if (test_bit(QUEUE_FLAG_WC, &q->queue_flags))
|
||||
if (bdev_write_cache(rdev->bdev))
|
||||
log->wb_cache_on = true;
|
||||
}
|
||||
|
||||
|
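Both raid5-cache and raid5-ppl stop testing QUEUE_FLAG_WC on the request queue by hand and ask a bdev_write_cache()-style helper instead. A generic illustration of hiding a flag word behind a small predicate; the flag name and helper below are stand-ins, not the kernel's:

#include <stdbool.h>
#include <stdio.h>

#define DEVF_WRITE_CACHE (1u << 0)

struct blockdev { unsigned int flags; };

static bool dev_write_cache(const struct blockdev *b)
{
	return b->flags & DEVF_WRITE_CACHE;
}

int main(void)
{
	struct blockdev b = { .flags = DEVF_WRITE_CACHE };

	if (dev_write_cache(&b))
		printf("needs explicit cache flushes\n");
	return 0;
}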
@ -763,7 +763,7 @@ static blk_status_t apple_nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
|
||||
goto out_free_cmd;
|
||||
}
|
||||
|
||||
blk_mq_start_request(req);
|
||||
nvme_start_request(req);
|
||||
apple_nvme_submit_cmd(q, cmnd);
|
||||
return BLK_STS_OK;
|
||||
|
||||
@ -821,7 +821,7 @@ static void apple_nvme_disable(struct apple_nvme *anv, bool shutdown)
|
||||
if (!dead && shutdown && freeze)
|
||||
nvme_wait_freeze_timeout(&anv->ctrl, NVME_IO_TIMEOUT);
|
||||
|
||||
nvme_stop_queues(&anv->ctrl);
|
||||
nvme_quiesce_io_queues(&anv->ctrl);
|
||||
|
||||
if (!dead) {
|
||||
if (READ_ONCE(anv->ioq.enabled)) {
|
||||
@ -829,15 +829,13 @@ static void apple_nvme_disable(struct apple_nvme *anv, bool shutdown)
|
||||
apple_nvme_remove_cq(anv);
|
||||
}
|
||||
|
||||
if (shutdown)
|
||||
nvme_shutdown_ctrl(&anv->ctrl);
|
||||
nvme_disable_ctrl(&anv->ctrl);
|
||||
nvme_disable_ctrl(&anv->ctrl, shutdown);
|
||||
}
|
||||
|
||||
WRITE_ONCE(anv->ioq.enabled, false);
|
||||
WRITE_ONCE(anv->adminq.enabled, false);
|
||||
mb(); /* ensure that nvme_queue_rq() sees that enabled is cleared */
|
||||
nvme_stop_admin_queue(&anv->ctrl);
|
||||
nvme_quiesce_admin_queue(&anv->ctrl);
|
||||
|
||||
/* last chance to complete any requests before nvme_cancel_request */
|
||||
spin_lock_irqsave(&anv->lock, flags);
|
||||
@ -854,8 +852,8 @@ static void apple_nvme_disable(struct apple_nvme *anv, bool shutdown)
|
||||
* deadlocking blk-mq hot-cpu notifier.
|
||||
*/
|
||||
if (shutdown) {
|
||||
nvme_start_queues(&anv->ctrl);
|
||||
nvme_start_admin_queue(&anv->ctrl);
|
||||
nvme_unquiesce_io_queues(&anv->ctrl);
|
||||
nvme_unquiesce_admin_queue(&anv->ctrl);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1093,7 +1091,7 @@ static void apple_nvme_reset_work(struct work_struct *work)
|
||||
|
||||
dev_dbg(anv->dev, "Starting admin queue");
|
||||
apple_nvme_init_queue(&anv->adminq);
|
||||
nvme_start_admin_queue(&anv->ctrl);
|
||||
nvme_unquiesce_admin_queue(&anv->ctrl);
|
||||
|
||||
if (!nvme_change_ctrl_state(&anv->ctrl, NVME_CTRL_CONNECTING)) {
|
||||
dev_warn(anv->ctrl.device,
|
||||
@ -1102,7 +1100,7 @@ static void apple_nvme_reset_work(struct work_struct *work)
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = nvme_init_ctrl_finish(&anv->ctrl);
|
||||
ret = nvme_init_ctrl_finish(&anv->ctrl, false);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
@ -1127,7 +1125,7 @@ static void apple_nvme_reset_work(struct work_struct *work)
|
||||
|
||||
anv->ctrl.queue_count = nr_io_queues + 1;
|
||||
|
||||
nvme_start_queues(&anv->ctrl);
|
||||
nvme_unquiesce_io_queues(&anv->ctrl);
|
||||
nvme_wait_freeze(&anv->ctrl);
|
||||
blk_mq_update_nr_hw_queues(&anv->tagset, 1);
|
||||
nvme_unfreeze(&anv->ctrl);
|
||||
@ -1153,7 +1151,7 @@ out:
|
||||
nvme_change_ctrl_state(&anv->ctrl, NVME_CTRL_DELETING);
|
||||
nvme_get_ctrl(&anv->ctrl);
|
||||
apple_nvme_disable(anv, false);
|
||||
nvme_kill_queues(&anv->ctrl);
|
||||
nvme_mark_namespaces_dead(&anv->ctrl);
|
||||
if (!queue_work(nvme_wq, &anv->remove_work))
|
||||
nvme_put_ctrl(&anv->ctrl);
|
||||
}
|
||||
@ -1507,14 +1505,6 @@ static int apple_nvme_probe(struct platform_device *pdev)
|
||||
goto put_dev;
|
||||
}
|
||||
|
||||
if (!blk_get_queue(anv->ctrl.admin_q)) {
|
||||
nvme_start_admin_queue(&anv->ctrl);
|
||||
blk_mq_destroy_queue(anv->ctrl.admin_q);
|
||||
anv->ctrl.admin_q = NULL;
|
||||
ret = -ENODEV;
|
||||
goto put_dev;
|
||||
}
|
||||
|
||||
nvme_reset_ctrl(&anv->ctrl);
|
||||
async_schedule(apple_nvme_async_probe, anv);
|
||||
|
||||
|
@@ -13,6 +13,10 @@
 #include "fabrics.h"
 #include <linux/nvme-auth.h>

+#define CHAP_BUF_SIZE 4096
+static struct kmem_cache *nvme_chap_buf_cache;
+static mempool_t *nvme_chap_buf_pool;
+
 struct nvme_dhchap_queue_context {
 	struct list_head entry;
 	struct work_struct auth_work;
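The 4 KiB negotiation buffer moves from a per-context kzalloc() to a kmem_cache backed mempool, so an authentication in progress can presumably always obtain a buffer even under memory pressure. A rough userspace analogue of keeping a fixed reserve of buffers; sizes and names are invented, and real mempool semantics (such as blocking until a buffer is returned) are not modelled.

#include <stdio.h>
#include <stdlib.h>

#define BUF_SIZE 4096
#define MIN_BUFS 4

static void *pool[MIN_BUFS];
static int pool_top;

static void *pool_alloc(void)
{
	if (pool_top > 0)
		return pool[--pool_top];	/* hand out a reserved buffer */
	return malloc(BUF_SIZE);		/* otherwise fall back to the heap */
}

static void pool_free(void *p)
{
	if (pool_top < MIN_BUFS)
		pool[pool_top++] = p;		/* refill the reserve first */
	else
		free(p);
}

int main(void)
{
	for (int i = 0; i < MIN_BUFS; i++)	/* pre-fill the reserve */
		pool[pool_top++] = malloc(BUF_SIZE);

	void *b = pool_alloc();
	pool_free(b);
	printf("reserved buffers: %d\n", pool_top);
	return 0;
}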
@ -20,7 +24,6 @@ struct nvme_dhchap_queue_context {
|
||||
struct crypto_shash *shash_tfm;
|
||||
struct crypto_kpp *dh_tfm;
|
||||
void *buf;
|
||||
size_t buf_size;
|
||||
int qid;
|
||||
int error;
|
||||
u32 s1;
|
||||
@@ -47,6 +50,12 @@ struct nvme_dhchap_queue_context {
 #define nvme_auth_queue_from_qid(ctrl, qid) \
 	(qid == 0) ? (ctrl)->fabrics_q : (ctrl)->connect_q

+static inline int ctrl_max_dhchaps(struct nvme_ctrl *ctrl)
+{
+	return ctrl->opts->nr_io_queues + ctrl->opts->nr_write_queues +
+			ctrl->opts->nr_poll_queues + 1;
+}
+
 static int nvme_auth_submit(struct nvme_ctrl *ctrl, int qid,
 			    void *data, size_t data_len, bool auth_send)
 {
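ctrl_max_dhchaps() sizes the new per-queue context array: one slot for every possible I/O, write and poll queue, plus one, presumably for the admin queue. For example, 8 I/O + 2 write + 1 poll queues need 8 + 2 + 1 + 1 = 12 contexts; a trivial check of that arithmetic:

#include <stdio.h>

int main(void)
{
	int nr_io = 8, nr_write = 2, nr_poll = 1;
	int max_dhchaps = nr_io + nr_write + nr_poll + 1; /* +1: admin queue */

	printf("contexts needed: %d\n", max_dhchaps);     /* prints 12 */
	return 0;
}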
@ -112,7 +121,7 @@ static int nvme_auth_set_dhchap_negotiate_data(struct nvme_ctrl *ctrl,
|
||||
struct nvmf_auth_dhchap_negotiate_data *data = chap->buf;
|
||||
size_t size = sizeof(*data) + sizeof(union nvmf_auth_protocol);
|
||||
|
||||
if (chap->buf_size < size) {
|
||||
if (size > CHAP_BUF_SIZE) {
|
||||
chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
|
||||
return -EINVAL;
|
||||
}
|
||||
@ -147,7 +156,7 @@ static int nvme_auth_process_dhchap_challenge(struct nvme_ctrl *ctrl,
|
||||
const char *gid_name = nvme_auth_dhgroup_name(data->dhgid);
|
||||
const char *hmac_name, *kpp_name;
|
||||
|
||||
if (chap->buf_size < size) {
|
||||
if (size > CHAP_BUF_SIZE) {
|
||||
chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
|
||||
return NVME_SC_INVALID_FIELD;
|
||||
}
|
||||
@ -197,12 +206,6 @@ static int nvme_auth_process_dhchap_challenge(struct nvme_ctrl *ctrl,
|
||||
return NVME_SC_AUTH_REQUIRED;
|
||||
}
|
||||
|
||||
/* Reset host response if the hash had been changed */
|
||||
if (chap->hash_id != data->hashid) {
|
||||
kfree(chap->host_response);
|
||||
chap->host_response = NULL;
|
||||
}
|
||||
|
||||
chap->hash_id = data->hashid;
|
||||
chap->hash_len = data->hl;
|
||||
dev_dbg(ctrl->device, "qid %d: selected hash %s\n",
|
||||
@ -219,14 +222,6 @@ select_kpp:
|
||||
return NVME_SC_AUTH_REQUIRED;
|
||||
}
|
||||
|
||||
/* Clear host and controller key to avoid accidental reuse */
|
||||
kfree_sensitive(chap->host_key);
|
||||
chap->host_key = NULL;
|
||||
chap->host_key_len = 0;
|
||||
kfree_sensitive(chap->ctrl_key);
|
||||
chap->ctrl_key = NULL;
|
||||
chap->ctrl_key_len = 0;
|
||||
|
||||
if (chap->dhgroup_id == data->dhgid &&
|
||||
(data->dhgid == NVME_AUTH_DHGROUP_NULL || chap->dh_tfm)) {
|
||||
dev_dbg(ctrl->device,
|
||||
@ -302,7 +297,7 @@ static int nvme_auth_set_dhchap_reply_data(struct nvme_ctrl *ctrl,
|
||||
if (chap->host_key_len)
|
||||
size += chap->host_key_len;
|
||||
|
||||
if (chap->buf_size < size) {
|
||||
if (size > CHAP_BUF_SIZE) {
|
||||
chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
|
||||
return -EINVAL;
|
||||
}
|
||||
@ -344,10 +339,10 @@ static int nvme_auth_process_dhchap_success1(struct nvme_ctrl *ctrl,
|
||||
struct nvmf_auth_dhchap_success1_data *data = chap->buf;
|
||||
size_t size = sizeof(*data);
|
||||
|
||||
if (ctrl->ctrl_key)
|
||||
if (chap->ctrl_key)
|
||||
size += chap->hash_len;
|
||||
|
||||
if (chap->buf_size < size) {
|
||||
if (size > CHAP_BUF_SIZE) {
|
||||
chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
|
||||
return NVME_SC_INVALID_FIELD;
|
||||
}
|
||||
@ -521,6 +516,7 @@ static int nvme_auth_dhchap_setup_ctrl_response(struct nvme_ctrl *ctrl,
|
||||
ret = PTR_ERR(ctrl_response);
|
||||
return ret;
|
||||
}
|
||||
|
||||
ret = crypto_shash_setkey(chap->shash_tfm,
|
||||
ctrl_response, ctrl->ctrl_key->len);
|
||||
if (ret) {
|
||||
@ -621,9 +617,6 @@ static int nvme_auth_dhchap_exponential(struct nvme_ctrl *ctrl,
|
||||
if (ret) {
|
||||
dev_dbg(ctrl->device,
|
||||
"failed to generate public key, error %d\n", ret);
|
||||
kfree(chap->host_key);
|
||||
chap->host_key = NULL;
|
||||
chap->host_key_len = 0;
|
||||
chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
|
||||
return ret;
|
||||
}
|
||||
@ -643,9 +636,6 @@ gen_sesskey:
|
||||
if (ret) {
|
||||
dev_dbg(ctrl->device,
|
||||
"failed to generate shared secret, error %d\n", ret);
|
||||
kfree_sensitive(chap->sess_key);
|
||||
chap->sess_key = NULL;
|
||||
chap->sess_key_len = 0;
|
||||
chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
|
||||
return ret;
|
||||
}
|
||||
@ -654,7 +644,7 @@ gen_sesskey:
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void __nvme_auth_reset(struct nvme_dhchap_queue_context *chap)
|
||||
static void nvme_auth_reset_dhchap(struct nvme_dhchap_queue_context *chap)
|
||||
{
|
||||
kfree_sensitive(chap->host_response);
|
||||
chap->host_response = NULL;
|
||||
@ -674,24 +664,20 @@ static void __nvme_auth_reset(struct nvme_dhchap_queue_context *chap)
|
||||
chap->transaction = 0;
|
||||
memset(chap->c1, 0, sizeof(chap->c1));
|
||||
memset(chap->c2, 0, sizeof(chap->c2));
|
||||
mempool_free(chap->buf, nvme_chap_buf_pool);
|
||||
chap->buf = NULL;
|
||||
}
|
||||
|
||||
static void __nvme_auth_free(struct nvme_dhchap_queue_context *chap)
|
||||
static void nvme_auth_free_dhchap(struct nvme_dhchap_queue_context *chap)
|
||||
{
|
||||
__nvme_auth_reset(chap);
|
||||
nvme_auth_reset_dhchap(chap);
|
||||
if (chap->shash_tfm)
|
||||
crypto_free_shash(chap->shash_tfm);
|
||||
if (chap->dh_tfm)
|
||||
crypto_free_kpp(chap->dh_tfm);
|
||||
kfree_sensitive(chap->ctrl_key);
|
||||
kfree_sensitive(chap->host_key);
|
||||
kfree_sensitive(chap->sess_key);
|
||||
kfree_sensitive(chap->host_response);
|
||||
kfree(chap->buf);
|
||||
kfree(chap);
|
||||
}
|
||||
|
||||
static void __nvme_auth_work(struct work_struct *work)
|
||||
static void nvme_queue_auth_work(struct work_struct *work)
|
||||
{
|
||||
struct nvme_dhchap_queue_context *chap =
|
||||
container_of(work, struct nvme_dhchap_queue_context, auth_work);
|
||||
@ -699,6 +685,16 @@ static void __nvme_auth_work(struct work_struct *work)
|
||||
size_t tl;
|
||||
int ret = 0;
|
||||
|
||||
/*
|
||||
* Allocate a large enough buffer for the entire negotiation:
|
||||
* 4k is enough to ffdhe8192.
|
||||
*/
|
||||
chap->buf = mempool_alloc(nvme_chap_buf_pool, GFP_KERNEL);
|
||||
if (!chap->buf) {
|
||||
chap->error = -ENOMEM;
|
||||
return;
|
||||
}
|
||||
|
||||
chap->transaction = ctrl->transaction++;
|
||||
|
||||
/* DH-HMAC-CHAP Step 1: send negotiate */
|
||||
@ -720,8 +716,9 @@ static void __nvme_auth_work(struct work_struct *work)
|
||||
dev_dbg(ctrl->device, "%s: qid %d receive challenge\n",
|
||||
__func__, chap->qid);
|
||||
|
||||
memset(chap->buf, 0, chap->buf_size);
|
||||
ret = nvme_auth_submit(ctrl, chap->qid, chap->buf, chap->buf_size, false);
|
||||
memset(chap->buf, 0, CHAP_BUF_SIZE);
|
||||
ret = nvme_auth_submit(ctrl, chap->qid, chap->buf, CHAP_BUF_SIZE,
|
||||
false);
|
||||
if (ret) {
|
||||
dev_warn(ctrl->device,
"qid %d failed to receive challenge, %s %d\n",
@@ -757,11 +754,14 @@ static void __nvme_auth_work(struct work_struct *work)

dev_dbg(ctrl->device, "%s: qid %d host response\n",
__func__, chap->qid);
mutex_lock(&ctrl->dhchap_auth_mutex);
ret = nvme_auth_dhchap_setup_host_response(ctrl, chap);
if (ret) {
mutex_unlock(&ctrl->dhchap_auth_mutex);
chap->error = ret;
goto fail2;
}
mutex_unlock(&ctrl->dhchap_auth_mutex);

/* DH-HMAC-CHAP Step 3: send reply */
dev_dbg(ctrl->device, "%s: qid %d send reply\n",
@@ -783,8 +783,9 @@ static void __nvme_auth_work(struct work_struct *work)
dev_dbg(ctrl->device, "%s: qid %d receive success1\n",
__func__, chap->qid);

memset(chap->buf, 0, chap->buf_size);
ret = nvme_auth_submit(ctrl, chap->qid, chap->buf, chap->buf_size, false);
memset(chap->buf, 0, CHAP_BUF_SIZE);
ret = nvme_auth_submit(ctrl, chap->qid, chap->buf, CHAP_BUF_SIZE,
false);
if (ret) {
dev_warn(ctrl->device,
"qid %d failed to receive success1, %s %d\n",
@@ -801,16 +802,19 @@ static void __nvme_auth_work(struct work_struct *work)
return;
}

mutex_lock(&ctrl->dhchap_auth_mutex);
if (ctrl->ctrl_key) {
dev_dbg(ctrl->device,
"%s: qid %d controller response\n",
__func__, chap->qid);
ret = nvme_auth_dhchap_setup_ctrl_response(ctrl, chap);
if (ret) {
mutex_unlock(&ctrl->dhchap_auth_mutex);
chap->error = ret;
goto fail2;
}
}
mutex_unlock(&ctrl->dhchap_auth_mutex);

ret = nvme_auth_process_dhchap_success1(ctrl, chap);
if (ret) {
@@ -819,7 +823,7 @@ static void __nvme_auth_work(struct work_struct *work)
goto fail2;
}

if (ctrl->ctrl_key) {
if (chap->ctrl_key) {
/* DH-HMAC-CHAP Step 5: send success2 */
dev_dbg(ctrl->device, "%s: qid %d send success2\n",
__func__, chap->qid);
@@ -860,42 +864,8 @@ int nvme_auth_negotiate(struct nvme_ctrl *ctrl, int qid)
return -ENOKEY;
}

mutex_lock(&ctrl->dhchap_auth_mutex);
/* Check if the context is already queued */
list_for_each_entry(chap, &ctrl->dhchap_auth_list, entry) {
WARN_ON(!chap->buf);
if (chap->qid == qid) {
dev_dbg(ctrl->device, "qid %d: re-using context\n", qid);
mutex_unlock(&ctrl->dhchap_auth_mutex);
flush_work(&chap->auth_work);
__nvme_auth_reset(chap);
queue_work(nvme_wq, &chap->auth_work);
return 0;
}
}
chap = kzalloc(sizeof(*chap), GFP_KERNEL);
if (!chap) {
mutex_unlock(&ctrl->dhchap_auth_mutex);
return -ENOMEM;
}
chap->qid = (qid == NVME_QID_ANY) ? 0 : qid;
chap->ctrl = ctrl;

/*
* Allocate a large enough buffer for the entire negotiation:
* 4k should be enough to ffdhe8192.
*/
chap->buf_size = 4096;
chap->buf = kzalloc(chap->buf_size, GFP_KERNEL);
if (!chap->buf) {
mutex_unlock(&ctrl->dhchap_auth_mutex);
kfree(chap);
return -ENOMEM;
}

INIT_WORK(&chap->auth_work, __nvme_auth_work);
list_add(&chap->entry, &ctrl->dhchap_auth_list);
mutex_unlock(&ctrl->dhchap_auth_mutex);
chap = &ctrl->dhchap_ctxs[qid];
cancel_work_sync(&chap->auth_work);
queue_work(nvme_wq, &chap->auth_work);
return 0;
}
@@ -906,40 +876,28 @@ int nvme_auth_wait(struct nvme_ctrl *ctrl, int qid)
struct nvme_dhchap_queue_context *chap;
int ret;

mutex_lock(&ctrl->dhchap_auth_mutex);
list_for_each_entry(chap, &ctrl->dhchap_auth_list, entry) {
if (chap->qid != qid)
continue;
mutex_unlock(&ctrl->dhchap_auth_mutex);
flush_work(&chap->auth_work);
ret = chap->error;
return ret;
}
mutex_unlock(&ctrl->dhchap_auth_mutex);
return -ENXIO;
chap = &ctrl->dhchap_ctxs[qid];
flush_work(&chap->auth_work);
ret = chap->error;
/* clear sensitive info */
nvme_auth_reset_dhchap(chap);
return ret;
}
EXPORT_SYMBOL_GPL(nvme_auth_wait);

void nvme_auth_reset(struct nvme_ctrl *ctrl)
{
struct nvme_dhchap_queue_context *chap;

mutex_lock(&ctrl->dhchap_auth_mutex);
list_for_each_entry(chap, &ctrl->dhchap_auth_list, entry) {
mutex_unlock(&ctrl->dhchap_auth_mutex);
flush_work(&chap->auth_work);
__nvme_auth_reset(chap);
}
mutex_unlock(&ctrl->dhchap_auth_mutex);
}
EXPORT_SYMBOL_GPL(nvme_auth_reset);

static void nvme_dhchap_auth_work(struct work_struct *work)
static void nvme_ctrl_auth_work(struct work_struct *work)
{
struct nvme_ctrl *ctrl =
container_of(work, struct nvme_ctrl, dhchap_auth_work);
int ret, q;

/*
* If the ctrl is no connected, bail as reconnect will handle
* authentication.
*/
if (ctrl->state != NVME_CTRL_LIVE)
return;

/* Authenticate admin queue first */
ret = nvme_auth_negotiate(ctrl, 0);
if (ret) {
@@ -968,43 +926,75 @@ static void nvme_dhchap_auth_work(struct work_struct *work)
* Failure is a soft-state; credentials remain valid until
* the controller terminates the connection.
*/
for (q = 1; q < ctrl->queue_count; q++) {
ret = nvme_auth_wait(ctrl, q);
if (ret)
dev_warn(ctrl->device,
"qid %d: authentication failed\n", q);
}
}

void nvme_auth_init_ctrl(struct nvme_ctrl *ctrl)
int nvme_auth_init_ctrl(struct nvme_ctrl *ctrl)
{
INIT_LIST_HEAD(&ctrl->dhchap_auth_list);
INIT_WORK(&ctrl->dhchap_auth_work, nvme_dhchap_auth_work);
struct nvme_dhchap_queue_context *chap;
int i, ret;

mutex_init(&ctrl->dhchap_auth_mutex);
INIT_WORK(&ctrl->dhchap_auth_work, nvme_ctrl_auth_work);
if (!ctrl->opts)
return;
nvme_auth_generate_key(ctrl->opts->dhchap_secret, &ctrl->host_key);
nvme_auth_generate_key(ctrl->opts->dhchap_ctrl_secret, &ctrl->ctrl_key);
return 0;
ret = nvme_auth_generate_key(ctrl->opts->dhchap_secret,
&ctrl->host_key);
if (ret)
return ret;
ret = nvme_auth_generate_key(ctrl->opts->dhchap_ctrl_secret,
&ctrl->ctrl_key);
if (ret)
goto err_free_dhchap_secret;

if (!ctrl->opts->dhchap_secret && !ctrl->opts->dhchap_ctrl_secret)
return ret;

ctrl->dhchap_ctxs = kvcalloc(ctrl_max_dhchaps(ctrl),
sizeof(*chap), GFP_KERNEL);
if (!ctrl->dhchap_ctxs) {
ret = -ENOMEM;
goto err_free_dhchap_ctrl_secret;
}

for (i = 0; i < ctrl_max_dhchaps(ctrl); i++) {
chap = &ctrl->dhchap_ctxs[i];
chap->qid = i;
chap->ctrl = ctrl;
INIT_WORK(&chap->auth_work, nvme_queue_auth_work);
}

return 0;
err_free_dhchap_ctrl_secret:
nvme_auth_free_key(ctrl->ctrl_key);
ctrl->ctrl_key = NULL;
err_free_dhchap_secret:
nvme_auth_free_key(ctrl->host_key);
ctrl->host_key = NULL;
return ret;
}
EXPORT_SYMBOL_GPL(nvme_auth_init_ctrl);

void nvme_auth_stop(struct nvme_ctrl *ctrl)
{
struct nvme_dhchap_queue_context *chap = NULL, *tmp;

cancel_work_sync(&ctrl->dhchap_auth_work);
mutex_lock(&ctrl->dhchap_auth_mutex);
list_for_each_entry_safe(chap, tmp, &ctrl->dhchap_auth_list, entry)
cancel_work_sync(&chap->auth_work);
mutex_unlock(&ctrl->dhchap_auth_mutex);
}
EXPORT_SYMBOL_GPL(nvme_auth_stop);

void nvme_auth_free(struct nvme_ctrl *ctrl)
{
struct nvme_dhchap_queue_context *chap = NULL, *tmp;
int i;

mutex_lock(&ctrl->dhchap_auth_mutex);
list_for_each_entry_safe(chap, tmp, &ctrl->dhchap_auth_list, entry) {
list_del_init(&chap->entry);
flush_work(&chap->auth_work);
__nvme_auth_free(chap);
if (ctrl->dhchap_ctxs) {
for (i = 0; i < ctrl_max_dhchaps(ctrl); i++)
nvme_auth_free_dhchap(&ctrl->dhchap_ctxs[i]);
kfree(ctrl->dhchap_ctxs);
}
mutex_unlock(&ctrl->dhchap_auth_mutex);
if (ctrl->host_key) {
nvme_auth_free_key(ctrl->host_key);
ctrl->host_key = NULL;
@@ -1015,3 +1005,27 @@ void nvme_auth_free(struct nvme_ctrl *ctrl)
}
}
EXPORT_SYMBOL_GPL(nvme_auth_free);

int __init nvme_init_auth(void)
{
nvme_chap_buf_cache = kmem_cache_create("nvme-chap-buf-cache",
CHAP_BUF_SIZE, 0, SLAB_HWCACHE_ALIGN, NULL);
if (!nvme_chap_buf_cache)
return -ENOMEM;

nvme_chap_buf_pool = mempool_create(16, mempool_alloc_slab,
mempool_free_slab, nvme_chap_buf_cache);
if (!nvme_chap_buf_pool)
goto err_destroy_chap_buf_cache;

return 0;
err_destroy_chap_buf_cache:
kmem_cache_destroy(nvme_chap_buf_cache);
return -ENOMEM;
}

void __exit nvme_exit_auth(void)
{
mempool_destroy(nvme_chap_buf_pool);
kmem_cache_destroy(nvme_chap_buf_cache);
}
@@ -384,6 +384,8 @@ static inline void nvme_end_req(struct request *req)
nvme_log_error(req);
nvme_end_req_zoned(req);
nvme_trace_bio_complete(req);
if (req->cmd_flags & REQ_NVME_MPATH)
nvme_mpath_end_request(req);
blk_mq_end_request(req, status);
}

@@ -851,8 +853,11 @@ static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns,
cmnd->write_zeroes.length =
cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);

if (!(req->cmd_flags & REQ_NOUNMAP) && (ns->features & NVME_NS_DEAC))
cmnd->write_zeroes.control |= cpu_to_le16(NVME_WZ_DEAC);

if (nvme_ns_has_pi(ns)) {
cmnd->write_zeroes.control = cpu_to_le16(NVME_RW_PRINFO_PRACT);
cmnd->write_zeroes.control |= cpu_to_le16(NVME_RW_PRINFO_PRACT);

switch (ns->pi_type) {
case NVME_NS_DPS_PI_TYPE1:
@@ -1118,11 +1123,12 @@ void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects,
nvme_unfreeze(ctrl);
nvme_mpath_unfreeze(ctrl->subsys);
mutex_unlock(&ctrl->subsys->lock);
nvme_remove_invalid_namespaces(ctrl, NVME_NSID_ALL);
mutex_unlock(&ctrl->scan_lock);
}
if (effects & NVME_CMD_EFFECTS_CCC)
nvme_init_ctrl_finish(ctrl);
if (effects & NVME_CMD_EFFECTS_CCC) {
dev_info(ctrl->device,
"controller capabilities changed, reset may be required to take effect.\n");
}
if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC)) {
nvme_queue_scan(ctrl);
flush_work(&ctrl->scan_work);
@@ -2003,6 +2009,14 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
}
}

/*
* Only set the DEAC bit if the device guarantees that reads from
* deallocated data return zeroes. While the DEAC bit does not
* require that, it must be a no-op if reads from deallocated data
* do not return zeroes.
*/
if ((id->dlfeat & 0x7) == 0x1 && (id->dlfeat & (1 << 3)))
ns->features |= NVME_NS_DEAC;
set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info));
set_bit(NVME_NS_READY, &ns->flags);
blk_mq_unfreeze_queue(ns->disk->queue);
@@ -2179,7 +2193,7 @@ const struct pr_ops nvme_pr_ops = {
};

#ifdef CONFIG_BLK_SED_OPAL
int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
static int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
bool send)
{
struct nvme_ctrl *ctrl = data;
@@ -2196,7 +2210,23 @@ int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len,
NVME_QID_ANY, 1, 0);
}
EXPORT_SYMBOL_GPL(nvme_sec_submit);

static void nvme_configure_opal(struct nvme_ctrl *ctrl, bool was_suspended)
{
if (ctrl->oacs & NVME_CTRL_OACS_SEC_SUPP) {
if (!ctrl->opal_dev)
ctrl->opal_dev = init_opal_dev(ctrl, &nvme_sec_submit);
else if (was_suspended)
opal_unlock_from_suspend(ctrl->opal_dev);
} else {
free_opal_dev(ctrl->opal_dev);
ctrl->opal_dev = NULL;
}
}
#else
static void nvme_configure_opal(struct nvme_ctrl *ctrl, bool was_suspended)
{
}
#endif /* CONFIG_BLK_SED_OPAL */

#ifdef CONFIG_BLK_DEV_ZONED
@@ -2221,16 +2251,17 @@ static const struct block_device_operations nvme_bdev_ops = {
.pr_ops = &nvme_pr_ops,
};

static int nvme_wait_ready(struct nvme_ctrl *ctrl, u32 timeout, bool enabled)
static int nvme_wait_ready(struct nvme_ctrl *ctrl, u32 mask, u32 val,
u32 timeout, const char *op)
{
unsigned long timeout_jiffies = ((timeout + 1) * HZ / 2) + jiffies;
u32 csts, bit = enabled ? NVME_CSTS_RDY : 0;
unsigned long timeout_jiffies = jiffies + timeout * HZ;
u32 csts;
int ret;

while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
if (csts == ~0)
return -ENODEV;
if ((csts & NVME_CSTS_RDY) == bit)
if ((csts & mask) == val)
break;

usleep_range(1000, 2000);
@@ -2239,7 +2270,7 @@ static int nvme_wait_ready(struct nvme_ctrl *ctrl, u32 timeout, bool enabled)
if (time_after(jiffies, timeout_jiffies)) {
dev_err(ctrl->device,
"Device not ready; aborting %s, CSTS=0x%x\n",
enabled ? "initialisation" : "reset", csts);
op, csts);
return -ENODEV;
}
}
@@ -2247,27 +2278,29 @@ static int nvme_wait_ready(struct nvme_ctrl *ctrl, u32 timeout, bool enabled)
return ret;
}

/*
* If the device has been passed off to us in an enabled state, just clear
* the enabled bit. The spec says we should set the 'shutdown notification
* bits', but doing so may cause the device to complete commands to the
* admin queue ... and we don't know what memory that might be pointing at!
*/
int nvme_disable_ctrl(struct nvme_ctrl *ctrl)
int nvme_disable_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
{
int ret;

ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
ctrl->ctrl_config &= ~NVME_CC_ENABLE;
if (shutdown)
ctrl->ctrl_config |= NVME_CC_SHN_NORMAL;
else
ctrl->ctrl_config &= ~NVME_CC_ENABLE;

ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
if (ret)
return ret;

if (shutdown) {
return nvme_wait_ready(ctrl, NVME_CSTS_SHST_MASK,
NVME_CSTS_SHST_CMPLT,
ctrl->shutdown_timeout, "shutdown");
}
if (ctrl->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY)
msleep(NVME_QUIRK_DELAY_AMOUNT);

return nvme_wait_ready(ctrl, NVME_CAP_TIMEOUT(ctrl->cap), false);
return nvme_wait_ready(ctrl, NVME_CSTS_RDY, 0,
(NVME_CAP_TIMEOUT(ctrl->cap) + 1) / 2, "reset");
}
EXPORT_SYMBOL_GPL(nvme_disable_ctrl);

@@ -2332,41 +2365,11 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl)
ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
if (ret)
return ret;
return nvme_wait_ready(ctrl, timeout, true);
return nvme_wait_ready(ctrl, NVME_CSTS_RDY, NVME_CSTS_RDY,
(timeout + 1) / 2, "initialisation");
}
EXPORT_SYMBOL_GPL(nvme_enable_ctrl);

int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl)
{
unsigned long timeout = jiffies + (ctrl->shutdown_timeout * HZ);
u32 csts;
int ret;

ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
ctrl->ctrl_config |= NVME_CC_SHN_NORMAL;

ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
if (ret)
return ret;

while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
if ((csts & NVME_CSTS_SHST_MASK) == NVME_CSTS_SHST_CMPLT)
break;

msleep(100);
if (fatal_signal_pending(current))
return -EINTR;
if (time_after(jiffies, timeout)) {
dev_err(ctrl->device,
"Device shutdown incomplete; abort shutdown\n");
return -ENODEV;
}
}

return ret;
}
EXPORT_SYMBOL_GPL(nvme_shutdown_ctrl);

static int nvme_configure_timestamp(struct nvme_ctrl *ctrl)
{
__le64 ts;
@@ -3049,7 +3052,7 @@ static int nvme_init_non_mdts_limits(struct nvme_ctrl *ctrl)

id = kzalloc(sizeof(*id), GFP_KERNEL);
if (!id)
return 0;
return -ENOMEM;

c.identify.opcode = nvme_admin_identify;
c.identify.cns = NVME_ID_CNS_CS_CTRL;
@@ -3229,7 +3232,7 @@ out_free:
* register in our nvme_ctrl structure. This should be called as soon as
* the admin queue is fully up and running.
*/
int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl)
int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl, bool was_suspended)
{
int ret;

@@ -3260,6 +3263,8 @@ int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl)
if (ret < 0)
return ret;

nvme_configure_opal(ctrl, was_suspended);

if (!ctrl->identified && !nvme_discovery_ctrl(ctrl)) {
/*
* Do not return errors unless we are in a controller reset,
@@ -3745,15 +3750,19 @@ static ssize_t nvme_ctrl_dhchap_secret_store(struct device *dev,
memcpy(dhchap_secret, buf, count);
nvme_auth_stop(ctrl);
if (strcmp(dhchap_secret, opts->dhchap_secret)) {
struct nvme_dhchap_key *key, *host_key;
int ret;

ret = nvme_auth_generate_key(dhchap_secret, &ctrl->host_key);
ret = nvme_auth_generate_key(dhchap_secret, &key);
if (ret)
return ret;
kfree(opts->dhchap_secret);
opts->dhchap_secret = dhchap_secret;
/* Key has changed; re-authentication with new key */
nvme_auth_reset(ctrl);
host_key = ctrl->host_key;
mutex_lock(&ctrl->dhchap_auth_mutex);
ctrl->host_key = key;
mutex_unlock(&ctrl->dhchap_auth_mutex);
nvme_auth_free_key(host_key);
}
/* Start re-authentication */
dev_info(ctrl->device, "re-authenticating controller\n");
@@ -3795,15 +3804,19 @@ static ssize_t nvme_ctrl_dhchap_ctrl_secret_store(struct device *dev,
memcpy(dhchap_secret, buf, count);
nvme_auth_stop(ctrl);
if (strcmp(dhchap_secret, opts->dhchap_ctrl_secret)) {
struct nvme_dhchap_key *key, *ctrl_key;
int ret;

ret = nvme_auth_generate_key(dhchap_secret, &ctrl->ctrl_key);
ret = nvme_auth_generate_key(dhchap_secret, &key);
if (ret)
return ret;
kfree(opts->dhchap_ctrl_secret);
opts->dhchap_ctrl_secret = dhchap_secret;
/* Key has changed; re-authentication with new key */
nvme_auth_reset(ctrl);
ctrl_key = ctrl->ctrl_key;
mutex_lock(&ctrl->dhchap_auth_mutex);
ctrl->ctrl_key = key;
mutex_unlock(&ctrl->dhchap_auth_mutex);
nvme_auth_free_key(ctrl_key);
}
/* Start re-authentication */
dev_info(ctrl->device, "re-authenticating controller\n");
@@ -3875,10 +3888,11 @@ static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj,
return a->mode;
}

static const struct attribute_group nvme_dev_attrs_group = {
const struct attribute_group nvme_dev_attrs_group = {
.attrs = nvme_dev_attrs,
.is_visible = nvme_dev_attrs_are_visible,
};
EXPORT_SYMBOL_GPL(nvme_dev_attrs_group);

static const struct attribute_group *nvme_dev_attr_groups[] = {
&nvme_dev_attrs_group,
@@ -4333,10 +4347,6 @@ static void nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_info *info)
{
int ret = NVME_SC_INVALID_NS | NVME_SC_DNR;

if (test_bit(NVME_NS_DEAD, &ns->flags))
goto out;

ret = NVME_SC_INVALID_NS | NVME_SC_DNR;
if (!nvme_ns_ids_equal(&ns->head->ids, &info->ids)) {
dev_err(ns->ctrl->device,
"identifiers changed for nsid %d\n", ns->head->ns_id);
@@ -4407,7 +4417,7 @@ static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,

down_write(&ctrl->namespaces_rwsem);
list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
if (ns->head->ns_id > nsid || test_bit(NVME_NS_DEAD, &ns->flags))
if (ns->head->ns_id > nsid)
list_move_tail(&ns->list, &rm_list);
}
up_write(&ctrl->namespaces_rwsem);
@@ -4424,9 +4434,6 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
u32 prev = 0;
int ret = 0, i;

if (nvme_ctrl_limited_cns(ctrl))
return -EOPNOTSUPP;

ns_list = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
if (!ns_list)
return -ENOMEM;
@@ -4534,8 +4541,18 @@ static void nvme_scan_work(struct work_struct *work)
}

mutex_lock(&ctrl->scan_lock);
if (nvme_scan_ns_list(ctrl) != 0)
if (nvme_ctrl_limited_cns(ctrl)) {
nvme_scan_ns_sequential(ctrl);
} else {
/*
* Fall back to sequential scan if DNR is set to handle broken
* devices which should support Identify NS List (as per the VS
* they report) but don't actually support it.
*/
ret = nvme_scan_ns_list(ctrl);
if (ret > 0 && ret & NVME_SC_DNR)
nvme_scan_ns_sequential(ctrl);
}
mutex_unlock(&ctrl->scan_lock);
}

@@ -4565,8 +4582,10 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
* removing the namespaces' disks; fail all the queues now to avoid
* potentially having to clean up the failed sync later.
*/
if (ctrl->state == NVME_CTRL_DEAD)
nvme_kill_queues(ctrl);
if (ctrl->state == NVME_CTRL_DEAD) {
nvme_mark_namespaces_dead(ctrl);
nvme_unquiesce_io_queues(ctrl);
}

/* this is a no-op when called from the controller reset handler */
nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING_NOIO);
@@ -4692,7 +4711,7 @@ static void nvme_fw_act_work(struct work_struct *work)
fw_act_timeout = jiffies +
msecs_to_jiffies(admin_timeout * 1000);

nvme_stop_queues(ctrl);
nvme_quiesce_io_queues(ctrl);
while (nvme_ctrl_pp_status(ctrl)) {
if (time_after(jiffies, fw_act_timeout)) {
dev_warn(ctrl->device,
@@ -4706,7 +4725,7 @@ static void nvme_fw_act_work(struct work_struct *work)
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE))
return;

nvme_start_queues(ctrl);
nvme_unquiesce_io_queues(ctrl);
/* read FW slot information to clear the AER */
nvme_get_fw_slot_info(ctrl);

@@ -4811,8 +4830,7 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
EXPORT_SYMBOL_GPL(nvme_complete_async_event);

int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
const struct blk_mq_ops *ops, unsigned int flags,
unsigned int cmd_size)
const struct blk_mq_ops *ops, unsigned int cmd_size)
{
int ret;

@@ -4822,7 +4840,9 @@ int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
if (ctrl->ops->flags & NVME_F_FABRICS)
set->reserved_tags = NVMF_RESERVED_TAGS;
set->numa_node = ctrl->numa_node;
set->flags = flags;
set->flags = BLK_MQ_F_NO_SCHED;
if (ctrl->ops->flags & NVME_F_BLOCKING)
set->flags |= BLK_MQ_F_BLOCKING;
set->cmd_size = cmd_size;
set->driver_data = ctrl;
set->nr_hw_queues = 1;
@@ -4850,6 +4870,7 @@ int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,

out_cleanup_admin_q:
blk_mq_destroy_queue(ctrl->admin_q);
blk_put_queue(ctrl->admin_q);
out_free_tagset:
blk_mq_free_tag_set(ctrl->admin_tagset);
return ret;
@@ -4859,14 +4880,17 @@ EXPORT_SYMBOL_GPL(nvme_alloc_admin_tag_set);
void nvme_remove_admin_tag_set(struct nvme_ctrl *ctrl)
{
blk_mq_destroy_queue(ctrl->admin_q);
if (ctrl->ops->flags & NVME_F_FABRICS)
blk_put_queue(ctrl->admin_q);
if (ctrl->ops->flags & NVME_F_FABRICS) {
blk_mq_destroy_queue(ctrl->fabrics_q);
blk_put_queue(ctrl->fabrics_q);
}
blk_mq_free_tag_set(ctrl->admin_tagset);
}
EXPORT_SYMBOL_GPL(nvme_remove_admin_tag_set);

int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
const struct blk_mq_ops *ops, unsigned int flags,
const struct blk_mq_ops *ops, unsigned int nr_maps,
unsigned int cmd_size)
{
int ret;
@@ -4874,15 +4898,23 @@ int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
memset(set, 0, sizeof(*set));
set->ops = ops;
set->queue_depth = ctrl->sqsize + 1;
set->reserved_tags = NVMF_RESERVED_TAGS;
/*
* Some Apple controllers requires tags to be unique across admin and
* the (only) I/O queue, so reserve the first 32 tags of the I/O queue.
*/
if (ctrl->quirks & NVME_QUIRK_SHARED_TAGS)
set->reserved_tags = NVME_AQ_DEPTH;
else if (ctrl->ops->flags & NVME_F_FABRICS)
set->reserved_tags = NVMF_RESERVED_TAGS;
set->numa_node = ctrl->numa_node;
set->flags = flags;
set->flags = BLK_MQ_F_SHOULD_MERGE;
if (ctrl->ops->flags & NVME_F_BLOCKING)
set->flags |= BLK_MQ_F_BLOCKING;
set->cmd_size = cmd_size,
set->driver_data = ctrl;
set->nr_hw_queues = ctrl->queue_count - 1;
set->timeout = NVME_IO_TIMEOUT;
if (ops->map_queues)
set->nr_maps = ctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2;
set->nr_maps = nr_maps;
ret = blk_mq_alloc_tag_set(set);
if (ret)
return ret;
@@ -4893,6 +4925,8 @@ int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
ret = PTR_ERR(ctrl->connect_q);
goto out_free_tag_set;
}
blk_queue_flag_set(QUEUE_FLAG_SKIP_TAGSET_QUIESCE,
ctrl->connect_q);
}

ctrl->tagset = set;
@@ -4906,8 +4940,10 @@ EXPORT_SYMBOL_GPL(nvme_alloc_io_tag_set);

void nvme_remove_io_tag_set(struct nvme_ctrl *ctrl)
{
if (ctrl->ops->flags & NVME_F_FABRICS)
if (ctrl->ops->flags & NVME_F_FABRICS) {
blk_mq_destroy_queue(ctrl->connect_q);
blk_put_queue(ctrl->connect_q);
}
blk_mq_free_tag_set(ctrl->tagset);
}
EXPORT_SYMBOL_GPL(nvme_remove_io_tag_set);
@@ -4943,7 +4979,7 @@ void nvme_start_ctrl(struct nvme_ctrl *ctrl)

if (ctrl->queue_count > 1) {
nvme_queue_scan(ctrl);
nvme_start_queues(ctrl);
nvme_unquiesce_io_queues(ctrl);
nvme_mpath_update(ctrl);
}

@@ -4988,6 +5024,7 @@ static void nvme_free_ctrl(struct device *dev)
nvme_auth_stop(ctrl);
nvme_auth_free(ctrl);
__free_page(ctrl->discard_page);
free_opal_dev(ctrl->opal_dev);

if (subsys) {
mutex_lock(&nvme_subsystems_lock);
@@ -5053,7 +5090,10 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
ctrl->instance);
ctrl->device->class = nvme_class;
ctrl->device->parent = ctrl->dev;
ctrl->device->groups = nvme_dev_attr_groups;
if (ops->dev_attr_groups)
ctrl->device->groups = ops->dev_attr_groups;
else
ctrl->device->groups = nvme_dev_attr_groups;
ctrl->device->release = nvme_free_ctrl;
dev_set_drvdata(ctrl->device, ctrl);
ret = dev_set_name(ctrl->device, "nvme%d", ctrl->instance);
@@ -5077,9 +5117,13 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,

nvme_fault_inject_init(&ctrl->fault_inject, dev_name(ctrl->device));
nvme_mpath_init_ctrl(ctrl);
nvme_auth_init_ctrl(ctrl);
ret = nvme_auth_init_ctrl(ctrl);
if (ret)
goto out_free_cdev;

return 0;
out_free_cdev:
cdev_device_del(&ctrl->cdev, ctrl->device);
out_free_name:
nvme_put_ctrl(ctrl);
kfree_const(ctrl->device->kobj.name);
@@ -5092,62 +5136,17 @@ out:
}
EXPORT_SYMBOL_GPL(nvme_init_ctrl);

static void nvme_start_ns_queue(struct nvme_ns *ns)
{
if (test_and_clear_bit(NVME_NS_STOPPED, &ns->flags))
blk_mq_unquiesce_queue(ns->queue);
}

static void nvme_stop_ns_queue(struct nvme_ns *ns)
{
if (!test_and_set_bit(NVME_NS_STOPPED, &ns->flags))
blk_mq_quiesce_queue(ns->queue);
else
blk_mq_wait_quiesce_done(ns->queue);
}

/*
* Prepare a queue for teardown.
*
* This must forcibly unquiesce queues to avoid blocking dispatch, and only set
* the capacity to 0 after that to avoid blocking dispatchers that may be
* holding bd_butex. This will end buffered writers dirtying pages that can't
* be synced.
*/
static void nvme_set_queue_dying(struct nvme_ns *ns)
{
if (test_and_set_bit(NVME_NS_DEAD, &ns->flags))
return;

blk_mark_disk_dead(ns->disk);
nvme_start_ns_queue(ns);

set_capacity_and_notify(ns->disk, 0);
}

/**
* nvme_kill_queues(): Ends all namespace queues
* @ctrl: the dead controller that needs to end
*
* Call this function when the driver determines it is unable to get the
* controller in a state capable of servicing IO.
*/
void nvme_kill_queues(struct nvme_ctrl *ctrl)
/* let I/O to all namespaces fail in preparation for surprise removal */
void nvme_mark_namespaces_dead(struct nvme_ctrl *ctrl)
{
struct nvme_ns *ns;

down_read(&ctrl->namespaces_rwsem);

/* Forcibly unquiesce queues to avoid blocking dispatch */
if (ctrl->admin_q && !blk_queue_dying(ctrl->admin_q))
nvme_start_admin_queue(ctrl);

list_for_each_entry(ns, &ctrl->namespaces, list)
nvme_set_queue_dying(ns);

blk_mark_disk_dead(ns->disk);
up_read(&ctrl->namespaces_rwsem);
}
EXPORT_SYMBOL_GPL(nvme_kill_queues);
EXPORT_SYMBOL_GPL(nvme_mark_namespaces_dead);

void nvme_unfreeze(struct nvme_ctrl *ctrl)
{
@@ -5197,43 +5196,41 @@ void nvme_start_freeze(struct nvme_ctrl *ctrl)
}
EXPORT_SYMBOL_GPL(nvme_start_freeze);

void nvme_stop_queues(struct nvme_ctrl *ctrl)
void nvme_quiesce_io_queues(struct nvme_ctrl *ctrl)
{
struct nvme_ns *ns;

down_read(&ctrl->namespaces_rwsem);
list_for_each_entry(ns, &ctrl->namespaces, list)
nvme_stop_ns_queue(ns);
up_read(&ctrl->namespaces_rwsem);
if (!ctrl->tagset)
return;
if (!test_and_set_bit(NVME_CTRL_STOPPED, &ctrl->flags))
blk_mq_quiesce_tagset(ctrl->tagset);
else
blk_mq_wait_quiesce_done(ctrl->tagset);
}
EXPORT_SYMBOL_GPL(nvme_stop_queues);
EXPORT_SYMBOL_GPL(nvme_quiesce_io_queues);

void nvme_start_queues(struct nvme_ctrl *ctrl)
void nvme_unquiesce_io_queues(struct nvme_ctrl *ctrl)
{
struct nvme_ns *ns;

down_read(&ctrl->namespaces_rwsem);
list_for_each_entry(ns, &ctrl->namespaces, list)
nvme_start_ns_queue(ns);
up_read(&ctrl->namespaces_rwsem);
if (!ctrl->tagset)
return;
if (test_and_clear_bit(NVME_CTRL_STOPPED, &ctrl->flags))
blk_mq_unquiesce_tagset(ctrl->tagset);
}
EXPORT_SYMBOL_GPL(nvme_start_queues);
EXPORT_SYMBOL_GPL(nvme_unquiesce_io_queues);

void nvme_stop_admin_queue(struct nvme_ctrl *ctrl)
void nvme_quiesce_admin_queue(struct nvme_ctrl *ctrl)
{
if (!test_and_set_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags))
blk_mq_quiesce_queue(ctrl->admin_q);
else
blk_mq_wait_quiesce_done(ctrl->admin_q);
blk_mq_wait_quiesce_done(ctrl->admin_q->tag_set);
}
EXPORT_SYMBOL_GPL(nvme_stop_admin_queue);
EXPORT_SYMBOL_GPL(nvme_quiesce_admin_queue);

void nvme_start_admin_queue(struct nvme_ctrl *ctrl)
void nvme_unquiesce_admin_queue(struct nvme_ctrl *ctrl)
{
if (test_and_clear_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags))
blk_mq_unquiesce_queue(ctrl->admin_q);
}
EXPORT_SYMBOL_GPL(nvme_start_admin_queue);
EXPORT_SYMBOL_GPL(nvme_unquiesce_admin_queue);

void nvme_sync_io_queues(struct nvme_ctrl *ctrl)
{
@@ -5344,8 +5341,13 @@ static int __init nvme_core_init(void)
goto unregister_generic_ns;
}

result = nvme_init_auth();
if (result)
goto destroy_ns_chr;
return 0;

destroy_ns_chr:
class_destroy(nvme_ns_chr_class);
unregister_generic_ns:
unregister_chrdev_region(nvme_ns_chr_devt, NVME_MINORS);
destroy_subsys_class:
@@ -5366,6 +5368,7 @@ out:

static void __exit nvme_core_exit(void)
{
nvme_exit_auth();
class_destroy(nvme_ns_chr_class);
class_destroy(nvme_subsys_class);
class_destroy(nvme_class);
@@ -1475,6 +1475,8 @@ nvme_fc_xmt_ls_rsp_done(struct nvmefc_ls_rsp *lsrsp)
fc_dma_unmap_single(lport->dev, lsop->rspdma,
sizeof(*lsop->rspbuf), DMA_TO_DEVICE);

kfree(lsop->rspbuf);
kfree(lsop->rqstbuf);
kfree(lsop);

nvme_fc_rport_put(rport);
@@ -1699,6 +1701,15 @@ restart:
spin_unlock_irqrestore(&rport->lock, flags);
}

static
void nvme_fc_rcv_ls_req_err_msg(struct nvme_fc_lport *lport,
struct fcnvme_ls_rqst_w0 *w0)
{
dev_info(lport->dev, "RCV %s LS failed: No memory\n",
(w0->ls_cmd <= NVME_FC_LAST_LS_CMD_VALUE) ?
nvmefc_ls_names[w0->ls_cmd] : "");
}

/**
* nvme_fc_rcv_ls_req - transport entry point called by an LLDD
* upon the reception of a NVME LS request.
@@ -1751,20 +1762,20 @@ nvme_fc_rcv_ls_req(struct nvme_fc_remote_port *portptr,
goto out_put;
}

lsop = kzalloc(sizeof(*lsop) +
sizeof(union nvmefc_ls_requests) +
sizeof(union nvmefc_ls_responses),
GFP_KERNEL);
lsop = kzalloc(sizeof(*lsop), GFP_KERNEL);
if (!lsop) {
dev_info(lport->dev,
"RCV %s LS failed: No memory\n",
(w0->ls_cmd <= NVME_FC_LAST_LS_CMD_VALUE) ?
nvmefc_ls_names[w0->ls_cmd] : "");
nvme_fc_rcv_ls_req_err_msg(lport, w0);
ret = -ENOMEM;
goto out_put;
}
lsop->rqstbuf = (union nvmefc_ls_requests *)&lsop[1];
lsop->rspbuf = (union nvmefc_ls_responses *)&lsop->rqstbuf[1];

lsop->rqstbuf = kzalloc(sizeof(*lsop->rqstbuf), GFP_KERNEL);
lsop->rspbuf = kzalloc(sizeof(*lsop->rspbuf), GFP_KERNEL);
if (!lsop->rqstbuf || !lsop->rspbuf) {
nvme_fc_rcv_ls_req_err_msg(lport, w0);
ret = -ENOMEM;
goto out_free;
}

lsop->rspdma = fc_dma_map_single(lport->dev, lsop->rspbuf,
sizeof(*lsop->rspbuf),
@@ -1801,6 +1812,8 @@ out_unmap:
fc_dma_unmap_single(lport->dev, lsop->rspdma,
sizeof(*lsop->rspbuf), DMA_TO_DEVICE);
out_free:
kfree(lsop->rspbuf);
kfree(lsop->rqstbuf);
kfree(lsop);
out_put:
nvme_fc_rport_put(rport);
@@ -2391,7 +2404,7 @@ nvme_fc_ctrl_free(struct kref *ref)
list_del(&ctrl->ctrl_list);
spin_unlock_irqrestore(&ctrl->rport->lock, flags);

nvme_start_admin_queue(&ctrl->ctrl);
nvme_unquiesce_admin_queue(&ctrl->ctrl);
nvme_remove_admin_tag_set(&ctrl->ctrl);

kfree(ctrl->queues);
@@ -2492,20 +2505,20 @@ __nvme_fc_abort_outstanding_ios(struct nvme_fc_ctrl *ctrl, bool start_queues)
* (but with error status).
*/
if (ctrl->ctrl.queue_count > 1) {
nvme_stop_queues(&ctrl->ctrl);
nvme_quiesce_io_queues(&ctrl->ctrl);
nvme_sync_io_queues(&ctrl->ctrl);
blk_mq_tagset_busy_iter(&ctrl->tag_set,
nvme_fc_terminate_exchange, &ctrl->ctrl);
blk_mq_tagset_wait_completed_request(&ctrl->tag_set);
if (start_queues)
nvme_start_queues(&ctrl->ctrl);
nvme_unquiesce_io_queues(&ctrl->ctrl);
}

/*
* Other transports, which don't have link-level contexts bound
* to sqe's, would try to gracefully shutdown the controller by
* writing the registers for shutdown and polling (call
* nvme_shutdown_ctrl()). Given a bunch of i/o was potentially
* nvme_disable_ctrl()). Given a bunch of i/o was potentially
* just aborted and we will wait on those contexts, and given
* there was no indication of how live the controlelr is on the
* link, don't send more io to create more contexts for the
@@ -2516,13 +2529,13 @@ __nvme_fc_abort_outstanding_ios(struct nvme_fc_ctrl *ctrl, bool start_queues)
/*
* clean up the admin queue. Same thing as above.
*/
nvme_stop_admin_queue(&ctrl->ctrl);
nvme_quiesce_admin_queue(&ctrl->ctrl);
blk_sync_queue(ctrl->ctrl.admin_q);
blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
nvme_fc_terminate_exchange, &ctrl->ctrl);
blk_mq_tagset_wait_completed_request(&ctrl->admin_tag_set);
if (start_queues)
nvme_start_admin_queue(&ctrl->ctrl);
nvme_unquiesce_admin_queue(&ctrl->ctrl);
}

static void
@@ -2732,7 +2745,7 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue,
atomic_set(&op->state, FCPOP_STATE_ACTIVE);

if (!(op->flags & FCOP_FLAGS_AEN))
blk_mq_start_request(op->rq);
nvme_start_request(op->rq);

cmdiu->csn = cpu_to_be32(atomic_inc_return(&queue->csn));
ret = ctrl->lport->ops->fcp_io(&ctrl->lport->localport,
@@ -2903,7 +2916,7 @@ nvme_fc_create_io_queues(struct nvme_fc_ctrl *ctrl)
nvme_fc_init_io_queues(ctrl);

ret = nvme_alloc_io_tag_set(&ctrl->ctrl, &ctrl->tag_set,
&nvme_fc_mq_ops, BLK_MQ_F_SHOULD_MERGE,
&nvme_fc_mq_ops, 1,
struct_size((struct nvme_fcp_op_w_sgl *)NULL, priv,
ctrl->lport->ops->fcprqst_priv_sz));
if (ret)
@@ -3104,9 +3117,9 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl)
ctrl->ctrl.max_hw_sectors = ctrl->ctrl.max_segments <<
(ilog2(SZ_4K) - 9);

nvme_start_admin_queue(&ctrl->ctrl);
nvme_unquiesce_admin_queue(&ctrl->ctrl);

ret = nvme_init_ctrl_finish(&ctrl->ctrl);
ret = nvme_init_ctrl_finish(&ctrl->ctrl, false);
if (ret || test_bit(ASSOC_FAILED, &ctrl->flags))
goto out_disconnect_admin_queue;

@@ -3250,10 +3263,10 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl)
nvme_fc_free_queue(&ctrl->queues[0]);

/* re-enable the admin_q so anything new can fast fail */
nvme_start_admin_queue(&ctrl->ctrl);
nvme_unquiesce_admin_queue(&ctrl->ctrl);

/* resume the io queues so that things will fast fail */
nvme_start_queues(&ctrl->ctrl);
nvme_unquiesce_io_queues(&ctrl->ctrl);

nvme_fc_ctlr_inactive_on_rport(ctrl);
}
@@ -3509,7 +3522,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
nvme_fc_init_queue(ctrl, 0);

ret = nvme_alloc_admin_tag_set(&ctrl->ctrl, &ctrl->admin_tag_set,
&nvme_fc_admin_mq_ops, BLK_MQ_F_NO_SCHED,
&nvme_fc_admin_mq_ops,
struct_size((struct nvme_fcp_op_w_sgl *)NULL, priv,
ctrl->lport->ops->fcprqst_priv_sz));
if (ret)
@@ -8,6 +8,50 @@
#include <linux/io_uring.h>
#include "nvme.h"

static bool nvme_cmd_allowed(struct nvme_ns *ns, struct nvme_command *c,
fmode_t mode)
{
if (capable(CAP_SYS_ADMIN))
return true;

/*
* Do not allow unprivileged processes to send vendor specific or fabrics
* commands as we can't be sure about their effects.
*/
if (c->common.opcode >= nvme_cmd_vendor_start ||
c->common.opcode == nvme_fabrics_command)
return false;

/*
* Do not allow unprivileged passthrough of admin commands except
* for a subset of identify commands that contain information required
* to form proper I/O commands in userspace and do not expose any
* potentially sensitive information.
*/
if (!ns) {
if (c->common.opcode == nvme_admin_identify) {
switch (c->identify.cns) {
case NVME_ID_CNS_NS:
case NVME_ID_CNS_CS_NS:
case NVME_ID_CNS_NS_CS_INDEP:
case NVME_ID_CNS_CS_CTRL:
case NVME_ID_CNS_CTRL:
return true;
}
}
return false;
}

/*
* Only allow I/O commands that transfer data to the controller if the
* special file is open for writing, but always allow I/O commands that
* transfer data from the controller.
*/
if (nvme_is_write(c))
return mode & FMODE_WRITE;
return true;
}

/*
* Convert integer values from ioctl structures to user pointers, silently
* ignoring the upper bits in the compat case to match behaviour of 32-bit
@@ -261,7 +305,7 @@ static bool nvme_validate_passthru_nsid(struct nvme_ctrl *ctrl,
}

static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
struct nvme_passthru_cmd __user *ucmd)
struct nvme_passthru_cmd __user *ucmd, fmode_t mode)
{
struct nvme_passthru_cmd cmd;
struct nvme_command c;
@@ -269,8 +313,6 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
u64 result;
int status;

if (!capable(CAP_SYS_ADMIN))
return -EACCES;
if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
return -EFAULT;
if (cmd.flags)
@@ -291,6 +333,9 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
c.common.cdw14 = cpu_to_le32(cmd.cdw14);
c.common.cdw15 = cpu_to_le32(cmd.cdw15);

if (!nvme_cmd_allowed(ns, &c, mode))
return -EACCES;

if (cmd.timeout_ms)
timeout = msecs_to_jiffies(cmd.timeout_ms);

@@ -308,15 +353,14 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
}

static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
struct nvme_passthru_cmd64 __user *ucmd, bool vec)
struct nvme_passthru_cmd64 __user *ucmd, bool vec,
fmode_t mode)
{
struct nvme_passthru_cmd64 cmd;
struct nvme_command c;
unsigned timeout = 0;
int status;

if (!capable(CAP_SYS_ADMIN))
return -EACCES;
if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
return -EFAULT;
if (cmd.flags)
@@ -337,6 +381,9 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
c.common.cdw14 = cpu_to_le32(cmd.cdw14);
c.common.cdw15 = cpu_to_le32(cmd.cdw15);

if (!nvme_cmd_allowed(ns, &c, mode))
return -EACCES;

if (cmd.timeout_ms)
timeout = msecs_to_jiffies(cmd.timeout_ms);

@@ -483,9 +530,6 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
void *meta = NULL;
int ret;

if (!capable(CAP_SYS_ADMIN))
return -EACCES;

c.common.opcode = READ_ONCE(cmd->opcode);
c.common.flags = READ_ONCE(cmd->flags);
if (c.common.flags)
@@ -507,6 +551,9 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
c.common.cdw14 = cpu_to_le32(READ_ONCE(cmd->cdw14));
c.common.cdw15 = cpu_to_le32(READ_ONCE(cmd->cdw15));

if (!nvme_cmd_allowed(ns, &c, ioucmd->file->f_mode))
return -EACCES;

d.metadata = READ_ONCE(cmd->metadata);
d.addr = READ_ONCE(cmd->addr);
d.data_len = READ_ONCE(cmd->data_len);
@@ -570,13 +617,13 @@ static bool is_ctrl_ioctl(unsigned int cmd)
}

static int nvme_ctrl_ioctl(struct nvme_ctrl *ctrl, unsigned int cmd,
void __user *argp)
void __user *argp, fmode_t mode)
{
switch (cmd) {
case NVME_IOCTL_ADMIN_CMD:
return nvme_user_cmd(ctrl, NULL, argp);
return nvme_user_cmd(ctrl, NULL, argp, mode);
case NVME_IOCTL_ADMIN64_CMD:
return nvme_user_cmd64(ctrl, NULL, argp, false);
return nvme_user_cmd64(ctrl, NULL, argp, false, mode);
default:
return sed_ioctl(ctrl->opal_dev, cmd, argp);
}
@@ -601,14 +648,14 @@ struct nvme_user_io32 {
#endif /* COMPAT_FOR_U64_ALIGNMENT */

static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned int cmd,
void __user *argp)
void __user *argp, fmode_t mode)
{
switch (cmd) {
case NVME_IOCTL_ID:
force_successful_syscall_return();
return ns->head->ns_id;
case NVME_IOCTL_IO_CMD:
return nvme_user_cmd(ns->ctrl, ns, argp);
return nvme_user_cmd(ns->ctrl, ns, argp, mode);
/*
* struct nvme_user_io can have different padding on some 32-bit ABIs.
* Just accept the compat version as all fields that are used are the
@@ -620,19 +667,20 @@ static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned int cmd,
case NVME_IOCTL_SUBMIT_IO:
return nvme_submit_io(ns, argp);
case NVME_IOCTL_IO64_CMD:
return nvme_user_cmd64(ns->ctrl, ns, argp, false);
return nvme_user_cmd64(ns->ctrl, ns, argp, false, mode);
case NVME_IOCTL_IO64_CMD_VEC:
return nvme_user_cmd64(ns->ctrl, ns, argp, true);
return nvme_user_cmd64(ns->ctrl, ns, argp, true, mode);
default:
return -ENOTTY;
}
}

static int __nvme_ioctl(struct nvme_ns *ns, unsigned int cmd, void __user *arg)
static int __nvme_ioctl(struct nvme_ns *ns, unsigned int cmd, void __user *arg,
fmode_t mode)
{
if (is_ctrl_ioctl(cmd))
return nvme_ctrl_ioctl(ns->ctrl, cmd, arg);
return nvme_ns_ioctl(ns, cmd, arg);
if (is_ctrl_ioctl(cmd))
return nvme_ctrl_ioctl(ns->ctrl, cmd, arg, mode);
return nvme_ns_ioctl(ns, cmd, arg, mode);
}

int nvme_ioctl(struct block_device *bdev, fmode_t mode,
@@ -640,7 +688,7 @@ int nvme_ioctl(struct block_device *bdev, fmode_t mode,
{
struct nvme_ns *ns = bdev->bd_disk->private_data;

return __nvme_ioctl(ns, cmd, (void __user *)arg);
return __nvme_ioctl(ns, cmd, (void __user *)arg, mode);
}

long nvme_ns_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
@@ -648,7 +696,7 @@ long nvme_ns_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
struct nvme_ns *ns =
container_of(file_inode(file)->i_cdev, struct nvme_ns, cdev);

return __nvme_ioctl(ns, cmd, (void __user *)arg);
return __nvme_ioctl(ns, cmd, (void __user *)arg, file->f_mode);
}

static int nvme_uring_cmd_checks(unsigned int issue_flags)
@@ -716,7 +764,8 @@ int nvme_ns_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd,
}
#ifdef CONFIG_NVME_MULTIPATH
static int nvme_ns_head_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd,
void __user *argp, struct nvme_ns_head *head, int srcu_idx)
void __user *argp, struct nvme_ns_head *head, int srcu_idx,
fmode_t mode)
__releases(&head->srcu)
{
struct nvme_ctrl *ctrl = ns->ctrl;
@@ -724,7 +773,7 @@ static int nvme_ns_head_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd,

nvme_get_ctrl(ns->ctrl);
srcu_read_unlock(&head->srcu, srcu_idx);
ret = nvme_ctrl_ioctl(ns->ctrl, cmd, argp);
ret = nvme_ctrl_ioctl(ns->ctrl, cmd, argp, mode);

nvme_put_ctrl(ctrl);
return ret;
@@ -749,9 +798,10 @@ int nvme_ns_head_ioctl(struct block_device *bdev, fmode_t mode,
* deadlock when deleting namespaces using the passthrough interface.
*/
if (is_ctrl_ioctl(cmd))
return nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx);
return nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx,
mode);

ret = nvme_ns_ioctl(ns, cmd, argp);
ret = nvme_ns_ioctl(ns, cmd, argp, mode);
out_unlock:
srcu_read_unlock(&head->srcu, srcu_idx);
return ret;
@@ -773,9 +823,10 @@ long nvme_ns_head_chr_ioctl(struct file *file, unsigned int cmd,
goto out_unlock;

if (is_ctrl_ioctl(cmd))
return nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx);
return nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx,
file->f_mode);

ret = nvme_ns_ioctl(ns, cmd, argp);
ret = nvme_ns_ioctl(ns, cmd, argp, file->f_mode);
out_unlock:
srcu_read_unlock(&head->srcu, srcu_idx);
return ret;
@@ -849,7 +900,8 @@ int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags)
return ret;
}

static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp)
static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp,
fmode_t mode)
{
struct nvme_ns *ns;
int ret;
@@ -873,7 +925,7 @@ static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp)
kref_get(&ns->kref);
up_read(&ctrl->namespaces_rwsem);

ret = nvme_user_cmd(ctrl, ns, argp);
ret = nvme_user_cmd(ctrl, ns, argp, mode);
nvme_put_ns(ns);
return ret;

@@ -890,11 +942,11 @@ long nvme_dev_ioctl(struct file *file, unsigned int cmd,

switch (cmd) {
case NVME_IOCTL_ADMIN_CMD:
return nvme_user_cmd(ctrl, NULL, argp);
return nvme_user_cmd(ctrl, NULL, argp, file->f_mode);
case NVME_IOCTL_ADMIN64_CMD:
return nvme_user_cmd64(ctrl, NULL, argp, false);
return nvme_user_cmd64(ctrl, NULL, argp, false, file->f_mode);
case NVME_IOCTL_IO_CMD:
return nvme_dev_user_cmd(ctrl, argp);
return nvme_dev_user_cmd(ctrl, argp, file->f_mode);
case NVME_IOCTL_RESET:
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
@@ -114,6 +114,31 @@ void nvme_failover_req(struct request *req)
kblockd_schedule_work(&ns->head->requeue_work);
}

void nvme_mpath_start_request(struct request *rq)
{
struct nvme_ns *ns = rq->q->queuedata;
struct gendisk *disk = ns->head->disk;

if (!blk_queue_io_stat(disk->queue) || blk_rq_is_passthrough(rq))
return;

nvme_req(rq)->flags |= NVME_MPATH_IO_STATS;
nvme_req(rq)->start_time = bdev_start_io_acct(disk->part0,
blk_rq_bytes(rq) >> SECTOR_SHIFT,
req_op(rq), jiffies);
}
EXPORT_SYMBOL_GPL(nvme_mpath_start_request);

void nvme_mpath_end_request(struct request *rq)
{
struct nvme_ns *ns = rq->q->queuedata;

if (!(nvme_req(rq)->flags & NVME_MPATH_IO_STATS))
return;
bdev_end_io_acct(ns->head->disk->part0, req_op(rq),
nvme_req(rq)->start_time);
}

void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
{
struct nvme_ns *ns;
@@ -506,6 +531,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)

blk_queue_flag_set(QUEUE_FLAG_NONROT, head->disk->queue);
blk_queue_flag_set(QUEUE_FLAG_NOWAIT, head->disk->queue);
blk_queue_flag_set(QUEUE_FLAG_IO_STAT, head->disk->queue);
/*
* This assumes all controllers that refer to a namespace either
* support poll queues or not. That is not a strict guarantee,
@@ -162,6 +162,9 @@ struct nvme_request {
u8 retries;
u8 flags;
u16 status;
#ifdef CONFIG_NVME_MULTIPATH
unsigned long start_time;
#endif
struct nvme_ctrl *ctrl;
};

@@ -173,6 +176,7 @@ struct nvme_request {
enum {
NVME_REQ_CANCELLED = (1 << 0),
NVME_REQ_USERCMD = (1 << 1),
NVME_MPATH_IO_STATS = (1 << 2),
};

static inline struct nvme_request *nvme_req(struct request *req)
@@ -237,6 +241,7 @@ enum nvme_ctrl_flags {
NVME_CTRL_FAILFAST_EXPIRED = 0,
NVME_CTRL_ADMIN_Q_STOPPED = 1,
NVME_CTRL_STARTED_ONCE = 2,
NVME_CTRL_STOPPED = 3,
};

struct nvme_ctrl {
@@ -336,8 +341,8 @@ struct nvme_ctrl {

#ifdef CONFIG_NVME_AUTH
struct work_struct dhchap_auth_work;
struct list_head dhchap_auth_list;
struct mutex dhchap_auth_mutex;
struct nvme_dhchap_queue_context *dhchap_ctxs;
struct nvme_dhchap_key *host_key;
struct nvme_dhchap_key *ctrl_key;
u16 transaction;
@@ -454,6 +459,7 @@ static inline bool nvme_ns_head_multipath(struct nvme_ns_head *head)
enum nvme_ns_features {
NVME_NS_EXT_LBAS = 1 << 0, /* support extended LBA format */
NVME_NS_METADATA_SUPPORTED = 1 << 1, /* support getting generated md */
NVME_NS_DEAC, /* DEAC bit in Write Zeores supported */
};

struct nvme_ns {
@@ -483,11 +489,9 @@ struct nvme_ns {
unsigned long features;
unsigned long flags;
#define NVME_NS_REMOVING 0
#define NVME_NS_DEAD 1
#define NVME_NS_ANA_PENDING 2
#define NVME_NS_FORCE_RO 3
#define NVME_NS_READY 4
#define NVME_NS_STOPPED 5

struct cdev cdev;
struct device cdev_device;
@@ -508,6 +512,9 @@ struct nvme_ctrl_ops {
unsigned int flags;
#define NVME_F_FABRICS (1 << 0)
#define NVME_F_METADATA_SUPPORTED (1 << 1)
#define NVME_F_BLOCKING (1 << 2)

const struct attribute_group **dev_attr_groups;
int (*reg_read32)(struct nvme_ctrl *ctrl, u32 off, u32 *val);
int (*reg_write32)(struct nvme_ctrl *ctrl, u32 off, u32 val);
int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val);
@@ -728,37 +735,32 @@ void nvme_cancel_tagset(struct nvme_ctrl *ctrl);
void nvme_cancel_admin_tagset(struct nvme_ctrl *ctrl);
bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
enum nvme_ctrl_state new_state);
int nvme_disable_ctrl(struct nvme_ctrl *ctrl);
int nvme_disable_ctrl(struct nvme_ctrl *ctrl, bool shutdown);
int nvme_enable_ctrl(struct nvme_ctrl *ctrl);
int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl);
int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
const struct nvme_ctrl_ops *ops, unsigned long quirks);
void nvme_uninit_ctrl(struct nvme_ctrl *ctrl);
void nvme_start_ctrl(struct nvme_ctrl *ctrl);
void nvme_stop_ctrl(struct nvme_ctrl *ctrl);
int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl);
int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl, bool was_suspended);
int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
const struct blk_mq_ops *ops, unsigned int flags,
unsigned int cmd_size);
const struct blk_mq_ops *ops, unsigned int cmd_size);
void nvme_remove_admin_tag_set(struct nvme_ctrl *ctrl);
int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
const struct blk_mq_ops *ops, unsigned int flags,
const struct blk_mq_ops *ops, unsigned int nr_maps,
unsigned int cmd_size);
void nvme_remove_io_tag_set(struct nvme_ctrl *ctrl);

void nvme_remove_namespaces(struct nvme_ctrl *ctrl);

int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
bool send);

void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
volatile union nvme_result *res);

void nvme_stop_queues(struct nvme_ctrl *ctrl);
void nvme_start_queues(struct nvme_ctrl *ctrl);
void nvme_stop_admin_queue(struct nvme_ctrl *ctrl);
void nvme_start_admin_queue(struct nvme_ctrl *ctrl);
void nvme_kill_queues(struct nvme_ctrl *ctrl);
void nvme_quiesce_io_queues(struct nvme_ctrl *ctrl);
void nvme_unquiesce_io_queues(struct nvme_ctrl *ctrl);
void nvme_quiesce_admin_queue(struct nvme_ctrl *ctrl);
void nvme_unquiesce_admin_queue(struct nvme_ctrl *ctrl);
void nvme_mark_namespaces_dead(struct nvme_ctrl *ctrl);
void nvme_sync_queues(struct nvme_ctrl *ctrl);
void nvme_sync_io_queues(struct nvme_ctrl *ctrl);
void nvme_unfreeze(struct nvme_ctrl *ctrl);
@@ -857,6 +859,7 @@ int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags);
extern const struct attribute_group *nvme_ns_id_attr_groups[];
extern const struct pr_ops nvme_pr_ops;
extern const struct block_device_operations nvme_ns_head_ops;
extern const struct attribute_group nvme_dev_attrs_group;

struct nvme_ns *nvme_find_path(struct nvme_ns_head *head);
#ifdef CONFIG_NVME_MULTIPATH
@@ -883,6 +886,8 @@ bool nvme_mpath_clear_current_path(struct nvme_ns *ns);
void nvme_mpath_revalidate_paths(struct nvme_ns *ns);
void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl);
void nvme_mpath_shutdown_disk(struct nvme_ns_head *head);
void nvme_mpath_start_request(struct request *rq);
void nvme_mpath_end_request(struct request *rq);

static inline void nvme_trace_bio_complete(struct request *req)
{
@@ -968,6 +973,12 @@ static inline void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
static inline void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys)
{
}
static inline void nvme_mpath_start_request(struct request *rq)
{
}
static inline void nvme_mpath_end_request(struct request *rq)
{
}
#endif /* CONFIG_NVME_MULTIPATH */

int nvme_revalidate_zones(struct nvme_ns *ns);
@@ -1013,20 +1024,38 @@ static inline void nvme_hwmon_exit(struct nvme_ctrl *ctrl)
}
#endif

static inline void nvme_start_request(struct request *rq)
{
if (rq->cmd_flags & REQ_NVME_MPATH)
nvme_mpath_start_request(rq);
blk_mq_start_request(rq);
}

static inline bool nvme_ctrl_sgl_supported(struct nvme_ctrl *ctrl)
{
return ctrl->sgls & ((1 << 0) | (1 << 1));
}

#ifdef CONFIG_NVME_AUTH
void nvme_auth_init_ctrl(struct nvme_ctrl *ctrl);
int __init nvme_init_auth(void);
void __exit nvme_exit_auth(void);
int nvme_auth_init_ctrl(struct nvme_ctrl *ctrl);
void nvme_auth_stop(struct nvme_ctrl *ctrl);
int nvme_auth_negotiate(struct nvme_ctrl *ctrl, int qid);
int nvme_auth_wait(struct nvme_ctrl *ctrl, int qid);
void nvme_auth_reset(struct nvme_ctrl *ctrl);
void nvme_auth_free(struct nvme_ctrl *ctrl);
#else
static inline void nvme_auth_init_ctrl(struct nvme_ctrl *ctrl) {};
static inline int nvme_auth_init_ctrl(struct nvme_ctrl *ctrl)
{
return 0;
}
static inline int __init nvme_init_auth(void)
{
return 0;
}
static inline void __exit nvme_exit_auth(void)
{
}
static inline void nvme_auth_stop(struct nvme_ctrl *ctrl) {};
static inline int nvme_auth_negotiate(struct nvme_ctrl *ctrl, int qid)
{
@@ -15,6 +15,7 @@
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/io.h>
#include <linux/kstrtox.h>
#include <linux/memremap.h>
#include <linux/mm.h>
#include <linux/module.h>
@@ -108,7 +109,7 @@ struct nvme_dev;
struct nvme_queue;

static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);
static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode);
static void nvme_delete_io_queues(struct nvme_dev *dev);

/*
 * Represents an NVM Express device. Each nvme_dev is a PCI function.
@@ -130,7 +131,6 @@ struct nvme_dev {
	u32 db_stride;
	void __iomem *bar;
	unsigned long bar_mapped_size;
	struct work_struct remove_work;
	struct mutex shutdown_lock;
	bool subsystem;
	u64 cmb_size;
@@ -158,8 +158,6 @@ struct nvme_dev {
	unsigned int nr_allocated_queues;
	unsigned int nr_write_queues;
	unsigned int nr_poll_queues;

	bool attrs_added;
};

static int io_queue_depth_set(const char *val, const struct kernel_param *kp)
@@ -241,10 +239,13 @@ static inline unsigned int nvme_dbbuf_size(struct nvme_dev *dev)
	return dev->nr_allocated_queues * 8 * dev->db_stride;
}

static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev)
static void nvme_dbbuf_dma_alloc(struct nvme_dev *dev)
{
	unsigned int mem_size = nvme_dbbuf_size(dev);

	if (!(dev->ctrl.oacs & NVME_CTRL_OACS_DBBUF_SUPP))
		return;

	if (dev->dbbuf_dbs) {
		/*
		 * Clear the dbbuf memory so the driver doesn't observe stale
@@ -252,25 +253,27 @@ static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev)
		 */
		memset(dev->dbbuf_dbs, 0, mem_size);
		memset(dev->dbbuf_eis, 0, mem_size);
		return 0;
		return;
	}

	dev->dbbuf_dbs = dma_alloc_coherent(dev->dev, mem_size,
			&dev->dbbuf_dbs_dma_addr,
			GFP_KERNEL);
	if (!dev->dbbuf_dbs)
		return -ENOMEM;
		goto fail;
	dev->dbbuf_eis = dma_alloc_coherent(dev->dev, mem_size,
			&dev->dbbuf_eis_dma_addr,
			GFP_KERNEL);
	if (!dev->dbbuf_eis) {
		dma_free_coherent(dev->dev, mem_size,
			dev->dbbuf_dbs, dev->dbbuf_dbs_dma_addr);
		dev->dbbuf_dbs = NULL;
		return -ENOMEM;
	}
	if (!dev->dbbuf_eis)
		goto fail_free_dbbuf_dbs;
	return;

	return 0;
fail_free_dbbuf_dbs:
	dma_free_coherent(dev->dev, mem_size, dev->dbbuf_dbs,
			dev->dbbuf_dbs_dma_addr);
	dev->dbbuf_dbs = NULL;
fail:
	dev_warn(dev->dev, "unable to allocate dma for dbbuf\n");
}

static void nvme_dbbuf_dma_free(struct nvme_dev *dev)
@@ -392,18 +395,10 @@ static int nvme_pci_npages_sgl(void)
			PAGE_SIZE);
}

static size_t nvme_pci_iod_alloc_size(void)
{
	size_t npages = max(nvme_pci_npages_prp(), nvme_pci_npages_sgl());

	return sizeof(__le64 *) * npages +
		sizeof(struct scatterlist) * NVME_MAX_SEGS;
}

static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
		unsigned int hctx_idx)
{
	struct nvme_dev *dev = data;
	struct nvme_dev *dev = to_nvme_dev(data);
	struct nvme_queue *nvmeq = &dev->queues[0];

	WARN_ON(hctx_idx != 0);
@@ -416,7 +411,7 @@ static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
		unsigned int hctx_idx)
{
	struct nvme_dev *dev = data;
	struct nvme_dev *dev = to_nvme_dev(data);
	struct nvme_queue *nvmeq = &dev->queues[hctx_idx + 1];

	WARN_ON(dev->tagset.tags[hctx_idx] != hctx->tags);
@@ -428,7 +423,7 @@ static int nvme_pci_init_request(struct blk_mq_tag_set *set,
		struct request *req, unsigned int hctx_idx,
		unsigned int numa_node)
{
	struct nvme_dev *dev = set->driver_data;
	struct nvme_dev *dev = to_nvme_dev(set->driver_data);
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);

	nvme_req(req)->ctrl = &dev->ctrl;
@@ -447,7 +442,7 @@ static int queue_irq_offset(struct nvme_dev *dev)

static void nvme_pci_map_queues(struct blk_mq_tag_set *set)
{
	struct nvme_dev *dev = set->driver_data;
	struct nvme_dev *dev = to_nvme_dev(set->driver_data);
	int i, qoff, offset;

	offset = queue_irq_offset(dev);
@@ -914,7 +909,7 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req)
		goto out_unmap_data;
	}

	blk_mq_start_request(req);
	nvme_start_request(req);
	return BLK_STS_OK;
out_unmap_data:
	nvme_unmap_data(dev, req);
@@ -1474,24 +1469,21 @@ static void nvme_free_queues(struct nvme_dev *dev, int lowest)
	}
}

/**
 * nvme_suspend_queue - put queue into suspended state
 * @nvmeq: queue to suspend
 */
static int nvme_suspend_queue(struct nvme_queue *nvmeq)
static void nvme_suspend_queue(struct nvme_dev *dev, unsigned int qid)
{
	struct nvme_queue *nvmeq = &dev->queues[qid];

	if (!test_and_clear_bit(NVMEQ_ENABLED, &nvmeq->flags))
		return 1;
		return;

	/* ensure that nvme_queue_rq() sees NVMEQ_ENABLED cleared */
	mb();

	nvmeq->dev->online_queues--;
	if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q)
		nvme_stop_admin_queue(&nvmeq->dev->ctrl);
		nvme_quiesce_admin_queue(&nvmeq->dev->ctrl);
	if (!test_and_clear_bit(NVMEQ_POLLED, &nvmeq->flags))
		pci_free_irq(to_pci_dev(nvmeq->dev->dev), nvmeq->cq_vector, nvmeq);
	return 0;
		pci_free_irq(to_pci_dev(dev->dev), nvmeq->cq_vector, nvmeq);
}

static void nvme_suspend_io_queues(struct nvme_dev *dev)
@@ -1499,19 +1491,7 @@ static void nvme_suspend_io_queues(struct nvme_dev *dev)
	int i;

	for (i = dev->ctrl.queue_count - 1; i > 0; i--)
		nvme_suspend_queue(&dev->queues[i]);
}

static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown)
{
	struct nvme_queue *nvmeq = &dev->queues[0];

	if (shutdown)
		nvme_shutdown_ctrl(&dev->ctrl);
	else
		nvme_disable_ctrl(&dev->ctrl);

	nvme_poll_irqdisable(nvmeq);
		nvme_suspend_queue(dev, i);
}

/*
@@ -1748,44 +1728,11 @@ static void nvme_dev_remove_admin(struct nvme_dev *dev)
	 * user requests may be waiting on a stopped queue. Start the
	 * queue to flush these to completion.
	 */
		nvme_start_admin_queue(&dev->ctrl);
		blk_mq_destroy_queue(dev->ctrl.admin_q);
		blk_mq_free_tag_set(&dev->admin_tagset);
		nvme_unquiesce_admin_queue(&dev->ctrl);
		nvme_remove_admin_tag_set(&dev->ctrl);
	}
}

static int nvme_pci_alloc_admin_tag_set(struct nvme_dev *dev)
{
	struct blk_mq_tag_set *set = &dev->admin_tagset;

	set->ops = &nvme_mq_admin_ops;
	set->nr_hw_queues = 1;

	set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
	set->timeout = NVME_ADMIN_TIMEOUT;
	set->numa_node = dev->ctrl.numa_node;
	set->cmd_size = sizeof(struct nvme_iod);
	set->flags = BLK_MQ_F_NO_SCHED;
	set->driver_data = dev;

	if (blk_mq_alloc_tag_set(set))
		return -ENOMEM;
	dev->ctrl.admin_tagset = set;

	dev->ctrl.admin_q = blk_mq_init_queue(set);
	if (IS_ERR(dev->ctrl.admin_q)) {
		blk_mq_free_tag_set(set);
		dev->ctrl.admin_q = NULL;
		return -ENOMEM;
	}
	if (!blk_get_queue(dev->ctrl.admin_q)) {
		nvme_dev_remove_admin(dev);
		dev->ctrl.admin_q = NULL;
		return -ENODEV;
	}
	return 0;
}

static unsigned long db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues)
{
	return NVME_REG_DBS + ((nr_io_queues + 1) * 8 * dev->db_stride);
@@ -1829,7 +1776,14 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
	    (readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_NSSRO))
		writel(NVME_CSTS_NSSRO, dev->bar + NVME_REG_CSTS);

	result = nvme_disable_ctrl(&dev->ctrl);
	/*
	 * If the device has been passed off to us in an enabled state, just
	 * clear the enabled bit. The spec says we should set the 'shutdown
	 * notification bits', but doing so may cause the device to complete
	 * commands to the admin queue ... and we don't know what memory that
	 * might be pointing at!
	 */
	result = nvme_disable_ctrl(&dev->ctrl, false);
	if (result < 0)
		return result;

@@ -2112,6 +2066,9 @@ static int nvme_setup_host_mem(struct nvme_dev *dev)
	u32 enable_bits = NVME_HOST_MEM_ENABLE;
	int ret;

	if (!dev->ctrl.hmpre)
		return 0;

	preferred = min(preferred, max);
	if (min > max) {
		dev_warn(dev->ctrl.device,
@@ -2192,7 +2149,7 @@ static ssize_t hmb_store(struct device *dev, struct device_attribute *attr,
	bool new;
	int ret;

	if (strtobool(buf, &new) < 0)
	if (kstrtobool(buf, &new) < 0)
		return -EINVAL;

	if (new == ndev->hmb)
@@ -2240,11 +2197,17 @@ static struct attribute *nvme_pci_attrs[] = {
	NULL,
};

static const struct attribute_group nvme_pci_attr_group = {
static const struct attribute_group nvme_pci_dev_attrs_group = {
	.attrs = nvme_pci_attrs,
	.is_visible = nvme_pci_attrs_are_visible,
};

static const struct attribute_group *nvme_pci_dev_attr_groups[] = {
	&nvme_dev_attrs_group,
	&nvme_pci_dev_attrs_group,
	NULL,
};

/*
 * nirqs is the number of interrupts available for write and read
 * queues. The core already reserved an interrupt for the admin queue.
@@ -2319,12 +2282,6 @@ static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
			PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
}

static void nvme_disable_io_queues(struct nvme_dev *dev)
{
	if (__nvme_disable_io_queues(dev, nvme_admin_delete_sq))
		__nvme_disable_io_queues(dev, nvme_admin_delete_cq);
}

static unsigned int nvme_max_io_queues(struct nvme_dev *dev)
{
	/*
@@ -2432,7 +2389,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)

	if (dev->online_queues - 1 < dev->max_qid) {
		nr_io_queues = dev->online_queues - 1;
		nvme_disable_io_queues(dev);
		nvme_delete_io_queues(dev);
		result = nvme_setup_io_queues_trylock(dev);
		if (result)
			return result;
@@ -2495,7 +2452,7 @@ static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode)
	return 0;
}

static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode)
static bool __nvme_delete_io_queues(struct nvme_dev *dev, u8 opcode)
{
	int nr_queues = dev->online_queues - 1, sent = 0;
	unsigned long timeout;
@@ -2523,40 +2480,19 @@ static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode)
	return true;
}

static void nvme_pci_alloc_tag_set(struct nvme_dev *dev)
static void nvme_delete_io_queues(struct nvme_dev *dev)
{
	struct blk_mq_tag_set * set = &dev->tagset;
	int ret;
	if (__nvme_delete_io_queues(dev, nvme_admin_delete_sq))
		__nvme_delete_io_queues(dev, nvme_admin_delete_cq);
}

	set->ops = &nvme_mq_ops;
	set->nr_hw_queues = dev->online_queues - 1;
	set->nr_maps = 1;
	if (dev->io_queues[HCTX_TYPE_READ])
		set->nr_maps = 2;
static unsigned int nvme_pci_nr_maps(struct nvme_dev *dev)
{
	if (dev->io_queues[HCTX_TYPE_POLL])
		set->nr_maps = 3;
	set->timeout = NVME_IO_TIMEOUT;
	set->numa_node = dev->ctrl.numa_node;
	set->queue_depth = min_t(unsigned, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1;
	set->cmd_size = sizeof(struct nvme_iod);
	set->flags = BLK_MQ_F_SHOULD_MERGE;
	set->driver_data = dev;

	/*
	 * Some Apple controllers requires tags to be unique
	 * across admin and IO queue, so reserve the first 32
	 * tags of the IO queue.
	 */
	if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS)
		set->reserved_tags = NVME_AQ_DEPTH;

	ret = blk_mq_alloc_tag_set(set);
	if (ret) {
		dev_warn(dev->ctrl.device,
			"IO queues tagset allocation failed %d\n", ret);
		return;
	}
	dev->ctrl.tagset = set;
		return 3;
	if (dev->io_queues[HCTX_TYPE_READ])
		return 2;
	return 1;
}

static void nvme_pci_update_nr_queues(struct nvme_dev *dev)
@@ -2647,7 +2583,8 @@ static int nvme_pci_enable(struct nvme_dev *dev)

	pci_enable_pcie_error_reporting(pdev);
	pci_save_state(pdev);
	return 0;

	return nvme_pci_configure_admin_queue(dev);

disable:
	pci_disable_device(pdev);
@@ -2661,57 +2598,53 @@ static void nvme_dev_unmap(struct nvme_dev *dev)
	pci_release_mem_regions(to_pci_dev(dev->dev));
}

static void nvme_pci_disable(struct nvme_dev *dev)
static bool nvme_pci_ctrl_is_dead(struct nvme_dev *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev->dev);
	u32 csts;

	pci_free_irq_vectors(pdev);
	if (!pci_is_enabled(pdev) || !pci_device_is_present(pdev))
		return true;
	if (pdev->error_state != pci_channel_io_normal)
		return true;

	if (pci_is_enabled(pdev)) {
		pci_disable_pcie_error_reporting(pdev);
		pci_disable_device(pdev);
	}
	csts = readl(dev->bar + NVME_REG_CSTS);
	return (csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY);
}

static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
{
	bool dead = true, freeze = false;
	struct pci_dev *pdev = to_pci_dev(dev->dev);
	bool dead;

	mutex_lock(&dev->shutdown_lock);
	if (pci_is_enabled(pdev)) {
		u32 csts;

		if (pci_device_is_present(pdev))
			csts = readl(dev->bar + NVME_REG_CSTS);
		else
			csts = ~0;

		if (dev->ctrl.state == NVME_CTRL_LIVE ||
		    dev->ctrl.state == NVME_CTRL_RESETTING) {
			freeze = true;
	dead = nvme_pci_ctrl_is_dead(dev);
	if (dev->ctrl.state == NVME_CTRL_LIVE ||
	    dev->ctrl.state == NVME_CTRL_RESETTING) {
		if (pci_is_enabled(pdev))
			nvme_start_freeze(&dev->ctrl);
	}
		dead = !!((csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY) ||
			pdev->error_state != pci_channel_io_normal);
	/*
	 * Give the controller a chance to complete all entered requests
	 * if doing a safe shutdown.
	 */
	if (!dead && shutdown)
		nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT);
	}

	/*
	 * Give the controller a chance to complete all entered requests if
	 * doing a safe shutdown.
	 */
	if (!dead && shutdown && freeze)
		nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT);

	nvme_stop_queues(&dev->ctrl);
	nvme_quiesce_io_queues(&dev->ctrl);

	if (!dead && dev->ctrl.queue_count > 0) {
		nvme_disable_io_queues(dev);
		nvme_disable_admin_queue(dev, shutdown);
		nvme_delete_io_queues(dev);
		nvme_disable_ctrl(&dev->ctrl, shutdown);
		nvme_poll_irqdisable(&dev->queues[0]);
	}
	nvme_suspend_io_queues(dev);
	nvme_suspend_queue(&dev->queues[0]);
	nvme_pci_disable(dev);
	nvme_suspend_queue(dev, 0);
	pci_free_irq_vectors(pdev);
	if (pci_is_enabled(pdev)) {
		pci_disable_pcie_error_reporting(pdev);
		pci_disable_device(pdev);
	}
	nvme_reap_pending_cqes(dev);

	nvme_cancel_tagset(&dev->ctrl);
@@ -2723,9 +2656,9 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
	 * deadlocking blk-mq hot-cpu notifier.
	 */
	if (shutdown) {
		nvme_start_queues(&dev->ctrl);
		nvme_unquiesce_io_queues(&dev->ctrl);
		if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q))
			nvme_start_admin_queue(&dev->ctrl);
		nvme_unquiesce_admin_queue(&dev->ctrl);
	}
	mutex_unlock(&dev->shutdown_lock);
}
@@ -2762,42 +2695,40 @@ static void nvme_release_prp_pools(struct nvme_dev *dev)
	dma_pool_destroy(dev->prp_small_pool);
}

static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev)
{
	size_t npages = max(nvme_pci_npages_prp(), nvme_pci_npages_sgl());
	size_t alloc_size = sizeof(__le64 *) * npages +
			sizeof(struct scatterlist) * NVME_MAX_SEGS;

	WARN_ON_ONCE(alloc_size > PAGE_SIZE);
	dev->iod_mempool = mempool_create_node(1,
			mempool_kmalloc, mempool_kfree,
			(void *)alloc_size, GFP_KERNEL,
			dev_to_node(dev->dev));
	if (!dev->iod_mempool)
		return -ENOMEM;
	return 0;
}

static void nvme_free_tagset(struct nvme_dev *dev)
{
	if (dev->tagset.tags)
		blk_mq_free_tag_set(&dev->tagset);
		nvme_remove_io_tag_set(&dev->ctrl);
	dev->ctrl.tagset = NULL;
}

/* pairs with nvme_pci_alloc_dev */
static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
{
	struct nvme_dev *dev = to_nvme_dev(ctrl);

	nvme_dbbuf_dma_free(dev);
	nvme_free_tagset(dev);
	if (dev->ctrl.admin_q)
		blk_put_queue(dev->ctrl.admin_q);
	free_opal_dev(dev->ctrl.opal_dev);
	mempool_destroy(dev->iod_mempool);
	put_device(dev->dev);
	kfree(dev->queues);
	kfree(dev);
}

static void nvme_remove_dead_ctrl(struct nvme_dev *dev)
{
	/*
	 * Set state to deleting now to avoid blocking nvme_wait_reset(), which
	 * may be holding this pci_dev's device lock.
	 */
	nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
	nvme_get_ctrl(&dev->ctrl);
	nvme_dev_disable(dev, false);
	nvme_kill_queues(&dev->ctrl);
	if (!queue_work(nvme_wq, &dev->remove_work))
		nvme_put_ctrl(&dev->ctrl);
}

static void nvme_reset_work(struct work_struct *work)
{
	struct nvme_dev *dev =
@@ -2808,8 +2739,7 @@ static void nvme_reset_work(struct work_struct *work)
	if (dev->ctrl.state != NVME_CTRL_RESETTING) {
		dev_warn(dev->ctrl.device, "ctrl state %d is not RESETTING\n",
			dev->ctrl.state);
		result = -ENODEV;
		goto out;
		return;
	}

	/*
@@ -2824,34 +2754,7 @@ static void nvme_reset_work(struct work_struct *work)
	result = nvme_pci_enable(dev);
	if (result)
		goto out_unlock;

	result = nvme_pci_configure_admin_queue(dev);
	if (result)
		goto out_unlock;

	if (!dev->ctrl.admin_q) {
		result = nvme_pci_alloc_admin_tag_set(dev);
		if (result)
			goto out_unlock;
	} else {
		nvme_start_admin_queue(&dev->ctrl);
	}

	dma_set_min_align_mask(dev->dev, NVME_CTRL_PAGE_SIZE - 1);

	/*
	 * Limit the max command size to prevent iod->sg allocations going
	 * over a single page.
	 */
	dev->ctrl.max_hw_sectors = min_t(u32,
		NVME_MAX_KB_SZ << 1, dma_max_mapping_size(dev->dev) >> 9);
	dev->ctrl.max_segments = NVME_MAX_SEGS;

	/*
	 * Don't limit the IOMMU merged segment size.
	 */
	dma_set_max_seg_size(dev->dev, 0xffffffff);

	nvme_unquiesce_admin_queue(&dev->ctrl);
	mutex_unlock(&dev->shutdown_lock);

	/*
@@ -2865,62 +2768,37 @@ static void nvme_reset_work(struct work_struct *work)
		goto out;
	}

	/*
	 * We do not support an SGL for metadata (yet), so we are limited to a
	 * single integrity segment for the separate metadata pointer.
	 */
	dev->ctrl.max_integrity_segments = 1;

	result = nvme_init_ctrl_finish(&dev->ctrl);
	result = nvme_init_ctrl_finish(&dev->ctrl, was_suspend);
	if (result)
		goto out;

	if (dev->ctrl.oacs & NVME_CTRL_OACS_SEC_SUPP) {
		if (!dev->ctrl.opal_dev)
			dev->ctrl.opal_dev =
				init_opal_dev(&dev->ctrl, &nvme_sec_submit);
		else if (was_suspend)
			opal_unlock_from_suspend(dev->ctrl.opal_dev);
	} else {
		free_opal_dev(dev->ctrl.opal_dev);
		dev->ctrl.opal_dev = NULL;
	}
	nvme_dbbuf_dma_alloc(dev);

	if (dev->ctrl.oacs & NVME_CTRL_OACS_DBBUF_SUPP) {
		result = nvme_dbbuf_dma_alloc(dev);
		if (result)
			dev_warn(dev->dev,
				"unable to allocate dma for dbbuf\n");
	}

	if (dev->ctrl.hmpre) {
		result = nvme_setup_host_mem(dev);
		if (result < 0)
			goto out;
	}
	result = nvme_setup_host_mem(dev);
	if (result < 0)
		goto out;

	result = nvme_setup_io_queues(dev);
	if (result)
		goto out;

	/*
	 * Keep the controller around but remove all namespaces if we don't have
	 * any working I/O queue.
	 * Freeze and update the number of I/O queues as thos might have
	 * changed. If there are no I/O queues left after this reset, keep the
	 * controller around but remove all namespaces.
	 */
	if (dev->online_queues < 2) {
		dev_warn(dev->ctrl.device, "IO queues not created\n");
		nvme_kill_queues(&dev->ctrl);
		nvme_remove_namespaces(&dev->ctrl);
		nvme_free_tagset(dev);
	} else {
		nvme_start_queues(&dev->ctrl);
	if (dev->online_queues > 1) {
		nvme_unquiesce_io_queues(&dev->ctrl);
		nvme_wait_freeze(&dev->ctrl);
		if (!dev->ctrl.tagset)
			nvme_pci_alloc_tag_set(dev);
		else
			nvme_pci_update_nr_queues(dev);
		nvme_pci_update_nr_queues(dev);
		nvme_dbbuf_set(dev);
		nvme_unfreeze(&dev->ctrl);
	} else {
		dev_warn(dev->ctrl.device, "IO queues lost\n");
		nvme_mark_namespaces_dead(&dev->ctrl);
		nvme_unquiesce_io_queues(&dev->ctrl);
		nvme_remove_namespaces(&dev->ctrl);
		nvme_free_tagset(dev);
	}

	/*
@@ -2934,30 +2812,22 @@ static void nvme_reset_work(struct work_struct *work)
		goto out;
	}

	if (!dev->attrs_added && !sysfs_create_group(&dev->ctrl.device->kobj,
			&nvme_pci_attr_group))
		dev->attrs_added = true;

	nvme_start_ctrl(&dev->ctrl);
	return;

out_unlock:
	mutex_unlock(&dev->shutdown_lock);
out:
	if (result)
		dev_warn(dev->ctrl.device,
			"Removing after probe failure status: %d\n", result);
	nvme_remove_dead_ctrl(dev);
}

static void nvme_remove_dead_ctrl_work(struct work_struct *work)
{
	struct nvme_dev *dev = container_of(work, struct nvme_dev, remove_work);
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	if (pci_get_drvdata(pdev))
		device_release_driver(&pdev->dev);
	nvme_put_ctrl(&dev->ctrl);
	/*
	 * Set state to deleting now to avoid blocking nvme_wait_reset(), which
	 * may be holding this pci_dev's device lock.
	 */
	dev_warn(dev->ctrl.device, "Disabling device after reset failure: %d\n",
		result);
	nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
	nvme_dev_disable(dev, true);
	nvme_mark_namespaces_dead(&dev->ctrl);
	nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DEAD);
}

static int nvme_pci_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val)
@@ -3010,6 +2880,7 @@ static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
	.name = "pcie",
	.module = THIS_MODULE,
	.flags = NVME_F_METADATA_SUPPORTED,
	.dev_attr_groups = nvme_pci_dev_attr_groups,
	.reg_read32 = nvme_pci_reg_read32,
	.reg_write32 = nvme_pci_reg_write32,
	.reg_read64 = nvme_pci_reg_read64,
@@ -3079,29 +2950,22 @@ static unsigned long check_vendor_combination_bug(struct pci_dev *pdev)
	return 0;
}

static void nvme_async_probe(void *data, async_cookie_t cookie)
static struct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev,
		const struct pci_device_id *id)
{
	struct nvme_dev *dev = data;

	flush_work(&dev->ctrl.reset_work);
	flush_work(&dev->ctrl.scan_work);
	nvme_put_ctrl(&dev->ctrl);
}

static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	int node, result = -ENOMEM;
	struct nvme_dev *dev;
	unsigned long quirks = id->driver_data;
	size_t alloc_size;
	int node = dev_to_node(&pdev->dev);
	struct nvme_dev *dev;
	int ret = -ENOMEM;

	node = dev_to_node(&pdev->dev);
	if (node == NUMA_NO_NODE)
		set_dev_node(&pdev->dev, first_memory_node);

	dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
	if (!dev)
		return -ENOMEM;
		return NULL;
	INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work);
	mutex_init(&dev->shutdown_lock);

	dev->nr_write_queues = write_queues;
	dev->nr_poll_queues = poll_queues;
@@ -3109,25 +2973,11 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
	dev->queues = kcalloc_node(dev->nr_allocated_queues,
			sizeof(struct nvme_queue), GFP_KERNEL, node);
	if (!dev->queues)
		goto free;
		goto out_free_dev;

	dev->dev = get_device(&pdev->dev);
	pci_set_drvdata(pdev, dev);

	result = nvme_dev_map(dev);
	if (result)
		goto put_pci;

	INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work);
	INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work);
	mutex_init(&dev->shutdown_lock);

	result = nvme_setup_prp_pools(dev);
	if (result)
		goto unmap;

	quirks |= check_vendor_combination_bug(pdev);

	if (!noacpi && acpi_storage_d3(&pdev->dev)) {
		/*
		 * Some systems use a bios work around to ask for D3 on
@@ -3137,46 +2987,131 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
			"platform quirk: setting simple suspend\n");
		quirks |= NVME_QUIRK_SIMPLE_SUSPEND;
	}
	ret = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops,
			quirks);
	if (ret)
		goto out_put_device;

	dma_set_min_align_mask(&pdev->dev, NVME_CTRL_PAGE_SIZE - 1);
	dma_set_max_seg_size(&pdev->dev, 0xffffffff);

	/*
	 * Double check that our mempool alloc size will cover the biggest
	 * command we support.
	 * Limit the max command size to prevent iod->sg allocations going
	 * over a single page.
	 */
	alloc_size = nvme_pci_iod_alloc_size();
	WARN_ON_ONCE(alloc_size > PAGE_SIZE);
	dev->ctrl.max_hw_sectors = min_t(u32,
		NVME_MAX_KB_SZ << 1, dma_max_mapping_size(&pdev->dev) >> 9);
	dev->ctrl.max_segments = NVME_MAX_SEGS;

	dev->iod_mempool = mempool_create_node(1, mempool_kmalloc,
			mempool_kfree,
			(void *) alloc_size,
			GFP_KERNEL, node);
	if (!dev->iod_mempool) {
		result = -ENOMEM;
		goto release_pools;
	}
	/*
	 * There is no support for SGLs for metadata (yet), so we are limited to
	 * a single integrity segment for the separate metadata pointer.
	 */
	dev->ctrl.max_integrity_segments = 1;
	return dev;

	result = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops,
			quirks);
out_put_device:
	put_device(dev->dev);
	kfree(dev->queues);
out_free_dev:
	kfree(dev);
	return ERR_PTR(ret);
}

static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	struct nvme_dev *dev;
	int result = -ENOMEM;

	dev = nvme_pci_alloc_dev(pdev, id);
	if (!dev)
		return -ENOMEM;

	result = nvme_dev_map(dev);
	if (result)
		goto release_mempool;
		goto out_uninit_ctrl;

	result = nvme_setup_prp_pools(dev);
	if (result)
		goto out_dev_unmap;

	result = nvme_pci_alloc_iod_mempool(dev);
	if (result)
		goto out_release_prp_pools;

	dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));

	nvme_reset_ctrl(&dev->ctrl);
	async_schedule(nvme_async_probe, dev);
	result = nvme_pci_enable(dev);
	if (result)
		goto out_release_iod_mempool;

	result = nvme_alloc_admin_tag_set(&dev->ctrl, &dev->admin_tagset,
			&nvme_mq_admin_ops, sizeof(struct nvme_iod));
	if (result)
		goto out_disable;

	/*
	 * Mark the controller as connecting before sending admin commands to
	 * allow the timeout handler to do the right thing.
	 */
	if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_CONNECTING)) {
		dev_warn(dev->ctrl.device,
			"failed to mark controller CONNECTING\n");
		result = -EBUSY;
		goto out_disable;
	}

	result = nvme_init_ctrl_finish(&dev->ctrl, false);
	if (result)
		goto out_disable;

	nvme_dbbuf_dma_alloc(dev);

	result = nvme_setup_host_mem(dev);
	if (result < 0)
		goto out_disable;

	result = nvme_setup_io_queues(dev);
	if (result)
		goto out_disable;

	if (dev->online_queues > 1) {
		nvme_alloc_io_tag_set(&dev->ctrl, &dev->tagset, &nvme_mq_ops,
				nvme_pci_nr_maps(dev), sizeof(struct nvme_iod));
		nvme_dbbuf_set(dev);
	}

	if (!dev->ctrl.tagset)
		dev_warn(dev->ctrl.device, "IO queues not created\n");

	if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) {
		dev_warn(dev->ctrl.device,
			"failed to mark controller live state\n");
		result = -ENODEV;
		goto out_disable;
	}

	pci_set_drvdata(pdev, dev);

	nvme_start_ctrl(&dev->ctrl);
	nvme_put_ctrl(&dev->ctrl);
	return 0;

release_mempool:
out_disable:
	nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
	nvme_dev_disable(dev, true);
	nvme_free_host_mem(dev);
	nvme_dev_remove_admin(dev);
	nvme_dbbuf_dma_free(dev);
	nvme_free_queues(dev, 0);
out_release_iod_mempool:
	mempool_destroy(dev->iod_mempool);
release_pools:
out_release_prp_pools:
	nvme_release_prp_pools(dev);
unmap:
out_dev_unmap:
	nvme_dev_unmap(dev);
put_pci:
	put_device(dev->dev);
free:
	kfree(dev->queues);
	kfree(dev);
out_uninit_ctrl:
	nvme_uninit_ctrl(&dev->ctrl);
	return result;
}

@@ -3208,13 +3143,6 @@ static void nvme_shutdown(struct pci_dev *pdev)
	nvme_disable_prepare_reset(dev, true);
}

static void nvme_remove_attrs(struct nvme_dev *dev)
{
	if (dev->attrs_added)
		sysfs_remove_group(&dev->ctrl.device->kobj,
				&nvme_pci_attr_group);
}

/*
 * The driver's remove may be called on a device in a partially initialized
 * state. This function must not have any dependencies on the device state in
@@ -3236,10 +3164,11 @@ static void nvme_remove(struct pci_dev *pdev)
	nvme_stop_ctrl(&dev->ctrl);
	nvme_remove_namespaces(&dev->ctrl);
	nvme_dev_disable(dev, true);
	nvme_remove_attrs(dev);
	nvme_free_host_mem(dev);
	nvme_dev_remove_admin(dev);
	nvme_dbbuf_dma_free(dev);
	nvme_free_queues(dev, 0);
	mempool_destroy(dev->iod_mempool);
	nvme_release_prp_pools(dev);
	nvme_dev_unmap(dev);
	nvme_uninit_ctrl(&dev->ctrl);
@@ -3576,11 +3505,12 @@ static struct pci_driver nvme_driver = {
	.probe = nvme_probe,
	.remove = nvme_remove,
	.shutdown = nvme_shutdown,
#ifdef CONFIG_PM_SLEEP
	.driver = {
		.pm = &nvme_dev_pm_ops,
	},
		.probe_type = PROBE_PREFER_ASYNCHRONOUS,
#ifdef CONFIG_PM_SLEEP
		.pm = &nvme_dev_pm_ops,
#endif
	},
	.sriov_configure = pci_sriov_configure_simple,
	.err_handler = &nvme_err_handler,
};
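
The hunks above repeatedly swap the driver's old per-queue start/stop helpers for the renamed quiesce/unquiesce helpers declared in the nvme.h portion of this diff. As a reading aid only (this sketch is not part of the merged diff, and the surrounding teardown logic is simplified), the mapping looks roughly like this in a controller shutdown path:

/*
 * Illustrative sketch only -- not from the merged diff.  It pairs each old
 * helper name with the renamed one used throughout the hunks above, assuming
 * a struct nvme_ctrl *ctrl that is being torn down.
 */
static void example_ctrl_teardown(struct nvme_ctrl *ctrl)
{
	nvme_quiesce_admin_queue(ctrl);		/* was: nvme_stop_admin_queue() */
	nvme_quiesce_io_queues(ctrl);		/* was: nvme_stop_queues() */

	/*
	 * was: nvme_kill_queues(); in the hunks above the namespaces are
	 * marked dead and the queues are unquiesced so pending requests
	 * can fail fast instead.
	 */
	nvme_mark_namespaces_dead(ctrl);

	nvme_unquiesce_io_queues(ctrl);		/* was: nvme_start_queues() */
	nvme_unquiesce_admin_queue(ctrl);	/* was: nvme_start_admin_queue() */
}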