7437bb73f0
When zones were first added to the SCSI and ATA specs, two different models were supported (in addition to the drive managed one that is invisible to the host): - host managed, where for non-conventional zones there is a strict requirement to write at the write pointer, or else an error is returned - host aware, where a write pointer is maintained if writes always happen at it, otherwise it is left in an under-defined state and the sequential write preferred zones behave like conventional zones (probably very badly performing ones, though) Not surprisingly this lukewarm model didn't prove to be very useful and was finally removed from the ZBC and SBC specs (NVMe never implemented it). Due to the easily disappearing write pointer, host software could never rely on the write pointer to actually be useful for, say, recovery. Fortunately only a few HDD prototypes shipped using this model, and they never made it to mass production. Drop the support before it is too late. Note that any such host aware prototype HDD can still be used with Linux as we'll now treat it as a conventional HDD. Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com> Link: https://lore.kernel.org/r/20231217165359.604246-4-hch@lst.de Signed-off-by: Jens Axboe <axboe@kernel.dk>
1161 lines
28 KiB
C
1161 lines
28 KiB
C
// SPDX-License-Identifier: GPL-2.0-only
|
|
/*
|
|
* Copyright (C) 2017 Western Digital Corporation or its affiliates.
|
|
*
|
|
* This file is released under the GPL.
|
|
*/
|
|
|
|
#include "dm-zoned.h"
|
|
|
|
#include <linux/module.h>
|
|
|
|
/* Message prefix for this target (presumably consumed by the DM* logging macros - confirm) */
#define DM_MSG_PREFIX "zoned"
|
|
|
|
/* Pool size passed to bioset_init() for the clone BIO set created in dmz_ctr() */
#define DMZ_MIN_BIOS 8192
|
|
|
|
/*
|
|
* Zone BIO context.
|
|
*/
|
|
struct dmz_bioctx {
|
|
struct dmz_dev *dev;
|
|
struct dm_zone *zone;
|
|
struct bio *bio;
|
|
refcount_t ref;
|
|
};
|
|
|
|
/*
|
|
* Chunk work descriptor.
|
|
*/
|
|
struct dm_chunk_work {
|
|
struct work_struct work;
|
|
refcount_t refcount;
|
|
struct dmz_target *target;
|
|
unsigned int chunk;
|
|
struct bio_list bio_list;
|
|
};
|
|
|
|
/*
|
|
* Target descriptor.
|
|
*/
|
|
struct dmz_target {
|
|
struct dm_dev **ddev;
|
|
unsigned int nr_ddevs;
|
|
|
|
unsigned int flags;
|
|
|
|
/* Zoned block device information */
|
|
struct dmz_dev *dev;
|
|
|
|
/* For metadata handling */
|
|
struct dmz_metadata *metadata;
|
|
|
|
/* For chunk work */
|
|
struct radix_tree_root chunk_rxtree;
|
|
struct workqueue_struct *chunk_wq;
|
|
struct mutex chunk_lock;
|
|
|
|
/* For cloned BIOs to zones */
|
|
struct bio_set bio_set;
|
|
|
|
/* For flush */
|
|
spinlock_t flush_lock;
|
|
struct bio_list flush_list;
|
|
struct delayed_work flush_work;
|
|
struct workqueue_struct *flush_wq;
|
|
};
|
|
|
|
/*
 * Delay between two runs of the periodic flush work: 10 seconds,
 * expressed in jiffies as expected by the delayed work API.
 */
#define DMZ_FLUSH_PERIOD	(10 * HZ)
|
|
|
|
/*
|
|
* Target BIO completion.
|
|
*/
|
|
static inline void dmz_bio_endio(struct bio *bio, blk_status_t status)
|
|
{
|
|
struct dmz_bioctx *bioctx =
|
|
dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
|
|
|
|
if (status != BLK_STS_OK && bio->bi_status == BLK_STS_OK)
|
|
bio->bi_status = status;
|
|
if (bioctx->dev && bio->bi_status != BLK_STS_OK)
|
|
bioctx->dev->flags |= DMZ_CHECK_BDEV;
|
|
|
|
if (refcount_dec_and_test(&bioctx->ref)) {
|
|
struct dm_zone *zone = bioctx->zone;
|
|
|
|
if (zone) {
|
|
if (bio->bi_status != BLK_STS_OK &&
|
|
bio_op(bio) == REQ_OP_WRITE &&
|
|
dmz_is_seq(zone))
|
|
set_bit(DMZ_SEQ_WRITE_ERR, &zone->flags);
|
|
dmz_deactivate_zone(zone);
|
|
}
|
|
bio_endio(bio);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Completion callback for an internally cloned target BIO. This terminates the
|
|
* target BIO when there are no more references to its context.
|
|
*/
|
|
static void dmz_clone_endio(struct bio *clone)
|
|
{
|
|
struct dmz_bioctx *bioctx = clone->bi_private;
|
|
blk_status_t status = clone->bi_status;
|
|
|
|
bio_put(clone);
|
|
dmz_bio_endio(bioctx->bio, status);
|
|
}
|
|
|
|
/*
|
|
* Issue a clone of a target BIO. The clone may only partially process the
|
|
* original target BIO.
|
|
*/
|
|
static int dmz_submit_bio(struct dmz_target *dmz, struct dm_zone *zone,
|
|
struct bio *bio, sector_t chunk_block,
|
|
unsigned int nr_blocks)
|
|
{
|
|
struct dmz_bioctx *bioctx =
|
|
dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
|
|
struct dmz_dev *dev = zone->dev;
|
|
struct bio *clone;
|
|
|
|
if (dev->flags & DMZ_BDEV_DYING)
|
|
return -EIO;
|
|
|
|
clone = bio_alloc_clone(dev->bdev, bio, GFP_NOIO, &dmz->bio_set);
|
|
if (!clone)
|
|
return -ENOMEM;
|
|
|
|
bioctx->dev = dev;
|
|
clone->bi_iter.bi_sector =
|
|
dmz_start_sect(dmz->metadata, zone) + dmz_blk2sect(chunk_block);
|
|
clone->bi_iter.bi_size = dmz_blk2sect(nr_blocks) << SECTOR_SHIFT;
|
|
clone->bi_end_io = dmz_clone_endio;
|
|
clone->bi_private = bioctx;
|
|
|
|
bio_advance(bio, clone->bi_iter.bi_size);
|
|
|
|
refcount_inc(&bioctx->ref);
|
|
submit_bio_noacct(clone);
|
|
|
|
if (bio_op(bio) == REQ_OP_WRITE && dmz_is_seq(zone))
|
|
zone->wp_block += nr_blocks;
|
|
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* Zero out pages of discarded blocks accessed by a read BIO.
|
|
*/
|
|
static void dmz_handle_read_zero(struct dmz_target *dmz, struct bio *bio,
|
|
sector_t chunk_block, unsigned int nr_blocks)
|
|
{
|
|
unsigned int size = nr_blocks << DMZ_BLOCK_SHIFT;
|
|
|
|
/* Clear nr_blocks */
|
|
swap(bio->bi_iter.bi_size, size);
|
|
zero_fill_bio(bio);
|
|
swap(bio->bi_iter.bi_size, size);
|
|
|
|
bio_advance(bio, size);
|
|
}
|
|
|
|
/*
|
|
* Process a read BIO.
|
|
*/
|
|
static int dmz_handle_read(struct dmz_target *dmz, struct dm_zone *zone,
|
|
struct bio *bio)
|
|
{
|
|
struct dmz_metadata *zmd = dmz->metadata;
|
|
sector_t chunk_block = dmz_chunk_block(zmd, dmz_bio_block(bio));
|
|
unsigned int nr_blocks = dmz_bio_blocks(bio);
|
|
sector_t end_block = chunk_block + nr_blocks;
|
|
struct dm_zone *rzone, *bzone;
|
|
int ret;
|
|
|
|
/* Read into unmapped chunks need only zeroing the BIO buffer */
|
|
if (!zone) {
|
|
zero_fill_bio(bio);
|
|
return 0;
|
|
}
|
|
|
|
DMDEBUG("(%s): READ chunk %llu -> %s zone %u, block %llu, %u blocks",
|
|
dmz_metadata_label(zmd),
|
|
(unsigned long long)dmz_bio_chunk(zmd, bio),
|
|
(dmz_is_rnd(zone) ? "RND" :
|
|
(dmz_is_cache(zone) ? "CACHE" : "SEQ")),
|
|
zone->id,
|
|
(unsigned long long)chunk_block, nr_blocks);
|
|
|
|
/* Check block validity to determine the read location */
|
|
bzone = zone->bzone;
|
|
while (chunk_block < end_block) {
|
|
nr_blocks = 0;
|
|
if (dmz_is_rnd(zone) || dmz_is_cache(zone) ||
|
|
chunk_block < zone->wp_block) {
|
|
/* Test block validity in the data zone */
|
|
ret = dmz_block_valid(zmd, zone, chunk_block);
|
|
if (ret < 0)
|
|
return ret;
|
|
if (ret > 0) {
|
|
/* Read data zone blocks */
|
|
nr_blocks = ret;
|
|
rzone = zone;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* No valid blocks found in the data zone.
|
|
* Check the buffer zone, if there is one.
|
|
*/
|
|
if (!nr_blocks && bzone) {
|
|
ret = dmz_block_valid(zmd, bzone, chunk_block);
|
|
if (ret < 0)
|
|
return ret;
|
|
if (ret > 0) {
|
|
/* Read buffer zone blocks */
|
|
nr_blocks = ret;
|
|
rzone = bzone;
|
|
}
|
|
}
|
|
|
|
if (nr_blocks) {
|
|
/* Valid blocks found: read them */
|
|
nr_blocks = min_t(unsigned int, nr_blocks,
|
|
end_block - chunk_block);
|
|
ret = dmz_submit_bio(dmz, rzone, bio,
|
|
chunk_block, nr_blocks);
|
|
if (ret)
|
|
return ret;
|
|
chunk_block += nr_blocks;
|
|
} else {
|
|
/* No valid block: zeroout the current BIO block */
|
|
dmz_handle_read_zero(dmz, bio, chunk_block, 1);
|
|
chunk_block++;
|
|
}
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* Write blocks directly in a data zone, at the write pointer.
|
|
* If a buffer zone is assigned, invalidate the blocks written
|
|
* in place.
|
|
*/
|
|
static int dmz_handle_direct_write(struct dmz_target *dmz,
|
|
struct dm_zone *zone, struct bio *bio,
|
|
sector_t chunk_block,
|
|
unsigned int nr_blocks)
|
|
{
|
|
struct dmz_metadata *zmd = dmz->metadata;
|
|
struct dm_zone *bzone = zone->bzone;
|
|
int ret;
|
|
|
|
if (dmz_is_readonly(zone))
|
|
return -EROFS;
|
|
|
|
/* Submit write */
|
|
ret = dmz_submit_bio(dmz, zone, bio, chunk_block, nr_blocks);
|
|
if (ret)
|
|
return ret;
|
|
|
|
/*
|
|
* Validate the blocks in the data zone and invalidate
|
|
* in the buffer zone, if there is one.
|
|
*/
|
|
ret = dmz_validate_blocks(zmd, zone, chunk_block, nr_blocks);
|
|
if (ret == 0 && bzone)
|
|
ret = dmz_invalidate_blocks(zmd, bzone, chunk_block, nr_blocks);
|
|
|
|
return ret;
|
|
}
|
|
|
|
/*
|
|
* Write blocks in the buffer zone of @zone.
|
|
* If no buffer zone is assigned yet, get one.
|
|
* Called with @zone write locked.
|
|
*/
|
|
static int dmz_handle_buffered_write(struct dmz_target *dmz,
|
|
struct dm_zone *zone, struct bio *bio,
|
|
sector_t chunk_block,
|
|
unsigned int nr_blocks)
|
|
{
|
|
struct dmz_metadata *zmd = dmz->metadata;
|
|
struct dm_zone *bzone;
|
|
int ret;
|
|
|
|
/* Get the buffer zone. One will be allocated if needed */
|
|
bzone = dmz_get_chunk_buffer(zmd, zone);
|
|
if (IS_ERR(bzone))
|
|
return PTR_ERR(bzone);
|
|
|
|
if (dmz_is_readonly(bzone))
|
|
return -EROFS;
|
|
|
|
/* Submit write */
|
|
ret = dmz_submit_bio(dmz, bzone, bio, chunk_block, nr_blocks);
|
|
if (ret)
|
|
return ret;
|
|
|
|
/*
|
|
* Validate the blocks in the buffer zone
|
|
* and invalidate in the data zone.
|
|
*/
|
|
ret = dmz_validate_blocks(zmd, bzone, chunk_block, nr_blocks);
|
|
if (ret == 0 && chunk_block < zone->wp_block)
|
|
ret = dmz_invalidate_blocks(zmd, zone, chunk_block, nr_blocks);
|
|
|
|
return ret;
|
|
}
|
|
|
|
/*
|
|
* Process a write BIO.
|
|
*/
|
|
static int dmz_handle_write(struct dmz_target *dmz, struct dm_zone *zone,
|
|
struct bio *bio)
|
|
{
|
|
struct dmz_metadata *zmd = dmz->metadata;
|
|
sector_t chunk_block = dmz_chunk_block(zmd, dmz_bio_block(bio));
|
|
unsigned int nr_blocks = dmz_bio_blocks(bio);
|
|
|
|
if (!zone)
|
|
return -ENOSPC;
|
|
|
|
DMDEBUG("(%s): WRITE chunk %llu -> %s zone %u, block %llu, %u blocks",
|
|
dmz_metadata_label(zmd),
|
|
(unsigned long long)dmz_bio_chunk(zmd, bio),
|
|
(dmz_is_rnd(zone) ? "RND" :
|
|
(dmz_is_cache(zone) ? "CACHE" : "SEQ")),
|
|
zone->id,
|
|
(unsigned long long)chunk_block, nr_blocks);
|
|
|
|
if (dmz_is_rnd(zone) || dmz_is_cache(zone) ||
|
|
chunk_block == zone->wp_block) {
|
|
/*
|
|
* zone is a random zone or it is a sequential zone
|
|
* and the BIO is aligned to the zone write pointer:
|
|
* direct write the zone.
|
|
*/
|
|
return dmz_handle_direct_write(dmz, zone, bio,
|
|
chunk_block, nr_blocks);
|
|
}
|
|
|
|
/*
|
|
* This is an unaligned write in a sequential zone:
|
|
* use buffered write.
|
|
*/
|
|
return dmz_handle_buffered_write(dmz, zone, bio, chunk_block, nr_blocks);
|
|
}
|
|
|
|
/*
|
|
* Process a discard BIO.
|
|
*/
|
|
static int dmz_handle_discard(struct dmz_target *dmz, struct dm_zone *zone,
|
|
struct bio *bio)
|
|
{
|
|
struct dmz_metadata *zmd = dmz->metadata;
|
|
sector_t block = dmz_bio_block(bio);
|
|
unsigned int nr_blocks = dmz_bio_blocks(bio);
|
|
sector_t chunk_block = dmz_chunk_block(zmd, block);
|
|
int ret = 0;
|
|
|
|
/* For unmapped chunks, there is nothing to do */
|
|
if (!zone)
|
|
return 0;
|
|
|
|
if (dmz_is_readonly(zone))
|
|
return -EROFS;
|
|
|
|
DMDEBUG("(%s): DISCARD chunk %llu -> zone %u, block %llu, %u blocks",
|
|
dmz_metadata_label(dmz->metadata),
|
|
(unsigned long long)dmz_bio_chunk(zmd, bio),
|
|
zone->id,
|
|
(unsigned long long)chunk_block, nr_blocks);
|
|
|
|
/*
|
|
* Invalidate blocks in the data zone and its
|
|
* buffer zone if one is mapped.
|
|
*/
|
|
if (dmz_is_rnd(zone) || dmz_is_cache(zone) ||
|
|
chunk_block < zone->wp_block)
|
|
ret = dmz_invalidate_blocks(zmd, zone, chunk_block, nr_blocks);
|
|
if (ret == 0 && zone->bzone)
|
|
ret = dmz_invalidate_blocks(zmd, zone->bzone,
|
|
chunk_block, nr_blocks);
|
|
return ret;
|
|
}
|
|
|
|
/*
|
|
* Process a BIO.
|
|
*/
|
|
static void dmz_handle_bio(struct dmz_target *dmz, struct dm_chunk_work *cw,
|
|
struct bio *bio)
|
|
{
|
|
struct dmz_bioctx *bioctx =
|
|
dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
|
|
struct dmz_metadata *zmd = dmz->metadata;
|
|
struct dm_zone *zone;
|
|
int ret;
|
|
|
|
dmz_lock_metadata(zmd);
|
|
|
|
/*
|
|
* Get the data zone mapping the chunk. There may be no
|
|
* mapping for read and discard. If a mapping is obtained,
|
|
+ the zone returned will be set to active state.
|
|
*/
|
|
zone = dmz_get_chunk_mapping(zmd, dmz_bio_chunk(zmd, bio),
|
|
bio_op(bio));
|
|
if (IS_ERR(zone)) {
|
|
ret = PTR_ERR(zone);
|
|
goto out;
|
|
}
|
|
|
|
/* Process the BIO */
|
|
if (zone) {
|
|
dmz_activate_zone(zone);
|
|
bioctx->zone = zone;
|
|
dmz_reclaim_bio_acc(zone->dev->reclaim);
|
|
}
|
|
|
|
switch (bio_op(bio)) {
|
|
case REQ_OP_READ:
|
|
ret = dmz_handle_read(dmz, zone, bio);
|
|
break;
|
|
case REQ_OP_WRITE:
|
|
ret = dmz_handle_write(dmz, zone, bio);
|
|
break;
|
|
case REQ_OP_DISCARD:
|
|
case REQ_OP_WRITE_ZEROES:
|
|
ret = dmz_handle_discard(dmz, zone, bio);
|
|
break;
|
|
default:
|
|
DMERR("(%s): Unsupported BIO operation 0x%x",
|
|
dmz_metadata_label(dmz->metadata), bio_op(bio));
|
|
ret = -EIO;
|
|
}
|
|
|
|
/*
|
|
* Release the chunk mapping. This will check that the mapping
|
|
* is still valid, that is, that the zone used still has valid blocks.
|
|
*/
|
|
if (zone)
|
|
dmz_put_chunk_mapping(zmd, zone);
|
|
out:
|
|
dmz_bio_endio(bio, errno_to_blk_status(ret));
|
|
|
|
dmz_unlock_metadata(zmd);
|
|
}
|
|
|
|
/*
|
|
* Increment a chunk reference counter.
|
|
*/
|
|
static inline void dmz_get_chunk_work(struct dm_chunk_work *cw)
|
|
{
|
|
refcount_inc(&cw->refcount);
|
|
}
|
|
|
|
/*
|
|
* Decrement a chunk work reference count and
|
|
* free it if it becomes 0.
|
|
*/
|
|
static void dmz_put_chunk_work(struct dm_chunk_work *cw)
|
|
{
|
|
if (refcount_dec_and_test(&cw->refcount)) {
|
|
WARN_ON(!bio_list_empty(&cw->bio_list));
|
|
radix_tree_delete(&cw->target->chunk_rxtree, cw->chunk);
|
|
kfree(cw);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Chunk BIO work function.
|
|
*/
|
|
static void dmz_chunk_work(struct work_struct *work)
|
|
{
|
|
struct dm_chunk_work *cw = container_of(work, struct dm_chunk_work, work);
|
|
struct dmz_target *dmz = cw->target;
|
|
struct bio *bio;
|
|
|
|
mutex_lock(&dmz->chunk_lock);
|
|
|
|
/* Process the chunk BIOs */
|
|
while ((bio = bio_list_pop(&cw->bio_list))) {
|
|
mutex_unlock(&dmz->chunk_lock);
|
|
dmz_handle_bio(dmz, cw, bio);
|
|
mutex_lock(&dmz->chunk_lock);
|
|
dmz_put_chunk_work(cw);
|
|
}
|
|
|
|
/* Queueing the work incremented the work refcount */
|
|
dmz_put_chunk_work(cw);
|
|
|
|
mutex_unlock(&dmz->chunk_lock);
|
|
}
|
|
|
|
/*
|
|
* Flush work.
|
|
*/
|
|
static void dmz_flush_work(struct work_struct *work)
|
|
{
|
|
struct dmz_target *dmz = container_of(work, struct dmz_target, flush_work.work);
|
|
struct bio *bio;
|
|
int ret;
|
|
|
|
/* Flush dirty metadata blocks */
|
|
ret = dmz_flush_metadata(dmz->metadata);
|
|
if (ret)
|
|
DMDEBUG("(%s): Metadata flush failed, rc=%d",
|
|
dmz_metadata_label(dmz->metadata), ret);
|
|
|
|
/* Process queued flush requests */
|
|
while (1) {
|
|
spin_lock(&dmz->flush_lock);
|
|
bio = bio_list_pop(&dmz->flush_list);
|
|
spin_unlock(&dmz->flush_lock);
|
|
|
|
if (!bio)
|
|
break;
|
|
|
|
dmz_bio_endio(bio, errno_to_blk_status(ret));
|
|
}
|
|
|
|
queue_delayed_work(dmz->flush_wq, &dmz->flush_work, DMZ_FLUSH_PERIOD);
|
|
}
|
|
|
|
/*
|
|
* Get a chunk work and start it to process a new BIO.
|
|
* If the BIO chunk has no work yet, create one.
|
|
*/
|
|
static int dmz_queue_chunk_work(struct dmz_target *dmz, struct bio *bio)
|
|
{
|
|
unsigned int chunk = dmz_bio_chunk(dmz->metadata, bio);
|
|
struct dm_chunk_work *cw;
|
|
int ret = 0;
|
|
|
|
mutex_lock(&dmz->chunk_lock);
|
|
|
|
/* Get the BIO chunk work. If one is not active yet, create one */
|
|
cw = radix_tree_lookup(&dmz->chunk_rxtree, chunk);
|
|
if (cw) {
|
|
dmz_get_chunk_work(cw);
|
|
} else {
|
|
/* Create a new chunk work */
|
|
cw = kmalloc(sizeof(struct dm_chunk_work), GFP_NOIO);
|
|
if (unlikely(!cw)) {
|
|
ret = -ENOMEM;
|
|
goto out;
|
|
}
|
|
|
|
INIT_WORK(&cw->work, dmz_chunk_work);
|
|
refcount_set(&cw->refcount, 1);
|
|
cw->target = dmz;
|
|
cw->chunk = chunk;
|
|
bio_list_init(&cw->bio_list);
|
|
|
|
ret = radix_tree_insert(&dmz->chunk_rxtree, chunk, cw);
|
|
if (unlikely(ret)) {
|
|
kfree(cw);
|
|
goto out;
|
|
}
|
|
}
|
|
|
|
bio_list_add(&cw->bio_list, bio);
|
|
|
|
if (queue_work(dmz->chunk_wq, &cw->work))
|
|
dmz_get_chunk_work(cw);
|
|
out:
|
|
mutex_unlock(&dmz->chunk_lock);
|
|
return ret;
|
|
}
|
|
|
|
/*
|
|
* Check if the backing device is being removed. If it's on the way out,
|
|
* start failing I/O. Reclaim and metadata components also call this
|
|
* function to cleanly abort operation in the event of such failure.
|
|
*/
|
|
bool dmz_bdev_is_dying(struct dmz_dev *dmz_dev)
|
|
{
|
|
if (dmz_dev->flags & DMZ_BDEV_DYING)
|
|
return true;
|
|
|
|
if (dmz_dev->flags & DMZ_CHECK_BDEV)
|
|
return !dmz_check_bdev(dmz_dev);
|
|
|
|
if (blk_queue_dying(bdev_get_queue(dmz_dev->bdev))) {
|
|
dmz_dev_warn(dmz_dev, "Backing device queue dying");
|
|
dmz_dev->flags |= DMZ_BDEV_DYING;
|
|
}
|
|
|
|
return dmz_dev->flags & DMZ_BDEV_DYING;
|
|
}
|
|
|
|
/*
|
|
* Check the backing device availability. This detects such events as
|
|
* backing device going offline due to errors, media removals, etc.
|
|
* This check is less efficient than dmz_bdev_is_dying() and should
|
|
* only be performed as a part of error handling.
|
|
*/
|
|
bool dmz_check_bdev(struct dmz_dev *dmz_dev)
|
|
{
|
|
struct gendisk *disk;
|
|
|
|
dmz_dev->flags &= ~DMZ_CHECK_BDEV;
|
|
|
|
if (dmz_bdev_is_dying(dmz_dev))
|
|
return false;
|
|
|
|
disk = dmz_dev->bdev->bd_disk;
|
|
if (disk->fops->check_events &&
|
|
disk->fops->check_events(disk, 0) & DISK_EVENT_MEDIA_CHANGE) {
|
|
dmz_dev_warn(dmz_dev, "Backing device offline");
|
|
dmz_dev->flags |= DMZ_BDEV_DYING;
|
|
}
|
|
|
|
return !(dmz_dev->flags & DMZ_BDEV_DYING);
|
|
}
|
|
|
|
/*
|
|
* Process a new BIO.
|
|
*/
|
|
static int dmz_map(struct dm_target *ti, struct bio *bio)
|
|
{
|
|
struct dmz_target *dmz = ti->private;
|
|
struct dmz_metadata *zmd = dmz->metadata;
|
|
struct dmz_bioctx *bioctx = dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
|
|
sector_t sector = bio->bi_iter.bi_sector;
|
|
unsigned int nr_sectors = bio_sectors(bio);
|
|
sector_t chunk_sector;
|
|
int ret;
|
|
|
|
if (dmz_dev_is_dying(zmd))
|
|
return DM_MAPIO_KILL;
|
|
|
|
DMDEBUG("(%s): BIO op %d sector %llu + %u => chunk %llu, block %llu, %u blocks",
|
|
dmz_metadata_label(zmd),
|
|
bio_op(bio), (unsigned long long)sector, nr_sectors,
|
|
(unsigned long long)dmz_bio_chunk(zmd, bio),
|
|
(unsigned long long)dmz_chunk_block(zmd, dmz_bio_block(bio)),
|
|
(unsigned int)dmz_bio_blocks(bio));
|
|
|
|
if (!nr_sectors && bio_op(bio) != REQ_OP_WRITE)
|
|
return DM_MAPIO_REMAPPED;
|
|
|
|
/* The BIO should be block aligned */
|
|
if ((nr_sectors & DMZ_BLOCK_SECTORS_MASK) || (sector & DMZ_BLOCK_SECTORS_MASK))
|
|
return DM_MAPIO_KILL;
|
|
|
|
/* Initialize the BIO context */
|
|
bioctx->dev = NULL;
|
|
bioctx->zone = NULL;
|
|
bioctx->bio = bio;
|
|
refcount_set(&bioctx->ref, 1);
|
|
|
|
/* Set the BIO pending in the flush list */
|
|
if (!nr_sectors && bio_op(bio) == REQ_OP_WRITE) {
|
|
spin_lock(&dmz->flush_lock);
|
|
bio_list_add(&dmz->flush_list, bio);
|
|
spin_unlock(&dmz->flush_lock);
|
|
mod_delayed_work(dmz->flush_wq, &dmz->flush_work, 0);
|
|
return DM_MAPIO_SUBMITTED;
|
|
}
|
|
|
|
/* Split zone BIOs to fit entirely into a zone */
|
|
chunk_sector = sector & (dmz_zone_nr_sectors(zmd) - 1);
|
|
if (chunk_sector + nr_sectors > dmz_zone_nr_sectors(zmd))
|
|
dm_accept_partial_bio(bio, dmz_zone_nr_sectors(zmd) - chunk_sector);
|
|
|
|
/* Now ready to handle this BIO */
|
|
ret = dmz_queue_chunk_work(dmz, bio);
|
|
if (ret) {
|
|
DMDEBUG("(%s): BIO op %d, can't process chunk %llu, err %i",
|
|
dmz_metadata_label(zmd),
|
|
bio_op(bio), (u64)dmz_bio_chunk(zmd, bio),
|
|
ret);
|
|
return DM_MAPIO_REQUEUE;
|
|
}
|
|
|
|
return DM_MAPIO_SUBMITTED;
|
|
}
|
|
|
|
/*
|
|
* Get zoned device information.
|
|
*/
|
|
static int dmz_get_zoned_device(struct dm_target *ti, char *path,
|
|
int idx, int nr_devs)
|
|
{
|
|
struct dmz_target *dmz = ti->private;
|
|
struct dm_dev *ddev;
|
|
struct dmz_dev *dev;
|
|
int ret;
|
|
struct block_device *bdev;
|
|
|
|
/* Get the target device */
|
|
ret = dm_get_device(ti, path, dm_table_get_mode(ti->table), &ddev);
|
|
if (ret) {
|
|
ti->error = "Get target device failed";
|
|
return ret;
|
|
}
|
|
|
|
bdev = ddev->bdev;
|
|
if (!bdev_is_zoned(bdev)) {
|
|
if (nr_devs == 1) {
|
|
ti->error = "Invalid regular device";
|
|
goto err;
|
|
}
|
|
if (idx != 0) {
|
|
ti->error = "First device must be a regular device";
|
|
goto err;
|
|
}
|
|
if (dmz->ddev[0]) {
|
|
ti->error = "Too many regular devices";
|
|
goto err;
|
|
}
|
|
dev = &dmz->dev[idx];
|
|
dev->flags = DMZ_BDEV_REGULAR;
|
|
} else {
|
|
if (dmz->ddev[idx]) {
|
|
ti->error = "Too many zoned devices";
|
|
goto err;
|
|
}
|
|
if (nr_devs > 1 && idx == 0) {
|
|
ti->error = "First device must be a regular device";
|
|
goto err;
|
|
}
|
|
dev = &dmz->dev[idx];
|
|
}
|
|
dev->bdev = bdev;
|
|
dev->dev_idx = idx;
|
|
|
|
dev->capacity = bdev_nr_sectors(bdev);
|
|
if (ti->begin) {
|
|
ti->error = "Partial mapping is not supported";
|
|
goto err;
|
|
}
|
|
|
|
dmz->ddev[idx] = ddev;
|
|
|
|
return 0;
|
|
err:
|
|
dm_put_device(ti, ddev);
|
|
return -EINVAL;
|
|
}
|
|
|
|
/*
|
|
* Cleanup zoned device information.
|
|
*/
|
|
static void dmz_put_zoned_devices(struct dm_target *ti)
|
|
{
|
|
struct dmz_target *dmz = ti->private;
|
|
int i;
|
|
|
|
for (i = 0; i < dmz->nr_ddevs; i++)
|
|
if (dmz->ddev[i])
|
|
dm_put_device(ti, dmz->ddev[i]);
|
|
|
|
kfree(dmz->ddev);
|
|
}
|
|
|
|
static int dmz_fixup_devices(struct dm_target *ti)
|
|
{
|
|
struct dmz_target *dmz = ti->private;
|
|
struct dmz_dev *reg_dev = NULL;
|
|
sector_t zone_nr_sectors = 0;
|
|
int i;
|
|
|
|
/*
|
|
* When we have more than on devices, the first one must be a
|
|
* regular block device and the others zoned block devices.
|
|
*/
|
|
if (dmz->nr_ddevs > 1) {
|
|
reg_dev = &dmz->dev[0];
|
|
if (!(reg_dev->flags & DMZ_BDEV_REGULAR)) {
|
|
ti->error = "Primary disk is not a regular device";
|
|
return -EINVAL;
|
|
}
|
|
for (i = 1; i < dmz->nr_ddevs; i++) {
|
|
struct dmz_dev *zoned_dev = &dmz->dev[i];
|
|
struct block_device *bdev = zoned_dev->bdev;
|
|
|
|
if (zoned_dev->flags & DMZ_BDEV_REGULAR) {
|
|
ti->error = "Secondary disk is not a zoned device";
|
|
return -EINVAL;
|
|
}
|
|
if (zone_nr_sectors &&
|
|
zone_nr_sectors != bdev_zone_sectors(bdev)) {
|
|
ti->error = "Zone nr sectors mismatch";
|
|
return -EINVAL;
|
|
}
|
|
zone_nr_sectors = bdev_zone_sectors(bdev);
|
|
zoned_dev->zone_nr_sectors = zone_nr_sectors;
|
|
zoned_dev->nr_zones = bdev_nr_zones(bdev);
|
|
}
|
|
} else {
|
|
struct dmz_dev *zoned_dev = &dmz->dev[0];
|
|
struct block_device *bdev = zoned_dev->bdev;
|
|
|
|
if (zoned_dev->flags & DMZ_BDEV_REGULAR) {
|
|
ti->error = "Disk is not a zoned device";
|
|
return -EINVAL;
|
|
}
|
|
zoned_dev->zone_nr_sectors = bdev_zone_sectors(bdev);
|
|
zoned_dev->nr_zones = bdev_nr_zones(bdev);
|
|
}
|
|
|
|
if (reg_dev) {
|
|
sector_t zone_offset;
|
|
|
|
reg_dev->zone_nr_sectors = zone_nr_sectors;
|
|
reg_dev->nr_zones =
|
|
DIV_ROUND_UP_SECTOR_T(reg_dev->capacity,
|
|
reg_dev->zone_nr_sectors);
|
|
reg_dev->zone_offset = 0;
|
|
zone_offset = reg_dev->nr_zones;
|
|
for (i = 1; i < dmz->nr_ddevs; i++) {
|
|
dmz->dev[i].zone_offset = zone_offset;
|
|
zone_offset += dmz->dev[i].nr_zones;
|
|
}
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* Setup target.
|
|
*/
|
|
static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv)
|
|
{
|
|
struct dmz_target *dmz;
|
|
int ret, i;
|
|
|
|
/* Check arguments */
|
|
if (argc < 1) {
|
|
ti->error = "Invalid argument count";
|
|
return -EINVAL;
|
|
}
|
|
|
|
/* Allocate and initialize the target descriptor */
|
|
dmz = kzalloc(sizeof(struct dmz_target), GFP_KERNEL);
|
|
if (!dmz) {
|
|
ti->error = "Unable to allocate the zoned target descriptor";
|
|
return -ENOMEM;
|
|
}
|
|
dmz->dev = kcalloc(argc, sizeof(struct dmz_dev), GFP_KERNEL);
|
|
if (!dmz->dev) {
|
|
ti->error = "Unable to allocate the zoned device descriptors";
|
|
kfree(dmz);
|
|
return -ENOMEM;
|
|
}
|
|
dmz->ddev = kcalloc(argc, sizeof(struct dm_dev *), GFP_KERNEL);
|
|
if (!dmz->ddev) {
|
|
ti->error = "Unable to allocate the dm device descriptors";
|
|
ret = -ENOMEM;
|
|
goto err;
|
|
}
|
|
dmz->nr_ddevs = argc;
|
|
|
|
ti->private = dmz;
|
|
|
|
/* Get the target zoned block device */
|
|
for (i = 0; i < argc; i++) {
|
|
ret = dmz_get_zoned_device(ti, argv[i], i, argc);
|
|
if (ret)
|
|
goto err_dev;
|
|
}
|
|
ret = dmz_fixup_devices(ti);
|
|
if (ret)
|
|
goto err_dev;
|
|
|
|
/* Initialize metadata */
|
|
ret = dmz_ctr_metadata(dmz->dev, argc, &dmz->metadata,
|
|
dm_table_device_name(ti->table));
|
|
if (ret) {
|
|
ti->error = "Metadata initialization failed";
|
|
goto err_dev;
|
|
}
|
|
|
|
/* Set target (no write same support) */
|
|
ti->max_io_len = dmz_zone_nr_sectors(dmz->metadata);
|
|
ti->num_flush_bios = 1;
|
|
ti->num_discard_bios = 1;
|
|
ti->num_write_zeroes_bios = 1;
|
|
ti->per_io_data_size = sizeof(struct dmz_bioctx);
|
|
ti->flush_supported = true;
|
|
ti->discards_supported = true;
|
|
|
|
/* The exposed capacity is the number of chunks that can be mapped */
|
|
ti->len = (sector_t)dmz_nr_chunks(dmz->metadata) <<
|
|
dmz_zone_nr_sectors_shift(dmz->metadata);
|
|
|
|
/* Zone BIO */
|
|
ret = bioset_init(&dmz->bio_set, DMZ_MIN_BIOS, 0, 0);
|
|
if (ret) {
|
|
ti->error = "Create BIO set failed";
|
|
goto err_meta;
|
|
}
|
|
|
|
/* Chunk BIO work */
|
|
mutex_init(&dmz->chunk_lock);
|
|
INIT_RADIX_TREE(&dmz->chunk_rxtree, GFP_NOIO);
|
|
dmz->chunk_wq = alloc_workqueue("dmz_cwq_%s",
|
|
WQ_MEM_RECLAIM | WQ_UNBOUND, 0,
|
|
dmz_metadata_label(dmz->metadata));
|
|
if (!dmz->chunk_wq) {
|
|
ti->error = "Create chunk workqueue failed";
|
|
ret = -ENOMEM;
|
|
goto err_bio;
|
|
}
|
|
|
|
/* Flush work */
|
|
spin_lock_init(&dmz->flush_lock);
|
|
bio_list_init(&dmz->flush_list);
|
|
INIT_DELAYED_WORK(&dmz->flush_work, dmz_flush_work);
|
|
dmz->flush_wq = alloc_ordered_workqueue("dmz_fwq_%s", WQ_MEM_RECLAIM,
|
|
dmz_metadata_label(dmz->metadata));
|
|
if (!dmz->flush_wq) {
|
|
ti->error = "Create flush workqueue failed";
|
|
ret = -ENOMEM;
|
|
goto err_cwq;
|
|
}
|
|
mod_delayed_work(dmz->flush_wq, &dmz->flush_work, DMZ_FLUSH_PERIOD);
|
|
|
|
/* Initialize reclaim */
|
|
for (i = 0; i < dmz->nr_ddevs; i++) {
|
|
ret = dmz_ctr_reclaim(dmz->metadata, &dmz->dev[i].reclaim, i);
|
|
if (ret) {
|
|
ti->error = "Zone reclaim initialization failed";
|
|
goto err_fwq;
|
|
}
|
|
}
|
|
|
|
DMINFO("(%s): Target device: %llu 512-byte logical sectors (%llu blocks)",
|
|
dmz_metadata_label(dmz->metadata),
|
|
(unsigned long long)ti->len,
|
|
(unsigned long long)dmz_sect2blk(ti->len));
|
|
|
|
return 0;
|
|
err_fwq:
|
|
destroy_workqueue(dmz->flush_wq);
|
|
err_cwq:
|
|
destroy_workqueue(dmz->chunk_wq);
|
|
err_bio:
|
|
mutex_destroy(&dmz->chunk_lock);
|
|
bioset_exit(&dmz->bio_set);
|
|
err_meta:
|
|
dmz_dtr_metadata(dmz->metadata);
|
|
err_dev:
|
|
dmz_put_zoned_devices(ti);
|
|
err:
|
|
kfree(dmz->dev);
|
|
kfree(dmz);
|
|
|
|
return ret;
|
|
}
|
|
|
|
/*
|
|
* Cleanup target.
|
|
*/
|
|
static void dmz_dtr(struct dm_target *ti)
|
|
{
|
|
struct dmz_target *dmz = ti->private;
|
|
int i;
|
|
|
|
destroy_workqueue(dmz->chunk_wq);
|
|
|
|
for (i = 0; i < dmz->nr_ddevs; i++)
|
|
dmz_dtr_reclaim(dmz->dev[i].reclaim);
|
|
|
|
cancel_delayed_work_sync(&dmz->flush_work);
|
|
destroy_workqueue(dmz->flush_wq);
|
|
|
|
(void) dmz_flush_metadata(dmz->metadata);
|
|
|
|
dmz_dtr_metadata(dmz->metadata);
|
|
|
|
bioset_exit(&dmz->bio_set);
|
|
|
|
dmz_put_zoned_devices(ti);
|
|
|
|
mutex_destroy(&dmz->chunk_lock);
|
|
|
|
kfree(dmz->dev);
|
|
kfree(dmz);
|
|
}
|
|
|
|
/*
|
|
* Setup target request queue limits.
|
|
*/
|
|
static void dmz_io_hints(struct dm_target *ti, struct queue_limits *limits)
|
|
{
|
|
struct dmz_target *dmz = ti->private;
|
|
unsigned int chunk_sectors = dmz_zone_nr_sectors(dmz->metadata);
|
|
|
|
limits->logical_block_size = DMZ_BLOCK_SIZE;
|
|
limits->physical_block_size = DMZ_BLOCK_SIZE;
|
|
|
|
blk_limits_io_min(limits, DMZ_BLOCK_SIZE);
|
|
blk_limits_io_opt(limits, DMZ_BLOCK_SIZE);
|
|
|
|
limits->discard_alignment = 0;
|
|
limits->discard_granularity = DMZ_BLOCK_SIZE;
|
|
limits->max_discard_sectors = chunk_sectors;
|
|
limits->max_hw_discard_sectors = chunk_sectors;
|
|
limits->max_write_zeroes_sectors = chunk_sectors;
|
|
|
|
/* FS hint to try to align to the device zone size */
|
|
limits->chunk_sectors = chunk_sectors;
|
|
limits->max_sectors = chunk_sectors;
|
|
|
|
/* We are exposing a drive-managed zoned block device */
|
|
limits->zoned = false;
|
|
}
|
|
|
|
/*
|
|
* Pass on ioctl to the backend device.
|
|
*/
|
|
static int dmz_prepare_ioctl(struct dm_target *ti, struct block_device **bdev)
|
|
{
|
|
struct dmz_target *dmz = ti->private;
|
|
struct dmz_dev *dev = &dmz->dev[0];
|
|
|
|
if (!dmz_check_bdev(dev))
|
|
return -EIO;
|
|
|
|
*bdev = dev->bdev;
|
|
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* Stop works on suspend.
|
|
*/
|
|
static void dmz_suspend(struct dm_target *ti)
|
|
{
|
|
struct dmz_target *dmz = ti->private;
|
|
int i;
|
|
|
|
flush_workqueue(dmz->chunk_wq);
|
|
for (i = 0; i < dmz->nr_ddevs; i++)
|
|
dmz_suspend_reclaim(dmz->dev[i].reclaim);
|
|
cancel_delayed_work_sync(&dmz->flush_work);
|
|
}
|
|
|
|
/*
|
|
* Restart works on resume or if suspend failed.
|
|
*/
|
|
static void dmz_resume(struct dm_target *ti)
|
|
{
|
|
struct dmz_target *dmz = ti->private;
|
|
int i;
|
|
|
|
queue_delayed_work(dmz->flush_wq, &dmz->flush_work, DMZ_FLUSH_PERIOD);
|
|
for (i = 0; i < dmz->nr_ddevs; i++)
|
|
dmz_resume_reclaim(dmz->dev[i].reclaim);
|
|
}
|
|
|
|
static int dmz_iterate_devices(struct dm_target *ti,
|
|
iterate_devices_callout_fn fn, void *data)
|
|
{
|
|
struct dmz_target *dmz = ti->private;
|
|
unsigned int zone_nr_sectors = dmz_zone_nr_sectors(dmz->metadata);
|
|
sector_t capacity;
|
|
int i, r;
|
|
|
|
for (i = 0; i < dmz->nr_ddevs; i++) {
|
|
capacity = dmz->dev[i].capacity & ~(zone_nr_sectors - 1);
|
|
r = fn(ti, dmz->ddev[i], 0, capacity, data);
|
|
if (r)
|
|
break;
|
|
}
|
|
return r;
|
|
}
|
|
|
|
/*
 * Status method of the target.
 *
 * STATUSTYPE_INFO reports the zone counts: total and cache zones, then the
 * random and sequential zones of each device. STATUSTYPE_TABLE reports the
 * device numbers of the target devices. STATUSTYPE_IMA reports an empty
 * string.
 *
 * The names sz, result and maxlen are used by the DMEMIT() macro.
 */
static void dmz_status(struct dm_target *ti, status_type_t type,
		       unsigned int status_flags, char *result,
		       unsigned int maxlen)
{
	struct dmz_target *dmz = ti->private;
	ssize_t sz = 0;
	char buf[BDEVNAME_SIZE];
	int i;

	switch (type) {
	case STATUSTYPE_INFO:
		DMEMIT("%u zones %u/%u cache",
		       dmz_nr_zones(dmz->metadata),
		       dmz_nr_unmap_cache_zones(dmz->metadata),
		       dmz_nr_cache_zones(dmz->metadata));
		for (i = 0; i < dmz->nr_ddevs; i++) {
			/*
			 * For a multi-device setup the first device
			 * contains only cache zones.
			 */
			if ((i == 0) &&
			    (dmz_nr_cache_zones(dmz->metadata) > 0))
				continue;
			DMEMIT(" %u/%u random %u/%u sequential",
			       dmz_nr_unmap_rnd_zones(dmz->metadata, i),
			       dmz_nr_rnd_zones(dmz->metadata, i),
			       dmz_nr_unmap_seq_zones(dmz->metadata, i),
			       dmz_nr_seq_zones(dmz->metadata, i));
		}
		break;
	case STATUSTYPE_TABLE:
		/* Space separated list of the device numbers */
		for (i = 0; i < dmz->nr_ddevs; i++) {
			format_dev_t(buf, dmz->dev[i].bdev->bd_dev);
			if (i == 0)
				DMEMIT("%s", buf);
			else
				DMEMIT(" %s", buf);
		}
		break;
	case STATUSTYPE_IMA:
		*result = '\0';
		break;
	}
}
|
|
|
|
static int dmz_message(struct dm_target *ti, unsigned int argc, char **argv,
|
|
char *result, unsigned int maxlen)
|
|
{
|
|
struct dmz_target *dmz = ti->private;
|
|
int r = -EINVAL;
|
|
|
|
if (!strcasecmp(argv[0], "reclaim")) {
|
|
int i;
|
|
|
|
for (i = 0; i < dmz->nr_ddevs; i++)
|
|
dmz_schedule_reclaim(dmz->dev[i].reclaim);
|
|
r = 0;
|
|
} else
|
|
DMERR("unrecognized message %s", argv[0]);
|
|
return r;
|
|
}
|
|
|
|
static struct target_type zoned_target = {
|
|
.name = "zoned",
|
|
.version = {2, 0, 0},
|
|
.features = DM_TARGET_SINGLETON | DM_TARGET_MIXED_ZONED_MODEL,
|
|
.module = THIS_MODULE,
|
|
.ctr = dmz_ctr,
|
|
.dtr = dmz_dtr,
|
|
.map = dmz_map,
|
|
.io_hints = dmz_io_hints,
|
|
.prepare_ioctl = dmz_prepare_ioctl,
|
|
.postsuspend = dmz_suspend,
|
|
.resume = dmz_resume,
|
|
.iterate_devices = dmz_iterate_devices,
|
|
.status = dmz_status,
|
|
.message = dmz_message,
|
|
};
|
|
module_dm(zoned);
|
|
|
|
MODULE_DESCRIPTION(DM_NAME " target for zoned block devices");
|
|
MODULE_AUTHOR("Damien Le Moal <damien.lemoal@wdc.com>");
|
|
MODULE_LICENSE("GPL");
|